
move everything out of legacy folder

Nick Sweeting 6 years ago
parent
commit
1b8abc0961
74 files changed with 3155 additions and 2622 deletions
  1. archivebox/__init__.py (+3 -0)
  2. archivebox/__main__.py (+7 -2)
  3. archivebox/cli/__init__.py (+11 -4)
  4. archivebox/cli/archivebox.py (+14 -46)
  5. archivebox/cli/archivebox_add.py (+46 -61)
  6. archivebox/cli/archivebox_config.py (+17 -115)
  7. archivebox/cli/archivebox_help.py (+9 -37)
  8. archivebox/cli/archivebox_info.py (+10 -11)
  9. archivebox/cli/archivebox_init.py (+9 -8)
  10. archivebox/cli/archivebox_list.py (+21 -53)
  11. archivebox/cli/archivebox_manage.py (+9 -15)
  12. archivebox/cli/archivebox_remove.py (+18 -29)
  13. archivebox/cli/archivebox_schedule.py (+24 -134)
  14. archivebox/cli/archivebox_server.py (+13 -25)
  15. archivebox/cli/archivebox_shell.py (+12 -13)
  16. archivebox/cli/archivebox_update.py (+82 -14)
  17. archivebox/cli/archivebox_version.py (+11 -98)
  18. archivebox/cli/logging.py (+10 -8)
  19. archivebox/cli/tests.py (+5 -5)
  20. archivebox/config/__init__.py (+26 -22)
  21. archivebox/config/stubs.py (+1 -1)
  22. archivebox/core/admin.py (+2 -4)
  23. archivebox/core/models.py (+2 -2)
  24. archivebox/core/views.py (+3 -3)
  25. archivebox/core/welcome_message.py (+17 -4)
  26. archivebox/extractors/__init__.py (+105 -0)
  27. archivebox/extractors/archive_org.py (+115 -0)
  28. archivebox/extractors/dom.py (+73 -0)
  29. archivebox/extractors/favicon.py (+65 -0)
  30. archivebox/extractors/git.py (+94 -0)
  31. archivebox/extractors/media.py (+100 -0)
  32. archivebox/extractors/pdf.py (+72 -0)
  33. archivebox/extractors/screenshot.py (+71 -0)
  34. archivebox/extractors/title.py (+63 -0)
  35. archivebox/extractors/wget.py (+123 -0)
  36. archivebox/index/__init__.py (+277 -32)
  37. archivebox/index/html.py (+13 -13)
  38. archivebox/index/json.py (+9 -9)
  39. archivebox/index/schema.py (+24 -22)
  40. archivebox/index/sql.py (+14 -4)
  41. archivebox/legacy/ArchiveBox.conf (+0 -58)
  42. archivebox/legacy/__init__.py (+0 -1)
  43. archivebox/legacy/archive_methods.py (+0 -694)
  44. archivebox/legacy/main.py (+0 -626)
  45. archivebox/legacy/mypy_django.ini (+0 -10)
  46. archivebox/legacy/parse.py (+0 -331)
  47. archivebox/legacy/purge.py (+0 -89)
  48. archivebox/legacy/storage/__init__.py (+0 -1)
  49. archivebox/main.py (+1086 -0)
  50. archivebox/parsers/__init__.py (+68 -0)
  51. archivebox/parsers/generic_json.py (+65 -0)
  52. archivebox/parsers/generic_rss.py (+49 -0)
  53. archivebox/parsers/generic_txt.py (+30 -0)
  54. archivebox/parsers/medium_rss.py (+35 -0)
  55. archivebox/parsers/netscape_html.py (+39 -0)
  56. archivebox/parsers/pinboard_rss.py (+47 -0)
  57. archivebox/parsers/pocket_html.py (+38 -0)
  58. archivebox/parsers/shaarli_rss.py (+50 -0)
  59. archivebox/themes/legacy/favicon.ico (+0 -0)
  60. archivebox/themes/legacy/link_details.html (+0 -0)
  61. archivebox/themes/legacy/main_index.html (+0 -0)
  62. archivebox/themes/legacy/main_index_row.html (+0 -0)
  63. archivebox/themes/legacy/robots.txt (+0 -0)
  64. archivebox/themes/legacy/static/archive.png (+0 -0)
  65. archivebox/themes/legacy/static/bootstrap.min.css (+0 -0)
  66. archivebox/themes/legacy/static/external.png (+0 -0)
  67. archivebox/themes/legacy/static/jquery.dataTables.min.css (+0 -0)
  68. archivebox/themes/legacy/static/jquery.dataTables.min.js (+0 -0)
  69. archivebox/themes/legacy/static/jquery.min.js (+0 -0)
  70. archivebox/themes/legacy/static/sort_asc.png (+0 -0)
  71. archivebox/themes/legacy/static/sort_both.png (+0 -0)
  72. archivebox/themes/legacy/static/sort_desc.png (+0 -0)
  73. archivebox/themes/legacy/static/spinner.gif (+0 -0)
  74. archivebox/util.py (+48 -18)

+ 3 - 0
archivebox/__init__.py

@@ -1,3 +1,6 @@
 __package__ = 'archivebox'
 
 from . import core
+from . import cli
+
+from .main import *

+ 7 - 2
archivebox/__main__.py

@@ -2,9 +2,14 @@
 
 __package__ = 'archivebox'
 
-from .cli.archivebox import main
+import sys
+from .cli import archivebox
+
+
+def main():
+    archivebox.main(args=sys.argv[1:], stdin=sys.stdin)
 
 
 if __name__ == '__main__':
-    main()
+    archivebox.main(args=sys.argv[1:], stdin=sys.stdin)
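 
 The entrypoint now takes explicit args/stdin, so it can also be driven from Python rather than only from a shell; a minimal sketch (the subcommand shown is just an illustration):
 
     import sys
     from archivebox.cli import archivebox
 
     # roughly equivalent to running `archivebox version` from a shell
     archivebox.main(args=['version'], stdin=sys.stdin)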
 

+ 11 - 4
archivebox/cli/__init__.py

@@ -2,13 +2,17 @@ __package__ = 'archivebox.cli'
 
 import os
 
-from typing import Dict
+from typing import Dict, List, Optional, IO
 from importlib import import_module
 
 CLI_DIR = os.path.dirname(os.path.abspath(__file__))
 
 # these common commands will appear sorted before any others for ease-of-use
-display_first = ('help', 'version', 'init', 'info', 'config', 'list', 'update', 'add', 'remove')
+meta_cmds = ('help', 'version')
+main_cmds = ('init', 'info', 'config')
+archive_cmds = ('add', 'remove', 'update', 'list')
+
+display_first = (*meta_cmds, *main_cmds, *archive_cmds)
 
 # every imported command module must have these properties in order to be valid
 required_attrs = ('__package__', '__command__', 'main')
@@ -42,11 +46,14 @@ def list_subcommands() -> Dict[str, str]:
     return dict(sorted(COMMANDS, key=display_order))
 
 
-def run_subcommand(subcommand: str, args=None) -> None:
+def run_subcommand(subcommand: str,
+                   subcommand_args: List[str]=None,
+                   stdin: Optional[IO]=None,
+                   pwd: Optional[str]=None) -> None:
     """run a given ArchiveBox subcommand with the given list of args"""
 
     module = import_module('.archivebox_{}'.format(subcommand), __package__)
-    module.main(args)    # type: ignore
+    module.main(args=subcommand_args, stdin=stdin, pwd=pwd)    # type: ignore
 
 
 SUBCOMMANDS = list_subcommands()
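 
 run_subcommand() can also be called with the new keyword signature directly; a sketch (the collection path is a made-up example, and passing stdin=None assumes the subcommand tolerates a missing stream):
 
     from archivebox.cli import run_subcommand
 
     # dispatches to archivebox.cli.archivebox_add.main() under the hood
     run_subcommand(
         'add',
         subcommand_args=['--index-only', 'https://example.com'],
         stdin=None,
         pwd='/path/to/archive',  # hypothetical collection directory
     )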

+ 14 - 46
archivebox/cli/archivebox.py

@@ -5,19 +5,17 @@ __package__ = 'archivebox.cli'
 __command__ = 'archivebox'
 __description__ = 'ArchiveBox: The self-hosted internet archive.'
 
-import os
 import sys
 import argparse
 
-from . import list_subcommands, run_subcommand
-from ..legacy.config import OUTPUT_DIR
+from typing import Optional, List, IO
 
+from . import list_subcommands, run_subcommand
+from ..config import OUTPUT_DIR
 
-def parse_args(args=None):
-    args = sys.argv[1:] if args is None else args
 
+def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional[str]=None) -> None:
     subcommands = list_subcommands()
-
     parser = argparse.ArgumentParser(
         prog=__command__,
         description=__description__,
@@ -43,54 +41,24 @@ def parse_args(args=None):
         default=None,
     )
     parser.add_argument(
-        "args",
+        "subcommand_args",
         help="Arguments for the subcommand",
         nargs=argparse.REMAINDER,
     )
-    
-    command = parser.parse_args(args)
+    command = parser.parse_args(args or ())
 
-    if command.help:
+    if command.help or command.subcommand is None:
         command.subcommand = 'help'
     if command.version:
         command.subcommand = 'version'
 
-    # print('--------------------------------------------')
-    # print('Command:     ', sys.argv[0])
-    # print('Subcommand:  ', command.subcommand)
-    # print('Args to pass:', args[1:])
-    # print('--------------------------------------------')
-
-    return command.subcommand, command.args
-
-
-def print_import_tutorial():
-    print('Welcome to ArchiveBox!')
-    print()
-    print('To import an existing archive (from a previous version of ArchiveBox):')
-    print('    1. cd into your data dir OUTPUT_DIR (usually ArchiveBox/output) and run:')
-    print('    2. archivebox init')
-    print()
-    print('To start a new archive:')
-    print('    1. Create an emptry directory, then cd into it and run:')
-    print('    2. archivebox init')
-    print()
-    print('For more information, see the migration docs here:')
-    print('    https://github.com/pirate/ArchiveBox/wiki/Migration')
-
-def main(args=None):
-    subcommand, subcommand_args = parse_args(args)
-    existing_index = os.path.exists(os.path.join(OUTPUT_DIR, 'index.json'))
-
-    if subcommand is None:
-        if existing_index:
-            run_subcommand('help', subcommand_args)
-        else:
-            print_import_tutorial()
-        raise SystemExit(0)
+    run_subcommand(
+        subcommand=command.subcommand,
+        subcommand_args=command.subcommand_args,
+        stdin=stdin,
+        pwd=pwd or OUTPUT_DIR,
+    )
 
-    run_subcommand(subcommand, subcommand_args)
-    
 
 if __name__ == '__main__':
-    main()
+    main(args=sys.argv[1:], stdin=sys.stdin)
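 
 With the fallback above, invoking the dispatcher without a subcommand now routes to help instead of printing a separate tutorial; a sketch (assumes reject_stdin tolerates stdin=None):
 
     from archivebox.cli import archivebox
 
     # no subcommand given -> dispatcher substitutes the 'help' subcommand
     archivebox.main(args=[], stdin=None)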

+ 46 - 61
archivebox/cli/archivebox_add.py

@@ -7,90 +7,75 @@ __description__ = 'Add a new URL or list of URLs to your archive'
 import sys
 import argparse
 
-from typing import List, Optional
+from typing import List, Optional, IO
 
-from ..legacy.config import stderr, check_dependencies, check_data_folder
-from ..legacy.util import (
-    handle_stdin_import,
-    handle_file_import,
-)
-from ..legacy.main import update_archive_data
+from ..main import add
+from ..util import SmartFormatter, accept_stdin
+from ..config import OUTPUT_DIR, ONLY_NEW
 
 
-def main(args: List[str]=None, stdin: Optional[str]=None) -> None:
-    check_data_folder()
-    
-    args = sys.argv[1:] if args is None else args
-
+def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional[str]=None) -> None:
     parser = argparse.ArgumentParser(
         prog=__command__,
         description=__description__,
         add_help=True,
+        formatter_class=SmartFormatter,
     )
-    # parser.add_argument(
-    #     '--depth', #'-d',
-    #     type=int,
-    #     help='Recursively archive all linked pages up to this many hops away',
-    #     default=0,
-    # )
     parser.add_argument(
-        '--only-new', #'-n',
+        '--update-all', #'-n',
         action='store_true',
-        help="Don't attempt to retry previously skipped/failed links when updating",
+        default=not ONLY_NEW,
+        help="Also retry previously skipped/failed links when adding new links",
     )
     parser.add_argument(
         '--index-only', #'-o',
         action='store_true',
         help="Add the links to the main index without archiving them",
     )
-    # parser.add_argument(
-    #     '--mirror', #'-m',
-    #     action='store_true',
-    #     help='Archive an entire site (finding all linked pages below it on the same domain)',
-    # )
-    # parser.add_argument(
-    #     '--crawler', #'-r',
-    #     choices=('depth_first', 'breadth_first'),
-    #     help='Controls which crawler to use in order to find outlinks in a given page',
-    #     default=None,
-    # )
     parser.add_argument(
-        'url',
+        'import_path',
         nargs='?',
         type=str,
         default=None,
-        help='URL of page to archive (or path to local file)'
+        help=(
+            'URL or path to local file containing a list of links to import. e.g.:\n'
+            '    https://getpocket.com/users/USERNAME/feed/all\n'
+            '    https://example.com/some/rss/feed.xml\n'
+            '    ~/Downloads/firefox_bookmarks_export.html\n'
+            '    ~/Desktop/sites_list.csv\n'
+        )
     )
-    command = parser.parse_args(args)
-
-    check_dependencies()
-
-    ### Handle ingesting urls piped in through stdin
-    # (.e.g if user does cat example_urls.txt | archivebox add)
-    import_path = None
-    if stdin or not sys.stdin.isatty():
-        stdin_raw_text = stdin or sys.stdin.read()
-        if stdin_raw_text and command.url:
-            stderr(
-                '[X] You should pass either a path as an argument, '
-                'or pass a list of links via stdin, but not both.\n'
-            )
-            raise SystemExit(1)
-
-        import_path = handle_stdin_import(stdin_raw_text)
-
-    ### Handle ingesting url from a remote file/feed
-    # (e.g. if an RSS feed URL is used as the import path) 
-    elif command.url:
-        import_path = handle_file_import(command.url)
-
-    update_archive_data(
-        import_path=import_path,
-        resume=None,
-        only_new=command.only_new,
+    command = parser.parse_args(args or ())
+    import_str = accept_stdin(stdin)
+    add(
+        import_str=import_str,
+        import_path=command.import_path,
+        update_all=command.update_all,
         index_only=command.index_only,
+        out_dir=pwd or OUTPUT_DIR,
     )
 
 
 if __name__ == '__main__':
-    main()
+    main(args=sys.argv[1:], stdin=sys.stdin)
+
+
+# TODO: Implement these
+#
+# parser.add_argument(
+#     '--depth', #'-d',
+#     type=int,
+#     help='Recursively archive all linked pages up to this many hops away',
+#     default=0,
+# )
+# parser.add_argument(
+#     '--mirror', #'-m',
+#     action='store_true',
+#     help='Archive an entire site (finding all linked pages below it on the same domain)',
+# )
+# parser.add_argument(
+#     '--crawler', #'-r',
+#     choices=('depth_first', 'breadth_first'),
+#     help='Controls which crawler to use in order to find outlinks in a given page',
+#     default=None,
+# )
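 
 The wrapper above only parses flags and forwards to add() in archivebox/main.py; a sketch of calling it directly with the same keyword arguments (the feed URL is only an example):
 
     from archivebox.main import add
     from archivebox.config import OUTPUT_DIR
 
     # mirrors the keyword arguments archivebox_add.main() forwards
     add(
         import_str=None,                                       # text piped via stdin would go here
         import_path='https://example.com/some/rss/feed.xml',   # example feed URL
         update_all=False,
         index_only=False,
         out_dir=OUTPUT_DIR,
     )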

+ 17 - 115
archivebox/cli/archivebox_config.py

@@ -7,28 +7,14 @@ __description__ = 'Get and set your ArchiveBox project configuration values'
 import sys
 import argparse
 
-from typing import Optional, List
+from typing import Optional, List, IO
 
-from ..legacy.util import SmartFormatter
-from ..legacy.config import (
-    check_data_folder,
-    OUTPUT_DIR,
-    load_all_config,
-    write_config_file,
-    CONFIG,
-    CONFIG_FILE,
-    USER_CONFIG,
-    ConfigDict,
-    stderr,
-    get_real_name,
-)
+from ..main import config
+from ..util import SmartFormatter, accept_stdin
+from ..config import OUTPUT_DIR
 
 
-def main(args: List[str]=None, stdin: Optional[str]=None) -> None:
-    check_data_folder()
-    
-    args = sys.argv[1:] if args is None else args
-
+def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional[str]=None) -> None:
     parser = argparse.ArgumentParser(
         prog=__command__,
         description=__description__,
@@ -57,102 +43,18 @@ def main(args: List[str]=None, stdin: Optional[str]=None) -> None:
         type=str,
         help='KEY or KEY=VALUE formatted config values to get or set',
     )
-    command = parser.parse_args(args)
-
-    if stdin or not sys.stdin.isatty():
-        stdin_raw_text = stdin or sys.stdin.read()
-        if stdin_raw_text and command.config_options:
-            stderr(
-                '[X] You should either pass config values as an arguments '
-                'or via stdin, but not both.\n',
-                color='red',
-            )
-            raise SystemExit(1)
-
-        config_options = stdin_raw_text.split('\n')
-    else:
-        config_options = command.config_options
-
-    no_args = not (command.get or command.set or command.reset or command.config_options)
-
-    matching_config: ConfigDict = {}
-    if command.get or no_args:
-        if config_options:
-            config_options = [get_real_name(key) for key in config_options]
-            matching_config = {key: CONFIG[key] for key in config_options if key in CONFIG}
-            failed_config = [key for key in config_options if key not in CONFIG]
-            if failed_config:
-                stderr()
-                stderr('[X] These options failed to get', color='red')
-                stderr('    {}'.format('\n    '.join(config_options)))
-                raise SystemExit(1)
-        else:
-            matching_config = CONFIG
-        
-        print(printable_config(matching_config))
-        raise SystemExit(not matching_config)
-    elif command.set:
-        new_config = {}
-        failed_options = []
-        for line in config_options:
-            if line.startswith('#') or not line.strip():
-                continue
-            if '=' not in line:
-                stderr('[X] Config KEY=VALUE must have an = sign in it', color='red')
-                stderr(f'    {line}')
-                raise SystemExit(2)
-
-            raw_key, val = line.split('=')
-            raw_key = raw_key.upper().strip()
-            key = get_real_name(raw_key)
-            if key != raw_key:
-                stderr(f'[i] Note: The config option {raw_key} has been renamed to {key}, please use the new name going forwards.', color='lightyellow')
-
-            if key in CONFIG:
-                new_config[key] = val.strip()
-            else:
-                failed_options.append(line)
-
-        if new_config:
-            before = CONFIG
-            matching_config = write_config_file(new_config, out_dir=OUTPUT_DIR)
-            after = load_all_config()
-            print(printable_config(matching_config))
-
-            side_effect_changes: ConfigDict = {}
-            for key, val in after.items():
-                if key in USER_CONFIG and (before[key] != after[key]) and (key not in matching_config):
-                    side_effect_changes[key] = after[key]
-
-            if side_effect_changes:
-                stderr()
-                stderr('[i] Note: This change also affected these other options that depended on it:', color='lightyellow')
-                print('    {}'.format(printable_config(side_effect_changes, prefix='    ')))
-        if failed_options:
-            stderr()
-            stderr('[X] These options failed to set:', color='red')
-            stderr('    {}'.format('\n    '.join(failed_options)))
-        raise SystemExit(bool(failed_options))
-    elif command.reset:
-        stderr('[X] This command is not implemented yet.', color='red')
-        stderr('    Please manually remove the relevant lines from your config file:')
-        stderr(f'        {CONFIG_FILE}')
-        raise SystemExit(2)
-
-    else:
-        stderr('[X] You must pass either --get or --set, or no arguments to get the whole config.', color='red')
-        stderr('    archivebox config')
-        stderr('    archivebox config --get SOME_KEY')
-        stderr('    archivebox config --set SOME_KEY=SOME_VALUE')
-        raise SystemExit(2)
-
-
-def printable_config(config: ConfigDict, prefix: str='') -> str:
-    return f'\n{prefix}'.join(
-        f'{key}={val}'
-        for key, val in config.items()
-        if not (isinstance(val, dict) or callable(val))
+    command = parser.parse_args(args or ())
+    config_options_str = accept_stdin(stdin)
+
+    config(
+        config_options_str=config_options_str,
+        config_options=command.config_options,
+        get=command.get,
+        set=command.set,
+        reset=command.reset,
+        out_dir=pwd or OUTPUT_DIR,
     )
 
+
 if __name__ == '__main__':
-    main()
+    main(args=sys.argv[1:], stdin=sys.stdin)
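 
 Because stdin is now an explicit parameter, the config command can be exercised with an in-memory stream; a sketch assuming accept_stdin() reads any non-tty file-like object:
 
     import io
     from archivebox.cli import archivebox_config
 
     # equivalent to: echo 'ONLY_NEW=True' | archivebox config --set
     archivebox_config.main(args=['--set'], stdin=io.StringIO('ONLY_NEW=True\n'))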

+ 9 - 37
archivebox/cli/archivebox_help.py

@@ -7,52 +7,24 @@ __description__ = 'Print the ArchiveBox help message and usage'
 import sys
 import argparse
 
-from ..legacy.util import reject_stdin
-from ..legacy.config import ANSI
-from . import list_subcommands
+from typing import Optional, List, IO
 
+from ..main import help
+from ..util import reject_stdin
+from ..config import OUTPUT_DIR
 
-def main(args=None):
-    args = sys.argv[1:] if args is None else args
 
+def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional[str]=None) -> None:
     parser = argparse.ArgumentParser(
         prog=__command__,
         description=__description__,
         add_help=True,
     )
-    parser.parse_args(args)
-    reject_stdin(__command__)
+    parser.parse_args(args or ())
+    reject_stdin(__command__, stdin)
     
-
-    COMMANDS_HELP_TEXT = '\n    '.join(
-        f'{cmd.ljust(20)} {summary}'
-        for cmd, summary in list_subcommands().items()
-    )
-
-    print('''{green}ArchiveBox: The self-hosted internet archive.{reset}
-        
-{lightblue}Usage:{reset}
-    archivebox [command] [--help] [--version] [...args]
-
-{lightblue}Comamnds:{reset}
-    {}
-
-{lightblue}Example Use:{reset}
-    mkdir my-archive; cd my-archive/
-    archivebox init
-    archivebox info
-
-    archivebox add https://example.com/some/page
-    archivebox add --depth=1 ~/Downloads/bookmarks_export.html
-    
-    archivebox list --sort=timestamp --csv=timestamp,url,is_archived
-    archivebox schedule --every=week https://example.com/some/feed.rss
-    archivebox update --resume=15109948213.123
-
-{lightblue}Documentation:{reset}
-    https://github.com/pirate/ArchiveBox/wiki
-'''.format(COMMANDS_HELP_TEXT, **ANSI))
+    help(out_dir=pwd or OUTPUT_DIR)
 
 
 if __name__ == '__main__':
-    main()
+    main(args=sys.argv[1:], stdin=sys.stdin)

+ 10 - 11
archivebox/cli/archivebox_info.py

@@ -7,25 +7,24 @@ __description__ = 'Print out some info and statistics about the archive collecti
 import sys
 import argparse
 
-from ..legacy.config import check_data_folder
-from ..legacy.util import reject_stdin
-from ..legacy.main import info
+from typing import Optional, List, IO
 
+from ..main import info
+from ..config import OUTPUT_DIR
+from ..util import reject_stdin
 
-def main(args=None):
-    check_data_folder()
-    
-    args = sys.argv[1:] if args is None else args
 
+def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional[str]=None) -> None:
     parser = argparse.ArgumentParser(
         prog=__command__,
         description=__description__,
         add_help=True,
     )
-    parser.parse_args(args)
-    reject_stdin(__command__)
+    parser.parse_args(args or ())
+    reject_stdin(__command__, stdin)
+
+    info(out_dir=pwd or OUTPUT_DIR)
 
-    info()
 
 if __name__ == '__main__':
-    main()
+    main(args=sys.argv[1:], stdin=sys.stdin)

+ 9 - 8
archivebox/cli/archivebox_init.py

@@ -7,23 +7,24 @@ __description__ = 'Initialize a new ArchiveBox collection in the current directo
 import sys
 import argparse
 
-from ..legacy.util import reject_stdin
-from ..legacy.main import init
+from typing import Optional, List, IO
 
+from ..main import init
+from ..util import reject_stdin
+from ..config import OUTPUT_DIR
 
-def main(args=None):
-    args = sys.argv[1:] if args is None else args
 
+def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional[str]=None) -> None:
     parser = argparse.ArgumentParser(
         prog=__command__,
         description=__description__,
         add_help=True,
     )
-    parser.parse_args(args)
-    reject_stdin(__command__)
+    parser.parse_args(args or ())
+    reject_stdin(__command__, stdin)
 
-    init()
+    init(out_dir=pwd or OUTPUT_DIR)
     
 
 if __name__ == '__main__':
-    main()
+    main(args=sys.argv[1:], stdin=sys.stdin)

+ 21 - 53
archivebox/cli/archivebox_list.py

@@ -2,15 +2,17 @@
 
 __package__ = 'archivebox.cli'
 __command__ = 'archivebox list'
-__description__ = 'List all the URLs currently in the archive.'
+__description__ = 'List, filter, and export information about archive entries'
 
 import sys
 import argparse
 
-from ..legacy.util import SmartFormatter, reject_stdin, to_json, to_csv
-from ..legacy.config import check_data_folder, OUTPUT_DIR
-from ..legacy.main import (
-    list_archive_data,
+from typing import Optional, List, IO
+
+from ..main import list_all
+from ..util import SmartFormatter, accept_stdin
+from ..config import OUTPUT_DIR
+from ..index import (
     get_indexed_folders,
     get_archived_folders,
     get_unarchived_folders,
@@ -23,11 +25,7 @@ from ..legacy.main import (
     get_unrecognized_folders,
 )
 
-def main(args=None):
-    check_data_folder()
-    
-    args = sys.argv[1:] if args is None else args
-
+def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional[str]=None) -> None:
     parser = argparse.ArgumentParser(
         prog=__command__,
         description=__description__,
@@ -93,57 +91,27 @@ def main(args=None):
         help='Type of pattern matching to use when filtering URLs',
     )
     parser.add_argument(
-        'patterns',
+        'filter_patterns',
         nargs='*',
         type=str,
         default=None,
         help='List only URLs matching these filter patterns.'
     )
-    command = parser.parse_args(args)
-    reject_stdin(__command__)
+    command = parser.parse_args(args or ())
+    filter_patterns_str = accept_stdin(stdin)
 
-    links = list_archive_data(
-        filter_patterns=command.patterns,
+    list_all(
+        filter_patterns_str=filter_patterns_str,
+        filter_patterns=command.filter_patterns,
         filter_type=command.filter_type,
-        before=command.before,
+        status=command.status,
         after=command.after,
+        before=command.before,
+        sort=command.sort,
+        csv=command.csv,
+        json=command.json,
+        out_dir=pwd or OUTPUT_DIR,
     )
 
-    if command.sort:
-        links = sorted(links, key=lambda link: getattr(link, command.sort))
-
-    links = list(links)
-
-    if command.status == 'indexed':
-        folders = get_indexed_folders(links, out_dir=OUTPUT_DIR)
-    elif command.status == 'archived':
-        folders = get_archived_folders(links, out_dir=OUTPUT_DIR)
-    elif command.status == 'unarchived':
-        folders = get_unarchived_folders(links, out_dir=OUTPUT_DIR)
-
-    elif command.status == 'present':
-        folders = get_present_folders(links, out_dir=OUTPUT_DIR)
-    elif command.status == 'valid':
-        folders = get_valid_folders(links, out_dir=OUTPUT_DIR)
-    elif command.status == 'invalid':
-        folders = get_invalid_folders(links, out_dir=OUTPUT_DIR)
-
-    elif command.status == 'duplicate':
-        folders = get_duplicate_folders(links, out_dir=OUTPUT_DIR)
-    elif command.status == 'orphaned':
-        folders = get_orphaned_folders(links, out_dir=OUTPUT_DIR)
-    elif command.status == 'corrupted':
-        folders = get_corrupted_folders(links, out_dir=OUTPUT_DIR)
-    elif command.status == 'unrecognized':
-        folders = get_unrecognized_folders(links, out_dir=OUTPUT_DIR)
-
-    if command.csv:
-        print(to_csv(folders.values(), csv_cols=command.csv.split(','), header=True))
-    elif command.json:
-        print(to_json(folders.values(), indent=4, sort_keys=True))
-    else:
-        print('\n'.join(f'{folder} {link}' for folder, link in folders.items()))
-    raise SystemExit(not folders)
-
 if __name__ == '__main__':
-    main()
+    main(args=sys.argv[1:], stdin=sys.stdin)

+ 9 - 15
archivebox/cli/archivebox_manage.py

@@ -6,24 +6,18 @@ __description__ = 'Run an ArchiveBox Django management command'
 
 import sys
 
-from ..legacy.config import OUTPUT_DIR, setup_django, check_data_folder
+from typing import Optional, List, IO
 
+from ..main import manage
+from ..config import OUTPUT_DIR
 
-def main(args=None):
-    check_data_folder()
 
-    setup_django(OUTPUT_DIR)
-    from django.core.management import execute_from_command_line
-
-    args = sys.argv if args is None else ['archivebox', *args]
-
-    args[0] = f'{sys.argv[0]} manage'
-
-    if args[1:] == []:
-        args.append('help')
-    
-    execute_from_command_line(args)
+def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional[str]=None) -> None:
+    manage(
+        args=args,
+        out_dir=pwd or OUTPUT_DIR,
+    )
 
 
 if __name__ == '__main__':
-    main()
+    main(args=sys.argv[1:], stdin=sys.stdin)

+ 18 - 29
archivebox/cli/archivebox_remove.py

@@ -7,17 +7,14 @@ __description__ = 'Remove the specified URLs from the archive.'
 import sys
 import argparse
 
+from typing import Optional, List, IO
 
-from ..legacy.config import check_data_folder
-from ..legacy.util import reject_stdin
-from ..legacy.main import remove_archive_links
+from ..main import remove
+from ..util import accept_stdin
+from ..config import OUTPUT_DIR
 
 
-def main(args=None):
-    check_data_folder()
-    
-    args = sys.argv[1:] if args is None else args
-
+def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional[str]=None) -> None:
     parser = argparse.ArgumentParser(
         prog=__command__,
         description=__description__,
@@ -56,33 +53,25 @@ def main(args=None):
         help='Type of pattern matching to use when filtering URLs',
     )
     parser.add_argument(
-        'pattern',
+        'filter_patterns',
         nargs='*',
         type=str,
-        default=None,
         help='URLs matching this filter pattern will be removed from the index.'
     )
-    command = parser.parse_args(args)
-
-    if not sys.stdin.isatty():
-        stdin_raw_text = sys.stdin.read()
-        if stdin_raw_text and command.url:
-            print(
-                '[X] You should pass either a pattern as an argument, '
-                'or pass a list of patterns via stdin, but not both.\n'
-            )
-            raise SystemExit(1)
-
-        patterns = [pattern.strip() for pattern in stdin_raw_text.split('\n')]
-    else:
-        patterns = command.pattern
+    command = parser.parse_args(args or ())
+    filter_str = accept_stdin(stdin)
 
-    remove_archive_links(
-        filter_patterns=patterns, filter_type=command.filter_type,
-        before=command.before, after=command.after,
-        yes=command.yes, delete=command.delete,
+    remove(
+        filter_str=filter_str,
+        filter_patterns=command.filter_patterns,
+        filter_type=command.filter_type,
+        before=command.before,
+        after=command.after,
+        yes=command.yes,
+        delete=command.delete,
+        out_dir=pwd or OUTPUT_DIR,
     )
     
 
 if __name__ == '__main__':
-    main()
+    main(args=sys.argv[1:], stdin=sys.stdin)

+ 24 - 134
archivebox/cli/archivebox_schedule.py

@@ -4,34 +4,17 @@ __package__ = 'archivebox.cli'
 __command__ = 'archivebox schedule'
 __description__ = 'Set ArchiveBox to regularly import URLs at specific times using cron'
 
-import os
 import sys
 import argparse
 
-from datetime import datetime
-from crontab import CronTab, CronSlices
+from typing import Optional, List, IO
 
+from ..main import schedule
+from ..util import reject_stdin
+from ..config import OUTPUT_DIR
 
-from ..legacy.util import reject_stdin
-from ..legacy.config import (
-    OUTPUT_DIR,
-    LOGS_DIR,
-    ARCHIVEBOX_BINARY,
-    USER,
-    ANSI,
-    stderr,
-    check_data_folder,
-)
-
-
-CRON_COMMENT = 'archivebox_schedule'
-
-
-def main(args=None):
-    check_data_folder()
-    
-    args = sys.argv[1:] if args is None else args
 
+def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional[str]=None) -> None:
     parser = argparse.ArgumentParser(
         prog=__command__,
         description=__description__,
@@ -57,7 +40,7 @@ def main(args=None):
     group.add_argument(
         '--clear', # '-c'
         action='store_true',
-        help=("Stop all ArchiveBox scheduled runs, clear it completely from cron"),
+        help=("Stop all ArchiveBox scheduled runs (remove cron jobs)"),
     )
     group.add_argument(
         '--show', # '-s'
@@ -67,13 +50,14 @@ def main(args=None):
     group.add_argument(
         '--foreground', '-f',
         action='store_true',
-        help=("Launch ArchiveBox as a long-running foreground task "
+        help=("Launch ArchiveBox scheduler as a long-running foreground task "
               "instead of using cron."),
     )
     group.add_argument(
         '--run-all', # '-a',
         action='store_true',
-        help='Run all the scheduled jobs once immediately, independent of their configured schedules',
+        help=("Run all the scheduled jobs once immediately, independent of "
+              "their configured schedules, can be used together with --foreground"),
     )
     parser.add_argument(
         'import_path',
@@ -83,115 +67,21 @@ def main(args=None):
         help=("Check this path and import any new links on every run "
               "(can be either local file or remote URL)"),
     )
-    command = parser.parse_args(args)
-    reject_stdin(__command__)
-
-    os.makedirs(LOGS_DIR, exist_ok=True)
-
-    cron = CronTab(user=True)
-    cron = dedupe_jobs(cron)
-
-    existing_jobs = list(cron.find_comment(CRON_COMMENT))
-    if command.foreground or command.run_all:
-        if command.import_path or (not existing_jobs):
-            stderr('{red}[X] You must schedule some jobs first before running in foreground mode.{reset}'.format(**ANSI))
-            stderr('    archivebox schedule --every=hour https://example.com/some/rss/feed.xml')
-            raise SystemExit(1)
-        print('{green}[*] Running {} ArchiveBox jobs in foreground task scheduler...{reset}'.format(len(existing_jobs), **ANSI))
-        if command.run_all:
-            try:
-                for job in existing_jobs:
-                    sys.stdout.write(f'  > {job.command}')
-                    sys.stdout.flush()
-                    job.run()
-                    sys.stdout.write(f'\r  √ {job.command}\n')
-            except KeyboardInterrupt:
-                print('\n{green}[√] Stopped.{reset}'.format(**ANSI))
-                raise SystemExit(1)
-        if command.foreground:
-            try:
-                for result in cron.run_scheduler():
-                    print(result)
-            except KeyboardInterrupt:
-                print('\n{green}[√] Stopped.{reset}'.format(**ANSI))
-                raise SystemExit(1)
-
-    elif command.show:
-        if existing_jobs:
-            print('\n'.join(str(cmd) for cmd in existing_jobs))
-        else:
-            stderr('{red}[X] There are no ArchiveBox cron jobs scheduled for your user ({}).{reset}'.format(USER, **ANSI))
-            stderr('    To schedule a new job, run:')
-            stderr('        archivebox schedule --every=[timeperiod] https://example.com/some/rss/feed.xml')
-        raise SystemExit(0)
-
-    elif command.clear:
-        print(cron.remove_all(comment=CRON_COMMENT))
-        cron.write()
-        raise SystemExit(0)
-
-    elif command.every:
-        quoted = lambda s: f'"{s}"' if s and ' ' in s else s
-        cmd = [
-            'cd',
-            quoted(OUTPUT_DIR),
-            '&&',
-            quoted(ARCHIVEBOX_BINARY),
-            *(('add', f'"{command.import_path}"',) if command.import_path else ('update',)),
-            '2>&1',
-            '>',
-            quoted(os.path.join(LOGS_DIR, 'archivebox.log')),
-
-        ]
-        new_job = cron.new(command=' '.join(cmd), comment=CRON_COMMENT)
-
-        if command.every in ('minute', 'hour', 'day', 'week', 'month', 'year'):
-            set_every = getattr(new_job.every(), command.every)
-            set_every()
-        elif CronSlices.is_valid(command.every):
-            new_job.setall(command.every)
-        else:
-            stderr('{red}[X] Got invalid timeperiod for cron task.{reset}'.format(**ANSI))
-            stderr('    It must be one of minute/hour/day/week/month')
-            stderr('    or a quoted cron-format schedule like:')
-            stderr('        archivebox init --every=day https://example.com/some/rss/feed.xml')
-            stderr('        archivebox init --every="0/5 * * * *" https://example.com/some/rss/feed.xml')
-            raise SystemExit(1)
-
-        cron = dedupe_jobs(cron)
-        cron.write()
-
-        total_runs = sum(j.frequency_per_year() for j in cron)
-        existing_jobs = list(cron.find_comment(CRON_COMMENT))
-
-        print()
-        print('{green}[√] Scheduled new ArchiveBox cron job for user: {} ({} jobs are active).{reset}'.format(USER, len(existing_jobs), **ANSI))
-        print('\n'.join(f'  > {cmd}' if str(cmd) == str(new_job) else f'    {cmd}' for cmd in existing_jobs))
-        if total_runs > 60 and not command.quiet:
-            stderr()
-            stderr('{lightyellow}[!] With the current cron config, ArchiveBox is estimated to run >{} times per year.{reset}'.format(total_runs, **ANSI))
-            stderr(f'    Congrats on being an enthusiastic internet archiver! 👌')
-            stderr()
-            stderr('    Make sure you have enough storage space available to hold all the data.')
-            stderr('    Using a compressed/deduped filesystem like ZFS is recommended if you plan on archiving a lot.')
-        raise SystemExit(0)
-
-
-def dedupe_jobs(cron: CronTab) -> CronTab:
-    deduped = set()
-    for job in list(cron):
-        unique_tuple = (str(job.slices), job.command)
-        if unique_tuple not in deduped:
-            deduped.add(unique_tuple)
-        cron.remove(job)
-
-    for schedule, command in deduped:
-        job = cron.new(command=command, comment=CRON_COMMENT)
-        job.setall(schedule)
-        job.enable()
-
-    return cron
+    command = parser.parse_args(args or ())
+    reject_stdin(__command__, stdin)
+
+    schedule(
+        add=command.add,
+        show=command.show,
+        clear=command.clear,
+        foreground=command.foreground,
+        run_all=command.run_all,
+        quiet=command.quiet,
+        every=command.every,
+        import_path=command.import_path,
+        out_dir=pwd or OUTPUT_DIR,
+    )
 
 
 if __name__ == '__main__':
-    main()
+    main(args=sys.argv[1:], stdin=sys.stdin)

+ 13 - 25
archivebox/cli/archivebox_server.py

@@ -7,15 +7,14 @@ __description__ = 'Run the ArchiveBox HTTP server'
 import sys
 import argparse
 
-from ..legacy.config import setup_django, IS_TTY, OUTPUT_DIR, ANSI, check_data_folder
-from ..legacy.util import reject_stdin
+from typing import Optional, List, IO
 
+from ..main import server
+from ..util import reject_stdin
+from ..config import OUTPUT_DIR
 
-def main(args=None):
-    check_data_folder()
-
-    args = sys.argv[1:] if args is None else args
 
+def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional[str]=None) -> None:
     parser = argparse.ArgumentParser(
         prog=__command__,
         description=__description__,
@@ -33,26 +32,15 @@ def main(args=None):
         action='store_true',
         help='Enable auto-reloading when code or templates change',
     )
-    command = parser.parse_args(args)
-    reject_stdin(__command__)
+    command = parser.parse_args(args or ())
+    reject_stdin(__command__, stdin)
     
-    setup_django(OUTPUT_DIR)
-    from django.core.management import call_command
-    from django.contrib.auth.models import User
-
-    if IS_TTY and not User.objects.filter(is_superuser=True).exists():
-        print('{lightyellow}[!] No admin users exist yet, you will not be able to edit links in the UI.{reset}'.format(**ANSI))
-        print()
-        print('    To create an admin user, run:')
-        print('        archivebox manage createsuperuser')
-        print()
-
-    print('{green}[+] Starting ArchiveBox webserver...{reset}'.format(**ANSI))
-    if not command.reload:
-        command.runserver_args.append('--noreload')
-
-    call_command("runserver", *command.runserver_args)
+    server(
+        runserver_args=command.runserver_args,
+        reload=command.reload,
+        out_dir=pwd or OUTPUT_DIR,
+    )
 
 
 if __name__ == '__main__':
-    main()
+    main(args=sys.argv[1:], stdin=sys.stdin)

+ 12 - 13
archivebox/cli/archivebox_shell.py

@@ -7,27 +7,26 @@ __description__ = 'Enter an interactive ArchiveBox Django shell'
 import sys
 import argparse
 
-from ..legacy.config import setup_django, OUTPUT_DIR, check_data_folder
-from ..legacy.util import reject_stdin
+from typing import Optional, List, IO
 
+from ..main import shell
+from ..config import OUTPUT_DIR
+from ..util import reject_stdin
 
-def main(args=None):
-    check_data_folder()
-    
-    args = sys.argv[1:] if args is None else args
 
+def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional[str]=None) -> None:
     parser = argparse.ArgumentParser(
         prog=__command__,
         description=__description__,
         add_help=True,
     )
-    parser.parse_args(args)
-    reject_stdin(__command__)
+    parser.parse_args(args or ())
+    reject_stdin(__command__, stdin)
+    
+    shell(
+        out_dir=pwd or OUTPUT_DIR,
+    )
     
-    setup_django(OUTPUT_DIR)
-    from django.core.management import call_command
-    call_command("shell_plus")
-
 
 if __name__ == '__main__':
-    main()
+    main(args=sys.argv[1:], stdin=sys.stdin)

+ 82 - 14
archivebox/cli/archivebox_update.py

@@ -2,27 +2,36 @@
 
 __package__ = 'archivebox.cli'
 __command__ = 'archivebox update'
-__description__ = 'Import any new links from subscriptions and retry any previously failed/skipped links.'
+__description__ = 'Import any new links from subscriptions and retry any previously failed/skipped links'
 
 import sys
 import argparse
 
-from typing import List
+from typing import List, Optional, IO
 
-from ..legacy.config import check_data_folder
-from ..legacy.util import reject_stdin
-from ..legacy.main import update_archive_data
+from ..main import update
+from ..util import SmartFormatter, accept_stdin
+from ..config import OUTPUT_DIR
+from ..index import (
+    get_indexed_folders,
+    get_archived_folders,
+    get_unarchived_folders,
+    get_present_folders,
+    get_valid_folders,
+    get_invalid_folders,
+    get_duplicate_folders,
+    get_orphaned_folders,
+    get_corrupted_folders,
+    get_unrecognized_folders,
+)
 
 
-def main(args: List[str]=None):
-    check_data_folder()
-    
-    args = sys.argv[1:] if args is None else args
-
+def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional[str]=None) -> None:
     parser = argparse.ArgumentParser(
         prog=__command__,
         description=__description__,
         add_help=True,
+        formatter_class=SmartFormatter,
     )
     parser.add_argument(
         '--only-new', #'-n',
@@ -40,16 +49,75 @@ def main(args: List[str]=None):
         help='Resume the update process from a given timestamp',
         default=None,
     )
+    parser.add_argument(
+        '--overwrite', #'-x',
+        action='store_true',
+        help='Ignore existing archived content and overwrite with new versions (DANGEROUS)',
+    )
+    parser.add_argument(
+        '--before', #'-b',
+        type=float,
+        help="Update only links bookmarked before the given timestamp.",
+        default=None,
+    )
+    parser.add_argument(
+        '--after', #'-a',
+        type=float,
+        help="Update only links bookmarked after the given timestamp.",
+        default=None,
+    )
+    parser.add_argument(
+        '--status',
+        type=str,
+        choices=('indexed', 'archived', 'unarchived', 'present', 'valid', 'invalid', 'duplicate', 'orphaned', 'corrupted', 'unrecognized'),
+        default='indexed',
+        help=(
+            'Update only links or data directories that have the given status\n'
+            f'    indexed       {get_indexed_folders.__doc__} (the default)\n'
+            f'    archived      {get_archived_folders.__doc__}\n'
+            f'    unarchived    {get_unarchived_folders.__doc__}\n'
+            '\n'
+            f'    present       {get_present_folders.__doc__}\n'
+            f'    valid         {get_valid_folders.__doc__}\n'
+            f'    invalid       {get_invalid_folders.__doc__}\n'
+            '\n'
+            f'    duplicate     {get_duplicate_folders.__doc__}\n'
+            f'    orphaned      {get_orphaned_folders.__doc__}\n'
+            f'    corrupted     {get_corrupted_folders.__doc__}\n'
+            f'    unrecognized  {get_unrecognized_folders.__doc__}\n'
+        )
+    )
+    parser.add_argument(
+        '--filter-type',
+        type=str,
+        choices=('exact', 'substring', 'domain', 'regex'),
+        default='exact',
+        help='Type of pattern matching to use when filtering URLs',
+    )
+    parser.add_argument(
+        'filter_patterns',
+        nargs='*',
+        type=str,
+        default=None,
+        help='List only URLs matching these filter patterns.'
+    )
     command = parser.parse_args(args)
-    reject_stdin(__command__)
+    filter_patterns_str = accept_stdin(stdin)
 
-    update_archive_data(
-        import_path=None,
+    update(
         resume=command.resume,
         only_new=command.only_new,
         index_only=command.index_only,
+        overwrite=command.overwrite,
+        filter_patterns_str=filter_patterns_str,
+        filter_patterns=command.filter_patterns,
+        filter_type=command.filter_type,
+        status=command.status,
+        after=command.after,
+        before=command.before,
+        out_dir=pwd or OUTPUT_DIR,
     )
     
 
 if __name__ == '__main__':
-    main()
+    main(args=sys.argv[1:], stdin=sys.stdin)
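 
 A sketch of the expanded update() call with the new filtering options (all values are illustrative):
 
     from archivebox.main import update
     from archivebox.config import OUTPUT_DIR
 
     # retry only folders that have never been archived, limited to one domain
     update(
         resume=None,
         only_new=True,
         index_only=False,
         overwrite=False,
         filter_patterns_str=None,
         filter_patterns=['example.com'],  # illustrative pattern
         filter_type='domain',
         status='unarchived',
         after=None,
         before=None,
         out_dir=OUTPUT_DIR,
     )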

+ 11 - 98
archivebox/cli/archivebox_version.py

@@ -4,26 +4,17 @@ __package__ = 'archivebox.cli'
 __command__ = 'archivebox version'
 __description__ = 'Print the ArchiveBox version and dependency information'
 
-import os
-import re
 import sys
 import argparse
 
-from ..legacy.util import reject_stdin, human_readable_size
-from ..legacy.config import (
-    ANSI,
-    VERSION,
-    CODE_LOCATIONS,
-    CONFIG_LOCATIONS,
-    DATA_LOCATIONS,
-    DEPENDENCIES,
-    check_dependencies,
-)
+from typing import Optional, List, IO
 
+from ..main import version
+from ..util import reject_stdin
+from ..config import OUTPUT_DIR
 
-def main(args=None):
-    args = sys.argv[1:] if args is None else args
 
+def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional[str]=None) -> None:
     parser = argparse.ArgumentParser(
         prog=__command__,
         description=__description__,
@@ -34,92 +25,14 @@ def main(args=None):
         action='store_true',
         help='Only print ArchiveBox version number and nothing else.',
     )
-    command = parser.parse_args(args)
-    reject_stdin(__command__)
+    command = parser.parse_args(args or ())
+    reject_stdin(__command__, stdin)
     
-    if command.quiet:
-        print(VERSION)
-    else:
-        print('ArchiveBox v{}'.format(VERSION))
-        print()
-
-        print('{white}[i] Dependency versions:{reset}'.format(**ANSI))
-        for name, dependency in DEPENDENCIES.items():
-            print_dependency_version(name, dependency)
-        
-        print()
-        print('{white}[i] Code locations:{reset}'.format(**ANSI))
-        for name, folder in CODE_LOCATIONS.items():
-            print_folder_status(name, folder)
-
-        print()
-        print('{white}[i] Config locations:{reset}'.format(**ANSI))
-        for name, folder in CONFIG_LOCATIONS.items():
-            print_folder_status(name, folder)
-
-        print()
-        print('{white}[i] Data locations:{reset}'.format(**ANSI))
-        for name, folder in DATA_LOCATIONS.items():
-            print_folder_status(name, folder)
-
-        print()
-        check_dependencies()
-
-
-def print_folder_status(name, folder):
-    if folder['enabled']:
-        if folder['is_valid']:
-            color, symbol, note = 'green', '√', 'valid'
-        else:
-            color, symbol, note, num_files = 'red', 'X', 'invalid', '?'
-    else:
-        color, symbol, note, num_files = 'lightyellow', '-', 'disabled', '-'
-
-    if folder['path']:
-        if os.path.exists(folder['path']):
-            num_files = (
-                f'{len(os.listdir(folder["path"]))} files'
-                if os.path.isdir(folder['path']) else
-                human_readable_size(os.path.getsize(folder['path']))
-            )
-        else:
-            num_files = 'missing'
-
-    print(
-        ANSI[color],
-        symbol,
-        ANSI['reset'],
-        name.ljust(24),
-        (folder["path"] or '').ljust(70),
-        num_files.ljust(14),
-        ANSI[color],
-        note,
-        ANSI['reset'],
-    )
-
-
-def print_dependency_version(name, dependency):
-    if dependency['enabled']:
-        if dependency['is_valid']:
-            color, symbol, note = 'green', '√', 'valid'
-            version = 'v' + re.search(r'[\d\.]+', dependency['version'])[0]
-        else:
-            color, symbol, note, version = 'red', 'X', 'invalid', '?'
-    else:
-        color, symbol, note, version = 'lightyellow', '-', 'disabled', '-'
-
-    print(
-        ANSI[color],
-        symbol,
-        ANSI['reset'],
-        name.ljust(24),
-        (dependency["path"] or '').ljust(70),
-        version.ljust(14),
-        ANSI[color],
-        note,
-        ANSI['reset'],
+    version(
+        quiet=command.quiet,
+        out_dir=pwd or OUTPUT_DIR,
     )
 
 
 if __name__ == '__main__':
-    main()
+    main(args=sys.argv[1:], stdin=sys.stdin)

+ 10 - 8
archivebox/legacy/logs.py → archivebox/cli/logging.py

@@ -1,3 +1,5 @@
+__package__ = 'archivebox.cli'
+
 import os
 import sys
 
@@ -5,8 +7,8 @@ from datetime import datetime
 from dataclasses import dataclass
 from typing import Optional, List
 
-from .schema import Link, ArchiveResult
-from .config import ANSI, OUTPUT_DIR, IS_TTY
+from ..index.schema import Link, ArchiveResult
+from ..config import ANSI, OUTPUT_DIR, IS_TTY
 
 
 @dataclass
@@ -80,7 +82,7 @@ def log_indexing_finished(out_path: str):
 
 ### Archiving Stage
 
-def log_archiving_started(num_links: int, resume: Optional[float]):
+def log_archiving_started(num_links: int, resume: Optional[float]=None):
     start_ts = datetime.now()
     _LAST_RUN_STATS.archiving_start_ts = start_ts
     print()
@@ -92,7 +94,7 @@ def log_archiving_started(num_links: int, resume: Optional[float]):
              **ANSI,
         ))
     else:
-        print('{green}[▶] [{}] Updating content for {} pages in archive...{reset}'.format(
+        print('{green}[▶] [{}] Updating content for {} matching pages in archive...{reset}'.format(
              start_ts.strftime('%Y-%m-%d %H:%M:%S'),
              num_links,
              **ANSI,
@@ -213,18 +215,18 @@ def log_archive_method_finished(result: ArchiveResult):
         print()
 
 
-def log_list_started(filter_patterns: List[str], filter_type: str):
+def log_list_started(filter_patterns: Optional[List[str]], filter_type: str):
     print('{green}[*] Finding links in the archive index matching these {} patterns:{reset}'.format(
         filter_type,
         **ANSI,
     ))
-    print('    {}'.format(' '.join(filter_patterns)))
+    print('    {}'.format(' '.join(filter_patterns or ())))
 
 def log_list_finished(links):
-    from .util import to_csv
+    from ..util import links_to_csv
     print()
     print('---------------------------------------------------------------------------------------------------')
-    print(to_csv(links, csv_cols=['timestamp', 'is_archived', 'num_outputs', 'url'], header=True, ljust=16, separator=' | '))
+    print(links_to_csv(links, csv_cols=['timestamp', 'is_archived', 'num_outputs', 'url'], header=True, ljust=16, separator=' | '))
     print('---------------------------------------------------------------------------------------------------')
     print()
 

+ 5 - 5
archivebox/tests.py → archivebox/cli/tests.py

@@ -1,6 +1,6 @@
 #!/usr/bin/env python3
 
-__package__ = 'archivebox'
+__package__ = 'archivebox.cli'
 
 
 import os
@@ -29,15 +29,15 @@ TEST_CONFIG = {
 OUTPUT_DIR = 'data.tests'
 os.environ.update(TEST_CONFIG)
 
-from .legacy.main import init
-from .legacy.index import load_main_index
-from .legacy.config import (
+from ..main import init
+from ..index import load_main_index
+from ..config import (
     SQL_INDEX_FILENAME,
     JSON_INDEX_FILENAME,
     HTML_INDEX_FILENAME,
 )
 
-from .cli import (
+from . import (
     archivebox_init,
     archivebox_add,
     archivebox_remove,

+ 26 - 22
archivebox/legacy/config.py → archivebox/config/__init__.py

@@ -1,4 +1,4 @@
-__package__ = 'archivebox.legacy'
+__package__ = 'archivebox.config'
 
 import os
 import io
@@ -13,7 +13,7 @@ from typing import Optional, Type, Tuple, Dict
 from subprocess import run, PIPE, DEVNULL
 from configparser import ConfigParser
 
-from .config_stubs import (
+from .stubs import (
     SimpleConfigValueDict,
     ConfigValue,
     ConfigDict,
@@ -40,7 +40,7 @@ CONFIG_DEFAULTS: Dict[str, ConfigDefaultDict] = {
     'GENERAL_CONFIG': {
         'OUTPUT_DIR':               {'type': str,   'default': None},
         'CONFIG_FILE':              {'type': str,   'default': None},
-        'ONLY_NEW':                 {'type': bool,  'default': False},
+        'ONLY_NEW':                 {'type': bool,  'default': True},
         'TIMEOUT':                  {'type': int,   'default': 60},
         'MEDIA_TIMEOUT':            {'type': int,   'default': 3600},
         'OUTPUT_PERMISSIONS':       {'type': str,   'default': '755'},
@@ -122,8 +122,7 @@ ANSI = {k: '' for k in DEFAULT_CLI_COLORS.keys()}
 
 VERSION_FILENAME = 'VERSION'
 PYTHON_DIR_NAME = 'archivebox'
-LEGACY_DIR_NAME = 'legacy'
-TEMPLATES_DIR_NAME = 'templates'
+TEMPLATES_DIR_NAME = 'themes'
 
 ARCHIVE_DIR_NAME = 'archive'
 SOURCES_DIR_NAME = 'sources'
@@ -158,8 +157,7 @@ DERIVED_CONFIG_DEFAULTS: ConfigDefaultDict = {
     
     'REPO_DIR':                 {'default': lambda c: os.path.abspath(os.path.join(os.path.dirname(os.path.abspath(__file__)), '..', '..'))},
     'PYTHON_DIR':               {'default': lambda c: os.path.join(c['REPO_DIR'], PYTHON_DIR_NAME)},
-    'LEGACY_DIR':               {'default': lambda c: os.path.join(c['PYTHON_DIR'], LEGACY_DIR_NAME)},
-    'TEMPLATES_DIR':            {'default': lambda c: os.path.join(c['LEGACY_DIR'], TEMPLATES_DIR_NAME)},
+    'TEMPLATES_DIR':            {'default': lambda c: os.path.join(c['PYTHON_DIR'], TEMPLATES_DIR_NAME, 'legacy')},
     
     'OUTPUT_DIR':               {'default': lambda c: os.path.abspath(os.path.expanduser(c['OUTPUT_DIR'])) if c['OUTPUT_DIR'] else os.path.abspath(os.curdir)},
     'ARCHIVE_DIR':              {'default': lambda c: os.path.join(c['OUTPUT_DIR'], ARCHIVE_DIR_NAME)},
@@ -210,7 +208,7 @@ DERIVED_CONFIG_DEFAULTS: ConfigDefaultDict = {
 
     'DEPENDENCIES':             {'default': lambda c: get_dependency_info(c)},
     'CODE_LOCATIONS':           {'default': lambda c: get_code_locations(c)},
-    'CONFIG_LOCATIONS':         {'default': lambda c: get_config_locations(c)},
+    'EXTERNAL_LOCATIONS':       {'default': lambda c: get_external_locations(c)},
     'DATA_LOCATIONS':           {'default': lambda c: get_data_locations(c)},
     'CHROME_OPTIONS':           {'default': lambda c: get_chrome_info(c)},
 }
@@ -370,6 +368,7 @@ def load_config(defaults: ConfigDefaultDict,
             stderr('    For config documentation and examples see:')
             stderr('        https://github.com/pirate/ArchiveBox/wiki/Configuration')
             stderr()
+            raise
             raise SystemExit(2)
     
     return extended_config
@@ -492,18 +491,13 @@ def get_code_locations(config: ConfigDict) -> SimpleConfigValueDict:
         'REPO_DIR': {
             'path': os.path.abspath(config['REPO_DIR']),
             'enabled': True,
-            'is_valid': os.path.exists(os.path.join(config['REPO_DIR'], '.github')),
+            'is_valid': os.path.exists(os.path.join(config['REPO_DIR'], 'archivebox')),
         },
         'PYTHON_DIR': {
             'path': os.path.abspath(config['PYTHON_DIR']),
             'enabled': True,
             'is_valid': os.path.exists(os.path.join(config['PYTHON_DIR'], '__main__.py')),
         },
-        'LEGACY_DIR': {
-            'path': os.path.abspath(config['LEGACY_DIR']),
-            'enabled': True,
-            'is_valid': os.path.exists(os.path.join(config['LEGACY_DIR'], 'util.py')),
-        },
         'TEMPLATES_DIR': {
             'path': os.path.abspath(config['TEMPLATES_DIR']),
             'enabled': True,
@@ -511,14 +505,9 @@ def get_code_locations(config: ConfigDict) -> SimpleConfigValueDict:
         },
     }
 
-def get_config_locations(config: ConfigDict) -> ConfigValue:
+def get_external_locations(config: ConfigDict) -> ConfigValue:
     abspath = lambda path: None if path is None else os.path.abspath(path)
     return {
-        'CONFIG_FILE': {
-            'path': abspath(config['CHROME_USER_DATA_DIR']),
-            'enabled': config['USE_CHROME'] and config['CHROME_USER_DATA_DIR'],
-            'is_valid': False if config['CHROME_USER_DATA_DIR'] is None else os.path.exists(os.path.join(config['CHROME_USER_DATA_DIR'], 'Default')),
-        },
         'CHROME_USER_DATA_DIR': {
             'path': abspath(config['CHROME_USER_DATA_DIR']),
             'enabled': config['USE_CHROME'] and config['CHROME_USER_DATA_DIR'],
@@ -553,11 +542,26 @@ def get_data_locations(config: ConfigDict) -> ConfigValue:
             'enabled': True,
             'is_valid': os.path.exists(config['ARCHIVE_DIR']),
         },
+        'CONFIG_FILE': {
+            'path': os.path.abspath(config['CONFIG_FILE']),
+            'enabled': True,
+            'is_valid': os.path.exists(config['CONFIG_FILE']),
+        },
         'SQL_INDEX': {
+            'path': os.path.abspath(os.path.join(config['OUTPUT_DIR'], SQL_INDEX_FILENAME)),
+            'enabled': True,
+            'is_valid': os.path.exists(os.path.join(config['OUTPUT_DIR'], SQL_INDEX_FILENAME)),
+        },
+        'JSON_INDEX': {
             'path': os.path.abspath(os.path.join(config['OUTPUT_DIR'], JSON_INDEX_FILENAME)),
             'enabled': True,
             'is_valid': os.path.exists(os.path.join(config['OUTPUT_DIR'], JSON_INDEX_FILENAME)),
         },
+        'HTML_INDEX': {
+            'path': os.path.abspath(os.path.join(config['OUTPUT_DIR'], HTML_INDEX_FILENAME)),
+            'enabled': True,
+            'is_valid': os.path.exists(os.path.join(config['OUTPUT_DIR'], HTML_INDEX_FILENAME)),
+        },
     }
 
 def get_dependency_info(config: ConfigDict) -> ConfigValue:
@@ -731,7 +735,7 @@ def check_data_folder(out_dir: Optional[str]=None, config: ConfigDict=CONFIG) ->
 
     json_index_exists = os.path.exists(os.path.join(output_dir, JSON_INDEX_FILENAME))
     if not json_index_exists:
-        stderr('[X] No archive index was found in current directory.', color='red')
+        stderr('[X] No archive main index was found in current directory.', color='red')
         stderr(f'    {output_dir}')
         stderr()
         stderr('    Are you running archivebox in the right folder?')
@@ -743,7 +747,7 @@ def check_data_folder(out_dir: Optional[str]=None, config: ConfigDict=CONFIG) ->
         raise SystemExit(2)
 
     sql_index_exists = os.path.exists(os.path.join(output_dir, SQL_INDEX_FILENAME))
-    from .storage.sql import list_migrations
+    from ..index.sql import list_migrations
 
     pending_migrations = [name for status, name in list_migrations() if not status]
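Note on the derived-config pattern shown in the hunks above: each key maps to {'default': lambda c: ...}, and later lambdas read values already resolved for earlier keys. A minimal standalone sketch of how such a table resolves (the key names and filenames here are illustrative, not the real config schema):

import os

# illustrative derived defaults; each lambda may read keys resolved before it
DERIVED_DEFAULTS = {
    'OUTPUT_DIR':  {'default': lambda c: os.path.abspath(c.get('OUTPUT_DIR') or os.curdir)},
    'ARCHIVE_DIR': {'default': lambda c: os.path.join(c['OUTPUT_DIR'], 'archive')},
    'SQL_INDEX':   {'default': lambda c: os.path.join(c['OUTPUT_DIR'], 'index.sqlite3')},
}

def resolve_derived(config):
    """apply each derived default in declaration order, letting later lambdas see earlier results"""
    resolved = dict(config)
    for key, entry in DERIVED_DEFAULTS.items():
        resolved[key] = entry['default'](resolved)
    return resolved

print(resolve_derived({'OUTPUT_DIR': None}))

Declaration order matters here (dicts preserve insertion order in Python 3.7+), which is what lets ARCHIVE_DIR and the index paths be derived from OUTPUT_DIR in a single pass.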
 

+ 1 - 1
archivebox/legacy/config_stubs.py → archivebox/config/stubs.py

@@ -17,6 +17,7 @@ class ConfigDict(BaseConfig, total=False):
     SHOW_PROGRESS: bool
 
     OUTPUT_DIR: str
+    CONFIG_FILE: str
     ONLY_NEW: bool
     TIMEOUT: int
     MEDIA_TIMEOUT: int
@@ -63,7 +64,6 @@ class ConfigDict(BaseConfig, total=False):
     ANSI: Dict[str, str]
     REPO_DIR: str
     PYTHON_DIR: str
-    LEGACY_DIR: str
     TEMPLATES_DIR: str
     ARCHIVE_DIR: str
     SOURCES_DIR: str

+ 2 - 4
archivebox/core/admin.py

@@ -1,9 +1,7 @@
-
-from datetime import datetime
-
 from django.contrib import admin
 
-from .models import Page
+from core.models import Page
+
 
 class PageAdmin(admin.ModelAdmin):
     list_display = ('timestamp', 'short_url', 'title', 'is_archived', 'num_outputs', 'added', 'updated', 'url_hash')

+ 2 - 2
archivebox/core/models.py

@@ -4,8 +4,8 @@ import uuid
 
 from django.db import models
 
-from legacy.schema import Link
-from legacy.util import parse_date
+from ..util import parse_date
+from ..index.schema import Link
 
 
 class Page(models.Model):

+ 3 - 3
archivebox/core/views.py

@@ -2,8 +2,8 @@ from django.shortcuts import render
 
 from django.views import View
 
-from legacy.config import OUTPUT_DIR
-from legacy.index import load_main_index, load_main_index_meta
+from .index import load_main_index, load_main_index_meta
+from .config import OUTPUT_DIR
 
 
 class MainIndex(View):
@@ -34,7 +34,7 @@ class AddLinks(View):
     def post(self, request):
         import_path = request.POST['url']
         
-        # TODO: add the links to the index here using archivebox.legacy.main.update_archive_data
+        # TODO: add the links to the index here using archivebox.main.add
         print(f'Adding URL: {import_path}')
 
         return render(template_name=self.template, request=request, context={})

+ 17 - 4
archivebox/core/welcome_message.py

@@ -1,4 +1,17 @@
-print()
-print('[i] Welcome to the ArchiveBox Shell! Example usage:')
-print('    Page.objects.all()')
-print('    User.objects.all()')
+from cli import list_subcommands
+
+from .config import ANSI
+
+
+if __name__ == '__main__':
+    print('{green}# ArchiveBox Imports{reset}'.format(**ANSI))
+    # print('from archivebox.core.models import Page, User')
+    print('{green}from archivebox.cli import\narchivebox_{}{reset}'.format("\narchivebox_".join(list_subcommands().keys()), **ANSI))
+    print()
+    print('[i] Welcome to the ArchiveBox Shell! Example use:')
+    print('    print(Page.objects.filter(is_archived=True).count())')
+    print('    Page.objects.get(url="https://example.com").as_json()')
+
+    print('    from archivebox.main import get_invalid_folders')

+ 105 - 0
archivebox/extractors/__init__.py

@@ -0,0 +1,105 @@
+__package__ = 'archivebox.extractors'
+
+import os
+
+from typing import Optional
+from datetime import datetime
+
+from ..index.schema import Link
+from ..index import (
+    load_link_details,
+    write_link_details,
+    patch_main_index,
+)
+from ..util import enforce_types
+from ..cli.logging import (
+    log_link_archiving_started,
+    log_link_archiving_finished,
+    log_archive_method_started,
+    log_archive_method_finished,
+)
+
+from .title import should_save_title, save_title
+from .favicon import should_save_favicon, save_favicon
+from .wget import should_save_wget, save_wget
+from .pdf import should_save_pdf, save_pdf
+from .screenshot import should_save_screenshot, save_screenshot
+from .dom import should_save_dom, save_dom
+from .git import should_save_git, save_git
+from .media import should_save_media, save_media
+from .archive_org import should_save_archive_dot_org, save_archive_dot_org
+
+
+@enforce_types
+def archive_link(link: Link, overwrite: bool=False, out_dir: Optional[str]=None) -> Link:
+    """download the DOM, PDF, and a screenshot into a folder named after the link's timestamp"""
+
+    ARCHIVE_METHODS = (
+        ('title', should_save_title, save_title),
+        ('favicon', should_save_favicon, save_favicon),
+        ('wget', should_save_wget, save_wget),
+        ('pdf', should_save_pdf, save_pdf),
+        ('screenshot', should_save_screenshot, save_screenshot),
+        ('dom', should_save_dom, save_dom),
+        ('git', should_save_git, save_git),
+        ('media', should_save_media, save_media),
+        ('archive_org', should_save_archive_dot_org, save_archive_dot_org),
+    )
+    
+    out_dir = out_dir or link.link_dir
+    try:
+        is_new = not os.path.exists(out_dir)
+        if is_new:
+            os.makedirs(out_dir)
+
+        link = load_link_details(link, out_dir=out_dir)
+        log_link_archiving_started(link, out_dir, is_new)
+        link = link.overwrite(updated=datetime.now())
+        stats = {'skipped': 0, 'succeeded': 0, 'failed': 0}
+
+        for method_name, should_run, method_function in ARCHIVE_METHODS:
+            try:
+                if method_name not in link.history:
+                    link.history[method_name] = []
+                
+                if should_run(link, out_dir) or overwrite:
+                    log_archive_method_started(method_name)
+
+                    result = method_function(link=link, out_dir=out_dir)
+
+                    link.history[method_name].append(result)
+
+                    stats[result.status] += 1
+                    log_archive_method_finished(result)
+                else:
+                    stats['skipped'] += 1
+            except Exception as e:
+                raise Exception('Exception in archive_methods.save_{}(Link(url={}))'.format(
+                    method_name,
+                    link.url,
+                )) from e
+
+        # print('    ', stats)
+
+        write_link_details(link, out_dir=link.link_dir)
+        patch_main_index(link)
+        
+        # # If any changes were made, update the main links index json and html
+        # was_changed = stats['succeeded'] or stats['failed']
+        # if was_changed:
+        #     patch_main_index(link)
+
+        log_link_archiving_finished(link, link.link_dir, is_new, stats)
+
+    except KeyboardInterrupt:
+        try:
+            write_link_details(link, out_dir=link.link_dir)
+        except:
+            pass
+        raise
+
+    except Exception as err:
+        print('    ! Failed to archive link: {}: {}'.format(err.__class__.__name__, err))
+        raise
+
+    return link
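The control flow of archive_link() is a dispatch over (name, should_save, save) tuples with per-status counters. A rough standalone sketch of that loop with dummy methods (the real extractors take a Link and an out_dir and return ArchiveResult objects):

def should_save_demo(url, out_dir):
    return True

def save_demo(url, out_dir):
    return 'succeeded'   # stand-in: the real save_*() functions return an ArchiveResult

ARCHIVE_METHODS = (
    ('demo', should_save_demo, save_demo),
)

def archive_url(url, out_dir='.', overwrite=False):
    stats = {'skipped': 0, 'succeeded': 0, 'failed': 0}
    history = {}
    for method_name, should_run, method_function in ARCHIVE_METHODS:
        history.setdefault(method_name, [])
        if overwrite or should_run(url, out_dir):
            status = method_function(url, out_dir)
            history[method_name].append(status)
            stats[status] += 1
        else:
            stats['skipped'] += 1
    return history, stats

print(archive_url('https://example.com'))   # ({'demo': ['succeeded']}, {'skipped': 0, 'succeeded': 1, 'failed': 0})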

+ 115 - 0
archivebox/extractors/archive_org.py

@@ -0,0 +1,115 @@
+__package__ = 'archivebox.extractors'
+
+import os
+
+from typing import Optional, List, Dict, Tuple
+from collections import defaultdict
+
+from ..index.schema import Link, ArchiveResult, ArchiveOutput
+from ..util import (
+    enforce_types,
+    TimedProgress,
+    run,
+    PIPE,
+    DEVNULL,
+    is_static_file,
+    ArchiveError,
+    chmod_file,
+)
+from ..config import (
+    VERSION,
+    TIMEOUT,
+    SAVE_ARCHIVE_DOT_ORG,
+    CURL_BINARY,
+    CURL_VERSION,
+    CHECK_SSL_VALIDITY
+)
+
+
+
+@enforce_types
+def should_save_archive_dot_org(link: Link, out_dir: Optional[str]=None) -> bool:
+    out_dir = out_dir or link.link_dir
+    if is_static_file(link.url):
+        return False
+
+    if os.path.exists(os.path.join(out_dir, 'archive.org.txt')):
+        # if open(path, 'r').read().strip() != 'None':
+        return False
+
+    return SAVE_ARCHIVE_DOT_ORG
+
+@enforce_types
+def save_archive_dot_org(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOUT) -> ArchiveResult:
+    """submit site to archive.org for archiving via their service, save returned archive url"""
+
+    out_dir = out_dir or link.link_dir
+    output: ArchiveOutput = 'archive.org.txt'
+    archive_org_url = None
+    submit_url = 'https://web.archive.org/save/{}'.format(link.url)
+    cmd = [
+        CURL_BINARY,
+        '--location',
+        '--head',
+        '--user-agent', 'ArchiveBox/{} (+https://github.com/pirate/ArchiveBox/)'.format(VERSION),  # be nice to the Archive.org people and show them where all this ArchiveBox traffic is coming from
+        '--max-time', str(timeout),
+        *([] if CHECK_SSL_VALIDITY else ['--insecure']),
+        submit_url,
+    ]
+    status = 'succeeded'
+    timer = TimedProgress(timeout, prefix='      ')
+    try:
+        result = run(cmd, stdout=PIPE, stderr=DEVNULL, cwd=out_dir, timeout=timeout)
+        content_location, errors = parse_archive_dot_org_response(result.stdout)
+        if content_location:
+            archive_org_url = 'https://web.archive.org{}'.format(content_location[0])
+        elif len(errors) == 1 and 'RobotAccessControlException' in errors[0]:
+            archive_org_url = None
+            # raise ArchiveError('Archive.org denied by {}/robots.txt'.format(domain(link.url)))
+        elif errors:
+            raise ArchiveError(', '.join(errors))
+        else:
+            raise ArchiveError('Failed to find "content-location" URL header in Archive.org response.')
+    except Exception as err:
+        status = 'failed'
+        output = err
+    finally:
+        timer.end()
+
+    if output and not isinstance(output, Exception):
+        # instead of writing None when archive.org rejects the url write the
+        # url to resubmit it to archive.org. This is so when the user visits
+        # the URL in person, it will attempt to re-archive it, and it'll show the
+        # nicer error message explaining why the url was rejected if it fails.
+        archive_org_url = archive_org_url or submit_url
+        with open(os.path.join(out_dir, str(output)), 'w', encoding='utf-8') as f:
+            f.write(archive_org_url)
+        chmod_file('archive.org.txt', cwd=out_dir)
+        output = archive_org_url
+
+    return ArchiveResult(
+        cmd=cmd,
+        pwd=out_dir,
+        cmd_version=CURL_VERSION,
+        output=output,
+        status=status,
+        **timer.stats,
+    )
+
+@enforce_types
+def parse_archive_dot_org_response(response: bytes) -> Tuple[List[str], List[str]]:
+    # Parse archive.org response headers
+    headers: Dict[str, List[str]] = defaultdict(list)
+
+    # lowercase all the header names and store in dict
+    for header in response.splitlines():
+        if b':' not in header or not header.strip():
+            continue
+        name, val = header.decode().split(':', 1)
+        headers[name.lower().strip()].append(val.strip())
+
+    # Get successful archive url in "content-location" header or any errors
+    content_location = headers['content-location']
+    errors = headers['x-archive-wayback-runtime-error']
+    return content_location, errors
+
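parse_archive_dot_org_response() above only folds raw HTTP header bytes into a lowercased multi-dict, so the same few lines work on any curl --head output. A self-contained check with a canned response (the header values are made up for illustration):

from collections import defaultdict

raw_response = (
    b'HTTP/2 302\r\n'
    b'Content-Location: /web/20190101000000/https://example.com\r\n'
    b'Content-Type: text/html\r\n'
)

headers = defaultdict(list)
for header in raw_response.splitlines():
    if b':' not in header or not header.strip():
        continue
    name, val = header.decode().split(':', 1)
    headers[name.lower().strip()].append(val.strip())

print(headers['content-location'])                    # ['/web/20190101000000/https://example.com']
print(headers['x-archive-wayback-runtime-error'])     # [] when archive.org reported no errors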

+ 73 - 0
archivebox/extractors/dom.py

@@ -0,0 +1,73 @@
+__package__ = 'archivebox.extractors'
+
+import os
+
+from typing import Optional
+
+from ..index.schema import Link, ArchiveResult, ArchiveOutput
+from ..util import (
+    enforce_types,
+    TimedProgress,
+    run,
+    PIPE,
+    is_static_file,
+    ArchiveError,
+    chrome_args,
+    chmod_file,
+)
+from ..config import (
+    TIMEOUT,
+    SAVE_DOM,
+    CHROME_VERSION,
+)
+
+
+
+@enforce_types
+def should_save_dom(link: Link, out_dir: Optional[str]=None) -> bool:
+    out_dir = out_dir or link.link_dir
+    if is_static_file(link.url):
+        return False
+    
+    if os.path.exists(os.path.join(out_dir, 'output.html')):
+        return False
+
+    return SAVE_DOM
+    
+@enforce_types
+def save_dom(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOUT) -> ArchiveResult:
+    """print HTML of site to file using chrome --dump-html"""
+
+    out_dir = out_dir or link.link_dir
+    output: ArchiveOutput = 'output.html'
+    output_path = os.path.join(out_dir, str(output))
+    cmd = [
+        *chrome_args(TIMEOUT=timeout),
+        '--dump-dom',
+        link.url
+    ]
+    status = 'succeeded'
+    timer = TimedProgress(timeout, prefix='      ')
+    try:
+        with open(output_path, 'w+') as f:
+            result = run(cmd, stdout=f, stderr=PIPE, cwd=out_dir, timeout=timeout)
+
+        if result.returncode:
+            hints = result.stderr.decode()
+            raise ArchiveError('Failed to save DOM', hints)
+
+        chmod_file(output, cwd=out_dir)
+    except Exception as err:
+        status = 'failed'
+        output = err
+    finally:
+        timer.end()
+
+    return ArchiveResult(
+        cmd=cmd,
+        pwd=out_dir,
+        cmd_version=CHROME_VERSION,
+        output=output,
+        status=status,
+        **timer.stats,
+    )
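Outside of ArchiveBox, the same DOM dump can be reproduced with a bare headless Chromium invocation. A rough sketch, assuming a chromium binary is on PATH (the real extractor builds its flag list via chrome_args() and the configured CHROME_BINARY):

import subprocess

url = 'https://example.com'
with open('output.html', 'w') as f:
    result = subprocess.run(
        ['chromium', '--headless', '--disable-gpu', '--dump-dom', url],
        stdout=f, stderr=subprocess.PIPE, timeout=60,
    )
if result.returncode:
    print('Failed to save DOM:', result.stderr.decode()[:200])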

+ 65 - 0
archivebox/extractors/favicon.py

@@ -0,0 +1,65 @@
+__package__ = 'archivebox.extractors'
+
+import os
+
+from typing import Optional
+
+from ..index.schema import Link, ArchiveResult, ArchiveOutput
+from ..util import (
+    enforce_types,
+    TimedProgress,
+    domain,
+    run,
+    PIPE,
+    chmod_file,
+)
+from ..config import (
+    TIMEOUT,
+    SAVE_FAVICON,
+    CURL_BINARY,
+    CURL_VERSION,
+    CHECK_SSL_VALIDITY,
+)
+
+
+@enforce_types
+def should_save_favicon(link: Link, out_dir: Optional[str]=None) -> bool:
+    out_dir = out_dir or link.link_dir
+    if os.path.exists(os.path.join(out_dir, 'favicon.ico')):
+        return False
+
+    return SAVE_FAVICON
+    
+@enforce_types
+def save_favicon(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOUT) -> ArchiveResult:
+    """download site favicon from google's favicon api"""
+
+    out_dir = out_dir or link.link_dir
+    output: ArchiveOutput = 'favicon.ico'
+    cmd = [
+        CURL_BINARY,
+        '--max-time', str(timeout),
+        '--location',
+        '--output', str(output),
+        *([] if CHECK_SSL_VALIDITY else ['--insecure']),
+        'https://www.google.com/s2/favicons?domain={}'.format(domain(link.url)),
+    ]
+    status = 'succeeded'
+    timer = TimedProgress(timeout, prefix='      ')
+    try:
+        run(cmd, stdout=PIPE, stderr=PIPE, cwd=out_dir, timeout=timeout)
+        chmod_file(output, cwd=out_dir)
+    except Exception as err:
+        status = 'failed'
+        output = err
+    finally:
+        timer.end()
+
+    return ArchiveResult(
+        cmd=cmd,
+        pwd=out_dir,
+        cmd_version=CURL_VERSION,
+        output=output,
+        status=status,
+        **timer.stats,
+    )
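save_favicon() is essentially a curl against Google's s2/favicons endpoint. The same request in pure Python, as a sketch (network access assumed, no SSL or retry handling beyond the basics):

import urllib.request
from urllib.parse import urlparse

url = 'https://example.com/some/page'
favicon_api = 'https://www.google.com/s2/favicons?domain={}'.format(urlparse(url).netloc)

with urllib.request.urlopen(favicon_api, timeout=60) as resp, open('favicon.ico', 'wb') as f:
    f.write(resp.read())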

+ 94 - 0
archivebox/extractors/git.py

@@ -0,0 +1,94 @@
+__package__ = 'archivebox.extractors'
+
+import os
+
+from typing import Optional
+
+from ..index.schema import Link, ArchiveResult, ArchiveOutput
+from ..util import (
+    enforce_types,
+    TimedProgress,
+    run,
+    PIPE,
+    is_static_file,
+    ArchiveError,
+    chmod_file,
+    domain,
+    extension,
+    without_query,
+    without_fragment,
+)
+from ..config import (
+    TIMEOUT,
+    SAVE_GIT,
+    GIT_BINARY,
+    GIT_VERSION,
+    GIT_DOMAINS,
+    CHECK_SSL_VALIDITY
+)
+
+
+
+@enforce_types
+def should_save_git(link: Link, out_dir: Optional[str]=None) -> bool:
+    out_dir = out_dir or link.link_dir
+    if is_static_file(link.url):
+        return False
+
+    if os.path.exists(os.path.join(out_dir, 'git')):
+        return False
+
+    is_clonable_url = (
+        (domain(link.url) in GIT_DOMAINS)
+        or (extension(link.url) == 'git')
+    )
+    if not is_clonable_url:
+        return False
+
+    return SAVE_GIT
+
+
+@enforce_types
+def save_git(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOUT) -> ArchiveResult:
+    """download full site using git"""
+
+    out_dir = out_dir or link.link_dir
+    output: ArchiveOutput = 'git'
+    output_path = os.path.join(out_dir, str(output))
+    os.makedirs(output_path, exist_ok=True)
+    cmd = [
+        GIT_BINARY,
+        'clone',
+        '--mirror',
+        '--recursive',
+        *([] if CHECK_SSL_VALIDITY else ['-c', 'http.sslVerify=false']),
+        without_query(without_fragment(link.url)),
+    ]
+    status = 'succeeded'
+    timer = TimedProgress(timeout, prefix='      ')
+    try:
+        result = run(cmd, stdout=PIPE, stderr=PIPE, cwd=output_path, timeout=timeout + 1)
+
+        if result.returncode == 128:
+            # ignore failed re-download when the folder already exists
+            pass
+        elif result.returncode > 0:
+            hints = 'Got git response code: {}.'.format(result.returncode)
+            raise ArchiveError('Failed to save git clone', hints)
+
+        chmod_file(output, cwd=out_dir)
+
+    except Exception as err:
+        status = 'failed'
+        output = err
+    finally:
+        timer.end()
+
+    return ArchiveResult(
+        cmd=cmd,
+        pwd=out_dir,
+        cmd_version=GIT_VERSION,
+        output=output,
+        status=status,
+        **timer.stats,
+    )
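The clonability test in should_save_git() reduces to a domain allow-list plus a .git extension check. A standalone sketch of the same predicate, with the GIT_DOMAINS default taken from the ArchiveBox.conf example deleted further down and the URL helpers re-implemented with urllib.parse (the real util helpers may differ slightly):

from urllib.parse import urlparse

GIT_DOMAINS = {'github.com', 'bitbucket.org', 'gitlab.com'}

def domain(url):
    return urlparse(url).netloc

def extension(url):
    filename = urlparse(url).path.rsplit('/', 1)[-1]
    return filename.rsplit('.', 1)[-1].lower() if '.' in filename else ''

def is_clonable(url):
    return domain(url) in GIT_DOMAINS or extension(url) == 'git'

print(is_clonable('https://github.com/pirate/ArchiveBox'))   # True  (domain match)
print(is_clonable('https://example.com/repo.git'))           # True  (.git extension)
print(is_clonable('https://example.com/article.html'))       # False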

+ 100 - 0
archivebox/extractors/media.py

@@ -0,0 +1,100 @@
+__package__ = 'archivebox.extractors'
+
+import os
+
+from typing import Optional
+
+from ..index.schema import Link, ArchiveResult, ArchiveOutput
+from ..util import (
+    enforce_types,
+    TimedProgress,
+    run,
+    PIPE,
+    is_static_file,
+    ArchiveError,
+    chmod_file,
+)
+from ..config import (
+    MEDIA_TIMEOUT,
+    SAVE_MEDIA,
+    YOUTUBEDL_BINARY,
+    YOUTUBEDL_VERSION,
+    CHECK_SSL_VALIDITY
+)
+
+
+@enforce_types
+def should_save_media(link: Link, out_dir: Optional[str]=None) -> bool:
+    out_dir = out_dir or link.link_dir
+
+    if is_static_file(link.url):
+        return False
+
+    if os.path.exists(os.path.join(out_dir, 'media')):
+        return False
+
+    return SAVE_MEDIA
+
+@enforce_types
+def save_media(link: Link, out_dir: Optional[str]=None, timeout: int=MEDIA_TIMEOUT) -> ArchiveResult:
+    """Download playlists or individual video, audio, and subtitles using youtube-dl"""
+
+    out_dir = out_dir or link.link_dir
+    output: ArchiveOutput = 'media'
+    output_path = os.path.join(out_dir, str(output))
+    os.makedirs(output_path, exist_ok=True)
+    cmd = [
+        YOUTUBEDL_BINARY,
+        '--write-description',
+        '--write-info-json',
+        '--write-annotations',
+        '--yes-playlist',
+        '--write-thumbnail',
+        '--no-call-home',
+        '--no-check-certificate',
+        '--user-agent',
+        '--all-subs',
+        '--extract-audio',
+        '--keep-video',
+        '--ignore-errors',
+        '--geo-bypass',
+        '--audio-format', 'mp3',
+        '--audio-quality', '320K',
+        '--embed-thumbnail',
+        '--add-metadata',
+        *([] if CHECK_SSL_VALIDITY else ['--no-check-certificate']),
+        link.url,
+    ]
+    status = 'succeeded'
+    timer = TimedProgress(timeout, prefix='      ')
+    try:
+        result = run(cmd, stdout=PIPE, stderr=PIPE, cwd=output_path, timeout=timeout + 1)
+        chmod_file(output, cwd=out_dir)
+        if result.returncode:
+            if (b'ERROR: Unsupported URL' in result.stderr
+                or b'HTTP Error 404' in result.stderr
+                or b'HTTP Error 403' in result.stderr
+                or b'URL could be a direct video link' in result.stderr
+                or b'Unable to extract container ID' in result.stderr):
+                # These happen too frequently on non-media pages to warrant printing to console
+                pass
+            else:
+                hints = (
+                    'Got youtube-dl response code: {}.'.format(result.returncode),
+                    *result.stderr.decode().split('\n'),
+                )
+                raise ArchiveError('Failed to save media', hints)
+    except Exception as err:
+        status = 'failed'
+        output = err
+    finally:
+        timer.end()
+
+    return ArchiveResult(
+        cmd=cmd,
+        pwd=out_dir,
+        cmd_version=YOUTUBEDL_VERSION,
+        output=output,
+        status=status,
+        **timer.stats,
+    )

+ 72 - 0
archivebox/extractors/pdf.py

@@ -0,0 +1,72 @@
+__package__ = 'archivebox.extractors'
+
+import os
+
+from typing import Optional
+
+from ..index.schema import Link, ArchiveResult, ArchiveOutput
+from ..util import (
+    enforce_types,
+    TimedProgress,
+    run,
+    PIPE,
+    is_static_file,
+    ArchiveError,
+    chrome_args,
+    chmod_file,
+)
+from ..config import (
+    TIMEOUT,
+    SAVE_PDF,
+    CHROME_VERSION,
+)
+
+
+
+@enforce_types
+def should_save_pdf(link: Link, out_dir: Optional[str]=None) -> bool:
+    out_dir = out_dir or link.link_dir
+    if is_static_file(link.url):
+        return False
+    
+    if os.path.exists(os.path.join(out_dir, 'output.pdf')):
+        return False
+
+    return SAVE_PDF
+
+
+@enforce_types
+def save_pdf(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOUT) -> ArchiveResult:
+    """print PDF of site to file using chrome --headless"""
+
+    out_dir = out_dir or link.link_dir
+    output: ArchiveOutput = 'output.pdf'
+    cmd = [
+        *chrome_args(TIMEOUT=timeout),
+        '--print-to-pdf',
+        link.url,
+    ]
+    status = 'succeeded'
+    timer = TimedProgress(timeout, prefix='      ')
+    try:
+        result = run(cmd, stdout=PIPE, stderr=PIPE, cwd=out_dir, timeout=timeout)
+
+        if result.returncode:
+            hints = (result.stderr or result.stdout).decode()
+            raise ArchiveError('Failed to save PDF', hints)
+        
+        chmod_file('output.pdf', cwd=out_dir)
+    except Exception as err:
+        status = 'failed'
+        output = err
+    finally:
+        timer.end()
+
+    return ArchiveResult(
+        cmd=cmd,
+        pwd=out_dir,
+        cmd_version=CHROME_VERSION,
+        output=output,
+        status=status,
+        **timer.stats,
+    )

+ 71 - 0
archivebox/extractors/screenshot.py

@@ -0,0 +1,71 @@
+__package__ = 'archivebox.extractors'
+
+import os
+
+from typing import Optional
+
+from ..index.schema import Link, ArchiveResult, ArchiveOutput
+from ..util import (
+    enforce_types,
+    TimedProgress,
+    run,
+    PIPE,
+    is_static_file,
+    ArchiveError,
+    chrome_args,
+    chmod_file,
+)
+from ..config import (
+    TIMEOUT,
+    SAVE_SCREENSHOT,
+    CHROME_VERSION,
+)
+
+
+
+@enforce_types
+def should_save_screenshot(link: Link, out_dir: Optional[str]=None) -> bool:
+    out_dir = out_dir or link.link_dir
+    if is_static_file(link.url):
+        return False
+    
+    if os.path.exists(os.path.join(out_dir, 'screenshot.png')):
+        return False
+
+    return SAVE_SCREENSHOT
+
+@enforce_types
+def save_screenshot(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOUT) -> ArchiveResult:
+    """take screenshot of site using chrome --headless"""
+    
+    out_dir = out_dir or link.link_dir
+    output: ArchiveOutput = 'screenshot.png'
+    cmd = [
+        *chrome_args(TIMEOUT=timeout),
+        '--screenshot',
+        link.url,
+    ]
+    status = 'succeeded'
+    timer = TimedProgress(timeout, prefix='      ')
+    try:
+        result = run(cmd, stdout=PIPE, stderr=PIPE, cwd=out_dir, timeout=timeout)
+
+        if result.returncode:
+            hints = (result.stderr or result.stdout).decode()
+            raise ArchiveError('Failed to save screenshot', hints)
+
+        chmod_file(output, cwd=out_dir)
+    except Exception as err:
+        status = 'failed'
+        output = err
+    finally:
+        timer.end()
+
+    return ArchiveResult(
+        cmd=cmd,
+        pwd=out_dir,
+        cmd_version=CHROME_VERSION,
+        output=output,
+        status=status,
+        **timer.stats,
+    )

+ 63 - 0
archivebox/extractors/title.py

@@ -0,0 +1,63 @@
+__package__ = 'archivebox.extractors'
+
+from typing import Optional
+
+from ..index.schema import Link, ArchiveResult, ArchiveOutput
+from ..util import (
+    enforce_types,
+    TimedProgress,
+    is_static_file,
+    ArchiveError,
+    fetch_page_title,
+)
+from ..config import (
+    TIMEOUT,
+    SAVE_TITLE,
+    CURL_BINARY,
+    CURL_VERSION,
+)
+
+
+@enforce_types
+def should_save_title(link: Link, out_dir: Optional[str]=None) -> bool:
+    # if link already has valid title, skip it
+    if link.title and not link.title.lower().startswith('http'):
+        return False
+
+    if is_static_file(link.url):
+        return False
+
+    return SAVE_TITLE
+
+@enforce_types
+def save_title(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOUT) -> ArchiveResult:
+    """try to guess the page's title from its content"""
+
+    output: ArchiveOutput = None
+    cmd = [
+        CURL_BINARY,
+        link.url,
+        '|',
+        'grep',
+        '<title',
+    ]
+    status = 'succeeded'
+    timer = TimedProgress(timeout, prefix='      ')
+    try:
+        output = fetch_page_title(link.url, timeout=timeout, progress=False)
+        if not output:
+            raise ArchiveError('Unable to detect page title')
+    except Exception as err:
+        status = 'failed'
+        output = err
+    finally:
+        timer.end()
+
+    return ArchiveResult(
+        cmd=cmd,
+        pwd=out_dir,
+        cmd_version=CURL_VERSION,
+        output=output,
+        status=status,
+        **timer.stats,
+    )
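save_title() delegates the real work to util.fetch_page_title(). A very rough standalone stand-in using urllib and a regex, just to show the idea (the actual helper may handle encodings, redirects, and progress reporting differently):

import re
import urllib.request

def fetch_page_title(url, timeout=60):
    """rough stand-in: download the page and pull the first <title> tag"""
    req = urllib.request.Request(url, headers={'User-Agent': 'Mozilla/5.0'})
    with urllib.request.urlopen(req, timeout=timeout) as resp:
        html = resp.read().decode('utf-8', errors='replace')
    match = re.search(r'<title[^>]*>([^<]*)</title>', html, re.IGNORECASE)
    return match.group(1).strip() if match else None

print(fetch_page_title('https://example.com'))   # 'Example Domain'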

+ 123 - 0
archivebox/extractors/wget.py

@@ -0,0 +1,123 @@
+__package__ = 'archivebox.extractors'
+
+import os
+
+from typing import Optional
+from datetime import datetime
+
+from ..index.schema import Link, ArchiveResult, ArchiveOutput
+from ..util import (
+    enforce_types,
+    TimedProgress,
+    run,
+    PIPE,
+    wget_output_path,
+    ArchiveError,
+)
+from ..config import (
+    TIMEOUT,
+    SAVE_WGET,
+    SAVE_WARC,
+    WGET_BINARY,
+    WGET_VERSION,
+    CHECK_SSL_VALIDITY,
+    SAVE_WGET_REQUISITES,
+    WGET_AUTO_COMPRESSION,
+    WGET_USER_AGENT,
+    COOKIES_FILE,
+)
+
+
+
+@enforce_types
+def should_save_wget(link: Link, out_dir: Optional[str]=None) -> bool:
+    output_path = wget_output_path(link)
+    out_dir = out_dir or link.link_dir
+    if output_path and os.path.exists(os.path.join(out_dir, output_path)):
+        return False
+
+    return SAVE_WGET
+
+
+@enforce_types
+def save_wget(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOUT) -> ArchiveResult:
+    """download full site using wget"""
+
+    out_dir = out_dir or link.link_dir
+    if SAVE_WARC:
+        warc_dir = os.path.join(out_dir, 'warc')
+        os.makedirs(warc_dir, exist_ok=True)
+        warc_path = os.path.join('warc', str(int(datetime.now().timestamp())))
+
+    # WGET CLI Docs: https://www.gnu.org/software/wget/manual/wget.html
+    output: ArchiveOutput = None
+    cmd = [
+        WGET_BINARY,
+        # '--server-response',  # print headers for better error parsing
+        '--no-verbose',
+        '--adjust-extension',
+        '--convert-links',
+        '--force-directories',
+        '--backup-converted',
+        '--span-hosts',
+        '--no-parent',
+        '-e', 'robots=off',
+        '--restrict-file-names=windows',
+        '--timeout={}'.format(timeout),
+        *([] if SAVE_WARC else ['--timestamping']),
+        *(['--warc-file={}'.format(warc_path)] if SAVE_WARC else []),
+        *(['--page-requisites'] if SAVE_WGET_REQUISITES else []),
+        *(['--user-agent={}'.format(WGET_USER_AGENT)] if WGET_USER_AGENT else []),
+        *(['--load-cookies', COOKIES_FILE] if COOKIES_FILE else []),
+        *(['--compression=auto'] if WGET_AUTO_COMPRESSION else []),
+        *([] if CHECK_SSL_VALIDITY else ['--no-check-certificate', '--no-hsts']),
+        link.url,
+    ]
+    status = 'succeeded'
+    timer = TimedProgress(timeout, prefix='      ')
+    try:
+        result = run(cmd, stdout=PIPE, stderr=PIPE, cwd=out_dir, timeout=timeout)
+        output = wget_output_path(link)
+
+        # parse out number of files downloaded from last line of stderr:
+        #  "Downloaded: 76 files, 4.0M in 1.6s (2.52 MB/s)"
+        output_tail = [
+            line.strip()
+            for line in (result.stdout + result.stderr).decode().rsplit('\n', 3)[-3:]
+            if line.strip()
+        ]
+        files_downloaded = (
+            int(output_tail[-1].strip().split(' ', 2)[1] or 0)
+            if 'Downloaded:' in output_tail[-1]
+            else 0
+        )
+
+        # Check for common failure cases
+        if result.returncode > 0 and files_downloaded < 1:
+            hints = (
+                'Got wget response code: {}.'.format(result.returncode),
+                *output_tail,
+            )
+            if b'403: Forbidden' in result.stderr:
+                raise ArchiveError('403 Forbidden (try changing WGET_USER_AGENT)', hints)
+            if b'404: Not Found' in result.stderr:
+                raise ArchiveError('404 Not Found', hints)
+            if b'ERROR 500: Internal Server Error' in result.stderr:
+                raise ArchiveError('500 Internal Server Error', hints)
+            raise ArchiveError('Got an error from the server', hints)
+
+        # chmod_file(output, cwd=out_dir)
+    except Exception as err:
+        status = 'failed'
+        output = err
+    finally:
+        timer.end()
+
+    return ArchiveResult(
+        cmd=cmd,
+        pwd=out_dir,
+        cmd_version=WGET_VERSION,
+        output=output,
+        status=status,
+        **timer.stats,
+    )
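The files_downloaded parsing in save_wget() is easy to sanity-check in isolation against the sample line quoted in its comment:

stderr_text = 'Downloaded: 76 files, 4.0M in 1.6s (2.52 MB/s)'

output_tail = [line.strip() for line in stderr_text.rsplit('\n', 3)[-3:] if line.strip()]
files_downloaded = (
    int(output_tail[-1].strip().split(' ', 2)[1] or 0)
    if 'Downloaded:' in output_tail[-1]
    else 0
)
print(files_downloaded)   # 76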

+ 277 - 32
archivebox/legacy/index.py → archivebox/index/__init__.py

@@ -1,14 +1,25 @@
-__package__ = 'archivebox.legacy'
+__package__ = 'archivebox.index'
 
+import re
 import os
-import json
+import shutil
+import json as pyjson
 
-from typing import List, Tuple, Optional, Iterable
+from itertools import chain
+from typing import List, Tuple, Dict, Optional, Iterable
 from collections import OrderedDict
 from contextlib import contextmanager
 
-from .schema import Link, ArchiveResult
-from .config import (
+from ..parsers import parse_links
+from ..util import (
+    scheme,
+    enforce_types,
+    TimedProgress,
+    atomic_write,
+    ExtendedEncoder,
+)
+from ..config import (
+    ARCHIVE_DIR_NAME,
     SQL_INDEX_FILENAME,
     JSON_INDEX_FILENAME,
     HTML_INDEX_FILENAME,
@@ -18,33 +29,30 @@ from .config import (
     ANSI,
     stderr,
 )
-from .storage.html import write_html_main_index, write_html_link_details
-from .storage.json import (
+from ..cli.logging import (
+    log_indexing_process_started,
+    log_indexing_process_finished,
+    log_indexing_started,
+    log_indexing_finished,
+    log_parsing_started,
+    log_parsing_finished,
+)
+
+from .schema import Link, ArchiveResult
+from .html import (
+    write_html_main_index,
+    write_html_link_details,
+)
+from .json import (
     parse_json_main_index,
     write_json_main_index,
     parse_json_link_details, 
     write_json_link_details,
 )
-from .storage.sql import (
+from .sql import (
     write_sql_main_index,
     parse_sql_main_index,
 )
-from .util import (
-    scheme,
-    enforce_types,
-    TimedProgress,
-    atomic_write,
-    ExtendedEncoder,
-)
-from .parse import parse_links
-from .logs import (
-    log_indexing_process_started,
-    log_indexing_process_finished,
-    log_indexing_started,
-    log_indexing_finished,
-    log_parsing_started,
-    log_parsing_finished,
-)
 
 ### Link filtering and checking
 
@@ -95,11 +103,11 @@ def merge_links(a: Link, b: Link) -> Link:
     }
     for method in all_methods:
         deduped_jsons = {
-            json.dumps(result, sort_keys=True, cls=ExtendedEncoder)
+            pyjson.dumps(result, sort_keys=True, cls=ExtendedEncoder)
             for result in history[method]
         }
         history[method] = list(reversed(sorted(
-            (ArchiveResult.from_json(json.loads(result)) for result in deduped_jsons),
+            (ArchiveResult.from_json(pyjson.loads(result)) for result in deduped_jsons),
             key=lambda result: result.start_ts,
         )))
 
@@ -114,7 +122,7 @@ def merge_links(a: Link, b: Link) -> Link:
 
 
 @enforce_types
-def validate_links(links: Iterable[Link]) -> Iterable[Link]:
+def validate_links(links: Iterable[Link]) -> List[Link]:
     links = archivable_links(links)  # remove chrome://, about:, mailto: etc.
     links = sorted_links(links)      # deterministically sort the links based on timestamp, url
     links = uniquefied_links(links)  # merge/dedupe duplicate timestamps & urls
@@ -128,7 +136,7 @@ def validate_links(links: Iterable[Link]) -> Iterable[Link]:
         stderr('        archivebox help')
         raise SystemExit(1)
 
-    return links
+    return list(links)
 
 
 @enforce_types
@@ -259,23 +267,32 @@ def load_main_index_meta(out_dir: str=OUTPUT_DIR) -> Optional[dict]:
     index_path = os.path.join(out_dir, JSON_INDEX_FILENAME)
     if os.path.exists(index_path):
         with open(index_path, 'r', encoding='utf-8') as f:
-            meta_dict = json.load(f)
+            meta_dict = pyjson.load(f)
             meta_dict.pop('links')
             return meta_dict
 
     return None
 
 @enforce_types
-def import_new_links(existing_links: List[Link], import_path: str) -> Tuple[List[Link], List[Link]]:
+def import_new_links(existing_links: List[Link],
+                     import_path: str,
+                     out_dir: str=OUTPUT_DIR) -> Tuple[List[Link], List[Link]]:
+
     new_links: List[Link] = []
 
     # parse and validate the import file
     log_parsing_started(import_path)
     raw_links, parser_name = parse_links(import_path)
-    new_links = list(validate_links(raw_links))
+    new_links = validate_links(raw_links)
 
     # merge existing links in out_dir and new links
-    all_links = list(validate_links(existing_links + new_links))
+    all_links = validate_links(existing_links + new_links)
+    all_link_urls = {link.url for link in existing_links}
+
+    new_links = [
+        link for link in new_links
+        if link.url not in all_link_urls
+    ]
 
     if parser_name:
         num_parsed = len(raw_links)
@@ -345,3 +362,231 @@ def load_link_details(link: Link, out_dir: Optional[str]=None) -> Link:
         return merge_links(existing_link, link)
 
     return link
+
+
+
+LINK_FILTERS = {
+    'exact': lambda link, pattern: (link.url == pattern) or (link.base_url == pattern),
+    'substring': lambda link, pattern: pattern in link.url,
+    'regex': lambda link, pattern: bool(re.match(pattern, link.url)),
+    'domain': lambda link, pattern: link.domain == pattern,
+}
+
+@enforce_types
+def link_matches_filter(link: Link, filter_patterns: List[str], filter_type: str='exact') -> bool:
+    for pattern in filter_patterns:
+        try:
+            if LINK_FILTERS[filter_type](link, pattern):
+                return True
+        except Exception:
+            stderr()
+            stderr(
+                f'[X] Got invalid pattern for --filter-type={filter_type}:',
+                color='red',
+            )
+            stderr(f'    {pattern}')
+            raise SystemExit(2)
+
+    return False
+
+
+def get_indexed_folders(links, out_dir: str=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
+    """indexed links without checking archive status or data directory validity"""
+    return {
+        link.link_dir: link
+        for link in links
+    }
+
+def get_archived_folders(links, out_dir: str=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
+    """indexed links that are archived with a valid data directory"""
+    return {
+        link.link_dir: link
+        for link in filter(is_archived, links)
+    }
+
+def get_unarchived_folders(links, out_dir: str=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
+    """indexed links that are unarchived with no data directory or an empty data directory"""
+    return {
+        link.link_dir: link
+        for link in filter(is_unarchived, links)
+    }
+
+def get_present_folders(links, out_dir: str=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
+    """dirs that are expected to exist based on the main index"""
+    all_folders = {}
+
+    for entry in os.scandir(os.path.join(out_dir, ARCHIVE_DIR_NAME)):
+        if entry.is_dir(follow_symlinks=True):
+            link = None
+            try:
+                link = parse_json_link_details(entry.path)
+            except Exception:
+                pass
+
+            all_folders[entry.path] = link
+
+    return all_folders
+
+def get_valid_folders(links, out_dir: str=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
+    """dirs with a valid index matched to the main index and archived content"""
+    return {
+        link.link_dir: link
+        for link in filter(is_valid, links)
+    }
+
+def get_invalid_folders(links, out_dir: str=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
+    """dirs that are invalid for any reason: corrupted/duplicate/orphaned/unrecognized"""
+    duplicate = get_duplicate_folders(links, out_dir=OUTPUT_DIR)
+    orphaned = get_orphaned_folders(links, out_dir=OUTPUT_DIR)
+    corrupted = get_corrupted_folders(links, out_dir=OUTPUT_DIR)
+    unrecognized = get_unrecognized_folders(links, out_dir=OUTPUT_DIR)
+    return {**duplicate, **orphaned, **corrupted, **unrecognized}
+
+
+def get_duplicate_folders(links, out_dir: str=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
+    """dirs that conflict with other directories that have the same link URL or timestamp"""
+    links = list(links)
+    by_url = {link.url: 0 for link in links}
+    by_timestamp = {link.timestamp: 0 for link in links}
+
+    duplicate_folders = {}
+
+    indexed_folders = {link.link_dir for link in links}
+    data_folders = (
+        entry.path
+        for entry in os.scandir(os.path.join(out_dir, ARCHIVE_DIR_NAME))
+        if entry.is_dir(follow_symlinks=True) and entry.path not in indexed_folders
+    )
+
+    for path in chain(sorted(indexed_folders), sorted(data_folders)):
+        link = None
+        try:
+            link = parse_json_link_details(path)
+        except Exception:
+            pass
+
+        if link:
+            # link folder has same timestamp as different link folder
+            by_timestamp[link.timestamp] = by_timestamp.get(link.timestamp, 0) + 1
+            if by_timestamp[link.timestamp] > 1:
+                duplicate_folders[path] = link
+
+            # link folder has same url as different link folder
+            by_url[link.url] = by_url.get(link.url, 0) + 1
+            if by_url[link.url] > 1:
+                duplicate_folders[path] = link
+
+    return duplicate_folders
+
+def get_orphaned_folders(links, out_dir: str=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
+    """dirs that contain a valid index but aren't listed in the main index"""
+    links = list(links)
+    indexed_folders = {link.link_dir: link for link in links}
+    orphaned_folders = {}
+
+    for entry in os.scandir(os.path.join(out_dir, ARCHIVE_DIR_NAME)):
+        if entry.is_dir(follow_symlinks=True):
+            link = None
+            try:
+                link = parse_json_link_details(entry.path)
+            except Exception:
+                pass
+
+            if link and entry.path not in indexed_folders:
+                # folder is a valid link data dir with index details, but it's not in the main index
+                orphaned_folders[entry.path] = link
+
+    return orphaned_folders
+
+def get_corrupted_folders(links, out_dir: str=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
+    """dirs that don't contain a valid index and aren't listed in the main index"""
+    return {
+        link.link_dir: link
+        for link in filter(is_corrupt, links)
+    }
+
+def get_unrecognized_folders(links, out_dir: str=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
+    """dirs that don't contain recognizable archive data and aren't listed in the main index"""
+    by_timestamp = {link.timestamp: 0 for link in links}
+    unrecognized_folders: Dict[str, Optional[Link]] = {}
+
+    for entry in os.scandir(os.path.join(out_dir, ARCHIVE_DIR_NAME)):
+        if entry.is_dir(follow_symlinks=True):
+            index_exists = os.path.exists(os.path.join(entry.path, 'index.json'))
+            link = None
+            try:
+                link = parse_json_link_details(entry.path)
+            except Exception:
+                pass
+
+            if index_exists and link is None:
+                # index exists but it's corrupted or unparseable
+                unrecognized_folders[entry.path] = link
+            
+            elif not index_exists:
+                # link details index doesn't exist and the folder isn't in the main index
+                timestamp = entry.path.rsplit('/', 1)[-1]
+                if timestamp not in by_timestamp:
+                    unrecognized_folders[entry.path] = link
+
+    return unrecognized_folders
+
+
+def is_valid(link: Link) -> bool:
+    dir_exists = os.path.exists(link.link_dir)
+    index_exists = os.path.exists(os.path.join(link.link_dir, 'index.json'))
+    if not dir_exists:
+        # unarchived links are not included in the valid list
+        return False
+    if dir_exists and not index_exists:
+        return False
+    if dir_exists and index_exists:
+        try:
+            parsed_link = parse_json_link_details(link.link_dir)
+            return link.url == parsed_link.url
+        except Exception:
+            pass
+    return False
+
+def is_corrupt(link: Link) -> bool:
+    if not os.path.exists(link.link_dir):
+        # unarchived links are not considered corrupt
+        return False
+
+    if is_valid(link):
+        return False
+
+    return True
+
+def is_archived(link: Link) -> bool:
+    return is_valid(link) and link.is_archived
+    
+def is_unarchived(link: Link) -> bool:
+    if not os.path.exists(link.link_dir):
+        return True
+    return not link.is_archived
+
+
+def fix_invalid_folder_locations(out_dir: str=OUTPUT_DIR) -> Tuple[List[str], List[str]]:
+    fixed = []
+    cant_fix = []
+    for entry in os.scandir(os.path.join(out_dir, ARCHIVE_DIR_NAME)):
+        if entry.is_dir(follow_symlinks=True):
+            if os.path.exists(os.path.join(entry.path, 'index.json')):
+                link = parse_json_link_details(entry.path)
+                if not link:
+                    continue
+
+                if not entry.path.endswith(f'/{link.timestamp}'):
+                    dest = os.path.join(out_dir, ARCHIVE_DIR_NAME, link.timestamp)
+                    if os.path.exists(dest):
+                        cant_fix.append(entry.path)
+                    else:
+                        shutil.move(entry.path, dest)
+                        fixed.append(dest)
+
+                if link.link_dir != entry.path:
+                    link = link.overwrite(link_dir=entry.path)
+                    write_json_link_details(link, out_dir=entry.path)
+
+    return fixed, cant_fix
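The LINK_FILTERS table added above maps each --filter-type to a one-line predicate. A standalone demo with a stand-in Link carrying only the attributes the filters touch (the base_url and domain values are illustrative, not the real property implementations):

import re
from collections import namedtuple

FakeLink = namedtuple('FakeLink', 'url base_url domain')

LINK_FILTERS = {
    'exact':     lambda link, pattern: (link.url == pattern) or (link.base_url == pattern),
    'substring': lambda link, pattern: pattern in link.url,
    'regex':     lambda link, pattern: bool(re.match(pattern, link.url)),
    'domain':    lambda link, pattern: link.domain == pattern,
}

link = FakeLink(
    url='https://example.com/post/1?ref=rss',
    base_url='example.com/post/1',
    domain='example.com',
)

print(LINK_FILTERS['substring'](link, '/post/'))             # True
print(LINK_FILTERS['domain'](link, 'example.com'))           # True
print(LINK_FILTERS['regex'](link, r'https://example\.com'))  # True
print(LINK_FILTERS['exact'](link, 'example.com'))            # False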

+ 13 - 13
archivebox/legacy/storage/html.py → archivebox/index/html.py

@@ -1,11 +1,22 @@
-__package__ = 'archivebox.legacy.storage'
+__package__ = 'archivebox.index'
 
 import os
 
 from datetime import datetime
 from typing import List, Optional, Iterator
 
-from ..schema import Link
+from .schema import Link
+from ..util import (
+    enforce_types,
+    ts_to_date,
+    urlencode,
+    htmlencode,
+    urldecode,
+    wget_output_path,
+    render_template,
+    atomic_write,
+    copy_and_overwrite,
+)
 from ..config import (
     OUTPUT_DIR,
     TEMPLATES_DIR,
@@ -18,17 +29,6 @@ from ..config import (
     ROBOTS_TXT_FILENAME,
     FAVICON_FILENAME,
 )
-from ..util import (
-    enforce_types,
-    ts_to_date,
-    urlencode,
-    htmlencode,
-    urldecode,
-    wget_output_path,
-    render_template,
-    atomic_write,
-    copy_and_overwrite,
-)
 
 join = lambda *paths: os.path.join(*paths)
 MAIN_INDEX_TEMPLATE = join(TEMPLATES_DIR, 'main_index.html')

+ 9 - 9
archivebox/legacy/storage/json.py → archivebox/index/json.py

@@ -1,4 +1,4 @@
-__package__ = 'archivebox.legacy.storage'
+__package__ = 'archivebox.index'
 
 import os
 import sys
@@ -7,7 +7,8 @@ import json
 from datetime import datetime
 from typing import List, Optional, Iterator
 
-from ..schema import Link, ArchiveResult
+from .schema import Link, ArchiveResult
+from ..util import enforce_types, atomic_write
 from ..config import (
     VERSION,
     OUTPUT_DIR,
@@ -17,14 +18,11 @@ from ..config import (
     JSON_INDEX_FILENAME,
     ARCHIVE_DIR_NAME,
 )
-from ..util import (
-    enforce_types,
-    atomic_write,
-)
+
 
 MAIN_INDEX_HEADER = {
     'info': 'This is an index of site data archived by ArchiveBox: The self-hosted web archive.',
-    'schema': 'archivebox.legacy.storage.json',
+    'schema': 'archivebox.index.json',
     'copyright_info': FOOTER_INFO,
     'meta': {
         'project': 'ArchiveBox',
@@ -43,7 +41,7 @@ MAIN_INDEX_HEADER = {
 
 @enforce_types
 def parse_json_main_index(out_dir: str=OUTPUT_DIR) -> Iterator[Link]:
-    """parse a archive index json file and return the list of links"""
+    """parse an archive index json file and return the list of links"""
 
     index_path = os.path.join(out_dir, JSON_INDEX_FILENAME)
     if os.path.exists(index_path):
@@ -110,4 +108,6 @@ def parse_json_links_details(out_dir: str) -> Iterator[Link]:
     for entry in os.scandir(os.path.join(out_dir, ARCHIVE_DIR_NAME)):
         if entry.is_dir(follow_symlinks=True):
             if os.path.exists(os.path.join(entry.path, 'index.json')):
-                yield parse_json_link_details(entry.path)
+                link = parse_json_link_details(entry.path)
+                if link:
+                    yield link

+ 24 - 22
archivebox/legacy/schema.py → archivebox/index/schema.py

@@ -1,3 +1,5 @@
+__package__ = 'archivebox.index'
+
 import os
 
 from datetime import datetime
@@ -48,7 +50,7 @@ class ArchiveResult:
 
     @classmethod
     def from_json(cls, json_info):
-        from .util import parse_date
+        from ..util import parse_date
 
         info = {
             key: val
@@ -60,12 +62,12 @@ class ArchiveResult:
         return cls(**info)
 
     def to_json(self, indent=4, sort_keys=True):
-        from .util import to_json
+        from ..util import to_json
 
         return to_json(self, indent=indent, sort_keys=sort_keys)
 
     def to_csv(self, cols=None, ljust: int=0, separator: str=','):
-        from .util import to_json
+        from ..util import to_json
 
         cols = cols or self.field_names()
         return separator.join(
@@ -115,7 +117,7 @@ class Link:
         return float(self.timestamp) > float(other.timestamp)
 
     def typecheck(self) -> None:
-        from .config import stderr, ANSI
+        from ..config import stderr, ANSI
         try:
             assert self.schema == self.__class__.__name__
             assert isinstance(self.timestamp, str) and self.timestamp
@@ -176,7 +178,7 @@ class Link:
 
     @classmethod
     def from_json(cls, json_info):
-        from .util import parse_date
+        from ..util import parse_date
         
         info = {
             key: val
@@ -200,12 +202,12 @@ class Link:
         return cls(**info)
 
     def to_json(self, indent=4, sort_keys=True):
-        from .util import to_json
+        from ..util import to_json
 
         return to_json(self, indent=indent, sort_keys=sort_keys)
 
     def to_csv(self, csv_cols: List[str], ljust: int=0, separator: str=','):
-        from .util import to_json
+        from ..util import to_json
 
         return separator.join(
             to_json(getattr(self, col), indent=None).ljust(ljust)
@@ -218,60 +220,60 @@ class Link:
 
     @property
     def link_dir(self) -> str:
-        from .config import CONFIG
+        from ..config import CONFIG
         return os.path.join(CONFIG['ARCHIVE_DIR'], self.timestamp)
 
     @property
     def archive_path(self) -> str:
-        from .config import ARCHIVE_DIR_NAME
+        from ..config import ARCHIVE_DIR_NAME
         return '{}/{}'.format(ARCHIVE_DIR_NAME, self.timestamp)
     
     ### URL Helpers
     @property
     def url_hash(self):
-        from .util import hashurl
+        from ..util import hashurl
 
         return hashurl(self.url)
 
     @property
     def scheme(self) -> str:
-        from .util import scheme
+        from ..util import scheme
         return scheme(self.url)
 
     @property
     def extension(self) -> str:
-        from .util import extension
+        from ..util import extension
         return extension(self.url)
 
     @property
     def domain(self) -> str:
-        from .util import domain
+        from ..util import domain
         return domain(self.url)
 
     @property
     def path(self) -> str:
-        from .util import path
+        from ..util import path
         return path(self.url)
 
     @property
     def basename(self) -> str:
-        from .util import basename
+        from ..util import basename
         return basename(self.url)
 
     @property
     def base_url(self) -> str:
-        from .util import base_url
+        from ..util import base_url
         return base_url(self.url)
 
     ### Pretty Printing Helpers
     @property
     def bookmarked_date(self) -> Optional[str]:
-        from .util import ts_to_date
+        from ..util import ts_to_date
         return ts_to_date(self.timestamp) if self.timestamp else None
 
     @property
     def updated_date(self) -> Optional[str]:
-        from .util import ts_to_date
+        from ..util import ts_to_date
         return ts_to_date(self.updated) if self.updated else None
 
     @property
@@ -304,13 +306,13 @@ class Link:
 
     @property
     def is_static(self) -> bool:
-        from .util import is_static_file
+        from ..util import is_static_file
         return is_static_file(self.url)
 
     @property
     def is_archived(self) -> bool:
-        from .config import ARCHIVE_DIR
-        from .util import domain
+        from ..config import ARCHIVE_DIR
+        from ..util import domain
 
         output_paths = (
             domain(self.url),
@@ -352,7 +354,7 @@ class Link:
     def canonical_outputs(self) -> Dict[str, Optional[str]]:
         """predict the expected output paths that should be present after archiving"""
 
-        from .util import wget_output_path
+        from ..util import wget_output_path
         canonical = {
             'index_path': 'index.html',
             'favicon_path': 'favicon.ico',

+ 14 - 4
archivebox/legacy/storage/sql.py → archivebox/index/sql.py

@@ -1,9 +1,9 @@
-__package__ = 'archivebox.legacy.storage'
+__package__ = 'archivebox.index'
 
 from io import StringIO
 from typing import List, Tuple, Iterator
 
-from ..schema import Link
+from .schema import Link
 from ..util import enforce_types
 from ..config import setup_django, OUTPUT_DIR
 
@@ -25,9 +25,19 @@ def write_sql_main_index(links: List[Link], out_dir: str=OUTPUT_DIR) -> None:
     setup_django(out_dir, check_db=True)
     from core.models import Page
 
-    for link in links:
+    all_urls = {link.url: link for link in links}
+
+    for page in Page.objects.all():
+        if page.url in all_urls:
+            info = {k: v for k, v in all_urls.pop(page.url)._asdict().items() if k in Page.keys}
+            Page.objects.update(**info)
+        else:
+            page.delete()
+
+    for url, link in all_urls.items():
         info = {k: v for k, v in link._asdict().items() if k in Page.keys}
-        Page.objects.update_or_create(url=link.url, defaults=info)
+        Page.objects.update_or_create(url=url, defaults=info)
+
 
 
 @enforce_types
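The new write_sql_main_index() aims at a full two-way sync: update rows whose URL is still in the index, delete rows that are not, then create rows for the remaining new links. A Django-free sketch of that per-URL sync, using a plain dict as the stand-in table:

def sync_index(db_rows, links):
    """db_rows: {url: row_dict} stand-in table, links: {url: link_dict} parsed from the main index"""
    remaining = dict(links)

    for url in list(db_rows):
        if url in remaining:
            db_rows[url].update(remaining.pop(url))   # update the existing row in place
        else:
            del db_rows[url]                          # row is no longer in the index

    for url, link in remaining.items():
        db_rows[url] = dict(link)                     # create rows for new links

    return db_rows

db = {'https://old.example.com': {'title': 'stale'}}
index = {'https://example.com': {'title': 'Example Domain'}}
print(sync_index(db, index))   # {'https://example.com': {'title': 'Example Domain'}}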

+ 0 - 58
archivebox/legacy/ArchiveBox.conf

@@ -1,58 +0,0 @@
-# This is the example default configiration file for ArchiveBox.
-# 
-# Copy example config from here into your project's ArchiveBox.conf file,
-# DO NOT EDIT THIS FILE DIRECTLY!
-#
-# See the list of all the possible options. documentation, and examples here:
-#    https://github.com/pirate/ArchiveBox/wiki/Configuration
-
-[GENERAL_CONFIG]
-OUTPUT_PERMISSIONS = 755
-ONLY_NEW = False
-TIMEOUT = 60
-MEDIA_TIMEOUT = 3600
-ACTIVE_THEME = default
-FOOTER_INFO = Content is hosted for personal archiving purposes only. Contact server owner for any takedown requests.
-URL_BLACKLIST = (://(.*\.)?facebook\.com)|(://(.*\.)?ebay\.com)|(.*\.exe$)
-
-[ARCHIVE_METHOD_TOGGLES]
-SAVE_TITLE = True
-SAVE_FAVICON = True
-SAVE_WGET = True
-SAVE_WGET_REQUISITES = True
-SAVE_WARC = True
-SAVE_PDF = True
-SAVE_SCREENSHOT = True
-SAVE_DOM = True
-SAVE_GIT = True
-SAVE_MEDIA = False
-SAVE_ARCHIVE_DOT_ORG = True
-
-
-[ARCHIVE_METHOD_OPTIONS]
-CHECK_SSL_VALIDITY = True
-RESOLUTION = 1440,900
-GIT_DOMAINS = github.com,bitbucket.org,gitlab.com
-
-CROME_HEADLESS = True
-CROME_SANDBOX = True
-
-COOKIES_FILE = path/to/cookies.txt
-CHROME_USER_DATA_DIR = ~/.config/google-chrome/Default
-
-WGET_USER_AGENT = Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.75 Safari/537.36
-CHROME_USER_AGENT = Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.75 Safari/537.36
-
-
-[DEPENDENCY_CONFIG]
-USE_CURL = True
-USE_WGET = True
-USE_CHROME = True
-USE_YOUTUBEDL = True
-USE_GIT = True
-
-CURL_BINARY = curl
-GIT_BINARY = git"
-WGET_BINARY = wget
-YOUTUBEDL_BINARY = youtube-dl
-CHROME_BINARY = chromium

+ 0 - 1
archivebox/legacy/__init__.py

@@ -1 +0,0 @@
-__package__ = 'archivebox.legacy'

+ 0 - 694
archivebox/legacy/archive_methods.py

@@ -1,694 +0,0 @@
-import os
-
-from typing import Dict, List, Tuple, Optional
-from collections import defaultdict
-from datetime import datetime
-
-from .schema import Link, ArchiveResult, ArchiveOutput
-from .index import (
-    load_link_details,
-    write_link_details,
-    patch_main_index,
-)
-from .config import (
-    CURL_BINARY,
-    GIT_BINARY,
-    WGET_BINARY,
-    YOUTUBEDL_BINARY,
-    SAVE_FAVICON,
-    SAVE_TITLE,
-    SAVE_WGET,
-    SAVE_WGET_REQUISITES,
-    SAVE_PDF,
-    SAVE_SCREENSHOT,
-    SAVE_DOM,
-    SAVE_WARC,
-    SAVE_GIT,
-    SAVE_MEDIA,
-    SAVE_ARCHIVE_DOT_ORG,
-    TIMEOUT,
-    MEDIA_TIMEOUT,
-    GIT_DOMAINS,
-    VERSION,
-    WGET_USER_AGENT,
-    CHECK_SSL_VALIDITY,
-    COOKIES_FILE,
-    CURL_VERSION,
-    WGET_VERSION,
-    CHROME_VERSION,
-    GIT_VERSION,
-    YOUTUBEDL_VERSION,
-    WGET_AUTO_COMPRESSION,
-)
-from .util import (
-    enforce_types,
-    domain,
-    extension,
-    without_query,
-    without_fragment,
-    fetch_page_title,
-    is_static_file,
-    TimedProgress,
-    chmod_file,
-    wget_output_path,
-    chrome_args,
-    run, PIPE, DEVNULL,
-)
-from .logs import (
-    log_link_archiving_started,
-    log_link_archiving_finished,
-    log_archive_method_started,
-    log_archive_method_finished,
-)
-
-
-class ArchiveError(Exception):
-    def __init__(self, message, hints=None):
-        super().__init__(message)
-        self.hints = hints
-
-
-@enforce_types
-def archive_link(link: Link, out_dir: Optional[str]=None) -> Link:
-    """download the DOM, PDF, and a screenshot into a folder named after the link's timestamp"""
-
-    ARCHIVE_METHODS = (
-        ('title', should_save_title, save_title),
-        ('favicon', should_save_favicon, save_favicon),
-        ('wget', should_save_wget, save_wget),
-        ('pdf', should_save_pdf, save_pdf),
-        ('screenshot', should_save_screenshot, save_screenshot),
-        ('dom', should_save_dom, save_dom),
-        ('git', should_save_git, save_git),
-        ('media', should_save_media, save_media),
-        ('archive_org', should_save_archive_dot_org, save_archive_dot_org),
-    )
-    
-    out_dir = out_dir or link.link_dir
-    try:
-        is_new = not os.path.exists(out_dir)
-        if is_new:
-            os.makedirs(out_dir)
-
-        link = load_link_details(link, out_dir=out_dir)
-        log_link_archiving_started(link, out_dir, is_new)
-        link = link.overwrite(updated=datetime.now())
-        stats = {'skipped': 0, 'succeeded': 0, 'failed': 0}
-
-        for method_name, should_run, method_function in ARCHIVE_METHODS:
-            try:
-                if method_name not in link.history:
-                    link.history[method_name] = []
-                
-                if should_run(link, out_dir):
-                    log_archive_method_started(method_name)
-
-                    result = method_function(link=link, out_dir=out_dir)
-
-                    link.history[method_name].append(result)
-
-                    stats[result.status] += 1
-                    log_archive_method_finished(result)
-                else:
-                    stats['skipped'] += 1
-            except Exception as e:
-                raise Exception('Exception in archive_methods.save_{}(Link(url={}))'.format(
-                    method_name,
-                    link.url,
-                )) from e
-
-        # print('    ', stats)
-
-        write_link_details(link, out_dir=link.link_dir)
-        patch_main_index(link)
-        
-        # # If any changes were made, update the main links index json and html
-        # was_changed = stats['succeeded'] or stats['failed']
-        # if was_changed:
-        #     patch_main_index(link)
-
-        log_link_archiving_finished(link, link.link_dir, is_new, stats)
-
-    except KeyboardInterrupt:
-        try:
-            write_link_details(link, out_dir=link.link_dir)
-        except:
-            pass
-        raise
-
-    except Exception as err:
-        print('    ! Failed to archive link: {}: {}'.format(err.__class__.__name__, err))
-        raise
-
-    return link
-
-
-### Archive Method Functions
-
-@enforce_types
-def should_save_title(link: Link, out_dir: Optional[str]=None) -> bool:
-    # if link already has valid title, skip it
-    if link.title and not link.title.lower().startswith('http'):
-        return False
-
-    if is_static_file(link.url):
-        return False
-
-    return SAVE_TITLE
-
-@enforce_types
-def save_title(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOUT) -> ArchiveResult:
-    """try to guess the page's title from its content"""
-
-    output: ArchiveOutput = None
-    cmd = [
-        CURL_BINARY,
-        link.url,
-        '|',
-        'grep',
-        '<title',
-    ]
-    status = 'succeeded'
-    timer = TimedProgress(timeout, prefix='      ')
-    try:
-        output = fetch_page_title(link.url, timeout=timeout, progress=False)
-        if not output:
-            raise ArchiveError('Unable to detect page title')
-    except Exception as err:
-        status = 'failed'
-        output = err
-    finally:
-        timer.end()
-
-    return ArchiveResult(
-        cmd=cmd,
-        pwd=out_dir,
-        cmd_version=CURL_VERSION,
-        output=output,
-        status=status,
-        **timer.stats,
-    )
-
-
-@enforce_types
-def should_save_favicon(link: Link, out_dir: Optional[str]=None) -> bool:
-    out_dir = out_dir or link.link_dir
-    if os.path.exists(os.path.join(out_dir, 'favicon.ico')):
-        return False
-
-    return SAVE_FAVICON
-    
-@enforce_types
-def save_favicon(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOUT) -> ArchiveResult:
-    """download site favicon from google's favicon api"""
-
-    out_dir = out_dir or link.link_dir
-    output: ArchiveOutput = 'favicon.ico'
-    cmd = [
-        CURL_BINARY,
-        '--max-time', str(timeout),
-        '--location',
-        '--output', str(output),
-        *([] if CHECK_SSL_VALIDITY else ['--insecure']),
-        'https://www.google.com/s2/favicons?domain={}'.format(domain(link.url)),
-    ]
-    status = 'succeeded'
-    timer = TimedProgress(timeout, prefix='      ')
-    try:
-        run(cmd, stdout=PIPE, stderr=PIPE, cwd=out_dir, timeout=timeout)
-        chmod_file(output, cwd=out_dir)
-    except Exception as err:
-        status = 'failed'
-        output = err
-    finally:
-        timer.end()
-
-    return ArchiveResult(
-        cmd=cmd,
-        pwd=out_dir,
-        cmd_version=CURL_VERSION,
-        output=output,
-        status=status,
-        **timer.stats,
-    )
-
-@enforce_types
-def should_save_wget(link: Link, out_dir: Optional[str]=None) -> bool:
-    output_path = wget_output_path(link)
-    out_dir = out_dir or link.link_dir
-    if output_path and os.path.exists(os.path.join(out_dir, output_path)):
-        return False
-
-    return SAVE_WGET
-
-
-@enforce_types
-def save_wget(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOUT) -> ArchiveResult:
-    """download full site using wget"""
-
-    out_dir = out_dir or link.link_dir
-    if SAVE_WARC:
-        warc_dir = os.path.join(out_dir, 'warc')
-        os.makedirs(warc_dir, exist_ok=True)
-        warc_path = os.path.join('warc', str(int(datetime.now().timestamp())))
-
-    # WGET CLI Docs: https://www.gnu.org/software/wget/manual/wget.html
-    output: ArchiveOutput = None
-    cmd = [
-        WGET_BINARY,
-        # '--server-response',  # print headers for better error parsing
-        '--no-verbose',
-        '--adjust-extension',
-        '--convert-links',
-        '--force-directories',
-        '--backup-converted',
-        '--span-hosts',
-        '--no-parent',
-        '-e', 'robots=off',
-        '--restrict-file-names=windows',
-        '--timeout={}'.format(timeout),
-        *([] if SAVE_WARC else ['--timestamping']),
-        *(['--warc-file={}'.format(warc_path)] if SAVE_WARC else []),
-        *(['--page-requisites'] if SAVE_WGET_REQUISITES else []),
-        *(['--user-agent={}'.format(WGET_USER_AGENT)] if WGET_USER_AGENT else []),
-        *(['--load-cookies', COOKIES_FILE] if COOKIES_FILE else []),
-        *(['--compression=auto'] if WGET_AUTO_COMPRESSION else []),
-        *([] if CHECK_SSL_VALIDITY else ['--no-check-certificate', '--no-hsts']),
-        link.url,
-    ]
-    status = 'succeeded'
-    timer = TimedProgress(timeout, prefix='      ')
-    try:
-        result = run(cmd, stdout=PIPE, stderr=PIPE, cwd=out_dir, timeout=timeout)
-        output = wget_output_path(link)
-
-        # parse out number of files downloaded from last line of stderr:
-        #  "Downloaded: 76 files, 4.0M in 1.6s (2.52 MB/s)"
-        output_tail = [
-            line.strip()
-            for line in (result.stdout + result.stderr).decode().rsplit('\n', 3)[-3:]
-            if line.strip()
-        ]
-        files_downloaded = (
-            int(output_tail[-1].strip().split(' ', 2)[1] or 0)
-            if 'Downloaded:' in output_tail[-1]
-            else 0
-        )
-
-        # Check for common failure cases
-        if result.returncode > 0 and files_downloaded < 1:
-            hints = (
-                'Got wget response code: {}.'.format(result.returncode),
-                *output_tail,
-            )
-            if b'403: Forbidden' in result.stderr:
-                raise ArchiveError('403 Forbidden (try changing WGET_USER_AGENT)', hints)
-            if b'404: Not Found' in result.stderr:
-                raise ArchiveError('404 Not Found', hints)
-            if b'ERROR 500: Internal Server Error' in result.stderr:
-                raise ArchiveError('500 Internal Server Error', hints)
-            raise ArchiveError('Got an error from the server', hints)
-    except Exception as err:
-        status = 'failed'
-        output = err
-    finally:
-        timer.end()
-
-    return ArchiveResult(
-        cmd=cmd,
-        pwd=out_dir,
-        cmd_version=WGET_VERSION,
-        output=output,
-        status=status,
-        **timer.stats,
-    )
-
-@enforce_types
-def should_save_pdf(link: Link, out_dir: Optional[str]=None) -> bool:
-    out_dir = out_dir or link.link_dir
-    if is_static_file(link.url):
-        return False
-    
-    if os.path.exists(os.path.join(out_dir, 'output.pdf')):
-        return False
-
-    return SAVE_PDF
-
-
-@enforce_types
-def save_pdf(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOUT) -> ArchiveResult:
-    """print PDF of site to file using chrome --headless"""
-
-    out_dir = out_dir or link.link_dir
-    output: ArchiveOutput = 'output.pdf'
-    cmd = [
-        *chrome_args(TIMEOUT=timeout),
-        '--print-to-pdf',
-        link.url,
-    ]
-    status = 'succeeded'
-    timer = TimedProgress(timeout, prefix='      ')
-    try:
-        result = run(cmd, stdout=PIPE, stderr=PIPE, cwd=out_dir, timeout=timeout)
-
-        if result.returncode:
-            hints = (result.stderr or result.stdout).decode()
-            raise ArchiveError('Failed to save PDF', hints)
-        
-        chmod_file('output.pdf', cwd=out_dir)
-    except Exception as err:
-        status = 'failed'
-        output = err
-    finally:
-        timer.end()
-
-    return ArchiveResult(
-        cmd=cmd,
-        pwd=out_dir,
-        cmd_version=CHROME_VERSION,
-        output=output,
-        status=status,
-        **timer.stats,
-    )
-
-@enforce_types
-def should_save_screenshot(link: Link, out_dir: Optional[str]=None) -> bool:
-    out_dir = out_dir or link.link_dir
-    if is_static_file(link.url):
-        return False
-    
-    if os.path.exists(os.path.join(out_dir, 'screenshot.png')):
-        return False
-
-    return SAVE_SCREENSHOT
-
-@enforce_types
-def save_screenshot(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOUT) -> ArchiveResult:
-    """take screenshot of site using chrome --headless"""
-    
-    out_dir = out_dir or link.link_dir
-    output: ArchiveOutput = 'screenshot.png'
-    cmd = [
-        *chrome_args(TIMEOUT=timeout),
-        '--screenshot',
-        link.url,
-    ]
-    status = 'succeeded'
-    timer = TimedProgress(timeout, prefix='      ')
-    try:
-        result = run(cmd, stdout=PIPE, stderr=PIPE, cwd=out_dir, timeout=timeout)
-
-        if result.returncode:
-            hints = (result.stderr or result.stdout).decode()
-            raise ArchiveError('Failed to save screenshot', hints)
-
-        chmod_file(output, cwd=out_dir)
-    except Exception as err:
-        status = 'failed'
-        output = err
-    finally:
-        timer.end()
-
-    return ArchiveResult(
-        cmd=cmd,
-        pwd=out_dir,
-        cmd_version=CHROME_VERSION,
-        output=output,
-        status=status,
-        **timer.stats,
-    )
-
-@enforce_types
-def should_save_dom(link: Link, out_dir: Optional[str]=None) -> bool:
-    out_dir = out_dir or link.link_dir
-    if is_static_file(link.url):
-        return False
-    
-    if os.path.exists(os.path.join(out_dir, 'output.html')):
-        return False
-
-    return SAVE_DOM
-    
-@enforce_types
-def save_dom(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOUT) -> ArchiveResult:
-    """print HTML of site to file using chrome --dump-dom"""
-
-    out_dir = out_dir or link.link_dir
-    output: ArchiveOutput = 'output.html'
-    output_path = os.path.join(out_dir, str(output))
-    cmd = [
-        *chrome_args(TIMEOUT=timeout),
-        '--dump-dom',
-        link.url
-    ]
-    status = 'succeeded'
-    timer = TimedProgress(timeout, prefix='      ')
-    try:
-        with open(output_path, 'w+') as f:
-            result = run(cmd, stdout=f, stderr=PIPE, cwd=out_dir, timeout=timeout)
-
-        if result.returncode:
-            hints = result.stderr.decode()
-            raise ArchiveError('Failed to save DOM', hints)
-
-        chmod_file(output, cwd=out_dir)
-    except Exception as err:
-        status = 'failed'
-        output = err
-    finally:
-        timer.end()
-
-    return ArchiveResult(
-        cmd=cmd,
-        pwd=out_dir,
-        cmd_version=CHROME_VERSION,
-        output=output,
-        status=status,
-        **timer.stats,
-    )
-
-@enforce_types
-def should_save_git(link: Link, out_dir: Optional[str]=None) -> bool:
-    out_dir = out_dir or link.link_dir
-    if is_static_file(link.url):
-        return False
-
-    if os.path.exists(os.path.join(out_dir, 'git')):
-        return False
-
-    is_clonable_url = (
-        (domain(link.url) in GIT_DOMAINS)
-        or (extension(link.url) == 'git')
-    )
-    if not is_clonable_url:
-        return False
-
-    return SAVE_GIT
-
-
-@enforce_types
-def save_git(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOUT) -> ArchiveResult:
-    """download full site using git"""
-
-    out_dir = out_dir or link.link_dir
-    output: ArchiveOutput = 'git'
-    output_path = os.path.join(out_dir, str(output))
-    os.makedirs(output_path, exist_ok=True)
-    cmd = [
-        GIT_BINARY,
-        'clone',
-        '--mirror',
-        '--recursive',
-        *([] if CHECK_SSL_VALIDITY else ['-c', 'http.sslVerify=false']),
-        without_query(without_fragment(link.url)),
-    ]
-    status = 'succeeded'
-    timer = TimedProgress(timeout, prefix='      ')
-    try:
-        result = run(cmd, stdout=PIPE, stderr=PIPE, cwd=output_path, timeout=timeout + 1)
-
-        if result.returncode == 128:
-            # ignore failed re-download when the folder already exists
-            pass
-        elif result.returncode > 0:
-            hints = 'Got git response code: {}.'.format(result.returncode)
-            raise ArchiveError('Failed to save git clone', hints)
-
-    except Exception as err:
-        status = 'failed'
-        output = err
-    finally:
-        timer.end()
-
-    return ArchiveResult(
-        cmd=cmd,
-        pwd=out_dir,
-        cmd_version=GIT_VERSION,
-        output=output,
-        status=status,
-        **timer.stats,
-    )
-
-
-@enforce_types
-def should_save_media(link: Link, out_dir: Optional[str]=None) -> bool:
-    out_dir = out_dir or link.link_dir
-
-    if is_static_file(link.url):
-        return False
-
-    if os.path.exists(os.path.join(out_dir, 'media')):
-        return False
-
-    return SAVE_MEDIA
-
-@enforce_types
-def save_media(link: Link, out_dir: Optional[str]=None, timeout: int=MEDIA_TIMEOUT) -> ArchiveResult:
-    """Download playlists or individual videos, audio, and subtitles using youtube-dl"""
-
-    out_dir = out_dir or link.link_dir
-    output: ArchiveOutput = 'media'
-    output_path = os.path.join(out_dir, str(output))
-    os.makedirs(output_path, exist_ok=True)
-    cmd = [
-        YOUTUBEDL_BINARY,
-        '--write-description',
-        '--write-info-json',
-        '--write-annotations',
-        '--yes-playlist',
-        '--write-thumbnail',
-        '--no-call-home',
-        '--no-check-certificate',
-        '--all-subs',
-        '--extract-audio',
-        '--keep-video',
-        '--ignore-errors',
-        '--geo-bypass',
-        '--audio-format', 'mp3',
-        '--audio-quality', '320K',
-        '--embed-thumbnail',
-        '--add-metadata',
-        *([] if CHECK_SSL_VALIDITY else ['--no-check-certificate']),
-        link.url,
-    ]
-    status = 'succeeded'
-    timer = TimedProgress(timeout, prefix='      ')
-    try:
-        result = run(cmd, stdout=PIPE, stderr=PIPE, cwd=output_path, timeout=timeout + 1)
-        chmod_file(output, cwd=out_dir)
-        if result.returncode:
-            if (b'ERROR: Unsupported URL' in result.stderr
-                or b'HTTP Error 404' in result.stderr
-                or b'HTTP Error 403' in result.stderr
-                or b'URL could be a direct video link' in result.stderr
-                or b'Unable to extract container ID' in result.stderr):
-                # These happen too frequently on non-media pages to warrant printing to console
-                pass
-            else:
-                hints = (
-                    'Got youtube-dl response code: {}.'.format(result.returncode),
-                    *result.stderr.decode().split('\n'),
-                )
-                raise ArchiveError('Failed to save media', hints)
-    except Exception as err:
-        status = 'failed'
-        output = err
-    finally:
-        timer.end()
-
-    return ArchiveResult(
-        cmd=cmd,
-        pwd=out_dir,
-        cmd_version=YOUTUBEDL_VERSION,
-        output=output,
-        status=status,
-        **timer.stats,
-    )
-
-
-@enforce_types
-def should_save_archive_dot_org(link: Link, out_dir: Optional[str]=None) -> bool:
-    out_dir = out_dir or link.link_dir
-    if is_static_file(link.url):
-        return False
-
-    if os.path.exists(os.path.join(out_dir, 'archive.org.txt')):
-        # if open(path, 'r').read().strip() != 'None':
-        return False
-
-    return SAVE_ARCHIVE_DOT_ORG
-
-@enforce_types
-def save_archive_dot_org(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOUT) -> ArchiveResult:
-    """submit site to archive.org for archiving via their service, save returned archive url"""
-
-    out_dir = out_dir or link.link_dir
-    output: ArchiveOutput = 'archive.org.txt'
-    archive_org_url = None
-    submit_url = 'https://web.archive.org/save/{}'.format(link.url)
-    cmd = [
-        CURL_BINARY,
-        '--location',
-        '--head',
-        '--user-agent', 'ArchiveBox/{} (+https://github.com/pirate/ArchiveBox/)'.format(VERSION),  # be nice to the Archive.org people and show them where all this ArchiveBox traffic is coming from
-        '--max-time', str(timeout),
-        *([] if CHECK_SSL_VALIDITY else ['--insecure']),
-        submit_url,
-    ]
-    status = 'succeeded'
-    timer = TimedProgress(timeout, prefix='      ')
-    try:
-        result = run(cmd, stdout=PIPE, stderr=DEVNULL, cwd=out_dir, timeout=timeout)
-        content_location, errors = parse_archive_dot_org_response(result.stdout)
-        if content_location:
-            archive_org_url = 'https://web.archive.org{}'.format(content_location[0])
-        elif len(errors) == 1 and 'RobotAccessControlException' in errors[0]:
-            archive_org_url = None
-            # raise ArchiveError('Archive.org denied by {}/robots.txt'.format(domain(link.url)))
-        elif errors:
-            raise ArchiveError(', '.join(errors))
-        else:
-            raise ArchiveError('Failed to find "content-location" URL header in Archive.org response.')
-    except Exception as err:
-        status = 'failed'
-        output = err
-    finally:
-        timer.end()
-
-    if output and not isinstance(output, Exception):
-        # instead of writing None when archive.org rejects the url write the
-        # url to resubmit it to archive.org. This is so when the user visits
-        # the URL in person, it will attempt to re-archive it, and it'll show the
-        # nicer error message explaining why the url was rejected if it fails.
-        archive_org_url = archive_org_url or submit_url
-        with open(os.path.join(out_dir, str(output)), 'w', encoding='utf-8') as f:
-            f.write(archive_org_url)
-        chmod_file('archive.org.txt', cwd=out_dir)
-        output = archive_org_url
-
-    return ArchiveResult(
-        cmd=cmd,
-        pwd=out_dir,
-        cmd_version=CURL_VERSION,
-        output=output,
-        status=status,
-        **timer.stats,
-    )
-
-@enforce_types
-def parse_archive_dot_org_response(response: bytes) -> Tuple[List[str], List[str]]:
-    # Parse archive.org response headers
-    headers: Dict[str, List[str]] = defaultdict(list)
-
-    # lowercase all the header names and store in dict
-    for header in response.splitlines():
-        if b':' not in header or not header.strip():
-            continue
-        name, val = header.decode().split(':', 1)
-        headers[name.lower().strip()].append(val.strip())
-
-    # Get successful archive url in "content-location" header or any errors
-    content_location = headers['content-location']
-    errors = headers['x-archive-wayback-runtime-error']
-    return content_location, errors
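
As an aside, archive_link above dispatches through a tuple of (method_name, should_run, run) entries. The following self-contained sketch shows the same pattern in miniature; the example "methods" and their names are made up for illustration and are not part of ArchiveBox:

# self-contained sketch of the (name, should_run, run) dispatch pattern used by archive_link
from typing import Callable, Dict, List, Tuple

Method = Tuple[str, Callable[[str], bool], Callable[[str], str]]

def run_archive_methods(url: str, methods: List[Method]) -> Dict[str, str]:
    results: Dict[str, str] = {}
    for name, should_run, run in methods:
        # run the method only when its precondition passes, otherwise record a skip
        results[name] = run(url) if should_run(url) else 'skipped'
    return results

EXAMPLE_METHODS: List[Method] = [
    ('title', lambda url: True,                lambda url: 'fetched <title> of ' + url),
    ('git',   lambda url: 'github.com' in url, lambda url: 'cloned repo at ' + url),
]

print(run_archive_methods('https://example.com', EXAMPLE_METHODS))
# -> {'title': 'fetched <title> of https://example.com', 'git': 'skipped'}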

+ 0 - 626
archivebox/legacy/main.py

@@ -1,626 +0,0 @@
-import os
-import re
-import shutil
-
-from typing import Dict, List, Optional, Iterable
-from itertools import chain
-
-from .schema import Link
-from .util import (
-    enforce_types,
-    TimedProgress,
-    get_dir_size,
-    human_readable_size,
-)
-from .index import (
-    links_after_timestamp,
-    load_main_index,
-    import_new_links,
-    write_main_index,
-)
-from .storage.json import (
-    parse_json_main_index,
-    parse_json_link_details,
-    parse_json_links_details,
-)
-from .storage.sql import parse_sql_main_index, get_admins
-from .storage.html import parse_html_main_index
-from .archive_methods import archive_link
-from .config import (
-    stderr,
-    ANSI,
-    ONLY_NEW,
-    OUTPUT_DIR,
-    SOURCES_DIR,
-    ARCHIVE_DIR,
-    LOGS_DIR,
-    CONFIG_FILE,
-    ARCHIVE_DIR_NAME,
-    SOURCES_DIR_NAME,
-    LOGS_DIR_NAME,
-    STATIC_DIR_NAME,
-    JSON_INDEX_FILENAME,
-    HTML_INDEX_FILENAME,
-    SQL_INDEX_FILENAME,
-    ROBOTS_TXT_FILENAME,
-    FAVICON_FILENAME,
-    check_dependencies,
-    check_data_folder,
-    setup_django,
-    write_config_file,
-)
-from .logs import (
-    log_archiving_started,
-    log_archiving_paused,
-    log_archiving_finished,
-    log_removal_started,
-    log_removal_finished,
-    log_list_started,
-    log_list_finished,
-)
-
-
-ALLOWED_IN_OUTPUT_DIR = {
-    '.DS_Store',
-    '.venv',
-    'venv',
-    'virtualenv',
-    '.virtualenv',
-    ARCHIVE_DIR_NAME,
-    SOURCES_DIR_NAME,
-    LOGS_DIR_NAME,
-    STATIC_DIR_NAME,
-    SQL_INDEX_FILENAME,
-    JSON_INDEX_FILENAME,
-    HTML_INDEX_FILENAME,
-    ROBOTS_TXT_FILENAME,
-    FAVICON_FILENAME,
-}
-
-
-@enforce_types
-def init():
-    os.makedirs(OUTPUT_DIR, exist_ok=True)
-
-    is_empty = not len(set(os.listdir(OUTPUT_DIR)) - ALLOWED_IN_OUTPUT_DIR)
-    existing_index = os.path.exists(os.path.join(OUTPUT_DIR, JSON_INDEX_FILENAME))
-
-    if is_empty and not existing_index:
-        print('{green}[+] Initializing a new ArchiveBox collection in this folder...{reset}'.format(**ANSI))
-        print(f'    {OUTPUT_DIR}')
-        print('{green}------------------------------------------------------------------{reset}'.format(**ANSI))
-    elif existing_index:
-        print('{green}[*] Updating existing ArchiveBox collection in this folder...{reset}'.format(**ANSI))
-        print(f'    {OUTPUT_DIR}')
-        print('{green}------------------------------------------------------------------{reset}'.format(**ANSI))
-    else:
-        stderr(
-            ("{red}[X] This folder appears to already have files in it, but no index.json is present.{reset}\n\n"
-            "    You must run init in a completely empty directory, or in an existing data folder.\n\n"
-            "    {lightred}Hint:{reset} To import an existing data folder, make sure to cd into the folder first, \n"
-            "    then run 'archivebox init' to pick up where you left off.\n\n"
-            "    (Always make sure your data folder is backed up first before updating ArchiveBox)"
-            ).format(OUTPUT_DIR, **ANSI)
-        )
-        raise SystemExit(1)
-
-    if existing_index:
-        print('\n{green}[*] Verifying archive folder structure...{reset}'.format(**ANSI))
-    else:
-        print('\n{green}[+] Building archive folder structure...{reset}'.format(**ANSI))
-    
-    os.makedirs(SOURCES_DIR, exist_ok=True)
-    print(f'    √ {SOURCES_DIR}')
-    
-    os.makedirs(ARCHIVE_DIR, exist_ok=True)
-    print(f'    √ {ARCHIVE_DIR}')
-
-    os.makedirs(LOGS_DIR, exist_ok=True)
-    print(f'    √ {LOGS_DIR}')
-
-    write_config_file({}, out_dir=OUTPUT_DIR)
-    print(f'    √ {CONFIG_FILE}')
-    
-    if os.path.exists(os.path.join(OUTPUT_DIR, SQL_INDEX_FILENAME)):
-        print('\n{green}[*] Verifying main SQL index and running migrations...{reset}'.format(**ANSI))
-    else:
-        print('\n{green}[+] Building main SQL index and running migrations...{reset}'.format(**ANSI))
-    
-    setup_django(OUTPUT_DIR, check_db=False)
-    from django.conf import settings
-    assert settings.DATABASE_FILE == os.path.join(OUTPUT_DIR, SQL_INDEX_FILENAME)
-    print(f'    √ {settings.DATABASE_FILE}')
-    print()
-    from .storage.sql import apply_migrations
-    for migration_line in apply_migrations(OUTPUT_DIR):
-        print(f'    {migration_line}')
-
-
-    assert os.path.exists(settings.DATABASE_FILE)
-    
-    # from django.contrib.auth.models import User
-    # if IS_TTY and not User.objects.filter(is_superuser=True).exists():
-    #     print('{green}[+] Creating admin user account...{reset}'.format(**ANSI))
-    #     call_command("createsuperuser", interactive=True)
-
-    print()
-    print('{green}[*] Collecting links from any existing index or archive folders...{reset}'.format(**ANSI))
-
-    all_links = {}
-    if existing_index:
-        all_links = {
-            link.url: link
-            for link in load_main_index(out_dir=OUTPUT_DIR, warn=False)
-        }
-        print('    √ Loaded {} links from existing main index...'.format(len(all_links)))
-
-    orphaned_json_links = {
-        link.url: link
-        for link in parse_json_main_index(OUTPUT_DIR)
-        if link.url not in all_links
-    }
-    if orphaned_json_links:
-        all_links.update(orphaned_json_links)
-        print('    {lightyellow}√ Added {} orphaned links from existing JSON index...{reset}'.format(len(orphaned_json_links), **ANSI))
-
-    orphaned_sql_links = {
-        link.url: link
-        for link in parse_sql_main_index(OUTPUT_DIR)
-        if link.url not in all_links
-    }
-    if orphaned_sql_links:
-        all_links.update(orphaned_sql_links)
-        print('    {lightyellow}√ Added {} orphaned links from existing SQL index...{reset}'.format(len(orphaned_sql_links), **ANSI))
-
-    orphaned_data_dir_links = {
-        link.url: link
-        for link in parse_json_links_details(OUTPUT_DIR)
-    }
-    orphan_new_links = {
-        url: link
-        for url, link in orphaned_data_dir_links.items()
-        if url not in all_links
-    }
-    orphan_duplicates = {
-        url: link
-        for url, link in orphaned_data_dir_links.items()
-        if url in all_links
-    }
-    if orphan_new_links:
-        all_links.update(orphan_new_links)
-        print('    {lightyellow}√ Added {} orphaned links from existing archive directories...{reset}'.format(len(orphan_new_links), **ANSI))
-    if orphan_duplicates:
-        print('    {lightyellow}! Skipped adding {} invalid link data directories that would have overwritten or corrupted existing data.{reset}'.format(len(orphan_duplicates), **ANSI))
-
-    orphaned_data_dirs = {link.link_dir for link in orphan_duplicates.values()}
-    invalid_folders = {
-        folder: link
-        for folder, link in get_invalid_folders(all_links.values(), out_dir=OUTPUT_DIR).items()
-        if folder not in orphaned_data_dirs
-    }
-    if invalid_folders:
-        print('    {lightyellow}! Skipped adding {} corrupted/unrecognized link data directories that could not be read.{reset}'.format(len(invalid_folders), **ANSI))
-        
-    if orphan_duplicates or invalid_folders:
-        print('        For more information about the link data directories that were skipped, run:')
-        print('            archivebox info')
-        print('            archivebox list --status=invalid')
-        print('            archivebox list --status=orphaned')
-        print('            archivebox list --status=duplicate')
-
-
-    write_main_index(list(all_links.values()), out_dir=OUTPUT_DIR)
-
-    print('\n{green}------------------------------------------------------------------{reset}'.format(**ANSI))
-    if existing_index:
-        print('{green}[√] Done. Verified and updated the existing ArchiveBox collection.{reset}'.format(**ANSI))
-    else:
-        print('{green}[√] Done. A new ArchiveBox collection was initialized ({} links).{reset}'.format(len(all_links), **ANSI))
-    print()
-    print('    To view your archive index, open:')
-    print('        {}'.format(os.path.join(OUTPUT_DIR, HTML_INDEX_FILENAME)))
-    print()
-    print('    To add new links, you can run:')
-    print("        archivebox add 'https://example.com'")
-    print()
-    print('    For more usage and examples, run:')
-    print('        archivebox help')
-
-
-@enforce_types
-def info():
-
-    print('{green}[*] Scanning archive collection main index...{reset}'.format(**ANSI))
-    print(f'    {OUTPUT_DIR}/*')
-    num_bytes, num_dirs, num_files = get_dir_size(OUTPUT_DIR, recursive=False, pattern='index.')
-    size = human_readable_size(num_bytes)
-    print(f'    Size: {size} across {num_files} files')
-    print()
-
-    links = list(load_main_index(out_dir=OUTPUT_DIR))
-    num_json_links = len(links)
-    num_sql_links = sum(1 for link in parse_sql_main_index(out_dir=OUTPUT_DIR))
-    num_html_links = sum(1 for url in parse_html_main_index(out_dir=OUTPUT_DIR))
-    num_link_details = sum(1 for link in parse_json_links_details(out_dir=OUTPUT_DIR))
-    users = get_admins().values_list('username', flat=True)
-    print(f'    > JSON Main Index: {num_json_links} links'.ljust(36),  f'(found in {JSON_INDEX_FILENAME})')
-    print(f'    > SQL Main Index: {num_sql_links} links'.ljust(36), f'(found in {SQL_INDEX_FILENAME})')
-    print(f'    > HTML Main Index: {num_html_links} links'.ljust(36), f'(found in {HTML_INDEX_FILENAME})')
-    print(f'    > JSON Link Details: {num_link_details} links'.ljust(36), f'(found in {ARCHIVE_DIR_NAME}/*/index.json)')
-
-    print(f'    > Admin: {len(users)} users {", ".join(users)}'.ljust(36), f'(found in {SQL_INDEX_FILENAME})')
-    
-    if num_html_links != len(links) or num_sql_links != len(links):
-        print()
-        print('    {lightred}Hint:{reset} You can fix index count differences automatically by running:'.format(**ANSI))
-        print('        archivebox init')
-    
-    if not users:
-        print()
-        print('    {lightred}Hint:{reset} You can create an admin user by running:'.format(**ANSI))
-        print('        archivebox manage createsuperuser')
-
-    print()
-    print('{green}[*] Scanning archive collection link data directories...{reset}'.format(**ANSI))
-    print(f'    {ARCHIVE_DIR}/*')
-
-    num_bytes, num_dirs, num_files = get_dir_size(ARCHIVE_DIR)
-    size = human_readable_size(num_bytes)
-    print(f'    Size: {size} across {num_files} files in {num_dirs} directories')
-    print()
-
-    num_indexed = len(get_indexed_folders(links, out_dir=OUTPUT_DIR))
-    num_archived = len(get_archived_folders(links, out_dir=OUTPUT_DIR))
-    num_unarchived = len(get_unarchived_folders(links, out_dir=OUTPUT_DIR))
-    print(f'    > indexed: {num_indexed}'.ljust(36), f'({get_indexed_folders.__doc__})')
-    print(f'      > archived: {num_archived}'.ljust(36), f'({get_archived_folders.__doc__})')
-    print(f'      > unarchived: {num_unarchived}'.ljust(36), f'({get_unarchived_folders.__doc__})')
-    
-    num_present = len(get_present_folders(links, out_dir=OUTPUT_DIR))
-    num_valid = len(get_valid_folders(links, out_dir=OUTPUT_DIR))
-    print()
-    print(f'    > present: {num_present}'.ljust(36), f'({get_present_folders.__doc__})')
-    print(f'      > valid: {num_valid}'.ljust(36), f'({get_valid_folders.__doc__})')
-    
-    duplicate = get_duplicate_folders(links, out_dir=OUTPUT_DIR)
-    orphaned = get_orphaned_folders(links, out_dir=OUTPUT_DIR)
-    corrupted = get_corrupted_folders(links, out_dir=OUTPUT_DIR)
-    unrecognized = get_unrecognized_folders(links, out_dir=OUTPUT_DIR)
-    num_invalid = len({**duplicate, **orphaned, **corrupted, **unrecognized})
-    print(f'      > invalid: {num_invalid}'.ljust(36), f'({get_invalid_folders.__doc__})')
-    print(f'        > duplicate: {len(duplicate)}'.ljust(36), f'({get_duplicate_folders.__doc__})')
-    print(f'        > orphaned: {len(orphaned)}'.ljust(36), f'({get_orphaned_folders.__doc__})')
-    print(f'        > corrupted: {len(corrupted)}'.ljust(36), f'({get_corrupted_folders.__doc__})')
-    print(f'        > unrecognized: {len(unrecognized)}'.ljust(36), f'({get_unrecognized_folders.__doc__})')
-    
-    if num_indexed:
-        print()
-        print('    {lightred}Hint:{reset} You can list link data directories by status like so:'.format(**ANSI))
-        print('        archivebox list --status=<status>  (e.g. indexed, corrupted, archived, etc.)')
-
-    if orphaned:
-        print()
-        print('    {lightred}Hint:{reset} To automatically import orphaned data directories into the main index, run:'.format(**ANSI))
-        print('        archivebox init')
-
-    if num_invalid:
-        print()
-        print('    {lightred}Hint:{reset} You may need to manually remove or fix some invalid data directories, afterwards make sure to run:'.format(**ANSI))
-        print('        archivebox init')
-    
-    print()
-
-
-
-@enforce_types
-def update_archive_data(import_path: Optional[str]=None, 
-                        resume: Optional[float]=None,
-                        only_new: bool=False,
-                        index_only: bool=False) -> List[Link]:
-    """The main ArchiveBox entrypoint. Everything starts here."""
-
-    check_dependencies()
-    check_data_folder()
-
-    # Step 1: Load list of links from the existing index
-    #         merge in and dedupe new links from import_path
-    all_links: List[Link] = []
-    new_links: List[Link] = []
-    all_links = load_main_index(out_dir=OUTPUT_DIR)
-    if import_path:
-        all_links, new_links = import_new_links(all_links, import_path)
-
-    # Step 2: Write updated index with deduped old and new links back to disk
-    write_main_index(links=list(all_links), out_dir=OUTPUT_DIR)
-
-    if index_only:
-        return all_links
-        
-    # Step 3: Run the archive methods for each link
-    links = new_links if ONLY_NEW else all_links
-    log_archiving_started(len(links), resume)
-    idx: int = 0
-    link: Link = None                                             # type: ignore
-    try:
-        for idx, link in enumerate(links_after_timestamp(links, resume)):
-            archive_link(link, out_dir=link.link_dir)
-
-    except KeyboardInterrupt:
-        log_archiving_paused(len(links), idx, link.timestamp if link else '0')
-        raise SystemExit(0)
-
-    except:
-        print()
-        raise    
-
-    log_archiving_finished(len(links))
-
-    # Step 4: Re-write links index with updated titles, icons, and resources
-    all_links = load_main_index(out_dir=OUTPUT_DIR)
-    write_main_index(links=list(all_links), out_dir=OUTPUT_DIR, finished=True)
-    return all_links
-
-
-LINK_FILTERS = {
-    'exact': lambda link, pattern: (link.url == pattern) or (link.base_url == pattern),
-    'substring': lambda link, pattern: pattern in link.url,
-    'regex': lambda link, pattern: bool(re.match(pattern, link.url)),
-    'domain': lambda link, pattern: link.domain == pattern,
-}
-
-@enforce_types
-def link_matches_filter(link: Link, filter_patterns: List[str], filter_type: str='exact') -> bool:
-    for pattern in filter_patterns:
-        if LINK_FILTERS[filter_type](link, pattern):
-            return True
-
-    return False
-
-
-@enforce_types
-def list_archive_data(filter_patterns: Optional[List[str]]=None, filter_type: str='exact',
-                      after: Optional[float]=None, before: Optional[float]=None) -> Iterable[Link]:
-    
-    all_links = load_main_index(out_dir=OUTPUT_DIR)
-
-    for link in all_links:
-        if after is not None and float(link.timestamp) < after:
-            continue
-        if before is not None and float(link.timestamp) > before:
-            continue
-        
-        if filter_patterns:
-            if link_matches_filter(link, filter_patterns, filter_type):
-                yield link
-        else:
-            yield link
-
-
-@enforce_types
-def remove_archive_links(filter_patterns: List[str], filter_type: str='exact',
-                         after: Optional[float]=None, before: Optional[float]=None,
-                         yes: bool=False, delete: bool=False) -> List[Link]:
-    
-    check_dependencies()
-    check_data_folder()
-
-    log_list_started(filter_patterns, filter_type)
-    timer = TimedProgress(360, prefix='      ')
-    try:
-        links = list(list_archive_data(
-            filter_patterns=filter_patterns,
-            filter_type=filter_type,
-            after=after,
-            before=before,
-        ))
-    finally:
-        timer.end()
-
-    if not len(links):
-        log_removal_finished(0, 0)
-        raise SystemExit(1)
-
-
-    log_list_finished(links)
-    log_removal_started(links, yes=yes, delete=delete)
-
-    timer = TimedProgress(360, prefix='      ')
-    try:
-        to_keep = []
-        all_links = load_main_index(out_dir=OUTPUT_DIR)
-        for link in all_links:
-            should_remove = (
-                (after is not None and float(link.timestamp) < after)
-                or (before is not None and float(link.timestamp) > before)
-                or link_matches_filter(link, filter_patterns, filter_type)
-            )
-            if not should_remove:
-                to_keep.append(link)
-            elif should_remove and delete:
-                shutil.rmtree(link.link_dir)
-    finally:
-        timer.end()
-
-    write_main_index(links=to_keep, out_dir=OUTPUT_DIR, finished=True)
-    log_removal_finished(len(all_links), len(to_keep))
-    
-    return to_keep
-
-
-
-def get_indexed_folders(links, out_dir: str=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
-    """indexed links without checking archive status or data directory validity"""
-    return {
-        link.link_dir: link
-        for link in links
-    }
-
-def get_archived_folders(links, out_dir: str=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
-    """indexed links that are archived with a valid data directory"""
-    return {
-        link.link_dir: link
-        for link in filter(is_archived, links)
-    }
-
-def get_unarchived_folders(links, out_dir: str=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
-    """indexed links that are unarchived with no data directory or an empty data directory"""
-    return {
-        link.link_dir: link
-        for link in filter(is_unarchived, links)
-    }
-
-def get_present_folders(links, out_dir: str=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
-    """dirs that actually exist in the archive/ folder, whether or not they're in the main index"""
-    all_folders = {}
-
-    for entry in os.scandir(os.path.join(out_dir, ARCHIVE_DIR_NAME)):
-        if entry.is_dir(follow_symlinks=True):
-            link = None
-            try:
-                link = parse_json_link_details(entry.path)
-            except Exception:
-                pass
-
-            all_folders[entry.path] = link
-
-    return all_folders
-
-def get_valid_folders(links, out_dir: str=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
-    """dirs with a valid index matched to the main index and archived content"""
-    return {
-        link.link_dir: link
-        for link in filter(is_valid, links)
-    }
-
-def get_invalid_folders(links, out_dir: str=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
-    """dirs that are invalid for any reason: corrupted/duplicate/orphaned/unrecognized"""
-    duplicate = get_duplicate_folders(links, out_dir=OUTPUT_DIR)
-    orphaned = get_orphaned_folders(links, out_dir=OUTPUT_DIR)
-    corrupted = get_corrupted_folders(links, out_dir=OUTPUT_DIR)
-    unrecognized = get_unrecognized_folders(links, out_dir=OUTPUT_DIR)
-    return {**duplicate, **orphaned, **corrupted, **unrecognized}
-
-
-def get_duplicate_folders(links, out_dir: str=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
-    """dirs that conflict with other directories that have the same link URL or timestamp"""
-    links = list(links)
-    by_url = {link.url: 0 for link in links}
-    by_timestamp = {link.timestamp: 0 for link in links}
-
-    duplicate_folders = {}
-
-    indexed_folders = {link.link_dir for link in links}
-    data_folders = (
-        entry.path
-        for entry in os.scandir(os.path.join(out_dir, ARCHIVE_DIR_NAME))
-        if entry.is_dir(follow_symlinks=True) and entry.path not in indexed_folders
-    )
-
-    for path in chain(sorted(indexed_folders), sorted(data_folders)):
-        link = None
-        try:
-            link = parse_json_link_details(path)
-        except Exception:
-            pass
-
-        if link:
-            # link folder has same timestamp as different link folder
-            by_timestamp[link.timestamp] = by_timestamp.get(link.timestamp, 0) + 1
-            if by_timestamp[link.timestamp] > 1:
-                duplicate_folders[path] = link
-
-            # link folder has same url as different link folder
-            by_url[link.url] = by_url.get(link.url, 0) + 1
-            if by_url[link.url] > 1:
-                duplicate_folders[path] = link
-
-    return duplicate_folders
-
-def get_orphaned_folders(links, out_dir: str=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
-    """dirs that contain a valid index but aren't listed in the main index"""
-    links = list(links)
-    indexed_folders = {link.link_dir: link for link in links}
-    orphaned_folders = {}
-
-    for entry in os.scandir(os.path.join(out_dir, ARCHIVE_DIR_NAME)):
-        if entry.is_dir(follow_symlinks=True):
-            index_exists = os.path.exists(os.path.join(entry.path, 'index.json'))
-            link = None
-            try:
-                link = parse_json_link_details(entry.path)
-            except Exception:
-                pass
-
-            if index_exists and entry.path not in indexed_folders:
-                # folder is a valid link data dir with index details, but it's not in the main index
-                orphaned_folders[entry.path] = link
-
-    return orphaned_folders
-
-def get_corrupted_folders(links, out_dir: str=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
-    """dirs in the main index whose folder exists but doesn't contain a valid, matching index"""
-    return {
-        link.link_dir: link
-        for link in filter(is_corrupt, links)
-    }
-
-def get_unrecognized_folders(links, out_dir: str=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
-    """dirs that don't contain recognizable archive data and aren't listed in the main index"""
-    by_timestamp = {link.timestamp: 0 for link in links}
-    unrecognized_folders: Dict[str, Optional[Link]] = {}
-
-    for entry in os.scandir(os.path.join(out_dir, ARCHIVE_DIR_NAME)):
-        if entry.is_dir(follow_symlinks=True):
-            index_exists = os.path.exists(os.path.join(entry.path, 'index.json'))
-            link = None
-            try:
-                link = parse_json_link_details(entry.path)
-            except Exception:
-                pass
-
-            if index_exists and link is None:
-                # index exists but it's corrupted or unparseable
-                unrecognized_folders[entry.path] = link
-            
-            elif not index_exists:
-                # link details index doesn't exist and the folder isn't in the main index
-                timestamp = entry.path.rsplit('/', 1)[-1]
-                if timestamp not in by_timestamp:
-                    unrecognized_folders[entry.path] = link
-
-    return unrecognized_folders
-
-
-def is_valid(link: Link) -> bool:
-    dir_exists = os.path.exists(link.link_dir)
-    index_exists = os.path.exists(os.path.join(link.link_dir, 'index.json'))
-    if not dir_exists:
-        # unarchived links are not included in the valid list
-        return False
-    if dir_exists and not index_exists:
-        return False
-    if dir_exists and index_exists:
-        try:
-            parsed_link = parse_json_link_details(link.link_dir)
-            return link.url == parsed_link.url
-        except Exception:
-            pass
-    return False
-
-def is_corrupt(link: Link) -> bool:
-    if not os.path.exists(link.link_dir):
-        # unarchived links are not considered corrupt
-        return False
-
-    if is_valid(link):
-        return False
-
-    return True
-
-def is_archived(link: Link) -> bool:
-    return is_valid(link) and link.is_archived
-    
-def is_unarchived(link: Link) -> bool:
-    if not os.path.exists(link.link_dir):
-        return True
-    return not link.is_archived
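
The LINK_FILTERS table in main.py above maps each filter type to a predicate over a Link. A trimmed-down, self-contained sketch of the same idea over plain URL strings follows; the function and variable names here are illustrative, not ArchiveBox's API:

# illustrative re-implementation of the filter-type -> predicate lookup over plain URLs
import re
from typing import Callable, Dict, List
from urllib.parse import urlparse

URL_FILTERS: Dict[str, Callable[[str, str], bool]] = {
    'exact':     lambda url, pattern: url == pattern,
    'substring': lambda url, pattern: pattern in url,
    'regex':     lambda url, pattern: bool(re.match(pattern, url)),
    'domain':    lambda url, pattern: urlparse(url).netloc == pattern,
}

def url_matches_filter(url: str, patterns: List[str], filter_type: str = 'exact') -> bool:
    # a URL matches if any of the given patterns matches under the chosen filter type
    return any(URL_FILTERS[filter_type](url, p) for p in patterns)

assert url_matches_filter('https://example.com/post/1', ['example.com'], 'domain')
assert not url_matches_filter('https://example.com/post/1', ['other.com'], 'domain')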

+ 0 - 10
archivebox/legacy/mypy_django.ini

@@ -1,10 +0,0 @@
-[mypy_django_plugin]
-
-# specify the settings module to use for django.conf.settings; this setting
-# can also be set with the DJANGO_SETTINGS_MODULE environment variable
-# (which takes priority over this config file)
-django_settings = core.settings
-
-# if True, all unknown settings in django.conf.settings will fall back to Any;
-# set this if your settings are loaded dynamically, to avoid false positives
-ignore_missing_settings = True

+ 0 - 331
archivebox/legacy/parse.py

@@ -1,331 +0,0 @@
-"""
-Everything related to parsing links from input sources.
-
-For a list of supported services, see the README.md.
-For examples of supported import formats see tests/.
-
-Link: {
-    'url': 'https://example.com/example/?abc=123&xyc=345#lmnop',
-    'timestamp': '1544212312.4234',
-    'title': 'Example.com Page Title',
-    'tags': 'abc,def',
-    'sources': [
-        'output/sources/ril_export.html',
-        'output/sources/getpocket.com-1523422111.txt',
-        'output/sources/stdin-234234112312.txt'
-    ]
-}
-"""
-
-import re
-import json
-
-from typing import Tuple, List, IO, Iterable
-from datetime import datetime
-import xml.etree.ElementTree as etree
-
-from .config import TIMEOUT
-from .util import (
-    htmldecode,
-    str_between,
-    URL_REGEX,
-    check_url_parsing_invariants,
-    TimedProgress,
-    Link,
-    enforce_types,
-)
-
-
-@enforce_types
-def parse_links(source_file: str) -> Tuple[List[Link], str]:
-    """parse a list of URLs with their metadata from an 
-       RSS feed, bookmarks export, or text file
-    """
-
-    check_url_parsing_invariants()
-    PARSERS = (
-        # Specialized parsers
-        ('Pocket HTML', parse_pocket_html_export),
-        ('Pinboard RSS', parse_pinboard_rss_export),
-        ('Shaarli RSS', parse_shaarli_rss_export),
-        ('Medium RSS', parse_medium_rss_export),
-        
-        # General parsers
-        ('Netscape HTML', parse_netscape_html_export),
-        ('Generic RSS', parse_rss_export),
-        ('Generic JSON', parse_json_export),
-
-        # Fallback parser
-        ('Plain Text', parse_plain_text_export),
-    )
-    timer = TimedProgress(TIMEOUT * 4)
-    with open(source_file, 'r', encoding='utf-8') as file:
-        for parser_name, parser_func in PARSERS:
-            try:
-                links = list(parser_func(file))
-                if links:
-                    timer.end()
-                    return links, parser_name
-            except Exception as err:   # noqa
-                # Parsers are tried one by one down the list, and the first one
-                # that succeeds is used. To see why a certain parser was not used
-                # due to error or format incompatibility, uncomment this line:
-                # print('[!] Parser {} failed: {} {}'.format(parser_name, err.__class__.__name__, err))
-                pass
-
-    timer.end()
-    return [], 'Failed to parse'
-
-
-### Import Parser Functions
-
-@enforce_types
-def parse_pocket_html_export(html_file: IO[str]) -> Iterable[Link]:
-    """Parse Pocket-format bookmarks export files (produced by getpocket.com/export/)"""
-
-    html_file.seek(0)
-    pattern = re.compile("^\\s*<li><a href=\"(.+)\" time_added=\"(\\d+)\" tags=\"(.*)\">(.+)</a></li>", re.UNICODE)
-    for line in html_file:
-        # example line
-        # <li><a href="http://example.com/" time_added="1478739709" tags="tag1,tag2">example title</a></li>
-        match = pattern.search(line)
-        if match:
-            url = match.group(1).replace('http://www.readability.com/read?url=', '')           # remove old readability prefixes to get original url
-            time = datetime.fromtimestamp(float(match.group(2)))
-            tags = match.group(3)
-            title = match.group(4).replace(' — Readability', '').replace('http://www.readability.com/read?url=', '')
-            
-            yield Link(
-                url=htmldecode(url),
-                timestamp=str(time.timestamp()),
-                title=htmldecode(title) or None,
-                tags=tags or '',
-                sources=[html_file.name],
-            )
-
-
-@enforce_types
-def parse_json_export(json_file: IO[str]) -> Iterable[Link]:
-    """Parse JSON-format bookmarks export files (produced by pinboard.in/export/, or wallabag)"""
-
-    json_file.seek(0)
-    links = json.load(json_file)
-    json_date = lambda s: datetime.strptime(s, '%Y-%m-%dT%H:%M:%S%z')
-
-    for link in links:
-        # example line
-        # {"href":"http:\/\/www.reddit.com\/r\/example","description":"title here","extended":"","meta":"18a973f09c9cc0608c116967b64e0419","hash":"910293f019c2f4bb1a749fb937ba58e3","time":"2014-06-14T15:51:42Z","shared":"no","toread":"no","tags":"reddit android"}]
-        if link:
-            # Parse URL
-            url = link.get('href') or link.get('url') or link.get('URL')
-            if not url:
-                raise Exception('JSON must contain URL in each entry [{"url": "http://...", ...}, ...]')
-
-            # Parse the timestamp
-            ts_str = str(datetime.now().timestamp())
-            if link.get('timestamp'):
-                # chrome/ff histories use a very precise timestamp
-                ts_str = str(link['timestamp'] / 10000000)  
-            elif link.get('time'):
-                ts_str = str(json_date(link['time'].split(',', 1)[0]).timestamp())
-            elif link.get('created_at'):
-                ts_str = str(json_date(link['created_at']).timestamp())
-            elif link.get('created'):
-                ts_str = str(json_date(link['created']).timestamp())
-            elif link.get('date'):
-                ts_str = str(json_date(link['date']).timestamp())
-            elif link.get('bookmarked'):
-                ts_str = str(json_date(link['bookmarked']).timestamp())
-            elif link.get('saved'):
-                ts_str = str(json_date(link['saved']).timestamp())
-            
-            # Parse the title
-            title = None
-            if link.get('title'):
-                title = link['title'].strip()
-            elif link.get('description'):
-                title = link['description'].replace(' — Readability', '').strip()
-            elif link.get('name'):
-                title = link['name'].strip()
-
-            yield Link(
-                url=htmldecode(url),
-                timestamp=ts_str,
-                title=htmldecode(title) or None,
-                tags=htmldecode(link.get('tags')) or '',
-                sources=[json_file.name],
-            )
-
-
-@enforce_types
-def parse_rss_export(rss_file: IO[str]) -> Iterable[Link]:
-    """Parse RSS XML-format files into links"""
-
-    rss_file.seek(0)
-    items = rss_file.read().split('<item>')
-    items = items[1:] if items else []
-    for item in items:
-        # example item:
-        # <item>
-        # <title><![CDATA[How JavaScript works: inside the V8 engine]]></title>
-        # <category>Unread</category>
-        # <link>https://blog.sessionstack.com/how-javascript-works-inside</link>
-        # <guid>https://blog.sessionstack.com/how-javascript-works-inside</guid>
-        # <pubDate>Mon, 21 Aug 2017 14:21:58 -0500</pubDate>
-        # </item>
-
-        trailing_removed = item.split('</item>', 1)[0]
-        leading_removed = trailing_removed.split('<item>', 1)[-1].strip()
-        rows = leading_removed.split('\n')
-
-        def get_row(key):
-            return [r for r in rows if r.strip().startswith('<{}>'.format(key))][0]
-
-        url = str_between(get_row('link'), '<link>', '</link>')
-        ts_str = str_between(get_row('pubDate'), '<pubDate>', '</pubDate>')
-        time = datetime.strptime(ts_str, "%a, %d %b %Y %H:%M:%S %z")
-        title = str_between(get_row('title'), '<![CDATA[', ']]').strip()
-
-        yield Link(
-            url=htmldecode(url),
-            timestamp=str(time.timestamp()),
-            title=htmldecode(title) or None,
-            tags=None,
-            sources=[rss_file.name],
-        )
-
-
-@enforce_types
-def parse_shaarli_rss_export(rss_file: IO[str]) -> Iterable[Link]:
-    """Parse Shaarli-specific RSS XML-format files into links"""
-
-    rss_file.seek(0)
-    entries = rss_file.read().split('<entry>')[1:]
-    for entry in entries:
-        # example entry:
-        # <entry>
-        #   <title>Aktuelle Trojaner-Welle: Emotet lauert in gefälschten Rechnungsmails | heise online</title>
-        #   <link href="https://www.heise.de/security/meldung/Aktuelle-Trojaner-Welle-Emotet-lauert-in-gefaelschten-Rechnungsmails-4291268.html" />
-        #   <id>https://demo.shaarli.org/?cEV4vw</id>
-        #   <published>2019-01-30T06:06:01+00:00</published>
-        #   <updated>2019-01-30T06:06:01+00:00</updated>
-        #   <content type="html" xml:lang="en"><![CDATA[<div class="markdown"><p>&#8212; <a href="https://demo.shaarli.org/?cEV4vw">Permalink</a></p></div>]]></content>
-        # </entry>
-
-        trailing_removed = entry.split('</entry>', 1)[0]
-        leading_removed = trailing_removed.strip()
-        rows = leading_removed.split('\n')
-
-        def get_row(key):
-            return [r.strip() for r in rows if r.strip().startswith('<{}'.format(key))][0]
-
-        title = str_between(get_row('title'), '<title>', '</title>').strip()
-        url = str_between(get_row('link'), '<link href="', '" />')
-        ts_str = str_between(get_row('published'), '<published>', '</published>')
-        time = datetime.strptime(ts_str, "%Y-%m-%dT%H:%M:%S%z")
-
-        yield Link(
-            url=htmldecode(url),
-            timestamp=str(time.timestamp()),
-            title=htmldecode(title) or None,
-            tags=None,
-            sources=[rss_file.name],
-        )
-
-
-@enforce_types
-def parse_netscape_html_export(html_file: IO[str]) -> Iterable[Link]:
-    """Parse netscape-format bookmarks export files (produced by all browsers)"""
-
-    html_file.seek(0)
-    pattern = re.compile("<a href=\"(.+?)\" add_date=\"(\\d+)\"[^>]*>(.+)</a>", re.UNICODE | re.IGNORECASE)
-    for line in html_file:
-        # example line
-        # <DT><A HREF="https://example.com/?q=1+2" ADD_DATE="1497562974" LAST_MODIFIED="1497562974" ICON_URI="https://example.com/favicon.ico" ICON="data:image/png;base64,...">example bookmark title</A>
-        
-        match = pattern.search(line)
-        if match:
-            url = match.group(1)
-            time = datetime.fromtimestamp(float(match.group(2)))
-            title = match.group(3).strip()
-
-            yield Link(
-                url=htmldecode(url),
-                timestamp=str(time.timestamp()),
-                title=htmldecode(title) or None,
-                tags=None,
-                sources=[html_file.name],
-            )
-
-
-@enforce_types
-def parse_pinboard_rss_export(rss_file: IO[str]) -> Iterable[Link]:
-    """Parse Pinboard RSS feed files into links"""
-
-    rss_file.seek(0)
-    root = etree.parse(rss_file).getroot()
-    items = root.findall("{http://purl.org/rss/1.0/}item")
-    for item in items:
-        find = lambda p: item.findtext(p, '').strip() or None    # findtext avoids Element truthiness pitfalls
-
-        url = find("{http://purl.org/rss/1.0/}link")
-        tags = find("{http://purl.org/dc/elements/1.1/}subject")
-        title = find("{http://purl.org/rss/1.0/}title")
-        ts_str = find("{http://purl.org/dc/elements/1.1/}date")
-        
-        # Pinboard includes a colon in its date stamp timezone offsets, which
-        # Python can't parse. Remove it:
-        if ts_str and ts_str[-3:-2] == ":":
-            ts_str = ts_str[:-3]+ts_str[-2:]
-
-        if ts_str:
-            time = datetime.strptime(ts_str, "%Y-%m-%dT%H:%M:%S%z")
-        else:
-            time = datetime.now()
-
-        yield Link(
-            url=htmldecode(url),
-            timestamp=str(time.timestamp()),
-            title=htmldecode(title) or None,
-            tags=htmldecode(tags) or None,
-            sources=[rss_file.name],
-        )
-
-
-@enforce_types
-def parse_medium_rss_export(rss_file: IO[str]) -> Iterable[Link]:
-    """Parse Medium RSS feed files into links"""
-
-    rss_file.seek(0)
-    root = etree.parse(rss_file).getroot()
-    items = root.find("channel").findall("item")                        # type: ignore
-    for item in items:
-        url = item.find("link").text                                    # type: ignore
-        title = item.find("title").text.strip()                         # type: ignore
-        ts_str = item.find("pubDate").text                              # type: ignore
-        time = datetime.strptime(ts_str, "%a, %d %b %Y %H:%M:%S %Z")    # type: ignore
-        
-        yield Link(
-            url=htmldecode(url),
-            timestamp=str(time.timestamp()),
-            title=htmldecode(title) or None,
-            tags=None,
-            sources=[rss_file.name],
-        )
-
-
-@enforce_types
-def parse_plain_text_export(text_file: IO[str]) -> Iterable[Link]:
-    """Parse raw links from each line in a text file"""
-
-    text_file.seek(0)
-    for line in text_file.readlines():
-        urls = re.findall(URL_REGEX, line) if line.strip() else ()
-        for url in urls:                                                # type: ignore
-            yield Link(
-                url=htmldecode(url),
-                timestamp=str(datetime.now().timestamp()),
-                title=None,
-                tags=None,
-                sources=[text_file.name],
-            )

+ 0 - 89
archivebox/legacy/purge.py

@@ -1,89 +0,0 @@
-#!/usr/bin/env python3
-
-import re
-from argparse import ArgumentParser
-from os.path import exists, join
-from shutil import rmtree
-from typing import List
-
-from .config import ARCHIVE_DIR, OUTPUT_DIR
-from .index import (
-    parse_json_links_index,
-    write_html_links_index,
-    write_json_links_index,
-)
-
-
-def cleanup_index(regexes: List[str], proceed: bool, delete: bool) -> None:
-    if not exists(join(OUTPUT_DIR, 'index.json')):
-        exit('index.json is missing; nothing to do')
-
-    compiled = [re.compile(r) for r in regexes]
-    links = parse_json_links_index(OUTPUT_DIR)
-    filtered = []
-    remaining = []
-
-    for link in links:
-        url = link.url
-        for r in compiled:
-            if r.search(url):
-                filtered.append((link, r))
-                break
-        else:
-            remaining.append(link)
-
-    if not filtered:
-        exit('Search did not match any entries.')
-
-    print('Filtered out {}/{} urls:'.format(len(filtered), len(links)))
-
-    for link, regex in filtered:
-        url = link.url
-        print(' {url} via {regex}'.format(url=url, regex=regex.pattern))
-
-    if not proceed:
-        answer = input('Remove {} entries from index? [y/n] '.format(
-            len(filtered)))
-        proceed = answer.strip().lower() in ('y', 'yes')
-
-    if not proceed:
-        exit('Aborted')
-
-    write_json_links_index(OUTPUT_DIR, remaining)
-    write_html_links_index(OUTPUT_DIR, remaining)
-
-    if delete:
-        for link, _ in filtered:
-            data_dir = join(ARCHIVE_DIR, link['timestamp'])
-            if exists(data_dir):
-                rmtree(data_dir)
-
-
-if __name__ == '__main__':
-    p = ArgumentParser('Index purging tool')
-    p.add_argument(
-        '--regex',
-        '-r',
-        action='append',
-        help='Regular expression matching URLs to purge',
-    )
-    p.add_argument(
-        '--delete',
-        '-d',
-        action='store_true',
-        default=False,
-        help='Delete webpage files from archive',
-    )
-    p.add_argument(
-        '--yes',
-        '-y',
-        action='store_true',
-        default=False,
-        help='Do not prompt for confirmation',
-    )
-
-    args = p.parse_args()
-    if args.regex:
-        cleanup_index(args.regex, proceed=args.yes, delete=args.delete)
-    else:
-        p.print_help()

+ 0 - 1
archivebox/legacy/storage/__init__.py

@@ -1 +0,0 @@
-__package__ = 'archivebox.legacy.storage'

+ 1086 - 0
archivebox/main.py

@@ -0,0 +1,1086 @@
+__package__ = 'archivebox'
+
+import re
+import os
+import sys
+import shutil
+
+from typing import Dict, List, Optional, Set, Tuple, Iterable, IO
+
+from crontab import CronTab, CronSlices
+
+from .cli import (
+    list_subcommands,
+    run_subcommand,
+    display_first,
+    meta_cmds,
+    main_cmds,
+    archive_cmds,
+)
+from .index.schema import Link
+from .util import (
+    enforce_types,
+    TimedProgress,
+    get_dir_size,
+    human_readable_size,
+    save_stdin_to_sources,
+    save_file_to_sources,
+    links_to_csv,
+    to_json,
+    folders_to_str,
+)
+from .index import (
+    links_after_timestamp,
+    load_main_index,
+    import_new_links,
+    write_main_index,
+    link_matches_filter,
+    get_indexed_folders,
+    get_archived_folders,
+    get_unarchived_folders,
+    get_present_folders,
+    get_valid_folders,
+    get_invalid_folders,
+    get_duplicate_folders,
+    get_orphaned_folders,
+    get_corrupted_folders,
+    get_unrecognized_folders,
+    fix_invalid_folder_locations,
+)
+from .index.json import (
+    parse_json_main_index,
+    parse_json_links_details,
+)
+from .index.sql import parse_sql_main_index, get_admins, apply_migrations
+from .index.html import parse_html_main_index
+from .extractors import archive_link
+from .config import (
+    stderr,
+    ConfigDict,
+    ANSI,
+    IS_TTY,
+    USER,
+    ARCHIVEBOX_BINARY,
+    ONLY_NEW,
+    OUTPUT_DIR,
+    SOURCES_DIR,
+    ARCHIVE_DIR,
+    LOGS_DIR,
+    CONFIG_FILE,
+    ARCHIVE_DIR_NAME,
+    SOURCES_DIR_NAME,
+    LOGS_DIR_NAME,
+    STATIC_DIR_NAME,
+    JSON_INDEX_FILENAME,
+    HTML_INDEX_FILENAME,
+    SQL_INDEX_FILENAME,
+    ROBOTS_TXT_FILENAME,
+    FAVICON_FILENAME,
+    check_dependencies,
+    check_data_folder,
+    write_config_file,
+    setup_django,
+    VERSION,
+    CODE_LOCATIONS,
+    EXTERNAL_LOCATIONS,
+    DATA_LOCATIONS,
+    DEPENDENCIES,
+    load_all_config,
+    CONFIG,
+    USER_CONFIG,
+    get_real_name,
+)
+from .cli.logging import (
+    log_archiving_started,
+    log_archiving_paused,
+    log_archiving_finished,
+    log_removal_started,
+    log_removal_finished,
+    log_list_started,
+    log_list_finished,
+)
+
+
+ALLOWED_IN_OUTPUT_DIR = {
+    '.DS_Store',
+    '.venv',
+    'venv',
+    'virtualenv',
+    '.virtualenv',
+    ARCHIVE_DIR_NAME,
+    SOURCES_DIR_NAME,
+    LOGS_DIR_NAME,
+    STATIC_DIR_NAME,
+    SQL_INDEX_FILENAME,
+    JSON_INDEX_FILENAME,
+    HTML_INDEX_FILENAME,
+    ROBOTS_TXT_FILENAME,
+    FAVICON_FILENAME,
+}
+
+def help(out_dir: str=OUTPUT_DIR) -> None:
+    all_subcommands = list_subcommands()
+    COMMANDS_HELP_TEXT = '\n    '.join(
+        f'{cmd.ljust(20)} {summary}'
+        for cmd, summary in all_subcommands.items()
+        if cmd in meta_cmds
+    ) + '\n\n    ' + '\n    '.join(
+        f'{cmd.ljust(20)} {summary}'
+        for cmd, summary in all_subcommands.items()
+        if cmd in main_cmds
+    ) + '\n\n    ' + '\n    '.join(
+        f'{cmd.ljust(20)} {summary}'
+        for cmd, summary in all_subcommands.items()
+        if cmd in archive_cmds
+    ) + '\n\n    ' + '\n    '.join(
+        f'{cmd.ljust(20)} {summary}'
+        for cmd, summary in all_subcommands.items()
+        if cmd not in display_first
+    )
+
+
+    if os.path.exists(os.path.join(out_dir, JSON_INDEX_FILENAME)):
+        print('''{green}ArchiveBox v{}: The self-hosted internet archive.{reset}
+
+{lightred}Active data directory:{reset}
+    {}
+
+{lightred}Usage:{reset}
+    archivebox [command] [--help] [--version] [...args]
+
+{lightred}Commands:{reset}
+    {}
+
+{lightred}Example Use:{reset}
+    mkdir my-archive; cd my-archive/
+    archivebox init
+    archivebox info
+
+    archivebox add https://example.com/some/page
+    archivebox add --depth=1 ~/Downloads/bookmarks_export.html
+    
+    archivebox list --sort=timestamp --csv=timestamp,url,is_archived
+    archivebox schedule --every=week https://example.com/some/feed.rss
+    archivebox update --resume=15109948213.123
+
+{lightred}Documentation:{reset}
+    https://github.com/pirate/ArchiveBox/wiki
+'''.format(VERSION, out_dir, COMMANDS_HELP_TEXT, **ANSI))
+    
+    else:
+        print('{green}Welcome to ArchiveBox v{}!{reset}'.format(VERSION, **ANSI))
+        print()
+        print('To import an existing archive (from a previous version of ArchiveBox):')
+        print('    1. cd into your data dir OUTPUT_DIR (usually ArchiveBox/output) and run:')
+        print('    2. archivebox init')
+        print()
+        print('To start a new archive:')
+        print('    1. Create an empty directory, then cd into it and run:')
+        print('    2. archivebox init')
+        print()
+        print('For more information, see the documentation here:')
+        print('    https://github.com/pirate/ArchiveBox/wiki')
+
+
+def version(quiet: bool=False, out_dir: str=OUTPUT_DIR) -> None:
+    if quiet:
+        print(VERSION)
+    else:
+        print('ArchiveBox v{}'.format(VERSION))
+        print()
+
+        print('{white}[i] Dependency versions:{reset}'.format(**ANSI))
+        for name, dependency in DEPENDENCIES.items():
+            print_dependency_version(name, dependency)
+        
+        print()
+        print('{white}[i] Code locations:{reset}'.format(**ANSI))
+        for name, folder in CODE_LOCATIONS.items():
+            print_folder_status(name, folder)
+
+        print()
+        print('{white}[i] External locations:{reset}'.format(**ANSI))
+        for name, folder in EXTERNAL_LOCATIONS.items():
+            print_folder_status(name, folder)
+
+        print()
+        print('{white}[i] Data locations:{reset}'.format(**ANSI))
+        for name, folder in DATA_LOCATIONS.items():
+            print_folder_status(name, folder)
+
+        print()
+        check_dependencies()
+
+
+def run(subcommand: str, subcommand_args: Optional[List[str]], stdin: Optional[IO]=None, out_dir: str=OUTPUT_DIR) -> None:
+    run_subcommand(
+        subcommand=subcommand,
+        subcommand_args=subcommand_args,
+        stdin=stdin,
+        out_dir=out_dir,
+    )
+
+
+def init(out_dir: str=OUTPUT_DIR) -> None:
+    os.makedirs(out_dir, exist_ok=True)
+
+    is_empty = not len(set(os.listdir(out_dir)) - ALLOWED_IN_OUTPUT_DIR)
+    existing_index = os.path.exists(os.path.join(out_dir, JSON_INDEX_FILENAME))
+
+    if is_empty and not existing_index:
+        print('{green}[+] Initializing a new ArchiveBox collection in this folder...{reset}'.format(**ANSI))
+        print(f'    {out_dir}')
+        print('{green}------------------------------------------------------------------{reset}'.format(**ANSI))
+    elif existing_index:
+        print('{green}[*] Updating existing ArchiveBox collection in this folder...{reset}'.format(**ANSI))
+        print(f'    {out_dir}')
+        print('{green}------------------------------------------------------------------{reset}'.format(**ANSI))
+    else:
+        stderr(
+            ("{red}[X] This folder appears to already have files in it, but no index.json is present.{reset}\n\n"
+            "    You must run init in a completely empty directory, or an existing data folder.\n\n"
+            "    {lightred}Hint:{reset} To import an existing data folder make sure to cd into the folder first, \n"
+            "    then run and run 'archivebox init' to pick up where you left off.\n\n"
+            "    (Always make sure your data folder is backed up first before updating ArchiveBox)"
+            ).format(out_dir, **ANSI)
+        )
+        raise SystemExit(1)
+
+    if existing_index:
+        print('\n{green}[*] Verifying archive folder structure...{reset}'.format(**ANSI))
+    else:
+        print('\n{green}[+] Building archive folder structure...{reset}'.format(**ANSI))
+    
+    os.makedirs(SOURCES_DIR, exist_ok=True)
+    print(f'    √ {SOURCES_DIR}')
+    
+    os.makedirs(ARCHIVE_DIR, exist_ok=True)
+    print(f'    √ {ARCHIVE_DIR}')
+
+    os.makedirs(LOGS_DIR, exist_ok=True)
+    print(f'    √ {LOGS_DIR}')
+
+    write_config_file({}, out_dir=out_dir)
+    print(f'    √ {CONFIG_FILE}')
+    
+    if os.path.exists(os.path.join(out_dir, SQL_INDEX_FILENAME)):
+        print('\n{green}[*] Verifying main SQL index and running migrations...{reset}'.format(**ANSI))
+    else:
+        print('\n{green}[+] Building main SQL index and running migrations...{reset}'.format(**ANSI))
+    
+    setup_django(out_dir, check_db=False)
+    from django.conf import settings
+    assert settings.DATABASE_FILE == os.path.join(out_dir, SQL_INDEX_FILENAME)
+    print(f'    √ {settings.DATABASE_FILE}')
+    print()
+    for migration_line in apply_migrations(out_dir):
+        print(f'    {migration_line}')
+
+
+    assert os.path.exists(settings.DATABASE_FILE)
+    
+    # from django.contrib.auth.models import User
+    # if IS_TTY and not User.objects.filter(is_superuser=True).exists():
+    #     print('{green}[+] Creating admin user account...{reset}'.format(**ANSI))
+    #     call_command("createsuperuser", interactive=True)
+
+    print()
+    print('{green}[*] Collecting links from any existing indexes and archive folders...{reset}'.format(**ANSI))
+
+    all_links: Dict[str, Link] = {}
+    if existing_index:
+        all_links = {
+            link.url: link
+            for link in load_main_index(out_dir=out_dir, warn=False)
+        }
+        print('    √ Loaded {} links from existing main index.'.format(len(all_links)))
+
+    # Links in data folders that don't match their timestamp
+    fixed, cant_fix = fix_invalid_folder_locations(out_dir=out_dir)
+    if fixed:
+        print('    {lightyellow}√ Fixed {} data directory locations that didn\'t match their link timestamps.{reset}'.format(len(fixed), **ANSI))
+    if cant_fix:
+        print('    {lightyellow}! Could not fix {} data directory locations due to conflicts with existing folders.{reset}'.format(len(cant_fix), **ANSI))
+
+    # Links in JSON index but not in main index
+    orphaned_json_links = {
+        link.url: link
+        for link in parse_json_main_index(out_dir)
+        if link.url not in all_links
+    }
+    if orphaned_json_links:
+        all_links.update(orphaned_json_links)
+        print('    {lightyellow}√ Added {} orphaned links from existing JSON index...{reset}'.format(len(orphaned_json_links), **ANSI))
+
+    # Links in SQL index but not in main index
+    orphaned_sql_links = {
+        link.url: link
+        for link in parse_sql_main_index(out_dir)
+        if link.url not in all_links
+    }
+    if orphaned_sql_links:
+        all_links.update(orphaned_sql_links)
+        print('    {lightyellow}√ Added {} orphaned links from existing SQL index...{reset}'.format(len(orphaned_sql_links), **ANSI))
+
+    # Links in data dir indexes but not in main index
+    orphaned_data_dir_links = {
+        link.url: link
+        for link in parse_json_links_details(out_dir)
+        if link.url not in all_links
+    }
+    if orphaned_data_dir_links:
+        all_links.update(orphaned_data_dir_links)
+        print('    {lightyellow}√ Added {} orphaned links from existing archive directories.{reset}'.format(len(orphaned_data_dir_links), **ANSI))
+
+    # Links in invalid/duplicate data dirs
+    invalid_folders = {
+        folder: link
+        for folder, link in get_invalid_folders(all_links.values(), out_dir=out_dir).items()
+    }
+    if invalid_folders:
+        print('    {lightyellow}! Skipped adding {} invalid link data directories.{reset}'.format(len(invalid_folders), **ANSI))
+        print('        X ' + '\n        X '.join(f'{folder} {link}' for folder, link in invalid_folders.items()))
+        print()
+        print('    {lightred}Hint:{reset} For more information about the link data directories that were skipped, run:'.format(**ANSI))
+        print('        archivebox info')
+        print('        archivebox list --status=invalid')
+
+
+    write_main_index(list(all_links.values()), out_dir=out_dir)
+
+    print('\n{green}------------------------------------------------------------------{reset}'.format(**ANSI))
+    if existing_index:
+        print('{green}[√] Done. Verified and updated the existing ArchiveBox collection.{reset}'.format(**ANSI))
+    else:
+        print('{green}[√] Done. A new ArchiveBox collection was initialized ({} links).{reset}'.format(len(all_links), **ANSI))
+    print()
+    print('    To view your archive index, open:')
+    print('        {}'.format(os.path.join(out_dir, HTML_INDEX_FILENAME)))
+    print()
+    print('    To add new links, you can run:')
+    print("        archivebox add 'https://example.com'")
+    print()
+    print('    For more usage and examples, run:')
+    print('        archivebox help')
+
+
+def info(out_dir: str=OUTPUT_DIR) -> None:
+    check_data_folder(out_dir=out_dir)
+
+    print('{green}[*] Scanning archive collection main index...{reset}'.format(**ANSI))
+    print(f'    {out_dir}/*')
+    num_bytes, num_dirs, num_files = get_dir_size(out_dir, recursive=False, pattern='index.')
+    size = human_readable_size(num_bytes)
+    print(f'    Size: {size} across {num_files} files')
+    print()
+
+    links = list(load_main_index(out_dir=out_dir))
+    num_json_links = len(links)
+    num_sql_links = sum(1 for link in parse_sql_main_index(out_dir=out_dir))
+    num_html_links = sum(1 for url in parse_html_main_index(out_dir=out_dir))
+    num_link_details = sum(1 for link in parse_json_links_details(out_dir=out_dir))
+    users = get_admins().values_list('username', flat=True)
+    print(f'    > JSON Main Index: {num_json_links} links'.ljust(36),  f'(found in {JSON_INDEX_FILENAME})')
+    print(f'    > SQL Main Index: {num_sql_links} links'.ljust(36), f'(found in {SQL_INDEX_FILENAME})')
+    print(f'    > HTML Main Index: {num_html_links} links'.ljust(36), f'(found in {HTML_INDEX_FILENAME})')
+    print(f'    > JSON Link Details: {num_link_details} links'.ljust(36), f'(found in {ARCHIVE_DIR_NAME}/*/index.json)')
+
+    print(f'    > Admin: {len(users)} users {", ".join(users)}'.ljust(36), f'(found in {SQL_INDEX_FILENAME})')
+    
+    if num_html_links != len(links) or num_sql_links != len(links):
+        print()
+        print('    {lightred}Hint:{reset} You can fix index count differences automatically by running:'.format(**ANSI))
+        print('        archivebox init')
+    
+    if not users:
+        print()
+        print('    {lightred}Hint:{reset} You can create an admin user by running:'.format(**ANSI))
+        print('        archivebox manage createsuperuser')
+
+    print()
+    print('{green}[*] Scanning archive collection link data directories...{reset}'.format(**ANSI))
+    print(f'    {ARCHIVE_DIR}/*')
+
+    num_bytes, num_dirs, num_files = get_dir_size(ARCHIVE_DIR)
+    size = human_readable_size(num_bytes)
+    print(f'    Size: {size} across {num_files} files in {num_dirs} directories')
+    print()
+
+    num_indexed = len(get_indexed_folders(links, out_dir=out_dir))
+    num_archived = len(get_archived_folders(links, out_dir=out_dir))
+    num_unarchived = len(get_unarchived_folders(links, out_dir=out_dir))
+    print(f'    > indexed: {num_indexed}'.ljust(36), f'({get_indexed_folders.__doc__})')
+    print(f'      > archived: {num_archived}'.ljust(36), f'({get_archived_folders.__doc__})')
+    print(f'      > unarchived: {num_unarchived}'.ljust(36), f'({get_unarchived_folders.__doc__})')
+    
+    num_present = len(get_present_folders(links, out_dir=out_dir))
+    num_valid = len(get_valid_folders(links, out_dir=out_dir))
+    print()
+    print(f'    > present: {num_present}'.ljust(36), f'({get_present_folders.__doc__})')
+    print(f'      > valid: {num_valid}'.ljust(36), f'({get_valid_folders.__doc__})')
+    
+    duplicate = get_duplicate_folders(links, out_dir=out_dir)
+    orphaned = get_orphaned_folders(links, out_dir=out_dir)
+    corrupted = get_corrupted_folders(links, out_dir=out_dir)
+    unrecognized = get_unrecognized_folders(links, out_dir=out_dir)
+    num_invalid = len({**duplicate, **orphaned, **corrupted, **unrecognized})
+    print(f'      > invalid: {num_invalid}'.ljust(36), f'({get_invalid_folders.__doc__})')
+    print(f'        > duplicate: {len(duplicate)}'.ljust(36), f'({get_duplicate_folders.__doc__})')
+    print(f'        > orphaned: {len(orphaned)}'.ljust(36), f'({get_orphaned_folders.__doc__})')
+    print(f'        > corrupted: {len(corrupted)}'.ljust(36), f'({get_corrupted_folders.__doc__})')
+    print(f'        > unrecognized: {len(unrecognized)}'.ljust(36), f'({get_unrecognized_folders.__doc__})')
+    
+    if num_indexed:
+        print()
+        print('    {lightred}Hint:{reset} You can list link data directories by status like so:'.format(**ANSI))
+        print('        archivebox list --status=<status>  (e.g. indexed, corrupted, archived, etc.)')
+
+    if orphaned:
+        print()
+        print('    {lightred}Hint:{reset} To automatically import orphaned data directories into the main index, run:'.format(**ANSI))
+        print('        archivebox init')
+
+    if num_invalid:
+        print()
+        print('    {lightred}Hint:{reset} You may need to manually remove or fix some invalid data directories; afterwards, make sure to run:'.format(**ANSI))
+        print('        archivebox init')
+    
+    print()
+
+
+@enforce_types
+def add(import_str: Optional[str]=None,
+        import_path: Optional[str]=None,
+        update_all: bool=not ONLY_NEW,
+        index_only: bool=False,
+        out_dir: str=OUTPUT_DIR) -> List[Link]:
+    """The main ArchiveBox entrancepoint. Everything starts here."""
+
+    check_data_folder(out_dir=out_dir)
+
+    if import_str and import_path:
+        stderr(
+            '[X] You should pass either an import path as an argument, '
+            'or pass a list of links via stdin, but not both.\n',
+            color='red',
+        )
+        raise SystemExit(2)
+    elif import_str:
+        import_path = save_stdin_to_sources(import_str, out_dir=out_dir)
+    else:
+        import_path = save_file_to_sources(import_path, out_dir=out_dir)
+
+    check_dependencies()
+
+    # Step 1: Load list of links from the existing index
+    #         merge in and dedupe new links from import_path
+    all_links: List[Link] = []
+    new_links: List[Link] = []
+    all_links = load_main_index(out_dir=out_dir)
+    if import_path:
+        all_links, new_links = import_new_links(all_links, import_path, out_dir=out_dir)
+
+    # Step 2: Write updated index with deduped old and new links back to disk
+    write_main_index(links=all_links, out_dir=out_dir)
+
+    if index_only:
+        return all_links
+        
+    # Step 3: Run the archive methods for each link
+    links = all_links if update_all else new_links
+    log_archiving_started(len(links))
+    idx: int = 0
+    link: Link = None                                             # type: ignore
+    try:
+        for idx, link in enumerate(links):
+            archive_link(link, out_dir=link.link_dir)
+
+    except KeyboardInterrupt:
+        log_archiving_paused(len(links), idx, link.timestamp if link else '0')
+        raise SystemExit(0)
+
+    except:
+        print()
+        raise    
+
+    log_archiving_finished(len(links))
+
+    # Step 4: Re-write links index with updated titles, icons, and resources
+    all_links = load_main_index(out_dir=out_dir)
+    write_main_index(links=list(all_links), out_dir=out_dir, finished=True)
+    return all_links
+
+@enforce_types
+def remove(filter_str: Optional[str]=None,
+           filter_patterns: Optional[List[str]]=None,
+           filter_type: str='exact',
+           after: Optional[float]=None,
+           before: Optional[float]=None,
+           yes: bool=False,
+           delete: bool=False,
+           out_dir: str=OUTPUT_DIR) -> List[Link]:
+    
+    check_data_folder(out_dir=out_dir)
+
+    if filter_str and filter_patterns:
+        stderr(
+            '[X] You should pass either a pattern as an argument, '
+            'or pass a list of patterns via stdin, but not both.\n',
+            color='red',
+        )
+        raise SystemExit(2)
+    elif not (filter_str or filter_patterns):
+        stderr(
+            '[X] You should pass either a pattern as an argument, '
+            'or pass a list of patterns via stdin.',
+            color='red',
+        )
+        stderr()
+        stderr('    {lightred}Hint:{reset} To remove all urls you can run:'.format(**ANSI))
+        stderr("        archivebox remove --filter-type=regex '.*'")
+        stderr()
+        raise SystemExit(2)
+    elif filter_str:
+        filter_patterns = [ptn.strip() for ptn in filter_str.split('\n')]
+
+    log_list_started(filter_patterns, filter_type)
+    timer = TimedProgress(360, prefix='      ')
+    try:
+        links = list(list_links(
+            filter_patterns=filter_patterns,
+            filter_type=filter_type,
+            after=after,
+            before=before,
+        ))
+    finally:
+        timer.end()
+
+    if not len(links):
+        log_removal_finished(0, 0)
+        raise SystemExit(1)
+
+
+    log_list_finished(links)
+    log_removal_started(links, yes=yes, delete=delete)
+
+    timer = TimedProgress(360, prefix='      ')
+    try:
+        to_keep = []
+        all_links = load_main_index(out_dir=out_dir)
+        for link in all_links:
+            should_remove = (
+                (after is not None and float(link.timestamp) < after)
+                or (before is not None and float(link.timestamp) > before)
+                or link_matches_filter(link, filter_patterns, filter_type)
+            )
+            if not should_remove:
+                to_keep.append(link)
+            elif should_remove and delete:
+                shutil.rmtree(link.link_dir, ignore_errors=True)
+    finally:
+        timer.end()
+
+    write_main_index(links=to_keep, out_dir=out_dir, finished=True)
+    log_removal_finished(len(all_links), len(to_keep))
+    
+    return to_keep
+
+@enforce_types
+def update(resume: Optional[float]=None,
+           only_new: bool=not ONLY_NEW,
+           index_only: bool=False,
+           overwrite: bool=False,
+           filter_patterns_str: Optional[str]=None,
+           filter_patterns: Optional[List[str]]=None,
+           filter_type: Optional[str]=None,
+           status: Optional[str]=None,
+           after: Optional[str]=None,
+           before: Optional[str]=None,
+           out_dir: str=OUTPUT_DIR) -> List[Link]:
+    """The main ArchiveBox entrancepoint. Everything starts here."""
+
+    check_dependencies()
+    check_data_folder(out_dir=out_dir)
+
+    # Step 1: Load list of links from the existing index
+    #         merge in and dedupe new links from import_path
+    all_links: List[Link] = []
+    new_links: List[Link] = []
+    all_links = load_main_index(out_dir=out_dir)
+
+    # Step 2: Write updated index with deduped old and new links back to disk
+    write_main_index(links=list(all_links), out_dir=out_dir)
+
+    # Step 3: Filter for selected_links
+    matching_links = list_links(
+        filter_patterns=filter_patterns,
+        filter_type=filter_type,
+        before=before,
+        after=after,
+    )
+    matching_folders = list_folders(
+        links=list(matching_links),
+        status=status,
+        out_dir=out_dir,
+    )
+    all_links = [link for link in matching_folders.values() if link]
+
+    if index_only:
+        return all_links
+        
+    # Step 4: Run the archive methods for each link
+    links = new_links if only_new else all_links
+    log_archiving_started(len(links), resume)
+    idx: int = 0
+    link: Link = None                                             # type: ignore
+    try:
+        for idx, link in enumerate(links_after_timestamp(links, resume)):
+            archive_link(link, overwrite=overwrite, out_dir=link.link_dir)
+
+    except KeyboardInterrupt:
+        log_archiving_paused(len(links), idx, link.timestamp if link else '0')
+        raise SystemExit(0)
+
+    except:
+        print()
+        raise    
+
+    log_archiving_finished(len(links))
+
+    # Step 5: Re-write links index with updated titles, icons, and resources
+    all_links = load_main_index(out_dir=out_dir)
+    write_main_index(links=list(all_links), out_dir=out_dir, finished=True)
+    return all_links
+
+@enforce_types
+def list_all(filter_patterns_str: Optional[str]=None,
+             filter_patterns: Optional[List[str]]=None,
+             filter_type: str='exact',
+             status: Optional[str]=None,
+             after: Optional[float]=None,
+             before: Optional[float]=None,
+             sort: Optional[str]=None,
+             csv: Optional[str]=None,
+             json: Optional[str]=None,
+             out_dir: str=OUTPUT_DIR) -> Iterable[Link]:
+    
+    check_data_folder(out_dir=out_dir)
+
+    if filter_patterns and filter_patterns_str:
+        stderr(
+            '[X] You should either pass filter patterns as arguments '
+            'or via stdin, but not both.\n',
+            color='red',
+        )
+        raise SystemExit(2)
+    elif filter_patterns_str:
+        filter_patterns = filter_patterns_str.split('\n')
+
+
+    links = list_links(
+        filter_patterns=filter_patterns,
+        filter_type=filter_type,
+        before=before,
+        after=after,
+    )
+
+    if sort:
+        links = sorted(links, key=lambda link: getattr(link, sort))
+
+    folders = list_folders(
+        links=list(links),
+        status=status,
+        out_dir=out_dir,
+    )
+    
+    if csv:
+        print(links_to_csv(folders.values(), csv_cols=csv.split(','), header=True))
+    elif json:
+        print(to_json(folders.values(), indent=4, sort_keys=True))
+    else:
+        print(folders_to_str(folders))
+    raise SystemExit(not folders)
+
+
+@enforce_types
+def list_links(filter_patterns: Optional[List[str]]=None,
+               filter_type: str='exact',
+               after: Optional[float]=None,
+               before: Optional[float]=None,
+               out_dir: str=OUTPUT_DIR) -> Iterable[Link]:
+    
+    check_data_folder(out_dir=out_dir)
+
+    all_links = load_main_index(out_dir=out_dir)
+
+    for link in all_links:
+        if after is not None and float(link.timestamp) < after:
+            continue
+        if before is not None and float(link.timestamp) > before:
+            continue
+        
+        if filter_patterns:
+            if link_matches_filter(link, filter_patterns, filter_type):
+                yield link
+        else:
+            yield link
+
+@enforce_types
+def list_folders(links: List[Link],
+                 status: str,
+                 out_dir: str=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
+    
+    check_data_folder(out_dir=out_dir)
+
+    if status == 'indexed' or not status:
+        return get_indexed_folders(links, out_dir=out_dir)
+    elif status == 'archived':
+        return get_archived_folders(links, out_dir=out_dir)
+    elif status == 'unarchived':
+        return get_unarchived_folders(links, out_dir=out_dir)
+
+    elif status == 'present':
+        return get_present_folders(links, out_dir=out_dir)
+    elif status == 'valid':
+        return get_valid_folders(links, out_dir=out_dir)
+    elif status == 'invalid':
+        return get_invalid_folders(links, out_dir=out_dir)
+
+    elif status == 'duplicate':
+        return get_duplicate_folders(links, out_dir=out_dir)
+    elif status == 'orphaned':
+        return get_orphaned_folders(links, out_dir=out_dir)
+    elif status == 'corrupted':
+        return get_corrupted_folders(links, out_dir=out_dir)
+    elif status == 'unrecognized':
+        return get_unrecognized_folders(links, out_dir=out_dir)
+
+    raise ValueError('Status not recognized.')
+
+
+def config(config_options_str: Optional[str]=None,
+           config_options: Optional[List[str]]=None,
+           get: bool=False,
+           set: bool=False,
+           reset: bool=False,
+           out_dir: str=OUTPUT_DIR) -> None:
+
+    check_data_folder(out_dir=out_dir)
+
+    if config_options and config_options_str:
+        stderr(
+            '[X] You should either pass config values as arguments '
+            'or via stdin, but not both.\n',
+            color='red',
+        )
+        raise SystemExit(2)
+    elif config_options_str:
+        config_options = config_options_str.split('\n')
+
+    config_options = config_options or []
+
+    no_args = not (get or set or reset or config_options)
+
+    matching_config: ConfigDict = {}
+    if get or no_args:
+        if config_options:
+            config_options = [get_real_name(key) for key in config_options]
+            matching_config = {key: CONFIG[key] for key in config_options if key in CONFIG}
+            failed_config = [key for key in config_options if key not in CONFIG]
+            if failed_config:
+                stderr()
+                stderr('[X] These options failed to get:', color='red')
+                stderr('    {}'.format('\n    '.join(failed_config)))
+                raise SystemExit(1)
+        else:
+            matching_config = CONFIG
+        
+        print(printable_config(matching_config))
+        raise SystemExit(not matching_config)
+    elif set:
+        new_config = {}
+        failed_options = []
+        for line in config_options:
+            if line.startswith('#') or not line.strip():
+                continue
+            if '=' not in line:
+                stderr('[X] Config KEY=VALUE must have an = sign in it', color='red')
+                stderr(f'    {line}')
+                raise SystemExit(2)
+
+            raw_key, val = line.split('=')
+            raw_key = raw_key.upper().strip()
+            key = get_real_name(raw_key)
+            if key != raw_key:
+                stderr(f'[i] Note: The config option {raw_key} has been renamed to {key}, please use the new name going forwards.', color='lightyellow')
+
+            if key in CONFIG:
+                new_config[key] = val.strip()
+            else:
+                failed_options.append(line)
+
+        if new_config:
+            before = CONFIG
+            matching_config = write_config_file(new_config, out_dir=out_dir)
+            after = load_all_config()
+            print(printable_config(matching_config))
+
+            side_effect_changes: ConfigDict = {}
+            for key, val in after.items():
+                if key in USER_CONFIG and (before[key] != after[key]) and (key not in matching_config):
+                    side_effect_changes[key] = after[key]
+
+            if side_effect_changes:
+                stderr()
+                stderr('[i] Note: This change also affected these other options that depended on it:', color='lightyellow')
+                print('    {}'.format(printable_config(side_effect_changes, prefix='    ')))
+        if failed_options:
+            stderr()
+            stderr('[X] These options failed to set:', color='red')
+            stderr('    {}'.format('\n    '.join(failed_options)))
+        raise SystemExit(bool(failed_options))
+    elif reset:
+        stderr('[X] This command is not implemented yet.', color='red')
+        stderr('    Please manually remove the relevant lines from your config file:')
+        stderr(f'        {CONFIG_FILE}')
+        raise SystemExit(2)
+
+    else:
+        stderr('[X] You must pass either --get or --set, or no arguments to get the whole config.', color='red')
+        stderr('    archivebox config')
+        stderr('    archivebox config --get SOME_KEY')
+        stderr('    archivebox config --set SOME_KEY=SOME_VALUE')
+        raise SystemExit(2)
+
+
+CRON_COMMENT = 'archivebox_schedule'
+
+@enforce_types
+def schedule(add: bool=False,
+             show: bool=False,
+             clear: bool=False,
+             foreground: bool=False,
+             run_all: bool=False,
+             quiet: bool=False,
+             every: Optional[str]=None,
+             import_path: Optional[str]=None,
+             out_dir: str=OUTPUT_DIR):
+    
+    check_data_folder(out_dir=out_dir)
+
+    os.makedirs(os.path.join(out_dir, LOGS_DIR_NAME), exist_ok=True)
+
+    cron = CronTab(user=True)
+    cron = dedupe_jobs(cron)
+
+    existing_jobs = list(cron.find_comment(CRON_COMMENT))
+    if foreground or run_all:
+        if import_path or (not existing_jobs):
+            stderr('{red}[X] You must schedule some jobs first before running in foreground mode.{reset}'.format(**ANSI))
+            stderr('    archivebox schedule --every=hour https://example.com/some/rss/feed.xml')
+            raise SystemExit(1)
+        print('{green}[*] Running {} ArchiveBox jobs in foreground task scheduler...{reset}'.format(len(existing_jobs), **ANSI))
+        if run_all:
+            try:
+                for job in existing_jobs:
+                    sys.stdout.write(f'  > {job.command}')
+                    sys.stdout.flush()
+                    job.run()
+                    sys.stdout.write(f'\r  √ {job.command}\n')
+            except KeyboardInterrupt:
+                print('\n{green}[√] Stopped.{reset}'.format(**ANSI))
+                raise SystemExit(1)
+        if foreground:
+            try:
+                for result in cron.run_scheduler():
+                    print(result)
+            except KeyboardInterrupt:
+                print('\n{green}[√] Stopped.{reset}'.format(**ANSI))
+                raise SystemExit(1)
+
+    elif show:
+        if existing_jobs:
+            print('\n'.join(str(cmd) for cmd in existing_jobs))
+        else:
+            stderr('{red}[X] There are no ArchiveBox cron jobs scheduled for your user ({}).{reset}'.format(USER, **ANSI))
+            stderr('    To schedule a new job, run:')
+            stderr('        archivebox schedule --every=[timeperiod] https://example.com/some/rss/feed.xml')
+        raise SystemExit(0)
+
+    elif clear:
+        print(cron.remove_all(comment=CRON_COMMENT))
+        cron.write()
+        raise SystemExit(0)
+
+    elif every:
+        quoted = lambda s: f'"{s}"' if s and ' ' in s else s
+        cmd = [
+            'cd',
+            quoted(out_dir),
+            '&&',
+            quoted(ARCHIVEBOX_BINARY),
+            *(['add', f'"{import_path}"'] if import_path else ['update']),
+            '>',
+            quoted(os.path.join(LOGS_DIR, 'archivebox.log')),
+            '2>&1',
+        ]
+        new_job = cron.new(command=' '.join(cmd), comment=CRON_COMMENT)
+
+        if every in ('minute', 'hour', 'day', 'week', 'month', 'year'):
+            set_every = getattr(new_job.every(), every)
+            set_every()
+        elif CronSlices.is_valid(every):
+            new_job.setall(every)
+        else:
+            stderr('{red}[X] Got invalid timeperiod for cron task.{reset}'.format(**ANSI))
+            stderr('    It must be one of minute/hour/day/week/month/year')
+            stderr('    or a quoted cron-format schedule like:')
+            stderr('        archivebox schedule --every=day https://example.com/some/rss/feed.xml')
+            stderr('        archivebox schedule --every="0/5 * * * *" https://example.com/some/rss/feed.xml')
+            raise SystemExit(1)
+
+        cron = dedupe_jobs(cron)
+        cron.write()
+
+        total_runs = sum(j.frequency_per_year() for j in cron)
+        existing_jobs = list(cron.find_comment(CRON_COMMENT))
+
+        print()
+        print('{green}[√] Scheduled new ArchiveBox cron job for user: {} ({} jobs are active).{reset}'.format(USER, len(existing_jobs), **ANSI))
+        print('\n'.join(f'  > {cmd}' if str(cmd) == str(new_job) else f'    {cmd}' for cmd in existing_jobs))
+        if total_runs > 60 and not quiet:
+            stderr()
+            stderr('{lightyellow}[!] With the current cron config, ArchiveBox is estimated to run >{} times per year.{reset}'.format(total_runs, **ANSI))
+            stderr(f'    Congrats on being an enthusiastic internet archiver! 👌')
+            stderr()
+            stderr('    Make sure you have enough storage space available to hold all the data.')
+            stderr('    Using a compressed/deduped filesystem like ZFS is recommended if you plan on archiving a lot.')
+        raise SystemExit(0)
+
+
+
+
+
+def server(runserver_args: Optional[List[str]]=None, reload: bool=False, out_dir: str=OUTPUT_DIR) -> None:
+    runserver_args = runserver_args or []
+    check_data_folder(out_dir=out_dir)
+
+    setup_django(out_dir)
+    from django.core.management import call_command
+    from django.contrib.auth.models import User
+
+    if IS_TTY and not User.objects.filter(is_superuser=True).exists():
+        print('{lightyellow}[!] No admin users exist yet, you will not be able to edit links in the UI.{reset}'.format(**ANSI))
+        print()
+        print('    To create an admin user, run:')
+        print('        archivebox manage createsuperuser')
+        print()
+
+    print('{green}[+] Starting ArchiveBox webserver...{reset}'.format(**ANSI))
+    if not reload:
+        runserver_args.append('--noreload')
+
+    call_command("runserver", *runserver_args)
+
+
+def manage(args: Optional[List[str]]=None, out_dir: str=OUTPUT_DIR) -> None:
+    check_data_folder(out_dir=out_dir)
+
+    setup_django(out_dir)
+    from django.core.management import execute_from_command_line
+
+    execute_from_command_line([f'{ARCHIVEBOX_BINARY} manage', *(args or ['help'])])
+
+def shell(out_dir: str=OUTPUT_DIR) -> None:
+    check_data_folder(out_dir=out_dir)
+
+    setup_django(OUTPUT_DIR)
+    from django.core.management import call_command
+    call_command("shell_plus")
+
+# Helpers
+
+def printable_config(config: ConfigDict, prefix: str='') -> str:
+    return f'\n{prefix}'.join(
+        f'{key}={val}'
+        for key, val in config.items()
+        if not (isinstance(val, dict) or callable(val))
+    )
+
+def dedupe_jobs(cron: CronTab) -> CronTab:
+    deduped: Set[Tuple[str, str]] = set()
+
+    for job in list(cron):
+        unique_tuple = (str(job.slices), job.command)
+        if unique_tuple not in deduped:
+            deduped.add(unique_tuple)
+        cron.remove(job)
+
+    for schedule, command in deduped:
+        job = cron.new(command=command, comment=CRON_COMMENT)
+        job.setall(schedule)
+        job.enable()
+
+    return cron
+
+
+def print_folder_status(name, folder):
+    num_files = '-'
+    if folder['enabled']:
+        if folder['is_valid']:
+            color, symbol, note = 'green', '√', 'valid'
+        else:
+            color, symbol, note, num_files = 'red', 'X', 'invalid', '?'
+    else:
+        color, symbol, note = 'lightyellow', '-', 'disabled'
+
+    if folder['path']:
+        if os.path.exists(folder['path']):
+            num_files = (
+                f'{len(os.listdir(folder["path"]))} files'
+                if os.path.isdir(folder['path']) else
+                human_readable_size(os.path.getsize(folder['path']))
+            )
+        else:
+            num_files = 'missing'
+
+        if ' ' in folder['path']:
+            folder['path'] = f'"{folder["path"]}"'
+
+    print(
+        ANSI[color],
+        symbol,
+        ANSI['reset'],
+        name.ljust(22),
+        (folder["path"] or '').ljust(76),
+        num_files.ljust(14),
+        ANSI[color],
+        note,
+        ANSI['reset'],
+    )
+
+
+def print_dependency_version(name, dependency):
+    if dependency['enabled']:
+        if dependency['is_valid']:
+            color, symbol, note = 'green', '√', 'valid'
+            version = 'v' + re.search(r'[\d\.]+', dependency['version'])[0]
+        else:
+            color, symbol, note, version = 'red', 'X', 'invalid', '?'
+    else:
+        color, symbol, note, version = 'lightyellow', '-', 'disabled', '-'
+
+    if ' ' in dependency["path"]:
+        dependency["path"] = f'"{dependency["path"]}"'
+
+    print(
+        ANSI[color],
+        symbol,
+        ANSI['reset'],
+        name.ljust(22),
+        (dependency["path"] or '').ljust(76),
+        version.ljust(14),
+        ANSI[color],
+        note,
+        ANSI['reset'],
+    )

+ 68 - 0
archivebox/parsers/__init__.py

@@ -0,0 +1,68 @@
+"""
+Everything related to parsing links from input sources.
+
+For a list of supported services, see the README.md.
+For examples of supported import formats see tests/.
+"""
+
+__package__ = 'archivebox.parsers'
+
+
+from typing import Tuple, List
+
+from ..config import TIMEOUT
+from ..util import (
+    check_url_parsing_invariants,
+    TimedProgress,
+    Link,
+    enforce_types,
+)
+from .pocket_html import parse_pocket_html_export
+from .pinboard_rss import parse_pinboard_rss_export
+from .shaarli_rss import parse_shaarli_rss_export
+from .medium_rss import parse_medium_rss_export
+from .netscape_html import parse_netscape_html_export
+from .generic_rss import parse_generic_rss_export
+from .generic_json import parse_generic_json_export
+from .generic_txt import parse_generic_txt_export
+
+
+@enforce_types
+def parse_links(source_file: str) -> Tuple[List[Link], str]:
+    """parse a list of URLs with their metadata from an 
+       RSS feed, bookmarks export, or text file
+    """
+
+    check_url_parsing_invariants()
+    PARSERS = (
+        # Specialized parsers
+        ('Pocket HTML', parse_pocket_html_export),
+        ('Pinboard RSS', parse_pinboard_rss_export),
+        ('Shaarli RSS', parse_shaarli_rss_export),
+        ('Medium RSS', parse_medium_rss_export),
+        
+        # General parsers
+        ('Netscape HTML', parse_netscape_html_export),
+        ('Generic RSS', parse_generic_rss_export),
+        ('Generic JSON', parse_generic_json_export),
+
+        # Fallback parser
+        ('Plain Text', parse_generic_txt_export),
+    )
+    timer = TimedProgress(TIMEOUT * 4)
+    with open(source_file, 'r', encoding='utf-8') as file:
+        for parser_name, parser_func in PARSERS:
+            try:
+                links = list(parser_func(file))
+                if links:
+                    timer.end()
+                    return links, parser_name
+            except Exception as err:   # noqa
+                # Parsers are tried one by one down the list, and the first one
+                # that succeeds is used. To see why a certain parser was not used
+                # due to error or format incompatibility, uncomment this line:
+                # print('[!] Parser {} failed: {} {}'.format(parser_name, err.__class__.__name__, err))
+                pass
+
+    timer.end()
+    return [], 'Failed to parse'
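A minimal sketch of exercising the parser fallback chain above, assuming the package is importable; the file path and its contents are made up. When no specialized parser matches, the plain-text fallback should pick up any bare URLs:

    from archivebox.parsers import parse_links

    # write a throwaway plain-text source file containing one URL
    with open('/tmp/bookmarks.txt', 'w', encoding='utf-8') as f:
        f.write('https://example.com/some/page\n')

    links, parser_name = parse_links('/tmp/bookmarks.txt')
    print(parser_name)                    # expected to fall through to 'Plain Text'
    print([link.url for link in links])   # e.g. ['https://example.com/some/page']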

+ 65 - 0
archivebox/parsers/generic_json.py

@@ -0,0 +1,65 @@
+__package__ = 'archivebox.parsers'
+
+import json
+
+from typing import IO, Iterable
+from datetime import datetime
+
+from ..index.schema import Link
+from ..util import (
+    htmldecode,
+    enforce_types,
+)
+
+
+@enforce_types
+def parse_generic_json_export(json_file: IO[str]) -> Iterable[Link]:
+    """Parse JSON-format bookmarks export files (produced by pinboard.in/export/, or wallabag)"""
+
+    json_file.seek(0)
+    links = json.load(json_file)
+    json_date = lambda s: datetime.strptime(s, '%Y-%m-%dT%H:%M:%S%z')
+
+    for link in links:
+        # example line
+        # {"href":"http:\/\/www.reddit.com\/r\/example","description":"title here","extended":"","meta":"18a973f09c9cc0608c116967b64e0419","hash":"910293f019c2f4bb1a749fb937ba58e3","time":"2014-06-14T15:51:42Z","shared":"no","toread":"no","tags":"reddit android"}]
+        if link:
+            # Parse URL
+            url = link.get('href') or link.get('url') or link.get('URL')
+            if not url:
+                raise Exception('JSON must contain URL in each entry [{"url": "http://...", ...}, ...]')
+
+            # Parse the timestamp
+            ts_str = str(datetime.now().timestamp())
+            if link.get('timestamp'):
+                # chrome/ff histories use a very precise timestamp
+                ts_str = str(link['timestamp'] / 10000000)  
+            elif link.get('time'):
+                ts_str = str(json_date(link['time'].split(',', 1)[0]).timestamp())
+            elif link.get('created_at'):
+                ts_str = str(json_date(link['created_at']).timestamp())
+            elif link.get('created'):
+                ts_str = str(json_date(link['created']).timestamp())
+            elif link.get('date'):
+                ts_str = str(json_date(link['date']).timestamp())
+            elif link.get('bookmarked'):
+                ts_str = str(json_date(link['bookmarked']).timestamp())
+            elif link.get('saved'):
+                ts_str = str(json_date(link['saved']).timestamp())
+            
+            # Parse the title
+            title = None
+            if link.get('title'):
+                title = link['title'].strip()
+            elif link.get('description'):
+                title = link['description'].replace(' — Readability', '').strip()
+            elif link.get('name'):
+                title = link['name'].strip()
+
+            yield Link(
+                url=htmldecode(url),
+                timestamp=ts_str,
+                title=htmldecode(title) or None,
+                tags=htmldecode(link.get('tags')) or '',
+                sources=[json_file.name],
+            )
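A small sketch of the minimal JSON shape this parser accepts; the field values are made up. Note that the parsers record sources=[json_file.name], so they expect a real file on disk rather than an in-memory StringIO:

    import json
    from archivebox.parsers.generic_json import parse_generic_json_export

    bookmarks = [{
        'url': 'https://example.com/some/page',
        'title': 'Example page',
        'tags': 'example demo',
        'created_at': '2019-03-26T09:00:00+0000',   # matches the %z strptime format used above
    }]
    with open('/tmp/bookmarks.json', 'w', encoding='utf-8') as f:
        json.dump(bookmarks, f)

    with open('/tmp/bookmarks.json', 'r', encoding='utf-8') as f:
        for link in parse_generic_json_export(f):
            print(link.url, link.timestamp, link.title)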

+ 49 - 0
archivebox/parsers/generic_rss.py

@@ -0,0 +1,49 @@
+__package__ = 'archivebox.parsers'
+
+
+from typing import IO, Iterable
+from datetime import datetime
+
+from ..index.schema import Link
+from ..util import (
+    htmldecode,
+    enforce_types,
+    str_between,
+)
+
+@enforce_types
+def parse_generic_rss_export(rss_file: IO[str]) -> Iterable[Link]:
+    """Parse RSS XML-format files into links"""
+
+    rss_file.seek(0)
+    items = rss_file.read().split('<item>')
+    items = items[1:] if items else []
+    for item in items:
+        # example item:
+        # <item>
+        # <title><![CDATA[How JavaScript works: inside the V8 engine]]></title>
+        # <category>Unread</category>
+        # <link>https://blog.sessionstack.com/how-javascript-works-inside</link>
+        # <guid>https://blog.sessionstack.com/how-javascript-works-inside</guid>
+        # <pubDate>Mon, 21 Aug 2017 14:21:58 -0500</pubDate>
+        # </item>
+
+        trailing_removed = item.split('</item>', 1)[0]
+        leading_removed = trailing_removed.split('<item>', 1)[-1].strip()
+        rows = leading_removed.split('\n')
+
+        def get_row(key):
+            return [r for r in rows if r.strip().startswith('<{}>'.format(key))][0]
+
+        url = str_between(get_row('link'), '<link>', '</link>')
+        ts_str = str_between(get_row('pubDate'), '<pubDate>', '</pubDate>')
+        time = datetime.strptime(ts_str, "%a, %d %b %Y %H:%M:%S %z")
+        title = str_between(get_row('title'), '<![CDATA[', ']]').strip()
+
+        yield Link(
+            url=htmldecode(url),
+            timestamp=str(time.timestamp()),
+            title=htmldecode(title) or None,
+            tags=None,
+            sources=[rss_file.name],
+        )

+ 30 - 0
archivebox/parsers/generic_txt.py

@@ -0,0 +1,30 @@
+__package__ = 'archivebox.parsers'
+__description__ = 'Plain Text'
+
+import re
+
+from typing import IO, Iterable
+from datetime import datetime
+
+from ..index.schema import Link
+from ..util import (
+    htmldecode,
+    enforce_types,
+    URL_REGEX
+)
+
+@enforce_types
+def parse_generic_txt_export(text_file: IO[str]) -> Iterable[Link]:
+    """Parse raw links from each line in a text file"""
+
+    text_file.seek(0)
+    for line in text_file.readlines():
+        urls = re.findall(URL_REGEX, line) if line.strip() else ()
+        for url in urls:                                                # type: ignore
+            yield Link(
+                url=htmldecode(url),
+                timestamp=str(datetime.now().timestamp()),
+                title=None,
+                tags=None,
+                sources=[text_file.name],
+            )

+ 35 - 0
archivebox/parsers/medium_rss.py

@@ -0,0 +1,35 @@
+__package__ = 'archivebox.parsers'
+
+
+from typing import IO, Iterable
+from datetime import datetime
+
+from xml.etree import ElementTree
+
+from ..index.schema import Link
+from ..util import (
+    htmldecode,
+    enforce_types,
+)
+
+
+@enforce_types
+def parse_medium_rss_export(rss_file: IO[str]) -> Iterable[Link]:
+    """Parse Medium RSS feed files into links"""
+
+    rss_file.seek(0)
+    root = ElementTree.parse(rss_file).getroot()
+    items = root.find("channel").findall("item")                        # type: ignore
+    for item in items:
+        url = item.find("link").text                                    # type: ignore
+        title = item.find("title").text.strip()                         # type: ignore
+        ts_str = item.find("pubDate").text                              # type: ignore
+        time = datetime.strptime(ts_str, "%a, %d %b %Y %H:%M:%S %Z")    # type: ignore
+        
+        yield Link(
+            url=htmldecode(url),
+            timestamp=str(time.timestamp()),
+            title=htmldecode(title) or None,
+            tags=None,
+            sources=[rss_file.name],
+        )

+ 39 - 0
archivebox/parsers/netscape_html.py

@@ -0,0 +1,39 @@
+__package__ = 'archivebox.parsers'
+
+
+import re
+
+from typing import IO, Iterable
+from datetime import datetime
+
+from ..index.schema import Link
+from ..util import (
+    htmldecode,
+    enforce_types,
+)
+
+
+@enforce_types
+def parse_netscape_html_export(html_file: IO[str]) -> Iterable[Link]:
+    """Parse netscape-format bookmarks export files (produced by all browsers)"""
+
+    html_file.seek(0)
+    pattern = re.compile("<a href=\"(.+?)\" add_date=\"(\\d+)\"[^>]*>(.+)</a>", re.UNICODE | re.IGNORECASE)
+    for line in html_file:
+        # example line
+        # <DT><A HREF="https://example.com/?q=1+2" ADD_DATE="1497562974" LAST_MODIFIED="1497562974" ICON_URI="https://example.com/favicon.ico" ICON="data:image/png;base64,...">example bookmark title</A>
+        
+        match = pattern.search(line)
+        if match:
+            url = match.group(1)
+            time = datetime.fromtimestamp(float(match.group(2)))
+            title = match.group(3).strip()
+
+            yield Link(
+                url=htmldecode(url),
+                timestamp=str(time.timestamp()),
+                title=htmldecode(title) or None,
+                tags=None,
+                sources=[html_file.name],
+            )
+

+ 47 - 0
archivebox/parsers/pinboard_rss.py

@@ -0,0 +1,47 @@
+__package__ = 'archivebox.parsers'
+
+
+from typing import IO, Iterable
+from datetime import datetime
+
+from xml.etree import ElementTree
+
+from ..index.schema import Link
+from ..util import (
+    htmldecode,
+    enforce_types,
+)
+
+
+@enforce_types
+def parse_pinboard_rss_export(rss_file: IO[str]) -> Iterable[Link]:
+    """Parse Pinboard RSS feed files into links"""
+
+    rss_file.seek(0)
+    root = ElementTree.parse(rss_file).getroot()
+    items = root.findall("{http://purl.org/rss/1.0/}item")
+    for item in items:
+        find = lambda p: item.find(p).text.strip() if item.find(p) is not None else None    # type: ignore
+
+        url = find("{http://purl.org/rss/1.0/}link")
+        tags = find("{http://purl.org/dc/elements/1.1/}subject")
+        title = find("{http://purl.org/rss/1.0/}title")
+        ts_str = find("{http://purl.org/dc/elements/1.1/}date")
+        
+        # Pinboard includes a colon in its date stamp timezone offsets, which
+        # Python can't parse. Remove it:
+        if ts_str and ts_str[-3:-2] == ":":
+            ts_str = ts_str[:-3]+ts_str[-2:]
+
+        if ts_str:
+            time = datetime.strptime(ts_str, "%Y-%m-%dT%H:%M:%S%z")
+        else:
+            time = datetime.now()
+
+        yield Link(
+            url=htmldecode(url),
+            timestamp=str(time.timestamp()),
+            title=htmldecode(title) or None,
+            tags=htmldecode(tags) or None,
+            sources=[rss_file.name],
+        )
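The timezone handling above is the only non-obvious step: older Python strptime implementations reject the '+00:00' colon form for %z, so the colon is stripped before parsing. A standalone illustration with a made-up timestamp:

    from datetime import datetime

    ts_str = '2019-01-30T06:06:01+00:00'    # example Pinboard <dc:date> value
    if ts_str and ts_str[-3:-2] == ':':
        ts_str = ts_str[:-3] + ts_str[-2:]  # '+00:00' -> '+0000'

    print(datetime.strptime(ts_str, '%Y-%m-%dT%H:%M:%S%z').isoformat())
    # 2019-01-30T06:06:01+00:00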

+ 38 - 0
archivebox/parsers/pocket_html.py

@@ -0,0 +1,38 @@
+__package__ = 'archivebox.parsers'
+
+
+import re
+
+from typing import IO, Iterable
+from datetime import datetime
+
+from ..index.schema import Link
+from ..util import (
+    htmldecode,
+    enforce_types,
+)
+
+
+@enforce_types
+def parse_pocket_html_export(html_file: IO[str]) -> Iterable[Link]:
+    """Parse Pocket-format bookmarks export files (produced by getpocket.com/export/)"""
+
+    html_file.seek(0)
+    pattern = re.compile("^\\s*<li><a href=\"(.+)\" time_added=\"(\\d+)\" tags=\"(.*)\">(.+)</a></li>", re.UNICODE)
+    for line in html_file:
+        # example line
+        # <li><a href="http://example.com/ time_added="1478739709" tags="tag1,tag2">example title</a></li>
+        match = pattern.search(line)
+        if match:
+            url = match.group(1).replace('http://www.readability.com/read?url=', '')           # remove old readability prefixes to get original url
+            time = datetime.fromtimestamp(float(match.group(2)))
+            tags = match.group(3)
+            title = match.group(4).replace(' — Readability', '').replace('http://www.readability.com/read?url=', '')
+            
+            yield Link(
+                url=htmldecode(url),
+                timestamp=str(time.timestamp()),
+                title=htmldecode(title) or None,
+                tags=tags or '',
+                sources=[html_file.name],
+            )
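
For reference (not part of the committed diff), the regex and the Readability clean-up above applied to the example line, stdlib only:

    import re
    from datetime import datetime

    pattern = re.compile(r'^\s*<li><a href="(.+)" time_added="(\d+)" tags="(.*)">(.+)</a></li>', re.UNICODE)
    line = '<li><a href="http://example.com/" time_added="1478739709" tags="tag1,tag2">example title</a></li>'
    match = pattern.search(line)
    assert match is not None
    url = match.group(1).replace('http://www.readability.com/read?url=', '')
    print(url)                                            # http://example.com/
    print(datetime.fromtimestamp(float(match.group(2))))  # 1478739709 rendered in the local timezone
    print(match.group(3))                                 # tag1,tag2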

+ 50 - 0
archivebox/parsers/shaarli_rss.py

@@ -0,0 +1,50 @@
+__package__ = 'archivebox.parsers'
+
+
+from typing import IO, Iterable
+from datetime import datetime
+
+from ..index.schema import Link
+from ..util import (
+    htmldecode,
+    enforce_types,
+    str_between,
+)
+
+
+@enforce_types
+def parse_shaarli_rss_export(rss_file: IO[str]) -> Iterable[Link]:
+    """Parse Shaarli-specific RSS XML-format files into links"""
+
+    rss_file.seek(0)
+    entries = rss_file.read().split('<entry>')[1:]
+    for entry in entries:
+        # example entry:
+        # <entry>
+        #   <title>Aktuelle Trojaner-Welle: Emotet lauert in gefälschten Rechnungsmails | heise online</title>
+        #   <link href="https://www.heise.de/security/meldung/Aktuelle-Trojaner-Welle-Emotet-lauert-in-gefaelschten-Rechnungsmails-4291268.html" />
+        #   <id>https://demo.shaarli.org/?cEV4vw</id>
+        #   <published>2019-01-30T06:06:01+00:00</published>
+        #   <updated>2019-01-30T06:06:01+00:00</updated>
+        #   <content type="html" xml:lang="en"><![CDATA[<div class="markdown"><p>&#8212; <a href="https://demo.shaarli.org/?cEV4vw">Permalink</a></p></div>]]></content>
+        # </entry>
+
+        trailing_removed = entry.split('</entry>', 1)[0]
+        leading_removed = trailing_removed.strip()
+        rows = leading_removed.split('\n')
+
+        def get_row(key):
+            return [r.strip() for r in rows if r.strip().startswith('<{}'.format(key))][0]
+
+        title = str_between(get_row('title'), '<title>', '</title>').strip()
+        url = str_between(get_row('link'), '<link href="', '" />')
+        ts_str = str_between(get_row('published'), '<published>', '</published>')
+        time = datetime.strptime(ts_str, "%Y-%m-%dT%H:%M:%S%z")
+
+        yield Link(
+            url=htmldecode(url),
+            timestamp=str(time.timestamp()),
+            title=htmldecode(title) or None,
+            tags=None,
+            sources=[rss_file.name],
+        )
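
Usage follows the same shape as the other parsers in this package; a minimal sketch, assuming a Shaarli export saved to a local file (the filename below is illustrative, not from the commit):

    from archivebox.parsers.shaarli_rss import parse_shaarli_rss_export

    with open('shaarli_export.atom', encoding='utf-8') as f:   # hypothetical path
        for link in parse_shaarli_rss_export(f):
            print(link.timestamp, link.url, link.title)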

+ 0 - 0
archivebox/legacy/templates/favicon.ico → archivebox/themes/legacy/favicon.ico


+ 0 - 0
archivebox/legacy/templates/link_details.html → archivebox/themes/legacy/link_details.html


+ 0 - 0
archivebox/legacy/templates/main_index.html → archivebox/themes/legacy/main_index.html


+ 0 - 0
archivebox/legacy/templates/main_index_row.html → archivebox/themes/legacy/main_index_row.html


+ 0 - 0
archivebox/legacy/templates/robots.txt → archivebox/themes/legacy/robots.txt


+ 0 - 0
archivebox/legacy/templates/static/archive.png → archivebox/themes/legacy/static/archive.png


+ 0 - 0
archivebox/legacy/templates/static/bootstrap.min.css → archivebox/themes/legacy/static/bootstrap.min.css


+ 0 - 0
archivebox/legacy/templates/static/external.png → archivebox/themes/legacy/static/external.png


+ 0 - 0
archivebox/legacy/templates/static/jquery.dataTables.min.css → archivebox/themes/legacy/static/jquery.dataTables.min.css


+ 0 - 0
archivebox/legacy/templates/static/jquery.dataTables.min.js → archivebox/themes/legacy/static/jquery.dataTables.min.js


+ 0 - 0
archivebox/legacy/templates/static/jquery.min.js → archivebox/themes/legacy/static/jquery.min.js


+ 0 - 0
archivebox/legacy/templates/static/sort_asc.png → archivebox/themes/legacy/static/sort_asc.png


+ 0 - 0
archivebox/legacy/templates/static/sort_both.png → archivebox/themes/legacy/static/sort_both.png


+ 0 - 0
archivebox/legacy/templates/static/sort_desc.png → archivebox/themes/legacy/static/sort_desc.png


+ 0 - 0
archivebox/legacy/templates/static/spinner.gif → archivebox/themes/legacy/static/spinner.gif


+ 48 - 18
archivebox/legacy/util.py → archivebox/util.py

@@ -1,6 +1,7 @@
 import os
 import re
 import sys
+import ssl
 import json
 import time
 import shutil
@@ -8,7 +9,7 @@ import argparse
 
 from string import Template
 from json import JSONEncoder
-from typing import List, Optional, Any, Union, IO, Mapping, Tuple
+from typing import List, Dict, Optional, Any, Union, IO, Mapping, Tuple
 from inspect import signature
 from functools import wraps
 from hashlib import sha256
@@ -28,11 +29,12 @@ from subprocess import (
 
 from base32_crockford import encode as base32_encode         # type: ignore
 
-from .schema import Link
+from .index.schema import Link
 from .config import (
     ANSI,
     TERM_WIDTH,
-    SOURCES_DIR,
+    OUTPUT_DIR,
+    SOURCES_DIR_NAME,
     OUTPUT_PERMISSIONS,
     TIMEOUT,
     SHOW_PROGRESS,
@@ -40,8 +42,9 @@ from .config import (
     CHECK_SSL_VALIDITY,
     WGET_USER_AGENT,
     CHROME_OPTIONS,
+    check_data_folder,
 )
-from .logs import pretty_path
+from .cli.logging import pretty_path
 
 ### Parsing Helpers
 
@@ -187,31 +190,36 @@ def check_url_parsing_invariants() -> None:
 ### Random Helpers
 
 @enforce_types
-def handle_stdin_import(raw_text: str) -> str:
-    if not os.path.exists(SOURCES_DIR):
-        os.makedirs(SOURCES_DIR)
+def save_stdin_to_sources(raw_text: str, out_dir: str=OUTPUT_DIR) -> str:
+    check_data_folder(out_dir=out_dir)
+
+    sources_dir = os.path.join(out_dir, SOURCES_DIR_NAME)
+    if not os.path.exists(sources_dir):
+        os.makedirs(sources_dir)
 
     ts = str(datetime.now().timestamp()).split('.', 1)[0]
 
-    source_path = os.path.join(SOURCES_DIR, '{}-{}.txt'.format('stdin', ts))
+    source_path = os.path.join(sources_dir, '{}-{}.txt'.format('stdin', ts))
 
     atomic_write(raw_text, source_path)
     return source_path
 
 
 @enforce_types
-def handle_file_import(path: str, timeout: int=TIMEOUT) -> str:
+def save_file_to_sources(path: str, timeout: int=TIMEOUT, out_dir: str=OUTPUT_DIR) -> str:
     """download a given url's content into output/sources/domain-<timestamp>.txt"""
+    check_data_folder(out_dir=out_dir)
 
-    if not os.path.exists(SOURCES_DIR):
-        os.makedirs(SOURCES_DIR)
+    sources_dir = os.path.join(out_dir, SOURCES_DIR_NAME)
+    if not os.path.exists(sources_dir):
+        os.makedirs(sources_dir)
 
     ts = str(datetime.now().timestamp()).split('.', 1)[0]
 
-    source_path = os.path.join(SOURCES_DIR, '{}-{}.txt'.format(basename(path), ts))
+    source_path = os.path.join(sources_dir, '{}-{}.txt'.format(basename(path), ts))
 
     if any(path.startswith(s) for s in ('http://', 'https://', 'ftp://')):
-        source_path = os.path.join(SOURCES_DIR, '{}-{}.txt'.format(domain(path), ts))
+        source_path = os.path.join(sources_dir, '{}-{}.txt'.format(domain(path), ts))
         print('{}[*] [{}] Downloading {}{}'.format(
             ANSI['green'],
             datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
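
For clarity on the renamed helpers above: both now resolve the sources folder relative to the chosen data directory rather than a global SOURCES_DIR, and imports are written as <stdin-or-domain>-<timestamp>.txt inside it. An illustrative sketch of the path construction only (the directory values are stand-ins, not from the commit):

    import os
    from datetime import datetime

    out_dir = '/data'                                      # stand-in for OUTPUT_DIR
    sources_dir = os.path.join(out_dir, 'sources')         # out_dir + SOURCES_DIR_NAME
    ts = str(datetime.now().timestamp()).split('.', 1)[0]
    print(os.path.join(sources_dir, '{}-{}.txt'.format('stdin', ts)))
    # e.g. /data/sources/stdin-1553203765.txt
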
@@ -532,7 +540,6 @@ def download_url(url: str, timeout: int=TIMEOUT) -> str:
     if CHECK_SSL_VALIDITY:
         resp = urlopen(req, timeout=timeout)
     else:
-        import ssl
         insecure = ssl._create_unverified_context()
         resp = urlopen(req, timeout=timeout, context=insecure)
 
@@ -662,7 +669,7 @@ def to_json(obj: Any, file: IO=None, indent: Optional[int]=4, sort_keys: bool=Tr
         return json.dumps(obj, indent=indent, sort_keys=sort_keys, cls=ExtendedEncoder)
 
 
-def to_csv(links: List[Link], csv_cols: Optional[List[str]]=None,
+def links_to_csv(links: List[Link], csv_cols: Optional[List[str]]=None,
            header: bool=True, ljust: int=0, separator: str=',') -> str:
     csv_cols = csv_cols or ['timestamp', 'is_archived', 'url']
     
@@ -677,6 +684,8 @@ def to_csv(links: List[Link], csv_cols: Optional[List[str]]=None,
 
     return '\n'.join((header_str, *row_strs))
 
+def folders_to_str(folders: Dict[str, Optional[Link]]) -> str:
+    return '\n'.join(f'{folder} {link}' for folder, link in folders.items())
 
 @enforce_types
 def render_template(template_path: str, context: Mapping[str, str]) -> str:
@@ -713,11 +722,11 @@ def atomic_write(contents: Union[dict, str, bytes], path: str) -> None:
             os.remove(tmp_file)
 
 
-def reject_stdin(caller: str) -> None:
+def reject_stdin(caller: str, stdin: Optional[IO]=sys.stdin) -> None:
     """Tell the user they passed stdin to a command that doesn't accept it"""
 
-    if not sys.stdin.isatty():
-        stdin_raw_text = sys.stdin.read().strip()
+    if stdin and not stdin.isatty():
+        stdin_raw_text = stdin.read().strip()
         if stdin_raw_text:
             print(
                 '{red}[X] The "{}" command does not accept stdin.{reset}\n'.format(
@@ -731,9 +740,30 @@ def reject_stdin(caller: str) -> None:
             print()
             raise SystemExit(1)
 
+def accept_stdin(stdin: Optional[IO]=sys.stdin) -> Optional[str]:
+    if stdin and not stdin.isatty():
+        return stdin.read()
+    return None
+
+
+def set_docstring(text: str):
+    def decorator(func):
+        @wraps(func)
+        def wrapper_with_docstring(*args, **kwargs):
+            return func(*args, **kwargs)
+        wrapper_with_docstring.__doc__ = text
+        return wrapper_with_docstring
+    return decorator
+
 
 class SmartFormatter(argparse.HelpFormatter):
     def _split_lines(self, text, width):
         if '\n' in text:
             return text.splitlines()
         return argparse.HelpFormatter._split_lines(self, text, width)
+
+
+class ArchiveError(Exception):
+    def __init__(self, message, hints=None):
+        super().__init__(message)
+        self.hints = hints
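
A closing note on the stdin helpers above: threading the stream through as a parameter (rather than reading sys.stdin directly) makes them easy to exercise without a real TTY. A minimal sketch using StringIO stand-ins, assuming the package is importable; the caller name passed to reject_stdin is arbitrary:

    import io
    from archivebox.util import accept_stdin, reject_stdin

    print(accept_stdin(io.StringIO('https://example.com\n')))    # echoes the piped text back

    try:
        reject_stdin('archivebox version', io.StringIO('unexpected input'))
    except SystemExit:
        print('stdin correctly rejected')                        # reject_stdin exits with status 1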