
move everything out of legacy folder

Nick Sweeting 6 years ago
commit 1b8abc0961
74 changed files with 3155 additions and 2622 deletions
  1. + 3 - 0  archivebox/__init__.py
  2. + 7 - 2  archivebox/__main__.py
  3. + 11 - 4  archivebox/cli/__init__.py
  4. + 14 - 46  archivebox/cli/archivebox.py
  5. + 46 - 61  archivebox/cli/archivebox_add.py
  6. + 17 - 115  archivebox/cli/archivebox_config.py
  7. + 9 - 37  archivebox/cli/archivebox_help.py
  8. + 10 - 11  archivebox/cli/archivebox_info.py
  9. + 9 - 8  archivebox/cli/archivebox_init.py
  10. + 21 - 53  archivebox/cli/archivebox_list.py
  11. + 9 - 15  archivebox/cli/archivebox_manage.py
  12. + 18 - 29  archivebox/cli/archivebox_remove.py
  13. + 24 - 134  archivebox/cli/archivebox_schedule.py
  14. + 13 - 25  archivebox/cli/archivebox_server.py
  15. + 12 - 13  archivebox/cli/archivebox_shell.py
  16. + 82 - 14  archivebox/cli/archivebox_update.py
  17. + 11 - 98  archivebox/cli/archivebox_version.py
  18. + 10 - 8  archivebox/cli/logging.py
  19. + 5 - 5  archivebox/cli/tests.py
  20. + 26 - 22  archivebox/config/__init__.py
  21. + 1 - 1  archivebox/config/stubs.py
  22. + 2 - 4  archivebox/core/admin.py
  23. + 2 - 2  archivebox/core/models.py
  24. + 3 - 3  archivebox/core/views.py
  25. + 17 - 4  archivebox/core/welcome_message.py
  26. + 105 - 0  archivebox/extractors/__init__.py
  27. + 115 - 0  archivebox/extractors/archive_org.py
  28. + 73 - 0  archivebox/extractors/dom.py
  29. + 65 - 0  archivebox/extractors/favicon.py
  30. + 94 - 0  archivebox/extractors/git.py
  31. + 100 - 0  archivebox/extractors/media.py
  32. + 72 - 0  archivebox/extractors/pdf.py
  33. + 71 - 0  archivebox/extractors/screenshot.py
  34. + 63 - 0  archivebox/extractors/title.py
  35. + 123 - 0  archivebox/extractors/wget.py
  36. + 277 - 32  archivebox/index/__init__.py
  37. + 13 - 13  archivebox/index/html.py
  38. + 9 - 9  archivebox/index/json.py
  39. + 24 - 22  archivebox/index/schema.py
  40. + 14 - 4  archivebox/index/sql.py
  41. + 0 - 58  archivebox/legacy/ArchiveBox.conf
  42. + 0 - 1  archivebox/legacy/__init__.py
  43. + 0 - 694  archivebox/legacy/archive_methods.py
  44. + 0 - 626  archivebox/legacy/main.py
  45. + 0 - 10  archivebox/legacy/mypy_django.ini
  46. + 0 - 331  archivebox/legacy/parse.py
  47. + 0 - 89  archivebox/legacy/purge.py
  48. + 0 - 1  archivebox/legacy/storage/__init__.py
  49. + 1086 - 0  archivebox/main.py
  50. + 68 - 0  archivebox/parsers/__init__.py
  51. + 65 - 0  archivebox/parsers/generic_json.py
  52. + 49 - 0  archivebox/parsers/generic_rss.py
  53. + 30 - 0  archivebox/parsers/generic_txt.py
  54. + 35 - 0  archivebox/parsers/medium_rss.py
  55. + 39 - 0  archivebox/parsers/netscape_html.py
  56. + 47 - 0  archivebox/parsers/pinboard_rss.py
  57. + 38 - 0  archivebox/parsers/pocket_html.py
  58. + 50 - 0  archivebox/parsers/shaarli_rss.py
  59. + 0 - 0  archivebox/themes/legacy/favicon.ico
  60. + 0 - 0  archivebox/themes/legacy/link_details.html
  61. + 0 - 0  archivebox/themes/legacy/main_index.html
  62. + 0 - 0  archivebox/themes/legacy/main_index_row.html
  63. + 0 - 0  archivebox/themes/legacy/robots.txt
  64. + 0 - 0  archivebox/themes/legacy/static/archive.png
  65. + 0 - 0  archivebox/themes/legacy/static/bootstrap.min.css
  66. + 0 - 0  archivebox/themes/legacy/static/external.png
  67. + 0 - 0  archivebox/themes/legacy/static/jquery.dataTables.min.css
  68. + 0 - 0  archivebox/themes/legacy/static/jquery.dataTables.min.js
  69. + 0 - 0  archivebox/themes/legacy/static/jquery.min.js
  70. + 0 - 0  archivebox/themes/legacy/static/sort_asc.png
  71. + 0 - 0  archivebox/themes/legacy/static/sort_both.png
  72. + 0 - 0  archivebox/themes/legacy/static/sort_desc.png
  73. + 0 - 0  archivebox/themes/legacy/static/spinner.gif
  74. + 48 - 18  archivebox/util.py

+ 3 - 0
archivebox/__init__.py

@@ -1,3 +1,6 @@
 __package__ = 'archivebox'
 __package__ = 'archivebox'
 
 
 from . import core
 from . import core
+from . import cli
+
+from .main import *

+ 7 - 2
archivebox/__main__.py

@@ -2,9 +2,14 @@
 
 
 __package__ = 'archivebox'
 __package__ = 'archivebox'
 
 
-from .cli.archivebox import main
+import sys
+from .cli import archivebox
+
+
+def main():
+    archivebox.main(args=sys.argv[1:], stdin=sys.stdin)
 
 
 
 
 if __name__ == '__main__':
 if __name__ == '__main__':
-    main()
+    archivebox.main(args=sys.argv[1:], stdin=sys.stdin)
 
 

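Under the new wiring, __main__.py passes args and stdin to the CLI dispatcher explicitly instead of importing a zero-argument main. A minimal sketch of driving the entrypoint from Python under that assumption (the --quiet flag is the one defined in archivebox_version.py further down):

    import sys

    from archivebox.cli import archivebox

    # roughly equivalent to running `python -m archivebox version --quiet`:
    # the dispatcher now receives args and stdin explicitly rather than reading globals
    archivebox.main(args=['version', '--quiet'], stdin=sys.stdin)
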
+ 11 - 4
archivebox/cli/__init__.py

@@ -2,13 +2,17 @@ __package__ = 'archivebox.cli'
 
 
 import os
 import os
 
 
-from typing import Dict
+from typing import Dict, List, Optional, IO
 from importlib import import_module
 from importlib import import_module
 
 
 CLI_DIR = os.path.dirname(os.path.abspath(__file__))
 CLI_DIR = os.path.dirname(os.path.abspath(__file__))
 
 
 # these common commands will appear sorted before any others for ease-of-use
 # these common commands will appear sorted before any others for ease-of-use
-display_first = ('help', 'version', 'init', 'info', 'config', 'list', 'update', 'add', 'remove')
+meta_cmds = ('help', 'version')
+main_cmds = ('init', 'info', 'config')
+archive_cmds = ('add', 'remove', 'update', 'list')
+
+display_first = (*meta_cmds, *main_cmds, *archive_cmds)
 
 
 # every imported command module must have these properties in order to be valid
 # every imported command module must have these properties in order to be valid
 required_attrs = ('__package__', '__command__', 'main')
 required_attrs = ('__package__', '__command__', 'main')
@@ -42,11 +46,14 @@ def list_subcommands() -> Dict[str, str]:
     return dict(sorted(COMMANDS, key=display_order))
     return dict(sorted(COMMANDS, key=display_order))
 
 
 
 
-def run_subcommand(subcommand: str, args=None) -> None:
+def run_subcommand(subcommand: str,
+                   subcommand_args: List[str]=None,
+                   stdin: Optional[IO]=None,
+                   pwd: Optional[str]=None) -> None:
     """run a given ArchiveBox subcommand with the given list of args"""
     """run a given ArchiveBox subcommand with the given list of args"""
 
 
     module = import_module('.archivebox_{}'.format(subcommand), __package__)
     module = import_module('.archivebox_{}'.format(subcommand), __package__)
-    module.main(args)    # type: ignore
+    module.main(args=subcommand_args, stdin=stdin, pwd=pwd)    # type: ignore
 
 
 
 
 SUBCOMMANDS = list_subcommands()
 SUBCOMMANDS = list_subcommands()

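run_subcommand() now threads the subcommand's argv, the stdin stream, and the working directory through to each command module instead of letting them read sys.argv/sys.stdin themselves. A hedged sketch of calling the dispatcher directly; the collection path is purely illustrative:

    import sys

    from archivebox.cli import list_subcommands, run_subcommand

    # mapping of subcommand name -> description, with meta/main/archive commands sorted first
    print(list_subcommands())

    # dispatch `archivebox list --status=archived` against an explicit data directory
    run_subcommand(
        subcommand='list',
        subcommand_args=['--status=archived'],
        stdin=sys.stdin,
        pwd='/path/to/archivebox/data',  # illustrative collection directory
    )
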
+ 14 - 46
archivebox/cli/archivebox.py

@@ -5,19 +5,17 @@ __package__ = 'archivebox.cli'
 __command__ = 'archivebox'
 __command__ = 'archivebox'
 __description__ = 'ArchiveBox: The self-hosted internet archive.'
 __description__ = 'ArchiveBox: The self-hosted internet archive.'
 
 
-import os
 import sys
 import sys
 import argparse
 import argparse
 
 
-from . import list_subcommands, run_subcommand
-from ..legacy.config import OUTPUT_DIR
+from typing import Optional, List, IO
 
 
+from . import list_subcommands, run_subcommand
+from ..config import OUTPUT_DIR
 
 
-def parse_args(args=None):
-    args = sys.argv[1:] if args is None else args
 
 
+def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional[str]=None) -> None:
     subcommands = list_subcommands()
     subcommands = list_subcommands()
-
     parser = argparse.ArgumentParser(
     parser = argparse.ArgumentParser(
         prog=__command__,
         prog=__command__,
         description=__description__,
         description=__description__,
@@ -43,54 +41,24 @@ def parse_args(args=None):
         default=None,
         default=None,
     )
     )
     parser.add_argument(
     parser.add_argument(
-        "args",
+        "subcommand_args",
         help="Arguments for the subcommand",
         help="Arguments for the subcommand",
         nargs=argparse.REMAINDER,
         nargs=argparse.REMAINDER,
     )
     )
-    
-    command = parser.parse_args(args)
+    command = parser.parse_args(args or ())
 
 
-    if command.help:
+    if command.help or command.subcommand is None:
         command.subcommand = 'help'
         command.subcommand = 'help'
     if command.version:
     if command.version:
         command.subcommand = 'version'
         command.subcommand = 'version'
 
 
-    # print('--------------------------------------------')
-    # print('Command:     ', sys.argv[0])
-    # print('Subcommand:  ', command.subcommand)
-    # print('Args to pass:', args[1:])
-    # print('--------------------------------------------')
-
-    return command.subcommand, command.args
-
-
-def print_import_tutorial():
-    print('Welcome to ArchiveBox!')
-    print()
-    print('To import an existing archive (from a previous version of ArchiveBox):')
-    print('    1. cd into your data dir OUTPUT_DIR (usually ArchiveBox/output) and run:')
-    print('    2. archivebox init')
-    print()
-    print('To start a new archive:')
-    print('    1. Create an emptry directory, then cd into it and run:')
-    print('    2. archivebox init')
-    print()
-    print('For more information, see the migration docs here:')
-    print('    https://github.com/pirate/ArchiveBox/wiki/Migration')
-
-def main(args=None):
-    subcommand, subcommand_args = parse_args(args)
-    existing_index = os.path.exists(os.path.join(OUTPUT_DIR, 'index.json'))
-
-    if subcommand is None:
-        if existing_index:
-            run_subcommand('help', subcommand_args)
-        else:
-            print_import_tutorial()
-        raise SystemExit(0)
+    run_subcommand(
+        subcommand=command.subcommand,
+        subcommand_args=command.subcommand_args,
+        stdin=stdin,
+        pwd=pwd or OUTPUT_DIR,
+    )
 
 
-    run_subcommand(subcommand, subcommand_args)
-    
 
 
 if __name__ == '__main__':
 if __name__ == '__main__':
-    main()
+    main(args=sys.argv[1:], stdin=sys.stdin)

+ 46 - 61
archivebox/cli/archivebox_add.py

@@ -7,90 +7,75 @@ __description__ = 'Add a new URL or list of URLs to your archive'
 import sys
 import sys
 import argparse
 import argparse
 
 
-from typing import List, Optional
+from typing import List, Optional, IO
 
 
-from ..legacy.config import stderr, check_dependencies, check_data_folder
-from ..legacy.util import (
-    handle_stdin_import,
-    handle_file_import,
-)
-from ..legacy.main import update_archive_data
+from ..main import add
+from ..util import SmartFormatter, accept_stdin
+from ..config import OUTPUT_DIR, ONLY_NEW
 
 
 
 
-def main(args: List[str]=None, stdin: Optional[str]=None) -> None:
-    check_data_folder()
-    
-    args = sys.argv[1:] if args is None else args
-
+def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional[str]=None) -> None:
     parser = argparse.ArgumentParser(
     parser = argparse.ArgumentParser(
         prog=__command__,
         prog=__command__,
         description=__description__,
         description=__description__,
         add_help=True,
         add_help=True,
+        formatter_class=SmartFormatter,
     )
     )
-    # parser.add_argument(
-    #     '--depth', #'-d',
-    #     type=int,
-    #     help='Recursively archive all linked pages up to this many hops away',
-    #     default=0,
-    # )
     parser.add_argument(
     parser.add_argument(
-        '--only-new', #'-n',
+        '--update-all', #'-n',
         action='store_true',
         action='store_true',
-        help="Don't attempt to retry previously skipped/failed links when updating",
+        default=not ONLY_NEW,
+        help="Also retry previously skipped/failed links when adding new links",
     )
     )
     parser.add_argument(
     parser.add_argument(
         '--index-only', #'-o',
         '--index-only', #'-o',
         action='store_true',
         action='store_true',
         help="Add the links to the main index without archiving them",
         help="Add the links to the main index without archiving them",
     )
     )
-    # parser.add_argument(
-    #     '--mirror', #'-m',
-    #     action='store_true',
-    #     help='Archive an entire site (finding all linked pages below it on the same domain)',
-    # )
-    # parser.add_argument(
-    #     '--crawler', #'-r',
-    #     choices=('depth_first', 'breadth_first'),
-    #     help='Controls which crawler to use in order to find outlinks in a given page',
-    #     default=None,
-    # )
     parser.add_argument(
     parser.add_argument(
-        'url',
+        'import_path',
         nargs='?',
         nargs='?',
         type=str,
         type=str,
         default=None,
         default=None,
-        help='URL of page to archive (or path to local file)'
+        help=(
+            'URL or path to local file containing a list of links to import. e.g.:\n'
+            '    https://getpocket.com/users/USERNAME/feed/all\n'
+            '    https://example.com/some/rss/feed.xml\n'
+            '    ~/Downloads/firefox_bookmarks_export.html\n'
+            '    ~/Desktop/sites_list.csv\n'
+        )
     )
     )
-    command = parser.parse_args(args)
-
-    check_dependencies()
-
-    ### Handle ingesting urls piped in through stdin
-    # (.e.g if user does cat example_urls.txt | archivebox add)
-    import_path = None
-    if stdin or not sys.stdin.isatty():
-        stdin_raw_text = stdin or sys.stdin.read()
-        if stdin_raw_text and command.url:
-            stderr(
-                '[X] You should pass either a path as an argument, '
-                'or pass a list of links via stdin, but not both.\n'
-            )
-            raise SystemExit(1)
-
-        import_path = handle_stdin_import(stdin_raw_text)
-
-    ### Handle ingesting url from a remote file/feed
-    # (e.g. if an RSS feed URL is used as the import path) 
-    elif command.url:
-        import_path = handle_file_import(command.url)
-
-    update_archive_data(
-        import_path=import_path,
-        resume=None,
-        only_new=command.only_new,
+    command = parser.parse_args(args or ())
+    import_str = accept_stdin(stdin)
+    add(
+        import_str=import_str,
+        import_path=command.import_path,
+        update_all=command.update_all,
         index_only=command.index_only,
         index_only=command.index_only,
+        out_dir=pwd or OUTPUT_DIR,
     )
     )
 
 
 
 
 if __name__ == '__main__':
 if __name__ == '__main__':
-    main()
+    main(args=sys.argv[1:], stdin=sys.stdin)
+
+
+# TODO: Implement these
+#
+# parser.add_argument(
+#     '--depth', #'-d',
+#     type=int,
+#     help='Recursively archive all linked pages up to this many hops away',
+#     default=0,
+# )
+# parser.add_argument(
+#     '--mirror', #'-m',
+#     action='store_true',
+#     help='Archive an entire site (finding all linked pages below it on the same domain)',
+# )
+# parser.add_argument(
+#     '--crawler', #'-r',
+#     choices=('depth_first', 'breadth_first'),
+#     help='Controls which crawler to use in order to find outlinks in a given page',
+#     default=None,
+# )

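Because archivebox/__init__.py now re-exports main.py via `from .main import *`, the add() that this wrapper calls should also be usable as a Python API. A sketch under that assumption, using only the keyword arguments visible in the call above (the feed URL is just an example):

    from archivebox.main import add
    from archivebox.config import OUTPUT_DIR

    # roughly what `archivebox add https://example.com/some/rss/feed.xml` does:
    # parse links out of the feed and archive any that aren't already saved
    add(
        import_str=None,             # would hold raw text piped in via stdin
        import_path='https://example.com/some/rss/feed.xml',
        update_all=False,            # don't retry previously skipped/failed links
        index_only=False,            # archive the pages, not just index them
        out_dir=OUTPUT_DIR,
    )
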
+ 17 - 115
archivebox/cli/archivebox_config.py

@@ -7,28 +7,14 @@ __description__ = 'Get and set your ArchiveBox project configuration values'
 import sys
 import sys
 import argparse
 import argparse
 
 
-from typing import Optional, List
+from typing import Optional, List, IO
 
 
-from ..legacy.util import SmartFormatter
-from ..legacy.config import (
-    check_data_folder,
-    OUTPUT_DIR,
-    load_all_config,
-    write_config_file,
-    CONFIG,
-    CONFIG_FILE,
-    USER_CONFIG,
-    ConfigDict,
-    stderr,
-    get_real_name,
-)
+from ..main import config
+from ..util import SmartFormatter, accept_stdin
+from ..config import OUTPUT_DIR
 
 
 
 
-def main(args: List[str]=None, stdin: Optional[str]=None) -> None:
-    check_data_folder()
-    
-    args = sys.argv[1:] if args is None else args
-
+def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional[str]=None) -> None:
     parser = argparse.ArgumentParser(
     parser = argparse.ArgumentParser(
         prog=__command__,
         prog=__command__,
         description=__description__,
         description=__description__,
@@ -57,102 +43,18 @@ def main(args: List[str]=None, stdin: Optional[str]=None) -> None:
         type=str,
         type=str,
         help='KEY or KEY=VALUE formatted config values to get or set',
         help='KEY or KEY=VALUE formatted config values to get or set',
     )
     )
-    command = parser.parse_args(args)
-
-    if stdin or not sys.stdin.isatty():
-        stdin_raw_text = stdin or sys.stdin.read()
-        if stdin_raw_text and command.config_options:
-            stderr(
-                '[X] You should either pass config values as an arguments '
-                'or via stdin, but not both.\n',
-                color='red',
-            )
-            raise SystemExit(1)
-
-        config_options = stdin_raw_text.split('\n')
-    else:
-        config_options = command.config_options
-
-    no_args = not (command.get or command.set or command.reset or command.config_options)
-
-    matching_config: ConfigDict = {}
-    if command.get or no_args:
-        if config_options:
-            config_options = [get_real_name(key) for key in config_options]
-            matching_config = {key: CONFIG[key] for key in config_options if key in CONFIG}
-            failed_config = [key for key in config_options if key not in CONFIG]
-            if failed_config:
-                stderr()
-                stderr('[X] These options failed to get', color='red')
-                stderr('    {}'.format('\n    '.join(config_options)))
-                raise SystemExit(1)
-        else:
-            matching_config = CONFIG
-        
-        print(printable_config(matching_config))
-        raise SystemExit(not matching_config)
-    elif command.set:
-        new_config = {}
-        failed_options = []
-        for line in config_options:
-            if line.startswith('#') or not line.strip():
-                continue
-            if '=' not in line:
-                stderr('[X] Config KEY=VALUE must have an = sign in it', color='red')
-                stderr(f'    {line}')
-                raise SystemExit(2)
-
-            raw_key, val = line.split('=')
-            raw_key = raw_key.upper().strip()
-            key = get_real_name(raw_key)
-            if key != raw_key:
-                stderr(f'[i] Note: The config option {raw_key} has been renamed to {key}, please use the new name going forwards.', color='lightyellow')
-
-            if key in CONFIG:
-                new_config[key] = val.strip()
-            else:
-                failed_options.append(line)
-
-        if new_config:
-            before = CONFIG
-            matching_config = write_config_file(new_config, out_dir=OUTPUT_DIR)
-            after = load_all_config()
-            print(printable_config(matching_config))
-
-            side_effect_changes: ConfigDict = {}
-            for key, val in after.items():
-                if key in USER_CONFIG and (before[key] != after[key]) and (key not in matching_config):
-                    side_effect_changes[key] = after[key]
-
-            if side_effect_changes:
-                stderr()
-                stderr('[i] Note: This change also affected these other options that depended on it:', color='lightyellow')
-                print('    {}'.format(printable_config(side_effect_changes, prefix='    ')))
-        if failed_options:
-            stderr()
-            stderr('[X] These options failed to set:', color='red')
-            stderr('    {}'.format('\n    '.join(failed_options)))
-        raise SystemExit(bool(failed_options))
-    elif command.reset:
-        stderr('[X] This command is not implemented yet.', color='red')
-        stderr('    Please manually remove the relevant lines from your config file:')
-        stderr(f'        {CONFIG_FILE}')
-        raise SystemExit(2)
-
-    else:
-        stderr('[X] You must pass either --get or --set, or no arguments to get the whole config.', color='red')
-        stderr('    archivebox config')
-        stderr('    archivebox config --get SOME_KEY')
-        stderr('    archivebox config --set SOME_KEY=SOME_VALUE')
-        raise SystemExit(2)
-
-
-def printable_config(config: ConfigDict, prefix: str='') -> str:
-    return f'\n{prefix}'.join(
-        f'{key}={val}'
-        for key, val in config.items()
-        if not (isinstance(val, dict) or callable(val))
+    command = parser.parse_args(args or ())
+    config_options_str = accept_stdin(stdin)
+
+    config(
+        config_options_str=config_options_str,
+        config_options=command.config_options,
+        get=command.get,
+        set=command.set,
+        reset=command.reset,
+        out_dir=pwd or OUTPUT_DIR,
     )
     )
 
 
+
 if __name__ == '__main__':
 if __name__ == '__main__':
-    main()
+    main(args=sys.argv[1:], stdin=sys.stdin)

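The config wrapper follows the same shape: parse flags, read stdin, and delegate to main.config. A sketch of equivalent direct calls, assuming the flag semantics match the old in-CLI logic that this commit moves into main.py:

    from archivebox.main import config
    from archivebox.config import OUTPUT_DIR

    # print the full resolved config, like `archivebox config` with no flags
    config(config_options_str=None, config_options=[],
           get=False, set=False, reset=False, out_dir=OUTPUT_DIR)

    # change one value, like `archivebox config --set TIMEOUT=120`
    config(config_options_str=None, config_options=['TIMEOUT=120'],
           get=False, set=True, reset=False, out_dir=OUTPUT_DIR)
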
+ 9 - 37
archivebox/cli/archivebox_help.py

@@ -7,52 +7,24 @@ __description__ = 'Print the ArchiveBox help message and usage'
 import sys
 import sys
 import argparse
 import argparse
 
 
-from ..legacy.util import reject_stdin
-from ..legacy.config import ANSI
-from . import list_subcommands
+from typing import Optional, List, IO
 
 
+from ..main import help
+from ..util import reject_stdin
+from ..config import OUTPUT_DIR
 
 
-def main(args=None):
-    args = sys.argv[1:] if args is None else args
 
 
+def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional[str]=None) -> None:
     parser = argparse.ArgumentParser(
     parser = argparse.ArgumentParser(
         prog=__command__,
         prog=__command__,
         description=__description__,
         description=__description__,
         add_help=True,
         add_help=True,
     )
     )
-    parser.parse_args(args)
-    reject_stdin(__command__)
+    parser.parse_args(args or ())
+    reject_stdin(__command__, stdin)
     
     
-
-    COMMANDS_HELP_TEXT = '\n    '.join(
-        f'{cmd.ljust(20)} {summary}'
-        for cmd, summary in list_subcommands().items()
-    )
-
-    print('''{green}ArchiveBox: The self-hosted internet archive.{reset}
-        
-{lightblue}Usage:{reset}
-    archivebox [command] [--help] [--version] [...args]
-
-{lightblue}Comamnds:{reset}
-    {}
-
-{lightblue}Example Use:{reset}
-    mkdir my-archive; cd my-archive/
-    archivebox init
-    archivebox info
-
-    archivebox add https://example.com/some/page
-    archivebox add --depth=1 ~/Downloads/bookmarks_export.html
-    
-    archivebox list --sort=timestamp --csv=timestamp,url,is_archived
-    archivebox schedule --every=week https://example.com/some/feed.rss
-    archivebox update --resume=15109948213.123
-
-{lightblue}Documentation:{reset}
-    https://github.com/pirate/ArchiveBox/wiki
-'''.format(COMMANDS_HELP_TEXT, **ANSI))
+    help(out_dir=pwd or OUTPUT_DIR)
 
 
 
 
 if __name__ == '__main__':
 if __name__ == '__main__':
-    main()
+    main(args=sys.argv[1:], stdin=sys.stdin)

+ 10 - 11
archivebox/cli/archivebox_info.py

@@ -7,25 +7,24 @@ __description__ = 'Print out some info and statistics about the archive collecti
 import sys
 import sys
 import argparse
 import argparse
 
 
-from ..legacy.config import check_data_folder
-from ..legacy.util import reject_stdin
-from ..legacy.main import info
+from typing import Optional, List, IO
 
 
+from ..main import info
+from ..config import OUTPUT_DIR
+from ..util import reject_stdin
 
 
-def main(args=None):
-    check_data_folder()
-    
-    args = sys.argv[1:] if args is None else args
 
 
+def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional[str]=None) -> None:
     parser = argparse.ArgumentParser(
     parser = argparse.ArgumentParser(
         prog=__command__,
         prog=__command__,
         description=__description__,
         description=__description__,
         add_help=True,
         add_help=True,
     )
     )
-    parser.parse_args(args)
-    reject_stdin(__command__)
+    parser.parse_args(args or ())
+    reject_stdin(__command__, stdin)
+
+    info(out_dir=pwd or OUTPUT_DIR)
 
 
-    info()
 
 
 if __name__ == '__main__':
 if __name__ == '__main__':
-    main()
+    main(args=sys.argv[1:], stdin=sys.stdin)

+ 9 - 8
archivebox/cli/archivebox_init.py

@@ -7,23 +7,24 @@ __description__ = 'Initialize a new ArchiveBox collection in the current directo
 import sys
 import sys
 import argparse
 import argparse
 
 
-from ..legacy.util import reject_stdin
-from ..legacy.main import init
+from typing import Optional, List, IO
 
 
+from ..main import init
+from ..util import reject_stdin
+from ..config import OUTPUT_DIR
 
 
-def main(args=None):
-    args = sys.argv[1:] if args is None else args
 
 
+def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional[str]=None) -> None:
     parser = argparse.ArgumentParser(
     parser = argparse.ArgumentParser(
         prog=__command__,
         prog=__command__,
         description=__description__,
         description=__description__,
         add_help=True,
         add_help=True,
     )
     )
-    parser.parse_args(args)
-    reject_stdin(__command__)
+    parser.parse_args(args or ())
+    reject_stdin(__command__, stdin)
 
 
-    init()
+    init(out_dir=pwd or OUTPUT_DIR)
     
     
 
 
 if __name__ == '__main__':
 if __name__ == '__main__':
-    main()
+    main(args=sys.argv[1:], stdin=sys.stdin)

+ 21 - 53
archivebox/cli/archivebox_list.py

@@ -2,15 +2,17 @@
 
 
 __package__ = 'archivebox.cli'
 __package__ = 'archivebox.cli'
 __command__ = 'archivebox list'
 __command__ = 'archivebox list'
-__description__ = 'List all the URLs currently in the archive.'
+__description__ = 'List, filter, and export information about archive entries'
 
 
 import sys
 import sys
 import argparse
 import argparse
 
 
-from ..legacy.util import SmartFormatter, reject_stdin, to_json, to_csv
-from ..legacy.config import check_data_folder, OUTPUT_DIR
-from ..legacy.main import (
-    list_archive_data,
+from typing import Optional, List, IO
+
+from ..main import list_all
+from ..util import SmartFormatter, accept_stdin
+from ..config import OUTPUT_DIR
+from ..index import (
     get_indexed_folders,
     get_indexed_folders,
     get_archived_folders,
     get_archived_folders,
     get_unarchived_folders,
     get_unarchived_folders,
@@ -23,11 +25,7 @@ from ..legacy.main import (
     get_unrecognized_folders,
     get_unrecognized_folders,
 )
 )
 
 
-def main(args=None):
-    check_data_folder()
-    
-    args = sys.argv[1:] if args is None else args
-
+def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional[str]=None) -> None:
     parser = argparse.ArgumentParser(
     parser = argparse.ArgumentParser(
         prog=__command__,
         prog=__command__,
         description=__description__,
         description=__description__,
@@ -93,57 +91,27 @@ def main(args=None):
         help='Type of pattern matching to use when filtering URLs',
         help='Type of pattern matching to use when filtering URLs',
     )
     )
     parser.add_argument(
     parser.add_argument(
-        'patterns',
+        'filter_patterns',
         nargs='*',
         nargs='*',
         type=str,
         type=str,
         default=None,
         default=None,
         help='List only URLs matching these filter patterns.'
         help='List only URLs matching these filter patterns.'
     )
     )
-    command = parser.parse_args(args)
-    reject_stdin(__command__)
+    command = parser.parse_args(args or ())
+    filter_patterns_str = accept_stdin(stdin)
 
 
-    links = list_archive_data(
-        filter_patterns=command.patterns,
+    list_all(
+        filter_patterns_str=filter_patterns_str,
+        filter_patterns=command.filter_patterns,
         filter_type=command.filter_type,
         filter_type=command.filter_type,
-        before=command.before,
+        status=command.status,
         after=command.after,
         after=command.after,
+        before=command.before,
+        sort=command.sort,
+        csv=command.csv,
+        json=command.json,
+        out_dir=pwd or OUTPUT_DIR,
     )
     )
 
 
-    if command.sort:
-        links = sorted(links, key=lambda link: getattr(link, command.sort))
-
-    links = list(links)
-
-    if command.status == 'indexed':
-        folders = get_indexed_folders(links, out_dir=OUTPUT_DIR)
-    elif command.status == 'archived':
-        folders = get_archived_folders(links, out_dir=OUTPUT_DIR)
-    elif command.status == 'unarchived':
-        folders = get_unarchived_folders(links, out_dir=OUTPUT_DIR)
-
-    elif command.status == 'present':
-        folders = get_present_folders(links, out_dir=OUTPUT_DIR)
-    elif command.status == 'valid':
-        folders = get_valid_folders(links, out_dir=OUTPUT_DIR)
-    elif command.status == 'invalid':
-        folders = get_invalid_folders(links, out_dir=OUTPUT_DIR)
-
-    elif command.status == 'duplicate':
-        folders = get_duplicate_folders(links, out_dir=OUTPUT_DIR)
-    elif command.status == 'orphaned':
-        folders = get_orphaned_folders(links, out_dir=OUTPUT_DIR)
-    elif command.status == 'corrupted':
-        folders = get_corrupted_folders(links, out_dir=OUTPUT_DIR)
-    elif command.status == 'unrecognized':
-        folders = get_unrecognized_folders(links, out_dir=OUTPUT_DIR)
-
-    if command.csv:
-        print(to_csv(folders.values(), csv_cols=command.csv.split(','), header=True))
-    elif command.json:
-        print(to_json(folders.values(), indent=4, sort_keys=True))
-    else:
-        print('\n'.join(f'{folder} {link}' for folder, link in folders.items()))
-    raise SystemExit(not folders)
-
 if __name__ == '__main__':
 if __name__ == '__main__':
-    main()
+    main(args=sys.argv[1:], stdin=sys.stdin)

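list_all() now owns the filtering, sorting, and CSV/JSON export that used to live in this file. A sketch of an equivalent direct call, mirroring `archivebox list --status=archived --filter-type=domain --sort=timestamp --csv=timestamp,url example.com` (flag semantics are assumed to carry over from the removed code above):

    from archivebox.main import list_all
    from archivebox.config import OUTPUT_DIR

    list_all(
        filter_patterns_str=None,        # would hold patterns piped in via stdin
        filter_patterns=['example.com'],
        filter_type='domain',
        status='archived',
        after=None,
        before=None,
        sort='timestamp',
        csv='timestamp,url',
        json=False,
        out_dir=OUTPUT_DIR,
    )
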
+ 9 - 15
archivebox/cli/archivebox_manage.py

@@ -6,24 +6,18 @@ __description__ = 'Run an ArchiveBox Django management command'
 
 
 import sys
 import sys
 
 
-from ..legacy.config import OUTPUT_DIR, setup_django, check_data_folder
+from typing import Optional, List, IO
 
 
+from ..main import manage
+from ..config import OUTPUT_DIR
 
 
-def main(args=None):
-    check_data_folder()
 
 
-    setup_django(OUTPUT_DIR)
-    from django.core.management import execute_from_command_line
-
-    args = sys.argv if args is None else ['archivebox', *args]
-
-    args[0] = f'{sys.argv[0]} manage'
-
-    if args[1:] == []:
-        args.append('help')
-    
-    execute_from_command_line(args)
+def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional[str]=None) -> None:
+    manage(
+        args=args,
+        out_dir=pwd or OUTPUT_DIR,
+    )
 
 
 
 
 if __name__ == '__main__':
 if __name__ == '__main__':
-    main()
+    main(args=sys.argv[1:], stdin=sys.stdin)

+ 18 - 29
archivebox/cli/archivebox_remove.py

@@ -7,17 +7,14 @@ __description__ = 'Remove the specified URLs from the archive.'
 import sys
 import sys
 import argparse
 import argparse
 
 
+from typing import Optional, List, IO
 
 
-from ..legacy.config import check_data_folder
-from ..legacy.util import reject_stdin
-from ..legacy.main import remove_archive_links
+from ..main import remove
+from ..util import accept_stdin
+from ..config import OUTPUT_DIR
 
 
 
 
-def main(args=None):
-    check_data_folder()
-    
-    args = sys.argv[1:] if args is None else args
-
+def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional[str]=None) -> None:
     parser = argparse.ArgumentParser(
     parser = argparse.ArgumentParser(
         prog=__command__,
         prog=__command__,
         description=__description__,
         description=__description__,
@@ -56,33 +53,25 @@ def main(args=None):
         help='Type of pattern matching to use when filtering URLs',
         help='Type of pattern matching to use when filtering URLs',
     )
     )
     parser.add_argument(
     parser.add_argument(
-        'pattern',
+        'filter_patterns',
         nargs='*',
         nargs='*',
         type=str,
         type=str,
-        default=None,
         help='URLs matching this filter pattern will be removed from the index.'
         help='URLs matching this filter pattern will be removed from the index.'
     )
     )
-    command = parser.parse_args(args)
-
-    if not sys.stdin.isatty():
-        stdin_raw_text = sys.stdin.read()
-        if stdin_raw_text and command.url:
-            print(
-                '[X] You should pass either a pattern as an argument, '
-                'or pass a list of patterns via stdin, but not both.\n'
-            )
-            raise SystemExit(1)
-
-        patterns = [pattern.strip() for pattern in stdin_raw_text.split('\n')]
-    else:
-        patterns = command.pattern
+    command = parser.parse_args(args or ())
+    filter_str = accept_stdin(stdin)
 
 
-    remove_archive_links(
-        filter_patterns=patterns, filter_type=command.filter_type,
-        before=command.before, after=command.after,
-        yes=command.yes, delete=command.delete,
+    remove(
+        filter_str=filter_str,
+        filter_patterns=command.filter_patterns,
+        filter_type=command.filter_type,
+        before=command.before,
+        after=command.after,
+        yes=command.yes,
+        delete=command.delete,
+        out_dir=pwd or OUTPUT_DIR,
     )
     )
     
     
 
 
 if __name__ == '__main__':
 if __name__ == '__main__':
-    main()
+    main(args=sys.argv[1:], stdin=sys.stdin)

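remove() takes the same filter arguments plus the yes/delete switches forwarded above. A hedged sketch of the equivalent call, roughly `archivebox remove --yes --delete --filter-type=domain example.com` (what yes and delete do exactly is assumed from their flag names):

    from archivebox.main import remove
    from archivebox.config import OUTPUT_DIR

    remove(
        filter_str=None,                 # would hold patterns piped in via stdin
        filter_patterns=['example.com'],
        filter_type='domain',
        before=None,
        after=None,
        yes=True,                        # assumed: skip the confirmation prompt
        delete=True,                     # assumed: also delete the snapshot folders
        out_dir=OUTPUT_DIR,
    )
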
+ 24 - 134
archivebox/cli/archivebox_schedule.py

@@ -4,34 +4,17 @@ __package__ = 'archivebox.cli'
 __command__ = 'archivebox schedule'
 __command__ = 'archivebox schedule'
 __description__ = 'Set ArchiveBox to regularly import URLs at specific times using cron'
 __description__ = 'Set ArchiveBox to regularly import URLs at specific times using cron'
 
 
-import os
 import sys
 import sys
 import argparse
 import argparse
 
 
-from datetime import datetime
-from crontab import CronTab, CronSlices
+from typing import Optional, List, IO
 
 
+from ..main import schedule
+from ..util import reject_stdin
+from ..config import OUTPUT_DIR
 
 
-from ..legacy.util import reject_stdin
-from ..legacy.config import (
-    OUTPUT_DIR,
-    LOGS_DIR,
-    ARCHIVEBOX_BINARY,
-    USER,
-    ANSI,
-    stderr,
-    check_data_folder,
-)
-
-
-CRON_COMMENT = 'archivebox_schedule'
-
-
-def main(args=None):
-    check_data_folder()
-    
-    args = sys.argv[1:] if args is None else args
 
 
+def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional[str]=None) -> None:
     parser = argparse.ArgumentParser(
     parser = argparse.ArgumentParser(
         prog=__command__,
         prog=__command__,
         description=__description__,
         description=__description__,
@@ -57,7 +40,7 @@ def main(args=None):
     group.add_argument(
     group.add_argument(
         '--clear', # '-c'
         '--clear', # '-c'
         action='store_true',
         action='store_true',
-        help=("Stop all ArchiveBox scheduled runs, clear it completely from cron"),
+        help=("Stop all ArchiveBox scheduled runs (remove cron jobs)"),
     )
     )
     group.add_argument(
     group.add_argument(
         '--show', # '-s'
         '--show', # '-s'
@@ -67,13 +50,14 @@ def main(args=None):
     group.add_argument(
     group.add_argument(
         '--foreground', '-f',
         '--foreground', '-f',
         action='store_true',
         action='store_true',
-        help=("Launch ArchiveBox as a long-running foreground task "
+        help=("Launch ArchiveBox scheduler as a long-running foreground task "
               "instead of using cron."),
               "instead of using cron."),
     )
     )
     group.add_argument(
     group.add_argument(
         '--run-all', # '-a',
         '--run-all', # '-a',
         action='store_true',
         action='store_true',
-        help='Run all the scheduled jobs once immediately, independent of their configured schedules',
+        help=("Run all the scheduled jobs once immediately, independent of "
+              "their configured schedules, can be used together with --foreground"),
     )
     )
     parser.add_argument(
     parser.add_argument(
         'import_path',
         'import_path',
@@ -83,115 +67,21 @@ def main(args=None):
         help=("Check this path and import any new links on every run "
         help=("Check this path and import any new links on every run "
               "(can be either local file or remote URL)"),
               "(can be either local file or remote URL)"),
     )
     )
-    command = parser.parse_args(args)
-    reject_stdin(__command__)
-
-    os.makedirs(LOGS_DIR, exist_ok=True)
-
-    cron = CronTab(user=True)
-    cron = dedupe_jobs(cron)
-
-    existing_jobs = list(cron.find_comment(CRON_COMMENT))
-    if command.foreground or command.run_all:
-        if command.import_path or (not existing_jobs):
-            stderr('{red}[X] You must schedule some jobs first before running in foreground mode.{reset}'.format(**ANSI))
-            stderr('    archivebox schedule --every=hour https://example.com/some/rss/feed.xml')
-            raise SystemExit(1)
-        print('{green}[*] Running {} ArchiveBox jobs in foreground task scheduler...{reset}'.format(len(existing_jobs), **ANSI))
-        if command.run_all:
-            try:
-                for job in existing_jobs:
-                    sys.stdout.write(f'  > {job.command}')
-                    sys.stdout.flush()
-                    job.run()
-                    sys.stdout.write(f'\r  √ {job.command}\n')
-            except KeyboardInterrupt:
-                print('\n{green}[√] Stopped.{reset}'.format(**ANSI))
-                raise SystemExit(1)
-        if command.foreground:
-            try:
-                for result in cron.run_scheduler():
-                    print(result)
-            except KeyboardInterrupt:
-                print('\n{green}[√] Stopped.{reset}'.format(**ANSI))
-                raise SystemExit(1)
-
-    elif command.show:
-        if existing_jobs:
-            print('\n'.join(str(cmd) for cmd in existing_jobs))
-        else:
-            stderr('{red}[X] There are no ArchiveBox cron jobs scheduled for your user ({}).{reset}'.format(USER, **ANSI))
-            stderr('    To schedule a new job, run:')
-            stderr('        archivebox schedule --every=[timeperiod] https://example.com/some/rss/feed.xml')
-        raise SystemExit(0)
-
-    elif command.clear:
-        print(cron.remove_all(comment=CRON_COMMENT))
-        cron.write()
-        raise SystemExit(0)
-
-    elif command.every:
-        quoted = lambda s: f'"{s}"' if s and ' ' in s else s
-        cmd = [
-            'cd',
-            quoted(OUTPUT_DIR),
-            '&&',
-            quoted(ARCHIVEBOX_BINARY),
-            *(('add', f'"{command.import_path}"',) if command.import_path else ('update',)),
-            '2>&1',
-            '>',
-            quoted(os.path.join(LOGS_DIR, 'archivebox.log')),
-
-        ]
-        new_job = cron.new(command=' '.join(cmd), comment=CRON_COMMENT)
-
-        if command.every in ('minute', 'hour', 'day', 'week', 'month', 'year'):
-            set_every = getattr(new_job.every(), command.every)
-            set_every()
-        elif CronSlices.is_valid(command.every):
-            new_job.setall(command.every)
-        else:
-            stderr('{red}[X] Got invalid timeperiod for cron task.{reset}'.format(**ANSI))
-            stderr('    It must be one of minute/hour/day/week/month')
-            stderr('    or a quoted cron-format schedule like:')
-            stderr('        archivebox init --every=day https://example.com/some/rss/feed.xml')
-            stderr('        archivebox init --every="0/5 * * * *" https://example.com/some/rss/feed.xml')
-            raise SystemExit(1)
-
-        cron = dedupe_jobs(cron)
-        cron.write()
-
-        total_runs = sum(j.frequency_per_year() for j in cron)
-        existing_jobs = list(cron.find_comment(CRON_COMMENT))
-
-        print()
-        print('{green}[√] Scheduled new ArchiveBox cron job for user: {} ({} jobs are active).{reset}'.format(USER, len(existing_jobs), **ANSI))
-        print('\n'.join(f'  > {cmd}' if str(cmd) == str(new_job) else f'    {cmd}' for cmd in existing_jobs))
-        if total_runs > 60 and not command.quiet:
-            stderr()
-            stderr('{lightyellow}[!] With the current cron config, ArchiveBox is estimated to run >{} times per year.{reset}'.format(total_runs, **ANSI))
-            stderr(f'    Congrats on being an enthusiastic internet archiver! 👌')
-            stderr()
-            stderr('    Make sure you have enough storage space available to hold all the data.')
-            stderr('    Using a compressed/deduped filesystem like ZFS is recommended if you plan on archiving a lot.')
-        raise SystemExit(0)
-
-
-def dedupe_jobs(cron: CronTab) -> CronTab:
-    deduped = set()
-    for job in list(cron):
-        unique_tuple = (str(job.slices), job.command)
-        if unique_tuple not in deduped:
-            deduped.add(unique_tuple)
-        cron.remove(job)
-
-    for schedule, command in deduped:
-        job = cron.new(command=command, comment=CRON_COMMENT)
-        job.setall(schedule)
-        job.enable()
-
-    return cron
+    command = parser.parse_args(args or ())
+    reject_stdin(__command__, stdin)
+
+    schedule(
+        add=command.add,
+        show=command.show,
+        clear=command.clear,
+        foreground=command.foreground,
+        run_all=command.run_all,
+        quiet=command.quiet,
+        every=command.every,
+        import_path=command.import_path,
+        out_dir=pwd or OUTPUT_DIR,
+    )
 
 
 
 
 if __name__ == '__main__':
 if __name__ == '__main__':
-    main()
+    main(args=sys.argv[1:], stdin=sys.stdin)

+ 13 - 25
archivebox/cli/archivebox_server.py

@@ -7,15 +7,14 @@ __description__ = 'Run the ArchiveBox HTTP server'
 import sys
 import sys
 import argparse
 import argparse
 
 
-from ..legacy.config import setup_django, IS_TTY, OUTPUT_DIR, ANSI, check_data_folder
-from ..legacy.util import reject_stdin
+from typing import Optional, List, IO
 
 
+from ..main import server
+from ..util import reject_stdin
+from ..config import OUTPUT_DIR
 
 
-def main(args=None):
-    check_data_folder()
-
-    args = sys.argv[1:] if args is None else args
 
 
+def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional[str]=None) -> None:
     parser = argparse.ArgumentParser(
     parser = argparse.ArgumentParser(
         prog=__command__,
         prog=__command__,
         description=__description__,
         description=__description__,
@@ -33,26 +32,15 @@ def main(args=None):
         action='store_true',
         action='store_true',
         help='Enable auto-reloading when code or templates change',
         help='Enable auto-reloading when code or templates change',
     )
     )
-    command = parser.parse_args(args)
-    reject_stdin(__command__)
+    command = parser.parse_args(args or ())
+    reject_stdin(__command__, stdin)
     
     
-    setup_django(OUTPUT_DIR)
-    from django.core.management import call_command
-    from django.contrib.auth.models import User
-
-    if IS_TTY and not User.objects.filter(is_superuser=True).exists():
-        print('{lightyellow}[!] No admin users exist yet, you will not be able to edit links in the UI.{reset}'.format(**ANSI))
-        print()
-        print('    To create an admin user, run:')
-        print('        archivebox manage createsuperuser')
-        print()
-
-    print('{green}[+] Starting ArchiveBox webserver...{reset}'.format(**ANSI))
-    if not command.reload:
-        command.runserver_args.append('--noreload')
-
-    call_command("runserver", *command.runserver_args)
+    server(
+        runserver_args=command.runserver_args,
+        reload=command.reload,
+        out_dir=pwd or OUTPUT_DIR,
+    )
 
 
 
 
 if __name__ == '__main__':
 if __name__ == '__main__':
-    main()
+    main(args=sys.argv[1:], stdin=sys.stdin)

+ 12 - 13
archivebox/cli/archivebox_shell.py

@@ -7,27 +7,26 @@ __description__ = 'Enter an interactive ArchiveBox Django shell'
 import sys
 import sys
 import argparse
 import argparse
 
 
-from ..legacy.config import setup_django, OUTPUT_DIR, check_data_folder
-from ..legacy.util import reject_stdin
+from typing import Optional, List, IO
 
 
+from ..main import shell
+from ..config import OUTPUT_DIR
+from ..util import reject_stdin
 
 
-def main(args=None):
-    check_data_folder()
-    
-    args = sys.argv[1:] if args is None else args
 
 
+def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional[str]=None) -> None:
     parser = argparse.ArgumentParser(
     parser = argparse.ArgumentParser(
         prog=__command__,
         prog=__command__,
         description=__description__,
         description=__description__,
         add_help=True,
         add_help=True,
     )
     )
-    parser.parse_args(args)
-    reject_stdin(__command__)
+    parser.parse_args(args or ())
+    reject_stdin(__command__, stdin)
+    
+    shell(
+        out_dir=pwd or OUTPUT_DIR,
+    )
     
     
-    setup_django(OUTPUT_DIR)
-    from django.core.management import call_command
-    call_command("shell_plus")
-
 
 
 if __name__ == '__main__':
 if __name__ == '__main__':
-    main()
+    main(args=sys.argv[1:], stdin=sys.stdin)

+ 82 - 14
archivebox/cli/archivebox_update.py

@@ -2,27 +2,36 @@
 
 
 __package__ = 'archivebox.cli'
 __package__ = 'archivebox.cli'
 __command__ = 'archivebox update'
 __command__ = 'archivebox update'
-__description__ = 'Import any new links from subscriptions and retry any previously failed/skipped links.'
+__description__ = 'Import any new links from subscriptions and retry any previously failed/skipped links'
 
 
 import sys
 import sys
 import argparse
 import argparse
 
 
-from typing import List
+from typing import List, Optional, IO
 
 
-from ..legacy.config import check_data_folder
-from ..legacy.util import reject_stdin
-from ..legacy.main import update_archive_data
+from ..main import update
+from ..util import SmartFormatter, accept_stdin
+from ..config import OUTPUT_DIR
+from ..index import (
+    get_indexed_folders,
+    get_archived_folders,
+    get_unarchived_folders,
+    get_present_folders,
+    get_valid_folders,
+    get_invalid_folders,
+    get_duplicate_folders,
+    get_orphaned_folders,
+    get_corrupted_folders,
+    get_unrecognized_folders,
+)
 
 
 
 
-def main(args: List[str]=None):
-    check_data_folder()
-    
-    args = sys.argv[1:] if args is None else args
-
+def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional[str]=None) -> None:
     parser = argparse.ArgumentParser(
     parser = argparse.ArgumentParser(
         prog=__command__,
         prog=__command__,
         description=__description__,
         description=__description__,
         add_help=True,
         add_help=True,
+        formatter_class=SmartFormatter,
     )
     )
     parser.add_argument(
     parser.add_argument(
         '--only-new', #'-n',
         '--only-new', #'-n',
@@ -40,16 +49,75 @@ def main(args: List[str]=None):
         help='Resume the update process from a given timestamp',
         help='Resume the update process from a given timestamp',
         default=None,
         default=None,
     )
     )
+    parser.add_argument(
+        '--overwrite', #'-x',
+        action='store_true',
+        help='Ignore existing archived content and overwrite with new versions (DANGEROUS)',
+    )
+    parser.add_argument(
+        '--before', #'-b',
+        type=float,
+        help="Update only links bookmarked before the given timestamp.",
+        default=None,
+    )
+    parser.add_argument(
+        '--after', #'-a',
+        type=float,
+        help="Update only links bookmarked after the given timestamp.",
+        default=None,
+    )
+    parser.add_argument(
+        '--status',
+        type=str,
+        choices=('indexed', 'archived', 'unarchived', 'present', 'valid', 'invalid', 'duplicate', 'orphaned', 'corrupted', 'unrecognized'),
+        default='indexed',
+        help=(
+            'Update only links or data directories that have the given status\n'
+            f'    indexed       {get_indexed_folders.__doc__} (the default)\n'
+            f'    archived      {get_archived_folders.__doc__}\n'
+            f'    unarchived    {get_unarchived_folders.__doc__}\n'
+            '\n'
+            f'    present       {get_present_folders.__doc__}\n'
+            f'    valid         {get_valid_folders.__doc__}\n'
+            f'    invalid       {get_invalid_folders.__doc__}\n'
+            '\n'
+            f'    duplicate     {get_duplicate_folders.__doc__}\n'
+            f'    orphaned      {get_orphaned_folders.__doc__}\n'
+            f'    corrupted     {get_corrupted_folders.__doc__}\n'
+            f'    unrecognized  {get_unrecognized_folders.__doc__}\n'
+        )
+    )
+    parser.add_argument(
+        '--filter-type',
+        type=str,
+        choices=('exact', 'substring', 'domain', 'regex'),
+        default='exact',
+        help='Type of pattern matching to use when filtering URLs',
+    )
+    parser.add_argument(
+        'filter_patterns',
+        nargs='*',
+        type=str,
+        default=None,
+        help='List only URLs matching these filter patterns.'
+    )
     command = parser.parse_args(args)
     command = parser.parse_args(args)
-    reject_stdin(__command__)
+    filter_patterns_str = accept_stdin(stdin)
 
 
-    update_archive_data(
-        import_path=None,
+    update(
         resume=command.resume,
         resume=command.resume,
         only_new=command.only_new,
         only_new=command.only_new,
         index_only=command.index_only,
         index_only=command.index_only,
+        overwrite=command.overwrite,
+        filter_patterns_str=filter_patterns_str,
+        filter_patterns=command.filter_patterns,
+        filter_type=command.filter_type,
+        status=command.status,
+        after=command.after,
+        before=command.before,
+        out_dir=pwd or OUTPUT_DIR,
     )
     )
     
     
 
 
 if __name__ == '__main__':
 if __name__ == '__main__':
-    main()
+    main(args=sys.argv[1:], stdin=sys.stdin)

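The expanded update command exposes the same folder-status filters as list. A sketch of the equivalent update() call for re-archiving only entries with a given status, roughly `archivebox update --status=invalid --overwrite` (again assuming the Python API mirrors the CLI wiring above):

    from archivebox.main import update
    from archivebox.config import OUTPUT_DIR

    update(
        resume=None,
        only_new=False,
        index_only=False,
        overwrite=True,                  # per the --overwrite help: replace existing archived content
        filter_patterns_str=None,
        filter_patterns=[],
        filter_type='exact',
        status='invalid',                # any of the --status choices listed above
        after=None,
        before=None,
        out_dir=OUTPUT_DIR,
    )
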
+ 11 - 98
archivebox/cli/archivebox_version.py

@@ -4,26 +4,17 @@ __package__ = 'archivebox.cli'
 __command__ = 'archivebox version'
 __command__ = 'archivebox version'
 __description__ = 'Print the ArchiveBox version and dependency information'
 __description__ = 'Print the ArchiveBox version and dependency information'
 
 
-import os
-import re
 import sys
 import sys
 import argparse
 import argparse
 
 
-from ..legacy.util import reject_stdin, human_readable_size
-from ..legacy.config import (
-    ANSI,
-    VERSION,
-    CODE_LOCATIONS,
-    CONFIG_LOCATIONS,
-    DATA_LOCATIONS,
-    DEPENDENCIES,
-    check_dependencies,
-)
+from typing import Optional, List, IO
 
 
+from ..main import version
+from ..util import reject_stdin
+from ..config import OUTPUT_DIR
 
 
-def main(args=None):
-    args = sys.argv[1:] if args is None else args
 
 
+def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional[str]=None) -> None:
     parser = argparse.ArgumentParser(
     parser = argparse.ArgumentParser(
         prog=__command__,
         prog=__command__,
         description=__description__,
         description=__description__,
@@ -34,92 +25,14 @@ def main(args=None):
         action='store_true',
         action='store_true',
         help='Only print ArchiveBox version number and nothing else.',
         help='Only print ArchiveBox version number and nothing else.',
     )
     )
-    command = parser.parse_args(args)
-    reject_stdin(__command__)
+    command = parser.parse_args(args or ())
+    reject_stdin(__command__, stdin)
     
     
-    if command.quiet:
-        print(VERSION)
-    else:
-        print('ArchiveBox v{}'.format(VERSION))
-        print()
-
-        print('{white}[i] Dependency versions:{reset}'.format(**ANSI))
-        for name, dependency in DEPENDENCIES.items():
-            print_dependency_version(name, dependency)
-        
-        print()
-        print('{white}[i] Code locations:{reset}'.format(**ANSI))
-        for name, folder in CODE_LOCATIONS.items():
-            print_folder_status(name, folder)
-
-        print()
-        print('{white}[i] Config locations:{reset}'.format(**ANSI))
-        for name, folder in CONFIG_LOCATIONS.items():
-            print_folder_status(name, folder)
-
-        print()
-        print('{white}[i] Data locations:{reset}'.format(**ANSI))
-        for name, folder in DATA_LOCATIONS.items():
-            print_folder_status(name, folder)
-
-        print()
-        check_dependencies()
-
-
-def print_folder_status(name, folder):
-    if folder['enabled']:
-        if folder['is_valid']:
-            color, symbol, note = 'green', '√', 'valid'
-        else:
-            color, symbol, note, num_files = 'red', 'X', 'invalid', '?'
-    else:
-        color, symbol, note, num_files = 'lightyellow', '-', 'disabled', '-'
-
-    if folder['path']:
-        if os.path.exists(folder['path']):
-            num_files = (
-                f'{len(os.listdir(folder["path"]))} files'
-                if os.path.isdir(folder['path']) else
-                human_readable_size(os.path.getsize(folder['path']))
-            )
-        else:
-            num_files = 'missing'
-
-    print(
-        ANSI[color],
-        symbol,
-        ANSI['reset'],
-        name.ljust(24),
-        (folder["path"] or '').ljust(70),
-        num_files.ljust(14),
-        ANSI[color],
-        note,
-        ANSI['reset'],
-    )
-
-
-def print_dependency_version(name, dependency):
-    if dependency['enabled']:
-        if dependency['is_valid']:
-            color, symbol, note = 'green', '√', 'valid'
-            version = 'v' + re.search(r'[\d\.]+', dependency['version'])[0]
-        else:
-            color, symbol, note, version = 'red', 'X', 'invalid', '?'
-    else:
-        color, symbol, note, version = 'lightyellow', '-', 'disabled', '-'
-
-    print(
-        ANSI[color],
-        symbol,
-        ANSI['reset'],
-        name.ljust(24),
-        (dependency["path"] or '').ljust(70),
-        version.ljust(14),
-        ANSI[color],
-        note,
-        ANSI['reset'],
+    version(
+        quiet=command.quiet,
+        out_dir=pwd or OUTPUT_DIR,
     )
     )
 
 
 
 
 if __name__ == '__main__':
 if __name__ == '__main__':
-    main()
+    main(args=sys.argv[1:], stdin=sys.stdin)

+ 10 - 8
archivebox/legacy/logs.py → archivebox/cli/logging.py

@@ -1,3 +1,5 @@
+__package__ = 'archivebox.cli'
+
 import os
 import os
 import sys
 import sys
 
 
@@ -5,8 +7,8 @@ from datetime import datetime
 from dataclasses import dataclass
 from dataclasses import dataclass
 from typing import Optional, List
 from typing import Optional, List
 
 
-from .schema import Link, ArchiveResult
-from .config import ANSI, OUTPUT_DIR, IS_TTY
+from ..index.schema import Link, ArchiveResult
+from ..config import ANSI, OUTPUT_DIR, IS_TTY
 
 
 
 
 @dataclass
 @dataclass
@@ -80,7 +82,7 @@ def log_indexing_finished(out_path: str):
 
 
 ### Archiving Stage
 ### Archiving Stage
 
 
-def log_archiving_started(num_links: int, resume: Optional[float]):
+def log_archiving_started(num_links: int, resume: Optional[float]=None):
     start_ts = datetime.now()
     start_ts = datetime.now()
     _LAST_RUN_STATS.archiving_start_ts = start_ts
     _LAST_RUN_STATS.archiving_start_ts = start_ts
     print()
     print()
@@ -92,7 +94,7 @@ def log_archiving_started(num_links: int, resume: Optional[float]):
              **ANSI,
              **ANSI,
         ))
         ))
     else:
     else:
-        print('{green}[▶] [{}] Updating content for {} pages in archive...{reset}'.format(
+        print('{green}[▶] [{}] Updating content for {} matching pages in archive...{reset}'.format(
              start_ts.strftime('%Y-%m-%d %H:%M:%S'),
              start_ts.strftime('%Y-%m-%d %H:%M:%S'),
              num_links,
              num_links,
              **ANSI,
              **ANSI,
@@ -213,18 +215,18 @@ def log_archive_method_finished(result: ArchiveResult):
         print()


-def log_list_started(filter_patterns: List[str], filter_type: str):
+def log_list_started(filter_patterns: Optional[List[str]], filter_type: str):
     print('{green}[*] Finding links in the archive index matching these {} patterns:{reset}'.format(
         filter_type,
         **ANSI,
     ))
-    print('    {}'.format(' '.join(filter_patterns)))
+    print('    {}'.format(' '.join(filter_patterns or ())))

 def log_list_finished(links):
-    from .util import to_csv
+    from ..util import links_to_csv
     print()
     print('---------------------------------------------------------------------------------------------------')
-    print(to_csv(links, csv_cols=['timestamp', 'is_archived', 'num_outputs', 'url'], header=True, ljust=16, separator=' | '))
+    print(links_to_csv(links, csv_cols=['timestamp', 'is_archived', 'num_outputs', 'url'], header=True, ljust=16, separator=' | '))
     print('---------------------------------------------------------------------------------------------------')
     print()


+ 5 - 5
archivebox/tests.py → archivebox/cli/tests.py

@@ -1,6 +1,6 @@
 #!/usr/bin/env python3

-__package__ = 'archivebox'
+__package__ = 'archivebox.cli'


 import os
@@ -29,15 +29,15 @@ TEST_CONFIG = {
 OUTPUT_DIR = 'data.tests'
 os.environ.update(TEST_CONFIG)

-from .legacy.main import init
-from .legacy.index import load_main_index
-from .legacy.config import (
+from ..main import init
+from ..index import load_main_index
+from ..config import (
     SQL_INDEX_FILENAME,
     JSON_INDEX_FILENAME,
     HTML_INDEX_FILENAME,
 )

-from .cli import (
+from . import (
     archivebox_init,
     archivebox_add,
     archivebox_remove,

+ 26 - 22
archivebox/legacy/config.py → archivebox/config/__init__.py

@@ -1,4 +1,4 @@
-__package__ = 'archivebox.legacy'
+__package__ = 'archivebox.config'

 import os
 import io
@@ -13,7 +13,7 @@ from typing import Optional, Type, Tuple, Dict
 from subprocess import run, PIPE, DEVNULL
 from configparser import ConfigParser

-from .config_stubs import (
+from .stubs import (
     SimpleConfigValueDict,
     ConfigValue,
     ConfigDict,
@@ -40,7 +40,7 @@ CONFIG_DEFAULTS: Dict[str, ConfigDefaultDict] = {
     'GENERAL_CONFIG': {
         'OUTPUT_DIR':               {'type': str,   'default': None},
         'CONFIG_FILE':              {'type': str,   'default': None},
-        'ONLY_NEW':                 {'type': bool,  'default': False},
+        'ONLY_NEW':                 {'type': bool,  'default': True},
         'TIMEOUT':                  {'type': int,   'default': 60},
         'MEDIA_TIMEOUT':            {'type': int,   'default': 3600},
         'OUTPUT_PERMISSIONS':       {'type': str,   'default': '755'},
@@ -122,8 +122,7 @@ ANSI = {k: '' for k in DEFAULT_CLI_COLORS.keys()}

 VERSION_FILENAME = 'VERSION'
 PYTHON_DIR_NAME = 'archivebox'
-LEGACY_DIR_NAME = 'legacy'
-TEMPLATES_DIR_NAME = 'templates'
+TEMPLATES_DIR_NAME = 'themes'

 ARCHIVE_DIR_NAME = 'archive'
 SOURCES_DIR_NAME = 'sources'
@@ -158,8 +157,7 @@ DERIVED_CONFIG_DEFAULTS: ConfigDefaultDict = {
     
     'REPO_DIR':                 {'default': lambda c: os.path.abspath(os.path.join(os.path.dirname(os.path.abspath(__file__)), '..', '..'))},
     'PYTHON_DIR':               {'default': lambda c: os.path.join(c['REPO_DIR'], PYTHON_DIR_NAME)},
-    'LEGACY_DIR':               {'default': lambda c: os.path.join(c['PYTHON_DIR'], LEGACY_DIR_NAME)},
-    'TEMPLATES_DIR':            {'default': lambda c: os.path.join(c['LEGACY_DIR'], TEMPLATES_DIR_NAME)},
+    'TEMPLATES_DIR':            {'default': lambda c: os.path.join(c['PYTHON_DIR'], TEMPLATES_DIR_NAME, 'legacy')},
     
     'OUTPUT_DIR':               {'default': lambda c: os.path.abspath(os.path.expanduser(c['OUTPUT_DIR'])) if c['OUTPUT_DIR'] else os.path.abspath(os.curdir)},
     'ARCHIVE_DIR':              {'default': lambda c: os.path.join(c['OUTPUT_DIR'], ARCHIVE_DIR_NAME)},
@@ -210,7 +208,7 @@ DERIVED_CONFIG_DEFAULTS: ConfigDefaultDict = {

     'DEPENDENCIES':             {'default': lambda c: get_dependency_info(c)},
     'CODE_LOCATIONS':           {'default': lambda c: get_code_locations(c)},
-    'CONFIG_LOCATIONS':         {'default': lambda c: get_config_locations(c)},
+    'EXTERNAL_LOCATIONS':       {'default': lambda c: get_external_locations(c)},
     'DATA_LOCATIONS':           {'default': lambda c: get_data_locations(c)},
     'CHROME_OPTIONS':           {'default': lambda c: get_chrome_info(c)},
 }
@@ -370,6 +368,7 @@ def load_config(defaults: ConfigDefaultDict,
             stderr('    For config documentation and examples see:')
             stderr('        https://github.com/pirate/ArchiveBox/wiki/Configuration')
             stderr()
+            raise
             raise SystemExit(2)
     
     return extended_config
@@ -492,18 +491,13 @@ def get_code_locations(config: ConfigDict) -> SimpleConfigValueDict:
         'REPO_DIR': {
             'path': os.path.abspath(config['REPO_DIR']),
             'enabled': True,
-            'is_valid': os.path.exists(os.path.join(config['REPO_DIR'], '.github')),
+            'is_valid': os.path.exists(os.path.join(config['REPO_DIR'], 'archivebox')),
         },
         'PYTHON_DIR': {
             'path': os.path.abspath(config['PYTHON_DIR']),
             'enabled': True,
             'is_valid': os.path.exists(os.path.join(config['PYTHON_DIR'], '__main__.py')),
         },
-        'LEGACY_DIR': {
-            'path': os.path.abspath(config['LEGACY_DIR']),
-            'enabled': True,
-            'is_valid': os.path.exists(os.path.join(config['LEGACY_DIR'], 'util.py')),
-        },
         'TEMPLATES_DIR': {
             'path': os.path.abspath(config['TEMPLATES_DIR']),
             'enabled': True,
@@ -511,14 +505,9 @@ def get_code_locations(config: ConfigDict) -> SimpleConfigValueDict:
         },
     }

-def get_config_locations(config: ConfigDict) -> ConfigValue:
+def get_external_locations(config: ConfigDict) -> ConfigValue:
     abspath = lambda path: None if path is None else os.path.abspath(path)
     return {
-        'CONFIG_FILE': {
-            'path': abspath(config['CHROME_USER_DATA_DIR']),
-            'enabled': config['USE_CHROME'] and config['CHROME_USER_DATA_DIR'],
-            'is_valid': False if config['CHROME_USER_DATA_DIR'] is None else os.path.exists(os.path.join(config['CHROME_USER_DATA_DIR'], 'Default')),
-        },
         'CHROME_USER_DATA_DIR': {
             'path': abspath(config['CHROME_USER_DATA_DIR']),
             'enabled': config['USE_CHROME'] and config['CHROME_USER_DATA_DIR'],
@@ -553,11 +542,26 @@ def get_data_locations(config: ConfigDict) -> ConfigValue:
             'enabled': True,
             'is_valid': os.path.exists(config['ARCHIVE_DIR']),
         },
+        'CONFIG_FILE': {
+            'path': os.path.abspath(config['CONFIG_FILE']),
+            'enabled': True,
+            'is_valid': os.path.exists(config['CONFIG_FILE']),
+        },
         'SQL_INDEX': {
+            'path': os.path.abspath(os.path.join(config['OUTPUT_DIR'], SQL_INDEX_FILENAME)),
+            'enabled': True,
+            'is_valid': os.path.exists(os.path.join(config['OUTPUT_DIR'], SQL_INDEX_FILENAME)),
+        },
+        'JSON_INDEX': {
             'path': os.path.abspath(os.path.join(config['OUTPUT_DIR'], JSON_INDEX_FILENAME)),
             'enabled': True,
             'is_valid': os.path.exists(os.path.join(config['OUTPUT_DIR'], JSON_INDEX_FILENAME)),
         },
+        'HTML_INDEX': {
+            'path': os.path.abspath(os.path.join(config['OUTPUT_DIR'], HTML_INDEX_FILENAME)),
+            'enabled': True,
+            'is_valid': os.path.exists(os.path.join(config['OUTPUT_DIR'], HTML_INDEX_FILENAME)),
+        },
     }
     }

 def get_dependency_info(config: ConfigDict) -> ConfigValue:
@@ -731,7 +735,7 @@ def check_data_folder(out_dir: Optional[str]=None, config: ConfigDict=CONFIG) ->

     json_index_exists = os.path.exists(os.path.join(output_dir, JSON_INDEX_FILENAME))
     if not json_index_exists:
-        stderr('[X] No archive index was found in current directory.', color='red')
+        stderr('[X] No archive main index was found in current directory.', color='red')
         stderr(f'    {output_dir}')
         stderr()
         stderr('    Are you running archivebox in the right folder?')
@@ -743,7 +747,7 @@ def check_data_folder(out_dir: Optional[str]=None, config: ConfigDict=CONFIG) ->
         raise SystemExit(2)

     sql_index_exists = os.path.exists(os.path.join(output_dir, SQL_INDEX_FILENAME))
-    from .storage.sql import list_migrations
+    from ..index.sql import list_migrations

     pending_migrations = [name for status, name in list_migrations() if not status]


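Aside (not part of the commit): the expanded get_data_locations() mapping above makes it easy to report which on-disk index files exist. A minimal sketch, assuming the config module still exposes the derived CONFIG dict (including DATA_LOCATIONS) as shown in its defaults table:

# Rough sketch using the 'path' / 'enabled' / 'is_valid' keys built by get_data_locations()
from archivebox.config import CONFIG   # assumption: CONFIG is the fully-derived config dict

for name, info in CONFIG['DATA_LOCATIONS'].items():
    status = 'ok' if info['is_valid'] else 'missing'
    print(f'{name:<14} {status:<8} {info["path"]}')
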
+ 1 - 1
archivebox/legacy/config_stubs.py → archivebox/config/stubs.py

@@ -17,6 +17,7 @@ class ConfigDict(BaseConfig, total=False):
     SHOW_PROGRESS: bool

     OUTPUT_DIR: str
+    CONFIG_FILE: str
     ONLY_NEW: bool
     TIMEOUT: int
     MEDIA_TIMEOUT: int
@@ -63,7 +64,6 @@ class ConfigDict(BaseConfig, total=False):
     ANSI: Dict[str, str]
     REPO_DIR: str
     PYTHON_DIR: str
-    LEGACY_DIR: str
     TEMPLATES_DIR: str
     ARCHIVE_DIR: str
     SOURCES_DIR: str

+ 2 - 4
archivebox/core/admin.py

@@ -1,9 +1,7 @@
-
-from datetime import datetime
-
 from django.contrib import admin

-from .models import Page
+from core.models import Page
+

 class PageAdmin(admin.ModelAdmin):
     list_display = ('timestamp', 'short_url', 'title', 'is_archived', 'num_outputs', 'added', 'updated', 'url_hash')

+ 2 - 2
archivebox/core/models.py

@@ -4,8 +4,8 @@ import uuid

 from django.db import models

-from legacy.schema import Link
-from legacy.util import parse_date
+from ..util import parse_date
+from ..index.schema import Link


 class Page(models.Model):

+ 3 - 3
archivebox/core/views.py

@@ -2,8 +2,8 @@ from django.shortcuts import render

 from django.views import View

-from legacy.config import OUTPUT_DIR
-from legacy.index import load_main_index, load_main_index_meta
+from .index import load_main_index, load_main_index_meta
+from .config import OUTPUT_DIR


 class MainIndex(View):
@@ -34,7 +34,7 @@ class AddLinks(View):
     def post(self, request):
         import_path = request.POST['url']
         
-        # TODO: add the links to the index here using archivebox.legacy.main.update_archive_data
+        # TODO: add the links to the index here using archivebox.main.add
         print(f'Adding URL: {import_path}')

         return render(template_name=self.template, request=request, context={})

+ 17 - 4
archivebox/core/welcome_message.py

@@ -1,4 +1,17 @@
-print()
-print('[i] Welcome to the ArchiveBox Shell! Example usage:')
-print('    Page.objects.all()')
-print('    User.objects.all()')
+from cli import list_subcommands
+
+from .config import ANSI
+
+
+if __name__ == '__main__':
+    print('{green}# ArchiveBox Imports{reset}'.format(**ANSI))
+    # print('from archivebox.core.models import Page, User')
+    print('{green}from archivebox.cli import\narchivebox_{}{reset}'.format("\narchivebox_".join(list_subcommands().keys()), **ANSI))
+    print()
+    print('[i] Welcome to the ArchiveBox Shell! Example use:')
+    print('    print(Page.objects.filter(is_archived=True).count())')
+    print('    Page.objects.get(url="https://example.com").as_json()')
+
+    print('    Page.objects.get(url="https://example.com").as_json()')
+
+    print('    from archivebox.main import get_invalid_folders')

+ 105 - 0
archivebox/extractors/__init__.py

@@ -0,0 +1,105 @@
+__package__ = 'archivebox.extractors'
+
+import os
+
+from typing import Optional
+from datetime import datetime
+
+from ..index.schema import Link
+from ..index import (
+    load_link_details,
+    write_link_details,
+    patch_main_index,
+)
+from ..util import enforce_types
+from ..cli.logging import (
+    log_link_archiving_started,
+    log_link_archiving_finished,
+    log_archive_method_started,
+    log_archive_method_finished,
+)
+
+from .title import should_save_title, save_title
+from .favicon import should_save_favicon, save_favicon
+from .wget import should_save_wget, save_wget
+from .pdf import should_save_pdf, save_pdf
+from .screenshot import should_save_screenshot, save_screenshot
+from .dom import should_save_dom, save_dom
+from .git import should_save_git, save_git
+from .media import should_save_media, save_media
+from .archive_org import should_save_archive_dot_org, save_archive_dot_org
+
+
+@enforce_types
+def archive_link(link: Link, overwrite: bool=False, out_dir: Optional[str]=None) -> Link:
+    """download the DOM, PDF, and a screenshot into a folder named after the link's timestamp"""
+
+    ARCHIVE_METHODS = (
+        ('title', should_save_title, save_title),
+        ('favicon', should_save_favicon, save_favicon),
+        ('wget', should_save_wget, save_wget),
+        ('pdf', should_save_pdf, save_pdf),
+        ('screenshot', should_save_screenshot, save_screenshot),
+        ('dom', should_save_dom, save_dom),
+        ('git', should_save_git, save_git),
+        ('media', should_save_media, save_media),
+        ('archive_org', should_save_archive_dot_org, save_archive_dot_org),
+    )
+    
+    out_dir = out_dir or link.link_dir
+    try:
+        is_new = not os.path.exists(out_dir)
+        if is_new:
+            os.makedirs(out_dir)
+
+        link = load_link_details(link, out_dir=out_dir)
+        log_link_archiving_started(link, out_dir, is_new)
+        link = link.overwrite(updated=datetime.now())
+        stats = {'skipped': 0, 'succeeded': 0, 'failed': 0}
+
+        for method_name, should_run, method_function in ARCHIVE_METHODS:
+            try:
+                if method_name not in link.history:
+                    link.history[method_name] = []
+                
+                if should_run(link, out_dir) or overwrite:
+                    log_archive_method_started(method_name)
+
+                    result = method_function(link=link, out_dir=out_dir)
+
+                    link.history[method_name].append(result)
+
+                    stats[result.status] += 1
+                    log_archive_method_finished(result)
+                else:
+                    stats['skipped'] += 1
+            except Exception as e:
+                raise Exception('Exception in archive_methods.save_{}(Link(url={}))'.format(
+                    method_name,
+                    link.url,
+                )) from e
+
+        # print('    ', stats)
+
+        write_link_details(link, out_dir=link.link_dir)
+        patch_main_index(link)
+        
+        # # If any changes were made, update the main links index json and html
+        # was_changed = stats['succeeded'] or stats['failed']
+        # if was_changed:
+        #     patch_main_index(link)
+
+        log_link_archiving_finished(link, link.link_dir, is_new, stats)
+
+    except KeyboardInterrupt:
+        try:
+            write_link_details(link, out_dir=link.link_dir)
+        except:
+            pass
+        raise
+
+    except Exception as err:
+        print('    ! Failed to archive link: {}: {}'.format(err.__class__.__name__, err))
+        raise
+
+    return link

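For context, a hypothetical invocation of the new extractor pipeline (none of this appears in the diff): archive_link() loads any existing per-link index, runs each should_save_*/save_* pair in order, appends the ArchiveResults to link.history, and patches the main index. A minimal sketch, assuming an already-initialized archive in the current directory:

# Hedged usage sketch of archivebox.extractors.archive_link
from archivebox.index import load_main_index
from archivebox.extractors import archive_link

for link in load_main_index(out_dir='.'):
    archive_link(link, overwrite=False)   # skips any method whose output already exists
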
+ 115 - 0
archivebox/extractors/archive_org.py

@@ -0,0 +1,115 @@
+__package__ = 'archivebox.extractors'
+
+import os
+
+from typing import Optional, List, Dict, Tuple
+from collections import defaultdict
+
+from ..index.schema import Link, ArchiveResult, ArchiveOutput
+from ..util import (
+    enforce_types,
+    TimedProgress,
+    run,
+    PIPE,
+    DEVNULL,
+    is_static_file,
+    ArchiveError,
+    chmod_file,
+)
+from ..config import (
+    VERSION,
+    TIMEOUT,
+    SAVE_ARCHIVE_DOT_ORG,
+    CURL_BINARY,
+    CURL_VERSION,
+    CHECK_SSL_VALIDITY
+)
+
+
+
+@enforce_types
+def should_save_archive_dot_org(link: Link, out_dir: Optional[str]=None) -> bool:
+    out_dir = out_dir or link.link_dir
+    if is_static_file(link.url):
+        return False
+
+    if os.path.exists(os.path.join(out_dir, 'archive.org.txt')):
+        # if open(path, 'r').read().strip() != 'None':
+        return False
+
+    return SAVE_ARCHIVE_DOT_ORG
+
+@enforce_types
+def save_archive_dot_org(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOUT) -> ArchiveResult:
+    """submit site to archive.org for archiving via their service, save returned archive url"""
+
+    out_dir = out_dir or link.link_dir
+    output: ArchiveOutput = 'archive.org.txt'
+    archive_org_url = None
+    submit_url = 'https://web.archive.org/save/{}'.format(link.url)
+    cmd = [
+        CURL_BINARY,
+        '--location',
+        '--head',
+        '--user-agent', 'ArchiveBox/{} (+https://github.com/pirate/ArchiveBox/)'.format(VERSION),  # be nice to the Archive.org people and show them where all this ArchiveBox traffic is coming from
+        '--max-time', str(timeout),
+        *([] if CHECK_SSL_VALIDITY else ['--insecure']),
+        submit_url,
+    ]
+    status = 'succeeded'
+    timer = TimedProgress(timeout, prefix='      ')
+    try:
+        result = run(cmd, stdout=PIPE, stderr=DEVNULL, cwd=out_dir, timeout=timeout)
+        content_location, errors = parse_archive_dot_org_response(result.stdout)
+        if content_location:
+            archive_org_url = 'https://web.archive.org{}'.format(content_location[0])
+        elif len(errors) == 1 and 'RobotAccessControlException' in errors[0]:
+            archive_org_url = None
+            # raise ArchiveError('Archive.org denied by {}/robots.txt'.format(domain(link.url)))
+        elif errors:
+            raise ArchiveError(', '.join(errors))
+        else:
+            raise ArchiveError('Failed to find "content-location" URL header in Archive.org response.')
+    except Exception as err:
+        status = 'failed'
+        output = err
+    finally:
+        timer.end()
+
+    if output and not isinstance(output, Exception):
+        # instead of writing None when archive.org rejects the url write the
+        # url to resubmit it to archive.org. This is so when the user visits
+        # the URL in person, it will attempt to re-archive it, and it'll show the
+        # nicer error message explaining why the url was rejected if it fails.
+        archive_org_url = archive_org_url or submit_url
+        with open(os.path.join(out_dir, str(output)), 'w', encoding='utf-8') as f:
+            f.write(archive_org_url)
+        chmod_file('archive.org.txt', cwd=out_dir)
+        output = archive_org_url
+
+    return ArchiveResult(
+        cmd=cmd,
+        pwd=out_dir,
+        cmd_version=CURL_VERSION,
+        output=output,
+        status=status,
+        **timer.stats,
+    )
+
+@enforce_types
+def parse_archive_dot_org_response(response: bytes) -> Tuple[List[str], List[str]]:
+    # Parse archive.org response headers
+    headers: Dict[str, List[str]] = defaultdict(list)
+
+    # lowercase all the header names and store in dict
+    for header in response.splitlines():
+        if b':' not in header or not header.strip():
+            continue
+        name, val = header.decode().split(':', 1)
+        headers[name.lower().strip()].append(val.strip())
+
+    # Get successful archive url in "content-location" header or any errors
+    content_location = headers['content-location']
+    errors = headers['x-archive-wayback-runtime-error']
+    return content_location, errors
+

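As an aside (not in the commit), parse_archive_dot_org_response() can be exercised on its own; the response bytes below are fabricated to show the shape of the headers curl --head returns, and only the Content-Location header matters here:

# Illustrative example with made-up header values
from archivebox.extractors.archive_org import parse_archive_dot_org_response

sample_headers = (
    b'HTTP/2 200\r\n'
    b'Content-Location: /web/20190327000000/https://example.com\r\n'
    b'Content-Type: text/html\r\n'
)
content_location, errors = parse_archive_dot_org_response(sample_headers)
assert content_location == ['/web/20190327000000/https://example.com']
assert errors == []
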
+ 73 - 0
archivebox/extractors/dom.py

@@ -0,0 +1,73 @@
+__package__ = 'archivebox.extractors'
+
+import os
+
+from typing import Optional
+
+from ..index.schema import Link, ArchiveResult, ArchiveOutput
+from ..util import (
+    enforce_types,
+    TimedProgress,
+    run,
+    PIPE,
+    is_static_file,
+    ArchiveError,
+    chrome_args,
+    chmod_file,
+)
+from ..config import (
+    TIMEOUT,
+    SAVE_DOM,
+    CHROME_VERSION,
+)
+
+
+
+@enforce_types
+def should_save_dom(link: Link, out_dir: Optional[str]=None) -> bool:
+    out_dir = out_dir or link.link_dir
+    if is_static_file(link.url):
+        return False
+    
+    if os.path.exists(os.path.join(out_dir, 'output.html')):
+        return False
+
+    return SAVE_DOM
+    
+@enforce_types
+def save_dom(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOUT) -> ArchiveResult:
+    """print HTML of site to file using chrome --dump-html"""
+
+    out_dir = out_dir or link.link_dir
+    output: ArchiveOutput = 'output.html'
+    output_path = os.path.join(out_dir, str(output))
+    cmd = [
+        *chrome_args(TIMEOUT=timeout),
+        '--dump-dom',
+        link.url
+    ]
+    status = 'succeeded'
+    timer = TimedProgress(timeout, prefix='      ')
+    try:
+        with open(output_path, 'w+') as f:
+            result = run(cmd, stdout=f, stderr=PIPE, cwd=out_dir, timeout=timeout)
+
+        if result.returncode:
+            hints = result.stderr.decode()
+            raise ArchiveError('Failed to save DOM', hints)
+
+        chmod_file(output, cwd=out_dir)
+    except Exception as err:
+        status = 'failed'
+        output = err
+    finally:
+        timer.end()
+
+    return ArchiveResult(
+        cmd=cmd,
+        pwd=out_dir,
+        cmd_version=CHROME_VERSION,
+        output=output,
+        status=status,
+        **timer.stats,
+    )

+ 65 - 0
archivebox/extractors/favicon.py

@@ -0,0 +1,65 @@
+__package__ = 'archivebox.extractors'
+
+import os
+
+from typing import Optional
+
+from ..index.schema import Link, ArchiveResult, ArchiveOutput
+from ..util import (
+    enforce_types,
+    TimedProgress,
+    domain,
+    run,
+    PIPE,
+    chmod_file,
+)
+from ..config import (
+    TIMEOUT,
+    SAVE_FAVICON,
+    CURL_BINARY,
+    CURL_VERSION,
+    CHECK_SSL_VALIDITY,
+)
+
+
+@enforce_types
+def should_save_favicon(link: Link, out_dir: Optional[str]=None) -> bool:
+    out_dir = out_dir or link.link_dir
+    if os.path.exists(os.path.join(out_dir, 'favicon.ico')):
+        return False
+
+    return SAVE_FAVICON
+    
+@enforce_types
+def save_favicon(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOUT) -> ArchiveResult:
+    """download site favicon from google's favicon api"""
+
+    out_dir = out_dir or link.link_dir
+    output: ArchiveOutput = 'favicon.ico'
+    cmd = [
+        CURL_BINARY,
+        '--max-time', str(timeout),
+        '--location',
+        '--output', str(output),
+        *([] if CHECK_SSL_VALIDITY else ['--insecure']),
+        'https://www.google.com/s2/favicons?domain={}'.format(domain(link.url)),
+    ]
+    status = 'succeeded'
+    timer = TimedProgress(timeout, prefix='      ')
+    try:
+        run(cmd, stdout=PIPE, stderr=PIPE, cwd=out_dir, timeout=timeout)
+        chmod_file(output, cwd=out_dir)
+    except Exception as err:
+        status = 'failed'
+        output = err
+    finally:
+        timer.end()
+
+    return ArchiveResult(
+        cmd=cmd,
+        pwd=out_dir,
+        cmd_version=CURL_VERSION,
+        output=output,
+        status=status,
+        **timer.stats,
+    )

+ 94 - 0
archivebox/extractors/git.py

@@ -0,0 +1,94 @@
+__package__ = 'archivebox.extractors'
+
+import os
+
+from typing import Optional
+
+from ..index.schema import Link, ArchiveResult, ArchiveOutput
+from ..util import (
+    enforce_types,
+    TimedProgress,
+    run,
+    PIPE,
+    is_static_file,
+    ArchiveError,
+    chmod_file,
+    domain,
+    extension,
+    without_query,
+    without_fragment,
+)
+from ..config import (
+    TIMEOUT,
+    SAVE_GIT,
+    GIT_BINARY,
+    GIT_VERSION,
+    GIT_DOMAINS,
+    CHECK_SSL_VALIDITY
+)
+
+
+
+@enforce_types
+def should_save_git(link: Link, out_dir: Optional[str]=None) -> bool:
+    out_dir = out_dir or link.link_dir
+    if is_static_file(link.url):
+        return False
+
+    if os.path.exists(os.path.join(out_dir, 'git')):
+        return False
+
+    is_clonable_url = (
+        (domain(link.url) in GIT_DOMAINS)
+        or (extension(link.url) == 'git')
+    )
+    if not is_clonable_url:
+        return False
+
+    return SAVE_GIT
+
+
+@enforce_types
+def save_git(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOUT) -> ArchiveResult:
+    """download full site using git"""
+
+    out_dir = out_dir or link.link_dir
+    output: ArchiveOutput = 'git'
+    output_path = os.path.join(out_dir, str(output))
+    os.makedirs(output_path, exist_ok=True)
+    cmd = [
+        GIT_BINARY,
+        'clone',
+        '--mirror',
+        '--recursive',
+        *([] if CHECK_SSL_VALIDITY else ['-c', 'http.sslVerify=false']),
+        without_query(without_fragment(link.url)),
+    ]
+    status = 'succeeded'
+    timer = TimedProgress(timeout, prefix='      ')
+    try:
+        result = run(cmd, stdout=PIPE, stderr=PIPE, cwd=output_path, timeout=timeout + 1)
+
+        if result.returncode == 128:
+            # ignore failed re-download when the folder already exists
+            pass
+        elif result.returncode > 0:
+            hints = 'Got git response code: {}.'.format(result.returncode)
+            raise ArchiveError('Failed to save git clone', hints)
+
+        chmod_file(output, cwd=out_dir)
+
+    except Exception as err:
+        status = 'failed'
+        output = err
+    finally:
+        timer.end()
+
+    return ArchiveResult(
+        cmd=cmd,
+        pwd=out_dir,
+        cmd_version=GIT_VERSION,
+        output=output,
+        status=status,
+        **timer.stats,
+    )

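A quick illustration (not in the commit) of the clonable-URL test used by should_save_git() above, assuming archivebox.util exposes the domain() and extension() helpers imported by this module and GIT_DOMAINS keeps a value like 'github.com,bitbucket.org,gitlab.com':

# Sketch of the same (domain in GIT_DOMAINS) or (extension == 'git') check
from archivebox.util import domain, extension
from archivebox.config import GIT_DOMAINS

def looks_clonable(url: str) -> bool:
    return (domain(url) in GIT_DOMAINS) or (extension(url) == 'git')

looks_clonable('https://github.com/pirate/ArchiveBox')   # True: domain match
looks_clonable('https://example.com/project.git')        # True: .git extension
looks_clonable('https://example.com/blog/post')          # False: skipped
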
+ 100 - 0
archivebox/extractors/media.py

@@ -0,0 +1,100 @@
+__package__ = 'archivebox.extractors'
+
+import os
+
+from typing import Optional
+
+from ..index.schema import Link, ArchiveResult, ArchiveOutput
+from ..util import (
+    enforce_types,
+    TimedProgress,
+    run,
+    PIPE,
+    is_static_file,
+    ArchiveError,
+    chmod_file,
+)
+from ..config import (
+    MEDIA_TIMEOUT,
+    SAVE_MEDIA,
+    YOUTUBEDL_BINARY,
+    YOUTUBEDL_VERSION,
+    CHECK_SSL_VALIDITY
+)
+
+
+@enforce_types
+def should_save_media(link: Link, out_dir: Optional[str]=None) -> bool:
+    out_dir = out_dir or link.link_dir
+
+    if is_static_file(link.url):
+        return False
+
+    if os.path.exists(os.path.join(out_dir, 'media')):
+        return False
+
+    return SAVE_MEDIA
+
+@enforce_types
+def save_media(link: Link, out_dir: Optional[str]=None, timeout: int=MEDIA_TIMEOUT) -> ArchiveResult:
+    """Download playlists or individual video, audio, and subtitles using youtube-dl"""
+
+    out_dir = out_dir or link.link_dir
+    output: ArchiveOutput = 'media'
+    output_path = os.path.join(out_dir, str(output))
+    os.makedirs(output_path, exist_ok=True)
+    cmd = [
+        YOUTUBEDL_BINARY,
+        '--write-description',
+        '--write-info-json',
+        '--write-annotations',
+        '--yes-playlist',
+        '--write-thumbnail',
+        '--no-call-home',
+        '--no-check-certificate',
+        '--user-agent',
+        '--all-subs',
+        '--extract-audio',
+        '--keep-video',
+        '--ignore-errors',
+        '--geo-bypass',
+        '--audio-format', 'mp3',
+        '--audio-quality', '320K',
+        '--embed-thumbnail',
+        '--add-metadata',
+        *([] if CHECK_SSL_VALIDITY else ['--no-check-certificate']),
+        link.url,
+    ]
+    status = 'succeeded'
+    timer = TimedProgress(timeout, prefix='      ')
+    try:
+        result = run(cmd, stdout=PIPE, stderr=PIPE, cwd=output_path, timeout=timeout + 1)
+        chmod_file(output, cwd=out_dir)
+        if result.returncode:
+            if (b'ERROR: Unsupported URL' in result.stderr
+                or b'HTTP Error 404' in result.stderr
+                or b'HTTP Error 403' in result.stderr
+                or b'URL could be a direct video link' in result.stderr
+                or b'Unable to extract container ID' in result.stderr):
+                # These happen too frequently on non-media pages to warrant printing to console
+                pass
+            else:
+                hints = (
+                    'Got youtube-dl response code: {}.'.format(result.returncode),
+                    *result.stderr.decode().split('\n'),
+                )
+                raise ArchiveError('Failed to save media', hints)
+    except Exception as err:
+        status = 'failed'
+        output = err
+    finally:
+        timer.end()
+
+    return ArchiveResult(
+        cmd=cmd,
+        pwd=out_dir,
+        cmd_version=YOUTUBEDL_VERSION,
+        output=output,
+        status=status,
+        **timer.stats,
+    )

+ 72 - 0
archivebox/extractors/pdf.py

@@ -0,0 +1,72 @@
+__package__ = 'archivebox.extractors'
+
+import os
+
+from typing import Optional
+
+from ..index.schema import Link, ArchiveResult, ArchiveOutput
+from ..util import (
+    enforce_types,
+    TimedProgress,
+    run,
+    PIPE,
+    is_static_file,
+    ArchiveError,
+    chrome_args,
+    chmod_file,
+)
+from ..config import (
+    TIMEOUT,
+    SAVE_PDF,
+    CHROME_VERSION,
+)
+
+
+
+@enforce_types
+def should_save_pdf(link: Link, out_dir: Optional[str]=None) -> bool:
+    out_dir = out_dir or link.link_dir
+    if is_static_file(link.url):
+        return False
+    
+    if os.path.exists(os.path.join(out_dir, 'output.pdf')):
+        return False
+
+    return SAVE_PDF
+
+
+@enforce_types
+def save_pdf(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOUT) -> ArchiveResult:
+    """print PDF of site to file using chrome --headless"""
+
+    out_dir = out_dir or link.link_dir
+    output: ArchiveOutput = 'output.pdf'
+    cmd = [
+        *chrome_args(TIMEOUT=timeout),
+        '--print-to-pdf',
+        link.url,
+    ]
+    status = 'succeeded'
+    timer = TimedProgress(timeout, prefix='      ')
+    try:
+        result = run(cmd, stdout=PIPE, stderr=PIPE, cwd=out_dir, timeout=timeout)
+
+        if result.returncode:
+            hints = (result.stderr or result.stdout).decode()
+            raise ArchiveError('Failed to save PDF', hints)
+        
+        chmod_file('output.pdf', cwd=out_dir)
+    except Exception as err:
+        status = 'failed'
+        output = err
+    finally:
+        timer.end()
+
+    return ArchiveResult(
+        cmd=cmd,
+        pwd=out_dir,
+        cmd_version=CHROME_VERSION,
+        output=output,
+        status=status,
+        **timer.stats,
+    )

+ 71 - 0
archivebox/extractors/screenshot.py

@@ -0,0 +1,71 @@
+__package__ = 'archivebox.extractors'
+
+import os
+
+from typing import Optional
+
+from ..index.schema import Link, ArchiveResult, ArchiveOutput
+from ..util import (
+    enforce_types,
+    TimedProgress,
+    run,
+    PIPE,
+    is_static_file,
+    ArchiveError,
+    chrome_args,
+    chmod_file,
+)
+from ..config import (
+    TIMEOUT,
+    SAVE_SCREENSHOT,
+    CHROME_VERSION,
+)
+
+
+
+@enforce_types
+def should_save_screenshot(link: Link, out_dir: Optional[str]=None) -> bool:
+    out_dir = out_dir or link.link_dir
+    if is_static_file(link.url):
+        return False
+    
+    if os.path.exists(os.path.join(out_dir, 'screenshot.png')):
+        return False
+
+    return SAVE_SCREENSHOT
+
+@enforce_types
+def save_screenshot(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOUT) -> ArchiveResult:
+    """take screenshot of site using chrome --headless"""
+    
+    out_dir = out_dir or link.link_dir
+    output: ArchiveOutput = 'screenshot.png'
+    cmd = [
+        *chrome_args(TIMEOUT=timeout),
+        '--screenshot',
+        link.url,
+    ]
+    status = 'succeeded'
+    timer = TimedProgress(timeout, prefix='      ')
+    try:
+        result = run(cmd, stdout=PIPE, stderr=PIPE, cwd=out_dir, timeout=timeout)
+
+        if result.returncode:
+            hints = (result.stderr or result.stdout).decode()
+            raise ArchiveError('Failed to save screenshot', hints)
+
+        chmod_file(output, cwd=out_dir)
+    except Exception as err:
+        status = 'failed'
+        output = err
+    finally:
+        timer.end()
+
+    return ArchiveResult(
+        cmd=cmd,
+        pwd=out_dir,
+        cmd_version=CHROME_VERSION,
+        output=output,
+        status=status,
+        **timer.stats,
+    )

+ 63 - 0
archivebox/extractors/title.py

@@ -0,0 +1,63 @@
+__package__ = 'archivebox.extractors'
+
+from typing import Optional
+
+from ..index.schema import Link, ArchiveResult, ArchiveOutput
+from ..util import (
+    enforce_types,
+    TimedProgress,
+    is_static_file,
+    ArchiveError,
+    fetch_page_title,
+)
+from ..config import (
+    TIMEOUT,
+    SAVE_TITLE,
+    CURL_BINARY,
+    CURL_VERSION,
+)
+
+
+@enforce_types
+def should_save_title(link: Link, out_dir: Optional[str]=None) -> bool:
+    # if link already has valid title, skip it
+    if link.title and not link.title.lower().startswith('http'):
+        return False
+
+    if is_static_file(link.url):
+        return False
+
+    return SAVE_TITLE
+
+@enforce_types
+def save_title(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOUT) -> ArchiveResult:
+    """try to guess the page's title from its content"""
+
+    output: ArchiveOutput = None
+    cmd = [
+        CURL_BINARY,
+        link.url,
+        '|',
+        'grep',
+        '<title',
+    ]
+    status = 'succeeded'
+    timer = TimedProgress(timeout, prefix='      ')
+    try:
+        output = fetch_page_title(link.url, timeout=timeout, progress=False)
+        if not output:
+            raise ArchiveError('Unable to detect page title')
+    except Exception as err:
+        status = 'failed'
+        output = err
+    finally:
+        timer.end()
+
+    return ArchiveResult(
+        cmd=cmd,
+        pwd=out_dir,
+        cmd_version=CURL_VERSION,
+        output=output,
+        status=status,
+        **timer.stats,
+    )

+ 123 - 0
archivebox/extractors/wget.py

@@ -0,0 +1,123 @@
+__package__ = 'archivebox.extractors'
+
+import os
+
+from typing import Optional
+from datetime import datetime
+
+from ..index.schema import Link, ArchiveResult, ArchiveOutput
+from ..util import (
+    enforce_types,
+    TimedProgress,
+    run,
+    PIPE,
+    wget_output_path,
+    ArchiveError,
+)
+from ..config import (
+    TIMEOUT,
+    SAVE_WGET,
+    SAVE_WARC,
+    WGET_BINARY,
+    WGET_VERSION,
+    CHECK_SSL_VALIDITY,
+    SAVE_WGET_REQUISITES,
+    WGET_AUTO_COMPRESSION,
+    WGET_USER_AGENT,
+    COOKIES_FILE,
+)
+
+
+
+@enforce_types
+def should_save_wget(link: Link, out_dir: Optional[str]=None) -> bool:
+    output_path = wget_output_path(link)
+    out_dir = out_dir or link.link_dir
+    if output_path and os.path.exists(os.path.join(out_dir, output_path)):
+        return False
+
+    return SAVE_WGET
+
+
+@enforce_types
+def save_wget(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOUT) -> ArchiveResult:
+    """download full site using wget"""
+
+    out_dir = out_dir or link.link_dir
+    if SAVE_WARC:
+        warc_dir = os.path.join(out_dir, 'warc')
+        os.makedirs(warc_dir, exist_ok=True)
+        warc_path = os.path.join('warc', str(int(datetime.now().timestamp())))
+
+    # WGET CLI Docs: https://www.gnu.org/software/wget/manual/wget.html
+    output: ArchiveOutput = None
+    cmd = [
+        WGET_BINARY,
+        # '--server-response',  # print headers for better error parsing
+        '--no-verbose',
+        '--adjust-extension',
+        '--convert-links',
+        '--force-directories',
+        '--backup-converted',
+        '--span-hosts',
+        '--no-parent',
+        '-e', 'robots=off',
+        '--restrict-file-names=windows',
+        '--timeout={}'.format(timeout),
+        *([] if SAVE_WARC else ['--timestamping']),
+        *(['--warc-file={}'.format(warc_path)] if SAVE_WARC else []),
+        *(['--page-requisites'] if SAVE_WGET_REQUISITES else []),
+        *(['--user-agent={}'.format(WGET_USER_AGENT)] if WGET_USER_AGENT else []),
+        *(['--load-cookies', COOKIES_FILE] if COOKIES_FILE else []),
+        *(['--compression=auto'] if WGET_AUTO_COMPRESSION else []),
+        *([] if CHECK_SSL_VALIDITY else ['--no-check-certificate', '--no-hsts']),
+        link.url,
+    ]
+    status = 'succeeded'
+    timer = TimedProgress(timeout, prefix='      ')
+    try:
+        result = run(cmd, stdout=PIPE, stderr=PIPE, cwd=out_dir, timeout=timeout)
+        output = wget_output_path(link)
+
+        # parse out number of files downloaded from last line of stderr:
+        #  "Downloaded: 76 files, 4.0M in 1.6s (2.52 MB/s)"
+        output_tail = [
+            line.strip()
+            for line in (result.stdout + result.stderr).decode().rsplit('\n', 3)[-3:]
+            if line.strip()
+        ]
+        files_downloaded = (
+            int(output_tail[-1].strip().split(' ', 2)[1] or 0)
+            if 'Downloaded:' in output_tail[-1]
+            else 0
+        )
+
+        # Check for common failure cases
+        if result.returncode > 0 and files_downloaded < 1:
+            hints = (
+                'Got wget response code: {}.'.format(result.returncode),
+                *output_tail,
+            )
+            if b'403: Forbidden' in result.stderr:
+                raise ArchiveError('403 Forbidden (try changing WGET_USER_AGENT)', hints)
+            if b'404: Not Found' in result.stderr:
+                raise ArchiveError('404 Not Found', hints)
+            if b'ERROR 500: Internal Server Error' in result.stderr:
+                raise ArchiveError('500 Internal Server Error', hints)
+            raise ArchiveError('Got an error from the server', hints)
+
+        # chmod_file(output, cwd=out_dir)
+    except Exception as err:
+        status = 'failed'
+        output = err
+    finally:
+        timer.end()
+
+    return ArchiveResult(
+        cmd=cmd,
+        pwd=out_dir,
+        cmd_version=WGET_VERSION,
+        output=output,
+        status=status,
+        **timer.stats,
+    )

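Standalone illustration (not in the diff) of the stderr parsing above: the files_downloaded expression splits wget's final summary line on the first two spaces and takes the count. The summary text below uses the format quoted in the comment, with made-up values:

summary = 'Downloaded: 76 files, 4.0M in 1.6s (2.52 MB/s)'
files_downloaded = (
    int(summary.strip().split(' ', 2)[1] or 0)
    if 'Downloaded:' in summary
    else 0
)
assert files_downloaded == 76
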
+ 277 - 32
archivebox/legacy/index.py → archivebox/index/__init__.py

@@ -1,14 +1,25 @@
-__package__ = 'archivebox.legacy'
+__package__ = 'archivebox.index'

+import re
 import os
-import json
+import shutil
+import json as pyjson

-from typing import List, Tuple, Optional, Iterable
+from itertools import chain
+from typing import List, Tuple, Dict, Optional, Iterable
 from collections import OrderedDict
 from contextlib import contextmanager

-from .schema import Link, ArchiveResult
-from .config import (
+from ..parsers import parse_links
+from ..util import (
+    scheme,
+    enforce_types,
+    TimedProgress,
+    atomic_write,
+    ExtendedEncoder,
+)
+from ..config import (
+    ARCHIVE_DIR_NAME,
     SQL_INDEX_FILENAME,
     JSON_INDEX_FILENAME,
     HTML_INDEX_FILENAME,
@@ -18,33 +29,30 @@ from .config import (
     ANSI,
     stderr,
 )
-from .storage.html import write_html_main_index, write_html_link_details
-from .storage.json import (
+from ..cli.logging import (
+    log_indexing_process_started,
+    log_indexing_process_finished,
+    log_indexing_started,
+    log_indexing_finished,
+    log_parsing_started,
+    log_parsing_finished,
+)
+
+from .schema import Link, ArchiveResult
+from .html import (
+    write_html_main_index,
+    write_html_link_details,
+)
+from .json import (
     parse_json_main_index,
     write_json_main_index,
     parse_json_link_details, 
     write_json_link_details,
 )
-from .storage.sql import (
+from .sql import (
     write_sql_main_index,
     parse_sql_main_index,
 )
-from .util import (
-    scheme,
-    enforce_types,
-    TimedProgress,
-    atomic_write,
-    ExtendedEncoder,
-)
-from .parse import parse_links
-from .logs import (
-    log_indexing_process_started,
-    log_indexing_process_finished,
-    log_indexing_started,
-    log_indexing_finished,
-    log_parsing_started,
-    log_parsing_finished,
-)

 ### Link filtering and checking

@@ -95,11 +103,11 @@ def merge_links(a: Link, b: Link) -> Link:
     }
     for method in all_methods:
         deduped_jsons = {
-            json.dumps(result, sort_keys=True, cls=ExtendedEncoder)
+            pyjson.dumps(result, sort_keys=True, cls=ExtendedEncoder)
             for result in history[method]
         }
         history[method] = list(reversed(sorted(
-            (ArchiveResult.from_json(json.loads(result)) for result in deduped_jsons),
+            (ArchiveResult.from_json(pyjson.loads(result)) for result in deduped_jsons),
             key=lambda result: result.start_ts,
         )))

@@ -114,7 +122,7 @@ def merge_links(a: Link, b: Link) -> Link:


 @enforce_types
-def validate_links(links: Iterable[Link]) -> Iterable[Link]:
+def validate_links(links: Iterable[Link]) -> List[Link]:
     links = archivable_links(links)  # remove chrome://, about:, mailto: etc.
     links = sorted_links(links)      # deterministically sort the links based on timstamp, url
     links = uniquefied_links(links)  # merge/dedupe duplicate timestamps & urls
@@ -128,7 +136,7 @@ def validate_links(links: Iterable[Link]) -> Iterable[Link]:
         stderr('        archivebox help')
         raise SystemExit(1)

-    return links
+    return list(links)


 @enforce_types
@@ -259,23 +267,32 @@ def load_main_index_meta(out_dir: str=OUTPUT_DIR) -> Optional[dict]:
     index_path = os.path.join(out_dir, JSON_INDEX_FILENAME)
     if os.path.exists(index_path):
         with open(index_path, 'r', encoding='utf-8') as f:
-            meta_dict = json.load(f)
+            meta_dict = pyjson.load(f)
             meta_dict.pop('links')
             return meta_dict

     return None

 @enforce_types
-def import_new_links(existing_links: List[Link], import_path: str) -> Tuple[List[Link], List[Link]]:
+def import_new_links(existing_links: List[Link],
+                     import_path: str,
+                     out_dir: str=OUTPUT_DIR) -> Tuple[List[Link], List[Link]]:
+
     new_links: List[Link] = []

     # parse and validate the import file
     log_parsing_started(import_path)
     raw_links, parser_name = parse_links(import_path)
-    new_links = list(validate_links(raw_links))
+    new_links = validate_links(raw_links)

     # merge existing links in out_dir and new links
-    all_links = list(validate_links(existing_links + new_links))
+    all_links = validate_links(existing_links + new_links)
+    all_link_urls = {link.url for link in existing_links}
+
+    new_links = [
+        link for link in new_links
+        if link.url not in all_link_urls
+    ]

     if parser_name:
         num_parsed = len(raw_links)
@@ -345,3 +362,231 @@ def load_link_details(link: Link, out_dir: Optional[str]=None) -> Link:
         return merge_links(existing_link, link)

     return link
+
+
+
+LINK_FILTERS = {
+    'exact': lambda link, pattern: (link.url == pattern) or (link.base_url == pattern),
+    'substring': lambda link, pattern: pattern in link.url,
+    'regex': lambda link, pattern: bool(re.match(pattern, link.url)),
+    'domain': lambda link, pattern: link.domain == pattern,
+}
+
+@enforce_types
+def link_matches_filter(link: Link, filter_patterns: List[str], filter_type: str='exact') -> bool:
+    for pattern in filter_patterns:
+        try:
+            if LINK_FILTERS[filter_type](link, pattern):
+                return True
+        except Exception:
+            stderr()
+            stderr(
+                f'[X] Got invalid pattern for --filter-type={filter_type}:',
+                color='red',
+            )
+            stderr(f'    {pattern}')
+            raise SystemExit(2)
+
+    return False
+
+
+def get_indexed_folders(links, out_dir: str=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
+    """indexed links without checking archive status or data directory validity"""
+    return {
+        link.link_dir: link
+        for link in links
+    }
+
+def get_archived_folders(links, out_dir: str=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
+    """indexed links that are archived with a valid data directory"""
+    return {
+        link.link_dir: link
+        for link in filter(is_archived, links)
+    }
+
+def get_unarchived_folders(links, out_dir: str=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
+    """indexed links that are unarchived with no data directory or an empty data directory"""
+    return {
+        link.link_dir: link
+        for link in filter(is_unarchived, links)
+    }
+
+def get_present_folders(links, out_dir: str=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
+    """dirs that are expected to exist based on the main index"""
+    all_folders = {}
+
+    for entry in os.scandir(os.path.join(out_dir, ARCHIVE_DIR_NAME)):
+        if entry.is_dir(follow_symlinks=True):
+            link = None
+            try:
+                link = parse_json_link_details(entry.path)
+            except Exception:
+                pass
+
+            all_folders[entry.path] = link
+
+    return all_folders
+
+def get_valid_folders(links, out_dir: str=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
+    """dirs with a valid index matched to the main index and archived content"""
+    return {
+        link.link_dir: link
+        for link in filter(is_valid, links)
+    }
+
+def get_invalid_folders(links, out_dir: str=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
+    """dirs that are invalid for any reason: corrupted/duplicate/orphaned/unrecognized"""
+    duplicate = get_duplicate_folders(links, out_dir=OUTPUT_DIR)
+    orphaned = get_orphaned_folders(links, out_dir=OUTPUT_DIR)
+    corrupted = get_corrupted_folders(links, out_dir=OUTPUT_DIR)
+    unrecognized = get_unrecognized_folders(links, out_dir=OUTPUT_DIR)
+    return {**duplicate, **orphaned, **corrupted, **unrecognized}
+
+
+def get_duplicate_folders(links, out_dir: str=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
+    """dirs that conflict with other directories that have the same link URL or timestamp"""
+    links = list(links)
+    by_url = {link.url: 0 for link in links}
+    by_timestamp = {link.timestamp: 0 for link in links}
+
+    duplicate_folders = {}
+
+    indexed_folders = {link.link_dir for link in links}
+    data_folders = (
+        entry.path
+        for entry in os.scandir(os.path.join(out_dir, ARCHIVE_DIR_NAME))
+        if entry.is_dir(follow_symlinks=True) and entry.path not in indexed_folders
+    )
+
+    for path in chain(sorted(indexed_folders), sorted(data_folders)):
+        link = None
+        try:
+            link = parse_json_link_details(path)
+        except Exception:
+            pass
+
+        if link:
+            # link folder has same timestamp as different link folder
+            by_timestamp[link.timestamp] = by_timestamp.get(link.timestamp, 0) + 1
+            if by_timestamp[link.timestamp] > 1:
+                duplicate_folders[path] = link
+
+            # link folder has same url as different link folder
+            by_url[link.url] = by_url.get(link.url, 0) + 1
+            if by_url[link.url] > 1:
+                duplicate_folders[path] = link
+
+    return duplicate_folders
+
+def get_orphaned_folders(links, out_dir: str=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
+    """dirs that contain a valid index but aren't listed in the main index"""
+    links = list(links)
+    indexed_folders = {link.link_dir: link for link in links}
+    orphaned_folders = {}
+
+    for entry in os.scandir(os.path.join(out_dir, ARCHIVE_DIR_NAME)):
+        if entry.is_dir(follow_symlinks=True):
+            link = None
+            try:
+                link = parse_json_link_details(entry.path)
+            except Exception:
+                pass
+
+            if link and entry.path not in indexed_folders:
+                # folder is a valid link data dir with index details, but it's not in the main index
+                orphaned_folders[entry.path] = link
+
+    return orphaned_folders
+
+def get_corrupted_folders(links, out_dir: str=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
+    """dirs that don't contain a valid index and aren't listed in the main index"""
+    return {
+        link.link_dir: link
+        for link in filter(is_corrupt, links)
+    }
+
+def get_unrecognized_folders(links, out_dir: str=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
+    """dirs that don't contain recognizable archive data and aren't listed in the main index"""
+    by_timestamp = {link.timestamp: 0 for link in links}
+    unrecognized_folders: Dict[str, Optional[Link]] = {}
+
+    for entry in os.scandir(os.path.join(out_dir, ARCHIVE_DIR_NAME)):
+        if entry.is_dir(follow_symlinks=True):
+            index_exists = os.path.exists(os.path.join(entry.path, 'index.json'))
+            link = None
+            try:
+                link = parse_json_link_details(entry.path)
+            except Exception:
+                pass
+
+            if index_exists and link is None:
+                # index exists but it's corrupted or unparseable
+                unrecognized_folders[entry.path] = link
+            
+            elif not index_exists:
+                # link details index doesn't exist and the folder isn't in the main index
+                timestamp = entry.path.rsplit('/', 1)[-1]
+                if timestamp not in by_timestamp:
+                    unrecognized_folders[entry.path] = link
+
+    return unrecognized_folders
+
+
+def is_valid(link: Link) -> bool:
+    dir_exists = os.path.exists(link.link_dir)
+    index_exists = os.path.exists(os.path.join(link.link_dir, 'index.json'))
+    if not dir_exists:
+        # unarchived links are not included in the valid list
+        return False
+    if dir_exists and not index_exists:
+        return False
+    if dir_exists and index_exists:
+        try:
+            parsed_link = parse_json_link_details(link.link_dir)
+            return link.url == parsed_link.url
+        except Exception:
+            pass
+    return False
+
+def is_corrupt(link: Link) -> bool:
+    if not os.path.exists(link.link_dir):
+        # unarchived links are not considered corrupt
+        return False
+
+    if is_valid(link):
+        return False
+
+    return True
+
+def is_archived(link: Link) -> bool:
+    return is_valid(link) and link.is_archived
+    
+def is_unarchived(link: Link) -> bool:
+    if not os.path.exists(link.link_dir):
+        return True
+    return not link.is_archived
+
+
+def fix_invalid_folder_locations(out_dir: str=OUTPUT_DIR) -> Tuple[List[str], List[str]]:
+    fixed = []
+    cant_fix = []
+    for entry in os.scandir(os.path.join(out_dir, ARCHIVE_DIR_NAME)):
+        if entry.is_dir(follow_symlinks=True):
+            if os.path.exists(os.path.join(entry.path, 'index.json')):
+                link = parse_json_link_details(entry.path)
+                if not link:
+                    continue
+
+                if not entry.path.endswith(f'/{link.timestamp}'):
+                    dest = os.path.join(out_dir, ARCHIVE_DIR_NAME, link.timestamp)
+                    if os.path.exists(dest):
+                        cant_fix.append(entry.path)
+                    else:
+                        shutil.move(entry.path, dest)
+                        fixed.append(dest)
+
+                if link.link_dir != entry.path:
+                    link = link.overwrite(link_dir=entry.path)
+                    write_json_link_details(link, out_dir=entry.path)
+
+    return fixed, cant_fix
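Aside (not part of the diff): the helpers added above classify every folder under archive/ by comparing it against the main index. A rough sketch of how they can be combined into a single status summary, assuming they remain importable from the archivebox.index package this hunk appears to belong to (import path assumed, not shown in the diff):

    from typing import Dict

    from archivebox.index import (     # assumed import location for the helpers in this hunk
        load_main_index,
        get_duplicate_folders,
        get_orphaned_folders,
        get_corrupted_folders,
        get_unrecognized_folders,
    )

    def folder_status_counts(out_dir: str) -> Dict[str, int]:
        # classify every link data dir the same way `archivebox info` summarizes them
        links = list(load_main_index(out_dir=out_dir))
        return {
            'duplicate': len(get_duplicate_folders(links, out_dir=out_dir)),
            'orphaned': len(get_orphaned_folders(links, out_dir=out_dir)),
            'corrupted': len(get_corrupted_folders(links, out_dir=out_dir)),
            'unrecognized': len(get_unrecognized_folders(links, out_dir=out_dir)),
        }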

+ 13 - 13
archivebox/legacy/storage/html.py → archivebox/index/html.py

@@ -1,11 +1,22 @@
-__package__ = 'archivebox.legacy.storage'
+__package__ = 'archivebox.index'
 
 import os
 
 from datetime import datetime
 from typing import List, Optional, Iterator
 
-from ..schema import Link
+from .schema import Link
+from ..util import (
+    enforce_types,
+    ts_to_date,
+    urlencode,
+    htmlencode,
+    urldecode,
+    wget_output_path,
+    render_template,
+    atomic_write,
+    copy_and_overwrite,
+)
 from ..config import (
     OUTPUT_DIR,
     TEMPLATES_DIR,
@@ -18,17 +29,6 @@ from ..config import (
     ROBOTS_TXT_FILENAME,
     FAVICON_FILENAME,
 )
-from ..util import (
-    enforce_types,
-    ts_to_date,
-    urlencode,
-    htmlencode,
-    urldecode,
-    wget_output_path,
-    render_template,
-    atomic_write,
-    copy_and_overwrite,
-)
 
 join = lambda *paths: os.path.join(*paths)
 MAIN_INDEX_TEMPLATE = join(TEMPLATES_DIR, 'main_index.html')

+ 9 - 9
archivebox/legacy/storage/json.py → archivebox/index/json.py

@@ -1,4 +1,4 @@
-__package__ = 'archivebox.legacy.storage'
+__package__ = 'archivebox.index'
 
 import os
 import sys
@@ -7,7 +7,8 @@ import json
 from datetime import datetime
 from typing import List, Optional, Iterator
 
-from ..schema import Link, ArchiveResult
+from .schema import Link, ArchiveResult
+from ..util import enforce_types, atomic_write
 from ..config import (
     VERSION,
     OUTPUT_DIR,
@@ -17,14 +18,11 @@ from ..config import (
     JSON_INDEX_FILENAME,
     ARCHIVE_DIR_NAME,
 )
-from ..util import (
-    enforce_types,
-    atomic_write,
-)
+
 
 MAIN_INDEX_HEADER = {
     'info': 'This is an index of site data archived by ArchiveBox: The self-hosted web archive.',
-    'schema': 'archivebox.legacy.storage.json',
+    'schema': 'archivebox.index.json',
     'copyright_info': FOOTER_INFO,
     'meta': {
         'project': 'ArchiveBox',
@@ -43,7 +41,7 @@ MAIN_INDEX_HEADER = {
 
 @enforce_types
 def parse_json_main_index(out_dir: str=OUTPUT_DIR) -> Iterator[Link]:
-    """parse a archive index json file and return the list of links"""
+    """parse an archive index json file and return the list of links"""
 
     index_path = os.path.join(out_dir, JSON_INDEX_FILENAME)
     if os.path.exists(index_path):
@@ -110,4 +108,6 @@ def parse_json_links_details(out_dir: str) -> Iterator[Link]:
     for entry in os.scandir(os.path.join(out_dir, ARCHIVE_DIR_NAME)):
         if entry.is_dir(follow_symlinks=True):
             if os.path.exists(os.path.join(entry.path, 'index.json')):
-                yield parse_json_link_details(entry.path)
+                link = parse_json_link_details(entry.path)
+                if link:
+                    yield link

+ 24 - 22
archivebox/legacy/schema.py → archivebox/index/schema.py

@@ -1,3 +1,5 @@
+__package__ = 'archivebox.index'
+
 import os
 
 from datetime import datetime
@@ -48,7 +50,7 @@ class ArchiveResult:
 
     @classmethod
     def from_json(cls, json_info):
-        from .util import parse_date
+        from ..util import parse_date
 
         info = {
             key: val
@@ -60,12 +62,12 @@ class ArchiveResult:
         return cls(**info)
 
     def to_json(self, indent=4, sort_keys=True):
-        from .util import to_json
+        from ..util import to_json
 
         return to_json(self, indent=indent, sort_keys=sort_keys)
 
     def to_csv(self, cols=None, ljust: int=0, separator: str=','):
-        from .util import to_json
+        from ..util import to_json
 
         cols = cols or self.field_names()
         return separator.join(
@@ -115,7 +117,7 @@ class Link:
         return float(self.timestamp) > float(other.timestamp)
 
     def typecheck(self) -> None:
-        from .config import stderr, ANSI
+        from ..config import stderr, ANSI
         try:
             assert self.schema == self.__class__.__name__
             assert isinstance(self.timestamp, str) and self.timestamp
@@ -176,7 +178,7 @@ class Link:
 
     @classmethod
     def from_json(cls, json_info):
-        from .util import parse_date
+        from ..util import parse_date
         
         info = {
             key: val
@@ -200,12 +202,12 @@ class Link:
         return cls(**info)
 
     def to_json(self, indent=4, sort_keys=True):
-        from .util import to_json
+        from ..util import to_json
 
         return to_json(self, indent=indent, sort_keys=sort_keys)
 
     def to_csv(self, csv_cols: List[str], ljust: int=0, separator: str=','):
-        from .util import to_json
+        from ..util import to_json
 
         return separator.join(
             to_json(getattr(self, col), indent=None).ljust(ljust)
@@ -218,60 +220,60 @@ class Link:
 
     @property
     def link_dir(self) -> str:
-        from .config import CONFIG
+        from ..config import CONFIG
         return os.path.join(CONFIG['ARCHIVE_DIR'], self.timestamp)
 
     @property
     def archive_path(self) -> str:
-        from .config import ARCHIVE_DIR_NAME
+        from ..config import ARCHIVE_DIR_NAME
         return '{}/{}'.format(ARCHIVE_DIR_NAME, self.timestamp)
     
     ### URL Helpers
     @property
     def url_hash(self):
-        from .util import hashurl
+        from ..util import hashurl
 
         return hashurl(self.url)
 
     @property
     def scheme(self) -> str:
-        from .util import scheme
+        from ..util import scheme
         return scheme(self.url)
 
     @property
     def extension(self) -> str:
-        from .util import extension
+        from ..util import extension
         return extension(self.url)
 
     @property
     def domain(self) -> str:
-        from .util import domain
+        from ..util import domain
         return domain(self.url)
 
     @property
     def path(self) -> str:
-        from .util import path
+        from ..util import path
         return path(self.url)
 
     @property
     def basename(self) -> str:
-        from .util import basename
+        from ..util import basename
        return basename(self.url)
 
     @property
     def base_url(self) -> str:
-        from .util import base_url
+        from ..util import base_url
         return base_url(self.url)
 
     ### Pretty Printing Helpers
     @property
     def bookmarked_date(self) -> Optional[str]:
-        from .util import ts_to_date
+        from ..util import ts_to_date
         return ts_to_date(self.timestamp) if self.timestamp else None
 
     @property
     def updated_date(self) -> Optional[str]:
-        from .util import ts_to_date
+        from ..util import ts_to_date
         return ts_to_date(self.updated) if self.updated else None
 
     @property
@@ -304,13 +306,13 @@ class Link:
 
     @property
     def is_static(self) -> bool:
-        from .util import is_static_file
+        from ..util import is_static_file
         return is_static_file(self.url)
 
     @property
     def is_archived(self) -> bool:
-        from .config import ARCHIVE_DIR
-        from .util import domain
+        from ..config import ARCHIVE_DIR
+        from ..util import domain
 
         output_paths = (
             domain(self.url),
@@ -352,7 +354,7 @@ class Link:
     def canonical_outputs(self) -> Dict[str, Optional[str]]:
         """predict the expected output paths that should be present after archiving"""
 
-        from .util import wget_output_path
+        from ..util import wget_output_path
         canonical = {
             'index_path': 'index.html',
             'favicon_path': 'favicon.ico',
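Aside (not part of the diff): nearly every change in this hunk moves a `from .util import …` or `from .config import …` up one package level. These imports are deliberately deferred into the method bodies so the schema module can be imported before config is loaded and without a circular import. A generic, runnable sketch of that lazy-import pattern, with urlparse standing in for the project's own util helpers:

    class Snapshot:
        def __init__(self, url: str):
            self.url = url

        @property
        def domain(self) -> str:
            # imported lazily, at attribute-access time, not at module import time
            from urllib.parse import urlparse
            return urlparse(self.url).netloc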

+ 14 - 4
archivebox/legacy/storage/sql.py → archivebox/index/sql.py

@@ -1,9 +1,9 @@
-__package__ = 'archivebox.legacy.storage'
+__package__ = 'archivebox.index'
 
 from io import StringIO
 from typing import List, Tuple, Iterator
 
-from ..schema import Link
+from .schema import Link
 from ..util import enforce_types
 from ..config import setup_django, OUTPUT_DIR
 
@@ -25,9 +25,19 @@ def write_sql_main_index(links: List[Link], out_dir: str=OUTPUT_DIR) -> None:
     setup_django(out_dir, check_db=True)
     from core.models import Page
 
-    for link in links:
+    all_urls = {link.url: link for link in links}
+
+    for page in Page.objects.all():
+        if page.url in all_urls:
+            info = {k: v for k, v in all_urls.pop(page.url)._asdict().items() if k in Page.keys}
+            Page.objects.update(**info)
+        else:
+            page.delete()
+
+    for url, link in all_urls.items():
         info = {k: v for k, v in link._asdict().items() if k in Page.keys}
-        Page.objects.update_or_create(url=link.url, defaults=info)
+        Page.objects.update_or_create(url=url, defaults=info)
+
 
 
 @enforce_types
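Aside (not part of the diff): the new write_sql_main_index() now syncs the Page table against the in-memory link list in three passes: update rows whose URL is still indexed, delete rows that are not, and create rows for the remaining URLs. Note that `Page.objects.update(**info)` as written appears to update every row in the table rather than just the matched page, since a Django manager-level update applies to the whole queryset. A hedged sketch of the same loop with the update scoped per URL (illustrative only, not the code in this commit):

    def sync_pages(links, Page):
        all_urls = {link.url: link for link in links}

        for page in Page.objects.all():
            if page.url in all_urls:
                info = {k: v for k, v in all_urls.pop(page.url)._asdict().items() if k in Page.keys}
                Page.objects.filter(url=page.url).update(**info)   # scoped to the matching row
            else:
                page.delete()                                       # stale row no longer in the index

        for url, link in all_urls.items():
            info = {k: v for k, v in link._asdict().items() if k in Page.keys}
            Page.objects.update_or_create(url=url, defaults=info)   # remaining new rows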

+ 0 - 58
archivebox/legacy/ArchiveBox.conf

@@ -1,58 +0,0 @@
-# This is the example default configiration file for ArchiveBox.
-# 
-# Copy example config from here into your project's ArchiveBox.conf file,
-# DO NOT EDIT THIS FILE DIRECTLY!
-#
-# See the list of all the possible options. documentation, and examples here:
-#    https://github.com/pirate/ArchiveBox/wiki/Configuration
-
-[GENERAL_CONFIG]
-OUTPUT_PERMISSIONS = 755
-ONLY_NEW = False
-TIMEOUT = 60
-MEDIA_TIMEOUT = 3600
-ACTIVE_THEME = default
-FOOTER_INFO = Content is hosted for personal archiving purposes only. Contact server owner for any takedown requests.
-URL_BLACKLIST = (://(.*\.)?facebook\.com)|(://(.*\.)?ebay\.com)|(.*\.exe$)
-
-[ARCHIVE_METHOD_TOGGLES]
-SAVE_TITLE = True
-SAVE_FAVICON = True
-SAVE_WGET = True
-SAVE_WGET_REQUISITES = True
-SAVE_WARC = True
-SAVE_PDF = True
-SAVE_SCREENSHOT = True
-SAVE_DOM = True
-SAVE_GIT = True
-SAVE_MEDIA = False
-SAVE_ARCHIVE_DOT_ORG = True
-
-
-[ARCHIVE_METHOD_OPTIONS]
-CHECK_SSL_VALIDITY = True
-RESOLUTION = 1440,900
-GIT_DOMAINS = github.com,bitbucket.org,gitlab.com
-
-CROME_HEADLESS = True
-CROME_SANDBOX = True
-
-COOKIES_FILE = path/to/cookies.txt
-CHROME_USER_DATA_DIR = ~/.config/google-chrome/Default
-
-WGET_USER_AGENT = Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.75 Safari/537.36
-CHROME_USER_AGENT = Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.75 Safari/537.36
-
-
-[DEPENDENCY_CONFIG]
-USE_CURL = True
-USE_WGET = True
-USE_CHROME = True
-USE_YOUTUBEDL = True
-USE_GIT = True
-
-CURL_BINARY = curl
-GIT_BINARY = git"
-WGET_BINARY = wget
-YOUTUBEDL_BINARY = youtube-dl
-CHROME_BINARY = chromium
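Aside (not part of the diff): the removed example config is plain INI, so a collection-local override file in this format can be read with Python's standard library. Illustrative only, with section and key names taken from the file above and the parsing code assumed rather than copied from ArchiveBox:

    import configparser

    config = configparser.ConfigParser()
    config.read('ArchiveBox.conf')

    timeout = config.getint('GENERAL_CONFIG', 'TIMEOUT', fallback=60)
    save_media = config.getboolean('ARCHIVE_METHOD_TOGGLES', 'SAVE_MEDIA', fallback=False)
    print(timeout, save_media)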

+ 0 - 1
archivebox/legacy/__init__.py

@@ -1 +0,0 @@
-__package__ = 'archivebox.legacy'

+ 0 - 694
archivebox/legacy/archive_methods.py

@@ -1,694 +0,0 @@
-import os
-
-from typing import Dict, List, Tuple, Optional
-from collections import defaultdict
-from datetime import datetime
-
-from .schema import Link, ArchiveResult, ArchiveOutput
-from .index import (
-    load_link_details,
-    write_link_details,
-    patch_main_index,
-)
-from .config import (
-    CURL_BINARY,
-    GIT_BINARY,
-    WGET_BINARY,
-    YOUTUBEDL_BINARY,
-    SAVE_FAVICON,
-    SAVE_TITLE,
-    SAVE_WGET,
-    SAVE_WGET_REQUISITES,
-    SAVE_PDF,
-    SAVE_SCREENSHOT,
-    SAVE_DOM,
-    SAVE_WARC,
-    SAVE_GIT,
-    SAVE_MEDIA,
-    SAVE_ARCHIVE_DOT_ORG,
-    TIMEOUT,
-    MEDIA_TIMEOUT,
-    GIT_DOMAINS,
-    VERSION,
-    WGET_USER_AGENT,
-    CHECK_SSL_VALIDITY,
-    COOKIES_FILE,
-    CURL_VERSION,
-    WGET_VERSION,
-    CHROME_VERSION,
-    GIT_VERSION,
-    YOUTUBEDL_VERSION,
-    WGET_AUTO_COMPRESSION,
-)
-from .util import (
-    enforce_types,
-    domain,
-    extension,
-    without_query,
-    without_fragment,
-    fetch_page_title,
-    is_static_file,
-    TimedProgress,
-    chmod_file,
-    wget_output_path,
-    chrome_args,
-    run, PIPE, DEVNULL,
-)
-from .logs import (
-    log_link_archiving_started,
-    log_link_archiving_finished,
-    log_archive_method_started,
-    log_archive_method_finished,
-)
-
-
-class ArchiveError(Exception):
-    def __init__(self, message, hints=None):
-        super().__init__(message)
-        self.hints = hints
-
-
-@enforce_types
-def archive_link(link: Link, out_dir: Optional[str]=None) -> Link:
-    """download the DOM, PDF, and a screenshot into a folder named after the link's timestamp"""
-
-    ARCHIVE_METHODS = (
-        ('title', should_save_title, save_title),
-        ('favicon', should_save_favicon, save_favicon),
-        ('wget', should_save_wget, save_wget),
-        ('pdf', should_save_pdf, save_pdf),
-        ('screenshot', should_save_screenshot, save_screenshot),
-        ('dom', should_save_dom, save_dom),
-        ('git', should_save_git, save_git),
-        ('media', should_save_media, save_media),
-        ('archive_org', should_save_archive_dot_org, save_archive_dot_org),
-    )
-    
-    out_dir = out_dir or link.link_dir
-    try:
-        is_new = not os.path.exists(out_dir)
-        if is_new:
-            os.makedirs(out_dir)
-
-        link = load_link_details(link, out_dir=out_dir)
-        log_link_archiving_started(link, out_dir, is_new)
-        link = link.overwrite(updated=datetime.now())
-        stats = {'skipped': 0, 'succeeded': 0, 'failed': 0}
-
-        for method_name, should_run, method_function in ARCHIVE_METHODS:
-            try:
-                if method_name not in link.history:
-                    link.history[method_name] = []
-                
-                if should_run(link, out_dir):
-                    log_archive_method_started(method_name)
-
-                    result = method_function(link=link, out_dir=out_dir)
-
-                    link.history[method_name].append(result)
-
-                    stats[result.status] += 1
-                    log_archive_method_finished(result)
-                else:
-                    stats['skipped'] += 1
-            except Exception as e:
-                raise Exception('Exception in archive_methods.save_{}(Link(url={}))'.format(
-                    method_name,
-                    link.url,
-                )) from e
-
-        # print('    ', stats)
-
-        write_link_details(link, out_dir=link.link_dir)
-        patch_main_index(link)
-        
-        # # If any changes were made, update the main links index json and html
-        # was_changed = stats['succeeded'] or stats['failed']
-        # if was_changed:
-        #     patch_main_index(link)
-
-        log_link_archiving_finished(link, link.link_dir, is_new, stats)
-
-    except KeyboardInterrupt:
-        try:
-            write_link_details(link, out_dir=link.link_dir)
-        except:
-            pass
-        raise
-
-    except Exception as err:
-        print('    ! Failed to archive link: {}: {}'.format(err.__class__.__name__, err))
-        raise
-
-    return link
-
-
-### Archive Method Functions
-
-@enforce_types
-def should_save_title(link: Link, out_dir: Optional[str]=None) -> bool:
-    # if link already has valid title, skip it
-    if link.title and not link.title.lower().startswith('http'):
-        return False
-
-    if is_static_file(link.url):
-        return False
-
-    return SAVE_TITLE
-
-@enforce_types
-def save_title(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOUT) -> ArchiveResult:
-    """try to guess the page's title from its content"""
-
-    output: ArchiveOutput = None
-    cmd = [
-        CURL_BINARY,
-        link.url,
-        '|',
-        'grep',
-        '<title',
-    ]
-    status = 'succeeded'
-    timer = TimedProgress(timeout, prefix='      ')
-    try:
-        output = fetch_page_title(link.url, timeout=timeout, progress=False)
-        if not output:
-            raise ArchiveError('Unable to detect page title')
-    except Exception as err:
-        status = 'failed'
-        output = err
-    finally:
-        timer.end()
-
-    return ArchiveResult(
-        cmd=cmd,
-        pwd=out_dir,
-        cmd_version=CURL_VERSION,
-        output=output,
-        status=status,
-        **timer.stats,
-    )
-
-
-@enforce_types
-def should_save_favicon(link: Link, out_dir: Optional[str]=None) -> bool:
-    out_dir = out_dir or link.link_dir
-    if os.path.exists(os.path.join(out_dir, 'favicon.ico')):
-        return False
-
-    return SAVE_FAVICON
-    
-@enforce_types
-def save_favicon(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOUT) -> ArchiveResult:
-    """download site favicon from google's favicon api"""
-
-    out_dir = out_dir or link.link_dir
-    output: ArchiveOutput = 'favicon.ico'
-    cmd = [
-        CURL_BINARY,
-        '--max-time', str(timeout),
-        '--location',
-        '--output', str(output),
-        *([] if CHECK_SSL_VALIDITY else ['--insecure']),
-        'https://www.google.com/s2/favicons?domain={}'.format(domain(link.url)),
-    ]
-    status = 'succeeded'
-    timer = TimedProgress(timeout, prefix='      ')
-    try:
-        run(cmd, stdout=PIPE, stderr=PIPE, cwd=out_dir, timeout=timeout)
-        chmod_file(output, cwd=out_dir)
-    except Exception as err:
-        status = 'failed'
-        output = err
-    finally:
-        timer.end()
-
-    return ArchiveResult(
-        cmd=cmd,
-        pwd=out_dir,
-        cmd_version=CURL_VERSION,
-        output=output,
-        status=status,
-        **timer.stats,
-    )
-
-@enforce_types
-def should_save_wget(link: Link, out_dir: Optional[str]=None) -> bool:
-    output_path = wget_output_path(link)
-    out_dir = out_dir or link.link_dir
-    if output_path and os.path.exists(os.path.join(out_dir, output_path)):
-        return False
-
-    return SAVE_WGET
-
-
-@enforce_types
-def save_wget(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOUT) -> ArchiveResult:
-    """download full site using wget"""
-
-    out_dir = out_dir or link.link_dir
-    if SAVE_WARC:
-        warc_dir = os.path.join(out_dir, 'warc')
-        os.makedirs(warc_dir, exist_ok=True)
-        warc_path = os.path.join('warc', str(int(datetime.now().timestamp())))
-
-    # WGET CLI Docs: https://www.gnu.org/software/wget/manual/wget.html
-    output: ArchiveOutput = None
-    cmd = [
-        WGET_BINARY,
-        # '--server-response',  # print headers for better error parsing
-        '--no-verbose',
-        '--adjust-extension',
-        '--convert-links',
-        '--force-directories',
-        '--backup-converted',
-        '--span-hosts',
-        '--no-parent',
-        '-e', 'robots=off',
-        '--restrict-file-names=windows',
-        '--timeout={}'.format(timeout),
-        *([] if SAVE_WARC else ['--timestamping']),
-        *(['--warc-file={}'.format(warc_path)] if SAVE_WARC else []),
-        *(['--page-requisites'] if SAVE_WGET_REQUISITES else []),
-        *(['--user-agent={}'.format(WGET_USER_AGENT)] if WGET_USER_AGENT else []),
-        *(['--load-cookies', COOKIES_FILE] if COOKIES_FILE else []),
-        *(['--compression=auto'] if WGET_AUTO_COMPRESSION else []),
-        *([] if CHECK_SSL_VALIDITY else ['--no-check-certificate', '--no-hsts']),
-        link.url,
-    ]
-    status = 'succeeded'
-    timer = TimedProgress(timeout, prefix='      ')
-    try:
-        result = run(cmd, stdout=PIPE, stderr=PIPE, cwd=out_dir, timeout=timeout)
-        output = wget_output_path(link)
-
-        # parse out number of files downloaded from last line of stderr:
-        #  "Downloaded: 76 files, 4.0M in 1.6s (2.52 MB/s)"
-        output_tail = [
-            line.strip()
-            for line in (result.stdout + result.stderr).decode().rsplit('\n', 3)[-3:]
-            if line.strip()
-        ]
-        files_downloaded = (
-            int(output_tail[-1].strip().split(' ', 2)[1] or 0)
-            if 'Downloaded:' in output_tail[-1]
-            else 0
-        )
-
-        # Check for common failure cases
-        if result.returncode > 0 and files_downloaded < 1:
-            hints = (
-                'Got wget response code: {}.'.format(result.returncode),
-                *output_tail,
-            )
-            if b'403: Forbidden' in result.stderr:
-                raise ArchiveError('403 Forbidden (try changing WGET_USER_AGENT)', hints)
-            if b'404: Not Found' in result.stderr:
-                raise ArchiveError('404 Not Found', hints)
-            if b'ERROR 500: Internal Server Error' in result.stderr:
-                raise ArchiveError('500 Internal Server Error', hints)
-            raise ArchiveError('Got an error from the server', hints)
-    except Exception as err:
-        status = 'failed'
-        output = err
-    finally:
-        timer.end()
-
-    return ArchiveResult(
-        cmd=cmd,
-        pwd=out_dir,
-        cmd_version=WGET_VERSION,
-        output=output,
-        status=status,
-        **timer.stats,
-    )
-
-@enforce_types
-def should_save_pdf(link: Link, out_dir: Optional[str]=None) -> bool:
-    out_dir = out_dir or link.link_dir
-    if is_static_file(link.url):
-        return False
-    
-    if os.path.exists(os.path.join(out_dir, 'output.pdf')):
-        return False
-
-    return SAVE_PDF
-
-
-@enforce_types
-def save_pdf(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOUT) -> ArchiveResult:
-    """print PDF of site to file using chrome --headless"""
-
-    out_dir = out_dir or link.link_dir
-    output: ArchiveOutput = 'output.pdf'
-    cmd = [
-        *chrome_args(TIMEOUT=timeout),
-        '--print-to-pdf',
-        link.url,
-    ]
-    status = 'succeeded'
-    timer = TimedProgress(timeout, prefix='      ')
-    try:
-        result = run(cmd, stdout=PIPE, stderr=PIPE, cwd=out_dir, timeout=timeout)
-
-        if result.returncode:
-            hints = (result.stderr or result.stdout).decode()
-            raise ArchiveError('Failed to save PDF', hints)
-        
-        chmod_file('output.pdf', cwd=out_dir)
-    except Exception as err:
-        status = 'failed'
-        output = err
-    finally:
-        timer.end()
-
-    return ArchiveResult(
-        cmd=cmd,
-        pwd=out_dir,
-        cmd_version=CHROME_VERSION,
-        output=output,
-        status=status,
-        **timer.stats,
-    )
-
-@enforce_types
-def should_save_screenshot(link: Link, out_dir: Optional[str]=None) -> bool:
-    out_dir = out_dir or link.link_dir
-    if is_static_file(link.url):
-        return False
-    
-    if os.path.exists(os.path.join(out_dir, 'screenshot.png')):
-        return False
-
-    return SAVE_SCREENSHOT
-
-@enforce_types
-def save_screenshot(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOUT) -> ArchiveResult:
-    """take screenshot of site using chrome --headless"""
-    
-    out_dir = out_dir or link.link_dir
-    output: ArchiveOutput = 'screenshot.png'
-    cmd = [
-        *chrome_args(TIMEOUT=timeout),
-        '--screenshot',
-        link.url,
-    ]
-    status = 'succeeded'
-    timer = TimedProgress(timeout, prefix='      ')
-    try:
-        result = run(cmd, stdout=PIPE, stderr=PIPE, cwd=out_dir, timeout=timeout)
-
-        if result.returncode:
-            hints = (result.stderr or result.stdout).decode()
-            raise ArchiveError('Failed to save screenshot', hints)
-
-        chmod_file(output, cwd=out_dir)
-    except Exception as err:
-        status = 'failed'
-        output = err
-    finally:
-        timer.end()
-
-    return ArchiveResult(
-        cmd=cmd,
-        pwd=out_dir,
-        cmd_version=CHROME_VERSION,
-        output=output,
-        status=status,
-        **timer.stats,
-    )
-
-@enforce_types
-def should_save_dom(link: Link, out_dir: Optional[str]=None) -> bool:
-    out_dir = out_dir or link.link_dir
-    if is_static_file(link.url):
-        return False
-    
-    if os.path.exists(os.path.join(out_dir, 'output.html')):
-        return False
-
-    return SAVE_DOM
-    
-@enforce_types
-def save_dom(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOUT) -> ArchiveResult:
-    """print HTML of site to file using chrome --dump-html"""
-
-    out_dir = out_dir or link.link_dir
-    output: ArchiveOutput = 'output.html'
-    output_path = os.path.join(out_dir, str(output))
-    cmd = [
-        *chrome_args(TIMEOUT=timeout),
-        '--dump-dom',
-        link.url
-    ]
-    status = 'succeeded'
-    timer = TimedProgress(timeout, prefix='      ')
-    try:
-        with open(output_path, 'w+') as f:
-            result = run(cmd, stdout=f, stderr=PIPE, cwd=out_dir, timeout=timeout)
-
-        if result.returncode:
-            hints = result.stderr.decode()
-            raise ArchiveError('Failed to save DOM', hints)
-
-        chmod_file(output, cwd=out_dir)
-    except Exception as err:
-        status = 'failed'
-        output = err
-    finally:
-        timer.end()
-
-    return ArchiveResult(
-        cmd=cmd,
-        pwd=out_dir,
-        cmd_version=CHROME_VERSION,
-        output=output,
-        status=status,
-        **timer.stats,
-    )
-
-@enforce_types
-def should_save_git(link: Link, out_dir: Optional[str]=None) -> bool:
-    out_dir = out_dir or link.link_dir
-    if is_static_file(link.url):
-        return False
-
-    if os.path.exists(os.path.join(out_dir, 'git')):
-        return False
-
-    is_clonable_url = (
-        (domain(link.url) in GIT_DOMAINS)
-        or (extension(link.url) == 'git')
-    )
-    if not is_clonable_url:
-        return False
-
-    return SAVE_GIT
-
-
-@enforce_types
-def save_git(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOUT) -> ArchiveResult:
-    """download full site using git"""
-
-    out_dir = out_dir or link.link_dir
-    output: ArchiveOutput = 'git'
-    output_path = os.path.join(out_dir, str(output))
-    os.makedirs(output_path, exist_ok=True)
-    cmd = [
-        GIT_BINARY,
-        'clone',
-        '--mirror',
-        '--recursive',
-        *([] if CHECK_SSL_VALIDITY else ['-c', 'http.sslVerify=false']),
-        without_query(without_fragment(link.url)),
-    ]
-    status = 'succeeded'
-    timer = TimedProgress(timeout, prefix='      ')
-    try:
-        result = run(cmd, stdout=PIPE, stderr=PIPE, cwd=output_path, timeout=timeout + 1)
-
-        if result.returncode == 128:
-            # ignore failed re-download when the folder already exists
-            pass
-        elif result.returncode > 0:
-            hints = 'Got git response code: {}.'.format(result.returncode)
-            raise ArchiveError('Failed to save git clone', hints)
-
-    except Exception as err:
-        status = 'failed'
-        output = err
-    finally:
-        timer.end()
-
-    return ArchiveResult(
-        cmd=cmd,
-        pwd=out_dir,
-        cmd_version=GIT_VERSION,
-        output=output,
-        status=status,
-        **timer.stats,
-    )
-
-
-@enforce_types
-def should_save_media(link: Link, out_dir: Optional[str]=None) -> bool:
-    out_dir = out_dir or link.link_dir
-
-    if is_static_file(link.url):
-        return False
-
-    if os.path.exists(os.path.join(out_dir, 'media')):
-        return False
-
-    return SAVE_MEDIA
-
-@enforce_types
-def save_media(link: Link, out_dir: Optional[str]=None, timeout: int=MEDIA_TIMEOUT) -> ArchiveResult:
-    """Download playlists or individual video, audio, and subtitles using youtube-dl"""
-
-    out_dir = out_dir or link.link_dir
-    output: ArchiveOutput = 'media'
-    output_path = os.path.join(out_dir, str(output))
-    os.makedirs(output_path, exist_ok=True)
-    cmd = [
-        YOUTUBEDL_BINARY,
-        '--write-description',
-        '--write-info-json',
-        '--write-annotations',
-        '--yes-playlist',
-        '--write-thumbnail',
-        '--no-call-home',
-        '--no-check-certificate',
-        '--user-agent',
-        '--all-subs',
-        '--extract-audio',
-        '--keep-video',
-        '--ignore-errors',
-        '--geo-bypass',
-        '--audio-format', 'mp3',
-        '--audio-quality', '320K',
-        '--embed-thumbnail',
-        '--add-metadata',
-        *([] if CHECK_SSL_VALIDITY else ['--no-check-certificate']),
-        link.url,
-    ]
-    status = 'succeeded'
-    timer = TimedProgress(timeout, prefix='      ')
-    try:
-        result = run(cmd, stdout=PIPE, stderr=PIPE, cwd=output_path, timeout=timeout + 1)
-        chmod_file(output, cwd=out_dir)
-        if result.returncode:
-            if (b'ERROR: Unsupported URL' in result.stderr
-                or b'HTTP Error 404' in result.stderr
-                or b'HTTP Error 403' in result.stderr
-                or b'URL could be a direct video link' in result.stderr
-                or b'Unable to extract container ID' in result.stderr):
-                # These happen too frequently on non-media pages to warrant printing to console
-                pass
-            else:
-                hints = (
-                    'Got youtube-dl response code: {}.'.format(result.returncode),
-                    *result.stderr.decode().split('\n'),
-                )
-                raise ArchiveError('Failed to save media', hints)
-    except Exception as err:
-        status = 'failed'
-        output = err
-    finally:
-        timer.end()
-
-    return ArchiveResult(
-        cmd=cmd,
-        pwd=out_dir,
-        cmd_version=YOUTUBEDL_VERSION,
-        output=output,
-        status=status,
-        **timer.stats,
-    )
-
-
-@enforce_types
-def should_save_archive_dot_org(link: Link, out_dir: Optional[str]=None) -> bool:
-    out_dir = out_dir or link.link_dir
-    if is_static_file(link.url):
-        return False
-
-    if os.path.exists(os.path.join(out_dir, 'archive.org.txt')):
-        # if open(path, 'r').read().strip() != 'None':
-        return False
-
-    return SAVE_ARCHIVE_DOT_ORG
-
-@enforce_types
-def save_archive_dot_org(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOUT) -> ArchiveResult:
-    """submit site to archive.org for archiving via their service, save returned archive url"""
-
-    out_dir = out_dir or link.link_dir
-    output: ArchiveOutput = 'archive.org.txt'
-    archive_org_url = None
-    submit_url = 'https://web.archive.org/save/{}'.format(link.url)
-    cmd = [
-        CURL_BINARY,
-        '--location',
-        '--head',
-        '--user-agent', 'ArchiveBox/{} (+https://github.com/pirate/ArchiveBox/)'.format(VERSION),  # be nice to the Archive.org people and show them where all this ArchiveBox traffic is coming from
-        '--max-time', str(timeout),
-        *([] if CHECK_SSL_VALIDITY else ['--insecure']),
-        submit_url,
-    ]
-    status = 'succeeded'
-    timer = TimedProgress(timeout, prefix='      ')
-    try:
-        result = run(cmd, stdout=PIPE, stderr=DEVNULL, cwd=out_dir, timeout=timeout)
-        content_location, errors = parse_archive_dot_org_response(result.stdout)
-        if content_location:
-            archive_org_url = 'https://web.archive.org{}'.format(content_location[0])
-        elif len(errors) == 1 and 'RobotAccessControlException' in errors[0]:
-            archive_org_url = None
-            # raise ArchiveError('Archive.org denied by {}/robots.txt'.format(domain(link.url)))
-        elif errors:
-            raise ArchiveError(', '.join(errors))
-        else:
-            raise ArchiveError('Failed to find "content-location" URL header in Archive.org response.')
-    except Exception as err:
-        status = 'failed'
-        output = err
-    finally:
-        timer.end()
-
-    if output and not isinstance(output, Exception):
-        # instead of writing None when archive.org rejects the url write the
-        # url to resubmit it to archive.org. This is so when the user visits
-        # the URL in person, it will attempt to re-archive it, and it'll show the
-        # nicer error message explaining why the url was rejected if it fails.
-        archive_org_url = archive_org_url or submit_url
-        with open(os.path.join(out_dir, str(output)), 'w', encoding='utf-8') as f:
-            f.write(archive_org_url)
-        chmod_file('archive.org.txt', cwd=out_dir)
-        output = archive_org_url
-
-    return ArchiveResult(
-        cmd=cmd,
-        pwd=out_dir,
-        cmd_version=CURL_VERSION,
-        output=output,
-        status=status,
-        **timer.stats,
-    )
-
-@enforce_types
-def parse_archive_dot_org_response(response: bytes) -> Tuple[List[str], List[str]]:
-    # Parse archive.org response headers
-    headers: Dict[str, List[str]] = defaultdict(list)
-
-    # lowercase all the header names and store in dict
-    for header in response.splitlines():
-        if b':' not in header or not header.strip():
-            continue
-        name, val = header.decode().split(':', 1)
-        headers[name.lower().strip()].append(val.strip())
-
-    # Get successful archive url in "content-location" header or any errors
-    content_location = headers['content-location']
-    errors = headers['x-archive-wayback-runtime-error']
-    return content_location, errors
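Aside (not part of the diff): judging by the file list for this commit, the archive-method pairs deleted above (a should_save_x() guard plus a save_x() runner) are reorganized into per-method modules under archivebox/extractors/. The driving idea is the dispatch loop from archive_link() above; a condensed, hypothetical sketch of that pattern, not the literal replacement code:

    from typing import Callable, Dict, List, Tuple

    def run_archive_methods(link, out_dir: str, methods: List[Tuple[str, Callable, Callable]]) -> Dict[str, int]:
        # methods is a list of (name, should_save_x, save_x) tuples like ARCHIVE_METHODS above
        stats = {'skipped': 0, 'succeeded': 0, 'failed': 0}
        for name, should_run, run_method in methods:
            link.history.setdefault(name, [])
            if not should_run(link, out_dir):
                stats['skipped'] += 1
                continue
            result = run_method(link=link, out_dir=out_dir)   # returns an ArchiveResult with a .status
            link.history[name].append(result)
            stats[result.status] += 1                         # 'succeeded' or 'failed'
        return stats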

+ 0 - 626
archivebox/legacy/main.py

@@ -1,626 +0,0 @@
-import os
-import re
-import shutil
-
-from typing import Dict, List, Optional, Iterable
-from itertools import chain
-
-from .schema import Link
-from .util import (
-    enforce_types,
-    TimedProgress,
-    get_dir_size,
-    human_readable_size,
-)
-from .index import (
-    links_after_timestamp,
-    load_main_index,
-    import_new_links,
-    write_main_index,
-)
-from .storage.json import (
-    parse_json_main_index,
-    parse_json_link_details,
-    parse_json_links_details,
-)
-from .storage.sql import parse_sql_main_index, get_admins
-from .storage.html import parse_html_main_index
-from .archive_methods import archive_link
-from .config import (
-    stderr,
-    ANSI,
-    ONLY_NEW,
-    OUTPUT_DIR,
-    SOURCES_DIR,
-    ARCHIVE_DIR,
-    LOGS_DIR,
-    CONFIG_FILE,
-    ARCHIVE_DIR_NAME,
-    SOURCES_DIR_NAME,
-    LOGS_DIR_NAME,
-    STATIC_DIR_NAME,
-    JSON_INDEX_FILENAME,
-    HTML_INDEX_FILENAME,
-    SQL_INDEX_FILENAME,
-    ROBOTS_TXT_FILENAME,
-    FAVICON_FILENAME,
-    check_dependencies,
-    check_data_folder,
-    setup_django,
-    write_config_file,
-)
-from .logs import (
-    log_archiving_started,
-    log_archiving_paused,
-    log_archiving_finished,
-    log_removal_started,
-    log_removal_finished,
-    log_list_started,
-    log_list_finished,
-)
-
-
-ALLOWED_IN_OUTPUT_DIR = {
-    '.DS_Store',
-    '.venv',
-    'venv',
-    'virtualenv',
-    '.virtualenv',
-    ARCHIVE_DIR_NAME,
-    SOURCES_DIR_NAME,
-    LOGS_DIR_NAME,
-    STATIC_DIR_NAME,
-    SQL_INDEX_FILENAME,
-    JSON_INDEX_FILENAME,
-    HTML_INDEX_FILENAME,
-    ROBOTS_TXT_FILENAME,
-    FAVICON_FILENAME,
-}
-
-
-@enforce_types
-def init():
-    os.makedirs(OUTPUT_DIR, exist_ok=True)
-
-    is_empty = not len(set(os.listdir(OUTPUT_DIR)) - ALLOWED_IN_OUTPUT_DIR)
-    existing_index = os.path.exists(os.path.join(OUTPUT_DIR, JSON_INDEX_FILENAME))
-
-    if is_empty and not existing_index:
-        print('{green}[+] Initializing a new ArchiveBox collection in this folder...{reset}'.format(**ANSI))
-        print(f'    {OUTPUT_DIR}')
-        print('{green}------------------------------------------------------------------{reset}'.format(**ANSI))
-    elif existing_index:
-        print('{green}[*] Updating existing ArchiveBox collection in this folder...{reset}'.format(**ANSI))
-        print(f'    {OUTPUT_DIR}')
-        print('{green}------------------------------------------------------------------{reset}'.format(**ANSI))
-    else:
-        stderr(
-            ("{red}[X] This folder appears to already have files in it, but no index.json is present.{reset}\n\n"
-            "    You must run init in a completely empty directory, or an existing data folder.\n\n"
-            "    {lightred}Hint:{reset} To import an existing data folder make sure to cd into the folder first, \n"
-            "    then run and run 'archivebox init' to pick up where you left off.\n\n"
-            "    (Always make sure your data folder is backed up first before updating ArchiveBox)"
-            ).format(OUTPUT_DIR, **ANSI)
-        )
-        raise SystemExit(1)
-
-    if existing_index:
-        print('\n{green}[*] Verifying archive folder structure...{reset}'.format(**ANSI))
-    else:
-        print('\n{green}[+] Building archive folder structure...{reset}'.format(**ANSI))
-    
-    os.makedirs(SOURCES_DIR, exist_ok=True)
-    print(f'    √ {SOURCES_DIR}')
-    
-    os.makedirs(ARCHIVE_DIR, exist_ok=True)
-    print(f'    √ {ARCHIVE_DIR}')
-
-    os.makedirs(LOGS_DIR, exist_ok=True)
-    print(f'    √ {LOGS_DIR}')
-
-    write_config_file({}, out_dir=OUTPUT_DIR)
-    print(f'    √ {CONFIG_FILE}')
-    
-    if os.path.exists(os.path.join(OUTPUT_DIR, SQL_INDEX_FILENAME)):
-        print('\n{green}[*] Verifying main SQL index and running migrations...{reset}'.format(**ANSI))
-    else:
-        print('\n{green}[+] Building main SQL index and running migrations...{reset}'.format(**ANSI))
-    
-    setup_django(OUTPUT_DIR, check_db=False)
-    from django.conf import settings
-    assert settings.DATABASE_FILE == os.path.join(OUTPUT_DIR, SQL_INDEX_FILENAME)
-    print(f'    √ {settings.DATABASE_FILE}')
-    print()
-    from .storage.sql import apply_migrations
-    for migration_line in apply_migrations(OUTPUT_DIR):
-        print(f'    {migration_line}')
-
-
-    assert os.path.exists(settings.DATABASE_FILE)
-    
-    # from django.contrib.auth.models import User
-    # if IS_TTY and not User.objects.filter(is_superuser=True).exists():
-    #     print('{green}[+] Creating admin user account...{reset}'.format(**ANSI))
-    #     call_command("createsuperuser", interactive=True)
-
-    print()
-    print('{green}[*] Collecting links from any existing index or archive folders...{reset}'.format(**ANSI))
-
-    all_links = {}
-    if existing_index:
-        all_links = {
-            link.url: link
-            for link in load_main_index(out_dir=OUTPUT_DIR, warn=False)
-        }
-        print('    √ Loaded {} links from existing main index...'.format(len(all_links)))
-
-    orphaned_json_links = {
-        link.url: link
-        for link in parse_json_main_index(OUTPUT_DIR)
-        if link.url not in all_links
-    }
-    if orphaned_json_links:
-        all_links.update(orphaned_json_links)
-        print('    {lightyellow}√ Added {} orphaned links from existing JSON index...{reset}'.format(len(orphaned_json_links), **ANSI))
-
-    orphaned_sql_links = {
-        link.url: link
-        for link in parse_sql_main_index(OUTPUT_DIR)
-        if link.url not in all_links
-    }
-    if orphaned_sql_links:
-        all_links.update(orphaned_sql_links)
-        print('    {lightyellow}√ Added {} orphaned links from existing SQL index...{reset}'.format(len(orphaned_sql_links), **ANSI))
-
-    orphaned_data_dir_links = {
-        link.url: link
-        for link in parse_json_links_details(OUTPUT_DIR)
-    }
-    orphan_new_links = {
-        url: link
-        for url, link in orphaned_data_dir_links.items()
-        if url not in all_links
-    }
-    orphan_duplicates = {
-        url: link
-        for url, link in orphaned_data_dir_links.items()
-        if url in all_links
-    }
-    if orphan_new_links:
-        all_links.update(orphan_new_links)
-        print('    {lightyellow}√ Added {} orphaned links from existing archive directories...{reset}'.format(len(orphan_new_links), **ANSI))
-    if orphan_duplicates:
-        print('    {lightyellow}! Skipped adding {} invalid link data directories that would have overwritten or corrupted existing data.{reset}'.format(len(orphan_duplicates), **ANSI))
-
-    orphaned_data_dirs = {folder for folder in orphan_duplicates.keys()}
-    invalid_folders = {
-        folder: link
-        for folder, link in get_invalid_folders(all_links.values(), out_dir=OUTPUT_DIR).items()
-        if folder not in orphaned_data_dirs
-    }
-    if invalid_folders:
-        print('    {lightyellow}! Skipped adding {} corrupted/unrecognized link data directories that could not be read.{reset}'.format(len(orphan_duplicates), **ANSI))
-        
-    if orphan_duplicates or invalid_folders:
-        print('        For more information about the link data directories that were skipped, run:')
-        print('            archivebox info')
-        print('            archivebox list --status=invalid')
-        print('            archivebox list --status=orphaned')
-        print('            archivebox list --status=duplicate')
-
-
-    write_main_index(list(all_links.values()), out_dir=OUTPUT_DIR)
-
-    print('\n{green}------------------------------------------------------------------{reset}'.format(**ANSI))
-    if existing_index:
-        print('{green}[√] Done. Verified and updated the existing ArchiveBox collection.{reset}'.format(**ANSI))
-    else:
-        print('{green}[√] Done. A new ArchiveBox collection was initialized ({} links).{reset}'.format(len(all_links), **ANSI))
-    print()
-    print('    To view your archive index, open:')
-    print('        {}'.format(os.path.join(OUTPUT_DIR, HTML_INDEX_FILENAME)))
-    print()
-    print('    To add new links, you can run:')
-    print("        archivebox add 'https://example.com'")
-    print()
-    print('    For more usage and examples, run:')
-    print('        archivebox help')
-
-
-@enforce_types
-def info():
-
-    print('{green}[*] Scanning archive collection main index...{reset}'.format(**ANSI))
-    print(f'    {OUTPUT_DIR}/*')
-    num_bytes, num_dirs, num_files = get_dir_size(OUTPUT_DIR, recursive=False, pattern='index.')
-    size = human_readable_size(num_bytes)
-    print(f'    Size: {size} across {num_files} files')
-    print()
-
-    links = list(load_main_index(out_dir=OUTPUT_DIR))
-    num_json_links = len(links)
-    num_sql_links = sum(1 for link in parse_sql_main_index(out_dir=OUTPUT_DIR))
-    num_html_links = sum(1 for url in parse_html_main_index(out_dir=OUTPUT_DIR))
-    num_link_details = sum(1 for link in parse_json_links_details(out_dir=OUTPUT_DIR))
-    users = get_admins().values_list('username', flat=True)
-    print(f'    > JSON Main Index: {num_json_links} links'.ljust(36),  f'(found in {JSON_INDEX_FILENAME})')
-    print(f'    > SQL Main Index: {num_sql_links} links'.ljust(36), f'(found in {SQL_INDEX_FILENAME})')
-    print(f'    > HTML Main Index: {num_html_links} links'.ljust(36), f'(found in {HTML_INDEX_FILENAME})')
-    print(f'    > JSON Link Details: {num_link_details} links'.ljust(36), f'(found in {ARCHIVE_DIR_NAME}/*/index.json)')
-
-    print(f'    > Admin: {len(users)} users {", ".join(users)}'.ljust(36), f'(found in {SQL_INDEX_FILENAME})')
-    
-    if num_html_links != len(links) or num_sql_links != len(links):
-        print()
-        print('    {lightred}Hint:{reset} You can fix index count differences automatically by running:'.format(**ANSI))
-        print('        archivebox init')
-    
-    if not users:
-        print()
-        print('    {lightred}Hint:{reset} You can create an admin user by running:'.format(**ANSI))
-        print('        archivebox manage createsuperuser')
-
-    print()
-    print('{green}[*] Scanning archive collection link data directories...{reset}'.format(**ANSI))
-    print(f'    {ARCHIVE_DIR}/*')
-
-    num_bytes, num_dirs, num_files = get_dir_size(ARCHIVE_DIR)
-    size = human_readable_size(num_bytes)
-    print(f'    Size: {size} across {num_files} files in {num_dirs} directories')
-    print()
-
-    num_indexed = len(get_indexed_folders(links, out_dir=OUTPUT_DIR))
-    num_archived = len(get_archived_folders(links, out_dir=OUTPUT_DIR))
-    num_unarchived = len(get_unarchived_folders(links, out_dir=OUTPUT_DIR))
-    print(f'    > indexed: {num_indexed}'.ljust(36), f'({get_indexed_folders.__doc__})')
-    print(f'      > archived: {num_archived}'.ljust(36), f'({get_archived_folders.__doc__})')
-    print(f'      > unarchived: {num_unarchived}'.ljust(36), f'({get_unarchived_folders.__doc__})')
-    
-    num_present = len(get_present_folders(links, out_dir=OUTPUT_DIR))
-    num_valid = len(get_valid_folders(links, out_dir=OUTPUT_DIR))
-    print()
-    print(f'    > present: {num_present}'.ljust(36), f'({get_present_folders.__doc__})')
-    print(f'      > valid: {num_valid}'.ljust(36), f'({get_valid_folders.__doc__})')
-    
-    duplicate = get_duplicate_folders(links, out_dir=OUTPUT_DIR)
-    orphaned = get_orphaned_folders(links, out_dir=OUTPUT_DIR)
-    corrupted = get_corrupted_folders(links, out_dir=OUTPUT_DIR)
-    unrecognized = get_unrecognized_folders(links, out_dir=OUTPUT_DIR)
-    num_invalid = len({**duplicate, **orphaned, **corrupted, **unrecognized})
-    print(f'      > invalid: {num_invalid}'.ljust(36), f'({get_invalid_folders.__doc__})')
-    print(f'        > duplicate: {len(duplicate)}'.ljust(36), f'({get_duplicate_folders.__doc__})')
-    print(f'        > orphaned: {len(orphaned)}'.ljust(36), f'({get_orphaned_folders.__doc__})')
-    print(f'        > corrupted: {len(corrupted)}'.ljust(36), f'({get_corrupted_folders.__doc__})')
-    print(f'        > unrecognized: {len(unrecognized)}'.ljust(36), f'({get_unrecognized_folders.__doc__})')
-    
-    if num_indexed:
-        print()
-        print('    {lightred}Hint:{reset} You can list link data directories by status like so:'.format(**ANSI))
-        print('        archivebox list --status=<status>  (e.g. indexed, corrupted, archived, etc.)')
-
-    if orphaned:
-        print()
-        print('    {lightred}Hint:{reset} To automatically import orphaned data directories into the main index, run:'.format(**ANSI))
-        print('        archivebox init')
-
-    if num_invalid:
-        print()
-        print('    {lightred}Hint:{reset} You may need to manually remove or fix some invalid data directories, afterwards make sure to run:'.format(**ANSI))
-        print('        archivebox init')
-    
-    print()
-
-
-
-@enforce_types
-def update_archive_data(import_path: Optional[str]=None, 
-                        resume: Optional[float]=None,
-                        only_new: bool=False,
-                        index_only: bool=False) -> List[Link]:
-    """The main ArchiveBox entrancepoint. Everything starts here."""
-
-    check_dependencies()
-    check_data_folder()
-
-    # Step 1: Load list of links from the existing index
-    #         merge in and dedupe new links from import_path
-    all_links: List[Link] = []
-    new_links: List[Link] = []
-    all_links = load_main_index(out_dir=OUTPUT_DIR)
-    if import_path:
-        all_links, new_links = import_new_links(all_links, import_path)
-
-    # Step 2: Write updated index with deduped old and new links back to disk
-    write_main_index(links=list(all_links), out_dir=OUTPUT_DIR)
-
-    if index_only:
-        return all_links
-        
-    # Step 3: Run the archive methods for each link
-    links = new_links if ONLY_NEW else all_links
-    log_archiving_started(len(links), resume)
-    idx: int = 0
-    link: Link = None                                             # type: ignore
-    try:
-        for idx, link in enumerate(links_after_timestamp(links, resume)):
-            archive_link(link, out_dir=link.link_dir)
-
-    except KeyboardInterrupt:
-        log_archiving_paused(len(links), idx, link.timestamp if link else '0')
-        raise SystemExit(0)
-
-    except:
-        print()
-        raise    
-
-    log_archiving_finished(len(links))
-
-    # Step 4: Re-write links index with updated titles, icons, and resources
-    all_links = load_main_index(out_dir=OUTPUT_DIR)
-    write_main_index(links=list(all_links), out_dir=OUTPUT_DIR, finished=True)
-    return all_links
-
-
-LINK_FILTERS = {
-    'exact': lambda link, pattern: (link.url == pattern) or (link.base_url == pattern),
-    'substring': lambda link, pattern: pattern in link.url,
-    'regex': lambda link, pattern: bool(re.match(pattern, link.url)),
-    'domain': lambda link, pattern: link.domain == pattern,
-}
-
-@enforce_types
-def link_matches_filter(link: Link, filter_patterns: List[str], filter_type: str='exact') -> bool:
-    for pattern in filter_patterns:
-        if LINK_FILTERS[filter_type](link, pattern):
-            return True
-
-    return False
-
-
-@enforce_types
-def list_archive_data(filter_patterns: Optional[List[str]]=None, filter_type: str='exact',
-                      after: Optional[float]=None, before: Optional[float]=None) -> Iterable[Link]:
-    
-    all_links = load_main_index(out_dir=OUTPUT_DIR)
-
-    for link in all_links:
-        if after is not None and float(link.timestamp) < after:
-            continue
-        if before is not None and float(link.timestamp) > before:
-            continue
-        
-        if filter_patterns:
-            if link_matches_filter(link, filter_patterns, filter_type):
-                yield link
-        else:
-            yield link
-
-
-@enforce_types
-def remove_archive_links(filter_patterns: List[str], filter_type: str='exact',
-                         after: Optional[float]=None, before: Optional[float]=None,
-                         yes: bool=False, delete: bool=False) -> List[Link]:
-    
-    check_dependencies()
-    check_data_folder()
-
-    log_list_started(filter_patterns, filter_type)
-    timer = TimedProgress(360, prefix='      ')
-    try:
-        links = list(list_archive_data(
-            filter_patterns=filter_patterns,
-            filter_type=filter_type,
-            after=after,
-            before=before,
-        ))
-    finally:
-        timer.end()
-
-    if not len(links):
-        log_removal_finished(0, 0)
-        raise SystemExit(1)
-
-
-    log_list_finished(links)
-    log_removal_started(links, yes=yes, delete=delete)
-
-    timer = TimedProgress(360, prefix='      ')
-    try:
-        to_keep = []
-        all_links = load_main_index(out_dir=OUTPUT_DIR)
-        for link in all_links:
-            should_remove = (
-                (after is not None and float(link.timestamp) < after)
-                or (before is not None and float(link.timestamp) > before)
-                or link_matches_filter(link, filter_patterns, filter_type)
-            )
-            if not should_remove:
-                to_keep.append(link)
-            elif should_remove and delete:
-                shutil.rmtree(link.link_dir)
-    finally:
-        timer.end()
-
-    write_main_index(links=to_keep, out_dir=OUTPUT_DIR, finished=True)
-    log_removal_finished(len(all_links), len(to_keep))
-    
-    return to_keep
-
-
-
-def get_indexed_folders(links, out_dir: str=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
-    """indexed links without checking archive status or data directory validity"""
-    return {
-        link.link_dir: link
-        for link in links
-    }
-
-def get_archived_folders(links, out_dir: str=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
-    """indexed links that are archived with a valid data directory"""
-    return {
-        link.link_dir: link
-        for link in filter(is_archived, links)
-    }
-
-def get_unarchived_folders(links, out_dir: str=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
-    """indexed links that are unarchived with no data directory or an empty data directory"""
-    return {
-        link.link_dir: link
-        for link in filter(is_unarchived, links)
-    }
-
-def get_present_folders(links, out_dir: str=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
-    """dirs that are expected to exist based on the main index"""
-    all_folders = {}
-
-    for entry in os.scandir(os.path.join(out_dir, ARCHIVE_DIR_NAME)):
-        if entry.is_dir(follow_symlinks=True):
-            link = None
-            try:
-                link = parse_json_link_details(entry.path)
-            except Exception:
-                pass
-
-            all_folders[entry.path] = link
-
-    return all_folders
-
-def get_valid_folders(links, out_dir: str=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
-    """dirs with a valid index matched to the main index and archived content"""
-    return {
-        link.link_dir: link
-        for link in filter(is_valid, links)
-    }
-
-def get_invalid_folders(links, out_dir: str=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
-    """dirs that are invalid for any reason: corrupted/duplicate/orphaned/unrecognized"""
-    duplicate = get_duplicate_folders(links, out_dir=OUTPUT_DIR)
-    orphaned = get_orphaned_folders(links, out_dir=OUTPUT_DIR)
-    corrupted = get_corrupted_folders(links, out_dir=OUTPUT_DIR)
-    unrecognized = get_unrecognized_folders(links, out_dir=OUTPUT_DIR)
-    return {**duplicate, **orphaned, **corrupted, **unrecognized}
-
-
-def get_duplicate_folders(links, out_dir: str=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
-    """dirs that conflict with other directories that have the same link URL or timestamp"""
-    links = list(links)
-    by_url = {link.url: 0 for link in links}
-    by_timestamp = {link.timestamp: 0 for link in links}
-
-    duplicate_folders = {}
-
-    indexed_folders = {link.link_dir for link in links}
-    data_folders = (
-        entry.path
-        for entry in os.scandir(os.path.join(out_dir, ARCHIVE_DIR_NAME))
-        if entry.is_dir(follow_symlinks=True) and entry.path not in indexed_folders
-    )
-
-    for path in chain(sorted(indexed_folders), sorted(data_folders)):
-        link = None
-        try:
-            link = parse_json_link_details(path)
-        except Exception:
-            pass
-
-        if link:
-            # link folder has same timestamp as different link folder
-            by_timestamp[link.timestamp] = by_timestamp.get(link.timestamp, 0) + 1
-            if by_timestamp[link.timestamp] > 1:
-                duplicate_folders[path] = link
-
-            # link folder has same url as different link folder
-            by_url[link.url] = by_url.get(link.url, 0) + 1
-            if by_url[link.url] > 1:
-                duplicate_folders[path] = link
-
-    return duplicate_folders
-
-def get_orphaned_folders(links, out_dir: str=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
-    """dirs that contain a valid index but aren't listed in the main index"""
-    links = list(links)
-    indexed_folders = {link.link_dir: link for link in links}
-    orphaned_folders = {}
-
-    for entry in os.scandir(os.path.join(out_dir, ARCHIVE_DIR_NAME)):
-        if entry.is_dir(follow_symlinks=True):
-            index_exists = os.path.exists(os.path.join(entry.path, 'index.json'))
-            link = None
-            try:
-                link = parse_json_link_details(entry.path)
-            except Exception:
-                pass
-
-            if index_exists and entry.path not in indexed_folders:
-                # folder is a valid link data dir with index details, but it's not in the main index
-                orphaned_folders[entry.path] = link
-
-    return orphaned_folders
-
-def get_corrupted_folders(links, out_dir: str=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
-    """dirs that don't contain a valid index and aren't listed in the main index"""
-    return {
-        link.link_dir: link
-        for link in filter(is_corrupt, links)
-    }
-
-def get_unrecognized_folders(links, out_dir: str=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
-    """dirs that don't contain recognizable archive data and aren't listed in the main index"""
-    by_timestamp = {link.timestamp: 0 for link in links}
-    unrecognized_folders: Dict[str, Optional[Link]] = {}
-
-    for entry in os.scandir(os.path.join(out_dir, ARCHIVE_DIR_NAME)):
-        if entry.is_dir(follow_symlinks=True):
-            index_exists = os.path.exists(os.path.join(entry.path, 'index.json'))
-            link = None
-            try:
-                link = parse_json_link_details(entry.path)
-            except Exception:
-                pass
-
-            if index_exists and link is None:
-                # index exists but it's corrupted or unparseable
-                unrecognized_folders[entry.path] = link
-            
-            elif not index_exists:
-                # link details index doesn't exist and the folder isn't in the main index
-                timestamp = entry.path.rsplit('/', 1)[-1]
-                if timestamp not in by_timestamp:
-                    unrecognized_folders[entry.path] = link
-
-    return unrecognized_folders
-
-
-def is_valid(link: Link) -> bool:
-    dir_exists = os.path.exists(link.link_dir)
-    index_exists = os.path.exists(os.path.join(link.link_dir, 'index.json'))
-    if not dir_exists:
-        # unarchived links are not included in the valid list
-        return False
-    if dir_exists and not index_exists:
-        return False
-    if dir_exists and index_exists:
-        try:
-            parsed_link = parse_json_link_details(link.link_dir)
-            return link.url == parsed_link.url
-        except Exception:
-            pass
-    return False
-
-def is_corrupt(link: Link) -> bool:
-    if not os.path.exists(link.link_dir):
-        # unarchived links are not considered corrupt
-        return False
-
-    if is_valid(link):
-        return False
-
-    return True
-
-def is_archived(link: Link) -> bool:
-    return is_valid(link) and link.is_archived
-    
-def is_unarchived(link: Link) -> bool:
-    if not os.path.exists(link.link_dir):
-        return True
-    return not link.is_archived
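
The folder-status helpers removed above are re-imported from the new index package by archivebox/main.py further down in this diff. As a rough usage sketch only (the setup and import path are assumptions based on the imports shown below, not part of the commit):

    # hypothetical sketch, not part of this commit
    from archivebox.index import load_main_index, get_archived_folders, get_invalid_folders
    from archivebox.config import OUTPUT_DIR

    links = list(load_main_index(out_dir=OUTPUT_DIR))
    archived = get_archived_folders(links, out_dir=OUTPUT_DIR)   # indexed links with a valid data dir
    invalid = get_invalid_folders(links, out_dir=OUTPUT_DIR)     # duplicate/orphaned/corrupted/unrecognized
    print(f'{len(archived)} archived, {len(invalid)} invalid folders')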

+ 0 - 10
archivebox/legacy/mypy_django.ini

@@ -1,10 +0,0 @@
-[mypy_django_plugin]
-
-# specify settings module to use for django.conf.settings, this setting
-# could also be specified with DJANGO_SETTINGS_MODULE environment variable
-# (it also takes priority over config file)
-django_settings = core.settings
-
-# if True, all unknown settings in django.conf.settings will fallback to Any,
-# specify it if your settings are loaded dynamically to avoid false positives
-ignore_missing_settings = True

+ 0 - 331
archivebox/legacy/parse.py

@@ -1,331 +0,0 @@
-"""
-Everything related to parsing links from input sources.
-
-For a list of supported services, see the README.md.
-For examples of supported import formats see tests/.
-
-Link: {
-    'url': 'https://example.com/example/?abc=123&xyc=345#lmnop',
-    'timestamp': '1544212312.4234',
-    'title': 'Example.com Page Title',
-    'tags': 'abc,def',
-    'sources': [
-        'output/sources/ril_export.html',
-        'output/sources/getpocket.com-1523422111.txt',
-        'output/sources/stdin-234234112312.txt'
-    ]
-}
-"""
-
-import re
-import json
-
-from typing import Tuple, List, IO, Iterable
-from datetime import datetime
-import xml.etree.ElementTree as etree
-
-from .config import TIMEOUT
-from .util import (
-    htmldecode,
-    str_between,
-    URL_REGEX,
-    check_url_parsing_invariants,
-    TimedProgress,
-    Link,
-    enforce_types,
-)
-
-
-@enforce_types
-def parse_links(source_file: str) -> Tuple[List[Link], str]:
-    """parse a list of URLs with their metadata from an 
-       RSS feed, bookmarks export, or text file
-    """
-
-    check_url_parsing_invariants()
-    PARSERS = (
-        # Specialized parsers
-        ('Pocket HTML', parse_pocket_html_export),
-        ('Pinboard RSS', parse_pinboard_rss_export),
-        ('Shaarli RSS', parse_shaarli_rss_export),
-        ('Medium RSS', parse_medium_rss_export),
-        
-        # General parsers
-        ('Netscape HTML', parse_netscape_html_export),
-        ('Generic RSS', parse_rss_export),
-        ('Generic JSON', parse_json_export),
-
-        # Fallback parser
-        ('Plain Text', parse_plain_text_export),
-    )
-    timer = TimedProgress(TIMEOUT * 4)
-    with open(source_file, 'r', encoding='utf-8') as file:
-        for parser_name, parser_func in PARSERS:
-            try:
-                links = list(parser_func(file))
-                if links:
-                    timer.end()
-                    return links, parser_name
-            except Exception as err:   # noqa
-                # Parsers are tried one by one down the list, and the first one
-                # that succeeds is used. To see why a certain parser was not used
-                # due to error or format incompatibility, uncomment this line:
-                # print('[!] Parser {} failed: {} {}'.format(parser_name, err.__class__.__name__, err))
-                pass
-
-    timer.end()
-    return [], 'Failed to parse'
-
-
-### Import Parser Functions
-
-@enforce_types
-def parse_pocket_html_export(html_file: IO[str]) -> Iterable[Link]:
-    """Parse Pocket-format bookmarks export files (produced by getpocket.com/export/)"""
-
-    html_file.seek(0)
-    pattern = re.compile("^\\s*<li><a href=\"(.+)\" time_added=\"(\\d+)\" tags=\"(.*)\">(.+)</a></li>", re.UNICODE)
-    for line in html_file:
-        # example line
-        # <li><a href="http://example.com/ time_added="1478739709" tags="tag1,tag2">example title</a></li>
-        match = pattern.search(line)
-        if match:
-            url = match.group(1).replace('http://www.readability.com/read?url=', '')           # remove old readability prefixes to get original url
-            time = datetime.fromtimestamp(float(match.group(2)))
-            tags = match.group(3)
-            title = match.group(4).replace(' — Readability', '').replace('http://www.readability.com/read?url=', '')
-            
-            yield Link(
-                url=htmldecode(url),
-                timestamp=str(time.timestamp()),
-                title=htmldecode(title) or None,
-                tags=tags or '',
-                sources=[html_file.name],
-            )
-
-
-@enforce_types
-def parse_json_export(json_file: IO[str]) -> Iterable[Link]:
-    """Parse JSON-format bookmarks export files (produced by pinboard.in/export/, or wallabag)"""
-
-    json_file.seek(0)
-    links = json.load(json_file)
-    json_date = lambda s: datetime.strptime(s, '%Y-%m-%dT%H:%M:%S%z')
-
-    for link in links:
-        # example line
-        # {"href":"http:\/\/www.reddit.com\/r\/example","description":"title here","extended":"","meta":"18a973f09c9cc0608c116967b64e0419","hash":"910293f019c2f4bb1a749fb937ba58e3","time":"2014-06-14T15:51:42Z","shared":"no","toread":"no","tags":"reddit android"}]
-        if link:
-            # Parse URL
-            url = link.get('href') or link.get('url') or link.get('URL')
-            if not url:
-                raise Exception('JSON must contain URL in each entry [{"url": "http://...", ...}, ...]')
-
-            # Parse the timestamp
-            ts_str = str(datetime.now().timestamp())
-            if link.get('timestamp'):
-                # chrome/ff histories use a very precise timestamp
-                ts_str = str(link['timestamp'] / 10000000)  
-            elif link.get('time'):
-                ts_str = str(json_date(link['time'].split(',', 1)[0]).timestamp())
-            elif link.get('created_at'):
-                ts_str = str(json_date(link['created_at']).timestamp())
-            elif link.get('created'):
-                ts_str = str(json_date(link['created']).timestamp())
-            elif link.get('date'):
-                ts_str = str(json_date(link['date']).timestamp())
-            elif link.get('bookmarked'):
-                ts_str = str(json_date(link['bookmarked']).timestamp())
-            elif link.get('saved'):
-                ts_str = str(json_date(link['saved']).timestamp())
-            
-            # Parse the title
-            title = None
-            if link.get('title'):
-                title = link['title'].strip()
-            elif link.get('description'):
-                title = link['description'].replace(' — Readability', '').strip()
-            elif link.get('name'):
-                title = link['name'].strip()
-
-            yield Link(
-                url=htmldecode(url),
-                timestamp=ts_str,
-                title=htmldecode(title) or None,
-                tags=htmldecode(link.get('tags')) or '',
-                sources=[json_file.name],
-            )
-
-
-@enforce_types
-def parse_rss_export(rss_file: IO[str]) -> Iterable[Link]:
-    """Parse RSS XML-format files into links"""
-
-    rss_file.seek(0)
-    items = rss_file.read().split('<item>')
-    items = items[1:] if items else []
-    for item in items:
-        # example item:
-        # <item>
-        # <title><![CDATA[How JavaScript works: inside the V8 engine]]></title>
-        # <category>Unread</category>
-        # <link>https://blog.sessionstack.com/how-javascript-works-inside</link>
-        # <guid>https://blog.sessionstack.com/how-javascript-works-inside</guid>
-        # <pubDate>Mon, 21 Aug 2017 14:21:58 -0500</pubDate>
-        # </item>
-
-        trailing_removed = item.split('</item>', 1)[0]
-        leading_removed = trailing_removed.split('<item>', 1)[-1].strip()
-        rows = leading_removed.split('\n')
-
-        def get_row(key):
-            return [r for r in rows if r.strip().startswith('<{}>'.format(key))][0]
-
-        url = str_between(get_row('link'), '<link>', '</link>')
-        ts_str = str_between(get_row('pubDate'), '<pubDate>', '</pubDate>')
-        time = datetime.strptime(ts_str, "%a, %d %b %Y %H:%M:%S %z")
-        title = str_between(get_row('title'), '<![CDATA[', ']]').strip()
-
-        yield Link(
-            url=htmldecode(url),
-            timestamp=str(time.timestamp()),
-            title=htmldecode(title) or None,
-            tags=None,
-            sources=[rss_file.name],
-        )
-
-
-@enforce_types
-def parse_shaarli_rss_export(rss_file: IO[str]) -> Iterable[Link]:
-    """Parse Shaarli-specific RSS XML-format files into links"""
-
-    rss_file.seek(0)
-    entries = rss_file.read().split('<entry>')[1:]
-    for entry in entries:
-        # example entry:
-        # <entry>
-        #   <title>Aktuelle Trojaner-Welle: Emotet lauert in gefälschten Rechnungsmails | heise online</title>
-        #   <link href="https://www.heise.de/security/meldung/Aktuelle-Trojaner-Welle-Emotet-lauert-in-gefaelschten-Rechnungsmails-4291268.html" />
-        #   <id>https://demo.shaarli.org/?cEV4vw</id>
-        #   <published>2019-01-30T06:06:01+00:00</published>
-        #   <updated>2019-01-30T06:06:01+00:00</updated>
-        #   <content type="html" xml:lang="en"><![CDATA[<div class="markdown"><p>&#8212; <a href="https://demo.shaarli.org/?cEV4vw">Permalink</a></p></div>]]></content>
-        # </entry>
-
-        trailing_removed = entry.split('</entry>', 1)[0]
-        leading_removed = trailing_removed.strip()
-        rows = leading_removed.split('\n')
-
-        def get_row(key):
-            return [r.strip() for r in rows if r.strip().startswith('<{}'.format(key))][0]
-
-        title = str_between(get_row('title'), '<title>', '</title>').strip()
-        url = str_between(get_row('link'), '<link href="', '" />')
-        ts_str = str_between(get_row('published'), '<published>', '</published>')
-        time = datetime.strptime(ts_str, "%Y-%m-%dT%H:%M:%S%z")
-
-        yield Link(
-            url=htmldecode(url),
-            timestamp=str(time.timestamp()),
-            title=htmldecode(title) or None,
-            tags=None,
-            sources=[rss_file.name],
-        )
-
-
-@enforce_types
-def parse_netscape_html_export(html_file: IO[str]) -> Iterable[Link]:
-    """Parse netscape-format bookmarks export files (produced by all browsers)"""
-
-    html_file.seek(0)
-    pattern = re.compile("<a href=\"(.+?)\" add_date=\"(\\d+)\"[^>]*>(.+)</a>", re.UNICODE | re.IGNORECASE)
-    for line in html_file:
-        # example line
-        # <DT><A HREF="https://example.com/?q=1+2" ADD_DATE="1497562974" LAST_MODIFIED="1497562974" ICON_URI="https://example.com/favicon.ico" ICON="data:image/png;base64,...">example bookmark title</A>
-        
-        match = pattern.search(line)
-        if match:
-            url = match.group(1)
-            time = datetime.fromtimestamp(float(match.group(2)))
-            title = match.group(3).strip()
-
-            yield Link(
-                url=htmldecode(url),
-                timestamp=str(time.timestamp()),
-                title=htmldecode(title) or None,
-                tags=None,
-                sources=[html_file.name],
-            )
-
-
-@enforce_types
-def parse_pinboard_rss_export(rss_file: IO[str]) -> Iterable[Link]:
-    """Parse Pinboard RSS feed files into links"""
-
-    rss_file.seek(0)
-    root = etree.parse(rss_file).getroot()
-    items = root.findall("{http://purl.org/rss/1.0/}item")
-    for item in items:
-        find = lambda p: item.find(p).text.strip() if item.find(p) else None    # type: ignore
-
-        url = find("{http://purl.org/rss/1.0/}link")
-        tags = find("{http://purl.org/dc/elements/1.1/}subject")
-        title = find("{http://purl.org/rss/1.0/}title")
-        ts_str = find("{http://purl.org/dc/elements/1.1/}date")
-        
-        # Pinboard includes a colon in its date stamp timezone offsets, which
-        # Python can't parse. Remove it:
-        if ts_str and ts_str[-3:-2] == ":":
-            ts_str = ts_str[:-3]+ts_str[-2:]
-
-        if ts_str:
-            time = datetime.strptime(ts_str, "%Y-%m-%dT%H:%M:%S%z")
-        else:
-            time = datetime.now()
-
-        yield Link(
-            url=htmldecode(url),
-            timestamp=str(time.timestamp()),
-            title=htmldecode(title) or None,
-            tags=htmldecode(tags) or None,
-            sources=[rss_file.name],
-        )
-
-
-@enforce_types
-def parse_medium_rss_export(rss_file: IO[str]) -> Iterable[Link]:
-    """Parse Medium RSS feed files into links"""
-
-    rss_file.seek(0)
-    root = etree.parse(rss_file).getroot()
-    items = root.find("channel").findall("item")                        # type: ignore
-    for item in items:
-        url = item.find("link").text                                    # type: ignore
-        title = item.find("title").text.strip()                         # type: ignore
-        ts_str = item.find("pubDate").text                              # type: ignore
-        time = datetime.strptime(ts_str, "%a, %d %b %Y %H:%M:%S %Z")    # type: ignore
-        
-        yield Link(
-            url=htmldecode(url),
-            timestamp=str(time.timestamp()),
-            title=htmldecode(title) or None,
-            tags=None,
-            sources=[rss_file.name],
-        )
-
-
-@enforce_types
-def parse_plain_text_export(text_file: IO[str]) -> Iterable[Link]:
-    """Parse raw links from each line in a text file"""
-
-    text_file.seek(0)
-    for line in text_file.readlines():
-        urls = re.findall(URL_REGEX, line) if line.strip() else ()
-        for url in urls:                                                # type: ignore
-            yield Link(
-                url=htmldecode(url),
-                timestamp=str(datetime.now().timestamp()),
-                title=None,
-                tags=None,
-                sources=[text_file.name],
-            )
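
Since parse_links tries each parser in order and returns the first non-empty result along with the parser's name, here is a minimal sketch of the interface being removed (the source file path is illustrative, not from the commit):

    # hypothetical sketch of the old interface, not part of this commit
    from archivebox.legacy.parse import parse_links

    links, parser_name = parse_links('output/sources/bookmarks_export.html')
    print(f'parsed {len(links)} links using the {parser_name} parser')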

+ 0 - 89
archivebox/legacy/purge.py

@@ -1,89 +0,0 @@
-#!/usr/bin/env python3
-
-import re
-from argparse import ArgumentParser
-from os.path import exists, join
-from shutil import rmtree
-from typing import List
-
-from .config import ARCHIVE_DIR, OUTPUT_DIR
-from .index import (
-    parse_json_links_index,
-    write_html_links_index,
-    write_json_links_index,
-)
-
-
-def cleanup_index(regexes: List[str], proceed: bool, delete: bool) -> None:
-    if not exists(join(OUTPUT_DIR, 'index.json')):
-        exit('index.json is missing; nothing to do')
-
-    compiled = [re.compile(r) for r in regexes]
-    links = parse_json_links_index(OUTPUT_DIR)
-    filtered = []
-    remaining = []
-
-    for link in links:
-        url = link.url
-        for r in compiled:
-            if r.search(url):
-                filtered.append((link, r))
-                break
-        else:
-            remaining.append(link)
-
-    if not filtered:
-        exit('Search did not match any entries.')
-
-    print('Filtered out {}/{} urls:'.format(len(filtered), len(links)))
-
-    for link, regex in filtered:
-        url = link.url
-        print(' {url} via {regex}'.format(url=url, regex=regex.pattern))
-
-    if not proceed:
-        answer = input('Remove {} entries from index? [y/n] '.format(
-            len(filtered)))
-        proceed = answer.strip().lower() in ('y', 'yes')
-
-    if not proceed:
-        exit('Aborted')
-
-    write_json_links_index(OUTPUT_DIR, remaining)
-    write_html_links_index(OUTPUT_DIR, remaining)
-
-    if delete:
-        for link, _ in filtered:
-            data_dir = join(ARCHIVE_DIR, link['timestamp'])
-            if exists(data_dir):
-                rmtree(data_dir)
-
-
-if __name__ == '__main__':
-    p = ArgumentParser('Index purging tool')
-    p.add_argument(
-        '--regex',
-        '-r',
-        action='append',
-        help='Regular expression matching URLs to purge',
-    )
-    p.add_argument(
-        '--delete',
-        '-d',
-        action='store_true',
-        default=False,
-        help='Delete webpage files from archive',
-    )
-    p.add_argument(
-        '--yes',
-        '-y',
-        action='store_true',
-        default=False,
-        help='Do not prompt for confirmation',
-    )
-
-    args = p.parse_args()
-    if args.regex:
-        cleanup_index(args.regex, proceed=args.yes, delete=args.delete)
-    else:
-        p.print_help()
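
The purge tool removed above exposed both an argparse CLI and a cleanup_index() function; a minimal sketch of calling it directly (the regex is illustrative only):

    # hypothetical sketch of the old interface, not part of this commit
    from archivebox.legacy.purge import cleanup_index

    # drop matching entries from the index and delete their data dirs without prompting
    cleanup_index(regexes=[r'^https?://(www\.)?example\.com'], proceed=True, delete=True)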

+ 0 - 1
archivebox/legacy/storage/__init__.py

@@ -1 +0,0 @@
-__package__ = 'archivebox.legacy.storage'

+ 1086 - 0
archivebox/main.py

@@ -0,0 +1,1086 @@
+__package__ = 'archivebox'
+
+import re
+import os
+import sys
+import shutil
+
+from typing import Dict, List, Optional, Set, Tuple, Iterable, IO
+
+from crontab import CronTab, CronSlices
+
+from .cli import (
+    list_subcommands,
+    run_subcommand,
+    display_first,
+    meta_cmds,
+    main_cmds,
+    archive_cmds,
+)
+from .index.schema import Link
+from .util import (
+    enforce_types,
+    TimedProgress,
+    get_dir_size,
+    human_readable_size,
+    save_stdin_to_sources,
+    save_file_to_sources,
+    links_to_csv,
+    to_json,
+    folders_to_str,
+)
+from .index import (
+    links_after_timestamp,
+    load_main_index,
+    import_new_links,
+    write_main_index,
+    link_matches_filter,
+    get_indexed_folders,
+    get_archived_folders,
+    get_unarchived_folders,
+    get_present_folders,
+    get_valid_folders,
+    get_invalid_folders,
+    get_duplicate_folders,
+    get_orphaned_folders,
+    get_corrupted_folders,
+    get_unrecognized_folders,
+    fix_invalid_folder_locations,
+)
+from .index.json import (
+    parse_json_main_index,
+    parse_json_links_details,
+)
+from .index.sql import parse_sql_main_index, get_admins, apply_migrations
+from .index.html import parse_html_main_index
+from .extractors import archive_link
+from .config import (
+    stderr,
+    ConfigDict,
+    ANSI,
+    IS_TTY,
+    USER,
+    ARCHIVEBOX_BINARY,
+    ONLY_NEW,
+    OUTPUT_DIR,
+    SOURCES_DIR,
+    ARCHIVE_DIR,
+    LOGS_DIR,
+    CONFIG_FILE,
+    ARCHIVE_DIR_NAME,
+    SOURCES_DIR_NAME,
+    LOGS_DIR_NAME,
+    STATIC_DIR_NAME,
+    JSON_INDEX_FILENAME,
+    HTML_INDEX_FILENAME,
+    SQL_INDEX_FILENAME,
+    ROBOTS_TXT_FILENAME,
+    FAVICON_FILENAME,
+    check_dependencies,
+    check_data_folder,
+    write_config_file,
+    setup_django,
+    VERSION,
+    CODE_LOCATIONS,
+    EXTERNAL_LOCATIONS,
+    DATA_LOCATIONS,
+    DEPENDENCIES,
+    load_all_config,
+    CONFIG,
+    USER_CONFIG,
+    get_real_name,
+)
+from .cli.logging import (
+    log_archiving_started,
+    log_archiving_paused,
+    log_archiving_finished,
+    log_removal_started,
+    log_removal_finished,
+    log_list_started,
+    log_list_finished,
+)
+
+
+ALLOWED_IN_OUTPUT_DIR = {
+    '.DS_Store',
+    '.venv',
+    'venv',
+    'virtualenv',
+    '.virtualenv',
+    ARCHIVE_DIR_NAME,
+    SOURCES_DIR_NAME,
+    LOGS_DIR_NAME,
+    STATIC_DIR_NAME,
+    SQL_INDEX_FILENAME,
+    JSON_INDEX_FILENAME,
+    HTML_INDEX_FILENAME,
+    ROBOTS_TXT_FILENAME,
+    FAVICON_FILENAME,
+}
+
+def help(out_dir: str=OUTPUT_DIR) -> None:
+    all_subcommands = list_subcommands()
+    COMMANDS_HELP_TEXT = '\n    '.join(
+        f'{cmd.ljust(20)} {summary}'
+        for cmd, summary in all_subcommands.items()
+        if cmd in meta_cmds
+    ) + '\n\n    ' + '\n    '.join(
+        f'{cmd.ljust(20)} {summary}'
+        for cmd, summary in all_subcommands.items()
+        if cmd in main_cmds
+    ) + '\n\n    ' + '\n    '.join(
+        f'{cmd.ljust(20)} {summary}'
+        for cmd, summary in all_subcommands.items()
+        if cmd in archive_cmds
+    ) + '\n\n    ' + '\n    '.join(
+        f'{cmd.ljust(20)} {summary}'
+        for cmd, summary in all_subcommands.items()
+        if cmd not in display_first
+    )
+
+
+    if os.path.exists(os.path.join(out_dir, JSON_INDEX_FILENAME)):
+        print('''{green}ArchiveBox v{}: The self-hosted internet archive.{reset}
+
+{lightred}Active data directory:{reset}
+    {}
+
+{lightred}Usage:{reset}
+    archivebox [command] [--help] [--version] [...args]
+
+{lightred}Commands:{reset}
+    {}
+
+{lightred}Example Use:{reset}
+    mkdir my-archive; cd my-archive/
+    archivebox init
+    archivebox info
+
+    archivebox add https://example.com/some/page
+    archivebox add --depth=1 ~/Downloads/bookmarks_export.html
+    
+    archivebox list --sort=timestamp --csv=timestamp,url,is_archived
+    archivebox schedule --every=week https://example.com/some/feed.rss
+    archivebox update --resume=15109948213.123
+
+{lightred}Documentation:{reset}
+    https://github.com/pirate/ArchiveBox/wiki
+'''.format(VERSION, out_dir, COMMANDS_HELP_TEXT, **ANSI))
+    
+    else:
+        print('{green}Welcome to ArchiveBox v{}!{reset}'.format(VERSION, **ANSI))
+        print()
+        print('To import an existing archive (from a previous version of ArchiveBox):')
+        print('    1. cd into your data dir OUTPUT_DIR (usually ArchiveBox/output) and run:')
+        print('    2. archivebox init')
+        print()
+        print('To start a new archive:')
+        print('    1. Create an empty directory, then cd into it and run:')
+        print('    2. archivebox init')
+        print()
+        print('For more information, see the documentation here:')
+        print('    https://github.com/pirate/ArchiveBox/wiki')
+
+
+def version(quiet: bool=False, out_dir: str=OUTPUT_DIR) -> None:
+    if quiet:
+        print(VERSION)
+    else:
+        print('ArchiveBox v{}'.format(VERSION))
+        print()
+
+        print('{white}[i] Dependency versions:{reset}'.format(**ANSI))
+        for name, dependency in DEPENDENCIES.items():
+            print_dependency_version(name, dependency)
+        
+        print()
+        print('{white}[i] Code locations:{reset}'.format(**ANSI))
+        for name, folder in CODE_LOCATIONS.items():
+            print_folder_status(name, folder)
+
+        print()
+        print('{white}[i] External locations:{reset}'.format(**ANSI))
+        for name, folder in EXTERNAL_LOCATIONS.items():
+            print_folder_status(name, folder)
+
+        print()
+        print('{white}[i] Data locations:{reset}'.format(**ANSI))
+        for name, folder in DATA_LOCATIONS.items():
+            print_folder_status(name, folder)
+
+        print()
+        check_dependencies()
+
+
+def run(subcommand: str, subcommand_args: Optional[List[str]], stdin: Optional[IO]=None, out_dir: str=OUTPUT_DIR) -> None:
+    run_subcommand(
+        subcommand=subcommand,
+        subcommand_args=subcommand_args,
+        stdin=stdin,
+        out_dir=out_dir,
+    )
+
+
+def init(out_dir: str=OUTPUT_DIR) -> None:
+    os.makedirs(out_dir, exist_ok=True)
+
+    is_empty = not len(set(os.listdir(out_dir)) - ALLOWED_IN_OUTPUT_DIR)
+    existing_index = os.path.exists(os.path.join(out_dir, JSON_INDEX_FILENAME))
+
+    if is_empty and not existing_index:
+        print('{green}[+] Initializing a new ArchiveBox collection in this folder...{reset}'.format(**ANSI))
+        print(f'    {out_dir}')
+        print('{green}------------------------------------------------------------------{reset}'.format(**ANSI))
+    elif existing_index:
+        print('{green}[*] Updating existing ArchiveBox collection in this folder...{reset}'.format(**ANSI))
+        print(f'    {out_dir}')
+        print('{green}------------------------------------------------------------------{reset}'.format(**ANSI))
+    else:
+        stderr(
+            ("{red}[X] This folder appears to already have files in it, but no index.json is present.{reset}\n\n"
+            "    You must run init in a completely empty directory, or an existing data folder.\n\n"
+            "    {lightred}Hint:{reset} To import an existing data folder make sure to cd into the folder first, \n"
+            "    then run and run 'archivebox init' to pick up where you left off.\n\n"
+            "    (Always make sure your data folder is backed up first before updating ArchiveBox)"
+            ).format(out_dir, **ANSI)
+        )
+        raise SystemExit(1)
+
+    if existing_index:
+        print('\n{green}[*] Verifying archive folder structure...{reset}'.format(**ANSI))
+    else:
+        print('\n{green}[+] Building archive folder structure...{reset}'.format(**ANSI))
+    
+    os.makedirs(SOURCES_DIR, exist_ok=True)
+    print(f'    √ {SOURCES_DIR}')
+    
+    os.makedirs(ARCHIVE_DIR, exist_ok=True)
+    print(f'    √ {ARCHIVE_DIR}')
+
+    os.makedirs(LOGS_DIR, exist_ok=True)
+    print(f'    √ {LOGS_DIR}')
+
+    write_config_file({}, out_dir=out_dir)
+    print(f'    √ {CONFIG_FILE}')
+    
+    if os.path.exists(os.path.join(out_dir, SQL_INDEX_FILENAME)):
+        print('\n{green}[*] Verifying main SQL index and running migrations...{reset}'.format(**ANSI))
+    else:
+        print('\n{green}[+] Building main SQL index and running migrations...{reset}'.format(**ANSI))
+    
+    setup_django(out_dir, check_db=False)
+    from django.conf import settings
+    assert settings.DATABASE_FILE == os.path.join(out_dir, SQL_INDEX_FILENAME)
+    print(f'    √ {settings.DATABASE_FILE}')
+    print()
+    for migration_line in apply_migrations(out_dir):
+        print(f'    {migration_line}')
+
+
+    assert os.path.exists(settings.DATABASE_FILE)
+    
+    # from django.contrib.auth.models import User
+    # if IS_TTY and not User.objects.filter(is_superuser=True).exists():
+    #     print('{green}[+] Creating admin user account...{reset}'.format(**ANSI))
+    #     call_command("createsuperuser", interactive=True)
+
+    print()
+    print('{green}[*] Collecting links from any existing indexes and archive folders...{reset}'.format(**ANSI))
+
+    all_links: Dict[str, Link] = {}
+    if existing_index:
+        all_links = {
+            link.url: link
+            for link in load_main_index(out_dir=out_dir, warn=False)
+        }
+        print('    √ Loaded {} links from existing main index.'.format(len(all_links)))
+
+    # Links in data folders that don't match their timestamp
+    fixed, cant_fix = fix_invalid_folder_locations(out_dir=out_dir)
+    if fixed:
+        print('    {lightyellow}√ Fixed {} data directory locations that didn\'t match their link timestamps.{reset}'.format(len(fixed), **ANSI))
+    if cant_fix:
+        print('    {lightyellow}! Could not fix {} data directory locations due to conflicts with existing folders.{reset}'.format(len(cant_fix), **ANSI))
+
+    # Links in JSON index but not in main index
+    orphaned_json_links = {
+        link.url: link
+        for link in parse_json_main_index(out_dir)
+        if link.url not in all_links
+    }
+    if orphaned_json_links:
+        all_links.update(orphaned_json_links)
+        print('    {lightyellow}√ Added {} orphaned links from existing JSON index...{reset}'.format(len(orphaned_json_links), **ANSI))
+
+    # Links in SQL index but not in main index
+    orphaned_sql_links = {
+        link.url: link
+        for link in parse_sql_main_index(out_dir)
+        if link.url not in all_links
+    }
+    if orphaned_sql_links:
+        all_links.update(orphaned_sql_links)
+        print('    {lightyellow}√ Added {} orphaned links from existing SQL index...{reset}'.format(len(orphaned_sql_links), **ANSI))
+
+    # Links in data dir indexes but not in main index
+    orphaned_data_dir_links = {
+        link.url: link
+        for link in parse_json_links_details(out_dir)
+        if link.url not in all_links
+    }
+    if orphaned_data_dir_links:
+        all_links.update(orphaned_data_dir_links)
+        print('    {lightyellow}√ Added {} orphaned links from existing archive directories.{reset}'.format(len(orphaned_data_dir_links), **ANSI))
+
+    # Links in invalid/duplicate data dirs
+    invalid_folders = {
+        folder: link
+        for folder, link in get_invalid_folders(all_links.values(), out_dir=out_dir).items()
+    }
+    if invalid_folders:
+        print('    {lightyellow}! Skipped adding {} invalid link data directories.{reset}'.format(len(invalid_folders), **ANSI))
+        print('        X ' + '\n        X '.join(f'{folder} {link}' for folder, link in invalid_folders.items()))
+        print()
+        print('    {lightred}Hint:{reset} For more information about the link data directories that were skipped, run:'.format(**ANSI))
+        print('        archivebox info')
+        print('        archivebox list --status=invalid')
+
+
+    write_main_index(list(all_links.values()), out_dir=out_dir)
+
+    print('\n{green}------------------------------------------------------------------{reset}'.format(**ANSI))
+    if existing_index:
+        print('{green}[√] Done. Verified and updated the existing ArchiveBox collection.{reset}'.format(**ANSI))
+    else:
+        print('{green}[√] Done. A new ArchiveBox collection was initialized ({} links).{reset}'.format(len(all_links), **ANSI))
+    print()
+    print('    To view your archive index, open:')
+    print('        {}'.format(os.path.join(out_dir, HTML_INDEX_FILENAME)))
+    print()
+    print('    To add new links, you can run:')
+    print("        archivebox add 'https://example.com'")
+    print()
+    print('    For more usage and examples, run:')
+    print('        archivebox help')
+
+
+def info(out_dir: str=OUTPUT_DIR) -> None:
+    check_data_folder(out_dir=out_dir)
+
+    print('{green}[*] Scanning archive collection main index...{reset}'.format(**ANSI))
+    print(f'    {out_dir}/*')
+    num_bytes, num_dirs, num_files = get_dir_size(out_dir, recursive=False, pattern='index.')
+    size = human_readable_size(num_bytes)
+    print(f'    Size: {size} across {num_files} files')
+    print()
+
+    links = list(load_main_index(out_dir=out_dir))
+    num_json_links = len(links)
+    num_sql_links = sum(1 for link in parse_sql_main_index(out_dir=out_dir))
+    num_html_links = sum(1 for url in parse_html_main_index(out_dir=out_dir))
+    num_link_details = sum(1 for link in parse_json_links_details(out_dir=out_dir))
+    users = get_admins().values_list('username', flat=True)
+    print(f'    > JSON Main Index: {num_json_links} links'.ljust(36),  f'(found in {JSON_INDEX_FILENAME})')
+    print(f'    > SQL Main Index: {num_sql_links} links'.ljust(36), f'(found in {SQL_INDEX_FILENAME})')
+    print(f'    > HTML Main Index: {num_html_links} links'.ljust(36), f'(found in {HTML_INDEX_FILENAME})')
+    print(f'    > JSON Link Details: {num_link_details} links'.ljust(36), f'(found in {ARCHIVE_DIR_NAME}/*/index.json)')
+
+    print(f'    > Admin: {len(users)} users {", ".join(users)}'.ljust(36), f'(found in {SQL_INDEX_FILENAME})')
+    
+    if num_html_links != len(links) or num_sql_links != len(links):
+        print()
+        print('    {lightred}Hint:{reset} You can fix index count differences automatically by running:'.format(**ANSI))
+        print('        archivebox init')
+    
+    if not users:
+        print()
+        print('    {lightred}Hint:{reset} You can create an admin user by running:'.format(**ANSI))
+        print('        archivebox manage createsuperuser')
+
+    print()
+    print('{green}[*] Scanning archive collection link data directories...{reset}'.format(**ANSI))
+    print(f'    {ARCHIVE_DIR}/*')
+
+    num_bytes, num_dirs, num_files = get_dir_size(ARCHIVE_DIR)
+    size = human_readable_size(num_bytes)
+    print(f'    Size: {size} across {num_files} files in {num_dirs} directories')
+    print()
+
+    num_indexed = len(get_indexed_folders(links, out_dir=out_dir))
+    num_archived = len(get_archived_folders(links, out_dir=out_dir))
+    num_unarchived = len(get_unarchived_folders(links, out_dir=out_dir))
+    print(f'    > indexed: {num_indexed}'.ljust(36), f'({get_indexed_folders.__doc__})')
+    print(f'      > archived: {num_archived}'.ljust(36), f'({get_archived_folders.__doc__})')
+    print(f'      > unarchived: {num_unarchived}'.ljust(36), f'({get_unarchived_folders.__doc__})')
+    
+    num_present = len(get_present_folders(links, out_dir=out_dir))
+    num_valid = len(get_valid_folders(links, out_dir=out_dir))
+    print()
+    print(f'    > present: {num_present}'.ljust(36), f'({get_present_folders.__doc__})')
+    print(f'      > valid: {num_valid}'.ljust(36), f'({get_valid_folders.__doc__})')
+    
+    duplicate = get_duplicate_folders(links, out_dir=out_dir)
+    orphaned = get_orphaned_folders(links, out_dir=out_dir)
+    corrupted = get_corrupted_folders(links, out_dir=out_dir)
+    unrecognized = get_unrecognized_folders(links, out_dir=out_dir)
+    num_invalid = len({**duplicate, **orphaned, **corrupted, **unrecognized})
+    print(f'      > invalid: {num_invalid}'.ljust(36), f'({get_invalid_folders.__doc__})')
+    print(f'        > duplicate: {len(duplicate)}'.ljust(36), f'({get_duplicate_folders.__doc__})')
+    print(f'        > orphaned: {len(orphaned)}'.ljust(36), f'({get_orphaned_folders.__doc__})')
+    print(f'        > corrupted: {len(corrupted)}'.ljust(36), f'({get_corrupted_folders.__doc__})')
+    print(f'        > unrecognized: {len(unrecognized)}'.ljust(36), f'({get_unrecognized_folders.__doc__})')
+    
+    if num_indexed:
+        print()
+        print('    {lightred}Hint:{reset} You can list link data directories by status like so:'.format(**ANSI))
+        print('        archivebox list --status=<status>  (e.g. indexed, corrupted, archived, etc.)')
+
+    if orphaned:
+        print()
+        print('    {lightred}Hint:{reset} To automatically import orphaned data directories into the main index, run:'.format(**ANSI))
+        print('        archivebox init')
+
+    if num_invalid:
+        print()
+        print('    {lightred}Hint:{reset} You may need to manually remove or fix some invalid data directories, afterwards make sure to run:'.format(**ANSI))
+        print('        archivebox init')
+    
+    print()
+
+
+@enforce_types
+def add(import_str: Optional[str]=None,
+        import_path: Optional[str]=None,
+        update_all: bool=not ONLY_NEW,
+        index_only: bool=False,
+        out_dir: str=OUTPUT_DIR) -> List[Link]:
+    """The main ArchiveBox entrancepoint. Everything starts here."""
+
+    check_data_folder(out_dir=out_dir)
+
+    if import_str and import_path:
+        stderr(
+            '[X] You should pass either an import path as an argument, '
+            'or pass a list of links via stdin, but not both.\n',
+            color='red',
+        )
+        raise SystemExit(2)
+    elif import_str:
+        import_path = save_stdin_to_sources(import_str, out_dir=out_dir)
+    else:
+        import_path = save_file_to_sources(import_path, out_dir=out_dir)
+
+    check_dependencies()
+
+    # Step 1: Load list of links from the existing index
+    #         merge in and dedupe new links from import_path
+    all_links: List[Link] = []
+    new_links: List[Link] = []
+    all_links = load_main_index(out_dir=out_dir)
+    if import_path:
+        all_links, new_links = import_new_links(all_links, import_path, out_dir=out_dir)
+
+    # Step 2: Write updated index with deduped old and new links back to disk
+    write_main_index(links=all_links, out_dir=out_dir)
+
+    if index_only:
+        return all_links
+        
+    # Step 3: Run the archive methods for each link
+    links = all_links if update_all else new_links
+    log_archiving_started(len(links))
+    idx: int = 0
+    link: Link = None                                             # type: ignore
+    try:
+        for idx, link in enumerate(links):
+            archive_link(link, out_dir=link.link_dir)
+
+    except KeyboardInterrupt:
+        log_archiving_paused(len(links), idx, link.timestamp if link else '0')
+        raise SystemExit(0)
+
+    except:
+        print()
+        raise    
+
+    log_archiving_finished(len(links))
+
+    # Step 4: Re-write links index with updated titles, icons, and resources
+    all_links = load_main_index(out_dir=out_dir)
+    write_main_index(links=list(all_links), out_dir=out_dir, finished=True)
+    return all_links
+
+@enforce_types
+def remove(filter_str: Optional[str]=None,
+           filter_patterns: Optional[List[str]]=None,
+           filter_type: str='exact',
+           after: Optional[float]=None,
+           before: Optional[float]=None,
+           yes: bool=False,
+           delete: bool=False,
+           out_dir: str=OUTPUT_DIR) -> List[Link]:
+    
+    check_data_folder(out_dir=out_dir)
+
+    if filter_str and filter_patterns:
+        stderr(
+            '[X] You should pass either a pattern as an argument, '
+            'or pass a list of patterns via stdin, but not both.\n',
+            color='red',
+        )
+        raise SystemExit(2)
+    elif not (filter_str or filter_patterns):
+        stderr(
+            '[X] You should pass either a pattern as an argument, '
+            'or pass a list of patterns via stdin.',
+            color='red',
+        )
+        stderr()
+        stderr('    {lightred}Hint:{reset} To remove all urls you can run:'.format(**ANSI))
+        stderr("        archivebox remove --filter-type=regex '.*'")
+        stderr()
+        raise SystemExit(2)
+    elif filter_str:
+        filter_patterns = [ptn.strip() for ptn in filter_str.split('\n')]
+
+    log_list_started(filter_patterns, filter_type)
+    timer = TimedProgress(360, prefix='      ')
+    try:
+        links = list(list_links(
+            filter_patterns=filter_patterns,
+            filter_type=filter_type,
+            after=after,
+            before=before,
+        ))
+    finally:
+        timer.end()
+
+    if not len(links):
+        log_removal_finished(0, 0)
+        raise SystemExit(1)
+
+
+    log_list_finished(links)
+    log_removal_started(links, yes=yes, delete=delete)
+
+    timer = TimedProgress(360, prefix='      ')
+    try:
+        to_keep = []
+        all_links = load_main_index(out_dir=out_dir)
+        for link in all_links:
+            should_remove = (
+                (after is not None and float(link.timestamp) < after)
+                or (before is not None and float(link.timestamp) > before)
+                or link_matches_filter(link, filter_patterns, filter_type)
+            )
+            if not should_remove:
+                to_keep.append(link)
+            elif should_remove and delete:
+                shutil.rmtree(link.link_dir, ignore_errors=True)
+    finally:
+        timer.end()
+
+    write_main_index(links=to_keep, out_dir=out_dir, finished=True)
+    log_removal_finished(len(all_links), len(to_keep))
+    
+    return to_keep
+
+@enforce_types
+def update(resume: Optional[float]=None,
+           only_new: bool=not ONLY_NEW,
+           index_only: bool=False,
+           overwrite: bool=False,
+           filter_patterns_str: Optional[str]=None,
+           filter_patterns: Optional[List[str]]=None,
+           filter_type: Optional[str]=None,
+           status: Optional[str]=None,
+           after: Optional[str]=None,
+           before: Optional[str]=None,
+           out_dir: str=OUTPUT_DIR) -> List[Link]:
+    """The main ArchiveBox entrancepoint. Everything starts here."""
+
+    check_dependencies()
+    check_data_folder(out_dir=out_dir)
+
+    # Step 1: Load list of links from the existing index
+    #         merge in and dedupe new links from import_path
+    all_links: List[Link] = []
+    new_links: List[Link] = []
+    all_links = load_main_index(out_dir=out_dir)
+
+    # Step 2: Write updated index with deduped old and new links back to disk
+    write_main_index(links=list(all_links), out_dir=out_dir)
+
+    # Step 3: Filter for selected_links
+    matching_links = list_links(
+        filter_patterns=filter_patterns,
+        filter_type=filter_type,
+        before=before,
+        after=after,
+    )
+    matching_folders = list_folders(
+        links=list(matching_links),
+        status=status,
+        out_dir=out_dir,
+    )
+    all_links = [link for link in matching_folders.values() if link]
+
+    if index_only:
+        return all_links
+        
+    # Step 3: Run the archive methods for each link
+    links = new_links if only_new else all_links
+    log_archiving_started(len(links), resume)
+    idx: int = 0
+    link: Link = None                                             # type: ignore
+    try:
+        for idx, link in enumerate(links_after_timestamp(links, resume)):
+            archive_link(link, overwrite=overwrite, out_dir=link.link_dir)
+
+    except KeyboardInterrupt:
+        log_archiving_paused(len(links), idx, link.timestamp if link else '0')
+        raise SystemExit(0)
+
+    except:
+        print()
+        raise    
+
+    log_archiving_finished(len(links))
+
+    # Step 4: Re-write links index with updated titles, icons, and resources
+    all_links = load_main_index(out_dir=out_dir)
+    write_main_index(links=list(all_links), out_dir=out_dir, finished=True)
+    return all_links
+
+@enforce_types
+def list_all(filter_patterns_str: Optional[str]=None,
+             filter_patterns: Optional[List[str]]=None,
+             filter_type: str='exact',
+             status: Optional[str]=None,
+             after: Optional[float]=None,
+             before: Optional[float]=None,
+             sort: Optional[str]=None,
+             csv: Optional[str]=None,
+             json: Optional[str]=None,
+             out_dir: str=OUTPUT_DIR) -> Iterable[Link]:
+    
+    check_data_folder(out_dir=out_dir)
+
+    if filter_patterns and filter_patterns_str:
+        stderr(
+            '[X] You should either pass filter patterns as arguments '
+            'or via stdin, but not both.\n',
+            color='red',
+        )
+        raise SystemExit(2)
+    elif filter_patterns_str:
+        filter_patterns = filter_patterns_str.split('\n')
+
+
+    links = list_links(
+        filter_patterns=filter_patterns,
+        filter_type=filter_type,
+        before=before,
+        after=after,
+    )
+
+    if sort:
+        links = sorted(links, key=lambda link: getattr(link, sort))
+
+    folders = list_folders(
+        links=list(links),
+        status=status,
+        out_dir=out_dir,
+    )
+    
+    if csv:
+        print(links_to_csv(folders.values(), csv_cols=csv.split(','), header=True))
+    elif json:
+        print(to_json(folders.values(), indent=4, sort_keys=True))
+    else:
+        print(folders_to_str(folders))
+    raise SystemExit(not folders)
+
+
+@enforce_types
+def list_links(filter_patterns: Optional[List[str]]=None,
+               filter_type: str='exact',
+               after: Optional[float]=None,
+               before: Optional[float]=None,
+               out_dir: str=OUTPUT_DIR) -> Iterable[Link]:
+    
+    check_data_folder(out_dir=out_dir)
+
+    all_links = load_main_index(out_dir=out_dir)
+
+    for link in all_links:
+        if after is not None and float(link.timestamp) < after:
+            continue
+        if before is not None and float(link.timestamp) > before:
+            continue
+        
+        if filter_patterns:
+            if link_matches_filter(link, filter_patterns, filter_type):
+                yield link
+        else:
+            yield link
+
+@enforce_types
+def list_folders(links: List[Link],
+                 status: str,
+                 out_dir: str=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
+    
+    check_data_folder(out_dir=out_dir)
+
+    if status == 'indexed':
+        return get_indexed_folders(links, out_dir=out_dir)
+    elif status == 'archived':
+        return get_archived_folders(links, out_dir=out_dir)
+    elif status == 'unarchived':
+        return get_unarchived_folders(links, out_dir=out_dir)
+
+    elif status == 'present':
+        return get_present_folders(links, out_dir=out_dir)
+    elif status == 'valid':
+        return get_valid_folders(links, out_dir=out_dir)
+    elif status == 'invalid':
+        return get_invalid_folders(links, out_dir=out_dir)
+
+    elif status == 'duplicate':
+        return get_duplicate_folders(links, out_dir=out_dir)
+    elif status == 'orphaned':
+        return get_orphaned_folders(links, out_dir=out_dir)
+    elif status == 'corrupted':
+        return get_corrupted_folders(links, out_dir=out_dir)
+    elif status == 'unrecognized':
+        return get_unrecognized_folders(links, out_dir=out_dir)
+
+    raise ValueError('Status not recognized.')
+
+
+def config(config_options_str: Optional[str]=None,
+           config_options: Optional[List[str]]=None,
+           get: bool=False,
+           set: bool=False,
+           reset: bool=False,
+           out_dir: str=OUTPUT_DIR) -> None:
+
+    check_data_folder(out_dir=out_dir)
+
+    if config_options and config_options_str:
+        stderr(
+            '[X] You should either pass config values as arguments '
+            'or via stdin, but not both.\n',
+            color='red',
+        )
+        raise SystemExit(2)
+    elif config_options_str:
+        config_options = config_options_str.split('\n')
+
+    config_options = config_options or []
+
+    no_args = not (get or set or reset or config_options)
+
+    matching_config: ConfigDict = {}
+    if get or no_args:
+        if config_options:
+            config_options = [get_real_name(key) for key in config_options]
+            matching_config = {key: CONFIG[key] for key in config_options if key in CONFIG}
+            failed_config = [key for key in config_options if key not in CONFIG]
+            if failed_config:
+                stderr()
+                stderr('[X] These options failed to get:', color='red')
+                stderr('    {}'.format('\n    '.join(failed_config)))
+                raise SystemExit(1)
+        else:
+            matching_config = CONFIG
+        
+        print(printable_config(matching_config))
+        raise SystemExit(not matching_config)
+    elif set:
+        new_config = {}
+        failed_options = []
+        for line in config_options:
+            if line.startswith('#') or not line.strip():
+                continue
+            if '=' not in line:
+                stderr('[X] Config KEY=VALUE must have an = sign in it', color='red')
+                stderr(f'    {line}')
+                raise SystemExit(2)
+
+            raw_key, val = line.split('=', 1)
+            raw_key = raw_key.upper().strip()
+            key = get_real_name(raw_key)
+            if key != raw_key:
+                stderr(f'[i] Note: The config option {raw_key} has been renamed to {key}, please use the new name going forwards.', color='lightyellow')
+
+            if key in CONFIG:
+                new_config[key] = val.strip()
+            else:
+                failed_options.append(line)
+
+        if new_config:
+            before = CONFIG
+            matching_config = write_config_file(new_config, out_dir=out_dir)
+            after = load_all_config()
+            print(printable_config(matching_config))
+
+            side_effect_changes: ConfigDict = {}
+            for key, val in after.items():
+                if key in USER_CONFIG and (before[key] != after[key]) and (key not in matching_config):
+                    side_effect_changes[key] = after[key]
+
+            if side_effect_changes:
+                stderr()
+                stderr('[i] Note: This change also affected these other options that depended on it:', color='lightyellow')
+                print('    {}'.format(printable_config(side_effect_changes, prefix='    ')))
+        if failed_options:
+            stderr()
+            stderr('[X] These options failed to set:', color='red')
+            stderr('    {}'.format('\n    '.join(failed_options)))
+        raise SystemExit(bool(failed_options))
+    elif reset:
+        stderr('[X] This command is not implemented yet.', color='red')
+        stderr('    Please manually remove the relevant lines from your config file:')
+        stderr(f'        {CONFIG_FILE}')
+        raise SystemExit(2)
+
+    else:
+        stderr('[X] You must pass either --get or --set, or no arguments to get the whole config.', color='red')
+        stderr('    archivebox config')
+        stderr('    archivebox config --get SOME_KEY')
+        stderr('    archivebox config --set SOME_KEY=SOME_VALUE')
+        raise SystemExit(2)
+
+
+CRON_COMMENT = 'archivebox_schedule'
+
+@enforce_types
+def schedule(add: bool=False,
+             show: bool=False,
+             clear: bool=False,
+             foreground: bool=False,
+             run_all: bool=False,
+             quiet: bool=False,
+             every: Optional[str]=None,
+             import_path: Optional[str]=None,
+             out_dir: str=OUTPUT_DIR):
+    
+    check_data_folder(out_dir=out_dir)
+
+    os.makedirs(os.path.join(out_dir, LOGS_DIR_NAME), exist_ok=True)
+
+    cron = CronTab(user=True)
+    cron = dedupe_jobs(cron)
+
+    existing_jobs = list(cron.find_comment(CRON_COMMENT))
+    if foreground or run_all:
+        if import_path or (not existing_jobs):
+            stderr('{red}[X] You must schedule some jobs first before running in foreground mode.{reset}'.format(**ANSI))
+            stderr('    archivebox schedule --every=hour https://example.com/some/rss/feed.xml')
+            raise SystemExit(1)
+        print('{green}[*] Running {} ArchiveBox jobs in foreground task scheduler...{reset}'.format(len(existing_jobs), **ANSI))
+        if run_all:
+            try:
+                for job in existing_jobs:
+                    sys.stdout.write(f'  > {job.command}')
+                    sys.stdout.flush()
+                    job.run()
+                    sys.stdout.write(f'\r  √ {job.command}\n')
+            except KeyboardInterrupt:
+                print('\n{green}[√] Stopped.{reset}'.format(**ANSI))
+                raise SystemExit(1)
+        if foreground:
+            try:
+                for result in cron.run_scheduler():
+                    print(result)
+            except KeyboardInterrupt:
+                print('\n{green}[√] Stopped.{reset}'.format(**ANSI))
+                raise SystemExit(1)
+
+    elif show:
+        if existing_jobs:
+            print('\n'.join(str(cmd) for cmd in existing_jobs))
+        else:
+            stderr('{red}[X] There are no ArchiveBox cron jobs scheduled for your user ({}).{reset}'.format(USER, **ANSI))
+            stderr('    To schedule a new job, run:')
+            stderr('        archivebox schedule --every=[timeperiod] https://example.com/some/rss/feed.xml')
+        raise SystemExit(0)
+
+    elif clear:
+        print(cron.remove_all(comment=CRON_COMMENT))
+        cron.write()
+        raise SystemExit(0)
+
+    elif every:
+        quoted = lambda s: f'"{s}"' if s and ' ' in s else s
+        cmd = [
+            'cd',
+            quoted(out_dir),
+            '&&',
+            quoted(ARCHIVEBOX_BINARY),
+            *(['add', f'"{import_path}"'] if import_path else ['update']),
+            '>',
+            quoted(os.path.join(LOGS_DIR, 'archivebox.log')),
+            '2>&1',
+
+        ]
+        new_job = cron.new(command=' '.join(cmd), comment=CRON_COMMENT)
+
+        if every in ('minute', 'hour', 'day', 'week', 'month', 'year'):
+            set_every = getattr(new_job.every(), every)
+            set_every()
+        elif CronSlices.is_valid(every):
+            new_job.setall(every)
+        else:
+            stderr('{red}[X] Got invalid timeperiod for cron task.{reset}'.format(**ANSI))
+            stderr('    It must be one of minute/hour/day/week/month/year')
+            stderr('    or a quoted cron-format schedule like:')
+            stderr('        archivebox schedule --every=day https://example.com/some/rss/feed.xml')
+            stderr('        archivebox schedule --every="0/5 * * * *" https://example.com/some/rss/feed.xml')
+            raise SystemExit(1)
+
+        cron = dedupe_jobs(cron)
+        cron.write()
+
+        total_runs = sum(j.frequency_per_year() for j in cron)
+        existing_jobs = list(cron.find_comment(CRON_COMMENT))
+
+        print()
+        print('{green}[√] Scheduled new ArchiveBox cron job for user: {} ({} jobs are active).{reset}'.format(USER, len(existing_jobs), **ANSI))
+        print('\n'.join(f'  > {cmd}' if str(cmd) == str(new_job) else f'    {cmd}' for cmd in existing_jobs))
+        if total_runs > 60 and not quiet:
+            stderr()
+            stderr('{lightyellow}[!] With the current cron config, ArchiveBox is estimated to run >{} times per year.{reset}'.format(total_runs, **ANSI))
+            stderr(f'    Congrats on being an enthusiastic internet archiver! 👌')
+            stderr()
+            stderr('    Make sure you have enough storage space available to hold all the data.')
+            stderr('    Using a compressed/deduped filesystem like ZFS is recommended if you plan on archiving a lot.')
+        raise SystemExit(0)
+
+
+
+
+
+def server(runserver_args: Optional[List[str]]=None, reload: bool=False, out_dir: str=OUTPUT_DIR) -> None:
+    runserver_args = runserver_args or []
+    check_data_folder(out_dir=out_dir)
+
+    setup_django(out_dir)
+    from django.core.management import call_command
+    from django.contrib.auth.models import User
+
+    if IS_TTY and not User.objects.filter(is_superuser=True).exists():
+        print('{lightyellow}[!] No admin users exist yet; you will not be able to edit links in the UI.{reset}'.format(**ANSI))
+        print()
+        print('    To create an admin user, run:')
+        print('        archivebox manage createsuperuser')
+        print()
+
+    print('{green}[+] Starting ArchiveBox webserver...{reset}'.format(**ANSI))
+    if not reload:
+        runserver_args.append('--noreload')
+
+    call_command("runserver", *runserver_args)
+
+
+def manage(args: Optional[List[str]]=None, out_dir: str=OUTPUT_DIR) -> None:
+    check_data_folder(out_dir=out_dir)
+
+    setup_django(out_dir)
+    from django.core.management import execute_from_command_line
+
+    execute_from_command_line([f'{ARCHIVEBOX_BINARY} manage', *(args or ['help'])])
+
+def shell(out_dir: str=OUTPUT_DIR) -> None:
+    check_data_folder(out_dir=out_dir)
+
+    setup_django(out_dir)
+    from django.core.management import call_command
+    call_command("shell_plus")
+
+# Helpers
+
+def printable_config(config: ConfigDict, prefix: str='') -> str:
+    return f'\n{prefix}'.join(
+        f'{key}={val}'
+        for key, val in config.items()
+        if not (isinstance(val, dict) or callable(val))
+    )
+
+def dedupe_jobs(cron: CronTab) -> CronTab:
+    deduped: Set[Tuple[str, str]] = set()
+
+    for job in list(cron):
+        unique_tuple = (str(job.slices), job.command)
+        if unique_tuple not in deduped:
+            deduped.add(unique_tuple)
+        cron.remove(job)
+
+    for schedule, command in deduped:
+        job = cron.new(command=command, comment=CRON_COMMENT)
+        job.setall(schedule)
+        job.enable()
+
+    return cron
+
+
+def print_folder_status(name, folder):
+    if folder['enabled']:
+        if folder['is_valid']:
+            color, symbol, note, num_files = 'green', '√', 'valid', '-'
+        else:
+            color, symbol, note, num_files = 'red', 'X', 'invalid', '?'
+    else:
+        color, symbol, note, num_files = 'lightyellow', '-', 'disabled', '-'
+
+    if folder['path']:
+        if os.path.exists(folder['path']):
+            num_files = (
+                f'{len(os.listdir(folder["path"]))} files'
+                if os.path.isdir(folder['path']) else
+                human_readable_size(os.path.getsize(folder['path']))
+            )
+        else:
+            num_files = 'missing'
+
+        if ' ' in folder['path']:
+            folder['path'] = f'"{folder["path"]}"'
+
+    print(
+        ANSI[color],
+        symbol,
+        ANSI['reset'],
+        name.ljust(22),
+        (folder["path"] or '').ljust(76),
+        num_files.ljust(14),
+        ANSI[color],
+        note,
+        ANSI['reset'],
+    )
+
+
+def print_dependency_version(name, dependency):
+    if dependency['enabled']:
+        if dependency['is_valid']:
+            color, symbol, note = 'green', '√', 'valid'
+            version = 'v' + re.search(r'[\d\.]+', dependency['version'])[0]
+        else:
+            color, symbol, note, version = 'red', 'X', 'invalid', '?'
+    else:
+        color, symbol, note, version = 'lightyellow', '-', 'disabled', '-'
+
+    if dependency["path"] and ' ' in dependency["path"]:
+        dependency["path"] = f'"{dependency["path"]}"'
+
+    print(
+        ANSI[color],
+        symbol,
+        ANSI['reset'],
+        name.ljust(22),
+        (dependency["path"] or '').ljust(76),
+        version.ljust(14),
+        ANSI[color],
+        note,
+        ANSI['reset'],
+    )
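For reference, a standalone sketch of the shell command the `schedule` function above assembles for the crontab entry; the data folder, binary path, and feed URL below are invented examples:

    import os

    # same quoting rule as in schedule(): only wrap values that contain spaces
    quoted = lambda s: f'"{s}"' if s and ' ' in s else s

    out_dir = '/home/user/archive box'                # hypothetical data folder (contains a space)
    archivebox_binary = '/usr/local/bin/archivebox'   # hypothetical install location
    import_path = 'https://example.com/rss.xml'       # hypothetical feed to re-import on each run
    logfile = os.path.join(out_dir, 'logs', 'archivebox.log')

    cmd = ' '.join([
        'cd', quoted(out_dir), '&&',
        quoted(archivebox_binary),
        'add', f'"{import_path}"',
        '>', quoted(logfile), '2>&1',                 # both stdout and stderr go to the log
    ])
    print(cmd)
    # cd "/home/user/archive box" && /usr/local/bin/archivebox add "https://example.com/rss.xml" > "/home/user/archive box/logs/archivebox.log" 2>&1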

+ 68 - 0
archivebox/parsers/__init__.py

@@ -0,0 +1,68 @@
+"""
+Everything related to parsing links from input sources.
+
+For a list of supported services, see the README.md.
+For examples of supported import formats see tests/.
+"""
+
+__package__ = 'archivebox.parsers'
+
+
+from typing import Tuple, List
+
+from ..config import TIMEOUT
+from ..util import (
+    check_url_parsing_invariants,
+    TimedProgress,
+    Link,
+    enforce_types,
+)
+from .pocket_html import parse_pocket_html_export
+from .pinboard_rss import parse_pinboard_rss_export
+from .shaarli_rss import parse_shaarli_rss_export
+from .medium_rss import parse_medium_rss_export
+from .netscape_html import parse_netscape_html_export
+from .generic_rss import parse_generic_rss_export
+from .generic_json import parse_generic_json_export
+from .generic_txt import parse_generic_txt_export
+
+
+@enforce_types
+def parse_links(source_file: str) -> Tuple[List[Link], str]:
+    """parse a list of URLs with their metadata from an 
+       RSS feed, bookmarks export, or text file
+    """
+
+    check_url_parsing_invariants()
+    PARSERS = (
+        # Specialized parsers
+        ('Pocket HTML', parse_pocket_html_export),
+        ('Pinboard RSS', parse_pinboard_rss_export),
+        ('Shaarli RSS', parse_shaarli_rss_export),
+        ('Medium RSS', parse_medium_rss_export),
+        
+        # General parsers
+        ('Netscape HTML', parse_netscape_html_export),
+        ('Generic RSS', parse_generic_rss_export),
+        ('Generic JSON', parse_generic_json_export),
+
+        # Fallback parser
+        ('Plain Text', parse_generic_txt_export),
+    )
+    timer = TimedProgress(TIMEOUT * 4)
+    with open(source_file, 'r', encoding='utf-8') as file:
+        for parser_name, parser_func in PARSERS:
+            try:
+                links = list(parser_func(file))
+                if links:
+                    timer.end()
+                    return links, parser_name
+            except Exception as err:   # noqa
+                # Parsers are tried one by one down the list, and the first one
+                # that succeeds is used. To see why a certain parser was not used
+                # due to error or format incompatibility, uncomment this line:
+                # print('[!] Parser {} failed: {} {}'.format(parser_name, err.__class__.__name__, err))
+                pass
+
+    timer.end()
+    return [], 'Failed to parse'
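A quick usage sketch of the dispatcher, assuming the archivebox package from this commit is importable; the two URLs are placeholders and the import file is created only for the demo:

    import tempfile
    from archivebox.parsers import parse_links

    # write a tiny plain-text import file, the format handled by the fallback parser
    with tempfile.NamedTemporaryFile('w', suffix='.txt', delete=False) as f:
        f.write('https://example.com\nhttps://example.org/page\n')

    links, parser_name = parse_links(f.name)
    print(parser_name)                     # 'Plain Text' (the specialized parsers fail or return nothing first)
    for link in links:
        print(link.timestamp, link.url)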

+ 65 - 0
archivebox/parsers/generic_json.py

@@ -0,0 +1,65 @@
+__package__ = 'archivebox.parsers'
+
+import json
+
+from typing import IO, Iterable
+from datetime import datetime
+
+from ..index.schema import Link
+from ..util import (
+    htmldecode,
+    enforce_types,
+)
+
+
+@enforce_types
+def parse_generic_json_export(json_file: IO[str]) -> Iterable[Link]:
+    """Parse JSON-format bookmarks export files (produced by pinboard.in/export/, or wallabag)"""
+
+    json_file.seek(0)
+    links = json.load(json_file)
+    json_date = lambda s: datetime.strptime(s, '%Y-%m-%dT%H:%M:%S%z')
+
+    for link in links:
+        # example line
+        # {"href":"http:\/\/www.reddit.com\/r\/example","description":"title here","extended":"","meta":"18a973f09c9cc0608c116967b64e0419","hash":"910293f019c2f4bb1a749fb937ba58e3","time":"2014-06-14T15:51:42Z","shared":"no","toread":"no","tags":"reddit android"}]
+        if link:
+            # Parse URL
+            url = link.get('href') or link.get('url') or link.get('URL')
+            if not url:
+                raise Exception('JSON must contain URL in each entry [{"url": "http://...", ...}, ...]')
+
+            # Parse the timestamp
+            ts_str = str(datetime.now().timestamp())
+            if link.get('timestamp'):
+                # chrome/ff histories use a very precise timestamp
+                ts_str = str(link['timestamp'] / 10000000)  
+            elif link.get('time'):
+                ts_str = str(json_date(link['time'].split(',', 1)[0]).timestamp())
+            elif link.get('created_at'):
+                ts_str = str(json_date(link['created_at']).timestamp())
+            elif link.get('created'):
+                ts_str = str(json_date(link['created']).timestamp())
+            elif link.get('date'):
+                ts_str = str(json_date(link['date']).timestamp())
+            elif link.get('bookmarked'):
+                ts_str = str(json_date(link['bookmarked']).timestamp())
+            elif link.get('saved'):
+                ts_str = str(json_date(link['saved']).timestamp())
+            
+            # Parse the title
+            title = None
+            if link.get('title'):
+                title = link['title'].strip()
+            elif link.get('description'):
+                title = link['description'].replace(' — Readability', '').strip()
+            elif link.get('name'):
+                title = link['name'].strip()
+
+            yield Link(
+                url=htmldecode(url),
+                timestamp=ts_str,
+                title=htmldecode(title) or None,
+                tags=htmldecode(link.get('tags')) or '',
+                sources=[json_file.name],
+            )
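A standalone illustration of the string-date fallbacks used above (the numeric 'timestamp' branch is skipped here); the sample entry is invented and only one of the date keys needs to be present:

    from datetime import datetime

    entry = {
        "href": "https://example.com/post",
        "description": "Example bookmark title",
        "time": "2014-06-14T15:51:42+0000",
        "tags": "example demo",
    }

    json_date = lambda s: datetime.strptime(s, '%Y-%m-%dT%H:%M:%S%z')

    # try the date keys in the same order as parse_generic_json_export()
    for key in ('time', 'created_at', 'created', 'date', 'bookmarked', 'saved'):
        if entry.get(key):
            ts_str = str(json_date(entry[key]).timestamp())
            break
    else:
        ts_str = str(datetime.now().timestamp())       # no date at all: fall back to now

    print(ts_str)                      # 1402761102.0
    print(entry['description'])        # Example bookmark title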

+ 49 - 0
archivebox/parsers/generic_rss.py

@@ -0,0 +1,49 @@
+__package__ = 'archivebox.parsers'
+
+
+from typing import IO, Iterable
+from datetime import datetime
+
+from ..index.schema import Link
+from ..util import (
+    htmldecode,
+    enforce_types,
+    str_between,
+)
+
+@enforce_types
+def parse_generic_rss_export(rss_file: IO[str]) -> Iterable[Link]:
+    """Parse RSS XML-format files into links"""
+
+    rss_file.seek(0)
+    items = rss_file.read().split('<item>')
+    items = items[1:] if items else []
+    for item in items:
+        # example item:
+        # <item>
+        # <title><![CDATA[How JavaScript works: inside the V8 engine]]></title>
+        # <category>Unread</category>
+        # <link>https://blog.sessionstack.com/how-javascript-works-inside</link>
+        # <guid>https://blog.sessionstack.com/how-javascript-works-inside</guid>
+        # <pubDate>Mon, 21 Aug 2017 14:21:58 -0500</pubDate>
+        # </item>
+
+        trailing_removed = item.split('</item>', 1)[0]
+        leading_removed = trailing_removed.split('<item>', 1)[-1].strip()
+        rows = leading_removed.split('\n')
+
+        def get_row(key):
+            return [r for r in rows if r.strip().startswith('<{}>'.format(key))][0]
+
+        url = str_between(get_row('link'), '<link>', '</link>')
+        ts_str = str_between(get_row('pubDate'), '<pubDate>', '</pubDate>')
+        time = datetime.strptime(ts_str, "%a, %d %b %Y %H:%M:%S %z")
+        title = str_between(get_row('title'), '<![CDATA[', ']]').strip()
+
+        yield Link(
+            url=htmldecode(url),
+            timestamp=str(time.timestamp()),
+            title=htmldecode(title) or None,
+            tags=None,
+            sources=[rss_file.name],
+        )
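A quick check of the pubDate format string above, using the date from the example item in the comment:

    from datetime import datetime

    ts_str = 'Mon, 21 Aug 2017 14:21:58 -0500'
    time = datetime.strptime(ts_str, "%a, %d %b %Y %H:%M:%S %z")
    print(time.isoformat())      # 2017-08-21T14:21:58-05:00
    print(time.timestamp())      # 1503343318.0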

+ 30 - 0
archivebox/parsers/generic_txt.py

@@ -0,0 +1,30 @@
+__package__ = 'archivebox.parsers'
+__description__ = 'Plain Text'
+
+import re
+
+from typing import IO, Iterable
+from datetime import datetime
+
+from ..index.schema import Link
+from ..util import (
+    htmldecode,
+    enforce_types,
+    URL_REGEX
+)
+
+@enforce_types
+def parse_generic_txt_export(text_file: IO[str]) -> Iterable[Link]:
+    """Parse raw links from each line in a text file"""
+
+    text_file.seek(0)
+    for line in text_file.readlines():
+        urls = re.findall(URL_REGEX, line) if line.strip() else ()
+        for url in urls:                                                # type: ignore
+            yield Link(
+                url=htmldecode(url),
+                timestamp=str(datetime.now().timestamp()),
+                title=None,
+                tags=None,
+                sources=[text_file.name],
+            )
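A standalone sketch of the same line-scanning idea with a simplified pattern; URL_REGEX in archivebox.util is more thorough, and the regex here is only for illustration:

    import re
    from datetime import datetime

    SIMPLE_URL_REGEX = re.compile(r'https?://[^\s<>"]+')   # illustrative only, not the real URL_REGEX

    line = 'see https://example.com/a and https://example.org/b?q=1 for details'
    for url in SIMPLE_URL_REGEX.findall(line):
        print(str(datetime.now().timestamp()), url)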

+ 35 - 0
archivebox/parsers/medium_rss.py

@@ -0,0 +1,35 @@
+__package__ = 'archivebox.parsers'
+
+
+from typing import IO, Iterable
+from datetime import datetime
+
+from xml.etree import ElementTree
+
+from ..index.schema import Link
+from ..util import (
+    htmldecode,
+    enforce_types,
+)
+
+
+@enforce_types
+def parse_medium_rss_export(rss_file: IO[str]) -> Iterable[Link]:
+    """Parse Medium RSS feed files into links"""
+
+    rss_file.seek(0)
+    root = ElementTree.parse(rss_file).getroot()
+    items = root.find("channel").findall("item")                        # type: ignore
+    for item in items:
+        url = item.find("link").text                                    # type: ignore
+        title = item.find("title").text.strip()                         # type: ignore
+        ts_str = item.find("pubDate").text                              # type: ignore
+        time = datetime.strptime(ts_str, "%a, %d %b %Y %H:%M:%S %Z")    # type: ignore
+        
+        yield Link(
+            url=htmldecode(url),
+            timestamp=str(time.timestamp()),
+            title=htmldecode(title) or None,
+            tags=None,
+            sources=[rss_file.name],
+        )

+ 39 - 0
archivebox/parsers/netscape_html.py

@@ -0,0 +1,39 @@
+__package__ = 'archivebox.parsers'
+
+
+import re
+
+from typing import IO, Iterable
+from datetime import datetime
+
+from ..index.schema import Link
+from ..util import (
+    htmldecode,
+    enforce_types,
+)
+
+
+@enforce_types
+def parse_netscape_html_export(html_file: IO[str]) -> Iterable[Link]:
+    """Parse netscape-format bookmarks export files (produced by all browsers)"""
+
+    html_file.seek(0)
+    pattern = re.compile("<a href=\"(.+?)\" add_date=\"(\\d+)\"[^>]*>(.+)</a>", re.UNICODE | re.IGNORECASE)
+    for line in html_file:
+        # example line
+        # <DT><A HREF="https://example.com/?q=1+2" ADD_DATE="1497562974" LAST_MODIFIED="1497562974" ICON_URI="https://example.com/favicon.ico" ICON="data:image/png;base64,...">example bookmark title</A>
+        
+        match = pattern.search(line)
+        if match:
+            url = match.group(1)
+            time = datetime.fromtimestamp(float(match.group(2)))
+            title = match.group(3).strip()
+
+            yield Link(
+                url=htmldecode(url),
+                timestamp=str(time.timestamp()),
+                title=htmldecode(title) or None,
+                tags=None,
+                sources=[html_file.name],
+            )
+
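The regex above applied to a line like the example in the comment (shortened here), showing what the three capture groups hold:

    import re
    from datetime import datetime

    pattern = re.compile("<a href=\"(.+?)\" add_date=\"(\\d+)\"[^>]*>(.+)</a>", re.UNICODE | re.IGNORECASE)
    line = '<DT><A HREF="https://example.com/?q=1+2" ADD_DATE="1497562974" ICON="...">example bookmark title</A>'

    match = pattern.search(line)
    if match:
        print(match.group(1))                                    # https://example.com/?q=1+2
        print(datetime.fromtimestamp(float(match.group(2))))     # 2017-06-15 ... (local time)
        print(match.group(3))                                    # example bookmark title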

+ 47 - 0
archivebox/parsers/pinboard_rss.py

@@ -0,0 +1,47 @@
+__package__ = 'archivebox.parsers'
+
+
+from typing import IO, Iterable
+from datetime import datetime
+
+from xml.etree import ElementTree
+
+from ..index.schema import Link
+from ..util import (
+    htmldecode,
+    enforce_types,
+)
+
+
+@enforce_types
+def parse_pinboard_rss_export(rss_file: IO[str]) -> Iterable[Link]:
+    """Parse Pinboard RSS feed files into links"""
+
+    rss_file.seek(0)
+    root = ElementTree.parse(rss_file).getroot()
+    items = root.findall("{http://purl.org/rss/1.0/}item")
+    for item in items:
+        find = lambda p: item.find(p).text.strip() if item.find(p) is not None and item.find(p).text else None    # type: ignore
+
+        url = find("{http://purl.org/rss/1.0/}link")
+        tags = find("{http://purl.org/dc/elements/1.1/}subject")
+        title = find("{http://purl.org/rss/1.0/}title")
+        ts_str = find("{http://purl.org/dc/elements/1.1/}date")
+        
+        # Pinboard includes a colon in its date stamp timezone offsets, which
+        # Python can't parse. Remove it:
+        if ts_str and ts_str[-3:-2] == ":":
+            ts_str = ts_str[:-3]+ts_str[-2:]
+
+        if ts_str:
+            time = datetime.strptime(ts_str, "%Y-%m-%dT%H:%M:%S%z")
+        else:
+            time = datetime.now()
+
+        yield Link(
+            url=htmldecode(url),
+            timestamp=str(time.timestamp()),
+            title=htmldecode(title) or None,
+            tags=htmldecode(tags) or None,
+            sources=[rss_file.name],
+        )
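What the offset-colon fixup above does to a Pinboard-style date string (the example value is invented); recent Python versions accept the colon form in %z directly, but stripping it keeps older versions working:

    from datetime import datetime

    ts_str = '2019-01-30T06:06:01+00:00'
    if ts_str and ts_str[-3:-2] == ':':
        ts_str = ts_str[:-3] + ts_str[-2:]      # '+00:00' -> '+0000'

    print(ts_str)                                                            # 2019-01-30T06:06:01+0000
    print(datetime.strptime(ts_str, "%Y-%m-%dT%H:%M:%S%z").timestamp())      # 1548828361.0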

+ 38 - 0
archivebox/parsers/pocket_html.py

@@ -0,0 +1,38 @@
+__package__ = 'archivebox.parsers'
+
+
+import re
+
+from typing import IO, Iterable
+from datetime import datetime
+
+from ..index.schema import Link
+from ..util import (
+    htmldecode,
+    enforce_types,
+)
+
+
+@enforce_types
+def parse_pocket_html_export(html_file: IO[str]) -> Iterable[Link]:
+    """Parse Pocket-format bookmarks export files (produced by getpocket.com/export/)"""
+
+    html_file.seek(0)
+    pattern = re.compile("^\\s*<li><a href=\"(.+)\" time_added=\"(\\d+)\" tags=\"(.*)\">(.+)</a></li>", re.UNICODE)
+    for line in html_file:
+        # example line
+        # <li><a href="http://example.com/" time_added="1478739709" tags="tag1,tag2">example title</a></li>
+        match = pattern.search(line)
+        if match:
+            url = match.group(1).replace('http://www.readability.com/read?url=', '')           # remove old readability prefixes to get original url
+            time = datetime.fromtimestamp(float(match.group(2)))
+            tags = match.group(3)
+            title = match.group(4).replace(' — Readability', '').replace('http://www.readability.com/read?url=', '')
+            
+            yield Link(
+                url=htmldecode(url),
+                timestamp=str(time.timestamp()),
+                title=htmldecode(title) or None,
+                tags=tags or '',
+                sources=[html_file.name],
+            )

+ 50 - 0
archivebox/parsers/shaarli_rss.py

@@ -0,0 +1,50 @@
+__package__ = 'archivebox.parsers'
+
+
+from typing import IO, Iterable
+from datetime import datetime
+
+from ..index.schema import Link
+from ..util import (
+    htmldecode,
+    enforce_types,
+    str_between,
+)
+
+
+@enforce_types
+def parse_shaarli_rss_export(rss_file: IO[str]) -> Iterable[Link]:
+    """Parse Shaarli-specific RSS XML-format files into links"""
+
+    rss_file.seek(0)
+    entries = rss_file.read().split('<entry>')[1:]
+    for entry in entries:
+        # example entry:
+        # <entry>
+        #   <title>Aktuelle Trojaner-Welle: Emotet lauert in gefälschten Rechnungsmails | heise online</title>
+        #   <link href="https://www.heise.de/security/meldung/Aktuelle-Trojaner-Welle-Emotet-lauert-in-gefaelschten-Rechnungsmails-4291268.html" />
+        #   <id>https://demo.shaarli.org/?cEV4vw</id>
+        #   <published>2019-01-30T06:06:01+00:00</published>
+        #   <updated>2019-01-30T06:06:01+00:00</updated>
+        #   <content type="html" xml:lang="en"><![CDATA[<div class="markdown"><p>&#8212; <a href="https://demo.shaarli.org/?cEV4vw">Permalink</a></p></div>]]></content>
+        # </entry>
+
+        trailing_removed = entry.split('</entry>', 1)[0]
+        leading_removed = trailing_removed.strip()
+        rows = leading_removed.split('\n')
+
+        def get_row(key):
+            return [r.strip() for r in rows if r.strip().startswith('<{}'.format(key))][0]
+
+        title = str_between(get_row('title'), '<title>', '</title>').strip()
+        url = str_between(get_row('link'), '<link href="', '" />')
+        ts_str = str_between(get_row('published'), '<published>', '</published>')
+        time = datetime.strptime(ts_str, "%Y-%m-%dT%H:%M:%S%z")
+
+        yield Link(
+            url=htmldecode(url),
+            timestamp=str(time.timestamp()),
+            title=htmldecode(title) or None,
+            tags=None,
+            sources=[rss_file.name],
+        )

+ 0 - 0
archivebox/legacy/templates/favicon.ico → archivebox/themes/legacy/favicon.ico


+ 0 - 0
archivebox/legacy/templates/link_details.html → archivebox/themes/legacy/link_details.html


+ 0 - 0
archivebox/legacy/templates/main_index.html → archivebox/themes/legacy/main_index.html


+ 0 - 0
archivebox/legacy/templates/main_index_row.html → archivebox/themes/legacy/main_index_row.html


+ 0 - 0
archivebox/legacy/templates/robots.txt → archivebox/themes/legacy/robots.txt


+ 0 - 0
archivebox/legacy/templates/static/archive.png → archivebox/themes/legacy/static/archive.png


+ 0 - 0
archivebox/legacy/templates/static/bootstrap.min.css → archivebox/themes/legacy/static/bootstrap.min.css


+ 0 - 0
archivebox/legacy/templates/static/external.png → archivebox/themes/legacy/static/external.png


+ 0 - 0
archivebox/legacy/templates/static/jquery.dataTables.min.css → archivebox/themes/legacy/static/jquery.dataTables.min.css


+ 0 - 0
archivebox/legacy/templates/static/jquery.dataTables.min.js → archivebox/themes/legacy/static/jquery.dataTables.min.js


+ 0 - 0
archivebox/legacy/templates/static/jquery.min.js → archivebox/themes/legacy/static/jquery.min.js


+ 0 - 0
archivebox/legacy/templates/static/sort_asc.png → archivebox/themes/legacy/static/sort_asc.png


+ 0 - 0
archivebox/legacy/templates/static/sort_both.png → archivebox/themes/legacy/static/sort_both.png


+ 0 - 0
archivebox/legacy/templates/static/sort_desc.png → archivebox/themes/legacy/static/sort_desc.png


+ 0 - 0
archivebox/legacy/templates/static/spinner.gif → archivebox/themes/legacy/static/spinner.gif


+ 48 - 18
archivebox/legacy/util.py → archivebox/util.py

@@ -1,6 +1,7 @@
 import os
 import re
 import sys
+import ssl
 import json
 import time
 import shutil
@@ -8,7 +9,7 @@ import argparse
 
 from string import Template
 from json import JSONEncoder
-from typing import List, Optional, Any, Union, IO, Mapping, Tuple
+from typing import List, Dict, Optional, Any, Union, IO, Mapping, Tuple
 from inspect import signature
 from functools import wraps
 from hashlib import sha256
@@ -28,11 +29,12 @@ from subprocess import (
 
 from base32_crockford import encode as base32_encode         # type: ignore
 
-from .schema import Link
+from .index.schema import Link
 from .config import (
     ANSI,
     TERM_WIDTH,
-    SOURCES_DIR,
+    OUTPUT_DIR,
+    SOURCES_DIR_NAME,
     OUTPUT_PERMISSIONS,
     TIMEOUT,
     SHOW_PROGRESS,
@@ -40,8 +42,9 @@ from .config import (
     CHECK_SSL_VALIDITY,
     WGET_USER_AGENT,
     CHROME_OPTIONS,
+    check_data_folder,
 )
-from .logs import pretty_path
+from .cli.logging import pretty_path
 
 ### Parsing Helpers
 
@@ -187,31 +190,36 @@ def check_url_parsing_invariants() -> None:
 ### Random Helpers
 
 @enforce_types
-def handle_stdin_import(raw_text: str) -> str:
-    if not os.path.exists(SOURCES_DIR):
-        os.makedirs(SOURCES_DIR)
+def save_stdin_to_sources(raw_text: str, out_dir: str=OUTPUT_DIR) -> str:
+    check_data_folder(out_dir=out_dir)
+
+    sources_dir = os.path.join(out_dir, SOURCES_DIR_NAME)
+    if not os.path.exists(sources_dir):
+        os.makedirs(sources_dir)
 
     ts = str(datetime.now().timestamp()).split('.', 1)[0]
 
-    source_path = os.path.join(SOURCES_DIR, '{}-{}.txt'.format('stdin', ts))
+    source_path = os.path.join(sources_dir, '{}-{}.txt'.format('stdin', ts))
 
     atomic_write(raw_text, source_path)
     return source_path
 
 
 @enforce_types
-def handle_file_import(path: str, timeout: int=TIMEOUT) -> str:
+def save_file_to_sources(path: str, timeout: int=TIMEOUT, out_dir: str=OUTPUT_DIR) -> str:
     """download a given url's content into output/sources/domain-<timestamp>.txt"""
+    check_data_folder(out_dir=out_dir)
 
-    if not os.path.exists(SOURCES_DIR):
-        os.makedirs(SOURCES_DIR)
+    sources_dir = os.path.join(out_dir, SOURCES_DIR_NAME)
+    if not os.path.exists(sources_dir):
+        os.makedirs(sources_dir)
 
     ts = str(datetime.now().timestamp()).split('.', 1)[0]
 
-    source_path = os.path.join(SOURCES_DIR, '{}-{}.txt'.format(basename(path), ts))
+    source_path = os.path.join(sources_dir, '{}-{}.txt'.format(basename(path), ts))
 
     if any(path.startswith(s) for s in ('http://', 'https://', 'ftp://')):
-        source_path = os.path.join(SOURCES_DIR, '{}-{}.txt'.format(domain(path), ts))
+        source_path = os.path.join(sources_dir, '{}-{}.txt'.format(domain(path), ts))
         print('{}[*] [{}] Downloading {}{}'.format(
             ANSI['green'],
             datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
@@ -532,7 +540,6 @@ def download_url(url: str, timeout: int=TIMEOUT) -> str:
     if CHECK_SSL_VALIDITY:
         resp = urlopen(req, timeout=timeout)
     else:
-        import ssl
         insecure = ssl._create_unverified_context()
         resp = urlopen(req, timeout=timeout, context=insecure)
 
@@ -662,7 +669,7 @@ def to_json(obj: Any, file: IO=None, indent: Optional[int]=4, sort_keys: bool=Tr
         return json.dumps(obj, indent=indent, sort_keys=sort_keys, cls=ExtendedEncoder)
 
 
-def to_csv(links: List[Link], csv_cols: Optional[List[str]]=None,
+def links_to_csv(links: List[Link], csv_cols: Optional[List[str]]=None,
            header: bool=True, ljust: int=0, separator: str=',') -> str:
     csv_cols = csv_cols or ['timestamp', 'is_archived', 'url']
     
@@ -677,6 +684,8 @@ def to_csv(links: List[Link], csv_cols: Optional[List[str]]=None,
 
     return '\n'.join((header_str, *row_strs))
 
+def folders_to_str(folders: Dict[str, Optional[Link]]) -> str:
+    return '\n'.join(f'{folder} {link}' for folder, link in folders.items())
 
 @enforce_types
 def render_template(template_path: str, context: Mapping[str, str]) -> str:
@@ -713,11 +722,11 @@ def atomic_write(contents: Union[dict, str, bytes], path: str) -> None:
             os.remove(tmp_file)
 
 
-def reject_stdin(caller: str) -> None:
+def reject_stdin(caller: str, stdin: Optional[IO]=sys.stdin) -> None:
     """Tell the user they passed stdin to a command that doesn't accept it"""
 
-    if not sys.stdin.isatty():
-        stdin_raw_text = sys.stdin.read().strip()
+    if stdin and not stdin.isatty():
+        stdin_raw_text = stdin.read().strip()
        if stdin_raw_text:
            print(
                '{red}[X] The "{}" command does not accept stdin.{reset}\n'.format(
@@ -731,9 +740,30 @@ def reject_stdin(caller: str) -> None:
             print()
             raise SystemExit(1)
 
+def accept_stdin(stdin: Optional[IO]=sys.stdin) -> Optional[str]:
+    if stdin and not stdin.isatty():
+        return stdin.read()
+    return None
+
+
+def set_docstring(text: str):
+    def decorator(func):
+        @wraps(func)
+        def wrapper_with_docstring(*args, **kwargs):
+            return func(*args, **kwargs)
+        wrapper_with_docstring.__doc__ = text
+        return wrapper_with_docstring
+    return decorator
+
 
 class SmartFormatter(argparse.HelpFormatter):
     def _split_lines(self, text, width):
         if '\n' in text:
             return text.splitlines()
         return argparse.HelpFormatter._split_lines(self, text, width)
+
+
+class ArchiveError(Exception):
+    def __init__(self, message, hints=None):
+        super().__init__(message)
+        self.hints = hints
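A hypothetical pair of subcommands showing how the new stdin helpers are meant to be used together: commands that accept input read it with accept_stdin(), everything else calls reject_stdin() so piped data is never silently ignored. The command names are made up, and the snippet assumes the archivebox package from this commit is importable:

    import sys
    from archivebox.util import accept_stdin, reject_stdin

    def add_command(args):
        piped = accept_stdin(sys.stdin)          # None when running interactively
        urls = piped.split() if piped else list(args)
        print(f'importing {len(urls)} urls')

    def status_command():
        reject_stdin('status', sys.stdin)        # prints an error and exits if stdin is piped
        print('everything looks fine')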