archivebox_add.py

#!/usr/bin/env python3

__package__ = 'archivebox.cli'
__command__ = 'archivebox add'

import sys
import argparse

from typing import List, Optional, IO

from ..main import add
from ..util import docstring
from ..config import OUTPUT_DIR, ONLY_NEW
from ..logging_util import SmartFormatter, accept_stdin, stderr


@docstring(add.__doc__)
def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional[str]=None) -> None:
    parser = argparse.ArgumentParser(
        prog=__command__,
        description=add.__doc__,
        add_help=True,
        formatter_class=SmartFormatter,
    )
    parser.add_argument(
        '--update-all',  #'-n',
        action='store_true',
        default=not ONLY_NEW,  # when ONLY_NEW=True we skip updating old links
        help="Also retry previously skipped/failed links when adding new links",
    )
    parser.add_argument(
        '--index-only',  #'-o',
        action='store_true',
        help="Add the links to the main index without archiving them",
    )
    parser.add_argument(
        'urls',
        nargs='*',
        type=str,
        default=None,
        help=(
            'URLs or paths to archive e.g.:\n'
            '    https://getpocket.com/users/USERNAME/feed/all\n'
            '    https://example.com/some/rss/feed.xml\n'
            '    https://example.com\n'
            '    ~/Downloads/firefox_bookmarks_export.html\n'
            '    ~/Desktop/sites_list.csv\n'
        ),
    )
    parser.add_argument(
        "--depth",
        action="store",
        default=0,
        choices=[0, 1],
        type=int,
        help="Recursively archive all linked pages up to this many hops away",
    )
    parser.add_argument(
        "--overwrite",
        default=False,
        action="store_true",
        help="Re-archive URLs from scratch, overwriting any existing files",
    )
    parser.add_argument(
        "--init",  #'-i',
        action='store_true',
        help="Init/upgrade the current data directory before adding",
    )
    parser.add_argument(
        "--extract",
        type=str,
        help="Pass a list of the extractors to be used. If an extractor name is not "
             "recognized, it is ignored. This does not take precedence over the configuration.",
        default="",
    )

    command = parser.parse_args(args or ())
    urls = command.urls
    stdin_urls = accept_stdin(stdin)

    # URLs must come from exactly one source: piped via stdin or passed as CLI arguments
    if (stdin_urls and urls) or (not stdin_urls and not urls):
        stderr(
            '[X] You must pass URLs/paths to add via stdin or CLI arguments.\n',
            color='red',
        )
        raise SystemExit(2)

    add(
        urls=stdin_urls or urls,
        depth=command.depth,
        update_all=command.update_all,
        index_only=command.index_only,
        overwrite=command.overwrite,
        init=command.init,
        extractors=command.extract,
        out_dir=pwd or OUTPUT_DIR,
    )


if __name__ == '__main__':
    main(args=sys.argv[1:], stdin=sys.stdin)
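
# Example invocations (a sketch based on the arguments defined above; assumes the
# `archivebox` CLI is installed and run inside an initialized data directory):
#
#   archivebox add 'https://example.com'
#   archivebox add --depth=1 'https://example.com/some/rss/feed.xml'
#   echo 'https://example.com' | archivebox add --index-only
#   archivebox add --overwrite 'https://example.com'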

# TODO: Implement these
#
# parser.add_argument(
#     '--mirror', #'-m',
#     action='store_true',
#     help='Archive an entire site (finding all linked pages below it on the same domain)',
# )
# parser.add_argument(
#     '--crawler', #'-r',
#     choices=('depth_first', 'breadth_first'),
#     help='Controls which crawler to use in order to find outlinks in a given page',
#     default=None,
# )