archivebox_add.py 3.5 KB

#!/usr/bin/env python3

__package__ = 'archivebox.cli'
__command__ = 'archivebox add'

import sys
import argparse

from typing import List, Optional, IO

from ..main import add
from ..util import docstring
from ..config import OUTPUT_DIR, ONLY_NEW
from ..logging_util import SmartFormatter, accept_stdin, stderr
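
# thin CLI wrapper: parses the `archivebox add` arguments and forwards them to main.add()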

@docstring(add.__doc__)
def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional[str]=None) -> None:
    parser = argparse.ArgumentParser(
        prog=__command__,
        description=add.__doc__,
        add_help=True,
        formatter_class=SmartFormatter,
    )
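    # each flag below maps onto a keyword argument of the add() call at the end of main()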
    parser.add_argument(
        '--tag', '-t',
        type=str,
        default='',
        help="Tag the added URLs with the provided tags e.g. --tag=tag1,tag2,tag3",
    )
    parser.add_argument(
        '--update-all', #'-n',
        action='store_true',
        default=not ONLY_NEW,  # when ONLY_NEW=True we skip updating old links
        help="Also retry previously skipped/failed links when adding new links",
    )
    parser.add_argument(
        '--index-only', #'-o',
        action='store_true',
        help="Add the links to the main index without archiving them",
    )
    parser.add_argument(
        'urls',
        nargs='*',
        type=str,
        default=None,
        help=(
            'URLs or paths to archive e.g.:\n'
            '    https://getpocket.com/users/USERNAME/feed/all\n'
            '    https://example.com/some/rss/feed.xml\n'
            '    https://example.com\n'
            '    ~/Downloads/firefox_bookmarks_export.html\n'
            '    ~/Desktop/sites_list.csv\n'
        )
    )
    parser.add_argument(
        "--depth",
        action="store",
        default=0,
        choices=[0, 1],
        type=int,
        help="Recursively archive all linked pages up to this many hops away"
    )
    parser.add_argument(
        "--overwrite",
        default=False,
        action="store_true",
        help="Re-archive URLs from scratch, overwriting any existing files"
    )
    parser.add_argument(
        "--init", #'-i',
        action='store_true',
        help="Init/upgrade the current data directory before adding",
    )
    parser.add_argument(
        "--extract",
        type=str,
        help="Pass a list of the extractors to be used. If the method name is not correct, it will be ignored. \
              This does not take precedence over the configuration",
        default=""
    )
    command = parser.parse_args(args or ())
    urls = command.urls
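
    # fall back to reading URLs/paths from stdin when none are passed as arguments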
    stdin_urls = ''
    if not urls:
        stdin_urls = accept_stdin(stdin)
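
    # refuse ambiguous input: URLs passed both via stdin and as arguments, or neither way at all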
    if (stdin_urls and urls) or (not stdin and not urls):
        stderr(
            '[X] You must pass URLs/paths to add via stdin or CLI arguments.\n',
            color='red',
        )
        raise SystemExit(2)
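
    # hand the parsed options off to the main add() implementation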
    add(
        urls=stdin_urls or urls,
        depth=command.depth,
        tag=command.tag,
        update_all=command.update_all,
        index_only=command.index_only,
        overwrite=command.overwrite,
        init=command.init,
        extractors=command.extract,
        out_dir=pwd or OUTPUT_DIR,
    )


if __name__ == '__main__':
    main(args=sys.argv[1:], stdin=sys.stdin)


# TODO: Implement these
#
# parser.add_argument(
#     '--mirror', #'-m',
#     action='store_true',
#     help='Archive an entire site (finding all linked pages below it on the same domain)',
# )
# parser.add_argument(
#     '--crawler', #'-r',
#     choices=('depth_first', 'breadth_first'),
#     help='Controls which crawler to use in order to find outlinks in a given page',
#     default=None,
# )