archivebox_add.py

#!/usr/bin/env python3

__package__ = 'archivebox.cli'
__command__ = 'archivebox add'

import sys
import argparse

from typing import List, Optional, IO

from ..main import add
from ..util import docstring
from ..parsers import PARSERS
from ..config import OUTPUT_DIR, ONLY_NEW
from ..logging_util import SmartFormatter, accept_stdin, stderr


@docstring(add.__doc__)
def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional[str]=None) -> None:
    parser = argparse.ArgumentParser(
        prog=__command__,
        description=add.__doc__,
        add_help=True,
        formatter_class=SmartFormatter,
    )
    parser.add_argument(
        '--tag', '-t',
        type=str,
        default='',
        help="Tag the added URLs with the provided tags e.g. --tag=tag1,tag2,tag3",
    )
    parser.add_argument(
        '--update', #'-u',
        action='store_true',
        default=not ONLY_NEW,  # when ONLY_NEW=True we skip updating old links
        help="Also retry previously skipped/failed links when adding new links",
    )
    parser.add_argument(
        '--update-all', #'-n',
        action='store_true',
        default=False,
        help="Also update ALL links in index when finished adding new links",
    )
    parser.add_argument(
        '--index-only', #'-o',
        action='store_true',
        help="Add the links to the main index without archiving them",
    )
    parser.add_argument(
        'urls',
        nargs='*',
        type=str,
        default=None,
        help=(
            'URLs or paths to archive e.g.:\n'
            '    https://getpocket.com/users/USERNAME/feed/all\n'
            '    https://example.com/some/rss/feed.xml\n'
            '    https://example.com\n'
            '    ~/Downloads/firefox_bookmarks_export.html\n'
            '    ~/Desktop/sites_list.csv\n'
        )
    )
    parser.add_argument(
        "--depth",
        action="store",
        default=0,
        choices=[0, 1],
        type=int,
        help="Recursively archive all linked pages up to this many hops away"
    )
    parser.add_argument(
        "--overwrite",
        default=False,
        action="store_true",
        help="Re-archive URLs from scratch, overwriting any existing files"
    )
    parser.add_argument(
        "--init", #'-i',
        action='store_true',
        help="Init/upgrade the current data directory before adding",
    )
    parser.add_argument(
        "--extract",
        type=str,
        help="Pass a list of the extractors to use. Unknown extractor names are ignored. "
             "This does not take precedence over the configuration.",
        default=""
    )
    parser.add_argument(
        "--parser",
        type=str,
        help="Parser to use for reading the input URLs.",
        default="auto",
        choices=["auto", *PARSERS.keys()],
    )

    command = parser.parse_args(args or ())
    urls = command.urls

    # fall back to reading URLs from stdin only when no positional args were given
    stdin_urls = ''
    if not urls:
        stdin_urls = accept_stdin(stdin)

    if (stdin_urls and urls) or (not stdin_urls and not urls):
        stderr(
            '[X] You must pass URLs/paths to add via stdin or CLI arguments.\n',
            color='red',
        )
        raise SystemExit(2)

    add(
        urls=stdin_urls or urls,
        depth=command.depth,
        tag=command.tag,
        update=command.update,
        update_all=command.update_all,
        index_only=command.index_only,
        overwrite=command.overwrite,
        init=command.init,
        extractors=command.extract,
        parser=command.parser,
        out_dir=pwd or OUTPUT_DIR,
    )


if __name__ == '__main__':
    main(args=sys.argv[1:], stdin=sys.stdin)

# TODO: Implement these
#
# parser.add_argument(
#     '--mirror', #'-m',
#     action='store_true',
#     help='Archive an entire site (finding all linked pages below it on the same domain)',
# )
# parser.add_argument(
#     '--crawler', #'-r',
#     choices=('depth_first', 'breadth_first'),
#     help='Controls which crawler to use in order to find outlinks in a given page',
#     default=None,
# )
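
# Illustrative usage (editor's sketch, not part of the original file): example
# invocations of this subcommand, based only on the flags defined above. Exact
# behavior depends on the ArchiveBox version and the collection's configuration.
#
#   archivebox add 'https://example.com'
#   archivebox add --depth=1 'https://example.com'
#   archivebox add --tag=news,tech 'https://example.com/some/rss/feed.xml'
#   echo 'https://example.com' | archivebox add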