# archivebox_add.py
  1. #!/usr/bin/env python3
  2. __package__ = 'archivebox.cli'
  3. __command__ = 'archivebox add'
  4. import sys
  5. import argparse
  6. from typing import List, Optional, IO
  7. from archivebox.misc.util import docstring
  8. from archivebox.config import DATA_DIR
  9. from archivebox.config.common import ARCHIVING_CONFIG
  10. from ..main import add
  11. from ..parsers import PARSERS
  12. from ..logging_util import SmartFormatter, accept_stdin, stderr
  13. @docstring(add.__doc__)
  14. def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional[str]=None) -> None:
  15. parser = argparse.ArgumentParser(
  16. prog=__command__,
  17. description=add.__doc__,
  18. add_help=True,
  19. formatter_class=SmartFormatter,
  20. )
  21. parser.add_argument(
  22. '--tag', '-t',
  23. type=str,
  24. default='',
  25. help="Tag the added URLs with the provided tags e.g. --tag=tag1,tag2,tag3",
  26. )
  27. parser.add_argument(
  28. '--update', #'-u',
  29. action='store_true',
  30. default=not ARCHIVING_CONFIG.ONLY_NEW, # when ONLY_NEW=True we skip updating old links
  31. help="Also retry previously skipped/failed links when adding new links",
  32. )
  33. parser.add_argument(
  34. '--update-all', #'-n',
  35. action='store_true',
  36. default=False,
  37. help="Also update ALL links in index when finished adding new links",
  38. )
  39. parser.add_argument(
  40. '--index-only', #'-o',
  41. action='store_true',
  42. help="Add the links to the main index without archiving them",
  43. )
  44. parser.add_argument(
  45. 'urls',
  46. nargs='*',
  47. type=str,
  48. default=None,
  49. help=(
  50. 'URLs or paths to archive e.g.:\n'
  51. ' https://getpocket.com/users/USERNAME/feed/all\n'
  52. ' https://example.com/some/rss/feed.xml\n'
  53. ' https://example.com\n'
  54. ' ~/Downloads/firefox_bookmarks_export.html\n'
  55. ' ~/Desktop/sites_list.csv\n'
  56. )
  57. )
  58. parser.add_argument(
  59. "--depth",
  60. action="store",
  61. default=0,
  62. choices=[0, 1],
  63. type=int,
  64. help="Recursively archive all linked pages up to this many hops away"
  65. )
  66. parser.add_argument(
  67. "--overwrite",
  68. default=False,
  69. action="store_true",
  70. help="Re-archive URLs from scratch, overwriting any existing files"
  71. )
  72. parser.add_argument(
  73. "--init", #'-i',
  74. action='store_true',
  75. help="Init/upgrade the curent data directory before adding",
  76. )
  77. parser.add_argument(
  78. "--extract",
  79. type=str,
  80. help="Pass a list of the extractors to be used. If the method name is not correct, it will be ignored. \
  81. This does not take precedence over the configuration",
  82. default=""
  83. )
  84. parser.add_argument(
  85. "--parser",
  86. type=str,
  87. help="Parser used to read inputted URLs.",
  88. default="auto",
  89. choices=["auto", *PARSERS.keys()],
  90. )
  91. command = parser.parse_args(args or ())
  92. urls = command.urls
  93. stdin_urls = ''
  94. if not urls:
  95. stdin_urls = accept_stdin(stdin)
  96. if (stdin_urls and urls) or (not stdin and not urls):
  97. stderr(
  98. '[X] You must pass URLs/paths to add via stdin or CLI arguments.\n',
  99. color='red',
  100. )
  101. raise SystemExit(2)
  102. add(
  103. urls=stdin_urls or urls,
  104. depth=command.depth,
  105. tag=command.tag,
  106. update=command.update,
  107. update_all=command.update_all,
  108. index_only=command.index_only,
  109. overwrite=command.overwrite,
  110. init=command.init,
  111. extractors=command.extract,
  112. parser=command.parser,
  113. out_dir=pwd or DATA_DIR,
  114. )
# Script entry point: forward CLI argv (minus program name) and stdin to main().
if __name__ == '__main__':
    main(args=sys.argv[1:], stdin=sys.stdin)
  117. # TODO: Implement these
  118. #
  119. # parser.add_argument(
  120. # '--mirror', #'-m',
  121. # action='store_true',
  122. # help='Archive an entire site (finding all linked pages below it on the same domain)',
  123. # )
  124. # parser.add_argument(
  125. # '--crawler', #'-r',
  126. # choices=('depth_first', 'breadth_first'),
  127. # help='Controls which crawler to use in order to find outlinks in a given page',
  128. # default=None,
  129. # )