archivebox_update.py 4.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136
  1. #!/usr/bin/env python3
# Pin the package name explicitly; presumably this lets the relative imports
# below (`from ..index import ...`) resolve when the file is run directly as a
# script rather than imported as part of the package -- TODO confirm.
__package__ = 'archivebox.cli'
# Human-readable command name; used as `prog` for the argparse parser in main().
__command__ = 'archivebox update'
  4. import sys
  5. import argparse
  6. from pathlib import Path
  7. from typing import List, Optional, IO
  8. from archivebox.misc.util import docstring
  9. from archivebox.config import DATA_DIR
  10. from ..index import (
  11. LINK_FILTERS,
  12. get_indexed_folders,
  13. get_archived_folders,
  14. get_unarchived_folders,
  15. get_present_folders,
  16. get_valid_folders,
  17. get_invalid_folders,
  18. get_duplicate_folders,
  19. get_orphaned_folders,
  20. get_corrupted_folders,
  21. get_unrecognized_folders,
  22. )
  23. from ..logging_util import SmartFormatter, accept_stdin
  24. from ..main import update
  25. @docstring(update.__doc__)
  26. def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional[str]=None) -> None:
  27. parser = argparse.ArgumentParser(
  28. prog=__command__,
  29. description=update.__doc__,
  30. add_help=True,
  31. formatter_class=SmartFormatter,
  32. )
  33. parser.add_argument(
  34. '--only-new', #'-n',
  35. action='store_true',
  36. help="Don't attempt to retry previously skipped/failed links when updating",
  37. )
  38. parser.add_argument(
  39. '--index-only', #'-o',
  40. action='store_true',
  41. help="Update the main index without archiving any content",
  42. )
  43. parser.add_argument(
  44. '--resume', #'-r',
  45. type=float,
  46. help='Resume the update process from a given timestamp',
  47. default=None,
  48. )
  49. parser.add_argument(
  50. '--overwrite', #'-x',
  51. action='store_true',
  52. help='Ignore existing archived content and overwrite with new versions (DANGEROUS)',
  53. )
  54. parser.add_argument(
  55. '--before', #'-b',
  56. type=float,
  57. help="Update only links bookmarked before the given timestamp.",
  58. default=None,
  59. )
  60. parser.add_argument(
  61. '--after', #'-a',
  62. type=float,
  63. help="Update only links bookmarked after the given timestamp.",
  64. default=None,
  65. )
  66. parser.add_argument(
  67. '--status',
  68. type=str,
  69. choices=('indexed', 'archived', 'unarchived', 'present', 'valid', 'invalid', 'duplicate', 'orphaned', 'corrupted', 'unrecognized'),
  70. default='indexed',
  71. help=(
  72. 'Update only links or data directories that have the given status\n'
  73. f' indexed {get_indexed_folders.__doc__} (the default)\n'
  74. f' archived {get_archived_folders.__doc__}\n'
  75. f' unarchived {get_unarchived_folders.__doc__}\n'
  76. '\n'
  77. f' present {get_present_folders.__doc__}\n'
  78. f' valid {get_valid_folders.__doc__}\n'
  79. f' invalid {get_invalid_folders.__doc__}\n'
  80. '\n'
  81. f' duplicate {get_duplicate_folders.__doc__}\n'
  82. f' orphaned {get_orphaned_folders.__doc__}\n'
  83. f' corrupted {get_corrupted_folders.__doc__}\n'
  84. f' unrecognized {get_unrecognized_folders.__doc__}\n'
  85. )
  86. )
  87. parser.add_argument(
  88. '--filter-type', '-t',
  89. type=str,
  90. choices=(*LINK_FILTERS.keys(), 'search'),
  91. default='exact',
  92. help='Type of pattern matching to use when filtering URLs',
  93. )
  94. parser.add_argument(
  95. 'filter_patterns',
  96. nargs='*',
  97. type=str,
  98. default=None,
  99. help='Update only URLs matching these filter patterns.'
  100. )
  101. parser.add_argument(
  102. "--extract",
  103. type=str,
  104. help="Pass a list of the extractors to be used. If the method name is not correct, it will be ignored. \
  105. This does not take precedence over the configuration",
  106. default=""
  107. )
  108. command = parser.parse_args(args or ())
  109. filter_patterns_str = None
  110. if not command.filter_patterns:
  111. filter_patterns_str = accept_stdin(stdin)
  112. update(
  113. resume=command.resume,
  114. only_new=command.only_new,
  115. index_only=command.index_only,
  116. overwrite=command.overwrite,
  117. filter_patterns_str=filter_patterns_str,
  118. filter_patterns=command.filter_patterns,
  119. filter_type=command.filter_type,
  120. status=command.status,
  121. after=command.after,
  122. before=command.before,
  123. out_dir=Path(pwd) if pwd else DATA_DIR,
  124. extractors=command.extract,
  125. )
  126. if __name__ == '__main__':
  127. main(args=sys.argv[1:], stdin=sys.stdin)