archivebox_update.py 3.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123
  1. #!/usr/bin/env python3
# Declare the owning package explicitly so the explicit relative imports
# below (`from ..main import ...`) have a package to resolve against even
# when this module is executed as a script (see PEP 366).
__package__ = 'archivebox.cli'
# Program name handed to argparse as `prog` in main(); it is what the
# usage/help output displays for this subcommand.
__command__ = 'archivebox update'
  4. import sys
  5. import argparse
  6. from typing import List, Optional, IO
  7. from ..main import update, docstring
  8. from ..config import OUTPUT_DIR
  9. from ..index import (
  10. get_indexed_folders,
  11. get_archived_folders,
  12. get_unarchived_folders,
  13. get_present_folders,
  14. get_valid_folders,
  15. get_invalid_folders,
  16. get_duplicate_folders,
  17. get_orphaned_folders,
  18. get_corrupted_folders,
  19. get_unrecognized_folders,
  20. )
  21. from ..logging import SmartFormatter, accept_stdin
  22. @docstring(update.__doc__)
  23. def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional[str]=None) -> None:
  24. parser = argparse.ArgumentParser(
  25. prog=__command__,
  26. description=update.__doc__,
  27. add_help=True,
  28. formatter_class=SmartFormatter,
  29. )
  30. parser.add_argument(
  31. '--only-new', #'-n',
  32. action='store_true',
  33. help="Don't attempt to retry previously skipped/failed links when updating",
  34. )
  35. parser.add_argument(
  36. '--index-only', #'-o',
  37. action='store_true',
  38. help="Update the main index without archiving any content",
  39. )
  40. parser.add_argument(
  41. '--resume', #'-r',
  42. type=float,
  43. help='Resume the update process from a given timestamp',
  44. default=None,
  45. )
  46. parser.add_argument(
  47. '--overwrite', #'-x',
  48. action='store_true',
  49. help='Ignore existing archived content and overwrite with new versions (DANGEROUS)',
  50. )
  51. parser.add_argument(
  52. '--before', #'-b',
  53. type=float,
  54. help="Update only links bookmarked before the given timestamp.",
  55. default=None,
  56. )
  57. parser.add_argument(
  58. '--after', #'-a',
  59. type=float,
  60. help="Update only links bookmarked after the given timestamp.",
  61. default=None,
  62. )
  63. parser.add_argument(
  64. '--status',
  65. type=str,
  66. choices=('indexed', 'archived', 'unarchived', 'present', 'valid', 'invalid', 'duplicate', 'orphaned', 'corrupted', 'unrecognized'),
  67. default='indexed',
  68. help=(
  69. 'Update only links or data directories that have the given status\n'
  70. f' indexed {get_indexed_folders.__doc__} (the default)\n'
  71. f' archived {get_archived_folders.__doc__}\n'
  72. f' unarchived {get_unarchived_folders.__doc__}\n'
  73. '\n'
  74. f' present {get_present_folders.__doc__}\n'
  75. f' valid {get_valid_folders.__doc__}\n'
  76. f' invalid {get_invalid_folders.__doc__}\n'
  77. '\n'
  78. f' duplicate {get_duplicate_folders.__doc__}\n'
  79. f' orphaned {get_orphaned_folders.__doc__}\n'
  80. f' corrupted {get_corrupted_folders.__doc__}\n'
  81. f' unrecognized {get_unrecognized_folders.__doc__}\n'
  82. )
  83. )
  84. parser.add_argument(
  85. '--filter-type',
  86. type=str,
  87. choices=('exact', 'substring', 'domain', 'regex'),
  88. default='exact',
  89. help='Type of pattern matching to use when filtering URLs',
  90. )
  91. parser.add_argument(
  92. 'filter_patterns',
  93. nargs='*',
  94. type=str,
  95. default=None,
  96. help='Update only URLs matching these filter patterns.'
  97. )
  98. command = parser.parse_args(args or ())
  99. filter_patterns_str = accept_stdin(stdin)
  100. update(
  101. resume=command.resume,
  102. only_new=command.only_new,
  103. index_only=command.index_only,
  104. overwrite=command.overwrite,
  105. filter_patterns_str=filter_patterns_str,
  106. filter_patterns=command.filter_patterns,
  107. filter_type=command.filter_type,
  108. status=command.status,
  109. after=command.after,
  110. before=command.before,
  111. out_dir=pwd or OUTPUT_DIR,
  112. )
  113. if __name__ == '__main__':
  114. main(args=sys.argv[1:], stdin=sys.stdin)