archivebox_update.py 4.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144
  1. #!/usr/bin/env python3
  2. __package__ = 'archivebox.cli'
  3. __command__ = 'archivebox update'
  4. import sys
  5. import argparse
  6. from typing import List, Optional, IO
  7. from archivebox.misc.util import docstring
  8. from archivebox.index import (
  9. LINK_FILTERS,
  10. get_indexed_folders,
  11. get_archived_folders,
  12. get_unarchived_folders,
  13. get_present_folders,
  14. get_valid_folders,
  15. get_invalid_folders,
  16. get_duplicate_folders,
  17. get_orphaned_folders,
  18. get_corrupted_folders,
  19. get_unrecognized_folders,
  20. )
  21. from archivebox.logging_util import SmartFormatter, accept_stdin
  22. # from ..main import update
  23. def update():
  24. from archivebox.config.django import setup_django
  25. setup_django()
  26. from actors.orchestrator import Orchestrator
  27. orchestrator = Orchestrator(exit_on_idle=False)
  28. orchestrator.start()
  29. @docstring(update.__doc__)
  30. def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional[str]=None) -> None:
  31. parser = argparse.ArgumentParser(
  32. prog=__command__,
  33. description=update.__doc__,
  34. add_help=True,
  35. formatter_class=SmartFormatter,
  36. )
  37. parser.add_argument(
  38. '--only-new', #'-n',
  39. action='store_true',
  40. help="Don't attempt to retry previously skipped/failed links when updating",
  41. )
  42. parser.add_argument(
  43. '--index-only', #'-o',
  44. action='store_true',
  45. help="Update the main index without archiving any content",
  46. )
  47. parser.add_argument(
  48. '--resume', #'-r',
  49. type=float,
  50. help='Resume the update process from a given timestamp',
  51. default=None,
  52. )
  53. parser.add_argument(
  54. '--overwrite', #'-x',
  55. action='store_true',
  56. help='Ignore existing archived content and overwrite with new versions (DANGEROUS)',
  57. )
  58. parser.add_argument(
  59. '--before', #'-b',
  60. type=float,
  61. help="Update only links bookmarked before the given timestamp.",
  62. default=None,
  63. )
  64. parser.add_argument(
  65. '--after', #'-a',
  66. type=float,
  67. help="Update only links bookmarked after the given timestamp.",
  68. default=None,
  69. )
  70. parser.add_argument(
  71. '--status',
  72. type=str,
  73. choices=('indexed', 'archived', 'unarchived', 'present', 'valid', 'invalid', 'duplicate', 'orphaned', 'corrupted', 'unrecognized'),
  74. default='indexed',
  75. help=(
  76. 'Update only links or data directories that have the given status\n'
  77. f' indexed {get_indexed_folders.__doc__} (the default)\n'
  78. f' archived {get_archived_folders.__doc__}\n'
  79. f' unarchived {get_unarchived_folders.__doc__}\n'
  80. '\n'
  81. f' present {get_present_folders.__doc__}\n'
  82. f' valid {get_valid_folders.__doc__}\n'
  83. f' invalid {get_invalid_folders.__doc__}\n'
  84. '\n'
  85. f' duplicate {get_duplicate_folders.__doc__}\n'
  86. f' orphaned {get_orphaned_folders.__doc__}\n'
  87. f' corrupted {get_corrupted_folders.__doc__}\n'
  88. f' unrecognized {get_unrecognized_folders.__doc__}\n'
  89. )
  90. )
  91. parser.add_argument(
  92. '--filter-type', '-t',
  93. type=str,
  94. choices=(*LINK_FILTERS.keys(), 'search'),
  95. default='exact',
  96. help='Type of pattern matching to use when filtering URLs',
  97. )
  98. parser.add_argument(
  99. 'filter_patterns',
  100. nargs='*',
  101. type=str,
  102. default=None,
  103. help='Update only URLs matching these filter patterns.'
  104. )
  105. parser.add_argument(
  106. "--extract",
  107. type=str,
  108. help="Pass a list of the extractors to be used. If the method name is not correct, it will be ignored. \
  109. This does not take precedence over the configuration",
  110. default=""
  111. )
  112. command = parser.parse_args(args or ())
  113. filter_patterns_str = None
  114. if not command.filter_patterns:
  115. filter_patterns_str = accept_stdin(stdin)
  116. update()
  117. # update(
  118. # resume=command.resume,
  119. # only_new=command.only_new,
  120. # index_only=command.index_only,
  121. # overwrite=command.overwrite,
  122. # filter_patterns_str=filter_patterns_str,
  123. # filter_patterns=command.filter_patterns,
  124. # filter_type=command.filter_type,
  125. # status=command.status,
  126. # after=command.after,
  127. # before=command.before,
  128. # out_dir=Path(pwd) if pwd else DATA_DIR,
  129. # extractors=command.extract,
  130. # )
  131. if __name__ == '__main__':
  132. main(args=sys.argv[1:], stdin=sys.stdin)