#!/usr/bin/env python3

__package__ = 'archivebox.cli'

import rich_click as click

from typing import Iterable

from archivebox.misc.util import enforce_types, docstring
from archivebox.index import (
    LINK_FILTERS,
    get_indexed_folders,
    get_archived_folders,
    get_unarchived_folders,
    get_present_folders,
    get_valid_folders,
    get_invalid_folders,
    get_duplicate_folders,
    get_orphaned_folders,
    get_corrupted_folders,
    get_unrecognized_folders,
)

@enforce_types
def update(filter_patterns: Iterable[str]=(),
           only_new: bool=False,
           index_only: bool=False,
           resume: float | None=None,
           overwrite: bool=False,
           before: float | None=None,
           after: float | None=None,
           status: str='indexed',
           filter_type: str='exact',
           extract: str="") -> None:
    """Import any new links from subscriptions and retry any previously failed/skipped links"""

    # lazy imports: Django must be configured before any ORM-backed modules load
    from archivebox.config.django import setup_django
    setup_django()

    from workers.orchestrator import Orchestrator

    # hand off to the background workers; exit_on_idle=False keeps the
    # orchestrator running even once the pending work drains
    orchestrator = Orchestrator(exit_on_idle=False)
    orchestrator.start()
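

# Usage sketch (not part of the original module): update() can also be called
# directly from Python, assuming the process is started inside an initialized
# ArchiveBox data directory so setup_django() can find the collection's config.
# The module path and argument values here are illustrative:
#
#   from archivebox.cli.archivebox_update import update
#
#   update(
#       filter_patterns=['https://example.com'],
#       filter_type='substring',       # any key of LINK_FILTERS, or 'search'
#       extract='title,favicon',       # comma-separated extractor names
#   )
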
@click.command()
@click.option('--only-new', is_flag=True, help="Don't attempt to retry previously skipped/failed links when updating")
@click.option('--index-only', is_flag=True, help="Update the main index without archiving any content")
@click.option('--resume', type=float, help='Resume the update process from a given timestamp')
@click.option('--overwrite', '-F', is_flag=True, help='Ignore existing archived content and overwrite with new versions (DANGEROUS)')
@click.option('--before', type=float, help="Update only links bookmarked before the given timestamp")
@click.option('--after', type=float, help="Update only links bookmarked after the given timestamp")
@click.option('--status', type=click.Choice([
    'indexed', 'archived', 'unarchived',
    'present', 'valid', 'invalid',
    'duplicate', 'orphaned', 'corrupted', 'unrecognized'
]), default='indexed', help=f'''
Update only links or data directories that have the given status:
    indexed       {get_indexed_folders.__doc__} (the default)
    archived      {get_archived_folders.__doc__}
    unarchived    {get_unarchived_folders.__doc__}
    present       {get_present_folders.__doc__}
    valid         {get_valid_folders.__doc__}
    invalid       {get_invalid_folders.__doc__}
    duplicate     {get_duplicate_folders.__doc__}
    orphaned      {get_orphaned_folders.__doc__}
    corrupted     {get_corrupted_folders.__doc__}
    unrecognized  {get_unrecognized_folders.__doc__}
''')
@click.option('--filter-type', '-t', type=click.Choice([*LINK_FILTERS.keys(), 'search']), default='exact', help='Type of pattern matching to use when filtering URLs')
@click.option('--extract', '-e', default='', help='Comma-separated list of extractors to use e.g. title,favicon,screenshot,singlefile,...')
@click.argument('filter_patterns', nargs=-1)
@docstring(update.__doc__)
def main(**kwargs):
    """Import any new links from subscriptions and retry any previously failed/skipped links"""
    update(**kwargs)


if __name__ == '__main__':
    main()
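

# Example CLI invocations (illustrative, built from the options defined above;
# run from inside an ArchiveBox collection directory):
#
#   archivebox update --only-new                  # skip retrying failed/skipped links
#   archivebox update --index-only                # refresh the index, archive nothing
#   archivebox update -t substring -e title,favicon example.com
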
# LEGACY VERSION:
#
# @enforce_types
# def update(resume: Optional[float]=None,
#            only_new: bool=ARCHIVING_CONFIG.ONLY_NEW,
#            index_only: bool=False,
#            overwrite: bool=False,
#            filter_patterns_str: Optional[str]=None,
#            filter_patterns: Optional[List[str]]=None,
#            filter_type: Optional[str]=None,
#            status: Optional[str]=None,
#            after: Optional[str]=None,
#            before: Optional[str]=None,
#            extractors: str="",
#            out_dir: Path=DATA_DIR) -> List[Link]:
#     """Import any new links from subscriptions and retry any previously failed/skipped links"""
#
#     from core.models import ArchiveResult
#     from .search import index_links
#     # from workers.supervisord_util import start_cli_workers
#
#     check_data_folder()
#     # start_cli_workers()
#
#     new_links: List[Link] = []  # TODO: Remove input argument: only_new
#
#     extractors = extractors.split(",") if extractors else []
#
#     # Step 1: Filter for selected_links
#     print('[*] Finding matching Snapshots to update...')
#     print(f'    - Filtering by {" ".join(filter_patterns)} ({filter_type}) {before=} {after=} {status=}...')
#     matching_snapshots = list_links(
#         filter_patterns=filter_patterns,
#         filter_type=filter_type,
#         before=before,
#         after=after,
#     )
#
#     print(f'    - Checking {matching_snapshots.count()} snapshot folders for existing data with {status=}...')
#     matching_folders = list_folders(
#         links=matching_snapshots,
#         status=status,
#         out_dir=out_dir,
#     )
#     all_links = (link for link in matching_folders.values() if link)
#
#     print('    - Sorting by most unfinished -> least unfinished + date archived...')
#     all_links = sorted(all_links, key=lambda link: (ArchiveResult.objects.filter(snapshot__url=link.url).count(), link.timestamp))
#
#     if index_only:
#         for link in all_links:
#             write_link_details(link, out_dir=out_dir, skip_sql_index=True)
#         index_links(all_links, out_dir=out_dir)
#         return all_links
#
#     # Step 2: Run the archive methods for each link
#     to_archive = new_links if only_new else all_links
#     if resume:
#         to_archive = [
#             link for link in to_archive
#             if link.timestamp >= str(resume)
#         ]
#         if not to_archive:
#             stderr('')
#             stderr(f'[√] Nothing found to resume after {resume}', color='green')
#             return all_links
#
#     archive_kwargs = {
#         "out_dir": out_dir,
#     }
#     if extractors:
#         archive_kwargs["methods"] = extractors
#
#     archive_links(to_archive, overwrite=overwrite, **archive_kwargs)
#
#     # Step 4: Re-write links index with updated titles, icons, and resources
#     all_links = load_main_index(out_dir=out_dir)
#     return all_links