archivebox_remove.py 4.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165
  1. #!/usr/bin/env python3
  2. __package__ = 'archivebox.cli'
  3. __command__ = 'archivebox remove'
  4. import sys
  5. import argparse
  6. from pathlib import Path
  7. from typing import Optional, List, IO
  8. from django.db.models import QuerySet
  9. from archivebox.misc.util import docstring
  10. from archivebox.config import DATA_DIR
  11. from archivebox.misc.logging_util import SmartFormatter, accept_stdin
  12. from archivebox.index.schema import Link
  13. def remove(filter_str: Optional[str]=None,
  14. filter_patterns: Optional[list[str]]=None,
  15. filter_type: str='exact',
  16. snapshots: Optional[QuerySet]=None,
  17. after: Optional[float]=None,
  18. before: Optional[float]=None,
  19. yes: bool=False,
  20. delete: bool=False,
  21. out_dir: Path=DATA_DIR) -> list[Link]:
  22. """Remove the specified URLs from the archive"""
  23. check_data_folder()
  24. if snapshots is None:
  25. if filter_str and filter_patterns:
  26. stderr(
  27. '[X] You should pass either a pattern as an argument, '
  28. 'or pass a list of patterns via stdin, but not both.\n',
  29. color='red',
  30. )
  31. raise SystemExit(2)
  32. elif not (filter_str or filter_patterns):
  33. stderr(
  34. '[X] You should pass either a pattern as an argument, '
  35. 'or pass a list of patterns via stdin.',
  36. color='red',
  37. )
  38. stderr()
  39. hint(('To remove all urls you can run:',
  40. 'archivebox remove --filter-type=regex ".*"'))
  41. stderr()
  42. raise SystemExit(2)
  43. elif filter_str:
  44. filter_patterns = [ptn.strip() for ptn in filter_str.split('\n')]
  45. list_kwargs = {
  46. "filter_patterns": filter_patterns,
  47. "filter_type": filter_type,
  48. "after": after,
  49. "before": before,
  50. }
  51. if snapshots:
  52. list_kwargs["snapshots"] = snapshots
  53. log_list_started(filter_patterns, filter_type)
  54. timer = TimedProgress(360, prefix=' ')
  55. try:
  56. snapshots = list_links(**list_kwargs)
  57. finally:
  58. timer.end()
  59. if not snapshots.exists():
  60. log_removal_finished(0, 0)
  61. raise SystemExit(1)
  62. log_links = [link.as_link() for link in snapshots]
  63. log_list_finished(log_links)
  64. log_removal_started(log_links, yes=yes, delete=delete)
  65. timer = TimedProgress(360, prefix=' ')
  66. try:
  67. for snapshot in snapshots:
  68. if delete:
  69. shutil.rmtree(snapshot.as_link().link_dir, ignore_errors=True)
  70. finally:
  71. timer.end()
  72. to_remove = snapshots.count()
  73. from .search import flush_search_index
  74. flush_search_index(snapshots=snapshots)
  75. remove_from_sql_main_index(snapshots=snapshots, out_dir=out_dir)
  76. all_snapshots = load_main_index(out_dir=out_dir)
  77. log_removal_finished(all_snapshots.count(), to_remove)
  78. return all_snapshots
  79. @docstring(remove.__doc__)
  80. def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional[str]=None) -> None:
  81. parser = argparse.ArgumentParser(
  82. prog=__command__,
  83. description=remove.__doc__,
  84. add_help=True,
  85. formatter_class=SmartFormatter,
  86. )
  87. parser.add_argument(
  88. '--yes', # '-y',
  89. action='store_true',
  90. help='Remove links instantly without prompting to confirm.',
  91. )
  92. parser.add_argument(
  93. '--delete', # '-r',
  94. action='store_true',
  95. help=(
  96. "In addition to removing the link from the index, "
  97. "also delete its archived content and metadata folder."
  98. ),
  99. )
  100. parser.add_argument(
  101. '--before', #'-b',
  102. type=float,
  103. help="List only URLs bookmarked before the given timestamp.",
  104. default=None,
  105. )
  106. parser.add_argument(
  107. '--after', #'-a',
  108. type=float,
  109. help="List only URLs bookmarked after the given timestamp.",
  110. default=None,
  111. )
  112. parser.add_argument(
  113. '--filter-type',
  114. type=str,
  115. choices=('exact', 'substring', 'domain', 'regex','tag'),
  116. default='exact',
  117. help='Type of pattern matching to use when filtering URLs',
  118. )
  119. parser.add_argument(
  120. 'filter_patterns',
  121. nargs='*',
  122. type=str,
  123. help='URLs matching this filter pattern will be removed from the index.'
  124. )
  125. command = parser.parse_args(args or ())
  126. filter_str = None
  127. if not command.filter_patterns:
  128. filter_str = accept_stdin(stdin)
  129. remove(
  130. filter_str=filter_str,
  131. filter_patterns=command.filter_patterns,
  132. filter_type=command.filter_type,
  133. before=command.before,
  134. after=command.after,
  135. yes=command.yes,
  136. delete=command.delete,
  137. out_dir=Path(pwd) if pwd else DATA_DIR,
  138. )
  139. if __name__ == '__main__':
  140. main(args=sys.argv[1:], stdin=sys.stdin)