archivebox_remove.py 3.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104
  1. #!/usr/bin/env python3
  2. __package__ = 'archivebox.cli'
  3. __command__ = 'archivebox remove'
  4. import shutil
  5. from pathlib import Path
  6. from typing import Iterable
  7. import rich_click as click
  8. from django.db.models import QuerySet
  9. from archivebox.config import DATA_DIR
  10. from archivebox.index.schema import Link
  11. from archivebox.config.django import setup_django
  12. from archivebox.index import load_main_index
  13. from archivebox.index.sql import remove_from_sql_main_index
  14. from archivebox.misc.util import enforce_types, docstring
  15. from archivebox.misc.checks import check_data_folder
  16. from archivebox.misc.logging_util import (
  17. log_list_started,
  18. log_list_finished,
  19. log_removal_started,
  20. log_removal_finished,
  21. TimedProgress,
  22. )
  23. @enforce_types
  24. def remove(filter_patterns: Iterable[str]=(),
  25. filter_type: str='exact',
  26. snapshots: QuerySet | None=None,
  27. after: float | None=None,
  28. before: float | None=None,
  29. yes: bool=False,
  30. delete: bool=False,
  31. out_dir: Path=DATA_DIR) -> Iterable[Link]:
  32. """Remove the specified URLs from the archive"""
  33. setup_django()
  34. check_data_folder()
  35. from archivebox.cli.archivebox_search import list_links
  36. list_kwargs = {
  37. "filter_patterns": filter_patterns,
  38. "filter_type": filter_type,
  39. "after": after,
  40. "before": before,
  41. }
  42. if snapshots:
  43. list_kwargs["snapshots"] = snapshots
  44. log_list_started(filter_patterns, filter_type)
  45. timer = TimedProgress(360, prefix=' ')
  46. try:
  47. snapshots = list_links(**list_kwargs)
  48. finally:
  49. timer.end()
  50. if not snapshots.exists():
  51. log_removal_finished(0, 0)
  52. raise SystemExit(1)
  53. log_links = [link.as_link() for link in snapshots]
  54. log_list_finished(log_links)
  55. log_removal_started(log_links, yes=yes, delete=delete)
  56. timer = TimedProgress(360, prefix=' ')
  57. try:
  58. for snapshot in snapshots:
  59. if delete:
  60. shutil.rmtree(snapshot.as_link().link_dir, ignore_errors=True)
  61. finally:
  62. timer.end()
  63. to_remove = snapshots.count()
  64. from archivebox.search import flush_search_index
  65. flush_search_index(snapshots=snapshots)
  66. remove_from_sql_main_index(snapshots=snapshots, out_dir=out_dir)
  67. all_snapshots = load_main_index(out_dir=out_dir)
  68. log_removal_finished(all_snapshots.count(), to_remove)
  69. return all_snapshots
  70. @click.command()
  71. @click.option('--yes', is_flag=True, help='Remove links instantly without prompting to confirm')
  72. @click.option('--delete', is_flag=True, help='Delete the archived content and metadata folder in addition to removing from index')
  73. @click.option('--before', type=float, help='Remove only URLs bookmarked before timestamp')
  74. @click.option('--after', type=float, help='Remove only URLs bookmarked after timestamp')
  75. @click.option('--filter-type', '-f', type=click.Choice(('exact', 'substring', 'domain', 'regex', 'tag')), default='exact', help='Type of pattern matching to use when filtering URLs')
  76. @click.argument('filter_patterns', nargs=-1)
  77. @docstring(remove.__doc__)
  78. def main(**kwargs):
  79. """Remove the specified URLs from the archive"""
  80. remove(**kwargs)
  81. if __name__ == '__main__':
  82. main()