archivebox_remove.py 2.9 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586878889909192939495969798
  1. #!/usr/bin/env python3
  2. __package__ = 'archivebox.cli'
  3. __command__ = 'archivebox remove'
  4. import shutil
  5. from pathlib import Path
  6. from typing import Iterable
  7. import rich_click as click
  8. from django.db.models import QuerySet
  9. from archivebox.config import DATA_DIR
  10. from archivebox.config.django import setup_django
  11. from archivebox.misc.util import enforce_types, docstring
  12. from archivebox.misc.checks import check_data_folder
  13. from archivebox.misc.logging_util import (
  14. log_list_started,
  15. log_list_finished,
  16. log_removal_started,
  17. log_removal_finished,
  18. TimedProgress,
  19. )
  20. @enforce_types
  21. def remove(filter_patterns: Iterable[str]=(),
  22. filter_type: str='exact',
  23. snapshots: QuerySet | None=None,
  24. after: float | None=None,
  25. before: float | None=None,
  26. yes: bool=False,
  27. delete: bool=False,
  28. out_dir: Path=DATA_DIR) -> QuerySet:
  29. """Remove the specified URLs from the archive"""
  30. setup_django()
  31. check_data_folder()
  32. from archivebox.cli.archivebox_search import get_snapshots
  33. log_list_started(filter_patterns, filter_type)
  34. timer = TimedProgress(360, prefix=' ')
  35. try:
  36. snapshots = get_snapshots(
  37. snapshots=snapshots,
  38. filter_patterns=list(filter_patterns) if filter_patterns else None,
  39. filter_type=filter_type,
  40. after=after,
  41. before=before,
  42. )
  43. finally:
  44. timer.end()
  45. if not snapshots.exists():
  46. log_removal_finished(0, 0)
  47. raise SystemExit(1)
  48. log_list_finished(snapshots)
  49. log_removal_started(snapshots, yes=yes, delete=delete)
  50. timer = TimedProgress(360, prefix=' ')
  51. try:
  52. for snapshot in snapshots:
  53. if delete:
  54. shutil.rmtree(snapshot.output_dir, ignore_errors=True)
  55. finally:
  56. timer.end()
  57. to_remove = snapshots.count()
  58. from archivebox.search import flush_search_index
  59. from archivebox.core.models import Snapshot
  60. flush_search_index(snapshots=snapshots)
  61. snapshots.delete()
  62. all_snapshots = Snapshot.objects.all()
  63. log_removal_finished(all_snapshots.count(), to_remove)
  64. return all_snapshots
  65. @click.command()
  66. @click.option('--yes', is_flag=True, help='Remove links instantly without prompting to confirm')
  67. @click.option('--delete', is_flag=True, help='Delete the archived content and metadata folder in addition to removing from index')
  68. @click.option('--before', type=float, help='Remove only URLs bookmarked before timestamp')
  69. @click.option('--after', type=float, help='Remove only URLs bookmarked after timestamp')
  70. @click.option('--filter-type', '-f', type=click.Choice(('exact', 'substring', 'domain', 'regex', 'tag')), default='exact', help='Type of pattern matching to use when filtering URLs')
  71. @click.argument('filter_patterns', nargs=-1)
  72. @docstring(remove.__doc__)
  73. def main(**kwargs):
  74. """Remove the specified URLs from the archive"""
  75. remove(**kwargs)
# Run the click command only when this module is executed directly,
# not when it is imported.
if __name__ == '__main__':
    main()