| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165 |
- #!/usr/bin/env python3
- __package__ = 'archivebox.cli'
- __command__ = 'archivebox remove'
- import sys
- import argparse
- from pathlib import Path
- from typing import Optional, List, IO
- from django.db.models import QuerySet
- from archivebox.misc.util import docstring
- from archivebox.config import DATA_DIR
- from archivebox.misc.logging_util import SmartFormatter, accept_stdin
- from archivebox.index.schema import Link
- def remove(filter_str: Optional[str]=None,
- filter_patterns: Optional[list[str]]=None,
- filter_type: str='exact',
- snapshots: Optional[QuerySet]=None,
- after: Optional[float]=None,
- before: Optional[float]=None,
- yes: bool=False,
- delete: bool=False,
- out_dir: Path=DATA_DIR) -> list[Link]:
- """Remove the specified URLs from the archive"""
-
- check_data_folder()
- if snapshots is None:
- if filter_str and filter_patterns:
- stderr(
- '[X] You should pass either a pattern as an argument, '
- 'or pass a list of patterns via stdin, but not both.\n',
- color='red',
- )
- raise SystemExit(2)
- elif not (filter_str or filter_patterns):
- stderr(
- '[X] You should pass either a pattern as an argument, '
- 'or pass a list of patterns via stdin.',
- color='red',
- )
- stderr()
- hint(('To remove all urls you can run:',
- 'archivebox remove --filter-type=regex ".*"'))
- stderr()
- raise SystemExit(2)
- elif filter_str:
- filter_patterns = [ptn.strip() for ptn in filter_str.split('\n')]
- list_kwargs = {
- "filter_patterns": filter_patterns,
- "filter_type": filter_type,
- "after": after,
- "before": before,
- }
- if snapshots:
- list_kwargs["snapshots"] = snapshots
- log_list_started(filter_patterns, filter_type)
- timer = TimedProgress(360, prefix=' ')
- try:
- snapshots = list_links(**list_kwargs)
- finally:
- timer.end()
- if not snapshots.exists():
- log_removal_finished(0, 0)
- raise SystemExit(1)
- log_links = [link.as_link() for link in snapshots]
- log_list_finished(log_links)
- log_removal_started(log_links, yes=yes, delete=delete)
- timer = TimedProgress(360, prefix=' ')
- try:
- for snapshot in snapshots:
- if delete:
- shutil.rmtree(snapshot.as_link().link_dir, ignore_errors=True)
- finally:
- timer.end()
- to_remove = snapshots.count()
- from .search import flush_search_index
- flush_search_index(snapshots=snapshots)
- remove_from_sql_main_index(snapshots=snapshots, out_dir=out_dir)
- all_snapshots = load_main_index(out_dir=out_dir)
- log_removal_finished(all_snapshots.count(), to_remove)
-
- return all_snapshots
- @docstring(remove.__doc__)
- def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional[str]=None) -> None:
- parser = argparse.ArgumentParser(
- prog=__command__,
- description=remove.__doc__,
- add_help=True,
- formatter_class=SmartFormatter,
- )
- parser.add_argument(
- '--yes', # '-y',
- action='store_true',
- help='Remove links instantly without prompting to confirm.',
- )
- parser.add_argument(
- '--delete', # '-r',
- action='store_true',
- help=(
- "In addition to removing the link from the index, "
- "also delete its archived content and metadata folder."
- ),
- )
- parser.add_argument(
- '--before', #'-b',
- type=float,
- help="List only URLs bookmarked before the given timestamp.",
- default=None,
- )
- parser.add_argument(
- '--after', #'-a',
- type=float,
- help="List only URLs bookmarked after the given timestamp.",
- default=None,
- )
- parser.add_argument(
- '--filter-type',
- type=str,
- choices=('exact', 'substring', 'domain', 'regex','tag'),
- default='exact',
- help='Type of pattern matching to use when filtering URLs',
- )
- parser.add_argument(
- 'filter_patterns',
- nargs='*',
- type=str,
- help='URLs matching this filter pattern will be removed from the index.'
- )
- command = parser.parse_args(args or ())
-
- filter_str = None
- if not command.filter_patterns:
- filter_str = accept_stdin(stdin)
- remove(
- filter_str=filter_str,
- filter_patterns=command.filter_patterns,
- filter_type=command.filter_type,
- before=command.before,
- after=command.after,
- yes=command.yes,
- delete=command.delete,
- out_dir=Path(pwd) if pwd else DATA_DIR,
- )
-
- if __name__ == '__main__':
- main(args=sys.argv[1:], stdin=sys.stdin)
|