purge.py 2.2 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485
  1. #!/usr/bin/env python3
  2. import re
  3. from argparse import ArgumentParser
  4. from os.path import exists, join
  5. from shutil import rmtree
  6. from typing import List
  7. from .config import ARCHIVE_DIR, OUTPUT_DIR
  8. from .index import parse_json_links_index, write_html_links_index, write_json_links_index
  9. def cleanup_index(regexes: List[str], proceed: bool, delete: bool) -> None:
  10. if not exists(join(OUTPUT_DIR, 'index.json')):
  11. exit('index.json is missing; nothing to do')
  12. compiled = [re.compile(r) for r in regexes]
  13. links = parse_json_links_index(OUTPUT_DIR)
  14. filtered = []
  15. remaining = []
  16. for link in links:
  17. url = link.url
  18. for r in compiled:
  19. if r.search(url):
  20. filtered.append((link, r))
  21. break
  22. else:
  23. remaining.append(link)
  24. if not filtered:
  25. exit('Search did not match any entries.')
  26. print('Filtered out {}/{} urls:'.format(len(filtered), len(links)))
  27. for link, regex in filtered:
  28. url = link.url
  29. print(' {url} via {regex}'.format(url=url, regex=regex.pattern))
  30. if not proceed:
  31. answer = input('Remove {} entries from index? [y/n] '.format(
  32. len(filtered)))
  33. proceed = answer.strip().lower() in ('y', 'yes')
  34. if not proceed:
  35. exit('Aborted')
  36. write_json_links_index(OUTPUT_DIR, remaining)
  37. write_html_links_index(OUTPUT_DIR, remaining)
  38. if delete:
  39. for link, _ in filtered:
  40. data_dir = join(ARCHIVE_DIR, link['timestamp'])
  41. if exists(data_dir):
  42. rmtree(data_dir)
  43. if __name__ == '__main__':
  44. p = ArgumentParser('Index purging tool')
  45. p.add_argument(
  46. '--regex',
  47. '-r',
  48. action='append',
  49. help='Regular expression matching URLs to purge',
  50. )
  51. p.add_argument(
  52. '--delete',
  53. '-d',
  54. action='store_true',
  55. default=False,
  56. help='Delete webpage files from archive',
  57. )
  58. p.add_argument(
  59. '--yes',
  60. '-y',
  61. action='store_true',
  62. default=False,
  63. help='Do not prompt for confirmation',
  64. )
  65. args = p.parse_args()
  66. if args.regex:
  67. cleanup_index(args.regex, proceed=args.yes, delete=args.delete)
  68. else:
  69. p.print_help()