archivebox_search.py 5.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164
  1. #!/usr/bin/env python3
  2. __package__ = 'archivebox.cli'
  3. __command__ = 'archivebox search'
  4. from pathlib import Path
  5. from typing import Optional, List, Iterable
  6. import rich_click as click
  7. from rich import print
  8. from django.db.models import QuerySet
  9. from archivebox.config import DATA_DIR
  10. from archivebox.index import LINK_FILTERS
  11. from archivebox.index.schema import Link
  12. from archivebox.misc.logging import stderr
  13. from archivebox.misc.util import enforce_types, docstring
# Folder status values accepted by `archivebox search --status=...`.
# Each value keys a matching get_*_folders() helper from archivebox.index
# (see the STATUS_FUNCTIONS dispatch table in list_folders below).
STATUS_CHOICES = [
    'indexed', 'archived', 'unarchived', 'present', 'valid', 'invalid',
    'duplicate', 'orphaned', 'corrupted', 'unrecognized'
]
  18. def list_links(snapshots: Optional[QuerySet]=None,
  19. filter_patterns: Optional[List[str]]=None,
  20. filter_type: str='substring',
  21. after: Optional[float]=None,
  22. before: Optional[float]=None,
  23. out_dir: Path=DATA_DIR) -> Iterable[Link]:
  24. from archivebox.index import load_main_index
  25. from archivebox.index import snapshot_filter
  26. if snapshots:
  27. all_snapshots = snapshots
  28. else:
  29. all_snapshots = load_main_index(out_dir=out_dir)
  30. if after is not None:
  31. all_snapshots = all_snapshots.filter(timestamp__gte=after)
  32. if before is not None:
  33. all_snapshots = all_snapshots.filter(timestamp__lt=before)
  34. if filter_patterns:
  35. all_snapshots = snapshot_filter(all_snapshots, filter_patterns, filter_type)
  36. if not all_snapshots:
  37. stderr('[!] No Snapshots matched your filters:', filter_patterns, f'({filter_type})', color='lightyellow')
  38. return all_snapshots
  39. def list_folders(links: list[Link], status: str, out_dir: Path=DATA_DIR) -> dict[str, Link | None]:
  40. from archivebox.misc.checks import check_data_folder
  41. from archivebox.index import (
  42. get_indexed_folders,
  43. get_archived_folders,
  44. get_unarchived_folders,
  45. get_present_folders,
  46. get_valid_folders,
  47. get_invalid_folders,
  48. get_duplicate_folders,
  49. get_orphaned_folders,
  50. get_corrupted_folders,
  51. get_unrecognized_folders,
  52. )
  53. check_data_folder()
  54. STATUS_FUNCTIONS = {
  55. "indexed": get_indexed_folders,
  56. "archived": get_archived_folders,
  57. "unarchived": get_unarchived_folders,
  58. "present": get_present_folders,
  59. "valid": get_valid_folders,
  60. "invalid": get_invalid_folders,
  61. "duplicate": get_duplicate_folders,
  62. "orphaned": get_orphaned_folders,
  63. "corrupted": get_corrupted_folders,
  64. "unrecognized": get_unrecognized_folders,
  65. }
  66. try:
  67. return STATUS_FUNCTIONS[status](links, out_dir=out_dir)
  68. except KeyError:
  69. raise ValueError('Status not recognized.')
  70. @enforce_types
  71. def search(filter_patterns: list[str] | None=None,
  72. filter_type: str='substring',
  73. status: str='indexed',
  74. before: float | None=None,
  75. after: float | None=None,
  76. sort: str | None=None,
  77. json: bool=False,
  78. html: bool=False,
  79. csv: str | None=None,
  80. with_headers: bool=False):
  81. """List, filter, and export information about archive entries"""
  82. if with_headers and not (json or html or csv):
  83. stderr('[X] --with-headers requires --json, --html or --csv\n', color='red')
  84. raise SystemExit(2)
  85. snapshots = list_links(
  86. filter_patterns=list(filter_patterns) if filter_patterns else None,
  87. filter_type=filter_type,
  88. before=before,
  89. after=after,
  90. )
  91. if sort:
  92. snapshots = snapshots.order_by(sort)
  93. folders = list_folders(
  94. links=snapshots,
  95. status=status,
  96. out_dir=DATA_DIR,
  97. )
  98. if json:
  99. from archivebox.index.json import generate_json_index_from_links
  100. output = generate_json_index_from_links(folders.values(), with_headers)
  101. elif html:
  102. from archivebox.index.html import generate_index_from_links
  103. output = generate_index_from_links(folders.values(), with_headers)
  104. elif csv:
  105. from archivebox.index.csv import links_to_csv
  106. output = links_to_csv(folders.values(), csv.split(','), with_headers)
  107. else:
  108. from archivebox.misc.logging_util import printable_folders
  109. output = printable_folders(folders, with_headers)
  110. print(output)
  111. return output
# Click entry point for `archivebox search`; every option name below maps 1:1
# onto a keyword argument of search() above, and the trailing positional
# arguments become the `filter_patterns` tuple.
@click.command()
@click.option('--filter-type', '-f', type=click.Choice(['search', *LINK_FILTERS.keys()]), default='substring', help='Pattern matching type for filtering URLs')
@click.option('--status', '-s', type=click.Choice(STATUS_CHOICES), default='indexed', help='List snapshots with the given status')
@click.option('--before', '-b', type=float, help='List snapshots bookmarked before the given UNIX timestamp')
@click.option('--after', '-a', type=float, help='List snapshots bookmarked after the given UNIX timestamp')
@click.option('--sort', '-o', type=str, help='Field to sort by, e.g. url, created_at, bookmarked_at, downloaded_at')
@click.option('--json', '-J', is_flag=True, help='Print output in JSON format')
@click.option('--html', '-M', is_flag=True, help='Print output in HTML format (suitable for viewing statically without a server)')
@click.option('--csv', '-C', type=str, help='Print output as CSV with the provided fields, e.g.: created_at,url,title')
@click.option('--with-headers', '-H', is_flag=True, help='Include extra CSV/HTML headers in the output')
@click.help_option('--help', '-h')
@click.argument('filter_patterns', nargs=-1)
@docstring(search.__doc__)  # reuse search()'s docstring as the CLI help text
def main(**kwargs):
    # Thin wrapper: forward all parsed CLI options straight to search().
    return search(**kwargs)


if __name__ == '__main__':
    main()