archivebox_status.py 5.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136
  1. #!/usr/bin/env python3
  2. __package__ = 'archivebox.cli'
  3. from pathlib import Path
  4. import rich_click as click
  5. from rich import print
  6. from archivebox.misc.util import enforce_types, docstring
  7. from archivebox.config import DATA_DIR, CONSTANTS, ARCHIVE_DIR
  8. from archivebox.config.common import SHELL_CONFIG
  9. from archivebox.index.json import parse_json_links_details
  10. from archivebox.index import (
  11. load_main_index,
  12. get_indexed_folders,
  13. get_archived_folders,
  14. get_invalid_folders,
  15. get_unarchived_folders,
  16. get_present_folders,
  17. get_valid_folders,
  18. get_duplicate_folders,
  19. get_orphaned_folders,
  20. get_corrupted_folders,
  21. get_unrecognized_folders,
  22. )
  23. from archivebox.misc.system import get_dir_size
  24. from archivebox.misc.logging_util import printable_filesize
  25. @enforce_types
  26. def status(out_dir: Path=DATA_DIR) -> None:
  27. """Print out some info and statistics about the archive collection"""
  28. from django.contrib.auth import get_user_model
  29. from archivebox.index.sql import get_admins
  30. from core.models import Snapshot
  31. User = get_user_model()
  32. print('[green]\\[*] Scanning archive main index...[/green]')
  33. print(f'[yellow] {out_dir}/*[/yellow]')
  34. num_bytes, num_dirs, num_files = get_dir_size(out_dir, recursive=False, pattern='index.')
  35. size = printable_filesize(num_bytes)
  36. print(f' Index size: {size} across {num_files} files')
  37. print()
  38. links = load_main_index(out_dir=out_dir)
  39. num_sql_links = links.count()
  40. num_link_details = sum(1 for link in parse_json_links_details(out_dir=out_dir))
  41. print(f' > SQL Main Index: {num_sql_links} links'.ljust(36), f'(found in {CONSTANTS.SQL_INDEX_FILENAME})')
  42. print(f' > JSON Link Details: {num_link_details} links'.ljust(36), f'(found in {ARCHIVE_DIR.name}/*/index.json)')
  43. print()
  44. print('[green]\\[*] Scanning archive data directories...[/green]')
  45. print(f'[yellow] {ARCHIVE_DIR}/*[/yellow]')
  46. num_bytes, num_dirs, num_files = get_dir_size(ARCHIVE_DIR)
  47. size = printable_filesize(num_bytes)
  48. print(f' Size: {size} across {num_files} files in {num_dirs} directories')
  49. num_indexed = len(get_indexed_folders(links, out_dir=out_dir))
  50. num_archived = len(get_archived_folders(links, out_dir=out_dir))
  51. num_unarchived = len(get_unarchived_folders(links, out_dir=out_dir))
  52. print(f' > indexed: {num_indexed}'.ljust(36), f'({get_indexed_folders.__doc__})')
  53. print(f' > archived: {num_archived}'.ljust(36), f'({get_archived_folders.__doc__})')
  54. print(f' > unarchived: {num_unarchived}'.ljust(36), f'({get_unarchived_folders.__doc__})')
  55. num_present = len(get_present_folders(links, out_dir=out_dir))
  56. num_valid = len(get_valid_folders(links, out_dir=out_dir))
  57. print()
  58. print(f' > present: {num_present}'.ljust(36), f'({get_present_folders.__doc__})')
  59. print(f' > [green]valid:[/green] {num_valid}'.ljust(36), f' ({get_valid_folders.__doc__})')
  60. duplicate = get_duplicate_folders(links, out_dir=out_dir)
  61. orphaned = get_orphaned_folders(links, out_dir=out_dir)
  62. corrupted = get_corrupted_folders(links, out_dir=out_dir)
  63. unrecognized = get_unrecognized_folders(links, out_dir=out_dir)
  64. num_invalid = len({**duplicate, **orphaned, **corrupted, **unrecognized})
  65. print(f' > [red]invalid:[/red] {num_invalid}'.ljust(36), f' ({get_invalid_folders.__doc__})')
  66. print(f' > duplicate: {len(duplicate)}'.ljust(36), f'({get_duplicate_folders.__doc__})')
  67. print(f' > orphaned: {len(orphaned)}'.ljust(36), f'({get_orphaned_folders.__doc__})')
  68. print(f' > corrupted: {len(corrupted)}'.ljust(36), f'({get_corrupted_folders.__doc__})')
  69. print(f' > unrecognized: {len(unrecognized)}'.ljust(36), f'({get_unrecognized_folders.__doc__})')
  70. if num_indexed:
  71. print(' [violet]Hint:[/violet] You can list link data directories by status like so:')
  72. print(' [green]archivebox list --status=<status> (e.g. indexed, corrupted, archived, etc.)[/green]')
  73. if orphaned:
  74. print(' [violet]Hint:[/violet] To automatically import orphaned data directories into the main index, run:')
  75. print(' [green]archivebox init[/green]')
  76. if num_invalid:
  77. print(' [violet]Hint:[/violet] You may need to manually remove or fix some invalid data directories, afterwards make sure to run:')
  78. print(' [green]archivebox init[/green]')
  79. print()
  80. print('[green]\\[*] Scanning recent archive changes and user logins:[/green]')
  81. print(f'[yellow] {CONSTANTS.LOGS_DIR}/*[/yellow]')
  82. users = get_admins().values_list('username', flat=True)
  83. print(f' UI users {len(users)}: {", ".join(users)}')
  84. last_login = User.objects.order_by('last_login').last()
  85. if last_login:
  86. print(f' Last UI login: {last_login.username} @ {str(last_login.last_login)[:16]}')
  87. last_downloaded = Snapshot.objects.order_by('downloaded_at').last()
  88. if last_downloaded:
  89. print(f' Last changes: {str(last_downloaded.downloaded_at)[:16]}')
  90. if not users:
  91. print()
  92. print(' [violet]Hint:[/violet] You can create an admin user by running:')
  93. print(' [green]archivebox manage createsuperuser[/green]')
  94. print()
  95. for snapshot in links.order_by('-downloaded_at')[:10]:
  96. if not snapshot.downloaded_at:
  97. continue
  98. print(
  99. '[grey53] ' +
  100. (
  101. f' > {str(snapshot.downloaded_at)[:16]} '
  102. f'[{snapshot.num_outputs} {("X", "√")[snapshot.is_archived]} {printable_filesize(snapshot.archive_size)}] '
  103. f'"{snapshot.title}": {snapshot.url}'
  104. )[:SHELL_CONFIG.TERM_WIDTH]
  105. + '[grey53]',
  106. )
  107. print('[grey53] ...')
  108. @click.command()
  109. @docstring(status.__doc__)
  110. def main(**kwargs):
  111. """Print out some info and statistics about the archive collection"""
  112. status(**kwargs)
  113. if __name__ == '__main__':
  114. main()