archivebox_status.py 5.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121
  1. #!/usr/bin/env python3
  2. __package__ = 'archivebox.cli'
  3. from pathlib import Path
  4. import rich_click as click
  5. from rich import print
  6. from archivebox.misc.util import enforce_types, docstring
  7. from archivebox.config import DATA_DIR, CONSTANTS, ARCHIVE_DIR
  8. from archivebox.config.common import SHELL_CONFIG
  9. from archivebox.misc.legacy import parse_json_links_details
  10. from archivebox.misc.system import get_dir_size
  11. from archivebox.misc.logging_util import printable_filesize
  12. @enforce_types
  13. def status(out_dir: Path=DATA_DIR) -> None:
  14. """Print out some info and statistics about the archive collection"""
  15. from django.contrib.auth import get_user_model
  16. from archivebox.misc.db import get_admins
  17. from archivebox.core.models import Snapshot
  18. User = get_user_model()
  19. print('[green]\\[*] Scanning archive main index...[/green]')
  20. print(f'[yellow] {out_dir}/*[/yellow]')
  21. num_bytes, num_dirs, num_files = get_dir_size(out_dir, recursive=False, pattern='index.')
  22. size = printable_filesize(num_bytes)
  23. print(f' Index size: {size} across {num_files} files')
  24. print()
  25. links = Snapshot.objects.all()
  26. num_sql_links = links.count()
  27. num_link_details = sum(1 for link in parse_json_links_details(out_dir=out_dir))
  28. print(f' > SQL Main Index: {num_sql_links} links'.ljust(36), f'(found in {CONSTANTS.SQL_INDEX_FILENAME})')
  29. print(f' > JSON Link Details: {num_link_details} links'.ljust(36), f'(found in {ARCHIVE_DIR.name}/*/index.json)')
  30. print()
  31. print('[green]\\[*] Scanning archive data directories...[/green]')
  32. print(f'[yellow] {ARCHIVE_DIR}/*[/yellow]')
  33. num_bytes, num_dirs, num_files = get_dir_size(ARCHIVE_DIR)
  34. size = printable_filesize(num_bytes)
  35. print(f' Size: {size} across {num_files} files in {num_dirs} directories')
  36. # Use DB as source of truth for snapshot status
  37. num_indexed = links.count()
  38. num_archived = links.filter(status='archived').count() or links.exclude(downloaded_at=None).count()
  39. num_unarchived = links.filter(status='queued').count() or links.filter(downloaded_at=None).count()
  40. print(f' > indexed: {num_indexed}'.ljust(36), '(total snapshots in DB)')
  41. print(f' > archived: {num_archived}'.ljust(36), '(snapshots with archived content)')
  42. print(f' > unarchived: {num_unarchived}'.ljust(36), '(snapshots pending archiving)')
  43. # Count directories on filesystem
  44. num_present = 0
  45. orphaned_dirs = []
  46. if ARCHIVE_DIR.exists():
  47. for entry in ARCHIVE_DIR.iterdir():
  48. if entry.is_dir():
  49. num_present += 1
  50. if not links.filter(timestamp=entry.name).exists():
  51. orphaned_dirs.append(str(entry))
  52. num_valid = min(num_present, num_indexed) # approximate
  53. print()
  54. print(f' > present: {num_present}'.ljust(36), '(directories in archive/)')
  55. print(f' > [green]valid:[/green] {num_valid}'.ljust(36), ' (directories with matching DB entry)')
  56. num_orphaned = len(orphaned_dirs)
  57. print(f' > [red]orphaned:[/red] {num_orphaned}'.ljust(36), ' (directories without matching DB entry)')
  58. if num_indexed:
  59. print(' [violet]Hint:[/violet] You can list snapshots by status like so:')
  60. print(' [green]archivebox list --status=<status> (e.g. archived, queued, etc.)[/green]')
  61. if orphaned_dirs:
  62. print(' [violet]Hint:[/violet] To automatically import orphaned data directories into the main index, run:')
  63. print(' [green]archivebox init[/green]')
  64. print()
  65. print('[green]\\[*] Scanning recent archive changes and user logins:[/green]')
  66. print(f'[yellow] {CONSTANTS.LOGS_DIR}/*[/yellow]')
  67. users = get_admins().values_list('username', flat=True)
  68. print(f' UI users {len(users)}: {", ".join(users)}')
  69. last_login = User.objects.order_by('last_login').last()
  70. if last_login:
  71. print(f' Last UI login: {last_login.username} @ {str(last_login.last_login)[:16]}')
  72. last_downloaded = Snapshot.objects.order_by('downloaded_at').last()
  73. if last_downloaded:
  74. print(f' Last changes: {str(last_downloaded.downloaded_at)[:16]}')
  75. if not users:
  76. print()
  77. print(' [violet]Hint:[/violet] You can create an admin user by running:')
  78. print(' [green]archivebox manage createsuperuser[/green]')
  79. print()
  80. for snapshot in links.order_by('-downloaded_at')[:10]:
  81. if not snapshot.downloaded_at:
  82. continue
  83. print(
  84. '[grey53] ' +
  85. (
  86. f' > {str(snapshot.downloaded_at)[:16]} '
  87. f'[{snapshot.num_outputs} {("X", "√")[snapshot.is_archived]} {printable_filesize(snapshot.archive_size)}] '
  88. f'"{snapshot.title}": {snapshot.url}'
  89. )[:SHELL_CONFIG.TERM_WIDTH]
  90. + '[grey53]',
  91. )
  92. print('[grey53] ...')
# CLI entry point: a click command that forwards whatever keyword arguments it
# receives straight to status(). No click options are declared on it here, and
# the @docstring decorator is handed status.__doc__.
@click.command()
@docstring(status.__doc__)
def main(**kwargs):
    """Print out some info and statistics about the archive collection"""
    status(**kwargs)
# Invoke the click command when this file is executed directly as a script.
if __name__ == '__main__':
    main()