
add archivebox info command to scan data dir

Nick Sweeting · 6 years ago
parent commit ab68819332

+ 28 - 0
archivebox/cli/archivebox_info.py

@@ -0,0 +1,28 @@
+#!/usr/bin/env python3
+
+__package__ = 'archivebox.cli'
+__command__ = 'archivebox info'
+__description__ = 'Print out some info and statistics about the archive collection'
+
+import sys
+import argparse
+
+from ..legacy.main import info
+from ..legacy.util import reject_stdin
+
+
+def main(args=None):
+    args = sys.argv[1:] if args is None else args
+
+    parser = argparse.ArgumentParser(
+        prog=__command__,
+        description=__description__,
+        add_help=True,
+    )
+    parser.parse_args(args)
+    reject_stdin(__command__)
+
+    info()
+
+if __name__ == '__main__':
+    main()

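The new subcommand is a thin argparse wrapper around legacy.main.info(). A minimal sketch of driving it programmatically, assuming the package layout introduced in this commit (archivebox.cli.archivebox_info); the calls below are illustrative, not an official API:

# Hypothetical driver for the new subcommand module, assuming
# archivebox.cli.archivebox_info is importable as laid out above.
from archivebox.cli.archivebox_info import main

main([])           # same as running `archivebox info` with no arguments
# main(['--help']) # argparse prints __description__ and raises SystemExit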
+ 47 - 1
archivebox/legacy/main.py

@@ -5,7 +5,12 @@ import shutil
 from typing import List, Optional, Iterable

 from .schema import Link
-from .util import enforce_types, TimedProgress
+from .util import (
+    enforce_types,
+    TimedProgress,
+    get_dir_size,
+    human_readable_size,
+)
 from .index import (
     links_after_timestamp,
     load_main_index,
@@ -119,6 +124,47 @@ def init():
     print('        archivebox help')


+@enforce_types
+def info():
+    all_links = load_main_index(out_dir=OUTPUT_DIR)
+
+    print('{green}[*] Scanning archive collection main index with {} links:{reset}'.format(len(all_links), **ANSI))
+    print(f'    {OUTPUT_DIR}')
+
+    num_bytes, num_dirs, num_files = get_dir_size(OUTPUT_DIR, recursive=False)
+    size = human_readable_size(num_bytes)
+    print(f'    > Index Size: {size} across {num_files} files')
+    print()
+
+    print('{green}[*] Scanning archive collection data directory with {} entries:{reset}'.format(len(all_links), **ANSI))
+    print(f'    {ARCHIVE_DIR}')
+
+    num_bytes, num_dirs, num_files = get_dir_size(ARCHIVE_DIR)
+    size = human_readable_size(num_bytes)
+    print(f'    > Total Size: {size} across {num_files} files in {num_dirs} directories')
+    print()
+
+    link_data_dirs = {link.link_dir for link in all_links}
+    valid_archive_dirs = set()
+    num_invalid = 0
+    for entry in os.scandir(ARCHIVE_DIR):
+        if entry.is_dir(follow_symlinks=True):
+            if os.path.exists(os.path.join(entry.path, 'index.json')):
+                valid_archive_dirs.add(entry.path)
+            else:
+                num_invalid += 1
+
+    print(f'    > {len(valid_archive_dirs)} valid archive data directories (directories containing an index.json file)')
+
+    num_unarchived = sum(1 for link in all_links if link.link_dir not in valid_archive_dirs)
+    print(f'    > {num_unarchived} missing data directories (directories missing for links in the index)')
+
+    print(f'    > {num_invalid} invalid data directories (directories present that don\'t contain an index file)')
+
+    num_orphaned = sum(1 for data_dir in valid_archive_dirs if data_dir not in link_data_dirs)
+    print(f'    > {num_orphaned} orphaned data directories (directories present for links that don\'t exist in the index)')
+
+

 @enforce_types
 def update_archive_data(import_path: Optional[str]=None, resume: Optional[float]=None, only_new: bool=False) -> List[Link]:

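The four counters printed by info() fall out of one pass over ARCHIVE_DIR plus two set lookups. A standalone sketch of that classification, using hypothetical names (classify_data_dirs, indexed_dirs) that are illustrative rather than part of the ArchiveBox API:

import os

def classify_data_dirs(archive_dir: str, indexed_dirs: set) -> tuple:
    """Mirror of the logic in info(): bucket every entry in archive_dir."""
    valid, num_invalid = set(), 0
    for entry in os.scandir(archive_dir):
        if entry.is_dir(follow_symlinks=True):
            # "valid" means the data dir carries its own per-link index.json
            if os.path.exists(os.path.join(entry.path, 'index.json')):
                valid.add(entry.path)
            else:
                num_invalid += 1
    # indexed links whose data dir is absent or lacks an index.json
    num_missing = sum(1 for d in indexed_dirs if d not in valid)
    # valid data dirs that no link in the index points at
    num_orphaned = sum(1 for d in valid if d not in indexed_dirs)
    return len(valid), num_missing, num_invalid, num_orphaned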
+ 1 - 1
archivebox/legacy/storage/json.py

@@ -27,7 +27,6 @@ MAIN_INDEX_HEADER = {
     'copyright_info': FOOTER_INFO,
     'meta': {
         'project': 'ArchiveBox',
-        'cmd': sys.argv,
         'version': VERSION,
         'git_sha': GIT_SHA,
         'website': 'https://ArchiveBox.io',
@@ -72,6 +71,7 @@ def write_json_main_index(links: List[Link], out_dir: str=OUTPUT_DIR) -> None:
         **MAIN_INDEX_HEADER,
         'num_links': len(links),
         'updated': datetime.now(),
+        'last_run_cmd': sys.argv,
         'links': links,
     }
     atomic_write(main_index_json, os.path.join(out_dir, JSON_INDEX_FILENAME))

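The net effect of the json.py change is that sys.argv moves out of the static MAIN_INDEX_HEADER meta block and into the per-write payload under a clearer name. An abridged sketch of the resulting index shape; the field values below are made up for illustration:

# Abridged shape of the main index dict after this change;
# values are illustrative, not real output.
main_index_json = {
    'meta': {
        'project': 'ArchiveBox',
        # 'cmd' no longer lives here: it changed on every run,
        # so it belongs with the per-write fields below
    },
    'num_links': 42,
    'updated': '2019-04-27 21:26:24',
    'last_run_cmd': ['archivebox', 'info'],  # moved and renamed from meta['cmd']
    'links': [],
}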
+ 29 - 1
archivebox/legacy/util.py

@@ -7,7 +7,7 @@ import shutil

 from string import Template
 from json import JSONEncoder
-from typing import List, Optional, Any, Union, IO, Mapping
+from typing import List, Optional, Any, Union, IO, Mapping, Tuple
 from inspect import signature
 from functools import wraps
 from hashlib import sha256
@@ -561,6 +561,34 @@ def copy_and_overwrite(from_path: str, to_path: str):
         with open(from_path, 'rb') as src:
             atomic_write(src.read(), to_path)

+
+@enforce_types
+def get_dir_size(path: str, recursive: bool=True) -> Tuple[int, int, int]:
+    num_bytes, num_dirs, num_files = 0, 0, 0
+    for entry in os.scandir(path):
+        if entry.is_dir(follow_symlinks=False):
+            if not recursive:
+                continue
+            num_dirs += 1
+            bytes_inside, dirs_inside, files_inside = get_dir_size(entry.path)
+            num_bytes += bytes_inside
+            num_dirs += dirs_inside
+            num_files += files_inside
+        else:
+            num_bytes += entry.stat(follow_symlinks=False).st_size
+            num_files += 1
+    return num_bytes, num_dirs, num_files
+
+
+@enforce_types
+def human_readable_size(num_bytes: Union[int, float]) -> str:
+    for count in ['Bytes','KB','MB','GB']:
+        if num_bytes > -1024.0 and num_bytes < 1024.0:
+            return '%3.1f%s' % (num_bytes, count)
+        num_bytes /= 1024.0
+    return '%3.1f%s' % (num_bytes, 'TB')
+
+
 @enforce_types
 def chrome_args(**options) -> List[str]:
     """helper to build up a chrome shell command with arguments"""
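The two new helpers compose naturally: get_dir_size() walks a tree without following symlinks, and human_readable_size() scales the raw byte count. A quick usage sketch, assuming archivebox.legacy.util is importable:

from archivebox.legacy.util import get_dir_size, human_readable_size

num_bytes, num_dirs, num_files = get_dir_size('.', recursive=True)
print(f'{human_readable_size(num_bytes)} across {num_files} files '
      f'in {num_dirs} directories')

# Scaling behavior of human_readable_size():
print(human_readable_size(0))      # 0.0Bytes
print(human_readable_size(1536))   # 1.5KB
print(human_readable_size(2**40))  # 1.0TB

Note that with recursive=False, get_dir_size() skips subdirectories entirely (the continue fires before num_dirs is incremented), so only top-level files are counted, which is why info() reports just a file count for the flat index directory.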