
add archivebox info command to scan data dir

Nick Sweeting, 6 years ago
commit ab68819332
4 changed files with 105 additions and 3 deletions
  1. archivebox/cli/archivebox_info.py   + 28 - 0
  2. archivebox/legacy/main.py           + 47 - 1
  3. archivebox/legacy/storage/json.py   + 1 - 1
  4. archivebox/legacy/util.py           + 29 - 1

+ 28 - 0
archivebox/cli/archivebox_info.py

@@ -0,0 +1,28 @@
+#!/usr/bin/env python3
+
+__package__ = 'archivebox.cli'
+__command__ = 'archivebox info'
+__description__ = 'Print out some info and statistics about the archive collection'
+
+import sys
+import argparse
+
+from ..legacy.main import info
+from ..legacy.util import reject_stdin
+
+
+def main(args=None):
+    args = sys.argv[1:] if args is None else args
+
+    parser = argparse.ArgumentParser(
+        prog=__command__,
+        description=__description__,
+        add_help=True,
+    )
+    parser.parse_args(args)
+    reject_stdin(__command__)
+
+    info()
+
+if __name__ == '__main__':
+    main()
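
For reference, the new entry point can be exercised programmatically as well as from the shell; a minimal sketch, assuming the package is importable and the working directory is an initialized collection:

    # Hypothetical smoke test for the new subcommand; assumes `archivebox`
    # is importable and cwd contains an ArchiveBox collection.
    from archivebox.cli.archivebox_info import main

    main([])   # equivalent to running `archivebox info` with no arguments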

+ 47 - 1
archivebox/legacy/main.py

@@ -5,7 +5,12 @@ import shutil
 from typing import List, Optional, Iterable
 
 from .schema import Link
-from .util import enforce_types, TimedProgress
+from .util import (
+    enforce_types,
+    TimedProgress,
+    get_dir_size,
+    human_readable_size,
+)
 from .index import (
     links_after_timestamp,
     load_main_index,
@@ -119,6 +124,47 @@ def init():
     print('        archivebox help')
 
 
+@enforce_types
+def info():
+    all_links = load_main_index(out_dir=OUTPUT_DIR)
+
+    print('{green}[*] Scanning archive collection main index with {} links:{reset}'.format(len(all_links), **ANSI))
+    print(f'    {OUTPUT_DIR}')
+
+    num_bytes, num_dirs, num_files = get_dir_size(OUTPUT_DIR, recursive=False)
+    size = human_readable_size(num_bytes)
+    print(f'    > Index Size: {size} across {num_files} files')
+    print()
+
+    print('{green}[*] Scanning archive collection data directory with {} entries:{reset}'.format(len(all_links), **ANSI))
+    print(f'    {ARCHIVE_DIR}')
+
+    num_bytes, num_dirs, num_files = get_dir_size(ARCHIVE_DIR)
+    size = human_readable_size(num_bytes)
+    print(f'    > Total Size: {size} across {num_files} files in {num_dirs} directories')
+    print()
+
+    link_data_dirs = {link.link_dir for link in all_links}
+    valid_archive_dirs = set()
+    num_invalid = 0
+    for entry in os.scandir(ARCHIVE_DIR):
+        if entry.is_dir(follow_symlinks=True):
+            if os.path.exists(os.path.join(entry.path, 'index.json')):
+                valid_archive_dirs.add(entry.path)
+            else:
+                num_invalid += 1
+
+    print(f'    > {len(valid_archive_dirs)} valid archive data directories (directories containing an index.json)')
+
+    num_unarchived = sum(1 for link in all_links if link.link_dir not in valid_archive_dirs)
+    print(f'    > {num_unarchived} missing data directories (directories missing for links in the index)')
+
+    print(f'    > {num_invalid} invalid data directories (directories present that don\'t contain an index file)')
+
+    num_orphaned = sum(1 for data_dir in valid_archive_dirs if data_dir not in link_data_dirs)
+    print(f'    > {num_orphaned} orphaned data directories (directories present for links that don\'t exist in the index)')
+
+
 
 @enforce_types
 def update_archive_data(import_path: Optional[str]=None, resume: Optional[float]=None, only_new: bool=False) -> List[Link]:
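
The bookkeeping in info() reduces to set arithmetic between directories found on disk and the link directories recorded in the index. A standalone restatement for clarity; the index.json marker comes from the diff above, while the function name and set-based counting are illustrative (info() itself counts per link, not per unique directory):

    import os

    def classify_data_dirs(archive_dir: str, link_data_dirs: set) -> tuple:
        # "valid" dirs contain an index.json; other dirs on disk are "invalid"
        valid, num_invalid = set(), 0
        for entry in os.scandir(archive_dir):
            if entry.is_dir(follow_symlinks=True):
                if os.path.exists(os.path.join(entry.path, 'index.json')):
                    valid.add(entry.path)
                else:
                    num_invalid += 1
        num_missing = len(link_data_dirs - valid)    # in the index, absent on disk
        num_orphaned = len(valid - link_data_dirs)   # on disk, absent from the index
        return valid, num_invalid, num_missing, num_orphaned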

+ 1 - 1
archivebox/legacy/storage/json.py

@@ -27,7 +27,6 @@ MAIN_INDEX_HEADER = {
     'copyright_info': FOOTER_INFO,
     'meta': {
         'project': 'ArchiveBox',
-        'cmd': sys.argv,
         'version': VERSION,
         'git_sha': GIT_SHA,
         'website': 'https://ArchiveBox.io',
@@ -72,6 +71,7 @@ def write_json_main_index(links: List[Link], out_dir: str=OUTPUT_DIR) -> None:
         **MAIN_INDEX_HEADER,
         'num_links': len(links),
         'updated': datetime.now(),
+        'last_run_cmd': sys.argv,
         'links': links,
     }
     atomic_write(main_index_json, os.path.join(out_dir, JSON_INDEX_FILENAME))
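
The net effect on the written index: the invoking command line is now recorded per write under last_run_cmd rather than baked into the static meta header. Roughly, the output takes this shape (all values below are illustrative):

    # Illustrative shape of the main index JSON after this change.
    main_index = {
        'meta': {
            'project': 'ArchiveBox',
            # version, git_sha, website, ... (no 'cmd' key here anymore)
        },
        'num_links': 123,
        'updated': '2019-04-22 00:00:00',
        'last_run_cmd': ['archivebox', 'update'],  # sys.argv of the writing process
        'links': [],
    }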

+ 29 - 1
archivebox/legacy/util.py

@@ -7,7 +7,7 @@ import shutil
 
 from string import Template
 from json import JSONEncoder
-from typing import List, Optional, Any, Union, IO, Mapping
+from typing import List, Optional, Any, Union, IO, Mapping, Tuple
 from inspect import signature
 from functools import wraps
 from hashlib import sha256
@@ -561,6 +561,34 @@ def copy_and_overwrite(from_path: str, to_path: str):
         with open(from_path, 'rb') as src:
             atomic_write(src.read(), to_path)
 
+
+@enforce_types
+def get_dir_size(path: str, recursive: bool=True) -> Tuple[int, int, int]:
+    num_bytes, num_dirs, num_files = 0, 0, 0
+    for entry in os.scandir(path):
+        if entry.is_dir(follow_symlinks=False):
+            if not recursive:
+                continue
+            num_dirs += 1
+            bytes_inside, dirs_inside, files_inside = get_dir_size(entry.path)
+            num_bytes += bytes_inside
+            num_dirs += dirs_inside
+            num_files += files_inside
+        else:
+            num_bytes += entry.stat(follow_symlinks=False).st_size
+            num_files += 1
+    return num_bytes, num_dirs, num_files
+
+
+@enforce_types
+def human_readable_size(num_bytes: Union[int, float]) -> str:
+    for unit in ['Bytes', 'KB', 'MB', 'GB']:
+        if -1024.0 < num_bytes < 1024.0:
+            return '%3.1f%s' % (num_bytes, unit)
+        num_bytes /= 1024.0
+    return '%3.1f%s' % (num_bytes, 'TB')
+
+
 @enforce_types
 def chrome_args(**options) -> List[str]:
     """helper to build up a chrome shell command with arguments"""