hashing.py 10 KB

import hashlib
import mimetypes
from functools import lru_cache
from pathlib import Path
from typing import Callable
from datetime import datetime

import blake3  # pip install blake3

@lru_cache(maxsize=1024)
def _cached_file_hashes(filepath: str, size: int, mtime: float) -> tuple[str, str]:
    """Internal function to calculate file hashes with a cache key based on path, size and mtime."""
    sha256_hash = hashlib.sha256()
    blake3_hash = blake3.blake3()
    with open(filepath, 'rb') as f:
        # Read the file once and update both hashes simultaneously
        for chunk in iter(lambda: f.read(4096), b''):
            sha256_hash.update(chunk)
            blake3_hash.update(chunk)
    return sha256_hash.hexdigest(), blake3_hash.hexdigest()

@lru_cache(maxsize=10)
def hash_file(file_path: Path, pwd: Path | None = None) -> tuple[str, str]:
    """Calculate SHA256 and BLAKE3 hashes of a file with caching based on path, size and mtime."""
    pwd = Path(pwd) if pwd else None
    file_path = Path(file_path)
    if not file_path.is_absolute():
        file_path = pwd / file_path if pwd else file_path.absolute()
    abs_path = file_path.resolve()
    stat_info = abs_path.stat()
    return _cached_file_hashes(
        str(abs_path),
        stat_info.st_size,
        stat_info.st_mtime
    )

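# Illustrative usage sketch (the file path below is hypothetical):
#
#     sha256_hex, blake3_hex = hash_file(Path('README.md'))
#
# Because hash_file and _cached_file_hashes are wrapped in lru_cache, hashing the
# same unchanged file again returns the cached digests instead of re-reading it;
# the cache key changes whenever the file's size or mtime changes.
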
@lru_cache(maxsize=10)
def get_dir_hashes(dir_path: Path, pwd: Path | None = None, filter_func: Callable | None = None, max_depth: int = -1) -> dict[str, tuple[str, str]]:
    """Calculate SHA256 and BLAKE3 hashes for all files and directories recursively."""
    pwd = Path(pwd) if pwd else None
    dir_path = Path(dir_path)
    if not dir_path.is_absolute():
        dir_path = pwd / dir_path if pwd else dir_path.absolute()
    if not dir_path.is_dir():
        raise ValueError(f"Not a directory: {dir_path}")
    if max_depth < -1:
        raise ValueError(f"max_depth must be >= -1, got {max_depth}")

    # Get all files recursively
    all_files = get_dir_entries(
        dir_path, pwd=pwd, recursive=True,
        include_files=True, include_dirs=False,
        filter_func=filter_func
    )

    hashes: dict[str, tuple[str, str]] = {}
    hashable_summary_sha256 = []
    hashable_summary_blake3 = []

    # Calculate hashes for all files
    for subfile in all_files:
        subfile_path = dir_path / subfile
        sha256_hash, blake3_hash = hash_file(subfile_path)
        hashes[subfile] = (sha256_hash, blake3_hash)
        hashable_summary_sha256.append(f"{sha256_hash} ./{subfile}")
        hashable_summary_blake3.append(f"{blake3_hash} ./{subfile}")

    # Calculate hashes for all directories
    subdirs = get_dir_entries(
        dir_path, pwd=pwd, recursive=True,
        include_files=False, include_dirs=True,
        include_hidden=False, filter_func=filter_func,
        max_depth=max_depth
    )
    for subdir in subdirs:
        subdir_path = dir_path / subdir
        subdir_hashes = get_dir_hashes(
            subdir_path, filter_func=filter_func,
            max_depth=0
        )
        hashes[subdir] = subdir_hashes['.']

    # Filter results by max_depth
    if max_depth >= 0:
        hashes = {
            path: value for path, value in hashes.items()
            if len(Path(path).parts) <= max_depth + 1
        }

    # Calculate root directory hashes
    hashable_summary_sha256.sort()
    hashable_summary_blake3.sort()
    root_sha256 = hashlib.sha256('\n'.join(hashable_summary_sha256).encode()).hexdigest()
    root_blake3 = blake3.blake3('\n'.join(hashable_summary_blake3).encode()).hexdigest()
    hashes['.'] = (root_sha256, root_blake3)
    return hashes

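# Illustrative usage sketch (the directory name is hypothetical): the result is a
# dict keyed by path relative to dir_path, with '.' holding the root digests
# computed over the sorted per-file summary lines.
#
#     hashes = get_dir_hashes(Path('src'), max_depth=1)
#     root_sha256, root_blake3 = hashes['.']
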
@lru_cache(maxsize=128)
def get_dir_entries(dir_path: Path, pwd: Path | None = None, recursive: bool = True,
                    include_files: bool = True, include_dirs: bool = True, include_hidden: bool = False,
                    filter_func: Callable | None = None, max_depth: int = -1) -> tuple[str, ...]:
    """Get a filtered, sorted tuple of directory entries as paths relative to dir_path."""
    pwd = Path(pwd) if pwd else None
    dir_path = Path(dir_path)
    if not dir_path.is_absolute():
        dir_path = pwd / dir_path if pwd else dir_path.absolute()

    results = []

    def process_path(path: Path, depth: int) -> bool:
        if not include_hidden and path.name.startswith('.'):
            return False
        if max_depth >= 0 and depth > max_depth:
            return False
        if filter_func:
            info = {
                "abspath": str(path.absolute()),
                "relpath": str(path.relative_to(dir_path))
            }
            if not filter_func(info):
                return False
        return True

    # glob('*') already limits the walk to direct children when recursive is False
    for path in dir_path.rglob('*') if recursive else dir_path.glob('*'):
        current_depth = len(path.relative_to(dir_path).parts)
        if path.is_file() and include_files and process_path(path, current_depth):
            results.append(str(path.relative_to(dir_path)))
        elif path.is_dir() and include_dirs and process_path(path, current_depth):
            results.append(str(path.relative_to(dir_path)))

    return tuple(sorted(results))  # Immutable and hashable so the result can be cached

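# Illustrative filter_func sketch: the callable receives a dict with "abspath" and
# "relpath" keys and returns True to keep the entry. For example, to skip compiled
# Python files (the pattern is chosen purely for illustration):
#
#     no_pyc = lambda info: not info["relpath"].endswith('.pyc')
#     entries = get_dir_entries(Path('.'), filter_func=no_pyc, include_dirs=False)
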
@lru_cache(maxsize=1024)
def get_dir_sizes(dir_path: Path, pwd: Path | None = None, **kwargs) -> dict[str, int]:
    """Calculate sizes for all files and directories recursively."""
    sizes: dict[str, int] = {}
    hashes = get_dir_hashes(dir_path, pwd=pwd, **kwargs)
    dir_path = Path(dir_path)
    for path_key in hashes:
        full_path = dir_path / path_key
        if full_path.is_file():
            sizes[path_key] = full_path.stat().st_size
        else:
            total = 0
            for file_path in full_path.rglob('*'):
                if file_path.is_file() and not file_path.name.startswith('.'):
                    total += file_path.stat().st_size
            sizes[path_key + '/'] = total  # Directory keys get a trailing slash
    return sizes

@lru_cache(maxsize=10)
def get_dir_info(dir_path: Path, pwd: Path | None = None, filter_func: Callable | None = None, max_depth: int = -1) -> dict:
    """Get detailed information about directory contents, including both hash types and sizes."""
    pwd = Path(pwd) if pwd else None
    dir_path = Path(dir_path)
    if not dir_path.is_absolute():
        dir_path = pwd / dir_path if pwd else dir_path.absolute()
    hashes = get_dir_hashes(dir_path, pwd=pwd, filter_func=filter_func, max_depth=max_depth)
    sizes = get_dir_sizes(dir_path, pwd=pwd, filter_func=filter_func, max_depth=max_depth)
    num_total_subpaths = sum(1 for name in hashes if name != '.')
    details = {}
    for filename, (sha256_hash, blake3_hash) in sorted(hashes.items()):
        abs_path = (dir_path / filename).resolve()
        stat_info = abs_path.stat()
        num_subpaths = sum(1 for p in hashes if p.startswith(filename + '/'))
        is_dir = abs_path.is_dir()
        if is_dir:
            mime_type = 'inode/directory'
            basename = abs_path.name
            extension = ''
            num_bytes = sizes[filename + '/']
            if filename == '.':
                num_subpaths = num_total_subpaths
            else:
                filename += '/'
        else:  # is_file
            num_subpaths = None
            mime_type = mimetypes.guess_type(str(abs_path))[0]
            extension = abs_path.suffix
            basename = abs_path.stem  # name without the last suffix; safe when there is no extension
            num_bytes = sizes[filename]
        details[filename] = {
            'basename': basename,
            'mime_type': mime_type,
            'extension': extension,
            'num_subpaths': num_subpaths,
            'num_bytes': num_bytes,
            'hash_sha256': sha256_hash,
            'hash_blake3': blake3_hash,
            'created_at': datetime.fromtimestamp(stat_info.st_ctime).isoformat(),
            'modified_at': datetime.fromtimestamp(stat_info.st_mtime).isoformat(),
        }
        if filter_func and not filter_func(details[filename]):
            del details[filename]
    return details

if __name__ == '__main__':
    import json

    dir_info = get_dir_info(Path('.'), max_depth=6)
    with open('.hashes.json', 'w') as f:
        json.dump(dir_info, f, indent=4)
    print('√ Wrote .hashes.json')

# Example output:
# {
#     ".": {
#         "basename": "misc",
#         "mime_type": "inode/directory",
#         "extension": "",
#         "num_subpaths": 25,
#         "num_bytes": 214677,
#         "hash_sha256": "addfacf88b2ff6b564846415fb7b21dcb7e63ee4e911bc0aec255ee354958530",
#         "hash_blake3": "3403a1f876453c7749f17ee3502769eff05cff20b5d6c2f2cf458e6353a380db",
#         "created_at": "2024-12-04T00:08:38.537449",
#         "modified_at": "2024-12-04T00:08:38.537449"
#     },
#     "__init__.py": {
#         "basename": "__init__",
#         "mime_type": "text/x-python",
#         "extension": ".py",
#         "num_subpaths": null,
#         "num_bytes": 32,
#         "hash_sha256": "b0e5e7ff17db3b60535cf664282787767c336e3e203a43e21b6326c6fe457551",
#         "hash_blake3": "4a801eb2a4cdde8d3422be1e2074b78574a5890afb3027cbe6f3b3cf4d113fd1",
#         "created_at": "2024-10-08T00:51:41.001359",
#         "modified_at": "2024-10-08T00:51:41.001359"
#     },
#     "__pycache__/": {
#         "basename": "__pycache__",
#         "mime_type": "inode/directory",
#         "extension": "",
#         "num_subpaths": 8,
#         "num_bytes": 107593,
#         "hash_sha256": "9e917a438be774ffc7ea9125de71008c29a7d9003b6f5e09e2085aa1ef3157b3",
#         "hash_blake3": "e87184485bd67bd9b723a9ee4d472e8c1d24a4388d373046a27e5a1e10467a06",
#         "created_at": "2024-12-04T00:00:16.149390",
#         "modified_at": "2024-12-04T00:00:16.149390"
#     },
#     "__pycache__/__init__.cpython-313.pyc": {
#         "basename": "__init__.cpython-313",
#         "mime_type": "application/x-python-code",
#         "extension": ".pyc",
#         "num_subpaths": null,
#         "num_bytes": 223,
#         "hash_sha256": "d29e3ee5e6b9b564422d9ef2c7325d28cf759b9fb868f59551ba43cd991d51be",
#         "hash_blake3": "279a6dc4c8161d6ddb18fa72c882f375324ed152dc6c7c7eac9ef5fdd066f2fd",
#         "created_at": "2024-12-03T03:13:43.257430",
#         "modified_at": "2024-12-03T03:13:43.257308"
#     },
#     ...
# }