folders.py 1.7 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152
  1. """
  2. Folder utilities for ArchiveBox.
  3. Note: This file only contains legacy cleanup utilities.
  4. The DB is the single source of truth - use Snapshot.objects queries for all status checks.
  5. """
  6. __package__ = 'archivebox.misc'
  7. import os
  8. import json
  9. import shutil
  10. from pathlib import Path
  11. from typing import Tuple, List
  12. from archivebox.config import DATA_DIR, CONSTANTS
  13. from archivebox.misc.util import enforce_types
  14. @enforce_types
  15. def fix_invalid_folder_locations(out_dir: Path = DATA_DIR) -> Tuple[List[str], List[str]]:
  16. """
  17. Legacy cleanup: Move folders to their correct timestamp-named locations based on index.json.
  18. This is only used during 'archivebox init' for one-time cleanup of misnamed directories.
  19. After this runs once, 'archivebox update' handles all filesystem operations.
  20. """
  21. fixed = []
  22. cant_fix = []
  23. for entry in os.scandir(out_dir / CONSTANTS.ARCHIVE_DIR_NAME):
  24. if entry.is_dir(follow_symlinks=True):
  25. index_path = Path(entry.path) / 'index.json'
  26. if index_path.exists():
  27. try:
  28. with open(index_path, 'r') as f:
  29. data = json.load(f)
  30. timestamp = data.get('timestamp')
  31. url = data.get('url')
  32. except Exception:
  33. continue
  34. if not timestamp:
  35. continue
  36. if not entry.path.endswith(f'/{timestamp}'):
  37. dest = out_dir / CONSTANTS.ARCHIVE_DIR_NAME / timestamp
  38. if dest.exists():
  39. cant_fix.append(entry.path)
  40. else:
  41. shutil.move(entry.path, str(dest))
  42. fixed.append(str(dest))
  43. return fixed, cant_fix