# archivebox/index/json.py — JSON index serialization/deserialization helpers
  1. __package__ = 'archivebox.index'
  2. import os
  3. import sys
  4. import json as pyjson
  5. from pathlib import Path
  6. from datetime import datetime
  7. from typing import List, Optional, Iterator, Any, Union
  8. from django.db.models import Model
  9. from .schema import Link
  10. from ..system import atomic_write
  11. from ..util import enforce_types
  12. from ..config import (
  13. VERSION,
  14. OUTPUT_DIR,
  15. FOOTER_INFO,
  16. GIT_SHA,
  17. DEPENDENCIES,
  18. JSON_INDEX_FILENAME,
  19. ARCHIVE_DIR_NAME,
  20. ANSI
  21. )
# Metadata header merged into the main JSON index when it is generated
# with headers (see generate_json_index_from_snapshots below).
MAIN_INDEX_HEADER = {
    'info': 'This is an index of site data archived by ArchiveBox: The self-hosted web archive.',
    'schema': 'archivebox.index.json',
    'copyright_info': FOOTER_INFO,
    'meta': {
        'project': 'ArchiveBox',
        'version': VERSION,
        'git_sha': GIT_SHA,
        'website': 'https://ArchiveBox.io',
        'docs': 'https://github.com/ArchiveBox/ArchiveBox/wiki',
        'source': 'https://github.com/ArchiveBox/ArchiveBox',
        'issues': 'https://github.com/ArchiveBox/ArchiveBox/issues',
        'dependencies': DEPENDENCIES,
    },
}
  37. @enforce_types
  38. def generate_json_index_from_snapshots(snapshots: List[Model], with_headers: bool):
  39. snapshots_json = [snapshot.as_json() for snapshot in snapshots]
  40. if with_headers:
  41. output = {
  42. **MAIN_INDEX_HEADER,
  43. 'num_links': len(snapshots),
  44. 'updated': datetime.now(),
  45. 'last_run_cmd': sys.argv,
  46. 'links': snapshots_json,
  47. }
  48. else:
  49. output = snapshots_json
  50. return to_json(output, indent=4, sort_keys=True)
  51. @enforce_types
  52. def parse_json_main_index(out_dir: Path=OUTPUT_DIR) -> Iterator[Link]:
  53. """parse an archive index json file and return the list of links"""
  54. index_path = Path(out_dir) / JSON_INDEX_FILENAME
  55. if index_path.exists():
  56. with open(index_path, 'r', encoding='utf-8') as f:
  57. links = pyjson.load(f)['links']
  58. for link_json in links:
  59. try:
  60. yield Link.from_json(link_json)
  61. except KeyError:
  62. try:
  63. detail_index_path = Path(OUTPUT_DIR) / ARCHIVE_DIR_NAME / link_json['timestamp']
  64. yield parse_json_link_details(str(detail_index_path))
  65. except KeyError:
  66. # as a last effort, try to guess the missing values out of existing ones
  67. try:
  68. yield Link.from_json(link_json, guess=True)
  69. except KeyError:
  70. print(" {lightyellow}! Failed to load the index.json from {}".format(detail_index_path, **ANSI))
  71. continue
  72. return ()
  73. ### Link Details Index
  74. @enforce_types
  75. def write_json_snapshot_details(snapshot: Model, out_dir: Optional[str]=None) -> None:
  76. """write a json file with some info about the snapshot"""
  77. out_dir = out_dir or snapshot.snapshot_dir
  78. path = Path(out_dir) / JSON_INDEX_FILENAME
  79. atomic_write(str(path), snapshot.as_json())
  80. @enforce_types
  81. def load_json_snapshot(out_dir: Path) -> Optional[Model]:
  82. """
  83. Loads the detail from the local json index
  84. """
  85. from core.models import Snapshot
  86. existing_index = Path(out_dir) / JSON_INDEX_FILENAME
  87. if existing_index.exists():
  88. with open(existing_index, 'r', encoding='utf-8') as f:
  89. try:
  90. output = pyjson.load(f)
  91. output = Snapshot.from_json(output)
  92. return output
  93. except pyjson.JSONDecodeError:
  94. pass
  95. return None
  96. @enforce_types
  97. def parse_json_snapshot_details(out_dir: Union[Path, str]) -> Iterator[dict]:
  98. """read through all the archive data folders and return the parsed snapshots"""
  99. for entry in os.scandir(Path(out_dir) / ARCHIVE_DIR_NAME):
  100. if entry.is_dir(follow_symlinks=True):
  101. if (Path(entry.path) / 'index.json').exists():
  102. try:
  103. snapshot_details = load_json_snapshot(Path(entry.path))
  104. except KeyError:
  105. snapshot_details = None
  106. if snapshot_details:
  107. yield snapshot_details
  108. ### Helpers
  109. class ExtendedEncoder(pyjson.JSONEncoder):
  110. """
  111. Extended json serializer that supports serializing several model
  112. fields and objects
  113. """
  114. def default(self, obj):
  115. cls_name = obj.__class__.__name__
  116. if hasattr(obj, '_asdict'):
  117. return obj._asdict()
  118. elif isinstance(obj, bytes):
  119. return obj.decode()
  120. elif isinstance(obj, datetime):
  121. return obj.isoformat()
  122. elif isinstance(obj, Exception):
  123. return '{}: {}'.format(obj.__class__.__name__, obj)
  124. elif cls_name in ('dict_items', 'dict_keys', 'dict_values'):
  125. return tuple(obj)
  126. return pyjson.JSONEncoder.default(self, obj)
  127. @enforce_types
  128. def to_json(obj: Any, indent: Optional[int]=4, sort_keys: bool=True, cls=ExtendedEncoder) -> str:
  129. return pyjson.dumps(obj, indent=indent, sort_keys=sort_keys, cls=ExtendedEncoder)