# archivebox/index/json.py
  1. __package__ = 'archivebox.index'
  2. import os
  3. import sys
  4. import json as pyjson
  5. from pathlib import Path
  6. from datetime import datetime
  7. from typing import List, Optional, Iterator, Any, Union
  8. from .schema import Link
  9. from ..system import atomic_write
  10. from ..util import enforce_types
  11. from ..config import (
  12. VERSION,
  13. OUTPUT_DIR,
  14. FOOTER_INFO,
  15. GIT_SHA,
  16. DEPENDENCIES,
  17. JSON_INDEX_FILENAME,
  18. ARCHIVE_DIR_NAME,
  19. ANSI
  20. )
# Static metadata block prepended to the main index JSON output when it is
# generated with headers (see generate_json_index_from_links below).
MAIN_INDEX_HEADER = {
    'info': 'This is an index of site data archived by ArchiveBox: The self-hosted web archive.',
    'schema': 'archivebox.index.json',
    'copyright_info': FOOTER_INFO,
    'meta': {
        'project': 'ArchiveBox',
        'version': VERSION,
        'git_sha': GIT_SHA,
        'website': 'https://ArchiveBox.io',
        'docs': 'https://github.com/ArchiveBox/ArchiveBox/wiki',
        'source': 'https://github.com/ArchiveBox/ArchiveBox',
        'issues': 'https://github.com/ArchiveBox/ArchiveBox/issues',
        'dependencies': DEPENDENCIES,
    },
}
  36. @enforce_types
  37. def generate_json_index_from_links(links: List[Link], with_headers: bool):
  38. if with_headers:
  39. output = {
  40. **MAIN_INDEX_HEADER,
  41. 'num_links': len(links),
  42. 'updated': datetime.now(),
  43. 'last_run_cmd': sys.argv,
  44. 'links': links,
  45. }
  46. else:
  47. output = links
  48. return to_json(output, indent=4, sort_keys=True)
  49. @enforce_types
  50. def parse_json_main_index(out_dir: Path=OUTPUT_DIR) -> Iterator[Link]:
  51. """parse an archive index json file and return the list of links"""
  52. index_path = Path(out_dir) / JSON_INDEX_FILENAME
  53. if index_path.exists():
  54. with open(index_path, 'r', encoding='utf-8') as f:
  55. links = pyjson.load(f)['links']
  56. for link_json in links:
  57. try:
  58. yield Link.from_json(link_json)
  59. except KeyError:
  60. try:
  61. detail_index_path = Path(OUTPUT_DIR) / ARCHIVE_DIR_NAME / link_json['timestamp']
  62. yield parse_json_link_details(str(detail_index_path))
  63. except KeyError:
  64. # as a last effort, try to guess the missing values out of existing ones
  65. try:
  66. yield Link.from_json(link_json, guess=True)
  67. except KeyError:
  68. print(" {lightyellow}! Failed to load the index.json from {}".format(detail_index_path, **ANSI))
  69. continue
  70. return ()
  71. ### Link Details Index
  72. @enforce_types
  73. def write_json_link_details(link: Link, out_dir: Optional[str]=None) -> None:
  74. """write a json file with some info about the link"""
  75. out_dir = out_dir or link.link_dir
  76. path = Path(out_dir) / JSON_INDEX_FILENAME
  77. atomic_write(str(path), link._asdict(extended=True))
  78. @enforce_types
  79. def parse_json_link_details(out_dir: Union[Path, str], guess: Optional[bool]=False) -> Optional[Link]:
  80. """load the json link index from a given directory"""
  81. existing_index = Path(out_dir) / JSON_INDEX_FILENAME
  82. if existing_index.exists():
  83. with open(existing_index, 'r', encoding='utf-8') as f:
  84. try:
  85. link_json = pyjson.load(f)
  86. return Link.from_json(link_json, guess)
  87. except pyjson.JSONDecodeError:
  88. pass
  89. return None
  90. @enforce_types
  91. def parse_json_links_details(out_dir: Union[Path, str]) -> Iterator[Link]:
  92. """read through all the archive data folders and return the parsed links"""
  93. for entry in os.scandir(Path(out_dir) / ARCHIVE_DIR_NAME):
  94. if entry.is_dir(follow_symlinks=True):
  95. if (Path(entry.path) / 'index.json').exists():
  96. try:
  97. link = parse_json_link_details(entry.path)
  98. except KeyError:
  99. link = None
  100. if link:
  101. yield link
  102. ### Helpers
  103. class ExtendedEncoder(pyjson.JSONEncoder):
  104. """
  105. Extended json serializer that supports serializing several model
  106. fields and objects
  107. """
  108. def default(self, obj):
  109. cls_name = obj.__class__.__name__
  110. if hasattr(obj, '_asdict'):
  111. return obj._asdict()
  112. elif isinstance(obj, bytes):
  113. return obj.decode()
  114. elif isinstance(obj, datetime):
  115. return obj.isoformat()
  116. elif isinstance(obj, Exception):
  117. return '{}: {}'.format(obj.__class__.__name__, obj)
  118. elif cls_name in ('dict_items', 'dict_keys', 'dict_values'):
  119. return tuple(obj)
  120. return pyjson.JSONEncoder.default(self, obj)
  121. @enforce_types
  122. def to_json(obj: Any, indent: Optional[int]=4, sort_keys: bool=True, cls=ExtendedEncoder) -> str:
  123. return pyjson.dumps(obj, indent=indent, sort_keys=sort_keys, cls=ExtendedEncoder)