# archivebox/index/json.py
  1. __package__ = 'archivebox.index'
  2. import os
  3. import sys
  4. import json as pyjson
  5. from pathlib import Path
  6. from datetime import datetime, timezone
  7. from typing import List, Optional, Iterator, Any, Union
  8. import abx
  9. from archivebox.config import VERSION, DATA_DIR, CONSTANTS
  10. from archivebox.config.common import SERVER_CONFIG, SHELL_CONFIG
  11. from .schema import Link
  12. from archivebox.misc.system import atomic_write
  13. from archivebox.misc.util import enforce_types
  14. @enforce_types
  15. def generate_json_index_from_links(links: List[Link], with_headers: bool=False):
  16. MAIN_INDEX_HEADER = {
  17. 'info': 'This is an index of site data archived by ArchiveBox: The self-hosted web archive.',
  18. 'schema': 'archivebox.index.json',
  19. 'copyright_info': SERVER_CONFIG.FOOTER_INFO,
  20. 'meta': {
  21. 'project': 'ArchiveBox',
  22. 'version': VERSION,
  23. 'git_sha': VERSION, # not used anymore, but kept for backwards compatibility
  24. 'website': 'https://ArchiveBox.io',
  25. 'docs': 'https://github.com/ArchiveBox/ArchiveBox/wiki',
  26. 'source': 'https://github.com/ArchiveBox/ArchiveBox',
  27. 'issues': 'https://github.com/ArchiveBox/ArchiveBox/issues',
  28. 'dependencies': abx.as_dict(abx.pm.hook.get_BINARIES()),
  29. },
  30. } if with_headers else {}
  31. if with_headers:
  32. output = {
  33. **MAIN_INDEX_HEADER,
  34. 'num_links': len(links),
  35. 'updated': datetime.now(timezone.utc),
  36. 'last_run_cmd': sys.argv,
  37. 'links': links,
  38. }
  39. else:
  40. output = links
  41. return to_json(output, indent=4, sort_keys=True)
  42. @enforce_types
  43. def parse_json_main_index(out_dir: Path=DATA_DIR) -> Iterator[Link]:
  44. """parse an archive index json file and return the list of links"""
  45. index_path = Path(out_dir) / CONSTANTS.JSON_INDEX_FILENAME
  46. if index_path.exists():
  47. with open(index_path, 'r', encoding='utf-8') as f:
  48. try:
  49. links = pyjson.load(f)['links']
  50. if links:
  51. Link.from_json(links[0])
  52. except Exception as err:
  53. print(" {lightyellow}! Found an index.json in the project root but couldn't load links from it: {} {}".format(
  54. err.__class__.__name__,
  55. err,
  56. **SHELL_CONFIG.ANSI,
  57. ))
  58. return ()
  59. for link_json in links:
  60. try:
  61. yield Link.from_json(link_json)
  62. except KeyError:
  63. try:
  64. detail_index_path = CONSTANTS.ARCHIVE_DIR / link_json['timestamp']
  65. yield parse_json_link_details(str(detail_index_path))
  66. except KeyError:
  67. # as a last effort, try to guess the missing values out of existing ones
  68. try:
  69. yield Link.from_json(link_json, guess=True)
  70. except KeyError:
  71. # print(" {lightyellow}! Failed to load the index.json from {}".format(detail_index_path, **ANSI))
  72. continue
  73. return ()
  74. ### Link Details Index
  75. @enforce_types
  76. def write_json_link_details(link: Link, out_dir: Optional[str]=None) -> None:
  77. """write a json file with some info about the link"""
  78. out_dir = out_dir or link.link_dir
  79. path = Path(out_dir) / CONSTANTS.JSON_INDEX_FILENAME
  80. atomic_write(str(path), link._asdict(extended=True))
  81. @enforce_types
  82. def parse_json_link_details(out_dir: Union[Path, str], guess: bool=False) -> Optional[Link]:
  83. """load the json link index from a given directory"""
  84. existing_index = Path(out_dir) / CONSTANTS.JSON_INDEX_FILENAME
  85. if os.access(existing_index, os.F_OK):
  86. with open(existing_index, 'r', encoding='utf-8') as f:
  87. try:
  88. link_json = pyjson.load(f)
  89. return Link.from_json(link_json, guess)
  90. except pyjson.JSONDecodeError:
  91. pass
  92. return None
  93. @enforce_types
  94. def parse_json_links_details(out_dir: Union[Path, str]) -> Iterator[Link]:
  95. """read through all the archive data folders and return the parsed links"""
  96. for entry in os.scandir(CONSTANTS.ARCHIVE_DIR):
  97. if entry.is_dir(follow_symlinks=True):
  98. if os.access((Path(entry.path) / 'index.json'), os.F_OK):
  99. try:
  100. link = parse_json_link_details(entry.path)
  101. except KeyError:
  102. link = None
  103. if link:
  104. yield link
  105. ### Helpers
  106. class ExtendedEncoder(pyjson.JSONEncoder):
  107. """
  108. Extended json serializer that supports serializing several model
  109. fields and objects
  110. """
  111. def default(self, obj):
  112. cls_name = type(obj).__name__
  113. if hasattr(obj, '_asdict'):
  114. return obj._asdict()
  115. elif isinstance(obj, bytes):
  116. return obj.decode()
  117. elif isinstance(obj, Path):
  118. return str(obj)
  119. elif isinstance(obj, datetime):
  120. return obj.isoformat()
  121. elif isinstance(obj, Exception):
  122. return '{}: {}'.format(obj.__class__.__name__, obj)
  123. elif cls_name in ('dict_items', 'dict_keys', 'dict_values'):
  124. return list(obj)
  125. try:
  126. return dict(obj)
  127. except Exception:
  128. pass
  129. try:
  130. return list(obj)
  131. except Exception:
  132. pass
  133. try:
  134. return str(obj)
  135. except Exception:
  136. pass
  137. return pyjson.JSONEncoder.default(self, obj)
  138. @enforce_types
  139. def to_json(obj: Any, indent: Optional[int]=4, sort_keys: bool=True, cls=ExtendedEncoder, default=None) -> str:
  140. return pyjson.dumps(obj, indent=indent, sort_keys=sort_keys, cls=ExtendedEncoder, default=default)