json.py 5.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164
  1. __package__ = 'archivebox.index'
  2. import os
  3. import sys
  4. import json as pyjson
  5. from pathlib import Path
  6. from datetime import datetime
  7. from typing import List, Optional, Iterator, Any
  8. from .schema import Link, ArchiveResult
  9. from ..system import atomic_write
  10. from ..util import enforce_types
  11. from ..config import (
  12. VERSION,
  13. OUTPUT_DIR,
  14. FOOTER_INFO,
  15. GIT_SHA,
  16. DEPENDENCIES,
  17. JSON_INDEX_FILENAME,
  18. ARCHIVE_DIR_NAME,
  19. ANSI
  20. )
# Metadata header embedded at the top of the main index JSON file.
# Records the schema name/version and build info (git sha, dependency
# versions) so a later ArchiveBox version can detect and migrate an
# older index format.
MAIN_INDEX_HEADER = {
    'info': 'This is an index of site data archived by ArchiveBox: The self-hosted web archive.',
    'schema': 'archivebox.index.json',
    'copyright_info': FOOTER_INFO,
    'meta': {
        'project': 'ArchiveBox',
        'version': VERSION,
        'git_sha': GIT_SHA,
        'website': 'https://ArchiveBox.io',
        'docs': 'https://github.com/pirate/ArchiveBox/wiki',
        'source': 'https://github.com/pirate/ArchiveBox',
        'issues': 'https://github.com/pirate/ArchiveBox/issues',
        'dependencies': DEPENDENCIES,
    },
}
  36. ### Main Links Index
  37. @enforce_types
  38. def parse_json_main_index(out_dir: str=OUTPUT_DIR) -> Iterator[Link]:
  39. """parse an archive index json file and return the list of links"""
  40. index_path = os.path.join(out_dir, JSON_INDEX_FILENAME)
  41. if os.path.exists(index_path):
  42. with open(index_path, 'r', encoding='utf-8') as f:
  43. links = pyjson.load(f)['links']
  44. for link_json in links:
  45. try:
  46. yield Link.from_json(link_json)
  47. except KeyError:
  48. try:
  49. detail_index_path = Path(OUTPUT_DIR) / ARCHIVE_DIR_NAME / link_json['timestamp']
  50. yield parse_json_link_details(str(detail_index_path))
  51. except KeyError:
  52. print(" {lightyellow}! Failed to retrieve index from {}. The index may be corrupt.".format(detail_index_path, **ANSI))
  53. continue
  54. return ()
  55. @enforce_types
  56. def write_json_main_index(links: List[Link], out_dir: str=OUTPUT_DIR) -> None:
  57. """write the json link index to a given path"""
  58. assert isinstance(links, List), 'Links must be a list, not a generator.'
  59. assert not links or isinstance(links[0].history, dict)
  60. assert not links or isinstance(links[0].sources, list)
  61. if links and links[0].history.get('title'):
  62. assert isinstance(links[0].history['title'][0], ArchiveResult)
  63. if links and links[0].sources:
  64. assert isinstance(links[0].sources[0], str)
  65. main_index_json = {
  66. **MAIN_INDEX_HEADER,
  67. 'num_links': len(links),
  68. 'updated': datetime.now(),
  69. 'last_run_cmd': sys.argv,
  70. 'links': links,
  71. }
  72. atomic_write(os.path.join(out_dir, JSON_INDEX_FILENAME), main_index_json)
  73. ### Link Details Index
  74. @enforce_types
  75. def write_json_link_details(link: Link, out_dir: Optional[str]=None) -> None:
  76. """write a json file with some info about the link"""
  77. out_dir = out_dir or link.link_dir
  78. path = os.path.join(out_dir, JSON_INDEX_FILENAME)
  79. atomic_write(path, link._asdict(extended=True))
  80. @enforce_types
  81. def parse_json_link_details(out_dir: str) -> Optional[Link]:
  82. """load the json link index from a given directory"""
  83. existing_index = os.path.join(out_dir, JSON_INDEX_FILENAME)
  84. if os.path.exists(existing_index):
  85. with open(existing_index, 'r', encoding='utf-8') as f:
  86. try:
  87. link_json = pyjson.load(f)
  88. return Link.from_json(link_json)
  89. except pyjson.JSONDecodeError:
  90. pass
  91. return None
  92. @enforce_types
  93. def parse_json_links_details(out_dir: str) -> Iterator[Link]:
  94. """read through all the archive data folders and return the parsed links"""
  95. for entry in os.scandir(os.path.join(out_dir, ARCHIVE_DIR_NAME)):
  96. if entry.is_dir(follow_symlinks=True):
  97. if os.path.exists(os.path.join(entry.path, 'index.json')):
  98. try:
  99. link = parse_json_link_details(entry.path)
  100. except KeyError:
  101. link = None
  102. if link:
  103. yield link
  104. ### Helpers
  105. class ExtendedEncoder(pyjson.JSONEncoder):
  106. """
  107. Extended json serializer that supports serializing several model
  108. fields and objects
  109. """
  110. def default(self, obj):
  111. cls_name = obj.__class__.__name__
  112. if hasattr(obj, '_asdict'):
  113. return obj._asdict()
  114. elif isinstance(obj, bytes):
  115. return obj.decode()
  116. elif isinstance(obj, datetime):
  117. return obj.isoformat()
  118. elif isinstance(obj, Exception):
  119. return '{}: {}'.format(obj.__class__.__name__, obj)
  120. elif cls_name in ('dict_items', 'dict_keys', 'dict_values'):
  121. return tuple(obj)
  122. return pyjson.JSONEncoder.default(self, obj)
  123. @enforce_types
  124. def to_json(obj: Any, indent: Optional[int]=4, sort_keys: bool=True, cls=ExtendedEncoder) -> str:
  125. return pyjson.dumps(obj, indent=indent, sort_keys=sort_keys, cls=ExtendedEncoder)