# archivebox/index/json.py
  1. __package__ = 'archivebox.index'
  2. import os
  3. import sys
  4. import json as pyjson
  5. from pathlib import Path
  6. from datetime import datetime, timezone
  7. from typing import List, Optional, Iterator, Any, Union
  8. import archivebox
  9. from .schema import Link
  10. from ..system import atomic_write
  11. from ..util import enforce_types
  12. @enforce_types
  13. def generate_json_index_from_links(links: List[Link], with_headers: bool):
  14. from django.conf import settings
  15. from plugins_sys.config.apps import SERVER_CONFIG
  16. MAIN_INDEX_HEADER = {
  17. 'info': 'This is an index of site data archived by ArchiveBox: The self-hosted web archive.',
  18. 'schema': 'archivebox.index.json',
  19. 'copyright_info': SERVER_CONFIG.FOOTER_INFO,
  20. 'meta': {
  21. 'project': 'ArchiveBox',
  22. 'version': archivebox.VERSION,
  23. 'git_sha': archivebox.VERSION, # not used anymore, but kept for backwards compatibility
  24. 'website': 'https://ArchiveBox.io',
  25. 'docs': 'https://github.com/ArchiveBox/ArchiveBox/wiki',
  26. 'source': 'https://github.com/ArchiveBox/ArchiveBox',
  27. 'issues': 'https://github.com/ArchiveBox/ArchiveBox/issues',
  28. 'dependencies': settings.BINARIES.to_dict(),
  29. },
  30. }
  31. if with_headers:
  32. output = {
  33. **MAIN_INDEX_HEADER,
  34. 'num_links': len(links),
  35. 'updated': datetime.now(timezone.utc),
  36. 'last_run_cmd': sys.argv,
  37. 'links': links,
  38. }
  39. else:
  40. output = links
  41. return to_json(output, indent=4, sort_keys=True)
  42. @enforce_types
  43. def parse_json_main_index(out_dir: Path=archivebox.DATA_DIR) -> Iterator[Link]:
  44. """parse an archive index json file and return the list of links"""
  45. from plugins_sys.config.constants import CONSTANTS
  46. index_path = Path(out_dir) / CONSTANTS.JSON_INDEX_FILENAME
  47. if index_path.exists():
  48. with open(index_path, 'r', encoding='utf-8') as f:
  49. try:
  50. links = pyjson.load(f)['links']
  51. if links:
  52. Link.from_json(links[0])
  53. except Exception as err:
  54. print(" {lightyellow}! Found an index.json in the project root but couldn't load links from it: {} {}".format(
  55. err.__class__.__name__,
  56. err,
  57. **ANSI,
  58. ))
  59. return ()
  60. for link_json in links:
  61. try:
  62. yield Link.from_json(link_json)
  63. except KeyError:
  64. try:
  65. detail_index_path = CONSTANTS.ARCHIVE_DIR / link_json['timestamp']
  66. yield parse_json_link_details(str(detail_index_path))
  67. except KeyError:
  68. # as a last effort, try to guess the missing values out of existing ones
  69. try:
  70. yield Link.from_json(link_json, guess=True)
  71. except KeyError:
  72. # print(" {lightyellow}! Failed to load the index.json from {}".format(detail_index_path, **ANSI))
  73. continue
  74. return ()
  75. ### Link Details Index
  76. @enforce_types
  77. def write_json_link_details(link: Link, out_dir: Optional[str]=None) -> None:
  78. """write a json file with some info about the link"""
  79. from plugins_sys.config.constants import CONSTANTS
  80. out_dir = out_dir or link.link_dir
  81. path = Path(out_dir) / CONSTANTS.JSON_INDEX_FILENAME
  82. atomic_write(str(path), link._asdict(extended=True))
  83. @enforce_types
  84. def parse_json_link_details(out_dir: Union[Path, str], guess: bool=False) -> Optional[Link]:
  85. """load the json link index from a given directory"""
  86. from plugins_sys.config.constants import CONSTANTS
  87. existing_index = Path(out_dir) / CONSTANTS.JSON_INDEX_FILENAME
  88. if existing_index.exists():
  89. with open(existing_index, 'r', encoding='utf-8') as f:
  90. try:
  91. link_json = pyjson.load(f)
  92. return Link.from_json(link_json, guess)
  93. except pyjson.JSONDecodeError:
  94. pass
  95. return None
@enforce_types
def parse_json_links_details(out_dir: Union[Path, str]) -> Iterator[Link]:
    """read through all the archive data folders and return the parsed links"""
    from plugins_sys.config.constants import CONSTANTS

    # NOTE(review): out_dir is accepted but never used -- the scan always walks
    # CONSTANTS.ARCHIVE_DIR.  Confirm whether any caller relies on passing a
    # custom directory here before changing it.
    for entry in os.scandir(CONSTANTS.ARCHIVE_DIR):
        # follow symlinked snapshot folders too
        if entry.is_dir(follow_symlinks=True):
            if (Path(entry.path) / 'index.json').exists():
                try:
                    link = parse_json_link_details(entry.path)
                except KeyError:
                    # a malformed detail index is skipped, not fatal
                    link = None
                if link:
                    yield link
  109. ### Helpers
  110. class ExtendedEncoder(pyjson.JSONEncoder):
  111. """
  112. Extended json serializer that supports serializing several model
  113. fields and objects
  114. """
  115. def default(self, obj):
  116. cls_name = obj.__class__.__name__
  117. if hasattr(obj, '_asdict'):
  118. return obj._asdict()
  119. elif isinstance(obj, bytes):
  120. return obj.decode()
  121. elif isinstance(obj, datetime):
  122. return obj.isoformat()
  123. elif isinstance(obj, Exception):
  124. return '{}: {}'.format(obj.__class__.__name__, obj)
  125. elif cls_name in ('dict_items', 'dict_keys', 'dict_values'):
  126. return tuple(obj)
  127. return pyjson.JSONEncoder.default(self, obj)
  128. @enforce_types
  129. def to_json(obj: Any, indent: Optional[int]=4, sort_keys: bool=True, cls=ExtendedEncoder) -> str:
  130. return pyjson.dumps(obj, indent=indent, sort_keys=sort_keys, cls=ExtendedEncoder)