json.py

__package__ = 'archivebox.legacy.storage'

import os
import sys
import json

from datetime import datetime
from typing import List, Optional, Iterator

from ..schema import Link, ArchiveResult
from ..config import (
    VERSION,
    OUTPUT_DIR,
    FOOTER_INFO,
    GIT_SHA,
    DEPENDENCIES,
    JSON_INDEX_FILENAME,
    ARCHIVE_DIR_NAME,
)
from ..util import (
    enforce_types,
    atomic_write,
)

MAIN_INDEX_HEADER = {
    'info': 'This is an index of site data archived by ArchiveBox: The self-hosted web archive.',
    'schema': 'archivebox.legacy.storage.json',
    'copyright_info': FOOTER_INFO,
    'meta': {
        'project': 'ArchiveBox',
        'version': VERSION,
        'git_sha': GIT_SHA,
        'website': 'https://ArchiveBox.io',
        'docs': 'https://github.com/pirate/ArchiveBox/wiki',
        'source': 'https://github.com/pirate/ArchiveBox',
        'issues': 'https://github.com/pirate/ArchiveBox/issues',
        'dependencies': DEPENDENCIES,
    },
}
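
# MAIN_INDEX_HEADER is merged into every main index written by
# write_json_main_index() below, so each index.json carries its own
# schema/version metadata alongside the archived links.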

### Main Links Index

@enforce_types
def parse_json_main_index(out_dir: str=OUTPUT_DIR) -> Iterator[Link]:
    """parse an archive index json file and return the list of links"""

    index_path = os.path.join(out_dir, JSON_INDEX_FILENAME)
    if os.path.exists(index_path):
        with open(index_path, 'r', encoding='utf-8') as f:
            links = json.load(f)['links']
            for link_json in links:
                yield Link.from_json(link_json)

    return ()

@enforce_types
def write_json_main_index(links: List[Link], out_dir: str=OUTPUT_DIR) -> None:
    """write the json link index to a given path"""

    # isinstance() needs the builtin list here, not typing.List
    assert isinstance(links, list), 'Links must be a list, not a generator.'
    assert not links or isinstance(links[0].history, dict)
    assert not links or isinstance(links[0].sources, list)

    if links and links[0].history.get('title'):
        assert isinstance(links[0].history['title'][0], ArchiveResult)

    if links and links[0].sources:
        assert isinstance(links[0].sources[0], str)

    main_index_json = {
        **MAIN_INDEX_HEADER,
        'num_links': len(links),
        'updated': datetime.now(),
        'last_run_cmd': sys.argv,
        'links': links,
    }
    atomic_write(main_index_json, os.path.join(out_dir, JSON_INDEX_FILENAME))
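
# Minimal round-trip sketch (assumes OUTPUT_DIR points at an initialized
# archive; parse_json_main_index() yields nothing if no index exists yet):
#
#   links = list(parse_json_main_index())   # read the existing index, if any
#   write_json_main_index(links)            # atomically rewrite it in place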

### Link Details Index

@enforce_types
def write_json_link_details(link: Link, out_dir: Optional[str]=None) -> None:
    """write a json file with some info about the link"""

    out_dir = out_dir or link.link_dir
    path = os.path.join(out_dir, JSON_INDEX_FILENAME)
    atomic_write(link._asdict(extended=True), path)
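
# Note: extended=True serializes the Link with its derived fields included
# (see Link._asdict() in ..schema for exactly what gets written per link).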

@enforce_types
def parse_json_link_details(out_dir: str) -> Optional[Link]:
    """load the json link index from a given directory"""

    existing_index = os.path.join(out_dir, JSON_INDEX_FILENAME)
    if os.path.exists(existing_index):
        with open(existing_index, 'r', encoding='utf-8') as f:
            try:
                link_json = json.load(f)
                return Link.from_json(link_json)
            except json.JSONDecodeError:
                pass
    return None

@enforce_types
def parse_json_links_details(out_dir: str) -> Iterator[Link]:
    """read through all the archive data folders and return the parsed links"""

    for entry in os.scandir(os.path.join(out_dir, ARCHIVE_DIR_NAME)):
        if entry.is_dir(follow_symlinks=True):
            if os.path.exists(os.path.join(entry.path, JSON_INDEX_FILENAME)):
                link = parse_json_link_details(entry.path)
                if link is not None:   # skip folders whose index.json failed to parse
                    yield link
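
# Example: rebuild the main index from the per-link detail folders
# (a sketch, assuming out_dir is the top-level archive data directory):
#
#   links = list(parse_json_links_details(out_dir=OUTPUT_DIR))
#   write_json_main_index(links, out_dir=OUTPUT_DIR)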