# archivebox/index/json.py — read/write helpers for the JSON link indexes.
  1. __package__ = 'archivebox.index'
  2. import os
  3. import sys
  4. import json
  5. from datetime import datetime
  6. from typing import List, Optional, Iterator
  7. from .schema import Link, ArchiveResult
  8. from ..util import enforce_types, atomic_write
  9. from ..config import (
  10. VERSION,
  11. OUTPUT_DIR,
  12. FOOTER_INFO,
  13. GIT_SHA,
  14. DEPENDENCIES,
  15. JSON_INDEX_FILENAME,
  16. ARCHIVE_DIR_NAME,
  17. )
# Static header merged into every main index JSON file.  The 'meta' section
# records the ArchiveBox version/build info (VERSION, GIT_SHA, DEPENDENCIES
# come from ..config) so an index on disk can be traced back to the code
# that wrote it.
MAIN_INDEX_HEADER = {
    'info': 'This is an index of site data archived by ArchiveBox: The self-hosted web archive.',
    'schema': 'archivebox.index.json',
    'copyright_info': FOOTER_INFO,
    'meta': {
        'project': 'ArchiveBox',
        'version': VERSION,
        'git_sha': GIT_SHA,
        'website': 'https://ArchiveBox.io',
        'docs': 'https://github.com/pirate/ArchiveBox/wiki',
        'source': 'https://github.com/pirate/ArchiveBox',
        'issues': 'https://github.com/pirate/ArchiveBox/issues',
        'dependencies': DEPENDENCIES,
    },
}
  33. ### Main Links Index
  34. @enforce_types
  35. def parse_json_main_index(out_dir: str=OUTPUT_DIR) -> Iterator[Link]:
  36. """parse an archive index json file and return the list of links"""
  37. index_path = os.path.join(out_dir, JSON_INDEX_FILENAME)
  38. if os.path.exists(index_path):
  39. with open(index_path, 'r', encoding='utf-8') as f:
  40. links = json.load(f)['links']
  41. for link_json in links:
  42. yield Link.from_json(link_json)
  43. return ()
  44. @enforce_types
  45. def write_json_main_index(links: List[Link], out_dir: str=OUTPUT_DIR) -> None:
  46. """write the json link index to a given path"""
  47. assert isinstance(links, List), 'Links must be a list, not a generator.'
  48. assert not links or isinstance(links[0].history, dict)
  49. assert not links or isinstance(links[0].sources, list)
  50. if links and links[0].history.get('title'):
  51. assert isinstance(links[0].history['title'][0], ArchiveResult)
  52. if links and links[0].sources:
  53. assert isinstance(links[0].sources[0], str)
  54. main_index_json = {
  55. **MAIN_INDEX_HEADER,
  56. 'num_links': len(links),
  57. 'updated': datetime.now(),
  58. 'last_run_cmd': sys.argv,
  59. 'links': links,
  60. }
  61. atomic_write(main_index_json, os.path.join(out_dir, JSON_INDEX_FILENAME))
  62. ### Link Details Index
  63. @enforce_types
  64. def write_json_link_details(link: Link, out_dir: Optional[str]=None) -> None:
  65. """write a json file with some info about the link"""
  66. out_dir = out_dir or link.link_dir
  67. path = os.path.join(out_dir, JSON_INDEX_FILENAME)
  68. atomic_write(link._asdict(extended=True), path)
  69. @enforce_types
  70. def parse_json_link_details(out_dir: str) -> Optional[Link]:
  71. """load the json link index from a given directory"""
  72. existing_index = os.path.join(out_dir, JSON_INDEX_FILENAME)
  73. if os.path.exists(existing_index):
  74. with open(existing_index, 'r', encoding='utf-8') as f:
  75. try:
  76. link_json = json.load(f)
  77. return Link.from_json(link_json)
  78. except json.JSONDecodeError:
  79. pass
  80. return None
  81. @enforce_types
  82. def parse_json_links_details(out_dir: str) -> Iterator[Link]:
  83. """read through all the archive data folders and return the parsed links"""
  84. for entry in os.scandir(os.path.join(out_dir, ARCHIVE_DIR_NAME)):
  85. if entry.is_dir(follow_symlinks=True):
  86. if os.path.exists(os.path.join(entry.path, 'index.json')):
  87. link = parse_json_link_details(entry.path)
  88. if link:
  89. yield link