# legacy.py
  1. """
  2. Legacy archive import utilities.
  3. These functions are used to import data from old ArchiveBox archive formats
  4. (JSON indexes, archive directory structures) into the new database.
  5. This is separate from the hooks-based parser system which handles importing
  6. new URLs from bookmark files, RSS feeds, etc.
  7. """
  8. __package__ = 'archivebox.misc'
  9. import os
  10. import json
  11. from pathlib import Path
  12. from datetime import datetime, timezone
  13. from typing import Iterator, TypedDict, List
  14. class SnapshotDict(TypedDict, total=False):
  15. """
  16. Dictionary type representing a snapshot/link, compatible with Snapshot model fields.
  17. """
  18. url: str # Required: the URL to archive
  19. timestamp: str # Optional: unix timestamp string
  20. title: str # Optional: page title
  21. tags: str # Optional: comma-separated tags string
  22. sources: List[str] # Optional: list of source file paths
  23. def parse_json_main_index(out_dir: Path) -> Iterator[SnapshotDict]:
  24. """
  25. Parse links from the main JSON index file (archive/index.json).
  26. This is used to recover links from old archive formats.
  27. """
  28. from archivebox.config import CONSTANTS
  29. index_path = out_dir / CONSTANTS.JSON_INDEX_FILENAME
  30. if not index_path.exists():
  31. return
  32. try:
  33. with open(index_path, 'r', encoding='utf-8') as f:
  34. data = json.load(f)
  35. links = data.get('links', [])
  36. for link in links:
  37. yield {
  38. 'url': link.get('url', ''),
  39. 'timestamp': link.get('timestamp', str(datetime.now(timezone.utc).timestamp())),
  40. 'title': link.get('title'),
  41. 'tags': link.get('tags', ''),
  42. }
  43. except (json.JSONDecodeError, KeyError, TypeError):
  44. return
  45. def parse_json_links_details(out_dir: Path) -> Iterator[SnapshotDict]:
  46. """
  47. Parse links from individual snapshot index.jsonl/index.json files in archive directories.
  48. Walks through archive/*/index.jsonl and archive/*/index.json files to discover orphaned snapshots.
  49. Prefers index.jsonl (new format) over index.json (legacy format).
  50. """
  51. from archivebox.config import CONSTANTS
  52. archive_dir = out_dir / CONSTANTS.ARCHIVE_DIR_NAME
  53. if not archive_dir.exists():
  54. return
  55. for entry in os.scandir(archive_dir):
  56. if not entry.is_dir():
  57. continue
  58. # Try index.jsonl first (new format)
  59. jsonl_file = Path(entry.path) / CONSTANTS.JSONL_INDEX_FILENAME
  60. json_file = Path(entry.path) / CONSTANTS.JSON_INDEX_FILENAME
  61. link = None
  62. if jsonl_file.exists():
  63. try:
  64. with open(jsonl_file, 'r', encoding='utf-8') as f:
  65. for line in f:
  66. line = line.strip()
  67. if line.startswith('{'):
  68. record = json.loads(line)
  69. if record.get('type') == 'Snapshot':
  70. link = record
  71. break
  72. except (json.JSONDecodeError, KeyError, TypeError):
  73. pass
  74. if link is None and json_file.exists():
  75. try:
  76. with open(json_file, 'r', encoding='utf-8') as f:
  77. link = json.load(f)
  78. except (json.JSONDecodeError, KeyError, TypeError):
  79. pass
  80. if link:
  81. yield {
  82. 'url': link.get('url', ''),
  83. 'timestamp': link.get('timestamp', entry.name),
  84. 'title': link.get('title'),
  85. 'tags': link.get('tags', ''),
  86. }