| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108 |
- """
- Legacy archive import utilities.
- These functions are used to import data from old ArchiveBox archive formats
- (JSON indexes, archive directory structures) into the new database.
- This is separate from the hooks-based parser system which handles importing
- new URLs from bookmark files, RSS feeds, etc.
- """
- __package__ = 'archivebox.misc'
- import os
- import json
- from pathlib import Path
- from datetime import datetime, timezone
- from typing import Iterator, TypedDict, List
class SnapshotDict(TypedDict, total=False):
    """
    Dictionary type representing a snapshot/link, compatible with Snapshot model fields.

    Declared with ``total=False`` so every key is optional at the type level,
    but producers in this module always populate at least ``url``.
    """
    url: str            # Required by convention: the URL to archive
    timestamp: str      # Optional: unix timestamp string (e.g. "1600000000.0")
    title: str          # Optional: page title (producers may yield None when absent)
    tags: str           # Optional: comma-separated tags string
    sources: List[str]  # Optional: list of source file paths
def parse_json_main_index(out_dir: Path) -> 'Iterator[SnapshotDict]':
    """
    Parse links from the main JSON index file (archive/index.json).

    This is used to recover links from old archive formats.

    Args:
        out_dir: Archive output directory expected to contain the JSON index.

    Yields:
        One SnapshotDict per well-formed entry under the top-level "links"
        key. Yields nothing if the index is missing, unreadable, or not
        valid JSON. Malformed (non-dict) entries are skipped individually
        instead of aborting the whole iteration.
    """
    from archivebox.config import CONSTANTS

    index_path = out_dir / CONSTANTS.JSON_INDEX_FILENAME
    if not index_path.exists():
        return

    # Keep the try narrow: only reading/parsing can fail here. OSError covers
    # permission errors and the file disappearing between exists() and open().
    try:
        with open(index_path, 'r', encoding='utf-8') as f:
            data = json.load(f)
    except (OSError, json.JSONDecodeError):
        return

    if not isinstance(data, dict):
        return

    for link in data.get('links', []):
        # A non-dict entry previously raised AttributeError on .get() (which
        # was not caught) and crashed the generator; skip it and keep going.
        if not isinstance(link, dict):
            continue
        yield {
            'url': link.get('url', ''),
            # Fall back to "now" so recovered links always have a timestamp.
            'timestamp': link.get('timestamp', str(datetime.now(timezone.utc).timestamp())),
            'title': link.get('title'),
            'tags': link.get('tags', ''),
        }
def _load_snapshot_record(snapshot_dir: Path, constants) -> 'dict | None':
    """
    Load the snapshot record dict from one archive/<timestamp>/ directory.

    Prefers index.jsonl (new format: one JSON object per line, the record
    with type == "Snapshot") over index.json (legacy format: a single JSON
    object). Returns None if neither file yields a usable dict.
    """
    jsonl_file = snapshot_dir / constants.JSONL_INDEX_FILENAME
    if jsonl_file.exists():
        try:
            with open(jsonl_file, 'r', encoding='utf-8') as f:
                for raw_line in f:
                    line = raw_line.strip()
                    # Skip blank lines and anything that isn't a JSON object.
                    if not line.startswith('{'):
                        continue
                    record = json.loads(line)
                    if isinstance(record, dict) and record.get('type') == 'Snapshot':
                        return record
        except (OSError, json.JSONDecodeError):
            pass

    json_file = snapshot_dir / constants.JSON_INDEX_FILENAME
    if json_file.exists():
        try:
            with open(json_file, 'r', encoding='utf-8') as f:
                record = json.load(f)
            # Guard against valid-but-wrong JSON (e.g. a list): previously a
            # truthy non-dict here crashed the caller with AttributeError.
            if isinstance(record, dict):
                return record
        except (OSError, json.JSONDecodeError):
            pass

    return None


def parse_json_links_details(out_dir: Path) -> 'Iterator[SnapshotDict]':
    """
    Parse links from individual snapshot index.jsonl/index.json files in archive directories.

    Walks through archive/*/index.jsonl and archive/*/index.json files to
    discover orphaned snapshots. Prefers index.jsonl (new format) over
    index.json (legacy format).

    Args:
        out_dir: Archive output directory containing the archive/ subdirectory.

    Yields:
        One SnapshotDict per snapshot directory with a readable record.
        Directories with missing or unparseable index files are skipped.
    """
    from archivebox.config import CONSTANTS

    archive_dir = out_dir / CONSTANTS.ARCHIVE_DIR_NAME
    if not archive_dir.exists():
        return

    for entry in os.scandir(archive_dir):
        if not entry.is_dir():
            continue
        link = _load_snapshot_record(Path(entry.path), CONSTANTS)
        if link:
            yield {
                'url': link.get('url', ''),
                # The directory name is the timestamp in old archive layouts
                # (archive/<timestamp>/), so it is a safe fallback.
                'timestamp': link.get('timestamp', entry.name),
                'title': link.get('title'),
                'tags': link.get('tags', ''),
            }
|