# legacy.py
  1. """
  2. Legacy archive import utilities.
  3. These functions are used to import data from old ArchiveBox archive formats
  4. (JSON indexes, archive directory structures) into the new database.
  5. This is separate from the hooks-based parser system which handles importing
  6. new URLs from bookmark files, RSS feeds, etc.
  7. """
  8. __package__ = 'archivebox.misc'
  9. import os
  10. import json
  11. from pathlib import Path
  12. from datetime import datetime, timezone
  13. from typing import Iterator, TypedDict, List
  14. class SnapshotDict(TypedDict, total=False):
  15. """
  16. Dictionary type representing a snapshot/link, compatible with Snapshot model fields.
  17. """
  18. url: str # Required: the URL to archive
  19. timestamp: str # Optional: unix timestamp string
  20. title: str # Optional: page title
  21. tags: str # Optional: comma-separated tags string
  22. sources: List[str] # Optional: list of source file paths
  23. def parse_json_main_index(out_dir: Path) -> Iterator[SnapshotDict]:
  24. """
  25. Parse links from the main JSON index file (archive/index.json).
  26. This is used to recover links from old archive formats.
  27. """
  28. from archivebox.config import CONSTANTS
  29. index_path = out_dir / CONSTANTS.JSON_INDEX_FILENAME
  30. if not index_path.exists():
  31. return
  32. try:
  33. with open(index_path, 'r', encoding='utf-8') as f:
  34. data = json.load(f)
  35. links = data.get('links', [])
  36. for link in links:
  37. yield {
  38. 'url': link.get('url', ''),
  39. 'timestamp': link.get('timestamp', str(datetime.now(timezone.utc).timestamp())),
  40. 'title': link.get('title'),
  41. 'tags': link.get('tags', ''),
  42. }
  43. except (json.JSONDecodeError, KeyError, TypeError):
  44. return
  45. def parse_json_links_details(out_dir: Path) -> Iterator[SnapshotDict]:
  46. """
  47. Parse links from individual snapshot index.jsonl/index.json files in archive directories.
  48. Walks through archive/*/index.jsonl and archive/*/index.json files to discover orphaned snapshots.
  49. Prefers index.jsonl (new format) over index.json (legacy format).
  50. """
  51. from archivebox.config import CONSTANTS
  52. archive_dir = out_dir / CONSTANTS.ARCHIVE_DIR_NAME
  53. if not archive_dir.exists():
  54. return
  55. for entry in os.scandir(archive_dir):
  56. if not entry.is_dir():
  57. continue
  58. # Try index.jsonl first (new format)
  59. jsonl_file = Path(entry.path) / CONSTANTS.JSONL_INDEX_FILENAME
  60. json_file = Path(entry.path) / CONSTANTS.JSON_INDEX_FILENAME
  61. link = None
  62. if jsonl_file.exists():
  63. try:
  64. with open(jsonl_file, 'r', encoding='utf-8') as f:
  65. for line in f:
  66. line = line.strip()
  67. if line.startswith('{'):
  68. record = json.loads(line)
  69. if record.get('type') == 'Snapshot':
  70. link = record
  71. break
  72. except (json.JSONDecodeError, KeyError, TypeError):
  73. pass
  74. if link is None and json_file.exists():
  75. try:
  76. with open(json_file, 'r', encoding='utf-8') as f:
  77. link = json.load(f)
  78. except (json.JSONDecodeError, KeyError, TypeError):
  79. pass
  80. if link:
  81. yield {
  82. 'url': link.get('url', ''),
  83. 'timestamp': link.get('timestamp', entry.name),
  84. 'title': link.get('title'),
  85. 'tags': link.get('tags', ''),
  86. }