#!/usr/bin/env python3
# Source: software / archival.ArchiveBox
# (mirror of https://github.com/ArchiveBox/ArchiveBox.git)
"""Unit tests for parse_jsonl_urls extractor."""

import json
import subprocess
import sys
from pathlib import Path

import pytest

# Directory that holds this test module.
_THIS_DIR = Path(__file__).parent

# The plugin directory is the parent of the directory containing this file.
PLUGIN_DIR = _THIS_DIR.parent

# First file in PLUGIN_DIR matching the extractor hook name pattern, or None
# when nothing matches.
SCRIPT_PATH = next(
    PLUGIN_DIR.glob('on_Snapshot__*_parse_jsonl_urls.*'),
    None,
)


class TestParseJsonlUrls:
    """Exercise the parse_jsonl_urls extractor through its command line.

    Each test writes a small JSONL fixture into ``tmp_path`` (or points at a
    missing file), runs the extractor script as a subprocess with a
    ``file://`` URL, and inspects the exit code and captured stdout/stderr.
    """

    # Substring used to pick Snapshot records out of the stdout lines.
    _SNAPSHOT_MARKER = '"type": "Snapshot"'

    @staticmethod
    def _run_extractor(tmp_path, url):
        """Run the extractor script with ``--url url`` from inside *tmp_path*.

        Args:
            tmp_path: Directory used as the subprocess working directory.
            url: Value passed to the script's ``--url`` option.

        Returns:
            The ``subprocess.CompletedProcess`` with stdout and stderr
            captured as text.
        """
        # SCRIPT_PATH is None when the glob matched nothing; without this
        # guard the subprocess would be asked to run a file literally named
        # "None" and every test would fail with a misleading error.
        assert SCRIPT_PATH is not None, (
            f'parse_jsonl_urls extractor script not found in {PLUGIN_DIR}'
        )
        return subprocess.run(
            [sys.executable, str(SCRIPT_PATH), '--url', url],
            cwd=tmp_path,
            capture_output=True,
            text=True,
        )

    @classmethod
    def _run_on_file(cls, tmp_path, filename, content):
        """Write *content* to ``tmp_path / filename`` and run the extractor on it.

        Returns:
            The ``subprocess.CompletedProcess`` of the extractor run.
        """
        input_file = tmp_path / filename
        input_file.write_text(content)
        return cls._run_extractor(tmp_path, f'file://{input_file}')

    @classmethod
    def _snapshot_lines(cls, stdout):
        """Return the lines of *stdout* that contain the Snapshot marker."""
        return [
            line
            for line in stdout.strip().split('\n')
            if cls._SNAPSHOT_MARKER in line
        ]

    @classmethod
    def _first_snapshot(cls, result):
        """Parse and return the first Snapshot record found in ``result.stdout``.

        Fails with the captured output in the message when no Snapshot line
        is present, instead of raising a bare ``IndexError``.
        """
        lines = cls._snapshot_lines(result.stdout)
        assert lines, (
            f'no Snapshot record in stdout: {result.stdout!r} '
            f'(stderr: {result.stderr!r})'
        )
        return json.loads(lines[0])

    def test_extracts_urls_from_jsonl(self, tmp_path):
        """Test extracting URLs and titles from a JSONL bookmark file."""
        result = self._run_on_file(
            tmp_path,
            'bookmarks.jsonl',
            '{"url": "https://example.com", "title": "Example"}\n'
            '{"url": "https://foo.bar/page", "title": "Foo Bar"}\n'
            '{"url": "https://test.org", "title": "Test Org"}\n',
        )

        assert result.returncode == 0, result.stderr
        assert 'urls.jsonl' in result.stderr or 'urls.jsonl' in result.stdout

        # Output goes to stdout (JSONL)
        lines = self._snapshot_lines(result.stdout)
        assert len(lines) == 3

        entries = [json.loads(line) for line in lines]
        urls = {entry['url'] for entry in entries}
        titles = {entry.get('title') for entry in entries}

        assert 'https://example.com' in urls
        assert 'https://foo.bar/page' in urls
        assert 'https://test.org' in urls
        assert 'Example' in titles
        assert 'Foo Bar' in titles
        assert 'Test Org' in titles

    def test_supports_href_field(self, tmp_path):
        """Test that 'href' field is recognized as URL."""
        result = self._run_on_file(
            tmp_path,
            'bookmarks.jsonl',
            '{"href": "https://example.com", "title": "Test"}\n',
        )

        assert result.returncode == 0, result.stderr
        # Output goes to stdout (JSONL)
        entry = self._first_snapshot(result)
        assert entry['url'] == 'https://example.com'

    def test_supports_description_as_title(self, tmp_path):
        """Test that 'description' field is used as title fallback."""
        result = self._run_on_file(
            tmp_path,
            'bookmarks.jsonl',
            '{"url": "https://example.com", "description": "A description"}\n',
        )

        assert result.returncode == 0, result.stderr
        # Output goes to stdout (JSONL)
        entry = self._first_snapshot(result)
        assert entry['title'] == 'A description'

    def test_parses_various_timestamp_formats(self, tmp_path):
        """Test that a numeric 'timestamp' field yields a 'bookmarked_at' key.

        Only the ``timestamp`` field name is exercised here; the value of
        ``bookmarked_at`` is not checked.
        """
        result = self._run_on_file(
            tmp_path,
            'bookmarks.jsonl',
            '{"url": "https://example.com", "timestamp": 1609459200000000}\n',
        )

        assert result.returncode == 0, result.stderr
        # Output goes to stdout (JSONL)
        entry = self._first_snapshot(result)
        # Parser converts timestamp to bookmarked_at
        assert 'bookmarked_at' in entry

    def test_parses_tags_as_string(self, tmp_path):
        """Test parsing tags as comma-separated string."""
        result = self._run_on_file(
            tmp_path,
            'bookmarks.jsonl',
            '{"url": "https://example.com", "tags": "tech,news,reading"}\n',
        )

        assert result.returncode == 0, result.stderr
        # Output goes to stdout (JSONL)
        # NOTE(review): deliberately loose check -- the exact shape of tag
        # output (presumably separate Tag records) is not pinned down here.
        content = result.stdout
        assert 'tech' in content or 'news' in content or 'Tag' in content

    def test_parses_tags_as_list(self, tmp_path):
        """Test parsing tags as JSON array."""
        result = self._run_on_file(
            tmp_path,
            'bookmarks.jsonl',
            '{"url": "https://example.com", "tags": ["tech", "news"]}\n',
        )

        assert result.returncode == 0, result.stderr
        # Output goes to stdout (JSONL)
        # NOTE(review): deliberately loose check -- the exact shape of tag
        # output (presumably separate Tag records) is not pinned down here.
        content = result.stdout
        assert 'tech' in content or 'news' in content or 'Tag' in content

    def test_skips_malformed_lines(self, tmp_path):
        """Test that malformed JSON lines are skipped."""
        result = self._run_on_file(
            tmp_path,
            'bookmarks.jsonl',
            '{"url": "https://valid.com"}\n'
            'not valid json\n'
            '{"url": "https://also-valid.com"}\n',
        )

        assert result.returncode == 0, result.stderr
        # Output goes to stdout (JSONL)
        assert len(self._snapshot_lines(result.stdout)) == 2

    def test_skips_entries_without_url(self, tmp_path):
        """Test that entries without URL field are skipped."""
        result = self._run_on_file(
            tmp_path,
            'bookmarks.jsonl',
            '{"url": "https://valid.com"}\n'
            '{"title": "No URL here"}\n'
            '{"url": "https://also-valid.com"}\n',
        )

        assert result.returncode == 0, result.stderr
        # Output goes to stdout (JSONL)
        assert len(self._snapshot_lines(result.stdout)) == 2

    def test_skips_when_no_urls_found(self, tmp_path):
        """Test that script returns skipped status when no URLs found."""
        result = self._run_on_file(
            tmp_path,
            'empty.jsonl',
            '{"title": "No URL"}\n',
        )

        assert result.returncode == 0, result.stderr
        assert 'urls.jsonl' in result.stderr
        assert '"status": "skipped"' in result.stdout

    def test_exits_1_when_file_not_found(self, tmp_path):
        """Test that script exits with code 1 when file doesn't exist."""
        result = self._run_extractor(
            tmp_path,
            'file:///nonexistent/bookmarks.jsonl',
        )

        assert result.returncode == 1
        assert 'Failed to fetch' in result.stderr

    def test_handles_html_entities(self, tmp_path):
        """Test that HTML entities in URLs and titles are decoded."""
        result = self._run_on_file(
            tmp_path,
            'bookmarks.jsonl',
            '{"url": "https://example.com/page?a=1&amp;b=2", "title": "Test &amp; Title"}\n',
        )

        assert result.returncode == 0, result.stderr
        # Output goes to stdout (JSONL)
        entry = self._first_snapshot(result)
        assert entry['url'] == 'https://example.com/page?a=1&b=2'
        assert entry['title'] == 'Test & Title'

    def test_skips_empty_lines(self, tmp_path):
        """Test that empty and whitespace-only lines are skipped."""
        result = self._run_on_file(
            tmp_path,
            'bookmarks.jsonl',
            '{"url": "https://example.com"}\n'
            '\n'
            '   \n'
            '{"url": "https://other.com"}\n',
        )

        assert result.returncode == 0, result.stderr
        # Output goes to stdout (JSONL)
        assert len(self._snapshot_lines(result.stdout)) == 2

    def test_output_includes_required_fields(self, tmp_path):
        """Test that output includes the 'url', 'type' and 'plugin' fields."""
        result = self._run_on_file(
            tmp_path,
            'bookmarks.jsonl',
            '{"url": "https://example.com"}\n',
        )

        assert result.returncode == 0, result.stderr
        # Output goes to stdout (JSONL)
        entry = self._first_snapshot(result)
        assert entry['url'] == 'https://example.com'
        assert 'type' in entry
        assert 'plugin' in entry

if __name__ == '__main__':
    # Allow running this file directly. Propagate pytest's exit code so that
    # a failing run does not exit with status 0.
    raise SystemExit(pytest.main([__file__, '-v']))