| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351 |
- """
- Tests for the SQLite FTS5 search backend.
- Tests cover:
- 1. Search index creation
- 2. Indexing snapshots
- 3. Search queries with real test data
- 4. Flush operations
- 5. Edge cases (empty index, special characters)
- """
import os
import shutil
import sqlite3
import tempfile
from pathlib import Path
from unittest.mock import patch

import pytest
from django.test import TestCase, override_settings

from archivebox.plugins.search_backend_sqlite.search import (
    get_db_path,
    search,
    flush,
    SQLITEFTS_DB,
    FTS_TOKENIZERS,
)
class TestSqliteSearchBackend(TestCase):
    """Test SQLite FTS5 search backend.

    Every test runs against a fresh, empty ``search_index`` FTS5 table stored
    in a private temporary directory. The backend module's ``settings`` object
    is patched so that its ``DATA_DIR`` points at that directory.
    """

    def setUp(self):
        """Create a temporary data directory with an empty search index."""
        self.temp_dir = tempfile.mkdtemp()
        # Register cleanups right away instead of relying on tearDown():
        # unittest skips tearDown() when setUp() raises, which would leak the
        # directory and leave the settings patch active for later tests.
        # Cleanups run in LIFO order: the patch is stopped first, then the
        # directory is removed.
        self.addCleanup(shutil.rmtree, self.temp_dir, ignore_errors=True)
        self.db_path = Path(self.temp_dir) / SQLITEFTS_DB

        # Patch DATA_DIR as seen by the search backend module.
        self.settings_patch = patch(
            'archivebox.plugins.search_backend_sqlite.search.settings'
        )
        self.mock_settings = self.settings_patch.start()
        self.addCleanup(self.settings_patch.stop)
        self.mock_settings.DATA_DIR = self.temp_dir

        # Create FTS5 table
        self._create_index()

    def _create_index(self):
        """Create the FTS5 search index table in the temporary database."""
        conn = sqlite3.connect(str(self.db_path))
        try:
            # FTS_TOKENIZERS is a trusted module constant, not user input, so
            # interpolating it into the DDL is acceptable here.
            conn.execute(f'''
                CREATE VIRTUAL TABLE IF NOT EXISTS search_index
                USING fts5(
                    snapshot_id,
                    url,
                    title,
                    content,
                    tokenize = '{FTS_TOKENIZERS}'
                )
            ''')
            conn.commit()
        finally:
            conn.close()

    def _index_snapshot(self, snapshot_id: str, url: str, title: str, content: str):
        """Insert one row describing a snapshot into the search index."""
        conn = sqlite3.connect(str(self.db_path))
        try:
            conn.execute(
                'INSERT INTO search_index (snapshot_id, url, title, content) VALUES (?, ?, ?, ?)',
                (snapshot_id, url, title, content)
            )
            conn.commit()
        finally:
            conn.close()

    def test_get_db_path(self):
        """get_db_path should return correct path."""
        path = get_db_path()
        self.assertEqual(path, Path(self.temp_dir) / SQLITEFTS_DB)

    def test_search_empty_index(self):
        """search should return empty list for empty index."""
        results = search('nonexistent')
        self.assertEqual(results, [])

    def test_search_no_index_file(self):
        """search should return empty list when index file doesn't exist."""
        os.remove(self.db_path)
        results = search('test')
        self.assertEqual(results, [])

    def test_search_single_result(self):
        """search should find matching snapshot."""
        self._index_snapshot(
            'snap-001',
            'https://example.com/page1',
            'Example Page',
            'This is example content about testing.'
        )
        results = search('example')
        self.assertEqual(results, ['snap-001'])

    def test_search_multiple_results(self):
        """search should find all matching snapshots."""
        self._index_snapshot('snap-001', 'https://example.com/1', 'Python Tutorial', 'Learn Python programming')
        self._index_snapshot('snap-002', 'https://example.com/2', 'Python Guide', 'Advanced Python concepts')
        self._index_snapshot('snap-003', 'https://example.com/3', 'JavaScript Basics', 'Learn JavaScript')
        results = search('Python')
        # Order is not part of the contract being tested, only membership.
        self.assertCountEqual(results, ['snap-001', 'snap-002'])

    def test_search_title_match(self):
        """search should match against title."""
        self._index_snapshot('snap-001', 'https://example.com', 'Django Web Framework', 'Content here')
        results = search('Django')
        self.assertEqual(results, ['snap-001'])

    def test_search_url_match(self):
        """search should match against URL."""
        self._index_snapshot('snap-001', 'https://archivebox.io/docs', 'Title', 'Content')
        results = search('archivebox')
        self.assertEqual(results, ['snap-001'])

    def test_search_content_match(self):
        """search should match against content."""
        self._index_snapshot(
            'snap-001',
            'https://example.com',
            'Generic Title',
            'This document contains information about cryptography and security.'
        )
        results = search('cryptography')
        self.assertEqual(results, ['snap-001'])

    def test_search_case_insensitive(self):
        """search should be case insensitive."""
        self._index_snapshot('snap-001', 'https://example.com', 'Title', 'PYTHON programming')
        results = search('python')
        self.assertEqual(results, ['snap-001'])

    def test_search_stemming(self):
        """search should use porter stemmer for word stems."""
        self._index_snapshot('snap-001', 'https://example.com', 'Title', 'Programming concepts')
        # 'program' should match 'programming' with porter stemmer
        results = search('program')
        self.assertEqual(results, ['snap-001'])

    def test_search_multiple_words(self):
        """search should match documents with all words."""
        self._index_snapshot('snap-001', 'https://example.com', 'Web Development', 'Learn web development skills')
        self._index_snapshot('snap-002', 'https://example.com', 'Web Design', 'Design beautiful websites')
        results = search('web development')
        # FTS5 joins bare terms with an implicit AND, so only the document
        # containing both 'web' and 'development' may match.
        self.assertIn('snap-001', results)
        self.assertNotIn('snap-002', results)

    def test_search_phrase(self):
        """search should support phrase queries."""
        self._index_snapshot('snap-001', 'https://example.com', 'Title', 'machine learning algorithms')
        self._index_snapshot('snap-002', 'https://example.com', 'Title', 'machine algorithms learning')
        # Phrase search with quotes: the words must be adjacent and in order.
        results = search('"machine learning"')
        self.assertEqual(results, ['snap-001'])

    def test_search_distinct_results(self):
        """search should return distinct snapshot IDs."""
        # A single row whose URL, title and content all match the query must
        # still be reported exactly once.
        self._index_snapshot('snap-001', 'https://python.org', 'Python', 'Python programming language')
        results = search('Python')
        self.assertEqual(results, ['snap-001'])

    def test_flush_single(self):
        """flush should remove snapshot from index."""
        self._index_snapshot('snap-001', 'https://example.com', 'Title', 'Content')
        self._index_snapshot('snap-002', 'https://example.com', 'Title', 'Content')
        flush(['snap-001'])
        results = search('Content')
        self.assertEqual(results, ['snap-002'])

    def test_flush_multiple(self):
        """flush should remove multiple snapshots."""
        self._index_snapshot('snap-001', 'https://example.com', 'Title', 'Test')
        self._index_snapshot('snap-002', 'https://example.com', 'Title', 'Test')
        self._index_snapshot('snap-003', 'https://example.com', 'Title', 'Test')
        flush(['snap-001', 'snap-003'])
        results = search('Test')
        self.assertEqual(results, ['snap-002'])

    def test_flush_nonexistent(self):
        """flush should not raise for nonexistent snapshots."""
        # Should not raise
        flush(['nonexistent-snap'])

    def test_flush_no_index(self):
        """flush should not raise when index doesn't exist."""
        os.remove(self.db_path)
        # Should not raise
        flush(['snap-001'])

    def test_search_special_characters(self):
        """search should handle special characters in queries."""
        self._index_snapshot('snap-001', 'https://example.com', 'C++ Programming', 'Learn C++ basics')
        results = search('C++')
        # May or may not match depending on tokenizer config
        # At minimum, should not raise
        self.assertIsInstance(results, list)

    def test_search_unicode(self):
        """search should handle unicode content."""
        # The indexed text carries real diacritics so that the test exercises
        # non-ASCII handling rather than plain ASCII matching.
        self._index_snapshot('snap-001', 'https://example.com', 'Titre Français', 'café résumé')
        self._index_snapshot('snap-002', 'https://example.com', 'Japanese', 'Hello world')
        # With remove_diacritics, the unaccented query 'cafe' should match
        # 'café' (assumes FTS_TOKENIZERS enables diacritic removal).
        results = search('cafe')
        self.assertEqual(results, ['snap-001'])
class TestSqliteSearchWithRealData(TestCase):
    """Integration tests with realistic archived content.

    Each test starts from a temporary FTS5 index pre-populated with the rows
    in ``SNAPSHOTS``. The backend module's ``settings`` object is patched so
    that its ``DATA_DIR`` points at the temporary directory.
    """

    # (snapshot_id, url, title, content) rows inserted before every test.
    SNAPSHOTS = (
        ('snap-001', 'https://github.com/ArchiveBox/ArchiveBox',
         'ArchiveBox - Self-hosted web archiving',
         'Open source self-hosted web archiving. Collects, saves, and displays various types of content.'),
        ('snap-002', 'https://docs.python.org/3/tutorial/',
         'Python 3 Tutorial',
         'An informal introduction to Python. Python is an easy to learn, powerful programming language.'),
        ('snap-003', 'https://developer.mozilla.org/docs/Web/JavaScript',
         'JavaScript - MDN Web Docs',
         'JavaScript (JS) is a lightweight, interpreted programming language with first-class functions.'),
        ('snap-004', 'https://news.ycombinator.com',
         'Hacker News',
         'Social news website focusing on computer science and entrepreneurship.'),
        ('snap-005', 'https://en.wikipedia.org/wiki/Web_archiving',
         'Web archiving - Wikipedia',
         'Web archiving is the process of collecting portions of the World Wide Web to ensure the information is preserved.'),
    )

    def setUp(self):
        """Create index with realistic test data."""
        self.temp_dir = tempfile.mkdtemp()
        # Register cleanups right away instead of relying on tearDown():
        # unittest skips tearDown() when setUp() raises, which would leak the
        # directory and leave the settings patch active for later tests.
        # Cleanups run in LIFO order: the patch is stopped first, then the
        # directory is removed.
        self.addCleanup(shutil.rmtree, self.temp_dir, ignore_errors=True)
        self.db_path = Path(self.temp_dir) / SQLITEFTS_DB

        self.settings_patch = patch(
            'archivebox.plugins.search_backend_sqlite.search.settings'
        )
        self.mock_settings = self.settings_patch.start()
        self.addCleanup(self.settings_patch.stop)
        self.mock_settings.DATA_DIR = self.temp_dir

        # Create the index and load the fixture rows in one connection.
        conn = sqlite3.connect(str(self.db_path))
        try:
            # FTS_TOKENIZERS is a trusted module constant, not user input, so
            # interpolating it into the DDL is acceptable here.
            conn.execute(f'''
                CREATE VIRTUAL TABLE IF NOT EXISTS search_index
                USING fts5(
                    snapshot_id,
                    url,
                    title,
                    content,
                    tokenize = '{FTS_TOKENIZERS}'
                )
            ''')
            conn.executemany(
                'INSERT INTO search_index (snapshot_id, url, title, content) VALUES (?, ?, ?, ?)',
                self.SNAPSHOTS
            )
            conn.commit()
        finally:
            conn.close()

    def test_search_archivebox(self):
        """Search for 'archivebox' should find relevant results."""
        results = search('archivebox')
        self.assertIn('snap-001', results)

    def test_search_programming(self):
        """Search for 'programming' should find Python and JS docs."""
        results = search('programming')
        self.assertIn('snap-002', results)
        self.assertIn('snap-003', results)

    def test_search_web_archiving(self):
        """Search for 'web archiving' should find relevant results."""
        results = search('web archiving')
        # Both ArchiveBox and Wikipedia should match
        self.assertIn('snap-001', results)
        self.assertIn('snap-005', results)

    def test_search_github(self):
        """Search for 'github' should find URL match."""
        results = search('github')
        self.assertIn('snap-001', results)

    def test_search_tutorial(self):
        """Search for 'tutorial' should find Python tutorial."""
        results = search('tutorial')
        self.assertIn('snap-002', results)

    def test_flush_and_search(self):
        """Flushing a snapshot should remove it from search results."""
        # Verify it's there first
        results = search('archivebox')
        self.assertIn('snap-001', results)
        # Flush it
        flush(['snap-001'])
        # Should no longer be found
        results = search('archivebox')
        self.assertNotIn('snap-001', results)
if __name__ == '__main__':
    # Propagate pytest's exit code so that running this file directly exits
    # non-zero when any test fails.
    raise SystemExit(pytest.main([__file__, '-v']))
|