test_ripgrep_search.py 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297
  1. """
  2. Tests for the ripgrep search backend.
  3. Tests cover:
  4. 1. Search with ripgrep binary
  5. 2. Snapshot ID extraction from file paths
  6. 3. Timeout handling
  7. 4. Error handling
  8. 5. Environment variable configuration
  9. """
  10. import os
  11. import shutil
  12. import subprocess
  13. import tempfile
  14. from pathlib import Path
  15. from unittest.mock import patch, MagicMock
  16. import pytest
  17. from django.test import TestCase
  18. from archivebox.plugins.search_backend_ripgrep.search import (
  19. search,
  20. flush,
  21. get_env,
  22. get_env_int,
  23. get_env_array,
  24. )
  25. class TestEnvHelpers(TestCase):
  26. """Test environment variable helper functions."""
  27. def test_get_env_default(self):
  28. """get_env should return default for unset vars."""
  29. result = get_env('NONEXISTENT_VAR_12345', 'default')
  30. self.assertEqual(result, 'default')
  31. def test_get_env_set(self):
  32. """get_env should return value for set vars."""
  33. with patch.dict(os.environ, {'TEST_VAR': 'value'}):
  34. result = get_env('TEST_VAR', 'default')
  35. self.assertEqual(result, 'value')
  36. def test_get_env_strips_whitespace(self):
  37. """get_env should strip whitespace."""
  38. with patch.dict(os.environ, {'TEST_VAR': ' value '}):
  39. result = get_env('TEST_VAR', '')
  40. self.assertEqual(result, 'value')
  41. def test_get_env_int_default(self):
  42. """get_env_int should return default for unset vars."""
  43. result = get_env_int('NONEXISTENT_VAR_12345', 42)
  44. self.assertEqual(result, 42)
  45. def test_get_env_int_valid(self):
  46. """get_env_int should parse integer values."""
  47. with patch.dict(os.environ, {'TEST_INT': '100'}):
  48. result = get_env_int('TEST_INT', 0)
  49. self.assertEqual(result, 100)
  50. def test_get_env_int_invalid(self):
  51. """get_env_int should return default for invalid integers."""
  52. with patch.dict(os.environ, {'TEST_INT': 'not a number'}):
  53. result = get_env_int('TEST_INT', 42)
  54. self.assertEqual(result, 42)
  55. def test_get_env_array_default(self):
  56. """get_env_array should return default for unset vars."""
  57. result = get_env_array('NONEXISTENT_VAR_12345', ['default'])
  58. self.assertEqual(result, ['default'])
  59. def test_get_env_array_valid(self):
  60. """get_env_array should parse JSON arrays."""
  61. with patch.dict(os.environ, {'TEST_ARRAY': '["a", "b", "c"]'}):
  62. result = get_env_array('TEST_ARRAY', [])
  63. self.assertEqual(result, ['a', 'b', 'c'])
  64. def test_get_env_array_invalid_json(self):
  65. """get_env_array should return default for invalid JSON."""
  66. with patch.dict(os.environ, {'TEST_ARRAY': 'not json'}):
  67. result = get_env_array('TEST_ARRAY', ['default'])
  68. self.assertEqual(result, ['default'])
  69. def test_get_env_array_not_array(self):
  70. """get_env_array should return default for non-array JSON."""
  71. with patch.dict(os.environ, {'TEST_ARRAY': '{"key": "value"}'}):
  72. result = get_env_array('TEST_ARRAY', ['default'])
  73. self.assertEqual(result, ['default'])
  74. class TestRipgrepFlush(TestCase):
  75. """Test the flush function."""
  76. def test_flush_is_noop(self):
  77. """flush should be a no-op for ripgrep backend."""
  78. # Should not raise
  79. flush(['snap-001', 'snap-002'])
  80. class TestRipgrepSearch(TestCase):
  81. """Test the ripgrep search function."""
  82. def setUp(self):
  83. """Create temporary archive directory with test files."""
  84. self.temp_dir = tempfile.mkdtemp()
  85. self.archive_dir = Path(self.temp_dir) / 'archive'
  86. self.archive_dir.mkdir()
  87. # Create snapshot directories with searchable content
  88. self._create_snapshot('snap-001', {
  89. 'singlefile/index.html': '<html><body>Python programming tutorial</body></html>',
  90. 'title/title.txt': 'Learn Python Programming',
  91. })
  92. self._create_snapshot('snap-002', {
  93. 'singlefile/index.html': '<html><body>JavaScript guide</body></html>',
  94. 'title/title.txt': 'JavaScript Basics',
  95. })
  96. self._create_snapshot('snap-003', {
  97. 'wget/index.html': '<html><body>Web archiving guide and best practices</body></html>',
  98. 'title/title.txt': 'Web Archiving guide',
  99. })
  100. # Patch settings
  101. self.settings_patch = patch(
  102. 'archivebox.plugins.search_backend_ripgrep.search.settings'
  103. )
  104. self.mock_settings = self.settings_patch.start()
  105. self.mock_settings.ARCHIVE_DIR = str(self.archive_dir)
  106. def tearDown(self):
  107. """Clean up temporary directory."""
  108. self.settings_patch.stop()
  109. shutil.rmtree(self.temp_dir, ignore_errors=True)
  110. def _create_snapshot(self, snapshot_id: str, files: dict):
  111. """Create a snapshot directory with files."""
  112. snap_dir = self.archive_dir / snapshot_id
  113. for path, content in files.items():
  114. file_path = snap_dir / path
  115. file_path.parent.mkdir(parents=True, exist_ok=True)
  116. file_path.write_text(content)
  117. def _has_ripgrep(self) -> bool:
  118. """Check if ripgrep is available."""
  119. return shutil.which('rg') is not None
  120. def test_search_no_archive_dir(self):
  121. """search should return empty list when archive dir doesn't exist."""
  122. self.mock_settings.ARCHIVE_DIR = '/nonexistent/path'
  123. results = search('test')
  124. self.assertEqual(results, [])
  125. def test_search_single_match(self):
  126. """search should find matching snapshot."""
  127. results = search('Python programming')
  128. self.assertIn('snap-001', results)
  129. self.assertNotIn('snap-002', results)
  130. self.assertNotIn('snap-003', results)
  131. def test_search_multiple_matches(self):
  132. """search should find all matching snapshots."""
  133. # 'guide' appears in snap-002 (JavaScript guide) and snap-003 (Archiving Guide)
  134. results = search('guide')
  135. self.assertIn('snap-002', results)
  136. self.assertIn('snap-003', results)
  137. self.assertNotIn('snap-001', results)
  138. def test_search_case_insensitive_by_default(self):
  139. """search should be case-sensitive (ripgrep default)."""
  140. # By default rg is case-sensitive
  141. results_upper = search('PYTHON')
  142. results_lower = search('python')
  143. # Depending on ripgrep config, results may differ
  144. self.assertIsInstance(results_upper, list)
  145. self.assertIsInstance(results_lower, list)
  146. def test_search_no_results(self):
  147. """search should return empty list for no matches."""
  148. results = search('xyznonexistent123')
  149. self.assertEqual(results, [])
  150. def test_search_regex(self):
  151. """search should support regex patterns."""
  152. results = search('(Python|JavaScript)')
  153. self.assertIn('snap-001', results)
  154. self.assertIn('snap-002', results)
  155. def test_search_distinct_snapshots(self):
  156. """search should return distinct snapshot IDs."""
  157. # Query matches both files in snap-001
  158. results = search('Python')
  159. # Should only appear once
  160. self.assertEqual(results.count('snap-001'), 1)
  161. def test_search_missing_binary(self):
  162. """search should raise when ripgrep binary not found."""
  163. with patch.dict(os.environ, {'RIPGREP_BINARY': '/nonexistent/rg'}):
  164. with patch('shutil.which', return_value=None):
  165. with self.assertRaises(RuntimeError) as context:
  166. search('test')
  167. self.assertIn('ripgrep binary not found', str(context.exception))
  168. def test_search_with_custom_args(self):
  169. """search should use custom RIPGREP_ARGS."""
  170. with patch.dict(os.environ, {'RIPGREP_ARGS': '["-i"]'}): # Case insensitive
  171. results = search('PYTHON')
  172. # With -i flag, should find regardless of case
  173. self.assertIn('snap-001', results)
  174. def test_search_timeout(self):
  175. """search should handle timeout gracefully."""
  176. with patch.dict(os.environ, {'RIPGREP_TIMEOUT': '1'}):
  177. # Short timeout, should still complete for small archive
  178. results = search('Python')
  179. self.assertIsInstance(results, list)
  180. class TestRipgrepSearchIntegration(TestCase):
  181. """Integration tests with realistic archive structure."""
  182. def setUp(self):
  183. """Create archive with realistic structure."""
  184. self.temp_dir = tempfile.mkdtemp()
  185. self.archive_dir = Path(self.temp_dir) / 'archive'
  186. self.archive_dir.mkdir()
  187. # Realistic snapshot structure
  188. self._create_snapshot('1704067200.123456', { # 2024-01-01
  189. 'singlefile.html': '''<!DOCTYPE html>
  190. <html>
  191. <head><title>ArchiveBox Documentation</title></head>
  192. <body>
  193. <h1>Getting Started with ArchiveBox</h1>
  194. <p>ArchiveBox is a powerful, self-hosted web archiving tool.</p>
  195. <p>Install with: pip install archivebox</p>
  196. </body>
  197. </html>''',
  198. 'title/title.txt': 'ArchiveBox Documentation',
  199. 'screenshot/screenshot.png': b'PNG IMAGE DATA', # Binary file
  200. })
  201. self._create_snapshot('1704153600.654321', { # 2024-01-02
  202. 'wget/index.html': '''<html>
  203. <head><title>Python News</title></head>
  204. <body>
  205. <h1>Python 3.12 Released</h1>
  206. <p>New features include improved error messages and performance.</p>
  207. </body>
  208. </html>''',
  209. 'readability/content.html': '<p>Python 3.12 has been released with exciting new features.</p>',
  210. })
  211. self.settings_patch = patch(
  212. 'archivebox.plugins.search_backend_ripgrep.search.settings'
  213. )
  214. self.mock_settings = self.settings_patch.start()
  215. self.mock_settings.ARCHIVE_DIR = str(self.archive_dir)
  216. def tearDown(self):
  217. """Clean up."""
  218. self.settings_patch.stop()
  219. shutil.rmtree(self.temp_dir, ignore_errors=True)
  220. def _create_snapshot(self, timestamp: str, files: dict):
  221. """Create snapshot with timestamp-based ID."""
  222. snap_dir = self.archive_dir / timestamp
  223. for path, content in files.items():
  224. file_path = snap_dir / path
  225. file_path.parent.mkdir(parents=True, exist_ok=True)
  226. if isinstance(content, bytes):
  227. file_path.write_bytes(content)
  228. else:
  229. file_path.write_text(content)
  230. def test_search_archivebox(self):
  231. """Search for archivebox should find documentation snapshot."""
  232. results = search('archivebox')
  233. self.assertIn('1704067200.123456', results)
  234. def test_search_python(self):
  235. """Search for python should find Python news snapshot."""
  236. results = search('Python')
  237. self.assertIn('1704153600.654321', results)
  238. def test_search_pip_install(self):
  239. """Search for installation command."""
  240. results = search('pip install')
  241. self.assertIn('1704067200.123456', results)
  242. if __name__ == '__main__':
  243. pytest.main([__file__, '-v'])