search.py 3.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116
"""
Ripgrep search backend - searches files directly without indexing.

This backend doesn't maintain an index; it searches the archived files
directly using ripgrep (rg). This is simpler than an indexed backend, but
slower for large archives.

Environment variables:
    RIPGREP_BINARY: Path to the ripgrep binary (default: rg)
    RIPGREP_ARGS: Default ripgrep arguments (JSON array)
    RIPGREP_ARGS_EXTRA: Extra arguments to append (JSON array)
    RIPGREP_TIMEOUT: Search timeout in seconds (default: 90)
    ARCHIVE_DIR: Directory to search (takes precedence over DATA_DIR)
    DATA_DIR: Data directory; its "archive" subdirectory is searched
"""
  11. import json
  12. import os
  13. import subprocess
  14. import shutil
  15. from pathlib import Path
  16. from typing import List, Iterable
  17. from django.conf import settings
  18. def get_env(name: str, default: str = '') -> str:
  19. return os.environ.get(name, default).strip()
  20. def get_env_int(name: str, default: int = 0) -> int:
  21. try:
  22. return int(get_env(name, str(default)))
  23. except ValueError:
  24. return default
  25. def get_env_array(name: str, default: list[str] | None = None) -> list[str]:
  26. """Parse a JSON array from environment variable."""
  27. val = get_env(name, '')
  28. if not val:
  29. return default if default is not None else []
  30. try:
  31. result = json.loads(val)
  32. if isinstance(result, list):
  33. return [str(item) for item in result]
  34. return default if default is not None else []
  35. except json.JSONDecodeError:
  36. return default if default is not None else []
  37. def _get_archive_dir() -> Path:
  38. archive_dir = os.environ.get('ARCHIVE_DIR', '').strip()
  39. if archive_dir:
  40. return Path(archive_dir)
  41. data_dir = os.environ.get('DATA_DIR', '').strip()
  42. if data_dir:
  43. return Path(data_dir) / 'archive'
  44. settings_archive_dir = getattr(settings, 'ARCHIVE_DIR', None)
  45. if settings_archive_dir:
  46. return Path(settings_archive_dir)
  47. settings_data_dir = getattr(settings, 'DATA_DIR', None)
  48. if settings_data_dir:
  49. return Path(settings_data_dir) / 'archive'
  50. return Path.cwd() / 'archive'
  51. def search(query: str) -> List[str]:
  52. """Search for snapshots using ripgrep."""
  53. rg_binary = get_env('RIPGREP_BINARY', 'rg')
  54. rg_binary = shutil.which(rg_binary) or rg_binary
  55. if not rg_binary or not Path(rg_binary).exists():
  56. raise RuntimeError(f'ripgrep binary not found. Install with: apt install ripgrep')
  57. timeout = get_env_int('RIPGREP_TIMEOUT', 90)
  58. ripgrep_args = get_env_array('RIPGREP_ARGS', [])
  59. ripgrep_args_extra = get_env_array('RIPGREP_ARGS_EXTRA', [])
  60. archive_dir = _get_archive_dir()
  61. if not archive_dir.exists():
  62. return []
  63. cmd = [
  64. rg_binary,
  65. *ripgrep_args,
  66. *ripgrep_args_extra,
  67. '--regexp',
  68. query,
  69. str(archive_dir),
  70. ]
  71. try:
  72. result = subprocess.run(cmd, capture_output=True, text=True, timeout=timeout)
  73. # Extract snapshot IDs from file paths
  74. # Paths look like: archive/<snapshot_id>/<extractor>/file.txt
  75. snapshot_ids = set()
  76. for line in result.stdout.strip().split('\n'):
  77. if not line:
  78. continue
  79. path = Path(line)
  80. try:
  81. relative = path.relative_to(archive_dir)
  82. snapshot_id = relative.parts[0]
  83. snapshot_ids.add(snapshot_id)
  84. except (ValueError, IndexError):
  85. continue
  86. return list(snapshot_ids)
  87. except subprocess.TimeoutExpired:
  88. return []
  89. except Exception:
  90. return []
  91. def flush(snapshot_ids: Iterable[str]) -> None:
  92. """No-op for ripgrep - it searches files directly."""
  93. pass