on_Snapshot__90_index_sqlite.py 5.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181
  1. #!/usr/bin/env python3
  2. """
SQLite FTS5 search backend - indexes snapshot content for full-text search.

This hook runs after all extractors and indexes their text output in SQLite FTS5.
It only runs when SEARCH_BACKEND_ENGINE is 'sqlite' (which is also the value
assumed when the variable is unset).

Usage: on_Snapshot__90_index_sqlite.py --url=<url> --snapshot-id=<uuid>

Environment variables:
    SEARCH_BACKEND_ENGINE: Must be 'sqlite' for this hook to run (default: sqlite)
    USE_INDEXING_BACKEND: Enable search indexing (default: true)
    DATA_DIR: Directory containing the database (default: two levels above the working directory)
    SQLITEFTS_DB: Database filename (default: search.sqlite3)
    FTS_TOKENIZERS: FTS5 tokenizer config (default: porter unicode61 remove_diacritics 2)
  12. """
  13. import json
  14. import os
  15. import re
  16. import sqlite3
  17. import sys
  18. from pathlib import Path
  19. import rich_click as click
  20. # Extractor metadata
  21. PLUGIN_NAME = 'index_sqlite'
  22. OUTPUT_DIR = '.'
  23. # Text file patterns to index, in priority order
  24. INDEXABLE_FILES = [
  25. ('readability', 'content.txt'),
  26. ('readability', 'content.html'),
  27. ('mercury', 'content.txt'),
  28. ('mercury', 'content.html'),
  29. ('htmltotext', 'output.txt'),
  30. ('singlefile', 'singlefile.html'),
  31. ('dom', 'output.html'),
  32. ('wget', '**/*.html'),
  33. ('wget', '**/*.htm'),
  34. ('title', 'title.txt'),
  35. ]
  36. def get_env(name: str, default: str = '') -> str:
  37. return os.environ.get(name, default).strip()
  38. def get_env_bool(name: str, default: bool = False) -> bool:
  39. val = get_env(name, '').lower()
  40. if val in ('true', '1', 'yes', 'on'):
  41. return True
  42. if val in ('false', '0', 'no', 'off'):
  43. return False
  44. return default
  45. def strip_html_tags(html: str) -> str:
  46. """Remove HTML tags, keeping text content."""
  47. html = re.sub(r'<script[^>]*>.*?</script>', '', html, flags=re.DOTALL | re.IGNORECASE)
  48. html = re.sub(r'<style[^>]*>.*?</style>', '', html, flags=re.DOTALL | re.IGNORECASE)
  49. html = re.sub(r'<[^>]+>', ' ', html)
  50. html = html.replace('&nbsp;', ' ').replace('&amp;', '&')
  51. html = html.replace('&lt;', '<').replace('&gt;', '>')
  52. html = html.replace('&quot;', '"')
  53. html = re.sub(r'\s+', ' ', html)
  54. return html.strip()
  55. def find_indexable_content() -> list[tuple[str, str]]:
  56. """Find text content to index from extractor outputs."""
  57. results = []
  58. cwd = Path.cwd()
  59. for extractor, file_pattern in INDEXABLE_FILES:
  60. plugin_dir = cwd / extractor
  61. if not plugin_dir.exists():
  62. continue
  63. if '*' in file_pattern:
  64. matches = list(plugin_dir.glob(file_pattern))
  65. else:
  66. match = plugin_dir / file_pattern
  67. matches = [match] if match.exists() else []
  68. for match in matches:
  69. if match.is_file() and match.stat().st_size > 0:
  70. try:
  71. content = match.read_text(encoding='utf-8', errors='ignore')
  72. if content.strip():
  73. if match.suffix in ('.html', '.htm'):
  74. content = strip_html_tags(content)
  75. results.append((f'{extractor}/{match.name}', content))
  76. except Exception:
  77. continue
  78. return results
  79. def get_db_path() -> Path:
  80. """Get path to the search index database."""
  81. data_dir = get_env('DATA_DIR', str(Path.cwd().parent.parent))
  82. db_name = get_env('SQLITEFTS_DB', 'search.sqlite3')
  83. return Path(data_dir) / db_name
  84. def index_in_sqlite(snapshot_id: str, texts: list[str]) -> None:
  85. """Index texts in SQLite FTS5."""
  86. db_path = get_db_path()
  87. tokenizers = get_env('FTS_TOKENIZERS', 'porter unicode61 remove_diacritics 2')
  88. conn = sqlite3.connect(str(db_path))
  89. try:
  90. # Create FTS5 table if needed
  91. conn.execute(f'''
  92. CREATE VIRTUAL TABLE IF NOT EXISTS search_index
  93. USING fts5(snapshot_id, content, tokenize='{tokenizers}')
  94. ''')
  95. # Remove existing entries
  96. conn.execute('DELETE FROM search_index WHERE snapshot_id = ?', (snapshot_id,))
  97. # Insert new content
  98. content = '\n\n'.join(texts)
  99. conn.execute(
  100. 'INSERT INTO search_index (snapshot_id, content) VALUES (?, ?)',
  101. (snapshot_id, content)
  102. )
  103. conn.commit()
  104. finally:
  105. conn.close()
  106. @click.command()
  107. @click.option('--url', required=True, help='URL that was archived')
  108. @click.option('--snapshot-id', required=True, help='Snapshot UUID')
  109. def main(url: str, snapshot_id: str):
  110. """Index snapshot content in SQLite FTS5."""
  111. output = None
  112. status = 'failed'
  113. error = ''
  114. indexed_sources = []
  115. try:
  116. # Check if this backend is enabled (permanent skips - don't retry)
  117. backend = get_env('SEARCH_BACKEND_ENGINE', 'sqlite')
  118. if backend != 'sqlite':
  119. print(f'Skipping SQLite indexing (SEARCH_BACKEND_ENGINE={backend})', file=sys.stderr)
  120. sys.exit(0) # Permanent skip - different backend selected
  121. if not get_env_bool('USE_INDEXING_BACKEND', True):
  122. print('Skipping indexing (USE_INDEXING_BACKEND=False)', file=sys.stderr)
  123. sys.exit(0) # Permanent skip - indexing disabled
  124. else:
  125. contents = find_indexable_content()
  126. indexed_sources = [source for source, _ in contents]
  127. if not contents:
  128. status = 'skipped'
  129. print('No indexable content found', file=sys.stderr)
  130. else:
  131. texts = [content for _, content in contents]
  132. index_in_sqlite(snapshot_id, texts)
  133. status = 'succeeded'
  134. output = OUTPUT_DIR
  135. except Exception as e:
  136. error = f'{type(e).__name__}: {e}'
  137. status = 'failed'
  138. if error:
  139. print(f'ERROR: {error}', file=sys.stderr)
  140. # Search indexing hooks don't emit ArchiveResult - they're utility hooks
  141. # Exit code indicates success/failure
  142. sys.exit(0 if status == 'succeeded' else 1)
  143. if __name__ == '__main__':
  144. main()