浏览代码

Implement backend architecture for search engines

JDC 5 年之前
父节点
当前提交
5f6673c72c
共有 4 个文件被更改，包括 67 次插入和 33 次删除
  1. 12 8
      archivebox/core/mixins.py
  2. 36 25
      archivebox/search/__init__.py
  3. 0 0
      archivebox/search/backends/__init__.py
  4. 19 0
      archivebox/search/backends/sonic.py

+ 12 - 8
archivebox/core/mixins.py

@@ -1,10 +1,10 @@
-from django.db.models import Q, Case, When, Value, IntegerField
+from django.contrib import messages
 
-from archivebox.search import search_index
+from archivebox.search import query_search_index
 
 class SearchResultsAdminMixin(object):
     def get_search_results(self, request, queryset, search_term):
-        ''' Show exact match for title and slug at top of admin search results.
+        ''' Enhances the search queryset with results from the search backend.
         '''
         qs, use_distinct = \
             super(SearchResultsAdminMixin, self).get_search_results(
@@ -13,9 +13,13 @@ class SearchResultsAdminMixin(object):
         search_term = search_term.strip()
         if not search_term:
             return qs, use_distinct
+        try:
+            snapshot_ids = query_search_index(search_term)
+        except Exception as err:
+            messages.add_message(request, messages.WARNING, f'Error from the search backend, only showing results from default admin search fields - Error: {err}')
+        else:
+            qsearch = queryset.filter(id__in=snapshot_ids)
+            qs |= qsearch
 
-        snapshot_ids = search_index(search_term)
-        qsearch = queryset.filter(id__in=snapshot_ids)
-        qs |= qsearch
-
-        return qs, use_distinct
+        finally:
+            return qs, use_distinct

+ 36 - 25
archivebox/search/__init__.py

@@ -1,40 +1,51 @@
-from typing import List, Optional, Union
+from typing import List, Union
 from pathlib import Path
+from importlib import import_module
 
-from sonic import IngestClient, SearchClient
 
-from ..index.schema import Link, ArchiveResult
-from ..util import enforce_types
-from ..config import setup_django, OUTPUT_DIR
+from archivebox.index.schema import Link
+from archivebox.util import enforce_types
+from archivebox.config import setup_django, OUTPUT_DIR
 
 
-@enforce_types
-def write_sonic_index(snapshot_id: str, texts: List[str]):
-    # TODO add variables to localhost, port, password, bucket, collection
-    with IngestClient("localhost", 1491, "SecretPassword") as ingestcl:
-        for text in texts:
-            ingestcl.push("archivebox", "snapshots", snapshot_id, str(text))
-
-@enforce_types
-def search_sonic_index(text: str) -> List:
-    with SearchClient("localhost", 1491, "SecretPassword") as querycl:
-        snap_ids = querycl.query("archivebox", "snapshots", text)
-    return snap_ids
+def indexing_enabled():
+    """Report whether full-text indexing should run.
+
+    Currently hard-coded to ``True``.
+    """
+    # TODO: presumably meant to return a FULLTEXT_INDEXING_ENABLED setting
+    # once one exists -- confirm against the config module.
+    enabled = True
+    return enabled
 
+def search_backend_enabled():
+    """Report whether queries should be sent to the search backend.
+
+    Currently hard-coded to ``True``.
+    """
+    # TODO: presumably meant to return a FULLTEXT_SEARCH_ENABLED setting
+    # once one exists -- confirm against the config module.
+    enabled = True
+    return enabled
 
-@enforce_types
-def search_index(text: str) -> List:
-    # get backend
-    return search_sonic_index(text)
+def get_backend():
+    """Return the dotted module path of the search backend to load."""
+    backend_path = 'search.backends.sonic'
+    return backend_path
 
+def import_backend():
+    """Import and return the module implementing the search backend.
+
+    The dotted module path is taken from ``get_backend()``.
+
+    Raises:
+        Exception: if the backend module cannot be imported. The original
+            error is attached as ``__cause__`` so its traceback is kept.
+    """
+    backend_string = get_backend()
+    try:
+        return import_module(backend_string)
+    except Exception as err:
+        # Chain explicitly so the underlying import failure stays visible
+        # instead of being reduced to its string form.
+        raise Exception("Could not load '%s' as a backend: %s" % (backend_string, err)) from err
 
 @enforce_types
 def write_search_index(link: Link, texts: Union[List[str], None]=None, out_dir: Path=OUTPUT_DIR, skip_text_index: bool=False) -> None:
-    setup_django(out_dir, check_db=True)
-    from core.models import Snapshot
+    if not indexing_enabled():
+        return
 
     if not skip_text_index and texts:
+        setup_django(out_dir, check_db=True)
+        from core.models import Snapshot
+
         snap = Snapshot.objects.filter(url=link.url).first()
+        backend = import_backend()
         if snap:
-            # get backend
-            write_sonic_index(str(snap.id), texts)
+            backend.index(snapshot_id=str(snap.id), texts=texts)
+
+@enforce_types
+def query_search_index(text: str) -> List:
+    """Run ``text`` through the configured search backend.
+
+    Returns an empty list when ``search_backend_enabled()`` is false;
+    otherwise returns whatever the backend module's ``search(text)`` returns.
+    """
+    if not search_backend_enabled():
+        return []
+    return import_backend().search(text)
+        

+ 0 - 0
archivebox/search/backends/__init__.py


+ 19 - 0
archivebox/search/backends/sonic.py

@@ -0,0 +1,19 @@
+from typing import List
+
+from sonic import IngestClient, SearchClient
+
+from archivebox.util import enforce_types
+
+@enforce_types
+def index(snapshot_id: str, texts: List[str]):
+    """Push every entry of ``texts`` to Sonic under ``snapshot_id``.
+
+    Each entry is converted with ``str()`` and pushed to the "archivebox"
+    collection, "snapshots" bucket.
+    """
+    # TODO add variables to localhost, port, password, bucket, collection
+    with IngestClient("localhost", 1491, "SecretPassword") as ingest:
+        for chunk in texts:
+            ingest.push("archivebox", "snapshots", snapshot_id, str(chunk))
+
+@enforce_types
+def search(text: str) -> List:
+    """Query Sonic's "archivebox"/"snapshots" index for ``text``.
+
+    Returns the result of the client's ``query`` call unchanged.
+    """
+    with SearchClient("localhost", 1491, "SecretPassword") as client:
+        return client.query("archivebox", "snapshots", text)
+