2
0
View source

Add indexing to update command and utilities

JDC 5 years ago
parent
commit
caf4660ac8

+ 2 - 1
archivebox/main.py

@@ -115,7 +115,7 @@ from .logging_util import (
     printable_dependency_version,
 )
 
-from .search import flush_search_index
+from .search import flush_search_index, index_links
 
 ALLOWED_IN_OUTPUT_DIR = {
     'lost+found',
@@ -711,6 +711,7 @@ def update(resume: Optional[float]=None,
     if index_only:
         for link in all_links:
             write_link_details(link, out_dir=out_dir, skip_sql_index=True)
+        index_links(all_links, out_dir=out_dir)
         return all_links
         
     # Step 2: Run the archive methods for each link

+ 16 - 0
archivebox/search/__init__.py

@@ -8,6 +8,8 @@ from archivebox.index.schema import Link
 from archivebox.util import enforce_types
 from archivebox.config import setup_django,stderr, OUTPUT_DIR, USE_INDEXING_BACKEND, USE_SEARCHING_BACKEND, SEARCH_BACKEND_ENGINE
 
+from .utils import get_indexable_content
+
 def indexing_enabled():
     """Return truthy when writes to the search index are enabled via config (USE_INDEXING_BACKEND)."""
     return USE_INDEXING_BACKEND
 
@@ -83,3 +85,17 @@ def flush_search_index(snapshots: QuerySet):
             f'[X] The search backend threw an exception={err}:',
         color='red',
         )
+
@enforce_types
def index_links(links: Union[List[Link],None], out_dir: Path=OUTPUT_DIR):
    """Push the indexable extractor output of each link into the search index.

    No-op when *links* is empty or None. Links with no matching Snapshot row
    in the database are skipped silently.
    """
    if not links:
        return

    setup_django(out_dir=out_dir, check_db=True)
    from core.models import Snapshot, ArchiveResult

    for link in links:
        snapshot = Snapshot.objects.filter(url=link.url).first()
        if snapshot is None:
            continue
        indexable_results = ArchiveResult.objects.indexable().filter(snapshot=snapshot)
        texts = get_indexable_content(indexable_results)
        write_search_index(link, texts, out_dir=out_dir)

+ 38 - 0
archivebox/search/utils.py

@@ -0,0 +1,38 @@
+from django.db.models import QuerySet
+
+from archivebox.util import enforce_types
+
def get_file_result_content(res, extra_path, use_pwd=False):
    """Read the text output file of one ArchiveResult for search indexing.

    Builds the file path from ``res.pwd``/``res.output`` (plus *extra_path*
    when given) and returns its contents, newlines stripped, as a one-element
    list — or [] when the file is empty, missing, or unreadable.
    """
    if use_pwd:
        fpath = f'{res.pwd}/{res.output}'
    else:
        fpath = f'{res.output}'

    if extra_path:
        fpath = f'{fpath}/{extra_path}'

    try:
        # Explicit encoding: the default is locale-dependent and undecodable
        # bytes would otherwise raise and abort the whole indexing pass.
        with open(fpath, 'r', encoding='utf-8', errors='replace') as file:
            data = file.read().replace('\n', '')
    except OSError:
        # Best-effort: a snapshot whose output file is missing or unreadable
        # should be skipped, not crash the update command.
        return []

    if data:
        return [data]
    return []
+
+
# This should be abstracted by a plugin interface for extractors
@enforce_types
def get_indexable_content(results: QuerySet):
    """Return the searchable text for the first indexable ArchiveResult.

    Only the first result's extractor is consulted; returns [] when there are
    no results or the extractor is not one we know how to read text from.
    """
    if not results:
        return []

    # Fetch once: every results.first() call issues a fresh DB query, and two
    # calls are not guaranteed to return the same row.
    res = results.first()
    method = res.extractor

    # Maps each supported extractor to the (extra_path, use_pwd) pair locating
    # its text output. This should come from a plugin interface.
    extractor_locations = {
        'readability': ('content.txt', False),
        'singlefile': ('', False),
        'dom': ('', True),
        'wget': ('', True),
    }
    if method not in extractor_locations:
        return []
    extra_path, use_pwd = extractor_locations[method]
    return get_file_result_content(res, extra_path, use_pwd=use_pwd)