4 anos atrás · f67a5a215a
--- a/archivebox/search/backends/sonic.py
+++ b/archivebox/search/backends/sonic.py
@@ -5,13 +5,19 @@ from sonic import IngestClient, SearchClient
 
				 from archivebox.util import enforce_types
			
 
				 from archivebox.config import SEARCH_BACKEND_HOST_NAME, SEARCH_BACKEND_PORT, SEARCH_BACKEND_PASSWORD, SONIC_BUCKET, SONIC_COLLECTION
			
 
				 
			
 
				-MAX_SONIC_TEXT_LENGTH = 2000
			
 
				+MAX_SONIC_TEXT_TOTAL_LENGTH = 100000000     # don't index more than 100 million characters per text
			
 
				+MAX_SONIC_TEXT_CHUNK_LENGTH = 2000          # don't index more than 2000 characters per chunk
			
 
				+
			
 
				 
			
 
				 @enforce_types
			
 
				 def index(snapshot_id: str, texts: List[str]):
			
 
				     with IngestClient(SEARCH_BACKEND_HOST_NAME, SEARCH_BACKEND_PORT, SEARCH_BACKEND_PASSWORD) as ingestcl:
			
 
				         for text in texts:
			
 
				-            chunks = [text[i:i+MAX_SONIC_TEXT_LENGTH] for i in range(0, len(text), MAX_SONIC_TEXT_LENGTH)]
			
 
				+            max_length = 1000000
			
 
				+            chunks = (
			
 
				+                text[i:i+MAX_SONIC_TEXT_CHUNK_LENGTH]
			
 
				+                for i in range(0, min(len(text), MAX_SONIC_TEXT_TOTAL_LENGTH), MAX_SONIC_TEXT_CHUNK_LENGTH)
			
 
				+            )
			
 
				             for chunk in chunks:
			
 
				                 ingestcl.push(SONIC_COLLECTION, SONIC_BUCKET, snapshot_id, str(chunk))
			
 
				 
			
--- a/archivebox/search/utils.py
+++ b/archivebox/search/utils.py
@@ -36,10 +36,10 @@ def get_indexable_content(results: QuerySet):
 
				 
			
 
				     # TODO: banish this duplication and get these from the extractor file
			
 
				     if method == 'readability':
			
 
				-        return get_file_result_content(res, 'content.txt')
			
 
				+        return get_file_result_content(res, 'content.txt', use_pwd=True)
			
 
				     elif method == 'singlefile':
			
 
				-        return get_file_result_content(res,'',use_pwd=True)
			
 
				+        return get_file_result_content(res, '', use_pwd=True)
			
 
				     elif method == 'dom':
			
 
				-        return get_file_result_content(res,'',use_pwd=True)
			
 
				+        return get_file_result_content(res, '', use_pwd=True)
			
 
				     elif method == 'wget':
			
 
				-        return get_file_result_content(res,'',use_pwd=True)
			
 
				+        return get_file_result_content(res, '', use_pwd=True)