Browse Source

fixed implicit cutoff vs limit; added better fullscan detection in json queries

Ilya Kuznetsov 2 years ago
parent
commit
2b579012fc

+ 8 - 7
src/sphinx.cpp

@@ -1411,7 +1411,7 @@ private:
 	bool						SelectIteratorsFT ( const CSphQuery & tQuery, const CSphVector<CSphFilterSettings> & dFilters, const ISphSchema & tSorterSchema, ISphRanker * pRanker, CSphVector<SecondaryIndexInfo_t> & dSIInfo, int iCutoff, int iThreads, StrVec_t & dWarnings ) const;
 
 	bool						IsQueryFast ( const CSphQuery & tQuery, const CSphVector<SecondaryIndexInfo_t> & dEnabledIndexes, float fCost ) const;
-	CSphVector<SecondaryIndexInfo_t> GetEnabledIndexes ( const CSphQuery & tQuery, float & fCost, int iThreads ) const;
+	CSphVector<SecondaryIndexInfo_t> GetEnabledIndexes ( const CSphQuery & tQuery, bool bFT, float & fCost, int iThreads ) const;
 
 	Docstore_i *				GetDocstore() const override { return m_pDocstore.get(); }
 	columnar::Columnar_i *		GetColumnar() const override { return m_pColumnar.get(); }
@@ -3084,13 +3084,13 @@ static bool CheckQueryFilters ( const CSphQuery & tQuery, const CSphSchema & tSc
 }
 
 
-CSphVector<SecondaryIndexInfo_t> CSphIndex_VLN::GetEnabledIndexes ( const CSphQuery & tQuery, float & fCost, int iThreads ) const
+CSphVector<SecondaryIndexInfo_t> CSphIndex_VLN::GetEnabledIndexes ( const CSphQuery & tQuery, bool bFT, float & fCost, int iThreads ) const
 {
 	// if there's a filter tree, we don't have any indexes and there's no point in wasting time to eval them
 	if ( tQuery.m_dFilterTree.GetLength() )
 		return {};
 
-	int iCutoff = ApplyImplicitCutoff ( tQuery, {} );
+	int iCutoff = ApplyImplicitCutoff ( tQuery, {}, bFT );
 
 	StrVec_t dWarnings;
 	SelectIteratorCtx_t tCtx ( tQuery, tQuery.m_dFilters, m_tSchema, m_tSchema, m_pHistograms, m_pColumnar.get(), m_pSIdx.get(), iCutoff, m_iDocinfo, iThreads );
@@ -3114,7 +3114,8 @@ std::pair<int64_t,int> CSphIndex_VLN::GetPseudoShardingMetric ( const VecTraits_
 		auto & tQuery = dQueries[i];
 
 		// limit the number of threads for anything with FT as it looks better in average (some queries are faster without thread cap)
-		if ( !tQuery.m_sQuery.IsEmpty() )
+		bool bFulltext = !tQuery.m_pQueryParser->IsFullscan(tQuery);
+		if ( bFulltext )
 			iThreadCap = iThreadCap ? Min ( iThreadCap, iNumProc ) : iNumProc;
 
 		if ( !tQuery.m_sKNNAttr.IsEmpty() )
@@ -3124,7 +3125,7 @@ std::pair<int64_t,int> CSphIndex_VLN::GetPseudoShardingMetric ( const VecTraits_
 			continue;
 
 		float fCost = FLT_MAX;
-		CSphVector<SecondaryIndexInfo_t> dEnabledIndexes = GetEnabledIndexes ( tQuery, fCost, iThreads );
+		CSphVector<SecondaryIndexInfo_t> dEnabledIndexes = GetEnabledIndexes ( tQuery, bFulltext, fCost, iThreads );
 		bAllFast &= IsQueryFast ( tQuery, dEnabledIndexes, fCost );
 
 		// disable pseudo sharding if any of the queries use docid lookups
@@ -8393,7 +8394,7 @@ bool CSphIndex_VLN::MultiScan ( CSphQueryResult & tResult, const CSphQuery & tQu
 
 	SwitchProfile ( tMeta.m_pProfile, SPH_QSTATE_SETUP_ITER );
 
-	int iCutoff = ApplyImplicitCutoff ( tQuery, dSorters );
+	int iCutoff = ApplyImplicitCutoff ( tQuery, dSorters, false );
 	bool bAllPrecalc = dSorters.GetLength() && dSorters.all_of ( []( auto pSorter ){ return pSorter->IsPrecalc(); } );
 
 	// try to spawn an iterator from a secondary index
@@ -11593,7 +11594,7 @@ bool CSphIndex_VLN::ParsedMultiQuery ( const CSphQuery & tQuery, CSphQueryResult
 
 	SwitchProfile ( pProfile, SPH_QSTATE_SETUP_ITER );
 
-	int iCutoff = ApplyImplicitCutoff ( tQuery, dSorters );
+	int iCutoff = ApplyImplicitCutoff ( tQuery, dSorters, true );
 	CSphVector<CSphFilterSettings> dFiltersAfterIterator; // holds filter settings if they were modified. filters hold pointers to those settings
 	std::pair<RowidIterator_i *, bool> tSpawned = SpawnIterators ( tQuery, dTransformedFilters, tCtx, tFlx, tMaxSorterSchema, tMeta, iCutoff, tArgs.m_iTotalThreads, dFiltersAfterIterator, pRanker.get() );
 	std::unique_ptr<RowidIterator_i> pIterator = std::unique_ptr<RowidIterator_i> ( tSpawned.first );

+ 17 - 3
src/sphinxjsonquery.cpp

@@ -189,10 +189,9 @@ XQNode_t * QueryTreeBuilder_c::AddChildKeyword ( XQNode_t * pParent, const char
 class QueryParserJson_c : public QueryParser_i
 {
 public:
+	bool	IsFullscan ( const CSphQuery & tQuery ) const final;
 	bool	IsFullscan ( const XQQuery_t & tQuery ) const final;
-	bool	ParseQuery ( XQQuery_t & tParsed, const char * sQuery, const CSphQuery * pQuery,
-		TokenizerRefPtr_c pQueryTokenizer, TokenizerRefPtr_c pQueryTokenizerJson,
-		const CSphSchema * pSchema, const DictRefPtr_c& pDict, const CSphIndexSettings & tSettings ) const final;
+	bool	ParseQuery ( XQQuery_t & tParsed, const char * sQuery, const CSphQuery * pQuery, TokenizerRefPtr_c pQueryTokenizer, TokenizerRefPtr_c pQueryTokenizerJson, const CSphSchema * pSchema, const DictRefPtr_c& pDict, const CSphIndexSettings & tSettings ) const final;
 
 private:
 	XQNode_t *		ConstructMatchNode ( const JsonObj_c & tJson, bool bPhrase, bool bTerms, bool bSingleTerm, QueryTreeBuilder_c & tBuilder ) const;
@@ -207,6 +206,21 @@ private:
 };
 
 
+bool QueryParserJson_c::IsFullscan ( const CSphQuery & tQuery ) const
+{
+	const char * szQ = tQuery.m_sQuery.cstr();
+	if ( !szQ )									return true;
+	if ( strstr ( szQ, R"("match")" ) )			return false;
+	if ( strstr ( szQ, R"("terms")" ) )			return false;
+	if ( strstr ( szQ, R"("match_phrase")" ) )	return false;
+	if ( strstr ( szQ, R"("term")" ) )			return false;
+	if ( strstr ( szQ, R"("query_string")" ) )	return false;
+	if ( strstr ( szQ, R"("simple_query_string")" ) ) return false;
+
+	return true;
+}
+
+
 bool QueryParserJson_c::IsFullscan ( const XQQuery_t & tQuery ) const
 {
 	return !( tQuery.m_pRoot && ( tQuery.m_pRoot->m_dChildren.GetLength () || tQuery.m_pRoot->m_dWords.GetLength () ) );

+ 2 - 2
src/sphinxrt.cpp

@@ -7266,7 +7266,7 @@ static bool DoFullScanQuery ( const RtSegVec_c & dRamChunks, const ISphSchema &
 			return false;
 		// FIXME! OPTIMIZE! check if we can early reject the whole index
 
-		int iCutoff = ApplyImplicitCutoff ( tQuery, dSorters );
+		int iCutoff = ApplyImplicitCutoff ( tQuery, dSorters, false );
 		tMeta.m_bTotalMatchesApprox |= PerformFullscan ( dRamChunks, tMaxSorterSchema.GetDynamicSize(), tArgs.m_iIndexWeight, iStride, iCutoff, tmMaxTimer, pProfiler, tCtx, dSorters, tMeta.m_sWarning );
 	}
 
@@ -7417,7 +7417,7 @@ static bool DoFullTextSearch ( const RtSegVec_c & dRamChunks, const ISphSchema &
 		// FIXME! OPTIMIZE! check if we can early reject the whole index
 
 		// do searching
-		int iCutoff = ApplyImplicitCutoff ( tQuery, dSorters );
+		int iCutoff = ApplyImplicitCutoff ( tQuery, dSorters, true );
 		PerformFullTextSearch ( dRamChunks, tTermSetup, pRanker.get (), tArgs.m_iIndexWeight, iCutoff, pProfiler, tCtx, dSorters );
 	}
 

+ 2 - 5
src/sphinxsort.cpp

@@ -7308,7 +7308,7 @@ static void CreateSorters ( const VecTraits_T<CSphQuery> & dQueries, const VecTr
 }
 
 
-int ApplyImplicitCutoff ( const CSphQuery & tQuery, const VecTraits_T<ISphMatchSorter*> & dSorters )
+int ApplyImplicitCutoff ( const CSphQuery & tQuery, const VecTraits_T<ISphMatchSorter*> & dSorters, bool bFT )
 {
 	bool bAllPrecalc = dSorters.GetLength() && dSorters.all_of ( []( auto pSorter ){ return pSorter->IsPrecalc(); } );
 	if ( bAllPrecalc )
@@ -7330,10 +7330,7 @@ int ApplyImplicitCutoff ( const CSphQuery & tQuery, const VecTraits_T<ISphMatchS
 		return -1;
 
 	// implicit cutoff when there's no sorting and no grouping
-	bool bNoSortScan = tQuery.m_sQuery.IsEmpty() && ( tQuery.m_sSortBy=="@weight desc" || tQuery.m_sSortBy.IsEmpty() );
-	bool bNoSortFT = !tQuery.m_sQuery.IsEmpty() && !strstr ( tQuery.m_sSortBy.scstr(), "weight" );
-
-	if ( ( bNoSortScan || bNoSortFT ) && tQuery.m_sGroupBy.IsEmpty() && !tQuery.m_bFacet && !tQuery.m_bFacetHead )
+	if ( !bFT && ( tQuery.m_sSortBy=="@weight desc" || tQuery.m_sSortBy.IsEmpty() ) && tQuery.m_sGroupBy.IsEmpty() && !tQuery.m_bFacet && !tQuery.m_bFacetHead )
 		return tQuery.m_iLimit+tQuery.m_iOffset;
 
 	return -1;

+ 1 - 1
src/sphinxsort.h

@@ -200,7 +200,7 @@ bool			GetAccurateAggregationDefault();
 void			SetDistinctThreshDefault ( int iThresh );
 int 			GetDistinctThreshDefault();
 
-int				ApplyImplicitCutoff ( const CSphQuery & tQuery, const VecTraits_T<ISphMatchSorter*> & dSorters );
+int				ApplyImplicitCutoff ( const CSphQuery & tQuery, const VecTraits_T<ISphMatchSorter*> & dSorters, bool bFT );
 bool			HasImplicitGrouping ( const CSphQuery & tQuery );
 
 /// creates proper queue for given query

+ 1 - 1
src/tests.cpp

@@ -577,7 +577,7 @@ static float GetEstimatedCost ( CSphQuery & tQuery, CSphIndex * pIndex, Secondar
 		dSIInfo[i].m_eType = eType;
 	}
 
-	int iCutoff = ApplyImplicitCutoff ( tQuery, {} );
+	int iCutoff = ApplyImplicitCutoff ( tQuery, {}, false );
 	SelectIteratorCtx_t tCtx ( tQuery, tQuery.m_dFilters, pIndex->GetMatchSchema(), pIndex->GetMatchSchema(), pIndex->Debug_GetHistograms(), pIndex->GetColumnar(), pIndex->Debug_GetSI(), iCutoff, tStats.m_iTotalDocuments, 1 );
 	int iNumIterators = dSIInfo.count_of ( []( auto & tSI ){ return tSI.m_eType==SecondaryIndexType_e::INDEX || tSI.m_eType==SecondaryIndexType_e::ANALYZER; } );
 	if ( iNumIterators > 1 )

File diff suppressed because it is too large
+ 0 - 0
test/test_217/model.bin


File diff suppressed because it is too large
+ 0 - 0
test/test_377/model.bin


File diff suppressed because it is too large
+ 0 - 0
test/test_401/model.bin


Some files were not shown because too many files changed in this diff