Forráskód Böngészése

fixed #1604: CALL KEYWORDS now provides multiple lemmas per query term
added query atom position (qpos) for each term in CALL KEYWORDS output
fixed models
added regressions to test 222

git-svn-id: svn://svn.sphinxsearch.com/sphinx/trunk@4516 406a0c4d-033a-0410-8de8-e80135713968

tomat 12 éve
szülő
commit
776bcdabb9

+ 4 - 5
src/searchd.cpp

@@ -15108,7 +15108,8 @@ void HandleMysqlCallKeywords ( SqlRowBuffer_c & tOut, SqlStmt_t & tStmt )
 	}
 
 	// result set header packet
-	tOut.HeadBegin ( bStats ? 4 : 2 );
+	tOut.HeadBegin ( bStats ? 5 : 3 );
+	tOut.HeadColumn("qpos");
 	tOut.HeadColumn("tokenized");
 	tOut.HeadColumn("normalized");
 	if ( bStats )
@@ -15122,10 +15123,8 @@ void HandleMysqlCallKeywords ( SqlRowBuffer_c & tOut, SqlStmt_t & tStmt )
 	char sBuf[16];
 	ARRAY_FOREACH ( i, dKeywords )
 	{
-		char sDocs[16], sHits[16];
-		snprintf ( sDocs, sizeof(sDocs), "%d", dKeywords[i].m_iDocs );
-		snprintf ( sHits, sizeof(sHits), "%d", dKeywords[i].m_iHits );
-
+		snprintf ( sBuf, sizeof(sBuf), "%d", dKeywords[i].m_iQpos );
+		tOut.PutString ( sBuf );
 		tOut.PutString ( dKeywords[i].m_sTokenized.cstr() );
 		tOut.PutString ( dKeywords[i].m_sNormalized.cstr() );
 		if ( bStats )

+ 201 - 86
src/sphinx.cpp

@@ -1427,8 +1427,29 @@ public:
 	virtual	void				SetProgressCallback ( CSphIndexProgress::IndexingProgress_fn ) {}
 };
 
-bool CSphTokenizerIndex::GetKeywords ( CSphVector <CSphKeywordInfo> & dKeywords,
-								const char * szQuery, bool, CSphString * ) const
+
+struct CSphTemplateQueryFilter : public ISphQueryFilter
+{
+	virtual void AddKeywordStats ( BYTE * sWord, const BYTE * sTokenized, int iQpos, CSphVector <CSphKeywordInfo> & dKeywords )
+	{
+		SphWordID_t iWord = m_pDict->GetWordID ( sWord );
+		if ( !iWord )
+			return;
+
+		CSphKeywordInfo & tInfo = dKeywords.Add();
+		tInfo.m_sTokenized = (const char *)sTokenized;
+		tInfo.m_sNormalized = (const char*)sWord;
+		tInfo.m_iDocs = 0;
+		tInfo.m_iHits = 0;
+		tInfo.m_iQpos = iQpos;
+
+		if ( tInfo.m_sNormalized.cstr()[0]==MAGIC_WORD_HEAD_NONSTEMMED )
+			*(char *)tInfo.m_sNormalized.cstr() = '=';
+	}
+};
+
+
+bool CSphTokenizerIndex::GetKeywords ( CSphVector <CSphKeywordInfo> & dKeywords, const char * szQuery, bool, CSphString * ) const
 {
 	// short-cut if no query or keywords to fill
 	if ( !szQuery || !szQuery[0] )
@@ -1444,7 +1465,6 @@ bool CSphTokenizerIndex::GetKeywords ( CSphVector <CSphKeywordInfo> & dKeywords,
 	if ( m_tSettings.m_bIndexExactWords )
 		pTokenizer->AddPlainChar ( '=' );
 
-
 	CSphScopedPtr<CSphDict> tDictCloned ( NULL );
 	CSphDict * pDictBase = m_pDict;
 	if ( pDictBase->HasState() )
@@ -1458,32 +1478,16 @@ bool CSphTokenizerIndex::GetKeywords ( CSphVector <CSphKeywordInfo> & dKeywords,
 		pDict = new CSphDictExact ( pDict );
 
 	dKeywords.Resize ( 0 );
-	CSphString sTokenized;
-	BYTE * sWord;
 
-	CSphString sQbuf ( szQuery );
-	pTokenizer->SetBuffer ( (BYTE*)sQbuf.cstr(), strlen(szQuery) );
+	pTokenizer->SetBuffer ( (const BYTE*)szQuery, strlen(szQuery) );
 
-	while ( ( sWord = pTokenizer->GetToken() )!=NULL )
-	{
-		BYTE * sMultiform = pTokenizer->GetTokenizedMultiform();
-		if ( sMultiform )
-			sTokenized = (const char*)sMultiform;
-		else
-			sTokenized = (const char*)sWord;
+	CSphTemplateQueryFilter tAotFilter;
+	tAotFilter.m_pTokenizer = pTokenizer.Ptr();
+	tAotFilter.m_pDict = pDict;
+	tAotFilter.m_pSettings = &m_tSettings;
 
-		// result unused, however the stemmers applied
-		pDict->GetWordID ( sWord );
-
-		CSphKeywordInfo & tInfo = dKeywords.Add();
-		Swap ( tInfo.m_sTokenized, sTokenized );
-		tInfo.m_sNormalized = (const char*)sWord;
-		tInfo.m_iDocs = 0;
-		tInfo.m_iHits = 0;
+	tAotFilter.GetKeywords ( dKeywords );
 
-		if ( tInfo.m_sNormalized.cstr()[0]==MAGIC_WORD_HEAD_NONSTEMMED )
-			*(char *)tInfo.m_sNormalized.cstr() = '=';
-	}
 	return true;
 }
 
@@ -16308,21 +16312,7 @@ bool CSphIndex_VLN::LoadHeader ( const char * sHeaderName, bool bStripPath, CSph
 		SetupQueryTokenizer();
 
 		// initialize AOT if needed
-		CSphVector<CSphString> dMorphs;
-		sphSplit ( dMorphs, tDictSettings.m_sMorphology.cstr() );
-		m_tSettings.m_uAotFilterMask = 0;
-		for ( int j=0; j<AOT_LENGTH; ++j )
-		{
-			char buf_all[20];
-			sprintf ( buf_all, "lemmatize_%s_all", AOT_LANGUAGES[j] ); // NOLINT
-			ARRAY_FOREACH ( i, dMorphs )
-				if ( dMorphs[i]==buf_all )
-				{
-					m_tSettings.m_uAotFilterMask |= (1UL) << j;
-					break;
-				}
-		}
-
+		m_tSettings.m_uAotFilterMask = sphParseMorphAot ( tDictSettings.m_sMorphology.cstr() );
 	} else
 	{
 		if ( m_bId32to64 )
@@ -17556,6 +17546,158 @@ bool CSphIndex_VLN::GetKeywords ( CSphVector <CSphKeywordInfo> & dKeywords,
 }
 
 
+DWORD sphParseMorphAot ( const char * sMorphology )
+{
+	if ( !sMorphology || !*sMorphology )
+		return 0;
+
+	CSphVector<CSphString> dMorphs;
+	sphSplit ( dMorphs, sMorphology );
+
+	DWORD uAotFilterMask = 0;
+	for ( int j=0; j<AOT_LENGTH; ++j )
+	{
+		char buf_all[20];
+		sprintf ( buf_all, "lemmatize_%s_all", AOT_LANGUAGES[j] ); // NOLINT
+		ARRAY_FOREACH ( i, dMorphs )
+		{
+			if ( dMorphs[i]==buf_all )
+			{
+				uAotFilterMask |= (1UL) << j;
+				break;
+			}
+		}
+	}
+
+	return uAotFilterMask;
+}
+
+
+ISphQueryFilter::ISphQueryFilter ()
+{
+	m_pTokenizer = NULL;
+	m_pDict = NULL;
+	m_pSettings = NULL;
+}
+
+
+ISphQueryFilter::~ISphQueryFilter ()
+{
+}
+
+
+void ISphQueryFilter::GetKeywords ( CSphVector <CSphKeywordInfo> & dKeywords )
+{
+	assert ( m_pTokenizer && m_pDict && m_pSettings );
+
+	BYTE sTokenized[3*SPH_MAX_WORD_LEN+4];
+	BYTE * sWord;
+	int iQpos = 1;
+
+	// FIXME!!! got rid of duplicated term stat and qword setup
+	while ( ( sWord = m_pTokenizer->GetToken() )!=NULL )
+	{
+		const BYTE * sMultiform = m_pTokenizer->GetTokenizedMultiform();
+		strncpy ( (char *)sTokenized, sMultiform ? (const char*)sMultiform : (const char*)sWord, sizeof(sTokenized) );
+
+		AddKeywordStats ( sWord, sTokenized, iQpos, dKeywords );
+
+		// FIXME!!! handle consecutive blended wo blended parts
+		if ( !m_pTokenizer->TokenIsBlended() )
+			iQpos++;
+	}
+
+
+	if ( !m_pSettings->m_uAotFilterMask )
+		return;
+
+	XQLimitSpec_t tSpec;
+	BYTE sTmp[3*SPH_MAX_WORD_LEN+4];
+	CSphVector<XQNode_t *> dChildren ( 64 );
+
+	int iTokenizedTotal = dKeywords.GetLength();
+	for ( int iTokenized=0; iTokenized<iTokenizedTotal; iTokenized++ )
+	{
+		int iQpos = dKeywords[iTokenized].m_iQpos;
+		// MUST copy as Dict::GetWordID changes word and might add symbols
+		strncpy ( (char *)sTokenized, dKeywords[iTokenized].m_sTokenized.scstr(), sizeof(sTokenized) );
+		int iPreAotCount = dKeywords.GetLength();
+
+		XQNode_t tAotNode ( tSpec );
+		tAotNode.m_dWords.Resize ( 1 );
+		tAotNode.m_dWords.Begin()->m_sWord = (char *)sTokenized;
+		TransformAotFilter ( &tAotNode, m_pDict->GetWordforms(), *m_pSettings );
+
+		dChildren.Resize ( 0 );
+		dChildren.Add ( &tAotNode );
+
+		// recursion unfolded
+		ARRAY_FOREACH ( iChild, dChildren )
+		{
+			// process all words at node
+			ARRAY_FOREACH ( iAotKeyword, dChildren[iChild]->m_dWords )
+			{
+				// MUST copy as Dict::GetWordID changes word and might add symbols
+				strncpy ( (char *)sTmp, dChildren[iChild]->m_dWords[iAotKeyword].m_sWord.scstr(), sizeof(sTmp) );
+				AddKeywordStats ( sTmp, sTokenized, iQpos, dKeywords );
+			}
+
+			// push all child nodes at node to process list
+			const XQNode_t * pChild = dChildren[iChild];
+			ARRAY_FOREACH ( iRec, pChild->m_dChildren )
+				dChildren.Add ( pChild->m_dChildren[iRec] );
+		}
+
+		// remove (replace) original word in case of AOT taken place
+		if ( iPreAotCount!=dKeywords.GetLength() )
+		{
+			::Swap ( dKeywords[iTokenized], dKeywords.Last() );
+			dKeywords.Resize ( dKeywords.GetLength()-1 );
+		}
+	}
+
+	// sort by qpos
+	if ( dKeywords.GetLength()!=iTokenizedTotal )
+		sphSort ( dKeywords.Begin(), dKeywords.GetLength(), bind ( &CSphKeywordInfo::m_iQpos ) );
+}
+
+
+struct CSphPlainQueryFilter : public ISphQueryFilter
+{
+	const ISphQwordSetup *	m_pTermSetup;
+	ISphQword *				m_pQueryWord;
+	bool					m_bGetStats;
+
+	virtual void AddKeywordStats ( BYTE * sWord, const BYTE * sTokenized, int iQpos, CSphVector <CSphKeywordInfo> & dKeywords )
+	{
+		assert ( !m_bGetStats || ( m_pTermSetup && m_pQueryWord ) );
+
+		SphWordID_t iWord = m_pDict->GetWordID ( sWord );
+		if ( !iWord )
+			return;
+
+		if ( m_bGetStats )
+		{
+			m_pQueryWord->Reset ();
+			m_pQueryWord->m_sWord = (const char*)sWord;
+			m_pQueryWord->m_sDictWord = (const char*)sWord;
+			m_pQueryWord->m_uWordID = iWord;
+			m_pTermSetup->QwordSetup ( m_pQueryWord );
+		}
+
+		CSphKeywordInfo & tInfo = dKeywords.Add();
+		tInfo.m_sTokenized = (const char *)sTokenized;
+		tInfo.m_sNormalized = (const char*)sWord;
+		tInfo.m_iDocs = m_bGetStats ? m_pQueryWord->m_iDocs : 0;
+		tInfo.m_iHits = m_bGetStats ? m_pQueryWord->m_iHits : 0;
+		tInfo.m_iQpos = iQpos;
+
+		if ( tInfo.m_sNormalized.cstr()[0]==MAGIC_WORD_HEAD_NONSTEMMED )
+			*(char *)tInfo.m_sNormalized.cstr() = '=';
+	}
+};
+
+
 template < class Qword >
 bool CSphIndex_VLN::DoGetKeywords ( CSphVector <CSphKeywordInfo> & dKeywords,
 	const char * szQuery, bool bGetStats, bool bFillOnly, CSphString * pError ) const
@@ -17574,9 +17716,6 @@ bool CSphIndex_VLN::DoGetKeywords ( CSphVector <CSphKeywordInfo> & dKeywords,
 	if ( ( bFillOnly && !dKeywords.GetLength() ) || ( !bFillOnly && ( !szQuery || !szQuery[0] ) ) )
 		return true;
 
-	CSphScopedPtr <CSphAutofile> pDoclist ( NULL );
-	CSphScopedPtr <CSphAutofile> pHitlist ( NULL );
-
 	// TODO: in case of bFillOnly skip tokenizer cloning and setup
 	CSphScopedPtr<ISphTokenizer> pTokenizer ( m_pTokenizer->Clone ( SPH_CLONE_INDEX ) ); // avoid race
 	pTokenizer->EnableTokenizedMultiformTracking ();
@@ -17599,55 +17738,31 @@ bool CSphIndex_VLN::DoGetKeywords ( CSphVector <CSphKeywordInfo> & dKeywords,
 	CSphScopedPtr<CSphDict> tDict2 ( NULL );
 	pDict = SetupExactDict ( tDict2, pDict );
 
-	// FIXME!!! missed bigram, aot transform, FieldFilter
+	// FIXME!!! missed bigram, FieldFilter, add flags to fold blended parts, show expanded terms
 
 	// prepare for setup
 	CSphAutofile tDummy1, tDummy2;
 
-	DiskIndexQwordSetup_c tTermSetup ( tDummy1, tDummy2, m_pSkiplists.GetWritePtr(), NULL );
+	DiskIndexQwordSetup_c tTermSetup ( tDummy1, tDummy2, NULL, NULL );
 	tTermSetup.m_pDict = pDict;
 	tTermSetup.m_pIndex = this;
 	tTermSetup.m_eDocinfo = m_tSettings.m_eDocinfo;
 
-	Qword QueryWord ( false, false );
+	Qword tQueryWord ( false, false );
+
+	CSphPlainQueryFilter tAotFilter;
+	tAotFilter.m_pTokenizer = pTokenizer.Ptr();
+	tAotFilter.m_pDict = pDict;
+	tAotFilter.m_pSettings = &m_tSettings;
+	tAotFilter.m_bGetStats = bGetStats;
+	tAotFilter.m_pTermSetup = &tTermSetup;
+	tAotFilter.m_pQueryWord = &tQueryWord;
 
 	if ( !bFillOnly )
 	{
-		CSphString sTokenized;
-		BYTE * sWord;
 		pTokenizer->SetBuffer ( (const BYTE *)szQuery, strlen(szQuery) );
 
-		// FIXME!!! got rid of duplicated term stat and qword setup
-		while ( ( sWord = pTokenizer->GetToken() )!=NULL )
-		{
-			BYTE * sMultiform = pTokenizer->GetTokenizedMultiform();
-			if ( sMultiform )
-				sTokenized = (const char*)sMultiform;
-			else
-				sTokenized = (const char*)sWord;
-
-			SphWordID_t iWord = pDict->GetWordID ( sWord );
-			if ( iWord )
-			{
-				if ( bGetStats )
-				{
-					QueryWord.Reset ();
-					QueryWord.m_sWord = (const char*)sWord;
-					QueryWord.m_sDictWord = (const char*)sWord;
-					QueryWord.m_uWordID = iWord;
-					tTermSetup.QwordSetup ( &QueryWord );
-				}
-
-				CSphKeywordInfo & tInfo = dKeywords.Add();
-				Swap ( tInfo.m_sTokenized, sTokenized );
-				tInfo.m_sNormalized = (const char*)sWord;
-				tInfo.m_iDocs = bGetStats ? QueryWord.m_iDocs : 0;
-				tInfo.m_iHits = bGetStats ? QueryWord.m_iHits : 0;
-
-				if ( tInfo.m_sNormalized.cstr()[0]==MAGIC_WORD_HEAD_NONSTEMMED )
-					*(char *)tInfo.m_sNormalized.cstr() = '=';
-			}
-		}
+		tAotFilter.GetKeywords ( dKeywords );
 	} else
 	{
 		BYTE sWord[MAX_KEYWORD_BYTES];
@@ -17662,14 +17777,14 @@ bool CSphIndex_VLN::DoGetKeywords ( CSphVector <CSphKeywordInfo> & dKeywords,
 			SphWordID_t iWord = pDict->GetWordID ( sWord );
 			if ( iWord )
 			{
-				QueryWord.Reset ();
-				QueryWord.m_sWord = tInfo.m_sTokenized;
-				QueryWord.m_sDictWord = (const char*)sWord;
-				QueryWord.m_uWordID = iWord;
-				tTermSetup.QwordSetup ( &QueryWord );
+				tQueryWord.Reset ();
+				tQueryWord.m_sWord = tInfo.m_sTokenized;
+				tQueryWord.m_sDictWord = (const char*)sWord;
+				tQueryWord.m_uWordID = iWord;
+				tTermSetup.QwordSetup ( &tQueryWord );
 
-				tInfo.m_iDocs += QueryWord.m_iDocs;
-				tInfo.m_iHits += QueryWord.m_iHits;
+				tInfo.m_iDocs += tQueryWord.m_iDocs;
+				tInfo.m_iHits += tQueryWord.m_iHits;
 			}
 		}
 	}

+ 3 - 0
src/sphinx.h

@@ -2546,6 +2546,8 @@ struct CSphKeywordInfo
 	CSphString		m_sNormalized;
 	int				m_iDocs;
 	int				m_iHits;
+	int				m_iQpos;
+
 };
 
 inline void Swap ( CSphKeywordInfo & v1, CSphKeywordInfo & v2 )
@@ -2554,6 +2556,7 @@ inline void Swap ( CSphKeywordInfo & v1, CSphKeywordInfo & v2 )
 	v1.m_sNormalized.Swap ( v2.m_sNormalized );
 	::Swap ( v1.m_iDocs, v2.m_iDocs );
 	::Swap ( v1.m_iHits, v2.m_iHits );
+	::Swap ( v1.m_iQpos, v2.m_iQpos );
 }
 
 

+ 18 - 0
src/sphinxint.h

@@ -1490,6 +1490,24 @@ public:
 	virtual ISphTokenizer *			GetEmbeddedTokenizer () const					{ return m_pTokenizer; }
 };
 
+
+struct ISphQueryFilter
+{
+	ISphTokenizer *		m_pTokenizer;
+	CSphDict *					m_pDict;
+	const CSphIndexSettings *	m_pSettings;
+
+	ISphQueryFilter ();
+	virtual ~ISphQueryFilter ();
+
+	void GetKeywords ( CSphVector <CSphKeywordInfo> & dKeywords );
+	virtual void AddKeywordStats ( BYTE * sWord, const BYTE * sTokenized, int iQpos, CSphVector <CSphKeywordInfo> & dKeywords ) = 0;
+};
+
+
+DWORD sphParseMorphAot ( const char * );
+
+
 //////////////////////////////////////////////////////////////////////////
 // USER VARIABLES
 //////////////////////////////////////////////////////////////////////////

+ 63 - 30
src/sphinxrt.cpp

@@ -1145,6 +1145,7 @@ public:
 	bool						DoGetKeywords ( CSphVector <CSphKeywordInfo> & dKeywords, const char * szQuery, bool bGetStats, bool bFillOnly, CSphString * pError ) const;
 	virtual bool				GetKeywords ( CSphVector <CSphKeywordInfo> & dKeywords, const char * szQuery, bool bGetStats, CSphString * pError ) const;
 	virtual bool				FillKeywords ( CSphVector <CSphKeywordInfo> & dKeywords ) const;
+	void						AddKeywordStats ( BYTE * sWord, const BYTE * sTokenized, CSphDict * pDict, bool bGetStats, int iQpos, RtQword_t * pQueryWord, CSphVector <CSphKeywordInfo> & dKeywords ) const;
 
 	void						CopyDocinfo ( CSphMatch & tMatch, const DWORD * pFound ) const;
 	const CSphRowitem *			FindDocinfo ( const RtSegment_t * pSeg, SphDocID_t uDocID ) const;
@@ -6995,6 +6996,50 @@ bool RtIndex_t::MultiQueryEx ( int iQueries, const CSphQuery * ppQueries, CSphQu
 }
 
 
+void RtIndex_t::AddKeywordStats ( BYTE * sWord, const BYTE * sTokenized, CSphDict * pDict, bool bGetStats, int iQpos, RtQword_t * pQueryWord, CSphVector <CSphKeywordInfo> & dKeywords ) const
+{
+	assert ( !bGetStats || pQueryWord );
+
+	SphWordID_t iWord = pDict->GetWordID ( sWord );
+	if ( !iWord )
+		return;
+
+	if ( bGetStats )
+	{
+		pQueryWord->Reset();
+		pQueryWord->m_uWordID = iWord;
+		pQueryWord->m_sWord = (const char *)sTokenized;
+		pQueryWord->m_sDictWord = (const char *)sWord;
+		ARRAY_FOREACH ( iSeg, m_dSegments )
+			RtQwordSetupSegment ( pQueryWord, m_dSegments[iSeg], false, m_bKeywordDict, m_iWordsCheckpoint );
+	}
+
+	CSphKeywordInfo & tInfo = dKeywords.Add();
+	tInfo.m_sTokenized = (const char *)sTokenized;
+	tInfo.m_sNormalized = (const char*)sWord;
+	tInfo.m_iDocs = bGetStats ? pQueryWord->m_iDocs : 0;
+	tInfo.m_iHits = bGetStats ? pQueryWord->m_iHits : 0;
+	tInfo.m_iQpos = iQpos;
+
+	if ( tInfo.m_sNormalized.cstr()[0]==MAGIC_WORD_HEAD_NONSTEMMED )
+		*(char *)tInfo.m_sNormalized.cstr() = '=';
+}
+
+
+struct CSphRtQueryFilter : public ISphQueryFilter
+{
+	const RtIndex_t *	m_pIndex;
+	RtQword_t *			m_pQword;
+	bool				m_bGetStats;
+
+	virtual void AddKeywordStats ( BYTE * sWord, const BYTE * sTokenized, int iQpos, CSphVector <CSphKeywordInfo> & dKeywords )
+	{
+		assert ( m_pIndex && m_pQword );
+		m_pIndex->AddKeywordStats ( sWord, sTokenized, m_pDict, m_bGetStats, iQpos, m_pQword, dKeywords );
+	}
+};
+
+
 bool RtIndex_t::DoGetKeywords ( CSphVector<CSphKeywordInfo> & dKeywords, const char * sQuery, bool bGetStats, bool bFillOnly, CSphString * pError ) const
 {
 	if ( !bFillOnly )
@@ -7006,6 +7051,14 @@ bool RtIndex_t::DoGetKeywords ( CSphVector<CSphKeywordInfo> & dKeywords, const c
 	RtQword_t tQword;
 
 	CSphScopedPtr<ISphTokenizer> pTokenizer ( m_pTokenizer->Clone ( SPH_CLONE_INDEX ) ); // avoid race
+	pTokenizer->EnableTokenizedMultiformTracking ();
+
+	// need to support '*' and '=' but not the other specials
+	// so m_pQueryTokenizer does not work for us, gotta clone and setup one manually
+	if ( IsStarDict() )
+		pTokenizer->AddPlainChar ( '*' );
+	if ( m_tSettings.m_bIndexExactWords )
+		pTokenizer->AddPlainChar ( '=' );
 
 	CSphScopedPtr<CSphDict> tDictCloned ( NULL );
 	CSphDict * pDictBase = m_pDict;
@@ -7020,40 +7073,20 @@ bool RtIndex_t::DoGetKeywords ( CSphVector<CSphKeywordInfo> & dKeywords, const c
 	CSphScopedPtr<CSphDict> tDict2 ( NULL );
 	pDict = SetupExactDict ( tDict2, pDict, pTokenizer.Ptr() );
 
+	// FIXME!!! missed bigram, FieldFilter
+	CSphRtQueryFilter tAotFilter;
+	tAotFilter.m_pTokenizer = pTokenizer.Ptr();
+	tAotFilter.m_pDict = pDict;
+	tAotFilter.m_pSettings = &m_tSettings;
+	tAotFilter.m_bGetStats = bGetStats;
+	tAotFilter.m_pIndex = this;
+	tAotFilter.m_pQword = &tQword;
+
 	if ( !bFillOnly )
 	{
 		pTokenizer->SetBuffer ( (BYTE *)sQuery, strlen ( sQuery ) );
 
-		while ( BYTE * pToken = pTokenizer->GetToken() )
-		{
-			// keep tokenized form
-			CSphString sTokenized = ( const char *)pToken;
-			SphWordID_t iWord = pDict->GetWordID ( pToken );
-			if ( iWord )
-			{
-				CSphKeywordInfo & tInfo = dKeywords.Add();
-				Swap ( tInfo.m_sTokenized, sTokenized );
-				tInfo.m_sNormalized = (const char *)pToken;
-				tInfo.m_iDocs = 0;
-				tInfo.m_iHits = 0;
-
-				if ( tInfo.m_sNormalized.cstr()[0]==MAGIC_WORD_HEAD_NONSTEMMED )
-					*(char *)tInfo.m_sNormalized.cstr() = '=';
-
-				if ( !bGetStats )
-					continue;
-
-				tQword.Reset();
-				tQword.m_uWordID = iWord;
-				tQword.m_sWord = tInfo.m_sTokenized;
-				tQword.m_sDictWord = tInfo.m_sNormalized;
-				ARRAY_FOREACH ( iSeg, m_dSegments )
-					RtQwordSetupSegment ( &tQword, m_dSegments[iSeg], false, m_bKeywordDict, m_iWordsCheckpoint );
-
-				tInfo.m_iDocs = tQword.m_iDocs;
-				tInfo.m_iHits = tQword.m_iHits;
-			}
-		}
+		tAotFilter.GetKeywords ( dKeywords );
 	} else
 	{
 		BYTE sWord[SPH_MAX_KEYWORD_LEN];

+ 3 - 0
src/sphinxutils.cpp

@@ -1480,6 +1480,9 @@ bool sphFixupIndexSettings ( CSphIndex * pIndex, const CSphConfigSection & hInde
 		{
 			sphConfDictionary ( hIndex, tSettings );
 			pDict = sphCreateDictionaryTemplate ( tSettings, NULL, pIndex->GetTokenizer (), pIndex->GetName(), sError );
+			CSphIndexSettings tIndexSettings = pIndex->GetSettings();
+			tIndexSettings.m_uAotFilterMask = sphParseMorphAot ( tSettings.m_sMorphology.cstr() );
+			pIndex->Setup ( tIndexSettings );
 		} else
 		{
 			if ( pIndex->m_bId32to64 )

A különbségek nem kerülnek megjelenítésre, a fájl túl nagy
+ 0 - 0
test/test_038/model.bin


A különbségek nem kerülnek megjelenítésre, a fájl túl nagy
+ 0 - 0
test/test_054/model.bin


A különbségek nem kerülnek megjelenítésre, a fájl túl nagy
+ 0 - 0
test/test_063/model.bin


A különbségek nem kerülnek megjelenítésre, a fájl túl nagy
+ 0 - 0
test/test_098/model.bin


A különbségek nem kerülnek megjelenítésre, a fájl túl nagy
+ 0 - 0
test/test_192/model.bin


A különbségek nem kerülnek megjelenítésre, a fájl túl nagy
+ 0 - 0
test/test_222/model.bin


+ 53 - 0
test/test_222/test.xml

@@ -105,6 +105,41 @@ index testple
 	min_infix_len	= 2
 }
 
+source src_plain_ckw
+{
+	type			= mysql
+	<sql_settings/>
+	sql_query_pre	= set names utf8
+	sql_query		= select 1 as id, 11 as idd, 'фичин баг, но не фичина бага' as title
+	sql_attr_uint	= idd
+}
+
+index plain_ckw
+{
+	source			= src_plain_ckw
+	dict			= keywords
+	path			= <data_path/>/plain_ckw
+	morphology		= lemmatize_en_all, lemmatize_ru_all
+}
+
+index rt_ckw
+{
+    type            = rt
+    dict            = keywords
+    path            = <data_path/>/rt_ckw
+    rt_attr_uint    = idd
+    rt_field        = title
+	morphology		= lemmatize_en_all, lemmatize_ru_all
+}
+
+
+index tmpl_ckw
+{
+	type			= template
+	dict			= keywords
+	morphology		= lemmatize_en_all, lemmatize_ru_all
+}
+
 </config>
 
 <db_create>
@@ -154,6 +189,24 @@ CREATE TABLE test_table
 <sphinxql>SELECT id,gid,weight() as my_weight FROM META:plain WHERE MATCH('"Фичин баг"') option ranker=sph04;show meta</sphinxql>
 <sphinxql>SELECT GROUPBY() FROM testw</sphinxql>
 
+<!--regression lemmas at call keywords output -->
+<sphinxql>INSERT INTO rt_ckw VALUES ( 1, 'фичин баг, но не фичина бага', 11 )</sphinxql>
+<sphinxql>CALL KEYWORDS ( 'фичин баг', 'plain_ckw', 1 )</sphinxql>
+<sphinxql>CALL KEYWORDS ( 'фичин баг', 'rt_ckw', 1 )</sphinxql>
+<sphinxql>CALL KEYWORDS ( 'фичин баг', 'tmpl_ckw', 1 )</sphinxql>
+<sphinxql>CALL KEYWORDS ( 'фичина бага', 'plain_ckw', 1 )</sphinxql>
+<sphinxql>CALL KEYWORDS ( 'фичина бага', 'rt_ckw', 1 )</sphinxql>
+<sphinxql>CALL KEYWORDS ( 'фичина бага', 'tmpl_ckw', 1 )</sphinxql>
+<sphinxql>CALL KEYWORDS ( 'сталин жил', 'plain_ckw', 1 )</sphinxql>
+<sphinxql>CALL KEYWORDS ( 'сталин жил', 'rt_ckw', 1 )</sphinxql>
+<sphinxql>CALL KEYWORDS ( 'сталин жил', 'tmpl_ckw', 1 )</sphinxql>
+<sphinxql>CALL KEYWORDS ( 'DovE', 'plain_ckw', 1 )</sphinxql>
+<sphinxql>CALL KEYWORDS ( 'DovE', 'rt_ckw', 1 )</sphinxql>
+<sphinxql>CALL KEYWORDS ( 'DovE', 'tmpl_ckw', 1 )</sphinxql>
+<sphinxql>CALL KEYWORDS ( 'records recorded recording men man mans mens', 'plain_ckw', 1 )</sphinxql>
+<sphinxql>CALL KEYWORDS ( 'records recorded recording men man mans mens', 'rt_ckw', 1 )</sphinxql>
+<sphinxql>CALL KEYWORDS ( 'records recorded recording men man mans mens', 'tmpl_ckw', 1 )</sphinxql>
+
 </sphqueries>
 
 </test>

A különbségek nem kerülnek megjelenítésre, a fájl túl nagy
+ 0 - 0
test/test_223/model.bin


Nem az összes módosított fájl került megjelenítésre, mert túl sok fájl változott