Przeglądaj źródła

merged rel21 branch (up to r4487) back into trunk

git-svn-id: svn://svn.sphinxsearch.com/sphinx/trunk@4488 406a0c4d-033a-0410-8de8-e80135713968
tomat 12 lat temu
rodzic
commit
f2fe5217af
6 zmienionych plików z 125 dodań i 6 usunięć
  1. 11 3
      src/sphinx.cpp
  2. 2 1
      src/sphinxint.h
  3. 5 1
      src/sphinxrt.cpp
  4. 0 0
      test/test_012/model.bin
  5. 0 0
      test/test_041/model.bin
  6. 107 1
      test/test_041/test.xml

+ 11 - 3
src/sphinx.cpp

@@ -18018,7 +18018,7 @@ XQNode_t * sphExpandXQNode ( XQNode_t * pNode, ExpansionContext_t & tCtx )
 	if ( !iWilds || iWilds==iLen )
 		return pNode;
 
-	ISphWordlist::Args_t tWordlist ( tCtx.m_bMergeSingles, tCtx.m_iExpansionLimit );
+	ISphWordlist::Args_t tWordlist ( tCtx.m_bMergeSingles, tCtx.m_iExpansionLimit, tCtx.m_bHasMorphology );
 
 	if ( !sphIsWild(*sFull) || tCtx.m_iMinInfixLen==0 )
 	{
@@ -30846,9 +30846,10 @@ const BYTE * CWordlist::AcquireDict ( const CSphWordlistCheckpoint * pCheckpoint
 }
 
 
-ISphWordlist::Args_t::Args_t ( bool bPayload, int iExpansionLimit )
+ISphWordlist::Args_t::Args_t ( bool bPayload, int iExpansionLimit, bool bHasMorphology )
 	: m_bPayload ( bPayload )
 	, m_iExpansionLimit ( iExpansionLimit )
+	, m_bHasMorphology ( bHasMorphology )
 {
 	m_sBuf.Reserve ( 2048 * SPH_MAX_WORD_LEN * 3 );
 	m_dExpanded.Reserve ( 2048 );
@@ -31163,6 +31164,7 @@ void CWordlist::GetInfixedWords ( const char * sSubstring, int iSubLen, const ch
 		return;
 
 	DictEntryDiskPayload_t tDict2Payload ( tArgs.m_bPayload );
+	const int iSkipMagic = ( tArgs.m_bHasMorphology ? 1 : 0 ); // whether to skip leading magic chars in the prefix, like the NONSTEMMED marker
 
 	// walk those checkpoints, check all their words
 	ARRAY_FOREACH ( i, dPoints )
@@ -31170,8 +31172,14 @@ void CWordlist::GetInfixedWords ( const char * sSubstring, int iSubLen, const ch
 		// OPTIMIZE? add a quicker path than a generic wildcard for "*infix*" case?
 		KeywordsBlockReader_c tDictReader ( m_pBuf.GetWritePtr() + m_dCheckpoints[dPoints[i]-1].m_iWordlistOffset, m_bHaveSkips );
 		while ( tDictReader.UnpackWord() )
-			if ( sphWildcardMatch ( (const char *)tDictReader.m_sKeyword, sWildcard ) )
+		{
+			// stemmed terms should not match suffixes
+			if ( tArgs.m_bHasMorphology && *tDictReader.m_sKeyword!=MAGIC_WORD_HEAD_NONSTEMMED )
+				continue;
+
+			if ( sphWildcardMatch ( (const char *)tDictReader.m_sKeyword+iSkipMagic, sWildcard ) )
 				tDict2Payload.Add ( tDictReader, tDictReader.GetWordLen() );
+		}
 	}
 
 	tDict2Payload.Convert ( tArgs );

+ 2 - 1
src/sphinxint.h

@@ -1704,12 +1704,13 @@ public:
 		CSphVector<SphExpanded_t>	m_dExpanded;
 		const bool					m_bPayload;
 		int							m_iExpansionLimit;
+		const bool					m_bHasMorphology;
 
 		ISphSubstringPayload *		m_pPayload;
 		int							m_iTotalDocs;
 		int							m_iTotalHits;
 
-		Args_t ( bool bPayload, int iExpansionLimit );
+		Args_t ( bool bPayload, int iExpansionLimit, bool bHasMorphology );
 		~Args_t ();
 		void AddExpanded ( const BYTE * sWord, int iLen, int iDocs, int iHits );
 		const char * GetWordExpanded ( int iIndex ) const;

+ 5 - 1
src/sphinxrt.cpp

@@ -6046,6 +6046,7 @@ void RtIndex_t::GetInfixedWords ( const char * sSubstring, int iSubLen, const ch
 
 	// find those prefixes
 	CSphVector<int> dPoints;
+	const int iSkipMagic = ( tArgs.m_bHasMorphology ? 1 : 0 ); // whether to skip leading magic chars in the prefix, like the NONSTEMMED marker
 
 	DictEntryRtPayload_t tDict2Payload ( tArgs.m_bPayload, m_dSegments.GetLength() );
 	ARRAY_FOREACH ( iSeg, m_dSegments )
@@ -6072,8 +6073,11 @@ void RtIndex_t::GetInfixedWords ( const char * sSubstring, int iSubLen, const ch
 			const RtWord_t * pWord = NULL;
 			while ( ( pWord = tReader.UnzipWord() )!=NULL )
 			{
+				if ( tArgs.m_bHasMorphology && pWord->m_sWord[1]!=MAGIC_WORD_HEAD_NONSTEMMED )
+					continue;
+
 				// check it
-				if ( !sphWildcardMatch ( (const char*)pWord->m_sWord+1, sWildcard ) )
+				if ( !sphWildcardMatch ( (const char*)pWord->m_sWord+1+iSkipMagic, sWildcard ) )
 					continue;
 
 				// matched, lets add

Plik diff jest za duży
+ 0 - 0
test/test_012/model.bin


Plik diff jest za duży
+ 0 - 0
test/test_041/model.bin


+ 107 - 1
test/test_041/test.xml

@@ -59,7 +59,6 @@ index rt_fix
 	morphology = stem_en
 }
 
-
 index wf
 {
 	source			= srctest
@@ -69,6 +68,60 @@ index wf
 	morphology = none
 	wordforms		= <this_test/>/wordforms.txt
 }
+
+source src_stem : srctest
+{
+	sql_query		= SELECT 1 id, 11 idd, 'busy' title UNION SELECT 2 id, 11 idd, 'busi' title UNION SELECT 3 id, 11 idd, 'feet' title UNION SELECT 4 id, 11 idd, 'foot' title UNION SELECT 5 id, 11 idd, 'fbusy' title UNION SELECT 6 id, 11 idd, 'fbusi' title
+	sql_attr_uint		= idd
+}
+
+index kw_p
+{
+	source			= src_stem
+	path			= <data_path/>/kw_p
+	charset_table = 0..9, a..z, A..Z->a..z
+	dict = keywords
+	min_prefix_len = 3
+	morphology		= stem_en
+	index_exact_words = 1
+}
+
+index kw_i
+{
+	source			= src_stem
+	path			= <data_path/>/kw_i
+	charset_table = 0..9, a..z, A..Z->a..z
+	dict = keywords
+	min_infix_len = 3
+	morphology		= stem_en
+	index_exact_words = 1
+}
+
+index rt_p
+{
+	type = rt
+	path			= <data_path/>/rt_p
+	rt_mem_limit = 128k
+	rt_field                = title
+	rt_attr_uint        = idd
+	dict = keywords
+	min_prefix_len = 3
+	morphology		= stem_en
+	index_exact_words = 1
+}
+
+index rt_i
+{
+	type = rt
+	path			= <data_path/>/rt_i
+	rt_mem_limit = 128k
+	rt_field                = title
+	rt_attr_uint        = idd
+	dict = keywords
+	min_infix_len = 3
+	morphology		= stem_en
+	index_exact_words = 1
+}
 </config>
 
 <queries>
@@ -131,6 +184,59 @@ REPLACE INTO rt (id, id1, title) VALUES ( 1, 1, 'work worked working workings wo
 <sphinxql>show meta</sphinxql>
 <sphinxql>SELECT * FROM rt_fix WHERE MATCH('ho*')</sphinxql>
 <sphinxql>show meta</sphinxql>
+
+<!--regression substring search vs stemmed terms -->
+<sphinxql>REPLACE INTO rt_p (id, idd, title) VALUES (1,11,'busy'), (2,11,'busi'), (3,11,'feet'), (4,11,'foot'), (5,11,'fbusy'), (6,11,'fbusi')</sphinxql>
+<sphinxql>REPLACE INTO rt_i (id, idd, title) VALUES (1,11,'busy'), (2,11,'busi'), (3,11,'feet'), (4,11,'foot'), (5,11,'fbusy'), (6,11,'fbusi')</sphinxql>
+<sphinxql>SELECT * FROM kw_p WHERE MATCH('busy*')</sphinxql>
+<sphinxql>show meta</sphinxql>
+<sphinxql>SELECT * FROM kw_i WHERE MATCH('busy*')</sphinxql>
+<sphinxql>show meta</sphinxql>
+<sphinxql>SELECT * FROM rt_p WHERE MATCH('busy*')</sphinxql>
+<sphinxql>show meta</sphinxql>
+<sphinxql>SELECT * FROM rt_i WHERE MATCH('busy*')</sphinxql>
+<sphinxql>show meta</sphinxql>
+<sphinxql>SELECT * FROM kw_p WHERE MATCH('busi*')</sphinxql>
+<sphinxql>show meta</sphinxql>
+<sphinxql>SELECT * FROM kw_i WHERE MATCH('busi*')</sphinxql>
+<sphinxql>show meta</sphinxql>
+<sphinxql>SELECT * FROM rt_p WHERE MATCH('busi*')</sphinxql>
+<sphinxql>show meta</sphinxql>
+<sphinxql>SELECT * FROM rt_i WHERE MATCH('busi*')</sphinxql>
+<sphinxql>show meta</sphinxql>
+<sphinxql>SELECT * FROM kw_i WHERE MATCH('*busy*')</sphinxql>
+<sphinxql>show meta</sphinxql>
+<sphinxql>SELECT * FROM rt_i WHERE MATCH('*busy*')</sphinxql>
+<sphinxql>show meta</sphinxql>
+<sphinxql>SELECT * FROM kw_i WHERE MATCH('*busi*')</sphinxql>
+<sphinxql>show meta</sphinxql>
+<sphinxql>SELECT * FROM rt_i WHERE MATCH('*busi*')</sphinxql>
+<sphinxql>show meta</sphinxql>
+<sphinxql>SELECT * FROM kw_p WHERE MATCH('foot*')</sphinxql>
+<sphinxql>show meta</sphinxql>
+<sphinxql>SELECT * FROM kw_i WHERE MATCH('foot*')</sphinxql>
+<sphinxql>show meta</sphinxql>
+<sphinxql>SELECT * FROM rt_p WHERE MATCH('foot*')</sphinxql>
+<sphinxql>show meta</sphinxql>
+<sphinxql>SELECT * FROM rt_i WHERE MATCH('foot*')</sphinxql>
+<sphinxql>show meta</sphinxql>
+<sphinxql>SELECT * FROM kw_p WHERE MATCH('feet*')</sphinxql>
+<sphinxql>show meta</sphinxql>
+<sphinxql>SELECT * FROM kw_i WHERE MATCH('feet*')</sphinxql>
+<sphinxql>show meta</sphinxql>
+<sphinxql>SELECT * FROM rt_p WHERE MATCH('feet*')</sphinxql>
+<sphinxql>show meta</sphinxql>
+<sphinxql>SELECT * FROM rt_i WHERE MATCH('feet*')</sphinxql>
+<sphinxql>show meta</sphinxql>
+<sphinxql>SELECT * FROM kw_i WHERE MATCH('*foot*')</sphinxql>
+<sphinxql>show meta</sphinxql>
+<sphinxql>SELECT * FROM rt_i WHERE MATCH('*foot*')</sphinxql>
+<sphinxql>show meta</sphinxql>
+<sphinxql>SELECT * FROM kw_i WHERE MATCH('*feet*')</sphinxql>
+<sphinxql>show meta</sphinxql>
+<sphinxql>SELECT * FROM rt_i WHERE MATCH('*feet*')</sphinxql>
+<sphinxql>show meta</sphinxql>
+
 </sphqueries>
 
 <db_create>

Niektóre pliki nie zostały wyświetlone z powodu dużej ilości zmienionych plików