Browse Source

fixed #1786 blended parts that are stopwords when stopword step is 0; set UDF version to 10, added blended-related functions; added regression to test 63

Stas Klinov 5 years ago
parent
commit
a4aaa0ccd6

+ 19 - 4
src/sphinx.cpp

@@ -4102,6 +4102,7 @@ public:
 		{
 			m_iPosDelta = 1; // default delta is 1
 			BYTE * pTok = (BYTE*) m_pFilter->m_fnGetExtraToken ( m_pUserdata, &m_iPosDelta );
+			GetBlended();
 			if ( pTok )
 				return pTok;
 			m_bGotExtra = false;
@@ -4119,11 +4120,17 @@ public:
 				m_bGotExtra = 0;
 				if ( m_pFilter->m_fnEndField )
 					if ( !m_pFilter->m_fnEndField ( m_pUserdata ) )
+					{
+						m_bBlended = false;
+						m_bBlendedPart = false;
 						return NULL;
+					}
 
 				// got them, start fetching
 				m_bGotExtra = true;
-				return (BYTE*)m_pFilter->m_fnGetExtraToken ( m_pUserdata, &m_iPosDelta );
+				BYTE * pTok = (BYTE*)m_pFilter->m_fnGetExtraToken ( m_pUserdata, &m_iPosDelta );
+				GetBlended();
+				return pTok;
 			}
 
 			// compute proper position delta
@@ -4134,6 +4141,7 @@ public:
 			int iExtra = 0;
 			BYTE * pTok = (BYTE*)m_pFilter->m_fnPushToken ( m_pUserdata, (char*)pRaw, &iExtra, &m_iPosDelta );
 			m_bGotExtra = ( iExtra!=0 );
+			GetBlended();
 			if ( pTok )
 				return pTok;
 		}
@@ -4144,9 +4152,13 @@ public:
 		return m_iPosDelta-1;
 	}
 
-	bool TokenIsBlended() const final
+private:
+	void GetBlended()
 	{
-		return false;
+		if ( m_pFilter->m_fnTokenIsBlended )
+			m_bBlended = ( !!m_pFilter->m_fnTokenIsBlended ( m_pUserdata ) );
+		if ( m_pFilter->m_fnTokenIsBlendedPart )
+			m_bBlendedPart = ( !!m_pFilter->m_fnTokenIsBlendedPart ( m_pUserdata ) );
 	}
 };
 
@@ -22717,7 +22729,10 @@ void CSphSource_Document::BuildRegularHits ( RowID_t tRowID, bool bPayload, bool
 			if ( m_bIndexExactWords && eMorph!=SPH_TOKEN_MORPH_GUESS )
 				m_tHits.Add ( { tRowID, m_pDict->GetWordIDNonStemmed ( sBuf ), m_tState.m_iHitPos } );
 		} else
-			m_tState.m_iBuildLastStep = m_iStopwordStep;
+		{
+			// need to count all blended part tokens to match query
+			m_tState.m_iBuildLastStep = ( m_pTokenizer->TokenIsBlendedPart() ? 1 : m_iStopwordStep );
+		}
 	}
 
 	m_tState.m_bProcessingHits = ( sWord!=NULL );

+ 2 - 0
src/sphinxplugin.cpp

@@ -278,6 +278,8 @@ static SymbolDesc_t g_dSymbolsTokenFilter[] =
 	{ static_cast<int>( offsetof(PluginTokenFilter_c, m_fnGetExtraToken)),	"get_extra_token",	false },
 	{ static_cast<int>( offsetof(PluginTokenFilter_c, m_fnEndField)),		"end_field",		false },
 	{ static_cast<int>( offsetof(PluginTokenFilter_c, m_fnDeinit)),		"deinit",			false },
+	{ static_cast<int>( offsetof(PluginTokenFilter_c, m_fnTokenIsBlended)),		"is_blended", false },
+	{ static_cast<int>( offsetof(PluginTokenFilter_c, m_fnTokenIsBlendedPart)),	"is_blended_part", false },
 	{ -1, nullptr, false }
 };
 

+ 6 - 0
src/sphinxplugin.h

@@ -41,6 +41,9 @@ typedef char *			(*TokenFilterPushToken_fn)		( void * userdata, char * token, in
 typedef char *			(*TokenFilterGetExtraToken_fn)	( void * userdata, int * delta );
 typedef int				(*TokenFilterEndField_fn)		( void * userdata );
 typedef void			(*TokenFilterDeinit_fn)			( void * userdata );
+typedef	int				(*TokenFilterIsBlended_fn)		( void * userdata );
+typedef	int				(*TokenFilterIsBlendedPart_fn)	( void * userdata );
+
 
 typedef int				(*QueryTokenFilterInit_fn)		( void ** userdata, int max_len, const char * options, char * error );
 typedef void			(*QueryTokenFilterPreMorph_fn)	( void * userdata, char * token, int * stopword );
@@ -124,6 +127,9 @@ public:
 	TokenFilterEndField_fn		m_fnEndField = nullptr;
 	TokenFilterDeinit_fn		m_fnDeinit = nullptr;
 
+	TokenFilterIsBlended_fn		m_fnTokenIsBlended = nullptr;
+	TokenFilterIsBlendedPart_fn m_fnTokenIsBlendedPart = nullptr;
+
 	explicit					PluginTokenFilter_c ( PluginLib_c * pLib ) : PluginDesc_c ( pLib ) {}
 };
 

+ 3 - 1
src/sphinxquery.cpp

@@ -1264,7 +1264,9 @@ int XQParser_t::GetToken ( YYSTYPE * lvalp )
 			return 0;
 
 		m_iPendingNulls = m_pTokenizer->GetOvershortCount() * m_iOvershortStep;
-		m_iAtomPos += 1 + m_iPendingNulls + iPrevDeltaPos;
+		m_iAtomPos += 1 + m_iPendingNulls;
+		if ( iPrevDeltaPos>1 ) // to match with condition of m_bWasBlended above
+			m_iAtomPos += ( iPrevDeltaPos - 1);
 
 		bool bMultiDestHead = false;
 		bool bMultiDest = false;

+ 1 - 1
src/sphinxudf.h

@@ -29,7 +29,7 @@ extern "C" {
 #endif
 
 /// current udf version
-#define SPH_UDF_VERSION 9
+#define SPH_UDF_VERSION 10
 
 /// error buffer size
 #define SPH_UDF_ERROR_LEN 256

File diff suppressed because it is too large
+ 0 - 0
test/test_063/model.bin


+ 4 - 0
test/test_063/stop_blend.txt

@@ -0,0 +1,4 @@
+and
+a
+s
+t

+ 26 - 0
test/test_063/test.xml

@@ -120,6 +120,24 @@ index test4
 	blend_chars = U+2E
 }
 
+
+
+source blend_stop
+{
+	type			= mysql
+	<sql_settings/>
+	sql_query		= SELECT 1 as document_id, ' Children@s Hospital Colorado ' as text UNION SELECT 2 as document_id, ' got@s@but@not@t ok to go with a flow ' as text
+}
+
+index blend_stop
+{
+	source	= blend_stop
+	path	= <data_path/>/blend_stop
+	blend_chars = U+0023, U+0024, U+0026, U+0027, U+002D, U+0040
+    stopword_step = 0
+    stopwords     = <this_test/>/stop_blend.txt 
+}
+
 </config>
 
 <db_create>
@@ -255,6 +273,14 @@ INSERT INTO test_table VALUES
 	<sphinxql>CALL KEYWORDS ( 'thatsgoingverylongwordthathasblendedpartattheand', 'rt', 1 )</sphinxql>
 	<!-- regression blended vs quorum argument -->
 	<sphinxql>select * from test4 where match ( ' "dog of friend"/0.5 ' )</sphinxql>
+    
+    <!-- regression blended part that is stopword and stopword step is 0 -->
+    <sphinxql>set profiling=1</sphinxql>
+    <sphinxql>select * from blend_stop where match('("Children@s Hospital Colorado")')</sphinxql>
+    <sphinxql>show plan</sphinxql>
+    <sphinxql>select * from blend_stop where match('("got@s@but@not@t ok")')</sphinxql>
+    <sphinxql>show plan</sphinxql>
+    
 </sphqueries>
 
 </test>

Some files were not shown because too many files changed in this diff