Jelajahi Sumber

- php api: EscapeString
- escaped characters in tokenizer

git-svn-id: svn://svn.sphinxsearch.com/sphinx/trunk@1163 406a0c4d-033a-0410-8de8-e80135713968

glook 18 tahun lalu
induk
melakukan
3855b6317c
5 mengubah file dengan 178 tambahan dan 35 penghapusan
  1. 8 0
      api/sphinxapi.php
  2. 114 9
      src/sphinx.cpp
  3. 6 1
      src/sphinx.h
  4. 5 22
      src/sphinxquery.cpp
  5. 45 3
      src/tests.cpp

+ 8 - 0
api/sphinxapi.php

@@ -1088,6 +1088,14 @@ class SphinxClient
 		return $res;
 	}
 
+	function EscapeString ( $string ) // returns $string with query operators backslash-escaped so they are matched literally
+	{
+		// '\\' must be the first pair: str_replace applies pairs in order, so escaping it later would double the backslashes added for the other characters
+		$from = array ( '\\','(',')','|','-','!','@','~','"','&' );
+		$to   = array ( '\\\\','\\(','\\)','\\|','\\-','\\!','\\@','\\~','\\"','\\&' );
+		return str_replace ( $from, $to, $string );
+	}
+
 	/////////////////////////////////////////////////////////////////////////////
 	// attribute updates
 	/////////////////////////////////////////////////////////////////////////////

+ 114 - 9
src/sphinx.cpp

@@ -2135,7 +2135,7 @@ public:
 
 	virtual void				SetBuffer ( BYTE * sBuffer, int iLength );
 	virtual BYTE *				GetToken ();
-	virtual ISphTokenizer *		Clone () const;
+	virtual ISphTokenizer *		Clone ( bool bEscaped ) const;
 	virtual bool				IsUtf8 () const { return false; }
 	virtual int					GetCodepointLength ( int ) const { return 1; }
 };
@@ -2149,7 +2149,7 @@ public:
 
 	virtual void				SetBuffer ( BYTE * sBuffer, int iLength );
 	virtual BYTE *				GetToken ();
-	virtual ISphTokenizer *		Clone () const;
+	virtual ISphTokenizer *		Clone ( bool bEscaped ) const;
 	virtual bool				IsUtf8 () const { return true; }
 	virtual int					GetCodepointLength ( int iCode ) const;
 
@@ -2713,6 +2713,8 @@ ISphTokenizer::ISphTokenizer ()
 	, m_iLastTokenLen ( 0 )
 	, m_bTokenBoundary ( false )
 	, m_bBoundary ( false )
+	, m_bWasSpecial ( false )
+	, m_bEscaped ( false )
 {}
 
 
@@ -3089,6 +3091,8 @@ BYTE * CSphTokenizerTraits<IS_UTF8>::GetTokenSyn ()
 {
 	assert ( m_dSynonyms.GetLength() );
 
+	bool bEscaped = m_bEscaped;
+
 	m_bTokenBoundary = false;
 	for ( ;; )
 	{
@@ -3102,6 +3106,7 @@ BYTE * CSphTokenizerTraits<IS_UTF8>::GetTokenSyn ()
 		int iSynEnd = m_dSynonyms.GetLength()-1;
 		int iSynOff = 0;
 
+		int iLastCodepoint = 0;
 		int iLastFolded = 0;
 		BYTE * pRescan = NULL;
 
@@ -3139,11 +3144,30 @@ BYTE * CSphTokenizerTraits<IS_UTF8>::GetTokenSyn ()
 			// skip continuous whitespace
 			if ( iLastFolded==0 && iFolded==0 )
 				continue;
+
+			if ( bEscaped )
+			{
+				if ( iCode == '\\' && iLastCodepoint != '\\' )
+				{
+					iLastCodepoint = iCode;
+					continue;
+				}
+				else
+				{
+					if ( iLastCodepoint == '\\' )
+						iFolded &= ~( FLAG_CODEPOINT_SPECIAL | FLAG_CODEPOINT_DUAL );
+				}
+
+				iLastCodepoint = iCode;
+			}
+
 			iLastFolded = iFolded;
 
 			// handle specials at the very word start
 			if ( ( iFolded & FLAG_CODEPOINT_SPECIAL ) && m_iAccum==0 )
 			{
+				m_bWasSpecial = true;
+
 				AccumCodepoint ( iFolded & MASK_CODEPOINT );
 				*m_pAccum = '\0';
 
@@ -3331,7 +3355,7 @@ BYTE * CSphTokenizerTraits<IS_UTF8>::GetTokenSyn ()
 		{
 			m_pCur = pRescan;
 			continue;
-		}			
+		}
 
 		// at this point, it also started with a valid char
 		assert ( m_iAccum>0 );
@@ -3339,14 +3363,25 @@ BYTE * CSphTokenizerTraits<IS_UTF8>::GetTokenSyn ()
 		// find the proper separator
 		if ( !pFirstSeparator )
 		{
+			int iLast = 0;
+
 			// if there was none, scan until found
 			for ( ;; )
 			{
 				BYTE * pCur = m_pCur;
+				int iCode = *pCur;
 				int iFolded = m_tLC.ToLower ( GetCodepoint() );
 				if ( iFolded<0 )
 					break; // eof
 
+				if ( bEscaped )
+				{
+					if ( iCode != '\\' && iLast == '\\' )
+						iFolded &= ~FLAG_CODEPOINT_SPECIAL;
+
+					iLast = iCode;
+				}
+
 				if ( IsSeparator ( iFolded, false ) )
 				{
 					if ( iFolded!=0 )
@@ -3354,7 +3389,13 @@ BYTE * CSphTokenizerTraits<IS_UTF8>::GetTokenSyn ()
 					break;
 				}
 
-				AccumCodepoint ( iFolded & MASK_CODEPOINT );
+				if ( bEscaped )
+				{
+					if ( iCode != '\\' )
+						AccumCodepoint ( iFolded & MASK_CODEPOINT );
+				}
+				else
+					AccumCodepoint ( iFolded & MASK_CODEPOINT );
 			}
 		} else
 		{
@@ -3466,9 +3507,14 @@ void CSphTokenizer_SBCS::SetBuffer ( BYTE * sBuffer, int iLength )
 
 BYTE * CSphTokenizer_SBCS::GetToken ()
 {
+	m_bWasSpecial = false;
+
 	if ( m_dSynonyms.GetLength() )
 		return GetTokenSyn ();
 
+	bool bEscaped = m_bEscaped;
+	int iCodepoint = 0;
+	int iLastCodepoint = 0;
 	m_bTokenBoundary = false;
 	for ( ;; )
 	{
@@ -3485,13 +3531,28 @@ BYTE * CSphTokenizer_SBCS::GetToken ()
 			}
 		} else
 		{
-			iCode = m_tLC.ToLower ( *m_pCur++ );
+			iCodepoint = *m_pCur++;
+			iCode = m_tLC.ToLower ( iCodepoint );
 		}
 
 		// handle ignored chars
 		if ( iCode & FLAG_CODEPOINT_IGNORE )
 			continue;
 
+		if ( bEscaped )
+		{
+			if ( iCodepoint == '\\' && iLastCodepoint != '\\' )
+			{
+				iLastCodepoint = iCodepoint;
+				continue;
+			}
+
+			if ( iLastCodepoint == '\\' )
+				iCode &= ~FLAG_CODEPOINT_SPECIAL;
+
+			iLastCodepoint = iCodepoint;
+		}
+
 		// handle whitespace and boundary
 		if ( m_bBoundary && ( iCode==0 ) ) m_bTokenBoundary = true;
 		m_bBoundary = ( iCode & FLAG_CODEPOINT_BOUNDARY )!=0;
@@ -3528,6 +3589,8 @@ BYTE * CSphTokenizer_SBCS::GetToken ()
 				m_iLastTokenLen = 1;
 				m_sAccum[0] = (BYTE)iCode;
 				m_sAccum[1] = '\0';
+
+				m_bWasSpecial = true;
 			} else
 			{
 				// flush prev accum and redo this special
@@ -3547,13 +3610,24 @@ BYTE * CSphTokenizer_SBCS::GetToken ()
 	}
 }
 
-ISphTokenizer * CSphTokenizer_SBCS::Clone () const
+ISphTokenizer * CSphTokenizer_SBCS::Clone ( bool bEscaped ) const // returns a new'ed SBCS tokenizer carrying this one's lowercaser and synonym tables; bEscaped becomes the clone's m_bEscaped
 {
 	CSphTokenizer_SBCS * pClone = new CSphTokenizer_SBCS ();
 	pClone->m_tLC = m_tLC;
 	pClone->m_dSynonyms = m_dSynonyms;
 	pClone->m_dSynStart = m_dSynStart;
 	pClone->m_dSynEnd = m_dSynEnd;
+	pClone->m_bEscaped = bEscaped; // GetToken() and GetTokenSyn() read this flag to switch on backslash handling
+
+	if ( bEscaped )
+	{
+		CSphVector<CSphRemapRange> dRemaps;
+		CSphRemapRange Range;
+		Range.m_iStart = Range.m_iEnd = Range.m_iRemapStart = '\\'; // one-codepoint range that maps backslash onto itself
+		dRemaps.Add ( Range );
+		pClone->m_tLC.AddRemaps ( dRemaps, 0, 0 ); // applied to the clone's lowercaser copy only; NOTE(review): presumably this makes '\\' an ordinary word character with no flags - confirm against AddRemaps
+	}
+
 	return pClone;
 }
 
@@ -3585,15 +3659,20 @@ void CSphTokenizer_UTF8::SetBuffer ( BYTE * sBuffer, int iLength )
 
 BYTE * CSphTokenizer_UTF8::GetToken ()
 {
+	m_bWasSpecial = false;
+
 	if ( m_dSynonyms.GetLength() )
 		return GetTokenSyn ();
 
+	bool bEscaped = m_bEscaped;
+	int iLastCodepoint = 0;
 	m_bTokenBoundary = false;
 	for ( ;; )
 	{
 		// get next codepoint
 		BYTE * pCur = m_pCur; // to redo special char, if there's a token already
-		int iCode = m_tLC.ToLower ( GetCodepoint() ); // advances m_pCur
+		int iCodePoint = GetCodepoint();  // advances m_pCur
+		int iCode = m_tLC.ToLower ( iCodePoint );
 
 		// handle eof
 		if ( iCode<0 )
@@ -3616,6 +3695,20 @@ BYTE * CSphTokenizer_UTF8::GetToken ()
 		if ( iCode & FLAG_CODEPOINT_IGNORE )
 			continue;
 
+		if ( bEscaped )
+		{
+			if ( iCodePoint == '\\' && iLastCodepoint != '\\' )
+			{
+				iLastCodepoint = iCodePoint;
+				continue;
+			}
+
+			if ( iLastCodepoint == '\\' )
+				iCode &= ~FLAG_CODEPOINT_SPECIAL;
+
+			iLastCodepoint = iCodePoint;
+		}
+
 		// handle whitespace and boundary
 		if ( m_bBoundary && ( iCode==0 ) ) m_bTokenBoundary = true;
 		m_bBoundary = ( iCode & FLAG_CODEPOINT_BOUNDARY )!=0;
@@ -3643,6 +3736,7 @@ BYTE * CSphTokenizer_UTF8::GetToken ()
 
 			if ( m_iAccum==0 )
 			{
+				m_bWasSpecial = true;
 				AccumCodepoint ( iCode ); // handle special as a standalone token
 			} else
 			{
@@ -3669,13 +3763,24 @@ void CSphTokenizer_UTF8::FlushAccum ()
 }
 
 
-ISphTokenizer * CSphTokenizer_UTF8::Clone () const
+ISphTokenizer * CSphTokenizer_UTF8::Clone ( bool bEscaped ) const // returns a new'ed UTF-8 tokenizer carrying this one's lowercaser and synonym tables; bEscaped becomes the clone's m_bEscaped
 {
 	CSphTokenizer_UTF8 * pClone = new CSphTokenizer_UTF8 ();
 	pClone->m_tLC = m_tLC;
 	pClone->m_dSynonyms = m_dSynonyms;
 	pClone->m_dSynStart = m_dSynStart;
 	pClone->m_dSynEnd = m_dSynEnd;
+	pClone->m_bEscaped = bEscaped; // GetToken() and GetTokenSyn() read this flag to switch on backslash handling
+
+	if ( bEscaped )
+	{
+		CSphVector<CSphRemapRange> dRemaps;
+		CSphRemapRange Range;
+		Range.m_iStart = Range.m_iEnd = Range.m_iRemapStart = '\\'; // one-codepoint range that maps backslash onto itself
+		dRemaps.Add ( Range );
+		pClone->m_tLC.AddRemaps ( dRemaps, 0, 0 ); // applied to the clone's lowercaser copy only; NOTE(review): presumably this makes '\\' an ordinary word character with no flags - confirm against AddRemaps
+	}
+
 	return pClone;
 }
 
@@ -13742,7 +13847,7 @@ CSphDictCRC::WordformContainer * CSphDictCRC::LoadWordformContainer ( const char
 	rdWordforms.SetFile ( fdWordforms );
 
 	// my tokenizer
-	CSphScopedPtr<ISphTokenizer> pMyTokenizer ( pTokenizer->Clone() );
+	CSphScopedPtr<ISphTokenizer> pMyTokenizer ( pTokenizer->Clone ( false ) );
 	pMyTokenizer->AddSpecials ( ">" );
 
 	// scan it line by line

+ 6 - 1
src/sphinx.h

@@ -374,12 +374,15 @@ public:
 	/// get last token boundary flag (true if there was a boundary before the token)
 	inline bool						GetBoundary () { return m_bTokenBoundary; }
 
+	/// was last token a special one?
+	inline bool						WasTokenSpecial () { return m_bWasSpecial; }
+
 public:
 	/// get lowercaser
 	virtual const CSphLowercaser *	GetLowercaser () const { return &m_tLC; }
 
 	/// spawn a clone of my own
-	virtual ISphTokenizer *			Clone () const = 0;
+	virtual ISphTokenizer *			Clone ( bool bEscaped ) const = 0;
 
 	/// SBCS or UTF-8?
 	virtual bool					IsUtf8 () const = 0;
@@ -401,6 +404,8 @@ protected:
 	int								m_iLastTokenLen;			///< last token length, in codepoints
 	bool							m_bTokenBoundary;			///< last token boundary flag (true after boundary codepoint followed by separator)
 	bool							m_bBoundary;				///< boundary flag (true immediately after boundary codepoint)
+	bool							m_bWasSpecial;				///< special token flag
+	bool							m_bEscaped;					///< backslash handling flag
 
 	CSphVector<CSphSynonym>			m_dSynonyms;				///< active synonyms
 	CSphVector<int>					m_dSynStart;				///< map 1st byte to candidate range start

+ 5 - 22
src/sphinxquery.cpp

@@ -99,17 +99,10 @@ protected:
 	CSphBooleanQueryExpr *	m_pCur;
 
 protected:
-	int						IsSpecial ( int iCh );
 	void					HandleOperator ( int iCh );
 };
 
 
-int CSphBooleanQueryParser::IsSpecial ( int iCh )
-{
-	return ( iCh=='(' || iCh==')' || iCh=='&' || iCh=='|' || iCh=='-' || iCh=='!' );
-}
-
-
 void CSphBooleanQueryParser::HandleOperator ( int iCh )
 {
 	assert ( iCh=='|' || iCh=='&' );
@@ -224,7 +217,7 @@ CSphBooleanQueryExpr * CSphBooleanQueryParser::Parse ( const char * sQuery, cons
 	// a buffer of my own
 	CSphString sBuffer ( sQuery );
 
-	CSphScopedPtr<ISphTokenizer> pMyTokenizer ( pTokenizer->Clone () );
+	CSphScopedPtr<ISphTokenizer> pMyTokenizer ( pTokenizer->Clone ( true ) );
 	pMyTokenizer->AddSpecials ( "&|()-!" );
 	pMyTokenizer->SetBuffer ( (BYTE*)sBuffer.cstr(), strlen ( sBuffer.cstr() ) );
 
@@ -242,9 +235,7 @@ CSphBooleanQueryExpr * CSphBooleanQueryParser::Parse ( const char * sQuery, cons
 		assert ( m_pCur->IsNull() );
 
 		int iSpecial = pToken
-			? ( IsSpecial(pToken[0])
-				? pToken[0]
-				: 0 )
+			? ( pMyTokenizer->WasTokenSpecial () ? pToken[0] : 0 )
 			: QUERY_END;
 		assert ( !( iSpecial>0 && pToken[1]!=0 ) );
 
@@ -680,7 +671,6 @@ protected:
 protected:
 	bool				m_bStopOnInvalid;		///< stop on invalid fields or skip them
 
-	int					IsSpecial ( int iCh );
 	bool				Error ( const char * sTemplate, ... );
 	void				Warning ( const char * sTemplate, ... );
 	bool				ParseFields ( DWORD & uFields, ISphTokenizer * pTokenizer, const CSphSchema * pSchema );
@@ -789,13 +779,6 @@ CSphExtendedQueryParser::CSphExtendedQueryParser ()
 }
 
 
-int CSphExtendedQueryParser::IsSpecial ( int iCh )
-{
-	return ( iCh=='(' || iCh==')' || iCh=='|' || iCh=='-' || iCh=='!'
-		|| iCh=='@' || iCh=='~' || iCh=='"' );
-}
-
-
 bool CSphExtendedQueryParser::Error ( const char * sTemplate, ... )
 {
 	assert ( m_pRes );
@@ -1054,8 +1037,8 @@ bool CSphExtendedQueryParser::Parse ( CSphExtendedQuery & tParsed, const char *
 
 	// a buffer of my own
 	CSphString sBuffer ( sQuery );
-	CSphScopedPtr<ISphTokenizer> pMyTokenizer ( pTokenizer->Clone () );
-	pMyTokenizer->AddSpecials ( "()|-!@~\"" ); // MUST be in sync with IsSpecial()
+	CSphScopedPtr<ISphTokenizer> pMyTokenizer ( pTokenizer->Clone ( true ) );
+	pMyTokenizer->AddSpecials ( "()|-!@~\"" );
 	pMyTokenizer->SetBuffer ( (BYTE*)sBuffer.cstr(), strlen ( sBuffer.cstr() ) );
 
 	// iterate all tokens
@@ -1088,7 +1071,7 @@ bool CSphExtendedQueryParser::Parse ( CSphExtendedQuery & tParsed, const char *
 		bRedo = false;
 
 		int iSpecial = sToken
-			? ( IsSpecial(sToken[0]) ? sToken[0] : 0 )
+			? ( pMyTokenizer->WasTokenSpecial () ? sToken[0] : 0 )
 			: QUERY_END;
 		assert ( !( iSpecial>0 && sToken[1]!=0 ) );
 

+ 45 - 3
src/tests.cpp

@@ -50,7 +50,7 @@ bool CreateSynonymsFile ( const char * sMagic )
 }
 
 
-ISphTokenizer * CreateTestTokenizer ( bool bUTF8, bool bSynonyms )
+ISphTokenizer * CreateTestTokenizer ( bool bUTF8, bool bSynonyms, bool bEscaped = false )
 {
 	CSphString sError;
 	ISphTokenizer * pTokenizer = bUTF8 ? sphCreateUTF8Tokenizer () : sphCreateSBCSTokenizer ();
@@ -59,11 +59,20 @@ ISphTokenizer * CreateTestTokenizer ( bool bUTF8, bool bSynonyms )
 	pTokenizer->AddSpecials ( "!-" );
 	if ( bSynonyms )
 		assert ( pTokenizer->LoadSynonyms ( g_sTmpfile, sError ) );
+
+	if ( bEscaped )
+	{	
+		ISphTokenizer * pOldTokenizer = pTokenizer;
+		pTokenizer = pTokenizer->Clone ( true );
+		pTokenizer->SetMinWordLen ( 2 );
+		SafeDelete ( pOldTokenizer );
+	}
+
 	return pTokenizer;
 }
 
 
-void TestTokenizer ( bool bUTF8 )
+void TestTokenizer ( bool bUTF8, bool bEscaped = false )
 {
 	const char * sPrefix = bUTF8 
 		? "testing UTF8 tokenizer"
@@ -77,7 +86,7 @@ void TestTokenizer ( bool bUTF8 )
 			: "\xC0\xC1\xF5\xF6"; // valid SBCS but invalid UTF-8
 
 		assert ( CreateSynonymsFile ( sMagic ) );
-		ISphTokenizer * pTokenizer = CreateTestTokenizer ( bUTF8, iRun==2 );
+		ISphTokenizer * pTokenizer = CreateTestTokenizer ( bUTF8, iRun==2, bEscaped );
 
 		const char * dTests[] =
 		{
@@ -112,6 +121,10 @@ void TestTokenizer ( bool bUTF8 )
 			"2", sMagic,									"test", NULL,
 			"2", "U.S. U.S.A. U.S.A.F.",					"US", "USA", "USAF", NULL,
 			"2", "U.S.AB U.S.A. U.S.B.U.S.D.U.S.U.S.A.F.",	"US", "ab", "USA", "USB", "USD", "US", "USAF", NULL,
+			"3", "phon\\e",						"phone", NULL,
+			"3", "\\thephone",					"thephone",  NULL,
+			"3", "the\\!phone",					"the!phone", NULL,
+			"3", "\\!phone",					"!phone", NULL,
 			NULL
 		};
 
@@ -225,6 +238,33 @@ void TestTokenizer ( bool bUTF8 )
 		assert ( !strcmp ( (const char*)pTokenizer->GetToken(), "testing" ) ); assert ( pTokenizer->GetBoundary() );
 		assert ( !strcmp ( (const char*)pTokenizer->GetToken(), "boundaries" ) ); assert ( !pTokenizer->GetBoundary() );
 
+		// check escaped sequences
+		if ( bEscaped )
+		{
+			int iRun = 3;
+			for ( int iCur=0; dTests[iCur] && atoi(dTests[iCur++])<=iRun; )
+			{
+				if ( atoi(dTests[iCur-1])!=iRun )
+				{
+					while ( dTests [iCur++] );
+					continue;
+				}
+
+				printf ( "%s, run=%d, line=%s\n", sPrefix, iRun, dTests[iCur] );
+				pTokenizer->SetBuffer ( (BYTE*)dTests[iCur], strlen(dTests[iCur]) );
+				iCur++;
+
+				for ( BYTE * pToken=pTokenizer->GetToken(); pToken; pToken=pTokenizer->GetToken() )
+				{
+					assert ( dTests[iCur] && strcmp ( (const char*)pToken, dTests[iCur] )==0 );
+					iCur++;
+				}
+
+				assert ( dTests[iCur]==NULL );
+				iCur++;
+			}
+		}
+
 		// done
 		SafeDelete ( pTokenizer );
 	}
@@ -535,6 +575,8 @@ int main ()
 	TestStripper ();
 	TestTokenizer ( false );
 	TestTokenizer ( true );
+	TestTokenizer ( false, true );
+	TestTokenizer ( true, true );
 	TestExpr ();
 #endif