18 tahun lalu · 3855b6317c
--- a/api/sphinxapi.php
+++ b/api/sphinxapi.php
@@ -1088,6 +1088,19 @@ class SphinxClient
 
				 		return $res;
			
 
				 	}
			
 
				 
			
 
				+	/// escape the characters that are special to the query syntax
			
 
				+	/// ( ) | - ! @ ~ " & each get a backslash prepended; all other bytes,
			
 
				+	/// the backslash itself included, are passed through untouched
			
 
				+	function EscapeString ( $string )
			
 
				+	{
			
 
				+		// PHP single-quoted strings only unescape \\ and \', so the double
			
 
				+		// quote has to be listed bare here; '\"' would be backslash + quote
			
 
				+		$from = array ( '(',')','|','-','!','@','~','"','&' );
			
 
				+		$to   = array ( '\\(','\\)','\\|','\\-','\\!','\\@','\\~','\\"', '\\&' );
			
 
				+
			
 
				+		return str_replace ( $from, $to, $string );
			
 
				+	}
			
 
				+
			
 
				 	/////////////////////////////////////////////////////////////////////////////
			
 
				 	// attribute updates
			
 
				 	/////////////////////////////////////////////////////////////////////////////
			
--- a/src/sphinx.cpp
+++ b/src/sphinx.cpp
@@ -2135,7 +2135,7 @@ public:
 
				 
			
 
				 	virtual void				SetBuffer ( BYTE * sBuffer, int iLength );
			
 
				 	virtual BYTE *				GetToken ();
			
 
				-	virtual ISphTokenizer *		Clone () const;
			
 
				+	virtual ISphTokenizer *		Clone ( bool bEscaped ) const;
			
 
				 	virtual bool				IsUtf8 () const { return false; }
			
 
				 	virtual int					GetCodepointLength ( int ) const { return 1; }
			
 
				 };
			
@@ -2149,7 +2149,7 @@ public:
 
				 
			
 
				 	virtual void				SetBuffer ( BYTE * sBuffer, int iLength );
			
 
				 	virtual BYTE *				GetToken ();
			
 
				-	virtual ISphTokenizer *		Clone () const;
			
 
				+	virtual ISphTokenizer *		Clone ( bool bEscaped ) const;
			
 
				 	virtual bool				IsUtf8 () const { return true; }
			
 
				 	virtual int					GetCodepointLength ( int iCode ) const;
			
 
				 
			
@@ -2713,6 +2713,8 @@ ISphTokenizer::ISphTokenizer ()
 
				 	, m_iLastTokenLen ( 0 )
			
 
				 	, m_bTokenBoundary ( false )
			
 
				 	, m_bBoundary ( false )
			
 
				+	, m_bWasSpecial ( false )
			
 
				+	, m_bEscaped ( false )
			
 
				 {}
			
 
				 
			
 
				 
			
@@ -3089,6 +3091,8 @@ BYTE * CSphTokenizerTraits<IS_UTF8>::GetTokenSyn ()
 
				 {
			
 
				 	assert ( m_dSynonyms.GetLength() );
			
 
				 
			
 
				+	bool bEscaped = m_bEscaped;
			
 
				+
			
 
				 	m_bTokenBoundary = false;
			
 
				 	for ( ;; )
			
 
				 	{
			
@@ -3102,6 +3106,7 @@ BYTE * CSphTokenizerTraits<IS_UTF8>::GetTokenSyn ()
 
				 		int iSynEnd = m_dSynonyms.GetLength()-1;
			
 
				 		int iSynOff = 0;
			
 
				 
			
 
				+		int iLastCodepoint = 0;
			
 
				 		int iLastFolded = 0;
			
 
				 		BYTE * pRescan = NULL;
			
 
				 
			
@@ -3139,11 +3144,30 @@ BYTE * CSphTokenizerTraits<IS_UTF8>::GetTokenSyn ()
 
				 			// skip continuous whitespace
			
 
				 			if ( iLastFolded==0 && iFolded==0 )
			
 
				 				continue;
			
 
				+
			
 
				+			if ( bEscaped )
			
 
				+			{
			
 
				+				if ( iCode == '\\' && iLastCodepoint != '\\' )
			
 
				+				{
			
 
				+					iLastCodepoint = iCode;
			
 
				+					continue;
			
 
				+				}
			
 
				+				else
			
 
				+				{
			
 
				+					if ( iLastCodepoint == '\\' )
			
 
				+						iFolded &= ~( FLAG_CODEPOINT_SPECIAL | FLAG_CODEPOINT_DUAL );
			
 
				+				}
			
 
				+
			
 
				+				iLastCodepoint = iCode;
			
 
				+			}
			
 
				+
			
 
				 			iLastFolded = iFolded;
			
 
				 
			
 
				 			// handle specials at the very word start
			
 
				 			if ( ( iFolded & FLAG_CODEPOINT_SPECIAL ) && m_iAccum==0 )
			
 
				 			{
			
 
				+				m_bWasSpecial = true;
			
 
				+
			
 
				 				AccumCodepoint ( iFolded & MASK_CODEPOINT );
			
 
				 				*m_pAccum = '\0';
			
 
				 
			
@@ -3331,7 +3355,7 @@ BYTE * CSphTokenizerTraits<IS_UTF8>::GetTokenSyn ()
 
				 		{
			
 
				 			m_pCur = pRescan;
			
 
				 			continue;
			
 
				-		}			
			
 
				+		}
			
 
				 
			
 
				 		// at this point, it also started with a valid char
			
 
				 		assert ( m_iAccum>0 );
			
@@ -3339,14 +3363,25 @@ BYTE * CSphTokenizerTraits<IS_UTF8>::GetTokenSyn ()
 
				 		// find the proper separator
			
 
				 		if ( !pFirstSeparator )
			
 
				 		{
			
 
				+			int iLast = 0;
			
 
				+
			
 
				 			// if there was none, scan until found
			
 
				 			for ( ;; )
			
 
				 			{
			
 
				 				BYTE * pCur = m_pCur;
			
 
				+				int iCode = *pCur;
			
 
				 				int iFolded = m_tLC.ToLower ( GetCodepoint() );
			
 
				 				if ( iFolded<0 )
			
 
				 					break; // eof
			
 
				 
			
 
				+				if ( bEscaped )
			
 
				+				{
			
 
				+					if ( iCode != '\\' && iLast == '\\' )
			
 
				+						iFolded &= ~FLAG_CODEPOINT_SPECIAL;
			
 
				+
			
 
				+					iLast = iCode;
			
 
				+				}
			
 
				+
			
 
				 				if ( IsSeparator ( iFolded, false ) )
			
 
				 				{
			
 
				 					if ( iFolded!=0 )
			
@@ -3354,7 +3389,13 @@ BYTE * CSphTokenizerTraits<IS_UTF8>::GetTokenSyn ()
 
				 					break;
			
 
				 				}
			
 
				 
			
 
				-				AccumCodepoint ( iFolded & MASK_CODEPOINT );
			
 
				+				if ( bEscaped )
			
 
				+				{
			
 
				+					if ( iCode != '\\' )
			
 
				+						AccumCodepoint ( iFolded & MASK_CODEPOINT );
			
 
				+				}
			
 
				+				else
			
 
				+					AccumCodepoint ( iFolded & MASK_CODEPOINT );
			
 
				 			}
			
 
				 		} else
			
 
				 		{
			
@@ -3466,9 +3507,14 @@ void CSphTokenizer_SBCS::SetBuffer ( BYTE * sBuffer, int iLength )
 
				 
			
 
				 BYTE * CSphTokenizer_SBCS::GetToken ()
			
 
				 {
			
 
				+	m_bWasSpecial = false;
			
 
				+
			
 
				 	if ( m_dSynonyms.GetLength() )
			
 
				 		return GetTokenSyn ();
			
 
				 
			
 
				+	bool bEscaped = m_bEscaped;
			
 
				+	int iCodepoint = 0;
			
 
				+	int iLastCodepoint = 0;
			
 
				 	m_bTokenBoundary = false;
			
 
				 	for ( ;; )
			
 
				 	{
			
@@ -3485,13 +3531,28 @@ BYTE * CSphTokenizer_SBCS::GetToken ()
 
				 			}
			
 
				 		} else
			
 
				 		{
			
 
				-			iCode = m_tLC.ToLower ( *m_pCur++ );
			
 
				+			iCodepoint = *m_pCur++;
			
 
				+			iCode = m_tLC.ToLower ( iCodepoint );
			
 
				 		}
			
 
				 
			
 
				 		// handle ignored chars
			
 
				 		if ( iCode & FLAG_CODEPOINT_IGNORE )
			
 
				 			continue;
			
 
				 
			
 
				+		if ( bEscaped )
			
 
				+		{
			
 
				+			if ( iCodepoint == '\\' && iLastCodepoint != '\\' )
			
 
				+			{
			
 
				+				iLastCodepoint = iCodepoint;
			
 
				+				continue;
			
 
				+			}
			
 
				+
			
 
				+			if ( iLastCodepoint == '\\' )
			
 
				+				iCode &= ~FLAG_CODEPOINT_SPECIAL;
			
 
				+
			
 
				+			iLastCodepoint = iCodepoint;
			
 
				+		}
			
 
				+
			
 
				 		// handle whitespace and boundary
			
 
				 		if ( m_bBoundary && ( iCode==0 ) ) m_bTokenBoundary = true;
			
 
				 		m_bBoundary = ( iCode & FLAG_CODEPOINT_BOUNDARY )!=0;
			
@@ -3528,6 +3589,8 @@ BYTE * CSphTokenizer_SBCS::GetToken ()
 
				 				m_iLastTokenLen = 1;
			
 
				 				m_sAccum[0] = (BYTE)iCode;
			
 
				 				m_sAccum[1] = '\0';
			
 
				+
			
 
				+				m_bWasSpecial = true;
			
 
				 			} else
			
 
				 			{
			
 
				 				// flush prev accum and redo this special
			
@@ -3547,13 +3610,24 @@ BYTE * CSphTokenizer_SBCS::GetToken ()
 
				 	}
			
 
				 }
			
 
				 
			
 
				-ISphTokenizer * CSphTokenizer_SBCS::Clone () const
			
 
				+ISphTokenizer * CSphTokenizer_SBCS::Clone ( bool bEscaped ) const // returns a newly allocated SBCS tokenizer that copies this one's lowercaser and synonym tables; bEscaped becomes the clone's backslash handling flag
			
 
				 {
			
 
				 	CSphTokenizer_SBCS * pClone = new CSphTokenizer_SBCS ();
			
 
				 	pClone->m_tLC = m_tLC;
			
 
				 	pClone->m_dSynonyms = m_dSynonyms;
			
 
				 	pClone->m_dSynStart = m_dSynStart;
			
 
				 	pClone->m_dSynEnd = m_dSynEnd;
			
 
				+	pClone->m_bEscaped = bEscaped; // read back by GetToken() / GetTokenSyn() to switch backslash-escape processing on
			
 
				+
			
 
				+	if ( bEscaped ) // escaped clones also get the backslash added to their own lowercaser copy
			
 
				+	{
			
 
				+		CSphVector<CSphRemapRange> dRemaps;
			
 
				+		CSphRemapRange Range;
			
 
				+		Range.m_iStart = Range.m_iEnd = Range.m_iRemapStart = '\\'; // one-codepoint range: backslash remaps to backslash
			
 
				+		dRemaps.Add ( Range );
			
 
				+		pClone->m_tLC.AddRemaps ( dRemaps, 0, 0 ); // NOTE(review): meaning of the two trailing 0 arguments is not visible in this hunk - confirm against CSphLowercaser
			
 
				+	}
			
 
				+
			
 
				 	return pClone;
			
 
				 }
			
 
				 
			
@@ -3585,15 +3659,20 @@ void CSphTokenizer_UTF8::SetBuffer ( BYTE * sBuffer, int iLength )
 
				 
			
 
				 BYTE * CSphTokenizer_UTF8::GetToken ()
			
 
				 {
			
 
				+	m_bWasSpecial = false;
			
 
				+
			
 
				 	if ( m_dSynonyms.GetLength() )
			
 
				 		return GetTokenSyn ();
			
 
				 
			
 
				+	bool bEscaped = m_bEscaped;
			
 
				+	int iLastCodepoint = 0;
			
 
				 	m_bTokenBoundary = false;
			
 
				 	for ( ;; )
			
 
				 	{
			
 
				 		// get next codepoint
			
 
				 		BYTE * pCur = m_pCur; // to redo special char, if there's a token already
			
 
				-		int iCode = m_tLC.ToLower ( GetCodepoint() ); // advances m_pCur
			
 
				+		int iCodePoint = GetCodepoint();  // advances m_pCur
			
 
				+		int iCode = m_tLC.ToLower ( iCodePoint );
			
 
				 
			
 
				 		// handle eof
			
 
				 		if ( iCode<0 )
			
@@ -3616,6 +3695,20 @@ BYTE * CSphTokenizer_UTF8::GetToken ()
 
				 		if ( iCode & FLAG_CODEPOINT_IGNORE )
			
 
				 			continue;
			
 
				 
			
 
				+		if ( bEscaped )
			
 
				+		{
			
 
				+			if ( iCodePoint == '\\' && iLastCodepoint != '\\' )
			
 
				+			{
			
 
				+				iLastCodepoint = iCodePoint;
			
 
				+				continue;
			
 
				+			}
			
 
				+
			
 
				+			if ( iLastCodepoint == '\\' )
			
 
				+				iCode &= ~FLAG_CODEPOINT_SPECIAL;
			
 
				+
			
 
				+			iLastCodepoint = iCodePoint;
			
 
				+		}
			
 
				+
			
 
				 		// handle whitespace and boundary
			
 
				 		if ( m_bBoundary && ( iCode==0 ) ) m_bTokenBoundary = true;
			
 
				 		m_bBoundary = ( iCode & FLAG_CODEPOINT_BOUNDARY )!=0;
			
@@ -3643,6 +3736,7 @@ BYTE * CSphTokenizer_UTF8::GetToken ()
 
				 
			
 
				 			if ( m_iAccum==0 )
			
 
				 			{
			
 
				+				m_bWasSpecial = true;
			
 
				 				AccumCodepoint ( iCode ); // handle special as a standalone token
			
 
				 			} else
			
 
				 			{
			
@@ -3669,13 +3763,24 @@ void CSphTokenizer_UTF8::FlushAccum ()
 
				 }
			
 
				 
			
 
				 
			
 
				-ISphTokenizer * CSphTokenizer_UTF8::Clone () const
			
 
				+ISphTokenizer * CSphTokenizer_UTF8::Clone ( bool bEscaped ) const // returns a newly allocated UTF-8 tokenizer that copies this one's lowercaser and synonym tables; bEscaped becomes the clone's backslash handling flag
			
 
				 {
			
 
				 	CSphTokenizer_UTF8 * pClone = new CSphTokenizer_UTF8 ();
			
 
				 	pClone->m_tLC = m_tLC;
			
 
				 	pClone->m_dSynonyms = m_dSynonyms;
			
 
				 	pClone->m_dSynStart = m_dSynStart;
			
 
				 	pClone->m_dSynEnd = m_dSynEnd;
			
 
				+	pClone->m_bEscaped = bEscaped; // read back by GetToken() / GetTokenSyn() to switch backslash-escape processing on
			
 
				+
			
 
				+	if ( bEscaped ) // escaped clones also get the backslash added to their own lowercaser copy
			
 
				+	{
			
 
				+		CSphVector<CSphRemapRange> dRemaps;
			
 
				+		CSphRemapRange Range;
			
 
				+		Range.m_iStart = Range.m_iEnd = Range.m_iRemapStart = '\\'; // one-codepoint range: backslash remaps to backslash
			
 
				+		dRemaps.Add ( Range );
			
 
				+		pClone->m_tLC.AddRemaps ( dRemaps, 0, 0 ); // NOTE(review): meaning of the two trailing 0 arguments is not visible in this hunk - confirm against CSphLowercaser
			
 
				+	}
			
 
				+
			
 
				 	return pClone;
			
 
				 }
			
 
				 
			
@@ -13742,7 +13847,7 @@ CSphDictCRC::WordformContainer * CSphDictCRC::LoadWordformContainer ( const char
 
				 	rdWordforms.SetFile ( fdWordforms );
			
 
				 
			
 
				 	// my tokenizer
			
 
				-	CSphScopedPtr<ISphTokenizer> pMyTokenizer ( pTokenizer->Clone() );
			
 
				+	CSphScopedPtr<ISphTokenizer> pMyTokenizer ( pTokenizer->Clone ( false ) );
			
 
				 	pMyTokenizer->AddSpecials ( ">" );
			
 
				 
			
 
				 	// scan it line by line
			
--- a/src/sphinx.h
+++ b/src/sphinx.h
@@ -374,12 +374,15 @@ public:
 
				 	/// get last token boundary flag (true if there was a boundary before the token)
			
 
				 	inline bool						GetBoundary () { return m_bTokenBoundary; }
			
 
				 
			
 
				+	/// was last token a special one?
			
 
				+	inline bool						WasTokenSpecial () { return m_bWasSpecial; }
			
 
				+
			
 
				 public:
			
 
				 	/// get lowercaser
			
 
				 	virtual const CSphLowercaser *	GetLowercaser () const { return &m_tLC; }
			
 
				 
			
 
				 	/// spawn a clone of my own
			
 
				-	virtual ISphTokenizer *			Clone () const = 0;
			
 
				+	virtual ISphTokenizer *			Clone ( bool bEscaped ) const = 0;
			
 
				 
			
 
				 	/// SBCS or UTF-8?
			
 
				 	virtual bool					IsUtf8 () const = 0;
			
@@ -401,6 +404,8 @@ protected:
 
				 	int								m_iLastTokenLen;			///< last token length, in codepoints
			
 
				 	bool							m_bTokenBoundary;			///< last token boundary flag (true after boundary codepoint followed by separator)
			
 
				 	bool							m_bBoundary;				///< boundary flag (true immediately after boundary codepoint)
			
 
				+	bool							m_bWasSpecial;				///< special token flag
			
 
				+	bool							m_bEscaped;					///< backslash handling flag
			
 
				 
			
 
				 	CSphVector<CSphSynonym>			m_dSynonyms;				///< active synonyms
			
 
				 	CSphVector<int>					m_dSynStart;				///< map 1st byte to candidate range start
			
--- a/src/sphinxquery.cpp
+++ b/src/sphinxquery.cpp
@@ -99,17 +99,10 @@ protected:
 
				 	CSphBooleanQueryExpr *	m_pCur;
			
 
				 
			
 
				 protected:
			
 
				-	int						IsSpecial ( int iCh );
			
 
				 	void					HandleOperator ( int iCh );
			
 
				 };
			
 
				 
			
 
				 
			
 
				-int CSphBooleanQueryParser::IsSpecial ( int iCh )
			
 
				-{
			
 
				-	return ( iCh=='(' || iCh==')' || iCh=='&' || iCh=='|' || iCh=='-' || iCh=='!' );
			
 
				-}
			
 
				-
			
 
				-
			
 
				 void CSphBooleanQueryParser::HandleOperator ( int iCh )
			
 
				 {
			
 
				 	assert ( iCh=='|' || iCh=='&' );
			
@@ -224,7 +217,7 @@ CSphBooleanQueryExpr * CSphBooleanQueryParser::Parse ( const char * sQuery, cons
 
				 	// a buffer of my own
			
 
				 	CSphString sBuffer ( sQuery );
			
 
				 
			
 
				-	CSphScopedPtr<ISphTokenizer> pMyTokenizer ( pTokenizer->Clone () );
			
 
				+	CSphScopedPtr<ISphTokenizer> pMyTokenizer ( pTokenizer->Clone ( true ) );
			
 
				 	pMyTokenizer->AddSpecials ( "&|()-!" );
			
 
				 	pMyTokenizer->SetBuffer ( (BYTE*)sBuffer.cstr(), strlen ( sBuffer.cstr() ) );
			
 
				 
			
@@ -242,9 +235,7 @@ CSphBooleanQueryExpr * CSphBooleanQueryParser::Parse ( const char * sQuery, cons
 
				 		assert ( m_pCur->IsNull() );
			
 
				 
			
 
				 		int iSpecial = pToken
			
 
				-			? ( IsSpecial(pToken[0])
			
 
				-				? pToken[0]
			
 
				-				: 0 )
			
 
				+			? ( pMyTokenizer->WasTokenSpecial () ? pToken[0] : 0 )
			
 
				 			: QUERY_END;
			
 
				 		assert ( !( iSpecial>0 && pToken[1]!=0 ) );
			
 
				 
			
@@ -680,7 +671,6 @@ protected:
 
				 protected:
			
 
				 	bool				m_bStopOnInvalid;		///< stop on invalid fields or skip them
			
 
				 
			
 
				-	int					IsSpecial ( int iCh );
			
 
				 	bool				Error ( const char * sTemplate, ... );
			
 
				 	void				Warning ( const char * sTemplate, ... );
			
 
				 	bool				ParseFields ( DWORD & uFields, ISphTokenizer * pTokenizer, const CSphSchema * pSchema );
			
@@ -789,13 +779,6 @@ CSphExtendedQueryParser::CSphExtendedQueryParser ()
 
				 }
			
 
				 
			
 
				 
			
 
				-int CSphExtendedQueryParser::IsSpecial ( int iCh )
			
 
				-{
			
 
				-	return ( iCh=='(' || iCh==')' || iCh=='|' || iCh=='-' || iCh=='!'
			
 
				-		|| iCh=='@' || iCh=='~' || iCh=='"' );
			
 
				-}
			
 
				-
			
 
				-
			
 
				 bool CSphExtendedQueryParser::Error ( const char * sTemplate, ... )
			
 
				 {
			
 
				 	assert ( m_pRes );
			
@@ -1054,8 +1037,8 @@ bool CSphExtendedQueryParser::Parse ( CSphExtendedQuery & tParsed, const char *
 
				 
			
 
				 	// a buffer of my own
			
 
				 	CSphString sBuffer ( sQuery );
			
 
				-	CSphScopedPtr<ISphTokenizer> pMyTokenizer ( pTokenizer->Clone () );
			
 
				-	pMyTokenizer->AddSpecials ( "()|-!@~\"" ); // MUST be in sync with IsSpecial()
			
 
				+	CSphScopedPtr<ISphTokenizer> pMyTokenizer ( pTokenizer->Clone ( true ) );
			
 
				+	pMyTokenizer->AddSpecials ( "()|-!@~\"" );
			
 
				 	pMyTokenizer->SetBuffer ( (BYTE*)sBuffer.cstr(), strlen ( sBuffer.cstr() ) );
			
 
				 
			
 
				 	// iterate all tokens
			
@@ -1088,7 +1071,7 @@ bool CSphExtendedQueryParser::Parse ( CSphExtendedQuery & tParsed, const char *
 
				 		bRedo = false;
			
 
				 
			
 
				 		int iSpecial = sToken
			
 
				-			? ( IsSpecial(sToken[0]) ? sToken[0] : 0 )
			
 
				+			? ( pMyTokenizer->WasTokenSpecial () ? sToken[0] : 0 )
			
 
				 			: QUERY_END;
			
 
				 		assert ( !( iSpecial>0 && sToken[1]!=0 ) );
			
 
				 
			
--- a/src/tests.cpp
+++ b/src/tests.cpp
@@ -50,7 +50,7 @@ bool CreateSynonymsFile ( const char * sMagic )
 
				 }
			
 
				 
			
 
				 
			
 
				-ISphTokenizer * CreateTestTokenizer ( bool bUTF8, bool bSynonyms )
			
 
				+ISphTokenizer * CreateTestTokenizer ( bool bUTF8, bool bSynonyms, bool bEscaped = false )
			
 
				 {
			
 
				 	CSphString sError;
			
 
				 	ISphTokenizer * pTokenizer = bUTF8 ? sphCreateUTF8Tokenizer () : sphCreateSBCSTokenizer ();
			
@@ -59,11 +59,20 @@ ISphTokenizer * CreateTestTokenizer ( bool bUTF8, bool bSynonyms )
 
				 	pTokenizer->AddSpecials ( "!-" );
			
 
				 	if ( bSynonyms )
			
 
				 		assert ( pTokenizer->LoadSynonyms ( g_sTmpfile, sError ) );
			
 
				+
			
 
				+	if ( bEscaped )
			
 
				+	{	
			
 
				+		ISphTokenizer * pOldTokenizer = pTokenizer;
			
 
				+		pTokenizer = pTokenizer->Clone ( true );
			
 
				+		pTokenizer->SetMinWordLen ( 2 );
			
 
				+		SafeDelete ( pOldTokenizer );
			
 
				+	}
			
 
				+
			
 
				 	return pTokenizer;
			
 
				 }
			
 
				 
			
 
				 
			
 
				-void TestTokenizer ( bool bUTF8 )
			
 
				+void TestTokenizer ( bool bUTF8, bool bEscaped = false )
			
 
				 {
			
 
				 	const char * sPrefix = bUTF8 
			
 
				 		? "testing UTF8 tokenizer"
			
@@ -77,7 +86,7 @@ void TestTokenizer ( bool bUTF8 )
 
				 			: "\xC0\xC1\xF5\xF6"; // valid SBCS but invalid UTF-8
			
 
				 
			
 
				 		assert ( CreateSynonymsFile ( sMagic ) );
			
 
				-		ISphTokenizer * pTokenizer = CreateTestTokenizer ( bUTF8, iRun==2 );
			
 
				+		ISphTokenizer * pTokenizer = CreateTestTokenizer ( bUTF8, iRun==2, bEscaped );
			
 
				 
			
 
				 		const char * dTests[] =
			
 
				 		{
			
@@ -112,6 +121,10 @@ void TestTokenizer ( bool bUTF8 )
 
				 			"2", sMagic,									"test", NULL,
			
 
				 			"2", "U.S. U.S.A. U.S.A.F.",					"US", "USA", "USAF", NULL,
			
 
				 			"2", "U.S.AB U.S.A. U.S.B.U.S.D.U.S.U.S.A.F.",	"US", "ab", "USA", "USB", "USD", "US", "USAF", NULL,
			
 
				+			"3", "phon\\e",						"phone", NULL,
			
 
				+			"3", "\\thephone",					"thephone",  NULL,
			
 
				+			"3", "the\\!phone",					"the!phone", NULL,
			
 
				+			"3", "\\!phone",					"!phone", NULL,
			
 
				 			NULL
			
 
				 		};
			
 
				 
			
@@ -225,6 +238,33 @@ void TestTokenizer ( bool bUTF8 )
 
				 		assert ( !strcmp ( (const char*)pTokenizer->GetToken(), "testing" ) ); assert ( pTokenizer->GetBoundary() );
			
 
				 		assert ( !strcmp ( (const char*)pTokenizer->GetToken(), "boundaries" ) ); assert ( !pTokenizer->GetBoundary() );
			
 
				 
			
 
				+		// check escaped sequences
			
 
				+		if ( bEscaped )
			
 
				+		{
			
 
				+			int iRun = 3;
			
 
				+			for ( int iCur=0; dTests[iCur] && atoi(dTests[iCur++])<=iRun; )
			
 
				+			{
			
 
				+				if ( atoi(dTests[iCur-1])!=iRun )
			
 
				+				{
			
 
				+					while ( dTests [iCur++] );
			
 
				+					continue;
			
 
				+				}
			
 
				+
			
 
				+				printf ( "%s, run=%d, line=%s\n", sPrefix, iRun, dTests[iCur] );
			
 
				+				pTokenizer->SetBuffer ( (BYTE*)dTests[iCur], strlen(dTests[iCur]) );
			
 
				+				iCur++;
			
 
				+
			
 
				+				for ( BYTE * pToken=pTokenizer->GetToken(); pToken; pToken=pTokenizer->GetToken() )
			
 
				+				{
			
 
				+					assert ( dTests[iCur] && strcmp ( (const char*)pToken, dTests[iCur] )==0 );
			
 
				+					iCur++;
			
 
				+				}
			
 
				+
			
 
				+				assert ( dTests[iCur]==NULL );
			
 
				+				iCur++;
			
 
				+			}
			
 
				+		}
			
 
				+
			
 
				 		// done
			
 
				 		SafeDelete ( pTokenizer );
			
 
				 	}
			
@@ -535,6 +575,8 @@ int main ()
 
				 	TestStripper ();
			
 
				 	TestTokenizer ( false );
			
 
				 	TestTokenizer ( true );
			
 
				+	TestTokenizer ( false, true );
			
 
				+	TestTokenizer ( true, true );
			
 
				 	TestExpr ();
			
 
				 #endif