Переглянути джерело

fix: html_strip highlight entity preservation (#4184)

* fix: html_strip highlight entity preservation

Disable entity decoding when html_strip_mode=strip so highlight output keeps original entities, and add a regression case for entity-preserving highlight output.

Related issue: https://github.com/manticoresoftware/manticoresearch/issues/1737

Added more cases to test 134 covering the different html_strip_mode values, to capture current behavior.

---------

Co-authored-by: Stas <[email protected]>
Sergey Nikolaev 3 тижнів тому
батько
коміт
d8dde62287

+ 1 - 1
src/sphinxexcerpt.cpp

@@ -1337,7 +1337,7 @@ bool SnippetBuilder_c::Impl_c::SetupStripperSPZ ( bool bSetupSPZ, CSphString & s
 	if ( q.m_sStripMode=="strip" || q.m_sStripMode=="retain" || ( q.m_sStripMode=="index" && tIndexSettings.m_bHtmlStrip ) )
 	{
 		// don't strip HTML markup in 'retain' mode - proceed zones only
-		m_pState->m_pStripper = std::make_unique<CSphHTMLStripper> ( q.m_sStripMode!="retain" );
+		m_pState->m_pStripper = std::make_unique<CSphHTMLStripper> ( q.m_sStripMode!="retain", q.m_sStripMode!="strip" );
 
 		if ( q.m_sStripMode=="index" )
 		{

+ 12 - 1
src/stripper/html_stripper.cpp

@@ -45,6 +45,12 @@ static inline int sphIsTagStart ( int c )
 }
 
 CSphHTMLStripper::CSphHTMLStripper ( bool bDefaultTags )
+	: CSphHTMLStripper ( bDefaultTags, true )
+{
+}
+
+CSphHTMLStripper::CSphHTMLStripper ( bool bDefaultTags, bool bDecodeEntities )
+	: m_bDecodeEntities ( bDecodeEntities )
 {
 	if ( bDefaultTags )
 	{
@@ -273,7 +279,6 @@ void CSphHTMLStripper::EnableParagraphs ()
 	UpdateTags ();
 }
 
-
 bool CSphHTMLStripper::SetZones ( const char * sZones, CSphString & sError )
 {
 	// yet another mini parser!
@@ -851,6 +856,12 @@ void CSphHTMLStripper::Strip ( BYTE * sData ) const
 
 		if ( *s=='&' )
 		{
+			if ( !m_bDecodeEntities )
+			{
+				*d++ = *s++;
+				continue;
+			}
+
 			if ( s[1]=='#' )
 			{
 				// handle "&#number;" and "&#xnumber;" forms

+ 2 - 0
src/stripper/html_stripper.h

@@ -20,6 +20,7 @@ class CSphHTMLStripper
 {
 public:
 	explicit CSphHTMLStripper ( bool bDefaultTags );
+	CSphHTMLStripper ( bool bDefaultTags, bool bDecodeEntities );
 	bool SetIndexedAttrs ( const char* sConfig, CSphString& sError );
 	bool SetRemovedElements ( const char* sConfig, CSphString& sError );
 	bool SetZones ( const char* sZones, CSphString& sError );
@@ -38,6 +39,7 @@ private:
 	CSphVector<html_stripper::StripperTag_t> m_dTags; ///< known tags to index attrs and/or to remove contents
 	int m_dStart[MAX_CHAR_INDEX];	   ///< maps index of the first tag name char to start offset in m_dTags
 	int m_dEnd[MAX_CHAR_INDEX];		   ///< maps index of the first tag name char to end offset in m_dTags
+	const bool m_bDecodeEntities = true;
 
 	int GetCharIndex ( int iCh ) const; ///< calcs index by raw char
 	void UpdateTags();					///< sorts tags, updates internal helpers

+ 2 - 2
test/test_134/model.bin

@@ -1,4 +1,4 @@
-a:1:{i:0;a:1:{i:0;a:16:{i:0;a:1:{i:0;s:182:"The institutional investment manager <b>it</b>.
+a:1:{i:0;a:1:{i:0;a:18:{i:0;a:1:{i:0;s:182:"The institutional investment manager <b>it</b>.
 <--->
  <b>Is</b> Filing this report and.
 <--->
@@ -78,4 +78,4 @@ cool <b>It is</b>
 <--->
 <b>Is</b> a signed hereby represent
 <--->
-";}i:12;a:1:{i:0;s:89:" ...  gains (if any). May <b>fallback</b> or go <b>insolvent</b>. Credit risk of the ... ";}i:13;a:1:{i:0;s:93:"Leverage gains (if any). May <b>fallback</b> or go <b>insolvent</b>. Credit risk of the deal.";}i:14;a:2:{i:0;s:69:" ...  this <b>match pass</b> is  ...  well as this <b>match</b>  ... ";i:1;s:69:" ...  this <b>match pass</b> is  ...  well as this <b>match</b>  ... ";}i:15;a:2:{i:0;s:27:"hex <b>number</b> №1 test";i:1;s:30:"normal <b>number</b> №2 test";}}}}
+";}i:12;a:1:{i:0;s:89:" ...  gains (if any). May <b>fallback</b> or go <b>insolvent</b>. Credit risk of the ... ";}i:13;a:1:{i:0;s:93:"Leverage gains (if any). May <b>fallback</b> or go <b>insolvent</b>. Credit risk of the deal.";}i:14;a:2:{i:0;s:69:" ...  this <b>match pass</b> is  ...  well as this <b>match</b>  ... ";i:1;s:69:" ...  this <b>match pass</b> is  ...  well as this <b>match</b>  ... ";}i:15;a:2:{i:0;s:32:"hex <b>number</b> &#x2116;1 test";i:1;s:34:"normal <b>number</b> &#8470;2 test";}i:16;a:2:{i:0;s:32:"hex <b>number</b> &#x2116;1 test";i:1;s:34:"normal <b>number</b> &#8470;2 test";}i:17;a:2:{i:0;s:32:"hex <b>number</b> &#x2116;1 test";i:1;s:34:"normal <b>number</b> &#8470;2 test";}}}}

+ 9 - 1
test/test_134/test.xml

@@ -260,11 +260,19 @@ $opts14 = array ( 'limit'=>40, 'limit_words'=>0, 'limit_passages'=>0, 'passage_b
 $words14 = 'match | pass';
 $results[] = $client->BuildExcerpts($docs14, 'test3', $words14, $opts14 );	
 
-// 15 test
+// 15 test - rendered entities keep as is
 $docs15 = array ( 'hex number &#x2116;1 test', 'normal number &#8470;2 test' );
 $opts15 = array ( "html_strip_mode"=>"strip" );
 $results[] = $client->BuildExcerpts($docs15, 'test3', 'number', $opts15 );
 
+$docs16 = array ( 'hex number &#x2116;1 test', 'normal number &#8470;2 test' );
+$opts16 = array ( "html_strip_mode"=>"none" );
+$results[] = $client->BuildExcerpts($docs16, 'test3', 'number', $opts16 );
+
+$docs17 = array ( 'hex number &#x2116;1 test', 'normal number &#8470;2 test' );
+$opts17 = array ( "html_strip_mode"=>"retain", "limit"=>0 );
+$results[] = $client->BuildExcerpts($docs17, 'test3', 'number', $opts17 );
+
 ]]></custom_test>
 
 </test>

Різницю між файлами не показано, бо вона завелика
+ 0 - 0
test/test_392/model.bin


+ 3 - 1
test/test_392/test.xml

@@ -118,7 +118,8 @@ insert into test_table values
 <db_insert>
 <![CDATA[
 insert into test_table2 values
-( 1, '<p>The ideas of syntax highlighting overlap significantly with those of <a href="/wiki/Structure_editor" title="Structure editor">syntax-directed editors</a>. One of the first such editors for code was Wilfred Hansens 1969 code editor, Emily.<sup id="cite_ref-hansen_3-0" class="reference"><a href="#cite_note-hansen-3">[3]</a></sup><sup id="cite_ref-4" class="reference"><a href="#cite_note-4">[4]</a></sup> It provided advanced language-independent <a href="/wiki/Autocomplete" title="Autocomplete">code completion</a> facilities, and unlike modern editors with syntax highlighting, actually made it impossible to create syntactically incorrect programs.</p>' )
+( 1, '<p>The ideas of syntax highlighting overlap significantly with those of <a href="/wiki/Structure_editor" title="Structure editor">syntax-directed editors</a>. One of the first such editors for code was Wilfred Hansens 1969 code editor, Emily.<sup id="cite_ref-hansen_3-0" class="reference"><a href="#cite_note-hansen-3">[3]</a></sup><sup id="cite_ref-4" class="reference"><a href="#cite_note-4">[4]</a></sup> It provided advanced language-independent <a href="/wiki/Autocomplete" title="Autocomplete">code completion</a> facilities, and unlike modern editors with syntax highlighting, actually made it impossible to create syntactically incorrect programs.</p>' ),
+( 2, '<code>&lt;?php echo "hello world"</code>' )
 ]]>
 </db_insert>
 
@@ -178,6 +179,7 @@ call snippets (('door opened last time', 'this is door closed'), 'test1', ' "doo
 <!-- expand keywords -->
 select id, highlight({limit=40},body) FROM test_kw1 WHERE MATCH('com');
 call snippets (('command mode activated', 'test.com looks like'), 'test_kw1', ' com ', 1 as query_mode);
+select highlight({html_strip_mode=strip}) from test2 where match('hello');
 
 
 </sphinxql></queries>

Деякі файли не було показано, через те що забагато файлів було змінено