Browse Source

added ranking modes (protocol v.1.16)


git-svn-id: svn://svn.sphinxsearch.com/sphinx/trunk@993 406a0c4d-033a-0410-8de8-e80135713968
shodan 18 years ago
parent
commit
e33de5f5cc
5 changed files with 150 additions and 13 deletions
  1. 20 3
      api/sphinxapi.php
  2. 8 0
      api/test.php
  3. 11 2
      src/searchd.cpp
  4. 98 8
      src/sphinx.cpp
  5. 13 0
      src/sphinx.h

+ 20 - 3
api/sphinxapi.php

@@ -23,7 +23,7 @@ define ( "SEARCHD_COMMAND_EXCERPT",	1 );
 define ( "SEARCHD_COMMAND_UPDATE",	2 );
 
 /// current client-side command implementation versions
-define ( "VER_COMMAND_SEARCH",		0x10F );
+define ( "VER_COMMAND_SEARCH",		0x110 );
 define ( "VER_COMMAND_EXCERPT",		0x100 );
 define ( "VER_COMMAND_UPDATE",		0x100 );
 
@@ -40,7 +40,12 @@ define ( "SPH_MATCH_PHRASE",		2 );
 define ( "SPH_MATCH_BOOLEAN",		3 );
 define ( "SPH_MATCH_EXTENDED",		4 );
 define ( "SPH_MATCH_FULLSCAN",		5 );
-define ( "SPH_MATCH_EXTENDED2",		6 );	// extended engine V2 (TEMPORARY, WILL BE REMOVED IN 0.9.8-RELEASE)
+define ( "SPH_MATCH_EXTENDED2",		6 );	// extended engine V2 (TEMPORARY, WILL BE REMOVED)
+
+/// known ranking modes (ext2 only)
+define ( "SPH_RANK_PROXIMITY_BM25",	0 );	///< default mode, phrase proximity major factor and BM25 minor one
+define ( "SPH_RANK_BM25",			1 );	///< statistical mode, BM25 ranking only (faster but worse quality)
+define ( "SPH_RANK_NONE",			2 );	///< no ranking, all matches get a weight of 1
 
 /// known sort modes
 define ( "SPH_SORT_RELEVANCE",		0 );
@@ -93,6 +98,8 @@ class SphinxClient
 	var $_retrycount;	///< distributed retries count
 	var $_retrydelay;	///< distributed retries delay
 	var $_anchor;		///< geographical anchor point
+	var $_indexweights;	///< per-index weights
+	var $_ranker;		///< ranking mode (default is SPH_RANK_PROXIMITY_BM25)
 
 	var $_error;		///< last error message
 	var $_warning;		///< last warning message
@@ -130,6 +137,7 @@ class SphinxClient
 		$this->_retrydelay	= 0;
 		$this->_anchor		= array ();
 		$this->_indexweights= array ();
+		$this->_ranker		= SPH_RANK_PROXIMITY_BM25;
 
 		// per-reply fields (for single-query case)
 		$this->_error		= "";
@@ -285,6 +293,15 @@ class SphinxClient
 		$this->_mode = $mode;
 	}
 
+	/// set ranking mode
+	/// $ranker must be one of the known SPH_RANK_xxx constants
+	function SetRankingMode ( $ranker )
+	{
+		// loose comparison on purpose, same as comparing with == against every known mode
+		$known = array ( SPH_RANK_PROXIMITY_BM25, SPH_RANK_BM25, SPH_RANK_NONE );
+		assert ( in_array ( $ranker, $known ) );
+
+		$this->_ranker = $ranker;
+	}
+
 	/// set matches sorting mode
 	function SetSortMode ( $mode, $sortby="" )
 	{
@@ -556,7 +573,7 @@ class SphinxClient
 	function AddQuery ( $query, $index="*" )
 	{
 		// build request
-		$req = pack ( "NNNN", $this->_offset, $this->_limit, $this->_mode, $this->_sort ); // mode and limits
+		$req = pack ( "NNNNN", $this->_offset, $this->_limit, $this->_mode, $this->_ranker, $this->_sort ); // mode and limits
 		$req .= pack ( "N", strlen($this->_sortby) ) . $this->_sortby;
 		$req .= pack ( "N", strlen($query) ) . $query; // query itself
 		$req .= pack ( "N", count($this->_weights) ); // weights

+ 8 - 0
api/test.php

@@ -53,6 +53,7 @@ $filtervals = array();
 $distinct = "";
 $sortby = "";
 $limit = 20;
+$ranker = SPH_RANK_PROXIMITY_BM25;
 for ( $i=0; $i<count($args); $i++ )
 {
 	$arg = $args[$i];
@@ -72,6 +73,12 @@ for ( $i=0; $i<count($args); $i++ )
 	else if ( $arg=="-gs"|| $arg=="--groupsort" )	$groupsort = $args[++$i];
 	else if ( $arg=="-d" || $arg=="--distinct" )	$distinct = $args[++$i];
 	else if ( $arg=="-l" || $arg=="--limit" )		$limit = $args[++$i];
+	else if ( $arg=="-r" )
+	{
+		$arg = strtolower($args[++$i]);
+		if ( $arg=="bm25" )		$ranker = SPH_RANK_BM25;
+		if ( $arg=="none" )		$ranker = SPH_RANK_NONE;
+	}
 	else
 		$q .= $args[$i] . " ";
 }
@@ -89,6 +96,7 @@ if ( $groupby )				$cl->SetGroupBy ( $groupby, SPH_GROUPBY_ATTR, $groupsort );
 if ( $sortby )				$cl->SetSortMode ( SPH_SORT_EXTENDED, $sortby );
 if ( $distinct )			$cl->SetGroupDistinct ( $distinct );
 if ( $limit )				$cl->SetLimits ( 0, $limit, ( $limit>1000 ) ? $limit : 1000 );
+$cl->SetRankingMode ( $ranker );
 $res = $cl->Query ( $q, $index );
 
 ////////////////

+ 11 - 2
src/searchd.cpp

@@ -162,7 +162,7 @@ enum SearchdCommand_e
 /// known command versions
 enum
 {
-	VER_COMMAND_SEARCH		= 0x10F,
+	VER_COMMAND_SEARCH		= 0x110,
 	VER_COMMAND_EXCERPT		= 0x100,
 	VER_COMMAND_UPDATE		= 0x100
 };
@@ -1810,7 +1810,7 @@ protected:
 
 int SearchRequestBuilder_t::CalcQueryLen ( const char * sIndexes, const CSphQuery & q ) const
 {
-	int iReqSize = 80 + 2*sizeof(SphDocID_t) + 4*q.m_iWeights
+	int iReqSize = 84 + 2*sizeof(SphDocID_t) + 4*q.m_iWeights
 		+ strlen ( q.m_sSortBy.cstr() )
 		+ strlen ( q.m_sQuery.cstr() )
 		+ strlen ( sIndexes )
@@ -1846,6 +1846,7 @@ void SearchRequestBuilder_t::SendQuery ( const char * sIndexes, NetOutputBuffer_
 	tOut.SendInt ( 0 ); // offset is 0
 	tOut.SendInt ( q.m_iMaxMatches ); // limit is MAX_MATCHES
 	tOut.SendInt ( (DWORD)q.m_eMode ); // match mode
+	tOut.SendInt ( (DWORD)q.m_eRanker ); // ranking mode
 	tOut.SendInt ( q.m_eSort ); // sort mode
 	tOut.SendString ( q.m_sSortBy.cstr() ); // sort attr
 	tOut.SendString ( q.m_sQuery.cstr() ); // query
@@ -2179,6 +2180,8 @@ bool ParseSearchQuery ( InputBuffer_c & tReq, CSphQuery & tQuery, int iVer )
 	tQuery.m_iOffset	= tReq.GetInt ();
 	tQuery.m_iLimit		= tReq.GetInt ();
 	tQuery.m_eMode		= (ESphMatchMode) tReq.GetInt ();
+	if ( iVer>=0x110 )
+		tQuery.m_eRanker= (ESphRankMode) tReq.GetInt ();
 	tQuery.m_eSort		= (ESphSortOrder) tReq.GetInt ();
 	if ( iVer<=0x101 )
 		tQuery.m_iOldGroups = tReq.GetDwords ( &tQuery.m_pOldGroups, SEARCHD_MAX_ATTR_VALUES, "invalid group count %d (should be in 0..%d range)" );
@@ -2396,6 +2399,11 @@ bool ParseSearchQuery ( InputBuffer_c & tReq, CSphQuery & tQuery, int iVer )
 		tReq.SendErrorReply ( "invalid match mode %d", tQuery.m_eMode );
 		return false;
 	}
+	// validate ranking mode received from the client
+	// SPH_RANK_TOTAL is the number of known modes, ie. one past the last valid value,
+	// so it must be rejected too (hence >= and not >)
+	if ( tQuery.m_eRanker<0 || tQuery.m_eRanker>=SPH_RANK_TOTAL )
+	{
+		tReq.SendErrorReply ( "invalid ranking mode %d", tQuery.m_eRanker );
+		return false;
+	}
 	if ( tQuery.m_iMaxMatches<1 || tQuery.m_iMaxMatches>g_iMaxMatches )
 	{
 		tReq.SendErrorReply ( "per-query max_matches=%d out of bounds (per-server max_matches=%d)",
@@ -2953,6 +2961,7 @@ void SearchHandler_c::RunSubset ( int iStart, int iEnd )
 			( qCheck.m_iWeights!=qFirst.m_iWeights ) || // weights count
 			( qCheck.m_pWeights && memcmp ( qCheck.m_pWeights, qFirst.m_pWeights, sizeof(int)*qCheck.m_iWeights ) ) || // weights
 			( qCheck.m_eMode!=qFirst.m_eMode ) || // search mode
+			( qCheck.m_eRanker!=qFirst.m_eRanker ) || // ranking mode
 			( qCheck.m_iMinID!=qFirst.m_iMinID ) || // min-id filter
 			( qCheck.m_iMaxID!=qFirst.m_iMaxID ) || // max-id filter
 			( qCheck.m_dFilters.GetLength()!=qFirst.m_dFilters.GetLength() ) || // attr filters count

+ 98 - 8
src/sphinx.cpp

@@ -3499,6 +3499,7 @@ CSphQuery::CSphQuery ()
 	, m_pWeights	( NULL )
 	, m_iWeights	( 0 )
 	, m_eMode		( SPH_MATCH_ALL )
+	, m_eRanker		( SPH_RANK_DEFAULT )
 	, m_eSort		( SPH_SORT_RELEVANCE )
 	, m_iMaxMatches	( 1000 )
 	, m_iMinID		( 0 )
@@ -9041,12 +9042,13 @@ protected:
 };
 
 
-/// ranker, which folds hitstream into simple match chunks
+/// ranker interface
+/// ranker folds incoming hitstream into simple match chunks, and computes relevance rank
 class ExtRanker_c
 {
 public:
 								ExtRanker_c ( const CSphExtendedQueryNode * pAccept, const CSphExtendedQueryNode * pReject, const CSphTermSetup & tSetup );
-	int							GetMatches ( int iFields, const int * pWeights );
+	virtual int					GetMatches ( int iFields, const int * pWeights ) = 0;
 
 	void						GetQwords ( ExtQwordsHash_t & hQwords )				{ if ( m_pRoot ) m_pRoot->GetQwords ( hQwords ); }
 	void						SetQwordsIDF ( const ExtQwordsHash_t & hQwords )	{ if ( m_pRoot ) m_pRoot->SetQwordsIDF ( hQwords ); }
@@ -9060,6 +9062,22 @@ protected:
 	const ExtHit_t *			m_pHitlist;
 };
 
+
+/// declares a concrete ranker class named ExtRanker_<_name>_c, derived from ExtRanker_c
+/// the generated class only forwards its ctor arguments to the base class,
+/// and declares a GetMatches() override (that must be defined separately)
+#define DECLARE_RANKER(_name) \
+	class ExtRanker_##_name##_c : public ExtRanker_c \
+	{ \
+	public: \
+		ExtRanker_##_name##_c ( const CSphExtendedQueryNode * pAccept, const CSphExtendedQueryNode * pReject, const CSphTermSetup & tSetup ) \
+			: ExtRanker_c ( pAccept, pReject, tSetup ) \
+		{} \
+	\
+		virtual int GetMatches ( int iFields, const int * pWeights ); \
+	};
+
+/// rankers for SPH_RANK_PROXIMITY_BM25, SPH_RANK_BM25, and SPH_RANK_NONE modes, respectively
+DECLARE_RANKER ( ProximityBM25 )
+DECLARE_RANKER ( BM25 )
+DECLARE_RANKER ( None )
+
 //////////////////////////////////////////////////////////////////////////
 
 ExtNode_i::ExtNode_i ()
@@ -9342,6 +9360,7 @@ const ExtDoc_t * ExtAnd_c::GetDocsChunk ()
 			// emit it
 			ExtDoc_t & tDoc = m_dDocs[iDoc++];	
 			tDoc.m_uDocid = pCur0->m_uDocid;
+			tDoc.m_uFields = pCur0->m_uFields | pCur1->m_uFields;
 			tDoc.m_uHitlistOffset = -1;
 			tDoc.m_fTFIDF = pCur0->m_fTFIDF + pCur1->m_fTFIDF;
 
@@ -9493,6 +9512,7 @@ const ExtDoc_t * ExtOr_c::GetDocsChunk ()
 				while ( pCur0->m_uDocid==pCur1->m_uDocid && pCur0->m_uDocid!=DOCID_MAX && iDoc<MAX_DOCS-1 )
 				{
 					m_dDocs[iDoc] = *pCur0;
+					m_dDocs[iDoc].m_uFields = pCur0->m_uFields | pCur1->m_uFields;
 					m_dDocs[iDoc].m_fTFIDF = pCur0->m_fTFIDF + pCur1->m_fTFIDF;
 					iDoc++;
 					pCur0++;
@@ -10075,7 +10095,9 @@ ExtRanker_c::ExtRanker_c ( const CSphExtendedQueryNode * pAccept, const CSphExte
 	m_pHitlist = NULL;
 }
 
-int ExtRanker_c::GetMatches ( int iFields, const int * pWeights )
+//////////////////////////////////////////////////////////////////////////
+
+int ExtRanker_ProximityBM25_c::GetMatches ( int iFields, const int * pWeights )
 {
 	if ( !m_pRoot )
 		return 0;
@@ -10186,6 +10208,62 @@ int ExtRanker_c::GetMatches ( int iFields, const int * pWeights )
 
 //////////////////////////////////////////////////////////////////////////
 
+/// BM25-only ranker
+/// fills m_dMatches with up to ExtNode_i::MAX_DOCS matches, and returns the number of matches stored
+/// weight is a sum of user weights over matched fields (scaled by SPH_BM25_SCALE) plus scaled doc TFIDF
+int ExtRanker_BM25_c::GetMatches ( int iFields, const int * pWeights )
+{
+	// nothing to evaluate
+	if ( !m_pRoot )
+		return 0;
+
+	// resume from where the previous call stopped
+	const ExtDoc_t * pCur = m_pDoclist;
+	int iFound = 0;
+
+	for ( ; iFound<ExtNode_i::MAX_DOCS; iFound++, pCur++ )
+	{
+		// fetch the next docs chunk when there is no current one, or it ran out (DOCID_MAX entry)
+		if ( !pCur || pCur->m_uDocid==DOCID_MAX )
+			pCur = m_pRoot->GetDocsChunk ();
+
+		// no more chunks; return what we have so far
+		if ( !pCur )
+		{
+			m_pDoclist = NULL;
+			return iFound;
+		}
+
+		// sum the weights of all the fields flagged in the matched fields mask
+		DWORD uFieldsRank = 0;
+		for ( int iField=0; iField<iFields; iField++ )
+			if ( ( pCur->m_uFields>>iField ) & 1 )
+				uFieldsRank += pWeights[iField];
+
+		CSphMatch & tMatch = m_dMatches[iFound];
+		tMatch.m_iDocID = pCur->m_uDocid;
+		tMatch.m_iWeight = uFieldsRank*SPH_BM25_SCALE + int( (pCur->m_fTFIDF+0.5f)*SPH_BM25_SCALE );
+	}
+
+	// output buffer is full; remember the position for the next call
+	m_pDoclist = pCur;
+	return iFound;
+}
+
+//////////////////////////////////////////////////////////////////////////
+
+/// no-op ranker, every match gets a weight of 1
+/// fills m_dMatches with up to ExtNode_i::MAX_DOCS matches, and returns the number of matches stored
+/// both arguments (fields count and per-field weights) are ignored
+int ExtRanker_None_c::GetMatches ( int, const int * )
+{
+	// nothing to evaluate
+	if ( !m_pRoot )
+		return 0;
+
+	// resume from where the previous call stopped
+	const ExtDoc_t * pCur = m_pDoclist;
+	int iFound = 0;
+
+	for ( ; iFound<ExtNode_i::MAX_DOCS; iFound++, pCur++ )
+	{
+		// fetch the next docs chunk when there is no current one, or it ran out (DOCID_MAX entry)
+		if ( !pCur || pCur->m_uDocid==DOCID_MAX )
+			pCur = m_pRoot->GetDocsChunk ();
+
+		// no more chunks; return what we have so far
+		if ( !pCur )
+		{
+			m_pDoclist = NULL;
+			return iFound;
+		}
+
+		CSphMatch & tMatch = m_dMatches[iFound];
+		tMatch.m_iDocID = pCur->m_uDocid;
+		tMatch.m_iWeight = 1;
+	}
+
+	// output buffer is full; remember the position for the next call
+	m_pDoclist = pCur;
+	return iFound;
+}
+
+//////////////////////////////////////////////////////////////////////////
+
 void CSphIndex_VLN::CheckExtendedQuery ( const CSphExtendedQueryNode * pNode, CSphQueryResult * pResult )
 {
 	ARRAY_FOREACH ( i, pNode->m_tAtom.m_dWords )
@@ -10215,11 +10293,23 @@ bool CSphIndex_VLN::MatchExtended ( const CSphQuery * pQuery, CSphQueryResult *
 	CheckExtendedQuery ( tParsed.m_pReject, pResult );
 
 	// setup eval-tree
-	ExtRanker_c tRoot ( tParsed.m_pAccept, tParsed.m_pReject, tTermSetup );
+	ExtRanker_c * pRoot = NULL;
+	switch ( pQuery->m_eRanker )
+	{
+		case SPH_RANK_PROXIMITY_BM25:	pRoot = new ExtRanker_ProximityBM25_c ( tParsed.m_pAccept, tParsed.m_pReject, tTermSetup ); break;
+		case SPH_RANK_BM25:				pRoot = new ExtRanker_BM25_c ( tParsed.m_pAccept, tParsed.m_pReject, tTermSetup ); break;
+		case SPH_RANK_NONE:				pRoot = new ExtRanker_None_c ( tParsed.m_pAccept, tParsed.m_pReject, tTermSetup ); break;
+
+		default:
+			pResult->m_sWarning.SetSprintf ( "unknown ranking mode %d; using default", (int)pQuery->m_eRanker );
+			pRoot = new ExtRanker_ProximityBM25_c ( tParsed.m_pAccept, tParsed.m_pReject, tTermSetup );
+			break;
+	}
+	assert ( pRoot );
 
 	// setup word stats and IDFs
 	ExtQwordsHash_t hQwords;
-	tRoot.GetQwords ( hQwords );
+	pRoot->GetQwords ( hQwords );
 
 	pResult->m_iNumWords = 0;
 	const int iQwords = hQwords.GetLength ();
@@ -10257,18 +10347,18 @@ bool CSphIndex_VLN::MatchExtended ( const CSphQuery * pQuery, CSphQueryResult *
 		}
 	}
 
-	tRoot.SetQwordsIDF ( hQwords );
+	pRoot->SetQwordsIDF ( hQwords );
 
 	// do searching
 	for ( ;; )
 	{
-		int iMatches = tRoot.GetMatches ( m_iWeights, m_dWeights );
+		int iMatches = pRoot->GetMatches ( m_iWeights, m_dWeights );
 		if ( iMatches<=0 )
 			break;
 
 		for ( int i=0; i<iMatches; i++ )
 		{
-			CSphMatch & tMatch = tRoot.m_dMatches[i];
+			CSphMatch & tMatch = pRoot->m_dMatches[i];
 
 			// early reject by group id, doc id or timestamp
 			if ( m_bEarlyLookup )

+ 13 - 0
src/sphinx.h

@@ -1090,6 +1090,18 @@ enum ESphMatchMode
 };
 
 
+/// search query relevance ranking mode
+/// numeric values are read from the network request in searchd (cast from an int),
+/// and mirrored by SPH_RANK_xxx constants in api/sphinxapi.php, so they must stay in sync
+enum ESphRankMode
+{
+	SPH_RANK_PROXIMITY_BM25		= 0,	///< default mode, phrase proximity major factor and BM25 minor one
+	SPH_RANK_BM25				= 1,	///< statistical mode, BM25 ranking only (faster but worse quality)
+	SPH_RANK_NONE				= 2,	///< no ranking, all matches get a weight of 1
+
+	SPH_RANK_TOTAL,	///< number of known modes (one past the last valid value); not a valid mode itself
+	SPH_RANK_DEFAULT			= SPH_RANK_PROXIMITY_BM25	///< alias for the mode CSphQuery is constructed with
+};
+
+
 /// search query grouping mode
 enum ESphGroupBy
 {
@@ -1170,6 +1182,7 @@ public:
 	DWORD *			m_pWeights;		///< user-supplied per-field weights. may be NULL. default is NULL. NOT OWNED, WILL NOT BE FREED in dtor.
 	int				m_iWeights;		///< number of user-supplied weights. missing fields will be assigned weight 1. default is 0
 	ESphMatchMode	m_eMode;		///< match mode. default is "match all"
+	ESphRankMode	m_eRanker;		///< ranking mode, default is proximity+BM25
 	ESphSortOrder	m_eSort;		///< sort mode
 	CSphString		m_sSortBy;		///< attribute to sort by
 	int				m_iMaxMatches;	///< max matches to retrieve, default is 1000. more matches use more memory and CPU time to hold and sort them