indexformat.cpp 19 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690
  1. //
  2. // Copyright (c) 2017-2020, Manticore Software LTD (http://manticoresearch.com)
  3. // Copyright (c) 2001-2016, Andrew Aksyonoff
  4. // Copyright (c) 2008-2016, Sphinx Technologies Inc
  5. // All rights reserved
  6. //
  7. // This program is free software; you can redistribute it and/or modify
  8. // it under the terms of the GNU General Public License. You should have
  9. // received a copy of the GPL license along with this program; if you
  10. // did not, you can find it at http://www.gnu.org/
  11. //
  12. #include "indexformat.h"
  13. // let uDocs be DWORD here to prevent int overflow in case of hitless word (highest bit is 1)
  14. int DoclistHintUnpack ( DWORD uDocs, BYTE uHint )
  15. {
  16. if ( uDocs<(DWORD)DOCLIST_HINT_THRESH )
  17. return (int)Min ( 8*(int64_t)uDocs, INT_MAX );
  18. else
  19. return (int)Min ( 4*(int64_t)uDocs+( int64_t(uDocs)*uHint/64 ), INT_MAX );
  20. }
  21. //////////////////////////////////////////////////////////////////////////
  22. DiskIndexQwordTraits_c::DiskIndexQwordTraits_c ( bool bUseMini, bool bExcluded )
  23. {
  24. m_bExcluded = bExcluded;
  25. if ( bUseMini )
  26. {
  27. m_pDocsBuf = m_dDoclistBuf;
  28. m_pHitsBuf = m_dHitlistBuf;
  29. }
  30. }
  31. void DiskIndexQwordTraits_c::SetDocReader ( DataReaderFactory_c * pReader )
  32. {
  33. if ( !pReader )
  34. return;
  35. m_rdDoclist = pReader->MakeReader ( m_pDocsBuf, MINIBUFFER_LEN );
  36. }
  37. void DiskIndexQwordTraits_c::SetHitReader ( DataReaderFactory_c * pReader )
  38. {
  39. if ( !pReader )
  40. return;
  41. m_rdHitlist = pReader->MakeReader ( m_pHitsBuf, MINIBUFFER_LEN );
  42. }
  43. void DiskIndexQwordTraits_c::ResetDecoderState ()
  44. {
  45. ISphQword::Reset();
  46. m_uHitPosition = 0;
  47. m_uInlinedHit = 0;
  48. m_uHitState = 0;
  49. m_tDoc.m_tRowID = INVALID_ROWID;
  50. m_iHitPos = EMPTY_HIT;
  51. }
  52. //////////////////////////////////////////////////////////////////////////
  53. class CheckpointReader_c
  54. {
  55. public:
  56. const BYTE * ReadEntry ( const BYTE * pBuf, CSphWordlistCheckpoint & tCP ) const
  57. {
  58. tCP.m_uWordID = (SphWordID_t)sphUnalignedRead ( *(SphOffset_t *)pBuf );
  59. pBuf += sizeof(SphOffset_t);
  60. tCP.m_iWordlistOffset = sphUnalignedRead ( *(SphOffset_t *)pBuf );
  61. pBuf += sizeof(SphOffset_t);
  62. return pBuf;
  63. }
  64. int GetStride() const { return m_iSrcStride; }
  65. private:
  66. int m_iSrcStride = 2*sizeof(SphOffset_t);
  67. };
  68. struct MappedCheckpoint_fn : public ISphNoncopyable
  69. {
  70. const CSphWordlistCheckpoint * m_pDstStart;
  71. const BYTE * m_pSrcStart;
  72. const CheckpointReader_c * m_pReader;
  73. MappedCheckpoint_fn ( const CSphWordlistCheckpoint * pDstStart, const BYTE * pSrcStart, const CheckpointReader_c * pReader )
  74. : m_pDstStart ( pDstStart )
  75. , m_pSrcStart ( pSrcStart )
  76. , m_pReader ( pReader )
  77. {}
  78. CSphWordlistCheckpoint operator() ( const CSphWordlistCheckpoint * pCP ) const
  79. {
  80. assert ( m_pDstStart<=pCP );
  81. const BYTE * pCur = ( pCP - m_pDstStart ) * m_pReader->GetStride() + m_pSrcStart;
  82. CSphWordlistCheckpoint tEntry;
  83. m_pReader->ReadEntry ( pCur, tEntry );
  84. return tEntry;
  85. }
  86. };
  87. //////////////////////////////////////////////////////////////////////////
  // One expanded keyword kept by name: the name lives in a shared byte buffer
  // (see DictEntryDiskPayload_t::m_dWordBuf) as a length byte plus characters.
  struct DiskExpandedEntry_t
  {
      int m_iNameOff;     // offset of the length-prefixed name in the word buffer
      int m_iDocs;        // document count copied from the dictionary entry
      int m_iHits;        // hit count copied from the dictionary entry
  };
  // One expanded keyword kept as a doclist reference instead of by name.
  struct DiskExpandedPayload_t
  {
      int m_iDocs;                // document count copied from the dictionary entry
      int m_iHits;                // hit count copied from the dictionary entry
      uint64_t m_uDoclistOff;     // doclist offset copied from the dictionary entry
      int m_iDoclistHint;         // doclist hint copied from the dictionary entry
  };
  // A 64-bit offset paired with a 32-bit length.
  struct Slice64_t
  {
      uint64_t m_uOff;    // offset
      int m_iLen;         // length
  };
  106. struct DiskSubstringPayload_t : public ISphSubstringPayload
  107. {
  108. explicit DiskSubstringPayload_t ( int iDoclists )
  109. : m_dDoclist ( iDoclists )
  110. {}
  111. CSphFixedVector<Slice64_t> m_dDoclist;
  112. };
  113. //////////////////////////////////////////////////////////////////////////
  114. struct DictEntryDiskPayload_t
  115. {
  116. DictEntryDiskPayload_t ( bool bPayload, ESphHitless eHitless )
  117. {
  118. m_bPayload = bPayload;
  119. m_eHitless = eHitless;
  120. if ( bPayload )
  121. m_dWordPayload.Reserve ( 1000 );
  122. m_dWordExpand.Reserve ( 1000 );
  123. m_dWordBuf.Reserve ( 8096 );
  124. }
  125. void Add ( const CSphDictEntry & tWord, int iWordLen )
  126. {
  127. if ( !m_bPayload || !sphIsExpandedPayload ( tWord.m_iDocs, tWord.m_iHits ) ||
  128. m_eHitless==SPH_HITLESS_ALL || ( m_eHitless==SPH_HITLESS_SOME && ( tWord.m_iDocs & HITLESS_DOC_FLAG )!=0 ) ) // FIXME!!! do we need hitless=some as payloads?
  129. {
  130. DiskExpandedEntry_t & tExpand = m_dWordExpand.Add();
  131. int iOff = m_dWordBuf.GetLength();
  132. tExpand.m_iNameOff = iOff;
  133. tExpand.m_iDocs = tWord.m_iDocs;
  134. tExpand.m_iHits = tWord.m_iHits;
  135. m_dWordBuf.Resize ( iOff + iWordLen + 1 );
  136. memcpy ( m_dWordBuf.Begin() + iOff + 1, tWord.m_sKeyword, iWordLen );
  137. m_dWordBuf[iOff] = (BYTE)iWordLen;
  138. } else
  139. {
  140. DiskExpandedPayload_t & tExpand = m_dWordPayload.Add();
  141. tExpand.m_iDocs = tWord.m_iDocs;
  142. tExpand.m_iHits = tWord.m_iHits;
  143. tExpand.m_uDoclistOff = tWord.m_iDoclistOffset;
  144. tExpand.m_iDoclistHint = tWord.m_iDoclistHint;
  145. }
  146. }
  147. void Convert ( ISphWordlist::Args_t & tArgs )
  148. {
  149. if ( !m_dWordExpand.GetLength() && !m_dWordPayload.GetLength() )
  150. return;
  151. int iTotalDocs = 0;
  152. int iTotalHits = 0;
  153. if ( m_dWordExpand.GetLength() )
  154. {
  155. LimitExpanded ( tArgs.m_iExpansionLimit, m_dWordExpand );
  156. const BYTE * sBase = m_dWordBuf.Begin();
  157. ARRAY_FOREACH ( i, m_dWordExpand )
  158. {
  159. const DiskExpandedEntry_t & tCur = m_dWordExpand[i];
  160. int iDocs = tCur.m_iDocs;
  161. if ( m_eHitless==SPH_HITLESS_SOME )
  162. iDocs = ( tCur.m_iDocs & HITLESS_DOC_MASK );
  163. tArgs.AddExpanded ( sBase + tCur.m_iNameOff + 1, sBase[tCur.m_iNameOff], iDocs, tCur.m_iHits );
  164. iTotalDocs += iDocs;
  165. iTotalHits += tCur.m_iHits;
  166. }
  167. }
  168. if ( m_dWordPayload.GetLength() )
  169. {
  170. LimitExpanded ( tArgs.m_iExpansionLimit, m_dWordPayload );
  171. DiskSubstringPayload_t * pPayload = new DiskSubstringPayload_t ( m_dWordPayload.GetLength() );
  172. // sorting by ascending doc-list offset gives some (15%) speed-up too
  173. sphSort ( m_dWordPayload.Begin(), m_dWordPayload.GetLength(), bind ( &DiskExpandedPayload_t::m_uDoclistOff ) );
  174. ARRAY_FOREACH ( i, m_dWordPayload )
  175. {
  176. const DiskExpandedPayload_t & tCur = m_dWordPayload[i];
  177. assert ( m_eHitless==SPH_HITLESS_NONE || ( m_eHitless==SPH_HITLESS_SOME && ( tCur.m_iDocs & HITLESS_DOC_FLAG )==0 ) );
  178. iTotalDocs += tCur.m_iDocs;
  179. iTotalHits += tCur.m_iHits;
  180. pPayload->m_dDoclist[i].m_uOff = tCur.m_uDoclistOff;
  181. pPayload->m_dDoclist[i].m_iLen = tCur.m_iDoclistHint;
  182. }
  183. pPayload->m_iTotalDocs = iTotalDocs;
  184. pPayload->m_iTotalHits = iTotalHits;
  185. tArgs.m_pPayload = pPayload;
  186. }
  187. tArgs.m_iTotalDocs = iTotalDocs;
  188. tArgs.m_iTotalHits = iTotalHits;
  189. }
  190. // sort expansions by frequency desc
  191. // clip the less frequent ones if needed, as they are likely misspellings
  192. template < typename T >
  193. void LimitExpanded ( int iExpansionLimit, CSphVector<T> & dVec ) const
  194. {
  195. if ( !iExpansionLimit || dVec.GetLength()<=iExpansionLimit )
  196. return;
  197. sphSort ( dVec.Begin(), dVec.GetLength(), ExpandedOrderDesc_T<T>() );
  198. dVec.Resize ( iExpansionLimit );
  199. }
  200. bool m_bPayload;
  201. ESphHitless m_eHitless;
  202. CSphVector<DiskExpandedEntry_t> m_dWordExpand;
  203. CSphVector<DiskExpandedPayload_t> m_dWordPayload;
  204. CSphVector<BYTE> m_dWordBuf;
  205. };
  206. //////////////////////////////////////////////////////////////////////////
  207. CWordlist::~CWordlist ()
  208. {
  209. Reset();
  210. }
  211. void CWordlist::Reset ()
  212. {
  213. m_tBuf.Reset ();
  214. m_dCheckpoints.Reset ( 0 );
  215. m_pWords.Reset ( 0 );
  216. SafeDeleteArray ( m_pInfixBlocksWords );
  217. SafeDelete ( m_pCpReader );
  218. }
  219. bool CWordlist::Preread ( const CSphString & sName, bool bWordDict, int iSkiplistBlockSize, CSphString & sError )
  220. {
  221. assert ( m_iDictCheckpointsOffset>0 );
  222. m_bWordDict = bWordDict;
  223. m_iWordsEnd = m_iDictCheckpointsOffset; // set wordlist end
  224. m_iSkiplistBlockSize = iSkiplistBlockSize;
  225. ////////////////////////////
  226. // preload word checkpoints
  227. ////////////////////////////
  228. ////////////////////////////
  229. // fast path for CRC checkpoints - just maps data and use inplace CP reader
  230. if ( !bWordDict )
  231. {
  232. if ( !m_tBuf.Setup ( sName, sError ) )
  233. return false;
  234. m_pCpReader = new CheckpointReader_c;
  235. return true;
  236. }
  237. ////////////////////////////
  238. // regular path that loads checkpoints data
  239. CSphAutoreader tReader;
  240. if ( !tReader.Open ( sName, sError ) )
  241. return false;
  242. int64_t iFileSize = tReader.GetFilesize();
  243. int iCheckpointOnlySize = (int)(iFileSize-m_iDictCheckpointsOffset);
  244. if ( m_iInfixCodepointBytes && m_iInfixBlocksOffset )
  245. iCheckpointOnlySize = (int)(m_iInfixBlocksOffset - strlen ( g_sTagInfixBlocks ) - m_iDictCheckpointsOffset);
  246. if ( iFileSize-m_iDictCheckpointsOffset>=UINT_MAX )
  247. {
  248. sError.SetSprintf ( "dictionary meta overflow: meta size=" INT64_FMT ", total size=" INT64_FMT ", meta offset=" INT64_FMT,
  249. iFileSize-m_iDictCheckpointsOffset, iFileSize, (int64_t)m_iDictCheckpointsOffset );
  250. return false;
  251. }
  252. tReader.SeekTo ( m_iDictCheckpointsOffset, iCheckpointOnlySize );
  253. assert ( m_bWordDict );
  254. int iArenaSize = iCheckpointOnlySize
  255. - (sizeof(DWORD)+sizeof(SphOffset_t))*m_dCheckpoints.GetLength()
  256. + sizeof(BYTE)*m_dCheckpoints.GetLength();
  257. assert ( iArenaSize>=0 );
  258. m_pWords.Reset ( iArenaSize );
  259. BYTE * pWord = m_pWords.Begin();
  260. for ( auto & dCheckpoint : m_dCheckpoints )
  261. {
  262. dCheckpoint.m_sWord = (char *)pWord;
  263. const int iLen = tReader.GetDword();
  264. assert ( iLen>0 );
  265. assert ( iLen + 1 + ( pWord - m_pWords.Begin() )<=iArenaSize );
  266. tReader.GetBytes ( pWord, iLen );
  267. pWord[iLen] = '\0';
  268. pWord += iLen+1;
  269. dCheckpoint.m_iWordlistOffset = tReader.GetOffset();
  270. }
  271. ////////////////////////
  272. // preload infix blocks
  273. ////////////////////////
  274. if ( m_iInfixCodepointBytes && m_iInfixBlocksOffset )
  275. {
  276. // reading to vector as old version doesn't store total infix words length
  277. CSphTightVector<BYTE> dInfixWords;
  278. dInfixWords.Reserve ( (int)m_iInfixBlocksWordsSize );
  279. tReader.SeekTo ( m_iInfixBlocksOffset, (int)(iFileSize-m_iInfixBlocksOffset) );
  280. m_dInfixBlocks.Resize ( tReader.UnzipInt() );
  281. for ( auto & dInfixBlock : m_dInfixBlocks )
  282. {
  283. int iBytes = tReader.UnzipInt();
  284. int iOff = dInfixWords.GetLength();
  285. dInfixBlock.m_iInfixOffset = (DWORD) iOff; /// FIXME! name convention of m_iInfixOffset
  286. dInfixWords.Resize ( iOff+iBytes+1 );
  287. tReader.GetBytes ( dInfixWords.Begin()+iOff, iBytes );
  288. dInfixWords[iOff+iBytes] = '\0';
  289. dInfixBlock.m_iOffset = tReader.UnzipInt();
  290. }
  291. // fix-up offset to pointer
  292. m_pInfixBlocksWords = dInfixWords.LeakData();
  293. ARRAY_FOREACH ( i, m_dInfixBlocks )
  294. m_dInfixBlocks[i].m_sInfix = (const char *)m_pInfixBlocksWords + m_dInfixBlocks[i].m_iInfixOffset;
  295. // FIXME!!! store and load that explicitly
  296. if ( m_dInfixBlocks.GetLength() )
  297. m_iWordsEnd = m_dInfixBlocks.Begin()->m_iOffset - strlen ( g_sTagInfixEntries );
  298. else
  299. m_iWordsEnd -= strlen ( g_sTagInfixEntries );
  300. }
  301. if ( tReader.GetErrorFlag() )
  302. {
  303. sError = tReader.GetErrorMessage();
  304. return false;
  305. }
  306. tReader.Close();
  307. // mapping up only wordlist without meta (checkpoints, infixes, etc)
  308. if ( !m_tBuf.Setup ( sName, sError ) )
  309. return false;
  310. return true;
  311. }
  312. void CWordlist::DebugPopulateCheckpoints()
  313. {
  314. if ( !m_pCpReader )
  315. return;
  316. const BYTE * pCur = m_tBuf.GetWritePtr() + m_iDictCheckpointsOffset;
  317. ARRAY_FOREACH ( i, m_dCheckpoints )
  318. pCur = m_pCpReader->ReadEntry ( pCur, m_dCheckpoints[i] );
  319. SafeDelete(m_pCpReader);
  320. }
  321. const CSphWordlistCheckpoint * CWordlist::FindCheckpoint ( const char * sWord, int iWordLen, SphWordID_t iWordID, bool bStarMode ) const
  322. {
  323. if ( m_pCpReader ) // FIXME!!! fall to regular checkpoints after data got read
  324. {
  325. MappedCheckpoint_fn tPred ( m_dCheckpoints.Begin(), m_tBuf.GetWritePtr() + m_iDictCheckpointsOffset, m_pCpReader );
  326. return sphSearchCheckpoint ( sWord, iWordLen, iWordID, bStarMode, m_bWordDict, m_dCheckpoints.Begin(), &m_dCheckpoints.Last(), tPred );
  327. }
  328. return sphSearchCheckpoint ( sWord, iWordLen, iWordID, bStarMode, m_bWordDict, m_dCheckpoints.Begin(), &m_dCheckpoints.Last() );
  329. }
  330. bool CWordlist::GetWord ( const BYTE * pBuf, SphWordID_t iWordID, CSphDictEntry & tWord ) const
  331. {
  332. SphWordID_t iLastID = 0;
  333. SphOffset_t uLastOff = 0;
  334. while (true)
  335. {
  336. // unpack next word ID
  337. const SphWordID_t iDeltaWord = sphUnzipWordid ( pBuf ); // FIXME! slow with 32bit wordids
  338. if ( iDeltaWord==0 ) // wordlist chunk is over
  339. return false;
  340. iLastID += iDeltaWord;
  341. // list is sorted, so if there was no match, there's no such word
  342. if ( iLastID>iWordID )
  343. return false;
  344. // unpack next offset
  345. const SphOffset_t iDeltaOffset = sphUnzipOffset ( pBuf );
  346. uLastOff += iDeltaOffset;
  347. // unpack doc/hit count
  348. const int iDocs = sphUnzipInt ( pBuf );
  349. const int iHits = sphUnzipInt ( pBuf );
  350. SphOffset_t iSkiplistPos = 0;
  351. if ( iDocs > m_iSkiplistBlockSize )
  352. iSkiplistPos = sphUnzipOffset ( pBuf );
  353. assert ( iDeltaOffset );
  354. assert ( iDocs );
  355. assert ( iHits );
  356. // it matches?!
  357. if ( iLastID==iWordID )
  358. {
  359. sphUnzipWordid ( pBuf ); // might be 0 at checkpoint
  360. const SphOffset_t iDoclistLen = sphUnzipOffset ( pBuf );
  361. tWord.m_iDoclistOffset = uLastOff;
  362. tWord.m_iDocs = iDocs;
  363. tWord.m_iHits = iHits;
  364. tWord.m_iDoclistHint = (int)iDoclistLen;
  365. tWord.m_iSkiplistOffset = iSkiplistPos;
  366. return true;
  367. }
  368. }
  369. }
  370. const BYTE * CWordlist::AcquireDict ( const CSphWordlistCheckpoint * pCheckpoint ) const
  371. {
  372. assert ( pCheckpoint );
  373. assert ( m_dCheckpoints.GetLength() );
  374. assert ( pCheckpoint>=m_dCheckpoints.Begin() && pCheckpoint<=&m_dCheckpoints.Last() );
  375. SphOffset_t iOff = pCheckpoint->m_iWordlistOffset;
  376. if ( m_pCpReader )
  377. {
  378. MappedCheckpoint_fn tPred ( m_dCheckpoints.Begin(), m_tBuf.GetWritePtr() + m_iDictCheckpointsOffset, m_pCpReader );
  379. iOff = tPred ( pCheckpoint ).m_iWordlistOffset;
  380. }
  381. assert ( !m_tBuf.IsEmpty() );
  382. assert ( iOff>0 && iOff<(int64_t)m_tBuf.GetLengthBytes() );
  383. return m_tBuf.GetWritePtr()+iOff;
  384. }
  385. void CWordlist::GetPrefixedWords ( const char * sSubstring, int iSubLen, const char * sWildcard, Args_t & tArgs ) const
  386. {
  387. assert ( sSubstring && *sSubstring && iSubLen>0 );
  388. // empty index?
  389. if ( !m_dCheckpoints.GetLength() )
  390. return;
  391. DictEntryDiskPayload_t tDict2Payload ( tArgs.m_bPayload, tArgs.m_eHitless );
  392. int dWildcard [ SPH_MAX_WORD_LEN + 1 ];
  393. int * pWildcard = ( sphIsUTF8 ( sWildcard ) && sphUTF8ToWideChar ( sWildcard, dWildcard, SPH_MAX_WORD_LEN ) ) ? dWildcard : NULL;
  394. const CSphWordlistCheckpoint * pCheckpoint = FindCheckpoint ( sSubstring, iSubLen, 0, true );
  395. const int iSkipMagic = ( BYTE(*sSubstring)<0x20 ); // whether to skip heading magic chars in the prefix, like NONSTEMMED maker
  396. while ( pCheckpoint )
  397. {
  398. // decode wordlist chunk
  399. KeywordsBlockReader_c tDictReader ( AcquireDict ( pCheckpoint ), m_iSkiplistBlockSize );
  400. while ( tDictReader.UnpackWord() )
  401. {
  402. // block is sorted
  403. // so once keywords are greater than the prefix, no more matches
  404. int iCmp = sphDictCmp ( sSubstring, iSubLen, (const char *)tDictReader.m_sKeyword, tDictReader.GetWordLen() );
  405. if ( iCmp<0 )
  406. break;
  407. if ( sphInterrupted() )
  408. break;
  409. // does it match the prefix *and* the entire wildcard?
  410. if ( iCmp==0 && sphWildcardMatch ( (const char *)tDictReader.m_sKeyword + iSkipMagic, sWildcard, pWildcard ) )
  411. tDict2Payload.Add ( tDictReader, tDictReader.GetWordLen() );
  412. }
  413. if ( sphInterrupted () )
  414. break;
  415. pCheckpoint++;
  416. if ( pCheckpoint > &m_dCheckpoints.Last() )
  417. break;
  418. if ( sphDictCmp ( sSubstring, iSubLen, pCheckpoint->m_sWord, strlen ( pCheckpoint->m_sWord ) )<0 )
  419. break;
  420. }
  421. tDict2Payload.Convert ( tArgs );
  422. }
  423. void CWordlist::GetInfixedWords ( const char * sSubstring, int iSubLen, const char * sWildcard, Args_t & tArgs ) const
  424. {
  425. // dict must be of keywords type, and fully cached
  426. // mmap()ed in the worst case, should we ever banish it to disk again
  427. if ( m_tBuf.IsEmpty() || !m_dCheckpoints.GetLength() )
  428. return;
  429. assert ( !m_pCpReader );
  430. // extract key1, upto 6 chars from infix start
  431. int iBytes1 = sphGetInfixLength ( sSubstring, iSubLen, m_iInfixCodepointBytes );
  432. // lookup key1
  433. // OPTIMIZE? maybe lookup key2 and reduce checkpoint set size, if possible?
  434. CSphVector<DWORD> dPoints;
  435. if ( !sphLookupInfixCheckpoints ( sSubstring, iBytes1, m_tBuf.GetWritePtr(), m_dInfixBlocks, m_iInfixCodepointBytes, dPoints ) )
  436. return;
  437. DictEntryDiskPayload_t tDict2Payload ( tArgs.m_bPayload, tArgs.m_eHitless );
  438. const int iSkipMagic = ( tArgs.m_bHasExactForms ? 1 : 0 ); // whether to skip heading magic chars in the prefix, like NONSTEMMED maker
  439. int dWildcard [ SPH_MAX_WORD_LEN + 1 ];
  440. int * pWildcard = ( sphIsUTF8 ( sWildcard ) && sphUTF8ToWideChar ( sWildcard, dWildcard, SPH_MAX_WORD_LEN ) ) ? dWildcard : NULL;
  441. // walk those checkpoints, check all their words
  442. ARRAY_FOREACH ( i, dPoints )
  443. {
  444. // OPTIMIZE? add a quicker path than a generic wildcard for "*infix*" case?
  445. KeywordsBlockReader_c tDictReader ( m_tBuf.GetWritePtr() + m_dCheckpoints[dPoints[i]-1].m_iWordlistOffset, m_iSkiplistBlockSize );
  446. while ( tDictReader.UnpackWord() )
  447. {
  448. if ( sphInterrupted () )
  449. break;
  450. // stemmed terms should not match suffixes
  451. if ( tArgs.m_bHasExactForms && *tDictReader.m_sKeyword!=MAGIC_WORD_HEAD_NONSTEMMED )
  452. continue;
  453. if ( sphWildcardMatch ( (const char *)tDictReader.m_sKeyword+iSkipMagic, sWildcard, pWildcard ) )
  454. tDict2Payload.Add ( tDictReader, tDictReader.GetWordLen() );
  455. }
  456. if ( sphInterrupted () )
  457. break;
  458. }
  459. tDict2Payload.Convert ( tArgs );
  460. }
  461. void CWordlist::SuffixGetChekpoints ( const SuggestResult_t & , const char * sSuffix, int iLen, CSphVector<DWORD> & dCheckpoints ) const
  462. {
  463. sphLookupInfixCheckpoints ( sSuffix, iLen, m_tBuf.GetWritePtr(), m_dInfixBlocks, m_iInfixCodepointBytes, dCheckpoints );
  464. }
  465. void CWordlist::SetCheckpoint ( SuggestResult_t & tRes, DWORD iCP ) const
  466. {
  467. assert ( tRes.m_pWordReader );
  468. KeywordsBlockReader_c * pReader = (KeywordsBlockReader_c *)tRes.m_pWordReader;
  469. pReader->Reset ( m_tBuf.GetWritePtr() + m_dCheckpoints[iCP-1].m_iWordlistOffset );
  470. }
  471. bool CWordlist::ReadNextWord ( SuggestResult_t & tRes, DictWord_t & tWord ) const
  472. {
  473. KeywordsBlockReader_c * pReader = (KeywordsBlockReader_c *)tRes.m_pWordReader;
  474. if ( !pReader->UnpackWord() )
  475. return false;
  476. tWord.m_sWord = pReader->GetWord();
  477. tWord.m_iLen = pReader->GetWordLen();
  478. tWord.m_iDocs = pReader->m_iDocs;
  479. return true;
  480. }
  481. //////////////////////////////////////////////////////////////////////////
  482. KeywordsBlockReader_c::KeywordsBlockReader_c ( const BYTE * pBuf, int iSkiplistBlockSize )
  483. : m_iSkiplistBlockSize ( iSkiplistBlockSize )
  484. {
  485. Reset ( pBuf );
  486. }
  487. void KeywordsBlockReader_c::Reset ( const BYTE * pBuf )
  488. {
  489. m_pBuf = pBuf;
  490. m_sWord[0] = '\0';
  491. m_iLen = 0;
  492. m_sKeyword = m_sWord;
  493. }
  494. bool KeywordsBlockReader_c::UnpackWord()
  495. {
  496. if ( !m_pBuf )
  497. return false;
  498. assert ( m_iSkiplistBlockSize>0 );
  499. // unpack next word
  500. // must be in sync with DictEnd()!
  501. BYTE uPack = *m_pBuf++;
  502. if ( !uPack )
  503. {
  504. // ok, this block is over
  505. m_pBuf = NULL;
  506. m_iLen = 0;
  507. return false;
  508. }
  509. int iMatch, iDelta;
  510. if ( uPack & 0x80 )
  511. {
  512. iDelta = ( ( uPack>>4 ) & 7 ) + 1;
  513. iMatch = uPack & 15;
  514. } else
  515. {
  516. iDelta = uPack & 127;
  517. iMatch = *m_pBuf++;
  518. }
  519. assert ( iMatch+iDelta<(int)sizeof(m_sWord)-1 );
  520. assert ( iMatch<=(int)strlen ( (char *)m_sWord ) );
  521. memcpy ( m_sWord + iMatch, m_pBuf, iDelta );
  522. m_pBuf += iDelta;
  523. m_iLen = iMatch + iDelta;
  524. m_sWord[m_iLen] = '\0';
  525. m_iDoclistOffset = sphUnzipOffset ( m_pBuf );
  526. m_iDocs = sphUnzipInt ( m_pBuf );
  527. m_iHits = sphUnzipInt ( m_pBuf );
  528. m_uHint = ( m_iDocs>=DOCLIST_HINT_THRESH ) ? *m_pBuf++ : 0;
  529. m_iDoclistHint = DoclistHintUnpack ( m_iDocs, m_uHint );
  530. if ( m_iDocs > m_iSkiplistBlockSize )
  531. m_iSkiplistOffset = sphUnzipOffset ( m_pBuf );
  532. else
  533. m_iSkiplistOffset = 0;
  534. assert ( m_iLen>0 );
  535. return true;
  536. }