indexformat.cpp 25 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855
  1. //
  2. // Copyright (c) 2017-2026, Manticore Software LTD (https://manticoresearch.com)
  3. // Copyright (c) 2001-2016, Andrew Aksyonoff
  4. // Copyright (c) 2008-2016, Sphinx Technologies Inc
  5. // All rights reserved
  6. //
  7. // This program is free software; you can redistribute it and/or modify
  8. // it under the terms of the GNU General Public License. You should have
  9. // received a copy of the GPL license along with this program; if you
  10. // did not, you can find it at http://www.gnu.org/
  11. //
  12. #include "indexformat.h"
  13. #if WITH_RE2
  14. #include <string>
  15. #include <re2/re2.h>
  16. #endif
  17. // let uDocs be DWORD here to prevent int overflow in case of hitless word (highest bit is 1)
  18. int DoclistHintUnpack ( DWORD uDocs, BYTE uHint )
  19. {
  20. if ( uDocs<(DWORD)DOCLIST_HINT_THRESH )
  21. return (int)Min ( 8*(int64_t)uDocs, INT_MAX );
  22. else
  23. return (int)Min ( 4*(int64_t)uDocs+( int64_t(uDocs)*uHint/64 ), INT_MAX );
  24. }
  25. //////////////////////////////////////////////////////////////////////////
  26. DiskIndexQwordTraits_c::DiskIndexQwordTraits_c ( bool bUseMini, bool bExcluded )
  27. {
  28. m_bExcluded = bExcluded;
  29. if ( bUseMini )
  30. {
  31. m_pDocsBuf = m_dDoclistBuf;
  32. m_pHitsBuf = m_dHitlistBuf;
  33. }
  34. }
  35. void DiskIndexQwordTraits_c::SetDocReader ( DataReaderFactory_c * pReader )
  36. {
  37. if ( !pReader )
  38. return;
  39. m_rdDoclist = pReader->MakeReader ( m_pDocsBuf, MINIBUFFER_LEN );
  40. }
  41. void DiskIndexQwordTraits_c::SetHitReader ( DataReaderFactory_c * pReader )
  42. {
  43. if ( !pReader )
  44. return;
  45. m_rdHitlist = pReader->MakeReader ( m_pHitsBuf, MINIBUFFER_LEN );
  46. }
  47. void DiskIndexQwordTraits_c::ResetDecoderState ()
  48. {
  49. ISphQword::Reset();
  50. m_uHitPosition = 0;
  51. m_uInlinedHit = 0;
  52. m_uHitState = 0;
  53. m_tDoc.m_tRowID = INVALID_ROWID;
  54. m_iHitPos = EMPTY_HIT;
  55. }
  56. //////////////////////////////////////////////////////////////////////////
  57. class CheckpointReader_c
  58. {
  59. public:
  60. const BYTE * ReadEntry ( const BYTE * pBuf, CSphWordlistCheckpoint & tCP ) const
  61. {
  62. tCP.m_uWordID = (SphWordID_t)sphUnalignedRead ( *(SphOffset_t *)pBuf );
  63. pBuf += sizeof(SphOffset_t);
  64. tCP.m_iWordlistOffset = sphUnalignedRead ( *(SphOffset_t *)pBuf );
  65. pBuf += sizeof(SphOffset_t);
  66. return pBuf;
  67. }
  68. int GetStride() const { return m_iSrcStride; }
  69. private:
  70. int m_iSrcStride = 2*sizeof(SphOffset_t);
  71. };
  72. struct MappedCheckpoint_fn : public ISphNoncopyable
  73. {
  74. const CSphWordlistCheckpoint * m_pDstStart;
  75. const BYTE * m_pSrcStart;
  76. const CheckpointReader_c * m_pReader;
  77. MappedCheckpoint_fn ( const CSphWordlistCheckpoint * pDstStart, const BYTE * pSrcStart, const CheckpointReader_c * pReader )
  78. : m_pDstStart ( pDstStart )
  79. , m_pSrcStart ( pSrcStart )
  80. , m_pReader ( pReader )
  81. {}
  82. CSphWordlistCheckpoint operator() ( const CSphWordlistCheckpoint * pCP ) const
  83. {
  84. assert ( m_pDstStart<=pCP );
  85. const BYTE * pCur = ( pCP - m_pDstStart ) * m_pReader->GetStride() + m_pSrcStart;
  86. CSphWordlistCheckpoint tEntry;
  87. m_pReader->ReadEntry ( pCur, tEntry );
  88. return tEntry;
  89. }
  90. };
  91. //////////////////////////////////////////////////////////////////////////
/// One expanded dictionary keyword: where its name is stored plus its doc/hit counts.
struct DiskExpandedEntry_t
{
	int m_iNameOff;	///< offset of the keyword record (length byte, then keyword bytes) in the keyword buffer
	int m_iDocs;	///< document count as stored in the dictionary entry
	int m_iHits;	///< hit count as stored in the dictionary entry
};
/// One dictionary entry collected for a merged payload: counts plus the doclist location.
struct DiskExpandedPayload_t
{
	int m_iDocs;			///< document count as stored in the dictionary entry
	int m_iHits;			///< hit count as stored in the dictionary entry
	uint64_t m_uDoclistOff;	///< doclist offset of the entry
	int m_iDoclistHint;		///< doclist hint of the entry
};
/// A slice described by a 64-bit offset and an int length.
struct Slice64_t
{
	uint64_t m_uOff;	///< slice offset
	int m_iLen;			///< slice length
};
/// Substring payload that carries a fixed-size list of doclist slices.
struct DiskSubstringPayload_t : public ISphSubstringPayload
{
	/// allocate room for iDoclists slices
	explicit DiskSubstringPayload_t ( int iDoclists )
	: m_dDoclist ( iDoclists )
	{}
	CSphFixedVector<Slice64_t> m_dDoclist;	///< one slice per doclist
};
  117. //////////////////////////////////////////////////////////////////////////
  118. struct DictEntryDiskPayload_t : public DictTerm2Expanded_i
  119. {
  120. DictEntryDiskPayload_t ( bool bPayload, ESphHitless eHitless )
  121. {
  122. m_bPayload = bPayload;
  123. m_eHitless = eHitless;
  124. if ( bPayload )
  125. m_dWordPayload.Reserve ( 1000 );
  126. m_dWordExpand.Reserve ( 1000 );
  127. m_dWordBuf.Reserve ( 8096 );
  128. }
  129. void Add ( const DictEntry_t & tWord, int iWordLen )
  130. {
  131. if ( !m_bPayload || !sphIsExpandedPayload ( tWord.m_iDocs, tWord.m_iHits ) ||
  132. m_eHitless==SPH_HITLESS_ALL || ( m_eHitless==SPH_HITLESS_SOME && ( tWord.m_iDocs & HITLESS_DOC_FLAG )!=0 ) ) // FIXME!!! do we need hitless=some as payloads?
  133. {
  134. DiskExpandedEntry_t & tExpand = m_dWordExpand.Add();
  135. int iOff = m_dWordBuf.GetLength();
  136. tExpand.m_iNameOff = iOff;
  137. tExpand.m_iDocs = tWord.m_iDocs;
  138. tExpand.m_iHits = tWord.m_iHits;
  139. m_dWordBuf.Resize ( iOff + iWordLen + 1 );
  140. memcpy ( m_dWordBuf.Begin() + iOff + 1, tWord.m_szKeyword, iWordLen );
  141. m_dWordBuf[iOff] = (BYTE)iWordLen;
  142. } else
  143. {
  144. DiskExpandedPayload_t & tExpand = m_dWordPayload.Add();
  145. tExpand.m_iDocs = tWord.m_iDocs;
  146. tExpand.m_iHits = tWord.m_iHits;
  147. tExpand.m_uDoclistOff = tWord.m_iDoclistOffset;
  148. tExpand.m_iDoclistHint = tWord.m_iDoclistHint;
  149. }
  150. }
  151. void Convert ( ISphWordlist::Args_t & tArgs ) override
  152. {
  153. if ( !m_dWordExpand.GetLength() && !m_dWordPayload.GetLength() )
  154. return;
  155. int iTotalDocs = 0;
  156. int iTotalHits = 0;
  157. if ( m_dWordExpand.GetLength() )
  158. {
  159. LimitExpanded ( tArgs.m_iExpansionLimit, m_dWordExpand );
  160. const BYTE * sBase = m_dWordBuf.Begin();
  161. ARRAY_FOREACH ( i, m_dWordExpand )
  162. {
  163. const DiskExpandedEntry_t & tCur = m_dWordExpand[i];
  164. int iDocs = tCur.m_iDocs;
  165. if ( m_eHitless==SPH_HITLESS_SOME )
  166. iDocs = ( tCur.m_iDocs & HITLESS_DOC_MASK );
  167. tArgs.AddExpanded ( sBase + tCur.m_iNameOff + 1, sBase[tCur.m_iNameOff], iDocs, tCur.m_iHits );
  168. iTotalDocs += iDocs;
  169. iTotalHits += tCur.m_iHits;
  170. }
  171. tArgs.m_tExpansionStats.m_iTerms += m_dWordExpand.GetLength();
  172. }
  173. if ( m_dWordPayload.GetLength() )
  174. {
  175. LimitExpanded ( tArgs.m_iExpansionLimit, m_dWordPayload );
  176. std::unique_ptr<DiskSubstringPayload_t> pPayload ( new DiskSubstringPayload_t ( m_dWordPayload.GetLength() ) );
  177. // sorting by ascending doc-list offset gives some (15%) speed-up too
  178. sphSort ( m_dWordPayload.Begin(), m_dWordPayload.GetLength(), bind ( &DiskExpandedPayload_t::m_uDoclistOff ) );
  179. ARRAY_FOREACH ( i, m_dWordPayload )
  180. {
  181. const DiskExpandedPayload_t & tCur = m_dWordPayload[i];
  182. assert ( m_eHitless==SPH_HITLESS_NONE || ( m_eHitless==SPH_HITLESS_SOME && ( tCur.m_iDocs & HITLESS_DOC_FLAG )==0 ) );
  183. iTotalDocs += tCur.m_iDocs;
  184. iTotalHits += tCur.m_iHits;
  185. pPayload->m_dDoclist[i].m_uOff = tCur.m_uDoclistOff;
  186. pPayload->m_dDoclist[i].m_iLen = tCur.m_iDoclistHint;
  187. }
  188. pPayload->m_iTotalDocs = iTotalDocs;
  189. pPayload->m_iTotalHits = iTotalHits;
  190. tArgs.m_pPayload = std::move ( pPayload );
  191. tArgs.m_tExpansionStats.m_iMerged += m_dWordPayload.GetLength();
  192. }
  193. tArgs.m_iTotalDocs = iTotalDocs;
  194. tArgs.m_iTotalHits = iTotalHits;
  195. }
  196. // sort expansions by frequency desc
  197. // clip the less frequent ones if needed, as they are likely misspellings
  198. template < typename T >
  199. void LimitExpanded ( int iExpansionLimit, CSphVector<T> & dVec ) const
  200. {
  201. if ( !iExpansionLimit || dVec.GetLength()<=iExpansionLimit )
  202. return;
  203. sphSort ( dVec.Begin(), dVec.GetLength(), ExpandedOrderDesc_T<T>() );
  204. dVec.Resize ( iExpansionLimit );
  205. }
  206. bool m_bPayload;
  207. ESphHitless m_eHitless;
  208. CSphVector<DiskExpandedEntry_t> m_dWordExpand;
  209. CSphVector<DiskExpandedPayload_t> m_dWordPayload;
  210. CSphVector<BYTE> m_dWordBuf;
  211. };
  212. //////////////////////////////////////////////////////////////////////////
  213. CWordlist::~CWordlist ()
  214. {
  215. Reset();
  216. }
  217. void CWordlist::Reset ()
  218. {
  219. m_tBuf.Reset ();
  220. m_dCheckpoints.Reset ( 0 );
  221. m_pWords.Reset ( 0 );
  222. SafeDeleteArray ( m_pInfixBlocksWords );
  223. SafeDelete ( m_pCpReader );
  224. }
  225. bool CWordlist::Preread ( const CSphString & sName, bool bWordDict, int iSkiplistBlockSize, CSphString & sError )
  226. {
  227. assert ( m_iDictCheckpointsOffset>0 );
  228. m_bWordDict = bWordDict;
  229. m_iWordsEnd = m_iDictCheckpointsOffset; // set wordlist end
  230. m_iSkiplistBlockSize = iSkiplistBlockSize;
  231. ////////////////////////////
  232. // preload word checkpoints
  233. ////////////////////////////
  234. ////////////////////////////
  235. // fast path for CRC checkpoints - just maps data and use inplace CP reader
  236. if ( !bWordDict )
  237. {
  238. if ( !m_tBuf.Setup ( sName, sError ) )
  239. return false;
  240. m_pCpReader = new CheckpointReader_c;
  241. return true;
  242. }
  243. ////////////////////////////
  244. // regular path that loads checkpoints data
  245. CSphAutoreader tReader;
  246. if ( !tReader.Open ( sName, sError ) )
  247. return false;
  248. int64_t iFileSize = tReader.GetFilesize();
  249. int iCheckpointOnlySize = (int)(iFileSize-m_iDictCheckpointsOffset);
  250. if ( m_iInfixCodepointBytes && m_iInfixBlocksOffset )
  251. iCheckpointOnlySize = (int)(m_iInfixBlocksOffset - g_sTagInfixBlocks.second - m_iDictCheckpointsOffset);
  252. if ( iFileSize-m_iDictCheckpointsOffset>=UINT_MAX )
  253. {
  254. sError.SetSprintf ( "dictionary meta overflow: meta size=" INT64_FMT ", total size=" INT64_FMT ", meta offset=" INT64_FMT,
  255. iFileSize-m_iDictCheckpointsOffset, iFileSize, (int64_t)m_iDictCheckpointsOffset );
  256. return false;
  257. }
  258. tReader.SeekTo ( m_iDictCheckpointsOffset, iCheckpointOnlySize );
  259. assert ( m_bWordDict );
  260. int iArenaSize = iCheckpointOnlySize
  261. - (sizeof(DWORD)+sizeof(SphOffset_t))*m_dCheckpoints.GetLength()
  262. + sizeof(BYTE)*m_dCheckpoints.GetLength();
  263. assert ( iArenaSize>=0 );
  264. m_pWords.Reset ( iArenaSize );
  265. BYTE * pWord = m_pWords.Begin();
  266. for ( auto & dCheckpoint : m_dCheckpoints )
  267. {
  268. dCheckpoint.m_szWord = (char *)pWord;
  269. const int iLen = tReader.GetDword();
  270. assert ( iLen>0 );
  271. assert ( iLen + 1 + ( pWord - m_pWords.Begin() )<=iArenaSize );
  272. tReader.GetBytes ( pWord, iLen );
  273. pWord[iLen] = '\0';
  274. pWord += iLen+1;
  275. dCheckpoint.m_iWordlistOffset = tReader.GetOffset();
  276. }
  277. ////////////////////////
  278. // preload infix blocks
  279. ////////////////////////
  280. if ( m_iInfixCodepointBytes && m_iInfixBlocksOffset )
  281. {
  282. // reading to vector as old version doesn't store total infix words length
  283. CSphTightVector<BYTE> dInfixWords;
  284. dInfixWords.Reserve ( (int)m_iInfixBlocksWordsSize );
  285. tReader.SeekTo ( m_iInfixBlocksOffset, (int)(iFileSize-m_iInfixBlocksOffset) );
  286. m_dInfixBlocks.Resize ( tReader.UnzipInt() );
  287. for ( auto & dInfixBlock : m_dInfixBlocks )
  288. {
  289. int iBytes = tReader.UnzipInt();
  290. int iOff = dInfixWords.GetLength();
  291. dInfixBlock.m_iInfixOffset = (DWORD) iOff; /// FIXME! name convention of m_iInfixOffset
  292. dInfixWords.Resize ( iOff+iBytes+1 );
  293. tReader.GetBytes ( dInfixWords.Begin()+iOff, iBytes );
  294. dInfixWords[iOff+iBytes] = '\0';
  295. dInfixBlock.m_iOffset = tReader.UnzipInt();
  296. }
  297. // fix-up offset to pointer
  298. m_pInfixBlocksWords = dInfixWords.LeakData();
  299. ARRAY_FOREACH ( i, m_dInfixBlocks )
  300. m_dInfixBlocks[i].m_sInfix = (const char *)m_pInfixBlocksWords + m_dInfixBlocks[i].m_iInfixOffset;
  301. // FIXME!!! store and load that explicitly
  302. if ( m_dInfixBlocks.GetLength() )
  303. m_iWordsEnd = m_dInfixBlocks.Begin()->m_iOffset - g_sTagInfixEntries.second;
  304. else
  305. m_iWordsEnd -= g_sTagInfixEntries.second;
  306. }
  307. if ( tReader.GetErrorFlag() )
  308. {
  309. sError = tReader.GetErrorMessage();
  310. return false;
  311. }
  312. tReader.Close();
  313. // mapping up only wordlist without meta (checkpoints, infixes, etc.)
  314. return m_tBuf.Setup ( sName, sError );
  315. }
  316. void CWordlist::DebugPopulateCheckpoints()
  317. {
  318. if ( !m_pCpReader )
  319. return;
  320. const BYTE * pCur = m_tBuf.GetReadPtr() + m_iDictCheckpointsOffset;
  321. ARRAY_FOREACH ( i, m_dCheckpoints )
  322. pCur = m_pCpReader->ReadEntry ( pCur, m_dCheckpoints[i] );
  323. SafeDelete(m_pCpReader);
  324. }
  325. const CSphWordlistCheckpoint * CWordlist::FindCheckpointCrc ( SphWordID_t iWordID ) const
  326. {
  327. if ( m_pCpReader ) // FIXME!!! fall to regular checkpoints after data got read
  328. {
  329. MappedCheckpoint_fn tPred ( m_dCheckpoints.Begin(), m_tBuf.GetReadPtr() + m_iDictCheckpointsOffset, m_pCpReader );
  330. return sphSearchCheckpointCrc( iWordID, m_dCheckpoints, std::move(tPred));
  331. }
  332. return sphSearchCheckpointCrc ( iWordID, m_dCheckpoints );
  333. }
  334. const CSphWordlistCheckpoint * CWordlist::FindCheckpointWrd ( const char* sWord, int iWordLen, bool bStarMode ) const
  335. {
  336. if ( m_pCpReader ) // FIXME!!! fall to regular checkpoints after data got read
  337. {
  338. MappedCheckpoint_fn tPred ( m_dCheckpoints.Begin(), m_tBuf.GetReadPtr() + m_iDictCheckpointsOffset, m_pCpReader );
  339. return sphSearchCheckpointWrd ( sWord, iWordLen, bStarMode, m_dCheckpoints, std::move ( tPred ) );
  340. }
  341. return sphSearchCheckpointWrd ( sWord, iWordLen, bStarMode, m_dCheckpoints );
  342. }
  343. bool CWordlist::GetWord ( const BYTE * pBuf, SphWordID_t iWordID, DictEntry_t & tWord ) const
  344. {
  345. SphWordID_t iLastID = 0;
  346. SphOffset_t uLastOff = 0;
  347. while (true)
  348. {
  349. // unpack next word ID
  350. const SphWordID_t iDeltaWord = UnzipWordidBE ( pBuf ); // FIXME! slow with 32bit wordids
  351. if ( iDeltaWord==0 ) // wordlist chunk is over
  352. return false;
  353. iLastID += iDeltaWord;
  354. // list is sorted, so if there was no match, there's no such word
  355. if ( iLastID>iWordID )
  356. return false;
  357. // unpack next offset
  358. const SphOffset_t iDeltaOffset = UnzipOffsetBE ( pBuf );
  359. uLastOff += iDeltaOffset;
  360. // unpack doc/hit count
  361. const int iDocs = UnzipIntBE ( pBuf );
  362. const int iHits = UnzipIntBE ( pBuf );
  363. SphOffset_t iSkiplistPos = 0;
  364. const int iLayoutDocs = iDocs & HITLESS_DOC_MASK;
  365. if ( iLayoutDocs > m_iSkiplistBlockSize )
  366. iSkiplistPos = UnzipOffsetBE ( pBuf );
  367. assert ( iDeltaOffset );
  368. assert ( iDocs );
  369. assert ( iHits );
  370. // it matches?!
  371. if ( iLastID==iWordID )
  372. {
  373. UnzipWordidBE ( pBuf ); // might be 0 at checkpoint
  374. const SphOffset_t iDoclistLen = UnzipOffsetBE ( pBuf );
  375. tWord.m_iDoclistOffset = uLastOff;
  376. tWord.m_iDocs = iDocs;
  377. tWord.m_iHits = iHits;
  378. tWord.m_iDoclistHint = (int)iDoclistLen;
  379. tWord.m_iSkiplistOffset = iSkiplistPos;
  380. return true;
  381. }
  382. }
  383. }
  384. const BYTE * CWordlist::AcquireDict ( const CSphWordlistCheckpoint * pCheckpoint ) const
  385. {
  386. assert ( pCheckpoint );
  387. assert ( m_dCheckpoints.GetLength() );
  388. assert ( pCheckpoint>=m_dCheckpoints.Begin() && pCheckpoint<=&m_dCheckpoints.Last() );
  389. SphOffset_t iOff = pCheckpoint->m_iWordlistOffset;
  390. if ( m_pCpReader )
  391. {
  392. MappedCheckpoint_fn tPred ( m_dCheckpoints.Begin(), m_tBuf.GetReadPtr() + m_iDictCheckpointsOffset, m_pCpReader );
  393. iOff = tPred ( pCheckpoint ).m_iWordlistOffset;
  394. }
  395. assert ( !m_tBuf.IsEmpty() );
  396. assert ( iOff>0 && iOff<(int64_t)m_tBuf.GetLengthBytes() );
  397. return m_tBuf.GetReadPtr()+iOff;
  398. }
  399. void CWordlist::GetPrefixedWords ( const char * sSubstring, int iSubLen, const char * sWildcard, Args_t & tArgs ) const
  400. {
  401. assert ( sSubstring && *sSubstring && iSubLen>0 );
  402. // empty index?
  403. if ( !m_dCheckpoints.GetLength() )
  404. return;
  405. DictEntryDiskPayload_t tDict2Payload ( tArgs.m_bPayload, tArgs.m_eHitless );
  406. int dWildcard [ SPH_MAX_WORD_LEN + 1 ];
  407. int * pWildcard = ( sphIsUTF8 ( sWildcard ) && sphUTF8ToWideChar ( sWildcard, dWildcard, SPH_MAX_WORD_LEN ) ) ? dWildcard : NULL;
  408. // assume dict=crc never has word with wordid=0, however just don't consider it and explicitly set nullptr.
  409. const CSphWordlistCheckpoint * pCheckpoint = m_bWordDict ? FindCheckpointWrd ( sSubstring, iSubLen, true ) : nullptr;
  410. const int iSkipMagic = ( BYTE(*sSubstring)<0x20 ); // whether to skip heading magic chars in the prefix, like NONSTEMMED maker
  411. while ( pCheckpoint )
  412. {
  413. // decode wordlist chunk
  414. KeywordsBlockReader_c tDictReader ( AcquireDict ( pCheckpoint ), m_iSkiplistBlockSize );
  415. while ( tDictReader.UnpackWord() )
  416. {
  417. // block is sorted
  418. // so once keywords are greater than the prefix, no more matches
  419. int iCmp = sphDictCmp ( sSubstring, iSubLen, (const char *)tDictReader.m_szKeyword, tDictReader.GetWordLen() );
  420. if ( iCmp<0 )
  421. break;
  422. if ( sphInterrupted() )
  423. break;
  424. // does it match the prefix *and* the entire wildcard?
  425. if ( iCmp==0 && sphWildcardMatch ( (const char *)tDictReader.m_szKeyword + iSkipMagic, sWildcard, pWildcard ) )
  426. tDict2Payload.Add ( tDictReader, tDictReader.GetWordLen() );
  427. }
  428. if ( sphInterrupted () )
  429. break;
  430. pCheckpoint++;
  431. if ( pCheckpoint > &m_dCheckpoints.Last() )
  432. break;
  433. if ( sphDictCmp ( sSubstring, iSubLen, pCheckpoint->m_szWord, (int) strlen ( pCheckpoint->m_szWord ) )<0 )
  434. break;
  435. }
  436. tDict2Payload.Convert ( tArgs );
  437. }
  438. void CWordlist::GetInfixedWords ( const char * sSubstring, int iSubLen, const char * sWildcard, Args_t & tArgs ) const
  439. {
  440. // dict must be of keywords type, and fully cached
  441. // mmap()ed in the worst case, should we ever banish it to disk again
  442. if ( m_tBuf.IsEmpty() || !m_dCheckpoints.GetLength() )
  443. return;
  444. assert ( !m_pCpReader );
  445. // extract key1, upto 6 chars from infix start
  446. int iBytes1 = sphGetInfixLength ( sSubstring, iSubLen, m_iInfixCodepointBytes );
  447. // lookup key1
  448. // OPTIMIZE? maybe lookup key2 and reduce checkpoint set size, if possible?
  449. CSphVector<DWORD> dPoints;
  450. if ( !sphLookupInfixCheckpoints ( sSubstring, iBytes1, m_tBuf.GetReadPtr(), m_dInfixBlocks, m_iInfixCodepointBytes, dPoints ) )
  451. return;
  452. DictEntryDiskPayload_t tDict2Payload ( tArgs.m_bPayload, tArgs.m_eHitless );
  453. const int iSkipMagic = ( tArgs.m_bHasExactForms ? 1 : 0 ); // whether to skip heading magic chars in the prefix, like NONSTEMMED maker
  454. int dWildcard [ SPH_MAX_WORD_LEN + 1 ];
  455. int * pWildcard = ( sphIsUTF8 ( sWildcard ) && sphUTF8ToWideChar ( sWildcard, dWildcard, SPH_MAX_WORD_LEN ) ) ? dWildcard : NULL;
  456. // walk those checkpoints, check all their words
  457. ARRAY_FOREACH ( i, dPoints )
  458. {
  459. // OPTIMIZE? add a quicker path than a generic wildcard for "*infix*" case?
  460. KeywordsBlockReader_c tDictReader ( m_tBuf.GetReadPtr() + m_dCheckpoints[dPoints[i]-1].m_iWordlistOffset, m_iSkiplistBlockSize );
  461. while ( tDictReader.UnpackWord() )
  462. {
  463. if ( sphInterrupted () )
  464. break;
  465. // stemmed terms should not match suffixes
  466. if ( tArgs.m_bHasExactForms && *tDictReader.m_szKeyword!=MAGIC_WORD_HEAD_NONSTEMMED )
  467. continue;
  468. if ( sphWildcardMatch ( (const char *)tDictReader.m_szKeyword+iSkipMagic, sWildcard, pWildcard ) )
  469. tDict2Payload.Add ( tDictReader, tDictReader.GetWordLen() );
  470. }
  471. if ( sphInterrupted () )
  472. break;
  473. }
  474. tDict2Payload.Convert ( tArgs );
  475. }
  476. #if WITH_RE2
/// Per-term pair of an RE2 object and the payload that collects the dictionary entries for it.
struct RegexMatch_t
{
	std::unique_ptr<RE2> m_pRe { nullptr };	///< regular expression object
	std::unique_ptr<DictEntryDiskPayload_t> m_pPayload { nullptr };	///< collected dictionary entries
};
  482. #endif
  483. void CWordlist::ScanRegexWords ( const VecTraits_T<RegexTerm_t> & dTerms, const ISphWordlist::Args_t & tArgs, const VecExpandConv_t & dConverters ) const
  484. {
  485. // dict must be of keywords type, and fully cached
  486. // mmap()ed in the worst case, should we ever banish it to disk again
  487. if ( m_tBuf.IsEmpty() || !m_dCheckpoints.GetLength() )
  488. return;
  489. assert ( dTerms.GetLength() && dTerms.GetLength()==dConverters.GetLength() );
  490. #if WITH_RE2
  491. CSphFixedVector<RegexMatch_t> dRegex ( dTerms.GetLength() );
  492. RE2::Options tOptions;
  493. tOptions.set_encoding ( RE2::Options::Encoding::EncodingUTF8 );
  494. ARRAY_FOREACH ( i, dRegex )
  495. {
  496. dRegex[i].m_pRe = std::make_unique<RE2> ( dTerms[i].first.cstr(), tOptions );
  497. dRegex[i].m_pPayload = std::make_unique<DictEntryDiskPayload_t> ( tArgs.m_bPayload, tArgs.m_eHitless );
  498. assert ( dRegex[i].m_pRe && dRegex[i].m_pPayload );
  499. }
  500. const int iSkipMagic = ( tArgs.m_bHasExactForms ? 1 : 0 ); // whether to skip heading magic chars in the prefix, like NONSTEMMED maker
  501. // walk those checkpoints, check all their words
  502. ARRAY_FOREACH ( i, m_dCheckpoints )
  503. {
  504. const auto & tCP = m_dCheckpoints[i];
  505. KeywordsBlockReader_c tDictReader ( m_tBuf.GetReadPtr() + tCP.m_iWordlistOffset, m_iSkiplistBlockSize );
  506. while ( tDictReader.UnpackWord() )
  507. {
  508. if ( sphInterrupted () )
  509. break;
  510. // stemmed terms should not match suffixes
  511. if ( tArgs.m_bHasExactForms && *tDictReader.m_szKeyword!=MAGIC_WORD_HEAD_NONSTEMMED )
  512. continue;
  513. int iLen = tDictReader.GetWordLen();
  514. re2::StringPiece sDictToken ( (const char *)tDictReader.m_szKeyword+iSkipMagic, iLen );
  515. ARRAY_FOREACH ( i, dRegex )
  516. {
  517. if ( RE2::FullMatchN ( sDictToken, *dRegex[i].m_pRe, nullptr, 0 ) )
  518. dRegex[i].m_pPayload->Add ( tDictReader, iLen );
  519. }
  520. }
  521. if ( sphInterrupted () )
  522. break;
  523. }
  524. ARRAY_FOREACH ( i, dRegex )
  525. dConverters[i] = std::move( dRegex[i].m_pPayload );
  526. #endif
  527. }
  528. void CWordlist::SuffixGetChekpoints ( const SuggestResult_t & , const char * sSuffix, int iLen, CSphVector<DWORD> & dCheckpoints ) const
  529. {
  530. sphLookupInfixCheckpoints ( sSuffix, iLen, m_tBuf.GetReadPtr(), m_dInfixBlocks, m_iInfixCodepointBytes, dCheckpoints );
  531. }
  532. void CWordlist::SetCheckpoint ( SuggestResult_t & tRes, DWORD iCP ) const
  533. {
  534. assert ( tRes.m_pWordReader );
  535. KeywordsBlockReader_c * pReader = (KeywordsBlockReader_c *)tRes.m_pWordReader;
  536. pReader->Reset ( m_tBuf.GetReadPtr() + m_dCheckpoints[iCP-1].m_iWordlistOffset );
  537. }
  538. bool CWordlist::ReadNextWord ( SuggestResult_t & tRes, DictWord_t & tWord ) const
  539. {
  540. KeywordsBlockReader_c * pReader = (KeywordsBlockReader_c *)tRes.m_pWordReader;
  541. if ( !pReader->UnpackWord() )
  542. return false;
  543. tWord.m_sWord = pReader->GetWord();
  544. tWord.m_iLen = pReader->GetWordLen();
  545. tWord.m_iDocs = pReader->m_iDocs;
  546. return true;
  547. }
  548. //////////////////////////////////////////////////////////////////////////
  549. KeywordsBlockReader_c::KeywordsBlockReader_c ( const BYTE * pBuf, int iSkiplistBlockSize )
  550. : m_iSkiplistBlockSize ( iSkiplistBlockSize )
  551. {
  552. Reset ( pBuf );
  553. }
  554. void KeywordsBlockReader_c::Reset ( const BYTE * pBuf )
  555. {
  556. m_pBuf = pBuf;
  557. m_sWord[0] = '\0';
  558. m_iLen = 0;
  559. m_szKeyword = m_sWord.data();
  560. }
  561. bool KeywordsBlockReader_c::UnpackWord()
  562. {
  563. if ( !m_pBuf )
  564. return false;
  565. assert ( m_iSkiplistBlockSize>0 );
  566. // unpack next word
  567. // must be in sync with DictEnd()!
  568. BYTE uPack = *m_pBuf++;
  569. if ( !uPack )
  570. {
  571. // ok, this block is over
  572. m_pBuf = NULL;
  573. m_iLen = 0;
  574. return false;
  575. }
  576. int iMatch, iDelta;
  577. if ( uPack & 0x80 )
  578. {
  579. iDelta = ( ( uPack>>4 ) & 7 ) + 1;
  580. iMatch = uPack & 15;
  581. } else
  582. {
  583. iDelta = uPack & 127;
  584. iMatch = *m_pBuf++;
  585. }
  586. assert ( iDelta>0 );
  587. assert ( iMatch+iDelta<(int)sizeof(m_sWord)-1 );
  588. assert ( iMatch<=(int)strlen ( (char *)m_sWord.data() ) );
  589. memcpy ( m_sWord.data() + iMatch, m_pBuf, iDelta );
  590. m_pBuf += iDelta;
  591. m_iLen = iMatch + iDelta;
  592. m_sWord[m_iLen] = '\0';
  593. m_iDoclistOffset = UnzipOffsetBE ( m_pBuf );
  594. m_iDocs = UnzipIntBE ( m_pBuf );
  595. m_iHits = UnzipIntBE ( m_pBuf );
  596. const DWORD uLayoutDocs = (DWORD)( m_iDocs & HITLESS_DOC_MASK );
  597. m_uHint = ( uLayoutDocs>=(DWORD)DOCLIST_HINT_THRESH ) ? *m_pBuf++ : 0;
  598. m_iDoclistHint = DoclistHintUnpack ( uLayoutDocs, m_uHint );
  599. if ( uLayoutDocs>(DWORD)m_iSkiplistBlockSize )
  600. m_iSkiplistOffset = UnzipOffsetBE ( m_pBuf );
  601. else
  602. m_iSkiplistOffset = 0;
  603. assert ( m_iLen>0 );
  604. return true;
  605. }
  606. static int g_iExpandMergeDocs = 32;
  607. static int g_iExpandMergeHits = 256;
  608. bool sphIsExpandedPayload ( int iDocs, int iHits )
  609. {
  610. return ( iHits<g_iExpandMergeHits || iDocs<g_iExpandMergeDocs );
  611. }
  612. void ExpandedMergeThdDocs ( int iDocs )
  613. {
  614. g_iExpandMergeDocs = iDocs;
  615. }
  616. void ExpandedMergeThdHits ( int iHits )
  617. {
  618. g_iExpandMergeHits = iHits;
  619. }
  620. ////////////////////////////////////////////////////////////////////
  621. void IndexWriteHeader ( const BuildHeader_t & tBuildHeader, const WriteHeader_t & tWriteHeader, JsonEscapedBuilder& sJson, bool bForceWordDict, bool SkipEmbeddDict )
  622. {
  623. auto _ = sJson.ObjectW();
  624. // human-readable sugar
  625. sJson.NamedString ( "meta_created_time_utc", sphCurrentUtcTime() );
  626. // version
  627. sJson.NamedVal ( "index_format_version", INDEX_FORMAT_VERSION );
  628. // index stats - json (put here to be similar with .meta)
  629. sJson.NamedValNonDefault ( "total_documents", tBuildHeader.m_iTotalDocuments );
  630. sJson.NamedValNonDefault ( "total_bytes", tBuildHeader.m_iTotalBytes );
  631. // schema
  632. sJson.NamedVal ( "schema", *tWriteHeader.m_pSchema );
  633. // index settings
  634. sJson.NamedVal ( "index_settings", *tWriteHeader.m_pSettings );
  635. // tokenizer info
  636. assert ( tWriteHeader.m_pTokenizer );
  637. sJson.Named ( "tokenizer_settings" );
  638. SaveTokenizerSettings ( sJson, tWriteHeader.m_pTokenizer, tWriteHeader.m_pSettings->m_iEmbeddedLimit );
  639. // dictionary info
  640. assert ( tWriteHeader.m_pDict );
  641. sJson.Named ( "dictionary_settings" );
  642. SaveDictionarySettings ( sJson, tWriteHeader.m_pDict, bForceWordDict, SkipEmbeddDict ? 0 : tWriteHeader.m_pSettings->m_iEmbeddedLimit );
  643. // wordlist checkpoints - json
  644. sJson.NamedValNonDefault ( "dict_checkpoints_offset", tBuildHeader.m_iDictCheckpointsOffset );
  645. sJson.NamedValNonDefault ( "dict_checkpoints", tBuildHeader.m_iDictCheckpoints );
  646. sJson.NamedValNonDefault ( "infix_codepoint_bytes", tBuildHeader.m_iInfixCodepointBytes );
  647. sJson.NamedValNonDefault ( "infix_blocks_offset", tBuildHeader.m_iInfixBlocksOffset );
  648. sJson.NamedValNonDefault ( "infix_block_words_size", tBuildHeader.m_iInfixBlocksWordsSize );
  649. sJson.NamedValNonDefault ( "docinfo", tBuildHeader.m_iDocinfo );
  650. sJson.NamedValNonDefault ( "docinfo_index", tBuildHeader.m_iDocinfoIndex );
  651. sJson.NamedValNonDefault ( "min_max_index", tBuildHeader.m_iMinMaxIndex );
  652. // field filter info
  653. CSphFieldFilterSettings tFieldFilterSettings;
  654. if ( tWriteHeader.m_pFieldFilter )
  655. {
  656. tWriteHeader.m_pFieldFilter->GetSettings ( tFieldFilterSettings );
  657. sJson.NamedVal ( "field_filter_settings", tFieldFilterSettings );
  658. }
  659. // average field lengths
  660. if ( tWriteHeader.m_pSettings->m_bIndexFieldLens )
  661. {
  662. sJson.Named ( "index_fields_lens" );
  663. auto _ = sJson.Array();
  664. for ( int i=0; i < tWriteHeader.m_pSchema->GetFieldsCount(); ++i )
  665. {
  666. sJson << tWriteHeader.m_pFieldLens[i];
  667. }
  668. }
  669. }