docstore.cpp 57 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970
  1. //
  2. // Copyright (c) 2017-2026, Manticore Software LTD (https://manticoresearch.com)
  3. // All rights reserved
  4. //
  5. // This program is free software; you can redistribute it and/or modify
  6. // it under the terms of the GNU General Public License. You should have
  7. // received a copy of the GPL license along with this program; if you
  8. // did not, you can find it at http://www.gnu.org/
  9. //
  10. #include "docstore.h"
  11. #include "std/lrucache.h"
  12. #include "fileio.h"
  13. #include "memio.h"
  14. #include "fileutils.h"
  15. #include "attribute.h"
  16. #include "indexcheck.h"
  17. #include "lz4/lz4.h"
  18. #include "lz4/lz4hc.h"
  19. #include "sphinxint.h"
  20. enum BlockFlags_e : BYTE
  21. {
  22. BLOCK_FLAG_COMPRESSED = 1 << 0,
  23. BLOCK_FLAG_FIELD_REORDER = 1 << 1
  24. };
  25. enum BlockType_e : BYTE
  26. {
  27. BLOCK_TYPE_SMALL,
  28. BLOCK_TYPE_BIG,
  29. BLOCK_TYPE_TOTAL
  30. };
  31. enum DocFlags_e : BYTE
  32. {
  33. DOC_FLAG_ALL_EMPTY = 1 << 0,
  34. DOC_FLAG_EMPTY_BITMASK = 1 << 1
  35. };
  36. enum FieldFlags_e : BYTE
  37. {
  38. FIELD_FLAG_COMPRESSED = 1 << 0,
  39. FIELD_FLAG_EMPTY = 1 << 1
  40. };
  /// Newest docstore file format version this binary loads; Docstore_c::Init() rejects anything greater.
  static constexpr int STORAGE_VERSION = 1;
  42. //////////////////////////////////////////////////////////////////////////
  43. static BYTE Compression2Byte ( Compression_e eComp )
  44. {
  45. switch (eComp)
  46. {
  47. case Compression_e::NONE: return 0;
  48. case Compression_e::LZ4: return 1;
  49. case Compression_e::LZ4HC: return 2;
  50. default:
  51. assert ( 0 && "Unknown compression type" );
  52. return 0;
  53. }
  54. }
  55. static Compression_e Byte2Compression ( BYTE uComp )
  56. {
  57. switch (uComp)
  58. {
  59. case 0: return Compression_e::NONE;
  60. case 1: return Compression_e::LZ4;
  61. case 2: return Compression_e::LZ4HC;
  62. default:
  63. assert ( 0 && "Unknown compression type" );
  64. return Compression_e::NONE;
  65. }
  66. }
  67. static void PackData ( CSphVector<BYTE> & dDst, const BYTE * pData, DWORD uSize, bool bText, bool bPack )
  68. {
  69. if ( bPack )
  70. {
  71. const DWORD GAP = 8;
  72. dDst.Resize ( uSize+GAP );
  73. dDst.Resize ( sphPackPtrAttr ( dDst.Begin (), {pData, uSize} ));
  74. }
  75. else
  76. {
  77. dDst.Resize ( uSize + ( bText ? 1 : 0 ) );
  78. memcpy ( dDst.Begin(), pData, uSize );
  79. if ( bText )
  80. {
  81. dDst[uSize] = '\0';
  82. dDst.Resize(uSize);
  83. }
  84. }
  85. }
  86. //////////////////////////////////////////////////////////////////////////
  87. class Compressor_i
  88. {
  89. public:
  90. virtual ~Compressor_i(){}
  91. virtual bool Compress ( const VecTraits_T<BYTE> & dUncompressed, CSphVector<BYTE> & dCompressed ) const = 0;
  92. virtual bool Decompress ( const VecTraits_T<BYTE> & dCompressed, VecTraits_T<BYTE> & dDecompressed ) const = 0;
  93. };
  94. class Compressor_None_c : public Compressor_i
  95. {
  96. public:
  97. bool Compress ( const VecTraits_T<BYTE> & dUncompressed, CSphVector<BYTE> & dCompressed ) const final { return false; }
  98. bool Decompress ( const VecTraits_T<BYTE> & dCompressed, VecTraits_T<BYTE> & dDecompressed ) const final { return true; }
  99. };
  100. class Compressor_LZ4_c : public Compressor_i
  101. {
  102. public:
  103. bool Compress ( const VecTraits_T<BYTE> & dUncompressed, CSphVector<BYTE> & dCompressed ) const override;
  104. bool Decompress ( const VecTraits_T<BYTE> & dCompressed, VecTraits_T<BYTE> & dDecompressed ) const final;
  105. protected:
  106. virtual int DoCompression ( const VecTraits_T<BYTE> & dUncompressed, CSphVector<BYTE> & dCompressed ) const;
  107. };
  108. class Compressor_LZ4HC_c : public Compressor_LZ4_c
  109. {
  110. public:
  111. Compressor_LZ4HC_c ( int iCompressionLevel );
  112. protected:
  113. int DoCompression ( const VecTraits_T<BYTE> & dUncompressed, CSphVector<BYTE> & dCompressed ) const final;
  114. private:
  115. int m_iCompressionLevel = DEFAULT_COMPRESSION_LEVEL;
  116. };
  117. bool Compressor_LZ4_c::Compress ( const VecTraits_T<BYTE> & dUncompressed, CSphVector<BYTE> & dCompressed ) const
  118. {
  119. const int MIN_COMPRESSIBLE_SIZE = 64;
  120. if ( dUncompressed.GetLength() < MIN_COMPRESSIBLE_SIZE )
  121. return false;
  122. dCompressed.Resize ( int ( dUncompressed.GetLength()*1.5f ) );
  123. int iCompressedSize = DoCompression ( dUncompressed, dCompressed );
  124. const float WORST_COMPRESSION_RATIO = 0.95f;
  125. if ( iCompressedSize<0 || float(iCompressedSize)/dUncompressed.GetLength() > WORST_COMPRESSION_RATIO )
  126. return false;
  127. dCompressed.Resize(iCompressedSize);
  128. return true;
  129. }
  130. bool Compressor_LZ4_c::Decompress ( const VecTraits_T<BYTE> & dCompressed, VecTraits_T<BYTE> & dDecompressed ) const
  131. {
  132. int iRes = LZ4_decompress_safe ( (const char *)dCompressed.Begin(), (char *)dDecompressed.Begin(), dCompressed.GetLength(), dDecompressed.GetLength() );
  133. return iRes==dDecompressed.GetLength();
  134. }
  135. int Compressor_LZ4_c::DoCompression ( const VecTraits_T<BYTE> & dUncompressed, CSphVector<BYTE> & dCompressed ) const
  136. {
  137. return LZ4_compress_default ( (const char *)dUncompressed.Begin(), (char *)dCompressed.Begin(), dUncompressed.GetLength(), dCompressed.GetLength() );
  138. }
  139. Compressor_LZ4HC_c::Compressor_LZ4HC_c ( int iCompressionLevel )
  140. : m_iCompressionLevel ( iCompressionLevel )
  141. {}
  142. int Compressor_LZ4HC_c::DoCompression ( const VecTraits_T<BYTE> & dUncompressed, CSphVector<BYTE> & dCompressed ) const
  143. {
  144. return LZ4_compress_HC ( (const char *)dUncompressed.Begin(), (char *)dCompressed.Begin(), dUncompressed.GetLength(), dCompressed.GetLength(), m_iCompressionLevel );
  145. }
  146. std::unique_ptr<Compressor_i> CreateCompressor ( Compression_e eComp, int iCompressionLevel )
  147. {
  148. switch ( eComp )
  149. {
  150. case Compression_e::LZ4: return std::make_unique<Compressor_LZ4_c>();
  151. case Compression_e::LZ4HC: return std::make_unique<Compressor_LZ4HC_c> ( iCompressionLevel );
  152. default: return std::make_unique<Compressor_None_c>();
  153. }
  154. }
  155. //////////////////////////////////////////////////////////////////////////
  156. static CSphString BuildCompoundName ( const CSphString & sName, DocstoreDataType_e eType )
  157. {
  158. CSphString sCompound;
  159. sCompound.SetSprintf ( "%d%s", eType, sName.cstr() );
  160. return sCompound;
  161. }
  162. class DocstoreFields_c : public DocstoreFields_i
  163. {
  164. public:
  165. struct Field_t
  166. {
  167. CSphString m_sName;
  168. DocstoreDataType_e m_eType;
  169. };
  170. int AddField ( const CSphString & sName, DocstoreDataType_e eType ) final;
  171. void RemoveField ( const CSphString & sName, DocstoreDataType_e eType ) final;
  172. int GetFieldId ( const CSphString & sName, DocstoreDataType_e eType ) const final;
  173. int GetNumFields() const final { return m_dFields.GetLength(); }
  174. const Field_t & GetField ( int iField ) const { return m_dFields[iField]; }
  175. void Load ( CSphReader & tReader );
  176. void Save ( CSphWriter & tWriter );
  177. private:
  178. CSphVector<Field_t> m_dFields;
  179. SmallStringHash_T<int> m_hFields;
  180. };
  181. int DocstoreFields_c::AddField ( const CSphString & sName, DocstoreDataType_e eType )
  182. {
  183. int iField = m_dFields.GetLength();
  184. m_dFields.Add ( {sName, eType} );
  185. m_hFields.Add ( iField, BuildCompoundName ( sName, eType ) );
  186. return iField;
  187. }
  188. void DocstoreFields_c::RemoveField ( const CSphString & sName, DocstoreDataType_e eType )
  189. {
  190. int iFieldId = GetFieldId ( sName, eType );
  191. if ( iFieldId==-1 )
  192. return;
  193. m_dFields.Remove ( iFieldId, 1 );
  194. m_hFields.Reset();
  195. ARRAY_FOREACH ( i, m_dFields )
  196. m_hFields.Add ( i, BuildCompoundName ( m_dFields[i].m_sName, m_dFields[i].m_eType ) );
  197. }
  198. int DocstoreFields_c::GetFieldId ( const CSphString & sName, DocstoreDataType_e eType ) const
  199. {
  200. int * pField = m_hFields ( BuildCompoundName ( sName, eType ) );
  201. return pField ? *pField : -1;
  202. }
  203. void DocstoreFields_c::Load ( CSphReader & tReader )
  204. {
  205. assert ( !GetNumFields() );
  206. DWORD uNumFields = tReader.GetDword();
  207. for ( int i = 0; i < (int)uNumFields; i++ )
  208. {
  209. auto eType = (DocstoreDataType_e)tReader.GetByte();
  210. CSphString sName = tReader.GetString();
  211. AddField ( sName, eType );
  212. }
  213. }
  214. void DocstoreFields_c::Save ( CSphWriter & tWriter )
  215. {
  216. tWriter.PutDword ( GetNumFields() );
  217. for ( int i = 0, iNumFields = GetNumFields(); i < iNumFields; ++i )
  218. {
  219. tWriter.PutByte ( GetField(i).m_eType );
  220. tWriter.PutString ( GetField(i).m_sName );
  221. }
  222. }
  223. //////////////////////////////////////////////////////////////////////////
  224. struct BlockData_t
  225. {
  226. BYTE m_uFlags = 0;
  227. DWORD m_uNumDocs = 0;
  228. BYTE * m_pData = nullptr;
  229. DWORD m_uSize = 0;
  230. };
  231. struct HashKey_t
  232. {
  233. int64_t m_iIndexId;
  234. SphOffset_t m_tOffset;
  235. bool operator == ( const HashKey_t & tKey ) const { return m_iIndexId==tKey.m_iIndexId && m_tOffset==tKey.m_tOffset; }
  236. };
  237. struct BlockUtil_t
  238. {
  239. static DWORD GetHash ( const HashKey_t & tKey )
  240. {
  241. DWORD uCRC32 = sphCRC32 ( &tKey.m_iIndexId, sizeof(tKey.m_iIndexId) );
  242. return sphCRC32 ( &tKey.m_tOffset, sizeof(tKey.m_tOffset), uCRC32 );
  243. }
  244. static bool Equal ( const HashKey_t & a, const HashKey_t & b ) { return a==b; }
  245. static DWORD GetSize ( const BlockData_t & tValue ) { return tValue.m_uSize; }
  246. static void Reset ( BlockData_t & tValue ) { SafeDeleteArray ( tValue.m_pData ); }
  247. };
  248. class BlockCache_c : public LRUCache_T<HashKey_t, BlockData_t, BlockUtil_t>
  249. {
  250. using BASE = LRUCache_T<HashKey_t, BlockData_t, BlockUtil_t>;
  251. using BASE::BASE;
  252. public:
  253. void ClearByIndexId ( int64_t iIndexId ) { BASE::Delete ( [iIndexId]( const HashKey_t & tKey ){ return tKey.m_iIndexId==iIndexId; } ); }
  254. void ClearAll() { BASE::Delete ( []( const HashKey_t & ){ return true; } ); }
  255. static void Init ( int64_t iCacheSize );
  256. static void Done() { SafeDelete(m_pBlockCache); }
  257. static BlockCache_c * Get() { return m_pBlockCache; }
  258. private:
  259. static BlockCache_c * m_pBlockCache;
  260. };
  261. BlockCache_c * BlockCache_c::m_pBlockCache = nullptr;
  262. void BlockCache_c::Init ( int64_t iCacheSize )
  263. {
  264. assert ( !m_pBlockCache );
  265. if ( iCacheSize > 0 )
  266. m_pBlockCache = new BlockCache_c(iCacheSize);
  267. }
  268. //////////////////////////////////////////////////////////////////////////
  269. class DocstoreReaders_c
  270. {
  271. public:
  272. ~DocstoreReaders_c();
  273. void CreateReader ( int64_t iSessionId, int64_t iIndexId, const CSphAutofile & tFile, DWORD uBlockSize );
  274. CSphReader * GetReader ( int64_t iSessionId, int64_t iIndexId );
  275. void DeleteBySessionId ( int64_t iSessionId );
  276. void DeleteByIndexId ( int64_t iIndexId );
  277. static void Init();
  278. static void Done();
  279. static DocstoreReaders_c * Get();
  280. private:
  281. struct HashKey_t
  282. {
  283. int64_t m_iSessionId;
  284. int64_t m_iIndexId;
  285. bool operator == ( const HashKey_t & tKey ) const;
  286. static DWORD Hash ( const HashKey_t & tKey );
  287. };
  288. int m_iTotalReaderSize = 0;
  289. CSphMutex m_tLock;
  290. CSphOrderedHash<CSphReader *, HashKey_t, HashKey_t, 1024> m_tHash;
  291. static DocstoreReaders_c * m_pReaders;
  292. static const int MIN_READER_BUFFER_SIZE = 32768;
  293. static const int MAX_READER_BUFFER_SIZE = 262144;
  294. static const int MAX_TOTAL_READER_SIZE = 8388608;
  295. void Delete ( CSphReader * pReader, const HashKey_t tKey );
  296. };
  297. DocstoreReaders_c * DocstoreReaders_c::m_pReaders = nullptr;
  298. bool DocstoreReaders_c::HashKey_t::operator == ( const HashKey_t & tKey ) const
  299. {
  300. return m_iSessionId==tKey.m_iSessionId && m_iIndexId==tKey.m_iIndexId;
  301. }
  302. DWORD DocstoreReaders_c::HashKey_t::Hash ( const HashKey_t & tKey )
  303. {
  304. DWORD uCRC32 = sphCRC32 ( &tKey.m_iSessionId, sizeof(tKey.m_iSessionId) );
  305. return sphCRC32 ( &tKey.m_iIndexId, sizeof(tKey.m_iIndexId), uCRC32 );
  306. }
  307. DocstoreReaders_c::~DocstoreReaders_c()
  308. {
  309. for ( auto & tDocstore : m_tHash )
  310. SafeDelete ( tDocstore.second );
  311. }
  312. void DocstoreReaders_c::CreateReader ( int64_t iSessionId, int64_t iIndexId, const CSphAutofile & tFile, DWORD uBlockSize )
  313. {
  314. ScopedMutex_t tLock(m_tLock);
  315. if ( m_tHash ( { iSessionId, iIndexId } ) )
  316. return;
  317. int iBufferSize = (int)uBlockSize*4;
  318. iBufferSize = Min ( iBufferSize, MAX_READER_BUFFER_SIZE );
  319. iBufferSize = Max ( iBufferSize, MIN_READER_BUFFER_SIZE );
  320. if ( iBufferSize<=(int)uBlockSize )
  321. return;
  322. if ( m_iTotalReaderSize+iBufferSize > MAX_TOTAL_READER_SIZE )
  323. return;
  324. CSphReader * pReader = new CSphReader ( nullptr, iBufferSize );
  325. pReader->SetFile(tFile);
  326. Verify ( m_tHash.Add ( pReader, {iSessionId, iIndexId} ) );
  327. m_iTotalReaderSize += iBufferSize;
  328. }
  329. CSphReader * DocstoreReaders_c::GetReader ( int64_t iSessionId, int64_t iIndexId )
  330. {
  331. ScopedMutex_t tLock(m_tLock);
  332. CSphReader ** ppReader = m_tHash ( { iSessionId, iIndexId } );
  333. return ppReader ? *ppReader : nullptr;
  334. }
  335. void DocstoreReaders_c::Delete ( CSphReader * pReader, const HashKey_t tKey )
  336. {
  337. m_iTotalReaderSize -= pReader->GetBufferSize();
  338. assert ( m_iTotalReaderSize>=0 );
  339. SafeDelete(pReader);
  340. m_tHash.Delete(tKey);
  341. }
  342. void DocstoreReaders_c::DeleteBySessionId ( int64_t iSessionId )
  343. {
  344. ScopedMutex_t tLock(m_tLock);
  345. // fixme: create a separate (faster) lookup?
  346. CSphVector<std::pair<CSphReader*,HashKey_t>> dToDelete;
  347. for ( auto & tDocstore : m_tHash )
  348. if ( tDocstore.first.m_iSessionId==iSessionId )
  349. dToDelete.Add ( { tDocstore.second, tDocstore.first } );
  350. for ( const auto & i : dToDelete )
  351. Delete ( i.first, i.second );
  352. }
  353. void DocstoreReaders_c::DeleteByIndexId ( int64_t iIndexId )
  354. {
  355. ScopedMutex_t tLock(m_tLock);
  356. // fixme: create a separate (faster) lookup?
  357. CSphVector<std::pair<CSphReader*,HashKey_t>> dToDelete;
  358. for ( auto& tDocstore : m_tHash )
  359. if ( tDocstore.first.m_iIndexId==iIndexId )
  360. dToDelete.Add ( { tDocstore.second, tDocstore.first } );
  361. for ( const auto & i : dToDelete )
  362. Delete ( i.first, i.second );
  363. }
  364. void DocstoreReaders_c::Init ()
  365. {
  366. assert(!m_pReaders);
  367. m_pReaders = new DocstoreReaders_c;
  368. }
  369. void DocstoreReaders_c::Done()
  370. {
  371. SafeDelete(m_pReaders);
  372. }
  373. DocstoreReaders_c * DocstoreReaders_c::Get()
  374. {
  375. return m_pReaders;
  376. }
  377. //////////////////////////////////////////////////////////////////////////
  378. static void CreateFieldRemap ( VecTraits_T<int> & dFieldInRset, const VecTraits_T<int> * pFieldIds )
  379. {
  380. if ( pFieldIds )
  381. ARRAY_CONSTFOREACH ( i, dFieldInRset )
  382. {
  383. int * pFound = pFieldIds->BinarySearch(i);
  384. dFieldInRset[i] = pFound ? pFieldIds->Idx ( pFound ) : -1;
  385. }
  386. else
  387. ARRAY_CONSTFOREACH ( i, dFieldInRset )
  388. dFieldInRset[i] = i;
  389. }
  390. //////////////////////////////////////////////////////////////////////////
  391. class Docstore_c : public Docstore_i, public DocstoreSettings_t
  392. {
  393. friend class DocstoreChecker_c;
  394. public:
  395. Docstore_c ( int64_t iIndexId, const CSphString & sFilename );
  396. ~Docstore_c() override;
  397. bool Init ( CSphString & sError );
  398. int GetFieldId ( const CSphString & sName, DocstoreDataType_e eType ) const final;
  399. void CreateReader ( int64_t iSessionId ) const final;
  400. DocstoreDoc_t GetDoc ( RowID_t tRowID, const VecTraits_T<int> * pFieldIds, int64_t iSessionId, bool bPack ) const final;
  401. DocstoreSettings_t GetDocstoreSettings() const final;
  402. private:
  403. struct Block_t
  404. {
  405. SphOffset_t m_tOffset = 0;
  406. DWORD m_uSize = 0;
  407. DWORD m_uHeaderSize = 0;
  408. RowID_t m_tRowID = INVALID_ROWID;
  409. BlockType_e m_eType = BLOCK_TYPE_SMALL;
  410. };
  411. struct FieldInfo_t
  412. {
  413. BYTE m_uFlags = 0;
  414. DWORD m_uCompressedLen = 0;
  415. DWORD m_uUncompressedLen = 0;
  416. };
  417. int64_t m_iIndexId = 0;
  418. CSphString m_sFilename;
  419. CSphAutofile m_tFile;
  420. CSphFixedVector<Block_t> m_dBlocks{0};
  421. std::unique_ptr<Compressor_i> m_pCompressor;
  422. DocstoreFields_c m_tFields;
  423. const Block_t * FindBlock ( RowID_t tRowID ) const;
  424. void ReadFromFile ( BYTE * pData, int iLength, SphOffset_t tOffset, int64_t iSessionId ) const;
  425. DocstoreDoc_t ReadDocFromSmallBlock ( const Block_t & tBlock, RowID_t tRowID, const VecTraits_T<int> * pFieldIds, int64_t iSessionId, bool bPack ) const;
  426. DocstoreDoc_t ReadDocFromBigBlock ( const Block_t & tBlock, const VecTraits_T<int> * pFieldIds, int64_t iSessionId, bool bPack ) const;
  427. BlockData_t UncompressSmallBlock ( const Block_t & tBlock, int64_t iSessionId ) const;
  428. BlockData_t UncompressBigBlockField ( SphOffset_t tOffset, const FieldInfo_t & tInfo, int64_t iSessionId ) const;
  429. bool ProcessSmallBlockDoc ( RowID_t tCurDocRowID, RowID_t tRowID, const VecTraits_T<int> * pFieldIds, const CSphFixedVector<int> & dFieldInRset, bool bPack, MemoryReader2_c & tReader, CSphBitvec & tEmptyFields, DocstoreDoc_t & tResult ) const;
  430. void ProcessBigBlockField ( int iField, const FieldInfo_t & tInfo, int iFieldInRset, bool bPack, int64_t iSessionId, SphOffset_t & tOffset, DocstoreDoc_t & tResult ) const;
  431. };
  432. Docstore_c::Docstore_c ( int64_t iIndexId, const CSphString & sFilename )
  433. : m_iIndexId ( iIndexId )
  434. , m_sFilename ( sFilename )
  435. {}
  436. Docstore_c::~Docstore_c ()
  437. {
  438. BlockCache_c * pBlockCache = BlockCache_c::Get();
  439. if ( pBlockCache )
  440. pBlockCache->ClearByIndexId(m_iIndexId);
  441. DocstoreReaders_c * pReaders = DocstoreReaders_c::Get();
  442. if ( pReaders )
  443. pReaders->DeleteByIndexId(m_iIndexId);
  444. }
  445. bool Docstore_c::Init ( CSphString & sError )
  446. {
  447. CSphAutoreader tReader;
  448. if ( !tReader.Open ( m_sFilename, sError ) )
  449. return false;
  450. DWORD uStorageVersion = tReader.GetDword();
  451. if ( uStorageVersion > STORAGE_VERSION )
  452. {
  453. sError.SetSprintf ( "Unable to load docstore: %s is v.%d, binary is v.%d", m_sFilename.cstr(), uStorageVersion, STORAGE_VERSION );
  454. return false;
  455. }
  456. m_uBlockSize = tReader.GetDword();
  457. m_eCompression = Byte2Compression ( tReader.GetByte() );
  458. m_pCompressor = CreateCompressor ( m_eCompression, m_iCompressionLevel );
  459. if ( !m_pCompressor )
  460. return false;
  461. m_tFields.Load(tReader);
  462. DWORD uNumBlocks = tReader.GetDword();
  463. if ( !uNumBlocks )
  464. return true;
  465. SphOffset_t tHeaderOffset = tReader.GetOffset();
  466. tReader.SeekTo ( tHeaderOffset, 0 );
  467. m_dBlocks.Reset(uNumBlocks);
  468. DWORD tPrevBlockRowID = 0;
  469. SphOffset_t tPrevBlockOffset = 0;
  470. for ( auto & i : m_dBlocks )
  471. {
  472. i.m_tRowID = tReader.UnzipRowid() + tPrevBlockRowID;
  473. i.m_eType = (BlockType_e)tReader.GetByte();
  474. i.m_tOffset = tReader.UnzipOffset() + tPrevBlockOffset;
  475. if ( i.m_eType==BLOCK_TYPE_BIG )
  476. i.m_uHeaderSize = tReader.UnzipInt();
  477. tPrevBlockRowID = i.m_tRowID;
  478. tPrevBlockOffset = i.m_tOffset;
  479. }
  480. for ( int i = 1; i<m_dBlocks.GetLength(); i++ )
  481. m_dBlocks[i-1].m_uSize = m_dBlocks[i].m_tOffset-m_dBlocks[i-1].m_tOffset;
  482. m_dBlocks.Last().m_uSize = tHeaderOffset-m_dBlocks.Last().m_tOffset;
  483. if ( tReader.GetErrorFlag() )
  484. return false;
  485. tReader.Close();
  486. if ( m_tFile.Open ( m_sFilename, SPH_O_READ, sError ) < 0 )
  487. return false;
  488. return true;
  489. }
  490. const Docstore_c::Block_t * Docstore_c::FindBlock ( RowID_t tRowID ) const
  491. {
  492. const Block_t * pFound = sphBinarySearchFirst ( m_dBlocks.Begin(), m_dBlocks.End()-1, bind(&Block_t::m_tRowID), tRowID );
  493. assert(pFound);
  494. if ( pFound->m_tRowID>tRowID )
  495. {
  496. if ( pFound==m_dBlocks.Begin() )
  497. return nullptr;
  498. return pFound-1;
  499. }
  500. return pFound;
  501. }
  502. void Docstore_c::CreateReader ( int64_t iSessionId ) const
  503. {
  504. DocstoreReaders_c * pReaders = DocstoreReaders_c::Get();
  505. if ( pReaders )
  506. pReaders->CreateReader ( iSessionId, m_iIndexId, m_tFile, m_uBlockSize );
  507. }
  508. int Docstore_c::GetFieldId ( const CSphString & sName, DocstoreDataType_e eType ) const
  509. {
  510. return m_tFields.GetFieldId (sName, eType );
  511. }
  512. DocstoreDoc_t Docstore_c::GetDoc ( RowID_t tRowID, const VecTraits_T<int> * pFieldIds, int64_t iSessionId, bool bPack ) const
  513. {
  514. #ifndef NDEBUG
  515. // assume that field ids are sorted
  516. for ( int i = 1; pFieldIds && i < pFieldIds->GetLength(); ++i )
  517. assert ( (*pFieldIds)[i-1] < (*pFieldIds)[i] );
  518. #endif
  519. const Block_t * pBlock = FindBlock(tRowID);
  520. assert ( pBlock );
  521. if ( pBlock->m_eType==BLOCK_TYPE_SMALL )
  522. return ReadDocFromSmallBlock ( *pBlock, tRowID, pFieldIds, iSessionId, bPack );
  523. else
  524. return ReadDocFromBigBlock ( *pBlock, pFieldIds, iSessionId, bPack );
  525. }
  526. struct ScopedBlock_t
  527. {
  528. int64_t m_iIndexId = INT64_MAX;
  529. SphOffset_t m_tOffset = 0;
  530. ~ScopedBlock_t()
  531. {
  532. if ( m_iIndexId==INT64_MAX )
  533. return;
  534. BlockCache_c * pBlockCache = BlockCache_c::Get();
  535. assert ( pBlockCache );
  536. pBlockCache->Release ( { m_iIndexId, m_tOffset } );
  537. }
  538. };
  539. void Docstore_c::ReadFromFile ( BYTE * pData, int iLength, SphOffset_t tOffset, int64_t iSessionId ) const
  540. {
  541. DocstoreReaders_c * pReaders = DocstoreReaders_c::Get();
  542. CSphReader * pReader = nullptr;
  543. if ( pReaders )
  544. pReader = pReaders->GetReader ( iSessionId, m_iIndexId );
  545. if ( pReader )
  546. {
  547. pReader->SeekTo ( tOffset, iLength );
  548. pReader->GetBytes ( pData, iLength );
  549. }
  550. else
  551. sphPread ( m_tFile.GetFD(), pData, iLength, tOffset );
  552. }
  553. BlockData_t Docstore_c::UncompressSmallBlock ( const Block_t & tBlock, int64_t iSessionId ) const
  554. {
  555. BlockData_t tResult;
  556. CSphFixedVector<BYTE> dBlock ( tBlock.m_uSize );
  557. ReadFromFile ( dBlock.Begin(), dBlock.GetLength(), tBlock.m_tOffset, iSessionId );
  558. MemoryReader2_c tBlockReader ( dBlock.Begin(), dBlock.GetLength() );
  559. tResult.m_uFlags = tBlockReader.GetVal<BYTE>();
  560. tResult.m_uNumDocs = tBlockReader.UnzipInt();
  561. tResult.m_uSize = tBlockReader.UnzipInt();
  562. DWORD uCompressedLength = tResult.m_uSize;
  563. bool bCompressed = tResult.m_uFlags & BLOCK_FLAG_COMPRESSED;
  564. if ( bCompressed )
  565. uCompressedLength = tBlockReader.UnzipInt();
  566. const BYTE * pBody = dBlock.Begin() + tBlockReader.GetPos();
  567. CSphFixedVector<BYTE> dDecompressed(0);
  568. if ( bCompressed )
  569. {
  570. dDecompressed.Reset ( tResult.m_uSize );
  571. Verify ( m_pCompressor->Decompress ( VecTraits_T<const BYTE> (pBody, uCompressedLength), dDecompressed) );
  572. tResult.m_pData = dDecompressed.LeakData();
  573. }
  574. else
  575. {
  576. // we can't just pass tResult.m_pData because it doesn't point to the start of the allocated block
  577. tResult.m_pData = new BYTE[tResult.m_uSize];
  578. memcpy ( tResult.m_pData, pBody, tResult.m_uSize );
  579. }
  580. return tResult;
  581. }
  582. bool Docstore_c::ProcessSmallBlockDoc ( RowID_t tCurDocRowID, RowID_t tRowID, const VecTraits_T<int> * pFieldIds, const CSphFixedVector<int> & dFieldInRset, bool bPack, MemoryReader2_c & tReader, CSphBitvec & tEmptyFields, DocstoreDoc_t & tResult ) const
  583. {
  584. bool bDocFound = tCurDocRowID==tRowID;
  585. if ( bDocFound )
  586. tResult.m_dFields.Resize ( pFieldIds ? pFieldIds->GetLength() : m_tFields.GetNumFields() );
  587. DWORD uBitMaskSize = tEmptyFields.GetSizeBytes();
  588. BYTE uDocFlags = tReader.GetVal<BYTE>();
  589. if ( uDocFlags & DOC_FLAG_ALL_EMPTY )
  590. {
  591. for ( auto & i : tResult.m_dFields )
  592. i.Resize(0);
  593. return bDocFound;
  594. }
  595. bool bHasBitmask = !!(uDocFlags & DOC_FLAG_EMPTY_BITMASK);
  596. if ( bHasBitmask )
  597. {
  598. memcpy ( tEmptyFields.Begin(), tReader.Begin()+tReader.GetPos(), uBitMaskSize );
  599. tReader.SetPos ( tReader.GetPos()+uBitMaskSize );
  600. }
  601. for ( int iField = 0; iField < m_tFields.GetNumFields(); iField++ )
  602. if ( !bHasBitmask || !tEmptyFields.BitGet(iField) )
  603. {
  604. DWORD uFieldLength = tReader.UnzipInt();
  605. int iFieldInRset = dFieldInRset[iField];
  606. if ( bDocFound && iFieldInRset!=-1 )
  607. PackData ( tResult.m_dFields[iFieldInRset], tReader.Begin()+tReader.GetPos(), uFieldLength, m_tFields.GetField(iField).m_eType==DOCSTORE_TEXT, bPack );
  608. tReader.SetPos ( tReader.GetPos()+uFieldLength );
  609. }
  610. return bDocFound;
  611. }
  612. DocstoreDoc_t Docstore_c::ReadDocFromSmallBlock ( const Block_t & tBlock, RowID_t tRowID, const VecTraits_T<int> * pFieldIds, int64_t iSessionId, bool bPack ) const
  613. {
  614. BlockCache_c * pBlockCache = BlockCache_c::Get();
  615. BlockData_t tBlockData;
  616. bool bFromCache = pBlockCache && pBlockCache->Find ( { m_iIndexId, tBlock.m_tOffset }, tBlockData );
  617. if ( !bFromCache )
  618. {
  619. tBlockData = UncompressSmallBlock ( tBlock, iSessionId );
  620. bFromCache = pBlockCache && pBlockCache->Add ( { m_iIndexId, tBlock.m_tOffset }, tBlockData );
  621. }
  622. ScopedBlock_t tScopedBlock;
  623. CSphFixedVector<BYTE> tDataPtr {0}; // scoped array ptr
  624. if ( bFromCache )
  625. {
  626. tScopedBlock.m_iIndexId = m_iIndexId;
  627. tScopedBlock.m_tOffset = tBlock.m_tOffset;
  628. }
  629. else
  630. tDataPtr.Set ( tBlockData.m_pData, 0 );
  631. CSphFixedVector<int> dFieldInRset ( m_tFields.GetNumFields() );
  632. CreateFieldRemap ( dFieldInRset, pFieldIds );
  633. DocstoreDoc_t tResult;
  634. RowID_t tCurDocRowID = tBlock.m_tRowID;
  635. MemoryReader2_c tReader ( tBlockData.m_pData, tBlockData.m_uSize );
  636. CSphBitvec tEmptyFields ( m_tFields.GetNumFields() );
  637. for ( int i = 0; i < (int)tBlockData.m_uNumDocs; i++ )
  638. {
  639. if ( ProcessSmallBlockDoc ( tCurDocRowID, tRowID, pFieldIds, dFieldInRset, bPack, tReader, tEmptyFields, tResult ) )
  640. break;
  641. tCurDocRowID++;
  642. }
  643. return tResult;
  644. }
  645. BlockData_t Docstore_c::UncompressBigBlockField ( SphOffset_t tOffset, const FieldInfo_t & tInfo, int64_t iSessionId ) const
  646. {
  647. BlockData_t tResult;
  648. bool bCompressed = !!( tInfo.m_uFlags & FIELD_FLAG_COMPRESSED );
  649. DWORD uDataLen = bCompressed ? tInfo.m_uCompressedLen : tInfo.m_uUncompressedLen;
  650. CSphFixedVector<BYTE> dField ( uDataLen );
  651. ReadFromFile ( dField.Begin(), dField.GetLength(), tOffset, iSessionId );
  652. tResult.m_uSize = tInfo.m_uUncompressedLen;
  653. CSphFixedVector<BYTE> dDecompressed(0);
  654. if ( bCompressed )
  655. {
  656. dDecompressed.Reset ( tResult.m_uSize );
  657. Verify ( m_pCompressor->Decompress ( dField, dDecompressed ) );
  658. tResult.m_pData = dDecompressed.LeakData();
  659. }
  660. else
  661. tResult.m_pData = dField.LeakData();
  662. return tResult;
  663. }
  664. void Docstore_c::ProcessBigBlockField ( int iField, const FieldInfo_t & tInfo, int iFieldInRset, bool bPack, int64_t iSessionId, SphOffset_t & tOffset, DocstoreDoc_t & tResult ) const
  665. {
  666. if ( tInfo.m_uFlags & FIELD_FLAG_EMPTY )
  667. return;
  668. bool bCompressed = !!( tInfo.m_uFlags & FIELD_FLAG_COMPRESSED );
  669. SphOffset_t tOffsetDelta = bCompressed ? tInfo.m_uCompressedLen : tInfo.m_uUncompressedLen;
  670. if ( iFieldInRset==-1 )
  671. {
  672. tOffset += tOffsetDelta;
  673. return;
  674. }
  675. BlockCache_c * pBlockCache = BlockCache_c::Get();
  676. BlockData_t tBlockData;
  677. bool bFromCache = pBlockCache && pBlockCache->Find ( { m_iIndexId, tOffset }, tBlockData );
  678. if ( !bFromCache )
  679. {
  680. tBlockData = UncompressBigBlockField ( tOffset, tInfo, iSessionId );
  681. bFromCache = pBlockCache && pBlockCache->Add ( { m_iIndexId, tOffset }, tBlockData );
  682. }
  683. ScopedBlock_t tScopedBlock;
  684. CSphFixedVector<BYTE> tDataPtr {0}; // scoped array ptr
  685. if ( bFromCache )
  686. {
  687. tScopedBlock.m_iIndexId = m_iIndexId;
  688. tScopedBlock.m_tOffset = tOffset;
  689. }
  690. else
  691. tDataPtr.Set ( tBlockData.m_pData, 0 );
  692. PackData ( tResult.m_dFields[iFieldInRset], tBlockData.m_pData, tBlockData.m_uSize, m_tFields.GetField(iField).m_eType==DOCSTORE_TEXT, bPack );
  693. tOffset += tOffsetDelta;
  694. }
  695. DocstoreDoc_t Docstore_c::ReadDocFromBigBlock ( const Block_t & tBlock, const VecTraits_T<int> * pFieldIds, int64_t iSessionId, bool bPack ) const
  696. {
  697. CSphFixedVector<FieldInfo_t> dFieldInfo ( m_tFields.GetNumFields() );
  698. CSphFixedVector<BYTE> dBlockHeader(tBlock.m_uHeaderSize);
  699. ReadFromFile ( dBlockHeader.Begin(), dBlockHeader.GetLength(), tBlock.m_tOffset, iSessionId );
  700. MemoryReader2_c tReader ( dBlockHeader.Begin(), dBlockHeader.GetLength() );
  701. CSphVector<int> dFieldSort;
  702. BYTE uBlockFlags = tReader.GetVal<BYTE>();
  703. bool bNeedReorder = !!( uBlockFlags & BLOCK_FLAG_FIELD_REORDER );
  704. if ( bNeedReorder )
  705. {
  706. dFieldSort.Resize ( m_tFields.GetNumFields() );
  707. for ( auto & i : dFieldSort )
  708. i = tReader.UnzipInt();
  709. }
  710. for ( int i = 0; i < m_tFields.GetNumFields(); i++ )
  711. {
  712. int iField = bNeedReorder ? dFieldSort[i] : i;
  713. FieldInfo_t & tInfo = dFieldInfo[iField];
  714. tInfo.m_uFlags = tReader.GetVal<BYTE>();
  715. if ( tInfo.m_uFlags & FIELD_FLAG_EMPTY )
  716. continue;
  717. tInfo.m_uUncompressedLen = tReader.UnzipInt();
  718. if ( tInfo.m_uFlags & FIELD_FLAG_COMPRESSED )
  719. tInfo.m_uCompressedLen = tReader.UnzipInt();
  720. }
  721. dBlockHeader.Reset(0);
  722. CSphFixedVector<int> dFieldInRset ( m_tFields.GetNumFields() );
  723. CreateFieldRemap ( dFieldInRset, pFieldIds );
  724. DocstoreDoc_t tResult;
  725. tResult.m_dFields.Resize ( pFieldIds ? pFieldIds->GetLength() : m_tFields.GetNumFields() );
  726. SphOffset_t tOffset = tBlock.m_tOffset+tBlock.m_uHeaderSize;
  727. // i == physical field order in file
  728. // dFieldSort[i] == field order as in m_dFields
  729. // dFieldInRset[iField] == field order in result set
  730. for ( int i = 0; i < m_tFields.GetNumFields(); i++ )
  731. {
  732. int iField = bNeedReorder ? dFieldSort[i] : i;
  733. ProcessBigBlockField ( iField, dFieldInfo[iField], dFieldInRset[iField], bPack, iSessionId, tOffset, tResult );
  734. }
  735. return tResult;
  736. }
  737. DocstoreSettings_t Docstore_c::GetDocstoreSettings() const
  738. {
  739. return *this;
  740. }
  741. //////////////////////////////////////////////////////////////////////////
  742. DocstoreBuilder_i::Doc_t::Doc_t()
  743. {}
  744. DocstoreBuilder_i::Doc_t::Doc_t ( const DocstoreDoc_t & tDoc )
  745. {
  746. m_dFields.Resize ( tDoc.m_dFields.GetLength() );
  747. ARRAY_FOREACH ( i, m_dFields )
  748. m_dFields[i] = VecTraits_T<BYTE> ( tDoc.m_dFields[i].Begin(), tDoc.m_dFields[i].GetLength() );
  749. }
  750. //////////////////////////////////////////////////////////////////////////
  751. class DocstoreBuilder_c : public DocstoreBuilder_i, public DocstoreSettings_t
  752. {
  753. public:
  754. DocstoreBuilder_c ( CSphString sFilename, const DocstoreSettings_t & tSettings, int iBufferSize );
  755. bool Init ( CSphString & sError );
  756. void AddDoc ( RowID_t tRowID, const Doc_t & tDoc ) final;
  757. int AddField ( const CSphString & sName, DocstoreDataType_e eType ) final { return m_tFields.AddField ( sName, eType ); }
  758. void RemoveField ( const CSphString & sName, DocstoreDataType_e eType ) final { return m_tFields.RemoveField ( sName, eType ); }
  759. int GetFieldId ( const CSphString & sName, DocstoreDataType_e eType ) const final { return m_tFields.GetFieldId ( sName, eType ); }
  760. void Finalize() final;
  761. private:
  762. struct StoredDoc_t
  763. {
  764. RowID_t m_tRowID;
  765. CSphVector<CSphVector<BYTE>> m_dFields;
  766. };
  767. CSphString m_sFilename;
  768. CSphVector<StoredDoc_t> m_dStoredDocs;
  769. CSphVector<BYTE> m_dHeader;
  770. CSphVector<BYTE> m_dBuffer;
  771. std::unique_ptr<Compressor_i> m_pCompressor;
  772. MemoryWriter2_c m_tHeaderWriter;
  773. CSphWriter m_tWriter;
  774. DocstoreFields_c m_tFields;
  775. int m_iBufferSize = 0;
  776. DWORD m_uStoredLen = 0;
  777. int m_iNumBlocks = 0;
  778. SphOffset_t m_tHeaderOffset = 0;
  779. SphOffset_t m_tPrevBlockOffset = 0;
  780. DWORD m_tPrevBlockRowID = 0;
  781. using SortedField_t = std::pair<int,int>;
  782. CSphVector<SortedField_t> m_dFieldSort;
  783. CSphVector<CSphVector<BYTE>> m_dCompressedBuffers;
  784. void WriteInitialHeader();
  785. void WriteTrailingHeader();
  786. void WriteBlock();
  787. void WriteSmallBlockHeader ( SphOffset_t tBlockOffset );
  788. void WriteBigBlockHeader ( SphOffset_t tBlockOffset, SphOffset_t tHeaderSize );
  789. void WriteSmallBlock();
  790. void WriteBigBlock();
  791. };
  792. DocstoreBuilder_c::DocstoreBuilder_c ( CSphString sFilename, const DocstoreSettings_t & tSettings, int iBufferSize )
  793. : m_sFilename ( std::move (sFilename) )
  794. , m_tHeaderWriter ( m_dHeader )
  795. , m_iBufferSize ( iBufferSize )
  796. {
  797. *(DocstoreSettings_t*)this = tSettings;
  798. }
  799. bool DocstoreBuilder_c::Init ( CSphString & sError )
  800. {
  801. m_pCompressor = CreateCompressor ( m_eCompression, m_iCompressionLevel );
  802. if ( !m_pCompressor )
  803. return false;
  804. m_tWriter.SetBufferSize(m_iBufferSize);
  805. return m_tWriter.OpenFile ( m_sFilename, sError );
  806. }
  807. void DocstoreBuilder_c::AddDoc ( RowID_t tRowID, const Doc_t & tDoc )
  808. {
  809. assert ( tDoc.m_dFields.GetLength()==m_tFields.GetNumFields() );
  810. DWORD uLen = 0;
  811. for ( const auto & i : tDoc.m_dFields )
  812. uLen += i.GetLength();
  813. if ( m_uStoredLen+uLen > m_uBlockSize )
  814. WriteBlock();
  815. StoredDoc_t & tStoredDoc = m_dStoredDocs.Add();
  816. tStoredDoc.m_tRowID = tRowID;
  817. tStoredDoc.m_dFields.Resize ( m_tFields.GetNumFields() );
  818. for ( int i = 0; i<m_tFields.GetNumFields(); i++ )
  819. {
  820. int iLen = tDoc.m_dFields[i].GetLength();
  821. // remove trailing zero
  822. if ( m_tFields.GetField(i).m_eType==DOCSTORE_TEXT && iLen>0 && tDoc.m_dFields[i][iLen-1]=='\0' )
  823. iLen--;
  824. tStoredDoc.m_dFields[i].Resize(iLen);
  825. memcpy ( tStoredDoc.m_dFields[i].Begin(), tDoc.m_dFields[i].Begin(), iLen );
  826. }
  827. m_uStoredLen += uLen;
  828. }
  829. void DocstoreBuilder_c::Finalize()
  830. {
  831. WriteBlock();
  832. WriteTrailingHeader();
  833. }
  834. void DocstoreBuilder_c::WriteInitialHeader()
  835. {
  836. m_tWriter.PutDword ( STORAGE_VERSION );
  837. m_tWriter.PutDword ( m_uBlockSize );
  838. m_tWriter.PutByte ( Compression2Byte(m_eCompression) );
  839. m_tFields.Save(m_tWriter);
  840. m_tHeaderOffset = m_tWriter.GetPos();
  841. // reserve space for number of blocks
  842. m_tWriter.PutDword(0);
  843. // reserve space for header offset
  844. m_tWriter.PutOffset(0);
  845. }
  846. void DocstoreBuilder_c::WriteTrailingHeader()
  847. {
  848. SphOffset_t tHeaderPos = m_tWriter.GetPos();
  849. // write header
  850. m_tWriter.PutBytes ( m_dHeader.Begin(), m_dHeader.GetLength() );
  851. // rewind to the beginning, store num_blocks, offset to header
  852. m_tWriter.Flush(); // flush is necessary, see similar code in BlobRowBuilder_File_c::Done
  853. m_tWriter.SeekTo(m_tHeaderOffset);
  854. m_tWriter.PutDword(m_iNumBlocks);
  855. m_tWriter.PutOffset(tHeaderPos);
  856. m_tWriter.CloseFile();
  857. }
  858. void DocstoreBuilder_c::WriteSmallBlockHeader ( SphOffset_t tBlockOffset )
  859. {
  860. m_tHeaderWriter.ZipInt ( m_dStoredDocs[0].m_tRowID-m_tPrevBlockRowID ); // initial block rowid delta
  861. m_tHeaderWriter.PutByte ( BLOCK_TYPE_SMALL ); // block type
  862. m_tHeaderWriter.ZipOffset ( tBlockOffset-m_tPrevBlockOffset ); // block offset delta
  863. m_tPrevBlockOffset = tBlockOffset;
  864. m_tPrevBlockRowID = m_dStoredDocs[0].m_tRowID;
  865. }
  866. void DocstoreBuilder_c::WriteBigBlockHeader ( SphOffset_t tBlockOffset, SphOffset_t tHeaderSize )
  867. {
  868. m_tHeaderWriter.ZipInt ( m_dStoredDocs[0].m_tRowID-m_tPrevBlockRowID ); // initial block rowid delta
  869. m_tHeaderWriter.PutByte ( BLOCK_TYPE_BIG ); // block type
  870. m_tHeaderWriter.ZipOffset ( tBlockOffset-m_tPrevBlockOffset ); // block offset delta
  871. m_tHeaderWriter.ZipInt ( tHeaderSize ); // on-disk header size
  872. m_tPrevBlockOffset = tBlockOffset;
  873. m_tPrevBlockRowID = m_dStoredDocs[0].m_tRowID;
  874. }
  875. void DocstoreBuilder_c::WriteSmallBlock()
  876. {
  877. m_dCompressedBuffers.Resize(1);
  878. m_dBuffer.Resize(0);
  879. MemoryWriter2_c tMemWriter ( m_dBuffer );
  880. #ifndef NDEBUG
  881. for ( int i=1; i < m_dStoredDocs.GetLength(); i++ )
  882. assert ( m_dStoredDocs[i].m_tRowID-m_dStoredDocs[i-1].m_tRowID==1 );
  883. #endif // !NDEBUG
  884. CSphBitvec tEmptyFields ( m_tFields.GetNumFields() );
  885. for ( const auto & tDoc : m_dStoredDocs )
  886. {
  887. tEmptyFields.Clear();
  888. ARRAY_FOREACH ( iField, tDoc.m_dFields )
  889. if ( !tDoc.m_dFields[iField].GetLength() )
  890. tEmptyFields.BitSet(iField);
  891. int iEmptyFields = tEmptyFields.BitCount();
  892. if ( iEmptyFields==m_tFields.GetNumFields() )
  893. tMemWriter.PutByte ( DOC_FLAG_ALL_EMPTY );
  894. else
  895. {
  896. bool bNeedsBitmask = iEmptyFields && ( tEmptyFields.GetSizeBytes() < iEmptyFields );
  897. tMemWriter.PutByte ( bNeedsBitmask ? DOC_FLAG_EMPTY_BITMASK : 0 );
  898. if ( bNeedsBitmask )
  899. tMemWriter.PutBytes ( tEmptyFields.Begin(), tEmptyFields.GetSizeBytes() );
  900. ARRAY_FOREACH ( iField, tDoc.m_dFields )
  901. if ( !bNeedsBitmask || !tEmptyFields.BitGet(iField) )
  902. {
  903. const CSphVector<BYTE> & tField = tDoc.m_dFields[iField];
  904. tMemWriter.ZipInt ( tField.GetLength() );
  905. tMemWriter.PutBytes ( tField.Begin(), tField.GetLength() );
  906. }
  907. }
  908. }
  909. CSphVector<BYTE> & dCompressedBuffer = m_dCompressedBuffers[0];
  910. BYTE uBlockFlags = 0;
  911. bool bCompressed = m_pCompressor->Compress ( m_dBuffer, dCompressedBuffer );
  912. if ( bCompressed )
  913. uBlockFlags |= BLOCK_FLAG_COMPRESSED;
  914. WriteSmallBlockHeader ( m_tWriter.GetPos() );
  915. m_tWriter.PutByte ( uBlockFlags ); // block flags
  916. m_tWriter.ZipInt ( m_dStoredDocs.GetLength() ); // num docs
  917. m_tWriter.ZipInt ( m_dBuffer.GetLength() ); // uncompressed length
  918. if ( bCompressed )
  919. m_tWriter.ZipInt ( dCompressedBuffer.GetLength() ); // compressed length
  920. // body data
  921. if ( bCompressed )
  922. m_tWriter.PutBytes ( dCompressedBuffer.Begin(), dCompressedBuffer.GetLength() ); // compressed data
  923. else
  924. m_tWriter.PutBytes ( m_dBuffer.Begin(), m_dBuffer.GetLength() ); // uncompressed data
  925. }
  926. void DocstoreBuilder_c::WriteBigBlock()
  927. {
  928. assert ( m_dStoredDocs.GetLength()==1 );
  929. StoredDoc_t & tDoc = m_dStoredDocs[0];
  930. m_dCompressedBuffers.Resize ( m_tFields.GetNumFields() );
  931. bool bNeedReorder = false;
  932. CSphBitvec tCompressedFields ( m_tFields.GetNumFields() );
  933. int iPrevSize = 0;
  934. ARRAY_FOREACH ( iField, tDoc.m_dFields )
  935. {
  936. const CSphVector<BYTE> & dField = tDoc.m_dFields[iField];
  937. CSphVector<BYTE> & dCompressedBuffer = m_dCompressedBuffers[iField];
  938. bool bCompressed = m_pCompressor->Compress ( dField, dCompressedBuffer );
  939. if ( bCompressed )
  940. tCompressedFields.BitSet(iField);
  941. int iStoredSize = bCompressed ? dCompressedBuffer.GetLength() : dField.GetLength();
  942. bNeedReorder |= iStoredSize < iPrevSize;
  943. iPrevSize = dCompressedBuffer.GetLength();
  944. }
  945. if ( bNeedReorder )
  946. {
  947. m_dFieldSort.Resize ( m_tFields.GetNumFields() );
  948. ARRAY_FOREACH ( iField, tDoc.m_dFields )
  949. {
  950. m_dFieldSort[iField].first = iField;
  951. m_dFieldSort[iField].second = tCompressedFields.BitGet(iField) ? m_dCompressedBuffers[iField].GetLength() : tDoc.m_dFields[iField].GetLength();
  952. }
  953. m_dFieldSort.Sort ( ::bind(&SortedField_t::second) );
  954. }
  955. SphOffset_t tOnDiskHeaderStart = m_tWriter.GetPos();
  956. BYTE uBlockFlags = bNeedReorder ? BLOCK_FLAG_FIELD_REORDER : 0;
  957. m_tWriter.PutByte(uBlockFlags); // block flags
  958. if ( bNeedReorder )
  959. {
  960. for ( const auto & i : m_dFieldSort )
  961. m_tWriter.ZipInt(i.first); // field reorder map
  962. }
  963. for ( int i = 0; i < m_tFields.GetNumFields(); i++ )
  964. {
  965. int iField = bNeedReorder ? m_dFieldSort[i].first : i;
  966. bool bCompressed = tCompressedFields.BitGet(iField);
  967. bool bEmpty = !tDoc.m_dFields[iField].GetLength();
  968. BYTE uFieldFlags = 0;
  969. uFieldFlags |= bCompressed ? FIELD_FLAG_COMPRESSED : 0;
  970. uFieldFlags |= bEmpty ? FIELD_FLAG_EMPTY : 0;
  971. m_tWriter.PutByte(uFieldFlags); // field flags
  972. if ( bEmpty )
  973. continue;
  974. m_tWriter.ZipInt ( tDoc.m_dFields[iField].GetLength() ); // uncompressed len
  975. if ( bCompressed )
  976. m_tWriter.ZipInt ( m_dCompressedBuffers[iField].GetLength() ); // compressed len (if compressed)
  977. }
  978. SphOffset_t tOnDiskHeaderSize = m_tWriter.GetPos() - tOnDiskHeaderStart;
  979. for ( int i = 0; i < m_tFields.GetNumFields(); i++ )
  980. {
  981. int iField = bNeedReorder ? m_dFieldSort[i].first : i;
  982. bool bCompressed = tCompressedFields.BitGet(iField);
  983. bool bEmpty = !tDoc.m_dFields[iField].GetLength();
  984. if ( bEmpty )
  985. continue;
  986. if ( bCompressed )
  987. m_tWriter.PutBytes ( m_dCompressedBuffers[iField].Begin(), m_dCompressedBuffers[iField].GetLength() ); // compressed data
  988. else
  989. m_tWriter.PutBytes( tDoc.m_dFields[iField].Begin(), tDoc.m_dFields[iField].GetLength() ); // uncompressed data
  990. }
  991. WriteBigBlockHeader ( tOnDiskHeaderStart, tOnDiskHeaderSize );
  992. }
  993. void DocstoreBuilder_c::WriteBlock()
  994. {
  995. if ( !m_tWriter.GetPos() )
  996. WriteInitialHeader();
  997. if ( !m_dStoredDocs.GetLength() )
  998. return;
  999. bool bBigBlock = m_dStoredDocs.GetLength()==1 && m_uStoredLen>=m_uBlockSize;
  1000. if ( bBigBlock )
  1001. WriteBigBlock();
  1002. else
  1003. WriteSmallBlock();
  1004. m_iNumBlocks++;
  1005. m_uStoredLen = 0;
  1006. m_dStoredDocs.Resize(0);
  1007. }
  1008. //////////////////////////////////////////////////////////////////////////
  1009. class DocstoreRT_c : public DocstoreRT_i
  1010. {
  1011. public:
  1012. ~DocstoreRT_c() override;
  1013. void AddDoc ( RowID_t tRowID, const DocstoreBuilder_i::Doc_t & tDoc ) final;
  1014. int AddField ( const CSphString & sName, DocstoreDataType_e eType ) final;
  1015. void RemoveField ( const CSphString & sName, DocstoreDataType_e eType ) final;
  1016. void Finalize() final {}
  1017. void SwapRows ( RowID_t tDstID, RowID_t tSrcID ) final;
  1018. void DropTail ( RowID_t tTailID ) final;
  1019. DocstoreDoc_t GetDoc ( RowID_t tRowID, const VecTraits_T<int> * pFieldIds, int64_t iSessionId, bool bPack ) const final;
  1020. int GetFieldId ( const CSphString & sName, DocstoreDataType_e eType ) const final;
  1021. DocstoreSettings_t GetDocstoreSettings() const final;
  1022. void CreateReader ( int64_t iSessionId ) const final {}
  1023. bool Load ( CSphReader & tReader ) final;
  1024. void Save ( Writer_i & tWriter ) final;
  1025. void Load ( MemoryReader_c & tReader ) final;
  1026. void Save ( MemoryWriter_c & tWriter ) final;
  1027. void AddPackedDoc ( RowID_t tRowID, const DocstoreRT_i * pSrcDocstore, RowID_t tSrcRowID ) final;
  1028. int64_t AllocatedBytes() const final;
  1029. static int GetDocSize ( const BYTE * pDoc, int iFieldCount );
  1030. bool CheckFieldsLoaded ( CSphString & sError ) const final;
  1031. private:
  1032. CSphVector<BYTE *> m_dDocs;
  1033. int m_iLoadedFieldCount = 0;
  1034. DocstoreFields_c m_tFields;
  1035. int64_t m_iAllocated = 0;
  1036. };
  1037. DocstoreRT_c::~DocstoreRT_c()
  1038. {
  1039. for ( auto & i : m_dDocs )
  1040. SafeDeleteArray(i);
  1041. }
  1042. void DocstoreRT_c::AddDoc ( RowID_t tRowID, const DocstoreBuilder_i::Doc_t & tDoc )
  1043. {
  1044. assert ( (RowID_t)(m_dDocs.GetLength())==tRowID );
  1045. CSphFixedVector<int> tFieldLengths(tDoc.m_dFields.GetLength());
  1046. int iPackedLen = 0;
  1047. ARRAY_FOREACH ( i, tDoc.m_dFields )
  1048. {
  1049. int iLen = tDoc.m_dFields[i].GetLength();
  1050. // remove trailing zero
  1051. if ( m_tFields.GetField(i).m_eType==DOCSTORE_TEXT && iLen>0 && tDoc.m_dFields[i][iLen-1]=='\0' )
  1052. iLen--;
  1053. iPackedLen += sphCalcZippedLen(iLen)+iLen;
  1054. tFieldLengths[i] = iLen;
  1055. }
  1056. BYTE * & pPacked = m_dDocs.Add();
  1057. pPacked = new BYTE[iPackedLen];
  1058. BYTE * pPtr = pPacked;
  1059. ARRAY_FOREACH ( i, tDoc.m_dFields )
  1060. pPtr += sphPackPtrAttr ( pPtr, {tDoc.m_dFields[i].Begin (), tFieldLengths[i]} );
  1061. m_iAllocated += iPackedLen;
  1062. assert ( pPtr-pPacked==iPackedLen );
  1063. }
  1064. void DocstoreRT_c::SwapRows ( RowID_t tDstID, RowID_t tSrcID )
  1065. {
  1066. assert ( tDstID!=INVALID_ROWID );
  1067. assert ( tSrcID!=INVALID_ROWID );
  1068. ::Swap ( m_dDocs[tDstID], m_dDocs[tSrcID]);
  1069. }
  1070. void DocstoreRT_c::DropTail ( RowID_t tTailID )
  1071. {
  1072. int iFieldsCount = m_tFields.GetNumFields ();
  1073. for ( auto i = tTailID, iLen = (RowID_t) m_dDocs.GetLength (); i<iLen; ++i )
  1074. if ( m_dDocs[i])
  1075. {
  1076. m_iAllocated -= GetDocSize ( m_dDocs[i], iFieldsCount );
  1077. SafeDeleteArray( m_dDocs[i] );
  1078. }
  1079. m_dDocs.Resize ( tTailID );
  1080. }
  1081. int DocstoreRT_c::AddField ( const CSphString & sName, DocstoreDataType_e eType )
  1082. {
  1083. return m_tFields.AddField ( sName, eType );
  1084. }
  1085. void DocstoreRT_c::RemoveField ( const CSphString & sName, DocstoreDataType_e eType )
  1086. {
  1087. return m_tFields.RemoveField ( sName, eType );
  1088. }
  1089. DocstoreDoc_t DocstoreRT_c::GetDoc ( RowID_t tRowID, const VecTraits_T<int> * pFieldIds, int64_t iSessionId, bool bPack ) const
  1090. {
  1091. #ifndef NDEBUG
  1092. // assume that field ids are sorted
  1093. for ( int i = 1; pFieldIds && i < pFieldIds->GetLength(); i++ )
  1094. assert ( (*pFieldIds)[i-1] < (*pFieldIds)[i] );
  1095. #endif
  1096. CSphFixedVector<int> dFieldInRset ( m_tFields.GetNumFields() );
  1097. CreateFieldRemap ( dFieldInRset, pFieldIds );
  1098. DocstoreDoc_t tResult;
  1099. tResult.m_dFields.Resize ( pFieldIds ? pFieldIds->GetLength() : m_tFields.GetNumFields() );
  1100. const BYTE * pDoc = m_dDocs[tRowID];
  1101. for ( int iField = 0; iField < m_tFields.GetNumFields(); iField++ )
  1102. {
  1103. DWORD uFieldLength = UnzipIntBE(pDoc);
  1104. int iFieldInRset = dFieldInRset[iField];
  1105. if ( iFieldInRset!=-1 )
  1106. PackData ( tResult.m_dFields[iFieldInRset], pDoc, uFieldLength, m_tFields.GetField(iField).m_eType==DOCSTORE_TEXT, bPack );
  1107. pDoc += uFieldLength;
  1108. }
  1109. return tResult;
  1110. }
  1111. int DocstoreRT_c::GetFieldId ( const CSphString & sName, DocstoreDataType_e eType ) const
  1112. {
  1113. return m_tFields.GetFieldId ( sName, eType );
  1114. }
  1115. DocstoreSettings_t DocstoreRT_c::GetDocstoreSettings() const
  1116. {
  1117. assert ( 0 && "No settings for RT docstore" );
  1118. return DocstoreSettings_t();
  1119. }
  1120. int DocstoreRT_c::GetDocSize ( const BYTE * pDoc, int iFieldCount )
  1121. {
  1122. const BYTE * p = pDoc;
  1123. for ( int iField = 0; iField<iFieldCount; iField++ )
  1124. p += UnzipIntBE(p);
  1125. return p-pDoc;
  1126. }
  1127. template<typename T>
  1128. int64_t DocstoreLoad_T ( CSphVector<BYTE *> & dDocs, T & tReader )
  1129. {
  1130. int64_t iAllocated = 0;
  1131. DWORD uNumDocs = tReader.UnzipInt();
  1132. dDocs.Resize (uNumDocs);
  1133. for ( auto & i : dDocs )
  1134. {
  1135. DWORD uDocLen = tReader.UnzipInt();
  1136. i = new BYTE[uDocLen];
  1137. tReader.GetBytes ( i, uDocLen );
  1138. iAllocated += uDocLen;
  1139. }
  1140. return iAllocated;
  1141. }
  1142. template<typename T>
  1143. void DocstoreSave_T ( const CSphVector<BYTE *> & dDocs, int iFieldCount , T & tWriter )
  1144. {
  1145. tWriter.ZipInt ( dDocs.GetLength() );
  1146. for ( const auto & i : dDocs )
  1147. {
  1148. int iDocLen = DocstoreRT_c::GetDocSize ( i, iFieldCount );
  1149. tWriter.ZipInt ( iDocLen );
  1150. tWriter.PutBytes ( i, iDocLen );
  1151. }
  1152. }
  1153. bool DocstoreRT_c::Load ( CSphReader & tReader )
  1154. {
  1155. assert ( !m_dDocs.GetLength() && !m_iAllocated );
  1156. m_iAllocated += DocstoreLoad_T<CSphReader> ( m_dDocs, tReader );
  1157. return !tReader.GetErrorFlag();
  1158. }
  1159. void DocstoreRT_c::Save ( Writer_i & tWriter )
  1160. {
  1161. DocstoreSave_T<Writer_i> ( m_dDocs, m_tFields.GetNumFields(), tWriter );
  1162. }
  1163. void DocstoreRT_c::Load ( MemoryReader_c & tReader )
  1164. {
  1165. assert ( !m_dDocs.GetLength() && !m_iAllocated );
  1166. m_iLoadedFieldCount = tReader.GetDword();
  1167. m_iAllocated += DocstoreLoad_T<MemoryReader_c> ( m_dDocs, tReader );
  1168. }
  1169. void DocstoreRT_c::Save ( MemoryWriter_c & tWriter )
  1170. {
  1171. int iFieldCount = m_tFields.GetNumFields();
  1172. tWriter.PutDword ( iFieldCount );
  1173. DocstoreSave_T<MemoryWriter_c> ( m_dDocs, iFieldCount, tWriter );
  1174. }
  1175. bool DocstoreRT_c::CheckFieldsLoaded ( CSphString & sError ) const
  1176. {
  1177. if ( !m_iLoadedFieldCount )
  1178. return true;
  1179. int iFieldsCount = m_tFields.GetNumFields();
  1180. if ( m_iLoadedFieldCount!=iFieldsCount )
  1181. {
  1182. sError.SetSprintf ( "wrong fields count, loaded %d, stored %d", m_iLoadedFieldCount, iFieldsCount );
  1183. return false;
  1184. }
  1185. return true;
  1186. }
  1187. void DocstoreRT_c::AddPackedDoc ( RowID_t tRowID, const DocstoreRT_i * pSrcDocstore, RowID_t tSrcRowID )
  1188. {
  1189. const DocstoreRT_c * pSrc = (const DocstoreRT_c *)pSrcDocstore;
  1190. int iFieldsCount = m_tFields.GetNumFields();
  1191. assert ( iFieldsCount==pSrc->m_tFields.GetNumFields() );
  1192. // get raw doc and its length
  1193. const BYTE * pSrcPacked = pSrc->m_dDocs[tSrcRowID];
  1194. const int iSrcPackedLen = pSrc->GetDocSize ( pSrcPacked, iFieldsCount );
  1195. // copy doc into new place
  1196. BYTE * pDst = new BYTE[iSrcPackedLen];
  1197. memcpy ( pDst, pSrcPacked, iSrcPackedLen );
  1198. assert ( (RowID_t)(m_dDocs.GetLength())==tRowID );
  1199. m_dDocs.Add ( pDst );
  1200. m_iAllocated += GetDocSize ( pDst, iFieldsCount );
  1201. }
  1202. int64_t DocstoreRT_c::AllocatedBytes() const
  1203. {
  1204. return m_iAllocated + m_dDocs.AllocatedBytes();
  1205. }
  1206. //////////////////////////////////////////////////////////////////////////
  1207. std::atomic<int64_t> DocstoreSession_c::m_tUIDGenerator { 0 };
  1208. DocstoreSession_c::DocstoreSession_c()
  1209. : m_iUID ( m_tUIDGenerator.fetch_add ( 1, std::memory_order_relaxed ) )
  1210. {}
  1211. DocstoreSession_c::~DocstoreSession_c()
  1212. {
  1213. DocstoreReaders_c * pReaders = DocstoreReaders_c::Get();
  1214. if ( pReaders )
  1215. pReaders->DeleteBySessionId(m_iUID);
  1216. }
  1217. //////////////////////////////////////////////////////////////////////////
  1218. class DocstoreChecker_c
  1219. {
  1220. public:
  1221. DocstoreChecker_c ( CSphAutoreader & tReader, DebugCheckError_i & tReporter, int64_t iRowsCount );
  1222. bool Check();
  1223. private:
  1224. CSphAutoreader & m_tReader;
  1225. DebugCheckError_i & m_tReporter;
  1226. const char * m_szFilename = nullptr;
  1227. DocstoreFields_c m_tFields;
  1228. std::unique_ptr<Compressor_i> m_pCompressor;
  1229. int64_t m_iRowsCount = 0;
  1230. void CheckSmallBlockDoc ( MemoryReader2_c & tReader, CSphBitvec & tEmptyFields, SphOffset_t tOffset );
  1231. void CheckSmallBlock ( const Docstore_c::Block_t & tBlock );
  1232. void CheckBlock ( const Docstore_c::Block_t & tBlock );
  1233. void CheckBigBlockField ( const Docstore_c::FieldInfo_t & tInfo, SphOffset_t & tOffset );
  1234. void CheckBigBlock ( const Docstore_c::Block_t & tBlock );
  1235. };
  1236. DocstoreChecker_c::DocstoreChecker_c ( CSphAutoreader & tReader, DebugCheckError_i & tReporter, int64_t iRowsCount )
  1237. : m_tReader ( tReader )
  1238. , m_tReporter ( tReporter )
  1239. , m_szFilename ( tReader.GetFilename().cstr() )
  1240. , m_iRowsCount ( iRowsCount )
  1241. {}
  1242. bool DocstoreChecker_c::Check()
  1243. {
  1244. DWORD uStorageVersion = m_tReader.GetDword();
  1245. if ( uStorageVersion > STORAGE_VERSION )
  1246. return m_tReporter.Fail ( "Unable to load docstore: %s is v.%d, binary is v.%d", m_szFilename, uStorageVersion, STORAGE_VERSION );
  1247. m_tReader.GetDword(); // block size
  1248. BYTE uCompression = m_tReader.GetByte();
  1249. if ( uCompression > 2 )
  1250. return m_tReporter.Fail ( "Unknown docstore compression %u in %s", uCompression, m_szFilename );
  1251. Compression_e eCompression = Byte2Compression(uCompression);
  1252. m_pCompressor = CreateCompressor ( eCompression, DEFAULT_COMPRESSION_LEVEL );
  1253. if ( !m_pCompressor )
  1254. return m_tReporter.Fail ( "Unable to create compressor in %s", m_szFilename );
  1255. DWORD uNumFields = m_tReader.GetDword();
  1256. const DWORD MAX_SANE_FIELDS = 32768;
  1257. if ( uNumFields > MAX_SANE_FIELDS )
  1258. return m_tReporter.Fail ( "Too many docstore fields (%u) in %s", uNumFields, m_szFilename );
  1259. for ( int i = 0; i < (int)uNumFields; i++ )
  1260. {
  1261. BYTE uDataType = m_tReader.GetByte();
  1262. if ( uDataType > DOCSTORE_TOTAL )
  1263. return m_tReporter.Fail ( "Unknown docstore data type (%u) in %s", uDataType, m_szFilename );
  1264. DocstoreDataType_e eType = (DocstoreDataType_e)uDataType;
  1265. CSphString sName = m_tReader.GetString();
  1266. const int MAX_SANE_FIELD_NAME_LEN = 32768;
  1267. if ( sName.Length() > MAX_SANE_FIELD_NAME_LEN )
  1268. return m_tReporter.Fail ( "Docstore field name too long (%d) in %s", sName.Length(), m_szFilename );
  1269. m_tFields.AddField ( sName, eType );
  1270. }
  1271. DWORD uNumBlocks = m_tReader.GetDword();
  1272. // docstore from empty index
  1273. if ( !uNumBlocks )
  1274. {
  1275. if ( !m_iRowsCount )
  1276. return true;
  1277. return m_tReporter.Fail ( "Docstore has 0 blocks but " INT64_FMT " documents in %s", m_iRowsCount, m_szFilename );
  1278. }
  1279. SphOffset_t tHeaderOffset = m_tReader.GetOffset();
  1280. if ( tHeaderOffset <= 0 || tHeaderOffset >= m_tReader.GetFilesize() )
  1281. return m_tReporter.Fail ( "Wrong docstore header offset (" INT64_FMT ") in %s", tHeaderOffset, m_szFilename );
  1282. m_tReader.SeekTo ( tHeaderOffset, 0 );
  1283. CSphFixedVector<Docstore_c::Block_t> dBlocks(uNumBlocks);
  1284. DWORD tPrevBlockRowID = 0;
  1285. SphOffset_t tPrevBlockOffset = 0;
  1286. for ( auto & i : dBlocks )
  1287. {
  1288. RowID_t uUnzipped = m_tReader.UnzipRowid();
  1289. if ( (int64_t)uUnzipped + tPrevBlockRowID >= (int64_t)0xFFFFFFFF )
  1290. m_tReporter.Fail ( "Docstore rowid overflow in %s", m_szFilename );
  1291. i.m_tRowID = uUnzipped + tPrevBlockRowID;
  1292. BYTE uBlockType = m_tReader.GetByte();
  1293. if ( uBlockType>BLOCK_TYPE_TOTAL )
  1294. return m_tReporter.Fail ( "Unknown docstore block type (%u) in %s", uBlockType, m_szFilename );
  1295. i.m_eType = (BlockType_e)uBlockType;
  1296. i.m_tOffset = m_tReader.UnzipOffset() + tPrevBlockOffset;
  1297. if ( i.m_tOffset <= 0 || i.m_tOffset >= m_tReader.GetFilesize() )
  1298. return m_tReporter.Fail ( "Wrong docstore block offset (" INT64_FMT ") in %s", i.m_tOffset, m_szFilename );
  1299. if ( i.m_eType==BLOCK_TYPE_BIG )
  1300. i.m_uHeaderSize = m_tReader.UnzipInt();
  1301. tPrevBlockRowID = i.m_tRowID;
  1302. tPrevBlockOffset = i.m_tOffset;
  1303. }
  1304. for ( int i = 1; i<dBlocks.GetLength(); i++ )
  1305. {
  1306. if ( dBlocks[i-1].m_tOffset>=dBlocks[i].m_tOffset )
  1307. return m_tReporter.Fail ( "Descending docstore block offset in %s", m_szFilename );
  1308. dBlocks[i-1].m_uSize = dBlocks[i].m_tOffset-dBlocks[i-1].m_tOffset;
  1309. }
  1310. if ( dBlocks.GetLength() )
  1311. dBlocks.Last().m_uSize = tHeaderOffset-dBlocks.Last().m_tOffset;
  1312. for ( auto & i : dBlocks )
  1313. {
  1314. if ( i.m_tOffset+i.m_uSize > m_tReader.GetFilesize() )
  1315. return m_tReporter.Fail ( "Docstore block size+offset out of bounds in %s", m_szFilename );
  1316. CheckBlock(i);
  1317. }
  1318. if ( m_tReader.GetErrorFlag() )
  1319. return m_tReporter.Fail ( "%s", m_tReader.GetErrorMessage().cstr() );
  1320. return true;
  1321. }
  1322. void DocstoreChecker_c::CheckSmallBlockDoc ( MemoryReader2_c & tReader, CSphBitvec & tEmptyFields, SphOffset_t tOffset )
  1323. {
  1324. BYTE uDocFlags = tReader.GetVal<BYTE>();
  1325. if ( uDocFlags & ( ~(DOC_FLAG_ALL_EMPTY | DOC_FLAG_EMPTY_BITMASK) ) )
  1326. m_tReporter.Fail ( "Unknown docstore doc flag (%u) in %s (offset " INT64_FMT ")", uDocFlags, m_szFilename, tOffset );
  1327. if ( uDocFlags & DOC_FLAG_ALL_EMPTY )
  1328. return;
  1329. DWORD uBitMaskSize = tEmptyFields.GetSizeBytes();
  1330. bool bHasBitmask = !!(uDocFlags & DOC_FLAG_EMPTY_BITMASK);
  1331. if ( bHasBitmask )
  1332. {
  1333. memcpy ( tEmptyFields.Begin(), tReader.Begin()+tReader.GetPos(), uBitMaskSize );
  1334. tReader.SetPos ( tReader.GetPos()+uBitMaskSize );
  1335. }
  1336. for ( int iField = 0; iField < m_tFields.GetNumFields(); iField++ )
  1337. if ( !bHasBitmask || !tEmptyFields.BitGet(iField) )
  1338. {
  1339. DWORD uFieldLength = tReader.UnzipInt();
  1340. tReader.SetPos ( tReader.GetPos()+uFieldLength );
  1341. if ( tReader.GetPos() > tReader.GetLength() )
  1342. m_tReporter.Fail ( "Out of bounds in docstore field data in %s (offset " INT64_FMT ")", m_szFilename, tOffset );
  1343. }
  1344. }
  1345. void DocstoreChecker_c::CheckSmallBlock ( const Docstore_c::Block_t & tBlock )
  1346. {
  1347. CSphFixedVector<BYTE> dBlock ( tBlock.m_uSize );
  1348. m_tReader.SeekTo ( tBlock.m_tOffset, 0 );
  1349. m_tReader.GetBytes ( dBlock.Begin(), dBlock.GetLength() );
  1350. MemoryReader2_c tBlockReader ( dBlock.Begin(), dBlock.GetLength() );
  1351. BlockData_t tResult;
  1352. tResult.m_uFlags = tBlockReader.GetVal<BYTE>();
  1353. tResult.m_uNumDocs = tBlockReader.UnzipInt();
  1354. tResult.m_uSize = tBlockReader.UnzipInt();
  1355. DWORD uCompressedLength = tResult.m_uSize;
  1356. bool bCompressed = tResult.m_uFlags & BLOCK_FLAG_COMPRESSED;
  1357. if ( bCompressed )
  1358. uCompressedLength = tBlockReader.UnzipInt();
  1359. if ( tResult.m_uFlags!=0 && tResult.m_uFlags!=BLOCK_FLAG_COMPRESSED )
  1360. m_tReporter.Fail ( "Unknown docstore small block flag (%u) in %s (offset " INT64_FMT ")", tResult.m_uFlags, m_szFilename, tBlock.m_tOffset );
  1361. if ( uCompressedLength>tResult.m_uSize )
  1362. m_tReporter.Fail ( "Docstore block size mismatch: compressed=%u, uncompressed=%u in %s (offset " INT64_FMT ")", uCompressedLength, tResult.m_uSize, m_szFilename, tBlock.m_tOffset );
  1363. if ( !tResult.m_uNumDocs )
  1364. m_tReporter.Fail ( "Docstore block invalid document count: %d", tResult.m_uNumDocs );
  1365. const BYTE * pBody = dBlock.Begin() + tBlockReader.GetPos();
  1366. CSphFixedVector<BYTE> dDecompressed(0);
  1367. if ( bCompressed )
  1368. {
  1369. dDecompressed.Reset ( tResult.m_uSize );
  1370. if ( !m_pCompressor->Decompress ( VecTraits_T<const BYTE> (pBody, uCompressedLength), dDecompressed) )
  1371. m_tReporter.Fail ( "Error decompressing small block in %s (offset " INT64_FMT ")", m_szFilename, tBlock.m_tOffset );
  1372. tResult.m_pData = dDecompressed.LeakData();
  1373. }
  1374. else
  1375. {
  1376. // we can't just pass tResult.m_pData because it doesn't point to the start of the allocated block
  1377. tResult.m_pData = new BYTE[tResult.m_uSize];
  1378. memcpy ( tResult.m_pData, pBody, tResult.m_uSize );
  1379. }
  1380. MemoryReader2_c tReader ( tResult.m_pData, tResult.m_uSize );
  1381. CSphBitvec tEmptyFields ( m_tFields.GetNumFields() );
  1382. for ( int i = 0; i < (int)tResult.m_uNumDocs; i++ )
  1383. CheckSmallBlockDoc ( tReader, tEmptyFields, tBlock.m_tOffset );
  1384. SafeDelete ( tResult.m_pData );
  1385. }
  1386. void DocstoreChecker_c::CheckBigBlockField ( const Docstore_c::FieldInfo_t & tInfo, SphOffset_t & tOffset )
  1387. {
  1388. if ( tInfo.m_uFlags & FIELD_FLAG_EMPTY )
  1389. return;
  1390. bool bCompressed = !!( tInfo.m_uFlags & FIELD_FLAG_COMPRESSED );
  1391. SphOffset_t tOffsetDelta = bCompressed ? tInfo.m_uCompressedLen : tInfo.m_uUncompressedLen;
  1392. BlockData_t tBlockData;
  1393. CSphFixedVector<BYTE> dField ( tOffsetDelta );
  1394. m_tReader.SeekTo ( tOffset, 0 );
  1395. m_tReader.GetBytes ( dField.Begin(), dField.GetLength() );
  1396. tBlockData.m_uSize = tInfo.m_uUncompressedLen;
  1397. if ( bCompressed )
  1398. {
  1399. CSphFixedVector<BYTE> dDecompressed(0);
  1400. dDecompressed.Reset ( tBlockData.m_uSize );
  1401. if ( !m_pCompressor->Decompress ( dField, dDecompressed ) )
  1402. m_tReporter.Fail ( "Error decompressing big block in %s (offset " INT64_FMT ")", m_szFilename, tOffset );
  1403. }
  1404. tOffset += tOffsetDelta;
  1405. if ( tOffset > m_tReader.GetFilesize() )
  1406. m_tReporter.Fail ( "Docstore block size+offset out of bounds in %s (offset " INT64_FMT ")", m_szFilename, tOffset );
  1407. }
  1408. void DocstoreChecker_c::CheckBigBlock ( const Docstore_c::Block_t & tBlock )
  1409. {
  1410. CSphFixedVector<Docstore_c::FieldInfo_t> dFieldInfo ( m_tFields.GetNumFields() );
  1411. CSphFixedVector<BYTE> dBlockHeader(tBlock.m_uHeaderSize);
  1412. CSphFixedVector<BYTE> dBlock ( tBlock.m_uSize );
  1413. m_tReader.SeekTo ( tBlock.m_tOffset, 0 );
  1414. m_tReader.GetBytes ( dBlockHeader.Begin(), dBlockHeader.GetLength() );
  1415. MemoryReader2_c tReader ( dBlockHeader.Begin(), dBlockHeader.GetLength() );
  1416. CSphVector<int> dFieldSort;
  1417. BYTE uBlockFlags = tReader.GetVal<BYTE>();
  1418. if ( uBlockFlags & ~BLOCK_FLAG_FIELD_REORDER )
  1419. m_tReporter.Fail ( "Unknown docstore big block flag (%u) in %s (offset " INT64_FMT ")", uBlockFlags, m_szFilename, tBlock.m_tOffset );
  1420. bool bNeedReorder = !!( uBlockFlags & BLOCK_FLAG_FIELD_REORDER );
  1421. if ( bNeedReorder )
  1422. {
  1423. dFieldSort.Resize ( m_tFields.GetNumFields() );
  1424. for ( auto & i : dFieldSort )
  1425. {
  1426. i = tReader.UnzipInt();
  1427. if ( i<0 || i>m_tFields.GetNumFields() )
  1428. m_tReporter.Fail ( "Error in docstore field remap (%d) in %s (offset " INT64_FMT ")", i, m_szFilename, tBlock.m_tOffset );
  1429. }
  1430. }
  1431. for ( int i = 0; i < m_tFields.GetNumFields(); i++ )
  1432. {
  1433. int iField = bNeedReorder ? dFieldSort[i] : i;
  1434. Docstore_c::FieldInfo_t & tInfo = dFieldInfo[iField];
  1435. tInfo.m_uFlags = tReader.GetVal<BYTE>();
  1436. if ( tInfo.m_uFlags & (~(FIELD_FLAG_EMPTY | FIELD_FLAG_COMPRESSED) ) )
  1437. m_tReporter.Fail ( "Unknown docstore big block field flag (%u) in %s (offset " INT64_FMT ")", tInfo.m_uFlags, m_szFilename, tBlock.m_tOffset );
  1438. if ( tInfo.m_uFlags & FIELD_FLAG_EMPTY )
  1439. continue;
  1440. tInfo.m_uUncompressedLen = tReader.UnzipInt();
  1441. if ( tInfo.m_uFlags & FIELD_FLAG_COMPRESSED )
  1442. tInfo.m_uCompressedLen = tReader.UnzipInt();
  1443. if ( tInfo.m_uCompressedLen>tInfo.m_uUncompressedLen )
  1444. m_tReporter.Fail ( "Docstore block size mismatch: compressed=%u, uncompressed=%u in %s (offset " INT64_FMT ")", tInfo.m_uCompressedLen, tInfo.m_uUncompressedLen, m_szFilename, tBlock.m_tOffset );
  1445. if ( tReader.GetPos() > tReader.GetLength() )
  1446. m_tReporter.Fail ( "Out of bounds in docstore field data in %s (offset " INT64_FMT ")", m_szFilename, tBlock.m_tOffset );
  1447. }
  1448. SphOffset_t tOffset = tBlock.m_tOffset+tBlock.m_uHeaderSize;
  1449. for ( int i = 0; i < m_tFields.GetNumFields(); i++ )
  1450. CheckBigBlockField ( dFieldInfo[bNeedReorder ? dFieldSort[i] : i], tOffset );
  1451. }
  1452. void DocstoreChecker_c::CheckBlock ( const Docstore_c::Block_t & tBlock )
  1453. {
  1454. if ( tBlock.m_eType==BLOCK_TYPE_SMALL )
  1455. CheckSmallBlock(tBlock);
  1456. else
  1457. CheckBigBlock(tBlock);
  1458. }
  1459. //////////////////////////////////////////////////////////////////////////
  1460. std::unique_ptr<Docstore_i> CreateDocstore ( int64_t iIndexId, const CSphString & sFilename, CSphString & sError )
  1461. {
  1462. auto pDocstore = std::make_unique<Docstore_c>( iIndexId, sFilename );
  1463. if ( !pDocstore->Init(sError) )
  1464. return nullptr;
  1465. return pDocstore;
  1466. }
  1467. std::unique_ptr<DocstoreBuilder_i> CreateDocstoreBuilder ( const CSphString & sFilename, const DocstoreSettings_t & tSettings, int iBufferSize, CSphString & sError )
  1468. {
  1469. auto pBuilder = std::make_unique<DocstoreBuilder_c>( sFilename, tSettings, iBufferSize );
  1470. if ( !pBuilder->Init(sError) )
  1471. return nullptr;
  1472. return pBuilder;
  1473. }
  1474. std::unique_ptr<DocstoreRT_i> CreateDocstoreRT()
  1475. {
  1476. return std::make_unique<DocstoreRT_c>();
  1477. }
  1478. std::unique_ptr<DocstoreFields_i> CreateDocstoreFields()
  1479. {
  1480. return std::make_unique<DocstoreFields_c>();
  1481. }
  1482. void InitDocstore ( int64_t iCacheSize )
  1483. {
  1484. BlockCache_c::Init(iCacheSize);
  1485. DocstoreReaders_c::Init();
  1486. }
  1487. void ShutdownDocstore()
  1488. {
  1489. BlockCache_c::Done();
  1490. DocstoreReaders_c::Done();
  1491. }
  1492. void ClearDocstoreCache()
  1493. {
  1494. BlockCache_c * pBlockCache = BlockCache_c::Get();
  1495. if ( pBlockCache )
  1496. pBlockCache->ClearAll();
  1497. }
  1498. bool CheckDocstore ( CSphAutoreader & tReader, DebugCheckError_i & tReporter, int64_t iRowsCount )
  1499. {
  1500. DocstoreChecker_c tChecker ( tReader, tReporter, iRowsCount );
  1501. return tChecker.Check();
  1502. }