attribute.cpp 27 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071
  1. //
  2. // Copyright (c) 2017-2020, Manticore Software LTD (http://manticoresearch.com)
  3. // All rights reserved
  4. //
  5. // This program is free software; you can redistribute it and/or modify
  6. // it under the terms of the GNU General Public License. You should have
  7. // received a copy of the GPL license along with this program; if you
  8. // did not, you can find it at http://www.gnu.org/
  9. //
  10. #include "attribute.h"
  11. #include "sphinxint.h"
  12. #include "sphinxjson.h"
  13. #include "indexcheck.h"
  14. //////////////////////////////////////////////////////////////////////////
  15. // blob attributes
  16. enum
  17. {
  18. BLOB_ROW_LEN_BYTE = 0,
  19. BLOB_ROW_LEN_WORD = 1,
  20. BLOB_ROW_LEN_DWORD = 2
  21. };
  22. static BYTE CalcBlobRowFlags ( DWORD uTotalLen )
  23. {
  24. if ( uTotalLen<0xFF )
  25. return BLOB_ROW_LEN_BYTE;
  26. if ( uTotalLen<0xFFFF )
  27. return BLOB_ROW_LEN_WORD;
  28. return BLOB_ROW_LEN_DWORD;
  29. }
  30. DWORD RowFlagsToLen ( BYTE uFlags )
  31. {
  32. switch ( uFlags )
  33. {
  34. case BLOB_ROW_LEN_BYTE: return 1;
  35. case BLOB_ROW_LEN_WORD: return 2;
  36. case BLOB_ROW_LEN_DWORD: return 4;
  37. default:
  38. assert ( 0 && "Unknown blob flags" );
  39. return 0;
  40. }
  41. }
  42. class AttributePacker_i
  43. {
  44. public:
  45. virtual ~AttributePacker_i(){}
  46. virtual bool SetData ( const BYTE * pData, int iDataLen, CSphString & sError ) = 0;
  47. virtual const CSphVector<BYTE> & GetData() const = 0;
  48. };
  49. class AttributePacker_c : public AttributePacker_i
  50. {
  51. public:
  52. bool SetData ( const BYTE * pData, int iDataLen, CSphString & /*sError*/ ) override
  53. {
  54. m_dData.Resize ( iDataLen );
  55. memcpy ( m_dData.Begin(), pData, iDataLen );
  56. return true;
  57. }
  58. const CSphVector<BYTE> & GetData() const override
  59. {
  60. return m_dData;
  61. }
  62. protected:
  63. CSphVector<BYTE> m_dData;
  64. };
  65. // packs MVAs coming from updates (pairs of DWORDS for each value)
  66. template <typename INT>
  67. class AttributePacker_MVA_T : public AttributePacker_c
  68. {
  69. public:
  70. bool SetData ( const BYTE * pData, int iDataLen, CSphString & /*sError*/ ) override
  71. {
  72. int iValueSize = sizeof ( int64_t );
  73. int nValues = iDataLen/iValueSize;
  74. m_dData.Resize ( nValues*sizeof(INT) );
  75. auto * pResult = (INT*)m_dData.Begin();
  76. for ( int i = 0; i<nValues; i++ )
  77. {
  78. auto iVal = sphUnalignedRead ( *(int64_t*)pData );
  79. *pResult = INT(iVal);
  80. pResult++;
  81. pData += iValueSize;
  82. }
  83. return true;
  84. }
  85. };
  86. using AttributePacker_MVA32_c = AttributePacker_MVA_T<DWORD>;
  87. class AttributePacker_Json_c : public AttributePacker_c
  88. {
  89. public:
  90. bool SetData ( const BYTE * pData, int iDataLen, CSphString & sError ) override
  91. {
  92. m_dData.Resize(0);
  93. if ( !iDataLen )
  94. return true;
  95. // WARNING, tricky bit
  96. // flex lexer needs last two (!) bytes to be zeroes
  97. // asciiz string supplies one, and we fill out the extra one
  98. // and that works, because CSphString always allocates a small extra gap
  99. char * szData = const_cast<char*>((const char*)pData);
  100. szData[iDataLen] = '\0';
  101. szData[iDataLen+1] = '\0';
  102. return sphJsonParse ( m_dData, szData, g_bJsonAutoconvNumbers, g_bJsonKeynamesToLowercase, true, sError );
  103. }
  104. };
  105. //////////////////////////////////////////////////////////////////////////
  106. class BlobRowBuilder_Base_c : public BlobRowBuilder_i
  107. {
  108. public:
  109. bool SetAttr ( int iAttr, const BYTE * pData, int iDataLen, CSphString & sError ) override;
  110. ~BlobRowBuilder_Base_c () override;
  111. protected:
  112. CSphVector<AttributePacker_i*> m_dAttrs;
  113. };
  114. bool BlobRowBuilder_Base_c::SetAttr ( int iAttr, const BYTE * pData, int iDataLen, CSphString & sError )
  115. {
  116. return m_dAttrs[iAttr]->SetData ( pData, iDataLen, sError );
  117. }
  118. BlobRowBuilder_Base_c::~BlobRowBuilder_Base_c()
  119. {
  120. for ( auto i : m_dAttrs )
  121. SafeDelete (i);
  122. }
  123. //////////////////////////////////////////////////////////////////////////
  124. class BlobRowBuilder_File_c : public BlobRowBuilder_Base_c
  125. {
  126. public:
  127. BlobRowBuilder_File_c ( const ISphSchema & tSchema, SphOffset_t tSpaceForUpdates, bool bJsonPacked );
  128. ~BlobRowBuilder_File_c() override;
  129. bool Setup ( const CSphString & sFile, CSphString & sError );
  130. SphOffset_t Flush() override;
  131. SphOffset_t Flush ( const BYTE * pOldRow ) override;
  132. bool Done ( CSphString & sError ) override;
  133. private:
  134. CSphWriter m_tWriter;
  135. bool m_bDeleteFile {true};
  136. SphOffset_t m_tSpaceForUpdates {0};
  137. };
  138. BlobRowBuilder_File_c::BlobRowBuilder_File_c ( const ISphSchema & tSchema, SphOffset_t tSpaceForUpdates, bool bJsonPacked )
  139. : m_tSpaceForUpdates ( tSpaceForUpdates )
  140. {
  141. for ( int i = 0; i < tSchema.GetAttrsCount(); i++ )
  142. {
  143. const CSphColumnInfo & tCol = tSchema.GetAttr(i);
  144. AttributePacker_i * pPacker = nullptr;
  145. switch ( tCol.m_eAttrType )
  146. {
  147. case SPH_ATTR_STRING:
  148. case SPH_ATTR_INT64SET:
  149. pPacker = new AttributePacker_c;
  150. break;
  151. case SPH_ATTR_UINT32SET:
  152. pPacker = new AttributePacker_MVA32_c;
  153. break;
  154. case SPH_ATTR_JSON:
  155. if ( bJsonPacked )
  156. pPacker = new AttributePacker_c;
  157. else
  158. pPacker = new AttributePacker_Json_c;
  159. break;
  160. default:
  161. break;
  162. }
  163. if ( pPacker )
  164. m_dAttrs.Add(pPacker);
  165. }
  166. }
  167. BlobRowBuilder_File_c::~BlobRowBuilder_File_c()
  168. {
  169. if ( m_bDeleteFile )
  170. m_tWriter.UnlinkFile();
  171. }
  172. bool BlobRowBuilder_File_c::Setup ( const CSphString & sFile, CSphString & sError )
  173. {
  174. if ( !m_tWriter.OpenFile ( sFile, sError ) )
  175. return false;
  176. // reserve space
  177. m_tWriter.PutOffset(0);
  178. return true;
  179. }
  180. SphOffset_t BlobRowBuilder_File_c::Flush()
  181. {
  182. SphOffset_t tRowOffset = m_tWriter.GetPos();
  183. DWORD uTotalLen = 0;
  184. for ( const auto & i : m_dAttrs )
  185. uTotalLen += i->GetData().GetLength();
  186. BYTE uFlags = CalcBlobRowFlags(uTotalLen);
  187. m_tWriter.PutByte(uFlags);
  188. uTotalLen = 0;
  189. for ( const auto & i : m_dAttrs )
  190. {
  191. uTotalLen += i->GetData().GetLength();
  192. switch ( uFlags )
  193. {
  194. case BLOB_ROW_LEN_BYTE:
  195. m_tWriter.PutByte(uTotalLen);
  196. break;
  197. case BLOB_ROW_LEN_WORD:
  198. m_tWriter.PutWord(uTotalLen);
  199. break;
  200. case BLOB_ROW_LEN_DWORD:
  201. m_tWriter.PutDword(uTotalLen);
  202. break;
  203. }
  204. }
  205. for ( const auto & i : m_dAttrs )
  206. m_tWriter.PutBytes ( i->GetData().Begin(), i->GetData().GetLength() );
  207. return tRowOffset;
  208. }
  209. SphOffset_t BlobRowBuilder_File_c::Flush ( const BYTE * pOldRow )
  210. {
  211. assert ( pOldRow );
  212. SphOffset_t tRowOffset = m_tWriter.GetPos();
  213. m_tWriter.PutBytes ( pOldRow, sphGetBlobTotalLen ( pOldRow, m_dAttrs.GetLength() ) );
  214. return tRowOffset;
  215. }
  216. bool BlobRowBuilder_File_c::Done ( CSphString & sError )
  217. {
  218. SphOffset_t tTotalSize = m_tWriter.GetPos();
  219. // FIXME!!! made single function from this mess as order matters here
  220. m_tWriter.Flush(); // store collected data as SeekTo might got rid of buffer collected so far
  221. m_tWriter.SeekTo ( 0 );
  222. m_tWriter.PutOffset ( tTotalSize );
  223. m_tWriter.SeekTo ( tTotalSize + m_tSpaceForUpdates, true );
  224. m_tWriter.CloseFile();
  225. if ( m_tWriter.IsError() )
  226. {
  227. sError.SetSprintf ( "error writing .SPB, %s", sError.cstr() ); // keep original error from writer
  228. return false;
  229. }
  230. m_bDeleteFile = false;
  231. return true;
  232. }
  233. //////////////////////////////////////////////////////////////////////////
  234. class BlobRowBuilder_Mem_c : public BlobRowBuilder_Base_c
  235. {
  236. public:
  237. BlobRowBuilder_Mem_c ( CSphTightVector<BYTE> & dPool );
  238. BlobRowBuilder_Mem_c ( const ISphSchema & tSchema, CSphTightVector<BYTE> & dPool );
  239. SphOffset_t Flush() override;
  240. SphOffset_t Flush ( const BYTE * pOldRow ) override;
  241. bool Done ( CSphString & /*sError*/ ) override { return true; }
  242. protected:
  243. CSphTightVector<BYTE> & m_dPool;
  244. };
  245. BlobRowBuilder_Mem_c::BlobRowBuilder_Mem_c ( CSphTightVector<BYTE> & dPool )
  246. : m_dPool ( dPool )
  247. {}
  248. BlobRowBuilder_Mem_c::BlobRowBuilder_Mem_c ( const ISphSchema & tSchema, CSphTightVector<BYTE> & dPool )
  249. : m_dPool ( dPool )
  250. {
  251. for ( int i = 0; i < tSchema.GetAttrsCount(); i++ )
  252. {
  253. const CSphColumnInfo & tCol = tSchema.GetAttr(i);
  254. AttributePacker_i * pPacker = nullptr;
  255. switch ( tCol.m_eAttrType )
  256. {
  257. case SPH_ATTR_STRING:
  258. case SPH_ATTR_JSON: // json doesn't go to a separate packer because we work with pre-parsed json in this case
  259. case SPH_ATTR_INT64SET:
  260. pPacker = new AttributePacker_c;
  261. break;
  262. case SPH_ATTR_UINT32SET:
  263. pPacker = new AttributePacker_MVA32_c;
  264. break;
  265. default:
  266. break;
  267. }
  268. if ( pPacker )
  269. m_dAttrs.Add(pPacker);
  270. }
  271. }
  272. SphOffset_t BlobRowBuilder_Mem_c::Flush()
  273. {
  274. SphOffset_t tRowOffset = m_dPool.GetLength();
  275. DWORD uTotalLen = 0;
  276. for ( const auto & i : m_dAttrs )
  277. uTotalLen += i->GetData().GetLength();
  278. m_dPool.Reserve ( uTotalLen + m_dAttrs.GetLength()*sizeof(DWORD) + 1 );
  279. BYTE uFlags = CalcBlobRowFlags(uTotalLen);
  280. m_dPool.Add(uFlags);
  281. uTotalLen = 0;
  282. BYTE * pPtr;
  283. for ( const auto & i : m_dAttrs )
  284. {
  285. uTotalLen += i->GetData().GetLength();
  286. switch ( uFlags )
  287. {
  288. case BLOB_ROW_LEN_BYTE:
  289. m_dPool.Add((BYTE)uTotalLen);
  290. break;
  291. case BLOB_ROW_LEN_WORD:
  292. pPtr = m_dPool.AddN ( sizeof(WORD) );
  293. sphUnalignedWrite ( pPtr, (WORD)uTotalLen );
  294. break;
  295. case BLOB_ROW_LEN_DWORD:
  296. pPtr = m_dPool.AddN ( sizeof(DWORD) );
  297. sphUnalignedWrite ( pPtr, (DWORD)uTotalLen );
  298. break;
  299. }
  300. }
  301. for ( const auto & i : m_dAttrs )
  302. {
  303. int iLen = i->GetData().GetLength();
  304. pPtr = m_dPool.AddN(iLen);
  305. memcpy ( pPtr, i->GetData().Begin(), iLen );
  306. }
  307. return tRowOffset;
  308. }
  309. SphOffset_t BlobRowBuilder_Mem_c::Flush ( const BYTE * pOldRow )
  310. {
  311. assert ( 0 );
  312. return 0;
  313. }
  314. //////////////////////////////////////////////////////////////////////////
  315. class BlobRowBuilder_MemUpdate_c : public BlobRowBuilder_Mem_c
  316. {
  317. public:
  318. BlobRowBuilder_MemUpdate_c ( const ISphSchema & tSchema, CSphTightVector<BYTE> & dPool, const CSphBitvec & dAttrsUpdated );
  319. };
  320. BlobRowBuilder_MemUpdate_c::BlobRowBuilder_MemUpdate_c ( const ISphSchema & tSchema, CSphTightVector<BYTE> & dPool, const CSphBitvec & dAttrsUpdated )
  321. : BlobRowBuilder_Mem_c ( dPool )
  322. {
  323. for ( int i = 0; i < tSchema.GetAttrsCount(); i++ )
  324. {
  325. const CSphColumnInfo & tCol = tSchema.GetAttr(i);
  326. if ( !dAttrsUpdated.BitGet(i) && sphIsBlobAttr ( tCol.m_eAttrType ) )
  327. {
  328. m_dAttrs.Add ( new AttributePacker_c );
  329. continue;
  330. }
  331. AttributePacker_i * pPacker = nullptr;
  332. switch ( tCol.m_eAttrType )
  333. {
  334. case SPH_ATTR_STRING:
  335. pPacker = new AttributePacker_c;
  336. break;
  337. case SPH_ATTR_UINT32SET:
  338. pPacker = new AttributePacker_MVA_T<DWORD>;
  339. break;
  340. case SPH_ATTR_INT64SET:
  341. pPacker = new AttributePacker_MVA_T<int64_t>;
  342. break;
  343. case SPH_ATTR_JSON:
  344. pPacker = new AttributePacker_Json_c;
  345. break;
  346. default:
  347. break;
  348. }
  349. if ( pPacker )
  350. m_dAttrs.Add(pPacker);
  351. }
  352. }
  353. //////////////////////////////////////////////////////////////////////////
  354. BlobRowBuilder_i * sphCreateBlobRowBuilder ( const ISphSchema & tSchema, const CSphString & sFile, SphOffset_t tSpaceForUpdates, CSphString & sError )
  355. {
  356. BlobRowBuilder_File_c * pBuilder = new BlobRowBuilder_File_c ( tSchema, tSpaceForUpdates, false );
  357. if ( !pBuilder->Setup ( sFile, sError ) )
  358. SafeDelete ( pBuilder );
  359. return pBuilder;
  360. }
  361. BlobRowBuilder_i * sphCreateBlobRowJsonBuilder ( const ISphSchema & tSchema, const CSphString & sFile, SphOffset_t tSpaceForUpdates, CSphString & sError )
  362. {
  363. BlobRowBuilder_File_c * pBuilder = new BlobRowBuilder_File_c ( tSchema, tSpaceForUpdates, true );
  364. if ( !pBuilder->Setup ( sFile, sError ) )
  365. SafeDelete ( pBuilder );
  366. return pBuilder;
  367. }
  368. BlobRowBuilder_i * sphCreateBlobRowBuilder ( const ISphSchema & tSchema, CSphTightVector<BYTE> & dPool )
  369. {
  370. return new BlobRowBuilder_Mem_c ( tSchema, dPool );
  371. }
  372. BlobRowBuilder_i * sphCreateBlobRowBuilderUpdate ( const ISphSchema & tSchema, CSphTightVector<BYTE> & dPool, const CSphBitvec & dAttrsUpdated )
  373. {
  374. return new BlobRowBuilder_MemUpdate_c ( tSchema, dPool, dAttrsUpdated );
  375. }
  376. //////////////////////////////////////////////////////////////////////////
  377. static int64_t GetBlobRowOffset ( const CSphMatch & tMatch, const CSphAttrLocator & tLocator )
  378. {
  379. // blob row locator NEEDS to be the 2nd attribute after docid
  380. return sphGetBlobRowOffset ( tLocator.m_bDynamic ? tMatch.m_pDynamic : tMatch.m_pStatic );
  381. }
  382. template <typename T>
  383. static const BYTE * GetBlobAttr ( int iBlobAttrId, int nBlobAttrs, const BYTE * pRow, int & iLengthBytes )
  384. {
  385. T uLen1 = sphUnalignedRead ( *((T*)pRow + iBlobAttrId) );
  386. T uLen0 = iBlobAttrId > 0 ? sphUnalignedRead ( *((T*)pRow + iBlobAttrId - 1) ) : 0;
  387. iLengthBytes = (int)uLen1-uLen0;
  388. assert ( iLengthBytes>=0 );
  389. return iLengthBytes ? (const BYTE *)((T*)pRow + nBlobAttrs) + uLen0 : nullptr;
  390. }
  391. static const BYTE * GetBlobAttr ( const BYTE * pRow, int iBlobAttrId, int nBlobAttrs, int & iLengthBytes )
  392. {
  393. switch ( *pRow )
  394. {
  395. case BLOB_ROW_LEN_BYTE: return GetBlobAttr<BYTE> ( iBlobAttrId, nBlobAttrs, pRow+1, iLengthBytes );
  396. case BLOB_ROW_LEN_WORD: return GetBlobAttr<WORD> ( iBlobAttrId, nBlobAttrs, pRow+1, iLengthBytes );
  397. case BLOB_ROW_LEN_DWORD: return GetBlobAttr<DWORD>( iBlobAttrId, nBlobAttrs, pRow+1, iLengthBytes );
  398. default:
  399. break;
  400. }
  401. assert ( 0 && "Unknown blob row type" );
  402. return nullptr;
  403. }
  404. // same as above, but returns pair instead of confusing result-by-ref.
  405. template <typename T>
  406. static ByteBlob_t GetBlobAttr ( int iBlobAttrId, int nBlobAttrs, const BYTE * pRow )
  407. {
  408. auto pTRow = (T*)pRow;
  409. T uLen1 = sphUnalignedRead ( pTRow[iBlobAttrId] );
  410. T uLen0 = (iBlobAttrId > 0) ? sphUnalignedRead ( pTRow[iBlobAttrId - 1] ) : 0;
  411. auto iLengthBytes = (int)uLen1-uLen0;
  412. assert ( iLengthBytes>=0 );
  413. return {iLengthBytes ? (const BYTE *)(pTRow + nBlobAttrs) + uLen0 : nullptr, iLengthBytes };
  414. }
  415. static ByteBlob_t GetBlobAttr ( const BYTE * pRow, int iBlobAttrId, int nBlobAttrs )
  416. {
  417. switch ( *pRow )
  418. {
  419. case BLOB_ROW_LEN_BYTE: return GetBlobAttr<BYTE> ( iBlobAttrId, nBlobAttrs, pRow+1 );
  420. case BLOB_ROW_LEN_WORD: return GetBlobAttr<WORD> ( iBlobAttrId, nBlobAttrs, pRow+1 );
  421. case BLOB_ROW_LEN_DWORD: return GetBlobAttr<DWORD>( iBlobAttrId, nBlobAttrs, pRow+1 );
  422. default:
  423. break;
  424. }
  425. assert ( 0 && "Unknown blob row type" );
  426. return { nullptr, 0 };
  427. }
  428. const BYTE * sphGetBlobAttr ( const CSphMatch & tMatch, const CSphAttrLocator & tLocator, const BYTE * pBlobPool, int & iLengthBytes )
  429. {
  430. assert ( pBlobPool );
  431. int64_t iOffset = GetBlobRowOffset ( tMatch, tLocator );
  432. return GetBlobAttr ( pBlobPool+iOffset, tLocator.m_iBlobAttrId, tLocator.m_nBlobAttrs, iLengthBytes );
  433. }
  434. ByteBlob_t sphGetBlobAttr ( const CSphMatch & tMatch, const CSphAttrLocator & tLocator, const BYTE * pBlobPool )
  435. {
  436. assert ( pBlobPool );
  437. int64_t iOffset = GetBlobRowOffset ( tMatch, tLocator );
  438. return GetBlobAttr ( pBlobPool+iOffset, tLocator.m_iBlobAttrId, tLocator.m_nBlobAttrs );
  439. }
  440. const BYTE * sphGetBlobAttr ( const CSphRowitem * pDocinfo, const CSphAttrLocator & tLocator, const BYTE * pBlobPool, int & iLengthBytes )
  441. {
  442. assert ( pBlobPool );
  443. int64_t iOffset = sphGetBlobRowOffset ( pDocinfo );
  444. return GetBlobAttr ( pBlobPool+iOffset, tLocator.m_iBlobAttrId, tLocator.m_nBlobAttrs, iLengthBytes );
  445. }
  446. ByteBlob_t sphGetBlobAttr ( const CSphRowitem * pDocinfo, const CSphAttrLocator & tLocator, const BYTE * pBlobPool )
  447. {
  448. assert ( pBlobPool );
  449. int64_t iOffset = sphGetBlobRowOffset ( pDocinfo );
  450. return GetBlobAttr ( pBlobPool+iOffset, tLocator.m_iBlobAttrId, tLocator.m_nBlobAttrs );
  451. }
  452. int64_t sphGetBlobRowOffset ( const CSphRowitem * pDocinfo )
  453. {
  454. return sphUnalignedRead ( *((int64_t*)pDocinfo + 1) );
  455. }
  456. void sphSetBlobRowOffset ( CSphRowitem * pDocinfo, int64_t iOffset )
  457. {
  458. sphUnalignedWrite ( (int64_t*)pDocinfo + 1, iOffset );
  459. }
  460. template <typename T>
  461. static int GetBlobAttrLen ( int iBlobAttrId, const BYTE * pRow )
  462. {
  463. assert ( pRow );
  464. T uLen1 = sphUnalignedRead ( *((T*)pRow + iBlobAttrId) );
  465. T uLen0 = iBlobAttrId > 0 ? sphUnalignedRead ( *((T*)pRow+iBlobAttrId-1) ) : 0;
  466. return (int)uLen1-uLen0;
  467. }
  468. int sphGetBlobAttrLen ( const CSphMatch & tMatch, const CSphAttrLocator & tLocator, const BYTE * pBlobPool )
  469. {
  470. assert ( pBlobPool );
  471. int64_t iOffset = GetBlobRowOffset ( tMatch, tLocator );
  472. const BYTE * pRow = pBlobPool+iOffset;
  473. switch ( *pRow )
  474. {
  475. case BLOB_ROW_LEN_BYTE: return GetBlobAttrLen<BYTE> ( tLocator.m_iBlobAttrId, pRow+1 );
  476. case BLOB_ROW_LEN_WORD: return GetBlobAttrLen<WORD> ( tLocator.m_iBlobAttrId, pRow+1 );
  477. case BLOB_ROW_LEN_DWORD: return GetBlobAttrLen<DWORD> ( tLocator.m_iBlobAttrId, pRow+1 );
  478. default:
  479. break;
  480. }
  481. assert ( 0 && "Unknown blob row type" );
  482. return 0;
  483. }
  484. template <typename T>
  485. static int GetBlobTotalLen ( const BYTE * pRow, int nBlobAttrs )
  486. {
  487. assert ( pRow );
  488. return sphUnalignedRead ( *((const T *)pRow + nBlobAttrs - 1 ) ) + nBlobAttrs*sizeof(T) + 1;
  489. }
  490. DWORD sphGetBlobTotalLen ( const BYTE * pRow, int nBlobAttrs )
  491. {
  492. switch ( *pRow )
  493. {
  494. case BLOB_ROW_LEN_BYTE: return GetBlobTotalLen<BYTE> ( pRow+1, nBlobAttrs );
  495. case BLOB_ROW_LEN_WORD: return GetBlobTotalLen<WORD> ( pRow+1, nBlobAttrs );
  496. case BLOB_ROW_LEN_DWORD: return GetBlobTotalLen<DWORD> ( pRow+1, nBlobAttrs );
  497. default:
  498. break;
  499. }
  500. assert ( 0 && "Unknown blob row type" );
  501. return 0;
  502. }
  503. int64_t sphCopyBlobRow ( CSphTightVector<BYTE> & dDstPool, const CSphTightVector<BYTE> & dSrcPool, int64_t iOffset, int nBlobs )
  504. {
  505. const BYTE * pSrcBlob = dSrcPool.Begin()+iOffset;
  506. int iBlobLen = sphGetBlobTotalLen ( pSrcBlob, nBlobs );
  507. int64_t iNewOffset = dDstPool.GetLength();
  508. dDstPool.Resize ( iNewOffset+iBlobLen );
  509. BYTE * pDstBlob = dDstPool.Begin()+iNewOffset;
  510. memcpy ( pDstBlob, pSrcBlob, iBlobLen );
  511. return iNewOffset;
  512. }
  513. void sphAddAttrToBlobRow ( const CSphRowitem * pDocinfo, CSphTightVector<BYTE> & dBlobRow, const BYTE * pPool, int nBlobs )
  514. {
  515. dBlobRow.Resize ( 0 );
  516. if ( nBlobs )
  517. {
  518. const BYTE * pOldRow = pPool + sphGetBlobRowOffset ( pDocinfo );
  519. DWORD uOldBlobLen = sphGetBlobTotalLen ( pOldRow, nBlobs );
  520. DWORD uLenSize = RowFlagsToLen ( *pOldRow );
  521. dBlobRow.Resize ( uOldBlobLen + uLenSize );
  522. BYTE * pNewRow = dBlobRow.Begin();
  523. DWORD uAttrLengthSize = uLenSize*nBlobs+1; // old blob lengths + flags
  524. memcpy ( pNewRow, pOldRow, uAttrLengthSize );
  525. pNewRow += uAttrLengthSize;
  526. pOldRow += uAttrLengthSize;
  527. memcpy ( pNewRow, pOldRow-uLenSize, uLenSize ); // new attr length (last cumulative length)
  528. pNewRow += uLenSize;
  529. memcpy ( pNewRow, pOldRow, uOldBlobLen-uAttrLengthSize );
  530. }
  531. else
  532. {
  533. dBlobRow.Add ( CalcBlobRowFlags(0) ); // 1-byte flags
  534. dBlobRow.Add ( 0 ); // 1-byte length
  535. }
  536. }
  537. void sphRemoveAttrFromBlobRow ( const CSphRowitem * pDocinfo, CSphTightVector<BYTE> & dBlobRow, const BYTE * pPool, int nBlobs, int iBlobAttrId )
  538. {
  539. if ( nBlobs<=1 )
  540. {
  541. dBlobRow.Resize(0);
  542. return;
  543. }
  544. const BYTE * pOldRow = pPool + sphGetBlobRowOffset ( pDocinfo );
  545. BYTE uFlags = *pOldRow;
  546. CSphVector<DWORD> dAttrLengths;
  547. for ( int i = 0; i < nBlobs; i++ )
  548. if ( i!=iBlobAttrId )
  549. {
  550. switch ( uFlags )
  551. {
  552. case BLOB_ROW_LEN_BYTE:
  553. dAttrLengths.Add ( GetBlobAttrLen<BYTE> ( i, pOldRow+1 ) );
  554. break;
  555. case BLOB_ROW_LEN_WORD:
  556. dAttrLengths.Add ( GetBlobAttrLen<WORD> ( i, pOldRow+1 ) );
  557. break;
  558. case BLOB_ROW_LEN_DWORD:
  559. dAttrLengths.Add ( GetBlobAttrLen<DWORD> ( i, pOldRow+1 ) );
  560. break;
  561. default:
  562. break;
  563. }
  564. }
  565. DWORD uTotalLength = 0;
  566. for ( auto i : dAttrLengths )
  567. uTotalLength += i;
  568. dBlobRow.Resize ( 1 + (nBlobs-1)*RowFlagsToLen(uFlags) + uTotalLength );
  569. BYTE * pNewRow = dBlobRow.Begin();
  570. // flags
  571. BYTE uNewFlags = CalcBlobRowFlags ( uTotalLength );
  572. *pNewRow++ = uNewFlags;
  573. // attribute lengths
  574. DWORD uCumulativeLength = 0;
  575. ARRAY_FOREACH ( i, dAttrLengths )
  576. {
  577. uCumulativeLength += dAttrLengths[i];
  578. switch ( uNewFlags )
  579. {
  580. case BLOB_ROW_LEN_BYTE:
  581. sphUnalignedWrite ( pNewRow, (BYTE)uCumulativeLength );
  582. pNewRow += sizeof(BYTE);
  583. break;
  584. case BLOB_ROW_LEN_WORD:
  585. sphUnalignedWrite ( pNewRow, (WORD)uCumulativeLength );
  586. pNewRow += sizeof(WORD);
  587. break;
  588. case BLOB_ROW_LEN_DWORD:
  589. sphUnalignedWrite ( pNewRow, (DWORD)uCumulativeLength );
  590. pNewRow += sizeof(DWORD);
  591. break;
  592. default:
  593. break;
  594. }
  595. }
  596. // attribute data
  597. for ( int i = 0; i < nBlobs; i++ )
  598. if ( i!=iBlobAttrId )
  599. {
  600. int iLengthBytes = 0;
  601. const BYTE * pBlob = GetBlobAttr ( pOldRow, i, nBlobs, iLengthBytes );
  602. memcpy ( pNewRow, pBlob, iLengthBytes );
  603. pNewRow += iLengthBytes;
  604. }
  605. }
  606. template<typename T>
  607. static bool CheckMVAValues ( const T * pMVA, DWORD uLengthBytes, int iBlobAttrId, CSphString & sError )
  608. {
  609. if ( uLengthBytes % sizeof(T) )
  610. {
  611. sError.SetSprintf ( "Blob row error: MVA attribute length=%u is not a multiple of %u (blob attribute %d)", uLengthBytes, DWORD(sizeof(T)), iBlobAttrId );
  612. return false;
  613. }
  614. int nValues = int(uLengthBytes/sizeof(T));
  615. for ( int i = 0; i < nValues-1; i++ )
  616. if ( pMVA[i]>=pMVA[i+1] )
  617. {
  618. sError.SetSprintf ( "Blob row error: descending MVA values found (blob attribute %d)", iBlobAttrId );
  619. return false;
  620. }
  621. return true;
  622. }
  623. bool sphCheckBlobRow ( int64_t iOff, DebugCheckReader_i & tBlobs, const CSphSchema & tSchema, CSphString & sError )
  624. {
  625. CSphVector<ESphAttr> dBlobAttrs;
  626. for ( int i = 0; i < tSchema.GetAttrsCount(); i++ )
  627. {
  628. ESphAttr eAttr = tSchema.GetAttr(i).m_eAttrType;
  629. if ( sphIsBlobAttr(eAttr) )
  630. dBlobAttrs.Add(eAttr);
  631. }
  632. int64_t iBlobsElemCount = tBlobs.GetLengthBytes();
  633. if ( iOff<0 || iOff>iBlobsElemCount )
  634. {
  635. sError.SetSprintf ( "Blob offset out of bounds: " INT64_FMT " (max: " INT64_FMT ")", iOff, iBlobsElemCount );
  636. return false;
  637. }
  638. tBlobs.SeekTo ( iOff, 16 );
  639. BYTE uType = 0;
  640. tBlobs.GetBytes ( &uType, sizeof(uType) );
  641. if ( uType!=BLOB_ROW_LEN_BYTE && uType!=BLOB_ROW_LEN_WORD && uType!=BLOB_ROW_LEN_DWORD )
  642. {
  643. sError.SetSprintf ( "Unknown blob row type: %u", uType );
  644. return false;
  645. }
  646. int nBlobAttrs = dBlobAttrs.GetLength();
  647. DWORD uLenSize = RowFlagsToLen ( uType );
  648. DWORD uAttrLengths = uLenSize*nBlobAttrs;
  649. if ( iOff + uAttrLengths > iBlobsElemCount )
  650. sError = "Blob row too long";
  651. CSphFixedVector<BYTE> dLengths ( uAttrLengths );
  652. tBlobs.GetBytes ( dLengths.Begin(), (int) dLengths.GetLengthBytes() );
  653. const BYTE * pLen = dLengths.Begin();
  654. CSphVector<int> dAttrLengths ( nBlobAttrs );
  655. for ( int i = 0; i < nBlobAttrs; i++ )
  656. {
  657. switch ( uType )
  658. {
  659. case BLOB_ROW_LEN_BYTE: dAttrLengths[i] = GetBlobAttrLen<BYTE> ( i, pLen ); break;
  660. case BLOB_ROW_LEN_WORD: dAttrLengths[i] = GetBlobAttrLen<WORD> ( i, pLen ); break;
  661. case BLOB_ROW_LEN_DWORD: dAttrLengths[i] = GetBlobAttrLen<DWORD> ( i, pLen ); break;
  662. default:
  663. break;
  664. }
  665. }
  666. for ( int i = 0; i < nBlobAttrs-1; i++ )
  667. if ( dAttrLengths[i]<0 )
  668. {
  669. sError = "Blob row error: negative attribute length";
  670. return false;
  671. }
  672. DWORD uTotalLength = 0;
  673. for ( auto i : dAttrLengths )
  674. uTotalLength += (DWORD)i;
  675. if ( iOff+uAttrLengths+uTotalLength > iBlobsElemCount )
  676. {
  677. sError = "Blob row too long";
  678. return false;
  679. }
  680. CSphFixedVector<BYTE> dAttrs ( uTotalLength );
  681. tBlobs.GetBytes ( dAttrs.Begin(), (int) dAttrs.GetLengthBytes() );
  682. const BYTE * pAttr = dAttrs.Begin();
  683. for ( int i = 0; i < nBlobAttrs; i++ )
  684. {
  685. DWORD uLength = (DWORD)dAttrLengths[i];
  686. switch ( dBlobAttrs[i] )
  687. {
  688. case SPH_ATTR_UINT32SET:
  689. if ( !CheckMVAValues ( (const DWORD *)pAttr, uLength, i, sError ) )
  690. return false;
  691. break;
  692. case SPH_ATTR_INT64SET:
  693. if ( !CheckMVAValues ( (const int64_t *)pAttr, uLength, i, sError ) )
  694. return false;
  695. break;
  696. case SPH_ATTR_STRING:
  697. for ( DWORD j = 0; j < uLength; j++ )
  698. if ( !pAttr[j] )
  699. {
  700. sError.SetSprintf ( "Blob row error: string value contains zeroes (blob attribute %d)", i );
  701. return false;
  702. }
  703. break;
  704. default:
  705. break;
  706. }
  707. pAttr += uLength;
  708. }
  709. return true;
  710. }
  711. const char * sphGetBlobLocatorName()
  712. {
  713. static const char * BLOB_LOCATOR_ATTR = "$_blob_locator";
  714. return BLOB_LOCATOR_ATTR;
  715. }
  716. static const CSphString g_sDocidName { "id" };
  717. const char * sphGetDocidName()
  718. {
  719. return g_sDocidName.cstr();
  720. }
  721. const CSphString & sphGetDocidStr()
  722. {
  723. return g_sDocidName;
  724. }
  725. bool sphIsBlobAttr ( ESphAttr eAttr )
  726. {
  727. return eAttr==SPH_ATTR_STRING || eAttr==SPH_ATTR_JSON || eAttr==SPH_ATTR_UINT32SET || eAttr==SPH_ATTR_INT64SET;
  728. }
  729. //////////////////////////////////////////////////////////////////////////
  730. // data ptr attributes
  731. int sphCalcPackedLength ( int iLengthBytes )
  732. {
  733. return sphCalcZippedLen(iLengthBytes) + iLengthBytes;
  734. }
  735. BYTE * sphPackedBlob ( ByteBlob_t dBlob )
  736. {
  737. if ( !dBlob.first ) return nullptr;
  738. return const_cast<BYTE*>(dBlob.first-sphCalcZippedLen (dBlob.second));
  739. }
  740. // allocate buf and pack blob dBlob into it, return pointer to buf
  741. BYTE * sphPackPtrAttr ( ByteBlob_t dBlob )
  742. {
  743. if ( !dBlob.second )
  744. return nullptr;
  745. assert ( dBlob.first );
  746. BYTE * pPacked = sphAllocateSmall ( sphCalcPackedLength ( dBlob.second ));
  747. sphPackPtrAttr ( pPacked, std::move(dBlob) );
  748. return pPacked;
  749. }
  750. // pack blob pData[iLengthBytes] into preallocated buf
  751. int sphPackPtrAttr ( BYTE * pPrealloc, ByteBlob_t dBlob )
  752. {
  753. assert ( pPrealloc && IsValid ( dBlob ) );
  754. int iZippedLen = sphZipToPtr ( pPrealloc, dBlob.second );
  755. memcpy ( pPrealloc+iZippedLen, dBlob.first, dBlob.second );
  756. return iZippedLen+dBlob.second;
  757. }
  758. void sphPackPtrAttrInPlace ( TightPackedVec_T<BYTE> & dAttr, int iSize )
  759. {
  760. BYTE bSize[20];
  761. if ( iSize<0 ) iSize = dAttr.GetLength();
  762. int iZippedLen = sphZipToPtr ( bSize, iSize );
  763. dAttr.Resize ( iZippedLen+iSize );
  764. BYTE * pData = dAttr.Begin ();
  765. memmove ( pData+iZippedLen, pData, iSize );
  766. memcpy ( pData, bSize, iZippedLen );
  767. }
  768. // allocate buf for pack of iLengthBytes, pack size, then put pointer to payload in *ppData, and return buf
  769. BYTE * sphPackPtrAttr ( int iLengthBytes, BYTE ** ppData )
  770. {
  771. assert ( ppData );
  772. BYTE * pPacked = sphAllocateSmall ( sphCalcPackedLength ( iLengthBytes ) );
  773. *ppData = pPacked;
  774. *ppData += sphZipToPtr ( pPacked, iLengthBytes );
  775. return pPacked;
  776. }
  777. int sphUnpackPtrAttr ( const BYTE * pData, const BYTE ** ppUnpacked )
  778. {
  779. assert ( ppUnpacked );
  780. if ( !pData )
  781. {
  782. *ppUnpacked = nullptr;
  783. return 0;
  784. }
  785. int iLen = (int)sphUnzipInt ( pData );
  786. *ppUnpacked = pData;
  787. return iLen;
  788. }
  789. ByteBlob_t sphUnpackPtrAttr ( const BYTE * pData )
  790. {
  791. if ( !pData )
  792. return { nullptr, 0 };
  793. auto iLen = (int)sphUnzipInt ( pData );
  794. return { pData, iLen };
  795. }
  796. ESphAttr sphPlainAttrToPtrAttr ( ESphAttr eAttrType )
  797. {
  798. switch ( eAttrType )
  799. {
  800. case SPH_ATTR_STRING: return SPH_ATTR_STRINGPTR;
  801. case SPH_ATTR_JSON: return SPH_ATTR_JSON_PTR;
  802. case SPH_ATTR_UINT32SET: return SPH_ATTR_UINT32SET_PTR;
  803. case SPH_ATTR_INT64SET: return SPH_ATTR_INT64SET_PTR;
  804. case SPH_ATTR_JSON_FIELD: return SPH_ATTR_JSON_FIELD_PTR;
  805. default: return eAttrType;
  806. };
  807. }
  808. bool sphIsDataPtrAttr ( ESphAttr eAttr )
  809. {
  810. return eAttr==SPH_ATTR_STRINGPTR || eAttr==SPH_ATTR_FACTORS || eAttr==SPH_ATTR_FACTORS_JSON
  811. || eAttr==SPH_ATTR_UINT32SET_PTR || eAttr==SPH_ATTR_INT64SET_PTR
  812. || eAttr==SPH_ATTR_JSON_PTR || eAttr==SPH_ATTR_JSON_FIELD_PTR;
  813. }
  814. //////////////////////////////////////////////////////////////////////////
  815. // misc attribute-related
  816. template < typename T >
  817. static void MVA2Str ( const T * pMVA, int iLengthBytes, StringBuilder_c &dStr )
  818. {
  819. dStr.GrowEnough ( ( SPH_MAX_NUMERIC_STR + 1 ) * iLengthBytes / sizeof ( DWORD ) );
  820. int nValues = iLengthBytes / sizeof ( T );
  821. Comma_c sComma ( "," );
  822. for ( int i = 0; i<nValues; ++i )
  823. {
  824. dStr << sComma;
  825. dStr.GrowEnough ( SPH_MAX_NUMERIC_STR );
  826. dStr += sph::NtoA ( dStr.end (), pMVA[i] );
  827. }
  828. }
  829. bool sphIsInternalAttr ( const CSphString & sAttrName )
  830. {
  831. return sAttrName==sphGetBlobLocatorName();
  832. }
  833. bool sphIsInternalAttr ( const CSphColumnInfo & tCol )
  834. {
  835. return sphIsInternalAttr ( tCol.m_sName );
  836. }
  837. void sphMVA2Str ( const BYTE * pMVA, int iLengthBytes, bool b64bit, StringBuilder_c &dStr )
  838. {
  839. if ( b64bit )
  840. MVA2Str ( ( const int64_t * ) pMVA, iLengthBytes, dStr );
  841. else
  842. MVA2Str ( ( const DWORD * ) pMVA, iLengthBytes, dStr );
  843. }
  844. void sphPackedMVA2Str ( const BYTE * pMVA, bool b64bit, StringBuilder_c & dStr )
  845. {
  846. int iLengthBytes = sphUnpackPtrAttr ( pMVA, &pMVA );
  847. sphMVA2Str( pMVA, iLengthBytes, b64bit, dStr );
  848. }