distinct.h 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522
  1. //
  2. // Copyright (c) 2017-2026, Manticore Software LTD (https://manticoresearch.com)
  3. // Copyright (c) 2001-2016, Andrew Aksyonoff
  4. // Copyright (c) 2008-2016, Sphinx Technologies Inc
  5. // All rights reserved
  6. //
  7. // This program is free software; you can redistribute it and/or modify
  8. // it under the terms of the GNU General Public License. You should have
  9. // received a copy of the GPL license along with this program; if you
  10. // did not, you can find it at http://www.gnu.org/
  11. //
  12. #ifndef _distinct_
  13. #define _distinct_
  14. #include "grouper.h"
  15. #include "std/openhash.h"
  16. #include "hyperloglog.h"
  17. struct ValueWithCount_t
  18. {
  19. SphAttr_t m_tValue;
  20. int m_iCount;
  21. inline bool operator < ( const ValueWithCount_t & rhs ) const
  22. {
  23. if ( m_tValue != rhs.m_tValue )
  24. return m_tValue < rhs.m_tValue;
  25. return m_iCount > rhs.m_iCount;
  26. }
  27. inline bool operator== ( const ValueWithCount_t & rhs ) const
  28. {
  29. return m_tValue == rhs.m_tValue;
  30. }
  31. };
  32. struct ValueWithGroup_t
  33. {
  34. public:
  35. SphGroupKey_t m_tGroup;
  36. SphAttr_t m_tValue;
  37. inline bool operator < ( const ValueWithGroup_t & rhs ) const
  38. {
  39. if ( m_tGroup!=rhs.m_tGroup ) return m_tGroup<rhs.m_tGroup;
  40. return m_tValue<rhs.m_tValue;
  41. }
  42. inline bool operator == ( const ValueWithGroup_t & rhs ) const
  43. {
  44. return m_tGroup==rhs.m_tGroup && m_tValue==rhs.m_tValue;
  45. }
  46. };
  47. struct ValueWithGroupCount_t
  48. {
  49. public:
  50. SphGroupKey_t m_tGroup;
  51. SphAttr_t m_tValue;
  52. int m_iCount;
  53. ValueWithGroupCount_t () = default;
  54. ValueWithGroupCount_t ( SphGroupKey_t uGroup, SphAttr_t uValue, int iCount )
  55. : m_tGroup ( uGroup )
  56. , m_tValue ( uValue )
  57. , m_iCount ( iCount )
  58. {}
  59. inline bool operator < ( const ValueWithGroupCount_t & rhs ) const
  60. {
  61. if ( m_tGroup!=rhs.m_tGroup ) return m_tGroup<rhs.m_tGroup;
  62. if ( m_tValue!=rhs.m_tValue ) return m_tValue<rhs.m_tValue;
  63. return m_iCount>rhs.m_iCount;
  64. }
  65. inline bool operator == ( const ValueWithGroupCount_t & rhs ) const
  66. {
  67. return m_tGroup==rhs.m_tGroup && m_tValue==rhs.m_tValue;
  68. }
  69. };
  70. template <typename T>
  71. class UniqGrouped_T : public CSphVector<T>
  72. {
  73. using BASE = CSphVector<T>;
  74. public:
  75. UniqGrouped_T () { BASE::Reserve ( 16384 ); }
  76. void Add ( const ValueWithGroupCount_t & tValue );
  77. void Sort();
  78. int CountStart ( SphGroupKey_t & tOutGroup ); ///< starting counting distinct values, returns count and group key (0 if empty)
  79. int CountNext ( SphGroupKey_t & tOutGroup ); ///< continues counting distinct values, returns count and group key (0 if done)
  80. void Compact ( VecTraits_T<SphGroupKey_t> & dRemoveGroups );
  81. void CopyTo (UniqGrouped_T & dRhs ) const;
  82. void Reset() { BASE::Resize(0); }
  83. void SetAccuracy ( int iAccuracy ) {}
  84. protected:
  85. int m_iCountPos = 0;
  86. bool m_bSorted = true;
  87. };
  88. template <typename T>
  89. inline void UniqGrouped_T<T>::Add ( const ValueWithGroupCount_t & tValue )
  90. {
  91. BASE::Add ( { tValue.m_tGroup, tValue.m_tValue } );
  92. m_bSorted = false;
  93. }
  94. template <>
  95. inline void UniqGrouped_T<ValueWithGroupCount_t>::Add ( const ValueWithGroupCount_t & tValue )
  96. {
  97. BASE::Add ( tValue );
  98. m_bSorted = false;
  99. }
  100. template <typename T>
  101. void UniqGrouped_T<T>::Sort()
  102. {
  103. if ( m_bSorted )
  104. return;
  105. BASE::Sort();
  106. m_bSorted = true;
  107. }
  108. template <typename T>
  109. int UniqGrouped_T<T>::CountStart ( SphGroupKey_t & tOutGroup )
  110. {
  111. m_iCountPos = 0;
  112. return CountNext ( tOutGroup );
  113. }
  114. template <typename T>
  115. int UniqGrouped_T<T>::CountNext ( SphGroupKey_t & tOutGroup )
  116. {
  117. assert ( m_bSorted );
  118. if ( m_iCountPos>=BASE::GetLength() )
  119. return 0;
  120. constexpr bool bNeedCount = std::is_same<T, ValueWithGroupCount_t>::value;
  121. auto tGroup = BASE::m_pData[m_iCountPos].m_tGroup;
  122. auto tValue = BASE::m_pData[m_iCountPos].m_tValue;
  123. int iCount = 1;
  124. if constexpr ( bNeedCount )
  125. iCount = BASE::m_pData[m_iCountPos].m_iCount;
  126. tOutGroup = tGroup;
  127. m_iCountPos++;
  128. while ( m_iCountPos<BASE::GetLength() && BASE::m_pData[m_iCountPos].m_tGroup==tGroup )
  129. {
  130. if constexpr ( bNeedCount )
  131. {
  132. // if we have similar values in both groups, we can safely assume that we have at least one duplicate
  133. int iAdd = BASE::m_pData[m_iCountPos].m_iCount;
  134. assert ( iAdd>0 );
  135. if ( BASE::m_pData[m_iCountPos].m_tValue==tValue )
  136. iAdd--;
  137. iCount += iAdd;
  138. }
  139. else
  140. {
  141. if ( BASE::m_pData[m_iCountPos].m_tValue!=tValue )
  142. iCount++;
  143. }
  144. tValue = BASE::m_pData[m_iCountPos].m_tValue;
  145. ++m_iCountPos;
  146. }
  147. return iCount;
  148. }
  149. // works like 'uniq', and also throw away provided values to remove.
  150. template <typename T>
  151. void UniqGrouped_T<T>::Compact ( VecTraits_T<SphGroupKey_t>& dRemoveGroups )
  152. {
  153. assert ( m_bSorted );
  154. if ( !BASE::GetLength() )
  155. return;
  156. dRemoveGroups.Sort();
  157. auto * pRemoveGroups = dRemoveGroups.begin ();
  158. auto iRemoveGroups = dRemoveGroups.GetLength ();
  159. auto pSrc = BASE::Begin();
  160. auto pDst = BASE::Begin();
  161. auto pEnd = BASE::End();
  162. // skip remove-groups which are not in my list
  163. while ( iRemoveGroups && (*pRemoveGroups)<pSrc->m_tGroup )
  164. {
  165. ++pRemoveGroups;
  166. --iRemoveGroups;
  167. }
  168. for ( ; pSrc<pEnd; ++pSrc )
  169. {
  170. // check if this entry needs to be removed
  171. while ( iRemoveGroups && (*pRemoveGroups)<pSrc->m_tGroup )
  172. {
  173. ++pRemoveGroups;
  174. --iRemoveGroups;
  175. }
  176. if ( iRemoveGroups && pSrc->m_tGroup==*pRemoveGroups )
  177. continue;
  178. // check if it's a dupe
  179. if ( pDst>BASE::Begin() && pDst[-1]==pSrc[0] )
  180. continue;
  181. *pDst++ = *pSrc;
  182. }
  183. assert ( pDst-BASE::Begin()<=BASE::GetLength() );
  184. BASE::m_iCount = pDst-BASE::Begin();
  185. }
  186. template <typename T>
  187. void UniqGrouped_T<T>::CopyTo ( UniqGrouped_T & dRhs ) const
  188. {
  189. if ( m_bSorted && dRhs.m_bSorted )
  190. {
  191. UniqGrouped_T dRes;
  192. dRes.MergeSorted ( dRhs, *this );
  193. dRes.m_bSorted = true;
  194. dRhs = std::move(dRes);
  195. }
  196. else
  197. {
  198. auto * pNewValues = dRhs.AddN ( BASE::GetLength() );
  199. memcpy ( pNewValues, BASE::Begin(), BASE::GetLengthBytes64() );
  200. dRhs.m_bSorted = false;
  201. }
  202. }
  203. /////////////////////////////////////////////////////////////////////
  204. template <typename T>
  205. class UniqSingle_T : public CSphVector<T>
  206. {
  207. using BASE = CSphVector<T>;
  208. public:
  209. UniqSingle_T() { BASE::Reserve ( 16384 ); }
  210. void Add ( const ValueWithGroupCount_t & tValue );
  211. void Sort();
  212. void Compact();
  213. void CopyTo ( UniqSingle_T & dRhs ) const;
  214. void Reset() { BASE::Resize(0); }
  215. int CountDistinct();
  216. protected:
  217. bool m_bSorted = true;
  218. };
  219. template <typename T>
  220. inline void UniqSingle_T<T>::Add ( const ValueWithGroupCount_t & tValue )
  221. {
  222. BASE::Add ( tValue.m_tValue );
  223. m_bSorted = false;
  224. }
  225. template <>
  226. inline void UniqSingle_T<ValueWithCount_t>::Add ( const ValueWithGroupCount_t & tValue )
  227. {
  228. BASE::Add ( { tValue.m_tValue, tValue.m_iCount } );
  229. m_bSorted = false;
  230. }
  231. template <typename T>
  232. void UniqSingle_T<T>::Sort()
  233. {
  234. if ( m_bSorted )
  235. return;
  236. BASE::Sort();
  237. m_bSorted = true;
  238. }
  239. template <typename T>
  240. void UniqSingle_T<T>::Compact()
  241. {
  242. BASE::Uniq ( !m_bSorted );
  243. m_bSorted = true;
  244. }
  245. template <typename T>
  246. void UniqSingle_T<T>::CopyTo ( UniqSingle_T & dRhs ) const
  247. {
  248. if ( m_bSorted && dRhs.m_bSorted )
  249. {
  250. UniqSingle_T dRes;
  251. dRes.MergeSorted ( dRhs, *this );
  252. dRes.m_bSorted = true;
  253. dRhs = std::move(dRes);
  254. }
  255. else
  256. {
  257. auto * pNewValues = dRhs.AddN ( BASE::GetLength() );
  258. memcpy ( pNewValues, BASE::Begin(), BASE::GetLengthBytes64() );
  259. dRhs.m_bSorted = false;
  260. }
  261. }
  262. template <typename T>
  263. int UniqSingle_T<T>::CountDistinct()
  264. {
  265. constexpr bool bNeedCount = std::is_same<T, ValueWithCount_t>::value;
  266. Sort();
  267. auto & dArray = *this;
  268. int iCount = 1;
  269. if constexpr ( bNeedCount )
  270. iCount = dArray[0].m_iCount;
  271. for ( int i = 1, iLen = BASE::GetLength (); i<iLen; ++i )
  272. {
  273. if constexpr ( bNeedCount )
  274. {
  275. // if we have similar values in both groups, we can safely assume that we have at least one duplicate
  276. int iAdd = dArray[i].m_iCount;
  277. assert ( iAdd>0 );
  278. if ( dArray[i-1]==dArray[i] )
  279. iAdd--;
  280. iCount += iAdd;
  281. }
  282. else
  283. {
  284. if ( dArray[i-1]!=dArray[i] )
  285. iCount++;
  286. }
  287. }
  288. return iCount;
  289. }
  290. class UniqHLLTraits_c
  291. {
  292. public:
  293. UniqHLLTraits_c();
  294. ~UniqHLLTraits_c() { Reset(); }
  295. UniqHLLTraits_c & operator = ( UniqHLLTraits_c && tRhs );
  296. void Reset();
  297. void SetAccuracy ( int iAccuracy );
  298. protected:
  299. static const int SMALL_ARRAY_SIZE = 16;
  300. static const int MIN_HASH_SIZE = 64;
  301. static const int NON_PACKED_HLL_THRESH = 16;
  302. using HLLDensePacked_c = HyperLogLogDense_T<>;
  303. using HLLDenseNonPacked_c = HyperLogLogDense_T<uint64_t, FarmHash_T<uint64_t>, RegistersNonPacked_c>;
  304. using SmallArray_c = LazyVector_T<SphAttr_t, sph::DefaultRelimit, SMALL_ARRAY_SIZE>;
  305. using Hash_c = OpenHashSetFastClear_T<SphAttr_t>;
  306. enum class ContainterType_e
  307. {
  308. ARRAY,
  309. HASH,
  310. HLL_DENSE_PACKED,
  311. HLL_DENSE_NONPACKED,
  312. };
  313. struct Container_t
  314. {
  315. union
  316. {
  317. SmallArray_c * m_pArray = nullptr;
  318. Hash_c * m_pHash;
  319. HLLDensePacked_c * m_pHLLDensePacked;
  320. HLLDenseNonPacked_c * m_pHLLDenseNonPacked;
  321. };
  322. ContainterType_e m_eType = ContainterType_e::ARRAY;
  323. int m_iHashIdx = 0;
  324. Container_t() = default;
  325. Container_t ( const Container_t & tRhs );
  326. Container_t & operator = ( Container_t && tRhs );
  327. Container_t & operator = ( const Container_t & tRhs );
  328. int Estimate() const;
  329. void Reset();
  330. bool IsEmpty() const;
  331. FORCE_INLINE bool FindInArray ( SphAttr_t tValue )
  332. {
  333. assert ( m_eType==ContainterType_e::ARRAY );
  334. for ( auto i : *m_pArray )
  335. if ( i==tValue )
  336. return true;
  337. return false;
  338. }
  339. };
  340. int m_iAccuracy = 16;
  341. void AddToContainer ( Container_t & tContainer, SphAttr_t tValue );
  342. FORCE_INLINE SmallArray_c * AllocateArray()
  343. {
  344. if ( m_dUnusedArray.GetLength() )
  345. {
  346. SmallArray_c * pNew = m_dUnusedArray.Pop();
  347. pNew->Resize(0);
  348. return pNew;
  349. }
  350. return new SmallArray_c;
  351. }
  352. FORCE_INLINE void FreeContainer ( Container_t & tContainer )
  353. {
  354. switch ( tContainer.m_eType )
  355. {
  356. case ContainterType_e::ARRAY: m_dUnusedArray.Add ( tContainer.m_pArray ); break;
  357. case ContainterType_e::HASH: m_dUnusedHashes[tContainer.m_iHashIdx].Add ( tContainer.m_pHash ); break;
  358. case ContainterType_e::HLL_DENSE_PACKED: m_dUnusedHLLDensePacked.Add ( tContainer.m_pHLLDensePacked ); break;
  359. case ContainterType_e::HLL_DENSE_NONPACKED: m_dUnusedHLLDenseNonPacked.Add ( tContainer.m_pHLLDenseNonPacked ); break;
  360. default: assert ( 0 && "Unknown container type" ); break;
  361. }
  362. tContainer.m_pArray = nullptr;
  363. }
  364. void ConvertToHash ( Container_t & tContainer );
  365. void ConvertToHLLDensePacked ( Container_t & tContainer );
  366. void ConvertToHLLDenseNonPacked ( Container_t & tContainer );
  367. private:
  368. CSphVector<SmallArray_c *> m_dUnusedArray;
  369. CSphVector<CSphVector<Hash_c *>> m_dUnusedHashes;
  370. CSphVector<HLLDensePacked_c *> m_dUnusedHLLDensePacked;
  371. CSphVector<HLLDenseNonPacked_c *> m_dUnusedHLLDenseNonPacked;
  372. int m_iMaxHashSize = 0;
  373. HLLDensePacked_c * AllocateHLLDensePacked();
  374. HLLDenseNonPacked_c * AllocateHLLDenseNonPacked();
  375. Hash_c * AllocateHash ( int iIdx );
  376. void MoveToLargerHash ( Container_t & tContainer );
  377. template <typename T>
  378. friend void CopyContainerTo ( SphGroupKey_t tGroup, const UniqHLLTraits_c::Container_t & tFrom, T & tRhs );
  379. };
  380. class UniqHLL_c : public UniqHLLTraits_c
  381. {
  382. using BASE = UniqHLLTraits_c;
  383. public:
  384. ~UniqHLL_c() { Reset(); }
  385. UniqHLL_c & operator = ( UniqHLL_c && tRhs );
  386. FORCE_INLINE void Add ( const ValueWithGroupCount_t & tValue );
  387. void Sort() {}
  388. int CountStart ( SphGroupKey_t & tOutGroup );
  389. int CountNext ( SphGroupKey_t & tOutGroup );
  390. void Compact ( VecTraits_T<SphGroupKey_t>& dRemoveGroups );
  391. void Reset();
  392. void CopyTo ( UniqHLL_c & tRhs ) const;
  393. Container_t & Get ( SphGroupKey_t tGroup );
  394. private:
  395. OpenHashTable_T<SphGroupKey_t, Container_t> m_hGroups;
  396. int64_t m_iHashIterator = 0;
  397. };
  398. void UniqHLL_c::Add ( const ValueWithGroupCount_t & tValue )
  399. {
  400. Container_t * pContainer = m_hGroups.Find ( tValue.m_tGroup );
  401. if ( pContainer )
  402. AddToContainer ( *pContainer, tValue.m_tValue );
  403. else
  404. {
  405. Container_t tContainer;
  406. tContainer.m_pArray = AllocateArray();
  407. tContainer.m_pArray->Add ( tValue.m_tValue );
  408. m_hGroups.Add ( tValue.m_tGroup, tContainer );
  409. }
  410. }
  411. class UniqHLLSingle_c : public UniqHLLTraits_c
  412. {
  413. using BASE = UniqHLLTraits_c;
  414. public:
  415. UniqHLLSingle_c() { m_tContainer.m_pArray = AllocateArray(); }
  416. ~UniqHLLSingle_c() { Reset(); }
  417. UniqHLLSingle_c & operator = ( UniqHLLSingle_c && tRhs );
  418. inline void Add ( const ValueWithGroupCount_t & tValue ) { AddToContainer ( m_tContainer, tValue.m_tValue ); }
  419. void Compact() {}
  420. int CountDistinct() { return m_tContainer.Estimate(); }
  421. void CopyTo ( UniqHLLSingle_c & tRhs ) const;
  422. void Reset();
  423. Container_t & Get ( SphGroupKey_t tGroup ) { return m_tContainer; }
  424. private:
  425. Container_t m_tContainer;
  426. };
  427. #endif // _distinct_