| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522 |
- //
- // Copyright (c) 2017-2026, Manticore Software LTD (https://manticoresearch.com)
- // Copyright (c) 2001-2016, Andrew Aksyonoff
- // Copyright (c) 2008-2016, Sphinx Technologies Inc
- // All rights reserved
- //
- // This program is free software; you can redistribute it and/or modify
- // it under the terms of the GNU General Public License. You should have
- // received a copy of the GPL license along with this program; if you
- // did not, you can find it at http://www.gnu.org/
- //
- #ifndef _distinct_
- #define _distinct_
- #include "grouper.h"
- #include "std/openhash.h"
- #include "hyperloglog.h"
- struct ValueWithCount_t
- {
- SphAttr_t m_tValue;
- int m_iCount;
- inline bool operator < ( const ValueWithCount_t & rhs ) const
- {
- if ( m_tValue != rhs.m_tValue )
- return m_tValue < rhs.m_tValue;
- return m_iCount > rhs.m_iCount;
- }
- inline bool operator== ( const ValueWithCount_t & rhs ) const
- {
- return m_tValue == rhs.m_tValue;
- }
- };
- struct ValueWithGroup_t
- {
- public:
- SphGroupKey_t m_tGroup;
- SphAttr_t m_tValue;
- inline bool operator < ( const ValueWithGroup_t & rhs ) const
- {
- if ( m_tGroup!=rhs.m_tGroup ) return m_tGroup<rhs.m_tGroup;
- return m_tValue<rhs.m_tValue;
- }
- inline bool operator == ( const ValueWithGroup_t & rhs ) const
- {
- return m_tGroup==rhs.m_tGroup && m_tValue==rhs.m_tValue;
- }
- };
- struct ValueWithGroupCount_t
- {
- public:
- SphGroupKey_t m_tGroup;
- SphAttr_t m_tValue;
- int m_iCount;
- ValueWithGroupCount_t () = default;
- ValueWithGroupCount_t ( SphGroupKey_t uGroup, SphAttr_t uValue, int iCount )
- : m_tGroup ( uGroup )
- , m_tValue ( uValue )
- , m_iCount ( iCount )
- {}
- inline bool operator < ( const ValueWithGroupCount_t & rhs ) const
- {
- if ( m_tGroup!=rhs.m_tGroup ) return m_tGroup<rhs.m_tGroup;
- if ( m_tValue!=rhs.m_tValue ) return m_tValue<rhs.m_tValue;
- return m_iCount>rhs.m_iCount;
- }
- inline bool operator == ( const ValueWithGroupCount_t & rhs ) const
- {
- return m_tGroup==rhs.m_tGroup && m_tValue==rhs.m_tValue;
- }
- };
- template <typename T>
- class UniqGrouped_T : public CSphVector<T>
- {
- using BASE = CSphVector<T>;
- public:
- UniqGrouped_T () { BASE::Reserve ( 16384 ); }
- void Add ( const ValueWithGroupCount_t & tValue );
- void Sort();
- int CountStart ( SphGroupKey_t & tOutGroup ); ///< starting counting distinct values, returns count and group key (0 if empty)
- int CountNext ( SphGroupKey_t & tOutGroup ); ///< continues counting distinct values, returns count and group key (0 if done)
- void Compact ( VecTraits_T<SphGroupKey_t> & dRemoveGroups );
- void CopyTo (UniqGrouped_T & dRhs ) const;
- void Reset() { BASE::Resize(0); }
- void SetAccuracy ( int iAccuracy ) {}
- protected:
- int m_iCountPos = 0;
- bool m_bSorted = true;
- };
- template <typename T>
- inline void UniqGrouped_T<T>::Add ( const ValueWithGroupCount_t & tValue )
- {
- BASE::Add ( { tValue.m_tGroup, tValue.m_tValue } );
- m_bSorted = false;
- }
- template <>
- inline void UniqGrouped_T<ValueWithGroupCount_t>::Add ( const ValueWithGroupCount_t & tValue )
- {
- BASE::Add ( tValue );
- m_bSorted = false;
- }
- template <typename T>
- void UniqGrouped_T<T>::Sort()
- {
- if ( m_bSorted )
- return;
- BASE::Sort();
- m_bSorted = true;
- }
- template <typename T>
- int UniqGrouped_T<T>::CountStart ( SphGroupKey_t & tOutGroup )
- {
- m_iCountPos = 0;
- return CountNext ( tOutGroup );
- }
- template <typename T>
- int UniqGrouped_T<T>::CountNext ( SphGroupKey_t & tOutGroup )
- {
- assert ( m_bSorted );
- if ( m_iCountPos>=BASE::GetLength() )
- return 0;
- constexpr bool bNeedCount = std::is_same<T, ValueWithGroupCount_t>::value;
- auto tGroup = BASE::m_pData[m_iCountPos].m_tGroup;
- auto tValue = BASE::m_pData[m_iCountPos].m_tValue;
- int iCount = 1;
- if constexpr ( bNeedCount )
- iCount = BASE::m_pData[m_iCountPos].m_iCount;
- tOutGroup = tGroup;
- m_iCountPos++;
- while ( m_iCountPos<BASE::GetLength() && BASE::m_pData[m_iCountPos].m_tGroup==tGroup )
- {
- if constexpr ( bNeedCount )
- {
- // if we have similar values in both groups, we can safely assume that we have at least one duplicate
- int iAdd = BASE::m_pData[m_iCountPos].m_iCount;
- assert ( iAdd>0 );
- if ( BASE::m_pData[m_iCountPos].m_tValue==tValue )
- iAdd--;
- iCount += iAdd;
- }
- else
- {
- if ( BASE::m_pData[m_iCountPos].m_tValue!=tValue )
- iCount++;
- }
- tValue = BASE::m_pData[m_iCountPos].m_tValue;
- ++m_iCountPos;
- }
- return iCount;
- }
- // works like 'uniq', and also throw away provided values to remove.
- template <typename T>
- void UniqGrouped_T<T>::Compact ( VecTraits_T<SphGroupKey_t>& dRemoveGroups )
- {
- assert ( m_bSorted );
- if ( !BASE::GetLength() )
- return;
- dRemoveGroups.Sort();
- auto * pRemoveGroups = dRemoveGroups.begin ();
- auto iRemoveGroups = dRemoveGroups.GetLength ();
- auto pSrc = BASE::Begin();
- auto pDst = BASE::Begin();
- auto pEnd = BASE::End();
- // skip remove-groups which are not in my list
- while ( iRemoveGroups && (*pRemoveGroups)<pSrc->m_tGroup )
- {
- ++pRemoveGroups;
- --iRemoveGroups;
- }
- for ( ; pSrc<pEnd; ++pSrc )
- {
- // check if this entry needs to be removed
- while ( iRemoveGroups && (*pRemoveGroups)<pSrc->m_tGroup )
- {
- ++pRemoveGroups;
- --iRemoveGroups;
- }
- if ( iRemoveGroups && pSrc->m_tGroup==*pRemoveGroups )
- continue;
- // check if it's a dupe
- if ( pDst>BASE::Begin() && pDst[-1]==pSrc[0] )
- continue;
- *pDst++ = *pSrc;
- }
- assert ( pDst-BASE::Begin()<=BASE::GetLength() );
- BASE::m_iCount = pDst-BASE::Begin();
- }
- template <typename T>
- void UniqGrouped_T<T>::CopyTo ( UniqGrouped_T & dRhs ) const
- {
- if ( m_bSorted && dRhs.m_bSorted )
- {
- UniqGrouped_T dRes;
- dRes.MergeSorted ( dRhs, *this );
- dRes.m_bSorted = true;
- dRhs = std::move(dRes);
- }
- else
- {
- auto * pNewValues = dRhs.AddN ( BASE::GetLength() );
- memcpy ( pNewValues, BASE::Begin(), BASE::GetLengthBytes64() );
- dRhs.m_bSorted = false;
- }
- }
- /////////////////////////////////////////////////////////////////////
- template <typename T>
- class UniqSingle_T : public CSphVector<T>
- {
- using BASE = CSphVector<T>;
- public:
- UniqSingle_T() { BASE::Reserve ( 16384 ); }
- void Add ( const ValueWithGroupCount_t & tValue );
- void Sort();
- void Compact();
- void CopyTo ( UniqSingle_T & dRhs ) const;
- void Reset() { BASE::Resize(0); }
- int CountDistinct();
- protected:
- bool m_bSorted = true;
- };
- template <typename T>
- inline void UniqSingle_T<T>::Add ( const ValueWithGroupCount_t & tValue )
- {
- BASE::Add ( tValue.m_tValue );
- m_bSorted = false;
- }
- template <>
- inline void UniqSingle_T<ValueWithCount_t>::Add ( const ValueWithGroupCount_t & tValue )
- {
- BASE::Add ( { tValue.m_tValue, tValue.m_iCount } );
- m_bSorted = false;
- }
- template <typename T>
- void UniqSingle_T<T>::Sort()
- {
- if ( m_bSorted )
- return;
- BASE::Sort();
- m_bSorted = true;
- }
- template <typename T>
- void UniqSingle_T<T>::Compact()
- {
- BASE::Uniq ( !m_bSorted );
- m_bSorted = true;
- }
- template <typename T>
- void UniqSingle_T<T>::CopyTo ( UniqSingle_T & dRhs ) const
- {
- if ( m_bSorted && dRhs.m_bSorted )
- {
- UniqSingle_T dRes;
- dRes.MergeSorted ( dRhs, *this );
- dRes.m_bSorted = true;
- dRhs = std::move(dRes);
- }
- else
- {
- auto * pNewValues = dRhs.AddN ( BASE::GetLength() );
- memcpy ( pNewValues, BASE::Begin(), BASE::GetLengthBytes64() );
- dRhs.m_bSorted = false;
- }
- }
- template <typename T>
- int UniqSingle_T<T>::CountDistinct()
- {
- constexpr bool bNeedCount = std::is_same<T, ValueWithCount_t>::value;
- Sort();
- auto & dArray = *this;
- int iCount = 1;
- if constexpr ( bNeedCount )
- iCount = dArray[0].m_iCount;
- for ( int i = 1, iLen = BASE::GetLength (); i<iLen; ++i )
- {
- if constexpr ( bNeedCount )
- {
- // if we have similar values in both groups, we can safely assume that we have at least one duplicate
- int iAdd = dArray[i].m_iCount;
- assert ( iAdd>0 );
- if ( dArray[i-1]==dArray[i] )
- iAdd--;
- iCount += iAdd;
- }
- else
- {
- if ( dArray[i-1]!=dArray[i] )
- iCount++;
- }
- }
- return iCount;
- }
- class UniqHLLTraits_c
- {
- public:
- UniqHLLTraits_c();
- ~UniqHLLTraits_c() { Reset(); }
- UniqHLLTraits_c & operator = ( UniqHLLTraits_c && tRhs );
- void Reset();
- void SetAccuracy ( int iAccuracy );
- protected:
- static const int SMALL_ARRAY_SIZE = 16;
- static const int MIN_HASH_SIZE = 64;
- static const int NON_PACKED_HLL_THRESH = 16;
- using HLLDensePacked_c = HyperLogLogDense_T<>;
- using HLLDenseNonPacked_c = HyperLogLogDense_T<uint64_t, FarmHash_T<uint64_t>, RegistersNonPacked_c>;
- using SmallArray_c = LazyVector_T<SphAttr_t, sph::DefaultRelimit, SMALL_ARRAY_SIZE>;
- using Hash_c = OpenHashSetFastClear_T<SphAttr_t>;
- enum class ContainterType_e
- {
- ARRAY,
- HASH,
- HLL_DENSE_PACKED,
- HLL_DENSE_NONPACKED,
- };
- struct Container_t
- {
- union
- {
- SmallArray_c * m_pArray = nullptr;
- Hash_c * m_pHash;
- HLLDensePacked_c * m_pHLLDensePacked;
- HLLDenseNonPacked_c * m_pHLLDenseNonPacked;
- };
- ContainterType_e m_eType = ContainterType_e::ARRAY;
- int m_iHashIdx = 0;
- Container_t() = default;
- Container_t ( const Container_t & tRhs );
- Container_t & operator = ( Container_t && tRhs );
- Container_t & operator = ( const Container_t & tRhs );
- int Estimate() const;
- void Reset();
- bool IsEmpty() const;
- FORCE_INLINE bool FindInArray ( SphAttr_t tValue )
- {
- assert ( m_eType==ContainterType_e::ARRAY );
- for ( auto i : *m_pArray )
- if ( i==tValue )
- return true;
- return false;
- }
- };
- int m_iAccuracy = 16;
- void AddToContainer ( Container_t & tContainer, SphAttr_t tValue );
- FORCE_INLINE SmallArray_c * AllocateArray()
- {
- if ( m_dUnusedArray.GetLength() )
- {
- SmallArray_c * pNew = m_dUnusedArray.Pop();
- pNew->Resize(0);
- return pNew;
- }
- return new SmallArray_c;
- }
- FORCE_INLINE void FreeContainer ( Container_t & tContainer )
- {
- switch ( tContainer.m_eType )
- {
- case ContainterType_e::ARRAY: m_dUnusedArray.Add ( tContainer.m_pArray ); break;
- case ContainterType_e::HASH: m_dUnusedHashes[tContainer.m_iHashIdx].Add ( tContainer.m_pHash ); break;
- case ContainterType_e::HLL_DENSE_PACKED: m_dUnusedHLLDensePacked.Add ( tContainer.m_pHLLDensePacked ); break;
- case ContainterType_e::HLL_DENSE_NONPACKED: m_dUnusedHLLDenseNonPacked.Add ( tContainer.m_pHLLDenseNonPacked ); break;
- default: assert ( 0 && "Unknown container type" ); break;
- }
- tContainer.m_pArray = nullptr;
- }
- void ConvertToHash ( Container_t & tContainer );
- void ConvertToHLLDensePacked ( Container_t & tContainer );
- void ConvertToHLLDenseNonPacked ( Container_t & tContainer );
- private:
- CSphVector<SmallArray_c *> m_dUnusedArray;
- CSphVector<CSphVector<Hash_c *>> m_dUnusedHashes;
- CSphVector<HLLDensePacked_c *> m_dUnusedHLLDensePacked;
- CSphVector<HLLDenseNonPacked_c *> m_dUnusedHLLDenseNonPacked;
- int m_iMaxHashSize = 0;
- HLLDensePacked_c * AllocateHLLDensePacked();
- HLLDenseNonPacked_c * AllocateHLLDenseNonPacked();
- Hash_c * AllocateHash ( int iIdx );
- void MoveToLargerHash ( Container_t & tContainer );
- template <typename T>
- friend void CopyContainerTo ( SphGroupKey_t tGroup, const UniqHLLTraits_c::Container_t & tFrom, T & tRhs );
- };
- class UniqHLL_c : public UniqHLLTraits_c
- {
- using BASE = UniqHLLTraits_c;
- public:
- ~UniqHLL_c() { Reset(); }
- UniqHLL_c & operator = ( UniqHLL_c && tRhs );
- FORCE_INLINE void Add ( const ValueWithGroupCount_t & tValue );
- void Sort() {}
- int CountStart ( SphGroupKey_t & tOutGroup );
- int CountNext ( SphGroupKey_t & tOutGroup );
- void Compact ( VecTraits_T<SphGroupKey_t>& dRemoveGroups );
- void Reset();
- void CopyTo ( UniqHLL_c & tRhs ) const;
- Container_t & Get ( SphGroupKey_t tGroup );
- private:
- OpenHashTable_T<SphGroupKey_t, Container_t> m_hGroups;
- int64_t m_iHashIterator = 0;
- };
- void UniqHLL_c::Add ( const ValueWithGroupCount_t & tValue )
- {
- Container_t * pContainer = m_hGroups.Find ( tValue.m_tGroup );
- if ( pContainer )
- AddToContainer ( *pContainer, tValue.m_tValue );
- else
- {
- Container_t tContainer;
- tContainer.m_pArray = AllocateArray();
- tContainer.m_pArray->Add ( tValue.m_tValue );
- m_hGroups.Add ( tValue.m_tGroup, tContainer );
- }
- }
- class UniqHLLSingle_c : public UniqHLLTraits_c
- {
- using BASE = UniqHLLTraits_c;
- public:
- UniqHLLSingle_c() { m_tContainer.m_pArray = AllocateArray(); }
- ~UniqHLLSingle_c() { Reset(); }
- UniqHLLSingle_c & operator = ( UniqHLLSingle_c && tRhs );
- inline void Add ( const ValueWithGroupCount_t & tValue ) { AddToContainer ( m_tContainer, tValue.m_tValue ); }
- void Compact() {}
- int CountDistinct() { return m_tContainer.Estimate(); }
- void CopyTo ( UniqHLLSingle_c & tRhs ) const;
- void Reset();
- Container_t & Get ( SphGroupKey_t tGroup ) { return m_tContainer; }
- private:
- Container_t m_tContainer;
- };
- #endif // _distinct_
|