// // Copyright (c) 2017-2026, Manticore Software LTD (https://manticoresearch.com) // Copyright (c) 2001-2016, Andrew Aksyonoff // Copyright (c) 2008-2016, Sphinx Technologies Inc // All rights reserved // // This program is free software; you can redistribute it and/or modify // it under the terms of the GNU General Public License. You should have // received a copy of the GPL license along with this program; if you // did not, you can find it at http://www.gnu.org/ // #ifndef _distinct_ #define _distinct_ #include "grouper.h" #include "std/openhash.h" #include "hyperloglog.h" struct ValueWithCount_t { SphAttr_t m_tValue; int m_iCount; inline bool operator < ( const ValueWithCount_t & rhs ) const { if ( m_tValue != rhs.m_tValue ) return m_tValue < rhs.m_tValue; return m_iCount > rhs.m_iCount; } inline bool operator== ( const ValueWithCount_t & rhs ) const { return m_tValue == rhs.m_tValue; } }; struct ValueWithGroup_t { public: SphGroupKey_t m_tGroup; SphAttr_t m_tValue; inline bool operator < ( const ValueWithGroup_t & rhs ) const { if ( m_tGroup!=rhs.m_tGroup ) return m_tGrouprhs.m_iCount; } inline bool operator == ( const ValueWithGroupCount_t & rhs ) const { return m_tGroup==rhs.m_tGroup && m_tValue==rhs.m_tValue; } }; template class UniqGrouped_T : public CSphVector { using BASE = CSphVector; public: UniqGrouped_T () { BASE::Reserve ( 16384 ); } void Add ( const ValueWithGroupCount_t & tValue ); void Sort(); int CountStart ( SphGroupKey_t & tOutGroup ); ///< starting counting distinct values, returns count and group key (0 if empty) int CountNext ( SphGroupKey_t & tOutGroup ); ///< continues counting distinct values, returns count and group key (0 if done) void Compact ( VecTraits_T & dRemoveGroups ); void CopyTo (UniqGrouped_T & dRhs ) const; void Reset() { BASE::Resize(0); } void SetAccuracy ( int iAccuracy ) {} protected: int m_iCountPos = 0; bool m_bSorted = true; }; template inline void UniqGrouped_T::Add ( const ValueWithGroupCount_t & tValue ) { BASE::Add ( { tValue.m_tGroup, tValue.m_tValue } ); m_bSorted = false; } template <> inline void UniqGrouped_T::Add ( const ValueWithGroupCount_t & tValue ) { BASE::Add ( tValue ); m_bSorted = false; } template void UniqGrouped_T::Sort() { if ( m_bSorted ) return; BASE::Sort(); m_bSorted = true; } template int UniqGrouped_T::CountStart ( SphGroupKey_t & tOutGroup ) { m_iCountPos = 0; return CountNext ( tOutGroup ); } template int UniqGrouped_T::CountNext ( SphGroupKey_t & tOutGroup ) { assert ( m_bSorted ); if ( m_iCountPos>=BASE::GetLength() ) return 0; constexpr bool bNeedCount = std::is_same::value; auto tGroup = BASE::m_pData[m_iCountPos].m_tGroup; auto tValue = BASE::m_pData[m_iCountPos].m_tValue; int iCount = 1; if constexpr ( bNeedCount ) iCount = BASE::m_pData[m_iCountPos].m_iCount; tOutGroup = tGroup; m_iCountPos++; while ( m_iCountPos0 ); if ( BASE::m_pData[m_iCountPos].m_tValue==tValue ) iAdd--; iCount += iAdd; } else { if ( BASE::m_pData[m_iCountPos].m_tValue!=tValue ) iCount++; } tValue = BASE::m_pData[m_iCountPos].m_tValue; ++m_iCountPos; } return iCount; } // works like 'uniq', and also throw away provided values to remove. template void UniqGrouped_T::Compact ( VecTraits_T& dRemoveGroups ) { assert ( m_bSorted ); if ( !BASE::GetLength() ) return; dRemoveGroups.Sort(); auto * pRemoveGroups = dRemoveGroups.begin (); auto iRemoveGroups = dRemoveGroups.GetLength (); auto pSrc = BASE::Begin(); auto pDst = BASE::Begin(); auto pEnd = BASE::End(); // skip remove-groups which are not in my list while ( iRemoveGroups && (*pRemoveGroups)m_tGroup ) { ++pRemoveGroups; --iRemoveGroups; } for ( ; pSrcm_tGroup ) { ++pRemoveGroups; --iRemoveGroups; } if ( iRemoveGroups && pSrc->m_tGroup==*pRemoveGroups ) continue; // check if it's a dupe if ( pDst>BASE::Begin() && pDst[-1]==pSrc[0] ) continue; *pDst++ = *pSrc; } assert ( pDst-BASE::Begin()<=BASE::GetLength() ); BASE::m_iCount = pDst-BASE::Begin(); } template void UniqGrouped_T::CopyTo ( UniqGrouped_T & dRhs ) const { if ( m_bSorted && dRhs.m_bSorted ) { UniqGrouped_T dRes; dRes.MergeSorted ( dRhs, *this ); dRes.m_bSorted = true; dRhs = std::move(dRes); } else { auto * pNewValues = dRhs.AddN ( BASE::GetLength() ); memcpy ( pNewValues, BASE::Begin(), BASE::GetLengthBytes64() ); dRhs.m_bSorted = false; } } ///////////////////////////////////////////////////////////////////// template class UniqSingle_T : public CSphVector { using BASE = CSphVector; public: UniqSingle_T() { BASE::Reserve ( 16384 ); } void Add ( const ValueWithGroupCount_t & tValue ); void Sort(); void Compact(); void CopyTo ( UniqSingle_T & dRhs ) const; void Reset() { BASE::Resize(0); } int CountDistinct(); protected: bool m_bSorted = true; }; template inline void UniqSingle_T::Add ( const ValueWithGroupCount_t & tValue ) { BASE::Add ( tValue.m_tValue ); m_bSorted = false; } template <> inline void UniqSingle_T::Add ( const ValueWithGroupCount_t & tValue ) { BASE::Add ( { tValue.m_tValue, tValue.m_iCount } ); m_bSorted = false; } template void UniqSingle_T::Sort() { if ( m_bSorted ) return; BASE::Sort(); m_bSorted = true; } template void UniqSingle_T::Compact() { BASE::Uniq ( !m_bSorted ); m_bSorted = true; } template void UniqSingle_T::CopyTo ( UniqSingle_T & dRhs ) const { if ( m_bSorted && dRhs.m_bSorted ) { UniqSingle_T dRes; dRes.MergeSorted ( dRhs, *this ); dRes.m_bSorted = true; dRhs = std::move(dRes); } else { auto * pNewValues = dRhs.AddN ( BASE::GetLength() ); memcpy ( pNewValues, BASE::Begin(), BASE::GetLengthBytes64() ); dRhs.m_bSorted = false; } } template int UniqSingle_T::CountDistinct() { constexpr bool bNeedCount = std::is_same::value; Sort(); auto & dArray = *this; int iCount = 1; if constexpr ( bNeedCount ) iCount = dArray[0].m_iCount; for ( int i = 1, iLen = BASE::GetLength (); i0 ); if ( dArray[i-1]==dArray[i] ) iAdd--; iCount += iAdd; } else { if ( dArray[i-1]!=dArray[i] ) iCount++; } } return iCount; } class UniqHLLTraits_c { public: UniqHLLTraits_c(); ~UniqHLLTraits_c() { Reset(); } UniqHLLTraits_c & operator = ( UniqHLLTraits_c && tRhs ); void Reset(); void SetAccuracy ( int iAccuracy ); protected: static const int SMALL_ARRAY_SIZE = 16; static const int MIN_HASH_SIZE = 64; static const int NON_PACKED_HLL_THRESH = 16; using HLLDensePacked_c = HyperLogLogDense_T<>; using HLLDenseNonPacked_c = HyperLogLogDense_T, RegistersNonPacked_c>; using SmallArray_c = LazyVector_T; using Hash_c = OpenHashSetFastClear_T; enum class ContainterType_e { ARRAY, HASH, HLL_DENSE_PACKED, HLL_DENSE_NONPACKED, }; struct Container_t { union { SmallArray_c * m_pArray = nullptr; Hash_c * m_pHash; HLLDensePacked_c * m_pHLLDensePacked; HLLDenseNonPacked_c * m_pHLLDenseNonPacked; }; ContainterType_e m_eType = ContainterType_e::ARRAY; int m_iHashIdx = 0; Container_t() = default; Container_t ( const Container_t & tRhs ); Container_t & operator = ( Container_t && tRhs ); Container_t & operator = ( const Container_t & tRhs ); int Estimate() const; void Reset(); bool IsEmpty() const; FORCE_INLINE bool FindInArray ( SphAttr_t tValue ) { assert ( m_eType==ContainterType_e::ARRAY ); for ( auto i : *m_pArray ) if ( i==tValue ) return true; return false; } }; int m_iAccuracy = 16; void AddToContainer ( Container_t & tContainer, SphAttr_t tValue ); FORCE_INLINE SmallArray_c * AllocateArray() { if ( m_dUnusedArray.GetLength() ) { SmallArray_c * pNew = m_dUnusedArray.Pop(); pNew->Resize(0); return pNew; } return new SmallArray_c; } FORCE_INLINE void FreeContainer ( Container_t & tContainer ) { switch ( tContainer.m_eType ) { case ContainterType_e::ARRAY: m_dUnusedArray.Add ( tContainer.m_pArray ); break; case ContainterType_e::HASH: m_dUnusedHashes[tContainer.m_iHashIdx].Add ( tContainer.m_pHash ); break; case ContainterType_e::HLL_DENSE_PACKED: m_dUnusedHLLDensePacked.Add ( tContainer.m_pHLLDensePacked ); break; case ContainterType_e::HLL_DENSE_NONPACKED: m_dUnusedHLLDenseNonPacked.Add ( tContainer.m_pHLLDenseNonPacked ); break; default: assert ( 0 && "Unknown container type" ); break; } tContainer.m_pArray = nullptr; } void ConvertToHash ( Container_t & tContainer ); void ConvertToHLLDensePacked ( Container_t & tContainer ); void ConvertToHLLDenseNonPacked ( Container_t & tContainer ); private: CSphVector m_dUnusedArray; CSphVector> m_dUnusedHashes; CSphVector m_dUnusedHLLDensePacked; CSphVector m_dUnusedHLLDenseNonPacked; int m_iMaxHashSize = 0; HLLDensePacked_c * AllocateHLLDensePacked(); HLLDenseNonPacked_c * AllocateHLLDenseNonPacked(); Hash_c * AllocateHash ( int iIdx ); void MoveToLargerHash ( Container_t & tContainer ); template friend void CopyContainerTo ( SphGroupKey_t tGroup, const UniqHLLTraits_c::Container_t & tFrom, T & tRhs ); }; class UniqHLL_c : public UniqHLLTraits_c { using BASE = UniqHLLTraits_c; public: ~UniqHLL_c() { Reset(); } UniqHLL_c & operator = ( UniqHLL_c && tRhs ); FORCE_INLINE void Add ( const ValueWithGroupCount_t & tValue ); void Sort() {} int CountStart ( SphGroupKey_t & tOutGroup ); int CountNext ( SphGroupKey_t & tOutGroup ); void Compact ( VecTraits_T& dRemoveGroups ); void Reset(); void CopyTo ( UniqHLL_c & tRhs ) const; Container_t & Get ( SphGroupKey_t tGroup ); private: OpenHashTable_T m_hGroups; int64_t m_iHashIterator = 0; }; void UniqHLL_c::Add ( const ValueWithGroupCount_t & tValue ) { Container_t * pContainer = m_hGroups.Find ( tValue.m_tGroup ); if ( pContainer ) AddToContainer ( *pContainer, tValue.m_tValue ); else { Container_t tContainer; tContainer.m_pArray = AllocateArray(); tContainer.m_pArray->Add ( tValue.m_tValue ); m_hGroups.Add ( tValue.m_tGroup, tContainer ); } } class UniqHLLSingle_c : public UniqHLLTraits_c { using BASE = UniqHLLTraits_c; public: UniqHLLSingle_c() { m_tContainer.m_pArray = AllocateArray(); } ~UniqHLLSingle_c() { Reset(); } UniqHLLSingle_c & operator = ( UniqHLLSingle_c && tRhs ); inline void Add ( const ValueWithGroupCount_t & tValue ) { AddToContainer ( m_tContainer, tValue.m_tValue ); } void Compact() {} int CountDistinct() { return m_tContainer.Estimate(); } void CopyTo ( UniqHLLSingle_c & tRhs ) const; void Reset(); Container_t & Get ( SphGroupKey_t tGroup ) { return m_tContainer; } private: Container_t m_tContainer; }; #endif // _distinct_