| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338 |
- //
- // Copyright (c) 2017-2026, Manticore Software LTD (https://manticoresearch.com)
- // Copyright (c) 2001-2016, Andrew Aksyonoff
- // Copyright (c) 2008-2016, Sphinx Technologies Inc
- // All rights reserved
- //
- // This program is free software; you can redistribute it and/or modify
- // it under the terms of the GNU General Public License. You should have
- // received a copy of the GPL license along with this program; if you
- // did not, you can find it at http://www.gnu.org/
- //
- #include "global_idf.h"
- #include "sphinxint.h"
- #include "fileutils.h"
- #include <sys/stat.h>
- #include <math.h>
- #pragma pack(push, 4)
- struct IDFWord_t
- {
- uint64_t m_uWordID;
- DWORD m_iDocs;
- };
- #pragma pack(pop)
- STATIC_SIZE_ASSERT ( IDFWord_t, 12 );
- static const int HASH_BITS = 16;
- using namespace sph;
- /// global IDF
- class CSphGlobalIDF final : public IDFer_c
- {
- protected:
- ~CSphGlobalIDF() final = default;
- public:
- bool TouchCheckModified ( const CSphString& sFilename );
- bool Preread ( const CSphString& sFilename, CSphString& sError );
- float GetIDF ( const CSphString& sWord, int64_t iDocsLocal, bool bPlainIDF ) const final;
- private:
- DWORD GetDocs ( const CSphString& sWord ) const noexcept;
- int64_t m_iTotalDocuments = 0;
- int64_t m_iTotalWords = 0;
- SphOffset_t m_uMTime = 0;
- CSphLargeBuffer<IDFWord_t> m_dWords;
- CSphLargeBuffer<int64_t> m_dHash;
- };
- using CSphGlobalIDFRefPtr_c = CSphRefcountedPtr<CSphGlobalIDF>;
- // check if backend file was modified
- bool CSphGlobalIDF::TouchCheckModified ( const CSphString& sFilename )
- {
- // update m_uMTime, return true if modified
- struct_stat tStat = { 0 };
- if ( stat ( sFilename.cstr (), &tStat )<0 )
- tStat.st_mtime = 0;
- bool bModified = ( m_uMTime!=tStat.st_mtime );
- m_uMTime = tStat.st_mtime;
- return bModified;
- }
- bool CSphGlobalIDF::Preread ( const CSphString& sFilename, CSphString& sError )
- {
- TouchCheckModified ( sFilename );
- CSphAutofile tFile;
- if ( tFile.Open ( sFilename, SPH_O_READ, sError )<0 )
- return false;
- const SphOffset_t iSize = sphGetFileSize ( tFile.GetFD (), nullptr ) - sizeof ( SphOffset_t );
- sphReadThrottled ( tFile.GetFD (), &m_iTotalDocuments, sizeof ( SphOffset_t ));
- m_iTotalWords = iSize / sizeof ( IDFWord_t );
- // allocate words cache
- CSphString sWarning;
- if ( !m_dWords.Alloc ( m_iTotalWords, sError ))
- return false;
- // allocate lookup table if needed
- int iHashSize = ( int ) ( U64C( 1 ) << HASH_BITS );
- if ( m_iTotalWords>iHashSize * 8 )
- {
- if ( !m_dHash.Alloc ( iHashSize + 2, sError ))
- return false;
- }
- // read file into memory (may exceed 2GB)
- int64_t iRead = sphReadThrottled ( tFile.GetFD (), m_dWords.GetWritePtr (), iSize );
- if ( iRead!=iSize )
- return false;
- if ( sphInterrupted ())
- return false;
- if ( m_dHash.IsEmpty() )
- return true;
- // build lookup table
- int64_t* pHash = m_dHash.GetWritePtr ();
- uint64_t uFirst = m_dWords[0].m_uWordID;
- uint64_t uRange = m_dWords[m_iTotalWords - 1].m_uWordID - uFirst;
- DWORD iShift = 0;
- while ( uRange>=( U64C( 1 ) << HASH_BITS ))
- {
- ++iShift;
- uRange >>= 1;
- }
- pHash[0] = iShift;
- pHash[1] = 0;
- DWORD uLastHash = 0;
- for ( int64_t i = 1; i<m_iTotalWords; ++i )
- {
- // check for interrupt (throttled for speed)
- if (( i & 0xffff )==0 && sphInterrupted ())
- return false;
- auto uHash = ( DWORD ) (( m_dWords[i].m_uWordID - uFirst ) >> iShift );
- if ( uHash==uLastHash )
- continue;
- while ( uLastHash<uHash )
- pHash[++uLastHash + 1] = i;
- uLastHash = uHash;
- }
- pHash[++uLastHash + 1] = m_iTotalWords;
- return true;
- }
- DWORD CSphGlobalIDF::GetDocs ( const CSphString& sWord ) const noexcept
- {
- const char* s = sWord.cstr ();
- // replace = to MAGIC_WORD_HEAD_NONSTEMMED for exact terms
- char sBuf[3 * SPH_MAX_WORD_LEN + 4];
- if ( s && *s=='=' )
- {
- strncpy ( sBuf, s, sizeof ( sBuf ) - 1 );
- sBuf[0] = MAGIC_WORD_HEAD_NONSTEMMED;
- s = sBuf;
- }
- const uint64_t uWordID = sphFNV64 ( s );
- int64_t iStart = 0;
- int64_t iEnd = m_iTotalWords - 1;
- auto pWords = (const IDFWord_t*)m_dWords.GetReadPtr();
- if ( !m_dHash.IsEmpty () )
- {
- const auto uFirst = pWords[0].m_uWordID;
- const auto uHash = ( DWORD ) (( uWordID - uFirst ) >> m_dHash[0] );
- if ( uHash>( U64C( 1 ) << HASH_BITS ))
- return 0;
- iStart = m_dHash[uHash + 1];
- iEnd = m_dHash[uHash + 2] - 1;
- }
- const IDFWord_t* pWord = sphBinarySearch ( pWords + iStart, pWords + iEnd, bind ( &IDFWord_t::m_uWordID ), uWordID );
- return pWord ? pWord->m_iDocs : 0;
- }
- float CSphGlobalIDF::GetIDF ( const CSphString& sWord, int64_t iDocsLocal, bool bPlainIDF ) const
- {
- int64_t iDocs = Max ( iDocsLocal, ( int64_t ) GetDocs ( sWord ));
- int64_t iTotalClamped = Max ( m_iTotalDocuments, iDocs );
- if ( !iDocs )
- return 0.0f;
- if ( bPlainIDF )
- iTotalClamped += 1-iDocs;
- float fLogTotal = logf ( float ( 1 + iTotalClamped ));
- return logf ( float ( iTotalClamped ) / float ( iDocs )) / ( 2 * fLogTotal );
- }
- /// global idf definitions hash
- class cGlobalIDF
- {
- mutable RwLock_t m_tLock;
- SmallStringHash_T<CSphGlobalIDFRefPtr_c> m_hIDFs GUARDED_BY ( m_tLock );
- public:
- bool LoadGlobalIDF ( const CSphString& sPath, CSphString& sError );
- bool ReloadGlobalIDF ( const CSphString& sPath, CSphString& sError );
- CSphGlobalIDFRefPtr_c* GetIDF ( const CSphString& sPath );
- StrVec_t Collect() const;
- void DeleteMany ( const StrVec_t& dFiles );
- void Clear ();
- };
- cGlobalIDF& GetGlobalIDF()
- {
- static cGlobalIDF tIDF;
- return tIDF;
- }
- static CSphGlobalIDFRefPtr_c DoPrereadIDF ( const CSphString& sPath, CSphString& sError )
- {
- CSphGlobalIDFRefPtr_c pNewIDF { new CSphGlobalIDF };
- if ( !pNewIDF->Preread ( sPath, sError ))
- pNewIDF = nullptr;
- return pNewIDF;
- }
- bool cGlobalIDF::LoadGlobalIDF ( const CSphString& sPath, CSphString& sError )
- {
- sphLogDebug ( "Loading global IDF (%s)", sPath.cstr ());
- auto pGlobalIDF = DoPrereadIDF ( sPath, sError );
- if ( !pGlobalIDF )
- return false;
- ScWL_t wLock ( m_tLock );
- m_hIDFs.Add ( std::move (pGlobalIDF), sPath );
- return true;
- }
- bool cGlobalIDF::ReloadGlobalIDF ( const CSphString& sPath, CSphString& sError )
- {
- sphLogDebug ( "Reloading global IDF (%s)", sPath.cstr ());
- auto pGlobalIDF = DoPrereadIDF ( sPath, sError );
- if ( !pGlobalIDF )
- return false;
- ScWL_t wLock ( m_tLock );
- auto* ppGlobalIDF = m_hIDFs ( sPath );
- if ( ppGlobalIDF )
- *ppGlobalIDF = std::exchange ( pGlobalIDF, nullptr );
- return true;
- }
- CSphGlobalIDFRefPtr_c* cGlobalIDF::GetIDF ( const CSphString& sPath )
- {
- ScRL_t RLock ( m_tLock );
- return m_hIDFs ( sPath );
- }
- StrVec_t cGlobalIDF::Collect() const
- {
- StrVec_t dCollection;
- ScRL_t rLock ( m_tLock );
- for ( auto& dIdf : m_hIDFs )
- dCollection.Add ( dIdf.first );
- return dCollection;
- }
- void cGlobalIDF::DeleteMany ( const StrVec_t& dFiles )
- {
- ScWL_t wLock ( m_tLock );
- for ( const auto& sKey : dFiles )
- {
- sphLogDebug ( "Unloading global IDF (%s)", sKey.cstr() );
- m_hIDFs.Delete ( sKey );
- }
- }
- void cGlobalIDF::Clear()
- {
- ScWL_t wLock ( m_tLock );
- m_hIDFs.Reset();
- }
- bool sph::PrereadGlobalIDF ( const CSphString& sPath, CSphString& sError )
- {
- auto& tGlobalIDF = GetGlobalIDF();
- auto* ppGlobalIDF = tGlobalIDF.GetIDF(sPath);
- if ( !ppGlobalIDF )
- return tGlobalIDF.LoadGlobalIDF ( sPath, sError );
- auto& pGlobalIDF = *ppGlobalIDF;
- if ( pGlobalIDF && pGlobalIDF->TouchCheckModified ( sPath ))
- return tGlobalIDF.ReloadGlobalIDF ( sPath, sError );
- return true;
- }
- static StrVec_t CollectUnlistedIn ( const StrVec_t& dFiles )
- {
- StrVec_t dAllIDFs = GetGlobalIDF().Collect();
- StrVec_t dUnlisted;
- for ( const auto& sIdf : dAllIDFs )
- if ( !dFiles.Contains ( sIdf ) )
- dUnlisted.Add ( sIdf );
- return dUnlisted;
- }
- static void DeleteUnlistedIn ( const StrVec_t& dFiles )
- {
- auto dUnlisted = CollectUnlistedIn ( dFiles );
- GetGlobalIDF().DeleteMany(dUnlisted);
- }
- void sph::UpdateGlobalIDFs ( const StrVec_t& dFiles )
- {
- // delete unlisted entries
- DeleteUnlistedIn ( dFiles );
- // load/rotate remaining entries
- CSphString sError;
- for ( const auto& sPath: dFiles )
- {
- if ( !PrereadGlobalIDF ( sPath, sError ))
- sphLogDebug ( "Could not load global IDF (%s): %s", sPath.cstr (), sError.cstr ());
- }
- }
- void sph::ShutdownGlobalIDFs ()
- {
- StrVec_t dAllIDFs = GetGlobalIDF().Collect();
- GetGlobalIDF().DeleteMany ( dAllIDFs );
- }
- IDFerRefPtr_c sph::GetIDFer ( const CSphString& IDFPath )
- {
- IDFerRefPtr_c pResult;
- auto* ppGlobalIDF = GetGlobalIDF().GetIDF ( IDFPath );
- if ( ppGlobalIDF )
- pResult = *ppGlobalIDF;
- return pResult;
- }
|