global_idf.cpp 8.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338
  1. //
  2. // Copyright (c) 2017-2026, Manticore Software LTD (https://manticoresearch.com)
  3. // Copyright (c) 2001-2016, Andrew Aksyonoff
  4. // Copyright (c) 2008-2016, Sphinx Technologies Inc
  5. // All rights reserved
  6. //
  7. // This program is free software; you can redistribute it and/or modify
  8. // it under the terms of the GNU General Public License. You should have
  9. // received a copy of the GPL license along with this program; if you
  10. // did not, you can find it at http://www.gnu.org/
  11. //
  12. #include "global_idf.h"
  13. #include "sphinxint.h"
  14. #include "fileutils.h"
  15. #include <sys/stat.h>
  16. #include <math.h>
// Word record of the global IDF file. Preread() reads the file payload
// straight into an array of these, so the in-memory layout has to match the
// file: packing is forced to 4 so that the 8-byte id and the remaining
// counter occupy exactly 12 bytes (checked by the size assert below).
#pragma pack(push, 4)
struct IDFWord_t
{
	uint64_t m_uWordID;	// key that GetDocs() searches for (sphFNV64 of the word)
	DWORD m_iDocs;		// value that GetDocs() returns for a matching id
};
#pragma pack(pop)
STATIC_SIZE_ASSERT ( IDFWord_t, 12 );
  25. static const int HASH_BITS = 16;
  26. using namespace sph;
  27. /// global IDF
  28. class CSphGlobalIDF final : public IDFer_c
  29. {
  30. protected:
  31. ~CSphGlobalIDF() final = default;
  32. public:
  33. bool TouchCheckModified ( const CSphString& sFilename );
  34. bool Preread ( const CSphString& sFilename, CSphString& sError );
  35. float GetIDF ( const CSphString& sWord, int64_t iDocsLocal, bool bPlainIDF ) const final;
  36. private:
  37. DWORD GetDocs ( const CSphString& sWord ) const noexcept;
  38. int64_t m_iTotalDocuments = 0;
  39. int64_t m_iTotalWords = 0;
  40. SphOffset_t m_uMTime = 0;
  41. CSphLargeBuffer<IDFWord_t> m_dWords;
  42. CSphLargeBuffer<int64_t> m_dHash;
  43. };
// ref-counted handle to a loaded IDF; this is what the cGlobalIDF registry stores per path
using CSphGlobalIDFRefPtr_c = CSphRefcountedPtr<CSphGlobalIDF>;
  45. // check if backend file was modified
  46. bool CSphGlobalIDF::TouchCheckModified ( const CSphString& sFilename )
  47. {
  48. // update m_uMTime, return true if modified
  49. struct_stat tStat = { 0 };
  50. if ( stat ( sFilename.cstr (), &tStat )<0 )
  51. tStat.st_mtime = 0;
  52. bool bModified = ( m_uMTime!=tStat.st_mtime );
  53. m_uMTime = tStat.st_mtime;
  54. return bModified;
  55. }
  56. bool CSphGlobalIDF::Preread ( const CSphString& sFilename, CSphString& sError )
  57. {
  58. TouchCheckModified ( sFilename );
  59. CSphAutofile tFile;
  60. if ( tFile.Open ( sFilename, SPH_O_READ, sError )<0 )
  61. return false;
  62. const SphOffset_t iSize = sphGetFileSize ( tFile.GetFD (), nullptr ) - sizeof ( SphOffset_t );
  63. sphReadThrottled ( tFile.GetFD (), &m_iTotalDocuments, sizeof ( SphOffset_t ));
  64. m_iTotalWords = iSize / sizeof ( IDFWord_t );
  65. // allocate words cache
  66. CSphString sWarning;
  67. if ( !m_dWords.Alloc ( m_iTotalWords, sError ))
  68. return false;
  69. // allocate lookup table if needed
  70. int iHashSize = ( int ) ( U64C( 1 ) << HASH_BITS );
  71. if ( m_iTotalWords>iHashSize * 8 )
  72. {
  73. if ( !m_dHash.Alloc ( iHashSize + 2, sError ))
  74. return false;
  75. }
  76. // read file into memory (may exceed 2GB)
  77. int64_t iRead = sphReadThrottled ( tFile.GetFD (), m_dWords.GetWritePtr (), iSize );
  78. if ( iRead!=iSize )
  79. return false;
  80. if ( sphInterrupted ())
  81. return false;
  82. if ( m_dHash.IsEmpty() )
  83. return true;
  84. // build lookup table
  85. int64_t* pHash = m_dHash.GetWritePtr ();
  86. uint64_t uFirst = m_dWords[0].m_uWordID;
  87. uint64_t uRange = m_dWords[m_iTotalWords - 1].m_uWordID - uFirst;
  88. DWORD iShift = 0;
  89. while ( uRange>=( U64C( 1 ) << HASH_BITS ))
  90. {
  91. ++iShift;
  92. uRange >>= 1;
  93. }
  94. pHash[0] = iShift;
  95. pHash[1] = 0;
  96. DWORD uLastHash = 0;
  97. for ( int64_t i = 1; i<m_iTotalWords; ++i )
  98. {
  99. // check for interrupt (throttled for speed)
  100. if (( i & 0xffff )==0 && sphInterrupted ())
  101. return false;
  102. auto uHash = ( DWORD ) (( m_dWords[i].m_uWordID - uFirst ) >> iShift );
  103. if ( uHash==uLastHash )
  104. continue;
  105. while ( uLastHash<uHash )
  106. pHash[++uLastHash + 1] = i;
  107. uLastHash = uHash;
  108. }
  109. pHash[++uLastHash + 1] = m_iTotalWords;
  110. return true;
  111. }
  112. DWORD CSphGlobalIDF::GetDocs ( const CSphString& sWord ) const noexcept
  113. {
  114. const char* s = sWord.cstr ();
  115. // replace = to MAGIC_WORD_HEAD_NONSTEMMED for exact terms
  116. char sBuf[3 * SPH_MAX_WORD_LEN + 4];
  117. if ( s && *s=='=' )
  118. {
  119. strncpy ( sBuf, s, sizeof ( sBuf ) - 1 );
  120. sBuf[0] = MAGIC_WORD_HEAD_NONSTEMMED;
  121. s = sBuf;
  122. }
  123. const uint64_t uWordID = sphFNV64 ( s );
  124. int64_t iStart = 0;
  125. int64_t iEnd = m_iTotalWords - 1;
  126. auto pWords = (const IDFWord_t*)m_dWords.GetReadPtr();
  127. if ( !m_dHash.IsEmpty () )
  128. {
  129. const auto uFirst = pWords[0].m_uWordID;
  130. const auto uHash = ( DWORD ) (( uWordID - uFirst ) >> m_dHash[0] );
  131. if ( uHash>( U64C( 1 ) << HASH_BITS ))
  132. return 0;
  133. iStart = m_dHash[uHash + 1];
  134. iEnd = m_dHash[uHash + 2] - 1;
  135. }
  136. const IDFWord_t* pWord = sphBinarySearch ( pWords + iStart, pWords + iEnd, bind ( &IDFWord_t::m_uWordID ), uWordID );
  137. return pWord ? pWord->m_iDocs : 0;
  138. }
  139. float CSphGlobalIDF::GetIDF ( const CSphString& sWord, int64_t iDocsLocal, bool bPlainIDF ) const
  140. {
  141. int64_t iDocs = Max ( iDocsLocal, ( int64_t ) GetDocs ( sWord ));
  142. int64_t iTotalClamped = Max ( m_iTotalDocuments, iDocs );
  143. if ( !iDocs )
  144. return 0.0f;
  145. if ( bPlainIDF )
  146. iTotalClamped += 1-iDocs;
  147. float fLogTotal = logf ( float ( 1 + iTotalClamped ));
  148. return logf ( float ( iTotalClamped ) / float ( iDocs )) / ( 2 * fLogTotal );
  149. }
  150. /// global idf definitions hash
  151. class cGlobalIDF
  152. {
  153. mutable RwLock_t m_tLock;
  154. SmallStringHash_T<CSphGlobalIDFRefPtr_c> m_hIDFs GUARDED_BY ( m_tLock );
  155. public:
  156. bool LoadGlobalIDF ( const CSphString& sPath, CSphString& sError );
  157. bool ReloadGlobalIDF ( const CSphString& sPath, CSphString& sError );
  158. CSphGlobalIDFRefPtr_c* GetIDF ( const CSphString& sPath );
  159. StrVec_t Collect() const;
  160. void DeleteMany ( const StrVec_t& dFiles );
  161. void Clear ();
  162. };
  163. cGlobalIDF& GetGlobalIDF()
  164. {
  165. static cGlobalIDF tIDF;
  166. return tIDF;
  167. }
  168. static CSphGlobalIDFRefPtr_c DoPrereadIDF ( const CSphString& sPath, CSphString& sError )
  169. {
  170. CSphGlobalIDFRefPtr_c pNewIDF { new CSphGlobalIDF };
  171. if ( !pNewIDF->Preread ( sPath, sError ))
  172. pNewIDF = nullptr;
  173. return pNewIDF;
  174. }
  175. bool cGlobalIDF::LoadGlobalIDF ( const CSphString& sPath, CSphString& sError )
  176. {
  177. sphLogDebug ( "Loading global IDF (%s)", sPath.cstr ());
  178. auto pGlobalIDF = DoPrereadIDF ( sPath, sError );
  179. if ( !pGlobalIDF )
  180. return false;
  181. ScWL_t wLock ( m_tLock );
  182. m_hIDFs.Add ( std::move (pGlobalIDF), sPath );
  183. return true;
  184. }
  185. bool cGlobalIDF::ReloadGlobalIDF ( const CSphString& sPath, CSphString& sError )
  186. {
  187. sphLogDebug ( "Reloading global IDF (%s)", sPath.cstr ());
  188. auto pGlobalIDF = DoPrereadIDF ( sPath, sError );
  189. if ( !pGlobalIDF )
  190. return false;
  191. ScWL_t wLock ( m_tLock );
  192. auto* ppGlobalIDF = m_hIDFs ( sPath );
  193. if ( ppGlobalIDF )
  194. *ppGlobalIDF = std::exchange ( pGlobalIDF, nullptr );
  195. return true;
  196. }
  197. CSphGlobalIDFRefPtr_c* cGlobalIDF::GetIDF ( const CSphString& sPath )
  198. {
  199. ScRL_t RLock ( m_tLock );
  200. return m_hIDFs ( sPath );
  201. }
  202. StrVec_t cGlobalIDF::Collect() const
  203. {
  204. StrVec_t dCollection;
  205. ScRL_t rLock ( m_tLock );
  206. for ( auto& dIdf : m_hIDFs )
  207. dCollection.Add ( dIdf.first );
  208. return dCollection;
  209. }
  210. void cGlobalIDF::DeleteMany ( const StrVec_t& dFiles )
  211. {
  212. ScWL_t wLock ( m_tLock );
  213. for ( const auto& sKey : dFiles )
  214. {
  215. sphLogDebug ( "Unloading global IDF (%s)", sKey.cstr() );
  216. m_hIDFs.Delete ( sKey );
  217. }
  218. }
  219. void cGlobalIDF::Clear()
  220. {
  221. ScWL_t wLock ( m_tLock );
  222. m_hIDFs.Reset();
  223. }
  224. bool sph::PrereadGlobalIDF ( const CSphString& sPath, CSphString& sError )
  225. {
  226. auto& tGlobalIDF = GetGlobalIDF();
  227. auto* ppGlobalIDF = tGlobalIDF.GetIDF(sPath);
  228. if ( !ppGlobalIDF )
  229. return tGlobalIDF.LoadGlobalIDF ( sPath, sError );
  230. auto& pGlobalIDF = *ppGlobalIDF;
  231. if ( pGlobalIDF && pGlobalIDF->TouchCheckModified ( sPath ))
  232. return tGlobalIDF.ReloadGlobalIDF ( sPath, sError );
  233. return true;
  234. }
  235. static StrVec_t CollectUnlistedIn ( const StrVec_t& dFiles )
  236. {
  237. StrVec_t dAllIDFs = GetGlobalIDF().Collect();
  238. StrVec_t dUnlisted;
  239. for ( const auto& sIdf : dAllIDFs )
  240. if ( !dFiles.Contains ( sIdf ) )
  241. dUnlisted.Add ( sIdf );
  242. return dUnlisted;
  243. }
  244. static void DeleteUnlistedIn ( const StrVec_t& dFiles )
  245. {
  246. auto dUnlisted = CollectUnlistedIn ( dFiles );
  247. GetGlobalIDF().DeleteMany(dUnlisted);
  248. }
  249. void sph::UpdateGlobalIDFs ( const StrVec_t& dFiles )
  250. {
  251. // delete unlisted entries
  252. DeleteUnlistedIn ( dFiles );
  253. // load/rotate remaining entries
  254. CSphString sError;
  255. for ( const auto& sPath: dFiles )
  256. {
  257. if ( !PrereadGlobalIDF ( sPath, sError ))
  258. sphLogDebug ( "Could not load global IDF (%s): %s", sPath.cstr (), sError.cstr ());
  259. }
  260. }
  261. void sph::ShutdownGlobalIDFs ()
  262. {
  263. StrVec_t dAllIDFs = GetGlobalIDF().Collect();
  264. GetGlobalIDF().DeleteMany ( dAllIDFs );
  265. }
  266. IDFerRefPtr_c sph::GetIDFer ( const CSphString& IDFPath )
  267. {
  268. IDFerRefPtr_c pResult;
  269. auto* ppGlobalIDF = GetGlobalIDF().GetIDF ( IDFPath );
  270. if ( ppGlobalIDF )
  271. pResult = *ppGlobalIDF;
  272. return pResult;
  273. }