indexsettings.h 17 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501
  1. //
  2. // Copyright (c) 2017-2026, Manticore Software LTD (https://manticoresearch.com)
  3. // Copyright (c) 2001-2016, Andrew Aksyonoff
  4. // Copyright (c) 2008-2016, Sphinx Technologies Inc
  5. // All rights reserved
  6. //
  7. // This program is free software; you can redistribute it and/or modify
  8. // it under the terms of the GNU General Public License. You should have
  9. // received a copy of the GPL license along with this program; if you
  10. // did not, you can find it at http://www.gnu.org/
  11. //
  12. #ifndef _indexsettings_
  13. #define _indexsettings_
  14. #include "sphinxstd.h"
  15. #include "sphinxutils.h"
  16. #include "fileutils.h"
  17. #include "sphinxexpr.h"
  18. #include "columnarlib.h"
  19. #include "sphinxdefs.h"
  20. #include "schema/columninfo.h"
  21. inline int64_t cast2signed ( SphWordID_t tVal )
  22. {
  23. return *(int64_t*)&tVal;
  24. }
  25. class CSphWriter;
  26. class CSphReader;
  27. class FilenameBuilder_i;
  /// tokenizer type ids; numbering starts at 2
  enum
  {
      // value 1 was once taken by TOKENIZER_SBCS
      TOKENIZER_UTF8 = 2,
      TOKENIZER_NGRAM = 3
  };
  34. struct CSphEmbeddedFiles
  35. {
  36. bool m_bEmbeddedSynonyms = false;
  37. bool m_bEmbeddedStopwords = false;
  38. bool m_bEmbeddedWordforms = false;
  39. CSphSavedFile m_tSynonymFile;
  40. StrVec_t m_dSynonyms;
  41. CSphVector<CSphSavedFile> m_dStopwordFiles;
  42. CSphVector<SphWordID_t> m_dStopwords;
  43. StrVec_t m_dWordforms;
  44. CSphVector<CSphSavedFile> m_dWordformFiles;
  45. void Reset();
  46. };
  47. class SettingsFormatter_c;
  48. struct SettingsFormatterState_t;
  49. namespace bson { class Bson_c; }
  50. class SettingsWriter_c
  51. {
  52. public:
  53. virtual ~SettingsWriter_c() = default;
  54. virtual void DumpReadable ( SettingsFormatterState_t & tState, const CSphEmbeddedFiles & tEmbeddedFiles, FilenameBuilder_i * pFilenameBuilder ) const;
  55. virtual void Format ( SettingsFormatter_c & tOut, FilenameBuilder_i * pFilenameBuilder ) const = 0;
  56. };
  57. class CSphTokenizerSettings : public SettingsWriter_c
  58. {
  59. public:
  60. int m_iType { TOKENIZER_UTF8 };
  61. CSphString m_sCaseFolding;
  62. int m_iMinWordLen = 1;
  63. CSphString m_sSynonymsFile;
  64. CSphString m_sBoundary;
  65. CSphString m_sIgnoreChars;
  66. int m_iNgramLen = 0;
  67. CSphString m_sNgramChars;
  68. CSphString m_sBlendChars;
  69. CSphString m_sBlendMode;
  70. void Setup ( const CSphConfigSection & hIndex, CSphString & sWarning );
  71. bool Load ( const FilenameBuilder_i * pFilenameBuilder, CSphReader & tReader, CSphEmbeddedFiles & tEmbeddedFiles, CSphString & sWarning );
  72. bool Load ( const FilenameBuilder_i* pFilenameBuilder, const bson::Bson_c& tNode, CSphEmbeddedFiles& tEmbeddedFiles, CSphString& sWarning );
  73. void DumpReadable ( SettingsFormatterState_t & tState, const CSphEmbeddedFiles & tEmbeddedFiles, FilenameBuilder_i * pFilenameBuilder ) const override;
  74. void Format ( SettingsFormatter_c & tOut, FilenameBuilder_i * pFilenameBuilder ) const override;
  75. };
  76. class CSphDictSettings : public SettingsWriter_c
  77. {
  78. public:
  79. CSphString m_sMorphology;
  80. CSphString m_sMorphFields;
  81. CSphString m_sStopwords;
  82. StrVec_t m_dWordforms;
  83. int m_iMinStemmingLen = 1;
  84. bool m_bWordDict = true;
  85. bool m_bStopwordsUnstemmed = false;
  86. CSphString m_sMorphFingerprint; ///< not used for creation; only for a check when loading
  87. void Setup ( const CSphConfigSection & hIndex, FilenameBuilder_i * pFilenameBuilder, CSphString & sWarning );
  88. void Load ( CSphReader & tReader, CSphEmbeddedFiles & tEmbeddedFiles, FilenameBuilder_i * pFilenameBuilder, CSphString & sWarning );
  89. void Load ( const bson::Bson_c & tNode, CSphEmbeddedFiles& tEmbeddedFiles, FilenameBuilder_i * pFilenameBuilder, CSphString & sWarning );
  90. void DumpReadable ( SettingsFormatterState_t & tState, const CSphEmbeddedFiles & tEmbeddedFiles, FilenameBuilder_i * pFilenameBuilder ) const override;
  91. void Format ( SettingsFormatter_c & tOut, FilenameBuilder_i * pFilenameBuilder ) const override;
  92. };
  93. class Writer_i;
  94. class CSphFieldFilterSettings : public SettingsWriter_c
  95. {
  96. public:
  97. StrVec_t m_dRegexps;
  98. bool Setup ( const CSphConfigSection & hIndex, CSphString & sWarning );
  99. void Load ( CSphReader & tReader );
  100. void Save ( Writer_i & tWriter ) const;
  101. void Format ( SettingsFormatter_c & tOut, FilenameBuilder_i * pFilenameBuilder ) const override;
  102. };
  103. struct KillListTarget_t
  104. {
  105. enum
  106. {
  107. USE_KLIST = 1 << 0,
  108. USE_DOCIDS = 1 << 1
  109. };
  110. CSphString m_sIndex;
  111. DWORD m_uFlags {USE_KLIST|USE_DOCIDS};
  112. CSphString Format() const;
  113. };
  114. class KillListTargets_c : public SettingsWriter_c
  115. {
  116. public:
  117. CSphVector<KillListTarget_t> m_dTargets;
  118. bool Parse ( const CSphString & sTargets, const char * szIndexName, CSphString & sError );
  119. void Format ( SettingsFormatter_c & tOut, FilenameBuilder_i * pFilenameBuilder ) const override;
  120. };
  121. /// indexing-related source settings
  122. /// NOTE, newly added fields should be synced with CSphSource::Setup()
  123. class CSphSourceSettings
  124. {
  125. public:
  126. int m_iMinInfixLen = 0; ///< min indexable infix length (0 means don't index infixes)
  127. int m_iMaxSubstringLen = 0; ///< max indexable infix and prefix (0 means don't limit infixes and prefixes)
  128. int m_iBoundaryStep = 0; ///< additional boundary word position increment
  129. bool m_bIndexExactWords = false; ///< exact (non-stemmed) word indexing flag
  130. int m_iOvershortStep = 1; ///< position step on overshort token (default is 1)
  131. int m_iStopwordStep = 1; ///< position step on stopword token (default is 1)
  132. bool m_bIndexSP = false; ///< whether to index sentence and paragraph delimiters
  133. bool m_bIndexFieldLens = false; ///< whether to index field lengths
  134. StrVec_t m_dPrefixFields; ///< list of prefix fields
  135. StrVec_t m_dInfixFields; ///< list of infix fields
  136. StrVec_t m_dStoredFields; ///< list of stored fields
  137. StrVec_t m_dStoredOnlyFields; ///< list of "fields" that are stored but not indexed
  138. AttrEngine_e m_eEngine = AttrEngine_e::DEFAULT; ///< attribute storage engine
  139. AttrEngine_e m_eDefaultEngine = AttrEngine_e::ROWWISE; ///< default storage engine set by daemon
  140. StrVec_t m_dColumnarAttrs; ///< list of attributes to be placed in columnar store
  141. StrVec_t m_dColumnarNonStoredAttrs; ///< list of columnar attributes that should be not added to document storage
  142. StrVec_t m_dRowwiseAttrs; ///< list of attributes to NOT be placed in columnar store
  143. StrVec_t m_dColumnarStringsNoHash; ///< list of columnar string attributes that don't need pregenerated hashes
  144. StrVec_t m_dJsonSIAttrs; ///< list of JSON attributes that need secondary indexes generated
  145. CSphVector<NamedKNNSettings_t> m_dKNN; ///< knn index settings
  146. ESphWordpart GetWordpart ( const char * sField, bool bWordDict );
  147. int GetMinPrefixLen ( bool bWordDict ) const;
  148. void SetMinPrefixLen ( int iMinPrefixLen );
  149. int RawMinPrefixLen () const;
  150. private:
  151. int m_iMinPrefixLen = 0; ///< min indexable prefix (0 means don't index prefixes)
  152. };
  /// text preprocessor selection
  enum class Preprocessor_e
  {
      NONE,   ///< preprocessing is off
      ICU,    ///< chinese preprocessor based on ICU
      JIEBA   ///< chinese preprocessor based on Jieba
  };
  /// compression kinds; see DocstoreSettings_t::m_eCompression
  enum class Compression_e
  {
      NONE,
      LZ4,
      LZ4HC
  };
  165. CSphString CompressionToStr ( Compression_e eComp );
  166. const DWORD DEFAULT_DOCSTORE_BLOCK = 16384;
  167. const int DEFAULT_COMPRESSION_LEVEL = 9;
  168. struct DocstoreSettings_t : public SettingsWriter_c
  169. {
  170. Compression_e m_eCompression = Compression_e::LZ4;
  171. int m_iCompressionLevel = DEFAULT_COMPRESSION_LEVEL;
  172. DWORD m_uBlockSize = DEFAULT_DOCSTORE_BLOCK;
  173. void Format ( SettingsFormatter_c & tOut, FilenameBuilder_i * pFilenameBuilder ) const override;
  174. };
  /// hitless indexing mode
  enum ESphHitless
  {
      SPH_HITLESS_NONE = 0,   ///< every hit is present
      SPH_HITLESS_SOME = 1,   ///< some hits may be omitted (check the flag bit)
      SPH_HITLESS_ALL = 2     ///< the index has no hits at all
  };
  /// hit storage format
  enum ESphHitFormat
  {
      SPH_HIT_FORMAT_PLAIN = 0,   ///< every hit is stored in the hitlist
      SPH_HIT_FORMAT_INLINE = 1   ///< hits may be split and inlined into the doclist (aka 9-23)
  };
  186. enum ESphBigram : BYTE
  187. {
  188. SPH_BIGRAM_NONE = 0, ///< no bigrams
  189. SPH_BIGRAM_ALL = 1, ///< index all word pairs
  190. SPH_BIGRAM_FIRSTFREQ = 2, ///< only index pairs where one of the words is in a frequent words list
  191. SPH_BIGRAM_BOTHFREQ = 3 ///< only index pairs where both words are in a frequent words list
  192. };
  /// Jieba mode selection
  enum class JiebaMode_e
  {
      NONE,
      ACCURATE,
      FULL,
      SEARCH,

      DEFAULT = ACCURATE   ///< alias, shares its value with ACCURATE
  };
  201. class CSphIndexSettings : public CSphSourceSettings, public DocstoreSettings_t
  202. {
  203. public:
  204. ESphHitFormat m_eHitFormat = SPH_HIT_FORMAT_PLAIN;
  205. bool m_bHtmlStrip = false;
  206. CSphString m_sHtmlIndexAttrs;
  207. CSphString m_sHtmlRemoveElements;
  208. CSphString m_sZones;
  209. ESphHitless m_eHitless = SPH_HITLESS_NONE;
  210. CSphString m_sHitlessFiles;
  211. int m_iEmbeddedLimit = 0;
  212. SphOffset_t m_tBlobUpdateSpace {0};
  213. int m_iSkiplistBlockSize {32};
  214. KillListTargets_c m_tKlistTargets; ///< list of indexes to apply killlist to
  215. ESphBigram m_eBigramIndex = SPH_BIGRAM_NONE;
  216. CSphString m_sBigramWords;
  217. StrVec_t m_dBigramWords;
  218. DWORD m_uAotFilterMask = 0; ///< lemmatize_XX_all forces us to transform queries on the index level too
  219. Preprocessor_e m_ePreprocessor = Preprocessor_e::NONE;
  220. JiebaMode_e m_eJiebaMode = JiebaMode_e::DEFAULT;
  221. bool m_bJiebaHMM = true;
  222. CSphString m_sJiebaUserDictPath;
  223. CSphString m_sIndexTokenFilter; ///< indexing time token filter spec string (pretty useless for disk, vital for RT)
  224. bool m_bBinlog = true;
  225. bool Setup ( const CSphConfigSection & hIndex, const char * szIndexName, CSphString & sWarning, CSphString & sError );
  226. void Format ( SettingsFormatter_c & tOut, FilenameBuilder_i * pFilenameBuilder ) const override;
  227. private:
  228. void ParseStoredFields ( const CSphConfigSection & hIndex );
  229. bool ParseColumnarSettings ( const CSphConfigSection & hIndex, CSphString & sError );
  230. bool ParseKNNSettings ( const CSphConfigSection & hIndex, CSphString & sError );
  231. bool ParseSISettings ( const CSphConfigSection & hIndex, CSphString & sError );
  232. bool ParseDocstoreSettings ( const CSphConfigSection & hIndex, CSphString & sWarning, CSphString & sError );
  233. bool ParseCJKSegmentation ( const CSphConfigSection & hIndex, const StrVec_t & dMorphs, CSphString & sWarning, CSphString & sError );
  234. };
  /// file access modes; see FileAccessSettings_t
  enum class FileAccess_e
  {
      FILE,
      MMAP,
      MMAP_PREREAD,
      MLOCK,

      UNKNOWN   ///< last enumerator
  };
  /// ids of the mutable settings; MutableIndexSettings_c::IsSet() passes them to a bit vector as bit numbers
  enum class MutableName_e
  {
      EXPAND_KEYWORDS,
      RT_MEM_LIMIT,
      PREOPEN,

      ACCESS_PLAIN_ATTRS,
      ACCESS_BLOB_ATTRS,
      ACCESS_DOCLISTS,
      ACCESS_HITLISTS,
      ACCESS_DICT,

      READ_BUFFER_DOCS,
      READ_BUFFER_HITS,

      OPTIMIZE_CUTOFF,
      GLOBAL_IDF,

      DISKCHUNK_FLUSH_WRITE_TIMEOUT,
      DISKCHUNK_FLUSH_SEARCH_TIMEOUT,

      TOTAL   ///< last enumerator; equals the number of ids above it
  };
  /// default read buffer sizes; DEFAULT_READ_BUFFER seeds the FileAccessSettings_t read buffer members
  constexpr int DEFAULT_READ_BUFFER = 256 * 1024;
  constexpr int DEFAULT_READ_UNHINTED = 32768;
  263. struct FileAccessSettings_t : public SettingsWriter_c
  264. {
  265. FileAccess_e m_eAttr = FileAccess_e::MMAP_PREREAD;
  266. FileAccess_e m_eBlob = FileAccess_e::MMAP_PREREAD;
  267. FileAccess_e m_eDoclist = FileAccess_e::FILE;
  268. FileAccess_e m_eHitlist = FileAccess_e::FILE;
  269. FileAccess_e m_eDict = FileAccess_e::MMAP_PREREAD;
  270. int m_iReadBufferDocList = DEFAULT_READ_BUFFER;
  271. int m_iReadBufferHitList = DEFAULT_READ_BUFFER;
  272. bool operator== ( const FileAccessSettings_t & tOther ) const;
  273. bool operator!= ( const FileAccessSettings_t & tOther ) const;
  274. void Format ( SettingsFormatter_c & tOut, FilenameBuilder_i * pFilenameBuilder ) const override;
  275. };
  276. class MutableIndexSettings_c : public SettingsWriter_c
  277. {
  278. public:
  279. int m_iExpandKeywords;
  280. int64_t m_iMemLimit;
  281. bool m_bPreopen = false;
  282. FileAccessSettings_t m_tFileAccess;
  283. int m_iOptimizeCutoff;
  284. int m_iOptimizeCutoffKNN;
  285. CSphString m_sGlobalIDFPath;
  286. // flush check periods, in seconds
  287. int m_iFlushWrite;
  288. int m_iFlushSearch;
  289. MutableIndexSettings_c();
  290. static MutableIndexSettings_c & GetDefaults();
  291. bool Load ( const char * sFileName, const char * sIndexName );
  292. void Load ( const CSphConfigSection & hIndex, bool bNeedSave, StrVec_t * pWarnings );
  293. bool Save ( CSphString & sBuf ) const;
  294. bool NeedSave() const { return m_bNeedSave; }
  295. bool HasSettings() const { return ( m_dLoaded.BitCount()>0 ); }
  296. bool IsSet ( MutableName_e eOpt ) const { return ( HasSettings() && m_dLoaded.BitGet ( (int)eOpt ) ); }
  297. void Format ( SettingsFormatter_c & tOut, FilenameBuilder_i * ) const override;
  298. void Combine ( const MutableIndexSettings_c & tOther );
  299. private:
  300. CSphBitvec m_dLoaded;
  301. bool m_bNeedSave = false;
  302. };
  303. struct RtTypedAttr_t
  304. {
  305. ESphAttr m_eType;
  306. const char * m_szName;
  307. };
  308. int GetNumRtTypes();
  309. const RtTypedAttr_t & GetRtType ( int iType );
  310. bool StrToAttrEngine ( AttrEngine_e & eEngine, AttrEngine_e eDefault, const CSphString & sValue, CSphString & sError );
  311. struct CreateTableAttr_t
  312. {
  313. CSphColumnInfo m_tAttr;
  314. bool m_bFastFetch = true;
  315. bool m_bStringHash = true;
  316. bool m_bIndexed = false;
  317. bool m_bKNN = false;
  318. knn::IndexSettings_t m_tKNN;
  319. knn::ModelSettings_t m_tKNNModel;
  320. CSphString m_sKNNFrom;
  321. };
  322. struct NameValueStr_t
  323. {
  324. CSphString m_sName;
  325. CSphString m_sValue;
  326. };
  327. struct CreateTableSettings_t
  328. {
  329. CSphString m_sLike;
  330. bool m_bIfNotExists = false;
  331. CSphVector<CreateTableAttr_t> m_dAttrs;
  332. CSphVector<CSphColumnInfo> m_dFields;
  333. CSphVector<NameValueStr_t> m_dOpts;
  334. };
  335. class IndexSettingsContainer_i
  336. {
  337. public:
  338. virtual ~IndexSettingsContainer_i() {};
  339. virtual bool Populate ( const CreateTableSettings_t & tCreateTable, bool bExtCopy ) = 0;
  340. virtual bool Add ( const char * szName, const CSphString & sValue ) = 0;
  341. virtual bool Add ( const CSphString & sName, const CSphString & sValue ) = 0;
  342. virtual CSphString Get ( const CSphString & sName ) const =0 ;
  343. virtual CSphString GetList ( const CSphString & sName ) const = 0;
  344. virtual bool Contains ( const char * szName ) const = 0;
  345. virtual void RemoveKeys ( const CSphString & sName ) = 0;
  346. virtual bool AddOption ( const CSphString & sName, const CSphString & sValue, bool bExtCopy ) = 0;
  347. virtual bool CheckPaths() = 0;
  348. virtual bool CopyExternalFiles ( const CSphString & sIndexPath, int iSuffix ) = 0;
  349. virtual void ResetCleanup() = 0;
  350. virtual const CSphConfigSection & AsCfg() const = 0;
  351. virtual const CSphString & GetError() const = 0;
  352. };
  353. IndexSettingsContainer_i * CreateIndexSettingsContainer ();
  354. class ISphTokenizer;
  355. class CSphDict;
  356. class CSphIndex;
  357. class Writer_i;
  /// external files format selector; a parameter of BuildCreateTable()
  enum class ExtFilesFormat_e
  {
      FILE,
      LIST
  };
  363. void SaveTokenizerSettings ( Writer_i & tWriter, const TokenizerRefPtr_c& pTokenizer, int iEmbeddedLimit );
  364. void SaveDictionarySettings ( Writer_i & tWriter, const DictRefPtr_c& pDict, bool bForceWordDict, int iEmbeddedLimit );
  365. void DumpSettings ( StringBuilder_c & tBuf, const CSphIndex & tIndex, FilenameBuilder_i * pFilenameBuilder );
  366. void DumpSettingsCfg ( FILE * fp, const CSphIndex & tIndex, FilenameBuilder_i * pFilenameBuilder );
  367. void DumpReadable ( FILE * fp, const CSphIndex & tIndex, const CSphEmbeddedFiles & tEmbeddedFiles, FilenameBuilder_i * pFilenameBuilder );
  368. /// try to set dictionary, tokenizer and misc settings for an index (if not already set)
  369. bool sphFixupIndexSettings ( CSphIndex * pIndex, const CSphConfigSection & hIndex, bool bStripFile, FilenameBuilder_i * pFilenameBuilder, StrVec_t & dWarnings, CSphString & sError );
  370. CSphString BuildCreateTable ( const CSphString & sName, const CSphIndex * pIndex, const CSphSchema & tSchema, ExtFilesFormat_e eExt );
  371. // daemon-level callback
  372. using CreateFilenameBuilder_fn = std::unique_ptr<FilenameBuilder_i> (*) ( const char * szIndex );
  373. void SetIndexFilenameBuilder ( CreateFilenameBuilder_fn pBuilder );
  374. CreateFilenameBuilder_fn GetIndexFilenameBuilder();
  375. const char * FileAccessName ( FileAccess_e eValue );
  376. FileAccess_e ParseFileAccess ( CSphString sVal );
  377. int ParseKeywordExpansion ( const char * sValue );
  378. void SaveMutableSettings ( const MutableIndexSettings_c & tSettings, const CSphString & sSettingsFile );
  379. FileAccess_e GetFileAccess ( const CSphConfigSection & hIndex, const char * sKey, bool bList, FileAccess_e eDefault );
  380. // combine per-index and per-attribute engine settings
  381. AttrEngine_e CombineEngines ( AttrEngine_e eIndexEngine, AttrEngine_e eAttrEngine );
  382. class JsonEscapedBuilder;
  383. void operator<< ( JsonEscapedBuilder& tOut, const CSphFieldFilterSettings& tFieldFilterSettings );
  384. void operator<< ( JsonEscapedBuilder& tOut, const CSphIndexSettings& tIndexSettings );
  385. void SaveTokenizerSettings ( JsonEscapedBuilder& tOut, const TokenizerRefPtr_c& pTokenizer, int iEmbeddedLimit );
  386. void SaveDictionarySettings ( JsonEscapedBuilder& tOut, const DictRefPtr_c& pDict, bool bForceWordDict, int iEmbeddedLimit );
  387. void SetDefaultAttrEngine ( AttrEngine_e eEngine );
  388. AttrEngine_e GetDefaultAttrEngine();
  389. bool ForceExactWords ( bool bWordDict, bool bHasMorphology, int iMinPrefixLen, int iMinInfixLen, bool bMorphFieldsEmpty );
  390. void LoadIndexSettingsJson ( bson::Bson_c tNode, CSphIndexSettings & tSettings );
  391. void operator << ( JsonEscapedBuilder & tOut, const CSphIndexSettings & tSettings );
  392. void LoadIndexSettings ( CSphIndexSettings & tSettings, CSphReader & tReader, DWORD uVersion );
  393. void SaveIndexSettings ( Writer_i & tWriter, const CSphIndexSettings & tSettings );
  394. CSphString FormatPath ( const CSphString & sFile, const FilenameBuilder_i * pFilenameBuilder );
  395. #endif // _indexsettings_