//
// Copyright (c) 2017-2026, Manticore Software LTD (https://manticoresearch.com)
// Copyright (c) 2001-2016, Andrew Aksyonoff
// Copyright (c) 2008-2016, Sphinx Technologies Inc
// All rights reserved
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License. You should have
// received a copy of the GPL license along with this program; if you
// did not, you can find it at http://www.gnu.org/
//

#include "collation.h"

#include "attribute.h"
#include "sphinxint.h"
#include "secondary/secondary.h"
#include "secondarylib.h"

/// zero-length placeholder substituted for blobs whose data pointer is null
static const char * EMPTY_STR = "";

/// Normalize a pair of string blobs in place before comparison.
/// When bDataPtr is set, each blob is replaced by the result of sphUnpackPtrAttr()
/// applied to its data pointer. Afterwards, any blob whose data pointer is null
/// is replaced by { EMPTY_STR, 0 }, so callers never hand a null pointer to
/// the libc comparison routines.
inline static void UnpackStrings ( ByteBlob_t& dStr1, ByteBlob_t& dStr2, bool bDataPtr )
{
	// strings that are stored in index don't need to be unpacked
	if ( bDataPtr )
	{
		dStr1 = sphUnpackPtrAttr ( dStr1.first );
		dStr2 = sphUnpackPtrAttr ( dStr2.first );
	}

	// replace null data pointers with an empty, zero-length string
	if ( !dStr1.first )
		dStr1 = {(const BYTE *) EMPTY_STR, 0};
	if ( !dStr2.first )
		dStr2 = {(const BYTE *) EMPTY_STR, 0};
}

/// binary collation: memcmp() over the common prefix (the shorter of the two lengths);
/// when the prefixes are equal, the result is the difference of the lengths
static int CollateBinary ( ByteBlob_t dStr1, ByteBlob_t dStr2, bool bDataPtr )
{
	UnpackStrings ( dStr1, dStr2, bDataPtr );
	int iRes = memcmp ( (const char *) dStr1.first, (const char *)dStr2.first, Min ( dStr1.second, dStr2.second ) );
	// equal common prefix: fall back to the length difference
	return iRes ? iRes : ( dStr1.second-dStr2.second );
}

/// libc_ci, wrapper for strncasecmp
/// compares the common prefix (the shorter of the two lengths); when the prefixes
/// compare equal, the result is the difference of the lengths
static int CollateLibcCI ( ByteBlob_t dStr1, ByteBlob_t dStr2, bool bDataPtr )
{
	UnpackStrings ( dStr1, dStr2, bDataPtr );
	int iRes = strncasecmp ( (const char *) dStr1.first, (const char *) dStr2.first, Min ( dStr1.second, dStr2.second ) );
	// equal common prefix: fall back to the length difference
	return iRes ? iRes : ( dStr1.second-dStr2.second );
}

/// libc_cs, wrapper for strcoll
static int CollateLibcCS ( ByteBlob_t dStr1, ByteBlob_t dStr2, bool bDataPtr )
{
	#define COLLATE_STACK_BUFFER 1024

	UnpackStrings ( dStr1, dStr2, bDataPtr );

	// strcoll wants asciiz strings, so we would have to copy them over
	// lets use stack buffer for smaller ones, and allocate from heap for bigger ones
	int iRes = 0;
	int iLen = Min ( dStr1.second, dStr2.second );
	// NOTE(review): the statement below is syntactically incomplete in this copy of
	// the file. Source text appears to have been lost between `iLen` and `uint64_t`
	// (the remainder of this function, and whatever declares HASH for HashStrLen,
	// are not visible). Restore from version control before building; the tokens
	// are kept verbatim here.
	if ( iLen
// Hash a byte string of explicit length. Returns SPH_FNV64_SEED when the pointer
// is null or the length is zero; otherwise delegates to HASH::Hash().
// NOTE(review): HASH is presumably a template parameter whose declaration was lost
// together with the text above - confirm against the repository.
uint64_t HashStrLen ( const BYTE * pStr, int iLen )
{
	if ( !pStr || !iLen )
		return SPH_FNV64_SEED;
	else
		return HASH::Hash ( pStr, iLen );
}

/// initialize collation LUTs
/// fills three 0x100-entry runs of g_dCollWeights_UTF8CI (at offsets 0x800, 0x900
/// and 0xa00) with computed weights, clears every entry of g_dCollPlanes_UTF8CI,
/// then points the eleven planes listed in dWeightPlane at consecutive
/// 0x100-entry slices of the weights table
void sphCollationInit()
{
	// plane number served by each consecutive 0x100-entry slice of the weights table
	const int dWeightPlane[0x0b] = { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x1e, 0x1f, 0x21, 0x24, 0xff };

	// generate missing weights
	// each entry is (plane base + i), minus a fixed offset inside the noted range;
	// the bool range test evaluates to 0 or 1 and so switches the offset on or off
	for ( int i=0; i<0x100; i++ )
	{
		g_dCollWeights_UTF8CI[i+0x800] = (unsigned short)( 0x2100 + i - ( i>=0x70 && i<=0x7f )*16 ); // 2170..217f, -16
		g_dCollWeights_UTF8CI[i+0x900] = (unsigned short)( 0x2400 + i - ( i>=0xd0 && i<=0xe9 )*26 ); // 24d0..24e9, -26
		g_dCollWeights_UTF8CI[i+0xa00] = (unsigned short)( 0xff00 + i - ( i>=0x41 && i<=0x5a )*32 ); // ff41..ff5a, -32
	}

	// generate planes table
	// planes not listed in dWeightPlane stay null
	for ( auto& dCollPlanes : g_dCollPlanes_UTF8CI )
		dCollPlanes = nullptr;

	for ( int i=0; i<0x0b; i++ )
		g_dCollPlanes_UTF8CI [ dWeightPlane[i] ] = g_dCollWeights_UTF8CI + 0x100*i;
}

/// collate a single codepoint
/// returns iCode unchanged when iCode>>16 is non-zero or when the plane selected
/// by iCode>>8 has no table; otherwise returns that plane's entry for the low byte
static inline int CollateUTF8CI ( int iCode )
{
	return ( ( iCode>>16 ) || !g_dCollPlanes_UTF8CI [ iCode>>8 ] )
		? iCode
		: g_dCollPlanes_UTF8CI [ iCode>>8 ][ iCode&0xff ];
}

/// utf8_general_ci
static int CollateUtf8GeneralCI ( ByteBlob_t dStr1, ByteBlob_t dStr2, bool bDataPtr)
{
	UnpackStrings ( dStr1, dStr2, bDataPtr );
	// one-past-the-end pointers of both strings
	const BYTE * pMax1 = dStr1.first + dStr1.second;
	const BYTE * pMax2 = dStr2.first + dStr2.second;

	// NOTE(review): the next two statements are damaged in this copy of the file.
	// Source text appears to be missing inside the `while` condition (so the loop
	// body is not visible) and again after `return ( dStr1.first`. Do not read
	// them as the intended logic; restore from version control. Tokens are kept
	// verbatim.
	while (dStr1.first=pMax1 && dStr2.first>=pMax2 )
		return 0;

	return ( dStr1.first
	// NOTE(review): from here on the tokens belong to a different routine whose
	// header is not visible. What can be seen: iLen bytes of pStr are copied to
	// the start of dBuf and NUL-terminated, then strxfrm() writes its output right
	// after that terminator, with iDstAvailable passed as the size limit.
	dBuf { iCompositeLen };

	memcpy ( dBuf.Begin(), pStr, iLen );
	dBuf[iLen] = '\0';

	// destination starts just past the NUL-terminated source copy
	BYTE * pDst = dBuf.Begin()+iLen+1;
	int iDstAvailable = dBuf.GetLength() - iLen - LOCALE_SAFE_GAP;

	auto iDstLen = (int) strxfrm ( (char *)pDst, (const char *) dBuf.Begin(), iDstAvailable );
	// NOTE(review): `iDstLenSetSprintf` is two source fragments fused together; text
	// appears to be missing between the assert on iDstLen and a SetSprintf() call.
	// The lines below are the visible tail of a function that reports an unknown
	// collation name and returns SPH_COLLATION_DEFAULT.
	assert ( iDstLenSetSprintf ( "Unknown collation: '%s'", sName.cstr() );
	return SPH_COLLATION_DEFAULT;
}

/// locale name last passed to SetLocale()
static CSphString g_sLocale;
/// locale object returned by GlobalLocale()
static std::locale g_tLocale;
/// flag returned by IsGlobalLocaleSet(); written only by SetLocale() for a non-empty name
static bool g_bGlobalLocaleSet = false;

/// Remember the locale name and rebuild the locale object returned by GlobalLocale().
/// An empty name leaves a default-constructed std::locale in place and returns
/// before g_bGlobalLocaleSet is touched, so the flag keeps its previous value.
/// A non-empty name stores bSet in the flag and constructs the locale from that name.
void SetLocale ( const CSphString & sLocale, bool bSet )
{
	g_sLocale = sLocale;
	g_tLocale = std::locale();
	if ( g_sLocale.IsEmpty() )
		return;

	g_bGlobalLocaleSet = bSet;
	// NOTE(review): std::locale(const char*) throws std::runtime_error for a name the
	// system does not recognise, and no handler is visible here; by that point the
	// name and the flag have already been updated. Confirm callers handle this.
	g_tLocale = std::locale ( sLocale.cstr() );
}

/// access the locale object last built by SetLocale()
const std::locale & GlobalLocale()
{
	return g_tLocale;
}

/// returns the flag last stored by SetLocale() (false until then)
bool IsGlobalLocaleSet()
{
	return g_bGlobalLocaleSet;
}