| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451 |
- //
- // Copyright (c) 2017-2026, Manticore Software LTD (https://manticoresearch.com)
- // Copyright (c) 2001-2016, Andrew Aksyonoff
- // Copyright (c) 2008-2016, Sphinx Technologies Inc
- // All rights reserved
- //
- // This program is free software; you can redistribute it and/or modify
- // it under the terms of the GNU General Public License. You should have
- // received a copy of the GPL license along with this program; if you
- // did not, you can find it at http://www.gnu.org/
- //
- #include "collation.h"
- #include "attribute.h"
- #include "sphinxint.h"
- #include "secondary/secondary.h"
- #include "secondarylib.h"
- static const char * EMPTY_STR = "";
- inline static void UnpackStrings ( ByteBlob_t& dStr1, ByteBlob_t& dStr2, bool bDataPtr )
- {
- // strings that are stored in index don't need to be unpacked
- if ( bDataPtr )
- {
- dStr1 = sphUnpackPtrAttr ( dStr1.first );
- dStr2 = sphUnpackPtrAttr ( dStr2.first );
- }
- if ( !dStr1.first )
- dStr1 = {(const BYTE *) EMPTY_STR, 0};
- if ( !dStr2.first )
- dStr2 = {(const BYTE *) EMPTY_STR, 0};
- }
- static int CollateBinary ( ByteBlob_t dStr1, ByteBlob_t dStr2, bool bDataPtr )
- {
- UnpackStrings ( dStr1, dStr2, bDataPtr );
- int iRes = memcmp ( (const char *) dStr1.first, (const char *)dStr2.first, Min ( dStr1.second, dStr2.second ) );
- return iRes ? iRes : ( dStr1.second-dStr2.second );
- }
- /// libc_ci, wrapper for strcasecmp
- static int CollateLibcCI ( ByteBlob_t dStr1, ByteBlob_t dStr2, bool bDataPtr )
- {
- UnpackStrings ( dStr1, dStr2, bDataPtr );
- int iRes = strncasecmp ( (const char *) dStr1.first, (const char *) dStr2.first, Min ( dStr1.second, dStr2.second ) );
- return iRes ? iRes : ( dStr1.second-dStr2.second );
- }
- /// libc_cs, wrapper for strcoll
- static int CollateLibcCS ( ByteBlob_t dStr1, ByteBlob_t dStr2, bool bDataPtr )
- {
- #define COLLATE_STACK_BUFFER 1024
- UnpackStrings ( dStr1, dStr2, bDataPtr );
- // strcoll wants asciiz strings, so we would have to copy them over
- // lets use stack buffer for smaller ones, and allocate from heap for bigger ones
- int iRes = 0;
- int iLen = Min ( dStr1.second, dStr2.second );
- if ( iLen<COLLATE_STACK_BUFFER )
- {
- // small strings on stack
- BYTE sBuf1[COLLATE_STACK_BUFFER];
- BYTE sBuf2[COLLATE_STACK_BUFFER];
- memcpy ( sBuf1, dStr1.first, iLen );
- memcpy ( sBuf2, dStr2.first, iLen );
- sBuf1[iLen] = sBuf2[iLen] = '\0';
- iRes = strcoll ( (const char*)sBuf1, (const char*)sBuf2 );
- } else
- {
- // big strings on heap
- char * pBuf1 = new char[iLen + 1];
- char * pBuf2 = new char[iLen + 1];
- memcpy ( pBuf1, dStr1.first, iLen );
- memcpy ( pBuf2, dStr2.first, iLen );
- pBuf1[iLen] = pBuf2[iLen] = '\0';
- iRes = strcoll ( (const char*)pBuf1, (const char*)pBuf2 );
- SafeDeleteArray ( pBuf2 );
- SafeDeleteArray ( pBuf1 );
- }
- return iRes ? iRes : ( dStr1.second-dStr2.second );
- }
- /////////////////////////////
- // UTF8_GENERAL_CI COLLATION
- /////////////////////////////
- /// 1st level LUT: indexed by bits 8..15 of a codepoint, each slot points to that plane's 256 weights; sphCollationInit() sets slots without collation data to nullptr
- static unsigned short * g_dCollPlanes_UTF8CI[0x100];
- /// 2nd level LUT, non-trivial collation data: eleven planes of 0x100 weights each; the first eight are spelled out below, the last three are filled in by sphCollationInit()
- static unsigned short g_dCollWeights_UTF8CI[0xb00] =
- {
- // weights for codepoints 0x0 to 0x5ff (six planes, offsets 0x000..0x5ff of this array)
- 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
- 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
- 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
- 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63,
- 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79,
- 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95,
- 96, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79,
- 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 123, 124, 125, 126, 127,
- 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143,
- 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159,
- 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175,
- 176, 177, 178, 179, 180, 924, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191,
- 65, 65, 65, 65, 65, 65, 198, 67, 69, 69, 69, 69, 73, 73, 73, 73,
- 208, 78, 79, 79, 79, 79, 79, 215, 216, 85, 85, 85, 85, 89, 222, 83,
- 65, 65, 65, 65, 65, 65, 198, 67, 69, 69, 69, 69, 73, 73, 73, 73,
- 208, 78, 79, 79, 79, 79, 79, 247, 216, 85, 85, 85, 85, 89, 222, 89,
- 65, 65, 65, 65, 65, 65, 67, 67, 67, 67, 67, 67, 67, 67, 68, 68,
- 272, 272, 69, 69, 69, 69, 69, 69, 69, 69, 69, 69, 71, 71, 71, 71,
- 71, 71, 71, 71, 72, 72, 294, 294, 73, 73, 73, 73, 73, 73, 73, 73,
- 73, 73, 306, 306, 74, 74, 75, 75, 312, 76, 76, 76, 76, 76, 76, 319,
- 319, 321, 321, 78, 78, 78, 78, 78, 78, 329, 330, 330, 79, 79, 79, 79,
- 79, 79, 338, 338, 82, 82, 82, 82, 82, 82, 83, 83, 83, 83, 83, 83,
- 83, 83, 84, 84, 84, 84, 358, 358, 85, 85, 85, 85, 85, 85, 85, 85,
- 85, 85, 85, 85, 87, 87, 89, 89, 89, 90, 90, 90, 90, 90, 90, 83,
- 384, 385, 386, 386, 388, 388, 390, 391, 391, 393, 394, 395, 395, 397, 398, 399,
- 400, 401, 401, 403, 404, 502, 406, 407, 408, 408, 410, 411, 412, 413, 414, 415,
- 79, 79, 418, 418, 420, 420, 422, 423, 423, 425, 426, 427, 428, 428, 430, 85,
- 85, 433, 434, 435, 435, 437, 437, 439, 440, 440, 442, 443, 444, 444, 446, 503,
- 448, 449, 450, 451, 452, 452, 452, 455, 455, 455, 458, 458, 458, 65, 65, 73,
- 73, 79, 79, 85, 85, 85, 85, 85, 85, 85, 85, 85, 85, 398, 65, 65,
- 65, 65, 198, 198, 484, 484, 71, 71, 75, 75, 79, 79, 79, 79, 439, 439,
- 74, 497, 497, 497, 71, 71, 502, 503, 78, 78, 65, 65, 198, 198, 216, 216,
- 65, 65, 65, 65, 69, 69, 69, 69, 73, 73, 73, 73, 79, 79, 79, 79,
- 82, 82, 82, 82, 85, 85, 85, 85, 83, 83, 84, 84, 540, 540, 72, 72,
- 544, 545, 546, 546, 548, 548, 65, 65, 69, 69, 79, 79, 79, 79, 79, 79,
- 79, 79, 89, 89, 564, 565, 566, 567, 568, 569, 570, 571, 572, 573, 574, 575,
- 576, 577, 578, 579, 580, 581, 582, 583, 584, 585, 586, 587, 588, 589, 590, 591,
- 592, 593, 594, 385, 390, 597, 393, 394, 600, 399, 602, 400, 604, 605, 606, 607,
- 403, 609, 610, 404, 612, 613, 614, 615, 407, 406, 618, 619, 620, 621, 622, 412,
- 624, 625, 413, 627, 628, 415, 630, 631, 632, 633, 634, 635, 636, 637, 638, 639,
- 422, 641, 642, 425, 644, 645, 646, 647, 430, 649, 433, 434, 652, 653, 654, 655,
- 656, 657, 439, 659, 660, 661, 662, 663, 664, 665, 666, 667, 668, 669, 670, 671,
- 672, 673, 674, 675, 676, 677, 678, 679, 680, 681, 682, 683, 684, 685, 686, 687,
- 688, 689, 690, 691, 692, 693, 694, 695, 696, 697, 698, 699, 700, 701, 702, 703,
- 704, 705, 706, 707, 708, 709, 710, 711, 712, 713, 714, 715, 716, 717, 718, 719,
- 720, 721, 722, 723, 724, 725, 726, 727, 728, 729, 730, 731, 732, 733, 734, 735,
- 736, 737, 738, 739, 740, 741, 742, 743, 744, 745, 746, 747, 748, 749, 750, 751,
- 752, 753, 754, 755, 756, 757, 758, 759, 760, 761, 762, 763, 764, 765, 766, 767,
- 768, 769, 770, 771, 772, 773, 774, 775, 776, 777, 778, 779, 780, 781, 782, 783,
- 784, 785, 786, 787, 788, 789, 790, 791, 792, 793, 794, 795, 796, 797, 798, 799,
- 800, 801, 802, 803, 804, 805, 806, 807, 808, 809, 810, 811, 812, 813, 814, 815,
- 816, 817, 818, 819, 820, 821, 822, 823, 824, 825, 826, 827, 828, 829, 830, 831,
- 832, 833, 834, 835, 836, 921, 838, 839, 840, 841, 842, 843, 844, 845, 846, 847,
- 848, 849, 850, 851, 852, 853, 854, 855, 856, 857, 858, 859, 860, 861, 862, 863,
- 864, 865, 866, 867, 868, 869, 870, 871, 872, 873, 874, 875, 876, 877, 878, 879,
- 880, 881, 882, 883, 884, 885, 886, 887, 888, 889, 890, 891, 892, 893, 894, 895,
- 896, 897, 898, 899, 900, 901, 913, 903, 917, 919, 921, 907, 927, 909, 933, 937,
- 921, 913, 914, 915, 916, 917, 918, 919, 920, 921, 922, 923, 924, 925, 926, 927,
- 928, 929, 930, 931, 932, 933, 934, 935, 936, 937, 921, 933, 913, 917, 919, 921,
- 933, 913, 914, 915, 916, 917, 918, 919, 920, 921, 922, 923, 924, 925, 926, 927,
- 928, 929, 931, 931, 932, 933, 934, 935, 936, 937, 921, 933, 927, 933, 937, 975,
- 914, 920, 978, 978, 978, 934, 928, 983, 984, 985, 986, 986, 988, 988, 990, 990,
- 992, 992, 994, 994, 996, 996, 998, 998, 1000, 1000, 1002, 1002, 1004, 1004, 1006, 1006,
- 922, 929, 931, 1011, 1012, 1013, 1014, 1015, 1016, 1017, 1018, 1019, 1020, 1021, 1022, 1023,
- 1045, 1045, 1026, 1043, 1028, 1029, 1030, 1030, 1032, 1033, 1034, 1035, 1050, 1048, 1059, 1039,
- 1040, 1041, 1042, 1043, 1044, 1045, 1046, 1047, 1048, 1049, 1050, 1051, 1052, 1053, 1054, 1055,
- 1056, 1057, 1058, 1059, 1060, 1061, 1062, 1063, 1064, 1065, 1066, 1067, 1068, 1069, 1070, 1071,
- 1040, 1041, 1042, 1043, 1044, 1045, 1046, 1047, 1048, 1049, 1050, 1051, 1052, 1053, 1054, 1055,
- 1056, 1057, 1058, 1059, 1060, 1061, 1062, 1063, 1064, 1065, 1066, 1067, 1068, 1069, 1070, 1071,
- 1045, 1045, 1026, 1043, 1028, 1029, 1030, 1030, 1032, 1033, 1034, 1035, 1050, 1048, 1059, 1039,
- 1120, 1120, 1122, 1122, 1124, 1124, 1126, 1126, 1128, 1128, 1130, 1130, 1132, 1132, 1134, 1134,
- 1136, 1136, 1138, 1138, 1140, 1140, 1140, 1140, 1144, 1144, 1146, 1146, 1148, 1148, 1150, 1150,
- 1152, 1152, 1154, 1155, 1156, 1157, 1158, 1159, 1160, 1161, 1162, 1163, 1164, 1164, 1166, 1166,
- 1168, 1168, 1170, 1170, 1172, 1172, 1174, 1174, 1176, 1176, 1178, 1178, 1180, 1180, 1182, 1182,
- 1184, 1184, 1186, 1186, 1188, 1188, 1190, 1190, 1192, 1192, 1194, 1194, 1196, 1196, 1198, 1198,
- 1200, 1200, 1202, 1202, 1204, 1204, 1206, 1206, 1208, 1208, 1210, 1210, 1212, 1212, 1214, 1214,
- 1216, 1046, 1046, 1219, 1219, 1221, 1222, 1223, 1223, 1225, 1226, 1227, 1227, 1229, 1230, 1231,
- 1040, 1040, 1040, 1040, 1236, 1236, 1045, 1045, 1240, 1240, 1240, 1240, 1046, 1046, 1047, 1047,
- 1248, 1248, 1048, 1048, 1048, 1048, 1054, 1054, 1256, 1256, 1256, 1256, 1069, 1069, 1059, 1059,
- 1059, 1059, 1059, 1059, 1063, 1063, 1270, 1271, 1067, 1067, 1274, 1275, 1276, 1277, 1278, 1279,
- 1280, 1281, 1282, 1283, 1284, 1285, 1286, 1287, 1288, 1289, 1290, 1291, 1292, 1293, 1294, 1295,
- 1296, 1297, 1298, 1299, 1300, 1301, 1302, 1303, 1304, 1305, 1306, 1307, 1308, 1309, 1310, 1311,
- 1312, 1313, 1314, 1315, 1316, 1317, 1318, 1319, 1320, 1321, 1322, 1323, 1324, 1325, 1326, 1327,
- 1328, 1329, 1330, 1331, 1332, 1333, 1334, 1335, 1336, 1337, 1338, 1339, 1340, 1341, 1342, 1343,
- 1344, 1345, 1346, 1347, 1348, 1349, 1350, 1351, 1352, 1353, 1354, 1355, 1356, 1357, 1358, 1359,
- 1360, 1361, 1362, 1363, 1364, 1365, 1366, 1367, 1368, 1369, 1370, 1371, 1372, 1373, 1374, 1375,
- 1376, 1329, 1330, 1331, 1332, 1333, 1334, 1335, 1336, 1337, 1338, 1339, 1340, 1341, 1342, 1343,
- 1344, 1345, 1346, 1347, 1348, 1349, 1350, 1351, 1352, 1353, 1354, 1355, 1356, 1357, 1358, 1359,
- 1360, 1361, 1362, 1363, 1364, 1365, 1366, 1415, 1416, 1417, 1418, 1419, 1420, 1421, 1422, 1423,
- 1424, 1425, 1426, 1427, 1428, 1429, 1430, 1431, 1432, 1433, 1434, 1435, 1436, 1437, 1438, 1439,
- 1440, 1441, 1442, 1443, 1444, 1445, 1446, 1447, 1448, 1449, 1450, 1451, 1452, 1453, 1454, 1455,
- 1456, 1457, 1458, 1459, 1460, 1461, 1462, 1463, 1464, 1465, 1466, 1467, 1468, 1469, 1470, 1471,
- 1472, 1473, 1474, 1475, 1476, 1477, 1478, 1479, 1480, 1481, 1482, 1483, 1484, 1485, 1486, 1487,
- 1488, 1489, 1490, 1491, 1492, 1493, 1494, 1495, 1496, 1497, 1498, 1499, 1500, 1501, 1502, 1503,
- 1504, 1505, 1506, 1507, 1508, 1509, 1510, 1511, 1512, 1513, 1514, 1515, 1516, 1517, 1518, 1519,
- 1520, 1521, 1522, 1523, 1524, 1525, 1526, 1527, 1528, 1529, 1530, 1531, 1532, 1533, 1534, 1535,
- // weights for codepoints 0x1e00 to 0x1fff (two planes, offsets 0x600..0x7ff of this array)
- 65, 65, 66, 66, 66, 66, 66, 66, 67, 67, 68, 68, 68, 68, 68, 68,
- 68, 68, 68, 68, 69, 69, 69, 69, 69, 69, 69, 69, 69, 69, 70, 70,
- 71, 71, 72, 72, 72, 72, 72, 72, 72, 72, 72, 72, 73, 73, 73, 73,
- 75, 75, 75, 75, 75, 75, 76, 76, 76, 76, 76, 76, 76, 76, 77, 77,
- 77, 77, 77, 77, 78, 78, 78, 78, 78, 78, 78, 78, 79, 79, 79, 79,
- 79, 79, 79, 79, 80, 80, 80, 80, 82, 82, 82, 82, 82, 82, 82, 82,
- 83, 83, 83, 83, 83, 83, 83, 83, 83, 83, 84, 84, 84, 84, 84, 84,
- 84, 84, 85, 85, 85, 85, 85, 85, 85, 85, 85, 85, 86, 86, 86, 86,
- 87, 87, 87, 87, 87, 87, 87, 87, 87, 87, 88, 88, 88, 88, 89, 89,
- 90, 90, 90, 90, 90, 90, 72, 84, 87, 89, 7834, 83, 7836, 7837, 7838, 7839,
- 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65,
- 65, 65, 65, 65, 65, 65, 65, 65, 69, 69, 69, 69, 69, 69, 69, 69,
- 69, 69, 69, 69, 69, 69, 69, 69, 73, 73, 73, 73, 79, 79, 79, 79,
- 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79,
- 79, 79, 79, 79, 85, 85, 85, 85, 85, 85, 85, 85, 85, 85, 85, 85,
- 85, 85, 89, 89, 89, 89, 89, 89, 89, 89, 7930, 7931, 7932, 7933, 7934, 7935,
- 913, 913, 913, 913, 913, 913, 913, 913, 913, 913, 913, 913, 913, 913, 913, 913,
- 917, 917, 917, 917, 917, 917, 7958, 7959, 917, 917, 917, 917, 917, 917, 7966, 7967,
- 919, 919, 919, 919, 919, 919, 919, 919, 919, 919, 919, 919, 919, 919, 919, 919,
- 921, 921, 921, 921, 921, 921, 921, 921, 921, 921, 921, 921, 921, 921, 921, 921,
- 927, 927, 927, 927, 927, 927, 8006, 8007, 927, 927, 927, 927, 927, 927, 8014, 8015,
- 933, 933, 933, 933, 933, 933, 933, 933, 8024, 933, 8026, 933, 8028, 933, 8030, 933,
- 937, 937, 937, 937, 937, 937, 937, 937, 937, 937, 937, 937, 937, 937, 937, 937,
- 913, 8123, 917, 8137, 919, 8139, 921, 8155, 927, 8185, 933, 8171, 937, 8187, 8062, 8063,
- 913, 913, 913, 913, 913, 913, 913, 913, 913, 913, 913, 913, 913, 913, 913, 913,
- 919, 919, 919, 919, 919, 919, 919, 919, 919, 919, 919, 919, 919, 919, 919, 919,
- 937, 937, 937, 937, 937, 937, 937, 937, 937, 937, 937, 937, 937, 937, 937, 937,
- 913, 913, 913, 913, 913, 8117, 913, 913, 913, 913, 913, 8123, 913, 8125, 921, 8127,
- 8128, 8129, 919, 919, 919, 8133, 919, 919, 917, 8137, 919, 8139, 919, 8141, 8142, 8143,
- 921, 921, 921, 8147, 8148, 8149, 921, 921, 921, 921, 921, 8155, 8156, 8157, 8158, 8159,
- 933, 933, 933, 8163, 929, 929, 933, 933, 933, 933, 933, 8171, 929, 8173, 8174, 8175,
- 8176, 8177, 937, 937, 937, 8181, 937, 937, 927, 8185, 937, 8187, 937, 8189, 8190, 8191
- // space for codepoints 0x21xx, 0x24xx, 0xffxx (offsets 0x800..0xaff; no initializers here, generated by sphCollationInit)
- };
- template <class HASH>
- uint64_t HashStrLen ( const BYTE * pStr, int iLen )
- {
- if ( !pStr || !iLen )
- return SPH_FNV64_SEED;
- else
- return HASH::Hash ( pStr, iLen );
- }
- /// initialize collation LUTs
- void sphCollationInit()
- {
- const int dWeightPlane[0x0b] = { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x1e, 0x1f, 0x21, 0x24, 0xff };
- // generate missing weights
- for ( int i=0; i<0x100; i++ )
- {
- g_dCollWeights_UTF8CI[i+0x800] = (unsigned short)( 0x2100 + i - ( i>=0x70 && i<=0x7f )*16 ); // 2170..217f, -16
- g_dCollWeights_UTF8CI[i+0x900] = (unsigned short)( 0x2400 + i - ( i>=0xd0 && i<=0xe9 )*26 ); // 24d0..24e9, -26
- g_dCollWeights_UTF8CI[i+0xa00] = (unsigned short)( 0xff00 + i - ( i>=0x41 && i<=0x5a )*32 ); // ff41..ff5a, -32
- }
- // generate planes table
- for ( auto& dCollPlanes : g_dCollPlanes_UTF8CI )
- dCollPlanes = nullptr;
- for ( int i=0; i<0x0b; i++ )
- g_dCollPlanes_UTF8CI [ dWeightPlane[i] ] = g_dCollWeights_UTF8CI + 0x100*i;
- }
- /// collate a single codepoint
- static inline int CollateUTF8CI ( int iCode )
- {
- return ( ( iCode>>16 ) || !g_dCollPlanes_UTF8CI [ iCode>>8 ] )
- ? iCode
- : g_dCollPlanes_UTF8CI [ iCode>>8 ][ iCode&0xff ];
- }
- /// utf8_general_ci
- static int CollateUtf8GeneralCI ( ByteBlob_t dStr1, ByteBlob_t dStr2, bool bDataPtr)
- {
- UnpackStrings ( dStr1, dStr2, bDataPtr );
- const BYTE * pMax1 = dStr1.first + dStr1.second;
- const BYTE * pMax2 = dStr2.first + dStr2.second;
- while (dStr1.first<pMax1 && dStr2.first<pMax2 )
- {
- // FIXME! on broken data, decode might go beyond buffer bounds
- int iCode1 = sphUTF8Decode ( dStr1.first );
- int iCode2 = sphUTF8Decode ( dStr2.first );
- if ( !iCode1 && !iCode2 )
- return 0;
- if ( !iCode1 || !iCode2 )
- return !iCode1 ? -1 : 1;
- if ( iCode1==iCode2 )
- continue;
- iCode1 = CollateUTF8CI ( iCode1 );
- iCode2 = CollateUTF8CI ( iCode2 );
- if ( iCode1!=iCode2 )
- return iCode1-iCode2;
- }
- if ( dStr1.first>=pMax1 && dStr2.first>=pMax2 )
- return 0;
- return ( dStr1.first<pMax1 ) ? 1 : -1;
- }
- SphStringCmp_fn GetStringCmpFunc ( ESphCollation eCollation )
- {
- switch ( eCollation )
- {
- case SPH_COLLATION_LIBC_CS: return CollateLibcCS;
- case SPH_COLLATION_UTF8_GENERAL_CI: return CollateUtf8GeneralCI;
- case SPH_COLLATION_BINARY: return CollateBinary;
- default: return CollateLibcCI;
- }
- }
- /////////////////////////////
- // hashing functions
- /////////////////////////////
- uint64_t LibcCSHash_fn::Hash ( const BYTE * pStr, int iLen, uint64_t uPrev )
- {
- const int LOCALE_SAFE_GAP = 16;
- assert ( pStr && iLen );
- int iCompositeLen = iLen + 1 + (int)( 3.0f * (float)iLen ) + LOCALE_SAFE_GAP;
- CSphFixedVector<BYTE> dBuf { iCompositeLen };
- memcpy ( dBuf.Begin(), pStr, iLen );
- dBuf[iLen] = '\0';
- BYTE * pDst = dBuf.Begin()+iLen+1;
- int iDstAvailable = dBuf.GetLength() - iLen - LOCALE_SAFE_GAP;
- auto iDstLen = (int) strxfrm ( (char *)pDst, (const char *) dBuf.Begin(), iDstAvailable );
- assert ( iDstLen<iDstAvailable+LOCALE_SAFE_GAP );
- uint64_t uAcc = sphFNV64 ( pDst, iDstLen, uPrev );
- return uAcc;
- }
- uint64_t LibcCIHash_fn::Hash ( const BYTE * pStr, int iLen, uint64_t uPrev )
- {
- assert ( pStr && iLen );
- uint64_t uAcc = uPrev;
- while ( iLen-- )
- {
- int iChar = tolower ( *pStr++ );
- uAcc = sphFNV64 ( &iChar, 4, uAcc );
- }
- return uAcc;
- }
- uint64_t Utf8CIHash_fn::Hash ( const BYTE * pStr, int iLen, uint64_t uPrev )
- {
- assert ( pStr && iLen );
- uint64_t uAcc = uPrev;
- while ( iLen-- )
- {
- const BYTE * pCur = pStr++;
- int iCode = sphUTF8Decode ( pCur );
- iCode = CollateUTF8CI ( iCode );
- uAcc = sphFNV64 ( &iCode, 4, uAcc );
- }
- return uAcc;
- }
- uint64_t BinaryHash_fn::Hash ( const BYTE * pStr, int iLen, uint64_t uPrev )
- {
- assert ( pStr && iLen );
- return sphFNV64 ( pStr, iLen, uPrev );
- }
- /////////////////////////////////////////////////////////////////////
- StrHashCalc_fn GetStringHashCalcFunc ( ESphCollation eCollation )
- {
- switch ( eCollation )
- {
- case SPH_COLLATION_LIBC_CS: return LibcCSHash_fn::Hash;
- case SPH_COLLATION_UTF8_GENERAL_CI: return Utf8CIHash_fn::Hash;
- case SPH_COLLATION_BINARY: return BinaryHash_fn::Hash;
- default: return LibcCIHash_fn::Hash;
- }
- }
- volatile ESphCollation& GlobalCollation()
- {
- static ESphCollation eCollation = SPH_COLLATION_DEFAULT;
- return eCollation;
- }
- ESphCollation sphCollationFromName ( const CSphString & sName, CSphString * pError )
- {
- assert ( pError );
- // FIXME! replace with a hash lookup?
- if ( sName=="libc_ci" )
- return SPH_COLLATION_LIBC_CI;
- else if ( sName=="libc_cs" )
- return SPH_COLLATION_LIBC_CS;
- else if ( sName=="utf8_general_ci" )
- return SPH_COLLATION_UTF8_GENERAL_CI;
- else if ( sName=="binary" )
- return SPH_COLLATION_BINARY;
- pError->SetSprintf ( "Unknown collation: '%s'", sName.cstr() );
- return SPH_COLLATION_DEFAULT;
- }
- static CSphString g_sLocale;
- static std::locale g_tLocale;
- static bool g_bGlobalLocaleSet = false;
- void SetLocale ( const CSphString & sLocale, bool bSet )
- {
- g_sLocale = sLocale;
- g_tLocale = std::locale();
- if ( g_sLocale.IsEmpty() )
- return;
- g_bGlobalLocaleSet = bSet;
- g_tLocale = std::locale ( sLocale.cstr() );
- }
- const std::locale & GlobalLocale()
- {
- return g_tLocale;
- }
- bool IsGlobalLocaleSet()
- {
- return g_bGlobalLocaleSet;
- }
|