collation.cpp 18 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451
  1. //
  2. // Copyright (c) 2017-2026, Manticore Software LTD (https://manticoresearch.com)
  3. // Copyright (c) 2001-2016, Andrew Aksyonoff
  4. // Copyright (c) 2008-2016, Sphinx Technologies Inc
  5. // All rights reserved
  6. //
  7. // This program is free software; you can redistribute it and/or modify
  8. // it under the terms of the GNU General Public License. You should have
  9. // received a copy of the GPL license along with this program; if you
  10. // did not, you can find it at http://www.gnu.org/
  11. //
  12. #include "collation.h"
  13. #include "attribute.h"
  14. #include "sphinxint.h"
  15. #include "secondary/secondary.h"
  16. #include "secondarylib.h"
  17. static const char * EMPTY_STR = "";
  18. inline static void UnpackStrings ( ByteBlob_t& dStr1, ByteBlob_t& dStr2, bool bDataPtr )
  19. {
  20. // strings that are stored in index don't need to be unpacked
  21. if ( bDataPtr )
  22. {
  23. dStr1 = sphUnpackPtrAttr ( dStr1.first );
  24. dStr2 = sphUnpackPtrAttr ( dStr2.first );
  25. }
  26. if ( !dStr1.first )
  27. dStr1 = {(const BYTE *) EMPTY_STR, 0};
  28. if ( !dStr2.first )
  29. dStr2 = {(const BYTE *) EMPTY_STR, 0};
  30. }
  31. static int CollateBinary ( ByteBlob_t dStr1, ByteBlob_t dStr2, bool bDataPtr )
  32. {
  33. UnpackStrings ( dStr1, dStr2, bDataPtr );
  34. int iRes = memcmp ( (const char *) dStr1.first, (const char *)dStr2.first, Min ( dStr1.second, dStr2.second ) );
  35. return iRes ? iRes : ( dStr1.second-dStr2.second );
  36. }
  37. /// libc_ci, wrapper for strcasecmp
  38. static int CollateLibcCI ( ByteBlob_t dStr1, ByteBlob_t dStr2, bool bDataPtr )
  39. {
  40. UnpackStrings ( dStr1, dStr2, bDataPtr );
  41. int iRes = strncasecmp ( (const char *) dStr1.first, (const char *) dStr2.first, Min ( dStr1.second, dStr2.second ) );
  42. return iRes ? iRes : ( dStr1.second-dStr2.second );
  43. }
  44. /// libc_cs, wrapper for strcoll
  45. static int CollateLibcCS ( ByteBlob_t dStr1, ByteBlob_t dStr2, bool bDataPtr )
  46. {
  47. #define COLLATE_STACK_BUFFER 1024
  48. UnpackStrings ( dStr1, dStr2, bDataPtr );
  49. // strcoll wants asciiz strings, so we would have to copy them over
  50. // lets use stack buffer for smaller ones, and allocate from heap for bigger ones
  51. int iRes = 0;
  52. int iLen = Min ( dStr1.second, dStr2.second );
  53. if ( iLen<COLLATE_STACK_BUFFER )
  54. {
  55. // small strings on stack
  56. BYTE sBuf1[COLLATE_STACK_BUFFER];
  57. BYTE sBuf2[COLLATE_STACK_BUFFER];
  58. memcpy ( sBuf1, dStr1.first, iLen );
  59. memcpy ( sBuf2, dStr2.first, iLen );
  60. sBuf1[iLen] = sBuf2[iLen] = '\0';
  61. iRes = strcoll ( (const char*)sBuf1, (const char*)sBuf2 );
  62. } else
  63. {
  64. // big strings on heap
  65. char * pBuf1 = new char[iLen + 1];
  66. char * pBuf2 = new char[iLen + 1];
  67. memcpy ( pBuf1, dStr1.first, iLen );
  68. memcpy ( pBuf2, dStr2.first, iLen );
  69. pBuf1[iLen] = pBuf2[iLen] = '\0';
  70. iRes = strcoll ( (const char*)pBuf1, (const char*)pBuf2 );
  71. SafeDeleteArray ( pBuf2 );
  72. SafeDeleteArray ( pBuf1 );
  73. }
  74. return iRes ? iRes : ( dStr1.second-dStr2.second );
  75. }
  76. /////////////////////////////
  77. // UTF8_GENERAL_CI COLLATION
  78. /////////////////////////////
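  79. /// 1st level LUT, indexed by the high byte of a 16-bit codepoint; a nullptr entry means that plane has no collation weights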
  80. static unsigned short * g_dCollPlanes_UTF8CI[0x100];
  81. /// 2nd level LUT, non-trivial collation data: 0x0b blocks of 0x100 weights each, one block per plane listed in sphCollationInit()
  82. static unsigned short g_dCollWeights_UTF8CI[0xb00] =
  83. {
  84. // weights for codepoints 0x0 to 0x5ff
  85. 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
  86. 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
  87. 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
  88. 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63,
  89. 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79,
  90. 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95,
  91. 96, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79,
  92. 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 123, 124, 125, 126, 127,
  93. 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143,
  94. 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159,
  95. 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175,
  96. 176, 177, 178, 179, 180, 924, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191,
  97. 65, 65, 65, 65, 65, 65, 198, 67, 69, 69, 69, 69, 73, 73, 73, 73,
  98. 208, 78, 79, 79, 79, 79, 79, 215, 216, 85, 85, 85, 85, 89, 222, 83,
  99. 65, 65, 65, 65, 65, 65, 198, 67, 69, 69, 69, 69, 73, 73, 73, 73,
  100. 208, 78, 79, 79, 79, 79, 79, 247, 216, 85, 85, 85, 85, 89, 222, 89,
  101. 65, 65, 65, 65, 65, 65, 67, 67, 67, 67, 67, 67, 67, 67, 68, 68,
  102. 272, 272, 69, 69, 69, 69, 69, 69, 69, 69, 69, 69, 71, 71, 71, 71,
  103. 71, 71, 71, 71, 72, 72, 294, 294, 73, 73, 73, 73, 73, 73, 73, 73,
  104. 73, 73, 306, 306, 74, 74, 75, 75, 312, 76, 76, 76, 76, 76, 76, 319,
  105. 319, 321, 321, 78, 78, 78, 78, 78, 78, 329, 330, 330, 79, 79, 79, 79,
  106. 79, 79, 338, 338, 82, 82, 82, 82, 82, 82, 83, 83, 83, 83, 83, 83,
  107. 83, 83, 84, 84, 84, 84, 358, 358, 85, 85, 85, 85, 85, 85, 85, 85,
  108. 85, 85, 85, 85, 87, 87, 89, 89, 89, 90, 90, 90, 90, 90, 90, 83,
  109. 384, 385, 386, 386, 388, 388, 390, 391, 391, 393, 394, 395, 395, 397, 398, 399,
  110. 400, 401, 401, 403, 404, 502, 406, 407, 408, 408, 410, 411, 412, 413, 414, 415,
  111. 79, 79, 418, 418, 420, 420, 422, 423, 423, 425, 426, 427, 428, 428, 430, 85,
  112. 85, 433, 434, 435, 435, 437, 437, 439, 440, 440, 442, 443, 444, 444, 446, 503,
  113. 448, 449, 450, 451, 452, 452, 452, 455, 455, 455, 458, 458, 458, 65, 65, 73,
  114. 73, 79, 79, 85, 85, 85, 85, 85, 85, 85, 85, 85, 85, 398, 65, 65,
  115. 65, 65, 198, 198, 484, 484, 71, 71, 75, 75, 79, 79, 79, 79, 439, 439,
  116. 74, 497, 497, 497, 71, 71, 502, 503, 78, 78, 65, 65, 198, 198, 216, 216,
  117. 65, 65, 65, 65, 69, 69, 69, 69, 73, 73, 73, 73, 79, 79, 79, 79,
  118. 82, 82, 82, 82, 85, 85, 85, 85, 83, 83, 84, 84, 540, 540, 72, 72,
  119. 544, 545, 546, 546, 548, 548, 65, 65, 69, 69, 79, 79, 79, 79, 79, 79,
  120. 79, 79, 89, 89, 564, 565, 566, 567, 568, 569, 570, 571, 572, 573, 574, 575,
  121. 576, 577, 578, 579, 580, 581, 582, 583, 584, 585, 586, 587, 588, 589, 590, 591,
  122. 592, 593, 594, 385, 390, 597, 393, 394, 600, 399, 602, 400, 604, 605, 606, 607,
  123. 403, 609, 610, 404, 612, 613, 614, 615, 407, 406, 618, 619, 620, 621, 622, 412,
  124. 624, 625, 413, 627, 628, 415, 630, 631, 632, 633, 634, 635, 636, 637, 638, 639,
  125. 422, 641, 642, 425, 644, 645, 646, 647, 430, 649, 433, 434, 652, 653, 654, 655,
  126. 656, 657, 439, 659, 660, 661, 662, 663, 664, 665, 666, 667, 668, 669, 670, 671,
  127. 672, 673, 674, 675, 676, 677, 678, 679, 680, 681, 682, 683, 684, 685, 686, 687,
  128. 688, 689, 690, 691, 692, 693, 694, 695, 696, 697, 698, 699, 700, 701, 702, 703,
  129. 704, 705, 706, 707, 708, 709, 710, 711, 712, 713, 714, 715, 716, 717, 718, 719,
  130. 720, 721, 722, 723, 724, 725, 726, 727, 728, 729, 730, 731, 732, 733, 734, 735,
  131. 736, 737, 738, 739, 740, 741, 742, 743, 744, 745, 746, 747, 748, 749, 750, 751,
  132. 752, 753, 754, 755, 756, 757, 758, 759, 760, 761, 762, 763, 764, 765, 766, 767,
  133. 768, 769, 770, 771, 772, 773, 774, 775, 776, 777, 778, 779, 780, 781, 782, 783,
  134. 784, 785, 786, 787, 788, 789, 790, 791, 792, 793, 794, 795, 796, 797, 798, 799,
  135. 800, 801, 802, 803, 804, 805, 806, 807, 808, 809, 810, 811, 812, 813, 814, 815,
  136. 816, 817, 818, 819, 820, 821, 822, 823, 824, 825, 826, 827, 828, 829, 830, 831,
  137. 832, 833, 834, 835, 836, 921, 838, 839, 840, 841, 842, 843, 844, 845, 846, 847,
  138. 848, 849, 850, 851, 852, 853, 854, 855, 856, 857, 858, 859, 860, 861, 862, 863,
  139. 864, 865, 866, 867, 868, 869, 870, 871, 872, 873, 874, 875, 876, 877, 878, 879,
  140. 880, 881, 882, 883, 884, 885, 886, 887, 888, 889, 890, 891, 892, 893, 894, 895,
  141. 896, 897, 898, 899, 900, 901, 913, 903, 917, 919, 921, 907, 927, 909, 933, 937,
  142. 921, 913, 914, 915, 916, 917, 918, 919, 920, 921, 922, 923, 924, 925, 926, 927,
  143. 928, 929, 930, 931, 932, 933, 934, 935, 936, 937, 921, 933, 913, 917, 919, 921,
  144. 933, 913, 914, 915, 916, 917, 918, 919, 920, 921, 922, 923, 924, 925, 926, 927,
  145. 928, 929, 931, 931, 932, 933, 934, 935, 936, 937, 921, 933, 927, 933, 937, 975,
  146. 914, 920, 978, 978, 978, 934, 928, 983, 984, 985, 986, 986, 988, 988, 990, 990,
  147. 992, 992, 994, 994, 996, 996, 998, 998, 1000, 1000, 1002, 1002, 1004, 1004, 1006, 1006,
  148. 922, 929, 931, 1011, 1012, 1013, 1014, 1015, 1016, 1017, 1018, 1019, 1020, 1021, 1022, 1023,
  149. 1045, 1045, 1026, 1043, 1028, 1029, 1030, 1030, 1032, 1033, 1034, 1035, 1050, 1048, 1059, 1039,
  150. 1040, 1041, 1042, 1043, 1044, 1045, 1046, 1047, 1048, 1049, 1050, 1051, 1052, 1053, 1054, 1055,
  151. 1056, 1057, 1058, 1059, 1060, 1061, 1062, 1063, 1064, 1065, 1066, 1067, 1068, 1069, 1070, 1071,
  152. 1040, 1041, 1042, 1043, 1044, 1045, 1046, 1047, 1048, 1049, 1050, 1051, 1052, 1053, 1054, 1055,
  153. 1056, 1057, 1058, 1059, 1060, 1061, 1062, 1063, 1064, 1065, 1066, 1067, 1068, 1069, 1070, 1071,
  154. 1045, 1045, 1026, 1043, 1028, 1029, 1030, 1030, 1032, 1033, 1034, 1035, 1050, 1048, 1059, 1039,
  155. 1120, 1120, 1122, 1122, 1124, 1124, 1126, 1126, 1128, 1128, 1130, 1130, 1132, 1132, 1134, 1134,
  156. 1136, 1136, 1138, 1138, 1140, 1140, 1140, 1140, 1144, 1144, 1146, 1146, 1148, 1148, 1150, 1150,
  157. 1152, 1152, 1154, 1155, 1156, 1157, 1158, 1159, 1160, 1161, 1162, 1163, 1164, 1164, 1166, 1166,
  158. 1168, 1168, 1170, 1170, 1172, 1172, 1174, 1174, 1176, 1176, 1178, 1178, 1180, 1180, 1182, 1182,
  159. 1184, 1184, 1186, 1186, 1188, 1188, 1190, 1190, 1192, 1192, 1194, 1194, 1196, 1196, 1198, 1198,
  160. 1200, 1200, 1202, 1202, 1204, 1204, 1206, 1206, 1208, 1208, 1210, 1210, 1212, 1212, 1214, 1214,
  161. 1216, 1046, 1046, 1219, 1219, 1221, 1222, 1223, 1223, 1225, 1226, 1227, 1227, 1229, 1230, 1231,
  162. 1040, 1040, 1040, 1040, 1236, 1236, 1045, 1045, 1240, 1240, 1240, 1240, 1046, 1046, 1047, 1047,
  163. 1248, 1248, 1048, 1048, 1048, 1048, 1054, 1054, 1256, 1256, 1256, 1256, 1069, 1069, 1059, 1059,
  164. 1059, 1059, 1059, 1059, 1063, 1063, 1270, 1271, 1067, 1067, 1274, 1275, 1276, 1277, 1278, 1279,
  165. 1280, 1281, 1282, 1283, 1284, 1285, 1286, 1287, 1288, 1289, 1290, 1291, 1292, 1293, 1294, 1295,
  166. 1296, 1297, 1298, 1299, 1300, 1301, 1302, 1303, 1304, 1305, 1306, 1307, 1308, 1309, 1310, 1311,
  167. 1312, 1313, 1314, 1315, 1316, 1317, 1318, 1319, 1320, 1321, 1322, 1323, 1324, 1325, 1326, 1327,
  168. 1328, 1329, 1330, 1331, 1332, 1333, 1334, 1335, 1336, 1337, 1338, 1339, 1340, 1341, 1342, 1343,
  169. 1344, 1345, 1346, 1347, 1348, 1349, 1350, 1351, 1352, 1353, 1354, 1355, 1356, 1357, 1358, 1359,
  170. 1360, 1361, 1362, 1363, 1364, 1365, 1366, 1367, 1368, 1369, 1370, 1371, 1372, 1373, 1374, 1375,
  171. 1376, 1329, 1330, 1331, 1332, 1333, 1334, 1335, 1336, 1337, 1338, 1339, 1340, 1341, 1342, 1343,
  172. 1344, 1345, 1346, 1347, 1348, 1349, 1350, 1351, 1352, 1353, 1354, 1355, 1356, 1357, 1358, 1359,
  173. 1360, 1361, 1362, 1363, 1364, 1365, 1366, 1415, 1416, 1417, 1418, 1419, 1420, 1421, 1422, 1423,
  174. 1424, 1425, 1426, 1427, 1428, 1429, 1430, 1431, 1432, 1433, 1434, 1435, 1436, 1437, 1438, 1439,
  175. 1440, 1441, 1442, 1443, 1444, 1445, 1446, 1447, 1448, 1449, 1450, 1451, 1452, 1453, 1454, 1455,
  176. 1456, 1457, 1458, 1459, 1460, 1461, 1462, 1463, 1464, 1465, 1466, 1467, 1468, 1469, 1470, 1471,
  177. 1472, 1473, 1474, 1475, 1476, 1477, 1478, 1479, 1480, 1481, 1482, 1483, 1484, 1485, 1486, 1487,
  178. 1488, 1489, 1490, 1491, 1492, 1493, 1494, 1495, 1496, 1497, 1498, 1499, 1500, 1501, 1502, 1503,
  179. 1504, 1505, 1506, 1507, 1508, 1509, 1510, 1511, 1512, 1513, 1514, 1515, 1516, 1517, 1518, 1519,
  180. 1520, 1521, 1522, 1523, 1524, 1525, 1526, 1527, 1528, 1529, 1530, 1531, 1532, 1533, 1534, 1535,
  181. // weights for codepoints 0x1e00 to 0x1fff
  182. 65, 65, 66, 66, 66, 66, 66, 66, 67, 67, 68, 68, 68, 68, 68, 68,
  183. 68, 68, 68, 68, 69, 69, 69, 69, 69, 69, 69, 69, 69, 69, 70, 70,
  184. 71, 71, 72, 72, 72, 72, 72, 72, 72, 72, 72, 72, 73, 73, 73, 73,
  185. 75, 75, 75, 75, 75, 75, 76, 76, 76, 76, 76, 76, 76, 76, 77, 77,
  186. 77, 77, 77, 77, 78, 78, 78, 78, 78, 78, 78, 78, 79, 79, 79, 79,
  187. 79, 79, 79, 79, 80, 80, 80, 80, 82, 82, 82, 82, 82, 82, 82, 82,
  188. 83, 83, 83, 83, 83, 83, 83, 83, 83, 83, 84, 84, 84, 84, 84, 84,
  189. 84, 84, 85, 85, 85, 85, 85, 85, 85, 85, 85, 85, 86, 86, 86, 86,
  190. 87, 87, 87, 87, 87, 87, 87, 87, 87, 87, 88, 88, 88, 88, 89, 89,
  191. 90, 90, 90, 90, 90, 90, 72, 84, 87, 89, 7834, 83, 7836, 7837, 7838, 7839,
  192. 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65,
  193. 65, 65, 65, 65, 65, 65, 65, 65, 69, 69, 69, 69, 69, 69, 69, 69,
  194. 69, 69, 69, 69, 69, 69, 69, 69, 73, 73, 73, 73, 79, 79, 79, 79,
  195. 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79,
  196. 79, 79, 79, 79, 85, 85, 85, 85, 85, 85, 85, 85, 85, 85, 85, 85,
  197. 85, 85, 89, 89, 89, 89, 89, 89, 89, 89, 7930, 7931, 7932, 7933, 7934, 7935,
  198. 913, 913, 913, 913, 913, 913, 913, 913, 913, 913, 913, 913, 913, 913, 913, 913,
  199. 917, 917, 917, 917, 917, 917, 7958, 7959, 917, 917, 917, 917, 917, 917, 7966, 7967,
  200. 919, 919, 919, 919, 919, 919, 919, 919, 919, 919, 919, 919, 919, 919, 919, 919,
  201. 921, 921, 921, 921, 921, 921, 921, 921, 921, 921, 921, 921, 921, 921, 921, 921,
  202. 927, 927, 927, 927, 927, 927, 8006, 8007, 927, 927, 927, 927, 927, 927, 8014, 8015,
  203. 933, 933, 933, 933, 933, 933, 933, 933, 8024, 933, 8026, 933, 8028, 933, 8030, 933,
  204. 937, 937, 937, 937, 937, 937, 937, 937, 937, 937, 937, 937, 937, 937, 937, 937,
  205. 913, 8123, 917, 8137, 919, 8139, 921, 8155, 927, 8185, 933, 8171, 937, 8187, 8062, 8063,
  206. 913, 913, 913, 913, 913, 913, 913, 913, 913, 913, 913, 913, 913, 913, 913, 913,
  207. 919, 919, 919, 919, 919, 919, 919, 919, 919, 919, 919, 919, 919, 919, 919, 919,
  208. 937, 937, 937, 937, 937, 937, 937, 937, 937, 937, 937, 937, 937, 937, 937, 937,
  209. 913, 913, 913, 913, 913, 8117, 913, 913, 913, 913, 913, 8123, 913, 8125, 921, 8127,
  210. 8128, 8129, 919, 919, 919, 8133, 919, 919, 917, 8137, 919, 8139, 919, 8141, 8142, 8143,
  211. 921, 921, 921, 8147, 8148, 8149, 921, 921, 921, 921, 921, 8155, 8156, 8157, 8158, 8159,
  212. 933, 933, 933, 8163, 929, 929, 933, 933, 933, 933, 933, 8171, 929, 8173, 8174, 8175,
  213. 8176, 8177, 937, 937, 937, 8181, 937, 937, 927, 8185, 937, 8187, 937, 8189, 8190, 8191
  214. // space for codepoints 0x21xx, 0x24xx, 0xffxx (generated)
  215. };
  216. template <class HASH>
  217. uint64_t HashStrLen ( const BYTE * pStr, int iLen )
  218. {
  219. if ( !pStr || !iLen )
  220. return SPH_FNV64_SEED;
  221. else
  222. return HASH::Hash ( pStr, iLen );
  223. }
  224. /// initialize collation LUTs
  225. void sphCollationInit()
  226. {
  227. const int dWeightPlane[0x0b] = { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x1e, 0x1f, 0x21, 0x24, 0xff };
  228. // generate missing weights
  229. for ( int i=0; i<0x100; i++ )
  230. {
  231. g_dCollWeights_UTF8CI[i+0x800] = (unsigned short)( 0x2100 + i - ( i>=0x70 && i<=0x7f )*16 ); // 2170..217f, -16
  232. g_dCollWeights_UTF8CI[i+0x900] = (unsigned short)( 0x2400 + i - ( i>=0xd0 && i<=0xe9 )*26 ); // 24d0..24e9, -26
  233. g_dCollWeights_UTF8CI[i+0xa00] = (unsigned short)( 0xff00 + i - ( i>=0x41 && i<=0x5a )*32 ); // ff41..ff5a, -32
  234. }
  235. // generate planes table
  236. for ( auto& dCollPlanes : g_dCollPlanes_UTF8CI )
  237. dCollPlanes = nullptr;
  238. for ( int i=0; i<0x0b; i++ )
  239. g_dCollPlanes_UTF8CI [ dWeightPlane[i] ] = g_dCollWeights_UTF8CI + 0x100*i;
  240. }
  241. /// collate a single codepoint
  242. static inline int CollateUTF8CI ( int iCode )
  243. {
  244. return ( ( iCode>>16 ) || !g_dCollPlanes_UTF8CI [ iCode>>8 ] )
  245. ? iCode
  246. : g_dCollPlanes_UTF8CI [ iCode>>8 ][ iCode&0xff ];
  247. }
  248. /// utf8_general_ci
  249. static int CollateUtf8GeneralCI ( ByteBlob_t dStr1, ByteBlob_t dStr2, bool bDataPtr)
  250. {
  251. UnpackStrings ( dStr1, dStr2, bDataPtr );
  252. const BYTE * pMax1 = dStr1.first + dStr1.second;
  253. const BYTE * pMax2 = dStr2.first + dStr2.second;
  254. while (dStr1.first<pMax1 && dStr2.first<pMax2 )
  255. {
  256. // FIXME! on broken data, decode might go beyond buffer bounds
  257. int iCode1 = sphUTF8Decode ( dStr1.first );
  258. int iCode2 = sphUTF8Decode ( dStr2.first );
  259. if ( !iCode1 && !iCode2 )
  260. return 0;
  261. if ( !iCode1 || !iCode2 )
  262. return !iCode1 ? -1 : 1;
  263. if ( iCode1==iCode2 )
  264. continue;
  265. iCode1 = CollateUTF8CI ( iCode1 );
  266. iCode2 = CollateUTF8CI ( iCode2 );
  267. if ( iCode1!=iCode2 )
  268. return iCode1-iCode2;
  269. }
  270. if ( dStr1.first>=pMax1 && dStr2.first>=pMax2 )
  271. return 0;
  272. return ( dStr1.first<pMax1 ) ? 1 : -1;
  273. }
  274. SphStringCmp_fn GetStringCmpFunc ( ESphCollation eCollation )
  275. {
  276. switch ( eCollation )
  277. {
  278. case SPH_COLLATION_LIBC_CS: return CollateLibcCS;
  279. case SPH_COLLATION_UTF8_GENERAL_CI: return CollateUtf8GeneralCI;
  280. case SPH_COLLATION_BINARY: return CollateBinary;
  281. default: return CollateLibcCI;
  282. }
  283. }
  284. /////////////////////////////
  285. // hashing functions
  286. /////////////////////////////
  287. uint64_t LibcCSHash_fn::Hash ( const BYTE * pStr, int iLen, uint64_t uPrev )
  288. {
  289. const int LOCALE_SAFE_GAP = 16;
  290. assert ( pStr && iLen );
  291. int iCompositeLen = iLen + 1 + (int)( 3.0f * (float)iLen ) + LOCALE_SAFE_GAP;
  292. CSphFixedVector<BYTE> dBuf { iCompositeLen };
  293. memcpy ( dBuf.Begin(), pStr, iLen );
  294. dBuf[iLen] = '\0';
  295. BYTE * pDst = dBuf.Begin()+iLen+1;
  296. int iDstAvailable = dBuf.GetLength() - iLen - LOCALE_SAFE_GAP;
  297. auto iDstLen = (int) strxfrm ( (char *)pDst, (const char *) dBuf.Begin(), iDstAvailable );
  298. assert ( iDstLen<iDstAvailable+LOCALE_SAFE_GAP );
  299. uint64_t uAcc = sphFNV64 ( pDst, iDstLen, uPrev );
  300. return uAcc;
  301. }
  302. uint64_t LibcCIHash_fn::Hash ( const BYTE * pStr, int iLen, uint64_t uPrev )
  303. {
  304. assert ( pStr && iLen );
  305. uint64_t uAcc = uPrev;
  306. while ( iLen-- )
  307. {
  308. int iChar = tolower ( *pStr++ );
  309. uAcc = sphFNV64 ( &iChar, 4, uAcc );
  310. }
  311. return uAcc;
  312. }
  313. uint64_t Utf8CIHash_fn::Hash ( const BYTE * pStr, int iLen, uint64_t uPrev )
  314. {
  315. assert ( pStr && iLen );
  316. uint64_t uAcc = uPrev;
  317. while ( iLen-- )
  318. {
  319. const BYTE * pCur = pStr++;
  320. int iCode = sphUTF8Decode ( pCur );
  321. iCode = CollateUTF8CI ( iCode );
  322. uAcc = sphFNV64 ( &iCode, 4, uAcc );
  323. }
  324. return uAcc;
  325. }
  326. uint64_t BinaryHash_fn::Hash ( const BYTE * pStr, int iLen, uint64_t uPrev )
  327. {
  328. assert ( pStr && iLen );
  329. return sphFNV64 ( pStr, iLen, uPrev );
  330. }
  331. /////////////////////////////////////////////////////////////////////
  332. StrHashCalc_fn GetStringHashCalcFunc ( ESphCollation eCollation )
  333. {
  334. switch ( eCollation )
  335. {
  336. case SPH_COLLATION_LIBC_CS: return LibcCSHash_fn::Hash;
  337. case SPH_COLLATION_UTF8_GENERAL_CI: return Utf8CIHash_fn::Hash;
  338. case SPH_COLLATION_BINARY: return BinaryHash_fn::Hash;
  339. default: return LibcCIHash_fn::Hash;
  340. }
  341. }
  342. volatile ESphCollation& GlobalCollation()
  343. {
  344. static ESphCollation eCollation = SPH_COLLATION_DEFAULT;
  345. return eCollation;
  346. }
  347. ESphCollation sphCollationFromName ( const CSphString & sName, CSphString * pError )
  348. {
  349. assert ( pError );
  350. // FIXME! replace with a hash lookup?
  351. if ( sName=="libc_ci" )
  352. return SPH_COLLATION_LIBC_CI;
  353. else if ( sName=="libc_cs" )
  354. return SPH_COLLATION_LIBC_CS;
  355. else if ( sName=="utf8_general_ci" )
  356. return SPH_COLLATION_UTF8_GENERAL_CI;
  357. else if ( sName=="binary" )
  358. return SPH_COLLATION_BINARY;
  359. pError->SetSprintf ( "Unknown collation: '%s'", sName.cstr() );
  360. return SPH_COLLATION_DEFAULT;
  361. }
  362. static CSphString g_sLocale;
  363. static std::locale g_tLocale;
  364. static bool g_bGlobalLocaleSet = false;
  365. void SetLocale ( const CSphString & sLocale, bool bSet )
  366. {
  367. g_sLocale = sLocale;
  368. g_tLocale = std::locale();
  369. if ( g_sLocale.IsEmpty() )
  370. return;
  371. g_bGlobalLocaleSet = bSet;
  372. g_tLocale = std::locale ( sLocale.cstr() );
  373. }
  374. const std::locale & GlobalLocale()
  375. {
  376. return g_tLocale;
  377. }
  378. bool IsGlobalLocaleSet()
  379. {
  380. return g_bGlobalLocaleSet;
  381. }