unicode.cc 20 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622
  1. //-----------------------------------------------------------------------------
  2. // Copyright (c) 2013 GarageGames, LLC
  3. //
  4. // Permission is hereby granted, free of charge, to any person obtaining a copy
  5. // of this software and associated documentation files (the "Software"), to
  6. // deal in the Software without restriction, including without limitation the
  7. // rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
  8. // sell copies of the Software, and to permit persons to whom the Software is
  9. // furnished to do so, subject to the following conditions:
  10. //
  11. // The above copyright notice and this permission notice shall be included in
  12. // all copies or substantial portions of the Software.
  13. //
  14. // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  15. // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  16. // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  17. // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  18. // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  19. // FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  20. // IN THE SOFTWARE.
  21. //-----------------------------------------------------------------------------
  22. #include "unicode.h"
  23. #include "memory/frameAllocator.h"
  24. #include "math/mMath.h"
  25. #include "debug/profiler.h"
  26. #include <stdio.h>
  27. //-----------------------------------------------------------------------------
  28. /// replacement character. Standard correct value is 0xFFFD.
  29. #define kReplacementChar 0xFFFD
  30. /// Look up table. Shift a byte >> 1, then look up how many bytes to expect after it.
  31. /// Contains -1's for illegal values.
  32. U8 firstByteLUT[128] =
  33. {
  34. 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x0F // single byte ascii
  35. 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x1F // single byte ascii
  36. 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x2F // single byte ascii
  37. 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x3F // single byte ascii
  38. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x4F // trailing utf8
  39. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x5F // trailing utf8
  40. 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // 0x6F // first of 2
  41. 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 6, 0, // 0x7F // first of 3,4,5,illegal in utf-8
  42. };
  43. /// Look up table. Shift a 16-bit word >> 10, then look up whether it is a surrogate,
  44. /// and which part. 0 means non-surrogate, 1 means 1st in pair, 2 means 2nd in pair.
  45. U8 surrogateLUT[64] =
  46. {
  47. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x0F
  48. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x1F
  49. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x2F
  50. 0, 0, 0, 0, 0, 0, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, // 0x3F
  51. };
  52. /// Look up table. Feed value from firstByteLUT in, gives you
  53. /// the mask for the data bits of that UTF-8 code unit.
  54. U8 byteMask8LUT[] = { 0x3f, 0x7f, 0x1f, 0x0f, 0x07, 0x03, 0x01 }; // last 0=6, 1=7, 2=5, 4, 3, 2, 1 bits
  55. /// Mask for the data bits of a UTF-16 surrogate.
  56. U16 byteMaskLow10 = 0x03ff;
  57. //-----------------------------------------------------------------------------
  58. inline bool isSurrogateRange(U32 codepoint)
  59. {
  60. return ( 0xd800 < codepoint && codepoint < 0xdfff );
  61. }
  62. inline bool isAboveBMP(U32 codepoint)
  63. {
  64. return ( codepoint > 0xFFFF );
  65. }
  66. //-----------------------------------------------------------------------------
  67. const U32 convertUTF8toUTF16(const UTF8 *unistring, UTF16 *outbuffer, U32 len)
  68. {
  69. AssertFatal(len >= 1, "Buffer for unicode conversion must be large enough to hold at least the null terminator.");
  70. PROFILE_START(convertUTF8toUTF16);
  71. U32 walked, nCodepoints;
  72. UTF32 middleman;
  73. nCodepoints=0;
  74. while(*unistring != '\0' && nCodepoints < len)
  75. {
  76. walked = 1;
  77. middleman = oneUTF8toUTF32(unistring,&walked);
  78. outbuffer[nCodepoints] = oneUTF32toUTF16(middleman);
  79. unistring+=walked;
  80. nCodepoints++;
  81. }
  82. nCodepoints = getMin(nCodepoints,len - 1);
  83. outbuffer[nCodepoints] = '\0';
  84. PROFILE_END();
  85. return nCodepoints;
  86. }
  87. //-----------------------------------------------------------------------------
  88. const U32 convertUTF8toUTF32(const UTF8 *unistring, UTF32 *outbuffer, U32 len)
  89. {
  90. AssertFatal(len >= 1, "Buffer for unicode conversion must be large enough to hold at least the null terminator.");
  91. PROFILE_START(convertUTF8toUTF32);
  92. U32 walked, nCodepoints;
  93. nCodepoints=0;
  94. while(*unistring != 0 && nCodepoints < len)
  95. {
  96. walked = 1;
  97. outbuffer[nCodepoints] = oneUTF8toUTF32(unistring,&walked);
  98. unistring+=walked;
  99. nCodepoints++;
  100. }
  101. nCodepoints = getMin(nCodepoints,len - 1);
  102. outbuffer[nCodepoints] = '\0';
  103. PROFILE_END();
  104. return nCodepoints;
  105. }
  106. //-----------------------------------------------------------------------------
  107. const U32 convertUTF16toUTF8( const UTF16 *unistring, UTF8 *outbuffer, U32 len)
  108. {
  109. AssertFatal(len >= 1, "Buffer for unicode conversion must be large enough to hold at least the null terminator.");
  110. PROFILE_START(convertUTF16toUTF8);
  111. U32 walked, nCodeunits, codeunitLen;
  112. UTF32 middleman;
  113. nCodeunits=0;
  114. while( *unistring != '\0' && nCodeunits + 3 < len )
  115. {
  116. walked = 1;
  117. middleman = oneUTF16toUTF32(unistring,&walked);
  118. codeunitLen = oneUTF32toUTF8(middleman, &outbuffer[nCodeunits]);
  119. unistring += walked;
  120. nCodeunits += codeunitLen;
  121. }
  122. nCodeunits = getMin(nCodeunits,len - 1);
  123. outbuffer[nCodeunits] = '\0';
  124. PROFILE_END();
  125. return nCodeunits;
  126. }
  127. //-----------------------------------------------------------------------------
  128. const U32 convertUTF16toUTF32(const UTF16 *unistring, UTF32 *outbuffer, U32 len)
  129. {
  130. AssertFatal(len >= 1, "Buffer for unicode conversion must be large enough to hold at least the null terminator.");
  131. PROFILE_START(convertUTF16toUTF32);
  132. U32 walked, nCodepoints;
  133. nCodepoints=0;
  134. while( *unistring != '\0' && nCodepoints < len )
  135. {
  136. walked=1;
  137. outbuffer[nCodepoints] = oneUTF16toUTF32(unistring,&walked);
  138. unistring += walked;
  139. nCodepoints++;
  140. }
  141. nCodepoints = getMin(nCodepoints,len);
  142. outbuffer[nCodepoints] = '\0';
  143. PROFILE_END();
  144. return nCodepoints;
  145. }
  146. //-----------------------------------------------------------------------------
  147. const U32 convertUTF32toUTF8( const UTF32 *unistring, UTF8 *outbuffer, U32 len)
  148. {
  149. AssertFatal(len >= 1, "Buffer for unicode conversion must be large enough to hold at least the null terminator.");
  150. PROFILE_START(convertUTF32toUTF8);
  151. U32 nCodeunits, codeunitLen;
  152. nCodeunits=0;
  153. while( *unistring != '\0' && nCodeunits + 3 < len )
  154. {
  155. codeunitLen = oneUTF32toUTF8(*unistring, &outbuffer[nCodeunits]);
  156. unistring++;
  157. nCodeunits += codeunitLen;
  158. }
  159. nCodeunits = getMin(nCodeunits,len);
  160. outbuffer[nCodeunits] = '\0';
  161. PROFILE_END();
  162. return nCodeunits;
  163. }
  164. //-----------------------------------------------------------------------------
  165. const U32 convertUTF32toUTF16(const UTF32 *unistring, UTF16 *outbuffer, U32 len)
  166. {
  167. AssertFatal(len >= 1, "Buffer for unicode conversion must be large enough to hold at least the null terminator.");
  168. PROFILE_START(convertUTF32toUTF16);
  169. U32 nCodepoints;
  170. nCodepoints=0;
  171. while(*unistring != '\0' && nCodepoints < len)
  172. {
  173. outbuffer[nCodepoints] = oneUTF32toUTF16(*unistring);
  174. unistring++;
  175. nCodepoints++;
  176. }
  177. nCodepoints = getMin(nCodepoints,len);
  178. outbuffer[nCodepoints] = '\0';
  179. PROFILE_END();
  180. return nCodepoints;
  181. }
  182. //-----------------------------------------------------------------------------
// Functions that convert a whole string into a newly allocated buffer
  184. //-----------------------------------------------------------------------------
  185. UTF16* convertUTF8toUTF16( const UTF8* unistring)
  186. {
  187. PROFILE_START(convertUTF8toUTF16_create);
  188. // allocate plenty of memory.
  189. U32 nCodepoints, len = dStrlen(unistring) + 1;
  190. FrameTemp<UTF16> buf(len);
  191. // perform conversion
  192. nCodepoints = convertUTF8toUTF16( unistring, buf, len);
  193. // add 1 for the NULL terminator the converter promises it included.
  194. nCodepoints++;
  195. // allocate the return buffer, copy over, and return it.
  196. UTF16 *ret = new UTF16[nCodepoints];
  197. dMemcpy(ret, buf, nCodepoints * sizeof(UTF16));
  198. PROFILE_END();
  199. return ret;
  200. }
  201. //-----------------------------------------------------------------------------
  202. UTF32* convertUTF8toUTF32( const UTF8* unistring)
  203. {
  204. PROFILE_START(convertUTF8toUTF32_create);
  205. // allocate plenty of memory.
  206. U32 nCodepoints, len = dStrlen(unistring) + 1;
  207. FrameTemp<UTF32> buf(len);
  208. // perform conversion
  209. nCodepoints = convertUTF8toUTF32( unistring, buf, len);
  210. // add 1 for the NULL terminator the converter promises it included.
  211. nCodepoints++;
  212. // allocate the return buffer, copy over, and return it.
  213. UTF32 *ret = new UTF32[nCodepoints];
  214. dMemcpy(ret, buf, nCodepoints * sizeof(UTF32));
  215. PROFILE_END();
  216. return ret;
  217. }
  218. //-----------------------------------------------------------------------------
  219. UTF8* convertUTF16toUTF8( const UTF16* unistring)
  220. {
  221. PROFILE_START(convertUTF16toUTF8_create);
  222. // allocate plenty of memory.
  223. U32 nCodeunits, len = dStrlen(unistring) * 3 + 1;
  224. FrameTemp<UTF8> buf(len);
  225. // perform conversion
  226. nCodeunits = convertUTF16toUTF8( unistring, buf, len);
  227. // add 1 for the NULL terminator the converter promises it included.
  228. nCodeunits++;
  229. // allocate the return buffer, copy over, and return it.
  230. UTF8 *ret = new UTF8[nCodeunits];
  231. dMemcpy(ret, buf, nCodeunits * sizeof(UTF8));
  232. PROFILE_END();
  233. return ret;
  234. }
  235. //-----------------------------------------------------------------------------
  236. UTF32* convertUTF16toUTF32(const UTF16* unistring)
  237. {
  238. PROFILE_START(convertUTF16toUTF32_create);
  239. // allocate plenty of memory.
  240. U32 nCodepoints, len = dStrlen(unistring) + 1;
  241. FrameTemp<UTF32> buf(len);
  242. // perform conversion
  243. nCodepoints = convertUTF16toUTF32( unistring, buf, len);
  244. // add 1 for the NULL terminator the converter promises it included.
  245. nCodepoints++;
  246. // allocate the return buffer, copy over, and return it.
  247. UTF32 *ret = new UTF32[nCodepoints];
  248. dMemcpy(ret, buf, nCodepoints * sizeof(UTF32));
  249. PROFILE_END();
  250. return ret;
  251. }
  252. //-----------------------------------------------------------------------------
  253. UTF8* convertUTF32toUTF8( const UTF32* unistring)
  254. {
  255. PROFILE_START(convertUTF32toUTF8_create);
  256. // allocate plenty of memory.
  257. U32 nCodeunits, len = dStrlen(unistring) * 3 + 1;
  258. FrameTemp<UTF8> buf(len);
  259. // perform conversion
  260. nCodeunits = convertUTF32toUTF8( unistring, buf, len);
  261. // add 1 for the NULL terminator the converter promises it included.
  262. nCodeunits++;
  263. // allocate the return buffer, copy over, and return it.
  264. UTF8 *ret = new UTF8[nCodeunits];
  265. dMemcpy(ret, buf, nCodeunits * sizeof(UTF8));
  266. PROFILE_END();
  267. return ret;
  268. }
  269. //-----------------------------------------------------------------------------
  270. UTF16* convertUTF32toUTF16(const UTF32* unistring)
  271. {
  272. PROFILE_START(convertUTF32toUTF16_create);
  273. // allocate plenty of memory.
  274. U32 nCodepoints, len = dStrlen(unistring) + 1;
  275. FrameTemp<UTF16> buf(len);
  276. // perform conversion
  277. nCodepoints = convertUTF32toUTF16( unistring, buf, len);
  278. // add 1 for the NULL terminator the converter promises it included.
  279. nCodepoints++;
  280. // allocate the return buffer, copy over, and return it.
  281. UTF16 *ret = new UTF16[nCodepoints];
  282. dMemcpy(ret, buf, nCodepoints * sizeof(UTF16));
  283. PROFILE_END();
  284. return ret;
  285. }
  286. //-----------------------------------------------------------------------------
// Functions that convert one Unicode codepoint at a time
  288. //-----------------------------------------------------------------------------
  289. const UTF32 oneUTF8toUTF32( const UTF8* codepoint, U32 *unitsWalked)
  290. {
  291. PROFILE_START(oneUTF8toUTF32);
  292. // codepoints 6 codeunits long are read, but do not convert correctly,
  293. // and are filtered out anyway.
  294. // early out for ascii
  295. if(!(*codepoint & 0x0080))
  296. {
  297. *unitsWalked = 1;
  298. PROFILE_END();
  299. return (UTF32)*codepoint;
  300. }
  301. U32 expectedByteCount;
  302. UTF32 ret = 0;
  303. U8 codeunit;
  304. // check the first byte ( a.k.a. codeunit ) .
  305. unsigned char c = codepoint[0];
  306. c = c >> 1;
  307. expectedByteCount = firstByteLUT[c];
  308. if(expectedByteCount > 0) // 0 or negative is illegal to start with
  309. {
  310. // process 1st codeunit
  311. ret |= byteMask8LUT[expectedByteCount] & codepoint[0]; // bug?
  312. // process trailing codeunits
  313. for(U32 i=1;i<expectedByteCount; i++)
  314. {
  315. codeunit = codepoint[i];
  316. if( firstByteLUT[codeunit>>1] == 0 )
  317. {
  318. ret <<= 6; // shift up 6
  319. ret |= (codeunit & 0x3f); // mask in the low 6 bits of this codeunit byte.
  320. }
  321. else
  322. {
  323. // found a bad codepoint - did not get a medial where we wanted one.
  324. // Dump the replacement, and claim to have parsed only 1 char,
  325. // so that we'll dump a slew of replacements, instead of eating the next char.
  326. ret = kReplacementChar;
  327. expectedByteCount = 1;
  328. break;
  329. }
  330. }
  331. }
  332. else
  333. {
  334. // found a bad codepoint - got a medial or an illegal codeunit.
  335. // Dump the replacement, and claim to have parsed only 1 char,
  336. // so that we'll dump a slew of replacements, instead of eating the next char.
  337. ret = kReplacementChar;
  338. expectedByteCount = 1;
  339. }
  340. if(unitsWalked != NULL)
  341. *unitsWalked = expectedByteCount;
  342. // codepoints in the surrogate range are illegal, and should be replaced.
  343. if(isSurrogateRange(ret))
  344. ret = kReplacementChar;
  345. // codepoints outside the Basic Multilingual Plane add complexity to our UTF16 string classes,
  346. // we've read them correctly so they wont foul the byte stream,
  347. // but we kill them here to make sure they wont foul anything else
  348. if(isAboveBMP(ret))
  349. ret = kReplacementChar;
  350. PROFILE_END();
  351. return ret;
  352. }
  353. //-----------------------------------------------------------------------------
  354. const UTF32 oneUTF16toUTF32(const UTF16* codepoint, U32 *unitsWalked)
  355. {
  356. PROFILE_START(oneUTF16toUTF32);
  357. U8 expectedType;
  358. U32 unitCount;
  359. UTF32 ret = 0;
  360. UTF16 codeunit1,codeunit2;
  361. codeunit1 = codepoint[0];
  362. expectedType = surrogateLUT[codeunit1 >> 10];
  363. switch(expectedType)
  364. {
  365. case 0: // simple
  366. ret = codeunit1;
  367. unitCount = 1;
  368. break;
  369. case 1: // 2 surrogates
  370. codeunit2 = codepoint[1];
  371. if( surrogateLUT[codeunit2 >> 10] == 2)
  372. {
  373. ret = ((codeunit1 & byteMaskLow10 ) << 10) | (codeunit2 & byteMaskLow10);
  374. unitCount = 2;
  375. break;
  376. }
  377. // else, did not find a trailing surrogate where we expected one,
  378. // so fall through to the error
  379. case 2: // error
  380. // found a trailing surrogate where we expected a codepoint or leading surrogate.
  381. // Dump the replacement.
  382. ret = kReplacementChar;
  383. unitCount = 1;
  384. break;
  385. }
  386. if(unitsWalked != NULL)
  387. *unitsWalked = unitCount;
  388. // codepoints in the surrogate range are illegal, and should be replaced.
  389. if(isSurrogateRange(ret))
  390. ret = kReplacementChar;
  391. // codepoints outside the Basic Multilingual Plane add complexity to our UTF16 string classes,
  392. // we've read them correctly so they wont foul the byte stream,
  393. // but we kill them here to make sure they wont foul anything else
  394. // NOTE: these are perfectly legal codepoints, we just dont want to deal with them.
  395. if(isAboveBMP(ret))
  396. ret = kReplacementChar;
  397. PROFILE_END();
  398. return ret;
  399. }
  400. //-----------------------------------------------------------------------------
  401. const UTF16 oneUTF32toUTF16(const UTF32 codepoint)
  402. {
  403. // found a codepoint outside the codeable UTF-16 range!
  404. // or, found an illegal codepoint!
  405. if(codepoint >= 0x10FFFF || isSurrogateRange(codepoint))
  406. return kReplacementChar;
  407. // these are legal, we just dont want to deal with them.
  408. if(isAboveBMP(codepoint))
  409. return kReplacementChar;
  410. return (UTF16)codepoint;
  411. }
  412. //-----------------------------------------------------------------------------
  413. const U32 oneUTF32toUTF8(const UTF32 codepoint, UTF8 *threeByteCodeunitBuf)
  414. {
  415. PROFILE_START(oneUTF32toUTF8);
  416. U32 bytecount = 0;
  417. UTF8 *buf;
  418. U32 working = codepoint;
  419. buf = threeByteCodeunitBuf;
  420. //-----------------
  421. if(isSurrogateRange(working)) // found an illegal codepoint!
  422. working = kReplacementChar;
  423. //return oneUTF32toUTF8(kReplacementChar, threeByteCodeunitBuf);
  424. if(isAboveBMP(working)) // these are legal, we just dont want to deal with them.
  425. working = kReplacementChar;
  426. //return oneUTF32toUTF8(kReplacementChar, threeByteCodeunitBuf);
  427. //-----------------
  428. if( working < (1 << 7)) // codeable in 7 bits
  429. bytecount = 1;
  430. else if( working < (1 << 11)) // codeable in 11 bits
  431. bytecount = 2;
  432. else if( working < (1 << 16)) // codeable in 16 bits
  433. bytecount = 3;
  434. AssertISV( bytecount > 0, "Error converting to UTF-8 in oneUTF32toUTF8(). isAboveBMP() should have caught this!");
  435. //-----------------
  436. U8 mask = byteMask8LUT[0]; // 0011 1111
  437. U8 marker = ( ~mask << 1); // 1000 0000
  438. // Process the low order bytes, shifting the codepoint down 6 each pass.
  439. for( int i = bytecount-1; i > 0; i--)
  440. {
  441. threeByteCodeunitBuf[i] = marker | (working & mask);
  442. working >>= 6;
  443. }
  444. // Process the 1st byte. filter based on the # of expected bytes.
  445. mask = byteMask8LUT[bytecount];
  446. marker = ( ~mask << 1 );
  447. threeByteCodeunitBuf[0] = marker | (working & mask);
  448. PROFILE_END();
  449. return bytecount;
  450. }
  451. //-----------------------------------------------------------------------------
  452. const U32 dStrlen(const UTF16 *unistring)
  453. {
  454. U32 i = 0;
  455. while(unistring[i] != '\0')
  456. i++;
  457. return i;
  458. }
  459. //-----------------------------------------------------------------------------
  460. const U32 dStrlen(const UTF32 *unistring)
  461. {
  462. U32 i = 0;
  463. while(unistring[i] != '\0')
  464. i++;
  465. return i;
  466. }
  467. //-----------------------------------------------------------------------------
  468. const U32 dStrncmp(const UTF16* unistring1, const UTF16* unistring2, U32 len)
  469. {
  470. UTF16 c1, c2;
  471. for(U32 i = 0; i<len; i++)
  472. {
  473. c1 = *unistring1++;
  474. c2 = *unistring2++;
  475. if(c1 < c2) return -1;
  476. if(c1 > c2) return 1;
  477. if(!c1) return 0;
  478. }
  479. return 0;
  480. }
  481. //-----------------------------------------------------------------------------
  482. const U32 dStrncmp(const UTF32* unistring1, const UTF32* unistring2, U32 len)
  483. {
  484. UTF32 c1, c2;
  485. for(U32 i = 0; i<len; i++)
  486. {
  487. c1 = *unistring1++;
  488. c2 = *unistring2++;
  489. if(c1 < c2) return -1;
  490. if(c1 > c2) return 1;
  491. if(!c1) return 0;
  492. }
  493. return 0;
  494. }
  495. //-----------------------------------------------------------------------------
  496. const UTF8* getNthCodepoint(const UTF8 *unistring, const U32 n)
  497. {
  498. const UTF8* ret = unistring;
  499. U32 charsseen = 0;
  500. while( *ret && charsseen < n)
  501. {
  502. ret++;
  503. if((*ret & 0xC0) != 0x80)
  504. charsseen++;
  505. }
  506. return ret;
  507. }
  508. /* alternate utf-8 decode impl for speed, no error checking,
  509. left here for your amusement:
  510. U32 codeunit = codepoint + expectedByteCount - 1;
  511. U32 i = 0;
  512. switch(expectedByteCount)
  513. {
  514. case 6: ret |= ( *(codeunit--) & 0x3f ); i++;
  515. case 5: ret |= ( *(codeunit--) & 0x3f ) << (6 * i++);
  516. case 4: ret |= ( *(codeunit--) & 0x3f ) << (6 * i++);
  517. case 3: ret |= ( *(codeunit--) & 0x3f ) << (6 * i++);
  518. case 2: ret |= ( *(codeunit--) & 0x3f ) << (6 * i++);
  519. case 1: ret |= *(codeunit) & byteMask8LUT[expectedByteCount] << (6 * i);
  520. }
  521. */