unicode.cpp 20 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663
  1. //-----------------------------------------------------------------------------
  2. // Copyright (c) 2012 GarageGames, LLC
  3. //
  4. // Permission is hereby granted, free of charge, to any person obtaining a copy
  5. // of this software and associated documentation files (the "Software"), to
  6. // deal in the Software without restriction, including without limitation the
  7. // rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
  8. // sell copies of the Software, and to permit persons to whom the Software is
  9. // furnished to do so, subject to the following conditions:
  10. //
  11. // The above copyright notice and this permission notice shall be included in
  12. // all copies or substantial portions of the Software.
  13. //
  14. // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  15. // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  16. // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  17. // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  18. // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  19. // FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  20. // IN THE SOFTWARE.
  21. //-----------------------------------------------------------------------------
  22. #include <stdio.h>
  23. #include "core/frameAllocator.h"
  24. #include "core/strings/unicode.h"
  25. #include "core/strings/stringFunctions.h"
  26. #include "platform/profiler.h"
  27. #include "console/console.h"
  28. #define TORQUE_ENABLE_UTF16_CACHE
  29. #ifdef TORQUE_ENABLE_UTF16_CACHE
  30. #include "core/util/tDictionary.h"
  31. #include "core/util/hashFunction.h"
  32. #endif
  33. //-----------------------------------------------------------------------------
  /// Code point handed back in place of anything these routines cannot decode
  /// or refuse to represent: U+FFFD, the Unicode REPLACEMENT CHARACTER.
  #define kReplacementChar (0xFFFD)
  /// UTF-8 lead byte classification table.
  ///
  /// Index it with (byte >> 1). The entry is the total number of code units in
  /// the sequence that byte starts (1 through 6), or 0 when the byte cannot
  /// start a sequence: the continuation bytes 0x80-0xBF and the bytes 0xFE/0xFF.
  static const U8 sgFirstByteLUT[128] =
  {
     // bytes 0x00-0x7F : plain ASCII, one code unit
     1, 1, 1, 1, 1, 1, 1, 1,   1, 1, 1, 1, 1, 1, 1, 1,
     1, 1, 1, 1, 1, 1, 1, 1,   1, 1, 1, 1, 1, 1, 1, 1,
     1, 1, 1, 1, 1, 1, 1, 1,   1, 1, 1, 1, 1, 1, 1, 1,
     1, 1, 1, 1, 1, 1, 1, 1,   1, 1, 1, 1, 1, 1, 1, 1,

     // bytes 0x80-0xBF : continuation bytes, never a lead byte
     0, 0, 0, 0, 0, 0, 0, 0,   0, 0, 0, 0, 0, 0, 0, 0,
     0, 0, 0, 0, 0, 0, 0, 0,   0, 0, 0, 0, 0, 0, 0, 0,

     // bytes 0xC0-0xDF : lead byte of a two unit sequence
     2, 2, 2, 2, 2, 2, 2, 2,   2, 2, 2, 2, 2, 2, 2, 2,

     // bytes 0xE0-0xEF : three units | 0xF0-0xF7 : four | 0xF8-0xFB : five
     // bytes 0xFC-0xFD : six units   | 0xFE-0xFF : illegal in UTF-8
     3, 3, 3, 3, 3, 3, 3, 3,   4, 4, 4, 4, 5, 5, 6, 0,
  };
  /// UTF-16 surrogate classification table.
  ///
  /// Index it with (codeunit >> 10). The entry is 0 for an ordinary code unit,
  /// 1 for the leading half of a surrogate pair (0xD800-0xDBFF) and 2 for the
  /// trailing half (0xDC00-0xDFFF).
  static const U8 sgSurrogateLUT[64] =
  {
     0, 0, 0, 0, 0, 0, 0, 0,   0, 0, 0, 0, 0, 0, 0, 0,   // 0x0000 - 0x3FFF
     0, 0, 0, 0, 0, 0, 0, 0,   0, 0, 0, 0, 0, 0, 0, 0,   // 0x4000 - 0x7FFF
     0, 0, 0, 0, 0, 0, 0, 0,   0, 0, 0, 0, 0, 0, 0, 0,   // 0x8000 - 0xBFFF
     0, 0, 0, 0, 0, 0,                                   // 0xC000 - 0xD7FF
     1,                                                  // 0xD800 - 0xDBFF
     2,                                                  // 0xDC00 - 0xDFFF
     0, 0, 0, 0, 0, 0, 0, 0,                             // 0xE000 - 0xFFFF
  };
  /// Payload masks for UTF-8 code units.
  ///
  /// Entry 0 is the mask for a continuation byte. Entries 1 through 6 are the
  /// masks for the lead byte of a sequence of that many code units, i.e. the
  /// value read from sgFirstByteLUT.
  static const U8 sgByteMask8LUT[] =
  {
     0x3f,   // [0] continuation byte  : 6 data bits
     0x7f,   // [1] single unit (ASCII): 7 data bits
     0x1f,   // [2] lead of two units  : 5 data bits
     0x0f,   // [3] lead of three units: 4 data bits
     0x07,   // [4] lead of four units : 3 data bits
     0x03,   // [5] lead of five units : 2 data bits
     0x01,   // [6] lead of six units  : 1 data bit
  };
  /// Selects the ten payload bits carried by each half of a UTF-16 surrogate pair.
  static const U16 sgByteMaskLow10 = (1 << 10) - 1;
  63. //-----------------------------------------------------------------------------
  64. #ifdef TORQUE_ENABLE_UTF16_CACHE
  65. /// Cache data for UTF16 strings. This is wrapped in a class so that data is
  66. /// automatically freed when the hash table is deleted.
  67. struct UTF16Cache
  68. {
  69. UTF16 *mString;
  70. U32 mLength;
  71. UTF16Cache()
  72. {
  73. mString = NULL;
  74. mLength = 0;
  75. }
  76. UTF16Cache(UTF16 *str, U32 len)
  77. {
  78. mLength = len;
  79. mString = new UTF16[mLength];
  80. dMemcpy(mString, str, mLength * sizeof(UTF16));
  81. }
  82. UTF16Cache(const UTF16Cache &other)
  83. {
  84. mLength = other.mLength;
  85. mString = new UTF16[mLength];
  86. dMemcpy(mString, other.mString, mLength * sizeof(UTF16));
  87. }
  88. void operator =(const UTF16Cache &other)
  89. {
  90. delete [] mString;
  91. mLength = other.mLength;
  92. mString = new UTF16[mLength];
  93. dMemcpy(mString, other.mString, mLength * sizeof(UTF16));
  94. }
  95. ~UTF16Cache()
  96. {
  97. delete [] mString;
  98. }
  99. void copyToBuffer(UTF16 *outBuffer, U32 lenToCopy, bool nullTerminate = true) const
  100. {
  101. U32 copy = getMin(mLength, lenToCopy);
  102. if(mString && copy > 0)
  103. dMemcpy(outBuffer, mString, copy * sizeof(UTF16));
  104. if(nullTerminate)
  105. outBuffer[copy] = 0;
  106. }
  107. };
  108. /// Cache for UTF16 strings
  109. typedef HashTable<U32, UTF16Cache> UTF16CacheTable;
  110. static UTF16CacheTable sgUTF16Cache;
  111. #endif // TORQUE_ENABLE_UTF16_CACHE
  112. //-----------------------------------------------------------------------------
  /// Returns true when codepoint lies in the UTF-16 surrogate block,
  /// U+D800 through U+DFFF with both end points included. Those values are
  /// reserved for surrogate pairs and are never valid code points on their own.
  inline bool isSurrogateRange(U32 codepoint)
  {
     return ( 0xD800 <= codepoint && codepoint <= 0xDFFF );
  }
  /// Returns true when codepoint lies beyond the Basic Multilingual Plane,
  /// i.e. it does not fit in a single 16-bit code unit.
  inline bool isAboveBMP(U32 codepoint)
  {
     return codepoint >= 0x10000;
  }
  121. //-----------------------------------------------------------------------------
  122. U32 convertUTF8toUTF16(const UTF8 *unistring, UTF16 *outbuffer, U32 len)
  123. {
  124. AssertFatal(len >= 1, "Buffer for unicode conversion must be large enough to hold at least the null terminator.");
  125. PROFILE_SCOPE(convertUTF8toUTF16);
  126. #ifdef TORQUE_ENABLE_UTF16_CACHE
  127. // If we have cached this conversion already, don't do it again
  128. U32 hashKey = Torque::hash((const U8 *)unistring, dStrlen(unistring), 0);
  129. UTF16CacheTable::Iterator cacheItr = sgUTF16Cache.find(hashKey);
  130. if(cacheItr != sgUTF16Cache.end())
  131. {
  132. const UTF16Cache &cache = (*cacheItr).value;
  133. cache.copyToBuffer(outbuffer, len);
  134. outbuffer[len-1] = '\0';
  135. return getMin(cache.mLength,len - 1);
  136. }
  137. #endif
  138. U32 walked, nCodepoints;
  139. UTF32 middleman;
  140. nCodepoints=0;
  141. while(*unistring != '\0' && nCodepoints < len)
  142. {
  143. walked = 1;
  144. middleman = oneUTF8toUTF32(unistring,&walked);
  145. outbuffer[nCodepoints] = oneUTF32toUTF16(middleman);
  146. unistring+=walked;
  147. nCodepoints++;
  148. }
  149. nCodepoints = getMin(nCodepoints,len - 1);
  150. outbuffer[nCodepoints] = '\0';
  151. #ifdef TORQUE_ENABLE_UTF16_CACHE
  152. // Cache the results.
  153. // FIXME As written, this will result in some unnecessary memory copying due to copy constructor calls.
  154. UTF16Cache cache(outbuffer, nCodepoints);
  155. sgUTF16Cache.insertUnique(hashKey, cache);
  156. #endif
  157. return nCodepoints;
  158. }
  159. //-----------------------------------------------------------------------------
  160. U32 convertUTF16toUTF8( const UTF16 *unistring, UTF8 *outbuffer, U32 len)
  161. {
  162. AssertFatal(len >= 1, "Buffer for unicode conversion must be large enough to hold at least the null terminator.");
  163. PROFILE_START(convertUTF16toUTF8);
  164. U32 walked, nCodeunits, codeunitLen;
  165. UTF32 middleman;
  166. nCodeunits=0;
  167. while( *unistring != '\0' && nCodeunits + 3 < len )
  168. {
  169. walked = 1;
  170. middleman = oneUTF16toUTF32(unistring,&walked);
  171. codeunitLen = oneUTF32toUTF8(middleman, &outbuffer[nCodeunits]);
  172. unistring += walked;
  173. nCodeunits += codeunitLen;
  174. }
  175. nCodeunits = getMin(nCodeunits,len - 1);
  176. outbuffer[nCodeunits] = '\0';
  177. PROFILE_END();
  178. return nCodeunits;
  179. }
  180. U32 convertUTF16toUTF8DoubleNULL( const UTF16 *unistring, UTF8 *outbuffer, U32 len)
  181. {
  182. AssertFatal(len >= 1, "Buffer for unicode conversion must be large enough to hold at least the null terminator.");
  183. PROFILE_START(convertUTF16toUTF8DoubleNULL);
  184. U32 walked, nCodeunits, codeunitLen;
  185. UTF32 middleman;
  186. nCodeunits=0;
  187. while( ! (*unistring == '\0' && *(unistring + 1) == '\0') && nCodeunits + 3 < len )
  188. {
  189. walked = 1;
  190. middleman = oneUTF16toUTF32(unistring,&walked);
  191. codeunitLen = oneUTF32toUTF8(middleman, &outbuffer[nCodeunits]);
  192. unistring += walked;
  193. nCodeunits += codeunitLen;
  194. }
  195. nCodeunits = getMin(nCodeunits,len - 1);
  196. outbuffer[nCodeunits] = NULL;
  197. outbuffer[nCodeunits+1] = NULL;
  198. PROFILE_END();
  199. return nCodeunits;
  200. }
  201. //-----------------------------------------------------------------------------
  202. // Functions that convert buffers of unicode code points
  203. //-----------------------------------------------------------------------------
  204. UTF16* convertUTF8toUTF16( const UTF8* unistring)
  205. {
  206. PROFILE_SCOPE(convertUTF8toUTF16_create);
  207. // allocate plenty of memory.
  208. U32 nCodepoints, len = dStrlen(unistring) + 1;
  209. FrameTemp<UTF16> buf(len);
  210. // perform conversion
  211. nCodepoints = convertUTF8toUTF16( unistring, buf, len);
  212. // add 1 for the NULL terminator the converter promises it included.
  213. nCodepoints++;
  214. // allocate the return buffer, copy over, and return it.
  215. UTF16 *ret = new UTF16[nCodepoints];
  216. dMemcpy(ret, buf, nCodepoints * sizeof(UTF16));
  217. return ret;
  218. }
  219. //-----------------------------------------------------------------------------
  220. UTF8* convertUTF16toUTF8( const UTF16* unistring)
  221. {
  222. PROFILE_SCOPE(convertUTF16toUTF8_create);
  223. // allocate plenty of memory.
  224. U32 nCodeunits, len = dStrlen(unistring) * 3 + 1;
  225. FrameTemp<UTF8> buf(len);
  226. // perform conversion
  227. nCodeunits = convertUTF16toUTF8( unistring, buf, len);
  228. // add 1 for the NULL terminator the converter promises it included.
  229. nCodeunits++;
  230. // allocate the return buffer, copy over, and return it.
  231. UTF8 *ret = new UTF8[nCodeunits];
  232. dMemcpy(ret, buf, nCodeunits * sizeof(UTF8));
  233. return ret;
  234. }
  235. //-----------------------------------------------------------------------------
  236. //-----------------------------------------------------------------------------
  237. // Functions that convert one unicode codepoint at a time
  238. //-----------------------------------------------------------------------------
  239. UTF32 oneUTF8toUTF32( const UTF8* codepoint, U32 *unitsWalked)
  240. {
  241. PROFILE_SCOPE(oneUTF8toUTF32);
  242. // codepoints 6 codeunits long are read, but do not convert correctly,
  243. // and are filtered out anyway.
  244. // early out for ascii
  245. if(!(*codepoint & 0x0080))
  246. {
  247. if (unitsWalked != NULL)
  248. *unitsWalked = 1;
  249. return (UTF32)*codepoint;
  250. }
  251. U32 expectedByteCount;
  252. UTF32 ret = 0;
  253. U8 codeunit;
  254. // check the first byte ( a.k.a. codeunit ) .
  255. U8 c = codepoint[0];
  256. c = c >> 1;
  257. expectedByteCount = sgFirstByteLUT[c];
  258. if(expectedByteCount > 0) // 0 or negative is illegal to start with
  259. {
  260. // process 1st codeunit
  261. ret |= sgByteMask8LUT[expectedByteCount] & codepoint[0]; // bug?
  262. // process trailing codeunits
  263. for(U32 i=1;i<expectedByteCount; i++)
  264. {
  265. codeunit = codepoint[i];
  266. if( sgFirstByteLUT[codeunit>>1] == 0 )
  267. {
  268. ret <<= 6; // shift up 6
  269. ret |= (codeunit & 0x3f); // mask in the low 6 bits of this codeunit byte.
  270. }
  271. else
  272. {
  273. // found a bad codepoint - did not get a medial where we wanted one.
  274. // Dump the replacement, and claim to have parsed only 1 char,
  275. // so that we'll dump a slew of replacements, instead of eating the next char.
  276. ret = kReplacementChar;
  277. expectedByteCount = 1;
  278. break;
  279. }
  280. }
  281. }
  282. else
  283. {
  284. // found a bad codepoint - got a medial or an illegal codeunit.
  285. // Dump the replacement, and claim to have parsed only 1 char,
  286. // so that we'll dump a slew of replacements, instead of eating the next char.
  287. ret = kReplacementChar;
  288. expectedByteCount = 1;
  289. }
  290. if(unitsWalked != NULL)
  291. *unitsWalked = expectedByteCount;
  292. // codepoints in the surrogate range are illegal, and should be replaced.
  293. if(isSurrogateRange(ret))
  294. ret = kReplacementChar;
  295. // codepoints outside the Basic Multilingual Plane add complexity to our UTF16 string classes,
  296. // we've read them correctly so they won't foul the byte stream,
  297. // but we kill them here to make sure they wont foul anything else
  298. if(isAboveBMP(ret))
  299. ret = kReplacementChar;
  300. return ret;
  301. }
  302. //-----------------------------------------------------------------------------
  303. UTF32 oneUTF16toUTF32(const UTF16* codepoint, U32 *unitsWalked)
  304. {
  305. PROFILE_START(oneUTF16toUTF32);
  306. U8 expectedType;
  307. U32 unitCount;
  308. UTF32 ret = 0;
  309. UTF16 codeunit1,codeunit2;
  310. codeunit1 = codepoint[0];
  311. expectedType = sgSurrogateLUT[codeunit1 >> 10];
  312. switch(expectedType)
  313. {
  314. case 0: // simple
  315. ret = codeunit1;
  316. unitCount = 1;
  317. break;
  318. case 1: // 2 surrogates
  319. codeunit2 = codepoint[1];
  320. if( sgSurrogateLUT[codeunit2 >> 10] == 2)
  321. {
  322. ret = ((codeunit1 & sgByteMaskLow10 ) << 10) | (codeunit2 & sgByteMaskLow10);
  323. unitCount = 2;
  324. break;
  325. }
  326. // else, did not find a trailing surrogate where we expected one,
  327. // so fall through to the error
  328. case 2: // error
  329. // found a trailing surrogate where we expected a codepoint or leading surrogate.
  330. // Dump the replacement.
  331. ret = kReplacementChar;
  332. unitCount = 1;
  333. break;
  334. default:
  335. // unexpected return
  336. AssertFatal(false, "oneUTF16toUTF323: unexpected type");
  337. ret = kReplacementChar;
  338. unitCount = 1;
  339. break;
  340. }
  341. if(unitsWalked != NULL)
  342. *unitsWalked = unitCount;
  343. // codepoints in the surrogate range are illegal, and should be replaced.
  344. if(isSurrogateRange(ret))
  345. ret = kReplacementChar;
  346. // codepoints outside the Basic Multilingual Plane add complexity to our UTF16 string classes,
  347. // we've read them correctly so they wont foul the byte stream,
  348. // but we kill them here to make sure they wont foul anything else
  349. // NOTE: these are perfectly legal codepoints, we just dont want to deal with them.
  350. if(isAboveBMP(ret))
  351. ret = kReplacementChar;
  352. PROFILE_END();
  353. return ret;
  354. }
  355. //-----------------------------------------------------------------------------
  356. UTF16 oneUTF32toUTF16(const UTF32 codepoint)
  357. {
  358. // found a codepoint outside the encodable UTF-16 range!
  359. // or, found an illegal codepoint!
  360. if(codepoint >= 0x10FFFF || isSurrogateRange(codepoint))
  361. return kReplacementChar;
  362. // these are legal, we just don't want to deal with them.
  363. if(isAboveBMP(codepoint))
  364. return kReplacementChar;
  365. return (UTF16)codepoint;
  366. }
  367. //-----------------------------------------------------------------------------
  368. U32 oneUTF32toUTF8(const UTF32 codepoint, UTF8 *threeByteCodeunitBuf)
  369. {
  370. PROFILE_START(oneUTF32toUTF8);
  371. U32 bytecount = 0;
  372. UTF8 *buf;
  373. U32 working = codepoint;
  374. buf = threeByteCodeunitBuf;
  375. //-----------------
  376. if(isSurrogateRange(working)) // found an illegal codepoint!
  377. working = kReplacementChar;
  378. if(isAboveBMP(working)) // these are legal, we just dont want to deal with them.
  379. working = kReplacementChar;
  380. //-----------------
  381. if( working < (1 << 7)) // codeable in 7 bits
  382. bytecount = 1;
  383. else if( working < (1 << 11)) // codeable in 11 bits
  384. bytecount = 2;
  385. else if( working < (1 << 16)) // codeable in 16 bits
  386. bytecount = 3;
  387. AssertISV( bytecount > 0, "Error converting to UTF-8 in oneUTF32toUTF8(). isAboveBMP() should have caught this!");
  388. //-----------------
  389. U8 mask = sgByteMask8LUT[0]; // 0011 1111
  390. U8 marker = ( ~mask << 1); // 1000 0000
  391. // Process the low order bytes, shifting the codepoint down 6 each pass.
  392. for( S32 i = bytecount-1; i > 0; i--)
  393. {
  394. threeByteCodeunitBuf[i] = marker | (working & mask);
  395. working >>= 6;
  396. }
  397. // Process the 1st byte. filter based on the # of expected bytes.
  398. mask = sgByteMask8LUT[bytecount];
  399. marker = ( ~mask << 1 );
  400. threeByteCodeunitBuf[0] = marker | working & mask;
  401. PROFILE_END();
  402. return bytecount;
  403. }
  404. //-----------------------------------------------------------------------------
  /// Counts the code units in a NUL-terminated UTF-16 string, terminator
  /// excluded. A NULL pointer counts as an empty string.
  U32 dStrlen(const UTF16 *unistring)
  {
     U32 count = 0;
     if(unistring)
     {
        for(const UTF16 *cursor = unistring; *cursor != '\0'; ++cursor)
           ++count;
     }
     return count;
  }
  415. //-----------------------------------------------------------------------------
  /// Counts the code points in a NUL-terminated UTF-32 string, terminator
  /// excluded. A NULL pointer counts as an empty string, matching the UTF-16
  /// overload.
  U32 dStrlen(const UTF32 *unistring)
  {
     if(!unistring)
        return 0;

     U32 i = 0;
     while(unistring[i] != '\0')
        i++;
     return i;
  }
  423. //-----------------------------------------------------------------------------
  /// Compares at most len code units of two NUL-terminated UTF-16 strings.
  ///
  /// @return 0 when the compared ranges are equal, 1 when the first differing
  ///         unit is larger in unistring1, and -1 converted to U32 (that is
  ///         0xFFFFFFFF) when it is smaller. Test the result against zero, or
  ///         cast it to a signed type before checking its sign.
  U32 dStrncmp(const UTF16* unistring1, const UTF16* unistring2, U32 len)
  {
     for(U32 i = 0; i < len; i++)
     {
        const UTF16 left = unistring1[i];
        const UTF16 right = unistring2[i];

        if(left != right)
           return (left < right) ? (U32)-1 : 1;

        // Both strings ended together: nothing further to compare.
        if(left == 0)
           break;
     }
     return 0;
  }
  437. //-----------------------------------------------------------------------------
  /// Finds the last occurrence of c in a NUL-terminated UTF-16 string.
  ///
  /// The terminator takes part in the search, so looking for 0 returns a
  /// pointer to the end of the string.
  ///
  /// @return Pointer to the last matching code unit, or NULL when there is no
  ///         match or unistring is NULL.
  const UTF16* dStrrchr(const UTF16* unistring, U32 c)
  {
     if(!unistring) return NULL;

     // Scan forward and remember the most recent hit. Walking backwards from
     // the terminator would have to step the pointer to before the start of
     // the string to end the loop, which is not a valid pointer value.
     const UTF16* lastMatch = NULL;
     for(const UTF16* cursor = unistring; ; ++cursor)
     {
        if(*cursor == c)
           lastMatch = cursor;
        if(*cursor == 0)
           break;
     }
     return lastMatch;
  }
  450. UTF16* dStrrchr(UTF16* unistring, U32 c)
  451. {
  452. const UTF16* str = unistring;
  453. return const_cast<UTF16*>(dStrrchr(str, c));
  454. }
  /// Finds the first occurrence of c in a NUL-terminated UTF-16 string.
  ///
  /// The terminator takes part in the search, so looking for 0 returns a
  /// pointer to the end of the string.
  ///
  /// @return Pointer to the first matching code unit, or NULL when there is no
  ///         match or unistring is NULL.
  const UTF16* dStrchr(const UTF16* unistring, U32 c)
  {
     if(unistring == NULL)
        return NULL;

     for(const UTF16* cursor = unistring; ; ++cursor)
     {
        // Checked before the end test so that c == 0 matches the terminator.
        if(*cursor == c)
           return cursor;
        if(*cursor == 0)
           return NULL;
     }
  }
  463. UTF16* dStrchr(UTF16* unistring, U32 c)
  464. {
  465. const UTF16* str = unistring;
  466. return const_cast<UTF16*>(dStrchr(str, c));
  467. }
  468. //-----------------------------------------------------------------------------
  /// Returns a pointer to the start of the n-th code point (counting from 0)
  /// of a NUL-terminated UTF-8 string, or to the terminator when the string
  /// holds fewer than n + 1 code points.
  ///
  /// Only the continuation bit pattern is inspected; the sequences are not
  /// validated.
  const UTF8* getNthCodepoint(const UTF8 *unistring, const U32 n)
  {
     const UTF8* cursor = unistring;
     U32 remaining = n;

     while(remaining > 0 && *cursor != '\0')
     {
        ++cursor;

        // Continuation bytes look like 10xxxxxx; any other byte (the
        // terminator included) marks the start of the next code point.
        const bool isContinuation = ((*cursor & 0xC0) == 0x80);
        if(!isContinuation)
           --remaining;
     }
     return cursor;
  }
  481. /* alternate utf-8 decode impl for speed, no error checking,
  482. left here for your amusement:
  483. U32 codeunit = codepoint + expectedByteCount - 1;
  484. U32 i = 0;
  485. switch(expectedByteCount)
  486. {
  487. case 6: ret |= ( *(codeunit--) & 0x3f ); i++;
  488. case 5: ret |= ( *(codeunit--) & 0x3f ) << (6 * i++);
  489. case 4: ret |= ( *(codeunit--) & 0x3f ) << (6 * i++);
  490. case 3: ret |= ( *(codeunit--) & 0x3f ) << (6 * i++);
  491. case 2: ret |= ( *(codeunit--) & 0x3f ) << (6 * i++);
  492. case 1: ret |= *(codeunit) & byteMask8LUT[expectedByteCount] << (6 * i);
  493. }
  494. */
  495. //------------------------------------------------------------------------------
  496. // Byte Order Mark functions
  497. bool chompUTF8BOM( const char *inString, char **outStringPtr )
  498. {
  499. *outStringPtr = const_cast<char *>( inString );
  500. U8 bom[4];
  501. dMemcpy( bom, inString, 4 );
  502. bool valid = isValidUTF8BOM( bom );
  503. // This is hackey, but I am not sure the best way to do it at the present.
  504. // The only valid BOM is a UTF8 BOM, which is 3 bytes, even though we read
  505. // 4 bytes because it could possibly be a UTF32 BOM, and we want to provide
  506. // an accurate error message. Perhaps this could be re-worked when more UTF
  507. // formats are supported to have isValidBOM return the size of the BOM, in
  508. // bytes.
  509. if( valid )
  510. (*outStringPtr) += 3; // SEE ABOVE!! -pw
  511. return valid;
  512. }
  513. bool isValidUTF8BOM( U8 bom[4] )
  514. {
  515. // Is it a BOM?
  516. if( bom[0] == 0 )
  517. {
  518. // Could be UTF32BE
  519. if( bom[1] == 0 && bom[2] == 0xFE && bom[3] == 0xFF )
  520. {
  521. Con::warnf( "Encountered a UTF32 BE BOM in this file; Torque does NOT support this file encoding. Use UTF8!" );
  522. return false;
  523. }
  524. return false;
  525. }
  526. else if( bom[0] == 0xFF )
  527. {
  528. // It's little endian, either UTF16 or UTF32
  529. if( bom[1] == 0xFE )
  530. {
  531. if( bom[2] == 0 && bom[3] == 0 )
  532. Con::warnf( "Encountered a UTF32 LE BOM in this file; Torque does NOT support this file encoding. Use UTF8!" );
  533. else
  534. Con::warnf( "Encountered a UTF16 LE BOM in this file; Torque does NOT support this file encoding. Use UTF8!" );
  535. }
  536. return false;
  537. }
  538. else if( bom[0] == 0xFE && bom[1] == 0xFF )
  539. {
  540. Con::warnf( "Encountered a UTF16 BE BOM in this file; Torque does NOT support this file encoding. Use UTF8!" );
  541. return false;
  542. }
  543. else if( bom[0] == 0xEF && bom[1] == 0xBB && bom[2] == 0xBF )
  544. {
  545. // Can enable this if you want -pw
  546. //Con::printf("Encountered a UTF8 BOM. Torque supports this.");
  547. return true;
  548. }
  549. // Don't print out an error message here, because it will try this with
  550. // every script. -pw
  551. return false;
  552. }