BsUnicode.cpp 8.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394
  1. //********************************** Banshee Engine (www.banshee3d.com) **************************************************//
  2. //**************** Copyright (c) 2016 Marko Pintera ([email protected]). All rights reserved. **********************//
  3. #include "BsUnicode.h"
  4. namespace bs
  5. {
  /**
   * Decodes a single (possibly multi-byte) UTF-8 sequence into a UTF-32 code point.
   *
   * @param[in]   begin        Iterator to the first byte of the sequence.
   * @param[in]   end          Iterator one past the last available byte.
   * @param[out]  output       Receives the decoded code point, or @p invalidChar when the sequence is truncated or
   *                           contains a malformed continuation byte. Left untouched when the range is empty.
   * @param[in]   invalidChar  Value reported for sequences that cannot be decoded.
   * @return                   Iterator to the first byte that was not consumed. For a truncated sequence this is
   *                           @p end, for a malformed sequence it is the offending byte.
   *
   * @note  The legacy 5- and 6-byte forms are still accepted, a continuation byte found in lead position is passed
   *        through as a single-byte value, and overlong encodings are not rejected.
   */
  template<typename T>
  T UTF8To32(T begin, T end, char32_t& output, char32_t invalidChar = 0)
  {
      // Nothing to parse
      if (begin >= end)
          return begin;

      // The lead byte determines the length of the sequence
      const unsigned char leadByte = static_cast<unsigned char>(*begin);

      unsigned int numBytes;
      if (leadByte < 192)
          numBytes = 1;
      else if (leadByte < 224)
          numBytes = 2;
      else if (leadByte < 240)
          numBytes = 3;
      else if (leadByte < 248)
          numBytes = 4;
      else if (leadByte < 252)
          numBytes = 5;
      else // < 256
          numBytes = 6;

      // Compare against the remaining length rather than forming (begin + numBytes), which would create an
      // iterator past 'end' for truncated input
      const auto remaining = end - begin;
      if (remaining < static_cast<decltype(end - begin)>(numBytes))
      {
          output = invalidChar;
          return end;
      }

      // Accumulated lead-byte markers and continuation-byte markers, subtracted after decoding
      constexpr char32_t offsets[6] = { 0x00000000, 0x00003080, 0x000E2080, 0x03C82080, 0xFA082080, 0x82082080 };

      char32_t codePoint = leadByte;
      ++begin;

      for (unsigned int i = 1; i < numBytes; ++i)
      {
          const unsigned char trailByte = static_cast<unsigned char>(*begin);

          // Every trailing byte must look like 10xxxxxx. If not, stop here so the caller can resume decoding at
          // the offending byte instead of swallowing it.
          if ((trailByte & 0xC0) != 0x80)
          {
              output = invalidChar;
              return begin;
          }

          codePoint = (codePoint << 6) + trailByte;
          ++begin;
      }

      output = codePoint - offsets[numBytes - 1];
      return begin;
  }
  50. /** Converts an UTF-32 encoded character into an (possibly multibyte) UTF-8 character. */
  51. template<typename T>
  52. T UTF32To8(char32_t input, T output, UINT32 maxElems, char invalidChar = 0)
  53. {
  54. // No place to write the character
  55. if (maxElems == 0)
  56. return output;
  57. // Check if character is valid
  58. if ((input > 0x0010FFFF) || ((input >= 0xD800) && (input <= 0xDBFF)))
  59. {
  60. *output = invalidChar;
  61. ++output;
  62. return output;
  63. }
  64. // Determine the number of bytes used by the character
  65. UINT32 numBytes;
  66. if (input < 0x80)
  67. numBytes = 1;
  68. else if (input < 0x800)
  69. numBytes = 2;
  70. else if (input < 0x10000)
  71. numBytes = 3;
  72. else // <= 0x0010FFFF
  73. numBytes = 4;
  74. // Check if we have enough space
  75. if(numBytes > maxElems)
  76. {
  77. *output = invalidChar;
  78. ++output;
  79. return output;
  80. }
  81. // Encode the character
  82. constexpr UINT8 headers[7] = { 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC };
  83. char bytes[4];
  84. switch (numBytes)
  85. {
  86. case 4: bytes[3] = (char)((input | 0x80) & 0xBF); input >>= 6;
  87. case 3: bytes[2] = (char)((input | 0x80) & 0xBF); input >>= 6;
  88. case 2: bytes[1] = (char)((input | 0x80) & 0xBF); input >>= 6;
  89. case 1: bytes[0] = (char)(input | headers[numBytes]);
  90. default: break;
  91. }
  92. output = std::copy(bytes, bytes + numBytes, output);
  93. return output;
  94. }
  /**
   * Decodes a single UTF-16 encoded character (one element or a surrogate pair) into a UTF-32 code point.
   *
   * @param[in]   begin        Iterator to the first element of the character.
   * @param[in]   end          Iterator one past the last available element.
   * @param[out]  output       Receives the decoded code point, or @p invalidChar for an unpaired surrogate. Left
   *                           untouched when the range is empty.
   * @param[in]   invalidChar  Value reported for elements that cannot be decoded.
   * @return                   Iterator to the first element that was not consumed. An element that fails to
   *                           complete a surrogate pair is not consumed, so it is decoded by the next call.
   */
  template<typename T>
  T UTF16To32(T begin, T end, char32_t& output, char32_t invalidChar = 0)
  {
      // Nothing to parse
      if (begin >= end)
          return begin;

      const char16_t firstElem = static_cast<char16_t>(*begin);
      ++begin;

      const bool isHighSurrogate = (firstElem >= 0xD800) && (firstElem <= 0xDBFF);
      const bool isLowSurrogate = (firstElem >= 0xDC00) && (firstElem <= 0xDFFF);

      // Plain BMP character, encoded as a single element
      if (!isHighSurrogate && !isLowSurrogate)
      {
          output = static_cast<char32_t>(firstElem);
          return begin;
      }

      // A low surrogate may only follow a high surrogate
      if (isLowSurrogate)
      {
          output = invalidChar;
          return begin;
      }

      // High surrogate with nothing following it
      if (begin >= end)
      {
          output = invalidChar;
          return end;
      }

      const char32_t secondElem = static_cast<char32_t>(*begin);
      if ((secondElem < 0xDC00) || (secondElem > 0xDFFF))
      {
          // Only the high surrogate is invalid, leave the second element for the next call
          output = invalidChar;
          return begin;
      }

      ++begin;
      output = static_cast<char32_t>(((firstElem - 0xD800) << 10) + (secondElem - 0xDC00) + 0x0010000);

      return begin;
  }
  127. /** Converts an UTF-32 encoded character into an UTF-16 character. */
  128. template<typename T>
  129. T UTF32To16(char32_t input, T output, UINT32 maxElems, char16_t invalidChar = 0)
  130. {
  131. // No place to write the character
  132. if (maxElems == 0)
  133. return output;
  134. // Invalid character
  135. if (input > 0x0010FFFF)
  136. {
  137. *output = invalidChar;
  138. ++output;
  139. return output;
  140. }
  141. // Can be encoded as a single element
  142. if (input <= 0xFFFF)
  143. {
  144. // Check if in valid range
  145. if ((input >= 0xD800) && (input <= 0xDFFF))
  146. {
  147. *output = invalidChar;
  148. ++output;
  149. return output;
  150. }
  151. *output = (char16_t)input;
  152. ++output;
  153. }
  154. else // Must be encoded as two elements
  155. {
  156. // Two elements won't fit
  157. if (maxElems < 2)
  158. {
  159. *output = invalidChar;
  160. ++output;
  161. return output;
  162. }
  163. input -= 0x0010000;
  164. *output = (char16_t)((input >> 10) + 0xD800);
  165. ++output;
  166. *output = (char16_t)((input & 0x3FFUL) + 0xDC00);
  167. ++output;
  168. }
  169. return output;
  170. }
  /**
   * Decodes a single wide character into a UTF-32 code point. A four byte wchar_t is assumed to already hold
   * UTF-32 (i.e. Unix), any other size is assumed to hold UTF-16 (i.e. Windows).
   *
   * @param[in]   begin        Iterator to the first wide element of the character.
   * @param[in]   end          Iterator one past the last available element.
   * @param[out]  output       Receives the decoded code point. Left untouched when the range is empty.
   * @param[in]   invalidChar  Value reported by the UTF-16 path for elements that cannot be decoded. Unused when
   *                           wchar_t is four bytes wide.
   * @return                   Iterator to the first element that was not consumed.
   */
  template<typename T>
  T wideToUTF32(T begin, T end, char32_t& output, char32_t invalidChar = 0)
  {
      if (sizeof(wchar_t) != 4) // Assuming UTF-16 (i.e. Windows)
          return UTF16To32(begin, end, output, invalidChar);

      // Assuming UTF-32 (i.e. Unix)

      // Nothing to parse, same contract as the UTF-16 path
      if (begin >= end)
          return begin;

      output = static_cast<char32_t>(*begin);
      ++begin;

      return begin;
  }
  183. char32_t ANSIToUTF32(char input, const std::locale& locale = std::locale(""))
  184. {
  185. const std::ctype<wchar_t>& facet = std::use_facet<std::ctype<wchar_t>>(locale);
  186. // Note: Not exactly valid on Windows, since the input character could require a surrogate pair.
  187. // Consider improving this if it ever becomes an issue.
  188. wchar_t wideChar = facet.widen(input);
  189. char32_t output;
  190. wideToUTF32(&wideChar, &wideChar + 1, output);
  191. return output;
  192. }
  193. template<typename T>
  194. T UTF32ToWide(char32_t input, T output, UINT32 maxElems, wchar_t invalidChar = 0)
  195. {
  196. if(sizeof(wchar_t) == 4) // Assuming UTF-32 (i.e. Unix)
  197. {
  198. *output = (wchar_t)input;
  199. ++output;
  200. return output;
  201. }
  202. else // Assuming UTF-16 (i.e. Windows)
  203. return UTF32To16(input, output, maxElems, invalidChar);
  204. }
  /**
   * Converts a UTF-32 code point into a single narrow character using the std::ctype<wchar_t> facet of the
   * provided locale.
   *
   * @param[in]  input        Code point to convert.
   * @param[in]  invalidChar  Character returned when the code point does not fit into a single wchar_t, and passed
   *                          to the facet as the default for characters it cannot narrow.
   * @param[in]  locale       Locale whose facet performs the narrowing.
   * @return                  The narrowed character, or @p invalidChar.
   */
  char UTF32ToANSI(char32_t input, char invalidChar = 0, const std::locale& locale = std::locale(""))
  {
      // narrow() accepts a single wchar_t. When wchar_t is not four bytes wide (assumed UTF-16), a code point
      // above 0xFFFF would be truncated by the cast below and narrowed as a completely different character.
      if ((sizeof(wchar_t) != 4) && (input > 0xFFFF))
          return invalidChar;

      const std::ctype<wchar_t>& facet = std::use_facet<std::ctype<wchar_t>>(locale);
      return facet.narrow(static_cast<wchar_t>(input), invalidChar);
  }
  211. String UTF8::fromANSI(const String& input, const std::locale& locale)
  212. {
  213. String output;
  214. output.reserve(input.size());
  215. auto backInserter = std::back_inserter(output);
  216. auto iter = input.begin();
  217. while(iter != input.end())
  218. {
  219. char32_t u32char = ANSIToUTF32(*iter, locale);
  220. UTF32To8(u32char, backInserter, 4);
  221. ++iter;
  222. }
  223. return output;
  224. }
  225. String UTF8::toANSI(const String& input, const std::locale& locale, char invalidChar)
  226. {
  227. String output;
  228. auto iter = input.begin();
  229. while(iter != input.end())
  230. {
  231. char32_t u32char;
  232. iter = UTF8To32(iter, input.end(), u32char, invalidChar);
  233. output.push_back(UTF32ToANSI(u32char, invalidChar, locale));
  234. }
  235. return output;
  236. }
  237. String UTF8::fromWide(const WString& input)
  238. {
  239. String output;
  240. output.reserve(input.size());
  241. auto backInserter = std::back_inserter(output);
  242. auto iter = input.begin();
  243. while(iter != input.end())
  244. {
  245. char32_t u32char;
  246. iter = wideToUTF32(iter, input.end(), u32char);
  247. UTF32To8(u32char, backInserter, 4);
  248. }
  249. return output;
  250. }
  251. WString UTF8::toWide(const String& input)
  252. {
  253. WString output;
  254. auto backInserter = std::back_inserter(output);
  255. auto iter = input.begin();
  256. while(iter != input.end())
  257. {
  258. char32_t u32char;
  259. iter = UTF8To32(iter, input.end(), u32char);
  260. UTF32ToWide(u32char, backInserter, 2);
  261. }
  262. return output;
  263. }
  264. String UTF8::fromUTF16(const U16String& input)
  265. {
  266. String output;
  267. output.reserve(input.size());
  268. auto backInserter = std::back_inserter(output);
  269. auto iter = input.begin();
  270. while(iter != input.end())
  271. {
  272. char32_t u32char;
  273. iter = UTF16To32(iter, input.end(), u32char);
  274. UTF32To8(u32char, backInserter, 4);
  275. }
  276. return output;
  277. }
  278. U16String UTF8::toUTF16(const String& input)
  279. {
  280. U16String output;
  281. auto backInserter = std::back_inserter(output);
  282. auto iter = input.begin();
  283. while(iter != input.end())
  284. {
  285. char32_t u32char;
  286. iter = UTF8To32(iter, input.end(), u32char);
  287. UTF32To16(u32char, backInserter, 2);
  288. }
  289. return output;
  290. }
  291. String UTF8::fromUTF32(const U32String& input)
  292. {
  293. String output;
  294. output.reserve(input.size());
  295. auto backInserter = std::back_inserter(output);
  296. auto iter = input.begin();
  297. while(iter != input.end())
  298. {
  299. UTF32To8(*iter, backInserter, 4);
  300. ++iter;
  301. }
  302. return output;
  303. }
  304. U32String UTF8::toUTF32(const String& input)
  305. {
  306. U32String output;
  307. auto iter = input.begin();
  308. while(iter != input.end())
  309. {
  310. char32_t u32char;
  311. iter = UTF8To32(iter, input.end(), u32char);
  312. output.push_back(u32char);
  313. }
  314. return output;
  315. }
  316. }