2
0

Unicode.cpp 13 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384
  1. ///////////////////////////////////////////////////////////////////////////////
  2. // //
  3. // Unicode.cpp //
  4. // Copyright (C) Microsoft Corporation. All rights reserved. //
  5. // This file is distributed under the University of Illinois Open Source //
  6. // License. See LICENSE.TXT for details. //
  7. // //
  8. // Provides utility functions to work with Unicode and other encodings. //
  9. // //
  10. ///////////////////////////////////////////////////////////////////////////////
  11. #ifdef _WIN32
  12. #include <specstrings.h>
  13. #else
  14. #include <clocale>
  15. #endif
  16. #include <string>
  17. #include "dxc/Support/Global.h"
  18. #include "dxc/Support/Unicode.h"
  19. #include "dxc/Support/WinIncludes.h"
  20. #ifndef _WIN32
  21. // MultiByteToWideChar which is a Windows-specific method.
  22. // This is a very simplistic implementation for non-Windows platforms. This
  23. // implementation completely ignores CodePage and dwFlags.
  24. int MultiByteToWideChar(uint32_t CodePage, uint32_t /*dwFlags*/,
  25. const char *lpMultiByteStr, int cbMultiByte,
  26. wchar_t *lpWideCharStr, int cchWideChar) {
  27. if (cbMultiByte == 0) {
  28. SetLastError(ERROR_INVALID_PARAMETER);
  29. return 0;
  30. }
  31. // if cbMultiByte is -1, it indicates that lpMultiByteStr is null-terminated
  32. // and the entire string should be processed.
  33. if (cbMultiByte == -1) {
  34. for (cbMultiByte = 0; lpMultiByteStr[cbMultiByte] != '\0'; ++cbMultiByte)
  35. ;
  36. // Add 1 for the null-terminating character.
  37. ++cbMultiByte;
  38. }
  39. // If zero is given as the destination size, this function should
  40. // return the required size (including the null-terminating character).
  41. // This is the behavior of mbstowcs when the target is null.
  42. if (cchWideChar == 0) {
  43. lpWideCharStr = nullptr;
  44. } else if (cchWideChar < cbMultiByte) {
  45. SetLastError(ERROR_INSUFFICIENT_BUFFER);
  46. return 0;
  47. }
  48. size_t rv;
  49. const char *locale = CPToLocale(CodePage);
  50. locale = setlocale(LC_ALL, locale);
  51. if (lpMultiByteStr[cbMultiByte - 1] != '\0') {
  52. char *srcStr = (char *)malloc((cbMultiByte +1) * sizeof(char));
  53. strncpy(srcStr, lpMultiByteStr, cbMultiByte);
  54. srcStr[cbMultiByte]='\0';
  55. rv = mbstowcs(lpWideCharStr, srcStr, cchWideChar);
  56. free(srcStr);
  57. } else {
  58. rv = mbstowcs(lpWideCharStr, lpMultiByteStr, cchWideChar);
  59. }
  60. setlocale(LC_ALL, locale);
  61. if (rv == (size_t)cbMultiByte) return rv;
  62. return rv + 1; // mbstowcs excludes the terminating character
  63. }
  64. // WideCharToMultiByte is a Windows-specific method.
  65. // This is a very simplistic implementation for non-Windows platforms. This
  66. // implementation completely ignores CodePage and dwFlags.
  67. int WideCharToMultiByte(uint32_t CodePage, uint32_t /*dwFlags*/,
  68. const wchar_t *lpWideCharStr, int cchWideChar,
  69. char *lpMultiByteStr, int cbMultiByte,
  70. const char * /*lpDefaultChar*/,
  71. bool * /*lpUsedDefaultChar*/) {
  72. if (cchWideChar == 0) {
  73. SetLastError(ERROR_INVALID_PARAMETER);
  74. return 0;
  75. }
  76. // if cchWideChar is -1, it indicates that lpWideCharStr is null-terminated
  77. // and the entire string should be processed.
  78. if (cchWideChar == -1) {
  79. for (cchWideChar = 0; lpWideCharStr[cchWideChar] != '\0'; ++cchWideChar)
  80. ;
  81. // Add 1 for the null-terminating character.
  82. ++cchWideChar;
  83. }
  84. // If zero is given as the destination size, this function should
  85. // return the required size (including the null-terminating character).
  86. // This is the behavior of wcstombs when the target is null.
  87. if (cbMultiByte == 0) {
  88. lpMultiByteStr = nullptr;
  89. } else if (cbMultiByte < cchWideChar) {
  90. SetLastError(ERROR_INSUFFICIENT_BUFFER);
  91. return 0;
  92. }
  93. size_t rv;
  94. const char *locale = CPToLocale(CodePage);
  95. locale = setlocale(LC_ALL, locale);
  96. if (lpWideCharStr[cchWideChar - 1] != L'\0') {
  97. wchar_t *srcStr = (wchar_t *)malloc((cchWideChar+1) * sizeof(wchar_t));
  98. wcsncpy(srcStr, lpWideCharStr, cchWideChar);
  99. srcStr[cchWideChar] = L'\0';
  100. rv = wcstombs(lpMultiByteStr, srcStr, cbMultiByte);
  101. free(srcStr);
  102. } else {
  103. rv = wcstombs(lpMultiByteStr, lpWideCharStr, cbMultiByte);
  104. }
  105. setlocale(LC_ALL, locale);
  106. if (rv == (size_t)cchWideChar) return rv;
  107. return rv + 1; // mbstowcs excludes the terminating character
  108. }
  109. #endif // _WIN32
  110. namespace Unicode {
  111. _Success_(return != false)
  112. bool UTF16ToEncodedString(_In_z_ const wchar_t* text, size_t cUTF16, DWORD cp, DWORD flags, _Inout_ std::string* pValue, _Out_opt_ bool* lossy) {
  113. BOOL usedDefaultChar;
  114. LPBOOL pUsedDefaultChar = (lossy == nullptr) ? nullptr : &usedDefaultChar;
  115. if (lossy != nullptr) *lossy = false;
  116. // Handle zero-length as a special case; it's a special value to indicate errors in WideCharToMultiByte.
  117. if (cUTF16 == 0) {
  118. pValue->resize(0);
  119. DXASSERT(lossy == nullptr || *lossy == false, "otherwise earlier initialization in this function was updated");
  120. return true;
  121. }
  122. int cbUTF8 = ::WideCharToMultiByte(cp, flags, text, cUTF16, nullptr, 0, nullptr, pUsedDefaultChar);
  123. if (cbUTF8 == 0)
  124. return false;
  125. pValue->resize(cbUTF8);
  126. cbUTF8 = ::WideCharToMultiByte(cp, flags, text, cUTF16, &(*pValue)[0], pValue->size(), nullptr, pUsedDefaultChar);
  127. DXASSERT(cbUTF8 > 0, "otherwise contents have changed");
  128. DXASSERT((*pValue)[pValue->size()] == '\0', "otherwise string didn't null-terminate after resize() call");
  129. if (lossy != nullptr) *lossy = usedDefaultChar;
  130. return true;
  131. }
  132. _Use_decl_annotations_
  133. bool UTF8ToUTF16String(const char *pUTF8, std::wstring *pUTF16) {
  134. size_t cbUTF8 = (pUTF8 == nullptr) ? 0 : strlen(pUTF8);
  135. return UTF8ToUTF16String(pUTF8, cbUTF8, pUTF16);
  136. }
  137. _Use_decl_annotations_
  138. bool UTF8ToUTF16String(const char *pUTF8, size_t cbUTF8, std::wstring *pUTF16) {
  139. DXASSERT_NOMSG(pUTF16 != nullptr);
  140. // Handle zero-length as a special case; it's a special value to indicate
  141. // errors in MultiByteToWideChar.
  142. if (cbUTF8 == 0) {
  143. pUTF16->resize(0);
  144. return true;
  145. }
  146. int cUTF16 = ::MultiByteToWideChar(CP_UTF8, MB_ERR_INVALID_CHARS, pUTF8,
  147. cbUTF8, nullptr, 0);
  148. if (cUTF16 == 0)
  149. return false;
  150. pUTF16->resize(cUTF16);
  151. cUTF16 = ::MultiByteToWideChar(CP_UTF8, MB_ERR_INVALID_CHARS, pUTF8, cbUTF8,
  152. &(*pUTF16)[0], pUTF16->size());
  153. DXASSERT(cUTF16 > 0, "otherwise contents changed");
  154. DXASSERT((*pUTF16)[pUTF16->size()] == L'\0',
  155. "otherwise wstring didn't null-terminate after resize() call");
  156. return true;
  157. }
  158. std::wstring UTF8ToUTF16StringOrThrow(_In_z_ const char *pUTF8) {
  159. std::wstring result;
  160. if (!UTF8ToUTF16String(pUTF8, &result)) {
  161. throw hlsl::Exception(DXC_E_STRING_ENCODING_FAILED);
  162. }
  163. return result;
  164. }
  165. _Use_decl_annotations_
  166. bool UTF8ToConsoleString(_In_z_ const char* text, _In_ size_t textLen, _Inout_ std::string* pValue, _Out_opt_ bool* lossy) {
  167. DXASSERT_NOMSG(text != nullptr);
  168. DXASSERT_NOMSG(pValue != nullptr);
  169. std::wstring text16;
  170. if (lossy != nullptr) *lossy = false;
  171. if (!UTF8ToUTF16String(text, textLen, &text16)) {
  172. return false;
  173. }
  174. return UTF16ToConsoleString(text16.c_str(), text16.length(), pValue, lossy);
  175. }
  176. _Use_decl_annotations_
  177. bool UTF8ToConsoleString(_In_z_ const char* text, _Inout_ std::string* pValue, _Out_opt_ bool* lossy) {
  178. return UTF8ToConsoleString(text, strlen(text), pValue, lossy);
  179. }
  180. _Use_decl_annotations_
  181. bool UTF16ToConsoleString(const wchar_t* text, _In_ size_t textLen, std::string* pValue, bool* lossy) {
  182. DXASSERT_NOMSG(text != nullptr);
  183. DXASSERT_NOMSG(pValue != nullptr);
  184. UINT cp = GetConsoleOutputCP();
  185. return UTF16ToEncodedString(text, textLen, cp, 0, pValue, lossy);
  186. }
  187. _Use_decl_annotations_
  188. bool UTF16ToConsoleString(const wchar_t* text, std::string* pValue, bool* lossy) {
  189. return UTF16ToConsoleString(text, wcslen(text), pValue, lossy);
  190. }
  191. _Use_decl_annotations_
  192. bool UTF16ToUTF8String(const wchar_t *pUTF16, size_t cUTF16, std::string *pUTF8) {
  193. DXASSERT_NOMSG(pUTF16 != nullptr);
  194. DXASSERT_NOMSG(pUTF8 != nullptr);
  195. return UTF16ToEncodedString(pUTF16, cUTF16, CP_UTF8, 0, pUTF8, nullptr);
  196. }
  197. _Use_decl_annotations_
  198. bool UTF16ToUTF8String(const wchar_t *pUTF16, std::string *pUTF8) {
  199. DXASSERT_NOMSG(pUTF16 != nullptr);
  200. DXASSERT_NOMSG(pUTF8 != nullptr);
  201. return UTF16ToEncodedString(pUTF16, wcslen(pUTF16), CP_UTF8, 0, pUTF8, nullptr);
  202. }
  203. std::string UTF16ToUTF8StringOrThrow(_In_z_ const wchar_t *pUTF16) {
  204. std::string result;
  205. if (!UTF16ToUTF8String(pUTF16, &result)) {
  206. throw hlsl::Exception(DXC_E_STRING_ENCODING_FAILED);
  207. }
  208. return result;
  209. }
  210. _Use_decl_annotations_
  211. bool UTF8BufferToUTF16ComHeap(const char *pUTF8, wchar_t **ppUTF16) throw() {
  212. *ppUTF16 = nullptr;
  213. int c = ::MultiByteToWideChar(CP_UTF8, MB_ERR_INVALID_CHARS, pUTF8, -1,
  214. nullptr, 0);
  215. if (c == 0)
  216. return false;
  217. CComHeapPtr<wchar_t> p;
  218. if (!p.Allocate(c))
  219. return false;
  220. DXVERIFY_NOMSG(0 < ::MultiByteToWideChar(CP_UTF8, MB_ERR_INVALID_CHARS, pUTF8,
  221. -1, p.m_pData, c));
  222. *ppUTF16 = p.Detach();
  223. return true;
  224. }
  225. _Use_decl_annotations_
  226. bool UTF8BufferToUTF16Buffer(const char *pUTF8, int cbUTF8, wchar_t **ppUTF16, size_t *pcUTF16) throw() {
  227. *ppUTF16 = nullptr;
  228. *pcUTF16 = 0;
  229. if (cbUTF8 == 0 || (cbUTF8 == -1 && *pUTF8 == '\0')) {
  230. *ppUTF16 = new (std::nothrow) wchar_t[1];
  231. if (*ppUTF16 == nullptr)
  232. return false;
  233. (*ppUTF16)[0] = L'\0';
  234. *pcUTF16 = 1;
  235. return true;
  236. }
  237. int c = ::MultiByteToWideChar(CP_UTF8, MB_ERR_INVALID_CHARS, pUTF8, cbUTF8, nullptr, 0);
  238. if (c == 0)
  239. return false;
  240. // add space for null-terminator if we're not accounting for it
  241. if (cbUTF8 != -1)
  242. c += 1;
  243. wchar_t *p = new (std::nothrow) wchar_t[c];
  244. if (p == nullptr)
  245. return false;
  246. int converted = ::MultiByteToWideChar(CP_UTF8, MB_ERR_INVALID_CHARS,
  247. pUTF8, cbUTF8,
  248. p, c);
  249. (void)converted;
  250. DXASSERT(converted > 0, "otherwise contents have changed");
  251. p[c - 1] = L'\0';
  252. *ppUTF16 = p;
  253. *pcUTF16 = c;
  254. return true;
  255. }
  256. _Use_decl_annotations_
  257. bool UTF16BufferToUTF8Buffer(const wchar_t *pUTF16, int cUTF16, char **ppUTF8, size_t *pcUTF8) throw() {
  258. *ppUTF8 = nullptr;
  259. *pcUTF8 = 0;
  260. if (cUTF16 == 0 || (cUTF16 == -1 && *pUTF16 == '\0')) {
  261. *ppUTF8 = new (std::nothrow) char[1];
  262. if (*ppUTF8 == nullptr)
  263. return false;
  264. (*ppUTF8)[0] = '\0';
  265. *pcUTF8 = 1;
  266. return true;
  267. }
  268. int c1 = ::WideCharToMultiByte(CP_UTF8, // code page
  269. 0, // flags
  270. pUTF16, // string to convert
  271. cUTF16, // size, in chars, of string to convert
  272. nullptr, // output buffer
  273. 0, // size of output buffer
  274. nullptr, nullptr);
  275. if (c1 == 0)
  276. return false;
  277. // add space for null-terminator if we're not accounting for it
  278. if (cUTF16 != -1)
  279. c1 += 1;
  280. char *p = new (std::nothrow) char[c1];
  281. if (p == nullptr)
  282. return false;
  283. int converted = ::WideCharToMultiByte(CP_UTF8, 0,
  284. pUTF16, cUTF16,
  285. p, c1,
  286. nullptr, nullptr);
  287. (void)converted;
  288. DXASSERT(converted > 0, "otherwise contents have changed");
  289. p[c1 - 1] = '\0';
  290. *ppUTF8 = p;
  291. *pcUTF8 = c1;
  292. return true;
  293. }
  // Matches pName against pMask, where a mask ending in `star` is a prefix
  // pattern and any other mask must equal the name exactly.
  //   - Two empty strings match; if only one of them is empty there is no
  //     match (so a lone star does not match an empty name).
  //   - A mask consisting of just the star matches every non-empty name.
  // Only the last mask character is inspected for the star. Characters are
  // compared bytewise with memcmp.
  template<typename TChar>
  static
  bool IsStarMatchT(const TChar *pMask, size_t maskLen, const TChar *pName, size_t nameLen, TChar star) {
    const bool maskEmpty = (maskLen == 0);
    const bool nameEmpty = (nameLen == 0);
    if (maskEmpty || nameEmpty) {
      // Only "both empty" counts as a match.
      return maskEmpty && nameEmpty;
    }

    const bool isPrefixMask = (pMask[maskLen - 1] == star);
    if (!isPrefixMask) {
      // Exact match: same length and same contents.
      return nameLen == maskLen &&
             0 == memcmp(pMask, pName, sizeof(TChar) * nameLen);
    }

    // Prefix match: compare everything in front of the trailing star.
    const size_t prefixLen = maskLen - 1;
    if (prefixLen == 0) {
      return true; // mask is just the star
    }
    if (prefixLen > nameLen) {
      return false; // name is too short to contain the prefix
    }
    return 0 == memcmp(pMask, pName, sizeof(TChar) * prefixLen);
  }
  322. _Use_decl_annotations_
  323. bool IsStarMatchUTF8(const char *pMask, size_t maskLen, const char *pName, size_t nameLen) {
  324. return IsStarMatchT<char>(pMask, maskLen, pName, nameLen, '*');
  325. }
  326. _Use_decl_annotations_
  327. bool IsStarMatchUTF16(const wchar_t *pMask, size_t maskLen, const wchar_t *pName, size_t nameLen) {
  328. return IsStarMatchT<wchar_t>(pMask, maskLen, pName, nameLen, L'*');
  329. }
  330. } // namespace Unicode