Unicode.cpp 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350
  1. ///////////////////////////////////////////////////////////////////////////////
  2. // //
  3. // Unicode.cpp //
  4. // Copyright (C) Microsoft Corporation. All rights reserved. //
  5. // This file is distributed under the University of Illinois Open Source //
  6. // License. See LICENSE.TXT for details. //
  7. // //
  8. // Provides utility functions to work with Unicode and other encodings. //
  9. // //
  10. ///////////////////////////////////////////////////////////////////////////////
  11. #ifdef _WIN32
  12. #include <specstrings.h>
  13. #endif
  14. #include <string>
  15. #include "dxc/Support/Global.h"
  16. #include "dxc/Support/Unicode.h"
  17. #include "dxc/Support/WinIncludes.h"
  18. #ifndef _WIN32
  19. // MultiByteToWideChar which is a Windows-specific method.
  20. // This is a very simplistic implementation for non-Windows platforms. This
  21. // implementation completely ignores CodePage and dwFlags.
  22. int MultiByteToWideChar(uint32_t /*CodePage*/, uint32_t /*dwFlags*/,
  23. const char *lpMultiByteStr, int cbMultiByte,
  24. wchar_t *lpWideCharStr, int cchWideChar) {
  25. if (cbMultiByte == 0) {
  26. SetLastError(ERROR_INVALID_PARAMETER);
  27. return 0;
  28. }
  29. // if cbMultiByte is -1, it indicates that lpMultiByteStr is null-terminated
  30. // and the entire string should be processed.
  31. if (cbMultiByte == -1) {
  32. for (cbMultiByte = 0; lpMultiByteStr[cbMultiByte] != '\0'; ++cbMultiByte)
  33. ;
  34. // Add 1 for the null-terminating character.
  35. ++cbMultiByte;
  36. }
  37. // if zero is given as the destination size, this function should
  38. // return the required size (including the null-terminating character).
  39. if (cchWideChar == 0) {
  40. wchar_t *tempStr = (wchar_t *)malloc(cbMultiByte * sizeof(wchar_t));
  41. size_t requiredSize = mbstowcs(tempStr, lpMultiByteStr, cbMultiByte);
  42. free(tempStr);
  43. if (requiredSize == (size_t)cbMultiByte) return requiredSize;
  44. return requiredSize + 1;
  45. }
  46. if (cchWideChar < cbMultiByte) {
  47. SetLastError(ERROR_INSUFFICIENT_BUFFER);
  48. return 0;
  49. }
  50. size_t rv = mbstowcs(lpWideCharStr, lpMultiByteStr, cbMultiByte);
  51. if (rv == (size_t)cbMultiByte) return rv;
  52. return rv + 1; // mbstowcs excludes the terminating character
  53. }
  54. // WideCharToMultiByte is a Windows-specific method.
  55. // This is a very simplistic implementation for non-Windows platforms. This
  56. // implementation completely ignores CodePage and dwFlags.
  57. int WideCharToMultiByte(uint32_t /*CodePage*/, uint32_t /*dwFlags*/,
  58. const wchar_t *lpWideCharStr, int cchWideChar,
  59. char *lpMultiByteStr, int cbMultiByte,
  60. const char * /*lpDefaultChar*/,
  61. bool * /*lpUsedDefaultChar*/) {
  62. // if cchWideChar is -1, it indicates that lpWideCharStr is null-terminated
  63. // and the entire string should be processed.
  64. if (cchWideChar == 0) {
  65. SetLastError(ERROR_INVALID_PARAMETER);
  66. return 0;
  67. }
  68. if (cchWideChar == -1) {
  69. for (cchWideChar = 0; lpWideCharStr[cchWideChar] != '\0'; ++cchWideChar)
  70. ;
  71. // Add 1 for the null-terminating character.
  72. ++cchWideChar;
  73. }
  74. // if zero is given as the destination size, this function should
  75. // return the required size (including the null-terminating character).
  76. if (cbMultiByte == 0) {
  77. char *tempStr = (char *)malloc(cchWideChar * sizeof(char));
  78. size_t requiredSize = wcstombs(tempStr, lpWideCharStr, cchWideChar);
  79. free(tempStr);
  80. if (requiredSize == (size_t)cchWideChar) return requiredSize;
  81. return requiredSize + 1;
  82. }
  83. if (cbMultiByte < cchWideChar) {
  84. SetLastError(ERROR_INSUFFICIENT_BUFFER);
  85. return 0;
  86. }
  87. size_t rv = wcstombs(lpMultiByteStr, lpWideCharStr, cchWideChar);
  88. if (rv == (size_t)cchWideChar) return rv;
  89. return rv + 1; // mbstowcs excludes the terminating character
  90. }
  91. #endif // _WIN32
  92. namespace Unicode {
  93. _Success_(return != false)
  94. bool UTF16ToEncodedString(_In_z_ const wchar_t* text, DWORD cp, DWORD flags, _Inout_ std::string* pValue, _Out_opt_ bool* lossy) {
  95. BOOL usedDefaultChar;
  96. LPBOOL pUsedDefaultChar = (lossy == nullptr) ? nullptr : &usedDefaultChar;
  97. size_t cUTF16 = wcslen(text);
  98. if (lossy != nullptr) *lossy = false;
  99. // Handle zero-length as a special case; it's a special value to indicate errors in WideCharToMultiByte.
  100. if (cUTF16 == 0) {
  101. pValue->resize(0);
  102. DXASSERT(lossy == nullptr || *lossy == false, "otherwise earlier initialization in this function was updated");
  103. return true;
  104. }
  105. int cbUTF8 = ::WideCharToMultiByte(cp, flags, text, cUTF16, nullptr, 0, nullptr, pUsedDefaultChar);
  106. if (cbUTF8 == 0)
  107. return false;
  108. pValue->resize(cbUTF8);
  109. cbUTF8 = ::WideCharToMultiByte(cp, flags, text, cUTF16, &(*pValue)[0], pValue->size(), nullptr, pUsedDefaultChar);
  110. DXASSERT(cbUTF8 > 0, "otherwise contents have changed");
  111. DXASSERT((*pValue)[pValue->size()] == '\0', "otherwise string didn't null-terminate after resize() call");
  112. if (lossy != nullptr) *lossy = usedDefaultChar;
  113. return true;
  114. }
  115. _Use_decl_annotations_
  116. bool UTF8ToUTF16String(const char *pUTF8, std::wstring *pUTF16) {
  117. size_t cbUTF8 = (pUTF8 == nullptr) ? 0 : strlen(pUTF8);
  118. return UTF8ToUTF16String(pUTF8, cbUTF8, pUTF16);
  119. }
  120. _Use_decl_annotations_
  121. bool UTF8ToUTF16String(const char *pUTF8, size_t cbUTF8, std::wstring *pUTF16) {
  122. DXASSERT_NOMSG(pUTF16 != nullptr);
  123. // Handle zero-length as a special case; it's a special value to indicate
  124. // errors in MultiByteToWideChar.
  125. if (cbUTF8 == 0) {
  126. pUTF16->resize(0);
  127. return true;
  128. }
  129. int cUTF16 = ::MultiByteToWideChar(CP_UTF8, MB_ERR_INVALID_CHARS, pUTF8,
  130. cbUTF8, nullptr, 0);
  131. if (cUTF16 == 0)
  132. return false;
  133. pUTF16->resize(cUTF16);
  134. cUTF16 = ::MultiByteToWideChar(CP_UTF8, MB_ERR_INVALID_CHARS, pUTF8, cbUTF8,
  135. &(*pUTF16)[0], pUTF16->size());
  136. DXASSERT(cUTF16 > 0, "otherwise contents changed");
  137. DXASSERT((*pUTF16)[pUTF16->size()] == L'\0',
  138. "otherwise wstring didn't null-terminate after resize() call");
  139. return true;
  140. }
  141. std::wstring UTF8ToUTF16StringOrThrow(_In_z_ const char *pUTF8) {
  142. std::wstring result;
  143. if (!UTF8ToUTF16String(pUTF8, &result)) {
  144. throw hlsl::Exception(DXC_E_STRING_ENCODING_FAILED);
  145. }
  146. return result;
  147. }
  148. _Use_decl_annotations_
  149. bool UTF8ToConsoleString(_In_z_ const char* text, _Inout_ std::string* pValue, _Out_opt_ bool* lossy) {
  150. DXASSERT_NOMSG(text != nullptr);
  151. DXASSERT_NOMSG(pValue != nullptr);
  152. std::wstring text16;
  153. if (lossy != nullptr) *lossy = false;
  154. if (!UTF8ToUTF16String(text, &text16)) {
  155. return false;
  156. }
  157. return UTF16ToConsoleString(text16.c_str(), pValue, lossy);
  158. }
  159. _Use_decl_annotations_
  160. bool UTF16ToConsoleString(const wchar_t* text, std::string* pValue, bool* lossy) {
  161. DXASSERT_NOMSG(text != nullptr);
  162. DXASSERT_NOMSG(pValue != nullptr);
  163. UINT cp = GetConsoleOutputCP();
  164. return UTF16ToEncodedString(text, cp, 0, pValue, lossy);
  165. }
  166. _Use_decl_annotations_
  167. bool UTF16ToUTF8String(const wchar_t *pUTF16, std::string *pUTF8) {
  168. DXASSERT_NOMSG(pUTF16 != nullptr);
  169. DXASSERT_NOMSG(pUTF8 != nullptr);
  170. return UTF16ToEncodedString(pUTF16, CP_UTF8, 0, pUTF8, nullptr);
  171. }
  172. std::string UTF16ToUTF8StringOrThrow(_In_z_ const wchar_t *pUTF16) {
  173. std::string result;
  174. if (!UTF16ToUTF8String(pUTF16, &result)) {
  175. throw hlsl::Exception(DXC_E_STRING_ENCODING_FAILED);
  176. }
  177. return result;
  178. }
  179. _Use_decl_annotations_
  180. bool UTF8BufferToUTF16ComHeap(const char *pUTF8, wchar_t **ppUTF16) throw() {
  181. *ppUTF16 = nullptr;
  182. int c = ::MultiByteToWideChar(CP_UTF8, MB_ERR_INVALID_CHARS, pUTF8, -1,
  183. nullptr, 0);
  184. if (c == 0)
  185. return false;
  186. CComHeapPtr<wchar_t> p;
  187. if (!p.Allocate(c))
  188. return false;
  189. DXVERIFY_NOMSG(0 < ::MultiByteToWideChar(CP_UTF8, MB_ERR_INVALID_CHARS, pUTF8,
  190. -1, p.m_pData, c));
  191. *ppUTF16 = p.Detach();
  192. return true;
  193. }
  194. _Use_decl_annotations_
  195. bool UTF8BufferToUTF16Buffer(const char *pUTF8, int cbUTF8, wchar_t **ppUTF16, size_t *pcUTF16) throw() {
  196. *ppUTF16 = nullptr;
  197. *pcUTF16 = 0;
  198. if (cbUTF8 == 0 || (cbUTF8 == -1 && *pUTF8 == '\0')) {
  199. *ppUTF16 = new (std::nothrow) wchar_t[1];
  200. if (*ppUTF16 == nullptr)
  201. return false;
  202. (*ppUTF16)[0] = L'\0';
  203. *pcUTF16 = 1;
  204. return true;
  205. }
  206. int c = ::MultiByteToWideChar(CP_UTF8, MB_ERR_INVALID_CHARS, pUTF8, cbUTF8, nullptr, 0);
  207. if (c == 0)
  208. return false;
  209. // add space for null-terminator if we're not accounting for it
  210. if (cbUTF8 != -1)
  211. c += 1;
  212. wchar_t *p = new (std::nothrow) wchar_t[c];
  213. if (p == nullptr)
  214. return false;
  215. int converted = ::MultiByteToWideChar(CP_UTF8, MB_ERR_INVALID_CHARS,
  216. pUTF8, cbUTF8,
  217. p, c);
  218. (void)converted;
  219. DXASSERT(converted > 0, "otherwise contents have changed");
  220. p[c - 1] = L'\0';
  221. *ppUTF16 = p;
  222. *pcUTF16 = c;
  223. return true;
  224. }
  225. _Use_decl_annotations_
  226. bool UTF16BufferToUTF8Buffer(const wchar_t *pUTF16, int cUTF16, char **ppUTF8, size_t *pcUTF8) throw() {
  227. *ppUTF8 = nullptr;
  228. *pcUTF8 = 0;
  229. if (cUTF16 == 0 || (cUTF16 == -1 && *pUTF16 == '\0')) {
  230. *ppUTF8 = new (std::nothrow) char[1];
  231. if (*ppUTF8 == nullptr)
  232. return false;
  233. (*ppUTF8)[0] = '\0';
  234. *pcUTF8 = 1;
  235. return true;
  236. }
  237. int c1 = ::WideCharToMultiByte(CP_UTF8, // code page
  238. 0, // flags
  239. pUTF16, // string to convert
  240. cUTF16, // size, in chars, of string to convert
  241. nullptr, // output buffer
  242. 0, // size of output buffer
  243. nullptr, nullptr);
  244. if (c1 == 0)
  245. return false;
  246. // add space for null-terminator if we're not accounting for it
  247. if (cUTF16 != -1)
  248. c1 += 1;
  249. char *p = new (std::nothrow) char[c1];
  250. if (p == nullptr)
  251. return false;
  252. int converted = ::WideCharToMultiByte(CP_UTF8, 0,
  253. pUTF16, cUTF16,
  254. p, c1,
  255. nullptr, nullptr);
  256. (void)converted;
  257. DXASSERT(converted > 0, "otherwise contents have changed");
  258. p[c1 - 1] = '\0';
  259. *ppUTF8 = p;
  260. *pcUTF8 = c1;
  261. return true;
  262. }
  // Matches a name against a mask whose only wildcard is a trailing `star`.
  //
  //   - Two empty strings match; if exactly one side is empty there is no
  //     match (this check runs before the wildcard is considered).
  //   - A mask ending in `star` matches any name that begins with the
  //     characters preceding the star; a mask that is only the star matches
  //     every non-empty name.
  //   - Any other mask must equal the name exactly.
  //
  // Lengths are counted in TChar units; the comparison is a raw memcmp.
  template <typename TChar>
  static bool IsStarMatchT(const TChar *pMask, size_t maskLen,
                           const TChar *pName, size_t nameLen, TChar star) {
    const bool maskEmpty = (maskLen == 0);
    const bool nameEmpty = (nameLen == 0);
    if (maskEmpty || nameEmpty) {
      return maskEmpty && nameEmpty;
    }

    const bool endsWithStar = (pMask[maskLen - 1] == star);
    if (!endsWithStar) {
      // Exact match: same length and same contents.
      return nameLen == maskLen &&
             memcmp(pMask, pName, sizeof(TChar) * nameLen) == 0;
    }

    // Prefix match against everything in front of the star.
    const size_t prefixLen = maskLen - 1;
    if (prefixLen == 0) {
      return true; // The mask is just the star.
    }
    if (prefixLen > nameLen) {
      return false; // The name is too short to contain the prefix.
    }
    return memcmp(pMask, pName, sizeof(TChar) * prefixLen) == 0;
  }
  291. _Use_decl_annotations_
  292. bool IsStarMatchUTF8(const char *pMask, size_t maskLen, const char *pName, size_t nameLen) {
  293. return IsStarMatchT<char>(pMask, maskLen, pName, nameLen, '*');
  294. }
  295. _Use_decl_annotations_
  296. bool IsStarMatchUTF16(const wchar_t *pMask, size_t maskLen, const wchar_t *pName, size_t nameLen) {
  297. return IsStarMatchT<wchar_t>(pMask, maskLen, pName, nameLen, L'*');
  298. }
  299. } // namespace Unicode