// CharUnicodeInfo.cs
  1. // Licensed to the .NET Foundation under one or more agreements.
  2. // The .NET Foundation licenses this file to you under the MIT license.
  3. // See the LICENSE file in the project root for more information.
  4. ////////////////////////////////////////////////////////////////////////////
  5. //
  6. //
  7. // Purpose: This class implements a set of methods for retrieving
  8. // character type information. Character type information is
  9. // independent of culture and region.
  10. //
  11. //
  12. ////////////////////////////////////////////////////////////////////////////
  13. using System.Buffers.Binary;
  14. using System.Diagnostics;
  15. using System.Text;
  16. using Internal.Runtime.CompilerServices;
  17. namespace System.Globalization
  18. {
  19. public static partial class CharUnicodeInfo
  20. {
  //--------------------------------------------------------------------//
  //                        Internal Information                        //
  //--------------------------------------------------------------------//

  // First and last UTF-16 code units of the high (leading) surrogate block.
  internal const char HIGH_SURROGATE_START = '\uD800';
  internal const char HIGH_SURROGATE_END = '\uDBFF';

  // First and last UTF-16 code units of the low (trailing) surrogate block.
  internal const char LOW_SURROGATE_START = '\uDC00';
  internal const char LOW_SURROGATE_END = '\uDFFF';

  // Largest offset from the start of a surrogate block that is still inside it (0x3FF).
  // The low block has the same width, so the same bound is used for both range checks.
  internal const int HIGH_SURROGATE_RANGE = HIGH_SURROGATE_END - HIGH_SURROGATE_START;

  // Values passed as the 'offset' argument of InternalGetCategoryValue to pick
  // the Unicode category or the bidi category out of a category entry.
  internal const int UNICODE_CATEGORY_OFFSET = 0;
  internal const int BIDI_CATEGORY_OFFSET = 1;

  // The first code point of Unicode plane 1. Plane 1 contains 0x010000 ~ 0x01FFFF.
  internal const int UNICODE_PLANE01_START = 0x10000;
  ////////////////////////////////////////////////////////////////////////
  //
  // Reads the UTF-32 value of the character that starts at s[index].
  //
  // When s[index] is a high surrogate that is immediately followed by a low
  // surrogate, the pair is combined into one supplementary code point.
  // In every other case the UTF-16 code unit at s[index] is returned as is.
  //
  // This is similar to char.ConvertToUtf32, except that it does not throw
  // when it meets an unpaired surrogate.
  //
  // WARNING: because it does not throw, it CAN return a value in the
  // surrogate range D800-DFFF, which is not a legal Unicode value.
  //
  ////////////////////////////////////////////////////////////////////////
  internal static int InternalConvertToUtf32(string s, int index)
  {
      Debug.Assert(s != null, "s != null");
      Debug.Assert(index >= 0 && index < s.Length, "index < s.Length");

      char first = s[index];

      // A pair needs a following code unit, so the last char is always returned unchanged.
      if (index >= s.Length - 1)
      {
          return first;
      }

      // Unsigned compare folds "below the block start" and "above the block end" into one test.
      int highOffset = first - HIGH_SURROGATE_START;
      if ((uint)highOffset > HIGH_SURROGATE_RANGE)
      {
          return first;
      }

      int lowOffset = s[index + 1] - LOW_SURROGATE_START;
      if ((uint)lowOffset > HIGH_SURROGATE_RANGE)
      {
          return first;
      }

      // Valid pair: combine both halves into the supplementary code point.
      return (highOffset * 0x400) + lowOffset + UNICODE_PLANE01_START;
  }
  // StringBuilder flavour of InternalConvertToUtf32: returns the UTF-32 value of the
  // character starting at s[index], combining a high/low surrogate pair when one is
  // present and otherwise returning the code unit at s[index] unchanged.
  // Like the string overload it never throws on unpaired surrogates, so the result
  // can be a value in D800-DFFF.
  internal static int InternalConvertToUtf32(StringBuilder s, int index)
  {
      Debug.Assert(s != null, "s != null");
      Debug.Assert(index >= 0 && index < s.Length, "index < s.Length");

      int codeUnit = s[index];
      int highOffset = codeUnit - HIGH_SURROGATE_START;

      // Only look at the next code unit when there is one and the current one is a high surrogate.
      bool mayStartPair = index < s.Length - 1 && (uint)highOffset <= HIGH_SURROGATE_RANGE;
      if (mayStartPair)
      {
          int lowOffset = s[index + 1] - LOW_SURROGATE_START;
          if ((uint)lowOffset <= HIGH_SURROGATE_RANGE)
          {
              // Valid pair: combine both halves into the supplementary code point.
              return (highOffset * 0x400) + lowOffset + UNICODE_PLANE01_START;
          }
      }

      return codeUnit;
  }
  ////////////////////////////////////////////////////////////////////////
  //
  // Convert the character or surrogate pair that starts at s[index] to its
  // UTF-32 value and report how many UTF-16 code units it occupies.
  //
  // Parameters:
  //   s           The string (must not be empty).
  //   index       The starting index. It can point to a BMP character or
  //               to the first half of a surrogate pair.
  //   charLength  [out] 2 when a high surrogate at index is followed by a
  //               low surrogate, 1 in every other case.
  //
  // WARNING: since it doesn't throw an exception it CAN return a value
  // in the surrogate range D800-DFFF, which are not legal unicode values.
  //
  // Returns:
  //   The UTF-32 value.
  //
  ////////////////////////////////////////////////////////////////////////
  internal static int InternalConvertToUtf32(string s, int index, out int charLength)
  {
      Debug.Assert(s != null, "s != null");
      Debug.Assert(s.Length > 0, "s.Length > 0");
      Debug.Assert(index >= 0 && index < s.Length, "index >= 0 && index < s.Length");

      bool hasFollowingChar = index < s.Length - 1;
      if (hasFollowingChar)
      {
          int highOffset = s[index] - HIGH_SURROGATE_START;
          int lowOffset = s[index + 1] - LOW_SURROGATE_START;

          bool isHigh = (uint)highOffset <= HIGH_SURROGATE_RANGE;
          bool isLow = (uint)lowOffset <= HIGH_SURROGATE_RANGE;
          if (isHigh && isLow)
          {
              // Convert the surrogate pair to UTF32; it spans two code units.
              charLength = 2;
              return (highOffset * 0x400) + lowOffset + UNICODE_PLANE01_START;
          }
      }

      // BMP character or unpaired surrogate: a single code unit, returned unchanged.
      charLength = 1;
      return s[index];
  }
  //
  // Shared lookup behind the public GetNumericValue overloads (char and string/index).
  //
  // Walks the three-level numeric index for ch and reads the matching 8-byte entry
  // of NumericValues as a double. Returns -1 when ch lies beyond the level 1 table.
  //
  // Note that a ch in the range D800-DFFF is treated like any other character here.
  //
  internal static double InternalGetNumericValue(int ch)
  {
      Debug.Assert(ch >= 0 && ch <= 0x10ffff, "ch is not in valid Unicode range.");

      // Bits 8 and above of ch select the level 1 entry.
      int level1Slot = ch >> 8;
      if ((uint)level1Slot >= (uint)NumericLevel1Index.Length)
      {
          return -1;
      }

      int level2Base = NumericLevel1Index[level1Slot];

      // Bits 4 - 7 of ch pick the entry inside the level 2 block; it is the base of the level 3 block.
      // Note that & has lower precedence than addition, hence the parentheses.
      int level3Base = NumericLevel2Index[(level2Base << 4) + ((ch >> 4) & 0x000f)];

      // Bits 0 - 3 of ch pick the final entry, which is the index of the value.
      int valueSlot = NumericLevel3Index[(level3Base << 4) + (ch & 0x000f)];

      // Each value occupies 8 bytes and is read without any alignment assumption.
      ref var rawValue = ref Unsafe.AsRef(in NumericValues[valueSlot * 8]);
      if (BitConverter.IsLittleEndian)
      {
          return Unsafe.ReadUnaligned<double>(ref rawValue);
      }

      // On big-endian machines the bytes have to be swapped before reinterpreting them.
      long swappedBits = BinaryPrimitives.ReverseEndianness(Unsafe.ReadUnaligned<long>(ref rawValue));
      return BitConverter.Int64BitsToDouble(swappedBits);
  }
  // Shared lookup behind GetDecimalDigitValue and GetDigitValue.
  //
  // Walks the same three-level numeric index as InternalGetNumericValue and returns
  // one byte of the two-byte DigitValues entry for ch. The callers in this class pass
  // offset 0 for the decimal digit value and offset 1 for the digit value.
  // Returns 0xff when ch lies beyond the level 1 table; the callers cast the result
  // to sbyte, which turns 0xff into -1.
  internal static byte InternalGetDigitValues(int ch, int offset)
  {
      Debug.Assert(ch >= 0 && ch <= 0x10ffff, "ch is not in valid Unicode range.");

      // Bits 8 and above of ch select the level 1 entry.
      int level1Slot = ch >> 8;
      if ((uint)level1Slot >= (uint)NumericLevel1Index.Length)
      {
          return 0xff;
      }

      int level2Base = NumericLevel1Index[level1Slot];

      // Bits 4 - 7 of ch pick the entry inside the level 2 block; it is the base of the level 3 block.
      // Note that & has lower precedence than addition, hence the parentheses.
      int level3Base = NumericLevel2Index[(level2Base << 4) + ((ch >> 4) & 0x000f)];

      // Bits 0 - 3 of ch pick the final entry, which is the index of the digit pair.
      int pairSlot = NumericLevel3Index[(level3Base << 4) + (ch & 0x000f)];

      return DigitValues[pairSlot * 2 + offset];
  }
  ////////////////////////////////////////////////////////////////////////
  //
  // Returns the numeric value associated with the character ch. If the
  // character is a fraction, the return value will not be an integer.
  //
  // Returns:
  //   The numeric value for the specified Unicode character, or -1 if the
  //   character does not have a numeric value.
  // Arguments:
  //   ch  a Unicode character
  // Exceptions:
  //   None from this overload; the value is passed straight to the lookup.
  //
  ////////////////////////////////////////////////////////////////////////
  public static double GetNumericValue(char ch) => InternalGetNumericValue(ch);
  // Returns the numeric value of the character (or surrogate pair) that starts at
  // s[index]; the result comes from InternalGetNumericValue.
  //
  // Exceptions:
  //   ArgumentNullException        s is null.
  //   ArgumentOutOfRangeException  index is negative or not less than s.Length.
  public static double GetNumericValue(string s, int index)
  {
      if (s is null)
      {
          throw new ArgumentNullException(nameof(s));
      }

      // A negative index becomes a huge unsigned value, so one compare covers both bounds.
      if ((uint)index >= (uint)s.Length)
      {
          throw new ArgumentOutOfRangeException(nameof(index), SR.ArgumentOutOfRange_Index);
      }

      int codePoint = InternalConvertToUtf32(s, index);
      return InternalGetNumericValue(codePoint);
  }
  // Returns the decimal digit value stored for ch (byte 0 of its digit entry).
  // The lookup byte is reinterpreted as signed, so its 0xff result is returned as -1.
  public static int GetDecimalDigitValue(char ch)
  {
      byte raw = InternalGetDigitValues(ch, 0);
      return (sbyte)raw;
  }
  // Returns the decimal digit value stored for the character (or surrogate pair)
  // that starts at s[index]. The lookup byte is reinterpreted as signed, so its
  // 0xff result is returned as -1.
  //
  // Exceptions:
  //   ArgumentNullException        s is null.
  //   ArgumentOutOfRangeException  index is negative or not less than s.Length.
  public static int GetDecimalDigitValue(string s, int index)
  {
      if (s is null)
      {
          throw new ArgumentNullException(nameof(s));
      }

      // A negative index becomes a huge unsigned value, so one compare covers both bounds.
      if ((uint)index >= (uint)s.Length)
      {
          throw new ArgumentOutOfRangeException(nameof(index), SR.ArgumentOutOfRange_Index);
      }

      int codePoint = InternalConvertToUtf32(s, index);
      byte raw = InternalGetDigitValues(codePoint, 0);
      return (sbyte)raw;
  }
  // Returns the digit value stored for ch (byte 1 of its digit entry).
  // The lookup byte is reinterpreted as signed, so its 0xff result is returned as -1.
  public static int GetDigitValue(char ch)
  {
      byte raw = InternalGetDigitValues(ch, 1);
      return (sbyte)raw;
  }
  // Returns the digit value stored for the character (or surrogate pair) that
  // starts at s[index]. The lookup byte is reinterpreted as signed, so its 0xff
  // result is returned as -1.
  //
  // Exceptions:
  //   ArgumentNullException        s is null.
  //   ArgumentOutOfRangeException  index is negative or not less than s.Length.
  public static int GetDigitValue(string s, int index)
  {
      if (s is null)
      {
          throw new ArgumentNullException(nameof(s));
      }

      // A negative index becomes a huge unsigned value, so one compare covers both bounds.
      if ((uint)index >= (uint)s.Length)
      {
          throw new ArgumentOutOfRangeException(nameof(index), SR.ArgumentOutOfRange_Index);
      }

      int codePoint = InternalConvertToUtf32(s, index);
      byte raw = InternalGetDigitValues(codePoint, 1);
      return (sbyte)raw;
  }
  // Returns the Unicode category of the UTF-16 code unit ch by forwarding it,
  // widened to int, to the code point overload.
  public static UnicodeCategory GetUnicodeCategory(char ch)
  {
      // Widen explicitly so the call binds to the int overload rather than back to this one.
      int codePoint = ch;
      return GetUnicodeCategory(codePoint);
  }
  // Returns the Unicode category of the character (or surrogate pair) that starts
  // at s[index].
  //
  // Exceptions:
  //   ArgumentNullException        s is null.
  //   ArgumentOutOfRangeException  index is negative or not less than s.Length.
  public static UnicodeCategory GetUnicodeCategory(string s, int index)
  {
      if (s is null)
      {
          throw new ArgumentNullException(nameof(s));
      }

      if (index < 0 || index >= s.Length)
      {
          throw new ArgumentOutOfRangeException(nameof(index));
      }

      return InternalGetUnicodeCategory(s, index);
  }
  // Returns the Unicode category of the given UTF-32 code point.
  //
  // Arguments:
  //   codePoint  A code point in the range 0x0000-0x10FFFF. Values in the surrogate
  //              range D800-DFFF are accepted and looked up like any other value.
  // Exceptions:
  //   ArgumentOutOfRangeException  codePoint is negative or greater than 0x10FFFF.
  public static UnicodeCategory GetUnicodeCategory(int codePoint)
  {
      // InternalGetCategoryValue only Debug.Asserts this range, so without an explicit
      // check a release build would index the lookup tables with an unvalidated value.
      // The unsigned compare rejects negative values and values above 0x10FFFF at once.
      if ((uint)codePoint > 0x10FFFF)
      {
          throw new ArgumentOutOfRangeException(nameof(codePoint));
      }

      return (UnicodeCategory)InternalGetCategoryValue(codePoint, UNICODE_CATEGORY_OFFSET);
  }
  ////////////////////////////////////////////////////////////////////////
  //
  // Action: Looks up the category entry for the code point ch through a
  //         three-level index and returns one byte of that entry.
  // Returns:
  //   The byte at position 'offset' of the two-byte CategoriesValue entry.
  //   Callers in this class cast it to UnicodeCategory or BidiCategory.
  // Arguments:
  //   ch      a Unicode code point (0 - 0x10ffff, checked by Debug.Assert only)
  //   offset  UNICODE_CATEGORY_OFFSET or BIDI_CATEGORY_OFFSET
  // Exceptions:
  //   None are thrown explicitly.
  //
  // Note that this API will return values for D800-DFFF surrogate halves.
  //
  ////////////////////////////////////////////////////////////////////////
  internal static byte InternalGetCategoryValue(int ch, int offset)
  {
      Debug.Assert(ch >= 0 && ch <= 0x10ffff, "ch is not in valid Unicode range.");

      // Bits 9 and above of ch select the level 1 entry.
      int level1 = CategoryLevel1Index[ch >> 9];

      // Level 2 holds 16-bit entries in blocks of 64 bytes. (ch >> 3) & 0b111110 is bits 4 - 8
      // of ch already multiplied by two, i.e. the byte offset of the entry inside its block.
      // Note that & has lower precedence than addition, hence the parentheses.
      int level2ByteOffset = (level1 << 6) + ((ch >> 3) & 0b111110);
      ushort level2 = Unsafe.ReadUnaligned<ushort>(ref Unsafe.AsRef(in CategoryLevel2Index[level2ByteOffset]));
      if (!BitConverter.IsLittleEndian)
      {
          // The entry was read in machine byte order; swap it on big-endian machines.
          level2 = BinaryPrimitives.ReverseEndianness(level2);
      }

      // Bits 0 - 3 of ch pick the final entry inside the level 3 block.
      int entry = CategoryLevel3Index[(level2 << 4) + (ch & 0x000f)];

      return CategoriesValue[entry * 2 + offset];
  }
  ////////////////////////////////////////////////////////////////////////
  //
  // Action: Returns the Unicode category of the character (or surrogate
  //         pair) that starts at value[index].
  // Returns:
  //   a value in the UnicodeCategory enum
  // Arguments:
  //   value  a Unicode string
  //   index  index into the specified string
  // Exceptions:
  //   None are thrown explicitly; the arguments are checked by Debug.Assert only.
  //
  ////////////////////////////////////////////////////////////////////////
  internal static UnicodeCategory InternalGetUnicodeCategory(string value, int index)
  {
      Debug.Assert(value != null, "value can not be null");
      Debug.Assert(index < value.Length, "index < value.Length");

      int codePoint = InternalConvertToUtf32(value, index);
      return GetUnicodeCategory(codePoint);
  }
  // Returns the bidi category of the character (or surrogate pair) that starts
  // at s[index].
  //
  // Exceptions:
  //   ArgumentNullException        s is null.
  //   ArgumentOutOfRangeException  index is negative or not less than s.Length.
  internal static BidiCategory GetBidiCategory(string s, int index)
  {
      if (s is null)
      {
          throw new ArgumentNullException(nameof(s));
      }

      if (index < 0 || index >= s.Length)
      {
          throw new ArgumentOutOfRangeException(nameof(index));
      }

      int codePoint = InternalConvertToUtf32(s, index);
      return (BidiCategory)InternalGetCategoryValue(codePoint, BIDI_CATEGORY_OFFSET);
  }
  // Returns the bidi category of the character (or surrogate pair) that starts
  // at s[index]. The arguments are checked by Debug.Assert only.
  internal static BidiCategory GetBidiCategory(StringBuilder s, int index)
  {
      Debug.Assert(s != null, "s can not be null");
      Debug.Assert(index >= 0 && index < s.Length, "invalid index");

      int codePoint = InternalConvertToUtf32(s, index);
      return (BidiCategory)InternalGetCategoryValue(codePoint, BIDI_CATEGORY_OFFSET);
  }
  ////////////////////////////////////////////////////////////////////////
  //
  // Get the Unicode category of the character starting at str[index] and
  // report its length in UTF-16 code units: charLength is 2 when index
  // points at a valid surrogate pair and 1 otherwise.
  //
  ////////////////////////////////////////////////////////////////////////
  internal static UnicodeCategory InternalGetUnicodeCategory(string str, int index, out int charLength)
  {
      Debug.Assert(str != null, "str can not be null");
      Debug.Assert(str.Length > 0, "str.Length > 0");
      Debug.Assert(index >= 0 && index < str.Length, "index >= 0 && index < str.Length");

      int codePoint = InternalConvertToUtf32(str, index, out charLength);
      return GetUnicodeCategory(codePoint);
  }
  // Returns true when uc is one of the three mark categories
  // (NonSpacingMark, SpacingCombiningMark, EnclosingMark) and false otherwise.
  internal static bool IsCombiningCategory(UnicodeCategory uc)
  {
      Debug.Assert(uc >= 0, "uc >= 0");

      switch (uc)
      {
          case UnicodeCategory.NonSpacingMark:
          case UnicodeCategory.SpacingCombiningMark:
          case UnicodeCategory.EnclosingMark:
              return true;

          default:
              return false;
      }
  }
  331. }
  332. }