CharUnicodeInfo.cs 13 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321
  1. // Licensed to the .NET Foundation under one or more agreements.
  2. // The .NET Foundation licenses this file to you under the MIT license.
  3. // See the LICENSE file in the project root for more information.
  4. using System.Buffers.Binary;
  5. using System.Diagnostics;
  6. using System.Text;
  7. using Internal.Runtime.CompilerServices;
  8. namespace System.Globalization
  9. {
  10. /// <summary>
  11. /// This class implements a set of methods for retrieving character type
  12. /// information. Character type information is independent of culture
  13. /// and region.
  14. /// </summary>
  15. public static partial class CharUnicodeInfo
  16. {
  /// <summary>First UTF-16 code unit of the high (leading) surrogate range.</summary>
  internal const char HIGH_SURROGATE_START = '\uD800';

  /// <summary>Last UTF-16 code unit of the high (leading) surrogate range.</summary>
  internal const char HIGH_SURROGATE_END = '\uDBFF';

  /// <summary>First UTF-16 code unit of the low (trailing) surrogate range.</summary>
  internal const char LOW_SURROGATE_START = '\uDC00';

  /// <summary>Last UTF-16 code unit of the low (trailing) surrogate range.</summary>
  internal const char LOW_SURROGATE_END = '\uDFFF';

  /// <summary>
  /// Largest offset from the start of a surrogate range that is still inside the range (0x3FF).
  /// The high and low ranges have the same width, so this value is used to test both.
  /// </summary>
  internal const int HIGH_SURROGATE_RANGE = HIGH_SURROGATE_END - HIGH_SURROGATE_START;

  /// <summary>Offset that selects the Unicode category byte in InternalGetCategoryValue.</summary>
  internal const int UNICODE_CATEGORY_OFFSET = 0;

  /// <summary>Offset that selects the bidi category byte in InternalGetCategoryValue.</summary>
  internal const int BIDI_CATEGORY_OFFSET = 1;

  // The first code point of Unicode plane 1. Plane 1 covers 0x010000 - 0x01ffff.
  internal const int UNICODE_PLANE01_START = 0x10000;
  /// <summary>
  /// Reads the UTF-32 value that starts at <paramref name="index"/> in <paramref name="s"/>.
  /// A high surrogate that is immediately followed by a low surrogate is combined into one
  /// supplementary code point; every other UTF-16 code unit is returned unchanged.
  /// This is similar to char.ConvertToUTF32, except that it does not throw when it meets
  /// an invalid surrogate sequence.
  ///
  /// WARNING: because unpaired surrogates are passed through, the result CAN be a value
  /// in the surrogate range D800-DFFF, which are not legal Unicode values.
  /// </summary>
  internal static int InternalConvertToUtf32(string s, int index)
  {
      Debug.Assert(s != null, "s != null");
      Debug.Assert(index >= 0 && index < s.Length, "index < s.Length");

      // A surrogate pair needs a following code unit, so the last char is never combined.
      if (index < s.Length - 1)
      {
          int highOffset = s[index] - HIGH_SURROGATE_START;

          // The unsigned comparison rejects both "below the range" (negative) and "above the range".
          if ((uint)highOffset <= HIGH_SURROGATE_RANGE)
          {
              int lowOffset = s[index + 1] - LOW_SURROGATE_START;
              if ((uint)lowOffset <= HIGH_SURROGATE_RANGE)
              {
                  // Each high surrogate covers 0x400 code points above plane 1's start.
                  return UNICODE_PLANE01_START + (highOffset * 0x400) + lowOffset;
              }
          }
      }

      return s[index];
  }
  /// <summary>
  /// StringBuilder counterpart of the string overload: reads the UTF-32 value that starts at
  /// <paramref name="index"/>, combining a high surrogate with a following low surrogate.
  /// Never throws for invalid surrogate sequences, so the result CAN be a value in the
  /// surrogate range D800-DFFF.
  /// </summary>
  internal static int InternalConvertToUtf32(StringBuilder s, int index)
  {
      Debug.Assert(s != null, "s != null");
      Debug.Assert(index >= 0 && index < s.Length, "index < s.Length");

      int first = s[index];

      // A surrogate pair needs a following code unit, so the last char is never combined.
      bool hasNext = index < s.Length - 1;
      if (hasNext)
      {
          int highOffset = first - HIGH_SURROGATE_START;

          // The unsigned comparison rejects both "below the range" (negative) and "above the range".
          if ((uint)highOffset <= HIGH_SURROGATE_RANGE)
          {
              int lowOffset = s[index + 1] - LOW_SURROGATE_START;
              if ((uint)lowOffset <= HIGH_SURROGATE_RANGE)
              {
                  // Each high surrogate covers 0x400 code points above plane 1's start.
                  return UNICODE_PLANE01_START + (highOffset * 0x400) + lowOffset;
              }
          }
      }

      return first;
  }
  /// <summary>
  /// Reads the UTF-32 value that starts at <paramref name="index"/> in <paramref name="s"/>
  /// and reports how many UTF-16 code units it occupies: <paramref name="charLength"/> is 2
  /// when a high surrogate followed by a low surrogate was combined, and 1 otherwise.
  /// WARNING: since it doesn't throw an exception it CAN return a value
  /// in the surrogate range D800-DFFF, which are not legal Unicode values.
  /// </summary>
  internal static int InternalConvertToUtf32(string s, int index, out int charLength)
  {
      Debug.Assert(s != null, "s != null");
      Debug.Assert(s.Length > 0, "s.Length > 0");
      Debug.Assert(index >= 0 && index < s.Length, "index >= 0 && index < s.Length");

      // Assume a single code unit until a complete surrogate pair is found.
      charLength = 1;

      if (index < s.Length - 1)
      {
          int highOffset = s[index] - HIGH_SURROGATE_START;

          // The unsigned comparison rejects both "below the range" (negative) and "above the range".
          if ((uint)highOffset <= HIGH_SURROGATE_RANGE)
          {
              int lowOffset = s[index + 1] - LOW_SURROGATE_START;
              if ((uint)lowOffset <= HIGH_SURROGATE_RANGE)
              {
                  // A valid pair consumes two code units.
                  charLength = 2;
                  return UNICODE_PLANE01_START + (highOffset * 0x400) + lowOffset;
              }
          }
      }

      return s[index];
  }
  /// <summary>
  /// Looks up the numeric value of the code point <paramref name="ch"/> in the three-level
  /// numeric index tables. This is called by the public char and (string, index) versions.
  /// Returns -1 when the code point lies beyond the range covered by the level 1 table.
  /// Note that a ch in the range D800-DFFF is simply looked up like any other
  /// non-numeric character.
  /// </summary>
  internal static double InternalGetNumericValue(int ch)
  {
      Debug.Assert(ch >= 0 && ch <= 0x10ffff, "ch is not in valid Unicode range.");

      // Bits 8 and above of ch select the level 1 entry.
      int level1Slot = ch >> 8;
      if ((uint)level1Slot >= (uint)NumericLevel1Index.Length)
      {
          return -1;
      }

      int level2Block = NumericLevel1Index[level1Slot];

      // Bits 4 - 7 of ch select the entry inside the level 2 block; that entry is the level 3 block.
      // Note that & has lower precedence than addition, so the parentheses are required.
      int level3Block = NumericLevel2Index[(level2Block << 4) + ((ch >> 4) & 0x000f)];

      // Bits 0 - 3 of ch select the entry inside the level 3 block.
      int valueSlot = NumericLevel3Index[(level3Block << 4) + (ch & 0x000f)];

      // Each entry of NumericValues is an 8-byte double, read without assuming alignment.
      ref var rawValue = ref Unsafe.AsRef(in NumericValues[valueSlot * 8]);

      if (!BitConverter.IsLittleEndian)
      {
          // On big-endian machines the bytes are swapped before being reinterpreted as a double.
          long swapped = BinaryPrimitives.ReverseEndianness(Unsafe.ReadUnaligned<long>(ref rawValue));
          return BitConverter.Int64BitsToDouble(swapped);
      }

      return Unsafe.ReadUnaligned<double>(ref rawValue);
  }
  /// <summary>
  /// Looks up one of the two digit values stored for the code point <paramref name="ch"/>.
  /// Callers in this class pass <paramref name="offset"/> 0 for the decimal digit value and
  /// 1 for the digit value. Returns 0xff when the code point lies beyond the range covered
  /// by the level 1 table; the public callers cast the result to sbyte.
  /// </summary>
  internal static byte InternalGetDigitValues(int ch, int offset)
  {
      Debug.Assert(ch >= 0 && ch <= 0x10ffff, "ch is not in valid Unicode range.");

      // Bits 8 and above of ch select the level 1 entry.
      int level1Slot = ch >> 8;
      if ((uint)level1Slot >= (uint)NumericLevel1Index.Length)
      {
          return 0xff;
      }

      int level2Block = NumericLevel1Index[level1Slot];

      // Bits 4 - 7 of ch select the entry inside the level 2 block; that entry is the level 3 block.
      // Note that & has lower precedence than addition, so the parentheses are required.
      int level3Block = NumericLevel2Index[(level2Block << 4) + ((ch >> 4) & 0x000f)];

      // Bits 0 - 3 of ch select the entry inside the level 3 block.
      int valueSlot = NumericLevel3Index[(level3Block << 4) + (ch & 0x000f)];

      // DigitValues holds two bytes per entry; offset picks one of them.
      return DigitValues[valueSlot * 2 + offset];
  }
  /// <summary>
  /// Returns the numeric value associated with the character <paramref name="ch"/>.
  /// If the character is a fraction, the return value will not be an
  /// integer. If the character does not have a numeric value, the return
  /// value is -1.
  /// </summary>
  public static double GetNumericValue(char ch) => InternalGetNumericValue(ch);
  /// <summary>
  /// Returns the numeric value of the character, or surrogate pair, that starts at
  /// <paramref name="index"/> in <paramref name="s"/>.
  /// </summary>
  /// <exception cref="ArgumentNullException"><paramref name="s"/> is null.</exception>
  /// <exception cref="ArgumentOutOfRangeException"><paramref name="index"/> is outside <paramref name="s"/>.</exception>
  public static double GetNumericValue(string s, int index)
  {
      if (s is null)
      {
          throw new ArgumentNullException(nameof(s));
      }

      // A single unsigned comparison covers both index < 0 and index >= s.Length.
      if ((uint)index >= (uint)s.Length)
      {
          throw new ArgumentOutOfRangeException(nameof(index), SR.ArgumentOutOfRange_Index);
      }

      int codePoint = InternalConvertToUtf32(s, index);
      return InternalGetNumericValue(codePoint);
  }
  /// <summary>
  /// Returns the decimal digit value stored for <paramref name="ch"/>. The stored byte is
  /// reinterpreted as a signed byte, so the 0xff produced by the lookup comes back as -1.
  /// </summary>
  public static int GetDecimalDigitValue(char ch) => (sbyte)InternalGetDigitValues(ch, 0);
  /// <summary>
  /// Returns the decimal digit value of the character, or surrogate pair, that starts at
  /// <paramref name="index"/> in <paramref name="s"/>.
  /// </summary>
  /// <exception cref="ArgumentNullException"><paramref name="s"/> is null.</exception>
  /// <exception cref="ArgumentOutOfRangeException"><paramref name="index"/> is outside <paramref name="s"/>.</exception>
  public static int GetDecimalDigitValue(string s, int index)
  {
      if (s is null)
      {
          throw new ArgumentNullException(nameof(s));
      }

      // A single unsigned comparison covers both index < 0 and index >= s.Length.
      if ((uint)index >= (uint)s.Length)
      {
          throw new ArgumentOutOfRangeException(nameof(index), SR.ArgumentOutOfRange_Index);
      }

      int codePoint = InternalConvertToUtf32(s, index);
      return (sbyte)InternalGetDigitValues(codePoint, 0);
  }
  /// <summary>
  /// Returns the digit value stored for <paramref name="ch"/>. The stored byte is
  /// reinterpreted as a signed byte, so the 0xff produced by the lookup comes back as -1.
  /// </summary>
  public static int GetDigitValue(char ch) => (sbyte)InternalGetDigitValues(ch, 1);
  /// <summary>
  /// Returns the digit value of the character, or surrogate pair, that starts at
  /// <paramref name="index"/> in <paramref name="s"/>.
  /// </summary>
  /// <exception cref="ArgumentNullException"><paramref name="s"/> is null.</exception>
  /// <exception cref="ArgumentOutOfRangeException"><paramref name="index"/> is outside <paramref name="s"/>.</exception>
  public static int GetDigitValue(string s, int index)
  {
      if (s is null)
      {
          throw new ArgumentNullException(nameof(s));
      }

      // A single unsigned comparison covers both index < 0 and index >= s.Length.
      if ((uint)index >= (uint)s.Length)
      {
          throw new ArgumentOutOfRangeException(nameof(index), SR.ArgumentOutOfRange_Index);
      }

      int codePoint = InternalConvertToUtf32(s, index);
      return (sbyte)InternalGetDigitValues(codePoint, 1);
  }
  /// <summary>
  /// Returns the Unicode category of the UTF-16 code unit <paramref name="ch"/>.
  /// </summary>
  public static UnicodeCategory GetUnicodeCategory(char ch)
  {
      // A char is always inside the table range, so the category byte is looked up directly.
      byte category = InternalGetCategoryValue(ch, UNICODE_CATEGORY_OFFSET);
      return (UnicodeCategory)category;
  }
  /// <summary>
  /// Returns the Unicode category of the character, or surrogate pair, that starts at
  /// <paramref name="index"/> in <paramref name="s"/>.
  /// </summary>
  /// <exception cref="ArgumentNullException"><paramref name="s"/> is null.</exception>
  /// <exception cref="ArgumentOutOfRangeException"><paramref name="index"/> is outside <paramref name="s"/>.</exception>
  public static UnicodeCategory GetUnicodeCategory(string s, int index)
  {
      if (s == null)
      {
          throw new ArgumentNullException(nameof(s));
      }

      // A single unsigned comparison covers both index < 0 and index >= s.Length.
      if (((uint)index) >= ((uint)s.Length))
      {
          // Carry the same message as the other public (string, index) overloads in this class.
          throw new ArgumentOutOfRangeException(nameof(index), SR.ArgumentOutOfRange_Index);
      }

      return InternalGetUnicodeCategory(s, index);
  }
  /// <summary>
  /// Returns the Unicode category of the UTF-32 code point <paramref name="codePoint"/>.
  /// </summary>
  /// <param name="codePoint">
  /// The code point to classify. Surrogate values (D800-DFFF) are looked up like any other value.
  /// </param>
  /// <returns>
  /// The category stored for the code point, or <see cref="UnicodeCategory.OtherNotAssigned"/>
  /// when <paramref name="codePoint"/> is negative or greater than 0x10FFFF.
  /// </returns>
  public static UnicodeCategory GetUnicodeCategory(int codePoint)
  {
      // InternalGetCategoryValue checks the range only with Debug.Assert, so without this
      // guard a release build would index the lookup tables with an unvalidated value.
      // The unsigned comparison rejects negative values as well.
      if ((uint)codePoint > 0x10ffff)
      {
          return UnicodeCategory.OtherNotAssigned;
      }

      return (UnicodeCategory)InternalGetCategoryValue(codePoint, UNICODE_CATEGORY_OFFSET);
  }
  /// <summary>
  /// Looks up one of the two category bytes stored for the code point <paramref name="ch"/>.
  /// Callers in this class pass UNICODE_CATEGORY_OFFSET or BIDI_CATEGORY_OFFSET as
  /// <paramref name="offset"/>.
  /// Note that this API will return values for D800-DFFF surrogate halves.
  /// </summary>
  internal static byte InternalGetCategoryValue(int ch, int offset)
  {
      Debug.Assert(ch >= 0 && ch <= 0x10ffff, "ch is not in valid Unicode range.");

      // Bits 9 and above of ch select the level 1 entry.
      int level1Value = CategoryLevel1Index[ch >> 9];

      // Bits 4 - 8 of ch select a 16-bit entry inside the level 2 block. (ch >> 3) & 0b111110
      // is that 5-bit entry number already doubled into a byte offset.
      // Note that & has lower precedence than addition, so the parentheses are required.
      int level2ByteOffset = (level1Value << 6) + ((ch >> 3) & 0b111110);
      int level3Block = Unsafe.ReadUnaligned<ushort>(ref Unsafe.AsRef(in CategoryLevel2Index[level2ByteOffset]));

      if (!BitConverter.IsLittleEndian)
      {
          // On big-endian machines the two bytes just read are swapped.
          level3Block = BinaryPrimitives.ReverseEndianness((ushort)level3Block);
      }

      // Bits 0 - 3 of ch select the entry inside the level 3 block.
      int valueSlot = CategoryLevel3Index[(level3Block << 4) + (ch & 0x000f)];

      // CategoriesValue holds two bytes per entry; offset picks one of them.
      return CategoriesValue[valueSlot * 2 + offset];
  }
  /// <summary>
  /// Returns the Unicode category of the character, or surrogate pair, that starts at
  /// <paramref name="index"/> in <paramref name="value"/>. Arguments are only checked
  /// with debug assertions.
  /// </summary>
  internal static UnicodeCategory InternalGetUnicodeCategory(string value, int index)
  {
      Debug.Assert(value != null, "value can not be null");
      Debug.Assert(index < value.Length, "index < value.Length");

      int codePoint = InternalConvertToUtf32(value, index);
      return GetUnicodeCategory(codePoint);
  }
  /// <summary>
  /// Returns the bidi category of the character, or surrogate pair, that starts at
  /// <paramref name="index"/> in <paramref name="s"/>.
  /// </summary>
  /// <exception cref="ArgumentNullException"><paramref name="s"/> is null.</exception>
  /// <exception cref="ArgumentOutOfRangeException"><paramref name="index"/> is outside <paramref name="s"/>.</exception>
  internal static BidiCategory GetBidiCategory(string s, int index)
  {
      if (s is null)
      {
          throw new ArgumentNullException(nameof(s));
      }

      if (index < 0 || index >= s.Length)
      {
          throw new ArgumentOutOfRangeException(nameof(index));
      }

      int codePoint = InternalConvertToUtf32(s, index);
      byte category = InternalGetCategoryValue(codePoint, BIDI_CATEGORY_OFFSET);
      return (BidiCategory)category;
  }
  /// <summary>
  /// Returns the bidi category of the character, or surrogate pair, that starts at
  /// <paramref name="index"/> in <paramref name="s"/>. Arguments are only checked
  /// with debug assertions.
  /// </summary>
  internal static BidiCategory GetBidiCategory(StringBuilder s, int index)
  {
      Debug.Assert(s != null, "s can not be null");
      Debug.Assert(index >= 0 && index < s.Length, "invalid index");

      int codePoint = InternalConvertToUtf32(s, index);
      byte category = InternalGetCategoryValue(codePoint, BIDI_CATEGORY_OFFSET);
      return (BidiCategory)category;
  }
  /// <summary>
  /// Gets the Unicode category of the character starting at <paramref name="index"/> and
  /// reports its length in UTF-16 code units. <paramref name="charLength"/> is 2 when the
  /// character is a valid surrogate pair and 1 otherwise.
  /// </summary>
  internal static UnicodeCategory InternalGetUnicodeCategory(string str, int index, out int charLength)
  {
      Debug.Assert(str != null, "str can not be null");
      Debug.Assert(str.Length > 0, "str.Length > 0");
      Debug.Assert(index >= 0 && index < str.Length, "index >= 0 && index < str.Length");

      int codePoint = InternalConvertToUtf32(str, index, out charLength);
      return GetUnicodeCategory(codePoint);
  }
  /// <summary>
  /// Returns true when <paramref name="uc"/> is one of the three mark categories:
  /// NonSpacingMark, SpacingCombiningMark or EnclosingMark.
  /// </summary>
  internal static bool IsCombiningCategory(UnicodeCategory uc)
  {
      Debug.Assert(uc >= 0, "uc >= 0");

      switch (uc)
      {
          case UnicodeCategory.NonSpacingMark:
          case UnicodeCategory.SpacingCombiningMark:
          case UnicodeCategory.EnclosingMark:
              return true;

          default:
              return false;
      }
  }
  284. }
  285. }