// UnicodeUtility.cs (8.2 KB)
  // Licensed to the .NET Foundation under one or more agreements.
  // The .NET Foundation licenses this file to you under the MIT license.
  // See the LICENSE file in the project root for more information.
  using System.Runtime.CompilerServices;

  namespace System.Text
  {
      /// <summary>
      /// Arithmetic helpers for classifying Unicode code points and for converting
      /// between scalar values and their UTF-16 / UTF-8 representations.
      /// </summary>
      internal static class UnicodeUtility
      {
          /// <summary>
          /// U+FFFD, the Unicode replacement character.
          /// </summary>
          public const uint ReplacementChar = 0xFFFDu;

          /// <summary>
          /// Gets the index of the Unicode plane (0 through 16, inclusive) that holds
          /// <paramref name="codePoint"/>.
          /// </summary>
          /// <param name="codePoint">The code point to examine.</param>
          /// <returns>The bits of <paramref name="codePoint"/> above the low 16, as an <see cref="int"/>.</returns>
          public static int GetPlane(uint codePoint)
          {
              UnicodeDebug.AssertIsValidCodePoint(codePoint);

              // A plane covers 0x10000 code points, so the plane number is whatever
              // remains once the low 16 bits are discarded.
              uint planeIndex = codePoint >> 16;
              return (int)planeIndex;
          }

          /// <summary>
          /// Combines the two halves of a UTF-16 surrogate pair into the Unicode scalar
          /// value they encode.
          /// </summary>
          /// <param name="highSurrogateCodePoint">The leading (high) surrogate code point.</param>
          /// <param name="lowSurrogateCodePoint">The trailing (low) surrogate code point.</param>
          /// <returns>The scalar value represented by the pair.</returns>
          public static uint GetScalarFromUtf16SurrogatePair(uint highSurrogateCodePoint, uint lowSurrogateCodePoint)
          {
              UnicodeDebug.AssertIsHighSurrogateCodePoint(highSurrogateCodePoint);
              UnicodeDebug.AssertIsLowSurrogateCodePoint(lowSurrogateCodePoint);

              // Per the Unicode specification, Table 3-5: strip the D800 marker from the
              // high surrogate, strip the DC00 marker from the low surrogate, and undo
              // the "wwww = uuuuu - 1" adjustment (i.e. add 0x10000). All three fix-ups
              // are folded into one compile-time constant, so the run-time work is only
              // a shift, an add and a subtract.
              const uint CombinedMarkerBias = (0xD800u << 10) + 0xDC00u - (1u << 16);

              uint rawCombined = (highSurrogateCodePoint << 10) + lowSurrogateCodePoint;
              return rawCombined - CombinedMarkerBias;
          }

          /// <summary>
          /// Computes how many UTF-16 code units are needed to encode a Unicode scalar value.
          /// </summary>
          /// <param name="value">The scalar value to measure.</param>
          /// <returns>1 when <paramref name="value"/> is below 0x10000; otherwise 2.</returns>
          public static int GetUtf16SequenceLength(uint value)
          {
              UnicodeDebug.AssertIsValidScalar(value);

              // Relies on unsigned wrap-around: below 0x10000 the subtraction wraps and
              // leaves 0xFF in the top byte; otherwise the top byte is 0x00.
              uint shifted = value - 0x10000u;

              // Adding 2 to the top byte turns 0xFF into 0x01 and 0x00 into 0x02.
              uint tagged = shifted + (2u << 24);

              // The top byte is now the answer.
              return (int)(tagged >> 24);
          }

          /// <summary>
          /// Splits a supplementary-plane (astral) Unicode scalar into its UTF-16 high and
          /// low surrogate code units.
          /// </summary>
          /// <param name="value">The supplementary-plane scalar value to split.</param>
          /// <param name="highSurrogateCodePoint">Receives the high (leading) surrogate.</param>
          /// <param name="lowSurrogateCodePoint">Receives the low (trailing) surrogate.</param>
          [MethodImpl(MethodImplOptions.AggressiveInlining)]
          public static void GetUtf16SurrogatesFromSupplementaryPlaneScalar(uint value, out char highSurrogateCodePoint, out char lowSurrogateCodePoint)
          {
              UnicodeDebug.AssertIsValidSupplementaryPlaneScalar(value);

              // Per the Unicode specification, Table 3-5. Since 0x40 << 10 == 0x10000,
              // taking 0x40 off the D800 marker before shifting is the same as removing
              // 0x10000 from the scalar before extracting its upper ten payload bits.
              const uint HighMarkerBias = (0xD800u - 0x40u) << 10;

              uint leading = (value + HighMarkerBias) >> 10;
              uint trailing = (value & 0x3FFu) + 0xDC00u;

              highSurrogateCodePoint = (char)leading;
              lowSurrogateCodePoint = (char)trailing;
          }

          /// <summary>
          /// Computes how many UTF-8 code units are needed to encode a Unicode scalar value.
          /// </summary>
          /// <param name="value">The scalar value to measure.</param>
          /// <returns>A count from 1 through 4.</returns>
          public static int GetUtf8SequenceLength(uint value)
          {
              UnicodeDebug.AssertIsValidScalar(value);

              // UTF-8 lengths by range:
              //   U+0000..U+007F  => 1 code unit
              //   U+0080..U+07FF  => 2 code units
              //   U+0800..U+FFFF  => 3 code units
              //   U+10000+        => 4 code units
              //
              // The computation below covers every valid scalar without branching.

              // -1 when the input is below 0x800 (a 1- or 2-unit sequence); 0 otherwise.
              int shortFormAdjust = ((int)value - 0x0800) >> 31;

              // XORing with 0xF800 rearranges the table so that the 1- and 3-unit
              // ranges sit next to each other and can be tested together:
              //   U+0000..U+F7FF  => 3 code units
              //   U+F800..U+F87F  => 1 code unit
              //   U+F880..U+FFFF  => 2 code units
              //   U+10000+        => 4 code units
              uint bucket = value ^ 0xF800u;

              // Relies on unsigned wrap-around: for 1- or 3-unit inputs the subtraction
              // wraps and the top byte becomes 0xFF; otherwise the top byte is 0x00.
              bucket -= 0xF880u;

              // Adding 4 to the top byte turns 0xFF into 0x03 and 0x00 into 0x04.
              bucket += 4u << 24;

              int baseLength = (int)(bucket >> 24);

              // Final result:
              //   U+0000..U+007F  => 3 + (-1) * 2 = 1
              //   U+0080..U+07FF  => 4 + (-1) * 2 = 2
              //   U+0800..U+FFFF  => 3 + ( 0) * 2 = 3
              //   U+10000+        => 4 + ( 0) * 2 = 4
              return baseLength + (shortFormAdjust * 2);
          }

          /// <summary>
          /// Tests whether <paramref name="value"/> lies in the ASCII range
          /// [ U+0000..U+007F ].
          /// </summary>
          /// <remarks>
          /// Per http://www.unicode.org/glossary/#ASCII, ASCII is only U+0000..U+007F.
          /// </remarks>
          /// <param name="value">The code point to test.</param>
          /// <returns><see langword="true"/> iff <paramref name="value"/> is at most 0x7F.</returns>
          [MethodImpl(MethodImplOptions.AggressiveInlining)]
          public static bool IsAsciiCodePoint(uint value)
          {
              return value < 0x80u;
          }

          /// <summary>
          /// Tests whether <paramref name="value"/> lies in the Basic Multilingual
          /// Plane (BMP).
          /// </summary>
          /// <param name="value">The code point to test.</param>
          /// <returns><see langword="true"/> iff <paramref name="value"/> is at most 0xFFFF.</returns>
          [MethodImpl(MethodImplOptions.AggressiveInlining)]
          public static bool IsBmpCodePoint(uint value)
          {
              return value < 0x10000u;
          }

          /// <summary>
          /// Tests whether <paramref name="value"/> is a UTF-16 high surrogate code point,
          /// i.e. lies in [ U+D800..U+DBFF ], inclusive.
          /// </summary>
          /// <param name="value">The code point to test.</param>
          /// <returns><see langword="true"/> iff <paramref name="value"/> is a high surrogate.</returns>
          [MethodImpl(MethodImplOptions.AggressiveInlining)]
          public static bool IsHighSurrogateCodePoint(uint value)
          {
              return IsInRangeInclusive(value, lowerBound: 0xD800u, upperBound: 0xDBFFu);
          }

          /// <summary>
          /// Tests whether <paramref name="value"/> lies between
          /// <paramref name="lowerBound"/> and <paramref name="upperBound"/>, inclusive.
          /// </summary>
          /// <param name="value">The value to test.</param>
          /// <param name="lowerBound">The smallest accepted value.</param>
          /// <param name="upperBound">The largest accepted value.</param>
          /// <returns><see langword="true"/> iff <paramref name="value"/> is inside the range.</returns>
          [MethodImpl(MethodImplOptions.AggressiveInlining)]
          public static bool IsInRangeInclusive(uint value, uint lowerBound, uint upperBound)
          {
              // One unsigned comparison covers both ends: a value below the lower bound
              // wraps around to a large offset and so fails the test as well.
              uint offsetFromLower = value - lowerBound;
              uint rangeWidth = upperBound - lowerBound;
              return offsetFromLower <= rangeWidth;
          }

          /// <summary>
          /// Tests whether <paramref name="value"/> is a UTF-16 low surrogate code point,
          /// i.e. lies in [ U+DC00..U+DFFF ], inclusive.
          /// </summary>
          /// <param name="value">The code point to test.</param>
          /// <returns><see langword="true"/> iff <paramref name="value"/> is a low surrogate.</returns>
          [MethodImpl(MethodImplOptions.AggressiveInlining)]
          public static bool IsLowSurrogateCodePoint(uint value)
          {
              return IsInRangeInclusive(value, lowerBound: 0xDC00u, upperBound: 0xDFFFu);
          }

          /// <summary>
          /// Tests whether <paramref name="value"/> is any UTF-16 surrogate code point,
          /// i.e. lies in [ U+D800..U+DFFF ], inclusive.
          /// </summary>
          /// <param name="value">The code point to test.</param>
          /// <returns><see langword="true"/> iff <paramref name="value"/> is a surrogate.</returns>
          [MethodImpl(MethodImplOptions.AggressiveInlining)]
          public static bool IsSurrogateCodePoint(uint value)
          {
              return IsInRangeInclusive(value, lowerBound: 0xD800u, upperBound: 0xDFFFu);
          }

          /// <summary>
          /// Tests whether <paramref name="codePoint"/> is a valid Unicode code point,
          /// i.e. lies in [ U+0000..U+10FFFF ], inclusive.
          /// </summary>
          /// <param name="codePoint">The code point to test.</param>
          /// <returns><see langword="true"/> iff <paramref name="codePoint"/> is at most 0x10FFFF.</returns>
          [MethodImpl(MethodImplOptions.AggressiveInlining)]
          public static bool IsValidCodePoint(uint codePoint)
          {
              return codePoint < 0x110000u;
          }

          /// <summary>
          /// Tests whether <paramref name="value"/> is a valid Unicode scalar value,
          /// i.e. lies in [ U+0000..U+D7FF ] or [ U+E000..U+10FFFF ], both inclusive.
          /// </summary>
          /// <param name="value">The value to test.</param>
          /// <returns><see langword="true"/> iff <paramref name="value"/> is a scalar value.</returns>
          [MethodImpl(MethodImplOptions.AggressiveInlining)]
          public static bool IsValidUnicodeScalar(uint value)
          {
              // XORing with 0xD800 relocates the surrogate block to [ U+0000..U+07FF ]
              // and gathers every valid scalar into the one contiguous range
              // [ U+0800..U+10FFFF ], so a single range check is enough.
              uint remapped = value ^ 0xD800u;
              return IsInRangeInclusive(remapped, lowerBound: 0x800u, upperBound: 0x10FFFFu);
          }
      }
  }