Utf16Utility.cs 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215
  // Licensed to the .NET Foundation under one or more agreements.
  // The .NET Foundation licenses this file to you under the MIT license.
  // See the LICENSE file in the project root for more information.
  4. using System.Runtime.CompilerServices;
  5. using System.Diagnostics;
  6. namespace System.Text
  7. {
  /// <summary>
  /// Branchless helpers that inspect or transform two (UInt32) or four (UInt64) UTF-16
  /// code units packed into a single integer in machine endianness. Each 16-bit lane of
  /// the integer is referred to as a "word" below.
  /// </summary>
  internal static partial class Utf16Utility
  {
      /// <summary>
      /// Returns true iff the UInt32 represents two ASCII UTF-16 characters in machine endianness.
      /// </summary>
      /// <param name="value">Two packed UTF-16 code units.</param>
      /// <returns>True when no bit outside the low 7 bits of either word is set.</returns>
      [MethodImpl(MethodImplOptions.AggressiveInlining)]
      internal static bool AllCharsInUInt32AreAscii(uint value)
      {
          return (value & ~0x007F_007Fu) == 0;
      }

      /// <summary>
      /// Returns true iff the UInt64 represents four ASCII UTF-16 characters in machine endianness.
      /// </summary>
      /// <param name="value">Four packed UTF-16 code units.</param>
      /// <returns>True when no bit outside the low 7 bits of any word is set.</returns>
      [MethodImpl(MethodImplOptions.AggressiveInlining)]
      internal static bool AllCharsInUInt64AreAscii(ulong value)
      {
          return (value & ~0x007F_007F_007F_007Ful) == 0;
      }

      /// <summary>
      /// Given a UInt32 that represents two ASCII UTF-16 characters, returns the invariant
      /// lowercase representation of those characters. Requires the input value to contain
      /// two ASCII UTF-16 characters in machine endianness.
      /// </summary>
      /// <param name="value">Two packed ASCII UTF-16 code units.</param>
      /// <returns>The input with every [A-Z] word replaced by its [a-z] counterpart.</returns>
      /// <remarks>
      /// This is a branchless implementation.
      /// </remarks>
      [MethodImpl(MethodImplOptions.AggressiveInlining)]
      internal static uint ConvertAllAsciiCharsInUInt32ToLowercase(uint value)
      {
          // ASSUMPTION: Caller has validated that input value is ASCII.
          Debug.Assert(AllCharsInUInt32AreAscii(value));

          // the 0x80 bit of each word of 'lowerIndicator' will be set iff the word has value >= 'A'
          uint lowerIndicator = value + 0x0080_0080u - 0x0041_0041u;

          // the 0x80 bit of each word of 'upperIndicator' will be set iff the word has value > 'Z'
          uint upperIndicator = value + 0x0080_0080u - 0x005B_005Bu;

          // the 0x80 bit of each word of 'combinedIndicator' will be set iff the word has value >= 'A' and <= 'Z'
          uint combinedIndicator = lowerIndicator ^ upperIndicator;

          // the 0x20 bit of each word of 'mask' will be set iff the word has value >= 'A' and <= 'Z'
          uint mask = (combinedIndicator & 0x0080_0080u) >> 2;

          return value ^ mask; // bit flip uppercase letters [A-Z] => [a-z]
      }

      /// <summary>
      /// Given a UInt32 that represents two ASCII UTF-16 characters, returns the invariant
      /// uppercase representation of those characters. Requires the input value to contain
      /// two ASCII UTF-16 characters in machine endianness.
      /// </summary>
      /// <param name="value">Two packed ASCII UTF-16 code units.</param>
      /// <returns>The input with every [a-z] word replaced by its [A-Z] counterpart.</returns>
      /// <remarks>
      /// This is a branchless implementation.
      /// </remarks>
      [MethodImpl(MethodImplOptions.AggressiveInlining)]
      internal static uint ConvertAllAsciiCharsInUInt32ToUppercase(uint value)
      {
          // ASSUMPTION: Caller has validated that input value is ASCII.
          Debug.Assert(AllCharsInUInt32AreAscii(value));

          // the 0x80 bit of each word of 'lowerIndicator' will be set iff the word has value >= 'a'
          uint lowerIndicator = value + 0x0080_0080u - 0x0061_0061u;

          // the 0x80 bit of each word of 'upperIndicator' will be set iff the word has value > 'z'
          uint upperIndicator = value + 0x0080_0080u - 0x007B_007Bu;

          // the 0x80 bit of each word of 'combinedIndicator' will be set iff the word has value >= 'a' and <= 'z'
          uint combinedIndicator = lowerIndicator ^ upperIndicator;

          // the 0x20 bit of each word of 'mask' will be set iff the word has value >= 'a' and <= 'z'
          uint mask = (combinedIndicator & 0x0080_0080u) >> 2;

          return value ^ mask; // bit flip lowercase letters [a-z] => [A-Z]
      }

      /// <summary>
      /// Given a UInt32 that represents two ASCII UTF-16 characters, returns true iff
      /// the input contains one or more lowercase ASCII characters.
      /// </summary>
      /// <param name="value">Two packed ASCII UTF-16 code units.</param>
      /// <returns>True when at least one word is in the range [a-z].</returns>
      /// <remarks>
      /// This is a branchless implementation.
      /// </remarks>
      [MethodImpl(MethodImplOptions.AggressiveInlining)]
      internal static bool UInt32ContainsAnyLowercaseAsciiChar(uint value)
      {
          // ASSUMPTION: Caller has validated that input value is ASCII.
          Debug.Assert(AllCharsInUInt32AreAscii(value));

          // the 0x80 bit of each word of 'lowerIndicator' will be set iff the word has value >= 'a'
          uint lowerIndicator = value + 0x0080_0080u - 0x0061_0061u;

          // the 0x80 bit of each word of 'upperIndicator' will be set iff the word has value > 'z'
          uint upperIndicator = value + 0x0080_0080u - 0x007B_007Bu;

          // the 0x80 bit of each word of 'combinedIndicator' will be set iff the word has value >= 'a' and <= 'z'
          uint combinedIndicator = lowerIndicator ^ upperIndicator;

          return (combinedIndicator & 0x0080_0080u) != 0;
      }

      /// <summary>
      /// Given a UInt32 that represents two ASCII UTF-16 characters, returns true iff
      /// the input contains one or more uppercase ASCII characters.
      /// </summary>
      /// <param name="value">Two packed ASCII UTF-16 code units.</param>
      /// <returns>True when at least one word is in the range [A-Z].</returns>
      /// <remarks>
      /// This is a branchless implementation.
      /// </remarks>
      [MethodImpl(MethodImplOptions.AggressiveInlining)]
      internal static bool UInt32ContainsAnyUppercaseAsciiChar(uint value)
      {
          // ASSUMPTION: Caller has validated that input value is ASCII.
          Debug.Assert(AllCharsInUInt32AreAscii(value));

          // the 0x80 bit of each word of 'lowerIndicator' will be set iff the word has value >= 'A'
          uint lowerIndicator = value + 0x0080_0080u - 0x0041_0041u;

          // the 0x80 bit of each word of 'upperIndicator' will be set iff the word has value > 'Z'
          uint upperIndicator = value + 0x0080_0080u - 0x005B_005Bu;

          // the 0x80 bit of each word of 'combinedIndicator' will be set iff the word has value >= 'A' and <= 'Z'
          uint combinedIndicator = lowerIndicator ^ upperIndicator;

          return (combinedIndicator & 0x0080_0080u) != 0;
      }

      /// <summary>
      /// Given two UInt32s that represent two ASCII UTF-16 characters each, returns true iff
      /// the two inputs are equal using an ordinal case-insensitive comparison.
      /// </summary>
      /// <param name="valueA">First pair of packed ASCII UTF-16 code units.</param>
      /// <param name="valueB">Second pair of packed ASCII UTF-16 code units.</param>
      /// <returns>
      /// True when every word of <paramref name="valueA"/> matches the corresponding word of
      /// <paramref name="valueB"/>, where [A-Z] and [a-z] are treated as equal.
      /// </returns>
      /// <remarks>
      /// This is a branchless implementation.
      /// </remarks>
      [MethodImpl(MethodImplOptions.AggressiveInlining)]
      internal static bool UInt32OrdinalIgnoreCaseAscii(uint valueA, uint valueB)
      {
          // ASSUMPTION: Caller has validated that input values are ASCII.
          Debug.Assert(AllCharsInUInt32AreAscii(valueA));
          Debug.Assert(AllCharsInUInt32AreAscii(valueB));

          // Force the 0x20 bit on in every word so that [A-Z] folds onto [a-z]. BOTH range
          // checks below must run against this folded value: testing the raw word against 'A'
          // while testing the folded word against 'z' would classify '`' (0x60) as a letter
          // and make it compare equal to '@' (0x40).
          uint lowered = valueA | 0x0020_0020u;

          // the 0x80 bit of each word of 'lowerIndicator' will be set iff (word | 0x20) has value >= 'a'
          uint lowerIndicator = lowered + 0x0080_0080u - 0x0061_0061u;

          // the 0x80 bit of each word of 'upperIndicator' will be set iff (word | 0x20) has value > 'z'
          uint upperIndicator = lowered + 0x0080_0080u - 0x007B_007Bu;

          // the 0x80 bit of each word of 'alphaIndicator' will be set iff the word is [A-Za-z]
          uint alphaIndicator = lowerIndicator ^ upperIndicator;

          // the 0x20 bit of each word of 'caseBitMask' will be set iff the word is [A-Za-z]
          uint caseBitMask = (alphaIndicator & 0x0080_0080u) >> 2;

          // The inputs are equal iff the only bits in which they differ are 0x20 bits that sit
          // in a word where valueA holds a letter. (If valueA holds a letter and valueB differs
          // from it only in the 0x20 bit, valueB necessarily holds the same letter in the other
          // case, so inspecting valueA alone is sufficient.)
          return ((valueA ^ valueB) & ~caseBitMask) == 0;
      }

      /// <summary>
      /// Given two UInt64s that represent four ASCII UTF-16 characters each, returns true iff
      /// the two inputs are equal using an ordinal case-insensitive comparison.
      /// </summary>
      /// <param name="valueA">First group of four packed ASCII UTF-16 code units.</param>
      /// <param name="valueB">Second group of four packed ASCII UTF-16 code units.</param>
      /// <returns>
      /// True when every word of <paramref name="valueA"/> matches the corresponding word of
      /// <paramref name="valueB"/>, where [A-Z] and [a-z] are treated as equal.
      /// </returns>
      /// <remarks>
      /// This is a branchless implementation.
      /// </remarks>
      [MethodImpl(MethodImplOptions.AggressiveInlining)]
      internal static bool UInt64OrdinalIgnoreCaseAscii(ulong valueA, ulong valueB)
      {
          // ASSUMPTION: Caller has validated that input values are ASCII.
          Debug.Assert(AllCharsInUInt64AreAscii(valueA));
          Debug.Assert(AllCharsInUInt64AreAscii(valueB));

          // Force the 0x20 bit on in every word so that [A-Z] folds onto [a-z]. BOTH range
          // checks below must run against this folded value; otherwise '`' (0x60) would be
          // classified as a letter and compare equal to '@' (0x40).
          ulong lowered = valueA | 0x0020_0020_0020_0020ul;

          // the 0x80 bit of each word of 'lowerIndicator' will be set iff (word | 0x20) has value >= 'a'
          ulong lowerIndicator = lowered + 0x0080_0080_0080_0080ul - 0x0061_0061_0061_0061ul;

          // the 0x80 bit of each word of 'upperIndicator' will be set iff (word | 0x20) has value > 'z'
          ulong upperIndicator = lowered + 0x0080_0080_0080_0080ul - 0x007B_007B_007B_007Bul;

          // the 0x20 bit of each word of 'caseBitMask' will be set iff the word is [A-Za-z]
          ulong caseBitMask = ((lowerIndicator ^ upperIndicator) & 0x0080_0080_0080_0080ul) >> 2;

          // Convert both values to lowercase (using the mask derived from the first value)
          // and compare for equality. It's possible that the first value will contain an alpha
          // character where the second value doesn't, and applying the mask to the second value
          // will then create nonsensical data, but the comparison would have failed anyway in
          // that case so it's a safe operation to perform.
          return (valueA | caseBitMask) == (valueB | caseBitMask);
      }
  }
  177. }