Rune.cs 32 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819
  1. // Licensed to the .NET Foundation under one or more agreements.
  2. // The .NET Foundation licenses this file to you under the MIT license.
  3. // See the LICENSE file in the project root for more information.
  4. using System.Diagnostics;
  5. using System.Globalization;
  6. using System.Runtime.CompilerServices;
  7. namespace System.Text
  8. {
  9. /// <summary>
  10. /// Represents a Unicode scalar value ([ U+0000..U+D7FF ], inclusive; or [ U+E000..U+10FFFF ], inclusive).
  11. /// </summary>
  12. /// <remarks>
  13. /// This type's constructors and conversion operators validate the input, so consumers can call the APIs
  14. /// assuming that the underlying <see cref="Rune"/> instance is well-formed.
  15. /// </remarks>
  16. [DebuggerDisplay("{DebuggerDisplay,nq}")]
  17. public readonly struct Rune : IComparable<Rune>, IEquatable<Rune>
  18. {
  // Flag bit of an AsciiCharInfo entry: set when the character is white space.
  private const byte IsWhiteSpaceFlag = 0x80;
  // Flag bit of an AsciiCharInfo entry: set when the character is a letter or a digit.
  private const byte IsLetterOrDigitFlag = 0x40;
  // Mask that isolates the low 5 bits of an AsciiCharInfo entry (the UnicodeCategory).
  private const byte UnicodeCategoryMask = 0x1F;
  // Contains information about the ASCII character range [ U+0000..U+007F ], with:
  // - 0x80 bit if set means 'is whitespace'
  // - 0x40 bit if set means 'is letter or digit'
  // - 0x20 bit is reserved for future use
  // - bottom 5 bits are the UnicodeCategory of the character
  // The table holds 128 entries (8 rows of 16) and is indexed directly by the scalar value.
  private static ReadOnlySpan<byte> AsciiCharInfo => new byte[]
  {
      0x0E, 0x0E, 0x0E, 0x0E, 0x0E, 0x0E, 0x0E, 0x0E, 0x0E, 0x8E, 0x8E, 0x8E, 0x8E, 0x8E, 0x0E, 0x0E, // U+0000..U+000F
      0x0E, 0x0E, 0x0E, 0x0E, 0x0E, 0x0E, 0x0E, 0x0E, 0x0E, 0x0E, 0x0E, 0x0E, 0x0E, 0x0E, 0x0E, 0x0E, // U+0010..U+001F
      0x8B, 0x18, 0x18, 0x18, 0x1A, 0x18, 0x18, 0x18, 0x14, 0x15, 0x18, 0x19, 0x18, 0x13, 0x18, 0x18, // U+0020..U+002F
      0x48, 0x48, 0x48, 0x48, 0x48, 0x48, 0x48, 0x48, 0x48, 0x48, 0x18, 0x18, 0x19, 0x19, 0x19, 0x18, // U+0030..U+003F
      0x18, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, // U+0040..U+004F
      0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x14, 0x18, 0x15, 0x1B, 0x12, // U+0050..U+005F
      0x1B, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, // U+0060..U+006F
      0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x14, 0x19, 0x15, 0x19, 0x0E  // U+0070..U+007F
  };
  // The Unicode scalar value wrapped by this instance. The public constructors validate it;
  // the private constructor only asserts validity.
  private readonly uint _value;
  /// <summary>
  /// Initializes a <see cref="Rune"/> from a single UTF-16 code unit.
  /// </summary>
  /// <param name="ch">The UTF-16 code unit to wrap; it must not be a surrogate.</param>
  /// <exception cref="ArgumentOutOfRangeException">
  /// <paramref name="ch"/> is a UTF-16 surrogate code point (U+D800..U+DFFF, inclusive).
  /// </exception>
  public Rune(char ch)
  {
      uint codeUnit = ch;

      // A lone surrogate is not a scalar value, so it can never be a Rune.
      if (UnicodeUtility.IsSurrogateCodePoint(codeUnit))
      {
          ThrowHelper.ThrowArgumentOutOfRangeException(ExceptionArgument.ch);
      }

      _value = codeUnit;
  }
  /// <summary>
  /// Creates a <see cref="Rune"/> from the provided Unicode scalar value.
  /// </summary>
  /// <param name="value">The candidate scalar value. It is cast to <see cref="uint"/> and
  /// forwarded to the <see cref="uint"/> constructor, which performs the validation.</param>
  /// <exception cref="ArgumentOutOfRangeException">
  /// If <paramref name="value"/> does not represent a valid Unicode scalar value.
  /// </exception>
  public Rune(int value)
  : this((uint)value)
  {
  }
  /// <summary>
  /// Creates a <see cref="Rune"/> from the provided Unicode scalar value.
  /// </summary>
  /// <param name="value">The candidate scalar value.</param>
  /// <exception cref="ArgumentOutOfRangeException">
  /// If <paramref name="value"/> does not represent a valid Unicode scalar value.
  /// </exception>
  [CLSCompliant(false)]
  public Rune(uint value)
  {
  // Reject anything UnicodeUtility does not accept as a scalar value before storing it.
  if (!UnicodeUtility.IsValidUnicodeScalar(value))
  {
  ThrowHelper.ThrowArgumentOutOfRangeException(ExceptionArgument.value);
  }
  _value = value;
  }
  // Non-validating ctor: stores 'scalarValue' as-is and only asserts that it is a valid scalar.
  // The 'unused' parameter is never read; it exists solely to give this overload a signature
  // distinct from the validating Rune(uint) constructor.
  private Rune(uint scalarValue, bool unused)
  {
  UnicodeDebug.AssertIsValidScalar(scalarValue);
  _value = scalarValue;
  }
  /// <summary>Returns true when both operands wrap the same scalar value.</summary>
  public static bool operator ==(Rune left, Rune right) => left._value == right._value;
  /// <summary>Returns true when the operands wrap different scalar values.</summary>
  public static bool operator !=(Rune left, Rune right) => left._value != right._value;
  /// <summary>Returns true when the left scalar value is numerically smaller than the right one.</summary>
  public static bool operator <(Rune left, Rune right) => left._value < right._value;
  /// <summary>Returns true when the left scalar value is numerically smaller than or equal to the right one.</summary>
  public static bool operator <=(Rune left, Rune right) => left._value <= right._value;
  /// <summary>Returns true when the left scalar value is numerically greater than the right one.</summary>
  public static bool operator >(Rune left, Rune right) => left._value > right._value;
  /// <summary>Returns true when the left scalar value is numerically greater than or equal to the right one.</summary>
  public static bool operator >=(Rune left, Rune right) => left._value >= right._value;
  // Operators below are explicit because they may throw.
  // Each one forwards to the constructor taking the same argument type, so the
  // validation applied is exactly that constructor's validation.
  public static explicit operator Rune(char ch) => new Rune(ch);
  [CLSCompliant(false)]
  public static explicit operator Rune(uint value) => new Rune(value);
  public static explicit operator Rune(int value) => new Rune(value);
  // Displayed as "U+XXXX '<char>'"; e.g., "U+0065 'e'".
  // If the stored value is not a valid scalar, U+FFFD is shown in place of the character.
  private string DebuggerDisplay => FormattableString.Invariant($"U+{_value:X4} '{(IsValid(_value) ? ToString() : "\uFFFD")}'");
  /// <summary>
  /// Returns true if and only if this scalar value is ASCII ([ U+0000..U+007F ])
  /// and therefore representable by a single UTF-8 code unit.
  /// </summary>
  public bool IsAscii => UnicodeUtility.IsAsciiCodePoint(_value);
  /// <summary>
  /// Returns true if and only if this scalar value is within the BMP ([ U+0000..U+FFFF ])
  /// and therefore representable by a single UTF-16 code unit.
  /// </summary>
  public bool IsBmp => UnicodeUtility.IsBmpCodePoint(_value);
  /// <summary>
  /// Returns the Unicode plane (0 to 16, inclusive) which contains this scalar.
  /// </summary>
  public int Plane => UnicodeUtility.GetPlane(_value);
  /// <summary>
  /// A <see cref="Rune"/> instance that represents the Unicode replacement character U+FFFD.
  /// </summary>
  /// <remarks>
  /// The instance is produced through <see cref="UnsafeCreate"/>, i.e. without run-time validation.
  /// </remarks>
  public static Rune ReplacementChar => UnsafeCreate(UnicodeUtility.ReplacementChar);
  /// <summary>
  /// Returns the length in code units (<see cref="char"/>) of the
  /// UTF-16 sequence required to represent this scalar value.
  /// </summary>
  /// <remarks>
  /// The return value will be 1 or 2.
  /// </remarks>
  public int Utf16SequenceLength => UnicodeUtility.GetUtf16SequenceLength(_value);
  /// <summary>
  /// Returns the length in code units of the
  /// UTF-8 sequence required to represent this scalar value.
  /// </summary>
  /// <remarks>
  /// The return value will be 1 through 4, inclusive.
  /// </remarks>
  public int Utf8SequenceLength => UnicodeUtility.GetUtf8SequenceLength(_value);
  /// <summary>
  /// Returns the Unicode scalar value as an integer.
  /// </summary>
  public int Value => (int)_value;
  // Performs a culture-sensitive case change of 'rune' using the supplied TextInfo.
  // The rune is encoded to UTF-16, run through the TextInfo upper/lower routine, and the
  // result is decoded back into a Rune without re-validation.
  private static Rune ChangeCaseCultureAware(Rune rune, TextInfo textInfo, bool toUpper)
  {
      Debug.Assert(!GlobalizationMode.Invariant, "This should've been checked by the caller.");
      Debug.Assert(textInfo != null, "This should've been checked by the caller.");

      // Two code units is the worst case (a surrogate pair). The case change is expected
      // to produce exactly as many UTF-16 code units as it consumed.
      Span<char> sourceBuffer = stackalloc char[2];
      Span<char> resultBuffer = stackalloc char[2];

      int unitCount = rune.EncodeToUtf16(sourceBuffer);
      Span<char> source = sourceBuffer.Slice(0, unitCount);
      Span<char> result = resultBuffer.Slice(0, unitCount);

      if (toUpper)
      {
          textInfo.ChangeCaseToUpper(source, result);
      }
      else
      {
          textInfo.ChangeCaseToLower(source, result);
      }

      // Simple case folding never moves a character between the BMP and the supplementary
      // planes, so the shape of the input decides how the output is decoded. The helpers
      // used to rebuild the Rune contain debug asserts for this condition.
      return rune.IsBmp
          ? UnsafeCreate(result[0])
          : UnsafeCreate(UnicodeUtility.GetScalarFromUtf16SurrogatePair(result[0], result[1]));
  }
  /// <summary>
  /// Compares this instance with <paramref name="other"/> by numeric scalar value.
  /// </summary>
  public int CompareTo(Rune other) => _value.CompareTo(other._value);
  // Writes this rune to 'destination' as UTF-16 and returns the number of chars written.
  // The caller must supply a buffer of at least Utf16SequenceLength chars.
  private int EncodeToUtf16(Span<char> destination)
  {
      Debug.Assert(destination.Length >= Utf16SequenceLength, "Caller should've provided a large enough buffer.");

      bool encoded = TryEncode(destination, out int written);
      Debug.Assert(encoded, "TryEncode should never fail given a large enough buffer.");

      return written;
  }
  /// <summary>Returns true when <paramref name="obj"/> is a <see cref="Rune"/> equal to this instance.</summary>
  public override bool Equals(object obj) => obj is Rune other && Equals(other);
  /// <summary>Returns true when <paramref name="other"/> wraps the same scalar value as this instance.</summary>
  public bool Equals(Rune other) => this == other;
  /// <summary>Returns the scalar value itself as the hash code.</summary>
  public override int GetHashCode() => Value;
  /// <summary>
  /// Reads the <see cref="Rune"/> that starts at position <paramref name="index"/> of
  /// <paramref name="input"/>.
  /// </summary>
  /// <param name="input">The string to read from.</param>
  /// <param name="index">The index of the first UTF-16 code unit of the scalar value.</param>
  /// <returns>The scalar value found at <paramref name="index"/>.</returns>
  /// <remarks>
  /// Throws if <paramref name="input"/> is null, if <paramref name="index"/> is out of range, or
  /// if <paramref name="index"/> does not reference the start of a valid scalar value within <paramref name="input"/>.
  /// </remarks>
  public static Rune GetRuneAt(string input, int index)
  {
      int scalarOrError = ReadRuneFromString(input, index);

      // A negative result signals that no well-formed scalar starts at 'index'.
      if (scalarOrError < 0)
      {
          ThrowHelper.ThrowArgumentException_CannotExtractScalar(ExceptionArgument.index);
      }

      return UnsafeCreate((uint)scalarOrError);
  }
  /// <summary>
  /// Returns <see langword="true"/> iff <paramref name="value"/> is a valid Unicode scalar
  /// value, i.e., is in [ U+0000..U+D7FF ], inclusive; or [ U+E000..U+10FFFF ], inclusive.
  /// </summary>
  /// <remarks>
  /// The argument is cast to <see cref="uint"/> and handed to the <see cref="uint"/> overload.
  /// </remarks>
  public static bool IsValid(int value) => IsValid((uint)value);
  /// <summary>
  /// Returns <see langword="true"/> iff <paramref name="value"/> is a valid Unicode scalar
  /// value, i.e., is in [ U+0000..U+D7FF ], inclusive; or [ U+E000..U+10FFFF ], inclusive.
  /// </summary>
  [CLSCompliant(false)]
  public static bool IsValid(uint value) => UnicodeUtility.IsValidUnicodeScalar(value);
  // Decodes the scalar value that starts at the beginning of 'input'.
  // Returns the scalar value, or a negative number when the buffer is empty or does not
  // begin with a well-formed UTF-16 sequence.
  internal static int ReadFirstRuneFromUtf16Buffer(ReadOnlySpan<char> input)
  {
      if (input.IsEmpty)
      {
          return -1;
      }

      uint first = input[0];

      // Common case: a non-surrogate code unit is already a complete scalar value.
      if (!UnicodeUtility.IsSurrogateCodePoint(first))
      {
          return (int)first;
      }

      // A surrogate is only acceptable as the high half of a pair, and the pair
      // needs a second code unit to be present in the buffer.
      if (!UnicodeUtility.IsHighSurrogateCodePoint(first) || input.Length < 2)
      {
          return -1; // not an argument exception - just a "bad data" failure
      }

      uint second = input[1];
      if (!UnicodeUtility.IsLowSurrogateCodePoint(second))
      {
          return -1;
      }

      return (int)UnicodeUtility.GetScalarFromUtf16SurrogatePair(first, second);
  }
  // Decodes the scalar value that starts at 'index' within 'input'.
  // Throws (via ThrowHelper) when 'input' is null or 'index' is outside the string.
  // Returns the scalar value, or a negative number when the data at 'index' is not the
  // start of a well-formed UTF-16 sequence ("bad data" is not treated as an argument error).
  private static int ReadRuneFromString(string input, int index)
  {
      if (input is null)
      {
          ThrowHelper.ThrowArgumentNullException(ExceptionArgument.input);
      }

      // The unsigned comparison rejects negative indexes and indexes >= Length in one test.
      if ((uint)index >= (uint)input.Length)
      {
          ThrowHelper.ThrowArgumentOutOfRange_IndexException();
      }

      // The slice is guaranteed non-empty here, so the span-based reader performs exactly
      // the same surrogate handling this method used to duplicate: a lone low surrogate,
      // a high surrogate at the end of the string, or a high surrogate not followed by a
      // low surrogate all yield a negative result.
      return ReadFirstRuneFromUtf16Buffer(input.AsSpan(index));
  }
  /// <summary>
  /// Returns the UTF-16 <see cref="string"/> form of this <see cref="Rune"/>: one
  /// <see cref="char"/> for a BMP value, otherwise a surrogate pair.
  /// </summary>
  public override string ToString()
  {
      if (!IsBmp)
      {
          // Supplementary-plane scalars need a high/low surrogate pair.
          UnicodeUtility.GetUtf16SurrogatesFromSupplementaryPlaneScalar(_value, out char highSurrogate, out char lowSurrogate);
          return string.CreateFromChar(highSurrogate, lowSurrogate);
      }

      return string.CreateFromChar((char)_value);
  }
  /// <summary>
  /// Attempts to create a <see cref="Rune"/> from a single UTF-16 code unit.
  /// </summary>
  /// <param name="ch">The UTF-16 code unit to convert.</param>
  /// <param name="result">The created <see cref="Rune"/> on success; otherwise the default value.</param>
  /// <returns><see langword="true"/> unless <paramref name="ch"/> is a surrogate code point.</returns>
  public static bool TryCreate(char ch, out Rune result)
  {
      uint codeUnit = ch;

      if (UnicodeUtility.IsSurrogateCodePoint(codeUnit))
      {
          result = default;
          return false;
      }

      result = UnsafeCreate(codeUnit);
      return true;
  }
  /// <summary>
  /// Attempts to create a <see cref="Rune"/> from the provided input value.
  /// </summary>
  /// <param name="value">The candidate scalar value; it is cast to <see cref="uint"/> and
  /// passed to the <see cref="uint"/> overload.</param>
  /// <param name="result">Receives whatever the <see cref="uint"/> overload produces.</param>
  /// <returns>The result of the <see cref="uint"/> overload.</returns>
  public static bool TryCreate(int value, out Rune result) => TryCreate((uint)value, out result);
  /// <summary>
  /// Attempts to create a <see cref="Rune"/> from a candidate Unicode scalar value.
  /// </summary>
  /// <param name="value">The candidate scalar value.</param>
  /// <param name="result">The created <see cref="Rune"/> on success; otherwise the default value.</param>
  /// <returns><see langword="true"/> if <paramref name="value"/> is a valid Unicode scalar value.</returns>
  [CLSCompliant(false)]
  public static bool TryCreate(uint value, out Rune result)
  {
      if (!UnicodeUtility.IsValidUnicodeScalar(value))
      {
          result = default;
          return false;
      }

      result = UnsafeCreate(value);
      return true;
  }
  /// <summary>
  /// Writes this <see cref="Rune"/> into a UTF-16 destination buffer.
  /// </summary>
  /// <param name="destination">The buffer that receives the UTF-16 code units.</param>
  /// <param name="charsWritten">
  /// The number of <see cref="char"/>s written to <paramref name="destination"/>,
  /// or 0 if the destination buffer is not large enough to contain the output.</param>
  /// <returns>True if the value was written to the buffer; otherwise, false.</returns>
  /// <remarks>
  /// Query <see cref="Utf16SequenceLength"/> beforehand to learn how large
  /// <paramref name="destination"/> has to be.
  /// </remarks>
  public bool TryEncode(Span<char> destination, out int charsWritten)
  {
      if (IsBmp)
      {
          // A BMP scalar is its own single code unit.
          if (destination.Length >= 1)
          {
              destination[0] = (char)_value;
              charsWritten = 1;
              return true;
          }
      }
      else if (destination.Length >= 2)
      {
          // Supplementary-plane scalar: emit the surrogate pair straight into the buffer.
          UnicodeUtility.GetUtf16SurrogatesFromSupplementaryPlaneScalar(_value, out destination[0], out destination[1]);
          charsWritten = 2;
          return true;
      }

      // Destination buffer not large enough
      charsWritten = default;
      return false;
  }
  /// <summary>
  /// Writes this <see cref="Rune"/> into a destination buffer as UTF-8 bytes.
  /// </summary>
  /// <param name="destination">The buffer that receives the UTF-8 code units.</param>
  /// <param name="bytesWritten">
  /// The number of <see cref="byte"/>s written to <paramref name="destination"/>,
  /// or 0 if the destination buffer is not large enough to contain the output.</param>
  /// <returns>True if the value was written to the buffer; otherwise, false.</returns>
  /// <remarks>
  /// Query <see cref="Utf8SequenceLength"/> beforehand to learn how large
  /// <paramref name="destination"/> has to be.
  /// </remarks>
  // ** This is public so it can be unit tested but isn't yet exposed via the reference assemblies. **
  public bool TryEncodeToUtf8Bytes(Span<byte> destination, out int bytesWritten)
  {
      // TODO: Optimize some of these writes by using BMI2 instructions.
      // The bit patterns below come from the Unicode Standard, Table 3-6.
      // Each branch picks the sequence length from the magnitude of the scalar and
      // writes only when the buffer can hold the whole sequence.
      int capacity = destination.Length;

      if (IsAscii)
      {
          if (capacity >= 1)
          {
              // Scalar 0xxxxxxx -> byte [ 0xxxxxxx ]
              destination[0] = (byte)_value;
              bytesWritten = 1;
              return true;
          }
      }
      else if (_value <= 0x7FFu)
      {
          if (capacity >= 2)
          {
              // Scalar 00000yyy yyxxxxxx -> bytes [ 110yyyyy 10xxxxxx ]
              destination[0] = (byte)((_value + (0b110u << 11)) >> 6);
              destination[1] = (byte)((_value & 0x3Fu) + 0x80u);
              bytesWritten = 2;
              return true;
          }
      }
      else if (_value <= 0xFFFFu)
      {
          if (capacity >= 3)
          {
              // Scalar zzzzyyyy yyxxxxxx -> bytes [ 1110zzzz 10yyyyyy 10xxxxxx ]
              destination[0] = (byte)((_value + (0b1110 << 16)) >> 12);
              destination[1] = (byte)(((_value & (0x3Fu << 6)) >> 6) + 0x80u);
              destination[2] = (byte)((_value & 0x3Fu) + 0x80u);
              bytesWritten = 3;
              return true;
          }
      }
      else if (capacity >= 4)
      {
          // Scalar 000uuuuu zzzzyyyy yyxxxxxx -> bytes [ 11110uuu 10uuzzzz 10yyyyyy 10xxxxxx ]
          destination[0] = (byte)((_value + (0b11110 << 21)) >> 18);
          destination[1] = (byte)(((_value & (0x3Fu << 12)) >> 12) + 0x80u);
          destination[2] = (byte)(((_value & (0x3Fu << 6)) >> 6) + 0x80u);
          destination[3] = (byte)((_value & 0x3Fu) + 0x80u);
          bytesWritten = 4;
          return true;
      }

      // Destination buffer not large enough
      bytesWritten = default;
      return false;
  }
  /// <summary>
  /// Attempts to read the <see cref="Rune"/> that starts at position <paramref name="index"/> of
  /// <paramref name="input"/>.
  /// </summary>
  /// <param name="input">The string to read from.</param>
  /// <param name="index">The index of the first UTF-16 code unit of the scalar value.</param>
  /// <param name="value">The extracted <see cref="Rune"/> on success; otherwise the default value.</param>
  /// <returns><see langword="true"/> if a scalar value was successfully extracted from the specified index,
  /// <see langword="false"/> if a value could not be extracted due to invalid data.</returns>
  /// <remarks>
  /// Throws only if <paramref name="input"/> is null or <paramref name="index"/> is out of range.
  /// </remarks>
  public static bool TryGetRuneAt(string input, int index, out Rune value)
  {
      int scalarOrError = ReadRuneFromString(input, index);

      // A negative result means the data at 'index' is not a well-formed scalar.
      if (scalarOrError < 0)
      {
          value = default;
          return false;
      }

      value = UnsafeCreate((uint)scalarOrError);
      return true;
  }
  // Allows constructing a Unicode scalar value from an arbitrary 32-bit integer without
  // validation. It is the caller's responsibility to have performed manual validation
  // before calling this method. If a Rune instance is forcibly constructed
  // from invalid input, the APIs on this type have undefined behavior, potentially including
  // introducing a security hole in the consuming application.
  //
  // An example of a security hole resulting from an invalid Rune value, which could result
  // in a stack overflow.
  //
  // public int GetMarvin32HashCode(Rune r) {
  // Span<char> buffer = stackalloc char[r.Utf16SequenceLength];
  // r.TryEncode(buffer, ...);
  // return Marvin32.ComputeHash(buffer.AsBytes());
  // }
  /// <summary>
  /// Creates a <see cref="Rune"/> without performing validation on the input.
  /// </summary>
  /// <param name="scalarValue">A value the caller has already verified to be a valid Unicode scalar value.</param>
  /// <remarks>
  /// Forwards to the private two-argument constructor; the <see langword="false"/> argument is
  /// ignored there and only selects that overload.
  /// </remarks>
  [MethodImpl(MethodImplOptions.AggressiveInlining)]
  internal static Rune UnsafeCreate(uint scalarValue) => new Rune(scalarValue, false);
  // These are analogs of APIs on System.Char
  /// <summary>
  /// Returns the numeric value associated with <paramref name="value"/>. For ASCII input
  /// this is 0 through 9 for the digits '0'..'9' and -1 for everything else.
  /// </summary>
  public static double GetNumericValue(Rune value)
  {
      if (!value.IsAscii)
      {
          // not an ASCII char; fall back to globalization table
          return CharUnicodeInfo.InternalGetNumericValue(value.Value);
      }

      // Unsigned subtraction wraps for anything below '0', so one comparison covers both bounds.
      uint digit = value._value - '0';
      return (digit <= 9) ? (double)digit : -1;
  }
  /// <summary>
  /// Returns the <see cref="UnicodeCategory"/> of <paramref name="value"/>.
  /// </summary>
  public static UnicodeCategory GetUnicodeCategory(Rune value)
  {
      if (!value.IsAscii)
      {
          return GetUnicodeCategoryNonAscii(value);
      }

      // ASCII fast path: the category lives in the low bits of the lookup table entry.
      return (UnicodeCategory)(AsciiCharInfo[value.Value] & UnicodeCategoryMask);
  }
  // Looks up the UnicodeCategory of a non-ASCII rune through CharUnicodeInfo.
  // Callers are expected to have handled ASCII input themselves (asserted below).
  private static UnicodeCategory GetUnicodeCategoryNonAscii(Rune value)
  {
  Debug.Assert(!value.IsAscii, "Shouldn't use this non-optimized code path for ASCII characters.");
  return CharUnicodeInfo.GetUnicodeCategory(value.Value);
  }
  // Returns true iff this Unicode category represents a letter, i.e. falls within
  // the enum range [ UppercaseLetter..OtherLetter ].
  private static bool IsCategoryLetter(UnicodeCategory category) =>
      UnicodeUtility.IsInRangeInclusive(
          (uint)category,
          (uint)UnicodeCategory.UppercaseLetter,
          (uint)UnicodeCategory.OtherLetter);
  // Returns true iff this Unicode category represents a letter or a decimal digit.
  private static bool IsCategoryLetterOrDecimalDigit(UnicodeCategory category) =>
      IsCategoryLetter(category) || category == UnicodeCategory.DecimalDigitNumber;
  // Returns true iff this Unicode category represents a number, i.e. falls within
  // the enum range [ DecimalDigitNumber..OtherNumber ].
  private static bool IsCategoryNumber(UnicodeCategory category) =>
      UnicodeUtility.IsInRangeInclusive(
          (uint)category,
          (uint)UnicodeCategory.DecimalDigitNumber,
          (uint)UnicodeCategory.OtherNumber);
  // Returns true iff this Unicode category represents a punctuation mark, i.e. falls
  // within the enum range [ ConnectorPunctuation..OtherPunctuation ].
  private static bool IsCategoryPunctuation(UnicodeCategory category) =>
      UnicodeUtility.IsInRangeInclusive(
          (uint)category,
          (uint)UnicodeCategory.ConnectorPunctuation,
          (uint)UnicodeCategory.OtherPunctuation);
  // Returns true iff this Unicode category represents a separator, i.e. falls within
  // the enum range [ SpaceSeparator..ParagraphSeparator ].
  private static bool IsCategorySeparator(UnicodeCategory category) =>
      UnicodeUtility.IsInRangeInclusive(
          (uint)category,
          (uint)UnicodeCategory.SpaceSeparator,
          (uint)UnicodeCategory.ParagraphSeparator);
  // Returns true iff this Unicode category represents a symbol, i.e. falls within
  // the enum range [ MathSymbol..OtherSymbol ].
  private static bool IsCategorySymbol(UnicodeCategory category) =>
      UnicodeUtility.IsInRangeInclusive(
          (uint)category,
          (uint)UnicodeCategory.MathSymbol,
          (uint)UnicodeCategory.OtherSymbol);
  /// <summary>
  /// Returns true if <paramref name="value"/> is a control character, i.e. lies in
  /// [ U+0000..U+001F ] or [ U+007F..U+009F ].
  /// </summary>
  public static bool IsControl(Rune value)
  {
      // Per the Unicode stability policy, the set of control characters
      // is forever fixed at [ U+0000..U+001F ], [ U+007F..U+009F ]. No
      // characters will ever be added to the "control characters" group.
      // See http://www.unicode.org/policies/stability_policy.html.

      // Logic below depends on Rune.Value never being -1 (since Rune is a validating type)
      // 00..1F (+1) => 01..20 (&~80) => 01..20
      // 7F..9F (+1) => 80..A0 (&~80) => 00..20
      uint shifted = value._value + 1;
      uint folded = shifted & ~0x80u;
      return folded <= 0x20u;
  }
  /// <summary>
  /// Returns true if <paramref name="value"/> is a decimal digit: '0'..'9' for ASCII input,
  /// otherwise any scalar whose category is <see cref="UnicodeCategory.DecimalDigitNumber"/>.
  /// </summary>
  public static bool IsDigit(Rune value) =>
      value.IsAscii
          ? UnicodeUtility.IsInRangeInclusive(value._value, '0', '9')
          : GetUnicodeCategoryNonAscii(value) == UnicodeCategory.DecimalDigitNumber;
  /// <summary>
  /// Returns true if <paramref name="value"/> is a letter: [A-Za-z] for ASCII input,
  /// otherwise any scalar whose category is one of the letter categories.
  /// </summary>
  public static bool IsLetter(Rune value)
  {
      if (!value.IsAscii)
      {
          return IsCategoryLetter(GetUnicodeCategoryNonAscii(value));
      }

      // Clearing the 0x20 bit of the offset from 'A' folds lowercase onto uppercase;
      // the unsigned compare then rejects everything outside the alphabet in one test.
      uint offsetFromA = value._value - 'A';
      return (offsetFromA & ~0x20u) <= (uint)('Z' - 'A'); // [A-Za-z]
  }
  /// <summary>
  /// Returns true if <paramref name="value"/> is a letter or a decimal digit.
  /// ASCII input is answered from the lookup table; other input by Unicode category.
  /// </summary>
  public static bool IsLetterOrDigit(Rune value)
  {
      if (!value.IsAscii)
      {
          return IsCategoryLetterOrDecimalDigit(GetUnicodeCategoryNonAscii(value));
      }

      return (AsciiCharInfo[value.Value] & IsLetterOrDigitFlag) != 0;
  }
  /// <summary>
  /// Returns true if <paramref name="value"/> is a lowercase letter: 'a'..'z' for ASCII input,
  /// otherwise any scalar whose category is <see cref="UnicodeCategory.LowercaseLetter"/>.
  /// </summary>
  public static bool IsLower(Rune value) =>
      value.IsAscii
          ? UnicodeUtility.IsInRangeInclusive(value._value, 'a', 'z')
          : GetUnicodeCategoryNonAscii(value) == UnicodeCategory.LowercaseLetter;
  /// <summary>
  /// Returns true if <paramref name="value"/> is a number: '0'..'9' for ASCII input,
  /// otherwise any scalar whose category is one of the number categories.
  /// </summary>
  public static bool IsNumber(Rune value) =>
      value.IsAscii
          ? UnicodeUtility.IsInRangeInclusive(value._value, '0', '9')
          : IsCategoryNumber(GetUnicodeCategoryNonAscii(value));
  /// <summary>
  /// Returns true if the category of <paramref name="value"/> is one of the punctuation categories.
  /// </summary>
  public static bool IsPunctuation(Rune value) => IsCategoryPunctuation(GetUnicodeCategory(value));
  /// <summary>
  /// Returns true if the category of <paramref name="value"/> is one of the separator categories.
  /// </summary>
  public static bool IsSeparator(Rune value) => IsCategorySeparator(GetUnicodeCategory(value));
  /// <summary>
  /// Returns true if the category of <paramref name="value"/> is one of the symbol categories.
  /// </summary>
  public static bool IsSymbol(Rune value) => IsCategorySymbol(GetUnicodeCategory(value));
  /// <summary>
  /// Returns true if <paramref name="value"/> is an uppercase letter: 'A'..'Z' for ASCII input,
  /// otherwise any scalar whose category is <see cref="UnicodeCategory.UppercaseLetter"/>.
  /// </summary>
  public static bool IsUpper(Rune value) =>
      value.IsAscii
          ? UnicodeUtility.IsInRangeInclusive(value._value, 'A', 'Z')
          : GetUnicodeCategoryNonAscii(value) == UnicodeCategory.UppercaseLetter;
  /// <summary>
  /// Returns true if <paramref name="value"/> is white space. ASCII input is answered from
  /// the lookup table; other input is white space when it is U+0085 or belongs to a
  /// separator category.
  /// </summary>
  public static bool IsWhiteSpace(Rune value)
  {
      if (value.IsAscii)
      {
          return (AsciiCharInfo[value.Value] & IsWhiteSpaceFlag) != 0;
      }

      // U+0085 is special since it's a whitespace character but is in the Control category
      // instead of a normal separator category. No other code point outside the ASCII range
      // has this mismatch.
      return value._value == 0x0085u
          || IsCategorySeparator(GetUnicodeCategoryNonAscii(value));
  }
  /// <summary>
  /// Converts <paramref name="value"/> to lowercase using the casing rules of
  /// <paramref name="culture"/>.
  /// </summary>
  /// <remarks>Throws if <paramref name="culture"/> is null.</remarks>
  public static Rune ToLower(Rune value, CultureInfo culture)
  {
      if (culture is null)
      {
          ThrowHelper.ThrowArgumentNullException(ExceptionArgument.culture);
      }

      // We don't want to special-case ASCII here since the specified culture might handle
      // ASCII characters differently than the invariant culture (e.g., Turkish I). Instead
      // we'll just jump straight to the globalization tables if they're available.
      return GlobalizationMode.Invariant
          ? ToLowerInvariant(value)
          : ChangeCaseCultureAware(value, culture.TextInfo, toUpper: false);
  }
  /// <summary>
  /// Converts <paramref name="value"/> to lowercase using the invariant culture's casing rules.
  /// </summary>
  public static Rune ToLowerInvariant(Rune value)
  {
      // Handle the most common case (ASCII data) first. Within the common case, we expect
      // that there'll be a mix of lowercase & uppercase chars, so make the conversion branchless.
      if (value.IsAscii)
      {
          // It's ok for us to use the UTF-16 conversion utility for this since the high
          // 16 bits of the value will never be set so will be left unchanged.
          uint lowered = Utf16Utility.ConvertAllAsciiCharsInUInt32ToLowercase(value._value);
          return UnsafeCreate(lowered);
      }

      // Without globalization tables, case changing has no effect on non-ASCII data;
      // otherwise the invariant case folding tables decide.
      return GlobalizationMode.Invariant
          ? value
          : ChangeCaseCultureAware(value, TextInfo.Invariant, toUpper: false);
  }
  /// <summary>
  /// Converts <paramref name="value"/> to uppercase using the casing rules of
  /// <paramref name="culture"/>.
  /// </summary>
  /// <remarks>Throws if <paramref name="culture"/> is null.</remarks>
  public static Rune ToUpper(Rune value, CultureInfo culture)
  {
      if (culture is null)
      {
          ThrowHelper.ThrowArgumentNullException(ExceptionArgument.culture);
      }

      // We don't want to special-case ASCII here since the specified culture might handle
      // ASCII characters differently than the invariant culture (e.g., Turkish I). Instead
      // we'll just jump straight to the globalization tables if they're available.
      return GlobalizationMode.Invariant
          ? ToUpperInvariant(value)
          : ChangeCaseCultureAware(value, culture.TextInfo, toUpper: true);
  }
  /// <summary>
  /// Converts <paramref name="value"/> to uppercase using the invariant culture's casing rules.
  /// </summary>
  public static Rune ToUpperInvariant(Rune value)
  {
      // Handle the most common case (ASCII data) first. Within the common case, we expect
      // that there'll be a mix of lowercase & uppercase chars, so make the conversion branchless.
      if (value.IsAscii)
      {
          // It's ok for us to use the UTF-16 conversion utility for this since the high
          // 16 bits of the value will never be set so will be left unchanged.
          uint raised = Utf16Utility.ConvertAllAsciiCharsInUInt32ToUppercase(value._value);
          return UnsafeCreate(raised);
      }

      // Without globalization tables, case changing has no effect on non-ASCII data;
      // otherwise the invariant case folding tables decide.
      return GlobalizationMode.Invariant
          ? value
          : ChangeCaseCultureAware(value, TextInfo.Invariant, toUpper: true);
  }
  698. }
  699. }