Rune.cs 34 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861
  1. // Licensed to the .NET Foundation under one or more agreements.
  2. // The .NET Foundation licenses this file to you under the MIT license.
  3. // See the LICENSE file in the project root for more information.
  4. using System.Diagnostics;
  5. using System.Globalization;
  6. using System.Runtime.CompilerServices;
  7. namespace System.Text
  8. {
  9. /// <summary>
  10. /// Represents a Unicode scalar value ([ U+0000..U+D7FF ], inclusive; or [ U+E000..U+10FFFF ], inclusive).
  11. /// </summary>
  12. /// <remarks>
  13. /// This type's constructors and conversion operators validate the input, so consumers can call the APIs
  14. /// assuming that the underlying <see cref="Rune"/> instance is well-formed.
  15. /// </remarks>
  16. [DebuggerDisplay("{DebuggerDisplay,nq}")]
  17. public readonly struct Rune : IComparable<Rune>, IEquatable<Rune>
  18. {
  // Flag bits and mask used to decode an entry of AsciiCharInfo.
  private const byte IsWhiteSpaceFlag = 1 << 7;          // 0x80
  private const byte IsLetterOrDigitFlag = 1 << 6;       // 0x40
  private const byte UnicodeCategoryMask = (1 << 5) - 1; // 0x1F

  // Per-character metadata for the ASCII range [ U+0000..U+007F ], indexed by scalar value.
  // Each entry packs:
  //   - bit 0x80: set when the character is whitespace
  //   - bit 0x40: set when the character is a letter or digit
  //   - bit 0x20: reserved for future use
  //   - low 5 bits: the UnicodeCategory of the character
  private static ReadOnlySpan<byte> AsciiCharInfo => new byte[]
  {
      0x0E, 0x0E, 0x0E, 0x0E, 0x0E, 0x0E, 0x0E, 0x0E, 0x0E, 0x8E, 0x8E, 0x8E, 0x8E, 0x8E, 0x0E, 0x0E, // U+0000..U+000F
      0x0E, 0x0E, 0x0E, 0x0E, 0x0E, 0x0E, 0x0E, 0x0E, 0x0E, 0x0E, 0x0E, 0x0E, 0x0E, 0x0E, 0x0E, 0x0E, // U+0010..U+001F
      0x8B, 0x18, 0x18, 0x18, 0x1A, 0x18, 0x18, 0x18, 0x14, 0x15, 0x18, 0x19, 0x18, 0x13, 0x18, 0x18, // U+0020..U+002F
      0x48, 0x48, 0x48, 0x48, 0x48, 0x48, 0x48, 0x48, 0x48, 0x48, 0x18, 0x18, 0x19, 0x19, 0x19, 0x18, // U+0030..U+003F
      0x18, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, // U+0040..U+004F
      0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x14, 0x18, 0x15, 0x1B, 0x12, // U+0050..U+005F
      0x1B, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, // U+0060..U+006F
      0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x14, 0x19, 0x15, 0x19, 0x0E  // U+0070..U+007F
  };

  // The scalar value wrapped by this instance.
  private readonly uint _value;
  /// <summary>
  /// Initializes a <see cref="Rune"/> from a single UTF-16 code unit.
  /// </summary>
  /// <param name="ch">The UTF-16 code unit to wrap; must not be a surrogate.</param>
  /// <exception cref="ArgumentOutOfRangeException">
  /// <paramref name="ch"/> is a UTF-16 surrogate code point (U+D800..U+DFFF, inclusive).
  /// </exception>
  public Rune(char ch)
  {
      uint codePoint = ch;

      // A lone surrogate is not a scalar value, so it can never be a Rune.
      if (UnicodeUtility.IsSurrogateCodePoint(codePoint))
      {
          ThrowHelper.ThrowArgumentOutOfRangeException(ExceptionArgument.ch);
      }

      _value = codePoint;
  }
  /// <summary>
  /// Initializes a <see cref="Rune"/> from a UTF-16 surrogate pair.
  /// </summary>
  /// <param name="highSurrogate">The leading (high) surrogate code unit.</param>
  /// <param name="lowSurrogate">The trailing (low) surrogate code unit.</param>
  /// <exception cref="ArgumentOutOfRangeException">
  /// <paramref name="highSurrogate"/> is not a UTF-16 high surrogate code point,
  /// or <paramref name="lowSurrogate"/> is not a UTF-16 low surrogate code point.
  /// </exception>
  public Rune(char highSurrogate, char lowSurrogate)
  {
      // char.ConvertToUtf32 performs the surrogate validation (and throws on bad input),
      // so only a debug-time sanity check is needed before storing the result.
      uint scalarValue = (uint)char.ConvertToUtf32(highSurrogate, lowSurrogate);
      UnicodeDebug.AssertIsValidScalar(scalarValue);
      _value = scalarValue;
  }
  /// <summary>
  /// Initializes a <see cref="Rune"/> from a Unicode scalar value given as a signed integer.
  /// </summary>
  /// <param name="value">The candidate scalar value.</param>
  /// <exception cref="ArgumentOutOfRangeException">
  /// <paramref name="value"/> does not represent a valid Unicode scalar value.
  /// </exception>
  public Rune(int value)
  {
      // Reinterpreting as unsigned lets a single validity check reject negative inputs too.
      uint scalarValue = (uint)value;
      if (!UnicodeUtility.IsValidUnicodeScalar(scalarValue))
      {
          ThrowHelper.ThrowArgumentOutOfRangeException(ExceptionArgument.value);
      }

      _value = scalarValue;
  }
  /// <summary>
  /// Initializes a <see cref="Rune"/> from a Unicode scalar value given as an unsigned integer.
  /// </summary>
  /// <param name="value">The candidate scalar value.</param>
  /// <exception cref="ArgumentOutOfRangeException">
  /// <paramref name="value"/> does not represent a valid Unicode scalar value.
  /// </exception>
  [CLSCompliant(false)]
  public Rune(uint value)
  {
      bool isScalar = UnicodeUtility.IsValidUnicodeScalar(value);
      if (!isScalar)
      {
          ThrowHelper.ThrowArgumentOutOfRangeException(ExceptionArgument.value);
      }

      _value = value;
  }
  // Non-validating constructor. The second parameter is ignored; it exists only to give this
  // overload a signature distinct from the validating Rune(uint) constructor.
  private Rune(uint scalarValue, bool unused)
  {
      // Validity is only asserted here, not enforced.
      UnicodeDebug.AssertIsValidScalar(scalarValue);

      _value = scalarValue;
  }
  // Equality and ordering operators; all of them compare the underlying scalar values.
  public static bool operator ==(Rune left, Rune right)
  {
      return left._value == right._value;
  }

  public static bool operator !=(Rune left, Rune right)
  {
      return left._value != right._value;
  }

  public static bool operator <(Rune left, Rune right)
  {
      return left._value < right._value;
  }

  public static bool operator <=(Rune left, Rune right)
  {
      return left._value <= right._value;
  }

  public static bool operator >(Rune left, Rune right)
  {
      return left._value > right._value;
  }

  public static bool operator >=(Rune left, Rune right)
  {
      return left._value >= right._value;
  }
  // The conversions below are explicit rather than implicit because each one forwards to a
  // validating constructor and may therefore throw.
  public static explicit operator Rune(char ch)
  {
      return new Rune(ch);
  }

  [CLSCompliant(false)]
  public static explicit operator Rune(uint value)
  {
      return new Rune(value);
  }

  public static explicit operator Rune(int value)
  {
      return new Rune(value);
  }
  // Displayed as "U+XXXX '<char>'"; e.g., "U+0065 'e'". If the stored value is not a valid
  // scalar, U+FFFD is shown in place of the character.
  private string DebuggerDisplay
  {
      get
      {
          string glyph = IsValid(_value) ? ToString() : "\uFFFD";
          return FormattableString.Invariant($"U+{_value:X4} '{glyph}'");
      }
  }
  /// <summary>
  /// Gets a value indicating whether this scalar value is ASCII ([ U+0000..U+007F ])
  /// and therefore representable by a single UTF-8 code unit.
  /// </summary>
  public bool IsAscii
  {
      get => UnicodeUtility.IsAsciiCodePoint(_value);
  }

  /// <summary>
  /// Gets a value indicating whether this scalar value is within the BMP ([ U+0000..U+FFFF ])
  /// and therefore representable by a single UTF-16 code unit.
  /// </summary>
  public bool IsBmp
  {
      get => UnicodeUtility.IsBmpCodePoint(_value);
  }

  /// <summary>
  /// Gets the Unicode plane (0 to 16, inclusive) which contains this scalar.
  /// </summary>
  public int Plane
  {
      get => UnicodeUtility.GetPlane(_value);
  }

  /// <summary>
  /// Gets a <see cref="Rune"/> instance that represents the Unicode replacement character U+FFFD.
  /// </summary>
  public static Rune ReplacementChar
  {
      get => UnsafeCreate(UnicodeUtility.ReplacementChar);
  }

  /// <summary>
  /// Gets the length in code units (<see cref="char"/>) of the
  /// UTF-16 sequence required to represent this scalar value.
  /// </summary>
  /// <remarks>
  /// The value will be 1 or 2.
  /// </remarks>
  public int Utf16SequenceLength
  {
      get => UnicodeUtility.GetUtf16SequenceLength(_value);
  }

  /// <summary>
  /// Gets the length in code units of the
  /// UTF-8 sequence required to represent this scalar value.
  /// </summary>
  /// <remarks>
  /// The value will be 1 through 4, inclusive.
  /// </remarks>
  public int Utf8SequenceLength
  {
      get => UnicodeUtility.GetUtf8SequenceLength(_value);
  }

  /// <summary>
  /// Gets the Unicode scalar value as an integer.
  /// </summary>
  public int Value
  {
      get => (int)_value;
  }
  // Changes the case of 'rune' using the casing rules of 'textInfo'; upper-cases when 'toUpper'
  // is true, lower-cases otherwise. Callers must already have ruled out invariant
  // globalization mode and a null 'textInfo'.
  private static Rune ChangeCaseCultureAware(Rune rune, TextInfo textInfo, bool toUpper)
  {
      Debug.Assert(!GlobalizationMode.Invariant, "This should've been checked by the caller.");
      Debug.Assert(textInfo != null, "This should've been checked by the caller.");

      // Two code units is the worst case (a surrogate pair), and a case change is expected to
      // preserve the UTF-16 code unit count, so both buffers are trimmed to the same length.
      Span<char> source = stackalloc char[2];
      Span<char> converted = stackalloc char[2];

      int codeUnitCount = rune.EncodeToUtf16(source);
      source = source.Slice(0, codeUnitCount);
      converted = converted.Slice(0, codeUnitCount);

      if (toUpper)
      {
          textInfo.ChangeCaseToUpper(source, converted);
      }
      else
      {
          textInfo.ChangeCaseToLower(source, converted);
      }

      // We use simple case folding rules, which disallows moving between the BMP and supplementary
      // planes when performing a case conversion. The helper methods which reconstruct a Rune
      // contain debug asserts for this condition.
      return rune.IsBmp
          ? UnsafeCreate(converted[0])
          : UnsafeCreate(UnicodeUtility.GetScalarFromUtf16SurrogatePair(converted[0], converted[1]));
  }
  /// <summary>
  /// Compares this instance to <paramref name="other"/> by scalar value.
  /// </summary>
  public int CompareTo(Rune other)
  {
      return _value.CompareTo(other._value);
  }
  // Writes this value to 'destination' as UTF-16 and returns the number of chars written.
  // The caller is responsible for supplying a buffer of at least Utf16SequenceLength chars.
  private int EncodeToUtf16(Span<char> destination)
  {
      Debug.Assert(destination.Length >= Utf16SequenceLength, "Caller should've provided a large enough buffer.");

      bool encoded = TryEncode(destination, out int written);
      Debug.Assert(encoded, "TryEncode should never fail given a large enough buffer.");

      return written;
  }
  /// <summary>
  /// Returns true if <paramref name="obj"/> is a <see cref="Rune"/> equal to this instance.
  /// </summary>
  public override bool Equals(object obj)
  {
      if (obj is Rune other)
      {
          return Equals(other);
      }

      return false;
  }

  /// <summary>
  /// Returns true if <paramref name="other"/> has the same scalar value as this instance.
  /// </summary>
  public bool Equals(Rune other)
  {
      return _value == other._value;
  }

  /// <summary>
  /// Returns the scalar value itself as the hash code.
  /// </summary>
  public override int GetHashCode()
  {
      return (int)_value;
  }
  /// <summary>
  /// Gets the <see cref="Rune"/> which begins at index <paramref name="index"/> in
  /// string <paramref name="input"/>.
  /// </summary>
  /// <param name="input">The string to read from.</param>
  /// <param name="index">The index of the first UTF-16 code unit of the scalar value.</param>
  /// <remarks>
  /// Throws if <paramref name="input"/> is null, if <paramref name="index"/> is out of range, or
  /// if <paramref name="index"/> does not reference the start of a valid scalar value within <paramref name="input"/>.
  /// </remarks>
  public static Rune GetRuneAt(string input, int index)
  {
      // A negative result is the "no valid scalar at this index" signal from the reader.
      int scalarOrError = ReadRuneFromString(input, index);
      if (scalarOrError < 0)
      {
          ThrowHelper.ThrowArgumentException_CannotExtractScalar(ExceptionArgument.index);
      }

      return UnsafeCreate((uint)scalarOrError);
  }
  /// <summary>
  /// Determines whether <paramref name="value"/> is a valid Unicode scalar value,
  /// i.e., is in [ U+0000..U+D7FF ], inclusive; or [ U+E000..U+10FFFF ], inclusive.
  /// </summary>
  /// <returns><see langword="true"/> iff <paramref name="value"/> is a valid scalar value.</returns>
  public static bool IsValid(int value)
  {
      return UnicodeUtility.IsValidUnicodeScalar((uint)value);
  }

  /// <summary>
  /// Determines whether <paramref name="value"/> is a valid Unicode scalar value,
  /// i.e., is in [ U+0000..U+D7FF ], inclusive; or [ U+E000..U+10FFFF ], inclusive.
  /// </summary>
  /// <returns><see langword="true"/> iff <paramref name="value"/> is a valid scalar value.</returns>
  [CLSCompliant(false)]
  public static bool IsValid(uint value)
  {
      return UnicodeUtility.IsValidUnicodeScalar(value);
  }
  // Reads the scalar value that starts at the beginning of 'input'.
  // Returns the scalar value, or a negative number on failure (empty input, a lone low
  // surrogate, or a high surrogate that is not followed by a low surrogate).
  internal static int ReadFirstRuneFromUtf16Buffer(ReadOnlySpan<char> input)
  {
      if (input.IsEmpty)
      {
          return -1;
      }

      // Common case: a single non-surrogate code unit is already the scalar value.
      uint firstUnit = input[0];
      if (!UnicodeUtility.IsSurrogateCodePoint(firstUnit))
      {
          return (int)firstUnit;
      }

      // A surrogate that is not a high surrogate cannot begin a pair.
      if (!UnicodeUtility.IsHighSurrogateCodePoint(firstUnit))
      {
          return -1;
      }

      if (1 >= (uint)input.Length)
      {
          return -1; // not an argument exception - just a "bad data" failure
      }

      uint secondUnit = input[1];
      if (!UnicodeUtility.IsLowSurrogateCodePoint(secondUnit))
      {
          return -1;
      }

      return (int)UnicodeUtility.GetScalarFromUtf16SurrogatePair(firstUnit, secondUnit);
  }
  // Reads the scalar value that starts at 'index' within 'input'.
  // Calls the throw helpers when 'input' is null or 'index' is out of range; otherwise returns
  // the scalar value, or a negative number when the data at 'index' is not the start of a
  // well-formed scalar value.
  private static int ReadRuneFromString(string input, int index)
  {
      if (input is null)
      {
          ThrowHelper.ThrowArgumentNullException(ExceptionArgument.input);
      }

      if ((uint)index >= (uint)input.Length)
      {
          ThrowHelper.ThrowArgumentOutOfRange_IndexException();
      }

      // Common case: a single non-surrogate code unit is already the scalar value.
      uint firstUnit = input[index];
      if (!UnicodeUtility.IsSurrogateCodePoint(firstUnit))
      {
          return (int)firstUnit;
      }

      // A surrogate that is not a high surrogate cannot begin a pair.
      if (!UnicodeUtility.IsHighSurrogateCodePoint(firstUnit))
      {
          return -1;
      }

      // If this becomes a hot code path, we can skip the below bounds check by reading
      // off the end of the string using unsafe code. Since strings are null-terminated,
      // we're guaranteed not to read a valid low surrogate, so we'll fail correctly if
      // the string terminates unexpectedly.
      int nextIndex = index + 1;
      if ((uint)nextIndex >= (uint)input.Length)
      {
          return -1; // not an argument exception - just a "bad data" failure
      }

      uint secondUnit = input[nextIndex];
      if (!UnicodeUtility.IsLowSurrogateCodePoint(secondUnit))
      {
          return -1;
      }

      return (int)UnicodeUtility.GetScalarFromUtf16SurrogatePair(firstUnit, secondUnit);
  }
  /// <summary>
  /// Returns a <see cref="string"/> representation of this <see cref="Rune"/> instance:
  /// one <see cref="char"/> for a BMP value, otherwise a surrogate pair.
  /// </summary>
  public override string ToString()
  {
      if (!IsBmp)
      {
          // Supplementary-plane values are split into their two UTF-16 surrogates.
          UnicodeUtility.GetUtf16SurrogatesFromSupplementaryPlaneScalar(_value, out char leading, out char trailing);
          return string.CreateFromChar(leading, trailing);
      }

      return string.CreateFromChar((char)_value);
  }
  /// <summary>
  /// Attempts to create a <see cref="Rune"/> from a single UTF-16 code unit.
  /// </summary>
  /// <param name="ch">The UTF-16 code unit to convert.</param>
  /// <param name="result">On success, the created <see cref="Rune"/>; otherwise the default value.</param>
  /// <returns><see langword="false"/> if <paramref name="ch"/> is a surrogate; otherwise <see langword="true"/>.</returns>
  public static bool TryCreate(char ch, out Rune result)
  {
      uint codePoint = ch;
      if (UnicodeUtility.IsSurrogateCodePoint(codePoint))
      {
          result = default;
          return false;
      }

      result = UnsafeCreate(codePoint);
      return true;
  }
  /// <summary>
  /// Attempts to create a <see cref="Rune"/> from the provided UTF-16 surrogate pair.
  /// Returns <see langword="false"/> if the input values don't represent a well-formed UTF-16 surrogate pair.
  /// </summary>
  /// <param name="highSurrogate">The candidate high (leading) surrogate.</param>
  /// <param name="lowSurrogate">The candidate low (trailing) surrogate.</param>
  /// <param name="result">On success, the decoded <see cref="Rune"/>; otherwise the default value.</param>
  /// <returns><see langword="true"/> if the pair was well-formed; otherwise <see langword="false"/>.</returns>
  public static bool TryCreate(char highSurrogate, char lowSurrogate, out Rune result)
  {
      // First, extend both to 32 bits, then calculate the offset of
      // each candidate surrogate char from the start of its range.
      // A char below its range start wraps around to a very large unsigned offset.
      uint highSurrogateOffset = (uint)highSurrogate - CharUnicodeInfo.HIGH_SURROGATE_START;
      uint lowSurrogateOffset = (uint)lowSurrogate - CharUnicodeInfo.LOW_SURROGATE_START;

      // This is a single comparison which allows us to check both for validity at once since
      // both the high surrogate range and the low surrogate range are the same length.
      // NOTE(review): OR-ing the offsets relies on HIGH_SURROGATE_RANGE having the form 2^n - 1
      // (presumably 0x3FF) - confirm against CharUnicodeInfo.
      if ((highSurrogateOffset | lowSurrogateOffset) <= CharUnicodeInfo.HIGH_SURROGATE_RANGE)
      {
          // The 0x40u << 10 below is to account for uuuuu = wwww + 1 in the surrogate encoding.
          result = UnsafeCreate((highSurrogateOffset << 10) + lowSurrogateOffset + (0x40u << 10));
          return true;
      }

      // Didn't have a high surrogate followed by a low surrogate. This is reported through the
      // return value; nothing is thrown.
      result = default;
      return false;
  }
  /// <summary>
  /// Attempts to create a <see cref="Rune"/> from a signed integer scalar value.
  /// </summary>
  /// <param name="value">The candidate scalar value.</param>
  /// <param name="result">Receives whatever the unsigned overload produces.</param>
  /// <returns>The result of the unsigned overload for <paramref name="value"/> reinterpreted as unsigned.</returns>
  public static bool TryCreate(int value, out Rune result)
  {
      return TryCreate((uint)value, out result);
  }
  /// <summary>
  /// Attempts to create a <see cref="Rune"/> from an unsigned integer scalar value.
  /// </summary>
  /// <param name="value">The candidate scalar value.</param>
  /// <param name="result">On success, the created <see cref="Rune"/>; otherwise the default value.</param>
  /// <returns><see langword="true"/> if <paramref name="value"/> is a valid Unicode scalar value; otherwise <see langword="false"/>.</returns>
  [CLSCompliant(false)]
  public static bool TryCreate(uint value, out Rune result)
  {
      if (!UnicodeUtility.IsValidUnicodeScalar(value))
      {
          result = default;
          return false;
      }

      result = UnsafeCreate(value);
      return true;
  }
  /// <summary>
  /// Encodes this <see cref="Rune"/> to a UTF-16 destination buffer.
  /// </summary>
  /// <param name="destination">The buffer to which to write this value as UTF-16.</param>
  /// <param name="charsWritten">
  /// The number of <see cref="char"/>s written to <paramref name="destination"/>,
  /// or 0 if the destination buffer is not large enough to contain the output.
  /// </param>
  /// <returns>True if the value was written to the buffer; otherwise, false.</returns>
  /// <remarks>
  /// The <see cref="Utf16SequenceLength"/> property can be queried ahead of time to determine
  /// the required size of the <paramref name="destination"/> buffer.
  /// </remarks>
  public bool TryEncode(Span<char> destination, out int charsWritten)
  {
      if (IsBmp)
      {
          // BMP values occupy exactly one code unit.
          if (destination.Length >= 1)
          {
              destination[0] = (char)_value;
              charsWritten = 1;
              return true;
          }
      }
      else if (destination.Length >= 2)
      {
          // Supplementary-plane values are written as a surrogate pair.
          UnicodeUtility.GetUtf16SurrogatesFromSupplementaryPlaneScalar(_value, out destination[0], out destination[1]);
          charsWritten = 2;
          return true;
      }

      // Destination buffer not large enough
      charsWritten = default;
      return false;
  }
  /// <summary>
  /// Encodes this <see cref="Rune"/> to a destination buffer as UTF-8 bytes.
  /// </summary>
  /// <param name="destination">The buffer to which to write this value as UTF-8.</param>
  /// <param name="bytesWritten">
  /// The number of <see cref="byte"/>s written to <paramref name="destination"/>,
  /// or 0 if the destination buffer is not large enough to contain the output.
  /// </param>
  /// <returns>True if the value was written to the buffer; otherwise, false.</returns>
  /// <remarks>
  /// The <see cref="Utf8SequenceLength"/> property can be queried ahead of time to determine
  /// the required size of the <paramref name="destination"/> buffer.
  /// </remarks>
  // ** This is public so it can be unit tested but isn't yet exposed via the reference assemblies. **
  public bool TryEncodeToUtf8Bytes(Span<byte> destination, out int bytesWritten)
  {
      // TODO: Optimize some of these writes by using BMI2 instructions.

      // The bit patterns below come from the Unicode Standard, Table 3-6.
      // In the multi-byte cases, the lead-byte marker is added to the scalar before shifting,
      // so that a single shift yields the marker bits combined with the scalar's top bits.

      if (IsAscii)
      {
          if (destination.Length >= 1)
          {
              // Scalar 0xxxxxxx -> bytes [ 0xxxxxxx ]
              destination[0] = (byte)_value;
              bytesWritten = 1;
              return true;
          }
      }
      else if (_value <= 0x7FFu)
      {
          if (destination.Length >= 2)
          {
              // Scalar 00000yyy yyxxxxxx -> bytes [ 110yyyyy 10xxxxxx ]
              destination[0] = (byte)((_value + (0b110u << 11)) >> 6);
              destination[1] = (byte)((_value & 0x3Fu) + 0x80u);
              bytesWritten = 2;
              return true;
          }
      }
      else if (_value <= 0xFFFFu)
      {
          if (destination.Length >= 3)
          {
              // Scalar zzzzyyyy yyxxxxxx -> bytes [ 1110zzzz 10yyyyyy 10xxxxxx ]
              destination[0] = (byte)((_value + (0b1110u << 16)) >> 12);
              destination[1] = (byte)(((_value & (0x3Fu << 6)) >> 6) + 0x80u);
              destination[2] = (byte)((_value & 0x3Fu) + 0x80u);
              bytesWritten = 3;
              return true;
          }
      }
      else if (destination.Length >= 4)
      {
          // Scalar 000uuuuu zzzzyyyy yyxxxxxx -> bytes [ 11110uuu 10uuzzzz 10yyyyyy 10xxxxxx ]
          destination[0] = (byte)((_value + (0b11110u << 21)) >> 18);
          destination[1] = (byte)(((_value & (0x3Fu << 12)) >> 12) + 0x80u);
          destination[2] = (byte)(((_value & (0x3Fu << 6)) >> 6) + 0x80u);
          destination[3] = (byte)((_value & 0x3Fu) + 0x80u);
          bytesWritten = 4;
          return true;
      }

      // Destination buffer not large enough
      bytesWritten = default;
      return false;
  }
  /// <summary>
  /// Attempts to get the <see cref="Rune"/> which begins at index <paramref name="index"/> in
  /// string <paramref name="input"/>.
  /// </summary>
  /// <param name="input">The string to read from.</param>
  /// <param name="index">The index of the first UTF-16 code unit of the scalar value.</param>
  /// <param name="value">On success, the extracted <see cref="Rune"/>; otherwise the default value.</param>
  /// <returns><see langword="true"/> if a scalar value was successfully extracted from the specified index,
  /// <see langword="false"/> if a value could not be extracted due to invalid data.</returns>
  /// <remarks>
  /// Throws only if <paramref name="input"/> is null or <paramref name="index"/> is out of range.
  /// </remarks>
  public static bool TryGetRuneAt(string input, int index, out Rune value)
  {
      // A negative result is the "invalid data" signal from the reader.
      int scalarOrError = ReadRuneFromString(input, index);
      if (scalarOrError < 0)
      {
          value = default;
          return false;
      }

      value = UnsafeCreate((uint)scalarOrError);
      return true;
  }
  // Constructs a Rune from an arbitrary 32-bit integer WITHOUT validating it. The caller is
  // responsible for having validated the value beforehand. If a Rune instance is forcibly
  // constructed from invalid input, the APIs on this type have undefined behavior, potentially
  // including introducing a security hole in the consuming application.
  //
  // An example of a security hole resulting from an invalid Rune value, which could result
  // in a stack overflow:
  //
  //     public int GetMarvin32HashCode(Rune r) {
  //         Span<char> buffer = stackalloc char[r.Utf16SequenceLength];
  //         r.TryEncode(buffer, ...);
  //         return Marvin32.ComputeHash(buffer.AsBytes());
  //     }
  /// <summary>
  /// Creates a <see cref="Rune"/> without performing validation on the input.
  /// </summary>
  [MethodImpl(MethodImplOptions.AggressiveInlining)]
  internal static Rune UnsafeCreate(uint scalarValue)
  {
      return new Rune(scalarValue, unused: false);
  }
  // These are analogs of APIs on System.Char

  /// <summary>
  /// Gets the numeric value associated with <paramref name="value"/>.
  /// </summary>
  /// <returns>
  /// For ASCII input: 0 through 9 for the digits '0' through '9', and -1 for everything else.
  /// For non-ASCII input: whatever the globalization table lookup returns.
  /// </returns>
  public static double GetNumericValue(Rune value)
  {
      if (!value.IsAscii)
      {
          // not an ASCII char; fall back to globalization table
          return CharUnicodeInfo.InternalGetNumericValue(value.Value);
      }

      // Unsigned subtraction wraps for anything below '0', so one comparison covers both bounds.
      uint digit = value._value - '0';
      return (digit <= 9) ? (double)digit : -1;
  }
  /// <summary>
  /// Gets the <see cref="UnicodeCategory"/> of <paramref name="value"/>.
  /// </summary>
  public static UnicodeCategory GetUnicodeCategory(Rune value)
  {
      if (!value.IsAscii)
      {
          return GetUnicodeCategoryNonAscii(value);
      }

      // ASCII fast path: the category lives in the low bits of the lookup table entry.
      int packedInfo = AsciiCharInfo[value.Value];
      return (UnicodeCategory)(packedInfo & UnicodeCategoryMask);
  }

  // Category lookup for non-ASCII values via CharUnicodeInfo; ASCII callers are expected to
  // use the table-based path instead.
  private static UnicodeCategory GetUnicodeCategoryNonAscii(Rune value)
  {
      Debug.Assert(!value.IsAscii, "Shouldn't use this non-optimized code path for ASCII characters.");

      int scalar = value.Value;
      return CharUnicodeInfo.GetUnicodeCategory(scalar);
  }
  // Each helper below tests whether a category falls within a contiguous range of
  // UnicodeCategory enum values.

  // Returns true iff this Unicode category represents a letter
  // (UppercaseLetter through OtherLetter).
  private static bool IsCategoryLetter(UnicodeCategory category) =>
      UnicodeUtility.IsInRangeInclusive((uint)category, (uint)UnicodeCategory.UppercaseLetter, (uint)UnicodeCategory.OtherLetter);

  // Returns true iff this Unicode category represents a letter or a decimal digit
  private static bool IsCategoryLetterOrDecimalDigit(UnicodeCategory category) =>
      IsCategoryLetter(category) || (category == UnicodeCategory.DecimalDigitNumber);

  // Returns true iff this Unicode category represents a number
  // (DecimalDigitNumber through OtherNumber).
  private static bool IsCategoryNumber(UnicodeCategory category) =>
      UnicodeUtility.IsInRangeInclusive((uint)category, (uint)UnicodeCategory.DecimalDigitNumber, (uint)UnicodeCategory.OtherNumber);

  // Returns true iff this Unicode category represents a punctuation mark
  // (ConnectorPunctuation through OtherPunctuation).
  private static bool IsCategoryPunctuation(UnicodeCategory category) =>
      UnicodeUtility.IsInRangeInclusive((uint)category, (uint)UnicodeCategory.ConnectorPunctuation, (uint)UnicodeCategory.OtherPunctuation);

  // Returns true iff this Unicode category represents a separator
  // (SpaceSeparator through ParagraphSeparator).
  private static bool IsCategorySeparator(UnicodeCategory category) =>
      UnicodeUtility.IsInRangeInclusive((uint)category, (uint)UnicodeCategory.SpaceSeparator, (uint)UnicodeCategory.ParagraphSeparator);

  // Returns true iff this Unicode category represents a symbol
  // (MathSymbol through OtherSymbol).
  private static bool IsCategorySymbol(UnicodeCategory category) =>
      UnicodeUtility.IsInRangeInclusive((uint)category, (uint)UnicodeCategory.MathSymbol, (uint)UnicodeCategory.OtherSymbol);
  /// <summary>
  /// Returns true iff <paramref name="value"/> is a control character,
  /// i.e., is in [ U+0000..U+001F ] or [ U+007F..U+009F ].
  /// </summary>
  public static bool IsControl(Rune value)
  {
      // Per the Unicode stability policy, the set of control characters
      // is forever fixed at [ U+0000..U+001F ], [ U+007F..U+009F ]. No
      // characters will ever be added to the "control characters" group.
      // See http://www.unicode.org/policies/stability_policy.html.

      // Logic below depends on Rune.Value never being -1 (since Rune is a validating type)
      // 00..1F (+1) => 01..20 (&~80) => 01..20
      // 7F..9F (+1) => 80..A0 (&~80) => 00..20
      uint folded = (value._value + 1) & ~0x80u;
      return folded <= 0x20u;
  }
  /// <summary>
  /// Returns true iff <paramref name="value"/> is a decimal digit: '0' through '9' for ASCII,
  /// or category <see cref="UnicodeCategory.DecimalDigitNumber"/> otherwise.
  /// </summary>
  public static bool IsDigit(Rune value) =>
      value.IsAscii
          ? UnicodeUtility.IsInRangeInclusive(value._value, '0', '9')
          : (GetUnicodeCategoryNonAscii(value) == UnicodeCategory.DecimalDigitNumber);
  /// <summary>
  /// Returns true iff <paramref name="value"/> is a letter: [A-Za-z] for ASCII,
  /// or any letter category otherwise.
  /// </summary>
  public static bool IsLetter(Rune value)
  {
      if (!value.IsAscii)
      {
          return IsCategoryLetter(GetUnicodeCategoryNonAscii(value));
      }

      // Clearing bit 0x20 of the offset from 'A' folds lowercase onto uppercase, so a single
      // unsigned range check matches [A-Za-z].
      uint foldedOffset = (value._value - 'A') & ~0x20u;
      return foldedOffset <= (uint)('Z' - 'A');
  }
  /// <summary>
  /// Returns true iff <paramref name="value"/> is a letter or a decimal digit.
  /// </summary>
  public static bool IsLetterOrDigit(Rune value) =>
      value.IsAscii
          ? ((AsciiCharInfo[value.Value] & IsLetterOrDigitFlag) != 0) // ASCII: flag bit in the lookup table
          : IsCategoryLetterOrDecimalDigit(GetUnicodeCategoryNonAscii(value));
  /// <summary>
  /// Returns true iff <paramref name="value"/> is a lowercase letter: 'a' through 'z' for ASCII,
  /// or category <see cref="UnicodeCategory.LowercaseLetter"/> otherwise.
  /// </summary>
  public static bool IsLower(Rune value) =>
      value.IsAscii
          ? UnicodeUtility.IsInRangeInclusive(value._value, 'a', 'z')
          : (GetUnicodeCategoryNonAscii(value) == UnicodeCategory.LowercaseLetter);
  /// <summary>
  /// Returns true iff <paramref name="value"/> is a number: '0' through '9' for ASCII,
  /// or any number category otherwise.
  /// </summary>
  public static bool IsNumber(Rune value) =>
      value.IsAscii
          ? UnicodeUtility.IsInRangeInclusive(value._value, '0', '9')
          : IsCategoryNumber(GetUnicodeCategoryNonAscii(value));
  /// <summary>
  /// Returns true iff the Unicode category of <paramref name="value"/> is a punctuation category.
  /// </summary>
  public static bool IsPunctuation(Rune value) => IsCategoryPunctuation(GetUnicodeCategory(value));

  /// <summary>
  /// Returns true iff the Unicode category of <paramref name="value"/> is a separator category.
  /// </summary>
  public static bool IsSeparator(Rune value) => IsCategorySeparator(GetUnicodeCategory(value));

  /// <summary>
  /// Returns true iff the Unicode category of <paramref name="value"/> is a symbol category.
  /// </summary>
  public static bool IsSymbol(Rune value) => IsCategorySymbol(GetUnicodeCategory(value));
  /// <summary>
  /// Returns true iff <paramref name="value"/> is an uppercase letter: 'A' through 'Z' for ASCII,
  /// or category <see cref="UnicodeCategory.UppercaseLetter"/> otherwise.
  /// </summary>
  public static bool IsUpper(Rune value) =>
      value.IsAscii
          ? UnicodeUtility.IsInRangeInclusive(value._value, 'A', 'Z')
          : (GetUnicodeCategoryNonAscii(value) == UnicodeCategory.UppercaseLetter);
  /// <summary>
  /// Returns true iff <paramref name="value"/> is a whitespace character.
  /// </summary>
  public static bool IsWhiteSpace(Rune value)
  {
      if (value.IsAscii)
      {
          // ASCII: flag bit in the lookup table.
          return (AsciiCharInfo[value.Value] & IsWhiteSpaceFlag) != 0;
      }

      // U+0085 is special since it's a whitespace character but is in the Control category
      // instead of a normal separator category. No other code point outside the ASCII range
      // has this mismatch.
      return (value._value == 0x0085u)
          || IsCategorySeparator(GetUnicodeCategoryNonAscii(value));
  }
  /// <summary>
  /// Converts <paramref name="value"/> to lowercase using the casing rules of <paramref name="culture"/>.
  /// </summary>
  /// <param name="value">The scalar value to convert.</param>
  /// <param name="culture">The culture whose casing rules are used; must not be null.</param>
  public static Rune ToLower(Rune value, CultureInfo culture)
  {
      if (culture is null)
      {
          ThrowHelper.ThrowArgumentNullException(ExceptionArgument.culture);
      }

      // We don't want to special-case ASCII here since the specified culture might handle
      // ASCII characters differently than the invariant culture (e.g., Turkish I). Instead
      // we'll just jump straight to the globalization tables if they're available.
      return GlobalizationMode.Invariant
          ? ToLowerInvariant(value)
          : ChangeCaseCultureAware(value, culture.TextInfo, toUpper: false);
  }
  /// <summary>
  /// Converts <paramref name="value"/> to lowercase using the invariant culture's casing rules.
  /// </summary>
  public static Rune ToLowerInvariant(Rune value)
  {
      // Handle the most common case (ASCII data) first. Within the common case, we expect
      // that there'll be a mix of lowercase & uppercase chars, so make the conversion branchless.
      if (value.IsAscii)
      {
          // It's ok for us to use the UTF-16 conversion utility for this since the high
          // 16 bits of the value will never be set so will be left unchanged.
          uint lowered = Utf16Utility.ConvertAllAsciiCharsInUInt32ToLowercase(value._value);
          return UnsafeCreate(lowered);
      }

      // If the value isn't ASCII and the globalization tables aren't available, case changing
      // has no effect; otherwise non-ASCII data goes through the case folding tables.
      return GlobalizationMode.Invariant
          ? value
          : ChangeCaseCultureAware(value, TextInfo.Invariant, toUpper: false);
  }
  /// <summary>
  /// Converts <paramref name="value"/> to uppercase using the casing rules of <paramref name="culture"/>.
  /// </summary>
  /// <param name="value">The scalar value to convert.</param>
  /// <param name="culture">The culture whose casing rules are used; must not be null.</param>
  public static Rune ToUpper(Rune value, CultureInfo culture)
  {
      if (culture is null)
      {
          ThrowHelper.ThrowArgumentNullException(ExceptionArgument.culture);
      }

      // We don't want to special-case ASCII here since the specified culture might handle
      // ASCII characters differently than the invariant culture (e.g., Turkish I). Instead
      // we'll just jump straight to the globalization tables if they're available.
      return GlobalizationMode.Invariant
          ? ToUpperInvariant(value)
          : ChangeCaseCultureAware(value, culture.TextInfo, toUpper: true);
  }
  /// <summary>
  /// Converts <paramref name="value"/> to uppercase using the invariant culture's casing rules.
  /// </summary>
  public static Rune ToUpperInvariant(Rune value)
  {
      // Handle the most common case (ASCII data) first. Within the common case, we expect
      // that there'll be a mix of lowercase & uppercase chars, so make the conversion branchless.
      if (value.IsAscii)
      {
          // It's ok for us to use the UTF-16 conversion utility for this since the high
          // 16 bits of the value will never be set so will be left unchanged.
          uint raised = Utf16Utility.ConvertAllAsciiCharsInUInt32ToUppercase(value._value);
          return UnsafeCreate(raised);
      }

      // If the value isn't ASCII and the globalization tables aren't available, case changing
      // has no effect; otherwise non-ASCII data goes through the case folding tables.
      return GlobalizationMode.Invariant
          ? value
          : ChangeCaseCultureAware(value, TextInfo.Invariant, toUpper: true);
  }
  735. }
  736. }