Rune.cs 56 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338
  1. // Licensed to the .NET Foundation under one or more agreements.
  2. // The .NET Foundation licenses this file to you under the MIT license.
  3. // See the LICENSE file in the project root for more information.
  4. using System.Buffers;
  5. using System.Diagnostics;
  6. using System.Globalization;
  7. using System.Runtime.CompilerServices;
  8. using System.Text.Unicode;
  9. namespace System.Text
  10. {
  11. /// <summary>
  12. /// Represents a Unicode scalar value ([ U+0000..U+D7FF ], inclusive; or [ U+E000..U+10FFFF ], inclusive).
  13. /// </summary>
  14. /// <remarks>
  15. /// This type's constructors and conversion operators validate the input, so consumers can call the APIs
  16. /// assuming that the underlying <see cref="Rune"/> instance is well-formed.
  17. /// </remarks>
  18. [DebuggerDisplay("{DebuggerDisplay,nq}")]
  19. public readonly struct Rune : IComparable<Rune>, IEquatable<Rune>
  20. {
  // Bit layout of each entry in the AsciiCharInfo table below.
  private const byte IsWhiteSpaceFlag = 0b1000_0000;
  private const byte IsLetterOrDigitFlag = 0b0100_0000;
  private const byte UnicodeCategoryMask = 0b0001_1111;

  // Per-character metadata for the ASCII range [ U+0000..U+007F ], indexed by code point.
  // Each entry packs:
  // - bit 0x80: set when the character is whitespace
  // - bit 0x40: set when the character is a letter or digit
  // - bit 0x20: reserved for future use
  // - low 5 bits: the UnicodeCategory of the character
  private static ReadOnlySpan<byte> AsciiCharInfo => new byte[]
  {
      0x0E, 0x0E, 0x0E, 0x0E, 0x0E, 0x0E, 0x0E, 0x0E, 0x0E, 0x8E, 0x8E, 0x8E, 0x8E, 0x8E, 0x0E, 0x0E, // U+0000..U+000F
      0x0E, 0x0E, 0x0E, 0x0E, 0x0E, 0x0E, 0x0E, 0x0E, 0x0E, 0x0E, 0x0E, 0x0E, 0x0E, 0x0E, 0x0E, 0x0E, // U+0010..U+001F
      0x8B, 0x18, 0x18, 0x18, 0x1A, 0x18, 0x18, 0x18, 0x14, 0x15, 0x18, 0x19, 0x18, 0x13, 0x18, 0x18, // U+0020..U+002F
      0x48, 0x48, 0x48, 0x48, 0x48, 0x48, 0x48, 0x48, 0x48, 0x48, 0x18, 0x18, 0x19, 0x19, 0x19, 0x18, // U+0030..U+003F
      0x18, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, // U+0040..U+004F
      0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x14, 0x18, 0x15, 0x1B, 0x12, // U+0050..U+005F
      0x1B, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, // U+0060..U+006F
      0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x14, 0x19, 0x15, 0x19, 0x0E  // U+0070..U+007F
  };

  // Backing storage for the scalar value wrapped by this instance.
  private readonly uint _value;
  /// <summary>
  /// Initializes a <see cref="Rune"/> from a single UTF-16 code unit.
  /// </summary>
  /// <param name="ch">The UTF-16 code unit to wrap.</param>
  /// <exception cref="ArgumentOutOfRangeException">
  /// <paramref name="ch"/> is a UTF-16 surrogate code point (U+D800..U+DFFF, inclusive).
  /// </exception>
  public Rune(char ch)
  {
      uint codeUnit = ch;

      // A lone surrogate is not a scalar value and cannot be stored.
      if (UnicodeUtility.IsSurrogateCodePoint(codeUnit))
      {
          ThrowHelper.ThrowArgumentOutOfRangeException(ExceptionArgument.ch);
      }

      _value = codeUnit;
  }
  /// <summary>
  /// Initializes a <see cref="Rune"/> from a UTF-16 surrogate pair.
  /// </summary>
  /// <param name="highSurrogate">The leading (high) surrogate of the pair.</param>
  /// <param name="lowSurrogate">The trailing (low) surrogate of the pair.</param>
  /// <exception cref="ArgumentOutOfRangeException">
  /// <paramref name="highSurrogate"/> is not a UTF-16 high surrogate code point,
  /// or <paramref name="lowSurrogate"/> is not a UTF-16 low surrogate code point.
  /// </exception>
  public Rune(char highSurrogate, char lowSurrogate)
      // char.ConvertToUtf32 performs the surrogate validation, so the combined
      // value is handed to the non-validating constructor.
      : this((uint)char.ConvertToUtf32(highSurrogate, lowSurrogate), unused: false)
  {
  }
  /// <summary>
  /// Initializes a <see cref="Rune"/> from a Unicode scalar value given as a signed integer.
  /// </summary>
  /// <param name="value">The candidate scalar value.</param>
  /// <exception cref="ArgumentOutOfRangeException">
  /// <paramref name="value"/> is not a valid Unicode scalar value.
  /// </exception>
  public Rune(int value)
      // All validation lives in the uint overload.
      : this((uint)value)
  {
  }
  /// <summary>
  /// Initializes a <see cref="Rune"/> from a Unicode scalar value given as an unsigned integer.
  /// </summary>
  /// <param name="value">The candidate scalar value.</param>
  /// <exception cref="ArgumentOutOfRangeException">
  /// <paramref name="value"/> is not a valid Unicode scalar value.
  /// </exception>
  [CLSCompliant(false)]
  public Rune(uint value)
  {
      bool isScalar = UnicodeUtility.IsValidUnicodeScalar(value);
      if (!isScalar)
      {
          ThrowHelper.ThrowArgumentOutOfRangeException(ExceptionArgument.value);
      }

      _value = value;
  }
  // Non-validating constructor. The bool parameter is never read; it exists only to
  // give this overload a signature distinct from the public Rune(uint) constructor.
  // The input is checked by a debug assertion only.
  private Rune(uint scalarValue, bool unused)
  {
      UnicodeDebug.AssertIsValidScalar(scalarValue);

      _value = scalarValue;
  }
  // Equality and ordering operators; all of them compare the underlying scalar values.
  public static bool operator ==(Rune left, Rune right) => left._value == right._value;

  public static bool operator !=(Rune left, Rune right) => left._value != right._value;

  public static bool operator <(Rune left, Rune right) => left._value < right._value;

  public static bool operator <=(Rune left, Rune right) => left._value <= right._value;

  public static bool operator >(Rune left, Rune right) => left._value > right._value;

  public static bool operator >=(Rune left, Rune right) => left._value >= right._value;
  // These conversions go through the validating constructors, which can throw,
  // so they are declared explicit rather than implicit.
  public static explicit operator Rune(char ch)
  {
      return new Rune(ch);
  }

  [CLSCompliant(false)]
  public static explicit operator Rune(uint value)
  {
      return new Rune(value);
  }

  public static explicit operator Rune(int value)
  {
      return new Rune(value);
  }
  // Displayed as "U+XXXX '<char>'"; e.g., "U+0065 'e'". If the stored value fails
  // the IsValid check, U+FFFD is shown in place of the character.
  private string DebuggerDisplay
  {
      get
      {
          string glyph = IsValid(_value) ? ToString() : "\uFFFD";
          return FormattableString.Invariant($"U+{_value:X4} '{glyph}'");
      }
  }
  /// <summary>
  /// Gets a value indicating whether this scalar value is ASCII ([ U+0000..U+007F ])
  /// and can therefore be represented by a single UTF-8 code unit.
  /// </summary>
  public bool IsAscii
  {
      get { return UnicodeUtility.IsAsciiCodePoint(_value); }
  }

  /// <summary>
  /// Gets a value indicating whether this scalar value lies in the BMP ([ U+0000..U+FFFF ])
  /// and can therefore be represented by a single UTF-16 code unit.
  /// </summary>
  public bool IsBmp
  {
      get { return UnicodeUtility.IsBmpCodePoint(_value); }
  }

  /// <summary>
  /// Gets the Unicode plane (0 to 16, inclusive) that contains this scalar.
  /// </summary>
  public int Plane
  {
      get { return UnicodeUtility.GetPlane(_value); }
  }

  /// <summary>
  /// Gets a <see cref="Rune"/> instance representing the Unicode replacement character U+FFFD.
  /// </summary>
  public static Rune ReplacementChar
  {
      get { return UnsafeCreate(UnicodeUtility.ReplacementChar); }
  }
  /// <summary>
  /// Gets the number of UTF-16 code units (<see cref="Char"/>) needed to
  /// represent this scalar value.
  /// </summary>
  /// <remarks>
  /// The result is either 1 or 2.
  /// </remarks>
  public int Utf16SequenceLength
  {
      get { return UnicodeUtility.GetUtf16SequenceLength(_value); }
  }

  /// <summary>
  /// Gets the number of UTF-8 code units needed to represent this scalar value.
  /// </summary>
  /// <remarks>
  /// The result is in the range 1 through 4, inclusive.
  /// </remarks>
  public int Utf8SequenceLength
  {
      get { return UnicodeUtility.GetUtf8SequenceLength(_value); }
  }

  /// <summary>
  /// Gets the Unicode scalar value as a signed integer.
  /// </summary>
  public int Value
  {
      get { return (int)_value; }
  }
  // Converts a single rune to upper or lower case using the supplied TextInfo.
  // The rune is round-tripped through a small UTF-16 scratch buffer because the
  // TextInfo case-change helpers operate on spans of chars.
  private static Rune ChangeCaseCultureAware(Rune rune, TextInfo textInfo, bool toUpper)
  {
      Debug.Assert(!GlobalizationMode.Invariant, "This should've been checked by the caller.");
      Debug.Assert(textInfo != null, "This should've been checked by the caller.");

      // Two code units is the worst case (a surrogate pair); the converted text is
      // expected to occupy the same number of code units as the input.
      Span<char> inputBuffer = stackalloc char[2];
      Span<char> outputBuffer = stackalloc char[2];

      int codeUnitCount = rune.EncodeToUtf16(inputBuffer);
      Span<char> input = inputBuffer.Slice(0, codeUnitCount);
      Span<char> output = outputBuffer.Slice(0, codeUnitCount);

      if (toUpper)
      {
          textInfo.ChangeCaseToUpper(input, output);
      }
      else
      {
          textInfo.ChangeCaseToLower(input, output);
      }

      // Simple case folding never moves a character between the BMP and the
      // supplementary planes, so the shape of the result follows the shape of the
      // input. The helpers that rebuild the Rune contain debug asserts for this.
      if (!rune.IsBmp)
      {
          return UnsafeCreate(UnicodeUtility.GetScalarFromUtf16SurrogatePair(output[0], output[1]));
      }

      return UnsafeCreate(output[0]);
  }
  /// <summary>
  /// Compares this instance to <paramref name="other"/> by scalar value.
  /// </summary>
  public int CompareTo(Rune other)
  {
      return _value.CompareTo(other._value);
  }
  /// <summary>
  /// Decodes the <see cref="Rune"/> at the start of the provided UTF-16 source buffer.
  /// </summary>
  /// <param name="source">The UTF-16 data to read from.</param>
  /// <param name="result">The decoded scalar, or <see cref="ReplacementChar"/> when decoding does not succeed.</param>
  /// <param name="charsConsumed">The number of <see langword="char"/>s the caller should skip past.</param>
  /// <returns>
  /// <para>
  /// <see cref="OperationStatus.Done"/> when the buffer starts with a well-formed UTF-16 scalar value;
  /// <paramref name="result"/> holds the decoded <see cref="Rune"/> and <paramref name="charsConsumed"/>
  /// is the number of <see langword="char"/>s (1 or 2) that encoded it.
  /// </para>
  /// <para>
  /// <see cref="OperationStatus.NeedMoreData"/> when the buffer is empty or consists solely of one
  /// UTF-16 high surrogate; <paramref name="result"/> is <see cref="ReplacementChar"/> and
  /// <paramref name="charsConsumed"/> is the length of the buffer.
  /// </para>
  /// <para>
  /// <see cref="OperationStatus.InvalidData"/> when the buffer starts with an ill-formed sequence;
  /// <paramref name="result"/> is <see cref="ReplacementChar"/> and <paramref name="charsConsumed"/>
  /// is the number of <see langword="char"/>s making up the ill-formed sequence.
  /// </para>
  /// </returns>
  /// <remarks>
  /// Intended to be called in a loop, slicing <paramref name="source"/> by <paramref name="charsConsumed"/>
  /// after every call. Because <paramref name="result"/> is <see cref="ReplacementChar"/> whenever
  /// decoding fails, such a loop gets U+FFFD substitution of invalid sequences automatically.
  /// </remarks>
  public static OperationStatus DecodeFromUtf16(ReadOnlySpan<char> source, out Rune result, out int charsConsumed)
  {
      if (source.IsEmpty)
      {
          // Nothing to decode yet.
          charsConsumed = 0;
          result = ReplacementChar;
          return OperationStatus.NeedMoreData;
      }

      // Common case: a single code unit that is itself a BMP scalar value.
      char first = source[0];
      if (TryCreate(first, out result))
      {
          charsConsumed = 1;
          return OperationStatus.Done;
      }

      // The first code unit is a surrogate. Optimistically treat it as a high
      // surrogate and try to pair it with whatever follows.
      if (source.Length > 1)
      {
          if (TryCreate(first, source[1], out result))
          {
              // Formed a supplementary scalar value.
              charsConsumed = 2;
              return OperationStatus.Done;
          }

          // Either the first unit was a low surrogate or the second unit was not
          // a low surrogate; fall through to the error exit.
      }
      else if (char.IsHighSurrogate(first))
      {
          // A lone high surrogate at the end of the buffer could still be completed
          // by later input, so the single available unit is reported as pending.
          charsConsumed = 1;
          result = ReplacementChar;
          return OperationStatus.NeedMoreData;
      }

      // A lone low surrogate can never become valid, and a mismatched pair is an
      // error. The maximal invalid subsequence in UTF-16 is always one code unit.
      charsConsumed = 1;
      result = ReplacementChar;
      return OperationStatus.InvalidData;
  }
  /// <summary>
  /// Decodes the <see cref="Rune"/> at the start of the provided UTF-8 source buffer.
  /// </summary>
  /// <param name="source">The UTF-8 data to read from.</param>
  /// <param name="result">The decoded scalar, or <see cref="ReplacementChar"/> when decoding does not succeed.</param>
  /// <param name="bytesConsumed">The number of <see langword="byte"/>s the caller should skip past.</param>
  /// <returns>
  /// <para>
  /// <see cref="OperationStatus.Done"/> when the buffer starts with a well-formed UTF-8 scalar value;
  /// <paramref name="result"/> holds the decoded <see cref="Rune"/> and <paramref name="bytesConsumed"/>
  /// is the number of <see langword="byte"/>s (1 through 4) that encoded it.
  /// </para>
  /// <para>
  /// <see cref="OperationStatus.NeedMoreData"/> when the buffer is empty or ends partway through a
  /// multi-byte sequence that has been well-formed so far; <paramref name="result"/> is
  /// <see cref="ReplacementChar"/> and <paramref name="bytesConsumed"/> is the length of the buffer.
  /// </para>
  /// <para>
  /// <see cref="OperationStatus.InvalidData"/> when the buffer starts with an ill-formed sequence;
  /// <paramref name="result"/> is <see cref="ReplacementChar"/> and <paramref name="bytesConsumed"/>
  /// is the number of <see langword="byte"/>s (1 through 3) making up the ill-formed sequence.
  /// </para>
  /// </returns>
  /// <remarks>
  /// Intended to be called in a loop, slicing <paramref name="source"/> by <paramref name="bytesConsumed"/>
  /// after every call. Because <paramref name="result"/> is <see cref="ReplacementChar"/> whenever
  /// decoding fails, such a loop gets U+FFFD substitution of invalid sequences automatically.
  /// </remarks>
  public static OperationStatus DecodeFromUtf8(ReadOnlySpan<byte> source, out Rune result, out int bytesConsumed)
  {
      // Ill-formed input is reported using the "maximal subpart" practice recommended by
      // The Unicode Standard, Ch. 3.9: an invalid subsequence swallows as many code units
      // as still form the beginning of some well-formed sequence per Table 3-7.

      int pos = 0;

      // Read byte 0.

      if ((uint)pos >= (uint)source.Length)
      {
          goto NeedsMoreData;
      }

      uint scalar = source[pos];
      if (!UnicodeUtility.IsAsciiCodePoint(scalar))
      {
          goto NotAscii;
      }

  Finish:

      // 'pos' is the index of the last byte of the sequence, 'scalar' the decoded value.
      bytesConsumed = pos + 1;
      Debug.Assert(1 <= bytesConsumed && bytesConsumed <= 4); // Valid subsequences are always length [1..4]
      result = UnsafeCreate(scalar);
      return OperationStatus.Done;

  NotAscii:

      // Table 3-7 only allows lead bytes in [C2..F4] for multi-byte sequences. Anything
      // else is a stray continuation byte, an overlong two-byte lead, or a lead for an
      // out-of-range four-byte sequence.

      if (!UnicodeUtility.IsInRangeInclusive(scalar, 0xC2, 0xF4))
      {
          goto FirstByteInvalid;
      }

      scalar = (scalar - 0xC2) << 6;

      // Read byte 1.

      pos++;
      if ((uint)pos >= (uint)source.Length)
      {
          goto NeedsMoreData;
      }

      // A continuation byte has the form [10xxxxxx]; read as a signed byte that is the
      // range [-128..-65], so one signed comparison identifies non-continuation bytes.

      int trailer = (sbyte)source[pos];
      if (trailer >= -64)
      {
          goto Invalid;
      }

      scalar += (uint)trailer;
      scalar += 0x80; // strip the continuation byte marker
      scalar += (0xC2 - 0xC0) << 6; // strip the lead byte marker

      if (scalar < 0x0800)
      {
          Debug.Assert(UnicodeUtility.IsInRangeInclusive(scalar, 0x0080, 0x07FF));
          goto Finish; // well-formed 2-byte sequence
      }

      // This must be a 3- or 4-byte sequence. Per Table 3-7 the first two code units
      // already reveal overlong and surrogate encodings, so those are rejected here.

      if (!UnicodeUtility.IsInRangeInclusive(scalar, ((0xE0 - 0xC0) << 6) + (0xA0 - 0x80), ((0xF4 - 0xC0) << 6) + (0x8F - 0x80)))
      {
          // First two bytes fall outside [[E0 A0]..[F4 8F]]:
          // an overlong 3-byte sequence or an out-of-range 4-byte sequence.
          goto Invalid;
      }

      if (UnicodeUtility.IsInRangeInclusive(scalar, ((0xED - 0xC0) << 6) + (0xA0 - 0x80), ((0xED - 0xC0) << 6) + (0xBF - 0x80)))
      {
          // Encodes a UTF-16 surrogate code point, which UTF-8 forbids.
          goto Invalid;
      }

      if (UnicodeUtility.IsInRangeInclusive(scalar, ((0xF0 - 0xC0) << 6) + (0x80 - 0x80), ((0xF0 - 0xC0) << 6) + (0x8F - 0x80)))
      {
          // Overlong 4-byte sequence.
          goto Invalid;
      }

      // The first two bytes are acceptable. The remaining bytes only need to be
      // continuation bytes; no further range checks are required.

      // Read byte 2.

      pos++;
      if ((uint)pos >= (uint)source.Length)
      {
          goto NeedsMoreData;
      }

      trailer = (sbyte)source[pos];
      if (trailer >= -64)
      {
          goto Invalid; // not a UTF-8 continuation byte
      }

      scalar <<= 6;
      scalar += (uint)trailer;
      scalar += 0x80; // strip the continuation byte marker
      scalar -= (0xE0 - 0xC0) << 12; // strip the lead byte marker

      if (scalar <= 0xFFFF)
      {
          Debug.Assert(UnicodeUtility.IsInRangeInclusive(scalar, 0x0800, 0xFFFF));
          goto Finish; // well-formed 3-byte sequence
      }

      // Read byte 3.

      pos++;
      if ((uint)pos >= (uint)source.Length)
      {
          goto NeedsMoreData;
      }

      trailer = (sbyte)source[pos];
      if (trailer >= -64)
      {
          goto Invalid; // not a UTF-8 continuation byte
      }

      scalar <<= 6;
      scalar += (uint)trailer;
      scalar += 0x80; // strip the continuation byte marker
      scalar -= (0xF0 - 0xE0) << 18; // strip the lead byte marker

      UnicodeDebug.AssertIsValidSupplementaryPlaneScalar(scalar);
      goto Finish; // well-formed 4-byte sequence

  FirstByteInvalid:

      pos = 1; // An invalid subsequence is never shorter than one byte.

  Invalid:

      // 'pos' is the count of bytes that belong to the invalid subsequence.
      Debug.Assert(1 <= pos && pos <= 3); // Invalid subsequences are always length 1..3
      bytesConsumed = pos;
      result = ReplacementChar;
      return OperationStatus.InvalidData;

  NeedsMoreData:

      // 'pos' is the count of bytes seen so far, i.e. everything in the buffer.
      Debug.Assert(0 <= pos && pos <= 3); // Incomplete subsequences are always length 0..3
      bytesConsumed = pos;
      result = ReplacementChar;
      return OperationStatus.NeedMoreData;
  }
  /// <summary>
  /// Decodes the <see cref="Rune"/> at the end of the provided UTF-16 source buffer.
  /// </summary>
  /// <param name="source">The UTF-16 data to read from.</param>
  /// <param name="result">The decoded scalar, or <see cref="ReplacementChar"/> when decoding does not succeed.</param>
  /// <param name="charsConsumed">The number of trailing <see langword="char"/>s the caller should drop.</param>
  /// <returns>
  /// <see cref="OperationStatus.Done"/> when a scalar value was decoded;
  /// <see cref="OperationStatus.NeedMoreData"/> when the buffer is empty or ends with a high surrogate;
  /// <see cref="OperationStatus.InvalidData"/> when the buffer ends with an unpaired low surrogate.
  /// </returns>
  /// <remarks>
  /// The backward-reading counterpart of <see cref="DecodeFromUtf16(ReadOnlySpan{char}, out Rune, out int)"/>.
  /// A caller iterating from the end should, after every call, remove the final
  /// <paramref name="charsConsumed"/> elements from <paramref name="source"/>.
  /// </remarks>
  public static OperationStatus DecodeLastFromUtf16(ReadOnlySpan<char> source, out Rune result, out int charsConsumed)
  {
      if (source.IsEmpty)
      {
          // Not an error: more input may arrive later.
          charsConsumed = 0;
          result = ReplacementChar;
          return OperationStatus.NeedMoreData;
      }

      // Common case: the last code unit is itself a BMP scalar value.
      char last = source[source.Length - 1];
      if (TryCreate(last, out result))
      {
          charsConsumed = 1;
          return OperationStatus.Done;
      }

      if (!char.IsLowSurrogate(last))
      {
          // The buffer ends with a high surrogate. That is not an error either,
          // since a following low surrogate could still complete the pair.
          charsConsumed = 1;
          result = ReplacementChar;
          return OperationStatus.NeedMoreData;
      }

      // The last code unit is a low surrogate; it is only valid when the unit
      // immediately before it is a high surrogate.
      if (source.Length >= 2 && TryCreate(source[source.Length - 2], last, out result))
      {
          // Formed a supplementary scalar value.
          charsConsumed = 2;
          return OperationStatus.Done;
      }

      // Standalone low surrogate, which is always invalid.
      charsConsumed = 1;
      result = ReplacementChar;
      return OperationStatus.InvalidData;
  }
  /// <summary>
  /// Decodes the <see cref="Rune"/> at the end of the provided UTF-8 source buffer.
  /// </summary>
  /// <param name="source">The UTF-8 data to read from.</param>
  /// <param name="value">The decoded scalar, or <see cref="ReplacementChar"/> when decoding does not succeed.</param>
  /// <param name="bytesConsumed">The number of trailing <see langword="byte"/>s the caller should drop.</param>
  /// <remarks>
  /// The backward-reading counterpart of <see cref="DecodeFromUtf8(ReadOnlySpan{byte}, out Rune, out int)"/>.
  /// A caller iterating from the end should, after every call, remove the final
  /// <paramref name="bytesConsumed"/> elements from <paramref name="source"/>.
  /// </remarks>
  public static OperationStatus DecodeLastFromUtf8(ReadOnlySpan<byte> source, out Rune value, out int bytesConsumed)
  {
      int pos = source.Length - 1;
      if ((uint)pos >= (uint)source.Length)
      {
          // Source buffer was empty.
          value = ReplacementChar;
          bytesConsumed = 0;
          return OperationStatus.NeedMoreData;
      }

      // At least one byte is present. Fast path: the buffer ends with an ASCII byte.

      uint lastByte = source[pos];
      if (UnicodeUtility.IsAsciiCodePoint(lastByte))
      {
          bytesConsumed = 1;
          value = UnsafeCreate(lastByte);
          return OperationStatus.Done;
      }

      // A non-ASCII final byte is either the start or a later part of a multi-byte
      // sequence. Lead bytes are recognizable because their 0x40 bit is set, and no
      // sequence is longer than 4 code units, so a short backward search suffices.

      if (((byte)lastByte & 0x40) != 0)
      {
          // The final byte is a lead byte. A forward read from here reports either
          // invalid (for bytes such as C0 or F5) or incomplete; both are acceptable.
          return DecodeFromUtf8(source.Slice(pos), out value, out bytesConsumed);
      }

      // The final byte is a continuation byte. Inspect up to three bytes before it,
      // looking for the byte that starts the sequence.

      for (int stepsLeft = 3; stepsLeft > 0; stepsLeft--)
      {
          pos--;
          if ((uint)pos >= (uint)source.Length)
          {
              goto Invalid; // ran off the front of the buffer
          }

          // Stops on ASCII bytes (00..7F) and on bytes with both 0xC0 bits set (C0..FF);
          // read as a signed byte, those are exactly the values in [-64..127].
          if ((sbyte)source[pos] >= -64)
          {
              goto ForwardDecode;
          }
      }

  Invalid:

      // Reached when:
      // - the last 4 bytes of the buffer are all continuation bytes;
      // - the whole buffer (shorter than 4 bytes) is continuation bytes; or
      // - no lead byte sits between the final continuation byte and the preceding
      //   well-formed subsequence or maximal invalid subsequence.
      //
      // In each case the final byte alone is a maximal invalid subsequence of length 1.
      // The reasoning is spelled out just before the last 'goto Invalid' in this method.

      value = ReplacementChar;
      bytesConsumed = 1;
      return OperationStatus.InvalidData;

  ForwardDecode:

      // source[pos] is an ASCII byte or a byte with the lead-byte bit pattern. It may
      // still be an illegal lead such as C0 or F5, but the forward read deals with that.
      // Decode forward from here and see whether the read reaches the end of the buffer.

      ReadOnlySpan<byte> tail = source.Slice(pos);
      Debug.Assert(!tail.IsEmpty, "Shouldn't reach this for empty inputs.");

      OperationStatus forwardStatus = DecodeFromUtf8(tail, out Rune forwardRune, out int forwardConsumed);
      if (forwardConsumed == tail.Length)
      {
          // The forward read used every remaining byte, so its outcome (well-formed,
          // incomplete, or invalid) is also the outcome of this method. For the invalid
          // case, consuming the whole tail means the maximal invalid subsequence was found.
          bytesConsumed = forwardConsumed;
          value = forwardRune;
          return forwardStatus;
      }

      // The forward read stopped short of the final continuation byte. With no lead
      // byte between what was consumed and that final byte, it must belong to an invalid
      // subsequence. Any maximal invalid subsequence longer than one byte begins with a
      // lead byte, so the final continuation byte is a maximal invalid subsequence of
      // length 1 on its own.

      goto Invalid;
  }
  /// <summary>
  /// Writes the UTF-16 representation of this <see cref="Rune"/> into <paramref name="destination"/>.
  /// </summary>
  /// <param name="destination">The buffer that receives the UTF-16 code units.</param>
  /// <returns>The count of <see cref="char"/>s written to <paramref name="destination"/>.</returns>
  /// <exception cref="ArgumentException">
  /// <paramref name="destination"/> is too small to hold the encoded value.
  /// </exception>
  public int EncodeToUtf16(Span<char> destination)
  {
      // Delegate to the non-throwing variant and convert its failure into an exception.
      bool encoded = TryEncodeToUtf16(destination, out int written);
      if (!encoded)
      {
          ThrowHelper.ThrowArgumentException_DestinationTooShort();
      }

      return written;
  }
  /// <summary>
  /// Writes the UTF-8 representation of this <see cref="Rune"/> into <paramref name="destination"/>.
  /// </summary>
  /// <param name="destination">The buffer that receives the UTF-8 code units.</param>
  /// <returns>The count of <see cref="byte"/>s written to <paramref name="destination"/>.</returns>
  /// <exception cref="ArgumentException">
  /// <paramref name="destination"/> is too small to hold the encoded value.
  /// </exception>
  public int EncodeToUtf8(Span<byte> destination)
  {
      // Delegate to the non-throwing variant and convert its failure into an exception.
      bool encoded = TryEncodeToUtf8(destination, out int written);
      if (!encoded)
      {
          ThrowHelper.ThrowArgumentException_DestinationTooShort();
      }

      return written;
  }
  /// <summary>
  /// Returns <see langword="true"/> when <paramref name="obj"/> is a <see cref="Rune"/>
  /// that compares equal to this instance; otherwise <see langword="false"/>.
  /// </summary>
  public override bool Equals(object? obj)
  {
      if (obj is Rune candidate)
      {
          return this.Equals(candidate);
      }

      return false;
  }
  /// <summary>
  /// Returns <see langword="true"/> when <paramref name="other"/> compares equal to this
  /// instance using the type's equality operator.
  /// </summary>
  public bool Equals(Rune other)
  {
      return this == other;
  }
  /// <summary>
  /// Returns the hash code for this instance, which is simply <see cref="Value"/>.
  /// </summary>
  public override int GetHashCode()
  {
      return Value;
  }
  /// <summary>
  /// Retrieves the <see cref="Rune"/> that starts at position <paramref name="index"/>
  /// of the string <paramref name="input"/>.
  /// </summary>
  /// <remarks>
  /// Throws when <paramref name="input"/> is null, when <paramref name="index"/> is out of range,
  /// or when <paramref name="index"/> does not point at the start of a valid scalar value
  /// within <paramref name="input"/>.
  /// </remarks>
  public static Rune GetRuneAt(string input, int index)
  {
      // A negative result signals malformed data at the requested position.
      int scalarOrNegative = ReadRuneFromString(input, index);
      if (scalarOrNegative < 0)
      {
          ThrowHelper.ThrowArgumentException_CannotExtractScalar(ExceptionArgument.index);
      }

      return UnsafeCreate((uint)scalarOrNegative);
  }
  /// <summary>
  /// Determines whether <paramref name="value"/> is a valid Unicode scalar value, that is,
  /// whether it lies in [ U+0000..U+D7FF ] or [ U+E000..U+10FFFF ] (both inclusive).
  /// </summary>
  public static bool IsValid(int value)
  {
      // Reinterpret as unsigned and defer to the uint overload.
      return IsValid((uint)value);
  }
  /// <summary>
  /// Determines whether <paramref name="value"/> is a valid Unicode scalar value, that is,
  /// whether it lies in [ U+0000..U+D7FF ] or [ U+E000..U+10FFFF ] (both inclusive).
  /// </summary>
  [CLSCompliant(false)]
  public static bool IsValid(uint value)
  {
      return UnicodeUtility.IsValidUnicodeScalar(value);
  }
  // Decodes the scalar value at the start of a UTF-16 buffer.
  // Yields the scalar value on success, or a negative number when the buffer is empty
  // or does not begin with well-formed UTF-16 data.
  internal static int ReadFirstRuneFromUtf16Buffer(ReadOnlySpan<char> input)
  {
      const int Failure = -1;

      if (input.Length == 0)
      {
          return Failure;
      }

      uint first = input[0];

      // Common case: a non-surrogate code unit is already a complete scalar value.
      if (!UnicodeUtility.IsSurrogateCodePoint(first))
      {
          return (int)first;
      }

      // A surrogate is only acceptable here if it is a high surrogate...
      if (!UnicodeUtility.IsHighSurrogateCodePoint(first))
      {
          return Failure;
      }

      // ...and there is a second code unit to pair it with (bad data otherwise, not an argument error)...
      if (input.Length <= 1)
      {
          return Failure;
      }

      // ...and that second code unit is a low surrogate.
      uint second = input[1];
      if (!UnicodeUtility.IsLowSurrogateCodePoint(second))
      {
          return Failure;
      }

      return (int)UnicodeUtility.GetScalarFromUtf16SurrogatePair(first, second);
  }
  // Reads the scalar value that begins at input[index].
  // Throws if 'input' is null or 'index' is out of range. Returns a negative number
  // (a "bad data" failure, not an argument exception) when the UTF-16 data at that
  // position is malformed: a lone low surrogate, a high surrogate at the very end of
  // the string, or a high surrogate not followed by a low surrogate.
  private static int ReadRuneFromString(string input, int index)
  {
      if (input is null)
      {
          ThrowHelper.ThrowArgumentNullException(ExceptionArgument.input);
      }

      if ((uint)index >= (uint)input!.Length)
      {
          ThrowHelper.ThrowArgumentOutOfRange_IndexException();
      }

      // The index is known to be in range, so the slice is non-empty. The span-based
      // reader performs the same surrogate decoding this method used to duplicate,
      // so share that single implementation.
      return ReadFirstRuneFromUtf16Buffer(input.AsSpan(index));
  }
  /// <summary>
  /// Produces the <see cref="string"/> form of this <see cref="Rune"/>: a single UTF-16 code unit
  /// for BMP values, or a surrogate pair for supplementary-plane values.
  /// </summary>
  public override string ToString()
  {
      if (!IsBmp)
      {
          // Supplementary-plane scalar: split into its two surrogate code units.
          UnicodeUtility.GetUtf16SurrogatesFromSupplementaryPlaneScalar(_value, out char lead, out char trail);
          return string.CreateFromChar(lead, trail);
      }

      return string.CreateFromChar((char)_value);
  }
  /// <summary>
  /// Tries to build a <see cref="Rune"/> from a single UTF-16 code unit.
  /// </summary>
  /// <returns>
  /// <see langword="true"/> if <paramref name="ch"/> is not a surrogate code unit; otherwise
  /// <see langword="false"/>, with <paramref name="result"/> set to its default value.
  /// </returns>
  public static bool TryCreate(char ch, out Rune result)
  {
      uint codeUnit = ch;
      bool isScalar = !UnicodeUtility.IsSurrogateCodePoint(codeUnit);

      result = isScalar ? UnsafeCreate(codeUnit) : default;
      return isScalar;
  }
  /// <summary>
  /// Attempts to create a <see cref="Rune"/> from the provided UTF-16 surrogate pair.
  /// Returns <see langword="false"/> if the input values don't represent a well-formed UTF-16 surrogate pair.
  /// </summary>
  /// <param name="highSurrogate">The candidate high (leading) surrogate code unit.</param>
  /// <param name="lowSurrogate">The candidate low (trailing) surrogate code unit.</param>
  /// <param name="result">
  /// On success, the scalar value encoded by the pair; otherwise the default <see cref="Rune"/>.
  /// </param>
  public static bool TryCreate(char highSurrogate, char lowSurrogate, out Rune result)
  {
      // First, extend both to 32 bits, then calculate the offset of
      // each candidate surrogate char from the start of its range.
      uint highSurrogateOffset = (uint)highSurrogate - CharUnicodeInfo.HIGH_SURROGATE_START;
      uint lowSurrogateOffset = (uint)lowSurrogate - CharUnicodeInfo.LOW_SURROGATE_START;

      // This is a single comparison which allows us to check both for validity at once since
      // both the high surrogate range and the low surrogate range are the same length.
      // If the comparison fails, this method reports failure by returning false; it never throws.
      if ((highSurrogateOffset | lowSurrogateOffset) <= CharUnicodeInfo.HIGH_SURROGATE_RANGE)
      {
          // The 0x40u << 10 below is to account for uuuuu = wwww + 1 in the surrogate encoding.
          result = UnsafeCreate((highSurrogateOffset << 10) + lowSurrogateOffset + (0x40u << 10));
          return true;
      }

      // Didn't have a high surrogate followed by a low surrogate.
      result = default;
      return false;
  }
  /// <summary>
  /// Tries to build a <see cref="Rune"/> from the given 32-bit signed integer.
  /// </summary>
  public static bool TryCreate(int value, out Rune result)
  {
      // Reinterpret as unsigned and defer to the uint overload.
      return TryCreate((uint)value, out result);
  }
  /// <summary>
  /// Tries to build a <see cref="Rune"/> from the given 32-bit unsigned integer.
  /// </summary>
  /// <returns>
  /// <see langword="true"/> if <paramref name="value"/> is a valid Unicode scalar value; otherwise
  /// <see langword="false"/>, with <paramref name="result"/> set to its default value.
  /// </returns>
  [CLSCompliant(false)]
  public static bool TryCreate(uint value, out Rune result)
  {
      bool isScalar = UnicodeUtility.IsValidUnicodeScalar(value);

      result = isScalar ? UnsafeCreate(value) : default;
      return isScalar;
  }
  /// <summary>
  /// Tries to write the UTF-16 representation of this <see cref="Rune"/> into <paramref name="destination"/>.
  /// </summary>
  /// <param name="destination">The buffer that receives the UTF-16 code units.</param>
  /// <param name="charsWritten">
  /// The count of <see cref="char"/>s written to <paramref name="destination"/>,
  /// or 0 when the buffer is too small to hold the output.</param>
  /// <returns>True if the value was written to the buffer; otherwise, false.</returns>
  /// <remarks>
  /// Query <see cref="Utf16SequenceLength"/> beforehand to learn how large
  /// <paramref name="destination"/> needs to be.
  /// </remarks>
  public bool TryEncodeToUtf16(Span<char> destination, out int charsWritten)
  {
      if (IsBmp)
      {
          // BMP values occupy exactly one code unit.
          if (destination.Length >= 1)
          {
              destination[0] = (char)_value;
              charsWritten = 1;
              return true;
          }
      }
      else if (destination.Length >= 2)
      {
          // Supplementary-plane values occupy a surrogate pair.
          UnicodeUtility.GetUtf16SurrogatesFromSupplementaryPlaneScalar(_value, out destination[0], out destination[1]);
          charsWritten = 2;
          return true;
      }

      // Not enough room; nothing was written.
      charsWritten = 0;
      return false;
  }
  /// <summary>
  /// Tries to write the UTF-8 representation of this <see cref="Rune"/> into <paramref name="destination"/>.
  /// </summary>
  /// <param name="destination">The buffer that receives the UTF-8 code units.</param>
  /// <param name="bytesWritten">
  /// The count of <see cref="byte"/>s written to <paramref name="destination"/>,
  /// or 0 when the buffer is too small to hold the output.</param>
  /// <returns>True if the value was written to the buffer; otherwise, false.</returns>
  /// <remarks>
  /// Query <see cref="Utf8SequenceLength"/> beforehand to learn how large
  /// <paramref name="destination"/> needs to be.
  /// </remarks>
  public bool TryEncodeToUtf8(Span<byte> destination, out int bytesWritten)
  {
      // The bit patterns below come from the Unicode Standard, Table 3-6.
      uint scalar = _value;
      int capacity = destination.Length;

      if (IsAscii)
      {
          if (capacity >= 1)
          {
              // Scalar 0xxxxxxx -> single byte, unchanged
              destination[0] = (byte)scalar;
              bytesWritten = 1;
              return true;
          }
      }
      else if (scalar <= 0x7FFu)
      {
          if (capacity >= 2)
          {
              // Scalar 00000yyy yyxxxxxx -> bytes [ 110yyyyy 10xxxxxx ]
              destination[0] = (byte)((scalar + (0b110u << 11)) >> 6);
              destination[1] = (byte)((scalar & 0x3Fu) | 0x80u);
              bytesWritten = 2;
              return true;
          }
      }
      else if (scalar <= 0xFFFFu)
      {
          if (capacity >= 3)
          {
              // Scalar zzzzyyyy yyxxxxxx -> bytes [ 1110zzzz 10yyyyyy 10xxxxxx ]
              destination[0] = (byte)((scalar + (0b1110u << 16)) >> 12);
              destination[1] = (byte)(((scalar >> 6) & 0x3Fu) | 0x80u);
              destination[2] = (byte)((scalar & 0x3Fu) | 0x80u);
              bytesWritten = 3;
              return true;
          }
      }
      else if (capacity >= 4)
      {
          // Scalar 000uuuuu zzzzyyyy yyxxxxxx -> bytes [ 11110uuu 10uuzzzz 10yyyyyy 10xxxxxx ]
          destination[0] = (byte)((scalar + (0b11110u << 21)) >> 18);
          destination[1] = (byte)(((scalar >> 12) & 0x3Fu) | 0x80u);
          destination[2] = (byte)(((scalar >> 6) & 0x3Fu) | 0x80u);
          destination[3] = (byte)((scalar & 0x3Fu) | 0x80u);
          bytesWritten = 4;
          return true;
      }

      // Not enough room for the required sequence length; nothing was written.
      bytesWritten = 0;
      return false;
  }
  /// <summary>
  /// Tries to retrieve the <see cref="Rune"/> that starts at position <paramref name="index"/>
  /// of the string <paramref name="input"/>.
  /// </summary>
  /// <returns><see langword="true"/> when a scalar value was extracted at the given index;
  /// <see langword="false"/> when invalid data prevented extraction.</returns>
  /// <remarks>
  /// The only throwing conditions are a null <paramref name="input"/> or an out-of-range <paramref name="index"/>.
  /// </remarks>
  public static bool TryGetRuneAt(string input, int index, out Rune value)
  {
      // A negative result signals malformed data at the requested position.
      int scalarOrNegative = ReadRuneFromString(input, index);
      bool extracted = scalarOrNegative >= 0;

      value = extracted ? UnsafeCreate((uint)scalarOrNegative) : default;
      return extracted;
  }
  // Builds a Rune from an arbitrary 32-bit integer with no validation whatsoever.
  // Callers are responsible for having validated the value themselves beforehand.
  // Once a Rune has been forced into existence from invalid input, every API on this
  // type has undefined behavior, which may go as far as opening a security hole in
  // the consuming application.
  //
  // Illustration of such a security hole: with an invalid Rune value the following
  // could end in a stack overflow.
  //
  // public int GetMarvin32HashCode(Rune r) {
  //   Span<char> buffer = stackalloc char[r.Utf16SequenceLength];
  //   r.TryEncode(buffer, ...);
  //   return Marvin32.ComputeHash(buffer.AsBytes());
  // }
  /// <summary>
  /// Constructs a <see cref="Rune"/> from <paramref name="scalarValue"/>, skipping input validation.
  /// </summary>
  [MethodImpl(MethodImplOptions.AggressiveInlining)]
  internal static Rune UnsafeCreate(uint scalarValue)
  {
      return new Rune(scalarValue, false);
  }
  904. // These are analogs of APIs on System.Char
  /// <summary>
  /// Gets the numeric value associated with <paramref name="value"/>.
  /// </summary>
  /// <returns>
  /// For ASCII input: the digit value 0 through 9 for '0' through '9', or -1 for anything else.
  /// For non-ASCII input: whatever the globalization table lookup reports.
  /// </returns>
  public static double GetNumericValue(Rune value)
  {
      if (!value.IsAscii)
      {
          // Outside ASCII the answer comes from the globalization table.
          return CharUnicodeInfo.InternalGetNumericValue(value.Value);
      }

      // Unsigned subtraction: anything below '0' wraps to a large number and fails the range test.
      uint digit = value._value - '0';
      if (digit <= 9)
      {
          return (double)digit;
      }

      return -1;
  }
  /// <summary>
  /// Gets the <see cref="UnicodeCategory"/> of <paramref name="value"/>, using the ASCII lookup
  /// table for ASCII input and the non-ASCII path for everything else.
  /// </summary>
  public static UnicodeCategory GetUnicodeCategory(Rune value)
  {
      return value.IsAscii
          ? (UnicodeCategory)(AsciiCharInfo[value.Value] & UnicodeCategoryMask)
          : GetUnicodeCategoryNonAscii(value);
  }
  // Category lookup for non-ASCII values; ASCII callers are expected to take the table-based fast path instead.
  private static UnicodeCategory GetUnicodeCategoryNonAscii(Rune value)
  {
      Debug.Assert(!value.IsAscii, "Shouldn't use this non-optimized code path for ASCII characters.");

      int scalar = value.Value;
      return CharUnicodeInfo.GetUnicodeCategory(scalar);
  }
  // True exactly when the category falls in the letter span [UppercaseLetter..OtherLetter].
  private static bool IsCategoryLetter(UnicodeCategory category)
      => UnicodeUtility.IsInRangeInclusive(
          (uint)category,
          (uint)UnicodeCategory.UppercaseLetter,
          (uint)UnicodeCategory.OtherLetter);
  // True exactly when the category is a letter category or DecimalDigitNumber.
  private static bool IsCategoryLetterOrDecimalDigit(UnicodeCategory category)
  {
      if (category == UnicodeCategory.DecimalDigitNumber)
      {
          return true;
      }

      // Letter categories form the contiguous span [UppercaseLetter..OtherLetter].
      return UnicodeUtility.IsInRangeInclusive(
          (uint)category,
          (uint)UnicodeCategory.UppercaseLetter,
          (uint)UnicodeCategory.OtherLetter);
  }
  // True exactly when the category falls in the number span [DecimalDigitNumber..OtherNumber].
  private static bool IsCategoryNumber(UnicodeCategory category)
      => UnicodeUtility.IsInRangeInclusive(
          (uint)category,
          (uint)UnicodeCategory.DecimalDigitNumber,
          (uint)UnicodeCategory.OtherNumber);
  // True exactly when the category falls in the punctuation span [ConnectorPunctuation..OtherPunctuation].
  private static bool IsCategoryPunctuation(UnicodeCategory category)
      => UnicodeUtility.IsInRangeInclusive(
          (uint)category,
          (uint)UnicodeCategory.ConnectorPunctuation,
          (uint)UnicodeCategory.OtherPunctuation);
  // True exactly when the category falls in the separator span [SpaceSeparator..ParagraphSeparator].
  private static bool IsCategorySeparator(UnicodeCategory category)
      => UnicodeUtility.IsInRangeInclusive(
          (uint)category,
          (uint)UnicodeCategory.SpaceSeparator,
          (uint)UnicodeCategory.ParagraphSeparator);
  // True exactly when the category falls in the symbol span [MathSymbol..OtherSymbol].
  private static bool IsCategorySymbol(UnicodeCategory category)
      => UnicodeUtility.IsInRangeInclusive(
          (uint)category,
          (uint)UnicodeCategory.MathSymbol,
          (uint)UnicodeCategory.OtherSymbol);
  /// <summary>
  /// Returns <see langword="true"/> if <paramref name="value"/> is a control character,
  /// i.e., lies in [ U+0000..U+001F ] or [ U+007F..U+009F ].
  /// </summary>
  public static bool IsControl(Rune value)
  {
      // The Unicode stability policy freezes the set of control characters at
      // [ U+0000..U+001F ], [ U+007F..U+009F ]; nothing will ever be added to it.
      // See http://www.unicode.org/policies/stability_policy.html.

      // This trick relies on Rune.Value never being -1 (Rune is a validating type), so the +1 cannot wrap.
      // 00..1F (+1) => 01..20 (&~80) => 01..20
      // 7F..9F (+1) => 80..A0 (&~80) => 00..20
      uint shifted = value._value + 1;
      uint folded = shifted & ~0x80u;
      return folded <= 0x20u;
  }
  /// <summary>
  /// Returns <see langword="true"/> if <paramref name="value"/> is a decimal digit: '0' through '9'
  /// for ASCII input, or category <see cref="UnicodeCategory.DecimalDigitNumber"/> otherwise.
  /// </summary>
  public static bool IsDigit(Rune value)
  {
      return value.IsAscii
          ? UnicodeUtility.IsInRangeInclusive(value._value, '0', '9')
          : GetUnicodeCategoryNonAscii(value) == UnicodeCategory.DecimalDigitNumber;
  }
  /// <summary>
  /// Returns <see langword="true"/> if <paramref name="value"/> is a letter: [A-Za-z] for ASCII input,
  /// or any letter category otherwise.
  /// </summary>
  public static bool IsLetter(Rune value)
  {
      if (!value.IsAscii)
      {
          return IsCategoryLetter(GetUnicodeCategoryNonAscii(value));
      }

      // Clearing bit 0x20 of the distance from 'A' folds the lowercase range onto the
      // uppercase one, so a single unsigned comparison covers [A-Za-z].
      uint offsetFromA = value._value - 'A';
      uint caseFolded = offsetFromA & ~0x20u;
      return caseFolded <= (uint)('Z' - 'A');
  }
  /// <summary>
  /// Returns <see langword="true"/> if <paramref name="value"/> is a letter or a decimal digit.
  /// ASCII input is answered from the ASCII lookup table; other input from its Unicode category.
  /// </summary>
  public static bool IsLetterOrDigit(Rune value)
  {
      return value.IsAscii
          ? (AsciiCharInfo[value.Value] & IsLetterOrDigitFlag) != 0
          : IsCategoryLetterOrDecimalDigit(GetUnicodeCategoryNonAscii(value));
  }
  /// <summary>
  /// Returns <see langword="true"/> if <paramref name="value"/> is a lowercase letter: 'a' through 'z'
  /// for ASCII input, or category <see cref="UnicodeCategory.LowercaseLetter"/> otherwise.
  /// </summary>
  public static bool IsLower(Rune value)
  {
      return value.IsAscii
          ? UnicodeUtility.IsInRangeInclusive(value._value, 'a', 'z')
          : GetUnicodeCategoryNonAscii(value) == UnicodeCategory.LowercaseLetter;
  }
  /// <summary>
  /// Returns <see langword="true"/> if <paramref name="value"/> is a number: '0' through '9'
  /// for ASCII input, or any number category otherwise.
  /// </summary>
  public static bool IsNumber(Rune value)
  {
      return value.IsAscii
          ? UnicodeUtility.IsInRangeInclusive(value._value, '0', '9')
          : IsCategoryNumber(GetUnicodeCategoryNonAscii(value));
  }
  /// <summary>
  /// Returns <see langword="true"/> if the Unicode category of <paramref name="value"/>
  /// is one of the punctuation categories.
  /// </summary>
  public static bool IsPunctuation(Rune value)
  {
      UnicodeCategory category = GetUnicodeCategory(value);
      return IsCategoryPunctuation(category);
  }
  /// <summary>
  /// Returns <see langword="true"/> if the Unicode category of <paramref name="value"/>
  /// is one of the separator categories.
  /// </summary>
  public static bool IsSeparator(Rune value)
  {
      UnicodeCategory category = GetUnicodeCategory(value);
      return IsCategorySeparator(category);
  }
  /// <summary>
  /// Returns <see langword="true"/> if the Unicode category of <paramref name="value"/>
  /// is one of the symbol categories.
  /// </summary>
  public static bool IsSymbol(Rune value)
  {
      UnicodeCategory category = GetUnicodeCategory(value);
      return IsCategorySymbol(category);
  }
  /// <summary>
  /// Returns <see langword="true"/> if <paramref name="value"/> is an uppercase letter: 'A' through 'Z'
  /// for ASCII input, or category <see cref="UnicodeCategory.UppercaseLetter"/> otherwise.
  /// </summary>
  public static bool IsUpper(Rune value)
  {
      return value.IsAscii
          ? UnicodeUtility.IsInRangeInclusive(value._value, 'A', 'Z')
          : GetUnicodeCategoryNonAscii(value) == UnicodeCategory.UppercaseLetter;
  }
  /// <summary>
  /// Returns <see langword="true"/> if <paramref name="value"/> is a white-space character.
  /// ASCII input is answered from the ASCII lookup table; other input is white space when it is
  /// U+0085 or belongs to a separator category.
  /// </summary>
  public static bool IsWhiteSpace(Rune value)
  {
      if (value.IsAscii)
      {
          return (AsciiCharInfo[value.Value] & IsWhiteSpaceFlag) != 0;
      }

      // U+0085 is white space even though it sits in the Control category rather than a
      // separator category. It is the only non-ASCII code point with that mismatch, so it
      // is checked explicitly before the category test.
      return value._value == 0x0085u
          || IsCategorySeparator(GetUnicodeCategoryNonAscii(value));
  }
  /// <summary>
  /// Converts <paramref name="value"/> to lowercase using the casing rules of <paramref name="culture"/>.
  /// </summary>
  /// <remarks>
  /// Throws if <paramref name="culture"/> is null. When globalization is in invariant mode the
  /// result is that of <see cref="ToLowerInvariant"/>.
  /// </remarks>
  public static Rune ToLower(Rune value, CultureInfo culture)
  {
      if (culture is null)
      {
          ThrowHelper.ThrowArgumentNullException(ExceptionArgument.culture);
      }

      // No ASCII fast path here: the requested culture may case ASCII characters differently
      // from the invariant culture (e.g., Turkish I). Go straight to the globalization tables
      // whenever they are available.
      return GlobalizationMode.Invariant
          ? ToLowerInvariant(value)
          : ChangeCaseCultureAware(value, culture!.TextInfo, toUpper: false);
  }
  /// <summary>
  /// Converts <paramref name="value"/> to lowercase using the casing rules of the invariant culture.
  /// </summary>
  public static Rune ToLowerInvariant(Rune value)
  {
      // ASCII is the most common input and usually mixes lowercase and uppercase,
      // so it gets a branchless conversion up front.
      if (value.IsAscii)
      {
          // Using the UTF-16 conversion utility is fine: the high 16 bits of the value
          // are never set, so they pass through unchanged.
          uint lowered = Utf16Utility.ConvertAllAsciiCharsInUInt32ToLowercase(value._value);
          return UnsafeCreate(lowered);
      }

      // Non-ASCII input: without globalization tables case changing has no effect;
      // otherwise consult the case folding tables.
      return GlobalizationMode.Invariant
          ? value
          : ChangeCaseCultureAware(value, TextInfo.Invariant, toUpper: false);
  }
  /// <summary>
  /// Converts <paramref name="value"/> to uppercase using the casing rules of <paramref name="culture"/>.
  /// </summary>
  /// <remarks>
  /// Throws if <paramref name="culture"/> is null. When globalization is in invariant mode the
  /// result is that of <see cref="ToUpperInvariant"/>.
  /// </remarks>
  public static Rune ToUpper(Rune value, CultureInfo culture)
  {
      if (culture is null)
      {
          ThrowHelper.ThrowArgumentNullException(ExceptionArgument.culture);
      }

      // No ASCII fast path here: the requested culture may case ASCII characters differently
      // from the invariant culture (e.g., Turkish I). Go straight to the globalization tables
      // whenever they are available.
      return GlobalizationMode.Invariant
          ? ToUpperInvariant(value)
          : ChangeCaseCultureAware(value, culture!.TextInfo, toUpper: true);
  }
  /// <summary>
  /// Converts <paramref name="value"/> to uppercase using the casing rules of the invariant culture.
  /// </summary>
  public static Rune ToUpperInvariant(Rune value)
  {
      // ASCII is the most common input and usually mixes lowercase and uppercase,
      // so it gets a branchless conversion up front.
      if (value.IsAscii)
      {
          // Using the UTF-16 conversion utility is fine: the high 16 bits of the value
          // are never set, so they pass through unchanged.
          uint raised = Utf16Utility.ConvertAllAsciiCharsInUInt32ToUppercase(value._value);
          return UnsafeCreate(raised);
      }

      // Non-ASCII input: without globalization tables case changing has no effect;
      // otherwise consult the case folding tables.
      return GlobalizationMode.Invariant
          ? value
          : ChangeCaseCultureAware(value, TextInfo.Invariant, toUpper: true);
  }
  1137. }
  1138. }