Char.cs 38 KB


  1. // Licensed to the .NET Foundation under one or more agreements.
  2. // The .NET Foundation licenses this file to you under the MIT license.
  3. // See the LICENSE file in the project root for more information.
  4. /*============================================================
  5. **
  6. **
  7. **
  8. ** Purpose: This is the value class representing a Unicode character,
  9. ** together with the static Char classification and conversion methods.
  10. **
  11. **
  12. ===========================================================*/
  13. using System.Diagnostics;
  14. using System.Globalization;
  15. using System.Runtime.InteropServices;
  16. using System.Text;
  17. namespace System
  18. {
  19. [Serializable]
  20. [StructLayout(LayoutKind.Sequential)]
  21. [System.Runtime.CompilerServices.TypeForwardedFrom("mscorlib, Version=4.0.0.0, Culture=neutral, PublicKeyToken=b77a5c561934e089")]
  22. public readonly struct Char : IComparable, IComparable<char>, IEquatable<char>, IConvertible
  23. {
  //
  // Member Variables
  //
  // The UTF-16 code unit wrapped by this struct.
  private readonly char m_value; // Do not rename (binary serialization)

  //
  // Public Constants
  //
  // Largest value a char can hold (U+FFFF).
  public const char MaxValue = '\uFFFF';
  // Smallest value a char can hold (U+0000).
  public const char MinValue = '\0';

  // Bit layout of one Latin1CharInfo entry: the low five bits carry the
  // UnicodeCategory, the three bits above them are independent flags.
  private const byte UnicodeCategoryMask = 0x1F;
  private const byte IsLowerCaseLetterFlag = 0x20;
  private const byte IsUpperCaseLetterFlag = 0x40;
  private const byte IsWhiteSpaceFlag = 0x80;
  // Per-character lookup table indexed directly by code unit for the C0, Basic Latin, C1, and
  // Latin-1 Supplement ranges [ U+0000..U+00FF ]. Each entry packs:
  // - 0x80 bit if set means 'is whitespace'
  // - 0x40 bit if set means 'is uppercase letter'
  // - 0x20 bit if set means 'is lowercase letter'
  // - bottom 5 bits are the UnicodeCategory of the character
  //
  // n.b. This data is locked to an earlier version of the Unicode standard (2.0, perhaps?), so
  // the UnicodeCategory data contained here doesn't necessarily reflect the UnicodeCategory data
  // contained within the CharUnicodeInfo or Rune types, which generally follow the latest Unicode
  // standard.
  //
  // The table length is also what IsLatin1 uses as its upper bound, so it must stay at exactly
  // 256 entries (16 rows of 16).
  private static ReadOnlySpan<byte> Latin1CharInfo => new byte[]
  {
      0x0E, 0x0E, 0x0E, 0x0E, 0x0E, 0x0E, 0x0E, 0x0E, 0x0E, 0x8E, 0x8E, 0x8E, 0x8E, 0x8E, 0x0E, 0x0E, // U+0000..U+000F
      0x0E, 0x0E, 0x0E, 0x0E, 0x0E, 0x0E, 0x0E, 0x0E, 0x0E, 0x0E, 0x0E, 0x0E, 0x0E, 0x0E, 0x0E, 0x0E, // U+0010..U+001F
      0x8B, 0x18, 0x18, 0x18, 0x1A, 0x18, 0x18, 0x18, 0x14, 0x15, 0x18, 0x19, 0x18, 0x13, 0x18, 0x18, // U+0020..U+002F
      0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x18, 0x18, 0x19, 0x19, 0x19, 0x18, // U+0030..U+003F
      0x18, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, // U+0040..U+004F
      0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x14, 0x18, 0x15, 0x1B, 0x12, // U+0050..U+005F
      0x1B, 0x21, 0x21, 0x21, 0x21, 0x21, 0x21, 0x21, 0x21, 0x21, 0x21, 0x21, 0x21, 0x21, 0x21, 0x21, // U+0060..U+006F
      0x21, 0x21, 0x21, 0x21, 0x21, 0x21, 0x21, 0x21, 0x21, 0x21, 0x21, 0x14, 0x19, 0x15, 0x19, 0x0E, // U+0070..U+007F
      0x0E, 0x0E, 0x0E, 0x0E, 0x0E, 0x8E, 0x0E, 0x0E, 0x0E, 0x0E, 0x0E, 0x0E, 0x0E, 0x0E, 0x0E, 0x0E, // U+0080..U+008F
      0x0E, 0x0E, 0x0E, 0x0E, 0x0E, 0x0E, 0x0E, 0x0E, 0x0E, 0x0E, 0x0E, 0x0E, 0x0E, 0x0E, 0x0E, 0x0E, // U+0090..U+009F
      0x8B, 0x18, 0x1A, 0x1A, 0x1A, 0x1A, 0x1C, 0x1C, 0x1B, 0x1C, 0x21, 0x16, 0x19, 0x13, 0x1C, 0x1B, // U+00A0..U+00AF
      0x1C, 0x19, 0x0A, 0x0A, 0x1B, 0x21, 0x1C, 0x18, 0x1B, 0x0A, 0x21, 0x17, 0x0A, 0x0A, 0x0A, 0x18, // U+00B0..U+00BF
      0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, // U+00C0..U+00CF
      0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x19, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x21, // U+00D0..U+00DF
      0x21, 0x21, 0x21, 0x21, 0x21, 0x21, 0x21, 0x21, 0x21, 0x21, 0x21, 0x21, 0x21, 0x21, 0x21, 0x21, // U+00E0..U+00EF
      0x21, 0x21, 0x21, 0x21, 0x21, 0x21, 0x21, 0x19, 0x21, 0x21, 0x21, 0x21, 0x21, 0x21, 0x21, 0x21, // U+00F0..U+00FF
  };
  // True when ch indexes into Latin1CharInfo, i.e. ch is at or below U+00FF
  // (ASCII plus the Latin-1 Supplement).
  private static bool IsLatin1(char ch) => (uint)ch < (uint)Latin1CharInfo.Length;
  // True when ch is at or below U+007F, i.e. ch is ASCII.
  private static bool IsAscii(char ch) => ch <= '\u007F';
  // Looks up the UnicodeCategory stored in the low five bits of the table entry for ch.
  // Callers must pass a character <= U+00FF; this is only checked in debug builds.
  private static UnicodeCategory GetLatin1UnicodeCategory(char ch)
  {
      Debug.Assert(IsLatin1(ch), "char.GetLatin1UnicodeCategory(): ch should be <= 00ff");

      int categoryBits = Latin1CharInfo[ch] & UnicodeCategoryMask;
      return (UnicodeCategory)categoryBits;
  }
  84. //
  85. // Private Constants
  86. //
  87. //
  88. // Overridden Instance Methods
  89. //
  // Hash code for a 2-byte character: the code unit repeated in both
  // the low and the high 16 bits of the result.
  public override int GetHashCode()
  {
      int codeUnit = m_value;
      return (codeUnit << 16) | codeUnit;
  }
  // Compares this instance with a (possibly boxed) object.
  // Returns true only when obj is a char holding the same code unit;
  // null and any other type yield false.
  public override bool Equals(object? obj)
  {
      // A single type pattern replaces the former "is" test plus separate unboxing cast.
      return obj is char other && m_value == other.m_value;
  }
  // Strongly typed equality: true when obj holds the same code unit as this instance.
  [System.Runtime.Versioning.NonVersionable]
  public bool Equals(char obj) => m_value == obj;
  // Compares this instance to another object and returns an integer that
  // indicates their relative order:
  //   - less than zero when this character precedes value,
  //   - zero when both hold the same code unit,
  //   - greater than zero when this character follows value.
  // null is considered to be less than any instance, so 1 is returned for it.
  // If value is not of type Char, this method throws an ArgumentException.
  public int CompareTo(object? value)
  {
      if (value == null)
      {
          return 1;
      }

      // A single type pattern replaces the former "is" test plus separate unboxing cast.
      if (value is char other)
      {
          return m_value - other.m_value;
      }

      throw new ArgumentException(SR.Arg_MustBeChar);
  }
  // Strongly typed comparison: the result is the difference between the two code units,
  // so it is negative, zero, or positive as this instance is below, equal to, or above value.
  public int CompareTo(char value) => m_value - value;
  // Overrides System.Object.ToString by delegating to the static char.ToString(char).
  public override string ToString() => char.ToString(m_value);
  // Same result as the parameterless ToString(); the provider argument is not consulted.
  public string ToString(IFormatProvider? provider) => char.ToString(m_value);
  141. //
  142. // Formatting Methods
  143. //
  /*===================================ToString===================================
  **This static method takes a character and returns its String representation.
  ==============================================================================*/
  public static string ToString(char c)
  {
      return string.CreateFromChar(c);
  }
  // Converts a one-character string to that character.
  // Throws ArgumentNullException when s is null and FormatException when
  // s does not contain exactly one character.
  public static char Parse(string s)
  {
      if (s == null)
      {
          throw new ArgumentNullException(nameof(s));
      }

      return s.Length == 1
          ? s[0]
          : throw new FormatException(SR.Format_NeedSingleChar);
  }
  // Non-throwing counterpart of Parse. Succeeds only for a non-null string of
  // length one; on failure result is set to '\0' and false is returned.
  public static bool TryParse(string? s, out char result)
  {
      if (s != null && s.Length == 1)
      {
          result = s[0];
          return true;
      }

      result = '\0';
      return false;
  }
  175. //
  176. // Static Methods
  177. //
  /*=================================ISDIGIT======================================
  ** Determines whether character c is a decimal digit.
  ** Within Latin-1 only '0'..'9' qualify; any other character must belong to the
  ** DecimalDigitNumber category.
  ==============================================================================*/
  public static bool IsDigit(char c)
  {
      return IsLatin1(c)
          ? IsInRange(c, '0', '9')
          : CharUnicodeInfo.GetUnicodeCategory(c) == UnicodeCategory.DecimalDigitNumber;
  }
  // True when min <= c <= max. Uses a single unsigned comparison: if c is below min
  // the subtraction wraps to a large unsigned value and the test fails.
  internal static bool IsInRange(char c, char min, char max)
  {
      uint offset = (uint)(c - min);
      uint width = (uint)(max - min);
      return offset <= width;
  }
  // True when min <= c <= max by enum value, using the same single unsigned
  // comparison as the char overload.
  private static bool IsInRange(UnicodeCategory c, UnicodeCategory min, UnicodeCategory max)
  {
      uint offset = (uint)(c - min);
      uint width = (uint)(max - min);
      return offset <= width;
  }
  /*=================================CheckLetter=====================================
  ** True when uc lies in the enum range UppercaseLetter..OtherLetter,
  ** i.e. it is one of the letter categories.
  ==============================================================================*/
  internal static bool CheckLetter(UnicodeCategory uc) =>
      IsInRange(uc, UnicodeCategory.UppercaseLetter, UnicodeCategory.OtherLetter);
  /*=================================ISLETTER=====================================
  ** Determines whether character c is a letter.
  ==============================================================================*/
  public static bool IsLetter(char c)
  {
      if (!IsLatin1(c))
      {
          return CheckLetter(CharUnicodeInfo.GetUnicodeCategory(c));
      }

      // For the version of the Unicode standard the Char type is locked to, the
      // Latin-1 range doesn't include letters in categories other than "upper" and
      // "lower", so testing the two case flags is sufficient.
      const int LetterFlags = IsUpperCaseLetterFlag | IsLowerCaseLetterFlag;
      return (Latin1CharInfo[c] & LetterFlags) != 0;
  }
  // Tests the white-space flag of the table entry for c.
  // Callers must pass a character <= U+00FF; this is only checked in debug builds.
  private static bool IsWhiteSpaceLatin1(char c)
  {
      Debug.Assert(IsLatin1(c));

      byte info = Latin1CharInfo[c];
      return (info & IsWhiteSpaceFlag) != 0;
  }
  /*===============================ISWHITESPACE===================================
  ** Determines whether character c is white space.
  ** Latin-1 characters are answered from the lookup table; any other character
  ** counts as white space when its category is one of the separator categories.
  ==============================================================================*/
  public static bool IsWhiteSpace(char c)
  {
      return IsLatin1(c)
          ? IsWhiteSpaceLatin1(c)
          : CheckSeparator(CharUnicodeInfo.GetUnicodeCategory(c));
  }
  /*===================================IsUpper====================================
  **Arguments: c -- the character to be checked.
  **Returns: True if c is an uppercase letter.
  ==============================================================================*/
  public static bool IsUpper(char c)
  {
      if (!IsLatin1(c))
      {
          return CharUnicodeInfo.GetUnicodeCategory(c) == UnicodeCategory.UppercaseLetter;
      }

      // Latin-1: the answer is precomputed as a flag bit in the table.
      return (Latin1CharInfo[c] & IsUpperCaseLetterFlag) != 0;
  }
  /*===================================IsLower====================================
  **Arguments: c -- the character to be checked.
  **Returns: True if c is a lowercase letter.
  ==============================================================================*/
  public static bool IsLower(char c)
  {
      if (!IsLatin1(c))
      {
          return CharUnicodeInfo.GetUnicodeCategory(c) == UnicodeCategory.LowercaseLetter;
      }

      // Latin-1: the answer is precomputed as a flag bit in the table.
      return (Latin1CharInfo[c] & IsLowerCaseLetterFlag) != 0;
  }
  // True when uc lies in the enum range ConnectorPunctuation..OtherPunctuation,
  // i.e. it is one of the punctuation categories.
  internal static bool CheckPunctuation(UnicodeCategory uc) =>
      IsInRange(uc, UnicodeCategory.ConnectorPunctuation, UnicodeCategory.OtherPunctuation);
  /*================================IsPunctuation=================================
  **Arguments: c -- the character to be checked.
  **Returns: True if c is a punctuation mark.
  ==============================================================================*/
  public static bool IsPunctuation(char c)
  {
      // Latin-1 categories come from the local table, everything else from CharUnicodeInfo.
      UnicodeCategory category = IsLatin1(c)
          ? GetLatin1UnicodeCategory(c)
          : CharUnicodeInfo.GetUnicodeCategory(c);
      return CheckPunctuation(category);
  }
  /*=================================CheckLetterOrDigit=====================================
  ** True when uc is one of the letter categories or is DecimalDigitNumber.
  ==============================================================================*/
  internal static bool CheckLetterOrDigit(UnicodeCategory uc) =>
      CheckLetter(uc) || uc == UnicodeCategory.DecimalDigitNumber;
  // Determines whether a character is a letter or a decimal digit.
  public static bool IsLetterOrDigit(char c)
  {
      // Latin-1 categories come from the local table, everything else from CharUnicodeInfo.
      UnicodeCategory category = IsLatin1(c)
          ? GetLatin1UnicodeCategory(c)
          : CharUnicodeInfo.GetUnicodeCategory(c);
      return CheckLetterOrDigit(category);
  }
  /*===================================ToUpper====================================
  ** Converts a character to upper-case using the TextInfo of the specified culture.
  ** Throws ArgumentNullException when culture is null.
  ==============================================================================*/
  public static char ToUpper(char c, CultureInfo culture)
  {
      if (culture == null)
      {
          throw new ArgumentNullException(nameof(culture));
      }

      TextInfo casing = culture.TextInfo;
      return casing.ToUpper(c);
  }
  /*=================================TOUPPER======================================
  ** Converts character c to upper-case using the TextInfo of the current culture.
  ==============================================================================*/
  public static char ToUpper(char c) => CultureInfo.CurrentCulture.TextInfo.ToUpper(c);
  // Converts a character to upper-case using the TextInfo of the invariant culture.
  public static char ToUpperInvariant(char c) => CultureInfo.InvariantCulture.TextInfo.ToUpper(c);
  /*===================================ToLower====================================
  ** Converts a character to lower-case using the TextInfo of the specified culture.
  ** Throws ArgumentNullException when culture is null.
  ==============================================================================*/
  public static char ToLower(char c, CultureInfo culture)
  {
      if (culture == null)
      {
          throw new ArgumentNullException(nameof(culture));
      }

      TextInfo casing = culture.TextInfo;
      return casing.ToLower(c);
  }
  /*=================================TOLOWER======================================
  ** Converts character c to lower-case using the TextInfo of the current culture.
  ==============================================================================*/
  public static char ToLower(char c) => CultureInfo.CurrentCulture.TextInfo.ToLower(c);
  // Converts a character to lower-case using the TextInfo of the invariant culture.
  public static char ToLowerInvariant(char c) => CultureInfo.InvariantCulture.TextInfo.ToLower(c);
  345. //
  346. // IConvertible implementation
  347. //
  // Identifies this value type to IConvertible consumers; always TypeCode.Char.
  public TypeCode GetTypeCode() => TypeCode.Char;
  // Char offers no conversion to Boolean: always throws InvalidCastException.
  bool IConvertible.ToBoolean(IFormatProvider? provider) =>
      throw new InvalidCastException(SR.Format(SR.InvalidCast_FromTo, "Char", "Boolean"));
  // Identity conversion: returns the wrapped code unit; provider is not used.
  char IConvertible.ToChar(IFormatProvider? provider) => m_value;
  // Delegates to Convert.ToSByte for the wrapped character; provider is not used.
  sbyte IConvertible.ToSByte(IFormatProvider? provider) => Convert.ToSByte(m_value);
  // Delegates to Convert.ToByte for the wrapped character; provider is not used.
  byte IConvertible.ToByte(IFormatProvider? provider) => Convert.ToByte(m_value);
  // Delegates to Convert.ToInt16 for the wrapped character; provider is not used.
  short IConvertible.ToInt16(IFormatProvider? provider) => Convert.ToInt16(m_value);
  // Delegates to Convert.ToUInt16 for the wrapped character; provider is not used.
  ushort IConvertible.ToUInt16(IFormatProvider? provider) => Convert.ToUInt16(m_value);
  // Delegates to Convert.ToInt32 for the wrapped character; provider is not used.
  int IConvertible.ToInt32(IFormatProvider? provider) => Convert.ToInt32(m_value);
  // Delegates to Convert.ToUInt32 for the wrapped character; provider is not used.
  uint IConvertible.ToUInt32(IFormatProvider? provider) => Convert.ToUInt32(m_value);
  // Delegates to Convert.ToInt64 for the wrapped character; provider is not used.
  long IConvertible.ToInt64(IFormatProvider? provider) => Convert.ToInt64(m_value);
  // Delegates to Convert.ToUInt64 for the wrapped character; provider is not used.
  ulong IConvertible.ToUInt64(IFormatProvider? provider) => Convert.ToUInt64(m_value);
  // Char offers no conversion to Single: always throws InvalidCastException.
  float IConvertible.ToSingle(IFormatProvider? provider) =>
      throw new InvalidCastException(SR.Format(SR.InvalidCast_FromTo, "Char", "Single"));
  // Char offers no conversion to Double: always throws InvalidCastException.
  double IConvertible.ToDouble(IFormatProvider? provider) =>
      throw new InvalidCastException(SR.Format(SR.InvalidCast_FromTo, "Char", "Double"));
  // Char offers no conversion to Decimal: always throws InvalidCastException.
  decimal IConvertible.ToDecimal(IFormatProvider? provider) =>
      throw new InvalidCastException(SR.Format(SR.InvalidCast_FromTo, "Char", "Decimal"));
  // Char offers no conversion to DateTime: always throws InvalidCastException.
  DateTime IConvertible.ToDateTime(IFormatProvider? provider) =>
      throw new InvalidCastException(SR.Format(SR.InvalidCast_FromTo, "Char", "DateTime"));
  // General conversion entry point: hands this value, the target type and the
  // provider to Convert.DefaultToType.
  object IConvertible.ToType(Type type, IFormatProvider? provider) =>
      Convert.DefaultToType((IConvertible)this, type, provider);
  // Determines whether a character belongs to the Control category.
  public static bool IsControl(char c)
  {
      // Latin-1 categories come from the local table, everything else from CharUnicodeInfo.
      UnicodeCategory category = IsLatin1(c)
          ? GetLatin1UnicodeCategory(c)
          : CharUnicodeInfo.GetUnicodeCategory(c);
      return category == UnicodeCategory.Control;
  }
  // Determines whether the character at s[index] belongs to the Control category.
  // Throws ArgumentNullException when s is null and ArgumentOutOfRangeException
  // when index is not a valid position in s.
  public static bool IsControl(string s, int index)
  {
      if (s == null)
      {
          throw new ArgumentNullException(nameof(s));
      }

      // One unsigned comparison rejects both negative and too-large indices.
      if ((uint)index >= (uint)s.Length)
      {
          throw new ArgumentOutOfRangeException(nameof(index));
      }

      char ch = s[index];
      UnicodeCategory category = IsLatin1(ch)
          ? GetLatin1UnicodeCategory(ch)
          : CharUnicodeInfo.GetUnicodeCategory(s, index);
      return category == UnicodeCategory.Control;
  }
  // Determines whether the character at s[index] is a decimal digit.
  // Throws ArgumentNullException when s is null and ArgumentOutOfRangeException
  // when index is not a valid position in s.
  public static bool IsDigit(string s, int index)
  {
      if (s == null)
      {
          throw new ArgumentNullException(nameof(s));
      }

      // One unsigned comparison rejects both negative and too-large indices.
      if ((uint)index >= (uint)s.Length)
      {
          throw new ArgumentOutOfRangeException(nameof(index));
      }

      char ch = s[index];
      return IsLatin1(ch)
          ? IsInRange(ch, '0', '9')
          : CharUnicodeInfo.GetUnicodeCategory(s, index) == UnicodeCategory.DecimalDigitNumber;
  }
  // Determines whether the character at s[index] is a letter.
  // Throws ArgumentNullException when s is null and ArgumentOutOfRangeException
  // when index is not a valid position in s.
  public static bool IsLetter(string s, int index)
  {
      if (s == null)
      {
          throw new ArgumentNullException(nameof(s));
      }

      // One unsigned comparison rejects both negative and too-large indices.
      if ((uint)index >= (uint)s.Length)
      {
          throw new ArgumentOutOfRangeException(nameof(index));
      }

      char ch = s[index];
      if (!IsLatin1(ch))
      {
          return CheckLetter(CharUnicodeInfo.GetUnicodeCategory(s, index));
      }

      // The Latin-1 range doesn't include letters in categories other than "upper" and "lower"
      const int LetterFlags = IsUpperCaseLetterFlag | IsLowerCaseLetterFlag;
      return (Latin1CharInfo[ch] & LetterFlags) != 0;
  }
  // Determines whether the character at s[index] is a letter or a decimal digit.
  // Throws ArgumentNullException when s is null and ArgumentOutOfRangeException
  // when index is not a valid position in s.
  public static bool IsLetterOrDigit(string s, int index)
  {
      if (s == null)
      {
          throw new ArgumentNullException(nameof(s));
      }

      // One unsigned comparison rejects both negative and too-large indices.
      if ((uint)index >= (uint)s.Length)
      {
          throw new ArgumentOutOfRangeException(nameof(index));
      }

      char ch = s[index];
      UnicodeCategory category = IsLatin1(ch)
          ? GetLatin1UnicodeCategory(ch)
          : CharUnicodeInfo.GetUnicodeCategory(s, index);
      return CheckLetterOrDigit(category);
  }
  // Determines whether the character at s[index] is a lowercase letter.
  // Throws ArgumentNullException when s is null and ArgumentOutOfRangeException
  // when index is not a valid position in s.
  public static bool IsLower(string s, int index)
  {
      if (s == null)
      {
          throw new ArgumentNullException(nameof(s));
      }

      // One unsigned comparison rejects both negative and too-large indices.
      if ((uint)index >= (uint)s.Length)
      {
          throw new ArgumentOutOfRangeException(nameof(index));
      }

      char ch = s[index];
      if (!IsLatin1(ch))
      {
          return CharUnicodeInfo.GetUnicodeCategory(s, index) == UnicodeCategory.LowercaseLetter;
      }

      return (Latin1CharInfo[ch] & IsLowerCaseLetterFlag) != 0;
  }
  /*=================================CheckNumber=====================================
  ** True when uc lies in the enum range DecimalDigitNumber..OtherNumber,
  ** i.e. it is one of the number categories.
  ==============================================================================*/
  internal static bool CheckNumber(UnicodeCategory uc) =>
      IsInRange(uc, UnicodeCategory.DecimalDigitNumber, UnicodeCategory.OtherNumber);
  // Determines whether a character is a number (any of the number categories,
  // not only decimal digits).
  public static bool IsNumber(char c)
  {
      // ASCII is a subset of Latin-1, and within ASCII only '0'..'9' are numbers.
      if (IsAscii(c))
      {
          return IsInRange(c, '0', '9');
      }

      UnicodeCategory category = IsLatin1(c)
          ? GetLatin1UnicodeCategory(c)
          : CharUnicodeInfo.GetUnicodeCategory(c);
      return CheckNumber(category);
  }
  // Determines whether the character at s[index] is a number (any of the number
  // categories, not only decimal digits).
  // Throws ArgumentNullException when s is null and ArgumentOutOfRangeException
  // when index is not a valid position in s.
  public static bool IsNumber(string s, int index)
  {
      if (s == null)
      {
          throw new ArgumentNullException(nameof(s));
      }

      // One unsigned comparison rejects both negative and too-large indices.
      if ((uint)index >= (uint)s.Length)
      {
          throw new ArgumentOutOfRangeException(nameof(index));
      }

      char ch = s[index];

      // ASCII is a subset of Latin-1, and within ASCII only '0'..'9' are numbers.
      if (IsAscii(ch))
      {
          return IsInRange(ch, '0', '9');
      }

      UnicodeCategory category = IsLatin1(ch)
          ? GetLatin1UnicodeCategory(ch)
          : CharUnicodeInfo.GetUnicodeCategory(s, index);
      return CheckNumber(category);
  }
  ////////////////////////////////////////////////////////////////////////
  //
  // IsPunctuation
  //
  // Determines whether the character at s[index] is a punctuation character.
  // Throws ArgumentNullException when s is null and ArgumentOutOfRangeException
  // when index is not a valid position in s.
  //
  ////////////////////////////////////////////////////////////////////////
  public static bool IsPunctuation(string s, int index)
  {
      if (s == null)
      {
          throw new ArgumentNullException(nameof(s));
      }

      // One unsigned comparison rejects both negative and too-large indices.
      if ((uint)index >= (uint)s.Length)
      {
          throw new ArgumentOutOfRangeException(nameof(index));
      }

      char ch = s[index];
      UnicodeCategory category = IsLatin1(ch)
          ? GetLatin1UnicodeCategory(ch)
          : CharUnicodeInfo.GetUnicodeCategory(s, index);
      return CheckPunctuation(category);
  }
  /*================================= CheckSeparator ============================
  ** True when uc lies in the enum range SpaceSeparator..ParagraphSeparator,
  ** i.e. it is one of the separator categories.
  ==============================================================================*/
  internal static bool CheckSeparator(UnicodeCategory uc) =>
      IsInRange(uc, UnicodeCategory.SpaceSeparator, UnicodeCategory.ParagraphSeparator);
  // Separator test for the Latin-1 range: only SPACE (U+0020) and NO-BREAK SPACE (U+00A0)
  // qualify. There is no LineSeparator or ParagraphSeparator in the Latin-1 range.
  private static bool IsSeparatorLatin1(char c) => c == ' ' || c == '\u00A0';
  // Determines whether a character is a separator (space, line or paragraph separator).
  public static bool IsSeparator(char c)
  {
      return IsLatin1(c)
          ? IsSeparatorLatin1(c)
          : CheckSeparator(CharUnicodeInfo.GetUnicodeCategory(c));
  }
  // Determines whether the character at s[index] is a separator.
  // Throws ArgumentNullException when s is null and ArgumentOutOfRangeException
  // when index is not a valid position in s.
  public static bool IsSeparator(string s, int index)
  {
      if (s == null)
      {
          throw new ArgumentNullException(nameof(s));
      }

      // One unsigned comparison rejects both negative and too-large indices.
      if ((uint)index >= (uint)s.Length)
      {
          throw new ArgumentOutOfRangeException(nameof(index));
      }

      char ch = s[index];
      return IsLatin1(ch)
          ? IsSeparatorLatin1(ch)
          : CheckSeparator(CharUnicodeInfo.GetUnicodeCategory(s, index));
  }
  // True when c is any surrogate code unit, i.e. it lies between the start of the
  // high-surrogate range and the end of the low-surrogate range.
  public static bool IsSurrogate(char c) =>
      IsInRange(c, CharUnicodeInfo.HIGH_SURROGATE_START, CharUnicodeInfo.LOW_SURROGATE_END);
  // True when the code unit at s[index] is a surrogate.
  // Throws ArgumentNullException when s is null and ArgumentOutOfRangeException
  // when index is not a valid position in s.
  public static bool IsSurrogate(string s, int index)
  {
      if (s == null)
      {
          throw new ArgumentNullException(nameof(s));
      }

      // One unsigned comparison rejects both negative and too-large indices.
      if ((uint)index >= (uint)s.Length)
      {
          throw new ArgumentOutOfRangeException(nameof(index));
      }

      char ch = s[index];
      return IsSurrogate(ch);
  }
  /*================================= CheckSymbol ============================
  ** True when uc lies in the enum range MathSymbol..OtherSymbol,
  ** i.e. it is one of the symbol categories.
  ==============================================================================*/
  internal static bool CheckSymbol(UnicodeCategory uc) =>
      IsInRange(uc, UnicodeCategory.MathSymbol, UnicodeCategory.OtherSymbol);
  // Determines whether a character belongs to one of the symbol categories.
  public static bool IsSymbol(char c)
  {
      // Latin-1 categories come from the local table, everything else from CharUnicodeInfo.
      UnicodeCategory category = IsLatin1(c)
          ? GetLatin1UnicodeCategory(c)
          : CharUnicodeInfo.GetUnicodeCategory(c);
      return CheckSymbol(category);
  }
  // Determines whether the character at s[index] belongs to one of the symbol categories.
  // Throws ArgumentNullException when s is null and ArgumentOutOfRangeException
  // when index is not a valid position in s.
  public static bool IsSymbol(string s, int index)
  {
      if (s == null)
      {
          throw new ArgumentNullException(nameof(s));
      }

      // One unsigned comparison rejects both negative and too-large indices.
      if ((uint)index >= (uint)s.Length)
      {
          throw new ArgumentOutOfRangeException(nameof(index));
      }

      char ch = s[index];
      UnicodeCategory category = IsLatin1(ch)
          ? GetLatin1UnicodeCategory(ch)
          : CharUnicodeInfo.GetUnicodeCategory(s, index);
      return CheckSymbol(category);
  }
  // Determines whether the character at s[index] is an uppercase letter.
  // Throws ArgumentNullException when s is null and ArgumentOutOfRangeException
  // when index is not a valid position in s.
  public static bool IsUpper(string s, int index)
  {
      if (s == null)
      {
          throw new ArgumentNullException(nameof(s));
      }

      // One unsigned comparison rejects both negative and too-large indices.
      if ((uint)index >= (uint)s.Length)
      {
          throw new ArgumentOutOfRangeException(nameof(index));
      }

      char ch = s[index];
      if (!IsLatin1(ch))
      {
          return CharUnicodeInfo.GetUnicodeCategory(s, index) == UnicodeCategory.UppercaseLetter;
      }

      return (Latin1CharInfo[ch] & IsUpperCaseLetterFlag) != 0;
  }
  // Determines whether the character at s[index] is white space.
  // Throws ArgumentNullException when s is null and ArgumentOutOfRangeException
  // when index is not a valid position in s.
  public static bool IsWhiteSpace(string s, int index)
  {
      if (s == null)
      {
          throw new ArgumentNullException(nameof(s));
      }

      // One unsigned comparison rejects both negative and too-large indices.
      if ((uint)index >= (uint)s.Length)
      {
          throw new ArgumentOutOfRangeException(nameof(index));
      }

      char current = s[index];
      return IsLatin1(current)
          ? IsWhiteSpaceLatin1(current)
          : CheckSeparator(CharUnicodeInfo.GetUnicodeCategory(s, index));
  }
  // Returns the UnicodeCategory of c: from the local Latin-1 table for characters
  // up to U+00FF, otherwise from CharUnicodeInfo.
  public static UnicodeCategory GetUnicodeCategory(char c)
  {
      return IsLatin1(c)
          ? GetLatin1UnicodeCategory(c)
          : CharUnicodeInfo.GetUnicodeCategory((int)c);
  }
  // Returns the UnicodeCategory of the character at s[index].
  // Throws ArgumentNullException when s is null and ArgumentOutOfRangeException
  // when index is not a valid position in s.
  public static UnicodeCategory GetUnicodeCategory(string s, int index)
  {
      if (s == null)
      {
          throw new ArgumentNullException(nameof(s));
      }

      // One unsigned comparison rejects both negative and too-large indices.
      if ((uint)index >= (uint)s.Length)
      {
          throw new ArgumentOutOfRangeException(nameof(index));
      }

      char ch = s[index];
      return IsLatin1(ch)
          ? GetLatin1UnicodeCategory(ch)
          : CharUnicodeInfo.InternalGetUnicodeCategory(s, index);
  }
  // Returns the numeric value of c as reported by CharUnicodeInfo.GetNumericValue.
  public static double GetNumericValue(char c) => CharUnicodeInfo.GetNumericValue(c);
  // Returns the numeric value of the character at s[index] as reported by
  // CharUnicodeInfo.GetNumericValue.
  // Throws ArgumentNullException when s is null and ArgumentOutOfRangeException
  // when index is not a valid position in s.
  public static double GetNumericValue(string s, int index)
  {
      if (s == null)
      {
          throw new ArgumentNullException(nameof(s));
      }

      // One unsigned comparison rejects both negative and too-large indices.
      if ((uint)index >= (uint)s.Length)
      {
          throw new ArgumentOutOfRangeException(nameof(index));
      }

      return CharUnicodeInfo.GetNumericValue(s, index);
  }
  /*================================= IsHighSurrogate ============================
  ** True when c lies in the high-surrogate range
  ** (HIGH_SURROGATE_START..HIGH_SURROGATE_END).
  ==============================================================================*/
  public static bool IsHighSurrogate(char c) =>
      IsInRange(c, CharUnicodeInfo.HIGH_SURROGATE_START, CharUnicodeInfo.HIGH_SURROGATE_END);
  // True when the code unit at s[index] is a high surrogate.
  // Throws ArgumentNullException when s is null and ArgumentOutOfRangeException
  // when index is not a valid position in s.
  public static bool IsHighSurrogate(string s, int index)
  {
      if (s == null)
      {
          throw new ArgumentNullException(nameof(s));
      }

      // Same single unsigned bounds check as the other (string, int) overloads;
      // a negative index becomes a huge unsigned value and is rejected too.
      if ((uint)index >= (uint)s.Length)
      {
          throw new ArgumentOutOfRangeException(nameof(index));
      }

      return IsHighSurrogate(s[index]);
  }
  /*================================= IsLowSurrogate ============================
  ** True when c lies in the low-surrogate range
  ** (LOW_SURROGATE_START..LOW_SURROGATE_END).
  ==============================================================================*/
  public static bool IsLowSurrogate(char c) =>
      IsInRange(c, CharUnicodeInfo.LOW_SURROGATE_START, CharUnicodeInfo.LOW_SURROGATE_END);
  // True when the code unit at s[index] is a low surrogate.
  // Throws ArgumentNullException when s is null and ArgumentOutOfRangeException
  // when index is not a valid position in s.
  public static bool IsLowSurrogate(string s, int index)
  {
      if (s == null)
      {
          throw new ArgumentNullException(nameof(s));
      }

      // Same single unsigned bounds check as the other (string, int) overloads;
      // a negative index becomes a huge unsigned value and is rejected too.
      if ((uint)index >= (uint)s.Length)
      {
          throw new ArgumentOutOfRangeException(nameof(index));
      }

      return IsLowSurrogate(s[index]);
  }
  /*================================= IsSurrogatePair ============================
  ** Check whether the two code units starting at s[index] form a surrogate pair.
  ** Returns false when index is the last position of s (there is no second unit).
  ** Throws ArgumentNullException when s is null and ArgumentOutOfRangeException
  ** when index is not a valid position in s.
  ==============================================================================*/
  public static bool IsSurrogatePair(string s, int index)
  {
      if (s == null)
      {
          throw new ArgumentNullException(nameof(s));
      }

      // Same single unsigned bounds check as the other (string, int) overloads;
      // a negative index becomes a huge unsigned value and is rejected too.
      if ((uint)index >= (uint)s.Length)
      {
          throw new ArgumentOutOfRangeException(nameof(index));
      }

      // A pair needs a second code unit after index.
      return index + 1 < s.Length && IsSurrogatePair(s[index], s[index + 1]);
  }
  // True when highSurrogate is a high surrogate and lowSurrogate is a low surrogate.
  public static bool IsSurrogatePair(char highSurrogate, char lowSurrogate)
  {
      // Both surrogate ranges are exactly 0x400 elements wide, which is a power of two.
      // Rebasing each value to the start of its own range and OR-ing the two offsets
      // therefore lets one comparison validate both: any offset outside the range
      // sets a bit above the range mask.
      uint highDelta = (uint)highSurrogate - CharUnicodeInfo.HIGH_SURROGATE_START;
      uint lowDelta = (uint)lowSurrogate - CharUnicodeInfo.LOW_SURROGATE_START;
      uint combined = highDelta | lowDelta;
      return combined <= CharUnicodeInfo.HIGH_SURROGATE_RANGE;
  }
  // The end codepoint for Unicode plane 0 (0x0000 ~ 0xffff).
  internal const int UNICODE_PLANE00_END = 0xFFFF;
  // The starting codepoint for Unicode plane 1. Plane 1 contains 0x010000 ~ 0x01ffff.
  internal const int UNICODE_PLANE01_START = 0x010000;
  // The end codepoint for Unicode plane 16. This is the maximum code point value allowed for Unicode.
  // Plane 16 contains 0x100000 ~ 0x10ffff.
  internal const int UNICODE_PLANE16_END = 0x10FFFF;
  /*================================= ConvertFromUtf32 ============================
  ** Convert a UTF32 value into its string form.
  ** Throws ArgumentOutOfRangeException when utf32 fails the
  ** UnicodeUtility.IsValidUnicodeScalar check.
  ==============================================================================*/
  public static string ConvertFromUtf32(int utf32)
  {
      uint scalar = (uint)utf32;
      if (UnicodeUtility.IsValidUnicodeScalar(scalar))
      {
          // Validation is already done, so the unchecked Rune factory is used.
          return Rune.UnsafeCreate(scalar).ToString();
      }

      throw new ArgumentOutOfRangeException(nameof(utf32), SR.ArgumentOutOfRange_InvalidUTF32);
  }
  /*=============================ConvertToUtf32===================================
  ** Convert a surrogate pair to its UTF32 value.
  ** Invalid arguments are reported through ConvertToUtf32_ThrowInvalidArgs.
  ==============================================================================*/
  public static int ConvertToUtf32(char highSurrogate, char lowSurrogate)
  {
      // Widen both inputs to 32 bits and rebase each one to the start of its
      // own surrogate range.
      uint highOffset = (uint)highSurrogate - CharUnicodeInfo.HIGH_SURROGATE_START;
      uint lowOffset = (uint)lowSurrogate - CharUnicodeInfo.LOW_SURROGATE_START;

      // Both ranges have the same length, so OR-ing the offsets lets a single
      // comparison validate the pair. On failure a helper picks the exception
      // that names the offending argument.
      if ((highOffset | lowOffset) > CharUnicodeInfo.HIGH_SURROGATE_RANGE)
      {
          ConvertToUtf32_ThrowInvalidArgs(highOffset);
      }

      int highBits = (int)highOffset << 10;
      int lowBits = lowSurrogate - CharUnicodeInfo.LOW_SURROGATE_START;

      // The 0x40 << 10 term accounts for uuuuu = wwww + 1 in the surrogate encoding.
      return highBits + lowBits + (0x40 << 10);
  }
  // Throw helper for ConvertToUtf32(char, char). It always throws
  // ArgumentOutOfRangeException: the high surrogate is blamed when its offset is
  // outside the expected range, otherwise the low surrogate is.
  [StackTraceHidden]
  private static void ConvertToUtf32_ThrowInvalidArgs(uint highSurrogateOffset)
  {
      bool highIsValid = highSurrogateOffset <= CharUnicodeInfo.HIGH_SURROGATE_RANGE;
      if (highIsValid)
      {
          // The high surrogate is fine, so the low surrogate was the problem.
          throw new ArgumentOutOfRangeException(
              paramName: "lowSurrogate",
              message: SR.ArgumentOutOfRange_InvalidLowSurrogate);
      }

      throw new ArgumentOutOfRangeException(
          paramName: "highSurrogate",
          message: SR.ArgumentOutOfRange_InvalidHighSurrogate);
  }
  /*=============================ConvertToUtf32===================================
  ** Convert a character or a surrogate pair starting at index of the specified string
  ** to UTF32 value.
  ** The char pointed to by index should start a surrogate pair or be a BMP character.
  ** Throws ArgumentNullException when s is null.
  ** Throws ArgumentOutOfRangeException when index is not a valid position in s.
  ** Throws ArgumentException if a high surrogate is not followed by a low surrogate.
  ** Throws ArgumentException if index points at a low surrogate.
  ==============================================================================*/
  public static int ConvertToUtf32(string s, int index)
  {
      if (s == null)
      {
          throw new ArgumentNullException(nameof(s));
      }

      // Same single unsigned bounds check as the other (string, int) overloads;
      // a negative index becomes a huge unsigned value and is rejected too.
      if ((uint)index >= (uint)s.Length)
      {
          throw new ArgumentOutOfRangeException(nameof(index), SR.ArgumentOutOfRange_Index);
      }

      // Offset of the first code unit from the start of the surrogate block.
      // 0..0x3ff is a high surrogate, 0x400..0x7ff a low surrogate.
      int firstOffset = (int)s[index] - CharUnicodeInfo.HIGH_SURROGATE_START;
      if (firstOffset < 0 || firstOffset > 0x7ff)
      {
          // Not a surrogate at all: the UTF32 value of a BMP character is the code unit itself.
          return (int)s[index];
      }

      if (firstOffset > 0x3ff)
      {
          // A low surrogate at the position pointed to by index.
          throw new ArgumentException(SR.Format(SR.Argument_InvalidLowSurrogate, index), nameof(s));
      }

      // A high surrogate: it must be followed by a low surrogate.
      if (index < s.Length - 1)
      {
          int secondOffset = (int)s[index + 1] - CharUnicodeInfo.LOW_SURROGATE_START;
          if (secondOffset >= 0 && secondOffset <= 0x3ff)
          {
              return (firstOffset * 0x400) + secondOffset + UNICODE_PLANE01_START;
          }
      }

      // The high surrogate is at the end of the string or is not followed by a low surrogate.
      throw new ArgumentException(SR.Format(SR.Argument_InvalidHighSurrogate, index), nameof(s));
  }
  879. }
  880. }