StringInfo.cs 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296
  1. // Licensed to the .NET Foundation under one or more agreements.
  2. // The .NET Foundation licenses this file to you under the MIT license.
  3. // See the LICENSE file in the project root for more information.
  4. using System.Diagnostics;
  5. namespace System.Globalization
  6. {
  7. /// <summary>
  8. /// This class defines behaviors specific to a writing system.
  9. /// A writing system is the collection of scripts and orthographic rules
  10. /// required to represent a language as text.
  11. /// </summary>
  12. public class StringInfo
  13. {
  14. private string _str = null!; // initialized in helper called by ctors
  15. private int[]? _indexes;
  16. public StringInfo() : this(string.Empty)
  17. {
  18. }
  19. public StringInfo(string value)
  20. {
  21. this.String = value;
  22. }
  23. public override bool Equals(object? value)
  24. {
  25. return value is StringInfo otherStringInfo
  26. && _str.Equals(otherStringInfo._str);
  27. }
  28. public override int GetHashCode() => _str.GetHashCode();
  29. /// <summary>
  30. /// Our zero-based array of index values into the string. Initialize if
  31. /// our private array is not yet, in fact, initialized.
  32. /// </summary>
  33. private int[]? Indexes
  34. {
  35. get
  36. {
  37. if (_indexes == null && String.Length > 0)
  38. {
  39. _indexes = StringInfo.ParseCombiningCharacters(String);
  40. }
  41. return _indexes;
  42. }
  43. }
  44. public string String
  45. {
  46. get => _str;
  47. set
  48. {
  49. _str = value ?? throw new ArgumentNullException(nameof(value));
  50. _indexes = null;
  51. }
  52. }
  53. public int LengthInTextElements => Indexes?.Length ?? 0;
  54. public string SubstringByTextElements(int startingTextElement)
  55. {
  56. // If the string is empty, no sense going further.
  57. if (Indexes == null)
  58. {
  59. if (startingTextElement < 0)
  60. {
  61. throw new ArgumentOutOfRangeException(nameof(startingTextElement), startingTextElement, SR.ArgumentOutOfRange_NeedPosNum);
  62. }
  63. else
  64. {
  65. throw new ArgumentOutOfRangeException(nameof(startingTextElement), startingTextElement, SR.Arg_ArgumentOutOfRangeException);
  66. }
  67. }
  68. return SubstringByTextElements(startingTextElement, Indexes.Length - startingTextElement);
  69. }
  70. public string SubstringByTextElements(int startingTextElement, int lengthInTextElements)
  71. {
  72. if (startingTextElement < 0)
  73. {
  74. throw new ArgumentOutOfRangeException(nameof(startingTextElement), startingTextElement, SR.ArgumentOutOfRange_NeedPosNum);
  75. }
  76. if (String.Length == 0 || startingTextElement >= Indexes!.Length)
  77. {
  78. throw new ArgumentOutOfRangeException(nameof(startingTextElement), startingTextElement, SR.Arg_ArgumentOutOfRangeException);
  79. }
  80. if (lengthInTextElements < 0)
  81. {
  82. throw new ArgumentOutOfRangeException(nameof(lengthInTextElements), lengthInTextElements, SR.ArgumentOutOfRange_NeedPosNum);
  83. }
  84. if (startingTextElement > Indexes.Length - lengthInTextElements)
  85. {
  86. throw new ArgumentOutOfRangeException(nameof(lengthInTextElements), lengthInTextElements, SR.Arg_ArgumentOutOfRangeException);
  87. }
  88. int start = Indexes[startingTextElement];
  89. if (startingTextElement + lengthInTextElements == Indexes.Length)
  90. {
  91. // We are at the last text element in the string and because of that
  92. // must handle the call differently.
  93. return String.Substring(start);
  94. }
  95. else
  96. {
  97. return String.Substring(start, Indexes[lengthInTextElements + startingTextElement] - start);
  98. }
  99. }
  100. public static string GetNextTextElement(string str) => GetNextTextElement(str, 0);
  101. /// <summary>
  102. /// Get the code point count of the current text element.
  103. ///
  104. /// A combining class is defined as:
  105. /// A character/surrogate that has the following Unicode category:
  106. /// * NonSpacingMark (e.g. U+0300 COMBINING GRAVE ACCENT)
  107. /// * SpacingCombiningMark (e.g. U+ 0903 DEVANGARI SIGN VISARGA)
  108. /// * EnclosingMark (e.g. U+20DD COMBINING ENCLOSING CIRCLE)
  109. ///
  110. /// In the context of GetNextTextElement() and ParseCombiningCharacters(), a text element is defined as:
  111. /// 1. If a character/surrogate is in the following category, it is a text element.
  112. /// It can NOT further combine with characters in the combinging class to form a text element.
  113. /// * one of the Unicode category in the combinging class
  114. /// * UnicodeCategory.Format
  115. /// * UnicodeCateogry.Control
  116. /// * UnicodeCategory.OtherNotAssigned
  117. /// 2. Otherwise, the character/surrogate can be combined with characters in the combinging class to form a text element.
  118. /// </summary>
  119. /// <returns>The length of the current text element</returns>
  120. internal static int GetCurrentTextElementLen(string str, int index, int len, ref UnicodeCategory ucCurrent, ref int currentCharCount)
  121. {
  122. Debug.Assert(index >= 0 && len >= 0, "StringInfo.GetCurrentTextElementLen() : index = " + index + ", len = " + len);
  123. Debug.Assert(index < len, "StringInfo.GetCurrentTextElementLen() : index = " + index + ", len = " + len);
  124. if (index + currentCharCount == len)
  125. {
  126. // This is the last character/surrogate in the string.
  127. return currentCharCount;
  128. }
  129. // Call an internal GetUnicodeCategory, which will tell us both the unicode category, and also tell us if it is a surrogate pair or not.
  130. int nextCharCount;
  131. UnicodeCategory ucNext = CharUnicodeInfo.InternalGetUnicodeCategory(str, index + currentCharCount, out nextCharCount);
  132. if (CharUnicodeInfo.IsCombiningCategory(ucNext))
  133. {
  134. // The next element is a combining class.
  135. // Check if the current text element to see if it is a valid base category (i.e. it should not be a combining category,
  136. // not a format character, and not a control character).
  137. if (CharUnicodeInfo.IsCombiningCategory(ucCurrent)
  138. || (ucCurrent == UnicodeCategory.Format)
  139. || (ucCurrent == UnicodeCategory.Control)
  140. || (ucCurrent == UnicodeCategory.OtherNotAssigned)
  141. || (ucCurrent == UnicodeCategory.Surrogate)) // An unpair high surrogate or low surrogate
  142. {
  143. // Will fall thru and return the currentCharCount
  144. }
  145. else
  146. {
  147. // Remember the current index.
  148. int startIndex = index;
  149. // We have a valid base characters, and we have a character (or surrogate) that is combining.
  150. // Check if there are more combining characters to follow.
  151. // Check if the next character is a nonspacing character.
  152. index += currentCharCount + nextCharCount;
  153. while (index < len)
  154. {
  155. ucNext = CharUnicodeInfo.InternalGetUnicodeCategory(str, index, out nextCharCount);
  156. if (!CharUnicodeInfo.IsCombiningCategory(ucNext))
  157. {
  158. ucCurrent = ucNext;
  159. currentCharCount = nextCharCount;
  160. break;
  161. }
  162. index += nextCharCount;
  163. }
  164. return index - startIndex;
  165. }
  166. }
  167. // The return value will be the currentCharCount.
  168. int ret = currentCharCount;
  169. ucCurrent = ucNext;
  170. // Update currentCharCount.
  171. currentCharCount = nextCharCount;
  172. return ret;
  173. }
  174. /// <summary>
  175. /// Returns the str containing the next text element in str starting at
  176. /// index index. If index is not supplied, then it will start at the beginning
  177. /// of str. It recognizes a base character plus one or more combining
  178. /// characters or a properly formed surrogate pair as a text element.
  179. /// See also the ParseCombiningCharacters() and the ParseSurrogates() methods.
  180. /// </summary>
  181. public static string GetNextTextElement(string str, int index)
  182. {
  183. if (str == null)
  184. {
  185. throw new ArgumentNullException(nameof(str));
  186. }
  187. int len = str.Length;
  188. if (index < 0 || index >= len)
  189. {
  190. if (index == len)
  191. {
  192. return string.Empty;
  193. }
  194. throw new ArgumentOutOfRangeException(nameof(index), index, SR.ArgumentOutOfRange_Index);
  195. }
  196. int charLen;
  197. UnicodeCategory uc = CharUnicodeInfo.InternalGetUnicodeCategory(str, index, out charLen);
  198. return str.Substring(index, GetCurrentTextElementLen(str, index, len, ref uc, ref charLen));
  199. }
  200. public static TextElementEnumerator GetTextElementEnumerator(string str)
  201. {
  202. return GetTextElementEnumerator(str, 0);
  203. }
  204. public static TextElementEnumerator GetTextElementEnumerator(string str, int index)
  205. {
  206. if (str == null)
  207. {
  208. throw new ArgumentNullException(nameof(str));
  209. }
  210. int len = str.Length;
  211. if (index < 0 || index > len)
  212. {
  213. throw new ArgumentOutOfRangeException(nameof(index), index, SR.ArgumentOutOfRange_Index);
  214. }
  215. return new TextElementEnumerator(str, index, len);
  216. }
  217. /// <summary>
  218. /// Returns the indices of each base character or properly formed surrogate
  219. /// pair within the str. It recognizes a base character plus one or more
  220. /// combining characters or a properly formed surrogate pair as a text
  221. /// element and returns the index of the base character or high surrogate.
  222. /// Each index is the beginning of a text element within a str. The length
  223. /// of each element is easily computed as the difference between successive
  224. /// indices. The length of the array will always be less than or equal to
  225. /// the length of the str. For example, given the str
  226. /// \u4f00\u302a\ud800\udc00\u4f01, this method would return the indices:
  227. /// 0, 2, 4.
  228. /// </summary>
  229. public static int[] ParseCombiningCharacters(string str)
  230. {
  231. if (str == null)
  232. {
  233. throw new ArgumentNullException(nameof(str));
  234. }
  235. int len = str.Length;
  236. int[] result = new int[len];
  237. if (len == 0)
  238. {
  239. return (result);
  240. }
  241. int resultCount = 0;
  242. int i = 0;
  243. int currentCharLen;
  244. UnicodeCategory currentCategory = CharUnicodeInfo.InternalGetUnicodeCategory(str, 0, out currentCharLen);
  245. while (i < len)
  246. {
  247. result[resultCount++] = i;
  248. i += GetCurrentTextElementLen(str, i, len, ref currentCategory, ref currentCharLen);
  249. }
  250. if (resultCount < len)
  251. {
  252. int[] returnArray = new int[resultCount];
  253. Array.Copy(result, 0, returnArray, 0, resultCount);
  254. return (returnArray);
  255. }
  256. return result;
  257. }
  258. }
  259. }