TextInfo.cs 34 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870
  1. // Licensed to the .NET Foundation under one or more agreements.
  2. // The .NET Foundation licenses this file to you under the MIT license.
  3. // See the LICENSE file in the project root for more information.
  4. using System.Diagnostics;
  5. using System.Runtime.CompilerServices;
  6. using System.Runtime.InteropServices;
  7. using System.Runtime.Serialization;
  8. using System.Text;
  9. using System.Text.Unicode;
  10. using Internal.Runtime.CompilerServices;
  11. #pragma warning disable SA1121 // explicitly using type aliases instead of built-in types
  12. #if BIT64
  13. using nuint = System.UInt64;
  14. #else // BIT64
  15. using nuint = System.UInt32;
  16. #endif // BIT64
  17. namespace System.Globalization
  18. {
  19. /// <summary>
  20. /// This class defines behaviors specific to a writing system.
  21. /// A writing system is the collection of scripts and orthographic rules
  22. /// required to represent a language as text.
  23. /// </summary>
  24. public partial class TextInfo : ICloneable, IDeserializationCallback
  25. {
  // Three-state flag for a lazily computed boolean; the zero value means "not computed yet".
  private enum Tristate : byte
  {
      NotInitialized,
      False,
      True
  }
  // Cached list separator; stays null until first read or until explicitly assigned.
  private string? _listSeparator;
  private bool _isReadOnly;

  private readonly string _cultureName;
  private readonly CultureData _cultureData;

  // Name of the text info we're using (i.e. _cultureData.TextInfoName)
  private readonly string _textInfoName;

  // Filled in on first use by IsAsciiCasingSameAsInvariant.
  private Tristate _isAsciiCasingSameAsInvariant = Tristate.NotInitialized;

  // Invariant text info, created on first access.
  private static volatile TextInfo? s_invariant;
  internal static TextInfo Invariant => s_invariant ??= new TextInfo(CultureData.Invariant);
  /// <summary>Creates a TextInfo backed by the supplied culture data.</summary>
  internal TextInfo(CultureData cultureData)
  {
      // The culture data is the primary data source; the two names are cached from it.
      _cultureData = cultureData;
      _cultureName = cultureData.CultureName;
      _textInfoName = cultureData.TextInfoName;

      FinishInitialization();
  }
  /// <summary>Deserialization callbacks are not supported; this always throws.</summary>
  /// <exception cref="PlatformNotSupportedException">Always.</exception>
  void IDeserializationCallback.OnDeserialization(object? sender) =>
      throw new PlatformNotSupportedException();
  /// <summary>ANSI code page reported by the culture data.</summary>
  public virtual int ANSICodePage
  {
      get { return _cultureData.ANSICodePage; }
  }

  /// <summary>OEM code page reported by the culture data.</summary>
  public virtual int OEMCodePage
  {
      get { return _cultureData.OEMCodePage; }
  }

  /// <summary>Macintosh code page reported by the culture data.</summary>
  public virtual int MacCodePage
  {
      get { return _cultureData.MacCodePage; }
  }

  /// <summary>EBCDIC code page reported by the culture data.</summary>
  public virtual int EBCDICCodePage
  {
      get { return _cultureData.EBCDICCodePage; }
  }

  /// <summary>LCID of the culture looked up by the text info name.</summary>
  public int LCID
  {
      get { return CultureInfo.GetCultureInfo(_textInfoName).LCID; }
  }

  /// <summary>Returns the text info name.</summary>
  public string CultureName
  {
      get { return _textInfoName; }
  }

  /// <summary>True when this instance rejects modifications.</summary>
  public bool IsReadOnly
  {
      get { return _isReadOnly; }
  }
  /// <summary>Creates a shallow copy of this instance; the copy is always writable.</summary>
  public virtual object Clone()
  {
      TextInfo copy = (TextInfo)MemberwiseClone();
      copy.SetReadOnlyState(false);
      return copy;
  }
  /// <summary>
  /// Returns <paramref name="textInfo"/> itself when it is already read-only;
  /// otherwise returns a shallow copy that has been marked read-only.
  /// </summary>
  /// <exception cref="ArgumentNullException"><paramref name="textInfo"/> is null.</exception>
  public static TextInfo ReadOnly(TextInfo textInfo)
  {
      if (textInfo is null)
      {
          throw new ArgumentNullException(nameof(textInfo));
      }

      if (!textInfo.IsReadOnly)
      {
          TextInfo readOnlyCopy = (TextInfo)textInfo.MemberwiseClone();
          readOnlyCopy.SetReadOnlyState(true);
          return readOnlyCopy;
      }

      return textInfo;
  }
  /// <summary>Throws when this instance has been marked read-only.</summary>
  /// <exception cref="InvalidOperationException">The instance is read-only.</exception>
  private void VerifyWritable()
  {
      if (!_isReadOnly)
      {
          return;
      }

      throw new InvalidOperationException(SR.InvalidOperation_ReadOnly);
  }
  /// <summary>Sets the read-only flag of this instance.</summary>
  internal void SetReadOnlyState(bool readOnly) => _isReadOnly = readOnly;
  /// <summary>
  /// Gets or sets the string used to separate items in a list. Until a value is
  /// assigned, the getter takes it from the culture data and caches it.
  /// </summary>
  /// <exception cref="ArgumentNullException">The assigned value is null.</exception>
  /// <exception cref="InvalidOperationException">The instance is read-only.</exception>
  public virtual string ListSeparator
  {
      get
      {
          string? separator = _listSeparator;
          if (separator == null)
          {
              separator = _cultureData.ListSeparator;
              _listSeparator = separator;
          }

          return separator;
      }
      set
      {
          // The null check deliberately comes before the writability check.
          if (value is null)
          {
              throw new ArgumentNullException(nameof(value));
          }

          VerifyWritable();
          _listSeparator = value;
      }
  }
  /// <summary>
  /// Converts the character to lower case. The ASCII-only conversion is used in
  /// globalization-invariant mode, or when the character is ASCII and this culture
  /// cases ASCII the same way the invariant culture does; otherwise the
  /// culture-aware conversion is used.
  /// </summary>
  public virtual char ToLower(char c)
  {
      bool useAsciiPath = GlobalizationMode.Invariant || (IsAscii(c) && IsAsciiCasingSameAsInvariant);

      return useAsciiPath
          ? ToLowerAsciiInvariant(c)
          : ChangeCase(c, toUpper: false);
  }
  /// <summary>
  /// Converts the string to lower case, using the ASCII-only conversion in
  /// globalization-invariant mode and the culture-aware conversion otherwise.
  /// </summary>
  /// <exception cref="ArgumentNullException"><paramref name="str"/> is null.</exception>
  public virtual string ToLower(string str)
  {
      if (str is null)
      {
          throw new ArgumentNullException(nameof(str));
      }

      return GlobalizationMode.Invariant
          ? ToLowerAsciiInvariant(str)
          : ChangeCaseCommon<ToLowerConversion>(str);
  }
  /// <summary>
  /// Changes the case of a single character by running the pointer-based
  /// ChangeCase over a one-character source and destination.
  /// Must not be called in globalization-invariant mode.
  /// </summary>
  private unsafe char ChangeCase(char c, bool toUpper)
  {
      Debug.Assert(!GlobalizationMode.Invariant);

      char converted = '\0';
      ChangeCase(&c, 1, &converted, 1, toUpper);
      return converted;
  }
  /// <summary>
  /// Lowercases <paramref name="source"/> into <paramref name="destination"/>,
  /// which must be at least as long as the source.
  /// </summary>
  [MethodImpl(MethodImplOptions.AggressiveInlining)]
  internal void ChangeCaseToLower(ReadOnlySpan<char> source, Span<char> destination)
  {
      Debug.Assert(destination.Length >= source.Length);

      // The span-based overload unwraps both spans and forwards to the ref-based worker.
      ChangeCaseCommon<ToLowerConversion>(source, destination);
  }
  /// <summary>
  /// Uppercases <paramref name="source"/> into <paramref name="destination"/>,
  /// which must be at least as long as the source.
  /// </summary>
  [MethodImpl(MethodImplOptions.AggressiveInlining)]
  internal void ChangeCaseToUpper(ReadOnlySpan<char> source, Span<char> destination)
  {
      Debug.Assert(destination.Length >= source.Length);

      // The span-based overload unwraps both spans and forwards to the ref-based worker.
      ChangeCaseCommon<ToUpperConversion>(source, destination);
  }
  /// <summary>
  /// Span front end for the ref-based ChangeCaseCommon: converts source.Length
  /// characters from <paramref name="source"/> into <paramref name="destination"/>.
  /// </summary>
  [MethodImpl(MethodImplOptions.AggressiveInlining)]
  private void ChangeCaseCommon<TConversion>(ReadOnlySpan<char> source, Span<char> destination) where TConversion : struct
  {
      Debug.Assert(destination.Length >= source.Length);

      ref char firstSource = ref MemoryMarshal.GetReference(source);
      ref char firstDestination = ref MemoryMarshal.GetReference(destination);
      ChangeCaseCommon<TConversion>(ref firstSource, ref firstDestination, source.Length);
  }
  /// <summary>
  /// Converts <paramref name="charCount"/> characters starting at <paramref name="source"/>
  /// to upper or lower case (selected by <typeparamref name="TConversion"/>) and writes them
  /// starting at <paramref name="destination"/>. ASCII data is converted in managed code when
  /// this culture cases ASCII like the invariant culture; everything from the first non-ASCII
  /// character onwards is handed to the pointer-based ChangeCase.
  /// </summary>
  private unsafe void ChangeCaseCommon<TConversion>(ref char source, ref char destination, int charCount) where TConversion : struct
  {
      Debug.Assert(typeof(TConversion) == typeof(ToUpperConversion) || typeof(TConversion) == typeof(ToLowerConversion));
      bool toUpper = typeof(TConversion) == typeof(ToUpperConversion); // JIT will treat this as a constant in release builds

      Debug.Assert(!GlobalizationMode.Invariant);
      Debug.Assert(charCount >= 0);

      if (charCount == 0)
      {
          return;
      }

      fixed (char* pSrc = &source)
      fixed (char* pDst = &destination)
      {
          nuint offset = 0; // in chars

          if (IsAsciiCasingSameAsInvariant)
          {
              // Handle 4 chars (two 32-bit integers) per iteration while at least 4 remain.
              if (charCount >= 4)
              {
                  nuint lastOffsetForFourChars = (uint)charCount - 4;
                  do
                  {
                      // This is a mostly branchless case change routine. The input is assumed to be
                      // predominantly ASCII, so the 'if' checks below should normally be false. Within
                      // ASCII data either case is about equally likely, so the case change itself is
                      // kept branchless. Inputs are not expected to be very long (16+ characters), so
                      // the loop is not vectorized until data shows that it is worthwhile.
                      uint packed = Unsafe.ReadUnaligned<uint>(pSrc + offset);
                      if (!Utf16Utility.AllCharsInUInt32AreAscii(packed))
                      {
                          goto NonAscii;
                      }

                      packed = toUpper
                          ? Utf16Utility.ConvertAllAsciiCharsInUInt32ToUppercase(packed)
                          : Utf16Utility.ConvertAllAsciiCharsInUInt32ToLowercase(packed);
                      Unsafe.WriteUnaligned<uint>(pDst + offset, packed);

                      packed = Unsafe.ReadUnaligned<uint>(pSrc + offset + 2);
                      if (!Utf16Utility.AllCharsInUInt32AreAscii(packed))
                      {
                          goto NonAsciiSkipTwoChars;
                      }

                      packed = toUpper
                          ? Utf16Utility.ConvertAllAsciiCharsInUInt32ToUppercase(packed)
                          : Utf16Utility.ConvertAllAsciiCharsInUInt32ToLowercase(packed);
                      Unsafe.WriteUnaligned<uint>(pDst + offset + 2, packed);

                      offset += 4;
                  } while (offset <= lastOffsetForFourChars);

                  // Fewer than 4 characters remain at this point.
                  Debug.Assert((uint)charCount - offset < 4);
              }

              // With 2 or 3 characters left, convert two of them now.
              if ((charCount & 2) != 0)
              {
                  uint packed = Unsafe.ReadUnaligned<uint>(pSrc + offset);
                  if (!Utf16Utility.AllCharsInUInt32AreAscii(packed))
                  {
                      goto NonAscii;
                  }

                  packed = toUpper
                      ? Utf16Utility.ConvertAllAsciiCharsInUInt32ToUppercase(packed)
                      : Utf16Utility.ConvertAllAsciiCharsInUInt32ToLowercase(packed);
                  Unsafe.WriteUnaligned<uint>(pDst + offset, packed);
                  offset += 2;
              }

              // A single trailing character, if any.
              if ((charCount & 1) != 0)
              {
                  uint lastChar = pSrc[offset];
                  if (lastChar > 0x7Fu)
                  {
                      goto NonAscii;
                  }

                  lastChar = toUpper
                      ? Utf16Utility.ConvertAllAsciiCharsInUInt32ToUppercase(lastChar)
                      : Utf16Utility.ConvertAllAsciiCharsInUInt32ToLowercase(lastChar);
                  pDst[offset] = (char)lastChar;
              }

              // Everything was ASCII and has been converted.
              return;

          // Non-ASCII data was found: fall through to the p/invoke code path below,
          // skipping the part that has already been converted.
          NonAsciiSkipTwoChars:
              offset += 2;

          NonAscii:
              Debug.Assert(offset < (uint)charCount, "We somehow read past the end of the buffer.");
              charCount -= (int)offset;
          }

          // We encountered non-ASCII data and therefore can't perform invariant case conversion; or the requested culture
          // has a case conversion that's different from the invariant culture, even for ASCII data (e.g., tr-TR converts
          // 'i' (U+0069) to Latin Capital Letter I With Dot Above (U+0130)).
          ChangeCase(pSrc + offset, charCount, pDst + offset, charCount, toUpper);
      }
  }
  /// <summary>
  /// Returns <paramref name="source"/> converted to upper or lower case (selected by
  /// <typeparamref name="TConversion"/>). The original string instance is returned when the
  /// managed ASCII scan finds nothing to change; otherwise a new string of the same length
  /// is allocated and filled.
  /// </summary>
  private unsafe string ChangeCaseCommon<TConversion>(string source) where TConversion : struct
  {
      Debug.Assert(typeof(TConversion) == typeof(ToUpperConversion) || typeof(TConversion) == typeof(ToLowerConversion));
      bool toUpper = typeof(TConversion) == typeof(ToUpperConversion); // JIT will treat this as a constant in release builds

      Debug.Assert(!GlobalizationMode.Invariant);
      Debug.Assert(source != null);

      // Nothing to do for the empty string.
      if (source.Length == 0)
      {
          return string.Empty;
      }

      fixed (char* pSrc = source)
      {
          nuint offset = 0; // in chars

          // If this culture's casing for ASCII is the same as invariant, try to take
          // a fast path that'll work in managed code and ASCII rather than calling out
          // to the OS for culture-aware casing.
          if (IsAsciiCasingSameAsInvariant)
          {
              // Scan 2 chars (one 32-bit integer) at a time.
              if (source.Length >= 2)
              {
                  nuint lastOffsetForTwoChars = (uint)source.Length - 2;
                  do
                  {
                      // See the comments in ChangeCaseCommon<TConversion>(ref char, ref char, int) for a full explanation of the code below.
                      uint packed = Unsafe.ReadUnaligned<uint>(pSrc + offset);
                      if (!Utf16Utility.AllCharsInUInt32AreAscii(packed))
                      {
                          goto NotAscii;
                      }

                      bool pairNeedsChange = toUpper
                          ? Utf16Utility.UInt32ContainsAnyLowercaseAsciiChar(packed)
                          : Utf16Utility.UInt32ContainsAnyUppercaseAsciiChar(packed);
                      if (pairNeedsChange)
                      {
                          goto AsciiMustChangeCase;
                      }

                      offset += 2;
                  } while (offset <= lastOffsetForTwoChars);
              }

              // A single trailing character, if any.
              if ((source.Length & 1) != 0)
              {
                  uint lastChar = pSrc[offset];
                  if (lastChar > 0x7Fu)
                  {
                      goto NotAscii;
                  }

                  bool charNeedsChange = toUpper
                      ? ((lastChar - 'a') <= (uint)('z' - 'a'))
                      : ((lastChar - 'A') <= (uint)('Z' - 'A'));
                  if (charNeedsChange)
                  {
                      goto AsciiMustChangeCase;
                  }
              }

              // Every character was ASCII and already in the requested case.
              return source;

          AsciiMustChangeCase:
              {
                  // ASCII data that requires a case change was found. A new string must be allocated,
                  // but try to stay within the managed (non-localization tables) conversion code path.
                  string converted = string.FastAllocateString(source.Length); // changing case uses simple folding: doesn't change UTF-16 code unit count

                  // Copy the already-verified prefix into the result...
                  Span<char> convertedSpan = new Span<char>(ref converted.GetRawStringData(), converted.Length);
                  source.AsSpan(0, (int)offset).CopyTo(convertedSpan);

                  // ...and run the span-based logic over the remainder of the data.
                  ChangeCaseCommon<TConversion>(source.AsSpan((int)offset), convertedSpan.Slice((int)offset));
                  return converted;
              }
          }

      NotAscii:
          {
              // We reached non-ASCII data *or* the requested culture doesn't map ASCII data the same way as the invariant culture.
              // In either case we need to fall back to the localization tables.
              string converted = string.FastAllocateString(source.Length); // changing case uses simple folding: doesn't change UTF-16 code unit count

              if (offset > 0)
              {
                  // Copy the already-verified prefix into the result.
                  Span<char> convertedSpan = new Span<char>(ref converted.GetRawStringData(), converted.Length);
                  source.AsSpan(0, (int)offset).CopyTo(convertedSpan);
              }

              // Run the culture-aware logic over the remainder of the data.
              fixed (char* pConverted = converted)
              {
                  ChangeCase(pSrc + offset, source.Length - (int)offset, pConverted + offset, converted.Length - (int)offset, toUpper);
              }

              return converted;
          }
      }
  }
  /// <summary>
  /// Lowercases only the ASCII letters 'A'-'Z' in <paramref name="s"/>. Returns the same
  /// string instance when it contains no such letters, otherwise a new string.
  /// </summary>
  internal static unsafe string ToLowerAsciiInvariant(string s)
  {
      if (s.Length == 0)
      {
          return string.Empty;
      }

      fixed (char* pSource = s)
      {
          // Find the first character that actually needs lowering.
          int firstUpper = 0;
          for (; firstUpper < s.Length; firstUpper++)
          {
              if ((uint)(pSource[firstUpper] - 'A') <= (uint)('Z' - 'A'))
              {
                  break;
              }
          }

          if (firstUpper >= s.Length)
          {
              return s;
          }

          string result = string.FastAllocateString(s.Length);
          fixed (char* pResult = result)
          {
              // The prefix before the first uppercase letter is copied verbatim.
              new ReadOnlySpan<char>(pSource, firstUpper).CopyTo(new Span<char>(pResult, firstUpper));

              // From the first uppercase letter onwards each char goes through the per-char helper.
              for (int k = firstUpper; k < s.Length; k++)
              {
                  pResult[k] = ToLowerAsciiInvariant(pSource[k]);
              }
          }

          return result;
      }
  }
  /// <summary>
  /// Writes the ASCII-lowercased form of every character of <paramref name="source"/>
  /// to the same position in <paramref name="destination"/>.
  /// </summary>
  internal static void ToLowerAsciiInvariant(ReadOnlySpan<char> source, Span<char> destination)
  {
      Debug.Assert(destination.Length >= source.Length);

      int position = 0;
      foreach (char c in source)
      {
          destination[position] = ToLowerAsciiInvariant(c);
          position++;
      }
  }
  /// <summary>
  /// Uppercases only the ASCII letters 'a'-'z' in <paramref name="s"/>. Returns the same
  /// string instance when it contains no such letters, otherwise a new string.
  /// </summary>
  private static unsafe string ToUpperAsciiInvariant(string s)
  {
      if (s.Length == 0)
      {
          return string.Empty;
      }

      fixed (char* pSource = s)
      {
          // Find the first character that actually needs uppercasing.
          int firstLower = 0;
          for (; firstLower < s.Length; firstLower++)
          {
              if ((uint)(pSource[firstLower] - 'a') <= (uint)('z' - 'a'))
              {
                  break;
              }
          }

          if (firstLower >= s.Length)
          {
              return s;
          }

          string result = string.FastAllocateString(s.Length);
          fixed (char* pResult = result)
          {
              // The prefix before the first lowercase letter is copied verbatim.
              new ReadOnlySpan<char>(pSource, firstLower).CopyTo(new Span<char>(pResult, firstLower));

              // From the first lowercase letter onwards each char goes through the per-char helper.
              for (int k = firstLower; k < s.Length; k++)
              {
                  pResult[k] = ToUpperAsciiInvariant(pSource[k]);
              }
          }

          return result;
      }
  }
  /// <summary>
  /// Writes the ASCII-uppercased form of every character of <paramref name="source"/>
  /// to the same position in <paramref name="destination"/>.
  /// </summary>
  internal static void ToUpperAsciiInvariant(ReadOnlySpan<char> source, Span<char> destination)
  {
      Debug.Assert(destination.Length >= source.Length);

      int position = 0;
      foreach (char c in source)
      {
          destination[position] = ToUpperAsciiInvariant(c);
          position++;
      }
  }
  /// <summary>Maps 'A'-'Z' to 'a'-'z'; every other character is returned unchanged.</summary>
  private static char ToLowerAsciiInvariant(char c)
  {
      // Single unsigned range check: true only for 'A'..'Z'.
      bool isUpperAscii = (uint)(c - 'A') <= (uint)('Z' - 'A');
      return isUpperAscii ? (char)(c | 0x20) : c;
  }
  /// <summary>
  /// Converts the character to upper case. The ASCII-only conversion is used in
  /// globalization-invariant mode, or when the character is ASCII and this culture
  /// cases ASCII the same way the invariant culture does; otherwise the
  /// culture-aware conversion is used.
  /// </summary>
  public virtual char ToUpper(char c)
  {
      bool useAsciiPath = GlobalizationMode.Invariant || (IsAscii(c) && IsAsciiCasingSameAsInvariant);

      return useAsciiPath
          ? ToUpperAsciiInvariant(c)
          : ChangeCase(c, toUpper: true);
  }
  /// <summary>
  /// Converts the string to upper case, using the ASCII-only conversion in
  /// globalization-invariant mode and the culture-aware conversion otherwise.
  /// </summary>
  /// <exception cref="ArgumentNullException"><paramref name="str"/> is null.</exception>
  public virtual string ToUpper(string str)
  {
      if (str is null)
      {
          throw new ArgumentNullException(nameof(str));
      }

      return GlobalizationMode.Invariant
          ? ToUpperAsciiInvariant(str)
          : ChangeCaseCommon<ToUpperConversion>(str);
  }
  /// <summary>Maps 'a'-'z' to 'A'-'Z'; every other character is returned unchanged.</summary>
  internal static char ToUpperAsciiInvariant(char c)
  {
      // Single unsigned range check: true only for 'a'..'z'.
      bool isLowerAscii = (uint)(c - 'a') <= (uint)('z' - 'a');
      return isLowerAscii ? (char)(c & ~0x20) : c;
  }
  /// <summary>True when <paramref name="c"/> is in the range U+0000..U+007F.</summary>
  private static bool IsAscii(char c)
  {
      return c <= '\u007F';
  }
  /// <summary>
  /// True when this culture cases ASCII letters the same way the invariant culture does.
  /// The answer is computed on first access and cached in a Tristate field.
  /// </summary>
  private bool IsAsciiCasingSameAsInvariant
  {
      [MethodImpl(MethodImplOptions.AggressiveInlining)]
      get
      {
          // The computation is kept in a separate non-inlined method; only this check gets inlined.
          if (_isAsciiCasingSameAsInvariant == Tristate.NotInitialized)
          {
              PopulateIsAsciiCasingSameAsInvariant();
          }

          Debug.Assert(_isAsciiCasingSameAsInvariant == Tristate.True || _isAsciiCasingSameAsInvariant == Tristate.False);

          return _isAsciiCasingSameAsInvariant == Tristate.True;
      }
  }
  /// <summary>
  /// Computes and caches whether this culture's case-insensitive comparison treats the
  /// lowercase and uppercase ASCII alphabets as equal.
  /// </summary>
  [MethodImpl(MethodImplOptions.NoInlining)]
  private void PopulateIsAsciiCasingSameAsInvariant()
  {
      CompareInfo compareInfo = CultureInfo.GetCultureInfo(_textInfoName).CompareInfo;
      int comparison = compareInfo.Compare("abcdefghijklmnopqrstuvwxyz", "ABCDEFGHIJKLMNOPQRSTUVWXYZ", CompareOptions.IgnoreCase);

      if (comparison == 0)
      {
          _isAsciiCasingSameAsInvariant = Tristate.True;
      }
      else
      {
          _isAsciiCasingSameAsInvariant = Tristate.False;
      }
  }
  /// <summary>
  /// Returns true if the dominant direction of text (and of UI such as the relative
  /// position of buttons and scroll bars) is right to left, as reported by the culture data.
  /// </summary>
  public bool IsRightToLeft
  {
      get { return _cultureData.IsRightToLeft; }
  }
  /// <summary>
  /// Two TextInfo instances are equal when their CultureName values are equal;
  /// anything that is not a TextInfo is never equal.
  /// </summary>
  public override bool Equals(object? obj)
  {
      if (obj is TextInfo other)
      {
          return CultureName.Equals(other.CultureName);
      }

      return false;
  }
  /// <summary>Hash code derived from CultureName, consistent with Equals.</summary>
  public override int GetHashCode()
  {
      return CultureName.GetHashCode();
  }
  /// <summary>Returns "TextInfo - " followed by the culture name from the culture data.</summary>
  public override string ToString() => "TextInfo - " + _cultureData.CultureName;
  /// <summary>
  /// Titlecasing refers to a casing practice wherein the first letter of a word is an uppercase letter
  /// and the rest of the letters are lowercase. The choice of which words to titlecase in headings
  /// and titles is dependent on language and local conventions. For example, "The Merry Wives of Windsor"
  /// is the appropriate titlecasing of that play's name in English, with the word "of" not titlecased.
  /// In German, however, the title is "Die lustigen Weiber von Windsor," and both "lustigen" and "von"
  /// are not titlecased. In French even fewer words are titlecased: "Les joyeuses commeres de Windsor."
  ///
  /// Moreover, the determination of what actually constitutes a word is language dependent, and this can
  /// influence which letter or letters of a "word" are uppercased when titlecasing strings. For example
  /// "l'arbre" is considered two words in French, whereas "can't" is considered one word in English.
  /// </summary>
  /// <exception cref="ArgumentNullException"><paramref name="str"/> is null.</exception>
  public unsafe string ToTitleCase(string str)
  {
      if (str is null)
      {
          throw new ArgumentNullException(nameof(str));
      }

      if (str.Length == 0)
      {
          return str;
      }

      StringBuilder result = new StringBuilder();

      // Lowercased copy of the whole input, produced only if some word needs it.
      string? lowercaseData = null;

      // Dutch is special-cased: a leading "ij" is titlecased as "IJ".
      bool isDutchCulture = CultureName.StartsWith("nl-", StringComparison.OrdinalIgnoreCase);

      for (int pos = 0; pos < str.Length; pos++)
      {
          UnicodeCategory category = CharUnicodeInfo.InternalGetUnicodeCategory(str, pos, out int charLen);

          if (!char.CheckLetter(category))
          {
              // Not a letter, just append it.
              pos = AddNonLetter(ref result, ref str, pos, charLen);
              continue;
          }

          bool startsWithDutchIJ =
              isDutchCulture
              && pos < str.Length - 1
              && (str[pos] == 'i' || str[pos] == 'I')
              && (str[pos + 1] == 'j' || str[pos + 1] == 'J');

          if (startsWithDutchIJ)
          {
              result.Append("IJ");
              pos += 2;
          }
          else
          {
              // Titlecase the first character of the word.
              pos = AddTitlecaseLetter(ref result, ref str, pos, charLen) + 1;
          }

          // The rest of the word, up to the next separator, is lowercased in segments.
          int segmentStart = pos;

          // Tracking whether a lowercase letter was seen keeps acronyms (like "URT", "USA", etc)
          // from being lowercased. This is in line with Word 2000 behavior of titlecasing.
          bool sawLowercase = category == UnicodeCategory.LowercaseLetter;

          // Walk over everything that still belongs to this word.
          while (pos < str.Length)
          {
              category = CharUnicodeInfo.InternalGetUnicodeCategory(str, pos, out charLen);

              if (IsLetterCategory(category))
              {
                  if (category == UnicodeCategory.LowercaseLetter)
                  {
                      sawLowercase = true;
                  }

                  pos += charLen;
              }
              else if (str[pos] == '\'')
              {
                  // An apostrophe ends the current segment (inclusive) and starts a new one.
                  pos++;
                  AppendSegment(segmentStart, pos - segmentStart, sawLowercase);
                  segmentStart = pos;
                  sawLowercase = true;
              }
              else if (!IsWordSeparator(category))
              {
                  // This category is considered part of the word:
                  // it is one of those marked false in the word-separator mask.
                  pos += charLen;
              }
              else
              {
                  // A word separator ends the word.
                  break;
              }
          }

          int remaining = pos - segmentStart;
          if (remaining > 0)
          {
              AppendSegment(segmentStart, remaining, sawLowercase);
          }

          if (pos < str.Length)
          {
              // The separator that ended the word is not a letter, just append it.
              pos = AddNonLetter(ref result, ref str, pos, charLen);
          }
      }

      return result.ToString();

      // Appends str[start, start + length), taken from the lowercased copy when requested.
      void AppendSegment(int start, int length, bool lowercase)
      {
          if (lowercase)
          {
              lowercaseData ??= ToLower(str);
              result.Append(lowercaseData, start, length);
          }
          else
          {
              result.Append(str, start, length);
          }
      }
  }
  /// <summary>
  /// Appends the character at <paramref name="inputIndex"/> (both halves when
  /// <paramref name="charLen"/> is 2) to <paramref name="result"/> unchanged.
  /// </summary>
  /// <returns>The index of the last UTF-16 code unit that was appended.</returns>
  private static int AddNonLetter(ref StringBuilder result, ref string input, int inputIndex, int charLen)
  {
      Debug.Assert(charLen == 1 || charLen == 2, "[TextInfo.AddNonLetter] CharUnicodeInfo.InternalGetUnicodeCategory returned an unexpected charLen!");

      result.Append(input[inputIndex]);

      if (charLen == 2)
      {
          // Surrogate pair: copy the second half too and report its index.
          inputIndex++;
          result.Append(input[inputIndex]);
      }

      return inputIndex;
  }
  /// <summary>
  /// Appends the titlecased form of the character at <paramref name="inputIndex"/> to
  /// <paramref name="result"/>. Surrogate pairs are uppercased as a pair (or copied as-is in
  /// globalization-invariant mode); four digraph families map to their titlecase form;
  /// everything else goes through ToUpper.
  /// </summary>
  /// <returns>The index of the last UTF-16 code unit that was consumed.</returns>
  private int AddTitlecaseLetter(ref StringBuilder result, ref string input, int inputIndex, int charLen)
  {
      Debug.Assert(charLen == 1 || charLen == 2, "[TextInfo.AddTitlecaseLetter] CharUnicodeInfo.InternalGetUnicodeCategory returned an unexpected charLen!");

      if (charLen == 2)
      {
          // For surrogate pairs do a ToUpper operation on the two-char slice.
          ReadOnlySpan<char> pair = input.AsSpan(inputIndex, 2);
          if (GlobalizationMode.Invariant)
          {
              result.Append(pair); // surrogate pair in invariant mode, so changing case is a nop
          }
          else
          {
              Span<char> upperPair = stackalloc char[2];
              ChangeCaseToUpper(pair, upperPair);
              result.Append(upperPair);
          }

          return inputIndex + 1;
      }

      char letter = input[inputIndex];
      char titlecased;

      // For AppCompat, the Titlecase Case Mapping data from NDP 2.0 is used below.
      if (letter >= '\u01C4' && letter <= '\u01C6')
      {
          // DZ / Dz / dz with Caron -> Dz with Caron
          titlecased = '\u01C5';
      }
      else if (letter >= '\u01C7' && letter <= '\u01C9')
      {
          // LJ / Lj / lj -> Lj
          titlecased = '\u01C8';
      }
      else if (letter >= '\u01CA' && letter <= '\u01CC')
      {
          // NJ / Nj / nj -> Nj
          titlecased = '\u01CB';
      }
      else if (letter >= '\u01F1' && letter <= '\u01F3')
      {
          // DZ / Dz / dz -> Dz
          titlecased = '\u01F2';
      }
      else
      {
          titlecased = ToUpper(letter);
      }

      result.Append(titlecased);
      return inputIndex;
  }
  // Used in ToTitleCase():
  // Once a starting letter has been found, this bit mask decides whether a category is
  // treated as a word separator. Bit N corresponds to the UnicodeCategory whose value is N.
  // Categories not listed here (letters, marks, numbers, Surrogate, PrivateUse and
  // OtherNotAssigned) are considered part of the word.
  private const int c_wordSeparatorMask =
      (1 << (int)UnicodeCategory.SpaceSeparator) |
      (1 << (int)UnicodeCategory.LineSeparator) |
      (1 << (int)UnicodeCategory.ParagraphSeparator) |
      (1 << (int)UnicodeCategory.Control) |
      (1 << (int)UnicodeCategory.Format) |
      (1 << (int)UnicodeCategory.ConnectorPunctuation) |
      (1 << (int)UnicodeCategory.DashPunctuation) |
      (1 << (int)UnicodeCategory.OpenPunctuation) |
      (1 << (int)UnicodeCategory.ClosePunctuation) |
      (1 << (int)UnicodeCategory.InitialQuotePunctuation) |
      (1 << (int)UnicodeCategory.FinalQuotePunctuation) |
      (1 << (int)UnicodeCategory.OtherPunctuation) |
      (1 << (int)UnicodeCategory.MathSymbol) |
      (1 << (int)UnicodeCategory.CurrencySymbol) |
      (1 << (int)UnicodeCategory.ModifierSymbol) |
      (1 << (int)UnicodeCategory.OtherSymbol);

  /// <summary>True when <paramref name="category"/> has its bit set in the word-separator mask.</summary>
  private static bool IsWordSeparator(UnicodeCategory category)
  {
      int bit = (c_wordSeparatorMask >> (int)category) & 1;
      return bit != 0;
  }
  /// <summary>True for the five letter categories: uppercase, lowercase, titlecase, modifier and other letter.</summary>
  private static bool IsLetterCategory(UnicodeCategory uc)
  {
      switch (uc)
      {
          case UnicodeCategory.UppercaseLetter:
          case UnicodeCategory.LowercaseLetter:
          case UnicodeCategory.TitlecaseLetter:
          case UnicodeCategory.ModifierLetter:
          case UnicodeCategory.OtherLetter:
              return true;

          default:
              return false;
      }
  }
  /// <summary>Empty marker type: passed as the generic argument to request an upper-case conversion.</summary>
  private readonly struct ToUpperConversion
  {
  }

  /// <summary>Empty marker type: passed as the generic argument to request a lower-case conversion.</summary>
  private readonly struct ToLowerConversion
  {
  }
  744. }
  745. }