TextInfo.cs 34 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875
  1. // Licensed to the .NET Foundation under one or more agreements.
  2. // The .NET Foundation licenses this file to you under the MIT license.
  3. // See the LICENSE file in the project root for more information.
  4. using System.Diagnostics;
  5. using System.Runtime.CompilerServices;
  6. using System.Runtime.InteropServices;
  7. using System.Runtime.Serialization;
  8. using System.Text;
  9. using System.Text.Unicode;
  10. using Internal.Runtime.CompilerServices;
  11. #if BIT64
  12. using nuint = System.UInt64;
  13. using nint = System.Int64;
  14. #else // BIT64
  15. using nuint = System.UInt32;
  16. using nint = System.Int32;
  17. #endif // BIT64
  18. namespace System.Globalization
  19. {
  /// <summary>
  /// This class defines behaviors specific to a writing system.
  /// A writing system is the collection of scripts and orthographic rules
  /// required to represent a language as text.
  /// </summary>
  25. public partial class TextInfo : ICloneable, IDeserializationCallback
  26. {
  /// <summary>
  /// Three-state flag used for a boolean that is computed on first use:
  /// the zero value means "not computed yet".
  /// </summary>
  private enum Tristate : byte
  {
      NotInitialized, // 0
      False,          // 1
      True            // 2
  }
  // Cached list separator; null until first read or explicitly set.
  private string? _listSeparator;

  // True when this instance rejects modification (see VerifyWritable).
  private bool _isReadOnly;

  private readonly string _cultureName;
  private readonly CultureData _cultureData;

  // Name of the text info we're using (ie: _cultureData.TextInfoName)
  private readonly string _textInfoName;

  // Lazily computed; the default value is Tristate.NotInitialized (0).
  private Tristate _isAsciiCasingSameAsInvariant;
  // Backing store for Invariant; created on first access.
  private static volatile TextInfo? s_invariant;

  /// <summary>
  /// Gets the shared TextInfo built from <c>CultureData.Invariant</c>,
  /// creating and caching it on first access.
  /// </summary>
  internal static TextInfo Invariant
  {
      get
      {
          TextInfo? cached = s_invariant;
          if (cached == null)
          {
              cached = new TextInfo(CultureData.Invariant);
              s_invariant = cached;
          }

          return cached;
      }
  }
  /// <summary>
  /// Builds a TextInfo on top of the supplied culture data, capturing its
  /// culture name and text info name, then runs FinishInitialization.
  /// </summary>
  /// <param name="cultureData">The primary data source for this instance.</param>
  internal TextInfo(CultureData cultureData)
  {
      // The culture data is the primary source; the names are cached copies taken from it.
      _cultureData = cultureData;
      _cultureName = cultureData.CultureName;
      _textInfoName = cultureData.TextInfoName;

      FinishInitialization();
  }
  /// <summary>
  /// Deserialization callback; always throws.
  /// </summary>
  /// <exception cref="PlatformNotSupportedException">Always thrown.</exception>
  void IDeserializationCallback.OnDeserialization(object? sender) =>
      throw new PlatformNotSupportedException();
  /// <summary>Gets the ANSI code page reported by the underlying culture data.</summary>
  public virtual int ANSICodePage
  {
      get { return _cultureData.ANSICodePage; }
  }

  /// <summary>Gets the OEM code page reported by the underlying culture data.</summary>
  public virtual int OEMCodePage
  {
      get { return _cultureData.OEMCodePage; }
  }

  /// <summary>Gets the Mac code page reported by the underlying culture data.</summary>
  public virtual int MacCodePage
  {
      get { return _cultureData.MacCodePage; }
  }

  /// <summary>Gets the EBCDIC code page reported by the underlying culture data.</summary>
  public virtual int EBCDICCodePage
  {
      get { return _cultureData.EBCDICCodePage; }
  }
  /// <summary>
  /// Gets the LCID of the culture looked up by this instance's text info name.
  /// </summary>
  public int LCID
  {
      get
      {
          CultureInfo textInfoCulture = CultureInfo.GetCultureInfo(_textInfoName);
          return textInfoCulture.LCID;
      }
  }

  /// <summary>Gets the text info name (not the raw culture name field).</summary>
  public string CultureName
  {
      get { return _textInfoName; }
  }

  /// <summary>Gets whether this instance is currently marked read-only.</summary>
  public bool IsReadOnly
  {
      get { return _isReadOnly; }
  }
  /// <summary>
  /// Creates a shallow copy of this instance that is always writable,
  /// regardless of whether the original is read-only.
  /// </summary>
  /// <returns>The writable copy.</returns>
  public virtual object Clone()
  {
      TextInfo copy = (TextInfo)MemberwiseClone();
      copy.SetReadOnlyState(false);
      return copy;
  }
  /// <summary>
  /// Returns a read-only TextInfo: the input itself when it is already
  /// read-only, otherwise a shallow copy marked read-only.
  /// </summary>
  /// <param name="textInfo">The instance to wrap.</param>
  /// <exception cref="ArgumentNullException"><paramref name="textInfo"/> is null.</exception>
  public static TextInfo ReadOnly(TextInfo textInfo)
  {
      if (textInfo == null)
      {
          throw new ArgumentNullException(nameof(textInfo));
      }

      if (!textInfo.IsReadOnly)
      {
          TextInfo readOnlyCopy = (TextInfo)textInfo.MemberwiseClone();
          readOnlyCopy.SetReadOnlyState(true);
          return readOnlyCopy;
      }

      // Already read-only: no copy needed.
      return textInfo;
  }
  /// <summary>
  /// Guard used by setters: throws when this instance is read-only.
  /// </summary>
  /// <exception cref="InvalidOperationException">The instance is read-only.</exception>
  private void VerifyWritable()
  {
      if (!_isReadOnly)
      {
          return;
      }

      throw new InvalidOperationException(SR.InvalidOperation_ReadOnly);
  }
  /// <summary>Sets the read-only flag of this instance.</summary>
  /// <param name="readOnly">The new value of the flag.</param>
  internal void SetReadOnlyState(bool readOnly) => _isReadOnly = readOnly;
  /// <summary>
  /// Gets or sets the string used to separate items in a list. The getter
  /// falls back to the culture data's separator and caches it; the setter
  /// rejects null and requires a writable instance.
  /// </summary>
  /// <exception cref="ArgumentNullException">The assigned value is null.</exception>
  /// <exception cref="InvalidOperationException">The instance is read-only.</exception>
  public virtual string ListSeparator
  {
      get
      {
          string? separator = _listSeparator;
          if (separator == null)
          {
              separator = _cultureData.ListSeparator;
              _listSeparator = separator;
          }

          return separator;
      }
      set
      {
          // The null check comes first, so a null value reports ArgumentNullException
          // even on a read-only instance.
          if (value == null)
          {
              throw new ArgumentNullException(nameof(value));
          }

          VerifyWritable();
          _listSeparator = value;
      }
  }
  /// <summary>
  /// Converts a single character to lower case. Uses the ASCII-only
  /// invariant conversion in invariant globalization mode, or when the
  /// character is ASCII and this culture cases ASCII like the invariant
  /// culture; otherwise defers to the culture-aware ChangeCase.
  /// </summary>
  /// <param name="c">The character to convert.</param>
  public virtual char ToLower(char c)
  {
      bool useAsciiFastPath = GlobalizationMode.Invariant || (IsAscii(c) && IsAsciiCasingSameAsInvariant);

      return useAsciiFastPath
          ? ToLowerAsciiInvariant(c)
          : ChangeCase(c, toUpper: false);
  }
  /// <summary>
  /// Converts a string to lower case. In invariant globalization mode only
  /// ASCII letters are changed; otherwise the shared case-change routine is used.
  /// </summary>
  /// <param name="str">The string to convert.</param>
  /// <exception cref="ArgumentNullException"><paramref name="str"/> is null.</exception>
  public virtual string ToLower(string str)
  {
      if (str == null)
      {
          throw new ArgumentNullException(nameof(str));
      }

      return GlobalizationMode.Invariant
          ? ToLowerAsciiInvariant(str)
          : ChangeCaseCommon<ToLowerConversion>(str);
  }
  /// <summary>
  /// Changes the case of one character by running the pointer-based
  /// ChangeCase overload over a one-character buffer.
  /// Must not be called in invariant globalization mode.
  /// </summary>
  /// <param name="c">The character to convert.</param>
  /// <param name="toUpper">True for upper case, false for lower case.</param>
  private unsafe char ChangeCase(char c, bool toUpper)
  {
      Debug.Assert(!GlobalizationMode.Invariant);

      char converted = '\0';
      ChangeCase(&c, 1, &converted, 1, toUpper);
      return converted;
  }
  /// <summary>
  /// Lower-cases <paramref name="source"/> into <paramref name="destination"/>.
  /// The destination must be at least as long as the source (checked only by a debug assert).
  /// </summary>
  [MethodImpl(MethodImplOptions.AggressiveInlining)]
  internal void ChangeCaseToLower(ReadOnlySpan<char> source, Span<char> destination)
  {
      Debug.Assert(destination.Length >= source.Length);

      ref char sourceStart = ref MemoryMarshal.GetReference(source);
      ref char destinationStart = ref MemoryMarshal.GetReference(destination);
      ChangeCaseCommon<ToLowerConversion>(ref sourceStart, ref destinationStart, source.Length);
  }
  /// <summary>
  /// Upper-cases <paramref name="source"/> into <paramref name="destination"/>.
  /// The destination must be at least as long as the source (checked only by a debug assert).
  /// </summary>
  [MethodImpl(MethodImplOptions.AggressiveInlining)]
  internal void ChangeCaseToUpper(ReadOnlySpan<char> source, Span<char> destination)
  {
      Debug.Assert(destination.Length >= source.Length);

      ref char sourceStart = ref MemoryMarshal.GetReference(source);
      ref char destinationStart = ref MemoryMarshal.GetReference(destination);
      ChangeCaseCommon<ToUpperConversion>(ref sourceStart, ref destinationStart, source.Length);
  }
  /// <summary>
  /// Span front end for the ref-based case-change routine. The direction is
  /// selected by <typeparamref name="TConversion"/>.
  /// The destination must be at least as long as the source (checked only by a debug assert).
  /// </summary>
  [MethodImpl(MethodImplOptions.AggressiveInlining)]
  private void ChangeCaseCommon<TConversion>(ReadOnlySpan<char> source, Span<char> destination) where TConversion : struct
  {
      Debug.Assert(destination.Length >= source.Length);

      ref char sourceStart = ref MemoryMarshal.GetReference(source);
      ref char destinationStart = ref MemoryMarshal.GetReference(destination);
      ChangeCaseCommon<TConversion>(ref sourceStart, ref destinationStart, source.Length);
  }
  /// <summary>
  /// Changes the case of <paramref name="charCount"/> chars starting at
  /// <paramref name="source"/>, writing them to <paramref name="destination"/>.
  /// When this culture cases ASCII like the invariant culture, ASCII data is
  /// converted in managed code; from the first non-ASCII char onward (or for
  /// the whole input when ASCII casing differs) the pointer-based ChangeCase
  /// overload does the work.
  /// </summary>
  /// <typeparam name="TConversion">ToUpperConversion or ToLowerConversion.</typeparam>
  private unsafe void ChangeCaseCommon<TConversion>(ref char source, ref char destination, int charCount) where TConversion : struct
  {
      Debug.Assert(typeof(TConversion) == typeof(ToUpperConversion) || typeof(TConversion) == typeof(ToLowerConversion));
      bool toUpper = typeof(TConversion) == typeof(ToUpperConversion); // JIT will treat this as a constant in release builds

      Debug.Assert(!GlobalizationMode.Invariant);
      Debug.Assert(charCount >= 0);

      // Nothing to convert.
      if (charCount == 0)
      {
          return;
      }

      fixed (char* pSource = &source)
      fixed (char* pDestination = &destination)
      {
          nuint offset = 0; // in chars

          if (IsAsciiCasingSameAsInvariant)
          {
              // Main loop: four chars per iteration, as two 32-bit reads.
              if (charCount >= 4)
              {
                  nuint lastOffsetWithFourChars = (uint)charCount - 4;
                  do
                  {
                      // Mostly branchless: the input is assumed to be mainly ASCII, so the
                      // non-ASCII checks rarely fire, while the case change itself has no
                      // branches because upper- and lower-case letters are equally likely.
                      // Inputs are assumed to be short, so there is no vectorization here.
                      uint pair = Unsafe.ReadUnaligned<uint>(pSource + offset);
                      if (!Utf16Utility.AllCharsInUInt32AreAscii(pair))
                      {
                          goto NonAscii;
                      }
                      pair = (toUpper) ? Utf16Utility.ConvertAllAsciiCharsInUInt32ToUppercase(pair) : Utf16Utility.ConvertAllAsciiCharsInUInt32ToLowercase(pair);
                      Unsafe.WriteUnaligned<uint>(pDestination + offset, pair);

                      pair = Unsafe.ReadUnaligned<uint>(pSource + offset + 2);
                      if (!Utf16Utility.AllCharsInUInt32AreAscii(pair))
                      {
                          // The first two chars of this group were already converted.
                          goto NonAsciiSkipTwoChars;
                      }
                      pair = (toUpper) ? Utf16Utility.ConvertAllAsciiCharsInUInt32ToUppercase(pair) : Utf16Utility.ConvertAllAsciiCharsInUInt32ToLowercase(pair);
                      Unsafe.WriteUnaligned<uint>(pDestination + offset + 2, pair);

                      offset += 4;
                  } while (offset <= lastOffsetWithFourChars);

                  // Fewer than 4 chars remain from here on.
                  Debug.Assert((uint)charCount - offset < 4);
              }

              // Two or three chars left: convert two of them.
              if ((charCount & 2) != 0)
              {
                  uint pair = Unsafe.ReadUnaligned<uint>(pSource + offset);
                  if (!Utf16Utility.AllCharsInUInt32AreAscii(pair))
                  {
                      goto NonAscii;
                  }
                  pair = (toUpper) ? Utf16Utility.ConvertAllAsciiCharsInUInt32ToUppercase(pair) : Utf16Utility.ConvertAllAsciiCharsInUInt32ToLowercase(pair);
                  Unsafe.WriteUnaligned<uint>(pDestination + offset, pair);
                  offset += 2;
              }

              // One trailing char left: convert it.
              if ((charCount & 1) != 0)
              {
                  uint single = pSource[offset];
                  if (single > 0x7Fu)
                  {
                      goto NonAscii;
                  }
                  single = (toUpper) ? Utf16Utility.ConvertAllAsciiCharsInUInt32ToUppercase(single) : Utf16Utility.ConvertAllAsciiCharsInUInt32ToLowercase(single);
                  pDestination[offset] = (char)single;
              }

              // Everything was ASCII and has been converted.
              return;

              // Non-ASCII data was found: hand the remainder to the culture-aware path.
          NonAsciiSkipTwoChars:
              offset += 2;

          NonAscii:
              Debug.Assert(offset < (uint)charCount, "We somehow read past the end of the buffer.");
              charCount -= (int)offset;
          }

          // Either non-ASCII data was encountered, or this culture's ASCII casing differs
          // from the invariant culture (e.g., tr-TR converts 'i' (U+0069) to Latin Capital
          // Letter I With Dot Above (U+0130)).
          ChangeCase(pSource + offset, charCount, pDestination + offset, charCount, toUpper);
      }
  }
  /// <summary>
  /// Returns <paramref name="source"/> with its case changed, or the same
  /// string instance when it is ASCII and nothing needs to change. A new
  /// string is allocated only once a char that must change, or a non-ASCII
  /// char, is found; the already-scanned prefix is copied as-is.
  /// </summary>
  /// <typeparam name="TConversion">ToUpperConversion or ToLowerConversion.</typeparam>
  private unsafe string ChangeCaseCommon<TConversion>(string source) where TConversion : struct
  {
      Debug.Assert(typeof(TConversion) == typeof(ToUpperConversion) || typeof(TConversion) == typeof(ToLowerConversion));
      bool toUpper = typeof(TConversion) == typeof(ToUpperConversion); // JIT will treat this as a constant in release builds

      Debug.Assert(!GlobalizationMode.Invariant);
      Debug.Assert(source != null);

      // Empty input: nothing to do.
      if (source.Length == 0)
      {
          return string.Empty;
      }

      fixed (char* pSource = source)
      {
          nuint offset = 0; // in chars

          // When this culture cases ASCII like the invariant culture, scan in managed
          // code first instead of calling out for culture-aware casing.
          if (IsAsciiCasingSameAsInvariant)
          {
              // Scan two chars (one 32-bit integer) at a time.
              if (source.Length >= 2)
              {
                  nuint lastOffsetWithTwoChars = (uint)source.Length - 2;
                  do
                  {
                      uint pair = Unsafe.ReadUnaligned<uint>(pSource + offset);
                      if (!Utf16Utility.AllCharsInUInt32AreAscii(pair))
                      {
                          goto NotAscii;
                      }
                      if ((toUpper) ? Utf16Utility.UInt32ContainsAnyLowercaseAsciiChar(pair) : Utf16Utility.UInt32ContainsAnyUppercaseAsciiChar(pair))
                      {
                          goto AsciiMustChangeCase;
                      }

                      offset += 2;
                  } while (offset <= lastOffsetWithTwoChars);
              }

              // Odd length: examine the final char on its own.
              if ((source.Length & 1) != 0)
              {
                  uint single = pSource[offset];
                  if (single > 0x7Fu)
                  {
                      goto NotAscii;
                  }
                  if ((toUpper) ? ((single - 'a') <= (uint)('z' - 'a')) : ((single - 'A') <= (uint)('Z' - 'A')))
                  {
                      goto AsciiMustChangeCase;
                  }
              }

              // No char needed to change: return the original instance.
              return source;

          AsciiMustChangeCase:
              {
                  // ASCII data that needs a case change. A new string is required, but the
                  // span-based routine can still handle the rest in managed code if it stays ASCII.
                  string converted = string.FastAllocateString(source.Length); // changing case uses simple folding: doesn't change UTF-16 code unit count

                  // Copy the prefix that is already known to be unchanged.
                  Span<char> convertedSpan = new Span<char>(ref converted.GetRawStringData(), converted.Length);
                  source.AsSpan(0, (int)offset).CopyTo(convertedSpan);

                  // Convert everything from the current offset onward.
                  ChangeCaseCommon<TConversion>(source.AsSpan((int)offset), convertedSpan.Slice((int)offset));
                  return converted;
              }
          }

      NotAscii:
          {
              // Non-ASCII data was reached, or this culture does not case ASCII like the
              // invariant culture. Either way the localization tables are needed.
              string converted = string.FastAllocateString(source.Length); // changing case uses simple folding: doesn't change UTF-16 code unit count

              if (offset > 0)
              {
                  // Copy the prefix that is already known to be unchanged.
                  Span<char> convertedSpan = new Span<char>(ref converted.GetRawStringData(), converted.Length);
                  source.AsSpan(0, (int)offset).CopyTo(convertedSpan);
              }

              // Run the culture-aware conversion over the remainder.
              fixed (char* pConverted = converted)
              {
                  ChangeCase(pSource + offset, source.Length - (int)offset, pConverted + offset, converted.Length - (int)offset, toUpper);
              }
              return converted;
          }
      }
  }
  /// <summary>
  /// Lower-cases only the ASCII letters 'A'-'Z' in <paramref name="s"/>.
  /// Returns the same instance when the string contains no such letter.
  /// </summary>
  /// <param name="s">The string to convert; must not be null (not checked here).</param>
  internal static unsafe string ToLowerAsciiInvariant(string s)
  {
      if (s.Length == 0)
      {
          return string.Empty;
      }

      fixed (char* pSource = s)
      {
          // Locate the first char that actually needs to change.
          int firstUpper = 0;
          while (firstUpper < s.Length && (uint)(pSource[firstUpper] - 'A') > (uint)('Z' - 'A'))
          {
              firstUpper++;
          }

          // Nothing to change: hand back the input.
          if (firstUpper >= s.Length)
          {
              return s;
          }

          string result = string.FastAllocateString(s.Length);
          fixed (char* pResult = result)
          {
              // Unchanged prefix.
              for (int j = 0; j < firstUpper; j++)
              {
                  pResult[j] = pSource[j];
              }

              // This char is known to be 'A'-'Z', so the range check can be skipped.
              pResult[firstUpper] = (char)(pSource[firstUpper] | 0x20);

              // Remainder, checked char by char.
              for (int k = firstUpper + 1; k < s.Length; k++)
              {
                  pResult[k] = ToLowerAsciiInvariant(pSource[k]);
              }
          }

          return result;
      }
  }
  /// <summary>
  /// Writes <paramref name="source"/> to <paramref name="destination"/> with
  /// ASCII letters 'A'-'Z' lower-cased and every other char copied unchanged.
  /// </summary>
  internal static void ToLowerAsciiInvariant(ReadOnlySpan<char> source, Span<char> destination)
  {
      Debug.Assert(destination.Length >= source.Length);

      int writeIndex = 0;
      foreach (char c in source)
      {
          destination[writeIndex] = ToLowerAsciiInvariant(c);
          writeIndex++;
      }
  }
  /// <summary>
  /// Upper-cases only the ASCII letters 'a'-'z' in <paramref name="s"/>.
  /// Returns the same instance when the string contains no such letter.
  /// </summary>
  /// <param name="s">The string to convert; must not be null (not checked here).</param>
  private static unsafe string ToUpperAsciiInvariant(string s)
  {
      if (s.Length == 0)
      {
          return string.Empty;
      }

      fixed (char* pSource = s)
      {
          // Locate the first char that actually needs to change.
          int firstLower = 0;
          while (firstLower < s.Length && (uint)(pSource[firstLower] - 'a') > (uint)('z' - 'a'))
          {
              firstLower++;
          }

          // Nothing to change: hand back the input.
          if (firstLower >= s.Length)
          {
              return s;
          }

          string result = string.FastAllocateString(s.Length);
          fixed (char* pResult = result)
          {
              // Unchanged prefix.
              for (int j = 0; j < firstLower; j++)
              {
                  pResult[j] = pSource[j];
              }

              // This char is known to be 'a'-'z', so the range check can be skipped.
              pResult[firstLower] = (char)(pSource[firstLower] & ~0x20);

              // Remainder, checked char by char.
              for (int k = firstLower + 1; k < s.Length; k++)
              {
                  pResult[k] = ToUpperAsciiInvariant(pSource[k]);
              }
          }

          return result;
      }
  }
  /// <summary>
  /// Writes <paramref name="source"/> to <paramref name="destination"/> with
  /// ASCII letters 'a'-'z' upper-cased and every other char copied unchanged.
  /// </summary>
  internal static void ToUpperAsciiInvariant(ReadOnlySpan<char> source, Span<char> destination)
  {
      Debug.Assert(destination.Length >= source.Length);

      int writeIndex = 0;
      foreach (char c in source)
      {
          destination[writeIndex] = ToUpperAsciiInvariant(c);
          writeIndex++;
      }
  }
  /// <summary>
  /// Lower-cases <paramref name="c"/> when it is an ASCII letter 'A'-'Z';
  /// returns every other char unchanged.
  /// </summary>
  private static char ToLowerAsciiInvariant(char c)
  {
      // A single unsigned comparison covers both ends of the 'A'..'Z' range.
      bool isAsciiUpper = (uint)(c - 'A') <= (uint)('Z' - 'A');
      return isAsciiUpper ? (char)(c | 0x20) : c;
  }
  /// <summary>
  /// Converts a single character to upper case. Uses the ASCII-only
  /// invariant conversion in invariant globalization mode, or when the
  /// character is ASCII and this culture cases ASCII like the invariant
  /// culture; otherwise defers to the culture-aware ChangeCase.
  /// </summary>
  /// <param name="c">The character to convert.</param>
  public virtual char ToUpper(char c)
  {
      bool useAsciiFastPath = GlobalizationMode.Invariant || (IsAscii(c) && IsAsciiCasingSameAsInvariant);

      return useAsciiFastPath
          ? ToUpperAsciiInvariant(c)
          : ChangeCase(c, toUpper: true);
  }
  /// <summary>
  /// Converts a string to upper case. In invariant globalization mode only
  /// ASCII letters are changed; otherwise the shared case-change routine is used.
  /// </summary>
  /// <param name="str">The string to convert.</param>
  /// <exception cref="ArgumentNullException"><paramref name="str"/> is null.</exception>
  public virtual string ToUpper(string str)
  {
      if (str == null)
      {
          throw new ArgumentNullException(nameof(str));
      }

      return GlobalizationMode.Invariant
          ? ToUpperAsciiInvariant(str)
          : ChangeCaseCommon<ToUpperConversion>(str);
  }
  /// <summary>
  /// Upper-cases <paramref name="c"/> when it is an ASCII letter 'a'-'z';
  /// returns every other char unchanged.
  /// </summary>
  internal static char ToUpperAsciiInvariant(char c)
  {
      // A single unsigned comparison covers both ends of the 'a'..'z' range.
      bool isAsciiLower = (uint)(c - 'a') <= (uint)('z' - 'a');
      return isAsciiLower ? (char)(c & ~0x20) : c;
  }
  /// <summary>Returns true when <paramref name="c"/> is in the range U+0000..U+007F.</summary>
  private static bool IsAscii(char c)
  {
      return c <= '\u007F';
  }
  /// <summary>
  /// Gets whether this culture cases ASCII characters like the invariant
  /// culture. The answer is computed on first use and cached in
  /// <c>_isAsciiCasingSameAsInvariant</c>.
  /// </summary>
  private bool IsAsciiCasingSameAsInvariant
  {
      [MethodImpl(MethodImplOptions.AggressiveInlining)]
      get
      {
          if (_isAsciiCasingSameAsInvariant == Tristate.NotInitialized)
          {
              // The computation lives in a separate non-inlined method.
              PopulateIsAsciiCasingSameAsInvariant();
          }

          Tristate state = _isAsciiCasingSameAsInvariant;
          Debug.Assert(state == Tristate.True || state == Tristate.False);
          return state == Tristate.True;
      }
  }
  /// <summary>
  /// Computes and stores the cached ASCII-casing flag: the lower-case and
  /// upper-case ASCII alphabets are compared, ignoring case, with this
  /// text info's culture; the flag is True when they compare equal.
  /// </summary>
  [MethodImpl(MethodImplOptions.NoInlining)]
  private void PopulateIsAsciiCasingSameAsInvariant()
  {
      CompareInfo comparer = CultureInfo.GetCultureInfo(_textInfoName).CompareInfo;
      int comparison = comparer.Compare("abcdefghijklmnopqrstuvwxyz", "ABCDEFGHIJKLMNOPQRSTUVWXYZ", CompareOptions.IgnoreCase);

      _isAsciiCasingSameAsInvariant = (comparison == 0) ? Tristate.True : Tristate.False;
  }
  /// <summary>
  /// Returns true if the dominant direction of text and UI, such as the
  /// relative position of buttons and scroll bars, is right to left.
  /// The value comes from the underlying culture data.
  /// </summary>
  public bool IsRightToLeft
  {
      get { return _cultureData.IsRightToLeft; }
  }
  /// <summary>
  /// Two TextInfo instances are equal when their <see cref="CultureName"/>
  /// values are equal; any other object (or null) is not equal.
  /// </summary>
  public override bool Equals(object? obj)
  {
      if (obj is TextInfo otherTextInfo)
      {
          return CultureName.Equals(otherTextInfo.CultureName);
      }

      return false;
  }
  /// <summary>Returns the hash code of <see cref="CultureName"/>, matching <see cref="Equals(object)"/>.</summary>
  public override int GetHashCode()
  {
      return CultureName.GetHashCode();
  }
  /// <summary>
  /// Returns "TextInfo - " followed by the culture data's culture name
  /// (note: not the text info name exposed by <see cref="CultureName"/>).
  /// </summary>
  public override string ToString() => "TextInfo - " + _cultureData.CultureName;
  /// <summary>
  /// Titlecasing refers to a casing practice wherein the first letter of a word is an uppercase letter
  /// and the rest of the letters are lowercase. The choice of which words to titlecase in headings
  /// and titles is dependent on language and local conventions. For example, "The Merry Wives of Windsor"
  /// is the appropriate titlecasing of that play's name in English, with the word "of" not titlecased.
  /// In German, however, the title is "Die lustigen Weiber von Windsor," and both "lustigen" and "von"
  /// are not titlecased. In French even fewer words are titlecased: "Les joyeuses commeres de Windsor."
  ///
  /// Moreover, the determination of what actually constitutes a word is language dependent, and this can
  /// influence which letter or letters of a "word" are uppercased when titlecasing strings. For example
  /// "l'arbre" is considered two words in French, whereas "can't" is considered one word in English.
  ///
  /// This implementation titlecases the first letter of every word, lowercases the rest of the word
  /// unless the word contains no lowercase letter (so acronyms such as "USA" are kept), and copies
  /// everything else unchanged.
  /// </summary>
  /// <param name="str">The string to convert.</param>
  /// <returns>The titlecased string; an empty input is returned as-is.</returns>
  /// <exception cref="ArgumentNullException"><paramref name="str"/> is null.</exception>
  public string ToTitleCase(string str)
  {
      if (str == null)
      {
          throw new ArgumentNullException(nameof(str));
      }

      if (str.Length == 0)
      {
          return str;
      }

      // Every append below emits exactly as many chars as it consumes from the input,
      // so the result has the input's length; size the builder once up front.
      StringBuilder result = new StringBuilder(str.Length);
      string? lowercaseData = null; // lower-cased copy of the whole input, created on demand

      // Store if the current culture is Dutch (special case)
      bool isDutchCulture = CultureName.StartsWith("nl-", StringComparison.OrdinalIgnoreCase);

      for (int i = 0; i < str.Length; i++)
      {
          int charLen;
          UnicodeCategory charType = CharUnicodeInfo.InternalGetUnicodeCategory(str, i, out charLen);

          if (char.CheckLetter(charType))
          {
              // Special case to check for Dutch specific titlecasing with "IJ" characters
              // at the beginning of a word
              if (isDutchCulture && i < str.Length - 1 && (str[i] == 'i' || str[i] == 'I') && (str[i + 1] == 'j' || str[i + 1] == 'J'))
              {
                  result.Append("IJ");
                  i += 2;
              }
              else
              {
                  // Do the titlecasing for the first character of the word.
                  i = AddTitlecaseLetter(ref result, ref str, i, charLen) + 1;
              }

              // Convert the characters until the end of this word
              // to lowercase.
              int lowercaseStart = i;

              // Use hasLowerCase flag to prevent from lowercasing acronyms (like "URT", "USA", etc)
              // This is in line with Word 2000 behavior of titlecasing.
              bool hasLowerCase = (charType == UnicodeCategory.LowercaseLetter);

              // Use a loop to find all of the other letters following this letter.
              while (i < str.Length)
              {
                  charType = CharUnicodeInfo.InternalGetUnicodeCategory(str, i, out charLen);
                  if (IsLetterCategory(charType))
                  {
                      if (charType == UnicodeCategory.LowercaseLetter)
                      {
                          hasLowerCase = true;
                      }
                      i += charLen;
                  }
                  else if (str[i] == '\'')
                  {
                      // An apostrophe stays inside the word: flush what has been scanned so far
                      // (including the apostrophe) and keep going.
                      i++;
                      if (hasLowerCase)
                      {
                          if (lowercaseData == null)
                          {
                              lowercaseData = ToLower(str);
                          }
                          result.Append(lowercaseData, lowercaseStart, i - lowercaseStart);
                      }
                      else
                      {
                          result.Append(str, lowercaseStart, i - lowercaseStart);
                      }
                      lowercaseStart = i;

                      // Whatever follows the apostrophe is always lowercased.
                      hasLowerCase = true;
                  }
                  else if (!IsWordSeparator(charType))
                  {
                      // This category is considered to be part of the word.
                      // This is any category that is marked as false in the word separator mask.
                      i += charLen;
                  }
                  else
                  {
                      // A word separator. Break out of the loop.
                      break;
                  }
              }

              // Flush the rest of the word: lowercased if the word had a lowercase letter,
              // verbatim otherwise.
              int count = i - lowercaseStart;
              if (count > 0)
              {
                  if (hasLowerCase)
                  {
                      if (lowercaseData == null)
                      {
                          lowercaseData = ToLower(str);
                      }
                      result.Append(lowercaseData, lowercaseStart, count);
                  }
                  else
                  {
                      result.Append(str, lowercaseStart, count);
                  }
              }

              if (i < str.Length)
              {
                  // not a letter, just append it
                  i = AddNonLetter(ref result, ref str, i, charLen);
              }
          }
          else
          {
              // not a letter, just append it
              i = AddNonLetter(ref result, ref str, i, charLen);
          }
      }

      return result.ToString();
  }
  /// <summary>
  /// Appends the non-letter character at <paramref name="inputIndex"/> to
  /// <paramref name="result"/> unchanged: one char, or two for a surrogate pair.
  /// </summary>
  /// <param name="charLen">Number of UTF-16 code units of the character (1 or 2).</param>
  /// <returns>The index of the last char that was appended.</returns>
  private static int AddNonLetter(ref StringBuilder result, ref string input, int inputIndex, int charLen)
  {
      Debug.Assert(charLen == 1 || charLen == 2, "[TextInfo.AddNonLetter] CharUnicodeInfo.InternalGetUnicodeCategory returned an unexpected charLen!");

      // First (or only) code unit.
      result.Append(input[inputIndex]);

      if (charLen == 2)
      {
          // Surrogate pair: also append the second code unit.
          inputIndex++;
          result.Append(input[inputIndex]);
      }

      return inputIndex;
  }
  /// <summary>
  /// Appends the titlecase form of the letter at <paramref name="inputIndex"/>
  /// to <paramref name="result"/>. Surrogate pairs are upper-cased as a unit
  /// (or copied as-is in invariant globalization mode); the DZ/LJ/NJ digraph
  /// letters map to their dedicated titlecase forms; every other letter is
  /// upper-cased with <see cref="ToUpper(char)"/>.
  /// </summary>
  /// <param name="charLen">Number of UTF-16 code units of the letter (1 or 2).</param>
  /// <returns>The index of the last input char that was consumed.</returns>
  private int AddTitlecaseLetter(ref StringBuilder result, ref string input, int inputIndex, int charLen)
  {
      Debug.Assert(charLen == 1 || charLen == 2, "[TextInfo.AddTitlecaseLetter] CharUnicodeInfo.InternalGetUnicodeCategory returned an unexpected charLen!");

      if (charLen == 2)
      {
          // for surrogate pairs do a ToUpper operation on the substring
          ReadOnlySpan<char> src = input.AsSpan(inputIndex, 2);
          if (GlobalizationMode.Invariant)
          {
              result.Append(src); // surrogate pair in invariant mode, so changing case is a nop
          }
          else
          {
              Span<char> dst = stackalloc char[2];
              ChangeCaseToUpper(src, dst);
              result.Append(dst);
          }

          // Both code units of the pair were consumed.
          return inputIndex + 1;
      }

      // For AppCompat, the Titlecase Case Mapping data from NDP 2.0 is used below.
      // Each digraph occupies three consecutive code points (upper, title, lower),
      // and all three map to the middle one.
      char letter = input[inputIndex];
      char titlecased;
      if (letter >= '\u01C4' && letter <= '\u01C6')
      {
          titlecased = '\u01C5'; // DZ / Dz / dz with Caron -> Dz with Caron
      }
      else if (letter >= '\u01C7' && letter <= '\u01C9')
      {
          titlecased = '\u01C8'; // LJ / Lj / lj -> Lj
      }
      else if (letter >= '\u01CA' && letter <= '\u01CC')
      {
          titlecased = '\u01CB'; // NJ / Nj / nj -> Nj
      }
      else if (letter >= '\u01F1' && letter <= '\u01F3')
      {
          titlecased = '\u01F2'; // DZ / Dz / dz -> Dz
      }
      else
      {
          titlecased = ToUpper(letter);
      }

      result.Append(titlecased);
      return inputIndex;
  }
  // Used in ToTitleCase():
  // When we find a starting letter, this bit mask decides if a category should be
  // considered a word separator or not. Bit N is set when the UnicodeCategory with
  // numeric value N is a separator.
  //
  // Not separators (bit clear): UppercaseLetter, LowercaseLetter, TitlecaseLetter,
  // ModifierLetter, OtherLetter, NonSpacingMark, SpacingCombiningMark, EnclosingMark,
  // DecimalDigitNumber, LetterNumber, OtherNumber, Surrogate, PrivateUse, OtherNotAssigned.
  private const int c_wordSeparatorMask =
      (1 << (int)UnicodeCategory.SpaceSeparator) |          // 11
      (1 << (int)UnicodeCategory.LineSeparator) |           // 12
      (1 << (int)UnicodeCategory.ParagraphSeparator) |      // 13
      (1 << (int)UnicodeCategory.Control) |                 // 14
      (1 << (int)UnicodeCategory.Format) |                  // 15
      (1 << (int)UnicodeCategory.ConnectorPunctuation) |    // 18
      (1 << (int)UnicodeCategory.DashPunctuation) |         // 19
      (1 << (int)UnicodeCategory.OpenPunctuation) |         // 20
      (1 << (int)UnicodeCategory.ClosePunctuation) |        // 21
      (1 << (int)UnicodeCategory.InitialQuotePunctuation) | // 22
      (1 << (int)UnicodeCategory.FinalQuotePunctuation) |   // 23
      (1 << (int)UnicodeCategory.OtherPunctuation) |        // 24
      (1 << (int)UnicodeCategory.MathSymbol) |              // 25
      (1 << (int)UnicodeCategory.CurrencySymbol) |          // 26
      (1 << (int)UnicodeCategory.ModifierSymbol) |          // 27
      (1 << (int)UnicodeCategory.OtherSymbol);              // 28
  /// <summary>
  /// Returns true when <paramref name="category"/> has its bit set in
  /// <c>c_wordSeparatorMask</c>, i.e. ends the current word in ToTitleCase.
  /// </summary>
  private static bool IsWordSeparator(UnicodeCategory category)
  {
      int categoryBit = (c_wordSeparatorMask >> (int)category) & 1;
      return categoryBit != 0;
  }
  /// <summary>
  /// Returns true for the five letter categories: uppercase, lowercase,
  /// titlecase, modifier and other letter.
  /// </summary>
  private static bool IsLetterCategory(UnicodeCategory uc)
  {
      switch (uc)
      {
          case UnicodeCategory.UppercaseLetter:
          case UnicodeCategory.LowercaseLetter:
          case UnicodeCategory.TitlecaseLetter:
          case UnicodeCategory.ModifierLetter:
          case UnicodeCategory.OtherLetter:
              return true;

          default:
              return false;
      }
  }
  /// <summary>
  /// Empty marker type: passed as a generic argument to select the
  /// upper-casing variant of ChangeCaseCommon.
  /// </summary>
  private readonly struct ToUpperConversion
  {
  }

  /// <summary>
  /// Empty marker type: passed as a generic argument to select the
  /// lower-casing variant of ChangeCaseCommon.
  /// </summary>
  private readonly struct ToLowerConversion
  {
  }
  748. }
  749. }