| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380 |
- // Licensed to the .NET Foundation under one or more agreements.
- // The .NET Foundation licenses this file to you under the MIT license.
- // See the LICENSE file in the project root for more information.
- ////////////////////////////////////////////////////////////////////////////
- //
- //
- // Purpose: This class implements a set of methods for retrieving
- // character type information. Character type information is
- // independent of culture and region.
- //
- //
- ////////////////////////////////////////////////////////////////////////////
- using System.Buffers.Binary;
- using System.Diagnostics;
- using System.Text;
- using Internal.Runtime.CompilerServices;
- namespace System.Globalization
- {
- public static partial class CharUnicodeInfo
- {
- //--------------------------------------------------------------------//
- // Internal Information //
- //--------------------------------------------------------------------//
- //
- // Native methods to access the Unicode category data tables in charinfo.nlp.
- //
// Boundaries of the UTF-16 high (lead) and low (trail) surrogate code unit ranges.
internal const char HIGH_SURROGATE_START = '\ud800';
internal const char HIGH_SURROGATE_END = '\udbff';
internal const char LOW_SURROGATE_START = '\udc00';
internal const char LOW_SURROGATE_END = '\udfff';

// Largest zero-based offset inside the high surrogate range (0x3FF). The low surrogate
// range has the same width, so the conversion helpers use this bound for both halves.
internal const int HIGH_SURROGATE_RANGE = HIGH_SURROGATE_END - HIGH_SURROGATE_START;

// Byte selectors passed to InternalGetCategoryValue: each category entry is two bytes wide
// and the offset picks which of the two is returned.
internal const int UNICODE_CATEGORY_OFFSET = 0;
internal const int BIDI_CATEGORY_OFFSET = 1;

// The first code point of Unicode plane 1. Plane 1 contains 0x010000 ~ 0x01ffff.
internal const int UNICODE_PLANE01_START = 0x10000;
//
// Converts the BMP character or surrogate pair that starts at s[index] to a UTF-32 value.
//
// This is similar to char.ConvertToUtf32, except that it never throws for malformed
// surrogate sequences: when s[index] is not a high surrogate immediately followed by a
// low surrogate, the single code unit s[index] is returned unchanged.
//
// WARNING: because of that, the result CAN be a value in the surrogate range D800-DFFF,
// which is not a legal Unicode scalar value.
//
internal static int InternalConvertToUtf32(string s, int index)
{
    Debug.Assert(s != null, "s != null");
    Debug.Assert(index >= 0 && index < s.Length, "index < s.Length");

    char first = s[index];

    // A pair needs a following code unit, so the last position can never start one.
    if (index < s.Length - 1)
    {
        // The unsigned comparisons check "0 <= offset <= range" in a single test.
        int highOffset = first - HIGH_SURROGATE_START;
        if ((uint)highOffset <= HIGH_SURROGATE_RANGE)
        {
            int lowOffset = s[index + 1] - LOW_SURROGATE_START;
            if ((uint)lowOffset <= HIGH_SURROGATE_RANGE)
            {
                // Combine both halves into the supplementary-plane code point.
                return (highOffset * 0x400) + lowOffset + UNICODE_PLANE01_START;
            }
        }
    }

    return first;
}
//
// StringBuilder flavor of InternalConvertToUtf32: converts the BMP character or surrogate
// pair that starts at s[index] to a UTF-32 value without throwing for malformed surrogates.
// A code unit that does not start a valid pair is returned as-is, so the result may lie in
// the surrogate range D800-DFFF.
//
internal static int InternalConvertToUtf32(StringBuilder s, int index)
{
    Debug.Assert(s != null, "s != null");
    Debug.Assert(index >= 0 && index < s.Length, "index < s.Length");

    int codeUnit = (int)s[index];

    // Only look for a trailing low surrogate when there is a next code unit to inspect.
    if (index < s.Length - 1)
    {
        int highOffset = codeUnit - HIGH_SURROGATE_START;
        if ((uint)highOffset <= HIGH_SURROGATE_RANGE)
        {
            int lowOffset = (int)s[index + 1] - LOW_SURROGATE_START;
            if ((uint)lowOffset <= HIGH_SURROGATE_RANGE)
            {
                // Combine both halves into the supplementary-plane code point.
                return (highOffset * 0x400) + lowOffset + UNICODE_PLANE01_START;
            }
        }
    }

    return codeUnit;
}
//
// Converts the character or surrogate pair starting at s[index] to a UTF-32 value and
// reports how many UTF-16 code units were consumed.
//
// Parameters:
//      s           The string.
//      index       The starting index. It can point to a BMP character or to the first
//                  half of a surrogate pair.
//      charLength  [out] 1 when a single code unit was converted, 2 when a valid
//                  high/low surrogate pair was converted.
//
// Returns:
//      The UTF-32 value.
//
// WARNING: this method does not throw for malformed surrogates, so it CAN return a value
// in the surrogate range D800-DFFF, which is not a legal Unicode scalar value.
//
internal static int InternalConvertToUtf32(string s, int index, out int charLength)
{
    Debug.Assert(s != null, "s != null");
    Debug.Assert(s.Length > 0, "s.Length > 0");
    Debug.Assert(index >= 0 && index < s.Length, "index >= 0 && index < s.Length");

    // Assume a single code unit until a complete surrogate pair is recognized.
    charLength = 1;

    if (index < s.Length - 1)
    {
        int highOffset = (int)s[index] - HIGH_SURROGATE_START;
        if ((uint)highOffset <= HIGH_SURROGATE_RANGE)
        {
            int lowOffset = (int)s[index + 1] - LOW_SURROGATE_START;
            if ((uint)lowOffset <= HIGH_SURROGATE_RANGE)
            {
                // Both halves are valid: the pair occupies two code units.
                charLength = 2;
                return (highOffset * 0x400) + lowOffset + UNICODE_PLANE01_START;
            }
        }
    }

    return (int)s[index];
}
//
// Looks up the numeric value of the code point ch through the three-level numeric index
// tables. This is called by the public char and (string, index) overloads.
//
// Returns -1 when the upper bits of ch fall outside the level 1 table.
// Note that for ch in the range D800-DFFF we just treat it as any other non-numeric character.
//
internal static double InternalGetNumericValue(int ch)
{
    Debug.Assert(ch >= 0 && ch <= 0x10ffff, "ch is not in valid Unicode range.");

    // Level 1 is indexed by everything above the low 8 bits of ch.
    int level1 = ch >> 8;
    if ((uint)level1 >= (uint)NumericLevel1Index.Length)
    {
        return -1;
    }

    int level2Block = NumericLevel1Index[level1];

    // Bits 4 - 7 of ch select the entry inside the level 2 block; that entry is the base of the level 3 block.
    int level3Block = NumericLevel2Index[(level2Block << 4) + ((ch >> 4) & 0x000f)];

    // Bits 0 - 3 of ch select the entry inside the level 3 block.
    int valueIndex = NumericLevel3Index[(level3Block << 4) + (ch & 0x000f)];

    // Each numeric value occupies 8 bytes; on a big-endian machine the bytes are reversed before
    // being reinterpreted as a double.
    ref byte rawValue = ref Unsafe.AsRef(in NumericValues[valueIndex * 8]);
    return BitConverter.IsLittleEndian
        ? Unsafe.ReadUnaligned<double>(ref rawValue)
        : BitConverter.Int64BitsToDouble(BinaryPrimitives.ReverseEndianness(Unsafe.ReadUnaligned<long>(ref rawValue)));
}
//
// Looks up one of the two digit-value bytes stored for the code point ch. The public
// wrappers pass offset 0 for the decimal digit value and offset 1 for the digit value.
//
// Returns 0xff when the upper bits of ch fall outside the level 1 table.
//
internal static byte InternalGetDigitValues(int ch, int offset)
{
    Debug.Assert(ch >= 0 && ch <= 0x10ffff, "ch is not in valid Unicode range.");

    // Level 1 is indexed by everything above the low 8 bits of ch.
    int level1 = ch >> 8;
    if ((uint)level1 >= (uint)NumericLevel1Index.Length)
    {
        return 0xff;
    }

    int level2Block = NumericLevel1Index[level1];

    // Bits 4 - 7 of ch select the entry inside the level 2 block; that entry is the base of the level 3 block.
    int level3Block = NumericLevel2Index[(level2Block << 4) + ((ch >> 4) & 0x000f)];

    // Bits 0 - 3 of ch select the entry inside the level 3 block.
    int entry = NumericLevel3Index[(level3Block << 4) + (ch & 0x000f)];

    // Every entry owns two consecutive bytes in DigitValues; offset chooses between them.
    return DigitValues[(entry * 2) + offset];
}
//
// Returns the numeric value associated with the character ch. If the character is a
// fraction, the return value will not be an integer. If the character does not have a
// numeric value, the return value is -1.
//
// Arguments:
//      ch  a Unicode character
//
// This overload performs no argument validation of its own. The (string, int) overload
// throws ArgumentNullException / ArgumentOutOfRangeException for invalid arguments.
//
public static double GetNumericValue(char ch) => InternalGetNumericValue(ch);
//
// Returns the numeric value of the character (or surrogate pair) that starts at s[index].
//
// Exceptions:
//      ArgumentNullException        s is null.
//      ArgumentOutOfRangeException  index is negative or not less than s.Length.
//
public static double GetNumericValue(string s, int index)
{
    if (s is null)
    {
        throw new ArgumentNullException(nameof(s));
    }

    // One unsigned comparison rejects both negative and too-large indexes.
    if ((uint)index >= (uint)s.Length)
    {
        throw new ArgumentOutOfRangeException(nameof(index), SR.ArgumentOutOfRange_Index);
    }

    int codePoint = InternalConvertToUtf32(s, index);
    return InternalGetNumericValue(codePoint);
}
//
// Returns the decimal digit value stored for ch (digit-value byte 0). The table byte is
// reinterpreted as a signed byte, so the 0xff "no value" marker comes back as -1.
//
public static int GetDecimalDigitValue(char ch) => (sbyte)InternalGetDigitValues(ch, 0);
//
// Returns the decimal digit value (digit-value byte 0) of the character or surrogate pair
// that starts at s[index]. The 0xff "no value" marker is returned as -1.
//
// Exceptions:
//      ArgumentNullException        s is null.
//      ArgumentOutOfRangeException  index is negative or not less than s.Length.
//
public static int GetDecimalDigitValue(string s, int index)
{
    if (s is null)
    {
        throw new ArgumentNullException(nameof(s));
    }

    // One unsigned comparison rejects both negative and too-large indexes.
    if ((uint)index >= (uint)s.Length)
    {
        throw new ArgumentOutOfRangeException(nameof(index), SR.ArgumentOutOfRange_Index);
    }

    int codePoint = InternalConvertToUtf32(s, index);
    return (sbyte)InternalGetDigitValues(codePoint, 0);
}
//
// Returns the digit value stored for ch (digit-value byte 1). The table byte is
// reinterpreted as a signed byte, so the 0xff "no value" marker comes back as -1.
//
public static int GetDigitValue(char ch) => (sbyte)InternalGetDigitValues(ch, 1);
//
// Returns the digit value (digit-value byte 1) of the character or surrogate pair that
// starts at s[index]. The 0xff "no value" marker is returned as -1.
//
// Exceptions:
//      ArgumentNullException        s is null.
//      ArgumentOutOfRangeException  index is negative or not less than s.Length.
//
public static int GetDigitValue(string s, int index)
{
    if (s is null)
    {
        throw new ArgumentNullException(nameof(s));
    }

    // One unsigned comparison rejects both negative and too-large indexes.
    if ((uint)index >= (uint)s.Length)
    {
        throw new ArgumentOutOfRangeException(nameof(index), SR.ArgumentOutOfRange_Index);
    }

    int codePoint = InternalConvertToUtf32(s, index);
    return (sbyte)InternalGetDigitValues(codePoint, 1);
}
//
// Returns the Unicode category of the UTF-16 code unit ch.
//
public static UnicodeCategory GetUnicodeCategory(char ch)
{
    // Widen to int explicitly so the call binds to the code point overload
    // instead of recursing into this one.
    int codePoint = ch;
    return GetUnicodeCategory(codePoint);
}
//
// Returns the Unicode category of the character (or surrogate pair) that starts at s[index].
//
// Exceptions:
//      ArgumentNullException        s is null.
//      ArgumentOutOfRangeException  index is negative or not less than s.Length.
//
public static UnicodeCategory GetUnicodeCategory(string s, int index)
{
    if (s == null)
    {
        throw new ArgumentNullException(nameof(s));
    }

    // One unsigned comparison rejects both negative and too-large indexes.
    if ((uint)index >= (uint)s.Length)
    {
        // Carry the same descriptive message as the other (string, int) overloads in this class.
        throw new ArgumentOutOfRangeException(nameof(index), SR.ArgumentOutOfRange_Index);
    }

    return InternalGetUnicodeCategory(s, index);
}
//
// Returns the Unicode category of the given code point.
//
// No range check is performed here; the lookup helper only Debug.Asserts that
// codePoint lies within 0 - 0x10ffff.
//
public static UnicodeCategory GetUnicodeCategory(int codePoint)
{
    byte categoryValue = InternalGetCategoryValue(codePoint, UNICODE_CATEGORY_OFFSET);
    return (UnicodeCategory)categoryValue;
}
//
// Action: Looks up one byte of category data for the code point ch through the
//         three-level category index tables.
// Returns:
//      The byte selected by offset from the two-byte entry found for ch. Callers in this
//      file pass UNICODE_CATEGORY_OFFSET or BIDI_CATEGORY_OFFSET and cast the result to
//      UnicodeCategory or BidiCategory respectively.
// Arguments:
//      ch      a Unicode code point
//      offset  which byte of the two-byte entry to return
// Exceptions:
//      None thrown explicitly; the range of ch is only checked by a Debug.Assert.
//
// Note that this lookup accepts the D800-DFFF surrogate halves like any other code point.
//
internal static byte InternalGetCategoryValue(int ch, int offset)
{
    Debug.Assert(ch >= 0 && ch <= 0x10ffff, "ch is not in valid Unicode range.");

    // Level 1 is indexed by everything above the low 9 bits of ch.
    int level1Entry = CategoryLevel1Index[ch >> 9];

    // Level 2 holds 16-bit entries, so the next 5 bits of ch (bits 4 - 8) are kept pre-multiplied
    // by two: (ch >> 3) & 0b111110. Each level 1 entry therefore addresses a 64-byte block.
    // Note that & has lower precedence than addition, hence the parentheses.
    int level2Offset = (level1Entry << 6) + ((ch >> 3) & 0b111110);
    ushort level2Entry = Unsafe.ReadUnaligned<ushort>(ref Unsafe.AsRef(in CategoryLevel2Index[level2Offset]));
    if (!BitConverter.IsLittleEndian)
    {
        // On a big-endian machine the two bytes just read must be swapped.
        level2Entry = BinaryPrimitives.ReverseEndianness(level2Entry);
    }

    // Bits 0 - 3 of ch select the entry inside the level 3 block.
    int level3Entry = CategoryLevel3Index[(level2Entry << 4) + (ch & 0x000f)];

    // Every entry owns two consecutive bytes in CategoriesValue; offset chooses between them.
    return CategoriesValue[(level3Entry * 2) + offset];
}
//
// Action: Returns the Unicode category of the character (or surrogate pair) that starts
//         at value[index].
// Returns:
//      a value in the UnicodeCategory enum
// Arguments:
//      value   a Unicode string
//      index   index into the specified string
// Exceptions:
//      None thrown explicitly; the arguments are only checked by Debug.Assert.
//
internal static UnicodeCategory InternalGetUnicodeCategory(string value, int index)
{
    Debug.Assert(value != null, "value can not be null");
    Debug.Assert(index < value.Length, "index < value.Length");

    int codePoint = InternalConvertToUtf32(value, index);
    return GetUnicodeCategory(codePoint);
}
//
// Returns the bidi category of the character (or surrogate pair) that starts at s[index].
//
// Exceptions:
//      ArgumentNullException        s is null.
//      ArgumentOutOfRangeException  index is negative or not less than s.Length.
//
internal static BidiCategory GetBidiCategory(string s, int index)
{
    if (s == null)
    {
        throw new ArgumentNullException(nameof(s));
    }

    // One unsigned comparison rejects both negative and too-large indexes.
    if ((uint)index >= (uint)s.Length)
    {
        // Carry the same descriptive message as the other (string, int) overloads in this class.
        throw new ArgumentOutOfRangeException(nameof(index), SR.ArgumentOutOfRange_Index);
    }

    return (BidiCategory)InternalGetCategoryValue(InternalConvertToUtf32(s, index), BIDI_CATEGORY_OFFSET);
}
//
// Returns the bidi category of the character (or surrogate pair) that starts at s[index].
// The arguments are only checked by Debug.Assert.
//
internal static BidiCategory GetBidiCategory(StringBuilder s, int index)
{
    Debug.Assert(s != null, "s can not be null");
    Debug.Assert(index >= 0 && index < s.Length, "invalid index");

    int codePoint = InternalConvertToUtf32(s, index);
    byte bidiValue = InternalGetCategoryValue(codePoint, BIDI_CATEGORY_OFFSET);
    return (BidiCategory)bidiValue;
}
//
// Gets the Unicode category of the character starting at str[index] and reports its length
// in UTF-16 code units: charLength is 1 for a single code unit and 2 when index points at a
// valid surrogate pair.
//
internal static UnicodeCategory InternalGetUnicodeCategory(string str, int index, out int charLength)
{
    Debug.Assert(str != null, "str can not be null");
    Debug.Assert(str.Length > 0, "str.Length > 0");
    Debug.Assert(index >= 0 && index < str.Length, "index >= 0 && index < str.Length");

    int codePoint = InternalConvertToUtf32(str, index, out charLength);
    return GetUnicodeCategory(codePoint);
}
//
// Returns true when uc is one of the three mark categories: NonSpacingMark,
// SpacingCombiningMark or EnclosingMark.
//
internal static bool IsCombiningCategory(UnicodeCategory uc)
{
    Debug.Assert(uc >= 0, "uc >= 0");

    switch (uc)
    {
        case UnicodeCategory.NonSpacingMark:
        case UnicodeCategory.SpacingCombiningMark:
        case UnicodeCategory.EnclosingMark:
            return true;

        default:
            return false;
    }
}
- }
- }
|