| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891 |
- // Licensed to the .NET Foundation under one or more agreements.
- // The .NET Foundation licenses this file to you under the MIT license.
- // See the LICENSE file in the project root for more information.
- // This file contains the IDN functions and implementation.
- //
- // This allows encoding of non-ASCII domain names in a "punycode" form,
- // for example:
- //
- // \u5B89\u5BA4\u5948\u7F8E\u6075-with-SUPER-MONKEYS
- //
- // is encoded as:
- //
- // xn---with-SUPER-MONKEYS-pc58ag80a8qai00g7n9n
- //
- // Additional options are provided to allow unassigned IDN characters and
- // to validate according to the Std3ASCII Rules (like DNS names).
- //
- // There are also rules regarding bidirectionality of text and the length
- // of segments.
- //
- // For additional rules see also:
- // RFC 3490 - Internationalizing Domain Names in Applications (IDNA)
- // RFC 3491 - Nameprep: A Stringprep Profile for Internationalized Domain Names (IDN)
- // RFC 3492 - Punycode: A Bootstring encoding of Unicode for Internationalized Domain Names in Applications (IDNA)
- using System.Diagnostics;
- using System.Runtime.CompilerServices;
- using System.Text;
- namespace System.Globalization
- {
- // IdnMapping class used to map names to Punycode
- public sealed partial class IdnMapping
- {
- private bool _allowUnassigned;
- private bool _useStd3AsciiRules;
- public IdnMapping()
- {
- }
- public bool AllowUnassigned
- {
- get { return _allowUnassigned; }
- set { _allowUnassigned = value; }
- }
- public bool UseStd3AsciiRules
- {
- get { return _useStd3AsciiRules; }
- set { _useStd3AsciiRules = value; }
- }
- // Gets ASCII (Punycode) version of the string
- public string GetAscii(string unicode)
- {
- return GetAscii(unicode, 0);
- }
- public string GetAscii(string unicode, int index)
- {
- if (unicode == null)
- throw new ArgumentNullException(nameof(unicode));
- return GetAscii(unicode, index, unicode.Length - index);
- }
- public string GetAscii(string unicode, int index, int count)
- {
- if (unicode == null)
- throw new ArgumentNullException(nameof(unicode));
- if (index < 0 || count < 0)
- throw new ArgumentOutOfRangeException((index < 0) ? nameof(index) : nameof(count), SR.ArgumentOutOfRange_NeedNonNegNum);
- if (index > unicode.Length)
- throw new ArgumentOutOfRangeException(nameof(index), SR.ArgumentOutOfRange_Index);
- if (index > unicode.Length - count)
- throw new ArgumentOutOfRangeException(nameof(unicode), SR.ArgumentOutOfRange_IndexCountBuffer);
- if (count == 0)
- {
- throw new ArgumentException(SR.Argument_IdnBadLabelSize, nameof(unicode));
- }
- if (unicode[index + count - 1] == 0)
- {
- throw new ArgumentException(SR.Format(SR.Argument_InvalidCharSequence, index + count - 1), nameof(unicode));
- }
- if (GlobalizationMode.Invariant)
- {
- return GetAsciiInvariant(unicode, index, count);
- }
- unsafe
- {
- fixed (char* pUnicode = unicode)
- {
- return GetAsciiCore(unicode, pUnicode + index, count);
- }
- }
- }
- // Gets Unicode version of the string. Normalized and limited to IDNA characters.
- public string GetUnicode(string ascii)
- {
- return GetUnicode(ascii, 0);
- }
- public string GetUnicode(string ascii, int index)
- {
- if (ascii == null)
- throw new ArgumentNullException(nameof(ascii));
- return GetUnicode(ascii, index, ascii.Length - index);
- }
- public string GetUnicode(string ascii, int index, int count)
- {
- if (ascii == null)
- throw new ArgumentNullException(nameof(ascii));
- if (index < 0 || count < 0)
- throw new ArgumentOutOfRangeException((index < 0) ? nameof(index) : nameof(count), SR.ArgumentOutOfRange_NeedNonNegNum);
- if (index > ascii.Length)
- throw new ArgumentOutOfRangeException(nameof(index), SR.ArgumentOutOfRange_Index);
- if (index > ascii.Length - count)
- throw new ArgumentOutOfRangeException(nameof(ascii), SR.ArgumentOutOfRange_IndexCountBuffer);
- // This is a case (i.e. explicitly null-terminated input) where behavior in .NET and Win32 intentionally differ.
- // The .NET APIs should (and did in v4.0 and earlier) throw an ArgumentException on input that includes a terminating null.
- // The Win32 APIs fail on an embedded null, but not on a terminating null.
- if (count > 0 && ascii[index + count - 1] == (char)0)
- throw new ArgumentException(SR.Argument_IdnBadPunycode, nameof(ascii));
- if (GlobalizationMode.Invariant)
- {
- return GetUnicodeInvariant(ascii, index, count);
- }
- unsafe
- {
- fixed (char* pAscii = ascii)
- {
- return GetUnicodeCore(ascii, pAscii + index, count);
- }
- }
- }
- public override bool Equals(object obj)
- {
- return
- obj is IdnMapping that &&
- _allowUnassigned == that._allowUnassigned &&
- _useStd3AsciiRules == that._useStd3AsciiRules;
- }
- public override int GetHashCode()
- {
- return (_allowUnassigned ? 100 : 200) + (_useStd3AsciiRules ? 1000 : 2000);
- }
- [MethodImpl(MethodImplOptions.AggressiveInlining)]
- private static unsafe string GetStringForOutput(string originalString, char* input, int inputLength, char* output, int outputLength)
- {
- return originalString.Length == inputLength && new ReadOnlySpan<char>(input, inputLength).SequenceEqual(new ReadOnlySpan<char>(output, outputLength)) ?
- originalString :
- new string(output, 0, outputLength);
- }
- //
- // Invariant implementation
- //
- private const char c_delimiter = '-';
- private const string c_strAcePrefix = "xn--";
- private const int c_labelLimit = 63; // Not including dots
- private const int c_defaultNameLimit = 255; // Including dots
- private const int c_initialN = 0x80;
- private const int c_maxint = 0x7ffffff;
- private const int c_initialBias = 72;
- private const int c_punycodeBase = 36;
- private const int c_tmin = 1;
- private const int c_tmax = 26;
- private const int c_skew = 38;
- private const int c_damp = 700;
- // Legal "dot" separators (i.e: . in www.microsoft.com)
- private static char[] c_Dots = { '.', '\u3002', '\uFF0E', '\uFF61' };
- private string GetAsciiInvariant(string unicode, int index, int count)
- {
- if (index > 0 || count < unicode.Length)
- {
- unicode = unicode.Substring(index, count);
- }
- // Check for ASCII only string, which will be unchanged
- if (ValidateStd3AndAscii(unicode, UseStd3AsciiRules, true))
- {
- return unicode;
- }
- // Cannot be null terminated (normalization won't help us with this one, and
- // may have returned false before checking the whole string above)
- Debug.Assert(count >= 1, "[IdnMapping.GetAscii] Expected 0 length strings to fail before now.");
- if (unicode[unicode.Length - 1] <= 0x1f)
- {
- throw new ArgumentException(SR.Format(SR.Argument_InvalidCharSequence, unicode.Length - 1), nameof(unicode));
- }
- // May need to check Std3 rules again for non-ascii
- if (UseStd3AsciiRules)
- {
- ValidateStd3AndAscii(unicode, true, false);
- }
- // Go ahead and encode it
- return PunycodeEncode(unicode);
- }
- // See if we're only ASCII
- static bool ValidateStd3AndAscii(string unicode, bool bUseStd3, bool bCheckAscii)
- {
- // If its empty, then its too small
- if (unicode.Length == 0)
- throw new ArgumentException(SR.Argument_IdnBadLabelSize, nameof(unicode));
- int iLastDot = -1;
- // Loop the whole string
- for (int i = 0; i < unicode.Length; i++)
- {
- // Aren't allowing control chars (or 7f, but idn tables catch that, they don't catch \0 at end though)
- if (unicode[i] <= 0x1f)
- {
- throw new ArgumentException(SR.Format(SR.Argument_InvalidCharSequence, i ), nameof(unicode));
- }
- // If its Unicode or a control character, return false (non-ascii)
- if (bCheckAscii && unicode[i] >= 0x7f)
- return false;
- // Check for dots
- if (IsDot(unicode[i]))
- {
- // Can't have 2 dots in a row
- if (i == iLastDot + 1)
- throw new ArgumentException(SR.Argument_IdnBadLabelSize, nameof(unicode));
- // If its too far between dots then fail
- if (i - iLastDot > c_labelLimit + 1)
- throw new ArgumentException(SR.Argument_IdnBadLabelSize, nameof(unicode));
- // If validating Std3, then char before dot can't be - char
- if (bUseStd3 && i > 0)
- ValidateStd3(unicode[i - 1], true);
- // Remember where the last dot is
- iLastDot = i;
- continue;
- }
- // If necessary, make sure its a valid std3 character
- if (bUseStd3)
- {
- ValidateStd3(unicode[i], (i == iLastDot + 1));
- }
- }
- // If we never had a dot, then we need to be shorter than the label limit
- if (iLastDot == -1 && unicode.Length > c_labelLimit)
- throw new ArgumentException(SR.Argument_IdnBadLabelSize, nameof(unicode));
- // Need to validate entire string length, 1 shorter if last char wasn't a dot
- if (unicode.Length > c_defaultNameLimit - (IsDot(unicode[unicode.Length - 1]) ? 0 : 1))
- throw new ArgumentException(SR.Format(SR.Argument_IdnBadNameSize,
- c_defaultNameLimit - (IsDot(unicode[unicode.Length - 1]) ? 0 : 1)), nameof(unicode));
- // If last char wasn't a dot we need to check for trailing -
- if (bUseStd3 && !IsDot(unicode[unicode.Length - 1]))
- ValidateStd3(unicode[unicode.Length - 1], true);
- return true;
- }
- /* PunycodeEncode() converts Unicode to Punycode. The input */
- /* is represented as an array of Unicode code points (not code */
- /* units; surrogate pairs are not allowed), and the output */
- /* will be represented as an array of ASCII code points. The */
- /* output string is *not* null-terminated; it will contain */
- /* zeros if and only if the input contains zeros. (Of course */
- /* the caller can leave room for a terminator and add one if */
- /* needed.) The input_length is the number of code points in */
- /* the input. The output_length is an in/out argument: the */
- /* caller passes in the maximum number of code points that it */
- /* can receive, and on successful return it will contain the */
- /* number of code points actually output. The case_flags array */
- /* holds input_length boolean values, where nonzero suggests that */
- /* the corresponding Unicode character be forced to uppercase */
- /* after being decoded (if possible), and zero suggests that */
- /* it be forced to lowercase (if possible). ASCII code points */
- /* are encoded literally, except that ASCII letters are forced */
- /* to uppercase or lowercase according to the corresponding */
- /* uppercase flags. If case_flags is a null pointer then ASCII */
- /* letters are left as they are, and other code points are */
- /* treated as if their uppercase flags were zero. The return */
- /* value can be any of the punycode_status values defined above */
- /* except punycode_bad_input; if not punycode_success, then */
- /* output_size and output might contain garbage. */
- static string PunycodeEncode(string unicode)
- {
- // 0 length strings aren't allowed
- if (unicode.Length == 0)
- throw new ArgumentException(SR.Argument_IdnBadLabelSize, nameof(unicode));
- StringBuilder output = new StringBuilder(unicode.Length);
- int iNextDot = 0;
- int iAfterLastDot = 0;
- int iOutputAfterLastDot = 0;
- // Find the next dot
- while (iNextDot < unicode.Length)
- {
- // Find end of this segment
- iNextDot = unicode.IndexOfAny(c_Dots, iAfterLastDot);
- Debug.Assert(iNextDot <= unicode.Length, "[IdnMapping.punycode_encode]IndexOfAny is broken");
- if (iNextDot < 0)
- iNextDot = unicode.Length;
- // Only allowed to have empty . section at end (www.microsoft.com.)
- if (iNextDot == iAfterLastDot)
- {
- // Only allowed to have empty sections as trailing .
- if (iNextDot != unicode.Length)
- throw new ArgumentException(SR.Argument_IdnBadLabelSize, nameof(unicode));
- // Last dot, stop
- break;
- }
- // We'll need an Ace prefix
- output.Append(c_strAcePrefix);
- // Everything resets every segment.
- bool bRightToLeft = false;
- // Check for RTL. If right-to-left, then 1st & last chars must be RTL
- BidiCategory eBidi = CharUnicodeInfo.GetBidiCategory(unicode, iAfterLastDot);
- if (eBidi == BidiCategory.RightToLeft || eBidi == BidiCategory.RightToLeftArabic)
- {
- // It has to be right to left.
- bRightToLeft = true;
- // Check last char
- int iTest = iNextDot - 1;
- if (char.IsLowSurrogate(unicode, iTest))
- {
- iTest--;
- }
- eBidi = CharUnicodeInfo.GetBidiCategory(unicode, iTest);
- if (eBidi != BidiCategory.RightToLeft && eBidi != BidiCategory.RightToLeftArabic)
- {
- // Oops, last wasn't RTL, last should be RTL if first is RTL
- throw new ArgumentException(SR.Argument_IdnBadBidi, nameof(unicode));
- }
- }
- // Handle the basic code points
- int basicCount;
- int numProcessed = 0; // Num code points that have been processed so far (this segment)
- for (basicCount = iAfterLastDot; basicCount < iNextDot; basicCount++)
- {
- // Can't be lonely surrogate because it would've thrown in normalization
- Debug.Assert(char.IsLowSurrogate(unicode, basicCount) == false, "[IdnMapping.punycode_encode]Unexpected low surrogate");
- // Double check our bidi rules
- BidiCategory testBidi = CharUnicodeInfo.GetBidiCategory(unicode, basicCount);
- // If we're RTL, we can't have LTR chars
- if (bRightToLeft && testBidi == BidiCategory.LeftToRight)
- {
- // Oops, throw error
- throw new ArgumentException(SR.Argument_IdnBadBidi, nameof(unicode));
- }
- // If we're not RTL we can't have RTL chars
- if (!bRightToLeft && (testBidi == BidiCategory.RightToLeft || testBidi == BidiCategory.RightToLeftArabic))
- {
- // Oops, throw error
- throw new ArgumentException(SR.Argument_IdnBadBidi, nameof(unicode));
- }
- // If its basic then add it
- if (Basic(unicode[basicCount]))
- {
- output.Append(EncodeBasic(unicode[basicCount]));
- numProcessed++;
- }
- // If its a surrogate, skip the next since our bidi category tester doesn't handle it.
- else if (char.IsSurrogatePair(unicode, basicCount))
- basicCount++;
- }
- int numBasicCodePoints = numProcessed; // number of basic code points
- // Stop if we ONLY had basic code points
- if (numBasicCodePoints == iNextDot - iAfterLastDot)
- {
- // Get rid of xn-- and this segments done
- output.Remove(iOutputAfterLastDot, c_strAcePrefix.Length);
- }
- else
- {
- // If it has some non-basic code points the input cannot start with xn--
- if (unicode.Length - iAfterLastDot >= c_strAcePrefix.Length &&
- unicode.Substring(iAfterLastDot, c_strAcePrefix.Length).Equals(
- c_strAcePrefix, StringComparison.OrdinalIgnoreCase))
- throw new ArgumentException(SR.Argument_IdnBadPunycode, nameof(unicode));
- // Need to do ACE encoding
- int numSurrogatePairs = 0; // number of surrogate pairs so far
- // Add a delimiter (-) if we had any basic code points (between basic and encoded pieces)
- if (numBasicCodePoints > 0)
- {
- output.Append(c_delimiter);
- }
- // Initialize the state
- int n = c_initialN;
- int delta = 0;
- int bias = c_initialBias;
- // Main loop
- while (numProcessed < (iNextDot - iAfterLastDot))
- {
- /* All non-basic code points < n have been */
- /* handled already. Find the next larger one: */
- int j;
- int m;
- int test = 0;
- for (m = c_maxint, j = iAfterLastDot;
- j < iNextDot;
- j += IsSupplementary(test) ? 2 : 1)
- {
- test = char.ConvertToUtf32(unicode, j);
- if (test >= n && test < m) m = test;
- }
- /* Increase delta enough to advance the decoder's */
- /* <n,i> state to <m,0>, but guard against overflow: */
- delta += (int)((m - n) * ((numProcessed - numSurrogatePairs) + 1));
- Debug.Assert(delta > 0, "[IdnMapping.cs]1 punycode_encode - delta overflowed int");
- n = m;
- for (j = iAfterLastDot; j < iNextDot; j+= IsSupplementary(test) ? 2 : 1)
- {
- // Make sure we're aware of surrogates
- test = char.ConvertToUtf32(unicode, j);
- // Adjust for character position (only the chars in our string already, some
- // haven't been processed.
- if (test < n)
- {
- delta++;
- Debug.Assert(delta > 0, "[IdnMapping.cs]2 punycode_encode - delta overflowed int");
- }
- if (test == n)
- {
- // Represent delta as a generalized variable-length integer:
- int q, k;
- for (q = delta, k = c_punycodeBase; ; k += c_punycodeBase)
- {
- int t = k <= bias ? c_tmin : k >= bias + c_tmax ? c_tmax : k - bias;
- if (q < t) break;
- Debug.Assert(c_punycodeBase != t, "[IdnMapping.punycode_encode]Expected c_punycodeBase (36) to be != t");
- output.Append(EncodeDigit(t + (q - t) % (c_punycodeBase - t)));
- q = (q - t) / (c_punycodeBase - t);
- }
- output.Append(EncodeDigit(q));
- bias = Adapt(delta, (numProcessed - numSurrogatePairs) + 1, numProcessed == numBasicCodePoints);
- delta = 0;
- numProcessed++;
- if (IsSupplementary(m))
- {
- numProcessed++;
- numSurrogatePairs++;
- }
- }
- }
- ++delta;
- ++n;
- Debug.Assert(delta > 0, "[IdnMapping.cs]3 punycode_encode - delta overflowed int");
- }
- }
- // Make sure its not too big
- if (output.Length - iOutputAfterLastDot > c_labelLimit)
- throw new ArgumentException(SR.Argument_IdnBadLabelSize, nameof(unicode));
- // Done with this segment, add dot if necessary
- if (iNextDot != unicode.Length)
- output.Append('.');
- iAfterLastDot = iNextDot + 1;
- iOutputAfterLastDot = output.Length;
- }
- // Throw if we're too long
- if (output.Length > c_defaultNameLimit - (IsDot(unicode[unicode.Length-1]) ? 0 : 1))
- throw new ArgumentException(SR.Format(SR.Argument_IdnBadNameSize,
- c_defaultNameLimit - (IsDot(unicode[unicode.Length-1]) ? 0 : 1)), nameof(unicode));
- // Return our output string
- return output.ToString();
- }
- // Is it a dot?
- // are we U+002E (., full stop), U+3002 (ideographic full stop), U+FF0E (fullwidth full stop), or
- // U+FF61 (halfwidth ideographic full stop).
- // Note: IDNA Normalization gets rid of dots now, but testing for last dot is before normalization
- private static bool IsDot(char c)
- {
- return c == '.' || c == '\u3002' || c == '\uFF0E' || c == '\uFF61';
- }
- private static bool IsSupplementary(int cTest)
- {
- return cTest >= 0x10000;
- }
- private static bool Basic(uint cp)
- {
- // Is it in ASCII range?
- return cp < 0x80;
- }
- // Validate Std3 rules for a character
- private static void ValidateStd3(char c, bool bNextToDot)
- {
- // Check for illegal characters
- if ((c <= ',' || c == '/' || (c >= ':' && c <= '@') || // Lots of characters not allowed
- (c >= '[' && c <= '`') || (c >= '{' && c <= (char)0x7F)) ||
- (c == '-' && bNextToDot))
- throw new ArgumentException(SR.Format(SR.Argument_IdnBadStd3, c), nameof(c));
- }
- private string GetUnicodeInvariant(string ascii, int index, int count)
- {
- if (index > 0 || count < ascii.Length)
- {
- // We're only using part of the string
- ascii = ascii.Substring(index, count);
- }
- // Convert Punycode to Unicode
- string strUnicode = PunycodeDecode(ascii);
- // Output name MUST obey IDNA rules & round trip (casing differences are allowed)
- if (!ascii.Equals(GetAscii(strUnicode), StringComparison.OrdinalIgnoreCase))
- throw new ArgumentException(SR.Argument_IdnIllegalName, nameof(ascii));
- return strUnicode;
- }
- /* PunycodeDecode() converts Punycode to Unicode. The input is */
- /* represented as an array of ASCII code points, and the output */
- /* will be represented as an array of Unicode code points. The */
- /* input_length is the number of code points in the input. The */
- /* output_length is an in/out argument: the caller passes in */
- /* the maximum number of code points that it can receive, and */
- /* on successful return it will contain the actual number of */
- /* code points output. The case_flags array needs room for at */
- /* least output_length values, or it can be a null pointer if the */
- /* case information is not needed. A nonzero flag suggests that */
- /* the corresponding Unicode character be forced to uppercase */
- /* by the caller (if possible), while zero suggests that it be */
- /* forced to lowercase (if possible). ASCII code points are */
- /* output already in the proper case, but their flags will be set */
- /* appropriately so that applying the flags would be harmless. */
- /* The return value can be any of the punycode_status values */
- /* defined above; if not punycode_success, then output_length, */
- /* output, and case_flags might contain garbage. On success, the */
- /* decoder will never need to write an output_length greater than */
- /* input_length, because of how the encoding is defined. */
- private static string PunycodeDecode(string ascii)
- {
- // 0 length strings aren't allowed
- if (ascii.Length == 0)
- throw new ArgumentException(SR.Argument_IdnBadLabelSize, nameof(ascii));
- // Throw if we're too long
- if (ascii.Length > c_defaultNameLimit - (IsDot(ascii[ascii.Length-1]) ? 0 : 1))
- throw new ArgumentException(SR.Format(SR.Argument_IdnBadNameSize,
- c_defaultNameLimit - (IsDot(ascii[ascii.Length-1]) ? 0 : 1)), nameof(ascii));
- // output stringbuilder
- StringBuilder output = new StringBuilder(ascii.Length);
- // Dot searching
- int iNextDot = 0;
- int iAfterLastDot = 0;
- int iOutputAfterLastDot = 0;
- while (iNextDot < ascii.Length)
- {
- // Find end of this segment
- iNextDot = ascii.IndexOf('.', iAfterLastDot);
- if (iNextDot < 0 || iNextDot > ascii.Length)
- iNextDot = ascii.Length;
- // Only allowed to have empty . section at end (www.microsoft.com.)
- if (iNextDot == iAfterLastDot)
- {
- // Only allowed to have empty sections as trailing .
- if (iNextDot != ascii.Length)
- throw new ArgumentException(SR.Argument_IdnBadLabelSize, nameof(ascii));
- // Last dot, stop
- break;
- }
- // In either case it can't be bigger than segment size
- if (iNextDot - iAfterLastDot > c_labelLimit)
- throw new ArgumentException(SR.Argument_IdnBadLabelSize, nameof(ascii));
- // See if this section's ASCII or ACE
- if (ascii.Length < c_strAcePrefix.Length + iAfterLastDot ||
- string.Compare(ascii, iAfterLastDot, c_strAcePrefix, 0, c_strAcePrefix.Length, StringComparison.OrdinalIgnoreCase) != 0)
- {
- // Its ASCII, copy it
- output.Append(ascii, iAfterLastDot, iNextDot - iAfterLastDot);
- }
- else
- {
- // Not ASCII, bump up iAfterLastDot to be after ACE Prefix
- iAfterLastDot += c_strAcePrefix.Length;
- // Get number of basic code points (where delimiter is)
- // numBasicCodePoints < 0 if there're no basic code points
- int iTemp = ascii.LastIndexOf(c_delimiter, iNextDot - 1);
- // Trailing - not allowed
- if (iTemp == iNextDot - 1)
- throw new ArgumentException(SR.Argument_IdnBadPunycode, nameof(ascii));
- int numBasicCodePoints;
- if (iTemp <= iAfterLastDot)
- numBasicCodePoints = 0;
- else
- {
- numBasicCodePoints = iTemp - iAfterLastDot;
- // Copy all the basic code points, making sure they're all in the allowed range,
- // and losing the casing for all of them.
- for (int copyAscii = iAfterLastDot; copyAscii < iAfterLastDot + numBasicCodePoints; copyAscii++)
- {
- // Make sure we don't allow unicode in the ascii part
- if (ascii[copyAscii] > 0x7f)
- throw new ArgumentException(SR.Argument_IdnBadPunycode, nameof(ascii));
- // When appending make sure they get lower cased
- output.Append((char)(ascii[copyAscii] >= 'A' && ascii[copyAscii] <='Z' ? ascii[copyAscii] - 'A' + 'a' : ascii[copyAscii]));
- }
- }
- // Get ready for main loop. Start at beginning if we didn't have any
- // basic code points, otherwise start after the -.
- // asciiIndex will be next character to read from ascii
- int asciiIndex = iAfterLastDot + (numBasicCodePoints > 0 ? numBasicCodePoints + 1 : 0);
- // initialize our state
- int n = c_initialN;
- int bias = c_initialBias;
- int i = 0;
- int w, k;
- // no Supplementary characters yet
- int numSurrogatePairs = 0;
- // Main loop, read rest of ascii
- while (asciiIndex < iNextDot)
- {
- /* Decode a generalized variable-length integer into delta, */
- /* which gets added to i. The overflow checking is easier */
- /* if we increase i as we go, then subtract off its starting */
- /* value at the end to obtain delta. */
- int oldi = i;
- for (w = 1, k = c_punycodeBase; ; k += c_punycodeBase)
- {
- // Check to make sure we aren't overrunning our ascii string
- if (asciiIndex >= iNextDot)
- throw new ArgumentException(SR.Argument_IdnBadPunycode, nameof(ascii));
- // decode the digit from the next char
- int digit = DecodeDigit(ascii[asciiIndex++]);
- Debug.Assert(w > 0, "[IdnMapping.punycode_decode]Expected w > 0");
- if (digit > (c_maxint - i) / w)
- throw new ArgumentException(SR.Argument_IdnBadPunycode, nameof(ascii));
- i += (int)(digit * w);
- int t = k <= bias ? c_tmin : k >= bias + c_tmax ? c_tmax : k - bias;
- if (digit < t)
- break;
- Debug.Assert(c_punycodeBase != t, "[IdnMapping.punycode_decode]Expected t != c_punycodeBase (36)");
- if (w > c_maxint / (c_punycodeBase - t))
- throw new ArgumentException(SR.Argument_IdnBadPunycode, nameof(ascii));
- w *= (c_punycodeBase - t);
- }
- bias = Adapt(i - oldi, (output.Length - iOutputAfterLastDot - numSurrogatePairs) + 1, oldi == 0);
- /* i was supposed to wrap around from output.Length to 0, */
- /* incrementing n each time, so we'll fix that now: */
- Debug.Assert((output.Length - iOutputAfterLastDot - numSurrogatePairs) + 1 > 0,
- "[IdnMapping.punycode_decode]Expected to have added > 0 characters this segment");
- if (i / ((output.Length - iOutputAfterLastDot - numSurrogatePairs) + 1) > c_maxint - n)
- throw new ArgumentException(SR.Argument_IdnBadPunycode, nameof(ascii));
- n += (int)(i / (output.Length - iOutputAfterLastDot - numSurrogatePairs + 1));
- i %= (output.Length - iOutputAfterLastDot - numSurrogatePairs + 1);
- // Make sure n is legal
- if ((n < 0 || n > 0x10ffff) || (n >= 0xD800 && n <= 0xDFFF))
- throw new ArgumentException(SR.Argument_IdnBadPunycode, nameof(ascii));
- // insert n at position i of the output: Really tricky if we have surrogates
- int iUseInsertLocation;
- string strTemp = char.ConvertFromUtf32(n);
- // If we have supplimentary characters
- if (numSurrogatePairs > 0)
- {
- // Hard way, we have supplimentary characters
- int iCount;
- for (iCount = i, iUseInsertLocation = iOutputAfterLastDot; iCount > 0; iCount--, iUseInsertLocation++)
- {
- // If its a surrogate, we have to go one more
- if (iUseInsertLocation >= output.Length)
- throw new ArgumentException(SR.Argument_IdnBadPunycode, nameof(ascii));
- if (char.IsSurrogate(output[iUseInsertLocation]))
- iUseInsertLocation++;
- }
- }
- else
- {
- // No Supplementary chars yet, just add i
- iUseInsertLocation = iOutputAfterLastDot + i;
- }
- // Insert it
- output.Insert(iUseInsertLocation, strTemp);
- // If it was a surrogate increment our counter
- if (IsSupplementary(n))
- numSurrogatePairs++;
- // Index gets updated
- i++;
- }
- // Do BIDI testing
- bool bRightToLeft = false;
- // Check for RTL. If right-to-left, then 1st & last chars must be RTL
- BidiCategory eBidi = CharUnicodeInfo.GetBidiCategory(output, iOutputAfterLastDot);
- if (eBidi == BidiCategory.RightToLeft || eBidi == BidiCategory.RightToLeftArabic)
- {
- // It has to be right to left.
- bRightToLeft = true;
- }
- // Check the rest of them to make sure RTL/LTR is consistent
- for (int iTest = iOutputAfterLastDot; iTest < output.Length; iTest++)
- {
- // This might happen if we run into a pair
- if (char.IsLowSurrogate(output[iTest]))
- continue;
- // Check to see if its LTR
- eBidi = CharUnicodeInfo.GetBidiCategory(output, iTest);
- if ((bRightToLeft && eBidi == BidiCategory.LeftToRight) ||
- (!bRightToLeft && (eBidi == BidiCategory.RightToLeft || eBidi == BidiCategory.RightToLeftArabic)))
- throw new ArgumentException(SR.Argument_IdnBadBidi, nameof(ascii));
- }
- // Its also a requirement that the last one be RTL if 1st is RTL
- if (bRightToLeft && eBidi != BidiCategory.RightToLeft && eBidi != BidiCategory.RightToLeftArabic)
- {
- // Oops, last wasn't RTL, last should be RTL if first is RTL
- throw new ArgumentException(SR.Argument_IdnBadBidi, nameof(ascii));
- }
- }
- // See if this label was too long
- if (iNextDot - iAfterLastDot > c_labelLimit)
- throw new ArgumentException(SR.Argument_IdnBadLabelSize, nameof(ascii));
- // Done with this segment, add dot if necessary
- if (iNextDot != ascii.Length)
- output.Append('.');
- iAfterLastDot = iNextDot + 1;
- iOutputAfterLastDot = output.Length;
- }
- // Throw if we're too long
- if (output.Length > c_defaultNameLimit - (IsDot(output[output.Length-1]) ? 0 : 1))
- throw new ArgumentException(SR.Format(SR.Argument_IdnBadNameSize, c_defaultNameLimit - (IsDot(output[output.Length-1]) ? 0 : 1)), nameof(ascii));
- // Return our output string
- return output.ToString();
- }
- // DecodeDigit(cp) returns the numeric value of a basic code */
- // point (for use in representing integers) in the range 0 to */
- // c_punycodeBase-1, or <0 if cp is does not represent a value. */
- private static int DecodeDigit(char cp)
- {
- if (cp >= '0' && cp <= '9')
- return cp - '0' + 26;
- // Two flavors for case differences
- if (cp >= 'a' && cp <= 'z')
- return cp - 'a';
- if (cp >= 'A' && cp <= 'Z')
- return cp - 'A';
- // Expected 0-9, A-Z or a-z, everything else is illegal
- throw new ArgumentException(SR.Argument_IdnBadPunycode, nameof(cp));
- }
- private static int Adapt(int delta, int numpoints, bool firsttime)
- {
- uint k;
- delta = firsttime ? delta / c_damp : delta / 2;
- Debug.Assert(numpoints != 0, "[IdnMapping.adapt]Expected non-zero numpoints.");
- delta += delta / numpoints;
- for (k = 0; delta > ((c_punycodeBase - c_tmin) * c_tmax) / 2; k += c_punycodeBase)
- {
- delta /= c_punycodeBase - c_tmin;
- }
- Debug.Assert(delta + c_skew != 0, "[IdnMapping.adapt]Expected non-zero delta+skew.");
- return (int)(k + (c_punycodeBase - c_tmin + 1) * delta / (delta + c_skew));
- }
- /* EncodeBasic(bcp,flag) forces a basic code point to lowercase */
- /* if flag is false, uppercase if flag is true, and returns */
- /* the resulting code point. The code point is unchanged if it */
- /* is caseless. The behavior is undefined if bcp is not a basic */
- /* code point. */
- static char EncodeBasic(char bcp)
- {
- if (HasUpperCaseFlag(bcp))
- bcp += (char)('a' - 'A');
- return bcp;
- }
- // Return whether a punycode code point is flagged as being upper case.
- private static bool HasUpperCaseFlag(char punychar)
- {
- return (punychar >= 'A' && punychar <= 'Z');
- }
- /* EncodeDigit(d,flag) returns the basic code point whose value */
- /* (when used for representing integers) is d, which needs to be in */
- /* the range 0 to punycodeBase-1. The lowercase form is used unless flag is */
- /* true, in which case the uppercase form is used. */
- private static char EncodeDigit(int d)
- {
- Debug.Assert(d >= 0 && d < c_punycodeBase, "[IdnMapping.encode_digit]Expected 0 <= d < punycodeBase");
- // 26-35 map to ASCII 0-9
- if (d > 25) return (char)(d - 26 + '0');
- // 0-25 map to a-z or A-Z
- return (char)(d + 'a');
- }
- }
- }
|