IdnMapping.cs 40 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871
  1. // Licensed to the .NET Foundation under one or more agreements.
  2. // The .NET Foundation licenses this file to you under the MIT license.
  3. // See the LICENSE file in the project root for more information.
  4. // This file contains the IDN functions and implementation.
  5. //
  6. // This allows encoding of non-ASCII domain names in a "punycode" form,
  7. // for example:
  8. //
  9. // \u5B89\u5BA4\u5948\u7F8E\u6075-with-SUPER-MONKEYS
  10. //
  11. // is encoded as:
  12. //
  13. // xn---with-SUPER-MONKEYS-pc58ag80a8qai00g7n9n
  14. //
  15. // Additional options are provided to allow unassigned IDN characters and
  16. // to validate according to the Std3ASCII Rules (like DNS names).
  17. //
  18. // There are also rules regarding bidirectionality of text and the length
  19. // of segments.
  20. //
  21. // For additional rules see also:
  22. // RFC 3490 - Internationalizing Domain Names in Applications (IDNA)
  23. // RFC 3491 - Nameprep: A Stringprep Profile for Internationalized Domain Names (IDN)
  24. // RFC 3492 - Punycode: A Bootstring encoding of Unicode for Internationalized Domain Names in Applications (IDNA)
  25. using System.Diagnostics;
  26. using System.Runtime.CompilerServices;
  27. using System.Text;
  28. namespace System.Globalization
  29. {
  30. // IdnMapping class used to map names to Punycode
  31. public sealed partial class IdnMapping
  32. {
  // Backing field for the AllowUnassigned property.
  private bool _allowUnassigned;

  // Backing field for the UseStd3AsciiRules property.
  private bool _useStd3AsciiRules;

  // Creates a mapping; neither option field is assigned here, so both start out false.
  public IdnMapping() { }
  // Gets or sets the "allow unassigned code points" option stored in _allowUnassigned.
  public bool AllowUnassigned
  {
      get { return _allowUnassigned; }
      set { _allowUnassigned = value; }
  }
  // Gets or sets the "validate with Std3 ASCII rules" option stored in _useStd3AsciiRules.
  public bool UseStd3AsciiRules
  {
      get { return _useStd3AsciiRules; }
      set { _useStd3AsciiRules = value; }
  }
  // Gets the ASCII (Punycode) version of the whole string.
  // Forwards to the (string, int) overload with a start index of 0.
  public string GetAscii(string unicode)
  {
      return GetAscii(unicode, 0);
  }
  // Gets the ASCII (Punycode) version of the string from index to its end.
  // Throws ArgumentNullException when unicode is null; all other validation is
  // performed by the (string, int, int) overload.
  public string GetAscii(string unicode, int index)
  {
      if (unicode == null)
      {
          throw new ArgumentNullException(nameof(unicode));
      }

      int remaining = unicode.Length - index;
      return GetAscii(unicode, index, remaining);
  }
  // Gets the ASCII (Punycode) version of count characters of unicode starting at index.
  //
  // Throws:
  //   ArgumentNullException       - unicode is null.
  //   ArgumentOutOfRangeException - index or count is negative, index is past the end
  //                                 of the string, or index + count runs past the end.
  //   ArgumentException           - count is 0, or the last selected character is '\0'.
  public string GetAscii(string unicode, int index, int count)
  {
      if (unicode == null)
      {
          throw new ArgumentNullException(nameof(unicode));
      }

      // A negative index is reported in preference to a negative count.
      if (index < 0)
      {
          throw new ArgumentOutOfRangeException(nameof(index), SR.ArgumentOutOfRange_NeedNonNegNum);
      }

      if (count < 0)
      {
          throw new ArgumentOutOfRangeException(nameof(count), SR.ArgumentOutOfRange_NeedNonNegNum);
      }

      if (index > unicode.Length)
      {
          throw new ArgumentOutOfRangeException(nameof(index), SR.ArgumentOutOfRange_Index);
      }

      if (index > unicode.Length - count)
      {
          throw new ArgumentOutOfRangeException(nameof(unicode), SR.ArgumentOutOfRange_IndexCountBuffer);
      }

      if (count == 0)
      {
          throw new ArgumentException(SR.Argument_IdnBadLabelSize, nameof(unicode));
      }

      // An explicitly null-terminated selection is rejected.
      int lastIndex = index + count - 1;
      if (unicode[lastIndex] == '\0')
      {
          throw new ArgumentException(SR.Format(SR.Argument_InvalidCharSequence, lastIndex), nameof(unicode));
      }

      if (GlobalizationMode.Invariant)
      {
          return GetAsciiInvariant(unicode, index, count);
      }

      unsafe
      {
          fixed (char* pUnicode = unicode)
          {
              return GetAsciiCore(unicode, pUnicode + index, count);
          }
      }
  }
  // Gets the Unicode version of the whole string. Normalized and limited to IDNA characters.
  // Forwards to the (string, int) overload with a start index of 0.
  public string GetUnicode(string ascii)
  {
      return GetUnicode(ascii, 0);
  }
  // Gets the Unicode version of the string from index to its end.
  // Throws ArgumentNullException when ascii is null; all other validation is
  // performed by the (string, int, int) overload.
  public string GetUnicode(string ascii, int index)
  {
      if (ascii == null)
      {
          throw new ArgumentNullException(nameof(ascii));
      }

      int remaining = ascii.Length - index;
      return GetUnicode(ascii, index, remaining);
  }
  // Gets the Unicode version of count characters of ascii starting at index.
  //
  // Throws:
  //   ArgumentNullException       - ascii is null.
  //   ArgumentOutOfRangeException - index or count is negative, index is past the end
  //                                 of the string, or index + count runs past the end.
  //   ArgumentException           - the last selected character is '\0'.
  public string GetUnicode(string ascii, int index, int count)
  {
      if (ascii == null)
      {
          throw new ArgumentNullException(nameof(ascii));
      }

      // A negative index is reported in preference to a negative count.
      if (index < 0)
      {
          throw new ArgumentOutOfRangeException(nameof(index), SR.ArgumentOutOfRange_NeedNonNegNum);
      }

      if (count < 0)
      {
          throw new ArgumentOutOfRangeException(nameof(count), SR.ArgumentOutOfRange_NeedNonNegNum);
      }

      if (index > ascii.Length)
      {
          throw new ArgumentOutOfRangeException(nameof(index), SR.ArgumentOutOfRange_Index);
      }

      if (index > ascii.Length - count)
      {
          throw new ArgumentOutOfRangeException(nameof(ascii), SR.ArgumentOutOfRange_IndexCountBuffer);
      }

      // This is a case (explicitly null-terminated input) where .NET and Win32 intentionally differ:
      // the .NET APIs throw an ArgumentException on input that includes a terminating null,
      // whereas the Win32 APIs fail on an embedded null but not on a terminating one.
      bool endsWithNull = count > 0 && ascii[index + count - 1] == '\0';
      if (endsWithNull)
      {
          throw new ArgumentException(SR.Argument_IdnBadPunycode, nameof(ascii));
      }

      if (GlobalizationMode.Invariant)
      {
          return GetUnicodeInvariant(ascii, index, count);
      }

      unsafe
      {
          fixed (char* pAscii = ascii)
          {
              return GetUnicodeCore(ascii, pAscii + index, count);
          }
      }
  }
  // Two IdnMapping instances are equal when both option flags match.
  // Any other object (including null) compares unequal.
  public override bool Equals(object? obj)
  {
      if (!(obj is IdnMapping other))
      {
          return false;
      }

      return _allowUnassigned == other._allowUnassigned
          && _useStd3AsciiRules == other._useStd3AsciiRules;
  }
  // Hash derived only from the two option flags, so it agrees with Equals.
  public override int GetHashCode()
  {
      int allowUnassignedPart = _allowUnassigned ? 100 : 200;
      int std3Part = _useStd3AsciiRules ? 1000 : 2000;
      return allowUnassignedPart + std3Part;
  }
  // Returns originalString itself when it has exactly inputLength characters and the
  // input buffer is character-for-character equal to the output buffer; otherwise
  // creates a new string from the first outputLength characters of output.
  [MethodImpl(MethodImplOptions.AggressiveInlining)]
  private static unsafe string GetStringForOutput(string originalString, char* input, int inputLength, char* output, int outputLength)
  {
      if (originalString.Length == inputLength)
      {
          ReadOnlySpan<char> source = new ReadOnlySpan<char>(input, inputLength);
          ReadOnlySpan<char> result = new ReadOnlySpan<char>(output, outputLength);
          if (source.SequenceEqual(result))
          {
              return originalString;
          }
      }

      return new string(output, 0, outputLength);
  }
  //
  // Invariant implementation
  //

  // Separator placed between the literal basic code points and the encoded digits of a label.
  private const char c_delimiter = '-';

  // Prefix that marks a label as ACE (Punycode) encoded.
  private const string c_strAcePrefix = "xn--";

  // Size limits.
  private const int c_labelLimit = 63; // Not including dots
  private const int c_defaultNameLimit = 255; // Including dots

  // Punycode state and bias-adaptation parameters used by the encoder and decoder below.
  private const int c_initialN = 0x80;
  // NOTE(review): this is 0x7ffffff (seven hex digits), not int.MaxValue - confirm that is intended.
  private const int c_maxint = 0x7ffffff;
  private const int c_initialBias = 72;
  private const int c_punycodeBase = 36;
  private const int c_tmin = 1;
  private const int c_tmax = 26;
  private const int c_skew = 38;
  private const int c_damp = 700;

  // Legal "dot" separators (i.e: . in www.microsoft.com)
  private static readonly char[] s_dotSeparators = new char[] { '.', '\u3002', '\uFF0E', '\uFF61' };
  // Invariant-mode implementation of GetAscii.
  // Returns the selected text unchanged when it validates as pure ASCII, otherwise
  // returns its Punycode encoding. Throws ArgumentException for invalid input.
  private string GetAsciiInvariant(string unicode, int index, int count)
  {
      // Narrow to the requested slice unless it already spans the whole string.
      bool coversWholeString = index <= 0 && count >= unicode.Length;
      if (!coversWholeString)
      {
          unicode = unicode.Substring(index, count);
      }

      // An ASCII-only name needs no encoding and is returned as-is.
      bool isAsciiOnly = ValidateStd3AndAscii(unicode, UseStd3AsciiRules, true);
      if (isAsciiOnly)
      {
          return unicode;
      }

      // Cannot be null terminated (normalization won't help us with this one, and
      // the validation above may have returned false before checking the whole string).
      Debug.Assert(count >= 1, "[IdnMapping.GetAscii] Expected 0 length strings to fail before now.");
      int lastIndex = unicode.Length - 1;
      if (unicode[lastIndex] <= 0x1f)
      {
          throw new ArgumentException(SR.Format(SR.Argument_InvalidCharSequence, lastIndex), nameof(unicode));
      }

      // The first pass stops at the first non-ASCII character, so the Std3 rules
      // have to be applied again over the entire string.
      if (UseStd3AsciiRules)
      {
          ValidateStd3AndAscii(unicode, true, false);
      }

      return PunycodeEncode(unicode);
  }
  // Validates a (possibly multi-label) name and reports whether it is ASCII-only.
  //
  //   unicode     - the name to check.
  //   bUseStd3    - when true, every non-dot character is also checked with ValidateStd3.
  //   bCheckAscii - when true, the method returns false as soon as it meets a character
  //                 >= 0x7f; the remainder of the string is then NOT validated.
  //
  // Returns true when the whole string was scanned without finding a problem.
  // Throws ArgumentException for an empty string, a control character (<= 0x1f),
  // an empty label, a label longer than c_labelLimit, a name longer than the name
  // limit, or (via ValidateStd3) a Std3 violation.
  private static bool ValidateStd3AndAscii(string unicode, bool bUseStd3, bool bCheckAscii)
  {
      // If it's empty, then it's too small
      if (unicode.Length == 0)
      {
          throw new ArgumentException(SR.Argument_IdnBadLabelSize, nameof(unicode));
      }

      int iLastDot = -1;

      for (int i = 0; i < unicode.Length; i++)
      {
          char c = unicode[i];

          // Control chars aren't allowed (nor is 7f, but the idn tables catch that;
          // they don't catch \0 at the end though)
          if (c <= 0x1f)
          {
              throw new ArgumentException(SR.Format(SR.Argument_InvalidCharSequence, i), nameof(unicode));
          }

          // Non-ASCII (or 0x7f): report "not ASCII" and stop scanning
          if (bCheckAscii && c >= 0x7f)
          {
              return false;
          }

          if (IsDot(c))
          {
              // Can't have 2 dots in a row (or a leading dot)
              if (i == iLastDot + 1)
              {
                  throw new ArgumentException(SR.Argument_IdnBadLabelSize, nameof(unicode));
              }

              // The label that this dot terminates must fit the label limit
              if (i - iLastDot > c_labelLimit + 1)
              {
                  throw new ArgumentException(SR.Argument_IdnBadLabelSize, nameof(unicode));
              }

              // If validating Std3, then the char before a dot can't be '-'
              if (bUseStd3 && i > 0)
              {
                  ValidateStd3(unicode[i - 1], true);
              }

              iLastDot = i;
              continue;
          }

          // If necessary, make sure it's a valid std3 character
          // (a label's first character is "next to a dot")
          if (bUseStd3)
          {
              ValidateStd3(c, i == iLastDot + 1);
          }
      }

      // The final label (everything after the last dot, or the whole string when there
      // is no dot) must obey the label limit too. Checking only the no-dot case would let
      // an over-long last label such as "a.<64 chars>" through. With a trailing dot the
      // final label is empty and this never fires.
      if (unicode.Length - iLastDot - 1 > c_labelLimit)
      {
          throw new ArgumentException(SR.Argument_IdnBadLabelSize, nameof(unicode));
      }

      // Need to validate entire string length, 1 shorter if last char wasn't a dot
      char lastChar = unicode[unicode.Length - 1];
      bool endsWithDot = IsDot(lastChar);
      int nameLimit = c_defaultNameLimit - (endsWithDot ? 0 : 1);
      if (unicode.Length > nameLimit)
      {
          throw new ArgumentException(SR.Format(SR.Argument_IdnBadNameSize, nameLimit), nameof(unicode));
      }

      // If last char wasn't a dot we need to check for trailing -
      if (bUseStd3 && !endsWithDot)
      {
          ValidateStd3(lastChar, true);
      }

      return true;
  }
  // PunycodeEncode() converts a Unicode name to its ACE (Punycode) form.
  //
  // The input is split into labels at any of the s_dotSeparators characters. A label
  // consisting solely of basic (ASCII) code points is emitted through EncodeBasic with
  // no prefix; any other label is emitted as "xn--" + basic code points + '-' (only
  // when there are basic code points) + the encoded deltas. Separators are written to
  // the output as '.'. An empty label is only permitted as a single trailing dot.
  //
  // Throws ArgumentException when the input is empty, contains an empty label, breaks
  // the bidi consistency rules checked below, already starts a non-basic label with
  // "xn--", or yields a label longer than c_labelLimit or a name over the name limit.
  private static string PunycodeEncode(string unicode)
  {
      // 0 length strings aren't allowed
      if (unicode.Length == 0)
      {
          throw new ArgumentException(SR.Argument_IdnBadLabelSize, nameof(unicode));
      }

      StringBuilder output = new StringBuilder(unicode.Length);
      int iNextDot = 0;
      int iAfterLastDot = 0;
      int iOutputAfterLastDot = 0;

      // Find the next dot
      while (iNextDot < unicode.Length)
      {
          // Find end of this segment
          iNextDot = unicode.IndexOfAny(s_dotSeparators, iAfterLastDot);
          Debug.Assert(iNextDot <= unicode.Length, "[IdnMapping.punycode_encode]IndexOfAny is broken");
          if (iNextDot < 0)
          {
              iNextDot = unicode.Length;
          }

          // Only allowed to have empty . section at end (www.microsoft.com.)
          if (iNextDot == iAfterLastDot)
          {
              // Only allowed to have empty sections as trailing .
              if (iNextDot != unicode.Length)
              {
                  throw new ArgumentException(SR.Argument_IdnBadLabelSize, nameof(unicode));
              }

              // Last dot, stop
              break;
          }

          // We'll need an Ace prefix (removed again below if the label turns out to be all basic)
          output.Append(c_strAcePrefix);

          // Everything resets every segment.
          bool bRightToLeft = false;

          // Check for RTL. If right-to-left, then 1st & last chars must be RTL
          BidiCategory eBidi = CharUnicodeInfo.GetBidiCategory(unicode, iAfterLastDot);
          if (eBidi == BidiCategory.RightToLeft || eBidi == BidiCategory.RightToLeftArabic)
          {
              // It has to be right to left.
              bRightToLeft = true;

              // Check last char (step back to the high surrogate if it ends in a pair)
              int iTest = iNextDot - 1;
              if (char.IsLowSurrogate(unicode, iTest))
              {
                  iTest--;
              }

              eBidi = CharUnicodeInfo.GetBidiCategory(unicode, iTest);
              if (eBidi != BidiCategory.RightToLeft && eBidi != BidiCategory.RightToLeftArabic)
              {
                  // Oops, last wasn't RTL, last should be RTL if first is RTL
                  throw new ArgumentException(SR.Argument_IdnBadBidi, nameof(unicode));
              }
          }

          // Handle the basic code points
          int basicCount;
          int numProcessed = 0; // Num chars that have been processed so far (this segment)
          for (basicCount = iAfterLastDot; basicCount < iNextDot; basicCount++)
          {
              // Can't be lonely surrogate because it would've thrown in normalization
              Debug.Assert(!char.IsLowSurrogate(unicode, basicCount), "[IdnMapping.punycode_encode]Unexpected low surrogate");

              // Double check our bidi rules
              BidiCategory testBidi = CharUnicodeInfo.GetBidiCategory(unicode, basicCount);

              // If we're RTL, we can't have LTR chars
              if (bRightToLeft && testBidi == BidiCategory.LeftToRight)
              {
                  throw new ArgumentException(SR.Argument_IdnBadBidi, nameof(unicode));
              }

              // If we're not RTL we can't have RTL chars
              if (!bRightToLeft && (testBidi == BidiCategory.RightToLeft || testBidi == BidiCategory.RightToLeftArabic))
              {
                  throw new ArgumentException(SR.Argument_IdnBadBidi, nameof(unicode));
              }

              // If it's basic then add it
              if (Basic(unicode[basicCount]))
              {
                  output.Append(EncodeBasic(unicode[basicCount]));
                  numProcessed++;
              }
              // If it's a surrogate pair, skip the low half since our bidi category tester doesn't handle it.
              else if (char.IsSurrogatePair(unicode, basicCount))
              {
                  basicCount++;
              }
          }

          int numBasicCodePoints = numProcessed; // number of basic code points

          // Stop if we ONLY had basic code points
          if (numBasicCodePoints == iNextDot - iAfterLastDot)
          {
              // Get rid of xn-- and this segment's done
              output.Remove(iOutputAfterLastDot, c_strAcePrefix.Length);
          }
          else
          {
              // If it has some non-basic code points the input cannot start with xn--.
              // Compared in place (no Substring allocation), the same way PunycodeDecode does it.
              if (unicode.Length - iAfterLastDot >= c_strAcePrefix.Length &&
                  string.Compare(unicode, iAfterLastDot, c_strAcePrefix, 0, c_strAcePrefix.Length, StringComparison.OrdinalIgnoreCase) == 0)
              {
                  throw new ArgumentException(SR.Argument_IdnBadPunycode, nameof(unicode));
              }

              // Need to do ACE encoding
              int numSurrogatePairs = 0; // number of surrogate pairs so far

              // Add a delimiter (-) if we had any basic code points (between basic and encoded pieces)
              if (numBasicCodePoints > 0)
              {
                  output.Append(c_delimiter);
              }

              // Initialize the state
              int n = c_initialN;
              int delta = 0;
              int bias = c_initialBias;

              // Main loop
              while (numProcessed < (iNextDot - iAfterLastDot))
              {
                  // All non-basic code points < n have been handled already.
                  // Find the next larger one:
                  int j;
                  int m;
                  int test = 0;
                  for (m = c_maxint, j = iAfterLastDot;
                       j < iNextDot;
                       j += IsSupplementary(test) ? 2 : 1)
                  {
                      test = char.ConvertToUtf32(unicode, j);
                      if (test >= n && test < m)
                      {
                          m = test;
                      }
                  }

                  // Increase delta enough to advance the decoder's <n,i> state to <m,0>.
                  // Overflow is only checked by the debug assertion.
                  delta += (m - n) * ((numProcessed - numSurrogatePairs) + 1);
                  Debug.Assert(delta > 0, "[IdnMapping.cs]1 punycode_encode - delta overflowed int");
                  n = m;

                  for (j = iAfterLastDot; j < iNextDot; j += IsSupplementary(test) ? 2 : 1)
                  {
                      // Make sure we're aware of surrogates
                      test = char.ConvertToUtf32(unicode, j);

                      // Adjust for character position (only the chars in our string already,
                      // some haven't been processed).
                      if (test < n)
                      {
                          delta++;
                          Debug.Assert(delta > 0, "[IdnMapping.cs]2 punycode_encode - delta overflowed int");
                      }

                      if (test == n)
                      {
                          // Represent delta as a generalized variable-length integer:
                          int q, k;
                          for (q = delta, k = c_punycodeBase; ; k += c_punycodeBase)
                          {
                              int t = k <= bias ? c_tmin : k >= bias + c_tmax ? c_tmax : k - bias;
                              if (q < t)
                              {
                                  break;
                              }

                              Debug.Assert(c_punycodeBase != t, "[IdnMapping.punycode_encode]Expected c_punycodeBase (36) to be != t");
                              output.Append(EncodeDigit(t + (q - t) % (c_punycodeBase - t)));
                              q = (q - t) / (c_punycodeBase - t);
                          }

                          output.Append(EncodeDigit(q));
                          bias = Adapt(delta, (numProcessed - numSurrogatePairs) + 1, numProcessed == numBasicCodePoints);
                          delta = 0;
                          numProcessed++;

                          // A supplementary code point occupies two chars of the input
                          if (IsSupplementary(m))
                          {
                              numProcessed++;
                              numSurrogatePairs++;
                          }
                      }
                  }

                  ++delta;
                  ++n;
                  Debug.Assert(delta > 0, "[IdnMapping.cs]3 punycode_encode - delta overflowed int");
              }
          }

          // Make sure it's not too big
          if (output.Length - iOutputAfterLastDot > c_labelLimit)
          {
              throw new ArgumentException(SR.Argument_IdnBadLabelSize, nameof(unicode));
          }

          // Done with this segment, add dot if necessary
          if (iNextDot != unicode.Length)
          {
              output.Append('.');
          }

          iAfterLastDot = iNextDot + 1;
          iOutputAfterLastDot = output.Length;
      }

      // Throw if we're too long (the limit is 1 shorter when the input has no trailing dot)
      int nameLimit = c_defaultNameLimit - (IsDot(unicode[unicode.Length - 1]) ? 0 : 1);
      if (output.Length > nameLimit)
      {
          throw new ArgumentException(SR.Format(SR.Argument_IdnBadNameSize, nameLimit), nameof(unicode));
      }

      // Return our output string
      return output.ToString();
  }
  // Is it a dot?
  // True for U+002E (., full stop), U+3002 (ideographic full stop), U+FF0E (fullwidth
  // full stop) and U+FF61 (halfwidth ideographic full stop); false for everything else.
  // Note: IDNA Normalization gets rid of dots now, but testing for last dot is before normalization
  private static bool IsDot(char c)
  {
      switch (c)
      {
          case '.':
          case '\u3002':
          case '\uFF0E':
          case '\uFF61':
              return true;
          default:
              return false;
      }
  }
  // True when the code point is 0x10000 or above.
  private static bool IsSupplementary(int cTest)
  {
      return cTest >= 0x10000;
  }
  // True when the code point is in the ASCII range (0x00 - 0x7F).
  private static bool Basic(uint cp)
  {
      return cp <= 0x7F;
  }
  // Validate Std3 rules for a single character.
  // Throws ArgumentException when c is one of the disallowed characters below, or when
  // c is '-' and bNextToDot is true. Returns normally otherwise.
  private static void ValidateStd3(char c, bool bNextToDot)
  {
      bool isLegal =
          c > ',' &&                              // everything up to and including ','
          c != '/' &&
          !(c >= ':' && c <= '@') &&
          !(c >= '[' && c <= '`') &&
          !(c >= '{' && c <= (char)0x7F) &&
          !(c == '-' && bNextToDot);              // no hyphen adjacent to a dot

      if (isLegal)
      {
          return;
      }

      throw new ArgumentException(SR.Format(SR.Argument_IdnBadStd3, c), nameof(c));
  }
  // Invariant-mode implementation of GetUnicode.
  // Decodes the selected text with PunycodeDecode and then requires that re-encoding
  // the result with GetAscii reproduces the input (ignoring case); otherwise throws
  // ArgumentException.
  private string GetUnicodeInvariant(string ascii, int index, int count)
  {
      // Narrow to the requested slice unless it already spans the whole string.
      bool coversWholeString = index <= 0 && count >= ascii.Length;
      if (!coversWholeString)
      {
          ascii = ascii.Substring(index, count);
      }

      // Convert Punycode to Unicode
      string strUnicode = PunycodeDecode(ascii);

      // Output name MUST obey IDNA rules & round trip (casing differences are allowed)
      string roundTripped = GetAscii(strUnicode);
      if (!ascii.Equals(roundTripped, StringComparison.OrdinalIgnoreCase))
      {
          throw new ArgumentException(SR.Argument_IdnIllegalName, nameof(ascii));
      }

      return strUnicode;
  }
  // PunycodeDecode() converts an ACE (Punycode) name to Unicode.
  //
  // The input is split into labels at '.'. A label that does not start with "xn--"
  // (compared ignoring case) is copied to the output unchanged. For an "xn--" label the
  // basic code points before the last '-' are copied lower-cased, the remaining
  // characters are decoded into code points that are inserted into the output, and the
  // decoded label is then checked for bidi consistency. An empty label is only
  // permitted as a single trailing dot.
  //
  // Throws ArgumentException when the input is empty, too long, contains an empty or
  // over-long label, is not well-formed Punycode, or fails the bidi checks.
  private static string PunycodeDecode(string ascii)
  {
      // 0 length strings aren't allowed
      if (ascii.Length == 0)
      {
          throw new ArgumentException(SR.Argument_IdnBadLabelSize, nameof(ascii));
      }

      // Throw if we're too long (the limit is 1 shorter when there is no trailing dot)
      int inputNameLimit = c_defaultNameLimit - (IsDot(ascii[ascii.Length - 1]) ? 0 : 1);
      if (ascii.Length > inputNameLimit)
      {
          throw new ArgumentException(SR.Format(SR.Argument_IdnBadNameSize, inputNameLimit), nameof(ascii));
      }

      // output stringbuilder
      StringBuilder output = new StringBuilder(ascii.Length);

      // Dot searching
      int iNextDot = 0;
      int iAfterLastDot = 0;
      int iOutputAfterLastDot = 0;
      while (iNextDot < ascii.Length)
      {
          // Find end of this segment
          iNextDot = ascii.IndexOf('.', iAfterLastDot);
          if (iNextDot < 0 || iNextDot > ascii.Length)
          {
              iNextDot = ascii.Length;
          }

          // Only allowed to have empty . section at end (www.microsoft.com.)
          if (iNextDot == iAfterLastDot)
          {
              // Only allowed to have empty sections as trailing .
              if (iNextDot != ascii.Length)
              {
                  throw new ArgumentException(SR.Argument_IdnBadLabelSize, nameof(ascii));
              }

              // Last dot, stop
              break;
          }

          // In either case the input label can't be bigger than segment size.
          // (This is the only size check needed on the input label: iAfterLastDot can
          // only grow below, so a later re-check of the same expression could never fire.)
          if (iNextDot - iAfterLastDot > c_labelLimit)
          {
              throw new ArgumentException(SR.Argument_IdnBadLabelSize, nameof(ascii));
          }

          // See if this section's ASCII or ACE
          if (ascii.Length < c_strAcePrefix.Length + iAfterLastDot ||
              string.Compare(ascii, iAfterLastDot, c_strAcePrefix, 0, c_strAcePrefix.Length, StringComparison.OrdinalIgnoreCase) != 0)
          {
              // It's ASCII, copy it
              output.Append(ascii, iAfterLastDot, iNextDot - iAfterLastDot);
          }
          else
          {
              // Not ASCII, bump up iAfterLastDot to be after ACE Prefix
              iAfterLastDot += c_strAcePrefix.Length;

              // Find the last delimiter at or before the end of this label; it separates
              // the basic code points from the encoded part. The search may run back past
              // the start of the label, which the "<= iAfterLastDot" test below treats as
              // "no basic code points".
              int iTemp = ascii.LastIndexOf(c_delimiter, iNextDot - 1);

              // Trailing - not allowed
              if (iTemp == iNextDot - 1)
              {
                  throw new ArgumentException(SR.Argument_IdnBadPunycode, nameof(ascii));
              }

              int numBasicCodePoints;
              if (iTemp <= iAfterLastDot)
              {
                  numBasicCodePoints = 0;
              }
              else
              {
                  numBasicCodePoints = iTemp - iAfterLastDot;

                  // Copy all the basic code points, making sure they're all in the allowed range,
                  // and losing the casing for all of them.
                  for (int copyAscii = iAfterLastDot; copyAscii < iAfterLastDot + numBasicCodePoints; copyAscii++)
                  {
                      char basic = ascii[copyAscii];

                      // Make sure we don't allow unicode in the ascii part
                      if (basic > 0x7f)
                      {
                          throw new ArgumentException(SR.Argument_IdnBadPunycode, nameof(ascii));
                      }

                      // When appending make sure they get lower cased
                      output.Append((char)(basic >= 'A' && basic <= 'Z' ? basic - 'A' + 'a' : basic));
                  }
              }

              // Get ready for main loop. Start at beginning if we didn't have any
              // basic code points, otherwise start after the -.
              // asciiIndex will be next character to read from ascii
              int asciiIndex = iAfterLastDot + (numBasicCodePoints > 0 ? numBasicCodePoints + 1 : 0);

              // initialize our state
              int n = c_initialN;
              int bias = c_initialBias;
              int i = 0;
              int w, k;

              // no Supplementary characters yet
              int numSurrogatePairs = 0;

              // Main loop, read rest of ascii
              while (asciiIndex < iNextDot)
              {
                  // Decode a generalized variable-length integer into delta,
                  // which gets added to i. The overflow checking is easier
                  // if we increase i as we go, then subtract off its starting
                  // value at the end to obtain delta.
                  int oldi = i;
                  for (w = 1, k = c_punycodeBase; ; k += c_punycodeBase)
                  {
                      // Check to make sure we aren't overrunning our ascii string
                      if (asciiIndex >= iNextDot)
                      {
                          throw new ArgumentException(SR.Argument_IdnBadPunycode, nameof(ascii));
                      }

                      // decode the digit from the next char
                      int digit = DecodeDigit(ascii[asciiIndex++]);
                      Debug.Assert(w > 0, "[IdnMapping.punycode_decode]Expected w > 0");
                      if (digit > (c_maxint - i) / w)
                      {
                          throw new ArgumentException(SR.Argument_IdnBadPunycode, nameof(ascii));
                      }

                      i += digit * w;
                      int t = k <= bias ? c_tmin : k >= bias + c_tmax ? c_tmax : k - bias;
                      if (digit < t)
                      {
                          break;
                      }

                      Debug.Assert(c_punycodeBase != t, "[IdnMapping.punycode_decode]Expected t != c_punycodeBase (36)");
                      if (w > c_maxint / (c_punycodeBase - t))
                      {
                          throw new ArgumentException(SR.Argument_IdnBadPunycode, nameof(ascii));
                      }

                      w *= (c_punycodeBase - t);
                  }

                  // Number of code points this label will hold once the next one is inserted.
                  // The output is not modified until the Insert below, so this value is
                  // stable for all of the arithmetic that follows.
                  int numPoints = (output.Length - iOutputAfterLastDot - numSurrogatePairs) + 1;

                  bias = Adapt(i - oldi, numPoints, oldi == 0);

                  // i was supposed to wrap around from output.Length to 0,
                  // incrementing n each time, so we'll fix that now:
                  Debug.Assert(numPoints > 0,
                      "[IdnMapping.punycode_decode]Expected to have added > 0 characters this segment");
                  if (i / numPoints > c_maxint - n)
                  {
                      throw new ArgumentException(SR.Argument_IdnBadPunycode, nameof(ascii));
                  }

                  n += i / numPoints;
                  i %= numPoints;

                  // Make sure n is legal (in range and not a surrogate code point)
                  if (n < 0 || n > 0x10ffff || (n >= 0xD800 && n <= 0xDFFF))
                  {
                      throw new ArgumentException(SR.Argument_IdnBadPunycode, nameof(ascii));
                  }

                  // insert n at position i of the output: Really tricky if we have surrogates
                  int iUseInsertLocation;
                  string strTemp = char.ConvertFromUtf32(n);

                  // If we have supplementary characters
                  if (numSurrogatePairs > 0)
                  {
                      // Hard way: walk i code points forward, stepping over both halves of each pair
                      int iCount;
                      for (iCount = i, iUseInsertLocation = iOutputAfterLastDot; iCount > 0; iCount--, iUseInsertLocation++)
                      {
                          if (iUseInsertLocation >= output.Length)
                          {
                              throw new ArgumentException(SR.Argument_IdnBadPunycode, nameof(ascii));
                          }

                          // If it's a surrogate, we have to go one more
                          if (char.IsSurrogate(output[iUseInsertLocation]))
                          {
                              iUseInsertLocation++;
                          }
                      }
                  }
                  else
                  {
                      // No Supplementary chars yet, just add i
                      iUseInsertLocation = iOutputAfterLastDot + i;
                  }

                  // Insert it
                  output.Insert(iUseInsertLocation, strTemp);

                  // If it was a surrogate increment our counter
                  if (IsSupplementary(n))
                  {
                      numSurrogatePairs++;
                  }

                  // Index gets updated
                  i++;
              }

              // Do BIDI testing
              bool bRightToLeft = false;

              // Check for RTL. If right-to-left, then 1st & last chars must be RTL
              BidiCategory eBidi = CharUnicodeInfo.GetBidiCategory(output, iOutputAfterLastDot);
              if (eBidi == BidiCategory.RightToLeft || eBidi == BidiCategory.RightToLeftArabic)
              {
                  // It has to be right to left.
                  bRightToLeft = true;
              }

              // Check the rest of them to make sure RTL/LTR is consistent
              for (int iTest = iOutputAfterLastDot; iTest < output.Length; iTest++)
              {
                  // This might happen if we run into a pair
                  if (char.IsLowSurrogate(output[iTest]))
                  {
                      continue;
                  }

                  // Check to see if it's LTR
                  eBidi = CharUnicodeInfo.GetBidiCategory(output, iTest);
                  if ((bRightToLeft && eBidi == BidiCategory.LeftToRight) ||
                      (!bRightToLeft && (eBidi == BidiCategory.RightToLeft || eBidi == BidiCategory.RightToLeftArabic)))
                  {
                      throw new ArgumentException(SR.Argument_IdnBadBidi, nameof(ascii));
                  }
              }

              // It's also a requirement that the last one be RTL if 1st is RTL
              // (eBidi still holds the category of the last code point examined above)
              if (bRightToLeft && eBidi != BidiCategory.RightToLeft && eBidi != BidiCategory.RightToLeftArabic)
              {
                  // Oops, last wasn't RTL, last should be RTL if first is RTL
                  throw new ArgumentException(SR.Argument_IdnBadBidi, nameof(ascii));
              }
          }

          // Done with this segment, add dot if necessary
          if (iNextDot != ascii.Length)
          {
              output.Append('.');
          }

          iAfterLastDot = iNextDot + 1;
          iOutputAfterLastDot = output.Length;
      }

      // Throw if we're too long
      int outputNameLimit = c_defaultNameLimit - (IsDot(output[output.Length - 1]) ? 0 : 1);
      if (output.Length > outputNameLimit)
      {
          throw new ArgumentException(SR.Format(SR.Argument_IdnBadNameSize, outputNameLimit), nameof(ascii));
      }

      // Return our output string
      return output.ToString();
  }
  // DecodeDigit(cp) returns the numeric value of a basic code
  // point (for use in representing integers) in the range 0 to
  // c_punycodeBase-1, or throws ArgumentException if cp does not represent a value.
  private static int DecodeDigit(char cp)
  {
      // Translate one punycode digit character into its integer value.
      // Letters are case-insensitive and carry the values 0..25.
      if ('a' <= cp && cp <= 'z')
      {
          return cp - 'a';
      }

      if ('A' <= cp && cp <= 'Z')
      {
          return cp - 'A';
      }

      // Decimal digits continue the sequence with the values 26..35.
      if ('0' <= cp && cp <= '9')
      {
          return 26 + (cp - '0');
      }

      // Anything outside 0-9, A-Z, a-z is not a punycode digit.
      throw new ArgumentException(SR.Argument_IdnBadPunycode, nameof(cp));
  }
  private static int Adapt(int delta, int numpoints, bool firsttime)
  {
      // Computes the next bias value from the most recent delta.
      // The result is accumulated in an unsigned counter, exactly one
      // c_punycodeBase step per reduction of delta below.
      uint bias = 0;

      // The very first adaptation damps delta by c_damp; later ones halve it.
      if (firsttime)
      {
          delta /= c_damp;
      }
      else
      {
          delta /= 2;
      }

      Debug.Assert(numpoints != 0, "[IdnMapping.adapt]Expected non-zero numpoints.");
      delta += delta / numpoints;

      // Keep shrinking delta until it is no larger than the threshold,
      // counting each reduction into the bias.
      while (delta > ((c_punycodeBase - c_tmin) * c_tmax) / 2)
      {
          delta /= c_punycodeBase - c_tmin;
          bias += c_punycodeBase;
      }

      Debug.Assert(delta + c_skew != 0, "[IdnMapping.adapt]Expected non-zero delta+skew.");
      return (int)(bias + (c_punycodeBase - c_tmin + 1) * delta / (delta + c_skew));
  }
  /* EncodeBasic(bcp) forces a basic code point to lowercase and */
  /* returns the resulting code point. The code point is unchanged */
  /* if it is not an uppercase ASCII letter ('A'-'Z'). The behavior */
  /* is undefined if bcp is not a basic code point. */
  private static char EncodeBasic(char bcp)
  {
      // Characters flagged as upper case ('A'-'Z') are shifted onto 'a'-'z'
      // by the fixed ASCII case offset; every other input is returned untouched.
      return HasUpperCaseFlag(bcp) ? (char)(bcp + ('a' - 'A')) : bcp;
  }
  716. // Return whether a punycode code point is flagged as being upper case.
  private static bool HasUpperCaseFlag(char punychar)
  {
      // A punycode character carries the upper-case flag exactly when it is
      // one of the ASCII capital letters 'A' through 'Z'.
      return !(punychar < 'A' || punychar > 'Z');
  }
  /* EncodeDigit(d) returns the basic code point whose value */
  /* (when used for representing integers) is d, which needs to be in */
  /* the range 0 to punycodeBase-1. The lowercase form is always used. */
  private static char EncodeDigit(int d)
  {
      // Turn a punycode digit value into its character form.
      Debug.Assert(d >= 0 && d < c_punycodeBase, "[IdnMapping.encode_digit]Expected 0 <= d < punycodeBase");

      // Values 0-25 become 'a'-'z'; values 26-35 become '0'-'9'.
      return d <= 25 ? (char)('a' + d) : (char)('0' + (d - 26));
  }
  731. }
  732. }