IdnMapping.cs 40 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891
  1. // Licensed to the .NET Foundation under one or more agreements.
  2. // The .NET Foundation licenses this file to you under the MIT license.
  3. // See the LICENSE file in the project root for more information.
  4. // This file contains the IDN functions and implementation.
  5. //
  6. // This allows encoding of non-ASCII domain names in a "punycode" form,
  7. // for example:
  8. //
  9. // \u5B89\u5BA4\u5948\u7F8E\u6075-with-SUPER-MONKEYS
  10. //
  11. // is encoded as:
  12. //
  13. // xn---with-SUPER-MONKEYS-pc58ag80a8qai00g7n9n
  14. //
  15. // Additional options are provided to allow unassigned IDN characters and
  16. // to validate according to the Std3ASCII Rules (like DNS names).
  17. //
  18. // There are also rules regarding bidirectionality of text and the length
  19. // of segments.
  20. //
  21. // For additional rules see also:
  22. // RFC 3490 - Internationalizing Domain Names in Applications (IDNA)
  23. // RFC 3491 - Nameprep: A Stringprep Profile for Internationalized Domain Names (IDN)
  24. // RFC 3492 - Punycode: A Bootstring encoding of Unicode for Internationalized Domain Names in Applications (IDNA)
  25. using System.Diagnostics;
  26. using System.Runtime.CompilerServices;
  27. using System.Text;
  28. namespace System.Globalization
  29. {
  30. // IdnMapping class used to map names to Punycode
  31. public sealed partial class IdnMapping
  32. {
  // Backing field for the AllowUnassigned property; also compared in Equals and mixed into GetHashCode.
  33. private bool _allowUnassigned;
  // Backing field for the UseStd3AsciiRules property; also compared in Equals and mixed into GetHashCode.
  34. private bool _useStd3AsciiRules;
  // Creates a mapping object. The two option fields have no initializers,
  // so both flags start out as false.
  public IdnMapping() { }
  // Option flag for allowing unassigned IDN characters (see the file header).
  // Reads and writes the _allowUnassigned backing field.
  public bool AllowUnassigned
  {
      get => _allowUnassigned;
      set => _allowUnassigned = value;
  }
  // Option flag for validating names against the Std3 ASCII rules (like DNS names).
  // Reads and writes the _useStd3AsciiRules backing field; the invariant GetAscii path
  // passes it to ValidateStd3AndAscii.
  public bool UseStd3AsciiRules
  {
      get => _useStd3AsciiRules;
      set => _useStd3AsciiRules = value;
  }
  // Gets the ASCII (Punycode) version of the whole string.
  // Forwards to the (string, int) overload with a start index of 0.
  public string GetAscii(string unicode) => GetAscii(unicode, 0);
  // Gets the ASCII (Punycode) version of the string from 'index' to its end.
  // Throws ArgumentNullException when 'unicode' is null; all other validation
  // is performed by the (string, int, int) overload.
  public string GetAscii(string unicode, int index)
  {
      if (unicode == null)
      {
          throw new ArgumentNullException(nameof(unicode));
      }

      int remaining = unicode.Length - index;
      return GetAscii(unicode, index, remaining);
  }
  // Gets the ASCII (Punycode) version of 'count' characters of 'unicode' starting at 'index'.
  //
  // Throws:
  //   ArgumentNullException       - 'unicode' is null.
  //   ArgumentOutOfRangeException - 'index' or 'count' is negative, 'index' is past the end,
  //                                 or 'index' + 'count' runs past the end of the string.
  //   ArgumentException           - 'count' is 0, or the last selected character is '\0'.
  //
  // In invariant globalization mode the managed implementation (GetAsciiInvariant) is used;
  // otherwise the work is delegated to GetAsciiCore, which is defined outside this section.
  public string GetAscii(string unicode, int index, int count)
  {
      if (unicode == null)
      {
          throw new ArgumentNullException(nameof(unicode));
      }

      // index is reported first when both arguments are negative.
      if (index < 0)
      {
          throw new ArgumentOutOfRangeException(nameof(index), SR.ArgumentOutOfRange_NeedNonNegNum);
      }

      if (count < 0)
      {
          throw new ArgumentOutOfRangeException(nameof(count), SR.ArgumentOutOfRange_NeedNonNegNum);
      }

      if (index > unicode.Length)
      {
          throw new ArgumentOutOfRangeException(nameof(index), SR.ArgumentOutOfRange_Index);
      }

      if (index > unicode.Length - count)
      {
          throw new ArgumentOutOfRangeException(nameof(unicode), SR.ArgumentOutOfRange_IndexCountBuffer);
      }

      if (count == 0)
      {
          throw new ArgumentException(SR.Argument_IdnBadLabelSize, nameof(unicode));
      }

      // A terminating null character is rejected up front.
      int lastIndex = index + count - 1;
      if (unicode[lastIndex] == 0)
      {
          throw new ArgumentException(SR.Format(SR.Argument_InvalidCharSequence, lastIndex), nameof(unicode));
      }

      if (GlobalizationMode.Invariant)
      {
          return GetAsciiInvariant(unicode, index, count);
      }

      unsafe
      {
          fixed (char* pUnicode = unicode)
          {
              return GetAsciiCore(unicode, pUnicode + index, count);
          }
      }
  }
  // Gets the Unicode version of the whole string. Normalized and limited to IDNA characters.
  // Forwards to the (string, int) overload with a start index of 0.
  public string GetUnicode(string ascii) => GetUnicode(ascii, 0);
  // Gets the Unicode version of the string from 'index' to its end.
  // Throws ArgumentNullException when 'ascii' is null; all other validation
  // is performed by the (string, int, int) overload.
  public string GetUnicode(string ascii, int index)
  {
      if (ascii == null)
      {
          throw new ArgumentNullException(nameof(ascii));
      }

      int remaining = ascii.Length - index;
      return GetUnicode(ascii, index, remaining);
  }
  // Gets the Unicode version of 'count' characters of 'ascii' starting at 'index'.
  //
  // Throws:
  //   ArgumentNullException       - 'ascii' is null.
  //   ArgumentOutOfRangeException - 'index' or 'count' is negative, 'index' is past the end,
  //                                 or 'index' + 'count' runs past the end of the string.
  //   ArgumentException           - the last selected character is '\0'.
  //
  // In invariant globalization mode the managed implementation (GetUnicodeInvariant) is used;
  // otherwise the work is delegated to GetUnicodeCore, which is defined outside this section.
  public string GetUnicode(string ascii, int index, int count)
  {
      if (ascii == null)
      {
          throw new ArgumentNullException(nameof(ascii));
      }

      // index is reported first when both arguments are negative.
      if (index < 0)
      {
          throw new ArgumentOutOfRangeException(nameof(index), SR.ArgumentOutOfRange_NeedNonNegNum);
      }

      if (count < 0)
      {
          throw new ArgumentOutOfRangeException(nameof(count), SR.ArgumentOutOfRange_NeedNonNegNum);
      }

      if (index > ascii.Length)
      {
          throw new ArgumentOutOfRangeException(nameof(index), SR.ArgumentOutOfRange_Index);
      }

      if (index > ascii.Length - count)
      {
          throw new ArgumentOutOfRangeException(nameof(ascii), SR.ArgumentOutOfRange_IndexCountBuffer);
      }

      // This is a case (i.e. explicitly null-terminated input) where behavior in .NET and Win32 intentionally differ.
      // The .NET APIs should (and did in v4.0 and earlier) throw an ArgumentException on input that includes a terminating null.
      // The Win32 APIs fail on an embedded null, but not on a terminating null.
      if (count > 0)
      {
          char last = ascii[index + count - 1];
          if (last == (char)0)
          {
              throw new ArgumentException(SR.Argument_IdnBadPunycode, nameof(ascii));
          }
      }

      if (GlobalizationMode.Invariant)
      {
          return GetUnicodeInvariant(ascii, index, count);
      }

      unsafe
      {
          fixed (char* pAscii = ascii)
          {
              return GetUnicodeCore(ascii, pAscii + index, count);
          }
      }
  }
  // Two IdnMapping objects are equal when both option flags match.
  // Returns false for null and for objects of any other type.
  public override bool Equals(object obj)
  {
      if (!(obj is IdnMapping other))
      {
          return false;
      }

      bool sameUnassigned = _allowUnassigned == other._allowUnassigned;
      bool sameStd3 = _useStd3AsciiRules == other._useStd3AsciiRules;
      return sameUnassigned && sameStd3;
  }
  // Hash derived only from the two option flags, so objects that compare
  // equal in Equals produce the same value (one of 1100, 1200, 2100, 2200).
  public override int GetHashCode()
  {
      int hash = _allowUnassigned ? 100 : 200;
      hash += _useStd3AsciiRules ? 1000 : 2000;
      return hash;
  }
  // Picks the string to hand back for a conversion result held in 'output'.
  // When the input buffer covers the entire original string and the output characters
  // are identical to the input characters, the original string instance is returned
  // (no allocation); otherwise a new string is created from the output buffer.
  [MethodImpl(MethodImplOptions.AggressiveInlining)]
  private static unsafe string GetStringForOutput(string originalString, char* input, int inputLength, char* output, int outputLength)
  {
      if (originalString.Length == inputLength)
      {
          ReadOnlySpan<char> source = new ReadOnlySpan<char>(input, inputLength);
          ReadOnlySpan<char> result = new ReadOnlySpan<char>(output, outputLength);
          if (source.SequenceEqual(result))
          {
              return originalString;
          }
      }

      return new string(output, 0, outputLength);
  }
  //
  // Invariant implementation
  //

  // Separator between the basic (ASCII) code points and the encoded part of a Punycode label.
  private const char c_delimiter = '-';

  // ACE prefix that marks a label as Punycode-encoded.
  private const string c_strAcePrefix = "xn--";

  private const int c_labelLimit = 63; // Not including dots
  private const int c_defaultNameLimit = 255; // Including dots

  // Punycode (RFC 3492) bootstring parameters.
  private const int c_initialN = 0x80;

  // Upper bound used by the overflow checks.
  // NOTE(review): this is 0x7ffffff (seven hex digits), i.e. smaller than int.MaxValue.
  // It is left unchanged here because the encoder/decoder checks are written against it.
  private const int c_maxint = 0x7ffffff;

  private const int c_initialBias = 72;
  private const int c_punycodeBase = 36;
  private const int c_tmin = 1;
  private const int c_tmax = 26;
  private const int c_skew = 38;
  private const int c_damp = 700;

  // Legal "dot" separators (i.e: . in www.microsoft.com)
  // readonly: the array is shared by every call and must never be replaced.
  private static readonly char[] c_Dots = { '.', '\u3002', '\uFF0E', '\uFF61' };
  // Managed GetAscii implementation used in invariant globalization mode.
  // Validates the selected part of 'unicode'; pure-ASCII names that pass validation are
  // returned unchanged, anything else is Punycode-encoded label by label.
  private string GetAsciiInvariant(string unicode, int index, int count)
  {
      // Work on just the requested slice.
      bool needsSlice = index > 0 || count < unicode.Length;
      if (needsSlice)
      {
          unicode = unicode.Substring(index, count);
      }

      // An ASCII-only string that passes validation is returned unchanged.
      bool isValidAscii = ValidateStd3AndAscii(unicode, UseStd3AsciiRules, true);
      if (isValidAscii)
      {
          return unicode;
      }

      // Cannot be null terminated (normalization won't help us with this one, and
      // may have returned false before checking the whole string above)
      Debug.Assert(count >= 1, "[IdnMapping.GetAscii] Expected 0 length strings to fail before now.");
      int lastIndex = unicode.Length - 1;
      if (unicode[lastIndex] <= 0x1f)
      {
          throw new ArgumentException(SR.Format(SR.Argument_InvalidCharSequence, unicode.Length - 1), nameof(unicode));
      }

      // The first pass stops at the first non-ASCII character, so the Std3 rules
      // have to be re-applied to the whole string when they are enabled.
      if (UseStd3AsciiRules)
      {
          ValidateStd3AndAscii(unicode, true, false);
      }

      return PunycodeEncode(unicode);
  }
  // Validates a dotted name and, optionally, reports whether it is ASCII-only.
  //
  //   unicode     - the name to check (must not be null).
  //   bUseStd3    - when true, every non-dot character is checked with ValidateStd3 and
  //                 a '-' directly next to a dot (or at either end of the name) is rejected.
  //   bCheckAscii - when true, the method returns false as soon as it meets a character
  //                 >= 0x7f; the remainder of the string is then NOT validated.
  //
  // Returns true when the whole string was scanned and passed every check.
  //
  // Throws ArgumentException for: an empty string, a control character (<= 0x1f),
  // an empty label (leading dot or two dots in a row), a label longer than c_labelLimit,
  // a name longer than c_defaultNameLimit (one less when there is no trailing dot),
  // and - via ValidateStd3 - Std3 violations.
  private static bool ValidateStd3AndAscii(string unicode, bool bUseStd3, bool bCheckAscii)
  {
      // If its empty, then its too small
      if (unicode.Length == 0)
          throw new ArgumentException(SR.Argument_IdnBadLabelSize, nameof(unicode));

      int iLastDot = -1;

      // Loop the whole string
      for (int i = 0; i < unicode.Length; i++)
      {
          char c = unicode[i];

          // Aren't allowing control chars (or 7f, but idn tables catch that, they don't catch \0 at end though)
          if (c <= 0x1f)
          {
              throw new ArgumentException(SR.Format(SR.Argument_InvalidCharSequence, i), nameof(unicode));
          }

          // If its Unicode or a control character, return false (non-ascii)
          if (bCheckAscii && c >= 0x7f)
              return false;

          // Check for dots
          if (IsDot(c))
          {
              // Can't have 2 dots in a row
              if (i == iLastDot + 1)
                  throw new ArgumentException(SR.Argument_IdnBadLabelSize, nameof(unicode));

              // If its too far between dots then fail
              if (i - iLastDot > c_labelLimit + 1)
                  throw new ArgumentException(SR.Argument_IdnBadLabelSize, nameof(unicode));

              // If validating Std3, then char before dot can't be - char
              if (bUseStd3 && i > 0)
                  ValidateStd3(unicode[i - 1], true);

              // Remember where the last dot is
              iLastDot = i;
              continue;
          }

          // If necessary, make sure its a valid std3 character
          if (bUseStd3)
          {
              ValidateStd3(c, i == iLastDot + 1);
          }
      }

      // The label after the last dot (or the whole string when there is no dot) is not
      // terminated by a dot, so the in-loop length check never sees it. Check it here.
      // With iLastDot == -1 this is the "no dot at all" case; with a trailing dot the
      // remaining length is 0 and the check is a no-op.
      if (unicode.Length - iLastDot - 1 > c_labelLimit)
          throw new ArgumentException(SR.Argument_IdnBadLabelSize, nameof(unicode));

      // Need to validate entire string length, 1 shorter if last char wasn't a dot
      bool endsWithDot = IsDot(unicode[unicode.Length - 1]);
      int nameLimit = c_defaultNameLimit - (endsWithDot ? 0 : 1);
      if (unicode.Length > nameLimit)
          throw new ArgumentException(SR.Format(SR.Argument_IdnBadNameSize, nameLimit), nameof(unicode));

      // If last char wasn't a dot we need to check for trailing -
      if (bUseStd3 && !endsWithDot)
          ValidateStd3(unicode[unicode.Length - 1], true);

      return true;
  }
  // PunycodeEncode() converts a dotted Unicode name to its ASCII (Punycode) form.
  //
  // The name is split on any of the separators in c_Dots and each label is handled
  // independently:
  //   * The first and last characters and every character in between are checked for a
  //     consistent bidi direction (a label that starts right-to-left must end right-to-left
  //     and contain no left-to-right characters; any other label must contain no
  //     right-to-left characters).
  //   * A label made only of basic (< 0x80) code points is emitted through EncodeBasic
  //     without a prefix.
  //   * Any other label is emitted as "xn--" + basic code points + '-' (only if there were
  //     basic code points) + the generalized variable-length integers of RFC 3492.
  //     Such a label must not itself start with "xn--".
  // Separators are always written to the output as '.'.
  //
  // EncodeBasic, EncodeDigit and Adapt are defined outside this section of the file.
  //
  // Throws ArgumentException when the string is empty, a label is empty (other than a
  // single trailing dot), the bidi rules are violated, a non-ASCII label starts with the
  // ACE prefix, an encoded label exceeds c_labelLimit (including when the delta
  // computation would overflow, which can only happen for labels far beyond that limit),
  // or the encoded name exceeds the name limit.
  private static string PunycodeEncode(string unicode)
  {
      // 0 length strings aren't allowed
      if (unicode.Length == 0)
          throw new ArgumentException(SR.Argument_IdnBadLabelSize, nameof(unicode));

      StringBuilder output = new StringBuilder(unicode.Length);
      int iNextDot = 0;
      int iAfterLastDot = 0;
      int iOutputAfterLastDot = 0;

      // Find the next dot
      while (iNextDot < unicode.Length)
      {
          // Find end of this segment
          iNextDot = unicode.IndexOfAny(c_Dots, iAfterLastDot);
          Debug.Assert(iNextDot <= unicode.Length, "[IdnMapping.punycode_encode]IndexOfAny is broken");
          if (iNextDot < 0)
              iNextDot = unicode.Length;

          // Only allowed to have empty . section at end (www.microsoft.com.)
          if (iNextDot == iAfterLastDot)
          {
              // Only allowed to have empty sections as trailing .
              if (iNextDot != unicode.Length)
                  throw new ArgumentException(SR.Argument_IdnBadLabelSize, nameof(unicode));

              // Last dot, stop
              break;
          }

          // We'll need an Ace prefix (removed again below if the label turns out to be all basic)
          output.Append(c_strAcePrefix);

          // Everything resets every segment.
          bool bRightToLeft = false;

          // Check for RTL. If right-to-left, then 1st & last chars must be RTL
          BidiCategory eBidi = CharUnicodeInfo.GetBidiCategory(unicode, iAfterLastDot);
          if (eBidi == BidiCategory.RightToLeft || eBidi == BidiCategory.RightToLeftArabic)
          {
              // It has to be right to left.
              bRightToLeft = true;

              // Check last char (step back to the high surrogate if the label ends with a pair)
              int iTest = iNextDot - 1;
              if (char.IsLowSurrogate(unicode, iTest))
              {
                  iTest--;
              }

              eBidi = CharUnicodeInfo.GetBidiCategory(unicode, iTest);
              if (eBidi != BidiCategory.RightToLeft && eBidi != BidiCategory.RightToLeftArabic)
              {
                  // Oops, last wasn't RTL, last should be RTL if first is RTL
                  throw new ArgumentException(SR.Argument_IdnBadBidi, nameof(unicode));
              }
          }

          // Handle the basic code points
          int basicCount;
          int numProcessed = 0; // Num code points that have been processed so far (this segment)
          for (basicCount = iAfterLastDot; basicCount < iNextDot; basicCount++)
          {
              // Can't be lonely surrogate because it would've thrown in normalization
              Debug.Assert(char.IsLowSurrogate(unicode, basicCount) == false, "[IdnMapping.punycode_encode]Unexpected low surrogate");

              // Double check our bidi rules
              BidiCategory testBidi = CharUnicodeInfo.GetBidiCategory(unicode, basicCount);

              // If we're RTL, we can't have LTR chars
              if (bRightToLeft && testBidi == BidiCategory.LeftToRight)
              {
                  throw new ArgumentException(SR.Argument_IdnBadBidi, nameof(unicode));
              }

              // If we're not RTL we can't have RTL chars
              if (!bRightToLeft && (testBidi == BidiCategory.RightToLeft || testBidi == BidiCategory.RightToLeftArabic))
              {
                  throw new ArgumentException(SR.Argument_IdnBadBidi, nameof(unicode));
              }

              // If its basic then add it
              if (Basic(unicode[basicCount]))
              {
                  output.Append(EncodeBasic(unicode[basicCount]));
                  numProcessed++;
              }
              // If its a surrogate, skip the next since our bidi category tester doesn't handle it.
              else if (char.IsSurrogatePair(unicode, basicCount))
                  basicCount++;
          }

          int numBasicCodePoints = numProcessed; // number of basic code points
          int segmentLength = iNextDot - iAfterLastDot; // in UTF-16 code units

          // Stop if we ONLY had basic code points
          if (numBasicCodePoints == segmentLength)
          {
              // Get rid of xn-- and this segments done
              output.Remove(iOutputAfterLastDot, c_strAcePrefix.Length);
          }
          else
          {
              // If it has some non-basic code points the input cannot start with xn--
              if (unicode.Length - iAfterLastDot >= c_strAcePrefix.Length &&
                  string.Compare(unicode, iAfterLastDot, c_strAcePrefix, 0, c_strAcePrefix.Length, StringComparison.OrdinalIgnoreCase) == 0)
                  throw new ArgumentException(SR.Argument_IdnBadPunycode, nameof(unicode));

              // Need to do ACE encoding
              int numSurrogatePairs = 0; // number of surrogate pairs so far

              // Add a delimiter (-) if we had any basic code points (between basic and encoded pieces)
              if (numBasicCodePoints > 0)
              {
                  output.Append(c_delimiter);
              }

              // Initialize the state
              int n = c_initialN;
              int delta = 0;
              int bias = c_initialBias;

              // Main loop. numProcessed counts UTF-16 code units, so a supplementary
              // character advances it by two (see the bottom of the inner loop).
              while (numProcessed < segmentLength)
              {
                  /* All non-basic code points < n have been */
                  /* handled already. Find the next larger one: */
                  int j;
                  int m;
                  int test = 0;
                  for (m = c_maxint, j = iAfterLastDot;
                       j < iNextDot;
                       j += IsSupplementary(test) ? 2 : 1)
                  {
                      test = char.ConvertToUtf32(unicode, j);
                      if (test >= n && test < m) m = test;
                  }

                  /* Increase delta enough to advance the decoder's */
                  /* <n,i> state to <m,0>, but guard against overflow: */
                  // The guard can only trigger for labels that are far longer than c_labelLimit,
                  // so it reports the same error the label-size check below would report.
                  int codePointsHandledPlusOne = (numProcessed - numSurrogatePairs) + 1;
                  if (m - n > (c_maxint - delta) / codePointsHandledPlusOne)
                      throw new ArgumentException(SR.Argument_IdnBadLabelSize, nameof(unicode));

                  delta += (m - n) * codePointsHandledPlusOne;
                  // delta may legitimately be 0 here (m == n on the first pass), so only a negative value indicates overflow.
                  Debug.Assert(delta >= 0, "[IdnMapping.cs]1 punycode_encode - delta overflowed int");
                  n = m;

                  for (j = iAfterLastDot; j < iNextDot; j += IsSupplementary(test) ? 2 : 1)
                  {
                      // Make sure we're aware of surrogates
                      test = char.ConvertToUtf32(unicode, j);

                      // Adjust for character position (only the chars in our string already, some
                      // haven't been processed.
                      if (test < n)
                      {
                          delta++;
                          Debug.Assert(delta > 0, "[IdnMapping.cs]2 punycode_encode - delta overflowed int");
                      }

                      if (test == n)
                      {
                          // Represent delta as a generalized variable-length integer:
                          int q, k;
                          for (q = delta, k = c_punycodeBase; ; k += c_punycodeBase)
                          {
                              int t = k <= bias ? c_tmin : k >= bias + c_tmax ? c_tmax : k - bias;
                              if (q < t) break;
                              Debug.Assert(c_punycodeBase != t, "[IdnMapping.punycode_encode]Expected c_punycodeBase (36) to be != t");
                              output.Append(EncodeDigit(t + (q - t) % (c_punycodeBase - t)));
                              q = (q - t) / (c_punycodeBase - t);
                          }

                          output.Append(EncodeDigit(q));
                          bias = Adapt(delta, (numProcessed - numSurrogatePairs) + 1, numProcessed == numBasicCodePoints);
                          delta = 0;
                          numProcessed++;

                          if (IsSupplementary(m))
                          {
                              numProcessed++;
                              numSurrogatePairs++;
                          }
                      }
                  }

                  ++delta;
                  ++n;
                  Debug.Assert(delta > 0, "[IdnMapping.cs]3 punycode_encode - delta overflowed int");
              }
          }

          // Make sure its not too big
          if (output.Length - iOutputAfterLastDot > c_labelLimit)
              throw new ArgumentException(SR.Argument_IdnBadLabelSize, nameof(unicode));

          // Done with this segment, add dot if necessary
          if (iNextDot != unicode.Length)
              output.Append('.');

          iAfterLastDot = iNextDot + 1;
          iOutputAfterLastDot = output.Length;
      }

      // Throw if we're too long (the limit is one less when the name has no trailing dot)
      int nameLimit = c_defaultNameLimit - (IsDot(unicode[unicode.Length - 1]) ? 0 : 1);
      if (output.Length > nameLimit)
          throw new ArgumentException(SR.Format(SR.Argument_IdnBadNameSize, nameLimit), nameof(unicode));

      // Return our output string
      return output.ToString();
  }
  // Is it a dot?
  // Returns true for U+002E (., full stop), U+3002 (ideographic full stop),
  // U+FF0E (fullwidth full stop) and U+FF61 (halfwidth ideographic full stop).
  // Note: IDNA Normalization gets rid of dots now, but testing for last dot is before normalization
  private static bool IsDot(char c)
  {
      switch (c)
      {
          case '.':
          case '\u3002':
          case '\uFF0E':
          case '\uFF61':
              return true;
          default:
              return false;
      }
  }
  // True when the code point lies above the Basic Multilingual Plane (> U+FFFF),
  // i.e. it takes a surrogate pair in a UTF-16 string.
  private static bool IsSupplementary(int cTest) => cTest > 0xFFFF;
  // True when the code point is in the ASCII range (0x00-0x7F), which Punycode
  // treats as a "basic" code point.
  private static bool Basic(uint cp) => cp <= 0x7F;
  // Validate Std3 rules for a character.
  // Throws ArgumentException when 'c' is one of the ASCII characters rejected below
  // (everything up to ',', then '/', ':'..'@', '['..'`' and '{'..0x7F), or when 'c' is
  // '-' and bNextToDot is true. Characters above 0x7F are never rejected here.
  private static void ValidateStd3(char c, bool bNextToDot)
  {
      bool isForbiddenAscii =
          c <= ',' ||
          c == '/' ||
          (c >= ':' && c <= '@') ||
          (c >= '[' && c <= '`') ||
          (c >= '{' && c <= (char)0x7F);

      bool isMisplacedHyphen = c == '-' && bNextToDot;

      if (isForbiddenAscii || isMisplacedHyphen)
      {
          throw new ArgumentException(SR.Format(SR.Argument_IdnBadStd3, c), nameof(c));
      }
  }
  // Managed GetUnicode implementation used in invariant globalization mode.
  // Decodes the selected part of 'ascii' and then re-encodes the result with GetAscii;
  // the name is accepted only if that round trip reproduces the input (ignoring case).
  // Throws ArgumentException when the round trip does not match.
  private string GetUnicodeInvariant(string ascii, int index, int count)
  {
      // Work on just the requested slice.
      bool needsSlice = index > 0 || count < ascii.Length;
      if (needsSlice)
      {
          ascii = ascii.Substring(index, count);
      }

      // Convert Punycode to Unicode
      string decoded = PunycodeDecode(ascii);

      // Output name MUST obey IDNA rules & round trip (casing differences are allowed)
      string roundTripped = GetAscii(decoded);
      if (!string.Equals(ascii, roundTripped, StringComparison.OrdinalIgnoreCase))
      {
          throw new ArgumentException(SR.Argument_IdnIllegalName, nameof(ascii));
      }

      return decoded;
  }
  484. /* PunycodeDecode() converts Punycode to Unicode. The input is */
  485. /* represented as an array of ASCII code points, and the output */
  486. /* will be represented as an array of Unicode code points. The */
  487. /* input_length is the number of code points in the input. The */
  488. /* output_length is an in/out argument: the caller passes in */
  489. /* the maximum number of code points that it can receive, and */
  490. /* on successful return it will contain the actual number of */
  491. /* code points output. The case_flags array needs room for at */
  492. /* least output_length values, or it can be a null pointer if the */
  493. /* case information is not needed. A nonzero flag suggests that */
  494. /* the corresponding Unicode character be forced to uppercase */
  495. /* by the caller (if possible), while zero suggests that it be */
  496. /* forced to lowercase (if possible). ASCII code points are */
  497. /* output already in the proper case, but their flags will be set */
  498. /* appropriately so that applying the flags would be harmless. */
  499. /* The return value can be any of the punycode_status values */
  500. /* defined above; if not punycode_success, then output_length, */
  501. /* output, and case_flags might contain garbage. On success, the */
  502. /* decoder will never need to write an output_length greater than */
  503. /* input_length, because of how the encoding is defined. */
  // Decodes a dotted ASCII name into its Unicode form.
  // The name is split on '.'; a label that does not start with the ACE prefix ("xn--",
  // compared ignoring case) is copied to the output unchanged, any other label is
  // Punycode-decoded (its basic code points are lower-cased) and then checked for a
  // consistent bidi direction.
  // Throws ArgumentException for an empty string, an empty label other than a single
  // trailing dot, a label or name that exceeds the size limits, malformed Punycode
  // (including decoded values that are surrogates or above 0x10FFFF) and bidi violations.
  // DecodeDigit and Adapt are defined outside this section of the file.
  504. private static string PunycodeDecode(string ascii)
  505. {
  506. // 0 length strings aren't allowed
  507. if (ascii.Length == 0)
  508. throw new ArgumentException(SR.Argument_IdnBadLabelSize, nameof(ascii));
  509. // Throw if we're too long
  // (the limit is one less when the name has no trailing dot)
  510. if (ascii.Length > c_defaultNameLimit - (IsDot(ascii[ascii.Length-1]) ? 0 : 1))
  511. throw new ArgumentException(SR.Format(SR.Argument_IdnBadNameSize,
  512. c_defaultNameLimit - (IsDot(ascii[ascii.Length-1]) ? 0 : 1)), nameof(ascii));
  513. // output stringbuilder
  514. StringBuilder output = new StringBuilder(ascii.Length);
  515. // Dot searching
  // iNextDot: index of the dot that ends the current label (or ascii.Length)
  // iAfterLastDot: index in ascii where the current label starts
  // iOutputAfterLastDot: index in output where the current label starts
  516. int iNextDot = 0;
  517. int iAfterLastDot = 0;
  518. int iOutputAfterLastDot = 0;
  519. while (iNextDot < ascii.Length)
  520. {
  521. // Find end of this segment
  522. iNextDot = ascii.IndexOf('.', iAfterLastDot);
  523. if (iNextDot < 0 || iNextDot > ascii.Length)
  524. iNextDot = ascii.Length;
  525. // Only allowed to have empty . section at end (www.microsoft.com.)
  526. if (iNextDot == iAfterLastDot)
  527. {
  528. // Only allowed to have empty sections as trailing .
  529. if (iNextDot != ascii.Length)
  530. throw new ArgumentException(SR.Argument_IdnBadLabelSize, nameof(ascii));
  531. // Last dot, stop
  532. break;
  533. }
  534. // In either case it can't be bigger than segment size
  535. if (iNextDot - iAfterLastDot > c_labelLimit)
  536. throw new ArgumentException(SR.Argument_IdnBadLabelSize, nameof(ascii));
  537. // See if this section's ASCII or ACE
  538. if (ascii.Length < c_strAcePrefix.Length + iAfterLastDot ||
  539. string.Compare(ascii, iAfterLastDot, c_strAcePrefix, 0, c_strAcePrefix.Length, StringComparison.OrdinalIgnoreCase) != 0)
  540. {
  541. // Its ASCII, copy it
  542. output.Append(ascii, iAfterLastDot, iNextDot - iAfterLastDot);
  543. }
  544. else
  545. {
  546. // Not ASCII, bump up iAfterLastDot to be after ACE Prefix
  547. iAfterLastDot += c_strAcePrefix.Length;
  548. // Get number of basic code points (where delimiter is)
  549. // numBasicCodePoints < 0 if there're no basic code points
  // The search runs backwards from the last character of the label; a hit at or before
  // iAfterLastDot is treated below as "no basic code points".
  550. int iTemp = ascii.LastIndexOf(c_delimiter, iNextDot - 1);
  551. // Trailing - not allowed
  552. if (iTemp == iNextDot - 1)
  553. throw new ArgumentException(SR.Argument_IdnBadPunycode, nameof(ascii));
  554. int numBasicCodePoints;
  555. if (iTemp <= iAfterLastDot)
  556. numBasicCodePoints = 0;
  557. else
  558. {
  559. numBasicCodePoints = iTemp - iAfterLastDot;
  560. // Copy all the basic code points, making sure they're all in the allowed range,
  561. // and losing the casing for all of them.
  562. for (int copyAscii = iAfterLastDot; copyAscii < iAfterLastDot + numBasicCodePoints; copyAscii++)
  563. {
  564. // Make sure we don't allow unicode in the ascii part
  565. if (ascii[copyAscii] > 0x7f)
  566. throw new ArgumentException(SR.Argument_IdnBadPunycode, nameof(ascii));
  567. // When appending make sure they get lower cased
  568. output.Append((char)(ascii[copyAscii] >= 'A' && ascii[copyAscii] <='Z' ? ascii[copyAscii] - 'A' + 'a' : ascii[copyAscii]));
  569. }
  570. }
  571. // Get ready for main loop. Start at beginning if we didn't have any
  572. // basic code points, otherwise start after the -.
  573. // asciiIndex will be next character to read from ascii
  574. int asciiIndex = iAfterLastDot + (numBasicCodePoints > 0 ? numBasicCodePoints + 1 : 0);
  575. // initialize our state
  576. int n = c_initialN;
  577. int bias = c_initialBias;
  578. int i = 0;
  579. int w, k;
  580. // no Supplementary characters yet
  581. int numSurrogatePairs = 0;
  582. // Main loop, read rest of ascii
  583. while (asciiIndex < iNextDot)
  584. {
  585. /* Decode a generalized variable-length integer into delta, */
  586. /* which gets added to i. The overflow checking is easier */
  587. /* if we increase i as we go, then subtract off its starting */
  588. /* value at the end to obtain delta. */
  589. int oldi = i;
  590. for (w = 1, k = c_punycodeBase; ; k += c_punycodeBase)
  591. {
  592. // Check to make sure we aren't overrunning our ascii string
  593. if (asciiIndex >= iNextDot)
  594. throw new ArgumentException(SR.Argument_IdnBadPunycode, nameof(ascii));
  595. // decode the digit from the next char
  596. int digit = DecodeDigit(ascii[asciiIndex++]);
  597. Debug.Assert(w > 0, "[IdnMapping.punycode_decode]Expected w > 0");
  // Reject the digit if i + digit * w would exceed c_maxint
  598. if (digit > (c_maxint - i) / w)
  599. throw new ArgumentException(SR.Argument_IdnBadPunycode, nameof(ascii));
  600. i += (int)(digit * w);
  // t is the threshold for this digit position: k - bias clamped to [c_tmin, c_tmax];
  // a digit below the threshold is the last digit of the integer
  601. int t = k <= bias ? c_tmin : k >= bias + c_tmax ? c_tmax : k - bias;
  602. if (digit < t)
  603. break;
  604. Debug.Assert(c_punycodeBase != t, "[IdnMapping.punycode_decode]Expected t != c_punycodeBase (36)");
  // Reject the input if the next weight would exceed c_maxint
  605. if (w > c_maxint / (c_punycodeBase - t))
  606. throw new ArgumentException(SR.Argument_IdnBadPunycode, nameof(ascii));
  607. w *= (c_punycodeBase - t);
  608. }
  // (output.Length - iOutputAfterLastDot - numSurrogatePairs) is the number of code points
  // decoded so far for this label (a surrogate pair occupies two chars in output)
  609. bias = Adapt(i - oldi, (output.Length - iOutputAfterLastDot - numSurrogatePairs) + 1, oldi == 0);
  610. /* i was supposed to wrap around from output.Length to 0, */
  611. /* incrementing n each time, so we'll fix that now: */
  612. Debug.Assert((output.Length - iOutputAfterLastDot - numSurrogatePairs) + 1 > 0,
  613. "[IdnMapping.punycode_decode]Expected to have added > 0 characters this segment");
  614. if (i / ((output.Length - iOutputAfterLastDot - numSurrogatePairs) + 1) > c_maxint - n)
  615. throw new ArgumentException(SR.Argument_IdnBadPunycode, nameof(ascii));
  616. n += (int)(i / (output.Length - iOutputAfterLastDot - numSurrogatePairs + 1));
  617. i %= (output.Length - iOutputAfterLastDot - numSurrogatePairs + 1);
  618. // Make sure n is legal
  // (a valid code point that is not in the surrogate range)
  619. if ((n < 0 || n > 0x10ffff) || (n >= 0xD800 && n <= 0xDFFF))
  620. throw new ArgumentException(SR.Argument_IdnBadPunycode, nameof(ascii));
  621. // insert n at position i of the output: Really tricky if we have surrogates
  622. int iUseInsertLocation;
  623. string strTemp = char.ConvertFromUtf32(n);
  624. // If we have supplementary characters
  625. if (numSurrogatePairs > 0)
  626. {
  627. // Hard way, we have supplementary characters
  // i counts code points, so walk the output and step over both halves of each pair
  628. int iCount;
  629. for (iCount = i, iUseInsertLocation = iOutputAfterLastDot; iCount > 0; iCount--, iUseInsertLocation++)
  630. {
  631. // If its a surrogate, we have to go one more
  632. if (iUseInsertLocation >= output.Length)
  633. throw new ArgumentException(SR.Argument_IdnBadPunycode, nameof(ascii));
  634. if (char.IsSurrogate(output[iUseInsertLocation]))
  635. iUseInsertLocation++;
  636. }
  637. }
  638. else
  639. {
  640. // No Supplementary chars yet, just add i
  641. iUseInsertLocation = iOutputAfterLastDot + i;
  642. }
  643. // Insert it
  644. output.Insert(iUseInsertLocation, strTemp);
  645. // If it was a surrogate increment our counter
  646. if (IsSupplementary(n))
  647. numSurrogatePairs++;
  648. // Index gets updated
  649. i++;
  650. }
  651. // Do BIDI testing
  652. bool bRightToLeft = false;
  653. // Check for RTL. If right-to-left, then 1st & last chars must be RTL
  654. BidiCategory eBidi = CharUnicodeInfo.GetBidiCategory(output, iOutputAfterLastDot);
  655. if (eBidi == BidiCategory.RightToLeft || eBidi == BidiCategory.RightToLeftArabic)
  656. {
  657. // It has to be right to left.
  658. bRightToLeft = true;
  659. }
  660. // Check the rest of them to make sure RTL/LTR is consistent
  661. for (int iTest = iOutputAfterLastDot; iTest < output.Length; iTest++)
  662. {
  663. // This might happen if we run into a pair
  664. if (char.IsLowSurrogate(output[iTest]))
  665. continue;
  666. // Check to see if its LTR
  667. eBidi = CharUnicodeInfo.GetBidiCategory(output, iTest);
  668. if ((bRightToLeft && eBidi == BidiCategory.LeftToRight) ||
  669. (!bRightToLeft && (eBidi == BidiCategory.RightToLeft || eBidi == BidiCategory.RightToLeftArabic)))
  670. throw new ArgumentException(SR.Argument_IdnBadBidi, nameof(ascii));
  671. }
  672. // Its also a requirement that the last one be RTL if 1st is RTL
  // (eBidi still holds the category of the last character examined by the loop above)
  673. if (bRightToLeft && eBidi != BidiCategory.RightToLeft && eBidi != BidiCategory.RightToLeftArabic)
  674. {
  675. // Oops, last wasn't RTL, last should be RTL if first is RTL
  676. throw new ArgumentException(SR.Argument_IdnBadBidi, nameof(ascii));
  677. }
  678. }
  679. // See if this label was too long
  // NOTE(review): this measures the input label again (minus the ACE prefix for encoded
  // labels, since iAfterLastDot was advanced), which was already checked before decoding;
  // it looks redundant - confirm whether the decoded label length was intended.
  680. if (iNextDot - iAfterLastDot > c_labelLimit)
  681. throw new ArgumentException(SR.Argument_IdnBadLabelSize, nameof(ascii));
  682. // Done with this segment, add dot if necessary
  683. if (iNextDot != ascii.Length)
  684. output.Append('.');
  685. iAfterLastDot = iNextDot + 1;
  686. iOutputAfterLastDot = output.Length;
  687. }
  688. // Throw if we're too long
  689. if (output.Length > c_defaultNameLimit - (IsDot(output[output.Length-1]) ? 0 : 1))
  690. throw new ArgumentException(SR.Format(SR.Argument_IdnBadNameSize, c_defaultNameLimit - (IsDot(output[output.Length-1]) ? 0 : 1)), nameof(ascii));
  691. // Return our output string
  692. return output.ToString();
  693. }
  // DecodeDigit(cp) returns the numeric value of a basic code
  // point (for use in representing integers) in the range 0 to
  // c_punycodeBase-1, or throws ArgumentException if cp does not represent a value.
  // Converts one punycode digit character to its integer value.
  //   cp: the character to decode.
  // Returns 0..25 for a letter (either case) and 26..35 for '0'..'9'.
  // Throws ArgumentException (SR.Argument_IdnBadPunycode) for any other character.
  private static int DecodeDigit(char cp)
  {
      // Letters carry the low values; upper and lower case decode identically.
      if ('a' <= cp && cp <= 'z')
      {
          return cp - 'a';
      }

      if ('A' <= cp && cp <= 'Z')
      {
          return cp - 'A';
      }

      // Decimal digits come after the 26 letters.
      if ('0' <= cp && cp <= '9')
      {
          return 26 + (cp - '0');
      }

      // Anything outside 0-9, A-Z, a-z is not a legal punycode digit.
      throw new ArgumentException(SR.Argument_IdnBadPunycode, nameof(cp));
  }
  // Computes a new value from delta:
  //   1. delta is divided by c_damp when firsttime is true, otherwise by 2;
  //   2. delta / numpoints is added to delta;
  //   3. while delta exceeds ((c_punycodeBase - c_tmin) * c_tmax) / 2 it is
  //      divided by (c_punycodeBase - c_tmin), and c_punycodeBase is added
  //      to a running offset for every division;
  //   4. the offset plus a final scaled term is returned.
  // numpoints must be non-zero (asserted in debug builds).
  // NOTE(review): this looks like the bias adaptation step of the punycode
  // algorithm - confirm against the call sites and the constant definitions.
  private static int Adapt(int delta, int numpoints, bool firsttime)
  {
      if (firsttime)
      {
          delta = delta / c_damp;
      }
      else
      {
          delta = delta / 2;
      }

      Debug.Assert(numpoints != 0, "[IdnMapping.adapt]Expected non-zero numpoints.");
      delta += delta / numpoints;

      // Kept unsigned so the arithmetic in the return expression is unchanged.
      uint offset = 0;
      while (delta > ((c_punycodeBase - c_tmin) * c_tmax) / 2)
      {
          delta /= c_punycodeBase - c_tmin;
          offset += c_punycodeBase;
      }

      Debug.Assert(delta + c_skew != 0, "[IdnMapping.adapt]Expected non-zero delta+skew.");
      return (int)(offset + (c_punycodeBase - c_tmin + 1) * delta / (delta + c_skew));
  }
  /* EncodeBasic(bcp) forces a basic code point to lowercase and */
  /* returns the resulting code point. The code point is */
  /* unchanged if it is not an uppercase letter 'A'-'Z'. The */
  /* behavior is undefined if bcp is not a basic code point. */
  // Lowercases a character that HasUpperCaseFlag reports as upper case by
  // adding the 'a' - 'A' offset; every other character is returned unchanged.
  //   bcp: the character to normalize.
  // Returns the lowercase form of bcp, or bcp itself when it was not upper case.
  private static char EncodeBasic(char bcp)
  {
      if (HasUpperCaseFlag(bcp))
      {
          // The compound assignment narrows the int sum back to char.
          bcp += (char)('a' - 'A');
      }

      return bcp;
  }
  733. // Return whether a punycode code point is flagged as being upper case.
  // Reports whether punychar is one of the letters 'A' through 'Z'.
  //   punychar: the character to test.
  // Returns true only for 'A'..'Z'; false for everything else.
  private static bool HasUpperCaseFlag(char punychar)
  {
      // Anything sorting before 'A' cannot be an uppercase letter.
      if (punychar < 'A')
      {
          return false;
      }

      return punychar <= 'Z';
  }
  /* EncodeDigit(d) returns the basic code point whose value */
  /* (when used for representing integers) is d, which needs to be in */
  /* the range 0 to c_punycodeBase-1. The lowercase form is always used; */
  /* this method takes no flag to request the uppercase form. */
  // Converts a punycode digit value to its character.
  //   d: the digit value; asserted (debug builds only) to satisfy 0 <= d < c_punycodeBase.
  // Returns 'a' + d for values 0..25 and '0' + (d - 26) for larger values.
  private static char EncodeDigit(int d)
  {
      Debug.Assert(d >= 0 && d < c_punycodeBase, "[IdnMapping.encode_digit]Expected 0 <= d < punycodeBase");

      // Values up to 25 become letters (always lowercase); the rest become decimal digits.
      return d <= 25
          ? (char)('a' + d)
          : (char)('0' + (d - 26));
  }
  750. }
  751. }