IdnMapping.cs 40 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892
  1. // Licensed to the .NET Foundation under one or more agreements.
  2. // The .NET Foundation licenses this file to you under the MIT license.
  3. // See the LICENSE file in the project root for more information.
  4. // This file contains the IDN functions and implementation.
  5. //
  6. // This allows encoding of non-ASCII domain names in a "punycode" form,
  7. // for example:
  8. //
  9. // \u5B89\u5BA4\u5948\u7F8E\u6075-with-SUPER-MONKEYS
  10. //
  11. // is encoded as:
  12. //
  13. // xn---with-SUPER-MONKEYS-pc58ag80a8qai00g7n9n
  14. //
  15. // Additional options are provided to allow unassigned IDN characters and
  16. // to validate according to the Std3ASCII Rules (like DNS names).
  17. //
  18. // There are also rules regarding bidirectionality of text and the length
  19. // of segments.
  20. //
  21. // For additional rules see also:
  22. // RFC 3490 - Internationalizing Domain Names in Applications (IDNA)
  23. // RFC 3491 - Nameprep: A Stringprep Profile for Internationalized Domain Names (IDN)
  24. // RFC 3492 - Punycode: A Bootstring encoding of Unicode for Internationalized Domain Names in Applications (IDNA)
  25. using System.Diagnostics;
  26. using System.Runtime.CompilerServices;
  27. using System.Text;
  28. namespace System.Globalization
  29. {
  30. // IdnMapping class used to map names to Punycode
  31. public sealed partial class IdnMapping
  32. {
  // Backing store for the AllowUnassigned property.
  private bool _allowUnassigned;

  // Backing store for the UseStd3AsciiRules property.
  private bool _useStd3AsciiRules;

  // Creates a mapping whose two options start out at their default value (false).
  public IdnMapping() { }
  // Option described in the file header as "allow unassigned IDN characters".
  // NOTE(review): the invariant-mode code visible in this file never reads this flag;
  // presumably it is consumed by the platform-specific part of this partial class - confirm.
  public bool AllowUnassigned
  {
      get => _allowUnassigned;
      set => _allowUnassigned = value;
  }
  // When true, names are additionally validated against the Std3 ASCII rules
  // (see ValidateStd3) during the invariant-mode GetAscii conversion.
  public bool UseStd3AsciiRules
  {
      get => _useStd3AsciiRules;
      set => _useStd3AsciiRules = value;
  }
  // Returns the ASCII (Punycode) form of the whole string.
  // Shorthand for GetAscii(unicode, 0); all validation happens in the overloads it forwards to.
  public string GetAscii(string unicode) => GetAscii(unicode, 0);
  // Returns the ASCII (Punycode) form of the substring that starts at index and
  // runs to the end of the string.
  // Throws ArgumentNullException when unicode is null; range checks on index are
  // performed by the three-argument overload.
  public string GetAscii(string unicode, int index)
  {
      if (unicode == null)
      {
          throw new ArgumentNullException(nameof(unicode));
      }

      int remaining = unicode.Length - index;
      return GetAscii(unicode, index, remaining);
  }
  // Returns the ASCII (Punycode) form of the count characters of unicode starting at index.
  //
  // Throws:
  //   ArgumentNullException       - unicode is null.
  //   ArgumentOutOfRangeException - index or count is negative, index is past the end of the
  //                                 string, or index + count runs past the end of the string.
  //   ArgumentException           - count is zero, or the last selected character is U+0000.
  //
  // In invariant globalization mode the managed implementation (GetAsciiInvariant) is used;
  // otherwise the work is handed to GetAsciiCore, which is defined outside this part of the file.
  public string GetAscii(string unicode, int index, int count)
  {
      if (unicode == null)
      {
          throw new ArgumentNullException(nameof(unicode));
      }

      // index is reported first when both arguments are negative.
      if (index < 0)
      {
          throw new ArgumentOutOfRangeException(nameof(index), SR.ArgumentOutOfRange_NeedNonNegNum);
      }

      if (count < 0)
      {
          throw new ArgumentOutOfRangeException(nameof(count), SR.ArgumentOutOfRange_NeedNonNegNum);
      }

      if (index > unicode.Length)
      {
          throw new ArgumentOutOfRangeException(nameof(index), SR.ArgumentOutOfRange_Index);
      }

      if (index > unicode.Length - count)
      {
          throw new ArgumentOutOfRangeException(nameof(unicode), SR.ArgumentOutOfRange_IndexCountBuffer);
      }

      // An empty selection can never be a valid label.
      if (count == 0)
      {
          throw new ArgumentException(SR.Argument_IdnBadLabelSize, nameof(unicode));
      }

      // Explicitly null-terminated input is rejected up front.
      int lastPosition = index + count - 1;
      if (unicode[lastPosition] == '\0')
      {
          throw new ArgumentException(SR.Format(SR.Argument_InvalidCharSequence, lastPosition), nameof(unicode));
      }

      if (GlobalizationMode.Invariant)
      {
          return GetAsciiInvariant(unicode, index, count);
      }

      unsafe
      {
          // Pin the string so the core implementation can work on a raw pointer to the slice.
          fixed (char* pUnicode = unicode)
          {
              char* pSlice = pUnicode + index;
              return GetAsciiCore(unicode, pSlice, count);
          }
      }
  }
  // Returns the Unicode form of the whole (possibly Punycode-encoded) string.
  // Shorthand for GetUnicode(ascii, 0); all validation happens in the overloads it forwards to.
  public string GetUnicode(string ascii) => GetUnicode(ascii, 0);
  // Returns the Unicode form of the substring that starts at index and runs to the
  // end of the string.
  // Throws ArgumentNullException when ascii is null; range checks on index are
  // performed by the three-argument overload.
  public string GetUnicode(string ascii, int index)
  {
      if (ascii == null)
      {
          throw new ArgumentNullException(nameof(ascii));
      }

      int remaining = ascii.Length - index;
      return GetUnicode(ascii, index, remaining);
  }
  // Returns the Unicode form of the count characters of ascii starting at index.
  //
  // Throws:
  //   ArgumentNullException       - ascii is null.
  //   ArgumentOutOfRangeException - index or count is negative, index is past the end of the
  //                                 string, or index + count runs past the end of the string.
  //   ArgumentException           - the last selected character is U+0000.
  //
  // In invariant globalization mode the managed implementation (GetUnicodeInvariant) is used;
  // otherwise the work is handed to GetUnicodeCore, which is defined outside this part of the file.
  public string GetUnicode(string ascii, int index, int count)
  {
      if (ascii == null)
      {
          throw new ArgumentNullException(nameof(ascii));
      }

      // index is reported first when both arguments are negative.
      if (index < 0)
      {
          throw new ArgumentOutOfRangeException(nameof(index), SR.ArgumentOutOfRange_NeedNonNegNum);
      }

      if (count < 0)
      {
          throw new ArgumentOutOfRangeException(nameof(count), SR.ArgumentOutOfRange_NeedNonNegNum);
      }

      if (index > ascii.Length)
      {
          throw new ArgumentOutOfRangeException(nameof(index), SR.ArgumentOutOfRange_Index);
      }

      if (index > ascii.Length - count)
      {
          throw new ArgumentOutOfRangeException(nameof(ascii), SR.ArgumentOutOfRange_IndexCountBuffer);
      }

      // Explicitly null-terminated input is a case where .NET and Win32 intentionally differ:
      // the .NET APIs throw ArgumentException on a terminating null (as they did in v4.0 and
      // earlier), whereas the Win32 APIs only fail on an embedded null.
      bool endsWithNull = count > 0 && ascii[index + count - 1] == '\0';
      if (endsWithNull)
      {
          throw new ArgumentException(SR.Argument_IdnBadPunycode, nameof(ascii));
      }

      if (GlobalizationMode.Invariant)
      {
          return GetUnicodeInvariant(ascii, index, count);
      }

      unsafe
      {
          // Pin the string so the core implementation can work on a raw pointer to the slice.
          fixed (char* pAscii = ascii)
          {
              char* pSlice = pAscii + index;
              return GetUnicodeCore(ascii, pSlice, count);
          }
      }
  }
  // Two mappings are equal when the other object is an IdnMapping and both option
  // flags (AllowUnassigned, UseStd3AsciiRules) have the same values.
  public override bool Equals(object obj)
  {
      if (!(obj is IdnMapping other))
      {
          return false;
      }

      if (_allowUnassigned != other._allowUnassigned)
      {
          return false;
      }

      return _useStd3AsciiRules == other._useStd3AsciiRules;
  }
  // Hash derived solely from the two option flags, so it agrees with Equals.
  // Each of the four flag combinations yields a distinct value (1100, 1200, 2100, 2200).
  public override int GetHashCode()
  {
      int unassignedPart = _allowUnassigned ? 100 : 200;
      int std3Part = _useStd3AsciiRules ? 1000 : 2000;
      return unassignedPart + std3Part;
  }
  // Picks the string to hand back to the caller after a conversion.
  // When the converted slice spans the entire original string and the conversion produced
  // exactly the same characters, the original instance is returned so no new string is
  // allocated; otherwise a new string is built from the output buffer.
  [MethodImpl(MethodImplOptions.AggressiveInlining)]
  private static unsafe string GetStringForOutput(string originalString, char* input, int inputLength, char* output, int outputLength)
  {
      if (originalString.Length == inputLength)
      {
          ReadOnlySpan<char> before = new ReadOnlySpan<char>(input, inputLength);
          ReadOnlySpan<char> after = new ReadOnlySpan<char>(output, outputLength);
          if (before.SequenceEqual(after))
          {
              return originalString;
          }
      }

      return new string(output, 0, outputLength);
  }
  //
  // Invariant implementation
  //

  // Separator between the literal (basic) part of an encoded label and its encoded digits.
  private const char c_delimiter = '-';

  // Prefix that marks a label as Punycode ("ACE") encoded.
  private const string c_strAcePrefix = "xn--";

  private const int c_labelLimit = 63; // Not including dots
  private const int c_defaultNameLimit = 255; // Including dots

  // Punycode bootstring parameters (RFC 3492): initial_n, base, tmin, tmax, skew, damp
  // and initial_bias.
  private const int c_initialN = 0x80;

  // Upper bound used by the overflow guards in the encoder/decoder.
  // NOTE(review): this is 0x7FFFFFF (seven hex digits), not int.MaxValue (0x7FFFFFFF).
  // The overflow checks in PunycodeDecode are expressed against this value, so changing
  // it would change which inputs are rejected - confirm before touching.
  private const int c_maxint = 0x7ffffff;
  private const int c_initialBias = 72;
  private const int c_punycodeBase = 36;
  private const int c_tmin = 1;
  private const int c_tmax = 26;
  private const int c_skew = 38;
  private const int c_damp = 700;

  // Legal "dot" separators (i.e: . in www.microsoft.com)
  // readonly: this table is shared by every caller and must never be reassigned.
  private static readonly char[] c_Dots = { '.', '\u3002', '\uFF0E', '\uFF61' };
  // Managed GetAscii implementation used in invariant globalization mode.
  // Validates the selected slice and returns it unchanged when it is pure ASCII;
  // otherwise Punycode-encodes it.
  private string GetAsciiInvariant(string unicode, int index, int count)
  {
      // Work on just the requested slice.
      bool coversWholeString = index <= 0 && count >= unicode.Length;
      if (!coversWholeString)
      {
          unicode = unicode.Substring(index, count);
      }

      // A pure-ASCII name needs no encoding and is returned as is.
      bool asciiOnly = ValidateStd3AndAscii(unicode, UseStd3AsciiRules, true);
      if (asciiOnly)
      {
          return unicode;
      }

      // The check above may have bailed out (returned false) before reaching the end of
      // the string, so the trailing character still has to be checked for control
      // characters / null termination here.
      Debug.Assert(count >= 1, "[IdnMapping.GetAscii] Expected 0 length strings to fail before now.");
      int lastIndex = unicode.Length - 1;
      if (unicode[lastIndex] <= 0x1f)
      {
          throw new ArgumentException(SR.Format(SR.Argument_InvalidCharSequence, lastIndex), nameof(unicode));
      }

      // For the same reason the Std3 rules have to be re-run over the whole string,
      // this time without stopping at the first non-ASCII character.
      if (UseStd3AsciiRules)
      {
          ValidateStd3AndAscii(unicode, true, false);
      }

      return PunycodeEncode(unicode);
  }
  // Validates a name and reports whether it is ASCII-only.
  //
  //   bUseStd3    - also apply the Std3 character rules (ValidateStd3) to every character.
  //   bCheckAscii - return false as soon as a character >= 0x7F is seen (the rest of the
  //                 string is then NOT validated).
  //
  // Returns true when the whole string was scanned without hitting the bCheckAscii exit.
  // Throws ArgumentException for an empty string, a control character (<= 0x1F), an empty
  // label, a label longer than c_labelLimit, or a name longer than the name limit; Std3
  // violations are thrown by ValidateStd3.
  private static bool ValidateStd3AndAscii(string unicode, bool bUseStd3, bool bCheckAscii)
  {
      int length = unicode.Length;

      // If its empty, then its too small
      if (length == 0)
      {
          throw new ArgumentException(SR.Argument_IdnBadLabelSize, nameof(unicode));
      }

      int previousDot = -1;
      for (int pos = 0; pos < length; pos++)
      {
          char current = unicode[pos];

          // Control characters are never allowed (0x7F is left to the IDN tables, but
          // those do not catch a \0 at the end).
          if (current <= 0x1f)
          {
              throw new ArgumentException(SR.Format(SR.Argument_InvalidCharSequence, pos), nameof(unicode));
          }

          // Non-ASCII (or DEL): report "not ASCII" and stop scanning.
          if (bCheckAscii && current >= 0x7f)
          {
              return false;
          }

          bool isFirstInLabel = pos == previousDot + 1;

          if (!IsDot(current))
          {
              // Ordinary character: only the Std3 rules apply.
              if (bUseStd3)
              {
                  ValidateStd3(current, isFirstInLabel);
              }

              continue;
          }

          // A dot that starts a label means the label is empty (leading dot or two dots in a row).
          if (isFirstInLabel)
          {
              throw new ArgumentException(SR.Argument_IdnBadLabelSize, nameof(unicode));
          }

          // Too many characters since the previous dot.
          if (pos - previousDot > c_labelLimit + 1)
          {
              throw new ArgumentException(SR.Argument_IdnBadLabelSize, nameof(unicode));
          }

          // Under Std3 the character just before a dot can't be '-'.
          if (bUseStd3 && pos > 0)
          {
              ValidateStd3(unicode[pos - 1], true);
          }

          previousDot = pos;
      }

      // A name without any dot is a single label and must fit the label limit.
      if (previousDot == -1 && length > c_labelLimit)
      {
          throw new ArgumentException(SR.Argument_IdnBadLabelSize, nameof(unicode));
      }

      // Whole-name limit: one character shorter when the name has no trailing dot.
      char lastChar = unicode[length - 1];
      bool endsWithDot = IsDot(lastChar);
      int nameLimit = c_defaultNameLimit - (endsWithDot ? 0 : 1);
      if (length > nameLimit)
      {
          throw new ArgumentException(SR.Format(SR.Argument_IdnBadNameSize,
              nameLimit), nameof(unicode));
      }

      // Without a trailing dot the final character ends a label, so it can't be '-' either.
      if (bUseStd3 && !endsWithDot)
      {
          ValidateStd3(lastChar, true);
      }

      return true;
  }
  // PunycodeEncode() converts a Unicode name to its ASCII-compatible form using the
  // Punycode algorithm (RFC 3492).
  //
  // The input is a UTF-16 string that may contain several labels separated by any of the
  // characters in c_Dots. Each label is processed independently:
  //   - a label made up only of basic (ASCII) code points is emitted through EncodeBasic
  //     with no prefix;
  //   - any other label is emitted as "xn--" + its basic code points + '-' (only when there
  //     was at least one basic code point) + the Punycode digits for the remaining code points.
  // Labels are always joined with '.' in the output, whichever separator the input used.
  // A single trailing separator is accepted and is not copied to the output.
  //
  // Throws ArgumentException when:
  //   - the string is empty, or a label other than the trailing one is empty;
  //   - a label mixes right-to-left and left-to-right characters, or starts with a
  //     right-to-left character but does not end with one;
  //   - a label that needs encoding already starts with "xn--" (any casing);
  //   - an output label is longer than c_labelLimit, or the whole output is longer than
  //     the name limit.
  //
  // NOTE(review): overflow of delta is only guarded by Debug.Assert, i.e. not in release builds.
  private static string PunycodeEncode(string unicode)
  {
      // 0 length strings aren't allowed
      if (unicode.Length == 0)
          throw new ArgumentException(SR.Argument_IdnBadLabelSize, nameof(unicode));

      StringBuilder output = new StringBuilder(unicode.Length);
      int iNextDot = 0;              // index of the separator that ends the current label (or unicode.Length)
      int iAfterLastDot = 0;         // index in unicode where the current label starts
      int iOutputAfterLastDot = 0;   // index in output where the current label starts

      // One iteration per label
      while (iNextDot < unicode.Length)
      {
          // Find end of this segment
          iNextDot = unicode.IndexOfAny(c_Dots, iAfterLastDot);
          Debug.Assert(iNextDot <= unicode.Length, "[IdnMapping.punycode_encode]IndexOfAny is broken");
          if (iNextDot < 0)
              iNextDot = unicode.Length;

          // Only allowed to have empty . section at end (www.microsoft.com.)
          if (iNextDot == iAfterLastDot)
          {
              // Only allowed to have empty sections as trailing .
              if (iNextDot != unicode.Length)
                  throw new ArgumentException(SR.Argument_IdnBadLabelSize, nameof(unicode));

              // Last dot, stop
              break;
          }

          // Optimistically write the ACE prefix; it is removed again below if the label
          // turns out to contain only basic code points.
          output.Append(c_strAcePrefix);

          // Everything resets every segment.
          bool bRightToLeft = false;

          // Check for RTL. If right-to-left, then 1st & last chars must be RTL
          BidiCategory eBidi = CharUnicodeInfo.GetBidiCategory(unicode, iAfterLastDot);
          if (eBidi == BidiCategory.RightToLeft || eBidi == BidiCategory.RightToLeftArabic)
          {
              // It has to be right to left.
              bRightToLeft = true;

              // Check last char (step back onto the high surrogate if the label ends in a pair)
              int iTest = iNextDot - 1;
              if (char.IsLowSurrogate(unicode, iTest))
              {
                  iTest--;
              }

              eBidi = CharUnicodeInfo.GetBidiCategory(unicode, iTest);
              if (eBidi != BidiCategory.RightToLeft && eBidi != BidiCategory.RightToLeftArabic)
              {
                  // Oops, last wasn't RTL, last should be RTL if first is RTL
                  throw new ArgumentException(SR.Argument_IdnBadBidi, nameof(unicode));
              }
          }

          // Handle the basic code points
          int basicCount;
          int numProcessed = 0; // Num code points that have been processed so far (this segment)
          for (basicCount = iAfterLastDot; basicCount < iNextDot; basicCount++)
          {
              // Can't be lonely surrogate because it would've thrown in normalization
              Debug.Assert(char.IsLowSurrogate(unicode, basicCount) == false, "[IdnMapping.punycode_encode]Unexpected low surrogate");

              // Double check our bidi rules
              BidiCategory testBidi = CharUnicodeInfo.GetBidiCategory(unicode, basicCount);

              // If we're RTL, we can't have LTR chars
              if (bRightToLeft && testBidi == BidiCategory.LeftToRight)
              {
                  // Oops, throw error
                  throw new ArgumentException(SR.Argument_IdnBadBidi, nameof(unicode));
              }

              // If we're not RTL we can't have RTL chars
              if (!bRightToLeft && (testBidi == BidiCategory.RightToLeft || testBidi == BidiCategory.RightToLeftArabic))
              {
                  // Oops, throw error
                  throw new ArgumentException(SR.Argument_IdnBadBidi, nameof(unicode));
              }

              // If its basic then add it
              if (Basic(unicode[basicCount]))
              {
                  output.Append(EncodeBasic(unicode[basicCount]));
                  numProcessed++;
              }
              // If its a surrogate, skip the next since our bidi category tester doesn't handle it.
              else if (char.IsSurrogatePair(unicode, basicCount))
                  basicCount++;
          }

          int numBasicCodePoints = numProcessed; // number of basic code points

          // Stop if we ONLY had basic code points
          if (numBasicCodePoints == iNextDot - iAfterLastDot)
          {
              // Get rid of xn-- and this segments done
              output.Remove(iOutputAfterLastDot, c_strAcePrefix.Length);
          }
          else
          {
              // If it has some non-basic code points the input cannot start with xn--.
              // Compared in place (same overload PunycodeDecode uses) so no temporary
              // substring is allocated for every encoded label.
              if (unicode.Length - iAfterLastDot >= c_strAcePrefix.Length &&
                  string.Compare(unicode, iAfterLastDot, c_strAcePrefix, 0, c_strAcePrefix.Length,
                      StringComparison.OrdinalIgnoreCase) == 0)
                  throw new ArgumentException(SR.Argument_IdnBadPunycode, nameof(unicode));

              // Need to do ACE encoding
              int numSurrogatePairs = 0; // number of surrogate pairs so far

              // Add a delimiter (-) if we had any basic code points (between basic and encoded pieces)
              if (numBasicCodePoints > 0)
              {
                  output.Append(c_delimiter);
              }

              // Initialize the state
              int n = c_initialN;
              int delta = 0;
              int bias = c_initialBias;

              // Main loop. numProcessed counts UTF-16 code units (a surrogate pair adds 2),
              // so it is compared against the label length in code units.
              while (numProcessed < (iNextDot - iAfterLastDot))
              {
                  /* All non-basic code points < n have been */
                  /* handled already. Find the next larger one: */
                  int j;
                  int m;
                  int test = 0;
                  for (m = c_maxint, j = iAfterLastDot;
                       j < iNextDot;
                       j += IsSupplementary(test) ? 2 : 1)
                  {
                      test = char.ConvertToUtf32(unicode, j);
                      if (test >= n && test < m) m = test;
                  }

                  /* Increase delta enough to advance the decoder's */
                  /* <n,i> state to <m,0>, but guard against overflow: */
                  delta += (int)((m - n) * ((numProcessed - numSurrogatePairs) + 1));
                  Debug.Assert(delta > 0, "[IdnMapping.cs]1 punycode_encode - delta overflowed int");
                  n = m;

                  for (j = iAfterLastDot; j < iNextDot; j += IsSupplementary(test) ? 2 : 1)
                  {
                      // Make sure we're aware of surrogates
                      test = char.ConvertToUtf32(unicode, j);

                      // Adjust for character position (only the chars in our string already, some
                      // haven't been processed.
                      if (test < n)
                      {
                          delta++;
                          Debug.Assert(delta > 0, "[IdnMapping.cs]2 punycode_encode - delta overflowed int");
                      }

                      if (test == n)
                      {
                          // Represent delta as a generalized variable-length integer:
                          int q, k;
                          for (q = delta, k = c_punycodeBase; ; k += c_punycodeBase)
                          {
                              int t = k <= bias ? c_tmin : k >= bias + c_tmax ? c_tmax : k - bias;
                              if (q < t) break;
                              Debug.Assert(c_punycodeBase != t, "[IdnMapping.punycode_encode]Expected c_punycodeBase (36) to be != t");
                              output.Append(EncodeDigit(t + (q - t) % (c_punycodeBase - t)));
                              q = (q - t) / (c_punycodeBase - t);
                          }

                          output.Append(EncodeDigit(q));
                          bias = Adapt(delta, (numProcessed - numSurrogatePairs) + 1, numProcessed == numBasicCodePoints);
                          delta = 0;
                          numProcessed++;

                          // A supplementary code point occupies two UTF-16 code units
                          if (IsSupplementary(m))
                          {
                              numProcessed++;
                              numSurrogatePairs++;
                          }
                      }
                  }

                  ++delta;
                  ++n;
                  Debug.Assert(delta > 0, "[IdnMapping.cs]3 punycode_encode - delta overflowed int");
              }
          }

          // Make sure its not too big
          if (output.Length - iOutputAfterLastDot > c_labelLimit)
              throw new ArgumentException(SR.Argument_IdnBadLabelSize, nameof(unicode));

          // Done with this segment, add dot if necessary
          if (iNextDot != unicode.Length)
              output.Append('.');

          iAfterLastDot = iNextDot + 1;
          iOutputAfterLastDot = output.Length;
      }

      // Throw if we're too long (limit is one shorter when the input has no trailing dot)
      if (output.Length > c_defaultNameLimit - (IsDot(unicode[unicode.Length-1]) ? 0 : 1))
          throw new ArgumentException(SR.Format(SR.Argument_IdnBadNameSize,
              c_defaultNameLimit - (IsDot(unicode[unicode.Length-1]) ? 0 : 1)), nameof(unicode));

      // Return our output string
      return output.ToString();
  }
  // Reports whether c is one of the four characters accepted as a label separator:
  //   U+002E (full stop), U+3002 (ideographic full stop),
  //   U+FF0E (fullwidth full stop), U+FF61 (halfwidth ideographic full stop).
  // Note: IDNA normalization gets rid of the non-ASCII dots, but the test for a trailing
  // dot happens before normalization.
  private static bool IsDot(char c)
  {
      switch (c)
      {
          case '.':
          case '\u3002':
          case '\uFF0E':
          case '\uFF61':
              return true;

          default:
              return false;
      }
  }
  // True when the code point lies above U+FFFF, i.e. it takes a surrogate pair in UTF-16.
  private static bool IsSupplementary(int cTest) => cTest > 0xFFFF;
  // True when the code point is a Punycode "basic" code point, i.e. inside the ASCII range
  // (U+0000..U+007F).
  private static bool Basic(uint cp) => cp <= 0x7F;
  // Enforces the Std3 ASCII rules for a single character.
  // Within the ASCII range only letters, digits, '.' and '-' are accepted, and '-' is
  // rejected when bNextToDot is set (the character sits at the start or end of a label).
  // Characters above U+007F always pass. Throws ArgumentException on a violation.
  private static void ValidateStd3(char c, bool bNextToDot)
  {
      bool isLetter = (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z');
      bool isDigit = c >= '0' && c <= '9';
      bool isAllowedHyphen = c == '-' && !bNextToDot;

      bool permitted = c > (char)0x7F || isLetter || isDigit || c == '.' || isAllowedHyphen;
      if (permitted)
      {
          return;
      }

      throw new ArgumentException(SR.Format(SR.Argument_IdnBadStd3, c), nameof(c));
  }
  // Managed GetUnicode implementation used in invariant globalization mode.
  // Decodes the selected slice from Punycode and verifies that re-encoding the result
  // with GetAscii reproduces the input (ignoring case); throws ArgumentException otherwise.
  private string GetUnicodeInvariant(string ascii, int index, int count)
  {
      // Work on just the requested slice.
      bool coversWholeString = index <= 0 && count >= ascii.Length;
      if (!coversWholeString)
      {
          ascii = ascii.Substring(index, count);
      }

      string decoded = PunycodeDecode(ascii);

      // Output name MUST obey IDNA rules & round trip (casing differences are allowed)
      string roundTripped = GetAscii(decoded);
      bool roundTrips = ascii.Equals(roundTripped, StringComparison.OrdinalIgnoreCase);
      if (!roundTrips)
      {
          throw new ArgumentException(SR.Argument_IdnIllegalName, nameof(ascii));
      }

      return decoded;
  }
  // PunycodeDecode() converts an ASCII-compatible name back to Unicode using the Punycode
  // algorithm (RFC 3492).
  //
  // The input may contain several labels separated by '.'. Each label is processed
  // independently:
  //   - a label that does not start with "xn--" (any casing) is copied to the output as is;
  //   - otherwise the basic code points before the last '-' are copied lower-cased, and the
  //     digits after it are decoded into code points that are inserted into the output.
  // A single trailing '.' is accepted and is not copied to the output.
  //
  // Throws ArgumentException when:
  //   - the string is empty, longer than the name limit, or a label other than the trailing
  //     one is empty or longer than c_labelLimit;
  //   - an encoded label ends in '-', has a non-ASCII character in its basic part, runs out
  //     of digits, overflows, or decodes to an invalid code point (surrogate or > U+10FFFF);
  //   - a decoded label mixes right-to-left and left-to-right characters, or starts with a
  //     right-to-left character but does not end with one;
  //   - the decoded name is longer than the name limit.
  private static string PunycodeDecode(string ascii)
  {
      // 0 length strings aren't allowed
      if (ascii.Length == 0)
          throw new ArgumentException(SR.Argument_IdnBadLabelSize, nameof(ascii));

      // Throw if we're too long (limit is one shorter when there is no trailing dot)
      if (ascii.Length > c_defaultNameLimit - (IsDot(ascii[ascii.Length-1]) ? 0 : 1))
          throw new ArgumentException(SR.Format(SR.Argument_IdnBadNameSize,
              c_defaultNameLimit - (IsDot(ascii[ascii.Length-1]) ? 0 : 1)), nameof(ascii));

      // output stringbuilder
      StringBuilder output = new StringBuilder(ascii.Length);

      // Dot searching
      int iNextDot = 0;              // index of the '.' that ends the current label (or ascii.Length)
      int iAfterLastDot = 0;         // index in ascii where the current label starts
      int iOutputAfterLastDot = 0;   // index in output where the current label starts

      // One iteration per label
      while (iNextDot < ascii.Length)
      {
          // Find end of this segment
          iNextDot = ascii.IndexOf('.', iAfterLastDot);
          if (iNextDot < 0 || iNextDot > ascii.Length)
              iNextDot = ascii.Length;

          // Only allowed to have empty . section at end (www.microsoft.com.)
          if (iNextDot == iAfterLastDot)
          {
              // Only allowed to have empty sections as trailing .
              if (iNextDot != ascii.Length)
                  throw new ArgumentException(SR.Argument_IdnBadLabelSize, nameof(ascii));

              // Last dot, stop
              break;
          }

          // In either case it can't be bigger than segment size
          if (iNextDot - iAfterLastDot > c_labelLimit)
              throw new ArgumentException(SR.Argument_IdnBadLabelSize, nameof(ascii));

          // See if this section's ASCII or ACE
          if (ascii.Length < c_strAcePrefix.Length + iAfterLastDot ||
              string.Compare(ascii, iAfterLastDot, c_strAcePrefix, 0, c_strAcePrefix.Length, StringComparison.OrdinalIgnoreCase) != 0)
          {
              // Its ASCII, copy it
              output.Append(ascii, iAfterLastDot, iNextDot - iAfterLastDot);
          }
          else
          {
              // Not ASCII, bump up iAfterLastDot to be after ACE Prefix
              iAfterLastDot += c_strAcePrefix.Length;

              // Find the last delimiter in this label (the search runs backwards from the end
              // of the label; a hit at or before iAfterLastDot means "no basic code points").
              int iTemp = ascii.LastIndexOf(c_delimiter, iNextDot - 1);

              // Trailing - not allowed
              if (iTemp == iNextDot - 1)
                  throw new ArgumentException(SR.Argument_IdnBadPunycode, nameof(ascii));

              int numBasicCodePoints;
              if (iTemp <= iAfterLastDot)
                  numBasicCodePoints = 0;
              else
              {
                  numBasicCodePoints = iTemp - iAfterLastDot;

                  // Copy all the basic code points, making sure they're all in the allowed range,
                  // and losing the casing for all of them.
                  for (int copyAscii = iAfterLastDot; copyAscii < iAfterLastDot + numBasicCodePoints; copyAscii++)
                  {
                      // Make sure we don't allow unicode in the ascii part
                      if (ascii[copyAscii] > 0x7f)
                          throw new ArgumentException(SR.Argument_IdnBadPunycode, nameof(ascii));

                      // When appending make sure they get lower cased
                      output.Append((char)(ascii[copyAscii] >= 'A' && ascii[copyAscii] <='Z' ? ascii[copyAscii] - 'A' + 'a' : ascii[copyAscii]));
                  }
              }

              // Get ready for main loop. Start at beginning if we didn't have any
              // basic code points, otherwise start after the -.
              // asciiIndex will be next character to read from ascii
              int asciiIndex = iAfterLastDot + (numBasicCodePoints > 0 ? numBasicCodePoints + 1 : 0);

              // initialize our state
              int n = c_initialN;
              int bias = c_initialBias;
              int i = 0;

              int w, k;

              // no Supplementary characters yet
              int numSurrogatePairs = 0;

              // Main loop, read rest of ascii
              while (asciiIndex < iNextDot)
              {
                  /* Decode a generalized variable-length integer into delta, */
                  /* which gets added to i. The overflow checking is easier */
                  /* if we increase i as we go, then subtract off its starting */
                  /* value at the end to obtain delta. */
                  int oldi = i;

                  for (w = 1, k = c_punycodeBase; ; k += c_punycodeBase)
                  {
                      // Check to make sure we aren't overrunning our ascii string
                      if (asciiIndex >= iNextDot)
                          throw new ArgumentException(SR.Argument_IdnBadPunycode, nameof(ascii));

                      // decode the digit from the next char
                      int digit = DecodeDigit(ascii[asciiIndex++]);

                      Debug.Assert(w > 0, "[IdnMapping.punycode_decode]Expected w > 0");
                      if (digit > (c_maxint - i) / w)
                          throw new ArgumentException(SR.Argument_IdnBadPunycode, nameof(ascii));

                      i += (int)(digit * w);
                      int t = k <= bias ? c_tmin : k >= bias + c_tmax ? c_tmax : k - bias;
                      if (digit < t)
                          break;
                      Debug.Assert(c_punycodeBase != t, "[IdnMapping.punycode_decode]Expected t != c_punycodeBase (36)");
                      if (w > c_maxint / (c_punycodeBase - t))
                          throw new ArgumentException(SR.Argument_IdnBadPunycode, nameof(ascii));
                      w *= (c_punycodeBase - t);
                  }

                  // Number of code points this label will hold once the one being decoded is
                  // inserted (a surrogate pair counts as a single code point). output is not
                  // modified until the Insert below, so this stays valid for all uses here.
                  int numCodePointsAfterInsert = (output.Length - iOutputAfterLastDot - numSurrogatePairs) + 1;

                  bias = Adapt(i - oldi, numCodePointsAfterInsert, oldi == 0);

                  /* i was supposed to wrap around from output.Length to 0, */
                  /* incrementing n each time, so we'll fix that now: */
                  Debug.Assert(numCodePointsAfterInsert > 0,
                      "[IdnMapping.punycode_decode]Expected to have added > 0 characters this segment");
                  if (i / numCodePointsAfterInsert > c_maxint - n)
                      throw new ArgumentException(SR.Argument_IdnBadPunycode, nameof(ascii));
                  n += (int)(i / numCodePointsAfterInsert);
                  i %= numCodePointsAfterInsert;

                  // Make sure n is legal (inside the Unicode range and not a surrogate)
                  if ((n < 0 || n > 0x10ffff) || (n >= 0xD800 && n <= 0xDFFF))
                      throw new ArgumentException(SR.Argument_IdnBadPunycode, nameof(ascii));

                  // insert n at position i of the output: Really tricky if we have surrogates
                  int iUseInsertLocation;
                  string strTemp = char.ConvertFromUtf32(n);

                  // If we have supplementary characters
                  if (numSurrogatePairs > 0)
                  {
                      // Hard way, we have supplementary characters: walk i code points forward,
                      // stepping over both halves of every surrogate pair on the way.
                      int iCount;
                      for (iCount = i, iUseInsertLocation = iOutputAfterLastDot; iCount > 0; iCount--, iUseInsertLocation++)
                      {
                          // If its a surrogate, we have to go one more
                          if (iUseInsertLocation >= output.Length)
                              throw new ArgumentException(SR.Argument_IdnBadPunycode, nameof(ascii));
                          if (char.IsSurrogate(output[iUseInsertLocation]))
                              iUseInsertLocation++;
                      }
                  }
                  else
                  {
                      // No Supplementary chars yet, just add i
                      iUseInsertLocation = iOutputAfterLastDot + i;
                  }

                  // Insert it
                  output.Insert(iUseInsertLocation, strTemp);

                  // If it was a surrogate increment our counter
                  if (IsSupplementary(n))
                      numSurrogatePairs++;

                  // Index gets updated
                  i++;
              }

              // Do BIDI testing
              bool bRightToLeft = false;

              // Check for RTL. If right-to-left, then 1st & last chars must be RTL
              BidiCategory eBidi = CharUnicodeInfo.GetBidiCategory(output, iOutputAfterLastDot);
              if (eBidi == BidiCategory.RightToLeft || eBidi == BidiCategory.RightToLeftArabic)
              {
                  // It has to be right to left.
                  bRightToLeft = true;
              }

              // Check the rest of them to make sure RTL/LTR is consistent
              for (int iTest = iOutputAfterLastDot; iTest < output.Length; iTest++)
              {
                  // This might happen if we run into a pair
                  if (char.IsLowSurrogate(output[iTest]))
                      continue;

                  // Check to see if its LTR
                  eBidi = CharUnicodeInfo.GetBidiCategory(output, iTest);
                  if ((bRightToLeft && eBidi == BidiCategory.LeftToRight) ||
                      (!bRightToLeft && (eBidi == BidiCategory.RightToLeft || eBidi == BidiCategory.RightToLeftArabic)))
                      throw new ArgumentException(SR.Argument_IdnBadBidi, nameof(ascii));
              }

              // Its also a requirement that the last one be RTL if 1st is RTL
              // (eBidi now holds the category of the last code point examined above)
              if (bRightToLeft && eBidi != BidiCategory.RightToLeft && eBidi != BidiCategory.RightToLeftArabic)
              {
                  // Oops, last wasn't RTL, last should be RTL if first is RTL
                  throw new ArgumentException(SR.Argument_IdnBadBidi, nameof(ascii));
              }
          }

          // The input label length was already checked against c_labelLimit above, before
          // iAfterLastDot was (possibly) advanced past the ACE prefix, so re-checking
          // iNextDot - iAfterLastDot here could never fail and has been dropped.

          // Done with this segment, add dot if necessary
          if (iNextDot != ascii.Length)
              output.Append('.');

          iAfterLastDot = iNextDot + 1;
          iOutputAfterLastDot = output.Length;
      }

      // Throw if we're too long
      if (output.Length > c_defaultNameLimit - (IsDot(output[output.Length-1]) ? 0 : 1))
          throw new ArgumentException(SR.Format(SR.Argument_IdnBadNameSize, c_defaultNameLimit - (IsDot(output[output.Length-1]) ? 0 : 1)), nameof(ascii));

      // Return our output string
      return output.ToString();
  }
  // DecodeDigit(cp) converts a basic code point into the integer value it
  // stands for when representing punycode integers:
  //   'a'..'z' and 'A'..'Z'  ->  0..25  (case is ignored)
  //   '0'..'9'               ->  26..35
  // Any other character is rejected with an ArgumentException; this method
  // never returns a negative value.
  // NOTE(review): the exception names this helper's own parameter (cp), while
  // the decode code elsewhere in this file reports nameof(ascii) - confirm
  // which parameter name callers are expected to see.
  private static int DecodeDigit(char cp)
  {
      // Letters, in either case, carry the low values.
      if ('a' <= cp && cp <= 'z')
      {
          return cp - 'a';
      }

      if ('A' <= cp && cp <= 'Z')
      {
          return cp - 'A';
      }

      // Decimal digits follow directly after the 26 letters.
      if ('0' <= cp && cp <= '9')
      {
          return 26 + (cp - '0');
      }

      // Not one of 0-9, A-Z or a-z: illegal in a punycode integer.
      throw new ArgumentException(SR.Argument_IdnBadPunycode, nameof(cp));
  }
  // Adapt(delta, numpoints, firsttime) computes the next bias value from the
  // delta that was just processed (the bias adaptation step of punycode).
  //   delta     - the delta to adapt from
  //   numpoints - divisor used for the proportional increase of delta;
  //               asserted to be non-zero
  //   firsttime - when true the delta is first divided by c_damp, otherwise
  //               it is halved
  // Returns the resulting bias as an int.
  private static int Adapt(int delta, int numpoints, bool firsttime)
  {
      // Initial scaling differs between the first adaptation and later ones.
      if (firsttime)
      {
          delta /= c_damp;
      }
      else
      {
          delta /= 2;
      }

      Debug.Assert(numpoints != 0, "[IdnMapping.adapt]Expected non-zero numpoints.");
      delta += delta / numpoints;

      // Shrink delta until it drops to the threshold, accumulating one
      // c_punycodeBase per division performed.
      uint bias = 0;
      while (delta > ((c_punycodeBase - c_tmin) * c_tmax) / 2)
      {
          delta /= c_punycodeBase - c_tmin;
          bias += c_punycodeBase;
      }

      Debug.Assert(delta + c_skew != 0, "[IdnMapping.adapt]Expected non-zero delta+skew.");
      return (int)(bias + (c_punycodeBase - c_tmin + 1) * delta / (delta + c_skew));
  }
  /* EncodeBasic(bcp) returns the lowercase form of a basic code point. */
  /* A code point that HasUpperCaseFlag reports as upper case ('A'..'Z') */
  /* is shifted by ('a' - 'A'); every other code point is returned */
  /* unchanged. */
  static char EncodeBasic(char bcp)
  {
      return HasUpperCaseFlag(bcp)
          ? (char)(bcp + ('a' - 'A'))
          : bcp;
  }
  // Tells whether a punycode code point carries the upper-case flag, i.e.
  // whether it lies in the ASCII range 'A'..'Z' (both ends inclusive).
  private static bool HasUpperCaseFlag(char punychar) =>
      !(punychar < 'A' || punychar > 'Z');
  /* EncodeDigit(d) returns the basic code point whose value (when used */
  /* for representing integers) is d. d is expected to be in the range */
  /* 0 to c_punycodeBase-1, which is asserted. Values 0-25 map to */
  /* 'a'-'z' (always the lowercase form) and values 26-35 map to '0'-'9'. */
  private static char EncodeDigit(int d)
  {
      Debug.Assert(d >= 0 && d < c_punycodeBase, "[IdnMapping.encode_digit]Expected 0 <= d < punycodeBase");

      return d <= 25
          ? (char)('a' + d)           // 0-25 map to a-z
          : (char)('0' + (d - 26));   // 26-35 map to ASCII 0-9
  }
  751. }
  752. }