UTF8Encoding.cs 26 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913
  1. /*
  2. * UTF8Encoding.cs - Implementation of the "System.Text.UTF8Encoding" class.
  3. *
  4. * Copyright (c) 2001, 2002 Southern Storm Software, Pty Ltd
  5. * Copyright (C) 2004 Novell, Inc (http://www.novell.com)
  6. *
  7. * Permission is hereby granted, free of charge, to any person obtaining
  8. * a copy of this software and associated documentation files (the "Software"),
  9. * to deal in the Software without restriction, including without limitation
  10. * the rights to use, copy, modify, merge, publish, distribute, sublicense,
  11. * and/or sell copies of the Software, and to permit persons to whom the
  12. * Software is furnished to do so, subject to the following conditions:
  13. *
  14. * The above copyright notice and this permission notice shall be included
  15. * in all copies or substantial portions of the Software.
  16. *
  17. * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
  18. * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  19. * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
  20. * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
  21. * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
  22. * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
  23. * OTHER DEALINGS IN THE SOFTWARE.
  24. */
  25. namespace System.Text
  26. {
  27. using System;
  28. [Serializable]
  29. [MonoTODO ("Fix serialization compatibility with MS.NET")]
  30. #if NET_2_0
  31. [MonoTODO ("EncoderFallback is not handled")]
  32. #endif
  33. public class UTF8Encoding : Encoding
  34. {
// Magic number used by Windows for UTF-8.
// It is the code page value handed to the base Encoding constructor.
internal const int UTF8_CODE_PAGE = 65001;
// Internal state.
// Set from the constructor's encoderShouldEmitUTF8Identifier argument;
// GetPreamble returns the three-byte identifier only when this is true.
private bool emitIdentifier;
#if !NET_2_0
// Pre-2.0 profile only: when true the decoding routines throw an
// ArgumentException on malformed input instead of skipping it.
private bool throwOnInvalid;
#endif
  42. // Constructors.
  43. public UTF8Encoding () : this (false, false) {}
  44. public UTF8Encoding (bool encoderShouldEmitUTF8Identifier)
  45. : this (encoderShouldEmitUTF8Identifier, false) {}
  46. public UTF8Encoding (bool encoderShouldEmitUTF8Identifier, bool throwOnInvalidBytes)
  47. : base (UTF8_CODE_PAGE)
  48. {
  49. emitIdentifier = encoderShouldEmitUTF8Identifier;
  50. #if NET_2_0
  51. if (throwOnInvalidBytes)
  52. DecoderFallback = new DecoderExceptionFallback ();
  53. else
  54. DecoderFallback = new DecoderReplacementFallback (String.Empty);
  55. #else
  56. throwOnInvalid = throwOnInvalidBytes;
  57. #endif
  58. web_name = body_name = header_name = "utf-8";
  59. encoding_name = "Unicode (UTF-8)";
  60. is_browser_save = true;
  61. is_browser_display = true;
  62. is_mail_news_display = true;
  63. windows_code_page = UnicodeEncoding.UNICODE_CODE_PAGE;
  64. }
  65. // Internal version of "GetByteCount" which can handle a rolling
  66. // state between multiple calls to this method.
  67. private static int InternalGetByteCount (char[] chars, int index, int count, uint leftOver, bool flush)
  68. {
  69. // Validate the parameters.
  70. if (chars == null) {
  71. throw new ArgumentNullException ("chars");
  72. }
  73. if (index < 0 || index > chars.Length) {
  74. throw new ArgumentOutOfRangeException ("index", _("ArgRange_Array"));
  75. }
  76. if (count < 0 || count > (chars.Length - index)) {
  77. throw new ArgumentOutOfRangeException ("count", _("ArgRange_Array"));
  78. }
  79. // Determine the lengths of all characters.
  80. char ch;
  81. int length = 0;
  82. uint pair = leftOver;
  83. while (count > 0) {
  84. ch = chars[index];
  85. if (pair == 0) {
  86. if (ch < '\u0080') {
  87. ++length;
  88. } else if (ch < '\u0800') {
  89. length += 2;
  90. } else if (ch >= '\uD800' && ch <= '\uDBFF') {
  91. // This is the start of a surrogate pair.
  92. pair = (uint)ch;
  93. } else {
  94. length += 3;
  95. }
  96. } else if (ch >= '\uDC00' && ch <= '\uDFFF') {
  97. // We have a surrogate pair.
  98. length += 4;
  99. pair = 0;
  100. } else {
  101. // We have a surrogate start followed by a
  102. // regular character. Technically, this is
  103. // invalid, but we have to do something.
  104. // We write out the surrogate start and then
  105. // re-visit the current character again.
  106. length += 3;
  107. pair = 0;
  108. continue;
  109. }
  110. ++index;
  111. --count;
  112. }
  113. if (flush && pair != 0) {
  114. // Flush the left-over surrogate pair start.
  115. length += 3;
  116. }
  117. // Return the final length to the caller.
  118. return length;
  119. }
  120. // Get the number of bytes needed to encode a character buffer.
  121. public override int GetByteCount (char[] chars, int index, int count)
  122. {
  123. return InternalGetByteCount (chars, index, count, 0, true);
  124. }
  125. // Convenience wrappers for "GetByteCount".
  126. public override int GetByteCount (String s)
  127. {
  128. // Validate the parameters.
  129. if (s == null) {
  130. throw new ArgumentNullException ("s");
  131. }
  132. // Determine the lengths of all characters.
  133. char ch;
  134. int index = 0;
  135. int count = s.Length;
  136. int length = 0;
  137. uint pair;
  138. while (count > 0) {
  139. ch = s[index++];
  140. if (ch < '\u0080') {
  141. ++length;
  142. } else if (ch < '\u0800') {
  143. length += 2;
  144. } else if (ch >= '\uD800' && ch <= '\uDBFF' && count > 1) {
  145. // This may be the start of a surrogate pair.
  146. pair = (uint)(s[index]);
  147. if (pair >= (uint)0xDC00 && pair <= (uint)0xDFFF) {
  148. length += 4;
  149. ++index;
  150. --count;
  151. } else {
  152. length += 3;
  153. }
  154. } else {
  155. length += 3;
  156. }
  157. --count;
  158. }
  159. // Return the final length to the caller.
  160. return length;
  161. }
  162. // Internal version of "GetBytes" which can handle a rolling
  163. // state between multiple calls to this method.
  164. private static int InternalGetBytes (char[] chars, int charIndex,
  165. int charCount, byte[] bytes,
  166. int byteIndex, ref uint leftOver,
  167. bool flush)
  168. {
  169. // Validate the parameters.
  170. if (chars == null) {
  171. throw new ArgumentNullException ("chars");
  172. }
  173. if (bytes == null) {
  174. throw new ArgumentNullException ("bytes");
  175. }
  176. if (charIndex < 0 || charIndex > chars.Length) {
  177. throw new ArgumentOutOfRangeException ("charIndex", _("ArgRange_Array"));
  178. }
  179. if (charCount < 0 || charCount > (chars.Length - charIndex)) {
  180. throw new ArgumentOutOfRangeException ("charCount", _("ArgRange_Array"));
  181. }
  182. if (byteIndex < 0 || byteIndex > bytes.Length) {
  183. throw new ArgumentOutOfRangeException ("byteIndex", _("ArgRange_Array"));
  184. }
  185. // Convert the characters into bytes.
  186. char ch;
  187. int length = bytes.Length;
  188. uint pair;
  189. uint left = leftOver;
  190. int posn = byteIndex;
  191. while (charCount > 0) {
  192. // Fetch the next UTF-16 character pair value.
  193. ch = chars[charIndex++];
  194. --charCount;
  195. if (left == 0) {
  196. if (ch >= '\uD800' && ch <= '\uDBFF') {
  197. // This is the start of a surrogate pair.
  198. left = (uint)ch;
  199. continue;
  200. } else {
  201. // This is a regular character.
  202. pair = (uint)ch;
  203. }
  204. } else if (ch >= '\uDC00' && ch <= '\uDFFF') {
  205. // We have a surrogate pair.
  206. pair = ((left - (uint)0xD800) << 10) +
  207. (((uint)ch) - (uint)0xDC00) +
  208. (uint)0x10000;
  209. left = 0;
  210. } else {
  211. // We have a surrogate start followed by a
  212. // regular character. Technically, this is
  213. // invalid, but we have to do something.
  214. // We write out the surrogate start and then
  215. // re-visit the current character again.
  216. pair = (uint)left;
  217. left = 0;
  218. --charIndex;
  219. ++charCount;
  220. }
  221. // Encode the character pair value.
  222. if (pair < (uint)0x0080) {
  223. if (posn >= length) {
  224. throw new ArgumentException (_("Arg_InsufficientSpace"), "bytes");
  225. }
  226. bytes[posn++] = (byte)pair;
  227. } else if (pair < (uint)0x0800) {
  228. if ((posn + 2) > length) {
  229. throw new ArgumentException (_("Arg_InsufficientSpace"), "bytes");
  230. }
  231. bytes[posn++] = (byte)(0xC0 | (pair >> 6));
  232. bytes[posn++] = (byte)(0x80 | (pair & 0x3F));
  233. } else if (pair < (uint)0x10000) {
  234. if ((posn + 3) > length) {
  235. throw new ArgumentException (_("Arg_InsufficientSpace"), "bytes");
  236. }
  237. bytes[posn++] = (byte)(0xE0 | (pair >> 12));
  238. bytes[posn++] = (byte)(0x80 | ((pair >> 6) & 0x3F));
  239. bytes[posn++] = (byte)(0x80 | (pair & 0x3F));
  240. } else {
  241. if ((posn + 4) > length) {
  242. throw new ArgumentException (_("Arg_InsufficientSpace"), "bytes");
  243. }
  244. bytes[posn++] = (byte)(0xF0 | (pair >> 18));
  245. bytes[posn++] = (byte)(0x80 | ((pair >> 12) & 0x3F));
  246. bytes[posn++] = (byte)(0x80 | ((pair >> 6) & 0x3F));
  247. bytes[posn++] = (byte)(0x80 | (pair & 0x3F));
  248. }
  249. }
  250. if (flush && left != 0) {
  251. // Flush the left-over surrogate pair start.
  252. if ((posn + 3) > length) {
  253. throw new ArgumentException (_("Arg_InsufficientSpace"), "bytes");
  254. }
  255. bytes[posn++] = (byte)(0xE0 | (left >> 12));
  256. bytes[posn++] = (byte)(0x80 | ((left >> 6) & 0x3F));
  257. bytes[posn++] = (byte)(0x80 | (left & 0x3F));
  258. left = 0;
  259. }
  260. leftOver = left;
  261. // Return the final count to the caller.
  262. return posn - byteIndex;
  263. }
  264. // Get the bytes that result from encoding a character buffer.
  265. public override int GetBytes (char[] chars, int charIndex, int charCount,
  266. byte[] bytes, int byteIndex)
  267. {
  268. uint leftOver = 0;
  269. return InternalGetBytes (chars, charIndex, charCount, bytes, byteIndex, ref leftOver, true);
  270. }
  271. // Convenience wrappers for "GetBytes".
  272. public override int GetBytes (String s, int charIndex, int charCount,
  273. byte[] bytes, int byteIndex)
  274. {
  275. // Validate the parameters.
  276. if (s == null) {
  277. throw new ArgumentNullException ("s");
  278. }
  279. if (bytes == null) {
  280. throw new ArgumentNullException ("bytes");
  281. }
  282. if (charIndex < 0 || charIndex > s.Length) {
  283. throw new ArgumentOutOfRangeException ("charIndex", _("ArgRange_StringIndex"));
  284. }
  285. if (charCount < 0 || charCount > (s.Length - charIndex)) {
  286. throw new ArgumentOutOfRangeException ("charCount", _("ArgRange_StringRange"));
  287. }
  288. if (byteIndex < 0 || byteIndex > bytes.Length) {
  289. throw new ArgumentOutOfRangeException ("byteIndex", _("ArgRange_Array"));
  290. }
  291. // Convert the characters into bytes.
  292. char ch;
  293. int length = bytes.Length;
  294. uint pair;
  295. int posn = byteIndex;
  296. while (charCount > 0) {
  297. // Fetch the next UTF-16 character pair value.
  298. ch = s[charIndex++];
  299. if (ch >= '\uD800' && ch <= '\uDBFF' && charCount > 1) {
  300. // This may be the start of a surrogate pair.
  301. pair = (uint)(s[charIndex]);
  302. if (pair >= (uint)0xDC00 && pair <= (uint)0xDFFF) {
  303. pair = (pair - (uint)0xDC00) +
  304. ((((uint)ch) - (uint)0xD800) << 10) +
  305. (uint)0x10000;
  306. ++charIndex;
  307. --charCount;
  308. } else {
  309. pair = (uint)ch;
  310. }
  311. } else {
  312. pair = (uint)ch;
  313. }
  314. --charCount;
  315. // Encode the character pair value.
  316. if (pair < (uint)0x0080) {
  317. if (posn >= length) {
  318. throw new ArgumentException (_("Arg_InsufficientSpace"), "bytes");
  319. }
  320. bytes[posn++] = (byte)pair;
  321. } else if (pair < (uint)0x0800) {
  322. if ((posn + 2) > length) {
  323. throw new ArgumentException (_("Arg_InsufficientSpace"), "bytes");
  324. }
  325. bytes[posn++] = (byte)(0xC0 | (pair >> 6));
  326. bytes[posn++] = (byte)(0x80 | (pair & 0x3F));
  327. } else if (pair < (uint)0x10000) {
  328. if ((posn + 3) > length) {
  329. throw new ArgumentException (_("Arg_InsufficientSpace"), "bytes");
  330. }
  331. bytes[posn++] = (byte)(0xE0 | (pair >> 12));
  332. bytes[posn++] = (byte)(0x80 | ((pair >> 6) & 0x3F));
  333. bytes[posn++] = (byte)(0x80 | (pair & 0x3F));
  334. } else {
  335. if ((posn + 4) > length) {
  336. throw new ArgumentException (_("Arg_InsufficientSpace"), "bytes");
  337. }
  338. bytes[posn++] = (byte)(0xF0 | (pair >> 18));
  339. bytes[posn++] = (byte)(0x80 | ((pair >> 12) & 0x3F));
  340. bytes[posn++] = (byte)(0x80 | ((pair >> 6) & 0x3F));
  341. bytes[posn++] = (byte)(0x80 | (pair & 0x3F));
  342. }
  343. }
  344. // Return the final count to the caller.
  345. return posn - byteIndex;
  346. }
  347. // Internal version of "GetCharCount" which can handle a rolling
  348. // state between multiple calls to this method.
  349. #if NET_2_0
  350. // Internal version of "GetCharCount" which can handle a rolling
  351. // state between multiple calls to this method.
  352. private static int InternalGetCharCount (
  353. byte[] bytes, int index, int count, uint leftOverBits,
  354. uint leftOverCount, DecoderFallbackBuffer fallbackBuffer, bool flush)
  355. #else
  356. private static int InternalGetCharCount (
  357. byte[] bytes, int index, int count, uint leftOverBits,
  358. uint leftOverCount, bool throwOnInvalid, bool flush)
  359. #endif
  360. {
  361. // Validate the parameters.
  362. if (bytes == null) {
  363. throw new ArgumentNullException ("bytes");
  364. }
  365. if (index < 0 || index > bytes.Length) {
  366. throw new ArgumentOutOfRangeException ("index", _("ArgRange_Array"));
  367. }
  368. if (count < 0 || count > (bytes.Length - index)) {
  369. throw new ArgumentOutOfRangeException ("count", _("ArgRange_Array"));
  370. }
  371. // Determine the number of characters that we have.
  372. uint ch;
  373. int length = 0;
  374. uint leftBits = leftOverBits;
  375. uint leftSoFar = (leftOverCount & (uint)0x0F);
  376. uint leftSize = ((leftOverCount >> 4) & (uint)0x0F);
  377. while (count > 0) {
  378. ch = (uint)(bytes[index++]);
  379. --count;
  380. if (leftSize == 0) {
  381. // Process a UTF-8 start character.
  382. if (ch < (uint)0x0080) {
  383. // Single-byte UTF-8 character.
  384. ++length;
  385. } else if ((ch & (uint)0xE0) == (uint)0xC0) {
  386. // Double-byte UTF-8 character.
  387. leftBits = (ch & (uint)0x1F);
  388. leftSoFar = 1;
  389. leftSize = 2;
  390. } else if ((ch & (uint)0xF0) == (uint)0xE0) {
  391. // Three-byte UTF-8 character.
  392. leftBits = (ch & (uint)0x0F);
  393. leftSoFar = 1;
  394. leftSize = 3;
  395. } else if ((ch & (uint)0xF8) == (uint)0xF0) {
  396. // Four-byte UTF-8 character.
  397. leftBits = (ch & (uint)0x07);
  398. leftSoFar = 1;
  399. leftSize = 4;
  400. } else if ((ch & (uint)0xFC) == (uint)0xF8) {
  401. // Five-byte UTF-8 character.
  402. leftBits = (ch & (uint)0x03);
  403. leftSoFar = 1;
  404. leftSize = 5;
  405. } else if ((ch & (uint)0xFE) == (uint)0xFC) {
  406. // Six-byte UTF-8 character.
  407. leftBits = (ch & (uint)0x03);
  408. leftSoFar = 1;
  409. leftSize = 6;
  410. } else {
  411. // Invalid UTF-8 start character.
  412. #if NET_2_0
  413. length += Fallback (fallbackBuffer, bytes, index - 1);
  414. #else
  415. if (throwOnInvalid)
  416. throw new ArgumentException (_("Arg_InvalidUTF8"), "bytes");
  417. #endif
  418. }
  419. } else {
  420. // Process an extra byte in a multi-byte sequence.
  421. if ((ch & (uint)0xC0) == (uint)0x80) {
  422. leftBits = ((leftBits << 6) | (ch & (uint)0x3F));
  423. if (++leftSoFar >= leftSize) {
  424. // We have a complete character now.
  425. if (leftBits < (uint)0x10000) {
  426. // is it an overlong ?
  427. bool overlong = false;
  428. switch (leftSize) {
  429. case 2:
  430. overlong = (leftBits <= 0x7F);
  431. break;
  432. case 3:
  433. overlong = (leftBits <= 0x07FF);
  434. break;
  435. case 4:
  436. overlong = (leftBits <= 0xFFFF);
  437. break;
  438. case 5:
  439. overlong = (leftBits <= 0x1FFFFF);
  440. break;
  441. case 6:
  442. overlong = (leftBits <= 0x03FFFFFF);
  443. break;
  444. }
  445. if (overlong) {
  446. #if NET_2_0
  447. length += Fallback (fallbackBuffer, bytes, index - 1);
  448. #else
  449. if (throwOnInvalid)
  450. throw new ArgumentException (_("Overlong"), leftBits.ToString ());
  451. #endif
  452. }
  453. else
  454. ++length;
  455. } else if (leftBits < (uint)0x110000) {
  456. length += 2;
  457. } else {
  458. #if NET_2_0
  459. length += Fallback (fallbackBuffer, bytes, index - 1);
  460. #else
  461. if (throwOnInvalid)
  462. throw new ArgumentException (_("Arg_InvalidUTF8"), "bytes");
  463. #endif
  464. }
  465. leftSize = 0;
  466. }
  467. } else {
  468. // Invalid UTF-8 sequence: clear and restart.
  469. #if NET_2_0
  470. length += Fallback (fallbackBuffer, bytes, index - 1);
  471. #else
  472. if (throwOnInvalid)
  473. throw new ArgumentException (_("Arg_InvalidUTF8"), "bytes");
  474. #endif
  475. leftSize = 0;
  476. --index;
  477. ++count;
  478. }
  479. }
  480. }
  481. if (flush && leftSize != 0) {
  482. // We had left-over bytes that didn't make up
  483. // a complete UTF-8 character sequence.
  484. #if NET_2_0
  485. length += Fallback (fallbackBuffer, bytes, index - 1);
  486. #else
  487. if (throwOnInvalid)
  488. throw new ArgumentException (_("Arg_InvalidUTF8"), "bytes");
  489. #endif
  490. }
  491. // Return the final length to the caller.
  492. return length;
  493. }
  494. #if NET_2_0
  495. // for GetCharCount()
  496. static int Fallback (DecoderFallbackBuffer buffer, byte [] bytes, int index)
  497. {
  498. buffer.Fallback (bytes, index - 1);
  499. return buffer.Remaining;
  500. }
  501. // for GetChars()
  502. static void Fallback (DecoderFallbackBuffer buffer, byte [] bytes, int byteIndex,
  503. char [] chars, ref int charIndex)
  504. {
  505. buffer.Fallback (bytes, byteIndex - 1);
  506. while (buffer.Remaining > 0)
  507. chars [charIndex++] = buffer.GetNextChar ();
  508. }
  509. #endif
  510. // Get the number of characters needed to decode a byte buffer.
  511. public override int GetCharCount (byte[] bytes, int index, int count)
  512. {
  513. #if NET_2_0
  514. return InternalGetCharCount (bytes, index, count, 0, 0, DecoderFallback.CreateFallbackBuffer (), true);
  515. #else
  516. return InternalGetCharCount (bytes, index, count, 0, 0, throwOnInvalid, true);
  517. #endif
  518. }
  519. // Get the characters that result from decoding a byte buffer.
  520. #if NET_2_0
  521. private static int InternalGetChars (
  522. byte[] bytes, int byteIndex, int byteCount, char[] chars,
  523. int charIndex, ref uint leftOverBits, ref uint leftOverCount,
  524. DecoderFallbackBuffer fallbackBuffer, bool flush)
  525. #else
  526. private static int InternalGetChars (
  527. byte[] bytes, int byteIndex, int byteCount, char[] chars,
  528. int charIndex, ref uint leftOverBits, ref uint leftOverCount,
  529. bool throwOnInvalid, bool flush)
  530. #endif
  531. {
  532. // Validate the parameters.
  533. if (bytes == null) {
  534. throw new ArgumentNullException ("bytes");
  535. }
  536. if (chars == null) {
  537. throw new ArgumentNullException ("chars");
  538. }
  539. if (byteIndex < 0 || byteIndex > bytes.Length) {
  540. throw new ArgumentOutOfRangeException ("byteIndex", _("ArgRange_Array"));
  541. }
  542. if (byteCount < 0 || byteCount > (bytes.Length - byteIndex)) {
  543. throw new ArgumentOutOfRangeException ("byteCount", _("ArgRange_Array"));
  544. }
  545. if (charIndex < 0 || charIndex > chars.Length) {
  546. throw new ArgumentOutOfRangeException ("charIndex", _("ArgRange_Array"));
  547. }
  548. if (charIndex == chars.Length)
  549. return 0;
  550. // Convert the bytes into the output buffer.
  551. uint ch;
  552. int length = chars.Length;
  553. int posn = charIndex;
  554. uint leftBits = leftOverBits;
  555. uint leftSoFar = (leftOverCount & (uint)0x0F);
  556. uint leftSize = ((leftOverCount >> 4) & (uint)0x0F);
  557. while (byteCount > 0) {
  558. // Fetch the next character from the byte buffer.
  559. ch = (uint)(bytes[byteIndex++]);
  560. --byteCount;
  561. if (leftSize == 0) {
  562. // Process a UTF-8 start character.
  563. if (ch < (uint)0x0080) {
  564. // Single-byte UTF-8 character.
  565. if (posn >= length) {
  566. throw new ArgumentException (_("Arg_InsufficientSpace"), "chars");
  567. }
  568. chars[posn++] = (char)ch;
  569. } else if ((ch & (uint)0xE0) == (uint)0xC0) {
  570. // Double-byte UTF-8 character.
  571. leftBits = (ch & (uint)0x1F);
  572. leftSoFar = 1;
  573. leftSize = 2;
  574. } else if ((ch & (uint)0xF0) == (uint)0xE0) {
  575. // Three-byte UTF-8 character.
  576. leftBits = (ch & (uint)0x0F);
  577. leftSoFar = 1;
  578. leftSize = 3;
  579. } else if ((ch & (uint)0xF8) == (uint)0xF0) {
  580. // Four-byte UTF-8 character.
  581. leftBits = (ch & (uint)0x07);
  582. leftSoFar = 1;
  583. leftSize = 4;
  584. } else if ((ch & (uint)0xFC) == (uint)0xF8) {
  585. // Five-byte UTF-8 character.
  586. leftBits = (ch & (uint)0x03);
  587. leftSoFar = 1;
  588. leftSize = 5;
  589. } else if ((ch & (uint)0xFE) == (uint)0xFC) {
  590. // Six-byte UTF-8 character.
  591. leftBits = (ch & (uint)0x03);
  592. leftSoFar = 1;
  593. leftSize = 6;
  594. } else {
  595. // Invalid UTF-8 start character.
  596. #if NET_2_0
  597. Fallback (fallbackBuffer, bytes, byteIndex, chars, ref posn);
  598. #else
  599. if (throwOnInvalid)
  600. throw new ArgumentException (_("Arg_InvalidUTF8"), "bytes");
  601. #endif
  602. }
  603. } else {
  604. // Process an extra byte in a multi-byte sequence.
  605. if ((ch & (uint)0xC0) == (uint)0x80) {
  606. leftBits = ((leftBits << 6) | (ch & (uint)0x3F));
  607. if (++leftSoFar >= leftSize) {
  608. // We have a complete character now.
  609. if (leftBits < (uint)0x10000) {
  610. // is it an overlong ?
  611. bool overlong = false;
  612. switch (leftSize) {
  613. case 2:
  614. overlong = (leftBits <= 0x7F);
  615. break;
  616. case 3:
  617. overlong = (leftBits <= 0x07FF);
  618. break;
  619. case 4:
  620. overlong = (leftBits <= 0xFFFF);
  621. break;
  622. case 5:
  623. overlong = (leftBits <= 0x1FFFFF);
  624. break;
  625. case 6:
  626. overlong = (leftBits <= 0x03FFFFFF);
  627. break;
  628. }
  629. if (overlong) {
  630. #if NET_2_0
  631. Fallback (fallbackBuffer, bytes, byteIndex, chars, ref posn);
  632. #else
  633. if (throwOnInvalid)
  634. throw new ArgumentException (_("Overlong"), leftBits.ToString ());
  635. #endif
  636. }
  637. else {
  638. if (posn >= length) {
  639. throw new ArgumentException
  640. (_("Arg_InsufficientSpace"), "chars");
  641. }
  642. chars[posn++] = (char)leftBits;
  643. }
  644. } else if (leftBits < (uint)0x110000) {
  645. if ((posn + 2) > length) {
  646. throw new ArgumentException
  647. (_("Arg_InsufficientSpace"), "chars");
  648. }
  649. leftBits -= (uint)0x10000;
  650. chars[posn++] = (char)((leftBits >> 10) +
  651. (uint)0xD800);
  652. chars[posn++] =
  653. (char)((leftBits & (uint)0x3FF) + (uint)0xDC00);
  654. } else {
  655. #if NET_2_0
  656. Fallback (fallbackBuffer, bytes, byteIndex, chars, ref posn);
  657. #else
  658. if (throwOnInvalid)
  659. throw new ArgumentException (_("Arg_InvalidUTF8"), "bytes");
  660. #endif
  661. }
  662. leftSize = 0;
  663. }
  664. } else {
  665. // Invalid UTF-8 sequence: clear and restart.
  666. #if NET_2_0
  667. Fallback (fallbackBuffer, bytes, byteIndex, chars, ref posn);
  668. #else
  669. if (throwOnInvalid)
  670. throw new ArgumentException (_("Arg_InvalidUTF8"), "bytes");
  671. #endif
  672. leftSize = 0;
  673. --byteIndex;
  674. ++byteCount;
  675. }
  676. }
  677. }
  678. if (flush && leftSize != 0) {
  679. // We had left-over bytes that didn't make up
  680. // a complete UTF-8 character sequence.
  681. #if NET_2_0
  682. Fallback (fallbackBuffer, bytes, byteIndex, chars, ref posn);
  683. #else
  684. if (throwOnInvalid)
  685. throw new ArgumentException (_("Arg_InvalidUTF8"), "bytes");
  686. #endif
  687. }
  688. leftOverBits = leftBits;
  689. leftOverCount = (leftSoFar | (leftSize << 4));
  690. // Return the final length to the caller.
  691. return posn - charIndex;
  692. }
  693. // Get the characters that result from decoding a byte buffer.
  694. public override int GetChars (byte[] bytes, int byteIndex, int byteCount,
  695. char[] chars, int charIndex)
  696. {
  697. uint leftOverBits = 0;
  698. uint leftOverCount = 0;
  699. #if NET_2_0
  700. return InternalGetChars (bytes, byteIndex, byteCount, chars,
  701. charIndex, ref leftOverBits, ref leftOverCount, DecoderFallback.CreateFallbackBuffer (), true);
  702. #else
  703. return InternalGetChars (bytes, byteIndex, byteCount, chars,
  704. charIndex, ref leftOverBits, ref leftOverCount, throwOnInvalid, true);
  705. #endif
  706. }
  707. // Get the maximum number of bytes needed to encode a
  708. // specified number of characters.
  709. public override int GetMaxByteCount (int charCount)
  710. {
  711. if (charCount < 0) {
  712. throw new ArgumentOutOfRangeException ("charCount", _("ArgRange_NonNegative"));
  713. }
  714. return charCount * 4;
  715. }
  716. // Get the maximum number of characters needed to decode a
  717. // specified number of bytes.
  718. public override int GetMaxCharCount (int byteCount)
  719. {
  720. if (byteCount < 0) {
  721. throw new ArgumentOutOfRangeException ("byteCount", _("ArgRange_NonNegative"));
  722. }
  723. return byteCount;
  724. }
  725. // Get a UTF8-specific decoder that is attached to this instance.
  726. public override Decoder GetDecoder ()
  727. {
  728. #if NET_2_0
  729. UTF8Decoder ret = new UTF8Decoder ();
  730. ret.Fallback = DecoderFallback;
  731. return ret;
  732. #else
  733. return new UTF8Decoder (throwOnInvalid);
  734. #endif
  735. }
  736. // Get a UTF8-specific encoder that is attached to this instance.
  737. public override Encoder GetEncoder ()
  738. {
  739. return new UTF8Encoder (emitIdentifier);
  740. }
  741. // Get the UTF8 preamble.
  742. public override byte[] GetPreamble ()
  743. {
  744. if (emitIdentifier) {
  745. byte[] pre = new byte [3];
  746. pre[0] = (byte)0xEF;
  747. pre[1] = (byte)0xBB;
  748. pre[2] = (byte)0xBF;
  749. return pre;
  750. } else {
  751. return new byte [0];
  752. }
  753. }
  754. // Determine if this object is equal to another.
  755. public override bool Equals (Object value)
  756. {
  757. UTF8Encoding enc = (value as UTF8Encoding);
  758. if (enc != null) {
  759. #if NET_2_0
  760. return (codePage == enc.codePage &&
  761. emitIdentifier == enc.emitIdentifier &&
  762. DecoderFallback == enc.DecoderFallback &&
  763. EncoderFallback == enc.EncoderFallback);
  764. #else
  765. return (codePage == enc.codePage &&
  766. emitIdentifier == enc.emitIdentifier &&
  767. throwOnInvalid == enc.throwOnInvalid);
  768. #endif
  769. } else {
  770. return false;
  771. }
  772. }
  773. // Get the hash code for this object.
  774. public override int GetHashCode ()
  775. {
  776. return base.GetHashCode ();
  777. }
  778. public override byte [] GetBytes (String s)
  779. {
  780. if (s == null)
  781. throw new ArgumentNullException ("s");
  782. int length = GetByteCount (s);
  783. byte [] bytes = new byte [length];
  784. GetBytes (s, 0, s.Length, bytes, 0);
  785. return bytes;
  786. }
  787. // UTF-8 decoder implementation.
  788. [Serializable]
  789. private class UTF8Decoder : Decoder
  790. {
  791. #if !NET_2_0
  792. private bool throwOnInvalid;
  793. #endif
  794. private uint leftOverBits;
  795. private uint leftOverCount;
  796. // Constructor.
  797. #if NET_2_0
  798. public UTF8Decoder ()
  799. #else
  800. public UTF8Decoder (bool throwOnInvalid)
  801. #endif
  802. {
  803. #if !NET_2_0
  804. this.throwOnInvalid = throwOnInvalid;
  805. #endif
  806. leftOverBits = 0;
  807. leftOverCount = 0;
  808. }
  809. // Override inherited methods.
  810. public override int GetCharCount (byte[] bytes, int index, int count)
  811. {
  812. #if NET_2_0
  813. return InternalGetCharCount (bytes, index, count,
  814. leftOverBits, leftOverCount, FallbackBuffer, false);
  815. #else
  816. return InternalGetCharCount (bytes, index, count,
  817. leftOverBits, leftOverCount, throwOnInvalid, false);
  818. #endif
  819. }
  820. public override int GetChars (byte[] bytes, int byteIndex,
  821. int byteCount, char[] chars, int charIndex)
  822. {
  823. #if NET_2_0
  824. return InternalGetChars (bytes, byteIndex, byteCount,
  825. chars, charIndex, ref leftOverBits, ref leftOverCount, FallbackBuffer, false);
  826. #else
  827. return InternalGetChars (bytes, byteIndex, byteCount,
  828. chars, charIndex, ref leftOverBits, ref leftOverCount, throwOnInvalid, false);
  829. #endif
  830. }
  831. } // class UTF8Decoder
  832. // UTF-8 encoder implementation.
  833. [Serializable]
  834. private class UTF8Encoder : Encoder
  835. {
  836. private bool emitIdentifier;
  837. private uint leftOver;
  838. // Constructor.
  839. public UTF8Encoder (bool emitIdentifier)
  840. {
  841. this.emitIdentifier = emitIdentifier;
  842. leftOver = 0;
  843. }
  844. // Override inherited methods.
  845. public override int GetByteCount (char[] chars, int index,
  846. int count, bool flush)
  847. {
  848. return InternalGetByteCount (chars, index, count, leftOver, flush);
  849. }
  850. public override int GetBytes (char[] chars, int charIndex,
  851. int charCount, byte[] bytes, int byteCount, bool flush)
  852. {
  853. int result;
  854. result = InternalGetBytes (chars, charIndex, charCount, bytes, byteCount, ref leftOver, flush);
  855. emitIdentifier = false;
  856. return result;
  857. }
  858. } // class UTF8Encoder
  859. }; // class UTF8Encoding
  860. }; // namespace System.Text