UTF8Encoding.cs 23 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794
  1. /*
  2. * UTF8Encoding.cs - Implementation of the "System.Text.UTF8Encoding" class.
  3. *
  4. * Copyright (c) 2001, 2002 Southern Storm Software, Pty Ltd
  5. * Copyright (C) 2004 Novell, Inc (http://www.novell.com)
  6. *
  7. * Permission is hereby granted, free of charge, to any person obtaining
  8. * a copy of this software and associated documentation files (the "Software"),
  9. * to deal in the Software without restriction, including without limitation
  10. * the rights to use, copy, modify, merge, publish, distribute, sublicense,
  11. * and/or sell copies of the Software, and to permit persons to whom the
  12. * Software is furnished to do so, subject to the following conditions:
  13. *
  14. * The above copyright notice and this permission notice shall be included
  15. * in all copies or substantial portions of the Software.
  16. *
  17. * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
  18. * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  19. * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
  20. * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
  21. * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
  22. * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
  23. * OTHER DEALINGS IN THE SOFTWARE.
  24. */
  25. namespace System.Text
  26. {
  27. using System;
  28. [Serializable]
  29. [MonoTODO ("Fix serialization compatibility with MS.NET")]
  30. public class UTF8Encoding : Encoding
  31. {
  32. // Magic number used by Windows for UTF-8.
  33. internal const int UTF8_CODE_PAGE = 65001;
  34. // Internal state.
  35. private bool emitIdentifier;
  36. private bool throwOnInvalid;
  // Constructors.

  // Default form: identifier flag off, invalid-byte exceptions off.
  public UTF8Encoding ()
      : this (false, false)
  {
  }

  // Lets the caller choose the identifier flag; invalid-byte
  // exceptions stay off.
  public UTF8Encoding (bool encoderShouldEmitUTF8Identifier)
      : this (encoderShouldEmitUTF8Identifier, false)
  {
  }

  // Full form: records both switches and fills in the descriptive
  // fields inherited from the base class.
  public UTF8Encoding (bool encoderShouldEmitUTF8Identifier, bool throwOnInvalidBytes)
      : base (UTF8_CODE_PAGE)
  {
      // Behavioural switches.
      this.emitIdentifier = encoderShouldEmitUTF8Identifier;
      this.throwOnInvalid = throwOnInvalidBytes;

      // Names reported for this encoding.
      header_name = "utf-8";
      body_name = "utf-8";
      web_name = "utf-8";
      encoding_name = "Unicode (UTF-8)";

      // Capability flags.
      is_browser_save = true;
      is_browser_display = true;
      is_mail_news_display = true;

      // The Windows code page is taken from UnicodeEncoding, not
      // from UTF8_CODE_PAGE.
      windows_code_page = UnicodeEncoding.UNICODE_CODE_PAGE;
  }
  // Internal version of "GetByteCount" which can handle a rolling
  // state between multiple calls to this method.
  //
  // Returns the number of UTF-8 bytes needed for "count" UTF-16 code
  // units of "chars" starting at "index".  "leftOver" is a high
  // surrogate carried over from an earlier call (0 when nothing is
  // pending).  When "flush" is set, a high surrogate that is still
  // pending at the end is counted as three bytes.
  private static int InternalGetByteCount (char[] chars, int index, int count, uint leftOver, bool flush)
  {
      // Validate the parameters.
      if (chars == null) {
          throw new ArgumentNullException ("chars");
      }
      if (index < 0 || index > chars.Length) {
          throw new ArgumentOutOfRangeException ("index", _("ArgRange_Array"));
      }
      if (count < 0 || count > (chars.Length - index)) {
          throw new ArgumentOutOfRangeException ("count", _("ArgRange_Array"));
      }

      int total = 0;
      uint pendingHigh = leftOver;
      int cursor = index;
      int stop = index + count;

      while (cursor < stop) {
          char current = chars[cursor];

          if (pendingHigh != 0) {
              // A high surrogate is waiting for its partner.
              pendingHigh = 0;
              if (current >= '\uDC00' && current <= '\uDFFF') {
                  // Complete pair: one four-byte sequence.
                  total += 4;
                  ++cursor;
              } else {
                  // Unpaired high surrogate: count it on its own
                  // (three bytes) and look at the current
                  // character again on the next pass.
                  total += 3;
              }
              continue;
          }

          if (current >= '\uD800' && current <= '\uDBFF') {
              // Start of a surrogate pair; nothing counted yet.
              pendingHigh = (uint)current;
          } else if (current < '\u0080') {
              total += 1;
          } else if (current < '\u0800') {
              total += 2;
          } else {
              total += 3;
          }
          ++cursor;
      }

      if (flush && pendingHigh != 0) {
          // Flush the left-over surrogate pair start.
          total += 3;
      }

      return total;
  }
  // Get the number of bytes needed to encode a character buffer.
  // Starts with no pending surrogate and flushes at the end.
  public override int GetByteCount (char[] chars, int index, int count)
  {
      uint noPendingSurrogate = 0;
      int byteTotal = InternalGetByteCount (chars, index, count, noPendingSurrogate, true);
      return byteTotal;
  }
  // Convenience wrappers for "GetByteCount".
  //
  // Returns the number of UTF-8 bytes needed to encode the string "s".
  // A high surrogate immediately followed by a low surrogate counts as
  // four bytes; any other surrogate counts as three.
  public override int GetByteCount (String s)
  {
      // Validate the parameters.
      if (s == null) {
          throw new ArgumentNullException ("s");
      }

      int total = 0;
      int limit = s.Length;

      for (int i = 0; i < limit; i++) {
          char current = s[i];

          if (current < '\u0080') {
              total += 1;
              continue;
          }
          if (current < '\u0800') {
              total += 2;
              continue;
          }

          bool isHighSurrogate = (current >= '\uD800' && current <= '\uDBFF');
          if (isHighSurrogate && (i + 1) < limit) {
              // This may be the start of a surrogate pair.
              uint next = (uint)(s[i + 1]);
              if (next >= (uint)0xDC00 && next <= (uint)0xDFFF) {
                  total += 4;
                  ++i;	// the low surrogate is consumed too
                  continue;
              }
          }

          total += 3;
      }

      // Return the final length to the caller.
      return total;
  }
  // Internal version of "GetBytes" which can handle a rolling
  // state between multiple calls to this method.
  //
  // Encodes "charCount" UTF-16 code units of "chars", starting at
  // "charIndex", as UTF-8 into "bytes" starting at "byteIndex", and
  // returns the number of bytes written.  "leftOver" carries a pending
  // high surrogate between calls (0 when there is none); it is written
  // back only when the method completes without throwing.  When
  // "flush" is set, a high surrogate still pending at the end is
  // written as a three-byte sequence.  An ArgumentException is thrown
  // when "bytes" has no room for the next sequence.
  private static int InternalGetBytes (char[] chars, int charIndex,
                                       int charCount, byte[] bytes,
                                       int byteIndex, ref uint leftOver,
                                       bool flush)
  {
      // Validate the parameters.
      if (chars == null) {
          throw new ArgumentNullException ("chars");
      }
      if (bytes == null) {
          throw new ArgumentNullException ("bytes");
      }
      if (charIndex < 0 || charIndex > chars.Length) {
          throw new ArgumentOutOfRangeException ("charIndex", _("ArgRange_Array"));
      }
      if (charCount < 0 || charCount > (chars.Length - charIndex)) {
          throw new ArgumentOutOfRangeException ("charCount", _("ArgRange_Array"));
      }
      if (byteIndex < 0 || byteIndex > bytes.Length) {
          throw new ArgumentOutOfRangeException ("byteIndex", _("ArgRange_Array"));
      }

      int capacity = bytes.Length;
      int outPos = byteIndex;
      uint pendingHigh = leftOver;
      int inPos = charIndex;
      int inEnd = charIndex + charCount;

      while (inPos < inEnd) {
          char current = chars[inPos];
          uint scalar;

          if (pendingHigh != 0) {
              if (current >= '\uDC00' && current <= '\uDFFF') {
                  // We have a surrogate pair.
                  scalar = ((pendingHigh - (uint)0xD800) << 10) +
                           (((uint)current) - (uint)0xDC00) +
                           (uint)0x10000;
                  ++inPos;
              } else {
                  // Surrogate start followed by a regular
                  // character: write out the surrogate start
                  // by itself and leave the current character
                  // for the next pass.
                  scalar = pendingHigh;
              }
              pendingHigh = 0;
          } else if (current >= '\uD800' && current <= '\uDBFF') {
              // Start of a surrogate pair: remember it and move on.
              pendingHigh = (uint)current;
              ++inPos;
              continue;
          } else {
              // This is a regular character.
              scalar = (uint)current;
              ++inPos;
          }

          // Work out how many bytes this value needs.
          int need;
          if (scalar < (uint)0x0080) {
              need = 1;
          } else if (scalar < (uint)0x0800) {
              need = 2;
          } else if (scalar < (uint)0x10000) {
              need = 3;
          } else {
              need = 4;
          }
          if ((capacity - outPos) < need) {
              throw new ArgumentException (_("Arg_InsufficientSpace"), "bytes");
          }

          // Encode the value.
          switch (need) {
          case 1:
              bytes[outPos++] = (byte)scalar;
              break;
          case 2:
              bytes[outPos++] = (byte)(0xC0 | (scalar >> 6));
              bytes[outPos++] = (byte)(0x80 | (scalar & 0x3F));
              break;
          case 3:
              bytes[outPos++] = (byte)(0xE0 | (scalar >> 12));
              bytes[outPos++] = (byte)(0x80 | ((scalar >> 6) & 0x3F));
              bytes[outPos++] = (byte)(0x80 | (scalar & 0x3F));
              break;
          default:
              bytes[outPos++] = (byte)(0xF0 | (scalar >> 18));
              bytes[outPos++] = (byte)(0x80 | ((scalar >> 12) & 0x3F));
              bytes[outPos++] = (byte)(0x80 | ((scalar >> 6) & 0x3F));
              bytes[outPos++] = (byte)(0x80 | (scalar & 0x3F));
              break;
          }
      }

      if (flush && pendingHigh != 0) {
          // Flush the left-over surrogate pair start.
          if ((capacity - outPos) < 3) {
              throw new ArgumentException (_("Arg_InsufficientSpace"), "bytes");
          }
          bytes[outPos++] = (byte)(0xE0 | (pendingHigh >> 12));
          bytes[outPos++] = (byte)(0x80 | ((pendingHigh >> 6) & 0x3F));
          bytes[outPos++] = (byte)(0x80 | (pendingHigh & 0x3F));
          pendingHigh = 0;
      }

      // Hand the rolling state back to the caller.
      leftOver = pendingHigh;

      // Return the final count to the caller.
      return outPos - byteIndex;
  }
  // Get the bytes that result from encoding a character buffer.
  // Each call starts with no pending surrogate and flushes at the end;
  // the rolling state is discarded afterwards.
  public override int GetBytes (char[] chars, int charIndex, int charCount,
                                byte[] bytes, int byteIndex)
  {
      uint pendingSurrogate = 0;
      int written = InternalGetBytes (chars, charIndex, charCount,
                                      bytes, byteIndex,
                                      ref pendingSurrogate, true);
      return written;
  }
  // Convenience wrappers for "GetBytes".
  //
  // Encodes "charCount" characters of "s", starting at "charIndex", as
  // UTF-8 into "bytes" starting at "byteIndex", and returns the number
  // of bytes written.  A high surrogate immediately followed by a low
  // surrogate inside the range becomes one four-byte sequence; any
  // other surrogate is written as a three-byte sequence on its own.
  // An ArgumentException is thrown when "bytes" is too small.
  public override int GetBytes (String s, int charIndex, int charCount,
                                byte[] bytes, int byteIndex)
  {
      // Validate the parameters.
      if (s == null) {
          throw new ArgumentNullException ("s");
      }
      if (bytes == null) {
          throw new ArgumentNullException ("bytes");
      }
      if (charIndex < 0 || charIndex > s.Length) {
          throw new ArgumentOutOfRangeException ("charIndex", _("ArgRange_StringIndex"));
      }
      if (charCount < 0 || charCount > (s.Length - charIndex)) {
          throw new ArgumentOutOfRangeException ("charCount", _("ArgRange_StringRange"));
      }
      if (byteIndex < 0 || byteIndex > bytes.Length) {
          throw new ArgumentOutOfRangeException ("byteIndex", _("ArgRange_Array"));
      }

      int capacity = bytes.Length;
      int outPos = byteIndex;
      int inEnd = charIndex + charCount;

      for (int i = charIndex; i < inEnd; i++) {
          char current = s[i];
          uint scalar = (uint)current;

          // Combine with the following character when the two form
          // a surrogate pair that lies inside the requested range.
          if (current >= '\uD800' && current <= '\uDBFF' && (i + 1) < inEnd) {
              uint next = (uint)(s[i + 1]);
              if (next >= (uint)0xDC00 && next <= (uint)0xDFFF) {
                  scalar = (next - (uint)0xDC00) +
                           ((((uint)current) - (uint)0xD800) << 10) +
                           (uint)0x10000;
                  ++i;
              }
          }

          // Work out how many bytes this value needs.
          int need;
          if (scalar < (uint)0x0080) {
              need = 1;
          } else if (scalar < (uint)0x0800) {
              need = 2;
          } else if (scalar < (uint)0x10000) {
              need = 3;
          } else {
              need = 4;
          }
          if ((capacity - outPos) < need) {
              throw new ArgumentException (_("Arg_InsufficientSpace"), "bytes");
          }

          // Encode the value.
          switch (need) {
          case 1:
              bytes[outPos++] = (byte)scalar;
              break;
          case 2:
              bytes[outPos++] = (byte)(0xC0 | (scalar >> 6));
              bytes[outPos++] = (byte)(0x80 | (scalar & 0x3F));
              break;
          case 3:
              bytes[outPos++] = (byte)(0xE0 | (scalar >> 12));
              bytes[outPos++] = (byte)(0x80 | ((scalar >> 6) & 0x3F));
              bytes[outPos++] = (byte)(0x80 | (scalar & 0x3F));
              break;
          default:
              bytes[outPos++] = (byte)(0xF0 | (scalar >> 18));
              bytes[outPos++] = (byte)(0x80 | ((scalar >> 12) & 0x3F));
              bytes[outPos++] = (byte)(0x80 | ((scalar >> 6) & 0x3F));
              bytes[outPos++] = (byte)(0x80 | (scalar & 0x3F));
              break;
          }
      }

      // Return the final count to the caller.
      return outPos - byteIndex;
  }
  // Internal version of "GetCharCount" which can handle a rolling
  // state between multiple calls to this method.
  //
  // Returns the number of UTF-16 code units obtained by decoding
  // "count" bytes of "bytes" starting at "index".  "leftOverBits" holds
  // the payload bits of a sequence that was started earlier;
  // "leftOverCount" packs the number of bytes already read (low
  // nibble) and the expected sequence length (next nibble).  Malformed
  // input is skipped, or reported with an ArgumentException when
  // "throwOnInvalid" is set.  With "flush" and "throwOnInvalid" both
  // set, a sequence left incomplete at the end is an error as well.
  private static int InternalGetCharCount (byte[] bytes, int index, int count,
                                           uint leftOverBits,
                                           uint leftOverCount,
                                           bool throwOnInvalid, bool flush)
  {
      // Validate the parameters.
      if (bytes == null) {
          throw new ArgumentNullException ("bytes");
      }
      if (index < 0 || index > bytes.Length) {
          throw new ArgumentOutOfRangeException ("index", _("ArgRange_Array"));
      }
      if (count < 0 || count > (bytes.Length - index)) {
          throw new ArgumentOutOfRangeException ("count", _("ArgRange_Array"));
      }

      int total = 0;
      uint acc = leftOverBits;
      uint seen = (leftOverCount & (uint)0x0F);
      uint expected = ((leftOverCount >> 4) & (uint)0x0F);
      int pos = index;
      int end = index + count;

      while (pos < end) {
          uint b = (uint)(bytes[pos++]);

          if (expected == 0) {
              // Process a UTF-8 start character.
              if (b < (uint)0x0080) {
                  // Single-byte UTF-8 character.
                  ++total;
                  continue;
              }

              uint seqLen;
              uint payloadMask;
              if ((b & (uint)0xE0) == (uint)0xC0) {
                  seqLen = 2;
                  payloadMask = (uint)0x1F;
              } else if ((b & (uint)0xF0) == (uint)0xE0) {
                  seqLen = 3;
                  payloadMask = (uint)0x0F;
              } else if ((b & (uint)0xF8) == (uint)0xF0) {
                  seqLen = 4;
                  payloadMask = (uint)0x07;
              } else if ((b & (uint)0xFC) == (uint)0xF8) {
                  seqLen = 5;
                  payloadMask = (uint)0x03;
              } else if ((b & (uint)0xFC) == (uint)0xFC) {
                  seqLen = 6;
                  payloadMask = (uint)0x03;
              } else {
                  // Invalid UTF-8 start character.
                  if (throwOnInvalid) {
                      throw new ArgumentException (_("Arg_InvalidUTF8"), "bytes");
                  }
                  continue;
              }

              acc = (b & payloadMask);
              seen = 1;
              expected = seqLen;
              continue;
          }

          if ((b & (uint)0xC0) != (uint)0x80) {
              // Invalid UTF-8 sequence: clear and restart, looking
              // at this byte again as a start character.
              if (throwOnInvalid) {
                  throw new ArgumentException (_("Arg_InvalidUTF8"), "bytes");
              }
              expected = 0;
              --pos;
              continue;
          }

          // Process an extra byte in a multi-byte sequence.
          acc = ((acc << 6) | (b & (uint)0x3F));
          if (++seen < expected) {
              continue;
          }

          // We have a complete character now.
          if (acc >= (uint)0x110000) {
              if (throwOnInvalid) {
                  throw new ArgumentException (_("Arg_InvalidUTF8"), "bytes");
              }
          } else if (acc >= (uint)0x10000) {
              // Needs a surrogate pair.
              total += 2;
          } else if (acc != (uint)0xFEFF) {
              // U+FEFF is not counted at all.  For everything
              // else: is it an overlong ?
              bool overlong =
                  (expected == 2 && acc <= 0x7F) ||
                  (expected == 3 && acc <= 0x07FF) ||
                  (expected == 4 && acc <= 0xFFFF) ||
                  (expected == 5 && acc <= 0x1FFFFF) ||
                  (expected == 6 && acc <= 0x03FFFFFF);
              if (!overlong) {
                  ++total;
              } else if (throwOnInvalid) {
                  throw new ArgumentException (_("Overlong"), acc.ToString ());
              }
          }
          expected = 0;
      }

      if (flush && expected != 0 && throwOnInvalid) {
          // We had left-over bytes that didn't make up
          // a complete UTF-8 character sequence.
          throw new ArgumentException (_("Arg_InvalidUTF8"), "bytes");
      }

      // Return the final length to the caller.
      return total;
  }
  // Get the number of characters needed to decode a byte buffer.
  // Starts with empty rolling state, uses this instance's
  // "throwOnInvalid" setting and flushes at the end.
  public override int GetCharCount (byte[] bytes, int index, int count)
  {
      uint noBits = 0;
      uint noCount = 0;
      int charTotal = InternalGetCharCount (bytes, index, count,
                                            noBits, noCount,
                                            throwOnInvalid, true);
      return charTotal;
  }
  // Get the characters that result from decoding a byte buffer.
  //
  // Internal version of "GetChars" which can handle a rolling state
  // between multiple calls to this method.  Decodes "byteCount" bytes
  // of "bytes", starting at "byteIndex", into "chars" starting at
  // "charIndex", and returns the number of UTF-16 code units written.
  //
  // "leftOverBits" holds the payload bits of a sequence that was
  // started earlier; "leftOverCount" packs the number of bytes already
  // read (low nibble) and the expected sequence length (next nibble).
  // Both are written back only when the method completes without
  // throwing.
  //
  // Malformed input is skipped, or reported with an ArgumentException
  // when "throwOnInvalid" is set; with "flush" and "throwOnInvalid"
  // both set, a sequence left incomplete at the end is an error too.
  // A decoded U+FEFF is not written to the output.  An
  // ArgumentException is thrown when "chars" has no room for the next
  // character.
  //
  // There is deliberately no shortcut for charIndex == chars.Length:
  // the input must still be scanned so that a partial sequence is
  // remembered in the rolling state, and a character that does not fit
  // is reported instead of being silently discarded.  This also keeps
  // the method in step with InternalGetCharCount.
  private static int InternalGetChars (byte[] bytes, int byteIndex,
                                       int byteCount, char[] chars,
                                       int charIndex, ref uint leftOverBits,
                                       ref uint leftOverCount,
                                       bool throwOnInvalid, bool flush)
  {
      // Validate the parameters.
      if (bytes == null) {
          throw new ArgumentNullException ("bytes");
      }
      if (chars == null) {
          throw new ArgumentNullException ("chars");
      }
      if (byteIndex < 0 || byteIndex > bytes.Length) {
          throw new ArgumentOutOfRangeException ("byteIndex", _("ArgRange_Array"));
      }
      if (byteCount < 0 || byteCount > (bytes.Length - byteIndex)) {
          throw new ArgumentOutOfRangeException ("byteCount", _("ArgRange_Array"));
      }
      if (charIndex < 0 || charIndex > chars.Length) {
          throw new ArgumentOutOfRangeException ("charIndex", _("ArgRange_Array"));
      }

      // Convert the bytes into the output buffer.
      uint ch;
      int length = chars.Length;
      int posn = charIndex;
      uint leftBits = leftOverBits;
      uint leftSoFar = (leftOverCount & (uint)0x0F);
      uint leftSize = ((leftOverCount >> 4) & (uint)0x0F);
      while (byteCount > 0) {
          // Fetch the next character from the byte buffer.
          ch = (uint)(bytes[byteIndex++]);
          --byteCount;
          if (leftSize == 0) {
              // Process a UTF-8 start character.
              if (ch < (uint)0x0080) {
                  // Single-byte UTF-8 character.
                  if (posn >= length) {
                      throw new ArgumentException (_("Arg_InsufficientSpace"), "chars");
                  }
                  chars[posn++] = (char)ch;
              } else if ((ch & (uint)0xE0) == (uint)0xC0) {
                  // Double-byte UTF-8 character.
                  leftBits = (ch & (uint)0x1F);
                  leftSoFar = 1;
                  leftSize = 2;
              } else if ((ch & (uint)0xF0) == (uint)0xE0) {
                  // Three-byte UTF-8 character.
                  leftBits = (ch & (uint)0x0F);
                  leftSoFar = 1;
                  leftSize = 3;
              } else if ((ch & (uint)0xF8) == (uint)0xF0) {
                  // Four-byte UTF-8 character.
                  leftBits = (ch & (uint)0x07);
                  leftSoFar = 1;
                  leftSize = 4;
              } else if ((ch & (uint)0xFC) == (uint)0xF8) {
                  // Five-byte UTF-8 character.
                  leftBits = (ch & (uint)0x03);
                  leftSoFar = 1;
                  leftSize = 5;
              } else if ((ch & (uint)0xFC) == (uint)0xFC) {
                  // Six-byte UTF-8 character.
                  leftBits = (ch & (uint)0x03);
                  leftSoFar = 1;
                  leftSize = 6;
              } else {
                  // Invalid UTF-8 start character.
                  if (throwOnInvalid) {
                      throw new ArgumentException (_("Arg_InvalidUTF8"), "bytes");
                  }
              }
          } else {
              // Process an extra byte in a multi-byte sequence.
              if ((ch & (uint)0xC0) == (uint)0x80) {
                  leftBits = ((leftBits << 6) | (ch & (uint)0x3F));
                  if (++leftSoFar >= leftSize) {
                      // We have a complete character now.
                      if (leftBits < (uint)0x10000) {
                          if (leftBits != (uint)0xFEFF) {
                              // is it an overlong ?
                              bool overlong = false;
                              switch (leftSize) {
                              case 2:
                                  overlong = (leftBits <= 0x7F);
                                  break;
                              case 3:
                                  overlong = (leftBits <= 0x07FF);
                                  break;
                              case 4:
                                  overlong = (leftBits <= 0xFFFF);
                                  break;
                              case 5:
                                  overlong = (leftBits <= 0x1FFFFF);
                                  break;
                              case 6:
                                  overlong = (leftBits <= 0x03FFFFFF);
                                  break;
                              }
                              if (overlong) {
                                  if (throwOnInvalid) {
                                      throw new ArgumentException (_("Overlong"), leftBits.ToString ());
                                  }
                              } else {
                                  if (posn >= length) {
                                      throw new ArgumentException
                                          (_("Arg_InsufficientSpace"), "chars");
                                  }
                                  chars[posn++] = (char)leftBits;
                              }
                          }
                      } else if (leftBits < (uint)0x110000) {
                          // Outside the 16-bit range: write a
                          // surrogate pair.
                          if ((posn + 2) > length) {
                              throw new ArgumentException
                                  (_("Arg_InsufficientSpace"), "chars");
                          }
                          leftBits -= (uint)0x10000;
                          chars[posn++] = (char)((leftBits >> 10) +
                                                 (uint)0xD800);
                          chars[posn++] =
                              (char)((leftBits & (uint)0x3FF) + (uint)0xDC00);
                      } else if (throwOnInvalid) {
                          throw new ArgumentException (_("Arg_InvalidUTF8"), "bytes");
                      }
                      leftSize = 0;
                  }
              } else {
                  // Invalid UTF-8 sequence: clear and restart,
                  // looking at this byte again as a start byte.
                  if (throwOnInvalid) {
                      throw new ArgumentException (_("Arg_InvalidUTF8"), "bytes");
                  }
                  leftSize = 0;
                  --byteIndex;
                  ++byteCount;
              }
          }
      }
      if (flush && leftSize != 0 && throwOnInvalid) {
          // We had left-over bytes that didn't make up
          // a complete UTF-8 character sequence.
          throw new ArgumentException (_("Arg_InvalidUTF8"), "bytes");
      }

      // Hand the rolling state back to the caller.
      leftOverBits = leftBits;
      leftOverCount = (leftSoFar | (leftSize << 4));

      // Return the final length to the caller.
      return posn - charIndex;
  }
  // Get the characters that result from decoding a byte buffer.
  // Each call starts with empty rolling state and flushes at the end;
  // the state produced by the call is discarded.
  public override int GetChars (byte[] bytes, int byteIndex, int byteCount,
                                char[] chars, int charIndex)
  {
      uint pendingBits = 0;
      uint pendingCount = 0;
      int written = InternalGetChars (bytes, byteIndex, byteCount,
                                      chars, charIndex,
                                      ref pendingBits, ref pendingCount,
                                      throwOnInvalid, true);
      return written;
  }
  // Get the maximum number of bytes needed to encode a
  // specified number of characters.
  //
  // The bound used is four bytes per character.  An
  // ArgumentOutOfRangeException is thrown when "charCount" is negative
  // or so large that the bound does not fit in an Int32; without the
  // second check the multiplication would wrap around silently and
  // return a negative or far too small size.
  public override int GetMaxByteCount (int charCount)
  {
      if (charCount < 0) {
          throw new ArgumentOutOfRangeException ("charCount", _("ArgRange_NonNegative"));
      }
      if (charCount > (Int32.MaxValue / 4)) {
          throw new ArgumentOutOfRangeException ("charCount",
              "Too many characters. The resulting number of bytes is larger than what can be returned as an int.");
      }
      return charCount * 4;
  }
  // Get the maximum number of characters needed to decode a
  // specified number of bytes.  The bound is "byteCount" itself;
  // negative values are rejected.
  public override int GetMaxCharCount (int byteCount)
  {
      if (byteCount >= 0) {
          return byteCount;
      }
      throw new ArgumentOutOfRangeException ("byteCount", _("ArgRange_NonNegative"));
  }
  // Get a UTF8-specific decoder that is attached to this instance.
  // The decoder is created with this instance's "throwOnInvalid" flag.
  public override Decoder GetDecoder ()
  {
      Decoder decoder = new UTF8Decoder (throwOnInvalid);
      return decoder;
  }
  // Get a UTF8-specific encoder that is attached to this instance.
  // The encoder is created with this instance's "emitIdentifier" flag.
  public override Encoder GetEncoder ()
  {
      Encoder encoder = new UTF8Encoder (emitIdentifier);
      return encoder;
  }
  // Get the UTF8 preamble: the three bytes EF BB BF when the
  // identifier flag is set, otherwise an empty array.  A new array is
  // returned on every call.
  public override byte[] GetPreamble ()
  {
      if (!emitIdentifier) {
          return new byte [0];
      }
      return new byte [] { (byte)0xEF, (byte)0xBB, (byte)0xBF };
  }
  // Determine if this object is equal to another.  Two instances are
  // equal when "value" is a UTF8Encoding with the same code page,
  // identifier flag and throw-on-invalid flag.
  public override bool Equals (Object value)
  {
      UTF8Encoding other = (value as UTF8Encoding);
      if (other == null) {
          // Null, or not a UTF8Encoding at all.
          return false;
      }
      if (codePage != other.codePage) {
          return false;
      }
      if (emitIdentifier != other.emitIdentifier) {
          return false;
      }
      return (throwOnInvalid == other.throwOnInvalid);
  }
  // Get the hash code for this object.  The value comes straight from
  // the base class; the two flags compared by Equals are not mixed in.
  public override int GetHashCode ()
  {
      int inherited = base.GetHashCode ();
      return inherited;
  }
  // Encode a whole string into a freshly allocated byte array sized
  // with GetByteCount.
  public override byte [] GetBytes (String s)
  {
      if (s == null) {
          throw new ArgumentNullException ("s");
      }

      byte [] encoded = new byte [GetByteCount (s)];
      GetBytes (s, 0, s.Length, encoded, 0);
      return encoded;
  }
  // UTF-8 decoder implementation.  Keeps the state of a partially read
  // multi-byte sequence between calls to GetChars.
  [Serializable]
  private class UTF8Decoder : Decoder
  {
      private bool throwOnInvalid;
      private uint leftOverBits;
      private uint leftOverCount;

      // Constructor.  Starts with no partial sequence pending.
      public UTF8Decoder (bool throwOnInvalid)
      {
          this.leftOverBits = 0;
          this.leftOverCount = 0;
          this.throwOnInvalid = throwOnInvalid;
      }

      // Override inherited methods.

      // Counts the characters the given bytes would produce on top of
      // the current state.  The state is passed by value, so it is not
      // modified; the count is made without flushing.
      public override int GetCharCount (byte[] bytes, int index, int count)
      {
          int charTotal = InternalGetCharCount (bytes, index, count,
                                                this.leftOverBits,
                                                this.leftOverCount,
                                                this.throwOnInvalid,
                                                false);
          return charTotal;
      }

      // Decodes the given bytes without flushing; the state is passed
      // by reference so an unfinished sequence is carried over.
      public override int GetChars (byte[] bytes, int byteIndex,
                                    int byteCount, char[] chars, int charIndex)
      {
          int written = InternalGetChars (bytes, byteIndex, byteCount,
                                          chars, charIndex,
                                          ref this.leftOverBits,
                                          ref this.leftOverCount,
                                          this.throwOnInvalid,
                                          false);
          return written;
      }

  } // class UTF8Decoder
  // UTF-8 encoder implementation.  Keeps a pending high surrogate
  // between calls to GetBytes.
  [Serializable]
  private class UTF8Encoder : Encoder
  {
      private bool emitIdentifier;
      private uint leftOver;

      // Constructor.  Starts with no surrogate pending.
      public UTF8Encoder (bool emitIdentifier)
      {
          this.leftOver = 0;
          this.emitIdentifier = emitIdentifier;
      }

      // Override inherited methods.

      // Counts the bytes the given characters would produce on top of
      // the current state.  The state is passed by value, so it is not
      // modified.
      public override int GetByteCount (char[] chars, int index,
                                        int count, bool flush)
      {
          int byteTotal = InternalGetByteCount (chars, index, count,
                                                this.leftOver, flush);
          return byteTotal;
      }

      // Encodes the given characters; the pending surrogate is passed
      // by reference so it is carried over to the next call.  Note
      // that "byteCount" is forwarded as the start offset into "bytes"
      // (the "byteIndex" argument of InternalGetBytes), not as a
      // length.
      public override int GetBytes (char[] chars, int charIndex,
                                    int charCount, byte[] bytes, int byteCount, bool flush)
      {
          int written = InternalGetBytes (chars, charIndex, charCount,
                                          bytes, byteCount,
                                          ref this.leftOver, flush);
          // The flag is cleared after every successful call.
          this.emitIdentifier = false;
          return written;
      }

  } // class UTF8Encoder
  742. }; // class UTF8Encoding
  743. }; // namespace System.Text