UTF8Encoding.cs 23 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790
  1. /*
  2. * UTF8Encoding.cs - Implementation of the "System.Text.UTF8Encoding" class.
  3. *
  4. * Copyright (c) 2001, 2002 Southern Storm Software, Pty Ltd
  5. * Copyright (C) 2004 Novell, Inc (http://www.novell.com)
  6. *
  7. * Permission is hereby granted, free of charge, to any person obtaining
  8. * a copy of this software and associated documentation files (the "Software"),
  9. * to deal in the Software without restriction, including without limitation
  10. * the rights to use, copy, modify, merge, publish, distribute, sublicense,
  11. * and/or sell copies of the Software, and to permit persons to whom the
  12. * Software is furnished to do so, subject to the following conditions:
  13. *
  14. * The above copyright notice and this permission notice shall be included
  15. * in all copies or substantial portions of the Software.
  16. *
  17. * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
  18. * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  19. * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
  20. * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
  21. * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
  22. * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
  23. * OTHER DEALINGS IN THE SOFTWARE.
  24. */
  25. namespace System.Text
  26. {
  27. using System;
  28. [Serializable]
  29. [MonoTODO ("Fix serialization compatibility with MS.NET")]
  30. public class UTF8Encoding : Encoding
  31. {
  // Code page number that Windows uses to identify UTF-8.
  internal const int UTF8_CODE_PAGE = 65001;

  // Per-instance configuration.
  //   emitIdentifier: when set, GetPreamble () returns the three-byte
  //                   UTF-8 identifier; the flag is also handed to encoders.
  //   throwOnInvalid: handed to the decoding routines, which raise
  //                   ArgumentException on malformed input when it is set.
  private bool emitIdentifier;
  private bool throwOnInvalid;

  // Default construction: no identifier, no exceptions on invalid bytes.
  public UTF8Encoding ()
      : this (false, false)
  {
  }

  // Chooses whether the identifier is emitted; invalid bytes do not throw.
  public UTF8Encoding (bool encoderShouldEmitUTF8Identifier)
      : this (encoderShouldEmitUTF8Identifier, false)
  {
  }

  // Full constructor; the other overloads forward to this one.
  public UTF8Encoding (bool encoderShouldEmitUTF8Identifier, bool throwOnInvalidBytes)
      : base (UTF8_CODE_PAGE)
  {
      this.emitIdentifier = encoderShouldEmitUTF8Identifier;
      this.throwOnInvalid = throwOnInvalidBytes;

      // The header, body and web names all share the same string.
      header_name = "utf-8";
      body_name = "utf-8";
      web_name = "utf-8";
      encoding_name = "Unicode (UTF-8)";

      is_browser_save = true;
      is_browser_display = true;
      is_mail_news_display = true;

      // NOTE(review): this uses the UnicodeEncoding code page constant
      // rather than UTF8_CODE_PAGE; presumably intentional -- confirm.
      windows_code_page = UnicodeEncoding.UNICODE_CODE_PAGE;
  }
  // Counts the UTF-8 bytes needed for chars[index .. index + count).
  //
  // "leftOver" is a lead surrogate carried over from an earlier call
  // (zero when there is none).  When "flush" is true, a lead surrogate
  // still pending at the end of the input is counted as a three-byte
  // sequence; otherwise it contributes nothing to the result.
  //
  // Throws ArgumentNullException if "chars" is null and
  // ArgumentOutOfRangeException if "index"/"count" do not describe a
  // range inside the array.
  private static int InternalGetByteCount (char[] chars, int index, int count, uint leftOver, bool flush)
  {
      if (chars == null) {
          throw new ArgumentNullException ("chars");
      }
      if (index < 0 || index > chars.Length) {
          throw new ArgumentOutOfRangeException ("index", _("ArgRange_Array"));
      }
      if (count < 0 || count > (chars.Length - index)) {
          throw new ArgumentOutOfRangeException ("count", _("ArgRange_Array"));
      }

      int limit = index + count;
      int pos = index;
      int total = 0;
      uint pending = leftOver;

      while (pos < limit) {
          char c = chars[pos];

          if (pending != 0) {
              // A lead surrogate is waiting for its partner.
              bool isTrail = (c >= '\uDC00' && c <= '\uDFFF');
              pending = 0;
              if (isTrail) {
                  // Complete pair: one four-byte sequence.
                  total += 4;
                  ++pos;
              } else {
                  // Unpaired lead surrogate: count it as three bytes
                  // and look at the current character again on the
                  // next pass, this time with nothing pending.
                  total += 3;
              }
              continue;
          }

          if (c < '\u0080') {
              total += 1;
          } else if (c < '\u0800') {
              total += 2;
          } else if (c >= '\uD800' && c <= '\uDBFF') {
              // Possible start of a surrogate pair; decide next pass.
              pending = (uint)c;
          } else {
              total += 3;
          }
          ++pos;
      }

      // A dangling lead surrogate is only counted when flushing.
      if (flush && pending != 0) {
          total += 3;
      }
      return total;
  }
  // Number of bytes produced by encoding chars[index .. index + count).
  // Runs the shared counting routine with no carried-over surrogate and
  // with flushing enabled.
  public override int GetByteCount (char[] chars, int index, int count)
  {
      const uint noLeftOver = 0;
      return InternalGetByteCount (chars, index, count, noLeftOver, true);
  }
  // Number of bytes produced by encoding the whole string "s".
  // Throws ArgumentNullException if "s" is null.
  public override int GetByteCount (String s)
  {
      if (s == null) {
          throw new ArgumentNullException ("s");
      }

      int n = s.Length;
      int total = 0;
      for (int i = 0; i < n; ++i) {
          char c = s[i];
          if (c < '\u0080') {
              total += 1;
              continue;
          }
          if (c < '\u0800') {
              total += 2;
              continue;
          }

          // A lead surrogate that is not the last character may form a
          // pair with the character that follows it.
          if (c >= '\uD800' && c <= '\uDBFF' && (i + 1) < n) {
              uint next = (uint)(s[i + 1]);
              if (next >= (uint)0xDC00 && next <= (uint)0xDFFF) {
                  // Pair: four bytes, and the partner is consumed too.
                  total += 4;
                  ++i;
                  continue;
              }
          }

          // Everything else, unpaired surrogates included, is three bytes.
          total += 3;
      }
      return total;
  }
  // Encodes chars[charIndex .. charIndex + charCount) as UTF-8 into
  // "bytes", starting at "byteIndex", and returns the number of bytes
  // written.
  //
  // "leftOver" carries a lead surrogate between calls: on entry it is
  // the surrogate left pending by the previous call (or zero), and on
  // normal return it is the surrogate left pending by this one.  It is
  // not updated when an exception is thrown.  When "flush" is true a
  // surrogate still pending at the end is written out as a three-byte
  // sequence and the state is cleared.
  //
  // Throws ArgumentNullException for null arrays,
  // ArgumentOutOfRangeException for bad indices/counts, and
  // ArgumentException when "bytes" is too small.
  private static int InternalGetBytes (char[] chars, int charIndex,
                                       int charCount, byte[] bytes,
                                       int byteIndex, ref uint leftOver,
                                       bool flush)
  {
      if (chars == null) {
          throw new ArgumentNullException ("chars");
      }
      if (bytes == null) {
          throw new ArgumentNullException ("bytes");
      }
      if (charIndex < 0 || charIndex > chars.Length) {
          throw new ArgumentOutOfRangeException ("charIndex", _("ArgRange_Array"));
      }
      if (charCount < 0 || charCount > (chars.Length - charIndex)) {
          throw new ArgumentOutOfRangeException ("charCount", _("ArgRange_Array"));
      }
      if (byteIndex < 0 || byteIndex > bytes.Length) {
          throw new ArgumentOutOfRangeException ("byteIndex", _("ArgRange_Array"));
      }

      int capacity = bytes.Length;
      int outPos = byteIndex;
      int inPos = charIndex;
      int inEnd = charIndex + charCount;
      uint lead = leftOver;

      while (inPos < inEnd) {
          char c = chars[inPos];
          uint scalar;

          if (lead == 0) {
              ++inPos;
              if (c >= '\uD800' && c <= '\uDBFF') {
                  // Start of a surrogate pair: wait for the partner.
                  lead = (uint)c;
                  continue;
              }
              scalar = (uint)c;
          } else {
              if (c >= '\uDC00' && c <= '\uDFFF') {
                  // Complete pair: combine into one code point.
                  scalar = ((lead - (uint)0xD800) << 10) +
                           (((uint)c) - (uint)0xDC00) +
                           (uint)0x10000;
                  ++inPos;
              } else {
                  // Lead surrogate followed by something else.  Emit
                  // the lead on its own and leave the current
                  // character to be read again on the next pass.
                  scalar = lead;
              }
              lead = 0;
          }

          // Work out the encoded width, then check space once.
          int needed;
          if (scalar < (uint)0x0080) {
              needed = 1;
          } else if (scalar < (uint)0x0800) {
              needed = 2;
          } else if (scalar < (uint)0x10000) {
              needed = 3;
          } else {
              needed = 4;
          }
          if ((outPos + needed) > capacity) {
              throw new ArgumentException (_("Arg_InsufficientSpace"), "bytes");
          }

          switch (needed) {
          case 1:
              bytes[outPos++] = (byte)scalar;
              break;
          case 2:
              bytes[outPos++] = (byte)(0xC0 | (scalar >> 6));
              bytes[outPos++] = (byte)(0x80 | (scalar & 0x3F));
              break;
          case 3:
              bytes[outPos++] = (byte)(0xE0 | (scalar >> 12));
              bytes[outPos++] = (byte)(0x80 | ((scalar >> 6) & 0x3F));
              bytes[outPos++] = (byte)(0x80 | (scalar & 0x3F));
              break;
          default:
              bytes[outPos++] = (byte)(0xF0 | (scalar >> 18));
              bytes[outPos++] = (byte)(0x80 | ((scalar >> 12) & 0x3F));
              bytes[outPos++] = (byte)(0x80 | ((scalar >> 6) & 0x3F));
              bytes[outPos++] = (byte)(0x80 | (scalar & 0x3F));
              break;
          }
      }

      if (flush && lead != 0) {
          // Write the dangling lead surrogate as a three-byte sequence.
          if ((outPos + 3) > capacity) {
              throw new ArgumentException (_("Arg_InsufficientSpace"), "bytes");
          }
          bytes[outPos++] = (byte)(0xE0 | (lead >> 12));
          bytes[outPos++] = (byte)(0x80 | ((lead >> 6) & 0x3F));
          bytes[outPos++] = (byte)(0x80 | (lead & 0x3F));
          lead = 0;
      }

      // Publish the state only after everything succeeded.
      leftOver = lead;
      return outPos - byteIndex;
  }
  // Encodes chars[charIndex .. charIndex + charCount) into "bytes" at
  // "byteIndex" and returns the number of bytes written.  Each call
  // starts with no pending surrogate and flushes at the end, so no
  // state survives the call.
  public override int GetBytes (char[] chars, int charIndex, int charCount,
                                byte[] bytes, int byteIndex)
  {
      uint pendingSurrogate = 0;
      int written = InternalGetBytes (chars, charIndex, charCount, bytes, byteIndex, ref pendingSurrogate, true);
      return written;
  }
  // Encodes the substring s[charIndex .. charIndex + charCount) as UTF-8
  // into "bytes", starting at "byteIndex", and returns the number of
  // bytes written.
  //
  // Throws ArgumentNullException for a null string or array,
  // ArgumentOutOfRangeException for bad indices/counts, and
  // ArgumentException when "bytes" is too small.
  public override int GetBytes (String s, int charIndex, int charCount,
                                byte[] bytes, int byteIndex)
  {
      if (s == null) {
          throw new ArgumentNullException ("s");
      }
      if (bytes == null) {
          throw new ArgumentNullException ("bytes");
      }
      if (charIndex < 0 || charIndex > s.Length) {
          throw new ArgumentOutOfRangeException ("charIndex", _("ArgRange_StringIndex"));
      }
      if (charCount < 0 || charCount > (s.Length - charIndex)) {
          throw new ArgumentOutOfRangeException ("charCount", _("ArgRange_StringRange"));
      }
      if (byteIndex < 0 || byteIndex > bytes.Length) {
          throw new ArgumentOutOfRangeException ("byteIndex", _("ArgRange_Array"));
      }

      int capacity = bytes.Length;
      int outPos = byteIndex;
      int inPos = charIndex;
      int inEnd = charIndex + charCount;

      while (inPos < inEnd) {
          char c = s[inPos++];
          uint scalar = (uint)c;

          // A lead surrogate with at least one character after it may
          // combine with that character into a single code point.
          if (c >= '\uD800' && c <= '\uDBFF' && inPos < inEnd) {
              uint trail = (uint)(s[inPos]);
              if (trail >= (uint)0xDC00 && trail <= (uint)0xDFFF) {
                  scalar = (trail - (uint)0xDC00) +
                           ((((uint)c) - (uint)0xD800) << 10) +
                           (uint)0x10000;
                  ++inPos;
              }
          }

          // Work out the encoded width, then check space once.
          int needed;
          if (scalar < (uint)0x0080) {
              needed = 1;
          } else if (scalar < (uint)0x0800) {
              needed = 2;
          } else if (scalar < (uint)0x10000) {
              needed = 3;
          } else {
              needed = 4;
          }
          if ((outPos + needed) > capacity) {
              throw new ArgumentException (_("Arg_InsufficientSpace"), "bytes");
          }

          switch (needed) {
          case 1:
              bytes[outPos++] = (byte)scalar;
              break;
          case 2:
              bytes[outPos++] = (byte)(0xC0 | (scalar >> 6));
              bytes[outPos++] = (byte)(0x80 | (scalar & 0x3F));
              break;
          case 3:
              bytes[outPos++] = (byte)(0xE0 | (scalar >> 12));
              bytes[outPos++] = (byte)(0x80 | ((scalar >> 6) & 0x3F));
              bytes[outPos++] = (byte)(0x80 | (scalar & 0x3F));
              break;
          default:
              bytes[outPos++] = (byte)(0xF0 | (scalar >> 18));
              bytes[outPos++] = (byte)(0x80 | ((scalar >> 12) & 0x3F));
              bytes[outPos++] = (byte)(0x80 | ((scalar >> 6) & 0x3F));
              bytes[outPos++] = (byte)(0x80 | (scalar & 0x3F));
              break;
          }
      }

      return outPos - byteIndex;
  }
  // Reports whether a completed multi-byte sequence of "sequenceSize"
  // bytes used more bytes than "value" needs, i.e. is an overlong form.
  // The limits are the largest values that fit in a sequence one byte
  // shorter.  Sizes other than 2..6 are never reported as overlong.
  private static bool IsOverlongSequence (uint sequenceSize, uint value)
  {
      switch (sequenceSize) {
      case 2:
          return (value <= 0x7F);
      case 3:
          return (value <= 0x07FF);
      case 4:
          return (value <= 0xFFFF);
      case 5:
          return (value <= 0x1FFFFF);
      case 6:
          return (value <= 0x03FFFFFF);
      default:
          return false;
      }
  }

  // Counts the UTF-16 characters that decoding
  // bytes[index .. index + count) produces.
  //
  // "leftOverBits" and "leftOverCount" describe a multi-byte sequence
  // left incomplete by an earlier call: the bits gathered so far, and a
  // packed counter whose low four bits hold the number of bytes seen
  // and whose next four bits hold the total sequence size (zero when
  // nothing is pending).  Both are passed by value, so this method never
  // changes the caller's state.
  //
  // When "throwOnInvalid" is true, malformed input raises
  // ArgumentException; otherwise it is skipped and contributes no
  // characters.  When "flush" is also true, an incomplete sequence at
  // the end of the input counts as malformed.
  //
  // Throws ArgumentNullException if "bytes" is null and
  // ArgumentOutOfRangeException for a bad index/count.
  private static int InternalGetCharCount (byte[] bytes, int index, int count,
                                           uint leftOverBits,
                                           uint leftOverCount,
                                           bool throwOnInvalid, bool flush)
  {
      // Validate the parameters.
      if (bytes == null) {
          throw new ArgumentNullException ("bytes");
      }
      if (index < 0 || index > bytes.Length) {
          throw new ArgumentOutOfRangeException ("index", _("ArgRange_Array"));
      }
      if (count < 0 || count > (bytes.Length - index)) {
          throw new ArgumentOutOfRangeException ("count", _("ArgRange_Array"));
      }

      // Determine the number of characters that we have.
      uint ch;
      int length = 0;
      uint leftBits = leftOverBits;
      uint leftSoFar = (leftOverCount & (uint)0x0F);
      uint leftSize = ((leftOverCount >> 4) & (uint)0x0F);
      while (count > 0) {
          ch = (uint)(bytes[index++]);
          --count;
          if (leftSize == 0) {
              // Process a UTF-8 start byte.
              if (ch < (uint)0x0080) {
                  // Single-byte sequence.
                  ++length;
              } else if ((ch & (uint)0xE0) == (uint)0xC0) {
                  // Two-byte sequence.
                  leftBits = (ch & (uint)0x1F);
                  leftSoFar = 1;
                  leftSize = 2;
              } else if ((ch & (uint)0xF0) == (uint)0xE0) {
                  // Three-byte sequence.
                  leftBits = (ch & (uint)0x0F);
                  leftSoFar = 1;
                  leftSize = 3;
              } else if ((ch & (uint)0xF8) == (uint)0xF0) {
                  // Four-byte sequence.
                  leftBits = (ch & (uint)0x07);
                  leftSoFar = 1;
                  leftSize = 4;
              } else if ((ch & (uint)0xFC) == (uint)0xF8) {
                  // Five-byte sequence.
                  leftBits = (ch & (uint)0x03);
                  leftSoFar = 1;
                  leftSize = 5;
              } else if ((ch & (uint)0xFE) == (uint)0xFC) {
                  // Six-byte sequence.  Only 0xFC and 0xFD reach this
                  // branch, so the 0x03 mask keeps just the low bit.
                  leftBits = (ch & (uint)0x03);
                  leftSoFar = 1;
                  leftSize = 6;
              } else {
                  // Invalid UTF-8 start byte.
                  if (throwOnInvalid) {
                      throw new ArgumentException (_("Arg_InvalidUTF8"), "bytes");
                  }
              }
          } else {
              // Process an extra byte in a multi-byte sequence.
              if ((ch & (uint)0xC0) == (uint)0x80) {
                  leftBits = ((leftBits << 6) | (ch & (uint)0x3F));
                  if (++leftSoFar >= leftSize) {
                      // We have a complete sequence now.
                      if (leftBits >= (uint)0x110000) {
                          // Beyond the last Unicode code point.
                          if (throwOnInvalid) {
                              throw new ArgumentException (_("Arg_InvalidUTF8"), "bytes");
                          }
                      } else if (IsOverlongSequence (leftSize, leftBits)) {
                          // Overlong forms are rejected for every
                          // in-range value, so five- and six-byte
                          // spellings of supplementary code points are
                          // no longer accepted as surrogate pairs.
                          if (throwOnInvalid) {
                              throw new ArgumentException (_("Overlong"), leftBits.ToString ());
                          }
                      } else if (leftBits < (uint)0x10000) {
                          ++length;
                      } else {
                          // Supplementary code point: surrogate pair.
                          length += 2;
                      }
                      leftSize = 0;
                  }
              } else {
                  // Invalid UTF-8 sequence: clear the state and read
                  // the current byte again as a start byte.
                  if (throwOnInvalid) {
                      throw new ArgumentException (_("Arg_InvalidUTF8"), "bytes");
                  }
                  leftSize = 0;
                  --index;
                  ++count;
              }
          }
      }
      if (flush && leftSize != 0 && throwOnInvalid) {
          // We had left-over bytes that didn't make up
          // a complete UTF-8 character sequence.
          throw new ArgumentException (_("Arg_InvalidUTF8"), "bytes");
      }

      // Return the final length to the caller.
      return length;
  }

  // Number of characters produced by decoding
  // bytes[index .. index + count), starting with no pending sequence
  // and with flushing enabled.
  public override int GetCharCount (byte[] bytes, int index, int count)
  {
      return InternalGetCharCount (bytes, index, count, 0, 0, throwOnInvalid, true);
  }

  // Decodes bytes[byteIndex .. byteIndex + byteCount) into "chars",
  // starting at "charIndex", and returns the number of characters
  // written.
  //
  // "leftOverBits" and "leftOverCount" carry an incomplete multi-byte
  // sequence between calls, in the same packed form that
  // InternalGetCharCount reads; they are updated on normal return only.
  // "throwOnInvalid" and "flush" have the same meaning as in
  // InternalGetCharCount.
  //
  // Throws ArgumentNullException for null arrays,
  // ArgumentOutOfRangeException for bad indices/counts, and
  // ArgumentException for malformed input (when "throwOnInvalid" is set)
  // or when "chars" is too small.
  private static int InternalGetChars (byte[] bytes, int byteIndex,
                                       int byteCount, char[] chars,
                                       int charIndex, ref uint leftOverBits,
                                       ref uint leftOverCount,
                                       bool throwOnInvalid, bool flush)
  {
      // Validate the parameters.
      if (bytes == null) {
          throw new ArgumentNullException ("bytes");
      }
      if (chars == null) {
          throw new ArgumentNullException ("chars");
      }
      if (byteIndex < 0 || byteIndex > bytes.Length) {
          throw new ArgumentOutOfRangeException ("byteIndex", _("ArgRange_Array"));
      }
      if (byteCount < 0 || byteCount > (bytes.Length - byteIndex)) {
          throw new ArgumentOutOfRangeException ("byteCount", _("ArgRange_Array"));
      }
      if (charIndex < 0 || charIndex > chars.Length) {
          throw new ArgumentOutOfRangeException ("charIndex", _("ArgRange_Array"));
      }

      // NOTE(review): with no room at all in "chars" the input is
      // ignored, nothing is reported and the carried state is left
      // untouched, even when byteCount is non-zero.  Kept as-is for
      // compatibility; confirm this is what callers expect.
      if (charIndex == chars.Length) {
          return 0;
      }

      // Convert the bytes into the output buffer.
      uint ch;
      int length = chars.Length;
      int posn = charIndex;
      uint leftBits = leftOverBits;
      uint leftSoFar = (leftOverCount & (uint)0x0F);
      uint leftSize = ((leftOverCount >> 4) & (uint)0x0F);
      while (byteCount > 0) {
          // Fetch the next byte from the input buffer.
          ch = (uint)(bytes[byteIndex++]);
          --byteCount;
          if (leftSize == 0) {
              // Process a UTF-8 start byte.
              if (ch < (uint)0x0080) {
                  // Single-byte sequence.
                  if (posn >= length) {
                      throw new ArgumentException (_("Arg_InsufficientSpace"), "chars");
                  }
                  chars[posn++] = (char)ch;
              } else if ((ch & (uint)0xE0) == (uint)0xC0) {
                  // Two-byte sequence.
                  leftBits = (ch & (uint)0x1F);
                  leftSoFar = 1;
                  leftSize = 2;
              } else if ((ch & (uint)0xF0) == (uint)0xE0) {
                  // Three-byte sequence.
                  leftBits = (ch & (uint)0x0F);
                  leftSoFar = 1;
                  leftSize = 3;
              } else if ((ch & (uint)0xF8) == (uint)0xF0) {
                  // Four-byte sequence.
                  leftBits = (ch & (uint)0x07);
                  leftSoFar = 1;
                  leftSize = 4;
              } else if ((ch & (uint)0xFC) == (uint)0xF8) {
                  // Five-byte sequence.
                  leftBits = (ch & (uint)0x03);
                  leftSoFar = 1;
                  leftSize = 5;
              } else if ((ch & (uint)0xFE) == (uint)0xFC) {
                  // Six-byte sequence.  Only 0xFC and 0xFD reach this
                  // branch, so the 0x03 mask keeps just the low bit.
                  leftBits = (ch & (uint)0x03);
                  leftSoFar = 1;
                  leftSize = 6;
              } else {
                  // Invalid UTF-8 start byte.
                  if (throwOnInvalid) {
                      throw new ArgumentException (_("Arg_InvalidUTF8"), "bytes");
                  }
              }
          } else {
              // Process an extra byte in a multi-byte sequence.
              if ((ch & (uint)0xC0) == (uint)0x80) {
                  leftBits = ((leftBits << 6) | (ch & (uint)0x3F));
                  if (++leftSoFar >= leftSize) {
                      // We have a complete sequence now.
                      if (leftBits >= (uint)0x110000) {
                          // Beyond the last Unicode code point.
                          if (throwOnInvalid) {
                              throw new ArgumentException (_("Arg_InvalidUTF8"), "bytes");
                          }
                      } else if (IsOverlongSequence (leftSize, leftBits)) {
                          // Overlong forms are rejected for every
                          // in-range value, so five- and six-byte
                          // spellings of supplementary code points are
                          // no longer written out as surrogate pairs.
                          if (throwOnInvalid) {
                              throw new ArgumentException (_("Overlong"), leftBits.ToString ());
                          }
                      } else if (leftBits < (uint)0x10000) {
                          if (posn >= length) {
                              throw new ArgumentException (_("Arg_InsufficientSpace"), "chars");
                          }
                          chars[posn++] = (char)leftBits;
                      } else {
                          // Supplementary code point: write it as a
                          // lead/trail surrogate pair.
                          if ((posn + 2) > length) {
                              throw new ArgumentException (_("Arg_InsufficientSpace"), "chars");
                          }
                          leftBits -= (uint)0x10000;
                          chars[posn++] = (char)((leftBits >> 10) + (uint)0xD800);
                          chars[posn++] = (char)((leftBits & (uint)0x3FF) + (uint)0xDC00);
                      }
                      leftSize = 0;
                  }
              } else {
                  // Invalid UTF-8 sequence: clear the state and read
                  // the current byte again as a start byte.
                  if (throwOnInvalid) {
                      throw new ArgumentException (_("Arg_InvalidUTF8"), "bytes");
                  }
                  leftSize = 0;
                  --byteIndex;
                  ++byteCount;
              }
          }
      }
      if (flush && leftSize != 0 && throwOnInvalid) {
          // We had left-over bytes that didn't make up
          // a complete UTF-8 character sequence.
          throw new ArgumentException (_("Arg_InvalidUTF8"), "bytes");
      }

      // Hand the (possibly incomplete) sequence state back to the caller.
      leftOverBits = leftBits;
      leftOverCount = (leftSoFar | (leftSize << 4));

      // Return the final length to the caller.
      return posn - charIndex;
  }
  // Decodes bytes[byteIndex .. byteIndex + byteCount) into "chars" at
  // "charIndex" and returns the number of characters written.  Each
  // call starts with an empty sequence state and flushes at the end;
  // the state produced by the call is discarded.
  public override int GetChars (byte[] bytes, int byteIndex, int byteCount,
                                char[] chars, int charIndex)
  {
      uint pendingBits = 0;
      uint pendingCount = 0;
      int written = InternalGetChars (bytes, byteIndex, byteCount, chars,
                                      charIndex, ref pendingBits, ref pendingCount,
                                      throwOnInvalid, true);
      return written;
  }
  // Upper bound on the number of bytes needed to encode "charCount"
  // characters: four bytes per character.
  //
  // Throws ArgumentOutOfRangeException if "charCount" is negative, or
  // if the result would not fit in an Int32.
  public override int GetMaxByteCount (int charCount)
  {
      if (charCount < 0) {
          throw new ArgumentOutOfRangeException ("charCount", _("ArgRange_NonNegative"));
      }
      // The multiplication below would otherwise wrap around silently
      // and hand the caller a negative or far too small buffer size.
      if (charCount > (Int32.MaxValue / 4)) {
          throw new ArgumentOutOfRangeException ("charCount");
      }
      return charCount * 4;
  }
  // Upper bound on the number of characters produced by decoding
  // "byteCount" bytes: one character per byte.
  // Throws ArgumentOutOfRangeException if "byteCount" is negative.
  public override int GetMaxCharCount (int byteCount)
  {
      if (byteCount >= 0) {
          return byteCount;
      }
      throw new ArgumentOutOfRangeException ("byteCount", _("ArgRange_NonNegative"));
  }
  // Creates a new stateful UTF-8 decoder that uses this instance's
  // throwOnInvalid setting.
  public override Decoder GetDecoder ()
  {
      Decoder decoder = new UTF8Decoder (throwOnInvalid);
      return decoder;
  }
  // Creates a new stateful UTF-8 encoder that is given this instance's
  // emitIdentifier setting.
  public override Encoder GetEncoder ()
  {
      Encoder encoder = new UTF8Encoder (emitIdentifier);
      return encoder;
  }
  // Returns the bytes EF BB BF when this instance was configured to
  // emit the UTF-8 identifier, and an empty array otherwise.  A fresh
  // array is returned on every call.
  public override byte[] GetPreamble ()
  {
      if (!emitIdentifier) {
          return new byte [0];
      }
      return new byte [] { (byte)0xEF, (byte)0xBB, (byte)0xBF };
  }
  // Two UTF8Encoding objects are equal when their code page,
  // emitIdentifier flag and throwOnInvalid flag all match.  Anything
  // that is not a UTF8Encoding (null included) is never equal.
  public override bool Equals (Object value)
  {
      UTF8Encoding other = value as UTF8Encoding;
      if (other == null) {
          return false;
      }
      if (codePage != other.codePage) {
          return false;
      }
      if (emitIdentifier != other.emitIdentifier) {
          return false;
      }
      return throwOnInvalid == other.throwOnInvalid;
  }
  // Uses the base class hash code unchanged; the two flags compared by
  // Equals do not take part in it.
  public override int GetHashCode ()
  {
      int hash = base.GetHashCode ();
      return hash;
  }
  // Encodes the whole string into a newly allocated array sized by
  // GetByteCount.  Throws ArgumentNullException if "s" is null.
  public override byte [] GetBytes (String s)
  {
      if (s == null) {
          throw new ArgumentNullException ("s");
      }

      byte [] encoded = new byte [GetByteCount (s)];
      GetBytes (s, 0, s.Length, encoded, 0);
      return encoded;
  }
  // Stateful UTF-8 decoder.  It remembers a multi-byte sequence that was
  // cut off at the end of one GetChars call so that the next call can
  // finish it.
  [Serializable]
  private class UTF8Decoder : Decoder
  {
      // Passed through to the shared decoding routines on every call.
      private bool throwOnInvalid;

      // Incomplete-sequence state: the bits gathered so far and the
      // packed progress counter, both in the form that the shared
      // decoding routines read and write.
      private uint leftOverBits;
      private uint leftOverCount;

      // Starts with no pending sequence.
      public UTF8Decoder (bool throwOnInvalid)
      {
          this.throwOnInvalid = throwOnInvalid;
          this.leftOverBits = 0;
          this.leftOverCount = 0;
      }

      // Counts the characters that the given bytes would produce given
      // the current pending state.  The state is passed by value and
      // is not changed; an incomplete tail is not flushed.
      public override int GetCharCount (byte[] bytes, int index, int count)
      {
          int charCount = InternalGetCharCount (bytes, index, count,
                                                leftOverBits, leftOverCount,
                                                throwOnInvalid, false);
          return charCount;
      }

      // Decodes the given bytes, continuing from and then updating the
      // pending state; an incomplete tail is kept for the next call.
      public override int GetChars (byte[] bytes, int byteIndex,
                                    int byteCount, char[] chars, int charIndex)
      {
          int written = InternalGetChars (bytes, byteIndex, byteCount,
                                          chars, charIndex,
                                          ref leftOverBits, ref leftOverCount,
                                          throwOnInvalid, false);
          return written;
      }
  } // class UTF8Decoder
  // Stateful UTF-8 encoder.  It remembers a lead surrogate that ended one
  // GetBytes call so that the next call can pair it up.
  [Serializable]
  private class UTF8Encoder : Encoder
  {
      // Set from the constructor argument and cleared after the first
      // successful GetBytes call; nothing in this class reads it.
      private bool emitIdentifier;

      // Lead surrogate pending from the previous GetBytes call, or zero.
      private uint leftOver;

      // Starts with no pending surrogate.
      public UTF8Encoder (bool emitIdentifier)
      {
          this.emitIdentifier = emitIdentifier;
          this.leftOver = 0;
      }

      // Counts the bytes that the given characters would produce given
      // the current pending surrogate.  The state is not changed.
      public override int GetByteCount (char[] chars, int index,
                                        int count, bool flush)
      {
          int byteTotal = InternalGetByteCount (chars, index, count, leftOver, flush);
          return byteTotal;
      }

      // Encodes the given characters, continuing from and then updating
      // the pending surrogate.  Note that the "byteCount" parameter is
      // passed on as the starting index into "bytes".
      public override int GetBytes (char[] chars, int charIndex,
                                    int charCount, byte[] bytes, int byteCount, bool flush)
      {
          int written = InternalGetBytes (chars, charIndex, charCount,
                                          bytes, byteCount, ref leftOver, flush);
          // Only reached when encoding succeeded.
          emitIdentifier = false;
          return written;
      }
  } // class UTF8Encoder
  738. }; // class UTF8Encoding
  739. }; // namespace System.Text