UTF8Encoding.cs 23 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797
  1. /*
  2. * UTF8Encoding.cs - Implementation of the "System.Text.UTF8Encoding" class.
  3. *
  4. * Copyright (c) 2001, 2002 Southern Storm Software, Pty Ltd
  5. *
  6. * Permission is hereby granted, free of charge, to any person obtaining
  7. * a copy of this software and associated documentation files (the "Software"),
  8. * to deal in the Software without restriction, including without limitation
  9. * the rights to use, copy, modify, merge, publish, distribute, sublicense,
  10. * and/or sell copies of the Software, and to permit persons to whom the
  11. * Software is furnished to do so, subject to the following conditions:
  12. *
  13. * The above copyright notice and this permission notice shall be included
  14. * in all copies or substantial portions of the Software.
  15. *
  16. * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
  17. * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  18. * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
  19. * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
  20. * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
  21. * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
  22. * OTHER DEALINGS IN THE SOFTWARE.
  23. */
  24. namespace System.Text
  25. {
  26. using System;
  27. [Serializable]
  28. public class UTF8Encoding : Encoding
  29. {
  // Windows code page number assigned to UTF-8.
  internal const int UTF8_CODE_PAGE = 65001;

  // Options captured at construction time.
  // emitIdentifier: GetPreamble reports the 3-byte UTF-8 identifier when set.
  // throwOnInvalid: handed to the decoding routines, which raise
  // ArgumentException on malformed input when it is set.
  private bool emitIdentifier;
  private bool throwOnInvalid;

  // Create an encoding with both options switched off.
  public UTF8Encoding ()
      : this (false, false)
  {
  }

  // Create an encoding that optionally reports the UTF-8 identifier;
  // invalid bytes do not cause exceptions.
  public UTF8Encoding (bool encoderShouldEmitUTF8Identifier)
      : this (encoderShouldEmitUTF8Identifier, false)
  {
  }

  // Create an encoding with both options given explicitly.  The base
  // class is initialized with UTF8_CODE_PAGE.
  public UTF8Encoding (bool encoderShouldEmitUTF8Identifier, bool throwOnInvalidBytes)
      : base (UTF8_CODE_PAGE)
  {
      this.throwOnInvalid = throwOnInvalidBytes;
      this.emitIdentifier = encoderShouldEmitUTF8Identifier;
  }
  // Count the UTF-8 bytes needed to encode "count" characters of "chars"
  // starting at "index", carrying surrogate state between calls.
  //
  // leftOver - a pending surrogate start from an earlier call, or 0.
  // flush    - when true, a surrogate start still pending after the last
  //            character is counted as a stand-alone 3-byte sequence.
  //
  // Throws ArgumentNullException if "chars" is null and
  // ArgumentOutOfRangeException if "index"/"count" fall outside the array.
  private static int InternalGetByteCount (char[] chars, int index, int count, uint leftOver, bool flush)
  {
      // Reject bad arguments before touching the array.
      if (chars == null) {
          throw new ArgumentNullException ("chars");
      }
      if (index < 0 || index > chars.Length) {
          throw new ArgumentOutOfRangeException ("index", _("ArgRange_Array"));
      }
      if (count < 0 || count > (chars.Length - index)) {
          throw new ArgumentOutOfRangeException ("count", _("ArgRange_Array"));
      }

      int total = 0;
      uint pendingHigh = leftOver;
      int pos = index;
      int end = index + count;
      while (pos < end) {
          char current = chars[pos];
          if (pendingHigh != 0) {
              // A surrogate start is waiting for its partner.
              pendingHigh = 0;
              if (current >= '\uDC00' && current <= '\uDFFF') {
                  // Complete pair: one 4-byte sequence.
                  total += 4;
                  ++pos;
              } else {
                  // Unpaired start: count it as 3 bytes on its own and
                  // look at the current character again next iteration.
                  total += 3;
              }
              continue;
          }
          if (current >= '\uD800' && current <= '\uDBFF') {
              // Surrogate start: nothing is counted until we see what follows.
              pendingHigh = (uint)current;
          } else if (current < '\u0080') {
              total += 1;
          } else if (current < '\u0800') {
              total += 2;
          } else {
              total += 3;
          }
          ++pos;
      }

      // Account for a surrogate start left dangling at the end.
      if (flush && pendingHigh != 0) {
          total += 3;
      }
      return total;
  }
  // Number of UTF-8 bytes needed to encode "count" characters of "chars"
  // starting at "index".  Argument validation is done by
  // InternalGetByteCount.
  public override int GetByteCount (char[] chars, int index, int count)
  {
      // A one-shot count starts with no pending surrogate and flushes, so
      // a trailing unpaired surrogate start is included in the total.
      const uint noPendingSurrogate = 0;
      return InternalGetByteCount (chars, index, count, noPendingSurrogate, true);
  }
  // Number of UTF-8 bytes needed to encode the whole of string "s".
  // Throws ArgumentNullException if "s" is null.
  public override int GetByteCount (String s)
  {
      if (s == null) {
          throw new ArgumentNullException ("s");
      }

      int total = 0;
      int end = s.Length;
      for (int i = 0; i < end; ++i) {
          char current = s[i];
          if (current < '\u0080') {
              total += 1;
              continue;
          }
          if (current < '\u0800') {
              total += 2;
              continue;
          }

          // A surrogate start immediately followed by a surrogate end forms
          // one 4-byte sequence; everything else in this range takes 3 bytes.
          bool startsPair = false;
          if (current >= '\uD800' && current <= '\uDBFF' && (i + 1) < end) {
              uint next = (uint)(s[i + 1]);
              startsPair = (next >= (uint)0xDC00 && next <= (uint)0xDFFF);
          }
          if (startsPair) {
              total += 4;
              ++i;	// the second half of the pair is consumed too
          } else {
              total += 3;
          }
      }
      return total;
  }
  // Encode "charCount" characters of "chars" starting at "charIndex" as
  // UTF-8 into "bytes" starting at "byteIndex", carrying surrogate state
  // between calls.
  //
  // leftOver - on entry, a pending surrogate start from an earlier call
  //            (0 if none); on normal return, the surrogate start still
  //            pending.  It is not modified when an exception is thrown.
  // flush    - when true, a surrogate start still pending after the last
  //            character is written out as a 3-byte sequence.
  //
  // Returns the number of bytes written.  Throws ArgumentNullException,
  // ArgumentOutOfRangeException for bad indices, and ArgumentException
  // when "bytes" is too small.
  private static int InternalGetBytes (char[] chars, int charIndex,
                                       int charCount, byte[] bytes,
                                       int byteIndex, ref uint leftOver,
                                       bool flush)
  {
      // Reject bad arguments before touching either array.
      if (chars == null) {
          throw new ArgumentNullException ("chars");
      }
      if (bytes == null) {
          throw new ArgumentNullException ("bytes");
      }
      if (charIndex < 0 || charIndex > chars.Length) {
          throw new ArgumentOutOfRangeException ("charIndex", _("ArgRange_Array"));
      }
      if (charCount < 0 || charCount > (chars.Length - charIndex)) {
          throw new ArgumentOutOfRangeException ("charCount", _("ArgRange_Array"));
      }
      if (byteIndex < 0 || byteIndex > bytes.Length) {
          throw new ArgumentOutOfRangeException ("byteIndex", _("ArgRange_Array"));
      }

      int capacity = bytes.Length;
      int outPos = byteIndex;
      uint pendingHigh = leftOver;
      int inPos = charIndex;
      int inEnd = charIndex + charCount;
      uint codePoint;
      int needed;
      while (inPos < inEnd) {
          char current = chars[inPos];

          // Work out the next value to encode.
          if (pendingHigh == 0) {
              ++inPos;
              if (current >= '\uD800' && current <= '\uDBFF') {
                  // Surrogate start: hold it until we see what follows.
                  pendingHigh = (uint)current;
                  continue;
              }
              codePoint = (uint)current;
          } else if (current >= '\uDC00' && current <= '\uDFFF') {
              // Complete surrogate pair: combine both halves.
              ++inPos;
              codePoint = ((pendingHigh - (uint)0xD800) << 10) +
                          (((uint)current) - (uint)0xDC00) +
                          (uint)0x10000;
              pendingHigh = 0;
          } else {
              // Surrogate start followed by something else.  Emit the
              // start on its own; "current" is not consumed and will be
              // examined again on the next iteration.
              codePoint = pendingHigh;
              pendingHigh = 0;
          }

          // Decide the sequence length, then check the space once.
          if (codePoint < (uint)0x0080) {
              needed = 1;
          } else if (codePoint < (uint)0x0800) {
              needed = 2;
          } else if (codePoint < (uint)0x10000) {
              needed = 3;
          } else {
              needed = 4;
          }
          if (needed > (capacity - outPos)) {
              throw new ArgumentException (_("Arg_InsufficientSpace"), "bytes");
          }

          switch (needed) {
          case 1:
              bytes[outPos++] = (byte)codePoint;
              break;
          case 2:
              bytes[outPos++] = (byte)(0xC0 | (codePoint >> 6));
              bytes[outPos++] = (byte)(0x80 | (codePoint & 0x3F));
              break;
          case 3:
              bytes[outPos++] = (byte)(0xE0 | (codePoint >> 12));
              bytes[outPos++] = (byte)(0x80 | ((codePoint >> 6) & 0x3F));
              bytes[outPos++] = (byte)(0x80 | (codePoint & 0x3F));
              break;
          default:
              bytes[outPos++] = (byte)(0xF0 | (codePoint >> 18));
              bytes[outPos++] = (byte)(0x80 | ((codePoint >> 12) & 0x3F));
              bytes[outPos++] = (byte)(0x80 | ((codePoint >> 6) & 0x3F));
              bytes[outPos++] = (byte)(0x80 | (codePoint & 0x3F));
              break;
          }
      }

      // Write out a surrogate start left dangling at the end, if asked to.
      if (flush && pendingHigh != 0) {
          if (3 > (capacity - outPos)) {
              throw new ArgumentException (_("Arg_InsufficientSpace"), "bytes");
          }
          bytes[outPos++] = (byte)(0xE0 | (pendingHigh >> 12));
          bytes[outPos++] = (byte)(0x80 | ((pendingHigh >> 6) & 0x3F));
          bytes[outPos++] = (byte)(0x80 | (pendingHigh & 0x3F));
          pendingHigh = 0;
      }

      // Publish the carried state only once everything has succeeded.
      leftOver = pendingHigh;
      return outPos - byteIndex;
  }
  // Encode "charCount" characters of "chars" starting at "charIndex" into
  // "bytes" starting at "byteIndex"; returns the number of bytes written.
  // Validation and encoding are done by InternalGetBytes.
  public override int GetBytes (char[] chars, int charIndex, int charCount,
                                byte[] bytes, int byteIndex)
  {
      // One-shot conversion: start with no pending surrogate, flush at the
      // end, and discard the carried state afterwards.
      uint pendingSurrogate = 0;
      int written = InternalGetBytes (chars, charIndex, charCount, bytes, byteIndex, ref pendingSurrogate, true);
      return written;
  }
  // Encode "charCount" characters of string "s" starting at "charIndex"
  // as UTF-8 into "bytes" starting at "byteIndex".
  //
  // A surrogate start immediately followed by a surrogate end is written
  // as one 4-byte sequence.  An unpaired surrogate is written as a 3-byte
  // sequence of its own.
  //
  // Returns the number of bytes written.  Throws ArgumentNullException if
  // "s" or "bytes" is null, ArgumentOutOfRangeException for indices or
  // counts outside their buffers, and ArgumentException when "bytes" is
  // too small for the output.
  public override int GetBytes (String s, int charIndex, int charCount,
                                byte[] bytes, int byteIndex)
  {
      // Validate the parameters.
      if (s == null) {
          throw new ArgumentNullException ("s");
      }
      if (bytes == null) {
          throw new ArgumentNullException ("bytes");
      }
      if (charIndex < 0 || charIndex > s.Length) {
          throw new ArgumentOutOfRangeException ("charIndex", _("ArgRange_StringIndex"));
      }
      if (charCount < 0 || charCount > (s.Length - charIndex)) {
          throw new ArgumentOutOfRangeException ("charCount", _("ArgRange_StringRange"));
      }
      if (byteIndex < 0 || byteIndex > bytes.Length) {
          throw new ArgumentOutOfRangeException ("byteIndex", _("ArgRange_Array"));
      }

      // Convert the characters into bytes.
      char ch;
      int length = bytes.Length;
      uint pair;
      int posn = byteIndex;
      while (charCount > 0) {
          // Fetch the next UTF-16 character pair value.
          ch = s[charIndex++];
          --charCount;

          // "charCount" now holds the number of characters still unread,
          // so a single remaining character is enough for the second half
          // of a pair.  Testing "> 1" here would split a pair that sits
          // at the very end of the range into two 3-byte sequences.
          if (ch >= '\uD800' && ch <= '\uDBFF' && charCount > 0) {
              // This may be the start of a surrogate pair.
              pair = (uint)(s[charIndex]);
              if (pair >= (uint)0xDC00 && pair <= (uint)0xDFFF) {
                  pair = (pair - (uint)0xDC00) +
                         ((((uint)ch) - (uint)0xD800) << 10) +
                         (uint)0x10000;
                  ++charIndex;
                  --charCount;
              } else {
                  // Not followed by a surrogate end: encode it alone.
                  pair = (uint)ch;
              }
          } else {
              pair = (uint)ch;
          }

          // Encode the character pair value.
          if (pair < (uint)0x0080) {
              if (posn >= length) {
                  throw new ArgumentException (_("Arg_InsufficientSpace"), "bytes");
              }
              bytes[posn++] = (byte)pair;
          } else if (pair < (uint)0x0800) {
              if ((posn + 2) > length) {
                  throw new ArgumentException (_("Arg_InsufficientSpace"), "bytes");
              }
              bytes[posn++] = (byte)(0xC0 | (pair >> 6));
              bytes[posn++] = (byte)(0x80 | (pair & 0x3F));
          } else if (pair < (uint)0x10000) {
              if ((posn + 3) > length) {
                  throw new ArgumentException (_("Arg_InsufficientSpace"), "bytes");
              }
              bytes[posn++] = (byte)(0xE0 | (pair >> 12));
              bytes[posn++] = (byte)(0x80 | ((pair >> 6) & 0x3F));
              bytes[posn++] = (byte)(0x80 | (pair & 0x3F));
          } else {
              if ((posn + 4) > length) {
                  throw new ArgumentException (_("Arg_InsufficientSpace"), "bytes");
              }
              bytes[posn++] = (byte)(0xF0 | (pair >> 18));
              bytes[posn++] = (byte)(0x80 | ((pair >> 12) & 0x3F));
              bytes[posn++] = (byte)(0x80 | ((pair >> 6) & 0x3F));
              bytes[posn++] = (byte)(0x80 | (pair & 0x3F));
          }
      }

      // Return the final count to the caller.
      return posn - byteIndex;
  }
  // Count the UTF-16 characters produced by decoding "count" bytes of
  // "bytes" starting at "index", continuing from a decoder state carried
  // over from an earlier call.
  //
  // leftOverBits   - payload bits collected so far for an unfinished
  //                  multi-byte sequence.
  // leftOverCount  - packed progress of that sequence: the low 4 bits hold
  //                  the number of bytes seen so far, the next 4 bits the
  //                  total sequence size (0 means no sequence in progress).
  // throwOnInvalid - throw ArgumentException on malformed input instead
  //                  of skipping it.
  // flush          - when true (and throwOnInvalid is set), an unfinished
  //                  sequence at the end of the input is an error.
  //
  // Decoded values below 0x10000 count as one character, except U+FEFF,
  // which is not counted.  Values below 0x110000 count as two characters.
  // Larger values produce no characters.
  private static int InternalGetCharCount (byte[] bytes, int index, int count,
                                           uint leftOverBits,
                                           uint leftOverCount,
                                           bool throwOnInvalid, bool flush)
  {
      // Validate the parameters.
      if (bytes == null) {
          throw new ArgumentNullException ("bytes");
      }
      if (index < 0 || index > bytes.Length) {
          throw new ArgumentOutOfRangeException ("index", _("ArgRange_Array"));
      }
      if (count < 0 || count > (bytes.Length - index)) {
          throw new ArgumentOutOfRangeException ("count", _("ArgRange_Array"));
      }

      // Determine the number of characters that we have.
      uint ch;
      int length = 0;
      uint leftBits = leftOverBits;
      uint leftSoFar = (leftOverCount & (uint)0x0F);
      uint leftSize = ((leftOverCount >> 4) & (uint)0x0F);
      while (count > 0) {
          ch = (uint)(bytes[index++]);
          --count;
          if (leftSize == 0) {
              // Process a UTF-8 start character.
              if (ch < (uint)0x0080) {
                  // Single-byte UTF-8 character.
                  ++length;
              } else if ((ch & (uint)0xE0) == (uint)0xC0) {
                  // Double-byte UTF-8 character.
                  leftBits = (ch & (uint)0x1F);
                  leftSoFar = 1;
                  leftSize = 2;
              } else if ((ch & (uint)0xF0) == (uint)0xE0) {
                  // Three-byte UTF-8 character.
                  leftBits = (ch & (uint)0x0F);
                  leftSoFar = 1;
                  leftSize = 3;
              } else if ((ch & (uint)0xF8) == (uint)0xF0) {
                  // Four-byte UTF-8 character.
                  leftBits = (ch & (uint)0x07);
                  leftSoFar = 1;
                  leftSize = 4;
              } else if ((ch & (uint)0xFC) == (uint)0xF8) {
                  // Five-byte UTF-8 character.
                  leftBits = (ch & (uint)0x03);
                  leftSoFar = 1;
                  leftSize = 5;
              } else if ((ch & (uint)0xFE) == (uint)0xFC) {
                  // Six-byte UTF-8 character.  The mask is 0xFE so that
                  // only 0xFC and 0xFD match; 0xFE and 0xFF are never
                  // valid lead bytes and fall through to the error branch.
                  leftBits = (ch & (uint)0x01);
                  leftSoFar = 1;
                  leftSize = 6;
              } else {
                  // Invalid UTF-8 start character: skipped unless the
                  // caller asked for exceptions.
                  if (throwOnInvalid) {
                      throw new ArgumentException (_("Arg_InvalidUTF8"), "bytes");
                  }
              }
          } else {
              // Process an extra byte in a multi-byte sequence.
              if ((ch & (uint)0xC0) == (uint)0x80) {
                  leftBits = ((leftBits << 6) | (ch & (uint)0x3F));
                  if (++leftSoFar >= leftSize) {
                      // We have a complete character now.
                      if (leftBits < (uint)0x10000) {
                          if (leftBits != (uint)0xFEFF) {
                              ++length;
                          }
                      } else if (leftBits < (uint)0x110000) {
                          // Needs a surrogate pair in UTF-16.
                          length += 2;
                      } else if (throwOnInvalid) {
                          throw new ArgumentException (_("Arg_InvalidUTF8"), "bytes");
                      }
                      leftSize = 0;
                  }
              } else {
                  // Invalid UTF-8 sequence: clear and restart.
                  if (throwOnInvalid) {
                      throw new ArgumentException (_("Arg_InvalidUTF8"), "bytes");
                  }
                  leftSize = 0;
                  // Step back so this byte is re-read as a start byte.
                  --index;
                  ++count;
              }
          }
      }
      if (flush && leftSize != 0 && throwOnInvalid) {
          // We had left-over bytes that didn't make up
          // a complete UTF-8 character sequence.
          throw new ArgumentException (_("Arg_InvalidUTF8"), "bytes");
      }

      // Return the final length to the caller.
      return length;
  }
  // Number of UTF-16 characters produced by decoding "count" bytes of
  // "bytes" starting at "index".  Argument validation is done by
  // InternalGetCharCount.
  public override int GetCharCount (byte[] bytes, int index, int count)
  {
      // One-shot count: no carried decoder state, and flush so that a
      // truncated trailing sequence is reported when throwOnInvalid is set.
      const uint noBits = 0;
      const uint noProgress = 0;
      return InternalGetCharCount (bytes, index, count, noBits, noProgress, throwOnInvalid, true);
  }
  // Decode "byteCount" bytes of "bytes" starting at "byteIndex" into
  // "chars" starting at "charIndex", continuing from (and updating) a
  // decoder state carried between calls.
  //
  // leftOverBits   - payload bits collected so far for an unfinished
  //                  multi-byte sequence.
  // leftOverCount  - packed progress of that sequence: the low 4 bits hold
  //                  the number of bytes seen so far, the next 4 bits the
  //                  total sequence size (0 means no sequence in progress).
  //                  Both state values are written back only on normal
  //                  return.
  // throwOnInvalid - throw ArgumentException on malformed input instead
  //                  of skipping it.
  // flush          - when true (and throwOnInvalid is set), an unfinished
  //                  sequence at the end of the input is an error.
  //
  // Decoded values below 0x10000 are stored as one character, except
  // U+FEFF, which is dropped.  Values below 0x110000 are stored as a
  // surrogate pair.  Larger values produce no output.
  //
  // Returns the number of characters written.  Throws ArgumentException
  // when "chars" is too small.
  private static int InternalGetChars (byte[] bytes, int byteIndex,
                                       int byteCount, char[] chars,
                                       int charIndex, ref uint leftOverBits,
                                       ref uint leftOverCount,
                                       bool throwOnInvalid, bool flush)
  {
      // Validate the parameters.
      if (bytes == null) {
          throw new ArgumentNullException ("bytes");
      }
      if (chars == null) {
          throw new ArgumentNullException ("chars");
      }
      if (byteIndex < 0 || byteIndex > bytes.Length) {
          throw new ArgumentOutOfRangeException ("byteIndex", _("ArgRange_Array"));
      }
      if (byteCount < 0 || byteCount > (bytes.Length - byteIndex)) {
          throw new ArgumentOutOfRangeException ("byteCount", _("ArgRange_Array"));
      }
      if (charIndex < 0 || charIndex > chars.Length) {
          throw new ArgumentOutOfRangeException ("charIndex", _("ArgRange_Array"));
      }

      // Convert the bytes into the output buffer.
      uint ch;
      int length = chars.Length;
      int posn = charIndex;
      uint leftBits = leftOverBits;
      uint leftSoFar = (leftOverCount & (uint)0x0F);
      uint leftSize = ((leftOverCount >> 4) & (uint)0x0F);
      while (byteCount > 0) {
          // Fetch the next character from the byte buffer.
          ch = (uint)(bytes[byteIndex++]);
          --byteCount;
          if (leftSize == 0) {
              // Process a UTF-8 start character.
              if (ch < (uint)0x0080) {
                  // Single-byte UTF-8 character.
                  if (posn >= length) {
                      throw new ArgumentException (_("Arg_InsufficientSpace"), "chars");
                  }
                  chars[posn++] = (char)ch;
              } else if ((ch & (uint)0xE0) == (uint)0xC0) {
                  // Double-byte UTF-8 character.
                  leftBits = (ch & (uint)0x1F);
                  leftSoFar = 1;
                  leftSize = 2;
              } else if ((ch & (uint)0xF0) == (uint)0xE0) {
                  // Three-byte UTF-8 character.
                  leftBits = (ch & (uint)0x0F);
                  leftSoFar = 1;
                  leftSize = 3;
              } else if ((ch & (uint)0xF8) == (uint)0xF0) {
                  // Four-byte UTF-8 character.
                  leftBits = (ch & (uint)0x07);
                  leftSoFar = 1;
                  leftSize = 4;
              } else if ((ch & (uint)0xFC) == (uint)0xF8) {
                  // Five-byte UTF-8 character.
                  leftBits = (ch & (uint)0x03);
                  leftSoFar = 1;
                  leftSize = 5;
              } else if ((ch & (uint)0xFE) == (uint)0xFC) {
                  // Six-byte UTF-8 character.  The mask is 0xFE so that
                  // only 0xFC and 0xFD match; 0xFE and 0xFF are never
                  // valid lead bytes and fall through to the error branch.
                  leftBits = (ch & (uint)0x01);
                  leftSoFar = 1;
                  leftSize = 6;
              } else {
                  // Invalid UTF-8 start character: skipped unless the
                  // caller asked for exceptions.
                  if (throwOnInvalid) {
                      throw new ArgumentException (_("Arg_InvalidUTF8"), "bytes");
                  }
              }
          } else {
              // Process an extra byte in a multi-byte sequence.
              if ((ch & (uint)0xC0) == (uint)0x80) {
                  leftBits = ((leftBits << 6) | (ch & (uint)0x3F));
                  if (++leftSoFar >= leftSize) {
                      // We have a complete character now.
                      if (leftBits < (uint)0x10000) {
                          if (leftBits != (uint)0xFEFF) {
                              if (posn >= length) {
                                  throw new ArgumentException
                                      (_("Arg_InsufficientSpace"), "chars");
                              }
                              chars[posn++] = (char)leftBits;
                          }
                      } else if (leftBits < (uint)0x110000) {
                          // Split into a UTF-16 surrogate pair.
                          if ((posn + 2) > length) {
                              throw new ArgumentException
                                  (_("Arg_InsufficientSpace"), "chars");
                          }
                          leftBits -= (uint)0x10000;
                          chars[posn++] = (char)((leftBits >> 10) +
                                                 (uint)0xD800);
                          chars[posn++] =
                              (char)((leftBits & (uint)0x3FF) + (uint)0xDC00);
                      } else if (throwOnInvalid) {
                          throw new ArgumentException (_("Arg_InvalidUTF8"), "bytes");
                      }
                      leftSize = 0;
                  }
              } else {
                  // Invalid UTF-8 sequence: clear and restart.
                  if (throwOnInvalid) {
                      throw new ArgumentException (_("Arg_InvalidUTF8"), "bytes");
                  }
                  leftSize = 0;
                  // Step back so this byte is re-read as a start byte.
                  --byteIndex;
                  ++byteCount;
              }
          }
      }
      if (flush && leftSize != 0 && throwOnInvalid) {
          // We had left-over bytes that didn't make up
          // a complete UTF-8 character sequence.
          throw new ArgumentException (_("Arg_InvalidUTF8"), "bytes");
      }

      // Save the decoder state for the next call.
      leftOverBits = leftBits;
      leftOverCount = (leftSoFar | (leftSize << 4));

      // Return the final length to the caller.
      return posn - charIndex;
  }
  // Decode "byteCount" bytes of "bytes" starting at "byteIndex" into
  // "chars" starting at "charIndex"; returns the number of characters
  // written.  Validation and decoding are done by InternalGetChars.
  public override int GetChars (byte[] bytes, int byteIndex, int byteCount,
                                char[] chars, int charIndex)
  {
      // One-shot conversion: start from an empty decoder state, flush at
      // the end, and discard the state afterwards.
      uint pendingBits = 0;
      uint pendingProgress = 0;
      int produced = InternalGetChars (bytes, byteIndex, byteCount, chars,
                                       charIndex, ref pendingBits, ref pendingProgress,
                                       throwOnInvalid, true);
      return produced;
  }
  // Get the maximum number of bytes needed to encode a
  // specified number of characters.
  //
  // The bound used is four bytes per character.  Throws
  // ArgumentOutOfRangeException if "charCount" is negative or so large
  // that the result would not fit in an Int32.
  public override int GetMaxByteCount (int charCount)
  {
      if (charCount < 0) {
          throw new ArgumentOutOfRangeException ("charCount", _("ArgRange_NonNegative"));
      }
      // Guard the multiplication: without this check the product wraps
      // around silently and can come back as a small or negative number.
      if (charCount > (Int32.MaxValue / 4)) {
          throw new ArgumentOutOfRangeException ("charCount");
      }
      return charCount * 4;
  }
  // Upper bound on the number of characters produced by decoding
  // "byteCount" bytes: the byte count itself.
  // Throws ArgumentOutOfRangeException if "byteCount" is negative.
  public override int GetMaxCharCount (int byteCount)
  {
      if (byteCount >= 0) {
          return byteCount;
      }
      throw new ArgumentOutOfRangeException ("byteCount", _("ArgRange_NonNegative"));
  }
  // Create a new stateful UTF-8 decoder that inherits this encoding's
  // throwOnInvalid setting.
  public override Decoder GetDecoder ()
  {
      Decoder decoder = new UTF8Decoder (throwOnInvalid);
      return decoder;
  }
  // Create a new stateful UTF-8 encoder that inherits this encoding's
  // emitIdentifier setting.
  public override Encoder GetEncoder ()
  {
      Encoder encoder = new UTF8Encoder (emitIdentifier);
      return encoder;
  }
  // Get the UTF-8 preamble: the bytes EF BB BF when this encoding was
  // created with the identifier option, otherwise an empty array.
  // A fresh array is returned on every call.
  public override byte[] GetPreamble ()
  {
      if (!emitIdentifier) {
          return new byte [0];
      }
      return new byte[] { (byte)0xEF, (byte)0xBB, (byte)0xBF };
  }
  // Two UTF8Encoding objects are equal when their code page and both
  // construction options match.  Anything that is not a UTF8Encoding
  // (including null) is unequal.
  public override bool Equals (Object value)
  {
      UTF8Encoding other = (value as UTF8Encoding);
      if (other == null) {
          return false;
      }
      if (codePage != other.codePage) {
          return false;
      }
      if (emitIdentifier != other.emitIdentifier) {
          return false;
      }
      return throwOnInvalid == other.throwOnInvalid;
  }
  // Hash code for this object; the base class value is used unchanged.
  public override int GetHashCode ()
  {
      int hash = base.GetHashCode ();
      return hash;
  }
  620. #if !ECMA_COMPAT
  // Name of this encoding for use in mail message bodies.
  public override String BodyName
  {
      get { return "utf-8"; }
  }
  // Human-readable description of this encoding.
  public override String EncodingName
  {
      get { return "Unicode (UTF-8)"; }
  }
  // Name of this encoding for use in mail agent headers.
  public override String HeaderName
  {
      get { return "utf-8"; }
  }
  // Whether a Web browser can display this encoding: always true.
  public override bool IsBrowserDisplay
  {
      get { return true; }
  }
  // Whether a Web browser can save in this encoding: always true.
  public override bool IsBrowserSave
  {
      get { return true; }
  }
  // Whether a mail/news agent can display this encoding: always true.
  public override bool IsMailNewsDisplay
  {
      get { return true; }
  }
  // Whether a mail/news agent can save in this encoding: always true.
  public override bool IsMailNewsSave
  {
      get { return true; }
  }
  // IANA-preferred Web name of this encoding.
  public override String WebName
  {
      get { return "utf-8"; }
  }
  // Windows code page reported for this encoding.  Note that this is
  // UnicodeEncoding's code page constant, not UTF8_CODE_PAGE.
  // NOTE(review): presumably this mirrors the reference framework's
  // behaviour - confirm before changing.
  public override int WindowsCodePage
  {
      get { return UnicodeEncoding.UNICODE_CODE_PAGE; }
  }
  684. #endif // !ECMA_COMPAT
  // Stateful UTF-8 decoder.  It remembers an unfinished multi-byte
  // sequence between GetChars calls so input can arrive in pieces.
  [Serializable]
  private sealed class UTF8Decoder : Decoder
  {
      // Throw on malformed input instead of skipping it.
      private bool throwOnInvalid;
      // Decoder state handed to and updated by InternalGetChars.
      private uint leftOverBits;
      private uint leftOverCount;

      // Create a decoder with no sequence in progress.
      public UTF8Decoder (bool throwOnInvalid)
      {
          this.leftOverCount = 0;
          this.leftOverBits = 0;
          this.throwOnInvalid = throwOnInvalid;
      }

      // Count the characters the next GetChars call would produce for the
      // same input.  The saved state is read but not modified, and no
      // flush is requested.
      public override int GetCharCount (byte[] bytes, int index, int count)
      {
          return UTF8Encoding.InternalGetCharCount (bytes, index, count,
                                                    leftOverBits, leftOverCount,
                                                    throwOnInvalid, false);
      }

      // Decode a piece of input, updating the saved state.  No flush is
      // requested, so an unfinished trailing sequence is kept for the
      // next call.
      public override int GetChars (byte[] bytes, int byteIndex,
                                    int byteCount, char[] chars, int charIndex)
      {
          return UTF8Encoding.InternalGetChars (bytes, byteIndex, byteCount,
                                                chars, charIndex,
                                                ref leftOverBits, ref leftOverCount,
                                                throwOnInvalid, false);
      }
  } // class UTF8Decoder
  // Stateful UTF-8 encoder.  It remembers a pending surrogate start
  // between GetBytes calls so input can arrive in pieces.
  [Serializable]
  private sealed class UTF8Encoder : Encoder
  {
      // Copied from the owning encoding and cleared after each GetBytes
      // call; it is not read anywhere else in this class.
      private bool emitIdentifier;
      // Surrogate start carried over from the previous call (0 if none).
      private uint leftOver;

      // Create an encoder with no pending surrogate.
      public UTF8Encoder (bool emitIdentifier)
      {
          this.leftOver = 0;
          this.emitIdentifier = emitIdentifier;
      }

      // Count the bytes the next GetBytes call would produce for the same
      // input.  The saved state is read but not modified.
      public override int GetByteCount (char[] chars, int index,
                                        int count, bool flush)
      {
          return UTF8Encoding.InternalGetByteCount (chars, index, count, leftOver, flush);
      }

      // Encode a piece of input, updating the saved state.  Despite its
      // name, "byteCount" is passed on as the starting index into "bytes".
      public override int GetBytes (char[] chars, int charIndex,
                                    int charCount, byte[] bytes, int byteCount, bool flush)
      {
          int written = UTF8Encoding.InternalGetBytes (chars, charIndex, charCount,
                                                       bytes, byteCount, ref leftOver, flush);
          // Only reached when the conversion did not throw.
          emitIdentifier = false;
          return written;
      }
  } // class UTF8Encoder
  739. }; // class UTF8Encoding
  740. }; // namespace System.Text