UTF8Encoding.cs 27 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936
  1. /*
  2. * UTF8Encoding.cs - Implementation of the "System.Text.UTF8Encoding" class.
  3. *
  4. * Copyright (c) 2001, 2002 Southern Storm Software, Pty Ltd
  5. * Copyright (C) 2004 Novell, Inc (http://www.novell.com)
  6. *
  7. * Permission is hereby granted, free of charge, to any person obtaining
  8. * a copy of this software and associated documentation files (the "Software"),
  9. * to deal in the Software without restriction, including without limitation
  10. * the rights to use, copy, modify, merge, publish, distribute, sublicense,
  11. * and/or sell copies of the Software, and to permit persons to whom the
  12. * Software is furnished to do so, subject to the following conditions:
  13. *
  14. * The above copyright notice and this permission notice shall be included
  15. * in all copies or substantial portions of the Software.
  16. *
  17. * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
  18. * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  19. * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
  20. * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
  21. * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
  22. * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
  23. * OTHER DEALINGS IN THE SOFTWARE.
  24. */
  25. namespace System.Text
  26. {
  27. using System;
  28. [Serializable]
  29. [MonoTODO ("Fix serialization compatibility with MS.NET")]
  30. #if NET_2_0
  31. [MonoTODO ("EncoderFallback is not handled")]
  32. #endif
  33. public class UTF8Encoding : Encoding
  34. {
  // Code page number that identifies UTF-8.
  internal const int UTF8_CODE_PAGE = 65001;

  // Whether GetPreamble() returns the UTF-8 byte order mark.
  private bool emitIdentifier;
  #if !NET_2_0
  // Pre-2.0 profile only: whether invalid byte sequences raise an exception.
  private bool throwOnInvalid;
  #endif

  // Creates an encoding that emits no identifier and does not throw
  // on invalid bytes.
  public UTF8Encoding ()
      : this (false, false)
  {
  }

  // Creates an encoding that does not throw on invalid bytes.
  public UTF8Encoding (bool encoderShouldEmitUTF8Identifier)
      : this (encoderShouldEmitUTF8Identifier, false)
  {
  }

  // Creates an encoding with explicit identifier and error-handling choices.
  // On the 2.0 profile "throwOnInvalidBytes" selects the decoder fallback
  // (exception fallback vs. an empty-string replacement); on older
  // profiles it is simply remembered in "throwOnInvalid".
  public UTF8Encoding (bool encoderShouldEmitUTF8Identifier, bool throwOnInvalidBytes)
      : base (UTF8_CODE_PAGE)
  {
      this.emitIdentifier = encoderShouldEmitUTF8Identifier;

  #if NET_2_0
      DecoderFallback decoderFallback;
      if (throwOnInvalidBytes)
          decoderFallback = new DecoderExceptionFallback ();
      else
          decoderFallback = new DecoderReplacementFallback (String.Empty);
      SetFallbackInternal (null, decoderFallback);
  #else
      this.throwOnInvalid = throwOnInvalidBytes;
  #endif

      // Descriptive metadata; these members are declared outside this
      // class (presumably on the Encoding base class).
      header_name = "utf-8";
      body_name = "utf-8";
      web_name = "utf-8";
      encoding_name = "Unicode (UTF-8)";
      is_browser_save = true;
      is_browser_display = true;
      is_mail_news_display = true;
      windows_code_page = UnicodeEncoding.UNICODE_CODE_PAGE;
  }
  // Counts the UTF-8 bytes needed to encode chars[index .. index+count).
  //
  // "leftOver" is a high surrogate carried over from a previous call
  // (0 when there is none).  When "flush" is true, a high surrogate that
  // is still pending at the end of the input is counted as three bytes.
  //
  // Throws ArgumentNullException for a null array and
  // ArgumentOutOfRangeException for an index/count outside the array.
  private static int InternalGetByteCount (char[] chars, int index, int count, uint leftOver, bool flush)
  {
      if (chars == null)
          throw new ArgumentNullException ("chars");
      if (index < 0 || index > chars.Length)
          throw new ArgumentOutOfRangeException ("index", _("ArgRange_Array"));
      if (count < 0 || count > (chars.Length - index))
          throw new ArgumentOutOfRangeException ("count", _("ArgRange_Array"));

      int total = 0;
      uint pendingHigh = leftOver;
      int end = index + count;
      int i = index;

      while (i < end) {
          char c = chars [i];

          if (pendingHigh != 0) {
              pendingHigh = 0;
              if (c >= '\uDC00' && c <= '\uDFFF') {
                  // High + low surrogate: one four-byte sequence.
                  total += 4;
                  ++i;
              } else {
                  // Unpaired high surrogate: count it on its own as
                  // three bytes and look at "c" again next time round.
                  total += 3;
              }
              continue;
          }

          if (c >= '\uD800' && c <= '\uDBFF') {
              // Possible start of a pair; decide when the next char arrives.
              pendingHigh = (uint) c;
          } else if (c < '\u0080') {
              total += 1;
          } else if (c < '\u0800') {
              total += 2;
          } else {
              total += 3;
          }
          ++i;
      }

      if (flush && pendingHigh != 0)
          total += 3;

      return total;
  }
  // Returns the number of bytes needed to encode "count" characters of
  // "chars" starting at "index".  Starts with no pending surrogate and
  // flushes at the end of the range.
  public override int GetByteCount (char[] chars, int index, int count)
  {
      const uint noPendingSurrogate = 0;
      int byteCount = InternalGetByteCount (chars, index, count, noPendingSurrogate, true);
      return byteCount;
  }
  // Returns the number of bytes needed to encode the whole string "s".
  // A high surrogate immediately followed by a low surrogate counts as
  // four bytes; any other char at or above U+0800 (including an unpaired
  // surrogate) counts as three.
  // Throws ArgumentNullException when "s" is null.
  public override int GetByteCount (String s)
  {
      if (s == null)
          throw new ArgumentNullException ("s");

      int total = 0;
      int n = s.Length;

      for (int i = 0; i < n; i++) {
          char c = s [i];

          if (c < '\u0080') {
              total += 1;
              continue;
          }
          if (c < '\u0800') {
              total += 2;
              continue;
          }

          bool startsPair = false;
          if (c >= '\uD800' && c <= '\uDBFF' && (i + 1) < n) {
              uint next = (uint) s [i + 1];
              startsPair = (next >= (uint) 0xDC00 && next <= (uint) 0xDFFF);
          }

          if (startsPair) {
              total += 4;
              ++i;        // the low surrogate has been accounted for
          } else {
              total += 3;
          }
      }

      return total;
  }
  // Encodes chars[charIndex .. charIndex+charCount) as UTF-8 into "bytes"
  // starting at "byteIndex" and returns the number of bytes written.
  //
  // "leftOver" carries a pending high surrogate between calls: it is read
  // on entry and written back on normal exit (it is left untouched when an
  // exception is thrown).  When "flush" is true, a high surrogate still
  // pending at the end is written out as a three-byte sequence.
  //
  // Throws ArgumentNullException / ArgumentOutOfRangeException for bad
  // arguments and ArgumentException when "bytes" is too small.
  private static int InternalGetBytes (char[] chars, int charIndex,
                                       int charCount, byte[] bytes,
                                       int byteIndex, ref uint leftOver,
                                       bool flush)
  {
      if (chars == null)
          throw new ArgumentNullException ("chars");
      if (bytes == null)
          throw new ArgumentNullException ("bytes");
      if (charIndex < 0 || charIndex > chars.Length)
          throw new ArgumentOutOfRangeException ("charIndex", _("ArgRange_Array"));
      if (charCount < 0 || charCount > (chars.Length - charIndex))
          throw new ArgumentOutOfRangeException ("charCount", _("ArgRange_Array"));
      if (byteIndex < 0 || byteIndex > bytes.Length)
          throw new ArgumentOutOfRangeException ("byteIndex", _("ArgRange_Array"));

      int capacity = bytes.Length;
      int outPos = byteIndex;
      int inPos = charIndex;
      int inEnd = charIndex + charCount;
      uint pendingHigh = leftOver;

      while (inPos < inEnd) {
          char c = chars [inPos];
          uint code;

          if (pendingHigh == 0) {
              ++inPos;
              if (c >= '\uD800' && c <= '\uDBFF') {
                  // Start of a surrogate pair: wait for the next char.
                  pendingHigh = (uint) c;
                  continue;
              }
              code = (uint) c;
          } else {
              if (c >= '\uDC00' && c <= '\uDFFF') {
                  // Complete pair: combine into one code point.
                  code = ((pendingHigh - (uint) 0xD800) << 10) +
                         (((uint) c) - (uint) 0xDC00) +
                         (uint) 0x10000;
                  ++inPos;
              } else {
                  // High surrogate followed by something else: emit the
                  // surrogate by itself and leave "c" unconsumed so that
                  // it is processed on the next iteration.
                  code = pendingHigh;
              }
              pendingHigh = 0;
          }

          // Work out the sequence length, check the space, then write.
          int needed;
          if (code < (uint) 0x0080)
              needed = 1;
          else if (code < (uint) 0x0800)
              needed = 2;
          else if (code < (uint) 0x10000)
              needed = 3;
          else
              needed = 4;

          if ((outPos + needed) > capacity)
              throw new ArgumentException (_("Arg_InsufficientSpace"), "bytes");

          switch (needed) {
          case 1:
              bytes [outPos++] = (byte) code;
              break;
          case 2:
              bytes [outPos++] = (byte) (0xC0 | (code >> 6));
              bytes [outPos++] = (byte) (0x80 | (code & 0x3F));
              break;
          case 3:
              bytes [outPos++] = (byte) (0xE0 | (code >> 12));
              bytes [outPos++] = (byte) (0x80 | ((code >> 6) & 0x3F));
              bytes [outPos++] = (byte) (0x80 | (code & 0x3F));
              break;
          default:
              bytes [outPos++] = (byte) (0xF0 | (code >> 18));
              bytes [outPos++] = (byte) (0x80 | ((code >> 12) & 0x3F));
              bytes [outPos++] = (byte) (0x80 | ((code >> 6) & 0x3F));
              bytes [outPos++] = (byte) (0x80 | (code & 0x3F));
              break;
          }
      }

      if (flush && pendingHigh != 0) {
          // Write the dangling high surrogate as a three-byte sequence.
          if ((outPos + 3) > capacity)
              throw new ArgumentException (_("Arg_InsufficientSpace"), "bytes");
          bytes [outPos++] = (byte) (0xE0 | (pendingHigh >> 12));
          bytes [outPos++] = (byte) (0x80 | ((pendingHigh >> 6) & 0x3F));
          bytes [outPos++] = (byte) (0x80 | (pendingHigh & 0x3F));
          pendingHigh = 0;
      }

      leftOver = pendingHigh;
      return outPos - byteIndex;
  }
  // Encodes "charCount" characters of "chars" starting at "charIndex" into
  // "bytes" starting at "byteIndex"; returns the number of bytes written.
  // Each call starts with no pending surrogate and flushes at the end.
  public override int GetBytes (char[] chars, int charIndex, int charCount,
                                byte[] bytes, int byteIndex)
  {
      uint pendingSurrogate = 0;
      int written = InternalGetBytes (chars, charIndex, charCount, bytes, byteIndex, ref pendingSurrogate, true);
      return written;
  }
  // Encodes "charCount" characters of the string "s" starting at
  // "charIndex" into "bytes" starting at "byteIndex"; returns the number
  // of bytes written.  A high surrogate directly followed by a low
  // surrogate inside the range becomes one four-byte sequence; an
  // unpaired surrogate is written as a three-byte sequence.
  //
  // Throws ArgumentNullException / ArgumentOutOfRangeException for bad
  // arguments and ArgumentException when "bytes" is too small.
  public override int GetBytes (String s, int charIndex, int charCount,
                                byte[] bytes, int byteIndex)
  {
      if (s == null)
          throw new ArgumentNullException ("s");
      if (bytes == null)
          throw new ArgumentNullException ("bytes");
      if (charIndex < 0 || charIndex > s.Length)
          throw new ArgumentOutOfRangeException ("charIndex", _("ArgRange_StringIndex"));
      if (charCount < 0 || charCount > (s.Length - charIndex))
          throw new ArgumentOutOfRangeException ("charCount", _("ArgRange_StringRange"));
      if (byteIndex < 0 || byteIndex > bytes.Length)
          throw new ArgumentOutOfRangeException ("byteIndex", _("ArgRange_Array"));

      int capacity = bytes.Length;
      int outPos = byteIndex;
      int inPos = charIndex;
      int inEnd = charIndex + charCount;

      while (inPos < inEnd) {
          char c = s [inPos++];
          uint code = (uint) c;

          // Try to combine a high surrogate with the char that follows it.
          if (c >= '\uD800' && c <= '\uDBFF' && inPos < inEnd) {
              uint next = (uint) (s [inPos]);
              if (next >= (uint) 0xDC00 && next <= (uint) 0xDFFF) {
                  code = (next - (uint) 0xDC00) +
                         ((code - (uint) 0xD800) << 10) +
                         (uint) 0x10000;
                  ++inPos;
              }
          }

          // Work out the sequence length, check the space, then write.
          int needed;
          if (code < (uint) 0x0080)
              needed = 1;
          else if (code < (uint) 0x0800)
              needed = 2;
          else if (code < (uint) 0x10000)
              needed = 3;
          else
              needed = 4;

          if ((outPos + needed) > capacity)
              throw new ArgumentException (_("Arg_InsufficientSpace"), "bytes");

          switch (needed) {
          case 1:
              bytes [outPos++] = (byte) code;
              break;
          case 2:
              bytes [outPos++] = (byte) (0xC0 | (code >> 6));
              bytes [outPos++] = (byte) (0x80 | (code & 0x3F));
              break;
          case 3:
              bytes [outPos++] = (byte) (0xE0 | (code >> 12));
              bytes [outPos++] = (byte) (0x80 | ((code >> 6) & 0x3F));
              bytes [outPos++] = (byte) (0x80 | (code & 0x3F));
              break;
          default:
              bytes [outPos++] = (byte) (0xF0 | (code >> 18));
              bytes [outPos++] = (byte) (0x80 | ((code >> 12) & 0x3F));
              bytes [outPos++] = (byte) (0x80 | ((code >> 6) & 0x3F));
              bytes [outPos++] = (byte) (0x80 | (code & 0x3F));
              break;
          }
      }

      return outPos - byteIndex;
  }
  // Counts the UTF-16 chars produced by decoding bytes[index .. index+count).
  //
  // "leftOverBits" holds the payload bits of a partially read sequence;
  // "leftOverCount" packs the number of bytes read so far (low four bits)
  // and the total sequence size (next four bits).  Neither is modified.
  // When "flush" is true an unfinished sequence at the end of the input is
  // treated as invalid.
  //
  // Invalid input is reported through the decoder fallback on the 2.0
  // profile (the fallback's output length is added to the count); on
  // older profiles it throws ArgumentException only if "throwOnInvalid"
  // is set and is otherwise skipped.
  //
  // NOTE(review): a 5- or 6-byte form whose value lies in
  // 0x10000..0x10FFFF is counted as a surrogate pair here without an
  // overlong check - confirm whether that leniency is intended.
  #if NET_2_0
  private static int InternalGetCharCount (
      byte[] bytes, int index, int count, uint leftOverBits,
      uint leftOverCount, object provider,
      ref DecoderFallbackBuffer fallbackBuffer, bool flush)
  #else
  private static int InternalGetCharCount (
      byte[] bytes, int index, int count, uint leftOverBits,
      uint leftOverCount, bool throwOnInvalid, bool flush)
  #endif
  {
      if (bytes == null)
          throw new ArgumentNullException ("bytes");
      if (index < 0 || index > bytes.Length)
          throw new ArgumentOutOfRangeException ("index", _("ArgRange_Array"));
      if (count < 0 || count > (bytes.Length - index))
          throw new ArgumentOutOfRangeException ("count", _("ArgRange_Array"));

      int total = 0;
      uint bits = leftOverBits;
      uint seen = (leftOverCount & (uint) 0x0F);
      uint expected = ((leftOverCount >> 4) & (uint) 0x0F);
      int end = index + count;

      for (int pos = index; pos < end; pos++) {
          uint b = (uint) (bytes [pos]);

          if (expected == 0) {
              // Expecting the first byte of a sequence.
              if (b < (uint) 0x0080) {
                  ++total;
                  continue;
              }

              uint size;
              uint mask;
              if ((b & (uint) 0xE0) == (uint) 0xC0) {
                  size = 2;
                  mask = 0x1F;
              } else if ((b & (uint) 0xF0) == (uint) 0xE0) {
                  size = 3;
                  mask = 0x0F;
              } else if ((b & (uint) 0xF8) == (uint) 0xF0) {
                  size = 4;
                  mask = 0x07;
              } else if ((b & (uint) 0xFC) == (uint) 0xF8) {
                  size = 5;
                  mask = 0x03;
              } else if ((b & (uint) 0xFE) == (uint) 0xFC) {
                  size = 6;
                  mask = 0x03;
              } else {
                  size = 0;
                  mask = 0;
              }

              if (size != 0) {
                  bits = (b & mask);
                  seen = 1;
                  expected = size;
              } else {
                  // Not a valid lead byte.
  #if NET_2_0
                  total += Fallback (provider, ref fallbackBuffer, bytes, pos);
  #else
                  if (throwOnInvalid)
                      throw new ArgumentException (_("Arg_InvalidUTF8"), "bytes");
  #endif
              }
              continue;
          }

          if ((b & (uint) 0xC0) != (uint) 0x80) {
              // Expected a continuation byte and got something else:
              // report it, drop the partial sequence, and step back so
              // this byte is examined again as a possible lead byte.
  #if NET_2_0
              total += Fallback (provider, ref fallbackBuffer, bytes, pos);
  #else
              if (throwOnInvalid)
                  throw new ArgumentException (_("Arg_InvalidUTF8"), "bytes");
  #endif
              expected = 0;
              --pos;
              continue;
          }

          bits = ((bits << 6) | (b & (uint) 0x3F));
          if (++seen < expected)
              continue;

          // The sequence is complete.
          if (bits < (uint) 0x10000) {
              // Reject values encoded with more bytes than necessary.
              bool overlong =
                  (expected == 2 && bits <= 0x7F) ||
                  (expected == 3 && bits <= 0x07FF) ||
                  (expected == 4 && bits <= 0xFFFF) ||
                  (expected == 5 && bits <= 0x1FFFFF) ||
                  (expected == 6 && bits <= 0x03FFFFFF);

              if (!overlong) {
                  ++total;
              } else {
  #if NET_2_0
                  total += Fallback (provider, ref fallbackBuffer, bytes, pos);
  #else
                  if (throwOnInvalid)
                      throw new ArgumentException (_("Overlong"), bits.ToString ());
  #endif
              }
          } else if (bits < (uint) 0x110000) {
              // Needs a surrogate pair.
              total += 2;
          } else {
              // Beyond the Unicode range.
  #if NET_2_0
              total += Fallback (provider, ref fallbackBuffer, bytes, pos);
  #else
              if (throwOnInvalid)
                  throw new ArgumentException (_("Arg_InvalidUTF8"), "bytes");
  #endif
          }
          expected = 0;
      }

      if (flush && expected != 0) {
          // Input ended in the middle of a sequence.
  #if NET_2_0
          total += Fallback (provider, ref fallbackBuffer, bytes, end - 1);
  #else
          if (throwOnInvalid)
              throw new ArgumentException (_("Arg_InvalidUTF8"), "bytes");
  #endif
      }

      return total;
  }
  #if NET_2_0
  // Fallback helper for GetCharCount().
  //
  // Lazily obtains the fallback buffer (from "provider" when it is a
  // DecoderFallback, otherwise from the Decoder it is cast to), reports
  // the invalid byte at "index" and returns how many replacement chars
  // the fallback would produce.
  //
  // The buffer is reset before returning: counting never consumes the
  // replacement chars, and leaving them pending would make the next
  // Fallback() call on the same buffer look like a re-entrant invocation.
  // "index" is forwarded as given; callers already pass the position of
  // the offending byte, so it must not be decremented again here (that
  // produced -1 for an invalid first byte).
  static int Fallback (object provider, ref DecoderFallbackBuffer buffer, byte [] bytes, int index)
  {
      if (buffer == null) {
          DecoderFallback fb = provider as DecoderFallback;
          if (fb != null)
              buffer = fb.CreateFallbackBuffer ();
          else
              buffer = ((Decoder) provider).FallbackBuffer;
      }
      buffer.Fallback (bytes, index);
      int replacementLength = buffer.Remaining;
      buffer.Reset ();
      return replacementLength;
  }

  // Fallback helper for GetChars().
  //
  // Obtains the fallback buffer as above, reports the invalid byte at
  // "byteIndex" and copies every replacement char into "chars", advancing
  // "charIndex".  Throws ArgumentException when "chars" cannot hold the
  // replacement; the buffer is reset first so it is not left with
  // pending chars.
  static void Fallback (object provider, ref DecoderFallbackBuffer buffer, byte [] bytes, int byteIndex,
                        char [] chars, ref int charIndex)
  {
      if (buffer == null) {
          DecoderFallback fb = provider as DecoderFallback;
          if (fb != null)
              buffer = fb.CreateFallbackBuffer ();
          else
              buffer = ((Decoder) provider).FallbackBuffer;
      }
      buffer.Fallback (bytes, byteIndex);
      while (buffer.Remaining > 0) {
          if (charIndex >= chars.Length) {
              buffer.Reset ();
              throw new ArgumentException (_("Arg_InsufficientSpace"), "chars");
          }
          chars [charIndex++] = buffer.GetNextChar ();
      }
  }
  #endif
  // Returns the number of chars produced by decoding "count" bytes of
  // "bytes" starting at "index".  Starts with no partial sequence and
  // treats an unfinished sequence at the end as invalid (flush).
  public override int GetCharCount (byte[] bytes, int index, int count)
  {
      int charCount;
  #if NET_2_0
      DecoderFallbackBuffer fallbackBuffer = null;
      charCount = InternalGetCharCount (bytes, index, count, 0, 0, DecoderFallback, ref fallbackBuffer, true);
  #else
      charCount = InternalGetCharCount (bytes, index, count, 0, 0, throwOnInvalid, true);
  #endif
      return charCount;
  }
  // Decodes bytes[byteIndex .. byteIndex+byteCount) into "chars" starting
  // at "charIndex" and returns the number of chars written.
  //
  // "leftOverBits" / "leftOverCount" carry a partially read sequence
  // between calls: the former holds the payload bits gathered so far, the
  // latter packs the bytes read so far (low four bits) and the total
  // sequence size (next four bits).  Both are written back on normal exit.
  // When "flush" is true an unfinished sequence at the end of the input is
  // treated as invalid.
  //
  // Invalid input is handed to the decoder fallback on the 2.0 profile;
  // on older profiles it throws ArgumentException only if
  // "throwOnInvalid" is set and is otherwise skipped.
  //
  // Throws ArgumentNullException / ArgumentOutOfRangeException for bad
  // arguments and ArgumentException when "chars" is too small.
  #if NET_2_0
  private static int InternalGetChars (
      byte[] bytes, int byteIndex, int byteCount, char[] chars,
      int charIndex, ref uint leftOverBits, ref uint leftOverCount,
      object provider,
      ref DecoderFallbackBuffer fallbackBuffer, bool flush)
  #else
  private static int InternalGetChars (
      byte[] bytes, int byteIndex, int byteCount, char[] chars,
      int charIndex, ref uint leftOverBits, ref uint leftOverCount,
      bool throwOnInvalid, bool flush)
  #endif
  {
      // Validate the parameters.
      if (bytes == null) {
          throw new ArgumentNullException ("bytes");
      }
      if (chars == null) {
          throw new ArgumentNullException ("chars");
      }
      if (byteIndex < 0 || byteIndex > bytes.Length) {
          throw new ArgumentOutOfRangeException ("byteIndex", _("ArgRange_Array"));
      }
      if (byteCount < 0 || byteCount > (bytes.Length - byteIndex)) {
          throw new ArgumentOutOfRangeException ("byteCount", _("ArgRange_Array"));
      }
      if (charIndex < 0 || charIndex > chars.Length) {
          throw new ArgumentOutOfRangeException ("charIndex", _("ArgRange_Array"));
      }

      // Shortcut only when there is neither input nor output room.  With
      // input present the bytes must still be consumed even if "chars" is
      // full: they may merely extend a partial sequence (which has to be
      // remembered in the left-over state), and if they do produce chars
      // the capacity checks below report the shortage instead of the
      // input being dropped silently.
      if (charIndex == chars.Length && byteCount == 0)
          return 0;

      // Convert the bytes into the output buffer.
      uint ch;
      int length = chars.Length;
      int posn = charIndex;
      uint leftBits = leftOverBits;
      uint leftSoFar = (leftOverCount & (uint)0x0F);
      uint leftSize = ((leftOverCount >> 4) & (uint)0x0F);

      // The validation above guarantees 0 <= byteEnd <= bytes.Length.
      int byteEnd = byteIndex + byteCount;

      for (; byteIndex < byteEnd; byteIndex++) {
          // Fetch the next byte from the input buffer.
          ch = (uint)(bytes[byteIndex]);
          if (leftSize == 0) {
              // Process a UTF-8 start byte.
              if (ch < (uint)0x0080) {
                  // Single-byte UTF-8 character.
                  if (posn >= length) {
                      throw new ArgumentException (_("Arg_InsufficientSpace"), "chars");
                  }
                  chars[posn++] = (char)ch;
              } else if ((ch & (uint)0xE0) == (uint)0xC0) {
                  // Double-byte UTF-8 character.
                  leftBits = (ch & (uint)0x1F);
                  leftSoFar = 1;
                  leftSize = 2;
              } else if ((ch & (uint)0xF0) == (uint)0xE0) {
                  // Three-byte UTF-8 character.
                  leftBits = (ch & (uint)0x0F);
                  leftSoFar = 1;
                  leftSize = 3;
              } else if ((ch & (uint)0xF8) == (uint)0xF0) {
                  // Four-byte UTF-8 character.
                  leftBits = (ch & (uint)0x07);
                  leftSoFar = 1;
                  leftSize = 4;
              } else if ((ch & (uint)0xFC) == (uint)0xF8) {
                  // Five-byte form.
                  leftBits = (ch & (uint)0x03);
                  leftSoFar = 1;
                  leftSize = 5;
              } else if ((ch & (uint)0xFE) == (uint)0xFC) {
                  // Six-byte form.
                  leftBits = (ch & (uint)0x03);
                  leftSoFar = 1;
                  leftSize = 6;
              } else {
                  // Invalid UTF-8 start byte.
  #if NET_2_0
                  Fallback (provider, ref fallbackBuffer, bytes, byteIndex, chars, ref posn);
  #else
                  if (throwOnInvalid)
                      throw new ArgumentException (_("Arg_InvalidUTF8"), "bytes");
  #endif
              }
          } else {
              // Process an extra byte in a multi-byte sequence.
              if ((ch & (uint)0xC0) == (uint)0x80) {
                  leftBits = ((leftBits << 6) | (ch & (uint)0x3F));
                  if (++leftSoFar >= leftSize) {
                      // We have a complete character now.
                      if (leftBits < (uint)0x10000) {
                          // Is it encoded with more bytes than necessary?
                          bool overlong = false;
                          switch (leftSize) {
                          case 2:
                              overlong = (leftBits <= 0x7F);
                              break;
                          case 3:
                              overlong = (leftBits <= 0x07FF);
                              break;
                          case 4:
                              overlong = (leftBits <= 0xFFFF);
                              break;
                          case 5:
                              overlong = (leftBits <= 0x1FFFFF);
                              break;
                          case 6:
                              overlong = (leftBits <= 0x03FFFFFF);
                              break;
                          }
                          if (overlong) {
  #if NET_2_0
                              Fallback (provider, ref fallbackBuffer, bytes, byteIndex, chars, ref posn);
  #else
                              if (throwOnInvalid)
                                  throw new ArgumentException (_("Overlong"), leftBits.ToString ());
  #endif
                          }
                          else {
                              if (posn >= length) {
                                  throw new ArgumentException
                                      (_("Arg_InsufficientSpace"), "chars");
                              }
                              chars[posn++] = (char)leftBits;
                          }
                      } else if (leftBits < (uint)0x110000) {
                          // Supplementary code point: write a surrogate pair.
                          if ((posn + 2) > length) {
                              throw new ArgumentException
                                  (_("Arg_InsufficientSpace"), "chars");
                          }
                          leftBits -= (uint)0x10000;
                          chars[posn++] = (char)((leftBits >> 10) +
                                                 (uint)0xD800);
                          chars[posn++] =
                              (char)((leftBits & (uint)0x3FF) + (uint)0xDC00);
                      } else {
                          // Beyond the Unicode range.
  #if NET_2_0
                          Fallback (provider, ref fallbackBuffer, bytes, byteIndex, chars, ref posn);
  #else
                          if (throwOnInvalid)
                              throw new ArgumentException (_("Arg_InvalidUTF8"), "bytes");
  #endif
                      }
                      leftSize = 0;
                  }
              } else {
                  // Invalid UTF-8 sequence: clear and restart.  Stepping
                  // back makes the loop look at this byte again, this
                  // time as a possible start byte.
  #if NET_2_0
                  Fallback (provider, ref fallbackBuffer, bytes, byteIndex, chars, ref posn);
  #else
                  if (throwOnInvalid)
                      throw new ArgumentException (_("Arg_InvalidUTF8"), "bytes");
  #endif
                  leftSize = 0;
                  --byteIndex;
              }
          }
      }
      if (flush && leftSize != 0) {
          // We had left-over bytes that didn't make up
          // a complete UTF-8 character sequence.
  #if NET_2_0
          Fallback (provider, ref fallbackBuffer, bytes, byteIndex, chars, ref posn);
  #else
          if (throwOnInvalid)
              throw new ArgumentException (_("Arg_InvalidUTF8"), "bytes");
  #endif
      }

      // Hand the partial-sequence state back to the caller.
      leftOverBits = leftBits;
      leftOverCount = (leftSoFar | (leftSize << 4));

      // Return the number of chars written.
      return posn - charIndex;
  }
  // Decodes "byteCount" bytes of "bytes" starting at "byteIndex" into
  // "chars" starting at "charIndex"; returns the number of chars written.
  // Each call starts with no partial sequence and flushes at the end.
  public override int GetChars (byte[] bytes, int byteIndex, int byteCount,
                                char[] chars, int charIndex)
  {
      uint pendingBits = 0;
      uint pendingCount = 0;
      int written;
  #if NET_2_0
      DecoderFallbackBuffer fallbackBuffer = null;
      written = InternalGetChars (bytes, byteIndex, byteCount, chars,
          charIndex, ref pendingBits, ref pendingCount, DecoderFallback, ref fallbackBuffer, true);
  #else
      written = InternalGetChars (bytes, byteIndex, byteCount, chars,
          charIndex, ref pendingBits, ref pendingCount, throwOnInvalid, true);
  #endif
      return written;
  }
  // Returns an upper bound on the number of bytes needed to encode
  // "charCount" characters (four bytes per character).
  //
  // Throws ArgumentOutOfRangeException when "charCount" is negative or
  // so large that the result would not fit in an Int32; without the
  // second check the multiplication wraps around silently and callers
  // would size their buffers from a negative or far too small number.
  public override int GetMaxByteCount (int charCount)
  {
      if (charCount < 0) {
          throw new ArgumentOutOfRangeException ("charCount", _("ArgRange_NonNegative"));
      }
      if (charCount > (Int32.MaxValue / 4)) {
          throw new ArgumentOutOfRangeException ("charCount");
      }
      return charCount * 4;
  }
  // Returns an upper bound on the number of chars produced by decoding
  // "byteCount" bytes: one char per byte.
  // Throws ArgumentOutOfRangeException when "byteCount" is negative.
  public override int GetMaxCharCount (int byteCount)
  {
      if (byteCount >= 0)
          return byteCount;

      throw new ArgumentOutOfRangeException ("byteCount", _("ArgRange_NonNegative"));
  }
  // Creates a new stateful UTF-8 decoder configured with this encoding's
  // error handling (the decoder fallback on the 2.0 profile, the
  // throw-on-invalid flag otherwise).
  public override Decoder GetDecoder ()
  {
  #if NET_2_0
      Decoder decoder = new UTF8Decoder (DecoderFallback);
  #else
      Decoder decoder = new UTF8Decoder (throwOnInvalid);
  #endif
      return decoder;
  }
  // Creates a new stateful UTF-8 encoder, passing along this encoding's
  // identifier setting.
  public override Encoder GetEncoder ()
  {
      Encoder encoder = new UTF8Encoder (emitIdentifier);
      return encoder;
  }
  // Returns the three-byte UTF-8 byte order mark (EF BB BF) when this
  // encoding was created with the identifier enabled, and an empty array
  // otherwise.  A fresh array is returned on every call.
  public override byte[] GetPreamble ()
  {
      if (!emitIdentifier)
          return new byte [0];

      return new byte [] { (byte) 0xEF, (byte) 0xBB, (byte) 0xBF };
  }
  // Determines whether "value" is a UTF8Encoding configured the same way
  // as this one: same code page, same identifier setting and equivalent
  // error handling.  Returns false for null or any other type.
  //
  // On the 2.0 profile the fallbacks are compared with Object.Equals
  // rather than "==": the constructor allocates a new decoder fallback
  // for every instance, so a reference comparison would report two
  // identically configured encodings as different.
  public override bool Equals (Object value)
  {
      UTF8Encoding enc = (value as UTF8Encoding);
      if (enc == null) {
          return false;
      }
  #if NET_2_0
      return (codePage == enc.codePage &&
              emitIdentifier == enc.emitIdentifier &&
              Object.Equals (DecoderFallback, enc.DecoderFallback) &&
              Object.Equals (EncoderFallback, enc.EncoderFallback));
  #else
      return (codePage == enc.codePage &&
              emitIdentifier == enc.emitIdentifier &&
              throwOnInvalid == enc.throwOnInvalid);
  #endif
  }
  // Returns the hash code computed by the base class; nothing specific to
  // this class is mixed in.
  public override int GetHashCode ()
  {
      int hash = base.GetHashCode ();
      return hash;
  }
  // Encodes the whole string "s" and returns the bytes in a new array
  // sized exactly by GetByteCount(s).
  // Throws ArgumentNullException when "s" is null.
  public override byte [] GetBytes (String s)
  {
      if (s == null)
          throw new ArgumentNullException ("s");

      byte [] encoded = new byte [GetByteCount (s)];
      GetBytes (s, 0, s.Length, encoded, 0);
      return encoded;
  }
  // Stateful UTF-8 decoder.  It remembers a partially read multi-byte
  // sequence between GetChars() calls and never flushes, so a sequence
  // split across two buffers is decoded once its remaining bytes arrive.
  [Serializable]
  private class UTF8Decoder : Decoder
  {
  #if !NET_2_0
      // Whether invalid input raises an exception (pre-2.0 profile only).
      private bool throwOnInvalid;
  #endif
      // Payload bits of the sequence currently being read.
      private uint leftOverBits;
      // Bytes read so far (low four bits) and sequence size (next four bits).
      private uint leftOverCount;

      // Creates a decoder with no pending sequence.
  #if NET_2_0
      public UTF8Decoder (DecoderFallback fallback)
  #else
      public UTF8Decoder (bool throwOnInvalid)
  #endif
      {
          this.leftOverBits = 0;
          this.leftOverCount = 0;
  #if NET_2_0
          this.Fallback = fallback;
  #else
          this.throwOnInvalid = throwOnInvalid;
  #endif
      }

      // Counts the chars the given bytes would decode to, taking the
      // pending sequence into account without modifying it.
      public override int GetCharCount (byte[] bytes, int index, int count)
      {
          int charCount;
  #if NET_2_0
          DecoderFallbackBuffer fallbackBuffer = null;
          charCount = InternalGetCharCount (bytes, index, count,
              leftOverBits, leftOverCount, this, ref fallbackBuffer, false);
  #else
          charCount = InternalGetCharCount (bytes, index, count,
              leftOverBits, leftOverCount, throwOnInvalid, false);
  #endif
          return charCount;
      }

      // Decodes the given bytes into "chars" and updates the pending
      // sequence state; returns the number of chars written.
      public override int GetChars (byte[] bytes, int byteIndex,
                                    int byteCount, char[] chars, int charIndex)
      {
          int written;
  #if NET_2_0
          DecoderFallbackBuffer fallbackBuffer = null;
          written = InternalGetChars (bytes, byteIndex, byteCount,
              chars, charIndex, ref leftOverBits, ref leftOverCount, this, ref fallbackBuffer, false);
  #else
          written = InternalGetChars (bytes, byteIndex, byteCount,
              chars, charIndex, ref leftOverBits, ref leftOverCount, throwOnInvalid, false);
  #endif
          return written;
      }
  } // class UTF8Decoder
  // Stateful UTF-8 encoder.  It remembers a high surrogate seen at the end
  // of one GetBytes() call so that it can be paired with a low surrogate
  // at the start of the next.
  [Serializable]
  private class UTF8Encoder : Encoder
  {
      // Stored at construction and cleared after every GetBytes() call;
      // nothing in this class reads it.
      private bool emitIdentifier;
      // Pending high surrogate, or 0 when there is none.
      private uint leftOver;

      // Creates an encoder with no pending surrogate.
      public UTF8Encoder (bool emitIdentifier)
      {
          this.leftOver = 0;
          this.emitIdentifier = emitIdentifier;
      }

      // Counts the bytes the given chars would encode to, taking the
      // pending surrogate into account without modifying it.
      public override int GetByteCount (char[] chars, int index,
                                        int count, bool flush)
      {
          int needed = InternalGetByteCount (chars, index, count, leftOver, flush);
          return needed;
      }

      // Encodes the given chars into "bytes" and updates the pending
      // surrogate; returns the number of bytes written.  Note that the
      // parameter named "byteCount" is forwarded as the starting index
      // into "bytes", not as a length.
      public override int GetBytes (char[] chars, int charIndex,
                                    int charCount, byte[] bytes, int byteCount, bool flush)
      {
          int written = InternalGetBytes (chars, charIndex, charCount, bytes, byteCount, ref leftOver, flush);
          this.emitIdentifier = false;
          return written;
      }
  } // class UTF8Encoder
  880. }; // class UTF8Encoding
  881. }; // namespace System.Text