UTF8Encoding.cs 29 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016
  1. /*
  2. * UTF8Encoding.cs - Implementation of the "System.Text.UTF8Encoding" class.
  3. *
  4. * Copyright (c) 2001, 2002 Southern Storm Software, Pty Ltd
  5. * Copyright (C) 2004 Novell, Inc (http://www.novell.com)
  6. *
  7. * Permission is hereby granted, free of charge, to any person obtaining
  8. * a copy of this software and associated documentation files (the "Software"),
  9. * to deal in the Software without restriction, including without limitation
  10. * the rights to use, copy, modify, merge, publish, distribute, sublicense,
  11. * and/or sell copies of the Software, and to permit persons to whom the
  12. * Software is furnished to do so, subject to the following conditions:
  13. *
  14. * The above copyright notice and this permission notice shall be included
  15. * in all copies or substantial portions of the Software.
  16. *
  17. * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
  18. * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  19. * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
  20. * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
  21. * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
  22. * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
  23. * OTHER DEALINGS IN THE SOFTWARE.
  24. */
  25. namespace System.Text
  26. {
  27. using System;
  28. [Serializable]
  29. [MonoTODO ("Fix serialization compatibility with MS.NET")]
  30. #if NET_2_0
  31. [MonoTODO ("EncoderFallback is not handled")]
  32. #endif
  33. public class UTF8Encoding : Encoding
  34. {
  35. // Magic number used by Windows for UTF-8.
  36. internal const int UTF8_CODE_PAGE = 65001;
  37. // Internal state.
  38. private bool emitIdentifier;
  39. #if !NET_2_0
  40. private bool throwOnInvalid;
  41. #endif
  // Constructors.

  // No UTF-8 identifier is emitted and "throwOnInvalidBytes" is false.
  public UTF8Encoding ()
      : this (false, false)
  {
  }

  // Chooses whether the UTF-8 identifier is emitted; "throwOnInvalidBytes"
  // is false.
  public UTF8Encoding (bool encoderShouldEmitUTF8Identifier)
      : this (encoderShouldEmitUTF8Identifier, false)
  {
  }

  // Main constructor: records the identifier flag and the invalid-input
  // policy, then fills in the descriptive encoding properties.
  public UTF8Encoding (bool encoderShouldEmitUTF8Identifier, bool throwOnInvalidBytes)
      : base (UTF8_CODE_PAGE)
  {
      emitIdentifier = encoderShouldEmitUTF8Identifier;
  #if NET_2_0
      // Under NET_2_0 the invalid-input policy is expressed through the
      // decoder fallback: an empty replacement string or an exception.
      if (!throwOnInvalidBytes)
          SetFallbackInternal (null, new DecoderReplacementFallback (String.Empty));
      else
          SetFallbackInternal (null, new DecoderExceptionFallback ());
  #else
      throwOnInvalid = throwOnInvalidBytes;
  #endif

      // Descriptive members (not declared in this class; presumably they
      // live on the Encoding base class).
      header_name = "utf-8";
      body_name = "utf-8";
      web_name = "utf-8";
      encoding_name = "Unicode (UTF-8)";
      is_browser_save = true;
      is_browser_display = true;
      is_mail_news_display = true;
      windows_code_page = UnicodeEncoding.UNICODE_CODE_PAGE;
  }
  65. #region GetByteCount()
  // Array-based byte counter that carries a pending high surrogate
  // ("leftOver") between calls. Validates the arguments and then defers
  // to the pointer-based overload.
  //
  // Throws ArgumentNullException when "chars" is null and
  // ArgumentOutOfRangeException when "index"/"count" do not describe a
  // range inside "chars".
  private static int InternalGetByteCount (char[] chars, int index, int count, ref char leftOver, bool flush)
  {
      if (chars == null)
          throw new ArgumentNullException ("chars");
      if (index < 0 || index > chars.Length)
          throw new ArgumentOutOfRangeException ("index", _("ArgRange_Array"));
      if (count < 0 || count > (chars.Length - index))
          throw new ArgumentOutOfRangeException ("count", _("ArgRange_Array"));

      if (index != chars.Length) {
          unsafe {
              fixed (char* start = chars) {
                  return InternalGetByteCount (start + index, count, ref leftOver, flush);
              }
          }
      }

      // Nothing left to read: only a pending surrogate start can still
      // contribute, and only when the caller asks for a flush.
      if (!flush || leftOver == '\0')
          return 0;
      leftOver = '\0';
      return 3;
  }
  // Counts the UTF-8 bytes needed to encode "count" UTF-16 code units
  // starting at "chars".
  //
  // "leftOver" carries a high surrogate that ended a previous call. When
  // "flush" is true a still-pending high surrogate is counted as a
  // three-byte sequence and "leftOver" is cleared; otherwise the pending
  // surrogate is stored back into "leftOver" for the next call.
  private unsafe static int InternalGetByteCount (char* chars, int count, ref char leftOver, bool flush)
  {
      int index = 0;
      int length = 0;
      char pair = leftOver;

      while (count > 0) {
          char ch = chars [index];
          if (pair == '\0') {
              if (ch < '\u0080') {
                  ++length;
              } else if (ch < '\u0800') {
                  length += 2;
              } else if (ch >= '\uD800' && ch <= '\uDBFF') {
                  // Start of a surrogate pair: wait for the tail.
                  pair = ch;
              } else {
                  // Any other BMP character. A surrogate tail with no
                  // leading surrogate also lands here and is counted as
                  // three bytes.
                  length += 3;
              }
          } else if (ch >= '\uDC00' && ch <= '\uDFFF') {
              // Complete surrogate pair. (A surrogate start is always
              // pending in this branch, so no further check is needed.)
              length += 4;
              pair = '\0';
          } else {
              // Surrogate start followed by a non-tail character: count
              // the lone start as three bytes and look at the current
              // character again without consuming it.
              length += 3;
              pair = '\0';
              continue;
          }
          ++index;
          --count;
      }

      if (flush) {
          if (pair != '\0')
              length += 3;
          leftOver = '\0';
      } else {
          leftOver = pair;
      }
      return length;
  }
  // Returns the number of bytes needed to encode "count" characters of
  // "chars" starting at "index". Counting starts with no pending
  // surrogate and always flushes at the end.
  public override int GetByteCount (char[] chars, int index, int count)
  {
      char pending = '\0';
      int total = InternalGetByteCount (chars, index, count, ref pending, true);
      return total;
  }
  // String overload of "GetByteCount": counts the bytes needed to encode
  // the whole string, flushing at the end.
  // Throws ArgumentNullException when "s" is null.
  public override int GetByteCount (String s)
  {
      if (s == null)
          throw new ArgumentNullException ("s");

      char pending = '\0';
      unsafe {
          fixed (char* text = s) {
              return InternalGetByteCount (text, s.Length, ref pending, true);
          }
      }
  }
  171. #endregion
  172. #region GetBytes()
  // Array-based encoder that carries a pending high surrogate
  // ("leftOver") between calls. Validates the arguments and then defers
  // to the pointer-based overload.
  //
  // Returns the number of bytes written to "bytes" starting at
  // "byteIndex".
  //
  // Throws ArgumentNullException for a null array,
  // ArgumentOutOfRangeException for an index/count outside its array and
  // ArgumentException when "bytes" is too small.
  private static int InternalGetBytes (char[] chars, int charIndex,
                                       int charCount, byte[] bytes,
                                       int byteIndex, ref char leftOver,
                                       bool flush)
  {
      if (chars == null)
          throw new ArgumentNullException ("chars");
      if (bytes == null)
          throw new ArgumentNullException ("bytes");
      if (charIndex < 0 || charIndex > chars.Length)
          throw new ArgumentOutOfRangeException ("charIndex", _("ArgRange_Array"));
      if (charCount < 0 || charCount > (chars.Length - charIndex))
          throw new ArgumentOutOfRangeException ("charCount", _("ArgRange_Array"));
      if (byteIndex < 0 || byteIndex > bytes.Length)
          throw new ArgumentOutOfRangeException ("byteIndex", _("ArgRange_Array"));

      if (charIndex == chars.Length) {
          if (flush && leftOver != '\0') {
              // Flush the left-over surrogate start as a three-byte
              // sequence, exactly as the pointer-based overload does
              // (this path used to emit EF BB BF, i.e. the UTF-8
              // identifier, and did not check the available space).
              if ((bytes.Length - byteIndex) < 3)
                  throw new ArgumentException (_("Arg_InsufficientSpace"), "bytes");
              bytes [byteIndex] = (byte) (0xE0 | (leftOver >> 12));
              bytes [byteIndex + 1] = (byte) (0x80 | ((leftOver >> 6) & 0x3F));
              bytes [byteIndex + 2] = (byte) (0x80 | (leftOver & 0x3F));
              leftOver = '\0';
              return 3;
          }
          return 0;
      }

      unsafe {
          fixed (char* cptr = chars) {
              fixed (byte* bptr = bytes) {
                  return InternalGetBytes (
                      cptr + charIndex, charCount,
                      bptr + byteIndex, bytes.Length - byteIndex,
                      ref leftOver, flush);
              }
          }
      }
  }
  // Encodes "charCount" UTF-16 code units starting at "chars" into UTF-8,
  // writing at most "byteCount" bytes to "bytes". Returns the number of
  // bytes written.
  //
  // "leftOver" carries a high surrogate that ended a previous call. When
  // "flush" is true a still-pending high surrogate is written as a
  // three-byte sequence and "leftOver" is cleared; otherwise the pending
  // surrogate (or '\0') is stored back into "leftOver" so the next call
  // can complete the pair.
  //
  // Throws ArgumentException when the output does not fit in "byteCount"
  // bytes.
  private unsafe static int InternalGetBytes (char* chars, int charCount,
                                              byte* bytes, int byteCount,
                                              ref char leftOver, bool flush)
  {
      int charIndex = 0;
      int length = byteCount;
      int posn = 0;
      int code = 0;
      char pair = leftOver;
      char ch;

      // Convert the characters into bytes.
      while (charCount > 0) {
          // Fetch the next UTF-16 code unit.
          ch = chars [charIndex];
          if (pair == '\0') {
              if (ch < '\uD800' || ch >= '\uE000') {
                  code = ch;
              } else if (ch < '\uDC00') {
                  // Surrogate start: remember it and wait for the tail.
                  pair = ch;
                  ++charIndex;
                  --charCount;
                  continue;
              } else {
                  // Surrogate tail without a leading surrogate: written
                  // out as a three-byte sequence.
                  if ((posn + 3) > length)
                      throw new ArgumentException (_("Arg_InsufficientSpace"), "bytes");
                  bytes [posn++] = (byte) (0xE0 | (ch >> 12));
                  bytes [posn++] = (byte) (0x80 | ((ch >> 6) & 0x3F));
                  bytes [posn++] = (byte) (0x80 | (ch & 0x3F));
                  ++charIndex;
                  --charCount;
                  continue;
              }
          } else {
              if ('\uDC00' <= ch && ch <= '\uDFFF') {
                  // Combine the pending start with this tail.
                  code = 0x10000 + (int) ch - 0xDC00 +
                      (((int) pair - 0xD800) << 10);
              } else {
                  // Surrogate start followed by a non-tail character:
                  // write the lone start and re-visit the current
                  // character without consuming it.
                  if ((posn + 3) > length)
                      throw new ArgumentException (_("Arg_InsufficientSpace"), "bytes");
                  bytes [posn++] = (byte) (0xE0 | (pair >> 12));
                  bytes [posn++] = (byte) (0x80 | ((pair >> 6) & 0x3F));
                  bytes [posn++] = (byte) (0x80 | (pair & 0x3F));
                  pair = '\0';
                  continue;
              }
              pair = '\0';
          }
          ++charIndex;
          --charCount;

          // Encode the code point.
          if (code < 0x0080) {
              if (posn >= length)
                  throw new ArgumentException (_("Arg_InsufficientSpace"), "bytes");
              bytes [posn++] = (byte) code;
          } else if (code < 0x0800) {
              if ((posn + 2) > length)
                  throw new ArgumentException (_("Arg_InsufficientSpace"), "bytes");
              bytes [posn++] = (byte) (0xC0 | (code >> 6));
              bytes [posn++] = (byte) (0x80 | (code & 0x3F));
          } else if (code < 0x10000) {
              if ((posn + 3) > length)
                  throw new ArgumentException (_("Arg_InsufficientSpace"), "bytes");
              bytes [posn++] = (byte) (0xE0 | (code >> 12));
              bytes [posn++] = (byte) (0x80 | ((code >> 6) & 0x3F));
              bytes [posn++] = (byte) (0x80 | (code & 0x3F));
          } else {
              if ((posn + 4) > length)
                  throw new ArgumentException (_("Arg_InsufficientSpace"), "bytes");
              bytes [posn++] = (byte) (0xF0 | (code >> 18));
              bytes [posn++] = (byte) (0x80 | ((code >> 12) & 0x3F));
              bytes [posn++] = (byte) (0x80 | ((code >> 6) & 0x3F));
              bytes [posn++] = (byte) (0x80 | (code & 0x3F));
          }
      }

      if (flush) {
          if (pair != '\0') {
              // Flush the left-over incomplete surrogate.
              if ((posn + 3) > length)
                  throw new ArgumentException (_("Arg_InsufficientSpace"), "bytes");
              bytes [posn++] = (byte) (0xE0 | (pair >> 12));
              bytes [posn++] = (byte) (0x80 | ((pair >> 6) & 0x3F));
              bytes [posn++] = (byte) (0x80 | (pair & 0x3F));
          }
          leftOver = '\0';
      } else {
          // Hand the pending surrogate (or '\0' once it was consumed)
          // back to the caller. Without this store a trailing surrogate
          // start was dropped and an already consumed one was replayed
          // on the next call, disagreeing with InternalGetByteCount.
          leftOver = pair;
      }

      // Return the final count to the caller.
      return posn;
  }
  // Placeholder for an encoder-side fallback taking the output buffer
  // and a lead/tail character pair. It is not implemented: every call
  // throws NotImplementedException.
  private unsafe int Fallback (byte* bytes, int byteCount, char lead, char tail)
  {
      throw new NotImplementedException ();
  }
  // Encodes "charCount" characters of "chars" starting at "charIndex"
  // into "bytes" starting at "byteIndex" and returns the number of bytes
  // written. Encoding starts with no pending surrogate and always
  // flushes at the end.
  public override int GetBytes (char[] chars, int charIndex, int charCount,
                                byte[] bytes, int byteIndex)
  {
      char pending = '\0';
      int written = InternalGetBytes (chars, charIndex, charCount, bytes, byteIndex, ref pending, true);
      return written;
  }
  // String overload of "GetBytes": encodes "charCount" characters of "s"
  // starting at "charIndex" into "bytes" starting at "byteIndex" and
  // returns the number of bytes written.
  //
  // Throws ArgumentNullException for a null "s" or "bytes" and
  // ArgumentOutOfRangeException when an index or count falls outside
  // its string/array.
  public override int GetBytes (String s, int charIndex, int charCount,
                                byte[] bytes, int byteIndex)
  {
      if (s == null)
          throw new ArgumentNullException ("s");
      if (bytes == null)
          throw new ArgumentNullException ("bytes");
      if (charIndex < 0 || charIndex > s.Length)
          throw new ArgumentOutOfRangeException ("charIndex", _("ArgRange_StringIndex"));
      if (charCount < 0 || charCount > (s.Length - charIndex))
          throw new ArgumentOutOfRangeException ("charCount", _("ArgRange_StringRange"));
      if (byteIndex < 0 || byteIndex > bytes.Length)
          throw new ArgumentOutOfRangeException ("byteIndex", _("ArgRange_Array"));

      // Starting at the very end of the string: nothing to encode.
      if (charIndex == s.Length)
          return 0;

      int room = bytes.Length - byteIndex;
      char pending = '\0';
      unsafe {
          fixed (char* text = s) {
              fixed (byte* output = bytes) {
                  return InternalGetBytes (
                      text + charIndex, charCount,
                      output + byteIndex, room,
                      ref pending, true);
              }
          }
      }
  }
  365. #endregion
  // Internal version of "GetCharCount" which can handle a rolling
  // state between multiple calls to this method.
  //
  // Counts the UTF-16 code units produced by decoding "count" bytes of
  // "bytes" starting at "index". "leftOverBits"/"leftOverCount" describe
  // a partially decoded sequence from a previous call: the low nibble of
  // "leftOverCount" is the number of bytes seen so far and the next
  // nibble is the total sequence size. They are passed by value, so this
  // method never changes the caller's state.
  //
  // Invalid input is reported through the fallback (NET_2_0) or, when
  // "throwOnInvalid" is set, through ArgumentException; otherwise it is
  // skipped. Encoded surrogates (U+D800..U+DFFF) are treated as invalid,
  // matching InternalGetChars.
  #if NET_2_0
  private static int InternalGetCharCount (
      byte[] bytes, int index, int count, uint leftOverBits,
      uint leftOverCount, object provider,
      ref DecoderFallbackBuffer fallbackBuffer, bool flush)
  #else
  private static int InternalGetCharCount (
      byte[] bytes, int index, int count, uint leftOverBits,
      uint leftOverCount, bool throwOnInvalid, bool flush)
  #endif
  {
      // Validate the parameters.
      if (bytes == null) {
          throw new ArgumentNullException ("bytes");
      }
      if (index < 0 || index > bytes.Length) {
          throw new ArgumentOutOfRangeException ("index", _("ArgRange_Array"));
      }
      if (count < 0 || count > (bytes.Length - index)) {
          throw new ArgumentOutOfRangeException ("count", _("ArgRange_Array"));
      }

      int length = 0;

      // Fast path: with no sequence pending, leading ASCII bytes map to
      // one character each.
      if (leftOverCount == 0) {
          int end = index + count;
          for (; index < end; index++, count--) {
              if (bytes [index] < 0x80)
                  length++;
              else
                  break;
          }
      }

      // Determine the number of characters that we have.
      uint ch;
      uint leftBits = leftOverBits;
      uint leftSoFar = (leftOverCount & (uint)0x0F);
      uint leftSize = ((leftOverCount >> 4) & (uint)0x0F);
      while (count > 0) {
          ch = (uint)(bytes[index++]);
          --count;
          if (leftSize == 0) {
              // Process a UTF-8 start character.
              if (ch < (uint)0x0080) {
                  // Single-byte UTF-8 character.
                  ++length;
              } else if ((ch & (uint)0xE0) == (uint)0xC0) {
                  // Double-byte UTF-8 character.
                  leftBits = (ch & (uint)0x1F);
                  leftSoFar = 1;
                  leftSize = 2;
              } else if ((ch & (uint)0xF0) == (uint)0xE0) {
                  // Three-byte UTF-8 character.
                  leftBits = (ch & (uint)0x0F);
                  leftSoFar = 1;
                  leftSize = 3;
              } else if ((ch & (uint)0xF8) == (uint)0xF0) {
                  // Four-byte UTF-8 character.
                  leftBits = (ch & (uint)0x07);
                  leftSoFar = 1;
                  leftSize = 4;
              } else if ((ch & (uint)0xFC) == (uint)0xF8) {
                  // Five-byte UTF-8 character.
                  leftBits = (ch & (uint)0x03);
                  leftSoFar = 1;
                  leftSize = 5;
              } else if ((ch & (uint)0xFE) == (uint)0xFC) {
                  // Six-byte UTF-8 character.
                  leftBits = (ch & (uint)0x03);
                  leftSoFar = 1;
                  leftSize = 6;
              } else {
                  // Invalid UTF-8 start character.
  #if NET_2_0
                  length += Fallback (provider, ref fallbackBuffer, bytes, index - 1);
  #else
                  if (throwOnInvalid)
                      throw new ArgumentException (_("Arg_InvalidUTF8"), "bytes");
  #endif
              }
          } else {
              // Process an extra byte in a multi-byte sequence.
              if ((ch & (uint)0xC0) == (uint)0x80) {
                  leftBits = ((leftBits << 6) | (ch & (uint)0x3F));
                  if (++leftSoFar >= leftSize) {
                      // We have a complete character now.
                      if (leftBits < (uint)0x10000) {
                          // Is it an overlong form, i.e. a value that
                          // fits in a shorter sequence?
                          bool overlong = false;
                          switch (leftSize) {
                          case 2:
                              overlong = (leftBits <= 0x7F);
                              break;
                          case 3:
                              overlong = (leftBits <= 0x07FF);
                              break;
                          case 4:
                              overlong = (leftBits <= 0xFFFF);
                              break;
                          case 5:
                              overlong = (leftBits <= 0x1FFFFF);
                              break;
                          case 6:
                              overlong = (leftBits <= 0x03FFFFFF);
                              break;
                          }
                          if (overlong) {
  #if NET_2_0
                              length += Fallback (provider, ref fallbackBuffer, bytes, index - 1);
  #else
                              if (throwOnInvalid)
                                  throw new ArgumentException (_("Overlong"), leftBits.ToString ());
  #endif
                          }
                          else if ((leftBits & 0xF800) == 0xD800) {
                              // UTF-8 doesn't use surrogate characters.
                              // InternalGetChars rejects them, so they
                              // must not be counted as a character here.
  #if NET_2_0
                              length += Fallback (provider, ref fallbackBuffer, bytes, index - 1);
  #else
                              if (throwOnInvalid)
                                  throw new ArgumentException (_("Arg_InvalidUTF8"), "bytes");
  #endif
                          }
                          else
                              ++length;
                      } else if (leftBits < (uint)0x110000) {
                          // Supplementary character: one surrogate pair.
                          length += 2;
                      } else {
  #if NET_2_0
                          length += Fallback (provider, ref fallbackBuffer, bytes, index - 1);
  #else
                          if (throwOnInvalid)
                              throw new ArgumentException (_("Arg_InvalidUTF8"), "bytes");
  #endif
                      }
                      leftSize = 0;
                  }
              } else {
                  // Invalid UTF-8 sequence: clear and restart.
  #if NET_2_0
                  length += Fallback (provider, ref fallbackBuffer, bytes, index - 1);
  #else
                  if (throwOnInvalid)
                      throw new ArgumentException (_("Arg_InvalidUTF8"), "bytes");
  #endif
                  leftSize = 0;
                  // Re-visit the current byte as a start character.
                  --index;
                  ++count;
              }
          }
      }
      if (flush && leftSize != 0) {
          // We had left-over bytes that didn't make up
          // a complete UTF-8 character sequence.
  #if NET_2_0
          length += Fallback (provider, ref fallbackBuffer, bytes, index - 1);
  #else
          if (throwOnInvalid)
              throw new ArgumentException (_("Arg_InvalidUTF8"), "bytes");
  #endif
      }

      // Return the final length to the caller.
      return length;
  }
  523. #if NET_2_0
  // for GetCharCount()
  //
  // Reports an invalid byte to the decoder fallback and returns how many
  // replacement characters it would produce. "buffer" is created on
  // first use: from "provider" when it is a DecoderFallback, otherwise
  // from the FallbackBuffer of "provider" cast to Decoder.
  static int Fallback (object provider, ref DecoderFallbackBuffer buffer, byte [] bytes, int index)
  {
      if (buffer == null) {
          DecoderFallback fb = provider as DecoderFallback;
          if (fb != null)
              buffer = fb.CreateFallbackBuffer ();
          else
              buffer = ((Decoder) provider).FallbackBuffer;
      }
      buffer.Fallback (bytes, index - 1);
      int replacementCount = buffer.Remaining;
      // Counting never reads the replacement characters, so discard them
      // here. Leaving them pending would make the next Fallback call on
      // the same buffer a re-entrant one.
      buffer.Reset ();
      return replacementCount;
  }
  // for GetChars()
  //
  // Reports an invalid byte to the decoder fallback and appends the
  // replacement characters to "chars", advancing "charIndex". "buffer"
  // is created on first use: from "provider" when it is a
  // DecoderFallback, otherwise from the FallbackBuffer of "provider"
  // cast to Decoder.
  //
  // Throws ArgumentException when "chars" has no room left for a
  // replacement character.
  static void Fallback (object provider, ref DecoderFallbackBuffer buffer, byte [] bytes, int byteIndex,
                        char [] chars, ref int charIndex)
  {
      if (buffer == null) {
          DecoderFallback fb = provider as DecoderFallback;
          if (fb != null)
              buffer = fb.CreateFallbackBuffer ();
          else
              buffer = ((Decoder) provider).FallbackBuffer;
      }
      buffer.Fallback (bytes, byteIndex - 1);
      while (buffer.Remaining > 0) {
          // Report a full output buffer the same way the decoder does
          // instead of running off the end of the array.
          if (charIndex >= chars.Length)
              throw new ArgumentException (_("Arg_InsufficientSpace"), "chars");
          chars [charIndex++] = buffer.GetNextChar ();
      }
  }
  552. #endif
  // Returns the number of characters produced by decoding "count" bytes
  // of "bytes" starting at "index". Decoding starts with no pending
  // sequence and always flushes at the end.
  public override int GetCharCount (byte[] bytes, int index, int count)
  {
      int total;
  #if NET_2_0
      DecoderFallbackBuffer scratch = null;
      total = InternalGetCharCount (bytes, index, count, 0, 0, DecoderFallback, ref scratch, true);
  #else
      total = InternalGetCharCount (bytes, index, count, 0, 0, throwOnInvalid, true);
  #endif
      return total;
  }
  // Decodes "byteCount" bytes of "bytes" starting at "byteIndex" into
  // "chars" starting at "charIndex" and returns the number of characters
  // written.
  //
  // "leftOverBits"/"leftOverCount" carry a partially decoded sequence
  // between calls: the low nibble of "leftOverCount" is the number of
  // bytes seen so far and the next nibble is the total sequence size.
  // Both are updated on return.
  //
  // Invalid input is reported through the fallback (NET_2_0) or, when
  // "throwOnInvalid" is set, through ArgumentException; otherwise it is
  // skipped. ArgumentException is also thrown when "chars" is too small.
  #if NET_2_0
  private static int InternalGetChars (
      byte[] bytes, int byteIndex, int byteCount, char[] chars,
      int charIndex, ref uint leftOverBits, ref uint leftOverCount,
      object provider,
      ref DecoderFallbackBuffer fallbackBuffer, bool flush)
  #else
  private static int InternalGetChars (
      byte[] bytes, int byteIndex, int byteCount, char[] chars,
      int charIndex, ref uint leftOverBits, ref uint leftOverCount,
      bool throwOnInvalid, bool flush)
  #endif
  {
      // Validate the parameters.
      if (bytes == null) {
          throw new ArgumentNullException ("bytes");
      }
      if (chars == null) {
          throw new ArgumentNullException ("chars");
      }
      if (byteIndex < 0 || byteIndex > bytes.Length) {
          throw new ArgumentOutOfRangeException ("byteIndex", _("ArgRange_Array"));
      }
      if (byteCount < 0 || byteCount > (bytes.Length - byteIndex)) {
          throw new ArgumentOutOfRangeException ("byteCount", _("ArgRange_Array"));
      }
      if (charIndex < 0 || charIndex > chars.Length) {
          throw new ArgumentOutOfRangeException ("charIndex", _("ArgRange_Array"));
      }

      // NOTE(review): with no room at all in "chars" the input is not
      // examined and the rolling state is left untouched.
      if (charIndex == chars.Length)
          return 0;

      int posn = charIndex;
      int length = chars.Length;

      // Fast path: with no sequence pending, copy leading ASCII bytes
      // straight across.
      if (leftOverCount == 0) {
          int end = byteIndex + byteCount;
          for (; byteIndex < end; posn++, byteIndex++, byteCount--) {
              if (bytes [byteIndex] >= 0x80)
                  break;
              // Report a full output buffer like the slow path below
              // does, rather than indexing past the end of the array.
              if (posn >= length)
                  throw new ArgumentException (_("Arg_InsufficientSpace"), "chars");
              chars [posn] = (char) bytes [byteIndex];
          }
      }

      // Convert the bytes into the output buffer.
      uint ch;
      uint leftBits = leftOverBits;
      uint leftSoFar = (leftOverCount & (uint)0x0F);
      uint leftSize = ((leftOverCount >> 4) & (uint)0x0F);

      int byteEnd = byteIndex + byteCount;
      if (byteEnd < 0 || byteEnd > bytes.Length)
          throw new SystemException (String.Format ("INTERNAL ERROR: should not happen: {0} {1} {2}", byteIndex, byteCount, byteEnd));

      for(; byteIndex < byteEnd; byteIndex++) {
          // Fetch the next character from the byte buffer.
          ch = (uint)(bytes[byteIndex]);
          if (leftSize == 0) {
              // Process a UTF-8 start character.
              if (ch < (uint)0x0080) {
                  // Single-byte UTF-8 character.
                  if (posn >= length) {
                      throw new ArgumentException (_("Arg_InsufficientSpace"), "chars");
                  }
                  chars[posn++] = (char)ch;
              } else if ((ch & (uint)0xE0) == (uint)0xC0) {
                  // Double-byte UTF-8 character.
                  leftBits = (ch & (uint)0x1F);
                  leftSoFar = 1;
                  leftSize = 2;
              } else if ((ch & (uint)0xF0) == (uint)0xE0) {
                  // Three-byte UTF-8 character.
                  leftBits = (ch & (uint)0x0F);
                  leftSoFar = 1;
                  leftSize = 3;
              } else if ((ch & (uint)0xF8) == (uint)0xF0) {
                  // Four-byte UTF-8 character.
                  leftBits = (ch & (uint)0x07);
                  leftSoFar = 1;
                  leftSize = 4;
              } else if ((ch & (uint)0xFC) == (uint)0xF8) {
                  // Five-byte UTF-8 character.
                  leftBits = (ch & (uint)0x03);
                  leftSoFar = 1;
                  leftSize = 5;
              } else if ((ch & (uint)0xFE) == (uint)0xFC) {
                  // Six-byte UTF-8 character.
                  leftBits = (ch & (uint)0x03);
                  leftSoFar = 1;
                  leftSize = 6;
              } else {
                  // Invalid UTF-8 start character.
  #if NET_2_0
                  Fallback (provider, ref fallbackBuffer, bytes, byteIndex, chars, ref posn);
  #else
                  if (throwOnInvalid)
                      throw new ArgumentException (_("Arg_InvalidUTF8"), "bytes");
  #endif
              }
          } else {
              // Process an extra byte in a multi-byte sequence.
              if ((ch & (uint)0xC0) == (uint)0x80) {
                  leftBits = ((leftBits << 6) | (ch & (uint)0x3F));
                  if (++leftSoFar >= leftSize) {
                      // We have a complete character now.
                      if (leftBits < (uint)0x10000) {
                          // Is it an overlong form, i.e. a value that
                          // fits in a shorter sequence?
                          bool overlong = false;
                          switch (leftSize) {
                          case 2:
                              overlong = (leftBits <= 0x7F);
                              break;
                          case 3:
                              overlong = (leftBits <= 0x07FF);
                              break;
                          case 4:
                              overlong = (leftBits <= 0xFFFF);
                              break;
                          case 5:
                              overlong = (leftBits <= 0x1FFFFF);
                              break;
                          case 6:
                              overlong = (leftBits <= 0x03FFFFFF);
                              break;
                          }
                          if (overlong) {
  #if NET_2_0
                              Fallback (provider, ref fallbackBuffer, bytes, byteIndex, chars, ref posn);
  #else
                              if (throwOnInvalid)
                                  throw new ArgumentException (_("Overlong"), leftBits.ToString ());
  #endif
                          }
                          else if ((leftBits & 0xF800) == 0xD800) {
                              // UTF-8 doesn't use surrogate characters
  #if NET_2_0
                              Fallback (provider, ref fallbackBuffer, bytes, byteIndex, chars, ref posn);
  #else
                              if (throwOnInvalid)
                                  throw new ArgumentException (_("Arg_InvalidUTF8"), "bytes");
  #endif
                          }
                          else {
                              if (posn >= length) {
                                  throw new ArgumentException
                                      (_("Arg_InsufficientSpace"), "chars");
                              }
                              chars[posn++] = (char)leftBits;
                          }
                      } else if (leftBits < (uint)0x110000) {
                          // Supplementary character: emit a surrogate pair.
                          if ((posn + 2) > length) {
                              throw new ArgumentException
                                  (_("Arg_InsufficientSpace"), "chars");
                          }
                          leftBits -= (uint)0x10000;
                          chars[posn++] = (char)((leftBits >> 10) +
                                                 (uint)0xD800);
                          chars[posn++] =
                              (char)((leftBits & (uint)0x3FF) + (uint)0xDC00);
                      } else {
  #if NET_2_0
                          Fallback (provider, ref fallbackBuffer, bytes, byteIndex, chars, ref posn);
  #else
                          if (throwOnInvalid)
                              throw new ArgumentException (_("Arg_InvalidUTF8"), "bytes");
  #endif
                      }
                      leftSize = 0;
                  }
              } else {
                  // Invalid UTF-8 sequence: clear and restart.
  #if NET_2_0
                  Fallback (provider, ref fallbackBuffer, bytes, byteIndex, chars, ref posn);
  #else
                  if (throwOnInvalid)
                      throw new ArgumentException (_("Arg_InvalidUTF8"), "bytes");
  #endif
                  leftSize = 0;
                  // Step back so the loop increment re-visits this byte
                  // as a start character.
                  --byteIndex;
              }
          }
      }
      if (flush && leftSize != 0) {
          // We had left-over bytes that didn't make up
          // a complete UTF-8 character sequence.
  #if NET_2_0
          Fallback (provider, ref fallbackBuffer, bytes, byteIndex, chars, ref posn);
  #else
          if (throwOnInvalid)
              throw new ArgumentException (_("Arg_InvalidUTF8"), "bytes");
  #endif
      }

      // Save the rolling state for the next call.
      leftOverBits = leftBits;
      leftOverCount = (leftSoFar | (leftSize << 4));

      // Return the final length to the caller.
      return posn - charIndex;
  }
  // Decodes "byteCount" bytes of "bytes" starting at "byteIndex" into
  // "chars" starting at "charIndex" and returns the number of characters
  // written. Decoding starts with no pending sequence and always flushes
  // at the end.
  public override int GetChars (byte[] bytes, int byteIndex, int byteCount,
                                char[] chars, int charIndex)
  {
      uint pendingBits = 0;
      uint pendingCount = 0;
      int written;
  #if NET_2_0
      DecoderFallbackBuffer scratch = null;
      written = InternalGetChars (bytes, byteIndex, byteCount, chars,
          charIndex, ref pendingBits, ref pendingCount, DecoderFallback, ref scratch, true);
  #else
      written = InternalGetChars (bytes, byteIndex, byteCount, chars,
          charIndex, ref pendingBits, ref pendingCount, throwOnInvalid, true);
  #endif
      return written;
  }
  // Returns an upper bound on the number of bytes needed to encode
  // "charCount" characters; this implementation allows four bytes per
  // character.
  //
  // Throws ArgumentOutOfRangeException when "charCount" is negative or
  // when the bound does not fit in an int.
  public override int GetMaxByteCount (int charCount)
  {
      if (charCount < 0) {
          throw new ArgumentOutOfRangeException ("charCount", _("ArgRange_NonNegative"));
      }
      // Multiply in 64 bits: an int multiplication would silently wrap
      // around for large counts and hand back a negative or too-small
      // bound.
      long maxBytes = (long) charCount * 4;
      if (maxBytes > Int32.MaxValue) {
          throw new ArgumentOutOfRangeException ("charCount");
      }
      return (int) maxBytes;
  }
  // Returns an upper bound on the number of characters produced by
  // decoding "byteCount" bytes; this implementation allows one
  // character per byte.
  // Throws ArgumentOutOfRangeException when "byteCount" is negative.
  public override int GetMaxCharCount (int byteCount)
  {
      if (byteCount >= 0)
          return byteCount;
      throw new ArgumentOutOfRangeException ("byteCount", _("ArgRange_NonNegative"));
  }
  // Creates a new UTF8Decoder configured with this encoding's
  // invalid-input policy (the decoder fallback under NET_2_0, the
  // "throwOnInvalid" flag otherwise).
  public override Decoder GetDecoder ()
  {
      Decoder decoder;
  #if NET_2_0
      decoder = new UTF8Decoder (DecoderFallback);
  #else
      decoder = new UTF8Decoder (throwOnInvalid);
  #endif
      return decoder;
  }
  // Creates a new UTF8Encoder that is handed this encoding's
  // "emitIdentifier" flag.
  public override Encoder GetEncoder ()
  {
      Encoder encoder = new UTF8Encoder (emitIdentifier);
      return encoder;
  }
  // Returns the UTF-8 preamble: the three bytes EF BB BF when this
  // encoding was created with the identifier enabled, otherwise an empty
  // array. A new array is returned on every call.
  public override byte[] GetPreamble ()
  {
      if (!emitIdentifier)
          return new byte [0];

      byte[] preamble = new byte [3];
      preamble [0] = (byte) 0xEF;
      preamble [1] = (byte) 0xBB;
      preamble [2] = (byte) 0xBF;
      return preamble;
  }
  // Determines whether "value" is a UTF8Encoding configured like this
  // one: same code page, same identifier flag and the same invalid-input
  // policy (decoder/encoder fallbacks under NET_2_0, the
  // "throwOnInvalid" flag otherwise). Returns false for null or for any
  // other type.
  public override bool Equals (Object value)
  {
      UTF8Encoding enc = (value as UTF8Encoding);
      if (enc == null)
          return false;
  #if NET_2_0
      // Compare the fallbacks by value, not by reference: the
      // constructor allocates a fresh fallback object for every
      // encoding, so a reference comparison would report two identically
      // configured encodings as different. Object.Equals also copes with
      // a null fallback on either side.
      return (codePage == enc.codePage &&
              emitIdentifier == enc.emitIdentifier &&
              Object.Equals (DecoderFallback, enc.DecoderFallback) &&
              Object.Equals (EncoderFallback, enc.EncoderFallback));
  #else
      return (codePage == enc.codePage &&
              emitIdentifier == enc.emitIdentifier &&
              throwOnInvalid == enc.throwOnInvalid);
  #endif
  }
  // Returns the hash code computed by the base class; no UTF-8 specific
  // state is mixed in.
  public override int GetHashCode ()
  {
      int hash = base.GetHashCode ();
      return hash;
  }
  // Encodes the whole string into a newly allocated, exactly sized byte
  // array.
  // Throws ArgumentNullException when "s" is null.
  public override byte [] GetBytes (String s)
  {
      if (s == null)
          throw new ArgumentNullException ("s");

      // First pass sizes the buffer, second pass fills it.
      byte [] encoded = new byte [GetByteCount (s)];
      GetBytes (s, 0, s.Length, encoded, 0);
      return encoded;
  }
  // UTF-8 decoder implementation.
  //
  // Keeps the partially decoded multi-byte sequence ("leftOverBits" and
  // "leftOverCount") between calls so input may be supplied in pieces.
  // Neither method flushes.
  [Serializable]
  private class UTF8Decoder : Decoder
  {
  #if !NET_2_0
      private bool throwOnInvalid;
  #endif
      private uint leftOverBits;
      private uint leftOverCount;

      // Constructor: records the invalid-input policy and starts with no
      // pending sequence.
  #if NET_2_0
      public UTF8Decoder (DecoderFallback fallback)
  #else
      public UTF8Decoder (bool throwOnInvalid)
  #endif
      {
          leftOverBits = 0;
          leftOverCount = 0;
  #if NET_2_0
          Fallback = fallback;
  #else
          this.throwOnInvalid = throwOnInvalid;
  #endif
      }

      // Counts the characters the given bytes would decode to, starting
      // from the current pending state. The state is passed by value and
      // so is not modified.
      public override int GetCharCount (byte[] bytes, int index, int count)
      {
          int total;
  #if NET_2_0
          DecoderFallbackBuffer scratch = null;
          total = InternalGetCharCount (bytes, index, count,
              leftOverBits, leftOverCount, this, ref scratch, false);
  #else
          total = InternalGetCharCount (bytes, index, count,
              leftOverBits, leftOverCount, throwOnInvalid, false);
  #endif
          return total;
      }

      // Decodes the given bytes into "chars" and updates the pending
      // state for the next call. Returns the number of characters
      // written.
      public override int GetChars (byte[] bytes, int byteIndex,
                                    int byteCount, char[] chars, int charIndex)
      {
          int written;
  #if NET_2_0
          DecoderFallbackBuffer scratch = null;
          written = InternalGetChars (bytes, byteIndex, byteCount,
              chars, charIndex, ref leftOverBits, ref leftOverCount, this, ref scratch, false);
  #else
          written = InternalGetChars (bytes, byteIndex, byteCount,
              chars, charIndex, ref leftOverBits, ref leftOverCount, throwOnInvalid, false);
  #endif
          return written;
      }
  } // class UTF8Decoder
  // UTF-8 encoder implementation.
  //
  // Keeps a pending surrogate start between calls so input may be
  // supplied in pieces. Counting and converting each track their own
  // pending character ("leftOverForCount" / "leftOverForConv").
  [Serializable]
  private class UTF8Encoder : Encoder
  {
      private bool emitIdentifier;
      private char leftOverForCount;
      private char leftOverForConv;

      // Constructor: stores the identifier flag and starts with nothing
      // pending.
      public UTF8Encoder (bool emitIdentifier)
      {
          leftOverForConv = '\0';
          leftOverForCount = '\0';
          this.emitIdentifier = emitIdentifier;
      }

      // Counts the bytes needed for the given characters, continuing
      // from the counting-side pending character.
      public override int GetByteCount (char[] chars, int index,
                                        int count, bool flush)
      {
          int total = InternalGetByteCount (chars, index, count, ref leftOverForCount, flush);
          return total;
      }

      // Encodes the given characters into "bytes", continuing from the
      // conversion-side pending character. Clears "emitIdentifier" once
      // the conversion has returned.
      public override int GetBytes (char[] chars, int charIndex,
                                    int charCount, byte[] bytes, int byteIndex, bool flush)
      {
          int written = InternalGetBytes (chars, charIndex, charCount, bytes, byteIndex, ref leftOverForConv, flush);
          emitIdentifier = false;
          return written;
      }

  #if NET_2_0
      // Pointer-based variant of GetByteCount.
      public unsafe override int GetByteCount (char* chars, int count, bool flush)
      {
          int total = InternalGetByteCount (chars, count, ref leftOverForCount, flush);
          return total;
      }

      // Pointer-based variant of GetBytes; "byteCount" is the capacity
      // of "bytes".
      public unsafe override int GetBytes (char* chars, int charCount,
                                           byte* bytes, int byteCount, bool flush)
      {
          int written = InternalGetBytes (chars, charCount, bytes, byteCount, ref leftOverForConv, flush);
          emitIdentifier = false;
          return written;
      }
  #endif
  } // class UTF8Encoder
  942. }; // class UTF8Encoding
  943. }; // namespace System.Text