UTF8Encoding.cs 31 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007
  1. /*
  2. * UTF8Encoding.cs - Implementation of the "System.Text.UTF8Encoding" class.
  3. *
  4. * Copyright (c) 2001, 2002 Southern Storm Software, Pty Ltd
  5. * Copyright (C) 2004 Novell, Inc (http://www.novell.com)
  6. *
  7. * Permission is hereby granted, free of charge, to any person obtaining
  8. * a copy of this software and associated documentation files (the "Software"),
  9. * to deal in the Software without restriction, including without limitation
  10. * the rights to use, copy, modify, merge, publish, distribute, sublicense,
  11. * and/or sell copies of the Software, and to permit persons to whom the
  12. * Software is furnished to do so, subject to the following conditions:
  13. *
  14. * The above copyright notice and this permission notice shall be included
  15. * in all copies or substantial portions of the Software.
  16. *
  17. * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
  18. * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  19. * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
  20. * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
  21. * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
  22. * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
  23. * OTHER DEALINGS IN THE SOFTWARE.
  24. */
  25. namespace System.Text
  26. {
  27. using System;
  28. using System.Runtime.InteropServices;
  29. [Serializable]
  30. [MonoLimitation ("Serialization format not compatible with .NET")]
  31. [ComVisible (true)]
  32. public class UTF8Encoding : Encoding
  33. {
  34. // Magic number used by Windows for UTF-8.
  35. internal const int UTF8_CODE_PAGE = 65001;
  36. // Internal state.
  37. private bool emitIdentifier;
  38. // Constructors.
  // Default constructor: forwards to the two-argument constructor
  // with both flags set to false.
  public UTF8Encoding ()
      : this (false, false)
  {
  }
  // Forwards to the two-argument constructor, passing false for
  // throwOnInvalidBytes.
  public UTF8Encoding (bool encoderShouldEmitUTF8Identifier)
      : this (encoderShouldEmitUTF8Identifier, false)
  {
  }
  // Main constructor.  Hands the UTF-8 code page to the base class,
  // remembers whether the identifier should be emitted, installs the
  // encoder/decoder fallback pair and fills in the descriptive fields.
  //
  // throwOnInvalidBytes: true selects the exception fallbacks, false
  // selects the "standard safe" fallbacks.
  public UTF8Encoding (bool encoderShouldEmitUTF8Identifier, bool throwOnInvalidBytes)
      : base (UTF8_CODE_PAGE)
  {
      emitIdentifier = encoderShouldEmitUTF8Identifier;

      if (!throwOnInvalidBytes) {
          SetFallbackInternal (EncoderFallback.StandardSafeFallback, DecoderFallback.StandardSafeFallback);
      } else {
          SetFallbackInternal (EncoderFallback.ExceptionFallback, DecoderFallback.ExceptionFallback);
      }

      // Names reported for this encoding.
      header_name = "utf-8";
      body_name = "utf-8";
      web_name = "utf-8";
      encoding_name = "Unicode (UTF-8)";

      // Usage flags.
      is_browser_save = true;
      is_browser_display = true;
      is_mail_news_display = true;
      is_mail_news_save = true;

      windows_code_page = UnicodeEncoding.UNICODE_CODE_PAGE;
  }
  58. #region GetByteCount()
  // Array-based front end of the byte counter.  It validates the
  // arguments, answers the "positioned at the end of the array" case
  // directly and otherwise pins the array and defers to the
  // pointer-based overload.
  //
  // leftOver carries a pending surrogate start between calls; when
  // flush is true a pending surrogate is counted as 3 bytes and
  // cleared.
  //
  // Throws ArgumentNullException when chars is null and
  // ArgumentOutOfRangeException when index/count fall outside chars.
  private static int InternalGetByteCount (char[] chars, int index, int count, EncoderFallback fallback, ref char leftOver, bool flush)
  {
      if (chars == null)
          throw new ArgumentNullException ("chars");
      if (index < 0 || index > chars.Length)
          throw new ArgumentOutOfRangeException ("index", _("ArgRange_Array"));
      if (count < 0 || count > (chars.Length - index))
          throw new ArgumentOutOfRangeException ("count", _("ArgRange_Array"));

      if (index != chars.Length) {
          unsafe {
              fixed (char* pinned = chars) {
                  return InternalGetByteCount (pinned + index, count, fallback, ref leftOver, flush);
              }
          }
      }

      // Nothing left to read: only a pending surrogate can contribute.
      if (!flush || leftOver == '\0')
          return 0;
      leftOver = '\0';
      return 3;
  }
  // Pointer-based byte counter.  Walks "count" UTF-16 code units
  // starting at "chars" and returns how many UTF-8 bytes they need.
  //
  // Sizes: below 0x80 -> 1 byte, below 0x800 -> 2 bytes, other
  // non-surrogates -> 3 bytes, a complete surrogate pair -> 4 bytes.
  // A surrogate tail with no start, or a surrogate start followed by
  // something that is not a tail, is replaced by the output of the
  // encoder fallback, which is counted recursively.
  //
  // leftOver holds a surrogate start still waiting for its tail (it
  // may already be set on entry).  With flush set, a start that is
  // still pending at the end is counted as 3 bytes and cleared.
  private unsafe static int InternalGetByteCount (char* chars, int count, EncoderFallback fallback, ref char leftOver, bool flush)
  {
      int total = 0;
      char* origin = chars;
      char* limit = chars + count;
      EncoderFallbackBuffer fallbackBuffer = null;

      while (chars < limit) {
          if (leftOver != 0) {
              // A surrogate start is pending; look at what follows it.
              if (*chars >= '\uDC00' && *chars <= '\uDFFF') {
                  // Correct pair.
                  total += 4;
                  chars++;
              } else {
                  // Start followed by a non-tail: count the fallback
                  // output and revisit the current character.
                  char [] substitute = GetFallbackChars (chars, origin, fallback, ref fallbackBuffer);
                  fixed (char* sp = substitute) {
                      char ignored = '\0';
                      total += InternalGetByteCount (sp, substitute.Length, fallback, ref ignored, true);
                  }
              }
              leftOver = '\0';
              continue;
          }

          // No pending surrogate: consume characters until the input
          // ends or an unpaired surrogate start is met.
          while (chars < limit) {
              char c = *chars;
              if (c < '\x80') {
                  total += 1;
              } else if (c < '\x800') {
                  total += 2;
              } else if (c < '\uD800' || c > '\uDFFF') {
                  total += 3;
              } else if (c <= '\uDBFF') {
                  if (chars + 1 < limit && chars [1] >= '\uDC00' && chars [1] <= '\uDFFF') {
                      // The tail is right behind it: take both at once.
                      total += 4;
                      chars += 2;
                      continue;
                  }
                  // Remember the start and let the outer loop decide.
                  leftOver = c;
                  chars++;
                  break;
              } else {
                  // Surrogate tail without a leading surrogate.
                  char [] replacement = GetFallbackChars (chars, origin, fallback, ref fallbackBuffer);
                  fixed (char* rp = replacement) {
                      char scratch = '\0';
                      total += InternalGetByteCount (rp, replacement.Length, fallback, ref scratch, true);
                  }
                  leftOver = '\0';
              }
              chars++;
          }
      }

      if (flush && leftOver != '\0') {
          // Flush the left-over surrogate pair start.
          total += 3;
          leftOver = '\0';
      }
      return total;
  }
  // Runs the encoder fallback for the character at "chars" and returns
  // everything the fallback produced as a new array.
  //
  // The fallback buffer is created from "fallback" on first use and
  // handed back through "buffer" so that later calls reuse it; it is
  // reset before returning.  The position reported to the fallback is
  // the offset of "chars" from "start".
  unsafe static char [] GetFallbackChars (char *chars, char *start, EncoderFallback fallback, ref EncoderFallbackBuffer buffer)
  {
      if (buffer == null) {
          buffer = fallback.CreateFallbackBuffer ();
      }

      int offset = (int) (chars - start);
      buffer.Fallback (*chars, offset);

      char [] produced = new char [buffer.Remaining];
      int next = 0;
      while (next < produced.Length) {
          produced [next] = buffer.GetNextChar ();
          next++;
      }

      buffer.Reset ();
      return produced;
  }
  // Returns the number of bytes needed to encode "count" characters
  // of "chars" starting at "index".  This is a one-shot call: it
  // starts with no pending surrogate and always flushes.
  public override int GetByteCount (char[] chars, int index, int count)
  {
      char pendingSurrogate = '\0';
      int byteCount = InternalGetByteCount (chars, index, count, EncoderFallback, ref pendingSurrogate, true);
      return byteCount;
  }
  // Returns the number of bytes needed to encode "count" characters
  // starting at the pointer "chars".  One-shot call: starts with no
  // pending surrogate and always flushes.
  //
  // Throws ArgumentNullException when chars is null and
  // ArgumentOutOfRangeException when count is negative.
  [CLSCompliant (false)]
  [ComVisible (false)]
  public unsafe override int GetByteCount (char* chars, int count)
  {
      if (chars == null)
          throw new ArgumentNullException ("chars");
      // A negative count used to fall through and be reported as
      // "0 bytes"; reject it like the array overload does.
      if (count < 0)
          throw new ArgumentOutOfRangeException ("count", _("ArgRange_NonNegative"));
      if (count == 0)
          return 0;
      char dummy = '\0';
      return InternalGetByteCount (chars, count, EncoderFallback, ref dummy, true);
  }
  183. #endregion
  184. #region GetBytes()
  // Array-based front end of the encoder.  It validates the arguments
  // and then pins both arrays and defers to the pointer-based
  // overload, which can keep rolling state (leftOver and the fallback
  // buffer) between calls.
  //
  // Returns the number of bytes written to "bytes" from byteIndex on.
  // Throws ArgumentNullException for null arrays and
  // ArgumentOutOfRangeException for indexes/counts outside the arrays.
  private static int InternalGetBytes (char[] chars, int charIndex,
      int charCount, byte[] bytes,
      int byteIndex,
      EncoderFallback fallback, ref EncoderFallbackBuffer buffer,
      ref char leftOver, bool flush)
  {
      if (chars == null)
          throw new ArgumentNullException ("chars");
      if (bytes == null)
          throw new ArgumentNullException ("bytes");
      if (charIndex < 0 || charIndex > chars.Length)
          throw new ArgumentOutOfRangeException ("charIndex", _("ArgRange_Array"));
      if (charCount < 0 || charCount > (chars.Length - charIndex))
          throw new ArgumentOutOfRangeException ("charCount", _("ArgRange_Array"));
      if (byteIndex < 0 || byteIndex > bytes.Length)
          throw new ArgumentOutOfRangeException ("byteIndex", _("ArgRange_Array"));

      if (charIndex == chars.Length) {
          // Positioned at the end of the input: nothing is written.
          // FIXME: use EncoderFallback.
          //
          // By default it is empty, so a pending surrogate is simply
          // dropped for now.
          if (flush && leftOver != '\0')
              leftOver = '\0';
          return 0;
      }

      unsafe {
          fixed (char* src = chars) {
              if (bytes.Length != byteIndex) {
                  fixed (byte* dst = bytes) {
                      return InternalGetBytes (
                          src + charIndex, charCount,
                          dst + byteIndex, bytes.Length - byteIndex,
                          fallback, ref buffer,
                          ref leftOver, flush);
                  }
              }
              // No room at all in the output: pass a null/zero-length
              // destination to the pointer overload.
              return InternalGetBytes (
                  src + charIndex, charCount,
                  null, 0, fallback, ref buffer, ref leftOver, flush);
          }
      }
  }
  // Pointer-based encoder core.  Encodes "count" UTF-16 code units
  // from "chars" into at most "bcount" bytes at "bytes" and returns
  // the number of bytes written.
  //
  // leftOver holds a surrogate start waiting for its tail and may be
  // set on entry.  "buffer" is the lazily created encoder fallback
  // buffer, shared with the caller so it can be reused.  With flush
  // set, a surrogate start still pending at the end is written out as
  // a 3-byte sequence.
  //
  // Throws ArgumentException ("Insufficient Space") as soon as the
  // output does not fit; bytes may be null only when bcount is 0, in
  // which case any attempt to write fails that way.
  private unsafe static int InternalGetBytes (char* chars, int count, byte* bytes, int bcount, EncoderFallback fallback, ref EncoderFallbackBuffer buffer, ref char leftOver, bool flush)
  {
      char* end = chars + count;
      char* start = chars;
      byte* start_bytes = bytes;
      byte* end_bytes = bytes + bcount;
      while (chars < end) {
          if (leftOver == 0) {
              for (; chars < end; chars++) {
                  int ch = *chars;
                  if (ch < '\x80') {
                      if (bytes >= end_bytes)
                          goto fail_no_space;
                      *bytes++ = (byte)ch;
                  } else if (ch < '\x800') {
                      if (bytes + 1 >= end_bytes)
                          goto fail_no_space;
                      bytes [0] = (byte) (0xC0 | (ch >> 6));
                      bytes [1] = (byte) (0x80 | (ch & 0x3F));
                      bytes += 2;
                  } else if (ch < '\uD800' || ch > '\uDFFF') {
                      if (bytes + 2 >= end_bytes)
                          goto fail_no_space;
                      bytes [0] = (byte) (0xE0 | (ch >> 12));
                      bytes [1] = (byte) (0x80 | ((ch >> 6) & 0x3F));
                      bytes [2] = (byte) (0x80 | (ch & 0x3F));
                      bytes += 3;
                  } else if (ch <= '\uDBFF') {
                      // This is a surrogate start: remember it and let
                      // the outer loop look at the next character.
                      leftOver = *chars;
                      chars++;
                      break;
                  } else {
                      // We have a surrogate tail without leading
                      // surrogate: encode the fallback output instead.
                      char [] fallback_chars = GetFallbackChars (chars, start, fallback, ref buffer);
                      char dummy = '\0';
                      if (bytes + InternalGetByteCount (fallback_chars, 0, fallback_chars.Length, fallback, ref dummy, true) > end_bytes)
                          goto fail_no_space;
                      fixed (char *fb_chars = fallback_chars) {
                          bytes += InternalGetBytes (fb_chars, fallback_chars.Length, bytes, bcount - (int) (bytes - start_bytes), fallback, ref buffer, ref dummy, true);
                      }
                      leftOver = '\0';
                  }
              }
          } else {
              if (*chars >= '\uDC00' && *chars <= '\uDFFF') {
                  // We have a correct surrogate pair.
                  int ch = 0x10000 + (int) *chars - 0xDC00 + (((int) leftOver - 0xD800) << 10);
                  if (bytes + 3 >= end_bytes)
                      goto fail_no_space;
                  bytes [0] = (byte) (0xF0 | (ch >> 18));
                  bytes [1] = (byte) (0x80 | ((ch >> 12) & 0x3F));
                  bytes [2] = (byte) (0x80 | ((ch >> 6) & 0x3F));
                  bytes [3] = (byte) (0x80 | (ch & 0x3F));
                  bytes += 4;
                  chars++;
              } else {
                  // We have a surrogate start followed by a regular
                  // character.  Encode the fallback output and then
                  // re-visit the current character (chars is not
                  // advanced here).
                  char [] fallback_chars = GetFallbackChars (chars, start, fallback, ref buffer);
                  char dummy = '\0';
                  if (bytes + InternalGetByteCount (fallback_chars, 0, fallback_chars.Length, fallback, ref dummy, true) > end_bytes)
                      goto fail_no_space;
                  fixed (char *fb_chars = fallback_chars) {
                      // The write position must move past the fallback
                      // bytes; otherwise the next character overwrites
                      // them and the result is shorter than what
                      // InternalGetByteCount reports.
                      bytes += InternalGetBytes (fb_chars, fallback_chars.Length, bytes, bcount - (int) (bytes - start_bytes), fallback, ref buffer, ref dummy, true);
                  }
              }
              leftOver = '\0';
          }
      }
      if (flush) {
          // Flush the left-over surrogate pair start.
          if (leftOver != '\0') {
              int ch = leftOver;
              if (bytes + 2 < end_bytes) {
                  bytes [0] = (byte) (0xE0 | (ch >> 12));
                  bytes [1] = (byte) (0x80 | ((ch >> 6) & 0x3F));
                  bytes [2] = (byte) (0x80 | (ch & 0x3F));
                  bytes += 3;
              } else {
                  goto fail_no_space;
              }
              leftOver = '\0';
          }
      }
      return (int) (bytes - start_bytes);
  fail_no_space:
      throw new ArgumentException ("Insufficient Space", "bytes");
  }
  // Encodes "charCount" characters of "chars" starting at charIndex
  // into "bytes" starting at byteIndex and returns the number of
  // bytes written.  One-shot call: starts with no pending surrogate,
  // no fallback buffer, and always flushes.
  public override int GetBytes (char[] chars, int charIndex, int charCount,
      byte[] bytes, int byteIndex)
  {
      EncoderFallbackBuffer fallbackBuffer = null;
      char pendingSurrogate = '\0';
      return InternalGetBytes (
          chars, charIndex, charCount,
          bytes, byteIndex,
          EncoderFallback, ref fallbackBuffer, ref pendingSurrogate, true);
  }
  // String convenience wrapper for "GetBytes": encodes "charCount"
  // characters of "s" starting at charIndex into "bytes" starting at
  // byteIndex and returns the number of bytes written.
  //
  // Throws ArgumentNullException for a null string or array and
  // ArgumentOutOfRangeException for indexes/counts outside them.
  public override int GetBytes (String s, int charIndex, int charCount,
      byte[] bytes, int byteIndex)
  {
      if (s == null)
          throw new ArgumentNullException ("s");
      if (bytes == null)
          throw new ArgumentNullException ("bytes");
      if (charIndex < 0 || charIndex > s.Length)
          throw new ArgumentOutOfRangeException ("charIndex", _("ArgRange_StringIndex"));
      if (charCount < 0 || charCount > (s.Length - charIndex))
          throw new ArgumentOutOfRangeException ("charCount", _("ArgRange_StringRange"));
      if (byteIndex < 0 || byteIndex > bytes.Length)
          throw new ArgumentOutOfRangeException ("byteIndex", _("ArgRange_Array"));

      // Positioned at the end of the string: nothing to encode.
      if (charIndex == s.Length)
          return 0;

      unsafe {
          fixed (char* src = s) {
              char pendingSurrogate = '\0';
              EncoderFallbackBuffer fallbackBuffer = null;

              if (bytes.Length != byteIndex) {
                  fixed (byte* dst = bytes) {
                      return InternalGetBytes (
                          src + charIndex, charCount,
                          dst + byteIndex, bytes.Length - byteIndex,
                          EncoderFallback, ref fallbackBuffer,
                          ref pendingSurrogate, true);
                  }
              }

              // No room at all in the output: pass a null/zero-length
              // destination to the pointer overload.
              return InternalGetBytes (
                  src + charIndex, charCount,
                  null, 0, EncoderFallback, ref fallbackBuffer, ref pendingSurrogate, true);
          }
      }
  }
  // Encodes "charCount" characters at "chars" into at most
  // "byteCount" bytes at "bytes" and returns the number of bytes
  // written.  One-shot call: starts with no pending surrogate and
  // always flushes.
  //
  // Throws ArgumentNullException for a null pointer and
  // IndexOutOfRangeException for a negative count.
  [CLSCompliant (false)]
  [ComVisible (false)]
  public unsafe override int GetBytes (char* chars, int charCount, byte* bytes, int byteCount)
  {
      if (chars == null)
          throw new ArgumentNullException ("chars");
      if (charCount < 0)
          throw new IndexOutOfRangeException ("charCount");
      if (bytes == null)
          throw new ArgumentNullException ("bytes");
      // This check used to report "charCount" (copy/paste slip).
      if (byteCount < 0)
          throw new IndexOutOfRangeException ("byteCount");
      if (charCount == 0)
          return 0;
      char dummy = '\0';
      EncoderFallbackBuffer buffer = null;
      if (byteCount == 0)
          return InternalGetBytes (chars, charCount, null, 0, EncoderFallback, ref buffer, ref dummy, true);
      else
          return InternalGetBytes (chars, charCount, bytes, byteCount, EncoderFallback, ref buffer, ref dummy, true);
  }
  399. #endregion
  // Array-based front end of the character counter.  It validates the
  // arguments, returns 0 for an empty range and otherwise pins the
  // array and defers to the pointer-based overload.  leftOverBits and
  // leftOverCount describe a partially decoded sequence carried over
  // from an earlier call.
  //
  // Throws ArgumentNullException when bytes is null and
  // ArgumentOutOfRangeException when index/count fall outside bytes.
  private unsafe static int InternalGetCharCount (
      byte[] bytes, int index, int count, uint leftOverBits,
      uint leftOverCount, object provider,
      ref DecoderFallbackBuffer fallbackBuffer, ref byte [] bufferArg, bool flush)
  {
      if (bytes == null)
          throw new ArgumentNullException ("bytes");
      if (index < 0 || index > bytes.Length)
          throw new ArgumentOutOfRangeException ("index", _("ArgRange_Array"));
      if (count < 0 || count > (bytes.Length - index))
          throw new ArgumentOutOfRangeException ("count", _("ArgRange_Array"));

      if (count == 0)
          return 0;

      fixed (byte* pinned = bytes) {
          return InternalGetCharCount (
              pinned + index, count,
              leftOverBits, leftOverCount, provider,
              ref fallbackBuffer, ref bufferArg, flush);
      }
  }
  // Pointer-based character counter.  Scans "count" bytes at "bytes"
  // and returns the number of UTF-16 code units that decoding them
  // would produce.
  //
  // leftOverBits / leftOverCount describe a sequence started by an
  // earlier call: the low 4 bits of leftOverCount are the bytes seen
  // so far, the next 4 bits the expected sequence size.  "provider"
  // is either a DecoderFallback or a Decoder; it is only used by
  // Fallback() to obtain a fallback buffer.  Rejected input (invalid
  // start byte, broken or overlong sequence, encoded surrogate, value
  // of 0x110000 and above, and with flush set a truncated tail)
  // contributes whatever the decoder fallback produces for it.
  //
  // NOTE(review): when a sequence was carried over from an earlier
  // call, the offsets handed to Fallback() can point before "bytes";
  // that case is not handled here.
  private unsafe static int InternalGetCharCount (
      byte* bytes, int count, uint leftOverBits,
      uint leftOverCount, object provider,
      ref DecoderFallbackBuffer fallbackBuffer, ref byte [] bufferArg, bool flush)
  {
      int index = 0;
      int length = 0;
      if (leftOverCount == 0) {
          // Fast path: a leading run of ASCII is one char per byte.
          int end = index + count;
          for (; index < end; index++, count--) {
              if (bytes [index] < 0x80)
                  length++;
              else
                  break;
          }
      }
      // Determine the number of characters that we have.
      uint ch;
      uint leftBits = leftOverBits;
      uint leftSoFar = (leftOverCount & (uint)0x0F);
      uint leftSize = ((leftOverCount >> 4) & (uint)0x0F);
      while (count > 0) {
          // Note: index is advanced past the byte as soon as it is read.
          ch = (uint)(bytes[index++]);
          --count;
          if (leftSize == 0) {
              // Process a UTF-8 start character.
              if (ch < (uint)0x0080) {
                  // Single-byte UTF-8 character.
                  ++length;
              } else if ((ch & (uint)0xE0) == (uint)0xC0) {
                  // Double-byte UTF-8 character.
                  leftBits = (ch & (uint)0x1F);
                  leftSoFar = 1;
                  leftSize = 2;
              } else if ((ch & (uint)0xF0) == (uint)0xE0) {
                  // Three-byte UTF-8 character.
                  leftBits = (ch & (uint)0x0F);
                  leftSoFar = 1;
                  leftSize = 3;
              } else if ((ch & (uint)0xF8) == (uint)0xF0) {
                  // Four-byte UTF-8 character.
                  leftBits = (ch & (uint)0x07);
                  leftSoFar = 1;
                  leftSize = 4;
              } else if ((ch & (uint)0xFC) == (uint)0xF8) {
                  // Five-byte UTF-8 character.
                  leftBits = (ch & (uint)0x03);
                  leftSoFar = 1;
                  leftSize = 5;
              } else if ((ch & (uint)0xFE) == (uint)0xFC) {
                  // Six-byte UTF-8 character.
                  leftBits = (ch & (uint)0x03);
                  leftSoFar = 1;
                  leftSize = 6;
              } else {
                  // Invalid UTF-8 start character.
                  length += Fallback (provider, ref fallbackBuffer, ref bufferArg, bytes, index - 1, 1);
              }
          } else {
              // Process an extra byte in a multi-byte sequence.
              if ((ch & (uint)0xC0) == (uint)0x80) {
                  leftBits = ((leftBits << 6) | (ch & (uint)0x3F));
                  if (++leftSoFar >= leftSize) {
                      // We have a complete character now; the sequence
                      // occupies the leftSoFar bytes ending just before
                      // index.
                      if (leftBits < (uint)0x10000) {
                          // is it an overlong ?
                          bool overlong = false;
                          switch (leftSize) {
                          case 2:
                              overlong = (leftBits <= 0x7F);
                              break;
                          case 3:
                              overlong = (leftBits <= 0x07FF);
                              break;
                          case 4:
                              overlong = (leftBits <= 0xFFFF);
                              break;
                          case 5:
                              overlong = (leftBits <= 0x1FFFFF);
                              break;
                          case 6:
                              overlong = (leftBits <= 0x03FFFFFF);
                              break;
                          }
                          if (overlong) {
                              length += Fallback (provider, ref fallbackBuffer, ref bufferArg, bytes, index - leftSoFar, leftSoFar);
                          }
                          else if ((leftBits & 0xF800) == 0xD800) {
                              // UTF-8 doesn't use surrogate characters
                              length += Fallback (provider, ref fallbackBuffer, ref bufferArg, bytes, index - leftSoFar, leftSoFar);
                          }
                          else
                              ++length;
                      } else if (leftBits < (uint)0x110000) {
                          // Needs a surrogate pair in UTF-16.
                          length += 2;
                      } else {
                          length += Fallback (provider, ref fallbackBuffer, ref bufferArg, bytes, index - leftSoFar, leftSoFar);
                      }
                      leftSize = 0;
                  }
              } else {
                  // Invalid UTF-8 sequence: clear and restart.
                  // index is already past the offending byte, so the
                  // leftSoFar bytes of the broken sequence start at
                  // index - 1 - leftSoFar (the previous offset was one
                  // byte too far and skipped the lead byte).
                  length += Fallback (provider, ref fallbackBuffer, ref bufferArg, bytes, index - 1 - leftSoFar, leftSoFar);
                  leftSize = 0;
                  // Re-examine the offending byte as a start byte.
                  --index;
                  ++count;
              }
          }
      }
      if (flush && leftSize != 0) {
          // We had left-over bytes that didn't make up
          // a complete UTF-8 character sequence.
          length += Fallback (provider, ref fallbackBuffer, ref bufferArg, bytes, index - leftSoFar, leftSoFar);
      }
      // Return the final length to the caller.
      return length;
  }
  // Fallback helper for GetCharCount(): feeds "size" bytes starting at
  // bytes[index] to the decoder fallback, one byte at a time, and
  // returns the total number of characters the fallback reports as
  // remaining after each byte.
  //
  // The fallback buffer is obtained on first use, either by creating
  // one when "provider" is a DecoderFallback or otherwise from the
  // FallbackBuffer of the Decoder; it is handed back through "buffer".
  // "bufferArg" is a reusable one-byte scratch array.
  static unsafe int Fallback (object provider, ref DecoderFallbackBuffer buffer, ref byte [] bufferArg, byte* bytes, long index, uint size)
  {
      if (buffer == null) {
          DecoderFallback asFallback = provider as DecoderFallback;
          if (asFallback == null)
              buffer = ((Decoder) provider).FallbackBuffer;
          else
              buffer = asFallback.CreateFallbackBuffer ();
      }
      if (bufferArg == null) {
          bufferArg = new byte [1];
      }

      int produced = 0;
      int i = 0;
      while (i < size) {
          bufferArg [0] = bytes [(int) index + i];
          buffer.Fallback (bufferArg, 0);
          produced += buffer.Remaining;
          buffer.Reset ();
          i++;
      }
      return produced;
  }
  // Fallback helper for GetChars(): feeds "size" bytes starting at
  // bytes[byteIndex] to the decoder fallback, one byte at a time, and
  // appends every character the fallback produces to "chars",
  // advancing charIndex.
  //
  // The fallback buffer is obtained on first use, either by creating
  // one when "provider" is a DecoderFallback or otherwise from the
  // FallbackBuffer of the Decoder; it is handed back through "buffer".
  // "bufferArg" is a reusable one-byte scratch array.
  //
  // NOTE(review): the capacity of "chars" is not known here, so the
  // writes are not bounds-checked.
  static unsafe void Fallback (object provider, ref DecoderFallbackBuffer buffer, ref byte [] bufferArg, byte* bytes, long byteIndex, uint size,
      char* chars, ref int charIndex)
  {
      if (buffer == null) {
          DecoderFallback asFallback = provider as DecoderFallback;
          if (asFallback == null)
              buffer = ((Decoder) provider).FallbackBuffer;
          else
              buffer = asFallback.CreateFallbackBuffer ();
      }
      if (bufferArg == null) {
          bufferArg = new byte [1];
      }

      int i = 0;
      while (i < size) {
          bufferArg [0] = bytes [byteIndex + i];
          buffer.Fallback (bufferArg, 0);
          while (buffer.Remaining > 0) {
              chars [charIndex] = buffer.GetNextChar ();
              charIndex++;
          }
          buffer.Reset ();
          i++;
      }
  }
  // Returns the number of characters produced by decoding "count"
  // bytes of "bytes" starting at "index".  One-shot call: starts with
  // no carried-over sequence and always flushes.
  public override int GetCharCount (byte[] bytes, int index, int count)
  {
      byte [] scratch = null;
      DecoderFallbackBuffer fallbackBuffer = null;
      return InternalGetCharCount (
          bytes, index, count,
          0, 0, DecoderFallback,
          ref fallbackBuffer, ref scratch, true);
  }
  // Returns the number of characters produced by decoding "count"
  // bytes at the pointer "bytes".  One-shot call: starts with no
  // carried-over sequence and always flushes.
  //
  // Throws ArgumentOutOfRangeException when count is negative and
  // ArgumentNullException when bytes is null while count is positive.
  // A zero count returns 0 without touching the pointer, as before.
  [CLSCompliant (false)]
  [ComVisible (false)]
  public unsafe override int GetCharCount (byte* bytes, int count)
  {
      if (count < 0)
          throw new ArgumentOutOfRangeException ("count", _("ArgRange_NonNegative"));
      if (count == 0)
          return 0;
      // Without this check a null pointer would be dereferenced by
      // the scanning loop.
      if (bytes == null)
          throw new ArgumentNullException ("bytes");
      DecoderFallbackBuffer buf = null;
      byte [] bufferArg = null;
      return InternalGetCharCount (bytes, count, 0, 0, DecoderFallback, ref buf, ref bufferArg, true);
  }
  // Array-based front end of the decoder.  It validates the arguments
  // and then pins the arrays and defers to the pointer-based overload,
  // giving it all the room left in "chars" after charIndex.
  // leftOverBits / leftOverCount carry a partially decoded sequence
  // between calls.
  //
  // Returns the number of characters written; returns 0 right away
  // when charIndex is at the end of "chars".  Throws
  // ArgumentNullException for null arrays and
  // ArgumentOutOfRangeException for indexes/counts outside them.
  private unsafe static int InternalGetChars (
      byte[] bytes, int byteIndex, int byteCount, char[] chars,
      int charIndex, ref uint leftOverBits, ref uint leftOverCount,
      object provider,
      ref DecoderFallbackBuffer fallbackBuffer, ref byte [] bufferArg, bool flush)
  {
      if (bytes == null)
          throw new ArgumentNullException ("bytes");
      if (chars == null)
          throw new ArgumentNullException ("chars");
      if (byteIndex < 0 || byteIndex > bytes.Length)
          throw new ArgumentOutOfRangeException ("byteIndex", _("ArgRange_Array"));
      if (byteCount < 0 || byteCount > (bytes.Length - byteIndex))
          throw new ArgumentOutOfRangeException ("byteCount", _("ArgRange_Array"));
      if (charIndex < 0 || charIndex > chars.Length)
          throw new ArgumentOutOfRangeException ("charIndex", _("ArgRange_Array"));

      if (charIndex == chars.Length)
          return 0;

      fixed (char* dst = chars) {
          if (byteCount != 0 && byteIndex != bytes.Length) {
              fixed (byte* src = bytes) {
                  return InternalGetChars (
                      src + byteIndex, byteCount,
                      dst + charIndex, chars.Length - charIndex,
                      ref leftOverBits, ref leftOverCount, provider,
                      ref fallbackBuffer, ref bufferArg, flush);
              }
          }
          // No input bytes: call the pointer overload with a
          // null/zero-length source.
          return InternalGetChars (
              null, 0,
              dst + charIndex, chars.Length - charIndex,
              ref leftOverBits, ref leftOverCount, provider,
              ref fallbackBuffer, ref bufferArg, flush);
      }
  }
  // Pointer-based decoder core.  Decodes "byteCount" bytes at "bytes"
  // into at most "charCount" UTF-16 code units at "chars" and returns
  // the number of code units written.
  //
  // leftOverBits / leftOverCount carry a partially decoded sequence
  // in and out: the low 4 bits of leftOverCount are the bytes seen so
  // far, the next 4 bits the expected sequence size.  "provider" is
  // either a DecoderFallback or a Decoder; it is only used by
  // Fallback() to obtain a fallback buffer.  Rejected input (invalid
  // start byte, broken or overlong sequence, encoded surrogate, value
  // of 0x110000 and above, and with flush set a truncated tail) is
  // replaced by the output of the decoder fallback.
  //
  // Throws ArgumentException (Arg_InsufficientSpace) when a decoded
  // character does not fit into "chars".
  //
  // NOTE(review): Fallback() does not know the capacity of "chars",
  // so fallback output is still written unchecked; and when a
  // sequence was carried over from an earlier call the offsets handed
  // to Fallback() can point before "bytes".  Neither is handled here.
  private unsafe static int InternalGetChars (
      byte* bytes, int byteCount, char* chars, int charCount,
      ref uint leftOverBits, ref uint leftOverCount,
      object provider,
      ref DecoderFallbackBuffer fallbackBuffer, ref byte [] bufferArg, bool flush)
  {
      int charIndex = 0, byteIndex = 0;
      int length = charCount;
      int posn = charIndex;
      if (leftOverCount == 0) {
          // Fast path: copy a leading run of ASCII directly.
          int end = byteIndex + byteCount;
          for (; byteIndex < end; posn++, byteIndex++, byteCount--) {
              if (bytes [byteIndex] >= 0x80)
                  break;
              // This loop used to write without looking at the
              // capacity and could run past the end of "chars".
              if (posn >= length) {
                  throw new ArgumentException (_("Arg_InsufficientSpace"), "chars");
              }
              chars [posn] = (char) bytes [byteIndex];
          }
      }
      // Convert the bytes into the output buffer.
      uint ch;
      uint leftBits = leftOverBits;
      uint leftSoFar = (leftOverCount & (uint)0x0F);
      uint leftSize = ((leftOverCount >> 4) & (uint)0x0F);
      int byteEnd = byteIndex + byteCount;
      for(; byteIndex < byteEnd; byteIndex++) {
          // Fetch the next character from the byte buffer.  Note:
          // byteIndex keeps pointing AT this byte for the whole body.
          ch = (uint)(bytes[byteIndex]);
          if (leftSize == 0) {
              // Process a UTF-8 start character.
              if (ch < (uint)0x0080) {
                  // Single-byte UTF-8 character.
                  if (posn >= length) {
                      throw new ArgumentException (_("Arg_InsufficientSpace"), "chars");
                  }
                  chars[posn++] = (char)ch;
              } else if ((ch & (uint)0xE0) == (uint)0xC0) {
                  // Double-byte UTF-8 character.
                  leftBits = (ch & (uint)0x1F);
                  leftSoFar = 1;
                  leftSize = 2;
              } else if ((ch & (uint)0xF0) == (uint)0xE0) {
                  // Three-byte UTF-8 character.
                  leftBits = (ch & (uint)0x0F);
                  leftSoFar = 1;
                  leftSize = 3;
              } else if ((ch & (uint)0xF8) == (uint)0xF0) {
                  // Four-byte UTF-8 character.
                  leftBits = (ch & (uint)0x07);
                  leftSoFar = 1;
                  leftSize = 4;
              } else if ((ch & (uint)0xFC) == (uint)0xF8) {
                  // Five-byte UTF-8 character.
                  leftBits = (ch & (uint)0x03);
                  leftSoFar = 1;
                  leftSize = 5;
              } else if ((ch & (uint)0xFE) == (uint)0xFC) {
                  // Six-byte UTF-8 character.
                  leftBits = (ch & (uint)0x03);
                  leftSoFar = 1;
                  leftSize = 6;
              } else {
                  // Invalid UTF-8 start character.
                  Fallback (provider, ref fallbackBuffer, ref bufferArg, bytes, byteIndex, 1, chars, ref posn);
              }
          } else {
              // Process an extra byte in a multi-byte sequence.
              if ((ch & (uint)0xC0) == (uint)0x80) {
                  leftBits = ((leftBits << 6) | (ch & (uint)0x3F));
                  if (++leftSoFar >= leftSize) {
                      // We have a complete character now.  byteIndex is
                      // the LAST byte of the sequence, so the sequence
                      // starts at byteIndex + 1 - leftSoFar (the
                      // previous offset was one byte too early and
                      // read bytes[-1] for a sequence at offset 0).
                      if (leftBits < (uint)0x10000) {
                          // is it an overlong ?
                          bool overlong = false;
                          switch (leftSize) {
                          case 2:
                              overlong = (leftBits <= 0x7F);
                              break;
                          case 3:
                              overlong = (leftBits <= 0x07FF);
                              break;
                          case 4:
                              overlong = (leftBits <= 0xFFFF);
                              break;
                          case 5:
                              overlong = (leftBits <= 0x1FFFFF);
                              break;
                          case 6:
                              overlong = (leftBits <= 0x03FFFFFF);
                              break;
                          }
                          if (overlong) {
                              Fallback (provider, ref fallbackBuffer, ref bufferArg, bytes, byteIndex + 1 - leftSoFar, leftSoFar, chars, ref posn);
                          }
                          else if ((leftBits & 0xF800) == 0xD800) {
                              // UTF-8 doesn't use surrogate characters
                              Fallback (provider, ref fallbackBuffer, ref bufferArg, bytes, byteIndex + 1 - leftSoFar, leftSoFar, chars, ref posn);
                          }
                          else {
                              if (posn >= length) {
                                  throw new ArgumentException
                                      (_("Arg_InsufficientSpace"), "chars");
                              }
                              chars[posn++] = (char)leftBits;
                          }
                      } else if (leftBits < (uint)0x110000) {
                          // Emit a surrogate pair.
                          if ((posn + 2) > length) {
                              throw new ArgumentException
                                  (_("Arg_InsufficientSpace"), "chars");
                          }
                          leftBits -= (uint)0x10000;
                          chars[posn++] = (char)((leftBits >> 10) +
                              (uint)0xD800);
                          chars[posn++] =
                              (char)((leftBits & (uint)0x3FF) + (uint)0xDC00);
                      } else {
                          Fallback (provider, ref fallbackBuffer, ref bufferArg, bytes, byteIndex + 1 - leftSoFar, leftSoFar, chars, ref posn);
                      }
                      leftSize = 0;
                  }
              } else {
                  // Invalid UTF-8 sequence: clear and restart.  The
                  // offending byte is at byteIndex, so the broken
                  // sequence is the leftSoFar bytes right before it.
                  Fallback (provider, ref fallbackBuffer, ref bufferArg, bytes, byteIndex - leftSoFar, leftSoFar, chars, ref posn);
                  leftSize = 0;
                  // Re-examine the offending byte as a start byte.
                  --byteIndex;
              }
          }
      }
      if (flush && leftSize != 0) {
          // We had left-over bytes that didn't make up
          // a complete UTF-8 character sequence.
          Fallback (provider, ref fallbackBuffer, ref bufferArg, bytes, byteIndex - leftSoFar, leftSoFar, chars, ref posn);
      }
      // Hand the decoder state back to the caller.
      leftOverBits = leftBits;
      leftOverCount = (leftSoFar | (leftSize << 4));
      // Return the final length to the caller.
      return posn - charIndex;
  }
  // Decode a range of a byte array into a character array as a
  // stand-alone conversion: the left-over state handed to
  // InternalGetChars starts at zero and is discarded when the call
  // returns. The value returned is whatever InternalGetChars returns.
  public override int GetChars (byte[] bytes, int byteIndex, int byteCount,
                                char[] chars, int charIndex)
  {
      // No fallback buffer or scratch array exists yet; both are
      // handed to InternalGetChars by reference.
      byte [] scratch = null;
      DecoderFallbackBuffer fallbackBuffer = null;

      // Throw-away partial-sequence state, zeroed for a fresh decode.
      uint pendingCount = 0;
      uint pendingBits = 0;

      // NOTE(review): the trailing 'true' is presumably the flush
      // flag of InternalGetChars -- confirm against its signature.
      int produced = InternalGetChars (bytes, byteIndex, byteCount, chars, charIndex,
          ref pendingBits, ref pendingCount, DecoderFallback,
          ref fallbackBuffer, ref scratch, true);
      return produced;
  }
  // Pointer-based counterpart of GetChars: decodes byteCount bytes
  // at 'bytes' into the buffer at 'chars' (charCount is forwarded to
  // InternalGetChars). Like the array overload it is a stand-alone
  // conversion that starts from zeroed left-over state and does not
  // keep that state afterwards.
  [CLSCompliant (false)]
  [ComVisible (false)]
  public unsafe override int GetChars (byte* bytes, int byteCount, char* chars, int charCount)
  {
      // Throw-away partial-sequence state, zeroed for a fresh decode.
      uint pendingCount = 0;
      uint pendingBits = 0;

      // Nothing allocated up front; both are passed by reference.
      byte [] scratch = null;
      DecoderFallbackBuffer fallbackBuffer = null;

      // NOTE(review): the trailing 'true' is presumably the flush
      // flag of InternalGetChars -- confirm against its signature.
      int produced = InternalGetChars (bytes, byteCount, chars, charCount,
          ref pendingBits, ref pendingCount, DecoderFallback,
          ref fallbackBuffer, ref scratch, true);
      return produced;
  }
  // Get the maximum number of bytes needed to encode a specified
  // number of characters. The bound used is four bytes per character.
  //
  // Throws ArgumentOutOfRangeException when charCount is negative, or
  // when the resulting byte count does not fit in an Int32. Without
  // the second check the multiplication wraps around silently in
  // unchecked arithmetic and callers receive a negative "maximum".
  public override int GetMaxByteCount (int charCount)
  {
      if (charCount < 0) {
          throw new ArgumentOutOfRangeException ("charCount", _("ArgRange_NonNegative"));
      }

      // Widen before multiplying so the overflow can be detected.
      long maxBytes = (long) charCount * 4;
      if (maxBytes > int.MaxValue) {
          // NOTE(review): the resource key mirrors the name used by the
          // .NET reference source; confirm it exists in this project's
          // string table or substitute the appropriate key.
          throw new ArgumentOutOfRangeException ("charCount", _("ArgumentOutOfRange_GetByteCountOverflow"));
      }
      return (int) maxBytes;
  }
  // Get the maximum number of characters that decoding the given
  // number of bytes can yield. The bound used is one character per
  // input byte, so a valid count is returned unchanged.
  //
  // Throws ArgumentOutOfRangeException when byteCount is negative.
  public override int GetMaxCharCount (int byteCount)
  {
      if (byteCount >= 0) {
          return byteCount;
      }
      throw new ArgumentOutOfRangeException ("byteCount", _("ArgRange_NonNegative"));
  }
  // Create a new UTF8Decoder configured with this encoding's
  // DecoderFallback. Every call returns a fresh instance.
  public override Decoder GetDecoder ()
  {
      Decoder decoder = new UTF8Decoder (DecoderFallback);
      return decoder;
  }
  // Create a new UTF8Encoder configured with this encoding's
  // EncoderFallback and emitIdentifier setting. Every call returns a
  // fresh instance.
  public override Encoder GetEncoder ()
  {
      Encoder encoder = new UTF8Encoder (EncoderFallback, emitIdentifier);
      return encoder;
  }
  // Get the UTF-8 preamble: the three bytes EF BB BF when this
  // encoding was set up to emit an identifier, otherwise an empty
  // array. A new array is allocated on every call.
  public override byte[] GetPreamble ()
  {
      return emitIdentifier
          ? new byte [] { 0xEF, 0xBB, 0xBF }
          : new byte [0];
  }
  // Determine whether 'value' is a UTF8Encoding equivalent to this
  // one. Two instances are equal when they share the same code page
  // and emitIdentifier setting and their decoder and encoder
  // fallbacks compare equal. Anything that is not a UTF8Encoding
  // (including null) is unequal.
  public override bool Equals (Object value)
  {
      UTF8Encoding other = value as UTF8Encoding;
      if (other == null) {
          return false;
      }

      // Cheap field comparisons first, then the fallback objects,
      // stopping at the first mismatch.
      if (codePage != other.codePage) {
          return false;
      }
      if (emitIdentifier != other.emitIdentifier) {
          return false;
      }
      if (!DecoderFallback.Equals (other.DecoderFallback)) {
          return false;
      }
      return EncoderFallback.Equals (other.EncoderFallback);
  }
  // Get the hash code for this object. No UTF8-specific state is
  // mixed in; the value comes straight from the base class.
  public override int GetHashCode ()
  {
      int inherited = base.GetHashCode ();
      return inherited;
  }
  // Get the number of bytes needed to encode a string.
  // This override adds nothing of its own: it forwards directly to
  // the base class implementation.
  public override int GetByteCount (string chars)
  {
      int needed = base.GetByteCount (chars);
      return needed;
  }
  // Decode 'count' bytes of 'bytes', starting at 'index', into a
  // string. This override adds nothing of its own: it forwards
  // directly to the base class implementation.
  [ComVisible (false)]
  public override string GetString (byte [] bytes, int index, int count)
  {
      string decoded = base.GetString (bytes, index, count);
      return decoded;
  }
  // UTF-8 decoder implementation.
  //
  // Keeps the state of an incomplete multi-byte sequence between
  // calls in leftOverBits / leftOverCount, so a sequence split across
  // two input buffers can be resumed by the next GetChars call.
  [Serializable]
  private class UTF8Decoder : Decoder
  {
      // Partial-sequence state carried across calls. The names are
      // kept as-is because the class is marked [Serializable].
      private uint leftOverBits;
      private uint leftOverCount;

      // Create a decoder with the given fallback and no pending input.
      public UTF8Decoder (DecoderFallback fallback)
      {
          Fallback = fallback;
          leftOverCount = 0;
          leftOverBits = 0;
      }

      // Count the characters that decoding the given byte range would
      // produce. The pending state is passed by value, so counting
      // leaves this decoder's fields untouched.
      public override int GetCharCount (byte[] bytes, int index, int count)
      {
          byte [] scratch = null;
          DecoderFallbackBuffer fallbackBuffer = null;
          int total = InternalGetCharCount (bytes, index, count,
              leftOverBits, leftOverCount, this,
              ref fallbackBuffer, ref scratch, false);
          return total;
      }

      // Decode the given byte range into 'chars' starting at
      // charIndex. The pending state is passed by reference, so
      // InternalGetChars can update this decoder's fields.
      public override int GetChars (byte[] bytes, int byteIndex,
                                    int byteCount, char[] chars, int charIndex)
      {
          byte [] scratch = null;
          DecoderFallbackBuffer fallbackBuffer = null;
          int produced = InternalGetChars (bytes, byteIndex, byteCount,
              chars, charIndex, ref leftOverBits, ref leftOverCount, this,
              ref fallbackBuffer, ref scratch, false);
          return produced;
      }
  } // class UTF8Decoder
  // UTF-8 encoder implementation.
  //
  // Keeps one pending character per operation kind between calls:
  // leftOverForCount is handed to the GetByteCount overloads and
  // leftOverForConv to the GetBytes overloads, each by reference.
  [Serializable]
  private class UTF8Encoder : Encoder
  {
      // Pending characters carried across calls. The names are kept
      // as-is because the class is marked [Serializable].
      private char leftOverForCount;
      private char leftOverForConv;

      // Create an encoder with the given fallback and no pending
      // characters. The emitIdentifier argument is accepted but is
      // not stored or otherwise used by this class.
      public UTF8Encoder (EncoderFallback fallback, bool emitIdentifier)
      {
          Fallback = fallback;
          leftOverForConv = '\0';
          leftOverForCount = '\0';
      }

      // Count the bytes needed to encode a range of a char array.
      public override int GetByteCount (char[] chars, int index,
                                        int count, bool flush)
      {
          int needed = InternalGetByteCount (chars, index, count,
              Fallback, ref leftOverForCount, flush);
          return needed;
      }

      // Encode a range of a char array into 'bytes' at byteIndex.
      public override int GetBytes (char[] chars, int charIndex,
                                    int charCount, byte[] bytes, int byteIndex, bool flush)
      {
          // No fallback buffer exists yet; it is passed by reference.
          EncoderFallbackBuffer fallbackBuffer = null;
          return InternalGetBytes (chars, charIndex, charCount, bytes, byteIndex,
              Fallback, ref fallbackBuffer, ref leftOverForConv, flush);
      }

      // Pointer-based variant of GetByteCount.
      public unsafe override int GetByteCount (char* chars, int count, bool flush)
      {
          int needed = InternalGetByteCount (chars, count,
              Fallback, ref leftOverForCount, flush);
          return needed;
      }

      // Pointer-based variant of GetBytes.
      public unsafe override int GetBytes (char* chars, int charCount,
                                           byte* bytes, int byteCount, bool flush)
      {
          // No fallback buffer exists yet; it is passed by reference.
          EncoderFallbackBuffer fallbackBuffer = null;
          return InternalGetBytes (chars, charCount, bytes, byteCount,
              Fallback, ref fallbackBuffer, ref leftOverForConv, flush);
      }
  } // class UTF8Encoder
  927. }; // class UTF8Encoding
  928. }; // namespace System.Text