UTF8Encoding.cs 31 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007
  1. /*
  2. * UTF8Encoding.cs - Implementation of the "System.Text.UTF8Encoding" class.
  3. *
  4. * Copyright (c) 2001, 2002 Southern Storm Software, Pty Ltd
  5. * Copyright (C) 2004 Novell, Inc (http://www.novell.com)
  6. *
  7. * Permission is hereby granted, free of charge, to any person obtaining
  8. * a copy of this software and associated documentation files (the "Software"),
  9. * to deal in the Software without restriction, including without limitation
  10. * the rights to use, copy, modify, merge, publish, distribute, sublicense,
  11. * and/or sell copies of the Software, and to permit persons to whom the
  12. * Software is furnished to do so, subject to the following conditions:
  13. *
  14. * The above copyright notice and this permission notice shall be included
  15. * in all copies or substantial portions of the Software.
  16. *
  17. * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
  18. * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  19. * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
  20. * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
  21. * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
  22. * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
  23. * OTHER DEALINGS IN THE SOFTWARE.
  24. */
  25. namespace System.Text
  26. {
  27. using System;
  28. using System.Runtime.InteropServices;
  29. [Serializable]
  30. [MonoLimitation ("Serialization format not compatible with .NET")]
  31. [ComVisible (true)]
  32. public class UTF8Encoding : Encoding
  33. {
  34. // Magic number used by Windows for UTF-8.
  35. internal const int UTF8_CODE_PAGE = 65001;
  36. // Internal state.
  37. private bool emitIdentifier;
  38. // Constructors.
  // Default encoding: no UTF-8 identifier is emitted and the non-throwing
  // fallbacks are installed.
  public UTF8Encoding ()
      : this (false, false)
  {
  }

  // Same as the full constructor with throwOnInvalidBytes = false.
  public UTF8Encoding (bool encoderShouldEmitUTF8Identifier)
      : this (encoderShouldEmitUTF8Identifier, false)
  {
  }

  // Full constructor.
  //   encoderShouldEmitUTF8Identifier - remembered in "emitIdentifier".
  //   throwOnInvalidBytes             - selects the exception fallbacks when
  //                                     true, the "standard safe" ones otherwise.
  public UTF8Encoding (bool encoderShouldEmitUTF8Identifier, bool throwOnInvalidBytes)
      : base (UTF8_CODE_PAGE)
  {
      emitIdentifier = encoderShouldEmitUTF8Identifier;

      if (!throwOnInvalidBytes) {
          SetFallbackInternal (EncoderFallback.StandardSafeFallback, DecoderFallback.StandardSafeFallback);
      } else {
          SetFallbackInternal (EncoderFallback.ExceptionFallback, DecoderFallback.ExceptionFallback);
      }

      // Names reported for this encoding.
      header_name = "utf-8";
      body_name = header_name;
      web_name = body_name;
      encoding_name = "Unicode (UTF-8)";

      // Usage flags.
      is_browser_display = true;
      is_browser_save = true;
      is_mail_news_display = true;
      is_mail_news_save = true;

      windows_code_page = UnicodeEncoding.UNICODE_CODE_PAGE;
  }
  58. #region GetByteCount()
  59. // Internal version of "GetByteCount" which can handle a rolling
  60. // state between multiple calls to this method.
  // Array front end: validates the requested range, then hands the pinned
  // characters to the pointer-based counter. "leftOver" holds a high
  // surrogate carried between calls; "flush" says whether a pending one
  // must be accounted for (as 3 bytes) at the end of the input.
  private static int InternalGetByteCount (char[] chars, int index, int count, EncoderFallback fallback, ref char leftOver, bool flush)
  {
      if (chars == null)
          throw new ArgumentNullException ("chars");
      if (index < 0 || index > chars.Length)
          throw new ArgumentOutOfRangeException ("index", _("ArgRange_Array"));
      if (count < 0 || count > (chars.Length - index))
          throw new ArgumentOutOfRangeException ("count", _("ArgRange_Array"));

      if (index != chars.Length) {
          unsafe {
              fixed (char* first = chars) {
                  return InternalGetByteCount (first + index, count, fallback, ref leftOver, flush);
              }
          }
      }

      // Nothing to scan: only a pending surrogate can contribute bytes.
      if (!flush || leftOver == '\0')
          return 0;
      leftOver = '\0';
      return 3;
  }
  // Pointer-based counter: returns how many UTF-8 bytes the "count" UTF-16
  // code units at "chars" encode to. "leftOver" carries a high surrogate
  // across calls; with "flush" a still-pending one is counted as 3 bytes
  // and cleared. Unpaired surrogates are sized through the encoder fallback.
  private unsafe static int InternalGetByteCount (char* chars, int count, EncoderFallback fallback, ref char leftOver, bool flush)
  {
      int total = 0;
      char* origin = chars;
      char* limit = chars + count;
      EncoderFallbackBuffer fbBuffer = null;

      while (chars < limit) {
          if (leftOver != 0) {
              // A high surrogate is pending from earlier input.
              if (*chars >= '\uDC00' && *chars <= '\uDFFF') {
                  // The pair is complete.
                  total += 4;
                  chars++;
              } else {
                  // No low surrogate follows: size the fallback output and
                  // leave the current character to be examined again.
                  total += CountFallbackBytes (chars, origin, fallback, ref fbBuffer);
              }
              leftOver = '\0';
              continue;
          }

          for (; chars < limit; chars++) {
              char c = *chars;
              if (c < '\u0080') {
                  total += 1;
                  continue;
              }
              if (c < '\u0800') {
                  total += 2;
                  continue;
              }
              if (c < '\uD800' || c > '\uDFFF') {
                  total += 3;
                  continue;
              }
              if (c <= '\uDBFF') {
                  // High surrogate: consume the pair when the low half is
                  // already available, otherwise remember it and leave the
                  // fast loop.
                  if (chars + 1 < limit && chars [1] >= '\uDC00' && chars [1] <= '\uDFFF') {
                      total += 4;
                      chars++;
                      continue;
                  }
                  leftOver = c;
                  chars++;
                  break;
              }
              // Low surrogate with no preceding high surrogate.
              total += CountFallbackBytes (chars, origin, fallback, ref fbBuffer);
              leftOver = '\0';
          }
      }

      if (flush && leftOver != '\0') {
          // A dangling high surrogate is accounted for as three bytes.
          total += 3;
          leftOver = '\0';
      }
      return total;
  }

  // Number of bytes needed for the fallback characters produced for the
  // character at "at" ("origin" is the start of the input, used for the index).
  private unsafe static int CountFallbackBytes (char* at, char* origin, EncoderFallback fallback, ref EncoderFallbackBuffer fbBuffer)
  {
      char [] replacement = GetFallbackChars (at, origin, fallback, ref fbBuffer);
      fixed (char* rp = replacement) {
          char scratch = '\0';
          return InternalGetByteCount (rp, replacement.Length, fallback, ref scratch, true);
      }
  }
  // Runs the encoder fallback for the character at "chars" and returns the
  // replacement characters it produced. "start" is only used to compute the
  // index reported to the fallback; "buffer" is created on first use and is
  // reset before returning so it can be reused.
  unsafe static char [] GetFallbackChars (char *chars, char *start, EncoderFallback fallback, ref EncoderFallbackBuffer buffer)
  {
      if (buffer == null)
          buffer = fallback.CreateFallbackBuffer ();

      int offset = (int) (chars - start);
      buffer.Fallback (*chars, offset);

      char [] result = new char [buffer.Remaining];
      int filled = 0;
      while (filled < result.Length) {
          result [filled] = buffer.GetNextChar ();
          filled++;
      }

      buffer.Reset ();
      return result;
  }
  164. // Get the number of bytes needed to encode a character buffer.
  // One-shot count: no surrogate is carried in and the input is treated as
  // complete (flush = true). Uses this instance's EncoderFallback.
  public override int GetByteCount (char[] chars, int index, int count)
  {
      char pendingSurrogate = '\0';
      int byteTotal = InternalGetByteCount (chars, index, count, EncoderFallback, ref pendingSurrogate, true);
      return byteTotal;
  }
  // Returns the number of UTF-8 bytes needed to encode "count" characters
  // starting at "chars". The input is treated as complete (flush = true).
  //
  // Throws ArgumentNullException when "chars" is null and
  // ArgumentOutOfRangeException when "count" is negative.
  [CLSCompliant (false)]
  [ComVisible (false)]
  public unsafe override int GetByteCount (char* chars, int count)
  {
      if (chars == null)
          throw new ArgumentNullException ("chars");
      // A negative count used to fall through and silently yield 0;
      // reject it explicitly like the array overloads do.
      if (count < 0)
          throw new ArgumentOutOfRangeException ("count", _("ArgRange_NonNegative"));
      if (count == 0)
          return 0;
      char dummy = '\0';
      return InternalGetByteCount (chars, count, EncoderFallback, ref dummy, true);
  }
  181. #endregion
  182. #region GetBytes()
  183. // Internal version of "GetBytes" which can handle a rolling
  184. // state between multiple calls to this method.
  // Array front end for encoding: validates both ranges, pins the arrays
  // and forwards to the pointer-based encoder. When no room is left in
  // "bytes" the encoder is called with a null, zero-length destination.
  private static int InternalGetBytes (char[] chars, int charIndex,
                                       int charCount, byte[] bytes,
                                       int byteIndex,
                                       EncoderFallback fallback, ref EncoderFallbackBuffer buffer,
                                       ref char leftOver, bool flush)
  {
      if (chars == null)
          throw new ArgumentNullException ("chars");
      if (bytes == null)
          throw new ArgumentNullException ("bytes");
      if (charIndex < 0 || charIndex > chars.Length)
          throw new ArgumentOutOfRangeException ("charIndex", _("ArgRange_Array"));
      if (charCount < 0 || charCount > (chars.Length - charIndex))
          throw new ArgumentOutOfRangeException ("charCount", _("ArgRange_Array"));
      if (byteIndex < 0 || byteIndex > bytes.Length)
          throw new ArgumentOutOfRangeException ("byteIndex", _("ArgRange_Array"));

      if (charIndex == chars.Length) {
          // No input at all. A pending surrogate is simply dropped on
          // flush (FIXME in the original: should go through EncoderFallback).
          if (flush && leftOver != '\0')
              leftOver = '\0';
          return 0;
      }

      int room = bytes.Length - byteIndex;
      unsafe {
          fixed (char* src = chars) {
              if (room == 0) {
                  return InternalGetBytes (src + charIndex, charCount, null, 0, fallback, ref buffer, ref leftOver, flush);
              }
              fixed (byte* dst = bytes) {
                  return InternalGetBytes (src + charIndex, charCount, dst + byteIndex, room, fallback, ref buffer, ref leftOver, flush);
              }
          }
      }
  }
  // Encodes "count" UTF-16 code units starting at "chars" into at most
  // "bcount" bytes starting at "bytes" and returns the number of bytes
  // written.
  //
  //   chars/count  - input characters.
  //   bytes/bcount - destination and its capacity; callers pass null/0 when
  //                  there is no room, in which case any output fails.
  //   fallback     - supplies replacement characters for unpaired surrogates.
  //   buffer       - fallback buffer, created lazily by GetFallbackChars.
  //   leftOver     - high surrogate carried in from a previous call ('\0'
  //                  when none); on exit, the one still pending, if any.
  //   flush        - when true a pending high surrogate is written out as a
  //                  three-byte sequence and the state is cleared.
  //
  // Throws ArgumentException ("bytes") when the output does not fit.
  //
  // NOTE(review): in the "high surrogate followed by something else" case
  // the fallback is invoked for the character at *chars (the one after the
  // surrogate), not for the surrogate itself - confirm this is intended.
  private unsafe static int InternalGetBytes (char* chars, int count, byte* bytes, int bcount, EncoderFallback fallback, ref EncoderFallbackBuffer buffer, ref char leftOver, bool flush)
  {
      char* end = chars + count;
      char* start = chars;
      byte* start_bytes = bytes;
      byte* end_bytes = bytes + bcount;
      while (chars < end) {
          if (leftOver == 0) {
              for (; chars < end; chars++) {
                  int ch = *chars;
                  if (ch < '\x80') {
                      // One byte.
                      if (bytes >= end_bytes)
                          goto fail_no_space;
                      *bytes++ = (byte)ch;
                  } else if (ch < '\x800') {
                      // Two bytes.
                      if (bytes + 1 >= end_bytes)
                          goto fail_no_space;
                      bytes [0] = (byte) (0xC0 | (ch >> 6));
                      bytes [1] = (byte) (0x80 | (ch & 0x3F));
                      bytes += 2;
                  } else if (ch < '\uD800' || ch > '\uDFFF') {
                      // Three bytes.
                      if (bytes + 2 >= end_bytes)
                          goto fail_no_space;
                      bytes [0] = (byte) (0xE0 | (ch >> 12));
                      bytes [1] = (byte) (0x80 | ((ch >> 6) & 0x3F));
                      bytes [2] = (byte) (0x80 | (ch & 0x3F));
                      bytes += 3;
                  } else if (ch <= '\uDBFF') {
                      // High surrogate: remember it and let the outer loop
                      // pair it with the next character.
                      leftOver = *chars;
                      chars++;
                      break;
                  } else {
                      // Low surrogate without a leading high surrogate:
                      // emit the fallback replacement instead.
                      char [] fallback_chars = GetFallbackChars (chars, start, fallback, ref buffer);
                      char dummy = '\0';
                      if (bytes + InternalGetByteCount (fallback_chars, 0, fallback_chars.Length, fallback, ref dummy, true) > end_bytes)
                          goto fail_no_space;
                      fixed (char *fb_chars = fallback_chars) {
                          bytes += InternalGetBytes (fb_chars, fallback_chars.Length, bytes, bcount - (int) (bytes - start_bytes), fallback, ref buffer, ref dummy, true);
                      }
                      leftOver = '\0';
                  }
              }
          } else {
              if (*chars >= '\uDC00' && *chars <= '\uDFFF') {
                  // Valid surrogate pair: combine and write four bytes.
                  int ch = 0x10000 + (int) *chars - 0xDC00 + (((int) leftOver - 0xD800) << 10);
                  if (bytes + 3 >= end_bytes)
                      goto fail_no_space;
                  bytes [0] = (byte) (0xF0 | (ch >> 18));
                  bytes [1] = (byte) (0x80 | ((ch >> 12) & 0x3F));
                  bytes [2] = (byte) (0x80 | ((ch >> 6) & 0x3F));
                  bytes [3] = (byte) (0x80 | (ch & 0x3F));
                  bytes += 4;
                  chars++;
              } else {
                  // High surrogate followed by a character that is not a
                  // low surrogate. Emit the fallback replacement and then
                  // re-visit the current character (chars is not advanced).
                  char [] fallback_chars = GetFallbackChars (chars, start, fallback, ref buffer);
                  char dummy = '\0';
                  if (bytes + InternalGetByteCount (fallback_chars, 0, fallback_chars.Length, fallback, ref dummy, true) > end_bytes)
                      goto fail_no_space;
                  fixed (char *fb_chars = fallback_chars) {
                      // Advance past the replacement bytes. Previously the
                      // returned count was dropped, so the next character
                      // overwrote the replacement and the total disagreed
                      // with InternalGetByteCount.
                      bytes += InternalGetBytes (fb_chars, fallback_chars.Length, bytes, bcount - (int) (bytes - start_bytes), fallback, ref buffer, ref dummy, true);
                  }
              }
              leftOver = '\0';
          }
      }
      if (flush) {
          // Flush the left-over surrogate pair start as a three-byte
          // sequence.
          if (leftOver != '\0') {
              int ch = leftOver;
              if (bytes + 2 < end_bytes) {
                  bytes [0] = (byte) (0xE0 | (ch >> 12));
                  bytes [1] = (byte) (0x80 | ((ch >> 6) & 0x3F));
                  bytes [2] = (byte) (0x80 | (ch & 0x3F));
                  bytes += 3;
              } else {
                  goto fail_no_space;
              }
              leftOver = '\0';
          }
      }
      return (int)(bytes - (end_bytes - bcount));
  fail_no_space:
      throw new ArgumentException ("Insufficient Space", "bytes");
  }
  326. // Get the bytes that result from encoding a character buffer.
  // One-shot encode of a char array range: starts with no pending surrogate
  // and no fallback buffer, and flushes at the end.
  public override int GetBytes (char[] chars, int charIndex, int charCount,
                                byte[] bytes, int byteIndex)
  {
      EncoderFallbackBuffer fallbackBuffer = null;
      char pendingSurrogate = '\0';
      int written = InternalGetBytes (chars, charIndex, charCount, bytes, byteIndex, EncoderFallback, ref fallbackBuffer, ref pendingSurrogate, true);
      return written;
  }
  334. // Convenience wrappers for "GetBytes".
  // Encodes a range of a string into "bytes" starting at "byteIndex" and
  // returns the number of bytes written. The string is pinned and encoded
  // in one shot (no carried surrogate, flush = true).
  public override int GetBytes (String s, int charIndex, int charCount,
                                byte[] bytes, int byteIndex)
  {
      if (s == null)
          throw new ArgumentNullException ("s");
      if (bytes == null)
          throw new ArgumentNullException ("bytes");
      if (charIndex < 0 || charIndex > s.Length)
          throw new ArgumentOutOfRangeException ("charIndex", _("ArgRange_StringIndex"));
      if (charCount < 0 || charCount > (s.Length - charIndex))
          throw new ArgumentOutOfRangeException ("charCount", _("ArgRange_StringRange"));
      if (byteIndex < 0 || byteIndex > bytes.Length)
          throw new ArgumentOutOfRangeException ("byteIndex", _("ArgRange_Array"));

      // Starting at the very end of the string: nothing to encode.
      if (charIndex == s.Length)
          return 0;

      int room = bytes.Length - byteIndex;
      char pendingSurrogate = '\0';
      EncoderFallbackBuffer fallbackBuffer = null;
      unsafe {
          fixed (char* text = s) {
              if (room == 0) {
                  // No destination space: encode against a null buffer.
                  return InternalGetBytes (text + charIndex, charCount, null, 0, EncoderFallback, ref fallbackBuffer, ref pendingSurrogate, true);
              }
              fixed (byte* dest = bytes) {
                  return InternalGetBytes (text + charIndex, charCount, dest + byteIndex, room, EncoderFallback, ref fallbackBuffer, ref pendingSurrogate, true);
              }
          }
      }
  }
  // Encodes "charCount" characters at "chars" into at most "byteCount"
  // bytes at "bytes" and returns the number of bytes written.
  //
  // Throws ArgumentNullException for a null pointer and
  // IndexOutOfRangeException for a negative count.
  // NOTE(review): the .NET documentation specifies
  // ArgumentOutOfRangeException for negative counts; the exception type is
  // kept as-is here so existing callers are not affected - confirm whether
  // it should be aligned.
  [CLSCompliant (false)]
  [ComVisible (false)]
  public unsafe override int GetBytes (char* chars, int charCount, byte* bytes, int byteCount)
  {
      if (chars == null)
          throw new ArgumentNullException ("chars");
      if (charCount < 0)
          throw new IndexOutOfRangeException ("charCount");
      if (bytes == null)
          throw new ArgumentNullException ("bytes");
      // Report the parameter that is actually out of range (this used to
      // say "charCount").
      if (byteCount < 0)
          throw new IndexOutOfRangeException ("byteCount");
      if (charCount == 0)
          return 0;
      char dummy = '\0';
      EncoderFallbackBuffer buffer = null;
      // With no room the encoder is given a null, zero-length destination.
      if (byteCount == 0)
          return InternalGetBytes (chars, charCount, null, 0, EncoderFallback, ref buffer, ref dummy, true);
      else
          return InternalGetBytes (chars, charCount, bytes, byteCount, EncoderFallback, ref buffer, ref dummy, true);
  }
  395. #endregion
  396. // Internal version of "GetCharCount" which can handle a rolling
  397. // state between multiple calls to this method.
  // Array front end for counting decoded characters: validates the range,
  // short-circuits an empty range to 0, otherwise pins the array and
  // delegates to the pointer-based counter.
  private unsafe static int InternalGetCharCount (
      byte[] bytes, int index, int count, uint leftOverBits,
      uint leftOverCount, object provider,
      ref DecoderFallbackBuffer fallbackBuffer, ref byte [] bufferArg, bool flush)
  {
      if (bytes == null)
          throw new ArgumentNullException ("bytes");
      if (index < 0 || index > bytes.Length)
          throw new ArgumentOutOfRangeException ("index", _("ArgRange_Array"));
      if (count < 0 || count > (bytes.Length - index))
          throw new ArgumentOutOfRangeException ("count", _("ArgRange_Array"));

      if (count != 0) {
          fixed (byte* data = bytes) {
              return InternalGetCharCount (data + index, count, leftOverBits, leftOverCount,
                                           provider, ref fallbackBuffer, ref bufferArg, flush);
          }
      }
      return 0;
  }
  // Counts the UTF-16 code units produced by decoding "count" bytes at
  // "bytes".
  //
  //   leftOverBits  - payload bits accumulated for an unfinished sequence.
  //   leftOverCount - packed state: low nibble = bytes seen so far, next
  //                   nibble = total length of the unfinished sequence.
  //   provider      - a DecoderFallback or a Decoder, used to obtain the
  //                   fallback buffer on first need.
  //   fallbackBuffer/bufferArg - lazily created scratch objects for Fallback.
  //   flush         - when true, a sequence still unfinished at the end of
  //                   the input is sized through the fallback.
  //
  // Invalid start bytes, broken, overlong, surrogate and out-of-range
  // sequences are all sized through Fallback.
  //
  // NOTE(review): when an unfinished sequence was carried in through
  // leftOverCount, the byte range handed to Fallback can start before
  // "bytes" - confirm how callers with decoder state are expected to
  // handle that.
  private unsafe static int InternalGetCharCount (
      byte* bytes, int count, uint leftOverBits,
      uint leftOverCount, object provider,
      ref DecoderFallbackBuffer fallbackBuffer, ref byte [] bufferArg, bool flush)
  {
      int index = 0;
      int length = 0;
      if (leftOverCount == 0) {
          // Fast path: leading run of ASCII bytes, one char each.
          int end = index + count;
          for (; index < end; index++, count--) {
              if (bytes [index] < 0x80)
                  length++;
              else
                  break;
          }
      }
      // Determine the number of characters that we have.
      uint ch;
      uint leftBits = leftOverBits;
      uint leftSoFar = (leftOverCount & (uint)0x0F);
      uint leftSize = ((leftOverCount >> 4) & (uint)0x0F);
      while (count > 0) {
          ch = (uint)(bytes[index++]);
          --count;
          if (leftSize == 0) {
              // Process a UTF-8 start character.
              if (ch < (uint)0x0080) {
                  // Single-byte UTF-8 character.
                  ++length;
              } else if ((ch & (uint)0xE0) == (uint)0xC0) {
                  // Double-byte UTF-8 character.
                  leftBits = (ch & (uint)0x1F);
                  leftSoFar = 1;
                  leftSize = 2;
              } else if ((ch & (uint)0xF0) == (uint)0xE0) {
                  // Three-byte UTF-8 character.
                  leftBits = (ch & (uint)0x0F);
                  leftSoFar = 1;
                  leftSize = 3;
              } else if ((ch & (uint)0xF8) == (uint)0xF0) {
                  // Four-byte UTF-8 character.
                  leftBits = (ch & (uint)0x07);
                  leftSoFar = 1;
                  leftSize = 4;
              } else if ((ch & (uint)0xFC) == (uint)0xF8) {
                  // Five-byte UTF-8 character.
                  leftBits = (ch & (uint)0x03);
                  leftSoFar = 1;
                  leftSize = 5;
              } else if ((ch & (uint)0xFE) == (uint)0xFC) {
                  // Six-byte UTF-8 character.
                  leftBits = (ch & (uint)0x03);
                  leftSoFar = 1;
                  leftSize = 6;
              } else {
                  // Invalid UTF-8 start character.
                  length += Fallback (provider, ref fallbackBuffer, ref bufferArg, bytes, index - 1, 1);
              }
          } else {
              // Process an extra byte in a multi-byte sequence.
              if ((ch & (uint)0xC0) == (uint)0x80) {
                  leftBits = ((leftBits << 6) | (ch & (uint)0x3F));
                  if (++leftSoFar >= leftSize) {
                      // We have a complete character now. "index" is already
                      // past its last byte, so the sequence occupies
                      // [index - leftSoFar, index).
                      if (leftBits < (uint)0x10000) {
                          // is it an overlong ?
                          bool overlong = false;
                          switch (leftSize) {
                          case 2:
                              overlong = (leftBits <= 0x7F);
                              break;
                          case 3:
                              overlong = (leftBits <= 0x07FF);
                              break;
                          case 4:
                              overlong = (leftBits <= 0xFFFF);
                              break;
                          case 5:
                              overlong = (leftBits <= 0x1FFFFF);
                              break;
                          case 6:
                              overlong = (leftBits <= 0x03FFFFFF);
                              break;
                          }
                          if (overlong) {
                              length += Fallback (provider, ref fallbackBuffer, ref bufferArg, bytes, index - leftSoFar, leftSoFar);
                          }
                          else if ((leftBits & 0xF800) == 0xD800) {
                              // UTF-8 doesn't use surrogate characters
                              length += Fallback (provider, ref fallbackBuffer, ref bufferArg, bytes, index - leftSoFar, leftSoFar);
                          }
                          else
                              ++length;
                      } else if (leftBits < (uint)0x110000) {
                          // Supplementary character: a surrogate pair.
                          length += 2;
                      } else {
                          length += Fallback (provider, ref fallbackBuffer, ref bufferArg, bytes, index - leftSoFar, leftSoFar);
                      }
                      leftSize = 0;
                  }
              } else {
                  // Invalid UTF-8 sequence: clear and restart.
                  // "index" has already moved past the offending byte, so
                  // the unfinished sequence is [index - 1 - leftSoFar,
                  // index - 1). The previous "index - leftSoFar" skipped the
                  // lead byte and included the offending byte instead.
                  length += Fallback (provider, ref fallbackBuffer, ref bufferArg, bytes, index - 1 - leftSoFar, leftSoFar);
                  leftSize = 0;
                  // Step back so the offending byte is examined again as a
                  // potential start byte.
                  --index;
                  ++count;
              }
          }
      }
      if (flush && leftSize != 0) {
          // We had left-over bytes that didn't make up
          // a complete UTF-8 character sequence.
          length += Fallback (provider, ref fallbackBuffer, ref bufferArg, bytes, index - leftSoFar, leftSoFar);
      }
      // Return the final length to the caller.
      return length;
  }
  536. // for GetCharCount()
  // Counting variant (used by GetCharCount): feeds "size" bytes starting at
  // bytes[index] to the decoder fallback one at a time and returns the total
  // number of replacement characters reported. "buffer" and "bufferArg" are
  // created on first use; "provider" is either a DecoderFallback or a
  // Decoder.
  static unsafe int Fallback (object provider, ref DecoderFallbackBuffer buffer, ref byte [] bufferArg, byte* bytes, long index, uint size)
  {
      if (buffer == null) {
          DecoderFallback asFallback = provider as DecoderFallback;
          buffer = (asFallback != null)
              ? asFallback.CreateFallbackBuffer ()
              : ((Decoder) provider).FallbackBuffer;
      }
      if (bufferArg == null)
          bufferArg = new byte [1];

      int produced = 0;
      int done = 0;
      while (done < size) {
          bufferArg [0] = bytes [(int) index + done];
          buffer.Fallback (bufferArg, 0);
          produced += buffer.Remaining;
          buffer.Reset ();
          done++;
      }
      return produced;
  }
  557. // for GetChars()
  // Writing variant (used by GetChars): feeds "size" bytes starting at
  // bytes[byteIndex] to the decoder fallback one at a time and appends the
  // replacement characters at chars[charIndex], advancing "charIndex".
  // NOTE(review): the destination capacity is not known here, so no bounds
  // check is performed on "chars" - callers must guarantee enough room.
  static unsafe void Fallback (object provider, ref DecoderFallbackBuffer buffer, ref byte [] bufferArg, byte* bytes, long byteIndex, uint size,
                               char* chars, ref int charIndex)
  {
      if (buffer == null) {
          DecoderFallback asFallback = provider as DecoderFallback;
          buffer = (asFallback != null)
              ? asFallback.CreateFallbackBuffer ()
              : ((Decoder) provider).FallbackBuffer;
      }
      if (bufferArg == null)
          bufferArg = new byte [1];

      int done = 0;
      while (done < size) {
          bufferArg [0] = bytes [byteIndex + done];
          buffer.Fallback (bufferArg, 0);
          while (buffer.Remaining > 0) {
              chars [charIndex] = buffer.GetNextChar ();
              charIndex++;
          }
          buffer.Reset ();
          done++;
      }
  }
  578. // Get the number of characters needed to decode a byte buffer.
  // One-shot count over a byte array range: starts with no decoder state
  // (both left-over arguments are 0) and flushes at the end, using this
  // instance's DecoderFallback.
  public override int GetCharCount (byte[] bytes, int index, int count)
  {
      byte [] scratch = null;
      DecoderFallbackBuffer fallbackBuffer = null;
      int charTotal = InternalGetCharCount (bytes, index, count, 0, 0, DecoderFallback, ref fallbackBuffer, ref scratch, true);
      return charTotal;
  }
  // Returns the number of characters produced by decoding "count" bytes at
  // "bytes", with no carried decoder state and flush = true.
  //
  // Throws ArgumentOutOfRangeException when "count" is negative and
  // ArgumentNullException when "bytes" is null while there is input to
  // read. An empty input (count == 0) yields 0 as before.
  [CLSCompliant (false)]
  [ComVisible (false)]
  public unsafe override int GetCharCount (byte* bytes, int count)
  {
      // Previously there was no validation here: a negative count silently
      // returned 0 and a null pointer with a positive count was
      // dereferenced.
      if (count < 0)
          throw new ArgumentOutOfRangeException ("count", _("ArgRange_NonNegative"));
      if (count == 0)
          return 0;
      if (bytes == null)
          throw new ArgumentNullException ("bytes");
      DecoderFallbackBuffer buf = null;
      byte [] bufferArg = null;
      return InternalGetCharCount (bytes, count, 0, 0, DecoderFallback, ref buf, ref bufferArg, true);
  }
  593. // Get the characters that result from decoding a byte buffer.
  // Array front end for decoding: validates the ranges, pins the arrays and
  // forwards to the pointer-based decoder. The destination capacity passed
  // on is everything from "charIndex" to the end of "chars". With no input
  // bytes the decoder is called with a null, zero-length source.
  private unsafe static int InternalGetChars (
      byte[] bytes, int byteIndex, int byteCount, char[] chars,
      int charIndex, ref uint leftOverBits, ref uint leftOverCount,
      object provider,
      ref DecoderFallbackBuffer fallbackBuffer, ref byte [] bufferArg, bool flush)
  {
      if (bytes == null)
          throw new ArgumentNullException ("bytes");
      if (chars == null)
          throw new ArgumentNullException ("chars");
      if (byteIndex < 0 || byteIndex > bytes.Length)
          throw new ArgumentOutOfRangeException ("byteIndex", _("ArgRange_Array"));
      if (byteCount < 0 || byteCount > (bytes.Length - byteIndex))
          throw new ArgumentOutOfRangeException ("byteCount", _("ArgRange_Array"));
      if (charIndex < 0 || charIndex > chars.Length)
          throw new ArgumentOutOfRangeException ("charIndex", _("ArgRange_Array"));

      // Nothing to read and nowhere to write.
      if (byteCount == 0 && charIndex == chars.Length)
          return 0;

      int room = chars.Length - charIndex;
      fixed (char* dest = chars) {
          bool noInput = (byteCount == 0 || byteIndex == bytes.Length);
          if (noInput) {
              return InternalGetChars (null, 0, dest + charIndex, room, ref leftOverBits, ref leftOverCount, provider, ref fallbackBuffer, ref bufferArg, flush);
          }
          fixed (byte* src = bytes) {
              return InternalGetChars (src + byteIndex, byteCount, dest + charIndex, room, ref leftOverBits, ref leftOverCount, provider, ref fallbackBuffer, ref bufferArg, flush);
          }
      }
  }
  // Decodes "byteCount" UTF-8 bytes at "bytes" into at most "charCount"
  // UTF-16 code units at "chars" and returns the number of code units
  // written.
  //
  //   leftOverBits  - in/out: payload bits of an unfinished sequence.
  //   leftOverCount - in/out packed state: low nibble = bytes seen so far,
  //                   next nibble = total length of the unfinished sequence.
  //   provider      - a DecoderFallback or a Decoder, used to obtain the
  //                   fallback buffer on first need.
  //   fallbackBuffer/bufferArg - lazily created scratch objects for Fallback.
  //   flush         - when true, a sequence still unfinished at the end of
  //                   the input is replaced through the fallback.
  //
  // Throws ArgumentException ("chars") when a decoded character does not
  // fit. Invalid start bytes, broken, overlong, surrogate and out-of-range
  // sequences are replaced through Fallback.
  //
  // NOTE(review): Fallback writes its replacement characters without a
  // capacity check, and when an unfinished sequence was carried in the byte
  // range handed to it can start before "bytes" - confirm both with the
  // callers that keep decoder state.
  private unsafe static int InternalGetChars (
      byte* bytes, int byteCount, char* chars, int charCount,
      ref uint leftOverBits, ref uint leftOverCount,
      object provider,
      ref DecoderFallbackBuffer fallbackBuffer, ref byte [] bufferArg, bool flush)
  {
      int charIndex = 0, byteIndex = 0;
      int length = charCount;
      int posn = charIndex;
      if (leftOverCount == 0) {
          // Fast path: copy the leading run of ASCII bytes directly.
          int end = byteIndex + byteCount;
          for (; byteIndex < end; posn++, byteIndex++, byteCount--) {
              if (bytes [byteIndex] < 0x80) {
                  if (posn >= length) {
                      throw new ArgumentException (_("Arg_InsufficientSpace"), "chars");
                  }
                  chars [posn] = (char) bytes [byteIndex];
              } else {
                  break;
              }
          }
      }
      // Convert the bytes into the output buffer.
      uint ch;
      uint leftBits = leftOverBits;
      uint leftSoFar = (leftOverCount & (uint)0x0F);
      uint leftSize = ((leftOverCount >> 4) & (uint)0x0F);
      int byteEnd = byteIndex + byteCount;
      for(; byteIndex < byteEnd; byteIndex++) {
          // Fetch the next character from the byte buffer.
          ch = (uint)(bytes[byteIndex]);
          if (leftSize == 0) {
              // Process a UTF-8 start character.
              if (ch < (uint)0x0080) {
                  // Single-byte UTF-8 character.
                  if (posn >= length) {
                      throw new ArgumentException (_("Arg_InsufficientSpace"), "chars");
                  }
                  chars[posn++] = (char)ch;
              } else if ((ch & (uint)0xE0) == (uint)0xC0) {
                  // Double-byte UTF-8 character.
                  leftBits = (ch & (uint)0x1F);
                  leftSoFar = 1;
                  leftSize = 2;
              } else if ((ch & (uint)0xF0) == (uint)0xE0) {
                  // Three-byte UTF-8 character.
                  leftBits = (ch & (uint)0x0F);
                  leftSoFar = 1;
                  leftSize = 3;
              } else if ((ch & (uint)0xF8) == (uint)0xF0) {
                  // Four-byte UTF-8 character.
                  leftBits = (ch & (uint)0x07);
                  leftSoFar = 1;
                  leftSize = 4;
              } else if ((ch & (uint)0xFC) == (uint)0xF8) {
                  // Five-byte UTF-8 character.
                  leftBits = (ch & (uint)0x03);
                  leftSoFar = 1;
                  leftSize = 5;
              } else if ((ch & (uint)0xFE) == (uint)0xFC) {
                  // Six-byte UTF-8 character.
                  leftBits = (ch & (uint)0x03);
                  leftSoFar = 1;
                  leftSize = 6;
              } else {
                  // Invalid UTF-8 start character.
                  Fallback (provider, ref fallbackBuffer, ref bufferArg, bytes, byteIndex, 1, chars, ref posn);
              }
          } else {
              // Process an extra byte in a multi-byte sequence.
              if ((ch & (uint)0xC0) == (uint)0x80) {
                  leftBits = ((leftBits << 6) | (ch & (uint)0x3F));
                  if (++leftSoFar >= leftSize) {
                      // We have a complete character now. "byteIndex" still
                      // points at its last byte, so the sequence occupies
                      // [byteIndex + 1 - leftSoFar, byteIndex]. The rejected
                      // cases below previously passed "byteIndex -
                      // leftSoFar", which starts one byte too early (and
                      // before the buffer for a sequence at offset 0).
                      if (leftBits < (uint)0x10000) {
                          // is it an overlong ?
                          bool overlong = false;
                          switch (leftSize) {
                          case 2:
                              overlong = (leftBits <= 0x7F);
                              break;
                          case 3:
                              overlong = (leftBits <= 0x07FF);
                              break;
                          case 4:
                              overlong = (leftBits <= 0xFFFF);
                              break;
                          case 5:
                              overlong = (leftBits <= 0x1FFFFF);
                              break;
                          case 6:
                              overlong = (leftBits <= 0x03FFFFFF);
                              break;
                          }
                          if (overlong) {
                              Fallback (provider, ref fallbackBuffer, ref bufferArg, bytes, byteIndex + 1 - leftSoFar, leftSoFar, chars, ref posn);
                          }
                          else if ((leftBits & 0xF800) == 0xD800) {
                              // UTF-8 doesn't use surrogate characters
                              Fallback (provider, ref fallbackBuffer, ref bufferArg, bytes, byteIndex + 1 - leftSoFar, leftSoFar, chars, ref posn);
                          }
                          else {
                              if (posn >= length) {
                                  throw new ArgumentException
                                      (_("Arg_InsufficientSpace"), "chars");
                              }
                              chars[posn++] = (char)leftBits;
                          }
                      } else if (leftBits < (uint)0x110000) {
                          // Supplementary character: emit a surrogate pair.
                          if ((posn + 2) > length) {
                              throw new ArgumentException
                                  (_("Arg_InsufficientSpace"), "chars");
                          }
                          leftBits -= (uint)0x10000;
                          chars[posn++] = (char)((leftBits >> 10) +
                                                 (uint)0xD800);
                          chars[posn++] =
                              (char)((leftBits & (uint)0x3FF) + (uint)0xDC00);
                      } else {
                          Fallback (provider, ref fallbackBuffer, ref bufferArg, bytes, byteIndex + 1 - leftSoFar, leftSoFar, chars, ref posn);
                      }
                      leftSize = 0;
                  }
              } else {
                  // Invalid UTF-8 sequence: clear and restart.
                  // Here "byteIndex" points at the offending byte, so the
                  // unfinished sequence is [byteIndex - leftSoFar, byteIndex).
                  Fallback (provider, ref fallbackBuffer, ref bufferArg, bytes, byteIndex - leftSoFar, leftSoFar, chars, ref posn);
                  leftSize = 0;
                  // Step back so the loop increment re-examines this byte
                  // as a potential start byte.
                  --byteIndex;
              }
          }
      }
      if (flush && leftSize != 0) {
          // We had left-over bytes that didn't make up
          // a complete UTF-8 character sequence.
          Fallback (provider, ref fallbackBuffer, ref bufferArg, bytes, byteIndex - leftSoFar, leftSoFar, chars, ref posn);
      }
      // Hand the decoder state back to the caller.
      leftOverBits = leftBits;
      leftOverCount = (leftSoFar | (leftSize << 4));
      // Return the final length to the caller.
      return posn - charIndex;
  }
  // Decode byteCount bytes of "bytes", starting at byteIndex, into "chars"
  // starting at charIndex. The work is delegated to InternalGetChars and
  // its return value is handed back to the caller unchanged.
  public override int GetChars (byte[] bytes, int byteIndex, int byteCount,
                                char[] chars, int charIndex)
  {
      // One-shot conversion: there is no partial sequence carried in from a
      // previous call, and the final "true" argument requests a flush.
      DecoderFallbackBuffer fallbackBuffer = null;
      byte [] fallbackBytes = null;
      uint pendingBits = 0;
      uint pendingCount = 0;

      int produced = InternalGetChars (bytes, byteIndex, byteCount, chars, charIndex,
                                       ref pendingBits, ref pendingCount, DecoderFallback,
                                       ref fallbackBuffer, ref fallbackBytes, true);
      return produced;
  }
  // Pointer-based variant: decode byteCount bytes at "bytes" into the
  // buffer at "chars" (charCount is forwarded as given). Delegates to the
  // pointer overload of InternalGetChars and returns its result.
  [CLSCompliant (false)]
  [ComVisible (false)]
  public unsafe override int GetChars (byte* bytes, int byteCount, char* chars, int charCount)
  {
      // Stateless call: fresh (zeroed) left-over state, flush requested.
      uint pendingBits = 0;
      uint pendingCount = 0;
      byte [] fallbackBytes = null;
      DecoderFallbackBuffer fallbackBuffer = null;

      int produced = InternalGetChars (bytes, byteCount, chars, charCount,
                                       ref pendingBits, ref pendingCount, DecoderFallback,
                                       ref fallbackBuffer, ref fallbackBytes, true);
      return produced;
  }
  // Get the maximum number of bytes needed to encode a
  // specified number of characters (four bytes are budgeted per character).
  //
  // Throws ArgumentOutOfRangeException when charCount is negative, or when
  // the worst-case byte count does not fit in a 32-bit signed integer.
  public override int GetMaxByteCount (int charCount)
  {
      if (charCount < 0) {
          throw new ArgumentOutOfRangeException ("charCount", _("ArgRange_NonNegative"));
      }

      // Compute in 64 bits: a plain "charCount * 4" wraps around silently
      // for charCount > int.MaxValue / 4 and would hand the caller a
      // negative or far-too-small buffer size.
      long maxBytes = (long) charCount * 4;
      if (maxBytes > int.MaxValue) {
          throw new ArgumentOutOfRangeException ("charCount", _("ArgumentOutOfRange_GetByteCountOverflow"));
      }
      return (int) maxBytes;
  }
  // Get the maximum number of characters needed to decode a
  // specified number of bytes. The bound used here is one character
  // per byte. Negative counts are rejected with
  // ArgumentOutOfRangeException.
  public override int GetMaxCharCount (int byteCount)
  {
      if (byteCount >= 0) {
          return byteCount;
      }
      throw new ArgumentOutOfRangeException ("byteCount", _("ArgRange_NonNegative"));
  }
  // Create a new UTF8Decoder that uses this encoding's DecoderFallback.
  // A fresh instance is returned on every call.
  public override Decoder GetDecoder ()
  {
      Decoder decoder = new UTF8Decoder (DecoderFallback);
      return decoder;
  }
  // Create a new UTF8Encoder that uses this encoding's EncoderFallback;
  // the emitIdentifier setting is passed along to its constructor.
  // A fresh instance is returned on every call.
  public override Encoder GetEncoder ()
  {
      Encoder encoder = new UTF8Encoder (EncoderFallback, emitIdentifier);
      return encoder;
  }
  // Get the UTF8 preamble: the bytes EF BB BF when emitIdentifier is
  // set, otherwise EmptyArray<byte>.Value. A new array is allocated for
  // the non-empty case on each call.
  public override byte[] GetPreamble ()
  {
      if (!emitIdentifier)
          return EmptyArray<byte>.Value;

      byte [] preamble = new byte [] { 0xEF, 0xBB, 0xBF };
      return preamble;
  }
  // Determine if this object is equal to another. Two instances compare
  // equal when "value" is a UTF8Encoding with the same codePage, the same
  // emitIdentifier setting, and equal decoder and encoder fallbacks.
  public override bool Equals (Object value)
  {
      UTF8Encoding other = value as UTF8Encoding;
      if (other == null)
          return false;

      // Cheap field comparisons first, then the fallback objects,
      // in the same order as a short-circuiting && chain.
      if (codePage != other.codePage)
          return false;
      if (emitIdentifier != other.emitIdentifier)
          return false;
      if (!DecoderFallback.Equals (other.DecoderFallback))
          return false;
      return EncoderFallback.Equals (other.EncoderFallback);
  }
  // Get the hash code for this object. No UTF-8 specific state is mixed
  // in here; the value is whatever the base class computes.
  public override int GetHashCode ()
  {
      int hash = base.GetHashCode ();
      return hash;
  }
  // Count the bytes needed to encode a string. This override is a pure
  // pass-through to the base class implementation and adds no behaviour
  // of its own.
  public override int GetByteCount (string chars)
  {
      int byteCount = base.GetByteCount (chars);
      return byteCount;
  }
  // Decode "count" bytes starting at "index" into a string. This override
  // is a pure pass-through to the base class implementation and adds no
  // behaviour of its own.
  [ComVisible (false)]
  public override string GetString (byte [] bytes, int index, int count)
  {
      string decoded = base.GetString (bytes, index, count);
      return decoded;
  }
  // UTF-8 decoder implementation.
  //
  // Holds the left-over state that InternalGetChars reads and updates
  // through "ref" parameters, so a sequence split across two calls can
  // be continued. Both entry points pass flush = false.
  [Serializable]
  private class UTF8Decoder : Decoder
  {
      // Field names are kept as-is: the class is [Serializable].
      private uint leftOverBits;
      private uint leftOverCount;

      // Constructor: remember the fallback and start with no pending state.
      public UTF8Decoder (DecoderFallback fallback)
      {
          Fallback = fallback;
          leftOverBits = 0;
          leftOverCount = 0;
      }

      // Override inherited methods.

      // Count the characters that decoding would produce. The left-over
      // fields are passed by value, so counting does not modify them.
      public override int GetCharCount (byte[] bytes, int index, int count)
      {
          byte [] fallbackBytes = null;
          DecoderFallbackBuffer fallbackBuffer = null;

          int charCount = InternalGetCharCount (bytes, index, count,
                                                leftOverBits, leftOverCount, this,
                                                ref fallbackBuffer, ref fallbackBytes, false);
          return charCount;
      }

      // Decode into "chars". The left-over fields are passed by reference
      // so InternalGetChars can update them for the next call.
      public override int GetChars (byte[] bytes, int byteIndex,
                                    int byteCount, char[] chars, int charIndex)
      {
          byte [] fallbackBytes = null;
          DecoderFallbackBuffer fallbackBuffer = null;

          int produced = InternalGetChars (bytes, byteIndex, byteCount, chars, charIndex,
                                           ref leftOverBits, ref leftOverCount, this,
                                           ref fallbackBuffer, ref fallbackBytes, false);
          return produced;
      }
  } // class UTF8Decoder
  // UTF-8 encoder implementation.
  //
  // Keeps two separate left-over characters: one handed (by reference) to
  // the byte-counting routines and one handed to the conversion routines,
  // so that counting and converting do not share pending state.
  [Serializable]
  private class UTF8Encoder : Encoder
  {
      // Field names are kept as-is: the class is [Serializable].
      private char leftOverForCount;
      private char leftOverForConv;

      // Constructor. The emitIdentifier argument is accepted but not
      // stored or used by this class.
      public UTF8Encoder (EncoderFallback fallback, bool emitIdentifier)
      {
          Fallback = fallback;
          leftOverForCount = '\0';
          leftOverForConv = '\0';
      }

      // Override inherited methods.

      // Count the bytes needed for a slice of a char array.
      public override int GetByteCount (char[] chars, int index,
                                        int count, bool flush)
      {
          int byteCount = InternalGetByteCount (chars, index, count, Fallback,
                                                ref leftOverForCount, flush);
          return byteCount;
      }

      // Encode a slice of a char array into "bytes" starting at byteIndex.
      public override int GetBytes (char[] chars, int charIndex,
                                    int charCount, byte[] bytes, int byteIndex, bool flush)
      {
          EncoderFallbackBuffer fallbackBuffer = null;
          return InternalGetBytes (chars, charIndex, charCount, bytes, byteIndex,
                                   Fallback, ref fallbackBuffer, ref leftOverForConv, flush);
      }

      // Pointer-based variant of GetByteCount.
      public unsafe override int GetByteCount (char* chars, int count, bool flush)
      {
          int byteCount = InternalGetByteCount (chars, count, Fallback,
                                                ref leftOverForCount, flush);
          return byteCount;
      }

      // Pointer-based variant of GetBytes.
      public unsafe override int GetBytes (char* chars, int charCount,
                                           byte* bytes, int byteCount, bool flush)
      {
          EncoderFallbackBuffer fallbackBuffer = null;
          return InternalGetBytes (chars, charCount, bytes, byteCount,
                                   Fallback, ref fallbackBuffer, ref leftOverForConv, flush);
      }
  } // class UTF8Encoder
  927. }; // class UTF8Encoding
  928. }; // namespace System.Text