UTF8Encoding.cs 31 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003
  1. /*
  2. * UTF8Encoding.cs - Implementation of the "System.Text.UTF8Encoding" class.
  3. *
  4. * Copyright (c) 2001, 2002 Southern Storm Software, Pty Ltd
  5. * Copyright (C) 2004 Novell, Inc (http://www.novell.com)
  6. *
  7. * Permission is hereby granted, free of charge, to any person obtaining
  8. * a copy of this software and associated documentation files (the "Software"),
  9. * to deal in the Software without restriction, including without limitation
  10. * the rights to use, copy, modify, merge, publish, distribute, sublicense,
  11. * and/or sell copies of the Software, and to permit persons to whom the
  12. * Software is furnished to do so, subject to the following conditions:
  13. *
  14. * The above copyright notice and this permission notice shall be included
  15. * in all copies or substantial portions of the Software.
  16. *
  17. * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
  18. * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  19. * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
  20. * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
  21. * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
  22. * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
  23. * OTHER DEALINGS IN THE SOFTWARE.
  24. */
  25. namespace System.Text
  26. {
  27. using System;
  28. using System.Runtime.InteropServices;
  29. [Serializable]
  30. [MonoLimitation ("Serialization format not compatible with .NET")]
  31. [ComVisible (true)]
  32. public class UTF8Encoding : Encoding
  33. {
  // Magic number used by Windows for UTF-8.  It is the code page value
  // handed to the base Encoding constructor by this class.
  internal const int UTF8_CODE_PAGE = 65001;
  // Internal state: the "encoderShouldEmitUTF8Identifier" constructor
  // argument, later forwarded to the UTF8Encoder built by GetEncoder ().
  private bool emitIdentifier;
  // Constructors.

  // Default constructor: passes false for both options of the
  // two-argument constructor.
  public UTF8Encoding ()
      : this (false, false)
  {
  }

  // Chooses whether the identifier is emitted; passes false for
  // "throwOnInvalidBytes".
  public UTF8Encoding (bool encoderShouldEmitUTF8Identifier)
      : this (encoderShouldEmitUTF8Identifier, false)
  {
  }

  // Main constructor.  Registers the UTF-8 code page with the base
  // class, installs either the exception fallbacks (when
  // "throwOnInvalidBytes" is true) or the standard safe fallbacks, and
  // fills in the descriptive encoding properties.
  public UTF8Encoding (bool encoderShouldEmitUTF8Identifier, bool throwOnInvalidBytes)
      : base (UTF8_CODE_PAGE)
  {
      if (!throwOnInvalidBytes) {
          SetFallbackInternal (EncoderFallback.StandardSafeFallback, DecoderFallback.StandardSafeFallback);
      } else {
          SetFallbackInternal (EncoderFallback.ExceptionFallback, DecoderFallback.ExceptionFallback);
      }

      emitIdentifier = encoderShouldEmitUTF8Identifier;

      // Names reported for this encoding.
      header_name = "utf-8";
      body_name = "utf-8";
      web_name = "utf-8";
      encoding_name = "Unicode (UTF-8)";

      // Usable for both display and saving in browsers and mail/news.
      is_mail_news_save = true;
      is_mail_news_display = true;
      is_browser_display = true;
      is_browser_save = true;

      windows_code_page = UnicodeEncoding.UNICODE_CODE_PAGE;
  }
  58. #region GetByteCount()
  // Array front end of the byte counter; it can carry rolling state
  // between calls.  Validates "chars", "index" and "count", then pins
  // the array and defers to the pointer-based overload.
  //
  //   leftOver - surrogate start pending from an earlier call ('\0' if
  //              none).
  //   flush    - when true and the range starts at the end of the array,
  //              a pending surrogate start is counted as 3 bytes and
  //              cleared.
  private static int InternalGetByteCount (char[] chars, int index, int count, EncoderFallback fallback, ref char leftOver, bool flush)
  {
      if (chars == null)
          throw new ArgumentNullException ("chars");
      if (index < 0 || index > chars.Length)
          throw new ArgumentOutOfRangeException ("index", _("ArgRange_Array"));
      if (count < 0 || count > (chars.Length - index))
          throw new ArgumentOutOfRangeException ("count", _("ArgRange_Array"));

      if (index == chars.Length) {
          // No characters to scan; only a pending surrogate start can
          // contribute anything.
          if (!flush || leftOver == '\0')
              return 0;
          leftOver = '\0';
          return 3;
      }

      unsafe {
          fixed (char* pinned = chars) {
              return InternalGetByteCount (pinned + index, count, fallback, ref leftOver, flush);
          }
      }
  }
  // Pointer-based byte counter.  Walks "count" UTF-16 code units from
  // "chars" and returns how many UTF-8 bytes they need.
  //
  //   leftOver - surrogate start pending from an earlier call ('\0' if
  //              none); updated on return.
  //   flush    - when true, a surrogate start still pending at the end
  //              is counted as 3 bytes and cleared.
  //
  // Characters that cannot be paired are passed through the encoder
  // fallback and the bytes of the replacement text are counted instead.
  private unsafe static int InternalGetByteCount (char* chars, int count, EncoderFallback fallback, ref char leftOver, bool flush)
  {
      char* first = chars;
      char* limit = chars + count;
      EncoderFallbackBuffer fallbackBuffer = null;
      int total = 0;

      while (chars < limit) {
          if (leftOver != 0) {
              if (*chars >= '\uDC00' && *chars <= '\uDFFF') {
                  // The pending start and this tail form a valid pair.
                  total += 4;
                  chars++;
              } else {
                  // Surrogate start followed by something else: count
                  // the fallback output and look at this character
                  // again on the next pass (the pointer is not moved).
                  total += CountFallbackBytes (chars, first, fallback, ref fallbackBuffer);
              }
              leftOver = '\0';
          } else {
              for (; chars < limit; chars++) {
                  char c = *chars;
                  if (c < '\x80') {
                      total++;
                      continue;
                  }
                  if (c < '\x800') {
                      total += 2;
                      continue;
                  }
                  if (c < '\uD800' || c > '\uDFFF') {
                      total += 3;
                      continue;
                  }
                  if (c > '\uDBFF') {
                      // Surrogate tail with no leading surrogate.
                      total += CountFallbackBytes (chars, first, fallback, ref fallbackBuffer);
                      leftOver = '\0';
                      continue;
                  }

                  // Surrogate start: consume the tail right here when it
                  // is available in this buffer.
                  if (chars + 1 < limit && chars [1] >= '\uDC00' && chars [1] <= '\uDFFF') {
                      total += 4;
                      chars++;
                      continue;
                  }

                  // Otherwise remember it and let the outer loop decide.
                  leftOver = c;
                  chars++;
                  break;
              }
          }
      }

      if (flush && leftOver != '\0') {
          // Flush the left-over surrogate pair start.
          total += 3;
          leftOver = '\0';
      }
      return total;
  }

  // Runs the encoder fallback for the character at "chars" and returns
  // the number of UTF-8 bytes needed by the replacement characters.
  private unsafe static int CountFallbackBytes (char* chars, char* start, EncoderFallback fallback, ref EncoderFallbackBuffer buffer)
  {
      char [] replacement = GetFallbackChars (chars, start, fallback, ref buffer);
      fixed (char* rp = replacement) {
          char scratch = '\0';
          return InternalGetByteCount (rp, replacement.Length, fallback, ref scratch, true);
      }
  }
  // Hands the character at "chars" to the encoder fallback and returns
  // the replacement characters it produced.  "buffer" is created from
  // "fallback" on first use and is reset before returning; the index
  // reported to the fallback is the offset of "chars" from "start".
  unsafe static char [] GetFallbackChars (char *chars, char *start, EncoderFallback fallback, ref EncoderFallbackBuffer buffer)
  {
      if (buffer == null) {
          buffer = fallback.CreateFallbackBuffer ();
      }

      int offset = (int) (chars - start);
      buffer.Fallback (*chars, offset);

      char [] result = new char [buffer.Remaining];
      int filled = 0;
      while (filled < result.Length) {
          result [filled] = buffer.GetNextChar ();
          filled++;
      }

      buffer.Reset ();
      return result;
  }
  // Get the number of bytes needed to encode a character buffer.
  // This is a one-shot count: no pending surrogate is carried in and
  // flushing is requested.
  public override int GetByteCount (char[] chars, int index, int count)
  {
      char pending = '\0';
      int byteCount = InternalGetByteCount (chars, index, count, EncoderFallback, ref pending, true);
      return byteCount;
  }
  // Pointer variant of GetByteCount.  Throws ArgumentNullException when
  // "chars" is null and returns 0 for an empty range; otherwise performs
  // a one-shot, flushing count.
  [CLSCompliant (false)]
  [ComVisible (false)]
  public unsafe override int GetByteCount (char* chars, int count)
  {
      if (chars == null) {
          throw new ArgumentNullException ("chars");
      }
      if (count != 0) {
          char pending = '\0';
          return InternalGetByteCount (chars, count, EncoderFallback, ref pending, true);
      }
      return 0;
  }
  181. #endregion
  182. #region GetBytes()
  // Array front end of the encoder; it can carry rolling state between
  // calls.  Validates every argument, then pins the arrays and defers
  // to the pointer-based overload, giving it the space that remains in
  // "bytes" after "byteIndex".
  //
  //   leftOver - surrogate start pending from an earlier call ('\0' if
  //              none).
  //   flush    - whether pending state is to be flushed.
  private static int InternalGetBytes (char[] chars, int charIndex,
                                       int charCount, byte[] bytes,
                                       int byteIndex,
                                       EncoderFallback fallback, ref EncoderFallbackBuffer buffer,
                                       ref char leftOver, bool flush)
  {
      if (chars == null)
          throw new ArgumentNullException ("chars");
      if (bytes == null)
          throw new ArgumentNullException ("bytes");
      if (charIndex < 0 || charIndex > chars.Length)
          throw new ArgumentOutOfRangeException ("charIndex", _("ArgRange_Array"));
      if (charCount < 0 || charCount > (chars.Length - charIndex))
          throw new ArgumentOutOfRangeException ("charCount", _("ArgRange_Array"));
      if (byteIndex < 0 || byteIndex > bytes.Length)
          throw new ArgumentOutOfRangeException ("byteIndex", _("ArgRange_Array"));

      if (charIndex == chars.Length) {
          // FIXME: use EncoderFallback.
          //
          // By default it is empty, so a pending surrogate start is
          // simply dropped on flush and nothing is written.
          if (flush)
              leftOver = '\0';
          return 0;
      }

      int room = bytes.Length - byteIndex;
      unsafe {
          fixed (char* cptr = chars) {
              // With no output space at all, hand over a null
              // destination of size zero.
              if (room == 0)
                  return InternalGetBytes (cptr + charIndex, charCount, null, 0,
                                           fallback, ref buffer, ref leftOver, flush);

              fixed (byte* bptr = bytes) {
                  return InternalGetBytes (cptr + charIndex, charCount, bptr + byteIndex, room,
                                           fallback, ref buffer, ref leftOver, flush);
              }
          }
      }
  }
  // Pointer-based UTF-8 encoder core.  Encodes "count" UTF-16 code units
  // starting at "chars" into at most "bcount" bytes starting at "bytes"
  // and returns the number of bytes written.
  //
  //   fallback - encoder fallback used for unpaired surrogates.
  //   buffer   - fallback buffer; created on first use by
  //              GetFallbackChars ().
  //   leftOver - surrogate start pending from an earlier call ('\0' if
  //              none); updated on return.
  //   flush    - when true, a surrogate start still pending at the end
  //              of the input is written as a 3-byte sequence and
  //              cleared.
  //
  // Throws ArgumentException ("bytes") when the output does not fit.
  private unsafe static int InternalGetBytes (char* chars, int count, byte* bytes, int bcount, EncoderFallback fallback, ref EncoderFallbackBuffer buffer, ref char leftOver, bool flush)
  {
      char* end = chars + count;
      char* start = chars;
      byte* start_bytes = bytes;
      byte* end_bytes = bytes + bcount;

      while (chars < end) {
          if (leftOver == 0) {
              for (; chars < end; chars++) {
                  int ch = *chars;
                  if (ch < '\x80') {
                      // One byte.
                      if (bytes >= end_bytes)
                          goto fail_no_space;
                      *bytes++ = (byte)ch;
                  } else if (ch < '\x800') {
                      // Two bytes.
                      if (bytes + 1 >= end_bytes)
                          goto fail_no_space;
                      bytes [0] = (byte) (0xC0 | (ch >> 6));
                      bytes [1] = (byte) (0x80 | (ch & 0x3F));
                      bytes += 2;
                  } else if (ch < '\uD800' || ch > '\uDFFF') {
                      // Three bytes (non-surrogate).
                      if (bytes + 2 >= end_bytes)
                          goto fail_no_space;
                      bytes [0] = (byte) (0xE0 | (ch >> 12));
                      bytes [1] = (byte) (0x80 | ((ch >> 6) & 0x3F));
                      bytes [2] = (byte) (0x80 | (ch & 0x3F));
                      bytes += 3;
                  } else if (ch <= '\uDBFF') {
                      // This is a surrogate start: remember it and let
                      // the outer loop pair it with the next character.
                      leftOver = *chars;
                      chars++;
                      break;
                  } else {
                      // We have a surrogate tail without
                      // leading surrogate.
                      char [] fallback_chars = GetFallbackChars (chars, start, fallback, ref buffer);
                      char dummy = '\0';
                      if (bytes + InternalGetByteCount (fallback_chars, 0, fallback_chars.Length, fallback, ref dummy, true) > end_bytes)
                          goto fail_no_space;
                      fixed (char *fb_chars = fallback_chars) {
                          bytes += InternalGetBytes (fb_chars, fallback_chars.Length, bytes, bcount - (int) (bytes - start_bytes), fallback, ref buffer, ref dummy, true);
                      }
                      leftOver = '\0';
                  }
              }
          } else {
              if (*chars >= '\uDC00' && *chars <= '\uDFFF') {
                  // We have a correct surrogate pair.
                  int ch = 0x10000 + (int) *chars - 0xDC00 + (((int) leftOver - 0xD800) << 10);
                  if (bytes + 3 >= end_bytes)
                      goto fail_no_space;
                  bytes [0] = (byte) (0xF0 | (ch >> 18));
                  bytes [1] = (byte) (0x80 | ((ch >> 12) & 0x3F));
                  bytes [2] = (byte) (0x80 | ((ch >> 6) & 0x3F));
                  bytes [3] = (byte) (0x80 | (ch & 0x3F));
                  bytes += 4;
                  chars++;
              } else {
                  // We have a surrogate start followed by a
                  // regular character.  Technically, this is
                  // invalid, but we have to do something.
                  // We write out the fallback text and then
                  // re-visit the current character again.
                  //
                  // NOTE(review): the fallback is handed the current
                  // character (GetFallbackChars reads *chars), not the
                  // pending surrogate start - confirm that is intended.
                  char [] fallback_chars = GetFallbackChars (chars, start, fallback, ref buffer);
                  char dummy = '\0';
                  if (bytes + InternalGetByteCount (fallback_chars, 0, fallback_chars.Length, fallback, ref dummy, true) > end_bytes)
                      goto fail_no_space;
                  fixed (char *fb_chars = fallback_chars) {
                      // Advance past the fallback output.  Without this
                      // the bytes just written are overwritten by the
                      // next character and the returned length no longer
                      // agrees with InternalGetByteCount ().
                      bytes += InternalGetBytes (fb_chars, fallback_chars.Length, bytes, bcount - (int) (bytes - start_bytes), fallback, ref buffer, ref dummy, true);
                  }
              }
              leftOver = '\0';
          }
      }

      if (flush) {
          // Flush the left-over surrogate pair start.
          if (leftOver != '\0') {
              int ch = leftOver;
              if (bytes + 2 < end_bytes) {
                  bytes [0] = (byte) (0xE0 | (ch >> 12));
                  bytes [1] = (byte) (0x80 | ((ch >> 6) & 0x3F));
                  bytes [2] = (byte) (0x80 | (ch & 0x3F));
                  bytes += 3;
              } else {
                  goto fail_no_space;
              }
              leftOver = '\0';
          }
      }

      // Number of bytes produced by this call.
      return (int) (bytes - start_bytes);

  fail_no_space:
      throw new ArgumentException ("Insufficient Space", "bytes");
  }
  // Get the bytes that result from encoding a character buffer.
  // One-shot conversion: starts with no pending surrogate and no
  // fallback buffer, and asks for a flush.
  public override int GetBytes (char[] chars, int charIndex, int charCount,
                                byte[] bytes, int byteIndex)
  {
      EncoderFallbackBuffer fallbackBuffer = null;
      char pending = '\0';
      int written = InternalGetBytes (chars, charIndex, charCount, bytes, byteIndex,
                                      EncoderFallback, ref fallbackBuffer, ref pending, true);
      return written;
  }
  // Convenience wrapper for "GetBytes" that reads from a string.
  // Validates the arguments, returns 0 when the range starts at the end
  // of the string, and otherwise pins the string and the output array
  // and runs a one-shot, flushing conversion.
  public override int GetBytes (String s, int charIndex, int charCount,
                                byte[] bytes, int byteIndex)
  {
      if (s == null)
          throw new ArgumentNullException ("s");
      if (bytes == null)
          throw new ArgumentNullException ("bytes");
      if (charIndex < 0 || charIndex > s.Length)
          throw new ArgumentOutOfRangeException ("charIndex", _("ArgRange_StringIndex"));
      if (charCount < 0 || charCount > (s.Length - charIndex))
          throw new ArgumentOutOfRangeException ("charCount", _("ArgRange_StringRange"));
      if (byteIndex < 0 || byteIndex > bytes.Length)
          throw new ArgumentOutOfRangeException ("byteIndex", _("ArgRange_Array"));

      if (charIndex == s.Length)
          return 0;

      int room = bytes.Length - byteIndex;
      char pending = '\0';
      EncoderFallbackBuffer fallbackBuffer = null;

      unsafe {
          fixed (char* text = s) {
              // With no output space at all, hand over a null
              // destination of size zero.
              if (room == 0)
                  return InternalGetBytes (text + charIndex, charCount, null, 0,
                                           EncoderFallback, ref fallbackBuffer, ref pending, true);

              fixed (byte* output = bytes) {
                  return InternalGetBytes (text + charIndex, charCount, output + byteIndex, room,
                                           EncoderFallback, ref fallbackBuffer, ref pending, true);
              }
          }
      }
  }
  // Pointer variant of GetBytes.  Encodes "charCount" characters from
  // "chars" into at most "byteCount" bytes at "bytes" and returns the
  // number of bytes written (0 when "charCount" is 0).
  //
  // Throws ArgumentNullException when "chars" or "bytes" is null and
  // IndexOutOfRangeException when "charCount" or "byteCount" is
  // negative.  The exception type is kept as it was for existing
  // callers; only the text of the "byteCount" failure is corrected (it
  // used to say "charCount").
  [CLSCompliant (false)]
  [ComVisible (false)]
  public unsafe override int GetBytes (char* chars, int charCount, byte* bytes, int byteCount)
  {
      if (chars == null)
          throw new ArgumentNullException ("chars");
      if (charCount < 0)
          throw new IndexOutOfRangeException ("charCount");
      if (bytes == null)
          throw new ArgumentNullException ("bytes");
      if (byteCount < 0)
          throw new IndexOutOfRangeException ("byteCount");

      if (charCount == 0)
          return 0;

      char dummy = '\0';
      EncoderFallbackBuffer buffer = null;
      // With no output space the core routine receives a null
      // destination of size zero.
      if (byteCount == 0)
          return InternalGetBytes (chars, charCount, null, 0, EncoderFallback, ref buffer, ref dummy, true);
      else
          return InternalGetBytes (chars, charCount, bytes, byteCount, EncoderFallback, ref buffer, ref dummy, true);
  }
  395. #endregion
  // Array front end of the character counter; it can carry rolling
  // state between calls.  Validates "bytes", "index" and "count",
  // returns 0 for an empty range, and otherwise pins the array and
  // defers to the pointer-based overload.
  private unsafe static int InternalGetCharCount (
      byte[] bytes, int index, int count, uint leftOverBits,
      uint leftOverCount, object provider,
      ref DecoderFallbackBuffer fallbackBuffer, ref byte [] bufferArg, bool flush)
  {
      if (bytes == null)
          throw new ArgumentNullException ("bytes");
      if (index < 0 || index > bytes.Length)
          throw new ArgumentOutOfRangeException ("index", _("ArgRange_Array"));
      if (count < 0 || count > (bytes.Length - index))
          throw new ArgumentOutOfRangeException ("count", _("ArgRange_Array"));

      if (count != 0) {
          fixed (byte* pinned = bytes) {
              return InternalGetCharCount (pinned + index, count,
                  leftOverBits, leftOverCount, provider, ref fallbackBuffer, ref bufferArg, flush);
          }
      }
      return 0;
  }
  // Pointer-based character counter.  Scans "count" bytes from "bytes"
  // and returns how many UTF-16 code units decoding them produces,
  // including the characters generated by the decoder fallback for
  // invalid input.
  //
  //   leftOverBits  - bits accumulated for a sequence begun in an
  //                   earlier call.
  //   leftOverCount - low nibble: bytes of that sequence seen so far;
  //                   next nibble: its total size (0 when none pending).
  //   provider      - a DecoderFallback or a Decoder; used by Fallback ()
  //                   to obtain the fallback buffer.
  //   flush         - when true, an incomplete trailing sequence is
  //                   passed to the fallback.
  private unsafe static int InternalGetCharCount (
      byte* bytes, int count, uint leftOverBits,
      uint leftOverCount, object provider,
      ref DecoderFallbackBuffer fallbackBuffer, ref byte [] bufferArg, bool flush)
  {
      int index = 0;
      int length = 0;

      // Fast path: with no pending state, a leading run of ASCII bytes
      // is one character per byte.
      if (leftOverCount == 0) {
          int end = index + count;
          for (; index < end; index++, count--) {
              if (bytes [index] < 0x80)
                  length++;
              else
                  break;
          }
      }

      // Determine the number of characters that we have.
      uint ch;
      uint leftBits = leftOverBits;
      uint leftSoFar = (leftOverCount & (uint)0x0F);
      uint leftSize = ((leftOverCount >> 4) & (uint)0x0F);
      while (count > 0) {
          ch = (uint)(bytes[index++]);
          --count;
          if (leftSize == 0) {
              // Process a UTF-8 start character.
              if (ch < (uint)0x0080) {
                  // Single-byte UTF-8 character.
                  ++length;
              } else if ((ch & (uint)0xE0) == (uint)0xC0) {
                  // Double-byte UTF-8 character.
                  leftBits = (ch & (uint)0x1F);
                  leftSoFar = 1;
                  leftSize = 2;
              } else if ((ch & (uint)0xF0) == (uint)0xE0) {
                  // Three-byte UTF-8 character.
                  leftBits = (ch & (uint)0x0F);
                  leftSoFar = 1;
                  leftSize = 3;
              } else if ((ch & (uint)0xF8) == (uint)0xF0) {
                  // Four-byte UTF-8 character.
                  leftBits = (ch & (uint)0x07);
                  leftSoFar = 1;
                  leftSize = 4;
              } else if ((ch & (uint)0xFC) == (uint)0xF8) {
                  // Five-byte UTF-8 character.
                  leftBits = (ch & (uint)0x03);
                  leftSoFar = 1;
                  leftSize = 5;
              } else if ((ch & (uint)0xFE) == (uint)0xFC) {
                  // Six-byte UTF-8 character.
                  leftBits = (ch & (uint)0x03);
                  leftSoFar = 1;
                  leftSize = 6;
              } else {
                  // Invalid UTF-8 start character.
                  length += Fallback (provider, ref fallbackBuffer, ref bufferArg, bytes, index - 1, 1);
              }
          } else {
              // Process an extra byte in a multi-byte sequence.
              if ((ch & (uint)0xC0) == (uint)0x80) {
                  leftBits = ((leftBits << 6) | (ch & (uint)0x3F));
                  if (++leftSoFar >= leftSize) {
                      // We have a complete character now.  "index" is
                      // already past its last byte, so the sequence
                      // starts at index - leftSoFar.
                      if (leftBits < (uint)0x10000) {
                          // is it an overlong ?
                          bool overlong = false;
                          switch (leftSize) {
                          case 2:
                              overlong = (leftBits <= 0x7F);
                              break;
                          case 3:
                              overlong = (leftBits <= 0x07FF);
                              break;
                          case 4:
                              overlong = (leftBits <= 0xFFFF);
                              break;
                          case 5:
                              overlong = (leftBits <= 0x1FFFFF);
                              break;
                          case 6:
                              overlong = (leftBits <= 0x03FFFFFF);
                              break;
                          }
                          if (overlong) {
                              length += Fallback (provider, ref fallbackBuffer, ref bufferArg, bytes, index - leftSoFar, leftSoFar);
                          }
                          else if ((leftBits & 0xF800) == 0xD800) {
                              // UTF-8 doesn't use surrogate characters
                              length += Fallback (provider, ref fallbackBuffer, ref bufferArg, bytes, index - leftSoFar, leftSoFar);
                          }
                          else
                              ++length;
                      } else if (leftBits < (uint)0x110000) {
                          // Supplementary character: a surrogate pair.
                          length += 2;
                      } else {
                          length += Fallback (provider, ref fallbackBuffer, ref bufferArg, bytes, index - leftSoFar, leftSoFar);
                      }
                      leftSize = 0;
                  }
              } else {
                  // Invalid UTF-8 sequence: clear and restart.
                  // Un-read the offending byte first so that "index"
                  // points at it; the broken sequence is then the
                  // leftSoFar bytes just before it.  (Computing the start
                  // before stepping back was off by one: it reported the
                  // offending byte and skipped the lead byte.)
                  --index;
                  ++count;
                  // NOTE(review): when the sequence began in an earlier
                  // call the start index is negative here; those bytes
                  // are not part of this buffer - confirm how callers
                  // with carried-over state are expected to behave.
                  length += Fallback (provider, ref fallbackBuffer, ref bufferArg, bytes, index - leftSoFar, leftSoFar);
                  leftSize = 0;
              }
          }
      }

      if (flush && leftSize != 0) {
          // We had left-over bytes that didn't make up
          // a complete UTF-8 character sequence.
          length += Fallback (provider, ref fallbackBuffer, ref bufferArg, bytes, index - leftSoFar, leftSoFar);
      }

      // Return the final length to the caller.
      return length;
  }
  // for GetCharCount()
  // Passes "size" bytes, starting at bytes [index], to the decoder
  // fallback one byte at a time and returns the total number of
  // replacement characters reported.  "buffer" is obtained from
  // "provider" (a DecoderFallback, otherwise a Decoder) on first use and
  // "bufferArg" is a one-byte scratch array allocated on first use.
  static unsafe int Fallback (object provider, ref DecoderFallbackBuffer buffer, ref byte [] bufferArg, byte* bytes, long index, uint size)
  {
      if (buffer == null) {
          DecoderFallback asFallback = provider as DecoderFallback;
          buffer = (asFallback != null)
              ? asFallback.CreateFallbackBuffer ()
              : ((Decoder) provider).FallbackBuffer;
      }
      if (bufferArg == null) {
          bufferArg = new byte [1];
      }

      int produced = 0;
      int i = 0;
      while (i < size) {
          bufferArg [0] = bytes [(int) index + i];
          buffer.Fallback (bufferArg, 0);
          produced += buffer.Remaining;
          buffer.Reset ();
          i++;
      }
      return produced;
  }
  // for GetChars()
  // Passes "size" bytes, starting at bytes [byteIndex], to the decoder
  // fallback one byte at a time and appends the replacement characters
  // to "chars" at "charIndex", advancing "charIndex".  "buffer" is
  // obtained from "provider" (a DecoderFallback, otherwise a Decoder) on
  // first use and "bufferArg" is a one-byte scratch array allocated on
  // first use.  No capacity check is made on "chars" here.
  static unsafe void Fallback (object provider, ref DecoderFallbackBuffer buffer, ref byte [] bufferArg, byte* bytes, long byteIndex, uint size,
                               char* chars, ref int charIndex)
  {
      if (buffer == null) {
          DecoderFallback asFallback = provider as DecoderFallback;
          buffer = (asFallback != null)
              ? asFallback.CreateFallbackBuffer ()
              : ((Decoder) provider).FallbackBuffer;
      }
      if (bufferArg == null) {
          bufferArg = new byte [1];
      }

      int i = 0;
      while (i < size) {
          bufferArg [0] = bytes [byteIndex + i];
          buffer.Fallback (bufferArg, 0);
          while (buffer.Remaining > 0) {
              chars [charIndex] = buffer.GetNextChar ();
              charIndex++;
          }
          buffer.Reset ();
          i++;
      }
  }
  // Get the number of characters needed to decode a byte buffer.
  // One-shot count: no pending bits or bytes are carried in and
  // flushing is requested.
  public override int GetCharCount (byte[] bytes, int index, int count)
  {
      byte [] scratch = null;
      DecoderFallbackBuffer fallbackBuffer = null;
      int charCount = InternalGetCharCount (bytes, index, count, 0, 0, DecoderFallback, ref fallbackBuffer, ref scratch, true);
      return charCount;
  }
  // Pointer variant of GetCharCount: a one-shot, flushing count over
  // "count" bytes at "bytes".  The arguments are passed through as
  // given; no validation is done here.
  [CLSCompliant (false)]
  [ComVisible (false)]
  public unsafe override int GetCharCount (byte* bytes, int count)
  {
      byte [] scratch = null;
      DecoderFallbackBuffer fallbackBuffer = null;
      int charCount = InternalGetCharCount (bytes, count, 0, 0, DecoderFallback, ref fallbackBuffer, ref scratch, true);
      return charCount;
  }
  // Array front end of the decoder.  Validates the arguments, returns 0
  // when "charIndex" is at the end of "chars", and otherwise pins the
  // arrays and defers to the pointer-based overload, giving it the
  // space that remains in "chars" after "charIndex".
  private unsafe static int InternalGetChars (
      byte[] bytes, int byteIndex, int byteCount, char[] chars,
      int charIndex, ref uint leftOverBits, ref uint leftOverCount,
      object provider,
      ref DecoderFallbackBuffer fallbackBuffer, ref byte [] bufferArg, bool flush)
  {
      if (bytes == null)
          throw new ArgumentNullException ("bytes");
      if (chars == null)
          throw new ArgumentNullException ("chars");
      if (byteIndex < 0 || byteIndex > bytes.Length)
          throw new ArgumentOutOfRangeException ("byteIndex", _("ArgRange_Array"));
      if (byteCount < 0 || byteCount > (bytes.Length - byteIndex))
          throw new ArgumentOutOfRangeException ("byteCount", _("ArgRange_Array"));
      if (charIndex < 0 || charIndex > chars.Length)
          throw new ArgumentOutOfRangeException ("charIndex", _("ArgRange_Array"));

      if (charIndex == chars.Length)
          return 0;

      int room = chars.Length - charIndex;
      bool nothingToRead = (byteCount == 0 || byteIndex == bytes.Length);

      fixed (char* cptr = chars) {
          // With no input, still call the core routine (with a null
          // source) so that pending state can be flushed.
          if (nothingToRead)
              return InternalGetChars (null, 0, cptr + charIndex, room,
                  ref leftOverBits, ref leftOverCount, provider, ref fallbackBuffer, ref bufferArg, flush);

          fixed (byte* bptr = bytes) {
              return InternalGetChars (bptr + byteIndex, byteCount, cptr + charIndex, room,
                  ref leftOverBits, ref leftOverCount, provider, ref fallbackBuffer, ref bufferArg, flush);
          }
      }
  }
  // Pointer-based UTF-8 decoder core.  Decodes "byteCount" bytes from
  // "bytes" into at most "charCount" UTF-16 code units at "chars" and
  // returns the number of code units written.
  //
  //   leftOverBits  - bits accumulated for a sequence begun in an
  //                   earlier call; updated on return.
  //   leftOverCount - low nibble: bytes of that sequence seen so far;
  //                   next nibble: its total size; updated on return.
  //   provider      - a DecoderFallback or a Decoder; used by Fallback ()
  //                   to obtain the fallback buffer.
  //   flush         - when true, an incomplete trailing sequence is
  //                   passed to the fallback.
  //
  // Throws ArgumentException ("chars") when a decoded character does not
  // fit.  Characters produced by Fallback () are written without a
  // capacity check.
  private unsafe static int InternalGetChars (
      byte* bytes, int byteCount, char* chars, int charCount,
      ref uint leftOverBits, ref uint leftOverCount,
      object provider,
      ref DecoderFallbackBuffer fallbackBuffer, ref byte [] bufferArg, bool flush)
  {
      int charIndex = 0, byteIndex = 0;
      int length = charCount;
      int posn = charIndex;

      // Fast path: with no pending state, copy a leading run of ASCII
      // bytes directly.
      if (leftOverCount == 0) {
          int end = byteIndex + byteCount;
          for (; byteIndex < end; posn++, byteIndex++, byteCount--) {
              if (bytes [byteIndex] >= 0x80)
                  break;
              // The output capacity must be checked here as well;
              // otherwise a long ASCII run writes past the end of
              // "chars".
              if (posn >= length) {
                  throw new ArgumentException (_("Arg_InsufficientSpace"), "chars");
              }
              chars [posn] = (char) bytes [byteIndex];
          }
      }

      // Convert the bytes into the output buffer.
      uint ch;
      uint leftBits = leftOverBits;
      uint leftSoFar = (leftOverCount & (uint)0x0F);
      uint leftSize = ((leftOverCount >> 4) & (uint)0x0F);
      int byteEnd = byteIndex + byteCount;
      for(; byteIndex < byteEnd; byteIndex++) {
          // Fetch the next character from the byte buffer.
          ch = (uint)(bytes[byteIndex]);
          if (leftSize == 0) {
              // Process a UTF-8 start character.
              if (ch < (uint)0x0080) {
                  // Single-byte UTF-8 character.
                  if (posn >= length) {
                      throw new ArgumentException (_("Arg_InsufficientSpace"), "chars");
                  }
                  chars[posn++] = (char)ch;
              } else if ((ch & (uint)0xE0) == (uint)0xC0) {
                  // Double-byte UTF-8 character.
                  leftBits = (ch & (uint)0x1F);
                  leftSoFar = 1;
                  leftSize = 2;
              } else if ((ch & (uint)0xF0) == (uint)0xE0) {
                  // Three-byte UTF-8 character.
                  leftBits = (ch & (uint)0x0F);
                  leftSoFar = 1;
                  leftSize = 3;
              } else if ((ch & (uint)0xF8) == (uint)0xF0) {
                  // Four-byte UTF-8 character.
                  leftBits = (ch & (uint)0x07);
                  leftSoFar = 1;
                  leftSize = 4;
              } else if ((ch & (uint)0xFC) == (uint)0xF8) {
                  // Five-byte UTF-8 character.
                  leftBits = (ch & (uint)0x03);
                  leftSoFar = 1;
                  leftSize = 5;
              } else if ((ch & (uint)0xFE) == (uint)0xFC) {
                  // Six-byte UTF-8 character.
                  leftBits = (ch & (uint)0x03);
                  leftSoFar = 1;
                  leftSize = 6;
              } else {
                  // Invalid UTF-8 start character.
                  Fallback (provider, ref fallbackBuffer, ref bufferArg, bytes, byteIndex, 1, chars, ref posn);
              }
          } else {
              // Process an extra byte in a multi-byte sequence.
              if ((ch & (uint)0xC0) == (uint)0x80) {
                  leftBits = ((leftBits << 6) | (ch & (uint)0x3F));
                  if (++leftSoFar >= leftSize) {
                      // We have a complete character now.  "byteIndex"
                      // still points at the last byte of the sequence,
                      // so the sequence starts leftSoFar - 1 bytes
                      // earlier.  (Using byteIndex - leftSoFar started
                      // one byte too early and read bytes [-1] for a
                      // sequence at the start of the buffer.)
                      long sequenceStart = byteIndex - leftSoFar + 1;
                      if (leftBits < (uint)0x10000) {
                          // is it an overlong ?
                          bool overlong = false;
                          switch (leftSize) {
                          case 2:
                              overlong = (leftBits <= 0x7F);
                              break;
                          case 3:
                              overlong = (leftBits <= 0x07FF);
                              break;
                          case 4:
                              overlong = (leftBits <= 0xFFFF);
                              break;
                          case 5:
                              overlong = (leftBits <= 0x1FFFFF);
                              break;
                          case 6:
                              overlong = (leftBits <= 0x03FFFFFF);
                              break;
                          }
                          if (overlong) {
                              Fallback (provider, ref fallbackBuffer, ref bufferArg, bytes, sequenceStart, leftSoFar, chars, ref posn);
                          }
                          else if ((leftBits & 0xF800) == 0xD800) {
                              // UTF-8 doesn't use surrogate characters
                              Fallback (provider, ref fallbackBuffer, ref bufferArg, bytes, sequenceStart, leftSoFar, chars, ref posn);
                          }
                          else {
                              if (posn >= length) {
                                  throw new ArgumentException
                                      (_("Arg_InsufficientSpace"), "chars");
                              }
                              chars[posn++] = (char)leftBits;
                          }
                      } else if (leftBits < (uint)0x110000) {
                          // Supplementary character: emit a surrogate
                          // pair.
                          if ((posn + 2) > length) {
                              throw new ArgumentException
                                  (_("Arg_InsufficientSpace"), "chars");
                          }
                          leftBits -= (uint)0x10000;
                          chars[posn++] = (char)((leftBits >> 10) +
                                                 (uint)0xD800);
                          chars[posn++] =
                              (char)((leftBits & (uint)0x3FF) + (uint)0xDC00);
                      } else {
                          Fallback (provider, ref fallbackBuffer, ref bufferArg, bytes, sequenceStart, leftSoFar, chars, ref posn);
                      }
                      leftSize = 0;
                  }
              } else {
                  // Invalid UTF-8 sequence: clear and restart.  The
                  // byte at "byteIndex" is the offending one, so the
                  // broken sequence is the leftSoFar bytes before it;
                  // step back so the loop visits this byte again.
                  Fallback (provider, ref fallbackBuffer, ref bufferArg, bytes, byteIndex - leftSoFar, leftSoFar, chars, ref posn);
                  leftSize = 0;
                  --byteIndex;
              }
          }
      }

      if (flush && leftSize != 0) {
          // We had left-over bytes that didn't make up
          // a complete UTF-8 character sequence.
          Fallback (provider, ref fallbackBuffer, ref bufferArg, bytes, byteIndex - leftSoFar, leftSoFar, chars, ref posn);
      }

      // Hand the rolling state back to the caller.
      leftOverBits = leftBits;
      leftOverCount = (leftSoFar | (leftSize << 4));

      // Return the final length to the caller.
      return posn - charIndex;
  }
  // Get the characters that result from decoding a byte buffer.
  // One-shot conversion: starts with no pending bits or bytes and asks
  // for a flush.
  public override int GetChars (byte[] bytes, int byteIndex, int byteCount,
                                char[] chars, int charIndex)
  {
      uint pendingBits = 0;
      uint pendingCount = 0;
      byte [] scratch = null;
      DecoderFallbackBuffer fallbackBuffer = null;
      int written = InternalGetChars (bytes, byteIndex, byteCount, chars, charIndex,
          ref pendingBits, ref pendingCount, DecoderFallback, ref fallbackBuffer, ref scratch, true);
      return written;
  }
  // Pointer-based overload of GetChars: forwards the raw buffers and
  // their sizes to InternalGetChars and returns its result.
  [CLSCompliant (false)]
  [ComVisible (false)]
  public unsafe override int GetChars (byte* bytes, int byteCount, char* chars, int charCount)
  {
      // Stateless call: fresh (zeroed) left-over state, no fallback
      // buffer yet, and the flush argument is always true.
      uint pendingBits = 0;
      uint pendingCount = 0;
      byte [] fallbackBytes = null;
      DecoderFallbackBuffer fallbackBuffer = null;

      int result = InternalGetChars (
          bytes, byteCount,
          chars, charCount,
          ref pendingBits, ref pendingCount,
          DecoderFallback, ref fallbackBuffer, ref fallbackBytes,
          true);
      return result;
  }
  // Get the maximum number of bytes needed to encode a
  // specified number of characters.
  //
  // charCount: number of characters to be encoded; must not be negative.
  // Returns:   charCount * 4.
  // Throws:    ArgumentOutOfRangeException if charCount is negative, or
  //            if charCount * 4 cannot be represented as an int.
  public override int GetMaxByteCount (int charCount)
  {
      if (charCount < 0) {
          throw new ArgumentOutOfRangeException ("charCount", _("ArgRange_NonNegative"));
      }

      // Do the multiplication in 64 bits. Unless the assembly is compiled
      // with overflow checking, a 32-bit "charCount * 4" silently wraps
      // for charCount > int.MaxValue / 4 and would hand the caller a
      // negative or far-too-small buffer size.
      long maxBytes = (long) charCount * 4;
      if (maxBytes > int.MaxValue) {
          throw new ArgumentOutOfRangeException ("charCount",
              "Too many characters. The resulting number of bytes is larger than what can be returned as an int.");
      }
      return (int) maxBytes;
  }
  // Upper bound on the number of characters produced when decoding
  // byteCount bytes. Negative counts are rejected; otherwise the
  // bound returned is byteCount itself.
  public override int GetMaxCharCount (int byteCount)
  {
      if (byteCount >= 0) {
          return byteCount;
      }
      throw new ArgumentOutOfRangeException ("byteCount", _("ArgRange_NonNegative"));
  }
  // Create a new UTF8Decoder that uses this encoding's DecoderFallback.
  public override Decoder GetDecoder ()
  {
      Decoder decoder = new UTF8Decoder (DecoderFallback);
      return decoder;
  }
  // Create a new UTF8Encoder from this encoding's EncoderFallback
  // and its emitIdentifier setting.
  public override Encoder GetEncoder ()
  {
      Encoder encoder = new UTF8Encoder (EncoderFallback, emitIdentifier);
      return encoder;
  }
  // Return the byte sequence EF BB BF when emitIdentifier is set,
  // and the shared empty byte array otherwise.
  public override byte[] GetPreamble ()
  {
      if (!emitIdentifier) {
          return EmptyArray<byte>.Value;
      }
      // A fresh array is allocated on every call.
      return new byte [] { 0xEF, 0xBB, 0xBF };
  }
  // Two UTF8Encoding instances are equal when they agree on code page,
  // emitIdentifier, decoder fallback and encoder fallback. Anything
  // that is not a UTF8Encoding (including null) is never equal.
  public override bool Equals (Object value)
  {
      UTF8Encoding other = value as UTF8Encoding;
      if (other == null) {
          return false;
      }

      // Cheap field comparisons first, then the fallback objects.
      if (codePage != other.codePage) {
          return false;
      }
      if (emitIdentifier != other.emitIdentifier) {
          return false;
      }
      if (!DecoderFallback.Equals (other.DecoderFallback)) {
          return false;
      }
      return EncoderFallback.Equals (other.EncoderFallback);
  }
  // Hash code for this object; computed entirely by the base class.
  public override int GetHashCode ()
  {
      int hash = base.GetHashCode ();
      return hash;
  }
  // Byte count for a string. This is a plain pass-through to the base
  // class; the original author questioned whether the override is
  // needed at all.
  public override int GetByteCount (string chars)
  {
      int byteCount = base.GetByteCount (chars);
      return byteCount;
  }
  // Decode a byte range into a string. This is a plain pass-through to
  // the base class; the original author questioned whether the
  // override is needed at all.
  [ComVisible (false)]
  public override string GetString (byte [] bytes, int index, int count)
  {
      string decoded = base.GetString (bytes, index, count);
      return decoded;
  }
  // Decoder returned by GetDecoder. It keeps the left-over bits and
  // counters of the shared decoding routines in instance fields so
  // that GetChars can hand them back in on the next call.
  [Serializable]
  private class UTF8Decoder : Decoder
  {
      // Field names are kept as-is because the type is [Serializable].
      private uint leftOverBits;
      private uint leftOverCount;

      // Create a decoder with the given fallback and empty left-over state.
      public UTF8Decoder (DecoderFallback fallback)
      {
          Fallback = fallback;
          leftOverBits = leftOverCount = 0;
      }

      // Count the characters the given bytes would decode to. The
      // left-over fields are passed by value, so this call does not
      // modify them. The decoder itself is passed as the fallback
      // provider and the flush argument is false.
      public override int GetCharCount (byte[] bytes, int index, int count)
      {
          byte [] fallbackBytes = null;
          DecoderFallbackBuffer fallbackBuffer = null;
          int total = InternalGetCharCount (
              bytes, index, count,
              leftOverBits, leftOverCount,
              this, ref fallbackBuffer, ref fallbackBytes,
              false);
          return total;
      }

      // Decode the given bytes into chars. The left-over fields are
      // passed by reference so the shared routine can update them.
      // The decoder itself is passed as the fallback provider and the
      // flush argument is false.
      public override int GetChars (byte[] bytes, int byteIndex,
                                    int byteCount, char[] chars, int charIndex)
      {
          byte [] fallbackBytes = null;
          DecoderFallbackBuffer fallbackBuffer = null;
          int written = InternalGetChars (
              bytes, byteIndex, byteCount,
              chars, charIndex,
              ref leftOverBits, ref leftOverCount,
              this, ref fallbackBuffer, ref fallbackBytes,
              false);
          return written;
      }
  } // class UTF8Decoder
  // Encoder returned by GetEncoder. It keeps one left-over character
  // for the counting methods and a separate one for the converting
  // methods, and passes each by reference to the shared routines.
  [Serializable]
  private class UTF8Encoder : Encoder
  {
      // Field names are kept as-is because the type is [Serializable].
      private char leftOverForCount;
      private char leftOverForConv;

      // Create an encoder with the given fallback and no left-over
      // characters. The emitIdentifier argument is accepted but not
      // stored or used by this class.
      public UTF8Encoder (EncoderFallback fallback, bool emitIdentifier)
      {
          Fallback = fallback;
          leftOverForCount = leftOverForConv = '\0';
      }

      // Array-based byte count; uses the counting left-over character.
      public override int GetByteCount (char[] chars, int index,
                                        int count, bool flush)
      {
          int total = InternalGetByteCount (
              chars, index, count,
              Fallback, ref leftOverForCount, flush);
          return total;
      }

      // Array-based conversion; uses the converting left-over character.
      public override int GetBytes (char[] chars, int charIndex,
                                    int charCount, byte[] bytes, int byteIndex, bool flush)
      {
          EncoderFallbackBuffer fallbackBuffer = null;
          return InternalGetBytes (
              chars, charIndex, charCount,
              bytes, byteIndex,
              Fallback, ref fallbackBuffer, ref leftOverForConv, flush);
      }

      // Pointer-based byte count; uses the counting left-over character.
      public unsafe override int GetByteCount (char* chars, int count, bool flush)
      {
          int total = InternalGetByteCount (
              chars, count,
              Fallback, ref leftOverForCount, flush);
          return total;
      }

      // Pointer-based conversion; uses the converting left-over character.
      public unsafe override int GetBytes (char* chars, int charCount,
                                           byte* bytes, int byteCount, bool flush)
      {
          EncoderFallbackBuffer fallbackBuffer = null;
          return InternalGetBytes (
              chars, charCount,
              bytes, byteCount,
              Fallback, ref fallbackBuffer, ref leftOverForConv, flush);
      }
  } // class UTF8Encoder
  923. }; // class UTF8Encoding
  924. }; // namespace System.Text