UTF8Encoding.cs 34 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204
  1. /*
  2. * UTF8Encoding.cs - Implementation of the "System.Text.UTF8Encoding" class.
  3. *
  4. * Copyright (c) 2001, 2002 Southern Storm Software, Pty Ltd
  5. * Copyright (C) 2004 Novell, Inc (http://www.novell.com)
  6. *
  7. * Permission is hereby granted, free of charge, to any person obtaining
  8. * a copy of this software and associated documentation files (the "Software"),
  9. * to deal in the Software without restriction, including without limitation
  10. * the rights to use, copy, modify, merge, publish, distribute, sublicense,
  11. * and/or sell copies of the Software, and to permit persons to whom the
  12. * Software is furnished to do so, subject to the following conditions:
  13. *
  14. * The above copyright notice and this permission notice shall be included
  15. * in all copies or substantial portions of the Software.
  16. *
  17. * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
  18. * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  19. * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
  20. * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
  21. * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
  22. * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
  23. * OTHER DEALINGS IN THE SOFTWARE.
  24. */
  25. namespace System.Text
  26. {
  27. using System;
  28. using System.Runtime.InteropServices;
  29. [Serializable]
  30. [MonoTODO ("Serialization format not compatible with .NET")]
  31. #if NET_2_0
  32. [MonoTODO ("EncoderFallback is not handled")]
  33. [ComVisible (true)]
  34. #endif
  35. public class UTF8Encoding : Encoding
  36. {
  37. // Magic number used by Windows for UTF-8.
  38. internal const int UTF8_CODE_PAGE = 65001;
  39. // Internal state.
  40. private bool emitIdentifier;
  41. #if !NET_2_0
  42. private bool throwOnInvalid;
  43. #endif
  // Constructors.

  // Default: both flags are false.
  public UTF8Encoding ()
      : this (false, false)
  {
  }

  // Only the identifier flag is supplied; throwOnInvalidBytes defaults to false.
  public UTF8Encoding (bool encoderShouldEmitUTF8Identifier)
      : this (encoderShouldEmitUTF8Identifier, false)
  {
  }

  // Full constructor. Records the identifier flag, selects how invalid
  // input bytes are treated, and fills in the descriptive fields
  // declared by the base class.
  public UTF8Encoding (bool encoderShouldEmitUTF8Identifier, bool throwOnInvalidBytes)
      : base (UTF8_CODE_PAGE)
  {
      emitIdentifier = encoderShouldEmitUTF8Identifier;

  #if NET_2_0
      // Invalid bytes either raise an exception or are replaced by
      // the empty string, depending on the flag.
      DecoderFallback decoderFallback;
      if (throwOnInvalidBytes) {
          decoderFallback = new DecoderExceptionFallback ();
      } else {
          decoderFallback = new DecoderReplacementFallback (String.Empty);
      }
      SetFallbackInternal (null, decoderFallback);
  #else
      throwOnInvalid = throwOnInvalidBytes;
  #endif

      // Names reported for this encoding.
      web_name = body_name = header_name = "utf-8";
      encoding_name = "Unicode (UTF-8)";

      // Capability flags.
      is_browser_save = true;
      is_browser_display = true;
      is_mail_news_display = true;

      windows_code_page = UnicodeEncoding.UNICODE_CODE_PAGE;
  }
  67. #region GetByteCount()
  // Array front end of "GetByteCount" that supports a rolling state
  // across calls: "leftOver" carries a pending surrogate start in and
  // out. Validates the arguments, then delegates to the pointer-based
  // overload.
  private static int InternalGetByteCount (char[] chars, int index, int count, ref char leftOver, bool flush)
  {
      if (chars == null)
          throw new ArgumentNullException ("chars");
      if (index < 0 || index > chars.Length)
          throw new ArgumentOutOfRangeException ("index", _("ArgRange_Array"));
      if (count < 0 || count > (chars.Length - index))
          throw new ArgumentOutOfRangeException ("count", _("ArgRange_Array"));

      if (index != chars.Length) {
          unsafe {
              fixed (char* start = chars) {
                  return InternalGetByteCount (start + index, count, ref leftOver, flush);
              }
          }
      }

      // The range starts at the very end of the array, so there is
      // nothing to scan. Only a pending surrogate start can still
      // contribute, and only when flushing.
      if (!flush || leftOver == '\0')
          return 0;

      leftOver = '\0';
      return 3;
  }
  // Pointer-based core of "GetByteCount".
  //
  //   chars    - first UTF-16 code unit to examine.
  //   count    - number of code units to examine.
  //   leftOver - on entry, a surrogate start left pending by a previous
  //              call ('\0' if none); on exit, the surrogate start still
  //              pending, or '\0' when "flush" is true.
  //   flush    - when true, a pending surrogate start is counted as a
  //              3-byte sequence instead of being carried over.
  //
  // Returns the number of bytes the examined code units encode to.
  private unsafe static int InternalGetByteCount (char* chars, int count, ref char leftOver, bool flush)
  {
      int index = 0;
      int length = 0;
      char pair = leftOver;

      while (count > 0) {
          char ch = chars [index];
          if (pair == 0) {
              if (ch < '\u0080') {
                  // Fast path: consume the whole run of ASCII
                  // characters, one byte each. The inner loop
                  // advances index/count itself, hence "continue".
                  int end = index + count;
                  for (; index < end; index++, count--) {
                      if (chars [index] >= '\x80')
                          break;
                      ++length;
                  }
                  continue;
              } else if (ch < '\u0800') {
                  length += 2;
              } else if (ch >= '\uD800' && ch <= '\uDBFF') {
                  // Start of a surrogate pair: nothing is counted
                  // until the next character is known.
                  pair = ch;
              } else {
                  // Everything else, including a surrogate tail
                  // with no preceding start, takes three bytes.
                  length += 3;
              }
          } else if (ch >= '\uDC00' && ch <= '\uDFFF') {
              // Complete surrogate pair. ("pair" is known to be
              // non-zero here, so no further check is needed.)
              length += 4;
              pair = '\0';
          } else {
              // A surrogate start followed by something that is not
              // a surrogate tail. Count the start on its own and
              // re-visit the current character without advancing.
              length += 3;
              pair = '\0';
              continue;
          }
          ++index;
          --count;
      }

      if (flush) {
          // Count the left-over surrogate pair start, if any.
          if (pair != '\0')
              length += 3;
          leftOver = '\0';
      } else {
          leftOver = pair;
      }

      return length;
  }
  // Get the number of bytes needed to encode a character buffer.
  // Runs the stateful counter with no pending surrogate and with
  // flushing enabled.
  public override int GetByteCount (char[] chars, int index, int count)
  {
      char pendingSurrogate = '\0';
      int byteCount = InternalGetByteCount (chars, index, count, ref pendingSurrogate, true);
      return byteCount;
  }
  168. #if !NET_2_0
  // Convenience wrapper for "GetByteCount": counts the bytes needed
  // to encode every character of a string.
  public override int GetByteCount (String s)
  {
      if (s == null)
          throw new ArgumentNullException ("s");

      char pendingSurrogate = '\0';
      unsafe {
          fixed (char* text = s) {
              return InternalGetByteCount (text, s.Length, ref pendingSurrogate, true);
          }
      }
  }
  183. #endif
  184. #if NET_2_0
  // Pointer overload of "GetByteCount". A null pointer is rejected;
  // an empty range yields zero without touching the pointer.
  [CLSCompliant (false)]
  [ComVisible (false)]
  public unsafe override int GetByteCount (char* chars, int count)
  {
      if (chars == null)
          throw new ArgumentNullException ("chars");

      char pendingSurrogate = '\0';
      return count == 0
          ? 0
          : InternalGetByteCount (chars, count, ref pendingSurrogate, true);
  }
  196. #endif
  197. #endregion
  198. #region GetBytes()
  // Array front end of "GetBytes" that supports a rolling state across
  // calls.
  //
  //   chars/charIndex/charCount - range of UTF-16 code units to encode.
  //   bytes/byteIndex           - destination array and first write position.
  //   leftOver                  - pending surrogate start carried between calls.
  //   flush                     - whether a pending surrogate start must be
  //                               dealt with now instead of carried over.
  //
  // Returns the number of bytes written. Throws ArgumentNullException /
  // ArgumentOutOfRangeException for invalid arguments and
  // ArgumentException when "bytes" is too small.
  private static int InternalGetBytes (char[] chars, int charIndex,
                                       int charCount, byte[] bytes,
                                       int byteIndex, ref char leftOver,
                                       bool flush)
  {
      // Validate the parameters.
      if (chars == null) {
          throw new ArgumentNullException ("chars");
      }
      if (bytes == null) {
          throw new ArgumentNullException ("bytes");
      }
      if (charIndex < 0 || charIndex > chars.Length) {
          throw new ArgumentOutOfRangeException ("charIndex", _("ArgRange_Array"));
      }
      if (charCount < 0 || charCount > (chars.Length - charIndex)) {
          throw new ArgumentOutOfRangeException ("charCount", _("ArgRange_Array"));
      }
      if (byteIndex < 0 || byteIndex > bytes.Length) {
          throw new ArgumentOutOfRangeException ("byteIndex", _("ArgRange_Array"));
      }

      if (charIndex == chars.Length) {
          // No input characters: only a pending surrogate start can
          // still produce output, and only when flushing.
          if (flush && leftOver != '\0') {
  #if NET_2_0
              // FIXME: use EncoderFallback.
              //
              // By default it is empty, so I do nothing for now.
              leftOver = '\0';
  #else
              // Flush the left-over surrogate pair start. Exactly
              // three bytes are written, so the write fits whenever
              // byteIndex <= bytes.Length - 3.
              if (byteIndex > bytes.Length - 3)
                  throw new ArgumentException (_("Arg_InsufficientSpace"), "bytes");
              // NOTE(review): these are fixed bytes, whereas the
              // pointer-based overload writes the encoding of the
              // pending character itself - confirm which is intended.
              bytes [byteIndex++] = 0xEF;
              bytes [byteIndex++] = 0xBB;
              bytes [byteIndex++] = 0xBF;
              leftOver = '\0';
              return 3;
  #endif
          }
          return 0;
      }

      unsafe {
          fixed (char* cptr = chars) {
              // With no room left in "bytes" a null destination of
              // length zero is handed to the pointer-based overload.
              if (bytes.Length == byteIndex)
                  return InternalGetBytes (
                      cptr + charIndex, charCount,
                      null, 0, ref leftOver, flush);
              fixed (byte *bptr = bytes) {
                  return InternalGetBytes (
                      cptr + charIndex, charCount,
                      bptr + byteIndex, bytes.Length - byteIndex,
                      ref leftOver, flush);
              }
          }
      }
  }
  // Pointer-based core of "GetBytes".
  //
  //   chars/charCount - UTF-16 code units to encode.
  //   bytes/byteCount - destination buffer and its capacity in bytes;
  //                     callers pass (null, 0) when there is no room.
  //   leftOver        - on entry, a surrogate start left pending by a
  //                     previous call ('\0' if none); on exit, the
  //                     surrogate start still pending, or '\0' when
  //                     "flush" is true.
  //   flush           - when true, a pending surrogate start is written
  //                     out as a 3-byte sequence instead of carried over.
  //
  // Returns the number of bytes written. Throws ArgumentException when
  // the destination buffer is too small; no write is ever attempted
  // beyond "byteCount" bytes.
  private unsafe static int InternalGetBytes (char* chars, int charCount,
                                              byte* bytes, int byteCount,
                                              ref char leftOver, bool flush)
  {
      int charIndex = 0;
      // Capacity of the destination buffer.
      int length = byteCount;
      char pair = leftOver;
      // Next write position; since writing starts at offset zero this
      // is also the number of bytes written so far.
      int posn = 0;
      int code = 0;

      // Convert the characters into bytes.
      while (charCount > 0) {
          // Fetch the next UTF-16 character pair value.
          char ch = chars [charIndex];
          if (pair == '\0') {
              if (ch < '\uD800' || ch >= '\uE000') {
                  if (ch < '\x80') {
                      // Fast path: copy the whole run of ASCII
                      // characters. Every write is bounds-checked
                      // so the run cannot overrun the buffer (or
                      // dereference a null destination).
                      int end = charIndex + charCount;
                      for (; charIndex < end; posn++, charIndex++, charCount--) {
                          if (chars [charIndex] >= '\x80')
                              break;
                          if (posn >= length)
                              throw new ArgumentException (_("Arg_InsufficientSpace"), "bytes");
                          bytes [posn] = (byte) chars [charIndex];
                      }
                      continue;
                  }
                  code = ch;
              }
              else if (ch < '\uDC00') {
                  // Surrogate start: remember it and look at the
                  // next character before writing anything.
                  pair = ch;
                  ++charIndex;
                  --charCount;
                  continue;
              } else { // ch <= '\uDFFF'
                  // We have a surrogate tail without leading
                  // surrogate. In NET_2_0 it uses fallback.
                  // In NET_1_1 we output wrong surrogate.
                  if (posn > length - 3) {
                      throw new ArgumentException (_("Arg_InsufficientSpace"), "bytes");
                  }
                  bytes [posn++] = (byte) (0xE0 | (ch >> 12));
                  bytes [posn++] = (byte) (0x80 | ((ch >> 6) & 0x3F));
                  bytes [posn++] = (byte) (0x80 | (ch & 0x3F));
                  ++charIndex;
                  --charCount;
                  continue;
              }
          } else {
              if ('\uDC00' <= ch && ch <= '\uDFFF') {
                  // Combine the pending start and this tail into a
                  // single code point above 0xFFFF.
                  code = 0x10000 + (int) ch - 0xDC00 +
                      (((int) pair - 0xD800) << 10);
              } else {
                  // We have a surrogate start followed by a
                  // regular character. Technically, this is
                  // invalid, but we have to do something.
                  // We write out the surrogate start and then
                  // re-visit the current character again.
                  if (posn > length - 3) {
                      throw new ArgumentException (_("Arg_InsufficientSpace"), "bytes");
                  }
                  bytes [posn++] = (byte) (0xE0 | (pair >> 12));
                  bytes [posn++] = (byte) (0x80 | ((pair >> 6) & 0x3F));
                  bytes [posn++] = (byte) (0x80 | (pair & 0x3F));
                  pair = '\0';
                  continue;
              }
              pair = '\0';
          }
          ++charIndex;
          --charCount;

          // Encode the character pair value.
          if (code < 0x0080) {
              if (posn >= length)
                  throw new ArgumentException (_("Arg_InsufficientSpace"), "bytes");
              bytes [posn++] = (byte)code;
          } else if (code < 0x0800) {
              if ((posn + 2) > length)
                  throw new ArgumentException (_("Arg_InsufficientSpace"), "bytes");
              bytes [posn++] = (byte) (0xC0 | (code >> 6));
              bytes [posn++] = (byte) (0x80 | (code & 0x3F));
          } else if (code < 0x10000) {
              if (posn > length - 3)
                  throw new ArgumentException (_("Arg_InsufficientSpace"), "bytes");
              bytes [posn++] = (byte) (0xE0 | (code >> 12));
              bytes [posn++] = (byte) (0x80 | ((code >> 6) & 0x3F));
              bytes [posn++] = (byte) (0x80 | (code & 0x3F));
          } else {
              if (posn > length - 4)
                  throw new ArgumentException (_("Arg_InsufficientSpace"), "bytes");
              bytes [posn++] = (byte) (0xF0 | (code >> 18));
              bytes [posn++] = (byte) (0x80 | ((code >> 12) & 0x3F));
              bytes [posn++] = (byte) (0x80 | ((code >> 6) & 0x3F));
              bytes [posn++] = (byte) (0x80 | (code & 0x3F));
          }
      }

      if (flush) {
          if (pair != '\0') {
              // Flush the left-over incomplete surrogate.
              if (posn > length - 3) {
                  throw new ArgumentException (_("Arg_InsufficientSpace"), "bytes");
              }
              bytes [posn++] = (byte) (0xE0 | (pair >> 12));
              bytes [posn++] = (byte) (0x80 | ((pair >> 6) & 0x3F));
              bytes [posn++] = (byte) (0x80 | (pair & 0x3F));
          }
          leftOver = '\0';
      } else {
          leftOver = pair;
      }

      // Return the final count to the caller.
      return posn;
  }
  // Placeholder overload taking a byte buffer and a lead/tail character
  // pair. It has no implementation: every call throws
  // NotImplementedException.
  private unsafe int Fallback (byte* bytes, int byteCount, char lead, char tail)
  {
      throw new NotImplementedException ();
  }
  // Get the bytes that result from encoding a character buffer.
  // Runs the stateful encoder with no pending surrogate and with
  // flushing enabled.
  public override int GetBytes (char[] chars, int charIndex, int charCount,
                                byte[] bytes, int byteIndex)
  {
      char pendingSurrogate = '\0';
      int written = InternalGetBytes (chars, charIndex, charCount, bytes, byteIndex, ref pendingSurrogate, true);
      return written;
  }
  // Convenience wrapper for "GetBytes": encodes a range of a string
  // into "bytes" starting at "byteIndex" and returns the number of
  // bytes written.
  public override int GetBytes (String s, int charIndex, int charCount,
                                byte[] bytes, int byteIndex)
  {
      // Argument checks, in the order callers observe them.
      if (s == null)
          throw new ArgumentNullException ("s");
      if (bytes == null)
          throw new ArgumentNullException ("bytes");
      if (charIndex < 0 || charIndex > s.Length)
          throw new ArgumentOutOfRangeException ("charIndex", _("ArgRange_StringIndex"));
      if (charCount < 0 || charCount > (s.Length - charIndex))
          throw new ArgumentOutOfRangeException ("charCount", _("ArgRange_StringRange"));
      if (byteIndex < 0 || byteIndex > bytes.Length)
          throw new ArgumentOutOfRangeException ("byteIndex", _("ArgRange_Array"));

      // A range that begins at the end of the string is empty.
      if (charIndex == s.Length)
          return 0;

      char pendingSurrogate = '\0';
      int capacity = bytes.Length - byteIndex;
      unsafe {
          fixed (char* text = s) {
              // No room at all: hand over a null destination of
              // length zero.
              if (capacity == 0) {
                  return InternalGetBytes (
                      text + charIndex, charCount,
                      null, 0, ref pendingSurrogate, true);
              }
              fixed (byte* output = bytes) {
                  return InternalGetBytes (
                      text + charIndex, charCount,
                      output + byteIndex, capacity,
                      ref pendingSurrogate, true);
              }
          }
      }
  }
  422. #if NET_2_0
  // Pointer overload of "GetBytes": encodes "charCount" UTF-16 code
  // units from "chars" into at most "byteCount" bytes at "bytes" and
  // returns the number of bytes written.
  //
  // Throws ArgumentNullException for a null pointer and
  // IndexOutOfRangeException for a negative count; the exception
  // message names the offending argument.
  // NOTE(review): the negative-count exception type is kept as found;
  // confirm whether ArgumentOutOfRangeException is wanted instead.
  [CLSCompliant (false)]
  [ComVisible (false)]
  public unsafe override int GetBytes (char* chars, int charCount, byte* bytes, int byteCount)
  {
      if (chars == null)
          throw new ArgumentNullException ("chars");
      if (charCount < 0)
          throw new IndexOutOfRangeException ("charCount");
      if (bytes == null)
          throw new ArgumentNullException ("bytes");
      if (byteCount < 0)
          throw new IndexOutOfRangeException ("byteCount");

      if (charCount == 0)
          return 0;

      char dummy = '\0';
      // An empty destination is passed on as (null, 0).
      if (byteCount == 0)
          return InternalGetBytes (chars, charCount, null, 0, ref dummy, true);
      else
          return InternalGetBytes (chars, charCount, bytes, byteCount, ref dummy, true);
  }
  443. #endif
  444. #endregion
  // Array front end of "GetCharCount" that supports a rolling state
  // across calls. Validates the arguments, short-circuits an empty
  // range, and otherwise delegates to the pointer-based overload.
  #if NET_2_0
  private unsafe static int InternalGetCharCount (
      byte[] bytes, int index, int count, uint leftOverBits,
      uint leftOverCount, object provider,
      ref DecoderFallbackBuffer fallbackBuffer, ref byte [] bufferArg, bool flush)
  #else
  private unsafe static int InternalGetCharCount (
      byte[] bytes, int index, int count, uint leftOverBits,
      uint leftOverCount, bool throwOnInvalid, bool flush)
  #endif
  {
      if (bytes == null)
          throw new ArgumentNullException ("bytes");
      if (index < 0 || index > bytes.Length)
          throw new ArgumentOutOfRangeException ("index", _("ArgRange_Array"));
      if (count < 0 || count > (bytes.Length - index))
          throw new ArgumentOutOfRangeException ("count", _("ArgRange_Array"));

      // An empty range contributes no characters.
      if (count == 0)
          return 0;

      fixed (byte* start = bytes) {
          byte* first = start + index;
  #if NET_2_0
          return InternalGetCharCount (first, count,
              leftOverBits, leftOverCount, provider, ref fallbackBuffer, ref bufferArg, flush);
  #else
          return InternalGetCharCount (first, count,
              leftOverBits, leftOverCount, throwOnInvalid, flush);
  #endif
      }
  }
  // Pointer-based core of "GetCharCount".
  //
  //   bytes/count   - UTF-8 bytes to examine.
  //   leftOverBits  - payload bits of a sequence left incomplete by a
  //                   previous call.
  //   leftOverCount - packed state of that sequence: the low nibble is
  //                   the number of bytes seen so far, the next nibble
  //                   the total sequence size (0 when nothing is pending).
  //   flush         - when true, a sequence still incomplete at the end
  //                   of the input is treated as invalid.
  //
  // Invalid input is reported through the decoder fallback (NET_2_0) or
  // by throwing ArgumentException when "throwOnInvalid" is set.
  // Returns the number of UTF-16 code units the input decodes to.
  #if NET_2_0
  private unsafe static int InternalGetCharCount (
      byte* bytes, int count, uint leftOverBits,
      uint leftOverCount, object provider,
      ref DecoderFallbackBuffer fallbackBuffer, ref byte [] bufferArg, bool flush)
  #else
  private unsafe static int InternalGetCharCount (
      byte* bytes, int count, uint leftOverBits,
      uint leftOverCount, bool throwOnInvalid, bool flush)
  #endif
  {
      int index = 0;
      int length = 0;

      // Fast path: with no sequence pending, a leading run of ASCII
      // bytes maps one-to-one onto characters.
      if (leftOverCount == 0) {
          int end = index + count;
          for (; index < end; index++, count--) {
              if (bytes [index] < 0x80)
                  length++;
              else
                  break;
          }
      }

      // Determine the number of characters that we have.
      uint ch;
      uint leftBits = leftOverBits;
      uint leftSoFar = (leftOverCount & (uint)0x0F);
      uint leftSize = ((leftOverCount >> 4) & (uint)0x0F);
      while (count > 0) {
          // "index" is advanced immediately, so inside the loop the
          // byte being processed is bytes [index - 1].
          ch = (uint)(bytes[index++]);
          --count;
          if (leftSize == 0) {
              // Process a UTF-8 start character.
              if (ch < (uint)0x0080) {
                  // Single-byte UTF-8 character.
                  ++length;
              } else if ((ch & (uint)0xE0) == (uint)0xC0) {
                  // Double-byte UTF-8 character.
                  leftBits = (ch & (uint)0x1F);
                  leftSoFar = 1;
                  leftSize = 2;
              } else if ((ch & (uint)0xF0) == (uint)0xE0) {
                  // Three-byte UTF-8 character.
                  leftBits = (ch & (uint)0x0F);
                  leftSoFar = 1;
                  leftSize = 3;
              } else if ((ch & (uint)0xF8) == (uint)0xF0) {
                  // Four-byte UTF-8 character.
                  leftBits = (ch & (uint)0x07);
                  leftSoFar = 1;
                  leftSize = 4;
              } else if ((ch & (uint)0xFC) == (uint)0xF8) {
                  // Five-byte UTF-8 character.
                  leftBits = (ch & (uint)0x03);
                  leftSoFar = 1;
                  leftSize = 5;
              } else if ((ch & (uint)0xFE) == (uint)0xFC) {
                  // Six-byte UTF-8 character.
                  leftBits = (ch & (uint)0x03);
                  leftSoFar = 1;
                  leftSize = 6;
              } else {
                  // Invalid UTF-8 start character.
  #if NET_2_0
                  length += Fallback (provider, ref fallbackBuffer, ref bufferArg, bytes, index - 1);
  #else
                  if (throwOnInvalid)
                      throw new ArgumentException (_("Arg_InvalidUTF8"), "bytes");
  #endif
              }
          } else {
              // Process an extra byte in a multi-byte sequence.
              if ((ch & (uint)0xC0) == (uint)0x80) {
                  leftBits = ((leftBits << 6) | (ch & (uint)0x3F));
                  if (++leftSoFar >= leftSize) {
                      // We have a complete character now.
                      if (leftBits < (uint)0x10000) {
                          // is it an overlong ?
                          bool overlong = false;
                          switch (leftSize) {
                          case 2:
                              overlong = (leftBits <= 0x7F);
                              break;
                          case 3:
                              overlong = (leftBits <= 0x07FF);
                              break;
                          case 4:
                              overlong = (leftBits <= 0xFFFF);
                              break;
                          case 5:
                              overlong = (leftBits <= 0x1FFFFF);
                              break;
                          case 6:
                              overlong = (leftBits <= 0x03FFFFFF);
                              break;
                          }
                          if (overlong) {
  #if NET_2_0
                              length += Fallback (provider, ref fallbackBuffer, ref bufferArg, bytes, index - 1);
  #else
                              if (throwOnInvalid)
                                  throw new ArgumentException (_("Overlong"), leftBits.ToString ());
  #endif
                          }
                          else if ((leftBits & 0xF800) == 0xD800) {
                              // UTF-8 doesn't use surrogate characters.
                              // Treated as invalid so that the count
                              // agrees with what decoding produces.
  #if NET_2_0
                              length += Fallback (provider, ref fallbackBuffer, ref bufferArg, bytes, index - 1);
  #else
                              if (throwOnInvalid)
                                  throw new ArgumentException (_("Arg_InvalidUTF8"), "bytes");
  #endif
                          }
                          else
                              ++length;
                      } else if (leftBits < (uint)0x110000) {
                          // Code points above 0xFFFF need a
                          // surrogate pair, i.e. two code units.
                          length += 2;
                      } else {
  #if NET_2_0
                          length += Fallback (provider, ref fallbackBuffer, ref bufferArg, bytes, index - 1);
  #else
                          if (throwOnInvalid)
                              throw new ArgumentException (_("Arg_InvalidUTF8"), "bytes");
  #endif
                      }
                      leftSize = 0;
                  }
              } else {
                  // Invalid UTF-8 sequence: clear and restart.
  #if NET_2_0
                  length += Fallback (provider, ref fallbackBuffer, ref bufferArg, bytes, index - 1);
  #else
                  if (throwOnInvalid)
                      throw new ArgumentException (_("Arg_InvalidUTF8"), "bytes");
  #endif
                  leftSize = 0;
                  // Step back so the current byte is examined again
                  // as a potential start character.
                  --index;
                  ++count;
              }
          }
      }
      if (flush && leftSize != 0) {
          // We had left-over bytes that didn't make up
          // a complete UTF-8 character sequence.
  #if NET_2_0
          // "index" is now one past the last byte of the input, so
          // the last byte actually read is at index - 1. When this
          // call consumed no bytes at all the index is left at zero.
          int lastIndex = index > 0 ? index - 1 : 0;
          length += Fallback (provider, ref fallbackBuffer, ref bufferArg, bytes, lastIndex);
  #else
          if (throwOnInvalid)
              throw new ArgumentException (_("Arg_InvalidUTF8"), "bytes");
  #endif
      }

      // Return the final length to the caller.
      return length;
  }
  623. #if NET_2_0
  // for GetCharCount()
  //
  // Reports the single invalid byte bytes [index] to the decoder
  // fallback and returns how many replacement characters it produces.
  //
  //   provider  - either a DecoderFallback or a Decoder; used to obtain
  //               the fallback buffer on first use.
  //   buffer    - fallback buffer, created lazily and cached for the caller.
  //   bufferArg - one-element scratch array, created lazily and cached.
  static unsafe int Fallback (object provider, ref DecoderFallbackBuffer buffer, ref byte [] bufferArg, byte* bytes, int index)
  {
      if (buffer == null) {
          DecoderFallback fb = provider as DecoderFallback;
          if (fb != null)
              buffer = fb.CreateFallbackBuffer ();
          else
              buffer = ((Decoder) provider).FallbackBuffer;
      }
      if (bufferArg == null)
          bufferArg = new byte [1];
      bufferArg [0] = bytes [index];
      buffer.Fallback (bufferArg, 0);

      // Only the number of replacement characters is needed here; the
      // characters themselves are never read. Reset the buffer so that
      // the next invalid byte is not rejected as a nested fallback
      // while replacement characters are still pending.
      int replacementCount = buffer.Remaining;
      buffer.Reset ();
      return replacementCount;
  }
  // for GetChars()
  //
  // Reports the single invalid byte bytes [byteIndex] to the decoder
  // fallback and appends every replacement character it yields to
  // "chars", advancing "charIndex" past them. The fallback buffer and
  // the one-element scratch array are created on first use and handed
  // back through their ref parameters.
  static unsafe void Fallback (object provider, ref DecoderFallbackBuffer buffer, ref byte [] bufferArg, byte* bytes, int byteIndex,
                               char* chars, ref int charIndex)
  {
      if (buffer == null) {
          DecoderFallback fallback = provider as DecoderFallback;
          buffer = fallback != null
              ? fallback.CreateFallbackBuffer ()
              : ((Decoder) provider).FallbackBuffer;
      }

      if (bufferArg == null) {
          bufferArg = new byte [1];
      }
      bufferArg [0] = bytes [byteIndex];

      buffer.Fallback (bufferArg, 0);
      for (; buffer.Remaining > 0; charIndex++) {
          chars [charIndex] = buffer.GetNextChar ();
      }
  }
  658. #endif
  // Get the number of characters needed to decode a byte buffer.
  // Runs the stateful counter with no pending sequence and with
  // flushing enabled.
  public override int GetCharCount (byte[] bytes, int index, int count)
  {
  #if NET_2_0
      // Both helpers are created on demand by the counter.
      byte [] scratch = null;
      DecoderFallbackBuffer fallbackBuffer = null;
      int charCount = InternalGetCharCount (bytes, index, count, 0, 0, DecoderFallback, ref fallbackBuffer, ref scratch, true);
      return charCount;
  #else
      int charCount = InternalGetCharCount (bytes, index, count, 0, 0, throwOnInvalid, true);
      return charCount;
  #endif
  }
  670. #if NET_2_0
  // Pointer overload of "GetCharCount": returns the number of UTF-16
  // code units that "count" bytes at "bytes" decode to.
  //
  // Throws ArgumentNullException when "bytes" is null, mirroring the
  // pointer overload of "GetByteCount"; without the check a null
  // pointer would be dereferenced as soon as "count" is positive.
  [CLSCompliant (false)]
  [ComVisible (false)]
  public unsafe override int GetCharCount (byte* bytes, int count)
  {
      if (bytes == null)
          throw new ArgumentNullException ("bytes");
      if (count == 0)
          return 0;

      DecoderFallbackBuffer buf = null;
      byte [] bufferArg = null;
      return InternalGetCharCount (bytes, count, 0, 0, DecoderFallback, ref buf, ref bufferArg, true);
  }
  679. #endif
  // Array front end of "GetChars" that supports a rolling state across
  // calls ("leftOverBits" / "leftOverCount" carry an incomplete
  // sequence in and out). Validates the arguments and delegates to the
  // pointer-based overload, which receives the remaining room in
  // "chars" as its capacity.
  #if NET_2_0
  private unsafe static int InternalGetChars (
      byte[] bytes, int byteIndex, int byteCount, char[] chars,
      int charIndex, ref uint leftOverBits, ref uint leftOverCount,
      object provider,
      ref DecoderFallbackBuffer fallbackBuffer, ref byte [] bufferArg, bool flush)
  #else
  private unsafe static int InternalGetChars (
      byte[] bytes, int byteIndex, int byteCount, char[] chars,
      int charIndex, ref uint leftOverBits, ref uint leftOverCount,
      bool throwOnInvalid, bool flush)
  #endif
  {
      if (bytes == null)
          throw new ArgumentNullException ("bytes");
      if (chars == null)
          throw new ArgumentNullException ("chars");
      if (byteIndex < 0 || byteIndex > bytes.Length)
          throw new ArgumentOutOfRangeException ("byteIndex", _("ArgRange_Array"));
      if (byteCount < 0 || byteCount > (bytes.Length - byteIndex))
          throw new ArgumentOutOfRangeException ("byteCount", _("ArgRange_Array"));
      if (charIndex < 0 || charIndex > chars.Length)
          throw new ArgumentOutOfRangeException ("charIndex", _("ArgRange_Array"));

      // No room in the destination: report zero characters.
      // NOTE(review): this returns before looking at the input, so
      // pending bytes are not reported as "insufficient space" -
      // confirm that this is intended.
      if (charIndex == chars.Length)
          return 0;

      fixed (char* output = chars) {
          char* dest = output + charIndex;
          int room = chars.Length - charIndex;
          // With no input bytes a null source of length zero is used.
          bool noInput = byteCount == 0 || byteIndex == bytes.Length;
  #if NET_2_0
          if (noInput)
              return InternalGetChars (null, 0, dest, room, ref leftOverBits, ref leftOverCount, provider, ref fallbackBuffer, ref bufferArg, flush);
          fixed (byte* input = bytes)
              return InternalGetChars (input + byteIndex, byteCount, dest, room, ref leftOverBits, ref leftOverCount, provider, ref fallbackBuffer, ref bufferArg, flush);
  #else
          if (noInput)
              return InternalGetChars (null, 0, dest, room, ref leftOverBits, ref leftOverCount, throwOnInvalid, flush);
          fixed (byte* input = bytes)
              return InternalGetChars (input + byteIndex, byteCount, dest, room, ref leftOverBits, ref leftOverCount, throwOnInvalid, flush);
  #endif
      }
  }
  728. #if NET_2_0
  729. private unsafe static int InternalGetChars (
  730. byte* bytes, int byteCount, char* chars, int charCount,
  731. ref uint leftOverBits, ref uint leftOverCount,
  732. object provider,
  733. ref DecoderFallbackBuffer fallbackBuffer, ref byte [] bufferArg, bool flush)
  734. #else
  735. private unsafe static int InternalGetChars (
  736. byte* bytes, int byteCount, char* chars, int charCount,
  737. ref uint leftOverBits, ref uint leftOverCount,
  738. bool throwOnInvalid, bool flush)
  739. #endif
  740. {
  741. int charIndex = 0, byteIndex = 0;
  742. int length = charCount;
  743. int posn = charIndex;
  744. if (leftOverCount == 0) {
  745. int end = byteIndex + byteCount;
  746. for (; byteIndex < end; posn++, byteIndex++, byteCount--) {
  747. if (bytes [byteIndex] < 0x80)
  748. chars [posn] = (char) bytes [byteIndex];
  749. else
  750. break;
  751. }
  752. }
  753. // Convert the bytes into the output buffer.
  754. uint ch;
  755. uint leftBits = leftOverBits;
  756. uint leftSoFar = (leftOverCount & (uint)0x0F);
  757. uint leftSize = ((leftOverCount >> 4) & (uint)0x0F);
  758. int byteEnd = byteIndex + byteCount;
  759. for(; byteIndex < byteEnd; byteIndex++) {
  760. // Fetch the next character from the byte buffer.
  761. ch = (uint)(bytes[byteIndex]);
  762. if (leftSize == 0) {
  763. // Process a UTF-8 start character.
  764. if (ch < (uint)0x0080) {
  765. // Single-byte UTF-8 character.
  766. if (posn >= length) {
  767. throw new ArgumentException (_("Arg_InsufficientSpace"), "chars");
  768. }
  769. chars[posn++] = (char)ch;
  770. } else if ((ch & (uint)0xE0) == (uint)0xC0) {
  771. // Double-byte UTF-8 character.
  772. leftBits = (ch & (uint)0x1F);
  773. leftSoFar = 1;
  774. leftSize = 2;
  775. } else if ((ch & (uint)0xF0) == (uint)0xE0) {
  776. // Three-byte UTF-8 character.
  777. leftBits = (ch & (uint)0x0F);
  778. leftSoFar = 1;
  779. leftSize = 3;
  780. } else if ((ch & (uint)0xF8) == (uint)0xF0) {
  781. // Four-byte UTF-8 character.
  782. leftBits = (ch & (uint)0x07);
  783. leftSoFar = 1;
  784. leftSize = 4;
  785. } else if ((ch & (uint)0xFC) == (uint)0xF8) {
  786. // Five-byte UTF-8 character.
  787. leftBits = (ch & (uint)0x03);
  788. leftSoFar = 1;
  789. leftSize = 5;
  790. } else if ((ch & (uint)0xFE) == (uint)0xFC) {
  791. // Six-byte UTF-8 character.
  792. leftBits = (ch & (uint)0x03);
  793. leftSoFar = 1;
  794. leftSize = 6;
  795. } else {
  796. // Invalid UTF-8 start character.
  797. #if NET_2_0
  798. Fallback (provider, ref fallbackBuffer, ref bufferArg, bytes, byteIndex, chars, ref posn);
  799. #else
  800. if (throwOnInvalid)
  801. throw new ArgumentException (_("Arg_InvalidUTF8"), "bytes");
  802. #endif
  803. }
  804. } else {
  805. // Process an extra byte in a multi-byte sequence.
  806. if ((ch & (uint)0xC0) == (uint)0x80) {
  807. leftBits = ((leftBits << 6) | (ch & (uint)0x3F));
  808. if (++leftSoFar >= leftSize) {
  809. // We have a complete character now.
  810. if (leftBits < (uint)0x10000) {
  811. // is it an overlong ?
  812. bool overlong = false;
  813. switch (leftSize) {
  814. case 2:
  815. overlong = (leftBits <= 0x7F);
  816. break;
  817. case 3:
  818. overlong = (leftBits <= 0x07FF);
  819. break;
  820. case 4:
  821. overlong = (leftBits <= 0xFFFF);
  822. break;
  823. case 5:
  824. overlong = (leftBits <= 0x1FFFFF);
  825. break;
  826. case 6:
  827. overlong = (leftBits <= 0x03FFFFFF);
  828. break;
  829. }
  830. if (overlong) {
  831. #if NET_2_0
  832. Fallback (provider, ref fallbackBuffer, ref bufferArg, bytes, byteIndex, chars, ref posn);
  833. #else
  834. if (throwOnInvalid)
  835. throw new ArgumentException (_("Overlong"), leftBits.ToString ());
  836. #endif
  837. }
  838. else if ((leftBits & 0xF800) == 0xD800) {
  839. // UTF-8 doesn't use surrogate characters
  840. #if NET_2_0
  841. Fallback (provider, ref fallbackBuffer, ref bufferArg, bytes, byteIndex, chars, ref posn);
  842. #else
  843. if (throwOnInvalid)
  844. throw new ArgumentException (_("Arg_InvalidUTF8"), "bytes");
  845. #endif
  846. }
  847. else {
  848. if (posn >= length) {
  849. throw new ArgumentException
  850. (_("Arg_InsufficientSpace"), "chars");
  851. }
  852. chars[posn++] = (char)leftBits;
  853. }
  854. } else if (leftBits < (uint)0x110000) {
  855. if ((posn + 2) > length) {
  856. throw new ArgumentException
  857. (_("Arg_InsufficientSpace"), "chars");
  858. }
  859. leftBits -= (uint)0x10000;
  860. chars[posn++] = (char)((leftBits >> 10) +
  861. (uint)0xD800);
  862. chars[posn++] =
  863. (char)((leftBits & (uint)0x3FF) + (uint)0xDC00);
  864. } else {
  865. #if NET_2_0
  866. Fallback (provider, ref fallbackBuffer, ref bufferArg, bytes, byteIndex, chars, ref posn);
  867. #else
  868. if (throwOnInvalid)
  869. throw new ArgumentException (_("Arg_InvalidUTF8"), "bytes");
  870. #endif
  871. }
  872. leftSize = 0;
  873. }
  874. } else {
  875. // Invalid UTF-8 sequence: clear and restart.
  876. #if NET_2_0
  877. Fallback (provider, ref fallbackBuffer, ref bufferArg, bytes, byteIndex, chars, ref posn);
  878. #else
  879. if (throwOnInvalid)
  880. throw new ArgumentException (_("Arg_InvalidUTF8"), "bytes");
  881. #endif
  882. leftSize = 0;
  883. --byteIndex;
  884. }
  885. }
  886. }
  887. if (flush && leftSize != 0) {
  888. // We had left-over bytes that didn't make up
  889. // a complete UTF-8 character sequence.
  890. #if NET_2_0
  891. Fallback (provider, ref fallbackBuffer, ref bufferArg, bytes, byteIndex, chars, ref posn);
  892. #else
  893. if (throwOnInvalid)
  894. throw new ArgumentException (_("Arg_InvalidUTF8"), "bytes");
  895. #endif
  896. }
  897. leftOverBits = leftBits;
  898. leftOverCount = (leftSoFar | (leftSize << 4));
  899. // Return the final length to the caller.
  900. return posn - charIndex;
  901. }
  // Decode a range of a byte array into a character array.
  //
  // This is a one-shot conversion: the left-over decoder state starts out
  // zeroed on every call and the final argument handed to InternalGetChars
  // is true, so nothing is carried over to a later call.
  // Returns whatever InternalGetChars reports for the conversion.
  public override int GetChars (byte[] bytes, int byteIndex, int byteCount,
                                char[] chars, int charIndex)
  {
      // Fresh (empty) partial-sequence state for this call only.
      uint pendingBits = 0;
      uint pendingCount = 0;
      int produced;
#if NET_2_0
      // The fallback buffer and its scratch array start out null and are
      // passed by reference to InternalGetChars.
      DecoderFallbackBuffer fallbackBuf = null;
      byte [] scratch = null;
      produced = InternalGetChars (bytes, byteIndex, byteCount, chars, charIndex,
                                   ref pendingBits, ref pendingCount,
                                   DecoderFallback, ref fallbackBuf, ref scratch, true);
#else
      produced = InternalGetChars (bytes, byteIndex, byteCount, chars, charIndex,
                                   ref pendingBits, ref pendingCount,
                                   throwOnInvalid, true);
#endif
      return produced;
  }
  918. #if NET_2_0
  // Pointer-based one-shot decode: converts byteCount bytes at 'bytes' into
  // at most charCount characters at 'chars'.
  //
  // The left-over state is zeroed for every call and flush is true, so an
  // incomplete trailing sequence is handled here rather than carried over.
  // Returns the value produced by InternalGetChars.
  [CLSCompliant (false)]
  [ComVisible (false)]
  public unsafe override int GetChars (byte* bytes, int byteCount, char* chars, int charCount)
  {
      // Per-call decoder state; nothing survives past this method.
      uint pendingBits = 0;
      uint pendingCount = 0;
      // Passed by reference to InternalGetChars; both start out null.
      DecoderFallbackBuffer fallbackBuf = null;
      byte [] scratch = null;

      int produced = InternalGetChars (bytes, byteCount, chars, charCount,
                                       ref pendingBits, ref pendingCount,
                                       DecoderFallback, ref fallbackBuf, ref scratch, true);
      return produced;
  }
  930. #endif
  // Get the maximum number of bytes needed to encode a
  // specified number of characters.
  //
  // The estimate is four bytes per character.
  //
  // Throws ArgumentOutOfRangeException when charCount is negative, or when
  // the four-bytes-per-character estimate does not fit in an Int32.
  public override int GetMaxByteCount (int charCount)
  {
      if (charCount < 0) {
          throw new ArgumentOutOfRangeException ("charCount", _("ArgRange_NonNegative"));
      }

      // Do the multiplication in 64 bits: a plain int product silently wraps
      // for large counts and would hand the caller a negative or far too
      // small buffer size.
      long maxBytes = (long) charCount * 4;
      if (maxBytes > int.MaxValue) {
          throw new ArgumentOutOfRangeException ("charCount");
      }
      return (int) maxBytes;
  }
  // Upper bound on the number of characters that decoding byteCount bytes
  // can produce. The bound returned is byteCount itself.
  //
  // Throws ArgumentOutOfRangeException when byteCount is negative.
  public override int GetMaxCharCount (int byteCount)
  {
      if (byteCount >= 0)
          return byteCount;

      throw new ArgumentOutOfRangeException ("byteCount", _("ArgRange_NonNegative"));
  }
  // Create a new stateful UTF-8 decoder configured from this encoding:
  // it receives this instance's DecoderFallback (NET_2_0) or its
  // throwOnInvalid flag (older profiles).
  public override Decoder GetDecoder ()
  {
      Decoder decoder;
#if NET_2_0
      decoder = new UTF8Decoder (DecoderFallback);
#else
      decoder = new UTF8Decoder (throwOnInvalid);
#endif
      return decoder;
  }
  // Create a new stateful UTF-8 encoder; it is handed this instance's
  // emitIdentifier setting.
  public override Encoder GetEncoder ()
  {
      Encoder encoder = new UTF8Encoder (emitIdentifier);
      return encoder;
  }
  // Return the UTF-8 preamble for this encoding: the three bytes
  // EF BB BF when emitIdentifier is set, otherwise an empty array.
  // A new array is allocated on every call.
  public override byte[] GetPreamble ()
  {
      if (!emitIdentifier)
          return new byte [0];

      return new byte [] { 0xEF, 0xBB, 0xBF };
  }
  // Determine if this object is equal to another.
  //
  // Returns true only when 'value' is a UTF8Encoding with the same code
  // page, the same emitIdentifier setting and equivalent invalid-data
  // handling (equal fallbacks on NET_2_0, the same throwOnInvalid flag on
  // older profiles). Returns false for null or any other type.
  public override bool Equals (Object value)
  {
      UTF8Encoding enc = (value as UTF8Encoding);
      if (enc == null) {
          return false;
      }
#if NET_2_0
      // Compare the fallbacks by value, not by reference: two encodings
      // configured identically may hold distinct fallback instances, and
      // reference comparison would report them as different. The static
      // Object.Equals also tolerates a null fallback on either side.
      return (codePage == enc.codePage &&
              emitIdentifier == enc.emitIdentifier &&
              Object.Equals (DecoderFallback, enc.DecoderFallback) &&
              Object.Equals (EncoderFallback, enc.EncoderFallback));
#else
      return (codePage == enc.codePage &&
              emitIdentifier == enc.emitIdentifier &&
              throwOnInvalid == enc.throwOnInvalid);
#endif
  }
  // Hash code for this encoding. No UTF-8 specific state is mixed in;
  // the value is exactly what the base class computes.
  public override int GetHashCode ()
  {
      int inherited = base.GetHashCode ();
      return inherited;
  }
  1000. #if NET_2_0
  // Number of bytes needed to encode the string 's'.
  // Pure pass-through to the base class implementation.
  [MonoTODO]
  public override int GetByteCount (string s)
  {
      // Open question carried over from the original author (hence the
      // MonoTODO): is this override needed at all?
      int byteCount = base.GetByteCount (s);
      return byteCount;
  }
  // Decode 'count' bytes of 'bytes', starting at 'index', into a string.
  // Pure pass-through to the base class implementation.
  [MonoTODO]
  [ComVisible (false)]
  public override string GetString (byte [] bytes, int index, int count)
  {
      // Open question carried over from the original author (hence the
      // MonoTODO): is this override needed at all?
      string decoded = base.GetString (bytes, index, count);
      return decoded;
  }
  1014. #endif
  1015. #if !NET_2_0
  // Encode an entire string into a newly allocated byte array.
  //
  // The array is sized with GetByteCount (s) and then filled by the
  // five-argument GetBytes overload.
  // Throws ArgumentNullException when 's' is null.
  public override byte [] GetBytes (String s)
  {
      if (s == null) {
          throw new ArgumentNullException ("s");
      }

      byte [] encoded = new byte [GetByteCount (s)];
      GetBytes (s, 0, s.Length, encoded, 0);
      return encoded;
  }
  1025. #endif
  // Stateful UTF-8 decoder.
  //
  // Keeps the bits of an unfinished multi-byte sequence between calls so
  // that a character split across two buffers can still be decoded.
  // Field names are left untouched because the class is [Serializable].
  [Serializable]
  private class UTF8Decoder : Decoder
  {
#if !NET_2_0
      // Handed to the Internal* helpers as their throwOnInvalid argument.
      private bool throwOnInvalid;
#endif
      // Payload bits accumulated so far for the unfinished sequence.
      private uint leftOverBits;
      // Packed progress counter used by InternalGetChars: low four bits are
      // the bytes consumed so far, the next four bits the sequence length.
      private uint leftOverCount;

      // Constructor: records how invalid input is handled and starts with
      // no pending partial sequence.
#if NET_2_0
      public UTF8Decoder (DecoderFallback fallback)
#else
      public UTF8Decoder (bool throwOnInvalid)
#endif
      {
#if NET_2_0
          Fallback = fallback;
#else
          this.throwOnInvalid = throwOnInvalid;
#endif
          leftOverBits = leftOverCount = 0;
      }

      // Count the characters that decoding the given byte range would
      // yield. The left-over state is passed by value, so this call does
      // not advance the decoder. The last argument (false) is the flush
      // flag of the corresponding GetChars helper.
      public override int GetCharCount (byte[] bytes, int index, int count)
      {
          int total;
#if NET_2_0
          DecoderFallbackBuffer fallbackBuf = null;
          byte [] scratch = null;
          total = InternalGetCharCount (bytes, index, count,
                                        leftOverBits, leftOverCount,
                                        this, ref fallbackBuf, ref scratch, false);
#else
          total = InternalGetCharCount (bytes, index, count,
                                        leftOverBits, leftOverCount,
                                        throwOnInvalid, false);
#endif
          return total;
      }

      // Decode the given byte range into 'chars'. The left-over fields are
      // passed by reference, so any unfinished trailing sequence is stored
      // for the next call (flush is false).
      public override int GetChars (byte[] bytes, int byteIndex,
                                    int byteCount, char[] chars, int charIndex)
      {
          int produced;
#if NET_2_0
          DecoderFallbackBuffer fallbackBuf = null;
          byte [] scratch = null;
          produced = InternalGetChars (bytes, byteIndex, byteCount, chars, charIndex,
                                       ref leftOverBits, ref leftOverCount,
                                       this, ref fallbackBuf, ref scratch, false);
#else
          produced = InternalGetChars (bytes, byteIndex, byteCount, chars, charIndex,
                                       ref leftOverBits, ref leftOverCount,
                                       throwOnInvalid, false);
#endif
          return produced;
      }
  } // class UTF8Decoder
  // Stateful UTF-8 encoder.
  //
  // Keeps one left-over character for counting and a separate one for
  // conversion, each passed by reference to the matching Internal* helper.
  // Field names are left untouched because the class is [Serializable].
  [Serializable]
  private class UTF8Encoder : Encoder
  {
      // Set from the constructor argument; cleared after each successful
      // GetBytes call.
      private bool emitIdentifier;
      // Left-over character tracked by the GetByteCount overloads.
      private char leftOverForCount;
      // Left-over character tracked by the GetBytes overloads.
      private char leftOverForConv;

      // Constructor: remembers the identifier setting and starts with no
      // left-over character on either path.
      public UTF8Encoder (bool emitIdentifier)
      {
          this.emitIdentifier = emitIdentifier;
          leftOverForCount = leftOverForConv = '\0';
      }

      // Count the bytes needed for a range of a character array.
      public override int GetByteCount (char[] chars, int index,
                                        int count, bool flush)
      {
          int needed = InternalGetByteCount (chars, index, count,
                                             ref leftOverForCount, flush);
          return needed;
      }

      // Encode a range of a character array into 'bytes'.
      public override int GetBytes (char[] chars, int charIndex,
                                    int charCount, byte[] bytes, int byteIndex, bool flush)
      {
          int written = InternalGetBytes (chars, charIndex, charCount,
                                          bytes, byteIndex,
                                          ref leftOverForConv, flush);
          // Only reached when the conversion did not throw.
          emitIdentifier = false;
          return written;
      }

#if NET_2_0
      // Pointer-based variant of GetByteCount.
      public unsafe override int GetByteCount (char* chars, int count, bool flush)
      {
          int needed = InternalGetByteCount (chars, count,
                                             ref leftOverForCount, flush);
          return needed;
      }

      // Pointer-based variant of GetBytes.
      public unsafe override int GetBytes (char* chars, int charCount,
                                           byte* bytes, int byteCount, bool flush)
      {
          int written = InternalGetBytes (chars, charCount, bytes, byteCount,
                                          ref leftOverForConv, flush);
          // Only reached when the conversion did not throw.
          emitIdentifier = false;
          return written;
      }
#endif
  } // class UTF8Encoder
  1120. }; // class UTF8Encoding
  1121. }; // namespace System.Text