UTF8Encoding.cs 35 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213
  1. /*
  2. * UTF8Encoding.cs - Implementation of the "System.Text.UTF8Encoding" class.
  3. *
  4. * Copyright (c) 2001, 2002 Southern Storm Software, Pty Ltd
  5. * Copyright (C) 2004 Novell, Inc (http://www.novell.com)
  6. *
  7. * Permission is hereby granted, free of charge, to any person obtaining
  8. * a copy of this software and associated documentation files (the "Software"),
  9. * to deal in the Software without restriction, including without limitation
  10. * the rights to use, copy, modify, merge, publish, distribute, sublicense,
  11. * and/or sell copies of the Software, and to permit persons to whom the
  12. * Software is furnished to do so, subject to the following conditions:
  13. *
  14. * The above copyright notice and this permission notice shall be included
  15. * in all copies or substantial portions of the Software.
  16. *
  17. * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
  18. * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  19. * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
  20. * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
  21. * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
  22. * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
  23. * OTHER DEALINGS IN THE SOFTWARE.
  24. */
  25. namespace System.Text
  26. {
  27. using System;
  28. using System.Runtime.InteropServices;
  29. [Serializable]
  30. [MonoTODO ("Serialization format not compatible with .NET")]
  31. #if NET_2_0
  32. [MonoTODO ("EncoderFallback is not handled")]
  33. [ComVisible (true)]
  34. #endif
  35. public class UTF8Encoding : Encoding
  36. {
  37. // Magic number used by Windows for UTF-8.
  38. internal const int UTF8_CODE_PAGE = 65001;
  39. // Internal state.
  40. private bool emitIdentifier;
  41. #if !NET_2_0
  42. private bool throwOnInvalid;
  43. #endif
  44. // Constructors.
  // Creates an encoding that emits no UTF-8 identifier and does not throw
  // on invalid byte sequences.
  public UTF8Encoding ()
      : this (false, false)
  {
  }

  // Creates an encoding with the given identifier setting that does not
  // throw on invalid byte sequences.
  public UTF8Encoding (bool encoderShouldEmitUTF8Identifier)
      : this (encoderShouldEmitUTF8Identifier, false)
  {
  }

  // Main constructor: records the identifier setting, configures how invalid
  // input bytes are handled, and fills in the descriptive fields.
  public UTF8Encoding (bool encoderShouldEmitUTF8Identifier, bool throwOnInvalidBytes)
      : base (UTF8_CODE_PAGE)
  {
      emitIdentifier = encoderShouldEmitUTF8Identifier;

  #if NET_2_0
      // Either raise on bad bytes or substitute U+FFFD for them.
      DecoderFallback decoderFallback;
      if (throwOnInvalidBytes)
          decoderFallback = new DecoderExceptionFallback ();
      else
          decoderFallback = new DecoderReplacementFallback ("\uFFFD");
      SetFallbackInternal (null, decoderFallback);
  #else
      throwOnInvalid = throwOnInvalidBytes;
  #endif

      // Names reported for this encoding.
      header_name = "utf-8";
      body_name = "utf-8";
      web_name = "utf-8";
      encoding_name = "Unicode (UTF-8)";

      // Usage flags.
      is_browser_display = true;
      is_browser_save = true;
      is_mail_news_display = true;
      is_mail_news_save = true;

      windows_code_page = UnicodeEncoding.UNICODE_CODE_PAGE;
  }
  68. #region GetByteCount()
  69. // Internal version of "GetByteCount" which can handle a rolling
  70. // state between multiple calls to this method.
  // Array front end for the byte-count computation.
  //   chars/index/count - the range of characters to measure.
  //   leftOver          - pending surrogate start carried between calls.
  //   flush             - when true, a pending surrogate start is counted
  //                       (3 bytes) and cleared.
  // Throws ArgumentNullException / ArgumentOutOfRangeException for a bad range.
  private static int InternalGetByteCount (char[] chars, int index, int count, ref char leftOver, bool flush)
  {
      if (chars == null)
          throw new ArgumentNullException ("chars");
      if (index < 0 || index > chars.Length)
          throw new ArgumentOutOfRangeException ("index", _("ArgRange_Array"));
      if (count < 0 || count > (chars.Length - index))
          throw new ArgumentOutOfRangeException ("count", _("ArgRange_Array"));

      if (index != chars.Length) {
          // There is at least one element after "index": pin the array and
          // let the pointer overload do the scan.
          unsafe {
              fixed (char* start = chars) {
                  return InternalGetByteCount (start + index, count, ref leftOver, flush);
              }
          }
      }

      // The range starts at the very end of the array, so only a pending
      // surrogate start can contribute anything.
      if (!flush || leftOver == '\0')
          return 0;

      leftOver = '\0';
      return 3;
  }
  // Counts the UTF-8 bytes needed for "count" UTF-16 code units at "chars".
  //   leftOver - on entry, a surrogate start left pending by a previous call
  //              ('\0' for none); on exit, the surrogate start still pending
  //              (always '\0' when flush is true).
  //   flush    - when true, a surrogate start still pending at the end is
  //              counted as a 3-byte sequence.
  // Returns the number of bytes required.
  private unsafe static int InternalGetByteCount (char* chars, int count, ref char leftOver, bool flush)
  {
      int index = 0;
      int length = 0;
      char pair = leftOver;
      char ch;

      while (count > 0) {
          ch = chars [index];
          if (pair == 0) {
              if (ch < '\u0080') {
                  // Fast path: consume the whole run of ASCII characters,
                  // one byte each, then re-examine whatever stopped the run.
                  int end = index + count;
                  for (; index < end; index++, count--) {
                      if (chars [index] < '\x80')
                          ++length;
                      else
                          break;
                  }
                  continue;
              } else if (ch < '\u0800') {
                  length += 2;
              } else if (ch >= '\uD800' && ch <= '\uDBFF') {
                  // Start of a surrogate pair: nothing is counted until we
                  // see what follows it.
                  pair = ch;
              } else {
                  // Everything else in the BMP, including a surrogate tail
                  // with no leading surrogate, takes three bytes.
                  length += 3;
              }
          } else if (ch >= '\uDC00' && ch <= '\uDFFF') {
              // "pair" is necessarily non-zero on this branch, so this is a
              // complete surrogate pair.
              length += 4;
              pair = '\0';
          } else {
              // A surrogate start followed by something that is not a
              // surrogate tail: count the start on its own and revisit the
              // current character without advancing.
              length += 3;
              pair = '\0';
              continue;
          }
          ++index;
          --count;
      }

      if (flush) {
          // Count the left-over surrogate start, if any, and clear the state.
          if (pair != '\0')
              length += 3;
          leftOver = '\0';
      } else {
          leftOver = pair;
      }

      return length;
  }
  163. // Get the number of bytes needed to encode a character buffer.
  // Public entry point: measures a character range in one shot, starting
  // with no pending surrogate and flushing at the end.
  public override int GetByteCount (char[] chars, int index, int count)
  {
      char pendingSurrogate = '\0';
      int needed = InternalGetByteCount (chars, index, count, ref pendingSurrogate, true);
      return needed;
  }
  169. #if !NET_2_0
  170. // Convenience wrappers for "GetByteCount".
  // Measures a whole string in one shot (no carried-over state, flushed).
  // Throws ArgumentNullException when "s" is null.
  public override int GetByteCount (String s)
  {
      if (s == null)
          throw new ArgumentNullException ("s");

      char pendingSurrogate = '\0';
      unsafe {
          fixed (char* text = s) {
              return InternalGetByteCount (text, s.Length, ref pendingSurrogate, true);
          }
      }
  }
  184. #endif
  185. #if NET_2_0
  // Pointer-based byte count for "count" characters starting at "chars".
  // Throws ArgumentNullException when "chars" is null and
  // ArgumentOutOfRangeException when "count" is negative (previously a
  // negative count was silently reported as zero bytes).
  [CLSCompliant (false)]
  [ComVisible (false)]
  public unsafe override int GetByteCount (char* chars, int count)
  {
      if (chars == null)
          throw new ArgumentNullException ("chars");
      if (count < 0)
          throw new ArgumentOutOfRangeException ("count");
      if (count == 0)
          return 0;
      char dummy = '\0';
      return InternalGetByteCount (chars, count, ref dummy, true);
  }
  197. #endif
  198. #endregion
  199. #region GetBytes()
  200. // Internal version of "GetBytes" which can handle a rolling
  201. // state between multiple calls to this method.
  // Array front end for the encoder.
  //   chars/charIndex/charCount - the characters to encode.
  //   bytes/byteIndex           - destination buffer and first free position.
  //   leftOver                  - pending surrogate start carried between calls.
  //   flush                     - whether a pending surrogate start must be
  //                               written out at the end.
  // Returns the number of bytes written. Throws ArgumentNullException /
  // ArgumentOutOfRangeException for bad ranges and ArgumentException when
  // the destination is too small.
  private static int InternalGetBytes (char[] chars, int charIndex,
                                       int charCount, byte[] bytes,
                                       int byteIndex, ref char leftOver,
                                       bool flush)
  {
      // Validate the parameters.
      if (chars == null) {
          throw new ArgumentNullException ("chars");
      }
      if (bytes == null) {
          throw new ArgumentNullException ("bytes");
      }
      if (charIndex < 0 || charIndex > chars.Length) {
          throw new ArgumentOutOfRangeException ("charIndex", _("ArgRange_Array"));
      }
      if (charCount < 0 || charCount > (chars.Length - charIndex)) {
          throw new ArgumentOutOfRangeException ("charCount", _("ArgRange_Array"));
      }
      if (byteIndex < 0 || byteIndex > bytes.Length) {
          throw new ArgumentOutOfRangeException ("byteIndex", _("ArgRange_Array"));
      }

      if (charIndex == chars.Length) {
          if (flush && leftOver != '\0') {
  #if NET_2_0
              // FIXME: use EncoderFallback.
              //
              // By default it is empty, so I do nothing for now.
              leftOver = '\0';
  #else
              // Flush the left-over surrogate start as a three-byte sequence,
              // exactly as the pointer overload does. Three free bytes are
              // enough, so only fail when fewer than three remain.
              if (byteIndex > bytes.Length - 3)
                  throw new ArgumentException (_("Arg_InsufficientSpace"), "bytes");
              bytes [byteIndex++] = (byte) (0xE0 | (leftOver >> 12));
              bytes [byteIndex++] = (byte) (0x80 | ((leftOver >> 6) & 0x3F));
              bytes [byteIndex++] = (byte) (0x80 | (leftOver & 0x3F));
              leftOver = '\0';
              return 3;
  #endif
          }
          return 0;
      }

      unsafe {
          fixed (char* cptr = chars) {
              // With no room left in "bytes" there is no element to pin, so
              // hand the pointer overload an empty destination instead.
              if (bytes.Length == byteIndex)
                  return InternalGetBytes (
                      cptr + charIndex, charCount,
                      null, 0, ref leftOver, flush);
              fixed (byte *bptr = bytes) {
                  return InternalGetBytes (
                      cptr + charIndex, charCount,
                      bptr + byteIndex, bytes.Length - byteIndex,
                      ref leftOver, flush);
              }
          }
      }
  }
  // Encodes "charCount" UTF-16 code units at "chars" into at most
  // "byteCount" bytes at "bytes".
  //   leftOver - on entry, a surrogate start pending from a previous call;
  //              on exit, the surrogate start still pending ('\0' if flushed).
  //   flush    - when true, a surrogate start still pending at the end is
  //              written out as a three-byte sequence.
  // Returns the number of bytes written. Throws ArgumentException when the
  // destination is too small; nothing is ever written past "byteCount".
  private unsafe static int InternalGetBytes (char* chars, int charCount,
                                              byte* bytes, int byteCount,
                                              ref char leftOver, bool flush)
  {
      int charIndex = 0;
      int byteIndex = 0;

      // Convert the characters into bytes.
      char ch;
      int length = byteCount;
      char pair = leftOver;
      int posn = byteIndex;
      int code = 0;

      while (charCount > 0) {
          // Fetch the next UTF-16 character pair value.
          ch = chars [charIndex];
          if (pair == '\0') {
              if (ch < '\uD800' || ch >= '\uE000') {
                  if (ch < '\x80') { // fast path optimization
                      int end = charIndex + charCount;
                      for (; charIndex < end; posn++, charIndex++, charCount--) {
                          if (chars [charIndex] >= '\x80')
                              break;
                          // The destination is a raw pointer, so the capacity
                          // has to be checked by hand before every store.
                          if (posn >= length)
                              throw new ArgumentException (_("Arg_InsufficientSpace"), "bytes");
                          bytes [posn] = (byte) chars [charIndex];
                      }
                      continue;
                  }
                  code = ch;
              }
              else if (ch < '\uDC00') {
                  // Surrogate start: remember it and look at the next char.
                  pair = ch;
                  ++charIndex;
                  --charCount;
                  continue;
              } else { // ch <= '\uDFFF'
                  // We have a surrogate tail without leading
                  // surrogate. In NET_2_0 it uses fallback.
                  // In NET_1_1 we output wrong surrogate.
                  if (posn > length - 3) {
                      throw new ArgumentException (_("Arg_InsufficientSpace"), "bytes");
                  }
                  bytes [posn++] = (byte) (0xE0 | (ch >> 12));
                  bytes [posn++] = (byte) (0x80 | ((ch >> 6) & 0x3F));
                  bytes [posn++] = (byte) (0x80 | (ch & 0x3F));
                  ++charIndex;
                  --charCount;
                  continue;
              }
          } else {
              if ('\uDC00' <= ch && ch <= '\uDFFF')
                  // Combine the pending start and this tail into one
                  // supplementary code point.
                  code = 0x10000 + (int) ch - 0xDC00 +
                      (((int) pair - 0xD800) << 10);
              else {
                  // We have a surrogate start followed by a
                  // regular character. Technically, this is
                  // invalid, but we have to do something.
                  // We write out the surrogate start and then
                  // re-visit the current character again.
                  if (posn > length - 3) {
                      throw new ArgumentException (_("Arg_InsufficientSpace"), "bytes");
                  }
                  bytes [posn++] = (byte) (0xE0 | (pair >> 12));
                  bytes [posn++] = (byte) (0x80 | ((pair >> 6) & 0x3F));
                  bytes [posn++] = (byte) (0x80 | (pair & 0x3F));
                  pair = '\0';
                  continue;
              }
              pair = '\0';
          }
          ++charIndex;
          --charCount;

          // Encode the character pair value.
          if (code < 0x0080) {
              if (posn >= length)
                  throw new ArgumentException (_("Arg_InsufficientSpace"), "bytes");
              bytes [posn++] = (byte)code;
          } else if (code < 0x0800) {
              if ((posn + 2) > length)
                  throw new ArgumentException (_("Arg_InsufficientSpace"), "bytes");
              bytes [posn++] = (byte) (0xC0 | (code >> 6));
              bytes [posn++] = (byte) (0x80 | (code & 0x3F));
          } else if (code < 0x10000) {
              if (posn > length - 3)
                  throw new ArgumentException (_("Arg_InsufficientSpace"), "bytes");
              bytes [posn++] = (byte) (0xE0 | (code >> 12));
              bytes [posn++] = (byte) (0x80 | ((code >> 6) & 0x3F));
              bytes [posn++] = (byte) (0x80 | (code & 0x3F));
          } else {
              if (posn > length - 4)
                  throw new ArgumentException (_("Arg_InsufficientSpace"), "bytes");
              bytes [posn++] = (byte) (0xF0 | (code >> 18));
              bytes [posn++] = (byte) (0x80 | ((code >> 12) & 0x3F));
              bytes [posn++] = (byte) (0x80 | ((code >> 6) & 0x3F));
              bytes [posn++] = (byte) (0x80 | (code & 0x3F));
          }
      }

      if (flush) {
          if (pair != '\0') {
              // Flush the left-over incomplete surrogate.
              if (posn > length - 3) {
                  throw new ArgumentException (_("Arg_InsufficientSpace"), "bytes");
              }
              bytes [posn++] = (byte) (0xE0 | (pair >> 12));
              bytes [posn++] = (byte) (0x80 | ((pair >> 6) & 0x3F));
              bytes [posn++] = (byte) (0x80 | (pair & 0x3F));
          }
          leftOver = '\0';
      }
      else
          leftOver = pair;

      // Return the final count to the caller.
      return posn - byteIndex;
  }
  // Placeholder for an encoder-side fallback; it is not implemented and
  // always throws NotImplementedException.
  private unsafe int Fallback (byte* bytes, int byteCount, char lead, char tail)
  {
      NotImplementedException notImplemented = new NotImplementedException ();
      throw notImplemented;
  }
  378. // Get the bytes that result from encoding a character buffer.
  // Public entry point: encodes a character range in one shot, starting
  // with no pending surrogate and flushing at the end.
  public override int GetBytes (char[] chars, int charIndex, int charCount,
                                byte[] bytes, int byteIndex)
  {
      char pendingSurrogate = '\0';
      int written = InternalGetBytes (chars, charIndex, charCount, bytes, byteIndex, ref pendingSurrogate, true);
      return written;
  }
  385. // Convenience wrappers for "GetBytes".
  // Encodes a substring of "s" into "bytes" starting at "byteIndex" and
  // returns the number of bytes written. Throws ArgumentNullException for a
  // null string or buffer and ArgumentOutOfRangeException for a bad range.
  public override int GetBytes (String s, int charIndex, int charCount,
                                byte[] bytes, int byteIndex)
  {
      if (s == null)
          throw new ArgumentNullException ("s");
      if (bytes == null)
          throw new ArgumentNullException ("bytes");
      if (charIndex < 0 || charIndex > s.Length)
          throw new ArgumentOutOfRangeException ("charIndex", _("ArgRange_StringIndex"));
      if (charCount < 0 || charCount > (s.Length - charIndex))
          throw new ArgumentOutOfRangeException ("charCount", _("ArgRange_StringRange"));
      if (byteIndex < 0 || byteIndex > bytes.Length)
          throw new ArgumentOutOfRangeException ("byteIndex", _("ArgRange_Array"));

      // Starting at the end of the string: nothing to encode.
      if (charIndex == s.Length)
          return 0;

      char pendingSurrogate = '\0';
      int room = bytes.Length - byteIndex;

      unsafe {
          fixed (char* text = s) {
              // A full destination has no element to pin, so the pointer
              // overload is given an empty buffer instead.
              if (room == 0)
                  return InternalGetBytes (text + charIndex, charCount,
                                           null, 0, ref pendingSurrogate, true);

              fixed (byte* output = bytes) {
                  return InternalGetBytes (text + charIndex, charCount,
                                           output + byteIndex, room,
                                           ref pendingSurrogate, true);
              }
          }
      }
  }
  423. #if NET_2_0
  // Pointer-based encode of "charCount" characters at "chars" into at most
  // "byteCount" bytes at "bytes"; returns the number of bytes written.
  // Throws ArgumentNullException for a null pointer and
  // IndexOutOfRangeException for a negative count.
  // NOTE(review): the .NET documentation specifies ArgumentOutOfRangeException
  // for negative counts; the exception type is left as it was so existing
  // callers are unaffected - confirm whether it should be aligned.
  [CLSCompliant (false)]
  [ComVisible (false)]
  public unsafe override int GetBytes (char* chars, int charCount, byte* bytes, int byteCount)
  {
      if (chars == null)
          throw new ArgumentNullException ("chars");
      if (charCount < 0)
          throw new IndexOutOfRangeException ("charCount");
      if (bytes == null)
          throw new ArgumentNullException ("bytes");
      if (byteCount < 0)
          // Report the argument that is actually out of range.
          throw new IndexOutOfRangeException ("byteCount");
      if (charCount == 0)
          return 0;
      char dummy = '\0';
      if (byteCount == 0)
          return InternalGetBytes (chars, charCount, null, 0, ref dummy, true);
      else
          return InternalGetBytes (chars, charCount, bytes, byteCount, ref dummy, true);
  }
  444. #endif
  445. #endregion
  446. // Internal version of "GetCharCount" which can handle a rolling
  447. // state between multiple calls to this method.
  #if NET_2_0
  // Array front end for the character-count computation: validates the
  // range, pins the array and forwards to the pointer overload.
  private unsafe static int InternalGetCharCount (
      byte[] bytes, int index, int count, uint leftOverBits,
      uint leftOverCount, object provider,
      ref DecoderFallbackBuffer fallbackBuffer, ref byte [] bufferArg, bool flush)
  #else
  // Array front end for the character-count computation: validates the
  // range, pins the array and forwards to the pointer overload.
  private unsafe static int InternalGetCharCount (
      byte[] bytes, int index, int count, uint leftOverBits,
      uint leftOverCount, bool throwOnInvalid, bool flush)
  #endif
  {
      if (bytes == null)
          throw new ArgumentNullException ("bytes");
      if (index < 0 || index > bytes.Length)
          throw new ArgumentOutOfRangeException ("index", _("ArgRange_Array"));
      if (count < 0 || count > (bytes.Length - index))
          throw new ArgumentOutOfRangeException ("count", _("ArgRange_Array"));

      // An empty range yields no characters.
      if (count == 0)
          return 0;

      fixed (byte* start = bytes) {
  #if NET_2_0
          return InternalGetCharCount (start + index, count,
              leftOverBits, leftOverCount, provider, ref fallbackBuffer, ref bufferArg, flush);
  #else
          return InternalGetCharCount (start + index, count,
              leftOverBits, leftOverCount, throwOnInvalid, flush);
  #endif
      }
  }
  #if NET_2_0
  // Counts the UTF-16 code units produced by decoding "count" bytes at
  // "bytes". leftOverBits / leftOverCount describe a sequence left
  // unfinished by a previous call: the low nibble of leftOverCount is the
  // number of bytes seen so far, the next nibble is the sequence length.
  private unsafe static int InternalGetCharCount (
      byte* bytes, int count, uint leftOverBits,
      uint leftOverCount, object provider,
      ref DecoderFallbackBuffer fallbackBuffer, ref byte [] bufferArg, bool flush)
  #else
  // Counts the UTF-16 code units produced by decoding "count" bytes at
  // "bytes". leftOverBits / leftOverCount describe a sequence left
  // unfinished by a previous call: the low nibble of leftOverCount is the
  // number of bytes seen so far, the next nibble is the sequence length.
  private unsafe static int InternalGetCharCount (
      byte* bytes, int count, uint leftOverBits,
      uint leftOverCount, bool throwOnInvalid, bool flush)
  #endif
  {
      int index = 0;
      int length = 0;

      // With nothing pending, a leading run of ASCII bytes maps one-to-one
      // onto characters.
      if (leftOverCount == 0) {
          while (count > 0 && bytes [index] < 0x80) {
              length++;
              index++;
              count--;
          }
      }

      // State of the multi-byte sequence currently being assembled.
      uint bits = leftOverBits;
      uint seen = (leftOverCount & (uint)0x0F);
      uint expected = ((leftOverCount >> 4) & (uint)0x0F);
      uint b;

      while (count > 0) {
          b = (uint)(bytes [index++]);
          --count;

          if (expected == 0) {
              // Not inside a sequence: classify the lead byte.
              if (b < (uint)0x0080) {
                  ++length;
              } else if ((b & (uint)0xE0) == (uint)0xC0) {
                  bits = (b & (uint)0x1F);
                  seen = 1;
                  expected = 2;
              } else if ((b & (uint)0xF0) == (uint)0xE0) {
                  bits = (b & (uint)0x0F);
                  seen = 1;
                  expected = 3;
              } else if ((b & (uint)0xF8) == (uint)0xF0) {
                  bits = (b & (uint)0x07);
                  seen = 1;
                  expected = 4;
              } else if ((b & (uint)0xFC) == (uint)0xF8) {
                  bits = (b & (uint)0x03);
                  seen = 1;
                  expected = 5;
              } else if ((b & (uint)0xFE) == (uint)0xFC) {
                  bits = (b & (uint)0x03);
                  seen = 1;
                  expected = 6;
              } else {
                  // Not a valid lead byte.
  #if NET_2_0
                  length += Fallback (provider, ref fallbackBuffer, ref bufferArg, bytes, index - 1, 1);
  #else
                  if (throwOnInvalid)
                      throw new ArgumentException (_("Arg_InvalidUTF8"), "bytes");
  #endif
              }
              continue;
          }

          if ((b & (uint)0xC0) != (uint)0x80) {
              // The sequence was cut short by a non-continuation byte:
              // report it, drop the sequence and re-read this byte as a
              // possible lead byte.
              // NOTE(review): "index" has already moved past the offending
              // byte here, so "index - seen" looks like it starts one byte
              // after the lead byte - confirm the intended range.
  #if NET_2_0
              length += Fallback (provider, ref fallbackBuffer, ref bufferArg, bytes, index - seen, seen);
  #else
              if (throwOnInvalid)
                  throw new ArgumentException (_("Arg_InvalidUTF8"), "bytes");
  #endif
              expected = 0;
              --index;
              ++count;
              continue;
          }

          // A continuation byte: fold in its six payload bits.
          bits = ((bits << 6) | (b & (uint)0x3F));
          if (++seen < expected)
              continue;

          // The sequence is complete.
          if (bits < (uint)0x10000) {
              // Was the value encoded with more bytes than it needs?
              bool overlong =
                  (expected == 2 && bits <= 0x7F) ||
                  (expected == 3 && bits <= 0x07FF) ||
                  (expected == 4 && bits <= 0xFFFF) ||
                  (expected == 5 && bits <= 0x1FFFFF) ||
                  (expected == 6 && bits <= 0x03FFFFFF);
              if (overlong) {
  #if NET_2_0
                  length += Fallback (provider, ref fallbackBuffer, ref bufferArg, bytes, index - seen, seen);
  #else
                  if (throwOnInvalid)
                      throw new ArgumentException (_("Overlong"), bits.ToString ());
  #endif
              }
              else
                  ++length;
          } else if (bits < (uint)0x110000) {
              // Supplementary code point: counted as two UTF-16 units.
              length += 2;
          } else {
              // Beyond U+10FFFF.
  #if NET_2_0
              length += Fallback (provider, ref fallbackBuffer, ref bufferArg, bytes, index - seen, seen);
  #else
              if (throwOnInvalid)
                  throw new ArgumentException (_("Arg_InvalidUTF8"), "bytes");
  #endif
          }
          expected = 0;
      }

      if (flush && expected != 0) {
          // The input ended in the middle of a sequence.
  #if NET_2_0
          length += Fallback (provider, ref fallbackBuffer, ref bufferArg, bytes, index - seen, seen);
  #else
          if (throwOnInvalid)
              throw new ArgumentException (_("Arg_InvalidUTF8"), "bytes");
  #endif
      }

      return length;
  }
  624. #if NET_2_0
  625. // for GetCharCount()
  // Feeds "size" bytes starting at bytes[index] to the decoder fallback,
  // one byte at a time, and returns how many replacement characters the
  // fallback reported in total. "buffer" and "bufferArg" are created on
  // first use and handed back to the caller through the ref parameters.
  static unsafe int Fallback (object provider, ref DecoderFallbackBuffer buffer, ref byte [] bufferArg, byte* bytes, long index, uint size)
  {
      if (buffer == null) {
          // "provider" is either a DecoderFallback or a Decoder.
          DecoderFallback fallback = provider as DecoderFallback;
          buffer = (fallback != null)
              ? fallback.CreateFallbackBuffer ()
              : ((Decoder) provider).FallbackBuffer;
      }
      if (bufferArg == null)
          bufferArg = new byte [1];

      int produced = 0;
      int offset = (int) index;
      for (int i = 0; i < size; i++, offset++) {
          bufferArg [0] = bytes [offset];
          buffer.Fallback (bufferArg, 0);
          produced += buffer.Remaining;
          buffer.Reset ();
      }
      return produced;
  }
  646. // for GetChars()
  // Feeds "size" bytes starting at bytes[byteIndex] to the decoder
  // fallback, one byte at a time, and appends every replacement character
  // to "chars", advancing "charIndex". "buffer" and "bufferArg" are created
  // on first use and handed back through the ref parameters.
  // NOTE(review): no capacity is passed for "chars", so this method cannot
  // check for room itself - presumably the caller guarantees it; confirm.
  static unsafe void Fallback (object provider, ref DecoderFallbackBuffer buffer, ref byte [] bufferArg, byte* bytes, long byteIndex, uint size,
                               char* chars, ref int charIndex)
  {
      if (buffer == null) {
          // "provider" is either a DecoderFallback or a Decoder.
          DecoderFallback fallback = provider as DecoderFallback;
          buffer = (fallback != null)
              ? fallback.CreateFallbackBuffer ()
              : ((Decoder) provider).FallbackBuffer;
      }
      if (bufferArg == null)
          bufferArg = new byte [1];

      byte* cursor = bytes + byteIndex;
      for (uint remaining = size; remaining > 0; remaining--, cursor++) {
          bufferArg [0] = *cursor;
          buffer.Fallback (bufferArg, 0);
          while (buffer.Remaining > 0) {
              chars [charIndex] = buffer.GetNextChar ();
              charIndex++;
          }
          buffer.Reset ();
      }
  }
  667. #endif
  668. // Get the number of characters needed to decode a byte buffer.
  // Public entry point: counts the characters a byte range decodes to, in
  // one shot (no carried-over sequence, flushed at the end).
  public override int GetCharCount (byte[] bytes, int index, int count)
  {
  #if NET_2_0
      // Both are created lazily, only if invalid input is met.
      byte [] scratch = null;
      DecoderFallbackBuffer fallbackBuffer = null;
      int total = InternalGetCharCount (bytes, index, count, 0, 0, DecoderFallback, ref fallbackBuffer, ref scratch, true);
      return total;
  #else
      int total = InternalGetCharCount (bytes, index, count, 0, 0, throwOnInvalid, true);
      return total;
  #endif
  }
  679. #if NET_2_0
  // Pointer-based character count for "count" bytes starting at "bytes".
  // Throws ArgumentNullException when "bytes" is null and
  // ArgumentOutOfRangeException when "count" is negative, matching the
  // argument checks of the pointer-based GetByteCount overload.
  [CLSCompliant (false)]
  [ComVisible (false)]
  public unsafe override int GetCharCount (byte* bytes, int count)
  {
      // Without these checks a null pointer would be dereferenced and a
      // negative count silently reported as zero characters.
      if (bytes == null)
          throw new ArgumentNullException ("bytes");
      if (count < 0)
          throw new ArgumentOutOfRangeException ("count");
      DecoderFallbackBuffer buf = null;
      byte [] bufferArg = null;
      return InternalGetCharCount (bytes, count, 0, 0, DecoderFallback, ref buf, ref bufferArg, true);
  }
  688. #endif
  689. // Get the characters that result from decoding a byte buffer.
  #if NET_2_0
  // Array front end for the decoder: validates both ranges, pins the
  // arrays and forwards to the pointer overload. Returns the number of
  // characters written.
  private unsafe static int InternalGetChars (
      byte[] bytes, int byteIndex, int byteCount, char[] chars,
      int charIndex, ref uint leftOverBits, ref uint leftOverCount,
      object provider,
      ref DecoderFallbackBuffer fallbackBuffer, ref byte [] bufferArg, bool flush)
  #else
  // Array front end for the decoder: validates both ranges, pins the
  // arrays and forwards to the pointer overload. Returns the number of
  // characters written.
  private unsafe static int InternalGetChars (
      byte[] bytes, int byteIndex, int byteCount, char[] chars,
      int charIndex, ref uint leftOverBits, ref uint leftOverCount,
      bool throwOnInvalid, bool flush)
  #endif
  {
      if (bytes == null)
          throw new ArgumentNullException ("bytes");
      if (chars == null)
          throw new ArgumentNullException ("chars");
      if (byteIndex < 0 || byteIndex > bytes.Length)
          throw new ArgumentOutOfRangeException ("byteIndex", _("ArgRange_Array"));
      if (byteCount < 0 || byteCount > (bytes.Length - byteIndex))
          throw new ArgumentOutOfRangeException ("byteCount", _("ArgRange_Array"));
      if (charIndex < 0 || charIndex > chars.Length)
          throw new ArgumentOutOfRangeException ("charIndex", _("ArgRange_Array"));

      // No room at all in the destination: report zero characters.
      if (charIndex == chars.Length)
          return 0;

      int room = chars.Length - charIndex;
      // With no input there is no byte to pin, so the pointer overload is
      // given an empty source instead.
      bool noInput = (byteCount == 0 || byteIndex == bytes.Length);

      fixed (char* output = chars) {
  #if NET_2_0
          if (noInput)
              return InternalGetChars (null, 0, output + charIndex, room, ref leftOverBits, ref leftOverCount, provider, ref fallbackBuffer, ref bufferArg, flush);
          fixed (byte* input = bytes) {
              return InternalGetChars (input + byteIndex, byteCount, output + charIndex, room, ref leftOverBits, ref leftOverCount, provider, ref fallbackBuffer, ref bufferArg, flush);
          }
  #else
          if (noInput)
              return InternalGetChars (null, 0, output + charIndex, room, ref leftOverBits, ref leftOverCount, throwOnInvalid, flush);
          fixed (byte* input = bytes) {
              return InternalGetChars (input + byteIndex, byteCount, output + charIndex, room, ref leftOverBits, ref leftOverCount, throwOnInvalid, flush);
          }
  #endif
      }
  }
  // Decode a run of UTF-8 bytes into UTF-16 chars using raw pointers.
  //
  //   bytes, byteCount - input buffer and the number of bytes to consume.
  //   chars, charCount - output buffer and its capacity in chars.
  //   leftOverBits     - in/out: code point bits accumulated from an
  //                      unfinished multi-byte sequence.
  //   leftOverCount    - in/out: packed state. The low nibble is the number
  //                      of sequence bytes seen so far, the next nibble is
  //                      the total sequence length (0 = no open sequence).
  //   flush            - when true, an unfinished trailing sequence is
  //                      treated as invalid input.
  //
  // Invalid input is handed to Fallback (NET_2_0). On older profiles it
  // raises ArgumentException when throwOnInvalid is set and is otherwise
  // dropped without producing output.
  // Throws ArgumentException ("chars") when the output buffer is too small.
  // Returns the number of chars written.
  #if NET_2_0
  private unsafe static int InternalGetChars (
      byte* bytes, int byteCount, char* chars, int charCount,
      ref uint leftOverBits, ref uint leftOverCount,
      object provider,
      ref DecoderFallbackBuffer fallbackBuffer, ref byte [] bufferArg, bool flush)
  #else
  private unsafe static int InternalGetChars (
      byte* bytes, int byteCount, char* chars, int charCount,
      ref uint leftOverBits, ref uint leftOverCount,
      bool throwOnInvalid, bool flush)
  #endif
  {
      int charIndex = 0, byteIndex = 0;
      int length = charCount;
      int posn = charIndex;

      if (leftOverCount == 0) {
          // Fast path: copy leading ASCII bytes straight across. This only
          // runs when no partial sequence was carried in from a prior call.
          int end = byteIndex + byteCount;
          for (; byteIndex < end; posn++, byteIndex++, byteCount--) {
              if (bytes [byteIndex] < 0x80) {
                  // The output is a raw pointer, so the capacity must be
                  // checked explicitly before every write.
                  if (posn >= length) {
                      throw new ArgumentException (_("Arg_InsufficientSpace"), "chars");
                  }
                  chars [posn] = (char) bytes [byteIndex];
              } else {
                  break;
              }
          }
      }

      // Convert the bytes into the output buffer.
      uint ch;
      uint leftBits = leftOverBits;
      uint leftSoFar = (leftOverCount & (uint)0x0F);
      uint leftSize = ((leftOverCount >> 4) & (uint)0x0F);

      int byteEnd = byteIndex + byteCount;
      for(; byteIndex < byteEnd; byteIndex++) {
          // Fetch the next character from the byte buffer.
          ch = (uint)(bytes[byteIndex]);
          if (leftSize == 0) {
              // Process a UTF-8 start character.
              if (ch < (uint)0x0080) {
                  // Single-byte UTF-8 character.
                  if (posn >= length) {
                      throw new ArgumentException (_("Arg_InsufficientSpace"), "chars");
                  }
                  chars[posn++] = (char)ch;
              } else if ((ch & (uint)0xE0) == (uint)0xC0) {
                  // Double-byte UTF-8 character.
                  leftBits = (ch & (uint)0x1F);
                  leftSoFar = 1;
                  leftSize = 2;
              } else if ((ch & (uint)0xF0) == (uint)0xE0) {
                  // Three-byte UTF-8 character.
                  leftBits = (ch & (uint)0x0F);
                  leftSoFar = 1;
                  leftSize = 3;
              } else if ((ch & (uint)0xF8) == (uint)0xF0) {
                  // Four-byte UTF-8 character.
                  leftBits = (ch & (uint)0x07);
                  leftSoFar = 1;
                  leftSize = 4;
              } else if ((ch & (uint)0xFC) == (uint)0xF8) {
                  // Five-byte UTF-8 character.
                  leftBits = (ch & (uint)0x03);
                  leftSoFar = 1;
                  leftSize = 5;
              } else if ((ch & (uint)0xFE) == (uint)0xFC) {
                  // Six-byte UTF-8 character.
                  leftBits = (ch & (uint)0x03);
                  leftSoFar = 1;
                  leftSize = 6;
              } else {
                  // Invalid UTF-8 start character.
  #if NET_2_0
                  Fallback (provider, ref fallbackBuffer, ref bufferArg, bytes, byteIndex, 1, chars, ref posn);
  #else
                  if (throwOnInvalid)
                      throw new ArgumentException (_("Arg_InvalidUTF8"), "bytes");
  #endif
              }
          } else {
              // Process an extra byte in a multi-byte sequence.
              if ((ch & (uint)0xC0) == (uint)0x80) {
                  leftBits = ((leftBits << 6) | (ch & (uint)0x3F));
                  if (++leftSoFar >= leftSize) {
                      // We have a complete character now.
                      if (leftBits < (uint)0x10000) {
                          // Reject encodings that use more bytes than the
                          // value needs (overlong forms).
                          bool overlong = false;
                          switch (leftSize) {
                          case 2:
                              overlong = (leftBits <= 0x7F);
                              break;
                          case 3:
                              overlong = (leftBits <= 0x07FF);
                              break;
                          case 4:
                              overlong = (leftBits <= 0xFFFF);
                              break;
                          case 5:
                              overlong = (leftBits <= 0x1FFFFF);
                              break;
                          case 6:
                              overlong = (leftBits <= 0x03FFFFFF);
                              break;
                          }
                          if (overlong) {
  #if NET_2_0
                              Fallback (provider, ref fallbackBuffer, ref bufferArg, bytes, byteIndex - leftSoFar, leftSoFar, chars, ref posn);
  #else
                              if (throwOnInvalid)
                                  throw new ArgumentException (_("Overlong"), leftBits.ToString ());
  #endif
                          }
                          else if ((leftBits & 0xF800) == 0xD800) {
                              // UTF-8 doesn't use surrogate characters
  #if NET_2_0
                              Fallback (provider, ref fallbackBuffer, ref bufferArg, bytes, byteIndex - leftSoFar, leftSoFar, chars, ref posn);
  #else
                              if (throwOnInvalid)
                                  throw new ArgumentException (_("Arg_InvalidUTF8"), "bytes");
  #endif
                          }
                          else {
                              if (posn >= length) {
                                  throw new ArgumentException
                                      (_("Arg_InsufficientSpace"), "chars");
                              }
                              chars[posn++] = (char)leftBits;
                          }
                      } else if (leftBits < (uint)0x110000) {
                          // Supplementary code point: emit a surrogate pair.
                          // NOTE(review): this branch does not look at leftSize,
                          // so 5- and 6-byte forms that decode into this range
                          // appear to be accepted - confirm whether that is
                          // intended before changing it, and keep any change in
                          // step with the matching char-count routine.
                          if ((posn + 2) > length) {
                              throw new ArgumentException
                                  (_("Arg_InsufficientSpace"), "chars");
                          }
                          leftBits -= (uint)0x10000;
                          chars[posn++] = (char)((leftBits >> 10) +
                                                 (uint)0xD800);
                          chars[posn++] =
                              (char)((leftBits & (uint)0x3FF) + (uint)0xDC00);
                      } else {
                          // Value is beyond the last code point handled above.
  #if NET_2_0
                          Fallback (provider, ref fallbackBuffer, ref bufferArg, bytes, byteIndex - leftSoFar, leftSoFar, chars, ref posn);
  #else
                          if (throwOnInvalid)
                              throw new ArgumentException (_("Arg_InvalidUTF8"), "bytes");
  #endif
                      }
                      leftSize = 0;
                  }
              } else {
                  // Invalid UTF-8 sequence: clear and restart.
  #if NET_2_0
                  Fallback (provider, ref fallbackBuffer, ref bufferArg, bytes, byteIndex - leftSoFar, leftSoFar, chars, ref posn);
  #else
                  if (throwOnInvalid)
                      throw new ArgumentException (_("Arg_InvalidUTF8"), "bytes");
  #endif
                  leftSize = 0;
                  // Step back so the loop increment revisits this byte as a
                  // potential start byte.
                  --byteIndex;
              }
          }
      }
      if (flush && leftSize != 0) {
          // We had left-over bytes that didn't make up
          // a complete UTF-8 character sequence.
  #if NET_2_0
          Fallback (provider, ref fallbackBuffer, ref bufferArg, bytes, byteIndex - leftSoFar, leftSoFar, chars, ref posn);
  #else
          if (throwOnInvalid)
              throw new ArgumentException (_("Arg_InvalidUTF8"), "bytes");
  #endif
      }

      // Hand the (possibly unfinished) sequence state back to the caller.
      leftOverBits = leftBits;
      leftOverCount = (leftSoFar | (leftSize << 4));

      // Return the final length to the caller.
      return posn - charIndex;
  }
  // Decode a range of a byte array into a char array. Every call starts
  // with no carried-over sequence state and flushes at the end of the input.
  public override int GetChars (byte[] bytes, int byteIndex, int byteCount,
                                char[] chars, int charIndex)
  {
      // One-shot conversion: nothing is pending from an earlier call.
      uint pendingBits = 0;
      uint pendingCount = 0;
      int written;
  #if NET_2_0
      DecoderFallbackBuffer fallbackBuffer = null;
      byte [] fallbackBytes = null;
      written = InternalGetChars (
          bytes, byteIndex, byteCount,
          chars, charIndex,
          ref pendingBits, ref pendingCount,
          DecoderFallback, ref fallbackBuffer, ref fallbackBytes, true);
  #else
      written = InternalGetChars (
          bytes, byteIndex, byteCount,
          chars, charIndex,
          ref pendingBits, ref pendingCount,
          throwOnInvalid, true);
  #endif
      return written;
  }
  927. #if NET_2_0
  // Pointer-based variant of GetChars: decodes byteCount bytes into a
  // buffer of charCount chars, starting with no carried-over sequence
  // state and flushing at the end of the input.
  [CLSCompliant (false)]
  [ComVisible (false)]
  public unsafe override int GetChars (byte* bytes, int byteCount, char* chars, int charCount)
  {
      uint pendingBits = 0;
      uint pendingCount = 0;
      DecoderFallbackBuffer fallbackBuffer = null;
      byte [] fallbackBytes = null;

      int written = InternalGetChars (
          bytes, byteCount,
          chars, charCount,
          ref pendingBits, ref pendingCount,
          DecoderFallback, ref fallbackBuffer, ref fallbackBytes, true);
      return written;
  }
  939. #endif
  // Get the maximum number of bytes needed to encode a
  // specified number of characters (four bytes per character).
  //
  // Throws ArgumentOutOfRangeException when charCount is negative, or
  // when the result would not fit in an int.
  public override int GetMaxByteCount (int charCount)
  {
      if (charCount < 0) {
          throw new ArgumentOutOfRangeException ("charCount", _("ArgRange_NonNegative"));
      }

      // Multiply in 64 bits: a plain int multiplication wraps around for
      // large inputs and would hand the caller a bogus (even negative)
      // buffer size.
      long maxBytes = (long) charCount * 4;
      if (maxBytes > Int32.MaxValue) {
          throw new ArgumentOutOfRangeException ("charCount");
      }
      return (int) maxBytes;
  }
  // Upper bound on the number of characters produced by decoding
  // byteCount bytes: one character per input byte.
  // Throws ArgumentOutOfRangeException when byteCount is negative.
  public override int GetMaxCharCount (int byteCount)
  {
      if (byteCount >= 0) {
          return byteCount;
      }
      throw new ArgumentOutOfRangeException ("byteCount", _("ArgRange_NonNegative"));
  }
  // Create a new UTF8Decoder configured from this encoding's
  // invalid-input policy.
  public override Decoder GetDecoder ()
  {
      Decoder decoder;
  #if NET_2_0
      decoder = new UTF8Decoder (DecoderFallback);
  #else
      decoder = new UTF8Decoder (throwOnInvalid);
  #endif
      return decoder;
  }
  // Create a new UTF8Encoder that receives this encoding's
  // emitIdentifier setting.
  public override Encoder GetEncoder ()
  {
      Encoder encoder = new UTF8Encoder (emitIdentifier);
      return encoder;
  }
  // Return the UTF-8 preamble bytes (EF BB BF) when emitIdentifier is
  // set, otherwise an empty array. A fresh array is returned each time.
  public override byte[] GetPreamble ()
  {
      if (!emitIdentifier) {
          return new byte [0];
      }
      return new byte [] { (byte)0xEF, (byte)0xBB, (byte)0xBF };
  }
  // Determine if this object is equal to another.
  //
  // Two UTF8Encoding instances are equal when they share the same code
  // page and emitIdentifier setting and have equal invalid-input policy
  // (the decoder/encoder fallbacks on NET_2_0, throwOnInvalid otherwise).
  // Returns false when value is null or not a UTF8Encoding.
  public override bool Equals (Object value)
  {
      UTF8Encoding enc = (value as UTF8Encoding);
      if (enc == null) {
          return false;
      }
      if (codePage != enc.codePage || emitIdentifier != enc.emitIdentifier) {
          return false;
      }
  #if NET_2_0
      // Compare the fallbacks through Equals rather than by reference, so
      // that distinct fallback instances that consider themselves equal
      // do not make the encodings unequal. Object.Equals (a, b) also
      // copes with either side being null.
      return (Object.Equals (DecoderFallback, enc.DecoderFallback) &&
              Object.Equals (EncoderFallback, enc.EncoderFallback));
  #else
      return (throwOnInvalid == enc.throwOnInvalid);
  #endif
  }
  // Hash code for this encoding; defers entirely to the base class.
  public override int GetHashCode ()
  {
      int hash = base.GetHashCode ();
      return hash;
  }
  1009. #if NET_2_0
  // Byte count for a string; currently a plain pass-through to the
  // base implementation (it is unclear whether this override is needed).
  [MonoTODO]
  public override int GetByteCount (string s)
  {
      int byteTotal = base.GetByteCount (s);
      return byteTotal;
  }
  // Decode a byte range into a string; currently a plain pass-through
  // to the base implementation (it is unclear whether this override is
  // needed).
  [MonoTODO]
  [ComVisible (false)]
  public override string GetString (byte [] bytes, int index, int count)
  {
      string decoded = base.GetString (bytes, index, count);
      return decoded;
  }
  1023. #endif
  1024. #if !NET_2_0
  // Encode a whole string into a newly allocated byte array sized by
  // GetByteCount. Throws ArgumentNullException when s is null.
  public override byte [] GetBytes (String s)
  {
      if (s == null) {
          throw new ArgumentNullException ("s");
      }

      byte [] encoded = new byte [GetByteCount (s)];
      GetBytes (s, 0, s.Length, encoded, 0);
      return encoded;
  }
  1034. #endif
  // Stateful UTF-8 decoder. It keeps the bits and byte counts of an
  // unfinished multi-byte sequence between calls so that input can be
  // supplied in pieces.
  [Serializable]
  private class UTF8Decoder : Decoder
  {
  #if !NET_2_0
      private bool throwOnInvalid;
  #endif
      // Sequence state carried from one GetChars call to the next.
      private uint leftOverBits;
      private uint leftOverCount;

      // Create a decoder with empty sequence state and the given
      // invalid-input policy.
  #if NET_2_0
      public UTF8Decoder (DecoderFallback fallback)
  #else
      public UTF8Decoder (bool throwOnInvalid)
  #endif
      {
          leftOverBits = 0;
          leftOverCount = 0;
  #if NET_2_0
          Fallback = fallback;
  #else
          this.throwOnInvalid = throwOnInvalid;
  #endif
      }

      // Count the chars the given bytes would decode to. The carried
      // state is passed by value, so the stored fields are not updated
      // here. No flush is requested.
      public override int GetCharCount (byte[] bytes, int index, int count)
      {
          int total;
  #if NET_2_0
          DecoderFallbackBuffer fallbackBuffer = null;
          byte [] fallbackBytes = null;
          total = InternalGetCharCount (
              bytes, index, count,
              leftOverBits, leftOverCount,
              this, ref fallbackBuffer, ref fallbackBytes, false);
  #else
          total = InternalGetCharCount (
              bytes, index, count,
              leftOverBits, leftOverCount,
              throwOnInvalid, false);
  #endif
          return total;
      }

      // Decode the given bytes into chars. The carried state is passed
      // by reference, so an unfinished trailing sequence is remembered
      // for the next call. No flush is requested.
      public override int GetChars (byte[] bytes, int byteIndex,
                                    int byteCount, char[] chars, int charIndex)
      {
          int written;
  #if NET_2_0
          DecoderFallbackBuffer fallbackBuffer = null;
          byte [] fallbackBytes = null;
          written = InternalGetChars (
              bytes, byteIndex, byteCount,
              chars, charIndex,
              ref leftOverBits, ref leftOverCount,
              this, ref fallbackBuffer, ref fallbackBytes, false);
  #else
          written = InternalGetChars (
              bytes, byteIndex, byteCount,
              chars, charIndex,
              ref leftOverBits, ref leftOverCount,
              throwOnInvalid, false);
  #endif
          return written;
      }
  } // class UTF8Decoder
  // Stateful UTF-8 encoder. It keeps one left-over char for counting
  // and a separate one for conversion, each passed by reference to the
  // corresponding internal routine.
  [Serializable]
  private class UTF8Encoder : Encoder
  {
      private bool emitIdentifier;
      private char leftOverForCount;
      private char leftOverForConv;

      // Create an encoder with no pending chars.
      public UTF8Encoder (bool emitIdentifier)
      {
          leftOverForConv = '\0';
          leftOverForCount = '\0';
          this.emitIdentifier = emitIdentifier;
      }

      // Count the bytes needed to encode a range of a char array.
      public override int GetByteCount (char[] chars, int index,
                                        int count, bool flush)
      {
          int byteTotal = InternalGetByteCount (chars, index, count, ref leftOverForCount, flush);
          return byteTotal;
      }

      // Encode a range of a char array into a byte array. After a
      // successful conversion emitIdentifier is cleared.
      public override int GetBytes (char[] chars, int charIndex,
                                    int charCount, byte[] bytes, int byteIndex, bool flush)
      {
          int written = InternalGetBytes (chars, charIndex, charCount, bytes, byteIndex, ref leftOverForConv, flush);
          emitIdentifier = false;
          return written;
      }

  #if NET_2_0
      // Pointer-based variant of GetByteCount.
      public unsafe override int GetByteCount (char* chars, int count, bool flush)
      {
          int byteTotal = InternalGetByteCount (chars, count, ref leftOverForCount, flush);
          return byteTotal;
      }

      // Pointer-based variant of GetBytes. After a successful
      // conversion emitIdentifier is cleared.
      public unsafe override int GetBytes (char* chars, int charCount,
                                           byte* bytes, int byteCount, bool flush)
      {
          int written = InternalGetBytes (chars, charCount, bytes, byteCount, ref leftOverForConv, flush);
          emitIdentifier = false;
          return written;
      }
  #endif
  } // class UTF8Encoder
  1129. }; // class UTF8Encoding
  1130. }; // namespace System.Text