UTF8Encoding.cs 30 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054
  1. /*
  2. * UTF8Encoding.cs - Implementation of the "System.Text.UTF8Encoding" class.
  3. *
  4. * Copyright (c) 2001, 2002 Southern Storm Software, Pty Ltd
  5. * Copyright (C) 2004 Novell, Inc (http://www.novell.com)
  6. *
  7. * Permission is hereby granted, free of charge, to any person obtaining
  8. * a copy of this software and associated documentation files (the "Software"),
  9. * to deal in the Software without restriction, including without limitation
  10. * the rights to use, copy, modify, merge, publish, distribute, sublicense,
  11. * and/or sell copies of the Software, and to permit persons to whom the
  12. * Software is furnished to do so, subject to the following conditions:
  13. *
  14. * The above copyright notice and this permission notice shall be included
  15. * in all copies or substantial portions of the Software.
  16. *
  17. * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
  18. * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  19. * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
  20. * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
  21. * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
  22. * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
  23. * OTHER DEALINGS IN THE SOFTWARE.
  24. */
  25. namespace System.Text
  26. {
  27. using System;
  28. [Serializable]
  29. [MonoTODO ("Fix serialization compatibility with MS.NET")]
  30. #if NET_2_0
  31. [MonoTODO ("EncoderFallback is not handled")]
  32. #endif
  33. public class UTF8Encoding : Encoding
  34. {
  // Code page identifier that Windows assigns to UTF-8.
  internal const int UTF8_CODE_PAGE = 65001;

  // True when the UTF-8 identifier should be produced (read by GetPreamble
  // and handed to the encoder created by GetEncoder).
  private bool emitIdentifier;
#if !NET_2_0
  // Pre-2.0 profile only: true when invalid byte sequences must raise
  // an exception instead of being skipped.
  private bool throwOnInvalid;
#endif

  // Creates an encoding that emits no identifier and tolerates invalid bytes.
  public UTF8Encoding ()
      : this (false, false)
  {
  }

  // Creates an encoding that tolerates invalid bytes; the identifier is optional.
  public UTF8Encoding (bool encoderShouldEmitUTF8Identifier)
      : this (encoderShouldEmitUTF8Identifier, false)
  {
  }

  // Main constructor: records the identifier option, configures how invalid
  // bytes are treated, and fills in the descriptive fields of the base class.
  public UTF8Encoding (bool encoderShouldEmitUTF8Identifier, bool throwOnInvalidBytes)
      : base (UTF8_CODE_PAGE)
  {
      emitIdentifier = encoderShouldEmitUTF8Identifier;

#if NET_2_0
      // Only the decoder fallback is supplied here; the encoder fallback
      // argument is passed as null.
      DecoderFallback chosen;
      if (throwOnInvalidBytes)
          chosen = new DecoderExceptionFallback ();
      else
          chosen = new DecoderReplacementFallback (String.Empty);
      SetFallbackInternal (null, chosen);
#else
      throwOnInvalid = throwOnInvalidBytes;
#endif

      header_name = "utf-8";
      body_name = "utf-8";
      web_name = "utf-8";
      encoding_name = "Unicode (UTF-8)";

      is_browser_save = true;
      is_browser_display = true;
      is_mail_news_display = true;

      windows_code_page = UnicodeEncoding.UNICODE_CODE_PAGE;
  }
  65. #region GetByteCount()
  // Array-based entry point for byte counting with rolling state.
  //
  // Validates the (chars, index, count) window and then delegates to the
  // pointer overload. "leftOver" carries a pending surrogate start between
  // calls; "flush" asks for that pending surrogate to be accounted for now.
  //
  // Throws ArgumentNullException when chars is null and
  // ArgumentOutOfRangeException when index/count fall outside the array.
  private static int InternalGetByteCount (char[] chars, int index, int count, ref char leftOver, bool flush)
  {
      if (chars == null)
          throw new ArgumentNullException ("chars");
      if (index < 0 || index > chars.Length)
          throw new ArgumentOutOfRangeException ("index", _("ArgRange_Array"));
      if (count < 0 || count > (chars.Length - index))
          throw new ArgumentOutOfRangeException ("count", _("ArgRange_Array"));

      if (index != chars.Length) {
          unsafe {
              fixed (char* start = chars) {
                  return InternalGetByteCount (start + index, count, ref leftOver, flush);
              }
          }
      }

      // The window starts at the very end of the array: the only thing
      // that can still contribute is a pending surrogate being flushed.
      if (!flush || leftOver == '\0')
          return 0;

      leftOver = '\0';
      return 3;
  }
  // Pointer-based byte counter with rolling surrogate state.
  //
  //   chars    - first UTF-16 code unit to examine
  //   count    - number of code units to examine
  //   leftOver - on entry, a surrogate start carried over from a previous
  //              call (or '\0'); on exit, the surrogate start still pending
  //              (always '\0' when flush is true)
  //   flush    - when true, a still-pending surrogate start is counted as
  //              a three-byte sequence instead of being carried over
  //
  // Returns the number of UTF-8 bytes the input encodes to.
  private unsafe static int InternalGetByteCount (char* chars, int count, ref char leftOver, bool flush)
  {
      int index = 0;
      int length = 0;
      char pair = leftOver;

      while (count > 0) {
          char ch = chars [index];
          if (pair == 0) {
              if (ch < '\u0080') {
                  // Fast path: consume the whole run of ASCII characters,
                  // one byte each, then re-enter the main loop.
                  int end = index + count;
                  for (; index < end; index++, count--) {
                      if (chars [index] < '\x80')
                          ++length;
                      else
                          break;
                  }
                  continue;
              } else if (ch < '\u0800') {
                  length += 2;
              } else if (ch >= '\uD800' && ch <= '\uDBFF') {
                  // Start of a surrogate pair: counted once the tail is seen.
                  pair = ch;
              } else {
                  // Any other character, including a surrogate tail that has
                  // no leading surrogate, is counted as three bytes.
                  length += 3;
              }
          } else if (ch >= '\uDC00' && ch <= '\uDFFF') {
              // A surrogate start is pending (pair != 0 is guaranteed here),
              // so this tail completes a pair: four bytes.
              length += 4;
              pair = '\0';
          } else {
              // A surrogate start followed by something that is not a tail.
              // Count the start on its own (three bytes) and revisit the
              // current character without advancing.
              length += 3;
              pair = '\0';
              continue;
          }
          ++index;
          --count;
      }

      if (flush) {
          // Account for a surrogate start that never received its tail.
          if (pair != '\0')
              length += 3;
          leftOver = '\0';
      } else {
          leftOver = pair;
      }

      return length;
  }
  // Counts the bytes produced by encoding chars[index .. index + count).
  // A one-shot call starts with no pending surrogate and always flushes.
  public override int GetByteCount (char[] chars, int index, int count)
  {
      char pending = '\0';
      int total = InternalGetByteCount (chars, index, count, ref pending, true);
      return total;
  }
  // Counts the bytes produced by encoding an entire string.
  // Throws ArgumentNullException when s is null.
  public override int GetByteCount (String s)
  {
      if (s == null)
          throw new ArgumentNullException ("s");

      // One-shot call: nothing carried in, and the result is flushed.
      char pending = '\0';
      unsafe {
          fixed (char* text = s) {
              return InternalGetByteCount (text, s.Length, ref pending, true);
          }
      }
  }
  180. #endregion
  181. #region GetBytes()
  // Array-based entry point for encoding with rolling state.
  //
  // Validates both windows and then delegates to the pointer overload.
  //
  //   chars/charIndex/charCount - UTF-16 input window
  //   bytes/byteIndex           - output buffer and first position to write
  //   leftOver                  - surrogate start carried between calls
  //   flush                     - when true, a pending surrogate start is
  //                               dealt with now instead of being carried
  //
  // Returns the number of bytes written. Throws ArgumentNullException for
  // null arrays, ArgumentOutOfRangeException for indexes outside the arrays
  // and ArgumentException when the output buffer is too small.
  private static int InternalGetBytes (char[] chars, int charIndex,
                                       int charCount, byte[] bytes,
                                       int byteIndex, ref char leftOver,
                                       bool flush)
  {
      // Validate the parameters.
      if (chars == null) {
          throw new ArgumentNullException ("chars");
      }
      if (bytes == null) {
          throw new ArgumentNullException ("bytes");
      }
      if (charIndex < 0 || charIndex > chars.Length) {
          throw new ArgumentOutOfRangeException ("charIndex", _("ArgRange_Array"));
      }
      if (charCount < 0 || charCount > (chars.Length - charIndex)) {
          throw new ArgumentOutOfRangeException ("charCount", _("ArgRange_Array"));
      }
      if (byteIndex < 0 || byteIndex > bytes.Length) {
          throw new ArgumentOutOfRangeException ("byteIndex", _("ArgRange_Array"));
      }

      if (charIndex == chars.Length) {
          // No input characters: only a pending surrogate can matter.
          if (flush && leftOver != '\0') {
#if NET_2_0
              // FIXME: use EncoderFallback.
              //
              // By default it is empty, so I do nothing for now.
              leftOver = '\0';
#else
              // Flush the left-over surrogate pair start. Three bytes are
              // written, so a buffer with exactly three free slots
              // (byteIndex == bytes.Length - 3) is large enough.
              if (byteIndex > bytes.Length - 3)
                  throw new ArgumentException (_("Arg_InsufficientSpace"), "bytes");
              // NOTE(review): the pointer overload encodes the pending
              // surrogate itself when flushing, whereas this path writes the
              // fixed bytes EF BB BF - confirm which output is intended.
              bytes [byteIndex++] = 0xEF;
              bytes [byteIndex++] = 0xBB;
              bytes [byteIndex++] = 0xBF;
              leftOver = '\0';
              return 3;
#endif
          }
          return 0;
      }

      unsafe {
          fixed (char* cptr = chars) {
              // With no room left in the output, hand over a null buffer of
              // capacity zero rather than pinning past the end of the array.
              if (bytes.Length == byteIndex)
                  return InternalGetBytes (
                      cptr + charIndex, charCount,
                      null, 0, ref leftOver, flush);
              fixed (byte *bptr = bytes) {
                  return InternalGetBytes (
                      cptr + charIndex, charCount,
                      bptr + byteIndex, bytes.Length - byteIndex,
                      ref leftOver, flush);
              }
          }
      }
  }
  // Pointer-based UTF-16 to UTF-8 encoder with rolling surrogate state.
  //
  //   chars/charCount - input code units
  //   bytes/byteCount - output buffer and its capacity (bytes may be null
  //                     when byteCount is 0)
  //   leftOver        - on entry, a surrogate start carried over from a
  //                     previous call (or '\0'); on exit, the surrogate
  //                     start still pending (always '\0' when flush is true)
  //   flush           - when true, a pending surrogate start is written out
  //                     as a three-byte sequence instead of being carried
  //
  // Returns the number of bytes written. Throws ArgumentException whenever
  // the next sequence would not fit in the output buffer.
  private unsafe static int InternalGetBytes (char* chars, int charCount,
                                              byte* bytes, int byteCount,
                                              ref char leftOver, bool flush)
  {
      int charIndex = 0;
      int length = byteCount;   // capacity of the output buffer
      int posn = 0;             // next write position in the output buffer
      char pair = leftOver;     // pending surrogate start, or '\0'
      int code = 0;             // code point about to be encoded
      char ch;

      // Convert the characters into bytes.
      while (charCount > 0) {
          // Fetch the next UTF-16 code unit.
          ch = chars [charIndex];
          if (pair == '\0') {
              if (ch < '\uD800' || ch >= '\uE000') {
                  if (ch < '\x80') {
                      // Fast path: copy the whole run of ASCII characters.
                      // The output is a raw pointer, so the capacity has to
                      // be checked before every write.
                      int end = charIndex + charCount;
                      for (; charIndex < end; posn++, charIndex++, charCount--) {
                          if (chars [charIndex] >= '\x80')
                              break;
                          if (posn >= length)
                              throw new ArgumentException (_("Arg_InsufficientSpace"), "bytes");
                          bytes [posn] = (byte) chars [charIndex];
                      }
                      continue;
                  }
                  code = ch;
              }
              else if (ch < '\uDC00') {
                  // Surrogate start: remember it and wait for the tail.
                  pair = ch;
                  ++charIndex;
                  --charCount;
                  continue;
              } else { // ch <= '\uDFFF'
                  // We have a surrogate tail without leading
                  // surrogate. In NET_2_0 it uses fallback.
                  // In NET_1_1 we output wrong surrogate.
                  if (posn > length - 3) {
                      throw new ArgumentException (_("Arg_InsufficientSpace"), "bytes");
                  }
                  bytes [posn++] = (byte) (0xE0 | (ch >> 12));
                  bytes [posn++] = (byte) (0x80 | ((ch >> 6) & 0x3F));
                  bytes [posn++] = (byte) (0x80 | (ch & 0x3F));
                  ++charIndex;
                  --charCount;
                  continue;
              }
          } else {
              if ('\uDC00' <= ch && ch <= '\uDFFF')
                  // Combine the pending start with this tail.
                  code = 0x10000 + (int) ch - 0xDC00 +
                      (((int) pair - 0xD800) << 10);
              else {
                  // We have a surrogate start followed by a
                  // regular character. Technically, this is
                  // invalid, but we have to do something.
                  // We write out the surrogate start and then
                  // re-visit the current character again.
                  if (posn > length - 3) {
                      throw new ArgumentException (_("Arg_InsufficientSpace"), "bytes");
                  }
                  bytes [posn++] = (byte) (0xE0 | (pair >> 12));
                  bytes [posn++] = (byte) (0x80 | ((pair >> 6) & 0x3F));
                  bytes [posn++] = (byte) (0x80 | (pair & 0x3F));
                  pair = '\0';
                  continue;
              }
              pair = '\0';
          }
          ++charIndex;
          --charCount;

          // Encode the code point, choosing the sequence length by range.
          if (code < 0x0080) {
              if (posn >= length)
                  throw new ArgumentException (_("Arg_InsufficientSpace"), "bytes");
              bytes [posn++] = (byte)code;
          } else if (code < 0x0800) {
              if ((posn + 2) > length)
                  throw new ArgumentException (_("Arg_InsufficientSpace"), "bytes");
              bytes [posn++] = (byte) (0xC0 | (code >> 6));
              bytes [posn++] = (byte) (0x80 | (code & 0x3F));
          } else if (code < 0x10000) {
              if (posn > length - 3)
                  throw new ArgumentException (_("Arg_InsufficientSpace"), "bytes");
              bytes [posn++] = (byte) (0xE0 | (code >> 12));
              bytes [posn++] = (byte) (0x80 | ((code >> 6) & 0x3F));
              bytes [posn++] = (byte) (0x80 | (code & 0x3F));
          } else {
              if (posn > length - 4)
                  throw new ArgumentException (_("Arg_InsufficientSpace"), "bytes");
              bytes [posn++] = (byte) (0xF0 | (code >> 18));
              bytes [posn++] = (byte) (0x80 | ((code >> 12) & 0x3F));
              bytes [posn++] = (byte) (0x80 | ((code >> 6) & 0x3F));
              bytes [posn++] = (byte) (0x80 | (code & 0x3F));
          }
      }

      if (flush) {
          if (pair != '\0') {
              // Flush the left-over incomplete surrogate.
              if (posn > length - 3) {
                  throw new ArgumentException (_("Arg_InsufficientSpace"), "bytes");
              }
              bytes [posn++] = (byte) (0xE0 | (pair >> 12));
              bytes [posn++] = (byte) (0x80 | ((pair >> 6) & 0x3F));
              bytes [posn++] = (byte) (0x80 | (pair & 0x3F));
          }
          leftOver = '\0';
      }
      else
          leftOver = pair;

      // Writing started at offset zero, so posn is the byte count.
      return posn;
  }
  // Placeholder for an encoder-side fallback taking a surrogate lead/tail
  // and an output buffer. Not implemented: every call throws
  // NotImplementedException.
  private unsafe int Fallback (byte* bytes, int byteCount, char lead, char tail)
  {
      throw new NotImplementedException ();
  }
  // Encodes chars[charIndex .. charIndex + charCount) into bytes, starting
  // at byteIndex, and returns the number of bytes written. A one-shot call
  // starts with no pending surrogate and always flushes.
  public override int GetBytes (char[] chars, int charIndex, int charCount,
                                byte[] bytes, int byteIndex)
  {
      char pending = '\0';
      int written = InternalGetBytes (chars, charIndex, charCount, bytes, byteIndex, ref pending, true);
      return written;
  }
  // Encodes s[charIndex .. charIndex + charCount) into bytes, starting at
  // byteIndex, and returns the number of bytes written.
  //
  // Throws ArgumentNullException for a null string or buffer and
  // ArgumentOutOfRangeException for indexes outside the string or buffer.
  public override int GetBytes (String s, int charIndex, int charCount,
                                byte[] bytes, int byteIndex)
  {
      if (s == null)
          throw new ArgumentNullException ("s");
      if (bytes == null)
          throw new ArgumentNullException ("bytes");
      if (charIndex < 0 || charIndex > s.Length)
          throw new ArgumentOutOfRangeException ("charIndex", _("ArgRange_StringIndex"));
      if (charCount < 0 || charCount > (s.Length - charIndex))
          throw new ArgumentOutOfRangeException ("charCount", _("ArgRange_StringRange"));
      if (byteIndex < 0 || byteIndex > bytes.Length)
          throw new ArgumentOutOfRangeException ("byteIndex", _("ArgRange_Array"));

      // Starting at the end of the string means there is nothing to encode.
      if (charIndex == s.Length)
          return 0;

      int room = bytes.Length - byteIndex;
      char pending = '\0';
      unsafe {
          fixed (char* text = s) {
              // No space left in the output: pass a null buffer of capacity 0.
              if (room == 0)
                  return InternalGetBytes (text + charIndex, charCount, null, 0, ref pending, true);

              fixed (byte* output = bytes) {
                  return InternalGetBytes (text + charIndex, charCount, output + byteIndex, room, ref pending, true);
              }
          }
      }
  }
  405. #endregion
  // Internal version of "GetCharCount" which can handle a rolling
  // state between multiple calls to this method.
  //
  //   bytes/index/count - UTF-8 input window
  //   leftOverBits      - payload bits accumulated so far for a sequence
  //                       that started in a previous call
  //   leftOverCount     - packed state of that sequence: the low four bits
  //                       hold the number of bytes seen so far, the next
  //                       four bits hold the expected sequence length
  //   provider /        - (NET_2_0) source of the decoder fallback buffer
  //   fallbackBuffer      used to count replacement characters
  //   throwOnInvalid    - (pre-2.0) throw on invalid input instead of
  //                       skipping it
  //   flush             - when true, an incomplete trailing sequence is
  //                       treated as invalid input
  //
  // Returns the number of UTF-16 code units the input decodes to.
#if NET_2_0
  private static int InternalGetCharCount (
      byte[] bytes, int index, int count, uint leftOverBits,
      uint leftOverCount, object provider,
      ref DecoderFallbackBuffer fallbackBuffer, bool flush)
#else
  private static int InternalGetCharCount (
      byte[] bytes, int index, int count, uint leftOverBits,
      uint leftOverCount, bool throwOnInvalid, bool flush)
#endif
  {
      // Validate the parameters.
      if (bytes == null) {
          throw new ArgumentNullException ("bytes");
      }
      if (index < 0 || index > bytes.Length) {
          throw new ArgumentOutOfRangeException ("index", _("ArgRange_Array"));
      }
      if (count < 0 || count > (bytes.Length - index)) {
          throw new ArgumentOutOfRangeException ("count", _("ArgRange_Array"));
      }

      int length = 0;

      // Fast path: with no sequence in progress, count the leading run of
      // ASCII bytes directly.
      if (leftOverCount == 0) {
          int end = index + count;
          for (; index < end; index++, count--) {
              if (bytes [index] < 0x80)
                  length++;
              else
                  break;
          }
      }

      // Determine the number of characters that we have.
      uint ch;
      uint leftBits = leftOverBits;
      uint leftSoFar = (leftOverCount & (uint)0x0F);
      uint leftSize = ((leftOverCount >> 4) & (uint)0x0F);
      while (count > 0) {
          ch = (uint)(bytes[index++]);
          --count;
          if (leftSize == 0) {
              // Process a UTF-8 start character.
              if (ch < (uint)0x0080) {
                  // Single-byte UTF-8 character.
                  ++length;
              } else if ((ch & (uint)0xE0) == (uint)0xC0) {
                  // Double-byte UTF-8 character.
                  leftBits = (ch & (uint)0x1F);
                  leftSoFar = 1;
                  leftSize = 2;
              } else if ((ch & (uint)0xF0) == (uint)0xE0) {
                  // Three-byte UTF-8 character.
                  leftBits = (ch & (uint)0x0F);
                  leftSoFar = 1;
                  leftSize = 3;
              } else if ((ch & (uint)0xF8) == (uint)0xF0) {
                  // Four-byte UTF-8 character.
                  leftBits = (ch & (uint)0x07);
                  leftSoFar = 1;
                  leftSize = 4;
              } else if ((ch & (uint)0xFC) == (uint)0xF8) {
                  // Five-byte UTF-8 character.
                  leftBits = (ch & (uint)0x03);
                  leftSoFar = 1;
                  leftSize = 5;
              } else if ((ch & (uint)0xFE) == (uint)0xFC) {
                  // Six-byte UTF-8 character.
                  leftBits = (ch & (uint)0x03);
                  leftSoFar = 1;
                  leftSize = 6;
              } else {
                  // Invalid UTF-8 start character.
#if NET_2_0
                  length += Fallback (provider, ref fallbackBuffer, bytes, index - 1);
#else
                  if (throwOnInvalid)
                      throw new ArgumentException (_("Arg_InvalidUTF8"), "bytes");
#endif
              }
          } else {
              // Process an extra byte in a multi-byte sequence.
              if ((ch & (uint)0xC0) == (uint)0x80) {
                  leftBits = ((leftBits << 6) | (ch & (uint)0x3F));
                  if (++leftSoFar >= leftSize) {
                      // We have a complete character now.
                      if (leftBits < (uint)0x10000) {
                          // is it an overlong ?
                          bool overlong = false;
                          switch (leftSize) {
                          case 2:
                              overlong = (leftBits <= 0x7F);
                              break;
                          case 3:
                              overlong = (leftBits <= 0x07FF);
                              break;
                          case 4:
                              overlong = (leftBits <= 0xFFFF);
                              break;
                          case 5:
                              overlong = (leftBits <= 0x1FFFFF);
                              break;
                          case 6:
                              overlong = (leftBits <= 0x03FFFFFF);
                              break;
                          }
                          if (overlong) {
#if NET_2_0
                              length += Fallback (provider, ref fallbackBuffer, bytes, index - 1);
#else
                              if (throwOnInvalid)
                                  throw new ArgumentException (_("Overlong"), leftBits.ToString ());
#endif
                          }
                          else if ((leftBits & 0xF800) == 0xD800) {
                              // UTF-8 doesn't use surrogate characters.
                              // InternalGetChars rejects them the same way,
                              // so they must not be counted as a character
                              // here either.
#if NET_2_0
                              length += Fallback (provider, ref fallbackBuffer, bytes, index - 1);
#else
                              if (throwOnInvalid)
                                  throw new ArgumentException (_("Arg_InvalidUTF8"), "bytes");
#endif
                          }
                          else
                              ++length;
                      } else if (leftBits < (uint)0x110000) {
                          // Supplementary character: a surrogate pair.
                          length += 2;
                      } else {
                          // Beyond the Unicode range.
#if NET_2_0
                          length += Fallback (provider, ref fallbackBuffer, bytes, index - 1);
#else
                          if (throwOnInvalid)
                              throw new ArgumentException (_("Arg_InvalidUTF8"), "bytes");
#endif
                      }
                      leftSize = 0;
                  }
              } else {
                  // Invalid UTF-8 sequence: clear and restart.
#if NET_2_0
                  length += Fallback (provider, ref fallbackBuffer, bytes, index - 1);
#else
                  if (throwOnInvalid)
                      throw new ArgumentException (_("Arg_InvalidUTF8"), "bytes");
#endif
                  leftSize = 0;
                  // Step back so the offending byte is re-examined as a
                  // possible start character.
                  --index;
                  ++count;
              }
          }
      }
      if (flush && leftSize != 0) {
          // We had left-over bytes that didn't make up
          // a complete UTF-8 character sequence.
#if NET_2_0
          length += Fallback (provider, ref fallbackBuffer, bytes, index);
#else
          if (throwOnInvalid)
              throw new ArgumentException (_("Arg_InvalidUTF8"), "bytes");
#endif
      }

      // Return the final length to the caller.
      return length;
  }
  561. #if NET_2_0
  // for GetCharCount()
  //
  // Reports the invalid byte at "index" to the decoder fallback and returns
  // how many replacement characters it would produce.
  //
  //   provider - either a DecoderFallback (a new buffer is created from it)
  //              or a Decoder (its FallbackBuffer is used)
  //   buffer   - cached fallback buffer; created on first use
  static int Fallback (object provider, ref DecoderFallbackBuffer buffer, byte [] bytes, int index)
  {
      if (buffer == null) {
          DecoderFallback fb = provider as DecoderFallback;
          if (fb != null)
              buffer = fb.CreateFallbackBuffer ();
          else
              buffer = ((Decoder) provider).FallbackBuffer;
      }
      buffer.Fallback (bytes, index);
      int produced = buffer.Remaining;
      // Counting does not consume the replacement characters. Discard them
      // so the buffer is empty again before the next invalid byte is
      // reported; otherwise a later Fallback call on the same buffer would
      // find unread characters still pending.
      buffer.Reset ();
      return produced;
  }
  // for GetChars()
  //
  // Reports the invalid byte at "byteIndex" to the decoder fallback and
  // copies the replacement characters it produces into chars, advancing
  // charIndex past them.
  //
  //   provider - either a DecoderFallback (a new buffer is created from it)
  //              or a Decoder (its FallbackBuffer is used)
  //   buffer   - cached fallback buffer; created on first use
  //
  // Throws ArgumentException when chars has no room for the replacement.
  static void Fallback (object provider, ref DecoderFallbackBuffer buffer, byte [] bytes, int byteIndex,
                        char [] chars, ref int charIndex)
  {
      if (buffer == null) {
          DecoderFallback fb = provider as DecoderFallback;
          if (fb != null)
              buffer = fb.CreateFallbackBuffer ();
          else
              buffer = ((Decoder) provider).FallbackBuffer;
      }
      buffer.Fallback (bytes, byteIndex);
      while (buffer.Remaining > 0) {
          if (charIndex >= chars.Length) {
              // Report a full output buffer the same way the rest of the
              // decoder does. Drop the unread replacement characters first
              // so the buffer is not left half-consumed.
              buffer.Reset ();
              throw new ArgumentException (_("Arg_InsufficientSpace"), "chars");
          }
          chars [charIndex++] = buffer.GetNextChar ();
      }
  }
  590. #endif
  // Counts the characters produced by decoding bytes[index .. index + count).
  // A one-shot call starts with no partial sequence and always flushes.
  public override int GetCharCount (byte[] bytes, int index, int count)
  {
      int total;
#if NET_2_0
      DecoderFallbackBuffer scratch = null;
      total = InternalGetCharCount (bytes, index, count, 0, 0, DecoderFallback, ref scratch, true);
#else
      total = InternalGetCharCount (bytes, index, count, 0, 0, throwOnInvalid, true);
#endif
      return total;
  }
  // Get the characters that result from decoding a byte buffer, with a
  // rolling state between multiple calls to this method.
  //
  //   bytes/byteIndex/byteCount - UTF-8 input window
  //   chars/charIndex           - output buffer and first position to write
  //   leftOverBits              - payload bits of a sequence in progress;
  //                               updated on return
  //   leftOverCount             - packed state of that sequence: the low
  //                               four bits hold the number of bytes seen so
  //                               far, the next four bits hold the expected
  //                               sequence length; updated on return
  //   provider / fallbackBuffer - (NET_2_0) decoder fallback used for
  //                               invalid input
  //   throwOnInvalid            - (pre-2.0) throw on invalid input instead
  //                               of skipping it
  //   flush                     - when true, an incomplete trailing
  //                               sequence is treated as invalid input
  //
  // Returns the number of characters written. Throws ArgumentException when
  // the output buffer is too small.
#if NET_2_0
  private static int InternalGetChars (
      byte[] bytes, int byteIndex, int byteCount, char[] chars,
      int charIndex, ref uint leftOverBits, ref uint leftOverCount,
      object provider,
      ref DecoderFallbackBuffer fallbackBuffer, bool flush)
#else
  private static int InternalGetChars (
      byte[] bytes, int byteIndex, int byteCount, char[] chars,
      int charIndex, ref uint leftOverBits, ref uint leftOverCount,
      bool throwOnInvalid, bool flush)
#endif
  {
      // Validate the parameters.
      if (bytes == null) {
          throw new ArgumentNullException ("bytes");
      }
      if (chars == null) {
          throw new ArgumentNullException ("chars");
      }
      if (byteIndex < 0 || byteIndex > bytes.Length) {
          throw new ArgumentOutOfRangeException ("byteIndex", _("ArgRange_Array"));
      }
      if (byteCount < 0 || byteCount > (bytes.Length - byteIndex)) {
          throw new ArgumentOutOfRangeException ("byteCount", _("ArgRange_Array"));
      }
      if (charIndex < 0 || charIndex > chars.Length) {
          throw new ArgumentOutOfRangeException ("charIndex", _("ArgRange_Array"));
      }

      // NOTE(review): when the output window is empty this returns without
      // looking at the input or updating the left-over state, even if
      // byteCount is non-zero - confirm that callers rely on this.
      if (charIndex == chars.Length)
          return 0;

      int posn = charIndex;

      // Fast path: with no sequence in progress, copy the leading run of
      // ASCII bytes directly. The capacity is checked before every write so
      // that a short buffer is reported like everywhere else in this method.
      if (leftOverCount == 0) {
          int end = byteIndex + byteCount;
          for (; byteIndex < end; posn++, byteIndex++, byteCount--) {
              if (bytes [byteIndex] >= 0x80)
                  break;
              if (posn >= chars.Length)
                  throw new ArgumentException (_("Arg_InsufficientSpace"), "chars");
              chars [posn] = (char) bytes [byteIndex];
          }
      }

      // Convert the bytes into the output buffer.
      uint ch;
      int length = chars.Length;
      uint leftBits = leftOverBits;
      uint leftSoFar = (leftOverCount & (uint)0x0F);
      uint leftSize = ((leftOverCount >> 4) & (uint)0x0F);

      int byteEnd = byteIndex + byteCount;
      if (byteEnd < 0 || byteEnd > bytes.Length)
          throw new SystemException (String.Format ("INTERNAL ERROR: should not happen: {0} {1} {2}", byteIndex, byteCount, byteEnd));

      for(; byteIndex < byteEnd; byteIndex++) {
          // Fetch the next character from the byte buffer.
          ch = (uint)(bytes[byteIndex]);
          if (leftSize == 0) {
              // Process a UTF-8 start character.
              if (ch < (uint)0x0080) {
                  // Single-byte UTF-8 character.
                  if (posn >= length) {
                      throw new ArgumentException (_("Arg_InsufficientSpace"), "chars");
                  }
                  chars[posn++] = (char)ch;
              } else if ((ch & (uint)0xE0) == (uint)0xC0) {
                  // Double-byte UTF-8 character.
                  leftBits = (ch & (uint)0x1F);
                  leftSoFar = 1;
                  leftSize = 2;
              } else if ((ch & (uint)0xF0) == (uint)0xE0) {
                  // Three-byte UTF-8 character.
                  leftBits = (ch & (uint)0x0F);
                  leftSoFar = 1;
                  leftSize = 3;
              } else if ((ch & (uint)0xF8) == (uint)0xF0) {
                  // Four-byte UTF-8 character.
                  leftBits = (ch & (uint)0x07);
                  leftSoFar = 1;
                  leftSize = 4;
              } else if ((ch & (uint)0xFC) == (uint)0xF8) {
                  // Five-byte UTF-8 character.
                  leftBits = (ch & (uint)0x03);
                  leftSoFar = 1;
                  leftSize = 5;
              } else if ((ch & (uint)0xFE) == (uint)0xFC) {
                  // Six-byte UTF-8 character.
                  leftBits = (ch & (uint)0x03);
                  leftSoFar = 1;
                  leftSize = 6;
              } else {
                  // Invalid UTF-8 start character.
#if NET_2_0
                  Fallback (provider, ref fallbackBuffer, bytes, byteIndex, chars, ref posn);
#else
                  if (throwOnInvalid)
                      throw new ArgumentException (_("Arg_InvalidUTF8"), "bytes");
#endif
              }
          } else {
              // Process an extra byte in a multi-byte sequence.
              if ((ch & (uint)0xC0) == (uint)0x80) {
                  leftBits = ((leftBits << 6) | (ch & (uint)0x3F));
                  if (++leftSoFar >= leftSize) {
                      // We have a complete character now.
                      if (leftBits < (uint)0x10000) {
                          // is it an overlong ?
                          bool overlong = false;
                          switch (leftSize) {
                          case 2:
                              overlong = (leftBits <= 0x7F);
                              break;
                          case 3:
                              overlong = (leftBits <= 0x07FF);
                              break;
                          case 4:
                              overlong = (leftBits <= 0xFFFF);
                              break;
                          case 5:
                              overlong = (leftBits <= 0x1FFFFF);
                              break;
                          case 6:
                              overlong = (leftBits <= 0x03FFFFFF);
                              break;
                          }
                          if (overlong) {
#if NET_2_0
                              Fallback (provider, ref fallbackBuffer, bytes, byteIndex, chars, ref posn);
#else
                              if (throwOnInvalid)
                                  throw new ArgumentException (_("Overlong"), leftBits.ToString ());
#endif
                          }
                          else if ((leftBits & 0xF800) == 0xD800) {
                              // UTF-8 doesn't use surrogate characters
#if NET_2_0
                              Fallback (provider, ref fallbackBuffer, bytes, byteIndex, chars, ref posn);
#else
                              if (throwOnInvalid)
                                  throw new ArgumentException (_("Arg_InvalidUTF8"), "bytes");
#endif
                          }
                          else {
                              if (posn >= length) {
                                  throw new ArgumentException
                                      (_("Arg_InsufficientSpace"), "chars");
                              }
                              chars[posn++] = (char)leftBits;
                          }
                      } else if (leftBits < (uint)0x110000) {
                          // Supplementary character: emit a surrogate pair.
                          if ((posn + 2) > length) {
                              throw new ArgumentException
                                  (_("Arg_InsufficientSpace"), "chars");
                          }
                          leftBits -= (uint)0x10000;
                          chars[posn++] = (char)((leftBits >> 10) +
                                                 (uint)0xD800);
                          chars[posn++] =
                              (char)((leftBits & (uint)0x3FF) + (uint)0xDC00);
                      } else {
                          // Beyond the Unicode range.
#if NET_2_0
                          Fallback (provider, ref fallbackBuffer, bytes, byteIndex, chars, ref posn);
#else
                          if (throwOnInvalid)
                              throw new ArgumentException (_("Arg_InvalidUTF8"), "bytes");
#endif
                      }
                      leftSize = 0;
                  }
              } else {
                  // Invalid UTF-8 sequence: clear and restart.
#if NET_2_0
                  Fallback (provider, ref fallbackBuffer, bytes, byteIndex, chars, ref posn);
#else
                  if (throwOnInvalid)
                      throw new ArgumentException (_("Arg_InvalidUTF8"), "bytes");
#endif
                  leftSize = 0;
                  // Step back so the loop increment re-examines this byte
                  // as a possible start character.
                  --byteIndex;
              }
          }
      }
      if (flush && leftSize != 0) {
          // We had left-over bytes that didn't make up
          // a complete UTF-8 character sequence.
#if NET_2_0
          Fallback (provider, ref fallbackBuffer, bytes, byteIndex, chars, ref posn);
#else
          if (throwOnInvalid)
              throw new ArgumentException (_("Arg_InvalidUTF8"), "bytes");
#endif
      }

      // Hand the (possibly partial) sequence state back to the caller.
      leftOverBits = leftBits;
      leftOverCount = (leftSoFar | (leftSize << 4));

      // Return the final length to the caller.
      return posn - charIndex;
  }
  // Decodes bytes[byteIndex .. byteIndex + byteCount) into chars, starting
  // at charIndex, and returns the number of characters written. A one-shot
  // call starts with no partial sequence and always flushes.
  public override int GetChars (byte[] bytes, int byteIndex, int byteCount,
                                char[] chars, int charIndex)
  {
      uint pendingBits = 0;
      uint pendingCount = 0;
      int written;
#if NET_2_0
      DecoderFallbackBuffer scratch = null;
      written = InternalGetChars (bytes, byteIndex, byteCount, chars, charIndex,
                                  ref pendingBits, ref pendingCount,
                                  DecoderFallback, ref scratch, true);
#else
      written = InternalGetChars (bytes, byteIndex, byteCount, chars, charIndex,
                                  ref pendingBits, ref pendingCount,
                                  throwOnInvalid, true);
#endif
      return written;
  }
  // Get the maximum number of bytes needed to encode a
  // specified number of characters (four bytes per character).
  //
  // Throws ArgumentOutOfRangeException when charCount is negative or when
  // the result would not fit in an Int32.
  public override int GetMaxByteCount (int charCount)
  {
      if (charCount < 0) {
          throw new ArgumentOutOfRangeException ("charCount", _("ArgRange_NonNegative"));
      }
      // The multiplication below is unchecked and would silently wrap
      // around for large inputs, so reject them explicitly.
      if (charCount > Int32.MaxValue / 4) {
          throw new ArgumentOutOfRangeException ("charCount",
              "The resulting number of bytes is greater than Int32.MaxValue.");
      }
      return charCount * 4;
  }
  // Upper bound on the characters produced by decoding byteCount bytes:
  // one character per byte. Throws ArgumentOutOfRangeException when
  // byteCount is negative.
  public override int GetMaxCharCount (int byteCount)
  {
      if (byteCount >= 0)
          return byteCount;

      throw new ArgumentOutOfRangeException ("byteCount", _("ArgRange_NonNegative"));
  }
  // Creates a UTF8Decoder configured with this encoding's handling of
  // invalid bytes (the decoder fallback under NET_2_0, the throw flag
  // otherwise).
  public override Decoder GetDecoder ()
  {
      Decoder decoder;
#if NET_2_0
      decoder = new UTF8Decoder (DecoderFallback);
#else
      decoder = new UTF8Decoder (throwOnInvalid);
#endif
      return decoder;
  }
  837. // Get a UTF8-specific encoder that is attached to this instance.
  838. public override Encoder GetEncoder ()
  839. {
  840. return new UTF8Encoder (emitIdentifier);
  841. }
  842. // Get the UTF8 preamble.
  843. public override byte[] GetPreamble ()
  844. {
  845. if (emitIdentifier) {
  846. byte[] pre = new byte [3];
  847. pre[0] = (byte)0xEF;
  848. pre[1] = (byte)0xBB;
  849. pre[2] = (byte)0xBF;
  850. return pre;
  851. } else {
  852. return new byte [0];
  853. }
  854. }
  855. // Determine if this object is equal to another.
  856. public override bool Equals (Object value)
  857. {
  858. UTF8Encoding enc = (value as UTF8Encoding);
  859. if (enc != null) {
  860. #if NET_2_0
  861. return (codePage == enc.codePage &&
  862. emitIdentifier == enc.emitIdentifier &&
  863. DecoderFallback == enc.DecoderFallback &&
  864. EncoderFallback == enc.EncoderFallback);
  865. #else
  866. return (codePage == enc.codePage &&
  867. emitIdentifier == enc.emitIdentifier &&
  868. throwOnInvalid == enc.throwOnInvalid);
  869. #endif
  870. } else {
  871. return false;
  872. }
  873. }
  874. // Get the hash code for this object.
  875. public override int GetHashCode ()
  876. {
  877. return base.GetHashCode ();
  878. }
  879. public override byte [] GetBytes (String s)
  880. {
  881. if (s == null)
  882. throw new ArgumentNullException ("s");
  883. int length = GetByteCount (s);
  884. byte [] bytes = new byte [length];
  885. GetBytes (s, 0, s.Length, bytes, 0);
  886. return bytes;
  887. }
  888. // UTF-8 decoder implementation.
  889. [Serializable]
  890. private class UTF8Decoder : Decoder
  891. {
  892. #if !NET_2_0
  893. private bool throwOnInvalid;
  894. #endif
  895. private uint leftOverBits;
  896. private uint leftOverCount;
  897. // Constructor.
  898. #if NET_2_0
  899. public UTF8Decoder (DecoderFallback fallback)
  900. #else
  901. public UTF8Decoder (bool throwOnInvalid)
  902. #endif
  903. {
  904. #if NET_2_0
  905. Fallback = fallback;
  906. #else
  907. this.throwOnInvalid = throwOnInvalid;
  908. #endif
  909. leftOverBits = 0;
  910. leftOverCount = 0;
  911. }
  912. // Override inherited methods.
  913. public override int GetCharCount (byte[] bytes, int index, int count)
  914. {
  915. #if NET_2_0
  916. DecoderFallbackBuffer buf = null;
  917. return InternalGetCharCount (bytes, index, count,
  918. leftOverBits, leftOverCount, this, ref buf, false);
  919. #else
  920. return InternalGetCharCount (bytes, index, count,
  921. leftOverBits, leftOverCount, throwOnInvalid, false);
  922. #endif
  923. }
  924. public override int GetChars (byte[] bytes, int byteIndex,
  925. int byteCount, char[] chars, int charIndex)
  926. {
  927. #if NET_2_0
  928. DecoderFallbackBuffer buf = null;
  929. return InternalGetChars (bytes, byteIndex, byteCount,
  930. chars, charIndex, ref leftOverBits, ref leftOverCount, this, ref buf, false);
  931. #else
  932. return InternalGetChars (bytes, byteIndex, byteCount,
  933. chars, charIndex, ref leftOverBits, ref leftOverCount, throwOnInvalid, false);
  934. #endif
  935. }
  936. } // class UTF8Decoder
  937. // UTF-8 encoder implementation.
  938. [Serializable]
  939. private class UTF8Encoder : Encoder
  940. {
  941. private bool emitIdentifier;
  942. private char leftOverForCount;
  943. private char leftOverForConv;
  944. // Constructor.
  945. public UTF8Encoder (bool emitIdentifier)
  946. {
  947. this.emitIdentifier = emitIdentifier;
  948. leftOverForCount = '\0';
  949. leftOverForConv = '\0';
  950. }
  951. // Override inherited methods.
  952. public override int GetByteCount (char[] chars, int index,
  953. int count, bool flush)
  954. {
  955. return InternalGetByteCount (chars, index, count, ref leftOverForCount, flush);
  956. }
  957. public override int GetBytes (char[] chars, int charIndex,
  958. int charCount, byte[] bytes, int byteIndex, bool flush)
  959. {
  960. int result;
  961. result = InternalGetBytes (chars, charIndex, charCount, bytes, byteIndex, ref leftOverForConv, flush);
  962. emitIdentifier = false;
  963. return result;
  964. }
  965. #if NET_2_0
  966. public unsafe override int GetByteCount (char* chars, int count, bool flush)
  967. {
  968. return InternalGetByteCount (chars, count, ref leftOverForCount, flush);
  969. }
  970. public unsafe override int GetBytes (char* chars, int charCount,
  971. byte* bytes, int byteCount, bool flush)
  972. {
  973. int result;
  974. result = InternalGetBytes (chars, charCount, bytes, byteCount, ref leftOverForConv, flush);
  975. emitIdentifier = false;
  976. return result;
  977. }
  978. #endif
  979. } // class UTF8Encoder
  980. }; // class UTF8Encoding
  981. }; // namespace System.Text