UTF8Encoding.cs 34 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074107510761077107810791080108110821083108410851086108710881089109010911092109310941095109610971098109911001101110211031104110511061107110811091110111111121113111411151116111711181119112011211122112311241125112611271128112911301131113211331134113511361137113811391140114111421143114411451146114711481149115011511152115311541155115611571158115911601161116211631164116511661167116811691170117111721173117411751176117711781179118011811182118311841185118611871188118911901191119211931194119511961197119811991200120112021203
  1. /*
  2. * UTF8Encoding.cs - Implementation of the "System.Text.UTF8Encoding" class.
  3. *
  4. * Copyright (c) 2001, 2002 Southern Storm Software, Pty Ltd
  5. * Copyright (C) 2004 Novell, Inc (http://www.novell.com)
  6. *
  7. * Permission is hereby granted, free of charge, to any person obtaining
  8. * a copy of this software and associated documentation files (the "Software"),
  9. * to deal in the Software without restriction, including without limitation
  10. * the rights to use, copy, modify, merge, publish, distribute, sublicense,
  11. * and/or sell copies of the Software, and to permit persons to whom the
  12. * Software is furnished to do so, subject to the following conditions:
  13. *
  14. * The above copyright notice and this permission notice shall be included
  15. * in all copies or substantial portions of the Software.
  16. *
  17. * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
  18. * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  19. * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
  20. * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
  21. * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
  22. * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
  23. * OTHER DEALINGS IN THE SOFTWARE.
  24. */
  25. namespace System.Text
  26. {
  27. using System;
  28. using System.Runtime.InteropServices;
  29. [Serializable]
  30. [MonoTODO ("Fix serialization compatibility with MS.NET")]
  31. #if NET_2_0
  32. [MonoTODO ("EncoderFallback is not handled")]
  33. [ComVisible (true)]
  34. #endif
  35. public class UTF8Encoding : Encoding
  36. {
  // Code page number that Windows uses to identify UTF-8.
  internal const int UTF8_CODE_PAGE = 65001;

  // Copy of the constructor's encoderShouldEmitUTF8Identifier argument.
  private bool emitIdentifier;
#if !NET_2_0
  // Pre-2.0 profile only: when true, the decoding routines throw an
  // ArgumentException on invalid UTF-8 input. In the 2.0 profile the
  // constructor installs a decoder fallback instead.
  private bool throwOnInvalid;
#endif
  44. // Constructors.
  // Default constructor: forwards to the two-argument constructor with
  // both options switched off.
  public UTF8Encoding ()
      : this (false, false)
  {
  }
  // Forwards to the two-argument constructor, passing the identifier
  // option through and leaving throwOnInvalidBytes off.
  public UTF8Encoding (bool encoderShouldEmitUTF8Identifier)
      : this (encoderShouldEmitUTF8Identifier, false)
  {
  }
  // Main constructor. Registers the UTF-8 code page with the base class,
  // records the identifier option, configures how invalid input bytes
  // are treated, and fills in the descriptive properties.
  //
  //   encoderShouldEmitUTF8Identifier - stored in emitIdentifier.
  //   throwOnInvalidBytes - 2.0 profile: selects an exception decoder
  //       fallback (true) or an empty-string replacement fallback
  //       (false). Older profile: stored in throwOnInvalid.
  public UTF8Encoding (bool encoderShouldEmitUTF8Identifier, bool throwOnInvalidBytes)
      : base (UTF8_CODE_PAGE)
  {
      emitIdentifier = encoderShouldEmitUTF8Identifier;

#if NET_2_0
      DecoderFallback decoderFallback;
      if (throwOnInvalidBytes)
          decoderFallback = new DecoderExceptionFallback ();
      else
          decoderFallback = new DecoderReplacementFallback (String.Empty);
      SetFallbackInternal (null, decoderFallback);
#else
      throwOnInvalid = throwOnInvalidBytes;
#endif

      // Names reported for this encoding.
      header_name = "utf-8";
      body_name = header_name;
      web_name = body_name;
      encoding_name = "Unicode (UTF-8)";

      // Capability flags.
      is_browser_save = true;
      is_browser_display = true;
      is_mail_news_display = true;

      windows_code_page = UnicodeEncoding.UNICODE_CODE_PAGE;
  }
  67. #region GetByteCount()
  68. // Internal version of "GetByteCount" which can handle a rolling
  69. // state between multiple calls to this method.
  // Array front end for the byte-count computation. Validates the
  // arguments, then pins the array and delegates to the pointer overload.
  //
  //   leftOver - high surrogate carried over from a previous call, or
  //              '\0'; updated for the next call.
  //   flush    - when true, a pending surrogate is counted (3 bytes)
  //              and cleared instead of being carried over.
  //
  // Throws ArgumentNullException for a null array and
  // ArgumentOutOfRangeException for an index/count outside the array.
  private static int InternalGetByteCount (char[] chars, int index, int count, ref char leftOver, bool flush)
  {
      if (chars == null)
          throw new ArgumentNullException ("chars");
      if (index < 0 || index > chars.Length)
          throw new ArgumentOutOfRangeException ("index", _("ArgRange_Array"));
      if (count < 0 || count > (chars.Length - index))
          throw new ArgumentOutOfRangeException ("count", _("ArgRange_Array"));

      if (index != chars.Length) {
          unsafe {
              fixed (char* start = chars) {
                  return InternalGetByteCount (start + index, count, ref leftOver, flush);
              }
          }
      }

      // The range starts at the very end of the array, so the only
      // thing that can still contribute is a pending surrogate.
      if (!flush || leftOver == '\0')
          return 0;

      leftOver = '\0';
      return 3;
  }
  // Computes how many UTF-8 bytes are needed to encode "count" UTF-16
  // code units starting at "chars".
  //
  //   leftOver - on entry, a high surrogate left pending by a previous
  //              call (or '\0'); on exit, the surrogate still pending,
  //              or '\0' when "flush" is true.
  //   flush    - when true, a surrogate still pending at the end of the
  //              input is counted as 3 bytes rather than carried over.
  //
  // Returns the byte count. No argument validation is done here.
  private unsafe static int InternalGetByteCount (char* chars, int count, ref char leftOver, bool flush)
  {
      int index = 0;
      int length = 0;
      char pair = leftOver;

      while (count > 0) {
          char ch = chars [index];

          if (pair == 0) {
              if (ch < '\u0080') {
                  // Fast path: consume the whole run of ASCII
                  // characters, one byte each, then re-enter the
                  // main loop at the first non-ASCII character.
                  int end = index + count;
                  for (; index < end; index++, count--) {
                      if (chars [index] < '\x80')
                          ++length;
                      else
                          break;
                  }
                  continue;
              } else if (ch < '\u0800') {
                  length += 2;
              } else if (ch >= '\uD800' && ch <= '\uDBFF') {
                  // Start of a surrogate pair: nothing is counted
                  // until we see what follows.
                  pair = ch;
              } else {
                  // Any other character takes 3 bytes. This includes
                  // a surrogate tail with no leading surrogate, which
                  // is counted as if it were an ordinary character.
                  length += 3;
              }
          } else if (ch >= '\uDC00' && ch <= '\uDFFF') {
              // Pending surrogate start plus this tail form a complete
              // pair, encoded as one 4-byte sequence.
              length += 4;
              pair = '\0';
          } else {
              // A surrogate start followed by something that is not a
              // tail. Count the unpaired start as 3 bytes and re-visit
              // the current character without a pending surrogate.
              length += 3;
              pair = '\0';
              continue;
          }

          ++index;
          --count;
      }

      if (flush) {
          // Count a surrogate start left dangling at the end of input.
          if (pair != '\0')
              length += 3;
          leftOver = '\0';
      } else {
          leftOver = pair;
      }

      return length;
  }
  162. // Get the number of bytes needed to encode a character buffer.
  // Stateless byte count for a range of a character array: starts with
  // no pending surrogate and flushes at the end.
  public override int GetByteCount (char[] chars, int index, int count)
  {
      char pendingSurrogate = '\0';
      int byteCount = InternalGetByteCount (chars, index, count, ref pendingSurrogate, true);
      return byteCount;
  }
  168. #if !NET_2_0
  169. // Convenience wrappers for "GetByteCount".
  // Stateless byte count for a whole string: starts with no pending
  // surrogate and flushes at the end.
  // Throws ArgumentNullException when "s" is null.
  public override int GetByteCount (String s)
  {
      if (s == null)
          throw new ArgumentNullException ("s");

      char pendingSurrogate = '\0';
      unsafe {
          fixed (char* text = s)
              return InternalGetByteCount (text, s.Length, ref pendingSurrogate, true);
      }
  }
  183. #endif
  184. #if NET_2_0
  // Stateless byte count for "count" characters at "chars": starts with
  // no pending surrogate and flushes at the end.
  // Throws ArgumentNullException when "chars" is null.
  [CLSCompliant (false)]
  [ComVisible (false)]
  public unsafe override int GetByteCount (char* chars, int count)
  {
      if (chars == null)
          throw new ArgumentNullException ("chars");

      if (count != 0) {
          char pendingSurrogate = '\0';
          return InternalGetByteCount (chars, count, ref pendingSurrogate, true);
      }
      return 0;
  }
  196. #endif
  197. #endregion
  198. #region GetBytes()
  199. // Internal version of "GetBytes" which can handle a rolling
  200. // state between multiple calls to this method.
  // Array front end for the encoder. Validates the arguments, then pins
  // both arrays and delegates to the pointer overload.
  //
  //   leftOver - high surrogate carried over from a previous call, or
  //              '\0'; updated for the next call.
  //   flush    - when true, a pending surrogate is dealt with now
  //              instead of being carried over.
  //
  // Returns the number of bytes written to "bytes".
  // Throws ArgumentNullException for a null array,
  // ArgumentOutOfRangeException for an index/count outside its array,
  // and ArgumentException when "bytes" has too little room.
  private static int InternalGetBytes (char[] chars, int charIndex,
                                       int charCount, byte[] bytes,
                                       int byteIndex, ref char leftOver,
                                       bool flush)
  {
      // Validate the parameters.
      if (chars == null) {
          throw new ArgumentNullException ("chars");
      }
      if (bytes == null) {
          throw new ArgumentNullException ("bytes");
      }
      if (charIndex < 0 || charIndex > chars.Length) {
          throw new ArgumentOutOfRangeException ("charIndex", _("ArgRange_Array"));
      }
      if (charCount < 0 || charCount > (chars.Length - charIndex)) {
          throw new ArgumentOutOfRangeException ("charCount", _("ArgRange_Array"));
      }
      if (byteIndex < 0 || byteIndex > bytes.Length) {
          throw new ArgumentOutOfRangeException ("byteIndex", _("ArgRange_Array"));
      }

      // The range starts at the very end of the array: only a pending
      // surrogate can still produce output.
      if (charIndex == chars.Length) {
          if (flush && leftOver != '\0') {
#if NET_2_0
              // FIXME: use EncoderFallback.
              //
              // By default it is empty, so I do nothing for now.
              leftOver = '\0';
#else
              // Flush the left-over surrogate pair start. Three bytes
              // are written, so a buffer with exactly three free bytes
              // is large enough; only fewer than three is an error.
              if (byteIndex > bytes.Length - 3)
                  throw new ArgumentException (_("Arg_InsufficientSpace"), "bytes");
              // NOTE(review): these three bytes differ from what the
              // pointer overload writes when it flushes a pending
              // surrogate (it encodes the surrogate value itself) --
              // confirm which output is intended.
              bytes [byteIndex++] = 0xEF;
              bytes [byteIndex++] = 0xBB;
              bytes [byteIndex++] = 0xBF;
              leftOver = '\0';
              return 3;
#endif
          }
          return 0;
      }

      unsafe {
          fixed (char* cptr = chars) {
              // No room at all in the output: pass a null, zero-length
              // buffer rather than pinning past the end of the array.
              if (bytes.Length == byteIndex)
                  return InternalGetBytes (
                      cptr + charIndex, charCount,
                      null, 0, ref leftOver, flush);
              fixed (byte *bptr = bytes) {
                  return InternalGetBytes (
                      cptr + charIndex, charCount,
                      bptr + byteIndex, bytes.Length - byteIndex,
                      ref leftOver, flush);
              }
          }
      }
  }
  // Encodes "charCount" UTF-16 code units at "chars" as UTF-8 into the
  // buffer at "bytes", which has room for "byteCount" bytes.
  //
  //   leftOver - on entry, a high surrogate left pending by a previous
  //              call (or '\0'); on exit, the surrogate still pending,
  //              or '\0' when "flush" is true.
  //   flush    - when true, a surrogate still pending at the end of the
  //              input is written out as a 3-byte sequence.
  //
  // Returns the number of bytes written.
  // Throws ArgumentException when the output buffer is too small.
  // No other argument validation is done here; callers may pass a null
  // buffer together with a byteCount of zero.
  private unsafe static int InternalGetBytes (char* chars, int charCount,
                                              byte* bytes, int byteCount,
                                              ref char leftOver, bool flush)
  {
      int charIndex = 0;

      // Convert the characters into bytes.
      char ch;
      int length = byteCount;
      char pair = leftOver;
      int posn = 0;
      int code = 0;

      while (charCount > 0) {
          // Fetch the next UTF-16 character pair value.
          ch = chars [charIndex];
          if (pair == '\0') {
              if (ch < '\uD800' || ch >= '\uE000') {
                  if (ch < '\x80') { // fast path optimization
                      int end = charIndex + charCount;
                      for (; charIndex < end; posn++, charIndex++, charCount--) {
                          if (chars [charIndex] >= '\x80')
                              break;
                          // The fast path must honour the buffer size
                          // just like the general path below does;
                          // otherwise ASCII input overruns the buffer
                          // (or dereferences a null buffer).
                          if (posn >= length)
                              throw new ArgumentException (_("Arg_InsufficientSpace"), "bytes");
                          bytes [posn] = (byte) chars [charIndex];
                      }
                      continue;
                  }
                  code = ch;
              }
              else if (ch < '\uDC00') {
                  // surrogate start
                  pair = ch;
                  ++charIndex;
                  --charCount;
                  continue;
              } else { // ch <= '\uDFFF'
                  // We have a surrogate tail without leading
                  // surrogate. In NET_2_0 it uses fallback.
                  // In NET_1_1 we output wrong surrogate.
                  if (posn > length - 3) {
                      throw new ArgumentException (_("Arg_InsufficientSpace"), "bytes");
                  }
                  bytes [posn++] = (byte) (0xE0 | (ch >> 12));
                  bytes [posn++] = (byte) (0x80 | ((ch >> 6) & 0x3F));
                  bytes [posn++] = (byte) (0x80 | (ch & 0x3F));
                  ++charIndex;
                  --charCount;
                  continue;
              }
          } else {
              if ('\uDC00' <= ch && ch <= '\uDFFF')
                  // Combine the pending start and this tail into one
                  // code point above U+FFFF.
                  code = 0x10000 + (int) ch - 0xDC00 +
                      (((int) pair - 0xD800) << 10);
              else {
                  // We have a surrogate start followed by a
                  // regular character. Technically, this is
                  // invalid, but we have to do something.
                  // We write out the surrogate start and then
                  // re-visit the current character again.
                  if (posn > length - 3) {
                      throw new ArgumentException (_("Arg_InsufficientSpace"), "bytes");
                  }
                  bytes [posn++] = (byte) (0xE0 | (pair >> 12));
                  bytes [posn++] = (byte) (0x80 | ((pair >> 6) & 0x3F));
                  bytes [posn++] = (byte) (0x80 | (pair & 0x3F));
                  pair = '\0';
                  continue;
              }
              pair = '\0';
          }
          ++charIndex;
          --charCount;

          // Encode the character pair value.
          if (code < 0x0080) {
              if (posn >= length)
                  throw new ArgumentException (_("Arg_InsufficientSpace"), "bytes");
              bytes [posn++] = (byte)code;
          } else if (code < 0x0800) {
              if ((posn + 2) > length)
                  throw new ArgumentException (_("Arg_InsufficientSpace"), "bytes");
              bytes [posn++] = (byte) (0xC0 | (code >> 6));
              bytes [posn++] = (byte) (0x80 | (code & 0x3F));
          } else if (code < 0x10000) {
              if (posn > length - 3)
                  throw new ArgumentException (_("Arg_InsufficientSpace"), "bytes");
              bytes [posn++] = (byte) (0xE0 | (code >> 12));
              bytes [posn++] = (byte) (0x80 | ((code >> 6) & 0x3F));
              bytes [posn++] = (byte) (0x80 | (code & 0x3F));
          } else {
              if (posn > length - 4)
                  throw new ArgumentException (_("Arg_InsufficientSpace"), "bytes");
              bytes [posn++] = (byte) (0xF0 | (code >> 18));
              bytes [posn++] = (byte) (0x80 | ((code >> 12) & 0x3F));
              bytes [posn++] = (byte) (0x80 | ((code >> 6) & 0x3F));
              bytes [posn++] = (byte) (0x80 | (code & 0x3F));
          }
      }

      if (flush) {
          if (pair != '\0') {
              // Flush the left-over incomplete surrogate.
              if (posn > length - 3) {
                  throw new ArgumentException (_("Arg_InsufficientSpace"), "bytes");
              }
              bytes [posn++] = (byte) (0xE0 | (pair >> 12));
              bytes [posn++] = (byte) (0x80 | ((pair >> 6) & 0x3F));
              bytes [posn++] = (byte) (0x80 | (pair & 0x3F));
          }
          leftOver = '\0';
      }
      else
          leftOver = pair;

      // Return the final count to the caller.
      return posn;
  }
  // Placeholder for an encoder-side fallback taking a surrogate
  // lead/tail; not implemented, always throws NotImplementedException.
  private unsafe int Fallback (byte* bytes, int byteCount, char lead, char tail)
  {
      throw new NotImplementedException ();
  }
  377. // Get the bytes that result from encoding a character buffer.
  // Stateless encode of a range of a character array into a byte array:
  // starts with no pending surrogate and flushes at the end.
  // Returns the number of bytes written.
  public override int GetBytes (char[] chars, int charIndex, int charCount,
                                byte[] bytes, int byteIndex)
  {
      char pendingSurrogate = '\0';
      int written = InternalGetBytes (chars, charIndex, charCount, bytes, byteIndex, ref pendingSurrogate, true);
      return written;
  }
  384. // Convenience wrappers for "GetBytes".
  // Stateless encode of a substring into a byte array: starts with no
  // pending surrogate and flushes at the end.
  // Returns the number of bytes written.
  // Throws ArgumentNullException for a null string or array and
  // ArgumentOutOfRangeException for an index/count outside its bounds.
  public override int GetBytes (String s, int charIndex, int charCount,
                                byte[] bytes, int byteIndex)
  {
      if (s == null)
          throw new ArgumentNullException ("s");
      if (bytes == null)
          throw new ArgumentNullException ("bytes");
      if (charIndex < 0 || charIndex > s.Length)
          throw new ArgumentOutOfRangeException ("charIndex", _("ArgRange_StringIndex"));
      if (charCount < 0 || charCount > (s.Length - charIndex))
          throw new ArgumentOutOfRangeException ("charCount", _("ArgRange_StringRange"));
      if (byteIndex < 0 || byteIndex > bytes.Length)
          throw new ArgumentOutOfRangeException ("byteIndex", _("ArgRange_Array"));

      // Starting at the end of the string: nothing to encode.
      if (charIndex == s.Length)
          return 0;

      int room = bytes.Length - byteIndex;
      char pendingSurrogate = '\0';
      unsafe {
          fixed (char* text = s) {
              // No room at all in the output: hand over a null,
              // zero-length buffer instead of pinning the array.
              if (room == 0)
                  return InternalGetBytes (text + charIndex, charCount,
                                           null, 0, ref pendingSurrogate, true);

              fixed (byte* output = bytes) {
                  return InternalGetBytes (text + charIndex, charCount,
                                           output + byteIndex, room,
                                           ref pendingSurrogate, true);
              }
          }
      }
  }
  422. #if NET_2_0
  // Stateless encode of "charCount" characters at "chars" into the
  // "byteCount"-byte buffer at "bytes": starts with no pending
  // surrogate and flushes at the end.
  // Returns the number of bytes written.
  // Throws ArgumentNullException for a null pointer and
  // IndexOutOfRangeException for a negative count.
  [CLSCompliant (false)]
  [ComVisible (false)]
  public unsafe override int GetBytes (char* chars, int charCount, byte* bytes, int byteCount)
  {
      if (chars == null)
          throw new ArgumentNullException ("chars");
      if (charCount < 0)
          throw new IndexOutOfRangeException ("charCount");
      if (bytes == null)
          throw new ArgumentNullException ("bytes");
      // Report the argument that is actually out of range (this check
      // used to name "charCount").
      if (byteCount < 0)
          throw new IndexOutOfRangeException ("byteCount");

      if (charCount == 0)
          return 0;

      char dummy = '\0';
      if (byteCount == 0)
          return InternalGetBytes (chars, charCount, null, 0, ref dummy, true);
      else
          return InternalGetBytes (chars, charCount, bytes, byteCount, ref dummy, true);
  }
  443. #endif
  444. #endregion
  445. // Internal version of "GetCharCount" which can handle a rolling
  446. // state between multiple calls to this method.
  // Array front end for the character-count computation. Validates the
  // arguments, then pins the array and delegates to the pointer
  // overload, forwarding the decoder state and fallback arguments
  // unchanged. Returns 0 without further work when "count" is zero.
  // Throws ArgumentNullException for a null array and
  // ArgumentOutOfRangeException for an index/count outside the array.
#if NET_2_0
  private unsafe static int InternalGetCharCount (
      byte[] bytes, int index, int count, uint leftOverBits,
      uint leftOverCount, object provider,
      ref DecoderFallbackBuffer fallbackBuffer, ref byte [] bufferArg, bool flush)
#else
  private unsafe static int InternalGetCharCount (
      byte[] bytes, int index, int count, uint leftOverBits,
      uint leftOverCount, bool throwOnInvalid, bool flush)
#endif
  {
      if (bytes == null)
          throw new ArgumentNullException ("bytes");
      if (index < 0 || index > bytes.Length)
          throw new ArgumentOutOfRangeException ("index", _("ArgRange_Array"));
      if (count < 0 || count > (bytes.Length - index))
          throw new ArgumentOutOfRangeException ("count", _("ArgRange_Array"));

      if (count == 0)
          return 0;

      fixed (byte* start = bytes) {
#if NET_2_0
          return InternalGetCharCount (start + index, count,
              leftOverBits, leftOverCount, provider, ref fallbackBuffer, ref bufferArg, flush);
#else
          return InternalGetCharCount (start + index, count,
              leftOverBits, leftOverCount, throwOnInvalid, flush);
#endif
      }
  }
  // Computes how many UTF-16 code units result from decoding "count"
  // UTF-8 bytes starting at "bytes".
  //
  //   leftOverBits  - payload bits accumulated from a multi-byte
  //                   sequence begun in a previous call.
  //   leftOverCount - packed sequence state: the low 4 bits hold the
  //                   number of bytes seen so far, the next 4 bits the
  //                   total length of the sequence (0 = none pending).
  //   flush         - when true, an incomplete sequence at the end of
  //                   the input is treated as invalid.
  //
  // Invalid input is reported through the decoder fallback (2.0
  // profile: the fallback's output length is added to the count) or by
  // throwing ArgumentException when "throwOnInvalid" is set (older
  // profile; otherwise it is skipped without being counted).
#if NET_2_0
  private unsafe static int InternalGetCharCount (
      byte* bytes, int count, uint leftOverBits,
      uint leftOverCount, object provider,
      ref DecoderFallbackBuffer fallbackBuffer, ref byte [] bufferArg, bool flush)
#else
  private unsafe static int InternalGetCharCount (
      byte* bytes, int count, uint leftOverBits,
      uint leftOverCount, bool throwOnInvalid, bool flush)
#endif
  {
      int index = 0;
      int length = 0;

      // Fast path: with no sequence pending, a leading run of ASCII
      // bytes maps one-to-one onto characters.
      if (leftOverCount == 0) {
          int end = index + count;
          for (; index < end; index++, count--) {
              if (bytes [index] < 0x80)
                  length++;
              else
                  break;
          }
      }

      // Determine the number of characters that we have.
      uint ch;
      uint leftBits = leftOverBits;
      uint leftSoFar = (leftOverCount & (uint)0x0F);
      uint leftSize = ((leftOverCount >> 4) & (uint)0x0F);
      while (count > 0) {
          ch = (uint)(bytes[index++]);
          --count;
          if (leftSize == 0) {
              // Process a UTF-8 start character.
              if (ch < (uint)0x0080) {
                  // Single-byte UTF-8 character.
                  ++length;
              } else if ((ch & (uint)0xE0) == (uint)0xC0) {
                  // Double-byte UTF-8 character.
                  leftBits = (ch & (uint)0x1F);
                  leftSoFar = 1;
                  leftSize = 2;
              } else if ((ch & (uint)0xF0) == (uint)0xE0) {
                  // Three-byte UTF-8 character.
                  leftBits = (ch & (uint)0x0F);
                  leftSoFar = 1;
                  leftSize = 3;
              } else if ((ch & (uint)0xF8) == (uint)0xF0) {
                  // Four-byte UTF-8 character.
                  leftBits = (ch & (uint)0x07);
                  leftSoFar = 1;
                  leftSize = 4;
              } else if ((ch & (uint)0xFC) == (uint)0xF8) {
                  // Five-byte UTF-8 character.
                  leftBits = (ch & (uint)0x03);
                  leftSoFar = 1;
                  leftSize = 5;
              } else if ((ch & (uint)0xFE) == (uint)0xFC) {
                  // Six-byte UTF-8 character.
                  leftBits = (ch & (uint)0x03);
                  leftSoFar = 1;
                  leftSize = 6;
              } else {
                  // Invalid UTF-8 start character.
#if NET_2_0
                  length += Fallback (provider, ref fallbackBuffer, ref bufferArg, bytes, index - 1);
#else
                  if (throwOnInvalid)
                      throw new ArgumentException (_("Arg_InvalidUTF8"), "bytes");
#endif
              }
          } else {
              // Process an extra byte in a multi-byte sequence.
              if ((ch & (uint)0xC0) == (uint)0x80) {
                  leftBits = ((leftBits << 6) | (ch & (uint)0x3F));
                  if (++leftSoFar >= leftSize) {
                      // We have a complete character now.
                      if (leftBits < (uint)0x10000) {
                          // is it an overlong ?
                          bool overlong = false;
                          switch (leftSize) {
                          case 2:
                              overlong = (leftBits <= 0x7F);
                              break;
                          case 3:
                              overlong = (leftBits <= 0x07FF);
                              break;
                          case 4:
                              overlong = (leftBits <= 0xFFFF);
                              break;
                          case 5:
                              overlong = (leftBits <= 0x1FFFFF);
                              break;
                          case 6:
                              overlong = (leftBits <= 0x03FFFFFF);
                              break;
                          }
                          if (overlong) {
#if NET_2_0
                              length += Fallback (provider, ref fallbackBuffer, ref bufferArg, bytes, index - 1);
#else
                              if (throwOnInvalid)
                                  throw new ArgumentException (_("Overlong"), leftBits.ToString ());
#endif
                          }
                          else
                              ++length;
                      } else if (leftBits < (uint)0x110000) {
                          // Above U+FFFF: decodes to a surrogate pair.
                          length += 2;
                      } else {
#if NET_2_0
                          length += Fallback (provider, ref fallbackBuffer, ref bufferArg, bytes, index - 1);
#else
                          if (throwOnInvalid)
                              throw new ArgumentException (_("Arg_InvalidUTF8"), "bytes");
#endif
                      }
                      leftSize = 0;
                  }
              } else {
                  // Invalid UTF-8 sequence: clear and restart.
#if NET_2_0
                  length += Fallback (provider, ref fallbackBuffer, ref bufferArg, bytes, index - 1);
#else
                  if (throwOnInvalid)
                      throw new ArgumentException (_("Arg_InvalidUTF8"), "bytes");
#endif
                  leftSize = 0;
                  // Step back so the current byte is examined again as
                  // a potential start byte.
                  --index;
                  ++count;
              }
          }
      }

      if (flush && leftSize != 0) {
          // We had left-over bytes that didn't make up
          // a complete UTF-8 character sequence.
#if NET_2_0
          // "index" is now one past the last byte supplied, so the
          // most recent byte of the truncated sequence is at index - 1;
          // reading bytes[index] would run off the end of the input.
          if (index > 0) {
              length += Fallback (provider, ref fallbackBuffer, ref bufferArg, bytes, index - 1);
          } else {
              // No byte was supplied in this call; the pending bytes
              // came from an earlier buffer that is not available
              // here, so report a placeholder instead of reading
              // from an empty input.
              byte placeholder = 0;
              length += Fallback (provider, ref fallbackBuffer, ref bufferArg, &placeholder, 0);
          }
#else
          if (throwOnInvalid)
              throw new ArgumentException (_("Arg_InvalidUTF8"), "bytes");
#endif
      }

      // Return the final length to the caller.
      return length;
  }
  623. #if NET_2_0
  624. // for GetCharCount()
  // Reports the single byte at bytes[index] to the decoder fallback and
  // returns how many replacement characters the fallback has pending.
  //
  //   provider  - a DecoderFallback, or otherwise a Decoder whose
  //               FallbackBuffer is used; consulted only while
  //               "buffer" is still null.
  //   buffer    - fallback buffer, created on first use and cached
  //               through the ref parameter.
  //   bufferArg - one-element scratch array, allocated on first use.
  static unsafe int Fallback (object provider, ref DecoderFallbackBuffer buffer, ref byte [] bufferArg, byte* bytes, int index)
  {
      if (buffer == null) {
          DecoderFallback fallback = provider as DecoderFallback;
          buffer = (fallback != null)
              ? fallback.CreateFallbackBuffer ()
              : ((Decoder) provider).FallbackBuffer;
      }

      if (bufferArg == null)
          bufferArg = new byte [1];
      bufferArg [0] = bytes [index];

      buffer.Fallback (bufferArg, 0);
      return buffer.Remaining;
  }
  640. // for GetChars()
  // Reports the single byte at bytes[byteIndex] to the decoder fallback
  // and appends every replacement character it produces to "chars",
  // advancing "charIndex" past them. No capacity check is made on
  // "chars" here.
  //
  //   provider  - a DecoderFallback, or otherwise a Decoder whose
  //               FallbackBuffer is used; consulted only while
  //               "buffer" is still null.
  //   buffer    - fallback buffer, created on first use and cached
  //               through the ref parameter.
  //   bufferArg - one-element scratch array, allocated on first use.
  static unsafe void Fallback (object provider, ref DecoderFallbackBuffer buffer, ref byte [] bufferArg, byte* bytes, int byteIndex,
                               char* chars, ref int charIndex)
  {
      if (buffer == null) {
          DecoderFallback fallback = provider as DecoderFallback;
          buffer = (fallback != null)
              ? fallback.CreateFallbackBuffer ()
              : ((Decoder) provider).FallbackBuffer;
      }

      if (bufferArg == null)
          bufferArg = new byte [1];
      bufferArg [0] = bytes [byteIndex];

      buffer.Fallback (bufferArg, 0);
      while (buffer.Remaining > 0) {
          chars [charIndex] = buffer.GetNextChar ();
          charIndex++;
      }
  }
  658. #endif
  659. // Get the number of characters needed to decode a byte buffer.
  // Stateless character count for a range of a byte array: starts with
  // no pending sequence and flushes at the end.
  public override int GetCharCount (byte[] bytes, int index, int count)
  {
#if NET_2_0
      DecoderFallbackBuffer fallbackBuffer = null;
      byte [] scratch = null;
      return InternalGetCharCount (bytes, index, count, 0, 0, DecoderFallback, ref fallbackBuffer, ref scratch, true);
#else
      return InternalGetCharCount (bytes, index, count, 0, 0, throwOnInvalid, true);
#endif
  }
  670. #if NET_2_0
  // Stateless character count for "count" bytes at "bytes": starts with
  // no pending sequence and flushes at the end.
  // Throws ArgumentNullException when "bytes" is null, matching the
  // pointer overload of GetByteCount.
  [CLSCompliant (false)]
  [ComVisible (false)]
  public unsafe override int GetCharCount (byte* bytes, int count)
  {
      // Reject a null pointer up front instead of dereferencing it in
      // the decoding loop.
      if (bytes == null)
          throw new ArgumentNullException ("bytes");
      if (count == 0)
          return 0;

      DecoderFallbackBuffer buf = null;
      byte [] bufferArg = null;
      return InternalGetCharCount (bytes, count, 0, 0, DecoderFallback, ref buf, ref bufferArg, true);
  }
  679. #endif
  680. // Get the characters that result from decoding a byte buffer.
  // Array front end for the decoder. Validates the arguments, then pins
  // the arrays and delegates to the pointer overload, forwarding the
  // decoder state and fallback arguments unchanged. The output capacity
  // handed on is everything from "charIndex" to the end of "chars".
  // Returns 0 immediately when "charIndex" is at the end of "chars".
  // Throws ArgumentNullException for a null array and
  // ArgumentOutOfRangeException for an index/count outside its array.
#if NET_2_0
  private unsafe static int InternalGetChars (
      byte[] bytes, int byteIndex, int byteCount, char[] chars,
      int charIndex, ref uint leftOverBits, ref uint leftOverCount,
      object provider,
      ref DecoderFallbackBuffer fallbackBuffer, ref byte [] bufferArg, bool flush)
#else
  private unsafe static int InternalGetChars (
      byte[] bytes, int byteIndex, int byteCount, char[] chars,
      int charIndex, ref uint leftOverBits, ref uint leftOverCount,
      bool throwOnInvalid, bool flush)
#endif
  {
      if (bytes == null)
          throw new ArgumentNullException ("bytes");
      if (chars == null)
          throw new ArgumentNullException ("chars");
      if (byteIndex < 0 || byteIndex > bytes.Length)
          throw new ArgumentOutOfRangeException ("byteIndex", _("ArgRange_Array"));
      if (byteCount < 0 || byteCount > (bytes.Length - byteIndex))
          throw new ArgumentOutOfRangeException ("byteCount", _("ArgRange_Array"));
      if (charIndex < 0 || charIndex > chars.Length)
          throw new ArgumentOutOfRangeException ("charIndex", _("ArgRange_Array"));

      if (charIndex == chars.Length)
          return 0;

      int room = chars.Length - charIndex;
      bool noInput = (byteCount == 0 || byteIndex == bytes.Length);

      fixed (char* output = chars) {
          if (noInput) {
              // Nothing to read: pass a null, zero-length input so
              // that only pending state is processed.
#if NET_2_0
              return InternalGetChars (null, 0, output + charIndex, room, ref leftOverBits, ref leftOverCount, provider, ref fallbackBuffer, ref bufferArg, flush);
#else
              return InternalGetChars (null, 0, output + charIndex, room, ref leftOverBits, ref leftOverCount, throwOnInvalid, flush);
#endif
          }

          fixed (byte* input = bytes) {
#if NET_2_0
              return InternalGetChars (input + byteIndex, byteCount, output + charIndex, room, ref leftOverBits, ref leftOverCount, provider, ref fallbackBuffer, ref bufferArg, flush);
#else
              return InternalGetChars (input + byteIndex, byteCount, output + charIndex, room, ref leftOverBits, ref leftOverCount, throwOnInvalid, flush);
#endif
          }
      }
  }
  728. #if NET_2_0
  729. private unsafe static int InternalGetChars (
  730. byte* bytes, int byteCount, char* chars, int charCount,
  731. ref uint leftOverBits, ref uint leftOverCount,
  732. object provider,
  733. ref DecoderFallbackBuffer fallbackBuffer, ref byte [] bufferArg, bool flush)
  734. #else
  735. private unsafe static int InternalGetChars (
  736. byte* bytes, int byteCount, char* chars, int charCount,
  737. ref uint leftOverBits, ref uint leftOverCount,
  738. bool throwOnInvalid, bool flush)
  739. #endif
  740. {
  741. int charIndex = 0, byteIndex = 0;
  742. int length = charCount;
  743. int posn = charIndex;
  744. if (leftOverCount == 0) {
  745. int end = byteIndex + byteCount;
  746. for (; byteIndex < end; posn++, byteIndex++, byteCount--) {
  747. if (bytes [byteIndex] < 0x80)
  748. chars [posn] = (char) bytes [byteIndex];
  749. else
  750. break;
  751. }
  752. }
  753. // Convert the bytes into the output buffer.
  754. uint ch;
  755. uint leftBits = leftOverBits;
  756. uint leftSoFar = (leftOverCount & (uint)0x0F);
  757. uint leftSize = ((leftOverCount >> 4) & (uint)0x0F);
  758. int byteEnd = byteIndex + byteCount;
  759. for(; byteIndex < byteEnd; byteIndex++) {
  760. // Fetch the next character from the byte buffer.
  761. ch = (uint)(bytes[byteIndex]);
  762. if (leftSize == 0) {
  763. // Process a UTF-8 start character.
  764. if (ch < (uint)0x0080) {
  765. // Single-byte UTF-8 character.
  766. if (posn >= length) {
  767. throw new ArgumentException (_("Arg_InsufficientSpace"), "chars");
  768. }
  769. chars[posn++] = (char)ch;
  770. } else if ((ch & (uint)0xE0) == (uint)0xC0) {
  771. // Double-byte UTF-8 character.
  772. leftBits = (ch & (uint)0x1F);
  773. leftSoFar = 1;
  774. leftSize = 2;
  775. } else if ((ch & (uint)0xF0) == (uint)0xE0) {
  776. // Three-byte UTF-8 character.
  777. leftBits = (ch & (uint)0x0F);
  778. leftSoFar = 1;
  779. leftSize = 3;
  780. } else if ((ch & (uint)0xF8) == (uint)0xF0) {
  781. // Four-byte UTF-8 character.
  782. leftBits = (ch & (uint)0x07);
  783. leftSoFar = 1;
  784. leftSize = 4;
  785. } else if ((ch & (uint)0xFC) == (uint)0xF8) {
  786. // Five-byte UTF-8 character.
  787. leftBits = (ch & (uint)0x03);
  788. leftSoFar = 1;
  789. leftSize = 5;
  790. } else if ((ch & (uint)0xFE) == (uint)0xFC) {
  791. // Six-byte UTF-8 character.
  792. leftBits = (ch & (uint)0x03);
  793. leftSoFar = 1;
  794. leftSize = 6;
  795. } else {
  796. // Invalid UTF-8 start character.
  797. #if NET_2_0
  798. Fallback (provider, ref fallbackBuffer, ref bufferArg, bytes, byteIndex, chars, ref posn);
  799. #else
  800. if (throwOnInvalid)
  801. throw new ArgumentException (_("Arg_InvalidUTF8"), "bytes");
  802. #endif
  803. }
  804. } else {
  805. // Process an extra byte in a multi-byte sequence.
  806. if ((ch & (uint)0xC0) == (uint)0x80) {
  807. leftBits = ((leftBits << 6) | (ch & (uint)0x3F));
  808. if (++leftSoFar >= leftSize) {
  809. // We have a complete character now.
  810. if (leftBits < (uint)0x10000) {
  811. // is it an overlong ?
  812. bool overlong = false;
  813. switch (leftSize) {
  814. case 2:
  815. overlong = (leftBits <= 0x7F);
  816. break;
  817. case 3:
  818. overlong = (leftBits <= 0x07FF);
  819. break;
  820. case 4:
  821. overlong = (leftBits <= 0xFFFF);
  822. break;
  823. case 5:
  824. overlong = (leftBits <= 0x1FFFFF);
  825. break;
  826. case 6:
  827. overlong = (leftBits <= 0x03FFFFFF);
  828. break;
  829. }
  830. if (overlong) {
  831. #if NET_2_0
  832. Fallback (provider, ref fallbackBuffer, ref bufferArg, bytes, byteIndex, chars, ref posn);
  833. #else
  834. if (throwOnInvalid)
  835. throw new ArgumentException (_("Overlong"), leftBits.ToString ());
  836. #endif
  837. }
  838. else if ((leftBits & 0xF800) == 0xD800) {
  839. // UTF-8 doesn't use surrogate characters
  840. #if NET_2_0
  841. Fallback (provider, ref fallbackBuffer, ref bufferArg, bytes, byteIndex, chars, ref posn);
  842. #else
  843. if (throwOnInvalid)
  844. throw new ArgumentException (_("Arg_InvalidUTF8"), "bytes");
  845. #endif
  846. }
  847. else {
  848. if (posn >= length) {
  849. throw new ArgumentException
  850. (_("Arg_InsufficientSpace"), "chars");
  851. }
  852. chars[posn++] = (char)leftBits;
  853. }
  854. } else if (leftBits < (uint)0x110000) {
  855. if ((posn + 2) > length) {
  856. throw new ArgumentException
  857. (_("Arg_InsufficientSpace"), "chars");
  858. }
  859. leftBits -= (uint)0x10000;
  860. chars[posn++] = (char)((leftBits >> 10) +
  861. (uint)0xD800);
  862. chars[posn++] =
  863. (char)((leftBits & (uint)0x3FF) + (uint)0xDC00);
  864. } else {
  865. #if NET_2_0
  866. Fallback (provider, ref fallbackBuffer, ref bufferArg, bytes, byteIndex, chars, ref posn);
  867. #else
  868. if (throwOnInvalid)
  869. throw new ArgumentException (_("Arg_InvalidUTF8"), "bytes");
  870. #endif
  871. }
  872. leftSize = 0;
  873. }
  874. } else {
  875. // Invalid UTF-8 sequence: clear and restart.
  876. #if NET_2_0
  877. Fallback (provider, ref fallbackBuffer, ref bufferArg, bytes, byteIndex, chars, ref posn);
  878. #else
  879. if (throwOnInvalid)
  880. throw new ArgumentException (_("Arg_InvalidUTF8"), "bytes");
  881. #endif
  882. leftSize = 0;
  883. --byteIndex;
  884. }
  885. }
  886. }
  887. if (flush && leftSize != 0) {
  888. // We had left-over bytes that didn't make up
  889. // a complete UTF-8 character sequence.
  890. #if NET_2_0
  891. Fallback (provider, ref fallbackBuffer, ref bufferArg, bytes, byteIndex, chars, ref posn);
  892. #else
  893. if (throwOnInvalid)
  894. throw new ArgumentException (_("Arg_InvalidUTF8"), "bytes");
  895. #endif
  896. }
  897. leftOverBits = leftBits;
  898. leftOverCount = (leftSoFar | (leftSize << 4));
  899. // Return the final length to the caller.
  900. return posn - charIndex;
  901. }
  // Decode a range of a byte array into a character array in one
  // self-contained call and return the value reported by
  // InternalGetChars.  The left-over state starts at zero on every
  // call, so nothing is carried over between invocations.
  public override int GetChars (byte[] bytes, int byteIndex, int byteCount,
                                char[] chars, int charIndex)
  {
      uint pendingCount = 0;
      uint pendingBits = 0;
      int produced;
  #if NET_2_0
      // Fallback scratch objects start out null and are handed to
      // InternalGetChars by reference.
      byte [] scratch = null;
      DecoderFallbackBuffer fallbackBuf = null;
      produced = InternalGetChars (bytes, byteIndex, byteCount, chars, charIndex,
                                   ref pendingBits, ref pendingCount,
                                   DecoderFallback, ref fallbackBuf, ref scratch, true);
  #else
      produced = InternalGetChars (bytes, byteIndex, byteCount, chars, charIndex,
                                   ref pendingBits, ref pendingCount,
                                   throwOnInvalid, true);
  #endif
      return produced;
  }
  918. #if NET_2_0
  // Pointer-based variant of GetChars: decodes byteCount bytes into a
  // buffer of charCount characters in one self-contained call, with
  // flush set to true and all left-over state starting at zero.
  [CLSCompliant (false)]
  [ComVisible (false)]
  public unsafe override int GetChars (byte* bytes, int byteCount, char* chars, int charCount)
  {
      uint pendingCount = 0;
      uint pendingBits = 0;
      byte [] scratch = null;
      DecoderFallbackBuffer fallbackBuf = null;
      int produced = InternalGetChars (bytes, byteCount, chars, charCount,
                                       ref pendingBits, ref pendingCount,
                                       DecoderFallback, ref fallbackBuf, ref scratch, true);
      return produced;
  }
  930. #endif
  // Get the maximum number of bytes needed to encode a
  // specified number of characters (four bytes per character).
  //
  // Throws ArgumentOutOfRangeException when charCount is negative,
  // or when the resulting byte count does not fit in an int.
  public override int GetMaxByteCount (int charCount)
  {
      if (charCount < 0) {
          throw new ArgumentOutOfRangeException ("charCount", _("ArgRange_NonNegative"));
      }
      // Widen before multiplying: in 32-bit arithmetic charCount * 4
      // wraps around once charCount exceeds int.MaxValue / 4 and would
      // hand the caller a meaningless (possibly negative) bound.
      long maxBytes = (long) charCount * 4;
      if (maxBytes > int.MaxValue) {
          throw new ArgumentOutOfRangeException ("charCount");
      }
      return (int) maxBytes;
  }
  // Upper bound on the number of characters produced by decoding
  // byteCount bytes: the byte count itself is returned unchanged.
  // A negative byteCount is rejected with ArgumentOutOfRangeException.
  public override int GetMaxCharCount (int byteCount)
  {
      if (byteCount >= 0)
          return byteCount;

      throw new ArgumentOutOfRangeException ("byteCount", _("ArgRange_NonNegative"));
  }
  // Create a new UTF8Decoder configured from this encoding: it is
  // given the decoder fallback on NET_2_0 builds and the
  // throwOnInvalid flag otherwise.
  public override Decoder GetDecoder ()
  {
      Decoder decoder;
  #if NET_2_0
      decoder = new UTF8Decoder (DecoderFallback);
  #else
      decoder = new UTF8Decoder (throwOnInvalid);
  #endif
      return decoder;
  }
  // Create a new UTF8Encoder that is given this encoding's
  // emitIdentifier setting.
  public override Encoder GetEncoder ()
  {
      Encoder encoder = new UTF8Encoder (emitIdentifier);
      return encoder;
  }
  // Return the UTF-8 preamble: the three bytes EF BB BF when
  // emitIdentifier is set, otherwise an empty array.  A fresh array
  // is allocated on every call.
  public override byte[] GetPreamble ()
  {
      if (!emitIdentifier)
          return new byte [0];

      return new byte [] { (byte)0xEF, (byte)0xBB, (byte)0xBF };
  }
  // Determine if this object is equal to another.
  //
  // Two UTF8Encoding instances are equal when they share the same
  // code page and emitIdentifier setting and, depending on the build,
  // either equal fallbacks (NET_2_0) or the same throwOnInvalid flag.
  // Any value that is not a UTF8Encoding (including null) is unequal.
  public override bool Equals (Object value)
  {
      UTF8Encoding enc = (value as UTF8Encoding);
      if (enc == null)
          return false;

  #if NET_2_0
      // Compare the fallbacks with Object.Equals rather than ==:
      // the == operator on these reference types tests identity, so
      // two encodings holding distinct but equivalent fallback
      // instances would otherwise be reported as different.  The
      // static form is also safe if either side is null.
      return (codePage == enc.codePage &&
              emitIdentifier == enc.emitIdentifier &&
              Object.Equals (DecoderFallback, enc.DecoderFallback) &&
              Object.Equals (EncoderFallback, enc.EncoderFallback));
  #else
      return (codePage == enc.codePage &&
              emitIdentifier == enc.emitIdentifier &&
              throwOnInvalid == enc.throwOnInvalid);
  #endif
  }
  // Hash code for this object; the computation is delegated
  // entirely to the base class.
  public override int GetHashCode ()
  {
      int hash = base.GetHashCode ();
      return hash;
  }
  1000. #if NET_2_0
  [MonoTODO]
  public override int GetByteCount (string s)
  {
      // Open question: this override only forwards to the base
      // class, so it is unclear whether it is needed at all.
      int byteCount = base.GetByteCount (s);
      return byteCount;
  }
  [MonoTODO]
  public override string GetString (byte [] bytes, int index, int count)
  {
      // Open question: this override only forwards to the base
      // class, so it is unclear whether it is needed at all.
      string decoded = base.GetString (bytes, index, count);
      return decoded;
  }
  1013. #endif
  1014. #if !NET_2_0
  // Encode a whole string into a newly allocated byte array sized
  // by GetByteCount.  Throws ArgumentNullException when s is null.
  public override byte [] GetBytes (String s)
  {
      if (s == null) {
          throw new ArgumentNullException ("s");
      }

      byte [] encoded = new byte [GetByteCount (s)];
      GetBytes (s, 0, s.Length, encoded, 0);
      return encoded;
  }
  1024. #endif
  // Stateful UTF-8 decoder.  It remembers the bits and the packed
  // count of an unfinished multi-byte sequence between calls, so the
  // input may be supplied in several chunks.
  [Serializable]
  private class UTF8Decoder : Decoder
  {
  #if !NET_2_0
      private bool throwOnInvalid;
  #endif
      // Left-over state shared with InternalGetChars / InternalGetCharCount.
      private uint leftOverBits;
      private uint leftOverCount;

  #if NET_2_0
      // Create a decoder that uses the given fallback; the left-over
      // state starts out empty.
      public UTF8Decoder (DecoderFallback fallback)
      {
          Fallback = fallback;
          leftOverBits = 0;
          leftOverCount = 0;
      }
  #else
      // Create a decoder with the given invalid-input policy; the
      // left-over state starts out empty.
      public UTF8Decoder (bool throwOnInvalid)
      {
          this.throwOnInvalid = throwOnInvalid;
          leftOverBits = 0;
          leftOverCount = 0;
      }
  #endif

      // Count the characters the given bytes would decode to.  The
      // left-over state is passed by value here, so counting does not
      // modify the decoder.
      public override int GetCharCount (byte[] bytes, int index, int count)
      {
          int total;
  #if NET_2_0
          byte [] scratch = null;
          DecoderFallbackBuffer fallbackBuf = null;
          total = InternalGetCharCount (bytes, index, count,
                                        leftOverBits, leftOverCount,
                                        this, ref fallbackBuf, ref scratch, false);
  #else
          total = InternalGetCharCount (bytes, index, count,
                                        leftOverBits, leftOverCount,
                                        throwOnInvalid, false);
  #endif
          return total;
      }

      // Decode the given bytes into chars.  The left-over state is
      // passed by reference, so an unfinished trailing sequence is
      // kept for the next call.
      public override int GetChars (byte[] bytes, int byteIndex,
                                    int byteCount, char[] chars, int charIndex)
      {
          int produced;
  #if NET_2_0
          byte [] scratch = null;
          DecoderFallbackBuffer fallbackBuf = null;
          produced = InternalGetChars (bytes, byteIndex, byteCount, chars, charIndex,
                                       ref leftOverBits, ref leftOverCount,
                                       this, ref fallbackBuf, ref scratch, false);
  #else
          produced = InternalGetChars (bytes, byteIndex, byteCount, chars, charIndex,
                                       ref leftOverBits, ref leftOverCount,
                                       throwOnInvalid, false);
  #endif
          return produced;
      }
  } // class UTF8Decoder
  // Stateful UTF-8 encoder.  It keeps one left-over character for
  // counting and a separate one for conversion, each handed to the
  // matching Internal* helper by reference.
  [Serializable]
  private class UTF8Encoder : Encoder
  {
      private bool emitIdentifier;
      private char leftOverForCount;
      private char leftOverForConv;

      // Create an encoder with the given emitIdentifier setting and
      // no left-over characters.
      public UTF8Encoder (bool emitIdentifier)
      {
          leftOverForConv = '\0';
          leftOverForCount = '\0';
          this.emitIdentifier = emitIdentifier;
      }

      // Count the bytes needed for a range of a char array; uses and
      // may update the counting left-over character.
      public override int GetByteCount (char[] chars, int index,
                                        int count, bool flush)
      {
          int needed = InternalGetByteCount (chars, index, count, ref leftOverForCount, flush);
          return needed;
      }

      // Encode a range of a char array into a byte array; uses and
      // may update the conversion left-over character.  emitIdentifier
      // is cleared once the conversion has returned.
      public override int GetBytes (char[] chars, int charIndex,
                                    int charCount, byte[] bytes, int byteIndex, bool flush)
      {
          int written = InternalGetBytes (chars, charIndex, charCount, bytes, byteIndex, ref leftOverForConv, flush);
          emitIdentifier = false;
          return written;
      }

  #if NET_2_0
      // Pointer-based variant of GetByteCount.
      public unsafe override int GetByteCount (char* chars, int count, bool flush)
      {
          int needed = InternalGetByteCount (chars, count, ref leftOverForCount, flush);
          return needed;
      }

      // Pointer-based variant of GetBytes; emitIdentifier is cleared
      // once the conversion has returned.
      public unsafe override int GetBytes (char* chars, int charCount,
                                           byte* bytes, int byteCount, bool flush)
      {
          int written = InternalGetBytes (chars, charCount, bytes, byteCount, ref leftOverForConv, flush);
          emitIdentifier = false;
          return written;
      }
  #endif
  } // class UTF8Encoder
  1119. }; // class UTF8Encoding
  1120. }; // namespace System.Text