// UTF8Encoding.cs (26 KB)
  1. /*
  2. * UTF8Encoding.cs - Implementation of the "System.Text.UTF8Encoding" class.
  3. *
  4. * Copyright (c) 2001, 2002 Southern Storm Software, Pty Ltd
  5. *
  6. * Permission is hereby granted, free of charge, to any person obtaining
  7. * a copy of this software and associated documentation files (the "Software"),
  8. * to deal in the Software without restriction, including without limitation
  9. * the rights to use, copy, modify, merge, publish, distribute, sublicense,
  10. * and/or sell copies of the Software, and to permit persons to whom the
  11. * Software is furnished to do so, subject to the following conditions:
  12. *
  13. * The above copyright notice and this permission notice shall be included
  14. * in all copies or substantial portions of the Software.
  15. *
  16. * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
  17. * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  18. * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
  19. * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
  20. * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
  21. * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
  22. * OTHER DEALINGS IN THE SOFTWARE.
  23. */
  24. namespace System.Text
  25. {
  26. using System;
  27. public class UTF8Encoding : Encoding
  28. {
  // Windows code page identifier for UTF-8; handed to the base
  // Encoding constructor by the main constructor below.
  internal const int UTF8_CODE_PAGE = 65001;

  // True if encoding operations prepend the three-byte UTF-8
  // identifier (EF BB BF) to their output.
  private bool emitIdentifier;

  // True if decoding operations throw on malformed UTF-8 instead of
  // skipping over it.
  private bool throwOnInvalid;

  // Construct an encoding that emits no identifier and does not
  // throw on invalid bytes.
  public UTF8Encoding()
      : this(false, false)
  {
  }

  // Construct an encoding with an explicit identifier setting that
  // does not throw on invalid bytes.
  public UTF8Encoding(bool encoderShouldEmitUTF8Identifier)
      : this(encoderShouldEmitUTF8Identifier, false)
  {
  }

  // Construct an encoding with explicit identifier and validation
  // settings.  The other constructors delegate to this one.
  public UTF8Encoding(bool encoderShouldEmitUTF8Identifier,
                      bool throwOnInvalidBytes)
      : base(UTF8_CODE_PAGE)
  {
      this.throwOnInvalid = throwOnInvalidBytes;
      this.emitIdentifier = encoderShouldEmitUTF8Identifier;
  }
  // Count the bytes that encoding chars[index .. index + count) would
  // produce, continuing from a previous call's state.
  //
  //   leftOver       - a high surrogate carried over from the previous
  //                    call, or zero if there is none.
  //   emitIdentifier - add three bytes for the UTF-8 identifier.
  //   flush          - also count a high surrogate that is still
  //                    unpaired when the input runs out.
  //
  // Throws ArgumentNullException if "chars" is null and
  // ArgumentOutOfRangeException if "index"/"count" do not describe a
  // range inside "chars".
  private static int InternalGetByteCount(char[] chars, int index,
                                          int count, uint leftOver,
                                          bool emitIdentifier, bool flush)
  {
      // Validate the parameters.
      if(chars == null)
      {
          throw new ArgumentNullException("chars");
      }
      if(index < 0 || index > chars.Length)
      {
          throw new ArgumentOutOfRangeException("index", _("ArgRange_Array"));
      }
      if(count < 0 || count > (chars.Length - index))
      {
          throw new ArgumentOutOfRangeException("count", _("ArgRange_Array"));
      }

      // Start from the identifier size and add each character's length.
      int total = (emitIdentifier ? 3 : 0);
      uint pendingHigh = leftOver;
      int stop = index + count;
      int pos = index;
      while(pos < stop)
      {
          char current = chars[pos];
          if(pendingHigh != 0)
          {
              pendingHigh = 0;
              if(current >= '\uDC00' && current <= '\uDFFF')
              {
                  // High + low surrogate: one four-byte sequence.
                  total += 4;
                  ++pos;
              }
              else
              {
                  // Unpaired high surrogate: count it as three bytes
                  // on its own and look at "current" again without
                  // advancing.
                  total += 3;
              }
              continue;
          }
          if(current < '\u0080')
          {
              total += 1;
          }
          else if(current < '\u0800')
          {
              total += 2;
          }
          else if(current >= '\uD800' && current <= '\uDBFF')
          {
              // Possible start of a surrogate pair: defer the count
              // until the next unit has been seen.
              pendingHigh = (uint)current;
          }
          else
          {
              total += 3;
          }
          ++pos;
      }

      // A trailing high surrogate is only counted when flushing.
      if(flush && pendingHigh != 0)
      {
          total += 3;
      }
      return total;
  }
  // Get the number of bytes needed to encode a range of a character
  // buffer.  A one-shot count starts with no carried-over surrogate
  // and always flushes.
  public override int GetByteCount(char[] chars, int index, int count)
  {
      const uint noLeftOver = 0;
      const bool flush = true;
      return InternalGetByteCount(chars, index, count, noLeftOver,
                                  emitIdentifier, flush);
  }
  // Get the number of bytes needed to encode a whole string, including
  // the three-byte identifier when this encoding emits one.
  // Throws ArgumentNullException if "s" is null.
  public override int GetByteCount(String s)
  {
      // Validate the parameters.
      if(s == null)
      {
          throw new ArgumentNullException("s");
      }

      int total = (emitIdentifier ? 3 : 0);
      int limit = s.Length;
      for(int i = 0; i < limit; ++i)
      {
          char current = s[i];
          if(current < '\u0080')
          {
              total += 1;
          }
          else if(current < '\u0800')
          {
              total += 2;
          }
          else if(current >= '\uD800' && current <= '\uDBFF' &&
                  (i + 1) < limit &&
                  s[i + 1] >= '\uDC00' && s[i + 1] <= '\uDFFF')
          {
              // A complete surrogate pair encodes as four bytes;
              // step over the low half as well.
              total += 4;
              ++i;
          }
          else
          {
              // Everything else in the BMP, unpaired surrogates
              // included, takes three bytes.
              total += 3;
          }
      }
      return total;
  }
  // Encode chars[charIndex .. charIndex + charCount) as UTF-8 into
  // "bytes" starting at "byteIndex", continuing from a previous
  // call's state.  Returns the number of bytes written.
  //
  //   leftOver       - on entry, a high surrogate carried over from
  //                    the previous call (or zero); on normal return,
  //                    the high surrogate still waiting for its low
  //                    half (or zero).  Left untouched if an
  //                    exception is thrown.
  //   emitIdentifier - write the identifier EF BB BF first.
  //   flush          - write a still-unpaired high surrogate as a
  //                    three-byte sequence instead of carrying it.
  //
  // Throws ArgumentNullException for null arrays,
  // ArgumentOutOfRangeException for bad indexes/counts, and
  // ArgumentException when "bytes" is too small.
  private static int InternalGetBytes(char[] chars, int charIndex,
                                      int charCount, byte[] bytes,
                                      int byteIndex, ref uint leftOver,
                                      bool emitIdentifier, bool flush)
  {
      // Validate the parameters.
      if(chars == null)
      {
          throw new ArgumentNullException("chars");
      }
      if(bytes == null)
      {
          throw new ArgumentNullException("bytes");
      }
      if(charIndex < 0 || charIndex > chars.Length)
      {
          throw new ArgumentOutOfRangeException("charIndex", _("ArgRange_Array"));
      }
      if(charCount < 0 || charCount > (chars.Length - charIndex))
      {
          throw new ArgumentOutOfRangeException("charCount", _("ArgRange_Array"));
      }
      if(byteIndex < 0 || byteIndex > bytes.Length)
      {
          throw new ArgumentOutOfRangeException("byteIndex", _("ArgRange_Array"));
      }

      int limit = bytes.Length;
      int outPos = byteIndex;
      uint pendingHigh = leftOver;
      int inPos = charIndex;
      int inEnd = charIndex + charCount;

      // Write the identifier first, if requested.
      if(emitIdentifier)
      {
          if((outPos + 3) > limit)
          {
              throw new ArgumentException(_("Arg_InsufficientSpace"), "bytes");
          }
          bytes[outPos++] = (byte)0xEF;
          bytes[outPos++] = (byte)0xBB;
          bytes[outPos++] = (byte)0xBF;
      }

      while(inPos < inEnd)
      {
          // Work out the next code point to encode.
          char current = chars[inPos];
          uint code;
          if(pendingHigh == 0)
          {
              ++inPos;
              if(current >= '\uD800' && current <= '\uDBFF')
              {
                  // Start of a surrogate pair: wait for the low half.
                  pendingHigh = (uint)current;
                  continue;
              }
              code = (uint)current;
          }
          else if(current >= '\uDC00' && current <= '\uDFFF')
          {
              // Combine the surrogate pair into one code point.
              ++inPos;
              code = ((pendingHigh - (uint)0xD800) << 10) +
                     (((uint)current) - (uint)0xDC00) +
                     (uint)0x10000;
              pendingHigh = 0;
          }
          else
          {
              // High surrogate followed by something else.  This is
              // technically invalid; write the high surrogate by
              // itself and leave "current" unconsumed so it is
              // examined again on the next iteration.
              code = pendingHigh;
              pendingHigh = 0;
          }

          // Encode the code point, checking for space first.
          if(code < (uint)0x0080)
          {
              if(outPos >= limit)
              {
                  throw new ArgumentException(_("Arg_InsufficientSpace"), "bytes");
              }
              bytes[outPos++] = (byte)code;
          }
          else if(code < (uint)0x0800)
          {
              if((outPos + 2) > limit)
              {
                  throw new ArgumentException(_("Arg_InsufficientSpace"), "bytes");
              }
              bytes[outPos++] = (byte)(0xC0 | (code >> 6));
              bytes[outPos++] = (byte)(0x80 | (code & 0x3F));
          }
          else if(code < (uint)0x10000)
          {
              if((outPos + 3) > limit)
              {
                  throw new ArgumentException(_("Arg_InsufficientSpace"), "bytes");
              }
              bytes[outPos++] = (byte)(0xE0 | (code >> 12));
              bytes[outPos++] = (byte)(0x80 | ((code >> 6) & 0x3F));
              bytes[outPos++] = (byte)(0x80 | (code & 0x3F));
          }
          else
          {
              if((outPos + 4) > limit)
              {
                  throw new ArgumentException(_("Arg_InsufficientSpace"), "bytes");
              }
              bytes[outPos++] = (byte)(0xF0 | (code >> 18));
              bytes[outPos++] = (byte)(0x80 | ((code >> 12) & 0x3F));
              bytes[outPos++] = (byte)(0x80 | ((code >> 6) & 0x3F));
              bytes[outPos++] = (byte)(0x80 | (code & 0x3F));
          }
      }

      // When flushing, a dangling high surrogate goes out as three bytes.
      if(flush && pendingHigh != 0)
      {
          if((outPos + 3) > limit)
          {
              throw new ArgumentException(_("Arg_InsufficientSpace"), "bytes");
          }
          bytes[outPos++] = (byte)(0xE0 | (pendingHigh >> 12));
          bytes[outPos++] = (byte)(0x80 | ((pendingHigh >> 6) & 0x3F));
          bytes[outPos++] = (byte)(0x80 | (pendingHigh & 0x3F));
          pendingHigh = 0;
      }

      // Publish the rolling state and report how much was written.
      leftOver = pendingHigh;
      return outPos - byteIndex;
  }
  // Encode a range of a character buffer into "bytes" at "byteIndex"
  // and return the number of bytes written.  This is a one-shot
  // conversion: it starts with no carried-over surrogate and always
  // flushes, so the rolling state is discarded afterwards.
  public override int GetBytes(char[] chars, int charIndex, int charCount,
                               byte[] bytes, int byteIndex)
  {
      uint discardedState = 0;
      int written = InternalGetBytes(chars, charIndex, charCount,
                                     bytes, byteIndex, ref discardedState,
                                     emitIdentifier, true);
      return written;
  }
  // Encode s[charIndex .. charIndex + charCount) as UTF-8 into "bytes"
  // starting at "byteIndex", and return the number of bytes written.
  // The identifier EF BB BF is written first when this encoding emits
  // one.
  //
  // A high surrogate immediately followed, inside the requested range,
  // by a low surrogate is written as a single four-byte sequence.  Any
  // other surrogate is written on its own as three bytes.
  //
  // Throws ArgumentNullException for a null string or array,
  // ArgumentOutOfRangeException for bad indexes/counts, and
  // ArgumentException when "bytes" is too small.
  public override int GetBytes(String s, int charIndex, int charCount,
                               byte[] bytes, int byteIndex)
  {
      // Validate the parameters.
      if(s == null)
      {
          throw new ArgumentNullException("s");
      }
      if(bytes == null)
      {
          throw new ArgumentNullException("bytes");
      }
      if(charIndex < 0 || charIndex > s.Length)
      {
          throw new ArgumentOutOfRangeException
              ("charIndex", _("ArgRange_StringIndex"));
      }
      if(charCount < 0 || charCount > (s.Length - charIndex))
      {
          throw new ArgumentOutOfRangeException
              ("charCount", _("ArgRange_StringRange"));
      }
      if(byteIndex < 0 || byteIndex > bytes.Length)
      {
          throw new ArgumentOutOfRangeException
              ("byteIndex", _("ArgRange_Array"));
      }

      // Convert the characters into bytes.
      char ch;
      int length = bytes.Length;
      uint pair;
      int posn = byteIndex;
      if(emitIdentifier)
      {
          if((posn + 3) > length)
          {
              throw new ArgumentException
                  (_("Arg_InsufficientSpace"), "bytes");
          }
          bytes[posn++] = (byte)0xEF;
          bytes[posn++] = (byte)0xBB;
          bytes[posn++] = (byte)0xBF;
      }
      while(charCount > 0)
      {
          // Fetch the next UTF-16 character pair value.
          ch = s[charIndex++];
          --charCount;

          // "charCount" has already been decremented, so it now holds
          // the number of units still unread.  One remaining unit is
          // enough to look ahead for the low surrogate; requiring more
          // than one would split a pair at the end of the range.
          if(ch >= '\uD800' && ch <= '\uDBFF' && charCount > 0)
          {
              // This may be the start of a surrogate pair.
              pair = (uint)(s[charIndex]);
              if(pair >= (uint)0xDC00 && pair <= (uint)0xDFFF)
              {
                  pair = (pair - (uint)0xDC00) +
                         ((((uint)ch) - (uint)0xD800) << 10) +
                         (uint)0x10000;
                  ++charIndex;
                  --charCount;
              }
              else
              {
                  // Not followed by a low surrogate: encode the high
                  // surrogate by itself.
                  pair = (uint)ch;
              }
          }
          else
          {
              pair = (uint)ch;
          }

          // Encode the character pair value.
          if(pair < (uint)0x0080)
          {
              if(posn >= length)
              {
                  throw new ArgumentException
                      (_("Arg_InsufficientSpace"), "bytes");
              }
              bytes[posn++] = (byte)pair;
          }
          else if(pair < (uint)0x0800)
          {
              if((posn + 2) > length)
              {
                  throw new ArgumentException
                      (_("Arg_InsufficientSpace"), "bytes");
              }
              bytes[posn++] = (byte)(0xC0 | (pair >> 6));
              bytes[posn++] = (byte)(0x80 | (pair & 0x3F));
          }
          else if(pair < (uint)0x10000)
          {
              if((posn + 3) > length)
              {
                  throw new ArgumentException
                      (_("Arg_InsufficientSpace"), "bytes");
              }
              bytes[posn++] = (byte)(0xE0 | (pair >> 12));
              bytes[posn++] = (byte)(0x80 | ((pair >> 6) & 0x3F));
              bytes[posn++] = (byte)(0x80 | (pair & 0x3F));
          }
          else
          {
              if((posn + 4) > length)
              {
                  throw new ArgumentException
                      (_("Arg_InsufficientSpace"), "bytes");
              }
              bytes[posn++] = (byte)(0xF0 | (pair >> 18));
              bytes[posn++] = (byte)(0x80 | ((pair >> 12) & 0x3F));
              bytes[posn++] = (byte)(0x80 | ((pair >> 6) & 0x3F));
              bytes[posn++] = (byte)(0x80 | (pair & 0x3F));
          }
      }

      // Return the final count to the caller.
      return posn - byteIndex;
  }
  // Count the UTF-16 code units that decoding
  // bytes[index .. index + count) would produce, continuing from a
  // previous call's state.
  //
  //   leftOverBits   - payload bits accumulated so far for a sequence
  //                    that was still incomplete at the end of the
  //                    previous call.
  //   leftOverCount  - packed sequence progress: the low four bits
  //                    hold the number of bytes seen so far, the next
  //                    four bits the total sequence length (zero when
  //                    no sequence is in progress).
  //   throwOnInvalid - throw ArgumentException on malformed input
  //                    instead of skipping it.
  //   flush          - treat an incomplete sequence at the end of the
  //                    input as malformed.
  //
  // A decoded U+FEFF contributes nothing to the count, and values of
  // 0x110000 or more are ignored (or rejected when throwing).
  //
  // Throws ArgumentNullException if "bytes" is null and
  // ArgumentOutOfRangeException if "index"/"count" do not describe a
  // range inside "bytes".
  private static int InternalGetCharCount(byte[] bytes, int index, int count,
                                          uint leftOverBits,
                                          uint leftOverCount,
                                          bool throwOnInvalid, bool flush)
  {
      // Validate the parameters.
      if(bytes == null)
      {
          throw new ArgumentNullException("bytes");
      }
      if(index < 0 || index > bytes.Length)
      {
          throw new ArgumentOutOfRangeException
              ("index", _("ArgRange_Array"));
      }
      if(count < 0 || count > (bytes.Length - index))
      {
          throw new ArgumentOutOfRangeException
              ("count", _("ArgRange_Array"));
      }

      // Determine the number of characters that we have.
      uint ch;
      int length = 0;
      uint leftBits = leftOverBits;
      uint leftSoFar = (leftOverCount & (uint)0x0F);
      uint leftSize = ((leftOverCount >> 4) & (uint)0x0F);
      while(count > 0)
      {
          ch = (uint)(bytes[index++]);
          --count;
          if(leftSize == 0)
          {
              // Process a UTF-8 start character.
              if(ch < (uint)0x0080)
              {
                  // Single-byte UTF-8 character.
                  ++length;
              }
              else if((ch & (uint)0xE0) == (uint)0xC0)
              {
                  // Double-byte UTF-8 character.
                  leftBits = (ch & (uint)0x1F);
                  leftSoFar = 1;
                  leftSize = 2;
              }
              else if((ch & (uint)0xF0) == (uint)0xE0)
              {
                  // Three-byte UTF-8 character.
                  leftBits = (ch & (uint)0x0F);
                  leftSoFar = 1;
                  leftSize = 3;
              }
              else if((ch & (uint)0xF8) == (uint)0xF0)
              {
                  // Four-byte UTF-8 character.
                  leftBits = (ch & (uint)0x07);
                  leftSoFar = 1;
                  leftSize = 4;
              }
              else if((ch & (uint)0xFC) == (uint)0xF8)
              {
                  // Five-byte UTF-8 character.
                  leftBits = (ch & (uint)0x03);
                  leftSoFar = 1;
                  leftSize = 5;
              }
              else if((ch & (uint)0xFE) == (uint)0xFC)
              {
                  // Six-byte UTF-8 character.  The lead byte is
                  // 1111110x, so only 0xFC and 0xFD qualify and a
                  // single payload bit is kept; 0xFE and 0xFF fall
                  // through to the invalid case below.
                  leftBits = (ch & (uint)0x01);
                  leftSoFar = 1;
                  leftSize = 6;
              }
              else
              {
                  // Invalid UTF-8 start character (a stray
                  // continuation byte, 0xFE or 0xFF): skip it unless
                  // the caller asked for an exception.
                  if(throwOnInvalid)
                  {
                      throw new ArgumentException
                          (_("Arg_InvalidUTF8"), "bytes");
                  }
              }
          }
          else
          {
              // Process an extra byte in a multi-byte sequence.
              if((ch & (uint)0xC0) == (uint)0x80)
              {
                  leftBits = ((leftBits << 6) | (ch & (uint)0x3F));
                  if(++leftSoFar >= leftSize)
                  {
                      // We have a complete character now.
                      if(leftBits < (uint)0x10000)
                      {
                          // U+FEFF is dropped rather than counted.
                          if(leftBits != (uint)0xFEFF)
                          {
                              ++length;
                          }
                      }
                      else if(leftBits < (uint)0x110000)
                      {
                          // Needs a surrogate pair in UTF-16.
                          length += 2;
                      }
                      else if(throwOnInvalid)
                      {
                          throw new ArgumentException
                              (_("Arg_InvalidUTF8"), "bytes");
                      }
                      leftSize = 0;
                  }
              }
              else
              {
                  // Invalid UTF-8 sequence: clear and restart.  The
                  // offending byte is pushed back so that it is
                  // re-examined as a possible start character.
                  if(throwOnInvalid)
                  {
                      throw new ArgumentException
                          (_("Arg_InvalidUTF8"), "bytes");
                  }
                  leftSize = 0;
                  --index;
                  ++count;
              }
          }
      }
      if(flush && leftSize != 0 && throwOnInvalid)
      {
          // We had left-over bytes that didn't make up
          // a complete UTF-8 character sequence.
          throw new ArgumentException
              (_("Arg_InvalidUTF8"), "bytes");
      }

      // Return the final length to the caller.
      return length;
  }
  // Get the number of characters that decoding a range of a byte
  // buffer produces.  A one-shot count starts with no partial
  // sequence and always flushes.
  public override int GetCharCount(byte[] bytes, int index, int count)
  {
      const uint noBits = 0;
      const uint noProgress = 0;
      return InternalGetCharCount(bytes, index, count, noBits, noProgress,
                                  throwOnInvalid, true);
  }
  // Decode bytes[byteIndex .. byteIndex + byteCount) from UTF-8 into
  // "chars" starting at "charIndex", continuing from a previous
  // call's state.  Returns the number of characters written.
  //
  //   leftOverBits   - payload bits accumulated so far for an
  //                    incomplete sequence; updated on normal return.
  //   leftOverCount  - packed sequence progress: the low four bits
  //                    hold the number of bytes seen so far, the next
  //                    four bits the total sequence length (zero when
  //                    no sequence is in progress); updated on normal
  //                    return.
  //   throwOnInvalid - throw ArgumentException on malformed input
  //                    instead of skipping it.
  //   flush          - treat an incomplete sequence at the end of the
  //                    input as malformed.
  //
  // A decoded U+FEFF is dropped, values below 0x110000 that do not
  // fit in one UTF-16 unit are written as a surrogate pair, and
  // larger values are ignored (or rejected when throwing).  Neither
  // state variable is updated if an exception is thrown.
  //
  // Throws ArgumentNullException for null arrays,
  // ArgumentOutOfRangeException for bad indexes/counts, and
  // ArgumentException when "chars" is too small.
  private static int InternalGetChars(byte[] bytes, int byteIndex,
                                      int byteCount, char[] chars,
                                      int charIndex, ref uint leftOverBits,
                                      ref uint leftOverCount,
                                      bool throwOnInvalid, bool flush)
  {
      // Validate the parameters.
      if(bytes == null)
      {
          throw new ArgumentNullException("bytes");
      }
      if(chars == null)
      {
          throw new ArgumentNullException("chars");
      }
      if(byteIndex < 0 || byteIndex > bytes.Length)
      {
          throw new ArgumentOutOfRangeException
              ("byteIndex", _("ArgRange_Array"));
      }
      if(byteCount < 0 || byteCount > (bytes.Length - byteIndex))
      {
          throw new ArgumentOutOfRangeException
              ("byteCount", _("ArgRange_Array"));
      }
      if(charIndex < 0 || charIndex > chars.Length)
      {
          throw new ArgumentOutOfRangeException
              ("charIndex", _("ArgRange_Array"));
      }

      // Convert the bytes into the output buffer.
      uint ch;
      int length = chars.Length;
      int posn = charIndex;
      uint leftBits = leftOverBits;
      uint leftSoFar = (leftOverCount & (uint)0x0F);
      uint leftSize = ((leftOverCount >> 4) & (uint)0x0F);
      while(byteCount > 0)
      {
          // Fetch the next character from the byte buffer.
          ch = (uint)(bytes[byteIndex++]);
          --byteCount;
          if(leftSize == 0)
          {
              // Process a UTF-8 start character.
              if(ch < (uint)0x0080)
              {
                  // Single-byte UTF-8 character.
                  if(posn >= length)
                  {
                      throw new ArgumentException
                          (_("Arg_InsufficientSpace"), "chars");
                  }
                  chars[posn++] = (char)ch;
              }
              else if((ch & (uint)0xE0) == (uint)0xC0)
              {
                  // Double-byte UTF-8 character.
                  leftBits = (ch & (uint)0x1F);
                  leftSoFar = 1;
                  leftSize = 2;
              }
              else if((ch & (uint)0xF0) == (uint)0xE0)
              {
                  // Three-byte UTF-8 character.
                  leftBits = (ch & (uint)0x0F);
                  leftSoFar = 1;
                  leftSize = 3;
              }
              else if((ch & (uint)0xF8) == (uint)0xF0)
              {
                  // Four-byte UTF-8 character.
                  leftBits = (ch & (uint)0x07);
                  leftSoFar = 1;
                  leftSize = 4;
              }
              else if((ch & (uint)0xFC) == (uint)0xF8)
              {
                  // Five-byte UTF-8 character.
                  leftBits = (ch & (uint)0x03);
                  leftSoFar = 1;
                  leftSize = 5;
              }
              else if((ch & (uint)0xFE) == (uint)0xFC)
              {
                  // Six-byte UTF-8 character.  The lead byte is
                  // 1111110x, so only 0xFC and 0xFD qualify and a
                  // single payload bit is kept; 0xFE and 0xFF fall
                  // through to the invalid case below.
                  leftBits = (ch & (uint)0x01);
                  leftSoFar = 1;
                  leftSize = 6;
              }
              else
              {
                  // Invalid UTF-8 start character (a stray
                  // continuation byte, 0xFE or 0xFF): skip it unless
                  // the caller asked for an exception.
                  if(throwOnInvalid)
                  {
                      throw new ArgumentException
                          (_("Arg_InvalidUTF8"), "bytes");
                  }
              }
          }
          else
          {
              // Process an extra byte in a multi-byte sequence.
              if((ch & (uint)0xC0) == (uint)0x80)
              {
                  leftBits = ((leftBits << 6) | (ch & (uint)0x3F));
                  if(++leftSoFar >= leftSize)
                  {
                      // We have a complete character now.
                      if(leftBits < (uint)0x10000)
                      {
                          // U+FEFF is dropped rather than stored.
                          if(leftBits != (uint)0xFEFF)
                          {
                              if(posn >= length)
                              {
                                  throw new ArgumentException
                                      (_("Arg_InsufficientSpace"),
                                       "chars");
                              }
                              chars[posn++] = (char)leftBits;
                          }
                      }
                      else if(leftBits < (uint)0x110000)
                      {
                          // Split into a high/low surrogate pair.
                          if((posn + 2) > length)
                          {
                              throw new ArgumentException
                                  (_("Arg_InsufficientSpace"),
                                   "chars");
                          }
                          leftBits -= (uint)0x10000;
                          chars[posn++] = (char)((leftBits >> 10) +
                                                 (uint)0xD800);
                          chars[posn++] =
                              (char)((leftBits & (uint)0x3FF) +
                                     (uint)0xDC00);
                      }
                      else if(throwOnInvalid)
                      {
                          throw new ArgumentException
                              (_("Arg_InvalidUTF8"), "bytes");
                      }
                      leftSize = 0;
                  }
              }
              else
              {
                  // Invalid UTF-8 sequence: clear and restart.  The
                  // offending byte is pushed back so that it is
                  // re-examined as a possible start character.
                  if(throwOnInvalid)
                  {
                      throw new ArgumentException
                          (_("Arg_InvalidUTF8"), "bytes");
                  }
                  leftSize = 0;
                  --byteIndex;
                  ++byteCount;
              }
          }
      }
      if(flush && leftSize != 0 && throwOnInvalid)
      {
          // We had left-over bytes that didn't make up
          // a complete UTF-8 character sequence.
          throw new ArgumentException
              (_("Arg_InvalidUTF8"), "bytes");
      }

      // Publish the rolling state for the next call.
      leftOverBits = leftBits;
      leftOverCount = (leftSoFar | (leftSize << 4));

      // Return the final length to the caller.
      return posn - charIndex;
  }
  // Decode a range of a byte buffer into "chars" at "charIndex" and
  // return the number of characters written.  This is a one-shot
  // conversion: it starts with no partial sequence and always
  // flushes, so the rolling state is discarded afterwards.
  public override int GetChars(byte[] bytes, int byteIndex, int byteCount,
                               char[] chars, int charIndex)
  {
      uint discardedBits = 0;
      uint discardedProgress = 0;
      int produced = InternalGetChars(bytes, byteIndex, byteCount,
                                      chars, charIndex,
                                      ref discardedBits,
                                      ref discardedProgress,
                                      throwOnInvalid, true);
      return produced;
  }
  // Get an upper bound on the number of bytes needed to encode
  // "charCount" characters: four bytes per character plus three for
  // the identifier when this encoding emits one.
  // Throws ArgumentOutOfRangeException if "charCount" is negative.
  // NOTE(review): the multiplication is not range-checked, so a very
  // large "charCount" presumably wraps around - confirm whether
  // callers can pass such values.
  public override int GetMaxByteCount(int charCount)
  {
      if(charCount < 0)
      {
          throw new ArgumentOutOfRangeException("charCount", _("ArgRange_NonNegative"));
      }
      int identifierSize = (emitIdentifier ? 3 : 0);
      return charCount * 4 + identifierSize;
  }
  // Get an upper bound on the number of characters produced by
  // decoding "byteCount" bytes, which is "byteCount" itself.
  // Throws ArgumentOutOfRangeException if "byteCount" is negative.
  public override int GetMaxCharCount(int byteCount)
  {
      if(byteCount >= 0)
      {
          return byteCount;
      }
      throw new ArgumentOutOfRangeException("byteCount", _("ArgRange_NonNegative"));
  }
  // Create a new stateful UTF-8 decoder that inherits this
  // instance's invalid-byte policy.
  public override Decoder GetDecoder()
  {
      Decoder decoder = new UTF8Decoder(throwOnInvalid);
      return decoder;
  }
  // Create a new stateful UTF-8 encoder that inherits this
  // instance's identifier setting.
  public override Encoder GetEncoder()
  {
      Encoder encoder = new UTF8Encoder(emitIdentifier);
      return encoder;
  }
  // Get the UTF-8 preamble: the bytes EF BB BF when this encoding
  // emits an identifier, otherwise an empty array.  A fresh array is
  // returned on every call.
  public override byte[] GetPreamble()
  {
      if(!emitIdentifier)
      {
          return new byte [0];
      }
      return new byte [] { (byte)0xEF, (byte)0xBB, (byte)0xBF };
  }
  // Determine if this object is equal to another.  Two instances are
  // equal when the other object is a UTF8Encoding with the same code
  // page, identifier setting and invalid-byte policy.
  public override bool Equals(Object value)
  {
      UTF8Encoding other = (value as UTF8Encoding);
      if(other == null)
      {
          // Null, or not a UTF8Encoding at all.
          return false;
      }
      return (codePage == other.codePage &&
              emitIdentifier == other.emitIdentifier &&
              throwOnInvalid == other.throwOnInvalid);
  }
  // Get the hash code for this object.  The value comes straight
  // from the base class; the two boolean settings compared by
  // Equals are not mixed in here.
  public override int GetHashCode()
  {
      int hash = base.GetHashCode();
      return hash;
  }
  #if !ECMA_COMPAT

  // The properties in this section are compiled only when
  // ECMA_COMPAT is not defined.

  // Name used for this encoding in mail bodies.
  public override String BodyName
  {
      get { return "utf-8"; }
  }

  // Human-readable name for this encoding.
  public override String EncodingName
  {
      get { return "Unicode (UTF-8)"; }
  }

  // Name used for this encoding in mail agent headers.
  public override String HeaderName
  {
      get { return "utf-8"; }
  }

  // This encoding can be displayed in a Web browser.
  public override bool IsBrowserDisplay
  {
      get { return true; }
  }

  // This encoding can be saved from a Web browser.
  public override bool IsBrowserSave
  {
      get { return true; }
  }

  // This encoding can be displayed in a mail/news agent.
  public override bool IsMailNewsDisplay
  {
      get { return true; }
  }

  // This encoding can be saved from a mail/news agent.
  public override bool IsMailNewsSave
  {
      get { return true; }
  }

  // IANA-preferred Web name for this encoding.
  public override String WebName
  {
      get { return "utf-8"; }
  }

  // Windows code page reported for this encoding.  Note that this
  // is the code page constant declared by UnicodeEncoding, not
  // UTF8_CODE_PAGE.
  public override int WindowsCodePage
  {
      get { return UnicodeEncoding.UNICODE_CODE_PAGE; }
  }

  #endif // !ECMA_COMPAT
  // Stateful UTF-8 decoder.  It remembers an incomplete multi-byte
  // sequence between calls so that input may be supplied in
  // arbitrary pieces; it never flushes, so a dangling partial
  // sequence is carried rather than reported.
  private sealed class UTF8Decoder : Decoder
  {
      // Invalid-byte policy copied from the owning encoding.
      private bool throwOnInvalid;

      // Payload bits gathered so far for the sequence in progress.
      private uint leftOverBits;

      // Packed progress of the sequence in progress, in the format
      // used by InternalGetCharCount and InternalGetChars.
      private uint leftOverCount;

      // Constructor.  Starts with no sequence in progress.
      public UTF8Decoder(bool throwOnInvalid)
      {
          this.leftOverCount = 0;
          this.leftOverBits = 0;
          this.throwOnInvalid = throwOnInvalid;
      }

      // Count the characters the next GetChars call would produce
      // for the same input.  The saved state is read but not changed.
      public override int GetCharCount(byte[] bytes, int index, int count)
      {
          int total = InternalGetCharCount(bytes, index, count,
                                           this.leftOverBits,
                                           this.leftOverCount,
                                           this.throwOnInvalid, false);
          return total;
      }

      // Decode the next piece of input and update the saved state.
      public override int GetChars(byte[] bytes, int byteIndex,
                                   int byteCount, char[] chars,
                                   int charIndex)
      {
          int produced = InternalGetChars(bytes, byteIndex, byteCount,
                                          chars, charIndex,
                                          ref this.leftOverBits,
                                          ref this.leftOverCount,
                                          this.throwOnInvalid, false);
          return produced;
      }

  } // class UTF8Decoder
  // Stateful UTF-8 encoder.  It remembers an unpaired high surrogate
  // between calls so that input may be supplied in arbitrary pieces.
  private sealed class UTF8Encoder : Encoder
  {
      // True until the first successful GetBytes call; while true,
      // output is prefixed with the UTF-8 identifier.
      private bool emitIdentifier;

      // High surrogate carried over from the previous call, or zero.
      private uint leftOver;

      // Constructor.  Starts with no surrogate carried over.
      public UTF8Encoder(bool emitIdentifier)
      {
          this.leftOver = 0;
          this.emitIdentifier = emitIdentifier;
      }

      // Count the bytes the next GetBytes call would produce for the
      // same input.  The saved state is read but not changed.
      public override int GetByteCount(char[] chars, int index,
                                       int count, bool flush)
      {
          int total = InternalGetByteCount(chars, index, count,
                                           this.leftOver,
                                           this.emitIdentifier, flush);
          return total;
      }

      // Encode the next piece of input and update the saved state.
      // Despite its name, "byteCount" is passed on as the starting
      // offset within "bytes".  The identifier is switched off only
      // after a call that returns normally.
      public override int GetBytes(char[] chars, int charIndex,
                                   int charCount, byte[] bytes,
                                   int byteCount, bool flush)
      {
          int written = InternalGetBytes(chars, charIndex, charCount,
                                         bytes, byteCount,
                                         ref this.leftOver,
                                         this.emitIdentifier, flush);
          this.emitIdentifier = false;
          return written;
      }

  } // class UTF8Encoder
  981. }; // class UTF8Encoding
  982. }; // namespace System.Text