UTF8Encoding.cs 37 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858
  1. // Licensed to the .NET Foundation under one or more agreements.
  2. // The .NET Foundation licenses this file to you under the MIT license.
  3. // See the LICENSE file in the project root for more information.
  4. // The worker functions in this file were optimized for performance. If you make changes
  5. // you should take care to consider all of the interesting cases.
  6. // The code of all worker functions in this file is written twice: once as a slow loop, and a
  7. // second time as a fast loop. The slow loop handles all special cases, throws exceptions, etc.
  8. // The fast loop attempts to blaze through as fast as possible with optimistic range checks,
  9. // processing multiple characters at a time, and falling back to the slow loop for all special cases.
  10. using System.Buffers;
  11. using System.Diagnostics;
  12. using System.Runtime.CompilerServices;
  13. using System.Runtime.InteropServices;
  14. using System.Text.Unicode;
  15. namespace System.Text
  16. {
  17. // Encodes text into and out of UTF-8. UTF-8 is a way of writing
  18. // Unicode characters with variable numbers of bytes per character,
  19. // optimized for the lower 127 ASCII characters. It's an efficient way
  20. // of encoding US English in an internationalizable way.
  21. //
  22. // Don't override IsAlwaysNormalized because it is just a Unicode Transformation and could be confused.
  23. //
  24. // The UTF-8 byte order mark is simply the Unicode byte order mark
  25. // (0xFEFF) written in UTF-8 (0xEF 0xBB 0xBF). The byte order mark is
  26. // used mostly to distinguish UTF-8 text from other encodings, and doesn't
  27. // switch the byte orderings.
  28. public partial class UTF8Encoding : Encoding
  29. {
  30. /*
  31. bytes bits UTF-8 representation
  32. ----- ---- -----------------------------------
  33. 1 7 0vvvvvvv
  34. 2 11 110vvvvv 10vvvvvv
  35. 3 16 1110vvvv 10vvvvvv 10vvvvvv
  36. 4 21 11110vvv 10vvvvvv 10vvvvvv 10vvvvvv
  37. ----- ---- -----------------------------------
  38. Surrogate:
  39. Real Unicode value = (HighSurrogate - 0xD800) * 0x400 + (LowSurrogate - 0xDC00) + 0x10000
  40. */
  // Code page number handed to the Encoding base constructor.
  private const int UTF8_CODEPAGE = 65001;

  /// <summary>
  /// Worst-case number of UTF-8 bytes produced per UTF-16 input char (3:1).
  /// </summary>
  /// <remarks>
  /// A supplementary code point takes two UTF-16 chars and four UTF-8 bytes (4:2),
  /// so three bytes per char is still the correct upper bound.
  /// </remarks>
  private const int MaxUtf8BytesPerChar = 3;

  // Used by Encoding.UTF8 for lazy initialization: this initializer does not run
  // until a static member of the class is first referenced.
  internal static readonly UTF8EncodingSealed s_default = new UTF8EncodingSealed(encoderShouldEmitUTF8Identifier: true);

  // The UTF-8 identifier, i.e. U+FEFF written as UTF-8 (EF BB BF).
  // Relies on the C# compiler's optimization for static byte[] data.
  internal static ReadOnlySpan<byte> PreambleSpan => new byte[] { 0xEF, 0xBB, 0xBF };

  // Whether the encoder should emit the UTF-8 identifier (set by the constructors; defaults to false).
  private readonly bool _emitUTF8Identifier;

  // Whether the exception fallbacks are installed instead of the replacement fallbacks (defaults to false).
  private readonly bool _isThrowException;
  // Creates an instance with the UTF-8 identifier switched off; delegates to the
  // single-bool constructor.
  public UTF8Encoding()
      : this(encoderShouldEmitUTF8Identifier: false)
  {
  }
  // Creates an instance bound to the UTF-8 code page and records whether the
  // UTF-8 identifier should be emitted.
  public UTF8Encoding(bool encoderShouldEmitUTF8Identifier)
      : base(UTF8_CODEPAGE)
  {
      this._emitUTF8Identifier = encoderShouldEmitUTF8Identifier;
  }
  // Creates an instance and, when throwOnInvalidBytes is true, switches it to the
  // exception fallbacks.
  public UTF8Encoding(bool encoderShouldEmitUTF8Identifier, bool throwOnInvalidBytes)
      : this(encoderShouldEmitUTF8Identifier)
  {
      _isThrowException = throwOnInvalidBytes;

      // Encoding's constructor already set the fallbacks, but it did so before
      // _isThrowException was assigned; redo it when exceptions were requested.
      if (throwOnInvalidBytes)
      {
          SetDefaultFallbacks();
      }
  }
  // Installs the encoder/decoder fallbacks that match the _isThrowException setting.
  internal sealed override void SetDefaultFallbacks()
  {
      if (!_isThrowException)
      {
          // Lenient mode: fall back to replacing with U+FFFD.
          this.encoderFallback = new EncoderReplacementFallback("\xFFFD");
          this.decoderFallback = new DecoderReplacementFallback("\xFFFD");
          return;
      }

      // Strict mode: use the exception fallbacks.
      this.encoderFallback = EncoderFallback.ExceptionFallback;
      this.decoderFallback = DecoderFallback.ExceptionFallback;
  }
  88. // WARNING: GetByteCount(string chars)
  89. // WARNING: has different variable names than EncodingNLS.cs, so this can't just be cut & pasted,
  90. // WARNING: otherwise it'll break VB's way of declaring these.
  91. //
  92. // The following methods are copied from EncodingNLS.cs.
  93. // Unfortunately EncodingNLS.cs is internal and we're public, so we have to re-implement them here.
  94. // These should be kept in sync for the following classes:
  95. // EncodingNLS, UTF7Encoding, UTF8Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding
  96. // Returns the number of bytes required to encode a range of characters in
  97. // a character array.
  98. //
  99. // All of our public Encodings that don't use EncodingNLS must have this (including EncodingNLS)
  100. // So if you fix this, fix the others. Currently those include:
  101. // EncodingNLS, UTF7Encoding, UTF8Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding
  102. // parent method is safe
  // Returns the number of bytes needed to encode `count` chars of `chars`, starting at `index`.
  // Arguments are validated here; the counting itself is done by GetByteCountCommon.
  public override unsafe int GetByteCount(char[] chars, int index, int count)
  {
      if (chars is null)
      {
          ThrowHelper.ThrowArgumentNullException(ExceptionArgument.chars, ExceptionResource.ArgumentNull_Array);
      }

      // OR-ing the two values yields a negative number iff at least one of them is negative.
      if ((index | count) < 0)
      {
          ExceptionArgument offender = (index < 0) ? ExceptionArgument.index : ExceptionArgument.count;
          ThrowHelper.ThrowArgumentOutOfRangeException(offender, ExceptionResource.ArgumentOutOfRange_NeedNonNegNum);
      }

      // The requested range must fit in what remains of the array after `index`.
      int available = chars.Length - index;
      if (count > available)
      {
          ThrowHelper.ThrowArgumentOutOfRangeException(ExceptionArgument.chars, ExceptionResource.ArgumentOutOfRange_IndexCountBuffer);
      }

      fixed (char* pFirst = chars)
      {
          char* pStart = pFirst + index;
          return GetByteCountCommon(pStart, count);
      }
  }
  123. // All of our public Encodings that don't use EncodingNLS must have this (including EncodingNLS)
  124. // So if you fix this, fix the others. Currently those include:
  125. // EncodingNLS, UTF7Encoding, UTF8Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding
  126. // parent method is safe
  // Returns the number of bytes needed to encode the whole string.
  public override unsafe int GetByteCount(string chars)
  {
      if (chars is null)
      {
          ThrowHelper.ThrowArgumentNullException(ExceptionArgument.chars);
      }

      int length = chars.Length;
      fixed (char* pText = chars)
      {
          return GetByteCountCommon(pText, length);
      }
  }
  139. // All of our public Encodings that don't use EncodingNLS must have this (including EncodingNLS)
  140. // So if you fix this, fix the others. Currently those include:
  141. // EncodingNLS, UTF7Encoding, UTF8Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding
  // Pointer overload: returns the number of bytes needed to encode `count` chars at `chars`.
  // Rejects a null pointer and a negative count before delegating to GetByteCountCommon.
  [CLSCompliant(false)]
  public override unsafe int GetByteCount(char* chars, int count)
  {
      if (chars is null)
      {
          ThrowHelper.ThrowArgumentNullException(ExceptionArgument.chars);
      }

      if (count < 0)
      {
          ThrowHelper.ThrowArgumentOutOfRangeException(ExceptionArgument.count, ExceptionResource.ArgumentOutOfRange_NeedNonNegNum);
      }

      int result = GetByteCountCommon(chars, count);
      return result;
  }
  // Span overload: returns the number of bytes needed to encode `chars`.
  public override unsafe int GetByteCount(ReadOnlySpan<char> chars)
  {
      int length = chars.Length;

      // A null pointer (empty/default span) is acceptable to the shared worker.
      fixed (char* pSource = &MemoryMarshal.GetReference(chars))
      {
          return GetByteCountCommon(pSource, length);
      }
  }
  // Shared worker behind every non-EncoderNLS GetByteCount entry point.
  // A modification of this method should be copied in to each of the supported encodings: ASCII, UTF8, UTF16, UTF32.
  [MethodImpl(MethodImplOptions.AggressiveInlining)]
  private unsafe int GetByteCountCommon(char* pChars, int charCount)
  {
      Debug.Assert(charCount >= 0, "Caller shouldn't specify negative length buffer.");
      Debug.Assert(pChars != null || charCount == 0, "Input pointer shouldn't be null if non-zero length specified.");

      // Try the fast path first. It never consults a fallback, so none is supplied.
      int fastByteCount = GetByteCountFast(pChars, charCount, fallback: null, out int charsConsumed);
      if (charsConsumed == charCount)
      {
          // The fast path accounted for the entire input.
          return fastByteCount;
      }

      // Input remains: let the fallback path count the rest. The fallback can change the
      // required output size in unexpected ways, so the sum is checked for overflow.
      int combinedByteCount = fastByteCount + GetByteCountWithFallback(pChars, charCount, charsConsumed);
      if (combinedByteCount < 0)
      {
          ThrowConversionOverflow();
      }

      return combinedByteCount;
  }
  // Fast path for byte counting: measures the leading run of input that
  // Utf16Utility.GetPointerToFirstInvalidChar accepts and reports how many chars that run covers.
  // The `fallback` argument is not used here.
  [MethodImpl(MethodImplOptions.AggressiveInlining)] // called directly by GetByteCountCommon
  private protected sealed override unsafe int GetByteCountFast(char* pChars, int charsLength, EncoderFallback? fallback, out int charsConsumed)
  {
      char* pFirstInvalid = Utf16Utility.GetPointerToFirstInvalidChar(pChars, charsLength, out long utf8CodeUnitCountAdjustment, out _);

      int validCharCount = (int)(pFirstInvalid - pChars);
      charsConsumed = validCharCount;

      // The UTF-8 code unit count may exceed the UTF-16 code unit count, so the sum is
      // formed as a 64-bit value and range-checked before narrowing to Int32.
      long utf8ByteCount = validCharCount + utf8CodeUnitCountAdjustment;
      if ((ulong)utf8ByteCount > int.MaxValue)
      {
          ThrowConversionOverflow();
      }

      return (int)utf8ByteCount;
  }
  202. // Parent method is safe.
  203. // All of our public Encodings that don't use EncodingNLS must have this (including EncodingNLS)
  204. // So if you fix this, fix the others. Currently those include:
  205. // EncodingNLS, UTF7Encoding, UTF8Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding
  // Encodes `charCount` chars of `s`, starting at `charIndex`, into `bytes` starting at
  // `byteIndex`, and returns what GetBytesCommon reports as written.
  public override unsafe int GetBytes(string s, int charIndex, int charCount,
      byte[] bytes, int byteIndex)
  {
      // Null checks: the source string is reported first, then the destination array.
      if (s is null)
      {
          ThrowHelper.ThrowArgumentNullException(ExceptionArgument.s, ExceptionResource.ArgumentNull_Array);
      }

      if (bytes is null)
      {
          ThrowHelper.ThrowArgumentNullException(ExceptionArgument.bytes, ExceptionResource.ArgumentNull_Array);
      }

      // OR-ing the two values yields a negative number iff at least one of them is negative.
      if ((charIndex | charCount) < 0)
      {
          ExceptionArgument offender = (charIndex < 0) ? ExceptionArgument.charIndex : ExceptionArgument.charCount;
          ThrowHelper.ThrowArgumentOutOfRangeException(offender, ExceptionResource.ArgumentOutOfRange_NeedNonNegNum);
      }

      int charsAvailable = s.Length - charIndex;
      if (charCount > charsAvailable)
      {
          ThrowHelper.ThrowArgumentOutOfRangeException(ExceptionArgument.s, ExceptionResource.ArgumentOutOfRange_IndexCount);
      }

      // The unsigned comparison rejects negative indices and indices past the end in one test.
      if ((uint)byteIndex > (uint)bytes.Length)
      {
          ThrowHelper.ThrowArgumentOutOfRangeException(ExceptionArgument.byteIndex, ExceptionResource.ArgumentOutOfRange_Index);
      }

      int byteCapacity = bytes.Length - byteIndex;
      fixed (char* pText = s)
      fixed (byte* pDest = bytes)
      {
          return GetBytesCommon(pText + charIndex, charCount, pDest + byteIndex, byteCapacity);
      }
  }
  236. // Encodes a range of characters in a character array into a range of bytes
  237. // in a byte array. An exception occurs if the byte array is not large
  238. // enough to hold the complete encoding of the characters. The
  239. // GetByteCount method can be used to determine the exact number of
  240. // bytes that will be produced for a given range of characters.
  241. // Alternatively, the GetMaxByteCount method can be used to
  242. // determine the maximum number of bytes that will be produced for a given
  243. // number of characters, regardless of the actual character values.
  244. //
  245. // All of our public Encodings that don't use EncodingNLS must have this (including EncodingNLS)
  246. // So if you fix this, fix the others. Currently those include:
  247. // EncodingNLS, UTF7Encoding, UTF8Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding
  248. // parent method is safe
  // Encodes `charCount` chars of `chars`, starting at `charIndex`, into `bytes` starting at
  // `byteIndex`, and returns what GetBytesCommon reports as written.
  public override unsafe int GetBytes(char[] chars, int charIndex, int charCount,
      byte[] bytes, int byteIndex)
  {
      // Null checks: the source array is reported first, then the destination array.
      if (chars is null)
      {
          ThrowHelper.ThrowArgumentNullException(ExceptionArgument.chars, ExceptionResource.ArgumentNull_Array);
      }

      if (bytes is null)
      {
          ThrowHelper.ThrowArgumentNullException(ExceptionArgument.bytes, ExceptionResource.ArgumentNull_Array);
      }

      // OR-ing the two values yields a negative number iff at least one of them is negative.
      if ((charIndex | charCount) < 0)
      {
          ExceptionArgument offender = (charIndex < 0) ? ExceptionArgument.charIndex : ExceptionArgument.charCount;
          ThrowHelper.ThrowArgumentOutOfRangeException(offender, ExceptionResource.ArgumentOutOfRange_NeedNonNegNum);
      }

      int charsAvailable = chars.Length - charIndex;
      if (charCount > charsAvailable)
      {
          ThrowHelper.ThrowArgumentOutOfRangeException(ExceptionArgument.chars, ExceptionResource.ArgumentOutOfRange_IndexCount);
      }

      // The unsigned comparison rejects negative indices and indices past the end in one test.
      if ((uint)byteIndex > (uint)bytes.Length)
      {
          ThrowHelper.ThrowArgumentOutOfRangeException(ExceptionArgument.byteIndex, ExceptionResource.ArgumentOutOfRange_Index);
      }

      int byteCapacity = bytes.Length - byteIndex;
      fixed (char* pSource = chars)
      fixed (byte* pDest = bytes)
      {
          return GetBytesCommon(pSource + charIndex, charCount, pDest + byteIndex, byteCapacity);
      }
  }
  279. // All of our public Encodings that don't use EncodingNLS must have this (including EncodingNLS)
  280. // So if you fix this, fix the others. Currently those include:
  281. // EncodingNLS, UTF7Encoding, UTF8Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding
  // Pointer overload: encodes `charCount` chars at `chars` into a destination of
  // `byteCount` bytes at `bytes`.
  [CLSCompliant(false)]
  public override unsafe int GetBytes(char* chars, int charCount, byte* bytes, int byteCount)
  {
      // Null checks: the source pointer is reported first, then the destination pointer.
      if (chars is null)
      {
          ThrowHelper.ThrowArgumentNullException(ExceptionArgument.chars, ExceptionResource.ArgumentNull_Array);
      }

      if (bytes is null)
      {
          ThrowHelper.ThrowArgumentNullException(ExceptionArgument.bytes, ExceptionResource.ArgumentNull_Array);
      }

      // OR-ing the two values yields a negative number iff at least one of them is negative.
      if ((charCount | byteCount) < 0)
      {
          ExceptionArgument offender = (charCount < 0) ? ExceptionArgument.charCount : ExceptionArgument.byteCount;
          ThrowHelper.ThrowArgumentOutOfRangeException(offender, ExceptionResource.ArgumentOutOfRange_NeedNonNegNum);
      }

      int written = GetBytesCommon(chars, charCount, bytes, byteCount);
      return written;
  }
  // Span overload: encodes `chars` into `bytes` via the shared worker.
  public override unsafe int GetBytes(ReadOnlySpan<char> chars, Span<byte> bytes)
  {
      int sourceLength = chars.Length;
      int destinationLength = bytes.Length;

      // Null / empty spans are fine; the worker accepts a null pointer with a zero length.
      fixed (char* pSource = &MemoryMarshal.GetReference(chars))
      fixed (byte* pDest = &MemoryMarshal.GetReference(bytes))
      {
          return GetBytesCommon(pSource, sourceLength, pDest, destinationLength);
      }
  }
  // Shared worker behind every non-EncoderNLS GetBytes entry point.
  // A modification of this method should be copied in to each of the supported encodings: ASCII, UTF8, UTF16, UTF32.
  [MethodImpl(MethodImplOptions.AggressiveInlining)]
  private unsafe int GetBytesCommon(char* pChars, int charCount, byte* pBytes, int byteCount)
  {
      Debug.Assert(charCount >= 0, "Caller shouldn't specify negative length buffer.");
      Debug.Assert(pChars != null || charCount == 0, "Input pointer shouldn't be null if non-zero length specified.");
      Debug.Assert(byteCount >= 0, "Caller shouldn't specify negative length buffer.");
      Debug.Assert(pBytes != null || byteCount == 0, "Input pointer shouldn't be null if non-zero length specified.");

      // Try the fast path first.
      int bytesWritten = GetBytesFast(pChars, charCount, pBytes, byteCount, out int charsConsumed);

      // If the fast path consumed everything we are done; otherwise the fallback routine
      // picks up from where the fast path stopped.
      return (charsConsumed == charCount)
          ? bytesWritten
          : GetBytesWithFallback(pChars, charCount, pBytes, byteCount, charsConsumed, bytesWritten);
  }
  // Fast path for encoding: runs Utf8Utility.TranscodeToUtf8 and converts the two
  // "remaining" pointers it hands back into consumed-char and written-byte counts.
  [MethodImpl(MethodImplOptions.AggressiveInlining)] // called directly by GetBytesCommon
  private protected sealed override unsafe int GetBytesFast(char* pChars, int charsLength, byte* pBytes, int bytesLength, out int charsConsumed)
  {
      // The OperationStatus result is deliberately ignored: all that matters is whether the
      // whole input was consumed. Anything left over is handled by the fallback routine.
      Utf8Utility.TranscodeToUtf8(pChars, charsLength, pBytes, bytesLength, out char* pCharsRemaining, out byte* pBytesRemaining);

      int bytesWritten = (int)(pBytesRemaining - pBytes);
      charsConsumed = (int)(pCharsRemaining - pChars);
      return bytesWritten;
  }
  341. // Returns the number of characters produced by decoding a range of bytes
  342. // in a byte array.
  343. //
  344. // All of our public Encodings that don't use EncodingNLS must have this (including EncodingNLS)
  345. // So if you fix this, fix the others. Currently those include:
  346. // EncodingNLS, UTF7Encoding, UTF8Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding
  347. // parent method is safe
  // Returns the number of chars produced by decoding `count` bytes of `bytes`, starting at `index`.
  // Arguments are validated here; the counting itself is done by GetCharCountCommon.
  public override unsafe int GetCharCount(byte[] bytes, int index, int count)
  {
      if (bytes is null)
      {
          ThrowHelper.ThrowArgumentNullException(ExceptionArgument.bytes, ExceptionResource.ArgumentNull_Array);
      }

      // OR-ing the two values yields a negative number iff at least one of them is negative.
      if ((index | count) < 0)
      {
          ExceptionArgument offender = (index < 0) ? ExceptionArgument.index : ExceptionArgument.count;
          ThrowHelper.ThrowArgumentOutOfRangeException(offender, ExceptionResource.ArgumentOutOfRange_NeedNonNegNum);
      }

      // The requested range must fit in what remains of the array after `index`.
      int available = bytes.Length - index;
      if (count > available)
      {
          ThrowHelper.ThrowArgumentOutOfRangeException(ExceptionArgument.bytes, ExceptionResource.ArgumentOutOfRange_IndexCountBuffer);
      }

      fixed (byte* pFirst = bytes)
      {
          byte* pStart = pFirst + index;
          return GetCharCountCommon(pStart, count);
      }
  }
  368. // All of our public Encodings that don't use EncodingNLS must have this (including EncodingNLS)
  369. // So if you fix this, fix the others. Currently those include:
  370. // EncodingNLS, UTF7Encoding, UTF8Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding
  // Pointer overload: returns the number of chars produced by decoding `count` bytes at `bytes`.
  // Rejects a null pointer and a negative count before delegating to GetCharCountCommon.
  [CLSCompliant(false)]
  public override unsafe int GetCharCount(byte* bytes, int count)
  {
      if (bytes is null)
      {
          ThrowHelper.ThrowArgumentNullException(ExceptionArgument.bytes, ExceptionResource.ArgumentNull_Array);
      }

      if (count < 0)
      {
          ThrowHelper.ThrowArgumentOutOfRangeException(ExceptionArgument.count, ExceptionResource.ArgumentOutOfRange_NeedNonNegNum);
      }

      int result = GetCharCountCommon(bytes, count);
      return result;
  }
  // Span overload: returns the number of chars produced by decoding `bytes`.
  public override unsafe int GetCharCount(ReadOnlySpan<byte> bytes)
  {
      int length = bytes.Length;

      // A null pointer (empty/default span) is acceptable to the shared worker.
      fixed (byte* pSource = &MemoryMarshal.GetReference(bytes))
      {
          return GetCharCountCommon(pSource, length);
      }
  }
  393. // All of our public Encodings that don't use EncodingNLS must have this (including EncodingNLS)
  394. // So if you fix this, fix the others. Currently those include:
  395. // EncodingNLS, UTF7Encoding, UTF8Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding
  396. // parent method is safe
  // Decodes `byteCount` bytes of `bytes`, starting at `byteIndex`, into `chars` starting at
  // `charIndex`, and returns what GetCharsCommon reports as written.
  public override unsafe int GetChars(byte[] bytes, int byteIndex, int byteCount,
      char[] chars, int charIndex)
  {
      // Null checks: the source array is reported first, then the destination array.
      if (bytes is null)
      {
          ThrowHelper.ThrowArgumentNullException(ExceptionArgument.bytes, ExceptionResource.ArgumentNull_Array);
      }

      if (chars is null)
      {
          ThrowHelper.ThrowArgumentNullException(ExceptionArgument.chars, ExceptionResource.ArgumentNull_Array);
      }

      // OR-ing the two values yields a negative number iff at least one of them is negative.
      if ((byteIndex | byteCount) < 0)
      {
          ExceptionArgument offender = (byteIndex < 0) ? ExceptionArgument.byteIndex : ExceptionArgument.byteCount;
          ThrowHelper.ThrowArgumentOutOfRangeException(offender, ExceptionResource.ArgumentOutOfRange_NeedNonNegNum);
      }

      int bytesAvailable = bytes.Length - byteIndex;
      if (byteCount > bytesAvailable)
      {
          ThrowHelper.ThrowArgumentOutOfRangeException(ExceptionArgument.bytes, ExceptionResource.ArgumentOutOfRange_IndexCountBuffer);
      }

      // The unsigned comparison rejects negative indices and indices past the end in one test.
      if ((uint)charIndex > (uint)chars.Length)
      {
          ThrowHelper.ThrowArgumentOutOfRangeException(ExceptionArgument.charIndex, ExceptionResource.ArgumentOutOfRange_Index);
      }

      int charCapacity = chars.Length - charIndex;
      fixed (byte* pSource = bytes)
      fixed (char* pDest = chars)
      {
          return GetCharsCommon(pSource + byteIndex, byteCount, pDest + charIndex, charCapacity);
      }
  }
  427. // All of our public Encodings that don't use EncodingNLS must have this (including EncodingNLS)
  428. // So if you fix this, fix the others. Currently those include:
  429. // EncodingNLS, UTF7Encoding, UTF8Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding
  // Pointer overload: decodes `byteCount` bytes at `bytes` into a destination of
  // `charCount` chars at `chars`.
  [CLSCompliant(false)]
  public override unsafe int GetChars(byte* bytes, int byteCount, char* chars, int charCount)
  {
      // Null checks: the source pointer is reported first, then the destination pointer.
      if (bytes is null)
      {
          ThrowHelper.ThrowArgumentNullException(ExceptionArgument.bytes, ExceptionResource.ArgumentNull_Array);
      }

      if (chars is null)
      {
          ThrowHelper.ThrowArgumentNullException(ExceptionArgument.chars, ExceptionResource.ArgumentNull_Array);
      }

      // OR-ing the two values yields a negative number iff at least one of them is negative.
      if ((byteCount | charCount) < 0)
      {
          ExceptionArgument offender = (byteCount < 0) ? ExceptionArgument.byteCount : ExceptionArgument.charCount;
          ThrowHelper.ThrowArgumentOutOfRangeException(offender, ExceptionResource.ArgumentOutOfRange_NeedNonNegNum);
      }

      int written = GetCharsCommon(bytes, byteCount, chars, charCount);
      return written;
  }
  // Span overload: decodes `bytes` into `chars` via the shared worker.
  public override unsafe int GetChars(ReadOnlySpan<byte> bytes, Span<char> chars)
  {
      int sourceLength = bytes.Length;
      int destinationLength = chars.Length;

      // Null / empty spans are fine; the worker accepts a null pointer with a zero length.
      fixed (byte* pSource = &MemoryMarshal.GetReference(bytes))
      fixed (char* pDest = &MemoryMarshal.GetReference(chars))
      {
          return GetCharsCommon(pSource, sourceLength, pDest, destinationLength);
      }
  }
  457. // WARNING: If we throw an error, then System.Resources.ResourceReader calls this method.
  458. // So if we're really broken, then that could also throw an error... recursively.
  459. // So try to make sure GetChars can at least process all uses by
  460. // System.Resources.ResourceReader!
  461. //
  462. // Note: We throw exceptions on individually encoded surrogates and other non-shortest forms.
  463. // If exceptions aren't turned on, then we drop all non-shortest forms & individual surrogates.
  // Shared worker behind every non-DecoderNLS GetChars entry point.
  // A modification of this method should be copied in to each of the supported encodings: ASCII, UTF8, UTF16, UTF32.
  [MethodImpl(MethodImplOptions.AggressiveInlining)]
  private unsafe int GetCharsCommon(byte* pBytes, int byteCount, char* pChars, int charCount)
  {
      Debug.Assert(byteCount >= 0, "Caller shouldn't specify negative length buffer.");
      Debug.Assert(pBytes != null || byteCount == 0, "Input pointer shouldn't be null if non-zero length specified.");
      Debug.Assert(charCount >= 0, "Caller shouldn't specify negative length buffer.");
      Debug.Assert(pChars != null || charCount == 0, "Input pointer shouldn't be null if non-zero length specified.");

      // Try the fast path first.
      int charsWritten = GetCharsFast(pBytes, byteCount, pChars, charCount, out int bytesConsumed);

      // If the fast path consumed everything we are done; otherwise the fallback routine
      // picks up from where the fast path stopped.
      return (bytesConsumed == byteCount)
          ? charsWritten
          : GetCharsWithFallback(pBytes, byteCount, pChars, charCount, bytesConsumed, charsWritten);
  }
  // Fast path for decoding: runs Utf8Utility.TranscodeToUtf16 and converts the two
  // "remaining" pointers it hands back into consumed-byte and written-char counts.
  [MethodImpl(MethodImplOptions.AggressiveInlining)] // called directly by GetCharsCommon
  private protected sealed override unsafe int GetCharsFast(byte* pBytes, int bytesLength, char* pChars, int charsLength, out int bytesConsumed)
  {
      // The OperationStatus result is deliberately ignored: all that matters is whether the
      // whole input was consumed. Anything left over is handled by the fallback routine.
      Utf8Utility.TranscodeToUtf16(pBytes, bytesLength, pChars, charsLength, out byte* pBytesRemaining, out char* pCharsRemaining);

      int charsWritten = (int)(pCharsRemaining - pChars);
      bytesConsumed = (int)(pBytesRemaining - pBytes);
      return charsWritten;
  }
  // Decodes `bytes` into `chars` when the fast path could not finish. A replacement
  // fallback that emits exactly one U+FFFD char is handled here directly; everything else
  // (and any input left over) is forwarded to the base implementation.
  // Returns the total number of chars written.
  private protected sealed override unsafe int GetCharsWithFallback(ReadOnlySpan<byte> bytes, int originalBytesLength, Span<char> chars, int originalCharsLength, DecoderNLS? decoder)
  {
      // With no decoder instance, the encoding's own fallback applies.
      DecoderFallback? activeFallback = (decoder is null) ? this.DecoderFallback : decoder.Fallback;

      // Special case believed to be relatively common: a replacement fallback that writes a
      // single U+FFFD char. This can be handled more efficiently than the base implementation does.
      if (activeFallback is DecoderReplacementFallback replacement
          && replacement.MaxCharCount == 1
          && replacement.DefaultString[0] == UnicodeUtility.ReplacementChar)
      {
          bool isFinalBlock = decoder is null || decoder.MustFlush;

          // The exact OperationStatus is irrelevant; only the amount processed matters.
          Utf8.ToUtf16(bytes, chars, out int bytesRead, out int charsWritten, replaceInvalidSequences: true, isFinalBlock: isFinalBlock);

          // Advance both spans past what was consumed / written.
          bytes = bytes.Slice(bytesRead);
          chars = chars.Slice(charsWritten);
      }

      // Input is left over either because the special case did not apply or because it
      // could not consume everything: take the slow fallback path.
      if (!bytes.IsEmpty)
      {
          return base.GetCharsWithFallback(bytes, originalBytesLength, chars, originalCharsLength, decoder);
      }

      // Everything was consumed: the shrinkage of the destination span is the total chars written.
      return originalCharsLength - chars.Length;
  }
  524. // Returns a string containing the decoded representation of a range of
  525. // bytes in a byte array.
  526. //
  527. // All of our public Encodings that don't use EncodingNLS must have this (including EncodingNLS)
  528. // So if you fix this, fix the others. Currently those include:
  529. // EncodingNLS, UTF7Encoding, UTF8Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding
  530. // parent method is safe
  // Decodes `count` bytes of `bytes`, starting at `index`, into a new string.
  public override unsafe string GetString(byte[] bytes, int index, int count)
  {
      if (bytes is null)
      {
          ThrowHelper.ThrowArgumentNullException(ExceptionArgument.bytes, ExceptionResource.ArgumentNull_Array);
      }

      // OR-ing the two values yields a negative number iff at least one of them is negative.
      if ((index | count) < 0)
      {
          ExceptionArgument offender = (index < 0) ? ExceptionArgument.index : ExceptionArgument.count;
          ThrowHelper.ThrowArgumentOutOfRangeException(offender, ExceptionResource.ArgumentOutOfRange_NeedNonNegNum);
      }

      // The requested range must fit in what remains of the array after `index`.
      int available = bytes.Length - index;
      if (count > available)
      {
          ThrowHelper.ThrowArgumentOutOfRangeException(ExceptionArgument.bytes, ExceptionResource.ArgumentOutOfRange_IndexCountBuffer);
      }

      // Short-circuit empty input to avoid problems with an empty buffer.
      if (count == 0)
      {
          return string.Empty;
      }

      fixed (byte* pFirst = bytes)
      {
          byte* pStart = pFirst + index;
          return string.CreateStringFromEncoding(pStart, count, this);
      }
  }
  556. //
  557. // End of standard methods copied from EncodingNLS.cs
  558. //
  // Shared worker behind every non-DecoderNLS GetCharCount entry point.
  // A modification of this method should be copied in to each of the supported encodings: ASCII, UTF8, UTF16, UTF32.
  [MethodImpl(MethodImplOptions.AggressiveInlining)]
  private unsafe int GetCharCountCommon(byte* pBytes, int byteCount)
  {
      Debug.Assert(byteCount >= 0, "Caller shouldn't specify negative length buffer.");
      Debug.Assert(pBytes != null || byteCount == 0, "Input pointer shouldn't be null if non-zero length specified.");

      // Try the fast path first. It never consults a fallback, so none is supplied.
      int fastCharCount = GetCharCountFast(pBytes, byteCount, fallback: null, out int bytesConsumed);
      if (bytesConsumed == byteCount)
      {
          // The fast path accounted for the entire input.
          return fastCharCount;
      }

      // Input remains: let the fallback path count the rest. The fallback can change the
      // required output size in unexpected ways, so the sum is checked for overflow.
      int combinedCharCount = fastCharCount + GetCharCountWithFallback(pBytes, byteCount, bytesConsumed);
      if (combinedCharCount < 0)
      {
          ThrowConversionOverflow();
      }

      return combinedCharCount;
  }
  // Fast-path char counter: scans for the first invalid byte and reports how many bytes
  // preceded it (via 'bytesConsumed') together with the UTF-16 code unit count for those bytes.
  // The 'fallback' argument is not used by this implementation.
  [MethodImpl(MethodImplOptions.AggressiveInlining)] // called directly by GetCharCountCommon
  private protected sealed override unsafe int GetCharCountFast(byte* pBytes, int bytesLength, DecoderFallback? fallback, out int bytesConsumed)
  {
      byte* pFirstInvalid = Utf8Utility.GetPointerToFirstInvalidByte(pBytes, bytesLength, out int utf16Adjustment, out _);
      int validByteCount = (int)(pFirstInvalid - pBytes);

      // A UTF-16 code unit count never exceeds the corresponding UTF-8 code unit count,
      // so this sum cannot overflow.
      int charCount = validByteCount + utf16Adjustment;

      bytesConsumed = validByteCount;
      return charCount;
  }
  // Creates a new DecoderNLS instance bound to this encoding.
  public override Decoder GetDecoder() => new DecoderNLS(this);
  // Creates a new EncoderNLS instance bound to this encoding.
  public override Encoder GetEncoder() => new EncoderNLS(this);
  600. //
  601. // Beginning of methods used by shared fallback logic.
  602. //
  // Reports how many UTF-8 code units are needed to encode 'value'.
  // Always succeeds: every well-formed Rune maps to 1..4 UTF-8 code units.
  internal sealed override bool TryGetByteCount(Rune value, out int byteCount)
  {
      int sequenceLength = value.Utf8SequenceLength;
      byteCount = sequenceLength;
      return true;
  }
  // Encodes 'value' as UTF-8 into 'bytes', reporting the number of bytes written.
  // Every well-formed Rune is encodable as 1..4 UTF-8 code units, so a failed
  // attempt can only mean the destination span was too small.
  internal sealed override OperationStatus EncodeRune(Rune value, Span<byte> bytes, out int bytesWritten)
  {
      if (value.TryEncodeToUtf8(bytes, out bytesWritten))
      {
          return OperationStatus.Done;
      }

      return OperationStatus.DestinationTooSmall;
  }
  // Decodes the first Rune from the UTF-8 data in 'bytes' by delegating to Rune.DecodeFromUtf8.
  internal sealed override OperationStatus DecodeFirstRune(ReadOnlySpan<byte> bytes, out Rune value, out int bytesConsumed)
      => Rune.DecodeFromUtf8(bytes, out value, out bytesConsumed);
  619. //
  620. // End of methods used by shared fallback logic.
  621. //
  // Computes an upper bound on the number of bytes produced when encoding 'charCount' chars.
  // Throws ArgumentOutOfRangeException if 'charCount' is negative or the bound exceeds int.MaxValue.
  public override int GetMaxByteCount(int charCount)
  {
      if (charCount < 0)
      {
          throw new ArgumentOutOfRangeException(nameof(charCount), SR.ArgumentOutOfRange_NeedNonNegNum);
      }

      // One extra char is budgeted in case a high surrogate is left over from earlier input.
      long maxBytes = (long)charCount + 1;

      // Each char may be replaced by up to MaxCharCount fallback chars.
      if (EncoderFallback.MaxCharCount > 1)
      {
          maxBytes *= EncoderFallback.MaxCharCount;
      }

      maxBytes *= MaxUtf8BytesPerChar;

      if (maxBytes > int.MaxValue)
      {
          throw new ArgumentOutOfRangeException(nameof(charCount), SR.ArgumentOutOfRange_GetByteCountOverflow);
      }

      return (int)maxBytes;
  }
  // Computes an upper bound on the number of chars produced when decoding 'byteCount' bytes.
  // Throws ArgumentOutOfRangeException if 'byteCount' is negative or the bound exceeds int.MaxValue.
  public override int GetMaxCharCount(int byteCount)
  {
      if (byteCount < 0)
      {
          throw new ArgumentOutOfRangeException(nameof(byteCount), SR.ArgumentOutOfRange_NeedNonNegNum);
      }

      // Budget one char per input byte, plus one more in case the first byte
      // is the last byte of a 4-byte sequence that yields a surrogate pair.
      long maxChars = (long)byteCount + 1;

      // Invalid input (e.g. non-shortest forms, or a lead byte followed by another lead byte)
      // can trigger the fallback on every byte, so scale by the fallback's maximum output.
      if (DecoderFallback.MaxCharCount > 1)
      {
          maxChars *= DecoderFallback.MaxCharCount;
      }

      if (maxChars > int.MaxValue)
      {
          throw new ArgumentOutOfRangeException(nameof(byteCount), SR.ArgumentOutOfRange_GetCharCountOverflow);
      }

      return (int)maxChars;
  }
  // Returns the UTF-8 byte order mark (EF BB BF) when this instance is configured to emit it,
  // otherwise an empty array. A fresh array is allocated for the BOM on each call so that
  // callers cannot mutate a shared copy.
  public override byte[] GetPreamble()
  {
      return _emitUTF8Identifier
          ? new byte[] { 0xEF, 0xBB, 0xBF }
          : Array.Empty<byte>();
  }
  // Span-based view of the preamble.
  public override ReadOnlySpan<byte> Preamble
  {
      get
      {
          // A derived type may have overridden GetPreamble, so defer to it in that case.
          if (GetType() != typeof(UTF8Encoding))
          {
              return new ReadOnlySpan<byte>(GetPreamble());
          }

          if (_emitUTF8Identifier)
          {
              return PreambleSpan;
          }

          return default;
      }
  }
  // Two UTF8Encoding instances are equal when they agree on the emit-identifier flag
  // and have equal encoder and decoder fallbacks. Any other object is unequal.
  public override bool Equals(object? value)
  {
      return value is UTF8Encoding other
          && _emitUTF8Identifier == other._emitUTF8Identifier
          && EncoderFallback.Equals(other.EncoderFallback)
          && DecoderFallback.Equals(other.DecoderFallback);
  }
  // Combines the fallback hash codes, the code page constant and the emit-identifier flag
  // by simple addition. The distribution is not great, but this type is relatively unlikely
  // to be used as a hashtable key.
  public override int GetHashCode()
  {
      int encoderFallbackHash = this.EncoderFallback.GetHashCode();
      int decoderFallbackHash = this.DecoderFallback.GetHashCode();
      int identifierBit = _emitUTF8Identifier ? 1 : 0;

      return encoderFallbackHash + decoderFallbackHash + UTF8_CODEPAGE + identifierBit;
  }
  683. }
  684. }