UTF8Encoding.cs 37 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858
  1. // Licensed to the .NET Foundation under one or more agreements.
  2. // The .NET Foundation licenses this file to you under the MIT license.
  3. // See the LICENSE file in the project root for more information.
  // The worker functions in this file were optimized for performance. If you make changes
  // you should use care to consider all of the interesting cases.
  // The code of all worker functions in this file is written twice: once as a slow loop, and the
  // second time as a fast loop. The slow loop handles all special cases, throws exceptions, etc.
  // The fast loop attempts to blaze through as fast as possible with optimistic range checks,
  // processing multiple characters at a time, and falling back to the slow loop for all special cases.
  10. using System;
  11. using System.Buffers;
  12. using System.Diagnostics;
  13. using System.Runtime.CompilerServices;
  14. using System.Runtime.InteropServices;
  15. using System.Text.Unicode;
  16. namespace System.Text
  17. {
  18. // Encodes text into and out of UTF-8. UTF-8 is a way of writing
  19. // Unicode characters with variable numbers of bytes per character,
  // optimized for the lower 128 (ASCII) characters. It's an efficient way
  21. // of encoding US English in an internationalizable way.
  22. //
  23. // Don't override IsAlwaysNormalized because it is just a Unicode Transformation and could be confused.
  24. //
  25. // The UTF-8 byte order mark is simply the Unicode byte order mark
  26. // (0xFEFF) written in UTF-8 (0xEF 0xBB 0xBF). The byte order mark is
  27. // used mostly to distinguish UTF-8 text from other encodings, and doesn't
  28. // switch the byte orderings.
  29. public class UTF8Encoding : Encoding
  30. {
  31. /*
  32. bytes bits UTF-8 representation
  33. ----- ---- -----------------------------------
  34. 1 7 0vvvvvvv
  35. 2 11 110vvvvv 10vvvvvv
  36. 3 16 1110vvvv 10vvvvvv 10vvvvvv
  37. 4 21 11110vvv 10vvvvvv 10vvvvvv 10vvvvvv
  38. ----- ---- -----------------------------------
  39. Surrogate:
  40. Real Unicode value = (HighSurrogate - 0xD800) * 0x400 + (LowSurrogate - 0xDC00) + 0x10000
  41. */
  // Code page number for UTF-8 (65001); passed to the Encoding base constructor.
  private const int UTF8_CODEPAGE = 65001;

  // Sealed subclass that allows calls to be de-virtualized
  // (see https://github.com/dotnet/coreclr/pull/9230).
  internal sealed class UTF8EncodingSealed : UTF8Encoding
  {
      public UTF8EncodingSealed(bool encoderShouldEmitUTF8Identifier)
          : base(encoderShouldEmitUTF8Identifier)
      {
      }

      // Yields the UTF-8 identifier bytes when emission is enabled; otherwise an empty span.
      public override ReadOnlySpan<byte> Preamble
      {
          get
          {
              if (_emitUTF8Identifier)
              {
                  return PreambleSpan;
              }

              return default;
          }
      }
  }
  // Instance used by Encoding.UTF8 for lazy initialization: this initializer does not
  // run until a static member of the class is referenced.
  internal static readonly UTF8EncodingSealed s_default = new UTF8EncodingSealed(encoderShouldEmitUTF8Identifier: true);

  // The UTF-8 identifier: U+FEFF written in UTF-8 (0xEF 0xBB 0xBF).
  // Relies on the C# compiler's optimization for static byte[] data.
  internal static ReadOnlySpan<byte> PreambleSpan => new byte[3] { 0xEF, 0xBB, 0xBF };

  // Whether the UTF-8 identifier should be emitted (yes, emitting U+FEFF as a
  // UTF-8 identifier has made it into the standard). Defaults to false.
  internal readonly bool _emitUTF8Identifier;

  // Whether exception fallbacks are used instead of replacement fallbacks. Defaults to false.
  private readonly bool _isThrowException;
  // Creates an encoding that does not emit the UTF-8 identifier.
  public UTF8Encoding()
      : this(encoderShouldEmitUTF8Identifier: false)
  {
  }
  // Creates an encoding for the UTF-8 code page and records whether the
  // UTF-8 identifier should be emitted.
  public UTF8Encoding(bool encoderShouldEmitUTF8Identifier)
      : base(UTF8_CODEPAGE)
  {
      this._emitUTF8Identifier = encoderShouldEmitUTF8Identifier;
  }
  // Creates an encoding and selects the error-handling mode: when throwOnInvalidBytes
  // is true, exception fallbacks are installed instead of replacement fallbacks.
  public UTF8Encoding(bool encoderShouldEmitUTF8Identifier, bool throwOnInvalidBytes)
      : this(encoderShouldEmitUTF8Identifier)
  {
      _isThrowException = throwOnInvalidBytes;

      // Encoding's constructor already set the fallbacks, but it did so before the
      // flag above was assigned, so they are wrong when we are throwing exceptions.
      if (throwOnInvalidBytes)
      {
          SetDefaultFallbacks();
      }
  }
  // Installs the encoder and decoder fallbacks for the current error-handling mode:
  // exception fallbacks when _isThrowException is set, otherwise replacement
  // fallbacks that substitute U+FFFD.
  internal sealed override void SetDefaultFallbacks()
  {
      if (!_isThrowException)
      {
          // Replacement mode: use U+FFFD as the replacement string.
          encoderFallback = new EncoderReplacementFallback("\xFFFD");
          decoderFallback = new DecoderReplacementFallback("\xFFFD");
          return;
      }

      // Throwing mode.
      encoderFallback = EncoderFallback.ExceptionFallback;
      decoderFallback = DecoderFallback.ExceptionFallback;
  }
  87. // WARNING: GetByteCount(string chars)
  88. // WARNING: has different variable names than EncodingNLS.cs, so this can't just be cut & pasted,
  89. // WARNING: otherwise it'll break VB's way of declaring these.
  90. //
  91. // The following methods are copied from EncodingNLS.cs.
  92. // Unfortunately EncodingNLS.cs is internal and we're public, so we have to re-implement them here.
  93. // These should be kept in sync for the following classes:
  94. // EncodingNLS, UTF7Encoding, UTF8Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding
  95. // Returns the number of bytes required to encode a range of characters in
  96. // a character array.
  97. //
  98. // All of our public Encodings that don't use EncodingNLS must have this (including EncodingNLS)
  99. // So if you fix this, fix the others. Currently those include:
  100. // EncodingNLS, UTF7Encoding, UTF8Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding
  101. // parent method is safe
  // Returns the number of UTF-8 bytes needed to encode `count` chars of `chars`,
  // starting at `index`. Arguments are validated before any data is examined.
  public override unsafe int GetByteCount(char[] chars, int index, int count)
  {
      if (chars is null)
      {
          ThrowHelper.ThrowArgumentNullException(ExceptionArgument.chars, ExceptionResource.ArgumentNull_Array);
      }

      // Report the first negative argument, index before count.
      if (index < 0)
      {
          ThrowHelper.ThrowArgumentOutOfRangeException(ExceptionArgument.index, ExceptionResource.ArgumentOutOfRange_NeedNonNegNum);
      }

      if (count < 0)
      {
          ThrowHelper.ThrowArgumentOutOfRangeException(ExceptionArgument.count, ExceptionResource.ArgumentOutOfRange_NeedNonNegNum);
      }

      // Expressed as a subtraction so that index + count is never computed.
      if (count > chars.Length - index)
      {
          ThrowHelper.ThrowArgumentOutOfRangeException(ExceptionArgument.chars, ExceptionResource.ArgumentOutOfRange_IndexCountBuffer);
      }

      fixed (char* pFirstChar = chars)
      {
          return GetByteCountCommon(pFirstChar + index, count);
      }
  }
  122. // All of our public Encodings that don't use EncodingNLS must have this (including EncodingNLS)
  123. // So if you fix this, fix the others. Currently those include:
  124. // EncodingNLS, UTF7Encoding, UTF8Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding
  125. // parent method is safe
  // Returns the number of UTF-8 bytes needed to encode the entire string.
  public override unsafe int GetByteCount(string chars)
  {
      if (chars is null)
      {
          ThrowHelper.ThrowArgumentNullException(ExceptionArgument.chars);
      }

      int length = chars.Length;

      fixed (char* pFirstChar = chars)
      {
          return GetByteCountCommon(pFirstChar, length);
      }
  }
  138. // All of our public Encodings that don't use EncodingNLS must have this (including EncodingNLS)
  139. // So if you fix this, fix the others. Currently those include:
  140. // EncodingNLS, UTF7Encoding, UTF8Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding
  // Pointer-based overload: returns the number of UTF-8 bytes needed to encode
  // `count` chars starting at `chars`.
  [CLSCompliant(false)]
  public override unsafe int GetByteCount(char* chars, int count)
  {
      if (chars is null)
      {
          ThrowHelper.ThrowArgumentNullException(ExceptionArgument.chars);
      }

      if (0 > count)
      {
          ThrowHelper.ThrowArgumentOutOfRangeException(ExceptionArgument.count, ExceptionResource.ArgumentOutOfRange_NeedNonNegNum);
      }

      int byteTotal = GetByteCountCommon(chars, count);
      return byteTotal;
  }
  // Span-based overload: pins the span and forwards to the shared worker.
  public override unsafe int GetByteCount(ReadOnlySpan<char> chars)
  {
      int length = chars.Length;

      // Passing a null pointer down is fine; the worker only requires a
      // non-null pointer when the length is non-zero.
      fixed (char* pinnedChars = &MemoryMarshal.GetReference(chars))
      {
          return GetByteCountCommon(pinnedChars, length);
      }
  }
  // Shared worker behind every non-EncoderNLS GetByteCount entry point.
  // A modification of this method should be copied in to each of the supported
  // encodings: ASCII, UTF8, UTF16, UTF32.
  [MethodImpl(MethodImplOptions.AggressiveInlining)]
  private unsafe int GetByteCountCommon(char* pChars, int charCount)
  {
      Debug.Assert(charCount >= 0, "Caller shouldn't specify negative length buffer.");
      Debug.Assert(pChars != null || charCount == 0, "Input pointer shouldn't be null if non-zero length specified.");

      // Fast path first. It never consults a fallback, so none is supplied.
      int byteTotal = GetByteCountFast(pChars, charCount, fallback: null, out int consumed);

      if (consumed == charCount)
      {
          // The fast path handled the whole input.
          return byteTotal;
      }

      // Input remains, so count the rest through the fallback path. The fallback can
      // change the required output count in unexpected ways, hence the overflow check.
      byteTotal += GetByteCountWithFallback(pChars, charCount, consumed);

      if (byteTotal < 0)
      {
          ThrowConversionOverflow();
      }

      return byteTotal;
  }
  // Counts the UTF-8 bytes for the leading run of input that the UTF-16 validator
  // accepts, and reports through charsConsumed how many chars that run covers.
  [MethodImpl(MethodImplOptions.AggressiveInlining)] // called directly by GetByteCountCommon
  private protected sealed override unsafe int GetByteCountFast(char* pChars, int charsLength, EncoderFallback? fallback, out int charsConsumed)
  {
      char* pFirstInvalid = Utf16Utility.GetPointerToFirstInvalidChar(pChars, charsLength, out long utf8Adjustment, out _);

      int validCharCount = (int)(pFirstInvalid - pChars);
      charsConsumed = validCharCount;

      // There may be more UTF-8 code units than UTF-16 code units, so the sum is
      // computed as a long and range-checked before narrowing to Int32.
      long byteTotal = validCharCount + utf8Adjustment;

      if (byteTotal < 0 || byteTotal > int.MaxValue)
      {
          ThrowConversionOverflow();
      }

      return (int)byteTotal;
  }
  201. // Parent method is safe.
  202. // All of our public Encodings that don't use EncodingNLS must have this (including EncodingNLS)
  203. // So if you fix this, fix the others. Currently those include:
  204. // EncodingNLS, UTF7Encoding, UTF8Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding
  // Encodes `charCount` chars of `s`, starting at `charIndex`, into `bytes` beginning
  // at `byteIndex`, and returns the value produced by the shared worker.
  public override unsafe int GetBytes(string s, int charIndex, int charCount,
      byte[] bytes, int byteIndex)
  {
      // Null checks: the source string is reported before the destination array.
      if (s is null)
      {
          ThrowHelper.ThrowArgumentNullException(ExceptionArgument.s, ExceptionResource.ArgumentNull_Array);
      }

      if (bytes is null)
      {
          ThrowHelper.ThrowArgumentNullException(ExceptionArgument.bytes, ExceptionResource.ArgumentNull_Array);
      }

      // Negative checks: charIndex is reported before charCount.
      if (charIndex < 0)
      {
          ThrowHelper.ThrowArgumentOutOfRangeException(ExceptionArgument.charIndex, ExceptionResource.ArgumentOutOfRange_NeedNonNegNum);
      }

      if (charCount < 0)
      {
          ThrowHelper.ThrowArgumentOutOfRangeException(ExceptionArgument.charCount, ExceptionResource.ArgumentOutOfRange_NeedNonNegNum);
      }

      // Expressed as a subtraction so that charIndex + charCount is never computed.
      if (charCount > s.Length - charIndex)
      {
          ThrowHelper.ThrowArgumentOutOfRangeException(ExceptionArgument.s, ExceptionResource.ArgumentOutOfRange_IndexCount);
      }

      // The unsigned comparison rejects negative and too-large indexes in one test.
      if ((uint)byteIndex > (uint)bytes.Length)
      {
          ThrowHelper.ThrowArgumentOutOfRangeException(ExceptionArgument.byteIndex, ExceptionResource.ArgumentOutOfRange_Index);
      }

      int bytesAvailable = bytes.Length - byteIndex;

      fixed (char* pSource = s)
      {
          fixed (byte* pDestination = bytes)
          {
              return GetBytesCommon(pSource + charIndex, charCount, pDestination + byteIndex, bytesAvailable);
          }
      }
  }
  235. // Encodes a range of characters in a character array into a range of bytes
  236. // in a byte array. An exception occurs if the byte array is not large
  237. // enough to hold the complete encoding of the characters. The
  238. // GetByteCount method can be used to determine the exact number of
  239. // bytes that will be produced for a given range of characters.
  240. // Alternatively, the GetMaxByteCount method can be used to
  241. // determine the maximum number of bytes that will be produced for a given
  242. // number of characters, regardless of the actual character values.
  243. //
  244. // All of our public Encodings that don't use EncodingNLS must have this (including EncodingNLS)
  245. // So if you fix this, fix the others. Currently those include:
  246. // EncodingNLS, UTF7Encoding, UTF8Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding
  247. // parent method is safe
  // Encodes `charCount` chars of `chars`, starting at `charIndex`, into `bytes` beginning
  // at `byteIndex`, and returns the value produced by the shared worker.
  public override unsafe int GetBytes(char[] chars, int charIndex, int charCount,
      byte[] bytes, int byteIndex)
  {
      // Null checks: the source array is reported before the destination array.
      if (chars is null)
      {
          ThrowHelper.ThrowArgumentNullException(ExceptionArgument.chars, ExceptionResource.ArgumentNull_Array);
      }

      if (bytes is null)
      {
          ThrowHelper.ThrowArgumentNullException(ExceptionArgument.bytes, ExceptionResource.ArgumentNull_Array);
      }

      // Negative checks: charIndex is reported before charCount.
      if (charIndex < 0)
      {
          ThrowHelper.ThrowArgumentOutOfRangeException(ExceptionArgument.charIndex, ExceptionResource.ArgumentOutOfRange_NeedNonNegNum);
      }

      if (charCount < 0)
      {
          ThrowHelper.ThrowArgumentOutOfRangeException(ExceptionArgument.charCount, ExceptionResource.ArgumentOutOfRange_NeedNonNegNum);
      }

      // Expressed as a subtraction so that charIndex + charCount is never computed.
      if (charCount > chars.Length - charIndex)
      {
          ThrowHelper.ThrowArgumentOutOfRangeException(ExceptionArgument.chars, ExceptionResource.ArgumentOutOfRange_IndexCount);
      }

      // The unsigned comparison rejects negative and too-large indexes in one test.
      if ((uint)byteIndex > (uint)bytes.Length)
      {
          ThrowHelper.ThrowArgumentOutOfRangeException(ExceptionArgument.byteIndex, ExceptionResource.ArgumentOutOfRange_Index);
      }

      int bytesAvailable = bytes.Length - byteIndex;

      fixed (char* pSource = chars)
      {
          fixed (byte* pDestination = bytes)
          {
              return GetBytesCommon(pSource + charIndex, charCount, pDestination + byteIndex, bytesAvailable);
          }
      }
  }
  278. // All of our public Encodings that don't use EncodingNLS must have this (including EncodingNLS)
  279. // So if you fix this, fix the others. Currently those include:
  280. // EncodingNLS, UTF7Encoding, UTF8Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding
  // Pointer-based overload: encodes `charCount` chars at `chars` into the
  // `byteCount`-byte buffer at `bytes`.
  [CLSCompliant(false)]
  public override unsafe int GetBytes(char* chars, int charCount, byte* bytes, int byteCount)
  {
      // Null checks: the source pointer is reported before the destination pointer.
      if (chars == null)
      {
          ThrowHelper.ThrowArgumentNullException(ExceptionArgument.chars, ExceptionResource.ArgumentNull_Array);
      }

      if (bytes == null)
      {
          ThrowHelper.ThrowArgumentNullException(ExceptionArgument.bytes, ExceptionResource.ArgumentNull_Array);
      }

      // Negative checks: charCount is reported before byteCount.
      if (charCount < 0)
      {
          ThrowHelper.ThrowArgumentOutOfRangeException(ExceptionArgument.charCount, ExceptionResource.ArgumentOutOfRange_NeedNonNegNum);
      }

      if (byteCount < 0)
      {
          ThrowHelper.ThrowArgumentOutOfRangeException(ExceptionArgument.byteCount, ExceptionResource.ArgumentOutOfRange_NeedNonNegNum);
      }

      int written = GetBytesCommon(chars, charCount, bytes, byteCount);
      return written;
  }
  // Span-based overload: pins both spans and forwards to the shared worker.
  public override unsafe int GetBytes(ReadOnlySpan<char> chars, Span<byte> bytes)
  {
      int sourceLength = chars.Length;
      int destinationLength = bytes.Length;

      // Null / empty spans are fine here.
      fixed (char* pSource = &MemoryMarshal.GetReference(chars))
      {
          fixed (byte* pDestination = &MemoryMarshal.GetReference(bytes))
          {
              return GetBytesCommon(pSource, sourceLength, pDestination, destinationLength);
          }
      }
  }
  // Shared worker behind every non-EncoderNLS GetBytes entry point.
  // A modification of this method should be copied in to each of the supported
  // encodings: ASCII, UTF8, UTF16, UTF32.
  [MethodImpl(MethodImplOptions.AggressiveInlining)]
  private unsafe int GetBytesCommon(char* pChars, int charCount, byte* pBytes, int byteCount)
  {
      Debug.Assert(charCount >= 0, "Caller shouldn't specify negative length buffer.");
      Debug.Assert(pChars != null || charCount == 0, "Input pointer shouldn't be null if non-zero length specified.");
      Debug.Assert(byteCount >= 0, "Caller shouldn't specify negative length buffer.");
      Debug.Assert(pBytes != null || byteCount == 0, "Input pointer shouldn't be null if non-zero length specified.");

      // Try the fast path first.
      int written = GetBytesFast(pChars, charCount, pBytes, byteCount, out int consumed);

      // If the fast path consumed every char we are done; otherwise hand the
      // remainder (and the progress so far) to the fallback path.
      return (consumed == charCount)
          ? written
          : GetBytesWithFallback(pChars, charCount, pBytes, byteCount, consumed, written);
  }
  // Transcodes as much of the input as the UTF-8 workhorse will take. Returns the
  // number of bytes written and reports the number of chars consumed.
  [MethodImpl(MethodImplOptions.AggressiveInlining)] // called directly by GetBytesCommon
  private protected sealed override unsafe int GetBytesFast(char* pChars, int charsLength, byte* pBytes, int bytesLength, out int charsConsumed)
  {
      // The OperationStatus returned by the workhorse is deliberately ignored: all that
      // matters is how far it got. Anything left over is handled by the fallback routine.
      Utf8Utility.TranscodeToUtf8(pChars, charsLength, pBytes, bytesLength, out char* pSourceRemaining, out byte* pDestinationRemaining);

      int bytesProduced = (int)(pDestinationRemaining - pBytes);
      charsConsumed = (int)(pSourceRemaining - pChars);
      return bytesProduced;
  }
  340. // Returns the number of characters produced by decoding a range of bytes
  341. // in a byte array.
  342. //
  343. // All of our public Encodings that don't use EncodingNLS must have this (including EncodingNLS)
  344. // So if you fix this, fix the others. Currently those include:
  345. // EncodingNLS, UTF7Encoding, UTF8Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding
  346. // parent method is safe
  // Returns the number of chars produced by decoding `count` bytes of `bytes`,
  // starting at `index`. Arguments are validated before any data is examined.
  public override unsafe int GetCharCount(byte[] bytes, int index, int count)
  {
      if (bytes is null)
      {
          ThrowHelper.ThrowArgumentNullException(ExceptionArgument.bytes, ExceptionResource.ArgumentNull_Array);
      }

      // Report the first negative argument, index before count.
      if (index < 0)
      {
          ThrowHelper.ThrowArgumentOutOfRangeException(ExceptionArgument.index, ExceptionResource.ArgumentOutOfRange_NeedNonNegNum);
      }

      if (count < 0)
      {
          ThrowHelper.ThrowArgumentOutOfRangeException(ExceptionArgument.count, ExceptionResource.ArgumentOutOfRange_NeedNonNegNum);
      }

      // Expressed as a subtraction so that index + count is never computed.
      if (count > bytes.Length - index)
      {
          ThrowHelper.ThrowArgumentOutOfRangeException(ExceptionArgument.bytes, ExceptionResource.ArgumentOutOfRange_IndexCountBuffer);
      }

      fixed (byte* pFirstByte = bytes)
      {
          return GetCharCountCommon(pFirstByte + index, count);
      }
  }
  367. // All of our public Encodings that don't use EncodingNLS must have this (including EncodingNLS)
  368. // So if you fix this, fix the others. Currently those include:
  369. // EncodingNLS, UTF7Encoding, UTF8Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding
  // Pointer-based overload: returns the number of chars produced by decoding
  // `count` bytes starting at `bytes`.
  [CLSCompliant(false)]
  public override unsafe int GetCharCount(byte* bytes, int count)
  {
      if (bytes is null)
      {
          ThrowHelper.ThrowArgumentNullException(ExceptionArgument.bytes, ExceptionResource.ArgumentNull_Array);
      }

      if (0 > count)
      {
          ThrowHelper.ThrowArgumentOutOfRangeException(ExceptionArgument.count, ExceptionResource.ArgumentOutOfRange_NeedNonNegNum);
      }

      int charTotal = GetCharCountCommon(bytes, count);
      return charTotal;
  }
  // Span-based overload: pins the span and forwards to the shared worker.
  public override unsafe int GetCharCount(ReadOnlySpan<byte> bytes)
  {
      int length = bytes.Length;

      // Passing a null pointer down is fine; the worker only requires a
      // non-null pointer when the length is non-zero.
      fixed (byte* pinnedBytes = &MemoryMarshal.GetReference(bytes))
      {
          return GetCharCountCommon(pinnedBytes, length);
      }
  }
  392. // All of our public Encodings that don't use EncodingNLS must have this (including EncodingNLS)
  393. // So if you fix this, fix the others. Currently those include:
  394. // EncodingNLS, UTF7Encoding, UTF8Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding
  395. // parent method is safe
  // Decodes `byteCount` bytes of `bytes`, starting at `byteIndex`, into `chars` beginning
  // at `charIndex`, and returns the value produced by the shared worker.
  public override unsafe int GetChars(byte[] bytes, int byteIndex, int byteCount,
      char[] chars, int charIndex)
  {
      // Null checks: the source array is reported before the destination array.
      if (bytes is null)
      {
          ThrowHelper.ThrowArgumentNullException(ExceptionArgument.bytes, ExceptionResource.ArgumentNull_Array);
      }

      if (chars is null)
      {
          ThrowHelper.ThrowArgumentNullException(ExceptionArgument.chars, ExceptionResource.ArgumentNull_Array);
      }

      // Negative checks: byteIndex is reported before byteCount.
      if (byteIndex < 0)
      {
          ThrowHelper.ThrowArgumentOutOfRangeException(ExceptionArgument.byteIndex, ExceptionResource.ArgumentOutOfRange_NeedNonNegNum);
      }

      if (byteCount < 0)
      {
          ThrowHelper.ThrowArgumentOutOfRangeException(ExceptionArgument.byteCount, ExceptionResource.ArgumentOutOfRange_NeedNonNegNum);
      }

      // Expressed as a subtraction so that byteIndex + byteCount is never computed.
      if (byteCount > bytes.Length - byteIndex)
      {
          ThrowHelper.ThrowArgumentOutOfRangeException(ExceptionArgument.bytes, ExceptionResource.ArgumentOutOfRange_IndexCountBuffer);
      }

      // Rejects a negative charIndex as well as one past the end of the array.
      if (charIndex < 0 || charIndex > chars.Length)
      {
          ThrowHelper.ThrowArgumentOutOfRangeException(ExceptionArgument.charIndex, ExceptionResource.ArgumentOutOfRange_Index);
      }

      int charsAvailable = chars.Length - charIndex;

      fixed (byte* pSource = bytes)
      {
          fixed (char* pDestination = chars)
          {
              return GetCharsCommon(pSource + byteIndex, byteCount, pDestination + charIndex, charsAvailable);
          }
      }
  }
  426. // All of our public Encodings that don't use EncodingNLS must have this (including EncodingNLS)
  427. // So if you fix this, fix the others. Currently those include:
  428. // EncodingNLS, UTF7Encoding, UTF8Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding
  // Pointer-based overload: decodes `byteCount` bytes at `bytes` into the
  // `charCount`-char buffer at `chars`.
  [CLSCompliant(false)]
  public override unsafe int GetChars(byte* bytes, int byteCount, char* chars, int charCount)
  {
      // Null checks: the source pointer is reported before the destination pointer.
      if (bytes == null)
      {
          ThrowHelper.ThrowArgumentNullException(ExceptionArgument.bytes, ExceptionResource.ArgumentNull_Array);
      }

      if (chars == null)
      {
          ThrowHelper.ThrowArgumentNullException(ExceptionArgument.chars, ExceptionResource.ArgumentNull_Array);
      }

      // Negative checks: byteCount is reported before charCount.
      if (byteCount < 0)
      {
          ThrowHelper.ThrowArgumentOutOfRangeException(ExceptionArgument.byteCount, ExceptionResource.ArgumentOutOfRange_NeedNonNegNum);
      }

      if (charCount < 0)
      {
          ThrowHelper.ThrowArgumentOutOfRangeException(ExceptionArgument.charCount, ExceptionResource.ArgumentOutOfRange_NeedNonNegNum);
      }

      int written = GetCharsCommon(bytes, byteCount, chars, charCount);
      return written;
  }
  // Span-based overload: pins both spans and forwards to the shared worker.
  public override unsafe int GetChars(ReadOnlySpan<byte> bytes, Span<char> chars)
  {
      int sourceLength = bytes.Length;
      int destinationLength = chars.Length;

      // Passing null pointers down is fine; the worker only requires non-null
      // pointers when the corresponding length is non-zero.
      fixed (byte* pSource = &MemoryMarshal.GetReference(bytes))
      {
          fixed (char* pDestination = &MemoryMarshal.GetReference(chars))
          {
              return GetCharsCommon(pSource, sourceLength, pDestination, destinationLength);
          }
      }
  }
  456. // WARNING: If we throw an error, then System.Resources.ResourceReader calls this method.
  457. // So if we're really broken, then that could also throw an error... recursively.
  458. // So try to make sure GetChars can at least process all uses by
  459. // System.Resources.ResourceReader!
  460. //
  // Note: We throw exceptions on individually encoded surrogates and other non-shortest forms.
  // If exceptions aren't turned on, then we drop all non-shortest forms & individual surrogates.
  // Shared worker behind every non-DecoderNLS GetChars entry point.
  // A modification of this method should be copied in to each of the supported
  // encodings: ASCII, UTF8, UTF16, UTF32.
  [MethodImpl(MethodImplOptions.AggressiveInlining)]
  private unsafe int GetCharsCommon(byte* pBytes, int byteCount, char* pChars, int charCount)
  {
      Debug.Assert(byteCount >= 0, "Caller shouldn't specify negative length buffer.");
      Debug.Assert(pBytes != null || byteCount == 0, "Input pointer shouldn't be null if non-zero length specified.");
      Debug.Assert(charCount >= 0, "Caller shouldn't specify negative length buffer.");
      Debug.Assert(pChars != null || charCount == 0, "Input pointer shouldn't be null if non-zero length specified.");

      // Try the fast path first.
      int written = GetCharsFast(pBytes, byteCount, pChars, charCount, out int consumed);

      // If the fast path consumed every byte we are done; otherwise hand the
      // remainder (and the progress so far) to the fallback path.
      return (consumed == byteCount)
          ? written
          : GetCharsWithFallback(pBytes, byteCount, pChars, charCount, consumed, written);
  }
  // Transcodes as much of the input as the UTF-16 workhorse will take. Returns the
  // number of chars written and reports the number of bytes consumed.
  [MethodImpl(MethodImplOptions.AggressiveInlining)] // called directly by GetCharsCommon
  private protected sealed override unsafe int GetCharsFast(byte* pBytes, int bytesLength, char* pChars, int charsLength, out int bytesConsumed)
  {
      // The OperationStatus returned by the workhorse is deliberately ignored: all that
      // matters is how far it got. Anything left over is handled by the fallback routine.
      Utf8Utility.TranscodeToUtf16(pBytes, bytesLength, pChars, charsLength, out byte* pSourceRemaining, out char* pDestinationRemaining);

      int charsProduced = (int)(pDestinationRemaining - pChars);
      bytesConsumed = (int)(pSourceRemaining - pBytes);
      return charsProduced;
  }
  // Fallback-aware decode. A replacement fallback that emits a single U+FFFD char is
  // special-cased because it is believed to be relatively common and can be handled more
  // efficiently than the base implementation; everything else goes to the base class.
  private protected sealed override unsafe int GetCharsWithFallback(ReadOnlySpan<byte> bytes, int originalBytesLength, Span<char> chars, int originalCharsLength, DecoderNLS? decoder)
  {
      // Use the decoder's fallback when a decoder is supplied, otherwise this encoding's own.
      DecoderFallback? activeFallback = (decoder is null) ? this.DecoderFallback : decoder.Fallback;

      if (activeFallback is DecoderReplacementFallback replacement
          && replacement.MaxCharCount == 1
          && replacement.DefaultString[0] == UnicodeUtility.ReplacementChar)
      {
          // Without a decoder there is no later call, so the input is the final block.
          bool isFinalBlock = decoder is null || decoder.MustFlush;

          // The OperationStatus is ignored; only the amount of data processed matters.
          Utf8.ToUtf16(bytes, chars, out int bytesRead, out int charsWritten, replaceInvalidSequences: true, isFinalBlock: isFinalBlock);

          // Drop what was consumed / written from the front of each span.
          bytes = bytes.Slice(bytesRead);
          chars = chars.Slice(charsWritten);
      }

      // Leftover input means either the special case did not apply or it could not
      // consume everything; the base implementation takes the slow fallback path.
      if (!bytes.IsEmpty)
      {
          return base.GetCharsWithFallback(bytes, originalBytesLength, chars, originalCharsLength, decoder);
      }

      // Everything was consumed: report the total number of chars written.
      return originalCharsLength - chars.Length;
  }
  523. // Returns a string containing the decoded representation of a range of
  524. // bytes in a byte array.
  525. //
  526. // All of our public Encodings that don't use EncodingNLS must have this (including EncodingNLS)
  527. // So if you fix this, fix the others. Currently those include:
  528. // EncodingNLS, UTF7Encoding, UTF8Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding
  529. // parent method is safe
  // Decodes `count` bytes of `bytes`, starting at `index`, into a new string.
  public override unsafe string GetString(byte[] bytes, int index, int count)
  {
      if (bytes is null)
      {
          ThrowHelper.ThrowArgumentNullException(ExceptionArgument.bytes, ExceptionResource.ArgumentNull_Array);
      }

      // Report the first negative argument, index before count.
      if (index < 0)
      {
          ThrowHelper.ThrowArgumentOutOfRangeException(ExceptionArgument.index, ExceptionResource.ArgumentOutOfRange_NeedNonNegNum);
      }

      if (count < 0)
      {
          ThrowHelper.ThrowArgumentOutOfRangeException(ExceptionArgument.count, ExceptionResource.ArgumentOutOfRange_NeedNonNegNum);
      }

      // Expressed as a subtraction so that index + count is never computed.
      if (count > bytes.Length - index)
      {
          ThrowHelper.ThrowArgumentOutOfRangeException(ExceptionArgument.bytes, ExceptionResource.ArgumentOutOfRange_IndexCountBuffer);
      }

      // Empty input is answered directly, which avoids problems with an empty buffer.
      if (count == 0)
      {
          return string.Empty;
      }

      fixed (byte* pFirstByte = bytes)
      {
          return string.CreateStringFromEncoding(pFirstByte + index, count, this);
      }
  }
  555. //
  556. // End of standard methods copied from EncodingNLS.cs
  557. //
  // Shared worker behind every non-DecoderNLS GetCharCount entry point.
  // A modification of this method should be copied in to each of the supported
  // encodings: ASCII, UTF8, UTF16, UTF32.
  [MethodImpl(MethodImplOptions.AggressiveInlining)]
  private unsafe int GetCharCountCommon(byte* pBytes, int byteCount)
  {
      Debug.Assert(byteCount >= 0, "Caller shouldn't specify negative length buffer.");
      Debug.Assert(pBytes != null || byteCount == 0, "Input pointer shouldn't be null if non-zero length specified.");

      // Fast path first. It never consults a fallback, so none is supplied.
      int charTotal = GetCharCountFast(pBytes, byteCount, fallback: null, out int consumed);

      if (consumed == byteCount)
      {
          // The fast path handled the whole input.
          return charTotal;
      }

      // Input remains, so count the rest through the fallback path. The fallback can
      // change the required output count in unexpected ways, hence the overflow check.
      charTotal += GetCharCountWithFallback(pBytes, byteCount, consumed);

      if (charTotal < 0)
      {
          ThrowConversionOverflow();
      }

      return charTotal;
  }
  // Fast-path char counter: measures the leading run of the buffer that
  // Utf8Utility.GetPointerToFirstInvalidByte accepts.
  //
  // pBytes        - start of the input buffer.
  // bytesLength   - number of bytes in the input buffer.
  // fallback      - not used by this implementation.
  // bytesConsumed - receives the distance, in bytes, from pBytes to the first invalid byte.
  //
  // Returns the consumed byte count plus the UTF-16 code unit adjustment reported by the utility.
  [MethodImpl(MethodImplOptions.AggressiveInlining)] // called directly by GetCharCountCommon
  private protected sealed override unsafe int GetCharCountFast(byte* pBytes, int bytesLength, DecoderFallback? fallback, out int bytesConsumed)
  {
      byte* pFirstInvalid = Utf8Utility.GetPointerToFirstInvalidByte(pBytes, bytesLength, out int utf16Adjustment, out _);

      int validByteCount = (int)(pFirstInvalid - pBytes);
      bytesConsumed = validByteCount;

      // UTF-16 code units never outnumber UTF-8 code units for the same text,
      // so this sum will not overflow.
      return validByteCount + utf16Adjustment;
  }
  // Creates a new DecoderNLS instance bound to this encoding.
  public override Decoder GetDecoder() => new DecoderNLS(this);
  // Creates a new EncoderNLS instance bound to this encoding.
  public override Encoder GetEncoder() => new EncoderNLS(this);
  599. //
  600. // Beginning of methods used by shared fallback logic.
  601. //
  // Reports how many UTF-8 code units are needed to encode 'value'.
  // Always returns true: every well-formed Rune maps to 1..4 UTF-8 code units.
  internal sealed override bool TryGetByteCount(Rune value, out int byteCount)
  {
      byteCount = value.Utf8SequenceLength;

      return true;
  }
  // Encodes 'value' as UTF-8 into 'bytes' and reports the number of bytes written.
  // Returns Done on success, otherwise DestinationTooSmall.
  internal sealed override OperationStatus EncodeRune(Rune value, Span<byte> bytes, out int bytesWritten)
  {
      // Every well-formed Rune is encodable as 1..4 UTF-8 code units, so the only
      // way the encode can fail is a destination that is too small.
      if (value.TryEncodeToUtf8(bytes, out bytesWritten))
      {
          return OperationStatus.Done;
      }

      return OperationStatus.DestinationTooSmall;
  }
  // Decodes the first Rune at the start of 'bytes' by delegating to Rune.DecodeFromUtf8,
  // passing its status, decoded value and consumed byte count straight through.
  internal sealed override OperationStatus DecodeFirstRune(ReadOnlySpan<byte> bytes, out Rune value, out int bytesConsumed)
      => Rune.DecodeFromUtf8(bytes, out value, out bytesConsumed);
  618. //
  619. // End of methods used by shared fallback logic.
  620. //
  // Computes an upper bound on the number of bytes needed to encode 'charCount' chars.
  //
  // Throws ArgumentOutOfRangeException when charCount is negative or when the
  // computed bound does not fit in an Int32.
  public override int GetMaxByteCount(int charCount)
  {
      if (charCount < 0)
      {
          throw new ArgumentOutOfRangeException(nameof(charCount), SR.ArgumentOutOfRange_NeedNonNegNum);
      }

      // One extra char covers a high surrogate left over from earlier input.
      // The arithmetic is done in 64 bits so the range check below can see an oversized result.
      long maxBytes = (long)charCount + 1;

      // Each input char may be replaced by up to MaxCharCount fallback chars.
      if (EncoderFallback.MaxCharCount > 1)
      {
          maxBytes *= EncoderFallback.MaxCharCount;
      }

      // At most 3 bytes per char (a surrogate pair is 4 bytes for 2 chars).
      maxBytes *= 3;

      if (maxBytes > int.MaxValue)
      {
          throw new ArgumentOutOfRangeException(nameof(charCount), SR.ArgumentOutOfRange_GetByteCountOverflow);
      }

      return (int)maxBytes;
  }
  // Computes an upper bound on the number of chars produced by decoding 'byteCount' bytes.
  //
  // Throws ArgumentOutOfRangeException when byteCount is negative or when the
  // computed bound does not fit in an Int32.
  public override int GetMaxCharCount(int byteCount)
  {
      if (byteCount < 0)
      {
          throw new ArgumentOutOfRangeException(nameof(byteCount), SR.ArgumentOutOfRange_NeedNonNegNum);
      }

      // One char per input byte, plus one more in case the first byte is the
      // final byte of a 4-byte sequence that decodes to a surrogate pair.
      long maxChars = (long)byteCount + 1;

      // Non-shortest forms fall back, and so does 11... followed by 11..., so in the
      // worst case every byte falls back; scale by the fallback's maximum output.
      if (DecoderFallback.MaxCharCount > 1)
      {
          maxChars *= DecoderFallback.MaxCharCount;
      }

      if (maxChars > int.MaxValue)
      {
          throw new ArgumentOutOfRangeException(nameof(byteCount), SR.ArgumentOutOfRange_GetCharCountOverflow);
      }

      return (int)maxChars;
  }
  // Returns the three bytes EF BB BF when this instance emits the UTF-8 identifier,
  // otherwise an empty array.
  public override byte[] GetPreamble()
  {
      if (!_emitUTF8Identifier)
      {
          return Array.Empty<byte>();
      }

      // Hand out a fresh array on every call so callers cannot modify a shared copy.
      return new byte[3] { 0xEF, 0xBB, 0xBF };
  }
  // Span-based view of the preamble.
  public override ReadOnlySpan<byte> Preamble
  {
      get
      {
          // A derived UTF8Encoding may have overridden GetPreamble, so defer to it
          // whenever the runtime type is not exactly UTF8Encoding.
          if (GetType() != typeof(UTF8Encoding))
          {
              return new ReadOnlySpan<byte>(GetPreamble());
          }

          if (_emitUTF8Identifier)
          {
              return PreambleSpan;
          }

          return default;
      }
  }
  // Two UTF8Encoding instances are equal when they agree on whether the UTF-8 identifier
  // is emitted and their encoder and decoder fallbacks compare equal. Anything that is
  // not a UTF8Encoding (including null) is never equal.
  public override bool Equals(object? value)
  {
      return value is UTF8Encoding other
          && _emitUTF8Identifier == other._emitUTF8Identifier
          && EncoderFallback.Equals(other.EncoderFallback)
          && DecoderFallback.Equals(other.DecoderFallback);
  }
  // Combines the hash codes of both fallbacks with the UTF-8 code page constant and the
  // identifier flag. The distribution is not great, but this type is relatively unlikely
  // to be used as the key in a hashtable.
  public override int GetHashCode()
  {
      int hash = EncoderFallback.GetHashCode();
      hash += DecoderFallback.GetHashCode();
      hash += UTF8_CODEPAGE;
      hash += _emitUTF8Identifier ? 1 : 0;
      return hash;
  }
  683. }
  684. }