// DecoderNLS.cs
  1. // Licensed to the .NET Foundation under one or more agreements.
  2. // The .NET Foundation licenses this file to you under the MIT license.
  3. // See the LICENSE file in the project root for more information.
  4. using System.Buffers;
  5. using System.Diagnostics;
  6. using System.Runtime.InteropServices;
  7. namespace System.Text
  8. {
  9. // A Decoder is used to decode a sequence of blocks of bytes into a
  10. // sequence of blocks of characters. Following instantiation of a decoder,
  11. // sequential blocks of bytes are converted into blocks of characters through
  12. // calls to the GetChars method. The decoder maintains state between the
  13. // conversions, allowing it to correctly decode byte sequences that span
  14. // adjacent blocks.
  15. //
  16. // Instances of specific implementations of the Decoder abstract base
  17. // class are typically obtained through calls to the GetDecoder method
  18. // of Encoding objects.
  19. internal class DecoderNLS : Decoder
  20. {
  // The encoding this decoder was created from; every decode call is delegated to it.
  private readonly Encoding _encoding;

  // The 'flush' argument captured from the most recent GetCharCount / GetChars / Convert call.
  private bool _mustFlush;

  // Set to true by GetCharCount / GetChars and to false by Convert before delegating to the
  // encoding. NOTE(review): the consumer of this flag lives outside this file; presumably it
  // selects between throwing and stopping early when the output buffer is too small - confirm.
  internal bool _throwOnOverflow;

  // Zeroed by Convert before decoding and read back afterwards to report 'bytesUsed'
  // (presumably updated by the encoding while it decodes - confirm against the encoding).
  internal int _bytesUsed;

  private int _leftoverBytes; // leftover data from a previous invocation of GetChars (up to 4 bytes)
  private int _leftoverByteCount; // number of bytes of actual data in _leftoverBytes

  /// <summary>
  /// Creates a decoder bound to <paramref name="encoding"/>, adopts that encoding's
  /// decoder fallback, and puts the decoder into its initial (reset) state.
  /// </summary>
  internal DecoderNLS(Encoding encoding)
  {
      _encoding = encoding;
      _fallback = encoding.DecoderFallback;
      Reset();
  }
  /// <summary>
  /// Returns the decoder to its initial state: discards any leftover partial-sequence bytes
  /// and resets the fallback buffer if one has been created.
  /// </summary>
  public override void Reset()
  {
      ClearLeftoverData();

      // The fallback buffer may not have been created yet; only reset it when it exists.
      if (_fallbackBuffer != null)
      {
          _fallbackBuffer.Reset();
      }
  }
  /// <summary>
  /// Non-flushing convenience overload: forwards to the four-argument overload with
  /// <c>flush</c> set to <c>false</c>.
  /// </summary>
  public override int GetCharCount(byte[] bytes, int index, int count)
      => GetCharCount(bytes, index, count, flush: false);
  /// <summary>
  /// Validates the array arguments, pins the array, and forwards to the pointer-based
  /// <c>GetCharCount</c> overload.
  /// </summary>
  /// <exception cref="ArgumentNullException"><paramref name="bytes"/> is null.</exception>
  /// <exception cref="ArgumentOutOfRangeException">
  /// <paramref name="index"/> or <paramref name="count"/> is negative, or the range they
  /// describe does not fit inside <paramref name="bytes"/>.
  /// </exception>
  public override unsafe int GetCharCount(byte[] bytes, int index, int count, bool flush)
  {
      if (bytes == null)
      {
          throw new ArgumentNullException(nameof(bytes), SR.ArgumentNull_Array);
      }

      if (index < 0)
      {
          throw new ArgumentOutOfRangeException(nameof(index), SR.ArgumentOutOfRange_NeedNonNegNum);
      }

      if (count < 0)
      {
          throw new ArgumentOutOfRangeException(nameof(count), SR.ArgumentOutOfRange_NeedNonNegNum);
      }

      if (count > bytes.Length - index)
      {
          throw new ArgumentOutOfRangeException(nameof(bytes), SR.ArgumentOutOfRange_IndexCountBuffer);
      }

      // Pin through a span reference rather than 'fixed (byte* p = bytes)': the latter yields
      // a null pointer for a zero-length array, which the pointer overload rejects as null.
      Span<byte> byteSpan = bytes;
      fixed (byte* pBytes = &MemoryMarshal.GetReference(byteSpan))
      {
          return GetCharCount(pBytes + index, count, flush);
      }
  }
  /// <summary>
  /// Pointer-based worker: validates the arguments, records the flush request on the decoder,
  /// and asks the owning encoding for the char count, passing this decoder along.
  /// </summary>
  /// <exception cref="ArgumentNullException"><paramref name="bytes"/> is null.</exception>
  /// <exception cref="ArgumentOutOfRangeException"><paramref name="count"/> is negative.</exception>
  public override unsafe int GetCharCount(byte* bytes, int count, bool flush)
  {
      if (bytes == null)
      {
          throw new ArgumentNullException(nameof(bytes), SR.ArgumentNull_Array);
      }

      if (count < 0)
      {
          throw new ArgumentOutOfRangeException(nameof(count), SR.ArgumentOutOfRange_NeedNonNegNum);
      }

      // Publish the per-call state that the encoding reads back from this decoder.
      _throwOnOverflow = true;
      _mustFlush = flush;

      Debug.Assert(_encoding != null);
      int charCount = _encoding.GetCharCount(bytes, count, this);
      return charCount;
  }
  /// <summary>
  /// Non-flushing convenience overload: forwards to the six-argument overload with
  /// <c>flush</c> set to <c>false</c>.
  /// </summary>
  public override int GetChars(byte[] bytes, int byteIndex, int byteCount, char[] chars, int charIndex)
      => GetChars(bytes, byteIndex, byteCount, chars, charIndex, flush: false);
  /// <summary>
  /// Validates the array arguments, pins both arrays, and forwards to the pointer-based
  /// <c>GetChars</c> overload. The output capacity handed on is everything in
  /// <paramref name="chars"/> from <paramref name="charIndex"/> to the end of the array.
  /// </summary>
  /// <exception cref="ArgumentNullException"><paramref name="bytes"/> or <paramref name="chars"/> is null.</exception>
  /// <exception cref="ArgumentOutOfRangeException">
  /// <paramref name="byteIndex"/> or <paramref name="byteCount"/> is negative, the byte range
  /// does not fit inside <paramref name="bytes"/>, or <paramref name="charIndex"/> lies outside
  /// <paramref name="chars"/>.
  /// </exception>
  public override unsafe int GetChars(byte[] bytes, int byteIndex, int byteCount,
      char[] chars, int charIndex, bool flush)
  {
      if (bytes == null)
      {
          throw new ArgumentNullException(nameof(bytes), SR.ArgumentNull_Array);
      }

      if (chars == null)
      {
          throw new ArgumentNullException(nameof(chars), SR.ArgumentNull_Array);
      }

      if (byteIndex < 0)
      {
          throw new ArgumentOutOfRangeException(nameof(byteIndex), SR.ArgumentOutOfRange_NeedNonNegNum);
      }

      if (byteCount < 0)
      {
          throw new ArgumentOutOfRangeException(nameof(byteCount), SR.ArgumentOutOfRange_NeedNonNegNum);
      }

      if (byteCount > bytes.Length - byteIndex)
      {
          throw new ArgumentOutOfRangeException(nameof(bytes), SR.ArgumentOutOfRange_IndexCountBuffer);
      }

      // charIndex == chars.Length is allowed: it simply leaves zero chars of output room.
      if (charIndex > chars.Length || charIndex < 0)
      {
          throw new ArgumentOutOfRangeException(nameof(charIndex), SR.ArgumentOutOfRange_Index);
      }

      // Remaining output capacity, not the size of the whole array.
      int charCount = chars.Length - charIndex;

      // Pin through span references so that zero-length arrays still produce non-null
      // pointers (the pointer overload rejects null).
      Span<byte> byteSpan = bytes;
      Span<char> charSpan = chars;
      fixed (byte* pBytes = &MemoryMarshal.GetReference(byteSpan))
      {
          fixed (char* pChars = &MemoryMarshal.GetReference(charSpan))
          {
              return GetChars(pBytes + byteIndex, byteCount, pChars + charIndex, charCount, flush);
          }
      }
  }
  /// <summary>
  /// Pointer-based worker: validates the arguments, records the flush request on the decoder,
  /// and delegates the actual decoding to the owning encoding, passing this decoder along.
  /// </summary>
  /// <returns>The value returned by the encoding's <c>GetChars</c>.</returns>
  /// <exception cref="ArgumentNullException"><paramref name="chars"/> or <paramref name="bytes"/> is null.</exception>
  /// <exception cref="ArgumentOutOfRangeException">
  /// <paramref name="byteCount"/> or <paramref name="charCount"/> is negative.
  /// </exception>
  public override unsafe int GetChars(byte* bytes, int byteCount,
      char* chars, int charCount, bool flush)
  {
      // 'chars' is checked before 'bytes' so the reported parameter name matches when both are null.
      if (chars == null)
      {
          throw new ArgumentNullException(nameof(chars), SR.ArgumentNull_Array);
      }

      if (bytes == null)
      {
          throw new ArgumentNullException(nameof(bytes), SR.ArgumentNull_Array);
      }

      if (byteCount < 0)
      {
          throw new ArgumentOutOfRangeException(nameof(byteCount), SR.ArgumentOutOfRange_NeedNonNegNum);
      }

      if (charCount < 0)
      {
          throw new ArgumentOutOfRangeException(nameof(charCount), SR.ArgumentOutOfRange_NeedNonNegNum);
      }

      // Publish the per-call state that the encoding reads back from this decoder.
      _throwOnOverflow = true;
      _mustFlush = flush;

      Debug.Assert(_encoding != null);
      int charsWritten = _encoding.GetChars(bytes, byteCount, chars, charCount, this);
      return charsWritten;
  }
  /// <summary>
  /// Array-based <c>Convert</c>, intended for callers whose output buffer might not be large
  /// enough for all of the input. Validates the arguments, pins both arrays, and forwards to
  /// the pointer-based overload.
  /// </summary>
  /// <exception cref="ArgumentNullException"><paramref name="bytes"/> or <paramref name="chars"/> is null.</exception>
  /// <exception cref="ArgumentOutOfRangeException">
  /// Any index or count is negative, or either index/count pair describes a range that does
  /// not fit inside its array.
  /// </exception>
  public override unsafe void Convert(byte[] bytes, int byteIndex, int byteCount,
      char[] chars, int charIndex, int charCount, bool flush,
      out int bytesUsed, out int charsUsed, out bool completed)
  {
      if (bytes == null)
      {
          throw new ArgumentNullException(nameof(bytes), SR.ArgumentNull_Array);
      }

      if (chars == null)
      {
          throw new ArgumentNullException(nameof(chars), SR.ArgumentNull_Array);
      }

      if (byteIndex < 0)
      {
          throw new ArgumentOutOfRangeException(nameof(byteIndex), SR.ArgumentOutOfRange_NeedNonNegNum);
      }

      if (byteCount < 0)
      {
          throw new ArgumentOutOfRangeException(nameof(byteCount), SR.ArgumentOutOfRange_NeedNonNegNum);
      }

      if (charIndex < 0)
      {
          throw new ArgumentOutOfRangeException(nameof(charIndex), SR.ArgumentOutOfRange_NeedNonNegNum);
      }

      if (charCount < 0)
      {
          throw new ArgumentOutOfRangeException(nameof(charCount), SR.ArgumentOutOfRange_NeedNonNegNum);
      }

      if (byteCount > bytes.Length - byteIndex)
      {
          throw new ArgumentOutOfRangeException(nameof(bytes), SR.ArgumentOutOfRange_IndexCountBuffer);
      }

      if (charCount > chars.Length - charIndex)
      {
          throw new ArgumentOutOfRangeException(nameof(chars), SR.ArgumentOutOfRange_IndexCountBuffer);
      }

      // Pin through span references so that zero-length arrays still produce non-null
      // pointers (the pointer overload rejects null).
      Span<byte> byteSpan = bytes;
      Span<char> charSpan = chars;
      fixed (byte* pBytes = &MemoryMarshal.GetReference(byteSpan))
      fixed (char* pChars = &MemoryMarshal.GetReference(charSpan))
      {
          Convert(pBytes + byteIndex, byteCount, pChars + charIndex, charCount, flush,
              out bytesUsed, out charsUsed, out completed);
      }
  }
  /// <summary>
  /// Pointer-based <c>Convert</c>. Primes the decoder's per-call state, delegates the decoding
  /// to the owning encoding, then reports how much input was used, how many chars were
  /// produced, and whether the conversion is complete.
  /// </summary>
  /// <exception cref="ArgumentNullException"><paramref name="chars"/> or <paramref name="bytes"/> is null.</exception>
  /// <exception cref="ArgumentOutOfRangeException">
  /// <paramref name="byteCount"/> or <paramref name="charCount"/> is negative.
  /// </exception>
  public override unsafe void Convert(byte* bytes, int byteCount,
      char* chars, int charCount, bool flush,
      out int bytesUsed, out int charsUsed, out bool completed)
  {
      // 'chars' is checked before 'bytes' so the reported parameter name matches when both are null.
      if (chars == null)
      {
          throw new ArgumentNullException(nameof(chars), SR.ArgumentNull_Array);
      }

      if (bytes == null)
      {
          throw new ArgumentNullException(nameof(bytes), SR.ArgumentNull_Array);
      }

      if (byteCount < 0)
      {
          throw new ArgumentOutOfRangeException(nameof(byteCount), SR.ArgumentOutOfRange_NeedNonNegNum);
      }

      if (charCount < 0)
      {
          throw new ArgumentOutOfRangeException(nameof(charCount), SR.ArgumentOutOfRange_NeedNonNegNum);
      }

      // Convert must not throw when the output is too small, so the overflow flag is cleared
      // here (GetChars sets it instead). The byte counter is zeroed so it can be read back below.
      _bytesUsed = 0;
      _throwOnOverflow = false;
      _mustFlush = flush;

      Debug.Assert(_encoding != null);
      charsUsed = _encoding.GetChars(bytes, byteCount, chars, charCount, this);
      bytesUsed = _bytesUsed;

      // Per MSDN, "The completed output parameter indicates whether all the data in the input
      // buffer was converted and stored in the output buffer." So all three must hold:
      // every input byte was consumed, no partial sequence is held back, and the fallback
      // buffer has nothing left to emit. The checks run in this order and stop at the first failure.
      if (bytesUsed != byteCount)
      {
          completed = false;
      }
      else if (HasState)
      {
          completed = false;
      }
      else
      {
          completed = _fallbackBuffer is null || _fallbackBuffer.Remaining == 0;
      }
  }
  /// <summary>
  /// The flush flag recorded by the most recent GetCharCount / GetChars / Convert call
  /// (or false after <c>ClearMustFlush</c>).
  /// </summary>
  public bool MustFlush
  {
      get { return _mustFlush; }
  }
  /// <summary>
  /// True when the decoder is holding leftover bytes from an earlier call.
  /// Virtual, so a derived decoder can report state of its own.
  /// </summary>
  internal virtual bool HasState
  {
      get { return _leftoverByteCount != 0; }
  }
  /// <summary>
  /// Lets the encoding clear the flush request instead of throwing (used from ThrowCharsOverflow).
  /// </summary>
  internal void ClearMustFlush() => _mustFlush = false;
  /// <summary>
  /// Returns a read-only view over the leftover bytes currently stored in the decoder:
  /// the first <c>_leftoverByteCount</c> bytes of the <c>_leftoverBytes</c> field.
  /// The span aliases the field itself; nothing is copied.
  /// </summary>
  internal ReadOnlySpan<byte> GetLeftoverData()
  {
      // Reinterpret the single int field as its raw bytes, then trim to the populated prefix.
      ReadOnlySpan<byte> storage = MemoryMarshal.AsBytes(new ReadOnlySpan<int>(ref _leftoverBytes, 1));
      return storage.Slice(0, _leftoverByteCount);
  }
  /// <summary>
  /// Stores <paramref name="bytes"/> as the decoder's leftover data by copying them into the
  /// raw bytes of the <c>_leftoverBytes</c> field and recording their count.
  /// </summary>
  /// <remarks>
  /// The backing storage is one int, so the copy fails (per <c>ReadOnlySpan.CopyTo</c>) if
  /// <paramref name="bytes"/> is longer than that; the count is only updated after a successful copy.
  /// </remarks>
  internal void SetLeftoverData(ReadOnlySpan<byte> bytes)
  {
      Span<byte> storage = MemoryMarshal.AsBytes(new Span<int>(ref _leftoverBytes, 1));
      bytes.CopyTo(storage);
      _leftoverByteCount = bytes.Length;
  }
  /// <summary>
  /// True when at least one leftover byte from a previous call is stored in the decoder.
  /// </summary>
  internal bool HasLeftoverData
  {
      get { return _leftoverByteCount != 0; }
  }
  /// <summary>
  /// Discards any stored leftover bytes by zeroing the count; the byte storage itself is
  /// left untouched because only the counted prefix is ever read.
  /// </summary>
  internal void ClearLeftoverData() => _leftoverByteCount = 0;
  /// <summary>
  /// Computes how many chars the decoder's leftover bytes, combined with the start of
  /// <paramref name="bytes"/>, would produce. The stored leftover data is not modified.
  /// </summary>
  /// <param name="bytes">The new incoming bytes that follow the stored leftover bytes.</param>
  /// <param name="bytesConsumed">
  /// Receives the number of bytes of <paramref name="bytes"/> (not counting the leftover
  /// bytes) that were used to decode the first scalar value.
  /// </param>
  /// <returns>The number of chars the combined data would produce (possibly zero).</returns>
  internal int DrainLeftoverDataForGetCharCount(ReadOnlySpan<byte> bytes, out int bytesConsumed)
  {
      // We _should not_ have leftover fallback data from a previous invocation: we would end up
      // consuming it and corrupting whatever Convert call is in progress. Unlike EncoderNLS,
      // this is only a Debug.Assert; no exception is thrown.
      Debug.Assert(_fallbackBuffer is null || _fallbackBuffer.Remaining == 0, "Should have no data remaining in the fallback buffer.");
      Debug.Assert(HasLeftoverData, "Caller shouldn't invoke this routine unless there's leftover data in the decoder.");

      // Build "leftover bytes + as much new input as fits" in a 4-byte scratch buffer.
      Span<byte> scratch = stackalloc byte[4];
      Span<byte> combinedBuffer = scratch.Slice(0, ConcatInto(GetLeftoverData(), bytes, scratch));

      Debug.Assert(_encoding != null);
      OperationStatus status = _encoding.DecodeFirstRune(combinedBuffer, out Rune value, out int combinedBufferBytesConsumed);

      int charCount = 0;

      if (status == OperationStatus.Done)
      {
          // Successfully transcoded bytes -> chars.
          charCount = value.Utf16SequenceLength;
      }
      else if (status != OperationStatus.NeedMoreData || MustFlush)
      {
          // Invalid data, or incomplete data while flushing (treated as equivalent to bad data).
          // Incomplete data without a flush skips this branch: some bytes consumed, 0 chars output.
          if (status != OperationStatus.NeedMoreData && status != OperationStatus.InvalidData)
          {
              Debug.Fail("Unexpected OperationStatus return value.");
          }

          // Couldn't decode the buffer, so hand it to the fallback instead. The negative index
          // tells the fallback that the invalid sequence began that many bytes before the
          // current buffer, i.e. at the start of the leftover bytes.
          if (FallbackBuffer.Fallback(combinedBuffer.Slice(0, combinedBufferBytesConsumed).ToArray(), index: -_leftoverByteCount))
          {
              charCount = _fallbackBuffer!.DrainRemainingDataForGetCharCount();
              Debug.Assert(charCount >= 0, "Fallback buffer shouldn't have returned a negative char count.");
          }
      }

      // Amount of the 'bytes' buffer consumed just now (the leftover bytes do not count).
      bytesConsumed = combinedBufferBytesConsumed - _leftoverByteCount;
      return charCount;
  }
  /// <summary>
  /// Decodes the decoder's leftover bytes, combined with the start of <paramref name="bytes"/>,
  /// into <paramref name="chars"/>, then either stores the still-incomplete combined data as
  /// the new leftover data or clears the leftover data.
  /// </summary>
  /// <param name="bytes">The new incoming bytes that follow the stored leftover bytes.</param>
  /// <param name="chars">The destination for the decoded (or fallback) chars.</param>
  /// <param name="bytesConsumed">
  /// Receives the number of bytes of <paramref name="bytes"/> (not counting the leftover
  /// bytes) that were consumed.
  /// </param>
  /// <returns>The number of chars written to <paramref name="chars"/> (possibly zero).</returns>
  /// <remarks>
  /// When <paramref name="chars"/> is too small, the method calls the encoding's
  /// <c>ThrowCharsOverflow</c> and does not return normally; the leftover data is left as it was.
  /// </remarks>
  internal int DrainLeftoverDataForGetChars(ReadOnlySpan<byte> bytes, Span<char> chars, out int bytesConsumed)
  {
      // We _should not_ have leftover fallback data from a previous invocation: we would end up
      // consuming it and corrupting whatever Convert call is in progress. Unlike EncoderNLS,
      // this is only a Debug.Assert; no exception is thrown.
      Debug.Assert(_fallbackBuffer is null || _fallbackBuffer.Remaining == 0, "Should have no data remaining in the fallback buffer.");
      Debug.Assert(HasLeftoverData, "Caller shouldn't invoke this routine unless there's leftover data in the decoder.");

      // Build "leftover bytes + as much new input as fits" in a 4-byte scratch buffer.
      Span<byte> scratch = stackalloc byte[4];
      Span<byte> combinedBuffer = scratch.Slice(0, ConcatInto(GetLeftoverData(), bytes, scratch));

      Debug.Assert(_encoding != null);
      OperationStatus status = _encoding.DecodeFirstRune(combinedBuffer, out Rune value, out int combinedBufferBytesConsumed);

      int charsWritten = 0;
      bool persistNewCombinedBuffer = false;
      bool destinationTooSmall = false;

      if (status == OperationStatus.Done)
      {
          // Decoded a scalar value; the only way to fail now is a too-small destination.
          destinationTooSmall = !value.TryEncodeToUtf16(chars, out charsWritten);
      }
      else if (status == OperationStatus.NeedMoreData && !MustFlush)
      {
          // Successfully consumed some bytes but output no chars; keep them for the next call.
          persistNewCombinedBuffer = true;
      }
      else
      {
          // Invalid data, or incomplete data while flushing (treated as equivalent to bad data).
          if (status != OperationStatus.NeedMoreData && status != OperationStatus.InvalidData)
          {
              Debug.Fail("Unexpected OperationStatus return value.");
          }

          // Couldn't decode the buffer, so hand it to the fallback instead. The fallback
          // mechanism relies on a negative index to convey "the start of the invalid sequence
          // was some number of bytes back before the current buffer." The invalid sequence must
          // have started at the beginning of the leftover bytes, so we signal that the caller
          // must backtrack that many bytes to find its real start.
          if (FallbackBuffer.Fallback(combinedBuffer.Slice(0, combinedBufferBytesConsumed).ToArray(), index: -_leftoverByteCount))
          {
              destinationTooSmall = !_fallbackBuffer!.TryDrainRemainingDataForGetChars(chars, out charsWritten);
          }
      }

      if (destinationTooSmall)
      {
          // We're unable to write the chars to the output buffer. Unlike EncoderNLS, this type
          // does not allow partial writes. Draining leftover data is the first operation of any
          // DecoderNLS API, so nothing before us could have made forward progress and we must
          // fail immediately.
          _encoding.ThrowCharsOverflow(this, nothingDecoded: true);
          throw null!; // will never reach this point
      }

      // Report the number of bytes (from the new incoming span) consumed just now: the bytes
      // needed from the combined buffer minus the original leftover count. This must be
      // computed before SetLeftoverData / ClearLeftoverData overwrite _leftoverByteCount.
      bytesConsumed = combinedBufferBytesConsumed - _leftoverByteCount;

      if (persistNewCombinedBuffer)
      {
          Debug.Assert(combinedBufferBytesConsumed == combinedBuffer.Length, "We should be asked to persist the entire combined buffer.");
          SetLeftoverData(combinedBuffer); // still only partial data; a future call to Convert will need it
      }
      else
      {
          ClearLeftoverData(); // no partial data remains; later calls take the normal paths
      }

      return charsWritten;
  }
  /// <summary>
  /// Given a byte buffer <paramref name="dest"/>, concatenates as much of <paramref name="srcLeft"/> followed
  /// by <paramref name="srcRight"/> into it as will fit, then returns the total number of bytes copied.
  /// </summary>
  /// <remarks>
  /// Copying stops as soon as <paramref name="dest"/> is full; any remaining source bytes are ignored.
  /// </remarks>
  private static int ConcatInto(ReadOnlySpan<byte> srcLeft, ReadOnlySpan<byte> srcRight, Span<byte> dest)
  {
      int written = 0;

      foreach (byte b in srcLeft)
      {
          if (written >= dest.Length)
          {
              return written; // destination is full
          }

          dest[written] = b;
          written++;
      }

      foreach (byte b in srcRight)
      {
          if (written >= dest.Length)
          {
              return written; // destination is full
          }

          dest[written] = b;
          written++;
      }

      return written;
  }
  353. }
  354. }