2
0

DecoderNLS.cs 19 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446
  1. // Licensed to the .NET Foundation under one or more agreements.
  2. // The .NET Foundation licenses this file to you under the MIT license.
  3. // See the LICENSE file in the project root for more information.
  4. using System.Buffers;
  5. using System.Diagnostics;
  6. using System.Runtime.InteropServices;
  7. namespace System.Text
  8. {
  9. // A Decoder is used to decode a sequence of blocks of bytes into a
  10. // sequence of blocks of characters. Following instantiation of a decoder,
  11. // sequential blocks of bytes are converted into blocks of characters through
  12. // calls to the GetChars method. The decoder maintains state between the
  13. // conversions, allowing it to correctly decode byte sequences that span
  14. // adjacent blocks.
  15. //
  16. // Instances of specific implementations of the Decoder abstract base
  17. // class are typically obtained through calls to the GetDecoder method
  18. // of Encoding objects.
  19. internal class DecoderNLS : Decoder
  20. {
  // The encoding that created this decoder. All decoding work is delegated to it.
  // It is assigned exactly once, in the constructor, hence readonly.
  private readonly Encoding _encoding;

  // Flush flag recorded by the most recent GetCharCount / GetChars / Convert call.
  private bool _mustFlush;

  // Set to true by GetCharCount / GetChars and to false by Convert.
  internal bool _throwOnOverflow;

  // Zeroed by Convert before it calls into the encoding, then reported back as bytesUsed.
  internal int _bytesUsed;

  private int _leftoverBytes; // leftover data from a previous invocation of GetChars (up to 4 bytes)
  private int _leftoverByteCount; // number of bytes of actual data in _leftoverBytes

  /// <summary>
  /// Creates a decoder bound to <paramref name="encoding"/>, adopts that encoding's
  /// decoder fallback, and resets the decoder.
  /// </summary>
  /// <param name="encoding">The encoding that performs the actual decoding.</param>
  internal DecoderNLS(Encoding encoding)
  {
      _encoding = encoding;
      _fallback = _encoding.DecoderFallback;

      // Reset() is an override that is not sealed, so a further-derived
      // implementation (if any) is what runs here.
      Reset();
  }
  /// <summary>
  /// Discards any leftover bytes held by this decoder and, when a fallback
  /// buffer has already been created, resets that buffer too.
  /// </summary>
  public override void Reset()
  {
      ClearLeftoverData();

      if (_fallbackBuffer != null)
      {
          _fallbackBuffer.Reset();
      }
  }
  /// <summary>
  /// Counts the chars produced by decoding a segment of <paramref name="bytes"/>
  /// without flushing; forwards to the four-argument overload with flush set to false.
  /// </summary>
  public override int GetCharCount(byte[] bytes, int index, int count)
      => GetCharCount(bytes, index, count, flush: false);
  /// <summary>
  /// Validates the array segment, pins the array, and forwards to the
  /// pointer-based <c>GetCharCount</c> overload.
  /// </summary>
  /// <exception cref="ArgumentNullException"><paramref name="bytes"/> is null.</exception>
  /// <exception cref="ArgumentOutOfRangeException">
  /// <paramref name="index"/> or <paramref name="count"/> is negative, or fewer than
  /// <paramref name="count"/> bytes remain in <paramref name="bytes"/> after <paramref name="index"/>.
  /// </exception>
  public override unsafe int GetCharCount(byte[] bytes, int index, int count, bool flush)
  {
      if (bytes == null)
      {
          throw new ArgumentNullException(nameof(bytes), SR.ArgumentNull_Array);
      }

      // index is checked first, so it is the one reported when both values are negative.
      if (index < 0)
      {
          throw new ArgumentOutOfRangeException(nameof(index), SR.ArgumentOutOfRange_NeedNonNegNum);
      }

      if (count < 0)
      {
          throw new ArgumentOutOfRangeException(nameof(count), SR.ArgumentOutOfRange_NeedNonNegNum);
      }

      int available = bytes.Length - index;
      if (available < count)
      {
          throw new ArgumentOutOfRangeException(nameof(bytes), SR.ArgumentOutOfRange_IndexCountBuffer);
      }

      // Pin the array through a span reference and hand the offset pointer to the pointer overload.
      Span<byte> source = bytes;
      fixed (byte* pBytes = &MemoryMarshal.GetReference(source))
      {
          return GetCharCount(pBytes + index, count, flush);
      }
  }
  /// <summary>
  /// Records the flush request, marks the decoder as throw-on-overflow, and asks the
  /// owning encoding to count the chars for the given bytes using this decoder.
  /// </summary>
  /// <exception cref="ArgumentNullException"><paramref name="bytes"/> is null.</exception>
  /// <exception cref="ArgumentOutOfRangeException"><paramref name="count"/> is negative.</exception>
  public unsafe override int GetCharCount(byte* bytes, int count, bool flush)
  {
      if (bytes == null)
      {
          throw new ArgumentNullException(nameof(bytes), SR.ArgumentNull_Array);
      }

      if (count < 0)
      {
          throw new ArgumentOutOfRangeException(nameof(count), SR.ArgumentOutOfRange_NeedNonNegNum);
      }

      // State handed to the encoding through this decoder instance.
      _throwOnOverflow = true;
      _mustFlush = flush;

      Debug.Assert(_encoding != null);
      int charCount = _encoding.GetCharCount(bytes, count, this);
      return charCount;
  }
  /// <summary>
  /// Decodes a segment of <paramref name="bytes"/> into <paramref name="chars"/> without
  /// flushing; forwards to the six-argument overload with flush set to false.
  /// </summary>
  public override int GetChars(byte[] bytes, int byteIndex, int byteCount,
      char[] chars, int charIndex)
      => GetChars(bytes, byteIndex, byteCount, chars, charIndex, flush: false);
  /// <summary>
  /// Validates both array arguments, pins them, and forwards to the pointer-based
  /// <c>GetChars</c> overload. The output capacity passed along is the room left in
  /// <paramref name="chars"/> after <paramref name="charIndex"/>.
  /// </summary>
  /// <exception cref="ArgumentNullException">
  /// <paramref name="bytes"/> or <paramref name="chars"/> is null.
  /// </exception>
  /// <exception cref="ArgumentOutOfRangeException">
  /// <paramref name="byteIndex"/> or <paramref name="byteCount"/> is negative, the byte
  /// segment extends past the end of <paramref name="bytes"/>, or
  /// <paramref name="charIndex"/> lies outside <paramref name="chars"/>.
  /// </exception>
  public override unsafe int GetChars(byte[] bytes, int byteIndex, int byteCount,
      char[] chars, int charIndex, bool flush)
  {
      // bytes is checked before chars, so it is the one reported when both are null.
      if (bytes == null)
      {
          throw new ArgumentNullException(nameof(bytes), SR.ArgumentNull_Array);
      }

      if (chars == null)
      {
          throw new ArgumentNullException(nameof(chars), SR.ArgumentNull_Array);
      }

      if (byteIndex < 0)
      {
          throw new ArgumentOutOfRangeException(nameof(byteIndex), SR.ArgumentOutOfRange_NeedNonNegNum);
      }

      if (byteCount < 0)
      {
          throw new ArgumentOutOfRangeException(nameof(byteCount), SR.ArgumentOutOfRange_NeedNonNegNum);
      }

      if (bytes.Length - byteIndex < byteCount)
      {
          throw new ArgumentOutOfRangeException(nameof(bytes), SR.ArgumentOutOfRange_IndexCountBuffer);
      }

      if (charIndex < 0 || charIndex > chars.Length)
      {
          throw new ArgumentOutOfRangeException(nameof(charIndex), SR.ArgumentOutOfRange_Index);
      }

      // The pointer overload wants the number of chars that may be written,
      // not the size of the whole array.
      int charCount = chars.Length - charIndex;

      Span<byte> source = bytes;
      Span<char> destination = chars;
      fixed (byte* pBytes = &MemoryMarshal.GetReference(source))
      fixed (char* pChars = &MemoryMarshal.GetReference(destination))
      {
          return GetChars(pBytes + byteIndex, byteCount, pChars + charIndex, charCount, flush);
      }
  }
  /// <summary>
  /// Records the flush request, marks the decoder as throw-on-overflow, and asks the
  /// owning encoding to decode the bytes into <paramref name="chars"/> using this decoder.
  /// </summary>
  /// <returns>The value returned by the encoding's <c>GetChars</c> call.</returns>
  /// <exception cref="ArgumentNullException">
  /// <paramref name="chars"/> or <paramref name="bytes"/> is null.
  /// </exception>
  /// <exception cref="ArgumentOutOfRangeException">
  /// <paramref name="byteCount"/> or <paramref name="charCount"/> is negative.
  /// </exception>
  public unsafe override int GetChars(byte* bytes, int byteCount,
      char* chars, int charCount, bool flush)
  {
      // chars is checked before bytes, so it is the one reported when both are null.
      if (chars == null)
      {
          throw new ArgumentNullException(nameof(chars), SR.ArgumentNull_Array);
      }

      if (bytes == null)
      {
          throw new ArgumentNullException(nameof(bytes), SR.ArgumentNull_Array);
      }

      if (byteCount < 0)
      {
          throw new ArgumentOutOfRangeException(nameof(byteCount), SR.ArgumentOutOfRange_NeedNonNegNum);
      }

      if (charCount < 0)
      {
          throw new ArgumentOutOfRangeException(nameof(charCount), SR.ArgumentOutOfRange_NeedNonNegNum);
      }

      // State handed to the encoding through this decoder instance.
      _throwOnOverflow = true;
      _mustFlush = flush;

      Debug.Assert(_encoding != null);
      int charsWritten = _encoding.GetChars(bytes, byteCount, chars, charCount, this);
      return charsWritten;
  }
  /// <summary>
  /// Array-based conversion for callers whose output buffer might not be large enough.
  /// Validates both array segments, pins the arrays, and forwards to the pointer-based
  /// <c>Convert</c> overload, which fills in the three out parameters.
  /// </summary>
  /// <exception cref="ArgumentNullException">
  /// <paramref name="bytes"/> or <paramref name="chars"/> is null.
  /// </exception>
  /// <exception cref="ArgumentOutOfRangeException">
  /// Any index or count is negative, or either segment extends past the end of its array.
  /// </exception>
  public override unsafe void Convert(byte[] bytes, int byteIndex, int byteCount,
      char[] chars, int charIndex, int charCount, bool flush,
      out int bytesUsed, out int charsUsed, out bool completed)
  {
      // bytes is checked before chars, so it is the one reported when both are null.
      if (bytes == null)
      {
          throw new ArgumentNullException(nameof(bytes), SR.ArgumentNull_Array);
      }

      if (chars == null)
      {
          throw new ArgumentNullException(nameof(chars), SR.ArgumentNull_Array);
      }

      if (byteIndex < 0)
      {
          throw new ArgumentOutOfRangeException(nameof(byteIndex), SR.ArgumentOutOfRange_NeedNonNegNum);
      }

      if (byteCount < 0)
      {
          throw new ArgumentOutOfRangeException(nameof(byteCount), SR.ArgumentOutOfRange_NeedNonNegNum);
      }

      if (charIndex < 0)
      {
          throw new ArgumentOutOfRangeException(nameof(charIndex), SR.ArgumentOutOfRange_NeedNonNegNum);
      }

      if (charCount < 0)
      {
          throw new ArgumentOutOfRangeException(nameof(charCount), SR.ArgumentOutOfRange_NeedNonNegNum);
      }

      if (bytes.Length - byteIndex < byteCount)
      {
          throw new ArgumentOutOfRangeException(nameof(bytes), SR.ArgumentOutOfRange_IndexCountBuffer);
      }

      if (chars.Length - charIndex < charCount)
      {
          throw new ArgumentOutOfRangeException(nameof(chars), SR.ArgumentOutOfRange_IndexCountBuffer);
      }

      // Pin both arrays and let the pointer overload do the work.
      Span<byte> source = bytes;
      Span<char> destination = chars;
      fixed (byte* pBytes = &MemoryMarshal.GetReference(source))
      fixed (char* pChars = &MemoryMarshal.GetReference(destination))
      {
          Convert(pBytes + byteIndex, byteCount, pChars + charIndex, charCount, flush,
              out bytesUsed, out charsUsed, out completed);
      }
  }
  /// <summary>
  /// Pointer-based conversion. Sets this decoder's internal flags (flush as requested,
  /// no throwing on overflow, bytes-used counter zeroed), then calls the encoding's
  /// <c>GetChars</c> worker and reports what it consumed and produced.
  /// </summary>
  /// <param name="bytesUsed">Receives the value of <c>_bytesUsed</c> after the encoding call.</param>
  /// <param name="charsUsed">Receives the value returned by the encoding's <c>GetChars</c>.</param>
  /// <param name="completed">
  /// True when all of <paramref name="byteCount"/> was used, the decoder holds no state
  /// if a flush was requested, and the fallback buffer (if any) has nothing remaining.
  /// </param>
  /// <exception cref="ArgumentNullException">
  /// <paramref name="chars"/> or <paramref name="bytes"/> is null.
  /// </exception>
  /// <exception cref="ArgumentOutOfRangeException">
  /// <paramref name="byteCount"/> or <paramref name="charCount"/> is negative.
  /// </exception>
  public unsafe override void Convert(byte* bytes, int byteCount,
      char* chars, int charCount, bool flush,
      out int bytesUsed, out int charsUsed, out bool completed)
  {
      // chars is checked before bytes, so it is the one reported when both are null.
      if (chars == null)
      {
          throw new ArgumentNullException(nameof(chars), SR.ArgumentNull_Array);
      }

      if (bytes == null)
      {
          throw new ArgumentNullException(nameof(bytes), SR.ArgumentNull_Array);
      }

      if (byteCount < 0)
      {
          throw new ArgumentOutOfRangeException(nameof(byteCount), SR.ArgumentOutOfRange_NeedNonNegNum);
      }

      if (charCount < 0)
      {
          throw new ArgumentOutOfRangeException(nameof(charCount), SR.ArgumentOutOfRange_NeedNonNegNum);
      }

      // Convert must not throw when the output is too small, so overflow throwing is off.
      _bytesUsed = 0;
      _throwOnOverflow = false;
      _mustFlush = flush;

      Debug.Assert(_encoding != null);
      charsUsed = _encoding.GetChars(bytes, byteCount, chars, charCount, this);
      bytesUsed = _bytesUsed;

      // It's completed if every requested byte was used AND (no flush was requested or
      // nothing is left to flush) AND the fallback buffer has been fully drained.
      completed = (byteCount == bytesUsed)
          && !(flush && HasState)
          && (_fallbackBuffer == null || _fallbackBuffer.Remaining == 0);
  }
  /// <summary>
  /// The flush flag recorded by the most recent GetCharCount, GetChars, or Convert call
  /// (false after <see cref="ClearMustFlush"/>).
  /// </summary>
  public bool MustFlush => _mustFlush;
  /// <summary>
  /// Whether anything is left in the decoder. This base implementation always
  /// answers false; the property is virtual so that subclasses can override it.
  /// </summary>
  internal virtual bool HasState => false;
  /// <summary>
  /// Lets the encoding clear the must-flush flag instead of throwing
  /// (used from ThrowCharsOverflow).
  /// </summary>
  internal void ClearMustFlush() => _mustFlush = false;
  /// <summary>
  /// Returns a read-only byte view over the <c>_leftoverBytes</c> field, trimmed to
  /// the first <c>_leftoverByteCount</c> bytes.
  /// </summary>
  internal ReadOnlySpan<byte> GetLeftoverData()
  {
      // View the int-sized storage field as raw bytes, then keep only the bytes in use.
      ReadOnlySpan<byte> storage = MemoryMarshal.AsBytes(new ReadOnlySpan<int>(ref _leftoverBytes, 1));
      return storage.Slice(0, _leftoverByteCount);
  }
  /// <summary>
  /// Copies <paramref name="bytes"/> into the <c>_leftoverBytes</c> field and records
  /// how many bytes were stored.
  /// </summary>
  /// <param name="bytes">The bytes to remember; they are copied into int-sized storage.</param>
  internal void SetLeftoverData(ReadOnlySpan<byte> bytes)
  {
      // View the int-sized storage field as a writable run of bytes.
      Span<byte> storage = MemoryMarshal.AsBytes(new Span<int>(ref _leftoverBytes, 1));

      bytes.CopyTo(storage);
      _leftoverByteCount = bytes.Length;
  }
  /// <summary>True when the recorded leftover byte count is non-zero.</summary>
  internal bool HasLeftoverData
  {
      get { return _leftoverByteCount != 0; }
  }
  /// <summary>
  /// Forgets any leftover bytes by zeroing the leftover byte count. The storage
  /// field itself is not overwritten.
  /// </summary>
  internal void ClearLeftoverData() => _leftoverByteCount = 0;
  /// <summary>
  /// Computes how many chars result from decoding the decoder's leftover bytes
  /// combined with the start of <paramref name="bytes"/>. This method never calls
  /// SetLeftoverData or ClearLeftoverData, so the leftover bytes stay in place.
  /// </summary>
  /// <param name="bytes">The new incoming bytes that follow the leftover data.</param>
  /// <param name="bytesConsumed">
  /// Receives the number of bytes of <paramref name="bytes"/> that were consumed: the
  /// bytes consumed from the combined buffer minus the leftover byte count.
  /// </param>
  /// <returns>The char count for the first decoded value or for its fallback; 0 if none.</returns>
  internal int DrainLeftoverDataForGetCharCount(ReadOnlySpan<byte> bytes, out int bytesConsumed)
  {
      // Quick check: we _should not_ have leftover fallback data from a previous invocation,
      // as we'd end up consuming any such data and would corrupt whatever Convert call happens
      // to be in progress. Unlike EncoderNLS, this is simply a Debug.Assert. No exception is thrown.
      Debug.Assert(_fallbackBuffer is null || _fallbackBuffer.Remaining == 0, "Should have no data remaining in the fallback buffer.");
      Debug.Assert(HasLeftoverData, "Caller shouldn't invoke this routine unless there's leftover data in the decoder.");

      // Build a scratch buffer holding the leftover bytes followed by as many of the
      // incoming bytes as fit, then decode the first value from it.
      Span<byte> scratch = stackalloc byte[4];
      int scratchLength = ConcatInto(GetLeftoverData(), bytes, scratch);
      Span<byte> pending = scratch.Slice(0, scratchLength);

      int charCount = 0;
      bool useFallback;

      Debug.Assert(_encoding != null);
      switch (_encoding.DecodeFirstRune(pending, out Rune rune, out int pendingConsumed))
      {
          case OperationStatus.Done:
              // Successfully transcoded bytes -> chars.
              charCount = rune.Utf16SequenceLength;
              useFallback = false;
              break;

          case OperationStatus.NeedMoreData:
              // When flushing, an incomplete sequence is treated like bad data;
              // otherwise some bytes were consumed and 0 chars are output.
              useFallback = MustFlush;
              break;

          case OperationStatus.InvalidData:
              useFallback = true;
              break;

          default:
              Debug.Fail("Unexpected OperationStatus return value.");
              useFallback = true;
              break;
      }

      // Couldn't decode the buffer, so fall it back instead. The negative index tells the
      // fallback that the invalid sequence began that many bytes before the current
      // buffer, i.e. at the start of the leftover bytes.
      if (useFallback
          && FallbackBuffer.Fallback(pending.Slice(0, pendingConsumed).ToArray(), index: -_leftoverByteCount))
      {
          charCount = _fallbackBuffer!.DrainRemainingDataForGetCharCount();
          Debug.Assert(charCount >= 0, "Fallback buffer shouldn't have returned a negative char count.");
      }

      // Amount of the 'bytes' buffer consumed just now.
      bytesConsumed = pendingConsumed - _leftoverByteCount;
      return charCount;
  }
  /// <summary>
  /// Decodes the decoder's leftover bytes combined with the start of
  /// <paramref name="bytes"/> into <paramref name="chars"/>, then either stores the
  /// still-incomplete combined bytes as the new leftover data or clears the leftover data.
  /// </summary>
  /// <param name="bytes">The new incoming bytes that follow the leftover data.</param>
  /// <param name="chars">The destination for the decoded (or fallback) chars.</param>
  /// <param name="bytesConsumed">
  /// Receives the number of bytes of <paramref name="bytes"/> that were consumed: the
  /// bytes consumed from the combined buffer minus the original leftover byte count.
  /// </param>
  /// <returns>The number of chars written to <paramref name="chars"/>.</returns>
  /// <remarks>
  /// When <paramref name="chars"/> cannot hold the output, control passes to the encoding's
  /// ThrowCharsOverflow with nothingDecoded set to true and never returns normally.
  /// </remarks>
  internal int DrainLeftoverDataForGetChars(ReadOnlySpan<byte> bytes, Span<char> chars, out int bytesConsumed)
  {
      // Quick check: we _should not_ have leftover fallback data from a previous invocation,
      // as we'd end up consuming any such data and would corrupt whatever Convert call happens
      // to be in progress. Unlike EncoderNLS, this is simply a Debug.Assert. No exception is thrown.
      Debug.Assert(_fallbackBuffer is null || _fallbackBuffer.Remaining == 0, "Should have no data remaining in the fallback buffer.");
      Debug.Assert(HasLeftoverData, "Caller shouldn't invoke this routine unless there's leftover data in the decoder.");

      // Build a scratch buffer holding the leftover bytes followed by as many of the
      // incoming bytes as fit, then transcode the first value from it.
      Span<byte> scratch = stackalloc byte[4];
      int scratchLength = ConcatInto(GetLeftoverData(), bytes, scratch);
      Span<byte> pending = scratch.Slice(0, scratchLength);

      int charsWritten = 0;
      bool keepPendingAsLeftover = false;
      bool useFallback = false;
      bool outputTooSmall = false;

      Debug.Assert(_encoding != null);
      switch (_encoding.DecodeFirstRune(pending, out Rune rune, out int pendingConsumed))
      {
          case OperationStatus.Done:
              // Either the value is written out, or the destination cannot hold it.
              outputTooSmall = !rune.TryEncodeToUtf16(chars, out charsWritten);
              break;

          case OperationStatus.NeedMoreData:
              if (MustFlush)
              {
                  // Treat as equivalent to bad data.
                  useFallback = true;
              }
              else
              {
                  // Successfully consumed some bytes, output no chars.
                  keepPendingAsLeftover = true;
              }
              break;

          case OperationStatus.InvalidData:
              useFallback = true;
              break;

          default:
              Debug.Fail("Unexpected OperationStatus return value.");
              useFallback = true;
              break;
      }

      // Couldn't decode the buffer, so fall it back instead. The fallback mechanism relies
      // on a negative index to convey "the start of the invalid sequence was some number of
      // bytes back before the current buffer." Since the invalid sequence must have started
      // at the beginning of the leftover bytes, the caller is told to backtrack that many
      // bytes to find the real start of the invalid sequence.
      if (useFallback
          && FallbackBuffer.Fallback(pending.Slice(0, pendingConsumed).ToArray(), index: -_leftoverByteCount)
          && !_fallbackBuffer!.TryDrainRemainingDataForGetChars(chars, out charsWritten))
      {
          outputTooSmall = true;
      }

      if (outputTooSmall)
      {
          // We need to write chars to the output buffer but cannot. Unlike EncoderNLS, this
          // type does not allow partial writes to the output buffer. Draining leftover data is
          // the first operation performed by any DecoderNLS API, so no code before us could
          // have made forward progress; fail immediately.
          _encoding.ThrowCharsOverflow(this, nothingDecoded: true);
          throw null!; // will never reach this point
      }

      // Report the number of bytes of the new incoming span consumed just now. This must be
      // computed before SetLeftoverData / ClearLeftoverData run, because those methods
      // overwrite the _leftoverByteCount field.
      bytesConsumed = pendingConsumed - _leftoverByteCount;

      if (keepPendingAsLeftover)
      {
          Debug.Assert(pendingConsumed == pending.Length, "We should be asked to persist the entire combined buffer.");
          SetLeftoverData(pending); // the buffer still only contains partial data; a future call to Convert will need it
      }
      else
      {
          ClearLeftoverData(); // the buffer contains no partial data; we'll go down the normal paths
      }

      return charsWritten;
  }
  /// <summary>
  /// Fills <paramref name="dest"/> with the bytes of <paramref name="srcLeft"/> followed by
  /// the bytes of <paramref name="srcRight"/>, stopping as soon as <paramref name="dest"/>
  /// is full, and returns the total number of bytes copied.
  /// </summary>
  /// <param name="srcLeft">The bytes that go first.</param>
  /// <param name="srcRight">The bytes appended after <paramref name="srcLeft"/>.</param>
  /// <param name="dest">The buffer that receives the concatenation.</param>
  /// <returns>The number of bytes written to <paramref name="dest"/>.</returns>
  private static int ConcatInto(ReadOnlySpan<byte> srcLeft, ReadOnlySpan<byte> srcRight, Span<byte> dest)
  {
      int written = 0;

      foreach (byte value in srcLeft)
      {
          if (written >= dest.Length)
          {
              // Destination is full; nothing from srcRight can fit either.
              return written;
          }

          dest[written] = value;
          written++;
      }

      foreach (byte value in srcRight)
      {
          if (written >= dest.Length)
          {
              return written;
          }

          dest[written] = value;
          written++;
      }

      return written;
  }
  365. }
  366. }