| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446 |
- // Licensed to the .NET Foundation under one or more agreements.
- // The .NET Foundation licenses this file to you under the MIT license.
- // See the LICENSE file in the project root for more information.
- using System.Buffers;
- using System.Diagnostics;
- using System.Runtime.InteropServices;
- namespace System.Text
- {
- // A Decoder is used to decode a sequence of blocks of bytes into a
- // sequence of blocks of characters. Following instantiation of a decoder,
- // sequential blocks of bytes are converted into blocks of characters through
- // calls to the GetChars method. The decoder maintains state between the
- // conversions, allowing it to correctly decode byte sequences that span
- // adjacent blocks.
- //
- // Instances of specific implementations of the Decoder abstract base
- // class are typically obtained through calls to the GetDecoder method
- // of Encoding objects.
- internal class DecoderNLS : Decoder
- {
- // Remember our encoding
- private Encoding _encoding;
- private bool _mustFlush;
- internal bool _throwOnOverflow;
- internal int _bytesUsed;
- private int _leftoverBytes; // leftover data from a previous invocation of GetChars (up to 4 bytes)
- private int _leftoverByteCount; // number of bytes of actual data in _leftoverBytes
- internal DecoderNLS(Encoding encoding)
- {
- _encoding = encoding;
- _fallback = this._encoding.DecoderFallback;
- this.Reset();
- }
- public override void Reset()
- {
- ClearLeftoverData();
- _fallbackBuffer?.Reset();
- }
- public override int GetCharCount(byte[] bytes, int index, int count)
- {
- return GetCharCount(bytes, index, count, false);
- }
- public override unsafe int GetCharCount(byte[] bytes, int index, int count, bool flush)
- {
- // Validate Parameters
- if (bytes == null)
- throw new ArgumentNullException(nameof(bytes),
- SR.ArgumentNull_Array);
- if (index < 0 || count < 0)
- throw new ArgumentOutOfRangeException((index < 0 ? nameof(index) : nameof(count)),
- SR.ArgumentOutOfRange_NeedNonNegNum);
- if (bytes.Length - index < count)
- throw new ArgumentOutOfRangeException(nameof(bytes),
- SR.ArgumentOutOfRange_IndexCountBuffer);
- // Just call pointer version
- fixed (byte* pBytes = &MemoryMarshal.GetReference((Span<byte>)bytes))
- return GetCharCount(pBytes + index, count, flush);
- }
- public unsafe override int GetCharCount(byte* bytes, int count, bool flush)
- {
- // Validate parameters
- if (bytes == null)
- throw new ArgumentNullException(nameof(bytes),
- SR.ArgumentNull_Array);
- if (count < 0)
- throw new ArgumentOutOfRangeException(nameof(count),
- SR.ArgumentOutOfRange_NeedNonNegNum);
- // Remember the flush
- _mustFlush = flush;
- _throwOnOverflow = true;
- // By default just call the encoding version, no flush by default
- Debug.Assert(_encoding != null);
- return _encoding.GetCharCount(bytes, count, this);
- }
- public override int GetChars(byte[] bytes, int byteIndex, int byteCount,
- char[] chars, int charIndex)
- {
- return GetChars(bytes, byteIndex, byteCount, chars, charIndex, false);
- }
- public override unsafe int GetChars(byte[] bytes, int byteIndex, int byteCount,
- char[] chars, int charIndex, bool flush)
- {
- // Validate Parameters
- if (bytes == null || chars == null)
- throw new ArgumentNullException(bytes == null ? nameof(bytes) : nameof(chars),
- SR.ArgumentNull_Array);
- if (byteIndex < 0 || byteCount < 0)
- throw new ArgumentOutOfRangeException((byteIndex < 0 ? nameof(byteIndex) : nameof(byteCount)),
- SR.ArgumentOutOfRange_NeedNonNegNum);
- if (bytes.Length - byteIndex < byteCount)
- throw new ArgumentOutOfRangeException(nameof(bytes),
- SR.ArgumentOutOfRange_IndexCountBuffer);
- if (charIndex < 0 || charIndex > chars.Length)
- throw new ArgumentOutOfRangeException(nameof(charIndex),
- SR.ArgumentOutOfRange_Index);
- int charCount = chars.Length - charIndex;
- // Just call pointer version
- fixed (byte* pBytes = &MemoryMarshal.GetReference((Span<byte>)bytes))
- fixed (char* pChars = &MemoryMarshal.GetReference((Span<char>)chars))
- // Remember that charCount is # to decode, not size of array
- return GetChars(pBytes + byteIndex, byteCount,
- pChars + charIndex, charCount, flush);
- }
- public unsafe override int GetChars(byte* bytes, int byteCount,
- char* chars, int charCount, bool flush)
- {
- // Validate parameters
- if (chars == null || bytes == null)
- throw new ArgumentNullException((chars == null ? nameof(chars) : nameof(bytes)),
- SR.ArgumentNull_Array);
- if (byteCount < 0 || charCount < 0)
- throw new ArgumentOutOfRangeException((byteCount < 0 ? nameof(byteCount) : nameof(charCount)),
- SR.ArgumentOutOfRange_NeedNonNegNum);
- // Remember our flush
- _mustFlush = flush;
- _throwOnOverflow = true;
- // By default just call the encodings version
- Debug.Assert(_encoding != null);
- return _encoding.GetChars(bytes, byteCount, chars, charCount, this);
- }
- // This method is used when the output buffer might not be big enough.
- // Just call the pointer version. (This gets chars)
- public override unsafe void Convert(byte[] bytes, int byteIndex, int byteCount,
- char[] chars, int charIndex, int charCount, bool flush,
- out int bytesUsed, out int charsUsed, out bool completed)
- {
- // Validate parameters
- if (bytes == null || chars == null)
- throw new ArgumentNullException((bytes == null ? nameof(bytes) : nameof(chars)),
- SR.ArgumentNull_Array);
- if (byteIndex < 0 || byteCount < 0)
- throw new ArgumentOutOfRangeException((byteIndex < 0 ? nameof(byteIndex) : nameof(byteCount)),
- SR.ArgumentOutOfRange_NeedNonNegNum);
- if (charIndex < 0 || charCount < 0)
- throw new ArgumentOutOfRangeException((charIndex < 0 ? nameof(charIndex) : nameof(charCount)),
- SR.ArgumentOutOfRange_NeedNonNegNum);
- if (bytes.Length - byteIndex < byteCount)
- throw new ArgumentOutOfRangeException(nameof(bytes),
- SR.ArgumentOutOfRange_IndexCountBuffer);
- if (chars.Length - charIndex < charCount)
- throw new ArgumentOutOfRangeException(nameof(chars),
- SR.ArgumentOutOfRange_IndexCountBuffer);
- // Just call the pointer version (public overrides can't do this)
- fixed (byte* pBytes = &MemoryMarshal.GetReference((Span<byte>)bytes))
- {
- fixed (char* pChars = &MemoryMarshal.GetReference((Span<char>)chars))
- {
- Convert(pBytes + byteIndex, byteCount, pChars + charIndex, charCount, flush,
- out bytesUsed, out charsUsed, out completed);
- }
- }
- }
- // This is the version that used pointers. We call the base encoding worker function
- // after setting our appropriate internal variables. This is getting chars
- public unsafe override void Convert(byte* bytes, int byteCount,
- char* chars, int charCount, bool flush,
- out int bytesUsed, out int charsUsed, out bool completed)
- {
- // Validate input parameters
- if (chars == null || bytes == null)
- throw new ArgumentNullException(chars == null ? nameof(chars) : nameof(bytes),
- SR.ArgumentNull_Array);
- if (byteCount < 0 || charCount < 0)
- throw new ArgumentOutOfRangeException((byteCount < 0 ? nameof(byteCount) : nameof(charCount)),
- SR.ArgumentOutOfRange_NeedNonNegNum);
- // We don't want to throw
- _mustFlush = flush;
- _throwOnOverflow = false;
- _bytesUsed = 0;
- // Do conversion
- Debug.Assert(_encoding != null);
- charsUsed = _encoding.GetChars(bytes, byteCount, chars, charCount, this);
- bytesUsed = _bytesUsed;
- // Its completed if they've used what they wanted AND if they didn't want flush or if we are flushed
- completed = (bytesUsed == byteCount) && (!flush || !this.HasState) &&
- (_fallbackBuffer == null || _fallbackBuffer.Remaining == 0);
- // Our data thingy are now full, we can return
- }
- public bool MustFlush
- {
- get
- {
- return _mustFlush;
- }
- }
- // Anything left in our decoder?
- internal virtual bool HasState
- {
- get
- {
- return false;
- }
- }
- // Allow encoding to clear our must flush instead of throwing (in ThrowCharsOverflow)
- internal void ClearMustFlush()
- {
- _mustFlush = false;
- }
- internal ReadOnlySpan<byte> GetLeftoverData()
- {
- return MemoryMarshal.AsBytes(new ReadOnlySpan<int>(ref _leftoverBytes, 1)).Slice(0, _leftoverByteCount);
- }
- internal void SetLeftoverData(ReadOnlySpan<byte> bytes)
- {
- bytes.CopyTo(MemoryMarshal.AsBytes(new Span<int>(ref _leftoverBytes, 1)));
- _leftoverByteCount = bytes.Length;
- }
- internal bool HasLeftoverData => _leftoverByteCount != 0;
- internal void ClearLeftoverData()
- {
- _leftoverByteCount = 0;
- }
- internal int DrainLeftoverDataForGetCharCount(ReadOnlySpan<byte> bytes, out int bytesConsumed)
- {
- // Quick check: we _should not_ have leftover fallback data from a previous invocation,
- // as we'd end up consuming any such data and would corrupt whatever Convert call happens
- // to be in progress. Unlike EncoderNLS, this is simply a Debug.Assert. No exception is thrown.
- Debug.Assert(_fallbackBuffer is null || _fallbackBuffer.Remaining == 0, "Should have no data remaining in the fallback buffer.");
- Debug.Assert(HasLeftoverData, "Caller shouldn't invoke this routine unless there's leftover data in the decoder.");
- // Copy the existing leftover data plus as many bytes as possible of the new incoming data
- // into a temporary concated buffer, then get its char count by decoding it.
- Span<byte> combinedBuffer = stackalloc byte[4];
- combinedBuffer = combinedBuffer.Slice(0, ConcatInto(GetLeftoverData(), bytes, combinedBuffer));
- int charCount = 0;
- Debug.Assert(_encoding != null);
- switch (_encoding.DecodeFirstRune(combinedBuffer, out Rune value, out int combinedBufferBytesConsumed))
- {
- case OperationStatus.Done:
- charCount = value.Utf16SequenceLength;
- goto Finish; // successfully transcoded bytes -> chars
- case OperationStatus.NeedMoreData:
- if (MustFlush)
- {
- goto case OperationStatus.InvalidData; // treat as equivalent to bad data
- }
- else
- {
- goto Finish; // consumed some bytes, output 0 chars
- }
- case OperationStatus.InvalidData:
- break;
- default:
- Debug.Fail("Unexpected OperationStatus return value.");
- break;
- }
- // Couldn't decode the buffer. Fallback the buffer instead. See comment in DrainLeftoverDataForGetChars
- // for more information on why a negative index is provided.
- if (FallbackBuffer.Fallback(combinedBuffer.Slice(0, combinedBufferBytesConsumed).ToArray(), index: -_leftoverByteCount))
- {
- charCount = _fallbackBuffer!.DrainRemainingDataForGetCharCount();
- Debug.Assert(charCount >= 0, "Fallback buffer shouldn't have returned a negative char count.");
- }
- Finish:
- bytesConsumed = combinedBufferBytesConsumed - _leftoverByteCount; // amount of 'bytes' buffer consumed just now
- return charCount;
- }
- internal int DrainLeftoverDataForGetChars(ReadOnlySpan<byte> bytes, Span<char> chars, out int bytesConsumed)
- {
- // Quick check: we _should not_ have leftover fallback data from a previous invocation,
- // as we'd end up consuming any such data and would corrupt whatever Convert call happens
- // to be in progress. Unlike EncoderNLS, this is simply a Debug.Assert. No exception is thrown.
- Debug.Assert(_fallbackBuffer is null || _fallbackBuffer.Remaining == 0, "Should have no data remaining in the fallback buffer.");
- Debug.Assert(HasLeftoverData, "Caller shouldn't invoke this routine unless there's leftover data in the decoder.");
- // Copy the existing leftover data plus as many bytes as possible of the new incoming data
- // into a temporary concated buffer, then transcode it from bytes to chars.
- Span<byte> combinedBuffer = stackalloc byte[4];
- combinedBuffer = combinedBuffer.Slice(0, ConcatInto(GetLeftoverData(), bytes, combinedBuffer));
- int charsWritten = 0;
- bool persistNewCombinedBuffer = false;
- Debug.Assert(_encoding != null);
- switch (_encoding.DecodeFirstRune(combinedBuffer, out Rune value, out int combinedBufferBytesConsumed))
- {
- case OperationStatus.Done:
- if (value.TryEncodeToUtf16(chars, out charsWritten))
- {
- goto Finish; // successfully transcoded bytes -> chars
- }
- else
- {
- goto DestinationTooSmall;
- }
- case OperationStatus.NeedMoreData:
- if (MustFlush)
- {
- goto case OperationStatus.InvalidData; // treat as equivalent to bad data
- }
- else
- {
- persistNewCombinedBuffer = true;
- goto Finish; // successfully consumed some bytes, output no chars
- }
- case OperationStatus.InvalidData:
- break;
- default:
- Debug.Fail("Unexpected OperationStatus return value.");
- break;
- }
- // Couldn't decode the buffer. Fallback the buffer instead. The fallback mechanism relies
- // on a negative index to convey "the start of the invalid sequence was some number of
- // bytes back before the current buffer." Since we know the invalid sequence must have
- // started at the beginning of our leftover byte buffer, we can signal to our caller that
- // they must backtrack that many bytes to find the real start of the invalid sequence.
- if (FallbackBuffer.Fallback(combinedBuffer.Slice(0, combinedBufferBytesConsumed).ToArray(), index: -_leftoverByteCount)
- && !_fallbackBuffer!.TryDrainRemainingDataForGetChars(chars, out charsWritten))
- {
- goto DestinationTooSmall;
- }
- Finish:
- // Report back the number of bytes (from the new incoming span) we consumed just now.
- // This calculation is simple: it's the difference between the original leftover byte
- // count and the number of bytes from the combined buffer we needed to decode the first
- // scalar value. We need to report this before the call to SetLeftoverData /
- // ClearLeftoverData because those methods will overwrite the _leftoverByteCount field.
- bytesConsumed = combinedBufferBytesConsumed - _leftoverByteCount;
- if (persistNewCombinedBuffer)
- {
- Debug.Assert(combinedBufferBytesConsumed == combinedBuffer.Length, "We should be asked to persist the entire combined buffer.");
- SetLeftoverData(combinedBuffer); // the buffer still only contains partial data; a future call to Convert will need it
- }
- else
- {
- ClearLeftoverData(); // the buffer contains no partial data; we'll go down the normal paths
- }
- return charsWritten;
- DestinationTooSmall:
- // If we got to this point, we're trying to write chars to the output buffer, but we're unable to do
- // so. Unlike EncoderNLS, this type does not allow partial writes to the output buffer. Since we know
- // draining leftover data is the first operation performed by any DecoderNLS API, there was no
- // opportunity for any code before us to make forward progress, so we must fail immediately.
- _encoding.ThrowCharsOverflow(this, nothingDecoded: true);
- throw null!; // will never reach this point
- }
- /// <summary>
- /// Given a byte buffer <paramref name="dest"/>, concatenates as much of <paramref name="srcLeft"/> followed
- /// by <paramref name="srcRight"/> into it as will fit, then returns the total number of bytes copied.
- /// </summary>
- private static int ConcatInto(ReadOnlySpan<byte> srcLeft, ReadOnlySpan<byte> srcRight, Span<byte> dest)
- {
- int total = 0;
- for (int i = 0; i < srcLeft.Length; i++)
- {
- if ((uint)total >= (uint)dest.Length)
- {
- goto Finish;
- }
- else
- {
- dest[total++] = srcLeft[i];
- }
- }
- for (int i = 0; i < srcRight.Length; i++)
- {
- if ((uint)total >= (uint)dest.Length)
- {
- goto Finish;
- }
- else
- {
- dest[total++] = srcRight[i];
- }
- }
- Finish:
- return total;
- }
- }
- }
|