EncoderNLS.cs 17 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388
  1. // Licensed to the .NET Foundation under one or more agreements.
  2. // The .NET Foundation licenses this file to you under the MIT license.
  3. // See the LICENSE file in the project root for more information.
  4. using System.Buffers;
  5. using System.Diagnostics;
  6. using System.Runtime.InteropServices;
  7. namespace System.Text
  8. {
  9. // An Encoder is used to encode a sequence of blocks of characters into
  10. // a sequence of blocks of bytes. Following instantiation of an encoder,
  11. // sequential blocks of characters are converted into blocks of bytes through
  12. // calls to the GetBytes method. The encoder maintains state between the
  13. // conversions, allowing it to correctly encode character sequences that span
  14. // adjacent blocks.
  15. //
  16. // Instances of specific implementations of the Encoder abstract base
  17. // class are typically obtained through calls to the GetEncoder method
  18. // of Encoding objects.
  19. //
  20. internal class EncoderNLS : Encoder
  21. {
  // The encoding this encoder was created for; all conversion work is delegated to it.
  private readonly Encoding _encoding;

  // The 'flush' argument of the most recent GetByteCount / GetBytes / Convert call.
  private bool _mustFlush;

  // Set to true by GetByteCount / GetBytes and to false by Convert before the encoding is invoked.
  internal bool _throwOnOverflow;

  // Zeroed by Convert before the encoding is invoked and read back afterwards to report charsUsed.
  internal int _charsUsed;

  // Holds the last leftover character between calls (most encodings use this);
  // (char)0 means nothing is pending.
  internal char _charLeftOver;
  // Binds the encoder to its owning encoding, adopts that encoding's encoder fallback,
  // and starts from a reset state.
  internal EncoderNLS(Encoding encoding)
  {
      _encoding = encoding;
      _fallback = encoding.EncoderFallback;
      Reset();
  }
  // Drops any pending leftover character and, if a fallback buffer has already been
  // created, resets it as well.
  public override void Reset()
  {
      _charLeftOver = '\0';
      _fallbackBuffer?.Reset();
  }
  // Array-based overload: validates the requested segment of 'chars', pins the array and
  // forwards to the pointer-based GetByteCount.
  public override unsafe int GetByteCount(char[] chars, int index, int count, bool flush)
  {
      if (chars == null)
      {
          throw new ArgumentNullException(nameof(chars), SR.ArgumentNull_Array);
      }

      if (index < 0)
      {
          throw new ArgumentOutOfRangeException(nameof(index), SR.ArgumentOutOfRange_NeedNonNegNum);
      }

      if (count < 0)
      {
          throw new ArgumentOutOfRangeException(nameof(count), SR.ArgumentOutOfRange_NeedNonNegNum);
      }

      if (chars.Length - index < count)
      {
          throw new ArgumentOutOfRangeException(nameof(chars), SR.ArgumentOutOfRange_IndexCountBuffer);
      }

      // Pin through a span reference and let the pointer overload do the real work.
      Span<char> source = chars;
      fixed (char* pChars = &MemoryMarshal.GetReference(source))
      {
          return GetByteCount(pChars + index, count, flush);
      }
  }
  // Pointer-based overload: validates the arguments, records the flush request, marks this
  // call as one that throws on overflow, and asks the owning encoding for the byte count.
  public override unsafe int GetByteCount(char* chars, int count, bool flush)
  {
      if (chars == null)
      {
          throw new ArgumentNullException(nameof(chars), SR.ArgumentNull_Array);
      }

      if (count < 0)
      {
          throw new ArgumentOutOfRangeException(nameof(count), SR.ArgumentOutOfRange_NeedNonNegNum);
      }

      // Per-call state that the encoding reads back through this encoder instance.
      _throwOnOverflow = true;
      _mustFlush = flush;

      Debug.Assert(_encoding != null);
      return _encoding.GetByteCount(chars, count, this);
  }
  // Array-based overload: validates both array arguments, treats everything from byteIndex
  // to the end of 'bytes' as the available output space, and forwards to the pointer-based
  // GetBytes.
  public override unsafe int GetBytes(char[] chars, int charIndex, int charCount,
      byte[] bytes, int byteIndex, bool flush)
  {
      if (chars == null)
      {
          throw new ArgumentNullException(nameof(chars), SR.ArgumentNull_Array);
      }

      if (bytes == null)
      {
          throw new ArgumentNullException(nameof(bytes), SR.ArgumentNull_Array);
      }

      if (charIndex < 0)
      {
          throw new ArgumentOutOfRangeException(nameof(charIndex), SR.ArgumentOutOfRange_NeedNonNegNum);
      }

      if (charCount < 0)
      {
          throw new ArgumentOutOfRangeException(nameof(charCount), SR.ArgumentOutOfRange_NeedNonNegNum);
      }

      if (chars.Length - charIndex < charCount)
      {
          throw new ArgumentOutOfRangeException(nameof(chars), SR.ArgumentOutOfRange_IndexCountBuffer);
      }

      if (byteIndex < 0 || byteIndex > bytes.Length)
      {
          throw new ArgumentOutOfRangeException(nameof(byteIndex), SR.ArgumentOutOfRange_Index);
      }

      // Output capacity is whatever remains of the byte array after byteIndex.
      int byteCount = bytes.Length - byteIndex;

      Span<char> source = chars;
      Span<byte> destination = bytes;
      fixed (char* pChars = &MemoryMarshal.GetReference(source))
      fixed (byte* pBytes = &MemoryMarshal.GetReference(destination))
      {
          // charCount is the number of chars to encode, not the size of the array.
          return GetBytes(pChars + charIndex, charCount, pBytes + byteIndex, byteCount, flush);
      }
  }
  // Pointer-based overload: validates the arguments, records the flush request, marks this
  // call as one that throws on overflow, and lets the owning encoding perform the encoding.
  public override unsafe int GetBytes(char* chars, int charCount, byte* bytes, int byteCount, bool flush)
  {
      if (chars == null)
      {
          throw new ArgumentNullException(nameof(chars), SR.ArgumentNull_Array);
      }

      if (bytes == null)
      {
          throw new ArgumentNullException(nameof(bytes), SR.ArgumentNull_Array);
      }

      // byteCount is checked ahead of charCount so the reported parameter name is unchanged
      // when both are negative.
      if (byteCount < 0)
      {
          throw new ArgumentOutOfRangeException(nameof(byteCount), SR.ArgumentOutOfRange_NeedNonNegNum);
      }

      if (charCount < 0)
      {
          throw new ArgumentOutOfRangeException(nameof(charCount), SR.ArgumentOutOfRange_NeedNonNegNum);
      }

      // Per-call state that the encoding reads back through this encoder instance.
      _throwOnOverflow = true;
      _mustFlush = flush;

      Debug.Assert(_encoding != null);
      return _encoding.GetBytes(chars, charCount, bytes, byteCount, this);
  }
  // Array-based Convert, intended for callers whose output buffer might be too small for the
  // whole result. Validates both array segments, pins the arrays and forwards to the
  // pointer-based Convert, which produces the bytes and the three out values.
  public override unsafe void Convert(char[] chars, int charIndex, int charCount,
      byte[] bytes, int byteIndex, int byteCount, bool flush,
      out int charsUsed, out int bytesUsed, out bool completed)
  {
      if (chars == null)
      {
          throw new ArgumentNullException(nameof(chars), SR.ArgumentNull_Array);
      }

      if (bytes == null)
      {
          throw new ArgumentNullException(nameof(bytes), SR.ArgumentNull_Array);
      }

      if (charIndex < 0)
      {
          throw new ArgumentOutOfRangeException(nameof(charIndex), SR.ArgumentOutOfRange_NeedNonNegNum);
      }

      if (charCount < 0)
      {
          throw new ArgumentOutOfRangeException(nameof(charCount), SR.ArgumentOutOfRange_NeedNonNegNum);
      }

      if (byteIndex < 0)
      {
          throw new ArgumentOutOfRangeException(nameof(byteIndex), SR.ArgumentOutOfRange_NeedNonNegNum);
      }

      if (byteCount < 0)
      {
          throw new ArgumentOutOfRangeException(nameof(byteCount), SR.ArgumentOutOfRange_NeedNonNegNum);
      }

      if (chars.Length - charIndex < charCount)
      {
          throw new ArgumentOutOfRangeException(nameof(chars), SR.ArgumentOutOfRange_IndexCountBuffer);
      }

      if (bytes.Length - byteIndex < byteCount)
      {
          throw new ArgumentOutOfRangeException(nameof(bytes), SR.ArgumentOutOfRange_IndexCountBuffer);
      }

      // Hand off to the pointer version (can't do this for non-msft encoders).
      Span<char> source = chars;
      Span<byte> destination = bytes;
      fixed (char* pChars = &MemoryMarshal.GetReference(source))
      fixed (byte* pBytes = &MemoryMarshal.GetReference(destination))
      {
          Convert(pChars + charIndex, charCount, pBytes + byteIndex, byteCount, flush,
              out charsUsed, out bytesUsed, out completed);
      }
  }
  // Pointer-based Convert. Sets up this encoder's per-call state (flush request, no throwing
  // on overflow, chars-used counter cleared) and then calls the owning encoding's GetBytes
  // worker, reporting how much input was consumed and how much output was produced.
  public override unsafe void Convert(char* chars, int charCount,
      byte* bytes, int byteCount, bool flush,
      out int charsUsed, out int bytesUsed, out bool completed)
  {
      // bytes is checked ahead of chars so the reported parameter name is unchanged when
      // both are null.
      if (bytes == null)
      {
          throw new ArgumentNullException(nameof(bytes), SR.ArgumentNull_Array);
      }

      if (chars == null)
      {
          throw new ArgumentNullException(nameof(chars), SR.ArgumentNull_Array);
      }

      if (charCount < 0)
      {
          throw new ArgumentOutOfRangeException(nameof(charCount), SR.ArgumentOutOfRange_NeedNonNegNum);
      }

      if (byteCount < 0)
      {
          throw new ArgumentOutOfRangeException(nameof(byteCount), SR.ArgumentOutOfRange_NeedNonNegNum);
      }

      // Convert must not throw when the output is too small, hence _throwOnOverflow = false.
      _charsUsed = 0;
      _throwOnOverflow = false;
      _mustFlush = flush;

      Debug.Assert(_encoding != null);
      bytesUsed = _encoding.GetBytes(chars, charCount, bytes, byteCount, this);
      charsUsed = _charsUsed;

      // Per MSDN, "The completed output parameter indicates whether all the data in the input
      // buffer was converted and stored in the output buffer." So the call is complete only if
      // all input was consumed, no encoder state is pending, and the fallback buffer is empty.
      // The checks run in that order and stop at the first one that fails.
      if (charsUsed != charCount)
      {
          completed = false;
      }
      else if (HasState)
      {
          completed = false;
      }
      else
      {
          completed = _fallbackBuffer is null || _fallbackBuffer.Remaining == 0;
      }
  }
  // The encoding this encoder was constructed with.
  public Encoding Encoding
  {
      get
      {
          Encoding owner = _encoding;
          Debug.Assert(owner != null);
          return owner;
      }
  }
  // The flush request recorded by the most recent GetByteCount / GetBytes / Convert call
  // (false after ClearMustFlush has been called).
  public bool MustFlush
  {
      get { return _mustFlush; }
  }
  /// <summary>
  /// Indicates whether this <see cref="EncoderNLS"/> instance holds a leftover char or has data remaining
  /// in its fallback buffer, i.e. whether a call to
  /// <see cref="Encoding.GetBytes(char*, int, byte*, int, EncoderNLS)"/> must first drain that data.
  /// </summary>
  internal bool HasLeftoverData
  {
      get
      {
          if (_charLeftOver != default)
          {
              return true;
          }

          return _fallbackBuffer != null && _fallbackBuffer.Remaining > 0;
      }
  }
  // True when a leftover character is still pending in this encoder. Virtual, so derived
  // encoders can report additional state.
  internal virtual bool HasState
  {
      get { return _charLeftOver != '\0'; }
  }
  // Lets the encoding turn off the recorded flush request instead of throwing
  // (used from ThrowBytesOverflow).
  internal void ClearMustFlush() => _mustFlush = false;
  /// <summary>
  /// Computes how many bytes the leftover high surrogate from a previous operation contributes
  /// when combined with the start of <paramref name="chars"/>. The leftover char field itself is
  /// not cleared, because GetByteCount is supposed to be non-mutating and the next call to
  /// Convert still needs the value.
  /// </summary>
  /// <param name="chars">The input that follows the leftover char; only its first element is inspected.</param>
  /// <param name="charsConsumed">
  /// Receives 1 when the first element of <paramref name="chars"/> formed a surrogate pair with the
  /// leftover char; otherwise 0.
  /// </param>
  /// <returns>
  /// 0 when there is no leftover char, or when the input is empty and no flush was requested;
  /// the encoding's byte count for the combined pair when it can be encoded directly; otherwise the
  /// byte count reported by the fallback buffer after the invalid data has been handed to it.
  /// </returns>
  /// <exception cref="ArgumentException">
  /// The fallback buffer still holds data from a previous invocation.
  /// </exception>
  internal int DrainLeftoverDataForGetByteCount(ReadOnlySpan<char> chars, out int charsConsumed)
  {
      // Quick check: we _should not_ have leftover fallback data from a previous invocation,
      // as we'd end up consuming any such data and would corrupt whatever Convert call happens
      // to be in progress.
      if (_fallbackBuffer != null && _fallbackBuffer.Remaining > 0)
      {
          throw new ArgumentException(SR.Format(SR.Argument_EncoderFallbackNotEmpty, Encoding.EncodingName, _fallbackBuffer.GetType()));
      }

      charsConsumed = 0; // provisional; updated below if the first input char gets consumed

      if (_charLeftOver == default)
      {
          return 0; // no leftover high surrogate char - nothing to drain
      }

      char secondChar = default;

      if (chars.IsEmpty)
      {
          // With empty input and no flush request there is nothing to do yet. When flushing,
          // the leftover high surrogate goes through the fallback mechanism by itself.
          if (!MustFlush)
          {
              return 0; // no-op = success
          }
      }
      else
      {
          secondChar = chars[0];
      }

      // If the chars can't be encoded directly, the fallback buffer is populated with the
      // invalid data and tallied at the end of the method. The bool result of Fallback is
      // deliberately discarded: the tally below reads the buffer's remaining data either way.
      if (Rune.TryCreate(_charLeftOver, secondChar, out Rune rune))
      {
          charsConsumed = 1; // consumed the leftover high surrogate + the first char in the input buffer

          Debug.Assert(_encoding != null);
          if (_encoding.TryGetByteCount(rune, out int byteCount))
          {
              Debug.Assert(byteCount >= 0, "Encoding shouldn't have returned a negative byte count.");
              return byteCount;
          }

          // The fallback mechanism relies on a negative index to convey "the start of the invalid
          // sequence was some number of chars back before the current buffer." Here (and in the
          // else branch below) there is exactly one leftover high surrogate from a previous
          // operation, so -1 says the invalid sequence started one char before the current buffer.
          _ = FallbackBuffer.Fallback(_charLeftOver, secondChar, index: -1);
      }
      else
      {
          _ = FallbackBuffer.Fallback(_charLeftOver, index: -1);
      }

      // Now tally the number of bytes that would've been emitted as part of fallback.
      Debug.Assert(_fallbackBuffer != null);
      return _fallbackBuffer.DrainRemainingDataForGetByteCount();
  }
  // Writes out whatever is pending from a previous invocation before new input is encoded:
  // either a leftover high surrogate or leftover fallback-buffer data (there may be neither,
  // but never both). Returns true on success; the final result comes from the fallback
  // buffer's drain call when that buffer has data. charsConsumed and bytesWritten report
  // what was taken from 'chars' and written to 'bytes'.
  internal bool TryDrainLeftoverDataForGetBytes(ReadOnlySpan<char> chars, Span<byte> bytes, out int charsConsumed, out int bytesWritten)
  {
      // Provisional values; both are fixed up below where needed.
      charsConsumed = 0;
      bytesWritten = 0;

      if (_charLeftOver != default)
      {
          bool inputIsEmpty = chars.IsEmpty;

          // With empty input and no flush request there is nothing to do yet. When flushing,
          // the leftover high surrogate goes through the fallback mechanism by itself.
          if (inputIsEmpty && !MustFlush)
          {
              return true; // no-op = success (both out values are still 0)
          }

          char nextChar = inputIsEmpty ? '\0' : chars[0];

          // Whenever the data can't be encoded directly, it is handed to the fallback buffer
          // and picked up by the "drain fallback buffer" step at the end of the method.
          // The index of -1 tells the fallback that the invalid sequence started one char
          // before the current buffer (the leftover high surrogate).
          if (!Rune.TryCreate(_charLeftOver, nextChar, out Rune scalar))
          {
              FallbackBuffer.Fallback(_charLeftOver, index: -1);
          }
          else
          {
              charsConsumed = 1; // at the very least, we consumed 1 char from the input

              Debug.Assert(_encoding != null);
              OperationStatus status = _encoding.EncodeRune(scalar, bytes, out bytesWritten);

              if (status == OperationStatus.Done)
              {
                  _charLeftOver = default; // we just consumed this char
                  return true; // the leftover data has been fully handled
              }
              else if (status == OperationStatus.DestinationTooSmall)
              {
                  _charLeftOver = default; // we just consumed this char
                  _encoding.ThrowBytesOverflow(this, nothingEncoded: true); // expected to throw
              }
              else if (status == OperationStatus.InvalidData)
              {
                  FallbackBuffer.Fallback(_charLeftOver, nextChar, index: -1);
              }
              else
              {
                  Debug.Fail("Unknown return value.");
              }
          }
      }

      // Finally, flush anything sitting in the fallback buffer.
      if (_fallbackBuffer == null || _fallbackBuffer.Remaining <= 0)
      {
          return true; // nothing left to drain - success
      }

      return _fallbackBuffer.TryDrainRemainingDataForGetBytes(bytes, out bytesWritten);
  }
  322. }
  323. }