EncoderNLS.cs 17 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402
  1. // Licensed to the .NET Foundation under one or more agreements.
  2. // The .NET Foundation licenses this file to you under the MIT license.
  3. // See the LICENSE file in the project root for more information.
  4. using System.Buffers;
  5. using System.Diagnostics;
  6. using System.Runtime.InteropServices;
  7. namespace System.Text
  8. {
  9. // An Encoder is used to encode a sequence of blocks of characters into
  10. // a sequence of blocks of bytes. Following instantiation of an encoder,
  11. // sequential blocks of characters are converted into blocks of bytes through
  12. // calls to the GetBytes method. The encoder maintains state between the
  13. // conversions, allowing it to correctly encode character sequences that span
  14. // adjacent blocks.
  15. //
  16. // Instances of specific implementations of the Encoder abstract base
  17. // class are typically obtained through calls to the GetEncoder method
  18. // of Encoding objects.
  19. //
  20. internal class EncoderNLS : Encoder
  21. {
  // Need a place for the last left over character, most of our encodings use this.
  // A value of (char)0 means "nothing left over" (see Reset and HasState).
  internal char _charLeftOver;

  // The encoding that performs the actual conversions. Assigned once in the
  // constructor and never reassigned, hence readonly.
  private readonly Encoding _encoding;

  // The flush value passed to the most recent GetByteCount / GetBytes / Convert call,
  // unless it has since been cleared through ClearMustFlush.
  private bool _mustFlush;

  // Set to true by GetByteCount / GetBytes and to false by Convert before the
  // request is handed to the encoding.
  internal bool _throwOnOverflow;

  // Zeroed by Convert before encoding and then reported back as its charsUsed output.
  internal int _charsUsed;

  /// <summary>
  /// Creates an encoder bound to <paramref name="encoding"/>, adopts that encoding's
  /// <see cref="Encoding.EncoderFallback"/>, and puts the instance into its reset state.
  /// </summary>
  /// <param name="encoding">The encoding that conversion calls are forwarded to.</param>
  internal EncoderNLS(Encoding encoding)
  {
      _encoding = encoding;
      _fallback = _encoding.EncoderFallback;
      this.Reset();
  }
  /// <summary>
  /// Discards any pending leftover character and resets the fallback buffer, if one exists.
  /// </summary>
  public override void Reset()
  {
      _charLeftOver = '\0';
      _fallbackBuffer?.Reset();
  }
  /// <summary>
  /// Array-based entry point: validates the requested slice of <paramref name="chars"/>,
  /// pins the array, and returns the result of the pointer-based overload for that slice.
  /// </summary>
  /// <param name="chars">The array containing the characters to examine.</param>
  /// <param name="index">Index of the first character of the slice.</param>
  /// <param name="count">Number of characters in the slice.</param>
  /// <param name="flush">Forwarded unchanged to the pointer-based overload.</param>
  /// <exception cref="ArgumentNullException"><paramref name="chars"/> is null.</exception>
  /// <exception cref="ArgumentOutOfRangeException">
  /// <paramref name="index"/> or <paramref name="count"/> is negative, or the slice
  /// extends past the end of <paramref name="chars"/>.
  /// </exception>
  public override unsafe int GetByteCount(char[] chars, int index, int count, bool flush)
  {
      if (chars is null)
      {
          throw new ArgumentNullException(nameof(chars), SR.ArgumentNull_Array);
      }

      // When both values are negative, index is the one that gets reported.
      if (index < 0)
      {
          throw new ArgumentOutOfRangeException(nameof(index), SR.ArgumentOutOfRange_NeedNonNegNum);
      }

      if (count < 0)
      {
          throw new ArgumentOutOfRangeException(nameof(count), SR.ArgumentOutOfRange_NeedNonNegNum);
      }

      if (count > chars.Length - index)
      {
          throw new ArgumentOutOfRangeException(nameof(chars), SR.ArgumentOutOfRange_IndexCountBuffer);
      }

      // Pin the array and hand the slice to the pointer version.
      Span<char> source = chars;
      fixed (char* pSource = &MemoryMarshal.GetReference(source))
      {
          return GetByteCount(pSource + index, count, flush);
      }
  }
  /// <summary>
  /// Pointer-based entry point: records the flush request on this instance, marks the
  /// operation as one that throws on overflow, and asks the owning encoding for the count.
  /// </summary>
  /// <param name="chars">Pointer to the first character to examine.</param>
  /// <param name="count">Number of characters to examine.</param>
  /// <param name="flush">Stored in <see cref="MustFlush"/> before the encoding is called.</param>
  /// <exception cref="ArgumentNullException"><paramref name="chars"/> is null.</exception>
  /// <exception cref="ArgumentOutOfRangeException"><paramref name="count"/> is negative.</exception>
  public unsafe override int GetByteCount(char* chars, int count, bool flush)
  {
      if (chars == null)
      {
          throw new ArgumentNullException(nameof(chars), SR.ArgumentNull_Array);
      }

      if (count < 0)
      {
          throw new ArgumentOutOfRangeException(nameof(count), SR.ArgumentOutOfRange_NeedNonNegNum);
      }

      // State that the encoding reads back through the encoder instance we pass it.
      _throwOnOverflow = true;
      _mustFlush = flush;

      Debug.Assert(_encoding != null);
      return _encoding.GetByteCount(chars, count, this);
  }
  /// <summary>
  /// Array-based entry point: validates the arguments, pins both arrays, and forwards to the
  /// pointer-based overload. Everything from <paramref name="byteIndex"/> to the end of
  /// <paramref name="bytes"/> is offered as output space.
  /// </summary>
  /// <param name="chars">The array containing the characters to encode.</param>
  /// <param name="charIndex">Index of the first character to encode.</param>
  /// <param name="charCount">Number of characters to encode (not the size of the array).</param>
  /// <param name="bytes">The array that receives the encoded bytes.</param>
  /// <param name="byteIndex">Index in <paramref name="bytes"/> at which writing starts.</param>
  /// <param name="flush">Forwarded unchanged to the pointer-based overload.</param>
  /// <exception cref="ArgumentNullException">
  /// <paramref name="chars"/> or <paramref name="bytes"/> is null.
  /// </exception>
  /// <exception cref="ArgumentOutOfRangeException">
  /// <paramref name="charIndex"/> or <paramref name="charCount"/> is negative, the character
  /// slice extends past the end of <paramref name="chars"/>, or <paramref name="byteIndex"/>
  /// lies outside <paramref name="bytes"/>.
  /// </exception>
  public override unsafe int GetBytes(char[] chars, int charIndex, int charCount,
      byte[] bytes, int byteIndex, bool flush)
  {
      // chars is reported in preference to bytes when both are null.
      if (chars is null)
      {
          throw new ArgumentNullException(nameof(chars), SR.ArgumentNull_Array);
      }

      if (bytes is null)
      {
          throw new ArgumentNullException(nameof(bytes), SR.ArgumentNull_Array);
      }

      if (charIndex < 0)
      {
          throw new ArgumentOutOfRangeException(nameof(charIndex), SR.ArgumentOutOfRange_NeedNonNegNum);
      }

      if (charCount < 0)
      {
          throw new ArgumentOutOfRangeException(nameof(charCount), SR.ArgumentOutOfRange_NeedNonNegNum);
      }

      if (charCount > chars.Length - charIndex)
      {
          throw new ArgumentOutOfRangeException(nameof(chars), SR.ArgumentOutOfRange_IndexCountBuffer);
      }

      if (byteIndex < 0 || byteIndex > bytes.Length)
      {
          throw new ArgumentOutOfRangeException(nameof(byteIndex), SR.ArgumentOutOfRange_Index);
      }

      // The whole tail of the output array is available to the encoder.
      int availableBytes = bytes.Length - byteIndex;

      Span<char> source = chars;
      Span<byte> destination = bytes;
      fixed (char* pSource = &MemoryMarshal.GetReference(source))
      fixed (byte* pDestination = &MemoryMarshal.GetReference(destination))
      {
          return GetBytes(pSource + charIndex, charCount,
              pDestination + byteIndex, availableBytes, flush);
      }
  }
  /// <summary>
  /// Pointer-based entry point: records the flush request on this instance, marks the
  /// operation as one that throws on overflow, and lets the owning encoding do the work.
  /// </summary>
  /// <param name="chars">Pointer to the first character to encode.</param>
  /// <param name="charCount">Number of characters to encode.</param>
  /// <param name="bytes">Pointer to the start of the output buffer.</param>
  /// <param name="byteCount">Size of the output buffer in bytes.</param>
  /// <param name="flush">Stored in <see cref="MustFlush"/> before the encoding is called.</param>
  /// <returns>The value returned by the owning encoding's GetBytes call.</returns>
  /// <exception cref="ArgumentNullException">
  /// <paramref name="chars"/> or <paramref name="bytes"/> is null.
  /// </exception>
  /// <exception cref="ArgumentOutOfRangeException">
  /// <paramref name="byteCount"/> or <paramref name="charCount"/> is negative.
  /// </exception>
  public unsafe override int GetBytes(char* chars, int charCount, byte* bytes, int byteCount, bool flush)
  {
      // chars is reported in preference to bytes when both are null.
      if (chars == null)
      {
          throw new ArgumentNullException(nameof(chars), SR.ArgumentNull_Array);
      }

      if (bytes == null)
      {
          throw new ArgumentNullException(nameof(bytes), SR.ArgumentNull_Array);
      }

      // byteCount is reported in preference to charCount when both are negative.
      if (byteCount < 0)
      {
          throw new ArgumentOutOfRangeException(nameof(byteCount), SR.ArgumentOutOfRange_NeedNonNegNum);
      }

      if (charCount < 0)
      {
          throw new ArgumentOutOfRangeException(nameof(charCount), SR.ArgumentOutOfRange_NeedNonNegNum);
      }

      // State that the encoding reads back through the encoder instance we pass it.
      _throwOnOverflow = true;
      _mustFlush = flush;

      Debug.Assert(_encoding != null);
      return _encoding.GetBytes(chars, charCount, bytes, byteCount, this);
  }
  /// <summary>
  /// Array-based conversion for callers whose output buffer might not be large enough for the
  /// entire result. Validates both slices, pins the arrays, and forwards to the pointer-based
  /// overload, which produces the three output values.
  /// </summary>
  /// <param name="chars">The array containing the characters to encode.</param>
  /// <param name="charIndex">Index of the first character to encode.</param>
  /// <param name="charCount">Number of characters to encode.</param>
  /// <param name="bytes">The array that receives the encoded bytes.</param>
  /// <param name="byteIndex">Index in <paramref name="bytes"/> at which writing starts.</param>
  /// <param name="byteCount">Maximum number of bytes that may be written.</param>
  /// <param name="flush">Forwarded unchanged to the pointer-based overload.</param>
  /// <param name="charsUsed">Set by the pointer-based overload.</param>
  /// <param name="bytesUsed">Set by the pointer-based overload.</param>
  /// <param name="completed">Set by the pointer-based overload.</param>
  /// <exception cref="ArgumentNullException">
  /// <paramref name="chars"/> or <paramref name="bytes"/> is null.
  /// </exception>
  /// <exception cref="ArgumentOutOfRangeException">
  /// An index or count is negative, or a slice extends past the end of its array.
  /// </exception>
  public override unsafe void Convert(char[] chars, int charIndex, int charCount,
      byte[] bytes, int byteIndex, int byteCount, bool flush,
      out int charsUsed, out int bytesUsed, out bool completed)
  {
      // chars is reported in preference to bytes when both are null.
      if (chars is null)
      {
          throw new ArgumentNullException(nameof(chars), SR.ArgumentNull_Array);
      }

      if (bytes is null)
      {
          throw new ArgumentNullException(nameof(bytes), SR.ArgumentNull_Array);
      }

      if (charIndex < 0)
      {
          throw new ArgumentOutOfRangeException(nameof(charIndex), SR.ArgumentOutOfRange_NeedNonNegNum);
      }

      if (charCount < 0)
      {
          throw new ArgumentOutOfRangeException(nameof(charCount), SR.ArgumentOutOfRange_NeedNonNegNum);
      }

      if (byteIndex < 0)
      {
          throw new ArgumentOutOfRangeException(nameof(byteIndex), SR.ArgumentOutOfRange_NeedNonNegNum);
      }

      if (byteCount < 0)
      {
          throw new ArgumentOutOfRangeException(nameof(byteCount), SR.ArgumentOutOfRange_NeedNonNegNum);
      }

      if (charCount > chars.Length - charIndex)
      {
          throw new ArgumentOutOfRangeException(nameof(chars), SR.ArgumentOutOfRange_IndexCountBuffer);
      }

      if (byteCount > bytes.Length - byteIndex)
      {
          throw new ArgumentOutOfRangeException(nameof(bytes), SR.ArgumentOutOfRange_IndexCountBuffer);
      }

      // Just call the pointer version (can't do this for non-msft encoders)
      Span<char> source = chars;
      Span<byte> destination = bytes;
      fixed (char* pSource = &MemoryMarshal.GetReference(source))
      fixed (byte* pDestination = &MemoryMarshal.GetReference(destination))
      {
          Convert(pSource + charIndex, charCount, pDestination + byteIndex, byteCount, flush,
              out charsUsed, out bytesUsed, out completed);
      }
  }
  /// <summary>
  /// Pointer-based conversion. Sets this instance's internal variables (flush request,
  /// do-not-throw-on-overflow, zeroed chars-used counter) and then calls the owning
  /// encoding's worker function to produce the bytes.
  /// </summary>
  /// <param name="chars">Pointer to the first character to encode.</param>
  /// <param name="charCount">Number of characters to encode.</param>
  /// <param name="bytes">Pointer to the start of the output buffer.</param>
  /// <param name="byteCount">Size of the output buffer in bytes.</param>
  /// <param name="flush">Stored in <see cref="MustFlush"/> before the encoding is called.</param>
  /// <param name="charsUsed">Receives the value of the chars-used counter after encoding.</param>
  /// <param name="bytesUsed">Receives the value returned by the encoding's GetBytes call.</param>
  /// <param name="completed">
  /// True only when all input was consumed and neither this encoder nor its fallback
  /// buffer holds data that still has to be output.
  /// </param>
  /// <exception cref="ArgumentNullException">
  /// <paramref name="bytes"/> or <paramref name="chars"/> is null.
  /// </exception>
  /// <exception cref="ArgumentOutOfRangeException">
  /// <paramref name="charCount"/> or <paramref name="byteCount"/> is negative.
  /// </exception>
  public override unsafe void Convert(char* chars, int charCount,
      byte* bytes, int byteCount, bool flush,
      out int charsUsed, out int bytesUsed, out bool completed)
  {
      // bytes is reported in preference to chars when both are null.
      if (bytes == null)
      {
          throw new ArgumentNullException(nameof(bytes), SR.ArgumentNull_Array);
      }

      if (chars == null)
      {
          throw new ArgumentNullException(nameof(chars), SR.ArgumentNull_Array);
      }

      // charCount is reported in preference to byteCount when both are negative.
      if (charCount < 0)
      {
          throw new ArgumentOutOfRangeException(nameof(charCount), SR.ArgumentOutOfRange_NeedNonNegNum);
      }

      if (byteCount < 0)
      {
          throw new ArgumentOutOfRangeException(nameof(byteCount), SR.ArgumentOutOfRange_NeedNonNegNum);
      }

      // We don't want to throw on overflow here; the counter is zeroed so it can be
      // reported back through charsUsed once the encoding call returns.
      _charsUsed = 0;
      _throwOnOverflow = false;
      _mustFlush = flush;

      // Do conversion
      Debug.Assert(_encoding != null);
      bytesUsed = _encoding.GetBytes(chars, charCount, bytes, byteCount, this);
      charsUsed = _charsUsed;

      // Per MSDN, "The completed output parameter indicates whether all the data in the input
      // buffer was converted and stored in the output buffer." That means we've successfully
      // consumed all the input _and_ there's no pending state or fallback data remaining to be output.
      bool consumedAllInput = charsUsed == charCount;
      completed = consumedAllInput
          && !HasState
          && (_fallbackBuffer is null || _fallbackBuffer.Remaining == 0);
  }
  /// <summary>
  /// Gets the encoding this encoder was constructed with.
  /// </summary>
  public Encoding Encoding
  {
      get
      {
          Encoding owner = _encoding;
          Debug.Assert(owner != null);
          return owner;
      }
  }
  /// <summary>
  /// Gets the flush flag recorded by the most recent GetByteCount, GetBytes or Convert call,
  /// unless it has since been cleared through <see cref="ClearMustFlush"/>.
  /// </summary>
  public bool MustFlush => _mustFlush;
  /// <summary>
  /// Indicates whether <see cref="Encoding.GetBytes(char*, int, byte*, int, EncoderNLS)"/> has to
  /// drain data held by this <see cref="EncoderNLS"/> instance before doing anything else:
  /// either a leftover character or unread data in the fallback buffer.
  /// </summary>
  internal bool HasLeftoverData
  {
      get
      {
          if (_charLeftOver != default)
          {
              return true;
          }

          return _fallbackBuffer != null && _fallbackBuffer.Remaining > 0;
      }
  }
  /// <summary>
  /// Indicates whether a leftover character is still held by this encoder.
  /// Virtual, so derived encoders can report additional state.
  /// </summary>
  internal virtual bool HasState => _charLeftOver != '\0';
  /// <summary>
  /// Clears the recorded flush request. Lets the encoding clear our must-flush flag
  /// instead of throwing (in ThrowBytesOverflow).
  /// </summary>
  internal void ClearMustFlush() => _mustFlush = false;
  /// <summary>
  /// Computes how many bytes the leftover high surrogate held by this instance would produce
  /// when combined with the start of <paramref name="chars"/>, without clearing the leftover
  /// character itself.
  /// </summary>
  /// <param name="chars">The input about to be counted; only its first element is examined.</param>
  /// <param name="charsConsumed">
  /// Receives 1 when the first element of <paramref name="chars"/> was paired with the leftover
  /// character to form a rune; otherwise 0.
  /// </param>
  /// <returns>
  /// 0 when there is no leftover character, or when the input is empty and no flush was requested;
  /// otherwise the byte count reported by the encoding for the combined rune, or the count
  /// obtained by draining the fallback buffer when the data had to go through fallback.
  /// </returns>
  /// <exception cref="ArgumentException">
  /// The fallback buffer still holds data from a previous invocation.
  /// </exception>
  internal int DrainLeftoverDataForGetByteCount(ReadOnlySpan<char> chars, out int charsConsumed)
  {
      // Quick check: we _should not_ have leftover fallback data from a previous invocation,
      // as we'd end up consuming any such data and would corrupt whatever Convert call happens
      // to be in progress.
      if (_fallbackBuffer != null && _fallbackBuffer.Remaining > 0)
      {
          throw new ArgumentException(SR.Format(SR.Argument_EncoderFallbackNotEmpty, Encoding.EncodingName, _fallbackBuffer.GetType()));
      }

      // If we have a leftover high surrogate from a previous operation, consume it now.
      // We won't clear the _charLeftOver field since GetByteCount is supposed to be
      // a non-mutating operation, and we need the field to retain its value for the
      // next call to Convert.
      charsConsumed = 0; // could be incorrect, will fix up later in the method

      if (_charLeftOver == default)
      {
          return 0; // no leftover high surrogate char - short-circuit and finish
      }

      char secondChar = default;

      if (chars.IsEmpty)
      {
          // If the input buffer is empty and we're not being asked to flush, no-op and return
          // success to our caller. If we're being asked to flush, the leftover high surrogate from
          // the previous operation will go through the fallback mechanism by itself.
          if (!MustFlush)
          {
              return 0; // no-op = success
          }
      }
      else
      {
          secondChar = chars[0];
      }

      // If we have to fallback the chars we're reading immediately below, populate the
      // fallback buffer with the invalid data. We'll just fall through to the "consume
      // fallback buffer" logic at the end of the method. The boolean returned by Fallback
      // is intentionally not captured: nothing in this method reads it.
      if (Rune.TryCreate(_charLeftOver, secondChar, out Rune rune))
      {
          charsConsumed = 1; // consumed the leftover high surrogate + the first char in the input buffer

          Debug.Assert(_encoding != null);
          if (_encoding.TryGetByteCount(rune, out int byteCount))
          {
              Debug.Assert(byteCount >= 0, "Encoding shouldn't have returned a negative byte count.");
              return byteCount;
          }

          // The fallback mechanism relies on a negative index to convey "the start of the invalid
          // sequence was some number of chars back before the current buffer." Here and in the
          // else-branch below, we know we have a single leftover high surrogate character from a
          // previous operation, so we provide an index of -1 to convey that the char immediately
          // before the current buffer was the start of the invalid sequence.
          FallbackBuffer.Fallback(_charLeftOver, secondChar, index: -1);
      }
      else
      {
          FallbackBuffer.Fallback(_charLeftOver, index: -1);
      }

      // Now tally the number of bytes that would've been emitted as part of fallback.
      Debug.Assert(_fallbackBuffer != null);
      return _fallbackBuffer.DrainRemainingDataForGetByteCount();
  }
  /// <summary>
  /// Writes out whatever this instance is still holding from a previous invocation - a leftover
  /// high surrogate or unread fallback data - before the caller encodes <paramref name="chars"/>.
  /// </summary>
  /// <param name="chars">The input about to be encoded; only its first element is examined.</param>
  /// <param name="bytes">The destination for any bytes produced.</param>
  /// <param name="charsConsumed">
  /// Receives 1 when the first element of <paramref name="chars"/> was paired with the leftover
  /// character to form a rune; otherwise 0.
  /// </param>
  /// <param name="bytesWritten">
  /// Receives the byte count reported by the encoding or by the fallback buffer drain; 0 when
  /// nothing was written.
  /// </param>
  /// <returns>
  /// True when there was nothing to drain or the leftover data was handled; otherwise the result
  /// of draining the fallback buffer into <paramref name="bytes"/>.
  /// </returns>
  internal bool TryDrainLeftoverDataForGetBytes(ReadOnlySpan<char> chars, Span<byte> bytes, out int charsConsumed, out int bytesWritten)
  {
      // We may have a leftover high surrogate data from a previous invocation, or we may have leftover
      // data in the fallback buffer, or we may have neither, but we will never have both. Check for these
      // conditions and handle them now.
      charsConsumed = 0; // could be incorrect, will fix up later in the method
      bytesWritten = 0; // could be incorrect, will fix up later in the method

      if (_charLeftOver != default)
      {
          char secondChar = default;

          if (!chars.IsEmpty)
          {
              secondChar = chars[0];
          }
          else if (!MustFlush)
          {
              // The input buffer is empty and we're not being asked to flush: no-op and report
              // success (both outputs are still zero). If we were asked to flush, the leftover
              // high surrogate goes through the fallback mechanism by itself below.
              return true;
          }

          // If we have to fallback the chars we're reading immediately below, populate the
          // fallback buffer with the invalid data. We'll just fall through to the "consume
          // fallback buffer" logic at the end of the method.
          if (!Rune.TryCreate(_charLeftOver, secondChar, out Rune rune))
          {
              // An index of -1 tells the fallback mechanism that the invalid sequence started
              // one char before the current buffer (the leftover high surrogate).
              FallbackBuffer.Fallback(_charLeftOver, index: -1);
          }
          else
          {
              charsConsumed = 1; // at the very least, we consumed 1 char from the input

              Debug.Assert(_encoding != null);
              OperationStatus status = _encoding.EncodeRune(rune, bytes, out bytesWritten);

              if (status == OperationStatus.Done)
              {
                  _charLeftOver = default; // we just consumed this char
                  return true; // that's all - we've handled the leftover data
              }
              else if (status == OperationStatus.DestinationTooSmall)
              {
                  _charLeftOver = default; // we just consumed this char
                  _encoding.ThrowBytesOverflow(this, nothingEncoded: true); // will throw
              }
              else if (status == OperationStatus.InvalidData)
              {
                  // Same -1 index convention as above: the sequence began before this buffer.
                  FallbackBuffer.Fallback(_charLeftOver, secondChar, index: -1);
              }
              else
              {
                  Debug.Fail("Unknown return value.");
              }
          }
      }

      // Now check the fallback buffer for any remaining data.
      if (_fallbackBuffer is null || _fallbackBuffer.Remaining <= 0)
      {
          return true; // nothing pending - success
      }

      return _fallbackBuffer.TryDrainRemainingDataForGetBytes(bytes, out bytesWritten);
  }
  335. }
  336. }