DecoderFallback.cs 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324
  1. // Licensed to the .NET Foundation under one or more agreements.
  2. // The .NET Foundation licenses this file to you under the MIT license.
  3. // See the LICENSE file in the project root for more information.
  4. using System.Diagnostics;
  5. using System.Diagnostics.CodeAnalysis;
  6. using System.Globalization;
  7. using System.Threading;
  8. namespace System.Text
  9. {
  /// <summary>
  /// Base class for decoder fallbacks. A fallback supplies a <see cref="DecoderFallbackBuffer"/>
  /// through <see cref="CreateFallbackBuffer"/> and reports, through <see cref="MaxCharCount"/>,
  /// the maximum number of characters it could return.
  /// </summary>
  public abstract class DecoderFallback
  {
      // Lazily created shared instances. Both start out null and are published by the
      // property getters below.
      private static DecoderFallback? s_replacementFallback; // Default fallback, uses no best fit & "?"
      private static DecoderFallback? s_exceptionFallback;

      /// <summary>
      /// Gets the shared <see cref="DecoderReplacementFallback"/> instance, creating it on first use.
      /// </summary>
      public static DecoderFallback ReplacementFallback
      {
          get
          {
              if (s_replacementFallback is null)
              {
                  // Store a new instance only if the field is still null; if another thread
                  // stored one first, that instance is kept and ours is discarded.
                  Interlocked.CompareExchange<DecoderFallback?>(ref s_replacementFallback, new DecoderReplacementFallback(), null);
              }

              return s_replacementFallback;
          }
      }

      /// <summary>
      /// Gets the shared <see cref="DecoderExceptionFallback"/> instance, creating it on first use.
      /// </summary>
      public static DecoderFallback ExceptionFallback
      {
          get
          {
              if (s_exceptionFallback is null)
              {
                  // Same publication scheme as ReplacementFallback: first stored instance wins.
                  Interlocked.CompareExchange<DecoderFallback?>(ref s_exceptionFallback, new DecoderExceptionFallback(), null);
              }

              return s_exceptionFallback;
          }
      }

      /// <summary>
      /// Creates the buffer that returns the appropriate Unicode string alternative for
      /// input that needs to fall back.
      /// </summary>
      /// <remarks>
      /// Most implementations will simply be: <c>return new MyCustomDecoderFallbackBuffer(this);</c>
      /// </remarks>
      public abstract DecoderFallbackBuffer CreateFallbackBuffer();

      /// <summary>
      /// Gets the maximum number of characters that this instance of this fallback could return.
      /// </summary>
      public abstract int MaxCharCount { get; }
  }
  /// <summary>
  /// Base class for the buffer created by <see cref="DecoderFallback.CreateFallbackBuffer"/>.
  /// Derived classes implement the public abstract members; the internal helpers in this
  /// class call <see cref="Fallback"/> and then read the resulting characters through
  /// <see cref="GetNextChar"/> until it returns 0.
  /// </summary>
  public abstract class DecoderFallbackBuffer
  {
      // Most implementations will probably need an implementation-specific constructor.

      /// <summary>
      /// Asks the buffer to prepare fallback output for bytes that could not be decoded.
      /// </summary>
      /// <param name="bytesUnknown">The bytes that could not be decoded.</param>
      /// <param name="index">The index associated with <paramref name="bytesUnknown"/> in the input.</param>
      /// <returns>
      /// When this returns <see langword="true"/>, the helpers in this class go on to read
      /// characters with <see cref="GetNextChar"/>; when it returns <see langword="false"/>,
      /// they produce no output.
      /// </returns>
      public abstract bool Fallback(byte[] bytesUnknown, int index);

      /// <summary>
      /// Gets the next fallback character. The helpers in this class treat a return value
      /// of 0 as "no more characters".
      /// </summary>
      public abstract char GetNextChar();

      /// <summary>Backs up a character.</summary>
      public abstract bool MovePrevious();

      /// <summary>Gets how many chars are left in this fallback.</summary>
      public abstract int Remaining { get; }

      /// <summary>
      /// Clears the buffer. The default implementation reads and discards characters
      /// until <see cref="GetNextChar"/> returns 0.
      /// </summary>
      public virtual void Reset()
      {
          char discarded;
          do
          {
              discarded = GetNextChar();
          }
          while (discarded != (char)0);
      }

      // Internal items to help us figure out what we're doing as far as error messages, etc.
      // These help us with our performance and messages internally.
      internal unsafe byte* byteStart;    // start of the input; used to compute the index passed to Fallback
      internal unsafe char* charEnd;      // end of the output buffer; InternalFallback never writes at or past it
      internal Encoding? _encoding;       // assigned by CreateAndInitialize; not read in this class
      internal DecoderNLS? _decoder;      // assigned by CreateAndInitialize; not read in this class
      private int _originalByteCount;     // used by the span-based helpers to compute the index passed to Fallback

      /// <summary>
      /// Clears <see cref="byteStart"/> and then calls <see cref="Reset"/>.
      /// </summary>
      internal unsafe void InternalReset()
      {
          byteStart = null;
          Reset();
      }

      /// <summary>
      /// Stores the pointers used by the pointer-based <c>InternalFallback</c> overloads.
      /// </summary>
      /// <remarks>
      /// This can't be part of the constructor because DecoderFallbacks would have to know
      /// how to implement these.
      /// </remarks>
      internal unsafe void InternalInitialize(byte* byteStart, char* charEnd)
      {
          this.byteStart = byteStart;
          this.charEnd = charEnd;
      }

      /// <summary>
      /// Obtains a fallback buffer (from <paramref name="decoder"/> when one is supplied,
      /// otherwise a new one from the encoding's <see cref="Encoding.DecoderFallback"/>)
      /// and records the encoding, decoder and original byte count on it.
      /// </summary>
      /// <param name="encoding">The encoding performing the conversion.</param>
      /// <param name="decoder">The decoder whose existing fallback buffer should be used, or <see langword="null"/>.</param>
      /// <param name="originalByteCount">
      /// Only used for keeping track of what 'index' value needs to be passed to the abstract
      /// <see cref="Fallback"/> method. The index value is calculated by subtracting the length
      /// of the remaining input (expected to be the entire remaining input buffer) from this value.
      /// </param>
      internal static DecoderFallbackBuffer CreateAndInitialize(Encoding encoding, DecoderNLS? decoder, int originalByteCount)
      {
          DecoderFallbackBuffer buffer;
          if (decoder is null)
          {
              buffer = encoding.DecoderFallback.CreateFallbackBuffer();
          }
          else
          {
              buffer = decoder.FallbackBuffer;
          }

          buffer._encoding = encoding;
          buffer._decoder = decoder;
          buffer._originalByteCount = originalByteCount;
          return buffer;
      }

      /// <summary>
      /// Falls back the current bytes by writing the fallback characters into the remaining
      /// char buffer.
      /// </summary>
      /// <remarks>
      /// This can only be called by our encodings (others have to use the public fallback methods).
      /// Right now this takes both <paramref name="bytes"/> and <paramref name="pBytes"/>, since we
      /// might have extra bytes, hence the array, and we might need the index, hence the pointer.
      /// </remarks>
      /// <param name="bytes">The bytes that could not be decoded.</param>
      /// <param name="pBytes">Pointer into the input; together with <see cref="byteStart"/> and the array length it yields the index passed to <see cref="Fallback"/>.</param>
      /// <param name="chars">Current output position. It is only advanced when the method succeeds.</param>
      /// <returns>
      /// <see langword="true"/> on success (including when <see cref="Fallback"/> returns
      /// <see langword="false"/> and nothing is written); <see langword="false"/> when the output
      /// buffer has no room, in which case the caller needs to report insufficient buffer space.
      /// </returns>
      /// <exception cref="ArgumentException">The fallback characters contain mismatched surrogates.</exception>
      internal unsafe virtual bool InternalFallback(byte[] bytes, byte* pBytes, ref char* chars)
      {
          Debug.Assert(byteStart != null, "[DecoderFallback.InternalFallback]Used InternalFallback without calling InternalInitialize");

          if (!this.Fallback(bytes, (int)(pBytes - byteStart - bytes.Length)))
          {
              // No fallback output: nothing to copy, and 'chars' stays untouched.
              return true;
          }

          // Write through a temporary pointer so 'chars' is not touched unless we succeed.
          char* destination = chars;
          bool pendingHighSurrogate = false;

          for (char ch = GetNextChar(); ch != 0; ch = GetNextChar())
          {
              // Make sure there are no mixed up surrogates (checked before the space check).
              TrackSurrogateOrder(ch, ref pendingHighSurrogate);

              if (destination >= charEnd)
              {
                  // No buffer space
                  return false;
              }

              *(destination++) = ch;
          }

          // The output must not end on a high surrogate that was never completed.
          if (pendingHighSurrogate)
          {
              throw new ArgumentException(SR.Argument_InvalidCharSequenceNoIndex);
          }

          // Now we aren't going to return false, so it's OK to update chars.
          chars = destination;
          return true;
      }

      /// <summary>
      /// Counting version of <c>InternalFallback</c>: performs the fallback and counts the
      /// resulting characters without copying anything.
      /// </summary>
      /// <remarks>
      /// Right now this takes both <paramref name="bytes"/> and <paramref name="pBytes"/>, since we
      /// might have extra bytes, hence the array, and we might need the index, hence the pointer.
      /// </remarks>
      /// <param name="bytes">The bytes that could not be decoded.</param>
      /// <param name="pBytes">Pointer into the input; together with <see cref="byteStart"/> and the array length it yields the index passed to <see cref="Fallback"/>.</param>
      /// <returns>The number of fallback characters, or 0 when <see cref="Fallback"/> returns <see langword="false"/>.</returns>
      /// <exception cref="ArgumentException">The fallback characters contain mismatched surrogates.</exception>
      internal unsafe virtual int InternalFallback(byte[] bytes, byte* pBytes)
      {
          Debug.Assert(byteStart != null, "[DecoderFallback.InternalFallback]Used InternalFallback without calling InternalInitialize");

          if (!this.Fallback(bytes, (int)(pBytes - byteStart - bytes.Length)))
          {
              // If no fallback return 0
              return 0;
          }

          int count = 0;
          bool pendingHighSurrogate = false;

          for (char ch = GetNextChar(); ch != 0; ch = GetNextChar())
          {
              // Make sure there are no mixed up surrogates.
              TrackSurrogateOrder(ch, ref pendingHighSurrogate);
              count++;
          }

          // The output must not end on a high surrogate that was never completed.
          if (pendingHighSurrogate)
          {
              throw new ArgumentException(SR.Argument_InvalidCharSequenceNoIndex);
          }

          return count;
      }

      /// <summary>
      /// Shared surrogate bookkeeping for the pointer-based <c>InternalFallback</c> overloads.
      /// Throws when a high surrogate follows an unmatched high surrogate, or when a low
      /// surrogate appears without a pending high surrogate. Non-surrogate characters leave
      /// the state unchanged.
      /// </summary>
      /// <param name="ch">The character just read from the fallback buffer.</param>
      /// <param name="pendingHighSurrogate">Whether a high surrogate is still waiting for its low surrogate.</param>
      /// <exception cref="ArgumentException">The surrogate ordering is invalid.</exception>
      private static void TrackSurrogateOrder(char ch, ref bool pendingHighSurrogate)
      {
          if (!char.IsSurrogate(ch))
          {
              return;
          }

          bool isHighSurrogate = char.IsHighSurrogate(ch);

          // high + pending high => two highs in a row; low + nothing pending => stray low.
          if (isHighSurrogate == pendingHighSurrogate)
          {
              throw new ArgumentException(SR.Argument_InvalidCharSequenceNoIndex);
          }

          pendingHighSurrogate = isHighSurrogate;
      }

      /// <summary>
      /// Performs the fallback for the first <paramref name="fallbackLength"/> bytes of
      /// <paramref name="remainingBytes"/> and returns how many UTF-16 chars it produces.
      /// </summary>
      /// <param name="remainingBytes">The remaining input; its length is subtracted from the original byte count to compute the index passed to <see cref="Fallback"/>.</param>
      /// <param name="fallbackLength">Number of leading bytes handed to <see cref="Fallback"/>.</param>
      /// <returns>The char count of the fallback output, or 0 when <see cref="Fallback"/> returns <see langword="false"/>.</returns>
      internal int InternalFallbackGetCharCount(ReadOnlySpan<byte> remainingBytes, int fallbackLength)
      {
          byte[] unknownBytes = remainingBytes.Slice(0, fallbackLength).ToArray();
          int index = _originalByteCount - remainingBytes.Length;

          if (!Fallback(unknownBytes, index))
          {
              return 0;
          }

          return DrainRemainingDataForGetCharCount();
      }

      /// <summary>
      /// Performs the fallback for the first <paramref name="fallbackLength"/> bytes of
      /// <paramref name="remainingBytes"/> and writes the output to <paramref name="chars"/>.
      /// </summary>
      /// <param name="remainingBytes">The remaining input; its length is subtracted from the original byte count to compute the index passed to <see cref="Fallback"/>.</param>
      /// <param name="fallbackLength">Number of leading bytes handed to <see cref="Fallback"/>.</param>
      /// <param name="chars">Destination for the fallback characters.</param>
      /// <param name="charsWritten">Number of chars written on success.</param>
      /// <returns><see langword="false"/> when <paramref name="chars"/> is too small; otherwise <see langword="true"/>.</returns>
      internal bool TryInternalFallbackGetChars(ReadOnlySpan<byte> remainingBytes, int fallbackLength, Span<char> chars, out int charsWritten)
      {
          byte[] unknownBytes = remainingBytes.Slice(0, fallbackLength).ToArray();
          int index = _originalByteCount - remainingBytes.Length;

          if (!Fallback(unknownBytes, index))
          {
              // Return true because we weren't asked to write anything, so this is a "success" in the
              // sense that the output buffer was large enough to hold the desired 0 chars of output.
              charsWritten = 0;
              return true;
          }

          return TryDrainRemainingDataForGetChars(chars, out charsWritten);
      }

      /// <summary>
      /// Reads the next scalar value from the fallback output.
      /// </summary>
      /// <remarks>
      /// Calls <see cref="GetNextChar"/> and tries treating the result as a non-surrogate
      /// character. If that fails, calls <see cref="GetNextChar"/> again and attempts to treat the
      /// two chars as a surrogate pair.
      /// </remarks>
      /// <exception cref="ArgumentException">
      /// Neither attempt produced a valid <see cref="Rune"/>, i.e. the fallback mechanism is
      /// giving us a bad replacement character.
      /// </exception>
      private Rune GetNextRune()
      {
          char first = GetNextChar();
          if (Rune.TryCreate(first, out Rune rune))
          {
              return rune;
          }

          if (Rune.TryCreate(first, GetNextChar(), out rune))
          {
              return rune;
          }

          throw new ArgumentException(SR.Argument_InvalidCharSequenceNoIndex);
      }

      /// <summary>
      /// Reads the fallback output until a rune with value 0 is returned and tallies its
      /// length in UTF-16 chars.
      /// </summary>
      /// <returns>The total number of UTF-16 chars in the fallback output.</returns>
      internal int DrainRemainingDataForGetCharCount()
      {
          int totalCharCount = 0;

          while (true)
          {
              Rune thisRune = GetNextRune();
              if (thisRune.Value == 0)
              {
                  break;
              }

              // We need to check for overflow while tallying the fallback char count. The check
              // below relies on wrap-around, so the addition is explicitly unchecked.
              totalCharCount = unchecked(totalCharCount + thisRune.Utf16SequenceLength);
              if (totalCharCount < 0)
              {
                  InternalReset();
                  Encoding.ThrowConversionOverflow();
              }
          }

          return totalCharCount;
      }

      /// <summary>
      /// Reads the fallback output until a rune with value 0 is returned, encoding each rune
      /// as UTF-16 into <paramref name="chars"/>.
      /// </summary>
      /// <param name="chars">Destination for the fallback characters.</param>
      /// <param name="charsWritten">Number of chars written on success; 0 on failure.</param>
      /// <returns>
      /// <see langword="false"/> when a rune does not fit in the remaining destination (the buffer
      /// is reset via <see cref="InternalReset"/> first); otherwise <see langword="true"/>.
      /// </returns>
      internal bool TryDrainRemainingDataForGetChars(Span<char> chars, out int charsWritten)
      {
          int initialLength = chars.Length;

          Rune thisRune;
          while ((thisRune = GetNextRune()).Value != 0)
          {
              if (!thisRune.TryEncodeToUtf16(chars, out int charsWrittenJustNow))
              {
                  InternalReset();
                  charsWritten = default;
                  return false;
              }

              // Shrink the destination so its length tracks how much space is left.
              chars = chars.Slice(charsWrittenJustNow);
          }

          charsWritten = initialLength - chars.Length;
          return true;
      }

      /// <summary>
      /// Throws an <see cref="ArgumentException"/> whose message lists the given bytes as
      /// <c>\xNN</c> values separated by spaces, showing at most the first 20 bytes followed by
      /// <c>" ..."</c> when further bytes were omitted.
      /// </summary>
      /// <param name="bytesUnknown">The bytes to report; <see langword="null"/> is treated as empty.</param>
      /// <exception cref="ArgumentException">Always thrown.</exception>
      [DoesNotReturn]
      internal void ThrowLastBytesRecursive(byte[] bytesUnknown)
      {
          const int MaxBytesShown = 20;

          bytesUnknown ??= Array.Empty<byte>();

          // Create a string representation of our bytes. Each byte renders as 4 chars plus a
          // separator, and at most MaxBytesShown bytes are rendered, so size the builder from
          // that rather than from the (possibly huge) array length.
          int shownCount = Math.Min(bytesUnknown.Length, MaxBytesShown);
          StringBuilder strBytes = new StringBuilder(shownCount * 5 + 4);

          for (int i = 0; i < shownCount; i++)
          {
              if (i > 0)
              {
                  strBytes.Append(' ');
              }

              strBytes.AppendFormat(CultureInfo.InvariantCulture, "\\x{0:X2}", bytesUnknown[i]);
          }

          // Mark truncation only when bytes were actually left out; an input of exactly
          // MaxBytesShown bytes is shown in full and gets no ellipsis.
          if (bytesUnknown.Length > MaxBytesShown)
          {
              strBytes.Append(" ...");
          }

          // Throw it, using our complete bytes
          throw new ArgumentException(
              SR.Format(SR.Argument_RecursiveFallbackBytes, strBytes.ToString()),
              nameof(bytesUnknown));
      }
  }
  265. }