EncoderFallback.cs 15 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389
  1. // Licensed to the .NET Foundation under one or more agreements.
  2. // The .NET Foundation licenses this file to you under the MIT license.
  3. // See the LICENSE file in the project root for more information.
  4. using System.Buffers;
  5. using System.Diagnostics;
  6. using System.Diagnostics.CodeAnalysis;
  7. using System.Threading;
  8. namespace System.Text
  9. {
  // Base class for the strategy objects that decide what is produced when an input
  // character cannot be encoded. Two shared, lazily created instances are exposed
  // through the static properties below.
  public abstract class EncoderFallback
  {
      // Backing fields for the shared instances. Both start out null and are filled in on first use.
      private static EncoderFallback? s_replacementFallback; // Default fallback, uses no best fit & "?"
      private static EncoderFallback? s_exceptionFallback;

      // Shared replacement fallback (a default-constructed EncoderReplacementFallback).
      // The compare-exchange only stores the new instance while the field is still null, so
      // when several threads race here they all end up returning whichever instance was stored first.
      public static EncoderFallback ReplacementFallback =>
          s_replacementFallback ??
          Interlocked.CompareExchange<EncoderFallback?>(ref s_replacementFallback, new EncoderReplacementFallback(), null) ??
          s_replacementFallback;

      // Shared exception fallback (a default-constructed EncoderExceptionFallback).
      // Published the same way as ReplacementFallback.
      public static EncoderFallback ExceptionFallback =>
          s_exceptionFallback ??
          Interlocked.CompareExchange<EncoderFallback?>(ref s_exceptionFallback, new EncoderExceptionFallback(), null) ??
          s_exceptionFallback;

      // Creates the buffer that supplies the Unicode alternative for a character that
      // needs to fall back.
      // Most implementations will be:
      //      return new MyCustomEncoderFallbackBuffer(this);
      public abstract EncoderFallbackBuffer CreateFallbackBuffer();

      // Maximum number of characters that this instance of this fallback could return.
      public abstract int MaxCharCount { get; }
  }
  // Base class for the buffer that hands out replacement characters once an input
  // character has been found to be unencodable. Subclasses implement the abstract
  // members; the internal members below are the plumbing the encodings use to drive them.
  public abstract class EncoderFallbackBuffer
  {
      // Most implementations will probably need an implementation-specific constructor.

      // Abstract contract implemented by each concrete fallback buffer.
      //
      // Fallback is invoked with the char (or surrogate pair) that could not be encoded and
      // its index in the input. The code in this class treats a 'true' result as "the buffer
      // now holds replacement data that should be drained" and 'false' as "nothing to drain".
      public abstract bool Fallback(char charUnknown, int index);

      public abstract bool Fallback(char charUnknownHigh, char charUnknownLow, int index);

      // Get next character. This class treats a returned (char)0 as "no more data".
      public abstract char GetNextChar();

      // Back up a character. Used below to undo a read that could not be consumed.
      public abstract bool MovePrevious();

      // How many chars left in this fallback?
      public abstract int Remaining { get; }

      // Clear the buffer by reading from it until it reports that nothing is left.
      // (Original note: not sure if this should be public or not.)
      public virtual void Reset()
      {
          while (GetNextChar() != '\0')
          {
              // Intentionally empty: the call in the loop condition does the draining.
          }
      }

      // Internal items to help us figure out what we're doing as far as error messages, etc.
      // These help us with our performance and messages internally.
      internal unsafe char* charStart;    // start of the input buffer; used to compute the fallback index
      internal unsafe char* charEnd;      // end of the input buffer; used to detect "no more input"
      internal EncoderNLS? encoder; // TODO: MAKE ME PRIVATE
      internal bool setEncoder;           // true if we may store a left-over high surrogate in the encoder
      internal bool bUsedEncoder;         // set when a left-over char was actually stored in the encoder
      internal bool bFallingBack = false; // true while replacement data from a previous fallback is pending
      internal int iRecursionCount = 0;   // fallbacks requested while bFallingBack was already set
      private const int iMaxRecursion = 250; // once iRecursionCount exceeds this, the fallback is reported as recursive
      private Encoding? encoding;         // encoding used to convert the replacement data to bytes
      private int originalCharCount;      // length of the original input; used to compute the fallback index

      // Internal Reset
      // For example, what if someone fails a conversion and wants to reset one of our fallback buffers?
      internal unsafe void InternalReset()
      {
          charStart = null;
          bFallingBack = false;
          iRecursionCount = 0;
          Reset();
      }

      // Set the pointer-based state used by InternalFallback(char, ref char*).
      // This can't be part of the constructor because EncoderFallbacks would have to know how to implement these.
      internal unsafe void InternalInitialize(char* charStart, char* charEnd, EncoderNLS? encoder, bool setEncoder)
      {
          this.charStart = charStart;
          this.charEnd = charEnd;
          this.encoder = encoder;
          this.setEncoder = setEncoder;
          this.bUsedEncoder = false;
          this.bFallingBack = false;
          this.iRecursionCount = 0;
      }

      // Obtains a fallback buffer (a fresh one from the encoding's EncoderFallback when there is
      // no encoder instance, otherwise the encoder's own FallbackBuffer) and records the state
      // needed by the span-based helpers below.
      internal static EncoderFallbackBuffer CreateAndInitialize(Encoding encoding, EncoderNLS? encoder, int originalCharCount)
      {
          // The original char count is only used for keeping track of what 'index' value needs
          // to be passed to the abstract Fallback method. The index value is calculated by subtracting
          // 'chars.Length' (where chars is expected to be the entire remaining input buffer)
          // from the 'originalCharCount' value specified here.

          EncoderFallbackBuffer buffer = encoder is null
              ? encoding.EncoderFallback.CreateFallbackBuffer()
              : encoder.FallbackBuffer;

          buffer.encoding = encoding;
          buffer.encoder = encoder;
          buffer.originalCharCount = originalCharCount;
          return buffer;
      }

      // Reads the next replacement char and keeps the bookkeeping in sync: we are still
      // "falling back" only while the buffer keeps producing non-zero chars, and the
      // recursion counter starts over once the buffer runs dry.
      internal char InternalGetNextChar()
      {
          char next = GetNextChar();
          bFallingBack = next != 0;

          if (!bFallingBack)
          {
              iRecursionCount = 0;
          }

          return next;
      }

      // Invokes the appropriate abstract Fallback overload for the data at the start of 'chars'
      // (expected to be the entire remaining input) and reports via 'charsConsumed' how many
      // input chars that covered (2 for a well-formed surrogate pair, otherwise 1).
      // Returns whatever the Fallback overload returned.
      private bool InternalFallback(ReadOnlySpan<char> chars, out int charsConsumed)
      {
          Debug.Assert(!chars.IsEmpty, "Caller shouldn't invoke this if there's no data to fall back.");

          // First, try falling back a single BMP character or a standalone low surrogate.
          // If the first char is a high surrogate, we'll try to combine it with the next
          // char in the input sequence.
          //
          // The indexer below already throws for an empty span, so no further emptiness
          // check is needed before looking at the second element.

          char firstChar = chars[0];
          char secondChar = (1 < (uint)chars.Length) ? chars[1] : default;

          // Ask the subclassed type to initiate fallback logic.

          int index = originalCharCount - chars.Length;

          if (char.IsSurrogatePair(firstChar, secondChar))
          {
              charsConsumed = 2;
              return Fallback(firstChar, secondChar, index);
          }

          // This code path is also used when 'firstChar' is a standalone surrogate or
          // if it's a high surrogate at the end of the input buffer.

          charsConsumed = 1;
          return Fallback(firstChar, index);
      }

      // Falls back the data at the start of 'chars' and returns the number of bytes the
      // resulting replacement data would occupy (0 if the fallback produced no data).
      internal int InternalFallbackGetByteCount(ReadOnlySpan<char> chars, out int charsConsumed)
      {
          // When the fallback reports data in the buffer, pull it out now and tally it.
          return InternalFallback(chars, out charsConsumed)
              ? DrainRemainingDataForGetByteCount()
              : 0;
      }

      // Falls back the data at the start of 'chars' and writes the encoded replacement data
      // to 'bytes'. Returns false only if the destination ran out of space.
      internal bool TryInternalFallbackGetBytes(ReadOnlySpan<char> chars, Span<byte> bytes, out int charsConsumed, out int bytesWritten)
      {
          if (!InternalFallback(chars, out charsConsumed))
          {
              // There's no data in the fallback buffer.
              bytesWritten = 0;
              return true; // true = didn't run out of space in destination buffer
          }

          // There's data in the fallback buffer - pull it out now.
          return TryDrainRemainingDataForGetBytes(bytes, out bytesWritten);
      }

      // Encodes everything still held by the fallback buffer into 'bytes'.
      // Returns true when the buffer was fully drained, false when 'bytes' was too small; in
      // both cases 'bytesWritten' is the number of bytes produced so far.
      internal bool TryDrainRemainingDataForGetBytes(Span<byte> bytes, out int bytesWritten)
      {
          int originalBytesLength = bytes.Length;

          Debug.Assert(encoding != null);

          Rune thisRune;
          while ((thisRune = GetNextRune()).Value != 0)
          {
              switch (encoding.EncodeRune(thisRune, bytes, out int bytesWrittenJustNow))
              {
                  case OperationStatus.Done:

                      // Advance past what was just written and move on to the next rune.
                      bytes = bytes.Slice(bytesWrittenJustNow);
                      continue;

                  case OperationStatus.DestinationTooSmall:

                      // Since we're not consuming the Rune we just read, back up as many chars as necessary
                      // to undo the read we just performed, then report to our caller that we ran out of space.

                      for (int i = 0; i < thisRune.Utf16SequenceLength; i++)
                      {
                          MovePrevious();
                      }

                      bytesWritten = originalBytesLength - bytes.Length;
                      return false; // ran out of destination buffer

                  case OperationStatus.InvalidData:

                      // We can't fallback the fallback. We can't make forward progress, so report to our caller
                      // that something went terribly wrong. The error message contains the fallback char that
                      // couldn't be converted. (Ideally we'd provide the first char that originally triggered
                      // the fallback, but it's complicated to keep this state around, and a fallback producing
                      // invalid data should be a very rare occurrence.)

                      ThrowLastCharRecursive(thisRune.Value);
                      break; // will never be hit; call above throws

                  default:

                      Debug.Fail("Unexpected return value.");
                      break;
              }
          }

          bytesWritten = originalBytesLength - bytes.Length;
          return true; // finished successfully
      }

      // Returns the number of bytes needed to encode everything still held by the fallback buffer.
      internal int DrainRemainingDataForGetByteCount()
      {
          int totalByteCount = 0;

          Debug.Assert(encoding != null);

          Rune thisRune;
          while ((thisRune = GetNextRune()).Value != 0)
          {
              if (!encoding.TryGetByteCount(thisRune, out int byteCountThisIteration))
              {
                  // We can't fallback the fallback. We can't make forward progress, so report to our caller
                  // that something went terribly wrong. The error message contains the fallback char that
                  // couldn't be converted. (Ideally we'd provide the first char that originally triggered
                  // the fallback, but it's complicated to keep this state around, and a fallback producing
                  // invalid data should be a very rare occurrence.)

                  ThrowLastCharRecursive(thisRune.Value);
              }

              Debug.Assert(byteCountThisIteration >= 0, "Encoding shouldn't have returned a negative byte count.");

              // We need to check for overflow while tallying the fallback byte count.
              // A sum that has gone negative means the running total wrapped around.

              totalByteCount += byteCountThisIteration;
              if (totalByteCount < 0)
              {
                  InternalReset();
                  Encoding.ThrowConversionOverflow();
              }
          }

          return totalByteCount;
      }

      // Reads one scalar value from the fallback buffer: a single char when that char is a valid
      // Rune on its own, otherwise that char combined with the following one. The callers above
      // stop draining when the returned Rune has the value 0.
      // Throws ArgumentException when neither form yields a valid Rune.
      private Rune GetNextRune()
      {
          char firstChar = GetNextChar();

          if (Rune.TryCreate(firstChar, out Rune rune))
          {
              return rune;
          }

          // Not valid on its own; only now is a second char pulled from the buffer.
          if (Rune.TryCreate(firstChar, GetNextChar(), out rune))
          {
              return rune;
          }

          throw new ArgumentException(SR.Argument_InvalidCharSequenceNoIndex);
      }

      // Fallback the current character using the remaining buffer and encoder if necessary
      // This can only be called by our encodings (other have to use the public fallback methods), so
      // we can use our EncoderNLS here too.
      // setEncoder is true if we're calling from a GetBytes method, false if we're calling from a GetByteCount
      //
      // Note that this could also change the contents of this.encoder, which is the same
      // object that the caller is using, so the caller could mess up the encoder for us
      // if they aren't careful.
      internal unsafe virtual bool InternalFallback(char ch, ref char* chars)
      {
          // Shouldn't have null charStart
          Debug.Assert(charStart != null,
              "[EncoderFallback.InternalFallbackBuffer]Fallback buffer is not initialized");

          // Get our index, remember chars was preincremented to point at next char, so have to -1
          int index = (int)(chars - charStart) - 1;

          // See if it was a high surrogate
          if (char.IsHighSurrogate(ch))
          {
              // See if there's a low surrogate to go with it
              if (chars >= this.charEnd)
              {
                  // Nothing left in input buffer
                  // No input, return 0 if mustflush is false
                  if (this.encoder != null && !this.encoder.MustFlush)
                  {
                      // Done, nothing to fallback. When allowed to, park the high surrogate
                      // in the encoder as its left-over char.
                      if (this.setEncoder)
                      {
                          bUsedEncoder = true;
                          this.encoder._charLeftOver = ch;
                      }

                      bFallingBack = false;
                      return false;
                  }
              }
              else
              {
                  // Might have a low surrogate
                  char cNext = *chars;
                  if (char.IsLowSurrogate(cNext))
                  {
                      // If already falling back then fail once the recursion limit is exceeded
                      if (bFallingBack && iRecursionCount++ > iMaxRecursion)
                      {
                          ThrowLastCharRecursive(char.ConvertToUtf32(ch, cNext));
                      }

                      // Next is a surrogate, add it as surrogate pair, and increment chars
                      chars++;
                      bFallingBack = Fallback(ch, cNext, index);
                      return bFallingBack;
                  }

                  // Next isn't a low surrogate, just fallback the high surrogate
              }
          }

          // If already falling back then fail once the recursion limit is exceeded
          if (bFallingBack && iRecursionCount++ > iMaxRecursion)
          {
              ThrowLastCharRecursive((int)ch);
          }

          // Fall back our char
          bFallingBack = Fallback(ch, index);

          return bFallingBack;
      }

      // Helper that reports a fallback which could not be completed. Always throws an
      // ArgumentException whose message is formatted with the offending scalar value.
      [DoesNotReturn]
      internal void ThrowLastCharRecursive(int charRecursive)
      {
          // Throw it, using our complete character
          throw new ArgumentException(
              SR.Format(SR.Argument_RecursiveFallback,
                  charRecursive), "chars");
      }
  }
  310. }