UnicodeEncoding.cs 88 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877
  1. // Licensed to the .NET Foundation under one or more agreements.
  2. // The .NET Foundation licenses this file to you under the MIT license.
  3. // See the LICENSE file in the project root for more information.
  4. //
  5. // Don't override IsAlwaysNormalized because it is just a Unicode Transformation and could be confused.
  6. //
  7. // This define can be used to turn off the fast loops. Useful for finding whether
  8. // the problem is fastloop-specific.
  9. #define FASTLOOP
  10. using System.Diagnostics;
  11. using System.Runtime.InteropServices;
  12. using Internal.Runtime.CompilerServices;
  13. namespace System.Text
  14. {
  15. public class UnicodeEncoding : Encoding
  16. {
  17. // Shared default instances used by Encoding.BigEndianUnicode/Unicode for lazy initialization.
  18. // The initialization code will not be run until a static member of the class is referenced.
  // Both defaults are constructed with byteOrderMark: true; they differ only in the bigEndian flag.
  19. internal static readonly UnicodeEncoding s_bigEndianDefault = new UnicodeEncoding(bigEndian: true, byteOrderMark: true);
  20. internal static readonly UnicodeEncoding s_littleEndianDefault = new UnicodeEncoding(bigEndian: false, byteOrderMark: true);
  // Assigned from the throwOnInvalidBytes constructor argument. SetDefaultFallbacks reads it to choose
  // between the exception fallbacks (true) and the U+FFFD replacement fallbacks (false).
  21. private readonly bool isThrowException = false;
  // Assigned from the constructor argument. Selects the value passed to the base constructor
  // (1201 when true, 1200 when false) and takes part in the fast-loop eligibility test.
  22. private readonly bool bigEndian = false;
  // Assigned from the constructor argument.
  // NOTE(review): presumably controls whether a byte order mark preamble is produced; its use is outside this view - confirm.
  23. private readonly bool byteOrderMark = false;
  24. // Unicode version 2.0 character size in bytes
  25. public const int CharSize = 2;
  // Default instance: little-endian byte order, byte order mark enabled.
  public UnicodeEncoding()
      : this(bigEndian: false, byteOrderMark: true)
  {
  }
  // Creates an instance with the requested byte order and byte-order-mark setting.
  // The base Encoding is constructed with 1201 for big-endian and 1200 for little-endian.
  public UnicodeEncoding(bool bigEndian, bool byteOrderMark)
      : base(bigEndian ? 1201 : 1200)
  {
      // The two assignments are independent; both fields are read-only after construction.
      this.byteOrderMark = byteOrderMark;
      this.bigEndian = bigEndian;
  }
  // Creates an instance that additionally chooses between throwing on invalid data
  // (throwOnInvalidBytes == true) and the replacement-character behavior.
  public UnicodeEncoding(bool bigEndian, bool byteOrderMark, bool throwOnInvalidBytes)
      : this(bigEndian, byteOrderMark)
  {
      isThrowException = throwOnInvalidBytes;

      // The fallbacks were already set up during base construction, before this flag was
      // assigned, so they must be re-established when exceptions were requested.
      if (throwOnInvalidBytes)
      {
          SetDefaultFallbacks();
      }
  }
  // Installs the encoder/decoder fallbacks that match the isThrowException flag.
  internal sealed override void SetDefaultFallbacks()
  {
      // Exception mode: invalid data raises in both directions.
      if (isThrowException)
      {
          encoderFallback = EncoderFallback.ExceptionFallback;
          decoderFallback = DecoderFallback.ExceptionFallback;
          return;
      }

      // Replacement mode: invalid data is replaced with U+FFFD in both directions.
      encoderFallback = new EncoderReplacementFallback("\xFFFD");
      decoderFallback = new DecoderReplacementFallback("\xFFFD");
  }
  58. // The following methods are copied from EncodingNLS.cs.
  59. // Unfortunately EncodingNLS.cs is internal and we're public, so we have to re-implement them here.
  60. // These should be kept in sync for the following classes:
  61. // EncodingNLS, UTF7Encoding, UTF8Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding
  62. //
  63. // Returns the number of bytes required to encode a range of characters in
  64. // a character array.
  65. //
  66. // All of our public Encodings that don't use EncodingNLS must have this (including EncodingNLS)
  67. // So if you fix this, fix the others. Currently those include:
  68. // EncodingNLS, UTF7Encoding, UTF8Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding
  69. // parent method is safe
  public override unsafe int GetByteCount(char[] chars, int index, int count)
  {
      // Argument checks, performed in the same order as before: null, index, count, range.
      if (chars is null)
      {
          throw new ArgumentNullException(nameof(chars), SR.ArgumentNull_Array);
      }

      if (index < 0)
      {
          throw new ArgumentOutOfRangeException(nameof(index), SR.ArgumentOutOfRange_NeedNonNegNum);
      }

      if (count < 0)
      {
          throw new ArgumentOutOfRangeException(nameof(count), SR.ArgumentOutOfRange_NeedNonNegNum);
      }

      if (count > chars.Length - index)
      {
          throw new ArgumentOutOfRangeException(nameof(chars), SR.ArgumentOutOfRange_IndexCountBuffer);
      }

      // An empty range needs no bytes; returning here also avoids pinning an empty array.
      if (count == 0)
      {
          return 0;
      }

      // Pin the array and pass the requested window to the pointer-based overload, without an encoder.
      fixed (char* pinnedChars = chars)
      {
          return GetByteCount(pinnedChars + index, count, null);
      }
  }
  86. // All of our public Encodings that don't use EncodingNLS must have this (including EncodingNLS)
  87. // So if you fix this, fix the others. Currently those include:
  88. // EncodingNLS, UTF7Encoding, UTF8Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding
  89. // parent method is safe
  public override unsafe int GetByteCount(string s)
  {
      // Only null is rejected here.
      if (s is null)
      {
          throw new ArgumentNullException(nameof(s));
      }

      // Pin the string and count over all of its characters, without an encoder.
      int length = s.Length;
      fixed (char* pinnedChars = s)
      {
          return GetByteCount(pinnedChars, length, null);
      }
  }
  98. // All of our public Encodings that don't use EncodingNLS must have this (including EncodingNLS)
  99. // So if you fix this, fix the others. Currently those include:
  100. // EncodingNLS, UTF7Encoding, UTF8Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding
  [CLSCompliant(false)]
  public override unsafe int GetByteCount(char* chars, int count)
  {
      // Reject a null pointer first, then a negative count.
      if (chars == null)
      {
          throw new ArgumentNullException(nameof(chars), SR.ArgumentNull_Array);
      }

      if (count < 0)
      {
          throw new ArgumentOutOfRangeException(nameof(count), SR.ArgumentOutOfRange_NeedNonNegNum);
      }

      // Delegate to the three-argument overload, passing null for the encoder.
      int result = GetByteCount(chars, count, null);
      return result;
  }
  112. // Parent method is safe.
  113. // All of our public Encodings that don't use EncodingNLS must have this (including EncodingNLS)
  114. // So if you fix this, fix the others. Currently those include:
  115. // EncodingNLS, UTF7Encoding, UTF8Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding
  public override unsafe int GetBytes(string s, int charIndex, int charCount,
      byte[] bytes, int byteIndex)
  {
      // Null checks: the source string is reported before the destination array.
      if (s is null)
      {
          throw new ArgumentNullException(nameof(s), SR.ArgumentNull_Array);
      }

      if (bytes is null)
      {
          throw new ArgumentNullException(nameof(bytes), SR.ArgumentNull_Array);
      }

      // Range checks: charIndex is reported before charCount.
      if (charIndex < 0)
      {
          throw new ArgumentOutOfRangeException(nameof(charIndex), SR.ArgumentOutOfRange_NeedNonNegNum);
      }

      if (charCount < 0)
      {
          throw new ArgumentOutOfRangeException(nameof(charCount), SR.ArgumentOutOfRange_NeedNonNegNum);
      }

      if (charCount > s.Length - charIndex)
      {
          throw new ArgumentOutOfRangeException(nameof(s), SR.ArgumentOutOfRange_IndexCount);
      }

      if (byteIndex > bytes.Length || byteIndex < 0)
      {
          throw new ArgumentOutOfRangeException(nameof(byteIndex), SR.ArgumentOutOfRange_Index);
      }

      // Space available in the destination from byteIndex to the end of the array.
      int bytesAvailable = bytes.Length - byteIndex;

      // Pin both buffers; the destination is pinned through its span reference.
      fixed (char* pinnedChars = s)
      fixed (byte* pinnedBytes = &MemoryMarshal.GetReference((Span<byte>)bytes))
      {
          return GetBytes(pinnedChars + charIndex, charCount, pinnedBytes + byteIndex, bytesAvailable, null);
      }
  }
  131. // Encodes a range of characters in a character array into a range of bytes
  132. // in a byte array. An exception occurs if the byte array is not large
  133. // enough to hold the complete encoding of the characters. The
  134. // GetByteCount method can be used to determine the exact number of
  135. // bytes that will be produced for a given range of characters.
  136. // Alternatively, the GetMaxByteCount method can be used to
  137. // determine the maximum number of bytes that will be produced for a given
  138. // number of characters, regardless of the actual character values.
  139. //
  140. // All of our public Encodings that don't use EncodingNLS must have this (including EncodingNLS)
  141. // So if you fix this, fix the others. Currently those include:
  142. // EncodingNLS, UTF7Encoding, UTF8Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding
  143. // parent method is safe
  public override unsafe int GetBytes(char[] chars, int charIndex, int charCount,
      byte[] bytes, int byteIndex)
  {
      // Null checks: the source array is reported before the destination array.
      if (chars is null)
      {
          throw new ArgumentNullException(nameof(chars), SR.ArgumentNull_Array);
      }

      if (bytes is null)
      {
          throw new ArgumentNullException(nameof(bytes), SR.ArgumentNull_Array);
      }

      // Range checks: charIndex is reported before charCount.
      if (charIndex < 0)
      {
          throw new ArgumentOutOfRangeException(nameof(charIndex), SR.ArgumentOutOfRange_NeedNonNegNum);
      }

      if (charCount < 0)
      {
          throw new ArgumentOutOfRangeException(nameof(charCount), SR.ArgumentOutOfRange_NeedNonNegNum);
      }

      if (charCount > chars.Length - charIndex)
      {
          throw new ArgumentOutOfRangeException(nameof(chars), SR.ArgumentOutOfRange_IndexCountBuffer);
      }

      if (byteIndex > bytes.Length || byteIndex < 0)
      {
          throw new ArgumentOutOfRangeException(nameof(byteIndex), SR.ArgumentOutOfRange_Index);
      }

      // Nothing to encode: report zero bytes written without pinning anything.
      if (charCount == 0)
      {
          return 0;
      }

      // This is the room left in the destination after byteIndex, not the length of the whole array.
      int bytesAvailable = bytes.Length - byteIndex;

      fixed (char* pinnedChars = chars)
      fixed (byte* pinnedBytes = &MemoryMarshal.GetReference((Span<byte>)bytes))
      {
          return GetBytes(pinnedChars + charIndex, charCount, pinnedBytes + byteIndex, bytesAvailable, null);
      }
  }
  165. // All of our public Encodings that don't use EncodingNLS must have this (including EncodingNLS)
  166. // So if you fix this, fix the others. Currently those include:
  167. // EncodingNLS, UTF7Encoding, UTF8Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding
  [CLSCompliant(false)]
  public override unsafe int GetBytes(char* chars, int charCount, byte* bytes, int byteCount)
  {
      // Null checks: the destination pointer is reported before the source pointer.
      if (bytes == null)
      {
          throw new ArgumentNullException(nameof(bytes), SR.ArgumentNull_Array);
      }

      if (chars == null)
      {
          throw new ArgumentNullException(nameof(chars), SR.ArgumentNull_Array);
      }

      // Negative counts: charCount is reported before byteCount.
      if (charCount < 0)
      {
          throw new ArgumentOutOfRangeException(nameof(charCount), SR.ArgumentOutOfRange_NeedNonNegNum);
      }

      if (byteCount < 0)
      {
          throw new ArgumentOutOfRangeException(nameof(byteCount), SR.ArgumentOutOfRange_NeedNonNegNum);
      }

      // Delegate to the five-argument overload, passing null for the encoder.
      int written = GetBytes(chars, charCount, bytes, byteCount, null);
      return written;
  }
  178. // Returns the number of characters produced by decoding a range of bytes
  179. // in a byte array.
  180. //
  181. // All of our public Encodings that don't use EncodingNLS must have this (including EncodingNLS)
  182. // So if you fix this, fix the others. Currently those include:
  183. // EncodingNLS, UTF7Encoding, UTF8Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding
  184. // parent method is safe
  public override unsafe int GetCharCount(byte[] bytes, int index, int count)
  {
      // Argument checks, performed in the same order as before: null, index, count, range.
      if (bytes is null)
      {
          throw new ArgumentNullException(nameof(bytes), SR.ArgumentNull_Array);
      }

      if (index < 0)
      {
          throw new ArgumentOutOfRangeException(nameof(index), SR.ArgumentOutOfRange_NeedNonNegNum);
      }

      if (count < 0)
      {
          throw new ArgumentOutOfRangeException(nameof(count), SR.ArgumentOutOfRange_NeedNonNegNum);
      }

      if (count > bytes.Length - index)
      {
          throw new ArgumentOutOfRangeException(nameof(bytes), SR.ArgumentOutOfRange_IndexCountBuffer);
      }

      // No input means no characters; returning here also avoids pinning an empty array.
      if (count == 0)
      {
          return 0;
      }

      // Pin the array and pass the requested window to the pointer-based overload (null third argument).
      fixed (byte* pinnedBytes = bytes)
      {
          return GetCharCount(pinnedBytes + index, count, null);
      }
  }
  201. // All of our public Encodings that don't use EncodingNLS must have this (including EncodingNLS)
  202. // So if you fix this, fix the others. Currently those include:
  203. // EncodingNLS, UTF7Encoding, UTF8Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding
  [CLSCompliant(false)]
  public override unsafe int GetCharCount(byte* bytes, int count)
  {
      // Reject a null pointer first, then a negative count.
      if (bytes == null)
      {
          throw new ArgumentNullException(nameof(bytes), SR.ArgumentNull_Array);
      }

      if (count < 0)
      {
          throw new ArgumentOutOfRangeException(nameof(count), SR.ArgumentOutOfRange_NeedNonNegNum);
      }

      // Delegate to the three-argument overload with a null third argument.
      int result = GetCharCount(bytes, count, null);
      return result;
  }
  214. // All of our public Encodings that don't use EncodingNLS must have this (including EncodingNLS)
  215. // So if you fix this, fix the others. Currently those include:
  216. // EncodingNLS, UTF7Encoding, UTF8Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding
  217. // parent method is safe
  public override unsafe int GetChars(byte[] bytes, int byteIndex, int byteCount,
      char[] chars, int charIndex)
  {
      // Null checks: the source array is reported before the destination array.
      if (bytes is null)
      {
          throw new ArgumentNullException(nameof(bytes), SR.ArgumentNull_Array);
      }

      if (chars is null)
      {
          throw new ArgumentNullException(nameof(chars), SR.ArgumentNull_Array);
      }

      // Range checks: byteIndex is reported before byteCount.
      if (byteIndex < 0)
      {
          throw new ArgumentOutOfRangeException(nameof(byteIndex), SR.ArgumentOutOfRange_NeedNonNegNum);
      }

      if (byteCount < 0)
      {
          throw new ArgumentOutOfRangeException(nameof(byteCount), SR.ArgumentOutOfRange_NeedNonNegNum);
      }

      if (byteCount > bytes.Length - byteIndex)
      {
          throw new ArgumentOutOfRangeException(nameof(bytes), SR.ArgumentOutOfRange_IndexCountBuffer);
      }

      if (charIndex > chars.Length || charIndex < 0)
      {
          throw new ArgumentOutOfRangeException(nameof(charIndex), SR.ArgumentOutOfRange_Index);
      }

      // Nothing to decode: report zero characters written without pinning anything.
      if (byteCount == 0)
      {
          return 0;
      }

      // This is the room left in the destination after charIndex, not the length of the whole array.
      int charsAvailable = chars.Length - charIndex;

      fixed (byte* pinnedBytes = bytes)
      fixed (char* pinnedChars = &MemoryMarshal.GetReference((Span<char>)chars))
      {
          return GetChars(pinnedBytes + byteIndex, byteCount, pinnedChars + charIndex, charsAvailable, null);
      }
  }
  239. // All of our public Encodings that don't use EncodingNLS must have this (including EncodingNLS)
  240. // So if you fix this, fix the others. Currently those include:
  241. // EncodingNLS, UTF7Encoding, UTF8Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding
  [CLSCompliant(false)]
  public override unsafe int GetChars(byte* bytes, int byteCount, char* chars, int charCount)
  {
      // Null checks: the source pointer is reported before the destination pointer.
      if (bytes == null)
      {
          throw new ArgumentNullException(nameof(bytes), SR.ArgumentNull_Array);
      }

      if (chars == null)
      {
          throw new ArgumentNullException(nameof(chars), SR.ArgumentNull_Array);
      }

      // Negative counts: charCount is reported before byteCount.
      if (charCount < 0)
      {
          throw new ArgumentOutOfRangeException(nameof(charCount), SR.ArgumentOutOfRange_NeedNonNegNum);
      }

      if (byteCount < 0)
      {
          throw new ArgumentOutOfRangeException(nameof(byteCount), SR.ArgumentOutOfRange_NeedNonNegNum);
      }

      // Delegate to the five-argument overload with a null fifth argument.
      int written = GetChars(bytes, byteCount, chars, charCount, null);
      return written;
  }
  252. // Returns a string containing the decoded representation of a range of
  253. // bytes in a byte array.
  254. //
  255. // All of our public Encodings that don't use EncodingNLS must have this (including EncodingNLS)
  256. // So if you fix this, fix the others. Currently those include:
  257. // EncodingNLS, UTF7Encoding, UTF8Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding
  258. // parent method is safe
  public override unsafe string GetString(byte[] bytes, int index, int count)
  {
      // Argument checks, performed in the same order as before: null, index, count, range.
      if (bytes is null)
      {
          throw new ArgumentNullException(nameof(bytes), SR.ArgumentNull_Array);
      }

      if (index < 0)
      {
          throw new ArgumentOutOfRangeException(nameof(index), SR.ArgumentOutOfRange_NeedNonNegNum);
      }

      if (count < 0)
      {
          throw new ArgumentOutOfRangeException(nameof(count), SR.ArgumentOutOfRange_NeedNonNegNum);
      }

      if (count > bytes.Length - index)
      {
          throw new ArgumentOutOfRangeException(nameof(bytes), SR.ArgumentOutOfRange_IndexCountBuffer);
      }

      // An empty range yields the empty string and avoids pinning an empty buffer.
      if (count == 0)
      {
          return string.Empty;
      }

      // Pin the source and build the string from the requested window using this encoding.
      fixed (byte* pinnedBytes = bytes)
      {
          return string.CreateStringFromEncoding(pinnedBytes + index, count, this);
      }
  }
  274. //
  275. // End of standard methods copied from EncodingNLS.cs
  276. //
  // Counts the bytes needed to encode `count` chars starting at `chars`, optionally continuing
  // from the state held by `encoder` (a pending high surrogate and/or its fallback buffer).
  //
  // Approach: start from 2 bytes per input char, then correct the total while scanning:
  //   - a char delivered by the fallback buffer adds 2 bytes (it was not part of the initial estimate);
  //   - an unpaired surrogate removes its 2 bytes and is handed to the fallback buffer instead;
  //   - a high surrogate still pending at the end removes its 2 bytes; when encoder is null or
  //     encoder.MustFlush is set it is pushed through the fallback and the loop is re-entered once.
  //
  // Throws:
  //   ArgumentOutOfRangeException - count << 1 overflows to a negative value.
  //   ArgumentException           - the encoder's fallback buffer still has chars remaining on entry,
  //                                 or a high surrogate is still pending after the flush retry.
  //
  // Callers are expected to have validated chars/count already (only Debug.Assert checks them here).
  277. internal sealed override unsafe int GetByteCount(char* chars, int count, EncoderNLS? encoder)
  278. {
  279. Debug.Assert(chars != null, "[UnicodeEncoding.GetByteCount]chars!=null");
  280. Debug.Assert(count >= 0, "[UnicodeEncoding.GetByteCount]count >=0");
  281. // Start by assuming each char gets 2 bytes
  282. int byteCount = count << 1;
  283. // Check for overflow in byteCount
  284. // (If they were all invalid chars, this would actually be wrong,
  285. // but that's a ridiculously large # so we're not concerned about that case)
  286. if (byteCount < 0)
  287. throw new ArgumentOutOfRangeException(nameof(count), SR.ArgumentOutOfRange_GetByteCountOverflow);
  288. char* charStart = chars;
  289. char* charEnd = chars + count;
  // Pending high surrogate waiting for its low surrogate; 0 means none.
  290. char charLeftOver = (char)0;
  // Set once the end-of-input flush has been attempted, so a second attempt throws instead of looping.
  291. bool wasHereBefore = false;
  292. // For fallback we may need a fallback buffer
  293. EncoderFallbackBuffer? fallbackBuffer = null;
  294. char* charsForFallback;
  295. if (encoder != null)
  296. {
  297. charLeftOver = encoder._charLeftOver;
  298. // Assume extra bytes to encode charLeftOver if it existed
  299. if (charLeftOver > 0)
  300. byteCount += 2;
  301. // We mustn't have left over fallback data when counting
  302. if (encoder.InternalHasFallbackBuffer)
  303. {
  304. fallbackBuffer = encoder.FallbackBuffer;
  305. if (fallbackBuffer.Remaining > 0)
  306. throw new ArgumentException(SR.Format(SR.Argument_EncoderFallbackNotEmpty, this.EncodingName, encoder.Fallback?.GetType()));
  307. // Set our internal fallback interesting things.
  308. fallbackBuffer.InternalInitialize(charStart, charEnd, encoder, false);
  309. }
  310. }
  311. char ch;
  312. TryAgain:
  // Each iteration takes the next fallback char if there is one (ch != 0), otherwise the next input char.
  313. while (((ch = (fallbackBuffer == null) ? (char)0 : fallbackBuffer.InternalGetNextChar()) != 0) || chars < charEnd)
  314. {
  315. // First unwind any fallback
  316. if (ch == 0)
  317. {
  318. // No fallback, maybe we can do it fast
  319. #if FASTLOOP
  320. // If endianness is backwards then each pair of bytes would be backwards.
  // The fast loop also requires an aligned pointer (8 bytes under BIT64, otherwise 4) and no pending high surrogate.
  321. if ((bigEndian ^ BitConverter.IsLittleEndian) &&
  322. #if BIT64
  323. (unchecked((long)chars) & 7) == 0 &&
  324. #else
  325. (unchecked((int)chars) & 3) == 0 &&
  326. #endif
  327. charLeftOver == 0)
  328. {
  329. // Stop 3 chars short of charEnd: the loop below reads one ulong (4 chars) at a time,
  330. // so requiring longChars < longEnd keeps every 4-char read entirely before charEnd.
  331. // (With fewer than 4 chars left the loop body never runs.)
  332. ulong* longEnd = (ulong*)(charEnd - 3);
  333. // Need new char* so we can check 4 at a time
  334. ulong* longChars = (ulong*)chars;
  335. while (longChars < longEnd)
  336. {
  337. // See if we potentially have surrogates (0x8000 bit set)
  338. // (We're either big endian on a big endian machine or little endian on
  339. // a little endian machine so that'll work)
  340. if ((0x8000800080008000 & *longChars) != 0)
  341. {
  342. // See if any of these are high or low surrogates (0xd800 - 0xdfff). If the high
  343. // 5 bits looks like 11011, then its a high or low surrogate.
  344. // We do the & f800 to filter the 5 bits, then ^ d800 to ensure the 0 isn't set.
  345. // Note that we expect BMP characters to be more common than surrogates
  346. // & each char with 11111... then ^ with 11011. Zeroes then indicate surrogates
  347. ulong uTemp = (0xf800f800f800f800 & *longChars) ^ 0xd800d800d800d800;
  348. // Check each of the 4 chars. 0 for those 16 bits means it was a surrogate
  349. // but no clue if they're high or low.
  350. // If each of the 4 characters are non-zero, then none are surrogates.
  351. if ((uTemp & 0xFFFF000000000000) == 0 ||
  352. (uTemp & 0x0000FFFF00000000) == 0 ||
  353. (uTemp & 0x00000000FFFF0000) == 0 ||
  354. (uTemp & 0x000000000000FFFF) == 0)
  355. {
  356. // It has at least 1 surrogate, but we don't know if they're high or low surrogates,
  357. // or if there's 1 or 4 surrogates
  358. // If they happen to be high/low/high/low, we may as well continue. Check the next
  359. // bit to see if its set (low) or not (high) in the right pattern
  360. if ((0xfc00fc00fc00fc00 & *longChars) !=
  361. (BitConverter.IsLittleEndian ? (ulong)0xdc00d800dc00d800 : (ulong)0xd800dc00d800dc00))
  362. {
  363. // Either there weren't 4 surrogates, or the 0x0400 bit was set when a high
  364. // was hoped for or the 0x0400 bit wasn't set where a low was hoped for.
  365. // Drop out to the slow loop to resolve the surrogates
  366. break;
  367. }
  368. // else they are all surrogates in High/Low/High/Low order, so we can use them.
  369. }
  370. // else none are surrogates, so we can use them.
  371. }
  372. // else all < 0x8000 so we can use them
  373. // We already counted these four chars, go to next long.
  374. longChars++;
  375. }
  // Resume char-by-char processing from wherever the fast loop stopped.
  376. chars = (char*)longChars;
  // If the fast loop consumed everything, leave the outer while loop.
  377. if (chars >= charEnd)
  378. break;
  379. }
  380. #endif // FASTLOOP
  381. // No fallback, just get next char
  382. ch = *chars;
  383. chars++;
  384. }
  385. else
  386. {
  387. // This char came from the fallback buffer, so it was not part of the
  // initial 2-bytes-per-input-char estimate; count it now.
  388. byteCount += 2;
  389. }
  390. // Check for high or low surrogates
  391. if (ch >= 0xd800 && ch <= 0xdfff)
  392. {
  393. // Was it a high surrogate?
  394. if (ch <= 0xdbff)
  395. {
  396. // Its a high surrogate, if we already had a high surrogate do its fallback
  397. if (charLeftOver > 0)
  398. {
  399. // Unwind the current character, this should be safe because we
  400. // don't have leftover data in the fallback, so chars must have
  401. // advanced already.
  402. Debug.Assert(chars > charStart,
  403. "[UnicodeEncoding.GetByteCount]Expected chars to have advanced in unexpected high surrogate");
  404. chars--;
  405. // If previous high surrogate deallocate 2 bytes
  406. byteCount -= 2;
  407. // Fallback the previous surrogate
  408. // Need to initialize fallback buffer?
  409. if (fallbackBuffer == null)
  410. {
  411. if (encoder == null)
  412. fallbackBuffer = this.encoderFallback.CreateFallbackBuffer();
  413. else
  414. fallbackBuffer = encoder.FallbackBuffer;
  415. // Set our internal fallback interesting things.
  416. fallbackBuffer.InternalInitialize(charStart, charEnd, encoder, false);
  417. }
  418. charsForFallback = chars; // Avoid passing chars by reference to allow it to be enregistered
  419. fallbackBuffer.InternalFallback(charLeftOver, ref charsForFallback);
  420. chars = charsForFallback;
  421. // Now no high surrogate left over
  422. charLeftOver = (char)0;
  423. continue;
  424. }
  425. // Remember this high surrogate
  426. charLeftOver = ch;
  427. continue;
  428. }
  429. // Its a low surrogate
  430. if (charLeftOver == 0)
  431. {
  432. // Expected a previous high surrogate.
  433. // Don't count this one (we'll count its fallback if necessary)
  434. byteCount -= 2;
  435. // fallback this one
  436. // Need to initialize fallback buffer?
  437. if (fallbackBuffer == null)
  438. {
  439. if (encoder == null)
  440. fallbackBuffer = this.encoderFallback.CreateFallbackBuffer();
  441. else
  442. fallbackBuffer = encoder.FallbackBuffer;
  443. // Set our internal fallback interesting things.
  444. fallbackBuffer.InternalInitialize(charStart, charEnd, encoder, false);
  445. }
  446. charsForFallback = chars; // Avoid passing chars by reference to allow it to be enregistered
  447. fallbackBuffer.InternalFallback(ch, ref charsForFallback);
  448. chars = charsForFallback;
  449. continue;
  450. }
  451. // Valid surrogate pair (both halves are already counted); clear the pending high surrogate
  452. charLeftOver = (char)0;
  453. continue;
  454. }
  455. else if (charLeftOver > 0)
  456. {
  457. // Expected a low surrogate, but this char is normal
  458. // Rewind the current character, fallback previous character.
  459. // this should be safe because we don't have leftover data in the
  460. // fallback, so chars must have advanced already.
  461. Debug.Assert(chars > charStart,
  462. "[UnicodeEncoding.GetByteCount]Expected chars to have advanced when expected low surrogate");
  463. chars--;
  464. // fallback previous chars
  465. // Need to initialize fallback buffer?
  466. if (fallbackBuffer == null)
  467. {
  468. if (encoder == null)
  469. fallbackBuffer = this.encoderFallback.CreateFallbackBuffer();
  470. else
  471. fallbackBuffer = encoder.FallbackBuffer;
  472. // Set our internal fallback interesting things.
  473. fallbackBuffer.InternalInitialize(charStart, charEnd, encoder, false);
  474. }
  475. charsForFallback = chars; // Avoid passing chars by reference to allow it to be enregistered
  476. fallbackBuffer.InternalFallback(charLeftOver, ref charsForFallback);
  477. chars = charsForFallback;
  478. // Ignore charLeftOver or throw
  479. byteCount -= 2;
  480. charLeftOver = (char)0;
  481. continue;
  482. }
  483. // Ok we had something to add (already counted)
  484. }
  485. // Don't allocate space for left over char
  486. if (charLeftOver > 0)
  487. {
  488. byteCount -= 2;
  489. // If we have to flush, stick it in fallback and try again
  490. if (encoder == null || encoder.MustFlush)
  491. {
  492. if (wasHereBefore)
  493. {
  494. // Throw it, using our complete character
  495. throw new ArgumentException(
  496. SR.Format(SR.Argument_RecursiveFallback, charLeftOver), nameof(chars));
  497. }
  498. else
  499. {
  500. // Need to initialize fallback buffer?
  501. if (fallbackBuffer == null)
  502. {
  503. if (encoder == null)
  504. fallbackBuffer = this.encoderFallback.CreateFallbackBuffer();
  505. else
  506. fallbackBuffer = encoder.FallbackBuffer;
  507. // Set our internal fallback interesting things.
  508. fallbackBuffer.InternalInitialize(charStart, charEnd, encoder, false);
  509. }
  510. charsForFallback = chars; // Avoid passing chars by reference to allow it to be enregistered
  511. fallbackBuffer.InternalFallback(charLeftOver, ref charsForFallback);
  512. chars = charsForFallback;
  513. charLeftOver = (char)0;
  514. wasHereBefore = true;
  515. goto TryAgain;
  516. }
  517. }
  518. }
  519. // Shouldn't have anything in fallback buffer for GetByteCount
  520. // (don't have to check _throwOnOverflow for count)
  521. Debug.Assert(fallbackBuffer == null || fallbackBuffer.Remaining == 0,
  522. "[UnicodeEncoding.GetByteCount]Expected empty fallback buffer at end");
  523. // Counting only: nothing is written back to the encoder here
  524. return byteCount;
  525. }
  /// <summary>
  /// Encodes the UTF-16 chars in [<paramref name="chars"/>, chars + charCount) into
  /// <paramref name="bytes"/>, two bytes per char, in the byte order selected by <c>bigEndian</c>.
  /// Unpaired surrogates are routed through the encoder fallback instead of being written.
  /// </summary>
  /// <param name="chars">Start of the input chars; must not be null.</param>
  /// <param name="charCount">Number of input chars; must be non-negative.</param>
  /// <param name="bytes">Start of the output buffer; must not be null.</param>
  /// <param name="byteCount">Capacity of the output buffer in bytes; must be non-negative.</param>
  /// <param name="encoder">
  /// Optional stateful encoder. When supplied, a pending high surrogate is read from and written
  /// back to <c>_charLeftOver</c>, and <c>_charsUsed</c> receives the number of chars consumed.
  /// </param>
  /// <returns>The number of bytes written to <paramref name="bytes"/>.</returns>
  /// <exception cref="ArgumentException">
  /// The encoder's fallback buffer still holds data while the encoder is in throw-on-overflow mode,
  /// or the fallback output itself ended with a dangling high surrogate while flushing.
  /// </exception>
  internal sealed override unsafe int GetBytes(
      char* chars, int charCount, byte* bytes, int byteCount, EncoderNLS? encoder)
  {
      Debug.Assert(chars != null, "[UnicodeEncoding.GetBytes]chars!=null");
      Debug.Assert(byteCount >= 0, "[UnicodeEncoding.GetBytes]byteCount >=0");
      Debug.Assert(charCount >= 0, "[UnicodeEncoding.GetBytes]charCount >=0");
      Debug.Assert(bytes != null, "[UnicodeEncoding.GetBytes]bytes!=null");

      // Pending high surrogate (0 means none).
      char charLeftOver = (char)0;
      char ch;
      // Set once the end-of-input flush has pushed a dangling high surrogate into the fallback;
      // a second dangling high surrogate after that is reported as a recursive fallback.
      bool wasHereBefore = false;

      byte* byteEnd = bytes + byteCount;
      char* charEnd = chars + charCount;
      byte* byteStart = bytes;
      char* charStart = chars;

      // For fallback we may need a fallback buffer
      EncoderFallbackBuffer? fallbackBuffer = null;
      char* charsForFallback;

      // Get our encoder, but don't clear it yet.
      if (encoder != null)
      {
          charLeftOver = encoder._charLeftOver;

          if (encoder.InternalHasFallbackBuffer)
          {
              // We always need the fallback buffer in get bytes so we can flush any remaining ones if necessary
              fallbackBuffer = encoder.FallbackBuffer;

              // Leftover fallback data is only an error when the encoder is in throw-on-overflow mode.
              if (fallbackBuffer.Remaining > 0 && encoder._throwOnOverflow)
                  throw new ArgumentException(SR.Format(SR.Argument_EncoderFallbackNotEmpty, this.EncodingName, encoder.Fallback?.GetType()));

              // Set our internal fallback interesting things.
              fallbackBuffer.InternalInitialize(charStart, charEnd, encoder, false);
          }
      }

  TryAgain:
      // Drain pending fallback chars first (ch != 0), then consume input chars.
      while (((ch = (fallbackBuffer == null) ?
                  (char)0 : fallbackBuffer.InternalGetNextChar()) != 0) ||
             chars < charEnd)
      {
          // First unwind any fallback
          if (ch == 0)
          {
              // No fallback, maybe we can do it fast
#if FASTLOOP
              // If endianess is backwards then each pair of bytes would be backwards.
              if ((bigEndian ^ BitConverter.IsLittleEndian) &&
#if BIT64
                  (unchecked((long)chars) & 7) == 0 &&
#else
                  (unchecked((int)chars) & 3) == 0 &&
#endif
                  charLeftOver == 0)
              {
                  // The fast loop moves 4 chars (8 bytes) per iteration, so it may only start at
                  // positions at least 4 chars before the limit. The limit is the shorter of the
                  // remaining input chars and the remaining output capacity measured in chars.
                  ulong* longEnd = (ulong*)(chars - 3 +
                                            (((byteEnd - bytes) >> 1 < charEnd - chars) ?
                                              (byteEnd - bytes) >> 1 : charEnd - chars));

                  // Need new char* so we can check 4 at a time
                  ulong* longChars = (ulong*)chars;
                  ulong* longBytes = (ulong*)bytes;

                  while (longChars < longEnd)
                  {
                      // See if we potentially have surrogates (0x8000 bit set)
                      // (We're either big endian on a big endian machine or little endian on
                      // a little endian machine so that'll work)
                      if ((0x8000800080008000 & *longChars) != 0)
                      {
                          // Keep the top 5 bits of each char (& 0xf800) and XOR with 0xd800:
                          // a 16-bit lane that ends up 0 held a surrogate (0xd800 - 0xdfff).
                          // Note that we expect BMP characters to be more common than surrogates.
                          ulong uTemp = (0xf800f800f800f800 & *longChars) ^ 0xd800d800d800d800;

                          // Check each of the 4 chars. 0 for those 16 bits means it was a surrogate
                          // but no clue if they're high or low.
                          // If each of the 4 characters are non-zero, then none are surrogates.
                          if ((uTemp & 0xFFFF000000000000) == 0 ||
                              (uTemp & 0x0000FFFF00000000) == 0 ||
                              (uTemp & 0x00000000FFFF0000) == 0 ||
                              (uTemp & 0x000000000000FFFF) == 0)
                          {
                              // At least 1 surrogate. The block is still usable as-is if the 4 chars
                              // are exactly high/low/high/low, which the top 6 bits of each lane tell us.
                              if ((0xfc00fc00fc00fc00 & *longChars) !=
                                  (BitConverter.IsLittleEndian ? (ulong)0xdc00d800dc00d800 : (ulong)0xd800dc00d800dc00))
                              {
                                  // Either there weren't 4 surrogates, or the 0x0400 bit was set when a high
                                  // was hoped for or the 0x0400 bit wasn't set where a low was hoped for.
                                  // Drop out to the slow loop to resolve the surrogates
                                  break;
                              }
                              // else they are all surrogates in High/Low/High/Low order, so we can use them.
                          }
                          // else none are surrogates, so we can use them.
                      }
                      // else all < 0x8000 so we can use them

                      // We can use these 4 chars. Only chars was alignment-checked, so the
                      // destination is written with an unaligned store.
                      Unsafe.WriteUnaligned<ulong>(longBytes, *longChars);
                      longChars++;
                      longBytes++;
                  }

                  chars = (char*)longChars;
                  bytes = (byte*)longBytes;

                  if (chars >= charEnd)
                      break;
              }
#endif // FASTLOOP

              // No fallback, just get next char
              ch = *chars;
              chars++;
          }

          // Check for high or low surrogates
          if (ch >= 0xd800 && ch <= 0xdfff)
          {
              // Was it a high surrogate?
              if (ch <= 0xdbff)
              {
                  // Its a high surrogate, see if we already had a high surrogate
                  if (charLeftOver > 0)
                  {
                      // Unwind the current character, this should be safe because we
                      // don't have leftover data in the fallback, so chars must have
                      // advanced already.
                      Debug.Assert(chars > charStart,
                          "[UnicodeEncoding.GetBytes]Expected chars to have advanced in unexpected high surrogate");
                      chars--;

                      // Fallback the previous surrogate
                      fallbackBuffer = EnsureEncoderFallbackBufferForGetBytes(fallbackBuffer, encoder, charStart, charEnd);

                      charsForFallback = chars; // Avoid passing chars by reference to allow it to be en-registered
                      fallbackBuffer.InternalFallback(charLeftOver, ref charsForFallback);
                      chars = charsForFallback;

                      charLeftOver = (char)0;
                      continue;
                  }

                  // Remember this high surrogate
                  charLeftOver = ch;
                  continue;
              }

              // Its a low surrogate
              if (charLeftOver == 0)
              {
                  // No preceding high surrogate: we'll fall back this one
                  fallbackBuffer = EnsureEncoderFallbackBufferForGetBytes(fallbackBuffer, encoder, charStart, charEnd);

                  charsForFallback = chars; // Avoid passing chars by reference to allow it to be en-registered
                  fallbackBuffer.InternalFallback(ch, ref charsForFallback);
                  chars = charsForFallback;
                  continue;
              }

              // Valid surrogate pair, add our charLeftOver (the pair needs 4 bytes in total)
              if (bytes + 3 >= byteEnd)
              {
                  // Not enough room to add this surrogate pair
                  if (fallbackBuffer != null && fallbackBuffer.bFallingBack)
                  {
                      // These must have both been from the fallbacks.
                      // Both of these MUST have been from a fallback because if the 1st wasn't
                      // from a fallback, then a high surrogate followed by an illegal char
                      // would've caused the high surrogate to fall back. If a high surrogate
                      // fell back, then it was consumed and both chars came from the fallback.
                      fallbackBuffer.MovePrevious(); // Didn't use either fallback surrogate
                      fallbackBuffer.MovePrevious();
                  }
                  else
                  {
                      // If we don't have enough room, then either we should've advanced a while
                      // or we should have bytes==byteStart and throw below
                      Debug.Assert(chars > charStart + 1 || bytes == byteStart,
                          "[UnicodeEncoding.GetBytes]Expected chars to have when no room to add surrogate pair");
                      chars -= 2; // Didn't use either surrogate
                  }

                  ThrowBytesOverflow(encoder, bytes == byteStart); // Throw maybe (if no bytes written)
                  charLeftOver = (char)0; // we'll retry it later
                  break; // Didn't throw, but stop 'til next time.
              }

              // Write the high surrogate here; the low surrogate (ch) is written by the
              // common "char to add" code at the bottom of the loop.
              if (bigEndian)
              {
                  *(bytes++) = (byte)(charLeftOver >> 8);
                  *(bytes++) = (byte)charLeftOver;
              }
              else
              {
                  *(bytes++) = (byte)charLeftOver;
                  *(bytes++) = (byte)(charLeftOver >> 8);
              }

              charLeftOver = (char)0;
          }
          else if (charLeftOver > 0)
          {
              // Expected a low surrogate, but this char is normal
              // Rewind the current character, fallback previous character.
              // this should be safe because we don't have leftover data in the
              // fallback, so chars must have advanced already.
              Debug.Assert(chars > charStart,
                  "[UnicodeEncoding.GetBytes]Expected chars to have advanced after expecting low surrogate");
              chars--;

              // fallback previous chars
              fallbackBuffer = EnsureEncoderFallbackBufferForGetBytes(fallbackBuffer, encoder, charStart, charEnd);

              charsForFallback = chars; // Avoid passing chars by reference to allow it to be en-registered
              fallbackBuffer.InternalFallback(charLeftOver, ref charsForFallback);
              chars = charsForFallback;

              // Ignore charLeftOver or throw
              charLeftOver = (char)0;
              continue;
          }

          // Ok, we have a char to add (needs 2 bytes)
          if (bytes + 1 >= byteEnd)
          {
              // Couldn't add this char
              if (fallbackBuffer != null && fallbackBuffer.bFallingBack)
                  fallbackBuffer.MovePrevious(); // Not using this fallback char
              else
              {
                  // Lonely charLeftOver (from previous call) would've been caught up above,
                  // so this must be a case where we've already read an input char.
                  Debug.Assert(chars > charStart,
                      "[UnicodeEncoding.GetBytes]Expected chars to have advanced for failed fallback");
                  chars--; // Not using this char
              }

              ThrowBytesOverflow(encoder, bytes == byteStart); // Throw maybe (if no bytes written)
              break; // didn't throw, just stop
          }

          if (bigEndian)
          {
              *(bytes++) = (byte)(ch >> 8);
              *(bytes++) = (byte)ch;
          }
          else
          {
              *(bytes++) = (byte)ch;
              *(bytes++) = (byte)(ch >> 8);
          }
      }

      // Input ended with a high surrogate still pending
      if (charLeftOver > 0)
      {
          // If we are flushing (no encoder, or the encoder must flush) the pending high
          // surrogate cannot be carried over to a later call, so it has to fall back now.
          if (encoder == null || encoder.MustFlush)
          {
              if (wasHereBefore)
              {
                  // Throw it, using our complete character
                  throw new ArgumentException(
                      SR.Format(SR.Argument_RecursiveFallback, charLeftOver), nameof(chars));
              }
              else
              {
                  // Stick it in the fallback and run the loop again to emit the fallback output
                  fallbackBuffer = EnsureEncoderFallbackBufferForGetBytes(fallbackBuffer, encoder, charStart, charEnd);

                  charsForFallback = chars; // Avoid passing chars by reference to allow it to be en-registered
                  fallbackBuffer.InternalFallback(charLeftOver, ref charsForFallback);
                  chars = charsForFallback;

                  charLeftOver = (char)0;
                  wasHereBefore = true;
                  goto TryAgain;
              }
          }
      }

      // Record state in the encoder: the pending high surrogate (if not flushing)
      // and how many input chars were consumed.
      if (encoder != null)
      {
          encoder._charLeftOver = charLeftOver;
          encoder._charsUsed = (int)(chars - charStart);
      }

      // Remember charLeftOver if we must, or clear it if we're flushing
      // (charLeftOver should be 0 if we're flushing)
      Debug.Assert((encoder != null && !encoder.MustFlush) || charLeftOver == (char)0,
          "[UnicodeEncoding.GetBytes] Expected no left over characters if flushing");

      Debug.Assert(fallbackBuffer == null || fallbackBuffer.Remaining == 0 ||
          encoder == null || !encoder._throwOnOverflow,
          "[UnicodeEncoding.GetBytes]Expected empty fallback buffer if not converting");

      return (int)(bytes - byteStart);
  }

  /// <summary>
  /// Returns <paramref name="fallbackBuffer"/> if it already exists; otherwise obtains one (from
  /// <paramref name="encoder"/> when present, else from this encoding's encoder fallback) and
  /// initializes it for the current GetBytes input range.
  /// </summary>
  private unsafe EncoderFallbackBuffer EnsureEncoderFallbackBufferForGetBytes(
      EncoderFallbackBuffer? fallbackBuffer, EncoderNLS? encoder, char* charStart, char* charEnd)
  {
      if (fallbackBuffer == null)
      {
          if (encoder == null)
              fallbackBuffer = this.encoderFallback.CreateFallbackBuffer();
          else
              fallbackBuffer = encoder.FallbackBuffer;

          // Set our internal fallback interesting things.
          fallbackBuffer.InternalInitialize(charStart, charEnd, encoder, true);
      }

      return fallbackBuffer;
  }
  /// <summary>
  /// Counts the chars that decoding <paramref name="count"/> bytes starting at
  /// <paramref name="bytes"/> would produce. The count starts at one char per two input bytes
  /// and is then corrected for state carried in the decoder and for every invalid sequence
  /// (unpaired surrogate, dangling odd byte when flushing) that goes through the decoder fallback.
  /// </summary>
  /// <param name="bytes">Start of the input bytes; must not be null.</param>
  /// <param name="count">Number of input bytes; must be non-negative.</param>
  /// <param name="baseDecoder">
  /// Optional stateful decoder (cast to <c>UnicodeEncoding.Decoder</c>) supplying a pending
  /// odd byte (<c>lastByte</c>) and/or pending high surrogate (<c>lastChar</c>). This method
  /// reads that state but does not write it back.
  /// </param>
  /// <returns>The number of chars the input would decode to.</returns>
  internal sealed override unsafe int GetCharCount(byte* bytes, int count, DecoderNLS? baseDecoder)
  {
      Debug.Assert(bytes != null, "[UnicodeEncoding.GetCharCount]bytes!=null");
      Debug.Assert(count >= 0, "[UnicodeEncoding.GetCharCount]count >=0");

      UnicodeEncoding.Decoder? decoder = (UnicodeEncoding.Decoder?)baseDecoder;

      byte* byteEnd = bytes + count;
      byte* byteStart = bytes;

      // Pending state: first byte of a not-yet-complete char (-1 = none) and
      // a high surrogate waiting for its low surrogate (0 = none).
      int lastByte = -1;
      char lastChar = (char)0;

      // Start by assuming one char per pair of input bytes
      int charCount = count >> 1;

      // For fallback we may need a fallback buffer
      DecoderFallbackBuffer? fallbackBuffer = null;

      if (decoder != null)
      {
          lastByte = decoder.lastByte;
          lastChar = decoder.lastChar;

          // Assume extra char if last char was around
          if (lastChar > 0)
              charCount++;

          // Assume extra char if extra last byte makes up odd # of input bytes
          if (lastByte >= 0 && (count & 1) == 1)
          {
              charCount++;
          }

          // Shouldn't have anything in fallback buffer for GetCharCount
          // (don't have to check _throwOnOverflow for count)
          Debug.Assert(!decoder.InternalHasFallbackBuffer || decoder.FallbackBuffer.Remaining == 0,
              "[UnicodeEncoding.GetCharCount]Expected empty fallback buffer at start");
      }

      while (bytes < byteEnd)
      {
          // If we're aligned then maybe we can do it fast
          // That'll hurt if we're unaligned because we'll always test but never be aligned
#if FASTLOOP
          if ((bigEndian ^ BitConverter.IsLittleEndian) &&
#if BIT64
              (unchecked((long)bytes) & 7) == 0 &&
#else
              (unchecked((int)bytes) & 3) == 0 &&
#endif // BIT64
              lastByte == -1 && lastChar == 0)
          {
              // The fast loop reads 8 bytes per iteration, so it may only start at
              // positions that leave at least 8 bytes before byteEnd.
              ulong* longEnd = (ulong*)(byteEnd - 7);

              // Need new pointer so we can check 4 chars at a time
              ulong* longBytes = (ulong*)bytes;

              while (longBytes < longEnd)
              {
                  // See if we potentially have surrogates (0x8000 bit set)
                  // (We're either big endian on a big endian machine or little endian on
                  // a little endian machine so that'll work)
                  if ((0x8000800080008000 & *longBytes) != 0)
                  {
                      // Keep the top 5 bits of each char (& 0xf800) and XOR with 0xd800:
                      // a 16-bit lane that ends up 0 held a surrogate (0xd800 - 0xdfff).
                      // Note that we expect BMP characters to be more common than surrogates.
                      ulong uTemp = (0xf800f800f800f800 & *longBytes) ^ 0xd800d800d800d800;

                      // Check each of the 4 chars. 0 for those 16 bits means it was a surrogate
                      // but no clue if they're high or low.
                      // If each of the 4 characters are non-zero, then none are surrogates.
                      if ((uTemp & 0xFFFF000000000000) == 0 ||
                          (uTemp & 0x0000FFFF00000000) == 0 ||
                          (uTemp & 0x00000000FFFF0000) == 0 ||
                          (uTemp & 0x000000000000FFFF) == 0)
                      {
                          // At least 1 surrogate. The block is still countable as-is if the 4 chars
                          // are exactly high/low/high/low, which the top 6 bits of each lane tell us.
                          if ((0xfc00fc00fc00fc00 & *longBytes) !=
                              (BitConverter.IsLittleEndian ? (ulong)0xdc00d800dc00d800 : (ulong)0xd800dc00d800dc00))
                          {
                              // Either there weren't 4 surrogates, or the 0x0400 bit was set when a high
                              // was hoped for or the 0x0400 bit wasn't set where a low was hoped for.
                              // Drop out to the slow loop to resolve the surrogates
                              break;
                          }
                          // else they are all surrogates in High/Low/High/Low order, so we can use them.
                      }
                      // else none are surrogates, so we can use them.
                  }
                  // else all < 0x8000 so we can use them

                  // These 4 chars are valid and were already counted.
                  longBytes++;
              }

              bytes = (byte*)longBytes;

              if (bytes >= byteEnd)
                  break;
          }
#endif // FASTLOOP

          // Get 1st byte
          if (lastByte < 0)
          {
              lastByte = *bytes++;
              if (bytes >= byteEnd) break;
          }

          // Get full char
          char ch;
          if (bigEndian)
          {
              ch = (char)(lastByte << 8 | *(bytes++));
          }
          else
          {
              ch = (char)(*(bytes++) << 8 | lastByte);
          }
          lastByte = -1;

          // See if the char's valid
          if (ch >= 0xd800 && ch <= 0xdfff)
          {
              // Was it a high surrogate?
              if (ch <= 0xdbff)
              {
                  // Its a high surrogate, if we had one then do fallback for previous one
                  if (lastChar > 0)
                  {
                      // Ignore previous bad high surrogate
                      charCount--;

                      // Get fallback for previous high surrogate
                      // Note we have to reconstruct bytes because some may have been in decoder
                      byte[] byteBuffer = GetBytesOfCharForCountFallback(lastChar);
                      fallbackBuffer = EnsureDecoderFallbackBufferForCount(fallbackBuffer, decoder, byteStart);
                      charCount += fallbackBuffer.InternalFallback(byteBuffer, bytes);
                  }

                  // Ignore the last one which fell back already,
                  // and remember the new high surrogate
                  lastChar = ch;
                  continue;
              }

              // Its a low surrogate
              if (lastChar == 0)
              {
                  // Expected a previous high surrogate
                  charCount--;

                  // Get fallback for this low surrogate
                  // Note we have to reconstruct bytes because some may have been in decoder
                  byte[] byteBuffer = GetBytesOfCharForCountFallback(ch);
                  fallbackBuffer = EnsureDecoderFallbackBufferForCount(fallbackBuffer, decoder, byteStart);
                  charCount += fallbackBuffer.InternalFallback(byteBuffer, bytes);

                  // Ignore this one (we already did its fallback)
                  continue;
              }

              // Valid surrogate pair, already counted.
              lastChar = (char)0;
          }
          else if (lastChar > 0)
          {
              // Had a high surrogate, expected a low surrogate
              // Un-count the last high surrogate
              charCount--;

              // fall back the high surrogate.
              byte[] byteBuffer = GetBytesOfCharForCountFallback(lastChar);
              fallbackBuffer = EnsureDecoderFallbackBufferForCount(fallbackBuffer, decoder, byteStart);

              // Already subtracted high surrogate
              charCount += fallbackBuffer.InternalFallback(byteBuffer, bytes);

              // Not left over now, clear previous high surrogate and continue to add current char
              lastChar = (char)0;
          }

          // Valid char, already counted
      }

      // Without a decoder to carry state (or when it must flush), pending state has to fall back now
      if (decoder == null || decoder.MustFlush)
      {
          if (lastChar > 0)
          {
              // No hanging high surrogates allowed, do fallback and remove count for it
              charCount--;

              byte[] byteBuffer = GetBytesOfCharForCountFallback(lastChar);
              fallbackBuffer = EnsureDecoderFallbackBufferForCount(fallbackBuffer, decoder, byteStart);
              charCount += fallbackBuffer.InternalFallback(byteBuffer, bytes);

              lastChar = (char)0;
          }

          if (lastByte >= 0)
          {
              fallbackBuffer = EnsureDecoderFallbackBufferForCount(fallbackBuffer, decoder, byteStart);

              // No hanging odd bytes allowed if must flush
              charCount += fallbackBuffer.InternalFallback(new byte[] { unchecked((byte)lastByte) }, bytes);
              lastByte = -1;
          }
      }

      // If we had a high surrogate left over, we can't count it
      if (lastChar > 0)
          charCount--;

      // Shouldn't have anything in fallback buffer for GetCharCount
      // (don't have to check _throwOnOverflow for count)
      Debug.Assert(fallbackBuffer == null || fallbackBuffer.Remaining == 0,
          "[UnicodeEncoding.GetCharCount]Expected empty fallback buffer at end");

      return charCount;
  }

  /// <summary>
  /// Returns <paramref name="fallbackBuffer"/> if it already exists; otherwise obtains one (from
  /// <paramref name="decoder"/> when present, else from this encoding's decoder fallback) and
  /// initializes it with <paramref name="byteStart"/> and a null char end, as counting has no output buffer.
  /// </summary>
  private unsafe DecoderFallbackBuffer EnsureDecoderFallbackBufferForCount(
      DecoderFallbackBuffer? fallbackBuffer, UnicodeEncoding.Decoder? decoder, byte* byteStart)
  {
      if (fallbackBuffer == null)
      {
          if (decoder == null)
              fallbackBuffer = this.decoderFallback.CreateFallbackBuffer();
          else
              fallbackBuffer = decoder.FallbackBuffer;

          // Set our internal fallback interesting things.
          fallbackBuffer.InternalInitialize(byteStart, null);
      }

      return fallbackBuffer;
  }

  /// <summary>
  /// Rebuilds the two bytes that encode <paramref name="value"/> in this encoding's byte order
  /// (high byte first when <c>bigEndian</c>, low byte first otherwise) for handing to the fallback.
  /// </summary>
  private byte[] GetBytesOfCharForCountFallback(char value)
  {
      if (bigEndian)
      {
          return new byte[]
              { unchecked((byte)(value >> 8)), unchecked((byte)value) };
      }

      return new byte[]
          { unchecked((byte)value), unchecked((byte)(value >> 8)) };
  }
  1114. internal sealed override unsafe int GetChars(
  1115. byte* bytes, int byteCount, char* chars, int charCount, DecoderNLS? baseDecoder)
  1116. {
  1117. Debug.Assert(chars != null, "[UnicodeEncoding.GetChars]chars!=null");
  1118. Debug.Assert(byteCount >= 0, "[UnicodeEncoding.GetChars]byteCount >=0");
  1119. Debug.Assert(charCount >= 0, "[UnicodeEncoding.GetChars]charCount >=0");
  1120. Debug.Assert(bytes != null, "[UnicodeEncoding.GetChars]bytes!=null");
  1121. UnicodeEncoding.Decoder? decoder = (UnicodeEncoding.Decoder?)baseDecoder;
  1122. // Need last vars
  1123. int lastByte = -1;
  1124. char lastChar = (char)0;
  1125. // Get our decoder (but don't clear it yet)
  1126. if (decoder != null)
  1127. {
  1128. lastByte = decoder.lastByte;
  1129. lastChar = decoder.lastChar;
  1130. // Shouldn't have anything in fallback buffer for GetChars
  1131. // (don't have to check _throwOnOverflow for chars)
  1132. Debug.Assert(!decoder.InternalHasFallbackBuffer || decoder.FallbackBuffer.Remaining == 0,
  1133. "[UnicodeEncoding.GetChars]Expected empty fallback buffer at start");
  1134. }
  1135. // For fallback we may need a fallback buffer
  1136. DecoderFallbackBuffer? fallbackBuffer = null;
  1137. char* charsForFallback;
  1138. byte* byteEnd = bytes + byteCount;
  1139. char* charEnd = chars + charCount;
  1140. byte* byteStart = bytes;
  1141. char* charStart = chars;
  1142. while (bytes < byteEnd)
  1143. {
  1144. // If we're aligned then maybe we can do it fast
  1145. // That'll hurt if we're unaligned because we'll always test but never be aligned
  1146. #if FASTLOOP
  1147. if ((bigEndian ^ BitConverter.IsLittleEndian) &&
  1148. #if BIT64
  1149. (unchecked((long)chars) & 7) == 0 &&
  1150. #else
  1151. (unchecked((int)chars) & 3) == 0 &&
  1152. #endif
  1153. lastByte == -1 && lastChar == 0)
  1154. {
  1155. // Need -1 to check 2 at a time. If we have an even #, longChars will go
  1156. // from longEnd - 1/2 long to longEnd + 1/2 long. If we're odd, longChars
  1157. // will go from longEnd - 1 long to longEnd. (Might not get to use this)
  1158. // We can only go iCount units (limited by shorter of char or byte buffers.
  1159. ulong* longEnd = (ulong*)(bytes - 7 +
  1160. (((byteEnd - bytes) >> 1 < charEnd - chars) ?
  1161. (byteEnd - bytes) : (charEnd - chars) << 1));
  1162. // Need new char* so we can check 4 at a time
  1163. ulong* longBytes = (ulong*)bytes;
  1164. ulong* longChars = (ulong*)chars;
  1165. while (longBytes < longEnd)
  1166. {
  1167. // See if we potentially have surrogates (0x8000 bit set)
  1168. // (We're either big endian on a big endian machine or little endian on
  1169. // a little endian machine so that'll work)
  1170. if ((0x8000800080008000 & *longBytes) != 0)
  1171. {
  1172. // See if any of these are high or low surrogates (0xd800 - 0xdfff). If the high
  1173. // 5 bits looks like 11011, then its a high or low surrogate.
  1174. // We do the & f800 to filter the 5 bits, then ^ d800 to ensure the 0 isn't set.
  1175. // Note that we expect BMP characters to be more common than surrogates
  1176. // & each char with 11111... then ^ with 11011. Zeroes then indicate surrogates
  1177. ulong uTemp = (0xf800f800f800f800 & *longBytes) ^ 0xd800d800d800d800;
  1178. // Check each of the 4 chars. 0 for those 16 bits means it was a surrogate
  1179. // but no clue if they're high or low.
  1180. // If each of the 4 characters are non-zero, then none are surrogates.
  1181. if ((uTemp & 0xFFFF000000000000) == 0 ||
  1182. (uTemp & 0x0000FFFF00000000) == 0 ||
  1183. (uTemp & 0x00000000FFFF0000) == 0 ||
  1184. (uTemp & 0x000000000000FFFF) == 0)
  1185. {
  1186. // It has at least 1 surrogate, but we don't know if they're high or low surrogates,
  1187. // or if there's 1 or 4 surrogates
  1188. // If they happen to be high/low/high/low, we may as well continue. Check the next
  1189. // bit to see if its set (low) or not (high) in the right pattern
  1190. if ((0xfc00fc00fc00fc00 & *longBytes) !=
  1191. (BitConverter.IsLittleEndian ? (ulong)0xdc00d800dc00d800 : (ulong)0xd800dc00d800dc00))
  1192. {
  1193. // Either there weren't 4 surrogates, or the 0x0400 bit was set when a high
  1194. // was hoped for or the 0x0400 bit wasn't set where a low was hoped for.
  1195. // Drop out to the slow loop to resolve the surrogates
  1196. break;
  1197. }
  1198. // else they are all surrogates in High/Low/High/Low order, so we can use them.
  1199. }
  1200. // else none are surrogates, so we can use them.
  1201. }
  1202. // else all < 0x8000 so we can use them
  1203. // We can use these 4 chars.
  1204. Unsafe.WriteUnaligned<ulong>(longChars, *longBytes);
  1205. longBytes++;
  1206. longChars++;
  1207. }
  1208. chars = (char*)longChars;
  1209. bytes = (byte*)longBytes;
  1210. if (bytes >= byteEnd)
  1211. break;
  1212. }
  1213. #endif // FASTLOOP
  1214. // Get 1st byte
  1215. if (lastByte < 0)
  1216. {
  1217. lastByte = *bytes++;
  1218. continue;
  1219. }
  1220. // Get full char
  1221. char ch;
  1222. if (bigEndian)
  1223. {
  1224. ch = (char)(lastByte << 8 | *(bytes++));
  1225. }
  1226. else
  1227. {
  1228. ch = (char)(*(bytes++) << 8 | lastByte);
  1229. }
  1230. lastByte = -1;
  1231. // See if the char's valid
  1232. if (ch >= 0xd800 && ch <= 0xdfff)
  1233. {
  1234. // Was it a high surrogate?
  1235. if (ch <= 0xdbff)
  1236. {
  1237. // Its a high surrogate, if we had one then do fallback for previous one
  1238. if (lastChar > 0)
  1239. {
  1240. // Get fallback for previous high surrogate
  1241. // Note we have to reconstruct bytes because some may have been in decoder
  1242. byte[]? byteBuffer = null;
  1243. if (bigEndian)
  1244. {
  1245. byteBuffer = new byte[]
  1246. { unchecked((byte)(lastChar >> 8)), unchecked((byte)lastChar) };
  1247. }
  1248. else
  1249. {
  1250. byteBuffer = new byte[]
  1251. { unchecked((byte)lastChar), unchecked((byte)(lastChar >> 8)) };
  1252. }
  1253. if (fallbackBuffer == null)
  1254. {
  1255. if (decoder == null)
  1256. fallbackBuffer = this.decoderFallback.CreateFallbackBuffer();
  1257. else
  1258. fallbackBuffer = decoder.FallbackBuffer;
  1259. // Set our internal fallback interesting things.
  1260. fallbackBuffer.InternalInitialize(byteStart, charEnd);
  1261. }
  1262. charsForFallback = chars; // Avoid passing chars by reference to allow it to be en-registered
  1263. bool fallbackResult = fallbackBuffer.InternalFallback(byteBuffer, bytes, ref charsForFallback);
  1264. chars = charsForFallback;
  1265. if (!fallbackResult)
  1266. {
  1267. // couldn't fall back lonely surrogate
  1268. // We either advanced bytes or chars should == charStart and throw below
  1269. Debug.Assert(bytes >= byteStart + 2 || chars == charStart,
  1270. "[UnicodeEncoding.GetChars]Expected bytes to have advanced or no output (bad surrogate)");
  1271. bytes -= 2; // didn't use these 2 bytes
  1272. fallbackBuffer.InternalReset();
  1273. ThrowCharsOverflow(decoder, chars == charStart); // Might throw, if no chars output
  1274. break; // couldn't fallback but didn't throw
  1275. }
  1276. }
  1277. // Ignore the previous high surrogate which fell back already,
  1278. // yet remember the current high surrogate for next time.
  1279. lastChar = ch;
  1280. continue;
  1281. }
  1282. // Its a low surrogate
  1283. if (lastChar == 0)
  1284. {
  1285. // Expected a previous high surrogate
  1286. // Get fallback for this low surrogate
  1287. // Note we have to reconstruct bytes because some may have been in decoder
  1288. byte[]? byteBuffer = null;
  1289. if (bigEndian)
  1290. {
  1291. byteBuffer = new byte[]
  1292. { unchecked((byte)(ch >> 8)), unchecked((byte)ch) };
  1293. }
  1294. else
  1295. {
  1296. byteBuffer = new byte[]
  1297. { unchecked((byte)ch), unchecked((byte)(ch >> 8)) };
  1298. }
  1299. if (fallbackBuffer == null)
  1300. {
  1301. if (decoder == null)
  1302. fallbackBuffer = this.decoderFallback.CreateFallbackBuffer();
  1303. else
  1304. fallbackBuffer = decoder.FallbackBuffer;
  1305. // Set our internal fallback interesting things.
  1306. fallbackBuffer.InternalInitialize(byteStart, charEnd);
  1307. }
  1308. charsForFallback = chars; // Avoid passing chars by reference to allow it to be en-registered
  1309. bool fallbackResult = fallbackBuffer.InternalFallback(byteBuffer, bytes, ref charsForFallback);
  1310. chars = charsForFallback;
  1311. if (!fallbackResult)
  1312. {
  1313. // couldn't fall back lonely surrogate
  1314. // We either advanced bytes or chars should == charStart and throw below
  1315. Debug.Assert(bytes >= byteStart + 2 || chars == charStart,
  1316. "[UnicodeEncoding.GetChars]Expected bytes to have advanced or no output (lonely surrogate)");
  1317. bytes -= 2; // didn't use these 2 bytes
  1318. fallbackBuffer.InternalReset();
  1319. ThrowCharsOverflow(decoder, chars == charStart); // Might throw, if no chars output
  1320. break; // couldn't fallback but didn't throw
  1321. }
  1322. // Didn't throw, ignore this one (we already did its fallback)
  1323. continue;
  1324. }
  1325. // Valid surrogate pair, add our lastChar (will need 2 chars)
  1326. if (chars >= charEnd - 1)
  1327. {
  1328. // couldn't find room for this surrogate pair
  1329. // We either advanced bytes or chars should == charStart and throw below
  1330. Debug.Assert(bytes >= byteStart + 2 || chars == charStart,
  1331. "[UnicodeEncoding.GetChars]Expected bytes to have advanced or no output (surrogate pair)");
  1332. bytes -= 2; // didn't use these 2 bytes
  1333. ThrowCharsOverflow(decoder, chars == charStart); // Might throw, if no chars output
  1334. // Leave lastChar for next call to Convert()
  1335. break; // couldn't fallback but didn't throw
  1336. }
  1337. *chars++ = lastChar;
  1338. lastChar = (char)0;
  1339. }
  1340. else if (lastChar > 0)
  1341. {
  1342. // Had a high surrogate, expected a low surrogate, fall back the high surrogate.
  1343. byte[]? byteBuffer = null;
  1344. if (bigEndian)
  1345. {
  1346. byteBuffer = new byte[]
  1347. { unchecked((byte)(lastChar >> 8)), unchecked((byte)lastChar) };
  1348. }
  1349. else
  1350. {
  1351. byteBuffer = new byte[]
  1352. { unchecked((byte)lastChar), unchecked((byte)(lastChar >> 8)) };
  1353. }
  1354. if (fallbackBuffer == null)
  1355. {
  1356. if (decoder == null)
  1357. fallbackBuffer = this.decoderFallback.CreateFallbackBuffer();
  1358. else
  1359. fallbackBuffer = decoder.FallbackBuffer;
  1360. // Set our internal fallback interesting things.
  1361. fallbackBuffer.InternalInitialize(byteStart, charEnd);
  1362. }
  1363. charsForFallback = chars; // Avoid passing chars by reference to allow it to be en-registered
  1364. bool fallbackResult = fallbackBuffer.InternalFallback(byteBuffer, bytes, ref charsForFallback);
  1365. chars = charsForFallback;
  1366. if (!fallbackResult)
  1367. {
  1368. // couldn't fall back high surrogate, or char that would be next
  1369. // We either advanced bytes or chars should == charStart and throw below
  1370. Debug.Assert(bytes >= byteStart + 2 || chars == charStart,
  1371. "[UnicodeEncoding.GetChars]Expected bytes to have advanced or no output (no low surrogate)");
  1372. bytes -= 2; // didn't use these 2 bytes
  1373. fallbackBuffer.InternalReset();
  1374. ThrowCharsOverflow(decoder, chars == charStart); // Might throw, if no chars output
  1375. break; // couldn't fallback but didn't throw
  1376. }
  1377. // Not left over now, clear previous high surrogate and continue to add current char
  1378. lastChar = (char)0;
  1379. }
  1380. // Valid char, room for it?
  1381. if (chars >= charEnd)
  1382. {
  1383. // 2 bytes couldn't fall back
  1384. // We either advanced bytes or chars should == charStart and throw below
  1385. Debug.Assert(bytes >= byteStart + 2 || chars == charStart,
  1386. "[UnicodeEncoding.GetChars]Expected bytes to have advanced or no output (normal)");
  1387. bytes -= 2; // didn't use these bytes
  1388. ThrowCharsOverflow(decoder, chars == charStart); // Might throw, if no chars output
  1389. break; // couldn't fallback but didn't throw
  1390. }
  1391. // add it
  1392. *chars++ = ch;
  1393. }
  1394. // Remember our decoder if we must
  1395. if (decoder == null || decoder.MustFlush)
  1396. {
  1397. if (lastChar > 0)
  1398. {
  1399. // No hanging high surrogates allowed, do fallback and remove count for it
  1400. byte[]? byteBuffer = null;
  1401. if (bigEndian)
  1402. {
  1403. byteBuffer = new byte[]
  1404. { unchecked((byte)(lastChar >> 8)), unchecked((byte)lastChar) };
  1405. }
  1406. else
  1407. {
  1408. byteBuffer = new byte[]
  1409. { unchecked((byte)lastChar), unchecked((byte)(lastChar >> 8)) };
  1410. }
  1411. if (fallbackBuffer == null)
  1412. {
  1413. if (decoder == null)
  1414. fallbackBuffer = this.decoderFallback.CreateFallbackBuffer();
  1415. else
  1416. fallbackBuffer = decoder.FallbackBuffer;
  1417. // Set our internal fallback interesting things.
  1418. fallbackBuffer.InternalInitialize(byteStart, charEnd);
  1419. }
  1420. charsForFallback = chars; // Avoid passing chars by reference to allow it to be en-registered
  1421. bool fallbackResult = fallbackBuffer.InternalFallback(byteBuffer, bytes, ref charsForFallback);
  1422. chars = charsForFallback;
  1423. if (!fallbackResult)
  1424. {
  1425. // 2 bytes couldn't fall back
  1426. // We either advanced bytes or chars should == charStart and throw below
  1427. Debug.Assert(bytes >= byteStart + 2 || chars == charStart,
  1428. "[UnicodeEncoding.GetChars]Expected bytes to have advanced or no output (decoder)");
  1429. bytes -= 2; // didn't use these bytes
  1430. if (lastByte >= 0)
  1431. bytes--; // had an extra last byte hanging around
  1432. fallbackBuffer.InternalReset();
  1433. ThrowCharsOverflow(decoder, chars == charStart); // Might throw, if no chars output
  1434. // We'll remember these in our decoder though
  1435. bytes += 2;
  1436. if (lastByte >= 0)
  1437. bytes++;
  1438. goto End;
  1439. }
  1440. // done with this one
  1441. lastChar = (char)0;
  1442. }
  1443. if (lastByte >= 0)
  1444. {
  1445. if (fallbackBuffer == null)
  1446. {
  1447. if (decoder == null)
  1448. fallbackBuffer = this.decoderFallback.CreateFallbackBuffer();
  1449. else
  1450. fallbackBuffer = decoder.FallbackBuffer;
  1451. // Set our internal fallback interesting things.
  1452. fallbackBuffer.InternalInitialize(byteStart, charEnd);
  1453. }
  1454. // No hanging odd bytes allowed if must flush
  1455. charsForFallback = chars; // Avoid passing chars by reference to allow it to be en-registered
  1456. bool fallbackResult = fallbackBuffer.InternalFallback(new byte[] { unchecked((byte)lastByte) }, bytes, ref charsForFallback);
  1457. chars = charsForFallback;
  1458. if (!fallbackResult)
  1459. {
  1460. // odd byte couldn't fall back
  1461. bytes--; // didn't use this byte
  1462. fallbackBuffer.InternalReset();
  1463. ThrowCharsOverflow(decoder, chars == charStart); // Might throw, if no chars output
  1464. // didn't throw, but we'll remember it in the decoder
  1465. bytes++;
  1466. goto End;
  1467. }
  1468. // Didn't fail, clear buffer
  1469. lastByte = -1;
  1470. }
  1471. }
  1472. End:
  1473. // Remember our decoder if we must
  1474. if (decoder != null)
  1475. {
  1476. Debug.Assert(!decoder.MustFlush || ((lastChar == (char)0) && (lastByte == -1)),
  1477. "[UnicodeEncoding.GetChars] Expected no left over chars or bytes if flushing"
  1478. // + " " + ((int)lastChar).ToString("X4") + " " + lastByte.ToString("X2")
  1479. );
  1480. decoder._bytesUsed = (int)(bytes - byteStart);
  1481. decoder.lastChar = lastChar;
  1482. decoder.lastByte = lastByte;
  1483. }
  1484. // Shouldn't have anything in fallback buffer for GetChars
  1485. // (don't have to check _throwOnOverflow for count or chars)
  1486. Debug.Assert(fallbackBuffer == null || fallbackBuffer.Remaining == 0,
  1487. "[UnicodeEncoding.GetChars]Expected empty fallback buffer at end");
  1488. return (int)(chars - charStart);
  1489. }
  /// <summary>
  /// Creates a new encoder for this encoding.
  /// </summary>
  /// <returns>A freshly constructed <c>EncoderNLS</c> that wraps this encoding instance.</returns>
  public override System.Text.Encoder GetEncoder() => new EncoderNLS(this);
  /// <summary>
  /// Creates a new decoder for this encoding.
  /// </summary>
  /// <returns>
  /// A freshly constructed nested <c>UnicodeEncoding.Decoder</c>, which tracks a pending
  /// odd byte and a pending high surrogate between calls.
  /// </returns>
  public override System.Text.Decoder GetDecoder() => new UnicodeEncoding.Decoder(this);
  /// <summary>
  /// Returns the byte order mark for this encoding, if one was requested.
  /// </summary>
  /// <returns>
  /// <c>FE FF</c> for big-endian, <c>FF FE</c> for little-endian, or an empty array
  /// when this instance does not emit a byte order mark.
  /// </returns>
  public override byte[] GetPreamble()
  {
      if (!byteOrderMark)
      {
          return Array.Empty<byte>();
      }

      // A new array is handed out on every call so that a caller mutating the
      // result can never corrupt a shared, cached preamble.
      return bigEndian
          ? new byte[2] { 0xfe, 0xff }
          : new byte[2] { 0xff, 0xfe };
  }
  /// <summary>
  /// Gets the byte order mark for this encoding as a read-only span.
  /// </summary>
  /// <value>
  /// The bytes produced by <see cref="GetPreamble"/> when the runtime type is a derived class;
  /// otherwise an empty span when no byte order mark is emitted, <c>FE FF</c> for big-endian,
  /// or <c>FF FE</c> for little-endian.
  /// </value>
  public override ReadOnlySpan<byte> Preamble
  {
      get
      {
          // A derived UnicodeEncoding may have overridden GetPreamble, so defer to it.
          if (GetType() != typeof(UnicodeEncoding))
          {
              return new ReadOnlySpan<byte>(GetPreamble());
          }

          if (!byteOrderMark)
          {
              return default;
          }

          // Converting a constant byte[] directly to ReadOnlySpan<byte> lets the C# compiler
          // use its static-data optimization for these literals.
          if (bigEndian)
          {
              return (ReadOnlySpan<byte>)new byte[2] { 0xfe, 0xff };
          }

          return (ReadOnlySpan<byte>)new byte[2] { 0xff, 0xfe };
      }
  }
  /// <summary>
  /// Calculates the maximum number of bytes produced by encoding <paramref name="charCount"/> characters.
  /// </summary>
  /// <param name="charCount">The number of characters to encode; must be non-negative.</param>
  /// <returns>The worst-case byte count.</returns>
  /// <exception cref="ArgumentOutOfRangeException">
  /// <paramref name="charCount"/> is negative, or the computed byte count exceeds <see cref="int.MaxValue"/>.
  /// </exception>
  public override int GetMaxByteCount(int charCount)
  {
      if (charCount < 0)
      {
          throw new ArgumentOutOfRangeException(nameof(charCount), SR.ArgumentOutOfRange_NeedNonNegNum);
      }

      // One extra char accounts for a left-over high surrogate that may have to be
      // replaced by the fallback. Work in 64 bits so the intermediate cannot wrap.
      long worstCaseChars = (long)charCount + 1;

      // Each char may expand to the fallback's maximum replacement length.
      if (EncoderFallback.MaxCharCount > 1)
      {
          worstCaseChars *= EncoderFallback.MaxCharCount;
      }

      // UTF-16 code units are two bytes each.
      long worstCaseBytes = worstCaseChars << 1;

      if (worstCaseBytes > int.MaxValue)
      {
          throw new ArgumentOutOfRangeException(nameof(charCount), SR.ArgumentOutOfRange_GetByteCountOverflow);
      }

      return (int)worstCaseBytes;
  }
  /// <summary>
  /// Calculates the maximum number of characters produced by decoding <paramref name="byteCount"/> bytes.
  /// </summary>
  /// <param name="byteCount">The number of bytes to decode; must be non-negative.</param>
  /// <returns>The worst-case character count.</returns>
  /// <exception cref="ArgumentOutOfRangeException">
  /// <paramref name="byteCount"/> is negative, or the computed character count exceeds <see cref="int.MaxValue"/>.
  /// </exception>
  public override int GetMaxCharCount(int byteCount)
  {
      if (byteCount < 0)
      {
          throw new ArgumentOutOfRangeException(nameof(byteCount), SR.ArgumentOutOfRange_NeedNonNegNum);
      }

      // Two bytes make one char. The low bit rounds up for an odd trailing byte
      // (done with & 1 so that int.MaxValue cannot overflow before widening).
      long completePairs = byteCount >> 1;
      int oddByte = byteCount & 1;

      // The final +1 covers a high surrogate that may still be pending in the decoder.
      long worstCaseChars = completePairs + oddByte + 1;

      // Lonely surrogates and similar bad input may each expand to the fallback's maximum length.
      if (DecoderFallback.MaxCharCount > 1)
      {
          worstCaseChars *= DecoderFallback.MaxCharCount;
      }

      if (worstCaseChars > int.MaxValue)
      {
          throw new ArgumentOutOfRangeException(nameof(byteCount), SR.ArgumentOutOfRange_GetCharCountOverflow);
      }

      return (int)worstCaseChars;
  }
  /// <summary>
  /// Determines whether <paramref name="value"/> is a <c>UnicodeEncoding</c> configured identically to this one.
  /// </summary>
  /// <param name="value">The object to compare with this instance.</param>
  /// <returns>
  /// <c>true</c> when <paramref name="value"/> is a <c>UnicodeEncoding</c> with the same code page,
  /// byte-order-mark setting, endianness, and equal encoder and decoder fallbacks; otherwise <c>false</c>.
  /// </returns>
  public override bool Equals(object? value)
  {
      if (!(value is UnicodeEncoding other))
      {
          return false;
      }

      // Big-endian Unicode uses code page 1201 and little-endian uses 1200,
      // so the code page is part of the comparison as well.
      return CodePage == other.CodePage
          && byteOrderMark == other.byteOrderMark
          && bigEndian == other.bigEndian
          && EncoderFallback.Equals(other.EncoderFallback)
          && DecoderFallback.Equals(other.DecoderFallback);
  }
  /// <summary>
  /// Returns a hash code built from the same state that <see cref="Equals(object)"/> compares.
  /// </summary>
  /// <returns>
  /// The sum of the code page, both fallback hash codes, 4 when a byte order mark is emitted,
  /// and 8 when the encoding is big-endian.
  /// </returns>
  public override int GetHashCode()
  {
      int hash = CodePage;
      hash += this.EncoderFallback.GetHashCode();
      hash += this.DecoderFallback.GetHashCode();

      // The two flags contribute distinct bits so that their combinations hash differently.
      if (byteOrderMark)
      {
          hash += 4;
      }

      if (bigEndian)
      {
          hash += 8;
      }

      return hash;
  }
  /// <summary>
  /// Stateful decoder for <c>UnicodeEncoding</c> that carries an unpaired byte and an
  /// unpaired high surrogate from one call to the next.
  /// </summary>
  private sealed class Decoder : System.Text.DecoderNLS
  {
      /// <summary>Odd byte left over from the previous call, or -1 when none is pending.</summary>
      internal int lastByte = -1;

      /// <summary>High surrogate left over from the previous call, or '\0' when none is pending.</summary>
      internal char lastChar = '\0';

      /// <summary>Creates a decoder bound to <paramref name="encoding"/>.</summary>
      /// <param name="encoding">The encoding this decoder belongs to.</param>
      public Decoder(UnicodeEncoding encoding)
          : base(encoding)
      {
          // Nothing to do here: the base constructor calls Reset.
      }

      /// <summary>Discards any pending byte or surrogate and resets the fallback buffer, if one exists.</summary>
      public override void Reset()
      {
          lastChar = '\0';
          lastByte = -1;
          _fallbackBuffer?.Reset();
      }

      /// <summary>Gets whether a byte or a high surrogate is still pending in this decoder.</summary>
      internal override bool HasState => !(this.lastByte == -1 && this.lastChar == '\0');
  }
  1588. }
  1589. }