UnicodeEncoding.cs 88 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
717781779178017811782178317841785178617871788178917901791179217931794179517961797179817991800180118021803180418051806180718081809181018111812181318141815181618171818181918201821182218231824182518261827182818291830183118321833183418351836183718381839184018411842184318441845184618471848184918501851185218531854185518561857185818591860186118621863186418651866186718681869187018711872187318741875187618771878187918801881188218831884188518861887188818891890189118921893189418951896
  1. // Licensed to the .NET Foundation under one or more agreements.
  2. // The .NET Foundation licenses this file to you under the MIT license.
  3. // See the LICENSE file in the project root for more information.
  4. //
  5. // Don't override IsAlwaysNormalized because it is just a Unicode Transformation and could be confused.
  6. //
  7. // This define can be used to turn off the fast loops. Useful for finding whether
  8. // the problem is fastloop-specific.
  9. #define FASTLOOP
  10. using System;
  11. using System.Globalization;
  12. using System.Diagnostics;
  13. using System.Runtime.InteropServices;
  14. using Internal.Runtime.CompilerServices;
  15. namespace System.Text
  16. {
  17. public class UnicodeEncoding : Encoding
  18. {
  19. // Used by Encoding.BigEndianUnicode/Unicode for lazy initialization
  20. // The initialization code will not be run until a static member of the class is referenced
  21. internal static readonly UnicodeEncoding s_bigEndianDefault = new UnicodeEncoding(bigEndian: true, byteOrderMark: true);
  22. internal static readonly UnicodeEncoding s_littleEndianDefault = new UnicodeEncoding(bigEndian: false, byteOrderMark: true);
  23. private readonly bool isThrowException = false;
  24. private readonly bool bigEndian = false;
  25. private readonly bool byteOrderMark = false;
  26. // Unicode version 2.0 character size in bytes
  27. public const int CharSize = 2;
  // Default constructor: equivalent to UnicodeEncoding(bigEndian: false, byteOrderMark: true).
  public UnicodeEncoding()
      : this(bigEndian: false, byteOrderMark: true)
  {
  }
  // Records the requested byte order and byte-order-mark setting.
  // The base constructor receives 1201 for big endian and 1200 for little endian
  // (presumably the code page identifiers of the two UTF-16 variants; confirm against Encoding).
  public UnicodeEncoding(bool bigEndian, bool byteOrderMark)
      : base(bigEndian ? 1201 : 1200)
  {
      this.byteOrderMark = byteOrderMark;
      this.bigEndian = bigEndian;
  }
  // Same as the two-argument constructor, and additionally remembers whether invalid
  // data should raise exceptions instead of being replaced.
  public UnicodeEncoding(bool bigEndian, bool byteOrderMark, bool throwOnInvalidBytes)
      : this(bigEndian, byteOrderMark)
  {
      isThrowException = throwOnInvalidBytes;

      // The fallbacks were already set up during base construction, before
      // isThrowException was assigned, so they are only correct for the
      // non-throwing mode. Redo them when exceptions were requested.
      if (throwOnInvalidBytes)
      {
          SetDefaultFallbacks();
      }
  }
  // Installs the default encoder and decoder fallbacks for this instance:
  // exception fallbacks when isThrowException is set, otherwise replacement
  // fallbacks whose replacement string is "\xFFFD".
  internal sealed override void SetDefaultFallbacks()
  {
      if (!isThrowException)
      {
          encoderFallback = new EncoderReplacementFallback("\xFFFD");
          decoderFallback = new DecoderReplacementFallback("\xFFFD");
          return;
      }

      encoderFallback = EncoderFallback.ExceptionFallback;
      decoderFallback = DecoderFallback.ExceptionFallback;
  }
  60. // The following methods are copied from EncodingNLS.cs.
  61. // Unfortunately EncodingNLS.cs is internal and we're public, so we have to re-implement them here.
  62. // These should be kept in sync for the following classes:
  63. // EncodingNLS, UTF7Encoding, UTF8Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding
  64. //
  65. // Returns the number of bytes required to encode a range of characters in
  66. // a character array.
  67. //
  68. // All of our public Encodings that don't use EncodingNLS must have this (including EncodingNLS)
  69. // So if you fix this, fix the others. Currently those include:
  70. // EncodingNLS, UTF7Encoding, UTF8Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding
  71. // parent method is safe
  // Array overload: validates the requested range of 'chars', then pins the array
  // and defers to the pointer-based counter with no encoder state.
  public override unsafe int GetByteCount(char[] chars, int index, int count)
  {
      if (chars == null)
      {
          throw new ArgumentNullException(nameof(chars), SR.ArgumentNull_Array);
      }

      if (index < 0)
      {
          throw new ArgumentOutOfRangeException(nameof(index), SR.ArgumentOutOfRange_NeedNonNegNum);
      }

      if (count < 0)
      {
          throw new ArgumentOutOfRangeException(nameof(count), SR.ArgumentOutOfRange_NeedNonNegNum);
      }

      // Written as a subtraction so that index + count can never overflow.
      if (count > chars.Length - index)
      {
          throw new ArgumentOutOfRangeException(nameof(chars), SR.ArgumentOutOfRange_IndexCountBuffer);
      }

      // Nothing to count; also avoids pinning an empty array.
      if (count == 0)
      {
          return 0;
      }

      fixed (char* pChars = chars)
      {
          return GetByteCount(pChars + index, count, null);
      }
  }
  88. // All of our public Encodings that don't use EncodingNLS must have this (including EncodingNLS)
  89. // So if you fix this, fix the others. Currently those include:
  90. // EncodingNLS, UTF7Encoding, UTF8Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding
  91. // parent method is safe
  // String overload: pins the string and counts all of its chars with no encoder state.
  public override unsafe int GetByteCount(string s)
  {
      if (s == null)
      {
          throw new ArgumentNullException(nameof(s));
      }

      fixed (char* pChars = s)
      {
          return GetByteCount(pChars, s.Length, null);
      }
  }
  100. // All of our public Encodings that don't use EncodingNLS must have this (including EncodingNLS)
  101. // So if you fix this, fix the others. Currently those include:
  102. // EncodingNLS, UTF7Encoding, UTF8Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding
  // Pointer overload: validates the arguments and counts with no encoder state.
  [CLSCompliant(false)]
  public override unsafe int GetByteCount(char* chars, int count)
  {
      if (chars == null)
      {
          throw new ArgumentNullException(nameof(chars), SR.ArgumentNull_Array);
      }

      if (count < 0)
      {
          throw new ArgumentOutOfRangeException(nameof(count), SR.ArgumentOutOfRange_NeedNonNegNum);
      }

      // Passing a null encoder means no leftover state from earlier calls is used.
      return GetByteCount(chars, count, null);
  }
  114. // Parent method is safe.
  115. // All of our public Encodings that don't use EncodingNLS must have this (including EncodingNLS)
  116. // So if you fix this, fix the others. Currently those include:
  117. // EncodingNLS, UTF7Encoding, UTF8Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding
  // String overload: encodes charCount chars of 's' starting at charIndex into
  // 'bytes' starting at byteIndex, using whatever room is left in 'bytes'.
  public override unsafe int GetBytes(string s, int charIndex, int charCount,
      byte[] bytes, int byteIndex)
  {
      if (s == null)
      {
          throw new ArgumentNullException(nameof(s), SR.ArgumentNull_Array);
      }

      if (bytes == null)
      {
          throw new ArgumentNullException(nameof(bytes), SR.ArgumentNull_Array);
      }

      if (charIndex < 0)
      {
          throw new ArgumentOutOfRangeException(nameof(charIndex), SR.ArgumentOutOfRange_NeedNonNegNum);
      }

      if (charCount < 0)
      {
          throw new ArgumentOutOfRangeException(nameof(charCount), SR.ArgumentOutOfRange_NeedNonNegNum);
      }

      // Written as a subtraction so that charIndex + charCount can never overflow.
      if (charCount > s.Length - charIndex)
      {
          throw new ArgumentOutOfRangeException(nameof(s), SR.ArgumentOutOfRange_IndexCount);
      }

      if (byteIndex < 0 || byteIndex > bytes.Length)
      {
          throw new ArgumentOutOfRangeException(nameof(byteIndex), SR.ArgumentOutOfRange_Index);
      }

      // Space available in the destination from byteIndex onwards.
      int byteCount = bytes.Length - byteIndex;

      fixed (char* pChars = s)
      {
          // NOTE(review): MemoryMarshal.GetReference is presumably used so that an
          // empty destination array can still be pinned - confirm.
          fixed (byte* pBytes = &MemoryMarshal.GetReference((Span<byte>)bytes))
          {
              return GetBytes(pChars + charIndex, charCount, pBytes + byteIndex, byteCount, null);
          }
      }
  }
  133. // Encodes a range of characters in a character array into a range of bytes
  134. // in a byte array. An exception occurs if the byte array is not large
  135. // enough to hold the complete encoding of the characters. The
  136. // GetByteCount method can be used to determine the exact number of
  137. // bytes that will be produced for a given range of characters.
  138. // Alternatively, the GetMaxByteCount method can be used to
  139. // determine the maximum number of bytes that will be produced for a given
  140. // number of characters, regardless of the actual character values.
  141. //
  142. // All of our public Encodings that don't use EncodingNLS must have this (including EncodingNLS)
  143. // So if you fix this, fix the others. Currently those include:
  144. // EncodingNLS, UTF7Encoding, UTF8Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding
  145. // parent method is safe
  // Array overload: encodes charCount chars of 'chars' starting at charIndex into
  // 'bytes' starting at byteIndex, using whatever room is left in 'bytes'.
  public override unsafe int GetBytes(char[] chars, int charIndex, int charCount,
      byte[] bytes, int byteIndex)
  {
      if (chars == null)
      {
          throw new ArgumentNullException(nameof(chars), SR.ArgumentNull_Array);
      }

      if (bytes == null)
      {
          throw new ArgumentNullException(nameof(bytes), SR.ArgumentNull_Array);
      }

      if (charIndex < 0)
      {
          throw new ArgumentOutOfRangeException(nameof(charIndex), SR.ArgumentOutOfRange_NeedNonNegNum);
      }

      if (charCount < 0)
      {
          throw new ArgumentOutOfRangeException(nameof(charCount), SR.ArgumentOutOfRange_NeedNonNegNum);
      }

      // Written as a subtraction so that charIndex + charCount can never overflow.
      if (charCount > chars.Length - charIndex)
      {
          throw new ArgumentOutOfRangeException(nameof(chars), SR.ArgumentOutOfRange_IndexCountBuffer);
      }

      if (byteIndex < 0 || byteIndex > bytes.Length)
      {
          throw new ArgumentOutOfRangeException(nameof(byteIndex), SR.ArgumentOutOfRange_Index);
      }

      // Nothing to encode; also avoids pinning an empty source array.
      if (charCount == 0)
      {
          return 0;
      }

      // Space available in the destination from byteIndex onwards (not the array size).
      int byteCount = bytes.Length - byteIndex;

      fixed (char* pChars = chars)
      {
          fixed (byte* pBytes = &MemoryMarshal.GetReference((Span<byte>)bytes))
          {
              return GetBytes(pChars + charIndex, charCount, pBytes + byteIndex, byteCount, null);
          }
      }
  }
  167. // All of our public Encodings that don't use EncodingNLS must have this (including EncodingNLS)
  168. // So if you fix this, fix the others. Currently those include:
  169. // EncodingNLS, UTF7Encoding, UTF8Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding
  // Pointer overload: validates the arguments and encodes with no encoder state.
  [CLSCompliant(false)]
  public override unsafe int GetBytes(char* chars, int charCount, byte* bytes, int byteCount)
  {
      // 'bytes' is checked before 'chars', so it is the one reported when both are null.
      if (bytes == null)
      {
          throw new ArgumentNullException(nameof(bytes), SR.ArgumentNull_Array);
      }

      if (chars == null)
      {
          throw new ArgumentNullException(nameof(chars), SR.ArgumentNull_Array);
      }

      if (charCount < 0)
      {
          throw new ArgumentOutOfRangeException(nameof(charCount), SR.ArgumentOutOfRange_NeedNonNegNum);
      }

      if (byteCount < 0)
      {
          throw new ArgumentOutOfRangeException(nameof(byteCount), SR.ArgumentOutOfRange_NeedNonNegNum);
      }

      return GetBytes(chars, charCount, bytes, byteCount, null);
  }
  180. // Returns the number of characters produced by decoding a range of bytes
  181. // in a byte array.
  182. //
  183. // All of our public Encodings that don't use EncodingNLS must have this (including EncodingNLS)
  184. // So if you fix this, fix the others. Currently those include:
  185. // EncodingNLS, UTF7Encoding, UTF8Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding
  186. // parent method is safe
  // Array overload: validates the requested range of 'bytes', then pins the array
  // and defers to the pointer-based counter with no decoder state.
  public override unsafe int GetCharCount(byte[] bytes, int index, int count)
  {
      if (bytes == null)
      {
          throw new ArgumentNullException(nameof(bytes), SR.ArgumentNull_Array);
      }

      if (index < 0)
      {
          throw new ArgumentOutOfRangeException(nameof(index), SR.ArgumentOutOfRange_NeedNonNegNum);
      }

      if (count < 0)
      {
          throw new ArgumentOutOfRangeException(nameof(count), SR.ArgumentOutOfRange_NeedNonNegNum);
      }

      // Written as a subtraction so that index + count can never overflow.
      if (count > bytes.Length - index)
      {
          throw new ArgumentOutOfRangeException(nameof(bytes), SR.ArgumentOutOfRange_IndexCountBuffer);
      }

      // Nothing to count; also avoids pinning an empty array.
      if (count == 0)
      {
          return 0;
      }

      fixed (byte* pBytes = bytes)
      {
          return GetCharCount(pBytes + index, count, null);
      }
  }
  203. // All of our public Encodings that don't use EncodingNLS must have this (including EncodingNLS)
  204. // So if you fix this, fix the others. Currently those include:
  205. // EncodingNLS, UTF7Encoding, UTF8Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding
  // Pointer overload: validates the arguments and counts with no decoder state.
  [CLSCompliant(false)]
  public override unsafe int GetCharCount(byte* bytes, int count)
  {
      if (bytes == null)
      {
          throw new ArgumentNullException(nameof(bytes), SR.ArgumentNull_Array);
      }

      if (count < 0)
      {
          throw new ArgumentOutOfRangeException(nameof(count), SR.ArgumentOutOfRange_NeedNonNegNum);
      }

      return GetCharCount(bytes, count, null);
  }
  216. // All of our public Encodings that don't use EncodingNLS must have this (including EncodingNLS)
  217. // So if you fix this, fix the others. Currently those include:
  218. // EncodingNLS, UTF7Encoding, UTF8Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding
  219. // parent method is safe
  // Array overload: decodes byteCount bytes of 'bytes' starting at byteIndex into
  // 'chars' starting at charIndex, using whatever room is left in 'chars'.
  public override unsafe int GetChars(byte[] bytes, int byteIndex, int byteCount,
      char[] chars, int charIndex)
  {
      if (bytes == null)
      {
          throw new ArgumentNullException(nameof(bytes), SR.ArgumentNull_Array);
      }

      if (chars == null)
      {
          throw new ArgumentNullException(nameof(chars), SR.ArgumentNull_Array);
      }

      if (byteIndex < 0)
      {
          throw new ArgumentOutOfRangeException(nameof(byteIndex), SR.ArgumentOutOfRange_NeedNonNegNum);
      }

      if (byteCount < 0)
      {
          throw new ArgumentOutOfRangeException(nameof(byteCount), SR.ArgumentOutOfRange_NeedNonNegNum);
      }

      // Written as a subtraction so that byteIndex + byteCount can never overflow.
      if (byteCount > bytes.Length - byteIndex)
      {
          throw new ArgumentOutOfRangeException(nameof(bytes), SR.ArgumentOutOfRange_IndexCountBuffer);
      }

      if (charIndex < 0 || charIndex > chars.Length)
      {
          throw new ArgumentOutOfRangeException(nameof(charIndex), SR.ArgumentOutOfRange_Index);
      }

      // Nothing to decode; also avoids pinning an empty source array.
      if (byteCount == 0)
      {
          return 0;
      }

      // Space available in the destination from charIndex onwards (not the array size).
      int charCount = chars.Length - charIndex;

      fixed (byte* pBytes = bytes)
      {
          fixed (char* pChars = &MemoryMarshal.GetReference((Span<char>)chars))
          {
              return GetChars(pBytes + byteIndex, byteCount, pChars + charIndex, charCount, null);
          }
      }
  }
  241. // All of our public Encodings that don't use EncodingNLS must have this (including EncodingNLS)
  242. // So if you fix this, fix the others. Currently those include:
  243. // EncodingNLS, UTF7Encoding, UTF8Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding
  // Pointer overload: validates the arguments and decodes with no decoder state.
  [CLSCompliant(false)]
  public override unsafe int GetChars(byte* bytes, int byteCount, char* chars, int charCount)
  {
      if (bytes == null)
      {
          throw new ArgumentNullException(nameof(bytes), SR.ArgumentNull_Array);
      }

      if (chars == null)
      {
          throw new ArgumentNullException(nameof(chars), SR.ArgumentNull_Array);
      }

      // charCount is checked before byteCount, so it is the one reported when both are negative.
      if (charCount < 0)
      {
          throw new ArgumentOutOfRangeException(nameof(charCount), SR.ArgumentOutOfRange_NeedNonNegNum);
      }

      if (byteCount < 0)
      {
          throw new ArgumentOutOfRangeException(nameof(byteCount), SR.ArgumentOutOfRange_NeedNonNegNum);
      }

      return GetChars(bytes, byteCount, chars, charCount, null);
  }
  254. // Returns a string containing the decoded representation of a range of
  255. // bytes in a byte array.
  256. //
  257. // All of our public Encodings that don't use EncodingNLS must have this (including EncodingNLS)
  258. // So if you fix this, fix the others. Currently those include:
  259. // EncodingNLS, UTF7Encoding, UTF8Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding
  260. // parent method is safe
  // Decodes 'count' bytes of 'bytes' starting at 'index' into a new string.
  // An empty range yields string.Empty without touching the array contents.
  public override unsafe string GetString(byte[] bytes, int index, int count)
  {
      if (bytes == null)
      {
          throw new ArgumentNullException(nameof(bytes), SR.ArgumentNull_Array);
      }

      if (index < 0)
      {
          throw new ArgumentOutOfRangeException(nameof(index), SR.ArgumentOutOfRange_NeedNonNegNum);
      }

      if (count < 0)
      {
          throw new ArgumentOutOfRangeException(nameof(count), SR.ArgumentOutOfRange_NeedNonNegNum);
      }

      // Written as a subtraction so that index + count can never overflow.
      if (count > bytes.Length - index)
      {
          throw new ArgumentOutOfRangeException(nameof(bytes), SR.ArgumentOutOfRange_IndexCountBuffer);
      }

      // Avoid pinning an empty input buffer.
      if (count == 0)
      {
          return string.Empty;
      }

      fixed (byte* pBytes = bytes)
      {
          return string.CreateStringFromEncoding(pBytes + index, count, this);
      }
  }
  276. //
  277. // End of standard methods copied from EncodingNLS.cs
  278. //
  // Counts the bytes that encoding 'count' chars starting at 'chars' would produce.
  //
  // The count starts at 2 bytes per input char and is then adjusted:
  //   * a high surrogate left over in 'encoder' (when encoder is non-null) adds 2 bytes;
  //   * an unpaired or misordered surrogate has its 2 bytes removed and is handed to the
  //     encoder fallback buffer; each char the fallback buffer then returns adds 2 bytes;
  //   * a high surrogate still pending at the end has its 2 bytes removed, and is sent to
  //     the fallback when there is no encoder or the encoder must flush.
  //
  // Throws ArgumentOutOfRangeException when count * 2 overflows an int, and
  // ArgumentException when the encoder's fallback buffer still holds data on entry or
  // when the end-of-input fallback leaves another high surrogate pending.
  279. internal sealed override unsafe int GetByteCount(char* chars, int count, EncoderNLS encoder)
  280. {
  281. Debug.Assert(chars != null, "[UnicodeEncoding.GetByteCount]chars!=null");
  282. Debug.Assert(count >= 0, "[UnicodeEncoding.GetByteCount]count >=0");
  283. // Start by assuming each char gets 2 bytes
  284. int byteCount = count << 1;
  285. // Check for overflow in byteCount
  286. // (If they were all invalid chars, this would actually be wrong,
  287. // but that's a ridiculously large # so we're not concerned about that case)
  288. if (byteCount < 0)
  289. throw new ArgumentOutOfRangeException(nameof(count), SR.ArgumentOutOfRange_GetByteCountOverflow);
  290. char* charStart = chars;
  291. char* charEnd = chars + count;
  292. char charLeftOver = (char)0;
  // Set once the end-of-input fallback has run, so that a second pending high surrogate
  // is reported as a recursive fallback instead of looping forever.
  293. bool wasHereBefore = false;
  294. // For fallback we may need a fallback buffer
  295. EncoderFallbackBuffer fallbackBuffer = null;
  296. char* charsForFallback;
  297. if (encoder != null)
  298. {
  299. charLeftOver = encoder._charLeftOver;
  300. // Assume extra bytes to encode charLeftOver if it existed
  301. if (charLeftOver > 0)
  302. byteCount += 2;
  303. // We mustn't have left over fallback data when counting
  304. if (encoder.InternalHasFallbackBuffer)
  305. {
  306. fallbackBuffer = encoder.FallbackBuffer;
  307. if (fallbackBuffer.Remaining > 0)
  308. throw new ArgumentException(SR.Format(SR.Argument_EncoderFallbackNotEmpty, this.EncodingName, encoder.Fallback.GetType()));
  309. // Set our internal fallback interesting things.
  310. fallbackBuffer.InternalInitialize(charStart, charEnd, encoder, false);
  311. }
  312. }
  313. char ch;
  // Re-entered via goto after the end-of-input fallback so its output gets counted too.
  314. TryAgain:
  // Chars pending in the fallback buffer are consumed first (ch != 0); once it returns 0
  // the loop continues with the remaining input chars.
  315. while (((ch = (fallbackBuffer == null) ? (char)0 : fallbackBuffer.InternalGetNextChar()) != 0) || chars < charEnd)
  316. {
  317. // First unwind any fallback
  318. if (ch == 0)
  319. {
  320. // No fallback, maybe we can do it fast
  321. #if FASTLOOP
  322. // The fast loop is only used when the encoding's byte order matches the machine's
  // (bigEndian differs from BitConverter.IsLittleEndian), the pointer is aligned
  // (8 bytes under BIT64, otherwise 4 bytes) and no high surrogate is pending.
  323. if ( (bigEndian ^ BitConverter.IsLittleEndian) &&
  324. #if BIT64
  325. (unchecked((long)chars) & 7) == 0 &&
  326. #else
  327. (unchecked((int)chars) & 3) == 0 &&
  328. #endif
  329. charLeftOver == 0)
  330. {
  331. // Stop 3 chars short of charEnd: the loop below reads one ulong (4 chars) per
  332. // step, so longChars < longEnd guarantees that all 4 chars read are inside
  333. // the input buffer. Any remaining chars are handled by the slow loop.
  334. ulong* longEnd = (ulong*)(charEnd - 3);
  335. // Need new char* so we can check 4 at a time
  336. ulong* longChars = (ulong*)chars;
  337. while (longChars < longEnd)
  338. {
  339. // See if we potentially have surrogates (0x8000 bit set)
  340. // (We're either big endian on a big endian machine or little endian on
  341. // a little endian machine so that'll work)
  342. if ((0x8000800080008000 & *longChars) != 0)
  343. {
  344. // See if any of these are high or low surrogates (0xd800 - 0xdfff). If the high
  345. // 5 bits looks like 11011, then it's a high or low surrogate.
  346. // We do the & f800 to filter the 5 bits, then ^ d800 to ensure the 0 isn't set.
  347. // Note that we expect BMP characters to be more common than surrogates
  348. // & each char with 11111... then ^ with 11011. Zeroes then indicate surrogates
  349. ulong uTemp = (0xf800f800f800f800 & *longChars) ^ 0xd800d800d800d800;
  350. // Check each of the 4 chars. 0 for those 16 bits means it was a surrogate
  351. // but no clue if they're high or low.
  352. // If each of the 4 characters are non-zero, then none are surrogates.
  353. if ((uTemp & 0xFFFF000000000000) == 0 ||
  354. (uTemp & 0x0000FFFF00000000) == 0 ||
  355. (uTemp & 0x00000000FFFF0000) == 0 ||
  356. (uTemp & 0x000000000000FFFF) == 0)
  357. {
  358. // It has at least 1 surrogate, but we don't know if they're high or low surrogates,
  359. // or if there's 1 or 4 surrogates
  360. // If they happen to be high/low/high/low, we may as well continue. Check the next
  361. // bit to see if it's set (low) or not (high) in the right pattern
  362. if ((0xfc00fc00fc00fc00 & *longChars) !=
  363. (BitConverter.IsLittleEndian ? (ulong)0xdc00d800dc00d800 : (ulong)0xd800dc00d800dc00))
  364. {
  365. // Either there weren't 4 surrogates, or the 0x0400 bit was set when a high
  366. // was hoped for or the 0x0400 bit wasn't set where a low was hoped for.
  367. // Drop out to the slow loop to resolve the surrogates
  368. break;
  369. }
  370. // else they are all surrogates in High/Low/High/Low order, so we can use them.
  371. }
  372. // else none are surrogates, so we can use them.
  373. }
  374. // else all < 0x8000 so we can use them
  375. // We already counted these four chars, go to next long.
  376. longChars++;
  377. }
  378. chars = (char*)longChars;
  379. if (chars >= charEnd)
  380. break;
  381. }
  382. #endif // FASTLOOP
  383. // No fallback, just get next char
  384. ch = *chars;
  385. chars++;
  386. }
  387. else
  388. {
  389. // We weren't preallocating fallback space.
  390. byteCount += 2;
  391. }
  392. // Check for high or low surrogates
  393. if (ch >= 0xd800 && ch <= 0xdfff)
  394. {
  395. // Was it a high surrogate?
  396. if (ch <= 0xdbff)
  397. {
  398. // It's a high surrogate, if we already had a high surrogate do its fallback
  399. if (charLeftOver > 0)
  400. {
  401. // Unwind the current character, this should be safe because we
  402. // don't have leftover data in the fallback, so chars must have
  403. // advanced already.
  404. Debug.Assert(chars > charStart,
  405. "[UnicodeEncoding.GetByteCount]Expected chars to have advanced in unexpected high surrogate");
  406. chars--;
  407. // If previous high surrogate deallocate 2 bytes
  408. byteCount -= 2;
  409. // Fallback the previous surrogate
  410. // Need to initialize fallback buffer?
  411. if (fallbackBuffer == null)
  412. {
  413. if (encoder == null)
  414. fallbackBuffer = this.encoderFallback.CreateFallbackBuffer();
  415. else
  416. fallbackBuffer = encoder.FallbackBuffer;
  417. // Set our internal fallback interesting things.
  418. fallbackBuffer.InternalInitialize(charStart, charEnd, encoder, false);
  419. }
  420. charsForFallback = chars; // Avoid passing chars by reference to allow it to be enregistered
  421. fallbackBuffer.InternalFallback(charLeftOver, ref charsForFallback);
  422. chars = charsForFallback;
  423. // Now no high surrogate left over
  424. charLeftOver = (char)0;
  425. continue;
  426. }
  427. // Remember this high surrogate
  428. charLeftOver = ch;
  429. continue;
  430. }
  431. // It's a low surrogate
  432. if (charLeftOver == 0)
  433. {
  434. // Expected a previous high surrogate.
  435. // Don't count this one (we'll count its fallback if necessary)
  436. byteCount -= 2;
  437. // fallback this one
  438. // Need to initialize fallback buffer?
  439. if (fallbackBuffer == null)
  440. {
  441. if (encoder == null)
  442. fallbackBuffer = this.encoderFallback.CreateFallbackBuffer();
  443. else
  444. fallbackBuffer = encoder.FallbackBuffer;
  445. // Set our internal fallback interesting things.
  446. fallbackBuffer.InternalInitialize(charStart, charEnd, encoder, false);
  447. }
  448. charsForFallback = chars; // Avoid passing chars by reference to allow it to be enregistered
  449. fallbackBuffer.InternalFallback(ch, ref charsForFallback);
  450. chars = charsForFallback;
  451. continue;
  452. }
  453. // Valid surrogate pair, add our charLeftOver
  454. charLeftOver = (char)0;
  455. continue;
  456. }
  457. else if (charLeftOver > 0)
  458. {
  459. // Expected a low surrogate, but this char is normal
  460. // Rewind the current character, fallback previous character.
  461. // this should be safe because we don't have leftover data in the
  462. // fallback, so chars must have advanced already.
  463. Debug.Assert(chars > charStart,
  464. "[UnicodeEncoding.GetByteCount]Expected chars to have advanced when expected low surrogate");
  465. chars--;
  466. // fallback previous chars
  467. // Need to initialize fallback buffer?
  468. if (fallbackBuffer == null)
  469. {
  470. if (encoder == null)
  471. fallbackBuffer = this.encoderFallback.CreateFallbackBuffer();
  472. else
  473. fallbackBuffer = encoder.FallbackBuffer;
  474. // Set our internal fallback interesting things.
  475. fallbackBuffer.InternalInitialize(charStart, charEnd, encoder, false);
  476. }
  477. charsForFallback = chars; // Avoid passing chars by reference to allow it to be enregistered
  478. fallbackBuffer.InternalFallback(charLeftOver, ref charsForFallback);
  479. chars = charsForFallback;
  480. // Ignore charLeftOver or throw
  481. byteCount -= 2;
  482. charLeftOver = (char)0;
  483. continue;
  484. }
  485. // Ok we had something to add (already counted)
  486. }
  487. // Don't allocate space for left over char
  488. if (charLeftOver > 0)
  489. {
  490. byteCount -= 2;
  491. // If we have to flush, stick it in fallback and try again
  492. if (encoder == null || encoder.MustFlush)
  493. {
  494. if (wasHereBefore)
  495. {
  496. // Throw it, using our complete character
  497. throw new ArgumentException(
  498. SR.Format(SR.Argument_RecursiveFallback, charLeftOver), nameof(chars));
  499. }
  500. else
  501. {
  502. // Need to initialize fallback buffer?
  503. if (fallbackBuffer == null)
  504. {
  505. if (encoder == null)
  506. fallbackBuffer = this.encoderFallback.CreateFallbackBuffer();
  507. else
  508. fallbackBuffer = encoder.FallbackBuffer;
  509. // Set our internal fallback interesting things.
  510. fallbackBuffer.InternalInitialize(charStart, charEnd, encoder, false);
  511. }
  512. charsForFallback = chars; // Avoid passing chars by reference to allow it to be enregistered
  513. fallbackBuffer.InternalFallback(charLeftOver, ref charsForFallback);
  514. chars = charsForFallback;
  515. charLeftOver = (char)0;
  516. wasHereBefore = true;
  517. goto TryAgain;
  518. }
  519. }
  520. }
  521. // Shouldn't have anything in fallback buffer for GetByteCount
  522. // (don't have to check _throwOnOverflow for count)
  523. Debug.Assert(fallbackBuffer == null || fallbackBuffer.Remaining == 0,
  524. "[UnicodeEncoding.GetByteCount]Expected empty fallback buffer at end");
  525. // Don't remember fallbackBuffer.encoder for counting
  526. return byteCount;
  527. }
  // Encodes the UTF-16 code units in [chars, chars + charCount) into [bytes, bytes + byteCount),
  // two bytes per code unit, most significant byte first when bigEndian is set and least
  // significant byte first otherwise.
  //
  // A high surrogate is held back until the following code unit is known. Surrogates that do not
  // form a high/low pair are handed to the encoder fallback buffer, and whatever
  // InternalGetNextChar subsequently yields is encoded through the same loop.
  //
  // Parameters:
  //   chars, charCount - input code units and how many of them there are.
  //   bytes, byteCount - output buffer and its capacity in bytes.
  //   encoder          - optional; when non-null its _charLeftOver seeds the pending high
  //                      surrogate, and _charLeftOver / _charsUsed are stored back before returning.
  //
  // Returns the number of bytes written to the output buffer.
  //
  // Throws ArgumentException when the encoder's fallback buffer reports remaining data on entry
  // while encoder._throwOnOverflow is set, and when a high surrogate is pending at end of input
  // a second time after the end-of-input fallback has already been run once.
  // ThrowBytesOverflow is called (with a flag saying whether nothing has been written yet) when
  // the next character does not fit in the output buffer.
  internal sealed override unsafe int GetBytes(
      char* chars, int charCount, byte* bytes, int byteCount, EncoderNLS encoder)
  {
      Debug.Assert(chars != null, "[UnicodeEncoding.GetBytes]chars!=null");
      Debug.Assert(byteCount >= 0, "[UnicodeEncoding.GetBytes]byteCount >=0");
      Debug.Assert(charCount >= 0, "[UnicodeEncoding.GetBytes]charCount >=0");
      Debug.Assert(bytes != null, "[UnicodeEncoding.GetBytes]bytes!=null");

      // Fixed buffer bounds; from here on chars and bytes serve as the read and write cursors.
      char* charStart = chars;
      char* charEnd = chars + charCount;
      byte* byteStart = bytes;
      byte* byteEnd = bytes + byteCount;

      char pendingHigh = (char)0;       // high surrogate still waiting for its low surrogate
      char current;                     // code unit handled by the current iteration
      bool flushFallbackTried = false;  // set once the end-of-input fallback has been run

      // Stays null until the encoder supplies one or something actually has to fall back.
      EncoderFallbackBuffer fallbackBuffer = null;
      char* fallbackCursor;

      if (encoder != null)
      {
          // Start from the high surrogate stored in the encoder (zero when there is none).
          pendingHigh = encoder._charLeftOver;

          if (encoder.InternalHasFallbackBuffer)
          {
              fallbackBuffer = encoder.FallbackBuffer;
              if (fallbackBuffer.Remaining > 0 && encoder._throwOnOverflow)
              {
                  throw new ArgumentException(SR.Format(SR.Argument_EncoderFallbackNotEmpty, this.EncodingName, encoder.Fallback.GetType()));
              }

              // NOTE(review): this call passes false as the last argument, whereas every lazily
              // created buffer further down is initialized with true - confirm that is intended.
              fallbackBuffer.InternalInitialize(charStart, charEnd, encoder, false);
          }
      }

      // The outer loop runs the main loop again after the end-of-input fallback below.
      for (;;)
      {
          // A non-zero result from InternalGetNextChar takes priority over the next input code unit.
          while (((current = (fallbackBuffer == null) ? (char)0 : fallbackBuffer.InternalGetNextChar()) != 0) ||
                 chars < charEnd)
          {
              if (current == 0)
              {
                  // Nothing came from the fallback buffer, so the input supplies the code unit.
#if FASTLOOP
                  // Bulk copy: only when the encoding's byte order equals the machine's, the input
                  // pointer passes the alignment test and no high surrogate is pending.
                  if ((bigEndian ^ BitConverter.IsLittleEndian) &&
#if BIT64
                      (unchecked((long)chars) & 7) == 0 &&
#else
                      (unchecked((int)chars) & 3) == 0 &&
#endif
                      pendingHigh == 0)
                  {
                      // Code units that may still be copied: the smaller of the remaining output
                      // room (counted in code units) and the remaining input.
                      long outputRoom = (byteEnd - bytes) >> 1;
                      long inputLeft = charEnd - chars;
                      long copyLimit = (outputRoom < inputLeft) ? outputRoom : inputLeft;

                      // The end marker sits three code units before the limit, so every pass
                      // of the loop has four whole code units to work with.
                      ulong* quadEnd = (ulong*)(chars - 3 + copyLimit);
                      ulong* quadIn = (ulong*)chars;
                      ulong* quadOut = (ulong*)bytes;

                      while (quadIn < quadEnd)
                      {
                          ulong quad = *quadIn;

                          // With 0x8000 clear in all four code units none of them can be a surrogate.
                          if ((0x8000800080008000 & quad) != 0)
                          {
                              // A 16-bit lane becomes zero exactly when its top five bits are 11011,
                              // i.e. when that code unit lies in 0xd800-0xdfff.
                              ulong lanes = (0xf800f800f800f800 & quad) ^ 0xd800d800d800d800;
                              if ((lanes & 0xFFFF000000000000) == 0 ||
                                  (lanes & 0x0000FFFF00000000) == 0 ||
                                  (lanes & 0x00000000FFFF0000) == 0 ||
                                  (lanes & 0x000000000000FFFF) == 0)
                              {
                                  // At least one surrogate is present. Only the exact pattern
                                  // high/low/high/low can be copied verbatim; anything else is
                                  // left for the per-code-unit logic below.
                                  if ((0xfc00fc00fc00fc00 & quad) !=
                                      (BitConverter.IsLittleEndian ? (ulong)0xdc00d800dc00d800 : (ulong)0xd800dc00d800dc00))
                                  {
                                      break;
                                  }
                              }
                          }

                          // All four code units are acceptable as they stand.
                          Unsafe.WriteUnaligned<ulong>(quadOut, quad);
                          quadIn++;
                          quadOut++;
                      }

                      chars = (char*)quadIn;
                      bytes = (byte*)quadOut;

                      // The bulk copy may have consumed all of the input.
                      if (chars >= charEnd)
                          break;
                  }
#endif // FASTLOOP
                  current = *chars;
                  chars++;
              }

              if (char.IsHighSurrogate(current))
              {
                  if (pendingHigh > 0)
                  {
                      // Two high surrogates in a row: step back so the current one is read
                      // again, and fall back the earlier one.
                      Debug.Assert(chars > charStart,
                          "[UnicodeEncoding.GetBytes]Expected chars to have advanced in unexpected high surrogate");
                      chars--;

                      if (fallbackBuffer == null)
                      {
                          fallbackBuffer = (encoder == null)
                              ? this.encoderFallback.CreateFallbackBuffer()
                              : encoder.FallbackBuffer;
                          fallbackBuffer.InternalInitialize(charStart, charEnd, encoder, true);
                      }

                      // A separate local is passed by reference so that chars itself is not.
                      fallbackCursor = chars;
                      fallbackBuffer.InternalFallback(pendingHigh, ref fallbackCursor);
                      chars = fallbackCursor;

                      pendingHigh = (char)0;
                  }
                  else
                  {
                      // Hold this high surrogate until the next code unit is known.
                      pendingHigh = current;
                  }
                  continue;
              }

              if (char.IsLowSurrogate(current))
              {
                  if (pendingHigh == 0)
                  {
                      // Low surrogate without a preceding high surrogate: fall it back.
                      if (fallbackBuffer == null)
                      {
                          fallbackBuffer = (encoder == null)
                              ? this.encoderFallback.CreateFallbackBuffer()
                              : encoder.FallbackBuffer;
                          fallbackBuffer.InternalInitialize(charStart, charEnd, encoder, true);
                      }

                      fallbackCursor = chars;
                      fallbackBuffer.InternalFallback(current, ref fallbackCursor);
                      chars = fallbackCursor;
                      continue;
                  }

                  // A complete pair; make sure there is room for it before writing either half.
                  if (bytes + 3 >= byteEnd)
                  {
                      if (fallbackBuffer != null && fallbackBuffer.bFallingBack)
                      {
                          // Both halves are treated as fallback output; hand both back.
                          fallbackBuffer.MovePrevious();
                          fallbackBuffer.MovePrevious();
                      }
                      else
                      {
                          Debug.Assert(chars > charStart + 1 || bytes == byteStart,
                              "[UnicodeEncoding.GetBytes]Expected chars to have when no room to add surrogate pair");
                          chars -= 2; // neither half of the pair was used
                      }

                      ThrowBytesOverflow(encoder, bytes == byteStart);
                      pendingHigh = (char)0; // the input cursor was moved back before the pair
                      break;
                  }

                  // Emit the held-back high surrogate; the low surrogate is written by the
                  // common code at the bottom of the loop.
                  byte pendingHi = (byte)(pendingHigh >> 8);
                  byte pendingLo = (byte)pendingHigh;
                  if (bigEndian)
                  {
                      bytes[0] = pendingHi;
                      bytes[1] = pendingLo;
                  }
                  else
                  {
                      bytes[0] = pendingLo;
                      bytes[1] = pendingHi;
                  }
                  bytes += 2;
                  pendingHigh = (char)0;
              }
              else if (pendingHigh > 0)
              {
                  // A non-surrogate follows a high surrogate: step back so the current code
                  // unit is read again, and fall back the unpaired high surrogate.
                  Debug.Assert(chars > charStart,
                      "[UnicodeEncoding.GetBytes]Expected chars to have advanced after expecting low surrogate");
                  chars--;

                  if (fallbackBuffer == null)
                  {
                      fallbackBuffer = (encoder == null)
                          ? this.encoderFallback.CreateFallbackBuffer()
                          : encoder.FallbackBuffer;
                      fallbackBuffer.InternalInitialize(charStart, charEnd, encoder, true);
                  }

                  fallbackCursor = chars;
                  fallbackBuffer.InternalFallback(pendingHigh, ref fallbackCursor);
                  chars = fallbackCursor;

                  pendingHigh = (char)0;
                  continue;
              }

              // Write the current code unit, provided it fits.
              if (bytes + 1 >= byteEnd)
              {
                  if (fallbackBuffer != null && fallbackBuffer.bFallingBack)
                  {
                      fallbackBuffer.MovePrevious(); // this fallback character was not used
                  }
                  else
                  {
                      Debug.Assert(chars > charStart,
                          "[UnicodeEncoding.GetBytes]Expected chars to have advanced for failed fallback");
                      chars--; // this input code unit was not used
                  }

                  ThrowBytesOverflow(encoder, bytes == byteStart);
                  break; // stop here if the call above returned
              }

              byte currentHi = (byte)(current >> 8);
              byte currentLo = (byte)current;
              if (bigEndian)
              {
                  bytes[0] = currentHi;
                  bytes[1] = currentLo;
              }
              else
              {
                  bytes[0] = currentLo;
                  bytes[1] = currentHi;
              }
              bytes += 2;
          }

          // Finished unless a high surrogate is still pending and this call has to flush
          // (there is no encoder, or the encoder reports MustFlush).
          if (pendingHigh == 0 || (encoder != null && !encoder.MustFlush))
          {
              break;
          }

          if (flushFallbackTried)
          {
              // The end-of-input fallback already ran once and a high surrogate is pending again.
              throw new ArgumentException(
                  SR.Format(SR.Argument_RecursiveFallback, pendingHigh), nameof(chars));
          }

          if (fallbackBuffer == null)
          {
              fallbackBuffer = (encoder == null)
                  ? this.encoderFallback.CreateFallbackBuffer()
                  : encoder.FallbackBuffer;
              fallbackBuffer.InternalInitialize(charStart, charEnd, encoder, true);
          }

          // Fall back the dangling high surrogate, then run the main loop again so that
          // anything the fallback buffer yields gets encoded.
          fallbackCursor = chars;
          fallbackBuffer.InternalFallback(pendingHigh, ref fallbackCursor);
          chars = fallbackCursor;

          pendingHigh = (char)0;
          flushFallbackTried = true;
      }

      // Store the pending high surrogate (possibly zero) and the consumed input length
      // back into the encoder.
      if (encoder != null)
      {
          encoder._charLeftOver = pendingHigh;
          encoder._charsUsed = (int)(chars - charStart);
      }

      Debug.Assert((encoder != null && !encoder.MustFlush) || pendingHigh == (char)0,
          "[UnicodeEncoding.GetBytes] Expected no left over characters if flushing");
      Debug.Assert(fallbackBuffer == null || fallbackBuffer.Remaining == 0 ||
          encoder == null || !encoder._throwOnOverflow,
          "[UnicodeEncoding.GetBytes]Expected empty fallback buffer if not converting");

      return (int)(bytes - byteStart);
  }
  // Computes how many UTF-16 code units decoding [bytes, bytes + count) would produce.
  //
  // The count starts at one code unit per two input bytes (plus one for a high surrogate carried
  // in the decoder, plus one when a carried byte completes an odd-length input) and is then
  // corrected for every ill-formed sequence: the code unit is un-counted and the value returned
  // by the decoder fallback buffer's InternalFallback is added instead.
  //
  // Parameters:
  //   bytes, count - the input bytes and how many of them there are.
  //   baseDecoder  - optional; cast to UnicodeEncoding.Decoder, and when non-null its lastByte and
  //                  lastChar seed the scan. No decoder field is assigned by this method.
  //
  // Returns the resulting code unit count.
  internal sealed override unsafe int GetCharCount(byte* bytes, int count, DecoderNLS baseDecoder)
  {
      Debug.Assert(bytes != null, "[UnicodeEncoding.GetCharCount]bytes!=null");
      Debug.Assert(count >= 0, "[UnicodeEncoding.GetCharCount]count >=0");

      UnicodeEncoding.Decoder decoder = (UnicodeEncoding.Decoder)baseDecoder;

      byte* byteStart = bytes;
      byte* byteEnd = bytes + count;

      int pendingByte = -1;           // first byte of a code unit, or -1 when none is held
      char pendingHigh = (char)0;     // high surrogate still waiting for its low surrogate

      // Initial estimate: every two input bytes yield one code unit.
      int total = count >> 1;

      // Created only when something actually has to fall back.
      DecoderFallbackBuffer fallbackBuffer = null;

      if (decoder != null)
      {
          pendingByte = decoder.lastByte;
          pendingHigh = decoder.lastChar;

          // A carried-over high surrogate adds one code unit to the estimate.
          if (pendingHigh > 0)
          {
              total++;
          }

          // A carried-over byte together with an odd input length makes one more code unit.
          if (pendingByte >= 0 && (count & 1) == 1)
          {
              total++;
          }

          Debug.Assert(!decoder.InternalHasFallbackBuffer || decoder.FallbackBuffer.Remaining == 0,
              "[UnicodeEncoding.GetCharCount]Expected empty fallback buffer at start");
      }

      while (bytes < byteEnd)
      {
#if FASTLOOP
          // Bulk scan: only when the encoding's byte order equals the machine's, the input pointer
          // passes the alignment test and neither a byte nor a high surrogate is pending.
          if ((bigEndian ^ BitConverter.IsLittleEndian) &&
#if BIT64
              (unchecked((long)bytes) & 7) == 0 &&
#else
              (unchecked((int)bytes) & 3) == 0 &&
#endif // BIT64
              pendingByte == -1 && pendingHigh == 0)
          {
              // The end marker sits seven bytes before the end of the input, so every pass
              // of the loop has eight whole bytes to read.
              ulong* quadEnd = (ulong*)(byteEnd - 7);
              ulong* quadIn = (ulong*)bytes;

              while (quadIn < quadEnd)
              {
                  ulong quad = *quadIn;

                  // With 0x8000 clear in all four code units none of them can be a surrogate.
                  if ((0x8000800080008000 & quad) != 0)
                  {
                      // A 16-bit lane becomes zero exactly when its top five bits are 11011,
                      // i.e. when that code unit lies in 0xd800-0xdfff.
                      ulong lanes = (0xf800f800f800f800 & quad) ^ 0xd800d800d800d800;
                      if ((lanes & 0xFFFF000000000000) == 0 ||
                          (lanes & 0x0000FFFF00000000) == 0 ||
                          (lanes & 0x00000000FFFF0000) == 0 ||
                          (lanes & 0x000000000000FFFF) == 0)
                      {
                          // At least one surrogate is present. Only the exact pattern
                          // high/low/high/low can be skipped as is; anything else is left
                          // for the per-code-unit logic below.
                          if ((0xfc00fc00fc00fc00 & quad) !=
                              (BitConverter.IsLittleEndian ? (ulong)0xdc00d800dc00d800 : (ulong)0xd800dc00d800dc00))
                          {
                              break;
                          }
                      }
                  }

                  // These four code units are already covered by the initial estimate.
                  quadIn++;
              }

              bytes = (byte*)quadIn;

              // The bulk scan may have consumed all of the input.
              if (bytes >= byteEnd)
                  break;
          }
#endif // FASTLOOP

          // Pick up the first byte of the code unit unless one is already held.
          if (pendingByte < 0)
          {
              pendingByte = *bytes++;
              if (bytes >= byteEnd)
                  break;
          }

          // Combine it with the second byte according to the encoding's byte order.
          char unit = bigEndian
              ? (char)(pendingByte << 8 | *(bytes++))
              : (char)(*(bytes++) << 8 | pendingByte);
          pendingByte = -1;

          if (char.IsHighSurrogate(unit))
          {
              if (pendingHigh > 0)
              {
                  // Two high surrogates in a row: the earlier one is un-counted and falls back.
                  total--;

                  // Its bytes are rebuilt from the char value, in the encoding's byte order.
                  byte[] strayBytes = bigEndian
                      ? new byte[] { unchecked((byte)(pendingHigh >> 8)), unchecked((byte)pendingHigh) }
                      : new byte[] { unchecked((byte)pendingHigh), unchecked((byte)(pendingHigh >> 8)) };

                  if (fallbackBuffer == null)
                  {
                      fallbackBuffer = (decoder == null)
                          ? this.decoderFallback.CreateFallbackBuffer()
                          : decoder.FallbackBuffer;
                      fallbackBuffer.InternalInitialize(byteStart, null);
                  }

                  total += fallbackBuffer.InternalFallback(strayBytes, bytes);
              }

              // Hold the new high surrogate until the next code unit is known.
              pendingHigh = unit;
              continue;
          }

          if (char.IsLowSurrogate(unit))
          {
              if (pendingHigh == 0)
              {
                  // Low surrogate without a preceding high surrogate: un-count it and fall back.
                  total--;

                  byte[] strayBytes = bigEndian
                      ? new byte[] { unchecked((byte)(unit >> 8)), unchecked((byte)unit) }
                      : new byte[] { unchecked((byte)unit), unchecked((byte)(unit >> 8)) };

                  if (fallbackBuffer == null)
                  {
                      fallbackBuffer = (decoder == null)
                          ? this.decoderFallback.CreateFallbackBuffer()
                          : decoder.FallbackBuffer;
                      fallbackBuffer.InternalInitialize(byteStart, null);
                  }

                  total += fallbackBuffer.InternalFallback(strayBytes, bytes);
                  continue;
              }

              // A complete pair; both halves are already part of the estimate.
              pendingHigh = (char)0;
          }
          else if (pendingHigh > 0)
          {
              // A non-surrogate follows a high surrogate: the high surrogate is un-counted and
              // falls back, while the current code unit stays counted.
              total--;

              byte[] strayBytes = bigEndian
                  ? new byte[] { unchecked((byte)(pendingHigh >> 8)), unchecked((byte)pendingHigh) }
                  : new byte[] { unchecked((byte)pendingHigh), unchecked((byte)(pendingHigh >> 8)) };

              if (fallbackBuffer == null)
              {
                  fallbackBuffer = (decoder == null)
                      ? this.decoderFallback.CreateFallbackBuffer()
                      : decoder.FallbackBuffer;
                  fallbackBuffer.InternalInitialize(byteStart, null);
              }

              total += fallbackBuffer.InternalFallback(strayBytes, bytes);
              pendingHigh = (char)0;
          }
      }

      // When there is no decoder, or the decoder reports MustFlush, pending data falls back now.
      if (decoder == null || decoder.MustFlush)
      {
          if (pendingHigh > 0)
          {
              // Dangling high surrogate: un-count it and add what its fallback returns.
              total--;

              byte[] strayBytes = bigEndian
                  ? new byte[] { unchecked((byte)(pendingHigh >> 8)), unchecked((byte)pendingHigh) }
                  : new byte[] { unchecked((byte)pendingHigh), unchecked((byte)(pendingHigh >> 8)) };

              if (fallbackBuffer == null)
              {
                  fallbackBuffer = (decoder == null)
                      ? this.decoderFallback.CreateFallbackBuffer()
                      : decoder.FallbackBuffer;
                  fallbackBuffer.InternalInitialize(byteStart, null);
              }

              total += fallbackBuffer.InternalFallback(strayBytes, bytes);
              pendingHigh = (char)0;
          }

          if (pendingByte >= 0)
          {
              // Dangling odd byte: add what its fallback returns.
              if (fallbackBuffer == null)
              {
                  fallbackBuffer = (decoder == null)
                      ? this.decoderFallback.CreateFallbackBuffer()
                      : decoder.FallbackBuffer;
                  fallbackBuffer.InternalInitialize(byteStart, null);
              }

              total += fallbackBuffer.InternalFallback(new byte[] { unchecked((byte)pendingByte) }, bytes);
              pendingByte = -1;
          }
      }

      // A high surrogate that is still pending here (the flush branch above clears it) is
      // removed from the count.
      if (pendingHigh > 0)
      {
          total--;
      }

      Debug.Assert(fallbackBuffer == null || fallbackBuffer.Remaining == 0,
          "[UnicodeEncoding.GetCharCount]Expected empty fallback buffer at end");

      return total;
  }
  1116. internal sealed override unsafe int GetChars(
  1117. byte* bytes, int byteCount, char* chars, int charCount, DecoderNLS baseDecoder)
  1118. {
  1119. Debug.Assert(chars != null, "[UnicodeEncoding.GetChars]chars!=null");
  1120. Debug.Assert(byteCount >= 0, "[UnicodeEncoding.GetChars]byteCount >=0");
  1121. Debug.Assert(charCount >= 0, "[UnicodeEncoding.GetChars]charCount >=0");
  1122. Debug.Assert(bytes != null, "[UnicodeEncoding.GetChars]bytes!=null");
  1123. UnicodeEncoding.Decoder decoder = (UnicodeEncoding.Decoder)baseDecoder;
  1124. // Need last vars
  1125. int lastByte = -1;
  1126. char lastChar = (char)0;
  1127. // Get our decoder (but don't clear it yet)
  1128. if (decoder != null)
  1129. {
  1130. lastByte = decoder.lastByte;
  1131. lastChar = decoder.lastChar;
  1132. // Shouldn't have anything in fallback buffer for GetChars
  1133. // (don't have to check _throwOnOverflow for chars)
  1134. Debug.Assert(!decoder.InternalHasFallbackBuffer || decoder.FallbackBuffer.Remaining == 0,
  1135. "[UnicodeEncoding.GetChars]Expected empty fallback buffer at start");
  1136. }
  1137. // For fallback we may need a fallback buffer
  1138. DecoderFallbackBuffer fallbackBuffer = null;
  1139. char* charsForFallback;
  1140. byte* byteEnd = bytes + byteCount;
  1141. char* charEnd = chars + charCount;
  1142. byte* byteStart = bytes;
  1143. char* charStart = chars;
  1144. while (bytes < byteEnd)
  1145. {
  1146. // If we're aligned then maybe we can do it fast
  1147. // That'll hurt if we're unaligned because we'll always test but never be aligned
  1148. #if FASTLOOP
  1149. if ((bigEndian ^ BitConverter.IsLittleEndian) &&
  1150. #if BIT64
  1151. (unchecked((long)chars) & 7) == 0 &&
  1152. #else
  1153. (unchecked((int)chars) & 3) == 0 &&
  1154. #endif
  1155. lastByte == -1 && lastChar == 0)
  1156. {
  1157. // Need -1 to check 2 at a time. If we have an even #, longChars will go
  1158. // from longEnd - 1/2 long to longEnd + 1/2 long. If we're odd, longChars
  1159. // will go from longEnd - 1 long to longEnd. (Might not get to use this)
  1160. // We can only go iCount units (limited by shorter of char or byte buffers.
  1161. ulong* longEnd = (ulong*)(bytes - 7 +
  1162. (((byteEnd - bytes) >> 1 < charEnd - chars) ?
  1163. (byteEnd - bytes) : (charEnd - chars) << 1));
  1164. // Need new char* so we can check 4 at a time
  1165. ulong* longBytes = (ulong*)bytes;
  1166. ulong* longChars = (ulong*)chars;
  1167. while (longBytes < longEnd)
  1168. {
  1169. // See if we potentially have surrogates (0x8000 bit set)
  1170. // (We're either big endian on a big endian machine or little endian on
  1171. // a little endian machine so that'll work)
  1172. if ((0x8000800080008000 & *longBytes) != 0)
  1173. {
  1174. // See if any of these are high or low surrogates (0xd800 - 0xdfff). If the high
  1175. // 5 bits looks like 11011, then its a high or low surrogate.
  1176. // We do the & f800 to filter the 5 bits, then ^ d800 to ensure the 0 isn't set.
  1177. // Note that we expect BMP characters to be more common than surrogates
  1178. // & each char with 11111... then ^ with 11011. Zeroes then indicate surrogates
  1179. ulong uTemp = (0xf800f800f800f800 & *longBytes) ^ 0xd800d800d800d800;
  1180. // Check each of the 4 chars. 0 for those 16 bits means it was a surrogate
  1181. // but no clue if they're high or low.
  1182. // If each of the 4 characters are non-zero, then none are surrogates.
  1183. if ((uTemp & 0xFFFF000000000000) == 0 ||
  1184. (uTemp & 0x0000FFFF00000000) == 0 ||
  1185. (uTemp & 0x00000000FFFF0000) == 0 ||
  1186. (uTemp & 0x000000000000FFFF) == 0)
  1187. {
  1188. // It has at least 1 surrogate, but we don't know if they're high or low surrogates,
  1189. // or if there's 1 or 4 surrogates
  1190. // If they happen to be high/low/high/low, we may as well continue. Check the next
  1191. // bit to see if its set (low) or not (high) in the right pattern
  1192. if ((0xfc00fc00fc00fc00 & *longBytes) !=
  1193. (BitConverter.IsLittleEndian ? (ulong)0xdc00d800dc00d800 : (ulong)0xd800dc00d800dc00))
  1194. {
  1195. // Either there weren't 4 surrogates, or the 0x0400 bit was set when a high
  1196. // was hoped for or the 0x0400 bit wasn't set where a low was hoped for.
  1197. // Drop out to the slow loop to resolve the surrogates
  1198. break;
  1199. }
  1200. // else they are all surrogates in High/Low/High/Low order, so we can use them.
  1201. }
  1202. // else none are surrogates, so we can use them.
  1203. }
  1204. // else all < 0x8000 so we can use them
  1205. // We can use these 4 chars.
  1206. Unsafe.WriteUnaligned<ulong>(longChars, *longBytes);
  1207. longBytes++;
  1208. longChars++;
  1209. }
  1210. chars = (char*)longChars;
  1211. bytes = (byte*)longBytes;
  1212. if (bytes >= byteEnd)
  1213. break;
  1214. }
  1215. #endif // FASTLOOP
  1216. // Get 1st byte
  1217. if (lastByte < 0)
  1218. {
  1219. lastByte = *bytes++;
  1220. continue;
  1221. }
  1222. // Get full char
  1223. char ch;
  1224. if (bigEndian)
  1225. {
  1226. ch = (char)(lastByte << 8 | *(bytes++));
  1227. }
  1228. else
  1229. {
  1230. ch = (char)(*(bytes++) << 8 | lastByte);
  1231. }
  1232. lastByte = -1;
  1233. // See if the char's valid
  1234. if (ch >= 0xd800 && ch <= 0xdfff)
  1235. {
  1236. // Was it a high surrogate?
  1237. if (ch <= 0xdbff)
  1238. {
  1239. // Its a high surrogate, if we had one then do fallback for previous one
  1240. if (lastChar > 0)
  1241. {
  1242. // Get fallback for previous high surrogate
  1243. // Note we have to reconstruct bytes because some may have been in decoder
  1244. byte[] byteBuffer = null;
  1245. if (bigEndian)
  1246. {
  1247. byteBuffer = new byte[]
  1248. { unchecked((byte)(lastChar >> 8)), unchecked((byte)lastChar) };
  1249. }
  1250. else
  1251. {
  1252. byteBuffer = new byte[]
  1253. { unchecked((byte)lastChar), unchecked((byte)(lastChar >> 8)) };
  1254. }
  1255. if (fallbackBuffer == null)
  1256. {
  1257. if (decoder == null)
  1258. fallbackBuffer = this.decoderFallback.CreateFallbackBuffer();
  1259. else
  1260. fallbackBuffer = decoder.FallbackBuffer;
  1261. // Set our internal fallback interesting things.
  1262. fallbackBuffer.InternalInitialize(byteStart, charEnd);
  1263. }
  1264. charsForFallback = chars; // Avoid passing chars by reference to allow it to be en-registered
  1265. bool fallbackResult = fallbackBuffer.InternalFallback(byteBuffer, bytes, ref charsForFallback);
  1266. chars = charsForFallback;
  1267. if (!fallbackResult)
  1268. {
  1269. // couldn't fall back lonely surrogate
  1270. // We either advanced bytes or chars should == charStart and throw below
  1271. Debug.Assert(bytes >= byteStart + 2 || chars == charStart,
  1272. "[UnicodeEncoding.GetChars]Expected bytes to have advanced or no output (bad surrogate)");
  1273. bytes -= 2; // didn't use these 2 bytes
  1274. fallbackBuffer.InternalReset();
  1275. ThrowCharsOverflow(decoder, chars == charStart);// Might throw, if no chars output
  1276. break; // couldn't fallback but didn't throw
  1277. }
  1278. }
  1279. // Ignore the previous high surrogate which fell back already,
  1280. // yet remember the current high surrogate for next time.
  1281. lastChar = ch;
  1282. continue;
  1283. }
  1284. // Its a low surrogate
  1285. if (lastChar == 0)
  1286. {
  1287. // Expected a previous high surrogate
  1288. // Get fallback for this low surrogate
  1289. // Note we have to reconstruct bytes because some may have been in decoder
  1290. byte[] byteBuffer = null;
  1291. if (bigEndian)
  1292. {
  1293. byteBuffer = new byte[]
  1294. { unchecked((byte)(ch >> 8)), unchecked((byte)ch) };
  1295. }
  1296. else
  1297. {
  1298. byteBuffer = new byte[]
  1299. { unchecked((byte)ch), unchecked((byte)(ch >> 8)) };
  1300. }
  1301. if (fallbackBuffer == null)
  1302. {
  1303. if (decoder == null)
  1304. fallbackBuffer = this.decoderFallback.CreateFallbackBuffer();
  1305. else
  1306. fallbackBuffer = decoder.FallbackBuffer;
  1307. // Set our internal fallback interesting things.
  1308. fallbackBuffer.InternalInitialize(byteStart, charEnd);
  1309. }
  1310. charsForFallback = chars; // Avoid passing chars by reference to allow it to be en-registered
  1311. bool fallbackResult = fallbackBuffer.InternalFallback(byteBuffer, bytes, ref charsForFallback);
  1312. chars = charsForFallback;
  1313. if (!fallbackResult)
  1314. {
  1315. // couldn't fall back lonely surrogate
  1316. // We either advanced bytes or chars should == charStart and throw below
  1317. Debug.Assert(bytes >= byteStart + 2 || chars == charStart,
  1318. "[UnicodeEncoding.GetChars]Expected bytes to have advanced or no output (lonely surrogate)");
  1319. bytes -= 2; // didn't use these 2 bytes
  1320. fallbackBuffer.InternalReset();
  1321. ThrowCharsOverflow(decoder, chars == charStart);// Might throw, if no chars output
  1322. break; // couldn't fallback but didn't throw
  1323. }
  1324. // Didn't throw, ignore this one (we already did its fallback)
  1325. continue;
  1326. }
  1327. // Valid surrogate pair, add our lastChar (will need 2 chars)
  1328. if (chars >= charEnd - 1)
  1329. {
  1330. // couldn't find room for this surrogate pair
  1331. // We either advanced bytes or chars should == charStart and throw below
  1332. Debug.Assert(bytes >= byteStart + 2 || chars == charStart,
  1333. "[UnicodeEncoding.GetChars]Expected bytes to have advanced or no output (surrogate pair)");
  1334. bytes -= 2; // didn't use these 2 bytes
  1335. ThrowCharsOverflow(decoder, chars == charStart);// Might throw, if no chars output
  1336. // Leave lastChar for next call to Convert()
  1337. break; // couldn't fallback but didn't throw
  1338. }
  1339. *chars++ = lastChar;
  1340. lastChar = (char)0;
  1341. }
  1342. else if (lastChar > 0)
  1343. {
  1344. // Had a high surrogate, expected a low surrogate, fall back the high surrogate.
  1345. byte[] byteBuffer = null;
  1346. if (bigEndian)
  1347. {
  1348. byteBuffer = new byte[]
  1349. { unchecked((byte)(lastChar >> 8)), unchecked((byte)lastChar) };
  1350. }
  1351. else
  1352. {
  1353. byteBuffer = new byte[]
  1354. { unchecked((byte)lastChar), unchecked((byte)(lastChar >> 8)) };
  1355. }
  1356. if (fallbackBuffer == null)
  1357. {
  1358. if (decoder == null)
  1359. fallbackBuffer = this.decoderFallback.CreateFallbackBuffer();
  1360. else
  1361. fallbackBuffer = decoder.FallbackBuffer;
  1362. // Set our internal fallback interesting things.
  1363. fallbackBuffer.InternalInitialize(byteStart, charEnd);
  1364. }
  1365. charsForFallback = chars; // Avoid passing chars by reference to allow it to be en-registered
  1366. bool fallbackResult = fallbackBuffer.InternalFallback(byteBuffer, bytes, ref charsForFallback);
  1367. chars = charsForFallback;
  1368. if (!fallbackResult)
  1369. {
  1370. // couldn't fall back high surrogate, or char that would be next
  1371. // We either advanced bytes or chars should == charStart and throw below
  1372. Debug.Assert(bytes >= byteStart + 2 || chars == charStart,
  1373. "[UnicodeEncoding.GetChars]Expected bytes to have advanced or no output (no low surrogate)");
  1374. bytes -= 2; // didn't use these 2 bytes
  1375. fallbackBuffer.InternalReset();
  1376. ThrowCharsOverflow(decoder, chars == charStart);// Might throw, if no chars output
  1377. break; // couldn't fallback but didn't throw
  1378. }
  1379. // Not left over now, clear previous high surrogate and continue to add current char
  1380. lastChar = (char)0;
  1381. }
  1382. // Valid char, room for it?
  1383. if (chars >= charEnd)
  1384. {
  1385. // 2 bytes couldn't fall back
  1386. // We either advanced bytes or chars should == charStart and throw below
  1387. Debug.Assert(bytes >= byteStart + 2 || chars == charStart,
  1388. "[UnicodeEncoding.GetChars]Expected bytes to have advanced or no output (normal)");
  1389. bytes -= 2; // didn't use these bytes
  1390. ThrowCharsOverflow(decoder, chars == charStart);// Might throw, if no chars output
  1391. break; // couldn't fallback but didn't throw
  1392. }
  1393. // add it
  1394. *chars++ = ch;
  1395. }
  1396. // Remember our decoder if we must
  1397. if (decoder == null || decoder.MustFlush)
  1398. {
  1399. if (lastChar > 0)
  1400. {
  1401. // No hanging high surrogates allowed, do fallback and remove count for it
  1402. byte[] byteBuffer = null;
  1403. if (bigEndian)
  1404. {
  1405. byteBuffer = new byte[]
  1406. { unchecked((byte)(lastChar >> 8)), unchecked((byte)lastChar) };
  1407. }
  1408. else
  1409. {
  1410. byteBuffer = new byte[]
  1411. { unchecked((byte)lastChar), unchecked((byte)(lastChar >> 8)) };
  1412. }
  1413. if (fallbackBuffer == null)
  1414. {
  1415. if (decoder == null)
  1416. fallbackBuffer = this.decoderFallback.CreateFallbackBuffer();
  1417. else
  1418. fallbackBuffer = decoder.FallbackBuffer;
  1419. // Set our internal fallback interesting things.
  1420. fallbackBuffer.InternalInitialize(byteStart, charEnd);
  1421. }
  1422. charsForFallback = chars; // Avoid passing chars by reference to allow it to be en-registered
  1423. bool fallbackResult = fallbackBuffer.InternalFallback(byteBuffer, bytes, ref charsForFallback);
  1424. chars = charsForFallback;
  1425. if (!fallbackResult)
  1426. {
  1427. // 2 bytes couldn't fall back
  1428. // We either advanced bytes or chars should == charStart and throw below
  1429. Debug.Assert(bytes >= byteStart + 2 || chars == charStart,
  1430. "[UnicodeEncoding.GetChars]Expected bytes to have advanced or no output (decoder)");
  1431. bytes -= 2; // didn't use these bytes
  1432. if (lastByte >= 0)
  1433. bytes--; // had an extra last byte hanging around
  1434. fallbackBuffer.InternalReset();
  1435. ThrowCharsOverflow(decoder, chars == charStart);// Might throw, if no chars output
  1436. // We'll remember these in our decoder though
  1437. bytes += 2;
  1438. if (lastByte >= 0)
  1439. bytes++;
  1440. goto End;
  1441. }
  1442. // done with this one
  1443. lastChar = (char)0;
  1444. }
  1445. if (lastByte >= 0)
  1446. {
  1447. if (fallbackBuffer == null)
  1448. {
  1449. if (decoder == null)
  1450. fallbackBuffer = this.decoderFallback.CreateFallbackBuffer();
  1451. else
  1452. fallbackBuffer = decoder.FallbackBuffer;
  1453. // Set our internal fallback interesting things.
  1454. fallbackBuffer.InternalInitialize(byteStart, charEnd);
  1455. }
  1456. // No hanging odd bytes allowed if must flush
  1457. charsForFallback = chars; // Avoid passing chars by reference to allow it to be en-registered
  1458. bool fallbackResult = fallbackBuffer.InternalFallback(new byte[] { unchecked((byte)lastByte) }, bytes, ref charsForFallback);
  1459. chars = charsForFallback;
  1460. if (!fallbackResult)
  1461. {
  1462. // odd byte couldn't fall back
  1463. bytes--; // didn't use this byte
  1464. fallbackBuffer.InternalReset();
  1465. ThrowCharsOverflow(decoder, chars == charStart);// Might throw, if no chars output
  1466. // didn't throw, but we'll remember it in the decoder
  1467. bytes++;
  1468. goto End;
  1469. }
  1470. // Didn't fail, clear buffer
  1471. lastByte = -1;
  1472. }
  1473. }
  1474. End:
  1475. // Remember our decoder if we must
  1476. if (decoder != null)
  1477. {
  1478. Debug.Assert((decoder.MustFlush == false) || ((lastChar == (char)0) && (lastByte == -1)),
  1479. "[UnicodeEncoding.GetChars] Expected no left over chars or bytes if flushing"
  1480. // + " " + ((int)lastChar).ToString("X4") + " " + lastByte.ToString("X2")
  1481. );
  1482. decoder._bytesUsed = (int)(bytes - byteStart);
  1483. decoder.lastChar = lastChar;
  1484. decoder.lastByte = lastByte;
  1485. }
  1486. // Shouldn't have anything in fallback buffer for GetChars
  1487. // (don't have to check _throwOnOverflow for count or chars)
  1488. Debug.Assert(fallbackBuffer == null || fallbackBuffer.Remaining == 0,
  1489. "[UnicodeEncoding.GetChars]Expected empty fallback buffer at end");
  1490. return (int)(chars - charStart);
  1491. }
  // Creates a new EncoderNLS bound to this encoding instance.
  public override System.Text.Encoder GetEncoder() => new EncoderNLS(this);
  // Creates a new nested UnicodeEncoding.Decoder bound to this encoding instance.
  public override System.Text.Decoder GetDecoder() => new UnicodeEncoding.Decoder(this);
  // Returns the byte order mark for this encoding as a byte array:
  //   - an empty array when byteOrderMark is false,
  //   - FE FF when bigEndian is true,
  //   - FF FE otherwise.
  public override byte[] GetPreamble()
  {
      if (!byteOrderMark)
      {
          return Array.Empty<byte>();
      }

      // A fresh array is allocated on every call so that a caller mutating
      // the result cannot affect a shared/cached instance.
      return bigEndian
          ? new byte[2] { 0xfe, 0xff }
          : new byte[2] { 0xff, 0xfe };
  }
  // Span-based counterpart of GetPreamble().
  public override ReadOnlySpan<byte> Preamble
  {
      get
      {
          // A derived UnicodeEncoding may have overridden GetPreamble, so for any
          // runtime type other than UnicodeEncoding itself defer to that method.
          if (GetType() != typeof(UnicodeEncoding))
          {
              return new ReadOnlySpan<byte>(GetPreamble());
          }

          // No byte order mark requested: empty span.
          if (!byteOrderMark)
          {
              return default;
          }

          // The constant-array-to-span casts below are kept in this exact form;
          // they rely on the C# compiler's optimization for static byte[] data.
          if (bigEndian)
          {
              return (ReadOnlySpan<byte>)new byte[2] { 0xfe, 0xff };
          }

          return (ReadOnlySpan<byte>)new byte[2] { 0xff, 0xfe };
      }
  }
  // Computes an upper bound on the number of bytes produced when encoding
  // charCount characters.
  //
  // Throws ArgumentOutOfRangeException when charCount is negative or when the
  // computed bound does not fit in an int.
  public override int GetMaxByteCount(int charCount)
  {
      if (charCount < 0)
      {
          throw new ArgumentOutOfRangeException(
              nameof(charCount),
              SR.ArgumentOutOfRange_NeedNonNegNum);
      }

      // One extra character accounts for a left-over high surrogate that
      // may have to go through the fallback.
      long maxBytes = (long)charCount + 1;

      // Each character may expand to the fallback's maximum char count.
      if (EncoderFallback.MaxCharCount > 1)
      {
          maxBytes *= EncoderFallback.MaxCharCount;
      }

      // Every UTF-16 code unit occupies two bytes.
      maxBytes *= 2;

      if (maxBytes > int.MaxValue)
      {
          throw new ArgumentOutOfRangeException(
              nameof(charCount),
              SR.ArgumentOutOfRange_GetByteCountOverflow);
      }

      return (int)maxBytes;
  }
  // Computes an upper bound on the number of characters produced when decoding
  // byteCount bytes.
  //
  // Throws ArgumentOutOfRangeException when byteCount is negative or when the
  // computed bound does not fit in an int.
  public override int GetMaxCharCount(int byteCount)
  {
      if (byteCount < 0)
      {
          throw new ArgumentOutOfRangeException(
              nameof(byteCount),
              SR.ArgumentOutOfRange_NeedNonNegNum);
      }

      // Work in long: byteCount may be int.MaxValue.
      long fullCodeUnits = byteCount >> 1;  // one char per two bytes
      long oddByte = byteCount & 1;         // round up for a trailing single byte
      // The extra 1 covers a left-over high surrogate held in the decoder.
      long maxChars = fullCodeUnits + oddByte + 1;

      // Lonely surrogates (or similar) may each expand through the fallback.
      if (DecoderFallback.MaxCharCount > 1)
      {
          maxChars *= DecoderFallback.MaxCharCount;
      }

      if (maxChars > int.MaxValue)
      {
          throw new ArgumentOutOfRangeException(
              nameof(byteCount),
              SR.ArgumentOutOfRange_GetCharCountOverflow);
      }

      return (int)maxChars;
  }
  // Two UnicodeEncoding instances are equal when they agree on code page,
  // byte-order-mark setting, endianness, and both fallbacks.
  // Any value that is not a UnicodeEncoding (including null) compares unequal.
  public override bool Equals(object value)
  {
      if (!(value is UnicodeEncoding other))
      {
          return false;
      }

      // Big-endian Unicode uses a different code page (1201) than
      // little-endian (1200), so the code page is compared as well.
      return CodePage == other.CodePage
          && byteOrderMark == other.byteOrderMark
          && bigEndian == other.bigEndian
          && EncoderFallback.Equals(other.EncoderFallback)
          && DecoderFallback.Equals(other.DecoderFallback);
  }
  // Combines the code page, both fallback hash codes, and one distinct
  // constant per boolean option (4 for byteOrderMark, 8 for bigEndian).
  public override int GetHashCode()
  {
      int hash = CodePage;
      hash += this.EncoderFallback.GetHashCode();
      hash += this.DecoderFallback.GetHashCode();
      hash += byteOrderMark ? 4 : 0;
      hash += bigEndian ? 8 : 0;
      return hash;
  }
  // Stateful decoder for UnicodeEncoding. Carries an odd trailing byte and/or
  // a pending high surrogate across calls.
  private sealed class Decoder : System.Text.DecoderNLS
  {
      // Pending odd byte from a previous call; -1 when there is none.
      internal int lastByte = -1;

      // Pending high surrogate from a previous call; '\0' when there is none.
      internal char lastChar = '\0';

      public Decoder(UnicodeEncoding encoding)
          : base(encoding)
      {
          // Nothing to do here; per the original note, the base constructor calls Reset.
      }

      // Discards any pending byte/surrogate and resets the fallback buffer if one exists.
      public override void Reset()
      {
          lastByte = -1;
          lastChar = '\0';
          _fallbackBuffer?.Reset();
      }

      // True when a byte or a high surrogate is still held by this decoder.
      internal override bool HasState => this.lastByte != -1 || this.lastChar != '\0';
  }
  1596. }
  1597. }