UTF8Encoding.cs 109 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126212721282129213021312132213321342135213621372138213921402141214221432144214521462147214821492150215121522153215421552156215721582159216021612162216321642165216621672168216921702171217221732174217521762177217821792180218121822183218421852186218721882189219021912192219321942195219621972198219922002201220222032204220522062207220822092210221122122213221422152216221722182219222022212222222322242225222622272228222922302231223222332234223522362237223822392240224122422243224422452246224722482249225022512252225322542255225622572258225922602261226222632264226522662267226822692270227122722273227422752276227
72278227922802281228222832284228522862287228822892290229122922293229422952296229722982299230023012302230323042305230623072308230923102311231223132314231523162317231823192320232123222323232423252326232723282329233023312332233323342335233623372338233923402341234223432344234523462347234823492350235123522353235423552356235723582359236023612362236323642365236623672368236923702371237223732374237523762377237823792380238123822383238423852386238723882389239023912392239323942395239623972398239924002401240224032404240524062407240824092410241124122413241424152416241724182419242024212422242324242425242624272428242924302431243224332434243524362437243824392440244124422443244424452446244724482449245024512452245324542455245624572458245924602461246224632464246524662467246824692470247124722473247424752476247724782479248024812482248324842485248624872488248924902491249224932494249524962497249824992500250125022503250425052506250725082509251025112512251325142515251625172518251925202521252225232524252525262527252825292530253125322533253425352536253725382539254025412542254325442545254625472548254925502551255225532554255525562557255825592560256125622563256425652566256725682569257025712572257325742575257625772578257925802581258225832584258525862587258825892590259125922593259425952596259725982599260026012602260326042605260626072608260926102611261226132614261526162617261826192620262126222623262426252626262726282629263026312632
  1. // Licensed to the .NET Foundation under one or more agreements.
  2. // The .NET Foundation licenses this file to you under the MIT license.
  3. // See the LICENSE file in the project root for more information.
  // The worker functions in this file were optimized for performance. If you make changes,
  // take care to consider all of the interesting cases.
  // The code of every worker function in this file is written twice: once as a slow loop, and a
  // second time as a fast loop. The slow loop handles all special cases, throws exceptions, etc.
  // The fast loop attempts to blaze through as fast as possible with optimistic range checks,
  // processing multiple characters at a time, and falling back to the slow loop for all special cases.
  10. // This define can be used to turn off the fast loops. Useful for finding whether
  11. // the problem is fastloop-specific.
  12. #define FASTLOOP
  13. using System;
  14. using System.Diagnostics;
  15. using System.Globalization;
  16. using System.Runtime.InteropServices;
  17. namespace System.Text
  18. {
  19. // Encodes text into and out of UTF-8. UTF-8 is a way of writing
  20. // Unicode characters with variable numbers of bytes per character,
  // optimized for the 128 ASCII characters (U+0000 through U+007F). It's an efficient way
  22. // of encoding US English in an internationalizable way.
  23. //
  24. // Don't override IsAlwaysNormalized because it is just a Unicode Transformation and could be confused.
  25. //
  26. // The UTF-8 byte order mark is simply the Unicode byte order mark
  27. // (0xFEFF) written in UTF-8 (0xEF 0xBB 0xBF). The byte order mark is
  28. // used mostly to distinguish UTF-8 text from other encodings, and doesn't
  29. // switch the byte orderings.
  30. public class UTF8Encoding : Encoding
  31. {
  32. /*
  33. bytes bits UTF-8 representation
  34. ----- ---- -----------------------------------
  35. 1 7 0vvvvvvv
  36. 2 11 110vvvvv 10vvvvvv
  37. 3 16 1110vvvv 10vvvvvv 10vvvvvv
  38. 4 21 11110vvv 10vvvvvv 10vvvvvv 10vvvvvv
  39. ----- ---- -----------------------------------
  40. Surrogate:
  41. Real Unicode value = (HighSurrogate - 0xD800) * 0x400 + (LowSurrogate - 0xDC00) + 0x10000
  42. */
  43. private const int UTF8_CODEPAGE = 65001;
  44. // Allow for de-virtualization (see https://github.com/dotnet/coreclr/pull/9230)
  // Sealed flavor of UTF8Encoding; being sealed is what allows for de-virtualization.
  internal sealed class UTF8EncodingSealed : UTF8Encoding
  {
      public UTF8EncodingSealed(bool encoderShouldEmitUTF8Identifier)
          : base(encoderShouldEmitUTF8Identifier)
      {
      }

      // The UTF-8 identifier bytes when this instance emits them, otherwise an empty span.
      public override ReadOnlySpan<byte> Preamble
      {
          get
          {
              if (_emitUTF8Identifier)
              {
                  return PreambleSpan;
              }

              return default;
          }
      }
  }
  50. // Used by Encoding.UTF8 for lazy initialization
  51. // The initialization code will not be run until a static member of the class is referenced
  // Shared instance, constructed so that it emits the UTF-8 identifier.
  internal static readonly UTF8EncodingSealed s_default = new UTF8EncodingSealed(encoderShouldEmitUTF8Identifier: true);

  // U+FEFF written as UTF-8. Handing back a constant byte array as a ReadOnlySpan<byte>
  // uses the C# compiler's optimization for static byte[] data.
  internal static ReadOnlySpan<byte> PreambleSpan => new byte[] { 0xEF, 0xBB, 0xBF };

  // Emitting U+FEFF as a UTF-8 identifier is something the standard does allow for.
  // Stays false (the field default) unless a constructor assigns it.
  internal readonly bool _emitUTF8Identifier;

  // Set from the throwOnInvalidBytes constructor argument; false (the field default) otherwise.
  private readonly bool _isThrowException;
  // Default instance: does not emit the UTF-8 identifier.
  public UTF8Encoding()
      : this(encoderShouldEmitUTF8Identifier: false)
  {
  }
  // Creates an instance for the UTF-8 code page and records whether the
  // UTF-8 identifier should be emitted.
  public UTF8Encoding(bool encoderShouldEmitUTF8Identifier)
      : base(UTF8_CODEPAGE)
  {
      this._emitUTF8Identifier = encoderShouldEmitUTF8Identifier;
  }
  // Creates an instance, choosing both whether the UTF-8 identifier is emitted and
  // whether invalid data results in an exception.
  public UTF8Encoding(bool encoderShouldEmitUTF8Identifier, bool throwOnInvalidBytes)
      : this(encoderShouldEmitUTF8Identifier)
  {
      _isThrowException = throwOnInvalidBytes;

      // Encoding's constructor has already set the fallbacks, but it did so before
      // _isThrowException was assigned, so redo them when throwing was requested.
      if (throwOnInvalidBytes)
      {
          SetDefaultFallbacks();
      }
  }
  // Installs the encoder and decoder fallbacks that correspond to _isThrowException.
  internal sealed override void SetDefaultFallbacks()
  {
      // Throwing was requested: use the exception fallbacks for both directions.
      if (_isThrowException)
      {
          this.encoderFallback = EncoderFallback.ExceptionFallback;
          this.decoderFallback = DecoderFallback.ExceptionFallback;
          return;
      }

      // Otherwise use replacement fallbacks whose replacement string is U+FFFD.
      this.encoderFallback = new EncoderReplacementFallback("\xFFFD");
      this.decoderFallback = new DecoderReplacementFallback("\xFFFD");
  }
  88. // WARNING: GetByteCount(string chars)
  89. // WARNING: has different variable names than EncodingNLS.cs, so this can't just be cut & pasted,
  90. // WARNING: otherwise it'll break VB's way of declaring these.
  91. //
  92. // The following methods are copied from EncodingNLS.cs.
  93. // Unfortunately EncodingNLS.cs is internal and we're public, so we have to re-implement them here.
  94. // These should be kept in sync for the following classes:
  95. // EncodingNLS, UTF7Encoding, UTF8Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding
  96. // Returns the number of bytes required to encode a range of characters in
  97. // a character array.
  98. //
  99. // All of our public Encodings that don't use EncodingNLS must have this (including EncodingNLS)
  100. // So if you fix this, fix the others. Currently those include:
  101. // EncodingNLS, UTF7Encoding, UTF8Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding
  102. // parent method is safe
  // Counts the bytes needed to encode count characters of chars, starting at index.
  public override unsafe int GetByteCount(char[] chars, int index, int count)
  {
      // Argument checks: missing array first, then negative values (index before count).
      if (chars == null)
      {
          throw new ArgumentNullException(nameof(chars), SR.ArgumentNull_Array);
      }

      if (index < 0)
      {
          throw new ArgumentOutOfRangeException(nameof(index), SR.ArgumentOutOfRange_NeedNonNegNum);
      }

      if (count < 0)
      {
          throw new ArgumentOutOfRangeException(nameof(count), SR.ArgumentOutOfRange_NeedNonNegNum);
      }

      // The requested range must fit in what is left of the array after index.
      int available = chars.Length - index;
      if (available < count)
      {
          throw new ArgumentOutOfRangeException(nameof(chars), SR.ArgumentOutOfRange_IndexCountBuffer);
      }

      // Nothing to count; this also avoids pinning an empty array.
      if (count == 0)
      {
          return 0;
      }

      // Pin the array and defer to the pointer-based overload, with no encoder.
      fixed (char* source = chars)
      {
          return GetByteCount(source + index, count, baseEncoder: null);
      }
  }
  119. // All of our public Encodings that don't use EncodingNLS must have this (including EncodingNLS)
  120. // So if you fix this, fix the others. Currently those include:
  121. // EncodingNLS, UTF7Encoding, UTF8Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding
  122. // parent method is safe
  // Counts the bytes needed to encode every character of the string chars.
  // Throws ArgumentNullException (ParamName "chars") when chars is null.
  public override unsafe int GetByteCount(string chars)
  {
      // Name the parameter as this signature declares it, so that ParamName
      // identifies the argument the caller actually passed.
      if (chars == null)
      {
          throw new ArgumentNullException(nameof(chars));
      }

      // Pin the string and defer to the pointer-based overload, with no encoder.
      fixed (char* pChars = chars)
      {
          return GetByteCount(pChars, chars.Length, null);
      }
  }
  131. // All of our public Encodings that don't use EncodingNLS must have this (including EncodingNLS)
  132. // So if you fix this, fix the others. Currently those include:
  133. // EncodingNLS, UTF7Encoding, UTF8Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding
  // Counts the bytes needed to encode count characters starting at the pointer chars.
  [CLSCompliant(false)]
  public override unsafe int GetByteCount(char* chars, int count)
  {
      if (chars == null)
      {
          throw new ArgumentNullException(nameof(chars), SR.ArgumentNull_Array);
      }

      if (count < 0)
      {
          throw new ArgumentOutOfRangeException(nameof(count), SR.ArgumentOutOfRange_NeedNonNegNum);
      }

      // Defer to the worker overload, passing no encoder.
      return GetByteCount(chars, count, baseEncoder: null);
  }
  // Counts the bytes needed to encode all characters in the span chars.
  public override unsafe int GetByteCount(ReadOnlySpan<char> chars)
  {
      int length = chars.Length;

      // NOTE(review): going by its name, GetNonNullPinnableReference yields a non-null
      // pointer even for an empty span - confirm against its definition.
      fixed (char* source = &MemoryMarshal.GetNonNullPinnableReference(chars))
      {
          return GetByteCount(source, length, baseEncoder: null);
      }
  }
  152. // Parent method is safe.
  153. // All of our public Encodings that don't use EncodingNLS must have this (including EncodingNLS)
  154. // So if you fix this, fix the others. Currently those include:
  155. // EncodingNLS, UTF7Encoding, UTF8Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding
  // Encodes charCount characters of s, starting at charIndex, into bytes starting at
  // byteIndex, and returns the result of the pointer-based overload.
  public override unsafe int GetBytes(string s, int charIndex, int charCount,
      byte[] bytes, int byteIndex)
  {
      // Null checks: s is reported before bytes.
      if (s == null)
      {
          throw new ArgumentNullException(nameof(s), SR.ArgumentNull_Array);
      }

      if (bytes == null)
      {
          throw new ArgumentNullException(nameof(bytes), SR.ArgumentNull_Array);
      }

      // Negative checks: charIndex is reported before charCount.
      if (charIndex < 0)
      {
          throw new ArgumentOutOfRangeException(nameof(charIndex), SR.ArgumentOutOfRange_NeedNonNegNum);
      }

      if (charCount < 0)
      {
          throw new ArgumentOutOfRangeException(nameof(charCount), SR.ArgumentOutOfRange_NeedNonNegNum);
      }

      // The requested characters must fit in what is left of the string after charIndex.
      int charsAvailable = s.Length - charIndex;
      if (charsAvailable < charCount)
      {
          throw new ArgumentOutOfRangeException(nameof(s), SR.ArgumentOutOfRange_IndexCount);
      }

      // byteIndex may equal bytes.Length (zero bytes of room) but may not exceed it.
      if (byteIndex < 0 || byteIndex > bytes.Length)
      {
          throw new ArgumentOutOfRangeException(nameof(byteIndex), SR.ArgumentOutOfRange_Index);
      }

      // Room left in the destination from byteIndex onward.
      int room = bytes.Length - byteIndex;

      fixed (char* source = s)
      fixed (byte* destination = &MemoryMarshal.GetReference((Span<byte>)bytes))
      {
          return GetBytes(source + charIndex, charCount, destination + byteIndex, room, baseEncoder: null);
      }
  }
  171. // Encodes a range of characters in a character array into a range of bytes
  172. // in a byte array. An exception occurs if the byte array is not large
  173. // enough to hold the complete encoding of the characters. The
  174. // GetByteCount method can be used to determine the exact number of
  175. // bytes that will be produced for a given range of characters.
  176. // Alternatively, the GetMaxByteCount method can be used to
  177. // determine the maximum number of bytes that will be produced for a given
  178. // number of characters, regardless of the actual character values.
  179. //
  180. // All of our public Encodings that don't use EncodingNLS must have this (including EncodingNLS)
  181. // So if you fix this, fix the others. Currently those include:
  182. // EncodingNLS, UTF7Encoding, UTF8Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding
  183. // parent method is safe
  // Encodes charCount characters of chars, starting at charIndex, into bytes starting at
  // byteIndex. Returns 0 immediately when charCount is 0.
  public override unsafe int GetBytes(char[] chars, int charIndex, int charCount,
      byte[] bytes, int byteIndex)
  {
      // Null checks: chars is reported before bytes.
      if (chars == null)
      {
          throw new ArgumentNullException(nameof(chars), SR.ArgumentNull_Array);
      }

      if (bytes == null)
      {
          throw new ArgumentNullException(nameof(bytes), SR.ArgumentNull_Array);
      }

      // Negative checks: charIndex is reported before charCount.
      if (charIndex < 0)
      {
          throw new ArgumentOutOfRangeException(nameof(charIndex), SR.ArgumentOutOfRange_NeedNonNegNum);
      }

      if (charCount < 0)
      {
          throw new ArgumentOutOfRangeException(nameof(charCount), SR.ArgumentOutOfRange_NeedNonNegNum);
      }

      // The requested characters must fit in what is left of the array after charIndex.
      int charsAvailable = chars.Length - charIndex;
      if (charsAvailable < charCount)
      {
          throw new ArgumentOutOfRangeException(nameof(chars), SR.ArgumentOutOfRange_IndexCountBuffer);
      }

      // byteIndex may equal bytes.Length (zero bytes of room) but may not exceed it.
      if (byteIndex < 0 || byteIndex > bytes.Length)
      {
          throw new ArgumentOutOfRangeException(nameof(byteIndex), SR.ArgumentOutOfRange_Index);
      }

      // Nothing to encode; this also avoids pinning an empty char array.
      if (charCount == 0)
      {
          return 0;
      }

      // What is passed on is the room left from byteIndex onward, not the array's length.
      int room = bytes.Length - byteIndex;

      fixed (char* source = chars)
      fixed (byte* destination = &MemoryMarshal.GetReference((Span<byte>)bytes))
      {
          return GetBytes(source + charIndex, charCount, destination + byteIndex, room, baseEncoder: null);
      }
  }
  205. // All of our public Encodings that don't use EncodingNLS must have this (including EncodingNLS)
  206. // So if you fix this, fix the others. Currently those include:
  207. // EncodingNLS, UTF7Encoding, UTF8Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding
  // Encodes charCount characters at chars into the byteCount bytes available at bytes.
  [CLSCompliant(false)]
  public override unsafe int GetBytes(char* chars, int charCount, byte* bytes, int byteCount)
  {
      // Null checks: bytes is reported before chars.
      if (bytes == null)
      {
          throw new ArgumentNullException(nameof(bytes), SR.ArgumentNull_Array);
      }

      if (chars == null)
      {
          throw new ArgumentNullException(nameof(chars), SR.ArgumentNull_Array);
      }

      // Negative checks: charCount is reported before byteCount.
      if (charCount < 0)
      {
          throw new ArgumentOutOfRangeException(nameof(charCount), SR.ArgumentOutOfRange_NeedNonNegNum);
      }

      if (byteCount < 0)
      {
          throw new ArgumentOutOfRangeException(nameof(byteCount), SR.ArgumentOutOfRange_NeedNonNegNum);
      }

      // Defer to the worker overload, passing no encoder.
      return GetBytes(chars, charCount, bytes, byteCount, baseEncoder: null);
  }
  // Encodes the characters in the span chars into the span bytes.
  public override unsafe int GetBytes(ReadOnlySpan<char> chars, Span<byte> bytes)
  {
      int charLength = chars.Length;
      int byteLength = bytes.Length;

      // Pin both spans for the duration of the pointer-based call.
      fixed (char* source = &MemoryMarshal.GetNonNullPinnableReference(chars))
      fixed (byte* destination = &MemoryMarshal.GetNonNullPinnableReference(bytes))
      {
          return GetBytes(source, charLength, destination, byteLength, baseEncoder: null);
      }
  }
  226. // Returns the number of characters produced by decoding a range of bytes
  227. // in a byte array.
  228. //
  229. // All of our public Encodings that don't use EncodingNLS must have this (including EncodingNLS)
  230. // So if you fix this, fix the others. Currently those include:
  231. // EncodingNLS, UTF7Encoding, UTF8Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding
  232. // parent method is safe
  // Counts the characters produced by decoding count bytes of bytes, starting at index.
  public override unsafe int GetCharCount(byte[] bytes, int index, int count)
  {
      // Argument checks: missing array first, then negative values (index before count).
      if (bytes == null)
      {
          throw new ArgumentNullException(nameof(bytes), SR.ArgumentNull_Array);
      }

      if (index < 0)
      {
          throw new ArgumentOutOfRangeException(nameof(index), SR.ArgumentOutOfRange_NeedNonNegNum);
      }

      if (count < 0)
      {
          throw new ArgumentOutOfRangeException(nameof(count), SR.ArgumentOutOfRange_NeedNonNegNum);
      }

      // The requested range must fit in what is left of the array after index.
      int available = bytes.Length - index;
      if (available < count)
      {
          throw new ArgumentOutOfRangeException(nameof(bytes), SR.ArgumentOutOfRange_IndexCountBuffer);
      }

      // Nothing to count; this also avoids pinning a zero-length array.
      if (count == 0)
      {
          return 0;
      }

      // Pin the array and defer to the pointer-based overload, with no decoder.
      fixed (byte* source = bytes)
      {
          return GetCharCount(source + index, count, baseDecoder: null);
      }
  }
  249. // All of our public Encodings that don't use EncodingNLS must have this (including EncodingNLS)
  250. // So if you fix this, fix the others. Currently those include:
  251. // EncodingNLS, UTF7Encoding, UTF8Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding
  // Counts the characters produced by decoding count bytes starting at the pointer bytes.
  [CLSCompliant(false)]
  public override unsafe int GetCharCount(byte* bytes, int count)
  {
      if (bytes == null)
      {
          throw new ArgumentNullException(nameof(bytes), SR.ArgumentNull_Array);
      }

      if (count < 0)
      {
          throw new ArgumentOutOfRangeException(nameof(count), SR.ArgumentOutOfRange_NeedNonNegNum);
      }

      // Defer to the worker overload, passing no decoder.
      return GetCharCount(bytes, count, baseDecoder: null);
  }
  // Counts the characters produced by decoding all bytes in the span bytes.
  public override unsafe int GetCharCount(ReadOnlySpan<byte> bytes)
  {
      int length = bytes.Length;

      // Pin the span for the duration of the pointer-based call.
      fixed (byte* source = &MemoryMarshal.GetNonNullPinnableReference(bytes))
      {
          return GetCharCount(source, length, baseDecoder: null);
      }
  }
  269. // All of our public Encodings that don't use EncodingNLS must have this (including EncodingNLS)
  270. // So if you fix this, fix the others. Currently those include:
  271. // EncodingNLS, UTF7Encoding, UTF8Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding
  272. // parent method is safe
  // Decodes byteCount bytes of bytes, starting at byteIndex, into chars starting at
  // charIndex. Returns 0 immediately when byteCount is 0.
  public override unsafe int GetChars(byte[] bytes, int byteIndex, int byteCount,
      char[] chars, int charIndex)
  {
      // Null checks: bytes is reported before chars.
      if (bytes == null)
      {
          throw new ArgumentNullException(nameof(bytes), SR.ArgumentNull_Array);
      }

      if (chars == null)
      {
          throw new ArgumentNullException(nameof(chars), SR.ArgumentNull_Array);
      }

      // Negative checks: byteIndex is reported before byteCount.
      if (byteIndex < 0)
      {
          throw new ArgumentOutOfRangeException(nameof(byteIndex), SR.ArgumentOutOfRange_NeedNonNegNum);
      }

      if (byteCount < 0)
      {
          throw new ArgumentOutOfRangeException(nameof(byteCount), SR.ArgumentOutOfRange_NeedNonNegNum);
      }

      // The requested bytes must fit in what is left of the array after byteIndex.
      int bytesAvailable = bytes.Length - byteIndex;
      if (bytesAvailable < byteCount)
      {
          throw new ArgumentOutOfRangeException(nameof(bytes), SR.ArgumentOutOfRange_IndexCountBuffer);
      }

      // charIndex may equal chars.Length (zero chars of room) but may not exceed it.
      if (charIndex < 0 || charIndex > chars.Length)
      {
          throw new ArgumentOutOfRangeException(nameof(charIndex), SR.ArgumentOutOfRange_Index);
      }

      // Nothing to decode; this also avoids pinning an empty byte array.
      if (byteCount == 0)
      {
          return 0;
      }

      // What is passed on is the room left from charIndex onward, not the array's length.
      int room = chars.Length - charIndex;

      fixed (byte* source = bytes)
      fixed (char* destination = &MemoryMarshal.GetReference((Span<char>)chars))
      {
          return GetChars(source + byteIndex, byteCount, destination + charIndex, room, baseDecoder: null);
      }
  }
  294. // All of our public Encodings that don't use EncodingNLS must have this (including EncodingNLS)
  295. // So if you fix this, fix the others. Currently those include:
  296. // EncodingNLS, UTF7Encoding, UTF8Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding
  // Decodes byteCount bytes at bytes into the charCount characters available at chars.
  // Throws ArgumentNullException when either pointer is null, and
  // ArgumentOutOfRangeException when either count is negative.
  [CLSCompliant(false)]
  public override unsafe int GetChars(byte* bytes, int byteCount, char* chars, int charCount)
  {
      // Validate parameters. When both pointers are null, bytes is the one reported.
      if (bytes == null || chars == null)
          throw new ArgumentNullException(bytes == null ? nameof(bytes) : nameof(chars), SR.ArgumentNull_Array);

      // When both counts are negative, charCount is the one reported.
      if (charCount < 0 || byteCount < 0)
          throw new ArgumentOutOfRangeException((charCount < 0 ? nameof(charCount) : nameof(byteCount)), SR.ArgumentOutOfRange_NeedNonNegNum);

      // Defer to the worker overload, passing no decoder.
      return GetChars(bytes, byteCount, chars, charCount, null);
  }
  // Decodes the bytes in the span bytes into the span chars.
  public override unsafe int GetChars(ReadOnlySpan<byte> bytes, Span<char> chars)
  {
      int byteLength = bytes.Length;
      int charLength = chars.Length;

      // Pin both spans for the duration of the pointer-based call.
      fixed (byte* source = &MemoryMarshal.GetNonNullPinnableReference(bytes))
      fixed (char* destination = &MemoryMarshal.GetNonNullPinnableReference(chars))
      {
          return GetChars(source, byteLength, destination, charLength, baseDecoder: null);
      }
  }
  315. // Returns a string containing the decoded representation of a range of
  316. // bytes in a byte array.
  317. //
  318. // All of our public Encodings that don't use EncodingNLS must have this (including EncodingNLS)
  319. // So if you fix this, fix the others. Currently those include:
  320. // EncodingNLS, UTF7Encoding, UTF8Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding
  321. // parent method is safe
  // Decodes count bytes of bytes, starting at index, into a string.
  // Returns string.Empty when count is 0.
  public override unsafe string GetString(byte[] bytes, int index, int count)
  {
      // Argument checks: missing array first, then negative values (index before count).
      if (bytes == null)
      {
          throw new ArgumentNullException(nameof(bytes), SR.ArgumentNull_Array);
      }

      if (index < 0)
      {
          throw new ArgumentOutOfRangeException(nameof(index), SR.ArgumentOutOfRange_NeedNonNegNum);
      }

      if (count < 0)
      {
          throw new ArgumentOutOfRangeException(nameof(count), SR.ArgumentOutOfRange_NeedNonNegNum);
      }

      // The requested range must fit in what is left of the array after index.
      int available = bytes.Length - index;
      if (available < count)
      {
          throw new ArgumentOutOfRangeException(nameof(bytes), SR.ArgumentOutOfRange_IndexCountBuffer);
      }

      // An empty range needs no decoding, and skipping it avoids pinning an empty buffer.
      if (count == 0)
      {
          return string.Empty;
      }

      // Pin the array and let the string be created from the selected bytes using this encoding.
      fixed (byte* source = bytes)
      {
          return string.CreateStringFromEncoding(source + index, count, this);
      }
  }
  337. //
  338. // End of standard methods copied from EncodingNLS.cs
  339. //
  340. // To simplify maintenance, the structure of GetByteCount and GetBytes should be
  341. // kept the same as much as possible
  342. internal sealed override unsafe int GetByteCount(char* chars, int count, EncoderNLS baseEncoder)
  343. {
  344. // For fallback we may need a fallback buffer.
  345. // We wait to initialize it though in case we don't have any broken input unicode
  346. EncoderFallbackBuffer fallbackBuffer = null;
  347. char* pSrcForFallback;
  348. char* pSrc = chars;
  349. char* pEnd = pSrc + count;
  350. // Start by assuming we have as many as count
  351. int byteCount = count;
  352. int ch = 0;
  353. if (baseEncoder != null)
  354. {
  355. UTF8Encoder encoder = (UTF8Encoder)baseEncoder;
  356. ch = encoder.surrogateChar;
  357. // We mustn't have left over fallback data when counting
  358. if (encoder.InternalHasFallbackBuffer)
  359. {
  360. fallbackBuffer = encoder.FallbackBuffer;
  361. if (fallbackBuffer.Remaining > 0)
  362. throw new ArgumentException(SR.Format(SR.Argument_EncoderFallbackNotEmpty, this.EncodingName, encoder.Fallback.GetType()));
  363. // Set our internal fallback interesting things.
  364. fallbackBuffer.InternalInitialize(chars, pEnd, encoder, false);
  365. }
  366. }
  367. for (;;)
  368. {
  369. // SLOWLOOP: does all range checks, handles all special cases, but it is slow
  370. if (pSrc >= pEnd)
  371. {
  372. if (ch == 0)
  373. {
  374. // Unroll any fallback that happens at the end
  375. ch = fallbackBuffer != null ? fallbackBuffer.InternalGetNextChar() : 0;
  376. if (ch > 0)
  377. {
  378. byteCount++;
  379. goto ProcessChar;
  380. }
  381. }
  382. else
  383. {
  384. // Case of surrogates in the fallback.
  385. if (fallbackBuffer != null && fallbackBuffer.bFallingBack)
  386. {
  387. Debug.Assert(ch >= 0xD800 && ch <= 0xDBFF,
  388. "[UTF8Encoding.GetBytes]expected high surrogate, not 0x" + ((int)ch).ToString("X4", CultureInfo.InvariantCulture));
  389. ch = fallbackBuffer.InternalGetNextChar();
  390. byteCount++;
  391. if (InRange(ch, CharUnicodeInfo.LOW_SURROGATE_START, CharUnicodeInfo.LOW_SURROGATE_END))
  392. {
  393. ch = 0xfffd;
  394. byteCount++;
  395. goto EncodeChar;
  396. }
  397. else if (ch > 0)
  398. {
  399. goto ProcessChar;
  400. }
  401. else
  402. {
  403. byteCount--; // ignore last one.
  404. break;
  405. }
  406. }
  407. }
  408. if (ch <= 0)
  409. {
  410. break;
  411. }
  412. if (baseEncoder != null && !baseEncoder.MustFlush)
  413. {
  414. break;
  415. }
  416. // attempt to encode the partial surrogate (will fallback or ignore it), it'll also subtract 1.
  417. byteCount++;
  418. goto EncodeChar;
  419. }
  420. if (ch > 0)
  421. {
  422. Debug.Assert(ch >= 0xD800 && ch <= 0xDBFF,
  423. "[UTF8Encoding.GetBytes]expected high surrogate, not 0x" + ((int)ch).ToString("X4", CultureInfo.InvariantCulture));
  424. // use separate helper variables for local contexts so that the jit optimizations
  425. // won't get confused about the variable lifetimes
  426. int cha = *pSrc;
  427. // count the pending surrogate
  428. byteCount++;
  429. // In previous byte, we encountered a high surrogate, so we are expecting a low surrogate here.
  430. // if (IsLowSurrogate(cha)) {
  431. if (InRange(cha, CharUnicodeInfo.LOW_SURROGATE_START, CharUnicodeInfo.LOW_SURROGATE_END))
  432. {
  433. // Don't need a real # because we're just counting, anything > 0x7ff ('cept surrogate) will do.
  434. ch = 0xfffd;
  435. // ch = cha + (ch << 10) +
  436. // (0x10000
  437. // - CharUnicodeInfo.LOW_SURROGATE_START
  438. // - (CharUnicodeInfo.HIGH_SURROGATE_START << 10) );
  439. // Use this next char
  440. pSrc++;
  441. }
  442. // else ch is still high surrogate and encoding will fail (so don't add count)
  443. // attempt to encode the surrogate or partial surrogate
  444. goto EncodeChar;
  445. }
  446. // If we've used a fallback, then we have to check for it
  447. if (fallbackBuffer != null)
  448. {
  449. ch = fallbackBuffer.InternalGetNextChar();
  450. if (ch > 0)
  451. {
  452. // We have an extra byte we weren't expecting.
  453. byteCount++;
  454. goto ProcessChar;
  455. }
  456. }
  457. // read next char. The JIT optimization seems to be getting confused when
  458. // compiling "ch = *pSrc++;", so rather use "ch = *pSrc; pSrc++;" instead
  459. ch = *pSrc;
  460. pSrc++;
  461. ProcessChar:
  462. // if (IsHighSurrogate(ch)) {
  463. if (InRange(ch, CharUnicodeInfo.HIGH_SURROGATE_START, CharUnicodeInfo.HIGH_SURROGATE_END))
  464. {
  465. // we will count this surrogate next time around
  466. byteCount--;
  467. continue;
  468. }
  469. // either good char or partial surrogate
  470. EncodeChar:
  471. // throw exception on partial surrogate if necessary
  472. // if (IsLowSurrogate(ch) || IsHighSurrogate(ch))
  473. if (InRange(ch, CharUnicodeInfo.HIGH_SURROGATE_START, CharUnicodeInfo.LOW_SURROGATE_END))
  474. {
  475. // Lone surrogates aren't allowed
  476. // Have to make a fallback buffer if we don't have one
  477. if (fallbackBuffer == null)
  478. {
  479. // wait on fallbacks if we can
  480. // For fallback we may need a fallback buffer
  481. if (baseEncoder == null)
  482. fallbackBuffer = this.encoderFallback.CreateFallbackBuffer();
  483. else
  484. fallbackBuffer = baseEncoder.FallbackBuffer;
  485. // Set our internal fallback interesting things.
  486. fallbackBuffer.InternalInitialize(chars, chars + count, baseEncoder, false);
  487. }
  488. // Do our fallback. Actually we already know its a mixed up surrogate,
  489. // so the ref pSrc isn't gonna do anything.
  490. pSrcForFallback = pSrc; // Avoid passing pSrc by reference to allow it to be en-registered
  491. fallbackBuffer.InternalFallback(unchecked((char)ch), ref pSrcForFallback);
  492. pSrc = pSrcForFallback;
  493. // Ignore it if we don't throw (we had preallocated this ch)
  494. byteCount--;
  495. ch = 0;
  496. continue;
  497. }
  498. // Count them
  499. if (ch > 0x7F)
  500. {
  501. if (ch > 0x7FF)
  502. {
  503. // the extra surrogate byte was compensated by the second surrogate character
  504. // (2 surrogates make 4 bytes. We've already counted 2 bytes, 1 per char)
  505. byteCount++;
  506. }
  507. byteCount++;
  508. }
  509. #if BIT64
  510. // check for overflow
  511. if (byteCount < 0)
  512. {
  513. break;
  514. }
  515. #endif
  516. #if FASTLOOP
  517. // If still have fallback don't do fast loop
  518. if (fallbackBuffer != null && (ch = fallbackBuffer.InternalGetNextChar()) != 0)
  519. {
  520. // We're reserving 1 byte for each char by default
  521. byteCount++;
  522. goto ProcessChar;
  523. }
  524. int availableChars = PtrDiff(pEnd, pSrc);
  525. // don't fall into the fast decoding loop if we don't have enough characters
  526. if (availableChars <= 13)
  527. {
  528. // try to get over the remainder of the ascii characters fast though
  529. char* pLocalEnd = pEnd; // hint to get pLocalEnd en-registered
  530. while (pSrc < pLocalEnd)
  531. {
  532. ch = *pSrc;
  533. pSrc++;
  534. if (ch > 0x7F)
  535. goto ProcessChar;
  536. }
  537. // we are done
  538. break;
  539. }
  540. #if BIT64
  541. // make sure that we won't get a silent overflow inside the fast loop
  542. // (Fall out to slow loop if we have this many characters)
  543. availableChars &= 0x0FFFFFFF;
  544. #endif
  545. // To compute the upper bound, assume that all characters are ASCII characters at this point,
  546. // the boundary will be decreased for every non-ASCII character we encounter
  547. // Also, we need 3 + 4 chars reserve for the unrolled ansi decoding loop and for decoding of surrogates
  548. char* pStop = pSrc + availableChars - (3 + 4);
  549. while (pSrc < pStop)
  550. {
  551. ch = *pSrc;
  552. pSrc++;
  553. if (ch > 0x7F) // Not ASCII
  554. {
  555. if (ch > 0x7FF) // Not 2 Byte
  556. {
  557. if ((ch & 0xF800) == 0xD800) // See if its a Surrogate
  558. goto LongCode;
  559. byteCount++;
  560. }
  561. byteCount++;
  562. }
  563. // get pSrc aligned
  564. if ((unchecked((int)pSrc) & 0x2) != 0)
  565. {
  566. ch = *pSrc;
  567. pSrc++;
  568. if (ch > 0x7F) // Not ASCII
  569. {
  570. if (ch > 0x7FF) // Not 2 Byte
  571. {
  572. if ((ch & 0xF800) == 0xD800) // See if its a Surrogate
  573. goto LongCode;
  574. byteCount++;
  575. }
  576. byteCount++;
  577. }
  578. }
  579. // Run 2 * 4 characters at a time!
  580. while (pSrc < pStop)
  581. {
  582. ch = *(int*)pSrc;
  583. int chc = *(int*)(pSrc + 2);
  584. if (((ch | chc) & unchecked((int)0xFF80FF80)) != 0) // See if not ASCII
  585. {
  586. if (((ch | chc) & unchecked((int)0xF800F800)) != 0) // See if not 2 Byte
  587. {
  588. goto LongCodeWithMask;
  589. }
  590. if ((ch & unchecked((int)0xFF800000)) != 0) // Actually 0x07800780 is all we care about (4 bits)
  591. byteCount++;
  592. if ((ch & unchecked((int)0xFF80)) != 0)
  593. byteCount++;
  594. if ((chc & unchecked((int)0xFF800000)) != 0)
  595. byteCount++;
  596. if ((chc & unchecked((int)0xFF80)) != 0)
  597. byteCount++;
  598. }
  599. pSrc += 4;
  600. ch = *(int*)pSrc;
  601. chc = *(int*)(pSrc + 2);
  602. if (((ch | chc) & unchecked((int)0xFF80FF80)) != 0) // See if not ASCII
  603. {
  604. if (((ch | chc) & unchecked((int)0xF800F800)) != 0) // See if not 2 Byte
  605. {
  606. goto LongCodeWithMask;
  607. }
  608. if ((ch & unchecked((int)0xFF800000)) != 0)
  609. byteCount++;
  610. if ((ch & unchecked((int)0xFF80)) != 0)
  611. byteCount++;
  612. if ((chc & unchecked((int)0xFF800000)) != 0)
  613. byteCount++;
  614. if ((chc & unchecked((int)0xFF80)) != 0)
  615. byteCount++;
  616. }
  617. pSrc += 4;
  618. }
  619. break;
  620. LongCodeWithMask:
  621. if (BitConverter.IsLittleEndian)
  622. {
  623. ch = (char)ch;
  624. }
  625. else
  626. {
  627. // be careful about the sign extension
  628. ch = (int)(((uint)ch) >> 16);
  629. }
  630. pSrc++;
  631. if (ch <= 0x7F)
  632. {
  633. continue;
  634. }
  635. LongCode:
  636. // use separate helper variables for slow and fast loop so that the jit optimizations
  637. // won't get confused about the variable lifetimes
  638. if (ch > 0x7FF)
  639. {
  640. // if (IsLowSurrogate(ch) || IsHighSurrogate(ch))
  641. if (InRange(ch, CharUnicodeInfo.HIGH_SURROGATE_START, CharUnicodeInfo.LOW_SURROGATE_END))
  642. {
  643. // 4 byte encoding - high surrogate + low surrogate
  644. int chd = *pSrc;
  645. if (
  646. // !IsHighSurrogate(ch) // low without high -> bad
  647. ch > CharUnicodeInfo.HIGH_SURROGATE_END ||
  648. // !IsLowSurrogate(chd) // high not followed by low -> bad
  649. !InRange(chd, CharUnicodeInfo.LOW_SURROGATE_START, CharUnicodeInfo.LOW_SURROGATE_END))
  650. {
  651. // Back up and drop out to slow loop to figure out error
  652. pSrc--;
  653. break;
  654. }
  655. pSrc++;
  656. // byteCount - this byte is compensated by the second surrogate character
  657. }
  658. byteCount++;
  659. }
  660. byteCount++;
  661. // byteCount - the last byte is already included
  662. }
  663. #endif // FASTLOOP
  664. // no pending char at this point
  665. ch = 0;
  666. }
  667. #if BIT64
  668. // check for overflow
  669. if (byteCount < 0)
  670. {
  671. throw new ArgumentException(
  672. SR.Argument_ConversionOverflow);
  673. }
  674. #endif
  675. Debug.Assert(fallbackBuffer == null || fallbackBuffer.Remaining == 0,
  676. "[UTF8Encoding.GetByteCount]Expected Empty fallback buffer");
  677. return byteCount;
  678. }
  // Returns the number of chars between two char pointers (a - b).
  // The byte distance is deliberately narrowed to an unsigned 32-bit value before
  // halving: unsigned arithmetic is sufficient for the callers here and tends to
  // produce tighter machine code than the default signed pointer subtraction.
  private static unsafe int PtrDiff(char* a, char* b)
  {
      uint byteDistance = (uint)((byte*)a - (byte*)b);
      uint charDistance = byteDistance >> 1; // two bytes per char
      return (int)charDistance;
  }
  // Byte-pointer counterpart of the char* overload, provided for symmetry:
  // returns the distance from b to a in bytes, truncated to int.
  private static unsafe int PtrDiff(byte* a, byte* b)
  {
      long distance = a - b;
      return (int)distance;
  }
  // Tests start <= ch <= end with a single comparison: subtracting start and
  // reinterpreting as unsigned turns any value below start into a huge number
  // that fails the upper-bound check.
  private static bool InRange(int ch, int start, int end)
  {
      uint offset = (uint)(ch - start);
      uint width = (uint)(end - start);
      return offset <= width;
  }
  // Encodes UTF-16 chars into UTF-8 bytes.
  //   chars, charCount - input buffer and the number of chars to read from it.
  //   bytes, byteCount - output buffer and its capacity in bytes.
  //   baseEncoder      - optional stateful encoder. When non-null it is cast to UTF8Encoder; a pending
  //                      high surrogate is read from encoder.surrogateChar on entry, and
  //                      encoder.surrogateChar / encoder._charsUsed are written back on exit.
  // Returns the number of bytes written to the output buffer (pTarget - bytes).
  // Throws ArgumentException if the encoder's fallback buffer still holds data on entry and
  // encoder._throwOnOverflow is set. Running out of output space is reported via ThrowBytesOverflow.
  // Structure: an outer "slow loop" handles surrogates, fallbacks and bounds one char at a time; when
  // FASTLOOP is defined, an inner loop handles runs of input optimistically and drops back to the slow
  // loop for every special case.
  695. // Our workhorse
  696. // Note: We ignore mismatched surrogates, unless the exception flag is set in which case we throw
  697. internal sealed override unsafe int GetBytes(
  698. char* chars, int charCount, byte* bytes, int byteCount, EncoderNLS baseEncoder)
  699. {
  700. Debug.Assert(chars != null, "[UTF8Encoding.GetBytes]chars!=null");
  701. Debug.Assert(byteCount >= 0, "[UTF8Encoding.GetBytes]byteCount >=0");
  702. Debug.Assert(charCount >= 0, "[UTF8Encoding.GetBytes]charCount >=0");
  703. Debug.Assert(bytes != null, "[UTF8Encoding.GetBytes]bytes!=null");
  704. UTF8Encoder encoder = null;
  705. // For fallback we may need a fallback buffer.
  706. // We wait to initialize it though in case we don't have any broken input unicode
  707. EncoderFallbackBuffer fallbackBuffer = null;
  708. char* pSrcForFallback;
  709. char* pSrc = chars;
  710. byte* pTarget = bytes;
  711. char* pEnd = pSrc + charCount;
  712. byte* pAllocatedBufferEnd = pTarget + byteCount;
  // ch is the char (or combined code point) currently being processed. A non-zero value at the top
  // of the loop is a pending high surrogate waiting for its low surrogate.
  713. int ch = 0;
  714. // assume that JIT will en-register pSrc, pTarget and ch
  715. if (baseEncoder != null)
  716. {
  717. encoder = (UTF8Encoder)baseEncoder;
  // Resume with any high surrogate left over from the previous call.
  718. ch = encoder.surrogateChar;
  719. // We mustn't have left over fallback data when counting
  720. if (encoder.InternalHasFallbackBuffer)
  721. {
  722. // We always need the fallback buffer in get bytes so we can flush any remaining ones if necessary
  723. fallbackBuffer = encoder.FallbackBuffer;
  724. if (fallbackBuffer.Remaining > 0 && encoder._throwOnOverflow)
  725. throw new ArgumentException(SR.Format(SR.Argument_EncoderFallbackNotEmpty, this.EncodingName, encoder.Fallback.GetType()));
  726. // Set our internal fallback interesting things.
  727. fallbackBuffer.InternalInitialize(chars, pEnd, encoder, true);
  728. }
  729. }
  730. for (;;)
  731. {
  732. // SLOWLOOP: does all range checks, handles all special cases, but it is slow
  // Input exhausted: drain the fallback buffer first, then deal with a pending high surrogate.
  733. if (pSrc >= pEnd)
  734. {
  735. if (ch == 0)
  736. {
  737. // Check if there's anything left to get out of the fallback buffer
  738. ch = fallbackBuffer != null ? fallbackBuffer.InternalGetNextChar() : 0;
  739. if (ch > 0)
  740. {
  741. goto ProcessChar;
  742. }
  743. }
  744. else
  745. {
  746. // Case of leftover surrogates in the fallback buffer
  747. if (fallbackBuffer != null && fallbackBuffer.bFallingBack)
  748. {
  749. Debug.Assert(ch >= 0xD800 && ch <= 0xDBFF,
  750. "[UTF8Encoding.GetBytes]expected high surrogate, not 0x" + ((int)ch).ToString("X4", CultureInfo.InvariantCulture));
  751. int cha = ch;
  752. ch = fallbackBuffer.InternalGetNextChar();
  753. if (InRange(ch, CharUnicodeInfo.LOW_SURROGATE_START, CharUnicodeInfo.LOW_SURROGATE_END))
  754. {
  // Combine the pending high surrogate (cha) with the low surrogate (ch) into one code point.
  755. ch = ch + (cha << 10) + (0x10000 - CharUnicodeInfo.LOW_SURROGATE_START - (CharUnicodeInfo.HIGH_SURROGATE_START << 10));
  756. goto EncodeChar;
  757. }
  758. else if (ch > 0)
  759. {
  760. goto ProcessChar;
  761. }
  762. else
  763. {
  764. break;
  765. }
  766. }
  767. }
  768. // attempt to encode the partial surrogate (will fail or ignore)
  769. if (ch > 0 && (encoder == null || encoder.MustFlush))
  770. goto EncodeChar;
  771. // We're done
  772. break;
  773. }
  774. if (ch > 0)
  775. {
  776. // We have a high surrogate left over from a previous loop.
  777. Debug.Assert(ch >= 0xD800 && ch <= 0xDBFF,
  778. "[UTF8Encoding.GetBytes]expected high surrogate, not 0x" + ((int)ch).ToString("X4", CultureInfo.InvariantCulture));
  779. // use separate helper variables for local contexts so that the jit optimizations
  780. // won't get confused about the variable lifetimes
  781. int cha = *pSrc;
  782. // In previous byte, we encountered a high surrogate, so we are expecting a low surrogate here.
  783. // if (IsLowSurrogate(cha)) {
  784. if (InRange(cha, CharUnicodeInfo.LOW_SURROGATE_START, CharUnicodeInfo.LOW_SURROGATE_END))
  785. {
  // Valid pair: fold both halves into one code point (> 0xFFFF) and consume the low surrogate.
  786. ch = cha + (ch << 10) +
  787. (0x10000
  788. - CharUnicodeInfo.LOW_SURROGATE_START
  789. - (CharUnicodeInfo.HIGH_SURROGATE_START << 10));
  790. pSrc++;
  791. }
  792. // else ch is still high surrogate and encoding will fail
  793. // attempt to encode the surrogate or partial surrogate
  794. goto EncodeChar;
  795. }
  796. // If we've used a fallback, then we have to check for it
  797. if (fallbackBuffer != null)
  798. {
  799. ch = fallbackBuffer.InternalGetNextChar();
  800. if (ch > 0) goto ProcessChar;
  801. }
  802. // read next char. The JIT optimization seems to be getting confused when
  803. // compiling "ch = *pSrc++;", so rather use "ch = *pSrc; pSrc++;" instead
  804. ch = *pSrc;
  805. pSrc++;
  806. ProcessChar:
  807. // if (IsHighSurrogate(ch)) {
  808. if (InRange(ch, CharUnicodeInfo.HIGH_SURROGATE_START, CharUnicodeInfo.HIGH_SURROGATE_END))
  809. {
  // Keep the high surrogate pending in ch; the next iteration pairs it with the following char.
  810. continue;
  811. }
  812. // either good char or partial surrogate
  813. EncodeChar:
  814. // throw exception on partial surrogate if necessary
  815. // if (IsLowSurrogate(ch) || IsHighSurrogate(ch))
  816. if (InRange(ch, CharUnicodeInfo.HIGH_SURROGATE_START, CharUnicodeInfo.LOW_SURROGATE_END))
  817. {
  818. // Lone surrogates aren't allowed, we have to do fallback for them
  819. // Have to make a fallback buffer if we don't have one
  820. if (fallbackBuffer == null)
  821. {
  822. // wait on fallbacks if we can
  823. // For fallback we may need a fallback buffer
  824. if (baseEncoder == null)
  825. fallbackBuffer = this.encoderFallback.CreateFallbackBuffer();
  826. else
  827. fallbackBuffer = baseEncoder.FallbackBuffer;
  828. // Set our internal fallback interesting things.
  829. fallbackBuffer.InternalInitialize(chars, pEnd, baseEncoder, true);
  830. }
  831. // Do our fallback. Actually we already know its a mixed up surrogate,
  832. // so the ref pSrc isn't gonna do anything.
  833. pSrcForFallback = pSrc; // Avoid passing pSrc by reference to allow it to be en-registered
  834. fallbackBuffer.InternalFallback(unchecked((char)ch), ref pSrcForFallback);
  835. pSrc = pSrcForFallback;
  836. // Ignore it if we don't throw
  837. ch = 0;
  838. continue;
  839. }
  840. // Count bytes needed
  841. int bytesNeeded = 1;
  842. if (ch > 0x7F)
  843. {
  844. if (ch > 0x7FF)
  845. {
  846. if (ch > 0xFFFF)
  847. {
  848. bytesNeeded++; // 4 bytes (surrogate pair)
  849. }
  850. bytesNeeded++; // 3 bytes (800-FFFF)
  851. }
  852. bytesNeeded++; // 2 bytes (80-7FF)
  853. }
  // Not enough room left in the output buffer for this char: un-read it and stop.
  854. if (pTarget > pAllocatedBufferEnd - bytesNeeded)
  855. {
  856. // Left over surrogate from last time will cause pSrc == chars, so we'll throw
  857. if (fallbackBuffer != null && fallbackBuffer.bFallingBack)
  858. {
  859. fallbackBuffer.MovePrevious(); // Didn't use this fallback char
  860. if (ch > 0xFFFF)
  861. fallbackBuffer.MovePrevious(); // Was surrogate, didn't use 2nd part either
  862. }
  863. else
  864. {
  865. pSrc--; // Didn't use this char
  866. if (ch > 0xFFFF)
  867. pSrc--; // Was surrogate, didn't use 2nd part either
  868. }
  869. Debug.Assert(pSrc >= chars || pTarget == bytes,
  870. "[UTF8Encoding.GetBytes]Expected pSrc to be within buffer or to throw with insufficient room.");
  871. ThrowBytesOverflow(encoder, pTarget == bytes); // Throw if we must
  872. ch = 0; // Nothing left over (we backed up to start of pair if supplementary)
  873. break;
  874. }
  // Emit the UTF-8 sequence. Lead bytes are written as they are computed; chb carries the byte
  // to be written next so the shared stores below serve the 2-, 3- and 4-byte cases.
  875. if (ch <= 0x7F)
  876. {
  877. *pTarget = (byte)ch;
  878. }
  879. else
  880. {
  881. // use separate helper variables for local contexts so that the jit optimizations
  882. // won't get confused about the variable lifetimes
  883. int chb;
  884. if (ch <= 0x7FF)
  885. {
  886. // 2 byte encoding
  887. chb = (byte)(unchecked((sbyte)0xC0) | (ch >> 6));
  888. }
  889. else
  890. {
  891. if (ch <= 0xFFFF)
  892. {
  // 3 byte encoding
  893. chb = (byte)(unchecked((sbyte)0xE0) | (ch >> 12));
  894. }
  895. else
  896. {
  // 4 byte encoding (code point built from a surrogate pair)
  897. *pTarget = (byte)(unchecked((sbyte)0xF0) | (ch >> 18));
  898. pTarget++;
  899. chb = unchecked((sbyte)0x80) | (ch >> 12) & 0x3F;
  900. }
  901. *pTarget = (byte)chb;
  902. pTarget++;
  903. chb = unchecked((sbyte)0x80) | (ch >> 6) & 0x3F;
  904. }
  905. *pTarget = (byte)chb;
  906. pTarget++;
  // final trailing byte: low 6 bits of the code point
  907. *pTarget = (byte)(unchecked((sbyte)0x80) | ch & 0x3F);
  908. }
  909. pTarget++;
  910. #if FASTLOOP
  911. // If still have fallback don't do fast loop
  912. if (fallbackBuffer != null && (ch = fallbackBuffer.InternalGetNextChar()) != 0)
  913. goto ProcessChar;
  914. int availableChars = PtrDiff(pEnd, pSrc);
  915. int availableBytes = PtrDiff(pAllocatedBufferEnd, pTarget);
  916. // don't fall into the fast decoding loop if we don't have enough characters
  917. // Note that if we don't have enough bytes, pStop will prevent us from entering the fast loop.
  918. if (availableChars <= 13)
  919. {
  920. // we are hoping for 1 byte per char
  921. if (availableBytes < availableChars)
  922. {
  923. // not enough output room. no pending bits at this point
  924. ch = 0;
  925. continue;
  926. }
  927. // try to get over the remainder of the ascii characters fast though
  928. char* pLocalEnd = pEnd; // hint to get pLocalEnd en-registered
  929. while (pSrc < pLocalEnd)
  930. {
  931. ch = *pSrc;
  932. pSrc++;
  933. // Not ASCII, need more than 1 byte per char
  934. if (ch > 0x7F)
  935. goto ProcessChar;
  936. *pTarget = (byte)ch;
  937. pTarget++;
  938. }
  939. // we are done, let ch be 0 to clear encoder
  940. ch = 0;
  941. break;
  942. }
  943. // we need at least 1 byte per character, but Convert might allow us to convert
  944. // only part of the input, so try as much as we can. Reduce charCount if necessary
  945. if (availableBytes < availableChars)
  946. {
  947. availableChars = availableBytes;
  948. }
  949. // FASTLOOP:
  950. // - optimistic range checks
  951. // - fallbacks to the slow loop for all special cases, exception throwing, etc.
  952. // To compute the upper bound, assume that all characters are ASCII characters at this point,
  953. // the boundary will be decreased for every non-ASCII character we encounter
  954. // Also, we need 5 chars reserve for the unrolled ansi decoding loop and for decoding of surrogates
  955. // If there aren't enough bytes for the output, then pStop will be <= pSrc and will bypass the loop.
  956. char* pStop = pSrc + availableChars - 5;
  957. while (pSrc < pStop)
  958. {
  959. ch = *pSrc;
  960. pSrc++;
  961. if (ch > 0x7F)
  962. {
  963. goto LongCode;
  964. }
  965. *pTarget = (byte)ch;
  966. pTarget++;
  967. // get pSrc aligned
  968. if ((unchecked((int)pSrc) & 0x2) != 0)
  969. {
  970. ch = *pSrc;
  971. pSrc++;
  972. if (ch > 0x7F)
  973. {
  974. goto LongCode;
  975. }
  976. *pTarget = (byte)ch;
  977. pTarget++;
  978. }
  979. // Run 4 characters at a time!
  980. while (pSrc < pStop)
  981. {
  // Load four chars as two 32-bit reads; the mask test is non-zero if any of them is above 0x7F.
  982. ch = *(int*)pSrc;
  983. int chc = *(int*)(pSrc + 2);
  984. if (((ch | chc) & unchecked((int)0xFF80FF80)) != 0)
  985. {
  986. goto LongCodeWithMask;
  987. }
  988. // Unfortunately, this is endianness sensitive
  989. if (BitConverter.IsLittleEndian)
  990. {
  991. *pTarget = (byte)ch;
  992. *(pTarget + 1) = (byte)(ch >> 16);
  993. pSrc += 4;
  994. *(pTarget + 2) = (byte)chc;
  995. *(pTarget + 3) = (byte)(chc >> 16);
  996. pTarget += 4;
  997. }
  998. else
  999. {
  1000. *pTarget = (byte)(ch>>16);
  1001. *(pTarget+1) = (byte)ch;
  1002. pSrc += 4;
  1003. *(pTarget+2) = (byte)(chc>>16);
  1004. *(pTarget+3) = (byte)chc;
  1005. pTarget += 4;
  1006. }
  1007. }
  1008. continue;
  1009. LongCodeWithMask:
  // ch holds two chars from the 32-bit read; keep only the first one in memory order and
  // advance past it. The remaining chars are re-read by the loop.
  1010. if (BitConverter.IsLittleEndian)
  1011. {
  1012. ch = (char)ch;
  1013. }
  1014. else
  1015. {
  1016. // be careful about the sign extension
  1017. ch = (int)(((uint)ch) >> 16);
  1018. }
  1019. pSrc++;
  1020. if (ch > 0x7F)
  1021. {
  1022. goto LongCode;
  1023. }
  1024. *pTarget = (byte)ch;
  1025. pTarget++;
  1026. continue;
  1027. LongCode:
  1028. // use separate helper variables for slow and fast loop so that the jit optimizations
  1029. // won't get confused about the variable lifetimes
  1030. int chd;
  1031. if (ch <= 0x7FF)
  1032. {
  1033. // 2 byte encoding
  1034. chd = unchecked((sbyte)0xC0) | (ch >> 6);
  1035. }
  1036. else
  1037. {
  1038. // if (!IsLowSurrogate(ch) && !IsHighSurrogate(ch))
  1039. if (!InRange(ch, CharUnicodeInfo.HIGH_SURROGATE_START, CharUnicodeInfo.LOW_SURROGATE_END))
  1040. {
  1041. // 3 byte encoding
  1042. chd = unchecked((sbyte)0xE0) | (ch >> 12);
  1043. }
  1044. else
  1045. {
  1046. // 4 byte encoding - high surrogate + low surrogate
  1047. // if (!IsHighSurrogate(ch))
  1048. if (ch > CharUnicodeInfo.HIGH_SURROGATE_END)
  1049. {
  1050. // low without high -> bad, try again in slow loop
  1051. pSrc -= 1;
  1052. break;
  1053. }
  1054. chd = *pSrc;
  1055. pSrc++;
  1056. // if (!IsLowSurrogate(chd)) {
  1057. if (!InRange(chd, CharUnicodeInfo.LOW_SURROGATE_START, CharUnicodeInfo.LOW_SURROGATE_END))
  1058. {
  1059. // high not followed by low -> bad, try again in slow loop
  1060. pSrc -= 2;
  1061. break;
  1062. }
  1063. ch = chd + (ch << 10) +
  1064. (0x10000
  1065. - CharUnicodeInfo.LOW_SURROGATE_START
  1066. - (CharUnicodeInfo.HIGH_SURROGATE_START << 10));
  1067. *pTarget = (byte)(unchecked((sbyte)0xF0) | (ch >> 18));
  1068. // pStop - this byte is compensated by the second surrogate character
  1069. // 2 input chars require 4 output bytes. 2 have been anticipated already
  1070. // and 2 more will be accounted for by the 2 pStop-- calls below.
  1071. pTarget++;
  1072. chd = unchecked((sbyte)0x80) | (ch >> 12) & 0x3F;
  1073. }
  1074. *pTarget = (byte)chd;
  1075. pStop--; // 3 byte sequence for 1 char, so need pStop-- and the one below too.
  1076. pTarget++;
  1077. chd = unchecked((sbyte)0x80) | (ch >> 6) & 0x3F;
  1078. }
  1079. *pTarget = (byte)chd;
  1080. pStop--; // 2 byte sequence for 1 char so need pStop--.
  1081. pTarget++;
  1082. *pTarget = (byte)(unchecked((sbyte)0x80) | ch & 0x3F);
  1083. // pStop - this byte is already included
  1084. pTarget++;
  1085. }
  1086. Debug.Assert(pTarget <= pAllocatedBufferEnd, "[UTF8Encoding.GetBytes]pTarget <= pAllocatedBufferEnd");
  1087. #endif // FASTLOOP
  1088. // no pending char at this point
  1089. ch = 0;
  1090. }
  1091. // Do we have to set the encoder bytes?
  1092. if (encoder != null)
  1093. {
  1094. Debug.Assert(!encoder.MustFlush || ch == 0,
  1095. "[UTF8Encoding.GetBytes] Expected no mustflush or 0 leftover ch " + ch.ToString("X2", CultureInfo.InvariantCulture));
  // Persist the pending high surrogate (0 if none) and how many input chars were consumed.
  1096. encoder.surrogateChar = ch;
  1097. encoder._charsUsed = (int)(pSrc - chars);
  1098. }
  1099. Debug.Assert(fallbackBuffer == null || fallbackBuffer.Remaining == 0 ||
  1100. baseEncoder == null || !baseEncoder._throwOnOverflow,
  1101. "[UTF8Encoding.GetBytes]Expected empty fallback buffer if not converting");
  // Number of bytes written to the output buffer.
  1102. return (int)(pTarget - bytes);
  1103. }
  1104. // These are bitmasks used to maintain the state in the decoder. They occupy the higher bits
  1105. // while the actual character is being built in the lower bits. They are shifted together
  1106. // with the actual bits of the character.
  1107. // bits 30 & 31 are used for pending bits fixup
  // FinalByte: GetCharCount ORs this flag in pre-shifted right by 6 bits per expected trailing byte and
  // shifts the accumulator left by 6 for each trailing byte it folds in, so bit 29 becomes set exactly
  // when the last trailing byte of the sequence has been consumed.
  1108. private const int FinalByte = 1 << 29;
  // SupplimentarySeq: set (together with pre-shifted copies) when a 4-byte lead byte is seen, marking the
  // sequence in progress as a supplementary character. The identifier's spelling is left as is (sic).
  1109. private const int SupplimentarySeq = 1 << 28;
  // ThreeByteSeq: set (together with pre-shifted copies) when a 3-byte lead byte is seen.
  1110. private const int ThreeByteSeq = 1 << 27;
  1111. // Note: We throw exceptions on individually encoded surrogates and other non-shortest forms.
  1112. // If exceptions aren't turned on, then we drop all non-shortest forms and individual surrogates.
  1113. //
  1114. // To simplify maintenance, the structure of GetCharCount and GetChars should be
  1115. // kept the same as much as possible
  1116. internal sealed override unsafe int GetCharCount(byte* bytes, int count, DecoderNLS baseDecoder)
  1117. {
  1118. Debug.Assert(count >= 0, "[UTF8Encoding.GetCharCount]count >=0");
  1119. Debug.Assert(bytes != null, "[UTF8Encoding.GetCharCount]bytes!=null");
  1120. // Initialize stuff
  1121. byte* pSrc = bytes;
  1122. byte* pEnd = pSrc + count;
  1123. // Start by assuming we have as many as count, charCount always includes the adjustment
  1124. // for the character being decoded
  1125. int charCount = count;
  1126. int ch = 0;
  1127. DecoderFallbackBuffer fallback = null;
  1128. if (baseDecoder != null)
  1129. {
  1130. UTF8Decoder decoder = (UTF8Decoder)baseDecoder;
  1131. ch = decoder.bits;
  1132. charCount -= (ch >> 30); // Adjust char count for # of expected bytes and expected output chars.
  1133. // Shouldn't have anything in fallback buffer for GetCharCount
  1134. // (don't have to check _throwOnOverflow for count)
  1135. Debug.Assert(!decoder.InternalHasFallbackBuffer || decoder.FallbackBuffer.Remaining == 0,
  1136. "[UTF8Encoding.GetCharCount]Expected empty fallback buffer at start");
  1137. }
  1138. for (;;)
  1139. {
  1140. // SLOWLOOP: does all range checks, handles all special cases, but it is slow
  1141. if (pSrc >= pEnd)
  1142. {
  1143. break;
  1144. }
  1145. if (ch == 0)
  1146. {
  1147. // no pending bits
  1148. goto ReadChar;
  1149. }
  1150. // read next byte. The JIT optimization seems to be getting confused when
  1151. // compiling "ch = *pSrc++;", so rather use "ch = *pSrc; pSrc++;" instead
  1152. int cha = *pSrc;
  1153. pSrc++;
  1154. // we are expecting to see trailing bytes like 10vvvvvv
  1155. if ((cha & unchecked((sbyte)0xC0)) != 0x80)
  1156. {
  1157. // This can be a valid starting byte for another UTF8 byte sequence, so let's put
  1158. // the current byte back, and try to see if this is a valid byte for another UTF8 byte sequence
  1159. pSrc--;
  1160. charCount += (ch >> 30);
  1161. goto InvalidByteSequence;
  1162. }
  1163. // fold in the new byte
  1164. ch = (ch << 6) | (cha & 0x3F);
  1165. if ((ch & FinalByte) == 0)
  1166. {
  1167. Debug.Assert((ch & (SupplimentarySeq | ThreeByteSeq)) != 0,
  1168. "[UTF8Encoding.GetChars]Invariant volation");
  1169. if ((ch & SupplimentarySeq) != 0)
  1170. {
  1171. if ((ch & (FinalByte >> 6)) != 0)
  1172. {
  1173. // this is 3rd byte (of 4 byte supplementary) - nothing to do
  1174. continue;
  1175. }
  1176. // 2nd byte, check for non-shortest form of supplementary char and the valid
  1177. // supplementary characters in range 0x010000 - 0x10FFFF at the same time
  1178. if (!InRange(ch & 0x1F0, 0x10, 0x100))
  1179. {
  1180. goto InvalidByteSequence;
  1181. }
  1182. }
  1183. else
  1184. {
  1185. // Must be 2nd byte of a 3-byte sequence
  1186. // check for non-shortest form of 3 byte seq
  1187. if ((ch & (0x1F << 5)) == 0 || // non-shortest form
  1188. (ch & (0xF800 >> 6)) == (0xD800 >> 6)) // illegal individually encoded surrogate
  1189. {
  1190. goto InvalidByteSequence;
  1191. }
  1192. }
  1193. continue;
  1194. }
  1195. // ready to punch
  1196. // adjust for surrogates in non-shortest form
  1197. if ((ch & (SupplimentarySeq | 0x1F0000)) == SupplimentarySeq)
  1198. {
  1199. charCount--;
  1200. }
  1201. goto EncodeChar;
  1202. InvalidByteSequence:
  1203. // this code fragment should be close to the goto referencing it
  1204. // Have to do fallback for invalid bytes
  1205. if (fallback == null)
  1206. {
  1207. if (baseDecoder == null)
  1208. fallback = this.decoderFallback.CreateFallbackBuffer();
  1209. else
  1210. fallback = baseDecoder.FallbackBuffer;
  1211. fallback.InternalInitialize(bytes, null);
  1212. }
  1213. charCount += FallbackInvalidByteSequence(pSrc, ch, fallback);
  1214. ch = 0;
  1215. continue;
  1216. ReadChar:
  1217. ch = *pSrc;
  1218. pSrc++;
  1219. ProcessChar:
  1220. if (ch > 0x7F)
  1221. {
  1222. // If its > 0x7F, its start of a new multi-byte sequence
  1223. // Long sequence, so unreserve our char.
  1224. charCount--;
  1225. // bit 6 has to be non-zero for start of multibyte chars.
  1226. if ((ch & 0x40) == 0)
  1227. {
  1228. // Unexpected trail byte
  1229. goto InvalidByteSequence;
  1230. }
  1231. // start a new long code
  1232. if ((ch & 0x20) != 0)
  1233. {
  1234. if ((ch & 0x10) != 0)
  1235. {
  1236. // 4 byte encoding - supplimentary character (2 surrogates)
  1237. ch &= 0x0F;
  1238. // check that bit 4 is zero and the valid supplimentary character
  1239. // range 0x000000 - 0x10FFFF at the same time
  1240. if (ch > 0x04)
  1241. {
  1242. ch |= 0xf0;
  1243. goto InvalidByteSequence;
  1244. }
  1245. // Add bit flags so that when we check new characters & rotate we'll be flagged correctly.
  1246. // Final byte flag, count fix if we don't make final byte & supplimentary sequence flag.
  1247. ch |= (FinalByte >> 3 * 6) | // Final byte is 3 more bytes from now
  1248. (1 << 30) | // If it dies on next byte we'll need an extra char
  1249. (3 << (30 - 2 * 6)) | // If it dies on last byte we'll need to subtract a char
  1250. (SupplimentarySeq) | (SupplimentarySeq >> 6) |
  1251. (SupplimentarySeq >> 2 * 6) | (SupplimentarySeq >> 3 * 6);
  1252. // Our character count will be 2 characters for these 4 bytes, so subtract another char
  1253. charCount--;
  1254. }
  1255. else
  1256. {
  1257. // 3 byte encoding
  1258. // Add bit flags so that when we check new characters & rotate we'll be flagged correctly.
  1259. ch = (ch & 0x0F) | ((FinalByte >> 2 * 6) | (1 << 30) |
  1260. (ThreeByteSeq) | (ThreeByteSeq >> 6) | (ThreeByteSeq >> 2 * 6));
  1261. // We'll expect 1 character for these 3 bytes, so subtract another char.
  1262. charCount--;
  1263. }
  1264. }
  1265. else
  1266. {
  1267. // 2 byte encoding
  1268. ch &= 0x1F;
  1269. // check for non-shortest form
  1270. if (ch <= 1)
  1271. {
  1272. ch |= 0xc0;
  1273. goto InvalidByteSequence;
  1274. }
  1275. // Add bit flags so we'll be flagged correctly
  1276. ch |= (FinalByte >> 6);
  1277. }
  1278. continue;
  1279. }
  1280. EncodeChar:
  1281. #if FASTLOOP
  1282. int availableBytes = PtrDiff(pEnd, pSrc);
  1283. // don't fall into the fast decoding loop if we don't have enough bytes
  1284. if (availableBytes <= 13)
  1285. {
  1286. // try to get over the remainder of the ascii characters fast though
  1287. byte* pLocalEnd = pEnd; // hint to get pLocalEnd en-registered
  1288. while (pSrc < pLocalEnd)
  1289. {
  1290. ch = *pSrc;
  1291. pSrc++;
  1292. if (ch > 0x7F)
  1293. goto ProcessChar;
  1294. }
  1295. // we are done
  1296. ch = 0;
  1297. break;
  1298. }
  1299. // To compute the upper bound, assume that all characters are ASCII characters at this point,
  1300. // the boundary will be decreased for every non-ASCII character we encounter
  1301. // Also, we need 7 chars reserve for the unrolled ansi decoding loop and for decoding of multibyte sequences
  1302. byte* pStop = pSrc + availableBytes - 7;
  1303. while (pSrc < pStop)
  1304. {
  1305. ch = *pSrc;
  1306. pSrc++;
  1307. if (ch > 0x7F)
  1308. {
  1309. goto LongCode;
  1310. }
  1311. // get pSrc 2-byte aligned
  1312. if ((unchecked((int)pSrc) & 0x1) != 0)
  1313. {
  1314. ch = *pSrc;
  1315. pSrc++;
  1316. if (ch > 0x7F)
  1317. {
  1318. goto LongCode;
  1319. }
  1320. }
  1321. // get pSrc 4-byte aligned
  1322. if ((unchecked((int)pSrc) & 0x2) != 0)
  1323. {
  1324. ch = *(ushort*)pSrc;
  1325. if ((ch & 0x8080) != 0)
  1326. {
  1327. goto LongCodeWithMask16;
  1328. }
  1329. pSrc += 2;
  1330. }
  1331. // Run 8 + 8 characters at a time!
  1332. while (pSrc < pStop)
  1333. {
  1334. ch = *(int*)pSrc;
  1335. int chb = *(int*)(pSrc + 4);
  1336. if (((ch | chb) & unchecked((int)0x80808080)) != 0)
  1337. {
  1338. goto LongCodeWithMask32;
  1339. }
  1340. pSrc += 8;
  1341. // This is a really small loop - unroll it
  1342. if (pSrc >= pStop)
  1343. break;
  1344. ch = *(int*)pSrc;
  1345. chb = *(int*)(pSrc + 4);
  1346. if (((ch | chb) & unchecked((int)0x80808080)) != 0)
  1347. {
  1348. goto LongCodeWithMask32;
  1349. }
  1350. pSrc += 8;
  1351. }
  1352. break;
  1353. LongCodeWithMask32:
  1354. if (BitConverter.IsLittleEndian)
  1355. {
  1356. ch &= 0xFF;
  1357. }
  1358. else
  1359. {
  1360. // be careful about the sign extension
  1361. ch = (int)(((uint)ch) >> 16);
  1362. }
  1363. LongCodeWithMask16:
  1364. if (BitConverter.IsLittleEndian)
  1365. {
  1366. ch &= 0xFF;
  1367. }
  1368. else
  1369. {
  1370. ch = (int)(((uint)ch) >> 8);
  1371. }
  1372. pSrc++;
  1373. if (ch <= 0x7F)
  1374. {
  1375. continue;
  1376. }
  1377. LongCode:
  1378. int chc = *pSrc;
  1379. pSrc++;
  1380. if (
  1381. // bit 6 has to be zero
  1382. (ch & 0x40) == 0 ||
  1383. // we are expecting to see trailing bytes like 10vvvvvv
  1384. (chc & unchecked((sbyte)0xC0)) != 0x80)
  1385. {
  1386. goto BadLongCode;
  1387. }
  1388. chc &= 0x3F;
  1389. // start a new long code
  1390. if ((ch & 0x20) != 0)
  1391. {
  1392. // fold the first two bytes together
  1393. chc |= (ch & 0x0F) << 6;
  1394. if ((ch & 0x10) != 0)
  1395. {
  1396. // 4 byte encoding - surrogate
  1397. ch = *pSrc;
  1398. if (
  1399. // check that bit 4 is zero, the non-shortest form of surrogate
  1400. // and the valid surrogate range 0x000000 - 0x10FFFF at the same time
  1401. !InRange(chc >> 4, 0x01, 0x10) ||
  1402. // we are expecting to see trailing bytes like 10vvvvvv
  1403. (ch & unchecked((sbyte)0xC0)) != 0x80)
  1404. {
  1405. goto BadLongCode;
  1406. }
  1407. chc = (chc << 6) | (ch & 0x3F);
  1408. ch = *(pSrc + 1);
  1409. // we are expecting to see trailing bytes like 10vvvvvv
  1410. if ((ch & unchecked((sbyte)0xC0)) != 0x80)
  1411. {
  1412. goto BadLongCode;
  1413. }
  1414. pSrc += 2;
  1415. // extra byte
  1416. charCount--;
  1417. }
  1418. else
  1419. {
  1420. // 3 byte encoding
  1421. ch = *pSrc;
  1422. if (
  1423. // check for non-shortest form of 3 byte seq
  1424. (chc & (0x1F << 5)) == 0 ||
  1425. // Can't have surrogates here.
  1426. (chc & (0xF800 >> 6)) == (0xD800 >> 6) ||
  1427. // we are expecting to see trailing bytes like 10vvvvvv
  1428. (ch & unchecked((sbyte)0xC0)) != 0x80)
  1429. {
  1430. goto BadLongCode;
  1431. }
  1432. pSrc++;
  1433. // extra byte
  1434. charCount--;
  1435. }
  1436. }
  1437. else
  1438. {
  1439. // 2 byte encoding
  1440. // check for non-shortest form
  1441. if ((ch & 0x1E) == 0)
  1442. {
  1443. goto BadLongCode;
  1444. }
  1445. }
  1446. // extra byte
  1447. charCount--;
  1448. }
  1449. #endif // FASTLOOP
  1450. // no pending bits at this point
  1451. ch = 0;
  1452. continue;
  1453. BadLongCode:
  1454. pSrc -= 2;
  1455. ch = 0;
  1456. continue;
  1457. }
  1458. // May have a problem if we have to flush
  1459. if (ch != 0)
  1460. {
  1461. // We were already adjusting for these, so need to un-adjust
  1462. charCount += (ch >> 30);
  1463. if (baseDecoder == null || baseDecoder.MustFlush)
  1464. {
  1465. // Have to do fallback for invalid bytes
  1466. if (fallback == null)
  1467. {
  1468. if (baseDecoder == null)
  1469. fallback = this.decoderFallback.CreateFallbackBuffer();
  1470. else
  1471. fallback = baseDecoder.FallbackBuffer;
  1472. fallback.InternalInitialize(bytes, null);
  1473. }
  1474. charCount += FallbackInvalidByteSequence(pSrc, ch, fallback);
  1475. }
  1476. }
  1477. // Shouldn't have anything in fallback buffer for GetCharCount
  1478. // (don't have to check _throwOnOverflow for count)
  1479. Debug.Assert(fallback == null || fallback.Remaining == 0,
  1480. "[UTF8Encoding.GetCharCount]Expected empty fallback buffer at end");
  1481. return charCount;
  1482. }
  1483. // WARNING: If we throw an error, then System.Resources.ResourceReader calls this method.
  1484. // So if we're really broken, then that could also throw an error... recursively.
  1485. // So try to make sure GetChars can at least process all uses by
  1486. // System.Resources.ResourceReader!
  1487. //
  1488. // Note: We throw exceptions on individually encoded surrogates and other non-shortest forms.
  1489. // If exceptions aren't turned on, then we drop all non-shortest & individual surrogates.
  1490. //
  1491. // To simplify maintenance, the structure of GetCharCount and GetChars should be
  1492. // kept the same as much as possible
  //
  // Decodes the UTF-8 bytes in [bytes, bytes + byteCount) into [chars, chars + charCount) and
  // returns the number of chars written.
  //   bytes / byteCount - input buffer and its length in bytes (asserted non-null / non-negative).
  //   chars / charCount - output buffer and its capacity in chars (asserted non-null / non-negative).
  //   baseDecoder       - optional stateful decoder; when non-null its pending bits seed the loop, and
  //                       the leftover bits and the number of bytes consumed are stored back on exit.
  // Invalid byte sequences are routed through the decoder fallback buffer. When the output buffer is
  // too small, ThrowCharsOverflow is called; if it returns, decoding stops with pSrc backed up to the
  // start of the sequence that did not fit.
  //
  // The body is one outer loop with two parts: a slow path that validates byte by byte and keeps the
  // partially decoded sequence (plus marker bits) in ch, and, under FASTLOOP, a fast path that is
  // entered after a char has been written and that bails out to BadLongCode on anything suspicious.
  1493. internal sealed override unsafe int GetChars(
  1494. byte* bytes, int byteCount, char* chars, int charCount, DecoderNLS baseDecoder)
  1495. {
  1496. Debug.Assert(chars != null, "[UTF8Encoding.GetChars]chars!=null");
  1497. Debug.Assert(byteCount >= 0, "[UTF8Encoding.GetChars]count >=0");
  1498. Debug.Assert(charCount >= 0, "[UTF8Encoding.GetChars]charCount >=0");
  1499. Debug.Assert(bytes != null, "[UTF8Encoding.GetChars]bytes!=null");
  1500. byte* pSrc = bytes;
  1501. char* pTarget = chars;
  1502. byte* pEnd = pSrc + byteCount;
  1503. char* pAllocatedBufferEnd = pTarget + charCount;
  // ch holds the bits accumulated for the sequence in progress; 0 means "no pending bits"
  1504. int ch = 0;
  1505. DecoderFallbackBuffer fallback = null;
  1506. byte* pSrcForFallback;
  1507. char* pTargetForFallback;
  1508. if (baseDecoder != null)
  1509. {
  1510. UTF8Decoder decoder = (UTF8Decoder)baseDecoder;
  // resume from whatever partial sequence the previous call left behind
  1511. ch = decoder.bits;
  1512. // Shouldn't have anything in fallback buffer for GetChars
  1513. // (don't have to check _throwOnOverflow for chars, we always use all or none so always should be empty)
  1514. Debug.Assert(!decoder.InternalHasFallbackBuffer || decoder.FallbackBuffer.Remaining == 0,
  1515. "[UTF8Encoding.GetChars]Expected empty fallback buffer at start");
  1516. }
  1517. for (;;)
  1518. {
  1519. // SLOWLOOP: does all range checks, handles all special cases, but it is slow
  1520. if (pSrc >= pEnd)
  1521. {
  1522. break;
  1523. }
  1524. if (ch == 0)
  1525. {
  1526. // no pending bits
  1527. goto ReadChar;
  1528. }
  1529. // read next byte. The JIT optimization seems to be getting confused when
  1530. // compiling "ch = *pSrc++;", so rather use "ch = *pSrc; pSrc++;" instead
  1531. int cha = *pSrc;
  1532. pSrc++;
  1533. // we are expecting to see trailing bytes like 10vvvvvv
  1534. if ((cha & unchecked((sbyte)0xC0)) != 0x80)
  1535. {
  1536. // This can be a valid starting byte for another UTF8 byte sequence, so let's put
  1537. // the current byte back, and try to see if this is a valid byte for another UTF8 byte sequence
  1538. pSrc--;
  1539. goto InvalidByteSequence;
  1540. }
  1541. // fold in the new byte
  1542. ch = (ch << 6) | (cha & 0x3F);
  1543. if ((ch & FinalByte) == 0)
  1544. {
  1545. // Not at last byte yet
  1546. Debug.Assert((ch & (SupplimentarySeq | ThreeByteSeq)) != 0,
  1547. "[UTF8Encoding.GetChars]Invariant volation");
  1548. if ((ch & SupplimentarySeq) != 0)
  1549. {
  1550. // It's a 4-byte supplementary sequence
  1551. if ((ch & (FinalByte >> 6)) != 0)
  1552. {
  1553. // this is 3rd byte of 4 byte sequence - nothing to do
  1554. continue;
  1555. }
  1556. // 2nd byte of 4 bytes
  1557. // check for non-shortest form of surrogate and the valid surrogate
  1558. // range 0x000000 - 0x10FFFF at the same time
  1559. if (!InRange(ch & 0x1F0, 0x10, 0x100))
  1560. {
  1561. goto InvalidByteSequence;
  1562. }
  1563. }
  1564. else
  1565. {
  1566. // Must be 2nd byte of a 3-byte sequence
  1567. // check for non-shortest form of 3 byte seq
  1568. if ((ch & (0x1F << 5)) == 0 || // non-shortest form
  1569. (ch & (0xF800 >> 6)) == (0xD800 >> 6)) // illegal individually encoded surrogate
  1570. {
  1571. goto InvalidByteSequence;
  1572. }
  1573. }
  1574. continue;
  1575. }
  1576. // ready to punch
  1577. // surrogate in shortest form?
  1578. // Might be possible to get rid of this? Already did non-shortest check for 4-byte sequence when reading 2nd byte?
  1579. if ((ch & (SupplimentarySeq | 0x1F0000)) > SupplimentarySeq)
  1580. {
  1581. // let the range check for the second char throw the exception
  1582. if (pTarget < pAllocatedBufferEnd)
  1583. {
  // emit the high surrogate now; ch becomes the low surrogate, which EncodeChar writes
  1584. *pTarget = (char)(((ch >> 10) & 0x7FF) +
  1585. unchecked((short)((CharUnicodeInfo.HIGH_SURROGATE_START - (0x10000 >> 10)))));
  1586. pTarget++;
  1587. ch = (ch & 0x3FF) +
  1588. unchecked((int)(CharUnicodeInfo.LOW_SURROGATE_START));
  1589. }
  1590. }
  1591. goto EncodeChar;
  1592. InvalidByteSequence:
  1593. // this code fragment should be close to the gotos referencing it
  1594. // Have to do fallback for invalid bytes
  1595. if (fallback == null)
  1596. {
  1597. if (baseDecoder == null)
  1598. fallback = this.decoderFallback.CreateFallbackBuffer();
  1599. else
  1600. fallback = baseDecoder.FallbackBuffer;
  1601. fallback.InternalInitialize(bytes, pAllocatedBufferEnd);
  1602. }
  1603. // That'll back us up the appropriate # of bytes if we didn't get anywhere
  1604. pSrcForFallback = pSrc; // Avoid passing pSrc by reference to allow it to be en-registered
  1605. pTargetForFallback = pTarget; // Avoid passing pTarget by reference to allow it to be en-registered
  1606. bool fallbackResult = FallbackInvalidByteSequence(ref pSrcForFallback, ch, fallback, ref pTargetForFallback);
  1607. pSrc = pSrcForFallback;
  1608. pTarget = pTargetForFallback;
  1609. if (!fallbackResult)
  1610. {
  1611. // Ran out of buffer space
  1612. // Need to throw an exception?
  1613. Debug.Assert(pSrc >= bytes || pTarget == chars,
  1614. "[UTF8Encoding.GetChars]Expected to throw or remain in byte buffer after fallback");
  1615. fallback.InternalReset();
  1616. ThrowCharsOverflow(baseDecoder, pTarget == chars);
  // didn't throw: drop the pending bits (pSrc was already backed up) and stop decoding
  1617. ch = 0;
  1618. break;
  1619. }
  1620. Debug.Assert(pSrc >= bytes,
  1621. "[UTF8Encoding.GetChars]Expected invalid byte sequence to have remained within the byte array");
  1622. ch = 0;
  1623. continue;
  1624. ReadChar:
  1625. ch = *pSrc;
  1626. pSrc++;
  1627. ProcessChar:
  1628. if (ch > 0x7F)
  1629. {
  1630. // If it's > 0x7F, it's the start of a new multi-byte sequence
  1631. // bit 6 has to be non-zero
  1632. if ((ch & 0x40) == 0)
  1633. {
  1634. goto InvalidByteSequence;
  1635. }
  1636. // start a new long code
  1637. if ((ch & 0x20) != 0)
  1638. {
  1639. if ((ch & 0x10) != 0)
  1640. {
  1641. // 4 byte encoding - supplementary character (2 surrogates)
  1642. ch &= 0x0F;
  1643. // check that bit 4 is zero and the valid supplementary character
  1644. // range 0x000000 - 0x10FFFF at the same time
  1645. if (ch > 0x04)
  1646. {
  // restore the original lead byte value so the fallback sees the byte that was read
  1647. ch |= 0xf0;
  1648. goto InvalidByteSequence;
  1649. }
  1650. ch |= (FinalByte >> 3 * 6) | (1 << 30) | (3 << (30 - 2 * 6)) |
  1651. (SupplimentarySeq) | (SupplimentarySeq >> 6) |
  1652. (SupplimentarySeq >> 2 * 6) | (SupplimentarySeq >> 3 * 6);
  1653. }
  1654. else
  1655. {
  1656. // 3 byte encoding
  1657. ch = (ch & 0x0F) | ((FinalByte >> 2 * 6) | (1 << 30) |
  1658. (ThreeByteSeq) | (ThreeByteSeq >> 6) | (ThreeByteSeq >> 2 * 6));
  1659. }
  1660. }
  1661. else
  1662. {
  1663. // 2 byte encoding
  1664. ch &= 0x1F;
  1665. // check for non-shortest form
  1666. if (ch <= 1)
  1667. {
  // restore the original lead byte value so the fallback sees the byte that was read
  1668. ch |= 0xc0;
  1669. goto InvalidByteSequence;
  1670. }
  1671. ch |= (FinalByte >> 6);
  1672. }
  1673. continue;
  1674. }
  1675. EncodeChar:
  1676. // write the pending character
  1677. if (pTarget >= pAllocatedBufferEnd)
  1678. {
  1679. // Fix chars so we make sure to throw if we didn't output anything
  1680. ch &= 0x1fffff;
  // back pSrc up by the number of bytes the pending character consumed
  1681. if (ch > 0x7f)
  1682. {
  1683. if (ch > 0x7ff)
  1684. {
  1685. if (ch >= CharUnicodeInfo.LOW_SURROGATE_START &&
  1686. ch <= CharUnicodeInfo.LOW_SURROGATE_END)
  1687. {
  1688. pSrc--; // It was 4 bytes
  1689. pTarget--; // 1 was stored already, but we can't remember 1/2, so back up
  1690. }
  1691. else if (ch > 0xffff)
  1692. {
  1693. pSrc--; // It was 4 bytes, nothing was stored
  1694. }
  1695. pSrc--; // It was at least 3 bytes
  1696. }
  1697. pSrc--; // It was at least 2 bytes
  1698. }
  1699. pSrc--;
  1700. // Throw that we don't have enough room (pSrc could be < bytes if we had started to process
  1701. // a 4 byte sequence already)
  1702. Debug.Assert(pSrc >= bytes || pTarget == chars,
  1703. "[UTF8Encoding.GetChars]Expected pSrc to be within input buffer or throw due to no output]");
  1704. ThrowCharsOverflow(baseDecoder, pTarget == chars);
  1705. // Don't store ch in decoder, we already backed up to its start
  1706. ch = 0;
  1707. // Didn't throw, just use this buffer size.
  1708. break;
  1709. }
  1710. *pTarget = (char)ch;
  1711. pTarget++;
  1712. #if FASTLOOP
  1713. int availableChars = PtrDiff(pAllocatedBufferEnd, pTarget);
  1714. int availableBytes = PtrDiff(pEnd, pSrc);
  1715. // don't fall into the fast decoding loop if we don't have enough bytes
  1716. // Test for availableChars is done because pStop would be <= pTarget.
  1717. if (availableBytes <= 13)
  1718. {
  1719. // we may need as many as 1 character per byte
  1720. if (availableChars < availableBytes)
  1721. {
  1722. // not enough output room. no pending bits at this point
  1723. ch = 0;
  1724. continue;
  1725. }
  1726. // try to get over the remainder of the ascii characters fast though
  1727. byte* pLocalEnd = pEnd; // hint to get pLocalEnd enregistered
  1728. while (pSrc < pLocalEnd)
  1729. {
  1730. ch = *pSrc;
  1731. pSrc++;
  1732. if (ch > 0x7F)
  1733. goto ProcessChar;
  1734. *pTarget = (char)ch;
  1735. pTarget++;
  1736. }
  1737. // we are done
  1738. ch = 0;
  1739. break;
  1740. }
  1741. // we may need as many as 1 character per byte, so reduce the byte count if necessary.
  1742. // If availableChars is too small, pStop will be before pTarget and we won't do fast loop.
  1743. if (availableChars < availableBytes)
  1744. {
  1745. availableBytes = availableChars;
  1746. }
  1747. // To compute the upper bound, assume that all characters are ASCII characters at this point,
  1748. // the boundary will be decreased for every non-ASCII character we encounter
  1749. // Also, we need 7 chars reserve for the unrolled ansi decoding loop and for decoding of multibyte sequences
  1750. char* pStop = pTarget + availableBytes - 7;
  1751. while (pTarget < pStop)
  1752. {
  1753. ch = *pSrc;
  1754. pSrc++;
  1755. if (ch > 0x7F)
  1756. {
  1757. goto LongCode;
  1758. }
  1759. *pTarget = (char)ch;
  1760. pTarget++;
  1761. // get pSrc to be 2-byte aligned
  1762. if ((unchecked((int)pSrc) & 0x1) != 0)
  1763. {
  1764. ch = *pSrc;
  1765. pSrc++;
  1766. if (ch > 0x7F)
  1767. {
  1768. goto LongCode;
  1769. }
  1770. *pTarget = (char)ch;
  1771. pTarget++;
  1772. }
  1773. // get pSrc to be 4-byte aligned
  1774. if ((unchecked((int)pSrc) & 0x2) != 0)
  1775. {
  1776. ch = *(ushort*)pSrc;
  1777. if ((ch & 0x8080) != 0)
  1778. {
  1779. goto LongCodeWithMask16;
  1780. }
  1781. // Unfortunately, this is endianness sensitive
  1782. if (BitConverter.IsLittleEndian)
  1783. {
  1784. *pTarget = (char)(ch & 0x7F);
  1785. pSrc += 2;
  1786. *(pTarget + 1) = (char)((ch >> 8) & 0x7F);
  1787. pTarget += 2;
  1788. }
  1789. else
  1790. {
  1791. *pTarget = (char)((ch >> 8) & 0x7F);
  1792. pSrc += 2;
  1793. *(pTarget+1) = (char)(ch & 0x7F);
  1794. pTarget += 2;
  1795. }
  1796. }
  1797. // Run 8 characters at a time!
  1798. while (pTarget < pStop)
  1799. {
  1800. ch = *(int*)pSrc;
  1801. int chb = *(int*)(pSrc + 4);
  // any byte with its high bit set means a non-ASCII byte is somewhere in these 8 bytes
  1802. if (((ch | chb) & unchecked((int)0x80808080)) != 0)
  1803. {
  1804. goto LongCodeWithMask32;
  1805. }
  1806. // Unfortunately, this is endianness sensitive
  1807. if (BitConverter.IsLittleEndian)
  1808. {
  1809. *pTarget = (char)(ch & 0x7F);
  1810. *(pTarget + 1) = (char)((ch >> 8) & 0x7F);
  1811. *(pTarget + 2) = (char)((ch >> 16) & 0x7F);
  1812. *(pTarget + 3) = (char)((ch >> 24) & 0x7F);
  1813. pSrc += 8;
  1814. *(pTarget + 4) = (char)(chb & 0x7F);
  1815. *(pTarget + 5) = (char)((chb >> 8) & 0x7F);
  1816. *(pTarget + 6) = (char)((chb >> 16) & 0x7F);
  1817. *(pTarget + 7) = (char)((chb >> 24) & 0x7F);
  1818. pTarget += 8;
  1819. }
  1820. else
  1821. {
  1822. *pTarget = (char)((ch >> 24) & 0x7F);
  1823. *(pTarget+1) = (char)((ch >> 16) & 0x7F);
  1824. *(pTarget+2) = (char)((ch >> 8) & 0x7F);
  1825. *(pTarget+3) = (char)(ch & 0x7F);
  1826. pSrc += 8;
  1827. *(pTarget+4) = (char)((chb >> 24) & 0x7F);
  1828. *(pTarget+5) = (char)((chb >> 16) & 0x7F);
  1829. *(pTarget+6) = (char)((chb >> 8) & 0x7F);
  1830. *(pTarget+7) = (char)(chb & 0x7F);
  1831. pTarget += 8;
  1832. }
  1833. }
  1834. break;
  // The two labels below reduce the multi-byte read to its first byte in memory order;
  // control falls through from LongCodeWithMask32 into LongCodeWithMask16.
  1835. LongCodeWithMask32:
  1836. if (BitConverter.IsLittleEndian)
  1837. {
  1838. ch &= 0xFF;
  1839. }
  1840. else
  1841. {
  1842. // be careful about the sign extension
  1843. ch = (int)(((uint)ch) >> 16);
  1844. }
  1845. LongCodeWithMask16:
  1846. if (BitConverter.IsLittleEndian)
  1847. {
  1848. ch &= 0xFF;
  1849. }
  1850. else
  1851. {
  1852. ch = (int)(((uint)ch) >> 8);
  1853. }
  1854. pSrc++;
  1855. if (ch <= 0x7F)
  1856. {
  1857. *pTarget = (char)ch;
  1858. pTarget++;
  1859. continue;
  1860. }
  1861. LongCode:
  1862. int chc = *pSrc;
  1863. pSrc++;
  1864. if (
  1865. // bit 6 has to be non-zero
  1866. (ch & 0x40) == 0 ||
  1867. // we are expecting to see trailing bytes like 10vvvvvv
  1868. (chc & unchecked((sbyte)0xC0)) != 0x80)
  1869. {
  1870. goto BadLongCode;
  1871. }
  1872. chc &= 0x3F;
  1873. // start a new long code
  1874. if ((ch & 0x20) != 0)
  1875. {
  1876. // fold the first two bytes together
  1877. chc |= (ch & 0x0F) << 6;
  1878. if ((ch & 0x10) != 0)
  1879. {
  1880. // 4 byte encoding - surrogate
  1881. ch = *pSrc;
  1882. if (
  1883. // check that bit 4 is zero, the non-shortest form of surrogate
  1884. // and the valid surrogate range 0x000000 - 0x10FFFF at the same time
  1885. !InRange(chc >> 4, 0x01, 0x10) ||
  1886. // we are expecting to see trailing bytes like 10vvvvvv
  1887. (ch & unchecked((sbyte)0xC0)) != 0x80)
  1888. {
  1889. goto BadLongCode;
  1890. }
  1891. chc = (chc << 6) | (ch & 0x3F);
  1892. ch = *(pSrc + 1);
  1893. // we are expecting to see trailing bytes like 10vvvvvv
  1894. if ((ch & unchecked((sbyte)0xC0)) != 0x80)
  1895. {
  1896. goto BadLongCode;
  1897. }
  // pSrc is only advanced past the 3rd and 4th bytes once both have been validated
  1898. pSrc += 2;
  1899. ch = (chc << 6) | (ch & 0x3F);
  1900. *pTarget = (char)(((ch >> 10) & 0x7FF) +
  1901. unchecked((short)(CharUnicodeInfo.HIGH_SURROGATE_START - (0x10000 >> 10))));
  1902. pTarget++;
  1903. ch = (ch & 0x3FF) +
  1904. unchecked((short)(CharUnicodeInfo.LOW_SURROGATE_START));
  1905. // extra byte, we're already planning 2 chars for 2 of these bytes,
  1906. // but the big loop is testing the target against pStop, so we need
  1907. // to subtract 2 more or we risk overrunning the input. Subtract
  1908. // one here and one below.
  1909. pStop--;
  1910. }
  1911. else
  1912. {
  1913. // 3 byte encoding
  1914. ch = *pSrc;
  1915. if (
  1916. // check for non-shortest form of 3 byte seq
  1917. (chc & (0x1F << 5)) == 0 ||
  1918. // Can't have surrogates here.
  1919. (chc & (0xF800 >> 6)) == (0xD800 >> 6) ||
  1920. // we are expecting to see trailing bytes like 10vvvvvv
  1921. (ch & unchecked((sbyte)0xC0)) != 0x80)
  1922. {
  1923. goto BadLongCode;
  1924. }
  1925. pSrc++;
  1926. ch = (chc << 6) | (ch & 0x3F);
  1927. // extra byte, we're only expecting 1 char for each of these 3 bytes,
  1928. // but the loop is testing the target (not source) against pStop, so
  1929. // we need to subtract 2 more or we risk overrunning the input.
  1930. // Subtract 1 here and one more below
  1931. pStop--;
  1932. }
  1933. }
  1934. else
  1935. {
  1936. // 2 byte encoding
  1937. ch &= 0x1F;
  1938. // check for non-shortest form
  1939. if (ch <= 1)
  1940. {
  1941. goto BadLongCode;
  1942. }
  1943. ch = (ch << 6) | chc;
  1944. }
  1945. *pTarget = (char)ch;
  1946. pTarget++;
  1947. // extra byte, we're only expecting 1 char for each of these 2 bytes,
  1948. // but the loop is testing the target (not source) against pStop.
  1949. // subtract an extra count from pStop so that we don't overrun the input.
  1950. pStop--;
  1951. }
  1952. #endif // FASTLOOP
  1953. Debug.Assert(pTarget <= pAllocatedBufferEnd, "[UTF8Encoding.GetChars]pTarget <= pAllocatedBufferEnd");
  1954. // no pending bits at this point
  1955. ch = 0;
  1956. continue;
  1957. BadLongCode:
  // Every jump here happens after exactly two increments of pSrc (lead byte + first trail byte).
  // Rewind both and clear ch so the slow loop re-reads the sequence with full validation.
  1958. pSrc -= 2;
  1959. ch = 0;
  1960. continue;
  1961. }
  // Input exhausted with a partial sequence pending: fall it back only when there is no decoder
  // to carry the state over, or the decoder has been told to flush.
  1962. if (ch != 0 && (baseDecoder == null || baseDecoder.MustFlush))
  1963. {
  1964. // Have to do fallback for invalid bytes
  1965. if (fallback == null)
  1966. {
  1967. if (baseDecoder == null)
  1968. fallback = this.decoderFallback.CreateFallbackBuffer();
  1969. else
  1970. fallback = baseDecoder.FallbackBuffer;
  1971. fallback.InternalInitialize(bytes, pAllocatedBufferEnd);
  1972. }
  1973. // That'll back us up the appropriate # of bytes if we didn't get anywhere
  1974. pSrcForFallback = pSrc; // Avoid passing pSrc by reference to allow it to be en-registered
  1975. pTargetForFallback = pTarget; // Avoid passing pTarget by reference to allow it to be en-registered
  1976. bool fallbackResult = FallbackInvalidByteSequence(ref pSrcForFallback, ch, fallback, ref pTargetForFallback);
  1977. pSrc = pSrcForFallback;
  1978. pTarget = pTargetForFallback;
  1979. if (!fallbackResult)
  1980. {
  1981. Debug.Assert(pSrc >= bytes || pTarget == chars,
  1982. "[UTF8Encoding.GetChars]Expected to throw or remain in byte buffer while flushing");
  1983. // Ran out of buffer space
  1984. // Need to throw an exception?
  1985. fallback.InternalReset();
  1986. ThrowCharsOverflow(baseDecoder, pTarget == chars);
  1987. }
  1988. Debug.Assert(pSrc >= bytes,
  1989. "[UTF8Encoding.GetChars]Expected flushing invalid byte sequence to have remained within the byte array");
  1990. ch = 0;
  1991. }
  1992. if (baseDecoder != null)
  1993. {
  1994. UTF8Decoder decoder = (UTF8Decoder)baseDecoder;
  1995. // If we're storing flush data we expect all bits to be used or else
  1996. // we're stuck in the middle of a conversion
  1997. Debug.Assert(!baseDecoder.MustFlush || ch == 0 || !baseDecoder._throwOnOverflow,
  1998. "[UTF8Encoding.GetChars]Expected no must flush or no left over bits or no throw on overflow.");
  1999. // Remember our leftover bits.
  2000. decoder.bits = ch;
  // report how many input bytes were consumed by this call
  2001. baseDecoder._bytesUsed = (int)(pSrc - bytes);
  2002. }
  2003. // Shouldn't have anything in fallback buffer for GetChars
  2004. // (don't have to check _throwOnOverflow for chars)
  2005. Debug.Assert(fallback == null || fallback.Remaining == 0,
  2006. "[UTF8Encoding.GetChars]Expected empty fallback buffer at end");
  // number of chars written to the output buffer
  2007. return PtrDiff(pTarget, chars);
  2008. }
  // GetChars helper: runs the decoder fallback for an invalid byte sequence.
  //   pSrc     - current read position; on failure it is moved back to the start of the bad sequence,
  //              on success it is left untouched.
  //   ch       - the decoder's pending bits describing the bad sequence.
  //   fallback - fallback buffer that produces the replacement chars.
  //   pTarget  - current write position, advanced by the fallback as it emits chars.
  // Returns true when the fallback output was written, false when the fallback reported failure.
  private unsafe bool FallbackInvalidByteSequence(
      ref byte* pSrc, int ch, DecoderFallbackBuffer fallback, ref char* pTarget)
  {
      // Work on a copy so that pSrc itself is only rewound when the fallback fails.
      byte* pSequenceStart = pSrc;
      byte[] invalidBytes = GetBytesUnknown(ref pSequenceStart, ch);

      bool succeeded = fallback.InternalFallback(invalidBytes, pSrc, ref pTarget);
      if (succeeded)
      {
          return true;
      }

      // The fallback failed: hand the caller the start of the offending sequence.
      pSrc = pSequenceStart;
      return false;
  }
  // GetCharCount helper: asks the decoder fallback how many chars an invalid byte sequence yields.
  //   pSrc     - read position just past the invalid bytes; the fallback uses it only to compute
  //              the index of the invalid bytes.
  //   ch       - the decoder's pending bits describing the bad sequence.
  //   fallback - fallback buffer that reports the replacement length.
  // Returns the number of fallback chars expected. This path is only reached for "long" sequences,
  // whose pre-reserved per-byte count has already been un-reserved by the caller.
  private unsafe int FallbackInvalidByteSequence(
      byte* pSrc, int ch, DecoderFallbackBuffer fallback)
  {
      // GetBytesUnknown rewinds the pointer it is given. Let it rewind a scratch copy so the
      // fallback still receives the un-adjusted position (an adjusted one could yield a negative index).
      byte* pRewound = pSrc;
      byte[] invalidBytes = GetBytesUnknown(ref pRewound, ch);

      return fallback.InternalFallback(invalidBytes, pSrc);
  }
  // Rebuilds the raw bytes of an incomplete or invalid sequence from the decoder's pending bits and
  // rewinds pSrc by the number of bytes rebuilt.
  // The bytes are regenerated from ch rather than re-read through pSrc because some of them may have
  // come from a previous call, so simply decrementing the pointer and reading is not reliable.
  //   pSrc - read position; decremented by the length of the returned array.
  //   ch   - pending bits (payload plus the FinalByte / SupplimentarySeq / ThreeByteSeq markers).
  // Returns the reconstructed bytes (1 to 3 of them).
  private unsafe byte[] GetBytesUnknown(ref byte* pSrc, int ch)
  {
      // A plain single byte. The lower bound matters because the marker flags can make ch negative.
      if (ch >= 0 && ch < 0x100)
      {
          pSrc--;
          return new byte[] { unchecked((byte)ch) };
      }

      // Neither a three-byte nor a four-byte marker: an unfinished two-byte sequence.
      if ((ch & (SupplimentarySeq | ThreeByteSeq)) == 0)
      {
          pSrc--;
          return new byte[] { unchecked((byte)((ch & 0x1F) | 0xc0)) };
      }

      bool isFourByteSequence = (ch & SupplimentarySeq) != 0;
      if (isFourByteSequence)
      {
          // Three bytes of the four-byte sequence were accumulated.
          if ((ch & (FinalByte >> 6)) != 0)
          {
              pSrc -= 3;
              return new byte[]
              {
                  unchecked((byte)(((ch >> 12) & 0x07) | 0xF0)),
                  unchecked((byte)(((ch >> 6) & 0x3F) | 0x80)),
                  unchecked((byte)(((ch) & 0x3F) | 0x80))
              };
          }

          // Two bytes of the four-byte sequence were accumulated.
          if ((ch & (FinalByte >> 12)) != 0)
          {
              pSrc -= 2;
              return new byte[]
              {
                  unchecked((byte)(((ch >> 6) & 0x07) | 0xF0)),
                  unchecked((byte)(((ch) & 0x3F) | 0x80))
              };
          }

          // Only the lead byte of the four-byte sequence was accumulated.
          pSrc--;
          return new byte[] { unchecked((byte)(((ch) & 0x07) | 0xF0)) };
      }

      // Three-byte sequence with two bytes accumulated.
      if ((ch & (FinalByte >> 6)) != 0)
      {
          pSrc -= 2;
          return new byte[]
          {
              unchecked((byte)(((ch >> 6) & 0x0F) | 0xE0)),
              unchecked((byte)(((ch) & 0x3F) | 0x80))
          };
      }

      // Three-byte sequence with only the lead byte accumulated.
      pSrc--;
      return new byte[] { unchecked((byte)(((ch) & 0x0F) | 0xE0)) };
  }
  // Creates a stateful decoder bound to this encoding instance.
  public override Decoder GetDecoder() => new UTF8Decoder(this);
  // Creates a stateful encoder bound to this encoding instance.
  public override Encoder GetEncoder() => new UTF8Encoder(this);
  // Returns an upper bound on the number of bytes needed to encode charCount chars.
  // Throws ArgumentOutOfRangeException when charCount is negative or the bound does not fit in an int.
  public override int GetMaxByteCount(int charCount)
  {
      if (charCount < 0)
      {
          throw new ArgumentOutOfRangeException(nameof(charCount),
              SR.ArgumentOutOfRange_NeedNonNegNum);
      }

      // A fallback may expand each char into several; only scale when it produces more than one.
      long fallbackScale = EncoderFallback.MaxCharCount > 1 ? EncoderFallback.MaxCharCount : 1;

      // One extra char covers a high surrogate left over from a previous call.
      // At most 3 bytes per char (a surrogate pair is 4 bytes for 2 chars).
      long worstCase = ((long)charCount + 1) * fallbackScale * 3;

      if (worstCase > int.MaxValue)
      {
          throw new ArgumentOutOfRangeException(nameof(charCount), SR.ArgumentOutOfRange_GetByteCountOverflow);
      }

      return (int)worstCase;
  }
  // Returns an upper bound on the number of chars produced by decoding byteCount bytes.
  // Throws ArgumentOutOfRangeException when byteCount is negative or the bound does not fit in an int.
  public override int GetMaxCharCount(int byteCount)
  {
      if (byteCount < 0)
      {
          throw new ArgumentOutOfRangeException(nameof(byteCount),
              SR.ArgumentOutOfRange_NeedNonNegNum);
      }

      // Every byte could be invalid on its own (e.g. a lead byte followed by another lead byte),
      // so each byte may trigger a fallback; only scale when the fallback emits more than one char.
      long fallbackScale = DecoderFallback.MaxCharCount > 1 ? DecoderFallback.MaxCharCount : 1;

      // One char per input byte, plus one in case the first byte completes a pending surrogate pair.
      long worstCase = ((long)byteCount + 1) * fallbackScale;

      if (worstCase > int.MaxValue)
      {
          throw new ArgumentOutOfRangeException(nameof(byteCount), SR.ArgumentOutOfRange_GetCharCountOverflow);
      }

      return (int)worstCase;
  }
  // Returns the UTF-8 byte order mark (EF BB BF) when this instance emits an identifier,
  // otherwise an empty array.
  public override byte[] GetPreamble()
  {
      if (!_emitUTF8Identifier)
      {
          return Array.Empty<byte>();
      }

      // A fresh array on every call, so callers cannot modify a shared instance.
      return new byte[3] { 0xEF, 0xBB, 0xBF };
  }
  // Span view of the preamble. Derived types are routed through GetPreamble() in case they
  // override it; for UTF8Encoding itself the result is PreambleSpan or an empty span.
  public override ReadOnlySpan<byte> Preamble
  {
      get
      {
          if (GetType() != typeof(UTF8Encoding))
          {
              return new ReadOnlySpan<byte>(GetPreamble());
          }

          if (_emitUTF8Identifier)
          {
              return PreambleSpan;
          }

          return default;
      }
  }
  // Two UTF8Encoding instances are equal when they agree on identifier emission and on both fallbacks.
  // Anything that is not a UTF8Encoding (including null) compares unequal.
  public override bool Equals(object value)
  {
      if (!(value is UTF8Encoding other))
      {
          return false;
      }

      return _emitUTF8Identifier == other._emitUTF8Identifier
          && EncoderFallback.Equals(other.EncoderFallback)
          && DecoderFallback.Equals(other.DecoderFallback);
  }
  // Hash built from the same inputs Equals compares, plus the code page constant.
  // The distribution is not great, but this type is unlikely to be used as a hashtable key.
  public override int GetHashCode()
  {
      int hash = EncoderFallback.GetHashCode();
      hash += DecoderFallback.GetHashCode();
      hash += UTF8_CODEPAGE;
      hash += _emitUTF8Identifier ? 1 : 0;
      return hash;
  }
  // Stateful encoder for UTF8Encoding. The only state it carries between calls is a pending
  // high surrogate that is waiting for its low surrogate.
  private sealed class UTF8Encoder : EncoderNLS
  {
      // High surrogate saved from the previous call; 0 when nothing is pending.
      internal int surrogateChar;

      // The base constructor calls Reset, so there is nothing more to initialize here.
      public UTF8Encoder(UTF8Encoding encoding)
          : base(encoding)
      {
      }

      // Discards the pending surrogate and resets the fallback buffer if one exists.
      public override void Reset()
      {
          surrogateChar = 0;
          _fallbackBuffer?.Reset();
      }

      // True while a surrogate is still pending.
      internal override bool HasState => surrogateChar != 0;
  }
  // Stateful decoder for UTF8Encoding. The only state it carries between calls is the bits of
  // a partially decoded sequence.
  private sealed class UTF8Decoder : DecoderNLS
  {
      // Pending bits of an unfinished sequence; 0 when nothing is pending.
      // See the comments around the definition of FinalByte for the layout.
      internal int bits;

      // The base constructor calls Reset, so there is nothing more to initialize here.
      public UTF8Decoder(UTF8Encoding encoding)
          : base(encoding)
      {
      }

      // Discards the pending bits and resets the fallback buffer if one exists.
      public override void Reset()
      {
          bits = 0;
          _fallbackBuffer?.Reset();
      }

      // True while a partial sequence is still pending.
      internal override bool HasState => bits != 0;
  }
  2235. }
  2236. }