ASCIIUtility.cs 75 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731
  1. // Licensed to the .NET Foundation under one or more agreements.
  2. // The .NET Foundation licenses this file to you under the MIT license.
  3. // See the LICENSE file in the project root for more information.
  4. using System.Diagnostics;
  5. using System.Numerics;
  6. using System.Runtime.CompilerServices;
  7. using System.Runtime.Intrinsics;
  8. using System.Runtime.Intrinsics.X86;
  9. using Internal.Runtime.CompilerServices;
  10. #if BIT64
  11. using nint = System.Int64;
  12. using nuint = System.UInt64;
  13. #else // BIT64
  14. using nint = System.Int32;
  15. using nuint = System.UInt32;
  16. #endif // BIT64
  17. namespace System.Text
  18. {
  19. internal static partial class ASCIIUtility
  20. {
  21. #if DEBUG
  // DEBUG-only type initializer: verifies that the 'nint' / 'nuint' aliases declared at the top
  // of this file are pointer-sized and have the expected signedness for the current build.
  static ASCIIUtility()
  {
      bool nintIsPointerSizedAndSigned = (sizeof(nint) == IntPtr.Size) && (nint.MinValue < 0);
      Debug.Assert(nintIsPointerSizedAndSigned, "nint is defined incorrectly.");

      bool nuintIsPointerSizedAndUnsigned = (sizeof(nuint) == IntPtr.Size) && (nuint.MinValue == 0);
      Debug.Assert(nuintIsPointerSizedAndUnsigned, "nuint is defined incorrectly.");
  }
  27. #endif // DEBUG
  /// <summary>
  /// Returns <see langword="true"/> iff none of the bits selected by
  /// <c>UInt64HighBitsOnlyMask</c> (declared elsewhere in this class) are set in
  /// <paramref name="value"/>.
  /// </summary>
  [MethodImpl(MethodImplOptions.AggressiveInlining)]
  private static bool AllBytesInUInt64AreAscii(ulong value)
  {
      // A byte whose high bit is set is non-ASCII, so isolate those bits and test for zero.
      ulong nonAsciiBits = value & UInt64HighBitsOnlyMask;
      return nonAsciiBits == 0;
  }
  /// <summary>
  /// Determines whether both 16-bit chars packed into <paramref name="value"/> are ASCII
  /// (0x0000 - 0x007F).
  /// </summary>
  /// <param name="value">Two packed 16-bit chars.</param>
  /// <returns><see langword="true"/> iff neither char has a bit set outside 0x007F.</returns>
  [MethodImpl(MethodImplOptions.AggressiveInlining)]
  private static bool AllCharsInUInt32AreAscii(uint value)
  {
      // Bits that must be clear in every ASCII char; identical to ~0x007F007Fu.
      const uint NonAsciiBitsOfEachChar = 0xFF80FF80u;
      return (value & NonAsciiBitsOfEachChar) == 0;
  }
  /// <summary>
  /// Determines whether all four 16-bit chars packed into <paramref name="value"/> are ASCII
  /// (0x0000 - 0x007F).
  /// </summary>
  /// <param name="value">Four packed 16-bit chars.</param>
  /// <returns><see langword="true"/> iff no char has a bit set outside 0x007F.</returns>
  [MethodImpl(MethodImplOptions.AggressiveInlining)]
  private static bool AllCharsInUInt64AreAscii(ulong value)
  {
      // Bits that must be clear in every ASCII char; identical to ~0x007F007F_007F007Ful.
      const ulong NonAsciiBitsOfEachChar = 0xFF80FF80_FF80FF80ul;
      return (value & NonAsciiBitsOfEachChar) == 0;
  }
  /// <summary>
  /// Given a DWORD which represents two packed chars in machine-endian order, returns
  /// <see langword="true"/> iff the first char (in machine-endian order) is ASCII.
  /// </summary>
  /// <param name="value">Two packed 16-bit chars in machine-endian order.</param>
  /// <returns>
  /// <see langword="true"/> when the first char has no bit set outside 0x007F; the second
  /// char is not inspected.
  /// </returns>
  private static bool FirstCharInUInt32IsAscii(uint value)
  {
      // The first char occupies the low 16 bits on little-endian machines and the high 16 bits
      // on big-endian machines; pick the matching "non-ASCII bits" mask.
      uint nonAsciiBitsOfFirstChar = BitConverter.IsLittleEndian ? 0x0000FF80u : 0xFF800000u;
      return (value & nonAsciiBitsOfFirstChar) == 0;
  }
  /// <summary>
  /// Returns the index in <paramref name="pBuffer"/> where the first non-ASCII byte is found.
  /// Returns <paramref name="bufferLength"/> if the buffer is empty or all-ASCII.
  /// </summary>
  /// <param name="pBuffer">Start of the byte buffer to scan.</param>
  /// <param name="bufferLength">Number of bytes available at <paramref name="pBuffer"/>.</param>
  /// <returns>The index of the first non-ASCII byte, or <paramref name="bufferLength"/>.</returns>
  /// <remarks>An ASCII byte is defined as 0x00 - 0x7F, inclusive.</remarks>
  [MethodImpl(MethodImplOptions.AggressiveInlining)]
  public static unsafe nuint GetIndexOfFirstNonAsciiByte(byte* pBuffer, nuint bufferLength)
  {
      // Prefer the SSE2-specific implementation over the generic vectorized one when it is
      // available: (a) it can use specific instructions such as pmovmskb which we know are
      // optimized, and (b) it avoids downclocking the processor while this method is running.
      if (Sse2.IsSupported)
      {
          return GetIndexOfFirstNonAsciiByte_Sse2(pBuffer, bufferLength);
      }

      return GetIndexOfFirstNonAsciiByte_Default(pBuffer, bufferLength);
  }
  /// <summary>
  /// Implementation of <c>GetIndexOfFirstNonAsciiByte</c> that uses the generic
  /// <see cref="Vector{T}"/> API (when hardware accelerated) followed by a scalar drain.
  /// </summary>
  /// <param name="pBuffer">Start of the byte buffer to scan.</param>
  /// <param name="bufferLength">Number of bytes available at <paramref name="pBuffer"/>.</param>
  /// <returns>
  /// The distance, in bytes, from the start of the buffer to the position where scanning stopped.
  /// </returns>
  private static unsafe nuint GetIndexOfFirstNonAsciiByte_Default(byte* pBuffer, nuint bufferLength)
  {
      // Remember where we started. The scan advances pBuffer to the first non-ASCII byte, and
      // the result is the distance between that position and this starting point.
      byte* pStart = pBuffer;

      // Vectorized pre-pass, taken only when at least two vectors' worth of data is available.
      // The data is viewed as SBYTE so that a set high bit (non-ASCII) shows up as a negative
      // element.
      if (Vector.IsHardwareAccelerated && bufferLength >= 2 * (uint)Vector<sbyte>.Count)
      {
          uint vectorSizeInBytes = (uint)Vector<sbyte>.Count; // JIT will make this a const

          if (Vector.GreaterThanOrEqualAll(Unsafe.ReadUnaligned<Vector<sbyte>>(pBuffer), Vector<sbyte>.Zero))
          {
              // The leading (unaligned) vector was all-ASCII. Round the pointer up to the next
              // vector-aligned address and continue with aligned reads until non-ASCII data
              // shows up or we get close to the end. Some bytes may be examined twice; that's ok.
              byte* pLastVectorStart = pBuffer + bufferLength - vectorSizeInBytes;
              pBuffer = (byte*)(((nuint)pBuffer + vectorSizeInBytes) & ~(nuint)(vectorSizeInBytes - 1));

  #if DEBUG
              long bytesSkipped = pBuffer - pStart;
              Debug.Assert(0 < bytesSkipped && bytesSkipped <= vectorSizeInBytes, "We should've made forward progress of at least one byte.");
              Debug.Assert((nuint)bytesSkipped <= bufferLength, "We shouldn't have read past the end of the input buffer.");
  #endif

              Debug.Assert(pBuffer <= pLastVectorStart, "Should be able to read at least one vector.");

              for (; ; )
              {
                  Debug.Assert((nuint)pBuffer % vectorSizeInBytes == 0, "Vector read should be aligned.");

                  if (Vector.LessThanAny(Unsafe.Read<Vector<sbyte>>(pBuffer), Vector<sbyte>.Zero))
                  {
                      break; // this vector holds non-ASCII data
                  }

                  pBuffer += vectorSizeInBytes;

                  if (pBuffer > pLastVectorStart)
                  {
                      break; // not enough data left for another full vector
                  }
              }

              // Shrink the remaining length by the number of bytes the pointer moved.
              bufferLength -= (nuint)pBuffer - (nuint)pStart;
          }
      }

      // Either the input was too short to vectorize, or the vectorized pass stopped (non-ASCII
      // data seen, or fewer bytes than one vector remain). Drain what's left with scalar reads.
      //
      // These reads are unaligned, so 32-bit reads are preferred over 64-bit reads. 32-bit values
      // also allow cheaper bit tricks when counting the leading ASCII bytes.
      uint candidate;

      // Consume 64 bits (two 32-bit reads) per iteration.
      while (bufferLength >= 8)
      {
          candidate = Unsafe.ReadUnaligned<uint>(pBuffer);
          uint following = Unsafe.ReadUnaligned<uint>(pBuffer + 4);

          if (!AllBytesInUInt32AreAscii(candidate | following))
          {
              // At least one of the two values has a non-ASCII byte. If it isn't the first
              // value, step over it and make the second value the one to be drained.
              if (AllBytesInUInt32AreAscii(candidate))
              {
                  candidate = following;
                  pBuffer += 4;
              }

              goto NonAsciiDataFound;
          }

          pBuffer += 8; // consumed 8 ASCII bytes
          bufferLength -= 8;
      }

      // bufferLength is no longer updated below; only its low three bits are inspected.

      // 32-bit remainder.
      if ((bufferLength & 4) != 0)
      {
          candidate = Unsafe.ReadUnaligned<uint>(pBuffer);
          if (!AllBytesInUInt32AreAscii(candidate))
          {
              goto NonAsciiDataFound;
          }

          pBuffer += 4;
      }

      // 16-bit remainder (zero-extended into the 32-bit local).
      if ((bufferLength & 2) != 0)
      {
          candidate = Unsafe.ReadUnaligned<ushort>(pBuffer);
          if (!AllBytesInUInt32AreAscii(candidate))
          {
              goto NonAsciiDataFound;
          }

          pBuffer += 2;
      }

      // 8-bit remainder. A non-ASCII byte reads as a negative sbyte, in which case the pointer
      // is left where it is.
      if ((bufferLength & 1) != 0 && *(sbyte*)pBuffer >= 0)
      {
          pBuffer++;
      }

      return (nuint)pBuffer - (nuint)pStart;

  NonAsciiDataFound:

      Debug.Assert(!AllBytesInUInt32AreAscii(candidate), "Shouldn't have reached this point if we have an all-ASCII input.");

      // The helper below doesn't bother looking at whether the high byte is ASCII. There are
      // only two scenarios: (a) one of the earlier bytes is not ASCII and the search terminates
      // before reaching the high byte; or (b) all of the earlier bytes are ASCII, so the high
      // byte must be the non-ASCII one. In both cases only the low 24 bits matter.
      pBuffer += CountNumberOfLeadingAsciiBytesFromUInt32WithSomeNonAsciiData(candidate);
      return (nuint)pBuffer - (nuint)pStart;
  }
  /// <summary>
  /// SSE2 implementation of <c>GetIndexOfFirstNonAsciiByte</c>. The caller must have verified
  /// <see cref="Sse2.IsSupported"/>.
  /// </summary>
  /// <param name="pBuffer">Start of the byte buffer to scan.</param>
  /// <param name="bufferLength">Number of bytes available at <paramref name="pBuffer"/>.</param>
  /// <returns>
  /// The distance, in bytes, from the start of the buffer to the position where scanning stopped.
  /// </returns>
  private static unsafe nuint GetIndexOfFirstNonAsciiByte_Sse2(byte* pBuffer, nuint bufferLength)
  {
      // JIT turns the below into constants
      uint vectorLength = (uint)Unsafe.SizeOf<Vector128<byte>>();
      nuint intraVectorMask = (nuint)(vectorLength - 1);

      Debug.Assert(Sse2.IsSupported, "Should've been checked by caller.");
      Debug.Assert(BitConverter.IsLittleEndian, "SSE2 assumes little-endian.");

      uint nonAsciiMask, nonAsciiMaskOfSecondVector;
      uint pendingDWord;
      byte* pStart = pBuffer;

      // Control is arranged to flow top-to-bottom with as few taken jumps as possible in the
      // optimistic case (large enough buffer, all ASCII). Anything that sees non-ASCII data
      // jumps to one of the targets placed after the main logic.

      if (bufferLength < vectorLength)
      {
          goto DrainShortInput; // can't vectorize; drain primitives instead
      }

      // The first vector is read unaligned.
      nonAsciiMask = (uint)Sse2.MoveMask(Sse2.LoadVector128(pBuffer)); // unaligned load
      if (nonAsciiMask != 0)
      {
          goto LocateNonAsciiByteViaMask;
      }

      // With fewer than two vectors of input there is nothing for the aligned loop to do;
      // go straight to the final unaligned read.
      if (bufferLength < 2 * vectorLength)
      {
          goto AdvanceOneVectorThenReadTail;
      }

      // Round the read pointer up so that subsequent reads are aligned.
      pBuffer = (byte*)(((nuint)pBuffer + vectorLength) & ~(nuint)intraVectorMask);

  #if DEBUG
      long bytesSkipped = pBuffer - pStart;
      Debug.Assert(0 < bytesSkipped && bytesSkipped <= vectorLength, "We should've made forward progress of at least one byte.");
      Debug.Assert((nuint)bytesSkipped <= bufferLength, "We shouldn't have read past the end of the input buffer.");
  #endif

      // Reduce the remaining length by the amount the pointer just moved.
      bufferLength += (nuint)pStart;
      bufferLength -= (nuint)pBuffer;

      // pBuffer is aligned from here on. Process two vectors per iteration while possible.
      if (bufferLength >= 2 * vectorLength)
      {
          byte* pLastPairStart = (byte*)((nuint)pBuffer + bufferLength - 2 * vectorLength);

          // bufferLength is deliberately not updated inside (or after) this loop.
          do
          {
              Vector128<byte> lowerVector = Sse2.LoadAlignedVector128(pBuffer);
              Vector128<byte> upperVector = Sse2.LoadAlignedVector128(pBuffer + vectorLength);

              nonAsciiMask = (uint)Sse2.MoveMask(lowerVector);
              nonAsciiMaskOfSecondVector = (uint)Sse2.MoveMask(upperVector);

              if ((nonAsciiMask | nonAsciiMaskOfSecondVector) != 0)
              {
                  goto NonAsciiInVectorPair;
              }

              pBuffer += 2 * vectorLength;
          } while (pBuffer <= pLastPairStart);
      }

      // Between 0 and (2 * vector length) - 1 bytes remain. Because the loop above did not
      // update bufferLength, its absolute value can't be trusted, but its low bits still can:
      // had the loop maintained it, each iteration would have subtracted 2 * vectorLength,
      // which never touches the bits less significant than that amount.

      // Less than one full vector remaining? Then skip the next aligned read.
      if ((bufferLength & vectorLength) == 0)
      {
          goto ReadTailUnaligned;
      }

      // A full vector's worth of data remains and pBuffer is still aligned.
      nonAsciiMask = (uint)Sse2.MoveMask(Sse2.LoadAlignedVector128(pBuffer));
      if (nonAsciiMask != 0)
      {
          goto LocateNonAsciiByteViaMask;
      }

  AdvanceOneVectorThenReadTail:

      pBuffer += vectorLength;

  ReadTailUnaligned:

      if (((byte)bufferLength & intraVectorMask) != 0)
      {
          // Read the last vector of the buffer unaligned. This re-reads some data that was
          // already examined, so the pointer is moved back by the overlap first.
          pBuffer += (bufferLength & intraVectorMask) - vectorLength;

          nonAsciiMask = (uint)Sse2.MoveMask(Sse2.LoadVector128(pBuffer)); // unaligned load
          if (nonAsciiMask != 0)
          {
              goto LocateNonAsciiByteViaMask;
          }

          pBuffer += vectorLength;
      }

  Done:

      return (nuint)pBuffer - (nuint)pStart; // and we're done!

  NonAsciiInVectorPair:

      // If the first mask of the pair is clean, the non-ASCII data must be in the second one:
      // skip the whole first vector and continue with the second mask.
      if (nonAsciiMask == 0)
      {
          pBuffer += vectorLength;
          nonAsciiMask = nonAsciiMaskOfSecondVector;
      }

  LocateNonAsciiByteViaMask:

      // Starting from the LSB, the mask holds a 0 for each ASCII byte and a 1 for each non-ASCII
      // byte, so the number of trailing zero bits is the number of leading ASCII bytes.
      Debug.Assert(nonAsciiMask != 0, "Shouldn't be here unless we see non-ASCII data.");

      pBuffer += (uint)BitOperations.TrailingZeroCount(nonAsciiMask);
      goto Done;

  LocateNonAsciiByteViaDWord:

      Debug.Assert(!AllBytesInUInt32AreAscii(pendingDWord), "Shouldn't be here unless we see non-ASCII data.");

      pBuffer += CountNumberOfLeadingAsciiBytesFromUInt32WithSomeNonAsciiData(pendingDWord);
      goto Done;

  DrainShortInput:

      // Reached only when the original input was shorter than one vector. Vectorized reads are
      // not possible, so fall back to reading primitives directly. All of these reads are
      // unaligned.
      Debug.Assert(bufferLength < vectorLength);

      // QWORD drain
      if ((bufferLength & 8) != 0)
      {
          if (Bmi1.X64.IsSupported)
          {
              // Prefer a 64-bit tzcnt for counting the leading ASCII bytes when it's available.
              ulong qword = Unsafe.ReadUnaligned<ulong>(pBuffer);
              if (!AllBytesInUInt64AreAscii(qword))
              {
                  // Keep only the high bit of each byte, then tzcnt. The division by 8
                  // converts the bit count into a byte count.
                  qword &= UInt64HighBitsOnlyMask;
                  pBuffer += (nuint)(Bmi1.X64.TrailingZeroCount(qword) / 8);
                  goto Done;
              }
          }
          else
          {
              // No 64-bit tzcnt: perform two 32-bit reads instead.
              pendingDWord = Unsafe.ReadUnaligned<uint>(pBuffer);
              uint followingDWord = Unsafe.ReadUnaligned<uint>(pBuffer + 4);

              if (!AllBytesInUInt32AreAscii(pendingDWord | followingDWord))
              {
                  // At least one of the two values wasn't all-ASCII. Work out which one and
                  // leave it in the DWORD local that the jump target drains.
                  if (AllBytesInUInt32AreAscii(pendingDWord))
                  {
                      pendingDWord = followingDWord; // this one is the culprit
                      pBuffer += 4;
                  }

                  goto LocateNonAsciiByteViaDWord;
              }
          }

          pBuffer += 8; // successfully consumed 8 ASCII bytes
      }

      // DWORD drain
      if ((bufferLength & 4) != 0)
      {
          pendingDWord = Unsafe.ReadUnaligned<uint>(pBuffer);

          if (!AllBytesInUInt32AreAscii(pendingDWord))
          {
              goto LocateNonAsciiByteViaDWord;
          }

          pBuffer += 4; // successfully consumed 4 ASCII bytes
      }

      // WORD drain
      // (The value is zero-extended to a DWORD for ease of manipulation.)
      if ((bufferLength & 2) != 0)
      {
          pendingDWord = Unsafe.ReadUnaligned<ushort>(pBuffer);

          if (!AllBytesInUInt32AreAscii(pendingDWord))
          {
              // Only the 0x0080 bit matters here. If it's clear, the first byte is ASCII and
              // the pointer advances by 1; if it's set, the pointer doesn't advance at all.
              pBuffer += (nuint)((nint)(sbyte)pendingDWord >> 7) + 1;
              goto Done;
          }

          pBuffer += 2; // successfully consumed 2 ASCII bytes
      }

      // BYTE drain: an ASCII byte reads as a non-negative sbyte.
      if ((bufferLength & 1) != 0 && *(sbyte*)(pBuffer) >= 0)
      {
          pBuffer++; // successfully consumed a single byte
      }

      goto Done;
  }
  /// <summary>
  /// Returns the index in <paramref name="pBuffer"/> where the first non-ASCII char is found.
  /// Returns <paramref name="bufferLength"/> if the buffer is empty or all-ASCII.
  /// </summary>
  /// <param name="pBuffer">Start of the char buffer to scan.</param>
  /// <param name="bufferLength">Number of chars available at <paramref name="pBuffer"/>.</param>
  /// <returns>The index of the first non-ASCII char, or <paramref name="bufferLength"/>.</returns>
  /// <remarks>An ASCII char is defined as 0x0000 - 0x007F, inclusive.</remarks>
  [MethodImpl(MethodImplOptions.AggressiveInlining)]
  public static unsafe nuint GetIndexOfFirstNonAsciiChar(char* pBuffer, nuint bufferLength /* in chars */)
  {
      // Prefer the SSE2-specific implementation over the generic vectorized one when it is
      // available: (a) it can use specific instructions such as pmovmskb which we know are
      // optimized, and (b) it avoids downclocking the processor while this method is running.
      if (Sse2.IsSupported)
      {
          return GetIndexOfFirstNonAsciiChar_Sse2(pBuffer, bufferLength);
      }

      return GetIndexOfFirstNonAsciiChar_Default(pBuffer, bufferLength);
  }
  /// <summary>
  /// Implementation of <c>GetIndexOfFirstNonAsciiChar</c> that uses the generic
  /// <see cref="Vector{T}"/> API (when hardware accelerated) followed by a scalar drain.
  /// </summary>
  /// <param name="pBuffer">Start of the char buffer to scan.</param>
  /// <param name="bufferLength">Number of chars available at <paramref name="pBuffer"/>.</param>
  /// <returns>
  /// The distance, in chars, from the start of the buffer to the position where scanning stopped.
  /// </returns>
  private static unsafe nuint GetIndexOfFirstNonAsciiChar_Default(char* pBuffer, nuint bufferLength /* in chars */)
  {
      // Squirrel away the original buffer reference. This method works by determining the exact
      // char reference where non-ASCII data begins, so we need this base value to perform the
      // final subtraction at the end of the method to get the index into the original buffer.
      char* pOriginalBuffer = pBuffer;

      // The char count must be convertible to a byte count without overflowing.
      Debug.Assert(bufferLength <= nuint.MaxValue / sizeof(char));

      // Before we drain off char-by-char, try a generic vectorized loop.
      // Only run the loop if we have at least two vectors we can pull out.
      if (Vector.IsHardwareAccelerated && bufferLength >= 2 * (uint)Vector<ushort>.Count)
      {
          uint SizeOfVectorInChars = (uint)Vector<ushort>.Count; // JIT will make this a const
          uint SizeOfVectorInBytes = (uint)Vector<byte>.Count; // JIT will make this a const

          Vector<ushort> maxAscii = new Vector<ushort>(0x007F);

          if (Vector.LessThanOrEqualAll(Unsafe.ReadUnaligned<Vector<ushort>>(pBuffer), maxAscii))
          {
              // The first several elements of the input buffer were ASCII. Bump up the pointer to the
              // next aligned boundary, then perform aligned reads from here on out until we find non-ASCII
              // data or we approach the end of the buffer. It's possible we'll reread data; this is ok.
              char* pFinalVectorReadPos = pBuffer + bufferLength - SizeOfVectorInChars;
              pBuffer = (char*)(((nuint)pBuffer + SizeOfVectorInBytes) & ~(nuint)(SizeOfVectorInBytes - 1));

  #if DEBUG
              long numCharsRead = pBuffer - pOriginalBuffer;
              Debug.Assert(0 < numCharsRead && numCharsRead <= SizeOfVectorInChars, "We should've made forward progress of at least one char.");
              Debug.Assert((nuint)numCharsRead <= bufferLength, "We shouldn't have read past the end of the input buffer.");
  #endif

              Debug.Assert(pBuffer <= pFinalVectorReadPos, "Should be able to read at least one vector.");

              do
              {
                  // The pointer was rounded up to a multiple of the vector's size in BYTES, so
                  // that is the alignment to verify. (Checking against the element count would
                  // only verify half-vector alignment.)
                  Debug.Assert((nuint)pBuffer % SizeOfVectorInBytes == 0, "Vector read should be aligned.");
                  if (Vector.GreaterThanAny(Unsafe.Read<Vector<ushort>>(pBuffer), maxAscii))
                  {
                      break; // found non-ASCII data
                  }

                  pBuffer += SizeOfVectorInChars;
              } while (pBuffer <= pFinalVectorReadPos);

              // Adjust the remaining buffer length for the number of elements we just consumed.
              // The pointer difference is in bytes, so convert it to chars first.
              bufferLength -= ((nuint)pBuffer - (nuint)pOriginalBuffer) / sizeof(char);
          }
      }

      // At this point, the buffer length wasn't enough to perform a vectorized search, or we did perform
      // a vectorized search and encountered non-ASCII data. In either case go down a non-vectorized code
      // path to drain any remaining ASCII chars.
      //
      // We're going to perform unaligned reads, so prefer 32-bit reads instead of 64-bit reads.
      // This also allows us to perform more optimized bit twiddling tricks to count the number of ASCII chars.
      uint currentUInt32;

      // Try reading 64 bits at a time in a loop.
      for (; bufferLength >= 4; bufferLength -= 4) // 64 bits = 4 * 16-bit chars
      {
          currentUInt32 = Unsafe.ReadUnaligned<uint>(pBuffer);
          uint nextUInt32 = Unsafe.ReadUnaligned<uint>(pBuffer + 4 / sizeof(char));

          if (!AllCharsInUInt32AreAscii(currentUInt32 | nextUInt32))
          {
              // One of these two values contains non-ASCII chars.
              // Figure out which one it is, then put it in 'current' so that we can drain the ASCII chars.
              if (AllCharsInUInt32AreAscii(currentUInt32))
              {
                  currentUInt32 = nextUInt32;
                  pBuffer += 2;
              }

              goto FoundNonAsciiData;
          }

          pBuffer += 4; // consumed 4 ASCII chars
      }

      // From this point forward we don't need to keep track of the remaining buffer length;
      // only its low two bits are inspected.

      // Try reading 32 bits.
      if ((bufferLength & 2) != 0) // 32 bits = 2 * 16-bit chars
      {
          currentUInt32 = Unsafe.ReadUnaligned<uint>(pBuffer);
          if (!AllCharsInUInt32AreAscii(currentUInt32))
          {
              goto FoundNonAsciiData;
          }

          pBuffer += 2;
      }

      // Try reading 16 bits.
      // No need to try an 8-bit read after this since we're working with chars.
      if ((bufferLength & 1) != 0)
      {
          // If the buffer contains non-ASCII data, the comparison below will fail, and
          // we'll end up not incrementing the buffer reference.
          if (*pBuffer <= 0x007F)
          {
              pBuffer++;
          }
      }

  Finish:

      nuint totalNumBytesRead = (nuint)pBuffer - (nuint)pOriginalBuffer;
      Debug.Assert(totalNumBytesRead % sizeof(char) == 0, "Total number of bytes read should be even since we're working with chars.");
      return totalNumBytesRead / sizeof(char); // convert byte count -> char count before returning

  FoundNonAsciiData:

      Debug.Assert(!AllCharsInUInt32AreAscii(currentUInt32), "Shouldn't have reached this point if we have an all-ASCII input.");

      // We don't bother looking at the second char - only the first char. If the first char is
      // ASCII then the second one must be the non-ASCII char, so step over the first.
      if (FirstCharInUInt32IsAscii(currentUInt32))
      {
          pBuffer++;
      }

      goto Finish;
  }
  490. private static unsafe nuint GetIndexOfFirstNonAsciiChar_Sse2(char* pBuffer, nuint bufferLength /* in chars */)
  491. {
  492. // This method contains logic optimized for both SSE2 and SSE41. Much of the logic in this method
  493. // will be elided by JIT once we determine which specific ISAs we support.
  494. // Quick check for empty inputs.
  495. if (bufferLength == 0)
  496. {
  497. return 0;
  498. }
  499. // JIT turns the below into constants
  500. uint SizeOfVector128InBytes = (uint)Unsafe.SizeOf<Vector128<byte>>();
  501. uint SizeOfVector128InChars = SizeOfVector128InBytes / sizeof(char);
  502. Debug.Assert(Sse2.IsSupported, "Should've been checked by caller.");
  503. Debug.Assert(BitConverter.IsLittleEndian, "SSE2 assumes little-endian.");
  504. Vector128<short> firstVector, secondVector;
  505. uint currentMask;
  506. char* pOriginalBuffer = pBuffer;
  507. if (bufferLength < SizeOfVector128InChars)
  508. {
  509. goto InputBufferLessThanOneVectorInLength; // can't vectorize; drain primitives instead
  510. }
  511. // This method is written such that control generally flows top-to-bottom, avoiding
  512. // jumps as much as possible in the optimistic case of "all ASCII". If we see non-ASCII
  513. // data, we jump out of the hot paths to targets at the end of the method.
  514. Vector128<short> asciiMaskForPTEST = Vector128.Create(unchecked((short)0xFF80)); // used for PTEST on supported hardware
  515. Vector128<ushort> asciiMaskForPMINUW = Vector128.Create((ushort)0x0080); // used for PMINUW on supported hardware
  516. Vector128<short> asciiMaskForPXOR = Vector128.Create(unchecked((short)0x8000)); // used for PXOR
  517. Vector128<short> asciiMaskForPCMPGTW = Vector128.Create(unchecked((short)0x807F)); // used for PCMPGTW
  518. Debug.Assert(bufferLength <= nuint.MaxValue / sizeof(char));
  519. // Read the first vector unaligned.
  520. firstVector = Sse2.LoadVector128((short*)pBuffer); // unaligned load
  521. if (Sse41.IsSupported)
  522. {
  523. // The SSE41-optimized code path works by forcing the 0x0080 bit in each WORD of the vector to be
  524. // set iff the WORD element has value >= 0x0080 (non-ASCII). Then we'll treat it as a BYTE vector
  525. // in order to extract the mask.
  526. currentMask = (uint)Sse2.MoveMask(Sse41.Min(firstVector.AsUInt16(), asciiMaskForPMINUW).AsByte());
  527. }
  528. else
  529. {
  530. // The SSE2-optimized code path works by forcing each WORD of the vector to be 0xFFFF iff the WORD
  531. // element has value >= 0x0080 (non-ASCII). Then we'll treat it as a BYTE vector in order to extract
  532. // the mask.
  533. currentMask = (uint)Sse2.MoveMask(Sse2.CompareGreaterThan(Sse2.Xor(firstVector, asciiMaskForPXOR), asciiMaskForPCMPGTW).AsByte());
  534. }
  535. if (currentMask != 0)
  536. {
  537. goto FoundNonAsciiDataInCurrentMask;
  538. }
  539. // If we have less than 32 bytes to process, just go straight to the final unaligned
  540. // read. There's no need to mess with the loop logic in the middle of this method.
  541. // Adjust the remaining length to account for what we just read.
  542. // For the remainder of this code path, bufferLength will be in bytes, not chars.
  543. bufferLength <<= 1; // chars to bytes
  544. if (bufferLength < 2 * SizeOfVector128InBytes)
  545. {
  546. goto IncrementCurrentOffsetBeforeFinalUnalignedVectorRead;
  547. }
  548. // Now adjust the read pointer so that future reads are aligned.
  549. pBuffer = (char*)(((nuint)pBuffer + SizeOfVector128InBytes) & ~(nuint)(SizeOfVector128InBytes - 1));
  550. #if DEBUG
  551. long numCharsRead = pBuffer - pOriginalBuffer;
  552. Debug.Assert(0 < numCharsRead && numCharsRead <= SizeOfVector128InChars, "We should've made forward progress of at least one char.");
  553. Debug.Assert((nuint)numCharsRead <= bufferLength, "We shouldn't have read past the end of the input buffer.");
  554. #endif
  555. // Adjust remaining buffer length.
  556. bufferLength += (nuint)pOriginalBuffer;
  557. bufferLength -= (nuint)pBuffer;
  558. // The buffer is now properly aligned.
  559. // Read 2 vectors at a time if possible.
  560. if (bufferLength >= 2 * SizeOfVector128InBytes)
  561. {
  562. char* pFinalVectorReadPos = (char*)((nuint)pBuffer + bufferLength - 2 * SizeOfVector128InBytes);
  563. // After this point, we no longer need to update the bufferLength value.
  564. do
  565. {
  566. firstVector = Sse2.LoadAlignedVector128((short*)pBuffer);
  567. secondVector = Sse2.LoadAlignedVector128((short*)pBuffer + SizeOfVector128InChars);
  568. Vector128<short> combinedVector = Sse2.Or(firstVector, secondVector);
  569. if (Sse41.IsSupported)
  570. {
  571. // If a non-ASCII bit is set in any WORD of the combined vector, we have seen non-ASCII data.
  572. // Jump to the non-ASCII handler to figure out which particular vector contained non-ASCII data.
  573. if (!Sse41.TestZ(combinedVector, asciiMaskForPTEST))
  574. {
  575. goto FoundNonAsciiDataInFirstOrSecondVector;
  576. }
  577. }
  578. else
  579. {
  580. // See comment earlier in the method for an explanation of how the below logic works.
  581. if (Sse2.MoveMask(Sse2.CompareGreaterThan(Sse2.Xor(combinedVector, asciiMaskForPXOR), asciiMaskForPCMPGTW).AsByte()) != 0)
  582. {
  583. goto FoundNonAsciiDataInFirstOrSecondVector;
  584. }
  585. }
  586. pBuffer += 2 * SizeOfVector128InChars;
  587. } while (pBuffer <= pFinalVectorReadPos);
  588. }
  589. // We have somewhere between 0 and (2 * vector length) - 1 bytes remaining to read from.
  590. // Since the above loop doesn't update bufferLength, we can't rely on its absolute value.
  591. // But we _can_ rely on it to tell us how much remaining data must be drained by looking
  592. // at what bits of it are set. This works because had we updated it within the loop above,
  593. // we would've been adding 2 * SizeOfVector128 on each iteration, but we only care about
  594. // bits which are less significant than those that the addition would've acted on.
  595. // If there is fewer than one vector length remaining, skip the next aligned read.
  596. // Remember, at this point bufferLength is measured in bytes, not chars.
  597. if ((bufferLength & SizeOfVector128InBytes) == 0)
  598. {
  599. goto DoFinalUnalignedVectorRead;
  600. }
  601. // At least one full vector's worth of data remains, so we can safely read it.
  602. // Remember, at this point pBuffer is still aligned.
  603. firstVector = Sse2.LoadAlignedVector128((short*)pBuffer);
  604. if (Sse41.IsSupported)
  605. {
  606. // If a non-ASCII bit is set in any WORD of the combined vector, we have seen non-ASCII data.
  607. // Jump to the non-ASCII handler to figure out which particular vector contained non-ASCII data.
  608. if (!Sse41.TestZ(firstVector, asciiMaskForPTEST))
  609. {
  610. goto FoundNonAsciiDataInFirstVector;
  611. }
  612. }
  613. else
  614. {
  615. // See comment earlier in the method for an explanation of how the below logic works.
  616. currentMask = (uint)Sse2.MoveMask(Sse2.CompareGreaterThan(Sse2.Xor(firstVector, asciiMaskForPXOR), asciiMaskForPCMPGTW).AsByte());
  617. if (currentMask != 0)
  618. {
  619. goto FoundNonAsciiDataInCurrentMask;
  620. }
  621. }
  622. IncrementCurrentOffsetBeforeFinalUnalignedVectorRead:
  623. pBuffer += SizeOfVector128InChars;
  624. DoFinalUnalignedVectorRead:
  625. if (((byte)bufferLength & (SizeOfVector128InBytes - 1)) != 0)
  626. {
  627. // Perform an unaligned read of the last vector.
  628. // We need to adjust the pointer because we're re-reading data.
  629. pBuffer = (char*)((byte*)pBuffer + (bufferLength & (SizeOfVector128InBytes - 1)) - SizeOfVector128InBytes);
  630. firstVector = Sse2.LoadVector128((short*)pBuffer); // unaligned load
  631. if (Sse41.IsSupported)
  632. {
  633. // If a non-ASCII bit is set in any WORD of the combined vector, we have seen non-ASCII data.
  634. // Jump to the non-ASCII handler to figure out which particular vector contained non-ASCII data.
  635. if (!Sse41.TestZ(firstVector, asciiMaskForPTEST))
  636. {
  637. goto FoundNonAsciiDataInFirstVector;
  638. }
  639. }
  640. else
  641. {
  642. // See comment earlier in the method for an explanation of how the below logic works.
  643. currentMask = (uint)Sse2.MoveMask(Sse2.CompareGreaterThan(Sse2.Xor(firstVector, asciiMaskForPXOR), asciiMaskForPCMPGTW).AsByte());
  644. if (currentMask != 0)
  645. {
  646. goto FoundNonAsciiDataInCurrentMask;
  647. }
  648. }
  649. pBuffer += SizeOfVector128InChars;
  650. }
  651. Finish:
  652. Debug.Assert(((nuint)pBuffer - (nuint)pOriginalBuffer) % 2 == 0, "Shouldn't have incremented any pointer by an odd byte count.");
  653. return ((nuint)pBuffer - (nuint)pOriginalBuffer) / sizeof(char); // and we're done! (remember to adjust for char count)
  654. FoundNonAsciiDataInFirstOrSecondVector:
  655. // We don't know if the first or the second vector contains non-ASCII data. Check the first
  656. // vector, and if that's all-ASCII then the second vector must be the culprit. Either way
  657. // we'll make sure the first vector local is the one that contains the non-ASCII data.
  658. // See comment earlier in the method for an explanation of how the below logic works.
  659. if (Sse41.IsSupported)
  660. {
  661. if (!Sse41.TestZ(firstVector, asciiMaskForPTEST))
  662. {
  663. goto FoundNonAsciiDataInFirstVector;
  664. }
  665. }
  666. else
  667. {
  668. currentMask = (uint)Sse2.MoveMask(Sse2.CompareGreaterThan(Sse2.Xor(firstVector, asciiMaskForPXOR), asciiMaskForPCMPGTW).AsByte());
  669. if (currentMask != 0)
  670. {
  671. goto FoundNonAsciiDataInCurrentMask;
  672. }
  673. }
  674. // Wasn't the first vector; must be the second.
  675. pBuffer += SizeOfVector128InChars;
  676. firstVector = secondVector;
  677. FoundNonAsciiDataInFirstVector:
  678. // See comment earlier in the method for an explanation of how the below logic works.
  679. if (Sse41.IsSupported)
  680. {
  681. currentMask = (uint)Sse2.MoveMask(Sse41.Min(firstVector.AsUInt16(), asciiMaskForPMINUW).AsByte());
  682. }
  683. else
  684. {
  685. currentMask = (uint)Sse2.MoveMask(Sse2.CompareGreaterThan(Sse2.Xor(firstVector, asciiMaskForPXOR), asciiMaskForPCMPGTW).AsByte());
  686. }
  687. FoundNonAsciiDataInCurrentMask:
  688. // The mask contains - from the LSB - a 0 for each ASCII byte we saw, and a 1 for each non-ASCII byte.
  689. // Tzcnt is the correct operation to count the number of zero bits quickly. If this instruction isn't
  690. // available, we'll fall back to a normal loop. (Even though the original vector used WORD elements,
  691. // masks work on BYTE elements, and we account for this in the final fixup.)
  692. Debug.Assert(currentMask != 0, "Shouldn't be here unless we see non-ASCII data.");
  693. pBuffer = (char*)((byte*)pBuffer + (uint)BitOperations.TrailingZeroCount(currentMask));
  694. goto Finish;
  695. FoundNonAsciiDataInCurrentDWord:
  696. uint currentDWord;
  697. Debug.Assert(!AllCharsInUInt32AreAscii(currentDWord), "Shouldn't be here unless we see non-ASCII data.");
  698. if (FirstCharInUInt32IsAscii(currentDWord))
  699. {
  700. pBuffer++; // skip past the ASCII char
  701. }
  702. goto Finish;
  703. InputBufferLessThanOneVectorInLength:
  704. // These code paths get hit if the original input length was less than one vector in size.
  705. // We can't perform vectorized reads at this point, so we'll fall back to reading primitives
  706. // directly. Note that all of these reads are unaligned.
  707. // Reminder: If this code path is hit, bufferLength is still a char count, not a byte count.
  708. // We skipped the code path that multiplied the count by sizeof(char).
  709. Debug.Assert(bufferLength < SizeOfVector128InChars);
  710. // QWORD drain
  711. if ((bufferLength & 4) != 0)
  712. {
  713. if (Bmi1.X64.IsSupported)
  714. {
  715. // If we can use 64-bit tzcnt to count the number of leading ASCII chars, prefer it.
  716. ulong candidateUInt64 = Unsafe.ReadUnaligned<ulong>(pBuffer);
  717. if (!AllCharsInUInt64AreAscii(candidateUInt64))
  718. {
  719. // Clear the low 7 bits (the ASCII bits) of each char, then tzcnt.
  720. // Remember the / 8 at the end to convert bit count to byte count,
  721. // then the & ~1 at the end to treat a match in the high byte of
  722. // any char the same as a match in the low byte of that same char.
  723. candidateUInt64 &= 0xFF80FF80_FF80FF80ul;
  724. pBuffer = (char*)((byte*)pBuffer + ((nuint)(Bmi1.X64.TrailingZeroCount(candidateUInt64) / 8) & ~(nuint)1));
  725. goto Finish;
  726. }
  727. }
  728. else
  729. {
  730. // If we can't use 64-bit tzcnt, no worries. We'll just do 2x 32-bit reads instead.
  731. currentDWord = Unsafe.ReadUnaligned<uint>(pBuffer);
  732. uint nextDWord = Unsafe.ReadUnaligned<uint>(pBuffer + 4 / sizeof(char));
  733. if (!AllCharsInUInt32AreAscii(currentDWord | nextDWord))
  734. {
  735. // At least one of the values wasn't all-ASCII.
  736. // We need to figure out which one it was and stick it in the currentMask local.
  737. if (AllCharsInUInt32AreAscii(currentDWord))
  738. {
  739. currentDWord = nextDWord; // this one is the culprit
  740. pBuffer += 4 / sizeof(char);
  741. }
  742. goto FoundNonAsciiDataInCurrentDWord;
  743. }
  744. }
  745. pBuffer += 4; // successfully consumed 4 ASCII chars
  746. }
  747. // DWORD drain
  748. if ((bufferLength & 2) != 0)
  749. {
  750. currentDWord = Unsafe.ReadUnaligned<uint>(pBuffer);
  751. if (!AllCharsInUInt32AreAscii(currentDWord))
  752. {
  753. goto FoundNonAsciiDataInCurrentDWord;
  754. }
  755. pBuffer += 2; // successfully consumed 2 ASCII chars
  756. }
  757. // WORD drain
  758. // This is the final drain; there's no need for a BYTE drain since our elemental type is 16-bit char.
  759. if ((bufferLength & 1) != 0)
  760. {
  761. if (*pBuffer <= 0x007F)
  762. {
  763. pBuffer++; // successfully consumed a single char
  764. }
  765. }
  766. goto Finish;
  767. }
  /// <summary>
  /// Narrows the four UTF-16 chars packed into <paramref name="value"/> (machine-endian order)
  /// to one byte each and stores the four bytes starting at <paramref name="outputBuffer"/>,
  /// also in machine-endian order.
  /// </summary>
  /// <param name="outputBuffer">Reference to the first of the four destination bytes.</param>
  /// <param name="value">Four packed 16-bit chars; debug builds assert that all of them are ASCII.</param>
  [MethodImpl(MethodImplOptions.AggressiveInlining)]
  private static void NarrowFourUtf16CharsToAsciiAndWriteToBuffer(ref byte outputBuffer, ulong value)
  {
      Debug.Assert(AllCharsInUInt64AreAscii(value));

      if (Bmi2.X64.IsSupported)
      {
          // PEXT with a 0x00FF mask per WORD gathers the low byte of every WORD into the low
          // 32 bits of the result. BMI2 will work regardless of the processor's endianness.
          uint packedBytes = (uint)Bmi2.X64.ParallelBitExtract(value, 0x00FF00FF_00FF00FFul);
          Unsafe.WriteUnaligned(ref outputBuffer, packedBytes);
          return;
      }

      // Scalar fallback: pull out the low byte of each 16-bit lane, where lane0 is the least
      // significant WORD of the input and lane3 the most significant.
      byte lane0 = (byte)value;
      byte lane1 = (byte)(value >> 16);
      byte lane2 = (byte)(value >> 32);
      byte lane3 = (byte)(value >> 48);

      if (BitConverter.IsLittleEndian)
      {
          // Least significant lane goes to the lowest address.
          outputBuffer = lane0;
          Unsafe.Add(ref outputBuffer, 1) = lane1;
          Unsafe.Add(ref outputBuffer, 2) = lane2;
          Unsafe.Add(ref outputBuffer, 3) = lane3;
      }
      else
      {
          // Least significant lane goes to the highest address.
          Unsafe.Add(ref outputBuffer, 3) = lane0;
          Unsafe.Add(ref outputBuffer, 2) = lane1;
          Unsafe.Add(ref outputBuffer, 1) = lane2;
          outputBuffer = lane3;
      }
  }
  /// <summary>
  /// Narrows the two UTF-16 chars packed into <paramref name="value"/> (machine-endian order)
  /// to one byte each and stores the two bytes starting at <paramref name="outputBuffer"/>,
  /// also in machine-endian order.
  /// </summary>
  /// <param name="outputBuffer">Reference to the first of the two destination bytes.</param>
  /// <param name="value">Two packed 16-bit chars; debug builds assert that both are ASCII.</param>
  [MethodImpl(MethodImplOptions.AggressiveInlining)]
  private static void NarrowTwoUtf16CharsToAsciiAndWriteToBuffer(ref byte outputBuffer, uint value)
  {
      Debug.Assert(AllCharsInUInt32AreAscii(value));

      // Low byte of each 16-bit lane: lowLane is the least significant WORD, highLane the other.
      byte lowLane = (byte)value;
      byte highLane = (byte)(value >> 16);

      if (BitConverter.IsLittleEndian)
      {
          // Least significant lane goes to the lower address.
          outputBuffer = lowLane;
          Unsafe.Add(ref outputBuffer, 1) = highLane;
      }
      else
      {
          // Least significant lane goes to the higher address.
          Unsafe.Add(ref outputBuffer, 1) = lowLane;
          outputBuffer = highLane;
      }
  }
  826. /// <summary>
  827. /// Copies as many ASCII characters (U+0000..U+007F) as possible from <paramref name="pUtf16Buffer"/>
  828. /// to <paramref name="pAsciiBuffer"/>, stopping when the first non-ASCII character is encountered
  829. /// or once <paramref name="elementCount"/> elements have been converted. Returns the total number
  830. /// of elements that were able to be converted.
  831. /// </summary>
  /// <param name="pUtf16Buffer">Source buffer; elements are read from it with unaligned reads.</param>
  /// <param name="pAsciiBuffer">Destination buffer; one byte is written per converted element.</param>
  /// <param name="elementCount">Number of elements to attempt to convert.</param>
  /// <returns>The number of elements converted, i.e. the final value of the running offset.</returns>
  832. public static unsafe nuint NarrowUtf16ToAscii(char* pUtf16Buffer, byte* pAsciiBuffer, nuint elementCount)
  833. {
  // Index (in elements) of the next char to read and the next byte to write; also the return value.
  834. nuint currentOffset = 0;
  // Scratch locals holding the most recent read. Despite the names, "High" holds the DWORD read
  // at the lower address (the first two chars) and "Low" holds the DWORD read right after it.
  // They are zero-initialized up front, presumably so that they count as definitely assigned
  // at the goto targets near the end of the method.
  835. uint utf16Data32BitsHigh = 0, utf16Data32BitsLow = 0;
  836. ulong utf16Data64Bits = 0;
  837. // If SSE2 is supported, use those specific intrinsics instead of the generic vectorized
  838. // code below. This has two benefits: (a) we can take advantage of specific instructions like
  839. // pmovmskb, ptest, vpminuw which we know are optimized, and (b) we can avoid downclocking the
  840. // processor while this method is running.
  841. if (Sse2.IsSupported)
  842. {
  843. Debug.Assert(BitConverter.IsLittleEndian, "Assume little endian if SSE2 is supported.");
  // The SSE2 helper asserts this same minimum, so shorter inputs go straight to the drain logic.
  844. if (elementCount >= 2 * (uint)Unsafe.SizeOf<Vector128<byte>>())
  845. {
  846. // Since there's overhead to setting up the vectorized code path, we only want to
  847. // call into it after a quick probe to ensure the next immediate characters really are ASCII.
  848. // If we see non-ASCII data, we'll jump immediately to the draining logic at the end of the method.
  849. if (IntPtr.Size >= 8)
  850. {
  851. utf16Data64Bits = Unsafe.ReadUnaligned<ulong>(pUtf16Buffer);
  852. if (!AllCharsInUInt64AreAscii(utf16Data64Bits))
  853. {
  854. goto FoundNonAsciiDataIn64BitRead;
  855. }
  856. }
  857. else
  858. {
  // 32-bit process: probe the same four chars as two DWORD reads instead of one QWORD read.
  859. utf16Data32BitsHigh = Unsafe.ReadUnaligned<uint>(pUtf16Buffer);
  860. utf16Data32BitsLow = Unsafe.ReadUnaligned<uint>(pUtf16Buffer + 4 / sizeof(char));
  861. if (!AllCharsInUInt32AreAscii(utf16Data32BitsHigh | utf16Data32BitsLow))
  862. {
  863. goto FoundNonAsciiDataIn64BitRead;
  864. }
  865. }
  // The helper may return before consuming everything (non-ASCII data, or a tail shorter than
  // one loop iteration); the drain logic below continues from the offset it returns.
  866. currentOffset = NarrowUtf16ToAscii_Sse2(pUtf16Buffer, pAsciiBuffer, elementCount);
  867. }
  868. }
  869. else if (Vector.IsHardwareAccelerated)
  870. {
  871. uint SizeOfVector = (uint)Unsafe.SizeOf<Vector<byte>>(); // JIT will make this a const
  872. // Only bother vectorizing if we have enough data to do so.
  873. if (elementCount >= 2 * SizeOfVector)
  874. {
  875. // Since there's overhead to setting up the vectorized code path, we only want to
  876. // call into it after a quick probe to ensure the next immediate characters really are ASCII.
  877. // If we see non-ASCII data, we'll jump immediately to the draining logic at the end of the method.
  878. if (IntPtr.Size >= 8)
  879. {
  880. utf16Data64Bits = Unsafe.ReadUnaligned<ulong>(pUtf16Buffer);
  881. if (!AllCharsInUInt64AreAscii(utf16Data64Bits))
  882. {
  883. goto FoundNonAsciiDataIn64BitRead;
  884. }
  885. }
  886. else
  887. {
  888. utf16Data32BitsHigh = Unsafe.ReadUnaligned<uint>(pUtf16Buffer);
  889. utf16Data32BitsLow = Unsafe.ReadUnaligned<uint>(pUtf16Buffer + 4 / sizeof(char));
  890. if (!AllCharsInUInt32AreAscii(utf16Data32BitsHigh | utf16Data32BitsLow))
  891. {
  892. goto FoundNonAsciiDataIn64BitRead;
  893. }
  894. }
  895. Vector<ushort> maxAscii = new Vector<ushort>(0x007F);
  // Each iteration reads two Vector<ushort> (SizeOfVector chars in total), checks both at once,
  // and writes one Vector<byte>, advancing currentOffset by SizeOfVector elements.
  // NOTE(review): the entry guard and this loop bound both require 2 * SizeOfVector elements even
  // though one iteration consumes SizeOfVector; that looks conservative rather than required - confirm.
  896. nuint finalOffsetWhereCanLoop = elementCount - 2 * SizeOfVector;
  897. do
  898. {
  899. Vector<ushort> utf16VectorHigh = Unsafe.ReadUnaligned<Vector<ushort>>(pUtf16Buffer + currentOffset);
  900. Vector<ushort> utf16VectorLow = Unsafe.ReadUnaligned<Vector<ushort>>(pUtf16Buffer + currentOffset + Vector<ushort>.Count);
  // OR-ing the two vectors lets a single comparison detect a char above 0x007F in either one.
  901. if (Vector.GreaterThanAny(Vector.BitwiseOr(utf16VectorHigh, utf16VectorLow), maxAscii))
  902. {
  // Nothing from this iteration has been written; the drain logic below re-reads from currentOffset.
  903. break; // found non-ASCII data
  904. }
  905. // TODO: Is the below logic also valid for big-endian platforms?
  906. Vector<byte> asciiVector = Vector.Narrow(utf16VectorHigh, utf16VectorLow);
  907. Unsafe.WriteUnaligned<Vector<byte>>(pAsciiBuffer + currentOffset, asciiVector);
  908. currentOffset += SizeOfVector;
  909. } while (currentOffset <= finalOffsetWhereCanLoop);
  910. }
  911. }
  // Scalar drain: handles whatever the vectorized paths above did not consume
  // (or the entire input when no vectorized path ran).
  912. Debug.Assert(currentOffset <= elementCount);
  913. nuint remainingElementCount = elementCount - currentOffset;
  914. // Try to narrow 64 bits -> 32 bits at a time.
  915. // We needn't update remainingElementCount after this point.
  916. if (remainingElementCount >= 4)
  917. {
  // Last offset from which four more elements can still be read.
  918. nuint finalOffsetWhereCanLoop = currentOffset + remainingElementCount - 4;
  919. do
  920. {
  921. if (IntPtr.Size >= 8)
  922. {
  923. // Only perform QWORD reads on a 64-bit platform.
  924. utf16Data64Bits = Unsafe.ReadUnaligned<ulong>(pUtf16Buffer + currentOffset);
  925. if (!AllCharsInUInt64AreAscii(utf16Data64Bits))
  926. {
  927. goto FoundNonAsciiDataIn64BitRead;
  928. }
  929. NarrowFourUtf16CharsToAsciiAndWriteToBuffer(ref pAsciiBuffer[currentOffset], utf16Data64Bits);
  930. }
  931. else
  932. {
  933. utf16Data32BitsHigh = Unsafe.ReadUnaligned<uint>(pUtf16Buffer + currentOffset);
  934. utf16Data32BitsLow = Unsafe.ReadUnaligned<uint>(pUtf16Buffer + currentOffset + 4 / sizeof(char));
  935. if (!AllCharsInUInt32AreAscii(utf16Data32BitsHigh | utf16Data32BitsLow))
  936. {
  937. goto FoundNonAsciiDataIn64BitRead;
  938. }
  939. NarrowTwoUtf16CharsToAsciiAndWriteToBuffer(ref pAsciiBuffer[currentOffset], utf16Data32BitsHigh);
  940. NarrowTwoUtf16CharsToAsciiAndWriteToBuffer(ref pAsciiBuffer[currentOffset + 2], utf16Data32BitsLow);
  941. }
  942. currentOffset += 4;
  943. } while (currentOffset <= finalOffsetWhereCanLoop);
  944. }
  // The loop above only ever consumes multiples of 4 elements, so bits 1 and 0 of the original
  // remainingElementCount still tell us whether a 2-element and/or a 1-element tail remains.
  945. // Try to narrow 32 bits -> 16 bits.
  946. if (((uint)remainingElementCount & 2) != 0)
  947. {
  948. utf16Data32BitsHigh = Unsafe.ReadUnaligned<uint>(pUtf16Buffer + currentOffset);
  949. if (!AllCharsInUInt32AreAscii(utf16Data32BitsHigh))
  950. {
  951. goto FoundNonAsciiDataInHigh32Bits;
  952. }
  953. NarrowTwoUtf16CharsToAsciiAndWriteToBuffer(ref pAsciiBuffer[currentOffset], utf16Data32BitsHigh);
  954. currentOffset += 2;
  955. }
  956. // Try to narrow 16 bits -> 8 bits.
  957. if (((uint)remainingElementCount & 1) != 0)
  958. {
  // A single char, widened into the scratch DWORD.
  959. utf16Data32BitsHigh = pUtf16Buffer[currentOffset];
  960. if (utf16Data32BitsHigh <= 0x007Fu)
  961. {
  962. pAsciiBuffer[currentOffset] = (byte)utf16Data32BitsHigh;
  963. currentOffset++;
  964. }
  // A non-ASCII final char is simply left unconverted; currentOffset is not advanced past it.
  965. }
  966. Finish:
  967. return currentOffset;
  // Reached when a four-char read contained non-ASCII data. Every jump here happens before the
  // offset is advanced, so currentOffset still refers to the first char of that read.
  968. FoundNonAsciiDataIn64BitRead:
  969. if (IntPtr.Size >= 8)
  970. {
  971. // Try checking the first 32 bits of the buffer for non-ASCII data.
  972. // Regardless, we'll move the non-ASCII data into the utf16Data32BitsHigh local.
  973. if (BitConverter.IsLittleEndian)
  974. {
  975. utf16Data32BitsHigh = (uint)utf16Data64Bits;
  976. }
  977. else
  978. {
  979. utf16Data32BitsHigh = (uint)(utf16Data64Bits >> 32);
  980. }
  981. if (AllCharsInUInt32AreAscii(utf16Data32BitsHigh))
  982. {
  // The first two chars are ASCII: emit them, then switch to the other half of the QWORD,
  // which must be the half that holds the non-ASCII char.
  983. NarrowTwoUtf16CharsToAsciiAndWriteToBuffer(ref pAsciiBuffer[currentOffset], utf16Data32BitsHigh);
  984. if (BitConverter.IsLittleEndian)
  985. {
  986. utf16Data32BitsHigh = (uint)(utf16Data64Bits >> 32);
  987. }
  988. else
  989. {
  990. utf16Data32BitsHigh = (uint)utf16Data64Bits;
  991. }
  992. currentOffset += 2;
  993. }
  994. }
  995. else
  996. {
  997. // Need to determine if the high or the low 32-bit value contained non-ASCII data.
  998. // Regardless, we'll move the non-ASCII data into the utf16Data32BitsHigh local.
  999. if (AllCharsInUInt32AreAscii(utf16Data32BitsHigh))
  1000. {
  1001. NarrowTwoUtf16CharsToAsciiAndWriteToBuffer(ref pAsciiBuffer[currentOffset], utf16Data32BitsHigh);
  1002. utf16Data32BitsHigh = utf16Data32BitsLow;
  1003. currentOffset += 2;
  1004. }
  1005. }
  // From here on, utf16Data32BitsHigh holds two chars of which at least one is non-ASCII.
  1006. FoundNonAsciiDataInHigh32Bits:
  1007. Debug.Assert(!AllCharsInUInt32AreAscii(utf16Data32BitsHigh), "Shouldn't have reached this point if we have an all-ASCII input.");
  1008. // There's at most one char that needs to be drained.
  // FirstCharInUInt32IsAscii is defined elsewhere in this file; presumably it tests the char
  // at the lower address of the pair - confirm against its definition.
  1009. if (FirstCharInUInt32IsAscii(utf16Data32BitsHigh))
  1010. {
  1011. if (!BitConverter.IsLittleEndian)
  1012. {
  1013. utf16Data32BitsHigh >>= 16; // move high char down to low char
  1014. }
  // The cast keeps only the low byte, which at this point is the first char's ASCII value.
  1015. pAsciiBuffer[currentOffset] = (byte)utf16Data32BitsHigh;
  1016. currentOffset++;
  1017. }
  1018. goto Finish;
  1019. }
  /// <summary>
  /// SSE2 / SSE4.1 fast path used by <see cref="NarrowUtf16ToAscii"/>: narrows ASCII chars from
  /// <paramref name="pUtf16Buffer"/> into <paramref name="pAsciiBuffer"/> a vector at a time,
  /// switching to aligned destination stores once the write position reaches a vector boundary.
  /// </summary>
  /// <param name="pUtf16Buffer">Source UTF-16 data; every load from it is an unaligned load.</param>
  /// <param name="pAsciiBuffer">Destination for the narrowed bytes.</param>
  /// <param name="elementCount">
  /// Number of elements available; asserted to be at least twice the size in bytes of a 128-bit vector.
  /// </param>
  /// <returns>
  /// The number of leading elements that were narrowed and written. This is 0 when the first 8 chars
  /// contain non-ASCII data, and it can be less than <paramref name="elementCount"/> even when all
  /// input is ASCII, because trailing elements are left for the caller to drain.
  /// </returns>
  1020. private static unsafe nuint NarrowUtf16ToAscii_Sse2(char* pUtf16Buffer, byte* pAsciiBuffer, nuint elementCount)
  1021. {
  1022. // This method contains logic optimized for both SSE2 and SSE41. Much of the logic in this method
  1023. // will be elided by JIT once we determine which specific ISAs we support.
  1024. // JIT turns the below into constants
  1025. uint SizeOfVector128 = (uint)Unsafe.SizeOf<Vector128<byte>>();
  // Low-bits mask used to get an address's byte offset within a vector-sized block.
  1026. nuint MaskOfAllBitsInVector128 = (nuint)(SizeOfVector128 - 1);
  1027. // This method is written such that control generally flows top-to-bottom, avoiding
  1028. // jumps as much as possible in the optimistic case of "all ASCII". If we see non-ASCII
  1029. // data, we jump out of the hot paths to targets at the end of the method.
  1030. Debug.Assert(Sse2.IsSupported);
  1031. Debug.Assert(BitConverter.IsLittleEndian);
  1032. Debug.Assert(elementCount >= 2 * SizeOfVector128);
  1033. Vector128<short> asciiMaskForPTEST = Vector128.Create(unchecked((short)0xFF80)); // used for PTEST on supported hardware
  1034. Vector128<short> asciiMaskForPXOR = Vector128.Create(unchecked((short)0x8000)); // used for PXOR
  1035. Vector128<short> asciiMaskForPCMPGTW = Vector128.Create(unchecked((short)0x807F)); // used for PCMPGTW
  1036. // First, perform an unaligned read of the first part of the input buffer.
  1037. Vector128<short> utf16VectorFirst = Sse2.LoadVector128((short*)pUtf16Buffer); // unaligned load
  1038. // If there's non-ASCII data in the first 8 elements of the vector, there's nothing we can do.
  1039. // See comments in GetIndexOfFirstNonAsciiChar_Sse2 for information about how this works.
  1040. if (Sse41.IsSupported)
  1041. {
  1042. if (!Sse41.TestZ(utf16VectorFirst, asciiMaskForPTEST))
  1043. {
  1044. return 0;
  1045. }
  1046. }
  1047. else
  1048. {
  1049. if (Sse2.MoveMask(Sse2.CompareGreaterThan(Sse2.Xor(utf16VectorFirst, asciiMaskForPXOR), asciiMaskForPCMPGTW).AsByte()) != 0)
  1050. {
  1051. return 0;
  1052. }
  1053. }
  1054. // Turn the 8 ASCII chars we just read into 8 ASCII bytes, then copy it to the destination.
  // The same vector is passed as both operands; only 8 bytes of the packed result are stored.
  1055. Vector128<byte> asciiVector = Sse2.PackUnsignedSaturate(utf16VectorFirst, utf16VectorFirst);
  1056. Sse2.StoreScalar((ulong*)pAsciiBuffer, asciiVector.AsUInt64()); // ulong* calculated here is UNALIGNED
  1057. nuint currentOffsetInElements = SizeOfVector128 / 2; // we processed 8 elements so far
  1058. // We're going to get the best performance when we have aligned writes, so we'll take the
  1059. // hit of potentially unaligned reads in order to hit this sweet spot.
  1060. // pAsciiBuffer points to the start of the destination buffer, immediately before where we wrote
  1061. // the 8 bytes previously. If the 0x08 bit is set at the pinned address, then the 8 bytes we wrote
  1062. // previously mean that the 0x08 bit is *not* set at address &pAsciiBuffer[SizeOfVector128 / 2]. In
  1063. // that case we can immediately back up to the previous aligned boundary and start the main loop.
  1064. // If the 0x08 bit is *not* set at the pinned address, then it means the 0x08 bit *is* set at
  1065. // address &pAsciiBuffer[SizeOfVector128 / 2], and we should perform one more 8-byte write to bump
  1066. // just past the next aligned boundary address.
  // Only the 0x08 bit of the address is inspected, so the (uint) cast's truncation of the pointer
  // does not affect the result of this test.
  1067. if (((uint)pAsciiBuffer & (SizeOfVector128 / 2)) == 0)
  1068. {
  1069. // We need to perform one more partial vector write before we can get the alignment we want.
  1070. utf16VectorFirst = Sse2.LoadVector128((short*)pUtf16Buffer + currentOffsetInElements); // unaligned load
  1071. // See comments earlier in this method for information about how this works.
  1072. if (Sse41.IsSupported)
  1073. {
  1074. if (!Sse41.TestZ(utf16VectorFirst, asciiMaskForPTEST))
  1075. {
  // currentOffsetInElements is still SizeOfVector128 / 2 here, so the 8 elements already written are reported.
  1076. goto Finish;
  1077. }
  1078. }
  1079. else
  1080. {
  1081. if (Sse2.MoveMask(Sse2.CompareGreaterThan(Sse2.Xor(utf16VectorFirst, asciiMaskForPXOR), asciiMaskForPCMPGTW).AsByte()) != 0)
  1082. {
  1083. goto Finish;
  1084. }
  1085. }
  1086. // Turn the 8 ASCII chars we just read into 8 ASCII bytes, then copy it to the destination.
  1087. asciiVector = Sse2.PackUnsignedSaturate(utf16VectorFirst, utf16VectorFirst);
  1088. Sse2.StoreScalar((ulong*)(pAsciiBuffer + currentOffsetInElements), asciiVector.AsUInt64()); // ulong* calculated here is UNALIGNED
  1089. }
  1090. // Calculate how many elements we wrote in order to get pAsciiBuffer to its next alignment
  1091. // point, then use that as the base offset going forward.
  // This offset is the distance from pAsciiBuffer to the next vector boundary. It never exceeds the
  // 8 or 16 bytes already written above, so the first loop iteration may re-narrow and overwrite
  // some elements that were already written (with the same values).
  1092. currentOffsetInElements = SizeOfVector128 - ((nuint)pAsciiBuffer & MaskOfAllBitsInVector128);
  1093. Debug.Assert(0 < currentOffsetInElements && currentOffsetInElements <= SizeOfVector128, "We wrote at least 1 byte but no more than a whole vector.");
  1094. Debug.Assert(currentOffsetInElements <= elementCount, "Shouldn't have overrun the destination buffer.");
  1095. Debug.Assert(elementCount - currentOffsetInElements >= SizeOfVector128, "We should be able to run at least one whole vector.");
  // Each loop iteration consumes SizeOfVector128 elements, so this is the last offset at which
  // a full iteration still fits inside the input.
  1096. nuint finalOffsetWhereCanRunLoop = elementCount - SizeOfVector128;
  1097. do
  1098. {
  1099. // In a loop, perform two unaligned reads, narrow to a single vector, then aligned write one vector.
  1100. utf16VectorFirst = Sse2.LoadVector128((short*)pUtf16Buffer + currentOffsetInElements); // unaligned load
  1101. Vector128<short> utf16VectorSecond = Sse2.LoadVector128((short*)pUtf16Buffer + currentOffsetInElements + SizeOfVector128 / sizeof(short)); // unaligned load
  // OR the two vectors so that one test covers all of the chars read in this iteration.
  1102. Vector128<short> combinedVector = Sse2.Or(utf16VectorFirst, utf16VectorSecond);
  1103. // See comments in GetIndexOfFirstNonAsciiChar_Sse2 for information about how this works.
  1104. if (Sse41.IsSupported)
  1105. {
  1106. if (!Sse41.TestZ(combinedVector, asciiMaskForPTEST))
  1107. {
  1108. goto FoundNonAsciiDataInLoop;
  1109. }
  1110. }
  1111. else
  1112. {
  1113. if (Sse2.MoveMask(Sse2.CompareGreaterThan(Sse2.Xor(combinedVector, asciiMaskForPXOR), asciiMaskForPCMPGTW).AsByte()) != 0)
  1114. {
  1115. goto FoundNonAsciiDataInLoop;
  1116. }
  1117. }
  1118. // Build up the ASCII vector and perform the store.
  1119. asciiVector = Sse2.PackUnsignedSaturate(utf16VectorFirst, utf16VectorSecond);
  1120. Debug.Assert(((nuint)pAsciiBuffer + currentOffsetInElements) % SizeOfVector128 == 0, "Write should be aligned.");
  1121. Sse2.StoreAligned(pAsciiBuffer + currentOffsetInElements, asciiVector); // aligned
  1122. currentOffsetInElements += SizeOfVector128;
  1123. } while (currentOffsetInElements <= finalOffsetWhereCanRunLoop);
  1124. Finish:
  1125. // There might be some ASCII data left over. That's fine - we'll let our caller handle the final drain.
  1126. return currentOffsetInElements;
  1127. FoundNonAsciiDataInLoop:
  1128. // Can we at least narrow the first vector (the lower-addressed half of this iteration's input)?
  1129. // See comments in GetIndexOfFirstNonAsciiChar_Sse2 for information about how this works.
  1130. if (Sse41.IsSupported)
  1131. {
  1132. if (!Sse41.TestZ(utf16VectorFirst, asciiMaskForPTEST))
  1133. {
  1134. goto Finish; // found non-ASCII data
  1135. }
  1136. }
  1137. else
  1138. {
  1139. if (Sse2.MoveMask(Sse2.CompareGreaterThan(Sse2.Xor(utf16VectorFirst, asciiMaskForPXOR), asciiMaskForPCMPGTW).AsByte()) != 0)
  1140. {
  1141. goto Finish; // found non-ASCII data
  1142. }
  1143. }
  1144. // First part was all ASCII, narrow and aligned write. Note we're only filling in the low half of the vector.
  1145. asciiVector = Sse2.PackUnsignedSaturate(utf16VectorFirst, utf16VectorFirst);
  1146. Debug.Assert(((nuint)pAsciiBuffer + currentOffsetInElements) % sizeof(ulong) == 0, "Destination should be ulong-aligned.");
  1147. Sse2.StoreScalar((ulong*)(pAsciiBuffer + currentOffsetInElements), asciiVector.AsUInt64()); // ulong* calculated here is aligned
  // Report the 8 elements just written; the non-ASCII char lies somewhere in the second vector.
  1148. currentOffsetInElements += SizeOfVector128 / 2;
  1149. goto Finish;
  1150. }
  /// <summary>
  /// Copies as many ASCII bytes (00..7F) as possible from <paramref name="pAsciiBuffer"/>
  /// to <paramref name="pUtf16Buffer"/>, stopping when the first non-ASCII byte is encountered
  /// or once <paramref name="elementCount"/> elements have been converted. Returns the total number
  /// of elements that were able to be converted.
  /// </summary>
  /// <param name="pAsciiBuffer">Pointer to the first source byte.</param>
  /// <param name="pUtf16Buffer">Pointer to the first destination char; up to <paramref name="elementCount"/> chars may be written.</param>
  /// <param name="elementCount">Maximum number of elements to convert.</param>
  /// <returns>The number of leading elements that were ASCII and were widened into the destination.</returns>
  public static unsafe nuint WidenAsciiToUtf16(byte* pAsciiBuffer, char* pUtf16Buffer, nuint elementCount)
  {
      nuint currentOffset = 0;

      // If SSE2 is supported, use those specific intrinsics instead of the generic vectorized
      // code below. This has two benefits: (a) we can take advantage of specific instructions like
      // pmovmskb which we know are optimized, and (b) we can avoid downclocking the processor while
      // this method is running.

      if (Sse2.IsSupported)
      {
          // The SSE2 helper requires at least two full vectors of input.
          if (elementCount >= 2 * (uint)Unsafe.SizeOf<Vector128<byte>>())
          {
              currentOffset = WidenAsciiToUtf16_Sse2(pAsciiBuffer, pUtf16Buffer, elementCount);
          }
      }
      else if (Vector.IsHardwareAccelerated)
      {
          uint SizeOfVector = (uint)Unsafe.SizeOf<Vector<byte>>(); // JIT will make this a const

          // Only bother vectorizing if we have enough data to do so.
          if (elementCount >= SizeOfVector)
          {
              // Note use of SBYTE instead of BYTE below; we're using the two's-complement
              // representation of negative integers to act as a surrogate for "is ASCII?".

              nuint finalOffsetWhereCanLoop = elementCount - SizeOfVector;
              do
              {
                  Vector<sbyte> asciiVector = Unsafe.ReadUnaligned<Vector<sbyte>>(pAsciiBuffer + currentOffset);
                  if (Vector.LessThanAny(asciiVector, Vector<sbyte>.Zero))
                  {
                      break; // found non-ASCII data
                  }

                  Vector.Widen(Vector.AsVectorByte(asciiVector), out Vector<ushort> utf16LowVector, out Vector<ushort> utf16HighVector);

                  // TODO: Is the below logic also valid for big-endian platforms?
                  Unsafe.WriteUnaligned<Vector<ushort>>(pUtf16Buffer + currentOffset, utf16LowVector);
                  Unsafe.WriteUnaligned<Vector<ushort>>(pUtf16Buffer + currentOffset + Vector<ushort>.Count, utf16HighVector);

                  currentOffset += SizeOfVector;
              } while (currentOffset <= finalOffsetWhereCanLoop);
          }
      }

      Debug.Assert(currentOffset <= elementCount);
      nuint remainingElementCount = elementCount - currentOffset;

      // Try to widen 32 bits -> 64 bits at a time.
      // We needn't update remainingElementCount after this point: the 2- and 1-element
      // steps below only inspect its two low bits, which the 4-element loop leaves intact.

      uint asciiData;

      if (remainingElementCount >= 4)
      {
          nuint finalOffsetWhereCanLoop = currentOffset + remainingElementCount - 4;
          do
          {
              asciiData = Unsafe.ReadUnaligned<uint>(pAsciiBuffer + currentOffset);
              if (!AllBytesInUInt32AreAscii(asciiData))
              {
                  goto FoundNonAsciiData;
              }

              WidenFourAsciiBytesToUtf16AndWriteToBuffer(ref pUtf16Buffer[currentOffset], asciiData);
              currentOffset += 4;
          } while (currentOffset <= finalOffsetWhereCanLoop);
      }

      // Try to widen 16 bits -> 32 bits.

      if (((uint)remainingElementCount & 2) != 0)
      {
          asciiData = Unsafe.ReadUnaligned<ushort>(pAsciiBuffer + currentOffset);
          if (!AllBytesInUInt32AreAscii(asciiData))
          {
              if (!BitConverter.IsLittleEndian)
              {
                  // The big-endian drain loop consumes from the most significant byte,
                  // so move the two bytes we read into the top half of the register.
                  asciiData <<= 16;
              }

              goto FoundNonAsciiData;
          }

          if (BitConverter.IsLittleEndian)
          {
              pUtf16Buffer[currentOffset] = (char)(byte)asciiData;
              pUtf16Buffer[currentOffset + 1] = (char)(asciiData >> 8);
          }
          else
          {
              pUtf16Buffer[currentOffset + 1] = (char)(byte)asciiData;
              pUtf16Buffer[currentOffset] = (char)(asciiData >> 8);
          }

          currentOffset += 2;
      }

      // Try to widen 8 bits -> 16 bits.

      if (((uint)remainingElementCount & 1) != 0)
      {
          asciiData = pAsciiBuffer[currentOffset];
          if (((byte)asciiData & 0x80) != 0)
          {
              goto Finish;
          }

          pUtf16Buffer[currentOffset] = (char)asciiData;
          currentOffset += 1;
      }

  Finish:

      return currentOffset;

  FoundNonAsciiData:

      Debug.Assert(!AllBytesInUInt32AreAscii(asciiData), "Shouldn't have reached this point if we have an all-ASCII input.");

      // Drain ASCII bytes one at a time, in the order they appear in memory. The loops
      // terminate because at least one byte in asciiData has its high bit set.

      if (BitConverter.IsLittleEndian)
      {
          // First byte in memory is the least significant byte.
          while (((byte)asciiData & 0x80) == 0)
          {
              pUtf16Buffer[currentOffset] = (char)(byte)asciiData;
              currentOffset += 1;
              asciiData >>= 8;
          }
      }
      else
      {
          // First byte in memory is the most significant byte.
          while ((asciiData & 0x80000000u) == 0)
          {
              pUtf16Buffer[currentOffset] = (char)(asciiData >> 24);
              currentOffset += 1;
              asciiData <<= 8;
          }
      }

      goto Finish;
  }
  /// <summary>
  /// SSE2 implementation of ASCII-to-UTF-16 widening. Widens leading ASCII bytes from
  /// <paramref name="pAsciiBuffer"/> into <paramref name="pUtf16Buffer"/> in vector-sized steps
  /// and returns how many elements were widened. The return value may be smaller than the full
  /// ASCII prefix: the vector loop stops once fewer than a full vector of input remains past the
  /// current offset, and a non-ASCII byte is only localized to an 8-byte half of a vector.
  /// </summary>
  /// <param name="pAsciiBuffer">Pointer to the first source byte.</param>
  /// <param name="pUtf16Buffer">Pointer to the first destination char.</param>
  /// <param name="elementCount">Number of available elements; asserted to be at least two 128-bit vectors' worth of bytes.</param>
  /// <returns>The number of elements widened, or 0 if any of the first 8 bytes is non-ASCII.</returns>
  private static unsafe nuint WidenAsciiToUtf16_Sse2(byte* pAsciiBuffer, char* pUtf16Buffer, nuint elementCount)
  {
      // Both of these fold to constants under the JIT.
      uint vectorByteCount = (uint)Unsafe.SizeOf<Vector128<byte>>();
      nuint vectorAlignmentMask = (nuint)(vectorByteCount - 1);

      Debug.Assert(Sse2.IsSupported);
      Debug.Assert(BitConverter.IsLittleEndian);
      Debug.Assert(elementCount >= 2 * vectorByteCount);

      // Strategy: writes are kept aligned for throughput; reads are allowed to be unaligned.

      // Step 1: unaligned load of the first vector; only its low 8 bytes matter here.
      Vector128<byte> source = Sse2.LoadVector128(pAsciiBuffer);
      uint nonAsciiMask = (uint)Sse2.MoveMask(source);

      if ((byte)nonAsciiMask != 0)
      {
          // A non-ASCII byte sits within the first 8 elements; nothing is widened.
          return 0;
      }

      // Step 2: unaligned store of the first 8 widened chars.
      Vector128<byte> zero = Vector128<byte>.Zero;
      Sse2.Store((byte*)pUtf16Buffer, Sse2.UnpackLow(source, zero));

      // Step 3: advance only as far as the destination's next vector-aligned address. The shifts
      // by 1 convert byte quantities into char counts. Some input may be read (and written) twice
      // as a result, which is harmless.
      nuint charsPastAlignment = ((nuint)pUtf16Buffer >> 1) & (vectorAlignmentMask >> 1);
      nuint offset = (vectorByteCount >> 1) - charsPastAlignment;
      Debug.Assert(0 < offset && offset <= vectorByteCount / sizeof(char));

      nuint lastLoopOffset = elementCount - vectorByteCount;

      do
      {
          // Unaligned read of one vector of input, then two aligned writes of widened output.
          source = Sse2.LoadVector128(pAsciiBuffer + offset);
          nonAsciiMask = (uint)Sse2.MoveMask(source);

          if (nonAsciiMask != 0)
          {
              // Non-ASCII data somewhere in this vector. If the low 8 bytes are clean,
              // widen just that half before giving up.
              if ((byte)nonAsciiMask == 0)
              {
                  Sse2.StoreAligned((byte*)(pUtf16Buffer + offset), Sse2.UnpackLow(source, zero));
                  offset += vectorByteCount / 2;
              }

              return offset;
          }

          byte* destination = (byte*)(pUtf16Buffer + offset);
          Sse2.StoreAligned(destination, Sse2.UnpackLow(source, zero));
          Sse2.StoreAligned(destination + vectorByteCount, Sse2.UnpackHigh(source, zero));

          offset += vectorByteCount;
      } while (offset <= lastLoopOffset);

      return offset;
  }
  /// <summary>
  /// Expands the four ASCII bytes packed into <paramref name="value"/> to four UTF-16 code units
  /// and stores them at <paramref name="outputBuffer"/> and the three chars that follow it, using
  /// the machine's native byte order.
  /// </summary>
  /// <param name="outputBuffer">Reference to the first of four destination chars.</param>
  /// <param name="value">Four ASCII bytes; asserted (debug only) to all be in the range 00..7F.</param>
  [MethodImpl(MethodImplOptions.AggressiveInlining)]
  private static void WidenFourAsciiBytesToUtf16AndWriteToBuffer(ref char outputBuffer, uint value)
  {
      Debug.Assert(AllBytesInUInt32AreAscii(value));

      if (Bmi2.X64.IsSupported)
      {
          // PDEP scatters each source byte into the low byte of a 16-bit lane, producing all
          // four chars in one 64-bit value. This is correct regardless of endianness.
          ulong widened = Bmi2.X64.ParallelBitDeposit(value, 0x00FF00FF_00FF00FFul);
          Unsafe.WriteUnaligned(ref Unsafe.As<char, byte>(ref outputBuffer), widened);
          return;
      }

      if (BitConverter.IsLittleEndian)
      {
          // Least significant byte goes to the first char.
          outputBuffer = (char)(byte)value;
          Unsafe.Add(ref outputBuffer, 1) = (char)(byte)(value >> 8);
          Unsafe.Add(ref outputBuffer, 2) = (char)(byte)(value >> 16);
          Unsafe.Add(ref outputBuffer, 3) = (char)(value >> 24);
      }
      else
      {
          // Least significant byte goes to the last char.
          Unsafe.Add(ref outputBuffer, 3) = (char)(byte)value;
          Unsafe.Add(ref outputBuffer, 2) = (char)(byte)(value >> 8);
          Unsafe.Add(ref outputBuffer, 1) = (char)(byte)(value >> 16);
          outputBuffer = (char)(value >> 24);
      }
  }
  1359. }
  1360. }