ASCIIUtility.cs 75 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
712781279128012811282128312841285128612871288128912901291129212931294129512961297129812991300130113021303130413051306130713081309131013111312131313141315131613171318131913201321132213231324132513261327132813291330133113321333133413351336133713381339134013411342134313441345134613471348134913501351135213531354135513561357135813591360136113621363136413651366136713681369137013711372137313741375137613771378137913801381138213831384138513861387138813891390139113921393139413951396139713981399140014011402140314041405140614071408140914101411141214131414141514161417141814191420142114221423142414251426142714281429143014311432143314341435143614371438143914401441144214431444144514461447144814491450145114521453145414551456145714581459146014611462146314641465146614671468146914701471147214731474147514761477147814791480148114821483148414851486148714881489149014911492149314941495149614971498149915001501150215031504150515061507150815091510151115121513151415151516151715181519152015211522152315241525152615271528152915301531153215331534153515361537153815391540154115421543154415451546154715481549155015511552155315541555155615571558155915601561156215631564156515661567156815691570157115721573157415751576157715781579158015811582158315841585158615871588158915901591159215931594159515961597159815991600160116021603160416051606160716081609161016111612161316141615161616171618161916201621162216231624162516261627162816291630163116321633163416351636163716381639164016411642164316441645164616471648164916501651165216531654165516561657165816591660166116621663166416651666166716681669167016711672167316741675167616771678167916801681168216831684168516861687168816891690169116921693169416951696169716981699170017011702170317041705170617071708170917101711171217131714171517161717171817191720172117221723172417251726172717281729173017311732
  1. // Licensed to the .NET Foundation under one or more agreements.
  2. // The .NET Foundation licenses this file to you under the MIT license.
  3. // See the LICENSE file in the project root for more information.
  4. using System.Diagnostics;
  5. using System.Numerics;
  6. using System.Runtime.CompilerServices;
  7. using System.Runtime.Intrinsics;
  8. using System.Runtime.Intrinsics.X86;
  9. using Internal.Runtime.CompilerServices;
  10. #pragma warning disable SA1121 // explicitly using type aliases instead of built-in types
  11. #if BIT64
  12. using nint = System.Int64;
  13. using nuint = System.UInt64;
  14. #else // BIT64
  15. using nint = System.Int32;
  16. using nuint = System.UInt32;
  17. #endif // BIT64
  18. namespace System.Text
  19. {
  20. internal static partial class ASCIIUtility
  21. {
  #if DEBUG
  /// <summary>
  /// Debug-only type initializer which sanity-checks the <c>nint</c> / <c>nuint</c>
  /// aliases declared at the top of this file.
  /// </summary>
  static ASCIIUtility()
  {
      // Each alias must be exactly pointer-sized. In addition, nint must be a signed
      // type (its minimum is negative) and nuint an unsigned one (its minimum is zero).
      bool nintLooksRight = nint.MinValue < 0 && sizeof(nint) == IntPtr.Size;
      bool nuintLooksRight = nuint.MinValue == 0 && sizeof(nuint) == IntPtr.Size;

      Debug.Assert(nintLooksRight, "nint is defined incorrectly.");
      Debug.Assert(nuintLooksRight, "nuint is defined incorrectly.");
  }
  #endif // DEBUG
  /// <summary>
  /// Returns <see langword="true"/> iff all bytes packed into <paramref name="value"/> are ASCII.
  /// </summary>
  [MethodImpl(MethodImplOptions.AggressiveInlining)]
  private static bool AllBytesInUInt64AreAscii(ulong value)
  {
      // A byte is non-ASCII exactly when its high bit is set, so the input is
      // all-ASCII when nothing survives masking with the high-bits-only constant.
      ulong survivingHighBits = value & UInt64HighBitsOnlyMask;
      return survivingHighBits == 0;
  }
  /// <summary>
  /// Determines whether both 16-bit chars packed into <paramref name="value"/> are ASCII.
  /// </summary>
  /// <param name="value">Two UTF-16 code units packed into a 32-bit value.</param>
  /// <returns><see langword="true"/> iff neither char has any bit set outside of 0x007F.</returns>
  [MethodImpl(MethodImplOptions.AggressiveInlining)]
  private static bool AllCharsInUInt32AreAscii(uint value)
  {
      // 0xFF80FF80 is the complement of 0x007F007F: every bit that an ASCII char may not have set.
      const uint NonAsciiBits = 0xFF80FF80u;
      return (value & NonAsciiBits) == 0;
  }
  /// <summary>
  /// Determines whether all four 16-bit chars packed into <paramref name="value"/> are ASCII.
  /// </summary>
  /// <param name="value">Four UTF-16 code units packed into a 64-bit value.</param>
  /// <returns><see langword="true"/> iff no char has any bit set outside of 0x007F.</returns>
  [MethodImpl(MethodImplOptions.AggressiveInlining)]
  private static bool AllCharsInUInt64AreAscii(ulong value)
  {
      // Complement of 0x007F007F_007F007F: every bit that an ASCII char may not have set.
      const ulong NonAsciiBits = 0xFF80FF80_FF80FF80ul;
      return (value & NonAsciiBits) == 0;
  }
  /// <summary>
  /// Given a DWORD which represents two packed chars in machine-endian order, returns
  /// <see langword="true"/> iff the first char (in machine-endian order) is ASCII.
  /// The second char is not inspected.
  /// </summary>
  /// <param name="value">Two UTF-16 code units packed into a 32-bit value.</param>
  /// <returns><see langword="true"/> iff the first char has no bit set outside of 0x007F.</returns>
  private static bool FirstCharInUInt32IsAscii(uint value)
  {
      // On little-endian machines the first char occupies the low 16 bits of the DWORD;
      // on big-endian machines it occupies the high 16 bits.
      uint firstCharNonAsciiBits = BitConverter.IsLittleEndian ? 0x0000FF80u : 0xFF800000u;
      return (value & firstCharNonAsciiBits) == 0;
  }
  /// <summary>
  /// Returns the index in <paramref name="pBuffer"/> where the first non-ASCII byte is found.
  /// Returns <paramref name="bufferLength"/> if the buffer is empty or all-ASCII.
  /// </summary>
  /// <param name="pBuffer">Pointer to the first byte to inspect.</param>
  /// <param name="bufferLength">Number of bytes available at <paramref name="pBuffer"/>.</param>
  /// <returns>The number of leading ASCII bytes in the buffer.</returns>
  /// <remarks>An ASCII byte is defined as 0x00 - 0x7F, inclusive.</remarks>
  [MethodImpl(MethodImplOptions.AggressiveInlining)]
  public static unsafe nuint GetIndexOfFirstNonAsciiByte(byte* pBuffer, nuint bufferLength)
  {
      // Prefer the SSE2-specific routine over the generic vectorized one whenever the hardware
      // allows it: (a) it can lean on instructions such as pmovmskb which are known to be well
      // optimized, and (b) it avoids downclocking the processor while this method runs.
      if (Sse2.IsSupported)
      {
          return GetIndexOfFirstNonAsciiByte_Sse2(pBuffer, bufferLength);
      }
      else
      {
          return GetIndexOfFirstNonAsciiByte_Default(pBuffer, bufferLength);
      }
  }
  /// <summary>
  /// Portable implementation of <see cref="GetIndexOfFirstNonAsciiByte"/> which uses
  /// <see cref="Vector{T}"/> when it is hardware accelerated and falls back to scalar
  /// 32-bit / 16-bit / 8-bit reads to drain whatever remains.
  /// </summary>
  /// <param name="pBuffer">Pointer to the first byte to inspect.</param>
  /// <param name="bufferLength">Number of bytes available at <paramref name="pBuffer"/>.</param>
  /// <returns>The number of leading ASCII bytes in the buffer.</returns>
  private static unsafe nuint GetIndexOfFirstNonAsciiByte_Default(byte* pBuffer, nuint bufferLength)
  {
      // The result is computed at the very end as the distance between the advanced
      // pointer and this starting position, so keep the starting position around.
      byte* pStart = pBuffer;

      // Attempt a generic vectorized scan first, but only when at least two full vectors fit.
      // The data is read as SBYTE rather than BYTE: in two's complement a byte with its high
      // bit set is negative, so "any element < 0" is equivalent to "contains non-ASCII data".
      if (Vector.IsHardwareAccelerated && bufferLength >= 2 * (uint)Vector<sbyte>.Count)
      {
          uint vectorByteCount = (uint)Vector<sbyte>.Count; // JIT will make this a const

          if (Vector.GreaterThanOrEqualAll(Unsafe.ReadUnaligned<Vector<sbyte>>(pBuffer), Vector<sbyte>.Zero))
          {
              // The first (unaligned) vector was entirely ASCII. Round the pointer up to the next
              // vector-aligned address and use aligned reads from there until non-ASCII data turns
              // up or the end of the buffer is near. Some bytes may be examined twice; that's fine.
              byte* pLastVectorStart = pBuffer + bufferLength - vectorByteCount;
              pBuffer = (byte*)(((nuint)pBuffer + vectorByteCount) & ~(nuint)(vectorByteCount - 1));

#if DEBUG
              long bytesSkipped = pBuffer - pStart;
              Debug.Assert(0 < bytesSkipped && bytesSkipped <= vectorByteCount, "We should've made forward progress of at least one byte.");
              Debug.Assert((nuint)bytesSkipped <= bufferLength, "We shouldn't have read past the end of the input buffer.");
#endif

              Debug.Assert(pBuffer <= pLastVectorStart, "Should be able to read at least one vector.");

              do
              {
                  Debug.Assert((nuint)pBuffer % vectorByteCount == 0, "Vector read should be aligned.");
                  if (Vector.LessThanAny(Unsafe.Read<Vector<sbyte>>(pBuffer), Vector<sbyte>.Zero))
                  {
                      break; // this vector holds non-ASCII data; let the scalar code pinpoint it
                  }

                  pBuffer += vectorByteCount;
              } while (pBuffer <= pLastVectorStart);

              // Deduct everything consumed so far from the remaining length.
              bufferLength -= (nuint)pBuffer - (nuint)pStart;
          }
      }

      // Either the buffer was too small for the vectorized scan, or the scan stopped (on
      // non-ASCII data or near the end of the buffer). Drain the rest with scalar reads.
      //
      // These reads are unaligned, so 32-bit reads are preferred over 64-bit ones. That also
      // permits cheaper bit twiddling when counting the leading ASCII bytes of a DWORD.
      uint candidate;

      // Consume 64 bits (two DWORDs) per iteration for as long as possible.
      while (bufferLength >= 8)
      {
          candidate = Unsafe.ReadUnaligned<uint>(pBuffer);
          uint upperCandidate = Unsafe.ReadUnaligned<uint>(pBuffer + 4);

          if (!AllBytesInUInt32AreAscii(candidate | upperCandidate))
          {
              // At least one DWORD has non-ASCII data. If the first DWORD is clean, the
              // offender is the second one: step over the first and examine the second.
              if (AllBytesInUInt32AreAscii(candidate))
              {
                  candidate = upperCandidate;
                  pBuffer += 4;
              }

              goto NonAsciiFound;
          }

          pBuffer += 8; // consumed 8 ASCII bytes
          bufferLength -= 8;
      }

      // Fewer than 8 bytes remain, so the low three bits of bufferLength describe exactly
      // what is left; bufferLength itself no longer needs to be updated.

      // 4 bytes remaining?
      if ((bufferLength & 4) != 0)
      {
          candidate = Unsafe.ReadUnaligned<uint>(pBuffer);
          if (!AllBytesInUInt32AreAscii(candidate))
          {
              goto NonAsciiFound;
          }

          pBuffer += 4;
      }

      // 2 bytes remaining? (The WORD is zero-extended into the DWORD local.)
      if ((bufferLength & 2) != 0)
      {
          candidate = Unsafe.ReadUnaligned<ushort>(pBuffer);
          if (!AllBytesInUInt32AreAscii(candidate))
          {
              goto NonAsciiFound;
          }

          pBuffer += 2;
      }

      // 1 byte remaining?
      if ((bufferLength & 1) != 0)
      {
          // Only step past the byte when it is ASCII (non-negative when viewed as sbyte);
          // otherwise the pointer is left pointing at the offending byte.
          if (*(sbyte*)pBuffer >= 0)
          {
              pBuffer++;
          }
      }

  Done:

      return (nuint)pBuffer - (nuint)pStart;

  NonAsciiFound:

      Debug.Assert(!AllBytesInUInt32AreAscii(candidate), "Shouldn't have reached this point if we have an all-ASCII input.");

      // The helper below does not need to inspect the high byte: either an earlier byte is
      // non-ASCII and the count stops there, or all earlier bytes are ASCII and the high byte
      // must be the offender. Either way only the low 24 bits matter.
      pBuffer += CountNumberOfLeadingAsciiBytesFromUInt32WithSomeNonAsciiData(candidate);
      goto Done;
  }
  /// <summary>
  /// SSE2 implementation of <see cref="GetIndexOfFirstNonAsciiByte"/>. Buffers of at least one
  /// 128-bit vector are scanned with vector loads; shorter buffers are drained with scalar
  /// QWORD / DWORD / WORD / BYTE reads.
  /// </summary>
  /// <param name="pBuffer">Pointer to the first byte to inspect.</param>
  /// <param name="bufferLength">Number of bytes available at <paramref name="pBuffer"/>.</param>
  /// <returns>The number of leading ASCII bytes in the buffer.</returns>
  private static unsafe nuint GetIndexOfFirstNonAsciiByte_Sse2(byte* pBuffer, nuint bufferLength)
  {
      // JIT turns the below into constants
      uint vectorSize = (uint)Unsafe.SizeOf<Vector128<byte>>();
      nuint vectorOffsetMask = (nuint)(vectorSize - 1);

      Debug.Assert(Sse2.IsSupported, "Should've been checked by caller.");
      Debug.Assert(BitConverter.IsLittleEndian, "SSE2 assumes little-endian.");

      uint mask, upperMask;
      uint dword; // only used by the short-input (scalar) paths
      byte* pStart = pBuffer;

      // Control is arranged to flow top-to-bottom with as few taken jumps as possible in the
      // optimistic case (large buffer, all ASCII). As soon as non-ASCII data is observed we
      // jump to one of the targets placed after the main logic.

      if (bufferLength < vectorSize)
      {
          goto ShortInput; // not even one vector available; drain primitives instead
      }

      // First vector: unaligned load.
      mask = (uint)Sse2.MoveMask(Sse2.LoadVector128(pBuffer));
      if (mask != 0)
      {
          goto NonAsciiInMask;
      }

      // With fewer than two vectors of input there is nothing for the aligned loop to do;
      // go straight to the final unaligned read.
      if (bufferLength < 2 * vectorSize)
      {
          goto SkipVectorThenReadTail;
      }

      // Round the pointer up to the next vector boundary so that subsequent loads are aligned.
      pBuffer = (byte*)(((nuint)pBuffer + vectorSize) & ~(nuint)vectorOffsetMask);

#if DEBUG
      long bytesSkipped = pBuffer - pStart;
      Debug.Assert(0 < bytesSkipped && bytesSkipped <= vectorSize, "We should've made forward progress of at least one byte.");
      Debug.Assert((nuint)bytesSkipped <= bufferLength, "We shouldn't have read past the end of the input buffer.");
#endif

      // Deduct the bytes that were just stepped over from the remaining length.
      bufferLength -= (nuint)pBuffer - (nuint)pStart;

      // Aligned from here on. Process two vectors per iteration when possible.
      if (bufferLength >= 2 * vectorSize)
      {
          byte* pLastPairStart = (byte*)((nuint)pBuffer + bufferLength - 2 * vectorSize);

          // bufferLength is deliberately not maintained inside (or after) this loop.
          do
          {
              Vector128<byte> lower = Sse2.LoadAlignedVector128(pBuffer);
              Vector128<byte> upper = Sse2.LoadAlignedVector128(pBuffer + vectorSize);

              mask = (uint)Sse2.MoveMask(lower);
              upperMask = (uint)Sse2.MoveMask(upper);

              if ((mask | upperMask) != 0)
              {
                  goto NonAsciiInPair;
              }

              pBuffer += 2 * vectorSize;
          } while (pBuffer <= pLastPairStart);
      }

      // Between 0 and (2 * vector length) - 1 bytes remain. The loop above did not update
      // bufferLength, so its absolute value is stale, but its low bits are still accurate:
      // every iteration would have changed it by 2 * vectorSize, which cannot affect the
      // bits below that weight. Those low bits tell us what is left to drain.

      // The vectorSize bit is clear when less than one full vector remains; in that case
      // skip the next aligned read.
      if ((bufferLength & vectorSize) == 0)
      {
          goto ReadTail;
      }

      // A whole vector remains, and pBuffer is still aligned at this point.
      mask = (uint)Sse2.MoveMask(Sse2.LoadAlignedVector128(pBuffer));
      if (mask != 0)
      {
          goto NonAsciiInMask;
      }

  SkipVectorThenReadTail:

      pBuffer += vectorSize;

  ReadTail:

      if (((byte)bufferLength & vectorOffsetMask) != 0)
      {
          // A partial vector remains. Back the pointer up so that one final unaligned vector
          // load ends exactly at the end of the buffer; this re-reads some bytes that have
          // already been verified as ASCII.
          pBuffer += (bufferLength & vectorOffsetMask) - vectorSize;

          mask = (uint)Sse2.MoveMask(Sse2.LoadVector128(pBuffer)); // unaligned load
          if (mask != 0)
          {
              goto NonAsciiInMask;
          }

          pBuffer += vectorSize;
      }

  Done:

      return (nuint)pBuffer - (nuint)pStart;

  NonAsciiInPair:

      // One of the two masks from the inner loop is non-zero. If it isn't the first one,
      // the whole first vector was ASCII: step over it and continue with the second mask.
      if (mask == 0)
      {
          pBuffer += vectorSize;
          mask = upperMask;
      }

  NonAsciiInMask:

      // Starting from the LSB, the mask holds a 0 for every ASCII byte and a 1 for every
      // non-ASCII byte, so the number of trailing zero bits is the number of leading ASCII bytes.
      Debug.Assert(mask != 0, "Shouldn't be here unless we see non-ASCII data.");
      pBuffer += (uint)BitOperations.TrailingZeroCount(mask);
      goto Done;

  NonAsciiInDword:

      Debug.Assert(!AllBytesInUInt32AreAscii(dword), "Shouldn't be here unless we see non-ASCII data.");
      pBuffer += CountNumberOfLeadingAsciiBytesFromUInt32WithSomeNonAsciiData(dword);
      goto Done;

  ShortInput:

      // Reached only when the original input was shorter than one vector. Vector loads are
      // off the table, so read primitives directly. All of these reads are unaligned.
      Debug.Assert(bufferLength < vectorSize);

      // QWORD drain
      if ((bufferLength & 8) != 0)
      {
          if (Bmi1.X64.IsSupported)
          {
              // 64-bit tzcnt is available: use it to count the leading ASCII bytes directly.
              ulong qword = Unsafe.ReadUnaligned<ulong>(pBuffer);
              if (!AllBytesInUInt64AreAscii(qword))
              {
                  // Keep only the high bit of each byte, count trailing zero bits, and divide
                  // by 8 to turn the bit count into a byte count.
                  qword &= UInt64HighBitsOnlyMask;
                  pBuffer += (nuint)(Bmi1.X64.TrailingZeroCount(qword) / 8);
                  goto Done;
              }
          }
          else
          {
              // No 64-bit tzcnt; perform two 32-bit reads instead.
              dword = Unsafe.ReadUnaligned<uint>(pBuffer);
              uint upperDword = Unsafe.ReadUnaligned<uint>(pBuffer + 4);

              if (!AllBytesInUInt32AreAscii(dword | upperDword))
              {
                  // At least one DWORD is not all-ASCII. If the first one is clean, the
                  // second one is the culprit: step over the first and examine the second.
                  if (AllBytesInUInt32AreAscii(dword))
                  {
                      dword = upperDword;
                      pBuffer += 4;
                  }

                  goto NonAsciiInDword;
              }
          }

          pBuffer += 8; // successfully consumed 8 ASCII bytes
      }

      // DWORD drain
      if ((bufferLength & 4) != 0)
      {
          dword = Unsafe.ReadUnaligned<uint>(pBuffer);
          if (!AllBytesInUInt32AreAscii(dword))
          {
              goto NonAsciiInDword;
          }

          pBuffer += 4; // successfully consumed 4 ASCII bytes
      }

      // WORD drain (the WORD is zero-extended into the DWORD local for ease of manipulation)
      if ((bufferLength & 2) != 0)
      {
          dword = Unsafe.ReadUnaligned<ushort>(pBuffer);
          if (!AllBytesInUInt32AreAscii(dword))
          {
              // Only the 0x0080 bit matters here. Viewed as sbyte, the first byte shifted
              // right by 7 is -1 when that bit is set and 0 otherwise, so adding 1 advances
              // the pointer by 0 (first byte is non-ASCII) or by 1 (second byte is non-ASCII).
              pBuffer += (nuint)((nint)(sbyte)dword >> 7) + 1;
              goto Done;
          }

          pBuffer += 2; // successfully consumed 2 ASCII bytes
      }

      // BYTE drain
      if ((bufferLength & 1) != 0)
      {
          // Viewed as sbyte, an ASCII byte is non-negative.
          if (*(sbyte*)(pBuffer) >= 0)
          {
              pBuffer++; // successfully consumed a single byte
          }
      }

      goto Done;
  }
  /// <summary>
  /// Returns the index in <paramref name="pBuffer"/> where the first non-ASCII char is found.
  /// Returns <paramref name="bufferLength"/> if the buffer is empty or all-ASCII.
  /// </summary>
  /// <param name="pBuffer">Pointer to the first char to inspect.</param>
  /// <param name="bufferLength">Number of chars (not bytes) available at <paramref name="pBuffer"/>.</param>
  /// <returns>The number of leading ASCII chars in the buffer.</returns>
  /// <remarks>An ASCII char is defined as 0x0000 - 0x007F, inclusive.</remarks>
  [MethodImpl(MethodImplOptions.AggressiveInlining)]
  public static unsafe nuint GetIndexOfFirstNonAsciiChar(char* pBuffer, nuint bufferLength /* in chars */)
  {
      // Prefer the SSE2-specific routine over the generic vectorized one whenever the hardware
      // allows it: (a) it can lean on instructions such as pmovmskb which are known to be well
      // optimized, and (b) it avoids downclocking the processor while this method runs.
      if (Sse2.IsSupported)
      {
          return GetIndexOfFirstNonAsciiChar_Sse2(pBuffer, bufferLength);
      }
      else
      {
          return GetIndexOfFirstNonAsciiChar_Default(pBuffer, bufferLength);
      }
  }
  /// <summary>
  /// Portable implementation of <see cref="GetIndexOfFirstNonAsciiChar"/> which uses
  /// <see cref="Vector{T}"/> when it is hardware accelerated and falls back to scalar
  /// 32-bit / 16-bit reads to drain whatever remains.
  /// </summary>
  /// <param name="pBuffer">Pointer to the first char to inspect.</param>
  /// <param name="bufferLength">Number of chars (not bytes) available at <paramref name="pBuffer"/>.</param>
  /// <returns>The number of leading ASCII chars in the buffer.</returns>
  private static unsafe nuint GetIndexOfFirstNonAsciiChar_Default(char* pBuffer, nuint bufferLength /* in chars */)
  {
      // Squirrel away the original buffer reference. This method works by determining the exact
      // char reference where non-ASCII data begins, so we need this base value to perform the
      // final subtraction at the end of the method to get the index into the original buffer.
      char* pOriginalBuffer = pBuffer;

      // The length is expressed in chars; make sure the equivalent byte count is representable.
      Debug.Assert(bufferLength <= nuint.MaxValue / sizeof(char));

      // Before we drain off char-by-char, try a generic vectorized loop.
      // Only run the loop if we have at least two vectors we can pull out.
      if (Vector.IsHardwareAccelerated && bufferLength >= 2 * (uint)Vector<ushort>.Count)
      {
          uint SizeOfVectorInChars = (uint)Vector<ushort>.Count; // JIT will make this a const
          uint SizeOfVectorInBytes = (uint)Vector<byte>.Count; // JIT will make this a const

          Vector<ushort> maxAscii = new Vector<ushort>(0x007F);

          if (Vector.LessThanOrEqualAll(Unsafe.ReadUnaligned<Vector<ushort>>(pBuffer), maxAscii))
          {
              // The first several elements of the input buffer were ASCII. Bump up the pointer to the
              // next aligned boundary, then perform aligned reads from here on out until we find non-ASCII
              // data or we approach the end of the buffer. It's possible we'll reread data; this is ok.
              char* pFinalVectorReadPos = pBuffer + bufferLength - SizeOfVectorInChars;
              pBuffer = (char*)(((nuint)pBuffer + SizeOfVectorInBytes) & ~(nuint)(SizeOfVectorInBytes - 1));

#if DEBUG
              long numCharsRead = pBuffer - pOriginalBuffer;
              Debug.Assert(0 < numCharsRead && numCharsRead <= SizeOfVectorInChars, "We should've made forward progress of at least one char.");
              Debug.Assert((nuint)numCharsRead <= bufferLength, "We shouldn't have read past the end of the input buffer.");
#endif

              Debug.Assert(pBuffer <= pFinalVectorReadPos, "Should be able to read at least one vector.");

              do
              {
                  // The pointer value is a byte address, so alignment must be checked against the
                  // vector size in bytes. Checking against the size in chars would only verify
                  // half-vector alignment and let a misaligned read slip through.
                  Debug.Assert((nuint)pBuffer % SizeOfVectorInBytes == 0, "Vector read should be aligned.");
                  if (Vector.GreaterThanAny(Unsafe.Read<Vector<ushort>>(pBuffer), maxAscii))
                  {
                      break; // found non-ASCII data
                  }

                  pBuffer += SizeOfVectorInChars;
              } while (pBuffer <= pFinalVectorReadPos);

              // Adjust the remaining buffer length (in chars) for the number of elements we just consumed.
              bufferLength -= ((nuint)pBuffer - (nuint)pOriginalBuffer) / sizeof(char);
          }
      }

      // At this point, the buffer length wasn't enough to perform a vectorized search, or we did perform
      // a vectorized search and it stopped (on non-ASCII data or near the end of the buffer). In either
      // case go down a non-vectorized code path to drain any remaining ASCII chars.
      //
      // We're going to perform unaligned reads, so prefer 32-bit reads instead of 64-bit reads.
      // This also allows us to perform more optimized bit twiddling tricks to count the number of ASCII chars.
      uint currentUInt32;

      // Try reading 64 bits at a time in a loop.
      for (; bufferLength >= 4; bufferLength -= 4) // 64 bits = 4 * 16-bit chars
      {
          currentUInt32 = Unsafe.ReadUnaligned<uint>(pBuffer);
          uint nextUInt32 = Unsafe.ReadUnaligned<uint>(pBuffer + 4 / sizeof(char));

          if (!AllCharsInUInt32AreAscii(currentUInt32 | nextUInt32))
          {
              // One of these two values contains non-ASCII chars.
              // Figure out which one it is, then put it in 'current' so that we can drain the ASCII chars.
              if (AllCharsInUInt32AreAscii(currentUInt32))
              {
                  currentUInt32 = nextUInt32;
                  pBuffer += 2;
              }

              goto FoundNonAsciiData;
          }

          pBuffer += 4; // consumed 4 ASCII chars
      }

      // Fewer than 4 chars remain, so the low two bits of bufferLength describe exactly what is left.
      // From this point forward we don't need to keep track of the remaining buffer length.

      // Try reading 32 bits.
      if ((bufferLength & 2) != 0) // 32 bits = 2 * 16-bit chars
      {
          currentUInt32 = Unsafe.ReadUnaligned<uint>(pBuffer);
          if (!AllCharsInUInt32AreAscii(currentUInt32))
          {
              goto FoundNonAsciiData;
          }

          pBuffer += 2;
      }

      // Try reading 16 bits.
      // No need to try an 8-bit read after this since we're working with chars.
      if ((bufferLength & 1) != 0)
      {
          // If the buffer contains non-ASCII data, the comparison below will fail, and
          // we'll end up not incrementing the buffer reference.
          if (*pBuffer <= 0x007F)
          {
              pBuffer++;
          }
      }

  Finish:

      nuint totalNumBytesRead = (nuint)pBuffer - (nuint)pOriginalBuffer;
      Debug.Assert(totalNumBytesRead % sizeof(char) == 0, "Total number of bytes read should be even since we're working with chars.");
      return totalNumBytesRead / sizeof(char); // convert byte count -> char count before returning

  FoundNonAsciiData:

      Debug.Assert(!AllCharsInUInt32AreAscii(currentUInt32), "Shouldn't have reached this point if we have an all-ASCII input.");

      // The DWORD holds two chars and at least one of them is non-ASCII. Only the first char needs
      // to be examined: if it is ASCII, the second one must be the offender, so step over the first.
      if (FirstCharInUInt32IsAscii(currentUInt32))
      {
          pBuffer++;
      }

      goto Finish;
  }
private static unsafe nuint GetIndexOfFirstNonAsciiChar_Sse2(char* pBuffer, nuint bufferLength /* in chars */)
{
    // Purpose: scans the UTF-16 buffer at pBuffer and returns the index (in chars) of the first
    // char whose value is >= 0x0080. If every char is ASCII the return value equals the original
    // bufferLength (0 for an empty buffer). The caller must already have checked Sse2.IsSupported.

    // This method contains logic optimized for both SSE2 and SSE41. Much of the logic in this method
    // will be elided by JIT once we determine which specific ISAs we support.

    // Quick check for empty inputs.

    if (bufferLength == 0)
    {
        return 0;
    }

    // JIT turns the below into constants

    uint SizeOfVector128InBytes = (uint)Unsafe.SizeOf<Vector128<byte>>();
    uint SizeOfVector128InChars = SizeOfVector128InBytes / sizeof(char);

    Debug.Assert(Sse2.IsSupported, "Should've been checked by caller.");
    Debug.Assert(BitConverter.IsLittleEndian, "SSE2 assumes little-endian.");

    Vector128<short> firstVector, secondVector;
    uint currentMask;
    char* pOriginalBuffer = pBuffer; // remembered so the final result can be computed as a pointer difference

    if (bufferLength < SizeOfVector128InChars)
    {
        goto InputBufferLessThanOneVectorInLength; // can't vectorize; drain primitives instead
    }

    // This method is written such that control generally flows top-to-bottom, avoiding
    // jumps as much as possible in the optimistic case of "all ASCII". If we see non-ASCII
    // data, we jump out of the hot paths to targets at the end of the method.

    Vector128<short> asciiMaskForPTEST = Vector128.Create(unchecked((short)0xFF80)); // used for PTEST on supported hardware
    Vector128<ushort> asciiMaskForPMINUW = Vector128.Create((ushort)0x0080); // used for PMINUW on supported hardware
    Vector128<short> asciiMaskForPXOR = Vector128.Create(unchecked((short)0x8000)); // used for PXOR
    Vector128<short> asciiMaskForPCMPGTW = Vector128.Create(unchecked((short)0x807F)); // used for PCMPGTW

    // Guards the chars-to-bytes conversion (bufferLength <<= 1) performed below.
    Debug.Assert(bufferLength <= nuint.MaxValue / sizeof(char));

    // Read the first vector unaligned.

    firstVector = Sse2.LoadVector128((short*)pBuffer); // unaligned load

    if (Sse41.IsSupported)
    {
        // The SSE41-optimized code path works by forcing the 0x0080 bit in each WORD of the vector to be
        // set iff the WORD element has value >= 0x0080 (non-ASCII). Then we'll treat it as a BYTE vector
        // in order to extract the mask.
        currentMask = (uint)Sse2.MoveMask(Sse41.Min(firstVector.AsUInt16(), asciiMaskForPMINUW).AsByte());
    }
    else
    {
        // The SSE2-optimized code path works by forcing each WORD of the vector to be 0xFFFF iff the WORD
        // element has value >= 0x0080 (non-ASCII). Then we'll treat it as a BYTE vector in order to extract
        // the mask.
        currentMask = (uint)Sse2.MoveMask(Sse2.CompareGreaterThan(Sse2.Xor(firstVector, asciiMaskForPXOR), asciiMaskForPCMPGTW).AsByte());
    }

    if (currentMask != 0)
    {
        goto FoundNonAsciiDataInCurrentMask;
    }

    // If we have less than 32 bytes to process, just go straight to the final unaligned
    // read. There's no need to mess with the loop logic in the middle of this method.

    // Adjust the remaining length to account for what we just read.
    // For the remainder of this code path, bufferLength will be in bytes, not chars.

    bufferLength <<= 1; // chars to bytes

    if (bufferLength < 2 * SizeOfVector128InBytes)
    {
        goto IncrementCurrentOffsetBeforeFinalUnalignedVectorRead;
    }

    // Now adjust the read pointer so that future reads are aligned.
    // (Rounds up to the next vector boundary strictly after the current address.)

    pBuffer = (char*)(((nuint)pBuffer + SizeOfVector128InBytes) & ~(nuint)(SizeOfVector128InBytes - 1));

#if DEBUG
    long numCharsRead = pBuffer - pOriginalBuffer;
    Debug.Assert(0 < numCharsRead && numCharsRead <= SizeOfVector128InChars, "We should've made forward progress of at least one char.");
    Debug.Assert((nuint)numCharsRead <= bufferLength, "We shouldn't have read past the end of the input buffer.");
#endif

    // Adjust remaining buffer length.
    // (Equivalent to subtracting the number of bytes skipped by the alignment step above.)

    bufferLength += (nuint)pOriginalBuffer;
    bufferLength -= (nuint)pBuffer;

    // The buffer is now properly aligned.
    // Read 2 vectors at a time if possible.

    if (bufferLength >= 2 * SizeOfVector128InBytes)
    {
        // Last address from which two whole vectors can still be read.
        char* pFinalVectorReadPos = (char*)((nuint)pBuffer + bufferLength - 2 * SizeOfVector128InBytes);

        // After this point, we no longer need to update the bufferLength value.

        do
        {
            firstVector = Sse2.LoadAlignedVector128((short*)pBuffer);
            secondVector = Sse2.LoadAlignedVector128((short*)pBuffer + SizeOfVector128InChars);
            Vector128<short> combinedVector = Sse2.Or(firstVector, secondVector);

            if (Sse41.IsSupported)
            {
                // If a non-ASCII bit is set in any WORD of the combined vector, we have seen non-ASCII data.
                // Jump to the non-ASCII handler to figure out which particular vector contained non-ASCII data.
                if (!Sse41.TestZ(combinedVector, asciiMaskForPTEST))
                {
                    goto FoundNonAsciiDataInFirstOrSecondVector;
                }
            }
            else
            {
                // See comment earlier in the method for an explanation of how the below logic works.
                if (Sse2.MoveMask(Sse2.CompareGreaterThan(Sse2.Xor(combinedVector, asciiMaskForPXOR), asciiMaskForPCMPGTW).AsByte()) != 0)
                {
                    goto FoundNonAsciiDataInFirstOrSecondVector;
                }
            }

            pBuffer += 2 * SizeOfVector128InChars;
        } while (pBuffer <= pFinalVectorReadPos);
    }

    // We have somewhere between 0 and (2 * vector length) - 1 bytes remaining to read from.
    // Since the above loop doesn't update bufferLength, we can't rely on its absolute value.
    // But we _can_ rely on it to tell us how much remaining data must be drained by looking
    // at what bits of it are set. This works because had we updated it within the loop above,
    // we would've been adding 2 * SizeOfVector128 on each iteration, but we only care about
    // bits which are less significant than those that the addition would've acted on.

    // If there is fewer than one vector length remaining, skip the next aligned read.
    // Remember, at this point bufferLength is measured in bytes, not chars.

    if ((bufferLength & SizeOfVector128InBytes) == 0)
    {
        goto DoFinalUnalignedVectorRead;
    }

    // At least one full vector's worth of data remains, so we can safely read it.
    // Remember, at this point pBuffer is still aligned.

    firstVector = Sse2.LoadAlignedVector128((short*)pBuffer);

    if (Sse41.IsSupported)
    {
        // If a non-ASCII bit is set in any WORD of this vector, we have seen non-ASCII data.
        // Jump to the handler that computes the exact position within firstVector.
        if (!Sse41.TestZ(firstVector, asciiMaskForPTEST))
        {
            goto FoundNonAsciiDataInFirstVector;
        }
    }
    else
    {
        // See comment earlier in the method for an explanation of how the below logic works.
        currentMask = (uint)Sse2.MoveMask(Sse2.CompareGreaterThan(Sse2.Xor(firstVector, asciiMaskForPXOR), asciiMaskForPCMPGTW).AsByte());
        if (currentMask != 0)
        {
            goto FoundNonAsciiDataInCurrentMask;
        }
    }

IncrementCurrentOffsetBeforeFinalUnalignedVectorRead:

    // Step past the vector that was just verified as all-ASCII.
    pBuffer += SizeOfVector128InChars;

DoFinalUnalignedVectorRead:

    // Only the low bits of bufferLength (the byte count modulo one vector) matter here.
    if (((byte)bufferLength & (SizeOfVector128InBytes - 1)) != 0)
    {
        // Perform an unaligned read of the last vector.
        // We need to adjust the pointer because we're re-reading data.
        // (The pointer is moved so that the vector ends exactly at the end of the input.)

        pBuffer = (char*)((byte*)pBuffer + (bufferLength & (SizeOfVector128InBytes - 1)) - SizeOfVector128InBytes);
        firstVector = Sse2.LoadVector128((short*)pBuffer); // unaligned load

        if (Sse41.IsSupported)
        {
            // If a non-ASCII bit is set in any WORD of this vector, we have seen non-ASCII data.
            // Jump to the handler that computes the exact position within firstVector.
            if (!Sse41.TestZ(firstVector, asciiMaskForPTEST))
            {
                goto FoundNonAsciiDataInFirstVector;
            }
        }
        else
        {
            // See comment earlier in the method for an explanation of how the below logic works.
            currentMask = (uint)Sse2.MoveMask(Sse2.CompareGreaterThan(Sse2.Xor(firstVector, asciiMaskForPXOR), asciiMaskForPCMPGTW).AsByte());
            if (currentMask != 0)
            {
                goto FoundNonAsciiDataInCurrentMask;
            }
        }

        pBuffer += SizeOfVector128InChars;
    }

Finish:

    // Every path ends here; pBuffer points at the first non-ASCII char or at the end of the input.
    Debug.Assert(((nuint)pBuffer - (nuint)pOriginalBuffer) % 2 == 0, "Shouldn't have incremented any pointer by an odd byte count.");
    return ((nuint)pBuffer - (nuint)pOriginalBuffer) / sizeof(char); // and we're done! (remember to adjust for char count)

FoundNonAsciiDataInFirstOrSecondVector:

    // We don't know if the first or the second vector contains non-ASCII data. Check the first
    // vector, and if that's all-ASCII then the second vector must be the culprit. Either way
    // we'll make sure the first vector local is the one that contains the non-ASCII data.

    // See comment earlier in the method for an explanation of how the below logic works.
    if (Sse41.IsSupported)
    {
        if (!Sse41.TestZ(firstVector, asciiMaskForPTEST))
        {
            goto FoundNonAsciiDataInFirstVector;
        }
    }
    else
    {
        currentMask = (uint)Sse2.MoveMask(Sse2.CompareGreaterThan(Sse2.Xor(firstVector, asciiMaskForPXOR), asciiMaskForPCMPGTW).AsByte());
        if (currentMask != 0)
        {
            goto FoundNonAsciiDataInCurrentMask;
        }
    }

    // Wasn't the first vector; must be the second.

    pBuffer += SizeOfVector128InChars;
    firstVector = secondVector;

FoundNonAsciiDataInFirstVector:

    // firstVector is known to contain non-ASCII data; build the per-byte mask that locates it.
    // See comment earlier in the method for an explanation of how the below logic works.
    if (Sse41.IsSupported)
    {
        currentMask = (uint)Sse2.MoveMask(Sse41.Min(firstVector.AsUInt16(), asciiMaskForPMINUW).AsByte());
    }
    else
    {
        currentMask = (uint)Sse2.MoveMask(Sse2.CompareGreaterThan(Sse2.Xor(firstVector, asciiMaskForPXOR), asciiMaskForPCMPGTW).AsByte());
    }

FoundNonAsciiDataInCurrentMask:

    // The mask contains - from the LSB - a 0 for each ASCII byte we saw, and a 1 for each non-ASCII byte.
    // Tzcnt is the correct operation to count the number of zero bits quickly. If this instruction isn't
    // available, we'll fall back to a normal loop. (Even though the original vector used WORD elements,
    // masks work on BYTE elements, and we account for this in the final fixup.)

    Debug.Assert(currentMask != 0, "Shouldn't be here unless we see non-ASCII data.");
    // The trailing-zero count is a BYTE offset, hence the byte* arithmetic.
    pBuffer = (char*)((byte*)pBuffer + (uint)BitOperations.TrailingZeroCount(currentMask));

    goto Finish;

FoundNonAsciiDataInCurrentDWord:

    // currentDWord is declared here but only assigned on the short-input path below, which is
    // the only code that jumps to this label.
    uint currentDWord;
    Debug.Assert(!AllCharsInUInt32AreAscii(currentDWord), "Shouldn't be here unless we see non-ASCII data.");

    if (FirstCharInUInt32IsAscii(currentDWord))
    {
        pBuffer++; // skip past the ASCII char
    }

    goto Finish;

InputBufferLessThanOneVectorInLength:

    // These code paths get hit if the original input length was less than one vector in size.
    // We can't perform vectorized reads at this point, so we'll fall back to reading primitives
    // directly. Note that all of these reads are unaligned.

    // Reminder: If this code path is hit, bufferLength is still a char count, not a byte count.
    // We skipped the code path that multiplied the count by sizeof(char).

    Debug.Assert(bufferLength < SizeOfVector128InChars);

    // QWORD drain

    if ((bufferLength & 4) != 0)
    {
        if (Bmi1.X64.IsSupported)
        {
            // If we can use 64-bit tzcnt to count the number of leading ASCII chars, prefer it.

            ulong candidateUInt64 = Unsafe.ReadUnaligned<ulong>(pBuffer);
            if (!AllCharsInUInt64AreAscii(candidateUInt64))
            {
                // Clear the low 7 bits (the ASCII bits) of each char, then tzcnt.
                // Remember the / 8 at the end to convert bit count to byte count,
                // then the & ~1 at the end to treat a match in the high byte of
                // any char the same as a match in the low byte of that same char.

                candidateUInt64 &= 0xFF80FF80_FF80FF80ul;
                pBuffer = (char*)((byte*)pBuffer + ((nuint)(Bmi1.X64.TrailingZeroCount(candidateUInt64) / 8) & ~(nuint)1));
                goto Finish;
            }
        }
        else
        {
            // If we can't use 64-bit tzcnt, no worries. We'll just do 2x 32-bit reads instead.

            currentDWord = Unsafe.ReadUnaligned<uint>(pBuffer);
            uint nextDWord = Unsafe.ReadUnaligned<uint>(pBuffer + 4 / sizeof(char));

            if (!AllCharsInUInt32AreAscii(currentDWord | nextDWord))
            {
                // At least one of the values wasn't all-ASCII.
                // We need to figure out which one it was and stick it in the currentDWord local.

                if (AllCharsInUInt32AreAscii(currentDWord))
                {
                    currentDWord = nextDWord; // this one is the culprit
                    pBuffer += 4 / sizeof(char);
                }

                goto FoundNonAsciiDataInCurrentDWord;
            }
        }

        pBuffer += 4; // successfully consumed 4 ASCII chars
    }

    // DWORD drain

    if ((bufferLength & 2) != 0)
    {
        currentDWord = Unsafe.ReadUnaligned<uint>(pBuffer);

        if (!AllCharsInUInt32AreAscii(currentDWord))
        {
            goto FoundNonAsciiDataInCurrentDWord;
        }

        pBuffer += 2; // successfully consumed 2 ASCII chars
    }

    // WORD drain
    // This is the final drain; there's no need for a BYTE drain since our elemental type is 16-bit char.
    // If the char is non-ASCII, pBuffer is left pointing at it so Finish reports its index.

    if ((bufferLength & 1) != 0)
    {
        if (*pBuffer <= 0x007F)
        {
            pBuffer++; // successfully consumed a single char
        }
    }

    goto Finish;
}
  769. /// <summary>
  770. /// Given a QWORD which represents a buffer of 4 ASCII chars in machine-endian order,
  771. /// narrows each WORD to a BYTE, then writes the 4-byte result to the output buffer
  772. /// also in machine-endian order.
  773. /// </summary>
  774. [MethodImpl(MethodImplOptions.AggressiveInlining)]
  775. private static void NarrowFourUtf16CharsToAsciiAndWriteToBuffer(ref byte outputBuffer, ulong value)
  776. {
  777. Debug.Assert(AllCharsInUInt64AreAscii(value));
  778. if (Bmi2.X64.IsSupported)
  779. {
  780. // BMI2 will work regardless of the processor's endianness.
  781. Unsafe.WriteUnaligned(ref outputBuffer, (uint)Bmi2.X64.ParallelBitExtract(value, 0x00FF00FF_00FF00FFul));
  782. }
  783. else
  784. {
  785. if (BitConverter.IsLittleEndian)
  786. {
  787. outputBuffer = (byte)value;
  788. value >>= 16;
  789. Unsafe.Add(ref outputBuffer, 1) = (byte)value;
  790. value >>= 16;
  791. Unsafe.Add(ref outputBuffer, 2) = (byte)value;
  792. value >>= 16;
  793. Unsafe.Add(ref outputBuffer, 3) = (byte)value;
  794. }
  795. else
  796. {
  797. Unsafe.Add(ref outputBuffer, 3) = (byte)value;
  798. value >>= 16;
  799. Unsafe.Add(ref outputBuffer, 2) = (byte)value;
  800. value >>= 16;
  801. Unsafe.Add(ref outputBuffer, 1) = (byte)value;
  802. value >>= 16;
  803. outputBuffer = (byte)value;
  804. }
  805. }
  806. }
  807. /// <summary>
  808. /// Given a DWORD which represents a buffer of 2 ASCII chars in machine-endian order,
  809. /// narrows each WORD to a BYTE, then writes the 2-byte result to the output buffer also in
  810. /// machine-endian order.
  811. /// </summary>
  812. [MethodImpl(MethodImplOptions.AggressiveInlining)]
  813. private static void NarrowTwoUtf16CharsToAsciiAndWriteToBuffer(ref byte outputBuffer, uint value)
  814. {
  815. Debug.Assert(AllCharsInUInt32AreAscii(value));
  816. if (BitConverter.IsLittleEndian)
  817. {
  818. outputBuffer = (byte)value;
  819. Unsafe.Add(ref outputBuffer, 1) = (byte)(value >> 16);
  820. }
  821. else
  822. {
  823. Unsafe.Add(ref outputBuffer, 1) = (byte)value;
  824. outputBuffer = (byte)(value >> 16);
  825. }
  826. }
/// <summary>
/// Copies as many ASCII characters (U+0000..U+007F) as possible from <paramref name="pUtf16Buffer"/>
/// to <paramref name="pAsciiBuffer"/>, stopping when the first non-ASCII character is encountered
/// or once <paramref name="elementCount"/> elements have been converted. Returns the total number
/// of elements that were able to be converted.
/// </summary>
/// <param name="pUtf16Buffer">Source buffer of UTF-16 code units.</param>
/// <param name="pAsciiBuffer">Destination buffer; receives one byte per converted char.</param>
/// <param name="elementCount">Maximum number of elements to convert.</param>
/// <returns>The number of leading elements that were narrowed and written.</returns>
public static unsafe nuint NarrowUtf16ToAscii(char* pUtf16Buffer, byte* pAsciiBuffer, nuint elementCount)
{
    nuint currentOffset = 0;

    // Note on naming: "High" holds the DWORD read first (at the lower address) and "Low" the
    // DWORD read immediately after it; the names describe read order, not numeric significance.
    uint utf16Data32BitsHigh = 0, utf16Data32BitsLow = 0;
    ulong utf16Data64Bits = 0;

    // If SSE2 is supported, use those specific intrinsics instead of the generic vectorized
    // code below. This has two benefits: (a) we can take advantage of specific instructions like
    // pmovmskb, ptest, vpminuw which we know are optimized, and (b) we can avoid downclocking the
    // processor while this method is running.

    if (Sse2.IsSupported)
    {
        Debug.Assert(BitConverter.IsLittleEndian, "Assume little endian if SSE2 is supported.");

        if (elementCount >= 2 * (uint)Unsafe.SizeOf<Vector128<byte>>())
        {
            // Since there's overhead to setting up the vectorized code path, we only want to
            // call into it after a quick probe to ensure the next immediate characters really are ASCII.
            // If we see non-ASCII data, we'll jump immediately to the draining logic at the end of the method.

            if (IntPtr.Size >= 8)
            {
                utf16Data64Bits = Unsafe.ReadUnaligned<ulong>(pUtf16Buffer);
                if (!AllCharsInUInt64AreAscii(utf16Data64Bits))
                {
                    goto FoundNonAsciiDataIn64BitRead;
                }
            }
            else
            {
                utf16Data32BitsHigh = Unsafe.ReadUnaligned<uint>(pUtf16Buffer);
                utf16Data32BitsLow = Unsafe.ReadUnaligned<uint>(pUtf16Buffer + 4 / sizeof(char));
                if (!AllCharsInUInt32AreAscii(utf16Data32BitsHigh | utf16Data32BitsLow))
                {
                    goto FoundNonAsciiDataIn64BitRead;
                }
            }

            // The helper may stop short of elementCount; whatever is left is drained below.
            currentOffset = NarrowUtf16ToAscii_Sse2(pUtf16Buffer, pAsciiBuffer, elementCount);
        }
    }
    else if (Vector.IsHardwareAccelerated)
    {
        uint SizeOfVector = (uint)Unsafe.SizeOf<Vector<byte>>(); // JIT will make this a const

        // Only bother vectorizing if we have enough data to do so.
        if (elementCount >= 2 * SizeOfVector)
        {
            // Since there's overhead to setting up the vectorized code path, we only want to
            // call into it after a quick probe to ensure the next immediate characters really are ASCII.
            // If we see non-ASCII data, we'll jump immediately to the draining logic at the end of the method.

            if (IntPtr.Size >= 8)
            {
                utf16Data64Bits = Unsafe.ReadUnaligned<ulong>(pUtf16Buffer);
                if (!AllCharsInUInt64AreAscii(utf16Data64Bits))
                {
                    goto FoundNonAsciiDataIn64BitRead;
                }
            }
            else
            {
                utf16Data32BitsHigh = Unsafe.ReadUnaligned<uint>(pUtf16Buffer);
                utf16Data32BitsLow = Unsafe.ReadUnaligned<uint>(pUtf16Buffer + 4 / sizeof(char));
                if (!AllCharsInUInt32AreAscii(utf16Data32BitsHigh | utf16Data32BitsLow))
                {
                    goto FoundNonAsciiDataIn64BitRead;
                }
            }

            Vector<ushort> maxAscii = new Vector<ushort>(0x007F);

            nuint finalOffsetWhereCanLoop = elementCount - 2 * SizeOfVector;
            do
            {
                // Two Vector<ushort> reads are narrowed into a single Vector<byte> write, and the
                // offset then advances by SizeOfVector elements.
                Vector<ushort> utf16VectorHigh = Unsafe.ReadUnaligned<Vector<ushort>>(pUtf16Buffer + currentOffset);
                Vector<ushort> utf16VectorLow = Unsafe.ReadUnaligned<Vector<ushort>>(pUtf16Buffer + currentOffset + Vector<ushort>.Count);

                if (Vector.GreaterThanAny(Vector.BitwiseOr(utf16VectorHigh, utf16VectorLow), maxAscii))
                {
                    break; // found non-ASCII data
                }

                // TODO: Is the below logic also valid for big-endian platforms?
                Vector<byte> asciiVector = Vector.Narrow(utf16VectorHigh, utf16VectorLow);
                Unsafe.WriteUnaligned<Vector<byte>>(pAsciiBuffer + currentOffset, asciiVector);

                currentOffset += SizeOfVector;
            } while (currentOffset <= finalOffsetWhereCanLoop);
        }
    }

    // Scalar drain: handles short inputs and whatever the vectorized paths above left over.

    Debug.Assert(currentOffset <= elementCount);
    nuint remainingElementCount = elementCount - currentOffset;

    // Try to narrow 64 bits -> 32 bits at a time.
    // We needn't update remainingElementCount after this point.
    // (The 2- and 1-element steps below only look at its two low bits, which the 4-at-a-time
    // loop would not change.)

    if (remainingElementCount >= 4)
    {
        nuint finalOffsetWhereCanLoop = currentOffset + remainingElementCount - 4;
        do
        {
            if (IntPtr.Size >= 8)
            {
                // Only perform QWORD reads on a 64-bit platform.
                utf16Data64Bits = Unsafe.ReadUnaligned<ulong>(pUtf16Buffer + currentOffset);
                if (!AllCharsInUInt64AreAscii(utf16Data64Bits))
                {
                    goto FoundNonAsciiDataIn64BitRead;
                }

                NarrowFourUtf16CharsToAsciiAndWriteToBuffer(ref pAsciiBuffer[currentOffset], utf16Data64Bits);
            }
            else
            {
                utf16Data32BitsHigh = Unsafe.ReadUnaligned<uint>(pUtf16Buffer + currentOffset);
                utf16Data32BitsLow = Unsafe.ReadUnaligned<uint>(pUtf16Buffer + currentOffset + 4 / sizeof(char));
                if (!AllCharsInUInt32AreAscii(utf16Data32BitsHigh | utf16Data32BitsLow))
                {
                    goto FoundNonAsciiDataIn64BitRead;
                }

                NarrowTwoUtf16CharsToAsciiAndWriteToBuffer(ref pAsciiBuffer[currentOffset], utf16Data32BitsHigh);
                NarrowTwoUtf16CharsToAsciiAndWriteToBuffer(ref pAsciiBuffer[currentOffset + 2], utf16Data32BitsLow);
            }

            currentOffset += 4;
        } while (currentOffset <= finalOffsetWhereCanLoop);
    }

    // Try to narrow 32 bits -> 16 bits.

    if (((uint)remainingElementCount & 2) != 0)
    {
        utf16Data32BitsHigh = Unsafe.ReadUnaligned<uint>(pUtf16Buffer + currentOffset);
        if (!AllCharsInUInt32AreAscii(utf16Data32BitsHigh))
        {
            goto FoundNonAsciiDataInHigh32Bits;
        }

        NarrowTwoUtf16CharsToAsciiAndWriteToBuffer(ref pAsciiBuffer[currentOffset], utf16Data32BitsHigh);
        currentOffset += 2;
    }

    // Try to narrow 16 bits -> 8 bits.

    if (((uint)remainingElementCount & 1) != 0)
    {
        utf16Data32BitsHigh = pUtf16Buffer[currentOffset];
        if (utf16Data32BitsHigh <= 0x007Fu)
        {
            pAsciiBuffer[currentOffset] = (byte)utf16Data32BitsHigh;
            currentOffset++;
        }
    }

Finish:

    return currentOffset;

FoundNonAsciiDataIn64BitRead:

    // The last 4-char read contained non-ASCII data. Narrow the leading all-ASCII DWORD (if any),
    // then fall through with the offending DWORD in utf16Data32BitsHigh.

    if (IntPtr.Size >= 8)
    {
        // Try checking the first 32 bits of the buffer for non-ASCII data.
        // Regardless, we'll move the non-ASCII data into the utf16Data32BitsHigh local.

        if (BitConverter.IsLittleEndian)
        {
            utf16Data32BitsHigh = (uint)utf16Data64Bits;
        }
        else
        {
            utf16Data32BitsHigh = (uint)(utf16Data64Bits >> 32);
        }

        if (AllCharsInUInt32AreAscii(utf16Data32BitsHigh))
        {
            NarrowTwoUtf16CharsToAsciiAndWriteToBuffer(ref pAsciiBuffer[currentOffset], utf16Data32BitsHigh);

            // The first two chars were fine, so the other half of the QWORD is the culprit.
            if (BitConverter.IsLittleEndian)
            {
                utf16Data32BitsHigh = (uint)(utf16Data64Bits >> 32);
            }
            else
            {
                utf16Data32BitsHigh = (uint)utf16Data64Bits;
            }

            currentOffset += 2;
        }
    }
    else
    {
        // Need to determine if the high or the low 32-bit value contained non-ASCII data.
        // Regardless, we'll move the non-ASCII data into the utf16Data32BitsHigh local.

        if (AllCharsInUInt32AreAscii(utf16Data32BitsHigh))
        {
            NarrowTwoUtf16CharsToAsciiAndWriteToBuffer(ref pAsciiBuffer[currentOffset], utf16Data32BitsHigh);
            utf16Data32BitsHigh = utf16Data32BitsLow;
            currentOffset += 2;
        }
    }

FoundNonAsciiDataInHigh32Bits:

    Debug.Assert(!AllCharsInUInt32AreAscii(utf16Data32BitsHigh), "Shouldn't have reached this point if we have an all-ASCII input.");

    // There's at most one char that needs to be drained.

    if (FirstCharInUInt32IsAscii(utf16Data32BitsHigh))
    {
        if (!BitConverter.IsLittleEndian)
        {
            utf16Data32BitsHigh >>= 16; // move high char down to low char
        }

        pAsciiBuffer[currentOffset] = (byte)utf16Data32BitsHigh;
        currentOffset++;
    }

    goto Finish;
}
/// <summary>
/// SSE2 / SSE4.1 worker for narrowing UTF-16 to ASCII. Converts leading ASCII chars from
/// <paramref name="pUtf16Buffer"/> into bytes at <paramref name="pAsciiBuffer"/> using
/// vector-aligned destination writes, stopping at the first vector-sized chunk that contains
/// non-ASCII data.
/// </summary>
/// <param name="pUtf16Buffer">Source buffer of UTF-16 code units.</param>
/// <param name="pAsciiBuffer">Destination buffer; need not be aligned on entry.</param>
/// <param name="elementCount">Element count; must be at least two vector widths (asserted).</param>
/// <returns>
/// The number of elements the caller should treat as converted. This can be less than
/// <paramref name="elementCount"/> even when the remaining data is ASCII; the caller is
/// expected to drain the rest. Returns 0 if the first 8 chars contain non-ASCII data.
/// </returns>
private static unsafe nuint NarrowUtf16ToAscii_Sse2(char* pUtf16Buffer, byte* pAsciiBuffer, nuint elementCount)
{
    // This method contains logic optimized for both SSE2 and SSE41. Much of the logic in this method
    // will be elided by JIT once we determine which specific ISAs we support.

    // JIT turns the below into constants

    uint SizeOfVector128 = (uint)Unsafe.SizeOf<Vector128<byte>>();
    nuint MaskOfAllBitsInVector128 = (nuint)(SizeOfVector128 - 1);

    // This method is written such that control generally flows top-to-bottom, avoiding
    // jumps as much as possible in the optimistic case of "all ASCII". If we see non-ASCII
    // data, we jump out of the hot paths to targets at the end of the method.

    Debug.Assert(Sse2.IsSupported);
    Debug.Assert(BitConverter.IsLittleEndian);
    Debug.Assert(elementCount >= 2 * SizeOfVector128);

    Vector128<short> asciiMaskForPTEST = Vector128.Create(unchecked((short)0xFF80)); // used for PTEST on supported hardware
    Vector128<short> asciiMaskForPXOR = Vector128.Create(unchecked((short)0x8000)); // used for PXOR
    Vector128<short> asciiMaskForPCMPGTW = Vector128.Create(unchecked((short)0x807F)); // used for PCMPGTW

    // First, perform an unaligned read of the first part of the input buffer.

    Vector128<short> utf16VectorFirst = Sse2.LoadVector128((short*)pUtf16Buffer); // unaligned load

    // If there's non-ASCII data in the first 8 elements of the vector, there's nothing we can do.
    // See comments in GetIndexOfFirstNonAsciiChar_Sse2 for information about how this works.

    if (Sse41.IsSupported)
    {
        if (!Sse41.TestZ(utf16VectorFirst, asciiMaskForPTEST))
        {
            return 0;
        }
    }
    else
    {
        if (Sse2.MoveMask(Sse2.CompareGreaterThan(Sse2.Xor(utf16VectorFirst, asciiMaskForPXOR), asciiMaskForPCMPGTW).AsByte()) != 0)
        {
            return 0;
        }
    }

    // Turn the 8 ASCII chars we just read into 8 ASCII bytes, then copy it to the destination.

    Vector128<byte> asciiVector = Sse2.PackUnsignedSaturate(utf16VectorFirst, utf16VectorFirst);
    Sse2.StoreScalar((ulong*)pAsciiBuffer, asciiVector.AsUInt64()); // ulong* calculated here is UNALIGNED

    nuint currentOffsetInElements = SizeOfVector128 / 2; // we processed 8 elements so far

    // We're going to get the best performance when we have aligned writes, so we'll take the
    // hit of potentially unaligned reads in order to hit this sweet spot.

    // pAsciiBuffer points to the start of the destination buffer, immediately before where we wrote
    // the 8 bytes previously. If the 0x08 bit is set at the pinned address, then the 8 bytes we wrote
    // previously mean that the 0x08 bit is *not* set at address &pAsciiBuffer[SizeOfVector128 / 2]. In
    // that case we can immediately back up to the previous aligned boundary and start the main loop.
    // If the 0x08 bit is *not* set at the pinned address, then it means the 0x08 bit *is* set at
    // address &pAsciiBuffer[SizeOfVector128 / 2], and we should perform one more 8-byte write to bump
    // just past the next aligned boundary address.

    if (((uint)pAsciiBuffer & (SizeOfVector128 / 2)) == 0)
    {
        // We need to perform one more partial vector write before we can get the alignment we want.

        utf16VectorFirst = Sse2.LoadVector128((short*)pUtf16Buffer + currentOffsetInElements); // unaligned load

        // See comments earlier in this method for information about how this works.
        // On non-ASCII data we bail out reporting only the first 8 elements as converted.
        if (Sse41.IsSupported)
        {
            if (!Sse41.TestZ(utf16VectorFirst, asciiMaskForPTEST))
            {
                goto Finish;
            }
        }
        else
        {
            if (Sse2.MoveMask(Sse2.CompareGreaterThan(Sse2.Xor(utf16VectorFirst, asciiMaskForPXOR), asciiMaskForPCMPGTW).AsByte()) != 0)
            {
                goto Finish;
            }
        }

        // Turn the 8 ASCII chars we just read into 8 ASCII bytes, then copy it to the destination.
        asciiVector = Sse2.PackUnsignedSaturate(utf16VectorFirst, utf16VectorFirst);
        Sse2.StoreScalar((ulong*)(pAsciiBuffer + currentOffsetInElements), asciiVector.AsUInt64()); // ulong* calculated here is UNALIGNED
    }

    // Calculate how many elements we wrote in order to get pAsciiBuffer to its next alignment
    // point, then use that as the base offset going forward.
    // (Some already-written bytes past this offset may be rewritten by the loop below.)

    currentOffsetInElements = SizeOfVector128 - ((nuint)pAsciiBuffer & MaskOfAllBitsInVector128);
    Debug.Assert(0 < currentOffsetInElements && currentOffsetInElements <= SizeOfVector128, "We wrote at least 1 byte but no more than a whole vector.");

    Debug.Assert(currentOffsetInElements <= elementCount, "Shouldn't have overrun the destination buffer.");
    Debug.Assert(elementCount - currentOffsetInElements >= SizeOfVector128, "We should be able to run at least one whole vector.");

    // Largest offset at which a full vector of output (and its source chars) still fits.
    nuint finalOffsetWhereCanRunLoop = elementCount - SizeOfVector128;
    do
    {
        // In a loop, perform two unaligned reads, narrow to a single vector, then aligned write one vector.

        utf16VectorFirst = Sse2.LoadVector128((short*)pUtf16Buffer + currentOffsetInElements); // unaligned load
        Vector128<short> utf16VectorSecond = Sse2.LoadVector128((short*)pUtf16Buffer + currentOffsetInElements + SizeOfVector128 / sizeof(short)); // unaligned load
        Vector128<short> combinedVector = Sse2.Or(utf16VectorFirst, utf16VectorSecond);

        // See comments in GetIndexOfFirstNonAsciiChar_Sse2 for information about how this works.
        if (Sse41.IsSupported)
        {
            if (!Sse41.TestZ(combinedVector, asciiMaskForPTEST))
            {
                goto FoundNonAsciiDataInLoop;
            }
        }
        else
        {
            if (Sse2.MoveMask(Sse2.CompareGreaterThan(Sse2.Xor(combinedVector, asciiMaskForPXOR), asciiMaskForPCMPGTW).AsByte()) != 0)
            {
                goto FoundNonAsciiDataInLoop;
            }
        }

        // Build up the ASCII vector and perform the store.

        asciiVector = Sse2.PackUnsignedSaturate(utf16VectorFirst, utf16VectorSecond);

        Debug.Assert(((nuint)pAsciiBuffer + currentOffsetInElements) % SizeOfVector128 == 0, "Write should be aligned.");
        Sse2.StoreAligned(pAsciiBuffer + currentOffsetInElements, asciiVector); // aligned

        currentOffsetInElements += SizeOfVector128;
    } while (currentOffsetInElements <= finalOffsetWhereCanRunLoop);

Finish:

    // There might be some ASCII data left over. That's fine - we'll let our caller handle the final drain.
    return currentOffsetInElements;

FoundNonAsciiDataInLoop:

    // One of the two vectors read in the loop contains non-ASCII data.
    // Can we at least narrow the high vector?
    // See comments in GetIndexOfFirstNonAsciiChar_Sse2 for information about how this works.
    if (Sse41.IsSupported)
    {
        if (!Sse41.TestZ(utf16VectorFirst, asciiMaskForPTEST))
        {
            goto Finish; // found non-ASCII data
        }
    }
    else
    {
        if (Sse2.MoveMask(Sse2.CompareGreaterThan(Sse2.Xor(utf16VectorFirst, asciiMaskForPXOR), asciiMaskForPCMPGTW).AsByte()) != 0)
        {
            goto Finish; // found non-ASCII data
        }
    }

    // First part was all ASCII, narrow and aligned write. Note we're only filling in the low half of the vector.
    asciiVector = Sse2.PackUnsignedSaturate(utf16VectorFirst, utf16VectorFirst);

    Debug.Assert(((nuint)pAsciiBuffer + currentOffsetInElements) % sizeof(ulong) == 0, "Destination should be ulong-aligned.");

    Sse2.StoreScalar((ulong*)(pAsciiBuffer + currentOffsetInElements), asciiVector.AsUInt64()); // ulong* calculated here is aligned
    currentOffsetInElements += SizeOfVector128 / 2;

    goto Finish;
}
  /// <summary>
  /// Copies as many ASCII bytes (00..7F) as possible from <paramref name="pAsciiBuffer"/>
  /// to <paramref name="pUtf16Buffer"/>, stopping when the first non-ASCII byte is encountered
  /// or once <paramref name="elementCount"/> elements have been converted. Returns the total number
  /// of elements that were able to be converted.
  /// </summary>
  /// <param name="pAsciiBuffer">Pointer to the first source byte. No bounds checking is performed;
  /// up to <paramref name="elementCount"/> bytes are read from it.</param>
  /// <param name="pUtf16Buffer">Pointer to the first destination char. No bounds checking is performed;
  /// up to <paramref name="elementCount"/> chars are written to it.</param>
  /// <param name="elementCount">Maximum number of elements to convert.</param>
  /// <returns>
  /// The number of leading elements that were widened; this equals the index of the first
  /// non-ASCII byte, or <paramref name="elementCount"/> if every byte was ASCII.
  /// </returns>
  public static unsafe nuint WidenAsciiToUtf16(byte* pAsciiBuffer, char* pUtf16Buffer, nuint elementCount)
  {
      nuint currentOffset = 0;

      // If SSE2 is supported, use those specific intrinsics instead of the generic vectorized
      // code below. This has two benefits: (a) we can take advantage of specific instructions like
      // pmovmskb which we know are optimized, and (b) we can avoid downclocking the processor while
      // this method is running.

      if (Sse2.IsSupported)
      {
          // The SSE2 helper requires at least two full vectors of input; shorter inputs
          // fall straight through to the scalar paths below.
          if (elementCount >= 2 * (uint)Unsafe.SizeOf<Vector128<byte>>())
          {
              currentOffset = WidenAsciiToUtf16_Sse2(pAsciiBuffer, pUtf16Buffer, elementCount);
          }
      }
      else if (Vector.IsHardwareAccelerated)
      {
          uint SizeOfVector = (uint)Unsafe.SizeOf<Vector<byte>>(); // JIT will make this a const

          // Only bother vectorizing if we have enough data to do so.
          if (elementCount >= SizeOfVector)
          {
              // Note use of SBYTE instead of BYTE below; we're using the two's-complement
              // representation of negative integers to act as a surrogate for "is ASCII?".

              nuint finalOffsetWhereCanLoop = elementCount - SizeOfVector;
              do
              {
                  Vector<sbyte> asciiVector = Unsafe.ReadUnaligned<Vector<sbyte>>(pAsciiBuffer + currentOffset);
                  if (Vector.LessThanAny(asciiVector, Vector<sbyte>.Zero))
                  {
                      break; // found non-ASCII data
                  }

                  Vector.Widen(Vector.AsVectorByte(asciiVector), out Vector<ushort> utf16LowVector, out Vector<ushort> utf16HighVector);

                  // TODO: Is the below logic also valid for big-endian platforms?
                  Unsafe.WriteUnaligned<Vector<ushort>>(pUtf16Buffer + currentOffset, utf16LowVector);
                  Unsafe.WriteUnaligned<Vector<ushort>>(pUtf16Buffer + currentOffset + Vector<ushort>.Count, utf16HighVector);

                  currentOffset += SizeOfVector;
              } while (currentOffset <= finalOffsetWhereCanLoop);
          }
      }

      Debug.Assert(currentOffset <= elementCount);
      nuint remainingElementCount = elementCount - currentOffset;

      // Try to widen 32 bits -> 64 bits at a time.
      // We needn't update remainingElementCount after this point: the loop below only ever
      // consumes multiples of 4, so the low two bits of the count stay valid for the
      // 2-element and 1-element tails.

      uint asciiData;

      if (remainingElementCount >= 4)
      {
          nuint finalOffsetWhereCanLoop = currentOffset + remainingElementCount - 4;
          do
          {
              asciiData = Unsafe.ReadUnaligned<uint>(pAsciiBuffer + currentOffset);
              if (!AllBytesInUInt32AreAscii(asciiData))
              {
                  goto FoundNonAsciiData;
              }

              WidenFourAsciiBytesToUtf16AndWriteToBuffer(ref pUtf16Buffer[currentOffset], asciiData);
              currentOffset += 4;
          } while (currentOffset <= finalOffsetWhereCanLoop);
      }

      // Try to widen 16 bits -> 32 bits.

      if (((uint)remainingElementCount & 2) != 0)
      {
          asciiData = Unsafe.ReadUnaligned<ushort>(pAsciiBuffer + currentOffset);
          if (!AllBytesInUInt32AreAscii(asciiData))
          {
              if (!BitConverter.IsLittleEndian)
              {
                  // The big-endian drain loop consumes from the most significant byte,
                  // so move the two bytes we read into the upper half of the DWORD.
                  asciiData <<= 16;
              }

              goto FoundNonAsciiData;
          }

          if (BitConverter.IsLittleEndian)
          {
              pUtf16Buffer[currentOffset] = (char)(byte)asciiData;
              pUtf16Buffer[currentOffset + 1] = (char)(asciiData >> 8);
          }
          else
          {
              pUtf16Buffer[currentOffset + 1] = (char)(byte)asciiData;
              pUtf16Buffer[currentOffset] = (char)(asciiData >> 8);
          }

          currentOffset += 2;
      }

      // Try to widen 8 bits -> 16 bits.

      if (((uint)remainingElementCount & 1) != 0)
      {
          asciiData = pAsciiBuffer[currentOffset];
          if (((byte)asciiData & 0x80) != 0)
          {
              goto Finish;
          }

          pUtf16Buffer[currentOffset] = (char)asciiData;
          currentOffset++;
      }

  Finish:

      return currentOffset;

  FoundNonAsciiData:

      Debug.Assert(!AllBytesInUInt32AreAscii(asciiData), "Shouldn't have reached this point if we have an all-ASCII input.");

      // Drain ASCII bytes one at a time, in memory order, until the non-ASCII byte is reached.
      // Which end of asciiData holds the first byte in memory depends on the host's endianness.

      if (BitConverter.IsLittleEndian)
      {
          // First byte in memory is the least significant byte.
          while (((byte)asciiData & 0x80) == 0)
          {
              pUtf16Buffer[currentOffset] = (char)(byte)asciiData;
              currentOffset++;
              asciiData >>= 8;
          }
      }
      else
      {
          // First byte in memory is the most significant byte.
          while ((asciiData & 0x8000_0000u) == 0)
          {
              pUtf16Buffer[currentOffset] = (char)(asciiData >> 24);
              currentOffset++;
              asciiData <<= 8;
          }
      }

      goto Finish;
  }
  /// <summary>
  /// SSE2-specific worker for ASCII-to-UTF-16 widening. Reads 16 source bytes at a time with
  /// unaligned loads, and writes the widened chars with aligned stores once the destination
  /// offset has been advanced to a vector boundary. Stops at the first vector that contains
  /// a non-ASCII byte and returns the number of elements converted so far; the caller is
  /// expected to handle whatever remains.
  /// </summary>
  /// <remarks>
  /// Requires <paramref name="elementCount"/> to be at least two vector widths (asserted in debug builds).
  /// NOTE(review): the aligned stores presume <paramref name="pUtf16Buffer"/> is at least
  /// 2-byte aligned - confirm against callers.
  /// </remarks>
  private static unsafe nuint WidenAsciiToUtf16_Sse2(byte* pAsciiBuffer, char* pUtf16Buffer, nuint elementCount)
  {
      // Both values are folded into constants by the JIT.
      uint vectorByteCount = (uint)Unsafe.SizeOf<Vector128<byte>>();
      nuint vectorAlignmentMask = (nuint)(vectorByteCount - 1);

      Debug.Assert(Sse2.IsSupported);
      Debug.Assert(BitConverter.IsLittleEndian);
      Debug.Assert(elementCount >= 2 * vectorByteCount);

      Vector128<byte> zero = Vector128<byte>.Zero;

      // Prologue: unaligned load of the first 16 bytes. Only the low 8 bytes are widened here,
      // so only the low 8 mask bits matter for the bail-out decision.
      Vector128<byte> source = Sse2.LoadVector128(pAsciiBuffer);
      uint nonAsciiMask = (uint)Sse2.MoveMask(source);
      if ((byte)nonAsciiMask != 0)
      {
          return 0;
      }

      // Unaligned store of the first 8 widened chars.
      Sse2.Store((byte*)pUtf16Buffer, Sse2.UnpackLow(source, zero));

      // Advance to the next vector-aligned destination address, measured in chars rather
      // than bytes (hence the shifts). Some input may be re-read and re-written by the
      // first loop iteration; that is harmless.
      nuint charsPerHalfVector = vectorByteCount >> 1;
      nuint destinationMisalignmentInChars = ((nuint)pUtf16Buffer >> 1) & (vectorAlignmentMask >> 1);
      nuint offset = charsPerHalfVector - destinationMisalignmentInChars;
      Debug.Assert(0 < offset && offset <= vectorByteCount / sizeof(char));

      nuint lastOffsetWhereLoopCanRun = elementCount - vectorByteCount;

      do
      {
          // Unaligned read of 16 bytes, then two aligned writes of 8 chars each.
          source = Sse2.LoadVector128(pAsciiBuffer + offset);
          nonAsciiMask = (uint)Sse2.MoveMask(source);

          if (nonAsciiMask != 0)
          {
              // A non-ASCII byte is somewhere in this vector. If it is confined to the upper
              // half, the lower 8 bytes can still be widened before giving up.
              if ((byte)nonAsciiMask == 0)
              {
                  Sse2.StoreAligned((byte*)(pUtf16Buffer + offset), Sse2.UnpackLow(source, zero));
                  offset += vectorByteCount / 2;
              }

              return offset;
          }

          byte* pDestination = (byte*)(pUtf16Buffer + offset);
          Sse2.StoreAligned(pDestination, Sse2.UnpackLow(source, zero));
          Sse2.StoreAligned(pDestination + vectorByteCount, Sse2.UnpackHigh(source, zero));

          offset += vectorByteCount;
      }
      while (offset <= lastOffsetWhereLoopCanRun);

      return offset;
  }
  /// <summary>
  /// Widens the four bytes packed in <paramref name="value"/> into four 16-bit chars and stores
  /// them at <paramref name="outputBuffer"/> and the three chars that follow it, honoring the
  /// machine's endianness.
  /// </summary>
  /// <param name="outputBuffer">Reference to the first of four consecutive destination chars.</param>
  /// <param name="value">A DWORD whose four bytes are all ASCII (asserted in debug builds).</param>
  [MethodImpl(MethodImplOptions.AggressiveInlining)]
  private static void WidenFourAsciiBytesToUtf16AndWriteToBuffer(ref char outputBuffer, uint value)
  {
      Debug.Assert(AllBytesInUInt32AreAscii(value));

      if (Bmi2.X64.IsSupported)
      {
          // PDEP scatters each source byte into the low byte of a 16-bit lane, producing all
          // four widened chars in one QWORD. BMI2 will work regardless of the processor's endianness.
          ulong widened = Bmi2.X64.ParallelBitDeposit(value, 0x00FF00FF_00FF00FFul);
          Unsafe.WriteUnaligned(ref Unsafe.As<char, byte>(ref outputBuffer), widened);
          return;
      }

      if (BitConverter.IsLittleEndian)
      {
          // Least significant byte of the DWORD is the first byte in memory.
          outputBuffer = (char)(byte)value;
          Unsafe.Add(ref outputBuffer, 1) = (char)(byte)(value >> 8);
          Unsafe.Add(ref outputBuffer, 2) = (char)(byte)(value >> 16);
          Unsafe.Add(ref outputBuffer, 3) = (char)(value >> 24);
          return;
      }

      // Big-endian: least significant byte of the DWORD is the last byte in memory.
      Unsafe.Add(ref outputBuffer, 3) = (char)(byte)value;
      Unsafe.Add(ref outputBuffer, 2) = (char)(byte)(value >> 8);
      Unsafe.Add(ref outputBuffer, 1) = (char)(byte)(value >> 16);
      outputBuffer = (char)(value >> 24);
  }
  1360. }
  1361. }