// zlib open source license
//
// Copyright (c) 2017 to 2025 David Forsgren Piuva
//
// This software is provided 'as-is', without any express or implied
// warranty. In no event will the authors be held liable for any damages
// arising from the use of this software.
//
// Permission is granted to anyone to use this software for any purpose,
// including commercial applications, and to alter it and redistribute it
// freely, subject to the following restrictions:
//
// 1. The origin of this software must not be misrepresented; you must not
//    claim that you wrote the original software. If you use this software
//    in a product, an acknowledgment in the product documentation would be
//    appreciated but is not required.
//
// 2. Altered source versions must be plainly marked as such, and must not be
//    misrepresented as being the original software.
//
// 3. This notice may not be removed or altered from any source
//    distribution.

// Hardware abstraction layer for portable SIMD math.
// Used to make calculations faster without having to mess around with hardware-specific assembly code or intrinsic functions.
// You get the performance you need today and the ability to compile with automatic scalar emulation when building for a future processor that this module has not yet been ported to.
// When you can generate the vectorized code using the same template function as your non-vectorized code, you don't even need to write a reference implementation.
//
// Using 128-bit SIMD: (beginner friendly, test once, compile anywhere, no compiler flags)
//   If you are new to vectorization or only plan to work with 128-bit extensions such as ARM NEON, you can keep it simple by only using the 128-bit vector types.
//   Pros and cons:
//   + Most target platforms (excluding older systems such as ARMv6) have 128-bit SIMD extensions such as Intel SSE2 or ARM NEON enabled by default.
//     ARMv6 does not support ARM NEON, but most ARMv7 processors support it, so compilers enable NEON by default.
//     All 64-bit ARM processors have ARM NEON, because it stopped being optional in ARMv8.
//     Builds for 64-bit Intel processors usually have SSE2 enabled by default, so you don't have to change any compiler flags when building on a different system.
//   + One build for all computers of the same instruction set.
//     Great when your application is not so resource heavy, because the least powerful systems don't have the fancy extensions anyway.
//   - You might end up enabling the additional SIMD extensions anyway, because the library is already using them to become faster.
//   Types:
//   * Use F32x4, I32x4 and U32x4 for 4 elements at a time
//   * U16x8 for 8 elements at a time
//   * U8x16 for 16 elements at a time
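//   A minimal sketch (not part of the original documentation), assuming the arithmetic operators defined
//   further down in this header and a hypothetical float array "buffer" of "elementCount" elements
//   (a multiple of 4) allocated with at least 16-byte alignment:
//     // Add 1.0f to four floats at a time.
//     for (int i = 0; i < elementCount; i += 4) {
//       F32x4 value = F32x4::readAlignedUnsafe(buffer + i);
//       (value + F32x4(1.0f)).writeAlignedUnsafe(buffer + i);
//     }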
// Using the X vector size: (advanced, having to test with different build flags or emulation)
//   If you want more performance, you can use the variable length type aliases.
//   Pros and cons:
//   + For heavy calculations where memory access is not the bottleneck, using larger SIMD vectors when enabled allows saving energy and increasing performance.
//   - If you forget to test with longer vector lengths (compiling with -mavx2 or defining EMULATE_256BIT_X_SIMD) then you might find bugs from not iterating or aligning memory correctly.
//   Types:
//   * Use F32xX, I32xX and U32xX for laneCountX_32Bit elements at a time
//   * U16xX for laneCountX_16Bit elements at a time
//   * U8xX for laneCountX_8Bit elements at a time
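//   A minimal sketch (not part of the original documentation) of the same loop with the X vector size,
//   assuming the arithmetic operators defined further down in this header and that "elementCount" is a
//   multiple of laneCountX_32Bit (4 lanes with 128-bit SIMD, 8 lanes with 256-bit SIMD):
//     // Multiply floats by 2.0f, laneCountX_32Bit elements at a time.
//     for (int i = 0; i < elementCount; i += laneCountX_32Bit) {
//       F32xX value = F32xX::readAlignedUnsafe(buffer + i);
//       (value * F32xX(2.0f)).writeAlignedUnsafe(buffer + i);
//     }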
// Using the F vector size: (very dangerous, no test can confirm that memory alignment is correct)
//   If you want even more performance, you can let float operations use the longest available F vector size, which might exceed the X vector size.
//   Pros and cons:
//   - You have to manually set the alignment of buffers to DSR_FLOAT_ALIGNMENT to prevent crashing (see the sketch after this list).
//     If the default alignment for buffers changed based on the size of F vectors, the more commonly used X vectors would get slowed down by cache misses from padding larger than the X vectors.
//     AlignedImageF32 and the sound backends are already aligned with the F vector size, because they are not generic like Buffer.
//   - It can be difficult to detect incorrect memory alignment, because a pointer can accidentally be aligned to more than what was requested.
//     If accidentally aligning to 128 bits instead of 256 bits, there is a 50% risk of failing to detect it at runtime and then failing later on another computer.
//     If sticking with 128-bit or X vectors, all buffers will be correctly aligned automatically.
//   + For heavy calculations where memory access is not the bottleneck, using larger SIMD vectors when enabled allows saving energy and increasing performance.
//   - If you forget to test with longer vector lengths (compiling with -mavx2 or defining EMULATE_256BIT_X_SIMD) then you might find bugs from not iterating or aligning memory correctly.
//   Types:
//   * Use F32xF for laneCountF_32Bit elements at a time
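//   A minimal sketch (not part of the original documentation) of manual alignment for the F vector size,
//   assuming an F32xF alias and a laneCountF_32Bit constant analogous to F32xX and laneCountX_32Bit:
//     // Stack buffer explicitly aligned for the widest float vectors.
//     ALIGN_BYTES(DSR_FLOAT_ALIGNMENT) float buffer[64];
//     for (int i = 0; i < 64; i += laneCountF_32Bit) {
//       (F32xF::readAlignedUnsafe(buffer + i) * F32xF(0.5f)).writeAlignedUnsafe(buffer + i);
//     }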
// Compiler extensions
//   On Intel/AMD processors:
//     SSE2 is usually enabled by default, because SSE2 is mandatory for 64-bit Intel instructions.
//     Use -mavx as a G++ compiler flag to enable the AVX extension, enabling the USE_AVX and USE_256BIT_F_SIMD macros.
//       If not available on your computer, you can test your algorithm for 256-bit float SIMD using EMULATE_256BIT_F_SIMD, but only if you use the F vector size.
//     Use -mavx2 as a G++ compiler flag to enable the AVX2 extension, enabling the USE_AVX2 and USE_256BIT_X_SIMD macros.
//       If not available on your computer, you can test your algorithm for 256-bit float and integer SIMD using EMULATE_256BIT_X_SIMD, but only if you use the X vector size.
//   On ARMv6 processors:
//     Scalar emulation is used when compiling for ARMv6, because it does not have NEON and VFP is not supported in this abstraction.
//   On ARMv7 processors:
//     NEON is usually enabled by default for ARMv7, because most of them have the extension.
//   On ARMv8 processors:
//     NEON can not be disabled for ARMv8, because it is mandatory in ARMv8.
//
// The g++ compiler does not consider __m256 and __m256i to have strict alignment requirements, despite crashing if they are not aligned.
//   * Each container or variable for __m256 and __m256i has to be explicitly aligned using alignas, because it is not enough that alignof returns 32.
//     The compiler only cares about the strict alignment requirement, but somehow the 256-bit AVX2 types are not treated as
//     strictly required to be aligned, despite Intel's ABI being clear about the need for them to always be aligned.
//   * It is also not enough to have all variables strictly aligned, because the compiler may automatically generate temporary variables that are unaligned.
//     Each intrinsic SIMD function has to write the result directly to an explicitly aligned named variable to suppress the creation of unaligned temporaries.
//     The intrinsic functions can not be used to form nested expressions due to this compiler bug, because intermediate values would generate unaligned temporary variables.
//   * Even if you always contain the SIMD types in an explicitly aligned struct, you must also define the copy, assignment and move operators,
//     to make sure that no unaligned temporary variables are created when moving the data around at the end of function calls.
// Some intrinsic functions require input arguments to be immediate constants.
//   A template argument can then be used as a wrapper, making sure that constant evaluation is enforced even when optimization is turned off.
//   The expression 5 + 5 will not become an immediate constant when optimization is disabled, which may cause a crash if the expression is passed as an immediate constant.
//   Sometimes you need to turn optimization off for debugging, so it is good if turning optimizations off does not cause the program to crash.
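//   A minimal sketch (names are hypothetical, not from this header) of the template wrapper described above,
//   forcing a shift amount to stay a compile-time constant even in unoptimized builds:
//     template <int BIT_OFFSET>
//     inline __m128i shiftRightU32(const __m128i &value) {
//       // BIT_OFFSET is a template argument, so _mm_srli_epi32 always receives an immediate constant.
//       return _mm_srli_epi32(value, BIT_OFFSET);
//     }
//     // Usage: shiftRightU32<8>(pixels);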
#ifndef DFPSR_SIMD
#define DFPSR_SIMD

#include <cstdint>
#include <cassert>
#include "SafePointer.h"
#include "../math/FVector.h"
#include "../math/IVector.h"
#include "../math/UVector.h"
#include "DsrTraits.h"
#include "../settings.h"
#include "../base/noSimd.h"
#include "../api/stringAPI.h"

#ifdef USE_SSE2
	#include <emmintrin.h> // SSE2
	#ifdef USE_SSSE3
		#include <tmmintrin.h> // SSSE3
	#endif
	#ifdef USE_AVX
		#include <immintrin.h> // AVX / AVX2
	#endif
#endif
#ifdef USE_NEON
	#include <arm_neon.h> // NEON
#endif

namespace dsr {

// Alignment in bytes
#define ALIGN_BYTES(SIZE) alignas(SIZE)
#define ALIGN16 ALIGN_BYTES(16) // 128-bit alignment
#define ALIGN32 ALIGN_BYTES(32) // 256-bit alignment
#define ALIGN64 ALIGN_BYTES(64) // 512-bit alignment
#define ALIGN128 ALIGN_BYTES(128) // 1024-bit alignment
#define ALIGN256 ALIGN_BYTES(256) // 2048-bit alignment

// Everything declared in here handles things specific for SSE.
// Direct use of the macros will not provide portability to all hardware.
#ifdef USE_SSE2
	// Vector types
	#define SIMD_F32x4 __m128
	#define SIMD_U8x16 __m128i
	#define SIMD_U16x8 __m128i
	#define SIMD_U32x4 __m128i
	#define SIMD_I32x4 __m128i
	// Vector uploads in address order
	#define LOAD_VECTOR_F32_SIMD(A, B, C, D) _mm_set_ps(D, C, B, A)
	#define LOAD_SCALAR_F32_SIMD(A) _mm_set1_ps(A)
	#define LOAD_VECTOR_U8_SIMD(A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P) _mm_set_epi8(P, O, N, M, L, K, J, I, H, G, F, E, D, C, B, A)
	#define LOAD_SCALAR_U8_SIMD(A) _mm_set1_epi8(A)
	#define LOAD_VECTOR_U16_SIMD(A, B, C, D, E, F, G, H) _mm_set_epi16(H, G, F, E, D, C, B, A)
	#define LOAD_SCALAR_U16_SIMD(A) _mm_set1_epi16(A)
	#define LOAD_VECTOR_U32_SIMD(A, B, C, D) _mm_set_epi32(D, C, B, A)
	#define LOAD_SCALAR_U32_SIMD(A) _mm_set1_epi32(A)
	#define LOAD_VECTOR_I32_SIMD(A, B, C, D) _mm_set_epi32(D, C, B, A)
	#define LOAD_SCALAR_I32_SIMD(A) _mm_set1_epi32(A)
	// Conversions
	#define F32_TO_I32_SIMD(A) _mm_cvttps_epi32(A)
	#define F32_TO_U32_SIMD(A) _mm_cvttps_epi32(A)
	#define I32_TO_F32_SIMD(A) _mm_cvtepi32_ps(A)
	#define U32_TO_F32_SIMD(A) _mm_cvtepi32_ps(A)
	// Unpacking conversions
	#define U8_LOW_TO_U16_SIMD(A) _mm_unpacklo_epi8(A, _mm_set1_epi8(0))
	#define U8_HIGH_TO_U16_SIMD(A) _mm_unpackhi_epi8(A, _mm_set1_epi8(0))
	#define U16_LOW_TO_U32_SIMD(A) _mm_unpacklo_epi16(A, _mm_set1_epi16(0))
	#define U16_HIGH_TO_U32_SIMD(A) _mm_unpackhi_epi16(A, _mm_set1_epi16(0))
	// Reinterpret casting
	#define REINTERPRET_U32_TO_U8_SIMD(A) (A)
	#define REINTERPRET_U32_TO_U16_SIMD(A) (A)
	#define REINTERPRET_U8_TO_U32_SIMD(A) (A)
	#define REINTERPRET_U16_TO_U32_SIMD(A) (A)
	#define REINTERPRET_U32_TO_I32_SIMD(A) (A)
	#define REINTERPRET_I32_TO_U32_SIMD(A) (A)
	// Vector float operations returning SIMD_F32x4
	#define ADD_F32_SIMD(A, B) _mm_add_ps(A, B)
	#define SUB_F32_SIMD(A, B) _mm_sub_ps(A, B)
	#define MUL_F32_SIMD(A, B) _mm_mul_ps(A, B)
	// Vector integer operations returning SIMD_I32x4
	#define ADD_I32_SIMD(A, B) _mm_add_epi32(A, B)
	#define SUB_I32_SIMD(A, B) _mm_sub_epi32(A, B)
	// 32-bit integer multiplications are not available on SSE2.
	// Vector integer operations returning SIMD_U32x4
	#define ADD_U32_SIMD(A, B) _mm_add_epi32(A, B)
	#define SUB_U32_SIMD(A, B) _mm_sub_epi32(A, B)
	// 32-bit integer multiplications are not available on SSE2.
	// Vector integer operations returning SIMD_U16x8
	#define ADD_U16_SIMD(A, B) _mm_add_epi16(A, B)
	#define SUB_U16_SIMD(A, B) _mm_sub_epi16(A, B)
	#define MUL_U16_SIMD(A, B) _mm_mullo_epi16(A, B)
	// Vector integer operations returning SIMD_U8x16
	#define ADD_U8_SIMD(A, B) _mm_add_epi8(A, B)
	#define ADD_SAT_U8_SIMD(A, B) _mm_adds_epu8(A, B) // Saturated addition
	#define SUB_U8_SIMD(A, B) _mm_sub_epi8(A, B)
	#define SUB_SAT_U8_SIMD(A, B) _mm_subs_epu8(A, B) // Saturated subtraction
	// No 8-bit multiplications
	// Statistics
	#define MIN_F32_SIMD(A, B) _mm_min_ps(A, B)
	#define MAX_F32_SIMD(A, B) _mm_max_ps(A, B)
	// TODO: Implement minimum and maximum for integer vectors, so that all operations exist for all applicable types:
	//       Using _mm256_max_epu16... in AVX2 for the 256-bit versions.
	//       Using comparisons and masking in SSE2 when _mm_max_epu16... from SSE4.1 is not available.
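	// A minimal sketch (not part of the original header) of the comparison-and-masking approach mentioned above,
	// computing an unsigned 16-bit maximum using only SSE2 intrinsics:
	//   inline __m128i max_u16_sse2(const __m128i &a, const __m128i &b) {
	//     // Flip the sign bits so that the signed _mm_cmpgt_epi16 behaves like an unsigned comparison.
	//     __m128i bias = _mm_set1_epi16((int16_t)0x8000u);
	//     __m128i mask = _mm_cmpgt_epi16(_mm_xor_si128(a, bias), _mm_xor_si128(b, bias)); // 0xFFFF where a > b
	//     // Select a where the mask is set and b elsewhere.
	//     return _mm_or_si128(_mm_and_si128(mask, a), _mm_andnot_si128(mask, b));
	//   }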
	// Bitwise
	#define BITWISE_AND_U32_SIMD(A, B) _mm_and_si128(A, B)
	#define BITWISE_OR_U32_SIMD(A, B) _mm_or_si128(A, B)
	#define BITWISE_XOR_U32_SIMD(A, B) _mm_xor_si128(A, B)
	#ifdef USE_AVX
		// 256-bit vector types
		#define SIMD_F32x8 __m256
		// Vector float operations returning SIMD_F32x8
		#define ADD_F32_SIMD256(A, B) _mm256_add_ps(A, B)
		#define SUB_F32_SIMD256(A, B) _mm256_sub_ps(A, B)
		#define MUL_F32_SIMD256(A, B) _mm256_mul_ps(A, B)
		// Statistics
		#define MIN_F32_SIMD256(A, B) _mm256_min_ps(A, B)
		#define MAX_F32_SIMD256(A, B) _mm256_max_ps(A, B)
		#ifdef USE_AVX2
			// 256-bit vector types
			#define SIMD_U8x32 __m256i
			#define SIMD_U16x16 __m256i
			#define SIMD_U32x8 __m256i
			#define SIMD_I32x8 __m256i
			// Conversions
			#define F32_TO_I32_SIMD256(A) _mm256_cvttps_epi32(A)
			#define F32_TO_U32_SIMD256(A) _mm256_cvttps_epi32(A)
			#define I32_TO_F32_SIMD256(A) _mm256_cvtepi32_ps(A)
			#define U32_TO_F32_SIMD256(A) _mm256_cvtepi32_ps(A)
			// Unpacking conversions
			#define U8_LOW_TO_U16_SIMD256(A) _mm256_unpacklo_epi8(_mm256_permute4x64_epi64(A, 0b11011000), _mm256_set1_epi8(0))
			#define U8_HIGH_TO_U16_SIMD256(A) _mm256_unpackhi_epi8(_mm256_permute4x64_epi64(A, 0b11011000), _mm256_set1_epi8(0))
			#define U16_LOW_TO_U32_SIMD256(A) _mm256_unpacklo_epi16(_mm256_permute4x64_epi64(A, 0b11011000), _mm256_set1_epi16(0))
			#define U16_HIGH_TO_U32_SIMD256(A) _mm256_unpackhi_epi16(_mm256_permute4x64_epi64(A, 0b11011000), _mm256_set1_epi16(0))
			// Reinterpret casting
			#define REINTERPRET_U32_TO_U8_SIMD256(A) (A)
			#define REINTERPRET_U32_TO_U16_SIMD256(A) (A)
			#define REINTERPRET_U8_TO_U32_SIMD256(A) (A)
			#define REINTERPRET_U16_TO_U32_SIMD256(A) (A)
			#define REINTERPRET_U32_TO_I32_SIMD256(A) (A)
			#define REINTERPRET_I32_TO_U32_SIMD256(A) (A)
			// Vector integer operations returning SIMD_I32x8
			#define ADD_I32_SIMD256(A, B) _mm256_add_epi32(A, B)
			#define SUB_I32_SIMD256(A, B) _mm256_sub_epi32(A, B)
			#define MUL_I32_SIMD256(A, B) _mm256_mullo_epi32(A, B)
			// Vector integer operations returning SIMD_U32x8
			#define ADD_U32_SIMD256(A, B) _mm256_add_epi32(A, B)
			#define SUB_U32_SIMD256(A, B) _mm256_sub_epi32(A, B)
			#define MUL_U32_SIMD256(A, B) _mm256_mullo_epi32(A, B)
			// Vector integer operations returning SIMD_U16x16
			#define ADD_U16_SIMD256(A, B) _mm256_add_epi16(A, B)
			#define SUB_U16_SIMD256(A, B) _mm256_sub_epi16(A, B)
			#define MUL_U16_SIMD256(A, B) _mm256_mullo_epi16(A, B)
			// Vector integer operations returning SIMD_U8x32
			#define ADD_U8_SIMD256(A, B) _mm256_add_epi8(A, B)
			#define ADD_SAT_U8_SIMD256(A, B) _mm256_adds_epu8(A, B) // Saturated addition
			#define SUB_U8_SIMD256(A, B) _mm256_sub_epi8(A, B)
			#define SUB_SAT_U8_SIMD256(A, B) _mm256_subs_epu8(A, B) // Saturated subtraction
			// No 8-bit multiplications
			// Bitwise
			#define BITWISE_AND_U32_SIMD256(A, B) _mm256_and_si256(A, B)
			#define BITWISE_OR_U32_SIMD256(A, B) _mm256_or_si256(A, B)
			#define BITWISE_XOR_U32_SIMD256(A, B) _mm256_xor_si256(A, B)
		#endif
	#endif
#endif
// Everything declared in here handles things specific for NEON.
// Direct use of the macros will not provide portability to all hardware.
#ifdef USE_NEON
	// Vector types
	#define SIMD_F32x4 float32x4_t
	#define SIMD_U8x16 uint8x16_t
	#define SIMD_U16x8 uint16x8_t
	#define SIMD_U32x4 uint32x4_t
	#define SIMD_I32x4 int32x4_t
	// Vector uploads in address order
	inline SIMD_F32x4 LOAD_VECTOR_F32_SIMD(float a, float b, float c, float d) {
		ALIGN16 float data[4] = {a, b, c, d};
		#ifdef SAFE_POINTER_CHECKS
			if (uintptr_t((void*)data) & 15u) { throwError(U"Unaligned stack memory detected in LOAD_VECTOR_F32_SIMD for NEON!\n"); }
		#endif
		return vld1q_f32(data);
	}
	inline SIMD_F32x4 LOAD_SCALAR_F32_SIMD(float a) {
		return vdupq_n_f32(a);
	}
	inline SIMD_U8x16 LOAD_VECTOR_U8_SIMD(uint8_t a, uint8_t b, uint8_t c, uint8_t d, uint8_t e, uint8_t f, uint8_t g, uint8_t h,
	                                      uint8_t i, uint8_t j, uint8_t k, uint8_t l, uint8_t m, uint8_t n, uint8_t o, uint8_t p) {
		ALIGN16 uint8_t data[16] = {a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p};
		#ifdef SAFE_POINTER_CHECKS
			if (uintptr_t((void*)data) & 15u) { throwError(U"Unaligned stack memory detected in LOAD_VECTOR_U8_SIMD for NEON!\n"); }
		#endif
		return vld1q_u8(data);
	}
	inline SIMD_U8x16 LOAD_SCALAR_U8_SIMD(uint16_t a) {
		return vdupq_n_u8(a);
	}
	inline SIMD_U16x8 LOAD_VECTOR_U16_SIMD(uint16_t a, uint16_t b, uint16_t c, uint16_t d, uint16_t e, uint16_t f, uint16_t g, uint16_t h) {
		ALIGN16 uint16_t data[8] = {a, b, c, d, e, f, g, h};
		#ifdef SAFE_POINTER_CHECKS
			if (uintptr_t((void*)data) & 15u) { throwError(U"Unaligned stack memory detected in LOAD_VECTOR_U16_SIMD for NEON!\n"); }
		#endif
		return vld1q_u16(data);
	}
	inline SIMD_U16x8 LOAD_SCALAR_U16_SIMD(uint16_t a) {
		return vdupq_n_u16(a);
	}
	inline SIMD_U32x4 LOAD_VECTOR_U32_SIMD(uint32_t a, uint32_t b, uint32_t c, uint32_t d) {
		ALIGN16 uint32_t data[4] = {a, b, c, d};
		#ifdef SAFE_POINTER_CHECKS
			if (uintptr_t((void*)data) & 15u) { throwError(U"Unaligned stack memory detected in LOAD_VECTOR_U32_SIMD for NEON!\n"); }
		#endif
		return vld1q_u32(data);
	}
	inline SIMD_U32x4 LOAD_SCALAR_U32_SIMD(uint32_t a) {
		return vdupq_n_u32(a);
	}
	inline SIMD_I32x4 LOAD_VECTOR_I32_SIMD(int32_t a, int32_t b, int32_t c, int32_t d) {
		ALIGN16 int32_t data[4] = {a, b, c, d};
		#ifdef SAFE_POINTER_CHECKS
			if (uintptr_t((void*)data) & 15u) { throwError(U"Unaligned stack memory detected in LOAD_VECTOR_I32_SIMD for NEON!\n"); }
		#endif
		return vld1q_s32(data);
	}
	inline SIMD_I32x4 LOAD_SCALAR_I32_SIMD(int32_t a) {
		return vdupq_n_s32(a);
	}
	// Conversions
	#define F32_TO_I32_SIMD(A) vcvtq_s32_f32(A)
	#define F32_TO_U32_SIMD(A) vcvtq_u32_f32(A)
	#define I32_TO_F32_SIMD(A) vcvtq_f32_s32(A)
	#define U32_TO_F32_SIMD(A) vcvtq_f32_u32(A)
	// Unpacking conversions
	#define U8_LOW_TO_U16_SIMD(A) vmovl_u8(vget_low_u8(A))
	#define U8_HIGH_TO_U16_SIMD(A) vmovl_u8(vget_high_u8(A))
	#define U16_LOW_TO_U32_SIMD(A) vmovl_u16(vget_low_u16(A))
	#define U16_HIGH_TO_U32_SIMD(A) vmovl_u16(vget_high_u16(A))
	// Reinterpret casting
	#define REINTERPRET_U32_TO_U8_SIMD(A) vreinterpretq_u8_u32(A)
	#define REINTERPRET_U32_TO_U16_SIMD(A) vreinterpretq_u16_u32(A)
	#define REINTERPRET_U8_TO_U32_SIMD(A) vreinterpretq_u32_u8(A)
	#define REINTERPRET_U16_TO_U32_SIMD(A) vreinterpretq_u32_u16(A)
	#define REINTERPRET_U32_TO_I32_SIMD(A) vreinterpretq_s32_u32(A)
	#define REINTERPRET_I32_TO_U32_SIMD(A) vreinterpretq_u32_s32(A)
	// Vector float operations returning SIMD_F32x4
	#define ADD_F32_SIMD(A, B) vaddq_f32(A, B)
	#define SUB_F32_SIMD(A, B) vsubq_f32(A, B)
	#define MUL_F32_SIMD(A, B) vmulq_f32(A, B)
	// Vector integer operations returning SIMD_I32x4
	#define ADD_I32_SIMD(A, B) vaddq_s32(A, B)
	#define SUB_I32_SIMD(A, B) vsubq_s32(A, B)
	#define MUL_I32_NEON(A, B) vmulq_s32(A, B)
	// Vector integer operations returning SIMD_U32x4
	#define ADD_U32_SIMD(A, B) vaddq_u32(A, B)
	#define SUB_U32_SIMD(A, B) vsubq_u32(A, B)
	#define MUL_U32_NEON(A, B) vmulq_u32(A, B)
	// Vector integer operations returning SIMD_U16x8
	#define ADD_U16_SIMD(A, B) vaddq_u16(A, B)
	#define SUB_U16_SIMD(A, B) vsubq_u16(A, B)
	#define MUL_U16_SIMD(A, B) vmulq_u16(A, B)
	// Vector integer operations returning SIMD_U8x16
	#define ADD_U8_SIMD(A, B) vaddq_u8(A, B)
	#define ADD_SAT_U8_SIMD(A, B) vqaddq_u8(A, B) // Saturated addition
	#define SUB_U8_SIMD(A, B) vsubq_u8(A, B)
	#define SUB_SAT_U8_SIMD(A, B) vqsubq_u8(A, B) // Saturated subtraction
	// No 8-bit multiplications
	// Statistics
	#define MIN_F32_SIMD(A, B) vminq_f32(A, B)
	#define MAX_F32_SIMD(A, B) vmaxq_f32(A, B)
	// Bitwise
	#define BITWISE_AND_U32_SIMD(A, B) vandq_u32(A, B)
	#define BITWISE_OR_U32_SIMD(A, B) vorrq_u32(A, B)
	#define BITWISE_XOR_U32_SIMD(A, B) veorq_u32(A, B)
#endif
/*
	The vector types below are supposed to be portable across different CPU architectures.
	When mixed with handwritten SIMD intrinsics:
		Use "USE_SSE2" instead of "__SSE2__"
		Use "USE_AVX2" instead of "__AVX2__"
		Use "USE_NEON" instead of "__ARM_NEON"
	so that any new variations of the macro names given by compilers can be added to simd.h instead of being duplicated everywhere.
	Portability exceptions:
	* The "v" variable is the native backend, which is only defined when SIMD is supported by the hardware.
		Only use it when USE_BASIC_SIMD is defined.
		It does not exist with scalar emulation.
	* The "scalars" array is available when emulating a type that the hardware does not support, or when the SIMD vector type allows direct access to its memory.
		Do not rely on it for accessing elements, because otherwise your code will not be able to compile for ARM NEON.
*/
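// A minimal sketch (not part of the original header) of how the portability exceptions above are meant to be used,
// branching on USE_BASIC_SIMD before touching the native "v" member; "a" and "b" are hypothetical F32x4 values:
//   #ifdef USE_BASIC_SIMD
//     // Wrap a native SSE2/NEON register in the portable type. "v" and this constructor only exist with hardware SIMD.
//     F32x4 sum = F32x4(ADD_F32_SIMD(a.v, b.v));
//   #else
//     // With scalar emulation there is no "v" member, so fall back to element-wise math on "scalars".
//     F32x4 sum = F32x4(a.scalars[0] + b.scalars[0], a.scalars[1] + b.scalars[1], a.scalars[2] + b.scalars[2], a.scalars[3] + b.scalars[3]);
//   #endif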
struct ALIGN16 F32x4 {
	private:
		// The uninitialized default constructor is private for safety reasons.
		F32x4() {}
	public:
		// When the uninitialized constructor is needed for performance, use this named constructor instead.
		static inline F32x4 create_dangerous_uninitialized() { return F32x4(); }
	#ifdef USE_BASIC_SIMD
	public:
		// The SIMD vector of undefined type
		// Not accessible while emulating!
		SIMD_F32x4 v;
		// Construct a portable vector from a native SIMD vector
		explicit F32x4(const SIMD_F32x4& v) : v(v) {}
		// Construct a portable vector from a set of scalars
		F32x4(float a1, float a2, float a3, float a4) : v(LOAD_VECTOR_F32_SIMD(a1, a2, a3, a4)) {}
		// Construct a portable vector from a single duplicated scalar
		explicit F32x4(float scalar) : v(LOAD_SCALAR_F32_SIMD(scalar)) {}
		// Copy constructor.
		F32x4(const F32x4& other) {
			v = other.v;
		}
		// Assignment operator.
		F32x4& operator=(const F32x4& other) {
			if (this != &other) {
				v = other.v;
			}
			return *this;
		}
		// Move operator.
		F32x4& operator=(F32x4&& other) noexcept {
			v = other.v;
			return *this;
		}
	#else
	public:
		// Emulate a SIMD vector as an array of scalars without hardware support.
		// Only accessible while emulating!
		float scalars[4];
		// Construct a portable vector from a set of scalars
		F32x4(float a1, float a2, float a3, float a4) {
			this->scalars[0] = a1;
			this->scalars[1] = a2;
			this->scalars[2] = a3;
			this->scalars[3] = a4;
		}
		// Construct a portable vector from a single duplicated scalar
		explicit F32x4(float scalar) {
			this->scalars[0] = scalar;
			this->scalars[1] = scalar;
			this->scalars[2] = scalar;
			this->scalars[3] = scalar;
		}
	#endif
		// Create a gradient vector using start and increment, so that arbitrary length vectors have a way to initialize linear iterations.
		static inline F32x4 createGradient(float start, float increment) {
			return F32x4(start, start + increment, start + increment * 2.0f, start + increment * 3.0f);
		}
		// Construct a portable SIMD vector from a pointer to aligned data.
		// Data must be aligned with at least 16 bytes.
		static inline F32x4 readAlignedUnsafe(const float* data) {
			#ifdef SAFE_POINTER_CHECKS
				if (uintptr_t((const void*)data) & 15u) { throwError(U"Unaligned pointer detected in F32x4::readAlignedUnsafe!\n"); }
			#endif
			#ifdef USE_BASIC_SIMD
				#if defined(USE_SSE2)
					ALIGN16 SIMD_F32x4 result = _mm_load_ps(data);
					return F32x4(result);
				#elif defined(USE_NEON)
					ALIGN16 SIMD_F32x4 result = vld1q_f32(data);
					return F32x4(result);
				#endif
			#else
				return F32x4(data[0], data[1], data[2], data[3]);
			#endif
		}
		// Write to aligned memory from the existing vector.
		// Data must be aligned with at least 16 bytes.
		inline void writeAlignedUnsafe(float* data) const {
			#ifdef SAFE_POINTER_CHECKS
				if (uintptr_t((void*)data) & 15u) { throwError(U"Unaligned pointer detected in F32x4::writeAlignedUnsafe!\n"); }
			#endif
			#if defined(USE_BASIC_SIMD)
				#if defined(USE_SSE2)
					_mm_store_ps(data, this->v);
				#elif defined(USE_NEON)
					vst1q_f32(data, this->v);
				#endif
			#else
				data[0] = this->scalars[0];
				data[1] = this->scalars[1];
				data[2] = this->scalars[2];
				data[3] = this->scalars[3];
			#endif
		}
		#if defined(DFPSR_GEOMETRY_FVECTOR)
			dsr::FVector4D get() const {
				ALIGN16 float data[4];
				#ifdef SAFE_POINTER_CHECKS
					if (uintptr_t(data) & 15u) { throwError(U"Unaligned stack memory detected in FVector4D F32x4::get!\n"); }
				#endif
				this->writeAlignedUnsafe(data);
				return dsr::FVector4D(data[0], data[1], data[2], data[3]);
			}
		#endif
		// Bound and alignment checked reading
		static inline F32x4 readAligned(dsr::SafePointer<const float> data, const char* methodName) {
			const float* pointer = data.getUnsafe();
			#if defined(SAFE_POINTER_CHECKS)
				data.assertInside(methodName, pointer, 16);
			#endif
			return F32x4::readAlignedUnsafe(pointer);
		}
		// Bound and alignment checked writing
		inline void writeAligned(dsr::SafePointer<float> data, const char* methodName) const {
			float* pointer = data.getUnsafe();
			#if defined(SAFE_POINTER_CHECKS)
				data.assertInside(methodName, pointer, 16);
			#endif
			this->writeAlignedUnsafe(pointer);
		}
};
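// A minimal usage sketch (not part of the original header) of the F32x4 interface above,
// assuming a float array with at least 16-byte alignment:
//   ALIGN16 float data[4] = {1.0f, 2.0f, 3.0f, 4.0f};
//   F32x4 gradient = F32x4::createGradient(0.0f, 0.5f); // Lanes 0.0, 0.5, 1.0, 1.5 in address order.
//   F32x4 values = F32x4::readAlignedUnsafe(data);      // Only alignment is checked, and only with SAFE_POINTER_CHECKS.
//   values.writeAlignedUnsafe(data);                    // Write the four lanes back in address order.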
struct ALIGN16 I32x4 {
	private:
		// The uninitialized default constructor is private for safety reasons.
		I32x4() {}
	public:
		// When the uninitialized constructor is needed for performance, use this named constructor instead.
		static inline I32x4 create_dangerous_uninitialized() { return I32x4(); }
	#if defined(USE_BASIC_SIMD)
	public:
		// The SIMD vector of undefined type
		// Not accessible while emulating!
		SIMD_I32x4 v;
		// Construct a portable vector from a native SIMD vector
		explicit I32x4(const SIMD_I32x4& v) : v(v) {}
		// Construct a portable vector from a set of scalars
		I32x4(int32_t a1, int32_t a2, int32_t a3, int32_t a4) : v(LOAD_VECTOR_I32_SIMD(a1, a2, a3, a4)) {}
		// Construct a portable vector from a single duplicated scalar
		explicit I32x4(int32_t scalar) : v(LOAD_SCALAR_I32_SIMD(scalar)) {}
		// Copy constructor.
		I32x4(const I32x4& other) {
			v = other.v;
		}
		// Assignment operator.
		I32x4& operator=(const I32x4& other) {
			if (this != &other) {
				v = other.v;
			}
			return *this;
		}
		// Move operator.
		I32x4& operator=(I32x4&& other) noexcept {
			v = other.v;
			return *this;
		}
	#else
	public:
		// Emulate a SIMD vector as an array of scalars without hardware support.
		// Only accessible while emulating!
		int32_t scalars[4];
		// Construct a portable vector from a set of scalars
		I32x4(int32_t a1, int32_t a2, int32_t a3, int32_t a4) {
			this->scalars[0] = a1;
			this->scalars[1] = a2;
			this->scalars[2] = a3;
			this->scalars[3] = a4;
		}
		// Construct a portable vector from a single duplicated scalar
		explicit I32x4(int32_t scalar) {
			this->scalars[0] = scalar;
			this->scalars[1] = scalar;
			this->scalars[2] = scalar;
			this->scalars[3] = scalar;
		}
	#endif
		// Create a gradient vector using start and increment, so that arbitrary length vectors have a way to initialize linear iterations.
		static inline I32x4 createGradient(int32_t start, int32_t increment) {
			return I32x4(start, start + increment, start + increment * 2, start + increment * 3);
		}
		// Construct a portable SIMD vector from a pointer to aligned data.
		// Data must be aligned with at least 16 bytes.
		static inline I32x4 readAlignedUnsafe(const int32_t* data) {
			#ifdef SAFE_POINTER_CHECKS
				if (uintptr_t(data) & 15u) { throwError(U"Unaligned pointer detected in I32x4::readAlignedUnsafe!\n"); }
			#endif
			#if defined(USE_BASIC_SIMD)
				#if defined(USE_SSE2)
					ALIGN16 SIMD_I32x4 result = _mm_load_si128((const __m128i*)data);
					return I32x4(result);
				#elif defined(USE_NEON)
					ALIGN16 SIMD_I32x4 result = vld1q_s32(data);
					return I32x4(result);
				#endif
			#else
				return I32x4(data[0], data[1], data[2], data[3]);
			#endif
		}
		// Write to aligned memory from the existing vector.
		// Data must be aligned with at least 16 bytes.
		inline void writeAlignedUnsafe(int32_t* data) const {
			#ifdef SAFE_POINTER_CHECKS
				if (uintptr_t(data) & 15u) { throwError(U"Unaligned pointer detected in I32x4::writeAlignedUnsafe!\n"); }
			#endif
			#if defined(USE_BASIC_SIMD)
				#if defined(USE_SSE2)
					_mm_store_si128((__m128i*)data, this->v);
				#elif defined(USE_NEON)
					vst1q_s32(data, this->v);
				#endif
			#else
				data[0] = this->scalars[0];
				data[1] = this->scalars[1];
				data[2] = this->scalars[2];
				data[3] = this->scalars[3];
			#endif
		}
		#if defined(DFPSR_GEOMETRY_IVECTOR)
			dsr::IVector4D get() const {
				ALIGN16 int32_t data[4];
				#ifdef SAFE_POINTER_CHECKS
					if (uintptr_t(data) & 15u) { throwError(U"Unaligned stack memory detected in IVector4D I32x4::get!\n"); }
				#endif
				this->writeAlignedUnsafe(data);
				return dsr::IVector4D(data[0], data[1], data[2], data[3]);
			}
		#endif
		// Bound and alignment checked reading
		static inline I32x4 readAligned(dsr::SafePointer<const int32_t> data, const char* methodName) {
			const int32_t* pointer = data.getUnsafe();
			#if defined(SAFE_POINTER_CHECKS)
				data.assertInside(methodName, pointer, 16);
			#endif
			return I32x4::readAlignedUnsafe(pointer);
		}
		// Bound and alignment checked writing
		inline void writeAligned(dsr::SafePointer<int32_t> data, const char* methodName) const {
			int32_t* pointer = data.getUnsafe();
			#if defined(SAFE_POINTER_CHECKS)
				data.assertInside(methodName, pointer, 16);
			#endif
			this->writeAlignedUnsafe(pointer);
		}
};
struct ALIGN16 U32x4 {
	private:
		// The uninitialized default constructor is private for safety reasons.
		U32x4() {}
	public:
		// When the uninitialized constructor is needed for performance, use this named constructor instead.
		static inline U32x4 create_dangerous_uninitialized() { return U32x4(); }
	#if defined(USE_BASIC_SIMD)
	public:
		// The SIMD vector of undefined type
		// Not accessible while emulating!
		SIMD_U32x4 v;
		// Construct a portable vector from a native SIMD vector
		explicit U32x4(const SIMD_U32x4& v) : v(v) {}
		// Construct a portable vector from a set of scalars
		U32x4(uint32_t a1, uint32_t a2, uint32_t a3, uint32_t a4) : v(LOAD_VECTOR_U32_SIMD(a1, a2, a3, a4)) {}
		// Construct a portable vector from a single duplicated scalar
		explicit U32x4(uint32_t scalar) : v(LOAD_SCALAR_U32_SIMD(scalar)) {}
		// Copy constructor.
		U32x4(const U32x4& other) {
			v = other.v;
		}
		// Assignment operator.
		U32x4& operator=(const U32x4& other) {
			if (this != &other) {
				v = other.v;
			}
			return *this;
		}
		// Move operator.
		U32x4& operator=(U32x4&& other) noexcept {
			v = other.v;
			return *this;
		}
	#else
	public:
		// Emulate a SIMD vector as an array of scalars without hardware support.
		// Only accessible while emulating!
		uint32_t scalars[4];
		// Construct a portable vector from a set of scalars
		U32x4(uint32_t a1, uint32_t a2, uint32_t a3, uint32_t a4) {
			this->scalars[0] = a1;
			this->scalars[1] = a2;
			this->scalars[2] = a3;
			this->scalars[3] = a4;
		}
		// Construct a portable vector from a single duplicated scalar
		explicit U32x4(uint32_t scalar) {
			this->scalars[0] = scalar;
			this->scalars[1] = scalar;
			this->scalars[2] = scalar;
			this->scalars[3] = scalar;
		}
	#endif
		// Create a gradient vector using start and increment, so that arbitrary length vectors have a way to initialize linear iterations.
		static inline U32x4 createGradient(uint32_t start, uint32_t increment) {
			return U32x4(start, start + increment, start + increment * 2, start + increment * 3);
		}
		// Construct a portable SIMD vector from a pointer to aligned data.
		// Data must be aligned with at least 16 bytes.
		static inline U32x4 readAlignedUnsafe(const uint32_t* data) {
			#ifdef SAFE_POINTER_CHECKS
				if (uintptr_t(data) & 15u) { throwError(U"Unaligned pointer detected in U32x4::readAlignedUnsafe!\n"); }
			#endif
			#if defined(USE_BASIC_SIMD)
				#if defined(USE_SSE2)
					ALIGN16 SIMD_U32x4 result = _mm_load_si128((const __m128i*)data);
					return U32x4(result);
				#elif defined(USE_NEON)
					ALIGN16 SIMD_U32x4 result = vld1q_u32(data);
					return U32x4(result);
				#endif
			#else
				return U32x4(data[0], data[1], data[2], data[3]);
			#endif
		}
		// Write to aligned memory from the existing vector.
		// Data must be aligned with at least 16 bytes.
		inline void writeAlignedUnsafe(uint32_t* data) const {
			#ifdef SAFE_POINTER_CHECKS
				if (uintptr_t(data) & 15u) { throwError(U"Unaligned pointer detected in U32x4::writeAlignedUnsafe!\n"); }
			#endif
			#if defined(USE_BASIC_SIMD)
				#if defined(USE_SSE2)
					_mm_store_si128((__m128i*)data, this->v);
				#elif defined(USE_NEON)
					vst1q_u32(data, this->v);
				#endif
			#else
				data[0] = this->scalars[0];
				data[1] = this->scalars[1];
				data[2] = this->scalars[2];
				data[3] = this->scalars[3];
			#endif
		}
		#if defined(DFPSR_GEOMETRY_UVECTOR)
			dsr::UVector4D get() const {
				ALIGN16 uint32_t data[4];
				#ifdef SAFE_POINTER_CHECKS
					if (uintptr_t(data) & 15u) { throwError(U"Unaligned stack memory detected in UVector4D U32x4::get!\n"); }
				#endif
				this->writeAlignedUnsafe(data);
				return dsr::UVector4D(data[0], data[1], data[2], data[3]);
			}
		#endif
		// Bound and alignment checked reading
		static inline U32x4 readAligned(dsr::SafePointer<const uint32_t> data, const char* methodName) {
			const uint32_t* pointer = data.getUnsafe();
			#if defined(SAFE_POINTER_CHECKS)
				data.assertInside(methodName, pointer, 16);
			#endif
			return U32x4::readAlignedUnsafe(pointer);
		}
		// Bound and alignment checked writing
		inline void writeAligned(dsr::SafePointer<uint32_t> data, const char* methodName) const {
			uint32_t* pointer = data.getUnsafe();
			#if defined(SAFE_POINTER_CHECKS)
				data.assertInside(methodName, pointer, 16);
			#endif
			this->writeAlignedUnsafe(pointer);
		}
};
  736. struct ALIGN16 U16x8 {
  737. private:
  738. // The uninitialized default constructor is private for safety reasons.
  739. U16x8() {}
  740. public:
  741. // When the uninitialized constructor is needed for performance, use this named constructor instead.
  742. static inline U16x8 create_dangerous_uninitialized() { return U16x8(); }
  743. #if defined(USE_BASIC_SIMD)
  744. public:
  745. // The SIMD vector of undefined type
  746. // Not accessible while emulating!
  747. SIMD_U16x8 v;
  748. // Construct a portable vector from a native SIMD vector
  749. explicit U16x8(const SIMD_U16x8& v) : v(v) {}
  750. // Construct a portable vector from a set of scalars
  751. U16x8(uint16_t a1, uint16_t a2, uint16_t a3, uint16_t a4, uint16_t a5, uint16_t a6, uint16_t a7, uint16_t a8) : v(LOAD_VECTOR_U16_SIMD(a1, a2, a3, a4, a5, a6, a7, a8)) {}
  752. // Construct a portable vector from a single duplicated scalar
  753. explicit U16x8(uint16_t scalar) : v(LOAD_SCALAR_U16_SIMD(scalar)) {}
  754. // Copy constructor.
  755. U16x8(const U16x8& other) {
  756. v = other.v;
  757. }
  758. // Assignment operator.
  759. U16x8& operator=(const U16x8& other) {
  760. if (this != &other) {
  761. v = other.v;
  762. }
  763. return *this;
  764. }
  765. // Move operator.
  766. U16x8& operator=(U16x8&& other) noexcept {
  767. v = other.v;
  768. return *this;
  769. }
  770. #else
  771. public:
  772. // Emulate a SIMD vector as an array of scalars without hardware support.
  773. // Only accessible while emulating!
  774. uint16_t scalars[8];
  775. // Construct a portable vector from a set of scalars
  776. U16x8(uint16_t a1, uint16_t a2, uint16_t a3, uint16_t a4, uint16_t a5, uint16_t a6, uint16_t a7, uint16_t a8) {
  777. this->scalars[0] = a1;
  778. this->scalars[1] = a2;
  779. this->scalars[2] = a3;
  780. this->scalars[3] = a4;
  781. this->scalars[4] = a5;
  782. this->scalars[5] = a6;
  783. this->scalars[6] = a7;
  784. this->scalars[7] = a8;
  785. }
  786. // Construct a portable vector from a single duplicated scalar
  787. explicit U16x8(uint16_t scalar) {
  788. this->scalars[0] = scalar;
  789. this->scalars[1] = scalar;
  790. this->scalars[2] = scalar;
  791. this->scalars[3] = scalar;
  792. this->scalars[4] = scalar;
  793. this->scalars[5] = scalar;
  794. this->scalars[6] = scalar;
  795. this->scalars[7] = scalar;
  796. }
  797. #endif
  798. // Create a gradient vector using start and increment, so that arbitrary length vectors have a way to initialize linear iterations.
  799. static inline U16x8 createGradient(uint16_t start, uint16_t increment) {
  800. return U16x8(
  801. start,
  802. start + increment,
  803. start + increment * 2,
  804. start + increment * 3,
  805. start + increment * 4,
  806. start + increment * 5,
  807. start + increment * 6,
  808. start + increment * 7
  809. );
  810. }
  811. static inline U16x8 readAlignedUnsafe(const uint16_t* data) {
  812. #ifdef SAFE_POINTER_CHECKS
  813. if (uintptr_t(data) & 15u) { throwError(U"Unaligned pointer detected in U16x8::readAlignedUnsafe!\n"); }
  814. #endif
  815. #if defined(USE_BASIC_SIMD)
  816. #if defined(USE_SSE2)
  817. ALIGN16 SIMD_I32x4 result = _mm_load_si128((const __m128i*)data);
  818. return U16x8(result);
  819. #elif defined(USE_NEON)
  820. ALIGN16 SIMD_I32x4 result = vld1q_u16(data);
  821. return U16x8(result);
  822. #endif
  823. #else
  824. return U16x8(data[0], data[1], data[2], data[3], data[4], data[5], data[6], data[7]);
  825. #endif
  826. }
  827. // Data must be aligned with at least 16 bytes.
  828. inline void writeAlignedUnsafe(uint16_t* data) const {
  829. #ifdef SAFE_POINTER_CHECKS
  830. if (uintptr_t(data) & 15u) { throwError(U"Unaligned pointer detected in U16x8::writeAlignedUnsafe!\n"); }
  831. #endif
  832. #if defined(USE_BASIC_SIMD)
  833. #if defined(USE_SSE2)
  834. _mm_store_si128((__m128i*)data, this->v);
  835. #elif defined(USE_NEON)
  836. vst1q_u16(data, this->v);
  837. #endif
  838. #else
  839. data[0] = this->scalars[0];
  840. data[1] = this->scalars[1];
  841. data[2] = this->scalars[2];
  842. data[3] = this->scalars[3];
  843. data[4] = this->scalars[4];
  844. data[5] = this->scalars[5];
  845. data[6] = this->scalars[6];
  846. data[7] = this->scalars[7];
  847. #endif
  848. }
  849. // Bound and alignment checked reading
  850. static inline U16x8 readAligned(dsr::SafePointer<const uint16_t> data, const char* methodName) {
  851. const uint16_t* pointer = data.getUnsafe();
  852. #if defined(SAFE_POINTER_CHECKS)
  853. data.assertInside(methodName, pointer, 16);
  854. #endif
  855. return U16x8::readAlignedUnsafe(pointer);
  856. }
  857. // Bound and alignment checked writing
  858. inline void writeAligned(dsr::SafePointer<uint16_t> data, const char* methodName) const {
  859. uint16_t* pointer = data.getUnsafe();
  860. #if defined(SAFE_POINTER_CHECKS)
  861. data.assertInside(methodName, pointer, 16);
  862. #endif
  863. this->writeAlignedUnsafe(pointer);
  864. }
  865. };
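// Usage sketch (illustrative only; assumes a dsr::SafePointer<uint16_t> named "pixels" that
// points to 16-byte aligned data, and uses the U16x8 bitwise operators declared further down
// in this header):
//   U16x8 row = U16x8::readAligned(pixels, "mask example");
//   row = row & U16x8(uint16_t(0x00FF));
//   row.writeAligned(pixels, "mask example");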
  866. struct ALIGN16 U8x16 {
  867. private:
  868. // The uninitialized default constructor is private for safety reasons.
  869. U8x16() {}
  870. public:
  871. // When the uninitialized constructor is needed for performance, use this named constructor instead.
  872. static inline U8x16 create_dangerous_uninitialized() { return U8x16(); }
  873. #if defined(USE_BASIC_SIMD)
  874. public:
  875. // The SIMD vector of undefined type
  876. // Not accessible while emulating!
  877. SIMD_U8x16 v;
  878. // Construct a portable vector from a native SIMD vector
  879. explicit U8x16(const SIMD_U8x16& v) : v(v) {}
  880. // Construct a portable vector from a set of scalars
  881. U8x16(uint8_t a1, uint8_t a2, uint8_t a3, uint8_t a4, uint8_t a5, uint8_t a6, uint8_t a7, uint8_t a8,
  882. uint8_t a9, uint8_t a10, uint8_t a11, uint8_t a12, uint8_t a13, uint8_t a14, uint8_t a15, uint8_t a16)
  883. : v(LOAD_VECTOR_U8_SIMD(a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11, a12, a13, a14, a15, a16)) {}
  884. // Construct a portable vector from a single duplicated scalar
  885. explicit U8x16(uint8_t scalar) : v(LOAD_SCALAR_U8_SIMD(scalar)) {}
  886. // Copy constructor.
  887. U8x16(const U8x16& other) {
  888. v = other.v;
  889. }
  890. // Assignment operator.
  891. U8x16& operator=(const U8x16& other) {
  892. if (this != &other) {
  893. v = other.v;
  894. }
  895. return *this;
  896. }
  897. // Move operator.
  898. U8x16& operator=(U8x16&& other) noexcept {
  899. v = other.v;
  900. return *this;
  901. }
  902. #else
  903. public:
  904. // Emulate a SIMD vector as an array of scalars without hardware support.
  905. // Only accessible while emulating!
  906. uint8_t scalars[16];
  907. // Construct a portable vector from a set of scalars
  908. U8x16(uint8_t a1, uint8_t a2, uint8_t a3, uint8_t a4, uint8_t a5, uint8_t a6, uint8_t a7, uint8_t a8,
  909. uint8_t a9, uint8_t a10, uint8_t a11, uint8_t a12, uint8_t a13, uint8_t a14, uint8_t a15, uint8_t a16) {
  910. this->scalars[0] = a1;
  911. this->scalars[1] = a2;
  912. this->scalars[2] = a3;
  913. this->scalars[3] = a4;
  914. this->scalars[4] = a5;
  915. this->scalars[5] = a6;
  916. this->scalars[6] = a7;
  917. this->scalars[7] = a8;
  918. this->scalars[8] = a9;
  919. this->scalars[9] = a10;
  920. this->scalars[10] = a11;
  921. this->scalars[11] = a12;
  922. this->scalars[12] = a13;
  923. this->scalars[13] = a14;
  924. this->scalars[14] = a15;
  925. this->scalars[15] = a16;
  926. }
  927. // Construct a portable vector from a single duplicated scalar
  928. explicit U8x16(uint8_t scalar) {
  929. this->scalars[0] = scalar;
  930. this->scalars[1] = scalar;
  931. this->scalars[2] = scalar;
  932. this->scalars[3] = scalar;
  933. this->scalars[4] = scalar;
  934. this->scalars[5] = scalar;
  935. this->scalars[6] = scalar;
  936. this->scalars[7] = scalar;
  937. this->scalars[8] = scalar;
  938. this->scalars[9] = scalar;
  939. this->scalars[10] = scalar;
  940. this->scalars[11] = scalar;
  941. this->scalars[12] = scalar;
  942. this->scalars[13] = scalar;
  943. this->scalars[14] = scalar;
  944. this->scalars[15] = scalar;
  945. }
  946. #endif
  947. // Create a gradient vector using start and increment, so that arbitrary length vectors have a way to initialize linear iterations.
  948. static inline U8x16 createGradient(uint8_t start, uint8_t increment) {
  949. return U8x16(
  950. start,
  951. start + increment,
  952. start + increment * 2,
  953. start + increment * 3,
  954. start + increment * 4,
  955. start + increment * 5,
  956. start + increment * 6,
  957. start + increment * 7,
  958. start + increment * 8,
  959. start + increment * 9,
  960. start + increment * 10,
  961. start + increment * 11,
  962. start + increment * 12,
  963. start + increment * 13,
  964. start + increment * 14,
  965. start + increment * 15
  966. );
  967. }
  968. static inline U8x16 readAlignedUnsafe(const uint8_t* data) {
  969. #ifdef SAFE_POINTER_CHECKS
  970. if (uintptr_t(data) & 15u) { throwError(U"Unaligned pointer detected in U8x16::readAlignedUnsafe!\n"); }
  971. #endif
  972. #if defined(USE_BASIC_SIMD)
  973. #if defined(USE_SSE2)
974. ALIGN16 SIMD_U8x16 result = _mm_load_si128((const __m128i*)data);
975. return U8x16(result);
976. #elif defined(USE_NEON)
977. ALIGN16 SIMD_U8x16 result = vld1q_u8(data);
  978. return U8x16(result);
  979. #endif
  980. #else
  981. return U8x16(
  982. data[0], data[1], data[2], data[3], data[4], data[5], data[6], data[7],
  983. data[8], data[9], data[10], data[11], data[12], data[13], data[14], data[15]
  984. );
  985. #endif
  986. }
  987. // Data must be aligned with at least 16 bytes.
  988. inline void writeAlignedUnsafe(uint8_t* data) const {
  989. #ifdef SAFE_POINTER_CHECKS
  990. if (uintptr_t(data) & 15u) { throwError(U"Unaligned pointer detected in U8x16::writeAlignedUnsafe!\n"); }
  991. #endif
  992. #if defined(USE_BASIC_SIMD)
  993. #if defined(USE_SSE2)
  994. _mm_store_si128((__m128i*)data, this->v);
  995. #elif defined(USE_NEON)
  996. vst1q_u8(data, this->v);
  997. #endif
  998. #else
  999. data[0] = this->scalars[0];
  1000. data[1] = this->scalars[1];
  1001. data[2] = this->scalars[2];
  1002. data[3] = this->scalars[3];
  1003. data[4] = this->scalars[4];
  1004. data[5] = this->scalars[5];
  1005. data[6] = this->scalars[6];
  1006. data[7] = this->scalars[7];
  1007. data[8] = this->scalars[8];
  1008. data[9] = this->scalars[9];
  1009. data[10] = this->scalars[10];
  1010. data[11] = this->scalars[11];
  1011. data[12] = this->scalars[12];
  1012. data[13] = this->scalars[13];
  1013. data[14] = this->scalars[14];
  1014. data[15] = this->scalars[15];
  1015. #endif
  1016. }
  1017. // Bound and alignment checked reading
  1018. static inline U8x16 readAligned(dsr::SafePointer<const uint8_t> data, const char* methodName) {
  1019. const uint8_t* pointer = data.getUnsafe();
  1020. #if defined(SAFE_POINTER_CHECKS)
  1021. data.assertInside(methodName, pointer, 16);
  1022. #endif
  1023. return U8x16::readAlignedUnsafe(pointer);
  1024. }
  1025. // Bound and alignment checked writing
  1026. inline void writeAligned(dsr::SafePointer<uint8_t> data, const char* methodName) const {
  1027. uint8_t* pointer = data.getUnsafe();
  1028. #if defined(SAFE_POINTER_CHECKS)
  1029. data.assertInside(methodName, pointer, 16);
  1030. #endif
  1031. this->writeAlignedUnsafe(pointer);
  1032. }
  1033. };
  1034. struct ALIGN32 F32x8 {
  1035. private:
  1036. // The uninitialized default constructor is private for safety reasons.
  1037. F32x8() {}
  1038. public:
  1039. // When the uninitialized constructor is needed for performance, use this named constructor instead.
  1040. static inline F32x8 create_dangerous_uninitialized() { return F32x8(); }
  1041. #if defined(USE_256BIT_F_SIMD)
  1042. public:
  1043. // The SIMD vector of undefined type
  1044. // Not accessible while emulating!
  1045. SIMD_F32x8 v;
  1046. // Construct a portable vector from a native SIMD vector.
  1047. explicit F32x8(const SIMD_F32x8& v) : v(v) {}
  1048. #if defined(USE_AVX)
  1049. // Construct a portable vector from a set of scalars.
  1050. F32x8(float a1, float a2, float a3, float a4, float a5, float a6, float a7, float a8) {
  1051. ALIGN32 __m256 target = _mm256_set_ps(a8, a7, a6, a5, a4, a3, a2, a1);
  1052. this->v = target;
  1053. }
  1054. // Construct a portable vector from a single duplicated scalar.
  1055. explicit F32x8(float scalar) {
  1056. ALIGN32 __m256 target = _mm256_set1_ps(scalar);
  1057. this->v = target;
  1058. }
  1059. // Copy constructor.
  1060. F32x8(const F32x8& other) {
  1061. v = other.v;
  1062. }
  1063. // Assignment operator.
  1064. F32x8& operator=(const F32x8& other) {
  1065. if (this != &other) {
  1066. v = other.v;
  1067. }
  1068. return *this;
  1069. }
  1070. // Move operator.
  1071. F32x8& operator=(F32x8&& other) noexcept {
  1072. v = other.v;
  1073. return *this;
  1074. }
  1075. #else
  1076. #error "Missing constructors for the F32x8 type!\n"
  1077. #endif
  1078. #else
  1079. public:
  1080. // Emulate a SIMD vector as an array of scalars without hardware support.
  1081. // Only accessible while emulating!
  1082. float scalars[8];
  1083. // Construct a portable vector from a set of scalars
  1084. F32x8(float a1, float a2, float a3, float a4, float a5, float a6, float a7, float a8) {
  1085. this->scalars[0] = a1;
  1086. this->scalars[1] = a2;
  1087. this->scalars[2] = a3;
  1088. this->scalars[3] = a4;
  1089. this->scalars[4] = a5;
  1090. this->scalars[5] = a6;
  1091. this->scalars[6] = a7;
  1092. this->scalars[7] = a8;
  1093. }
  1094. // Construct a portable vector from a single duplicated scalar
  1095. explicit F32x8(float scalar) {
  1096. this->scalars[0] = scalar;
  1097. this->scalars[1] = scalar;
  1098. this->scalars[2] = scalar;
  1099. this->scalars[3] = scalar;
  1100. this->scalars[4] = scalar;
  1101. this->scalars[5] = scalar;
  1102. this->scalars[6] = scalar;
  1103. this->scalars[7] = scalar;
  1104. }
  1105. #endif
  1106. // Create a gradient vector using start and increment, so that arbitrary length vectors have a way to initialize linear iterations.
  1107. static inline F32x8 createGradient(float start, float increment) {
  1108. return F32x8(
  1109. start,
  1110. start + increment,
  1111. start + increment * 2.0f,
  1112. start + increment * 3.0f,
  1113. start + increment * 4.0f,
  1114. start + increment * 5.0f,
  1115. start + increment * 6.0f,
  1116. start + increment * 7.0f
  1117. );
  1118. }
  1119. // Construct a portable SIMD vector from a pointer to aligned data.
  1120. static inline F32x8 readAlignedUnsafe(const float* data) {
  1121. #ifdef SAFE_POINTER_CHECKS
  1122. if (uintptr_t(data) & 31u) { throwError(U"Unaligned pointer detected in F32x8::readAlignedUnsafe!\n"); }
  1123. #endif
1124. #if defined(USE_AVX)
  1125. ALIGN32 __m256 result = _mm256_load_ps(data);
  1126. return F32x8(result);
  1127. #else
  1128. return F32x8(data[0], data[1], data[2], data[3], data[4], data[5], data[6], data[7]);
  1129. #endif
  1130. }
  1131. // Write to aligned memory from the existing vector.
  1132. inline void writeAlignedUnsafe(float* data) const {
  1133. #ifdef SAFE_POINTER_CHECKS
  1134. if (uintptr_t(data) & 31u) { throwError(U"Unaligned pointer detected in F32x8::writeAlignedUnsafe!\n"); }
  1135. #endif
1136. #if defined(USE_AVX)
  1137. _mm256_store_ps(data, this->v);
  1138. #else
  1139. data[0] = this->scalars[0];
  1140. data[1] = this->scalars[1];
  1141. data[2] = this->scalars[2];
  1142. data[3] = this->scalars[3];
  1143. data[4] = this->scalars[4];
  1144. data[5] = this->scalars[5];
  1145. data[6] = this->scalars[6];
  1146. data[7] = this->scalars[7];
  1147. #endif
  1148. }
  1149. // Bound and alignment checked reading
  1150. static inline F32x8 readAligned(dsr::SafePointer<const float> data, const char* methodName) {
  1151. const float* pointer = data.getUnsafe();
  1152. #if defined(SAFE_POINTER_CHECKS)
  1153. data.assertInside(methodName, pointer, 32);
  1154. #endif
  1155. return F32x8::readAlignedUnsafe(pointer);
  1156. }
  1157. // Bound and alignment checked writing
  1158. inline void writeAligned(dsr::SafePointer<float> data, const char* methodName) const {
  1159. float* pointer = data.getUnsafe();
  1160. #if defined(SAFE_POINTER_CHECKS)
  1161. data.assertInside(methodName, pointer, 32);
  1162. #endif
  1163. this->writeAlignedUnsafe(pointer);
  1164. }
  1165. };
  1166. struct ALIGN32 I32x8 {
  1167. private:
  1168. // The uninitialized default constructor is private for safety reasons.
  1169. I32x8() {}
  1170. public:
  1171. // When the uninitialized constructor is needed for performance, use this named constructor instead.
  1172. static inline I32x8 create_dangerous_uninitialized() { return I32x8(); }
  1173. #if defined(USE_256BIT_X_SIMD)
  1174. public:
  1175. // The SIMD vector of undefined type
  1176. // Not accessible while emulating!
  1177. SIMD_I32x8 v;
  1178. // Construct a portable vector from a native SIMD vector.
  1179. explicit I32x8(const SIMD_I32x8& v) : v(v) {}
  1180. #if defined(USE_AVX2)
  1181. // Construct a portable vector from a set of scalars.
  1182. I32x8(int32_t a1, int32_t a2, int32_t a3, int32_t a4, int32_t a5, int32_t a6, int32_t a7, int32_t a8) {
  1183. ALIGN32 __m256i target = _mm256_set_epi32(a8, a7, a6, a5, a4, a3, a2, a1);
  1184. this->v = target;
  1185. }
  1186. // Construct a portable vector from a single duplicated scalar.
  1187. explicit I32x8(int32_t scalar) {
  1188. ALIGN32 __m256i target = _mm256_set1_epi32(scalar);
  1189. this->v = target;
  1190. }
  1191. // Copy constructor.
  1192. I32x8(const I32x8& other) {
  1193. v = other.v;
  1194. }
  1195. // Assignment operator.
  1196. I32x8& operator=(const I32x8& other) {
  1197. if (this != &other) {
  1198. v = other.v;
  1199. }
  1200. return *this;
  1201. }
  1202. // Move operator.
  1203. I32x8& operator=(I32x8&& other) noexcept {
  1204. v = other.v;
  1205. return *this;
  1206. }
  1207. #else
  1208. #error "Missing constructors for the I32x8 type!\n"
  1209. #endif
  1210. #else
  1211. public:
  1212. // Emulate a SIMD vector as an array of scalars without hardware support.
  1213. // Only accessible while emulating!
  1214. int32_t scalars[8];
  1215. // Construct a portable vector from a set of scalars
  1216. I32x8(int32_t a1, int32_t a2, int32_t a3, int32_t a4, int32_t a5, int32_t a6, int32_t a7, int32_t a8) {
  1217. this->scalars[0] = a1;
  1218. this->scalars[1] = a2;
  1219. this->scalars[2] = a3;
  1220. this->scalars[3] = a4;
  1221. this->scalars[4] = a5;
  1222. this->scalars[5] = a6;
  1223. this->scalars[6] = a7;
  1224. this->scalars[7] = a8;
  1225. }
  1226. // Construct a portable vector from a single duplicated scalar
  1227. explicit I32x8(int32_t scalar) {
  1228. this->scalars[0] = scalar;
  1229. this->scalars[1] = scalar;
  1230. this->scalars[2] = scalar;
  1231. this->scalars[3] = scalar;
  1232. this->scalars[4] = scalar;
  1233. this->scalars[5] = scalar;
  1234. this->scalars[6] = scalar;
  1235. this->scalars[7] = scalar;
  1236. }
  1237. #endif
  1238. // Create a gradient vector using start and increment, so that arbitrary length vectors have a way to initialize linear iterations.
  1239. static inline I32x8 createGradient(int32_t start, int32_t increment) {
  1240. return I32x8(
  1241. start,
  1242. start + increment,
  1243. start + increment * 2,
  1244. start + increment * 3,
  1245. start + increment * 4,
  1246. start + increment * 5,
  1247. start + increment * 6,
  1248. start + increment * 7
  1249. );
  1250. }
  1251. // Construct a portable SIMD vector from a pointer to aligned data.
  1252. static inline I32x8 readAlignedUnsafe(const int32_t* data) {
  1253. #ifdef SAFE_POINTER_CHECKS
  1254. if (uintptr_t(data) & 31u) { throwError(U"Unaligned pointer detected in I32x8::readAlignedUnsafe!\n"); }
  1255. #endif
  1256. #if defined(USE_AVX2)
  1257. ALIGN32 __m256i result = _mm256_load_si256((const __m256i*)data);
  1258. return I32x8(result);
  1259. #else
  1260. return I32x8(data[0], data[1], data[2], data[3], data[4], data[5], data[6], data[7]);
  1261. #endif
  1262. }
  1263. // Write to aligned memory from the existing vector.
  1264. inline void writeAlignedUnsafe(int32_t* data) const {
  1265. #ifdef SAFE_POINTER_CHECKS
  1266. if (uintptr_t(data) & 31u) { throwError(U"Unaligned pointer detected in I32x8::writeAlignedUnsafe!\n"); }
  1267. #endif
  1268. #if defined(USE_AVX2)
  1269. _mm256_store_si256((__m256i*)data, this->v);
  1270. #else
  1271. data[0] = this->scalars[0];
  1272. data[1] = this->scalars[1];
  1273. data[2] = this->scalars[2];
  1274. data[3] = this->scalars[3];
  1275. data[4] = this->scalars[4];
  1276. data[5] = this->scalars[5];
  1277. data[6] = this->scalars[6];
  1278. data[7] = this->scalars[7];
  1279. #endif
  1280. }
  1281. // Bound and alignment checked reading
  1282. static inline I32x8 readAligned(dsr::SafePointer<const int32_t> data, const char* methodName) {
  1283. const int32_t* pointer = data.getUnsafe();
  1284. #if defined(SAFE_POINTER_CHECKS)
  1285. data.assertInside(methodName, pointer, 32);
  1286. #endif
  1287. return I32x8::readAlignedUnsafe(pointer);
  1288. }
  1289. // Bound and alignment checked writing
  1290. inline void writeAligned(dsr::SafePointer<int32_t> data, const char* methodName) const {
  1291. int32_t* pointer = data.getUnsafe();
  1292. #if defined(SAFE_POINTER_CHECKS)
  1293. data.assertInside(methodName, pointer, 32);
  1294. #endif
  1295. this->writeAlignedUnsafe(pointer);
  1296. }
  1297. };
  1298. struct ALIGN32 U32x8 {
  1299. private:
  1300. // The uninitialized default constructor is private for safety reasons.
  1301. U32x8() {}
  1302. public:
  1303. // When the uninitialized constructor is needed for performance, use this named constructor instead.
  1304. static inline U32x8 create_dangerous_uninitialized() { return U32x8(); }
  1305. #if defined(USE_256BIT_X_SIMD)
  1306. public:
  1307. // The SIMD vector of undefined type
  1308. // Not accessible while emulating!
  1309. SIMD_U32x8 v;
  1310. // Construct a portable vector from a native SIMD vector.
  1311. explicit U32x8(const SIMD_U32x8& v) : v(v) {}
  1312. #if defined(USE_AVX2)
  1313. // Construct a portable vector from a set of scalars.
  1314. U32x8(uint32_t a1, uint32_t a2, uint32_t a3, uint32_t a4, uint32_t a5, uint32_t a6, uint32_t a7, uint32_t a8) {
  1315. ALIGN32 __m256i target = _mm256_set_epi32(a8, a7, a6, a5, a4, a3, a2, a1);
  1316. this->v = target;
  1317. }
  1318. // Construct a portable vector from a single duplicated scalar.
  1319. explicit U32x8(uint32_t scalar) {
  1320. ALIGN32 __m256i target = _mm256_set1_epi32(scalar);
  1321. this->v = target;
  1322. }
  1323. // Copy constructor.
  1324. U32x8(const U32x8& other) {
  1325. v = other.v;
  1326. }
  1327. // Assignment operator.
  1328. U32x8& operator=(const U32x8& other) {
  1329. if (this != &other) {
  1330. v = other.v;
  1331. }
  1332. return *this;
  1333. }
  1334. // Move operator.
  1335. U32x8& operator=(U32x8&& other) noexcept {
  1336. v = other.v;
  1337. return *this;
  1338. }
  1339. #else
  1340. #error "Missing constructors for the U32x8 type!\n"
  1341. #endif
  1342. #else
  1343. public:
  1344. // Emulate a SIMD vector as an array of scalars without hardware support.
  1345. // Only accessible while emulating!
  1346. uint32_t scalars[8];
  1347. // Construct a portable vector from a set of scalars
  1348. U32x8(uint32_t a1, uint32_t a2, uint32_t a3, uint32_t a4, uint32_t a5, uint32_t a6, uint32_t a7, uint32_t a8) {
  1349. this->scalars[0] = a1;
  1350. this->scalars[1] = a2;
  1351. this->scalars[2] = a3;
  1352. this->scalars[3] = a4;
  1353. this->scalars[4] = a5;
  1354. this->scalars[5] = a6;
  1355. this->scalars[6] = a7;
  1356. this->scalars[7] = a8;
  1357. }
  1358. // Construct a portable vector from a single duplicated scalar
  1359. explicit U32x8(uint32_t scalar) {
  1360. this->scalars[0] = scalar;
  1361. this->scalars[1] = scalar;
  1362. this->scalars[2] = scalar;
  1363. this->scalars[3] = scalar;
  1364. this->scalars[4] = scalar;
  1365. this->scalars[5] = scalar;
  1366. this->scalars[6] = scalar;
  1367. this->scalars[7] = scalar;
  1368. }
  1369. #endif
  1370. // Create a gradient vector using start and increment, so that arbitrary length vectors have a way to initialize linear iterations.
  1371. static inline U32x8 createGradient(uint32_t start, uint32_t increment) {
  1372. return U32x8(
  1373. start,
  1374. start + increment,
  1375. start + increment * 2,
  1376. start + increment * 3,
  1377. start + increment * 4,
  1378. start + increment * 5,
  1379. start + increment * 6,
  1380. start + increment * 7
  1381. );
  1382. }
  1383. // Construct a portable SIMD vector from a pointer to aligned data.
  1384. // Data must be aligned with at least 32 bytes.
  1385. static inline U32x8 readAlignedUnsafe(const uint32_t* data) {
  1386. #ifdef SAFE_POINTER_CHECKS
  1387. if (uintptr_t(data) & 31u) { throwError(U"Unaligned pointer detected in U32x8::readAlignedUnsafe!\n"); }
  1388. #endif
  1389. #if defined(USE_AVX2)
  1390. ALIGN32 __m256i result = _mm256_load_si256((const __m256i*)data);
  1391. return U32x8(result);
  1392. #else
  1393. return U32x8(data[0], data[1], data[2], data[3], data[4], data[5], data[6], data[7]);
  1394. #endif
  1395. }
  1396. // Write to aligned memory from the existing vector.
1397. // Data must be aligned with at least 32 bytes.
  1398. inline void writeAlignedUnsafe(uint32_t* data) const {
  1399. #ifdef SAFE_POINTER_CHECKS
  1400. if (uintptr_t(data) & 31u) { throwError(U"Unaligned pointer detected in U32x8::writeAlignedUnsafe!\n"); }
  1401. #endif
  1402. #if defined(USE_AVX2)
  1403. _mm256_store_si256((__m256i*)data, this->v);
  1404. #else
  1405. data[0] = this->scalars[0];
  1406. data[1] = this->scalars[1];
  1407. data[2] = this->scalars[2];
  1408. data[3] = this->scalars[3];
  1409. data[4] = this->scalars[4];
  1410. data[5] = this->scalars[5];
  1411. data[6] = this->scalars[6];
  1412. data[7] = this->scalars[7];
  1413. #endif
  1414. }
  1415. // Bound and alignment checked reading
  1416. static inline U32x8 readAligned(dsr::SafePointer<const uint32_t> data, const char* methodName) {
  1417. const uint32_t* pointer = data.getUnsafe();
  1418. #if defined(SAFE_POINTER_CHECKS)
  1419. data.assertInside(methodName, pointer, 32);
  1420. #endif
  1421. return U32x8::readAlignedUnsafe(pointer);
  1422. }
  1423. // Bound and alignment checked writing
  1424. inline void writeAligned(dsr::SafePointer<uint32_t> data, const char* methodName) const {
  1425. uint32_t* pointer = data.getUnsafe();
  1426. #if defined(SAFE_POINTER_CHECKS)
  1427. data.assertInside(methodName, pointer, 32);
  1428. #endif
  1429. this->writeAlignedUnsafe(pointer);
  1430. }
  1431. };
  1432. struct ALIGN32 U16x16 {
  1433. private:
  1434. // The uninitialized default constructor is private for safety reasons.
  1435. U16x16() {}
  1436. public:
  1437. // When the uninitialized constructor is needed for performance, use this named constructor instead.
  1438. static inline U16x16 create_dangerous_uninitialized() { return U16x16(); }
  1439. #if defined(USE_256BIT_X_SIMD)
  1440. public:
  1441. // The SIMD vector of undefined type
  1442. // Not accessible while emulating!
  1443. SIMD_U16x16 v;
  1444. // Construct a portable vector from a native SIMD vector
  1445. explicit U16x16(const SIMD_U16x16& v) : v(v) {}
  1446. #if defined(USE_AVX2)
  1447. // Construct a portable vector from a set of scalars.
  1448. U16x16(uint16_t a1, uint16_t a2, uint16_t a3, uint16_t a4, uint16_t a5, uint16_t a6, uint16_t a7, uint16_t a8,
  1449. uint16_t a9, uint16_t a10, uint16_t a11, uint16_t a12, uint16_t a13, uint16_t a14, uint16_t a15, uint16_t a16) {
  1450. ALIGN32 __m256i target = _mm256_set_epi16(a16, a15, a14, a13, a12, a11, a10, a9, a8, a7, a6, a5, a4, a3, a2, a1);
  1451. this->v = target;
  1452. }
  1453. // Construct a portable vector from a single duplicated scalar.
  1454. explicit U16x16(uint16_t scalar) {
  1455. ALIGN32 __m256i target = _mm256_set1_epi16(scalar);
  1456. this->v = target;
  1457. }
  1458. // Copy constructor.
  1459. U16x16(const U16x16& other) {
  1460. v = other.v;
  1461. }
  1462. // Assignment operator.
  1463. U16x16& operator=(const U16x16& other) {
  1464. if (this != &other) {
  1465. v = other.v;
  1466. }
  1467. return *this;
  1468. }
  1469. // Move operator.
  1470. U16x16& operator=(U16x16&& other) noexcept {
  1471. v = other.v;
  1472. return *this;
  1473. }
  1474. #else
  1475. #error "Missing constructors for the U16x16 type!\n"
  1476. #endif
  1477. #else
  1478. public:
  1479. // Emulate a SIMD vector as an array of scalars without hardware support.
  1480. // Only accessible while emulating!
  1481. uint16_t scalars[16];
  1482. // Construct a portable vector from a set of scalars
  1483. U16x16(uint16_t a1, uint16_t a2, uint16_t a3, uint16_t a4, uint16_t a5, uint16_t a6, uint16_t a7, uint16_t a8,
  1484. uint16_t a9, uint16_t a10, uint16_t a11, uint16_t a12, uint16_t a13, uint16_t a14, uint16_t a15, uint16_t a16) {
  1485. this->scalars[0] = a1;
  1486. this->scalars[1] = a2;
  1487. this->scalars[2] = a3;
  1488. this->scalars[3] = a4;
  1489. this->scalars[4] = a5;
  1490. this->scalars[5] = a6;
  1491. this->scalars[6] = a7;
  1492. this->scalars[7] = a8;
  1493. this->scalars[8] = a9;
  1494. this->scalars[9] = a10;
  1495. this->scalars[10] = a11;
  1496. this->scalars[11] = a12;
  1497. this->scalars[12] = a13;
  1498. this->scalars[13] = a14;
  1499. this->scalars[14] = a15;
  1500. this->scalars[15] = a16;
  1501. }
  1502. // Construct a portable vector from a single duplicated scalar
  1503. explicit U16x16(uint16_t scalar) {
  1504. this->scalars[0] = scalar;
  1505. this->scalars[1] = scalar;
  1506. this->scalars[2] = scalar;
  1507. this->scalars[3] = scalar;
  1508. this->scalars[4] = scalar;
  1509. this->scalars[5] = scalar;
  1510. this->scalars[6] = scalar;
  1511. this->scalars[7] = scalar;
  1512. this->scalars[8] = scalar;
  1513. this->scalars[9] = scalar;
  1514. this->scalars[10] = scalar;
  1515. this->scalars[11] = scalar;
  1516. this->scalars[12] = scalar;
  1517. this->scalars[13] = scalar;
  1518. this->scalars[14] = scalar;
  1519. this->scalars[15] = scalar;
  1520. }
  1521. #endif
  1522. // Create a gradient vector using start and increment, so that arbitrary length vectors have a way to initialize linear iterations.
  1523. static inline U16x16 createGradient(uint16_t start, uint16_t increment) {
  1524. return U16x16(
  1525. start,
  1526. start + increment,
  1527. start + increment * 2,
  1528. start + increment * 3,
  1529. start + increment * 4,
  1530. start + increment * 5,
  1531. start + increment * 6,
  1532. start + increment * 7,
  1533. start + increment * 8,
  1534. start + increment * 9,
  1535. start + increment * 10,
  1536. start + increment * 11,
  1537. start + increment * 12,
  1538. start + increment * 13,
  1539. start + increment * 14,
  1540. start + increment * 15
  1541. );
  1542. }
  1543. // Data must be aligned with at least 32 bytes.
  1544. static inline U16x16 readAlignedUnsafe(const uint16_t* data) {
  1545. #ifdef SAFE_POINTER_CHECKS
  1546. if (uintptr_t(data) & 31u) { throwError(U"Unaligned pointer detected in U16x16::readAlignedUnsafe!\n"); }
  1547. #endif
  1548. #if defined(USE_AVX2)
  1549. ALIGN32 __m256i result = _mm256_load_si256((const __m256i*)data);
  1550. return U16x16(result);
  1551. #else
  1552. return U16x16(
  1553. data[0],
  1554. data[1],
  1555. data[2],
  1556. data[3],
  1557. data[4],
  1558. data[5],
  1559. data[6],
  1560. data[7],
  1561. data[8],
  1562. data[9],
  1563. data[10],
  1564. data[11],
  1565. data[12],
  1566. data[13],
  1567. data[14],
  1568. data[15]
  1569. );
  1570. #endif
  1571. }
  1572. // Data must be aligned with at least 32 bytes.
  1573. inline void writeAlignedUnsafe(uint16_t* data) const {
  1574. #ifdef SAFE_POINTER_CHECKS
  1575. if (uintptr_t(data) & 31u) { throwError(U"Unaligned pointer detected in U16x16::writeAlignedUnsafe!\n"); }
  1576. #endif
  1577. #if defined(USE_AVX2)
  1578. _mm256_store_si256((__m256i*)data, this->v);
  1579. #else
  1580. data[0] = this->scalars[0];
  1581. data[1] = this->scalars[1];
  1582. data[2] = this->scalars[2];
  1583. data[3] = this->scalars[3];
  1584. data[4] = this->scalars[4];
  1585. data[5] = this->scalars[5];
  1586. data[6] = this->scalars[6];
  1587. data[7] = this->scalars[7];
  1588. data[8] = this->scalars[8];
  1589. data[9] = this->scalars[9];
  1590. data[10] = this->scalars[10];
  1591. data[11] = this->scalars[11];
  1592. data[12] = this->scalars[12];
  1593. data[13] = this->scalars[13];
  1594. data[14] = this->scalars[14];
  1595. data[15] = this->scalars[15];
  1596. #endif
  1597. }
  1598. // Bound and alignment checked reading
  1599. static inline U16x16 readAligned(dsr::SafePointer<const uint16_t> data, const char* methodName) {
  1600. const uint16_t* pointer = data.getUnsafe();
  1601. #if defined(SAFE_POINTER_CHECKS)
  1602. data.assertInside(methodName, pointer, 32);
  1603. #endif
  1604. return U16x16::readAlignedUnsafe(pointer);
  1605. }
  1606. // Bound and alignment checked writing
  1607. inline void writeAligned(dsr::SafePointer<uint16_t> data, const char* methodName) const {
  1608. uint16_t* pointer = data.getUnsafe();
  1609. #if defined(SAFE_POINTER_CHECKS)
  1610. data.assertInside(methodName, pointer, 32);
  1611. #endif
  1612. this->writeAlignedUnsafe(pointer);
  1613. }
  1614. };
  1615. struct ALIGN32 U8x32 {
  1616. private:
  1617. // The uninitialized default constructor is private for safety reasons.
  1618. U8x32() {}
  1619. public:
  1620. // When the uninitialized constructor is needed for performance, use this named constructor instead.
  1621. static inline U8x32 create_dangerous_uninitialized() { return U8x32(); }
  1622. #if defined(USE_256BIT_X_SIMD)
  1623. public:
  1624. // The SIMD vector of undefined type
  1625. // Not accessible while emulating!
  1626. SIMD_U8x32 v;
  1627. // Construct a portable vector from a native SIMD vector
  1628. explicit U8x32(const SIMD_U8x32& v) : v(v) {}
  1629. #if defined(USE_AVX2)
  1630. // Construct a portable vector from a set of scalars.
  1631. U8x32(uint8_t a1, uint8_t a2, uint8_t a3, uint8_t a4, uint8_t a5, uint8_t a6, uint8_t a7, uint8_t a8,
  1632. uint8_t a9, uint8_t a10, uint8_t a11, uint8_t a12, uint8_t a13, uint8_t a14, uint8_t a15, uint8_t a16,
  1633. uint8_t a17, uint8_t a18, uint8_t a19, uint8_t a20, uint8_t a21, uint8_t a22, uint8_t a23, uint8_t a24,
  1634. uint8_t a25, uint8_t a26, uint8_t a27, uint8_t a28, uint8_t a29, uint8_t a30, uint8_t a31, uint8_t a32) {
  1635. ALIGN32 __m256i target = _mm256_set_epi8(a32, a31, a30, a29, a28, a27, a26, a25, a24, a23, a22, a21, a20, a19, a18, a17, a16, a15, a14, a13, a12, a11, a10, a9, a8, a7, a6, a5, a4, a3, a2, a1);
  1636. this->v = target;
  1637. }
  1638. // Construct a portable vector from a single duplicated scalar.
  1639. explicit U8x32(uint8_t scalar) {
  1640. ALIGN32 __m256i target = _mm256_set1_epi8(scalar);
  1641. this->v = target;
  1642. }
  1643. // Copy constructor.
  1644. U8x32(const U8x32& other) {
  1645. v = other.v;
  1646. }
  1647. // Assignment operator.
  1648. U8x32& operator=(const U8x32& other) {
  1649. if (this != &other) {
  1650. v = other.v;
  1651. }
  1652. return *this;
  1653. }
  1654. // Move operator.
  1655. U8x32& operator=(U8x32&& other) noexcept {
  1656. v = other.v;
  1657. return *this;
  1658. }
  1659. #else
  1660. #error "Missing constructors for the U8x32 type!\n"
  1661. #endif
  1662. #else
  1663. public:
  1664. // Emulate a SIMD vector as an array of scalars without hardware support.
  1665. // Only accessible while emulating!
  1666. uint8_t scalars[32];
  1667. // Construct a portable vector from a set of scalars
  1668. U8x32(uint8_t a1, uint8_t a2, uint8_t a3, uint8_t a4, uint8_t a5, uint8_t a6, uint8_t a7, uint8_t a8,
  1669. uint8_t a9, uint8_t a10, uint8_t a11, uint8_t a12, uint8_t a13, uint8_t a14, uint8_t a15, uint8_t a16,
  1670. uint8_t a17, uint8_t a18, uint8_t a19, uint8_t a20, uint8_t a21, uint8_t a22, uint8_t a23, uint8_t a24,
  1671. uint8_t a25, uint8_t a26, uint8_t a27, uint8_t a28, uint8_t a29, uint8_t a30, uint8_t a31, uint8_t a32) {
  1672. this->scalars[0] = a1;
  1673. this->scalars[1] = a2;
  1674. this->scalars[2] = a3;
  1675. this->scalars[3] = a4;
  1676. this->scalars[4] = a5;
  1677. this->scalars[5] = a6;
  1678. this->scalars[6] = a7;
  1679. this->scalars[7] = a8;
  1680. this->scalars[8] = a9;
  1681. this->scalars[9] = a10;
  1682. this->scalars[10] = a11;
  1683. this->scalars[11] = a12;
  1684. this->scalars[12] = a13;
  1685. this->scalars[13] = a14;
  1686. this->scalars[14] = a15;
  1687. this->scalars[15] = a16;
  1688. this->scalars[16] = a17;
  1689. this->scalars[17] = a18;
  1690. this->scalars[18] = a19;
  1691. this->scalars[19] = a20;
  1692. this->scalars[20] = a21;
  1693. this->scalars[21] = a22;
  1694. this->scalars[22] = a23;
  1695. this->scalars[23] = a24;
  1696. this->scalars[24] = a25;
  1697. this->scalars[25] = a26;
  1698. this->scalars[26] = a27;
  1699. this->scalars[27] = a28;
  1700. this->scalars[28] = a29;
  1701. this->scalars[29] = a30;
  1702. this->scalars[30] = a31;
  1703. this->scalars[31] = a32;
  1704. }
  1705. // Construct a portable vector from a single duplicated scalar
  1706. explicit U8x32(uint8_t scalar) {
  1707. for (int i = 0; i < 32; i++) {
  1708. this->scalars[i] = scalar;
  1709. }
  1710. }
  1711. #endif
  1712. // Create a gradient vector using start and increment, so that arbitrary length vectors have a way to initialize linear iterations.
  1713. static inline U8x32 createGradient(uint8_t start, uint8_t increment) {
  1714. return U8x32(
  1715. start,
  1716. start + increment,
  1717. start + increment * 2,
  1718. start + increment * 3,
  1719. start + increment * 4,
  1720. start + increment * 5,
  1721. start + increment * 6,
  1722. start + increment * 7,
  1723. start + increment * 8,
  1724. start + increment * 9,
  1725. start + increment * 10,
  1726. start + increment * 11,
  1727. start + increment * 12,
  1728. start + increment * 13,
  1729. start + increment * 14,
  1730. start + increment * 15,
  1731. start + increment * 16,
  1732. start + increment * 17,
  1733. start + increment * 18,
  1734. start + increment * 19,
  1735. start + increment * 20,
  1736. start + increment * 21,
  1737. start + increment * 22,
  1738. start + increment * 23,
  1739. start + increment * 24,
  1740. start + increment * 25,
  1741. start + increment * 26,
  1742. start + increment * 27,
  1743. start + increment * 28,
  1744. start + increment * 29,
  1745. start + increment * 30,
  1746. start + increment * 31
  1747. );
  1748. }
  1749. // Data must be aligned with at least 32 bytes.
  1750. static inline U8x32 readAlignedUnsafe(const uint8_t* data) {
  1751. #ifdef SAFE_POINTER_CHECKS
  1752. if (uintptr_t(data) & 31u) { throwError(U"Unaligned pointer detected in U8x32::readAlignedUnsafe!\n"); }
  1753. #endif
  1754. #if defined(USE_AVX2)
  1755. ALIGN32 __m256i result = _mm256_load_si256((const __m256i*)data);
  1756. return U8x32(result);
  1757. #else
  1758. U8x32 result;
  1759. for (int i = 0; i < 32; i++) {
  1760. result.scalars[i] = data[i];
  1761. }
  1762. return result;
  1763. #endif
  1764. }
  1765. // Data must be aligned with at least 32 bytes.
  1766. inline void writeAlignedUnsafe(uint8_t* data) const {
  1767. #ifdef SAFE_POINTER_CHECKS
  1768. if (uintptr_t(data) & 31u) { throwError(U"Unaligned pointer detected in U8x32::writeAlignedUnsafe!\n"); }
  1769. #endif
  1770. #if defined(USE_AVX2)
  1771. _mm256_store_si256((__m256i*)data, this->v);
  1772. #else
  1773. for (int i = 0; i < 32; i++) {
  1774. data[i] = this->scalars[i];
  1775. }
  1776. #endif
  1777. }
  1778. // Bound and alignment checked reading
  1779. static inline U8x32 readAligned(dsr::SafePointer<const uint8_t> data, const char* methodName) {
  1780. const uint8_t* pointer = data.getUnsafe();
  1781. #if defined(SAFE_POINTER_CHECKS)
  1782. data.assertInside(methodName, pointer, 32);
  1783. #endif
  1784. return U8x32::readAlignedUnsafe(pointer);
  1785. }
  1786. // Bound and alignment checked writing
  1787. inline void writeAligned(dsr::SafePointer<uint8_t> data, const char* methodName) const {
  1788. uint8_t* pointer = data.getUnsafe();
  1789. #if defined(SAFE_POINTER_CHECKS)
  1790. data.assertInside(methodName, pointer, 32);
  1791. #endif
  1792. this->writeAlignedUnsafe(pointer);
  1793. }
  1794. };
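// Alignment summary of the checks above: the 128-bit vectors (F32x4, I32x4, U32x4, U16x8,
// U8x16) require 16-byte alignment for readAligned*/writeAligned*, while the 256-bit vectors
// (F32x8, I32x8, U32x8, U16x16, U8x32) require 32-byte alignment. Illustrative sketch of a
// stack buffer that satisfies the stricter requirement:
//   alignas(32) float buffer[8] = {0.0f};
//   F32x8 values = F32x8::readAlignedUnsafe(buffer);
//   values.writeAlignedUnsafe(buffer);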
  1795. #define IMPL_SCALAR_FALLBACK_START(A, B, VECTOR_TYPE, ELEMENT_TYPE, LANE_COUNT) \
  1796. ALIGN_BYTES(sizeof(VECTOR_TYPE)) ELEMENT_TYPE lanesA[LANE_COUNT]; \
  1797. ALIGN_BYTES(sizeof(VECTOR_TYPE)) ELEMENT_TYPE lanesB[LANE_COUNT]; \
  1798. A.writeAlignedUnsafe(&(lanesA[0])); \
  1799. B.writeAlignedUnsafe(&(lanesB[0]));
  1800. // Used for vector types that have SIMD registers but not the operation needed.
  1801. #define IMPL_SCALAR_FALLBACK_INFIX_4_LANES(A, B, VECTOR_TYPE, ELEMENT_TYPE, OPERATION) { \
  1802. IMPL_SCALAR_FALLBACK_START(A, B, VECTOR_TYPE, ELEMENT_TYPE, 4) \
  1803. return VECTOR_TYPE( \
  1804. ELEMENT_TYPE(lanesA[ 0] OPERATION lanesB[ 0]), \
  1805. ELEMENT_TYPE(lanesA[ 1] OPERATION lanesB[ 1]), \
  1806. ELEMENT_TYPE(lanesA[ 2] OPERATION lanesB[ 2]), \
  1807. ELEMENT_TYPE(lanesA[ 3] OPERATION lanesB[ 3]) \
  1808. ); \
  1809. }
  1810. #define IMPL_SCALAR_FALLBACK_INFIX_8_LANES(A, B, VECTOR_TYPE, ELEMENT_TYPE, OPERATION) { \
  1811. IMPL_SCALAR_FALLBACK_START(A, B, VECTOR_TYPE, ELEMENT_TYPE, 8) \
  1812. return VECTOR_TYPE( \
  1813. ELEMENT_TYPE(lanesA[ 0] OPERATION lanesB[ 0]), \
  1814. ELEMENT_TYPE(lanesA[ 1] OPERATION lanesB[ 1]), \
  1815. ELEMENT_TYPE(lanesA[ 2] OPERATION lanesB[ 2]), \
  1816. ELEMENT_TYPE(lanesA[ 3] OPERATION lanesB[ 3]), \
  1817. ELEMENT_TYPE(lanesA[ 4] OPERATION lanesB[ 4]), \
  1818. ELEMENT_TYPE(lanesA[ 5] OPERATION lanesB[ 5]), \
  1819. ELEMENT_TYPE(lanesA[ 6] OPERATION lanesB[ 6]), \
  1820. ELEMENT_TYPE(lanesA[ 7] OPERATION lanesB[ 7]) \
  1821. ); \
  1822. }
  1823. #define IMPL_SCALAR_FALLBACK_INFIX_16_LANES(A, B, VECTOR_TYPE, ELEMENT_TYPE, OPERATION) { \
  1824. IMPL_SCALAR_FALLBACK_START(A, B, VECTOR_TYPE, ELEMENT_TYPE, 16) \
  1825. return VECTOR_TYPE( \
  1826. ELEMENT_TYPE(lanesA[ 0] OPERATION lanesB[ 0]), \
  1827. ELEMENT_TYPE(lanesA[ 1] OPERATION lanesB[ 1]), \
  1828. ELEMENT_TYPE(lanesA[ 2] OPERATION lanesB[ 2]), \
  1829. ELEMENT_TYPE(lanesA[ 3] OPERATION lanesB[ 3]), \
  1830. ELEMENT_TYPE(lanesA[ 4] OPERATION lanesB[ 4]), \
  1831. ELEMENT_TYPE(lanesA[ 5] OPERATION lanesB[ 5]), \
  1832. ELEMENT_TYPE(lanesA[ 6] OPERATION lanesB[ 6]), \
  1833. ELEMENT_TYPE(lanesA[ 7] OPERATION lanesB[ 7]), \
  1834. ELEMENT_TYPE(lanesA[ 8] OPERATION lanesB[ 8]), \
  1835. ELEMENT_TYPE(lanesA[ 9] OPERATION lanesB[ 9]), \
  1836. ELEMENT_TYPE(lanesA[10] OPERATION lanesB[10]), \
  1837. ELEMENT_TYPE(lanesA[11] OPERATION lanesB[11]), \
  1838. ELEMENT_TYPE(lanesA[12] OPERATION lanesB[12]), \
  1839. ELEMENT_TYPE(lanesA[13] OPERATION lanesB[13]), \
  1840. ELEMENT_TYPE(lanesA[14] OPERATION lanesB[14]), \
  1841. ELEMENT_TYPE(lanesA[15] OPERATION lanesB[15]) \
  1842. ); \
  1843. }
  1844. #define IMPL_SCALAR_FALLBACK_INFIX_32_LANES(A, B, VECTOR_TYPE, ELEMENT_TYPE, OPERATION) { \
  1845. IMPL_SCALAR_FALLBACK_START(A, B, VECTOR_TYPE, ELEMENT_TYPE, 32) \
  1846. return VECTOR_TYPE( \
  1847. ELEMENT_TYPE(lanesA[ 0] OPERATION lanesB[ 0]), \
  1848. ELEMENT_TYPE(lanesA[ 1] OPERATION lanesB[ 1]), \
  1849. ELEMENT_TYPE(lanesA[ 2] OPERATION lanesB[ 2]), \
  1850. ELEMENT_TYPE(lanesA[ 3] OPERATION lanesB[ 3]), \
  1851. ELEMENT_TYPE(lanesA[ 4] OPERATION lanesB[ 4]), \
  1852. ELEMENT_TYPE(lanesA[ 5] OPERATION lanesB[ 5]), \
  1853. ELEMENT_TYPE(lanesA[ 6] OPERATION lanesB[ 6]), \
  1854. ELEMENT_TYPE(lanesA[ 7] OPERATION lanesB[ 7]), \
  1855. ELEMENT_TYPE(lanesA[ 8] OPERATION lanesB[ 8]), \
  1856. ELEMENT_TYPE(lanesA[ 9] OPERATION lanesB[ 9]), \
  1857. ELEMENT_TYPE(lanesA[10] OPERATION lanesB[10]), \
  1858. ELEMENT_TYPE(lanesA[11] OPERATION lanesB[11]), \
  1859. ELEMENT_TYPE(lanesA[12] OPERATION lanesB[12]), \
  1860. ELEMENT_TYPE(lanesA[13] OPERATION lanesB[13]), \
  1861. ELEMENT_TYPE(lanesA[14] OPERATION lanesB[14]), \
  1862. ELEMENT_TYPE(lanesA[15] OPERATION lanesB[15]), \
  1863. ELEMENT_TYPE(lanesA[16] OPERATION lanesB[16]), \
  1864. ELEMENT_TYPE(lanesA[17] OPERATION lanesB[17]), \
  1865. ELEMENT_TYPE(lanesA[18] OPERATION lanesB[18]), \
  1866. ELEMENT_TYPE(lanesA[19] OPERATION lanesB[19]), \
  1867. ELEMENT_TYPE(lanesA[20] OPERATION lanesB[20]), \
  1868. ELEMENT_TYPE(lanesA[21] OPERATION lanesB[21]), \
  1869. ELEMENT_TYPE(lanesA[22] OPERATION lanesB[22]), \
  1870. ELEMENT_TYPE(lanesA[23] OPERATION lanesB[23]), \
  1871. ELEMENT_TYPE(lanesA[24] OPERATION lanesB[24]), \
  1872. ELEMENT_TYPE(lanesA[25] OPERATION lanesB[25]), \
  1873. ELEMENT_TYPE(lanesA[26] OPERATION lanesB[26]), \
  1874. ELEMENT_TYPE(lanesA[27] OPERATION lanesB[27]), \
  1875. ELEMENT_TYPE(lanesA[28] OPERATION lanesB[28]), \
  1876. ELEMENT_TYPE(lanesA[29] OPERATION lanesB[29]), \
  1877. ELEMENT_TYPE(lanesA[30] OPERATION lanesB[30]), \
  1878. ELEMENT_TYPE(lanesA[31] OPERATION lanesB[31]) \
  1879. ); \
  1880. }
  1881. // Used for vector types that do not have any supported SIMD register.
  1882. #define IMPL_SCALAR_REFERENCE_INFIX_4_LANES(A, B, VECTOR_TYPE, ELEMENT_TYPE, OPERATION) \
  1883. { \
  1884. VECTOR_TYPE impl_result = VECTOR_TYPE::create_dangerous_uninitialized(); \
  1885. impl_result.scalars[ 0] = (A).scalars[ 0] OPERATION (B).scalars[ 0]; \
  1886. impl_result.scalars[ 1] = (A).scalars[ 1] OPERATION (B).scalars[ 1]; \
  1887. impl_result.scalars[ 2] = (A).scalars[ 2] OPERATION (B).scalars[ 2]; \
  1888. impl_result.scalars[ 3] = (A).scalars[ 3] OPERATION (B).scalars[ 3]; \
  1889. return impl_result; \
  1890. }
  1891. #define IMPL_SCALAR_REFERENCE_INFIX_8_LANES(A, B, VECTOR_TYPE, ELEMENT_TYPE, OPERATION) \
  1892. { \
  1893. VECTOR_TYPE impl_result = VECTOR_TYPE::create_dangerous_uninitialized(); \
  1894. impl_result.scalars[ 0] = (A).scalars[ 0] OPERATION (B).scalars[ 0]; \
  1895. impl_result.scalars[ 1] = (A).scalars[ 1] OPERATION (B).scalars[ 1]; \
  1896. impl_result.scalars[ 2] = (A).scalars[ 2] OPERATION (B).scalars[ 2]; \
  1897. impl_result.scalars[ 3] = (A).scalars[ 3] OPERATION (B).scalars[ 3]; \
  1898. impl_result.scalars[ 4] = (A).scalars[ 4] OPERATION (B).scalars[ 4]; \
  1899. impl_result.scalars[ 5] = (A).scalars[ 5] OPERATION (B).scalars[ 5]; \
  1900. impl_result.scalars[ 6] = (A).scalars[ 6] OPERATION (B).scalars[ 6]; \
  1901. impl_result.scalars[ 7] = (A).scalars[ 7] OPERATION (B).scalars[ 7]; \
  1902. return impl_result; \
  1903. }
  1904. #define IMPL_SCALAR_REFERENCE_INFIX_16_LANES(A, B, VECTOR_TYPE, ELEMENT_TYPE, OPERATION) \
  1905. { \
  1906. VECTOR_TYPE impl_result = VECTOR_TYPE::create_dangerous_uninitialized(); \
  1907. impl_result.scalars[ 0] = (A).scalars[ 0] OPERATION (B).scalars[ 0]; \
  1908. impl_result.scalars[ 1] = (A).scalars[ 1] OPERATION (B).scalars[ 1]; \
  1909. impl_result.scalars[ 2] = (A).scalars[ 2] OPERATION (B).scalars[ 2]; \
  1910. impl_result.scalars[ 3] = (A).scalars[ 3] OPERATION (B).scalars[ 3]; \
  1911. impl_result.scalars[ 4] = (A).scalars[ 4] OPERATION (B).scalars[ 4]; \
  1912. impl_result.scalars[ 5] = (A).scalars[ 5] OPERATION (B).scalars[ 5]; \
  1913. impl_result.scalars[ 6] = (A).scalars[ 6] OPERATION (B).scalars[ 6]; \
  1914. impl_result.scalars[ 7] = (A).scalars[ 7] OPERATION (B).scalars[ 7]; \
  1915. impl_result.scalars[ 8] = (A).scalars[ 8] OPERATION (B).scalars[ 8]; \
  1916. impl_result.scalars[ 9] = (A).scalars[ 9] OPERATION (B).scalars[ 9]; \
  1917. impl_result.scalars[10] = (A).scalars[10] OPERATION (B).scalars[10]; \
  1918. impl_result.scalars[11] = (A).scalars[11] OPERATION (B).scalars[11]; \
  1919. impl_result.scalars[12] = (A).scalars[12] OPERATION (B).scalars[12]; \
  1920. impl_result.scalars[13] = (A).scalars[13] OPERATION (B).scalars[13]; \
  1921. impl_result.scalars[14] = (A).scalars[14] OPERATION (B).scalars[14]; \
  1922. impl_result.scalars[15] = (A).scalars[15] OPERATION (B).scalars[15]; \
  1923. return impl_result; \
  1924. }
  1925. #define IMPL_SCALAR_REFERENCE_INFIX_32_LANES(A, B, VECTOR_TYPE, ELEMENT_TYPE, OPERATION) \
  1926. { \
  1927. VECTOR_TYPE impl_result = VECTOR_TYPE::create_dangerous_uninitialized(); \
  1928. impl_result.scalars[ 0] = (A).scalars[ 0] OPERATION (B).scalars[ 0]; \
  1929. impl_result.scalars[ 1] = (A).scalars[ 1] OPERATION (B).scalars[ 1]; \
  1930. impl_result.scalars[ 2] = (A).scalars[ 2] OPERATION (B).scalars[ 2]; \
  1931. impl_result.scalars[ 3] = (A).scalars[ 3] OPERATION (B).scalars[ 3]; \
  1932. impl_result.scalars[ 4] = (A).scalars[ 4] OPERATION (B).scalars[ 4]; \
  1933. impl_result.scalars[ 5] = (A).scalars[ 5] OPERATION (B).scalars[ 5]; \
  1934. impl_result.scalars[ 6] = (A).scalars[ 6] OPERATION (B).scalars[ 6]; \
  1935. impl_result.scalars[ 7] = (A).scalars[ 7] OPERATION (B).scalars[ 7]; \
  1936. impl_result.scalars[ 8] = (A).scalars[ 8] OPERATION (B).scalars[ 8]; \
  1937. impl_result.scalars[ 9] = (A).scalars[ 9] OPERATION (B).scalars[ 9]; \
  1938. impl_result.scalars[10] = (A).scalars[10] OPERATION (B).scalars[10]; \
  1939. impl_result.scalars[11] = (A).scalars[11] OPERATION (B).scalars[11]; \
  1940. impl_result.scalars[12] = (A).scalars[12] OPERATION (B).scalars[12]; \
  1941. impl_result.scalars[13] = (A).scalars[13] OPERATION (B).scalars[13]; \
  1942. impl_result.scalars[14] = (A).scalars[14] OPERATION (B).scalars[14]; \
  1943. impl_result.scalars[15] = (A).scalars[15] OPERATION (B).scalars[15]; \
  1944. impl_result.scalars[16] = (A).scalars[16] OPERATION (B).scalars[16]; \
  1945. impl_result.scalars[17] = (A).scalars[17] OPERATION (B).scalars[17]; \
  1946. impl_result.scalars[18] = (A).scalars[18] OPERATION (B).scalars[18]; \
  1947. impl_result.scalars[19] = (A).scalars[19] OPERATION (B).scalars[19]; \
  1948. impl_result.scalars[20] = (A).scalars[20] OPERATION (B).scalars[20]; \
  1949. impl_result.scalars[21] = (A).scalars[21] OPERATION (B).scalars[21]; \
  1950. impl_result.scalars[22] = (A).scalars[22] OPERATION (B).scalars[22]; \
  1951. impl_result.scalars[23] = (A).scalars[23] OPERATION (B).scalars[23]; \
  1952. impl_result.scalars[24] = (A).scalars[24] OPERATION (B).scalars[24]; \
  1953. impl_result.scalars[25] = (A).scalars[25] OPERATION (B).scalars[25]; \
  1954. impl_result.scalars[26] = (A).scalars[26] OPERATION (B).scalars[26]; \
  1955. impl_result.scalars[27] = (A).scalars[27] OPERATION (B).scalars[27]; \
  1956. impl_result.scalars[28] = (A).scalars[28] OPERATION (B).scalars[28]; \
  1957. impl_result.scalars[29] = (A).scalars[29] OPERATION (B).scalars[29]; \
  1958. impl_result.scalars[30] = (A).scalars[30] OPERATION (B).scalars[30]; \
  1959. impl_result.scalars[31] = (A).scalars[31] OPERATION (B).scalars[31]; \
  1960. return impl_result; \
  1961. }
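// Usage note: the IMPL_SCALAR_FALLBACK_* macros above are for types that do have a SIMD
// register but lack a specific instruction, so they round-trip through aligned lane arrays,
// while the IMPL_SCALAR_REFERENCE_* macros read the emulated scalars member directly. The
// I32x4 multiplication further down in this header expands one or the other depending on
// which instruction set is available.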
1962. // Helper macros for applying an operation to selected sets of SIMD vector types.
1963. // Each entry expands DO(VECTOR_TYPE, ELEMENT_TYPE, LANE_COUNT).
  1964. #define FOR_ALL_VECTOR_TYPES(DO) \
  1965. DO(F32x4, float, 4) \
  1966. DO(I32x4, int32_t, 4) \
  1967. DO(U32x4, uint32_t, 4) \
  1968. DO(U16x8, uint16_t, 8) \
  1969. DO(U8x16, uint8_t, 16) \
  1970. DO(F32x8, float, 8) \
  1971. DO(I32x8, int32_t, 8) \
  1972. DO(U32x8, uint32_t, 8) \
  1973. DO(U16x16, uint16_t, 16) \
  1974. DO(U8x32, uint8_t, 32)
  1975. #define FOR_FLOAT_VECTOR_TYPES(DO) \
  1976. DO(F32x4, float, 4) \
  1977. DO(F32x8, float, 8)
  1978. #define FOR_INTEGER_VECTOR_TYPES(DO) \
  1979. DO(I32x4, int32_t, 4) \
  1980. DO(U32x4, uint32_t, 4) \
  1981. DO(U16x8, uint16_t, 8) \
  1982. DO(U8x16, uint8_t, 16) \
  1983. DO(I32x8, int32_t, 8) \
  1984. DO(U32x8, uint32_t, 8) \
  1985. DO(U16x16, uint16_t, 16) \
  1986. DO(U8x32, uint8_t, 32)
  1987. #define FOR_SIGNED_VECTOR_TYPES(DO) \
  1988. DO(F32x4, float, 4) \
  1989. DO(I32x4, int32_t, 4) \
  1990. DO(F32x8, float, 8) \
  1991. DO(I32x8, int32_t, 8)
  1992. #define FOR_UNSIGNED_VECTOR_TYPES(DO) \
  1993. DO(U32x4, uint32_t, 4) \
  1994. DO(U16x8, uint16_t, 8) \
  1995. DO(U8x16, uint8_t, 16) \
  1996. DO(U32x8, uint32_t, 8) \
  1997. DO(U16x16, uint16_t, 16) \
  1998. DO(U8x32, uint8_t, 32)
  1999. // Print SIMD vectors to the terminal or append them to strings.
  2000. #define CREATE_METHOD_PRINT(VECTOR_TYPE, ELEMENT_TYPE, LANE_COUNT) \
  2001. inline dsr::String& string_toStreamIndented(dsr::String& target, const VECTOR_TYPE& source, const dsr::ReadableString& indentation) { \
  2002. ALIGN_BYTES(sizeof(VECTOR_TYPE)) ELEMENT_TYPE a[LANE_COUNT]; \
  2003. source.writeAlignedUnsafe(&(a[0])); \
  2004. dsr::string_append(target, indentation, a[0]); \
  2005. for (int i = 1; i < LANE_COUNT; i++) { \
  2006. string_append(target, U", ", a[i]); \
  2007. } \
  2008. return target; \
  2009. }
  2010. // All SIMD vectors can be printed.
  2011. FOR_ALL_VECTOR_TYPES(CREATE_METHOD_PRINT)
  2012. #undef CREATE_METHOD_PRINT
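// Illustrative sketch (assumes dsr::String is default constructible and that U"" literals
// convert to dsr::ReadableString, as the U", " separator above suggests):
//   dsr::String text;
//   string_toStreamIndented(text, U16x8::createGradient(0, 1), U"");
//   // text would now contain "0, 1, 2, 3, 4, 5, 6, 7".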
2013. // Integer equality returns true iff all lanes are identical.
  2014. #define CREATE_EXACT_EQUALITY(VECTOR_TYPE, ELEMENT_TYPE, LANE_COUNT) \
  2015. inline bool allLanesEqual(const VECTOR_TYPE& left, const VECTOR_TYPE& right) { \
  2016. ALIGN_BYTES(sizeof(VECTOR_TYPE)) ELEMENT_TYPE a[LANE_COUNT]; \
  2017. ALIGN_BYTES(sizeof(VECTOR_TYPE)) ELEMENT_TYPE b[LANE_COUNT]; \
  2018. left.writeAlignedUnsafe(&(a[0])); \
  2019. right.writeAlignedUnsafe(&(b[0])); \
  2020. for (int i = 0; i < LANE_COUNT; i++) { \
  2021. if (a[i] != b[i]) return false; \
  2022. } \
  2023. return true; \
  2024. } \
  2025. inline bool allLanesNotEqual(const VECTOR_TYPE& left, const VECTOR_TYPE& right) { \
  2026. ALIGN_BYTES(sizeof(VECTOR_TYPE)) ELEMENT_TYPE a[LANE_COUNT]; \
  2027. ALIGN_BYTES(sizeof(VECTOR_TYPE)) ELEMENT_TYPE b[LANE_COUNT]; \
  2028. left.writeAlignedUnsafe(&(a[0])); \
  2029. right.writeAlignedUnsafe(&(b[0])); \
  2030. for (int i = 0; i < LANE_COUNT; i++) { \
  2031. if (a[i] == b[i]) return false; \
  2032. } \
  2033. return true; \
  2034. }
2035. // Integer SIMD vectors have exact equality.
  2036. FOR_INTEGER_VECTOR_TYPES(CREATE_EXACT_EQUALITY)
  2037. #undef CREATE_EXACT_EQUALITY
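// Note: allLanesNotEqual tests that every lane differs; it is not the logical negation of
// allLanesEqual. With a = U32x4(1, 2, 3, 4) and b = U32x4(1, 9, 9, 9), both
// allLanesEqual(a, b) and allLanesNotEqual(a, b) return false, because only some lanes match.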
2038. // Float equality returns true iff all lanes are approximately equal.
  2039. #define CREATE_TOLERANT_EQUALITY(VECTOR_TYPE, ELEMENT_TYPE, LANE_COUNT) \
  2040. inline bool allLanesEqual(const VECTOR_TYPE& left, const VECTOR_TYPE& right) { \
  2041. ALIGN_BYTES(sizeof(VECTOR_TYPE)) ELEMENT_TYPE a[LANE_COUNT]; \
  2042. ALIGN_BYTES(sizeof(VECTOR_TYPE)) ELEMENT_TYPE b[LANE_COUNT]; \
  2043. left.writeAlignedUnsafe(&(a[0])); \
  2044. right.writeAlignedUnsafe(&(b[0])); \
  2045. for (int i = 0; i < LANE_COUNT; i++) { \
  2046. if (fabs(a[i] - b[i]) >= 0.0001f) return false; \
  2047. } \
  2048. return true; \
  2049. } \
  2050. inline bool allLanesNotEqual(const VECTOR_TYPE& left, const VECTOR_TYPE& right) { \
  2051. ALIGN_BYTES(sizeof(VECTOR_TYPE)) ELEMENT_TYPE a[LANE_COUNT]; \
  2052. ALIGN_BYTES(sizeof(VECTOR_TYPE)) ELEMENT_TYPE b[LANE_COUNT]; \
  2053. left.writeAlignedUnsafe(&(a[0])); \
  2054. right.writeAlignedUnsafe(&(b[0])); \
  2055. for (int i = 0; i < LANE_COUNT; i++) { \
  2056. if (fabs(a[i] - b[i]) < 0.0001f) return false; \
  2057. } \
  2058. return true; \
  2059. }
  2060. // Float SIMD vectors have inexact equality.
  2061. FOR_FLOAT_VECTOR_TYPES(CREATE_TOLERANT_EQUALITY)
  2062. #undef CREATE_TOLERANT_EQUALITY
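// Note: the float versions treat two lanes as equal when they differ by less than 0.0001,
// so allLanesEqual tolerates small rounding errors instead of comparing bit patterns. As with
// the integer versions, allLanesNotEqual is not the negation of allLanesEqual; both can
// return false for the same pair of vectors.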
  2063. #define CREATE_COMPARISONS(VECTOR_TYPE, ELEMENT_TYPE, LANE_COUNT) \
  2064. inline bool allLanesGreater(const VECTOR_TYPE& left, const VECTOR_TYPE& right) { \
  2065. ALIGN_BYTES(sizeof(VECTOR_TYPE)) ELEMENT_TYPE a[LANE_COUNT]; \
  2066. ALIGN_BYTES(sizeof(VECTOR_TYPE)) ELEMENT_TYPE b[LANE_COUNT]; \
  2067. left.writeAlignedUnsafe(&(a[0])); \
  2068. right.writeAlignedUnsafe(&(b[0])); \
  2069. for (int i = 0; i < LANE_COUNT; i++) { \
  2070. if (a[i] <= b[i]) return false; \
  2071. } \
  2072. return true; \
  2073. } \
  2074. inline bool allLanesGreaterOrEqual(const VECTOR_TYPE& left, const VECTOR_TYPE& right) { \
  2075. ALIGN_BYTES(sizeof(VECTOR_TYPE)) ELEMENT_TYPE a[LANE_COUNT]; \
  2076. ALIGN_BYTES(sizeof(VECTOR_TYPE)) ELEMENT_TYPE b[LANE_COUNT]; \
  2077. left.writeAlignedUnsafe(&(a[0])); \
  2078. right.writeAlignedUnsafe(&(b[0])); \
  2079. for (int i = 0; i < LANE_COUNT; i++) { \
  2080. if (a[i] < b[i]) return false; \
  2081. } \
  2082. return true; \
  2083. } \
  2084. inline bool allLanesLesser(const VECTOR_TYPE& left, const VECTOR_TYPE& right) { \
  2085. ALIGN_BYTES(sizeof(VECTOR_TYPE)) ELEMENT_TYPE a[LANE_COUNT]; \
  2086. ALIGN_BYTES(sizeof(VECTOR_TYPE)) ELEMENT_TYPE b[LANE_COUNT]; \
  2087. left.writeAlignedUnsafe(&(a[0])); \
  2088. right.writeAlignedUnsafe(&(b[0])); \
  2089. for (int i = 0; i < LANE_COUNT; i++) { \
  2090. if (a[i] >= b[i]) return false; \
  2091. } \
  2092. return true; \
  2093. } \
  2094. inline bool allLanesLesserOrEqual(const VECTOR_TYPE& left, const VECTOR_TYPE& right) { \
  2095. ALIGN_BYTES(sizeof(VECTOR_TYPE)) ELEMENT_TYPE a[LANE_COUNT]; \
  2096. ALIGN_BYTES(sizeof(VECTOR_TYPE)) ELEMENT_TYPE b[LANE_COUNT]; \
  2097. left.writeAlignedUnsafe(&(a[0])); \
  2098. right.writeAlignedUnsafe(&(b[0])); \
  2099. for (int i = 0; i < LANE_COUNT; i++) { \
  2100. if (a[i] > b[i]) return false; \
  2101. } \
  2102. return true; \
  2103. }
  2104. FOR_ALL_VECTOR_TYPES(CREATE_COMPARISONS)
  2105. #undef CREATE_COMPARISONS
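// Note: these comparisons are all-lane predicates. allLanesGreater(a, b) returning false only
// means that at least one lane failed the strict comparison; it does not imply
// allLanesLesserOrEqual(a, b).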
  2106. inline F32x4 operator+(const F32x4& left, const F32x4& right) {
  2107. #if defined(USE_BASIC_SIMD)
  2108. return F32x4(ADD_F32_SIMD(left.v, right.v));
  2109. #else
  2110. return F32x4(left.scalars[0] + right.scalars[0], left.scalars[1] + right.scalars[1], left.scalars[2] + right.scalars[2], left.scalars[3] + right.scalars[3]);
  2111. #endif
  2112. }
  2113. inline F32x4 operator-(const F32x4& left, const F32x4& right) {
  2114. #if defined(USE_BASIC_SIMD)
  2115. return F32x4(SUB_F32_SIMD(left.v, right.v));
  2116. #else
  2117. return F32x4(left.scalars[0] - right.scalars[0], left.scalars[1] - right.scalars[1], left.scalars[2] - right.scalars[2], left.scalars[3] - right.scalars[3]);
  2118. #endif
  2119. }
  2120. inline F32x4 operator*(const F32x4& left, const F32x4& right) {
  2121. #if defined(USE_BASIC_SIMD)
  2122. return F32x4(MUL_F32_SIMD(left.v, right.v));
  2123. #else
  2124. return F32x4(left.scalars[0] * right.scalars[0], left.scalars[1] * right.scalars[1], left.scalars[2] * right.scalars[2], left.scalars[3] * right.scalars[3]);
  2125. #endif
  2126. }
  2127. inline F32x4 min(const F32x4& left, const F32x4& right) {
  2128. #if defined(USE_BASIC_SIMD)
  2129. return F32x4(MIN_F32_SIMD(left.v, right.v));
  2130. #else
  2131. float v0 = left.scalars[0];
  2132. float v1 = left.scalars[1];
  2133. float v2 = left.scalars[2];
  2134. float v3 = left.scalars[3];
  2135. float r0 = right.scalars[0];
  2136. float r1 = right.scalars[1];
  2137. float r2 = right.scalars[2];
  2138. float r3 = right.scalars[3];
  2139. if (r0 < v0) { v0 = r0; }
  2140. if (r1 < v1) { v1 = r1; }
  2141. if (r2 < v2) { v2 = r2; }
  2142. if (r3 < v3) { v3 = r3; }
  2143. return F32x4(v0, v1, v2, v3);
  2144. #endif
  2145. }
  2146. inline F32x4 max(const F32x4& left, const F32x4& right) {
  2147. #if defined(USE_BASIC_SIMD)
  2148. return F32x4(MAX_F32_SIMD(left.v, right.v));
  2149. #else
  2150. float v0 = left.scalars[0];
  2151. float v1 = left.scalars[1];
  2152. float v2 = left.scalars[2];
  2153. float v3 = left.scalars[3];
  2154. float r0 = right.scalars[0];
  2155. float r1 = right.scalars[1];
  2156. float r2 = right.scalars[2];
  2157. float r3 = right.scalars[3];
  2158. if (r0 > v0) { v0 = r0; }
  2159. if (r1 > v1) { v1 = r1; }
  2160. if (r2 > v2) { v2 = r2; }
  2161. if (r3 > v3) { v3 = r3; }
  2162. return F32x4(v0, v1, v2, v3);
  2163. #endif
  2164. }
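// Illustrative sketch: min and max compose into a per-lane clamp. Assuming F32x4 has the same
// duplicated-scalar constructor as the vector types above, clamping a vector "values" to the
// range [0, 1] would look like:
//   F32x4 clamped = max(F32x4(0.0f), min(F32x4(1.0f), values));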
  2165. inline I32x4 operator+(const I32x4& left, const I32x4& right) {
  2166. #if defined(USE_BASIC_SIMD)
  2167. return I32x4(ADD_I32_SIMD(left.v, right.v));
  2168. #else
  2169. return I32x4(left.scalars[0] + right.scalars[0], left.scalars[1] + right.scalars[1], left.scalars[2] + right.scalars[2], left.scalars[3] + right.scalars[3]);
  2170. #endif
  2171. }
  2172. inline I32x4 operator-(const I32x4& left, const I32x4& right) {
  2173. #if defined(USE_BASIC_SIMD)
  2174. return I32x4(SUB_I32_SIMD(left.v, right.v));
  2175. #else
  2176. return I32x4(left.scalars[0] - right.scalars[0], left.scalars[1] - right.scalars[1], left.scalars[2] - right.scalars[2], left.scalars[3] - right.scalars[3]);
  2177. #endif
  2178. }
  2179. inline I32x4 operator*(const I32x4& left, const I32x4& right) {
  2180. #if defined(USE_BASIC_SIMD)
  2181. #if defined(USE_SSE2)
  2182. // TODO: Use AVX2 for 32-bit integer multiplication when available.
  2183. IMPL_SCALAR_FALLBACK_INFIX_4_LANES(left, right, I32x4, int32_t, *)
  2184. #elif defined(USE_NEON)
  2185. return I32x4(MUL_I32_NEON(left.v, right.v));
  2186. #endif
  2187. #else
  2188. IMPL_SCALAR_REFERENCE_INFIX_4_LANES(left, right, I32x4, int32_t, *)
  2189. #endif
  2190. }
  2191. // TODO: Specify the behavior of truncated unsigned integer overflow and add it to the tests.
  2192. inline U32x4 operator+(const U32x4& left, const U32x4& right) {
  2193. #if defined(USE_BASIC_SIMD)
  2194. return U32x4(ADD_U32_SIMD(left.v, right.v));
  2195. #else
  2196. IMPL_SCALAR_REFERENCE_INFIX_4_LANES(left, right, U32x4, uint32_t, +)
  2197. #endif
  2198. }
  2199. inline U32x4 operator-(const U32x4& left, const U32x4& right) {
  2200. #if defined(USE_BASIC_SIMD)
  2201. return U32x4(SUB_U32_SIMD(left.v, right.v));
  2202. #else
  2203. IMPL_SCALAR_REFERENCE_INFIX_4_LANES(left, right, U32x4, uint32_t, -)
  2204. #endif
  2205. }
  2206. inline U32x4 operator*(const U32x4& left, const U32x4& right) {
  2207. #if defined(USE_BASIC_SIMD)
  2208. #if defined(USE_SSE2)
  2209. // TODO: Use AVX2 for 32-bit integer multiplication when available.
  2210. IMPL_SCALAR_FALLBACK_INFIX_4_LANES(left, right, U32x4, uint32_t, *)
  2211. #else // NEON
  2212. return U32x4(MUL_U32_NEON(left.v, right.v));
  2213. #endif
  2214. #else
  2215. IMPL_SCALAR_REFERENCE_INFIX_4_LANES(left, right, U32x4, uint32_t, *)
  2216. #endif
  2217. }
  2218. // Bitwise and
  2219. inline U16x8 operator&(const U16x8& left, const U16x8& right) {
  2220. #if defined(USE_SSE2)
  2221. return U16x8(_mm_and_si128(left.v, right.v));
  2222. #elif defined(USE_NEON)
  2223. return U16x8(vandq_u16(left.v, right.v));
  2224. #else
  2225. IMPL_SCALAR_REFERENCE_INFIX_8_LANES(left, right, U16x8, uint16_t, &)
  2226. #endif
  2227. }
  2228. // Bitwise or
  2229. inline U16x8 operator|(const U16x8& left, const U16x8& right) {
  2230. #if defined(USE_SSE2)
  2231. return U16x8(_mm_or_si128(left.v, right.v));
  2232. #elif defined(USE_NEON)
  2233. return U16x8(vorrq_u16(left.v, right.v));
  2234. #else
  2235. IMPL_SCALAR_REFERENCE_INFIX_8_LANES(left, right, U16x8, uint16_t, |)
  2236. #endif
  2237. }
  2238. // Bitwise xor
  2239. inline U16x8 operator^(const U16x8& left, const U16x8& right) {
  2240. #if defined(USE_SSE2)
  2241. return U16x8(_mm_xor_si128(left.v, right.v));
  2242. #elif defined(USE_NEON)
  2243. return U16x8(veorq_u16(left.v, right.v));
  2244. #else
  2245. IMPL_SCALAR_REFERENCE_INFIX_8_LANES(left, right, U16x8, uint16_t, ^)
  2246. #endif
  2247. }
  2248. // Bitwise negation
  2249. inline U16x8 operator~(const U16x8& value) {
  2250. #if defined(USE_NEON)
  2251. return U16x8(vmvnq_u16(value.v));
  2252. #elif defined(USE_BASIC_SIMD)
  2253. // Fall back on xor against all ones.
  2254. return value ^ U16x8(~uint16_t(0));
  2255. #else
  2256. // TODO: Generate automatically using a macro.
2257. return U16x8(~value.scalars[0], ~value.scalars[1], ~value.scalars[2], ~value.scalars[3], ~value.scalars[4], ~value.scalars[5], ~value.scalars[6], ~value.scalars[7]);
  2258. #endif
  2259. }
  2260. inline U16x8 operator<<(const U16x8& left, const U16x8 &bitOffsets) {
  2261. #ifdef SAFE_POINTER_CHECKS
  2262. if(!allLanesLesser(bitOffsets, U16x8(16u))) {
  2263. throwError(U"Tried to shift ", left, U" by bit offsets ", bitOffsets, U", which is non-deterministic from being out of bound 0..15!\n");
  2264. }
  2265. #endif
  2266. #if defined(USE_SSE2)
  2267. IMPL_SCALAR_FALLBACK_INFIX_8_LANES(left, bitOffsets, U16x8, uint16_t, <<)
  2268. #elif defined(USE_NEON)
  2269. return U16x8(vshlq_u16(left.v, vreinterpretq_s16_u16(bitOffsets.v)));
  2270. #else
  2271. IMPL_SCALAR_REFERENCE_INFIX_8_LANES(left, bitOffsets, U16x8, uint16_t, <<)
  2272. #endif
  2273. }
  2274. inline U16x8 operator>>(const U16x8& left, const U16x8 &bitOffsets) {
  2275. #ifdef SAFE_POINTER_CHECKS
  2276. if(!allLanesLesser(bitOffsets, U16x8(16u))) {
  2277. throwError(U"Tried to shift ", left, U" by bit offsets ", bitOffsets, U", which is non-deterministic from being out of bound 0..15!\n");
  2278. }
  2279. #endif
  2280. #if defined(USE_SSE2)
  2281. IMPL_SCALAR_FALLBACK_INFIX_8_LANES(left, bitOffsets, U16x8, uint16_t, >>)
  2282. #elif defined(USE_NEON)
  2283. //return U16x8(vshrq_u16(left.v, vreinterpretq_s16_u16(bitOffsets.v)));
  2284. return U16x8(vshlq_u16(left.v, vnegq_s16(vreinterpretq_s16_u16(bitOffsets.v))));
  2285. #else
  2286. IMPL_SCALAR_REFERENCE_INFIX_8_LANES(left, bitOffsets, U16x8, uint16_t, >>)
  2287. #endif
  2288. }
  2289. inline U16x8 operator<<(const U16x8& left, const uint32_t &bitOffset) {
  2290. #if defined(USE_SSE2)
  2291. #ifdef SAFE_POINTER_CHECKS
  2292. if(bitOffset >= 16u) {
2293. throwError(U"Tried to shift ", left, U" by bit offset ", bitOffset, U", which is non-deterministic from being out of bound 0..15!\n");
  2294. }
  2295. #endif
  2296. // Write the content to aligned stack memory.
  2297. ALIGN16 __m128i values;
  2298. left.writeAlignedUnsafe((uint16_t*)&values);
  2299. // Cast a pointer to the data into two 64-bit elements.
  2300. uint64_t *largeLanes = (uint64_t*)&values;
  2301. // Shift the 128 bits as two 64-bit values.
  2302. largeLanes[0] = largeLanes[0] << bitOffset;
  2303. largeLanes[1] = largeLanes[1] << bitOffset;
  2304. // Create a mask.
  2305. U16x8 mask = U16x8(uint16_t(~0u) << bitOffset);
  2306. // Return the shifted 64-bit elements masked to remove spill across lanes.
  2307. return U16x8::readAlignedUnsafe((uint16_t*)&values) & mask;
  2308. #else
  2309. return left << U16x8(bitOffset);
  2310. #endif
  2311. }
  2312. inline U16x8 operator>>(const U16x8& left, const uint32_t &bitOffset) {
  2313. #if defined(USE_SSE2)
  2314. #ifdef SAFE_POINTER_CHECKS
  2315. if(bitOffset >= 16u) {
2316. throwError(U"Tried to shift ", left, U" by bit offset ", bitOffset, U", which is non-deterministic from being out of bound 0..15!\n");
  2317. }
  2318. #endif
  2319. // Write the content to aligned stack memory.
  2320. ALIGN16 __m128i values;
  2321. left.writeAlignedUnsafe((uint16_t*)&values);
  2322. // Cast a pointer to the data into two 64-bit elements.
  2323. uint64_t *largeLanes = (uint64_t*)&values;
  2324. // Shift the 128 bits as two 64-bit values.
  2325. largeLanes[0] = largeLanes[0] >> bitOffset;
  2326. largeLanes[1] = largeLanes[1] >> bitOffset;
  2327. // Create a mask.
  2328. U16x8 mask = U16x8(uint16_t(~0u) >> bitOffset);
  2329. // Return the shifted 64-bit elements masked to remove spill across lanes.
  2330. return U16x8::readAlignedUnsafe((uint16_t*)&values) & mask;
  2331. #else
  2332. return left >> U16x8(bitOffset);
  2333. #endif
  2334. }
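// Worked example of the shift-and-mask trick above (illustrative only): when the two 64-bit halves
// are shifted left by 4 bits, the top 4 bits of each 16-bit lane spill into the bottom 4 bits of the
// lane above it. Masking every lane with uint16_t(~0u) << 4 = 0xFFF0 clears those spilled bits, so the
// result equals shifting each 16-bit lane independently. The right shift mirrors this with the mask
// uint16_t(~0u) >> 4 = 0x0FFF, which clears the bits that spilled down from the lane above.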
  2335. // bitOffset must be an immediate constant, so a template argument is used.
  2336. template <uint32_t bitOffset>
  2337. inline U16x8 bitShiftLeftImmediate(const U16x8& left) {
  2338. static_assert(bitOffset < 16u, "Immediate left shift of 16-bit values may not shift more than 15 bits!");
  2339. #if defined(USE_SSE2)
  2340. return U16x8(_mm_slli_epi16(left.v, bitOffset));
  2341. #elif defined(USE_NEON)
2342. return U16x8(vshlq_u16(left.v, vdupq_n_s16(bitOffset)));
  2343. #else
  2344. U16x8 bitOffsets = U16x8(bitOffset);
  2345. IMPL_SCALAR_REFERENCE_INFIX_8_LANES(left, bitOffsets, U16x8, uint16_t, <<)
  2346. #endif
  2347. }
  2348. // bitOffset must be an immediate constant.
  2349. template <uint32_t bitOffset>
  2350. inline U16x8 bitShiftRightImmediate(const U16x8& left) {
  2351. static_assert(bitOffset < 16u, "Immediate right shift of 16-bit values may not shift more than 15 bits!");
  2352. #if defined(USE_SSE2)
  2353. return U16x8(_mm_srli_epi16(left.v, bitOffset));
  2354. #elif defined(USE_NEON)
  2355. //return U16x8(vshrq_u16(left.v, vdupq_n_s16(bitOffset)));
  2356. return U16x8(vshlq_u16(left.v, vdupq_n_s16(-(int32_t)bitOffset)));
  2357. #else
  2358. U16x8 bitOffsets = U16x8(bitOffset);
  2359. IMPL_SCALAR_REFERENCE_INFIX_8_LANES(left, bitOffsets, U16x8, uint16_t, >>)
  2360. #endif
  2361. }
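// Usage sketch (illustrative only; pixels is a hypothetical U16x8 value): when the shift amount is a
// compile-time constant, the immediate variants let SSE2 use its immediate shift instructions directly
// instead of building an offset vector.
//   U16x8 timesEight = bitShiftLeftImmediate<3>(pixels);  // Multiply every lane by 8.
//   U16x8 quarter    = bitShiftRightImmediate<2>(pixels); // Divide every lane by 4.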
  2362. // Bitwise and
  2363. inline U32x4 operator&(const U32x4& left, const U32x4& right) {
  2364. #if defined(USE_BASIC_SIMD)
  2365. return U32x4(BITWISE_AND_U32_SIMD(left.v, right.v));
  2366. #else
  2367. IMPL_SCALAR_REFERENCE_INFIX_4_LANES(left, right, U32x4, uint32_t, &)
  2368. #endif
  2369. }
  2370. // Bitwise or
  2371. inline U32x4 operator|(const U32x4& left, const U32x4& right) {
  2372. #if defined(USE_BASIC_SIMD)
  2373. return U32x4(BITWISE_OR_U32_SIMD(left.v, right.v));
  2374. #else
  2375. IMPL_SCALAR_REFERENCE_INFIX_4_LANES(left, right, U32x4, uint32_t, |)
  2376. #endif
  2377. }
  2378. // Bitwise xor
  2379. inline U32x4 operator^(const U32x4& left, const U32x4& right) {
  2380. #if defined(USE_BASIC_SIMD)
  2381. return U32x4(BITWISE_XOR_U32_SIMD(left.v, right.v));
  2382. #else
  2383. IMPL_SCALAR_REFERENCE_INFIX_4_LANES(left, right, U32x4, uint32_t, ^)
  2384. #endif
  2385. }
  2386. // Bitwise negation
  2387. inline U32x4 operator~(const U32x4& value) {
  2388. #if defined(USE_NEON)
  2389. return U32x4(vmvnq_u32(value.v));
  2390. #elif defined(USE_BASIC_SIMD)
  2391. // Fall back on xor against all ones.
  2392. return value ^ U32x4(~uint32_t(0));
  2393. #else
  2394. // TODO: Generate automatically using a macro.
  2395. return U32x4(~value.scalars[0], ~value.scalars[1], ~value.scalars[2], ~value.scalars[3]);
  2396. #endif
  2397. }
  2398. inline U32x4 operator<<(const U32x4& left, const U32x4 &bitOffsets) {
  2399. #ifdef SAFE_POINTER_CHECKS
  2400. if(!allLanesLesser(bitOffsets, U32x4(32u))) {
  2401. throwError(U"Tried to shift ", left, U" by bit offsets ", bitOffsets, U", which is non-deterministic from being out of bound 0..31!\n");
  2402. }
  2403. #endif
  2404. #if defined(USE_SSE2)
  2405. IMPL_SCALAR_FALLBACK_INFIX_4_LANES(left, bitOffsets, U32x4, uint32_t, <<)
  2406. #elif defined(USE_NEON)
  2407. return U32x4(vshlq_u32(left.v, vreinterpretq_s32_u32(bitOffsets.v)));
  2408. #else
  2409. IMPL_SCALAR_REFERENCE_INFIX_4_LANES(left, bitOffsets, U32x4, uint32_t, <<)
  2410. #endif
  2411. }
  2412. inline U32x4 operator>>(const U32x4& left, const U32x4 &bitOffsets) {
  2413. #ifdef SAFE_POINTER_CHECKS
  2414. if(!allLanesLesser(bitOffsets, U32x4(32u))) {
  2415. throwError(U"Tried to shift ", left, U" by bit offsets ", bitOffsets, U", which is non-deterministic from being out of bound 0..31!\n");
  2416. }
  2417. #endif
  2418. #if defined(USE_SSE2)
  2419. IMPL_SCALAR_FALLBACK_INFIX_4_LANES(left, bitOffsets, U32x4, uint32_t, >>)
  2420. #elif defined(USE_NEON)
2421. // NEON has no vshrq_u32 for run-time shift counts; only the immediate form vshrq_n_u32 exists, so shift left by negated offsets instead.
  2422. //return U32x4(vshrq_u32(left.v, vreinterpretq_s32_u32(bitOffsets.v)));
  2423. return U32x4(vshlq_u32(left.v, vnegq_s32(vreinterpretq_s32_u32(bitOffsets.v))));
  2424. #else
  2425. IMPL_SCALAR_REFERENCE_INFIX_4_LANES(left, bitOffsets, U32x4, uint32_t, >>)
  2426. #endif
  2427. }
  2428. inline U32x4 operator<<(const U32x4& left, const uint32_t &bitOffset) {
  2429. #if defined(USE_SSE2)
  2430. #ifdef SAFE_POINTER_CHECKS
  2431. if(bitOffset >= 32u) {
  2432. throwError(U"Tried to shift ", left, U" by bit offset ", bitOffset, U", which is non-deterministic from being out of bound 0..31!\n");
  2433. }
  2434. #endif
  2435. // Write the content to aligned stack memory.
  2436. ALIGN16 __m128i values;
  2437. left.writeAlignedUnsafe((uint32_t*)&values);
  2438. // Cast a pointer to the data into two 64-bit elements.
  2439. uint64_t *largeLanes = (uint64_t*)&values;
  2440. // Shift the 128 bits as two 64-bit values.
  2441. largeLanes[0] = largeLanes[0] << bitOffset;
  2442. largeLanes[1] = largeLanes[1] << bitOffset;
  2443. // Create a mask.
  2444. U32x4 mask = U32x4(uint32_t(~0u) << bitOffset);
  2445. // Return the shifted 64-bit elements masked to remove spill across lanes.
  2446. return U32x4::readAlignedUnsafe((uint32_t*)&values) & mask;
  2447. #else
  2448. return left << U32x4(bitOffset);
  2449. #endif
  2450. }
  2451. inline U32x4 operator>>(const U32x4& left, const uint32_t &bitOffset) {
  2452. #if defined(USE_SSE2)
  2453. #ifdef SAFE_POINTER_CHECKS
  2454. if(bitOffset >= 32u) {
  2455. throwError(U"Tried to shift ", left, U" by bit offset ", bitOffset, U", which is non-deterministic from being out of bound 0..31!\n");
  2456. }
  2457. #endif
  2458. // Write the content to aligned stack memory.
  2459. ALIGN16 __m128i values;
  2460. left.writeAlignedUnsafe((uint32_t*)&values);
  2461. // Cast a pointer to the data into two 64-bit elements.
  2462. uint64_t *largeLanes = (uint64_t*)&values;
  2463. // Shift the 128 bits as two 64-bit values.
  2464. largeLanes[0] = largeLanes[0] >> bitOffset;
  2465. largeLanes[1] = largeLanes[1] >> bitOffset;
  2466. // Create a mask.
  2467. U32x4 mask = U32x4(uint32_t(~0u) >> bitOffset);
  2468. // Return the shifted 64-bit elements masked to remove spill across lanes.
  2469. return U32x4::readAlignedUnsafe((uint32_t*)&values) & mask;
  2470. #else
  2471. return left >> U32x4(bitOffset);
  2472. #endif
  2473. }
  2474. // bitOffset must be an immediate constant, so a template argument is used.
  2475. template <uint32_t bitOffset>
  2476. inline U32x4 bitShiftLeftImmediate(const U32x4& left) {
  2477. static_assert(bitOffset < 32u, "Immediate left shift of 32-bit values may not shift more than 31 bits!");
  2478. #if defined(USE_SSE2)
  2479. return U32x4(_mm_slli_epi32(left.v, bitOffset));
  2480. #elif defined(USE_NEON)
  2481. return U32x4(vshlq_u32(left.v, LOAD_SCALAR_I32_SIMD(bitOffset)));
  2482. #else
  2483. U32x4 bitOffsets = U32x4(bitOffset);
  2484. IMPL_SCALAR_REFERENCE_INFIX_4_LANES(left, bitOffsets, U32x4, uint32_t, <<)
  2485. #endif
  2486. }
  2487. // bitOffset must be an immediate constant.
  2488. template <uint32_t bitOffset>
  2489. inline U32x4 bitShiftRightImmediate(const U32x4& left) {
  2490. static_assert(bitOffset < 32u, "Immediate right shift of 32-bit values may not shift more than 31 bits!");
  2491. #if defined(USE_SSE2)
  2492. return U32x4(_mm_srli_epi32(left.v, bitOffset));
  2493. #elif defined(USE_NEON)
2494. // NEON has no vshrq_u32 for run-time shift counts; shift left by the negated offset instead.
  2495. //return U32x4(vshrq_u32(left.v, LOAD_SCALAR_I32_SIMD(bitOffset)));
  2496. return U32x4(vshlq_u32(left.v, LOAD_SCALAR_I32_SIMD(-(int32_t)bitOffset)));
  2497. #else
  2498. U32x4 bitOffsets = U32x4(bitOffset);
  2499. IMPL_SCALAR_REFERENCE_INFIX_4_LANES(left, bitOffsets, U32x4, uint32_t, >>)
  2500. #endif
  2501. }
  2502. inline U16x8 operator+(const U16x8& left, const U16x8& right) {
  2503. #if defined(USE_BASIC_SIMD)
  2504. return U16x8(ADD_U16_SIMD(left.v, right.v));
  2505. #else
  2506. IMPL_SCALAR_REFERENCE_INFIX_8_LANES(left, right, U16x8, uint16_t, +)
  2507. #endif
  2508. }
  2509. inline U16x8 operator-(const U16x8& left, const U16x8& right) {
  2510. #if defined(USE_BASIC_SIMD)
  2511. return U16x8(SUB_U16_SIMD(left.v, right.v));
  2512. #else
  2513. IMPL_SCALAR_REFERENCE_INFIX_8_LANES(left, right, U16x8, uint16_t, -)
  2514. #endif
  2515. }
  2516. inline U16x8 operator*(const U16x8& left, const U16x8& right) {
  2517. #if defined(USE_BASIC_SIMD)
  2518. return U16x8(MUL_U16_SIMD(left.v, right.v));
  2519. #else
  2520. IMPL_SCALAR_REFERENCE_INFIX_8_LANES(left, right, U16x8, uint16_t, *)
  2521. #endif
  2522. }
  2523. inline U8x16 operator+(const U8x16& left, const U8x16& right) {
  2524. #if defined(USE_BASIC_SIMD)
  2525. return U8x16(ADD_U8_SIMD(left.v, right.v));
  2526. #else
  2527. IMPL_SCALAR_REFERENCE_INFIX_16_LANES(left, right, U8x16, uint8_t, +)
  2528. #endif
  2529. }
  2530. inline U8x16 operator-(const U8x16& left, const U8x16& right) {
  2531. #if defined(USE_BASIC_SIMD)
  2532. return U8x16(SUB_U8_SIMD(left.v, right.v));
  2533. #else
  2534. IMPL_SCALAR_REFERENCE_INFIX_16_LANES(left, right, U8x16, uint8_t, -)
  2535. #endif
  2536. }
  2537. inline uint8_t impl_limit0(int32_t x) { return x < 0 ? 0 : x; }
  2538. inline uint8_t impl_limit255(uint32_t x) { return x > 255 ? 255 : x; }
  2539. inline U8x16 saturatedAddition(const U8x16& left, const U8x16& right) {
  2540. #if defined(USE_BASIC_SIMD)
  2541. return U8x16(ADD_SAT_U8_SIMD(left.v, right.v));
  2542. #else
  2543. return U8x16(
  2544. impl_limit255((uint32_t)left.scalars[0] + (uint32_t)right.scalars[0]),
  2545. impl_limit255((uint32_t)left.scalars[1] + (uint32_t)right.scalars[1]),
  2546. impl_limit255((uint32_t)left.scalars[2] + (uint32_t)right.scalars[2]),
  2547. impl_limit255((uint32_t)left.scalars[3] + (uint32_t)right.scalars[3]),
  2548. impl_limit255((uint32_t)left.scalars[4] + (uint32_t)right.scalars[4]),
  2549. impl_limit255((uint32_t)left.scalars[5] + (uint32_t)right.scalars[5]),
  2550. impl_limit255((uint32_t)left.scalars[6] + (uint32_t)right.scalars[6]),
  2551. impl_limit255((uint32_t)left.scalars[7] + (uint32_t)right.scalars[7]),
  2552. impl_limit255((uint32_t)left.scalars[8] + (uint32_t)right.scalars[8]),
  2553. impl_limit255((uint32_t)left.scalars[9] + (uint32_t)right.scalars[9]),
  2554. impl_limit255((uint32_t)left.scalars[10] + (uint32_t)right.scalars[10]),
  2555. impl_limit255((uint32_t)left.scalars[11] + (uint32_t)right.scalars[11]),
  2556. impl_limit255((uint32_t)left.scalars[12] + (uint32_t)right.scalars[12]),
  2557. impl_limit255((uint32_t)left.scalars[13] + (uint32_t)right.scalars[13]),
  2558. impl_limit255((uint32_t)left.scalars[14] + (uint32_t)right.scalars[14]),
  2559. impl_limit255((uint32_t)left.scalars[15] + (uint32_t)right.scalars[15])
  2560. );
  2561. #endif
  2562. }
  2563. inline U8x16 saturatedSubtraction(const U8x16& left, const U8x16& right) {
  2564. #if defined(USE_BASIC_SIMD)
  2565. return U8x16(SUB_SAT_U8_SIMD(left.v, right.v));
  2566. #else
  2567. return U8x16(
  2568. impl_limit0((int32_t)left.scalars[0] - (int32_t)right.scalars[0]),
  2569. impl_limit0((int32_t)left.scalars[1] - (int32_t)right.scalars[1]),
  2570. impl_limit0((int32_t)left.scalars[2] - (int32_t)right.scalars[2]),
  2571. impl_limit0((int32_t)left.scalars[3] - (int32_t)right.scalars[3]),
  2572. impl_limit0((int32_t)left.scalars[4] - (int32_t)right.scalars[4]),
  2573. impl_limit0((int32_t)left.scalars[5] - (int32_t)right.scalars[5]),
  2574. impl_limit0((int32_t)left.scalars[6] - (int32_t)right.scalars[6]),
  2575. impl_limit0((int32_t)left.scalars[7] - (int32_t)right.scalars[7]),
  2576. impl_limit0((int32_t)left.scalars[8] - (int32_t)right.scalars[8]),
  2577. impl_limit0((int32_t)left.scalars[9] - (int32_t)right.scalars[9]),
  2578. impl_limit0((int32_t)left.scalars[10] - (int32_t)right.scalars[10]),
  2579. impl_limit0((int32_t)left.scalars[11] - (int32_t)right.scalars[11]),
  2580. impl_limit0((int32_t)left.scalars[12] - (int32_t)right.scalars[12]),
  2581. impl_limit0((int32_t)left.scalars[13] - (int32_t)right.scalars[13]),
  2582. impl_limit0((int32_t)left.scalars[14] - (int32_t)right.scalars[14]),
  2583. impl_limit0((int32_t)left.scalars[15] - (int32_t)right.scalars[15])
  2584. );
  2585. #endif
  2586. }
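// Usage sketch (illustrative only; pixels is a hypothetical U8x16 value and the single-value
// constructor is assumed to broadcast to all lanes like for the other vector types): saturated
// arithmetic clamps each 8-bit lane to 0..255 instead of wrapping around.
//   U8x16 brighter = saturatedAddition(pixels, U8x16(64));    // 250 + 64 clamps to 255.
//   U8x16 darker   = saturatedSubtraction(pixels, U8x16(64)); // 10 - 64 clamps to 0.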
  2587. inline I32x4 truncateToI32(const F32x4& vector) {
  2588. #if defined(USE_BASIC_SIMD)
  2589. return I32x4(F32_TO_I32_SIMD(vector.v));
  2590. #else
  2591. return I32x4((int32_t)vector.scalars[0], (int32_t)vector.scalars[1], (int32_t)vector.scalars[2], (int32_t)vector.scalars[3]);
  2592. #endif
  2593. }
  2594. inline U32x4 truncateToU32(const F32x4& vector) {
  2595. #if defined(USE_BASIC_SIMD)
  2596. return U32x4(F32_TO_U32_SIMD(vector.v));
  2597. #else
  2598. return U32x4((uint32_t)vector.scalars[0], (uint32_t)vector.scalars[1], (uint32_t)vector.scalars[2], (uint32_t)vector.scalars[3]);
  2599. #endif
  2600. }
  2601. inline F32x4 floatFromI32(const I32x4& vector) {
  2602. #if defined(USE_BASIC_SIMD)
  2603. return F32x4(I32_TO_F32_SIMD(vector.v));
  2604. #else
  2605. return F32x4((float)vector.scalars[0], (float)vector.scalars[1], (float)vector.scalars[2], (float)vector.scalars[3]);
  2606. #endif
  2607. }
  2608. inline F32x4 floatFromU32(const U32x4& vector) {
  2609. #if defined(USE_BASIC_SIMD)
  2610. return F32x4(U32_TO_F32_SIMD(vector.v));
  2611. #else
  2612. return F32x4((float)vector.scalars[0], (float)vector.scalars[1], (float)vector.scalars[2], (float)vector.scalars[3]);
  2613. #endif
  2614. }
  2615. inline I32x4 I32FromU32(const U32x4& vector) {
  2616. #if defined(USE_BASIC_SIMD)
  2617. return I32x4(REINTERPRET_U32_TO_I32_SIMD(vector.v));
  2618. #else
  2619. return I32x4((int32_t)vector.scalars[0], (int32_t)vector.scalars[1], (int32_t)vector.scalars[2], (int32_t)vector.scalars[3]);
  2620. #endif
  2621. }
  2622. inline U32x4 U32FromI32(const I32x4& vector) {
  2623. #if defined(USE_BASIC_SIMD)
  2624. return U32x4(REINTERPRET_I32_TO_U32_SIMD(vector.v));
  2625. #else
  2626. return U32x4((uint32_t)vector.scalars[0], (uint32_t)vector.scalars[1], (uint32_t)vector.scalars[2], (uint32_t)vector.scalars[3]);
  2627. #endif
  2628. }
  2629. // Warning! Behavior depends on endianness.
  2630. inline U8x16 reinterpret_U8FromU32(const U32x4& vector) {
  2631. #if defined(USE_BASIC_SIMD)
  2632. return U8x16(REINTERPRET_U32_TO_U8_SIMD(vector.v));
  2633. #else
  2634. const uint8_t *source = (const uint8_t*)vector.scalars;
  2635. return U8x16(
  2636. source[0], source[1], source[2], source[3], source[4], source[5], source[6], source[7],
  2637. source[8], source[9], source[10], source[11], source[12], source[13], source[14], source[15]
  2638. );
  2639. #endif
  2640. }
  2641. // Warning! Behavior depends on endianness.
  2642. inline U32x4 reinterpret_U32FromU8(const U8x16& vector) {
  2643. #if defined(USE_BASIC_SIMD)
  2644. return U32x4(REINTERPRET_U8_TO_U32_SIMD(vector.v));
  2645. #else
  2646. const uint32_t *source = (const uint32_t*)vector.scalars;
  2647. return U32x4(source[0], source[1], source[2], source[3]);
  2648. #endif
  2649. }
  2650. // Warning! Behavior depends on endianness.
  2651. inline U16x8 reinterpret_U16FromU32(const U32x4& vector) {
  2652. #if defined(USE_BASIC_SIMD)
  2653. return U16x8(REINTERPRET_U32_TO_U16_SIMD(vector.v));
  2654. #else
  2655. const uint16_t *source = (const uint16_t*)vector.scalars;
  2656. return U16x8(
  2657. source[0], source[1], source[2], source[3], source[4], source[5], source[6], source[7]
  2658. );
  2659. #endif
  2660. }
  2661. // Warning! Behavior depends on endianness.
  2662. inline U32x4 reinterpret_U32FromU16(const U16x8& vector) {
  2663. #if defined(USE_BASIC_SIMD)
  2664. return U32x4(REINTERPRET_U16_TO_U32_SIMD(vector.v));
  2665. #else
  2666. const uint32_t *source = (const uint32_t*)vector.scalars;
  2667. return U32x4(source[0], source[1], source[2], source[3]);
  2668. #endif
  2669. }
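// Worked example of the endianness warning above (illustrative only):
//   U32x4 packed(0x04030201u, 0x08070605u, 0x0C0B0A09u, 0x100F0E0Du);
//   U8x16 bytes = reinterpret_U8FromU32(packed);
//   // Little-endian machines: bytes = 1, 2, 3, 4, 5, 6, 7, 8, ...
//   // Big-endian machines:    bytes = 4, 3, 2, 1, 8, 7, 6, 5, ...
// Code using these reinterpret casts must either assume one byte order or handle both.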
  2670. // Unpacking to larger integers
  2671. inline U32x4 lowerToU32(const U16x8& vector) {
  2672. #if defined(USE_BASIC_SIMD)
  2673. return U32x4(U16_LOW_TO_U32_SIMD(vector.v));
  2674. #else
  2675. return U32x4(vector.scalars[0], vector.scalars[1], vector.scalars[2], vector.scalars[3]);
  2676. #endif
  2677. }
  2678. inline U32x4 higherToU32(const U16x8& vector) {
  2679. #if defined(USE_BASIC_SIMD)
  2680. return U32x4(U16_HIGH_TO_U32_SIMD(vector.v));
  2681. #else
  2682. return U32x4(vector.scalars[4], vector.scalars[5], vector.scalars[6], vector.scalars[7]);
  2683. #endif
  2684. }
  2685. inline U16x8 lowerToU16(const U8x16& vector) {
  2686. #if defined(USE_BASIC_SIMD)
  2687. return U16x8(U8_LOW_TO_U16_SIMD(vector.v));
  2688. #else
  2689. return U16x8(
  2690. vector.scalars[0], vector.scalars[1], vector.scalars[2], vector.scalars[3],
  2691. vector.scalars[4], vector.scalars[5], vector.scalars[6], vector.scalars[7]
  2692. );
  2693. #endif
  2694. }
  2695. inline U16x8 higherToU16(const U8x16& vector) {
  2696. #if defined(USE_BASIC_SIMD)
  2697. return U16x8(U8_HIGH_TO_U16_SIMD(vector.v));
  2698. #else
  2699. return U16x8(
  2700. vector.scalars[8], vector.scalars[9], vector.scalars[10], vector.scalars[11],
  2701. vector.scalars[12], vector.scalars[13], vector.scalars[14], vector.scalars[15]
  2702. );
  2703. #endif
  2704. }
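// Usage sketch (illustrative only; a and b are hypothetical U8x16 inputs): 8-bit lanes are typically
// widened to 16 bits before arithmetic that could overflow, processed as two halves, and packed back
// by the caller.
//   U16x8 lowSum   = lowerToU16(a)  + lowerToU16(b);
//   U16x8 highSum  = higherToU16(a) + higherToU16(b);
//   U16x8 lowMean  = bitShiftRightImmediate<1>(lowSum);  // Average of the low 8 lanes.
//   U16x8 highMean = bitShiftRightImmediate<1>(highSum); // Average of the high 8 lanes.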
  2705. // Unary negation for convenience and code readability.
2706. // Before using unary negation, always check whether:
2707. // * An addition can be turned into a subtraction:
  2708. // x = -a + b
  2709. // x = b - a
2710. // * A multiplying constant or scalar can be negated instead:
  2711. // x = -b * 2
  2712. // x = b * -2
  2713. inline F32x4 operator-(const F32x4& value) {
  2714. #if defined(USE_BASIC_SIMD)
  2715. return F32x4(0.0f) - value;
  2716. #else
  2717. return F32x4(-value.scalars[0], -value.scalars[1], -value.scalars[2], -value.scalars[3]);
  2718. #endif
  2719. }
  2720. inline I32x4 operator-(const I32x4& value) {
  2721. #if defined(USE_BASIC_SIMD)
  2722. return I32x4(0) - value;
  2723. #else
  2724. return I32x4(-value.scalars[0], -value.scalars[1], -value.scalars[2], -value.scalars[3]);
  2725. #endif
  2726. }
  2727. // Helper macros for generating the vector extract functions.
  2728. // Having one function for each type and offset makes sure that the compiler gets an immediate integer within the valid range.
  2729. #if defined(USE_BASIC_SIMD)
  2730. #if defined(USE_SSE2)
  2731. #if defined(USE_SSSE3)
  2732. #define _MM_ALIGNR_EPI8(A, B, OFFSET) _mm_alignr_epi8(A, B, OFFSET)
  2733. #else
  2734. // If SSSE3 is not used, emulate it using stack memory and unaligned reading of data.
  2735. static inline SIMD_U8x16 _MM_ALIGNR_EPI8(SIMD_U8x16 a, SIMD_U8x16 b, int offset) {
  2736. ALIGN16 uint8_t vectorBuffer[32];
  2737. #ifdef SAFE_POINTER_CHECKS
  2738. if (uintptr_t((void*)vectorBuffer) & 15u) { throwError(U"Unaligned stack memory detected in 128-bit VECTOR_EXTRACT_GENERATOR!\n"); }
  2739. #endif
  2740. _mm_store_si128((SIMD_U8x16*)(vectorBuffer), b);
  2741. _mm_store_si128((SIMD_U8x16*)(vectorBuffer + 16), a);
  2742. ALIGN16 SIMD_U8x16 result = _mm_loadu_si128((SIMD_U8x16*)(vectorBuffer + offset));
  2743. return result;
  2744. }
  2745. #endif
  2746. #define VECTOR_EXTRACT_GENERATOR_U8(OFFSET, FALLBACK_RESULT) return U8x16(_MM_ALIGNR_EPI8(b.v, a.v, OFFSET));
  2747. #define VECTOR_EXTRACT_GENERATOR_U16(OFFSET, FALLBACK_RESULT) return U16x8(_MM_ALIGNR_EPI8(b.v, a.v, OFFSET * 2));
  2748. #define VECTOR_EXTRACT_GENERATOR_U32(OFFSET, FALLBACK_RESULT) return U32x4(_MM_ALIGNR_EPI8(b.v, a.v, OFFSET * 4));
  2749. #define VECTOR_EXTRACT_GENERATOR_I32(OFFSET, FALLBACK_RESULT) return I32x4(_MM_ALIGNR_EPI8(b.v, a.v, OFFSET * 4));
  2750. #define VECTOR_EXTRACT_GENERATOR_F32(OFFSET, FALLBACK_RESULT) return F32x4(SIMD_F32x4(_MM_ALIGNR_EPI8(SIMD_U32x4(b.v), SIMD_U32x4(a.v), OFFSET * 4)));
  2751. #elif defined(USE_NEON)
  2752. #define VECTOR_EXTRACT_GENERATOR_U8(OFFSET, FALLBACK_RESULT) return U8x16(vextq_u8(a.v, b.v, OFFSET));
  2753. #define VECTOR_EXTRACT_GENERATOR_U16(OFFSET, FALLBACK_RESULT) return U16x8(vextq_u16(a.v, b.v, OFFSET));
  2754. #define VECTOR_EXTRACT_GENERATOR_U32(OFFSET, FALLBACK_RESULT) return U32x4(vextq_u32(a.v, b.v, OFFSET));
  2755. #define VECTOR_EXTRACT_GENERATOR_I32(OFFSET, FALLBACK_RESULT) return I32x4(vextq_s32(a.v, b.v, OFFSET));
  2756. #define VECTOR_EXTRACT_GENERATOR_F32(OFFSET, FALLBACK_RESULT) return F32x4(vextq_f32(a.v, b.v, OFFSET));
  2757. #endif
  2758. #else
  2759. #define VECTOR_EXTRACT_GENERATOR_U8(OFFSET, FALLBACK_RESULT) return FALLBACK_RESULT;
  2760. #define VECTOR_EXTRACT_GENERATOR_U16(OFFSET, FALLBACK_RESULT) return FALLBACK_RESULT;
  2761. #define VECTOR_EXTRACT_GENERATOR_U32(OFFSET, FALLBACK_RESULT) return FALLBACK_RESULT;
  2762. #define VECTOR_EXTRACT_GENERATOR_I32(OFFSET, FALLBACK_RESULT) return FALLBACK_RESULT;
  2763. #define VECTOR_EXTRACT_GENERATOR_F32(OFFSET, FALLBACK_RESULT) return FALLBACK_RESULT;
  2764. #endif
2765. // Vector extraction concatenates two input vectors and reads a vector between them using an offset (see the usage sketch after the extract functions below).
  2766. // The first and last offsets that only return one of the inputs can be used for readability, because they will be inlined and removed by the compiler.
  2767. // To get elements from the right side, combine the center vector with the right vector and shift one element to the left using vectorExtract_1 for the given type.
  2768. // To get elements from the left side, combine the left vector with the center vector and shift one element to the right using vectorExtract_15 for 16 lanes, vectorExtract_7 for 8 lanes, or vectorExtract_3 for 4 lanes.
  2769. U8x16 inline vectorExtract_0(const U8x16 &a, const U8x16 &b) { return a; }
  2770. U8x16 inline vectorExtract_1(const U8x16 &a, const U8x16 &b) { VECTOR_EXTRACT_GENERATOR_U8(1, U8x16(a.scalars[1], a.scalars[2], a.scalars[3], a.scalars[4], a.scalars[5], a.scalars[6], a.scalars[7], a.scalars[8], a.scalars[9], a.scalars[10], a.scalars[11], a.scalars[12], a.scalars[13], a.scalars[14], a.scalars[15], b.scalars[0])) }
  2771. U8x16 inline vectorExtract_2(const U8x16 &a, const U8x16 &b) { VECTOR_EXTRACT_GENERATOR_U8(2, U8x16(a.scalars[2], a.scalars[3], a.scalars[4], a.scalars[5], a.scalars[6], a.scalars[7], a.scalars[8], a.scalars[9], a.scalars[10], a.scalars[11], a.scalars[12], a.scalars[13], a.scalars[14], a.scalars[15], b.scalars[0], b.scalars[1])) }
  2772. U8x16 inline vectorExtract_3(const U8x16 &a, const U8x16 &b) { VECTOR_EXTRACT_GENERATOR_U8(3, U8x16(a.scalars[3], a.scalars[4], a.scalars[5], a.scalars[6], a.scalars[7], a.scalars[8], a.scalars[9], a.scalars[10], a.scalars[11], a.scalars[12], a.scalars[13], a.scalars[14], a.scalars[15], b.scalars[0], b.scalars[1], b.scalars[2])) }
  2773. U8x16 inline vectorExtract_4(const U8x16 &a, const U8x16 &b) { VECTOR_EXTRACT_GENERATOR_U8(4, U8x16(a.scalars[4], a.scalars[5], a.scalars[6], a.scalars[7], a.scalars[8], a.scalars[9], a.scalars[10], a.scalars[11], a.scalars[12], a.scalars[13], a.scalars[14], a.scalars[15], b.scalars[0], b.scalars[1], b.scalars[2], b.scalars[3])) }
  2774. U8x16 inline vectorExtract_5(const U8x16 &a, const U8x16 &b) { VECTOR_EXTRACT_GENERATOR_U8(5, U8x16(a.scalars[5], a.scalars[6], a.scalars[7], a.scalars[8], a.scalars[9], a.scalars[10], a.scalars[11], a.scalars[12], a.scalars[13], a.scalars[14], a.scalars[15], b.scalars[0], b.scalars[1], b.scalars[2], b.scalars[3], b.scalars[4])) }
  2775. U8x16 inline vectorExtract_6(const U8x16 &a, const U8x16 &b) { VECTOR_EXTRACT_GENERATOR_U8(6, U8x16(a.scalars[6], a.scalars[7], a.scalars[8], a.scalars[9], a.scalars[10], a.scalars[11], a.scalars[12], a.scalars[13], a.scalars[14], a.scalars[15], b.scalars[0], b.scalars[1], b.scalars[2], b.scalars[3], b.scalars[4], b.scalars[5])) }
  2776. U8x16 inline vectorExtract_7(const U8x16 &a, const U8x16 &b) { VECTOR_EXTRACT_GENERATOR_U8(7, U8x16(a.scalars[7], a.scalars[8], a.scalars[9], a.scalars[10], a.scalars[11], a.scalars[12], a.scalars[13], a.scalars[14], a.scalars[15], b.scalars[0], b.scalars[1], b.scalars[2], b.scalars[3], b.scalars[4], b.scalars[5], b.scalars[6])) }
  2777. U8x16 inline vectorExtract_8(const U8x16 &a, const U8x16 &b) { VECTOR_EXTRACT_GENERATOR_U8(8, U8x16(a.scalars[8], a.scalars[9], a.scalars[10], a.scalars[11], a.scalars[12], a.scalars[13], a.scalars[14], a.scalars[15], b.scalars[0], b.scalars[1], b.scalars[2], b.scalars[3], b.scalars[4], b.scalars[5], b.scalars[6], b.scalars[7])) }
  2778. U8x16 inline vectorExtract_9(const U8x16 &a, const U8x16 &b) { VECTOR_EXTRACT_GENERATOR_U8(9, U8x16(a.scalars[9], a.scalars[10], a.scalars[11], a.scalars[12], a.scalars[13], a.scalars[14], a.scalars[15], b.scalars[0], b.scalars[1], b.scalars[2], b.scalars[3], b.scalars[4], b.scalars[5], b.scalars[6], b.scalars[7], b.scalars[8])) }
  2779. U8x16 inline vectorExtract_10(const U8x16 &a, const U8x16 &b) { VECTOR_EXTRACT_GENERATOR_U8(10, U8x16(a.scalars[10], a.scalars[11], a.scalars[12], a.scalars[13], a.scalars[14], a.scalars[15], b.scalars[0], b.scalars[1], b.scalars[2], b.scalars[3], b.scalars[4], b.scalars[5], b.scalars[6], b.scalars[7], b.scalars[8], b.scalars[9])) }
  2780. U8x16 inline vectorExtract_11(const U8x16 &a, const U8x16 &b) { VECTOR_EXTRACT_GENERATOR_U8(11, U8x16(a.scalars[11], a.scalars[12], a.scalars[13], a.scalars[14], a.scalars[15], b.scalars[0], b.scalars[1], b.scalars[2], b.scalars[3], b.scalars[4], b.scalars[5], b.scalars[6], b.scalars[7], b.scalars[8], b.scalars[9], b.scalars[10])) }
  2781. U8x16 inline vectorExtract_12(const U8x16 &a, const U8x16 &b) { VECTOR_EXTRACT_GENERATOR_U8(12, U8x16(a.scalars[12], a.scalars[13], a.scalars[14], a.scalars[15], b.scalars[0], b.scalars[1], b.scalars[2], b.scalars[3], b.scalars[4], b.scalars[5], b.scalars[6], b.scalars[7], b.scalars[8], b.scalars[9], b.scalars[10], b.scalars[11])) }
  2782. U8x16 inline vectorExtract_13(const U8x16 &a, const U8x16 &b) { VECTOR_EXTRACT_GENERATOR_U8(13, U8x16(a.scalars[13], a.scalars[14], a.scalars[15], b.scalars[0], b.scalars[1], b.scalars[2], b.scalars[3], b.scalars[4], b.scalars[5], b.scalars[6], b.scalars[7], b.scalars[8], b.scalars[9], b.scalars[10], b.scalars[11], b.scalars[12])) }
  2783. U8x16 inline vectorExtract_14(const U8x16 &a, const U8x16 &b) { VECTOR_EXTRACT_GENERATOR_U8(14, U8x16(a.scalars[14], a.scalars[15], b.scalars[0], b.scalars[1], b.scalars[2], b.scalars[3], b.scalars[4], b.scalars[5], b.scalars[6], b.scalars[7], b.scalars[8], b.scalars[9], b.scalars[10], b.scalars[11], b.scalars[12], b.scalars[13])) }
  2784. U8x16 inline vectorExtract_15(const U8x16 &a, const U8x16 &b) { VECTOR_EXTRACT_GENERATOR_U8(15, U8x16(a.scalars[15], b.scalars[0], b.scalars[1], b.scalars[2], b.scalars[3], b.scalars[4], b.scalars[5], b.scalars[6], b.scalars[7], b.scalars[8], b.scalars[9], b.scalars[10], b.scalars[11], b.scalars[12], b.scalars[13], b.scalars[14])) }
  2785. U8x16 inline vectorExtract_16(const U8x16 &a, const U8x16 &b) { return b; }
  2786. U16x8 inline vectorExtract_0(const U16x8 &a, const U16x8 &b) { return a; }
  2787. U16x8 inline vectorExtract_1(const U16x8 &a, const U16x8 &b) { VECTOR_EXTRACT_GENERATOR_U16(1, U16x8(a.scalars[1], a.scalars[2], a.scalars[3], a.scalars[4], a.scalars[5], a.scalars[6], a.scalars[7], b.scalars[0])) }
  2788. U16x8 inline vectorExtract_2(const U16x8 &a, const U16x8 &b) { VECTOR_EXTRACT_GENERATOR_U16(2, U16x8(a.scalars[2], a.scalars[3], a.scalars[4], a.scalars[5], a.scalars[6], a.scalars[7], b.scalars[0], b.scalars[1])) }
  2789. U16x8 inline vectorExtract_3(const U16x8 &a, const U16x8 &b) { VECTOR_EXTRACT_GENERATOR_U16(3, U16x8(a.scalars[3], a.scalars[4], a.scalars[5], a.scalars[6], a.scalars[7], b.scalars[0], b.scalars[1], b.scalars[2])) }
  2790. U16x8 inline vectorExtract_4(const U16x8 &a, const U16x8 &b) { VECTOR_EXTRACT_GENERATOR_U16(4, U16x8(a.scalars[4], a.scalars[5], a.scalars[6], a.scalars[7], b.scalars[0], b.scalars[1], b.scalars[2], b.scalars[3])) }
  2791. U16x8 inline vectorExtract_5(const U16x8 &a, const U16x8 &b) { VECTOR_EXTRACT_GENERATOR_U16(5, U16x8(a.scalars[5], a.scalars[6], a.scalars[7], b.scalars[0], b.scalars[1], b.scalars[2], b.scalars[3], b.scalars[4])) }
  2792. U16x8 inline vectorExtract_6(const U16x8 &a, const U16x8 &b) { VECTOR_EXTRACT_GENERATOR_U16(6, U16x8(a.scalars[6], a.scalars[7], b.scalars[0], b.scalars[1], b.scalars[2], b.scalars[3], b.scalars[4], b.scalars[5])) }
  2793. U16x8 inline vectorExtract_7(const U16x8 &a, const U16x8 &b) { VECTOR_EXTRACT_GENERATOR_U16(7, U16x8(a.scalars[7], b.scalars[0], b.scalars[1], b.scalars[2], b.scalars[3], b.scalars[4], b.scalars[5], b.scalars[6])) }
  2794. U16x8 inline vectorExtract_8(const U16x8 &a, const U16x8 &b) { return b; }
  2795. U32x4 inline vectorExtract_0(const U32x4 &a, const U32x4 &b) { return a; }
  2796. U32x4 inline vectorExtract_1(const U32x4 &a, const U32x4 &b) { VECTOR_EXTRACT_GENERATOR_U32(1, U32x4(a.scalars[1], a.scalars[2], a.scalars[3], b.scalars[0])) }
  2797. U32x4 inline vectorExtract_2(const U32x4 &a, const U32x4 &b) { VECTOR_EXTRACT_GENERATOR_U32(2, U32x4(a.scalars[2], a.scalars[3], b.scalars[0], b.scalars[1])) }
  2798. U32x4 inline vectorExtract_3(const U32x4 &a, const U32x4 &b) { VECTOR_EXTRACT_GENERATOR_U32(3, U32x4(a.scalars[3], b.scalars[0], b.scalars[1], b.scalars[2])) }
  2799. U32x4 inline vectorExtract_4(const U32x4 &a, const U32x4 &b) { return b; }
  2800. I32x4 inline vectorExtract_0(const I32x4 &a, const I32x4 &b) { return a; }
  2801. I32x4 inline vectorExtract_1(const I32x4 &a, const I32x4 &b) { VECTOR_EXTRACT_GENERATOR_I32(1, I32x4(a.scalars[1], a.scalars[2], a.scalars[3], b.scalars[0])) }
  2802. I32x4 inline vectorExtract_2(const I32x4 &a, const I32x4 &b) { VECTOR_EXTRACT_GENERATOR_I32(2, I32x4(a.scalars[2], a.scalars[3], b.scalars[0], b.scalars[1])) }
  2803. I32x4 inline vectorExtract_3(const I32x4 &a, const I32x4 &b) { VECTOR_EXTRACT_GENERATOR_I32(3, I32x4(a.scalars[3], b.scalars[0], b.scalars[1], b.scalars[2])) }
  2804. I32x4 inline vectorExtract_4(const I32x4 &a, const I32x4 &b) { return b; }
  2805. F32x4 inline vectorExtract_0(const F32x4 &a, const F32x4 &b) { return a; }
  2806. F32x4 inline vectorExtract_1(const F32x4 &a, const F32x4 &b) { VECTOR_EXTRACT_GENERATOR_F32(1, F32x4(a.scalars[1], a.scalars[2], a.scalars[3], b.scalars[0])) }
  2807. F32x4 inline vectorExtract_2(const F32x4 &a, const F32x4 &b) { VECTOR_EXTRACT_GENERATOR_F32(2, F32x4(a.scalars[2], a.scalars[3], b.scalars[0], b.scalars[1])) }
  2808. F32x4 inline vectorExtract_3(const F32x4 &a, const F32x4 &b) { VECTOR_EXTRACT_GENERATOR_F32(3, F32x4(a.scalars[3], b.scalars[0], b.scalars[1], b.scalars[2])) }
  2809. F32x4 inline vectorExtract_4(const F32x4 &a, const F32x4 &b) { return b; }
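// Usage sketch (illustrative only; leftBlock, centerBlock and rightBlock are hypothetical U8x16 values
// loaded from adjacent memory): vector extraction gives each lane its neighbors without unaligned loads.
//   U8x16 previous = vectorExtract_15(leftBlock, centerBlock); // One element to the left of center.
//   U8x16 next     = vectorExtract_1(centerBlock, rightBlock); // One element to the right of center.
//   U8x16 outer    = saturatedAddition(previous, next);        // Example of combining the neighbors.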
  2810. // Gather instructions load memory from a pointer at multiple index offsets at the same time.
  2811. // The given pointers should be aligned with 4 bytes, so that the fallback solution works on machines with strict alignment requirements.
  2812. #if defined(USE_AVX2)
  2813. #define GATHER_I32x4_AVX2(SOURCE, FOUR_OFFSETS, SCALE) _mm_i32gather_epi32((const int32_t*)(SOURCE), FOUR_OFFSETS, SCALE)
  2814. #define GATHER_U32x4_AVX2(SOURCE, FOUR_OFFSETS, SCALE) _mm_i32gather_epi32((const int32_t*)(SOURCE), FOUR_OFFSETS, SCALE)
  2815. #define GATHER_F32x4_AVX2(SOURCE, FOUR_OFFSETS, SCALE) _mm_i32gather_ps((const float*)(SOURCE), FOUR_OFFSETS, SCALE)
  2816. #endif
  2817. static inline U32x4 gather_U32(dsr::SafePointer<const uint32_t> data, const U32x4 &elementOffset) {
  2818. #ifdef SAFE_POINTER_CHECKS
  2819. ALIGN16 uint32_t elementOffsets[4];
  2820. if (uintptr_t((void*)elementOffsets) & 15u) { throwError(U"Unaligned stack memory detected in 128-bit gather_U32!\n"); }
  2821. elementOffset.writeAlignedUnsafe(elementOffsets);
  2822. data.assertInside("U32x4 gather_U32 lane 0", (data + elementOffsets[0]).getUnchecked());
  2823. data.assertInside("U32x4 gather_U32 lane 1", (data + elementOffsets[1]).getUnchecked());
  2824. data.assertInside("U32x4 gather_U32 lane 2", (data + elementOffsets[2]).getUnchecked());
  2825. data.assertInside("U32x4 gather_U32 lane 3", (data + elementOffsets[3]).getUnchecked());
  2826. #endif
  2827. #if defined(USE_AVX2)
  2828. return U32x4(GATHER_U32x4_AVX2(data.getUnsafe(), elementOffset.v, 4));
  2829. #else
  2830. #ifndef SAFE_POINTER_CHECKS
  2831. ALIGN16 uint32_t elementOffsets[4];
  2832. elementOffset.writeAlignedUnsafe(elementOffsets);
  2833. #endif
  2834. return U32x4(
  2835. *(data + elementOffsets[0]),
  2836. *(data + elementOffsets[1]),
  2837. *(data + elementOffsets[2]),
  2838. *(data + elementOffsets[3])
  2839. );
  2840. #endif
  2841. }
  2842. static inline I32x4 gather_I32(dsr::SafePointer<const int32_t> data, const U32x4 &elementOffset) {
  2843. #ifdef SAFE_POINTER_CHECKS
  2844. ALIGN16 uint32_t elementOffsets[4];
  2845. if (uintptr_t((void*)elementOffsets) & 15u) { throwError(U"Unaligned stack memory detected in 128-bit gather_I32!\n"); }
  2846. elementOffset.writeAlignedUnsafe(elementOffsets);
  2847. data.assertInside("I32x4 gather_I32 lane 0", (data + elementOffsets[0]).getUnchecked());
  2848. data.assertInside("I32x4 gather_I32 lane 1", (data + elementOffsets[1]).getUnchecked());
  2849. data.assertInside("I32x4 gather_I32 lane 2", (data + elementOffsets[2]).getUnchecked());
  2850. data.assertInside("I32x4 gather_I32 lane 3", (data + elementOffsets[3]).getUnchecked());
  2851. #endif
  2852. #if defined(USE_AVX2)
2853. return I32x4(GATHER_I32x4_AVX2(data.getUnsafe(), elementOffset.v, 4));
  2854. #else
  2855. #ifndef SAFE_POINTER_CHECKS
  2856. ALIGN16 uint32_t elementOffsets[4];
  2857. elementOffset.writeAlignedUnsafe(elementOffsets);
  2858. #endif
  2859. return I32x4(
  2860. *(data + elementOffsets[0]),
  2861. *(data + elementOffsets[1]),
  2862. *(data + elementOffsets[2]),
  2863. *(data + elementOffsets[3])
  2864. );
  2865. #endif
  2866. }
  2867. static inline F32x4 gather_F32(dsr::SafePointer<const float> data, const U32x4 &elementOffset) {
  2868. #ifdef SAFE_POINTER_CHECKS
  2869. ALIGN16 uint32_t elementOffsets[4];
  2870. if (uintptr_t((void*)elementOffsets) & 15u) { throwError(U"Unaligned stack memory detected in 128-bit gather_F32!\n"); }
  2871. elementOffset.writeAlignedUnsafe(elementOffsets);
  2872. data.assertInside("F32x4 gather_F32 lane 0", (data + elementOffsets[0]).getUnchecked());
  2873. data.assertInside("F32x4 gather_F32 lane 1", (data + elementOffsets[1]).getUnchecked());
  2874. data.assertInside("F32x4 gather_F32 lane 2", (data + elementOffsets[2]).getUnchecked());
  2875. data.assertInside("F32x4 gather_F32 lane 3", (data + elementOffsets[3]).getUnchecked());
  2876. #endif
  2877. #if defined(USE_AVX2)
  2878. return F32x4(GATHER_F32x4_AVX2(data.getUnsafe(), elementOffset.v, 4));
  2879. #else
  2880. #ifndef SAFE_POINTER_CHECKS
  2881. ALIGN16 uint32_t elementOffsets[4];
  2882. elementOffset.writeAlignedUnsafe(elementOffsets);
  2883. #endif
  2884. return F32x4(
  2885. *(data + elementOffsets[0]),
  2886. *(data + elementOffsets[1]),
  2887. *(data + elementOffsets[2]),
  2888. *(data + elementOffsets[3])
  2889. );
  2890. #endif
  2891. }
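// Usage sketch (illustrative only; lookupTable and indices are hypothetical): gathers implement table
// lookups where each lane reads from its own element offset.
//   // dsr::SafePointer<const float> lookupTable; U32x4 indices(0u, 2u, 4u, 6u);
//   F32x4 values = gather_F32(lookupTable, indices);
// The offsets are element indices rather than byte offsets, and every index must point inside the
// allocation behind lookupTable, which SAFE_POINTER_CHECKS verifies with assertInside.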
  2892. inline F32x8 operator+(const F32x8& left, const F32x8& right) {
  2893. #if defined(USE_256BIT_F_SIMD)
  2894. return F32x8(ADD_F32_SIMD256(left.v, right.v));
  2895. #else
  2896. IMPL_SCALAR_REFERENCE_INFIX_8_LANES(left, right, F32x8, float, +)
  2897. #endif
  2898. }
  2899. inline F32x8 operator-(const F32x8& left, const F32x8& right) {
  2900. #if defined(USE_256BIT_F_SIMD)
  2901. return F32x8(SUB_F32_SIMD256(left.v, right.v));
  2902. #else
  2903. IMPL_SCALAR_REFERENCE_INFIX_8_LANES(left, right, F32x8, float, -)
  2904. #endif
  2905. }
  2906. inline F32x8 operator*(const F32x8& left, const F32x8& right) {
  2907. #if defined(USE_256BIT_F_SIMD)
  2908. return F32x8(MUL_F32_SIMD256(left.v, right.v));
  2909. #else
  2910. IMPL_SCALAR_REFERENCE_INFIX_8_LANES(left, right, F32x8, float, *)
  2911. #endif
  2912. }
  2913. inline F32x8 min(const F32x8& left, const F32x8& right) {
  2914. #if defined(USE_256BIT_F_SIMD)
  2915. return F32x8(MIN_F32_SIMD256(left.v, right.v));
  2916. #else
  2917. float v0 = left.scalars[0];
  2918. float v1 = left.scalars[1];
  2919. float v2 = left.scalars[2];
  2920. float v3 = left.scalars[3];
  2921. float v4 = left.scalars[4];
  2922. float v5 = left.scalars[5];
  2923. float v6 = left.scalars[6];
  2924. float v7 = left.scalars[7];
  2925. float r0 = right.scalars[0];
  2926. float r1 = right.scalars[1];
  2927. float r2 = right.scalars[2];
  2928. float r3 = right.scalars[3];
  2929. float r4 = right.scalars[4];
  2930. float r5 = right.scalars[5];
  2931. float r6 = right.scalars[6];
  2932. float r7 = right.scalars[7];
  2933. if (r0 < v0) { v0 = r0; }
  2934. if (r1 < v1) { v1 = r1; }
  2935. if (r2 < v2) { v2 = r2; }
  2936. if (r3 < v3) { v3 = r3; }
  2937. if (r4 < v4) { v4 = r4; }
  2938. if (r5 < v5) { v5 = r5; }
  2939. if (r6 < v6) { v6 = r6; }
  2940. if (r7 < v7) { v7 = r7; }
  2941. return F32x8(v0, v1, v2, v3, v4, v5, v6, v7);
  2942. #endif
  2943. }
  2944. inline F32x8 max(const F32x8& left, const F32x8& right) {
  2945. #if defined(USE_256BIT_F_SIMD)
  2946. return F32x8(MAX_F32_SIMD256(left.v, right.v));
  2947. #else
  2948. float v0 = left.scalars[0];
  2949. float v1 = left.scalars[1];
  2950. float v2 = left.scalars[2];
  2951. float v3 = left.scalars[3];
  2952. float v4 = left.scalars[4];
  2953. float v5 = left.scalars[5];
  2954. float v6 = left.scalars[6];
  2955. float v7 = left.scalars[7];
  2956. float r0 = right.scalars[0];
  2957. float r1 = right.scalars[1];
  2958. float r2 = right.scalars[2];
  2959. float r3 = right.scalars[3];
  2960. float r4 = right.scalars[4];
  2961. float r5 = right.scalars[5];
  2962. float r6 = right.scalars[6];
  2963. float r7 = right.scalars[7];
  2964. if (r0 > v0) { v0 = r0; }
  2965. if (r1 > v1) { v1 = r1; }
  2966. if (r2 > v2) { v2 = r2; }
  2967. if (r3 > v3) { v3 = r3; }
  2968. if (r4 > v4) { v4 = r4; }
  2969. if (r5 > v5) { v5 = r5; }
  2970. if (r6 > v6) { v6 = r6; }
  2971. if (r7 > v7) { v7 = r7; }
  2972. return F32x8(v0, v1, v2, v3, v4, v5, v6, v7);
  2973. #endif
  2974. }
  2975. inline I32x8 operator+(const I32x8& left, const I32x8& right) {
  2976. #if defined(USE_256BIT_X_SIMD)
  2977. return I32x8(ADD_I32_SIMD256(left.v, right.v));
  2978. #else
  2979. IMPL_SCALAR_REFERENCE_INFIX_8_LANES(left, right, I32x8, int32_t, +)
  2980. #endif
  2981. }
  2982. inline I32x8 operator-(const I32x8& left, const I32x8& right) {
  2983. #if defined(USE_256BIT_X_SIMD)
  2984. return I32x8(SUB_I32_SIMD256(left.v, right.v));
  2985. #else
  2986. IMPL_SCALAR_REFERENCE_INFIX_8_LANES(left, right, I32x8, int32_t, -)
  2987. #endif
  2988. }
  2989. inline I32x8 operator*(const I32x8& left, const I32x8& right) {
  2990. #if defined(USE_AVX2)
  2991. return I32x8(MUL_I32_SIMD256(left.v, right.v));
  2992. #else
  2993. IMPL_SCALAR_REFERENCE_INFIX_8_LANES(left, right, I32x8, int32_t, *)
  2994. #endif
  2995. }
  2996. inline U32x8 operator+(const U32x8& left, const U32x8& right) {
  2997. #if defined(USE_256BIT_X_SIMD)
  2998. return U32x8(ADD_U32_SIMD256(left.v, right.v));
  2999. #else
  3000. IMPL_SCALAR_REFERENCE_INFIX_8_LANES(left, right, U32x8, uint32_t, +)
  3001. #endif
  3002. }
  3003. inline U32x8 operator-(const U32x8& left, const U32x8& right) {
  3004. #if defined(USE_256BIT_X_SIMD)
  3005. return U32x8(SUB_U32_SIMD256(left.v, right.v));
  3006. #else
  3007. IMPL_SCALAR_REFERENCE_INFIX_8_LANES(left, right, U32x8, uint32_t, -)
  3008. #endif
  3009. }
  3010. inline U32x8 operator*(const U32x8& left, const U32x8& right) {
  3011. #if defined(USE_256BIT_X_SIMD)
  3012. return U32x8(MUL_U32_SIMD256(left.v, right.v));
  3013. #else
  3014. IMPL_SCALAR_REFERENCE_INFIX_8_LANES(left, right, U32x8, uint32_t, *)
  3015. #endif
  3016. }
  3017. inline U32x8 operator&(const U32x8& left, const U32x8& right) {
  3018. assert((uintptr_t(&left) & 31u) == 0);
  3019. #if defined(USE_256BIT_X_SIMD)
  3020. return U32x8(BITWISE_AND_U32_SIMD256(left.v, right.v));
  3021. #else
  3022. IMPL_SCALAR_REFERENCE_INFIX_8_LANES(left, right, U32x8, uint32_t, &)
  3023. #endif
  3024. }
  3025. inline U32x8 operator|(const U32x8& left, const U32x8& right) {
  3026. assert((uintptr_t(&left) & 31u) == 0);
  3027. #if defined(USE_256BIT_X_SIMD)
  3028. return U32x8(BITWISE_OR_U32_SIMD256(left.v, right.v));
  3029. #else
  3030. IMPL_SCALAR_REFERENCE_INFIX_8_LANES(left, right, U32x8, uint32_t, |)
  3031. #endif
  3032. }
  3033. inline U32x8 operator^(const U32x8& left, const U32x8& right) {
  3034. assert((uintptr_t(&left) & 31u) == 0);
  3035. #if defined(USE_256BIT_X_SIMD)
  3036. return U32x8(BITWISE_XOR_U32_SIMD256(left.v, right.v));
  3037. #else
  3038. IMPL_SCALAR_REFERENCE_INFIX_8_LANES(left, right, U32x8, uint32_t, ^)
  3039. #endif
  3040. }
  3041. inline U32x8 operator~(const U32x8& value) {
  3042. assert((uintptr_t(&value) & 31u) == 0);
  3043. #if defined(USE_BASIC_SIMD)
  3044. return value ^ U32x8(~uint32_t(0));
  3045. #else
  3046. return U32x8(
  3047. ~value.scalars[0],
  3048. ~value.scalars[1],
  3049. ~value.scalars[2],
  3050. ~value.scalars[3],
  3051. ~value.scalars[4],
  3052. ~value.scalars[5],
  3053. ~value.scalars[6],
  3054. ~value.scalars[7]
  3055. );
  3056. #endif
  3057. }
  3058. inline U16x16 operator&(const U16x16& left, const U16x16& right) {
  3059. #if defined(USE_AVX2)
  3060. return U16x16(_mm256_and_si256(left.v, right.v));
  3061. #else
3062. IMPL_SCALAR_REFERENCE_INFIX_16_LANES(left, right, U16x16, uint16_t, &)
  3063. #endif
  3064. }
  3065. // Bitwise or
  3066. inline U16x16 operator|(const U16x16& left, const U16x16& right) {
  3067. #if defined(USE_AVX2)
  3068. return U16x16(_mm256_or_si256(left.v, right.v));
  3069. #else
3070. IMPL_SCALAR_REFERENCE_INFIX_16_LANES(left, right, U16x16, uint16_t, |)
  3071. #endif
  3072. }
  3073. // Bitwise xor
  3074. inline U16x16 operator^(const U16x16& left, const U16x16& right) {
  3075. #if defined(USE_AVX2)
  3076. return U16x16(_mm256_xor_si256(left.v, right.v));
  3077. #else
3078. IMPL_SCALAR_REFERENCE_INFIX_16_LANES(left, right, U16x16, uint16_t, ^)
  3079. #endif
  3080. }
  3081. // Bitwise negation
  3082. inline U16x16 operator~(const U16x16& value) {
  3083. #if defined(USE_AVX2)
  3084. // Fall back on xor against all ones.
  3085. return value ^ U16x16(~uint16_t(0));
  3086. #else
  3087. // TODO: Perform using 64-bit integers.
  3088. return U16x16(~value.scalars[0], ~value.scalars[1], ~value.scalars[2], ~value.scalars[3], ~value.scalars[4], ~value.scalars[5], ~value.scalars[6], ~value.scalars[7], ~value.scalars[8], ~value.scalars[9], ~value.scalars[10], ~value.scalars[11], ~value.scalars[12], ~value.scalars[13], ~value.scalars[14], ~value.scalars[15]);
  3089. #endif
  3090. }
  3091. inline U16x16 operator<<(const U16x16& left, const U16x16 &bitOffsets) {
  3092. #ifdef SAFE_POINTER_CHECKS
  3093. if(!allLanesLesser(bitOffsets, U16x16(16u))) {
  3094. throwError(U"Tried to shift ", left, U" by bit offsets ", bitOffsets, U", which is non-deterministic from being out of bound 0..15!\n");
  3095. }
  3096. #endif
  3097. #if defined(USE_AVX2)
  3098. IMPL_SCALAR_FALLBACK_INFIX_16_LANES(left, bitOffsets, U16x16, uint16_t, <<)
  3099. #else
  3100. IMPL_SCALAR_REFERENCE_INFIX_16_LANES(left, bitOffsets, U16x16, uint16_t, <<)
  3101. #endif
  3102. }
  3103. inline U16x16 operator>>(const U16x16& left, const U16x16 &bitOffsets) {
  3104. #ifdef SAFE_POINTER_CHECKS
  3105. if(!allLanesLesser(bitOffsets, U16x16(16u))) {
  3106. throwError(U"Tried to shift ", left, U" by bit offsets ", bitOffsets, U", which is non-deterministic from being out of bound 0..15!\n");
  3107. }
  3108. #endif
  3109. #if defined(USE_AVX2)
  3110. IMPL_SCALAR_FALLBACK_INFIX_16_LANES(left, bitOffsets, U16x16, uint16_t, >>)
  3111. #else
  3112. IMPL_SCALAR_REFERENCE_INFIX_16_LANES(left, bitOffsets, U16x16, uint16_t, >>)
  3113. #endif
  3114. }
  3115. inline U16x16 operator<<(const U16x16& left, const uint32_t &bitOffset) {
  3116. #if defined(USE_AVX2)
  3117. #ifdef SAFE_POINTER_CHECKS
  3118. if(bitOffset >= 16u) {
3119. throwError(U"Tried to shift ", left, U" by bit offset ", bitOffset, U", which is non-deterministic from being out of bound 0..15!\n");
  3120. }
  3121. #endif
  3122. // Write the content to aligned stack memory.
  3123. ALIGN32 __m256i values;
  3124. left.writeAlignedUnsafe((uint16_t*)&values);
  3125. // Cast a pointer to the data into four 64-bit elements.
  3126. uint64_t *largeLanes = (uint64_t*)&values;
3127. // Shift the 256 bits as four 64-bit values.
  3128. largeLanes[0] = largeLanes[0] << bitOffset;
  3129. largeLanes[1] = largeLanes[1] << bitOffset;
  3130. largeLanes[2] = largeLanes[2] << bitOffset;
  3131. largeLanes[3] = largeLanes[3] << bitOffset;
  3132. // Create a mask.
  3133. U16x16 mask = U16x16(uint16_t(~0u) << bitOffset);
  3134. // Return the shifted 64-bit elements masked to remove spill across lanes.
  3135. return U16x16::readAlignedUnsafe((uint16_t*)&values) & mask;
  3136. #else
  3137. return left << U16x16(bitOffset);
  3138. #endif
  3139. }
  3140. inline U16x16 operator>>(const U16x16& left, const uint32_t &bitOffset) {
  3141. #if defined(USE_AVX2)
  3142. #ifdef SAFE_POINTER_CHECKS
  3143. if(bitOffset >= 16u) {
3144. throwError(U"Tried to shift ", left, U" by bit offset ", bitOffset, U", which is non-deterministic from being out of bound 0..15!\n");
  3145. }
  3146. #endif
  3147. // Write the content to aligned stack memory.
  3148. ALIGN32 __m256i values;
  3149. left.writeAlignedUnsafe((uint16_t*)&values);
  3150. // Cast a pointer to the data into four 64-bit elements.
  3151. uint64_t *largeLanes = (uint64_t*)&values;
3152. // Shift the 256 bits as four 64-bit values.
  3153. largeLanes[0] = largeLanes[0] >> bitOffset;
  3154. largeLanes[1] = largeLanes[1] >> bitOffset;
  3155. largeLanes[2] = largeLanes[2] >> bitOffset;
  3156. largeLanes[3] = largeLanes[3] >> bitOffset;
  3157. // Create a mask.
  3158. U16x16 mask = U16x16(uint16_t(~0u) >> bitOffset);
  3159. // Return the shifted 64-bit elements masked to remove spill across lanes.
  3160. return U16x16::readAlignedUnsafe((uint16_t*)&values) & mask;
  3161. #else
  3162. return left >> U16x16(bitOffset);
  3163. #endif
  3164. }
3165. // bitOffset must be an immediate constant from 0 to 15, so a template argument is used.
  3166. template <uint32_t bitOffset>
  3167. inline U16x16 bitShiftLeftImmediate(const U16x16& left) {
  3168. static_assert(bitOffset < 16u, "Immediate left shift of 16-bit values may not shift more than 15 bits!");
  3169. #if defined(USE_AVX2)
  3170. return U16x16(_mm256_slli_epi16(left.v, bitOffset));
  3171. #else
  3172. U16x16 bitOffsets = U16x16(bitOffset);
  3173. IMPL_SCALAR_REFERENCE_INFIX_16_LANES(left, bitOffsets, U16x16, uint16_t, <<)
  3174. #endif
  3175. }
3176. // bitOffset must be an immediate constant from 0 to 15, so a template argument is used.
  3177. template <uint32_t bitOffset>
  3178. inline U16x16 bitShiftRightImmediate(const U16x16& left) {
  3179. static_assert(bitOffset < 16u, "Immediate right shift of 16-bit values may not shift more than 15 bits!");
  3180. #if defined(USE_AVX2)
  3181. return U16x16(_mm256_srli_epi16(left.v, bitOffset));
  3182. #else
  3183. U16x16 bitOffsets = U16x16(bitOffset);
3184. IMPL_SCALAR_REFERENCE_INFIX_16_LANES(left, bitOffsets, U16x16, uint16_t, >>)
  3185. #endif
  3186. }
  3187. inline U32x8 operator<<(const U32x8& left, const U32x8 &bitOffsets) {
  3188. assert((uintptr_t(&left) & 31u) == 0);
  3189. #ifdef SAFE_POINTER_CHECKS
  3190. if(!allLanesLesser(bitOffsets, U32x8(32u))) {
  3191. throwError(U"Tried to shift ", left, U" by bit offsets ", bitOffsets, U", which is non-deterministic from being out of bound 0..31!\n");
  3192. }
  3193. #endif
  3194. #if defined(USE_AVX2)
  3195. IMPL_SCALAR_FALLBACK_INFIX_8_LANES(left, bitOffsets, U32x8, uint32_t, <<)
  3196. #else
  3197. IMPL_SCALAR_REFERENCE_INFIX_8_LANES(left, bitOffsets, U32x8, uint32_t, <<)
  3198. #endif
  3199. }
  3200. inline U32x8 operator>>(const U32x8& left, const U32x8 &bitOffsets) {
  3201. assert((uintptr_t(&left) & 31u) == 0);
  3202. #ifdef SAFE_POINTER_CHECKS
  3203. if(!allLanesLesser(bitOffsets, U32x8(32u))) {
  3204. throwError(U"Tried to shift ", left, U" by bit offsets ", bitOffsets, U", which is non-deterministic from being out of bound 0..31!\n");
  3205. }
  3206. #endif
  3207. #if defined(USE_AVX2)
  3208. IMPL_SCALAR_FALLBACK_INFIX_8_LANES(left, bitOffsets, U32x8, uint32_t, >>)
  3209. #else
  3210. IMPL_SCALAR_REFERENCE_INFIX_8_LANES(left, bitOffsets, U32x8, uint32_t, >>)
  3211. #endif
  3212. }
  3213. inline U32x8 operator<<(const U32x8& left, const uint32_t &bitOffset) {
  3214. #if defined(USE_AVX2)
  3215. #ifdef SAFE_POINTER_CHECKS
  3216. if(bitOffset >= 32u) {
  3217. throwError(U"Tried to shift ", left, U" by bit offset ", bitOffset, U", which is non-deterministic from being out of bound 0..31!\n");
  3218. }
  3219. #endif
  3220. // Write the content to aligned stack memory.
  3221. ALIGN32 __m256i values;
  3222. left.writeAlignedUnsafe((uint32_t*)&values);
3223. // Cast a pointer to the data into four 64-bit elements.
  3224. uint64_t *largeLanes = (uint64_t*)&values;
3225. // Shift the 256 bits as four 64-bit values.
  3226. largeLanes[0] = largeLanes[0] << bitOffset;
  3227. largeLanes[1] = largeLanes[1] << bitOffset;
  3228. largeLanes[2] = largeLanes[2] << bitOffset;
  3229. largeLanes[3] = largeLanes[3] << bitOffset;
  3230. // Create a mask.
  3231. U32x8 mask = U32x8(uint32_t(~0u) << bitOffset);
  3232. // Return the shifted 64-bit elements masked to remove spill across lanes.
  3233. return U32x8::readAlignedUnsafe((uint32_t*)&values) & mask;
  3234. #else
  3235. return left << U32x8(bitOffset);
  3236. #endif
  3237. }
  3238. inline U32x8 operator>>(const U32x8& left, const uint32_t &bitOffset) {
  3239. #if defined(USE_AVX2)
  3240. #ifdef SAFE_POINTER_CHECKS
  3241. if(bitOffset >= 32u) {
  3242. throwError(U"Tried to shift ", left, U" by bit offset ", bitOffset, U", which is non-deterministic from being out of bound 0..31!\n");
  3243. }
  3244. #endif
  3245. // Write the content to aligned stack memory.
  3246. ALIGN32 __m256i values;
  3247. left.writeAlignedUnsafe((uint32_t*)&values);
// Cast a pointer to the data into four 64-bit elements.
uint64_t *largeLanes = (uint64_t*)&values;
// Shift the 256 bits as four 64-bit values.
  3251. largeLanes[0] = largeLanes[0] >> bitOffset;
  3252. largeLanes[1] = largeLanes[1] >> bitOffset;
  3253. largeLanes[2] = largeLanes[2] >> bitOffset;
  3254. largeLanes[3] = largeLanes[3] >> bitOffset;
  3255. // Create a mask.
  3256. U32x8 mask = U32x8(uint32_t(~0u) >> bitOffset);
  3257. // Return the shifted 64-bit elements masked to remove spill across lanes.
  3258. return U32x8::readAlignedUnsafe((uint32_t*)&values) & mask;
  3259. #else
  3260. return left >> U32x8(bitOffset);
  3261. #endif
  3262. }
  3263. // bitOffset must be an immediate constant from 0 to 31, so a template argument is used.
  3264. template <uint32_t bitOffset>
  3265. inline U32x8 bitShiftLeftImmediate(const U32x8& left) {
  3266. assert((uintptr_t(&left) & 31u) == 0);
  3267. static_assert(bitOffset < 32u, "Immediate left shift of 32-bit values may not shift more than 31 bits!");
  3268. #if defined(USE_AVX2)
  3269. return U32x8(_mm256_slli_epi32(left.v, bitOffset));
  3270. #else
  3271. U32x8 bitOffsets = U32x8(bitOffset);
  3272. IMPL_SCALAR_REFERENCE_INFIX_8_LANES(left, bitOffsets, U32x8, uint32_t, <<)
  3273. #endif
  3274. }
  3275. // bitOffset must be an immediate constant from 0 to 31, so a template argument is used.
  3276. template <uint32_t bitOffset>
  3277. inline U32x8 bitShiftRightImmediate(const U32x8& left) {
  3278. assert((uintptr_t(&left) & 31u) == 0);
  3279. static_assert(bitOffset < 32u, "Immediate right shift of 32-bit values may not shift more than 31 bits!");
  3280. #if defined(USE_AVX2)
  3281. return U32x8(_mm256_srli_epi32(left.v, bitOffset));
  3282. #else
  3283. U32x8 bitOffsets = U32x8(bitOffset);
  3284. IMPL_SCALAR_REFERENCE_INFIX_8_LANES(left, bitOffsets, U32x8, uint32_t, >>)
  3285. #endif
  3286. }
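// Illustrative sketch (not part of the library): the immediate shifts above can be combined with a mask
// to isolate a bit field from every lane at once. The function name is only for this example.
static inline U32x8 example_extractBitField(const U32x8 &x) {
	// Take bits 8..15 of each 32-bit lane and move them down to bits 0..7.
	return bitShiftRightImmediate<8>(x) & U32x8(255u);
}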
  3287. inline U16x16 operator+(const U16x16& left, const U16x16& right) {
  3288. #if defined(USE_256BIT_X_SIMD)
  3289. return U16x16(ADD_U16_SIMD256(left.v, right.v));
  3290. #else
  3291. IMPL_SCALAR_REFERENCE_INFIX_16_LANES(left, right, U16x16, uint16_t, +)
  3292. #endif
  3293. }
  3294. inline U16x16 operator-(const U16x16& left, const U16x16& right) {
  3295. #if defined(USE_256BIT_X_SIMD)
  3296. return U16x16(SUB_U16_SIMD256(left.v, right.v));
  3297. #else
  3298. IMPL_SCALAR_REFERENCE_INFIX_16_LANES(left, right, U16x16, uint16_t, -)
  3299. #endif
  3300. }
  3301. inline U16x16 operator*(const U16x16& left, const U16x16& right) {
  3302. #if defined(USE_256BIT_X_SIMD)
  3303. return U16x16(MUL_U16_SIMD256(left.v, right.v));
  3304. #else
  3305. IMPL_SCALAR_REFERENCE_INFIX_16_LANES(left, right, U16x16, uint16_t, *)
  3306. #endif
  3307. }
  3308. inline U8x32 operator+(const U8x32& left, const U8x32& right) {
  3309. #if defined(USE_256BIT_X_SIMD)
  3310. return U8x32(ADD_U8_SIMD256(left.v, right.v));
  3311. #else
  3312. IMPL_SCALAR_REFERENCE_INFIX_32_LANES(left, right, U8x32, uint8_t, +)
  3313. #endif
  3314. }
  3315. inline U8x32 operator-(const U8x32& left, const U8x32& right) {
  3316. #if defined(USE_256BIT_X_SIMD)
  3317. return U8x32(SUB_U8_SIMD256(left.v, right.v));
  3318. #else
  3319. IMPL_SCALAR_REFERENCE_INFIX_32_LANES(left, right, U8x32, uint8_t, -)
  3320. #endif
  3321. }
  3322. inline U8x32 saturatedAddition(const U8x32& left, const U8x32& right) {
  3323. #if defined(USE_256BIT_X_SIMD)
  3324. return U8x32(ADD_SAT_U8_SIMD256(left.v, right.v));
  3325. #else
  3326. U8x32 result = U8x32::create_dangerous_uninitialized();
  3327. for (int i = 0; i < 32; i++) {
  3328. result.scalars[i] = impl_limit255((uint32_t)left.scalars[i] + (uint32_t)right.scalars[i]);
  3329. }
  3330. return result;
  3331. #endif
  3332. }
  3333. inline U8x32 saturatedSubtraction(const U8x32& left, const U8x32& right) {
  3334. #if defined(USE_256BIT_X_SIMD)
  3335. return U8x32(SUB_SAT_U8_SIMD256(left.v, right.v));
  3336. #else
  3337. U8x32 result = U8x32::create_dangerous_uninitialized();
  3338. for (int i = 0; i < 32; i++) {
  3339. result.scalars[i] = impl_limit0((int32_t)left.scalars[i] - (int32_t)right.scalars[i]);
  3340. }
  3341. return result;
  3342. #endif
  3343. }
  3344. inline I32x8 truncateToI32(const F32x8& vector) {
  3345. #if defined(USE_256BIT_X_SIMD)
  3346. return I32x8(F32_TO_I32_SIMD256(vector.v));
  3347. #else
  3348. return I32x8(
  3349. (int32_t)vector.scalars[0], (int32_t)vector.scalars[1], (int32_t)vector.scalars[2], (int32_t)vector.scalars[3],
  3350. (int32_t)vector.scalars[4], (int32_t)vector.scalars[5], (int32_t)vector.scalars[6], (int32_t)vector.scalars[7]
  3351. );
  3352. #endif
  3353. }
  3354. inline U32x8 truncateToU32(const F32x8& vector) {
  3355. #if defined(USE_256BIT_X_SIMD)
  3356. return U32x8(F32_TO_U32_SIMD256(vector.v));
  3357. #else
  3358. return U32x8(
  3359. (uint32_t)vector.scalars[0], (uint32_t)vector.scalars[1], (uint32_t)vector.scalars[2], (uint32_t)vector.scalars[3],
  3360. (uint32_t)vector.scalars[4], (uint32_t)vector.scalars[5], (uint32_t)vector.scalars[6], (uint32_t)vector.scalars[7]
  3361. );
  3362. #endif
  3363. }
  3364. inline F32x8 floatFromI32(const I32x8& vector) {
  3365. #if defined(USE_256BIT_X_SIMD)
  3366. return F32x8(I32_TO_F32_SIMD256(vector.v));
  3367. #else
  3368. return F32x8(
  3369. (float)vector.scalars[0], (float)vector.scalars[1], (float)vector.scalars[2], (float)vector.scalars[3],
  3370. (float)vector.scalars[4], (float)vector.scalars[5], (float)vector.scalars[6], (float)vector.scalars[7]
  3371. );
  3372. #endif
  3373. }
  3374. inline F32x8 floatFromU32(const U32x8& vector) {
  3375. #if defined(USE_256BIT_X_SIMD)
  3376. return F32x8(U32_TO_F32_SIMD256(vector.v));
  3377. #else
  3378. return F32x8(
  3379. (float)vector.scalars[0], (float)vector.scalars[1], (float)vector.scalars[2], (float)vector.scalars[3],
  3380. (float)vector.scalars[4], (float)vector.scalars[5], (float)vector.scalars[6], (float)vector.scalars[7]
  3381. );
  3382. #endif
  3383. }
  3384. inline I32x8 I32FromU32(const U32x8& vector) {
  3385. #if defined(USE_256BIT_X_SIMD)
  3386. return I32x8(REINTERPRET_U32_TO_I32_SIMD256(vector.v));
  3387. #else
  3388. return I32x8(
  3389. (int32_t)vector.scalars[0], (int32_t)vector.scalars[1], (int32_t)vector.scalars[2], (int32_t)vector.scalars[3],
  3390. (int32_t)vector.scalars[4], (int32_t)vector.scalars[5], (int32_t)vector.scalars[6], (int32_t)vector.scalars[7]
  3391. );
  3392. #endif
  3393. }
  3394. inline U32x8 U32FromI32(const I32x8& vector) {
  3395. #if defined(USE_256BIT_X_SIMD)
  3396. return U32x8(REINTERPRET_I32_TO_U32_SIMD256(vector.v));
  3397. #else
  3398. return U32x8(
  3399. (uint32_t)vector.scalars[0], (uint32_t)vector.scalars[1], (uint32_t)vector.scalars[2], (uint32_t)vector.scalars[3],
  3400. (uint32_t)vector.scalars[4], (uint32_t)vector.scalars[5], (uint32_t)vector.scalars[6], (uint32_t)vector.scalars[7]
  3401. );
  3402. #endif
  3403. }
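// Illustrative sketch (not part of the library): the conversions above can be chained to get the
// fractional part of eight floats at once. Assumes non-negative input so that truncation equals floor.
static inline F32x8 example_fractionalPart(const F32x8 &value) {
	// value - float(int(value)) removes the integer part from each lane.
	return value - floatFromI32(truncateToI32(value));
}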
  3404. // Warning! Behavior depends on endianness.
  3405. inline U8x32 reinterpret_U8FromU32(const U32x8& vector) {
  3406. #if defined(USE_256BIT_X_SIMD)
  3407. return U8x32(REINTERPRET_U32_TO_U8_SIMD256(vector.v));
  3408. #else
  3409. const uint8_t *source = (const uint8_t*)vector.scalars;
  3410. U8x32 result = U8x32::create_dangerous_uninitialized();
  3411. for (int i = 0; i < 32; i++) {
  3412. result.scalars[i] = source[i];
  3413. }
  3414. return result;
  3415. #endif
  3416. }
  3417. // Warning! Behavior depends on endianness.
  3418. inline U32x8 reinterpret_U32FromU8(const U8x32& vector) {
  3419. #if defined(USE_256BIT_X_SIMD)
  3420. return U32x8(REINTERPRET_U8_TO_U32_SIMD256(vector.v));
  3421. #else
  3422. const uint32_t *source = (const uint32_t*)vector.scalars;
  3423. return U32x8(source[0], source[1], source[2], source[3], source[4], source[5], source[6], source[7]);
  3424. #endif
  3425. }
  3426. // Warning! Behavior depends on endianness.
  3427. inline U16x16 reinterpret_U16FromU32(const U32x8& vector) {
  3428. #if defined(USE_256BIT_X_SIMD)
  3429. return U16x16(REINTERPRET_U32_TO_U16_SIMD256(vector.v));
  3430. #else
  3431. const uint16_t *source = (const uint16_t*)vector.scalars;
  3432. return U16x16(
  3433. source[0], source[1], source[2] , source[3] , source[4] , source[5] , source[6] , source[7] ,
  3434. source[8], source[9], source[10], source[11], source[12], source[13], source[14], source[15]
  3435. );
  3436. #endif
  3437. }
  3438. // Warning! Behavior depends on endianness.
  3439. inline U32x8 reinterpret_U32FromU16(const U16x16& vector) {
  3440. #if defined(USE_256BIT_X_SIMD)
  3441. return U32x8(REINTERPRET_U16_TO_U32_SIMD256(vector.v));
  3442. #else
  3443. const uint32_t *source = (const uint32_t*)vector.scalars;
  3444. return U32x8(source[0], source[1], source[2], source[3], source[4], source[5], source[6], source[7]);
  3445. #endif
  3446. }
  3447. // Unpacking to larger integers
  3448. inline U32x8 lowerToU32(const U16x16& vector) {
  3449. #if defined(USE_256BIT_X_SIMD)
  3450. return U32x8(U16_LOW_TO_U32_SIMD256(vector.v));
  3451. #else
  3452. return U32x8(vector.scalars[0], vector.scalars[1], vector.scalars[2], vector.scalars[3], vector.scalars[4], vector.scalars[5], vector.scalars[6], vector.scalars[7]);
  3453. #endif
  3454. }
  3455. inline U32x8 higherToU32(const U16x16& vector) {
  3456. #if defined(USE_256BIT_X_SIMD)
  3457. return U32x8(U16_HIGH_TO_U32_SIMD256(vector.v));
  3458. #else
  3459. return U32x8(vector.scalars[8], vector.scalars[9], vector.scalars[10], vector.scalars[11], vector.scalars[12], vector.scalars[13], vector.scalars[14], vector.scalars[15]);
  3460. #endif
  3461. }
  3462. inline U16x16 lowerToU16(const U8x32& vector) {
  3463. #if defined(USE_256BIT_X_SIMD)
  3464. return U16x16(U8_LOW_TO_U16_SIMD256(vector.v));
  3465. #else
  3466. return U16x16(
  3467. vector.scalars[0], vector.scalars[1], vector.scalars[2], vector.scalars[3],
  3468. vector.scalars[4], vector.scalars[5], vector.scalars[6], vector.scalars[7],
  3469. vector.scalars[8], vector.scalars[9], vector.scalars[10], vector.scalars[11],
  3470. vector.scalars[12], vector.scalars[13], vector.scalars[14], vector.scalars[15]
  3471. );
  3472. #endif
  3473. }
  3474. inline U16x16 higherToU16(const U8x32& vector) {
  3475. #if defined(USE_256BIT_X_SIMD)
  3476. return U16x16(U8_HIGH_TO_U16_SIMD256(vector.v));
  3477. #else
  3478. return U16x16(
  3479. vector.scalars[16], vector.scalars[17], vector.scalars[18], vector.scalars[19],
  3480. vector.scalars[20], vector.scalars[21], vector.scalars[22], vector.scalars[23],
  3481. vector.scalars[24], vector.scalars[25], vector.scalars[26], vector.scalars[27],
  3482. vector.scalars[28], vector.scalars[29], vector.scalars[30], vector.scalars[31]
  3483. );
  3484. #endif
  3485. }
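// Illustrative sketch (not part of the library): widening 16-bit values so that intermediate math can
// be done in 32-bit precision without overflowing, using the unpacking functions above.
static inline void example_widenAndScale(const U16x16 &input, U32x8 &lowerResult, U32x8 &higherResult) {
	// Each 16-bit lane is zero extended to 32 bits before shifting left by 4 bits.
	lowerResult = lowerToU32(input) << 4u;
	higherResult = higherToU32(input) << 4u;
}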
// Unary negation for convenience and code readability.
// Before using unary negation, always check if:
// * The addition can be turned into a subtraction:
//     x = -a + b
//     x = b - a
// * A multiplying constant or scalar can be negated instead:
//     x = -b * 2
//     x = b * -2
// A short illustrative sketch of the first rewrite follows the operators below.
  3494. inline F32x8 operator-(const F32x8& value) {
  3495. #if defined(USE_256BIT_F_SIMD)
  3496. return F32x8(0.0f) - value;
  3497. #else
  3498. return F32x8(
  3499. -value.scalars[0], -value.scalars[1], -value.scalars[2], -value.scalars[3],
  3500. -value.scalars[4], -value.scalars[5], -value.scalars[6], -value.scalars[7]
  3501. );
  3502. #endif
  3503. }
  3504. inline I32x8 operator-(const I32x8& value) {
  3505. #if defined(USE_256BIT_X_SIMD)
  3506. return I32x8(0) - value;
  3507. #else
  3508. return I32x8(
  3509. -value.scalars[0], -value.scalars[1], -value.scalars[2], -value.scalars[3],
  3510. -value.scalars[4], -value.scalars[5], -value.scalars[6], -value.scalars[7]
  3511. );
  3512. #endif
  3513. }
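// Illustrative sketch (not part of the library): the rewrite suggested above in practice.
// Instead of x = -a + b, which would create a negated temporary, the same value is computed as b - a.
static inline F32x8 example_negateAndAdd(const F32x8 &a, const F32x8 &b) {
	// Equivalent to -a + b using one operation less.
	return b - a;
}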
  3514. // Helper macros for generating the vector extract functions.
  3515. // Having one function for each type and offset makes sure that the compiler gets an immediate integer within the valid range.
  3516. #if defined(USE_AVX2)
  3517. // AVX2 does not offer any 256-bit element extraction, only two 128-bit shifts done in parallel, so we might as well use two separate 128-bit extractions.
  3518. // The __m256i type should never be returned from a non-intrinsic function, because g++ does not automatically enforce
  3519. // 32 byte alignment for __m256i vectors when creating temporary variables in the generated assembler instructions.
  3520. template <typename T, int INNER_OFFSET, int EDGE_HALF_INDEX, int MIDDLE_HALF_INDEX>
  3521. inline T impl_extractBytes_AVX2(const T &leftInput, const T &middleInput, const T &rightInput) {
  3522. static_assert(0 <= INNER_OFFSET && INNER_OFFSET < 16, "impl_extractBytes_AVX2: INNER_OFFSET is out of bound 0..15!\n");
static_assert(0 <= EDGE_HALF_INDEX && EDGE_HALF_INDEX < 2, "impl_extractBytes_AVX2: EDGE_HALF_INDEX is out of bound 0..1!\n");
static_assert(0 <= MIDDLE_HALF_INDEX && MIDDLE_HALF_INDEX < 2, "impl_extractBytes_AVX2: MIDDLE_HALF_INDEX is out of bound 0..1!\n");
  3525. // Extract three halves depending on which ones overlap with the offset.
  3526. ALIGN16 __m128i leftPart = _mm256_extractf128_si256(leftInput.v , EDGE_HALF_INDEX );
  3527. ALIGN16 __m128i middlePart = _mm256_extractf128_si256(middleInput.v, MIDDLE_HALF_INDEX);
  3528. ALIGN16 __m128i rightPart = _mm256_extractf128_si256(rightInput.v , EDGE_HALF_INDEX );
  3529. // Make two 128-bit vector extractions.
  3530. ALIGN16 __m128i leftResult = _mm_alignr_epi8(leftPart, middlePart, INNER_OFFSET);
  3531. ALIGN16 __m128i rightResult = _mm_alignr_epi8(middlePart, rightPart, INNER_OFFSET);
  3532. // Combine the results.
  3533. ALIGN32 __m256i result = _mm256_set_m128i(leftResult, rightResult);
  3534. return T(result);
  3535. }
  3536. template <typename T, int INNER_OFFSET, int EDGE_HALF_INDEX, int MIDDLE_HALF_INDEX>
  3537. inline T impl_extractBytes_AVX2_F32(const T &leftInput, const T &middleInput, const T &rightInput) {
static_assert(0 <= INNER_OFFSET && INNER_OFFSET < 16, "impl_extractBytes_AVX2_F32: INNER_OFFSET is out of bound 0..15!\n");
static_assert(0 <= EDGE_HALF_INDEX && EDGE_HALF_INDEX < 2, "impl_extractBytes_AVX2_F32: EDGE_HALF_INDEX is out of bound 0..1!\n");
static_assert(0 <= MIDDLE_HALF_INDEX && MIDDLE_HALF_INDEX < 2, "impl_extractBytes_AVX2_F32: MIDDLE_HALF_INDEX is out of bound 0..1!\n");
  3541. // Extract three halves depending on which ones overlap with the offset.
  3542. ALIGN16 __m128i leftPart = _mm256_extractf128_si256(__m256i(leftInput.v) , EDGE_HALF_INDEX );
  3543. ALIGN16 __m128i middlePart = _mm256_extractf128_si256(__m256i(middleInput.v), MIDDLE_HALF_INDEX);
  3544. ALIGN16 __m128i rightPart = _mm256_extractf128_si256(__m256i(rightInput.v) , EDGE_HALF_INDEX );
  3545. // Make two 128-bit vector extractions.
  3546. ALIGN16 __m128i leftResult = _mm_alignr_epi8(leftPart, middlePart, INNER_OFFSET);
  3547. ALIGN16 __m128i rightResult = _mm_alignr_epi8(middlePart, rightPart, INNER_OFFSET);
  3548. // Combine the results.
  3549. ALIGN32 __m256i result = _mm256_set_m128i(leftResult, rightResult);
  3550. return T(__m256(result));
  3551. }
  3552. #define VECTOR_EXTRACT_GENERATOR_256(METHOD_NAME, VECTOR_TYPE, OFFSET, A, B) \
  3553. METHOD_NAME<VECTOR_TYPE, (OFFSET) - ((OFFSET) < 16 ? 0 : 16), (OFFSET) < 16 ? 0 : 1, (OFFSET) < 16 ? 1 : 0> ((B), (OFFSET) < 16 ? (A) : (B), (A))
  3554. #define VECTOR_EXTRACT_GENERATOR_256_U8( OFFSET) return U8x32 (VECTOR_EXTRACT_GENERATOR_256(impl_extractBytes_AVX2 , U8x32 , OFFSET , a, b));
  3555. #define VECTOR_EXTRACT_GENERATOR_256_U16(OFFSET) return U16x16(VECTOR_EXTRACT_GENERATOR_256(impl_extractBytes_AVX2 , U16x16, OFFSET * 2, a, b));
  3556. #define VECTOR_EXTRACT_GENERATOR_256_U32(OFFSET) return U32x8 (VECTOR_EXTRACT_GENERATOR_256(impl_extractBytes_AVX2 , U32x8 , OFFSET * 4, a, b));
  3557. #define VECTOR_EXTRACT_GENERATOR_256_I32(OFFSET) return I32x8 (VECTOR_EXTRACT_GENERATOR_256(impl_extractBytes_AVX2 , I32x8 , OFFSET * 4, a, b));
  3558. #define VECTOR_EXTRACT_GENERATOR_256_F32(OFFSET) return F32x8 (VECTOR_EXTRACT_GENERATOR_256(impl_extractBytes_AVX2_F32, F32x8 , OFFSET * 4, a, b));
  3559. #else
// TODO: Implement bound checks for scalars in debug mode. A static index can be checked at compile time.
  3561. template<typename T, int ELEMENT_COUNT, int OFFSET>
  3562. T impl_vectorExtract_emulated(const T &a, const T &b) {
  3563. static_assert(0 <= OFFSET && OFFSET <= ELEMENT_COUNT, "Offset is out of bound in impl_vectorExtract_emulated!\n");
  3564. static_assert(sizeof(a.scalars) == sizeof(a.scalars[0]) * ELEMENT_COUNT, "A does not match the element count in impl_vectorExtract_emulated!\n");
  3565. static_assert(sizeof(b.scalars) == sizeof(b.scalars[0]) * ELEMENT_COUNT, "B does not match the element count in impl_vectorExtract_emulated!\n");
  3566. T result = T::create_dangerous_uninitialized();
  3567. static_assert(sizeof(result.scalars) == sizeof(result.scalars[0]) * ELEMENT_COUNT, "The result does not match the element count in impl_vectorExtract_emulated!\n");
  3568. int t = 0;
  3569. for (int s = OFFSET; s < ELEMENT_COUNT; s++) {
  3570. assert(0 <= s && s < ELEMENT_COUNT);
  3571. assert(0 <= t && t < ELEMENT_COUNT);
  3572. result.scalars[t] = a.scalars[s];
  3573. t++;
  3574. }
  3575. for (int s = 0; s < OFFSET; s++) {
  3576. assert(0 <= s && s < ELEMENT_COUNT);
  3577. assert(0 <= t && t < ELEMENT_COUNT);
  3578. result.scalars[t] = b.scalars[s];
  3579. t++;
  3580. }
  3581. return result;
  3582. }
  3583. #define VECTOR_EXTRACT_GENERATOR_256_U8( OFFSET) return impl_vectorExtract_emulated< U8x32, 32, OFFSET>(a, b);
  3584. #define VECTOR_EXTRACT_GENERATOR_256_U16(OFFSET) return impl_vectorExtract_emulated<U16x16, 16, OFFSET>(a, b);
  3585. #define VECTOR_EXTRACT_GENERATOR_256_U32(OFFSET) return impl_vectorExtract_emulated< U32x8, 8, OFFSET>(a, b);
  3586. #define VECTOR_EXTRACT_GENERATOR_256_I32(OFFSET) return impl_vectorExtract_emulated< I32x8, 8, OFFSET>(a, b);
  3587. #define VECTOR_EXTRACT_GENERATOR_256_F32(OFFSET) return impl_vectorExtract_emulated< F32x8, 8, OFFSET>(a, b);
  3588. #endif
// Vector extraction concatenates two input vectors and reads a vector between them using an offset.
// The first and last offsets, which simply return one of the inputs, can still be used for readability, because the compiler will inline and remove them.
// To get elements from the right side, combine the center vector with the right vector and shift one element to the left using vectorExtract_1 for the given type.
// To get elements from the left side, combine the left vector with the center vector and shift one element to the right using vectorExtract_{laneCount - 1}, such as vectorExtract_31 for 32 lanes, vectorExtract_15 for 16 lanes, or vectorExtract_7 for 8 lanes.
// A short illustrative sketch follows the definitions below.
// TODO: Also allow using a template argument as the element offset with a static assert for the offset, which might be useful in template programming.
  3594. U8x32 inline vectorExtract_0(const U8x32 &a, const U8x32 &b) { return a; }
  3595. U8x32 inline vectorExtract_1(const U8x32 &a, const U8x32 &b) { VECTOR_EXTRACT_GENERATOR_256_U8(1) }
  3596. U8x32 inline vectorExtract_2(const U8x32 &a, const U8x32 &b) { VECTOR_EXTRACT_GENERATOR_256_U8(2) }
  3597. U8x32 inline vectorExtract_3(const U8x32 &a, const U8x32 &b) { VECTOR_EXTRACT_GENERATOR_256_U8(3) }
  3598. U8x32 inline vectorExtract_4(const U8x32 &a, const U8x32 &b) { VECTOR_EXTRACT_GENERATOR_256_U8(4) }
  3599. U8x32 inline vectorExtract_5(const U8x32 &a, const U8x32 &b) { VECTOR_EXTRACT_GENERATOR_256_U8(5) }
  3600. U8x32 inline vectorExtract_6(const U8x32 &a, const U8x32 &b) { VECTOR_EXTRACT_GENERATOR_256_U8(6) }
  3601. U8x32 inline vectorExtract_7(const U8x32 &a, const U8x32 &b) { VECTOR_EXTRACT_GENERATOR_256_U8(7) }
  3602. U8x32 inline vectorExtract_8(const U8x32 &a, const U8x32 &b) { VECTOR_EXTRACT_GENERATOR_256_U8(8) }
  3603. U8x32 inline vectorExtract_9(const U8x32 &a, const U8x32 &b) { VECTOR_EXTRACT_GENERATOR_256_U8(9) }
  3604. U8x32 inline vectorExtract_10(const U8x32 &a, const U8x32 &b) { VECTOR_EXTRACT_GENERATOR_256_U8(10) }
  3605. U8x32 inline vectorExtract_11(const U8x32 &a, const U8x32 &b) { VECTOR_EXTRACT_GENERATOR_256_U8(11) }
  3606. U8x32 inline vectorExtract_12(const U8x32 &a, const U8x32 &b) { VECTOR_EXTRACT_GENERATOR_256_U8(12) }
  3607. U8x32 inline vectorExtract_13(const U8x32 &a, const U8x32 &b) { VECTOR_EXTRACT_GENERATOR_256_U8(13) }
  3608. U8x32 inline vectorExtract_14(const U8x32 &a, const U8x32 &b) { VECTOR_EXTRACT_GENERATOR_256_U8(14) }
  3609. U8x32 inline vectorExtract_15(const U8x32 &a, const U8x32 &b) { VECTOR_EXTRACT_GENERATOR_256_U8(15) }
  3610. U8x32 inline vectorExtract_16(const U8x32 &a, const U8x32 &b) { VECTOR_EXTRACT_GENERATOR_256_U8(16) }
  3611. U8x32 inline vectorExtract_17(const U8x32 &a, const U8x32 &b) { VECTOR_EXTRACT_GENERATOR_256_U8(17) }
  3612. U8x32 inline vectorExtract_18(const U8x32 &a, const U8x32 &b) { VECTOR_EXTRACT_GENERATOR_256_U8(18) }
  3613. U8x32 inline vectorExtract_19(const U8x32 &a, const U8x32 &b) { VECTOR_EXTRACT_GENERATOR_256_U8(19) }
  3614. U8x32 inline vectorExtract_20(const U8x32 &a, const U8x32 &b) { VECTOR_EXTRACT_GENERATOR_256_U8(20) }
  3615. U8x32 inline vectorExtract_21(const U8x32 &a, const U8x32 &b) { VECTOR_EXTRACT_GENERATOR_256_U8(21) }
  3616. U8x32 inline vectorExtract_22(const U8x32 &a, const U8x32 &b) { VECTOR_EXTRACT_GENERATOR_256_U8(22) }
  3617. U8x32 inline vectorExtract_23(const U8x32 &a, const U8x32 &b) { VECTOR_EXTRACT_GENERATOR_256_U8(23) }
  3618. U8x32 inline vectorExtract_24(const U8x32 &a, const U8x32 &b) { VECTOR_EXTRACT_GENERATOR_256_U8(24) }
  3619. U8x32 inline vectorExtract_25(const U8x32 &a, const U8x32 &b) { VECTOR_EXTRACT_GENERATOR_256_U8(25) }
  3620. U8x32 inline vectorExtract_26(const U8x32 &a, const U8x32 &b) { VECTOR_EXTRACT_GENERATOR_256_U8(26) }
  3621. U8x32 inline vectorExtract_27(const U8x32 &a, const U8x32 &b) { VECTOR_EXTRACT_GENERATOR_256_U8(27) }
  3622. U8x32 inline vectorExtract_28(const U8x32 &a, const U8x32 &b) { VECTOR_EXTRACT_GENERATOR_256_U8(28) }
  3623. U8x32 inline vectorExtract_29(const U8x32 &a, const U8x32 &b) { VECTOR_EXTRACT_GENERATOR_256_U8(29) }
  3624. U8x32 inline vectorExtract_30(const U8x32 &a, const U8x32 &b) { VECTOR_EXTRACT_GENERATOR_256_U8(30) }
  3625. U8x32 inline vectorExtract_31(const U8x32 &a, const U8x32 &b) { VECTOR_EXTRACT_GENERATOR_256_U8(31) }
  3626. U8x32 inline vectorExtract_32(const U8x32 &a, const U8x32 &b) { return b; }
  3627. U16x16 inline vectorExtract_0(const U16x16 &a, const U16x16 &b) { return a; }
  3628. U16x16 inline vectorExtract_1(const U16x16 &a, const U16x16 &b) { VECTOR_EXTRACT_GENERATOR_256_U16(1) }
  3629. U16x16 inline vectorExtract_2(const U16x16 &a, const U16x16 &b) { VECTOR_EXTRACT_GENERATOR_256_U16(2) }
  3630. U16x16 inline vectorExtract_3(const U16x16 &a, const U16x16 &b) { VECTOR_EXTRACT_GENERATOR_256_U16(3) }
  3631. U16x16 inline vectorExtract_4(const U16x16 &a, const U16x16 &b) { VECTOR_EXTRACT_GENERATOR_256_U16(4) }
  3632. U16x16 inline vectorExtract_5(const U16x16 &a, const U16x16 &b) { VECTOR_EXTRACT_GENERATOR_256_U16(5) }
  3633. U16x16 inline vectorExtract_6(const U16x16 &a, const U16x16 &b) { VECTOR_EXTRACT_GENERATOR_256_U16(6) }
  3634. U16x16 inline vectorExtract_7(const U16x16 &a, const U16x16 &b) { VECTOR_EXTRACT_GENERATOR_256_U16(7) }
  3635. U16x16 inline vectorExtract_8(const U16x16 &a, const U16x16 &b) { VECTOR_EXTRACT_GENERATOR_256_U16(8) }
  3636. U16x16 inline vectorExtract_9(const U16x16 &a, const U16x16 &b) { VECTOR_EXTRACT_GENERATOR_256_U16(9) }
  3637. U16x16 inline vectorExtract_10(const U16x16 &a, const U16x16 &b) { VECTOR_EXTRACT_GENERATOR_256_U16(10) }
  3638. U16x16 inline vectorExtract_11(const U16x16 &a, const U16x16 &b) { VECTOR_EXTRACT_GENERATOR_256_U16(11) }
  3639. U16x16 inline vectorExtract_12(const U16x16 &a, const U16x16 &b) { VECTOR_EXTRACT_GENERATOR_256_U16(12) }
  3640. U16x16 inline vectorExtract_13(const U16x16 &a, const U16x16 &b) { VECTOR_EXTRACT_GENERATOR_256_U16(13) }
  3641. U16x16 inline vectorExtract_14(const U16x16 &a, const U16x16 &b) { VECTOR_EXTRACT_GENERATOR_256_U16(14) }
  3642. U16x16 inline vectorExtract_15(const U16x16 &a, const U16x16 &b) { VECTOR_EXTRACT_GENERATOR_256_U16(15) }
  3643. U16x16 inline vectorExtract_16(const U16x16 &a, const U16x16 &b) { return b; }
  3644. U32x8 inline vectorExtract_0(const U32x8 &a, const U32x8 &b) { return a; }
  3645. U32x8 inline vectorExtract_1(const U32x8 &a, const U32x8 &b) { VECTOR_EXTRACT_GENERATOR_256_U32(1) }
  3646. U32x8 inline vectorExtract_2(const U32x8 &a, const U32x8 &b) { VECTOR_EXTRACT_GENERATOR_256_U32(2) }
  3647. U32x8 inline vectorExtract_3(const U32x8 &a, const U32x8 &b) { VECTOR_EXTRACT_GENERATOR_256_U32(3) }
  3648. U32x8 inline vectorExtract_4(const U32x8 &a, const U32x8 &b) { VECTOR_EXTRACT_GENERATOR_256_U32(4) }
  3649. U32x8 inline vectorExtract_5(const U32x8 &a, const U32x8 &b) { VECTOR_EXTRACT_GENERATOR_256_U32(5) }
  3650. U32x8 inline vectorExtract_6(const U32x8 &a, const U32x8 &b) { VECTOR_EXTRACT_GENERATOR_256_U32(6) }
  3651. U32x8 inline vectorExtract_7(const U32x8 &a, const U32x8 &b) { VECTOR_EXTRACT_GENERATOR_256_U32(7) }
  3652. U32x8 inline vectorExtract_8(const U32x8 &a, const U32x8 &b) { return b; }
  3653. I32x8 inline vectorExtract_0(const I32x8 &a, const I32x8 &b) { return a; }
  3654. I32x8 inline vectorExtract_1(const I32x8 &a, const I32x8 &b) { VECTOR_EXTRACT_GENERATOR_256_I32(1) }
  3655. I32x8 inline vectorExtract_2(const I32x8 &a, const I32x8 &b) { VECTOR_EXTRACT_GENERATOR_256_I32(2) }
  3656. I32x8 inline vectorExtract_3(const I32x8 &a, const I32x8 &b) { VECTOR_EXTRACT_GENERATOR_256_I32(3) }
  3657. I32x8 inline vectorExtract_4(const I32x8 &a, const I32x8 &b) { VECTOR_EXTRACT_GENERATOR_256_I32(4) }
  3658. I32x8 inline vectorExtract_5(const I32x8 &a, const I32x8 &b) { VECTOR_EXTRACT_GENERATOR_256_I32(5) }
  3659. I32x8 inline vectorExtract_6(const I32x8 &a, const I32x8 &b) { VECTOR_EXTRACT_GENERATOR_256_I32(6) }
  3660. I32x8 inline vectorExtract_7(const I32x8 &a, const I32x8 &b) { VECTOR_EXTRACT_GENERATOR_256_I32(7) }
  3661. I32x8 inline vectorExtract_8(const I32x8 &a, const I32x8 &b) { return b; }
  3662. F32x8 inline vectorExtract_0(const F32x8 &a, const F32x8 &b) { return a; }
  3663. F32x8 inline vectorExtract_1(const F32x8 &a, const F32x8 &b) { VECTOR_EXTRACT_GENERATOR_256_F32(1) }
  3664. F32x8 inline vectorExtract_2(const F32x8 &a, const F32x8 &b) { VECTOR_EXTRACT_GENERATOR_256_F32(2) }
  3665. F32x8 inline vectorExtract_3(const F32x8 &a, const F32x8 &b) { VECTOR_EXTRACT_GENERATOR_256_F32(3) }
  3666. F32x8 inline vectorExtract_4(const F32x8 &a, const F32x8 &b) { VECTOR_EXTRACT_GENERATOR_256_F32(4) }
  3667. F32x8 inline vectorExtract_5(const F32x8 &a, const F32x8 &b) { VECTOR_EXTRACT_GENERATOR_256_F32(5) }
  3668. F32x8 inline vectorExtract_6(const F32x8 &a, const F32x8 &b) { VECTOR_EXTRACT_GENERATOR_256_F32(6) }
  3669. F32x8 inline vectorExtract_7(const F32x8 &a, const F32x8 &b) { VECTOR_EXTRACT_GENERATOR_256_F32(7) }
  3670. F32x8 inline vectorExtract_8(const F32x8 &a, const F32x8 &b) { return b; }
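// Illustrative sketch (not part of the library): getting the neighbors of every lane for a sliding window,
// as described in the comment above the definitions. The function names are only for this example.
static inline U32x8 example_rightNeighbors(const U32x8 &current, const U32x8 &next) {
	// Lane i of the result is the element one step to the right of lane i in current.
	return vectorExtract_1(current, next);
}
static inline U32x8 example_leftNeighbors(const U32x8 &previous, const U32x8 &current) {
	// Lane i of the result is the element one step to the left of lane i in current.
	return vectorExtract_7(previous, current);
}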
  3671. // Gather instructions load memory from a pointer at multiple index offsets at the same time.
  3672. // The given pointers should be aligned with 4 bytes, so that the fallback solution works on machines with strict alignment requirements.
  3673. #if defined(USE_AVX2)
  3674. #define GATHER_I32x8_AVX2(SOURCE, EIGHT_OFFSETS, SCALE) _mm256_i32gather_epi32((const int32_t*)(SOURCE), EIGHT_OFFSETS, SCALE)
  3675. #define GATHER_U32x8_AVX2(SOURCE, EIGHT_OFFSETS, SCALE) _mm256_i32gather_epi32((const int32_t*)(SOURCE), EIGHT_OFFSETS, SCALE)
  3676. #define GATHER_F32x8_AVX2(SOURCE, EIGHT_OFFSETS, SCALE) _mm256_i32gather_ps((const float*)(SOURCE), EIGHT_OFFSETS, SCALE)
  3677. #endif
  3678. static inline U32x8 gather_U32(dsr::SafePointer<const uint32_t> data, const U32x8 &elementOffset) {
  3679. #ifdef SAFE_POINTER_CHECKS
  3680. ALIGN32 uint32_t elementOffsets[8];
  3681. if (uintptr_t((void*)elementOffsets) & 31u) { throwError(U"Unaligned stack memory detected in 256-bit gather_U32!\n"); }
  3682. elementOffset.writeAlignedUnsafe(elementOffsets);
  3683. data.assertInside("U32x4 gather_U32 lane 0", (data + elementOffsets[0]).getUnchecked());
  3684. data.assertInside("U32x4 gather_U32 lane 1", (data + elementOffsets[1]).getUnchecked());
  3685. data.assertInside("U32x4 gather_U32 lane 2", (data + elementOffsets[2]).getUnchecked());
  3686. data.assertInside("U32x4 gather_U32 lane 3", (data + elementOffsets[3]).getUnchecked());
  3687. data.assertInside("U32x4 gather_U32 lane 4", (data + elementOffsets[4]).getUnchecked());
  3688. data.assertInside("U32x4 gather_U32 lane 5", (data + elementOffsets[5]).getUnchecked());
  3689. data.assertInside("U32x4 gather_U32 lane 6", (data + elementOffsets[6]).getUnchecked());
  3690. data.assertInside("U32x4 gather_U32 lane 7", (data + elementOffsets[7]).getUnchecked());
  3691. #endif
  3692. #if defined(USE_AVX2)
return U32x8(GATHER_U32x8_AVX2(data.getUnsafe(), elementOffset.v, 4));
  3694. #else
  3695. #ifndef SAFE_POINTER_CHECKS
  3696. ALIGN32 uint32_t elementOffsets[8];
  3697. elementOffset.writeAlignedUnsafe(elementOffsets);
  3698. #endif
  3699. return U32x8(
  3700. *(data + elementOffsets[0]),
  3701. *(data + elementOffsets[1]),
  3702. *(data + elementOffsets[2]),
  3703. *(data + elementOffsets[3]),
  3704. *(data + elementOffsets[4]),
  3705. *(data + elementOffsets[5]),
  3706. *(data + elementOffsets[6]),
  3707. *(data + elementOffsets[7])
  3708. );
  3709. #endif
  3710. }
  3711. static inline I32x8 gather_I32(dsr::SafePointer<const int32_t> data, const U32x8 &elementOffset) {
  3712. #ifdef SAFE_POINTER_CHECKS
  3713. ALIGN32 uint32_t elementOffsets[8];
  3714. if (uintptr_t((void*)elementOffsets) & 31u) { throwError(U"Unaligned stack memory detected in 256-bit gather_I32!\n"); }
  3715. elementOffset.writeAlignedUnsafe(elementOffsets);
  3716. data.assertInside("I32x4 gather_I32 lane 0", (data + elementOffsets[0]).getUnchecked());
  3717. data.assertInside("I32x4 gather_I32 lane 1", (data + elementOffsets[1]).getUnchecked());
  3718. data.assertInside("I32x4 gather_I32 lane 2", (data + elementOffsets[2]).getUnchecked());
  3719. data.assertInside("I32x4 gather_I32 lane 3", (data + elementOffsets[3]).getUnchecked());
  3720. data.assertInside("I32x4 gather_I32 lane 4", (data + elementOffsets[4]).getUnchecked());
  3721. data.assertInside("I32x4 gather_I32 lane 5", (data + elementOffsets[5]).getUnchecked());
  3722. data.assertInside("I32x4 gather_I32 lane 6", (data + elementOffsets[6]).getUnchecked());
  3723. data.assertInside("I32x4 gather_I32 lane 7", (data + elementOffsets[7]).getUnchecked());
  3724. #endif
  3725. #if defined(USE_AVX2)
return I32x8(GATHER_I32x8_AVX2(data.getUnsafe(), elementOffset.v, 4));
  3727. #else
  3728. #ifndef SAFE_POINTER_CHECKS
  3729. ALIGN32 uint32_t elementOffsets[8];
  3730. elementOffset.writeAlignedUnsafe(elementOffsets);
  3731. #endif
  3732. return I32x8(
  3733. *(data + elementOffsets[0]),
  3734. *(data + elementOffsets[1]),
  3735. *(data + elementOffsets[2]),
  3736. *(data + elementOffsets[3]),
  3737. *(data + elementOffsets[4]),
  3738. *(data + elementOffsets[5]),
  3739. *(data + elementOffsets[6]),
  3740. *(data + elementOffsets[7])
  3741. );
  3742. #endif
  3743. }
  3744. static inline F32x8 gather_F32(dsr::SafePointer<const float> data, const U32x8 &elementOffset) {
  3745. #ifdef SAFE_POINTER_CHECKS
  3746. ALIGN32 uint32_t elementOffsets[8];
  3747. if (uintptr_t((void*)elementOffsets) & 31u) { throwError(U"Unaligned stack memory detected in 256-bit gather_F32!\n"); }
  3748. elementOffset.writeAlignedUnsafe(elementOffsets);
  3749. data.assertInside("F32x4 gather_F32 lane 0", (data + elementOffsets[0]).getUnchecked());
  3750. data.assertInside("F32x4 gather_F32 lane 1", (data + elementOffsets[1]).getUnchecked());
  3751. data.assertInside("F32x4 gather_F32 lane 2", (data + elementOffsets[2]).getUnchecked());
  3752. data.assertInside("F32x4 gather_F32 lane 3", (data + elementOffsets[3]).getUnchecked());
  3753. data.assertInside("F32x4 gather_I32 lane 4", (data + elementOffsets[4]).getUnchecked());
  3754. data.assertInside("F32x4 gather_F32 lane 5", (data + elementOffsets[5]).getUnchecked());
  3755. data.assertInside("F32x4 gather_F32 lane 6", (data + elementOffsets[6]).getUnchecked());
  3756. data.assertInside("F32x4 gather_F32 lane 7", (data + elementOffsets[7]).getUnchecked());
  3757. #endif
  3758. #if defined(USE_AVX2)
  3759. return F32x8(GATHER_F32x8_AVX2(data.getUnsafe(), elementOffset.v, 4));
  3760. #else
  3761. #ifndef SAFE_POINTER_CHECKS
  3762. ALIGN32 uint32_t elementOffsets[8];
  3763. elementOffset.writeAlignedUnsafe(elementOffsets);
  3764. #endif
  3765. return F32x8(
  3766. *(data + elementOffsets[0]),
  3767. *(data + elementOffsets[1]),
  3768. *(data + elementOffsets[2]),
  3769. *(data + elementOffsets[3]),
  3770. *(data + elementOffsets[4]),
  3771. *(data + elementOffsets[5]),
  3772. *(data + elementOffsets[6]),
  3773. *(data + elementOffsets[7])
  3774. );
  3775. #endif
  3776. }
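// Illustrative sketch (not part of the library): using gather_U32 as an eight lane table lookup,
// where each lane i of the result becomes table[indices[i]]. The caller is responsible for keeping the
// indices inside the table, which is verified by SAFE_POINTER_CHECKS in debug builds.
static inline U32x8 example_lookupTable(dsr::SafePointer<const uint32_t> table, const U32x8 &indices) {
	return gather_U32(table, indices);
}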
  3777. // TODO: Move to noSimd.h using SFINAE.
  3778. // Wrapper functions for explicitly expanding scalars into vectors during math operations.
  3779. #define NUMERICAL_SCALAR_OPERATIONS(VECTOR_TYPE, ELEMENT_TYPE, LANE_COUNT) \
  3780. inline VECTOR_TYPE operator+(const VECTOR_TYPE& left, ELEMENT_TYPE right) { return left + VECTOR_TYPE(right); } \
  3781. inline VECTOR_TYPE operator+(ELEMENT_TYPE left, const VECTOR_TYPE& right) { return VECTOR_TYPE(left) + right; } \
  3782. inline VECTOR_TYPE operator-(const VECTOR_TYPE& left, ELEMENT_TYPE right) { return left - VECTOR_TYPE(right); } \
  3783. inline VECTOR_TYPE operator-(ELEMENT_TYPE left, const VECTOR_TYPE& right) { return VECTOR_TYPE(left) - right; }
  3784. FOR_ALL_VECTOR_TYPES(NUMERICAL_SCALAR_OPERATIONS)
  3785. #undef NUMERICAL_SCALAR_OPERATIONS
  3786. // TODO: Move to noSimd.h using SFINAE.
  3787. #define MULTIPLY_SCALAR_OPERATIONS(VECTOR_TYPE, ELEMENT_TYPE, LANE_COUNT) \
  3788. inline VECTOR_TYPE operator*(const VECTOR_TYPE& left, ELEMENT_TYPE right) { return left * VECTOR_TYPE(right); } \
  3789. inline VECTOR_TYPE operator*(ELEMENT_TYPE left, const VECTOR_TYPE& right) { return VECTOR_TYPE(left) * right; }
  3790. // TODO: Implement multiplication for U8x16 and U8x32.
  3791. //FOR_ALL_VECTOR_TYPES(MULTIPLY_SCALAR_OPERATIONS)
  3792. MULTIPLY_SCALAR_OPERATIONS(F32x4, float, 4)
  3793. MULTIPLY_SCALAR_OPERATIONS(F32x8, float, 8)
  3794. MULTIPLY_SCALAR_OPERATIONS(U32x4, uint32_t, 4)
  3795. MULTIPLY_SCALAR_OPERATIONS(U32x8, uint32_t, 8)
  3796. MULTIPLY_SCALAR_OPERATIONS(I32x4, int32_t, 4)
  3797. MULTIPLY_SCALAR_OPERATIONS(I32x8, int32_t, 8)
  3798. MULTIPLY_SCALAR_OPERATIONS(U16x8, uint16_t, 8)
  3799. MULTIPLY_SCALAR_OPERATIONS(U16x16, uint16_t, 16)
  3800. #undef MULTIPLY_SCALAR_OPERATIONS
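// Illustrative sketch (not part of the library): with the wrappers above, scalar constants can be mixed
// directly with vectors instead of writing out F32x8(2.0f) and F32x8(1.0f) explicitly.
static inline F32x8 example_scaleAndBias(const F32x8 &x) {
	// Each lane becomes x * 2 + 1.
	return x * 2.0f + 1.0f;
}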
  3801. // TODO: Move to noSimd.h using SFINAE.
  3802. // Wrapper functions for explicitly duplicating bit masks into the same lane count.
  3803. #define BITWISE_SCALAR_OPERATIONS(VECTOR_TYPE, ELEMENT_TYPE, LANE_COUNT) \
  3804. inline VECTOR_TYPE operator&(const VECTOR_TYPE& left, ELEMENT_TYPE right) { return left & VECTOR_TYPE(right); } \
  3805. inline VECTOR_TYPE operator&(ELEMENT_TYPE left, const VECTOR_TYPE& right) { return VECTOR_TYPE(left) & right; } \
  3806. inline VECTOR_TYPE operator|(const VECTOR_TYPE& left, ELEMENT_TYPE right) { return left | VECTOR_TYPE(right); } \
  3807. inline VECTOR_TYPE operator|(ELEMENT_TYPE left, const VECTOR_TYPE& right) { return VECTOR_TYPE(left) | right; } \
  3808. inline VECTOR_TYPE operator^(const VECTOR_TYPE& left, ELEMENT_TYPE right) { return left ^ VECTOR_TYPE(right); } \
  3809. inline VECTOR_TYPE operator^(ELEMENT_TYPE left, const VECTOR_TYPE& right) { return VECTOR_TYPE(left) ^ right; }
  3810. // TODO: Implement bitwise operations for all unsigned SIMD vectors.
  3811. //FOR_UNSIGNED_VECTOR_TYPES(BITWISE_SCALAR_OPERATIONS)
  3812. BITWISE_SCALAR_OPERATIONS(U32x4, uint32_t, 4)
  3813. BITWISE_SCALAR_OPERATIONS(U32x8, uint32_t, 8)
  3814. #undef BITWISE_SCALAR_OPERATIONS
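// Illustrative sketch (not part of the library): masking out everything but the lowest byte of each
// 32-bit lane, using a scalar mask that the wrappers above expand to all eight lanes.
static inline U32x8 example_lowestByte(const U32x8 &pixels) {
	return pixels & 255u;
}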
  3815. // Cleaning up temporary macro definitions to avoid cluttering the namespace.
  3816. #undef FOR_ALL_VECTOR_TYPES
  3817. #undef FOR_FLOAT_VECTOR_TYPES
  3818. #undef FOR_INTEGER_VECTOR_TYPES
  3819. #undef FOR_SIGNED_VECTOR_TYPES
  3820. #undef FOR_UNSIGNED_VECTOR_TYPES
  3821. // 1 / value
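// The hardware paths below start from a low precision estimate x0 of 1 / v and refine it with
// Newton-Raphson iteration, x1 = x0 * (2 - v * x0), which roughly doubles the number of correct bits
// per step. The SSE2 and AVX paths apply one step written as (x0 + x0) - v * x0 * x0, while the NEON
// path applies two steps using vrecpsq_f32, which returns 2 - v * x0.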
  3822. inline F32x4 reciprocal(const F32x4 &value) {
  3823. #if defined(USE_BASIC_SIMD)
  3824. #if defined(USE_SSE2)
  3825. // Approximate
  3826. ALIGN16 SIMD_F32x4 lowQS = _mm_rcp_ps(value.v);
  3827. F32x4 lowQ = F32x4(lowQS);
  3828. // Refine
  3829. return ((lowQ + lowQ) - (value * lowQ * lowQ));
  3830. #elif defined(USE_NEON)
  3831. // Approximate
  3832. ALIGN16 SIMD_F32x4 result = vrecpeq_f32(value.v);
  3833. // Refine
  3834. ALIGN16 SIMD_F32x4 a = vrecpsq_f32(value.v, result);
  3835. result = MUL_F32_SIMD(a, result);
  3836. return F32x4(MUL_F32_SIMD(vrecpsq_f32(value.v, result), result));
  3837. #else
  3838. #error "Missing F32x4 implementation of reciprocal!\n");
  3839. return F32x4(0);
  3840. #endif
  3841. #else
  3842. F32x4 one = F32x4(1.0f);
  3843. IMPL_SCALAR_REFERENCE_INFIX_4_LANES(one, value, F32x4, float, /)
  3844. #endif
  3845. }
  3846. // 1 / value
  3847. inline F32x8 reciprocal(const F32x8 &value) {
  3848. #if defined(USE_AVX2)
  3849. // Approximate
  3850. ALIGN32 SIMD_F32x8 lowQ = _mm256_rcp_ps(value.v);
  3851. // Refine
  3852. return F32x8(SUB_F32_SIMD256(ADD_F32_SIMD256(lowQ, lowQ), MUL_F32_SIMD256(value.v, MUL_F32_SIMD256(lowQ, lowQ))));
  3853. #else
  3854. F32x8 one = F32x8(1.0f);
  3855. IMPL_SCALAR_REFERENCE_INFIX_8_LANES(one, value, F32x8, float, /)
  3856. #endif
  3857. }
  3858. // 1 / sqrt(value)
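// The refinement used below is the Newton-Raphson step for 1 / sqrt(v):
//   x1 = 0.5 * x0 * (3 - v * x0 * x0)
// The SSE2 and AVX paths write it as (x0 * 0.5) * (3 - v * x0 * x0), while the NEON path gets the
// (3 - v * x0 * x0) / 2 factor from vrsqrtsq_f32.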
  3859. inline F32x4 reciprocalSquareRoot(const F32x4 &value) {
  3860. #if defined(USE_BASIC_SIMD)
  3861. #if defined(USE_SSE2)
  3862. ALIGN16 SIMD_F32x4 reciRootS = _mm_rsqrt_ps(value.v);
  3863. F32x4 reciRoot = F32x4(reciRootS);
  3864. F32x4 mul = value * reciRoot * reciRoot;
  3865. return (reciRoot * 0.5f) * (3.0f - mul);
  3866. #elif defined(USE_NEON)
  3867. // Approximate
  3868. ALIGN16 SIMD_F32x4 reciRoot = vrsqrteq_f32(value.v);
  3869. // Refine
  3870. ALIGN16 SIMD_F32x4 a = MUL_F32_SIMD(value.v, reciRoot);
  3871. ALIGN16 SIMD_F32x4 b = vrsqrtsq_f32(a, reciRoot);
  3872. ALIGN16 SIMD_F32x4 c = MUL_F32_SIMD(b, reciRoot);
  3873. return F32x4(c);
  3874. #else
  3875. static_assert(false, "Missing SIMD implementation of reciprocalSquareRoot!\n");
  3876. return F32x4(0);
  3877. #endif
  3878. #else
  3879. return F32x4(1.0f / sqrt(value.scalars[0]), 1.0f / sqrt(value.scalars[1]), 1.0f / sqrt(value.scalars[2]), 1.0f / sqrt(value.scalars[3]));
  3880. #endif
  3881. }
  3882. // 1 / sqrt(value)
  3883. inline F32x8 reciprocalSquareRoot(const F32x8 &value) {
  3884. #if defined(USE_AVX2)
  3885. ALIGN32 SIMD_F32x8 reciRootS = _mm256_rsqrt_ps(value.v);
  3886. F32x8 reciRoot = F32x8(reciRootS);
  3887. F32x8 mul = value * reciRoot * reciRoot;
  3888. return (reciRoot * 0.5f) * (3.0f - mul);
  3889. #else
  3890. return F32x8(
  3891. 1.0f / sqrt(value.scalars[0]),
  3892. 1.0f / sqrt(value.scalars[1]),
  3893. 1.0f / sqrt(value.scalars[2]),
  3894. 1.0f / sqrt(value.scalars[3]),
  3895. 1.0f / sqrt(value.scalars[4]),
  3896. 1.0f / sqrt(value.scalars[5]),
  3897. 1.0f / sqrt(value.scalars[6]),
  3898. 1.0f / sqrt(value.scalars[7])
  3899. );
  3900. #endif
  3901. }
  3902. // sqrt(value)
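// The SSE2 and AVX paths below take the hardware square root r and apply one Heron / Newton-Raphson step,
//   r1 = (r + v / r) * 0.5,
// before returning the result, while the NEON path computes sqrt(v) as v * (1 / sqrt(v)).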
  3903. inline F32x4 squareRoot(const F32x4 &value) {
  3904. #if defined(USE_BASIC_SIMD)
  3905. #if defined(USE_SSE2)
  3906. ALIGN16 SIMD_F32x4 half = _mm_set1_ps(0.5f);
  3907. // Approximate
  3908. ALIGN16 SIMD_F32x4 root = _mm_sqrt_ps(value.v);
  3909. // Refine
  3910. root = _mm_mul_ps(_mm_add_ps(root, _mm_div_ps(value.v, root)), half);
  3911. return F32x4(root);
  3912. #else
  3913. return reciprocalSquareRoot(value) * value;
  3914. #endif
  3915. #else
  3916. return F32x4(sqrt(value.scalars[0]), sqrt(value.scalars[1]), sqrt(value.scalars[2]), sqrt(value.scalars[3]));
  3917. #endif
  3918. }
  3919. // sqrt(value)
  3920. inline F32x8 squareRoot(const F32x8 &value) {
  3921. #if defined(USE_AVX2)
  3922. ALIGN32 SIMD_F32x8 half = _mm256_set1_ps(0.5f);
  3923. // Approximate
  3924. ALIGN32 SIMD_F32x8 root = _mm256_sqrt_ps(value.v);
  3925. // Refine
  3926. root = _mm256_mul_ps(_mm256_add_ps(root, _mm256_div_ps(value.v, root)), half);
  3927. return F32x8(root);
  3928. #else
  3929. return F32x8(
  3930. sqrt(value.scalars[0]),
  3931. sqrt(value.scalars[1]),
  3932. sqrt(value.scalars[2]),
  3933. sqrt(value.scalars[3]),
  3934. sqrt(value.scalars[4]),
  3935. sqrt(value.scalars[5]),
  3936. sqrt(value.scalars[6]),
  3937. sqrt(value.scalars[7]));
  3938. #endif
  3939. }
  3940. // TODO: Let SVE define completely separate types for dynamic vectors.
// The X vectors use the longest SIMD length that is efficient for both floating-point and integer types.
  3942. // DSR_DEFAULT_ALIGNMENT
  3943. // The number of bytes memory should be aligned with by default when creating buffers and images.
  3944. // F32xX
  3945. // The longest available SIMD vector for storing 32-bit float values. Iterating laneCountX_32Bit floats at a time.
  3946. // I32xX
  3947. // The longest available SIMD vector for storing signed 32-bit integer values. Iterating laneCountX_32Bit integers at a time.
  3948. // U32xX
  3949. // The longest available SIMD vector for storing unsigned 32-bit integer values. Iterating laneCountX_32Bit integers at a time.
  3950. // U16xX
  3951. // The longest available SIMD vector for storing unsigned 16-bit integer values. Iterating laneCountX_16Bit integers at a time.
  3952. // U8xX
  3953. // The longest available SIMD vector for storing unsigned 8-bit integer values. Iterating laneCountX_8Bit integers at a time.
  3954. #if defined(USE_256BIT_X_SIMD) || defined(EMULATE_256BIT_X_SIMD)
  3955. // Using 256-bit SIMD
  3956. #define DSR_DEFAULT_VECTOR_SIZE 32
  3957. #define DSR_DEFAULT_ALIGNMENT 32
  3958. using F32xX = F32x8;
  3959. using I32xX = I32x8;
  3960. using U32xX = U32x8;
  3961. using U16xX = U16x16;
  3962. using U8xX = U8x32;
// Align memory with 256 bits to allow overwriting the padding at the end of each pixel row.
// Otherwise every filter would need slow, duplicated code just to preserve the data at the end of each row.
  3965. #else
// If there is no hardware support for 256-bit vectors, emulated 256-bit vectors that are used explicitly only need to be aligned with 128 bits.
  3967. #define DSR_DEFAULT_VECTOR_SIZE 16
  3968. #define DSR_DEFAULT_ALIGNMENT 16
  3969. using F32xX = F32x4;
  3970. using I32xX = I32x4;
  3971. using U32xX = U32x4;
  3972. using U16xX = U16x8;
  3973. using U8xX = U8x16;
  3974. #endif
// How many lanes the longest available vector has for a given lane size.
  3976. // Used to iterate indices and pointers using whole elements.
  3977. static const int laneCountX_32Bit = DSR_DEFAULT_VECTOR_SIZE / 4;
  3978. static const int laneCountX_16Bit = DSR_DEFAULT_VECTOR_SIZE / 2;
  3979. static const int laneCountX_8Bit = DSR_DEFAULT_VECTOR_SIZE;
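// Illustrative sketch (not part of the library): iterating over a buffer one full X vector at a time.
// Assumes that data is aligned to DSR_DEFAULT_ALIGNMENT, that elementCount is a multiple of
// laneCountX_32Bit, and that the 128-bit U32x4 offers the same readAlignedUnsafe/writeAlignedUnsafe and
// scalar constructor as the U32x8 used earlier in this header, so that U32xX works on both paths.
static inline void example_addOneToAll(uint32_t *data, int elementCount) {
	for (int i = 0; i < elementCount; i += laneCountX_32Bit) {
		U32xX vector = U32xX::readAlignedUnsafe(data + i);
		U32xX result = vector + U32xX(1u);
		result.writeAlignedUnsafe(data + i);
	}
}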
  3980. // TODO: Let SVE define completely separate types for dynamic vectors.
// The F vectors use the longest SIMD length that is efficient when only processing float values, even if no integer types are available in the same size.
// Used when you know that your algorithm only works with float types and you need the extra performance.
// Some processors have AVX but not AVX2, meaning that they have 256-bit SIMD for floats but only 128-bit SIMD for integers.
  3984. // F32xF
  3985. // The longest available SIMD vector for storing 32-bit float values. Iterating laneCountF floats at a time.
  3986. #if defined(USE_256BIT_F_SIMD) || defined(EMULATE_256BIT_F_SIMD)
  3987. #define DSR_FLOAT_VECTOR_SIZE 32
  3988. #define DSR_FLOAT_ALIGNMENT 32
  3989. using F32xF = F32x8;
  3990. #else
  3991. // F vectors are 128-bits.
  3992. #define DSR_FLOAT_VECTOR_SIZE 16
  3993. #define DSR_FLOAT_ALIGNMENT 16
  3994. using F32xF = F32x4;
  3995. #endif
  3996. // Used to iterate over float pointers when using F32xF.
  3997. static const int laneCountF = DSR_FLOAT_VECTOR_SIZE / 4;
  3998. // Define traits.
  3999. DSR_APPLY_PROPERTY(DsrTrait_Any_U8 , U8x16)
  4000. DSR_APPLY_PROPERTY(DsrTrait_Any_U8 , U8x32)
  4001. DSR_APPLY_PROPERTY(DsrTrait_Any_U16, U16x8)
  4002. DSR_APPLY_PROPERTY(DsrTrait_Any_U16, U16x16)
  4003. DSR_APPLY_PROPERTY(DsrTrait_Any_U32, U32x4)
  4004. DSR_APPLY_PROPERTY(DsrTrait_Any_U32, U32x8)
  4005. DSR_APPLY_PROPERTY(DsrTrait_Any_I32, I32x4)
  4006. DSR_APPLY_PROPERTY(DsrTrait_Any_I32, I32x8)
  4007. DSR_APPLY_PROPERTY(DsrTrait_Any_F32, F32x4)
  4008. DSR_APPLY_PROPERTY(DsrTrait_Any_F32, F32x8)
  4009. DSR_APPLY_PROPERTY(DsrTrait_Any , U8x16)
  4010. DSR_APPLY_PROPERTY(DsrTrait_Any , U8x32)
  4011. DSR_APPLY_PROPERTY(DsrTrait_Any, U16x8)
  4012. DSR_APPLY_PROPERTY(DsrTrait_Any, U16x16)
  4013. DSR_APPLY_PROPERTY(DsrTrait_Any, U32x4)
  4014. DSR_APPLY_PROPERTY(DsrTrait_Any, U32x8)
  4015. DSR_APPLY_PROPERTY(DsrTrait_Any, I32x4)
  4016. DSR_APPLY_PROPERTY(DsrTrait_Any, I32x8)
  4017. DSR_APPLY_PROPERTY(DsrTrait_Any, F32x4)
  4018. DSR_APPLY_PROPERTY(DsrTrait_Any, F32x8)
  4019. // TODO: Use as independent types when the largest vector lengths are not known in compile time on ARM SVE.
  4020. //DSR_APPLY_PROPERTY(DsrTrait_Any_U8 , U8xX)
  4021. //DSR_APPLY_PROPERTY(DsrTrait_Any_U16, U16xX)
  4022. //DSR_APPLY_PROPERTY(DsrTrait_Any_U32, U32xX)
  4023. //DSR_APPLY_PROPERTY(DsrTrait_Any_I32, I32xX)
  4024. //DSR_APPLY_PROPERTY(DsrTrait_Any_F32, F32xX)
  4025. //DSR_APPLY_PROPERTY(DsrTrait_Any_F32, F32xF)
  4026. //DSR_APPLY_PROPERTY(DsrTrait_Any , U8xX)
  4027. //DSR_APPLY_PROPERTY(DsrTrait_Any, U16xX)
  4028. //DSR_APPLY_PROPERTY(DsrTrait_Any, U32xX)
  4029. //DSR_APPLY_PROPERTY(DsrTrait_Any, I32xX)
  4030. //DSR_APPLY_PROPERTY(DsrTrait_Any, F32xX)
  4031. //DSR_APPLY_PROPERTY(DsrTrait_Any, F32xF)
  4032. }
  4033. #endif