  1. // zlib open source license
  2. //
  3. // Copyright (c) 2017 to 2025 David Forsgren Piuva
  4. //
  5. // This software is provided 'as-is', without any express or implied
  6. // warranty. In no event will the authors be held liable for any damages
  7. // arising from the use of this software.
  8. //
  9. // Permission is granted to anyone to use this software for any purpose,
  10. // including commercial applications, and to alter it and redistribute it
  11. // freely, subject to the following restrictions:
  12. //
  13. // 1. The origin of this software must not be misrepresented; you must not
  14. // claim that you wrote the original software. If you use this software
  15. // in a product, an acknowledgment in the product documentation would be
  16. // appreciated but is not required.
  17. //
  18. // 2. Altered source versions must be plainly marked as such, and must not be
  19. // misrepresented as being the original software.
  20. //
  21. // 3. This notice may not be removed or altered from any source
  22. // distribution.
  23. // Hardware abstraction layer for portable SIMD math.
  24. // Used to make calculations faster without having to mess around with hardware-specific assembly code or intrinsic functions.
  25. // You get the performance you need today and the ability to compile with automatic scalar emulation when building for a future processor that this module has not yet been ported to.
  26. // When you can generate the vectorized code using the same template function as your non-vectorized code, you don't even need to write a reference implementation.
  27. // Using the 128-bit vector types: (beginner friendly, test once, compile anywhere, no compiler flags)
  28. // If you are new to vectorization or only plan to work with 128-bit extensions such as ARM NEON, you can keep it simple by only using the 128-bit vector types.
  29. // Pros and cons:
  30. // + Most target platforms (excluding older systems such as ARMv6) have 128-bit SIMD extensions such as Intel SSE2 or ARM NEON enabled by default.
  31. // ARMv6 does not support ARM NEON, but most ARMv7 processors do, so compilers enable NEON by default when targeting ARMv7.
  32. // All 64-bit ARM processors have ARM NEON, because it stopped being optional in ARMv8.
  33. // Builds for 64-bit Intel processors usually have SSE2 enabled by default, so you don't have to change any compiler flags when building on a different system.
  34. // + One build for all computers of the same instruction set.
  35. // Great when your application is not so resource heavy, because the least powerful systems don't have the fancy extensions anyway.
  36. // - You might end up enabling the additional SIMD extensions anyway, because the library already uses them to become faster.
  37. // Types: (see the commented example after this list)
  38. // * Use F32x4, I32x4 and U32x4 for 4 elements at a time
  39. // * U16x8 for 8 elements at a time
  40. // * U8x16 for 16 elements at a time
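// Example for the 128-bit types above (a minimal sketch): clamping every element of a float buffer with F32x4.
// The function name is hypothetical; readAlignedUnsafe, writeAlignedUnsafe and clamp are declared further down in this header.
// Assumes that count is a multiple of 4 and that data is aligned to 16 bytes.
//   void clampBuffer(float* data, int count) {
//       for (int i = 0; i < count; i += 4) {
//           dsr::F32x4 value = dsr::F32x4::readAlignedUnsafe(data + i);
//           value.clamp(0.0f, 1.0f).writeAlignedUnsafe(data + i);
//       }
//   }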
  41. // Using the X vector size: (advanced, requires testing with different build flags or emulation)
  42. // If you want more performance, you can use variable length type aliases.
  43. // Pros and cons:
  44. // + For heavy calculations where memory access is not the bottleneck, using larger SIMD vectors when enabled allows saving energy and increasing performance.
  45. // - If you forget to test with longer vector lengths (compiling with -mavx2 or defining the EMULATE_256BIT_X_SIMD macro), you might find bugs from not iterating or aligning memory correctly.
  46. // Types: (see the commented example after this list)
  47. // * Use F32xX, I32xX and U32xX for laneCountX_32Bit elements at a time
  48. // * U16xX for laneCountX_16Bit elements at a time
  49. // * U8xX for laneCountX_8Bit elements at a time
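// Example for the X aliases above (a minimal sketch): the same clamping loop written so that it scales with the vector length chosen at build time.
// The function name is hypothetical, and it is assumed that F32xX provides the same readAlignedUnsafe, writeAlignedUnsafe and clamp methods as the 128-bit types,
// that count is a multiple of laneCountX_32Bit, and that data is aligned for the X vector size.
//   void clampBufferX(float* data, int count) {
//       for (int i = 0; i < count; i += laneCountX_32Bit) {
//           dsr::F32xX value = dsr::F32xX::readAlignedUnsafe(data + i);
//           value.clamp(0.0f, 1.0f).writeAlignedUnsafe(data + i);
//       }
//   }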
  50. // Using the F vector size: (very dangerous, no test can confirm that memory alignment is correct)
  51. // If you want even more performance, you can let float operations use the longest available F vector size, which might exceed the X vector size.
  52. // Pros and cons:
  53. // - You have to manually set the alignment of buffers to DSR_FLOAT_ALIGNMENT to prevent crashes.
  54. // If the default buffer alignment changed based on the size of F vectors, the more commonly used X vectors would be slowed down by cache misses from padding larger than the X vector size.
  55. // AlignedImageF32 and sound backends are already aligned with the F vector size, because they are not generic like Buffer.
  56. // - It can be difficult to detect incorrect memory alignment, because a pointer can accidentally be aligned to more than what was requested.
  57. // If you accidentally align to 128 bits instead of 256 bits, there is a 50% chance that the mistake goes undetected at runtime, only to fail later on another computer.
  58. // If you stick with 128-bit or X vectors, all buffers will be correctly aligned automatically.
  59. // + For heavy calculations where memory access is not the bottleneck, using larger SIMD vectors when enabled allows saving energy and increasing performance.
  60. // - If you forget to test with longer vector lengths (compiling with -mavx2 or defining the EMULATE_256BIT_F_SIMD macro), you might find bugs from not iterating or aligning memory correctly.
  61. // Types: (see the commented example after this list)
  62. // * Use F32xF for laneCountF_32Bit elements at a time
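// Example for the F vector size (a sketch): giving a local array the alignment required for F vectors.
// This assumes that DSR_FLOAT_ALIGNMENT expands to an integer constant usable with the ALIGN_BYTES macro defined further down in this header.
//   float temporary[256] ALIGN_BYTES(DSR_FLOAT_ALIGNMENT);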
  63. // Compiler extensions
  64. // On Intel/AMD processors:
  65. // SSE2 is usually enabled by default, because SSE2 is mandatory for 64-bit Intel instructions.
  66. // Use -mavx as a G++ compiler flag to enable the AVX extension, enabling the USE_AVX and USE_256BIT_F_SIMD macros.
  67. // If not available on your computer, you can test your algorithm for 256-bit float SIMD using EMULATE_256BIT_F_SIMD, but only if you use the F vector size.
  68. // Use -mavx2 as a G++ compiler flag to enable the AVX2 extension, enabling the USE_AVX2 and USE_256BIT_X_SIMD macros.
  69. // If not available on your computer, you can test your algorithm for 256-bit float and integer SIMD using EMULATE_256BIT_X_SIMD, but only if you use the X vector size.
  70. // On ARMv6 processors:
  71. // Scalar emulation is used when compiling for ARMv6, because it does not have NEON and VFP is not supported in this abstraction.
  72. // On ARMv7 processors:
  73. // NEON is usually enabled by default for ARMv7, because most ARMv7 processors have the extension.
  74. // On ARMv8 processors:
  75. // NEON cannot be disabled, because it is mandatory in ARMv8.
  76. #ifndef DFPSR_SIMD
  77. #define DFPSR_SIMD
  78. #include <cstdint>
  79. #include <cassert>
  80. #include "SafePointer.h"
  81. #include "../math/FVector.h"
  82. #include "../math/IVector.h"
  83. #include "../math/UVector.h"
  84. #include "DsrTraits.h"
  85. #include "../settings.h"
  86. #include "../base/noSimd.h"
  87. // Alignment in bytes
  88. #define ALIGN_BYTES(SIZE) __attribute__((aligned(SIZE)))
  89. #define ALIGN16 ALIGN_BYTES(16) // 128-bit alignment
  90. #define ALIGN32 ALIGN_BYTES(32) // 256-bit alignment
  91. #define ALIGN64 ALIGN_BYTES(64) // 512-bit alignment
  92. #define ALIGN128 ALIGN_BYTES(128) // 1024-bit alignment
  93. #define ALIGN256 ALIGN_BYTES(256) // 2048-bit alignment
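// Example: a 16-byte aligned local array, matching how these macros are used for the NEON loads later in this file.
//   float data[4] ALIGN16 = {0.0f, 1.0f, 2.0f, 3.0f};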
  94. namespace dsr {
  95. // Everything declared in here handles things specific to SSE.
  96. // Direct use of the macros will not provide portability to all hardware.
  97. #ifdef USE_SSE2
  98. #define USE_DIRECT_SIMD_MEMORY_ACCESS
  99. #include <emmintrin.h> // SSE2
  100. #ifdef USE_SSSE3
  101. #include <tmmintrin.h> // SSSE3
  102. #endif
  103. #ifdef USE_AVX
  104. #include <immintrin.h> // AVX / AVX2
  105. #endif
  106. // Vector types
  107. #define SIMD_F32x4 __m128
  108. #define SIMD_U8x16 __m128i
  109. #define SIMD_U16x8 __m128i
  110. #define SIMD_U32x4 __m128i
  111. #define SIMD_I32x4 __m128i
  112. // Vector uploads in address order
  113. #define LOAD_VECTOR_F32_SIMD(A, B, C, D) _mm_set_ps(D, C, B, A)
  114. #define LOAD_SCALAR_F32_SIMD(A) _mm_set1_ps(A)
  115. #define LOAD_VECTOR_U8_SIMD(A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P) _mm_set_epi8(P, O, N, M, L, K, J, I, H, G, F, E, D, C, B, A)
  116. #define LOAD_SCALAR_U8_SIMD(A) _mm_set1_epi8(A)
  117. #define LOAD_VECTOR_U16_SIMD(A, B, C, D, E, F, G, H) _mm_set_epi16(H, G, F, E, D, C, B, A)
  118. #define LOAD_SCALAR_U16_SIMD(A) _mm_set1_epi16(A)
  119. #define LOAD_VECTOR_U32_SIMD(A, B, C, D) _mm_set_epi32(D, C, B, A)
  120. #define LOAD_SCALAR_U32_SIMD(A) _mm_set1_epi32(A)
  121. #define LOAD_VECTOR_I32_SIMD(A, B, C, D) _mm_set_epi32(D, C, B, A)
  122. #define LOAD_SCALAR_I32_SIMD(A) _mm_set1_epi32(A)
  123. // Conversions
  124. #define F32_TO_I32_SIMD(A) _mm_cvttps_epi32(A)
  125. #define F32_TO_U32_SIMD(A) _mm_cvttps_epi32(A)
  126. #define I32_TO_F32_SIMD(A) _mm_cvtepi32_ps(A)
  127. #define U32_TO_F32_SIMD(A) _mm_cvtepi32_ps(A)
  128. // Unpacking conversions
  129. #define U8_LOW_TO_U16_SIMD(A) _mm_unpacklo_epi8(A, _mm_set1_epi8(0))
  130. #define U8_HIGH_TO_U16_SIMD(A) _mm_unpackhi_epi8(A, _mm_set1_epi8(0))
  131. #define U16_LOW_TO_U32_SIMD(A) _mm_unpacklo_epi16(A, _mm_set1_epi16(0))
  132. #define U16_HIGH_TO_U32_SIMD(A) _mm_unpackhi_epi16(A, _mm_set1_epi16(0))
  133. // Saturated packing
  134. // Credit: Using ideas from Victoria Zhislina's NEON_2_SSE.h header from the Intel corporation, but not trying to emulate NEON
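// _mm_packus_epi16 treats its input as signed 16-bit values, so an input with the sign bit set would be seen as negative and pack to zero.
// The mask and signed compare below clamp such values to 0x7FFF before packing, which still saturates to 255, giving the unsigned behavior we want.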
  135. inline SIMD_U8x16 PACK_SAT_U16_TO_U8_SIMD(const SIMD_U16x8& a, const SIMD_U16x8& b) {
  136. SIMD_U16x8 mask, a2, b2;
  137. mask = _mm_set1_epi16(0b0111111111111111);
  138. a2 = _mm_and_si128(a, mask);
  139. a2 = _mm_or_si128(a2, _mm_and_si128(_mm_cmpgt_epi16(a2, a), mask));
  140. b2 = _mm_and_si128(b, mask);
  141. b2 = _mm_or_si128(b2, _mm_and_si128(_mm_cmpgt_epi16(b2, b), mask));
  142. return _mm_packus_epi16(a2, b2);
  143. }
  144. // Reinterpret casting
  145. #define REINTERPRET_U32_TO_U8_SIMD(A) (A)
  146. #define REINTERPRET_U32_TO_U16_SIMD(A) (A)
  147. #define REINTERPRET_U8_TO_U32_SIMD(A) (A)
  148. #define REINTERPRET_U16_TO_U32_SIMD(A) (A)
  149. #define REINTERPRET_U32_TO_I32_SIMD(A) (A)
  150. #define REINTERPRET_I32_TO_U32_SIMD(A) (A)
  151. // Vector float operations returning SIMD_F32x4
  152. #define ADD_F32_SIMD(A, B) _mm_add_ps(A, B)
  153. #define SUB_F32_SIMD(A, B) _mm_sub_ps(A, B)
  154. #define MUL_F32_SIMD(A, B) _mm_mul_ps(A, B)
  155. // Vector integer operations returning SIMD_I32x4
  156. #define ADD_I32_SIMD(A, B) _mm_add_epi32(A, B)
  157. #define SUB_I32_SIMD(A, B) _mm_sub_epi32(A, B)
  158. // 32-bit integer multiplications are not available on SSE2.
  159. // Vector integer operations returning SIMD_U32x4
  160. #define ADD_U32_SIMD(A, B) _mm_add_epi32(A, B)
  161. #define SUB_U32_SIMD(A, B) _mm_sub_epi32(A, B)
  162. // 32-bit integer multiplications are not available on SSE2.
  163. // Vector integer operations returning SIMD_U16x8
  164. #define ADD_U16_SIMD(A, B) _mm_add_epi16(A, B)
  165. #define SUB_U16_SIMD(A, B) _mm_sub_epi16(A, B)
  166. #define MUL_U16_SIMD(A, B) _mm_mullo_epi16(A, B)
  167. // Vector integer operations returning SIMD_U8x16
  168. #define ADD_U8_SIMD(A, B) _mm_add_epi8(A, B)
  169. #define ADD_SAT_U8_SIMD(A, B) _mm_adds_epu8(A, B) // Saturated addition
  170. #define SUB_U8_SIMD(A, B) _mm_sub_epi8(A, B)
  171. #define SUB_SAT_U8_SIMD(A, B) _mm_subs_epu8(A, B) // Saturated subtraction
  172. // No 8-bit multiplications
  173. // Statistics
  174. #define MIN_F32_SIMD(A, B) _mm_min_ps(A, B)
  175. #define MAX_F32_SIMD(A, B) _mm_max_ps(A, B)
  176. // TODO: Implement minimum and maximum for integer vectors, so that all operations exist for all applicable types:
  177. // Using _mm256_max_epu16... in AVX2 for 256-bit versions
  178. // Using comparisons and masking in SSE2 when _mm_max_epu16... in SSE4.1 is not available
  179. // Bitwise
  180. #define BITWISE_AND_U32_SIMD(A, B) _mm_and_si128(A, B)
  181. #define BITWISE_OR_U32_SIMD(A, B) _mm_or_si128(A, B)
  182. #define BITWISE_XOR_U32_SIMD(A, B) _mm_xor_si128(A, B)
  183. #ifdef USE_AVX
  184. // 256-bit vector types
  185. #define SIMD_F32x8 __m256
  186. // Vector uploads in address order
  187. #define LOAD_VECTOR_F32_SIMD256(A, B, C, D, E, F, G, H) _mm256_set_ps(H, G, F, E, D, C, B, A)
  188. #define LOAD_SCALAR_F32_SIMD256(A) _mm256_set1_ps(A)
  189. // Vector float operations returning SIMD_F32x8
  190. #define ADD_F32_SIMD256(A, B) _mm256_add_ps(A, B)
  191. #define SUB_F32_SIMD256(A, B) _mm256_sub_ps(A, B)
  192. #define MUL_F32_SIMD256(A, B) _mm256_mul_ps(A, B)
  193. // Statistics
  194. #define MIN_F32_SIMD256(A, B) _mm256_min_ps(A, B)
  195. #define MAX_F32_SIMD256(A, B) _mm256_max_ps(A, B)
  196. #ifdef USE_AVX2
  197. // 256-bit vector types
  198. #define SIMD_U8x32 __m256i
  199. #define SIMD_U16x16 __m256i
  200. #define SIMD_U32x8 __m256i
  201. #define SIMD_I32x8 __m256i
  202. // Vector uploads in address order
  203. #define LOAD_VECTOR_U8_SIMD256(A1, B1, C1, D1, E1, F1, G1, H1, I1, J1, K1, L1, M1, N1, O1, P1, Q1, R1, S1, T1, U1, V1, W1, X1, Y1, Z1, A2, B2, C2, D2, E2, F2) _mm256_set_epi8(F2, E2, D2, C2, B2, A2, Z1, Y1, X1, W1, V1, U1, T1, S1, R1, Q1, P1, O1, N1, M1, L1, K1, J1, I1, H1, G1, F1, E1, D1, C1, B1, A1)
  204. #define LOAD_SCALAR_U8_SIMD256(A) _mm256_set1_epi8(A)
  205. #define LOAD_VECTOR_U16_SIMD256(A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P) _mm256_set_epi16(P, O, N, M, L, K, J, I, H, G, F, E, D, C, B, A)
  206. #define LOAD_SCALAR_U16_SIMD256(A) _mm256_set1_epi16(A)
  207. #define LOAD_VECTOR_U32_SIMD256(A, B, C, D, E, F, G, H) _mm256_set_epi32(H, G, F, E, D, C, B, A)
  208. #define LOAD_SCALAR_U32_SIMD256(A) _mm256_set1_epi32(A)
  209. #define LOAD_VECTOR_I32_SIMD256(A, B, C, D, E, F, G, H) _mm256_set_epi32(H, G, F, E, D, C, B, A)
  210. #define LOAD_SCALAR_I32_SIMD256(A) _mm256_set1_epi32(A)
  211. // Conversions
  212. #define F32_TO_I32_SIMD256(A) _mm256_cvttps_epi32(A)
  213. #define F32_TO_U32_SIMD256(A) _mm256_cvttps_epi32(A)
  214. #define I32_TO_F32_SIMD256(A) _mm256_cvtepi32_ps(A)
  215. #define U32_TO_F32_SIMD256(A) _mm256_cvtepi32_ps(A)
  216. // Unpacking conversions
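// The 256-bit unpack instructions operate within each 128-bit lane, so the input is first reordered with _mm256_permute4x64_epi64(A, 0b11011000)
// to make the results come out in the same address order as the 128-bit versions above.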
  217. #define U8_LOW_TO_U16_SIMD256(A) _mm256_unpacklo_epi8(_mm256_permute4x64_epi64(A, 0b11011000), _mm256_set1_epi8(0))
  218. #define U8_HIGH_TO_U16_SIMD256(A) _mm256_unpackhi_epi8(_mm256_permute4x64_epi64(A, 0b11011000), _mm256_set1_epi8(0))
  219. #define U16_LOW_TO_U32_SIMD256(A) _mm256_unpacklo_epi16(_mm256_permute4x64_epi64(A, 0b11011000), _mm256_set1_epi16(0))
  220. #define U16_HIGH_TO_U32_SIMD256(A) _mm256_unpackhi_epi16(_mm256_permute4x64_epi64(A, 0b11011000), _mm256_set1_epi16(0))
  221. // Saturated packing
  222. // Credit: Using ideas from Victoria Zhislina's NEON_2_SSE.h header from the Intel corporation, but not trying to emulate NEON
  223. inline SIMD_U8x32 PACK_SAT_U16_TO_U8_SIMD256(const SIMD_U16x16& a, const SIMD_U16x16& b) {
  224. SIMD_U16x16 mask, a2, b2;
  225. mask = _mm256_set1_epi16(0b0111111111111111);
  226. a2 = _mm256_and_si256(a, mask);
  227. a2 = _mm256_or_si256(a2, _mm256_and_si256(_mm256_cmpgt_epi16(a2, a), mask));
  228. b2 = _mm256_and_si256(b, mask);
  229. b2 = _mm256_or_si256(b2, _mm256_and_si256(_mm256_cmpgt_epi16(b2, b), mask));
  230. // The 256-bit pack instruction _mm256_packus_epi16 packs within each 128-bit lane instead of across the whole vector, so the result has to be permuted into the correct order.
  231. // 0 2 1 3
  232. // | X |
  233. // 0 1 2 3
  234. return _mm256_permute4x64_epi64(_mm256_packus_epi16(a2, b2), 0b11011000);
  235. }
  236. // Reinterpret casting
  237. #define REINTERPRET_U32_TO_U8_SIMD256(A) (A)
  238. #define REINTERPRET_U32_TO_U16_SIMD256(A) (A)
  239. #define REINTERPRET_U8_TO_U32_SIMD256(A) (A)
  240. #define REINTERPRET_U16_TO_U32_SIMD256(A) (A)
  241. #define REINTERPRET_U32_TO_I32_SIMD256(A) (A)
  242. #define REINTERPRET_I32_TO_U32_SIMD256(A) (A)
  243. // Vector integer operations returning SIMD_I32x8
  244. #define ADD_I32_SIMD256(A, B) _mm256_add_epi32(A, B)
  245. #define SUB_I32_SIMD256(A, B) _mm256_sub_epi32(A, B)
  246. #define MUL_I32_SIMD256(A, B) _mm256_mullo_epi32(A, B)
  247. // Vector integer operations returning SIMD_U32x8
  248. #define ADD_U32_SIMD256(A, B) _mm256_add_epi32(A, B)
  249. #define SUB_U32_SIMD256(A, B) _mm256_sub_epi32(A, B)
  250. #define MUL_U32_SIMD256(A, B) _mm256_mullo_epi32(A, B)
  251. // Vector integer operations returning SIMD_U16x16
  252. #define ADD_U16_SIMD256(A, B) _mm256_add_epi16(A, B)
  253. #define SUB_U16_SIMD256(A, B) _mm256_sub_epi16(A, B)
  254. #define MUL_U16_SIMD256(A, B) _mm256_mullo_epi16(A, B)
  255. // Vector integer operations returning SIMD_U8x32
  256. #define ADD_U8_SIMD256(A, B) _mm256_add_epi8(A, B)
  257. #define ADD_SAT_U8_SIMD256(A, B) _mm256_adds_epu8(A, B) // Saturated addition
  258. #define SUB_U8_SIMD256(A, B) _mm256_sub_epi8(A, B)
  259. #define SUB_SAT_U8_SIMD256(A, B) _mm256_subs_epu8(A, B) // Saturated subtraction
  260. // No 8-bit multiplications
  261. // Bitwise
  262. #define BITWISE_AND_U32_SIMD256(A, B) _mm256_and_si256(A, B)
  263. #define BITWISE_OR_U32_SIMD256(A, B) _mm256_or_si256(A, B)
  264. #define BITWISE_XOR_U32_SIMD256(A, B) _mm256_xor_si256(A, B)
  265. #endif
  266. #endif
  267. #endif
  268. // Everything declared in here handles things specific to NEON.
  269. // Direct use of the macros will not provide portability to all hardware.
  270. #ifdef USE_NEON
  271. #include <arm_neon.h> // NEON
  272. // Vector types
  273. #define SIMD_F32x4 float32x4_t
  274. #define SIMD_U8x16 uint8x16_t
  275. #define SIMD_U16x8 uint16x8_t
  276. #define SIMD_U32x4 uint32x4_t
  277. #define SIMD_I32x4 int32x4_t
  278. // Vector uploads in address order
  279. inline SIMD_F32x4 LOAD_VECTOR_F32_SIMD(float a, float b, float c, float d) {
  280. float data[4] ALIGN16 = {a, b, c, d};
  281. return vld1q_f32(data);
  282. }
  283. inline SIMD_F32x4 LOAD_SCALAR_F32_SIMD(float a) {
  284. return vdupq_n_f32(a);
  285. }
  286. inline SIMD_U8x16 LOAD_VECTOR_U8_SIMD(uint8_t a, uint8_t b, uint8_t c, uint8_t d, uint8_t e, uint8_t f, uint8_t g, uint8_t h,
  287. uint8_t i, uint8_t j, uint8_t k, uint8_t l, uint8_t m, uint8_t n, uint8_t o, uint8_t p) {
  288. uint8_t data[16] ALIGN16 = {a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p};
  289. return vld1q_u8(data);
  290. }
  291. inline SIMD_U8x16 LOAD_SCALAR_U8_SIMD(uint8_t a) {
  292. return vdupq_n_u8(a);
  293. }
  294. inline SIMD_U16x8 LOAD_VECTOR_U16_SIMD(uint16_t a, uint16_t b, uint16_t c, uint16_t d, uint16_t e, uint16_t f, uint16_t g, uint16_t h) {
  295. uint16_t data[8] ALIGN16 = {a, b, c, d, e, f, g, h};
  296. return vld1q_u16(data);
  297. }
  298. inline SIMD_U16x8 LOAD_SCALAR_U16_SIMD(uint16_t a) {
  299. return vdupq_n_u16(a);
  300. }
  301. inline SIMD_U32x4 LOAD_VECTOR_U32_SIMD(uint32_t a, uint32_t b, uint32_t c, uint32_t d) {
  302. uint32_t data[4] ALIGN16 = {a, b, c, d};
  303. return vld1q_u32(data);
  304. }
  305. inline SIMD_U32x4 LOAD_SCALAR_U32_SIMD(uint32_t a) {
  306. return vdupq_n_u32(a);
  307. }
  308. inline SIMD_I32x4 LOAD_VECTOR_I32_SIMD(int32_t a, int32_t b, int32_t c, int32_t d) {
  309. int32_t data[4] ALIGN16 = {a, b, c, d};
  310. return vld1q_s32(data);
  311. }
  312. inline SIMD_I32x4 LOAD_SCALAR_I32_SIMD(int32_t a) {
  313. return vdupq_n_s32(a);
  314. }
  315. // Conversions
  316. #define F32_TO_I32_SIMD(A) vcvtq_s32_f32(A)
  317. #define F32_TO_U32_SIMD(A) vcvtq_u32_f32(A)
  318. #define I32_TO_F32_SIMD(A) vcvtq_f32_s32(A)
  319. #define U32_TO_F32_SIMD(A) vcvtq_f32_u32(A)
  320. // Unpacking conversions
  321. #define U8_LOW_TO_U16_SIMD(A) vmovl_u8(vget_low_u8(A))
  322. #define U8_HIGH_TO_U16_SIMD(A) vmovl_u8(vget_high_u8(A))
  323. #define U16_LOW_TO_U32_SIMD(A) vmovl_u16(vget_low_u16(A))
  324. #define U16_HIGH_TO_U32_SIMD(A) vmovl_u16(vget_high_u16(A))
  325. // Saturated packing
  326. #define PACK_SAT_U16_TO_U8_SIMD(A, B) vcombine_u8(vqmovn_u16(A), vqmovn_u16(B))
  327. // Reinterpret casting
  328. #define REINTERPRET_U32_TO_U8_SIMD(A) vreinterpretq_u8_u32(A)
  329. #define REINTERPRET_U32_TO_U16_SIMD(A) vreinterpretq_u16_u32(A)
  330. #define REINTERPRET_U8_TO_U32_SIMD(A) vreinterpretq_u32_u8(A)
  331. #define REINTERPRET_U16_TO_U32_SIMD(A) vreinterpretq_u32_u16(A)
  332. #define REINTERPRET_U32_TO_I32_SIMD(A) vreinterpretq_s32_u32(A)
  333. #define REINTERPRET_I32_TO_U32_SIMD(A) vreinterpretq_u32_s32(A)
  334. // Vector float operations returning SIMD_F32x4
  335. #define ADD_F32_SIMD(A, B) vaddq_f32(A, B)
  336. #define SUB_F32_SIMD(A, B) vsubq_f32(A, B)
  337. #define MUL_F32_SIMD(A, B) vmulq_f32(A, B)
  338. // Vector integer operations returning SIMD_I32x4
  339. #define ADD_I32_SIMD(A, B) vaddq_s32(A, B)
  340. #define SUB_I32_SIMD(A, B) vsubq_s32(A, B)
  341. #define MUL_I32_NEON(A, B) vmulq_s32(A, B)
  342. // Vector integer operations returning SIMD_U32x4
  343. #define ADD_U32_SIMD(A, B) vaddq_u32(A, B)
  344. #define SUB_U32_SIMD(A, B) vsubq_u32(A, B)
  345. #define MUL_U32_NEON(A, B) vmulq_u32(A, B)
  346. // Vector integer operations returning SIMD_U16x8
  347. #define ADD_U16_SIMD(A, B) vaddq_u16(A, B)
  348. #define SUB_U16_SIMD(A, B) vsubq_u16(A, B)
  349. #define MUL_U16_SIMD(A, B) vmulq_u16(A, B)
  350. // Vector integer operations returning SIMD_U8x16
  351. #define ADD_U8_SIMD(A, B) vaddq_u8(A, B)
  352. #define ADD_SAT_U8_SIMD(A, B) vqaddq_u8(A, B) // Saturated addition
  353. #define SUB_U8_SIMD(A, B) vsubq_u8(A, B)
  354. #define SUB_SAT_U8_SIMD(A, B) vqsubq_u8(A, B) // Saturated subtraction
  355. // No 8-bit multiplications
  356. // Statistics
  357. #define MIN_F32_SIMD(A, B) vminq_f32(A, B)
  358. #define MAX_F32_SIMD(A, B) vmaxq_f32(A, B)
  359. // Bitwise
  360. #define BITWISE_AND_U32_SIMD(A, B) vandq_u32(A, B)
  361. #define BITWISE_OR_U32_SIMD(A, B) vorrq_u32(A, B)
  362. #define BITWISE_XOR_U32_SIMD(A, B) veorq_u32(A, B)
  363. #endif
  364. /*
  365. The vector types below are supposed to be portable across different CPU architectures.
  366. When mixed with handwritten SIMD intrinsics:
  367. Use "USE_SSE2" instead of "__SSE2__"
  368. Use "USE_AVX2" instead of "__AVX2__"
  369. Use "USE_NEON" instead of "__ARM_NEON"
  370. // So that any new variations of the macro names given by the compiler can be added to simd.h instead of being duplicated everywhere.
  371. Portability exceptions:
  372. * The "v" variable is the native backend, which is only defined when SIMD is supported by hardware.
  373. Only use when USE_BASIC_SIMD is defined.
  374. Will not work on scalar emulation.
  375. * The "scalars" array is available when emulating a type that does not exist or the SIMD vector has direct access to the memory.
  376. // Do not rely on it for accessing elements, because then your code will not compile for ARM NEON.
  377. */
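// Example of portable element access (a sketch): write the vector to an aligned array instead of reading v or scalars directly.
// The vector variable myVector is hypothetical; ALIGN16 and writeAlignedUnsafe come from this header.
//   float elements[4] ALIGN16;
//   myVector.writeAlignedUnsafe(elements);
//   float firstElement = elements[0];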
  378. union F32x4 {
  379. private:
  380. // The uninitialized default constructor is private for safety reasons.
  381. F32x4() {}
  382. public:
  383. // When the uninitialized constructor is needed for performance, use this named constructor instead.
  384. static inline F32x4 create_dangerous_uninitialized() { return F32x4(); }
  385. #ifdef USE_BASIC_SIMD
  386. public:
  387. #ifdef USE_DIRECT_SIMD_MEMORY_ACCESS
  388. // Only use if USE_DIRECT_SIMD_MEMORY_ACCESS is defined!
  389. // Direct access cannot be done on NEON!
  390. float scalars[4];
  391. #endif
  392. // The SIMD vector of undefined type
  393. // Not accessible while emulating!
  394. SIMD_F32x4 v;
  395. // Construct a portable vector from a native SIMD vector
  396. explicit F32x4(const SIMD_F32x4& v) : v(v) {}
  397. // Construct a portable vector from a set of scalars
  398. F32x4(float a1, float a2, float a3, float a4) : v(LOAD_VECTOR_F32_SIMD(a1, a2, a3, a4)) {}
  399. // Construct a portable vector from a single duplicated scalar
  400. explicit F32x4(float scalar) : v(LOAD_SCALAR_F32_SIMD(scalar)) {}
  401. #else
  402. public:
  403. // Emulate a SIMD vector as an array of scalars without hardware support.
  404. // Only accessible while emulating!
  405. float scalars[4];
  406. // Construct a portable vector from a set of scalars
  407. F32x4(float a1, float a2, float a3, float a4) {
  408. this->scalars[0] = a1;
  409. this->scalars[1] = a2;
  410. this->scalars[2] = a3;
  411. this->scalars[3] = a4;
  412. }
  413. // Construct a portable vector from a single duplicated scalar
  414. explicit F32x4(float scalar) {
  415. this->scalars[0] = scalar;
  416. this->scalars[1] = scalar;
  417. this->scalars[2] = scalar;
  418. this->scalars[3] = scalar;
  419. }
  420. #endif
  421. // Create a gradient vector using start and increment, so that arbitrary length vectors have a way to initialize linear iterations.
  422. static inline F32x4 createGradient(float start, float increment) {
  423. return F32x4(start, start + increment, start + increment * 2.0f, start + increment * 3.0f);
  424. }
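// Example (a sketch): per-lane x coordinates for four consecutive pixels starting at a hypothetical startX.
//   F32x4 laneX = F32x4::createGradient((float)startX, 1.0f);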
  425. // Construct a portable SIMD vector from a pointer to aligned data
  426. // data must be aligned to 16 bytes
  427. static inline F32x4 readAlignedUnsafe(const float* data) {
  428. #ifdef USE_BASIC_SIMD
  429. #if defined USE_SSE2
  430. return F32x4(_mm_load_ps(data));
  431. #elif defined USE_NEON
  432. return F32x4(vld1q_f32(data));
  433. #endif
  434. #else
  435. return F32x4(data[0], data[1], data[2], data[3]);
  436. #endif
  437. }
  438. // Write to aligned memory from the existing vector
  439. // data must be aligned to 16 bytes
  440. inline void writeAlignedUnsafe(float* data) const {
  441. #if defined USE_BASIC_SIMD
  442. #if defined USE_SSE2
  443. _mm_store_ps(data, this->v);
  444. #elif defined USE_NEON
  445. vst1q_f32(data, this->v);
  446. #endif
  447. #else
  448. data[0] = this->scalars[0];
  449. data[1] = this->scalars[1];
  450. data[2] = this->scalars[2];
  451. data[3] = this->scalars[3];
  452. #endif
  453. }
  454. #if defined DFPSR_GEOMETRY_FVECTOR
  455. dsr::FVector4D get() const {
  456. float data[4] ALIGN16;
  457. this->writeAlignedUnsafe(data);
  458. return dsr::FVector4D(data[0], data[1], data[2], data[3]);
  459. }
  460. #endif
  461. // Bound and alignment checked reading
  462. static inline F32x4 readAligned(dsr::SafePointer<const float> data, const char* methodName) {
  463. const float* pointer = data.getUnsafe();
  464. assert(((uintptr_t)pointer & 15) == 0);
  465. #if defined SAFE_POINTER_CHECKS
  466. data.assertInside(methodName, pointer, 16);
  467. #endif
  468. return F32x4::readAlignedUnsafe(pointer);
  469. }
  470. // Bound and alignment checked writing
  471. inline void writeAligned(dsr::SafePointer<float> data, const char* methodName) const {
  472. float* pointer = data.getUnsafe();
  473. assert(((uintptr_t)pointer & 15) == 0);
  474. #if defined SAFE_POINTER_CHECKS
  475. data.assertInside(methodName, pointer, 16);
  476. #endif
  477. this->writeAlignedUnsafe(pointer);
  478. }
  479. // 1 / x
  480. // Useful for multiple divisions with the same denominator
  481. // Useless if the denominator is a constant
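// Example (a sketch): reusing one reciprocal for several divisions by the same denominator.
// Variable names are hypothetical and the arithmetic operators are assumed to be declared later in this header.
//   F32x4 inverseScale = scale.reciprocal();
//   F32x4 a = valueA * inverseScale;
//   F32x4 b = valueB * inverseScale;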
  482. F32x4 reciprocal() const {
  483. #if defined USE_BASIC_SIMD
  484. #if defined USE_SSE2
  485. // Approximate
  486. SIMD_F32x4 lowQ = _mm_rcp_ps(this->v);
  487. // Refine
  488. return F32x4(SUB_F32_SIMD(ADD_F32_SIMD(lowQ, lowQ), MUL_F32_SIMD(this->v, MUL_F32_SIMD(lowQ, lowQ))));
  489. #elif defined USE_NEON
  490. // Approximate
  491. SIMD_F32x4 result = vrecpeq_f32(this->v);
  492. // Refine
  493. result = MUL_F32_SIMD(vrecpsq_f32(this->v, result), result);
  494. return F32x4(MUL_F32_SIMD(vrecpsq_f32(this->v, result), result));
  495. #else
  496. assert(false);
  497. return F32x4(0);
  498. #endif
  499. #else
  500. return F32x4(1.0f / this->scalars[0], 1.0f / this->scalars[1], 1.0f / this->scalars[2], 1.0f / this->scalars[3]);
  501. #endif
  502. }
  503. // 1 / sqrt(x)
  504. // Useful for normalizing vectors
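// Example (a sketch): normalizing four 2D vectors stored as separate x and y lanes.
// Variable names are hypothetical and the arithmetic operators are assumed to be declared later in this header.
//   F32x4 inverseLength = (x * x + y * y).reciprocalSquareRoot();
//   x = x * inverseLength;
//   y = y * inverseLength;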
  505. F32x4 reciprocalSquareRoot() const {
  506. #if defined USE_BASIC_SIMD
  507. #if defined USE_SSE2
  508. SIMD_F32x4 reciRoot = _mm_rsqrt_ps(this->v);
  509. SIMD_F32x4 mul = MUL_F32_SIMD(MUL_F32_SIMD(this->v, reciRoot), reciRoot);
  510. reciRoot = MUL_F32_SIMD(MUL_F32_SIMD(LOAD_SCALAR_F32_SIMD(0.5f), reciRoot), SUB_F32_SIMD(LOAD_SCALAR_F32_SIMD(3.0f), mul));
  511. return F32x4(reciRoot);
  512. #elif defined USE_NEON
  513. // Approximate
  514. SIMD_F32x4 reciRoot = vrsqrteq_f32(this->v);
  515. // Refine
  516. reciRoot = MUL_F32_SIMD(vrsqrtsq_f32(MUL_F32_SIMD(this->v, reciRoot), reciRoot), reciRoot);
  517. return F32x4(reciRoot);
  518. #else
  519. assert(false);
  520. return F32x4(0);
  521. #endif
  522. #else
  523. return F32x4(1.0f / sqrt(this->scalars[0]), 1.0f / sqrt(this->scalars[1]), 1.0f / sqrt(this->scalars[2]), 1.0f / sqrt(this->scalars[3]));
  524. #endif
  525. }
  526. // sqrt(x)
  527. // Useful for getting lengths of vectors
  528. F32x4 squareRoot() const {
  529. #if defined USE_BASIC_SIMD
  530. #if defined USE_SSE2
  531. SIMD_F32x4 half = LOAD_SCALAR_F32_SIMD(0.5f);
  532. // Approximate
  533. SIMD_F32x4 root = _mm_sqrt_ps(this->v);
  534. // Refine
  535. root = _mm_mul_ps(_mm_add_ps(root, _mm_div_ps(this->v, root)), half);
  536. return F32x4(root);
  537. #elif defined USE_NEON
  538. return F32x4(MUL_F32_SIMD(this->v, this->reciprocalSquareRoot().v));
  539. #else
  540. assert(false);
  541. return F32x4(0);
  542. #endif
  543. #else
  544. return F32x4(sqrt(this->scalars[0]), sqrt(this->scalars[1]), sqrt(this->scalars[2]), sqrt(this->scalars[3]));
  545. #endif
  546. }
  547. F32x4 clamp(float minimum, float maximum) const {
  548. #if defined USE_BASIC_SIMD
  549. return F32x4(MIN_F32_SIMD(MAX_F32_SIMD(this->v, LOAD_SCALAR_F32_SIMD(minimum)), LOAD_SCALAR_F32_SIMD(maximum)));
  550. #else
  551. float val0 = this->scalars[0];
  552. float val1 = this->scalars[1];
  553. float val2 = this->scalars[2];
  554. float val3 = this->scalars[3];
  555. if (minimum > val0) { val0 = minimum; }
  556. if (maximum < val0) { val0 = maximum; }
  557. if (minimum > val1) { val1 = minimum; }
  558. if (maximum < val1) { val1 = maximum; }
  559. if (minimum > val2) { val2 = minimum; }
  560. if (maximum < val2) { val2 = maximum; }
  561. if (minimum > val3) { val3 = minimum; }
  562. if (maximum < val3) { val3 = maximum; }
  563. return F32x4(val0, val1, val2, val3);
  564. #endif
  565. }
  566. F32x4 clampLower(float minimum) const {
  567. #if defined USE_BASIC_SIMD
  568. return F32x4(MAX_F32_SIMD(this->v, LOAD_SCALAR_F32_SIMD(minimum)));
  569. #else
  570. float val0 = this->scalars[0];
  571. float val1 = this->scalars[1];
  572. float val2 = this->scalars[2];
  573. float val3 = this->scalars[3];
  574. if (minimum > val0) { val0 = minimum; }
  575. if (minimum > val1) { val1 = minimum; }
  576. if (minimum > val2) { val2 = minimum; }
  577. if (minimum > val3) { val3 = minimum; }
  578. return F32x4(val0, val1, val2, val3);
  579. #endif
  580. }
  581. F32x4 clampUpper(float maximum) const {
  582. #if defined USE_BASIC_SIMD
  583. return F32x4(MIN_F32_SIMD(this->v, LOAD_SCALAR_F32_SIMD(maximum)));
  584. #else
  585. float val0 = this->scalars[0];
  586. float val1 = this->scalars[1];
  587. float val2 = this->scalars[2];
  588. float val3 = this->scalars[3];
  589. if (maximum < val0) { val0 = maximum; }
  590. if (maximum < val1) { val1 = maximum; }
  591. if (maximum < val2) { val2 = maximum; }
  592. if (maximum < val3) { val3 = maximum; }
  593. return F32x4(val0, val1, val2, val3);
  594. #endif
  595. }
  596. };
  597. union I32x4 {
  598. private:
  599. // The uninitialized default constructor is private for safety reasons.
  600. I32x4() {}
  601. public:
  602. // When the uninitialized constructor is needed for performance, use this named constructor instead.
  603. static inline I32x4 create_dangerous_uninitialized() { return I32x4(); }
  604. #if defined USE_BASIC_SIMD
  605. public:
  606. #if defined USE_DIRECT_SIMD_MEMORY_ACCESS
  607. // Only use if USE_DIRECT_SIMD_MEMORY_ACCESS is defined!
  608. // Direct access cannot be done on NEON!
  609. int32_t scalars[4];
  610. #endif
  611. // The SIMD vector of undefined type
  612. // Not accessible while emulating!
  613. SIMD_I32x4 v;
  614. // Construct a portable vector from a native SIMD vector
  615. explicit I32x4(const SIMD_I32x4& v) : v(v) {}
  616. // Construct a portable vector from a set of scalars
  617. I32x4(int32_t a1, int32_t a2, int32_t a3, int32_t a4) : v(LOAD_VECTOR_I32_SIMD(a1, a2, a3, a4)) {}
  618. // Construct a portable vector from a single duplicated scalar
  619. explicit I32x4(int32_t scalar) : v(LOAD_SCALAR_I32_SIMD(scalar)) {}
  620. #else
  621. public:
  622. // Emulate a SIMD vector as an array of scalars without hardware support.
  623. // Only accessible while emulating!
  624. int32_t scalars[4];
  625. // Construct a portable vector from a set of scalars
  626. I32x4(int32_t a1, int32_t a2, int32_t a3, int32_t a4) {
  627. this->scalars[0] = a1;
  628. this->scalars[1] = a2;
  629. this->scalars[2] = a3;
  630. this->scalars[3] = a4;
  631. }
  632. // Construct a portable vector from a single duplicated scalar
  633. explicit I32x4(int32_t scalar) {
  634. this->scalars[0] = scalar;
  635. this->scalars[1] = scalar;
  636. this->scalars[2] = scalar;
  637. this->scalars[3] = scalar;
  638. }
  639. #endif
  640. // Create a gradient vector using start and increment, so that arbitrary length vectors have a way to initialize linear iterations.
  641. static inline I32x4 createGradient(int32_t start, int32_t increment) {
  642. return I32x4(start, start + increment, start + increment * 2, start + increment * 3);
  643. }
  644. // Construct a portable SIMD vector from a pointer to aligned data
  645. // data must be aligned to 16 bytes
  646. static inline I32x4 readAlignedUnsafe(const int32_t* data) {
  647. #if defined USE_BASIC_SIMD
  648. #if defined USE_SSE2
  649. return I32x4(_mm_load_si128((const __m128i*)data));
  650. #elif defined USE_NEON
  651. return I32x4(vld1q_s32(data));
  652. #endif
  653. #else
  654. return I32x4(data[0], data[1], data[2], data[3]);
  655. #endif
  656. }
  657. // Write to aligned memory from the existing vector
  658. // data must be aligned to 16 bytes
  659. inline void writeAlignedUnsafe(int32_t* data) const {
  660. #if defined USE_BASIC_SIMD
  661. #if defined USE_SSE2
  662. _mm_store_si128((__m128i*)data, this->v);
  663. #elif defined USE_NEON
  664. vst1q_s32(data, this->v);
  665. #endif
  666. #else
  667. data[0] = this->scalars[0];
  668. data[1] = this->scalars[1];
  669. data[2] = this->scalars[2];
  670. data[3] = this->scalars[3];
  671. #endif
  672. }
  673. #if defined DFPSR_GEOMETRY_IVECTOR
  674. dsr::IVector4D get() const {
  675. int32_t data[4] ALIGN16;
  676. this->writeAlignedUnsafe(data);
  677. return dsr::IVector4D(data[0], data[1], data[2], data[3]);
  678. }
  679. #endif
  680. // Bound and alignment checked reading
  681. static inline I32x4 readAligned(dsr::SafePointer<const int32_t> data, const char* methodName) {
  682. const int32_t* pointer = data.getUnsafe();
  683. assert(((uintptr_t)pointer & 15) == 0);
  684. #if defined SAFE_POINTER_CHECKS
  685. data.assertInside(methodName, pointer, 16);
  686. #endif
  687. return I32x4::readAlignedUnsafe(pointer);
  688. }
  689. // Bound and alignment checked writing
  690. inline void writeAligned(dsr::SafePointer<int32_t> data, const char* methodName) const {
  691. int32_t* pointer = data.getUnsafe();
  692. assert(((uintptr_t)pointer & 15) == 0);
  693. #if defined SAFE_POINTER_CHECKS
  694. data.assertInside(methodName, pointer, 16);
  695. #endif
  696. this->writeAlignedUnsafe(pointer);
  697. }
  698. };
  699. union U32x4 {
  700. #if defined USE_BASIC_SIMD
  701. public:
  702. #if defined USE_DIRECT_SIMD_MEMORY_ACCESS
  703. // Only use if USE_DIRECT_SIMD_MEMORY_ACCESS is defined!
  704. // Direct access cannot be done on NEON!
  705. uint32_t scalars[4];
  706. #endif
  707. // The SIMD vector of undefined type
  708. // Not accessible while emulating!
  709. SIMD_U32x4 v;
  710. // Construct a portable vector from a native SIMD vector
  711. explicit U32x4(const SIMD_U32x4& v) : v(v) {}
  712. // Construct a portable vector from a set of scalars
  713. U32x4(uint32_t a1, uint32_t a2, uint32_t a3, uint32_t a4) : v(LOAD_VECTOR_U32_SIMD(a1, a2, a3, a4)) {}
  714. // Construct a portable vector from a single duplicated scalar
  715. explicit U32x4(uint32_t scalar) : v(LOAD_SCALAR_U32_SIMD(scalar)) {}
  716. #else
  717. public:
  718. // Emulate a SIMD vector as an array of scalars without hardware support.
  719. // Only accessible while emulating!
  720. uint32_t scalars[4];
  721. // Construct a portable vector from a set of scalars
  722. U32x4(uint32_t a1, uint32_t a2, uint32_t a3, uint32_t a4) {
  723. this->scalars[0] = a1;
  724. this->scalars[1] = a2;
  725. this->scalars[2] = a3;
  726. this->scalars[3] = a4;
  727. }
  728. // Construct a portable vector from a single duplicated scalar
  729. explicit U32x4(uint32_t scalar) {
  730. this->scalars[0] = scalar;
  731. this->scalars[1] = scalar;
  732. this->scalars[2] = scalar;
  733. this->scalars[3] = scalar;
  734. }
  735. #endif
  736. // Create a gradient vector using start and increment, so that arbitrary length vectors have a way to initialize linear iterations.
  737. static inline U32x4 createGradient(uint32_t start, uint32_t increment) {
  738. return U32x4(start, start + increment, start + increment * 2, start + increment * 3);
  739. }
  740. // Construct a portable SIMD vector from a pointer to aligned data
  741. // data must be aligned to 16 bytes
  742. static inline U32x4 readAlignedUnsafe(const uint32_t* data) {
  743. #if defined USE_BASIC_SIMD
  744. #if defined USE_SSE2
  745. return U32x4(_mm_load_si128((const __m128i*)data));
  746. #elif defined USE_NEON
  747. return U32x4(vld1q_u32(data));
  748. #endif
  749. #else
  750. return U32x4(data[0], data[1], data[2], data[3]);
  751. #endif
  752. }
  753. // Write to aligned memory from the existing vector
  754. // data must be aligned to 16 bytes
  755. inline void writeAlignedUnsafe(uint32_t* data) const {
  756. #if defined USE_BASIC_SIMD
  757. #if defined USE_SSE2
  758. _mm_store_si128((__m128i*)data, this->v);
  759. #elif defined USE_NEON
  760. vst1q_u32(data, this->v);
  761. #endif
  762. #else
  763. data[0] = this->scalars[0];
  764. data[1] = this->scalars[1];
  765. data[2] = this->scalars[2];
  766. data[3] = this->scalars[3];
  767. #endif
  768. }
  769. #if defined DFPSR_GEOMETRY_UVECTOR
  770. dsr::UVector4D get() const {
  771. uint32_t data[4] ALIGN16;
  772. this->writeAlignedUnsafe(data);
  773. return dsr::UVector4D(data[0], data[1], data[2], data[3]);
  774. }
  775. #endif
  776. // Bound and alignment checked reading
  777. static inline U32x4 readAligned(dsr::SafePointer<const uint32_t> data, const char* methodName) {
  778. const uint32_t* pointer = data.getUnsafe();
  779. assert(((uintptr_t)pointer & 15) == 0);
  780. #if defined SAFE_POINTER_CHECKS
  781. data.assertInside(methodName, pointer, 16);
  782. #endif
  783. return U32x4::readAlignedUnsafe(pointer);
  784. }
  785. // Bound and alignment checked writing
  786. inline void writeAligned(dsr::SafePointer<uint32_t> data, const char* methodName) const {
  787. uint32_t* pointer = data.getUnsafe();
  788. assert(((uintptr_t)pointer & 15) == 0);
  789. #if defined SAFE_POINTER_CHECKS
  790. data.assertInside(methodName, pointer, 16);
  791. #endif
  792. this->writeAlignedUnsafe(pointer);
  793. }
  794. };
  795. union U16x8 {
  796. private:
  797. // The uninitialized default constructor is private for safety reasons.
  798. U16x8() {}
  799. public:
  800. // When the uninitialized constructor is needed for performance, use this named constructor instead.
  801. static inline U16x8 create_dangerous_uninitialized() { return U16x8(); }
  802. #if defined USE_BASIC_SIMD
  803. public:
  804. #if defined USE_DIRECT_SIMD_MEMORY_ACCESS
  805. // Only use if USE_DIRECT_SIMD_MEMORY_ACCESS is defined!
  806. // Direct access cannot be done on NEON!
  807. uint16_t scalars[8];
  808. #endif
  809. // The SIMD vector of undefined type
  810. // Not accessible while emulating!
  811. SIMD_U16x8 v;
  812. // Construct a portable vector from a native SIMD vector
  813. explicit U16x8(const SIMD_U16x8& v) : v(v) {}
  814. // Construct a vector of 8 x 16-bit unsigned integers from a vector of 4 x 32-bit unsigned integers
  815. // Reinterpret casting is used
  816. explicit U16x8(const U32x4& vector) : v(REINTERPRET_U32_TO_U16_SIMD(vector.v)) {}
  817. // Construct a portable vector from a set of scalars
  818. U16x8(uint16_t a1, uint16_t a2, uint16_t a3, uint16_t a4, uint16_t a5, uint16_t a6, uint16_t a7, uint16_t a8) : v(LOAD_VECTOR_U16_SIMD(a1, a2, a3, a4, a5, a6, a7, a8)) {}
  819. // Construct a vector of 8 x 16-bit unsigned integers from a single duplicated 32-bit unsigned integer
  820. // Reinterpret casting is used
  821. // TODO: Remove all reinterprets from constructors to improve readability
  822. explicit U16x8(uint32_t scalar) : v(REINTERPRET_U32_TO_U16_SIMD(LOAD_SCALAR_U32_SIMD(scalar))) {}
  823. // Construct a portable vector from a single duplicated scalar
  824. explicit U16x8(uint16_t scalar) : v(LOAD_SCALAR_U16_SIMD(scalar)) {}
  825. // Reinterpret cast to a vector of 4 x 32-bit unsigned integers
  826. U32x4 get_U32() const {
  827. return U32x4(REINTERPRET_U16_TO_U32_SIMD(this->v));
  828. }
  829. #else
  830. public:
  831. // Emulate a SIMD vector as an array of scalars without hardware support.
  832. // Only accessible while emulating!
  833. uint16_t scalars[8];
  834. // Construct a vector of 8 x 16-bit unsigned integers from a vector of 4 x 32-bit unsigned integers
  835. // Reinterpret casting is used
  836. explicit U16x8(const U32x4& vector) {
  837. uint64_t *target = (uint64_t*)this->scalars;
  838. uint64_t *source = (uint64_t*)vector.scalars;
  839. target[0] = source[0];
  840. target[1] = source[1];
  841. }
  842. // Construct a portable vector from a set of scalars
  843. U16x8(uint16_t a1, uint16_t a2, uint16_t a3, uint16_t a4, uint16_t a5, uint16_t a6, uint16_t a7, uint16_t a8) {
  844. this->scalars[0] = a1;
  845. this->scalars[1] = a2;
  846. this->scalars[2] = a3;
  847. this->scalars[3] = a4;
  848. this->scalars[4] = a5;
  849. this->scalars[5] = a6;
  850. this->scalars[6] = a7;
  851. this->scalars[7] = a8;
  852. }
  853. // Construct a vector of 8 x 16-bit unsigned integers from a single duplicated 32-bit unsigned integer
  854. // Reinterpret casting is used
  855. explicit U16x8(uint32_t scalar) {
  856. uint32_t *target = (uint32_t*)this->scalars;
  857. target[0] = scalar;
  858. target[1] = scalar;
  859. target[2] = scalar;
  860. target[3] = scalar;
  861. }
  862. // Construct a portable vector from a single duplicated scalar
  863. explicit U16x8(uint16_t scalar) {
  864. this->scalars[0] = scalar;
  865. this->scalars[1] = scalar;
  866. this->scalars[2] = scalar;
  867. this->scalars[3] = scalar;
  868. this->scalars[4] = scalar;
  869. this->scalars[5] = scalar;
  870. this->scalars[6] = scalar;
  871. this->scalars[7] = scalar;
  872. }
  873. // Reinterpret cast to a vector of 4 x 32-bit unsigned integers
  874. U32x4 get_U32() const {
  875. U32x4 result(0);
  876. uint64_t *target = (uint64_t*)result.scalars;
  877. uint64_t *source = (uint64_t*)this->scalars;
  878. target[0] = source[0];
  879. target[1] = source[1];
  880. return result;
  881. }
  882. #endif
  883. // Create a gradient vector using start and increment, so that arbitrary length vectors have a way to initialize linear iterations.
  884. static inline U16x8 createGradient(uint16_t start, uint16_t increment) {
  885. return U16x8(
  886. start,
  887. start + increment,
  888. start + increment * 2,
  889. start + increment * 3,
  890. start + increment * 4,
  891. start + increment * 5,
  892. start + increment * 6,
  893. start + increment * 7
  894. );
  895. }
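// Illustrative sketch (assumed usage, not taken from the library): createGradient
// gives each lane its element index scaled by a step, which helps turn a scalar
// loop into a SIMD loop over consecutive indices.
//   U16x8 laneOffsets = U16x8::createGradient(0, 1);   // 0, 1, 2, ..., 7
//   for (uint16_t x = 0; x < width; x += 8) {
//       U16x8 columnIndices = U16x8(x) + laneOffsets;  // x, x+1, ..., x+7
//       // ... use columnIndices ...
//   }
// This assumes a U16x8 addition operator like the ones defined for the other vector types in this header.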
  896. static inline U16x8 readAlignedUnsafe(const uint16_t* data) {
  897. #if defined USE_BASIC_SIMD
  898. #if defined USE_SSE2
  899. return U16x8(_mm_load_si128((const __m128i*)data));
  900. #elif defined USE_NEON
  901. return U16x8(vld1q_u16(data));
  902. #endif
  903. #else
  904. return U16x8(data[0], data[1], data[2], data[3], data[4], data[5], data[6], data[7]);
  905. #endif
  906. }
// data must be aligned to 16 bytes
  908. inline void writeAlignedUnsafe(uint16_t* data) const {
  909. #if defined USE_BASIC_SIMD
  910. #if defined USE_SSE2
  911. _mm_store_si128((__m128i*)data, this->v);
  912. #elif defined USE_NEON
  913. vst1q_u16(data, this->v);
  914. #endif
  915. #else
  916. data[0] = this->scalars[0];
  917. data[1] = this->scalars[1];
  918. data[2] = this->scalars[2];
  919. data[3] = this->scalars[3];
  920. data[4] = this->scalars[4];
  921. data[5] = this->scalars[5];
  922. data[6] = this->scalars[6];
  923. data[7] = this->scalars[7];
  924. #endif
  925. }
  926. // Bound and alignment checked reading
  927. static inline U16x8 readAligned(dsr::SafePointer<const uint16_t> data, const char* methodName) {
  928. const uint16_t* pointer = data.getUnsafe();
  929. assert(((uintptr_t)pointer & 15) == 0);
  930. #if defined SAFE_POINTER_CHECKS
  931. data.assertInside(methodName, pointer, 16);
  932. #endif
  933. return U16x8::readAlignedUnsafe(pointer);
  934. }
  935. // Bound and alignment checked writing
  936. inline void writeAligned(dsr::SafePointer<uint16_t> data, const char* methodName) const {
  937. uint16_t* pointer = data.getUnsafe();
  938. assert(((uintptr_t)pointer & 15) == 0);
  939. #if defined SAFE_POINTER_CHECKS
  940. data.assertInside(methodName, pointer, 16);
  941. #endif
  942. this->writeAlignedUnsafe(pointer);
  943. }
  944. };
  945. union U8x16 {
  946. private:
  947. // The uninitialized default constructor is private for safety reasons.
  948. U8x16() {}
  949. public:
  950. // When the uninitialized constructor is needed for performance, use this named constructor instead.
  951. static inline U8x16 create_dangerous_uninitialized() { return U8x16(); }
  952. #if defined USE_BASIC_SIMD
  953. public:
  954. #if defined USE_DIRECT_SIMD_MEMORY_ACCESS
  955. // Only use if USE_DIRECT_SIMD_MEMORY_ACCESS is defined!
  956. // Direct access cannot be done on NEON!
  957. uint8_t scalars[16];
  958. #endif
  959. // The SIMD vector of undefined type
  960. // Not accessible while emulating!
  961. SIMD_U8x16 v;
  962. // Construct a portable vector from a native SIMD vector
  963. explicit U8x16(const SIMD_U8x16& v) : v(v) {}
  964. // Construct a portable vector from a set of scalars
  965. U8x16(uint8_t a1, uint8_t a2, uint8_t a3, uint8_t a4, uint8_t a5, uint8_t a6, uint8_t a7, uint8_t a8,
  966. uint8_t a9, uint8_t a10, uint8_t a11, uint8_t a12, uint8_t a13, uint8_t a14, uint8_t a15, uint8_t a16)
  967. : v(LOAD_VECTOR_U8_SIMD(a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11, a12, a13, a14, a15, a16)) {}
  968. // Construct a portable vector from a single duplicated scalar
  969. explicit U8x16(uint8_t scalar) : v(LOAD_SCALAR_U8_SIMD(scalar)) {}
  970. #else
  971. public:
  972. // Emulate a SIMD vector as an array of scalars without hardware support.
  973. // Only accessible while emulating!
  974. uint8_t scalars[16];
  975. // Construct a portable vector from a set of scalars
  976. U8x16(uint8_t a1, uint8_t a2, uint8_t a3, uint8_t a4, uint8_t a5, uint8_t a6, uint8_t a7, uint8_t a8,
  977. uint8_t a9, uint8_t a10, uint8_t a11, uint8_t a12, uint8_t a13, uint8_t a14, uint8_t a15, uint8_t a16) {
  978. this->scalars[0] = a1;
  979. this->scalars[1] = a2;
  980. this->scalars[2] = a3;
  981. this->scalars[3] = a4;
  982. this->scalars[4] = a5;
  983. this->scalars[5] = a6;
  984. this->scalars[6] = a7;
  985. this->scalars[7] = a8;
  986. this->scalars[8] = a9;
  987. this->scalars[9] = a10;
  988. this->scalars[10] = a11;
  989. this->scalars[11] = a12;
  990. this->scalars[12] = a13;
  991. this->scalars[13] = a14;
  992. this->scalars[14] = a15;
  993. this->scalars[15] = a16;
  994. }
  995. // Construct a portable vector from a single duplicated scalar
  996. explicit U8x16(uint8_t scalar) {
  997. this->scalars[0] = scalar;
  998. this->scalars[1] = scalar;
  999. this->scalars[2] = scalar;
  1000. this->scalars[3] = scalar;
  1001. this->scalars[4] = scalar;
  1002. this->scalars[5] = scalar;
  1003. this->scalars[6] = scalar;
  1004. this->scalars[7] = scalar;
  1005. this->scalars[8] = scalar;
  1006. this->scalars[9] = scalar;
  1007. this->scalars[10] = scalar;
  1008. this->scalars[11] = scalar;
  1009. this->scalars[12] = scalar;
  1010. this->scalars[13] = scalar;
  1011. this->scalars[14] = scalar;
  1012. this->scalars[15] = scalar;
  1013. }
  1014. #endif
  1015. // Create a gradient vector using start and increment, so that arbitrary length vectors have a way to initialize linear iterations.
  1016. static inline U8x16 createGradient(uint8_t start, uint8_t increment) {
  1017. return U8x16(
  1018. start,
  1019. start + increment,
  1020. start + increment * 2,
  1021. start + increment * 3,
  1022. start + increment * 4,
  1023. start + increment * 5,
  1024. start + increment * 6,
  1025. start + increment * 7,
  1026. start + increment * 8,
  1027. start + increment * 9,
  1028. start + increment * 10,
  1029. start + increment * 11,
  1030. start + increment * 12,
  1031. start + increment * 13,
  1032. start + increment * 14,
  1033. start + increment * 15
  1034. );
  1035. }
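// Note that the lanes are 8-bit, so start + increment * 15 silently wraps around modulo 256.
// For example, createGradient(250, 1) produces 250, 251, ..., 255, 0, 1, ...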
  1036. static inline U8x16 readAlignedUnsafe(const uint8_t* data) {
  1037. #if defined USE_BASIC_SIMD
  1038. #if defined USE_SSE2
  1039. return U8x16(_mm_load_si128((const __m128i*)data));
  1040. #elif defined USE_NEON
  1041. return U8x16(vld1q_u8(data));
  1042. #endif
  1043. #else
  1044. return U8x16(
  1045. data[0], data[1], data[2], data[3], data[4], data[5], data[6], data[7],
  1046. data[8], data[9], data[10], data[11], data[12], data[13], data[14], data[15]
  1047. );
  1048. #endif
  1049. }
// data must be aligned to 16 bytes
  1051. inline void writeAlignedUnsafe(uint8_t* data) const {
  1052. #if defined USE_BASIC_SIMD
  1053. #if defined USE_SSE2
  1054. _mm_store_si128((__m128i*)data, this->v);
  1055. #elif defined USE_NEON
  1056. vst1q_u8(data, this->v);
  1057. #endif
  1058. #else
  1059. data[0] = this->scalars[0];
  1060. data[1] = this->scalars[1];
  1061. data[2] = this->scalars[2];
  1062. data[3] = this->scalars[3];
  1063. data[4] = this->scalars[4];
  1064. data[5] = this->scalars[5];
  1065. data[6] = this->scalars[6];
  1066. data[7] = this->scalars[7];
  1067. data[8] = this->scalars[8];
  1068. data[9] = this->scalars[9];
  1069. data[10] = this->scalars[10];
  1070. data[11] = this->scalars[11];
  1071. data[12] = this->scalars[12];
  1072. data[13] = this->scalars[13];
  1073. data[14] = this->scalars[14];
  1074. data[15] = this->scalars[15];
  1075. #endif
  1076. }
  1077. // Bound and alignment checked reading
  1078. static inline U8x16 readAligned(dsr::SafePointer<const uint8_t> data, const char* methodName) {
  1079. const uint8_t* pointer = data.getUnsafe();
  1080. assert(((uintptr_t)pointer & 15) == 0);
  1081. #if defined SAFE_POINTER_CHECKS
  1082. data.assertInside(methodName, pointer, 16);
  1083. #endif
  1084. return U8x16::readAlignedUnsafe(pointer);
  1085. }
  1086. // Bound and alignment checked writing
  1087. inline void writeAligned(dsr::SafePointer<uint8_t> data, const char* methodName) const {
  1088. uint8_t* pointer = data.getUnsafe();
  1089. assert(((uintptr_t)pointer & 15) == 0);
  1090. #if defined SAFE_POINTER_CHECKS
  1091. data.assertInside(methodName, pointer, 16);
  1092. #endif
  1093. this->writeAlignedUnsafe(pointer);
  1094. }
  1095. };
  1096. union F32x8 {
  1097. private:
  1098. // The uninitialized default constructor is private for safety reasons.
  1099. F32x8() {}
  1100. public:
  1101. // When the uninitialized constructor is needed for performance, use this named constructor instead.
  1102. static inline F32x8 create_dangerous_uninitialized() { return F32x8(); }
  1103. #if defined USE_256BIT_F_SIMD
  1104. public:
  1105. #if defined USE_DIRECT_SIMD_MEMORY_ACCESS
  1106. // Only use if USE_DIRECT_SIMD_MEMORY_ACCESS is defined!
  1107. float scalars[8];
  1108. #endif
  1109. // The SIMD vector of undefined type
  1110. // Not accessible while emulating!
  1111. SIMD_F32x8 v;
  1112. // Construct a portable vector from a native SIMD vector
  1113. explicit F32x8(const SIMD_F32x8& v) : v(v) {}
  1114. // Construct a portable vector from a set of scalars
  1115. F32x8(float a1, float a2, float a3, float a4, float a5, float a6, float a7, float a8)
  1116. : v(LOAD_VECTOR_F32_SIMD256(a1, a2, a3, a4, a5, a6, a7, a8)) {}
  1117. // Construct a portable vector from a single duplicated scalar
  1118. explicit F32x8(float scalar) : v(LOAD_SCALAR_F32_SIMD256(scalar)) {}
  1119. #else
  1120. public:
  1121. // Emulate a SIMD vector as an array of scalars without hardware support.
  1122. // Only accessible while emulating!
  1123. float scalars[8];
  1124. // Construct a portable vector from a set of scalars
  1125. F32x8(float a1, float a2, float a3, float a4, float a5, float a6, float a7, float a8) {
  1126. this->scalars[0] = a1;
  1127. this->scalars[1] = a2;
  1128. this->scalars[2] = a3;
  1129. this->scalars[3] = a4;
  1130. this->scalars[4] = a5;
  1131. this->scalars[5] = a6;
  1132. this->scalars[6] = a7;
  1133. this->scalars[7] = a8;
  1134. }
  1135. // Construct a portable vector from a single duplicated scalar
  1136. explicit F32x8(float scalar) {
  1137. this->scalars[0] = scalar;
  1138. this->scalars[1] = scalar;
  1139. this->scalars[2] = scalar;
  1140. this->scalars[3] = scalar;
  1141. this->scalars[4] = scalar;
  1142. this->scalars[5] = scalar;
  1143. this->scalars[6] = scalar;
  1144. this->scalars[7] = scalar;
  1145. }
  1146. #endif
  1147. // Create a gradient vector using start and increment, so that arbitrary length vectors have a way to initialize linear iterations.
  1148. static inline F32x8 createGradient(float start, float increment) {
  1149. return F32x8(
  1150. start,
  1151. start + increment,
  1152. start + increment * 2.0f,
  1153. start + increment * 3.0f,
  1154. start + increment * 4.0f,
  1155. start + increment * 5.0f,
  1156. start + increment * 6.0f,
  1157. start + increment * 7.0f
  1158. );
  1159. }
  1160. // Construct a portable SIMD vector from a pointer to aligned data
// data must be aligned to 32 bytes
  1162. static inline F32x8 readAlignedUnsafe(const float* data) {
  1163. #if defined USE_AVX2
  1164. return F32x8(_mm256_load_ps(data));
  1165. #else
  1166. return F32x8(data[0], data[1], data[2], data[3], data[4], data[5], data[6], data[7]);
  1167. #endif
  1168. }
  1169. // Write to aligned memory from the existing vector
// data must be aligned to 32 bytes
  1171. inline void writeAlignedUnsafe(float* data) const {
  1172. #if defined USE_AVX2
  1173. _mm256_store_ps(data, this->v);
  1174. #else
  1175. data[0] = this->scalars[0];
  1176. data[1] = this->scalars[1];
  1177. data[2] = this->scalars[2];
  1178. data[3] = this->scalars[3];
  1179. data[4] = this->scalars[4];
  1180. data[5] = this->scalars[5];
  1181. data[6] = this->scalars[6];
  1182. data[7] = this->scalars[7];
  1183. #endif
  1184. }
  1185. // Bound and alignment checked reading
  1186. static inline F32x8 readAligned(dsr::SafePointer<const float> data, const char* methodName) {
  1187. const float* pointer = data.getUnsafe();
  1188. assert(((uintptr_t)pointer & 31) == 0);
  1189. #if defined SAFE_POINTER_CHECKS
  1190. data.assertInside(methodName, pointer, 32);
  1191. #endif
  1192. return F32x8::readAlignedUnsafe(pointer);
  1193. }
  1194. // Bound and alignment checked writing
  1195. inline void writeAligned(dsr::SafePointer<float> data, const char* methodName) const {
  1196. float* pointer = data.getUnsafe();
  1197. assert(((uintptr_t)pointer & 31) == 0);
  1198. #if defined SAFE_POINTER_CHECKS
  1199. data.assertInside(methodName, pointer, 32);
  1200. #endif
  1201. this->writeAlignedUnsafe(pointer);
  1202. }
  1203. // 1 / x
  1204. // Useful for multiple divisions with the same denominator
  1205. // Useless if the denominator is a constant
  1206. F32x8 reciprocal() const {
  1207. #if defined USE_AVX2
  1208. // Approximate
  1209. SIMD_F32x8 lowQ = _mm256_rcp_ps(this->v);
  1210. // Refine
  1211. return F32x8(SUB_F32_SIMD256(ADD_F32_SIMD256(lowQ, lowQ), MUL_F32_SIMD256(this->v, MUL_F32_SIMD256(lowQ, lowQ))));
  1212. #else
  1213. return F32x8(
  1214. 1.0f / this->scalars[0],
  1215. 1.0f / this->scalars[1],
  1216. 1.0f / this->scalars[2],
  1217. 1.0f / this->scalars[3],
  1218. 1.0f / this->scalars[4],
  1219. 1.0f / this->scalars[5],
  1220. 1.0f / this->scalars[6],
  1221. 1.0f / this->scalars[7]
  1222. );
  1223. #endif
  1224. }
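// The AVX2 path above uses one Newton-Raphson step to sharpen the roughly 12-bit
// _mm256_rcp_ps estimate: with denominator d and estimate r close to 1/d,
//   r' = 2*r - d*r*r
// which is exactly the SUB(ADD(lowQ, lowQ), MUL(v, MUL(lowQ, lowQ))) expression.
// Worked example: d = 4, r = 0.26 gives r' = 0.52 - 4*0.0676 = 0.2496, close to 1/4.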
  1225. // 1 / sqrt(x)
  1226. // Useful for normalizing vectors
  1227. F32x8 reciprocalSquareRoot() const {
  1228. #if defined USE_AVX2
  1230. SIMD_F32x8 reciRoot = _mm256_rsqrt_ps(this->v);
  1231. SIMD_F32x8 mul = MUL_F32_SIMD256(MUL_F32_SIMD256(this->v, reciRoot), reciRoot);
  1232. reciRoot = MUL_F32_SIMD256(MUL_F32_SIMD256(LOAD_SCALAR_F32_SIMD256(0.5f), reciRoot), SUB_F32_SIMD256(LOAD_SCALAR_F32_SIMD256(3.0f), mul));
  1233. return F32x8(reciRoot);
  1234. #else
  1235. return F32x8(
  1236. 1.0f / sqrt(this->scalars[0]),
  1237. 1.0f / sqrt(this->scalars[1]),
  1238. 1.0f / sqrt(this->scalars[2]),
  1239. 1.0f / sqrt(this->scalars[3]),
  1240. 1.0f / sqrt(this->scalars[4]),
  1241. 1.0f / sqrt(this->scalars[5]),
  1242. 1.0f / sqrt(this->scalars[6]),
  1243. 1.0f / sqrt(this->scalars[7])
  1244. );
  1245. #endif
  1246. }
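// Same idea as reciprocal(): one Newton-Raphson step for 1/sqrt(d) refines the
// _mm256_rsqrt_ps estimate r with
//   r' = 0.5 * r * (3 - d*r*r)
// matching the MUL(MUL(0.5, reciRoot), SUB(3.0, mul)) expression above.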
  1247. // sqrt(x)
  1248. // Useful for getting lengths of vectors
  1249. F32x8 squareRoot() const {
  1250. #if defined USE_AVX2
  1251. SIMD_F32x8 half = LOAD_SCALAR_F32_SIMD256(0.5f);
  1252. // Approximate
  1253. SIMD_F32x8 root = _mm256_sqrt_ps(this->v);
  1254. // Refine
  1255. root = _mm256_mul_ps(_mm256_add_ps(root, _mm256_div_ps(this->v, root)), half);
  1256. return F32x8(root);
  1257. #else
  1258. return F32x8(
  1259. sqrt(this->scalars[0]),
  1260. sqrt(this->scalars[1]),
  1261. sqrt(this->scalars[2]),
  1262. sqrt(this->scalars[3]),
  1263. sqrt(this->scalars[4]),
  1264. sqrt(this->scalars[5]),
  1265. sqrt(this->scalars[6]),
  1266. sqrt(this->scalars[7]));
  1267. #endif
  1268. }
  1269. F32x8 clamp(float minimum, float maximum) const {
  1270. #if defined USE_256BIT_F_SIMD
  1271. return F32x8(MIN_F32_SIMD256(MAX_F32_SIMD256(this->v, LOAD_SCALAR_F32_SIMD256(minimum)), LOAD_SCALAR_F32_SIMD256(maximum)));
  1272. #else
  1273. float val0 = this->scalars[0];
  1274. float val1 = this->scalars[1];
  1275. float val2 = this->scalars[2];
  1276. float val3 = this->scalars[3];
  1277. float val4 = this->scalars[4];
  1278. float val5 = this->scalars[5];
  1279. float val6 = this->scalars[6];
  1280. float val7 = this->scalars[7];
  1281. if (minimum > val0) { val0 = minimum; }
  1282. if (maximum < val0) { val0 = maximum; }
  1283. if (minimum > val1) { val1 = minimum; }
  1284. if (maximum < val1) { val1 = maximum; }
  1285. if (minimum > val2) { val2 = minimum; }
  1286. if (maximum < val2) { val2 = maximum; }
  1287. if (minimum > val3) { val3 = minimum; }
  1288. if (maximum < val3) { val3 = maximum; }
  1289. if (minimum > val4) { val4 = minimum; }
  1290. if (maximum < val4) { val4 = maximum; }
  1291. if (minimum > val5) { val5 = minimum; }
  1292. if (maximum < val5) { val5 = maximum; }
  1293. if (minimum > val6) { val6 = minimum; }
  1294. if (maximum < val6) { val6 = maximum; }
  1295. if (minimum > val7) { val7 = minimum; }
  1296. if (maximum < val7) { val7 = maximum; }
  1297. return F32x8(val0, val1, val2, val3, val4, val5, val6, val7);
  1298. #endif
  1299. }
  1300. F32x8 clampLower(float minimum) const {
  1301. #if defined USE_256BIT_F_SIMD
  1302. return F32x8(MAX_F32_SIMD256(this->v, LOAD_SCALAR_F32_SIMD256(minimum)));
  1303. #else
  1304. float val0 = this->scalars[0];
  1305. float val1 = this->scalars[1];
  1306. float val2 = this->scalars[2];
  1307. float val3 = this->scalars[3];
  1308. float val4 = this->scalars[4];
  1309. float val5 = this->scalars[5];
  1310. float val6 = this->scalars[6];
  1311. float val7 = this->scalars[7];
  1312. if (minimum > val0) { val0 = minimum; }
  1313. if (minimum > val1) { val1 = minimum; }
  1314. if (minimum > val2) { val2 = minimum; }
  1315. if (minimum > val3) { val3 = minimum; }
  1316. if (minimum > val4) { val4 = minimum; }
  1317. if (minimum > val5) { val5 = minimum; }
  1318. if (minimum > val6) { val6 = minimum; }
  1319. if (minimum > val7) { val7 = minimum; }
  1320. return F32x8(val0, val1, val2, val3, val4, val5, val6, val7);
  1321. #endif
  1322. }
  1323. F32x8 clampUpper(float maximum) const {
  1324. #if defined USE_256BIT_F_SIMD
  1325. return F32x8(MIN_F32_SIMD256(this->v, LOAD_SCALAR_F32_SIMD256(maximum)));
  1326. #else
  1327. float val0 = this->scalars[0];
  1328. float val1 = this->scalars[1];
  1329. float val2 = this->scalars[2];
  1330. float val3 = this->scalars[3];
  1331. float val4 = this->scalars[4];
  1332. float val5 = this->scalars[5];
  1333. float val6 = this->scalars[6];
  1334. float val7 = this->scalars[7];
  1335. if (maximum < val0) { val0 = maximum; }
  1336. if (maximum < val1) { val1 = maximum; }
  1337. if (maximum < val2) { val2 = maximum; }
  1338. if (maximum < val3) { val3 = maximum; }
  1339. if (maximum < val4) { val4 = maximum; }
  1340. if (maximum < val5) { val5 = maximum; }
  1341. if (maximum < val6) { val6 = maximum; }
  1342. if (maximum < val7) { val7 = maximum; }
  1343. return F32x8(val0, val1, val2, val3, val4, val5, val6, val7);
  1344. #endif
  1345. }
  1346. };
  1347. union I32x8 {
  1348. private:
  1349. // The uninitialized default constructor is private for safety reasons.
  1350. I32x8() {}
  1351. public:
  1352. // When the uninitialized constructor is needed for performance, use this named constructor instead.
  1353. static inline I32x8 create_dangerous_uninitialized() { return I32x8(); }
  1354. #if defined USE_256BIT_X_SIMD
  1355. public:
  1356. #if defined USE_DIRECT_SIMD_MEMORY_ACCESS
  1357. // Only use if USE_DIRECT_SIMD_MEMORY_ACCESS is defined!
  1358. int32_t scalars[8];
  1359. #endif
  1360. // The SIMD vector of undefined type
  1361. // Not accessible while emulating!
  1362. SIMD_I32x8 v;
  1363. // Construct a portable vector from a native SIMD vector
  1364. explicit I32x8(const SIMD_I32x8& v) : v(v) {}
  1365. // Construct a portable vector from a set of scalars
  1366. I32x8(int32_t a1, int32_t a2, int32_t a3, int32_t a4, int32_t a5, int32_t a6, int32_t a7, int32_t a8)
  1367. : v(LOAD_VECTOR_I32_SIMD256(a1, a2, a3, a4, a5, a6, a7, a8)) {}
  1368. // Construct a portable vector from a single duplicated scalar
  1369. explicit I32x8(int32_t scalar) : v(LOAD_SCALAR_I32_SIMD256(scalar)) {}
  1370. #else
  1371. public:
  1372. // Emulate a SIMD vector as an array of scalars without hardware support.
  1373. // Only accessible while emulating!
  1374. int32_t scalars[8];
  1375. // Construct a portable vector from a set of scalars
  1376. I32x8(int32_t a1, int32_t a2, int32_t a3, int32_t a4, int32_t a5, int32_t a6, int32_t a7, int32_t a8) {
  1377. this->scalars[0] = a1;
  1378. this->scalars[1] = a2;
  1379. this->scalars[2] = a3;
  1380. this->scalars[3] = a4;
  1381. this->scalars[4] = a5;
  1382. this->scalars[5] = a6;
  1383. this->scalars[6] = a7;
  1384. this->scalars[7] = a8;
  1385. }
  1386. // Construct a portable vector from a single duplicated scalar
  1387. explicit I32x8(int32_t scalar) {
  1388. this->scalars[0] = scalar;
  1389. this->scalars[1] = scalar;
  1390. this->scalars[2] = scalar;
  1391. this->scalars[3] = scalar;
  1392. this->scalars[4] = scalar;
  1393. this->scalars[5] = scalar;
  1394. this->scalars[6] = scalar;
  1395. this->scalars[7] = scalar;
  1396. }
  1397. #endif
  1398. // Create a gradient vector using start and increment, so that arbitrary length vectors have a way to initialize linear iterations.
  1399. static inline I32x8 createGradient(int32_t start, int32_t increment) {
  1400. return I32x8(
  1401. start,
  1402. start + increment,
  1403. start + increment * 2,
  1404. start + increment * 3,
  1405. start + increment * 4,
  1406. start + increment * 5,
  1407. start + increment * 6,
  1408. start + increment * 7
  1409. );
  1410. }
  1411. // Construct a portable SIMD vector from a pointer to aligned data
// data must be aligned to 32 bytes
  1413. static inline I32x8 readAlignedUnsafe(const int32_t* data) {
  1414. #if defined USE_AVX2
  1415. return I32x8(_mm256_load_si256((const __m256i*)data));
  1416. #else
  1417. return I32x8(data[0], data[1], data[2], data[3], data[4], data[5], data[6], data[7]);
  1418. #endif
  1419. }
  1420. // Write to aligned memory from the existing vector
// data must be aligned to 32 bytes
  1422. inline void writeAlignedUnsafe(int32_t* data) const {
  1423. #if defined USE_AVX2
  1424. _mm256_store_si256((__m256i*)data, this->v);
  1425. #else
  1426. data[0] = this->scalars[0];
  1427. data[1] = this->scalars[1];
  1428. data[2] = this->scalars[2];
  1429. data[3] = this->scalars[3];
  1430. data[4] = this->scalars[4];
  1431. data[5] = this->scalars[5];
  1432. data[6] = this->scalars[6];
  1433. data[7] = this->scalars[7];
  1434. #endif
  1435. }
  1436. // Bound and alignment checked reading
  1437. static inline I32x8 readAligned(dsr::SafePointer<const int32_t> data, const char* methodName) {
  1438. const int32_t* pointer = data.getUnsafe();
  1439. assert(((uintptr_t)pointer & 31) == 0);
  1440. #if defined SAFE_POINTER_CHECKS
  1441. data.assertInside(methodName, pointer, 32);
  1442. #endif
  1443. return I32x8::readAlignedUnsafe(pointer);
  1444. }
  1445. // Bound and alignment checked writing
  1446. inline void writeAligned(dsr::SafePointer<int32_t> data, const char* methodName) const {
  1447. int32_t* pointer = data.getUnsafe();
  1448. assert(((uintptr_t)pointer & 31) == 0);
  1449. #if defined SAFE_POINTER_CHECKS
  1450. data.assertInside(methodName, pointer, 32);
  1451. #endif
  1452. this->writeAlignedUnsafe(pointer);
  1453. }
  1454. };
  1455. union U32x8 {
  1456. private:
  1457. // The uninitialized default constructor is private for safety reasons.
  1458. U32x8() {}
  1459. public:
  1460. // When the uninitialized constructor is needed for performance, use this named constructor instead.
  1461. static inline U32x8 create_dangerous_uninitialized() { return U32x8(); }
  1462. #if defined USE_256BIT_X_SIMD
  1463. public:
  1464. #if defined USE_DIRECT_SIMD_MEMORY_ACCESS
  1465. // Only use if USE_DIRECT_SIMD_MEMORY_ACCESS is defined!
  1466. uint32_t scalars[8];
  1467. #endif
  1468. // The SIMD vector of undefined type
  1469. // Not accessible while emulating!
  1470. SIMD_U32x8 v;
  1471. // Construct a portable vector from a native SIMD vector
  1472. explicit U32x8(const SIMD_U32x8& v) : v(v) {}
  1473. // Construct a portable vector from a set of scalars
  1474. U32x8(uint32_t a1, uint32_t a2, uint32_t a3, uint32_t a4, uint32_t a5, uint32_t a6, uint32_t a7, uint32_t a8)
  1475. : v(LOAD_VECTOR_U32_SIMD256(a1, a2, a3, a4, a5, a6, a7, a8)) {}
  1476. // Construct a portable vector from a single duplicated scalar
  1477. explicit U32x8(uint32_t scalar) : v(LOAD_SCALAR_U32_SIMD256(scalar)) {}
  1478. #else
  1479. public:
  1480. // Emulate a SIMD vector as an array of scalars without hardware support.
  1481. // Only accessible while emulating!
  1482. uint32_t scalars[8];
  1483. // Construct a portable vector from a set of scalars
  1484. U32x8(uint32_t a1, uint32_t a2, uint32_t a3, uint32_t a4, uint32_t a5, uint32_t a6, uint32_t a7, uint32_t a8) {
  1485. this->scalars[0] = a1;
  1486. this->scalars[1] = a2;
  1487. this->scalars[2] = a3;
  1488. this->scalars[3] = a4;
  1489. this->scalars[4] = a5;
  1490. this->scalars[5] = a6;
  1491. this->scalars[6] = a7;
  1492. this->scalars[7] = a8;
  1493. }
  1494. // Construct a portable vector from a single duplicated scalar
  1495. explicit U32x8(uint32_t scalar) {
  1496. this->scalars[0] = scalar;
  1497. this->scalars[1] = scalar;
  1498. this->scalars[2] = scalar;
  1499. this->scalars[3] = scalar;
  1500. this->scalars[4] = scalar;
  1501. this->scalars[5] = scalar;
  1502. this->scalars[6] = scalar;
  1503. this->scalars[7] = scalar;
  1504. }
  1505. #endif
  1506. // Create a gradient vector using start and increment, so that arbitrary length vectors have a way to initialize linear iterations.
  1507. static inline U32x8 createGradient(uint32_t start, uint32_t increment) {
  1508. return U32x8(
  1509. start,
  1510. start + increment,
  1511. start + increment * 2,
  1512. start + increment * 3,
  1513. start + increment * 4,
  1514. start + increment * 5,
  1515. start + increment * 6,
  1516. start + increment * 7
  1517. );
  1518. }
  1519. // Construct a portable SIMD vector from a pointer to aligned data
// data must be aligned to 32 bytes
  1521. static inline U32x8 readAlignedUnsafe(const uint32_t* data) {
  1522. #if defined USE_AVX2
  1523. return U32x8(_mm256_load_si256((const __m256i*)data));
  1524. #else
  1525. return U32x8(data[0], data[1], data[2], data[3], data[4], data[5], data[6], data[7]);
  1526. #endif
  1527. }
  1528. // Write to aligned memory from the existing vector
// data must be aligned to 32 bytes
  1530. inline void writeAlignedUnsafe(uint32_t* data) const {
  1531. #if defined USE_AVX2
  1532. _mm256_store_si256((__m256i*)data, this->v);
  1533. #else
  1534. data[0] = this->scalars[0];
  1535. data[1] = this->scalars[1];
  1536. data[2] = this->scalars[2];
  1537. data[3] = this->scalars[3];
  1538. data[4] = this->scalars[4];
  1539. data[5] = this->scalars[5];
  1540. data[6] = this->scalars[6];
  1541. data[7] = this->scalars[7];
  1542. #endif
  1543. }
  1544. // Bound and alignment checked reading
  1545. static inline U32x8 readAligned(dsr::SafePointer<const uint32_t> data, const char* methodName) {
  1546. const uint32_t* pointer = data.getUnsafe();
  1547. assert(((uintptr_t)pointer & 31) == 0);
  1548. #if defined SAFE_POINTER_CHECKS
  1549. data.assertInside(methodName, pointer, 32);
  1550. #endif
  1551. return U32x8::readAlignedUnsafe(pointer);
  1552. }
  1553. // Bound and alignment checked writing
  1554. inline void writeAligned(dsr::SafePointer<uint32_t> data, const char* methodName) const {
  1555. uint32_t* pointer = data.getUnsafe();
  1556. assert(((uintptr_t)pointer & 31) == 0);
  1557. #if defined SAFE_POINTER_CHECKS
  1558. data.assertInside(methodName, pointer, 32);
  1559. #endif
  1560. this->writeAlignedUnsafe(pointer);
  1561. }
  1562. };
  1563. union U16x16 {
  1564. private:
  1565. // The uninitialized default constructor is private for safety reasons.
  1566. U16x16() {}
  1567. public:
  1568. // When the uninitialized constructor is needed for performance, use this named constructor instead.
  1569. static inline U16x16 create_dangerous_uninitialized() { return U16x16(); }
  1570. #if defined USE_256BIT_X_SIMD
  1571. public:
  1572. #if defined USE_DIRECT_SIMD_MEMORY_ACCESS
  1573. // Only use if USE_DIRECT_SIMD_MEMORY_ACCESS is defined!
  1574. uint16_t scalars[16];
  1575. #endif
  1576. // The SIMD vector of undefined type
  1577. // Not accessible while emulating!
  1578. SIMD_U16x16 v;
  1579. // Construct a portable vector from a native SIMD vector
  1580. explicit U16x16(const SIMD_U16x16& v) : v(v) {}
  1581. // Construct a vector of 16 x 16-bit unsigned integers from a vector of 8 x 32-bit unsigned integers
  1582. // Reinterpret casting is used
  1583. explicit U16x16(const U32x8& vector) : v(REINTERPRET_U32_TO_U16_SIMD256(vector.v)) {}
  1584. // Construct a portable vector from a set of scalars
  1585. U16x16(uint16_t a1, uint16_t a2, uint16_t a3, uint16_t a4, uint16_t a5, uint16_t a6, uint16_t a7, uint16_t a8,
  1586. uint16_t a9, uint16_t a10, uint16_t a11, uint16_t a12, uint16_t a13, uint16_t a14, uint16_t a15, uint16_t a16)
  1587. : v(LOAD_VECTOR_U16_SIMD256(a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11, a12, a13, a14, a15, a16)) {}
// Construct a vector of 16 x 16-bit unsigned integers from a single duplicated 32-bit unsigned integer
  1589. // Reinterpret casting is used
// TODO: Remove all reinterprets from constructors to improve readability
  1591. explicit U16x16(uint32_t scalar) : v(REINTERPRET_U32_TO_U16_SIMD256(LOAD_SCALAR_U32_SIMD256(scalar))) {}
  1592. // Construct a portable vector from a single duplicated scalar
  1593. explicit U16x16(uint16_t scalar) : v(LOAD_SCALAR_U16_SIMD256(scalar)) {}
// Reinterpret cast to a vector of 8 x 32-bit unsigned integers
  1595. U32x8 get_U32() const {
  1596. return U32x8(REINTERPRET_U16_TO_U32_SIMD256(this->v));
  1597. }
  1598. #else
  1599. public:
  1600. // Emulate a SIMD vector as an array of scalars without hardware support.
  1601. // Only accessible while emulating!
  1602. uint16_t scalars[16];
// Construct a vector of 16 x 16-bit unsigned integers from a vector of 8 x 32-bit unsigned integers
  1604. // Reinterpret casting is used
  1605. explicit U16x16(const U32x8& vector) {
  1606. uint64_t *target = (uint64_t*)this->scalars;
  1607. uint64_t *source = (uint64_t*)vector.scalars;
  1608. target[0] = source[0];
  1609. target[1] = source[1];
  1610. target[2] = source[2];
  1611. target[3] = source[3];
  1612. }
  1613. // Construct a portable vector from a set of scalars
  1614. U16x16(uint16_t a1, uint16_t a2, uint16_t a3, uint16_t a4, uint16_t a5, uint16_t a6, uint16_t a7, uint16_t a8,
  1615. uint16_t a9, uint16_t a10, uint16_t a11, uint16_t a12, uint16_t a13, uint16_t a14, uint16_t a15, uint16_t a16) {
  1616. this->scalars[0] = a1;
  1617. this->scalars[1] = a2;
  1618. this->scalars[2] = a3;
  1619. this->scalars[3] = a4;
  1620. this->scalars[4] = a5;
  1621. this->scalars[5] = a6;
  1622. this->scalars[6] = a7;
  1623. this->scalars[7] = a8;
  1624. this->scalars[8] = a9;
  1625. this->scalars[9] = a10;
  1626. this->scalars[10] = a11;
  1627. this->scalars[11] = a12;
  1628. this->scalars[12] = a13;
  1629. this->scalars[13] = a14;
  1630. this->scalars[14] = a15;
  1631. this->scalars[15] = a16;
  1632. }
// Construct a vector of 16 x 16-bit unsigned integers from a single duplicated 32-bit unsigned integer
  1634. // Reinterpret casting is used
  1635. explicit U16x16(uint32_t scalar) {
  1636. uint32_t *target = (uint32_t*)this->scalars;
  1637. target[0] = scalar;
  1638. target[1] = scalar;
  1639. target[2] = scalar;
  1640. target[3] = scalar;
  1641. target[4] = scalar;
  1642. target[5] = scalar;
  1643. target[6] = scalar;
  1644. target[7] = scalar;
  1645. }
  1646. // Construct a portable vector from a single duplicated scalar
  1647. explicit U16x16(uint16_t scalar) {
  1648. this->scalars[0] = scalar;
  1649. this->scalars[1] = scalar;
  1650. this->scalars[2] = scalar;
  1651. this->scalars[3] = scalar;
  1652. this->scalars[4] = scalar;
  1653. this->scalars[5] = scalar;
  1654. this->scalars[6] = scalar;
  1655. this->scalars[7] = scalar;
  1656. this->scalars[8] = scalar;
  1657. this->scalars[9] = scalar;
  1658. this->scalars[10] = scalar;
  1659. this->scalars[11] = scalar;
  1660. this->scalars[12] = scalar;
  1661. this->scalars[13] = scalar;
  1662. this->scalars[14] = scalar;
  1663. this->scalars[15] = scalar;
  1664. }
  1665. // Reinterpret cast to a vector of 8 x 32-bit unsigned integers
  1666. U32x8 get_U32() const {
  1667. U32x8 result(0);
  1668. uint64_t *target = (uint64_t*)result.scalars;
  1669. uint64_t *source = (uint64_t*)this->scalars;
  1670. target[0] = source[0];
  1671. target[1] = source[1];
  1672. target[2] = source[2];
  1673. target[3] = source[3];
  1674. return result;
  1675. }
  1676. #endif
  1677. // Create a gradient vector using start and increment, so that arbitrary length vectors have a way to initialize linear iterations.
  1678. static inline U16x16 createGradient(uint16_t start, uint16_t increment) {
  1679. return U16x16(
  1680. start,
  1681. start + increment,
  1682. start + increment * 2,
  1683. start + increment * 3,
  1684. start + increment * 4,
  1685. start + increment * 5,
  1686. start + increment * 6,
  1687. start + increment * 7,
  1688. start + increment * 8,
  1689. start + increment * 9,
  1690. start + increment * 10,
  1691. start + increment * 11,
  1692. start + increment * 12,
  1693. start + increment * 13,
  1694. start + increment * 14,
  1695. start + increment * 15
  1696. );
  1697. }
// data must be aligned to 32 bytes
  1702. static inline U16x16 readAlignedUnsafe(const uint16_t* data) {
  1703. #if defined USE_AVX2
  1704. return U16x16(_mm256_load_si256((const __m256i*)data));
  1705. #else
  1706. return U16x16(
  1707. data[0],
  1708. data[1],
  1709. data[2],
  1710. data[3],
  1711. data[4],
  1712. data[5],
  1713. data[6],
  1714. data[7],
  1715. data[8],
  1716. data[9],
  1717. data[10],
  1718. data[11],
  1719. data[12],
  1720. data[13],
  1721. data[14],
  1722. data[15]
  1723. );
  1724. #endif
  1725. }
// data must be aligned to 32 bytes
  1727. inline void writeAlignedUnsafe(uint16_t* data) const {
  1728. #if defined USE_AVX2
  1729. _mm256_store_si256((__m256i*)data, this->v);
  1730. #else
  1731. data[0] = this->scalars[0];
  1732. data[1] = this->scalars[1];
  1733. data[2] = this->scalars[2];
  1734. data[3] = this->scalars[3];
  1735. data[4] = this->scalars[4];
  1736. data[5] = this->scalars[5];
  1737. data[6] = this->scalars[6];
  1738. data[7] = this->scalars[7];
  1739. data[8] = this->scalars[8];
  1740. data[9] = this->scalars[9];
  1741. data[10] = this->scalars[10];
  1742. data[11] = this->scalars[11];
  1743. data[12] = this->scalars[12];
  1744. data[13] = this->scalars[13];
  1745. data[14] = this->scalars[14];
  1746. data[15] = this->scalars[15];
  1747. #endif
  1748. }
  1749. // Bound and alignment checked reading
  1750. static inline U16x16 readAligned(dsr::SafePointer<const uint16_t> data, const char* methodName) {
  1751. const uint16_t* pointer = data.getUnsafe();
  1752. assert(((uintptr_t)pointer & 31) == 0);
  1753. #if defined SAFE_POINTER_CHECKS
  1754. data.assertInside(methodName, pointer, 32);
  1755. #endif
  1756. return U16x16::readAlignedUnsafe(pointer);
  1757. }
  1758. // Bound and alignment checked writing
  1759. inline void writeAligned(dsr::SafePointer<uint16_t> data, const char* methodName) const {
  1760. uint16_t* pointer = data.getUnsafe();
  1761. assert(((uintptr_t)pointer & 31) == 0);
  1762. #if defined SAFE_POINTER_CHECKS
  1763. data.assertInside(methodName, pointer, 32);
  1764. #endif
  1765. this->writeAlignedUnsafe(pointer);
  1766. }
  1767. };
  1768. union U8x32 {
  1769. private:
  1770. // The uninitialized default constructor is private for safety reasons.
  1771. U8x32() {}
  1772. public:
  1773. // When the uninitialized constructor is needed for performance, use this named constructor instead.
  1774. static inline U8x32 create_dangerous_uninitialized() { return U8x32(); }
  1775. #if defined USE_256BIT_X_SIMD
  1776. public:
  1777. #if defined USE_DIRECT_SIMD_MEMORY_ACCESS
  1778. // Only use if USE_DIRECT_SIMD_MEMORY_ACCESS is defined!
  1779. uint8_t scalars[32];
  1780. #endif
  1781. // The SIMD vector of undefined type
  1782. // Not accessible while emulating!
  1783. SIMD_U8x32 v;
  1784. // Construct a portable vector from a native SIMD vector
  1785. explicit U8x32(const SIMD_U8x32& v) : v(v) {}
  1786. // Construct a portable vector from a set of scalars
  1787. U8x32(uint8_t a1, uint8_t a2, uint8_t a3, uint8_t a4, uint8_t a5, uint8_t a6, uint8_t a7, uint8_t a8,
  1788. uint8_t a9, uint8_t a10, uint8_t a11, uint8_t a12, uint8_t a13, uint8_t a14, uint8_t a15, uint8_t a16,
  1789. uint8_t a17, uint8_t a18, uint8_t a19, uint8_t a20, uint8_t a21, uint8_t a22, uint8_t a23, uint8_t a24,
  1790. uint8_t a25, uint8_t a26, uint8_t a27, uint8_t a28, uint8_t a29, uint8_t a30, uint8_t a31, uint8_t a32)
  1791. : v(LOAD_VECTOR_U8_SIMD256(
  1792. a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11, a12, a13, a14, a15, a16,
  1793. a17, a18, a19, a20, a21, a22, a23, a24, a25, a26, a27, a28, a29, a30, a31, a32
  1794. )) {}
  1795. // Construct a portable vector from a single duplicated scalar
  1796. explicit U8x32(uint8_t scalar) : v(LOAD_SCALAR_U8_SIMD256(scalar)) {}
  1797. #else
  1798. public:
  1799. // Emulate a SIMD vector as an array of scalars without hardware support.
  1800. // Only accessible while emulating!
  1801. uint8_t scalars[32];
  1802. // Construct a portable vector from a set of scalars
  1803. U8x32(uint8_t a1, uint8_t a2, uint8_t a3, uint8_t a4, uint8_t a5, uint8_t a6, uint8_t a7, uint8_t a8,
  1804. uint8_t a9, uint8_t a10, uint8_t a11, uint8_t a12, uint8_t a13, uint8_t a14, uint8_t a15, uint8_t a16,
  1805. uint8_t a17, uint8_t a18, uint8_t a19, uint8_t a20, uint8_t a21, uint8_t a22, uint8_t a23, uint8_t a24,
  1806. uint8_t a25, uint8_t a26, uint8_t a27, uint8_t a28, uint8_t a29, uint8_t a30, uint8_t a31, uint8_t a32) {
  1807. this->scalars[0] = a1;
  1808. this->scalars[1] = a2;
  1809. this->scalars[2] = a3;
  1810. this->scalars[3] = a4;
  1811. this->scalars[4] = a5;
  1812. this->scalars[5] = a6;
  1813. this->scalars[6] = a7;
  1814. this->scalars[7] = a8;
  1815. this->scalars[8] = a9;
  1816. this->scalars[9] = a10;
  1817. this->scalars[10] = a11;
  1818. this->scalars[11] = a12;
  1819. this->scalars[12] = a13;
  1820. this->scalars[13] = a14;
  1821. this->scalars[14] = a15;
  1822. this->scalars[15] = a16;
  1823. this->scalars[16] = a17;
  1824. this->scalars[17] = a18;
  1825. this->scalars[18] = a19;
  1826. this->scalars[19] = a20;
  1827. this->scalars[20] = a21;
  1828. this->scalars[21] = a22;
  1829. this->scalars[22] = a23;
  1830. this->scalars[23] = a24;
  1831. this->scalars[24] = a25;
  1832. this->scalars[25] = a26;
  1833. this->scalars[26] = a27;
  1834. this->scalars[27] = a28;
  1835. this->scalars[28] = a29;
  1836. this->scalars[29] = a30;
  1837. this->scalars[30] = a31;
  1838. this->scalars[31] = a32;
  1839. }
  1840. // Construct a portable vector from a single duplicated scalar
  1841. explicit U8x32(uint8_t scalar) {
  1842. for (int i = 0; i < 32; i++) {
  1843. this->scalars[i] = scalar;
  1844. }
  1845. }
  1846. #endif
  1847. // Create a gradient vector using start and increment, so that arbitrary length vectors have a way to initialize linear iterations.
  1848. static inline U8x32 createGradient(uint8_t start, uint8_t increment) {
  1849. return U8x32(
  1850. start,
  1851. start + increment,
  1852. start + increment * 2,
  1853. start + increment * 3,
  1854. start + increment * 4,
  1855. start + increment * 5,
  1856. start + increment * 6,
  1857. start + increment * 7,
  1858. start + increment * 8,
  1859. start + increment * 9,
  1860. start + increment * 10,
  1861. start + increment * 11,
  1862. start + increment * 12,
  1863. start + increment * 13,
  1864. start + increment * 14,
  1865. start + increment * 15,
  1866. start + increment * 16,
  1867. start + increment * 17,
  1868. start + increment * 18,
  1869. start + increment * 19,
  1870. start + increment * 20,
  1871. start + increment * 21,
  1872. start + increment * 22,
  1873. start + increment * 23,
  1874. start + increment * 24,
  1875. start + increment * 25,
  1876. start + increment * 26,
  1877. start + increment * 27,
  1878. start + increment * 28,
  1879. start + increment * 29,
  1880. start + increment * 30,
  1881. start + increment * 31
  1882. );
  1883. }
  1884. static inline U8x32 readAlignedUnsafe(const uint8_t* data) {
  1885. #if defined USE_AVX2
  1886. return U8x32(_mm256_load_si256((const __m256i*)data));
  1887. #else
  1888. U8x32 result;
  1889. for (int i = 0; i < 32; i++) {
  1890. result.scalars[i] = data[i];
  1891. }
  1892. return result;
  1893. #endif
  1894. }
// data must be aligned to 32 bytes
  1896. inline void writeAlignedUnsafe(uint8_t* data) const {
  1897. #if defined USE_AVX2
  1898. _mm256_store_si256((__m256i*)data, this->v);
  1899. #else
  1900. for (int i = 0; i < 32; i++) {
  1901. data[i] = this->scalars[i];
  1902. }
  1903. #endif
  1904. }
  1905. // Bound and alignment checked reading
  1906. static inline U8x32 readAligned(dsr::SafePointer<const uint8_t> data, const char* methodName) {
  1907. const uint8_t* pointer = data.getUnsafe();
  1908. assert(((uintptr_t)pointer & 31) == 0);
  1909. #if defined SAFE_POINTER_CHECKS
  1910. data.assertInside(methodName, pointer, 32);
  1911. #endif
  1912. return U8x32::readAlignedUnsafe(pointer);
  1913. }
  1914. // Bound and alignment checked writing
  1915. inline void writeAligned(dsr::SafePointer<uint8_t> data, const char* methodName) const {
  1916. uint8_t* pointer = data.getUnsafe();
  1917. assert(((uintptr_t)pointer & 31) == 0);
  1918. #if defined SAFE_POINTER_CHECKS
  1919. data.assertInside(methodName, pointer, 32);
  1920. #endif
  1921. this->writeAlignedUnsafe(pointer);
  1922. }
  1923. };
  1924. // Helper macros for doing things to certain sets of SIMD vector types
// Each macro expands DO(vector_type, element_type, lane_count) once per listed vector type.
  1926. #define FOR_ALL_VECTOR_TYPES(DO) \
  1927. DO(F32x4, float, 4) \
  1928. DO(I32x4, int32_t, 4) \
  1929. DO(U32x4, uint32_t, 4) \
  1930. DO(U16x8, uint16_t, 8) \
  1931. DO(U8x16, uint8_t, 16) \
  1932. DO(F32x8, float, 8) \
  1933. DO(I32x8, int32_t, 8) \
  1934. DO(U32x8, uint32_t, 8) \
  1935. DO(U16x16, uint16_t, 16) \
  1936. DO(U8x32, uint8_t, 32)
  1937. #define FOR_FLOAT_VECTOR_TYPES(DO) \
  1938. DO(F32x4, float, 4) \
  1939. DO(F32x8, float, 8)
  1940. #define FOR_INTEGER_VECTOR_TYPES(DO) \
  1941. DO(I32x4, int32_t, 4) \
  1942. DO(U32x4, uint32_t, 4) \
  1943. DO(U16x8, uint16_t, 8) \
  1944. DO(U8x16, uint8_t, 16) \
  1945. DO(I32x8, int32_t, 8) \
  1946. DO(U32x8, uint32_t, 8) \
  1947. DO(U16x16, uint16_t, 16) \
  1948. DO(U8x32, uint8_t, 32)
  1949. #define FOR_SIGNED_VECTOR_TYPES(DO) \
  1950. DO(F32x4, float, 4) \
  1951. DO(I32x4, int32_t, 4) \
  1952. DO(F32x8, float, 8) \
  1953. DO(I32x8, int32_t, 8)
  1954. #define FOR_UNSIGNED_VECTOR_TYPES(DO) \
  1955. DO(U32x4, uint32_t, 4) \
  1956. DO(U16x8, uint16_t, 8) \
  1957. DO(U8x16, uint8_t, 16) \
  1958. DO(U32x8, uint32_t, 8) \
  1959. DO(U16x16, uint16_t, 16) \
  1960. DO(U8x32, uint8_t, 32)
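// Minimal sketch of how these X-macros are meant to be used (the CREATE_LANE_COUNT
// macro below is hypothetical and only shown as an example):
//   #define CREATE_LANE_COUNT(VECTOR_TYPE, ELEMENT_TYPE, LANE_COUNT) \
//       inline int laneCountOf(const VECTOR_TYPE&) { return LANE_COUNT; }
//   FOR_ALL_VECTOR_TYPES(CREATE_LANE_COUNT)
//   #undef CREATE_LANE_COUNT
// Each DO(vector_type, element_type, lane_count) call expands once per listed type,
// which is how the printing and comparison helpers below are generated.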
  1961. // Print SIMD vectors to the terminal or append them to strings.
  1962. #define CREATE_METHOD_PRINT(VECTOR_TYPE, ELEMENT_TYPE, LANE_COUNT) \
  1963. inline dsr::String& string_toStreamIndented(dsr::String& target, const VECTOR_TYPE& source, const dsr::ReadableString& indentation) { \
  1964. ALIGN_BYTES(sizeof(VECTOR_TYPE)) ELEMENT_TYPE a[LANE_COUNT]; \
  1965. source.writeAlignedUnsafe(a); \
  1966. dsr::string_append(target, indentation, a[0]); \
  1967. for (int i = 1; i < LANE_COUNT; i++) { \
  1968. string_append(target, U", ", a[i]); \
  1969. } \
  1970. return target; \
  1971. }
  1972. // All SIMD vectors can be printed.
  1973. FOR_ALL_VECTOR_TYPES(CREATE_METHOD_PRINT)
  1974. #undef CREATE_METHOD_PRINT
  1975. // Integer equality returns true iff all comparisons are identical.
  1976. #define CREATE_EXACT_EQUALITY(VECTOR_TYPE, ELEMENT_TYPE, LANE_COUNT) \
  1977. inline bool allLanesEqual(const VECTOR_TYPE& left, const VECTOR_TYPE& right) { \
  1978. ALIGN_BYTES(sizeof(VECTOR_TYPE)) ELEMENT_TYPE a[LANE_COUNT]; \
  1979. ALIGN_BYTES(sizeof(VECTOR_TYPE)) ELEMENT_TYPE b[LANE_COUNT]; \
  1980. left.writeAlignedUnsafe(a); \
  1981. right.writeAlignedUnsafe(b); \
  1982. for (int i = 0; i < LANE_COUNT; i++) { \
  1983. if (a[i] != b[i]) return false; \
  1984. } \
  1985. return true; \
  1986. } \
  1987. inline bool allLanesNotEqual(const VECTOR_TYPE& left, const VECTOR_TYPE& right) { \
  1988. ALIGN_BYTES(sizeof(VECTOR_TYPE)) ELEMENT_TYPE a[LANE_COUNT]; \
  1989. ALIGN_BYTES(sizeof(VECTOR_TYPE)) ELEMENT_TYPE b[LANE_COUNT]; \
  1990. left.writeAlignedUnsafe(a); \
  1991. right.writeAlignedUnsafe(b); \
  1992. for (int i = 0; i < LANE_COUNT; i++) { \
  1993. if (a[i] == b[i]) return false; \
  1994. } \
  1995. return true; \
  1996. }
// Integer SIMD vectors have exact equality.
  1998. FOR_INTEGER_VECTOR_TYPES(CREATE_EXACT_EQUALITY)
  1999. #undef CREATE_EXACT_EQUALITY
  2000. // Float equality returns true iff all comparisons are near.
  2001. #define CREATE_TOLERANT_EQUALITY(VECTOR_TYPE, ELEMENT_TYPE, LANE_COUNT) \
  2002. inline bool allLanesEqual(const VECTOR_TYPE& left, const VECTOR_TYPE& right) { \
  2003. ALIGN_BYTES(sizeof(VECTOR_TYPE)) ELEMENT_TYPE a[LANE_COUNT]; \
  2004. ALIGN_BYTES(sizeof(VECTOR_TYPE)) ELEMENT_TYPE b[LANE_COUNT]; \
  2005. left.writeAlignedUnsafe(a); \
  2006. right.writeAlignedUnsafe(b); \
  2007. for (int i = 0; i < LANE_COUNT; i++) { \
  2008. if (fabs(a[i] - b[i]) >= 0.0001f) return false; \
  2009. } \
  2010. return true; \
  2011. } \
  2012. inline bool allLanesNotEqual(const VECTOR_TYPE& left, const VECTOR_TYPE& right) { \
  2013. ALIGN_BYTES(sizeof(VECTOR_TYPE)) ELEMENT_TYPE a[LANE_COUNT]; \
  2014. ALIGN_BYTES(sizeof(VECTOR_TYPE)) ELEMENT_TYPE b[LANE_COUNT]; \
  2015. left.writeAlignedUnsafe(a); \
  2016. right.writeAlignedUnsafe(b); \
  2017. for (int i = 0; i < LANE_COUNT; i++) { \
  2018. if (fabs(a[i] - b[i]) < 0.0001f) return false; \
  2019. } \
  2020. return true; \
  2021. }
  2022. // Float SIMD vectors have inexact equality.
  2023. FOR_FLOAT_VECTOR_TYPES(CREATE_TOLERANT_EQUALITY)
  2024. #undef CREATE_TOLERANT_EQUALITY
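// Note that allLanesNotEqual is not the negation of allLanesEqual: both require their
// condition to hold in every lane, so a vector pair that matches in some lanes and
// differs in others makes both functions return false. Illustrative sketch:
//   allLanesEqual(F32x4(1.0f), F32x4(1.0f));                       // true
//   allLanesNotEqual(F32x4(1.0f), F32x4(2.0f));                    // true
//   allLanesEqual(F32x4(1.0f, 1.0f, 1.0f, 2.0f), F32x4(1.0f));     // false
//   allLanesNotEqual(F32x4(1.0f, 1.0f, 1.0f, 2.0f), F32x4(1.0f));  // false
// The float versions treat lanes that differ by less than 0.0001 as equal.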
  2025. #define CREATE_COMPARISONS(VECTOR_TYPE, ELEMENT_TYPE, LANE_COUNT) \
  2026. inline bool allLanesGreater(const VECTOR_TYPE& left, const VECTOR_TYPE& right) { \
  2027. ALIGN_BYTES(sizeof(VECTOR_TYPE)) ELEMENT_TYPE a[LANE_COUNT]; \
  2028. ALIGN_BYTES(sizeof(VECTOR_TYPE)) ELEMENT_TYPE b[LANE_COUNT]; \
  2029. left.writeAlignedUnsafe(a); \
  2030. right.writeAlignedUnsafe(b); \
  2031. for (int i = 0; i < LANE_COUNT; i++) { \
  2032. if (a[i] <= b[i]) return false; \
  2033. } \
  2034. return true; \
  2035. } \
  2036. inline bool allLanesGreaterOrEqual(const VECTOR_TYPE& left, const VECTOR_TYPE& right) { \
  2037. ALIGN_BYTES(sizeof(VECTOR_TYPE)) ELEMENT_TYPE a[LANE_COUNT]; \
  2038. ALIGN_BYTES(sizeof(VECTOR_TYPE)) ELEMENT_TYPE b[LANE_COUNT]; \
  2039. left.writeAlignedUnsafe(a); \
  2040. right.writeAlignedUnsafe(b); \
  2041. for (int i = 0; i < LANE_COUNT; i++) { \
  2042. if (a[i] < b[i]) return false; \
  2043. } \
  2044. return true; \
  2045. } \
  2046. inline bool allLanesLesser(const VECTOR_TYPE& left, const VECTOR_TYPE& right) { \
  2047. ALIGN_BYTES(sizeof(VECTOR_TYPE)) ELEMENT_TYPE a[LANE_COUNT]; \
  2048. ALIGN_BYTES(sizeof(VECTOR_TYPE)) ELEMENT_TYPE b[LANE_COUNT]; \
  2049. left.writeAlignedUnsafe(a); \
  2050. right.writeAlignedUnsafe(b); \
  2051. for (int i = 0; i < LANE_COUNT; i++) { \
  2052. if (a[i] >= b[i]) return false; \
  2053. } \
  2054. return true; \
  2055. } \
  2056. inline bool allLanesLesserOrEqual(const VECTOR_TYPE& left, const VECTOR_TYPE& right) { \
  2057. ALIGN_BYTES(sizeof(VECTOR_TYPE)) ELEMENT_TYPE a[LANE_COUNT]; \
  2058. ALIGN_BYTES(sizeof(VECTOR_TYPE)) ELEMENT_TYPE b[LANE_COUNT]; \
  2059. left.writeAlignedUnsafe(a); \
  2060. right.writeAlignedUnsafe(b); \
  2061. for (int i = 0; i < LANE_COUNT; i++) { \
  2062. if (a[i] > b[i]) return false; \
  2063. } \
  2064. return true; \
  2065. }
  2066. FOR_ALL_VECTOR_TYPES(CREATE_COMPARISONS)
  2067. #undef CREATE_COMPARISONS
  2068. inline F32x4 operator+(const F32x4& left, const F32x4& right) {
  2069. #if defined USE_BASIC_SIMD
  2070. return F32x4(ADD_F32_SIMD(left.v, right.v));
  2071. #else
  2072. return F32x4(left.scalars[0] + right.scalars[0], left.scalars[1] + right.scalars[1], left.scalars[2] + right.scalars[2], left.scalars[3] + right.scalars[3]);
  2073. #endif
  2074. }
  2075. inline F32x4 operator-(const F32x4& left, const F32x4& right) {
  2076. #if defined USE_BASIC_SIMD
  2077. return F32x4(SUB_F32_SIMD(left.v, right.v));
  2078. #else
  2079. return F32x4(left.scalars[0] - right.scalars[0], left.scalars[1] - right.scalars[1], left.scalars[2] - right.scalars[2], left.scalars[3] - right.scalars[3]);
  2080. #endif
  2081. }
  2082. inline F32x4 operator*(const F32x4& left, const F32x4& right) {
  2083. #if defined USE_BASIC_SIMD
  2084. return F32x4(MUL_F32_SIMD(left.v, right.v));
  2085. #else
  2086. return F32x4(left.scalars[0] * right.scalars[0], left.scalars[1] * right.scalars[1], left.scalars[2] * right.scalars[2], left.scalars[3] * right.scalars[3]);
  2087. #endif
  2088. }
  2089. inline F32x4 min(const F32x4& left, const F32x4& right) {
  2090. #if defined USE_BASIC_SIMD
  2091. return F32x4(MIN_F32_SIMD(left.v, right.v));
  2092. #else
  2093. float v0 = left.scalars[0];
  2094. float v1 = left.scalars[1];
  2095. float v2 = left.scalars[2];
  2096. float v3 = left.scalars[3];
  2097. float r0 = right.scalars[0];
  2098. float r1 = right.scalars[1];
  2099. float r2 = right.scalars[2];
  2100. float r3 = right.scalars[3];
  2101. if (r0 < v0) { v0 = r0; }
  2102. if (r1 < v1) { v1 = r1; }
  2103. if (r2 < v2) { v2 = r2; }
  2104. if (r3 < v3) { v3 = r3; }
  2105. return F32x4(v0, v1, v2, v3);
  2106. #endif
  2107. }
  2108. inline F32x4 max(const F32x4& left, const F32x4& right) {
  2109. #if defined USE_BASIC_SIMD
  2110. return F32x4(MAX_F32_SIMD(left.v, right.v));
  2111. #else
  2112. float v0 = left.scalars[0];
  2113. float v1 = left.scalars[1];
  2114. float v2 = left.scalars[2];
  2115. float v3 = left.scalars[3];
  2116. float r0 = right.scalars[0];
  2117. float r1 = right.scalars[1];
  2118. float r2 = right.scalars[2];
  2119. float r3 = right.scalars[3];
  2120. if (r0 > v0) { v0 = r0; }
  2121. if (r1 > v1) { v1 = r1; }
  2122. if (r2 > v2) { v2 = r2; }
  2123. if (r3 > v3) { v3 = r3; }
  2124. return F32x4(v0, v1, v2, v3);
  2125. #endif
  2126. }
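// min and max compose into a clamp between two vectors. A minimal sketch, assuming
// lower <= upper in every lane (the helper name is only an example):
//   inline F32x4 clampBetween(const F32x4& value, const F32x4& lower, const F32x4& upper) {
//       return min(max(value, lower), upper);
//   }
// This is similar to the clamp(minimum, maximum) methods above, but with per-lane bounds.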
  2127. inline I32x4 operator+(const I32x4& left, const I32x4& right) {
  2128. #if defined USE_BASIC_SIMD
  2129. return I32x4(ADD_I32_SIMD(left.v, right.v));
  2130. #else
  2131. return I32x4(left.scalars[0] + right.scalars[0], left.scalars[1] + right.scalars[1], left.scalars[2] + right.scalars[2], left.scalars[3] + right.scalars[3]);
  2132. #endif
  2133. }
  2134. inline I32x4 operator-(const I32x4& left, const I32x4& right) {
  2135. #if defined USE_BASIC_SIMD
  2136. return I32x4(SUB_I32_SIMD(left.v, right.v));
  2137. #else
  2138. return I32x4(left.scalars[0] - right.scalars[0], left.scalars[1] - right.scalars[1], left.scalars[2] - right.scalars[2], left.scalars[3] - right.scalars[3]);
  2139. #endif
  2140. }
  2141. inline I32x4 operator*(const I32x4& left, const I32x4& right) {
  2142. #if defined USE_BASIC_SIMD
  2143. #if defined USE_SSE2
// TODO: Use _mm_mullo_epi32 for 32-bit integer multiplication when SSE4.1 or later is available.
  2145. return I32x4(left.scalars[0] * right.scalars[0], left.scalars[1] * right.scalars[1], left.scalars[2] * right.scalars[2], left.scalars[3] * right.scalars[3]);
  2146. #elif defined USE_NEON
  2147. return I32x4(MUL_I32_NEON(left.v, right.v));
  2148. #endif
  2149. #else
  2150. return I32x4(left.scalars[0] * right.scalars[0], left.scalars[1] * right.scalars[1], left.scalars[2] * right.scalars[2], left.scalars[3] * right.scalars[3]);
  2151. #endif
  2152. }
  2153. // TODO: Specify the behavior of truncated unsigned integer overflow and add it to the tests.
  2154. inline U32x4 operator+(const U32x4& left, const U32x4& right) {
  2155. #if defined USE_BASIC_SIMD
  2156. return U32x4(ADD_U32_SIMD(left.v, right.v));
  2157. #else
  2158. return U32x4(left.scalars[0] + right.scalars[0], left.scalars[1] + right.scalars[1], left.scalars[2] + right.scalars[2], left.scalars[3] + right.scalars[3]);
  2159. #endif
  2160. }
  2161. inline U32x4 operator-(const U32x4& left, const U32x4& right) {
  2162. #if defined USE_BASIC_SIMD
  2163. return U32x4(SUB_U32_SIMD(left.v, right.v));
  2164. #else
  2165. return U32x4(left.scalars[0] - right.scalars[0], left.scalars[1] - right.scalars[1], left.scalars[2] - right.scalars[2], left.scalars[3] - right.scalars[3]);
  2166. #endif
  2167. }
  2168. inline U32x4 operator*(const U32x4& left, const U32x4& right) {
  2169. #if defined USE_BASIC_SIMD
  2170. #if defined USE_SSE2
// SSE2 has no instruction for multiplying four 32-bit lanes, so fall back on scalar multiplication.
  2172. return U32x4(left.scalars[0] * right.scalars[0], left.scalars[1] * right.scalars[1], left.scalars[2] * right.scalars[2], left.scalars[3] * right.scalars[3]);
  2173. #else // NEON
  2174. return U32x4(MUL_U32_NEON(left.v, right.v));
  2175. #endif
  2176. #else
  2177. return U32x4(left.scalars[0] * right.scalars[0], left.scalars[1] * right.scalars[1], left.scalars[2] * right.scalars[2], left.scalars[3] * right.scalars[3]);
  2178. #endif
  2179. }
  2180. // Bitwise and
  2181. inline U32x4 operator&(const U32x4& left, const U32x4& right) {
  2182. #if defined USE_BASIC_SIMD
  2183. return U32x4(BITWISE_AND_U32_SIMD(left.v, right.v));
  2184. #else
  2185. return U32x4(left.scalars[0] & right.scalars[0], left.scalars[1] & right.scalars[1], left.scalars[2] & right.scalars[2], left.scalars[3] & right.scalars[3]);
  2186. #endif
  2187. }
  2188. // Bitwise or
  2189. inline U32x4 operator|(const U32x4& left, const U32x4& right) {
  2190. #if defined USE_BASIC_SIMD
  2191. return U32x4(BITWISE_OR_U32_SIMD(left.v, right.v));
  2192. #else
  2193. return U32x4(left.scalars[0] | right.scalars[0], left.scalars[1] | right.scalars[1], left.scalars[2] | right.scalars[2], left.scalars[3] | right.scalars[3]);
  2194. #endif
  2195. }
  2196. // Bitwise xor
  2197. inline U32x4 operator^(const U32x4& left, const U32x4& right) {
  2198. #if defined USE_BASIC_SIMD
  2199. return U32x4(BITWISE_XOR_U32_SIMD(left.v, right.v));
  2200. #else
  2201. return U32x4(left.scalars[0] ^ right.scalars[0], left.scalars[1] ^ right.scalars[1], left.scalars[2] ^ right.scalars[2], left.scalars[3] ^ right.scalars[3]);
  2202. #endif
  2203. }
  2204. // Bitwise negation
  2205. inline U32x4 operator~(const U32x4& value) {
  2206. #if defined USE_NEON
  2207. return U32x4(vmvnq_u32(value.v));
  2208. #elif defined USE_BASIC_SIMD
  2209. // Fall back on xor against all ones.
  2210. return value ^ U32x4(~uint32_t(0));
  2211. #else
  2212. return U32x4(~value.scalars[0], ~value.scalars[1], ~value.scalars[2], ~value.scalars[3]);
  2213. #endif
  2214. }
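// Usage sketch for the bitwise operators above (illustrative only; the values are made up):
//   U32x4 pixels(0x11223344u);                    // the same 32-bit pattern in all four lanes
//   U32x4 lowByte  = pixels & U32x4(0xFFu);       // keep only the lowest byte of each lane
//   U32x4 withFlag = pixels | U32x4(0x80000000u); // set the highest bit of each lane
//   U32x4 toggled  = pixels ^ U32x4(0xFFFFFFFFu); // flip every bit, same result as ~pixels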
  2215. inline U32x4 operator<<(const U32x4& left, const U32x4 &bitOffsets) {
  2216. assert(allLanesLesser(bitOffsets, U32x4(32u)));
  2217. #if defined USE_NEON
  2218. return U32x4(vshlq_u32(left.v, vreinterpretq_s32_u32(bitOffsets.v)));
  2219. #else
  2220. return U32x4(
  2221. left.scalars[0] << bitOffsets.scalars[0],
  2222. left.scalars[1] << bitOffsets.scalars[1],
  2223. left.scalars[2] << bitOffsets.scalars[2],
  2224. left.scalars[3] << bitOffsets.scalars[3]
  2225. );
  2226. #endif
  2227. }
  2228. inline U32x4 operator>>(const U32x4& left, const U32x4 &bitOffsets) {
  2229. assert(allLanesLesser(bitOffsets, U32x4(32u)));
  2230. #if defined USE_NEON
2231. return U32x4(vshlq_u32(left.v, vnegq_s32(vreinterpretq_s32_u32(bitOffsets.v)))); // NEON has no variable right shift, so shift left by negated offsets.
  2232. #else
  2233. return U32x4(
  2234. left.scalars[0] >> bitOffsets.scalars[0],
  2235. left.scalars[1] >> bitOffsets.scalars[1],
  2236. left.scalars[2] >> bitOffsets.scalars[2],
  2237. left.scalars[3] >> bitOffsets.scalars[3]
  2238. );
  2239. #endif
  2240. }
  2241. // bitOffset must be an immediate constant, so a template argument is used.
  2242. template <uint32_t bitOffset>
  2243. inline U32x4 bitShiftLeftImmediate(const U32x4& left) {
  2244. static_assert(bitOffset < 32u);
  2245. #if defined USE_SSE2
  2246. return U32x4(_mm_slli_epi32(left.v, bitOffset));
  2247. #else
  2248. #if defined USE_NEON
  2249. return U32x4(vshlq_u32(left.v, LOAD_SCALAR_I32_SIMD(bitOffset)));
  2250. #else
  2251. return U32x4(left.scalars[0] << bitOffset, left.scalars[1] << bitOffset, left.scalars[2] << bitOffset, left.scalars[3] << bitOffset);
  2252. #endif
  2253. #endif
  2254. }
  2255. // bitOffset must be an immediate constant.
  2256. template <uint32_t bitOffset>
  2257. inline U32x4 bitShiftRightImmediate(const U32x4& left) {
  2258. static_assert(bitOffset < 32u);
  2259. #if defined USE_SSE2
  2260. return U32x4(_mm_srli_epi32(left.v, bitOffset));
  2261. #else
  2262. #if defined USE_NEON
2263. return U32x4(vshlq_u32(left.v, LOAD_SCALAR_I32_SIMD(-int32_t(bitOffset)))); // NEON has no variable right shift, so shift left by a negative amount.
  2264. #else
  2265. return U32x4(left.scalars[0] >> bitOffset, left.scalars[1] >> bitOffset, left.scalars[2] >> bitOffset, left.scalars[3] >> bitOffset);
  2266. #endif
  2267. #endif
  2268. }
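// Usage sketch contrasting the two shift styles above (illustrative only):
//   U32x4 v(0x000000F0u);
//   U32x4 a = v << U32x4(4u);              // per-lane offsets taken from a register at runtime
//   U32x4 b = bitShiftLeftImmediate<4>(v); // offset known at compile time, so an immediate shift instruction can be used
// Both give 0x00000F00 in every lane; prefer the immediate form when the offset is a compile-time constant.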
  2269. inline U16x8 operator<<(const U16x8& left, const U16x8 &bitOffsets) {
  2270. assert(allLanesLesser(bitOffsets, U16x8(16u)));
  2271. #if defined USE_NEON
  2272. return U16x8(vshlq_u16(left.v, vreinterpretq_s16_u16(bitOffsets.v)));
  2273. #else
  2274. return U16x8(
  2275. left.scalars[0] << bitOffsets.scalars[0],
  2276. left.scalars[1] << bitOffsets.scalars[1],
  2277. left.scalars[2] << bitOffsets.scalars[2],
  2278. left.scalars[3] << bitOffsets.scalars[3],
  2279. left.scalars[4] << bitOffsets.scalars[4],
  2280. left.scalars[5] << bitOffsets.scalars[5],
  2281. left.scalars[6] << bitOffsets.scalars[6],
  2282. left.scalars[7] << bitOffsets.scalars[7]
  2283. );
  2284. #endif
  2285. }
  2286. inline U16x8 operator>>(const U16x8& left, const U16x8 &bitOffsets) {
  2287. assert(allLanesLesser(bitOffsets, U16x8(16u)));
  2288. #if defined USE_NEON
2289. return U16x8(vshlq_u16(left.v, vnegq_s16(vreinterpretq_s16_u16(bitOffsets.v)))); // NEON has no variable right shift, so shift left by negated offsets.
  2290. #else
  2291. return U16x8(
  2292. left.scalars[0] >> bitOffsets.scalars[0],
  2293. left.scalars[1] >> bitOffsets.scalars[1],
  2294. left.scalars[2] >> bitOffsets.scalars[2],
  2295. left.scalars[3] >> bitOffsets.scalars[3],
  2296. left.scalars[4] >> bitOffsets.scalars[4],
  2297. left.scalars[5] >> bitOffsets.scalars[5],
  2298. left.scalars[6] >> bitOffsets.scalars[6],
  2299. left.scalars[7] >> bitOffsets.scalars[7]
  2300. );
  2301. #endif
  2302. }
  2303. // bitOffset must be an immediate constant, so a template argument is used.
  2304. template <uint32_t bitOffset>
  2305. inline U16x8 bitShiftLeftImmediate(const U16x8& left) {
  2306. static_assert(bitOffset < 16u);
  2307. #if defined USE_SSE2
  2308. return U16x8(_mm_slli_epi16(left.v, bitOffset));
  2309. #else
  2310. #if defined USE_NEON
2311. return U16x8(vshlq_u16(left.v, vdupq_n_s16(int16_t(bitOffset)))); // 16-bit lanes need the u16 shift intrinsic.
  2312. #else
  2313. return U16x8(
  2314. left.scalars[0] << bitOffset,
  2315. left.scalars[1] << bitOffset,
  2316. left.scalars[2] << bitOffset,
  2317. left.scalars[3] << bitOffset,
  2318. left.scalars[4] << bitOffset,
  2319. left.scalars[5] << bitOffset,
  2320. left.scalars[6] << bitOffset,
  2321. left.scalars[7] << bitOffset
  2322. );
  2323. #endif
  2324. #endif
  2325. }
  2326. // bitOffset must be an immediate constant.
  2327. template <uint32_t bitOffset>
  2328. inline U16x8 bitShiftRightImmediate(const U16x8& left) {
  2329. static_assert(bitOffset < 16u);
  2330. #if defined USE_SSE2
  2331. return U16x8(_mm_srli_epi16(left.v, bitOffset));
  2332. #else
  2333. #if defined USE_NEON
2334. return U16x8(vshlq_u16(left.v, vdupq_n_s16(-int16_t(bitOffset)))); // NEON has no variable right shift, so shift left by a negative amount.
  2335. #else
  2336. return U16x8(
  2337. left.scalars[0] >> bitOffset,
  2338. left.scalars[1] >> bitOffset,
  2339. left.scalars[2] >> bitOffset,
  2340. left.scalars[3] >> bitOffset,
  2341. left.scalars[4] >> bitOffset,
  2342. left.scalars[5] >> bitOffset,
  2343. left.scalars[6] >> bitOffset,
  2344. left.scalars[7] >> bitOffset
  2345. );
  2346. #endif
  2347. #endif
  2348. }
  2349. inline U8x16 operator<<(const U8x16& left, const U8x16 &bitOffsets) {
  2350. assert(allLanesLesser(bitOffsets, U8x16(8u)));
  2351. #if defined USE_NEON
2352. return U8x16(vshlq_u8(left.v, vreinterpretq_s8_u8(bitOffsets.v))); // 8-bit lanes need the u8 shift intrinsic.
  2353. #else
  2354. return U8x16(
  2355. left.scalars[ 0] << bitOffsets.scalars[ 0],
  2356. left.scalars[ 1] << bitOffsets.scalars[ 1],
  2357. left.scalars[ 2] << bitOffsets.scalars[ 2],
  2358. left.scalars[ 3] << bitOffsets.scalars[ 3],
  2359. left.scalars[ 4] << bitOffsets.scalars[ 4],
  2360. left.scalars[ 5] << bitOffsets.scalars[ 5],
  2361. left.scalars[ 6] << bitOffsets.scalars[ 6],
  2362. left.scalars[ 7] << bitOffsets.scalars[ 7],
  2363. left.scalars[ 8] << bitOffsets.scalars[ 8],
  2364. left.scalars[ 9] << bitOffsets.scalars[ 9],
  2365. left.scalars[10] << bitOffsets.scalars[10],
  2366. left.scalars[11] << bitOffsets.scalars[11],
  2367. left.scalars[12] << bitOffsets.scalars[12],
  2368. left.scalars[13] << bitOffsets.scalars[13],
  2369. left.scalars[14] << bitOffsets.scalars[14],
  2370. left.scalars[15] << bitOffsets.scalars[15]
  2371. );
  2372. #endif
  2373. }
  2374. inline U8x16 operator>>(const U8x16& left, const U8x16 &bitOffsets) {
  2375. assert(allLanesLesser(bitOffsets, U8x16(8u)));
  2376. #if defined USE_NEON
2377. return U8x16(vshlq_u8(left.v, vnegq_s8(vreinterpretq_s8_u8(bitOffsets.v)))); // NEON has no variable right shift, so shift left by negated offsets.
  2378. #else
  2379. return U8x16(
  2380. left.scalars[ 0] >> bitOffsets.scalars[ 0],
  2381. left.scalars[ 1] >> bitOffsets.scalars[ 1],
  2382. left.scalars[ 2] >> bitOffsets.scalars[ 2],
  2383. left.scalars[ 3] >> bitOffsets.scalars[ 3],
  2384. left.scalars[ 4] >> bitOffsets.scalars[ 4],
  2385. left.scalars[ 5] >> bitOffsets.scalars[ 5],
  2386. left.scalars[ 6] >> bitOffsets.scalars[ 6],
  2387. left.scalars[ 7] >> bitOffsets.scalars[ 7],
  2388. left.scalars[ 8] >> bitOffsets.scalars[ 8],
  2389. left.scalars[ 9] >> bitOffsets.scalars[ 9],
  2390. left.scalars[10] >> bitOffsets.scalars[10],
  2391. left.scalars[11] >> bitOffsets.scalars[11],
  2392. left.scalars[12] >> bitOffsets.scalars[12],
  2393. left.scalars[13] >> bitOffsets.scalars[13],
  2394. left.scalars[14] >> bitOffsets.scalars[14],
  2395. left.scalars[15] >> bitOffsets.scalars[15]
  2396. );
  2397. #endif
  2398. }
  2399. // bitOffset must be an immediate constant, so a template argument is used.
  2400. template <uint32_t bitOffset>
  2401. inline U8x16 bitShiftLeftImmediate(const U8x16& left) {
  2402. static_assert(bitOffset < 8u);
  2403. #if defined USE_SSE2
2404. return U8x16(_mm_and_si128(_mm_slli_epi16(left.v, bitOffset), _mm_set1_epi8((char)uint8_t(0xFFu << bitOffset)))); // SSE2 has no 8-bit shift; mask out bits that leaked in from the byte below.
  2405. #elif defined USE_NEON
2406. return U8x16(vshlq_u8(left.v, vdupq_n_s8(int8_t(bitOffset)))); // 8-bit lanes need the u8 shift intrinsic.
  2407. #else
  2408. return U8x16(
  2409. left.scalars[ 0] << bitOffset,
  2410. left.scalars[ 1] << bitOffset,
  2411. left.scalars[ 2] << bitOffset,
  2412. left.scalars[ 3] << bitOffset,
  2413. left.scalars[ 4] << bitOffset,
  2414. left.scalars[ 5] << bitOffset,
  2415. left.scalars[ 6] << bitOffset,
  2416. left.scalars[ 7] << bitOffset,
  2417. left.scalars[ 8] << bitOffset,
  2418. left.scalars[ 9] << bitOffset,
  2419. left.scalars[10] << bitOffset,
  2420. left.scalars[11] << bitOffset,
  2421. left.scalars[12] << bitOffset,
  2422. left.scalars[13] << bitOffset,
  2423. left.scalars[14] << bitOffset,
  2424. left.scalars[15] << bitOffset
  2425. );
  2426. #endif
  2427. }
  2428. // bitOffset must be an immediate constant.
  2429. template <uint32_t bitOffset>
  2430. inline U8x16 bitShiftRightImmediate(const U8x16& left) {
  2431. static_assert(bitOffset < 8u);
  2432. #if defined USE_SSE2
2433. return U8x16(_mm_and_si128(_mm_srli_epi16(left.v, bitOffset), _mm_set1_epi8((char)uint8_t(0xFFu >> bitOffset)))); // SSE2 has no 8-bit shift; mask out bits that leaked in from the byte above.
  2434. #elif defined USE_NEON
2435. return U8x16(vshlq_u8(left.v, vdupq_n_s8(-int8_t(bitOffset)))); // NEON has no variable right shift, so shift left by a negative amount.
  2436. #else
  2437. return U8x16(
  2438. left.scalars[ 0] >> bitOffset,
  2439. left.scalars[ 1] >> bitOffset,
  2440. left.scalars[ 2] >> bitOffset,
  2441. left.scalars[ 3] >> bitOffset,
  2442. left.scalars[ 4] >> bitOffset,
  2443. left.scalars[ 5] >> bitOffset,
  2444. left.scalars[ 6] >> bitOffset,
  2445. left.scalars[ 7] >> bitOffset,
  2446. left.scalars[ 8] >> bitOffset,
  2447. left.scalars[ 9] >> bitOffset,
  2448. left.scalars[10] >> bitOffset,
  2449. left.scalars[11] >> bitOffset,
  2450. left.scalars[12] >> bitOffset,
  2451. left.scalars[13] >> bitOffset,
  2452. left.scalars[14] >> bitOffset,
  2453. left.scalars[15] >> bitOffset
  2454. );
  2455. #endif
  2456. }
  2457. inline U16x8 operator+(const U16x8& left, const U16x8& right) {
  2458. #if defined USE_BASIC_SIMD
  2459. return U16x8(ADD_U16_SIMD(left.v, right.v));
  2460. #else
  2461. return U16x8(left.scalars[0] + right.scalars[0], left.scalars[1] + right.scalars[1], left.scalars[2] + right.scalars[2], left.scalars[3] + right.scalars[3],
  2462. left.scalars[4] + right.scalars[4], left.scalars[5] + right.scalars[5], left.scalars[6] + right.scalars[6], left.scalars[7] + right.scalars[7]);
  2463. #endif
  2464. }
  2465. inline U16x8 operator-(const U16x8& left, const U16x8& right) {
  2466. #if defined USE_BASIC_SIMD
  2467. return U16x8(SUB_U16_SIMD(left.v, right.v));
  2468. #else
  2469. return U16x8(left.scalars[0] - right.scalars[0], left.scalars[1] - right.scalars[1], left.scalars[2] - right.scalars[2], left.scalars[3] - right.scalars[3],
  2470. left.scalars[4] - right.scalars[4], left.scalars[5] - right.scalars[5], left.scalars[6] - right.scalars[6], left.scalars[7] - right.scalars[7]);
  2471. #endif
  2472. }
  2473. inline U16x8 operator*(const U16x8& left, const U16x8& right) {
  2474. #if defined USE_BASIC_SIMD
  2475. return U16x8(MUL_U16_SIMD(left.v, right.v));
  2476. #else
  2477. return U16x8(left.scalars[0] * right.scalars[0], left.scalars[1] * right.scalars[1], left.scalars[2] * right.scalars[2], left.scalars[3] * right.scalars[3],
  2478. left.scalars[4] * right.scalars[4], left.scalars[5] * right.scalars[5], left.scalars[6] * right.scalars[6], left.scalars[7] * right.scalars[7]);
  2479. #endif
  2480. }
  2481. inline U8x16 operator+(const U8x16& left, const U8x16& right) {
  2482. #if defined USE_BASIC_SIMD
  2483. return U8x16(ADD_U8_SIMD(left.v, right.v));
  2484. #else
  2485. return U8x16(
  2486. left.scalars[0] + right.scalars[0],
  2487. left.scalars[1] + right.scalars[1],
  2488. left.scalars[2] + right.scalars[2],
  2489. left.scalars[3] + right.scalars[3],
  2490. left.scalars[4] + right.scalars[4],
  2491. left.scalars[5] + right.scalars[5],
  2492. left.scalars[6] + right.scalars[6],
  2493. left.scalars[7] + right.scalars[7],
  2494. left.scalars[8] + right.scalars[8],
  2495. left.scalars[9] + right.scalars[9],
  2496. left.scalars[10] + right.scalars[10],
  2497. left.scalars[11] + right.scalars[11],
  2498. left.scalars[12] + right.scalars[12],
  2499. left.scalars[13] + right.scalars[13],
  2500. left.scalars[14] + right.scalars[14],
  2501. left.scalars[15] + right.scalars[15]
  2502. );
  2503. #endif
  2504. }
  2505. inline U8x16 operator-(const U8x16& left, const U8x16& right) {
  2506. #if defined USE_BASIC_SIMD
  2507. return U8x16(SUB_U8_SIMD(left.v, right.v));
  2508. #else
  2509. return U8x16(
  2510. left.scalars[0] - right.scalars[0],
  2511. left.scalars[1] - right.scalars[1],
  2512. left.scalars[2] - right.scalars[2],
  2513. left.scalars[3] - right.scalars[3],
  2514. left.scalars[4] - right.scalars[4],
  2515. left.scalars[5] - right.scalars[5],
  2516. left.scalars[6] - right.scalars[6],
  2517. left.scalars[7] - right.scalars[7],
  2518. left.scalars[8] - right.scalars[8],
  2519. left.scalars[9] - right.scalars[9],
  2520. left.scalars[10] - right.scalars[10],
  2521. left.scalars[11] - right.scalars[11],
  2522. left.scalars[12] - right.scalars[12],
  2523. left.scalars[13] - right.scalars[13],
  2524. left.scalars[14] - right.scalars[14],
  2525. left.scalars[15] - right.scalars[15]
  2526. );
  2527. #endif
  2528. }
  2529. inline uint8_t impl_limit0(int32_t x) { return x < 0 ? 0 : x; }
  2530. inline uint8_t impl_limit255(uint32_t x) { return x > 255 ? 255 : x; }
  2531. inline U8x16 saturatedAddition(const U8x16& left, const U8x16& right) {
  2532. #if defined USE_BASIC_SIMD
  2533. return U8x16(ADD_SAT_U8_SIMD(left.v, right.v));
  2534. #else
  2535. return U8x16(
  2536. impl_limit255((uint32_t)left.scalars[0] + (uint32_t)right.scalars[0]),
  2537. impl_limit255((uint32_t)left.scalars[1] + (uint32_t)right.scalars[1]),
  2538. impl_limit255((uint32_t)left.scalars[2] + (uint32_t)right.scalars[2]),
  2539. impl_limit255((uint32_t)left.scalars[3] + (uint32_t)right.scalars[3]),
  2540. impl_limit255((uint32_t)left.scalars[4] + (uint32_t)right.scalars[4]),
  2541. impl_limit255((uint32_t)left.scalars[5] + (uint32_t)right.scalars[5]),
  2542. impl_limit255((uint32_t)left.scalars[6] + (uint32_t)right.scalars[6]),
  2543. impl_limit255((uint32_t)left.scalars[7] + (uint32_t)right.scalars[7]),
  2544. impl_limit255((uint32_t)left.scalars[8] + (uint32_t)right.scalars[8]),
  2545. impl_limit255((uint32_t)left.scalars[9] + (uint32_t)right.scalars[9]),
  2546. impl_limit255((uint32_t)left.scalars[10] + (uint32_t)right.scalars[10]),
  2547. impl_limit255((uint32_t)left.scalars[11] + (uint32_t)right.scalars[11]),
  2548. impl_limit255((uint32_t)left.scalars[12] + (uint32_t)right.scalars[12]),
  2549. impl_limit255((uint32_t)left.scalars[13] + (uint32_t)right.scalars[13]),
  2550. impl_limit255((uint32_t)left.scalars[14] + (uint32_t)right.scalars[14]),
  2551. impl_limit255((uint32_t)left.scalars[15] + (uint32_t)right.scalars[15])
  2552. );
  2553. #endif
  2554. }
  2555. inline U8x16 saturatedSubtraction(const U8x16& left, const U8x16& right) {
  2556. #if defined USE_BASIC_SIMD
  2557. return U8x16(SUB_SAT_U8_SIMD(left.v, right.v));
  2558. #else
  2559. return U8x16(
  2560. impl_limit0((int32_t)left.scalars[0] - (int32_t)right.scalars[0]),
  2561. impl_limit0((int32_t)left.scalars[1] - (int32_t)right.scalars[1]),
  2562. impl_limit0((int32_t)left.scalars[2] - (int32_t)right.scalars[2]),
  2563. impl_limit0((int32_t)left.scalars[3] - (int32_t)right.scalars[3]),
  2564. impl_limit0((int32_t)left.scalars[4] - (int32_t)right.scalars[4]),
  2565. impl_limit0((int32_t)left.scalars[5] - (int32_t)right.scalars[5]),
  2566. impl_limit0((int32_t)left.scalars[6] - (int32_t)right.scalars[6]),
  2567. impl_limit0((int32_t)left.scalars[7] - (int32_t)right.scalars[7]),
  2568. impl_limit0((int32_t)left.scalars[8] - (int32_t)right.scalars[8]),
  2569. impl_limit0((int32_t)left.scalars[9] - (int32_t)right.scalars[9]),
  2570. impl_limit0((int32_t)left.scalars[10] - (int32_t)right.scalars[10]),
  2571. impl_limit0((int32_t)left.scalars[11] - (int32_t)right.scalars[11]),
  2572. impl_limit0((int32_t)left.scalars[12] - (int32_t)right.scalars[12]),
  2573. impl_limit0((int32_t)left.scalars[13] - (int32_t)right.scalars[13]),
  2574. impl_limit0((int32_t)left.scalars[14] - (int32_t)right.scalars[14]),
  2575. impl_limit0((int32_t)left.scalars[15] - (int32_t)right.scalars[15])
  2576. );
  2577. #endif
  2578. }
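// Usage sketch for saturated 8-bit arithmetic (illustrative only):
//   U8x16 a(200u), b(100u);
//   U8x16 s = saturatedAddition(a, b);    // 200 + 100 clamps to 255 instead of wrapping to 44
//   U8x16 d = saturatedSubtraction(b, a); // 100 - 200 clamps to 0 instead of wrapping to 156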
  2579. inline I32x4 truncateToI32(const F32x4& vector) {
  2580. #if defined USE_BASIC_SIMD
  2581. return I32x4(F32_TO_I32_SIMD(vector.v));
  2582. #else
  2583. return I32x4((int32_t)vector.scalars[0], (int32_t)vector.scalars[1], (int32_t)vector.scalars[2], (int32_t)vector.scalars[3]);
  2584. #endif
  2585. }
  2586. inline U32x4 truncateToU32(const F32x4& vector) {
  2587. #if defined USE_BASIC_SIMD
  2588. return U32x4(F32_TO_U32_SIMD(vector.v));
  2589. #else
  2590. return U32x4((uint32_t)vector.scalars[0], (uint32_t)vector.scalars[1], (uint32_t)vector.scalars[2], (uint32_t)vector.scalars[3]);
  2591. #endif
  2592. }
  2593. inline F32x4 floatFromI32(const I32x4& vector) {
  2594. #if defined USE_BASIC_SIMD
  2595. return F32x4(I32_TO_F32_SIMD(vector.v));
  2596. #else
  2597. return F32x4((float)vector.scalars[0], (float)vector.scalars[1], (float)vector.scalars[2], (float)vector.scalars[3]);
  2598. #endif
  2599. }
  2600. inline F32x4 floatFromU32(const U32x4& vector) {
  2601. #if defined USE_BASIC_SIMD
  2602. return F32x4(U32_TO_F32_SIMD(vector.v));
  2603. #else
  2604. return F32x4((float)vector.scalars[0], (float)vector.scalars[1], (float)vector.scalars[2], (float)vector.scalars[3]);
  2605. #endif
  2606. }
  2607. inline I32x4 I32FromU32(const U32x4& vector) {
  2608. #if defined USE_BASIC_SIMD
  2609. return I32x4(REINTERPRET_U32_TO_I32_SIMD(vector.v));
  2610. #else
  2611. return I32x4((int32_t)vector.scalars[0], (int32_t)vector.scalars[1], (int32_t)vector.scalars[2], (int32_t)vector.scalars[3]);
  2612. #endif
  2613. }
  2614. inline U32x4 U32FromI32(const I32x4& vector) {
  2615. #if defined USE_BASIC_SIMD
  2616. return U32x4(REINTERPRET_I32_TO_U32_SIMD(vector.v));
  2617. #else
  2618. return U32x4((uint32_t)vector.scalars[0], (uint32_t)vector.scalars[1], (uint32_t)vector.scalars[2], (uint32_t)vector.scalars[3]);
  2619. #endif
  2620. }
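// Usage sketch for the conversions above (illustrative only):
//   F32x4 f(2.75f);
//   I32x4 i = truncateToI32(f); // truncates toward zero, giving 2 in every lane
//   F32x4 g = floatFromI32(i);  // back to 2.0f in every lane
// The truncate functions do not round to nearest; add 0.5f first if rounding of positive values is wanted.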
  2621. // Warning! Behavior depends on endianness.
  2622. inline U8x16 reinterpret_U8FromU32(const U32x4& vector) {
  2623. #if defined USE_BASIC_SIMD
  2624. return U8x16(REINTERPRET_U32_TO_U8_SIMD(vector.v));
  2625. #else
  2626. const uint8_t *source = (const uint8_t*)vector.scalars;
  2627. return U8x16(
  2628. source[0], source[1], source[2], source[3], source[4], source[5], source[6], source[7],
  2629. source[8], source[9], source[10], source[11], source[12], source[13], source[14], source[15]
  2630. );
  2631. #endif
  2632. }
  2633. // Warning! Behavior depends on endianness.
  2634. inline U32x4 reinterpret_U32FromU8(const U8x16& vector) {
  2635. #if defined USE_BASIC_SIMD
  2636. return U32x4(REINTERPRET_U8_TO_U32_SIMD(vector.v));
  2637. #else
  2638. const uint32_t *source = (const uint32_t*)vector.scalars;
  2639. return U32x4(source[0], source[1], source[2], source[3]);
  2640. #endif
  2641. }
  2642. // Warning! Behavior depends on endianness.
  2643. inline U16x8 reinterpret_U16FromU32(const U32x4& vector) {
  2644. #if defined USE_BASIC_SIMD
  2645. return U16x8(REINTERPRET_U32_TO_U16_SIMD(vector.v));
  2646. #else
  2647. const uint16_t *source = (const uint16_t*)vector.scalars;
  2648. return U16x8(
  2649. source[0], source[1], source[2], source[3], source[4], source[5], source[6], source[7]
  2650. );
  2651. #endif
  2652. }
  2653. // Warning! Behavior depends on endianness.
  2654. inline U32x4 reinterpret_U32FromU16(const U16x8& vector) {
  2655. #if defined USE_BASIC_SIMD
  2656. return U32x4(REINTERPRET_U16_TO_U32_SIMD(vector.v));
  2657. #else
  2658. const uint32_t *source = (const uint32_t*)vector.scalars;
  2659. return U32x4(source[0], source[1], source[2], source[3]);
  2660. #endif
  2661. }
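// Usage sketch for the reinterpret casts above (illustrative only):
//   U32x4 packed(0x04030201u);
//   U8x16 bytes = reinterpret_U8FromU32(packed);
// On a little-endian machine each group of four bytes becomes 0x01, 0x02, 0x03, 0x04 in that order;
// on a big-endian machine the order within each group is reversed, which is why the warnings above apply.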
  2662. // Unpacking to larger integers
  2663. inline U32x4 lowerToU32(const U16x8& vector) {
  2664. #if defined USE_BASIC_SIMD
  2665. return U32x4(U16_LOW_TO_U32_SIMD(vector.v));
  2666. #else
  2667. return U32x4(vector.scalars[0], vector.scalars[1], vector.scalars[2], vector.scalars[3]);
  2668. #endif
  2669. }
  2670. inline U32x4 higherToU32(const U16x8& vector) {
  2671. #if defined USE_BASIC_SIMD
  2672. return U32x4(U16_HIGH_TO_U32_SIMD(vector.v));
  2673. #else
  2674. return U32x4(vector.scalars[4], vector.scalars[5], vector.scalars[6], vector.scalars[7]);
  2675. #endif
  2676. }
  2677. inline U16x8 lowerToU16(const U8x16& vector) {
  2678. #if defined USE_BASIC_SIMD
  2679. return U16x8(U8_LOW_TO_U16_SIMD(vector.v));
  2680. #else
  2681. return U16x8(
  2682. vector.scalars[0], vector.scalars[1], vector.scalars[2], vector.scalars[3],
  2683. vector.scalars[4], vector.scalars[5], vector.scalars[6], vector.scalars[7]
  2684. );
  2685. #endif
  2686. }
  2687. inline U16x8 higherToU16(const U8x16& vector) {
  2688. #if defined USE_BASIC_SIMD
  2689. return U16x8(U8_HIGH_TO_U16_SIMD(vector.v));
  2690. #else
  2691. return U16x8(
  2692. vector.scalars[8], vector.scalars[9], vector.scalars[10], vector.scalars[11],
  2693. vector.scalars[12], vector.scalars[13], vector.scalars[14], vector.scalars[15]
  2694. );
  2695. #endif
  2696. }
  2697. // Saturated packing
  2698. inline U8x16 saturateToU8(const U16x8& lower, const U16x8& upper) {
  2699. #if defined USE_BASIC_SIMD
  2700. return U8x16(PACK_SAT_U16_TO_U8_SIMD(lower.v, upper.v));
  2701. #else
  2702. return U8x16(
  2703. impl_limit255(lower.scalars[0]),
  2704. impl_limit255(lower.scalars[1]),
  2705. impl_limit255(lower.scalars[2]),
  2706. impl_limit255(lower.scalars[3]),
  2707. impl_limit255(lower.scalars[4]),
  2708. impl_limit255(lower.scalars[5]),
  2709. impl_limit255(lower.scalars[6]),
  2710. impl_limit255(lower.scalars[7]),
  2711. impl_limit255(upper.scalars[0]),
  2712. impl_limit255(upper.scalars[1]),
  2713. impl_limit255(upper.scalars[2]),
  2714. impl_limit255(upper.scalars[3]),
  2715. impl_limit255(upper.scalars[4]),
  2716. impl_limit255(upper.scalars[5]),
  2717. impl_limit255(upper.scalars[6]),
  2718. impl_limit255(upper.scalars[7])
  2719. );
  2720. #endif
  2721. }
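// Usage sketch for the widen-then-pack pattern above, as used in 8-bit image filters (illustrative only):
//   U8x16 pixels(100u);                            // 16 input bytes
//   U16x8 low  = lowerToU16(pixels) * U16x8(3u);   // widen to 16 bits so that 100 * 3 = 300 cannot wrap
//   U16x8 high = higherToU16(pixels) * U16x8(3u);
//   U8x16 result = saturateToU8(low, high);        // 300 clamps to 255 in every lane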
  2722. // Unary negation for convenience and code readability.
2723. // Before using unary negation, check whether the extra operation can be avoided:
2724. // * Can an addition be turned into a subtraction?
2725. // x = -a + b
2726. // x = b - a
2727. // * Can a multiplying constant or scalar be negated instead?
  2728. // x = -b * 2
  2729. // x = b * -2
  2730. inline F32x4 operator-(const F32x4& value) {
  2731. #if defined USE_BASIC_SIMD
  2732. return F32x4(0.0f) - value;
  2733. #else
  2734. return F32x4(-value.scalars[0], -value.scalars[1], -value.scalars[2], -value.scalars[3]);
  2735. #endif
  2736. }
  2737. inline I32x4 operator-(const I32x4& value) {
  2738. #if defined USE_BASIC_SIMD
  2739. return I32x4(0) - value;
  2740. #else
  2741. return I32x4(-value.scalars[0], -value.scalars[1], -value.scalars[2], -value.scalars[3]);
  2742. #endif
  2743. }
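// Sketch of the rewrite advice above (illustrative only; a and b stand for any F32x4 values, and the
// F32x4 +, - and * operators declared earlier in this header are assumed):
//   F32x4 slow = -a + b;             // builds a negated temporary before adding
//   F32x4 fast = b - a;              // same result with a single subtraction
//   F32x4 slower = -b * F32x4(2.0f);
//   F32x4 faster = b * F32x4(-2.0f); // negate the constant instead of the vector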
  2744. // Helper macros for generating the vector extract functions.
  2745. // Having one function for each type and offset makes sure that the compiler gets an immediate integer within the valid range.
  2746. #if defined USE_BASIC_SIMD
  2747. #if defined USE_SSE2
  2748. #if defined USE_SSSE3
  2749. #define _MM_ALIGNR_EPI8(A, B, OFFSET) _mm_alignr_epi8(A, B, OFFSET)
  2750. #else
  2751. // If SSSE3 is not used, emulate it using stack memory and unaligned reading of data.
  2752. static inline SIMD_U8x16 _MM_ALIGNR_EPI8(SIMD_U8x16 a, SIMD_U8x16 b, int offset) {
  2753. ALIGN16 uint8_t vectorBuffer[32];
  2754. _mm_store_si128((SIMD_U8x16*)(vectorBuffer), b);
  2755. _mm_store_si128((SIMD_U8x16*)(vectorBuffer + 16), a);
  2756. return _mm_loadu_si128((SIMD_U8x16*)(vectorBuffer + offset));
  2757. }
  2758. #endif
  2759. #define VECTOR_EXTRACT_GENERATOR_U8(OFFSET, FALLBACK_RESULT) return U8x16(_MM_ALIGNR_EPI8(b.v, a.v, OFFSET));
  2760. #define VECTOR_EXTRACT_GENERATOR_U16(OFFSET, FALLBACK_RESULT) return U16x8(_MM_ALIGNR_EPI8(b.v, a.v, OFFSET * 2));
  2761. #define VECTOR_EXTRACT_GENERATOR_U32(OFFSET, FALLBACK_RESULT) return U32x4(_MM_ALIGNR_EPI8(b.v, a.v, OFFSET * 4));
  2762. #define VECTOR_EXTRACT_GENERATOR_I32(OFFSET, FALLBACK_RESULT) return I32x4(_MM_ALIGNR_EPI8(b.v, a.v, OFFSET * 4));
  2763. #define VECTOR_EXTRACT_GENERATOR_F32(OFFSET, FALLBACK_RESULT) return F32x4(SIMD_F32x4(_MM_ALIGNR_EPI8(SIMD_U32x4(b.v), SIMD_U32x4(a.v), OFFSET * 4)));
  2764. #elif defined USE_NEON
  2765. #define VECTOR_EXTRACT_GENERATOR_U8(OFFSET, FALLBACK_RESULT) return U8x16(vextq_u8(a.v, b.v, OFFSET));
  2766. #define VECTOR_EXTRACT_GENERATOR_U16(OFFSET, FALLBACK_RESULT) return U16x8(vextq_u16(a.v, b.v, OFFSET));
  2767. #define VECTOR_EXTRACT_GENERATOR_U32(OFFSET, FALLBACK_RESULT) return U32x4(vextq_u32(a.v, b.v, OFFSET));
  2768. #define VECTOR_EXTRACT_GENERATOR_I32(OFFSET, FALLBACK_RESULT) return I32x4(vextq_s32(a.v, b.v, OFFSET));
  2769. #define VECTOR_EXTRACT_GENERATOR_F32(OFFSET, FALLBACK_RESULT) return F32x4(vextq_f32(a.v, b.v, OFFSET));
  2770. #endif
  2771. #else
  2772. #define VECTOR_EXTRACT_GENERATOR_U8(OFFSET, FALLBACK_RESULT) return FALLBACK_RESULT;
  2773. #define VECTOR_EXTRACT_GENERATOR_U16(OFFSET, FALLBACK_RESULT) return FALLBACK_RESULT;
  2774. #define VECTOR_EXTRACT_GENERATOR_U32(OFFSET, FALLBACK_RESULT) return FALLBACK_RESULT;
  2775. #define VECTOR_EXTRACT_GENERATOR_I32(OFFSET, FALLBACK_RESULT) return FALLBACK_RESULT;
  2776. #define VECTOR_EXTRACT_GENERATOR_F32(OFFSET, FALLBACK_RESULT) return FALLBACK_RESULT;
  2777. #endif
2778. // Vector extraction concatenates two input vectors and reads a vector from the combined range at the given element offset.
2779. // The first and last offsets simply return one of the inputs; they can still be used for readability, because the compiler inlines them away.
  2780. // To get elements from the right side, combine the center vector with the right vector and shift one element to the left using vectorExtract_1 for the given type.
  2781. // To get elements from the left side, combine the left vector with the center vector and shift one element to the right using vectorExtract_15 for 16 lanes, vectorExtract_7 for 8 lanes, or vectorExtract_3 for 4 lanes.
  2782. U8x16 inline vectorExtract_0(const U8x16 &a, const U8x16 &b) { return a; }
  2783. U8x16 inline vectorExtract_1(const U8x16 &a, const U8x16 &b) { VECTOR_EXTRACT_GENERATOR_U8(1, U8x16(a.scalars[1], a.scalars[2], a.scalars[3], a.scalars[4], a.scalars[5], a.scalars[6], a.scalars[7], a.scalars[8], a.scalars[9], a.scalars[10], a.scalars[11], a.scalars[12], a.scalars[13], a.scalars[14], a.scalars[15], b.scalars[0])) }
  2784. U8x16 inline vectorExtract_2(const U8x16 &a, const U8x16 &b) { VECTOR_EXTRACT_GENERATOR_U8(2, U8x16(a.scalars[2], a.scalars[3], a.scalars[4], a.scalars[5], a.scalars[6], a.scalars[7], a.scalars[8], a.scalars[9], a.scalars[10], a.scalars[11], a.scalars[12], a.scalars[13], a.scalars[14], a.scalars[15], b.scalars[0], b.scalars[1])) }
  2785. U8x16 inline vectorExtract_3(const U8x16 &a, const U8x16 &b) { VECTOR_EXTRACT_GENERATOR_U8(3, U8x16(a.scalars[3], a.scalars[4], a.scalars[5], a.scalars[6], a.scalars[7], a.scalars[8], a.scalars[9], a.scalars[10], a.scalars[11], a.scalars[12], a.scalars[13], a.scalars[14], a.scalars[15], b.scalars[0], b.scalars[1], b.scalars[2])) }
  2786. U8x16 inline vectorExtract_4(const U8x16 &a, const U8x16 &b) { VECTOR_EXTRACT_GENERATOR_U8(4, U8x16(a.scalars[4], a.scalars[5], a.scalars[6], a.scalars[7], a.scalars[8], a.scalars[9], a.scalars[10], a.scalars[11], a.scalars[12], a.scalars[13], a.scalars[14], a.scalars[15], b.scalars[0], b.scalars[1], b.scalars[2], b.scalars[3])) }
  2787. U8x16 inline vectorExtract_5(const U8x16 &a, const U8x16 &b) { VECTOR_EXTRACT_GENERATOR_U8(5, U8x16(a.scalars[5], a.scalars[6], a.scalars[7], a.scalars[8], a.scalars[9], a.scalars[10], a.scalars[11], a.scalars[12], a.scalars[13], a.scalars[14], a.scalars[15], b.scalars[0], b.scalars[1], b.scalars[2], b.scalars[3], b.scalars[4])) }
  2788. U8x16 inline vectorExtract_6(const U8x16 &a, const U8x16 &b) { VECTOR_EXTRACT_GENERATOR_U8(6, U8x16(a.scalars[6], a.scalars[7], a.scalars[8], a.scalars[9], a.scalars[10], a.scalars[11], a.scalars[12], a.scalars[13], a.scalars[14], a.scalars[15], b.scalars[0], b.scalars[1], b.scalars[2], b.scalars[3], b.scalars[4], b.scalars[5])) }
  2789. U8x16 inline vectorExtract_7(const U8x16 &a, const U8x16 &b) { VECTOR_EXTRACT_GENERATOR_U8(7, U8x16(a.scalars[7], a.scalars[8], a.scalars[9], a.scalars[10], a.scalars[11], a.scalars[12], a.scalars[13], a.scalars[14], a.scalars[15], b.scalars[0], b.scalars[1], b.scalars[2], b.scalars[3], b.scalars[4], b.scalars[5], b.scalars[6])) }
  2790. U8x16 inline vectorExtract_8(const U8x16 &a, const U8x16 &b) { VECTOR_EXTRACT_GENERATOR_U8(8, U8x16(a.scalars[8], a.scalars[9], a.scalars[10], a.scalars[11], a.scalars[12], a.scalars[13], a.scalars[14], a.scalars[15], b.scalars[0], b.scalars[1], b.scalars[2], b.scalars[3], b.scalars[4], b.scalars[5], b.scalars[6], b.scalars[7])) }
  2791. U8x16 inline vectorExtract_9(const U8x16 &a, const U8x16 &b) { VECTOR_EXTRACT_GENERATOR_U8(9, U8x16(a.scalars[9], a.scalars[10], a.scalars[11], a.scalars[12], a.scalars[13], a.scalars[14], a.scalars[15], b.scalars[0], b.scalars[1], b.scalars[2], b.scalars[3], b.scalars[4], b.scalars[5], b.scalars[6], b.scalars[7], b.scalars[8])) }
  2792. U8x16 inline vectorExtract_10(const U8x16 &a, const U8x16 &b) { VECTOR_EXTRACT_GENERATOR_U8(10, U8x16(a.scalars[10], a.scalars[11], a.scalars[12], a.scalars[13], a.scalars[14], a.scalars[15], b.scalars[0], b.scalars[1], b.scalars[2], b.scalars[3], b.scalars[4], b.scalars[5], b.scalars[6], b.scalars[7], b.scalars[8], b.scalars[9])) }
  2793. U8x16 inline vectorExtract_11(const U8x16 &a, const U8x16 &b) { VECTOR_EXTRACT_GENERATOR_U8(11, U8x16(a.scalars[11], a.scalars[12], a.scalars[13], a.scalars[14], a.scalars[15], b.scalars[0], b.scalars[1], b.scalars[2], b.scalars[3], b.scalars[4], b.scalars[5], b.scalars[6], b.scalars[7], b.scalars[8], b.scalars[9], b.scalars[10])) }
  2794. U8x16 inline vectorExtract_12(const U8x16 &a, const U8x16 &b) { VECTOR_EXTRACT_GENERATOR_U8(12, U8x16(a.scalars[12], a.scalars[13], a.scalars[14], a.scalars[15], b.scalars[0], b.scalars[1], b.scalars[2], b.scalars[3], b.scalars[4], b.scalars[5], b.scalars[6], b.scalars[7], b.scalars[8], b.scalars[9], b.scalars[10], b.scalars[11])) }
  2795. U8x16 inline vectorExtract_13(const U8x16 &a, const U8x16 &b) { VECTOR_EXTRACT_GENERATOR_U8(13, U8x16(a.scalars[13], a.scalars[14], a.scalars[15], b.scalars[0], b.scalars[1], b.scalars[2], b.scalars[3], b.scalars[4], b.scalars[5], b.scalars[6], b.scalars[7], b.scalars[8], b.scalars[9], b.scalars[10], b.scalars[11], b.scalars[12])) }
  2796. U8x16 inline vectorExtract_14(const U8x16 &a, const U8x16 &b) { VECTOR_EXTRACT_GENERATOR_U8(14, U8x16(a.scalars[14], a.scalars[15], b.scalars[0], b.scalars[1], b.scalars[2], b.scalars[3], b.scalars[4], b.scalars[5], b.scalars[6], b.scalars[7], b.scalars[8], b.scalars[9], b.scalars[10], b.scalars[11], b.scalars[12], b.scalars[13])) }
  2797. U8x16 inline vectorExtract_15(const U8x16 &a, const U8x16 &b) { VECTOR_EXTRACT_GENERATOR_U8(15, U8x16(a.scalars[15], b.scalars[0], b.scalars[1], b.scalars[2], b.scalars[3], b.scalars[4], b.scalars[5], b.scalars[6], b.scalars[7], b.scalars[8], b.scalars[9], b.scalars[10], b.scalars[11], b.scalars[12], b.scalars[13], b.scalars[14])) }
  2798. U8x16 inline vectorExtract_16(const U8x16 &a, const U8x16 &b) { return b; }
  2799. U16x8 inline vectorExtract_0(const U16x8 &a, const U16x8 &b) { return a; }
  2800. U16x8 inline vectorExtract_1(const U16x8 &a, const U16x8 &b) { VECTOR_EXTRACT_GENERATOR_U16(1, U16x8(a.scalars[1], a.scalars[2], a.scalars[3], a.scalars[4], a.scalars[5], a.scalars[6], a.scalars[7], b.scalars[0])) }
  2801. U16x8 inline vectorExtract_2(const U16x8 &a, const U16x8 &b) { VECTOR_EXTRACT_GENERATOR_U16(2, U16x8(a.scalars[2], a.scalars[3], a.scalars[4], a.scalars[5], a.scalars[6], a.scalars[7], b.scalars[0], b.scalars[1])) }
  2802. U16x8 inline vectorExtract_3(const U16x8 &a, const U16x8 &b) { VECTOR_EXTRACT_GENERATOR_U16(3, U16x8(a.scalars[3], a.scalars[4], a.scalars[5], a.scalars[6], a.scalars[7], b.scalars[0], b.scalars[1], b.scalars[2])) }
  2803. U16x8 inline vectorExtract_4(const U16x8 &a, const U16x8 &b) { VECTOR_EXTRACT_GENERATOR_U16(4, U16x8(a.scalars[4], a.scalars[5], a.scalars[6], a.scalars[7], b.scalars[0], b.scalars[1], b.scalars[2], b.scalars[3])) }
  2804. U16x8 inline vectorExtract_5(const U16x8 &a, const U16x8 &b) { VECTOR_EXTRACT_GENERATOR_U16(5, U16x8(a.scalars[5], a.scalars[6], a.scalars[7], b.scalars[0], b.scalars[1], b.scalars[2], b.scalars[3], b.scalars[4])) }
  2805. U16x8 inline vectorExtract_6(const U16x8 &a, const U16x8 &b) { VECTOR_EXTRACT_GENERATOR_U16(6, U16x8(a.scalars[6], a.scalars[7], b.scalars[0], b.scalars[1], b.scalars[2], b.scalars[3], b.scalars[4], b.scalars[5])) }
  2806. U16x8 inline vectorExtract_7(const U16x8 &a, const U16x8 &b) { VECTOR_EXTRACT_GENERATOR_U16(7, U16x8(a.scalars[7], b.scalars[0], b.scalars[1], b.scalars[2], b.scalars[3], b.scalars[4], b.scalars[5], b.scalars[6])) }
  2807. U16x8 inline vectorExtract_8(const U16x8 &a, const U16x8 &b) { return b; }
  2808. U32x4 inline vectorExtract_0(const U32x4 &a, const U32x4 &b) { return a; }
  2809. U32x4 inline vectorExtract_1(const U32x4 &a, const U32x4 &b) { VECTOR_EXTRACT_GENERATOR_U32(1, U32x4(a.scalars[1], a.scalars[2], a.scalars[3], b.scalars[0])) }
  2810. U32x4 inline vectorExtract_2(const U32x4 &a, const U32x4 &b) { VECTOR_EXTRACT_GENERATOR_U32(2, U32x4(a.scalars[2], a.scalars[3], b.scalars[0], b.scalars[1])) }
  2811. U32x4 inline vectorExtract_3(const U32x4 &a, const U32x4 &b) { VECTOR_EXTRACT_GENERATOR_U32(3, U32x4(a.scalars[3], b.scalars[0], b.scalars[1], b.scalars[2])) }
  2812. U32x4 inline vectorExtract_4(const U32x4 &a, const U32x4 &b) { return b; }
  2813. I32x4 inline vectorExtract_0(const I32x4 &a, const I32x4 &b) { return a; }
  2814. I32x4 inline vectorExtract_1(const I32x4 &a, const I32x4 &b) { VECTOR_EXTRACT_GENERATOR_I32(1, I32x4(a.scalars[1], a.scalars[2], a.scalars[3], b.scalars[0])) }
  2815. I32x4 inline vectorExtract_2(const I32x4 &a, const I32x4 &b) { VECTOR_EXTRACT_GENERATOR_I32(2, I32x4(a.scalars[2], a.scalars[3], b.scalars[0], b.scalars[1])) }
  2816. I32x4 inline vectorExtract_3(const I32x4 &a, const I32x4 &b) { VECTOR_EXTRACT_GENERATOR_I32(3, I32x4(a.scalars[3], b.scalars[0], b.scalars[1], b.scalars[2])) }
  2817. I32x4 inline vectorExtract_4(const I32x4 &a, const I32x4 &b) { return b; }
  2818. F32x4 inline vectorExtract_0(const F32x4 &a, const F32x4 &b) { return a; }
  2819. F32x4 inline vectorExtract_1(const F32x4 &a, const F32x4 &b) { VECTOR_EXTRACT_GENERATOR_F32(1, F32x4(a.scalars[1], a.scalars[2], a.scalars[3], b.scalars[0])) }
  2820. F32x4 inline vectorExtract_2(const F32x4 &a, const F32x4 &b) { VECTOR_EXTRACT_GENERATOR_F32(2, F32x4(a.scalars[2], a.scalars[3], b.scalars[0], b.scalars[1])) }
  2821. F32x4 inline vectorExtract_3(const F32x4 &a, const F32x4 &b) { VECTOR_EXTRACT_GENERATOR_F32(3, F32x4(a.scalars[3], b.scalars[0], b.scalars[1], b.scalars[2])) }
  2822. F32x4 inline vectorExtract_4(const F32x4 &a, const F32x4 &b) { return b; }
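// Usage sketch for vector extraction as a sliding window (illustrative only, assuming the F32x4
// arithmetic operators declared earlier in this header; left, center and right are three consecutive
// F32x4 loads from the same row):
//   F32x4 before = vectorExtract_3(left, center);  // the window starting one element to the left of center
//   F32x4 after  = vectorExtract_1(center, right); // the window starting one element to the right of center
//   F32x4 blur   = (before + center + after) * F32x4(1.0f / 3.0f);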
  2823. // Gather instructions load memory from a pointer at multiple index offsets at the same time.
2824. // The given pointers should be 4-byte aligned, so that the scalar fallback also works on machines with strict alignment requirements.
  2825. #if defined USE_AVX2
  2826. #define GATHER_I32x4_AVX2(SOURCE, FOUR_OFFSETS, SCALE) _mm_i32gather_epi32((const int32_t*)(SOURCE), FOUR_OFFSETS, SCALE)
  2827. #define GATHER_U32x4_AVX2(SOURCE, FOUR_OFFSETS, SCALE) _mm_i32gather_epi32((const int32_t*)(SOURCE), FOUR_OFFSETS, SCALE)
  2828. #define GATHER_F32x4_AVX2(SOURCE, FOUR_OFFSETS, SCALE) _mm_i32gather_ps((const float*)(SOURCE), FOUR_OFFSETS, SCALE)
  2829. #endif
  2830. static inline U32x4 gather_U32(dsr::SafePointer<const uint32_t> data, const U32x4 &elementOffset) {
  2831. #if defined USE_AVX2
  2832. // TODO: Implement safety checks for debug mode.
  2833. return U32x4(GATHER_U32x4_AVX2(data.getUnsafe(), elementOffset.v, 4));
  2834. #else
  2835. ALIGN16 uint32_t elementOffsets[4];
  2836. elementOffset.writeAlignedUnsafe(elementOffsets);
  2837. return U32x4(
  2838. *(data + elementOffsets[0]),
  2839. *(data + elementOffsets[1]),
  2840. *(data + elementOffsets[2]),
  2841. *(data + elementOffsets[3])
  2842. );
  2843. #endif
  2844. }
  2845. static inline I32x4 gather_I32(dsr::SafePointer<const int32_t> data, const U32x4 &elementOffset) {
  2846. #if defined USE_AVX2
  2847. // TODO: Implement safety checks for debug mode.
2848. return I32x4(GATHER_I32x4_AVX2(data.getUnsafe(), elementOffset.v, 4));
  2849. #else
  2850. ALIGN16 uint32_t elementOffsets[4];
  2851. elementOffset.writeAlignedUnsafe(elementOffsets);
  2852. return I32x4(
  2853. *(data + elementOffsets[0]),
  2854. *(data + elementOffsets[1]),
  2855. *(data + elementOffsets[2]),
  2856. *(data + elementOffsets[3])
  2857. );
  2858. #endif
  2859. }
  2860. static inline F32x4 gather_F32(dsr::SafePointer<const float> data, const U32x4 &elementOffset) {
  2861. #if defined USE_AVX2
  2862. // TODO: Implement safety checks for debug mode.
  2863. return F32x4(GATHER_F32x4_AVX2(data.getUnsafe(), elementOffset.v, 4));
  2864. #else
  2865. ALIGN16 uint32_t elementOffsets[4];
  2866. elementOffset.writeAlignedUnsafe(elementOffsets);
  2867. return F32x4(
  2868. *(data + elementOffsets[0]),
  2869. *(data + elementOffsets[1]),
  2870. *(data + elementOffsets[2]),
  2871. *(data + elementOffsets[3])
  2872. );
  2873. #endif
  2874. }
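// Usage sketch for gather_F32 (illustrative only; `table` stands for any dsr::SafePointer<const float>
// to 4-byte aligned data with at least 43 elements):
//   U32x4 indices(0u, 3u, 7u, 42u);            // one element offset per lane
//   F32x4 values = gather_F32(table, indices); // reads the elements at offsets 0, 3, 7 and 42 from table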
  2875. inline F32x8 operator+(const F32x8& left, const F32x8& right) {
  2876. #if defined USE_256BIT_F_SIMD
  2877. return F32x8(ADD_F32_SIMD256(left.v, right.v));
  2878. #else
  2879. return F32x8(
  2880. left.scalars[0] + right.scalars[0],
  2881. left.scalars[1] + right.scalars[1],
  2882. left.scalars[2] + right.scalars[2],
  2883. left.scalars[3] + right.scalars[3],
  2884. left.scalars[4] + right.scalars[4],
  2885. left.scalars[5] + right.scalars[5],
  2886. left.scalars[6] + right.scalars[6],
  2887. left.scalars[7] + right.scalars[7]
  2888. );
  2889. #endif
  2890. }
  2891. inline F32x8 operator-(const F32x8& left, const F32x8& right) {
  2892. #if defined USE_256BIT_F_SIMD
  2893. return F32x8(SUB_F32_SIMD256(left.v, right.v));
  2894. #else
  2895. return F32x8(
  2896. left.scalars[0] - right.scalars[0],
  2897. left.scalars[1] - right.scalars[1],
  2898. left.scalars[2] - right.scalars[2],
  2899. left.scalars[3] - right.scalars[3],
  2900. left.scalars[4] - right.scalars[4],
  2901. left.scalars[5] - right.scalars[5],
  2902. left.scalars[6] - right.scalars[6],
  2903. left.scalars[7] - right.scalars[7]
  2904. );
  2905. #endif
  2906. }
  2907. inline F32x8 operator*(const F32x8& left, const F32x8& right) {
  2908. #if defined USE_256BIT_F_SIMD
  2909. return F32x8(MUL_F32_SIMD256(left.v, right.v));
  2910. #else
  2911. return F32x8(
  2912. left.scalars[0] * right.scalars[0],
  2913. left.scalars[1] * right.scalars[1],
  2914. left.scalars[2] * right.scalars[2],
  2915. left.scalars[3] * right.scalars[3],
  2916. left.scalars[4] * right.scalars[4],
  2917. left.scalars[5] * right.scalars[5],
  2918. left.scalars[6] * right.scalars[6],
  2919. left.scalars[7] * right.scalars[7]
  2920. );
  2921. #endif
  2922. }
  2923. inline F32x8 min(const F32x8& left, const F32x8& right) {
  2924. #if defined USE_256BIT_F_SIMD
  2925. return F32x8(MIN_F32_SIMD256(left.v, right.v));
  2926. #else
  2927. float v0 = left.scalars[0];
  2928. float v1 = left.scalars[1];
  2929. float v2 = left.scalars[2];
  2930. float v3 = left.scalars[3];
  2931. float v4 = left.scalars[4];
  2932. float v5 = left.scalars[5];
  2933. float v6 = left.scalars[6];
  2934. float v7 = left.scalars[7];
  2935. float r0 = right.scalars[0];
  2936. float r1 = right.scalars[1];
  2937. float r2 = right.scalars[2];
  2938. float r3 = right.scalars[3];
  2939. float r4 = right.scalars[4];
  2940. float r5 = right.scalars[5];
  2941. float r6 = right.scalars[6];
  2942. float r7 = right.scalars[7];
  2943. if (r0 < v0) { v0 = r0; }
  2944. if (r1 < v1) { v1 = r1; }
  2945. if (r2 < v2) { v2 = r2; }
  2946. if (r3 < v3) { v3 = r3; }
  2947. if (r4 < v4) { v4 = r4; }
  2948. if (r5 < v5) { v5 = r5; }
  2949. if (r6 < v6) { v6 = r6; }
  2950. if (r7 < v7) { v7 = r7; }
  2951. return F32x8(v0, v1, v2, v3, v4, v5, v6, v7);
  2952. #endif
  2953. }
  2954. inline F32x8 max(const F32x8& left, const F32x8& right) {
  2955. #if defined USE_256BIT_F_SIMD
  2956. return F32x8(MAX_F32_SIMD256(left.v, right.v));
  2957. #else
  2958. float v0 = left.scalars[0];
  2959. float v1 = left.scalars[1];
  2960. float v2 = left.scalars[2];
  2961. float v3 = left.scalars[3];
  2962. float v4 = left.scalars[4];
  2963. float v5 = left.scalars[5];
  2964. float v6 = left.scalars[6];
  2965. float v7 = left.scalars[7];
  2966. float r0 = right.scalars[0];
  2967. float r1 = right.scalars[1];
  2968. float r2 = right.scalars[2];
  2969. float r3 = right.scalars[3];
  2970. float r4 = right.scalars[4];
  2971. float r5 = right.scalars[5];
  2972. float r6 = right.scalars[6];
  2973. float r7 = right.scalars[7];
  2974. if (r0 > v0) { v0 = r0; }
  2975. if (r1 > v1) { v1 = r1; }
  2976. if (r2 > v2) { v2 = r2; }
  2977. if (r3 > v3) { v3 = r3; }
  2978. if (r4 > v4) { v4 = r4; }
  2979. if (r5 > v5) { v5 = r5; }
  2980. if (r6 > v6) { v6 = r6; }
  2981. if (r7 > v7) { v7 = r7; }
  2982. return F32x8(v0, v1, v2, v3, v4, v5, v6, v7);
  2983. #endif
  2984. }
  2985. inline I32x8 operator+(const I32x8& left, const I32x8& right) {
  2986. #if defined USE_256BIT_X_SIMD
  2987. return I32x8(ADD_I32_SIMD256(left.v, right.v));
  2988. #else
  2989. return I32x8(
  2990. left.scalars[0] + right.scalars[0],
  2991. left.scalars[1] + right.scalars[1],
  2992. left.scalars[2] + right.scalars[2],
  2993. left.scalars[3] + right.scalars[3],
  2994. left.scalars[4] + right.scalars[4],
  2995. left.scalars[5] + right.scalars[5],
  2996. left.scalars[6] + right.scalars[6],
  2997. left.scalars[7] + right.scalars[7]);
  2998. #endif
  2999. }
  3000. inline I32x8 operator-(const I32x8& left, const I32x8& right) {
  3001. #if defined USE_256BIT_X_SIMD
  3002. return I32x8(SUB_I32_SIMD256(left.v, right.v));
  3003. #else
  3004. return I32x8(
  3005. left.scalars[0] - right.scalars[0],
  3006. left.scalars[1] - right.scalars[1],
  3007. left.scalars[2] - right.scalars[2],
  3008. left.scalars[3] - right.scalars[3],
  3009. left.scalars[4] - right.scalars[4],
  3010. left.scalars[5] - right.scalars[5],
  3011. left.scalars[6] - right.scalars[6],
  3012. left.scalars[7] - right.scalars[7]);
  3013. #endif
  3014. }
  3015. inline I32x8 operator*(const I32x8& left, const I32x8& right) {
  3016. #if defined USE_AVX2
  3017. return I32x8(MUL_I32_SIMD256(left.v, right.v));
  3018. #else
  3019. return I32x8(
  3020. left.scalars[0] * right.scalars[0],
  3021. left.scalars[1] * right.scalars[1],
  3022. left.scalars[2] * right.scalars[2],
  3023. left.scalars[3] * right.scalars[3],
  3024. left.scalars[4] * right.scalars[4],
  3025. left.scalars[5] * right.scalars[5],
  3026. left.scalars[6] * right.scalars[6],
  3027. left.scalars[7] * right.scalars[7]
  3028. );
  3029. #endif
  3030. }
  3031. inline U32x8 operator+(const U32x8& left, const U32x8& right) {
  3032. #if defined USE_256BIT_X_SIMD
  3033. return U32x8(ADD_U32_SIMD256(left.v, right.v));
  3034. #else
  3035. return U32x8(
  3036. left.scalars[0] + right.scalars[0],
  3037. left.scalars[1] + right.scalars[1],
  3038. left.scalars[2] + right.scalars[2],
  3039. left.scalars[3] + right.scalars[3],
  3040. left.scalars[4] + right.scalars[4],
  3041. left.scalars[5] + right.scalars[5],
  3042. left.scalars[6] + right.scalars[6],
  3043. left.scalars[7] + right.scalars[7]
  3044. );
  3045. #endif
  3046. }
  3047. inline U32x8 operator-(const U32x8& left, const U32x8& right) {
  3048. #if defined USE_256BIT_X_SIMD
  3049. return U32x8(SUB_U32_SIMD256(left.v, right.v));
  3050. #else
  3051. return U32x8(
  3052. left.scalars[0] - right.scalars[0],
  3053. left.scalars[1] - right.scalars[1],
  3054. left.scalars[2] - right.scalars[2],
  3055. left.scalars[3] - right.scalars[3],
  3056. left.scalars[4] - right.scalars[4],
  3057. left.scalars[5] - right.scalars[5],
  3058. left.scalars[6] - right.scalars[6],
  3059. left.scalars[7] - right.scalars[7]
  3060. );
  3061. #endif
  3062. }
  3063. inline U32x8 operator*(const U32x8& left, const U32x8& right) {
  3064. #if defined USE_256BIT_X_SIMD
  3065. return U32x8(MUL_U32_SIMD256(left.v, right.v));
  3066. #else
  3067. return U32x8(
  3068. left.scalars[0] * right.scalars[0],
  3069. left.scalars[1] * right.scalars[1],
  3070. left.scalars[2] * right.scalars[2],
  3071. left.scalars[3] * right.scalars[3],
  3072. left.scalars[4] * right.scalars[4],
  3073. left.scalars[5] * right.scalars[5],
  3074. left.scalars[6] * right.scalars[6],
  3075. left.scalars[7] * right.scalars[7]
  3076. );
  3077. #endif
  3078. }
  3079. inline U32x8 operator&(const U32x8& left, const U32x8& right) {
  3080. #if defined USE_256BIT_X_SIMD
  3081. return U32x8(BITWISE_AND_U32_SIMD256(left.v, right.v));
  3082. #else
  3083. return U32x8(
  3084. left.scalars[0] & right.scalars[0],
  3085. left.scalars[1] & right.scalars[1],
  3086. left.scalars[2] & right.scalars[2],
  3087. left.scalars[3] & right.scalars[3],
  3088. left.scalars[4] & right.scalars[4],
  3089. left.scalars[5] & right.scalars[5],
  3090. left.scalars[6] & right.scalars[6],
  3091. left.scalars[7] & right.scalars[7]
  3092. );
  3093. #endif
  3094. }
  3095. inline U32x8 operator|(const U32x8& left, const U32x8& right) {
  3096. #if defined USE_256BIT_X_SIMD
  3097. return U32x8(BITWISE_OR_U32_SIMD256(left.v, right.v));
  3098. #else
  3099. return U32x8(
  3100. left.scalars[0] | right.scalars[0],
  3101. left.scalars[1] | right.scalars[1],
  3102. left.scalars[2] | right.scalars[2],
  3103. left.scalars[3] | right.scalars[3],
  3104. left.scalars[4] | right.scalars[4],
  3105. left.scalars[5] | right.scalars[5],
  3106. left.scalars[6] | right.scalars[6],
  3107. left.scalars[7] | right.scalars[7]
  3108. );
  3109. #endif
  3110. }
  3111. inline U32x8 operator^(const U32x8& left, const U32x8& right) {
  3112. #if defined USE_256BIT_X_SIMD
  3113. return U32x8(BITWISE_XOR_U32_SIMD256(left.v, right.v));
  3114. #else
  3115. return U32x8(
  3116. left.scalars[0] ^ right.scalars[0],
  3117. left.scalars[1] ^ right.scalars[1],
  3118. left.scalars[2] ^ right.scalars[2],
  3119. left.scalars[3] ^ right.scalars[3],
  3120. left.scalars[4] ^ right.scalars[4],
  3121. left.scalars[5] ^ right.scalars[5],
  3122. left.scalars[6] ^ right.scalars[6],
  3123. left.scalars[7] ^ right.scalars[7]
  3124. );
  3125. #endif
  3126. }
  3127. inline U32x8 operator~(const U32x8& value) {
  3128. #if defined USE_BASIC_SIMD
  3129. return value ^ U32x8(~uint32_t(0));
  3130. #else
  3131. return U32x8(
  3132. ~value.scalars[0],
  3133. ~value.scalars[1],
  3134. ~value.scalars[2],
  3135. ~value.scalars[3],
  3136. ~value.scalars[4],
  3137. ~value.scalars[5],
  3138. ~value.scalars[6],
  3139. ~value.scalars[7]
  3140. );
  3141. #endif
  3142. }
3143. // ARM NEON does not offer 256-bit vectors, so these variable shifts fall back to scalar code. (AVX2 does provide variable shift intrinsics; see the sketch after operator>> below.)
  3144. inline U32x8 operator<<(const U32x8& left, const U32x8 &bitOffsets) {
  3145. assert(allLanesLesser(bitOffsets, U32x8(32u)));
  3146. return U32x8(
  3147. left.scalars[0] << bitOffsets.scalars[0],
  3148. left.scalars[1] << bitOffsets.scalars[1],
  3149. left.scalars[2] << bitOffsets.scalars[2],
  3150. left.scalars[3] << bitOffsets.scalars[3],
  3151. left.scalars[4] << bitOffsets.scalars[4],
  3152. left.scalars[5] << bitOffsets.scalars[5],
  3153. left.scalars[6] << bitOffsets.scalars[6],
  3154. left.scalars[7] << bitOffsets.scalars[7]
  3155. );
  3156. }
  3157. inline U32x8 operator>>(const U32x8& left, const U32x8 &bitOffsets) {
  3158. assert(allLanesLesser(bitOffsets, U32x8(32u)));
  3159. return U32x8(
  3160. left.scalars[0] >> bitOffsets.scalars[0],
  3161. left.scalars[1] >> bitOffsets.scalars[1],
  3162. left.scalars[2] >> bitOffsets.scalars[2],
  3163. left.scalars[3] >> bitOffsets.scalars[3],
  3164. left.scalars[4] >> bitOffsets.scalars[4],
  3165. left.scalars[5] >> bitOffsets.scalars[5],
  3166. left.scalars[6] >> bitOffsets.scalars[6],
  3167. left.scalars[7] >> bitOffsets.scalars[7]
  3168. );
  3169. }
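// Note: on AVX2 the two operators above could avoid the scalar fallback by using the variable shift
// intrinsics introduced with AVX2. A possible sketch (not wired in here):
//   #if defined USE_AVX2
//       return U32x8(_mm256_sllv_epi32(left.v, bitOffsets.v)); // for operator<<
//       return U32x8(_mm256_srlv_epi32(left.v, bitOffsets.v)); // for operator>>
//   #endif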
  3170. // bitOffset must be an immediate constant from 0 to 31, so a template argument is used.
  3171. template <uint32_t bitOffset>
  3172. inline U32x8 bitShiftLeftImmediate(const U32x8& left) {
  3173. static_assert(bitOffset < 32u);
  3174. #if defined USE_AVX2
  3175. return U32x8(_mm256_slli_epi32(left.v, bitOffset));
  3176. #else
  3177. return U32x8(
  3178. left.scalars[0] << bitOffset,
  3179. left.scalars[1] << bitOffset,
  3180. left.scalars[2] << bitOffset,
  3181. left.scalars[3] << bitOffset,
  3182. left.scalars[4] << bitOffset,
  3183. left.scalars[5] << bitOffset,
  3184. left.scalars[6] << bitOffset,
  3185. left.scalars[7] << bitOffset
  3186. );
  3187. #endif
  3188. }
  3189. // bitOffset must be an immediate constant from 0 to 31, so a template argument is used.
  3190. template <uint32_t bitOffset>
  3191. inline U32x8 bitShiftRightImmediate(const U32x8& left) {
  3192. static_assert(bitOffset < 32u);
  3193. #if defined USE_AVX2
  3194. return U32x8(_mm256_srli_epi32(left.v, bitOffset));
  3195. #else
  3196. return U32x8(
  3197. left.scalars[0] >> bitOffset,
  3198. left.scalars[1] >> bitOffset,
  3199. left.scalars[2] >> bitOffset,
  3200. left.scalars[3] >> bitOffset,
  3201. left.scalars[4] >> bitOffset,
  3202. left.scalars[5] >> bitOffset,
  3203. left.scalars[6] >> bitOffset,
  3204. left.scalars[7] >> bitOffset
  3205. );
  3206. #endif
  3207. }
  3208. inline U16x16 operator+(const U16x16& left, const U16x16& right) {
  3209. #if defined USE_256BIT_X_SIMD
  3210. return U16x16(ADD_U16_SIMD256(left.v, right.v));
  3211. #else
  3212. return U16x16(
  3213. left.scalars[0] + right.scalars[0],
  3214. left.scalars[1] + right.scalars[1],
  3215. left.scalars[2] + right.scalars[2],
  3216. left.scalars[3] + right.scalars[3],
  3217. left.scalars[4] + right.scalars[4],
  3218. left.scalars[5] + right.scalars[5],
  3219. left.scalars[6] + right.scalars[6],
  3220. left.scalars[7] + right.scalars[7],
  3221. left.scalars[8] + right.scalars[8],
  3222. left.scalars[9] + right.scalars[9],
  3223. left.scalars[10] + right.scalars[10],
  3224. left.scalars[11] + right.scalars[11],
  3225. left.scalars[12] + right.scalars[12],
  3226. left.scalars[13] + right.scalars[13],
  3227. left.scalars[14] + right.scalars[14],
  3228. left.scalars[15] + right.scalars[15]
  3229. );
  3230. #endif
  3231. }
  3232. inline U16x16 operator-(const U16x16& left, const U16x16& right) {
  3233. #if defined USE_256BIT_X_SIMD
  3234. return U16x16(SUB_U16_SIMD256(left.v, right.v));
  3235. #else
  3236. return U16x16(
  3237. left.scalars[0] - right.scalars[0],
  3238. left.scalars[1] - right.scalars[1],
  3239. left.scalars[2] - right.scalars[2],
  3240. left.scalars[3] - right.scalars[3],
  3241. left.scalars[4] - right.scalars[4],
  3242. left.scalars[5] - right.scalars[5],
  3243. left.scalars[6] - right.scalars[6],
  3244. left.scalars[7] - right.scalars[7],
  3245. left.scalars[8] - right.scalars[8],
  3246. left.scalars[9] - right.scalars[9],
  3247. left.scalars[10] - right.scalars[10],
  3248. left.scalars[11] - right.scalars[11],
  3249. left.scalars[12] - right.scalars[12],
  3250. left.scalars[13] - right.scalars[13],
  3251. left.scalars[14] - right.scalars[14],
  3252. left.scalars[15] - right.scalars[15]
  3253. );
  3254. #endif
  3255. }
  3256. inline U16x16 operator*(const U16x16& left, const U16x16& right) {
  3257. #if defined USE_256BIT_X_SIMD
  3258. return U16x16(MUL_U16_SIMD256(left.v, right.v));
  3259. #else
  3260. return U16x16(
  3261. left.scalars[0] * right.scalars[0],
  3262. left.scalars[1] * right.scalars[1],
  3263. left.scalars[2] * right.scalars[2],
  3264. left.scalars[3] * right.scalars[3],
  3265. left.scalars[4] * right.scalars[4],
  3266. left.scalars[5] * right.scalars[5],
  3267. left.scalars[6] * right.scalars[6],
  3268. left.scalars[7] * right.scalars[7],
  3269. left.scalars[8] * right.scalars[8],
  3270. left.scalars[9] * right.scalars[9],
  3271. left.scalars[10] * right.scalars[10],
  3272. left.scalars[11] * right.scalars[11],
  3273. left.scalars[12] * right.scalars[12],
  3274. left.scalars[13] * right.scalars[13],
  3275. left.scalars[14] * right.scalars[14],
  3276. left.scalars[15] * right.scalars[15]
  3277. );
  3278. #endif
  3279. }
  3280. inline U8x32 operator+(const U8x32& left, const U8x32& right) {
  3281. #if defined USE_256BIT_X_SIMD
  3282. return U8x32(ADD_U8_SIMD256(left.v, right.v));
  3283. #else
  3284. U8x32 result = U8x32::create_dangerous_uninitialized();
  3285. for (int i = 0; i < 32; i++) {
  3286. result.scalars[i] = left.scalars[i] + right.scalars[i];
  3287. }
  3288. return result;
  3289. #endif
  3290. }
  3291. inline U8x32 operator-(const U8x32& left, const U8x32& right) {
  3292. #if defined USE_256BIT_X_SIMD
  3293. return U8x32(SUB_U8_SIMD256(left.v, right.v));
  3294. #else
  3295. U8x32 result = U8x32::create_dangerous_uninitialized();
  3296. for (int i = 0; i < 32; i++) {
  3297. result.scalars[i] = left.scalars[i] - right.scalars[i];
  3298. }
  3299. return result;
  3300. #endif
  3301. }
  3302. inline U8x32 saturatedAddition(const U8x32& left, const U8x32& right) {
  3303. #if defined USE_256BIT_X_SIMD
  3304. return U8x32(ADD_SAT_U8_SIMD256(left.v, right.v));
  3305. #else
  3306. U8x32 result = U8x32::create_dangerous_uninitialized();
  3307. for (int i = 0; i < 32; i++) {
  3308. result.scalars[i] = impl_limit255((uint32_t)left.scalars[i] + (uint32_t)right.scalars[i]);
  3309. }
  3310. return result;
  3311. #endif
  3312. }
  3313. inline U8x32 saturatedSubtraction(const U8x32& left, const U8x32& right) {
  3314. #if defined USE_256BIT_X_SIMD
  3315. return U8x32(SUB_SAT_U8_SIMD256(left.v, right.v));
  3316. #else
  3317. U8x32 result = U8x32::create_dangerous_uninitialized();
  3318. for (int i = 0; i < 32; i++) {
  3319. result.scalars[i] = impl_limit0((int32_t)left.scalars[i] - (int32_t)right.scalars[i]);
  3320. }
  3321. return result;
  3322. #endif
  3323. }
inline I32x8 truncateToI32(const F32x8& vector) {
#if defined USE_256BIT_X_SIMD
	return I32x8(F32_TO_I32_SIMD256(vector.v));
#else
	return I32x8(
		(int32_t)vector.scalars[0], (int32_t)vector.scalars[1], (int32_t)vector.scalars[2], (int32_t)vector.scalars[3],
		(int32_t)vector.scalars[4], (int32_t)vector.scalars[5], (int32_t)vector.scalars[6], (int32_t)vector.scalars[7]
	);
#endif
}
inline U32x8 truncateToU32(const F32x8& vector) {
#if defined USE_256BIT_X_SIMD
	return U32x8(F32_TO_U32_SIMD256(vector.v));
#else
	return U32x8(
		(uint32_t)vector.scalars[0], (uint32_t)vector.scalars[1], (uint32_t)vector.scalars[2], (uint32_t)vector.scalars[3],
		(uint32_t)vector.scalars[4], (uint32_t)vector.scalars[5], (uint32_t)vector.scalars[6], (uint32_t)vector.scalars[7]
	);
#endif
}
inline F32x8 floatFromI32(const I32x8& vector) {
#if defined USE_256BIT_X_SIMD
	return F32x8(I32_TO_F32_SIMD256(vector.v));
#else
	return F32x8(
		(float)vector.scalars[0], (float)vector.scalars[1], (float)vector.scalars[2], (float)vector.scalars[3],
		(float)vector.scalars[4], (float)vector.scalars[5], (float)vector.scalars[6], (float)vector.scalars[7]
	);
#endif
}
inline F32x8 floatFromU32(const U32x8& vector) {
#if defined USE_256BIT_X_SIMD
	return F32x8(U32_TO_F32_SIMD256(vector.v));
#else
	return F32x8(
		(float)vector.scalars[0], (float)vector.scalars[1], (float)vector.scalars[2], (float)vector.scalars[3],
		(float)vector.scalars[4], (float)vector.scalars[5], (float)vector.scalars[6], (float)vector.scalars[7]
	);
#endif
}
inline I32x8 I32FromU32(const U32x8& vector) {
#if defined USE_256BIT_X_SIMD
	return I32x8(REINTERPRET_U32_TO_I32_SIMD256(vector.v));
#else
	return I32x8(
		(int32_t)vector.scalars[0], (int32_t)vector.scalars[1], (int32_t)vector.scalars[2], (int32_t)vector.scalars[3],
		(int32_t)vector.scalars[4], (int32_t)vector.scalars[5], (int32_t)vector.scalars[6], (int32_t)vector.scalars[7]
	);
#endif
}
inline U32x8 U32FromI32(const I32x8& vector) {
#if defined USE_256BIT_X_SIMD
	return U32x8(REINTERPRET_I32_TO_U32_SIMD256(vector.v));
#else
	return U32x8(
		(uint32_t)vector.scalars[0], (uint32_t)vector.scalars[1], (uint32_t)vector.scalars[2], (uint32_t)vector.scalars[3],
		(uint32_t)vector.scalars[4], (uint32_t)vector.scalars[5], (uint32_t)vector.scalars[6], (uint32_t)vector.scalars[7]
	);
#endif
}
// Warning! Behavior depends on endianness.
inline U8x32 reinterpret_U8FromU32(const U32x8& vector) {
#if defined USE_256BIT_X_SIMD
	return U8x32(REINTERPRET_U32_TO_U8_SIMD256(vector.v));
#else
	const uint8_t *source = (const uint8_t*)vector.scalars;
	U8x32 result = U8x32::create_dangerous_uninitialized();
	for (int i = 0; i < 32; i++) {
		result.scalars[i] = source[i];
	}
	return result;
#endif
}
// Warning! Behavior depends on endianness.
inline U32x8 reinterpret_U32FromU8(const U8x32& vector) {
#if defined USE_256BIT_X_SIMD
	return U32x8(REINTERPRET_U8_TO_U32_SIMD256(vector.v));
#else
	const uint32_t *source = (const uint32_t*)vector.scalars;
	return U32x8(source[0], source[1], source[2], source[3], source[4], source[5], source[6], source[7]);
#endif
}
// Warning! Behavior depends on endianness.
inline U16x16 reinterpret_U16FromU32(const U32x8& vector) {
#if defined USE_256BIT_X_SIMD
	return U16x16(REINTERPRET_U32_TO_U16_SIMD256(vector.v));
#else
	const uint16_t *source = (const uint16_t*)vector.scalars;
	return U16x16(
		source[0], source[1], source[2] , source[3] , source[4] , source[5] , source[6] , source[7] ,
		source[8], source[9], source[10], source[11], source[12], source[13], source[14], source[15]
	);
#endif
}
// Warning! Behavior depends on endianness.
inline U32x8 reinterpret_U32FromU16(const U16x16& vector) {
#if defined USE_256BIT_X_SIMD
	return U32x8(REINTERPRET_U16_TO_U32_SIMD256(vector.v));
#else
	const uint32_t *source = (const uint32_t*)vector.scalars;
	return U32x8(source[0], source[1], source[2], source[3], source[4], source[5], source[6], source[7]);
#endif
}
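// Illustrative example of the endianness dependency mentioned above: a U32x8 lane holding 0x0A0B0C0D
// reinterprets into the bytes {0x0D, 0x0C, 0x0B, 0x0A} on a little-endian machine, but {0x0A, 0x0B, 0x0C, 0x0D}
// on a big-endian machine, so these casts should only be used where the byte order has been accounted for.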
// Unpacking to larger integers
inline U32x8 lowerToU32(const U16x16& vector) {
#if defined USE_256BIT_X_SIMD
	return U32x8(U16_LOW_TO_U32_SIMD256(vector.v));
#else
	return U32x8(vector.scalars[0], vector.scalars[1], vector.scalars[2], vector.scalars[3], vector.scalars[4], vector.scalars[5], vector.scalars[6], vector.scalars[7]);
#endif
}
inline U32x8 higherToU32(const U16x16& vector) {
#if defined USE_256BIT_X_SIMD
	return U32x8(U16_HIGH_TO_U32_SIMD256(vector.v));
#else
	return U32x8(vector.scalars[8], vector.scalars[9], vector.scalars[10], vector.scalars[11], vector.scalars[12], vector.scalars[13], vector.scalars[14], vector.scalars[15]);
#endif
}
inline U16x16 lowerToU16(const U8x32& vector) {
#if defined USE_256BIT_X_SIMD
	return U16x16(U8_LOW_TO_U16_SIMD256(vector.v));
#else
	return U16x16(
		vector.scalars[0], vector.scalars[1], vector.scalars[2], vector.scalars[3],
		vector.scalars[4], vector.scalars[5], vector.scalars[6], vector.scalars[7],
		vector.scalars[8], vector.scalars[9], vector.scalars[10], vector.scalars[11],
		vector.scalars[12], vector.scalars[13], vector.scalars[14], vector.scalars[15]
	);
#endif
}
inline U16x16 higherToU16(const U8x32& vector) {
#if defined USE_256BIT_X_SIMD
	return U16x16(U8_HIGH_TO_U16_SIMD256(vector.v));
#else
	return U16x16(
		vector.scalars[16], vector.scalars[17], vector.scalars[18], vector.scalars[19],
		vector.scalars[20], vector.scalars[21], vector.scalars[22], vector.scalars[23],
		vector.scalars[24], vector.scalars[25], vector.scalars[26], vector.scalars[27],
		vector.scalars[28], vector.scalars[29], vector.scalars[30], vector.scalars[31]
	);
#endif
}
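// Illustrative example of the unpacking above: for a U16x16 vector with scalars {0, 1, ..., 15},
// lowerToU32 returns the U32x8 vector {0, ..., 7} and higherToU32 returns {8, ..., 15}, both zero-extended to 32 bits.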
// Saturated packing
inline U8x32 saturateToU8(const U16x16& lower, const U16x16& upper) {
#if defined USE_256BIT_X_SIMD
	return U8x32(PACK_SAT_U16_TO_U8_SIMD256(lower.v, upper.v));
#else
	U8x32 result = U8x32::create_dangerous_uninitialized();
	for (int i = 0; i < 16; i++) {
		result.scalars[i] = impl_limit255(lower.scalars[i]);
	}
	for (int i = 0; i < 16; i++) {
		result.scalars[i + 16] = impl_limit255(upper.scalars[i]);
	}
	return result;
#endif
}
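// Illustrative example of the saturated packing above: each 16-bit value is clamped into the 0..255 range,
// so lanes holding {0, 255, 256, 1000, ...} in lower become the bytes {0, 255, 255, 255, ...} in the first half
// of the result, and the upper vector fills the second half the same way.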
// Unary negation for convenience and code readability.
// Before using unary negation, always check whether:
// * The addition can be turned into a subtraction instead:
//     x = -a + b
//     x = b - a
// * A multiplying constant or scalar can be negated instead:
//     x = -b * 2
//     x = b * -2
inline F32x8 operator-(const F32x8& value) {
#if defined USE_256BIT_F_SIMD
	return F32x8(0.0f) - value;
#else
	return F32x8(
		-value.scalars[0], -value.scalars[1], -value.scalars[2], -value.scalars[3],
		-value.scalars[4], -value.scalars[5], -value.scalars[6], -value.scalars[7]
	);
#endif
}
inline I32x8 operator-(const I32x8& value) {
#if defined USE_256BIT_X_SIMD
	return I32x8(0) - value;
#else
	return I32x8(
		-value.scalars[0], -value.scalars[1], -value.scalars[2], -value.scalars[3],
		-value.scalars[4], -value.scalars[5], -value.scalars[6], -value.scalars[7]
	);
#endif
}
// Helper macros for generating the vector extract functions.
// Having one function for each type and offset makes sure that the compiler gets an immediate integer within the valid range.
#if defined USE_AVX2
// AVX2 does not offer any 256-bit element extraction, only two 128-bit shifts done in parallel, so we might as well use two separate 128-bit extractions.
template <int OFFSET>
__m256i impl_extractBytes_AVX2(const __m256i leftInput, const __m256i rightInput) {
	// Extract three halves depending on which ones overlap with the offset.
	__m128i leftPart = _mm256_extractf128_si256(leftInput, (OFFSET < 16) ? 0 : 1);
	__m128i middlePart = _mm256_extractf128_si256(OFFSET < 16 ? rightInput : leftInput, (OFFSET < 16) ? 1 : 0);
	__m128i rightPart = _mm256_extractf128_si256(rightInput, (OFFSET < 16) ? 0 : 1);
	// Combine two 128-bit extracts into a whole 256-bit extract.
	return _mm256_set_m128i(
		_mm_alignr_epi8(leftPart, middlePart, OFFSET - ((OFFSET < 16) ? 0 : 16)),
		_mm_alignr_epi8(middlePart, rightPart, OFFSET - ((OFFSET < 16) ? 0 : 16))
	);
}
#define VECTOR_EXTRACT_GENERATOR_256_U8(OFFSET) return U8x32(impl_extractBytes_AVX2<OFFSET>(b.v, a.v));
#define VECTOR_EXTRACT_GENERATOR_256_U16(OFFSET) return U16x16(impl_extractBytes_AVX2<OFFSET * 2>(b.v, a.v));
#define VECTOR_EXTRACT_GENERATOR_256_U32(OFFSET) return U32x8(impl_extractBytes_AVX2<OFFSET * 4>(b.v, a.v));
#define VECTOR_EXTRACT_GENERATOR_256_I32(OFFSET) return I32x8(impl_extractBytes_AVX2<OFFSET * 4>(b.v, a.v));
#define VECTOR_EXTRACT_GENERATOR_256_F32(OFFSET) return F32x8(SIMD_F32x8(impl_extractBytes_AVX2<OFFSET * 4>(SIMD_U32x8(b.v), SIMD_U32x8(a.v))));
#else
template<typename T, int elementCount>
T vectorExtract_emulated(const T &a, const T &b, int offset) {
	// For safety reasons, uninitialized default construction of SIMD vectors is only available during full emulation, not partial AVX support.
	T result = T::create_dangerous_uninitialized();
	int t = 0;
	for (int s = offset; s < elementCount; s++) {
		result.scalars[t] = a.scalars[s];
		t++;
	}
	for (int s = 0; s < offset; s++) {
		result.scalars[t] = b.scalars[s];
		t++;
	}
	return result;
}
#define VECTOR_EXTRACT_GENERATOR_256_U8(OFFSET) return vectorExtract_emulated<U8x32, 32>(a, b, OFFSET);
#define VECTOR_EXTRACT_GENERATOR_256_U16(OFFSET) return vectorExtract_emulated<U16x16, 16>(a, b, OFFSET);
#define VECTOR_EXTRACT_GENERATOR_256_U32(OFFSET) return vectorExtract_emulated<U32x8, 8>(a, b, OFFSET);
#define VECTOR_EXTRACT_GENERATOR_256_I32(OFFSET) return vectorExtract_emulated<I32x8, 8>(a, b, OFFSET);
#define VECTOR_EXTRACT_GENERATOR_256_F32(OFFSET) return vectorExtract_emulated<F32x8, 8>(a, b, OFFSET);
#endif
// Vector extraction concatenates two input vectors and reads a vector between them using an element offset.
// The first and last offsets, which simply return one of the inputs, can still be used for readability, because they will be inlined and removed by the compiler.
// To get elements from the right side, combine the center vector with the right vector and shift one element to the left using vectorExtract_1 for the given type.
// To get elements from the left side, combine the left vector with the center vector and shift one element to the right using vectorExtract_15 for 16 lanes, vectorExtract_7 for 8 lanes, or vectorExtract_3 for 4 lanes.
// A usage sketch follows after the vectorExtract definitions below.
// TODO: Also allow using a template argument as the element offset with a static assert for the offset, which might be useful in template programming.
U8x32 inline vectorExtract_0(const U8x32 &a, const U8x32 &b) { return a; }
U8x32 inline vectorExtract_1(const U8x32 &a, const U8x32 &b) { VECTOR_EXTRACT_GENERATOR_256_U8(1) }
U8x32 inline vectorExtract_2(const U8x32 &a, const U8x32 &b) { VECTOR_EXTRACT_GENERATOR_256_U8(2) }
U8x32 inline vectorExtract_3(const U8x32 &a, const U8x32 &b) { VECTOR_EXTRACT_GENERATOR_256_U8(3) }
U8x32 inline vectorExtract_4(const U8x32 &a, const U8x32 &b) { VECTOR_EXTRACT_GENERATOR_256_U8(4) }
U8x32 inline vectorExtract_5(const U8x32 &a, const U8x32 &b) { VECTOR_EXTRACT_GENERATOR_256_U8(5) }
U8x32 inline vectorExtract_6(const U8x32 &a, const U8x32 &b) { VECTOR_EXTRACT_GENERATOR_256_U8(6) }
U8x32 inline vectorExtract_7(const U8x32 &a, const U8x32 &b) { VECTOR_EXTRACT_GENERATOR_256_U8(7) }
U8x32 inline vectorExtract_8(const U8x32 &a, const U8x32 &b) { VECTOR_EXTRACT_GENERATOR_256_U8(8) }
U8x32 inline vectorExtract_9(const U8x32 &a, const U8x32 &b) { VECTOR_EXTRACT_GENERATOR_256_U8(9) }
U8x32 inline vectorExtract_10(const U8x32 &a, const U8x32 &b) { VECTOR_EXTRACT_GENERATOR_256_U8(10) }
U8x32 inline vectorExtract_11(const U8x32 &a, const U8x32 &b) { VECTOR_EXTRACT_GENERATOR_256_U8(11) }
U8x32 inline vectorExtract_12(const U8x32 &a, const U8x32 &b) { VECTOR_EXTRACT_GENERATOR_256_U8(12) }
U8x32 inline vectorExtract_13(const U8x32 &a, const U8x32 &b) { VECTOR_EXTRACT_GENERATOR_256_U8(13) }
U8x32 inline vectorExtract_14(const U8x32 &a, const U8x32 &b) { VECTOR_EXTRACT_GENERATOR_256_U8(14) }
U8x32 inline vectorExtract_15(const U8x32 &a, const U8x32 &b) { VECTOR_EXTRACT_GENERATOR_256_U8(15) }
U8x32 inline vectorExtract_16(const U8x32 &a, const U8x32 &b) { VECTOR_EXTRACT_GENERATOR_256_U8(16) }
U8x32 inline vectorExtract_17(const U8x32 &a, const U8x32 &b) { VECTOR_EXTRACT_GENERATOR_256_U8(17) }
U8x32 inline vectorExtract_18(const U8x32 &a, const U8x32 &b) { VECTOR_EXTRACT_GENERATOR_256_U8(18) }
U8x32 inline vectorExtract_19(const U8x32 &a, const U8x32 &b) { VECTOR_EXTRACT_GENERATOR_256_U8(19) }
U8x32 inline vectorExtract_20(const U8x32 &a, const U8x32 &b) { VECTOR_EXTRACT_GENERATOR_256_U8(20) }
U8x32 inline vectorExtract_21(const U8x32 &a, const U8x32 &b) { VECTOR_EXTRACT_GENERATOR_256_U8(21) }
U8x32 inline vectorExtract_22(const U8x32 &a, const U8x32 &b) { VECTOR_EXTRACT_GENERATOR_256_U8(22) }
U8x32 inline vectorExtract_23(const U8x32 &a, const U8x32 &b) { VECTOR_EXTRACT_GENERATOR_256_U8(23) }
U8x32 inline vectorExtract_24(const U8x32 &a, const U8x32 &b) { VECTOR_EXTRACT_GENERATOR_256_U8(24) }
U8x32 inline vectorExtract_25(const U8x32 &a, const U8x32 &b) { VECTOR_EXTRACT_GENERATOR_256_U8(25) }
U8x32 inline vectorExtract_26(const U8x32 &a, const U8x32 &b) { VECTOR_EXTRACT_GENERATOR_256_U8(26) }
U8x32 inline vectorExtract_27(const U8x32 &a, const U8x32 &b) { VECTOR_EXTRACT_GENERATOR_256_U8(27) }
U8x32 inline vectorExtract_28(const U8x32 &a, const U8x32 &b) { VECTOR_EXTRACT_GENERATOR_256_U8(28) }
U8x32 inline vectorExtract_29(const U8x32 &a, const U8x32 &b) { VECTOR_EXTRACT_GENERATOR_256_U8(29) }
U8x32 inline vectorExtract_30(const U8x32 &a, const U8x32 &b) { VECTOR_EXTRACT_GENERATOR_256_U8(30) }
U8x32 inline vectorExtract_31(const U8x32 &a, const U8x32 &b) { VECTOR_EXTRACT_GENERATOR_256_U8(31) }
U8x32 inline vectorExtract_32(const U8x32 &a, const U8x32 &b) { return b; }
U16x16 inline vectorExtract_0(const U16x16 &a, const U16x16 &b) { return a; }
U16x16 inline vectorExtract_1(const U16x16 &a, const U16x16 &b) { VECTOR_EXTRACT_GENERATOR_256_U16(1) }
U16x16 inline vectorExtract_2(const U16x16 &a, const U16x16 &b) { VECTOR_EXTRACT_GENERATOR_256_U16(2) }
U16x16 inline vectorExtract_3(const U16x16 &a, const U16x16 &b) { VECTOR_EXTRACT_GENERATOR_256_U16(3) }
U16x16 inline vectorExtract_4(const U16x16 &a, const U16x16 &b) { VECTOR_EXTRACT_GENERATOR_256_U16(4) }
U16x16 inline vectorExtract_5(const U16x16 &a, const U16x16 &b) { VECTOR_EXTRACT_GENERATOR_256_U16(5) }
U16x16 inline vectorExtract_6(const U16x16 &a, const U16x16 &b) { VECTOR_EXTRACT_GENERATOR_256_U16(6) }
U16x16 inline vectorExtract_7(const U16x16 &a, const U16x16 &b) { VECTOR_EXTRACT_GENERATOR_256_U16(7) }
U16x16 inline vectorExtract_8(const U16x16 &a, const U16x16 &b) { VECTOR_EXTRACT_GENERATOR_256_U16(8) }
U16x16 inline vectorExtract_9(const U16x16 &a, const U16x16 &b) { VECTOR_EXTRACT_GENERATOR_256_U16(9) }
U16x16 inline vectorExtract_10(const U16x16 &a, const U16x16 &b) { VECTOR_EXTRACT_GENERATOR_256_U16(10) }
U16x16 inline vectorExtract_11(const U16x16 &a, const U16x16 &b) { VECTOR_EXTRACT_GENERATOR_256_U16(11) }
U16x16 inline vectorExtract_12(const U16x16 &a, const U16x16 &b) { VECTOR_EXTRACT_GENERATOR_256_U16(12) }
U16x16 inline vectorExtract_13(const U16x16 &a, const U16x16 &b) { VECTOR_EXTRACT_GENERATOR_256_U16(13) }
U16x16 inline vectorExtract_14(const U16x16 &a, const U16x16 &b) { VECTOR_EXTRACT_GENERATOR_256_U16(14) }
U16x16 inline vectorExtract_15(const U16x16 &a, const U16x16 &b) { VECTOR_EXTRACT_GENERATOR_256_U16(15) }
U16x16 inline vectorExtract_16(const U16x16 &a, const U16x16 &b) { return b; }
U32x8 inline vectorExtract_0(const U32x8 &a, const U32x8 &b) { return a; }
U32x8 inline vectorExtract_1(const U32x8 &a, const U32x8 &b) { VECTOR_EXTRACT_GENERATOR_256_U32(1) }
U32x8 inline vectorExtract_2(const U32x8 &a, const U32x8 &b) { VECTOR_EXTRACT_GENERATOR_256_U32(2) }
U32x8 inline vectorExtract_3(const U32x8 &a, const U32x8 &b) { VECTOR_EXTRACT_GENERATOR_256_U32(3) }
U32x8 inline vectorExtract_4(const U32x8 &a, const U32x8 &b) { VECTOR_EXTRACT_GENERATOR_256_U32(4) }
U32x8 inline vectorExtract_5(const U32x8 &a, const U32x8 &b) { VECTOR_EXTRACT_GENERATOR_256_U32(5) }
U32x8 inline vectorExtract_6(const U32x8 &a, const U32x8 &b) { VECTOR_EXTRACT_GENERATOR_256_U32(6) }
U32x8 inline vectorExtract_7(const U32x8 &a, const U32x8 &b) { VECTOR_EXTRACT_GENERATOR_256_U32(7) }
U32x8 inline vectorExtract_8(const U32x8 &a, const U32x8 &b) { return b; }
I32x8 inline vectorExtract_0(const I32x8 &a, const I32x8 &b) { return a; }
I32x8 inline vectorExtract_1(const I32x8 &a, const I32x8 &b) { VECTOR_EXTRACT_GENERATOR_256_I32(1) }
I32x8 inline vectorExtract_2(const I32x8 &a, const I32x8 &b) { VECTOR_EXTRACT_GENERATOR_256_I32(2) }
I32x8 inline vectorExtract_3(const I32x8 &a, const I32x8 &b) { VECTOR_EXTRACT_GENERATOR_256_I32(3) }
I32x8 inline vectorExtract_4(const I32x8 &a, const I32x8 &b) { VECTOR_EXTRACT_GENERATOR_256_I32(4) }
I32x8 inline vectorExtract_5(const I32x8 &a, const I32x8 &b) { VECTOR_EXTRACT_GENERATOR_256_I32(5) }
I32x8 inline vectorExtract_6(const I32x8 &a, const I32x8 &b) { VECTOR_EXTRACT_GENERATOR_256_I32(6) }
I32x8 inline vectorExtract_7(const I32x8 &a, const I32x8 &b) { VECTOR_EXTRACT_GENERATOR_256_I32(7) }
I32x8 inline vectorExtract_8(const I32x8 &a, const I32x8 &b) { return b; }
F32x8 inline vectorExtract_0(const F32x8 &a, const F32x8 &b) { return a; }
F32x8 inline vectorExtract_1(const F32x8 &a, const F32x8 &b) { VECTOR_EXTRACT_GENERATOR_256_F32(1) }
F32x8 inline vectorExtract_2(const F32x8 &a, const F32x8 &b) { VECTOR_EXTRACT_GENERATOR_256_F32(2) }
F32x8 inline vectorExtract_3(const F32x8 &a, const F32x8 &b) { VECTOR_EXTRACT_GENERATOR_256_F32(3) }
F32x8 inline vectorExtract_4(const F32x8 &a, const F32x8 &b) { VECTOR_EXTRACT_GENERATOR_256_F32(4) }
F32x8 inline vectorExtract_5(const F32x8 &a, const F32x8 &b) { VECTOR_EXTRACT_GENERATOR_256_F32(5) }
F32x8 inline vectorExtract_6(const F32x8 &a, const F32x8 &b) { VECTOR_EXTRACT_GENERATOR_256_F32(6) }
F32x8 inline vectorExtract_7(const F32x8 &a, const F32x8 &b) { VECTOR_EXTRACT_GENERATOR_256_F32(7) }
F32x8 inline vectorExtract_8(const F32x8 &a, const F32x8 &b) { return b; }
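// Usage sketch for the vector extraction above (illustrative only; example_nextByteWindow is a
// hypothetical helper, not part of the library's API): reading one element ahead by concatenating
// the center block with its right neighbor, as typically done when sampling neighbors in a filter.
inline U8x32 example_nextByteWindow(const U8x32 &center, const U8x32 &rightNeighbor) {
	// Takes elements 1..31 from center followed by element 0 from rightNeighbor.
	return vectorExtract_1(center, rightNeighbor);
}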
// Gather instructions load memory from a pointer at multiple index offsets at the same time.
// The given pointers should be aligned with 4 bytes, so that the fallback solution works on machines with strict alignment requirements.
#if defined USE_AVX2
#define GATHER_I32x8_AVX2(SOURCE, EIGHT_OFFSETS, SCALE) _mm256_i32gather_epi32((const int32_t*)(SOURCE), EIGHT_OFFSETS, SCALE)
#define GATHER_U32x8_AVX2(SOURCE, EIGHT_OFFSETS, SCALE) _mm256_i32gather_epi32((const int32_t*)(SOURCE), EIGHT_OFFSETS, SCALE)
#define GATHER_F32x8_AVX2(SOURCE, EIGHT_OFFSETS, SCALE) _mm256_i32gather_ps((const float*)(SOURCE), EIGHT_OFFSETS, SCALE)
#endif
static inline U32x8 gather_U32(dsr::SafePointer<const uint32_t> data, const U32x8 &elementOffset) {
#if defined USE_AVX2
	// TODO: Implement safety checks for debug mode.
	return U32x8(GATHER_U32x8_AVX2(data.getUnsafe(), elementOffset.v, 4));
#else
	ALIGN32 uint32_t elementOffsets[8];
	elementOffset.writeAlignedUnsafe(elementOffsets);
	return U32x8(
		*(data + elementOffsets[0]),
		*(data + elementOffsets[1]),
		*(data + elementOffsets[2]),
		*(data + elementOffsets[3]),
		*(data + elementOffsets[4]),
		*(data + elementOffsets[5]),
		*(data + elementOffsets[6]),
		*(data + elementOffsets[7])
	);
#endif
}
static inline I32x8 gather_I32(dsr::SafePointer<const int32_t> data, const U32x8 &elementOffset) {
#if defined USE_AVX2
	// TODO: Implement safety checks for debug mode.
	return I32x8(GATHER_I32x8_AVX2(data.getUnsafe(), elementOffset.v, 4));
#else
	ALIGN32 uint32_t elementOffsets[8];
	elementOffset.writeAlignedUnsafe(elementOffsets);
	return I32x8(
		*(data + elementOffsets[0]),
		*(data + elementOffsets[1]),
		*(data + elementOffsets[2]),
		*(data + elementOffsets[3]),
		*(data + elementOffsets[4]),
		*(data + elementOffsets[5]),
		*(data + elementOffsets[6]),
		*(data + elementOffsets[7])
	);
#endif
}
static inline F32x8 gather_F32(dsr::SafePointer<const float> data, const U32x8 &elementOffset) {
#if defined USE_AVX2
	// TODO: Implement safety checks for debug mode.
	return F32x8(GATHER_F32x8_AVX2(data.getUnsafe(), elementOffset.v, 4));
#else
	ALIGN32 uint32_t elementOffsets[8];
	elementOffset.writeAlignedUnsafe(elementOffsets);
	return F32x8(
		*(data + elementOffsets[0]),
		*(data + elementOffsets[1]),
		*(data + elementOffsets[2]),
		*(data + elementOffsets[3]),
		*(data + elementOffsets[4]),
		*(data + elementOffsets[5]),
		*(data + elementOffsets[6]),
		*(data + elementOffsets[7])
	);
#endif
}
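// Usage sketch for the gather functions above (illustrative only; example_lookupTable is a
// hypothetical helper, not part of the library's API): fetching eight floats from a lookup table
// using one element offset per lane.
inline F32x8 example_lookupTable(dsr::SafePointer<const float> table, const U32x8 &indices) {
	// Lane i of the result becomes table[indices[i]], using one gather on AVX2 or one scalar read per lane otherwise.
	return gather_F32(table, indices);
}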
// Wrapper functions for explicitly expanding scalars into vectors during math operations.
#define NUMERICAL_SCALAR_OPERATIONS(VECTOR_TYPE, ELEMENT_TYPE, LANE_COUNT) \
	inline VECTOR_TYPE operator+(const VECTOR_TYPE& left, ELEMENT_TYPE right) { return left + VECTOR_TYPE(right); } \
	inline VECTOR_TYPE operator+(ELEMENT_TYPE left, const VECTOR_TYPE& right) { return VECTOR_TYPE(left) + right; } \
	inline VECTOR_TYPE operator-(const VECTOR_TYPE& left, ELEMENT_TYPE right) { return left - VECTOR_TYPE(right); } \
	inline VECTOR_TYPE operator-(ELEMENT_TYPE left, const VECTOR_TYPE& right) { return VECTOR_TYPE(left) - right; }
FOR_ALL_VECTOR_TYPES(NUMERICAL_SCALAR_OPERATIONS)
#undef NUMERICAL_SCALAR_OPERATIONS
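// Illustrative example of the scalar wrappers above (example_offsetByHalf is a hypothetical helper, not part of the API):
inline F32x8 example_offsetByHalf(const F32x8 &position) {
	// The scalar is expanded into a full vector, so this is the same as position + F32x8(0.5f).
	return position + 0.5f;
}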
#define MULTIPLY_SCALAR_OPERATIONS(VECTOR_TYPE, ELEMENT_TYPE, LANE_COUNT) \
	inline VECTOR_TYPE operator*(const VECTOR_TYPE& left, ELEMENT_TYPE right) { return left * VECTOR_TYPE(right); } \
	inline VECTOR_TYPE operator*(ELEMENT_TYPE left, const VECTOR_TYPE& right) { return VECTOR_TYPE(left) * right; }
// TODO: Implement multiplication for U8x16 and U8x32.
//FOR_ALL_VECTOR_TYPES(MULTIPLY_SCALAR_OPERATIONS)
MULTIPLY_SCALAR_OPERATIONS(F32x4, float, 4)
MULTIPLY_SCALAR_OPERATIONS(F32x8, float, 8)
MULTIPLY_SCALAR_OPERATIONS(U32x4, uint32_t, 4)
MULTIPLY_SCALAR_OPERATIONS(U32x8, uint32_t, 8)
MULTIPLY_SCALAR_OPERATIONS(I32x4, int32_t, 4)
MULTIPLY_SCALAR_OPERATIONS(I32x8, int32_t, 8)
MULTIPLY_SCALAR_OPERATIONS(U16x8, uint16_t, 8)
MULTIPLY_SCALAR_OPERATIONS(U16x16, uint16_t, 16)
#undef MULTIPLY_SCALAR_OPERATIONS
// Wrapper functions for explicitly duplicating bit masks into the same lane count.
#define BITWISE_SCALAR_OPERATIONS(VECTOR_TYPE, ELEMENT_TYPE, LANE_COUNT) \
	inline VECTOR_TYPE operator&(const VECTOR_TYPE& left, ELEMENT_TYPE right) { return left & VECTOR_TYPE(right); } \
	inline VECTOR_TYPE operator&(ELEMENT_TYPE left, const VECTOR_TYPE& right) { return VECTOR_TYPE(left) & right; } \
	inline VECTOR_TYPE operator|(const VECTOR_TYPE& left, ELEMENT_TYPE right) { return left | VECTOR_TYPE(right); } \
	inline VECTOR_TYPE operator|(ELEMENT_TYPE left, const VECTOR_TYPE& right) { return VECTOR_TYPE(left) | right; } \
	inline VECTOR_TYPE operator^(const VECTOR_TYPE& left, ELEMENT_TYPE right) { return left ^ VECTOR_TYPE(right); } \
	inline VECTOR_TYPE operator^(ELEMENT_TYPE left, const VECTOR_TYPE& right) { return VECTOR_TYPE(left) ^ right; }
// TODO: Implement bitwise operations for all unsigned SIMD vectors.
//FOR_UNSIGNED_VECTOR_TYPES(BITWISE_SCALAR_OPERATIONS)
BITWISE_SCALAR_OPERATIONS(U32x4, uint32_t, 4)
BITWISE_SCALAR_OPERATIONS(U32x8, uint32_t, 8)
#undef BITWISE_SCALAR_OPERATIONS
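// Illustrative example of the bitwise scalar wrappers above (example_lowByteMask is a hypothetical helper, not part of the API):
inline U32x8 example_lowByteMask(const U32x8 &packedColor) {
	// The scalar mask is expanded into a full vector, so this is the same as packedColor & U32x8(255u).
	return packedColor & uint32_t(255);
}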
// Cleaning up temporary macro definitions to avoid cluttering the namespace.
#undef FOR_ALL_VECTOR_TYPES
#undef FOR_FLOAT_VECTOR_TYPES
#undef FOR_INTEGER_VECTOR_TYPES
#undef FOR_SIGNED_VECTOR_TYPES
#undef FOR_UNSIGNED_VECTOR_TYPES
// TODO: Let SVE define completely separate types for dynamic vectors.
// The X vectors use the longest SIMD length that is efficient for both floating-point and integer types.
// DSR_DEFAULT_ALIGNMENT
//   The number of bytes memory should be aligned with by default when creating buffers and images.
// F32xX
//   The longest available SIMD vector for storing 32-bit float values. Iterating laneCountX_32Bit floats at a time.
// I32xX
//   The longest available SIMD vector for storing signed 32-bit integer values. Iterating laneCountX_32Bit integers at a time.
// U32xX
//   The longest available SIMD vector for storing unsigned 32-bit integer values. Iterating laneCountX_32Bit integers at a time.
// U16xX
//   The longest available SIMD vector for storing unsigned 16-bit integer values. Iterating laneCountX_16Bit integers at a time.
// U8xX
//   The longest available SIMD vector for storing unsigned 8-bit integer values. Iterating laneCountX_8Bit integers at a time.
#if defined USE_256BIT_X_SIMD || defined EMULATE_256BIT_X_SIMD
// Using 256-bit SIMD.
#define DSR_DEFAULT_VECTOR_SIZE 32
#define DSR_DEFAULT_ALIGNMENT 32
using F32xX = F32x8;
using I32xX = I32x8;
using U32xX = U32x8;
using U16xX = U16x16;
using U8xX = U8x32;
// Align memory with 256 bits to allow overwriting padding at the end of each pixel row.
// Otherwise you would have to preserve data at the end of each row with slow and bloated duplicated code in every filter.
#else
// If there is no hardware support for 256-bit vectors, explicitly used 256-bit vectors are emulated and may be aligned with just 128 bits.
#define DSR_DEFAULT_VECTOR_SIZE 16
#define DSR_DEFAULT_ALIGNMENT 16
using F32xX = F32x4;
using I32xX = I32x4;
using U32xX = U32x4;
using U16xX = U16x8;
using U8xX = U8x16;
#endif
// How many lanes the longest available vector has for a given lane size.
// Used to iterate indices and pointers using whole elements.
static const int laneCountX_32Bit = DSR_DEFAULT_VECTOR_SIZE / 4;
static const int laneCountX_16Bit = DSR_DEFAULT_VECTOR_SIZE / 2;
static const int laneCountX_8Bit = DSR_DEFAULT_VECTOR_SIZE;
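// Illustrative example of iterating with the X vectors: with 256-bit SIMD, a row of 1024 floats is
// processed in 1024 / laneCountX_32Bit = 128 iterations of F32xX, advancing the pointer by
// laneCountX_32Bit floats (DSR_DEFAULT_VECTOR_SIZE bytes) per step.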
// TODO: Let SVE define completely separate types for dynamic vectors.
// The F vectors use the longest SIMD length that is efficient when only processing float values, even if no integer types are available in the same size.
// Used when you know that your algorithm is only going to work with float types and you need the extra performance.
// Some processors have AVX but not AVX2, meaning that they have 256-bit SIMD for floats but only 128-bit SIMD for integers.
// F32xF
//   The longest available SIMD vector for storing 32-bit float values. Iterating laneCountF floats at a time.
#if defined USE_256BIT_F_SIMD || defined EMULATE_256BIT_F_SIMD
#define DSR_FLOAT_VECTOR_SIZE 32
#define DSR_FLOAT_ALIGNMENT 32
using F32xF = F32x8;
#else
// F vectors are 128 bits.
#define DSR_FLOAT_VECTOR_SIZE 16
#define DSR_FLOAT_ALIGNMENT 16
using F32xF = F32x4;
#endif
// Used to iterate over float pointers when using F32xF.
static const int laneCountF = DSR_FLOAT_VECTOR_SIZE / 4;
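// Illustrative sketch of a float-only operation using the F vector (example_scaleAndBias is a
// hypothetical helper, not part of the library's API). On processors with AVX but not AVX2 this
// still runs with 256-bit lanes, while the X vectors above fall back to 128 bits.
inline F32xF example_scaleAndBias(const F32xF &value) {
	// Scalar constants are expanded into full vectors by the wrappers defined earlier.
	return value * 2.0f + 0.5f;
}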
// Define traits.
DSR_APPLY_PROPERTY(DsrTrait_Any_U8 , U8x16)
DSR_APPLY_PROPERTY(DsrTrait_Any_U8 , U8x32)
DSR_APPLY_PROPERTY(DsrTrait_Any_U16, U16x8)
DSR_APPLY_PROPERTY(DsrTrait_Any_U16, U16x16)
DSR_APPLY_PROPERTY(DsrTrait_Any_U32, U32x4)
DSR_APPLY_PROPERTY(DsrTrait_Any_U32, U32x8)
DSR_APPLY_PROPERTY(DsrTrait_Any_I32, I32x4)
DSR_APPLY_PROPERTY(DsrTrait_Any_I32, I32x8)
DSR_APPLY_PROPERTY(DsrTrait_Any_F32, F32x4)
DSR_APPLY_PROPERTY(DsrTrait_Any_F32, F32x8)
// TODO: Use as independent types when the largest vector lengths are not known at compile time on ARM SVE.
//DSR_APPLY_PROPERTY(DsrTrait_Any_U8 , U8xX)
//DSR_APPLY_PROPERTY(DsrTrait_Any_U16, U16xX)
//DSR_APPLY_PROPERTY(DsrTrait_Any_U32, U32xX)
//DSR_APPLY_PROPERTY(DsrTrait_Any_I32, I32xX)
//DSR_APPLY_PROPERTY(DsrTrait_Any_F32, F32xX)
//DSR_APPLY_PROPERTY(DsrTrait_Any_F32, F32xF)
}
#endif