simd.h
  1. // zlib open source license
  2. //
  3. // Copyright (c) 2017 to 2023 David Forsgren Piuva
  4. //
  5. // This software is provided 'as-is', without any express or implied
  6. // warranty. In no event will the authors be held liable for any damages
  7. // arising from the use of this software.
  8. //
  9. // Permission is granted to anyone to use this software for any purpose,
  10. // including commercial applications, and to alter it and redistribute it
  11. // freely, subject to the following restrictions:
  12. //
  13. // 1. The origin of this software must not be misrepresented; you must not
  14. // claim that you wrote the original software. If you use this software
  15. // in a product, an acknowledgment in the product documentation would be
  16. // appreciated but is not required.
  17. //
  18. // 2. Altered source versions must be plainly marked as such, and must not be
  19. // misrepresented as being the original software.
  20. //
  21. // 3. This notice may not be removed or altered from any source
  22. // distribution.
  23. // Hardware abstraction layer for portable SIMD math.
  24. // Used to make calculations faster without having to write hardware-specific assembly code or intrinsic functions.
  25. // You get the performance you need today and the ability to compile with automatic scalar emulation when building for a future processor that this module has not yet been ported to.
  26. // When you can generate the vectorized code using the same template function as your non-vectorized code, you don't even need to write a reference implementation.
  27. // Using the 128-bit SIMD types: (beginner friendly, test once, compile anywhere, no compiler flags)
  28. // If you are new to vectorization or only plan to work with 128-bit extensions such as ARM NEON, you can keep it simple by only using the 128-bit vector types.
  29. // Pros and cons:
  30. // + Most target platforms (excluding older systems such as ARMv6) have 128-bit SIMD extensions such as Intel SSE2 or ARM NEON enabled by default.
  31. // ARMv6 does not support ARM NEON, but most ARMv7 processors do, so compilers usually enable NEON by default for ARMv7.
  32. // All 64-bit ARM processors have ARM NEON, because it stopped being optional in ARMv8.
  33. // Builds for 64-bit Intel processors usually have SSE2 enabled by default, so you don't have to change any compiler flags when building on a different system.
  34. // + One build for all computers of the same instruction set.
  35. // Great when your application is not so resource heavy, because the least powerful systems don't have the fancy extensions anyway.
  36. // - You might end up enabling the additional SIMD extensions anyway, because the library already uses them to become faster.
  37. // Types:
  38. // * Use F32x4, I32x4 and U32x4 for 4 elements at a time
  39. // * U16x8 for 8 elements at a time
  40. // * U8x16 for 16 elements at a time
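// Example (illustrative sketch, not part of the library): clamping a 16-byte aligned float array
// to the range [0, 1] using the fixed-size F32x4 type declared further down in this header.
//   for (int i = 0; i < elementCount; i += 4) {
//     F32x4 value = F32x4::readAlignedUnsafe(data + i);
//     value.clamp(0.0f, 1.0f).writeAlignedUnsafe(data + i);
//   }
// elementCount is assumed to be a multiple of 4 and data is assumed to be 16-byte aligned.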
  41. // Using the X vector size: (advanced, requires testing with different build flags or emulation)
  42. // If you want more performance, you can use variable length type aliases.
  43. // Pros and cons:
  44. // + For heavy calculations where memory access is not the bottleneck, larger SIMD vectors save energy and increase performance when enabled.
  45. // - If you forget to test with longer vector lengths (compiling with -mavx2 or defining EMULATE_256BIT_X_SIMD), you might get bugs from not iterating or aligning memory correctly.
  46. // Types:
  47. // * Use F32xX, I32xX and U32xX for laneCountX_32Bit elements at a time
  48. // * U16xX for laneCountX_16Bit elements at a time
  49. // * U8xX for laneCountX_8Bit elements at a time
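// Example (illustrative sketch, not part of the library): the same clamping loop written with the
// variable-length alias, so it works for both 128-bit and 256-bit builds. It assumes that
// readAlignedUnsafe, clamp and writeAlignedUnsafe exist for F32xX just as they do for F32x4.
//   for (int i = 0; i < elementCount; i += laneCountX_32Bit) {
//     F32xX value = F32xX::readAlignedUnsafe(data + i);
//     value.clamp(0.0f, 1.0f).writeAlignedUnsafe(data + i);
//   }
// elementCount is assumed to be a multiple of laneCountX_32Bit and data aligned for X vectors.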
  50. // Using the F vector size: (very dangerous, no test can confirm that memory alignment is correct)
  51. // If you want even more performance, you can let float operations use the longest available F vector size, which might exceed the X vector size.
  52. // Pros and cons:
  53. // - You have to manually align buffers to DSR_FLOAT_ALIGNMENT to prevent crashes.
  54. // If the default buffer alignment followed the F vector size, the more commonly used X vectors would be slowed down by cache misses from padding that is larger than the X vector size.
  55. // AlignedImageF32 and sound backends are already aligned with the F vector size, because they are not generic like Buffer.
  56. // - It can be difficult to detect incorrect memory alignment, because a pointer can accidentally be aligned to more than what was requested.
  57. // If accidentally aligning to 128 bits instead of 256 bits, there is a 50% chance of failing to detect it at runtime, only to fail later on another computer.
  58. // If sticking with 128-bit or X vectors, all buffers will be correctly aligned automatically.
  59. // + For heavy calculations where memory access is not the bottleneck, larger SIMD vectors save energy and increase performance when enabled.
  60. // - If you forget to test with longer vector lengths (compiling with -mavx or defining EMULATE_256BIT_F_SIMD), you might get bugs from not iterating or aligning memory correctly.
  61. // Types:
  62. // * Use F32xF for laneCountF elements at a time
  63. // Compiler extensions
  64. // On Intel/AMD processors:
  65. // SSE2 is usually enabled by default, because SSE2 is mandatory for 64-bit Intel instructions.
  66. // Use -mavx as a G++ compiler flag to enable the AVX extension, enabling the USE_AVX and USE_256BIT_F_SIMD macros.
  67. // If not available on your computer, you can test your algorithm for 256-bit float SIMD using EMULATE_256BIT_F_SIMD, but only if you use the F vector size.
  68. // Use -mavx2 as a G++ compiler flag to enable the AVX2 extension, enabling the USE_AVX2 and USE_256BIT_X_SIMD macros.
  69. // If not available on your computer, you can test your algorithm for 256-bit float and integer SIMD using EMULATE_256BIT_X_SIMD, but only if you use the X vector size.
  70. // On ARMv6 processors:
  71. // Scalar emulation is used when compiling for ARMv6, because it does not have NEON and VFP is not supported in this abstraction.
  72. // On ARMv7 processors:
  73. // NEON is usually enabled by default for ARMv7, because most of them have the extension.
  74. // On ARMv8 processors:
  75. // NEON cannot be disabled when building for ARMv8, because the extension is mandatory in ARMv8.
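// Build examples (illustrative, assuming GCC or Clang; the file name is a placeholder):
//   g++ -O2 main.cpp                            // 128-bit SIMD from the target's default flags (SSE2 or NEON)
//   g++ -O2 -mavx main.cpp                      // adds 256-bit float SIMD (USE_256BIT_F_SIMD)
//   g++ -O2 -mavx2 main.cpp                     // adds full 256-bit SIMD (USE_256BIT_X_SIMD)
//   g++ -O2 -D EMULATE_256BIT_X_SIMD main.cpp   // test variable lane counts without AVX2 hardware (or uncomment the #define below)
// Use the same flags for both the library and your own code.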
  76. #ifndef DFPSR_SIMD
  77. #define DFPSR_SIMD
  78. #include <stdint.h>
  79. #include <cassert>
  80. #include "SafePointer.h"
  81. #include "../math/FVector.h"
  82. #include "../math/IVector.h"
  83. #include "../math/UVector.h"
  84. // Determine which SIMD extensions to use when compiling this executable
  85. // Use the standard compiler flags for enabling SIMD extensions and having them as a minimum system requirement for the specific build.
  86. #if defined(__SSE2__)
  87. #define USE_SSE2 // Comment out this line to test without SSE2
  88. #ifdef USE_SSE2
  89. #ifdef __SSSE3__
  90. #define USE_SSSE3 // Comment out this line to test without SSSE3
  91. #endif
  92. #ifdef __AVX__
  93. #define USE_AVX // Comment out this line to test without AVX
  94. #ifdef USE_AVX
  95. #ifdef __AVX2__
  96. #define USE_AVX2 // Comment out this line to test without AVX2
  97. #endif
  98. #endif
  99. #endif
  100. #endif
  101. #elif defined(__ARM_NEON)
  102. #define USE_NEON // Comment out this line to test without NEON
  103. #endif
  104. // Enable the EMULATE_256BIT_X_SIMD macro to force the use of 256-bit vectors even when there are no hardware instructions supporting them.
  105. // F32xX will then be an alias for F32x8, laneCountX_32Bit will be 8 for iterating your algorithms in larger steps, buffers and image rows will be padded and aligned to 256 bits, et cetera...
  106. // This will be slower if only compiling with 128-bit SIMD instructions enabled, but it can show whether the algorithm uses the variable lane count correctly before profiling on a processor that has the extension.
  107. // Useful for testing algorithms when the computer used for programming does not have the hardware instructions.
  108. // To get real 256-bit SIMD on an Intel processor with AVX2 hardware instructions, enable the AVX2 compiler flag for the library and your project (which is -mavx2 in GCC).
  109. //#define EMULATE_256BIT_X_SIMD
  110. // Enable the EMULATE_256BIT_F_SIMD macro to force the use of 256-bit float vectors even when there are no hardware instructions supporting them.
  111. // F32xF will then be an alias for F32x8 and laneCountF will be 8 for iterating your float algorithms in larger steps.
  112. // Buffers are however aligned based on the X vector size by default, so using the F vector length requires aligning your buffers with DSR_FLOAT_ALIGNMENT instead of the default DSR_DEFAULT_ALIGNMENT.
  113. // Useful for testing algorithms when the computer used for programming does not have the hardware instructions.
  114. // To get real 256-bit float SIMD on an Intel processor with AVX hardware instructions, enable the AVX compiler flag for the library and your project (which is -mavx in GCC).
  115. //#define EMULATE_256BIT_F_SIMD
  116. // A platform independent summary of which features are enabled.
  117. #ifdef USE_SSE2
  118. // We have hardware support for 128-bit SIMD, which is enough to make memory bandwidth the bottleneck for light computation.
  119. #define USE_BASIC_SIMD
  120. #ifdef USE_AVX
  121. // We have hardware support for 256-bit float SIMD, so that we get a performance boost from using F32x8 or its alias F32xF instead of F32x4
  122. #define USE_256BIT_F_SIMD
  123. #ifdef USE_AVX2
  124. // We also have hardware support for the other 256-bit SIMD types, pushing the size of an X vector and default alignment to 256 bits.
  125. // F32xX will now refer to F32x8
  126. // I32xX will now refer to I32x8
  127. // U32xX will now refer to U32x8
  128. // U16xX will now refer to U16x16
  129. // U8xX will now refer to U8x32
  130. #define USE_256BIT_X_SIMD
  131. #endif
  132. #endif
  133. #endif
  134. #ifdef USE_NEON
  135. #define USE_BASIC_SIMD
  136. #endif
  137. // Alignment in bytes
  138. #define ALIGN_BYTES(SIZE) __attribute__((aligned(SIZE)))
  139. #define ALIGN16 ALIGN_BYTES(16) // 128-bit alignment
  140. #define ALIGN32 ALIGN_BYTES(32) // 256-bit alignment
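// Example: "float data[4] ALIGN16 = {0.0f, 1.0f, 2.0f, 3.0f};" declares a 16-byte aligned stack array,
// the same pattern used by the NEON load helpers further down in this file.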
  141. // Everything declared in here handles things specific for SSE.
  142. // Direct use of the macros will not provide portability to all hardware.
  143. #ifdef USE_SSE2
  144. #define USE_DIRECT_SIMD_MEMORY_ACCESS
  145. #include <emmintrin.h> // SSE2
  146. #ifdef USE_SSSE3
  147. #include <tmmintrin.h> // SSSE3
  148. #endif
  149. #ifdef USE_AVX
  150. #include <immintrin.h> // AVX / AVX2
  151. #endif
  152. // Vector types
  153. #define SIMD_F32x4 __m128
  154. #define SIMD_U8x16 __m128i
  155. #define SIMD_U16x8 __m128i
  156. #define SIMD_U32x4 __m128i
  157. #define SIMD_I32x4 __m128i
  158. // Vector uploads in address order
  159. #define LOAD_VECTOR_F32_SIMD(A, B, C, D) _mm_set_ps(D, C, B, A)
  160. #define LOAD_SCALAR_F32_SIMD(A) _mm_set1_ps(A)
  161. #define LOAD_VECTOR_U8_SIMD(A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P) _mm_set_epi8(P, O, N, M, L, K, J, I, H, G, F, E, D, C, B, A)
  162. #define LOAD_SCALAR_U8_SIMD(A) _mm_set1_epi8(A)
  163. #define LOAD_VECTOR_U16_SIMD(A, B, C, D, E, F, G, H) _mm_set_epi16(H, G, F, E, D, C, B, A)
  164. #define LOAD_SCALAR_U16_SIMD(A) _mm_set1_epi16(A)
  165. #define LOAD_VECTOR_U32_SIMD(A, B, C, D) _mm_set_epi32(D, C, B, A)
  166. #define LOAD_SCALAR_U32_SIMD(A) _mm_set1_epi32(A)
  167. #define LOAD_VECTOR_I32_SIMD(A, B, C, D) _mm_set_epi32(D, C, B, A)
  168. #define LOAD_SCALAR_I32_SIMD(A) _mm_set1_epi32(A)
  169. // Conversions
  170. #define F32_TO_I32_SIMD(A) _mm_cvttps_epi32(A)
  171. #define F32_TO_U32_SIMD(A) _mm_cvttps_epi32(A)
  172. #define I32_TO_F32_SIMD(A) _mm_cvtepi32_ps(A)
  173. #define U32_TO_F32_SIMD(A) _mm_cvtepi32_ps(A)
  174. // Unpacking conversions
  175. #define U8_LOW_TO_U16_SIMD(A) _mm_unpacklo_epi8(A, _mm_set1_epi8(0))
  176. #define U8_HIGH_TO_U16_SIMD(A) _mm_unpackhi_epi8(A, _mm_set1_epi8(0))
  177. #define U16_LOW_TO_U32_SIMD(A) _mm_unpacklo_epi16(A, _mm_set1_epi16(0))
  178. #define U16_HIGH_TO_U32_SIMD(A) _mm_unpackhi_epi16(A, _mm_set1_epi16(0))
  179. // Saturated packing
  180. // Credit: Using ideas from Victoria Zhislina's NEON_2_SSE.h header from the Intel corporation, but not trying to emulate NEON
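// How the trick below works: _mm_packus_epi16 treats its 16-bit inputs as signed, so any unsigned
// value of 0x8000 or above would be seen as negative and clamped to zero. Masking with 0x7FFF clears
// the sign bit, and the signed compare a2 > a is only true for those large values, so OR-ing the
// masked compare result back in turns them into 0x7FFF, which the pack instruction then saturates
// to 255, matching unsigned saturation.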
  181. inline SIMD_U8x16 PACK_SAT_U16_TO_U8_SIMD(const SIMD_U16x8& a, const SIMD_U16x8& b) {
  182. SIMD_U16x8 mask, a2, b2;
  183. mask = _mm_set1_epi16(0b0111111111111111);
  184. a2 = _mm_and_si128(a, mask);
  185. a2 = _mm_or_si128(a2, _mm_and_si128(_mm_cmpgt_epi16(a2, a), mask));
  186. b2 = _mm_and_si128(b, mask);
  187. b2 = _mm_or_si128(b2, _mm_and_si128(_mm_cmpgt_epi16(b2, b), mask));
  188. return _mm_packus_epi16(a2, b2);
  189. }
  190. // Reinterpret casting
  191. #define REINTERPRET_U32_TO_U8_SIMD(A) (A)
  192. #define REINTERPRET_U32_TO_U16_SIMD(A) (A)
  193. #define REINTERPRET_U8_TO_U32_SIMD(A) (A)
  194. #define REINTERPRET_U16_TO_U32_SIMD(A) (A)
  195. #define REINTERPRET_U32_TO_I32_SIMD(A) (A)
  196. #define REINTERPRET_I32_TO_U32_SIMD(A) (A)
  197. // Vector float operations returning SIMD_F32x4
  198. #define ADD_F32_SIMD(A, B) _mm_add_ps(A, B)
  199. #define SUB_F32_SIMD(A, B) _mm_sub_ps(A, B)
  200. #define MUL_F32_SIMD(A, B) _mm_mul_ps(A, B)
  201. // Vector integer operations returning SIMD_I32x4
  202. #define ADD_I32_SIMD(A, B) _mm_add_epi32(A, B)
  203. #define SUB_I32_SIMD(A, B) _mm_sub_epi32(A, B)
  204. // 32-bit integer multiplications are not available on SSE2.
  205. // Vector integer operations returning SIMD_U32x4
  206. #define ADD_U32_SIMD(A, B) _mm_add_epi32(A, B)
  207. #define SUB_U32_SIMD(A, B) _mm_sub_epi32(A, B)
  208. // 32-bit integer multiplications are not available on SSE2.
  209. // Vector integer operations returning SIMD_U16x8
  210. #define ADD_U16_SIMD(A, B) _mm_add_epi16(A, B)
  211. #define SUB_U16_SIMD(A, B) _mm_sub_epi16(A, B)
  212. #define MUL_U16_SIMD(A, B) _mm_mullo_epi16(A, B)
  213. // Vector integer operations returning SIMD_U8x16
  214. #define ADD_U8_SIMD(A, B) _mm_add_epi8(A, B)
  215. #define ADD_SAT_U8_SIMD(A, B) _mm_adds_epu8(A, B) // Saturated addition
  216. #define SUB_U8_SIMD(A, B) _mm_sub_epi8(A, B)
  217. #define SUB_SAT_U8_SIMD(A, B) _mm_subs_epu8(A, B) // Saturated subtraction
  218. // No 8-bit multiplications
  219. // Statistics
  220. #define MIN_F32_SIMD(A, B) _mm_min_ps(A, B)
  221. #define MAX_F32_SIMD(A, B) _mm_max_ps(A, B)
  222. // TODO: Implement minimum and maximum for integer vectors, so that all operations exist for all applicable types:
  223. // Using _mm256_max_epu16... in AVX2 for 256-bit versions
  224. // Using comparisons and masking in SSE2 when _mm_max_epu16... in SSE4.1 is not available
  225. // Bitwise
  226. #define BITWISE_AND_U32_SIMD(A, B) _mm_and_si128(A, B)
  227. #define BITWISE_OR_U32_SIMD(A, B) _mm_or_si128(A, B)
  228. #define BITWISE_XOR_U32_SIMD(A, B) _mm_xor_si128(A, B)
  229. #ifdef USE_AVX
  230. // 256-bit vector types
  231. #define SIMD_F32x8 __m256
  232. // Vector uploads in address order
  233. #define LOAD_VECTOR_F32_SIMD256(A, B, C, D, E, F, G, H) _mm256_set_ps(H, G, F, E, D, C, B, A)
  234. #define LOAD_SCALAR_F32_SIMD256(A) _mm256_set1_ps(A)
  235. // Vector float operations returning SIMD_F32x8
  236. #define ADD_F32_SIMD256(A, B) _mm256_add_ps(A, B)
  237. #define SUB_F32_SIMD256(A, B) _mm256_sub_ps(A, B)
  238. #define MUL_F32_SIMD256(A, B) _mm256_mul_ps(A, B)
  239. // Statistics
  240. #define MIN_F32_SIMD256(A, B) _mm256_min_ps(A, B)
  241. #define MAX_F32_SIMD256(A, B) _mm256_max_ps(A, B)
  242. #ifdef USE_AVX2
  243. // 256-bit vector types
  244. #define SIMD_U8x32 __m256i
  245. #define SIMD_U16x16 __m256i
  246. #define SIMD_U32x8 __m256i
  247. #define SIMD_I32x8 __m256i
  248. // Vector uploads in address order
  249. #define LOAD_VECTOR_U8_SIMD256(A1, B1, C1, D1, E1, F1, G1, H1, I1, J1, K1, L1, M1, N1, O1, P1, Q1, R1, S1, T1, U1, V1, W1, X1, Y1, Z1, A2, B2, C2, D2, E2, F2) _mm256_set_epi8(F2, E2, D2, C2, B2, A2, Z1, Y1, X1, W1, V1, U1, T1, S1, R1, Q1, P1, O1, N1, M1, L1, K1, J1, I1, H1, G1, F1, E1, D1, C1, B1, A1)
  250. #define LOAD_SCALAR_U8_SIMD256(A) _mm256_set1_epi8(A)
  251. #define LOAD_VECTOR_U16_SIMD256(A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P) _mm256_set_epi16(P, O, N, M, L, K, J, I, H, G, F, E, D, C, B, A)
  252. #define LOAD_SCALAR_U16_SIMD256(A) _mm256_set1_epi16(A)
  253. #define LOAD_VECTOR_U32_SIMD256(A, B, C, D, E, F, G, H) _mm256_set_epi32(H, G, F, E, D, C, B, A)
  254. #define LOAD_SCALAR_U32_SIMD256(A) _mm256_set1_epi32(A)
  255. #define LOAD_VECTOR_I32_SIMD256(A, B, C, D, E, F, G, H) _mm256_set_epi32(H, G, F, E, D, C, B, A)
  256. #define LOAD_SCALAR_I32_SIMD256(A) _mm256_set1_epi32(A)
  257. // Conversions
  258. #define F32_TO_I32_SIMD256(A) _mm256_cvttps_epi32(A)
  259. #define F32_TO_U32_SIMD256(A) _mm256_cvttps_epi32(A)
  260. #define I32_TO_F32_SIMD256(A) _mm256_cvtepi32_ps(A)
  261. #define U32_TO_F32_SIMD256(A) _mm256_cvtepi32_ps(A)
  262. // Unpacking conversions
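// Unlike their SSE2 counterparts, the AVX2 unpack instructions interleave within each 128-bit lane.
// Permuting the 64-bit quarters into the order 0, 2, 1, 3 first (_mm256_permute4x64_epi64 with
// 0b11011000) makes the zero-extended results come out in address order.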
  263. #define U8_LOW_TO_U16_SIMD256(A) _mm256_unpacklo_epi8(_mm256_permute4x64_epi64(A, 0b11011000), _mm256_set1_epi8(0))
  264. #define U8_HIGH_TO_U16_SIMD256(A) _mm256_unpackhi_epi8(_mm256_permute4x64_epi64(A, 0b11011000), _mm256_set1_epi8(0))
  265. #define U16_LOW_TO_U32_SIMD256(A) _mm256_unpacklo_epi16(_mm256_permute4x64_epi64(A, 0b11011000), _mm256_set1_epi16(0))
  266. #define U16_HIGH_TO_U32_SIMD256(A) _mm256_unpackhi_epi16(_mm256_permute4x64_epi64(A, 0b11011000), _mm256_set1_epi16(0))
  267. // Saturated packing
  268. // Credit: Using ideas from Victoria Zhislina's NEON_2_SSE.h header from the Intel corporation, but not trying to emulate NEON
  269. inline SIMD_U8x32 PACK_SAT_U16_TO_U8_SIMD256(const SIMD_U16x16& a, const SIMD_U16x16& b) {
  270. SIMD_U16x16 mask, a2, b2;
  271. mask = _mm256_set1_epi16(0b0111111111111111);
  272. a2 = _mm256_and_si256(a, mask);
  273. a2 = _mm256_or_si256(a2, _mm256_and_si256(_mm256_cmpgt_epi16(a2, a), mask));
  274. b2 = _mm256_and_si256(b, mask);
  275. b2 = _mm256_or_si256(b2, _mm256_and_si256(_mm256_cmpgt_epi16(b2, b), mask));
  276. // The 256-bit pack instruction _mm256_packus_epi16 packs within each 128-bit lane, so the result has to be permuted into address order.
  277. // 0 2 1 3
  278. // | X |
  279. // 0 1 2 3
  280. return _mm256_permute4x64_epi64(_mm256_packus_epi16(a2, b2), 0b11011000);
  281. }
  282. // Reinterpret casting
  283. #define REINTERPRET_U32_TO_U8_SIMD256(A) (A)
  284. #define REINTERPRET_U32_TO_U16_SIMD256(A) (A)
  285. #define REINTERPRET_U8_TO_U32_SIMD256(A) (A)
  286. #define REINTERPRET_U16_TO_U32_SIMD256(A) (A)
  287. #define REINTERPRET_U32_TO_I32_SIMD256(A) (A)
  288. #define REINTERPRET_I32_TO_U32_SIMD256(A) (A)
  289. // Vector integer operations returning SIMD_I32x8
  290. #define ADD_I32_SIMD256(A, B) _mm256_add_epi32(A, B)
  291. #define SUB_I32_SIMD256(A, B) _mm256_sub_epi32(A, B)
  292. #define MUL_I32_SIMD256(A, B) _mm256_mullo_epi32(A, B)
  293. // Vector integer operations returning SIMD_U32x8
  294. #define ADD_U32_SIMD256(A, B) _mm256_add_epi32(A, B)
  295. #define SUB_U32_SIMD256(A, B) _mm256_sub_epi32(A, B)
  296. #define MUL_U32_SIMD256(A, B) _mm256_mullo_epi32(A, B)
  297. // Vector integer operations returning SIMD_U16x16
  298. #define ADD_U16_SIMD256(A, B) _mm256_add_epi16(A, B)
  299. #define SUB_U16_SIMD256(A, B) _mm256_sub_epi16(A, B)
  300. #define MUL_U16_SIMD256(A, B) _mm256_mullo_epi16(A, B)
  301. // Vector integer operations returning SIMD_U8x32
  302. #define ADD_U8_SIMD256(A, B) _mm256_add_epi8(A, B)
  303. #define ADD_SAT_U8_SIMD256(A, B) _mm256_adds_epu8(A, B) // Saturated addition
  304. #define SUB_U8_SIMD256(A, B) _mm256_sub_epi8(A, B)
  305. #define SUB_SAT_U8_SIMD256(A, B) _mm256_subs_epu8(A, B) // Saturated subtraction
  306. // No 8-bit multiplications
  307. // Bitwise
  308. #define BITWISE_AND_U32_SIMD256(A, B) _mm256_and_si256(A, B)
  309. #define BITWISE_OR_U32_SIMD256(A, B) _mm256_or_si256(A, B)
  310. #define BITWISE_XOR_U32_SIMD256(A, B) _mm256_xor_si256(A, B)
  311. #endif
  312. #endif
  313. #endif
  314. // Everything declared in here handles things specific for NEON.
  315. // Direct use of the macros will not provide portability to all hardware.
  316. #ifdef USE_NEON
  317. #include <arm_neon.h> // NEON
  318. // Vector types
  319. #define SIMD_F32x4 float32x4_t
  320. #define SIMD_U8x16 uint8x16_t
  321. #define SIMD_U16x8 uint16x8_t
  322. #define SIMD_U32x4 uint32x4_t
  323. #define SIMD_I32x4 int32x4_t
  324. // Vector uploads in address order
  325. inline SIMD_F32x4 LOAD_VECTOR_F32_SIMD(float a, float b, float c, float d) {
  326. float data[4] ALIGN16 = {a, b, c, d};
  327. return vld1q_f32(data);
  328. }
  329. inline SIMD_F32x4 LOAD_SCALAR_F32_SIMD(float a) {
  330. return vdupq_n_f32(a);
  331. }
  332. inline SIMD_U8x16 LOAD_VECTOR_U8_SIMD(uint8_t a, uint8_t b, uint8_t c, uint8_t d, uint8_t e, uint8_t f, uint8_t g, uint8_t h,
  333. uint8_t i, uint8_t j, uint8_t k, uint8_t l, uint8_t m, uint8_t n, uint8_t o, uint8_t p) {
  334. uint8_t data[16] ALIGN16 = {a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p};
  335. return vld1q_u8(data);
  336. }
  337. inline SIMD_U8x16 LOAD_SCALAR_U8_SIMD(uint8_t a) {
  338. return vdupq_n_u8(a);
  339. }
  340. inline SIMD_U16x8 LOAD_VECTOR_U16_SIMD(uint16_t a, uint16_t b, uint16_t c, uint16_t d, uint16_t e, uint16_t f, uint16_t g, uint16_t h) {
  341. uint16_t data[8] ALIGN16 = {a, b, c, d, e, f, g, h};
  342. return vld1q_u16(data);
  343. }
  344. inline SIMD_U16x8 LOAD_SCALAR_U16_SIMD(uint16_t a) {
  345. return vdupq_n_u16(a);
  346. }
  347. inline SIMD_U32x4 LOAD_VECTOR_U32_SIMD(uint32_t a, uint32_t b, uint32_t c, uint32_t d) {
  348. uint32_t data[4] ALIGN16 = {a, b, c, d};
  349. return vld1q_u32(data);
  350. }
  351. inline SIMD_U32x4 LOAD_SCALAR_U32_SIMD(uint32_t a) {
  352. return vdupq_n_u32(a);
  353. }
  354. inline SIMD_I32x4 LOAD_VECTOR_I32_SIMD(int32_t a, int32_t b, int32_t c, int32_t d) {
  355. int32_t data[4] ALIGN16 = {a, b, c, d};
  356. return vld1q_s32(data);
  357. }
  358. inline SIMD_I32x4 LOAD_SCALAR_I32_SIMD(int32_t a) {
  359. return vdupq_n_s32(a);
  360. }
  361. // Conversions
  362. #define F32_TO_I32_SIMD(A) vcvtq_s32_f32(A)
  363. #define F32_TO_U32_SIMD(A) vcvtq_u32_f32(A)
  364. #define I32_TO_F32_SIMD(A) vcvtq_f32_s32(A)
  365. #define U32_TO_F32_SIMD(A) vcvtq_f32_u32(A)
  366. // Unpacking conversions
  367. #define U8_LOW_TO_U16_SIMD(A) vmovl_u8(vget_low_u8(A))
  368. #define U8_HIGH_TO_U16_SIMD(A) vmovl_u8(vget_high_u8(A))
  369. #define U16_LOW_TO_U32_SIMD(A) vmovl_u16(vget_low_u16(A))
  370. #define U16_HIGH_TO_U32_SIMD(A) vmovl_u16(vget_high_u16(A))
  371. // Saturated packing
  372. #define PACK_SAT_U16_TO_U8_SIMD(A, B) vcombine_u8(vqmovn_u16(A), vqmovn_u16(B))
  373. // Reinterpret casting
  374. #define REINTERPRET_U32_TO_U8_SIMD(A) vreinterpretq_u8_u32(A)
  375. #define REINTERPRET_U32_TO_U16_SIMD(A) vreinterpretq_u16_u32(A)
  376. #define REINTERPRET_U8_TO_U32_SIMD(A) vreinterpretq_u32_u8(A)
  377. #define REINTERPRET_U16_TO_U32_SIMD(A) vreinterpretq_u32_u16(A)
  378. #define REINTERPRET_U32_TO_I32_SIMD(A) vreinterpretq_s32_u32(A)
  379. #define REINTERPRET_I32_TO_U32_SIMD(A) vreinterpretq_u32_s32(A)
  380. // Vector float operations returning SIMD_F32x4
  381. #define ADD_F32_SIMD(A, B) vaddq_f32(A, B)
  382. #define SUB_F32_SIMD(A, B) vsubq_f32(A, B)
  383. #define MUL_F32_SIMD(A, B) vmulq_f32(A, B)
  384. // Vector integer operations returning SIMD_I32x4
  385. #define ADD_I32_SIMD(A, B) vaddq_s32(A, B)
  386. #define SUB_I32_SIMD(A, B) vsubq_s32(A, B)
  387. #define MUL_I32_NEON(A, B) vmulq_s32(A, B)
  388. // Vector integer operations returning SIMD_U32x4
  389. #define ADD_U32_SIMD(A, B) vaddq_u32(A, B)
  390. #define SUB_U32_SIMD(A, B) vsubq_u32(A, B)
  391. #define MUL_U32_NEON(A, B) vmulq_u32(A, B)
  392. // Vector integer operations returning SIMD_U16x8
  393. #define ADD_U16_SIMD(A, B) vaddq_u16(A, B)
  394. #define SUB_U16_SIMD(A, B) vsubq_u16(A, B)
  395. #define MUL_U16_SIMD(A, B) vmulq_u16(A, B)
  396. // Vector integer operations returning SIMD_U8x16
  397. #define ADD_U8_SIMD(A, B) vaddq_u8(A, B)
  398. #define ADD_SAT_U8_SIMD(A, B) vqaddq_u8(A, B) // Saturated addition
  399. #define SUB_U8_SIMD(A, B) vsubq_u8(A, B)
  400. #define SUB_SAT_U8_SIMD(A, B) vqsubq_u8(A, B) // Saturated subtraction
  401. // No 8-bit multiplications
  402. // Statistics
  403. #define MIN_F32_SIMD(A, B) vminq_f32(A, B)
  404. #define MAX_F32_SIMD(A, B) vmaxq_f32(A, B)
  405. // Bitwise
  406. #define BITWISE_AND_U32_SIMD(A, B) vandq_u32(A, B)
  407. #define BITWISE_OR_U32_SIMD(A, B) vorrq_u32(A, B)
  408. #define BITWISE_XOR_U32_SIMD(A, B) veorq_u32(A, B)
  409. #endif
  410. /*
  411. The vector types below are supposed to be portable across different CPU architectures.
  412. When mixed with handwritten SIMD intrinsics:
  413. Use "USE_SSE2" instead of "__SSE2__"
  414. Use "USE_AVX2" instead of "__AVX2__"
  415. Use "USE_NEON" instead of "__ARM_NEON"
  416. This way, any new variations of the macro names given by the compiler can be added to simd.h instead of being duplicated everywhere.
  417. Portability exceptions:
  418. * The "v" variable is the native backend, which is only defined when SIMD is supported by hardware.
  419. Only use when USE_BASIC_SIMD is defined.
  420. Will not work on scalar emulation.
  421. * The "scalars" array is available when emulating a type without hardware support, or when the SIMD vector allows direct access to its memory.
  422. Do not rely on it for accessing elements, because then your code will not compile for ARM NEON.
  423. */
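// Illustrative sketch (hypothetical function, not part of the library): mixing a handwritten
// intrinsic with the portable types by branching on the platform-independent macros above.
//   inline F32x4 example_negate(const F32x4 &a) {
//   #if defined USE_SSE2
//       return F32x4(_mm_sub_ps(_mm_set1_ps(0.0f), a.v));
//   #elif defined USE_NEON
//       return F32x4(vnegq_f32(a.v));
//   #else
//       return F32x4(-a.scalars[0], -a.scalars[1], -a.scalars[2], -a.scalars[3]);
//   #endif
//   }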
  424. union F32x4 {
  425. private:
  426. // The uninitialized default constructor is private for safety reasons.
  427. F32x4() {}
  428. public:
  429. // When the uninitialized constructor is needed for performance, use this named constructor instead.
  430. static inline F32x4 create_dangerous_uninitialized() { return F32x4(); }
  431. #ifdef USE_BASIC_SIMD
  432. public:
  433. #ifdef USE_DIRECT_SIMD_MEMORY_ACCESS
  434. // Only use if USE_DIRECT_SIMD_MEMORY_ACCESS is defined!
  435. // Direct access cannot be done on NEON!
  436. float scalars[4];
  437. #endif
  438. // The SIMD vector of undefined type
  439. // Not accessible while emulating!
  440. SIMD_F32x4 v;
  441. // Construct a portable vector from a native SIMD vector
  442. explicit F32x4(const SIMD_F32x4& v) : v(v) {}
  443. // Construct a portable vector from a set of scalars
  444. F32x4(float a1, float a2, float a3, float a4) : v(LOAD_VECTOR_F32_SIMD(a1, a2, a3, a4)) {}
  445. // Construct a portable vector from a single duplicated scalar
  446. explicit F32x4(float scalar) : v(LOAD_SCALAR_F32_SIMD(scalar)) {}
  447. #else
  448. public:
  449. // Emulate a SIMD vector as an array of scalars without hardware support.
  450. // Only accessible while emulating!
  451. float scalars[4];
  452. // Construct a portable vector from a set of scalars
  453. F32x4(float a1, float a2, float a3, float a4) {
  454. this->scalars[0] = a1;
  455. this->scalars[1] = a2;
  456. this->scalars[2] = a3;
  457. this->scalars[3] = a4;
  458. }
  459. // Construct a portable vector from a single duplicated scalar
  460. explicit F32x4(float scalar) {
  461. this->scalars[0] = scalar;
  462. this->scalars[1] = scalar;
  463. this->scalars[2] = scalar;
  464. this->scalars[3] = scalar;
  465. }
  466. #endif
  467. // Create a gradient vector using start and increment, so that arbitrary length vectors have a way to initialize linear iterations.
  468. static inline F32x4 createGradient(float start, float increment) {
  469. return F32x4(start, start + increment, start + increment * 2.0f, start + increment * 3.0f);
  470. }
  471. // Construct a portable SIMD vector from a pointer to aligned data
  472. // data must be aligned to 16 bytes, because the aligned SIMD load requires it
  473. static inline F32x4 readAlignedUnsafe(const float* data) {
  474. #ifdef USE_BASIC_SIMD
  475. #if defined USE_SSE2
  476. return F32x4(_mm_load_ps(data));
  477. #elif defined USE_NEON
  478. return F32x4(vld1q_f32(data));
  479. #endif
  480. #else
  481. return F32x4(data[0], data[1], data[2], data[3]);
  482. #endif
  483. }
  484. // Write to aligned memory from the existing vector
  486. // data must be aligned to 16 bytes, because the aligned SIMD store requires it
  486. inline void writeAlignedUnsafe(float* data) const {
  487. #if defined USE_BASIC_SIMD
  488. #if defined USE_SSE2
  489. _mm_store_ps(data, this->v);
  490. #elif defined USE_NEON
  491. vst1q_f32(data, this->v);
  492. #endif
  493. #else
  494. data[0] = this->scalars[0];
  495. data[1] = this->scalars[1];
  496. data[2] = this->scalars[2];
  497. data[3] = this->scalars[3];
  498. #endif
  499. }
  500. #if defined DFPSR_GEOMETRY_FVECTOR
  501. dsr::FVector4D get() const {
  502. float data[4] ALIGN16;
  503. this->writeAlignedUnsafe(data);
  504. return dsr::FVector4D(data[0], data[1], data[2], data[3]);
  505. }
  506. #endif
  507. // Bound and alignment checked reading
  508. static inline F32x4 readAligned(const dsr::SafePointer<float> data, const char* methodName) {
  509. const float* pointer = data.getUnsafe();
  510. assert(((uintptr_t)pointer & 15) == 0);
  511. #if defined SAFE_POINTER_CHECKS
  512. data.assertInside(methodName, pointer, 16);
  513. #endif
  514. return F32x4::readAlignedUnsafe(pointer);
  515. }
  516. // Bound and alignment checked writing
  517. inline void writeAligned(dsr::SafePointer<float> data, const char* methodName) const {
  518. float* pointer = data.getUnsafe();
  519. assert(((uintptr_t)pointer & 15) == 0);
  520. #if defined SAFE_POINTER_CHECKS
  521. data.assertInside(methodName, pointer, 16);
  522. #endif
  523. this->writeAlignedUnsafe(pointer);
  524. }
  525. // 1 / x
  526. // Useful for multiple divisions with the same denominator
  527. // Useless if the denominator is a constant
  528. F32x4 reciprocal() const {
  529. #if defined USE_BASIC_SIMD
  530. #if defined USE_SSE2
  531. // Approximate
  532. SIMD_F32x4 lowQ = _mm_rcp_ps(this->v);
  533. // Refine
  534. return F32x4(SUB_F32_SIMD(ADD_F32_SIMD(lowQ, lowQ), MUL_F32_SIMD(this->v, MUL_F32_SIMD(lowQ, lowQ))));
  535. #elif defined USE_NEON
  536. // Approximate
  537. SIMD_F32x4 result = vrecpeq_f32(this->v);
  538. // Refine
  539. result = MUL_F32_SIMD(vrecpsq_f32(this->v, result), result);
  540. return F32x4(MUL_F32_SIMD(vrecpsq_f32(this->v, result), result));
  541. #else
  542. assert(false);
  543. return F32x4(0);
  544. #endif
  545. #else
  546. return F32x4(1.0f / this->scalars[0], 1.0f / this->scalars[1], 1.0f / this->scalars[2], 1.0f / this->scalars[3]);
  547. #endif
  548. }
  549. // 1 / sqrt(x)
  550. // Useful for normalizing vectors
  551. F32x4 reciprocalSquareRoot() const {
  552. #if defined USE_BASIC_SIMD
  553. #if defined USE_SSE2
  554. SIMD_F32x4 reciRoot = _mm_rsqrt_ps(this->v);
  555. SIMD_F32x4 mul = MUL_F32_SIMD(MUL_F32_SIMD(this->v, reciRoot), reciRoot);
  556. reciRoot = MUL_F32_SIMD(MUL_F32_SIMD(LOAD_SCALAR_F32_SIMD(0.5f), reciRoot), SUB_F32_SIMD(LOAD_SCALAR_F32_SIMD(3.0f), mul));
  557. return F32x4(reciRoot);
  558. #elif defined USE_NEON
  559. // Approximate
  560. SIMD_F32x4 reciRoot = vrsqrteq_f32(this->v);
  561. // Refine
  562. reciRoot = MUL_F32_SIMD(vrsqrtsq_f32(MUL_F32_SIMD(this->v, reciRoot), reciRoot), reciRoot);
  563. return F32x4(reciRoot);
  564. #else
  565. assert(false);
  566. return F32x4(0);
  567. #endif
  568. #else
  569. return F32x4(1.0f / sqrt(this->scalars[0]), 1.0f / sqrt(this->scalars[1]), 1.0f / sqrt(this->scalars[2]), 1.0f / sqrt(this->scalars[3]));
  570. #endif
  571. }
  572. // sqrt(x)
  573. // Useful for getting lengths of vectors
  574. F32x4 squareRoot() const {
  575. #if defined USE_BASIC_SIMD
  576. #if defined USE_SSE2
  577. SIMD_F32x4 half = LOAD_SCALAR_F32_SIMD(0.5f);
  578. // Approximate
  579. SIMD_F32x4 root = _mm_sqrt_ps(this->v);
  580. // Refine
  581. root = _mm_mul_ps(_mm_add_ps(root, _mm_div_ps(this->v, root)), half);
  582. return F32x4(root);
  583. #elif defined USE_NEON
  584. return F32x4(MUL_F32_SIMD(this->v, this->reciprocalSquareRoot().v));
  585. #else
  586. assert(false);
  587. return F32x4(0);
  588. #endif
  589. #else
  590. return F32x4(sqrt(this->scalars[0]), sqrt(this->scalars[1]), sqrt(this->scalars[2]), sqrt(this->scalars[3]));
  591. #endif
  592. }
  593. F32x4 clamp(float minimum, float maximum) const {
  594. #if defined USE_BASIC_SIMD
  595. return F32x4(MIN_F32_SIMD(MAX_F32_SIMD(this->v, LOAD_SCALAR_F32_SIMD(minimum)), LOAD_SCALAR_F32_SIMD(maximum)));
  596. #else
  597. float val0 = this->scalars[0];
  598. float val1 = this->scalars[1];
  599. float val2 = this->scalars[2];
  600. float val3 = this->scalars[3];
  601. if (minimum > val0) { val0 = minimum; }
  602. if (maximum < val0) { val0 = maximum; }
  603. if (minimum > val1) { val1 = minimum; }
  604. if (maximum < val1) { val1 = maximum; }
  605. if (minimum > val2) { val2 = minimum; }
  606. if (maximum < val2) { val2 = maximum; }
  607. if (minimum > val3) { val3 = minimum; }
  608. if (maximum < val3) { val3 = maximum; }
  609. return F32x4(val0, val1, val2, val3);
  610. #endif
  611. }
  612. F32x4 clampLower(float minimum) const {
  613. #if defined USE_BASIC_SIMD
  614. return F32x4(MAX_F32_SIMD(this->v, LOAD_SCALAR_F32_SIMD(minimum)));
  615. #else
  616. float val0 = this->scalars[0];
  617. float val1 = this->scalars[1];
  618. float val2 = this->scalars[2];
  619. float val3 = this->scalars[3];
  620. if (minimum > val0) { val0 = minimum; }
  621. if (minimum > val1) { val1 = minimum; }
  622. if (minimum > val2) { val2 = minimum; }
  623. if (minimum > val3) { val3 = minimum; }
  624. return F32x4(val0, val1, val2, val3);
  625. #endif
  626. }
  627. F32x4 clampUpper(float maximum) const {
  628. #if defined USE_BASIC_SIMD
  629. return F32x4(MIN_F32_SIMD(this->v, LOAD_SCALAR_F32_SIMD(maximum)));
  630. #else
  631. float val0 = this->scalars[0];
  632. float val1 = this->scalars[1];
  633. float val2 = this->scalars[2];
  634. float val3 = this->scalars[3];
  635. if (maximum < val0) { val0 = maximum; }
  636. if (maximum < val1) { val1 = maximum; }
  637. if (maximum < val2) { val2 = maximum; }
  638. if (maximum < val3) { val3 = maximum; }
  639. return F32x4(val0, val1, val2, val3);
  640. #endif
  641. }
  642. };
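// Usage sketch (illustrative only, not an existing API): takes the square root of four floats from a
// 16-byte aligned array in place, clamping negative inputs to zero first, using only F32x4 members.
inline void example_squareRootInPlace(float* alignedData) {
	// Read four floats, clamp to non-negative values, take the square root and write back.
	F32x4 value = F32x4::readAlignedUnsafe(alignedData);
	value.clampLower(0.0f).squareRoot().writeAlignedUnsafe(alignedData);
}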
  643. union I32x4 {
  644. private:
  645. // The uninitialized default constructor is private for safety reasons.
  646. I32x4() {}
  647. public:
  648. // When the uninitialized constructor is needed for performance, use this named constructor instead.
  649. static inline I32x4 create_dangerous_uninitialized() { return I32x4(); }
  650. #if defined USE_BASIC_SIMD
  651. public:
  652. #if defined USE_DIRECT_SIMD_MEMORY_ACCESS
  653. // Only use if USE_DIRECT_SIMD_MEMORY_ACCESS is defined!
  654. // Direct access cannot be done on NEON!
  655. int32_t scalars[4];
  656. #endif
  657. // The SIMD vector of undefined type
  658. // Not accessible while emulating!
  659. SIMD_I32x4 v;
  660. // Construct a portable vector from a native SIMD vector
  661. explicit I32x4(const SIMD_I32x4& v) : v(v) {}
  662. // Construct a portable vector from a set of scalars
  663. I32x4(int32_t a1, int32_t a2, int32_t a3, int32_t a4) : v(LOAD_VECTOR_I32_SIMD(a1, a2, a3, a4)) {}
  664. // Construct a portable vector from a single duplicated scalar
  665. explicit I32x4(int32_t scalar) : v(LOAD_SCALAR_I32_SIMD(scalar)) {}
  666. #else
  667. public:
  668. // Emulate a SIMD vector as an array of scalars without hardware support.
  669. // Only accessible while emulating!
  670. int32_t scalars[4];
  671. // Construct a portable vector from a set of scalars
  672. I32x4(int32_t a1, int32_t a2, int32_t a3, int32_t a4) {
  673. this->scalars[0] = a1;
  674. this->scalars[1] = a2;
  675. this->scalars[2] = a3;
  676. this->scalars[3] = a4;
  677. }
  678. // Construct a portable vector from a single duplicated scalar
  679. explicit I32x4(int32_t scalar) {
  680. this->scalars[0] = scalar;
  681. this->scalars[1] = scalar;
  682. this->scalars[2] = scalar;
  683. this->scalars[3] = scalar;
  684. }
  685. #endif
  686. // Create a gradient vector using start and increment, so that arbitrary length vectors have a way to initialize linear iterations.
  687. static inline I32x4 createGradient(int32_t start, int32_t increment) {
  688. return I32x4(start, start + increment, start + increment * 2, start + increment * 3);
  689. }
  690. // Construct a portable SIMD vector from a pointer to aligned data
  691. // data must be aligned to 16 bytes, because the aligned SIMD load requires it
  692. static inline I32x4 readAlignedUnsafe(const int32_t* data) {
  693. #if defined USE_BASIC_SIMD
  694. #if defined USE_SSE2
  695. return I32x4(_mm_load_si128((const __m128i*)data));
  696. #elif defined USE_NEON
  697. return I32x4(vld1q_s32(data));
  698. #endif
  699. #else
  700. return I32x4(data[0], data[1], data[2], data[3]);
  701. #endif
  702. }
  703. // Write to aligned memory from the existing vector
  705. // data must be aligned to 16 bytes, because the aligned SIMD store requires it
  705. inline void writeAlignedUnsafe(int32_t* data) const {
  706. #if defined USE_BASIC_SIMD
  707. #if defined USE_SSE2
  708. _mm_store_si128((__m128i*)data, this->v);
  709. #elif defined USE_NEON
  710. vst1q_s32(data, this->v);
  711. #endif
  712. #else
  713. data[0] = this->scalars[0];
  714. data[1] = this->scalars[1];
  715. data[2] = this->scalars[2];
  716. data[3] = this->scalars[3];
  717. #endif
  718. }
  719. #if defined DFPSR_GEOMETRY_IVECTOR
  720. dsr::IVector4D get() const {
  721. int32_t data[4] ALIGN16;
  722. this->writeAlignedUnsafe(data);
  723. return dsr::IVector4D(data[0], data[1], data[2], data[3]);
  724. }
  725. #endif
  726. // Bound and alignment checked reading
  727. static inline I32x4 readAligned(const dsr::SafePointer<int32_t> data, const char* methodName) {
  728. const int32_t* pointer = data.getUnsafe();
  729. assert(((uintptr_t)pointer & 15) == 0);
  730. #if defined SAFE_POINTER_CHECKS
  731. data.assertInside(methodName, pointer, 16);
  732. #endif
  733. return I32x4::readAlignedUnsafe(pointer);
  734. }
  735. // Bound and alignment checked writing
  736. inline void writeAligned(dsr::SafePointer<int32_t> data, const char* methodName) const {
  737. int32_t* pointer = data.getUnsafe();
  738. assert(((uintptr_t)pointer & 15) == 0);
  739. #if defined SAFE_POINTER_CHECKS
  740. data.assertInside(methodName, pointer, 16);
  741. #endif
  742. this->writeAlignedUnsafe(pointer);
  743. }
  744. };
  745. union U32x4 {
  746. #if defined USE_BASIC_SIMD
  747. public:
  748. #if defined USE_DIRECT_SIMD_MEMORY_ACCESS
  749. // Only use if USE_DIRECT_SIMD_MEMORY_ACCESS is defined!
  750. // Direct access cannot be done on NEON!
  751. uint32_t scalars[4];
  752. #endif
  753. // The SIMD vector of undefined type
  754. // Not accessible while emulating!
  755. SIMD_U32x4 v;
  756. // Construct a portable vector from a native SIMD vector
  757. explicit U32x4(const SIMD_U32x4& v) : v(v) {}
  758. // Construct a portable vector from a set of scalars
  759. U32x4(uint32_t a1, uint32_t a2, uint32_t a3, uint32_t a4) : v(LOAD_VECTOR_U32_SIMD(a1, a2, a3, a4)) {}
  760. // Construct a portable vector from a single duplicated scalar
  761. explicit U32x4(uint32_t scalar) : v(LOAD_SCALAR_U32_SIMD(scalar)) {}
  762. #else
  763. public:
  764. // Emulate a SIMD vector as an array of scalars without hardware support.
  765. // Only accessible while emulating!
  766. uint32_t scalars[4];
  767. // Construct a portable vector from a set of scalars
  768. U32x4(uint32_t a1, uint32_t a2, uint32_t a3, uint32_t a4) {
  769. this->scalars[0] = a1;
  770. this->scalars[1] = a2;
  771. this->scalars[2] = a3;
  772. this->scalars[3] = a4;
  773. }
  774. // Construct a portable vector from a single duplicated scalar
  775. explicit U32x4(uint32_t scalar) {
  776. this->scalars[0] = scalar;
  777. this->scalars[1] = scalar;
  778. this->scalars[2] = scalar;
  779. this->scalars[3] = scalar;
  780. }
  781. #endif
  782. // Create a gradient vector using start and increment, so that arbitrary length vectors have a way to initialize linear iterations.
  783. static inline U32x4 createGradient(uint32_t start, uint32_t increment) {
  784. return U32x4(start, start + increment, start + increment * 2, start + increment * 3);
  785. }
  786. // Construct a portable SIMD vector from a pointer to aligned data
  787. // data must be aligned to 16 bytes, because the aligned SIMD load requires it
  788. static inline U32x4 readAlignedUnsafe(const uint32_t* data) {
  789. #if defined USE_BASIC_SIMD
  790. #if defined USE_SSE2
  791. return U32x4(_mm_load_si128((const __m128i*)data));
  792. #elif defined USE_NEON
  793. return U32x4(vld1q_u32(data));
  794. #endif
  795. #else
  796. return U32x4(data[0], data[1], data[2], data[3]);
  797. #endif
  798. }
  799. // Write to aligned memory from the existing vector
  800. // data must be aligned to 16 bytes, because the aligned SIMD store requires it
  801. inline void writeAlignedUnsafe(uint32_t* data) const {
  802. #if defined USE_BASIC_SIMD
  803. #if defined USE_SSE2
  804. _mm_store_si128((__m128i*)data, this->v);
  805. #elif defined USE_NEON
  806. vst1q_u32(data, this->v);
  807. #endif
  808. #else
  809. data[0] = this->scalars[0];
  810. data[1] = this->scalars[1];
  811. data[2] = this->scalars[2];
  812. data[3] = this->scalars[3];
  813. #endif
  814. }
  815. #if defined DFPSR_GEOMETRY_UVECTOR
  816. dsr::UVector4D get() const {
  817. uint32_t data[4] ALIGN16;
  818. this->writeAlignedUnsafe(data);
  819. return dsr::UVector4D(data[0], data[1], data[2], data[3]);
  820. }
  821. #endif
  822. // Bound and alignment checked reading
  823. static inline U32x4 readAligned(const dsr::SafePointer<uint32_t> data, const char* methodName) {
  824. const uint32_t* pointer = data.getUnsafe();
  825. assert(((uintptr_t)pointer & 15) == 0);
  826. #if defined SAFE_POINTER_CHECKS
  827. data.assertInside(methodName, pointer, 16);
  828. #endif
  829. return U32x4::readAlignedUnsafe(pointer);
  830. }
  831. // Bound and alignment checked writing
  832. inline void writeAligned(dsr::SafePointer<uint32_t> data, const char* methodName) const {
  833. uint32_t* pointer = data.getUnsafe();
  834. assert(((uintptr_t)pointer & 15) == 0);
  835. #if defined SAFE_POINTER_CHECKS
  836. data.assertInside(methodName, pointer, 16);
  837. #endif
  838. this->writeAlignedUnsafe(pointer);
  839. }
  840. };
  841. union U16x8 {
  842. private:
  843. // The uninitialized default constructor is private for safety reasons.
  844. U16x8() {}
  845. public:
  846. // When the uninitialized constructor is needed for performance, use this named constructor instead.
  847. static inline U16x8 create_dangerous_uninitialized() { return U16x8(); }
  848. #if defined USE_BASIC_SIMD
  849. public:
  850. #if defined USE_DIRECT_SIMD_MEMORY_ACCESS
  851. // Only use if USE_DIRECT_SIMD_MEMORY_ACCESS is defined!
  852. // Direct access cannot be done on NEON!
  853. uint16_t scalars[8];
  854. #endif
  855. // The SIMD vector of undefined type
  856. // Not accessible while emulating!
  857. SIMD_U16x8 v;
  858. // Construct a portable vector from a native SIMD vector
  859. explicit U16x8(const SIMD_U16x8& v) : v(v) {}
  860. // Construct a vector of 8 x 16-bit unsigned integers from a vector of 4 x 32-bit unsigned integers
  861. // Reinterpret casting is used
  862. explicit U16x8(const U32x4& vector) : v(REINTERPRET_U32_TO_U16_SIMD(vector.v)) {}
  863. // Construct a portable vector from a set of scalars
  864. U16x8(uint16_t a1, uint16_t a2, uint16_t a3, uint16_t a4, uint16_t a5, uint16_t a6, uint16_t a7, uint16_t a8) : v(LOAD_VECTOR_U16_SIMD(a1, a2, a3, a4, a5, a6, a7, a8)) {}
  865. // Construct a vector of 8 x 16-bit unsigned integers from a single duplicated 32-bit unsigned integer
  866. // Reinterpret casting is used
867. // TODO: Remove all reinterprets from constructors to improve readability
  868. explicit U16x8(uint32_t scalar) : v(REINTERPRET_U32_TO_U16_SIMD(LOAD_SCALAR_U32_SIMD(scalar))) {}
  869. // Construct a portable vector from a single duplicated scalar
  870. explicit U16x8(uint16_t scalar) : v(LOAD_SCALAR_U16_SIMD(scalar)) {}
  871. // Reinterpret cast to a vector of 4 x 32-bit unsigned integers
  872. U32x4 get_U32() const {
  873. return U32x4(REINTERPRET_U16_TO_U32_SIMD(this->v));
  874. }
  875. #else
  876. public:
  877. // Emulate a SIMD vector as an array of scalars without hardware support.
  878. // Only accessible while emulating!
  879. uint16_t scalars[8];
  880. // Construct a vector of 8 x 16-bit unsigned integers from a vector of 4 x 32-bit unsigned integers
  881. // Reinterpret casting is used
  882. explicit U16x8(const U32x4& vector) {
  883. uint64_t *target = (uint64_t*)this->scalars;
  884. uint64_t *source = (uint64_t*)vector.scalars;
  885. target[0] = source[0];
  886. target[1] = source[1];
  887. }
  888. // Construct a portable vector from a set of scalars
  889. U16x8(uint16_t a1, uint16_t a2, uint16_t a3, uint16_t a4, uint16_t a5, uint16_t a6, uint16_t a7, uint16_t a8) {
  890. this->scalars[0] = a1;
  891. this->scalars[1] = a2;
  892. this->scalars[2] = a3;
  893. this->scalars[3] = a4;
  894. this->scalars[4] = a5;
  895. this->scalars[5] = a6;
  896. this->scalars[6] = a7;
  897. this->scalars[7] = a8;
  898. }
  899. // Construct a vector of 8 x 16-bit unsigned integers from a single duplicated 32-bit unsigned integer
  900. // Reinterpret casting is used
  901. explicit U16x8(uint32_t scalar) {
  902. uint32_t *target = (uint32_t*)this->scalars;
  903. target[0] = scalar;
  904. target[1] = scalar;
  905. target[2] = scalar;
  906. target[3] = scalar;
  907. }
  908. // Construct a portable vector from a single duplicated scalar
  909. explicit U16x8(uint16_t scalar) {
  910. this->scalars[0] = scalar;
  911. this->scalars[1] = scalar;
  912. this->scalars[2] = scalar;
  913. this->scalars[3] = scalar;
  914. this->scalars[4] = scalar;
  915. this->scalars[5] = scalar;
  916. this->scalars[6] = scalar;
  917. this->scalars[7] = scalar;
  918. }
  919. // Reinterpret cast to a vector of 4 x 32-bit unsigned integers
  920. U32x4 get_U32() const {
  921. U32x4 result(0);
  922. uint64_t *target = (uint64_t*)result.scalars;
  923. uint64_t *source = (uint64_t*)this->scalars;
  924. target[0] = source[0];
  925. target[1] = source[1];
  926. return result;
  927. }
  928. #endif
  929. // Create a gradient vector using start and increment, so that arbitrary length vectors have a way to initialize linear iterations.
  930. static inline U16x8 createGradient(uint16_t start, uint16_t increment) {
  931. return U16x8(
  932. start,
  933. start + increment,
  934. start + increment * 2,
  935. start + increment * 3,
  936. start + increment * 4,
  937. start + increment * 5,
  938. start + increment * 6,
  939. start + increment * 7
  940. );
  941. }
  942. static inline U16x8 readAlignedUnsafe(const uint16_t* data) {
  943. #if defined USE_BASIC_SIMD
  944. #if defined USE_SSE2
  945. return U16x8(_mm_load_si128((const __m128i*)data));
  946. #elif defined USE_NEON
  947. return U16x8(vld1q_u16(data));
  948. #endif
  949. #else
  950. return U16x8(data[0], data[1], data[2], data[3], data[4], data[5], data[6], data[7]);
  951. #endif
  952. }
953. // data must be aligned to 16 bytes
  954. inline void writeAlignedUnsafe(uint16_t* data) const {
  955. #if defined USE_BASIC_SIMD
  956. #if defined USE_SSE2
  957. _mm_store_si128((__m128i*)data, this->v);
  958. #elif defined USE_NEON
  959. vst1q_u16(data, this->v);
  960. #endif
  961. #else
  962. data[0] = this->scalars[0];
  963. data[1] = this->scalars[1];
  964. data[2] = this->scalars[2];
  965. data[3] = this->scalars[3];
  966. data[4] = this->scalars[4];
  967. data[5] = this->scalars[5];
  968. data[6] = this->scalars[6];
  969. data[7] = this->scalars[7];
  970. #endif
  971. }
  972. // Bound and alignment checked reading
  973. static inline U16x8 readAligned(const dsr::SafePointer<uint16_t> data, const char* methodName) {
  974. const uint16_t* pointer = data.getUnsafe();
  975. assert(((uintptr_t)pointer & 15) == 0);
  976. #if defined SAFE_POINTER_CHECKS
  977. data.assertInside(methodName, pointer, 16);
  978. #endif
  979. return U16x8::readAlignedUnsafe(pointer);
  980. }
  981. // Bound and alignment checked writing
  982. inline void writeAligned(dsr::SafePointer<uint16_t> data, const char* methodName) const {
  983. uint16_t* pointer = data.getUnsafe();
  984. assert(((uintptr_t)pointer & 15) == 0);
  985. #if defined SAFE_POINTER_CHECKS
  986. data.assertInside(methodName, pointer, 16);
  987. #endif
  988. this->writeAlignedUnsafe(pointer);
  989. }
  990. };
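// Usage sketch (comment only): reinterpreting the same 128 bits between lane widths.
// The constant below is purely illustrative and does not imply any real pixel format.
//   U32x4 packed(0x00110022u);           // four copies of the same 32-bit pattern
//   U16x8 halves(packed);                // view the same 128 bits as 8 x 16-bit lanes
//   U16x8 doubled = halves + halves;     // per-lane 16-bit arithmetic (operator+ defined below)
//   U32x4 repacked = doubled.get_U32();  // view the result as 4 x 32-bit lanes again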
  991. union U8x16 {
  992. private:
  993. // The uninitialized default constructor is private for safety reasons.
  994. U8x16() {}
  995. public:
  996. // When the uninitialized constructor is needed for performance, use this named constructor instead.
  997. static inline U8x16 create_dangerous_uninitialized() { return U8x16(); }
  998. #if defined USE_BASIC_SIMD
  999. public:
  1000. #if defined USE_DIRECT_SIMD_MEMORY_ACCESS
  1001. // Only use if USE_DIRECT_SIMD_MEMORY_ACCESS is defined!
  1002. // Direct access cannot be done on NEON!
  1003. uint8_t scalars[16];
  1004. #endif
  1005. // The SIMD vector of undefined type
  1006. // Not accessible while emulating!
  1007. SIMD_U8x16 v;
  1008. // Construct a portable vector from a native SIMD vector
  1009. explicit U8x16(const SIMD_U8x16& v) : v(v) {}
  1010. // Construct a portable vector from a set of scalars
  1011. U8x16(uint8_t a1, uint8_t a2, uint8_t a3, uint8_t a4, uint8_t a5, uint8_t a6, uint8_t a7, uint8_t a8,
  1012. uint8_t a9, uint8_t a10, uint8_t a11, uint8_t a12, uint8_t a13, uint8_t a14, uint8_t a15, uint8_t a16)
  1013. : v(LOAD_VECTOR_U8_SIMD(a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11, a12, a13, a14, a15, a16)) {}
  1014. // Construct a portable vector from a single duplicated scalar
  1015. explicit U8x16(uint8_t scalar) : v(LOAD_SCALAR_U8_SIMD(scalar)) {}
  1016. #else
  1017. public:
  1018. // Emulate a SIMD vector as an array of scalars without hardware support.
  1019. // Only accessible while emulating!
  1020. uint8_t scalars[16];
  1021. // Construct a portable vector from a set of scalars
  1022. U8x16(uint8_t a1, uint8_t a2, uint8_t a3, uint8_t a4, uint8_t a5, uint8_t a6, uint8_t a7, uint8_t a8,
  1023. uint8_t a9, uint8_t a10, uint8_t a11, uint8_t a12, uint8_t a13, uint8_t a14, uint8_t a15, uint8_t a16) {
  1024. this->scalars[0] = a1;
  1025. this->scalars[1] = a2;
  1026. this->scalars[2] = a3;
  1027. this->scalars[3] = a4;
  1028. this->scalars[4] = a5;
  1029. this->scalars[5] = a6;
  1030. this->scalars[6] = a7;
  1031. this->scalars[7] = a8;
  1032. this->scalars[8] = a9;
  1033. this->scalars[9] = a10;
  1034. this->scalars[10] = a11;
  1035. this->scalars[11] = a12;
  1036. this->scalars[12] = a13;
  1037. this->scalars[13] = a14;
  1038. this->scalars[14] = a15;
  1039. this->scalars[15] = a16;
  1040. }
  1041. // Construct a portable vector from a single duplicated scalar
  1042. explicit U8x16(uint8_t scalar) {
  1043. this->scalars[0] = scalar;
  1044. this->scalars[1] = scalar;
  1045. this->scalars[2] = scalar;
  1046. this->scalars[3] = scalar;
  1047. this->scalars[4] = scalar;
  1048. this->scalars[5] = scalar;
  1049. this->scalars[6] = scalar;
  1050. this->scalars[7] = scalar;
  1051. this->scalars[8] = scalar;
  1052. this->scalars[9] = scalar;
  1053. this->scalars[10] = scalar;
  1054. this->scalars[11] = scalar;
  1055. this->scalars[12] = scalar;
  1056. this->scalars[13] = scalar;
  1057. this->scalars[14] = scalar;
  1058. this->scalars[15] = scalar;
  1059. }
  1060. #endif
  1061. // Create a gradient vector using start and increment, so that arbitrary length vectors have a way to initialize linear iterations.
  1062. static inline U8x16 createGradient(uint8_t start, uint8_t increment) {
  1063. return U8x16(
  1064. start,
  1065. start + increment,
  1066. start + increment * 2,
  1067. start + increment * 3,
  1068. start + increment * 4,
  1069. start + increment * 5,
  1070. start + increment * 6,
  1071. start + increment * 7,
  1072. start + increment * 8,
  1073. start + increment * 9,
  1074. start + increment * 10,
  1075. start + increment * 11,
  1076. start + increment * 12,
  1077. start + increment * 13,
  1078. start + increment * 14,
  1079. start + increment * 15
  1080. );
  1081. }
  1082. static inline U8x16 readAlignedUnsafe(const uint8_t* data) {
  1083. #if defined USE_BASIC_SIMD
  1084. #if defined USE_SSE2
  1085. return U8x16(_mm_load_si128((const __m128i*)data));
  1086. #elif defined USE_NEON
  1087. return U8x16(vld1q_u8(data));
  1088. #endif
  1089. #else
  1090. return U8x16(
  1091. data[0], data[1], data[2], data[3], data[4], data[5], data[6], data[7],
  1092. data[8], data[9], data[10], data[11], data[12], data[13], data[14], data[15]
  1093. );
  1094. #endif
  1095. }
1096. // data must be aligned to 16 bytes
  1097. inline void writeAlignedUnsafe(uint8_t* data) const {
  1098. #if defined USE_BASIC_SIMD
  1099. #if defined USE_SSE2
  1100. _mm_store_si128((__m128i*)data, this->v);
  1101. #elif defined USE_NEON
  1102. vst1q_u8(data, this->v);
  1103. #endif
  1104. #else
  1105. data[0] = this->scalars[0];
  1106. data[1] = this->scalars[1];
  1107. data[2] = this->scalars[2];
  1108. data[3] = this->scalars[3];
  1109. data[4] = this->scalars[4];
  1110. data[5] = this->scalars[5];
  1111. data[6] = this->scalars[6];
  1112. data[7] = this->scalars[7];
  1113. data[8] = this->scalars[8];
  1114. data[9] = this->scalars[9];
  1115. data[10] = this->scalars[10];
  1116. data[11] = this->scalars[11];
  1117. data[12] = this->scalars[12];
  1118. data[13] = this->scalars[13];
  1119. data[14] = this->scalars[14];
  1120. data[15] = this->scalars[15];
  1121. #endif
  1122. }
  1123. // Bound and alignment checked reading
  1124. static inline U8x16 readAligned(const dsr::SafePointer<uint8_t> data, const char* methodName) {
  1125. const uint8_t* pointer = data.getUnsafe();
  1126. assert(((uintptr_t)pointer & 15) == 0);
  1127. #if defined SAFE_POINTER_CHECKS
  1128. data.assertInside(methodName, pointer, 16);
  1129. #endif
  1130. return U8x16::readAlignedUnsafe(pointer);
  1131. }
  1132. // Bound and alignment checked writing
  1133. inline void writeAligned(dsr::SafePointer<uint8_t> data, const char* methodName) const {
  1134. uint8_t* pointer = data.getUnsafe();
  1135. assert(((uintptr_t)pointer & 15) == 0);
  1136. #if defined SAFE_POINTER_CHECKS
  1137. data.assertInside(methodName, pointer, 16);
  1138. #endif
  1139. this->writeAlignedUnsafe(pointer);
  1140. }
  1141. };
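// Usage sketch (comment only): broadcasting a byte and processing 16 bytes at a time.
// "buffer" is a hypothetical 16-byte aligned array filled elsewhere.
//   uint8_t buffer[16] ALIGN16;              // assumed to be filled before use
//   U8x16 bytes = U8x16::readAlignedUnsafe(buffer);
//   U8x16 bias(uint8_t(16));                 // every lane set to 16
//   U8x16 sum = bytes + bias;                // per-lane 8-bit addition (operator+ defined below)
//   sum.writeAlignedUnsafe(buffer);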
  1142. union F32x8 {
  1143. private:
  1144. // The uninitialized default constructor is private for safety reasons.
  1145. F32x8() {}
  1146. public:
  1147. // When the uninitialized constructor is needed for performance, use this named constructor instead.
  1148. static inline F32x8 create_dangerous_uninitialized() { return F32x8(); }
  1149. #if defined USE_256BIT_F_SIMD
  1150. public:
  1151. #if defined USE_DIRECT_SIMD_MEMORY_ACCESS
  1152. // Only use if USE_DIRECT_SIMD_MEMORY_ACCESS is defined!
  1153. float scalars[8];
  1154. #endif
  1155. // The SIMD vector of undefined type
  1156. // Not accessible while emulating!
  1157. SIMD_F32x8 v;
  1158. // Construct a portable vector from a native SIMD vector
  1159. explicit F32x8(const SIMD_F32x8& v) : v(v) {}
  1160. // Construct a portable vector from a set of scalars
  1161. F32x8(float a1, float a2, float a3, float a4, float a5, float a6, float a7, float a8)
  1162. : v(LOAD_VECTOR_F32_SIMD256(a1, a2, a3, a4, a5, a6, a7, a8)) {}
  1163. // Construct a portable vector from a single duplicated scalar
  1164. explicit F32x8(float scalar) : v(LOAD_SCALAR_F32_SIMD256(scalar)) {}
  1165. #else
  1166. public:
  1167. // Emulate a SIMD vector as an array of scalars without hardware support.
  1168. // Only accessible while emulating!
  1169. float scalars[8];
  1170. // Construct a portable vector from a set of scalars
  1171. F32x8(float a1, float a2, float a3, float a4, float a5, float a6, float a7, float a8) {
  1172. this->scalars[0] = a1;
  1173. this->scalars[1] = a2;
  1174. this->scalars[2] = a3;
  1175. this->scalars[3] = a4;
  1176. this->scalars[4] = a5;
  1177. this->scalars[5] = a6;
  1178. this->scalars[6] = a7;
  1179. this->scalars[7] = a8;
  1180. }
  1181. // Construct a portable vector from a single duplicated scalar
  1182. explicit F32x8(float scalar) {
  1183. this->scalars[0] = scalar;
  1184. this->scalars[1] = scalar;
  1185. this->scalars[2] = scalar;
  1186. this->scalars[3] = scalar;
  1187. this->scalars[4] = scalar;
  1188. this->scalars[5] = scalar;
  1189. this->scalars[6] = scalar;
  1190. this->scalars[7] = scalar;
  1191. }
  1192. #endif
  1193. // Create a gradient vector using start and increment, so that arbitrary length vectors have a way to initialize linear iterations.
  1194. static inline F32x8 createGradient(float start, float increment) {
  1195. return F32x8(
  1196. start,
  1197. start + increment,
  1198. start + increment * 2.0f,
  1199. start + increment * 3.0f,
  1200. start + increment * 4.0f,
  1201. start + increment * 5.0f,
  1202. start + increment * 6.0f,
  1203. start + increment * 7.0f
  1204. );
  1205. }
  1206. // Construct a portable SIMD vector from a pointer to aligned data
1207. // data must be aligned to 32 bytes
  1208. static inline F32x8 readAlignedUnsafe(const float* data) {
  1209. #if defined USE_AVX2
  1210. return F32x8(_mm256_load_ps(data));
  1211. #else
  1212. return F32x8(data[0], data[1], data[2], data[3], data[4], data[5], data[6], data[7]);
  1213. #endif
  1214. }
  1215. // Write to aligned memory from the existing vector
1216. // data must be aligned to 32 bytes
  1217. inline void writeAlignedUnsafe(float* data) const {
  1218. #if defined USE_AVX2
  1219. _mm256_store_ps(data, this->v);
  1220. #else
  1221. data[0] = this->scalars[0];
  1222. data[1] = this->scalars[1];
  1223. data[2] = this->scalars[2];
  1224. data[3] = this->scalars[3];
  1225. data[4] = this->scalars[4];
  1226. data[5] = this->scalars[5];
  1227. data[6] = this->scalars[6];
  1228. data[7] = this->scalars[7];
  1229. #endif
  1230. }
  1231. // Bound and alignment checked reading
  1232. static inline F32x8 readAligned(const dsr::SafePointer<float> data, const char* methodName) {
  1233. const float* pointer = data.getUnsafe();
  1234. assert(((uintptr_t)pointer & 31) == 0);
  1235. #if defined SAFE_POINTER_CHECKS
  1236. data.assertInside(methodName, pointer, 32);
  1237. #endif
  1238. return F32x8::readAlignedUnsafe(pointer);
  1239. }
  1240. // Bound and alignment checked writing
  1241. inline void writeAligned(dsr::SafePointer<float> data, const char* methodName) const {
  1242. float* pointer = data.getUnsafe();
  1243. assert(((uintptr_t)pointer & 31) == 0);
  1244. #if defined SAFE_POINTER_CHECKS
  1245. data.assertInside(methodName, pointer, 32);
  1246. #endif
  1247. this->writeAlignedUnsafe(pointer);
  1248. }
  1249. // 1 / x
  1250. // Useful for multiple divisions with the same denominator
  1251. // Useless if the denominator is a constant
  1252. F32x8 reciprocal() const {
  1253. #if defined USE_AVX2
  1254. // Approximate
  1255. SIMD_F32x8 lowQ = _mm256_rcp_ps(this->v);
1256. // Refine with one Newton-Raphson iteration: x1 = 2 * x0 - x * x0 * x0
  1257. return F32x8(SUB_F32_SIMD256(ADD_F32_SIMD256(lowQ, lowQ), MUL_F32_SIMD256(this->v, MUL_F32_SIMD256(lowQ, lowQ))));
  1258. #else
  1259. return F32x8(
  1260. 1.0f / this->scalars[0],
  1261. 1.0f / this->scalars[1],
  1262. 1.0f / this->scalars[2],
  1263. 1.0f / this->scalars[3],
  1264. 1.0f / this->scalars[4],
  1265. 1.0f / this->scalars[5],
  1266. 1.0f / this->scalars[6],
  1267. 1.0f / this->scalars[7]
  1268. );
  1269. #endif
  1270. }
  1271. // 1 / sqrt(x)
  1272. // Useful for normalizing vectors
  1273. F32x8 reciprocalSquareRoot() const {
  1274. #if defined USE_AVX2
  1276. SIMD_F32x8 reciRoot = _mm256_rsqrt_ps(this->v);
  1277. SIMD_F32x8 mul = MUL_F32_SIMD256(MUL_F32_SIMD256(this->v, reciRoot), reciRoot);
  1278. reciRoot = MUL_F32_SIMD256(MUL_F32_SIMD256(LOAD_SCALAR_F32_SIMD256(0.5f), reciRoot), SUB_F32_SIMD256(LOAD_SCALAR_F32_SIMD256(3.0f), mul));
  1279. return F32x8(reciRoot);
  1280. #else
  1281. return F32x8(
  1282. 1.0f / sqrt(this->scalars[0]),
  1283. 1.0f / sqrt(this->scalars[1]),
  1284. 1.0f / sqrt(this->scalars[2]),
  1285. 1.0f / sqrt(this->scalars[3]),
  1286. 1.0f / sqrt(this->scalars[4]),
  1287. 1.0f / sqrt(this->scalars[5]),
  1288. 1.0f / sqrt(this->scalars[6]),
  1289. 1.0f / sqrt(this->scalars[7])
  1290. );
  1291. #endif
  1292. }
  1293. // sqrt(x)
  1294. // Useful for getting lengths of vectors
  1295. F32x8 squareRoot() const {
  1296. #if defined USE_AVX2
  1297. SIMD_F32x8 half = LOAD_SCALAR_F32_SIMD256(0.5f);
  1298. // Approximate
  1299. SIMD_F32x8 root = _mm256_sqrt_ps(this->v);
1300. // Refine with one Heron step: root = (root + x / root) * 0.5
  1301. root = _mm256_mul_ps(_mm256_add_ps(root, _mm256_div_ps(this->v, root)), half);
  1302. return F32x8(root);
  1303. #else
  1304. return F32x8(
  1305. sqrt(this->scalars[0]),
  1306. sqrt(this->scalars[1]),
  1307. sqrt(this->scalars[2]),
  1308. sqrt(this->scalars[3]),
  1309. sqrt(this->scalars[4]),
  1310. sqrt(this->scalars[5]),
  1311. sqrt(this->scalars[6]),
  1312. sqrt(this->scalars[7]));
  1313. #endif
  1314. }
  1315. F32x8 clamp(float minimum, float maximum) const {
  1316. #if defined USE_256BIT_F_SIMD
  1317. return F32x8(MIN_F32_SIMD256(MAX_F32_SIMD256(this->v, LOAD_SCALAR_F32_SIMD256(minimum)), LOAD_SCALAR_F32_SIMD256(maximum)));
  1318. #else
  1319. float val0 = this->scalars[0];
  1320. float val1 = this->scalars[1];
  1321. float val2 = this->scalars[2];
  1322. float val3 = this->scalars[3];
  1323. float val4 = this->scalars[4];
  1324. float val5 = this->scalars[5];
  1325. float val6 = this->scalars[6];
  1326. float val7 = this->scalars[7];
  1327. if (minimum > val0) { val0 = minimum; }
  1328. if (maximum < val0) { val0 = maximum; }
  1329. if (minimum > val1) { val1 = minimum; }
  1330. if (maximum < val1) { val1 = maximum; }
  1331. if (minimum > val2) { val2 = minimum; }
  1332. if (maximum < val2) { val2 = maximum; }
  1333. if (minimum > val3) { val3 = minimum; }
  1334. if (maximum < val3) { val3 = maximum; }
  1335. if (minimum > val4) { val4 = minimum; }
  1336. if (maximum < val4) { val4 = maximum; }
  1337. if (minimum > val5) { val5 = minimum; }
  1338. if (maximum < val5) { val5 = maximum; }
  1339. if (minimum > val6) { val6 = minimum; }
  1340. if (maximum < val6) { val6 = maximum; }
  1341. if (minimum > val7) { val7 = minimum; }
  1342. if (maximum < val7) { val7 = maximum; }
  1343. return F32x8(val0, val1, val2, val3, val4, val5, val6, val7);
  1344. #endif
  1345. }
  1346. F32x8 clampLower(float minimum) const {
  1347. #if defined USE_256BIT_F_SIMD
  1348. return F32x8(MAX_F32_SIMD256(this->v, LOAD_SCALAR_F32_SIMD256(minimum)));
  1349. #else
  1350. float val0 = this->scalars[0];
  1351. float val1 = this->scalars[1];
  1352. float val2 = this->scalars[2];
  1353. float val3 = this->scalars[3];
  1354. float val4 = this->scalars[4];
  1355. float val5 = this->scalars[5];
  1356. float val6 = this->scalars[6];
  1357. float val7 = this->scalars[7];
  1358. if (minimum > val0) { val0 = minimum; }
  1359. if (minimum > val1) { val1 = minimum; }
  1360. if (minimum > val2) { val2 = minimum; }
  1361. if (minimum > val3) { val3 = minimum; }
  1362. if (minimum > val4) { val4 = minimum; }
  1363. if (minimum > val5) { val5 = minimum; }
  1364. if (minimum > val6) { val6 = minimum; }
  1365. if (minimum > val7) { val7 = minimum; }
  1366. return F32x8(val0, val1, val2, val3, val4, val5, val6, val7);
  1367. #endif
  1368. }
  1369. F32x8 clampUpper(float maximum) const {
  1370. #if defined USE_256BIT_F_SIMD
  1371. return F32x8(MIN_F32_SIMD256(this->v, LOAD_SCALAR_F32_SIMD256(maximum)));
  1372. #else
  1373. float val0 = this->scalars[0];
  1374. float val1 = this->scalars[1];
  1375. float val2 = this->scalars[2];
  1376. float val3 = this->scalars[3];
  1377. float val4 = this->scalars[4];
  1378. float val5 = this->scalars[5];
  1379. float val6 = this->scalars[6];
  1380. float val7 = this->scalars[7];
  1381. if (maximum < val0) { val0 = maximum; }
  1382. if (maximum < val1) { val1 = maximum; }
  1383. if (maximum < val2) { val2 = maximum; }
  1384. if (maximum < val3) { val3 = maximum; }
  1385. if (maximum < val4) { val4 = maximum; }
  1386. if (maximum < val5) { val5 = maximum; }
  1387. if (maximum < val6) { val6 = maximum; }
  1388. if (maximum < val7) { val7 = maximum; }
  1389. return F32x8(val0, val1, val2, val3, val4, val5, val6, val7);
  1390. #endif
  1391. }
  1392. };
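// Note on the AVX2 paths above: _mm256_rcp_ps and _mm256_rsqrt_ps only return rough estimates
// (about 12 bits of precision), which is why one refinement step follows each of them.
// Usage sketch (comment only); "lengthBuffer" is a hypothetical 32-byte aligned float array:
//   F32x8 squaredLengths = F32x8::readAlignedUnsafe(lengthBuffer);
//   F32x8 inverseLengths = squaredLengths.reciprocalSquareRoot(); // approximately 1 / sqrt(x)
//   inverseLengths.writeAlignedUnsafe(lengthBuffer);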
  1393. union I32x8 {
  1394. private:
  1395. // The uninitialized default constructor is private for safety reasons.
  1396. I32x8() {}
  1397. public:
  1398. // When the uninitialized constructor is needed for performance, use this named constructor instead.
  1399. static inline I32x8 create_dangerous_uninitialized() { return I32x8(); }
  1400. #if defined USE_256BIT_X_SIMD
  1401. public:
  1402. #if defined USE_DIRECT_SIMD_MEMORY_ACCESS
  1403. // Only use if USE_DIRECT_SIMD_MEMORY_ACCESS is defined!
  1404. int32_t scalars[8];
  1405. #endif
  1406. // The SIMD vector of undefined type
  1407. // Not accessible while emulating!
  1408. SIMD_I32x8 v;
  1409. // Construct a portable vector from a native SIMD vector
  1410. explicit I32x8(const SIMD_I32x8& v) : v(v) {}
  1411. // Construct a portable vector from a set of scalars
  1412. I32x8(int32_t a1, int32_t a2, int32_t a3, int32_t a4, int32_t a5, int32_t a6, int32_t a7, int32_t a8)
  1413. : v(LOAD_VECTOR_I32_SIMD256(a1, a2, a3, a4, a5, a6, a7, a8)) {}
  1414. // Construct a portable vector from a single duplicated scalar
  1415. explicit I32x8(int32_t scalar) : v(LOAD_SCALAR_I32_SIMD256(scalar)) {}
  1416. #else
  1417. public:
  1418. // Emulate a SIMD vector as an array of scalars without hardware support.
  1419. // Only accessible while emulating!
  1420. int32_t scalars[8];
  1421. // Construct a portable vector from a set of scalars
  1422. I32x8(int32_t a1, int32_t a2, int32_t a3, int32_t a4, int32_t a5, int32_t a6, int32_t a7, int32_t a8) {
  1423. this->scalars[0] = a1;
  1424. this->scalars[1] = a2;
  1425. this->scalars[2] = a3;
  1426. this->scalars[3] = a4;
  1427. this->scalars[4] = a5;
  1428. this->scalars[5] = a6;
  1429. this->scalars[6] = a7;
  1430. this->scalars[7] = a8;
  1431. }
  1432. // Construct a portable vector from a single duplicated scalar
  1433. explicit I32x8(int32_t scalar) {
  1434. this->scalars[0] = scalar;
  1435. this->scalars[1] = scalar;
  1436. this->scalars[2] = scalar;
  1437. this->scalars[3] = scalar;
  1438. this->scalars[4] = scalar;
  1439. this->scalars[5] = scalar;
  1440. this->scalars[6] = scalar;
  1441. this->scalars[7] = scalar;
  1442. }
  1443. #endif
  1444. // Create a gradient vector using start and increment, so that arbitrary length vectors have a way to initialize linear iterations.
  1445. static inline I32x8 createGradient(int32_t start, int32_t increment) {
  1446. return I32x8(
  1447. start,
  1448. start + increment,
  1449. start + increment * 2,
  1450. start + increment * 3,
  1451. start + increment * 4,
  1452. start + increment * 5,
  1453. start + increment * 6,
  1454. start + increment * 7
  1455. );
  1456. }
  1457. // Construct a portable SIMD vector from a pointer to aligned data
1458. // data must be aligned to 32 bytes
  1459. static inline I32x8 readAlignedUnsafe(const int32_t* data) {
  1460. #if defined USE_AVX2
  1461. return I32x8(_mm256_load_si256((const __m256i*)data));
  1462. #else
  1463. return I32x8(data[0], data[1], data[2], data[3], data[4], data[5], data[6], data[7]);
  1464. #endif
  1465. }
  1466. // Write to aligned memory from the existing vector
1467. // data must be aligned to 32 bytes
  1468. inline void writeAlignedUnsafe(int32_t* data) const {
  1469. #if defined USE_AVX2
  1470. _mm256_store_si256((__m256i*)data, this->v);
  1471. #else
  1472. data[0] = this->scalars[0];
  1473. data[1] = this->scalars[1];
  1474. data[2] = this->scalars[2];
  1475. data[3] = this->scalars[3];
  1476. data[4] = this->scalars[4];
  1477. data[5] = this->scalars[5];
  1478. data[6] = this->scalars[6];
  1479. data[7] = this->scalars[7];
  1480. #endif
  1481. }
  1482. // Bound and alignment checked reading
  1483. static inline I32x8 readAligned(const dsr::SafePointer<int32_t> data, const char* methodName) {
  1484. const int32_t* pointer = data.getUnsafe();
  1485. assert(((uintptr_t)pointer & 31) == 0);
  1486. #if defined SAFE_POINTER_CHECKS
  1487. data.assertInside(methodName, pointer, 32);
  1488. #endif
  1489. return I32x8::readAlignedUnsafe(pointer);
  1490. }
  1491. // Bound and alignment checked writing
  1492. inline void writeAligned(dsr::SafePointer<int32_t> data, const char* methodName) const {
  1493. int32_t* pointer = data.getUnsafe();
  1494. assert(((uintptr_t)pointer & 31) == 0);
  1495. #if defined SAFE_POINTER_CHECKS
  1496. data.assertInside(methodName, pointer, 32);
  1497. #endif
  1498. this->writeAlignedUnsafe(pointer);
  1499. }
  1500. };
  1501. union U32x8 {
  1502. private:
  1503. // The uninitialized default constructor is private for safety reasons.
  1504. U32x8() {}
  1505. public:
  1506. // When the uninitialized constructor is needed for performance, use this named constructor instead.
  1507. static inline U32x8 create_dangerous_uninitialized() { return U32x8(); }
  1508. #if defined USE_256BIT_X_SIMD
  1509. public:
  1510. #if defined USE_DIRECT_SIMD_MEMORY_ACCESS
  1511. // Only use if USE_DIRECT_SIMD_MEMORY_ACCESS is defined!
  1512. uint32_t scalars[8];
  1513. #endif
  1514. // The SIMD vector of undefined type
  1515. // Not accessible while emulating!
  1516. SIMD_U32x8 v;
  1517. // Construct a portable vector from a native SIMD vector
  1518. explicit U32x8(const SIMD_U32x8& v) : v(v) {}
  1519. // Construct a portable vector from a set of scalars
  1520. U32x8(uint32_t a1, uint32_t a2, uint32_t a3, uint32_t a4, uint32_t a5, uint32_t a6, uint32_t a7, uint32_t a8)
  1521. : v(LOAD_VECTOR_U32_SIMD256(a1, a2, a3, a4, a5, a6, a7, a8)) {}
  1522. // Construct a portable vector from a single duplicated scalar
  1523. explicit U32x8(uint32_t scalar) : v(LOAD_SCALAR_U32_SIMD256(scalar)) {}
  1524. #else
  1525. public:
  1526. // Emulate a SIMD vector as an array of scalars without hardware support.
  1527. // Only accessible while emulating!
  1528. uint32_t scalars[8];
  1529. // Construct a portable vector from a set of scalars
  1530. U32x8(uint32_t a1, uint32_t a2, uint32_t a3, uint32_t a4, uint32_t a5, uint32_t a6, uint32_t a7, uint32_t a8) {
  1531. this->scalars[0] = a1;
  1532. this->scalars[1] = a2;
  1533. this->scalars[2] = a3;
  1534. this->scalars[3] = a4;
  1535. this->scalars[4] = a5;
  1536. this->scalars[5] = a6;
  1537. this->scalars[6] = a7;
  1538. this->scalars[7] = a8;
  1539. }
  1540. // Construct a portable vector from a single duplicated scalar
  1541. explicit U32x8(uint32_t scalar) {
  1542. this->scalars[0] = scalar;
  1543. this->scalars[1] = scalar;
  1544. this->scalars[2] = scalar;
  1545. this->scalars[3] = scalar;
  1546. this->scalars[4] = scalar;
  1547. this->scalars[5] = scalar;
  1548. this->scalars[6] = scalar;
  1549. this->scalars[7] = scalar;
  1550. }
  1551. #endif
  1552. // Create a gradient vector using start and increment, so that arbitrary length vectors have a way to initialize linear iterations.
  1553. static inline U32x8 createGradient(uint32_t start, uint32_t increment) {
  1554. return U32x8(
  1555. start,
  1556. start + increment,
  1557. start + increment * 2,
  1558. start + increment * 3,
  1559. start + increment * 4,
  1560. start + increment * 5,
  1561. start + increment * 6,
  1562. start + increment * 7
  1563. );
  1564. }
  1565. // Construct a portable SIMD vector from a pointer to aligned data
1566. // data must be aligned to 32 bytes
  1567. static inline U32x8 readAlignedUnsafe(const uint32_t* data) {
  1568. #if defined USE_AVX2
  1569. return U32x8(_mm256_load_si256((const __m256i*)data));
  1570. #else
  1571. return U32x8(data[0], data[1], data[2], data[3], data[4], data[5], data[6], data[7]);
  1572. #endif
  1573. }
  1574. // Write to aligned memory from the existing vector
1575. // data must be aligned to 32 bytes
  1576. inline void writeAlignedUnsafe(uint32_t* data) const {
  1577. #if defined USE_AVX2
  1578. _mm256_store_si256((__m256i*)data, this->v);
  1579. #else
  1580. data[0] = this->scalars[0];
  1581. data[1] = this->scalars[1];
  1582. data[2] = this->scalars[2];
  1583. data[3] = this->scalars[3];
  1584. data[4] = this->scalars[4];
  1585. data[5] = this->scalars[5];
  1586. data[6] = this->scalars[6];
  1587. data[7] = this->scalars[7];
  1588. #endif
  1589. }
  1590. // Bound and alignment checked reading
  1591. static inline U32x8 readAligned(const dsr::SafePointer<uint32_t> data, const char* methodName) {
  1592. const uint32_t* pointer = data.getUnsafe();
  1593. assert(((uintptr_t)pointer & 31) == 0);
  1594. #if defined SAFE_POINTER_CHECKS
  1595. data.assertInside(methodName, pointer, 32);
  1596. #endif
  1597. return U32x8::readAlignedUnsafe(pointer);
  1598. }
  1599. // Bound and alignment checked writing
  1600. inline void writeAligned(dsr::SafePointer<uint32_t> data, const char* methodName) const {
  1601. uint32_t* pointer = data.getUnsafe();
  1602. assert(((uintptr_t)pointer & 31) == 0);
  1603. #if defined SAFE_POINTER_CHECKS
  1604. data.assertInside(methodName, pointer, 32);
  1605. #endif
  1606. this->writeAlignedUnsafe(pointer);
  1607. }
  1608. };
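// Note: the 256-bit types (F32x8, I32x8, U32x8, U16x16, U8x32) assert 32-byte alignment in
// readAligned and writeAligned, unlike the 16-byte requirement of the 128-bit types.
// Allocation sketch (comment only) for a buffer that satisfies the stricter requirement:
//   ALIGN_BYTES(32) uint32_t values[8] = {0};
//   U32x8 loaded = U32x8::readAlignedUnsafe(values);
//   loaded.writeAlignedUnsafe(values);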
  1609. union U16x16 {
  1610. private:
  1611. // The uninitialized default constructor is private for safety reasons.
  1612. U16x16() {}
  1613. public:
  1614. // When the uninitialized constructor is needed for performance, use this named constructor instead.
  1615. static inline U16x16 create_dangerous_uninitialized() { return U16x16(); }
  1616. #if defined USE_256BIT_X_SIMD
  1617. public:
  1618. #if defined USE_DIRECT_SIMD_MEMORY_ACCESS
  1619. // Only use if USE_DIRECT_SIMD_MEMORY_ACCESS is defined!
  1620. uint16_t scalars[16];
  1621. #endif
  1622. // The SIMD vector of undefined type
  1623. // Not accessible while emulating!
  1624. SIMD_U16x16 v;
  1625. // Construct a portable vector from a native SIMD vector
  1626. explicit U16x16(const SIMD_U16x16& v) : v(v) {}
  1627. // Construct a vector of 16 x 16-bit unsigned integers from a vector of 8 x 32-bit unsigned integers
  1628. // Reinterpret casting is used
  1629. explicit U16x16(const U32x8& vector) : v(REINTERPRET_U32_TO_U16_SIMD256(vector.v)) {}
  1630. // Construct a portable vector from a set of scalars
  1631. U16x16(uint16_t a1, uint16_t a2, uint16_t a3, uint16_t a4, uint16_t a5, uint16_t a6, uint16_t a7, uint16_t a8,
  1632. uint16_t a9, uint16_t a10, uint16_t a11, uint16_t a12, uint16_t a13, uint16_t a14, uint16_t a15, uint16_t a16)
  1633. : v(LOAD_VECTOR_U16_SIMD256(a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11, a12, a13, a14, a15, a16)) {}
1634. // Construct a vector of 16 x 16-bit unsigned integers from a single duplicated 32-bit unsigned integer
1635. // Reinterpret casting is used
1636. // TODO: Remove all reinterprets from constructors to improve readability
  1637. explicit U16x16(uint32_t scalar) : v(REINTERPRET_U32_TO_U16_SIMD256(LOAD_SCALAR_U32_SIMD256(scalar))) {}
  1638. // Construct a portable vector from a single duplicated scalar
  1639. explicit U16x16(uint16_t scalar) : v(LOAD_SCALAR_U16_SIMD256(scalar)) {}
1640. // Reinterpret cast to a vector of 8 x 32-bit unsigned integers
  1641. U32x8 get_U32() const {
  1642. return U32x8(REINTERPRET_U16_TO_U32_SIMD256(this->v));
  1643. }
  1644. #else
  1645. public:
  1646. // Emulate a SIMD vector as an array of scalars without hardware support.
  1647. // Only accessible while emulating!
  1648. uint16_t scalars[16];
1649. // Construct a vector of 16 x 16-bit unsigned integers from a vector of 8 x 32-bit unsigned integers
  1650. // Reinterpret casting is used
  1651. explicit U16x16(const U32x8& vector) {
  1652. uint64_t *target = (uint64_t*)this->scalars;
  1653. uint64_t *source = (uint64_t*)vector.scalars;
  1654. target[0] = source[0];
  1655. target[1] = source[1];
  1656. target[2] = source[2];
  1657. target[3] = source[3];
  1658. }
  1659. // Construct a portable vector from a set of scalars
  1660. U16x16(uint16_t a1, uint16_t a2, uint16_t a3, uint16_t a4, uint16_t a5, uint16_t a6, uint16_t a7, uint16_t a8,
  1661. uint16_t a9, uint16_t a10, uint16_t a11, uint16_t a12, uint16_t a13, uint16_t a14, uint16_t a15, uint16_t a16) {
  1662. this->scalars[0] = a1;
  1663. this->scalars[1] = a2;
  1664. this->scalars[2] = a3;
  1665. this->scalars[3] = a4;
  1666. this->scalars[4] = a5;
  1667. this->scalars[5] = a6;
  1668. this->scalars[6] = a7;
  1669. this->scalars[7] = a8;
  1670. this->scalars[8] = a9;
  1671. this->scalars[9] = a10;
  1672. this->scalars[10] = a11;
  1673. this->scalars[11] = a12;
  1674. this->scalars[12] = a13;
  1675. this->scalars[13] = a14;
  1676. this->scalars[14] = a15;
  1677. this->scalars[15] = a16;
  1678. }
1679. // Construct a vector of 16 x 16-bit unsigned integers from a single duplicated 32-bit unsigned integer
  1680. // Reinterpret casting is used
  1681. explicit U16x16(uint32_t scalar) {
  1682. uint32_t *target = (uint32_t*)this->scalars;
  1683. target[0] = scalar;
  1684. target[1] = scalar;
  1685. target[2] = scalar;
  1686. target[3] = scalar;
  1687. target[4] = scalar;
  1688. target[5] = scalar;
  1689. target[6] = scalar;
  1690. target[7] = scalar;
  1691. }
  1692. // Construct a portable vector from a single duplicated scalar
  1693. explicit U16x16(uint16_t scalar) {
  1694. this->scalars[0] = scalar;
  1695. this->scalars[1] = scalar;
  1696. this->scalars[2] = scalar;
  1697. this->scalars[3] = scalar;
  1698. this->scalars[4] = scalar;
  1699. this->scalars[5] = scalar;
  1700. this->scalars[6] = scalar;
  1701. this->scalars[7] = scalar;
  1702. this->scalars[8] = scalar;
  1703. this->scalars[9] = scalar;
  1704. this->scalars[10] = scalar;
  1705. this->scalars[11] = scalar;
  1706. this->scalars[12] = scalar;
  1707. this->scalars[13] = scalar;
  1708. this->scalars[14] = scalar;
  1709. this->scalars[15] = scalar;
  1710. }
  1711. // Reinterpret cast to a vector of 8 x 32-bit unsigned integers
  1712. U32x8 get_U32() const {
  1713. U32x8 result(0);
  1714. uint64_t *target = (uint64_t*)result.scalars;
  1715. uint64_t *source = (uint64_t*)this->scalars;
  1716. target[0] = source[0];
  1717. target[1] = source[1];
  1718. target[2] = source[2];
  1719. target[3] = source[3];
  1720. return result;
  1721. }
  1722. #endif
  1723. // Create a gradient vector using start and increment, so that arbitrary length vectors have a way to initialize linear iterations.
  1724. static inline U16x16 createGradient(uint16_t start, uint16_t increment) {
  1725. return U16x16(
  1726. start,
  1727. start + increment,
  1728. start + increment * 2,
  1729. start + increment * 3,
  1730. start + increment * 4,
  1731. start + increment * 5,
  1732. start + increment * 6,
  1733. start + increment * 7,
  1734. start + increment * 8,
  1735. start + increment * 9,
  1736. start + increment * 10,
  1737. start + increment * 11,
  1738. start + increment * 12,
  1739. start + increment * 13,
  1740. start + increment * 14,
  1741. start + increment * 15
  1742. );
  1743. }
1744. // data must be aligned to 32 bytes
  1748. static inline U16x16 readAlignedUnsafe(const uint16_t* data) {
  1749. #if defined USE_AVX2
  1750. return U16x16(_mm256_load_si256((const __m256i*)data));
  1751. #else
  1752. return U16x16(
  1753. data[0],
  1754. data[1],
  1755. data[2],
  1756. data[3],
  1757. data[4],
  1758. data[5],
  1759. data[6],
  1760. data[7],
  1761. data[8],
  1762. data[9],
  1763. data[10],
  1764. data[11],
  1765. data[12],
  1766. data[13],
  1767. data[14],
  1768. data[15]
  1769. );
  1770. #endif
  1771. }
1772. // data must be aligned to 32 bytes
  1773. inline void writeAlignedUnsafe(uint16_t* data) const {
  1774. #if defined USE_AVX2
  1775. _mm256_store_si256((__m256i*)data, this->v);
  1776. #else
  1777. data[0] = this->scalars[0];
  1778. data[1] = this->scalars[1];
  1779. data[2] = this->scalars[2];
  1780. data[3] = this->scalars[3];
  1781. data[4] = this->scalars[4];
  1782. data[5] = this->scalars[5];
  1783. data[6] = this->scalars[6];
  1784. data[7] = this->scalars[7];
  1785. data[8] = this->scalars[8];
  1786. data[9] = this->scalars[9];
  1787. data[10] = this->scalars[10];
  1788. data[11] = this->scalars[11];
  1789. data[12] = this->scalars[12];
  1790. data[13] = this->scalars[13];
  1791. data[14] = this->scalars[14];
  1792. data[15] = this->scalars[15];
  1793. #endif
  1794. }
  1795. // Bound and alignment checked reading
  1796. static inline U16x16 readAligned(const dsr::SafePointer<uint16_t> data, const char* methodName) {
  1797. const uint16_t* pointer = data.getUnsafe();
  1798. assert(((uintptr_t)pointer & 31) == 0);
  1799. #if defined SAFE_POINTER_CHECKS
  1800. data.assertInside(methodName, pointer, 32);
  1801. #endif
  1802. return U16x16::readAlignedUnsafe(pointer);
  1803. }
  1804. // Bound and alignment checked writing
  1805. inline void writeAligned(dsr::SafePointer<uint16_t> data, const char* methodName) const {
  1806. uint16_t* pointer = data.getUnsafe();
  1807. assert(((uintptr_t)pointer & 31) == 0);
  1808. #if defined SAFE_POINTER_CHECKS
  1809. data.assertInside(methodName, pointer, 32);
  1810. #endif
  1811. this->writeAlignedUnsafe(pointer);
  1812. }
  1813. };
  1814. union U8x32 {
  1815. private:
  1816. // The uninitialized default constructor is private for safety reasons.
  1817. U8x32() {}
  1818. public:
  1819. // When the uninitialized constructor is needed for performance, use this named constructor instead.
  1820. static inline U8x32 create_dangerous_uninitialized() { return U8x32(); }
  1821. #if defined USE_256BIT_X_SIMD
  1822. public:
  1823. #if defined USE_DIRECT_SIMD_MEMORY_ACCESS
  1824. // Only use if USE_DIRECT_SIMD_MEMORY_ACCESS is defined!
  1825. uint8_t scalars[32];
  1826. #endif
  1827. // The SIMD vector of undefined type
  1828. // Not accessible while emulating!
  1829. SIMD_U8x32 v;
  1830. // Construct a portable vector from a native SIMD vector
  1831. explicit U8x32(const SIMD_U8x32& v) : v(v) {}
  1832. // Construct a portable vector from a set of scalars
  1833. U8x32(uint8_t a1, uint8_t a2, uint8_t a3, uint8_t a4, uint8_t a5, uint8_t a6, uint8_t a7, uint8_t a8,
  1834. uint8_t a9, uint8_t a10, uint8_t a11, uint8_t a12, uint8_t a13, uint8_t a14, uint8_t a15, uint8_t a16,
  1835. uint8_t a17, uint8_t a18, uint8_t a19, uint8_t a20, uint8_t a21, uint8_t a22, uint8_t a23, uint8_t a24,
  1836. uint8_t a25, uint8_t a26, uint8_t a27, uint8_t a28, uint8_t a29, uint8_t a30, uint8_t a31, uint8_t a32)
  1837. : v(LOAD_VECTOR_U8_SIMD256(
  1838. a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11, a12, a13, a14, a15, a16,
  1839. a17, a18, a19, a20, a21, a22, a23, a24, a25, a26, a27, a28, a29, a30, a31, a32
  1840. )) {}
  1841. // Construct a portable vector from a single duplicated scalar
  1842. explicit U8x32(uint8_t scalar) : v(LOAD_SCALAR_U8_SIMD256(scalar)) {}
  1843. #else
  1844. public:
  1845. // Emulate a SIMD vector as an array of scalars without hardware support.
  1846. // Only accessible while emulating!
  1847. uint8_t scalars[32];
  1848. // Construct a portable vector from a set of scalars
  1849. U8x32(uint8_t a1, uint8_t a2, uint8_t a3, uint8_t a4, uint8_t a5, uint8_t a6, uint8_t a7, uint8_t a8,
  1850. uint8_t a9, uint8_t a10, uint8_t a11, uint8_t a12, uint8_t a13, uint8_t a14, uint8_t a15, uint8_t a16,
  1851. uint8_t a17, uint8_t a18, uint8_t a19, uint8_t a20, uint8_t a21, uint8_t a22, uint8_t a23, uint8_t a24,
  1852. uint8_t a25, uint8_t a26, uint8_t a27, uint8_t a28, uint8_t a29, uint8_t a30, uint8_t a31, uint8_t a32) {
  1853. this->scalars[0] = a1;
  1854. this->scalars[1] = a2;
  1855. this->scalars[2] = a3;
  1856. this->scalars[3] = a4;
  1857. this->scalars[4] = a5;
  1858. this->scalars[5] = a6;
  1859. this->scalars[6] = a7;
  1860. this->scalars[7] = a8;
  1861. this->scalars[8] = a9;
  1862. this->scalars[9] = a10;
  1863. this->scalars[10] = a11;
  1864. this->scalars[11] = a12;
  1865. this->scalars[12] = a13;
  1866. this->scalars[13] = a14;
  1867. this->scalars[14] = a15;
  1868. this->scalars[15] = a16;
  1869. this->scalars[16] = a17;
  1870. this->scalars[17] = a18;
  1871. this->scalars[18] = a19;
  1872. this->scalars[19] = a20;
  1873. this->scalars[20] = a21;
  1874. this->scalars[21] = a22;
  1875. this->scalars[22] = a23;
  1876. this->scalars[23] = a24;
  1877. this->scalars[24] = a25;
  1878. this->scalars[25] = a26;
  1879. this->scalars[26] = a27;
  1880. this->scalars[27] = a28;
  1881. this->scalars[28] = a29;
  1882. this->scalars[29] = a30;
  1883. this->scalars[30] = a31;
  1884. this->scalars[31] = a32;
  1885. }
  1886. // Construct a portable vector from a single duplicated scalar
  1887. explicit U8x32(uint8_t scalar) {
  1888. for (int i = 0; i < 32; i++) {
  1889. this->scalars[i] = scalar;
  1890. }
  1891. }
  1892. #endif
  1893. // Create a gradient vector using start and increment, so that arbitrary length vectors have a way to initialize linear iterations.
  1894. static inline U8x32 createGradient(uint8_t start, uint8_t increment) {
  1895. return U8x32(
  1896. start,
  1897. start + increment,
  1898. start + increment * 2,
  1899. start + increment * 3,
  1900. start + increment * 4,
  1901. start + increment * 5,
  1902. start + increment * 6,
  1903. start + increment * 7,
  1904. start + increment * 8,
  1905. start + increment * 9,
  1906. start + increment * 10,
  1907. start + increment * 11,
  1908. start + increment * 12,
  1909. start + increment * 13,
  1910. start + increment * 14,
  1911. start + increment * 15,
  1912. start + increment * 16,
  1913. start + increment * 17,
  1914. start + increment * 18,
  1915. start + increment * 19,
  1916. start + increment * 20,
  1917. start + increment * 21,
  1918. start + increment * 22,
  1919. start + increment * 23,
  1920. start + increment * 24,
  1921. start + increment * 25,
  1922. start + increment * 26,
  1923. start + increment * 27,
  1924. start + increment * 28,
  1925. start + increment * 29,
  1926. start + increment * 30,
  1927. start + increment * 31
  1928. );
  1929. }
  1930. static inline U8x32 readAlignedUnsafe(const uint8_t* data) {
  1931. #if defined USE_AVX2
  1932. return U8x32(_mm256_load_si256((const __m256i*)data));
  1933. #else
  1934. U8x32 result;
  1935. for (int i = 0; i < 32; i++) {
  1936. result.scalars[i] = data[i];
  1937. }
  1938. return result;
  1939. #endif
  1940. }
1941. // data must be aligned to 32 bytes
  1942. inline void writeAlignedUnsafe(uint8_t* data) const {
  1943. #if defined USE_AVX2
  1944. _mm256_store_si256((__m256i*)data, this->v);
  1945. #else
  1946. for (int i = 0; i < 32; i++) {
  1947. data[i] = this->scalars[i];
  1948. }
  1949. #endif
  1950. }
  1951. // Bound and alignment checked reading
  1952. static inline U8x32 readAligned(const dsr::SafePointer<uint8_t> data, const char* methodName) {
  1953. const uint8_t* pointer = data.getUnsafe();
  1954. assert(((uintptr_t)pointer & 31) == 0);
  1955. #if defined SAFE_POINTER_CHECKS
  1956. data.assertInside(methodName, pointer, 32);
  1957. #endif
  1958. return U8x32::readAlignedUnsafe(pointer);
  1959. }
  1960. // Bound and alignment checked writing
  1961. inline void writeAligned(dsr::SafePointer<uint8_t> data, const char* methodName) const {
  1962. uint8_t* pointer = data.getUnsafe();
  1963. assert(((uintptr_t)pointer & 31) == 0);
  1964. #if defined SAFE_POINTER_CHECKS
  1965. data.assertInside(methodName, pointer, 32);
  1966. #endif
  1967. this->writeAlignedUnsafe(pointer);
  1968. }
  1969. };
1970. // Helper macros for applying an operation to certain sets of SIMD vector types.
1971. // Each macro expands DO(vector_type, element_type, lane_count) once per listed type.
  1972. #define FOR_ALL_VECTOR_TYPES(DO) \
  1973. DO(F32x4, float, 4) \
  1974. DO(I32x4, int32_t, 4) \
  1975. DO(U32x4, uint32_t, 4) \
  1976. DO(U16x8, uint16_t, 8) \
  1977. DO(U8x16, uint8_t, 16) \
  1978. DO(F32x8, float, 8) \
  1979. DO(I32x8, int32_t, 8) \
  1980. DO(U32x8, uint32_t, 8) \
  1981. DO(U16x16, uint16_t, 16) \
  1982. DO(U8x32, uint8_t, 32)
  1983. #define FOR_FLOAT_VECTOR_TYPES(DO) \
  1984. DO(F32x4, float, 4) \
  1985. DO(F32x8, float, 8)
  1986. #define FOR_INTEGER_VECTOR_TYPES(DO) \
  1987. DO(I32x4, int32_t, 4) \
  1988. DO(U32x4, uint32_t, 4) \
  1989. DO(U16x8, uint16_t, 8) \
  1990. DO(U8x16, uint8_t, 16) \
  1991. DO(I32x8, int32_t, 8) \
  1992. DO(U32x8, uint32_t, 8) \
  1993. DO(U16x16, uint16_t, 16) \
  1994. DO(U8x32, uint8_t, 32)
  1995. #define FOR_SIGNED_VECTOR_TYPES(DO) \
  1996. DO(F32x4, float, 4) \
  1997. DO(I32x4, int32_t, 4) \
  1998. DO(F32x8, float, 8) \
  1999. DO(I32x8, int32_t, 8)
  2000. #define FOR_UNSIGNED_VECTOR_TYPES(DO) \
  2001. DO(U32x4, uint32_t, 4) \
  2002. DO(U16x8, uint16_t, 8) \
  2003. DO(U8x16, uint8_t, 16) \
  2004. DO(U32x8, uint32_t, 8) \
  2005. DO(U16x16, uint16_t, 16) \
  2006. DO(U8x32, uint8_t, 32)
  2007. // Print SIMD vectors to the terminal or append them to strings.
  2008. #define CREATE_METHOD_PRINT(VECTOR_TYPE, ELEMENT_TYPE, LANE_COUNT) \
  2009. inline dsr::String& string_toStreamIndented(dsr::String& target, const VECTOR_TYPE& source, const dsr::ReadableString& indentation) { \
  2010. ALIGN_BYTES(sizeof(VECTOR_TYPE)) ELEMENT_TYPE a[LANE_COUNT]; \
  2011. source.writeAlignedUnsafe(a); \
  2012. dsr::string_append(target, indentation, a[0]); \
  2013. for (int i = 1; i < LANE_COUNT; i++) { \
  2014. string_append(target, U", ", a[i]); \
  2015. } \
  2016. return target; \
  2017. }
  2018. // All SIMD vectors can be printed.
  2019. FOR_ALL_VECTOR_TYPES(CREATE_METHOD_PRINT)
  2020. #undef CREATE_METHOD_PRINT
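// For reference, the macro above generates one overload per vector type, roughly equivalent to
// (sketch for F32x4, with lane values separated by ", "):
//   dsr::String& string_toStreamIndented(dsr::String& target, const F32x4& source, const dsr::ReadableString& indentation);
// so any SIMD vector can be appended to strings or printed through the usual dsr string utilities.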
2021. // Whole-vector comparisons returning a single boolean, mainly for regression tests.
  2022. #define CREATE_EXACT_EQUALITY(VECTOR_TYPE, ELEMENT_TYPE, LANE_COUNT) \
  2023. inline bool operator==(const VECTOR_TYPE& left, const VECTOR_TYPE& right) { \
  2024. ALIGN_BYTES(sizeof(VECTOR_TYPE)) ELEMENT_TYPE a[LANE_COUNT]; \
  2025. ALIGN_BYTES(sizeof(VECTOR_TYPE)) ELEMENT_TYPE b[LANE_COUNT]; \
  2026. left.writeAlignedUnsafe(a); \
  2027. right.writeAlignedUnsafe(b); \
  2028. for (int i = 0; i < LANE_COUNT; i++) { \
  2029. if (a[i] != b[i]) return false; \
  2030. } \
  2031. return true; \
  2032. }
2033. // Integer SIMD vectors have exact equality.
  2034. FOR_INTEGER_VECTOR_TYPES(CREATE_EXACT_EQUALITY)
  2035. #undef CREATE_EXACT_EQUALITY
  2036. #define CREATE_TOLERANT_EQUALITY(VECTOR_TYPE, ELEMENT_TYPE, LANE_COUNT) \
  2037. inline bool operator==(const VECTOR_TYPE& left, const VECTOR_TYPE& right) { \
  2038. ALIGN_BYTES(sizeof(VECTOR_TYPE)) ELEMENT_TYPE a[LANE_COUNT]; \
  2039. ALIGN_BYTES(sizeof(VECTOR_TYPE)) ELEMENT_TYPE b[LANE_COUNT]; \
  2040. left.writeAlignedUnsafe(a); \
  2041. right.writeAlignedUnsafe(b); \
  2042. for (int i = 0; i < LANE_COUNT; i++) { \
  2043. if (fabs(a[i] - b[i]) >= 0.0001f) return false; \
  2044. } \
  2045. return true; \
  2046. }
2047. // Float SIMD vectors use approximate equality with an absolute tolerance of 0.0001.
  2048. FOR_FLOAT_VECTOR_TYPES(CREATE_TOLERANT_EQUALITY)
  2049. #undef CREATE_TOLERANT_EQUALITY
  2050. #define CREATE_INEQUALITY(VECTOR_TYPE, ELEMENT_TYPE, LANE_COUNT) \
  2051. inline bool operator!=(const VECTOR_TYPE& left, const VECTOR_TYPE& right) { \
  2052. return !(left == right); \
  2053. }
  2054. // All SIMD vectors have inequality.
  2055. FOR_ALL_VECTOR_TYPES(CREATE_INEQUALITY)
  2056. #undef CREATE_INEQUALITY
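// Test sketch (comment only): the generated operators make regression checks one-liners.
// Exact comparison is used for integer vectors and the tolerance above for float vectors.
//   U32x4 expected(1u, 2u, 3u, 4u);
//   U32x4 produced = U32x4::createGradient(1u, 1u);
//   // expected == produced evaluates to true; expected != produced evaluates to false.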
  2057. inline F32x4 operator+(const F32x4& left, const F32x4& right) {
  2058. #if defined USE_BASIC_SIMD
  2059. return F32x4(ADD_F32_SIMD(left.v, right.v));
  2060. #else
  2061. return F32x4(left.scalars[0] + right.scalars[0], left.scalars[1] + right.scalars[1], left.scalars[2] + right.scalars[2], left.scalars[3] + right.scalars[3]);
  2062. #endif
  2063. }
  2064. inline F32x4 operator-(const F32x4& left, const F32x4& right) {
  2065. #if defined USE_BASIC_SIMD
  2066. return F32x4(SUB_F32_SIMD(left.v, right.v));
  2067. #else
  2068. return F32x4(left.scalars[0] - right.scalars[0], left.scalars[1] - right.scalars[1], left.scalars[2] - right.scalars[2], left.scalars[3] - right.scalars[3]);
  2069. #endif
  2070. }
  2071. inline F32x4 operator*(const F32x4& left, const F32x4& right) {
  2072. #if defined USE_BASIC_SIMD
  2073. return F32x4(MUL_F32_SIMD(left.v, right.v));
  2074. #else
  2075. return F32x4(left.scalars[0] * right.scalars[0], left.scalars[1] * right.scalars[1], left.scalars[2] * right.scalars[2], left.scalars[3] * right.scalars[3]);
  2076. #endif
  2077. }
  2078. inline F32x4 min(const F32x4& left, const F32x4& right) {
  2079. #if defined USE_BASIC_SIMD
  2080. return F32x4(MIN_F32_SIMD(left.v, right.v));
  2081. #else
  2082. float v0 = left.scalars[0];
  2083. float v1 = left.scalars[1];
  2084. float v2 = left.scalars[2];
  2085. float v3 = left.scalars[3];
  2086. float r0 = right.scalars[0];
  2087. float r1 = right.scalars[1];
  2088. float r2 = right.scalars[2];
  2089. float r3 = right.scalars[3];
  2090. if (r0 < v0) { v0 = r0; }
  2091. if (r1 < v1) { v1 = r1; }
  2092. if (r2 < v2) { v2 = r2; }
  2093. if (r3 < v3) { v3 = r3; }
  2094. return F32x4(v0, v1, v2, v3);
  2095. #endif
  2096. }
  2097. inline F32x4 max(const F32x4& left, const F32x4& right) {
  2098. #if defined USE_BASIC_SIMD
  2099. return F32x4(MAX_F32_SIMD(left.v, right.v));
  2100. #else
  2101. float v0 = left.scalars[0];
  2102. float v1 = left.scalars[1];
  2103. float v2 = left.scalars[2];
  2104. float v3 = left.scalars[3];
  2105. float r0 = right.scalars[0];
  2106. float r1 = right.scalars[1];
  2107. float r2 = right.scalars[2];
  2108. float r3 = right.scalars[3];
  2109. if (r0 > v0) { v0 = r0; }
  2110. if (r1 > v1) { v1 = r1; }
  2111. if (r2 > v2) { v2 = r2; }
  2112. if (r3 > v3) { v3 = r3; }
  2113. return F32x4(v0, v1, v2, v3);
  2114. #endif
  2115. }
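// Sketch (comment only): min and max compose into a clamp for F32x4, mirroring F32x8::clamp above.
// The name clampExample is hypothetical and not part of this header.
//   inline F32x4 clampExample(const F32x4& x, float lowest, float highest) {
//       return min(max(x, F32x4(lowest)), F32x4(highest));
//   }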
  2116. inline I32x4 operator+(const I32x4& left, const I32x4& right) {
  2117. #if defined USE_BASIC_SIMD
  2118. return I32x4(ADD_I32_SIMD(left.v, right.v));
  2119. #else
  2120. return I32x4(left.scalars[0] + right.scalars[0], left.scalars[1] + right.scalars[1], left.scalars[2] + right.scalars[2], left.scalars[3] + right.scalars[3]);
  2121. #endif
  2122. }
  2123. inline I32x4 operator-(const I32x4& left, const I32x4& right) {
  2124. #if defined USE_BASIC_SIMD
  2125. return I32x4(SUB_I32_SIMD(left.v, right.v));
  2126. #else
  2127. return I32x4(left.scalars[0] - right.scalars[0], left.scalars[1] - right.scalars[1], left.scalars[2] - right.scalars[2], left.scalars[3] - right.scalars[3]);
  2128. #endif
  2129. }
  2130. inline I32x4 operator*(const I32x4& left, const I32x4& right) {
  2131. #if defined USE_BASIC_SIMD
  2132. #if defined USE_SSE2
2133. // SSE2 has no packed 32-bit multiplication, so emulate the NEON instruction with scalar multiplications
  2134. return I32x4(left.scalars[0] * right.scalars[0], left.scalars[1] * right.scalars[1], left.scalars[2] * right.scalars[2], left.scalars[3] * right.scalars[3]);
  2135. #elif defined USE_NEON
  2136. return I32x4(MUL_I32_NEON(left.v, right.v));
  2137. #endif
  2138. #else
  2139. return I32x4(left.scalars[0] * right.scalars[0], left.scalars[1] * right.scalars[1], left.scalars[2] * right.scalars[2], left.scalars[3] * right.scalars[3]);
  2140. #endif
  2141. }
  2142. inline U32x4 operator+(const U32x4& left, const U32x4& right) {
  2143. #if defined USE_BASIC_SIMD
  2144. return U32x4(ADD_U32_SIMD(left.v, right.v));
  2145. #else
  2146. return U32x4(left.scalars[0] + right.scalars[0], left.scalars[1] + right.scalars[1], left.scalars[2] + right.scalars[2], left.scalars[3] + right.scalars[3]);
  2147. #endif
  2148. }
  2149. inline U32x4 operator-(const U32x4& left, const U32x4& right) {
  2150. #if defined USE_BASIC_SIMD
  2151. return U32x4(SUB_U32_SIMD(left.v, right.v));
  2152. #else
  2153. return U32x4(left.scalars[0] - right.scalars[0], left.scalars[1] - right.scalars[1], left.scalars[2] - right.scalars[2], left.scalars[3] - right.scalars[3]);
  2154. #endif
  2155. }
  2156. inline U32x4 operator*(const U32x4& left, const U32x4& right) {
  2157. #if defined USE_BASIC_SIMD
  2158. #if defined USE_SSE2
  2159. // Emulate a NEON instruction on SSE2 registers
  2160. return U32x4(left.scalars[0] * right.scalars[0], left.scalars[1] * right.scalars[1], left.scalars[2] * right.scalars[2], left.scalars[3] * right.scalars[3]);
  2161. #else // NEON
  2162. return U32x4(MUL_U32_NEON(left.v, right.v));
  2163. #endif
  2164. #else
  2165. return U32x4(left.scalars[0] * right.scalars[0], left.scalars[1] * right.scalars[1], left.scalars[2] * right.scalars[2], left.scalars[3] * right.scalars[3]);
  2166. #endif
  2167. }
  2168. inline U32x4 operator&(const U32x4& left, const U32x4& right) {
  2169. #if defined USE_BASIC_SIMD
  2170. return U32x4(BITWISE_AND_U32_SIMD(left.v, right.v));
  2171. #else
  2172. return U32x4(left.scalars[0] & right.scalars[0], left.scalars[1] & right.scalars[1], left.scalars[2] & right.scalars[2], left.scalars[3] & right.scalars[3]);
  2173. #endif
  2174. }
  2175. inline U32x4 operator|(const U32x4& left, const U32x4& right) {
  2176. #if defined USE_BASIC_SIMD
  2177. return U32x4(BITWISE_OR_U32_SIMD(left.v, right.v));
  2178. #else
  2179. return U32x4(left.scalars[0] | right.scalars[0], left.scalars[1] | right.scalars[1], left.scalars[2] | right.scalars[2], left.scalars[3] | right.scalars[3]);
  2180. #endif
  2181. }
  2182. inline U32x4 operator^(const U32x4& left, const U32x4& right) {
  2183. #if defined USE_BASIC_SIMD
  2184. return U32x4(BITWISE_XOR_U32_SIMD(left.v, right.v));
  2185. #else
  2186. return U32x4(left.scalars[0] ^ right.scalars[0], left.scalars[1] ^ right.scalars[1], left.scalars[2] ^ right.scalars[2], left.scalars[3] ^ right.scalars[3]);
  2187. #endif
  2188. }
  2189. inline U32x4 operator<<(const U32x4& left, uint32_t bitOffset) {
  2190. #if defined USE_SSE2
  2191. return U32x4(_mm_slli_epi32(left.v, bitOffset));
  2192. #else
  2193. #if defined USE_NEON
  2194. return U32x4(vshlq_u32(left.v, LOAD_SCALAR_I32_SIMD(bitOffset)));
  2195. #else
  2196. return U32x4(left.scalars[0] << bitOffset, left.scalars[1] << bitOffset, left.scalars[2] << bitOffset, left.scalars[3] << bitOffset);
  2197. #endif
  2198. #endif
  2199. }
  2200. inline U32x4 operator>>(const U32x4& left, uint32_t bitOffset) {
  2201. #if defined USE_SSE2
  2202. return U32x4(_mm_srli_epi32(left.v, bitOffset));
  2203. #else
  2204. #if defined USE_NEON
2205. return U32x4(vshlq_u32(left.v, LOAD_SCALAR_I32_SIMD(-(int32_t)bitOffset)));
  2206. #else
  2207. return U32x4(left.scalars[0] >> bitOffset, left.scalars[1] >> bitOffset, left.scalars[2] >> bitOffset, left.scalars[3] >> bitOffset);
  2208. #endif
  2209. #endif
  2210. }
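// Sketch (comment only): shifts and masks combine to extract packed 8-bit fields from 32-bit lanes.
// The constant below is purely illustrative and implies no particular pixel format in this library.
//   U32x4 pixels(0x11223344u);
//   U32x4 field = (pixels >> 8) & U32x4(0xFFu); // bits 8..15 of every lane, 0x33 here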
  2211. inline U16x8 operator+(const U16x8& left, const U16x8& right) {
  2212. #if defined USE_BASIC_SIMD
  2213. return U16x8(ADD_U16_SIMD(left.v, right.v));
  2214. #else
  2215. return U16x8(left.scalars[0] + right.scalars[0], left.scalars[1] + right.scalars[1], left.scalars[2] + right.scalars[2], left.scalars[3] + right.scalars[3],
  2216. left.scalars[4] + right.scalars[4], left.scalars[5] + right.scalars[5], left.scalars[6] + right.scalars[6], left.scalars[7] + right.scalars[7]);
  2217. #endif
  2218. }
  2219. inline U16x8 operator-(const U16x8& left, const U16x8& right) {
  2220. #if defined USE_BASIC_SIMD
  2221. return U16x8(SUB_U16_SIMD(left.v, right.v));
  2222. #else
  2223. return U16x8(left.scalars[0] - right.scalars[0], left.scalars[1] - right.scalars[1], left.scalars[2] - right.scalars[2], left.scalars[3] - right.scalars[3],
  2224. left.scalars[4] - right.scalars[4], left.scalars[5] - right.scalars[5], left.scalars[6] - right.scalars[6], left.scalars[7] - right.scalars[7]);
  2225. #endif
  2226. }
  2227. inline U16x8 operator*(const U16x8& left, const U16x8& right) {
  2228. #if defined USE_BASIC_SIMD
  2229. return U16x8(MUL_U16_SIMD(left.v, right.v));
  2230. #else
  2231. return U16x8(left.scalars[0] * right.scalars[0], left.scalars[1] * right.scalars[1], left.scalars[2] * right.scalars[2], left.scalars[3] * right.scalars[3],
  2232. left.scalars[4] * right.scalars[4], left.scalars[5] * right.scalars[5], left.scalars[6] * right.scalars[6], left.scalars[7] * right.scalars[7]);
  2233. #endif
  2234. }
  2235. inline U8x16 operator+(const U8x16& left, const U8x16& right) {
  2236. #if defined USE_BASIC_SIMD
  2237. return U8x16(ADD_U8_SIMD(left.v, right.v));
  2238. #else
  2239. return U8x16(
  2240. left.scalars[0] + right.scalars[0],
  2241. left.scalars[1] + right.scalars[1],
  2242. left.scalars[2] + right.scalars[2],
  2243. left.scalars[3] + right.scalars[3],
  2244. left.scalars[4] + right.scalars[4],
  2245. left.scalars[5] + right.scalars[5],
  2246. left.scalars[6] + right.scalars[6],
  2247. left.scalars[7] + right.scalars[7],
  2248. left.scalars[8] + right.scalars[8],
  2249. left.scalars[9] + right.scalars[9],
  2250. left.scalars[10] + right.scalars[10],
  2251. left.scalars[11] + right.scalars[11],
  2252. left.scalars[12] + right.scalars[12],
  2253. left.scalars[13] + right.scalars[13],
  2254. left.scalars[14] + right.scalars[14],
  2255. left.scalars[15] + right.scalars[15]
  2256. );
  2257. #endif
  2258. }
  2259. inline U8x16 operator-(const U8x16& left, const U8x16& right) {
  2260. #if defined USE_BASIC_SIMD
  2261. return U8x16(SUB_U8_SIMD(left.v, right.v));
  2262. #else
  2263. return U8x16(
  2264. left.scalars[0] - right.scalars[0],
  2265. left.scalars[1] - right.scalars[1],
  2266. left.scalars[2] - right.scalars[2],
  2267. left.scalars[3] - right.scalars[3],
  2268. left.scalars[4] - right.scalars[4],
  2269. left.scalars[5] - right.scalars[5],
  2270. left.scalars[6] - right.scalars[6],
  2271. left.scalars[7] - right.scalars[7],
  2272. left.scalars[8] - right.scalars[8],
  2273. left.scalars[9] - right.scalars[9],
  2274. left.scalars[10] - right.scalars[10],
  2275. left.scalars[11] - right.scalars[11],
  2276. left.scalars[12] - right.scalars[12],
  2277. left.scalars[13] - right.scalars[13],
  2278. left.scalars[14] - right.scalars[14],
  2279. left.scalars[15] - right.scalars[15]
  2280. );
  2281. #endif
  2282. }
  2283. inline uint8_t impl_limit0(int32_t x) { return x < 0 ? 0 : x; }
  2284. inline uint8_t impl_limit255(uint32_t x) { return x > 255 ? 255 : x; }
  2285. inline U8x16 saturatedAddition(const U8x16& left, const U8x16& right) {
  2286. #if defined USE_BASIC_SIMD
  2287. return U8x16(ADD_SAT_U8_SIMD(left.v, right.v));
  2288. #else
  2289. return U8x16(
  2290. impl_limit255((uint32_t)left.scalars[0] + (uint32_t)right.scalars[0]),
  2291. impl_limit255((uint32_t)left.scalars[1] + (uint32_t)right.scalars[1]),
  2292. impl_limit255((uint32_t)left.scalars[2] + (uint32_t)right.scalars[2]),
  2293. impl_limit255((uint32_t)left.scalars[3] + (uint32_t)right.scalars[3]),
  2294. impl_limit255((uint32_t)left.scalars[4] + (uint32_t)right.scalars[4]),
  2295. impl_limit255((uint32_t)left.scalars[5] + (uint32_t)right.scalars[5]),
  2296. impl_limit255((uint32_t)left.scalars[6] + (uint32_t)right.scalars[6]),
  2297. impl_limit255((uint32_t)left.scalars[7] + (uint32_t)right.scalars[7]),
  2298. impl_limit255((uint32_t)left.scalars[8] + (uint32_t)right.scalars[8]),
  2299. impl_limit255((uint32_t)left.scalars[9] + (uint32_t)right.scalars[9]),
  2300. impl_limit255((uint32_t)left.scalars[10] + (uint32_t)right.scalars[10]),
  2301. impl_limit255((uint32_t)left.scalars[11] + (uint32_t)right.scalars[11]),
  2302. impl_limit255((uint32_t)left.scalars[12] + (uint32_t)right.scalars[12]),
  2303. impl_limit255((uint32_t)left.scalars[13] + (uint32_t)right.scalars[13]),
  2304. impl_limit255((uint32_t)left.scalars[14] + (uint32_t)right.scalars[14]),
  2305. impl_limit255((uint32_t)left.scalars[15] + (uint32_t)right.scalars[15])
  2306. );
  2307. #endif
  2308. }
  2309. inline U8x16 saturatedSubtraction(const U8x16& left, const U8x16& right) {
  2310. #if defined USE_BASIC_SIMD
  2311. return U8x16(SUB_SAT_U8_SIMD(left.v, right.v));
  2312. #else
  2313. return U8x16(
  2314. impl_limit0((int32_t)left.scalars[0] - (int32_t)right.scalars[0]),
  2315. impl_limit0((int32_t)left.scalars[1] - (int32_t)right.scalars[1]),
  2316. impl_limit0((int32_t)left.scalars[2] - (int32_t)right.scalars[2]),
  2317. impl_limit0((int32_t)left.scalars[3] - (int32_t)right.scalars[3]),
  2318. impl_limit0((int32_t)left.scalars[4] - (int32_t)right.scalars[4]),
  2319. impl_limit0((int32_t)left.scalars[5] - (int32_t)right.scalars[5]),
  2320. impl_limit0((int32_t)left.scalars[6] - (int32_t)right.scalars[6]),
  2321. impl_limit0((int32_t)left.scalars[7] - (int32_t)right.scalars[7]),
  2322. impl_limit0((int32_t)left.scalars[8] - (int32_t)right.scalars[8]),
  2323. impl_limit0((int32_t)left.scalars[9] - (int32_t)right.scalars[9]),
  2324. impl_limit0((int32_t)left.scalars[10] - (int32_t)right.scalars[10]),
  2325. impl_limit0((int32_t)left.scalars[11] - (int32_t)right.scalars[11]),
  2326. impl_limit0((int32_t)left.scalars[12] - (int32_t)right.scalars[12]),
  2327. impl_limit0((int32_t)left.scalars[13] - (int32_t)right.scalars[13]),
  2328. impl_limit0((int32_t)left.scalars[14] - (int32_t)right.scalars[14]),
  2329. impl_limit0((int32_t)left.scalars[15] - (int32_t)right.scalars[15])
  2330. );
  2331. #endif
  2332. }
  2333. inline I32x4 truncateToI32(const F32x4& vector) {
  2334. #if defined USE_BASIC_SIMD
  2335. return I32x4(F32_TO_I32_SIMD(vector.v));
  2336. #else
  2337. return I32x4((int32_t)vector.scalars[0], (int32_t)vector.scalars[1], (int32_t)vector.scalars[2], (int32_t)vector.scalars[3]);
  2338. #endif
  2339. }
  2340. inline U32x4 truncateToU32(const F32x4& vector) {
  2341. #if defined USE_BASIC_SIMD
  2342. return U32x4(F32_TO_U32_SIMD(vector.v));
  2343. #else
  2344. return U32x4((uint32_t)vector.scalars[0], (uint32_t)vector.scalars[1], (uint32_t)vector.scalars[2], (uint32_t)vector.scalars[3]);
  2345. #endif
  2346. }
  2347. inline F32x4 floatFromI32(const I32x4& vector) {
  2348. #if defined USE_BASIC_SIMD
  2349. return F32x4(I32_TO_F32_SIMD(vector.v));
  2350. #else
  2351. return F32x4((float)vector.scalars[0], (float)vector.scalars[1], (float)vector.scalars[2], (float)vector.scalars[3]);
  2352. #endif
  2353. }
  2354. inline F32x4 floatFromU32(const U32x4& vector) {
  2355. #if defined USE_BASIC_SIMD
  2356. return F32x4(U32_TO_F32_SIMD(vector.v));
  2357. #else
  2358. return F32x4((float)vector.scalars[0], (float)vector.scalars[1], (float)vector.scalars[2], (float)vector.scalars[3]);
  2359. #endif
  2360. }
  2361. inline I32x4 I32FromU32(const U32x4& vector) {
  2362. #if defined USE_BASIC_SIMD
  2363. return I32x4(REINTERPRET_U32_TO_I32_SIMD(vector.v));
  2364. #else
  2365. return I32x4((int32_t)vector.scalars[0], (int32_t)vector.scalars[1], (int32_t)vector.scalars[2], (int32_t)vector.scalars[3]);
  2366. #endif
  2367. }
  2368. inline U32x4 U32FromI32(const I32x4& vector) {
  2369. #if defined USE_BASIC_SIMD
  2370. return U32x4(REINTERPRET_I32_TO_U32_SIMD(vector.v));
  2371. #else
  2372. return U32x4((uint32_t)vector.scalars[0], (uint32_t)vector.scalars[1], (uint32_t)vector.scalars[2], (uint32_t)vector.scalars[3]);
  2373. #endif
  2374. }
  2375. // Warning! Behavior depends on endianness.
  2376. inline U8x16 reinterpret_U8FromU32(const U32x4& vector) {
  2377. #if defined USE_BASIC_SIMD
  2378. return U8x16(REINTERPRET_U32_TO_U8_SIMD(vector.v));
  2379. #else
  2380. const uint8_t *source = (const uint8_t*)vector.scalars;
  2381. return U8x16(
  2382. source[0], source[1], source[2], source[3], source[4], source[5], source[6], source[7],
  2383. source[8], source[9], source[10], source[11], source[12], source[13], source[14], source[15]
  2384. );
  2385. #endif
  2386. }
  2387. // Warning! Behavior depends on endianness.
  2388. inline U32x4 reinterpret_U32FromU8(const U8x16& vector) {
  2389. #if defined USE_BASIC_SIMD
  2390. return U32x4(REINTERPRET_U8_TO_U32_SIMD(vector.v));
  2391. #else
  2392. const uint32_t *source = (const uint32_t*)vector.scalars;
  2393. return U32x4(source[0], source[1], source[2], source[3]);
  2394. #endif
  2395. }
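// The following sketch is not part of the original header; it is a hypothetical example that only
// illustrates the endianness warning above. The helper name and the byte values are invented.
inline void example_reinterpretEndianness() {
	U32x4 packed(0x04030201u, 0x08070605u, 0x0C0B0A09u, 0x100F0E0Du);
	U8x16 bytes = reinterpret_U8FromU32(packed);
	// On little-endian targets, bytes now holds 1, 2, 3, ..., 16 in order.
	// On big-endian targets, each group of four bytes appears reversed.
	U32x4 restored = reinterpret_U32FromU8(bytes);
	// Reinterpreting back restores the original lanes on either byte order,
	// because the same 16 bytes are only relabeled twice.
	(void)restored;
}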
  2396. // Unpacking to larger integers
  2397. inline U32x4 lowerToU32(const U16x8& vector) {
  2398. #if defined USE_BASIC_SIMD
  2399. return U32x4(U16_LOW_TO_U32_SIMD(vector.v));
  2400. #else
  2401. return U32x4(vector.scalars[0], vector.scalars[1], vector.scalars[2], vector.scalars[3]);
  2402. #endif
  2403. }
  2404. inline U32x4 higherToU32(const U16x8& vector) {
  2405. #if defined USE_BASIC_SIMD
  2406. return U32x4(U16_HIGH_TO_U32_SIMD(vector.v));
  2407. #else
  2408. return U32x4(vector.scalars[4], vector.scalars[5], vector.scalars[6], vector.scalars[7]);
  2409. #endif
  2410. }
  2411. inline U16x8 lowerToU16(const U8x16& vector) {
  2412. #if defined USE_BASIC_SIMD
  2413. return U16x8(U8_LOW_TO_U16_SIMD(vector.v));
  2414. #else
  2415. return U16x8(
  2416. vector.scalars[0], vector.scalars[1], vector.scalars[2], vector.scalars[3],
  2417. vector.scalars[4], vector.scalars[5], vector.scalars[6], vector.scalars[7]
  2418. );
  2419. #endif
  2420. }
  2421. inline U16x8 higherToU16(const U8x16& vector) {
  2422. #if defined USE_BASIC_SIMD
  2423. return U16x8(U8_HIGH_TO_U16_SIMD(vector.v));
  2424. #else
  2425. return U16x8(
  2426. vector.scalars[8], vector.scalars[9], vector.scalars[10], vector.scalars[11],
  2427. vector.scalars[12], vector.scalars[13], vector.scalars[14], vector.scalars[15]
  2428. );
  2429. #endif
  2430. }
  2431. // Saturated packing
  2432. inline U8x16 saturateToU8(const U16x8& lower, const U16x8& upper) {
  2433. #if defined USE_BASIC_SIMD
  2434. return U8x16(PACK_SAT_U16_TO_U8_SIMD(lower.v, upper.v));
  2435. #else
  2436. return U8x16(
  2437. impl_limit255(lower.scalars[0]),
  2438. impl_limit255(lower.scalars[1]),
  2439. impl_limit255(lower.scalars[2]),
  2440. impl_limit255(lower.scalars[3]),
  2441. impl_limit255(lower.scalars[4]),
  2442. impl_limit255(lower.scalars[5]),
  2443. impl_limit255(lower.scalars[6]),
  2444. impl_limit255(lower.scalars[7]),
  2445. impl_limit255(upper.scalars[0]),
  2446. impl_limit255(upper.scalars[1]),
  2447. impl_limit255(upper.scalars[2]),
  2448. impl_limit255(upper.scalars[3]),
  2449. impl_limit255(upper.scalars[4]),
  2450. impl_limit255(upper.scalars[5]),
  2451. impl_limit255(upper.scalars[6]),
  2452. impl_limit255(upper.scalars[7])
  2453. );
  2454. #endif
  2455. }
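// The following sketch is not part of the original header; it is a hypothetical helper showing how the
// unpacking and saturated packing above are meant to be combined: widen to 16 bits, do the arithmetic
// without overflow, then clamp back into the 0..255 range.
inline U8x16 example_doubleBrightness(const U8x16 &pixels) {
	U16x8 low = lowerToU16(pixels);   // elements 0..7 widened to 16 bits
	U16x8 high = higherToU16(pixels); // elements 8..15 widened to 16 bits
	U16x8 two(2, 2, 2, 2, 2, 2, 2, 2);
	// Products up to 510 are clamped to 255 when packing back.
	return saturateToU8(low * two, high * two);
}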
// Unary negation for convenience and code readability.
// Before using unary negation, always check whether:
// * the addition can be turned into a subtraction:
//     x = -a + b
//     x = b - a
// * a multiplying constant or scalar can be negated instead:
//     x = -b * 2
//     x = b * -2
inline F32x4 operator-(const F32x4& value) {
#if defined USE_BASIC_SIMD
return F32x4(0.0f) - value;
#else
return F32x4(-value.scalars[0], -value.scalars[1], -value.scalars[2], -value.scalars[3]);
#endif
}
inline I32x4 operator-(const I32x4& value) {
#if defined USE_BASIC_SIMD
return I32x4(0) - value;
#else
return I32x4(-value.scalars[0], -value.scalars[1], -value.scalars[2], -value.scalars[3]);
#endif
}
// Helper macros for generating the vector extract functions.
// Having one function for each type and offset makes sure that the compiler gets an immediate integer within the valid range.
#if defined USE_BASIC_SIMD
#if defined USE_SSE2
#if defined USE_SSSE3
#define _MM_ALIGNR_EPI8(A, B, OFFSET) _mm_alignr_epi8(A, B, OFFSET)
#else
// If SSSE3 is not used, emulate it using stack memory and unaligned reading of data.
static inline SIMD_U8x16 _MM_ALIGNR_EPI8(SIMD_U8x16 a, SIMD_U8x16 b, int offset) {
ALIGN16 uint8_t vectorBuffer[32];
_mm_store_si128((SIMD_U8x16*)(vectorBuffer), b);
_mm_store_si128((SIMD_U8x16*)(vectorBuffer + 16), a);
return _mm_loadu_si128((SIMD_U8x16*)(vectorBuffer + offset));
}
#endif
#define VECTOR_EXTRACT_GENERATOR_U8(OFFSET, FALLBACK_RESULT) return U8x16(_MM_ALIGNR_EPI8(b.v, a.v, OFFSET));
#define VECTOR_EXTRACT_GENERATOR_U16(OFFSET, FALLBACK_RESULT) return U16x8(_MM_ALIGNR_EPI8(b.v, a.v, OFFSET * 2));
#define VECTOR_EXTRACT_GENERATOR_U32(OFFSET, FALLBACK_RESULT) return U32x4(_MM_ALIGNR_EPI8(b.v, a.v, OFFSET * 4));
#define VECTOR_EXTRACT_GENERATOR_I32(OFFSET, FALLBACK_RESULT) return I32x4(_MM_ALIGNR_EPI8(b.v, a.v, OFFSET * 4));
#define VECTOR_EXTRACT_GENERATOR_F32(OFFSET, FALLBACK_RESULT) return F32x4(SIMD_F32x4(_MM_ALIGNR_EPI8(SIMD_U32x4(b.v), SIMD_U32x4(a.v), OFFSET * 4)));
#elif defined USE_NEON
#define VECTOR_EXTRACT_GENERATOR_U8(OFFSET, FALLBACK_RESULT) return U8x16(vextq_u8(a.v, b.v, OFFSET));
#define VECTOR_EXTRACT_GENERATOR_U16(OFFSET, FALLBACK_RESULT) return U16x8(vextq_u16(a.v, b.v, OFFSET));
#define VECTOR_EXTRACT_GENERATOR_U32(OFFSET, FALLBACK_RESULT) return U32x4(vextq_u32(a.v, b.v, OFFSET));
#define VECTOR_EXTRACT_GENERATOR_I32(OFFSET, FALLBACK_RESULT) return I32x4(vextq_s32(a.v, b.v, OFFSET));
#define VECTOR_EXTRACT_GENERATOR_F32(OFFSET, FALLBACK_RESULT) return F32x4(vextq_f32(a.v, b.v, OFFSET));
#endif
#else
#define VECTOR_EXTRACT_GENERATOR_U8(OFFSET, FALLBACK_RESULT) return FALLBACK_RESULT;
#define VECTOR_EXTRACT_GENERATOR_U16(OFFSET, FALLBACK_RESULT) return FALLBACK_RESULT;
#define VECTOR_EXTRACT_GENERATOR_U32(OFFSET, FALLBACK_RESULT) return FALLBACK_RESULT;
#define VECTOR_EXTRACT_GENERATOR_I32(OFFSET, FALLBACK_RESULT) return FALLBACK_RESULT;
#define VECTOR_EXTRACT_GENERATOR_F32(OFFSET, FALLBACK_RESULT) return FALLBACK_RESULT;
#endif
// Vector extraction concatenates two input vectors and reads a vector starting at the chosen element offset within the combined result.
// The first and last offsets, which simply return one of the inputs, can still be used for readability, because they will be inlined and removed by the compiler.
// To get elements from the right side, combine the center vector with the right vector and shift one element to the left using vectorExtract_1 for the given type.
// To get elements from the left side, combine the left vector with the center vector and shift one element to the right using vectorExtract_15 for 16 lanes, vectorExtract_7 for 8 lanes, or vectorExtract_3 for 4 lanes.
// A usage sketch follows the definitions below.
  2516. U8x16 inline vectorExtract_0(const U8x16 &a, const U8x16 &b) { return a; }
  2517. U8x16 inline vectorExtract_1(const U8x16 &a, const U8x16 &b) { VECTOR_EXTRACT_GENERATOR_U8(1, U8x16(a.scalars[1], a.scalars[2], a.scalars[3], a.scalars[4], a.scalars[5], a.scalars[6], a.scalars[7], a.scalars[8], a.scalars[9], a.scalars[10], a.scalars[11], a.scalars[12], a.scalars[13], a.scalars[14], a.scalars[15], b.scalars[0])) }
  2518. U8x16 inline vectorExtract_2(const U8x16 &a, const U8x16 &b) { VECTOR_EXTRACT_GENERATOR_U8(2, U8x16(a.scalars[2], a.scalars[3], a.scalars[4], a.scalars[5], a.scalars[6], a.scalars[7], a.scalars[8], a.scalars[9], a.scalars[10], a.scalars[11], a.scalars[12], a.scalars[13], a.scalars[14], a.scalars[15], b.scalars[0], b.scalars[1])) }
  2519. U8x16 inline vectorExtract_3(const U8x16 &a, const U8x16 &b) { VECTOR_EXTRACT_GENERATOR_U8(3, U8x16(a.scalars[3], a.scalars[4], a.scalars[5], a.scalars[6], a.scalars[7], a.scalars[8], a.scalars[9], a.scalars[10], a.scalars[11], a.scalars[12], a.scalars[13], a.scalars[14], a.scalars[15], b.scalars[0], b.scalars[1], b.scalars[2])) }
  2520. U8x16 inline vectorExtract_4(const U8x16 &a, const U8x16 &b) { VECTOR_EXTRACT_GENERATOR_U8(4, U8x16(a.scalars[4], a.scalars[5], a.scalars[6], a.scalars[7], a.scalars[8], a.scalars[9], a.scalars[10], a.scalars[11], a.scalars[12], a.scalars[13], a.scalars[14], a.scalars[15], b.scalars[0], b.scalars[1], b.scalars[2], b.scalars[3])) }
  2521. U8x16 inline vectorExtract_5(const U8x16 &a, const U8x16 &b) { VECTOR_EXTRACT_GENERATOR_U8(5, U8x16(a.scalars[5], a.scalars[6], a.scalars[7], a.scalars[8], a.scalars[9], a.scalars[10], a.scalars[11], a.scalars[12], a.scalars[13], a.scalars[14], a.scalars[15], b.scalars[0], b.scalars[1], b.scalars[2], b.scalars[3], b.scalars[4])) }
  2522. U8x16 inline vectorExtract_6(const U8x16 &a, const U8x16 &b) { VECTOR_EXTRACT_GENERATOR_U8(6, U8x16(a.scalars[6], a.scalars[7], a.scalars[8], a.scalars[9], a.scalars[10], a.scalars[11], a.scalars[12], a.scalars[13], a.scalars[14], a.scalars[15], b.scalars[0], b.scalars[1], b.scalars[2], b.scalars[3], b.scalars[4], b.scalars[5])) }
  2523. U8x16 inline vectorExtract_7(const U8x16 &a, const U8x16 &b) { VECTOR_EXTRACT_GENERATOR_U8(7, U8x16(a.scalars[7], a.scalars[8], a.scalars[9], a.scalars[10], a.scalars[11], a.scalars[12], a.scalars[13], a.scalars[14], a.scalars[15], b.scalars[0], b.scalars[1], b.scalars[2], b.scalars[3], b.scalars[4], b.scalars[5], b.scalars[6])) }
  2524. U8x16 inline vectorExtract_8(const U8x16 &a, const U8x16 &b) { VECTOR_EXTRACT_GENERATOR_U8(8, U8x16(a.scalars[8], a.scalars[9], a.scalars[10], a.scalars[11], a.scalars[12], a.scalars[13], a.scalars[14], a.scalars[15], b.scalars[0], b.scalars[1], b.scalars[2], b.scalars[3], b.scalars[4], b.scalars[5], b.scalars[6], b.scalars[7])) }
  2525. U8x16 inline vectorExtract_9(const U8x16 &a, const U8x16 &b) { VECTOR_EXTRACT_GENERATOR_U8(9, U8x16(a.scalars[9], a.scalars[10], a.scalars[11], a.scalars[12], a.scalars[13], a.scalars[14], a.scalars[15], b.scalars[0], b.scalars[1], b.scalars[2], b.scalars[3], b.scalars[4], b.scalars[5], b.scalars[6], b.scalars[7], b.scalars[8])) }
  2526. U8x16 inline vectorExtract_10(const U8x16 &a, const U8x16 &b) { VECTOR_EXTRACT_GENERATOR_U8(10, U8x16(a.scalars[10], a.scalars[11], a.scalars[12], a.scalars[13], a.scalars[14], a.scalars[15], b.scalars[0], b.scalars[1], b.scalars[2], b.scalars[3], b.scalars[4], b.scalars[5], b.scalars[6], b.scalars[7], b.scalars[8], b.scalars[9])) }
  2527. U8x16 inline vectorExtract_11(const U8x16 &a, const U8x16 &b) { VECTOR_EXTRACT_GENERATOR_U8(11, U8x16(a.scalars[11], a.scalars[12], a.scalars[13], a.scalars[14], a.scalars[15], b.scalars[0], b.scalars[1], b.scalars[2], b.scalars[3], b.scalars[4], b.scalars[5], b.scalars[6], b.scalars[7], b.scalars[8], b.scalars[9], b.scalars[10])) }
  2528. U8x16 inline vectorExtract_12(const U8x16 &a, const U8x16 &b) { VECTOR_EXTRACT_GENERATOR_U8(12, U8x16(a.scalars[12], a.scalars[13], a.scalars[14], a.scalars[15], b.scalars[0], b.scalars[1], b.scalars[2], b.scalars[3], b.scalars[4], b.scalars[5], b.scalars[6], b.scalars[7], b.scalars[8], b.scalars[9], b.scalars[10], b.scalars[11])) }
  2529. U8x16 inline vectorExtract_13(const U8x16 &a, const U8x16 &b) { VECTOR_EXTRACT_GENERATOR_U8(13, U8x16(a.scalars[13], a.scalars[14], a.scalars[15], b.scalars[0], b.scalars[1], b.scalars[2], b.scalars[3], b.scalars[4], b.scalars[5], b.scalars[6], b.scalars[7], b.scalars[8], b.scalars[9], b.scalars[10], b.scalars[11], b.scalars[12])) }
  2530. U8x16 inline vectorExtract_14(const U8x16 &a, const U8x16 &b) { VECTOR_EXTRACT_GENERATOR_U8(14, U8x16(a.scalars[14], a.scalars[15], b.scalars[0], b.scalars[1], b.scalars[2], b.scalars[3], b.scalars[4], b.scalars[5], b.scalars[6], b.scalars[7], b.scalars[8], b.scalars[9], b.scalars[10], b.scalars[11], b.scalars[12], b.scalars[13])) }
  2531. U8x16 inline vectorExtract_15(const U8x16 &a, const U8x16 &b) { VECTOR_EXTRACT_GENERATOR_U8(15, U8x16(a.scalars[15], b.scalars[0], b.scalars[1], b.scalars[2], b.scalars[3], b.scalars[4], b.scalars[5], b.scalars[6], b.scalars[7], b.scalars[8], b.scalars[9], b.scalars[10], b.scalars[11], b.scalars[12], b.scalars[13], b.scalars[14])) }
  2532. U8x16 inline vectorExtract_16(const U8x16 &a, const U8x16 &b) { return b; }
  2533. U16x8 inline vectorExtract_0(const U16x8 &a, const U16x8 &b) { return a; }
  2534. U16x8 inline vectorExtract_1(const U16x8 &a, const U16x8 &b) { VECTOR_EXTRACT_GENERATOR_U16(1, U16x8(a.scalars[1], a.scalars[2], a.scalars[3], a.scalars[4], a.scalars[5], a.scalars[6], a.scalars[7], b.scalars[0])) }
  2535. U16x8 inline vectorExtract_2(const U16x8 &a, const U16x8 &b) { VECTOR_EXTRACT_GENERATOR_U16(2, U16x8(a.scalars[2], a.scalars[3], a.scalars[4], a.scalars[5], a.scalars[6], a.scalars[7], b.scalars[0], b.scalars[1])) }
  2536. U16x8 inline vectorExtract_3(const U16x8 &a, const U16x8 &b) { VECTOR_EXTRACT_GENERATOR_U16(3, U16x8(a.scalars[3], a.scalars[4], a.scalars[5], a.scalars[6], a.scalars[7], b.scalars[0], b.scalars[1], b.scalars[2])) }
  2537. U16x8 inline vectorExtract_4(const U16x8 &a, const U16x8 &b) { VECTOR_EXTRACT_GENERATOR_U16(4, U16x8(a.scalars[4], a.scalars[5], a.scalars[6], a.scalars[7], b.scalars[0], b.scalars[1], b.scalars[2], b.scalars[3])) }
  2538. U16x8 inline vectorExtract_5(const U16x8 &a, const U16x8 &b) { VECTOR_EXTRACT_GENERATOR_U16(5, U16x8(a.scalars[5], a.scalars[6], a.scalars[7], b.scalars[0], b.scalars[1], b.scalars[2], b.scalars[3], b.scalars[4])) }
  2539. U16x8 inline vectorExtract_6(const U16x8 &a, const U16x8 &b) { VECTOR_EXTRACT_GENERATOR_U16(6, U16x8(a.scalars[6], a.scalars[7], b.scalars[0], b.scalars[1], b.scalars[2], b.scalars[3], b.scalars[4], b.scalars[5])) }
  2540. U16x8 inline vectorExtract_7(const U16x8 &a, const U16x8 &b) { VECTOR_EXTRACT_GENERATOR_U16(7, U16x8(a.scalars[7], b.scalars[0], b.scalars[1], b.scalars[2], b.scalars[3], b.scalars[4], b.scalars[5], b.scalars[6])) }
  2541. U16x8 inline vectorExtract_8(const U16x8 &a, const U16x8 &b) { return b; }
  2542. U32x4 inline vectorExtract_0(const U32x4 &a, const U32x4 &b) { return a; }
  2543. U32x4 inline vectorExtract_1(const U32x4 &a, const U32x4 &b) { VECTOR_EXTRACT_GENERATOR_U32(1, U32x4(a.scalars[1], a.scalars[2], a.scalars[3], b.scalars[0])) }
  2544. U32x4 inline vectorExtract_2(const U32x4 &a, const U32x4 &b) { VECTOR_EXTRACT_GENERATOR_U32(2, U32x4(a.scalars[2], a.scalars[3], b.scalars[0], b.scalars[1])) }
  2545. U32x4 inline vectorExtract_3(const U32x4 &a, const U32x4 &b) { VECTOR_EXTRACT_GENERATOR_U32(3, U32x4(a.scalars[3], b.scalars[0], b.scalars[1], b.scalars[2])) }
  2546. U32x4 inline vectorExtract_4(const U32x4 &a, const U32x4 &b) { return b; }
  2547. I32x4 inline vectorExtract_0(const I32x4 &a, const I32x4 &b) { return a; }
  2548. I32x4 inline vectorExtract_1(const I32x4 &a, const I32x4 &b) { VECTOR_EXTRACT_GENERATOR_I32(1, I32x4(a.scalars[1], a.scalars[2], a.scalars[3], b.scalars[0])) }
  2549. I32x4 inline vectorExtract_2(const I32x4 &a, const I32x4 &b) { VECTOR_EXTRACT_GENERATOR_I32(2, I32x4(a.scalars[2], a.scalars[3], b.scalars[0], b.scalars[1])) }
  2550. I32x4 inline vectorExtract_3(const I32x4 &a, const I32x4 &b) { VECTOR_EXTRACT_GENERATOR_I32(3, I32x4(a.scalars[3], b.scalars[0], b.scalars[1], b.scalars[2])) }
  2551. I32x4 inline vectorExtract_4(const I32x4 &a, const I32x4 &b) { return b; }
  2552. F32x4 inline vectorExtract_0(const F32x4 &a, const F32x4 &b) { return a; }
  2553. F32x4 inline vectorExtract_1(const F32x4 &a, const F32x4 &b) { VECTOR_EXTRACT_GENERATOR_F32(1, F32x4(a.scalars[1], a.scalars[2], a.scalars[3], b.scalars[0])) }
  2554. F32x4 inline vectorExtract_2(const F32x4 &a, const F32x4 &b) { VECTOR_EXTRACT_GENERATOR_F32(2, F32x4(a.scalars[2], a.scalars[3], b.scalars[0], b.scalars[1])) }
  2555. F32x4 inline vectorExtract_3(const F32x4 &a, const F32x4 &b) { VECTOR_EXTRACT_GENERATOR_F32(3, F32x4(a.scalars[3], b.scalars[0], b.scalars[1], b.scalars[2])) }
  2556. F32x4 inline vectorExtract_4(const F32x4 &a, const F32x4 &b) { return b; }
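// The following sketch is not part of the original header; it is a hypothetical helper showing the sliding
// window pattern described above, assuming the F32x4 arithmetic operators defined earlier in this header:
// vectorExtract_3 fetches each lane's left neighbor and vectorExtract_1 fetches each lane's right neighbor.
inline F32x4 example_boxBlur3(const F32x4 &leftSide, const F32x4 &center, const F32x4 &rightSide) {
	F32x4 previous = vectorExtract_3(leftSide, center); // center shifted one element to the right
	F32x4 next = vectorExtract_1(center, rightSide);    // center shifted one element to the left
	return (previous + center + next) * F32x4(1.0f / 3.0f);
}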
// Gather instructions load memory from a pointer at multiple index offsets at the same time.
// The given pointers should be aligned to 4 bytes, so that the fallback solution works on machines with strict alignment requirements.
#if defined USE_AVX2
#define GATHER_I32x4_AVX2(SOURCE, FOUR_OFFSETS, SCALE) _mm_i32gather_epi32((const int32_t*)(SOURCE), FOUR_OFFSETS, SCALE)
#define GATHER_U32x4_AVX2(SOURCE, FOUR_OFFSETS, SCALE) _mm_i32gather_epi32((const int32_t*)(SOURCE), FOUR_OFFSETS, SCALE)
#define GATHER_F32x4_AVX2(SOURCE, FOUR_OFFSETS, SCALE) _mm_i32gather_ps((const float*)(SOURCE), FOUR_OFFSETS, SCALE)
#endif
static inline U32x4 gather(const dsr::SafePointer<uint32_t> data, const U32x4 &elementOffset) {
#if defined USE_AVX2
// TODO: Implement safety checks for debug mode.
return U32x4(GATHER_U32x4_AVX2(data.getUnsafe(), elementOffset.v, 4));
#else
ALIGN16 uint32_t elementOffsets[4];
elementOffset.writeAlignedUnsafe(elementOffsets);
return U32x4(
*(data + elementOffsets[0]),
*(data + elementOffsets[1]),
*(data + elementOffsets[2]),
*(data + elementOffsets[3])
);
#endif
}
static inline I32x4 gather(const dsr::SafePointer<int32_t> data, const U32x4 &elementOffset) {
#if defined USE_AVX2
// TODO: Implement safety checks for debug mode.
return I32x4(GATHER_I32x4_AVX2(data.getUnsafe(), elementOffset.v, 4));
#else
ALIGN16 uint32_t elementOffsets[4];
elementOffset.writeAlignedUnsafe(elementOffsets);
return I32x4(
*(data + elementOffsets[0]),
*(data + elementOffsets[1]),
*(data + elementOffsets[2]),
*(data + elementOffsets[3])
);
#endif
}
static inline F32x4 gather(const dsr::SafePointer<float> data, const U32x4 &elementOffset) {
#if defined USE_AVX2
// TODO: Implement safety checks for debug mode.
return F32x4(GATHER_F32x4_AVX2(data.getUnsafe(), elementOffset.v, 4));
#else
ALIGN16 uint32_t elementOffsets[4];
elementOffset.writeAlignedUnsafe(elementOffsets);
return F32x4(
*(data + elementOffsets[0]),
*(data + elementOffsets[1]),
*(data + elementOffsets[2]),
*(data + elementOffsets[3])
);
#endif
}
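// The following sketch is not part of the original header; it is a hypothetical helper showing how gather
// is called: each lane i of the result becomes table[indices[i]], using one AVX2 gather instruction when
// available and four scalar loads otherwise.
inline U32x4 example_remapColors(const dsr::SafePointer<uint32_t> table, const U32x4 &indices) {
	return gather(table, indices);
}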
  2609. inline F32x8 operator+(const F32x8& left, const F32x8& right) {
  2610. #if defined USE_256BIT_F_SIMD
  2611. return F32x8(ADD_F32_SIMD256(left.v, right.v));
  2612. #else
  2613. return F32x8(
  2614. left.scalars[0] + right.scalars[0],
  2615. left.scalars[1] + right.scalars[1],
  2616. left.scalars[2] + right.scalars[2],
  2617. left.scalars[3] + right.scalars[3],
  2618. left.scalars[4] + right.scalars[4],
  2619. left.scalars[5] + right.scalars[5],
  2620. left.scalars[6] + right.scalars[6],
  2621. left.scalars[7] + right.scalars[7]
  2622. );
  2623. #endif
  2624. }
  2625. inline F32x8 operator-(const F32x8& left, const F32x8& right) {
  2626. #if defined USE_256BIT_F_SIMD
  2627. return F32x8(SUB_F32_SIMD256(left.v, right.v));
  2628. #else
  2629. return F32x8(
  2630. left.scalars[0] - right.scalars[0],
  2631. left.scalars[1] - right.scalars[1],
  2632. left.scalars[2] - right.scalars[2],
  2633. left.scalars[3] - right.scalars[3],
  2634. left.scalars[4] - right.scalars[4],
  2635. left.scalars[5] - right.scalars[5],
  2636. left.scalars[6] - right.scalars[6],
  2637. left.scalars[7] - right.scalars[7]
  2638. );
  2639. #endif
  2640. }
  2641. inline F32x8 operator*(const F32x8& left, const F32x8& right) {
  2642. #if defined USE_256BIT_F_SIMD
  2643. return F32x8(MUL_F32_SIMD256(left.v, right.v));
  2644. #else
  2645. return F32x8(
  2646. left.scalars[0] * right.scalars[0],
  2647. left.scalars[1] * right.scalars[1],
  2648. left.scalars[2] * right.scalars[2],
  2649. left.scalars[3] * right.scalars[3],
  2650. left.scalars[4] * right.scalars[4],
  2651. left.scalars[5] * right.scalars[5],
  2652. left.scalars[6] * right.scalars[6],
  2653. left.scalars[7] * right.scalars[7]
  2654. );
  2655. #endif
  2656. }
  2657. inline F32x8 min(const F32x8& left, const F32x8& right) {
  2658. #if defined USE_256BIT_F_SIMD
  2659. return F32x8(MIN_F32_SIMD256(left.v, right.v));
  2660. #else
  2661. float v0 = left.scalars[0];
  2662. float v1 = left.scalars[1];
  2663. float v2 = left.scalars[2];
  2664. float v3 = left.scalars[3];
  2665. float v4 = left.scalars[4];
  2666. float v5 = left.scalars[5];
  2667. float v6 = left.scalars[6];
  2668. float v7 = left.scalars[7];
  2669. float r0 = right.scalars[0];
  2670. float r1 = right.scalars[1];
  2671. float r2 = right.scalars[2];
  2672. float r3 = right.scalars[3];
  2673. float r4 = right.scalars[4];
  2674. float r5 = right.scalars[5];
  2675. float r6 = right.scalars[6];
  2676. float r7 = right.scalars[7];
  2677. if (r0 < v0) { v0 = r0; }
  2678. if (r1 < v1) { v1 = r1; }
  2679. if (r2 < v2) { v2 = r2; }
  2680. if (r3 < v3) { v3 = r3; }
  2681. if (r4 < v4) { v4 = r4; }
  2682. if (r5 < v5) { v5 = r5; }
  2683. if (r6 < v6) { v6 = r6; }
  2684. if (r7 < v7) { v7 = r7; }
  2685. return F32x8(v0, v1, v2, v3, v4, v5, v6, v7);
  2686. #endif
  2687. }
  2688. inline F32x8 max(const F32x8& left, const F32x8& right) {
  2689. #if defined USE_256BIT_F_SIMD
  2690. return F32x8(MAX_F32_SIMD256(left.v, right.v));
  2691. #else
  2692. float v0 = left.scalars[0];
  2693. float v1 = left.scalars[1];
  2694. float v2 = left.scalars[2];
  2695. float v3 = left.scalars[3];
  2696. float v4 = left.scalars[4];
  2697. float v5 = left.scalars[5];
  2698. float v6 = left.scalars[6];
  2699. float v7 = left.scalars[7];
  2700. float r0 = right.scalars[0];
  2701. float r1 = right.scalars[1];
  2702. float r2 = right.scalars[2];
  2703. float r3 = right.scalars[3];
  2704. float r4 = right.scalars[4];
  2705. float r5 = right.scalars[5];
  2706. float r6 = right.scalars[6];
  2707. float r7 = right.scalars[7];
  2708. if (r0 > v0) { v0 = r0; }
  2709. if (r1 > v1) { v1 = r1; }
  2710. if (r2 > v2) { v2 = r2; }
  2711. if (r3 > v3) { v3 = r3; }
  2712. if (r4 > v4) { v4 = r4; }
  2713. if (r5 > v5) { v5 = r5; }
  2714. if (r6 > v6) { v6 = r6; }
  2715. if (r7 > v7) { v7 = r7; }
  2716. return F32x8(v0, v1, v2, v3, v4, v5, v6, v7);
  2717. #endif
  2718. }
  2719. inline I32x8 operator+(const I32x8& left, const I32x8& right) {
  2720. #if defined USE_256BIT_X_SIMD
  2721. return I32x8(ADD_I32_SIMD256(left.v, right.v));
  2722. #else
  2723. return I32x8(
  2724. left.scalars[0] + right.scalars[0],
  2725. left.scalars[1] + right.scalars[1],
  2726. left.scalars[2] + right.scalars[2],
  2727. left.scalars[3] + right.scalars[3],
  2728. left.scalars[4] + right.scalars[4],
  2729. left.scalars[5] + right.scalars[5],
  2730. left.scalars[6] + right.scalars[6],
  2731. left.scalars[7] + right.scalars[7]);
  2732. #endif
  2733. }
  2734. inline I32x8 operator-(const I32x8& left, const I32x8& right) {
  2735. #if defined USE_256BIT_X_SIMD
  2736. return I32x8(SUB_I32_SIMD256(left.v, right.v));
  2737. #else
  2738. return I32x8(
  2739. left.scalars[0] - right.scalars[0],
  2740. left.scalars[1] - right.scalars[1],
  2741. left.scalars[2] - right.scalars[2],
  2742. left.scalars[3] - right.scalars[3],
  2743. left.scalars[4] - right.scalars[4],
  2744. left.scalars[5] - right.scalars[5],
  2745. left.scalars[6] - right.scalars[6],
  2746. left.scalars[7] - right.scalars[7]);
  2747. #endif
  2748. }
  2749. inline I32x8 operator*(const I32x8& left, const I32x8& right) {
  2750. #if defined USE_AVX2
  2751. return I32x8(MUL_I32_SIMD256(left.v, right.v));
  2752. #else
  2753. return I32x8(
  2754. left.scalars[0] * right.scalars[0],
  2755. left.scalars[1] * right.scalars[1],
  2756. left.scalars[2] * right.scalars[2],
  2757. left.scalars[3] * right.scalars[3],
  2758. left.scalars[4] * right.scalars[4],
  2759. left.scalars[5] * right.scalars[5],
  2760. left.scalars[6] * right.scalars[6],
  2761. left.scalars[7] * right.scalars[7]
  2762. );
  2763. #endif
  2764. }
  2765. inline U32x8 operator+(const U32x8& left, const U32x8& right) {
  2766. #if defined USE_256BIT_X_SIMD
  2767. return U32x8(ADD_U32_SIMD256(left.v, right.v));
  2768. #else
  2769. return U32x8(
  2770. left.scalars[0] + right.scalars[0],
  2771. left.scalars[1] + right.scalars[1],
  2772. left.scalars[2] + right.scalars[2],
  2773. left.scalars[3] + right.scalars[3],
  2774. left.scalars[4] + right.scalars[4],
  2775. left.scalars[5] + right.scalars[5],
  2776. left.scalars[6] + right.scalars[6],
  2777. left.scalars[7] + right.scalars[7]
  2778. );
  2779. #endif
  2780. }
  2781. inline U32x8 operator-(const U32x8& left, const U32x8& right) {
  2782. #if defined USE_256BIT_X_SIMD
  2783. return U32x8(SUB_U32_SIMD256(left.v, right.v));
  2784. #else
  2785. return U32x8(
  2786. left.scalars[0] - right.scalars[0],
  2787. left.scalars[1] - right.scalars[1],
  2788. left.scalars[2] - right.scalars[2],
  2789. left.scalars[3] - right.scalars[3],
  2790. left.scalars[4] - right.scalars[4],
  2791. left.scalars[5] - right.scalars[5],
  2792. left.scalars[6] - right.scalars[6],
  2793. left.scalars[7] - right.scalars[7]
  2794. );
  2795. #endif
  2796. }
  2797. inline U32x8 operator*(const U32x8& left, const U32x8& right) {
  2798. #if defined USE_AVX2
  2799. return U32x8(MUL_U32_SIMD256(left.v, right.v));
  2800. #else
  2801. return U32x8(
  2802. left.scalars[0] * right.scalars[0],
  2803. left.scalars[1] * right.scalars[1],
  2804. left.scalars[2] * right.scalars[2],
  2805. left.scalars[3] * right.scalars[3],
  2806. left.scalars[4] * right.scalars[4],
  2807. left.scalars[5] * right.scalars[5],
  2808. left.scalars[6] * right.scalars[6],
  2809. left.scalars[7] * right.scalars[7]
  2810. );
  2811. #endif
  2812. }
  2813. inline U32x8 operator&(const U32x8& left, const U32x8& right) {
  2814. #if defined USE_256BIT_X_SIMD
  2815. return U32x8(BITWISE_AND_U32_SIMD256(left.v, right.v));
  2816. #else
  2817. return U32x8(
  2818. left.scalars[0] & right.scalars[0],
  2819. left.scalars[1] & right.scalars[1],
  2820. left.scalars[2] & right.scalars[2],
  2821. left.scalars[3] & right.scalars[3],
  2822. left.scalars[4] & right.scalars[4],
  2823. left.scalars[5] & right.scalars[5],
  2824. left.scalars[6] & right.scalars[6],
  2825. left.scalars[7] & right.scalars[7]
  2826. );
  2827. #endif
  2828. }
  2829. inline U32x8 operator|(const U32x8& left, const U32x8& right) {
  2830. #if defined USE_256BIT_X_SIMD
  2831. return U32x8(BITWISE_OR_U32_SIMD256(left.v, right.v));
  2832. #else
  2833. return U32x8(
  2834. left.scalars[0] | right.scalars[0],
  2835. left.scalars[1] | right.scalars[1],
  2836. left.scalars[2] | right.scalars[2],
  2837. left.scalars[3] | right.scalars[3],
  2838. left.scalars[4] | right.scalars[4],
  2839. left.scalars[5] | right.scalars[5],
  2840. left.scalars[6] | right.scalars[6],
  2841. left.scalars[7] | right.scalars[7]
  2842. );
  2843. #endif
  2844. }
  2845. inline U32x8 operator^(const U32x8& left, const U32x8& right) {
  2846. #if defined USE_256BIT_X_SIMD
  2847. return U32x8(BITWISE_XOR_U32_SIMD256(left.v, right.v));
  2848. #else
  2849. return U32x8(
  2850. left.scalars[0] ^ right.scalars[0],
  2851. left.scalars[1] ^ right.scalars[1],
  2852. left.scalars[2] ^ right.scalars[2],
  2853. left.scalars[3] ^ right.scalars[3],
  2854. left.scalars[4] ^ right.scalars[4],
  2855. left.scalars[5] ^ right.scalars[5],
  2856. left.scalars[6] ^ right.scalars[6],
  2857. left.scalars[7] ^ right.scalars[7]
  2858. );
  2859. #endif
  2860. }
  2861. inline U32x8 operator<<(const U32x8& left, uint32_t bitOffset) {
  2862. #if defined USE_AVX2
  2863. return U32x8(_mm256_slli_epi32(left.v, bitOffset));
  2864. #else
  2865. return U32x8(
  2866. left.scalars[0] << bitOffset,
  2867. left.scalars[1] << bitOffset,
  2868. left.scalars[2] << bitOffset,
  2869. left.scalars[3] << bitOffset,
  2870. left.scalars[4] << bitOffset,
  2871. left.scalars[5] << bitOffset,
  2872. left.scalars[6] << bitOffset,
  2873. left.scalars[7] << bitOffset
  2874. );
  2875. #endif
  2876. }
  2877. inline U32x8 operator>>(const U32x8& left, uint32_t bitOffset) {
  2878. #if defined USE_AVX2
  2879. return U32x8(_mm256_srli_epi32(left.v, bitOffset));
  2880. #else
  2881. return U32x8(
  2882. left.scalars[0] >> bitOffset,
  2883. left.scalars[1] >> bitOffset,
  2884. left.scalars[2] >> bitOffset,
  2885. left.scalars[3] >> bitOffset,
  2886. left.scalars[4] >> bitOffset,
  2887. left.scalars[5] >> bitOffset,
  2888. left.scalars[6] >> bitOffset,
  2889. left.scalars[7] >> bitOffset
  2890. );
  2891. #endif
  2892. }
  2893. inline U16x16 operator+(const U16x16& left, const U16x16& right) {
  2894. #if defined USE_256BIT_X_SIMD
  2895. return U16x16(ADD_U16_SIMD256(left.v, right.v));
  2896. #else
  2897. return U16x16(
  2898. left.scalars[0] + right.scalars[0],
  2899. left.scalars[1] + right.scalars[1],
  2900. left.scalars[2] + right.scalars[2],
  2901. left.scalars[3] + right.scalars[3],
  2902. left.scalars[4] + right.scalars[4],
  2903. left.scalars[5] + right.scalars[5],
  2904. left.scalars[6] + right.scalars[6],
  2905. left.scalars[7] + right.scalars[7],
  2906. left.scalars[8] + right.scalars[8],
  2907. left.scalars[9] + right.scalars[9],
  2908. left.scalars[10] + right.scalars[10],
  2909. left.scalars[11] + right.scalars[11],
  2910. left.scalars[12] + right.scalars[12],
  2911. left.scalars[13] + right.scalars[13],
  2912. left.scalars[14] + right.scalars[14],
  2913. left.scalars[15] + right.scalars[15]
  2914. );
  2915. #endif
  2916. }
  2917. inline U16x16 operator-(const U16x16& left, const U16x16& right) {
  2918. #if defined USE_256BIT_X_SIMD
  2919. return U16x16(SUB_U16_SIMD256(left.v, right.v));
  2920. #else
  2921. return U16x16(
  2922. left.scalars[0] - right.scalars[0],
  2923. left.scalars[1] - right.scalars[1],
  2924. left.scalars[2] - right.scalars[2],
  2925. left.scalars[3] - right.scalars[3],
  2926. left.scalars[4] - right.scalars[4],
  2927. left.scalars[5] - right.scalars[5],
  2928. left.scalars[6] - right.scalars[6],
  2929. left.scalars[7] - right.scalars[7],
  2930. left.scalars[8] - right.scalars[8],
  2931. left.scalars[9] - right.scalars[9],
  2932. left.scalars[10] - right.scalars[10],
  2933. left.scalars[11] - right.scalars[11],
  2934. left.scalars[12] - right.scalars[12],
  2935. left.scalars[13] - right.scalars[13],
  2936. left.scalars[14] - right.scalars[14],
  2937. left.scalars[15] - right.scalars[15]
  2938. );
  2939. #endif
  2940. }
  2941. inline U16x16 operator*(const U16x16& left, const U16x16& right) {
  2942. #if defined USE_256BIT_X_SIMD
  2943. return U16x16(MUL_U16_SIMD256(left.v, right.v));
  2944. #else
  2945. return U16x16(
  2946. left.scalars[0] * right.scalars[0],
  2947. left.scalars[1] * right.scalars[1],
  2948. left.scalars[2] * right.scalars[2],
  2949. left.scalars[3] * right.scalars[3],
  2950. left.scalars[4] * right.scalars[4],
  2951. left.scalars[5] * right.scalars[5],
  2952. left.scalars[6] * right.scalars[6],
  2953. left.scalars[7] * right.scalars[7],
  2954. left.scalars[8] * right.scalars[8],
  2955. left.scalars[9] * right.scalars[9],
  2956. left.scalars[10] * right.scalars[10],
  2957. left.scalars[11] * right.scalars[11],
  2958. left.scalars[12] * right.scalars[12],
  2959. left.scalars[13] * right.scalars[13],
  2960. left.scalars[14] * right.scalars[14],
  2961. left.scalars[15] * right.scalars[15]
  2962. );
  2963. #endif
  2964. }
  2965. inline U8x32 operator+(const U8x32& left, const U8x32& right) {
  2966. #if defined USE_256BIT_X_SIMD
  2967. return U8x32(ADD_U8_SIMD256(left.v, right.v));
  2968. #else
  2969. U8x32 result = U8x32::create_dangerous_uninitialized();
  2970. for (int i = 0; i < 32; i++) {
  2971. result.scalars[i] = left.scalars[i] + right.scalars[i];
  2972. }
  2973. return result;
  2974. #endif
  2975. }
  2976. inline U8x32 operator-(const U8x32& left, const U8x32& right) {
  2977. #if defined USE_256BIT_X_SIMD
  2978. return U8x32(SUB_U8_SIMD256(left.v, right.v));
  2979. #else
  2980. U8x32 result = U8x32::create_dangerous_uninitialized();
  2981. for (int i = 0; i < 32; i++) {
  2982. result.scalars[i] = left.scalars[i] - right.scalars[i];
  2983. }
  2984. return result;
  2985. #endif
  2986. }
  2987. inline U8x32 saturatedAddition(const U8x32& left, const U8x32& right) {
  2988. #if defined USE_256BIT_X_SIMD
  2989. return U8x32(ADD_SAT_U8_SIMD256(left.v, right.v));
  2990. #else
  2991. U8x32 result = U8x32::create_dangerous_uninitialized();
  2992. for (int i = 0; i < 32; i++) {
  2993. result.scalars[i] = impl_limit255((uint32_t)left.scalars[i] + (uint32_t)right.scalars[i]);
  2994. }
  2995. return result;
  2996. #endif
  2997. }
  2998. inline U8x32 saturatedSubtraction(const U8x32& left, const U8x32& right) {
  2999. #if defined USE_256BIT_X_SIMD
  3000. return U8x32(SUB_SAT_U8_SIMD256(left.v, right.v));
  3001. #else
  3002. U8x32 result = U8x32::create_dangerous_uninitialized();
  3003. for (int i = 0; i < 32; i++) {
  3004. result.scalars[i] = impl_limit0((int32_t)left.scalars[i] - (int32_t)right.scalars[i]);
  3005. }
  3006. return result;
  3007. #endif
  3008. }
  3009. inline I32x8 truncateToI32(const F32x8& vector) {
  3010. #if defined USE_256BIT_X_SIMD
  3011. return I32x8(F32_TO_I32_SIMD256(vector.v));
  3012. #else
  3013. return I32x8(
  3014. (int32_t)vector.scalars[0], (int32_t)vector.scalars[1], (int32_t)vector.scalars[2], (int32_t)vector.scalars[3],
  3015. (int32_t)vector.scalars[4], (int32_t)vector.scalars[5], (int32_t)vector.scalars[6], (int32_t)vector.scalars[7]
  3016. );
  3017. #endif
  3018. }
  3019. inline U32x8 truncateToU32(const F32x8& vector) {
  3020. #if defined USE_256BIT_X_SIMD
  3021. return U32x8(F32_TO_U32_SIMD256(vector.v));
  3022. #else
  3023. return U32x8(
  3024. (uint32_t)vector.scalars[0], (uint32_t)vector.scalars[1], (uint32_t)vector.scalars[2], (uint32_t)vector.scalars[3],
  3025. (uint32_t)vector.scalars[4], (uint32_t)vector.scalars[5], (uint32_t)vector.scalars[6], (uint32_t)vector.scalars[7]
  3026. );
  3027. #endif
  3028. }
  3029. inline F32x8 floatFromI32(const I32x8& vector) {
  3030. #if defined USE_256BIT_X_SIMD
  3031. return F32x8(I32_TO_F32_SIMD256(vector.v));
  3032. #else
  3033. return F32x8(
  3034. (float)vector.scalars[0], (float)vector.scalars[1], (float)vector.scalars[2], (float)vector.scalars[3],
  3035. (float)vector.scalars[4], (float)vector.scalars[5], (float)vector.scalars[6], (float)vector.scalars[7]
  3036. );
  3037. #endif
  3038. }
  3039. inline F32x8 floatFromU32(const U32x8& vector) {
  3040. #if defined USE_256BIT_X_SIMD
  3041. return F32x8(U32_TO_F32_SIMD256(vector.v));
  3042. #else
  3043. return F32x8(
  3044. (float)vector.scalars[0], (float)vector.scalars[1], (float)vector.scalars[2], (float)vector.scalars[3],
  3045. (float)vector.scalars[4], (float)vector.scalars[5], (float)vector.scalars[6], (float)vector.scalars[7]
  3046. );
  3047. #endif
  3048. }
  3049. inline I32x8 I32FromU32(const U32x8& vector) {
  3050. #if defined USE_256BIT_X_SIMD
  3051. return I32x8(REINTERPRET_U32_TO_I32_SIMD256(vector.v));
  3052. #else
  3053. return I32x8(
  3054. (int32_t)vector.scalars[0], (int32_t)vector.scalars[1], (int32_t)vector.scalars[2], (int32_t)vector.scalars[3],
  3055. (int32_t)vector.scalars[4], (int32_t)vector.scalars[5], (int32_t)vector.scalars[6], (int32_t)vector.scalars[7]
  3056. );
  3057. #endif
  3058. }
  3059. inline U32x8 U32FromI32(const I32x8& vector) {
  3060. #if defined USE_256BIT_X_SIMD
  3061. return U32x8(REINTERPRET_I32_TO_U32_SIMD256(vector.v));
  3062. #else
  3063. return U32x8(
  3064. (uint32_t)vector.scalars[0], (uint32_t)vector.scalars[1], (uint32_t)vector.scalars[2], (uint32_t)vector.scalars[3],
  3065. (uint32_t)vector.scalars[4], (uint32_t)vector.scalars[5], (uint32_t)vector.scalars[6], (uint32_t)vector.scalars[7]
  3066. );
  3067. #endif
  3068. }
  3069. // Warning! Behavior depends on endianness.
  3070. inline U8x32 reinterpret_U8FromU32(const U32x8& vector) {
  3071. #if defined USE_256BIT_X_SIMD
  3072. return U8x32(REINTERPRET_U32_TO_U8_SIMD256(vector.v));
  3073. #else
  3074. const uint8_t *source = (const uint8_t*)vector.scalars;
  3075. U8x32 result = U8x32::create_dangerous_uninitialized();
  3076. for (int i = 0; i < 32; i++) {
  3077. result.scalars[i] = source[i];
  3078. }
  3079. return result;
  3080. #endif
  3081. }
  3082. // Warning! Behavior depends on endianness.
  3083. inline U32x8 reinterpret_U32FromU8(const U8x32& vector) {
  3084. #if defined USE_256BIT_X_SIMD
  3085. return U32x8(REINTERPRET_U8_TO_U32_SIMD256(vector.v));
  3086. #else
  3087. const uint32_t *source = (const uint32_t*)vector.scalars;
  3088. return U32x8(source[0], source[1], source[2], source[3], source[4], source[5], source[6], source[7]);
  3089. #endif
  3090. }
  3091. // Unpacking to larger integers
  3092. inline U32x8 lowerToU32(const U16x16& vector) {
  3093. #if defined USE_256BIT_X_SIMD
  3094. return U32x8(U16_LOW_TO_U32_SIMD256(vector.v));
  3095. #else
  3096. return U32x8(vector.scalars[0], vector.scalars[1], vector.scalars[2], vector.scalars[3], vector.scalars[4], vector.scalars[5], vector.scalars[6], vector.scalars[7]);
  3097. #endif
  3098. }
  3099. inline U32x8 higherToU32(const U16x16& vector) {
  3100. #if defined USE_256BIT_X_SIMD
  3101. return U32x8(U16_HIGH_TO_U32_SIMD256(vector.v));
  3102. #else
  3103. return U32x8(vector.scalars[8], vector.scalars[9], vector.scalars[10], vector.scalars[11], vector.scalars[12], vector.scalars[13], vector.scalars[14], vector.scalars[15]);
  3104. #endif
  3105. }
  3106. inline U16x16 lowerToU16(const U8x32& vector) {
  3107. #if defined USE_256BIT_X_SIMD
  3108. return U16x16(U8_LOW_TO_U16_SIMD256(vector.v));
  3109. #else
  3110. return U16x16(
  3111. vector.scalars[0], vector.scalars[1], vector.scalars[2], vector.scalars[3],
  3112. vector.scalars[4], vector.scalars[5], vector.scalars[6], vector.scalars[7],
  3113. vector.scalars[8], vector.scalars[9], vector.scalars[10], vector.scalars[11],
  3114. vector.scalars[12], vector.scalars[13], vector.scalars[14], vector.scalars[15]
  3115. );
  3116. #endif
  3117. }
  3118. inline U16x16 higherToU16(const U8x32& vector) {
  3119. #if defined USE_256BIT_X_SIMD
  3120. return U16x16(U8_HIGH_TO_U16_SIMD256(vector.v));
  3121. #else
  3122. return U16x16(
  3123. vector.scalars[16], vector.scalars[17], vector.scalars[18], vector.scalars[19],
  3124. vector.scalars[20], vector.scalars[21], vector.scalars[22], vector.scalars[23],
  3125. vector.scalars[24], vector.scalars[25], vector.scalars[26], vector.scalars[27],
  3126. vector.scalars[28], vector.scalars[29], vector.scalars[30], vector.scalars[31]
  3127. );
  3128. #endif
  3129. }
  3130. // Saturated packing
  3131. inline U8x32 saturateToU8(const U16x16& lower, const U16x16& upper) {
  3132. #if defined USE_256BIT_X_SIMD
  3133. return U8x32(PACK_SAT_U16_TO_U8_SIMD256(lower.v, upper.v));
  3134. #else
  3135. U8x32 result = U8x32::create_dangerous_uninitialized();
  3136. for (int i = 0; i < 16; i++) {
  3137. result.scalars[i] = impl_limit255(lower.scalars[i]);
  3138. }
  3139. for (int i = 0; i < 16; i++) {
  3140. result.scalars[i + 16] = impl_limit255(upper.scalars[i]);
  3141. }
  3142. return result;
  3143. #endif
  3144. }
// Unary negation for convenience and code readability.
// Before using unary negation, always check whether:
// * the addition can be turned into a subtraction:
//     x = -a + b
//     x = b - a
// * a multiplying constant or scalar can be negated instead:
//     x = -b * 2
//     x = b * -2
inline F32x8 operator-(const F32x8& value) {
#if defined USE_256BIT_F_SIMD
return F32x8(0.0f) - value;
#else
return F32x8(
-value.scalars[0], -value.scalars[1], -value.scalars[2], -value.scalars[3],
-value.scalars[4], -value.scalars[5], -value.scalars[6], -value.scalars[7]
);
#endif
}
inline I32x8 operator-(const I32x8& value) {
#if defined USE_256BIT_X_SIMD
return I32x8(0) - value;
#else
return I32x8(
-value.scalars[0], -value.scalars[1], -value.scalars[2], -value.scalars[3],
-value.scalars[4], -value.scalars[5], -value.scalars[6], -value.scalars[7]
);
#endif
}
// Helper macros for generating the vector extract functions.
// Having one function for each type and offset makes sure that the compiler gets an immediate integer within the valid range.
#if defined USE_AVX2
// AVX2 does not offer any 256-bit element extraction, only two 128-bit shifts done in parallel, so we might as well use two separate 128-bit extractions.
template <int OFFSET>
__m256i impl_extractBytes_AVX2(const __m256i leftInput, const __m256i rightInput) {
// Extract three halves depending on which ones overlap with the offset.
__m128i leftPart = _mm256_extractf128_si256(leftInput, (OFFSET < 16) ? 0 : 1);
__m128i middlePart = _mm256_extractf128_si256(OFFSET < 16 ? rightInput : leftInput, (OFFSET < 16) ? 1 : 0);
__m128i rightPart = _mm256_extractf128_si256(rightInput, (OFFSET < 16) ? 0 : 1);
// Combine two 128-bit extracts into a whole 256-bit extract.
return _mm256_set_m128i(
_mm_alignr_epi8(leftPart, middlePart, OFFSET - ((OFFSET < 16) ? 0 : 16)),
_mm_alignr_epi8(middlePart, rightPart, OFFSET - ((OFFSET < 16) ? 0 : 16))
);
}
#define VECTOR_EXTRACT_GENERATOR_256_U8(OFFSET) return U8x32(impl_extractBytes_AVX2<OFFSET>(b.v, a.v));
#define VECTOR_EXTRACT_GENERATOR_256_U16(OFFSET) return U16x16(impl_extractBytes_AVX2<OFFSET * 2>(b.v, a.v));
#define VECTOR_EXTRACT_GENERATOR_256_U32(OFFSET) return U32x8(impl_extractBytes_AVX2<OFFSET * 4>(b.v, a.v));
#define VECTOR_EXTRACT_GENERATOR_256_I32(OFFSET) return I32x8(impl_extractBytes_AVX2<OFFSET * 4>(b.v, a.v));
#define VECTOR_EXTRACT_GENERATOR_256_F32(OFFSET) return F32x8(SIMD_F32x8(impl_extractBytes_AVX2<OFFSET * 4>(SIMD_U32x8(b.v), SIMD_U32x8(a.v))));
#else
template<typename T, int elementCount>
T vectorExtract_emulated(const T &a, const T &b, int offset) {
// For safety reasons, uninitialized default construction of SIMD vectors is only available during full emulation, not partial AVX support.
T result = T::create_dangerous_uninitialized();
int t = 0;
for (int s = offset; s < elementCount; s++) {
result.scalars[t] = a.scalars[s];
t++;
}
for (int s = 0; s < offset; s++) {
result.scalars[t] = b.scalars[s];
t++;
}
return result;
}
#define VECTOR_EXTRACT_GENERATOR_256_U8(OFFSET) return vectorExtract_emulated<U8x32, 32>(a, b, OFFSET);
#define VECTOR_EXTRACT_GENERATOR_256_U16(OFFSET) return vectorExtract_emulated<U16x16, 16>(a, b, OFFSET);
#define VECTOR_EXTRACT_GENERATOR_256_U32(OFFSET) return vectorExtract_emulated<U32x8, 8>(a, b, OFFSET);
#define VECTOR_EXTRACT_GENERATOR_256_I32(OFFSET) return vectorExtract_emulated<I32x8, 8>(a, b, OFFSET);
#define VECTOR_EXTRACT_GENERATOR_256_F32(OFFSET) return vectorExtract_emulated<F32x8, 8>(a, b, OFFSET);
#endif
// Vector extraction concatenates two input vectors and reads a vector starting at the chosen element offset within the combined result.
// The first and last offsets, which simply return one of the inputs, can still be used for readability, because they will be inlined and removed by the compiler.
// To get elements from the right side, combine the center vector with the right vector and shift one element to the left using vectorExtract_1 for the given type.
// To get elements from the left side, combine the left vector with the center vector and shift one element to the right using vectorExtract_31 for 32 lanes, vectorExtract_15 for 16 lanes, or vectorExtract_7 for 8 lanes.
  3220. U8x32 inline vectorExtract_0(const U8x32 &a, const U8x32 &b) { return a; }
  3221. U8x32 inline vectorExtract_1(const U8x32 &a, const U8x32 &b) { VECTOR_EXTRACT_GENERATOR_256_U8(1) }
  3222. U8x32 inline vectorExtract_2(const U8x32 &a, const U8x32 &b) { VECTOR_EXTRACT_GENERATOR_256_U8(2) }
  3223. U8x32 inline vectorExtract_3(const U8x32 &a, const U8x32 &b) { VECTOR_EXTRACT_GENERATOR_256_U8(3) }
U8x32 inline vectorExtract_4(const U8x32 &a, const U8x32 &b) { VECTOR_EXTRACT_GENERATOR_256_U8(4) }
U8x32 inline vectorExtract_5(const U8x32 &a, const U8x32 &b) { VECTOR_EXTRACT_GENERATOR_256_U8(5) }
U8x32 inline vectorExtract_6(const U8x32 &a, const U8x32 &b) { VECTOR_EXTRACT_GENERATOR_256_U8(6) }
U8x32 inline vectorExtract_7(const U8x32 &a, const U8x32 &b) { VECTOR_EXTRACT_GENERATOR_256_U8(7) }
U8x32 inline vectorExtract_8(const U8x32 &a, const U8x32 &b) { VECTOR_EXTRACT_GENERATOR_256_U8(8) }
U8x32 inline vectorExtract_9(const U8x32 &a, const U8x32 &b) { VECTOR_EXTRACT_GENERATOR_256_U8(9) }
U8x32 inline vectorExtract_10(const U8x32 &a, const U8x32 &b) { VECTOR_EXTRACT_GENERATOR_256_U8(10) }
U8x32 inline vectorExtract_11(const U8x32 &a, const U8x32 &b) { VECTOR_EXTRACT_GENERATOR_256_U8(11) }
U8x32 inline vectorExtract_12(const U8x32 &a, const U8x32 &b) { VECTOR_EXTRACT_GENERATOR_256_U8(12) }
U8x32 inline vectorExtract_13(const U8x32 &a, const U8x32 &b) { VECTOR_EXTRACT_GENERATOR_256_U8(13) }
U8x32 inline vectorExtract_14(const U8x32 &a, const U8x32 &b) { VECTOR_EXTRACT_GENERATOR_256_U8(14) }
U8x32 inline vectorExtract_15(const U8x32 &a, const U8x32 &b) { VECTOR_EXTRACT_GENERATOR_256_U8(15) }
U8x32 inline vectorExtract_16(const U8x32 &a, const U8x32 &b) { VECTOR_EXTRACT_GENERATOR_256_U8(16) }
U8x32 inline vectorExtract_17(const U8x32 &a, const U8x32 &b) { VECTOR_EXTRACT_GENERATOR_256_U8(17) }
U8x32 inline vectorExtract_18(const U8x32 &a, const U8x32 &b) { VECTOR_EXTRACT_GENERATOR_256_U8(18) }
U8x32 inline vectorExtract_19(const U8x32 &a, const U8x32 &b) { VECTOR_EXTRACT_GENERATOR_256_U8(19) }
U8x32 inline vectorExtract_20(const U8x32 &a, const U8x32 &b) { VECTOR_EXTRACT_GENERATOR_256_U8(20) }
U8x32 inline vectorExtract_21(const U8x32 &a, const U8x32 &b) { VECTOR_EXTRACT_GENERATOR_256_U8(21) }
U8x32 inline vectorExtract_22(const U8x32 &a, const U8x32 &b) { VECTOR_EXTRACT_GENERATOR_256_U8(22) }
U8x32 inline vectorExtract_23(const U8x32 &a, const U8x32 &b) { VECTOR_EXTRACT_GENERATOR_256_U8(23) }
U8x32 inline vectorExtract_24(const U8x32 &a, const U8x32 &b) { VECTOR_EXTRACT_GENERATOR_256_U8(24) }
U8x32 inline vectorExtract_25(const U8x32 &a, const U8x32 &b) { VECTOR_EXTRACT_GENERATOR_256_U8(25) }
U8x32 inline vectorExtract_26(const U8x32 &a, const U8x32 &b) { VECTOR_EXTRACT_GENERATOR_256_U8(26) }
U8x32 inline vectorExtract_27(const U8x32 &a, const U8x32 &b) { VECTOR_EXTRACT_GENERATOR_256_U8(27) }
U8x32 inline vectorExtract_28(const U8x32 &a, const U8x32 &b) { VECTOR_EXTRACT_GENERATOR_256_U8(28) }
U8x32 inline vectorExtract_29(const U8x32 &a, const U8x32 &b) { VECTOR_EXTRACT_GENERATOR_256_U8(29) }
U8x32 inline vectorExtract_30(const U8x32 &a, const U8x32 &b) { VECTOR_EXTRACT_GENERATOR_256_U8(30) }
U8x32 inline vectorExtract_31(const U8x32 &a, const U8x32 &b) { VECTOR_EXTRACT_GENERATOR_256_U8(31) }
U8x32 inline vectorExtract_32(const U8x32 &a, const U8x32 &b) { return b; }
U16x16 inline vectorExtract_0(const U16x16 &a, const U16x16 &b) { return a; }
U16x16 inline vectorExtract_1(const U16x16 &a, const U16x16 &b) { VECTOR_EXTRACT_GENERATOR_256_U16(1) }
U16x16 inline vectorExtract_2(const U16x16 &a, const U16x16 &b) { VECTOR_EXTRACT_GENERATOR_256_U16(2) }
U16x16 inline vectorExtract_3(const U16x16 &a, const U16x16 &b) { VECTOR_EXTRACT_GENERATOR_256_U16(3) }
U16x16 inline vectorExtract_4(const U16x16 &a, const U16x16 &b) { VECTOR_EXTRACT_GENERATOR_256_U16(4) }
U16x16 inline vectorExtract_5(const U16x16 &a, const U16x16 &b) { VECTOR_EXTRACT_GENERATOR_256_U16(5) }
U16x16 inline vectorExtract_6(const U16x16 &a, const U16x16 &b) { VECTOR_EXTRACT_GENERATOR_256_U16(6) }
U16x16 inline vectorExtract_7(const U16x16 &a, const U16x16 &b) { VECTOR_EXTRACT_GENERATOR_256_U16(7) }
U16x16 inline vectorExtract_8(const U16x16 &a, const U16x16 &b) { VECTOR_EXTRACT_GENERATOR_256_U16(8) }
U16x16 inline vectorExtract_9(const U16x16 &a, const U16x16 &b) { VECTOR_EXTRACT_GENERATOR_256_U16(9) }
U16x16 inline vectorExtract_10(const U16x16 &a, const U16x16 &b) { VECTOR_EXTRACT_GENERATOR_256_U16(10) }
U16x16 inline vectorExtract_11(const U16x16 &a, const U16x16 &b) { VECTOR_EXTRACT_GENERATOR_256_U16(11) }
U16x16 inline vectorExtract_12(const U16x16 &a, const U16x16 &b) { VECTOR_EXTRACT_GENERATOR_256_U16(12) }
U16x16 inline vectorExtract_13(const U16x16 &a, const U16x16 &b) { VECTOR_EXTRACT_GENERATOR_256_U16(13) }
U16x16 inline vectorExtract_14(const U16x16 &a, const U16x16 &b) { VECTOR_EXTRACT_GENERATOR_256_U16(14) }
U16x16 inline vectorExtract_15(const U16x16 &a, const U16x16 &b) { VECTOR_EXTRACT_GENERATOR_256_U16(15) }
U16x16 inline vectorExtract_16(const U16x16 &a, const U16x16 &b) { return b; }
U32x8 inline vectorExtract_0(const U32x8 &a, const U32x8 &b) { return a; }
U32x8 inline vectorExtract_1(const U32x8 &a, const U32x8 &b) { VECTOR_EXTRACT_GENERATOR_256_U32(1) }
U32x8 inline vectorExtract_2(const U32x8 &a, const U32x8 &b) { VECTOR_EXTRACT_GENERATOR_256_U32(2) }
U32x8 inline vectorExtract_3(const U32x8 &a, const U32x8 &b) { VECTOR_EXTRACT_GENERATOR_256_U32(3) }
U32x8 inline vectorExtract_4(const U32x8 &a, const U32x8 &b) { VECTOR_EXTRACT_GENERATOR_256_U32(4) }
U32x8 inline vectorExtract_5(const U32x8 &a, const U32x8 &b) { VECTOR_EXTRACT_GENERATOR_256_U32(5) }
U32x8 inline vectorExtract_6(const U32x8 &a, const U32x8 &b) { VECTOR_EXTRACT_GENERATOR_256_U32(6) }
U32x8 inline vectorExtract_7(const U32x8 &a, const U32x8 &b) { VECTOR_EXTRACT_GENERATOR_256_U32(7) }
U32x8 inline vectorExtract_8(const U32x8 &a, const U32x8 &b) { return b; }
I32x8 inline vectorExtract_0(const I32x8 &a, const I32x8 &b) { return a; }
I32x8 inline vectorExtract_1(const I32x8 &a, const I32x8 &b) { VECTOR_EXTRACT_GENERATOR_256_I32(1) }
I32x8 inline vectorExtract_2(const I32x8 &a, const I32x8 &b) { VECTOR_EXTRACT_GENERATOR_256_I32(2) }
I32x8 inline vectorExtract_3(const I32x8 &a, const I32x8 &b) { VECTOR_EXTRACT_GENERATOR_256_I32(3) }
I32x8 inline vectorExtract_4(const I32x8 &a, const I32x8 &b) { VECTOR_EXTRACT_GENERATOR_256_I32(4) }
I32x8 inline vectorExtract_5(const I32x8 &a, const I32x8 &b) { VECTOR_EXTRACT_GENERATOR_256_I32(5) }
I32x8 inline vectorExtract_6(const I32x8 &a, const I32x8 &b) { VECTOR_EXTRACT_GENERATOR_256_I32(6) }
I32x8 inline vectorExtract_7(const I32x8 &a, const I32x8 &b) { VECTOR_EXTRACT_GENERATOR_256_I32(7) }
I32x8 inline vectorExtract_8(const I32x8 &a, const I32x8 &b) { return b; }
F32x8 inline vectorExtract_0(const F32x8 &a, const F32x8 &b) { return a; }
F32x8 inline vectorExtract_1(const F32x8 &a, const F32x8 &b) { VECTOR_EXTRACT_GENERATOR_256_F32(1) }
F32x8 inline vectorExtract_2(const F32x8 &a, const F32x8 &b) { VECTOR_EXTRACT_GENERATOR_256_F32(2) }
F32x8 inline vectorExtract_3(const F32x8 &a, const F32x8 &b) { VECTOR_EXTRACT_GENERATOR_256_F32(3) }
F32x8 inline vectorExtract_4(const F32x8 &a, const F32x8 &b) { VECTOR_EXTRACT_GENERATOR_256_F32(4) }
F32x8 inline vectorExtract_5(const F32x8 &a, const F32x8 &b) { VECTOR_EXTRACT_GENERATOR_256_F32(5) }
F32x8 inline vectorExtract_6(const F32x8 &a, const F32x8 &b) { VECTOR_EXTRACT_GENERATOR_256_F32(6) }
F32x8 inline vectorExtract_7(const F32x8 &a, const F32x8 &b) { VECTOR_EXTRACT_GENERATOR_256_F32(7) }
F32x8 inline vectorExtract_8(const F32x8 &a, const F32x8 &b) { return b; }
// Gather instructions load memory from a pointer at multiple index offsets at the same time.
// The given pointers should be aligned with 4 bytes, so that the fallback solution works on machines with strict alignment requirements.
#if defined USE_AVX2
#define GATHER_I32x8_AVX2(SOURCE, EIGHT_OFFSETS, SCALE) _mm256_i32gather_epi32((const int32_t*)(SOURCE), EIGHT_OFFSETS, SCALE)
#define GATHER_U32x8_AVX2(SOURCE, EIGHT_OFFSETS, SCALE) _mm256_i32gather_epi32((const int32_t*)(SOURCE), EIGHT_OFFSETS, SCALE)
#define GATHER_F32x8_AVX2(SOURCE, EIGHT_OFFSETS, SCALE) _mm256_i32gather_ps((const float*)(SOURCE), EIGHT_OFFSETS, SCALE)
#endif
static inline U32x8 gather(const dsr::SafePointer<uint32_t> data, const U32x8 &elementOffset) {
#if defined USE_AVX2
	// TODO: Implement safety checks for debug mode.
	return U32x8(GATHER_U32x8_AVX2(data.getUnsafe(), elementOffset.v, 4));
#else
	ALIGN32 uint32_t elementOffsets[8];
	elementOffset.writeAlignedUnsafe(elementOffsets);
	return U32x8(
		*(data + elementOffsets[0]),
		*(data + elementOffsets[1]),
		*(data + elementOffsets[2]),
		*(data + elementOffsets[3]),
		*(data + elementOffsets[4]),
		*(data + elementOffsets[5]),
		*(data + elementOffsets[6]),
		*(data + elementOffsets[7])
	);
#endif
}
static inline I32x8 gather(const dsr::SafePointer<int32_t> data, const U32x8 &elementOffset) {
#if defined USE_AVX2
	// TODO: Implement safety checks for debug mode.
	return I32x8(GATHER_I32x8_AVX2(data.getUnsafe(), elementOffset.v, 4));
#else
	ALIGN32 uint32_t elementOffsets[8];
	elementOffset.writeAlignedUnsafe(elementOffsets);
	return I32x8(
		*(data + elementOffsets[0]),
		*(data + elementOffsets[1]),
		*(data + elementOffsets[2]),
		*(data + elementOffsets[3]),
		*(data + elementOffsets[4]),
		*(data + elementOffsets[5]),
		*(data + elementOffsets[6]),
		*(data + elementOffsets[7])
	);
#endif
}
static inline F32x8 gather(const dsr::SafePointer<float> data, const U32x8 &elementOffset) {
#if defined USE_AVX2
	// TODO: Implement safety checks for debug mode.
	return F32x8(GATHER_F32x8_AVX2(data.getUnsafe(), elementOffset.v, 4));
#else
	ALIGN32 uint32_t elementOffsets[8];
	elementOffset.writeAlignedUnsafe(elementOffsets);
	return F32x8(
		*(data + elementOffsets[0]),
		*(data + elementOffsets[1]),
		*(data + elementOffsets[2]),
		*(data + elementOffsets[3]),
		*(data + elementOffsets[4]),
		*(data + elementOffsets[5]),
		*(data + elementOffsets[6]),
		*(data + elementOffsets[7])
	);
#endif
}
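// Usage sketch for gather: loading eight floats at 32-bit element offsets.
// "floatData" is a hypothetical dsr::SafePointer<float> into a buffer aligned with at least 4 bytes,
// as required by the comment above; the offsets are arbitrary example values.
//   U32x8 offsets(0u, 2u, 4u, 6u, 8u, 10u, 12u, 14u);
//   F32x8 gathered = gather(floatData, offsets); // Reads floatData[0], floatData[2], ..., floatData[14].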
// Wrapper functions for explicitly expanding scalars into vectors during math operations.
#define NUMERICAL_SCALAR_OPERATIONS(VECTOR_TYPE, ELEMENT_TYPE, LANE_COUNT) \
	inline VECTOR_TYPE operator+(const VECTOR_TYPE& left, ELEMENT_TYPE right) { return left + VECTOR_TYPE(right); } \
	inline VECTOR_TYPE operator+(ELEMENT_TYPE left, const VECTOR_TYPE& right) { return VECTOR_TYPE(left) + right; } \
	inline VECTOR_TYPE operator-(const VECTOR_TYPE& left, ELEMENT_TYPE right) { return left - VECTOR_TYPE(right); } \
	inline VECTOR_TYPE operator-(ELEMENT_TYPE left, const VECTOR_TYPE& right) { return VECTOR_TYPE(left) - right; }
FOR_ALL_VECTOR_TYPES(NUMERICAL_SCALAR_OPERATIONS)
#undef NUMERICAL_SCALAR_OPERATIONS
#define MULTIPLY_SCALAR_OPERATIONS(VECTOR_TYPE, ELEMENT_TYPE, LANE_COUNT) \
	inline VECTOR_TYPE operator*(const VECTOR_TYPE& left, ELEMENT_TYPE right) { return left * VECTOR_TYPE(right); } \
	inline VECTOR_TYPE operator*(ELEMENT_TYPE left, const VECTOR_TYPE& right) { return VECTOR_TYPE(left) * right; }
// TODO: Implement multiplication for U8x16 and U8x32.
//FOR_ALL_VECTOR_TYPES(MULTIPLY_SCALAR_OPERATIONS)
MULTIPLY_SCALAR_OPERATIONS(F32x4, float, 4)
MULTIPLY_SCALAR_OPERATIONS(F32x8, float, 8)
MULTIPLY_SCALAR_OPERATIONS(U32x4, uint32_t, 4)
MULTIPLY_SCALAR_OPERATIONS(U32x8, uint32_t, 8)
MULTIPLY_SCALAR_OPERATIONS(I32x4, int32_t, 4)
MULTIPLY_SCALAR_OPERATIONS(I32x8, int32_t, 8)
MULTIPLY_SCALAR_OPERATIONS(U16x8, uint16_t, 8)
MULTIPLY_SCALAR_OPERATIONS(U16x16, uint16_t, 16)
#undef MULTIPLY_SCALAR_OPERATIONS
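// Usage sketch for the scalar arithmetic overloads above; "pixels" is a hypothetical F32x8 value.
//   F32x8 brightened = pixels + 0.5f; // Same as pixels + F32x8(0.5f).
//   F32x8 doubled = 2.0f * pixels;    // Same as F32x8(2.0f) * pixels.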
// Wrapper functions for explicitly duplicating bit masks into the same lane count.
#define BITWISE_SCALAR_OPERATIONS(VECTOR_TYPE, ELEMENT_TYPE, LANE_COUNT) \
	inline VECTOR_TYPE operator&(const VECTOR_TYPE& left, ELEMENT_TYPE right) { return left & VECTOR_TYPE(right); } \
	inline VECTOR_TYPE operator&(ELEMENT_TYPE left, const VECTOR_TYPE& right) { return VECTOR_TYPE(left) & right; } \
	inline VECTOR_TYPE operator|(const VECTOR_TYPE& left, ELEMENT_TYPE right) { return left | VECTOR_TYPE(right); } \
	inline VECTOR_TYPE operator|(ELEMENT_TYPE left, const VECTOR_TYPE& right) { return VECTOR_TYPE(left) | right; } \
	inline VECTOR_TYPE operator^(const VECTOR_TYPE& left, ELEMENT_TYPE right) { return left ^ VECTOR_TYPE(right); } \
	inline VECTOR_TYPE operator^(ELEMENT_TYPE left, const VECTOR_TYPE& right) { return VECTOR_TYPE(left) ^ right; }
// TODO: Implement bitwise operations for all unsigned SIMD vectors.
//FOR_UNSIGNED_VECTOR_TYPES(BITWISE_SCALAR_OPERATIONS)
BITWISE_SCALAR_OPERATIONS(U32x4, uint32_t, 4)
BITWISE_SCALAR_OPERATIONS(U32x8, uint32_t, 8)
#undef BITWISE_SCALAR_OPERATIONS
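// Usage sketch for the bitwise scalar overloads above; "colors" is a hypothetical U32x8 value.
//   U32x8 redAndBlue = colors & 0x00FF00FFu; // Same as colors & U32x8(0x00FF00FFu), masking every lane.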
// Cleaning up temporary macro definitions to avoid cluttering the namespace.
#undef FOR_ALL_VECTOR_TYPES
#undef FOR_FLOAT_VECTOR_TYPES
#undef FOR_INTEGER_VECTOR_TYPES
#undef FOR_SIGNED_VECTOR_TYPES
#undef FOR_UNSIGNED_VECTOR_TYPES
// The X vectors use the longest SIMD length that is efficient for both floating-point and integer types.
// DSR_DEFAULT_ALIGNMENT
// The number of bytes memory should be aligned with by default when creating buffers and images.
// F32xX
// The longest available SIMD vector for storing 32-bit float values. Iterating laneCountX_32Bit floats at a time.
// I32xX
// The longest available SIMD vector for storing signed 32-bit integer values. Iterating laneCountX_32Bit integers at a time.
// U32xX
// The longest available SIMD vector for storing unsigned 32-bit integer values. Iterating laneCountX_32Bit integers at a time.
// U16xX
// The longest available SIMD vector for storing unsigned 16-bit integer values. Iterating laneCountX_16Bit integers at a time.
// U8xX
// The longest available SIMD vector for storing unsigned 8-bit integer values. Iterating laneCountX_8Bit integers at a time.
#if defined USE_256BIT_X_SIMD || defined EMULATE_256BIT_X_SIMD
// Using 256-bit SIMD.
#define DSR_DEFAULT_VECTOR_SIZE 32
#define DSR_DEFAULT_ALIGNMENT 32
using F32xX = F32x8;
using I32xX = I32x8;
using U32xX = U32x8;
using U16xX = U16x16;
using U8xX = U8x32;
// Align memory with 256 bits to allow overwriting padding at the end of each pixel row.
// Otherwise data at the end of each row would have to be preserved with slow, bloated, duplicated code in every filter.
#else
// If there is no hardware support for 256-bit vectors, explicitly used emulations of 256-bit vectors are allowed to be aligned with just 128 bits.
#define DSR_DEFAULT_VECTOR_SIZE 16
#define DSR_DEFAULT_ALIGNMENT 16
using F32xX = F32x4;
using I32xX = I32x4;
using U32xX = U32x4;
using U16xX = U16x8;
using U8xX = U8x16;
#endif
// How many lanes the longest available vector has for a specified lane size.
// Used to iterate indices and pointers using whole elements.
static const int laneCountX_32Bit = DSR_DEFAULT_VECTOR_SIZE / 4;
static const int laneCountX_16Bit = DSR_DEFAULT_VECTOR_SIZE / 2;
static const int laneCountX_8Bit = DSR_DEFAULT_VECTOR_SIZE;
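// Usage sketch: iterating over 32-bit elements with the longest X vector.
// "elementCount" is a hypothetical element count, assumed to be padded to a multiple of
// laneCountX_32Bit, which DSR_DEFAULT_ALIGNMENT makes safe for buffers and image rows.
//   for (int i = 0; i < elementCount; i += laneCountX_32Bit) {
//       // Load, process and store one F32xX, I32xX or U32xX worth of elements here.
//   }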
// The F vector uses the longest SIMD length that is efficient when only processing float values, even if no integer types are available in the same size.
// Used when you know that your algorithm is only going to work with float types and you need the extra performance.
// Some processors have AVX but not AVX2, meaning that they have 256-bit SIMD for floats but only 128-bit SIMD for integers.
// To use the F vector size correctly, only use it with buffers allocated with DSR_FLOAT_ALIGNMENT or a larger alignment.
// Images are allocated using the X vector size's default alignment from DSR_DEFAULT_ALIGNMENT, because images are impossible to use without integer types.
// DSR_FLOAT_ALIGNMENT
// The number of bytes memory should be aligned with manually when creating buffers for processing with the F vector.
// If this sounds too advanced, you can just stick with the X vectors and avoid releasing builds with only partial support for a SIMD length.
// F32xF
// The longest available SIMD vector for storing 32-bit float values. Iterating laneCountF floats at a time.
#if defined USE_256BIT_F_SIMD || defined EMULATE_256BIT_F_SIMD
#define DSR_FLOAT_VECTOR_SIZE 32
#define DSR_FLOAT_ALIGNMENT 32
using F32xF = F32x8;
#else
// F vectors are 128 bits.
#define DSR_FLOAT_VECTOR_SIZE 16
#define DSR_FLOAT_ALIGNMENT 16
using F32xF = F32x4;
#endif
// Used to iterate over float pointers when using F32xF.
static const int laneCountF = DSR_FLOAT_VECTOR_SIZE / 4;
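// Usage sketch: only use F32xF with buffers aligned with DSR_FLOAT_ALIGNMENT or more.
// "floatCount" is a hypothetical element count, assumed to be padded to a multiple of laneCountF.
//   for (int i = 0; i < floatCount; i += laneCountF) {
//       // Process laneCountF floats per iteration using F32xF here.
//   }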
#endif