simd.h

  1. // zlib open source license
  2. //
  3. // Copyright (c) 2017 to 2023 David Forsgren Piuva
  4. //
  5. // This software is provided 'as-is', without any express or implied
  6. // warranty. In no event will the authors be held liable for any damages
  7. // arising from the use of this software.
  8. //
  9. // Permission is granted to anyone to use this software for any purpose,
  10. // including commercial applications, and to alter it and redistribute it
  11. // freely, subject to the following restrictions:
  12. //
  13. // 1. The origin of this software must not be misrepresented; you must not
  14. // claim that you wrote the original software. If you use this software
  15. // in a product, an acknowledgment in the product documentation would be
  16. // appreciated but is not required.
  17. //
  18. // 2. Altered source versions must be plainly marked as such, and must not be
  19. // misrepresented as being the original software.
  20. //
  21. // 3. This notice may not be removed or altered from any source
  22. // distribution.
  23. // Hardware abstraction layer for portable SIMD math.
  24. // Used to make calculations faster without having to mess around with any hardware-specific assembly code or intrinsic functions.
  25. // You get the performance you need today and the ability to compile with automatic scalar emulation when building for a future processor that this module has not yet been ported to.
  26. // When you can generate the vectorized code using the same template function as your non-vectorized code, you don't even need to write a reference implementation.
  27. // Using with 128-bit SIMD: (beginner friendly, test once, compile anywhere, no compiler flags)
  28. // If you are new to vectorization or only plan to work with 128-bit extensions such as ARM NEON, you can keep it simple by only using the 128-bit vector types.
  29. // Pros and cons:
  30. // + Most target platforms (excluding older systems such as ARMv6) have 128-bit SIMD extensions such as Intel SSE2 or ARM NEON enabled by default.
  31. // ARMv6 does not support ARM NEON, but most ARMv7 processors do, so compilers usually enable NEON by default for ARMv7.
  32. // All 64-bit ARM processors have ARM NEON, because it stopped being optional in ARMv8.
  33. // Builds for 64-bit Intel processors usually have SSE2 enabled by default, so you don't have to change any compiler flags when building on a different system.
  34. // + One build for all computers of the same instruction set.
  35. // Great when your application is not so resource heavy, because the least powerful systems don't have the fancy extensions anyway.
  36. // - You might end up enabling the additional SIMD extensions anyway, because the library is already using them to become faster.
  37. // Types:
  38. // * Use F32x4, I32x4 and U32x4 for 4 elements at a time
  39. // * U16x8 for 8 elements at a time
  40. // * U8x16 for 16 elements at a time
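// A minimal usage sketch (only using the 128-bit F32x4 type and the aligned read/write methods declared further down in this header):
//   float data[4] ALIGN16 = {1.0f, 4.0f, 9.0f, 16.0f};
//   F32x4 v = F32x4::readAlignedUnsafe(data);
//   v.squareRoot().writeAlignedUnsafe(data); // data is now approximately {1, 2, 3, 4}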
  41. // Using the X vector size: (advanced, having to test with different build flags or emulation)
  42. // If you want more performance, you can use variable length type aliases.
  43. // Pros and cons:
  44. // + For heavy calculations where memory access is not the bottleneck, using larger SIMD vectors when they are enabled saves energy and increases performance.
  45. // - If you forget to test with longer vector lengths (compiling with -mavx2 or defining EMULATE_256BIT_X_SIMD), then you might find bugs from not iterating or aligning memory correctly.
  46. // Types:
  47. // * Use F32xX, I32xX and U32xX for laneCountX_32Bit elements at a time
  48. // * U16xX for laneCountX_16Bit elements at a time
  49. // * U8xX for laneCountX_8Bit elements at a time
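// A sketch of a loop using the X vector size, assuming that F32xX mirrors the F32x4 interface declared below
// and that data and elementCount are hypothetical, with elementCount being a multiple of laneCountX_32Bit:
//   for (int i = 0; i < elementCount; i += laneCountX_32Bit) {
//       F32xX v = F32xX::readAlignedUnsafe(data + i);
//       v.clampLower(0.0f).writeAlignedUnsafe(data + i); // Clamp negative values to zero.
//   }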
  50. // Using the F vector size: (very dangerous, no test can confirm that memory alignment is correct)
  51. // If you want even more performance, you can let float operations use the longest available F vector size, which might exceed the X vector size.
  52. // Pros and cons:
  53. // - You have to manually set the alignment of buffers to DSR_FLOAT_ALIGNMENT to prevent crashes.
  54. // If the default alignment for buffers changed based on the size of F vectors, the more commonly used X vectors would be slowed down by cache misses from padding larger than the X vector size.
  55. // AlignedImageF32 and sound backends are already aligned with the F vector size, because they are not generic like Buffer.
  56. // - It can be difficult to detect incorrect memory alignment, because a pointer can accidentally be aligned to more than what was requested.
  57. // If you accidentally align to 128 bits instead of 256 bits, there is a 50% risk of failing to detect it at runtime and then failing later on another computer.
  58. // If sticking with 128-bit or X vectors, all buffers will be correctly aligned automatically.
  59. // + For heavy calculations where memory access is not the bottleneck, using larger SIMD vectors when they are enabled saves energy and increases performance.
  60. // - If you forget to test with longer vector lengths (compiling with -mavx2 or defining EMULATE_256BIT_X_SIMD), then you might find bugs from not iterating or aligning memory correctly.
  61. // Types:
  62. // * Use F32xX, I32xX and U32xX for laneCountX_32Bit elements at a time
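// A sketch of giving a local array the alignment needed for F vectors, assuming that DSR_FLOAT_ALIGNMENT
// expands to the alignment in bytes and that elementCount is a hypothetical compile-time constant:
//   float data[elementCount] ALIGN_BYTES(DSR_FLOAT_ALIGNMENT);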
  63. // Compiler extensions
  64. // On Intel/AMD processors:
  65. // SSE2 is usually enabled by default, because SSE2 is mandatory for 64-bit Intel instructions.
  66. // Use -mavx as a G++ compiler flag to enable the AVX extension, enabling the USE_AVX and USE_256BIT_F_SIMD macros.
  67. // If AVX is not available on your computer, you can test your algorithm for 256-bit float SIMD using EMULATE_256BIT_F_SIMD, but only if you use the F vector size.
  68. // Use -mavx2 as a G++ compiler flag to enable the AVX2 extension, enabling the USE_AVX2 and USE_256BIT_X_SIMD macros.
  69. // If AVX2 is not available on your computer, you can test your algorithm for 256-bit float and integer SIMD using EMULATE_256BIT_X_SIMD, but only if you use the X vector size.
  70. // On ARMv6 processors:
  71. // Scalar emulation is used when compiling for ARMv6, because it does not have NEON and VFP is not supported in this abstraction.
  72. // On ARMv7 processors:
  73. // NEON is usually enabled by default for ARMv7, because most of them have the extension.
  74. // On ARMv8 processors:
  75. // NEON cannot be disabled when building for ARMv8, because the extension is mandatory in ARMv8.
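// Example G++ invocations for the Intel/AMD flags above (illustrative sketch; file names are placeholders):
//   g++ main.cpp          -> 128-bit SIMD only (SSE2 or NEON, whichever the target enables by default)
//   g++ -mavx main.cpp    -> also enables the USE_AVX and USE_256BIT_F_SIMD macros
//   g++ -mavx2 main.cpp   -> also enables the USE_AVX2 and USE_256BIT_X_SIMD macros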
  76. #ifndef DFPSR_SIMD
  77. #define DFPSR_SIMD
  78. #include <cstdint>
  79. #include <cassert>
  #include <cmath> // for sqrt() used in the scalar emulation code paths
  80. #include "SafePointer.h"
  81. #include "../math/FVector.h"
  82. #include "../math/IVector.h"
  83. #include "../math/UVector.h"
  84. // Get settings from here.
  85. #include "../settings.h"
  86. // Alignment in bytes
  87. #define ALIGN_BYTES(SIZE) __attribute__((aligned(SIZE)))
  88. #define ALIGN16 ALIGN_BYTES(16) // 128-bit alignment
  89. #define ALIGN32 ALIGN_BYTES(32) // 256-bit alignment
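// Example: a 16-byte aligned local array that can safely be used with the aligned load and store functions below.
//   float data[4] ALIGN16 = {0.0f, 0.0f, 0.0f, 0.0f};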
  90. // Everything declared in here handles things specific for SSE.
  91. // Direct use of the macros will not provide portability to all hardware.
  92. #ifdef USE_SSE2
  93. #define USE_DIRECT_SIMD_MEMORY_ACCESS
  94. #include <emmintrin.h> // SSE2
  95. #ifdef USE_SSSE3
  96. #include <tmmintrin.h> // SSSE3
  97. #endif
  98. #ifdef USE_AVX
  99. #include <immintrin.h> // AVX / AVX2
  100. #endif
  101. // Vector types
  102. #define SIMD_F32x4 __m128
  103. #define SIMD_U8x16 __m128i
  104. #define SIMD_U16x8 __m128i
  105. #define SIMD_U32x4 __m128i
  106. #define SIMD_I32x4 __m128i
  107. // Vector uploads in address order
  108. #define LOAD_VECTOR_F32_SIMD(A, B, C, D) _mm_set_ps(D, C, B, A)
  109. #define LOAD_SCALAR_F32_SIMD(A) _mm_set1_ps(A)
  110. #define LOAD_VECTOR_U8_SIMD(A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P) _mm_set_epi8(P, O, N, M, L, K, J, I, H, G, F, E, D, C, B, A)
  111. #define LOAD_SCALAR_U8_SIMD(A) _mm_set1_epi8(A)
  112. #define LOAD_VECTOR_U16_SIMD(A, B, C, D, E, F, G, H) _mm_set_epi16(H, G, F, E, D, C, B, A)
  113. #define LOAD_SCALAR_U16_SIMD(A) _mm_set1_epi16(A)
  114. #define LOAD_VECTOR_U32_SIMD(A, B, C, D) _mm_set_epi32(D, C, B, A)
  115. #define LOAD_SCALAR_U32_SIMD(A) _mm_set1_epi32(A)
  116. #define LOAD_VECTOR_I32_SIMD(A, B, C, D) _mm_set_epi32(D, C, B, A)
  117. #define LOAD_SCALAR_I32_SIMD(A) _mm_set1_epi32(A)
  118. // Conversions
  119. #define F32_TO_I32_SIMD(A) _mm_cvttps_epi32(A)
  120. #define F32_TO_U32_SIMD(A) _mm_cvttps_epi32(A)
  121. #define I32_TO_F32_SIMD(A) _mm_cvtepi32_ps(A)
  122. #define U32_TO_F32_SIMD(A) _mm_cvtepi32_ps(A)
  123. // Unpacking conversions
  124. #define U8_LOW_TO_U16_SIMD(A) _mm_unpacklo_epi8(A, _mm_set1_epi8(0))
  125. #define U8_HIGH_TO_U16_SIMD(A) _mm_unpackhi_epi8(A, _mm_set1_epi8(0))
  126. #define U16_LOW_TO_U32_SIMD(A) _mm_unpacklo_epi16(A, _mm_set1_epi16(0))
  127. #define U16_HIGH_TO_U32_SIMD(A) _mm_unpackhi_epi16(A, _mm_set1_epi16(0))
  128. // Saturated packing
  129. // Credit: Using ideas from Victoria Zhislina's NEON_2_SSE.h header from the Intel corporation, but not trying to emulate NEON
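// Values with the highest bit set (>= 0x8000) would be interpreted as negative by the signed saturating pack,
// so the mask and compare below clamp them to 0x7FFF first, which still saturates to 255 in the packed result.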
  130. inline SIMD_U8x16 PACK_SAT_U16_TO_U8_SIMD(const SIMD_U16x8& a, const SIMD_U16x8& b) {
  131. SIMD_U16x8 mask, a2, b2;
  132. mask = _mm_set1_epi16(0b0111111111111111);
  133. a2 = _mm_and_si128(a, mask);
  134. a2 = _mm_or_si128(a2, _mm_and_si128(_mm_cmpgt_epi16(a2, a), mask));
  135. b2 = _mm_and_si128(b, mask);
  136. b2 = _mm_or_si128(b2, _mm_and_si128(_mm_cmpgt_epi16(b2, b), mask));
  137. return _mm_packus_epi16(a2, b2);
  138. }
  139. // Reinterpret casting
  140. #define REINTERPRET_U32_TO_U8_SIMD(A) (A)
  141. #define REINTERPRET_U32_TO_U16_SIMD(A) (A)
  142. #define REINTERPRET_U8_TO_U32_SIMD(A) (A)
  143. #define REINTERPRET_U16_TO_U32_SIMD(A) (A)
  144. #define REINTERPRET_U32_TO_I32_SIMD(A) (A)
  145. #define REINTERPRET_I32_TO_U32_SIMD(A) (A)
  146. // Vector float operations returning SIMD_F32x4
  147. #define ADD_F32_SIMD(A, B) _mm_add_ps(A, B)
  148. #define SUB_F32_SIMD(A, B) _mm_sub_ps(A, B)
  149. #define MUL_F32_SIMD(A, B) _mm_mul_ps(A, B)
  150. // Vector integer operations returning SIMD_I32x4
  151. #define ADD_I32_SIMD(A, B) _mm_add_epi32(A, B)
  152. #define SUB_I32_SIMD(A, B) _mm_sub_epi32(A, B)
  153. // 32-bit integer multiplications are not available on SSE2.
  154. // Vector integer operations returning SIMD_U32x4
  155. #define ADD_U32_SIMD(A, B) _mm_add_epi32(A, B)
  156. #define SUB_U32_SIMD(A, B) _mm_sub_epi32(A, B)
  157. // 32-bit integer multiplications are not available on SSE2.
  158. // Vector integer operations returning SIMD_U16x8
  159. #define ADD_U16_SIMD(A, B) _mm_add_epi16(A, B)
  160. #define SUB_U16_SIMD(A, B) _mm_sub_epi16(A, B)
  161. #define MUL_U16_SIMD(A, B) _mm_mullo_epi16(A, B)
  162. // Vector integer operations returning SIMD_U8x16
  163. #define ADD_U8_SIMD(A, B) _mm_add_epi8(A, B)
  164. #define ADD_SAT_U8_SIMD(A, B) _mm_adds_epu8(A, B) // Saturated addition
  165. #define SUB_U8_SIMD(A, B) _mm_sub_epi8(A, B)
  166. #define SUB_SAT_U8_SIMD(A, B) _mm_subs_epu8(A, B) // Saturated subtraction
  167. // No 8-bit multiplications
  168. // Statistics
  169. #define MIN_F32_SIMD(A, B) _mm_min_ps(A, B)
  170. #define MAX_F32_SIMD(A, B) _mm_max_ps(A, B)
  171. // TODO: Implement minimum and maximum for integer vectors, so that all operations exist for all applicable types:
  172. // Using _mm256_max_epu16... in AVX2 for 256-bit versions
  173. // Using comparisons and masking in SSE2 when _mm_max_epu16... in SSE4.1 is not available
  174. // Bitwise
  175. #define BITWISE_AND_U32_SIMD(A, B) _mm_and_si128(A, B)
  176. #define BITWISE_OR_U32_SIMD(A, B) _mm_or_si128(A, B)
  177. #define BITWISE_XOR_U32_SIMD(A, B) _mm_xor_si128(A, B)
  178. #ifdef USE_AVX
  179. // 256-bit vector types
  180. #define SIMD_F32x8 __m256
  181. // Vector uploads in address order
  182. #define LOAD_VECTOR_F32_SIMD256(A, B, C, D, E, F, G, H) _mm256_set_ps(H, G, F, E, D, C, B, A)
  183. #define LOAD_SCALAR_F32_SIMD256(A) _mm256_set1_ps(A)
  184. // Vector float operations returning SIMD_F32x8
  185. #define ADD_F32_SIMD256(A, B) _mm256_add_ps(A, B)
  186. #define SUB_F32_SIMD256(A, B) _mm256_sub_ps(A, B)
  187. #define MUL_F32_SIMD256(A, B) _mm256_mul_ps(A, B)
  188. // Statistics
  189. #define MIN_F32_SIMD256(A, B) _mm256_min_ps(A, B)
  190. #define MAX_F32_SIMD256(A, B) _mm256_max_ps(A, B)
  191. #ifdef USE_AVX2
  192. // 256-bit vector types
  193. #define SIMD_U8x32 __m256i
  194. #define SIMD_U16x16 __m256i
  195. #define SIMD_U32x8 __m256i
  196. #define SIMD_I32x8 __m256i
  197. // Vector uploads in address order
  198. #define LOAD_VECTOR_U8_SIMD256(A1, B1, C1, D1, E1, F1, G1, H1, I1, J1, K1, L1, M1, N1, O1, P1, Q1, R1, S1, T1, U1, V1, W1, X1, Y1, Z1, A2, B2, C2, D2, E2, F2) _mm256_set_epi8(F2, E2, D2, C2, B2, A2, Z1, Y1, X1, W1, V1, U1, T1, S1, R1, Q1, P1, O1, N1, M1, L1, K1, J1, I1, H1, G1, F1, E1, D1, C1, B1, A1)
  199. #define LOAD_SCALAR_U8_SIMD256(A) _mm256_set1_epi8(A)
  200. #define LOAD_VECTOR_U16_SIMD256(A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P) _mm256_set_epi16(P, O, N, M, L, K, J, I, H, G, F, E, D, C, B, A)
  201. #define LOAD_SCALAR_U16_SIMD256(A) _mm256_set1_epi16(A)
  202. #define LOAD_VECTOR_U32_SIMD256(A, B, C, D, E, F, G, H) _mm256_set_epi32(H, G, F, E, D, C, B, A)
  203. #define LOAD_SCALAR_U32_SIMD256(A) _mm256_set1_epi32(A)
  204. #define LOAD_VECTOR_I32_SIMD256(A, B, C, D, E, F, G, H) _mm256_set_epi32(H, G, F, E, D, C, B, A)
  205. #define LOAD_SCALAR_I32_SIMD256(A) _mm256_set1_epi32(A)
  206. // Conversions
  207. #define F32_TO_I32_SIMD256(A) _mm256_cvttps_epi32(A)
  208. #define F32_TO_U32_SIMD256(A) _mm256_cvttps_epi32(A)
  209. #define I32_TO_F32_SIMD256(A) _mm256_cvtepi32_ps(A)
  210. #define U32_TO_F32_SIMD256(A) _mm256_cvtepi32_ps(A)
  211. // Unpacking conversions
  212. #define U8_LOW_TO_U16_SIMD256(A) _mm256_unpacklo_epi8(_mm256_permute4x64_epi64(A, 0b11011000), _mm256_set1_epi8(0))
  213. #define U8_HIGH_TO_U16_SIMD256(A) _mm256_unpackhi_epi8(_mm256_permute4x64_epi64(A, 0b11011000), _mm256_set1_epi8(0))
  214. #define U16_LOW_TO_U32_SIMD256(A) _mm256_unpacklo_epi16(_mm256_permute4x64_epi64(A, 0b11011000), _mm256_set1_epi16(0))
  215. #define U16_HIGH_TO_U32_SIMD256(A) _mm256_unpackhi_epi16(_mm256_permute4x64_epi64(A, 0b11011000), _mm256_set1_epi16(0))
  216. // Saturated packing
  217. // Credit: Using ideas from Victoria Zhislina's NEON_2_SSE.h header from the Intel corporation, but not trying to emulate NEON
  218. inline SIMD_U8x32 PACK_SAT_U16_TO_U8_SIMD256(const SIMD_U16x16& a, const SIMD_U16x16& b) {
  219. SIMD_U16x16 mask, a2, b2;
  220. mask = _mm256_set1_epi16(0b0111111111111111);
  221. a2 = _mm256_and_si256(a, mask);
  222. a2 = _mm256_or_si256(a2, _mm256_and_si256(_mm256_cmpgt_epi16(a2, a), mask));
  223. b2 = _mm256_and_si256(b, mask);
  224. b2 = _mm256_or_si256(b2, _mm256_and_si256(_mm256_cmpgt_epi16(b2, b), mask));
  225. // The 256-bit pack instruction _mm256_packus_epi16 packs within each 128-bit lane rather than across the whole register, so the result has to be permuted into the correct order.
  226. // 0 2 1 3
  227. // | X |
  228. // 0 1 2 3
  229. return _mm256_permute4x64_epi64(_mm256_packus_epi16(a2, b2), 0b11011000);
  230. }
  231. // Reinterpret casting
  232. #define REINTERPRET_U32_TO_U8_SIMD256(A) (A)
  233. #define REINTERPRET_U32_TO_U16_SIMD256(A) (A)
  234. #define REINTERPRET_U8_TO_U32_SIMD256(A) (A)
  235. #define REINTERPRET_U16_TO_U32_SIMD256(A) (A)
  236. #define REINTERPRET_U32_TO_I32_SIMD256(A) (A)
  237. #define REINTERPRET_I32_TO_U32_SIMD256(A) (A)
  238. // Vector integer operations returning SIMD_I32x8
  239. #define ADD_I32_SIMD256(A, B) _mm256_add_epi32(A, B)
  240. #define SUB_I32_SIMD256(A, B) _mm256_sub_epi32(A, B)
  241. #define MUL_I32_SIMD256(A, B) _mm256_mullo_epi32(A, B)
  242. // Vector integer operations returning SIMD_U32x8
  243. #define ADD_U32_SIMD256(A, B) _mm256_add_epi32(A, B)
  244. #define SUB_U32_SIMD256(A, B) _mm256_sub_epi32(A, B)
  245. #define MUL_U32_SIMD256(A, B) _mm256_mullo_epi32(A, B)
  246. // Vector integer operations returning SIMD_U16x16
  247. #define ADD_U16_SIMD256(A, B) _mm256_add_epi16(A, B)
  248. #define SUB_U16_SIMD256(A, B) _mm256_sub_epi16(A, B)
  249. #define MUL_U16_SIMD256(A, B) _mm256_mullo_epi16(A, B)
  250. // Vector integer operations returning SIMD_U8x32
  251. #define ADD_U8_SIMD256(A, B) _mm256_add_epi8(A, B)
  252. #define ADD_SAT_U8_SIMD256(A, B) _mm256_adds_epu8(A, B) // Saturated addition
  253. #define SUB_U8_SIMD256(A, B) _mm256_sub_epi8(A, B)
  254. #define SUB_SAT_U8_SIMD256(A, B) _mm256_subs_epu8(A, B) // Saturated subtraction
  255. // No 8-bit multiplications
  256. // Bitwise
  257. #define BITWISE_AND_U32_SIMD256(A, B) _mm256_and_si256(A, B)
  258. #define BITWISE_OR_U32_SIMD256(A, B) _mm256_or_si256(A, B)
  259. #define BITWISE_XOR_U32_SIMD256(A, B) _mm256_xor_si256(A, B)
  260. #endif
  261. #endif
  262. #endif
  263. // Everything declared in here handles things specific for NEON.
  264. // Direct use of the macros will not provide portability to all hardware.
  265. #ifdef USE_NEON
  266. #include <arm_neon.h> // NEON
  267. // Vector types
  268. #define SIMD_F32x4 float32x4_t
  269. #define SIMD_U8x16 uint8x16_t
  270. #define SIMD_U16x8 uint16x8_t
  271. #define SIMD_U32x4 uint32x4_t
  272. #define SIMD_I32x4 int32x4_t
  273. // Vector uploads in address order
  274. inline SIMD_F32x4 LOAD_VECTOR_F32_SIMD(float a, float b, float c, float d) {
  275. float data[4] ALIGN16 = {a, b, c, d};
  276. return vld1q_f32(data);
  277. }
  278. inline SIMD_F32x4 LOAD_SCALAR_F32_SIMD(float a) {
  279. return vdupq_n_f32(a);
  280. }
  281. inline SIMD_U8x16 LOAD_VECTOR_U8_SIMD(uint8_t a, uint8_t b, uint8_t c, uint8_t d, uint8_t e, uint8_t f, uint8_t g, uint8_t h,
  282. uint8_t i, uint8_t j, uint8_t k, uint8_t l, uint8_t m, uint8_t n, uint8_t o, uint8_t p) {
  283. uint8_t data[16] ALIGN16 = {a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p};
  284. return vld1q_u8(data);
  285. }
  286. inline SIMD_U8x16 LOAD_SCALAR_U8_SIMD(uint8_t a) {
  287. return vdupq_n_u8(a);
  288. }
  289. inline SIMD_U16x8 LOAD_VECTOR_U16_SIMD(uint16_t a, uint16_t b, uint16_t c, uint16_t d, uint16_t e, uint16_t f, uint16_t g, uint16_t h) {
  290. uint16_t data[8] ALIGN16 = {a, b, c, d, e, f, g, h};
  291. return vld1q_u16(data);
  292. }
  293. inline SIMD_U16x8 LOAD_SCALAR_U16_SIMD(uint16_t a) {
  294. return vdupq_n_u16(a);
  295. }
  296. inline SIMD_U32x4 LOAD_VECTOR_U32_SIMD(uint32_t a, uint32_t b, uint32_t c, uint32_t d) {
  297. uint32_t data[4] ALIGN16 = {a, b, c, d};
  298. return vld1q_u32(data);
  299. }
  300. inline SIMD_U32x4 LOAD_SCALAR_U32_SIMD(uint32_t a) {
  301. return vdupq_n_u32(a);
  302. }
  303. inline SIMD_I32x4 LOAD_VECTOR_I32_SIMD(int32_t a, int32_t b, int32_t c, int32_t d) {
  304. int32_t data[4] ALIGN16 = {a, b, c, d};
  305. return vld1q_s32(data);
  306. }
  307. inline SIMD_I32x4 LOAD_SCALAR_I32_SIMD(int32_t a) {
  308. return vdupq_n_s32(a);
  309. }
  310. // Conversions
  311. #define F32_TO_I32_SIMD(A) vcvtq_s32_f32(A)
  312. #define F32_TO_U32_SIMD(A) vcvtq_u32_f32(A)
  313. #define I32_TO_F32_SIMD(A) vcvtq_f32_s32(A)
  314. #define U32_TO_F32_SIMD(A) vcvtq_f32_u32(A)
  315. // Unpacking conversions
  316. #define U8_LOW_TO_U16_SIMD(A) vmovl_u8(vget_low_u8(A))
  317. #define U8_HIGH_TO_U16_SIMD(A) vmovl_u8(vget_high_u8(A))
  318. #define U16_LOW_TO_U32_SIMD(A) vmovl_u16(vget_low_u16(A))
  319. #define U16_HIGH_TO_U32_SIMD(A) vmovl_u16(vget_high_u16(A))
  320. // Saturated packing
  321. #define PACK_SAT_U16_TO_U8_SIMD(A, B) vcombine_u8(vqmovn_u16(A), vqmovn_u16(B))
  322. // Reinterpret casting
  323. #define REINTERPRET_U32_TO_U8_SIMD(A) vreinterpretq_u8_u32(A)
  324. #define REINTERPRET_U32_TO_U16_SIMD(A) vreinterpretq_u16_u32(A)
  325. #define REINTERPRET_U8_TO_U32_SIMD(A) vreinterpretq_u32_u8(A)
  326. #define REINTERPRET_U16_TO_U32_SIMD(A) vreinterpretq_u32_u16(A)
  327. #define REINTERPRET_U32_TO_I32_SIMD(A) vreinterpretq_s32_u32(A)
  328. #define REINTERPRET_I32_TO_U32_SIMD(A) vreinterpretq_u32_s32(A)
  329. // Vector float operations returning SIMD_F32x4
  330. #define ADD_F32_SIMD(A, B) vaddq_f32(A, B)
  331. #define SUB_F32_SIMD(A, B) vsubq_f32(A, B)
  332. #define MUL_F32_SIMD(A, B) vmulq_f32(A, B)
  333. // Vector integer operations returning SIMD_I32x4
  334. #define ADD_I32_SIMD(A, B) vaddq_s32(A, B)
  335. #define SUB_I32_SIMD(A, B) vsubq_s32(A, B)
  336. #define MUL_I32_NEON(A, B) vmulq_s32(A, B)
  337. // Vector integer operations returning SIMD_U32x4
  338. #define ADD_U32_SIMD(A, B) vaddq_u32(A, B)
  339. #define SUB_U32_SIMD(A, B) vsubq_u32(A, B)
  340. #define MUL_U32_NEON(A, B) vmulq_u32(A, B)
  341. // Vector integer operations returning SIMD_U16x8
  342. #define ADD_U16_SIMD(A, B) vaddq_u16(A, B)
  343. #define SUB_U16_SIMD(A, B) vsubq_u16(A, B)
  344. #define MUL_U16_SIMD(A, B) vmulq_u16(A, B)
  345. // Vector integer operations returning SIMD_U8x16
  346. #define ADD_U8_SIMD(A, B) vaddq_u8(A, B)
  347. #define ADD_SAT_U8_SIMD(A, B) vqaddq_u8(A, B) // Saturated addition
  348. #define SUB_U8_SIMD(A, B) vsubq_u8(A, B)
  349. #define SUB_SAT_U8_SIMD(A, B) vqsubq_u8(A, B) // Saturated subtraction
  350. // No 8-bit multiplications
  351. // Statistics
  352. #define MIN_F32_SIMD(A, B) vminq_f32(A, B)
  353. #define MAX_F32_SIMD(A, B) vmaxq_f32(A, B)
  354. // Bitwise
  355. #define BITWISE_AND_U32_SIMD(A, B) vandq_u32(A, B)
  356. #define BITWISE_OR_U32_SIMD(A, B) vorrq_u32(A, B)
  357. #define BITWISE_XOR_U32_SIMD(A, B) veorq_u32(A, B)
  358. #endif
  359. /*
  360. The vector types below are supposed to be portable across different CPU architectures.
  361. When mixed with handwritten SIMD intrinsics:
  362. Use "USE_SSE2" instead of "__SSE2__"
  363. Use "USE_AVX2" instead of "__AVX2__"
  364. Use "USE_NEON" instead of "__ARM_NEON"
  365. So that any new variations of the macro names given by the compiler can be added to simd.h instead of being duplicated everywhere.
  366. Portability exceptions:
  367. * The "v" variable is the native backend, which is only defined when SIMD is supported by hardware.
  368. Only use when USE_BASIC_SIMD is defined.
  369. Will not work on scalar emulation.
  370. * The "scalars" array is available when emulating a type that does not exist or the SIMD vector has direct access to the memory.
  371. Do not rely on these for accessing elements, because otherwise your code will not be able to compile for ARM NEON.
  372. */
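// A sketch of mixing the portable types with handwritten intrinsics guarded by the macros above,
// where a, b and sum are hypothetical F32x4 variables:
//   #if defined USE_SSE2
//       F32x4 sum(_mm_add_ps(a.v, b.v));
//   #elif defined USE_NEON
//       F32x4 sum(vaddq_f32(a.v, b.v));
//   #else
//       F32x4 sum(a.scalars[0] + b.scalars[0], a.scalars[1] + b.scalars[1],
//                 a.scalars[2] + b.scalars[2], a.scalars[3] + b.scalars[3]);
//   #endif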
  373. union F32x4 {
  374. private:
  375. // The uninitialized default constructor is private for safety reasons.
  376. F32x4() {}
  377. public:
  378. // When the uninitialized constructor is needed for performance, use this named constructor instead.
  379. static inline F32x4 create_dangerous_uninitialized() { return F32x4(); }
  380. #ifdef USE_BASIC_SIMD
  381. public:
  382. #ifdef USE_DIRECT_SIMD_MEMORY_ACCESS
  383. // Only use if USE_DIRECT_SIMD_MEMORY_ACCESS is defined!
  384. // Direct access cannot be done on NEON!
  385. float scalars[4];
  386. #endif
  387. // The SIMD vector of undefined type
  388. // Not accessible while emulating!
  389. SIMD_F32x4 v;
  390. // Construct a portable vector from a native SIMD vector
  391. explicit F32x4(const SIMD_F32x4& v) : v(v) {}
  392. // Construct a portable vector from a set of scalars
  393. F32x4(float a1, float a2, float a3, float a4) : v(LOAD_VECTOR_F32_SIMD(a1, a2, a3, a4)) {}
  394. // Construct a portable vector from a single duplicated scalar
  395. explicit F32x4(float scalar) : v(LOAD_SCALAR_F32_SIMD(scalar)) {}
  396. #else
  397. public:
  398. // Emulate a SIMD vector as an array of scalars without hardware support.
  399. // Only accessible while emulating!
  400. float scalars[4];
  401. // Construct a portable vector from a set of scalars
  402. F32x4(float a1, float a2, float a3, float a4) {
  403. this->scalars[0] = a1;
  404. this->scalars[1] = a2;
  405. this->scalars[2] = a3;
  406. this->scalars[3] = a4;
  407. }
  408. // Construct a portable vector from a single duplicated scalar
  409. explicit F32x4(float scalar) {
  410. this->scalars[0] = scalar;
  411. this->scalars[1] = scalar;
  412. this->scalars[2] = scalar;
  413. this->scalars[3] = scalar;
  414. }
  415. #endif
  416. // Create a gradient vector using start and increment, so that arbitrary length vectors have a way to initialize linear iterations.
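// For example, createGradient(0.0f, 1.0f) creates the vector (0.0f, 1.0f, 2.0f, 3.0f).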
  417. static inline F32x4 createGradient(float start, float increment) {
  418. return F32x4(start, start + increment, start + increment * 2.0f, start + increment * 3.0f);
  419. }
  420. // Construct a portable SIMD vector from a pointer to aligned data
  421. // data must be aligned with at least 8 bytes, but preferably 16 bytes
  422. static inline F32x4 readAlignedUnsafe(const float* data) {
  423. #ifdef USE_BASIC_SIMD
  424. #if defined USE_SSE2
  425. return F32x4(_mm_load_ps(data));
  426. #elif defined USE_NEON
  427. return F32x4(vld1q_f32(data));
  428. #endif
  429. #else
  430. return F32x4(data[0], data[1], data[2], data[3]);
  431. #endif
  432. }
  433. // Write to aligned memory from the existing vector
  434. // data must be aligned with at least 8 bytes, but preferably 16 bytes
  435. inline void writeAlignedUnsafe(float* data) const {
  436. #if defined USE_BASIC_SIMD
  437. #if defined USE_SSE2
  438. _mm_store_ps(data, this->v);
  439. #elif defined USE_NEON
  440. vst1q_f32(data, this->v);
  441. #endif
  442. #else
  443. data[0] = this->scalars[0];
  444. data[1] = this->scalars[1];
  445. data[2] = this->scalars[2];
  446. data[3] = this->scalars[3];
  447. #endif
  448. }
  449. #if defined DFPSR_GEOMETRY_FVECTOR
  450. dsr::FVector4D get() const {
  451. float data[4] ALIGN16;
  452. this->writeAlignedUnsafe(data);
  453. return dsr::FVector4D(data[0], data[1], data[2], data[3]);
  454. }
  455. #endif
  456. // Bound and alignment checked reading
  457. static inline F32x4 readAligned(const dsr::SafePointer<float> data, const char* methodName) {
  458. const float* pointer = data.getUnsafe();
  459. assert(((uintptr_t)pointer & 15) == 0);
  460. #if defined SAFE_POINTER_CHECKS
  461. data.assertInside(methodName, pointer, 16);
  462. #endif
  463. return F32x4::readAlignedUnsafe(pointer);
  464. }
  465. // Bound and alignment checked writing
  466. inline void writeAligned(dsr::SafePointer<float> data, const char* methodName) const {
  467. float* pointer = data.getUnsafe();
  468. assert(((uintptr_t)pointer & 15) == 0);
  469. #if defined SAFE_POINTER_CHECKS
  470. data.assertInside(methodName, pointer, 16);
  471. #endif
  472. this->writeAlignedUnsafe(pointer);
  473. }
  474. // 1 / x
  475. // Useful for multiple divisions with the same denominator
  476. // Useless if the denominator is a constant
  477. F32x4 reciprocal() const {
  478. #if defined USE_BASIC_SIMD
  479. #if defined USE_SSE2
  480. // Approximate
  481. SIMD_F32x4 lowQ = _mm_rcp_ps(this->v);
  482. // Refine
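// One Newton-Raphson step: x1 = 2 * x0 - v * x0 * x0, which equals x0 * (2 - v * x0)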
  483. return F32x4(SUB_F32_SIMD(ADD_F32_SIMD(lowQ, lowQ), MUL_F32_SIMD(this->v, MUL_F32_SIMD(lowQ, lowQ))));
  484. #elif defined USE_NEON
  485. // Approximate
  486. SIMD_F32x4 result = vrecpeq_f32(this->v);
  487. // Refine
  488. result = MUL_F32_SIMD(vrecpsq_f32(this->v, result), result);
  489. return F32x4(MUL_F32_SIMD(vrecpsq_f32(this->v, result), result));
  490. #else
  491. assert(false);
  492. return F32x4(0);
  493. #endif
  494. #else
  495. return F32x4(1.0f / this->scalars[0], 1.0f / this->scalars[1], 1.0f / this->scalars[2], 1.0f / this->scalars[3]);
  496. #endif
  497. }
  498. // 1 / sqrt(x)
  499. // Useful for normalizing vectors
  500. F32x4 reciprocalSquareRoot() const {
  501. #if defined USE_BASIC_SIMD
  502. #if defined USE_SSE2
  503. SIMD_F32x4 reciRoot = _mm_rsqrt_ps(this->v);
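// One Newton-Raphson step for 1 / sqrt(v): x1 = 0.5 * x0 * (3 - v * x0 * x0)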
  504. SIMD_F32x4 mul = MUL_F32_SIMD(MUL_F32_SIMD(this->v, reciRoot), reciRoot);
  505. reciRoot = MUL_F32_SIMD(MUL_F32_SIMD(LOAD_SCALAR_F32_SIMD(0.5f), reciRoot), SUB_F32_SIMD(LOAD_SCALAR_F32_SIMD(3.0f), mul));
  506. return F32x4(reciRoot);
  507. #elif defined USE_NEON
  508. // Approximate
  509. SIMD_F32x4 reciRoot = vrsqrteq_f32(this->v);
  510. // Refine
  511. reciRoot = MUL_F32_SIMD(vrsqrtsq_f32(MUL_F32_SIMD(this->v, reciRoot), reciRoot), reciRoot);
  512. return F32x4(reciRoot);
  513. #else
  514. assert(false);
  515. return F32x4(0);
  516. #endif
  517. #else
  518. return F32x4(1.0f / sqrt(this->scalars[0]), 1.0f / sqrt(this->scalars[1]), 1.0f / sqrt(this->scalars[2]), 1.0f / sqrt(this->scalars[3]));
  519. #endif
  520. }
  521. // sqrt(x)
  522. // Useful for getting lengths of vectors
  523. F32x4 squareRoot() const {
  524. #if defined USE_BASIC_SIMD
  525. #if defined USE_SSE2
  526. SIMD_F32x4 half = LOAD_SCALAR_F32_SIMD(0.5f);
  527. // Approximate
  528. SIMD_F32x4 root = _mm_sqrt_ps(this->v);
  529. // Refine
  530. root = _mm_mul_ps(_mm_add_ps(root, _mm_div_ps(this->v, root)), half);
  531. return F32x4(root);
  532. #elif defined USE_NEON
  533. return F32x4(MUL_F32_SIMD(this->v, this->reciprocalSquareRoot().v));
  534. #else
  535. assert(false);
  536. return F32x4(0);
  537. #endif
  538. #else
  539. return F32x4(sqrt(this->scalars[0]), sqrt(this->scalars[1]), sqrt(this->scalars[2]), sqrt(this->scalars[3]));
  540. #endif
  541. }
  542. F32x4 clamp(float minimum, float maximum) const {
  543. #if defined USE_BASIC_SIMD
  544. return F32x4(MIN_F32_SIMD(MAX_F32_SIMD(this->v, LOAD_SCALAR_F32_SIMD(minimum)), LOAD_SCALAR_F32_SIMD(maximum)));
  545. #else
  546. float val0 = this->scalars[0];
  547. float val1 = this->scalars[1];
  548. float val2 = this->scalars[2];
  549. float val3 = this->scalars[3];
  550. if (minimum > val0) { val0 = minimum; }
  551. if (maximum < val0) { val0 = maximum; }
  552. if (minimum > val1) { val1 = minimum; }
  553. if (maximum < val1) { val1 = maximum; }
  554. if (minimum > val2) { val2 = minimum; }
  555. if (maximum < val2) { val2 = maximum; }
  556. if (minimum > val3) { val3 = minimum; }
  557. if (maximum < val3) { val3 = maximum; }
  558. return F32x4(val0, val1, val2, val3);
  559. #endif
  560. }
  561. F32x4 clampLower(float minimum) const {
  562. #if defined USE_BASIC_SIMD
  563. return F32x4(MAX_F32_SIMD(this->v, LOAD_SCALAR_F32_SIMD(minimum)));
  564. #else
  565. float val0 = this->scalars[0];
  566. float val1 = this->scalars[1];
  567. float val2 = this->scalars[2];
  568. float val3 = this->scalars[3];
  569. if (minimum > val0) { val0 = minimum; }
  570. if (minimum > val1) { val1 = minimum; }
  571. if (minimum > val2) { val2 = minimum; }
  572. if (minimum > val3) { val3 = minimum; }
  573. return F32x4(val0, val1, val2, val3);
  574. #endif
  575. }
  576. F32x4 clampUpper(float maximum) const {
  577. #if defined USE_BASIC_SIMD
  578. return F32x4(MIN_F32_SIMD(this->v, LOAD_SCALAR_F32_SIMD(maximum)));
  579. #else
  580. float val0 = this->scalars[0];
  581. float val1 = this->scalars[1];
  582. float val2 = this->scalars[2];
  583. float val3 = this->scalars[3];
  584. if (maximum < val0) { val0 = maximum; }
  585. if (maximum < val1) { val1 = maximum; }
  586. if (maximum < val2) { val2 = maximum; }
  587. if (maximum < val3) { val3 = maximum; }
  588. return F32x4(val0, val1, val2, val3);
  589. #endif
  590. }
  591. };
  592. union I32x4 {
  593. private:
  594. // The uninitialized default constructor is private for safety reasons.
  595. I32x4() {}
  596. public:
  597. // When the uninitialized constructor is needed for performance, use this named constructor instead.
  598. static inline I32x4 create_dangerous_uninitialized() { return I32x4(); }
  599. #if defined USE_BASIC_SIMD
  600. public:
  601. #if defined USE_DIRECT_SIMD_MEMORY_ACCESS
  602. // Only use if USE_DIRECT_SIMD_MEMORY_ACCESS is defined!
  603. // Direct access cannot be done on NEON!
  604. int32_t scalars[4];
  605. #endif
  606. // The SIMD vector of undefined type
  607. // Not accessible while emulating!
  608. SIMD_I32x4 v;
  609. // Construct a portable vector from a native SIMD vector
  610. explicit I32x4(const SIMD_I32x4& v) : v(v) {}
  611. // Construct a portable vector from a set of scalars
  612. I32x4(int32_t a1, int32_t a2, int32_t a3, int32_t a4) : v(LOAD_VECTOR_I32_SIMD(a1, a2, a3, a4)) {}
  613. // Construct a portable vector from a single duplicated scalar
  614. explicit I32x4(int32_t scalar) : v(LOAD_SCALAR_I32_SIMD(scalar)) {}
  615. #else
  616. public:
  617. // Emulate a SIMD vector as an array of scalars without hardware support.
  618. // Only accessible while emulating!
  619. int32_t scalars[4];
  620. // Construct a portable vector from a set of scalars
  621. I32x4(int32_t a1, int32_t a2, int32_t a3, int32_t a4) {
  622. this->scalars[0] = a1;
  623. this->scalars[1] = a2;
  624. this->scalars[2] = a3;
  625. this->scalars[3] = a4;
  626. }
  627. // Construct a portable vector from a single duplicated scalar
  628. explicit I32x4(int32_t scalar) {
  629. this->scalars[0] = scalar;
  630. this->scalars[1] = scalar;
  631. this->scalars[2] = scalar;
  632. this->scalars[3] = scalar;
  633. }
  634. #endif
  635. // Create a gradient vector using start and increment, so that arbitrary length vectors have a way to initialize linear iterations.
  636. static inline I32x4 createGradient(int32_t start, int32_t increment) {
  637. return I32x4(start, start + increment, start + increment * 2, start + increment * 3);
  638. }
  639. // Construct a portable SIMD vector from a pointer to aligned data
  640. // data must be aligned with at least 8 bytes, but preferably 16 bytes
  641. static inline I32x4 readAlignedUnsafe(const int32_t* data) {
  642. #if defined USE_BASIC_SIMD
  643. #if defined USE_SSE2
  644. return I32x4(_mm_load_si128((const __m128i*)data));
  645. #elif defined USE_NEON
  646. return I32x4(vld1q_s32(data));
  647. #endif
  648. #else
  649. return I32x4(data[0], data[1], data[2], data[3]);
  650. #endif
  651. }
  652. // Write to aligned memory from the existing vector
  653. // data must be aligned with at least 8 bytes, but preferably 16 bytes
  654. inline void writeAlignedUnsafe(int32_t* data) const {
  655. #if defined USE_BASIC_SIMD
  656. #if defined USE_SSE2
  657. _mm_store_si128((__m128i*)data, this->v);
  658. #elif defined USE_NEON
  659. vst1q_s32(data, this->v);
  660. #endif
  661. #else
  662. data[0] = this->scalars[0];
  663. data[1] = this->scalars[1];
  664. data[2] = this->scalars[2];
  665. data[3] = this->scalars[3];
  666. #endif
  667. }
  668. #if defined DFPSR_GEOMETRY_IVECTOR
  669. dsr::IVector4D get() const {
  670. int32_t data[4] ALIGN16;
  671. this->writeAlignedUnsafe(data);
  672. return dsr::IVector4D(data[0], data[1], data[2], data[3]);
  673. }
  674. #endif
  675. // Bound and alignment checked reading
  676. static inline I32x4 readAligned(const dsr::SafePointer<int32_t> data, const char* methodName) {
  677. const int32_t* pointer = data.getUnsafe();
  678. assert(((uintptr_t)pointer & 15) == 0);
  679. #if defined SAFE_POINTER_CHECKS
  680. data.assertInside(methodName, pointer, 16);
  681. #endif
  682. return I32x4::readAlignedUnsafe(pointer);
  683. }
  684. // Bound and alignment checked writing
  685. inline void writeAligned(dsr::SafePointer<int32_t> data, const char* methodName) const {
  686. int32_t* pointer = data.getUnsafe();
  687. assert(((uintptr_t)pointer & 15) == 0);
  688. #if defined SAFE_POINTER_CHECKS
  689. data.assertInside(methodName, pointer, 16);
  690. #endif
  691. this->writeAlignedUnsafe(pointer);
  692. }
  693. };
  694. union U32x4 {
  695. #if defined USE_BASIC_SIMD
  696. public:
  697. #if defined USE_DIRECT_SIMD_MEMORY_ACCESS
  698. // Only use if USE_DIRECT_SIMD_MEMORY_ACCESS is defined!
  699. // Direct access cannot be done on NEON!
  700. uint32_t scalars[4];
  701. #endif
  702. // The SIMD vector of undefined type
  703. // Not accessible while emulating!
  704. SIMD_U32x4 v;
  705. // Construct a portable vector from a native SIMD vector
  706. explicit U32x4(const SIMD_U32x4& v) : v(v) {}
  707. // Construct a portable vector from a set of scalars
  708. U32x4(uint32_t a1, uint32_t a2, uint32_t a3, uint32_t a4) : v(LOAD_VECTOR_U32_SIMD(a1, a2, a3, a4)) {}
  709. // Construct a portable vector from a single duplicated scalar
  710. explicit U32x4(uint32_t scalar) : v(LOAD_SCALAR_U32_SIMD(scalar)) {}
  711. #else
  712. public:
  713. // Emulate a SIMD vector as an array of scalars without hardware support.
  714. // Only accessible while emulating!
  715. uint32_t scalars[4];
  716. // Construct a portable vector from a set of scalars
  717. U32x4(uint32_t a1, uint32_t a2, uint32_t a3, uint32_t a4) {
  718. this->scalars[0] = a1;
  719. this->scalars[1] = a2;
  720. this->scalars[2] = a3;
  721. this->scalars[3] = a4;
  722. }
  723. // Construct a portable vector from a single duplicated scalar
  724. explicit U32x4(uint32_t scalar) {
  725. this->scalars[0] = scalar;
  726. this->scalars[1] = scalar;
  727. this->scalars[2] = scalar;
  728. this->scalars[3] = scalar;
  729. }
  730. #endif
  731. // Create a gradient vector using start and increment, so that arbitrary length vectors have a way to initialize linear iterations.
  732. static inline U32x4 createGradient(uint32_t start, uint32_t increment) {
  733. return U32x4(start, start + increment, start + increment * 2, start + increment * 3);
  734. }
  735. // Construct a portable SIMD vector from a pointer to aligned data
  736. // data must be aligned with at least 8 bytes, but preferably 16 bytes
  737. static inline U32x4 readAlignedUnsafe(const uint32_t* data) {
  738. #if defined USE_BASIC_SIMD
  739. #if defined USE_SSE2
  740. return U32x4(_mm_load_si128((const __m128i*)data));
  741. #elif defined USE_NEON
  742. return U32x4(vld1q_u32(data));
  743. #endif
  744. #else
  745. return U32x4(data[0], data[1], data[2], data[3]);
  746. #endif
  747. }
  748. // Write to aligned memory from the existing vector
  749. // data must be aligned with at least 8 bytes, but preferably 16 bytes
  750. inline void writeAlignedUnsafe(uint32_t* data) const {
  751. #if defined USE_BASIC_SIMD
  752. #if defined USE_SSE2
  753. _mm_store_si128((__m128i*)data, this->v);
  754. #elif defined USE_NEON
  755. vst1q_u32(data, this->v);
  756. #endif
  757. #else
  758. data[0] = this->scalars[0];
  759. data[1] = this->scalars[1];
  760. data[2] = this->scalars[2];
  761. data[3] = this->scalars[3];
  762. #endif
  763. }
  764. #if defined DFPSR_GEOMETRY_UVECTOR
  765. dsr::UVector4D get() const {
  766. uint32_t data[4] ALIGN16;
  767. this->writeAlignedUnsafe(data);
  768. return dsr::UVector4D(data[0], data[1], data[2], data[3]);
  769. }
  770. #endif
  771. // Bound and alignment checked reading
  772. static inline U32x4 readAligned(const dsr::SafePointer<uint32_t> data, const char* methodName) {
  773. const uint32_t* pointer = data.getUnsafe();
  774. assert(((uintptr_t)pointer & 15) == 0);
  775. #if defined SAFE_POINTER_CHECKS
  776. data.assertInside(methodName, pointer, 16);
  777. #endif
  778. return U32x4::readAlignedUnsafe(pointer);
  779. }
  780. // Bound and alignment checked writing
  781. inline void writeAligned(dsr::SafePointer<uint32_t> data, const char* methodName) const {
  782. uint32_t* pointer = data.getUnsafe();
  783. assert(((uintptr_t)pointer & 15) == 0);
  784. #if defined SAFE_POINTER_CHECKS
  785. data.assertInside(methodName, pointer, 16);
  786. #endif
  787. this->writeAlignedUnsafe(pointer);
  788. }
  789. };
  790. union U16x8 {
  791. private:
  792. // The uninitialized default constructor is private for safety reasons.
  793. U16x8() {}
  794. public:
  795. // When the uninitialized constructor is needed for performance, use this named constructor instead.
  796. static inline U16x8 create_dangerous_uninitialized() { return U16x8(); }
  797. #if defined USE_BASIC_SIMD
  798. public:
  799. #if defined USE_DIRECT_SIMD_MEMORY_ACCESS
  800. // Only use if USE_DIRECT_SIMD_MEMORY_ACCESS is defined!
  801. // Direct access cannot be done on NEON!
  802. uint16_t scalars[8];
  803. #endif
  804. // The SIMD vector of undefined type
  805. // Not accessible while emulating!
  806. SIMD_U16x8 v;
  807. // Construct a portable vector from a native SIMD vector
  808. explicit U16x8(const SIMD_U16x8& v) : v(v) {}
  809. // Construct a vector of 8 x 16-bit unsigned integers from a vector of 4 x 32-bit unsigned integers
  810. // Reinterpret casting is used
  811. explicit U16x8(const U32x4& vector) : v(REINTERPRET_U32_TO_U16_SIMD(vector.v)) {}
  812. // Construct a portable vector from a set of scalars
  813. U16x8(uint16_t a1, uint16_t a2, uint16_t a3, uint16_t a4, uint16_t a5, uint16_t a6, uint16_t a7, uint16_t a8) : v(LOAD_VECTOR_U16_SIMD(a1, a2, a3, a4, a5, a6, a7, a8)) {}
  814. // Construct a vector of 8 x 16-bit unsigned integers from a single duplicated 32-bit unsigned integer
  815. // Reinterpret casting is used
  816. // TODO: Remove all reintreprets from constructors to improve readability
  817. explicit U16x8(uint32_t scalar) : v(REINTERPRET_U32_TO_U16_SIMD(LOAD_SCALAR_U32_SIMD(scalar))) {}
  818. // Construct a portable vector from a single duplicated scalar
  819. explicit U16x8(uint16_t scalar) : v(LOAD_SCALAR_U16_SIMD(scalar)) {}
  820. // Reinterpret cast to a vector of 4 x 32-bit unsigned integers
  821. U32x4 get_U32() const {
  822. return U32x4(REINTERPRET_U16_TO_U32_SIMD(this->v));
  823. }
  824. #else
  825. public:
  826. // Emulate a SIMD vector as an array of scalars without hardware support.
  827. // Only accessible while emulating!
  828. uint16_t scalars[8];
  829. // Construct a vector of 8 x 16-bit unsigned integers from a vector of 4 x 32-bit unsigned integers
  830. // Reinterpret casting is used
  831. explicit U16x8(const U32x4& vector) {
  832. uint64_t *target = (uint64_t*)this->scalars;
  833. uint64_t *source = (uint64_t*)vector.scalars;
  834. target[0] = source[0];
  835. target[1] = source[1];
  836. }
  837. // Construct a portable vector from a set of scalars
  838. U16x8(uint16_t a1, uint16_t a2, uint16_t a3, uint16_t a4, uint16_t a5, uint16_t a6, uint16_t a7, uint16_t a8) {
  839. this->scalars[0] = a1;
  840. this->scalars[1] = a2;
  841. this->scalars[2] = a3;
  842. this->scalars[3] = a4;
  843. this->scalars[4] = a5;
  844. this->scalars[5] = a6;
  845. this->scalars[6] = a7;
  846. this->scalars[7] = a8;
  847. }
  848. // Construct a vector of 8 x 16-bit unsigned integers from a single duplicated 32-bit unsigned integer
  849. // Reinterpret casting is used
  850. explicit U16x8(uint32_t scalar) {
  851. uint32_t *target = (uint32_t*)this->scalars;
  852. target[0] = scalar;
  853. target[1] = scalar;
  854. target[2] = scalar;
  855. target[3] = scalar;
  856. }
  857. // Construct a portable vector from a single duplicated scalar
  858. explicit U16x8(uint16_t scalar) {
  859. this->scalars[0] = scalar;
  860. this->scalars[1] = scalar;
  861. this->scalars[2] = scalar;
  862. this->scalars[3] = scalar;
  863. this->scalars[4] = scalar;
  864. this->scalars[5] = scalar;
  865. this->scalars[6] = scalar;
  866. this->scalars[7] = scalar;
  867. }
  868. // Reinterpret cast to a vector of 4 x 32-bit unsigned integers
  869. U32x4 get_U32() const {
  870. U32x4 result(0);
  871. uint64_t *target = (uint64_t*)result.scalars;
  872. uint64_t *source = (uint64_t*)this->scalars;
  873. target[0] = source[0];
  874. target[1] = source[1];
  875. return result;
  876. }
  877. #endif
  878. // Create a gradient vector using start and increment, so that arbitrary length vectors have a way to initialize linear iterations.
  879. static inline U16x8 createGradient(uint16_t start, uint16_t increment) {
  880. return U16x8(
  881. start,
  882. start + increment,
  883. start + increment * 2,
  884. start + increment * 3,
  885. start + increment * 4,
  886. start + increment * 5,
  887. start + increment * 6,
  888. start + increment * 7
  889. );
  890. }
  891. static inline U16x8 readAlignedUnsafe(const uint16_t* data) {
  892. #if defined USE_BASIC_SIMD
  893. #if defined USE_SSE2
  894. return U16x8(_mm_load_si128((const __m128i*)data));
  895. #elif defined USE_NEON
  896. return U16x8(vld1q_u16(data));
  897. #endif
  898. #else
  899. return U16x8(data[0], data[1], data[2], data[3], data[4], data[5], data[6], data[7]);
  900. #endif
  901. }
902. // data must be aligned to 16 bytes; the aligned SIMD store and the checks in writeAligned assume this
  903. inline void writeAlignedUnsafe(uint16_t* data) const {
  904. #if defined USE_BASIC_SIMD
  905. #if defined USE_SSE2
  906. _mm_store_si128((__m128i*)data, this->v);
  907. #elif defined USE_NEON
  908. vst1q_u16(data, this->v);
  909. #endif
  910. #else
  911. data[0] = this->scalars[0];
  912. data[1] = this->scalars[1];
  913. data[2] = this->scalars[2];
  914. data[3] = this->scalars[3];
  915. data[4] = this->scalars[4];
  916. data[5] = this->scalars[5];
  917. data[6] = this->scalars[6];
  918. data[7] = this->scalars[7];
  919. #endif
  920. }
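// Example round-trip through 16-byte aligned memory (illustrative only):
//   ALIGN_BYTES(16) uint16_t buffer[8] = {1, 2, 3, 4, 5, 6, 7, 8};
//   U16x8 value = U16x8::readAlignedUnsafe(buffer);
//   value.writeAlignedUnsafe(buffer);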
  921. // Bound and alignment checked reading
  922. static inline U16x8 readAligned(const dsr::SafePointer<uint16_t> data, const char* methodName) {
  923. const uint16_t* pointer = data.getUnsafe();
  924. assert(((uintptr_t)pointer & 15) == 0);
  925. #if defined SAFE_POINTER_CHECKS
  926. data.assertInside(methodName, pointer, 16);
  927. #endif
  928. return U16x8::readAlignedUnsafe(pointer);
  929. }
  930. // Bound and alignment checked writing
  931. inline void writeAligned(dsr::SafePointer<uint16_t> data, const char* methodName) const {
  932. uint16_t* pointer = data.getUnsafe();
  933. assert(((uintptr_t)pointer & 15) == 0);
  934. #if defined SAFE_POINTER_CHECKS
  935. data.assertInside(methodName, pointer, 16);
  936. #endif
  937. this->writeAlignedUnsafe(pointer);
  938. }
  939. };
  940. union U8x16 {
  941. private:
  942. // The uninitialized default constructor is private for safety reasons.
  943. U8x16() {}
  944. public:
  945. // When the uninitialized constructor is needed for performance, use this named constructor instead.
  946. static inline U8x16 create_dangerous_uninitialized() { return U8x16(); }
  947. #if defined USE_BASIC_SIMD
  948. public:
  949. #if defined USE_DIRECT_SIMD_MEMORY_ACCESS
  950. // Only use if USE_DIRECT_SIMD_MEMORY_ACCESS is defined!
  951. // Direct access cannot be done on NEON!
  952. uint8_t scalars[16];
  953. #endif
  954. // The SIMD vector of undefined type
  955. // Not accessible while emulating!
  956. SIMD_U8x16 v;
  957. // Construct a portable vector from a native SIMD vector
  958. explicit U8x16(const SIMD_U8x16& v) : v(v) {}
  959. // Construct a portable vector from a set of scalars
  960. U8x16(uint8_t a1, uint8_t a2, uint8_t a3, uint8_t a4, uint8_t a5, uint8_t a6, uint8_t a7, uint8_t a8,
  961. uint8_t a9, uint8_t a10, uint8_t a11, uint8_t a12, uint8_t a13, uint8_t a14, uint8_t a15, uint8_t a16)
  962. : v(LOAD_VECTOR_U8_SIMD(a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11, a12, a13, a14, a15, a16)) {}
  963. // Construct a portable vector from a single duplicated scalar
  964. explicit U8x16(uint8_t scalar) : v(LOAD_SCALAR_U8_SIMD(scalar)) {}
  965. #else
  966. public:
  967. // Emulate a SIMD vector as an array of scalars without hardware support.
  968. // Only accessible while emulating!
  969. uint8_t scalars[16];
  970. // Construct a portable vector from a set of scalars
  971. U8x16(uint8_t a1, uint8_t a2, uint8_t a3, uint8_t a4, uint8_t a5, uint8_t a6, uint8_t a7, uint8_t a8,
  972. uint8_t a9, uint8_t a10, uint8_t a11, uint8_t a12, uint8_t a13, uint8_t a14, uint8_t a15, uint8_t a16) {
  973. this->scalars[0] = a1;
  974. this->scalars[1] = a2;
  975. this->scalars[2] = a3;
  976. this->scalars[3] = a4;
  977. this->scalars[4] = a5;
  978. this->scalars[5] = a6;
  979. this->scalars[6] = a7;
  980. this->scalars[7] = a8;
  981. this->scalars[8] = a9;
  982. this->scalars[9] = a10;
  983. this->scalars[10] = a11;
  984. this->scalars[11] = a12;
  985. this->scalars[12] = a13;
  986. this->scalars[13] = a14;
  987. this->scalars[14] = a15;
  988. this->scalars[15] = a16;
  989. }
  990. // Construct a portable vector from a single duplicated scalar
  991. explicit U8x16(uint8_t scalar) {
  992. this->scalars[0] = scalar;
  993. this->scalars[1] = scalar;
  994. this->scalars[2] = scalar;
  995. this->scalars[3] = scalar;
  996. this->scalars[4] = scalar;
  997. this->scalars[5] = scalar;
  998. this->scalars[6] = scalar;
  999. this->scalars[7] = scalar;
  1000. this->scalars[8] = scalar;
  1001. this->scalars[9] = scalar;
  1002. this->scalars[10] = scalar;
  1003. this->scalars[11] = scalar;
  1004. this->scalars[12] = scalar;
  1005. this->scalars[13] = scalar;
  1006. this->scalars[14] = scalar;
  1007. this->scalars[15] = scalar;
  1008. }
  1009. #endif
  1010. // Create a gradient vector using start and increment, so that arbitrary length vectors have a way to initialize linear iterations.
  1011. static inline U8x16 createGradient(uint8_t start, uint8_t increment) {
  1012. return U8x16(
  1013. start,
  1014. start + increment,
  1015. start + increment * 2,
  1016. start + increment * 3,
  1017. start + increment * 4,
  1018. start + increment * 5,
  1019. start + increment * 6,
  1020. start + increment * 7,
  1021. start + increment * 8,
  1022. start + increment * 9,
  1023. start + increment * 10,
  1024. start + increment * 11,
  1025. start + increment * 12,
  1026. start + increment * 13,
  1027. start + increment * 14,
  1028. start + increment * 15
  1029. );
  1030. }
  1031. static inline U8x16 readAlignedUnsafe(const uint8_t* data) {
  1032. #if defined USE_BASIC_SIMD
  1033. #if defined USE_SSE2
  1034. return U8x16(_mm_load_si128((const __m128i*)data));
  1035. #elif defined USE_NEON
  1036. return U8x16(vld1q_u8(data));
  1037. #endif
  1038. #else
  1039. return U8x16(
  1040. data[0], data[1], data[2], data[3], data[4], data[5], data[6], data[7],
  1041. data[8], data[9], data[10], data[11], data[12], data[13], data[14], data[15]
  1042. );
  1043. #endif
  1044. }
1045. // data must be aligned to 16 bytes; the aligned SIMD store and the checks in writeAligned assume this
  1046. inline void writeAlignedUnsafe(uint8_t* data) const {
  1047. #if defined USE_BASIC_SIMD
  1048. #if defined USE_SSE2
  1049. _mm_store_si128((__m128i*)data, this->v);
  1050. #elif defined USE_NEON
  1051. vst1q_u8(data, this->v);
  1052. #endif
  1053. #else
  1054. data[0] = this->scalars[0];
  1055. data[1] = this->scalars[1];
  1056. data[2] = this->scalars[2];
  1057. data[3] = this->scalars[3];
  1058. data[4] = this->scalars[4];
  1059. data[5] = this->scalars[5];
  1060. data[6] = this->scalars[6];
  1061. data[7] = this->scalars[7];
  1062. data[8] = this->scalars[8];
  1063. data[9] = this->scalars[9];
  1064. data[10] = this->scalars[10];
  1065. data[11] = this->scalars[11];
  1066. data[12] = this->scalars[12];
  1067. data[13] = this->scalars[13];
  1068. data[14] = this->scalars[14];
  1069. data[15] = this->scalars[15];
  1070. #endif
  1071. }
  1072. // Bound and alignment checked reading
  1073. static inline U8x16 readAligned(const dsr::SafePointer<uint8_t> data, const char* methodName) {
  1074. const uint8_t* pointer = data.getUnsafe();
  1075. assert(((uintptr_t)pointer & 15) == 0);
  1076. #if defined SAFE_POINTER_CHECKS
  1077. data.assertInside(methodName, pointer, 16);
  1078. #endif
  1079. return U8x16::readAlignedUnsafe(pointer);
  1080. }
  1081. // Bound and alignment checked writing
  1082. inline void writeAligned(dsr::SafePointer<uint8_t> data, const char* methodName) const {
  1083. uint8_t* pointer = data.getUnsafe();
  1084. assert(((uintptr_t)pointer & 15) == 0);
  1085. #if defined SAFE_POINTER_CHECKS
  1086. data.assertInside(methodName, pointer, 16);
  1087. #endif
  1088. this->writeAlignedUnsafe(pointer);
  1089. }
  1090. };
  1091. union F32x8 {
  1092. private:
  1093. // The uninitialized default constructor is private for safety reasons.
  1094. F32x8() {}
  1095. public:
  1096. // When the uninitialized constructor is needed for performance, use this named constructor instead.
  1097. static inline F32x8 create_dangerous_uninitialized() { return F32x8(); }
  1098. #if defined USE_256BIT_F_SIMD
  1099. public:
  1100. #if defined USE_DIRECT_SIMD_MEMORY_ACCESS
  1101. // Only use if USE_DIRECT_SIMD_MEMORY_ACCESS is defined!
  1102. float scalars[8];
  1103. #endif
  1104. // The SIMD vector of undefined type
  1105. // Not accessible while emulating!
  1106. SIMD_F32x8 v;
  1107. // Construct a portable vector from a native SIMD vector
  1108. explicit F32x8(const SIMD_F32x8& v) : v(v) {}
  1109. // Construct a portable vector from a set of scalars
  1110. F32x8(float a1, float a2, float a3, float a4, float a5, float a6, float a7, float a8)
  1111. : v(LOAD_VECTOR_F32_SIMD256(a1, a2, a3, a4, a5, a6, a7, a8)) {}
  1112. // Construct a portable vector from a single duplicated scalar
  1113. explicit F32x8(float scalar) : v(LOAD_SCALAR_F32_SIMD256(scalar)) {}
  1114. #else
  1115. public:
  1116. // Emulate a SIMD vector as an array of scalars without hardware support.
  1117. // Only accessible while emulating!
  1118. float scalars[8];
  1119. // Construct a portable vector from a set of scalars
  1120. F32x8(float a1, float a2, float a3, float a4, float a5, float a6, float a7, float a8) {
  1121. this->scalars[0] = a1;
  1122. this->scalars[1] = a2;
  1123. this->scalars[2] = a3;
  1124. this->scalars[3] = a4;
  1125. this->scalars[4] = a5;
  1126. this->scalars[5] = a6;
  1127. this->scalars[6] = a7;
  1128. this->scalars[7] = a8;
  1129. }
  1130. // Construct a portable vector from a single duplicated scalar
  1131. explicit F32x8(float scalar) {
  1132. this->scalars[0] = scalar;
  1133. this->scalars[1] = scalar;
  1134. this->scalars[2] = scalar;
  1135. this->scalars[3] = scalar;
  1136. this->scalars[4] = scalar;
  1137. this->scalars[5] = scalar;
  1138. this->scalars[6] = scalar;
  1139. this->scalars[7] = scalar;
  1140. }
  1141. #endif
  1142. // Create a gradient vector using start and increment, so that arbitrary length vectors have a way to initialize linear iterations.
  1143. static inline F32x8 createGradient(float start, float increment) {
  1144. return F32x8(
  1145. start,
  1146. start + increment,
  1147. start + increment * 2.0f,
  1148. start + increment * 3.0f,
  1149. start + increment * 4.0f,
  1150. start + increment * 5.0f,
  1151. start + increment * 6.0f,
  1152. start + increment * 7.0f
  1153. );
  1154. }
  1155. // Construct a portable SIMD vector from a pointer to aligned data
1156. // data must be aligned to 32 bytes; the aligned AVX load and the checks in readAligned assume this
  1157. static inline F32x8 readAlignedUnsafe(const float* data) {
  1158. #if defined USE_AVX2
  1159. return F32x8(_mm256_load_ps(data));
  1160. #else
  1161. return F32x8(data[0], data[1], data[2], data[3], data[4], data[5], data[6], data[7]);
  1162. #endif
  1163. }
  1164. // Write to aligned memory from the existing vector
1165. // data must be aligned to 32 bytes; the aligned AVX store and the checks in writeAligned assume this
  1166. inline void writeAlignedUnsafe(float* data) const {
  1167. #if defined USE_AVX2
  1168. _mm256_store_ps(data, this->v);
  1169. #else
  1170. data[0] = this->scalars[0];
  1171. data[1] = this->scalars[1];
  1172. data[2] = this->scalars[2];
  1173. data[3] = this->scalars[3];
  1174. data[4] = this->scalars[4];
  1175. data[5] = this->scalars[5];
  1176. data[6] = this->scalars[6];
  1177. data[7] = this->scalars[7];
  1178. #endif
  1179. }
  1180. // Bound and alignment checked reading
  1181. static inline F32x8 readAligned(const dsr::SafePointer<float> data, const char* methodName) {
  1182. const float* pointer = data.getUnsafe();
  1183. assert(((uintptr_t)pointer & 31) == 0);
  1184. #if defined SAFE_POINTER_CHECKS
  1185. data.assertInside(methodName, pointer, 32);
  1186. #endif
  1187. return F32x8::readAlignedUnsafe(pointer);
  1188. }
  1189. // Bound and alignment checked writing
  1190. inline void writeAligned(dsr::SafePointer<float> data, const char* methodName) const {
  1191. float* pointer = data.getUnsafe();
  1192. assert(((uintptr_t)pointer & 31) == 0);
  1193. #if defined SAFE_POINTER_CHECKS
  1194. data.assertInside(methodName, pointer, 32);
  1195. #endif
  1196. this->writeAlignedUnsafe(pointer);
  1197. }
  1198. // 1 / x
  1199. // Useful for multiple divisions with the same denominator
  1200. // Useless if the denominator is a constant
  1201. F32x8 reciprocal() const {
  1202. #if defined USE_AVX2
  1203. // Approximate
  1204. SIMD_F32x8 lowQ = _mm256_rcp_ps(this->v);
  1205. // Refine
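// One Newton-Raphson step for the reciprocal: y1 = y0 * (2 - x * y0) = 2*y0 - x*y0*y0,
// which roughly doubles the number of correct bits in the estimate.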
  1206. return F32x8(SUB_F32_SIMD256(ADD_F32_SIMD256(lowQ, lowQ), MUL_F32_SIMD256(this->v, MUL_F32_SIMD256(lowQ, lowQ))));
  1207. #else
  1208. return F32x8(
  1209. 1.0f / this->scalars[0],
  1210. 1.0f / this->scalars[1],
  1211. 1.0f / this->scalars[2],
  1212. 1.0f / this->scalars[3],
  1213. 1.0f / this->scalars[4],
  1214. 1.0f / this->scalars[5],
  1215. 1.0f / this->scalars[6],
  1216. 1.0f / this->scalars[7]
  1217. );
  1218. #endif
  1219. }
  1220. // 1 / sqrt(x)
  1221. // Useful for normalizing vectors
  1222. F32x8 reciprocalSquareRoot() const {
  1223. #if defined USE_AVX2
  1225. SIMD_F32x8 reciRoot = _mm256_rsqrt_ps(this->v);
  1226. SIMD_F32x8 mul = MUL_F32_SIMD256(MUL_F32_SIMD256(this->v, reciRoot), reciRoot);
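// One Newton-Raphson step for the reciprocal square root: y1 = 0.5 * y0 * (3 - x * y0 * y0),
// refining the low-precision estimate from _mm256_rsqrt_ps.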
  1227. reciRoot = MUL_F32_SIMD256(MUL_F32_SIMD256(LOAD_SCALAR_F32_SIMD256(0.5f), reciRoot), SUB_F32_SIMD256(LOAD_SCALAR_F32_SIMD256(3.0f), mul));
  1228. return F32x8(reciRoot);
  1229. #else
  1230. return F32x8(
  1231. 1.0f / sqrt(this->scalars[0]),
  1232. 1.0f / sqrt(this->scalars[1]),
  1233. 1.0f / sqrt(this->scalars[2]),
  1234. 1.0f / sqrt(this->scalars[3]),
  1235. 1.0f / sqrt(this->scalars[4]),
  1236. 1.0f / sqrt(this->scalars[5]),
  1237. 1.0f / sqrt(this->scalars[6]),
  1238. 1.0f / sqrt(this->scalars[7])
  1239. );
  1240. #endif
  1241. }
  1242. // sqrt(x)
  1243. // Useful for getting lengths of vectors
  1244. F32x8 squareRoot() const {
  1245. #if defined USE_AVX2
  1246. SIMD_F32x8 half = LOAD_SCALAR_F32_SIMD256(0.5f);
  1247. // Approximate
  1248. SIMD_F32x8 root = _mm256_sqrt_ps(this->v);
  1249. // Refine
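// One Heron/Newton-Raphson step: root = 0.5 * (root + x / root).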
  1250. root = _mm256_mul_ps(_mm256_add_ps(root, _mm256_div_ps(this->v, root)), half);
  1251. return F32x8(root);
  1252. #else
  1253. return F32x8(
  1254. sqrt(this->scalars[0]),
  1255. sqrt(this->scalars[1]),
  1256. sqrt(this->scalars[2]),
  1257. sqrt(this->scalars[3]),
  1258. sqrt(this->scalars[4]),
  1259. sqrt(this->scalars[5]),
  1260. sqrt(this->scalars[6]),
  1261. sqrt(this->scalars[7]));
  1262. #endif
  1263. }
  1264. F32x8 clamp(float minimum, float maximum) const {
  1265. #if defined USE_256BIT_F_SIMD
  1266. return F32x8(MIN_F32_SIMD256(MAX_F32_SIMD256(this->v, LOAD_SCALAR_F32_SIMD256(minimum)), LOAD_SCALAR_F32_SIMD256(maximum)));
  1267. #else
  1268. float val0 = this->scalars[0];
  1269. float val1 = this->scalars[1];
  1270. float val2 = this->scalars[2];
  1271. float val3 = this->scalars[3];
  1272. float val4 = this->scalars[4];
  1273. float val5 = this->scalars[5];
  1274. float val6 = this->scalars[6];
  1275. float val7 = this->scalars[7];
  1276. if (minimum > val0) { val0 = minimum; }
  1277. if (maximum < val0) { val0 = maximum; }
  1278. if (minimum > val1) { val1 = minimum; }
  1279. if (maximum < val1) { val1 = maximum; }
  1280. if (minimum > val2) { val2 = minimum; }
  1281. if (maximum < val2) { val2 = maximum; }
  1282. if (minimum > val3) { val3 = minimum; }
  1283. if (maximum < val3) { val3 = maximum; }
  1284. if (minimum > val4) { val4 = minimum; }
  1285. if (maximum < val4) { val4 = maximum; }
  1286. if (minimum > val5) { val5 = minimum; }
  1287. if (maximum < val5) { val5 = maximum; }
  1288. if (minimum > val6) { val6 = minimum; }
  1289. if (maximum < val6) { val6 = maximum; }
  1290. if (minimum > val7) { val7 = minimum; }
  1291. if (maximum < val7) { val7 = maximum; }
  1292. return F32x8(val0, val1, val2, val3, val4, val5, val6, val7);
  1293. #endif
  1294. }
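// Example (illustrative): F32x8(2.5f).clamp(0.0f, 1.0f) yields 1.0f in every lane,
// and F32x8(-2.5f).clamp(0.0f, 1.0f) yields 0.0f in every lane.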
  1295. F32x8 clampLower(float minimum) const {
  1296. #if defined USE_256BIT_F_SIMD
  1297. return F32x8(MAX_F32_SIMD256(this->v, LOAD_SCALAR_F32_SIMD256(minimum)));
  1298. #else
  1299. float val0 = this->scalars[0];
  1300. float val1 = this->scalars[1];
  1301. float val2 = this->scalars[2];
  1302. float val3 = this->scalars[3];
  1303. float val4 = this->scalars[4];
  1304. float val5 = this->scalars[5];
  1305. float val6 = this->scalars[6];
  1306. float val7 = this->scalars[7];
  1307. if (minimum > val0) { val0 = minimum; }
  1308. if (minimum > val1) { val1 = minimum; }
  1309. if (minimum > val2) { val2 = minimum; }
  1310. if (minimum > val3) { val3 = minimum; }
  1311. if (minimum > val4) { val4 = minimum; }
  1312. if (minimum > val5) { val5 = minimum; }
  1313. if (minimum > val6) { val6 = minimum; }
  1314. if (minimum > val7) { val7 = minimum; }
  1315. return F32x8(val0, val1, val2, val3, val4, val5, val6, val7);
  1316. #endif
  1317. }
  1318. F32x8 clampUpper(float maximum) const {
  1319. #if defined USE_256BIT_F_SIMD
  1320. return F32x8(MIN_F32_SIMD256(this->v, LOAD_SCALAR_F32_SIMD256(maximum)));
  1321. #else
  1322. float val0 = this->scalars[0];
  1323. float val1 = this->scalars[1];
  1324. float val2 = this->scalars[2];
  1325. float val3 = this->scalars[3];
  1326. float val4 = this->scalars[4];
  1327. float val5 = this->scalars[5];
  1328. float val6 = this->scalars[6];
  1329. float val7 = this->scalars[7];
  1330. if (maximum < val0) { val0 = maximum; }
  1331. if (maximum < val1) { val1 = maximum; }
  1332. if (maximum < val2) { val2 = maximum; }
  1333. if (maximum < val3) { val3 = maximum; }
  1334. if (maximum < val4) { val4 = maximum; }
  1335. if (maximum < val5) { val5 = maximum; }
  1336. if (maximum < val6) { val6 = maximum; }
  1337. if (maximum < val7) { val7 = maximum; }
  1338. return F32x8(val0, val1, val2, val3, val4, val5, val6, val7);
  1339. #endif
  1340. }
  1341. };
  1342. union I32x8 {
  1343. private:
  1344. // The uninitialized default constructor is private for safety reasons.
  1345. I32x8() {}
  1346. public:
  1347. // When the uninitialized constructor is needed for performance, use this named constructor instead.
  1348. static inline I32x8 create_dangerous_uninitialized() { return I32x8(); }
  1349. #if defined USE_256BIT_X_SIMD
  1350. public:
  1351. #if defined USE_DIRECT_SIMD_MEMORY_ACCESS
  1352. // Only use if USE_DIRECT_SIMD_MEMORY_ACCESS is defined!
  1353. int32_t scalars[8];
  1354. #endif
  1355. // The SIMD vector of undefined type
  1356. // Not accessible while emulating!
  1357. SIMD_I32x8 v;
  1358. // Construct a portable vector from a native SIMD vector
  1359. explicit I32x8(const SIMD_I32x8& v) : v(v) {}
  1360. // Construct a portable vector from a set of scalars
  1361. I32x8(int32_t a1, int32_t a2, int32_t a3, int32_t a4, int32_t a5, int32_t a6, int32_t a7, int32_t a8)
  1362. : v(LOAD_VECTOR_I32_SIMD256(a1, a2, a3, a4, a5, a6, a7, a8)) {}
  1363. // Construct a portable vector from a single duplicated scalar
  1364. explicit I32x8(int32_t scalar) : v(LOAD_SCALAR_I32_SIMD256(scalar)) {}
  1365. #else
  1366. public:
  1367. // Emulate a SIMD vector as an array of scalars without hardware support.
  1368. // Only accessible while emulating!
  1369. int32_t scalars[8];
  1370. // Construct a portable vector from a set of scalars
  1371. I32x8(int32_t a1, int32_t a2, int32_t a3, int32_t a4, int32_t a5, int32_t a6, int32_t a7, int32_t a8) {
  1372. this->scalars[0] = a1;
  1373. this->scalars[1] = a2;
  1374. this->scalars[2] = a3;
  1375. this->scalars[3] = a4;
  1376. this->scalars[4] = a5;
  1377. this->scalars[5] = a6;
  1378. this->scalars[6] = a7;
  1379. this->scalars[7] = a8;
  1380. }
  1381. // Construct a portable vector from a single duplicated scalar
  1382. explicit I32x8(int32_t scalar) {
  1383. this->scalars[0] = scalar;
  1384. this->scalars[1] = scalar;
  1385. this->scalars[2] = scalar;
  1386. this->scalars[3] = scalar;
  1387. this->scalars[4] = scalar;
  1388. this->scalars[5] = scalar;
  1389. this->scalars[6] = scalar;
  1390. this->scalars[7] = scalar;
  1391. }
  1392. #endif
  1393. // Create a gradient vector using start and increment, so that arbitrary length vectors have a way to initialize linear iterations.
  1394. static inline I32x8 createGradient(int32_t start, int32_t increment) {
  1395. return I32x8(
  1396. start,
  1397. start + increment,
  1398. start + increment * 2,
  1399. start + increment * 3,
  1400. start + increment * 4,
  1401. start + increment * 5,
  1402. start + increment * 6,
  1403. start + increment * 7
  1404. );
  1405. }
  1406. // Construct a portable SIMD vector from a pointer to aligned data
1407. // data must be aligned to 32 bytes; the aligned AVX load and the checks in readAligned assume this
  1408. static inline I32x8 readAlignedUnsafe(const int32_t* data) {
  1409. #if defined USE_AVX2
  1410. return I32x8(_mm256_load_si256((const __m256i*)data));
  1411. #else
  1412. return I32x8(data[0], data[1], data[2], data[3], data[4], data[5], data[6], data[7]);
  1413. #endif
  1414. }
  1415. // Write to aligned memory from the existing vector
1416. // data must be aligned to 32 bytes; the aligned AVX store and the checks in writeAligned assume this
  1417. inline void writeAlignedUnsafe(int32_t* data) const {
  1418. #if defined USE_AVX2
  1419. _mm256_store_si256((__m256i*)data, this->v);
  1420. #else
  1421. data[0] = this->scalars[0];
  1422. data[1] = this->scalars[1];
  1423. data[2] = this->scalars[2];
  1424. data[3] = this->scalars[3];
  1425. data[4] = this->scalars[4];
  1426. data[5] = this->scalars[5];
  1427. data[6] = this->scalars[6];
  1428. data[7] = this->scalars[7];
  1429. #endif
  1430. }
  1431. // Bound and alignment checked reading
  1432. static inline I32x8 readAligned(const dsr::SafePointer<int32_t> data, const char* methodName) {
  1433. const int32_t* pointer = data.getUnsafe();
  1434. assert(((uintptr_t)pointer & 31) == 0);
  1435. #if defined SAFE_POINTER_CHECKS
  1436. data.assertInside(methodName, pointer, 32);
  1437. #endif
  1438. return I32x8::readAlignedUnsafe(pointer);
  1439. }
  1440. // Bound and alignment checked writing
  1441. inline void writeAligned(dsr::SafePointer<int32_t> data, const char* methodName) const {
  1442. int32_t* pointer = data.getUnsafe();
  1443. assert(((uintptr_t)pointer & 31) == 0);
  1444. #if defined SAFE_POINTER_CHECKS
  1445. data.assertInside(methodName, pointer, 32);
  1446. #endif
  1447. this->writeAlignedUnsafe(pointer);
  1448. }
  1449. };
  1450. union U32x8 {
  1451. private:
  1452. // The uninitialized default constructor is private for safety reasons.
  1453. U32x8() {}
  1454. public:
  1455. // When the uninitialized constructor is needed for performance, use this named constructor instead.
  1456. static inline U32x8 create_dangerous_uninitialized() { return U32x8(); }
  1457. #if defined USE_256BIT_X_SIMD
  1458. public:
  1459. #if defined USE_DIRECT_SIMD_MEMORY_ACCESS
  1460. // Only use if USE_DIRECT_SIMD_MEMORY_ACCESS is defined!
  1461. uint32_t scalars[8];
  1462. #endif
  1463. // The SIMD vector of undefined type
  1464. // Not accessible while emulating!
  1465. SIMD_U32x8 v;
  1466. // Construct a portable vector from a native SIMD vector
  1467. explicit U32x8(const SIMD_U32x8& v) : v(v) {}
  1468. // Construct a portable vector from a set of scalars
  1469. U32x8(uint32_t a1, uint32_t a2, uint32_t a3, uint32_t a4, uint32_t a5, uint32_t a6, uint32_t a7, uint32_t a8)
  1470. : v(LOAD_VECTOR_U32_SIMD256(a1, a2, a3, a4, a5, a6, a7, a8)) {}
  1471. // Construct a portable vector from a single duplicated scalar
  1472. explicit U32x8(uint32_t scalar) : v(LOAD_SCALAR_U32_SIMD256(scalar)) {}
  1473. #else
  1474. public:
  1475. // Emulate a SIMD vector as an array of scalars without hardware support.
  1476. // Only accessible while emulating!
  1477. uint32_t scalars[8];
  1478. // Construct a portable vector from a set of scalars
  1479. U32x8(uint32_t a1, uint32_t a2, uint32_t a3, uint32_t a4, uint32_t a5, uint32_t a6, uint32_t a7, uint32_t a8) {
  1480. this->scalars[0] = a1;
  1481. this->scalars[1] = a2;
  1482. this->scalars[2] = a3;
  1483. this->scalars[3] = a4;
  1484. this->scalars[4] = a5;
  1485. this->scalars[5] = a6;
  1486. this->scalars[6] = a7;
  1487. this->scalars[7] = a8;
  1488. }
  1489. // Construct a portable vector from a single duplicated scalar
  1490. explicit U32x8(uint32_t scalar) {
  1491. this->scalars[0] = scalar;
  1492. this->scalars[1] = scalar;
  1493. this->scalars[2] = scalar;
  1494. this->scalars[3] = scalar;
  1495. this->scalars[4] = scalar;
  1496. this->scalars[5] = scalar;
  1497. this->scalars[6] = scalar;
  1498. this->scalars[7] = scalar;
  1499. }
  1500. #endif
  1501. // Create a gradient vector using start and increment, so that arbitrary length vectors have a way to initialize linear iterations.
  1502. static inline U32x8 createGradient(uint32_t start, uint32_t increment) {
  1503. return U32x8(
  1504. start,
  1505. start + increment,
  1506. start + increment * 2,
  1507. start + increment * 3,
  1508. start + increment * 4,
  1509. start + increment * 5,
  1510. start + increment * 6,
  1511. start + increment * 7
  1512. );
  1513. }
  1514. // Construct a portable SIMD vector from a pointer to aligned data
1515. // data must be aligned to 32 bytes; the aligned AVX load and the checks in readAligned assume this
  1516. static inline U32x8 readAlignedUnsafe(const uint32_t* data) {
  1517. #if defined USE_AVX2
  1518. return U32x8(_mm256_load_si256((const __m256i*)data));
  1519. #else
  1520. return U32x8(data[0], data[1], data[2], data[3], data[4], data[5], data[6], data[7]);
  1521. #endif
  1522. }
  1523. // Write to aligned memory from the existing vector
1524. // data must be aligned to 32 bytes; the aligned AVX store and the checks in writeAligned assume this
  1525. inline void writeAlignedUnsafe(uint32_t* data) const {
  1526. #if defined USE_AVX2
  1527. _mm256_store_si256((__m256i*)data, this->v);
  1528. #else
  1529. data[0] = this->scalars[0];
  1530. data[1] = this->scalars[1];
  1531. data[2] = this->scalars[2];
  1532. data[3] = this->scalars[3];
  1533. data[4] = this->scalars[4];
  1534. data[5] = this->scalars[5];
  1535. data[6] = this->scalars[6];
  1536. data[7] = this->scalars[7];
  1537. #endif
  1538. }
  1539. // Bound and alignment checked reading
  1540. static inline U32x8 readAligned(const dsr::SafePointer<uint32_t> data, const char* methodName) {
  1541. const uint32_t* pointer = data.getUnsafe();
  1542. assert(((uintptr_t)pointer & 31) == 0);
  1543. #if defined SAFE_POINTER_CHECKS
  1544. data.assertInside(methodName, pointer, 32);
  1545. #endif
  1546. return U32x8::readAlignedUnsafe(pointer);
  1547. }
  1548. // Bound and alignment checked writing
  1549. inline void writeAligned(dsr::SafePointer<uint32_t> data, const char* methodName) const {
  1550. uint32_t* pointer = data.getUnsafe();
  1551. assert(((uintptr_t)pointer & 31) == 0);
  1552. #if defined SAFE_POINTER_CHECKS
  1553. data.assertInside(methodName, pointer, 32);
  1554. #endif
  1555. this->writeAlignedUnsafe(pointer);
  1556. }
  1557. };
  1558. union U16x16 {
  1559. private:
  1560. // The uninitialized default constructor is private for safety reasons.
  1561. U16x16() {}
  1562. public:
  1563. // When the uninitialized constructor is needed for performance, use this named constructor instead.
  1564. static inline U16x16 create_dangerous_uninitialized() { return U16x16(); }
  1565. #if defined USE_256BIT_X_SIMD
  1566. public:
  1567. #if defined USE_DIRECT_SIMD_MEMORY_ACCESS
  1568. // Only use if USE_DIRECT_SIMD_MEMORY_ACCESS is defined!
  1569. uint16_t scalars[16];
  1570. #endif
  1571. // The SIMD vector of undefined type
  1572. // Not accessible while emulating!
  1573. SIMD_U16x16 v;
  1574. // Construct a portable vector from a native SIMD vector
  1575. explicit U16x16(const SIMD_U16x16& v) : v(v) {}
  1576. // Construct a vector of 16 x 16-bit unsigned integers from a vector of 8 x 32-bit unsigned integers
  1577. // Reinterpret casting is used
  1578. explicit U16x16(const U32x8& vector) : v(REINTERPRET_U32_TO_U16_SIMD256(vector.v)) {}
  1579. // Construct a portable vector from a set of scalars
  1580. U16x16(uint16_t a1, uint16_t a2, uint16_t a3, uint16_t a4, uint16_t a5, uint16_t a6, uint16_t a7, uint16_t a8,
  1581. uint16_t a9, uint16_t a10, uint16_t a11, uint16_t a12, uint16_t a13, uint16_t a14, uint16_t a15, uint16_t a16)
  1582. : v(LOAD_VECTOR_U16_SIMD256(a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11, a12, a13, a14, a15, a16)) {}
1583. // Construct a vector of 16 x 16-bit unsigned integers from a single duplicated 32-bit unsigned integer
1584. // Reinterpret casting is used
1585. // TODO: Remove all reinterprets from constructors to improve readability
  1586. explicit U16x16(uint32_t scalar) : v(REINTERPRET_U32_TO_U16_SIMD256(LOAD_SCALAR_U32_SIMD256(scalar))) {}
  1587. // Construct a portable vector from a single duplicated scalar
  1588. explicit U16x16(uint16_t scalar) : v(LOAD_SCALAR_U16_SIMD256(scalar)) {}
1589. // Reinterpret cast to a vector of 8 x 32-bit unsigned integers
  1590. U32x8 get_U32() const {
  1591. return U32x8(REINTERPRET_U16_TO_U32_SIMD256(this->v));
  1592. }
  1593. #else
  1594. public:
  1595. // Emulate a SIMD vector as an array of scalars without hardware support.
  1596. // Only accessible while emulating!
  1597. uint16_t scalars[16];
1598. // Construct a vector of 16 x 16-bit unsigned integers from a vector of 8 x 32-bit unsigned integers
  1599. // Reinterpret casting is used
  1600. explicit U16x16(const U32x8& vector) {
  1601. uint64_t *target = (uint64_t*)this->scalars;
  1602. uint64_t *source = (uint64_t*)vector.scalars;
  1603. target[0] = source[0];
  1604. target[1] = source[1];
  1605. target[2] = source[2];
  1606. target[3] = source[3];
  1607. }
  1608. // Construct a portable vector from a set of scalars
  1609. U16x16(uint16_t a1, uint16_t a2, uint16_t a3, uint16_t a4, uint16_t a5, uint16_t a6, uint16_t a7, uint16_t a8,
  1610. uint16_t a9, uint16_t a10, uint16_t a11, uint16_t a12, uint16_t a13, uint16_t a14, uint16_t a15, uint16_t a16) {
  1611. this->scalars[0] = a1;
  1612. this->scalars[1] = a2;
  1613. this->scalars[2] = a3;
  1614. this->scalars[3] = a4;
  1615. this->scalars[4] = a5;
  1616. this->scalars[5] = a6;
  1617. this->scalars[6] = a7;
  1618. this->scalars[7] = a8;
  1619. this->scalars[8] = a9;
  1620. this->scalars[9] = a10;
  1621. this->scalars[10] = a11;
  1622. this->scalars[11] = a12;
  1623. this->scalars[12] = a13;
  1624. this->scalars[13] = a14;
  1625. this->scalars[14] = a15;
  1626. this->scalars[15] = a16;
  1627. }
1628. // Construct a vector of 16 x 16-bit unsigned integers from a single duplicated 32-bit unsigned integer
  1629. // Reinterpret casting is used
  1630. explicit U16x16(uint32_t scalar) {
  1631. uint32_t *target = (uint32_t*)this->scalars;
  1632. target[0] = scalar;
  1633. target[1] = scalar;
  1634. target[2] = scalar;
  1635. target[3] = scalar;
  1636. target[4] = scalar;
  1637. target[5] = scalar;
  1638. target[6] = scalar;
  1639. target[7] = scalar;
  1640. }
  1641. // Construct a portable vector from a single duplicated scalar
  1642. explicit U16x16(uint16_t scalar) {
  1643. this->scalars[0] = scalar;
  1644. this->scalars[1] = scalar;
  1645. this->scalars[2] = scalar;
  1646. this->scalars[3] = scalar;
  1647. this->scalars[4] = scalar;
  1648. this->scalars[5] = scalar;
  1649. this->scalars[6] = scalar;
  1650. this->scalars[7] = scalar;
  1651. this->scalars[8] = scalar;
  1652. this->scalars[9] = scalar;
  1653. this->scalars[10] = scalar;
  1654. this->scalars[11] = scalar;
  1655. this->scalars[12] = scalar;
  1656. this->scalars[13] = scalar;
  1657. this->scalars[14] = scalar;
  1658. this->scalars[15] = scalar;
  1659. }
  1660. // Reinterpret cast to a vector of 8 x 32-bit unsigned integers
  1661. U32x8 get_U32() const {
  1662. U32x8 result(0);
  1663. uint64_t *target = (uint64_t*)result.scalars;
  1664. uint64_t *source = (uint64_t*)this->scalars;
  1665. target[0] = source[0];
  1666. target[1] = source[1];
  1667. target[2] = source[2];
  1668. target[3] = source[3];
  1669. return result;
  1670. }
  1671. #endif
  1672. // Create a gradient vector using start and increment, so that arbitrary length vectors have a way to initialize linear iterations.
  1673. static inline U16x16 createGradient(uint16_t start, uint16_t increment) {
  1674. return U16x16(
  1675. start,
  1676. start + increment,
  1677. start + increment * 2,
  1678. start + increment * 3,
  1679. start + increment * 4,
  1680. start + increment * 5,
  1681. start + increment * 6,
  1682. start + increment * 7,
  1683. start + increment * 8,
  1684. start + increment * 9,
  1685. start + increment * 10,
  1686. start + increment * 11,
  1687. start + increment * 12,
  1688. start + increment * 13,
  1689. start + increment * 14,
  1690. start + increment * 15
  1691. );
  1692. }
1693. // data must be aligned to 32 bytes; the aligned AVX load and the checks in readAligned assume this
  1697. static inline U16x16 readAlignedUnsafe(const uint16_t* data) {
  1698. #if defined USE_AVX2
  1699. return U16x16(_mm256_load_si256((const __m256i*)data));
  1700. #else
  1701. return U16x16(
  1702. data[0],
  1703. data[1],
  1704. data[2],
  1705. data[3],
  1706. data[4],
  1707. data[5],
  1708. data[6],
  1709. data[7],
  1710. data[8],
  1711. data[9],
  1712. data[10],
  1713. data[11],
  1714. data[12],
  1715. data[13],
  1716. data[14],
  1717. data[15]
  1718. );
  1719. #endif
  1720. }
1721. // data must be aligned to 32 bytes; the aligned AVX store and the checks in writeAligned assume this
  1722. inline void writeAlignedUnsafe(uint16_t* data) const {
  1723. #if defined USE_AVX2
  1724. _mm256_store_si256((__m256i*)data, this->v);
  1725. #else
  1726. data[0] = this->scalars[0];
  1727. data[1] = this->scalars[1];
  1728. data[2] = this->scalars[2];
  1729. data[3] = this->scalars[3];
  1730. data[4] = this->scalars[4];
  1731. data[5] = this->scalars[5];
  1732. data[6] = this->scalars[6];
  1733. data[7] = this->scalars[7];
  1734. data[8] = this->scalars[8];
  1735. data[9] = this->scalars[9];
  1736. data[10] = this->scalars[10];
  1737. data[11] = this->scalars[11];
  1738. data[12] = this->scalars[12];
  1739. data[13] = this->scalars[13];
  1740. data[14] = this->scalars[14];
  1741. data[15] = this->scalars[15];
  1742. #endif
  1743. }
  1744. // Bound and alignment checked reading
  1745. static inline U16x16 readAligned(const dsr::SafePointer<uint16_t> data, const char* methodName) {
  1746. const uint16_t* pointer = data.getUnsafe();
  1747. assert(((uintptr_t)pointer & 31) == 0);
  1748. #if defined SAFE_POINTER_CHECKS
  1749. data.assertInside(methodName, pointer, 32);
  1750. #endif
  1751. return U16x16::readAlignedUnsafe(pointer);
  1752. }
  1753. // Bound and alignment checked writing
  1754. inline void writeAligned(dsr::SafePointer<uint16_t> data, const char* methodName) const {
  1755. uint16_t* pointer = data.getUnsafe();
  1756. assert(((uintptr_t)pointer & 31) == 0);
  1757. #if defined SAFE_POINTER_CHECKS
  1758. data.assertInside(methodName, pointer, 32);
  1759. #endif
  1760. this->writeAlignedUnsafe(pointer);
  1761. }
  1762. };
  1763. union U8x32 {
  1764. private:
  1765. // The uninitialized default constructor is private for safety reasons.
  1766. U8x32() {}
  1767. public:
  1768. // When the uninitialized constructor is needed for performance, use this named constructor instead.
  1769. static inline U8x32 create_dangerous_uninitialized() { return U8x32(); }
  1770. #if defined USE_256BIT_X_SIMD
  1771. public:
  1772. #if defined USE_DIRECT_SIMD_MEMORY_ACCESS
  1773. // Only use if USE_DIRECT_SIMD_MEMORY_ACCESS is defined!
  1774. uint8_t scalars[32];
  1775. #endif
  1776. // The SIMD vector of undefined type
  1777. // Not accessible while emulating!
  1778. SIMD_U8x32 v;
  1779. // Construct a portable vector from a native SIMD vector
  1780. explicit U8x32(const SIMD_U8x32& v) : v(v) {}
  1781. // Construct a portable vector from a set of scalars
  1782. U8x32(uint8_t a1, uint8_t a2, uint8_t a3, uint8_t a4, uint8_t a5, uint8_t a6, uint8_t a7, uint8_t a8,
  1783. uint8_t a9, uint8_t a10, uint8_t a11, uint8_t a12, uint8_t a13, uint8_t a14, uint8_t a15, uint8_t a16,
  1784. uint8_t a17, uint8_t a18, uint8_t a19, uint8_t a20, uint8_t a21, uint8_t a22, uint8_t a23, uint8_t a24,
  1785. uint8_t a25, uint8_t a26, uint8_t a27, uint8_t a28, uint8_t a29, uint8_t a30, uint8_t a31, uint8_t a32)
  1786. : v(LOAD_VECTOR_U8_SIMD256(
  1787. a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11, a12, a13, a14, a15, a16,
  1788. a17, a18, a19, a20, a21, a22, a23, a24, a25, a26, a27, a28, a29, a30, a31, a32
  1789. )) {}
  1790. // Construct a portable vector from a single duplicated scalar
  1791. explicit U8x32(uint8_t scalar) : v(LOAD_SCALAR_U8_SIMD256(scalar)) {}
  1792. #else
  1793. public:
  1794. // Emulate a SIMD vector as an array of scalars without hardware support.
  1795. // Only accessible while emulating!
  1796. uint8_t scalars[32];
  1797. // Construct a portable vector from a set of scalars
  1798. U8x32(uint8_t a1, uint8_t a2, uint8_t a3, uint8_t a4, uint8_t a5, uint8_t a6, uint8_t a7, uint8_t a8,
  1799. uint8_t a9, uint8_t a10, uint8_t a11, uint8_t a12, uint8_t a13, uint8_t a14, uint8_t a15, uint8_t a16,
  1800. uint8_t a17, uint8_t a18, uint8_t a19, uint8_t a20, uint8_t a21, uint8_t a22, uint8_t a23, uint8_t a24,
  1801. uint8_t a25, uint8_t a26, uint8_t a27, uint8_t a28, uint8_t a29, uint8_t a30, uint8_t a31, uint8_t a32) {
  1802. this->scalars[0] = a1;
  1803. this->scalars[1] = a2;
  1804. this->scalars[2] = a3;
  1805. this->scalars[3] = a4;
  1806. this->scalars[4] = a5;
  1807. this->scalars[5] = a6;
  1808. this->scalars[6] = a7;
  1809. this->scalars[7] = a8;
  1810. this->scalars[8] = a9;
  1811. this->scalars[9] = a10;
  1812. this->scalars[10] = a11;
  1813. this->scalars[11] = a12;
  1814. this->scalars[12] = a13;
  1815. this->scalars[13] = a14;
  1816. this->scalars[14] = a15;
  1817. this->scalars[15] = a16;
  1818. this->scalars[16] = a17;
  1819. this->scalars[17] = a18;
  1820. this->scalars[18] = a19;
  1821. this->scalars[19] = a20;
  1822. this->scalars[20] = a21;
  1823. this->scalars[21] = a22;
  1824. this->scalars[22] = a23;
  1825. this->scalars[23] = a24;
  1826. this->scalars[24] = a25;
  1827. this->scalars[25] = a26;
  1828. this->scalars[26] = a27;
  1829. this->scalars[27] = a28;
  1830. this->scalars[28] = a29;
  1831. this->scalars[29] = a30;
  1832. this->scalars[30] = a31;
  1833. this->scalars[31] = a32;
  1834. }
  1835. // Construct a portable vector from a single duplicated scalar
  1836. explicit U8x32(uint8_t scalar) {
  1837. for (int i = 0; i < 32; i++) {
  1838. this->scalars[i] = scalar;
  1839. }
  1840. }
  1841. #endif
  1842. // Create a gradient vector using start and increment, so that arbitrary length vectors have a way to initialize linear iterations.
  1843. static inline U8x32 createGradient(uint8_t start, uint8_t increment) {
  1844. return U8x32(
  1845. start,
  1846. start + increment,
  1847. start + increment * 2,
  1848. start + increment * 3,
  1849. start + increment * 4,
  1850. start + increment * 5,
  1851. start + increment * 6,
  1852. start + increment * 7,
  1853. start + increment * 8,
  1854. start + increment * 9,
  1855. start + increment * 10,
  1856. start + increment * 11,
  1857. start + increment * 12,
  1858. start + increment * 13,
  1859. start + increment * 14,
  1860. start + increment * 15,
  1861. start + increment * 16,
  1862. start + increment * 17,
  1863. start + increment * 18,
  1864. start + increment * 19,
  1865. start + increment * 20,
  1866. start + increment * 21,
  1867. start + increment * 22,
  1868. start + increment * 23,
  1869. start + increment * 24,
  1870. start + increment * 25,
  1871. start + increment * 26,
  1872. start + increment * 27,
  1873. start + increment * 28,
  1874. start + increment * 29,
  1875. start + increment * 30,
  1876. start + increment * 31
  1877. );
  1878. }
  1879. static inline U8x32 readAlignedUnsafe(const uint8_t* data) {
  1880. #if defined USE_AVX2
  1881. return U8x32(_mm256_load_si256((const __m256i*)data));
  1882. #else
  1883. U8x32 result;
  1884. for (int i = 0; i < 32; i++) {
  1885. result.scalars[i] = data[i];
  1886. }
  1887. return result;
  1888. #endif
  1889. }
1890. // data must be aligned to 32 bytes; the aligned AVX store and the checks in writeAligned assume this
  1891. inline void writeAlignedUnsafe(uint8_t* data) const {
  1892. #if defined USE_AVX2
  1893. _mm256_store_si256((__m256i*)data, this->v);
  1894. #else
  1895. for (int i = 0; i < 32; i++) {
  1896. data[i] = this->scalars[i];
  1897. }
  1898. #endif
  1899. }
  1900. // Bound and alignment checked reading
  1901. static inline U8x32 readAligned(const dsr::SafePointer<uint8_t> data, const char* methodName) {
  1902. const uint8_t* pointer = data.getUnsafe();
  1903. assert(((uintptr_t)pointer & 31) == 0);
  1904. #if defined SAFE_POINTER_CHECKS
  1905. data.assertInside(methodName, pointer, 32);
  1906. #endif
  1907. return U8x32::readAlignedUnsafe(pointer);
  1908. }
  1909. // Bound and alignment checked writing
  1910. inline void writeAligned(dsr::SafePointer<uint8_t> data, const char* methodName) const {
  1911. uint8_t* pointer = data.getUnsafe();
  1912. assert(((uintptr_t)pointer & 31) == 0);
  1913. #if defined SAFE_POINTER_CHECKS
  1914. data.assertInside(methodName, pointer, 32);
  1915. #endif
  1916. this->writeAlignedUnsafe(pointer);
  1917. }
  1918. };
1919. // Helper X-macros for applying an operation to selected sets of SIMD vector types.
1920. // Each list expands DO(vector_type, element_type, lane_count) once per vector type in the set.
  1921. #define FOR_ALL_VECTOR_TYPES(DO) \
  1922. DO(F32x4, float, 4) \
  1923. DO(I32x4, int32_t, 4) \
  1924. DO(U32x4, uint32_t, 4) \
  1925. DO(U16x8, uint16_t, 8) \
  1926. DO(U8x16, uint8_t, 16) \
  1927. DO(F32x8, float, 8) \
  1928. DO(I32x8, int32_t, 8) \
  1929. DO(U32x8, uint32_t, 8) \
  1930. DO(U16x16, uint16_t, 16) \
  1931. DO(U8x32, uint8_t, 32)
  1932. #define FOR_FLOAT_VECTOR_TYPES(DO) \
  1933. DO(F32x4, float, 4) \
  1934. DO(F32x8, float, 8)
  1935. #define FOR_INTEGER_VECTOR_TYPES(DO) \
  1936. DO(I32x4, int32_t, 4) \
  1937. DO(U32x4, uint32_t, 4) \
  1938. DO(U16x8, uint16_t, 8) \
  1939. DO(U8x16, uint8_t, 16) \
  1940. DO(I32x8, int32_t, 8) \
  1941. DO(U32x8, uint32_t, 8) \
  1942. DO(U16x16, uint16_t, 16) \
  1943. DO(U8x32, uint8_t, 32)
  1944. #define FOR_SIGNED_VECTOR_TYPES(DO) \
  1945. DO(F32x4, float, 4) \
  1946. DO(I32x4, int32_t, 4) \
  1947. DO(F32x8, float, 8) \
  1948. DO(I32x8, int32_t, 8)
  1949. #define FOR_UNSIGNED_VECTOR_TYPES(DO) \
  1950. DO(U32x4, uint32_t, 4) \
  1951. DO(U16x8, uint16_t, 8) \
  1952. DO(U8x16, uint8_t, 16) \
  1953. DO(U32x8, uint32_t, 8) \
  1954. DO(U16x16, uint16_t, 16) \
  1955. DO(U8x32, uint8_t, 32)
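// Example of using one of these lists (illustrative only; CREATE_LANE_COUNT is a hypothetical name):
//   #define CREATE_LANE_COUNT(VECTOR_TYPE, ELEMENT_TYPE, LANE_COUNT) \
//     inline constexpr int lane_count(const VECTOR_TYPE&) { return LANE_COUNT; }
//   FOR_ALL_VECTOR_TYPES(CREATE_LANE_COUNT)
//   #undef CREATE_LANE_COUNT
// This generates one lane_count overload per vector type, following the same define/expand/undef
// pattern as the print and comparison helpers below.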
  1956. // Print SIMD vectors to the terminal or append them to strings.
  1957. #define CREATE_METHOD_PRINT(VECTOR_TYPE, ELEMENT_TYPE, LANE_COUNT) \
  1958. inline dsr::String& string_toStreamIndented(dsr::String& target, const VECTOR_TYPE& source, const dsr::ReadableString& indentation) { \
  1959. ALIGN_BYTES(sizeof(VECTOR_TYPE)) ELEMENT_TYPE a[LANE_COUNT]; \
  1960. source.writeAlignedUnsafe(a); \
  1961. dsr::string_append(target, indentation, a[0]); \
  1962. for (int i = 1; i < LANE_COUNT; i++) { \
  1963. string_append(target, U", ", a[i]); \
  1964. } \
  1965. return target; \
  1966. }
  1967. // All SIMD vectors can be printed.
  1968. FOR_ALL_VECTOR_TYPES(CREATE_METHOD_PRINT)
  1969. #undef CREATE_METHOD_PRINT
  1970. // Whole comparisons returning a single boolean, mainly for regression tests.
  1971. #define CREATE_EXACT_EQUALITY(VECTOR_TYPE, ELEMENT_TYPE, LANE_COUNT) \
  1972. inline bool operator==(const VECTOR_TYPE& left, const VECTOR_TYPE& right) { \
  1973. ALIGN_BYTES(sizeof(VECTOR_TYPE)) ELEMENT_TYPE a[LANE_COUNT]; \
  1974. ALIGN_BYTES(sizeof(VECTOR_TYPE)) ELEMENT_TYPE b[LANE_COUNT]; \
  1975. left.writeAlignedUnsafe(a); \
  1976. right.writeAlignedUnsafe(b); \
  1977. for (int i = 0; i < LANE_COUNT; i++) { \
  1978. if (a[i] != b[i]) return false; \
  1979. } \
  1980. return true; \
  1981. }
1982. // Integer SIMD vectors have exact equality.
  1983. FOR_INTEGER_VECTOR_TYPES(CREATE_EXACT_EQUALITY)
  1984. #undef CREATE_EXACT_EQUALITY
  1985. #define CREATE_TOLERANT_EQUALITY(VECTOR_TYPE, ELEMENT_TYPE, LANE_COUNT) \
  1986. inline bool operator==(const VECTOR_TYPE& left, const VECTOR_TYPE& right) { \
  1987. ALIGN_BYTES(sizeof(VECTOR_TYPE)) ELEMENT_TYPE a[LANE_COUNT]; \
  1988. ALIGN_BYTES(sizeof(VECTOR_TYPE)) ELEMENT_TYPE b[LANE_COUNT]; \
  1989. left.writeAlignedUnsafe(a); \
  1990. right.writeAlignedUnsafe(b); \
  1991. for (int i = 0; i < LANE_COUNT; i++) { \
  1992. if (fabs(a[i] - b[i]) >= 0.0001f) return false; \
  1993. } \
  1994. return true; \
  1995. }
  1996. // Float SIMD vectors have inexact equality.
  1997. FOR_FLOAT_VECTOR_TYPES(CREATE_TOLERANT_EQUALITY)
  1998. #undef CREATE_TOLERANT_EQUALITY
  1999. #define CREATE_INEQUALITY(VECTOR_TYPE, ELEMENT_TYPE, LANE_COUNT) \
  2000. inline bool operator!=(const VECTOR_TYPE& left, const VECTOR_TYPE& right) { \
  2001. return !(left == right); \
  2002. }
  2003. // All SIMD vectors have inequality.
  2004. FOR_ALL_VECTOR_TYPES(CREATE_INEQUALITY)
  2005. #undef CREATE_INEQUALITY
  2006. inline F32x4 operator+(const F32x4& left, const F32x4& right) {
  2007. #if defined USE_BASIC_SIMD
  2008. return F32x4(ADD_F32_SIMD(left.v, right.v));
  2009. #else
  2010. return F32x4(left.scalars[0] + right.scalars[0], left.scalars[1] + right.scalars[1], left.scalars[2] + right.scalars[2], left.scalars[3] + right.scalars[3]);
  2011. #endif
  2012. }
  2013. inline F32x4 operator-(const F32x4& left, const F32x4& right) {
  2014. #if defined USE_BASIC_SIMD
  2015. return F32x4(SUB_F32_SIMD(left.v, right.v));
  2016. #else
  2017. return F32x4(left.scalars[0] - right.scalars[0], left.scalars[1] - right.scalars[1], left.scalars[2] - right.scalars[2], left.scalars[3] - right.scalars[3]);
  2018. #endif
  2019. }
  2020. inline F32x4 operator*(const F32x4& left, const F32x4& right) {
  2021. #if defined USE_BASIC_SIMD
  2022. return F32x4(MUL_F32_SIMD(left.v, right.v));
  2023. #else
  2024. return F32x4(left.scalars[0] * right.scalars[0], left.scalars[1] * right.scalars[1], left.scalars[2] * right.scalars[2], left.scalars[3] * right.scalars[3]);
  2025. #endif
  2026. }
  2027. inline F32x4 min(const F32x4& left, const F32x4& right) {
  2028. #if defined USE_BASIC_SIMD
  2029. return F32x4(MIN_F32_SIMD(left.v, right.v));
  2030. #else
  2031. float v0 = left.scalars[0];
  2032. float v1 = left.scalars[1];
  2033. float v2 = left.scalars[2];
  2034. float v3 = left.scalars[3];
  2035. float r0 = right.scalars[0];
  2036. float r1 = right.scalars[1];
  2037. float r2 = right.scalars[2];
  2038. float r3 = right.scalars[3];
  2039. if (r0 < v0) { v0 = r0; }
  2040. if (r1 < v1) { v1 = r1; }
  2041. if (r2 < v2) { v2 = r2; }
  2042. if (r3 < v3) { v3 = r3; }
  2043. return F32x4(v0, v1, v2, v3);
  2044. #endif
  2045. }
  2046. inline F32x4 max(const F32x4& left, const F32x4& right) {
  2047. #if defined USE_BASIC_SIMD
  2048. return F32x4(MAX_F32_SIMD(left.v, right.v));
  2049. #else
  2050. float v0 = left.scalars[0];
  2051. float v1 = left.scalars[1];
  2052. float v2 = left.scalars[2];
  2053. float v3 = left.scalars[3];
  2054. float r0 = right.scalars[0];
  2055. float r1 = right.scalars[1];
  2056. float r2 = right.scalars[2];
  2057. float r3 = right.scalars[3];
  2058. if (r0 > v0) { v0 = r0; }
  2059. if (r1 > v1) { v1 = r1; }
  2060. if (r2 > v2) { v2 = r2; }
  2061. if (r3 > v3) { v3 = r3; }
  2062. return F32x4(v0, v1, v2, v3);
  2063. #endif
  2064. }
  2065. inline I32x4 operator+(const I32x4& left, const I32x4& right) {
  2066. #if defined USE_BASIC_SIMD
  2067. return I32x4(ADD_I32_SIMD(left.v, right.v));
  2068. #else
  2069. return I32x4(left.scalars[0] + right.scalars[0], left.scalars[1] + right.scalars[1], left.scalars[2] + right.scalars[2], left.scalars[3] + right.scalars[3]);
  2070. #endif
  2071. }
  2072. inline I32x4 operator-(const I32x4& left, const I32x4& right) {
  2073. #if defined USE_BASIC_SIMD
  2074. return I32x4(SUB_I32_SIMD(left.v, right.v));
  2075. #else
  2076. return I32x4(left.scalars[0] - right.scalars[0], left.scalars[1] - right.scalars[1], left.scalars[2] - right.scalars[2], left.scalars[3] - right.scalars[3]);
  2077. #endif
  2078. }
  2079. inline I32x4 operator*(const I32x4& left, const I32x4& right) {
  2080. #if defined USE_BASIC_SIMD
  2081. #if defined USE_SSE2
2082. // SSE2 has no packed 32-bit multiply (_mm_mullo_epi32 requires SSE4.1), so emulate NEON's vmulq_s32 with scalar multiplications
  2083. return I32x4(left.scalars[0] * right.scalars[0], left.scalars[1] * right.scalars[1], left.scalars[2] * right.scalars[2], left.scalars[3] * right.scalars[3]);
  2084. #elif defined USE_NEON
  2085. return I32x4(MUL_I32_NEON(left.v, right.v));
  2086. #endif
  2087. #else
  2088. return I32x4(left.scalars[0] * right.scalars[0], left.scalars[1] * right.scalars[1], left.scalars[2] * right.scalars[2], left.scalars[3] * right.scalars[3]);
  2089. #endif
  2090. }
  2091. inline U32x4 operator+(const U32x4& left, const U32x4& right) {
  2092. #if defined USE_BASIC_SIMD
  2093. return U32x4(ADD_U32_SIMD(left.v, right.v));
  2094. #else
  2095. return U32x4(left.scalars[0] + right.scalars[0], left.scalars[1] + right.scalars[1], left.scalars[2] + right.scalars[2], left.scalars[3] + right.scalars[3]);
  2096. #endif
  2097. }
  2098. inline U32x4 operator-(const U32x4& left, const U32x4& right) {
  2099. #if defined USE_BASIC_SIMD
  2100. return U32x4(SUB_U32_SIMD(left.v, right.v));
  2101. #else
  2102. return U32x4(left.scalars[0] - right.scalars[0], left.scalars[1] - right.scalars[1], left.scalars[2] - right.scalars[2], left.scalars[3] - right.scalars[3]);
  2103. #endif
  2104. }
  2105. inline U32x4 operator*(const U32x4& left, const U32x4& right) {
  2106. #if defined USE_BASIC_SIMD
  2107. #if defined USE_SSE2
2108. // SSE2 has no packed 32-bit multiply (_mm_mullo_epi32 requires SSE4.1), so emulate NEON's vmulq_u32 with scalar multiplications
  2109. return U32x4(left.scalars[0] * right.scalars[0], left.scalars[1] * right.scalars[1], left.scalars[2] * right.scalars[2], left.scalars[3] * right.scalars[3]);
  2110. #else // NEON
  2111. return U32x4(MUL_U32_NEON(left.v, right.v));
  2112. #endif
  2113. #else
  2114. return U32x4(left.scalars[0] * right.scalars[0], left.scalars[1] * right.scalars[1], left.scalars[2] * right.scalars[2], left.scalars[3] * right.scalars[3]);
  2115. #endif
  2116. }
  2117. inline U32x4 operator&(const U32x4& left, const U32x4& right) {
  2118. #if defined USE_BASIC_SIMD
  2119. return U32x4(BITWISE_AND_U32_SIMD(left.v, right.v));
  2120. #else
  2121. return U32x4(left.scalars[0] & right.scalars[0], left.scalars[1] & right.scalars[1], left.scalars[2] & right.scalars[2], left.scalars[3] & right.scalars[3]);
  2122. #endif
  2123. }
  2124. inline U32x4 operator|(const U32x4& left, const U32x4& right) {
  2125. #if defined USE_BASIC_SIMD
  2126. return U32x4(BITWISE_OR_U32_SIMD(left.v, right.v));
  2127. #else
  2128. return U32x4(left.scalars[0] | right.scalars[0], left.scalars[1] | right.scalars[1], left.scalars[2] | right.scalars[2], left.scalars[3] | right.scalars[3]);
  2129. #endif
  2130. }
  2131. inline U32x4 operator^(const U32x4& left, const U32x4& right) {
  2132. #if defined USE_BASIC_SIMD
  2133. return U32x4(BITWISE_XOR_U32_SIMD(left.v, right.v));
  2134. #else
  2135. return U32x4(left.scalars[0] ^ right.scalars[0], left.scalars[1] ^ right.scalars[1], left.scalars[2] ^ right.scalars[2], left.scalars[3] ^ right.scalars[3]);
  2136. #endif
  2137. }
  2138. inline U32x4 operator<<(const U32x4& left, uint32_t bitOffset) {
  2139. #if defined USE_SSE2
  2140. return U32x4(_mm_slli_epi32(left.v, bitOffset));
  2141. #else
  2142. #if defined USE_NEON
  2143. return U32x4(vshlq_u32(left.v, LOAD_SCALAR_I32_SIMD(bitOffset)));
  2144. #else
  2145. return U32x4(left.scalars[0] << bitOffset, left.scalars[1] << bitOffset, left.scalars[2] << bitOffset, left.scalars[3] << bitOffset);
  2146. #endif
  2147. #endif
  2148. }
  2149. inline U32x4 operator>>(const U32x4& left, uint32_t bitOffset) {
  2150. #if defined USE_SSE2
  2151. return U32x4(_mm_srli_epi32(left.v, bitOffset));
  2152. #else
  2153. #if defined USE_NEON
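// vshlq_u32 shifts right when given a negative shift count, which is the NEON idiom for a variable right shift by register.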
2154. return U32x4(vshlq_u32(left.v, LOAD_SCALAR_I32_SIMD(-(int32_t)bitOffset)));
  2155. #else
  2156. return U32x4(left.scalars[0] >> bitOffset, left.scalars[1] >> bitOffset, left.scalars[2] >> bitOffset, left.scalars[3] >> bitOffset);
  2157. #endif
  2158. #endif
  2159. }
  2160. inline U16x8 operator+(const U16x8& left, const U16x8& right) {
  2161. #if defined USE_BASIC_SIMD
  2162. return U16x8(ADD_U16_SIMD(left.v, right.v));
  2163. #else
  2164. return U16x8(left.scalars[0] + right.scalars[0], left.scalars[1] + right.scalars[1], left.scalars[2] + right.scalars[2], left.scalars[3] + right.scalars[3],
  2165. left.scalars[4] + right.scalars[4], left.scalars[5] + right.scalars[5], left.scalars[6] + right.scalars[6], left.scalars[7] + right.scalars[7]);
  2166. #endif
  2167. }
  2168. inline U16x8 operator-(const U16x8& left, const U16x8& right) {
  2169. #if defined USE_BASIC_SIMD
  2170. return U16x8(SUB_U16_SIMD(left.v, right.v));
  2171. #else
  2172. return U16x8(left.scalars[0] - right.scalars[0], left.scalars[1] - right.scalars[1], left.scalars[2] - right.scalars[2], left.scalars[3] - right.scalars[3],
  2173. left.scalars[4] - right.scalars[4], left.scalars[5] - right.scalars[5], left.scalars[6] - right.scalars[6], left.scalars[7] - right.scalars[7]);
  2174. #endif
  2175. }
  2176. inline U16x8 operator*(const U16x8& left, const U16x8& right) {
  2177. #if defined USE_BASIC_SIMD
  2178. return U16x8(MUL_U16_SIMD(left.v, right.v));
  2179. #else
  2180. return U16x8(left.scalars[0] * right.scalars[0], left.scalars[1] * right.scalars[1], left.scalars[2] * right.scalars[2], left.scalars[3] * right.scalars[3],
  2181. left.scalars[4] * right.scalars[4], left.scalars[5] * right.scalars[5], left.scalars[6] * right.scalars[6], left.scalars[7] * right.scalars[7]);
  2182. #endif
  2183. }
  2184. inline U8x16 operator+(const U8x16& left, const U8x16& right) {
  2185. #if defined USE_BASIC_SIMD
  2186. return U8x16(ADD_U8_SIMD(left.v, right.v));
  2187. #else
  2188. return U8x16(
  2189. left.scalars[0] + right.scalars[0],
  2190. left.scalars[1] + right.scalars[1],
  2191. left.scalars[2] + right.scalars[2],
  2192. left.scalars[3] + right.scalars[3],
  2193. left.scalars[4] + right.scalars[4],
  2194. left.scalars[5] + right.scalars[5],
  2195. left.scalars[6] + right.scalars[6],
  2196. left.scalars[7] + right.scalars[7],
  2197. left.scalars[8] + right.scalars[8],
  2198. left.scalars[9] + right.scalars[9],
  2199. left.scalars[10] + right.scalars[10],
  2200. left.scalars[11] + right.scalars[11],
  2201. left.scalars[12] + right.scalars[12],
  2202. left.scalars[13] + right.scalars[13],
  2203. left.scalars[14] + right.scalars[14],
  2204. left.scalars[15] + right.scalars[15]
  2205. );
  2206. #endif
  2207. }
  2208. inline U8x16 operator-(const U8x16& left, const U8x16& right) {
  2209. #if defined USE_BASIC_SIMD
  2210. return U8x16(SUB_U8_SIMD(left.v, right.v));
  2211. #else
  2212. return U8x16(
  2213. left.scalars[0] - right.scalars[0],
  2214. left.scalars[1] - right.scalars[1],
  2215. left.scalars[2] - right.scalars[2],
  2216. left.scalars[3] - right.scalars[3],
  2217. left.scalars[4] - right.scalars[4],
  2218. left.scalars[5] - right.scalars[5],
  2219. left.scalars[6] - right.scalars[6],
  2220. left.scalars[7] - right.scalars[7],
  2221. left.scalars[8] - right.scalars[8],
  2222. left.scalars[9] - right.scalars[9],
  2223. left.scalars[10] - right.scalars[10],
  2224. left.scalars[11] - right.scalars[11],
  2225. left.scalars[12] - right.scalars[12],
  2226. left.scalars[13] - right.scalars[13],
  2227. left.scalars[14] - right.scalars[14],
  2228. left.scalars[15] - right.scalars[15]
  2229. );
  2230. #endif
  2231. }
  2232. inline uint8_t impl_limit0(int32_t x) { return x < 0 ? 0 : x; }
  2233. inline uint8_t impl_limit255(uint32_t x) { return x > 255 ? 255 : x; }
  2234. inline U8x16 saturatedAddition(const U8x16& left, const U8x16& right) {
  2235. #if defined USE_BASIC_SIMD
  2236. return U8x16(ADD_SAT_U8_SIMD(left.v, right.v));
  2237. #else
  2238. return U8x16(
  2239. impl_limit255((uint32_t)left.scalars[0] + (uint32_t)right.scalars[0]),
  2240. impl_limit255((uint32_t)left.scalars[1] + (uint32_t)right.scalars[1]),
  2241. impl_limit255((uint32_t)left.scalars[2] + (uint32_t)right.scalars[2]),
  2242. impl_limit255((uint32_t)left.scalars[3] + (uint32_t)right.scalars[3]),
  2243. impl_limit255((uint32_t)left.scalars[4] + (uint32_t)right.scalars[4]),
  2244. impl_limit255((uint32_t)left.scalars[5] + (uint32_t)right.scalars[5]),
  2245. impl_limit255((uint32_t)left.scalars[6] + (uint32_t)right.scalars[6]),
  2246. impl_limit255((uint32_t)left.scalars[7] + (uint32_t)right.scalars[7]),
  2247. impl_limit255((uint32_t)left.scalars[8] + (uint32_t)right.scalars[8]),
  2248. impl_limit255((uint32_t)left.scalars[9] + (uint32_t)right.scalars[9]),
  2249. impl_limit255((uint32_t)left.scalars[10] + (uint32_t)right.scalars[10]),
  2250. impl_limit255((uint32_t)left.scalars[11] + (uint32_t)right.scalars[11]),
  2251. impl_limit255((uint32_t)left.scalars[12] + (uint32_t)right.scalars[12]),
  2252. impl_limit255((uint32_t)left.scalars[13] + (uint32_t)right.scalars[13]),
  2253. impl_limit255((uint32_t)left.scalars[14] + (uint32_t)right.scalars[14]),
  2254. impl_limit255((uint32_t)left.scalars[15] + (uint32_t)right.scalars[15])
  2255. );
  2256. #endif
  2257. }
  2258. inline U8x16 saturatedSubtraction(const U8x16& left, const U8x16& right) {
  2259. #if defined USE_BASIC_SIMD
  2260. return U8x16(SUB_SAT_U8_SIMD(left.v, right.v));
  2261. #else
  2262. return U8x16(
  2263. impl_limit0((int32_t)left.scalars[0] - (int32_t)right.scalars[0]),
  2264. impl_limit0((int32_t)left.scalars[1] - (int32_t)right.scalars[1]),
  2265. impl_limit0((int32_t)left.scalars[2] - (int32_t)right.scalars[2]),
  2266. impl_limit0((int32_t)left.scalars[3] - (int32_t)right.scalars[3]),
  2267. impl_limit0((int32_t)left.scalars[4] - (int32_t)right.scalars[4]),
  2268. impl_limit0((int32_t)left.scalars[5] - (int32_t)right.scalars[5]),
  2269. impl_limit0((int32_t)left.scalars[6] - (int32_t)right.scalars[6]),
  2270. impl_limit0((int32_t)left.scalars[7] - (int32_t)right.scalars[7]),
  2271. impl_limit0((int32_t)left.scalars[8] - (int32_t)right.scalars[8]),
  2272. impl_limit0((int32_t)left.scalars[9] - (int32_t)right.scalars[9]),
  2273. impl_limit0((int32_t)left.scalars[10] - (int32_t)right.scalars[10]),
  2274. impl_limit0((int32_t)left.scalars[11] - (int32_t)right.scalars[11]),
  2275. impl_limit0((int32_t)left.scalars[12] - (int32_t)right.scalars[12]),
  2276. impl_limit0((int32_t)left.scalars[13] - (int32_t)right.scalars[13]),
  2277. impl_limit0((int32_t)left.scalars[14] - (int32_t)right.scalars[14]),
  2278. impl_limit0((int32_t)left.scalars[15] - (int32_t)right.scalars[15])
  2279. );
  2280. #endif
  2281. }
  2282. inline I32x4 truncateToI32(const F32x4& vector) {
  2283. #if defined USE_BASIC_SIMD
  2284. return I32x4(F32_TO_I32_SIMD(vector.v));
  2285. #else
  2286. return I32x4((int32_t)vector.scalars[0], (int32_t)vector.scalars[1], (int32_t)vector.scalars[2], (int32_t)vector.scalars[3]);
  2287. #endif
  2288. }
  2289. inline U32x4 truncateToU32(const F32x4& vector) {
  2290. #if defined USE_BASIC_SIMD
  2291. return U32x4(F32_TO_U32_SIMD(vector.v));
  2292. #else
  2293. return U32x4((uint32_t)vector.scalars[0], (uint32_t)vector.scalars[1], (uint32_t)vector.scalars[2], (uint32_t)vector.scalars[3]);
  2294. #endif
  2295. }
  2296. inline F32x4 floatFromI32(const I32x4& vector) {
  2297. #if defined USE_BASIC_SIMD
  2298. return F32x4(I32_TO_F32_SIMD(vector.v));
  2299. #else
  2300. return F32x4((float)vector.scalars[0], (float)vector.scalars[1], (float)vector.scalars[2], (float)vector.scalars[3]);
  2301. #endif
  2302. }
  2303. inline F32x4 floatFromU32(const U32x4& vector) {
  2304. #if defined USE_BASIC_SIMD
  2305. return F32x4(U32_TO_F32_SIMD(vector.v));
  2306. #else
  2307. return F32x4((float)vector.scalars[0], (float)vector.scalars[1], (float)vector.scalars[2], (float)vector.scalars[3]);
  2308. #endif
  2309. }
  2310. inline I32x4 I32FromU32(const U32x4& vector) {
  2311. #if defined USE_BASIC_SIMD
  2312. return I32x4(REINTERPRET_U32_TO_I32_SIMD(vector.v));
  2313. #else
  2314. return I32x4((int32_t)vector.scalars[0], (int32_t)vector.scalars[1], (int32_t)vector.scalars[2], (int32_t)vector.scalars[3]);
  2315. #endif
  2316. }
  2317. inline U32x4 U32FromI32(const I32x4& vector) {
  2318. #if defined USE_BASIC_SIMD
  2319. return U32x4(REINTERPRET_I32_TO_U32_SIMD(vector.v));
  2320. #else
  2321. return U32x4((uint32_t)vector.scalars[0], (uint32_t)vector.scalars[1], (uint32_t)vector.scalars[2], (uint32_t)vector.scalars[3]);
  2322. #endif
  2323. }
  2324. // Warning! Behavior depends on endianness.
  2325. inline U8x16 reinterpret_U8FromU32(const U32x4& vector) {
  2326. #if defined USE_BASIC_SIMD
  2327. return U8x16(REINTERPRET_U32_TO_U8_SIMD(vector.v));
  2328. #else
  2329. const uint8_t *source = (const uint8_t*)vector.scalars;
  2330. return U8x16(
  2331. source[0], source[1], source[2], source[3], source[4], source[5], source[6], source[7],
  2332. source[8], source[9], source[10], source[11], source[12], source[13], source[14], source[15]
  2333. );
  2334. #endif
  2335. }
  2336. // Warning! Behavior depends on endianness.
  2337. inline U32x4 reinterpret_U32FromU8(const U8x16& vector) {
  2338. #if defined USE_BASIC_SIMD
  2339. return U32x4(REINTERPRET_U8_TO_U32_SIMD(vector.v));
  2340. #else
  2341. const uint32_t *source = (const uint32_t*)vector.scalars;
  2342. return U32x4(source[0], source[1], source[2], source[3]);
  2343. #endif
  2344. }
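// Example (illustrative sketch, not part of the API): reinterpretation keeps the raw bytes and only
// changes how they are grouped, so which byte of each 32-bit lane comes first follows the host's
// endianness. A round trip is portable, but reading individual bytes by index is not.
//   U32x4 packed(0x11223344u, 0u, 0u, 0u);
//   U8x16 bytes = reinterpret_U8FromU32(packed); // First byte is 0x44 on little-endian, 0x11 on big-endian.
//   U32x4 back  = reinterpret_U32FromU8(bytes);  // Restores the original lanes on any platform.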
  2345. // Unpacking to larger integers
  2346. inline U32x4 lowerToU32(const U16x8& vector) {
  2347. #if defined USE_BASIC_SIMD
  2348. return U32x4(U16_LOW_TO_U32_SIMD(vector.v));
  2349. #else
  2350. return U32x4(vector.scalars[0], vector.scalars[1], vector.scalars[2], vector.scalars[3]);
  2351. #endif
  2352. }
  2353. inline U32x4 higherToU32(const U16x8& vector) {
  2354. #if defined USE_BASIC_SIMD
  2355. return U32x4(U16_HIGH_TO_U32_SIMD(vector.v));
  2356. #else
  2357. return U32x4(vector.scalars[4], vector.scalars[5], vector.scalars[6], vector.scalars[7]);
  2358. #endif
  2359. }
  2360. inline U16x8 lowerToU16(const U8x16& vector) {
  2361. #if defined USE_BASIC_SIMD
  2362. return U16x8(U8_LOW_TO_U16_SIMD(vector.v));
  2363. #else
  2364. return U16x8(
  2365. vector.scalars[0], vector.scalars[1], vector.scalars[2], vector.scalars[3],
  2366. vector.scalars[4], vector.scalars[5], vector.scalars[6], vector.scalars[7]
  2367. );
  2368. #endif
  2369. }
  2370. inline U16x8 higherToU16(const U8x16& vector) {
  2371. #if defined USE_BASIC_SIMD
  2372. return U16x8(U8_HIGH_TO_U16_SIMD(vector.v));
  2373. #else
  2374. return U16x8(
  2375. vector.scalars[8], vector.scalars[9], vector.scalars[10], vector.scalars[11],
  2376. vector.scalars[12], vector.scalars[13], vector.scalars[14], vector.scalars[15]
  2377. );
  2378. #endif
  2379. }
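// Example (illustrative sketch, not part of the API, with hypothetical variable names): widen 8-bit
// data to 16 bits before arithmetic that could overflow a byte, processing both halves of the U8x16.
//   U16x8 sumLow  = lowerToU16(pixelsA) + lowerToU16(pixelsB);   // elements 0..7, no 8-bit wrap-around
//   U16x8 sumHigh = higherToU16(pixelsA) + higherToU16(pixelsB); // elements 8..15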
  2380. // Saturated packing
  2381. inline U8x16 saturateToU8(const U16x8& lower, const U16x8& upper) {
  2382. #if defined USE_BASIC_SIMD
  2383. return U8x16(PACK_SAT_U16_TO_U8_SIMD(lower.v, upper.v));
  2384. #else
  2385. return U8x16(
  2386. impl_limit255(lower.scalars[0]),
  2387. impl_limit255(lower.scalars[1]),
  2388. impl_limit255(lower.scalars[2]),
  2389. impl_limit255(lower.scalars[3]),
  2390. impl_limit255(lower.scalars[4]),
  2391. impl_limit255(lower.scalars[5]),
  2392. impl_limit255(lower.scalars[6]),
  2393. impl_limit255(lower.scalars[7]),
  2394. impl_limit255(upper.scalars[0]),
  2395. impl_limit255(upper.scalars[1]),
  2396. impl_limit255(upper.scalars[2]),
  2397. impl_limit255(upper.scalars[3]),
  2398. impl_limit255(upper.scalars[4]),
  2399. impl_limit255(upper.scalars[5]),
  2400. impl_limit255(upper.scalars[6]),
  2401. impl_limit255(upper.scalars[7])
  2402. );
  2403. #endif
  2404. }
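// Example (illustrative sketch continuing the widening example above): after working in 16 bits,
// pack the two halves back into a single U8x16, clamping anything above 255 instead of wrapping.
//   U8x16 packedSum = saturateToU8(sumLow, sumHigh);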
  2405. // Unary negation for convenience and code readability.
// Before using unary negation, always check whether:
// * the addition can be turned into a subtraction instead:
//     x = -a + b  becomes  x = b - a
// * a multiplying constant or scalar can be negated instead:
//     x = -b * 2  becomes  x = b * -2
// A usage sketch follows the operators below.
  2413. inline F32x4 operator-(const F32x4& value) {
  2414. #if defined USE_BASIC_SIMD
  2415. return F32x4(0.0f) - value;
  2416. #else
  2417. return F32x4(-value.scalars[0], -value.scalars[1], -value.scalars[2], -value.scalars[3]);
  2418. #endif
  2419. }
  2420. inline I32x4 operator-(const I32x4& value) {
  2421. #if defined USE_BASIC_SIMD
  2422. return I32x4(0) - value;
  2423. #else
  2424. return I32x4(-value.scalars[0], -value.scalars[1], -value.scalars[2], -value.scalars[3]);
  2425. #endif
  2426. }
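// Example (illustrative sketch of the advice above, using hypothetical variable names): both lines
// compute the same result, but the second avoids materializing a negated temporary vector.
//   F32x4 viaNegation    = -offset + base; // unary negation followed by addition
//   F32x4 viaSubtraction = base - offset;  // a single subtraction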
  2427. // Helper macros for generating the vector extract functions.
  2428. // Having one function for each type and offset makes sure that the compiler gets an immediate integer within the valid range.
  2429. #if defined USE_BASIC_SIMD
  2430. #if defined USE_SSE2
  2431. #if defined USE_SSSE3
  2432. #define _MM_ALIGNR_EPI8(A, B, OFFSET) _mm_alignr_epi8(A, B, OFFSET)
  2433. #else
  2434. // If SSSE3 is not used, emulate it using stack memory and unaligned reading of data.
  2435. static inline SIMD_U8x16 _MM_ALIGNR_EPI8(SIMD_U8x16 a, SIMD_U8x16 b, int offset) {
  2436. ALIGN16 uint8_t vectorBuffer[32];
  2437. _mm_store_si128((SIMD_U8x16*)(vectorBuffer), b);
  2438. _mm_store_si128((SIMD_U8x16*)(vectorBuffer + 16), a);
  2439. return _mm_loadu_si128((SIMD_U8x16*)(vectorBuffer + offset));
  2440. }
  2441. #endif
  2442. #define VECTOR_EXTRACT_GENERATOR_U8(OFFSET, FALLBACK_RESULT) return U8x16(_MM_ALIGNR_EPI8(b.v, a.v, OFFSET));
  2443. #define VECTOR_EXTRACT_GENERATOR_U16(OFFSET, FALLBACK_RESULT) return U16x8(_MM_ALIGNR_EPI8(b.v, a.v, OFFSET * 2));
  2444. #define VECTOR_EXTRACT_GENERATOR_U32(OFFSET, FALLBACK_RESULT) return U32x4(_MM_ALIGNR_EPI8(b.v, a.v, OFFSET * 4));
  2445. #define VECTOR_EXTRACT_GENERATOR_I32(OFFSET, FALLBACK_RESULT) return I32x4(_MM_ALIGNR_EPI8(b.v, a.v, OFFSET * 4));
  2446. #define VECTOR_EXTRACT_GENERATOR_F32(OFFSET, FALLBACK_RESULT) return F32x4(SIMD_F32x4(_MM_ALIGNR_EPI8(SIMD_U32x4(b.v), SIMD_U32x4(a.v), OFFSET * 4)));
  2447. #elif defined USE_NEON
  2448. #define VECTOR_EXTRACT_GENERATOR_U8(OFFSET, FALLBACK_RESULT) return U8x16(vextq_u8(a.v, b.v, OFFSET));
  2449. #define VECTOR_EXTRACT_GENERATOR_U16(OFFSET, FALLBACK_RESULT) return U16x8(vextq_u16(a.v, b.v, OFFSET));
  2450. #define VECTOR_EXTRACT_GENERATOR_U32(OFFSET, FALLBACK_RESULT) return U32x4(vextq_u32(a.v, b.v, OFFSET));
  2451. #define VECTOR_EXTRACT_GENERATOR_I32(OFFSET, FALLBACK_RESULT) return I32x4(vextq_s32(a.v, b.v, OFFSET));
  2452. #define VECTOR_EXTRACT_GENERATOR_F32(OFFSET, FALLBACK_RESULT) return F32x4(vextq_f32(a.v, b.v, OFFSET));
  2453. #endif
  2454. #else
  2455. #define VECTOR_EXTRACT_GENERATOR_U8(OFFSET, FALLBACK_RESULT) return FALLBACK_RESULT;
  2456. #define VECTOR_EXTRACT_GENERATOR_U16(OFFSET, FALLBACK_RESULT) return FALLBACK_RESULT;
  2457. #define VECTOR_EXTRACT_GENERATOR_U32(OFFSET, FALLBACK_RESULT) return FALLBACK_RESULT;
  2458. #define VECTOR_EXTRACT_GENERATOR_I32(OFFSET, FALLBACK_RESULT) return FALLBACK_RESULT;
  2459. #define VECTOR_EXTRACT_GENERATOR_F32(OFFSET, FALLBACK_RESULT) return FALLBACK_RESULT;
  2460. #endif
// Vector extraction concatenates two input vectors and reads a vector between them at the given element offset.
  2462. // The first and last offsets that only return one of the inputs can be used for readability, because they will be inlined and removed by the compiler.
  2463. // To get elements from the right side, combine the center vector with the right vector and shift one element to the left using vectorExtract_1 for the given type.
  2464. // To get elements from the left side, combine the left vector with the center vector and shift one element to the right using vectorExtract_15 for 16 lanes, vectorExtract_7 for 8 lanes, or vectorExtract_3 for 4 lanes.
  2465. U8x16 inline vectorExtract_0(const U8x16 &a, const U8x16 &b) { return a; }
  2466. U8x16 inline vectorExtract_1(const U8x16 &a, const U8x16 &b) { VECTOR_EXTRACT_GENERATOR_U8(1, U8x16(a.scalars[1], a.scalars[2], a.scalars[3], a.scalars[4], a.scalars[5], a.scalars[6], a.scalars[7], a.scalars[8], a.scalars[9], a.scalars[10], a.scalars[11], a.scalars[12], a.scalars[13], a.scalars[14], a.scalars[15], b.scalars[0])) }
  2467. U8x16 inline vectorExtract_2(const U8x16 &a, const U8x16 &b) { VECTOR_EXTRACT_GENERATOR_U8(2, U8x16(a.scalars[2], a.scalars[3], a.scalars[4], a.scalars[5], a.scalars[6], a.scalars[7], a.scalars[8], a.scalars[9], a.scalars[10], a.scalars[11], a.scalars[12], a.scalars[13], a.scalars[14], a.scalars[15], b.scalars[0], b.scalars[1])) }
  2468. U8x16 inline vectorExtract_3(const U8x16 &a, const U8x16 &b) { VECTOR_EXTRACT_GENERATOR_U8(3, U8x16(a.scalars[3], a.scalars[4], a.scalars[5], a.scalars[6], a.scalars[7], a.scalars[8], a.scalars[9], a.scalars[10], a.scalars[11], a.scalars[12], a.scalars[13], a.scalars[14], a.scalars[15], b.scalars[0], b.scalars[1], b.scalars[2])) }
  2469. U8x16 inline vectorExtract_4(const U8x16 &a, const U8x16 &b) { VECTOR_EXTRACT_GENERATOR_U8(4, U8x16(a.scalars[4], a.scalars[5], a.scalars[6], a.scalars[7], a.scalars[8], a.scalars[9], a.scalars[10], a.scalars[11], a.scalars[12], a.scalars[13], a.scalars[14], a.scalars[15], b.scalars[0], b.scalars[1], b.scalars[2], b.scalars[3])) }
  2470. U8x16 inline vectorExtract_5(const U8x16 &a, const U8x16 &b) { VECTOR_EXTRACT_GENERATOR_U8(5, U8x16(a.scalars[5], a.scalars[6], a.scalars[7], a.scalars[8], a.scalars[9], a.scalars[10], a.scalars[11], a.scalars[12], a.scalars[13], a.scalars[14], a.scalars[15], b.scalars[0], b.scalars[1], b.scalars[2], b.scalars[3], b.scalars[4])) }
  2471. U8x16 inline vectorExtract_6(const U8x16 &a, const U8x16 &b) { VECTOR_EXTRACT_GENERATOR_U8(6, U8x16(a.scalars[6], a.scalars[7], a.scalars[8], a.scalars[9], a.scalars[10], a.scalars[11], a.scalars[12], a.scalars[13], a.scalars[14], a.scalars[15], b.scalars[0], b.scalars[1], b.scalars[2], b.scalars[3], b.scalars[4], b.scalars[5])) }
  2472. U8x16 inline vectorExtract_7(const U8x16 &a, const U8x16 &b) { VECTOR_EXTRACT_GENERATOR_U8(7, U8x16(a.scalars[7], a.scalars[8], a.scalars[9], a.scalars[10], a.scalars[11], a.scalars[12], a.scalars[13], a.scalars[14], a.scalars[15], b.scalars[0], b.scalars[1], b.scalars[2], b.scalars[3], b.scalars[4], b.scalars[5], b.scalars[6])) }
  2473. U8x16 inline vectorExtract_8(const U8x16 &a, const U8x16 &b) { VECTOR_EXTRACT_GENERATOR_U8(8, U8x16(a.scalars[8], a.scalars[9], a.scalars[10], a.scalars[11], a.scalars[12], a.scalars[13], a.scalars[14], a.scalars[15], b.scalars[0], b.scalars[1], b.scalars[2], b.scalars[3], b.scalars[4], b.scalars[5], b.scalars[6], b.scalars[7])) }
  2474. U8x16 inline vectorExtract_9(const U8x16 &a, const U8x16 &b) { VECTOR_EXTRACT_GENERATOR_U8(9, U8x16(a.scalars[9], a.scalars[10], a.scalars[11], a.scalars[12], a.scalars[13], a.scalars[14], a.scalars[15], b.scalars[0], b.scalars[1], b.scalars[2], b.scalars[3], b.scalars[4], b.scalars[5], b.scalars[6], b.scalars[7], b.scalars[8])) }
  2475. U8x16 inline vectorExtract_10(const U8x16 &a, const U8x16 &b) { VECTOR_EXTRACT_GENERATOR_U8(10, U8x16(a.scalars[10], a.scalars[11], a.scalars[12], a.scalars[13], a.scalars[14], a.scalars[15], b.scalars[0], b.scalars[1], b.scalars[2], b.scalars[3], b.scalars[4], b.scalars[5], b.scalars[6], b.scalars[7], b.scalars[8], b.scalars[9])) }
  2476. U8x16 inline vectorExtract_11(const U8x16 &a, const U8x16 &b) { VECTOR_EXTRACT_GENERATOR_U8(11, U8x16(a.scalars[11], a.scalars[12], a.scalars[13], a.scalars[14], a.scalars[15], b.scalars[0], b.scalars[1], b.scalars[2], b.scalars[3], b.scalars[4], b.scalars[5], b.scalars[6], b.scalars[7], b.scalars[8], b.scalars[9], b.scalars[10])) }
  2477. U8x16 inline vectorExtract_12(const U8x16 &a, const U8x16 &b) { VECTOR_EXTRACT_GENERATOR_U8(12, U8x16(a.scalars[12], a.scalars[13], a.scalars[14], a.scalars[15], b.scalars[0], b.scalars[1], b.scalars[2], b.scalars[3], b.scalars[4], b.scalars[5], b.scalars[6], b.scalars[7], b.scalars[8], b.scalars[9], b.scalars[10], b.scalars[11])) }
  2478. U8x16 inline vectorExtract_13(const U8x16 &a, const U8x16 &b) { VECTOR_EXTRACT_GENERATOR_U8(13, U8x16(a.scalars[13], a.scalars[14], a.scalars[15], b.scalars[0], b.scalars[1], b.scalars[2], b.scalars[3], b.scalars[4], b.scalars[5], b.scalars[6], b.scalars[7], b.scalars[8], b.scalars[9], b.scalars[10], b.scalars[11], b.scalars[12])) }
  2479. U8x16 inline vectorExtract_14(const U8x16 &a, const U8x16 &b) { VECTOR_EXTRACT_GENERATOR_U8(14, U8x16(a.scalars[14], a.scalars[15], b.scalars[0], b.scalars[1], b.scalars[2], b.scalars[3], b.scalars[4], b.scalars[5], b.scalars[6], b.scalars[7], b.scalars[8], b.scalars[9], b.scalars[10], b.scalars[11], b.scalars[12], b.scalars[13])) }
  2480. U8x16 inline vectorExtract_15(const U8x16 &a, const U8x16 &b) { VECTOR_EXTRACT_GENERATOR_U8(15, U8x16(a.scalars[15], b.scalars[0], b.scalars[1], b.scalars[2], b.scalars[3], b.scalars[4], b.scalars[5], b.scalars[6], b.scalars[7], b.scalars[8], b.scalars[9], b.scalars[10], b.scalars[11], b.scalars[12], b.scalars[13], b.scalars[14])) }
  2481. U8x16 inline vectorExtract_16(const U8x16 &a, const U8x16 &b) { return b; }
  2482. U16x8 inline vectorExtract_0(const U16x8 &a, const U16x8 &b) { return a; }
  2483. U16x8 inline vectorExtract_1(const U16x8 &a, const U16x8 &b) { VECTOR_EXTRACT_GENERATOR_U16(1, U16x8(a.scalars[1], a.scalars[2], a.scalars[3], a.scalars[4], a.scalars[5], a.scalars[6], a.scalars[7], b.scalars[0])) }
  2484. U16x8 inline vectorExtract_2(const U16x8 &a, const U16x8 &b) { VECTOR_EXTRACT_GENERATOR_U16(2, U16x8(a.scalars[2], a.scalars[3], a.scalars[4], a.scalars[5], a.scalars[6], a.scalars[7], b.scalars[0], b.scalars[1])) }
  2485. U16x8 inline vectorExtract_3(const U16x8 &a, const U16x8 &b) { VECTOR_EXTRACT_GENERATOR_U16(3, U16x8(a.scalars[3], a.scalars[4], a.scalars[5], a.scalars[6], a.scalars[7], b.scalars[0], b.scalars[1], b.scalars[2])) }
  2486. U16x8 inline vectorExtract_4(const U16x8 &a, const U16x8 &b) { VECTOR_EXTRACT_GENERATOR_U16(4, U16x8(a.scalars[4], a.scalars[5], a.scalars[6], a.scalars[7], b.scalars[0], b.scalars[1], b.scalars[2], b.scalars[3])) }
  2487. U16x8 inline vectorExtract_5(const U16x8 &a, const U16x8 &b) { VECTOR_EXTRACT_GENERATOR_U16(5, U16x8(a.scalars[5], a.scalars[6], a.scalars[7], b.scalars[0], b.scalars[1], b.scalars[2], b.scalars[3], b.scalars[4])) }
  2488. U16x8 inline vectorExtract_6(const U16x8 &a, const U16x8 &b) { VECTOR_EXTRACT_GENERATOR_U16(6, U16x8(a.scalars[6], a.scalars[7], b.scalars[0], b.scalars[1], b.scalars[2], b.scalars[3], b.scalars[4], b.scalars[5])) }
  2489. U16x8 inline vectorExtract_7(const U16x8 &a, const U16x8 &b) { VECTOR_EXTRACT_GENERATOR_U16(7, U16x8(a.scalars[7], b.scalars[0], b.scalars[1], b.scalars[2], b.scalars[3], b.scalars[4], b.scalars[5], b.scalars[6])) }
  2490. U16x8 inline vectorExtract_8(const U16x8 &a, const U16x8 &b) { return b; }
  2491. U32x4 inline vectorExtract_0(const U32x4 &a, const U32x4 &b) { return a; }
  2492. U32x4 inline vectorExtract_1(const U32x4 &a, const U32x4 &b) { VECTOR_EXTRACT_GENERATOR_U32(1, U32x4(a.scalars[1], a.scalars[2], a.scalars[3], b.scalars[0])) }
  2493. U32x4 inline vectorExtract_2(const U32x4 &a, const U32x4 &b) { VECTOR_EXTRACT_GENERATOR_U32(2, U32x4(a.scalars[2], a.scalars[3], b.scalars[0], b.scalars[1])) }
  2494. U32x4 inline vectorExtract_3(const U32x4 &a, const U32x4 &b) { VECTOR_EXTRACT_GENERATOR_U32(3, U32x4(a.scalars[3], b.scalars[0], b.scalars[1], b.scalars[2])) }
  2495. U32x4 inline vectorExtract_4(const U32x4 &a, const U32x4 &b) { return b; }
  2496. I32x4 inline vectorExtract_0(const I32x4 &a, const I32x4 &b) { return a; }
  2497. I32x4 inline vectorExtract_1(const I32x4 &a, const I32x4 &b) { VECTOR_EXTRACT_GENERATOR_I32(1, I32x4(a.scalars[1], a.scalars[2], a.scalars[3], b.scalars[0])) }
  2498. I32x4 inline vectorExtract_2(const I32x4 &a, const I32x4 &b) { VECTOR_EXTRACT_GENERATOR_I32(2, I32x4(a.scalars[2], a.scalars[3], b.scalars[0], b.scalars[1])) }
  2499. I32x4 inline vectorExtract_3(const I32x4 &a, const I32x4 &b) { VECTOR_EXTRACT_GENERATOR_I32(3, I32x4(a.scalars[3], b.scalars[0], b.scalars[1], b.scalars[2])) }
  2500. I32x4 inline vectorExtract_4(const I32x4 &a, const I32x4 &b) { return b; }
  2501. F32x4 inline vectorExtract_0(const F32x4 &a, const F32x4 &b) { return a; }
  2502. F32x4 inline vectorExtract_1(const F32x4 &a, const F32x4 &b) { VECTOR_EXTRACT_GENERATOR_F32(1, F32x4(a.scalars[1], a.scalars[2], a.scalars[3], b.scalars[0])) }
  2503. F32x4 inline vectorExtract_2(const F32x4 &a, const F32x4 &b) { VECTOR_EXTRACT_GENERATOR_F32(2, F32x4(a.scalars[2], a.scalars[3], b.scalars[0], b.scalars[1])) }
  2504. F32x4 inline vectorExtract_3(const F32x4 &a, const F32x4 &b) { VECTOR_EXTRACT_GENERATOR_F32(3, F32x4(a.scalars[3], b.scalars[0], b.scalars[1], b.scalars[2])) }
  2505. F32x4 inline vectorExtract_4(const F32x4 &a, const F32x4 &b) { return b; }
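// Example (illustrative sketch, not part of the API, with hypothetical variable names): a three-tap
// average over consecutive F32x4 vectors, where vectorExtract reaches one element into the
// neighboring vectors as described above.
//   F32x4 previous = vectorExtract_3(leftVector, centerVector);  // each lane's left neighbor
//   F32x4 next     = vectorExtract_1(centerVector, rightVector); // each lane's right neighbor
//   F32x4 blurred  = (previous + centerVector + next) * F32x4(1.0f / 3.0f);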
  2506. // Gather instructions load memory from a pointer at multiple index offsets at the same time.
// The given pointers should be aligned to 4 bytes, so that the fallback implementation also works on machines with strict alignment requirements.
  2508. #if defined USE_AVX2
  2509. #define GATHER_I32x4_AVX2(SOURCE, FOUR_OFFSETS, SCALE) _mm_i32gather_epi32((const int32_t*)(SOURCE), FOUR_OFFSETS, SCALE)
  2510. #define GATHER_U32x4_AVX2(SOURCE, FOUR_OFFSETS, SCALE) _mm_i32gather_epi32((const int32_t*)(SOURCE), FOUR_OFFSETS, SCALE)
  2511. #define GATHER_F32x4_AVX2(SOURCE, FOUR_OFFSETS, SCALE) _mm_i32gather_ps((const float*)(SOURCE), FOUR_OFFSETS, SCALE)
  2512. #endif
  2513. static inline U32x4 gather(const dsr::SafePointer<uint32_t> data, const U32x4 &elementOffset) {
  2514. #if defined USE_AVX2
  2515. // TODO: Implement safety checks for debug mode.
  2516. return U32x4(GATHER_U32x4_AVX2(data.getUnsafe(), elementOffset.v, 4));
  2517. #else
  2518. ALIGN16 uint32_t elementOffsets[4];
  2519. elementOffset.writeAlignedUnsafe(elementOffsets);
  2520. return U32x4(
  2521. *(data + elementOffsets[0]),
  2522. *(data + elementOffsets[1]),
  2523. *(data + elementOffsets[2]),
  2524. *(data + elementOffsets[3])
  2525. );
  2526. #endif
  2527. }
  2528. static inline I32x4 gather(const dsr::SafePointer<int32_t> data, const U32x4 &elementOffset) {
  2529. #if defined USE_AVX2
  2530. // TODO: Implement safety checks for debug mode.
return I32x4(GATHER_I32x4_AVX2(data.getUnsafe(), elementOffset.v, 4));
  2532. #else
  2533. ALIGN16 uint32_t elementOffsets[4];
  2534. elementOffset.writeAlignedUnsafe(elementOffsets);
  2535. return I32x4(
  2536. *(data + elementOffsets[0]),
  2537. *(data + elementOffsets[1]),
  2538. *(data + elementOffsets[2]),
  2539. *(data + elementOffsets[3])
  2540. );
  2541. #endif
  2542. }
  2543. static inline F32x4 gather(const dsr::SafePointer<float> data, const U32x4 &elementOffset) {
  2544. #if defined USE_AVX2
  2545. // TODO: Implement safety checks for debug mode.
  2546. return F32x4(GATHER_F32x4_AVX2(data.getUnsafe(), elementOffset.v, 4));
  2547. #else
  2548. ALIGN16 uint32_t elementOffsets[4];
  2549. elementOffset.writeAlignedUnsafe(elementOffsets);
  2550. return F32x4(
  2551. *(data + elementOffsets[0]),
  2552. *(data + elementOffsets[1]),
  2553. *(data + elementOffsets[2]),
  2554. *(data + elementOffsets[3])
  2555. );
  2556. #endif
  2557. }
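// Example (illustrative sketch, not part of the API): using gather as a per-lane table lookup.
// The table is assumed to point into a 4-byte aligned buffer owned by the caller.
//   static inline F32x4 lookupFour(const dsr::SafePointer<float> table, const U32x4 &indices) {
//       return gather(table, indices); // One AVX2 gather, or four scalar loads in the fallback.
//   }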
  2558. inline F32x8 operator+(const F32x8& left, const F32x8& right) {
  2559. #if defined USE_256BIT_F_SIMD
  2560. return F32x8(ADD_F32_SIMD256(left.v, right.v));
  2561. #else
  2562. return F32x8(
  2563. left.scalars[0] + right.scalars[0],
  2564. left.scalars[1] + right.scalars[1],
  2565. left.scalars[2] + right.scalars[2],
  2566. left.scalars[3] + right.scalars[3],
  2567. left.scalars[4] + right.scalars[4],
  2568. left.scalars[5] + right.scalars[5],
  2569. left.scalars[6] + right.scalars[6],
  2570. left.scalars[7] + right.scalars[7]
  2571. );
  2572. #endif
  2573. }
  2574. inline F32x8 operator-(const F32x8& left, const F32x8& right) {
  2575. #if defined USE_256BIT_F_SIMD
  2576. return F32x8(SUB_F32_SIMD256(left.v, right.v));
  2577. #else
  2578. return F32x8(
  2579. left.scalars[0] - right.scalars[0],
  2580. left.scalars[1] - right.scalars[1],
  2581. left.scalars[2] - right.scalars[2],
  2582. left.scalars[3] - right.scalars[3],
  2583. left.scalars[4] - right.scalars[4],
  2584. left.scalars[5] - right.scalars[5],
  2585. left.scalars[6] - right.scalars[6],
  2586. left.scalars[7] - right.scalars[7]
  2587. );
  2588. #endif
  2589. }
  2590. inline F32x8 operator*(const F32x8& left, const F32x8& right) {
  2591. #if defined USE_256BIT_F_SIMD
  2592. return F32x8(MUL_F32_SIMD256(left.v, right.v));
  2593. #else
  2594. return F32x8(
  2595. left.scalars[0] * right.scalars[0],
  2596. left.scalars[1] * right.scalars[1],
  2597. left.scalars[2] * right.scalars[2],
  2598. left.scalars[3] * right.scalars[3],
  2599. left.scalars[4] * right.scalars[4],
  2600. left.scalars[5] * right.scalars[5],
  2601. left.scalars[6] * right.scalars[6],
  2602. left.scalars[7] * right.scalars[7]
  2603. );
  2604. #endif
  2605. }
  2606. inline F32x8 min(const F32x8& left, const F32x8& right) {
  2607. #if defined USE_256BIT_F_SIMD
  2608. return F32x8(MIN_F32_SIMD256(left.v, right.v));
  2609. #else
  2610. float v0 = left.scalars[0];
  2611. float v1 = left.scalars[1];
  2612. float v2 = left.scalars[2];
  2613. float v3 = left.scalars[3];
  2614. float v4 = left.scalars[4];
  2615. float v5 = left.scalars[5];
  2616. float v6 = left.scalars[6];
  2617. float v7 = left.scalars[7];
  2618. float r0 = right.scalars[0];
  2619. float r1 = right.scalars[1];
  2620. float r2 = right.scalars[2];
  2621. float r3 = right.scalars[3];
  2622. float r4 = right.scalars[4];
  2623. float r5 = right.scalars[5];
  2624. float r6 = right.scalars[6];
  2625. float r7 = right.scalars[7];
  2626. if (r0 < v0) { v0 = r0; }
  2627. if (r1 < v1) { v1 = r1; }
  2628. if (r2 < v2) { v2 = r2; }
  2629. if (r3 < v3) { v3 = r3; }
  2630. if (r4 < v4) { v4 = r4; }
  2631. if (r5 < v5) { v5 = r5; }
  2632. if (r6 < v6) { v6 = r6; }
  2633. if (r7 < v7) { v7 = r7; }
  2634. return F32x8(v0, v1, v2, v3, v4, v5, v6, v7);
  2635. #endif
  2636. }
  2637. inline F32x8 max(const F32x8& left, const F32x8& right) {
  2638. #if defined USE_256BIT_F_SIMD
  2639. return F32x8(MAX_F32_SIMD256(left.v, right.v));
  2640. #else
  2641. float v0 = left.scalars[0];
  2642. float v1 = left.scalars[1];
  2643. float v2 = left.scalars[2];
  2644. float v3 = left.scalars[3];
  2645. float v4 = left.scalars[4];
  2646. float v5 = left.scalars[5];
  2647. float v6 = left.scalars[6];
  2648. float v7 = left.scalars[7];
  2649. float r0 = right.scalars[0];
  2650. float r1 = right.scalars[1];
  2651. float r2 = right.scalars[2];
  2652. float r3 = right.scalars[3];
  2653. float r4 = right.scalars[4];
  2654. float r5 = right.scalars[5];
  2655. float r6 = right.scalars[6];
  2656. float r7 = right.scalars[7];
  2657. if (r0 > v0) { v0 = r0; }
  2658. if (r1 > v1) { v1 = r1; }
  2659. if (r2 > v2) { v2 = r2; }
  2660. if (r3 > v3) { v3 = r3; }
  2661. if (r4 > v4) { v4 = r4; }
  2662. if (r5 > v5) { v5 = r5; }
  2663. if (r6 > v6) { v6 = r6; }
  2664. if (r7 > v7) { v7 = r7; }
  2665. return F32x8(v0, v1, v2, v3, v4, v5, v6, v7);
  2666. #endif
  2667. }
  2668. inline I32x8 operator+(const I32x8& left, const I32x8& right) {
  2669. #if defined USE_256BIT_X_SIMD
  2670. return I32x8(ADD_I32_SIMD256(left.v, right.v));
  2671. #else
  2672. return I32x8(
  2673. left.scalars[0] + right.scalars[0],
  2674. left.scalars[1] + right.scalars[1],
  2675. left.scalars[2] + right.scalars[2],
  2676. left.scalars[3] + right.scalars[3],
  2677. left.scalars[4] + right.scalars[4],
  2678. left.scalars[5] + right.scalars[5],
  2679. left.scalars[6] + right.scalars[6],
  2680. left.scalars[7] + right.scalars[7]);
  2681. #endif
  2682. }
  2683. inline I32x8 operator-(const I32x8& left, const I32x8& right) {
  2684. #if defined USE_256BIT_X_SIMD
  2685. return I32x8(SUB_I32_SIMD256(left.v, right.v));
  2686. #else
  2687. return I32x8(
  2688. left.scalars[0] - right.scalars[0],
  2689. left.scalars[1] - right.scalars[1],
  2690. left.scalars[2] - right.scalars[2],
  2691. left.scalars[3] - right.scalars[3],
  2692. left.scalars[4] - right.scalars[4],
  2693. left.scalars[5] - right.scalars[5],
  2694. left.scalars[6] - right.scalars[6],
  2695. left.scalars[7] - right.scalars[7]);
  2696. #endif
  2697. }
  2698. inline I32x8 operator*(const I32x8& left, const I32x8& right) {
  2699. #if defined USE_AVX2
  2700. return I32x8(MUL_I32_SIMD256(left.v, right.v));
  2701. #else
  2702. return I32x8(
  2703. left.scalars[0] * right.scalars[0],
  2704. left.scalars[1] * right.scalars[1],
  2705. left.scalars[2] * right.scalars[2],
  2706. left.scalars[3] * right.scalars[3],
  2707. left.scalars[4] * right.scalars[4],
  2708. left.scalars[5] * right.scalars[5],
  2709. left.scalars[6] * right.scalars[6],
  2710. left.scalars[7] * right.scalars[7]
  2711. );
  2712. #endif
  2713. }
  2714. inline U32x8 operator+(const U32x8& left, const U32x8& right) {
  2715. #if defined USE_256BIT_X_SIMD
  2716. return U32x8(ADD_U32_SIMD256(left.v, right.v));
  2717. #else
  2718. return U32x8(
  2719. left.scalars[0] + right.scalars[0],
  2720. left.scalars[1] + right.scalars[1],
  2721. left.scalars[2] + right.scalars[2],
  2722. left.scalars[3] + right.scalars[3],
  2723. left.scalars[4] + right.scalars[4],
  2724. left.scalars[5] + right.scalars[5],
  2725. left.scalars[6] + right.scalars[6],
  2726. left.scalars[7] + right.scalars[7]
  2727. );
  2728. #endif
  2729. }
  2730. inline U32x8 operator-(const U32x8& left, const U32x8& right) {
  2731. #if defined USE_256BIT_X_SIMD
  2732. return U32x8(SUB_U32_SIMD256(left.v, right.v));
  2733. #else
  2734. return U32x8(
  2735. left.scalars[0] - right.scalars[0],
  2736. left.scalars[1] - right.scalars[1],
  2737. left.scalars[2] - right.scalars[2],
  2738. left.scalars[3] - right.scalars[3],
  2739. left.scalars[4] - right.scalars[4],
  2740. left.scalars[5] - right.scalars[5],
  2741. left.scalars[6] - right.scalars[6],
  2742. left.scalars[7] - right.scalars[7]
  2743. );
  2744. #endif
  2745. }
  2746. inline U32x8 operator*(const U32x8& left, const U32x8& right) {
  2747. #if defined USE_AVX2
  2748. return U32x8(MUL_U32_SIMD256(left.v, right.v));
  2749. #else
  2750. return U32x8(
  2751. left.scalars[0] * right.scalars[0],
  2752. left.scalars[1] * right.scalars[1],
  2753. left.scalars[2] * right.scalars[2],
  2754. left.scalars[3] * right.scalars[3],
  2755. left.scalars[4] * right.scalars[4],
  2756. left.scalars[5] * right.scalars[5],
  2757. left.scalars[6] * right.scalars[6],
  2758. left.scalars[7] * right.scalars[7]
  2759. );
  2760. #endif
  2761. }
  2762. inline U32x8 operator&(const U32x8& left, const U32x8& right) {
  2763. #if defined USE_256BIT_X_SIMD
  2764. return U32x8(BITWISE_AND_U32_SIMD256(left.v, right.v));
  2765. #else
  2766. return U32x8(
  2767. left.scalars[0] & right.scalars[0],
  2768. left.scalars[1] & right.scalars[1],
  2769. left.scalars[2] & right.scalars[2],
  2770. left.scalars[3] & right.scalars[3],
  2771. left.scalars[4] & right.scalars[4],
  2772. left.scalars[5] & right.scalars[5],
  2773. left.scalars[6] & right.scalars[6],
  2774. left.scalars[7] & right.scalars[7]
  2775. );
  2776. #endif
  2777. }
  2778. inline U32x8 operator|(const U32x8& left, const U32x8& right) {
  2779. #if defined USE_256BIT_X_SIMD
  2780. return U32x8(BITWISE_OR_U32_SIMD256(left.v, right.v));
  2781. #else
  2782. return U32x8(
  2783. left.scalars[0] | right.scalars[0],
  2784. left.scalars[1] | right.scalars[1],
  2785. left.scalars[2] | right.scalars[2],
  2786. left.scalars[3] | right.scalars[3],
  2787. left.scalars[4] | right.scalars[4],
  2788. left.scalars[5] | right.scalars[5],
  2789. left.scalars[6] | right.scalars[6],
  2790. left.scalars[7] | right.scalars[7]
  2791. );
  2792. #endif
  2793. }
  2794. inline U32x8 operator^(const U32x8& left, const U32x8& right) {
  2795. #if defined USE_256BIT_X_SIMD
  2796. return U32x8(BITWISE_XOR_U32_SIMD256(left.v, right.v));
  2797. #else
  2798. return U32x8(
  2799. left.scalars[0] ^ right.scalars[0],
  2800. left.scalars[1] ^ right.scalars[1],
  2801. left.scalars[2] ^ right.scalars[2],
  2802. left.scalars[3] ^ right.scalars[3],
  2803. left.scalars[4] ^ right.scalars[4],
  2804. left.scalars[5] ^ right.scalars[5],
  2805. left.scalars[6] ^ right.scalars[6],
  2806. left.scalars[7] ^ right.scalars[7]
  2807. );
  2808. #endif
  2809. }
  2810. inline U32x8 operator<<(const U32x8& left, uint32_t bitOffset) {
  2811. #if defined USE_AVX2
  2812. return U32x8(_mm256_slli_epi32(left.v, bitOffset));
  2813. #else
  2814. return U32x8(
  2815. left.scalars[0] << bitOffset,
  2816. left.scalars[1] << bitOffset,
  2817. left.scalars[2] << bitOffset,
  2818. left.scalars[3] << bitOffset,
  2819. left.scalars[4] << bitOffset,
  2820. left.scalars[5] << bitOffset,
  2821. left.scalars[6] << bitOffset,
  2822. left.scalars[7] << bitOffset
  2823. );
  2824. #endif
  2825. }
  2826. inline U32x8 operator>>(const U32x8& left, uint32_t bitOffset) {
  2827. #if defined USE_AVX2
  2828. return U32x8(_mm256_srli_epi32(left.v, bitOffset));
  2829. #else
  2830. return U32x8(
  2831. left.scalars[0] >> bitOffset,
  2832. left.scalars[1] >> bitOffset,
  2833. left.scalars[2] >> bitOffset,
  2834. left.scalars[3] >> bitOffset,
  2835. left.scalars[4] >> bitOffset,
  2836. left.scalars[5] >> bitOffset,
  2837. left.scalars[6] >> bitOffset,
  2838. left.scalars[7] >> bitOffset
  2839. );
  2840. #endif
  2841. }
  2842. inline U16x16 operator+(const U16x16& left, const U16x16& right) {
  2843. #if defined USE_256BIT_X_SIMD
  2844. return U16x16(ADD_U16_SIMD256(left.v, right.v));
  2845. #else
  2846. return U16x16(
  2847. left.scalars[0] + right.scalars[0],
  2848. left.scalars[1] + right.scalars[1],
  2849. left.scalars[2] + right.scalars[2],
  2850. left.scalars[3] + right.scalars[3],
  2851. left.scalars[4] + right.scalars[4],
  2852. left.scalars[5] + right.scalars[5],
  2853. left.scalars[6] + right.scalars[6],
  2854. left.scalars[7] + right.scalars[7],
  2855. left.scalars[8] + right.scalars[8],
  2856. left.scalars[9] + right.scalars[9],
  2857. left.scalars[10] + right.scalars[10],
  2858. left.scalars[11] + right.scalars[11],
  2859. left.scalars[12] + right.scalars[12],
  2860. left.scalars[13] + right.scalars[13],
  2861. left.scalars[14] + right.scalars[14],
  2862. left.scalars[15] + right.scalars[15]
  2863. );
  2864. #endif
  2865. }
  2866. inline U16x16 operator-(const U16x16& left, const U16x16& right) {
  2867. #if defined USE_256BIT_X_SIMD
  2868. return U16x16(SUB_U16_SIMD256(left.v, right.v));
  2869. #else
  2870. return U16x16(
  2871. left.scalars[0] - right.scalars[0],
  2872. left.scalars[1] - right.scalars[1],
  2873. left.scalars[2] - right.scalars[2],
  2874. left.scalars[3] - right.scalars[3],
  2875. left.scalars[4] - right.scalars[4],
  2876. left.scalars[5] - right.scalars[5],
  2877. left.scalars[6] - right.scalars[6],
  2878. left.scalars[7] - right.scalars[7],
  2879. left.scalars[8] - right.scalars[8],
  2880. left.scalars[9] - right.scalars[9],
  2881. left.scalars[10] - right.scalars[10],
  2882. left.scalars[11] - right.scalars[11],
  2883. left.scalars[12] - right.scalars[12],
  2884. left.scalars[13] - right.scalars[13],
  2885. left.scalars[14] - right.scalars[14],
  2886. left.scalars[15] - right.scalars[15]
  2887. );
  2888. #endif
  2889. }
  2890. inline U16x16 operator*(const U16x16& left, const U16x16& right) {
  2891. #if defined USE_256BIT_X_SIMD
  2892. return U16x16(MUL_U16_SIMD256(left.v, right.v));
  2893. #else
  2894. return U16x16(
  2895. left.scalars[0] * right.scalars[0],
  2896. left.scalars[1] * right.scalars[1],
  2897. left.scalars[2] * right.scalars[2],
  2898. left.scalars[3] * right.scalars[3],
  2899. left.scalars[4] * right.scalars[4],
  2900. left.scalars[5] * right.scalars[5],
  2901. left.scalars[6] * right.scalars[6],
  2902. left.scalars[7] * right.scalars[7],
  2903. left.scalars[8] * right.scalars[8],
  2904. left.scalars[9] * right.scalars[9],
  2905. left.scalars[10] * right.scalars[10],
  2906. left.scalars[11] * right.scalars[11],
  2907. left.scalars[12] * right.scalars[12],
  2908. left.scalars[13] * right.scalars[13],
  2909. left.scalars[14] * right.scalars[14],
  2910. left.scalars[15] * right.scalars[15]
  2911. );
  2912. #endif
  2913. }
  2914. inline U8x32 operator+(const U8x32& left, const U8x32& right) {
  2915. #if defined USE_256BIT_X_SIMD
  2916. return U8x32(ADD_U8_SIMD256(left.v, right.v));
  2917. #else
  2918. U8x32 result = U8x32::create_dangerous_uninitialized();
  2919. for (int i = 0; i < 32; i++) {
  2920. result.scalars[i] = left.scalars[i] + right.scalars[i];
  2921. }
  2922. return result;
  2923. #endif
  2924. }
  2925. inline U8x32 operator-(const U8x32& left, const U8x32& right) {
  2926. #if defined USE_256BIT_X_SIMD
  2927. return U8x32(SUB_U8_SIMD256(left.v, right.v));
  2928. #else
  2929. U8x32 result = U8x32::create_dangerous_uninitialized();
  2930. for (int i = 0; i < 32; i++) {
  2931. result.scalars[i] = left.scalars[i] - right.scalars[i];
  2932. }
  2933. return result;
  2934. #endif
  2935. }
  2936. inline U8x32 saturatedAddition(const U8x32& left, const U8x32& right) {
  2937. #if defined USE_256BIT_X_SIMD
  2938. return U8x32(ADD_SAT_U8_SIMD256(left.v, right.v));
  2939. #else
  2940. U8x32 result = U8x32::create_dangerous_uninitialized();
  2941. for (int i = 0; i < 32; i++) {
  2942. result.scalars[i] = impl_limit255((uint32_t)left.scalars[i] + (uint32_t)right.scalars[i]);
  2943. }
  2944. return result;
  2945. #endif
  2946. }
  2947. inline U8x32 saturatedSubtraction(const U8x32& left, const U8x32& right) {
  2948. #if defined USE_256BIT_X_SIMD
  2949. return U8x32(SUB_SAT_U8_SIMD256(left.v, right.v));
  2950. #else
  2951. U8x32 result = U8x32::create_dangerous_uninitialized();
  2952. for (int i = 0; i < 32; i++) {
  2953. result.scalars[i] = impl_limit0((int32_t)left.scalars[i] - (int32_t)right.scalars[i]);
  2954. }
  2955. return result;
  2956. #endif
  2957. }
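// Example (illustrative sketch, not part of the API): brightening 32 packed 8-bit values at once
// without wrap-around. This assumes U8x32 offers a single-value broadcast constructor like the
// narrower vector types; otherwise build the constant vector explicitly.
//   U8x32 brightened = saturatedAddition(pixels, U8x32(64)); // 250 + 64 clamps to 255 instead of wrapping to 58.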
  2958. inline I32x8 truncateToI32(const F32x8& vector) {
  2959. #if defined USE_256BIT_X_SIMD
  2960. return I32x8(F32_TO_I32_SIMD256(vector.v));
  2961. #else
  2962. return I32x8(
  2963. (int32_t)vector.scalars[0], (int32_t)vector.scalars[1], (int32_t)vector.scalars[2], (int32_t)vector.scalars[3],
  2964. (int32_t)vector.scalars[4], (int32_t)vector.scalars[5], (int32_t)vector.scalars[6], (int32_t)vector.scalars[7]
  2965. );
  2966. #endif
  2967. }
  2968. inline U32x8 truncateToU32(const F32x8& vector) {
  2969. #if defined USE_256BIT_X_SIMD
  2970. return U32x8(F32_TO_U32_SIMD256(vector.v));
  2971. #else
  2972. return U32x8(
  2973. (uint32_t)vector.scalars[0], (uint32_t)vector.scalars[1], (uint32_t)vector.scalars[2], (uint32_t)vector.scalars[3],
  2974. (uint32_t)vector.scalars[4], (uint32_t)vector.scalars[5], (uint32_t)vector.scalars[6], (uint32_t)vector.scalars[7]
  2975. );
  2976. #endif
  2977. }
  2978. inline F32x8 floatFromI32(const I32x8& vector) {
  2979. #if defined USE_256BIT_X_SIMD
  2980. return F32x8(I32_TO_F32_SIMD256(vector.v));
  2981. #else
  2982. return F32x8(
  2983. (float)vector.scalars[0], (float)vector.scalars[1], (float)vector.scalars[2], (float)vector.scalars[3],
  2984. (float)vector.scalars[4], (float)vector.scalars[5], (float)vector.scalars[6], (float)vector.scalars[7]
  2985. );
  2986. #endif
  2987. }
  2988. inline F32x8 floatFromU32(const U32x8& vector) {
  2989. #if defined USE_256BIT_X_SIMD
  2990. return F32x8(U32_TO_F32_SIMD256(vector.v));
  2991. #else
  2992. return F32x8(
  2993. (float)vector.scalars[0], (float)vector.scalars[1], (float)vector.scalars[2], (float)vector.scalars[3],
  2994. (float)vector.scalars[4], (float)vector.scalars[5], (float)vector.scalars[6], (float)vector.scalars[7]
  2995. );
  2996. #endif
  2997. }
  2998. inline I32x8 I32FromU32(const U32x8& vector) {
  2999. #if defined USE_256BIT_X_SIMD
  3000. return I32x8(REINTERPRET_U32_TO_I32_SIMD256(vector.v));
  3001. #else
  3002. return I32x8(
  3003. (int32_t)vector.scalars[0], (int32_t)vector.scalars[1], (int32_t)vector.scalars[2], (int32_t)vector.scalars[3],
  3004. (int32_t)vector.scalars[4], (int32_t)vector.scalars[5], (int32_t)vector.scalars[6], (int32_t)vector.scalars[7]
  3005. );
  3006. #endif
  3007. }
  3008. inline U32x8 U32FromI32(const I32x8& vector) {
  3009. #if defined USE_256BIT_X_SIMD
  3010. return U32x8(REINTERPRET_I32_TO_U32_SIMD256(vector.v));
  3011. #else
  3012. return U32x8(
  3013. (uint32_t)vector.scalars[0], (uint32_t)vector.scalars[1], (uint32_t)vector.scalars[2], (uint32_t)vector.scalars[3],
  3014. (uint32_t)vector.scalars[4], (uint32_t)vector.scalars[5], (uint32_t)vector.scalars[6], (uint32_t)vector.scalars[7]
  3015. );
  3016. #endif
  3017. }
  3018. // Warning! Behavior depends on endianness.
  3019. inline U8x32 reinterpret_U8FromU32(const U32x8& vector) {
  3020. #if defined USE_256BIT_X_SIMD
  3021. return U8x32(REINTERPRET_U32_TO_U8_SIMD256(vector.v));
  3022. #else
  3023. const uint8_t *source = (const uint8_t*)vector.scalars;
  3024. U8x32 result = U8x32::create_dangerous_uninitialized();
  3025. for (int i = 0; i < 32; i++) {
  3026. result.scalars[i] = source[i];
  3027. }
  3028. return result;
  3029. #endif
  3030. }
  3031. // Warning! Behavior depends on endianness.
  3032. inline U32x8 reinterpret_U32FromU8(const U8x32& vector) {
  3033. #if defined USE_256BIT_X_SIMD
  3034. return U32x8(REINTERPRET_U8_TO_U32_SIMD256(vector.v));
  3035. #else
  3036. const uint32_t *source = (const uint32_t*)vector.scalars;
  3037. return U32x8(source[0], source[1], source[2], source[3], source[4], source[5], source[6], source[7]);
  3038. #endif
  3039. }
  3040. // Unpacking to larger integers
  3041. inline U32x8 lowerToU32(const U16x16& vector) {
  3042. #if defined USE_256BIT_X_SIMD
  3043. return U32x8(U16_LOW_TO_U32_SIMD256(vector.v));
  3044. #else
  3045. return U32x8(vector.scalars[0], vector.scalars[1], vector.scalars[2], vector.scalars[3], vector.scalars[4], vector.scalars[5], vector.scalars[6], vector.scalars[7]);
  3046. #endif
  3047. }
  3048. inline U32x8 higherToU32(const U16x16& vector) {
  3049. #if defined USE_256BIT_X_SIMD
  3050. return U32x8(U16_HIGH_TO_U32_SIMD256(vector.v));
  3051. #else
  3052. return U32x8(vector.scalars[8], vector.scalars[9], vector.scalars[10], vector.scalars[11], vector.scalars[12], vector.scalars[13], vector.scalars[14], vector.scalars[15]);
  3053. #endif
  3054. }
  3055. inline U16x16 lowerToU16(const U8x32& vector) {
  3056. #if defined USE_256BIT_X_SIMD
  3057. return U16x16(U8_LOW_TO_U16_SIMD256(vector.v));
  3058. #else
  3059. return U16x16(
  3060. vector.scalars[0], vector.scalars[1], vector.scalars[2], vector.scalars[3],
  3061. vector.scalars[4], vector.scalars[5], vector.scalars[6], vector.scalars[7],
  3062. vector.scalars[8], vector.scalars[9], vector.scalars[10], vector.scalars[11],
  3063. vector.scalars[12], vector.scalars[13], vector.scalars[14], vector.scalars[15]
  3064. );
  3065. #endif
  3066. }
  3067. inline U16x16 higherToU16(const U8x32& vector) {
  3068. #if defined USE_256BIT_X_SIMD
  3069. return U16x16(U8_HIGH_TO_U16_SIMD256(vector.v));
  3070. #else
  3071. return U16x16(
  3072. vector.scalars[16], vector.scalars[17], vector.scalars[18], vector.scalars[19],
  3073. vector.scalars[20], vector.scalars[21], vector.scalars[22], vector.scalars[23],
  3074. vector.scalars[24], vector.scalars[25], vector.scalars[26], vector.scalars[27],
  3075. vector.scalars[28], vector.scalars[29], vector.scalars[30], vector.scalars[31]
  3076. );
  3077. #endif
  3078. }
  3079. // Saturated packing
  3080. inline U8x32 saturateToU8(const U16x16& lower, const U16x16& upper) {
  3081. #if defined USE_256BIT_X_SIMD
  3082. return U8x32(PACK_SAT_U16_TO_U8_SIMD256(lower.v, upper.v));
  3083. #else
  3084. U8x32 result = U8x32::create_dangerous_uninitialized();
  3085. for (int i = 0; i < 16; i++) {
  3086. result.scalars[i] = impl_limit255(lower.scalars[i]);
  3087. }
  3088. for (int i = 0; i < 16; i++) {
  3089. result.scalars[i + 16] = impl_limit255(upper.scalars[i]);
  3090. }
  3091. return result;
  3092. #endif
  3093. }
  3094. // Unary negation for convenience and code readability.
// Before using unary negation, always check whether:
// * the addition can be turned into a subtraction instead:
//     x = -a + b  becomes  x = b - a
// * a multiplying constant or scalar can be negated instead:
//     x = -b * 2  becomes  x = b * -2
  3102. inline F32x8 operator-(const F32x8& value) {
  3103. #if defined USE_256BIT_F_SIMD
  3104. return F32x8(0.0f) - value;
  3105. #else
  3106. return F32x8(
  3107. -value.scalars[0], -value.scalars[1], -value.scalars[2], -value.scalars[3],
  3108. -value.scalars[4], -value.scalars[5], -value.scalars[6], -value.scalars[7]
  3109. );
  3110. #endif
  3111. }
  3112. inline I32x8 operator-(const I32x8& value) {
  3113. #if defined USE_256BIT_X_SIMD
  3114. return I32x8(0) - value;
  3115. #else
  3116. return I32x8(
  3117. -value.scalars[0], -value.scalars[1], -value.scalars[2], -value.scalars[3],
  3118. -value.scalars[4], -value.scalars[5], -value.scalars[6], -value.scalars[7]
  3119. );
  3120. #endif
  3121. }
  3122. // Helper macros for generating the vector extract functions.
  3123. // Having one function for each type and offset makes sure that the compiler gets an immediate integer within the valid range.
  3124. #if defined USE_AVX2
  3125. // AVX2 does not offer any 256-bit element extraction, only two 128-bit shifts done in parallel, so we might as well use two separate 128-bit extractions.
  3126. template <int OFFSET>
  3127. __m256i impl_extractBytes_AVX2(const __m256i leftInput, const __m256i rightInput) {
  3128. // Extract three halves depending on which ones overlap with the offset.
  3129. __m128i leftPart = _mm256_extractf128_si256(leftInput, (OFFSET < 16) ? 0 : 1);
  3130. __m128i middlePart = _mm256_extractf128_si256(OFFSET < 16 ? rightInput : leftInput, (OFFSET < 16) ? 1 : 0);
  3131. __m128i rightPart = _mm256_extractf128_si256(rightInput, (OFFSET < 16) ? 0 : 1);
  3132. // Combine two 128-bit extracts into a whole 256-bit extract.
  3133. return _mm256_set_m128i(
  3134. _mm_alignr_epi8(leftPart, middlePart, OFFSET - ((OFFSET < 16) ? 0 : 16)),
  3135. _mm_alignr_epi8(middlePart, rightPart, OFFSET - ((OFFSET < 16) ? 0 : 16))
  3136. );
  3137. }
  3138. #define VECTOR_EXTRACT_GENERATOR_256_U8(OFFSET) return U8x32(impl_extractBytes_AVX2<OFFSET>(b.v, a.v));
  3139. #define VECTOR_EXTRACT_GENERATOR_256_U16(OFFSET) return U16x16(impl_extractBytes_AVX2<OFFSET * 2>(b.v, a.v));
  3140. #define VECTOR_EXTRACT_GENERATOR_256_U32(OFFSET) return U32x8(impl_extractBytes_AVX2<OFFSET * 4>(b.v, a.v));
  3141. #define VECTOR_EXTRACT_GENERATOR_256_I32(OFFSET) return I32x8(impl_extractBytes_AVX2<OFFSET * 4>(b.v, a.v));
  3142. #define VECTOR_EXTRACT_GENERATOR_256_F32(OFFSET) return F32x8(SIMD_F32x8(impl_extractBytes_AVX2<OFFSET * 4>(SIMD_U32x8(b.v), SIMD_U32x8(a.v))));
  3143. #else
  3144. template<typename T, int elementCount>
  3145. T vectorExtract_emulated(const T &a, const T &b, int offset) {
  3146. // For safety reasons, uninitialized default construction of SIMD vectors is only available during full emulation, not partial AVX support.
  3147. T result = T::create_dangerous_uninitialized();
  3148. int t = 0;
  3149. for (int s = offset; s < elementCount; s++) {
  3150. result.scalars[t] = a.scalars[s];
  3151. t++;
  3152. }
  3153. for (int s = 0; s < offset; s++) {
  3154. result.scalars[t] = b.scalars[s];
  3155. t++;
  3156. }
  3157. return result;
  3158. }
  3159. #define VECTOR_EXTRACT_GENERATOR_256_U8(OFFSET) return vectorExtract_emulated<U8x32, 32>(a, b, OFFSET);
  3160. #define VECTOR_EXTRACT_GENERATOR_256_U16(OFFSET) return vectorExtract_emulated<U16x16, 16>(a, b, OFFSET);
  3161. #define VECTOR_EXTRACT_GENERATOR_256_U32(OFFSET) return vectorExtract_emulated<U32x8, 8>(a, b, OFFSET);
  3162. #define VECTOR_EXTRACT_GENERATOR_256_I32(OFFSET) return vectorExtract_emulated<I32x8, 8>(a, b, OFFSET);
  3163. #define VECTOR_EXTRACT_GENERATOR_256_F32(OFFSET) return vectorExtract_emulated<F32x8, 8>(a, b, OFFSET);
  3164. #endif
// Vector extraction concatenates two input vectors and reads a vector between them at the given element offset.
  3166. // The first and last offsets that only return one of the inputs can be used for readability, because they will be inlined and removed by the compiler.
  3167. // To get elements from the right side, combine the center vector with the right vector and shift one element to the left using vectorExtract_1 for the given type.
  3168. // To get elements from the left side, combine the left vector with the center vector and shift one element to the right using vectorExtract_15 for 16 lanes, vectorExtract_7 for 8 lanes, or vectorExtract_3 for 4 lanes.
  3169. U8x32 inline vectorExtract_0(const U8x32 &a, const U8x32 &b) { return a; }
  3170. U8x32 inline vectorExtract_1(const U8x32 &a, const U8x32 &b) { VECTOR_EXTRACT_GENERATOR_256_U8(1) }
  3171. U8x32 inline vectorExtract_2(const U8x32 &a, const U8x32 &b) { VECTOR_EXTRACT_GENERATOR_256_U8(2) }
  3172. U8x32 inline vectorExtract_3(const U8x32 &a, const U8x32 &b) { VECTOR_EXTRACT_GENERATOR_256_U8(3) }
  3173. U8x32 inline vectorExtract_4(const U8x32 &a, const U8x32 &b) { VECTOR_EXTRACT_GENERATOR_256_U8(4) }
  3174. U8x32 inline vectorExtract_5(const U8x32 &a, const U8x32 &b) { VECTOR_EXTRACT_GENERATOR_256_U8(5) }
  3175. U8x32 inline vectorExtract_6(const U8x32 &a, const U8x32 &b) { VECTOR_EXTRACT_GENERATOR_256_U8(6) }
  3176. U8x32 inline vectorExtract_7(const U8x32 &a, const U8x32 &b) { VECTOR_EXTRACT_GENERATOR_256_U8(7) }
  3177. U8x32 inline vectorExtract_8(const U8x32 &a, const U8x32 &b) { VECTOR_EXTRACT_GENERATOR_256_U8(8) }
  3178. U8x32 inline vectorExtract_9(const U8x32 &a, const U8x32 &b) { VECTOR_EXTRACT_GENERATOR_256_U8(9) }
  3179. U8x32 inline vectorExtract_10(const U8x32 &a, const U8x32 &b) { VECTOR_EXTRACT_GENERATOR_256_U8(10) }
  3180. U8x32 inline vectorExtract_11(const U8x32 &a, const U8x32 &b) { VECTOR_EXTRACT_GENERATOR_256_U8(11) }
  3181. U8x32 inline vectorExtract_12(const U8x32 &a, const U8x32 &b) { VECTOR_EXTRACT_GENERATOR_256_U8(12) }
  3182. U8x32 inline vectorExtract_13(const U8x32 &a, const U8x32 &b) { VECTOR_EXTRACT_GENERATOR_256_U8(13) }
  3183. U8x32 inline vectorExtract_14(const U8x32 &a, const U8x32 &b) { VECTOR_EXTRACT_GENERATOR_256_U8(14) }
  3184. U8x32 inline vectorExtract_15(const U8x32 &a, const U8x32 &b) { VECTOR_EXTRACT_GENERATOR_256_U8(15) }
  3185. U8x32 inline vectorExtract_16(const U8x32 &a, const U8x32 &b) { VECTOR_EXTRACT_GENERATOR_256_U8(16) }
  3186. U8x32 inline vectorExtract_17(const U8x32 &a, const U8x32 &b) { VECTOR_EXTRACT_GENERATOR_256_U8(17) }
  3187. U8x32 inline vectorExtract_18(const U8x32 &a, const U8x32 &b) { VECTOR_EXTRACT_GENERATOR_256_U8(18) }
  3188. U8x32 inline vectorExtract_19(const U8x32 &a, const U8x32 &b) { VECTOR_EXTRACT_GENERATOR_256_U8(19) }
  3189. U8x32 inline vectorExtract_20(const U8x32 &a, const U8x32 &b) { VECTOR_EXTRACT_GENERATOR_256_U8(20) }
  3190. U8x32 inline vectorExtract_21(const U8x32 &a, const U8x32 &b) { VECTOR_EXTRACT_GENERATOR_256_U8(21) }
  3191. U8x32 inline vectorExtract_22(const U8x32 &a, const U8x32 &b) { VECTOR_EXTRACT_GENERATOR_256_U8(22) }
  3192. U8x32 inline vectorExtract_23(const U8x32 &a, const U8x32 &b) { VECTOR_EXTRACT_GENERATOR_256_U8(23) }
  3193. U8x32 inline vectorExtract_24(const U8x32 &a, const U8x32 &b) { VECTOR_EXTRACT_GENERATOR_256_U8(24) }
  3194. U8x32 inline vectorExtract_25(const U8x32 &a, const U8x32 &b) { VECTOR_EXTRACT_GENERATOR_256_U8(25) }
  3195. U8x32 inline vectorExtract_26(const U8x32 &a, const U8x32 &b) { VECTOR_EXTRACT_GENERATOR_256_U8(26) }
  3196. U8x32 inline vectorExtract_27(const U8x32 &a, const U8x32 &b) { VECTOR_EXTRACT_GENERATOR_256_U8(27) }
  3197. U8x32 inline vectorExtract_28(const U8x32 &a, const U8x32 &b) { VECTOR_EXTRACT_GENERATOR_256_U8(28) }
  3198. U8x32 inline vectorExtract_29(const U8x32 &a, const U8x32 &b) { VECTOR_EXTRACT_GENERATOR_256_U8(29) }
  3199. U8x32 inline vectorExtract_30(const U8x32 &a, const U8x32 &b) { VECTOR_EXTRACT_GENERATOR_256_U8(30) }
  3200. U8x32 inline vectorExtract_31(const U8x32 &a, const U8x32 &b) { VECTOR_EXTRACT_GENERATOR_256_U8(31) }
  3201. U8x32 inline vectorExtract_32(const U8x32 &a, const U8x32 &b) { return b; }
  3202. U16x16 inline vectorExtract_0(const U16x16 &a, const U16x16 &b) { return a; }
  3203. U16x16 inline vectorExtract_1(const U16x16 &a, const U16x16 &b) { VECTOR_EXTRACT_GENERATOR_256_U16(1) }
  3204. U16x16 inline vectorExtract_2(const U16x16 &a, const U16x16 &b) { VECTOR_EXTRACT_GENERATOR_256_U16(2) }
  3205. U16x16 inline vectorExtract_3(const U16x16 &a, const U16x16 &b) { VECTOR_EXTRACT_GENERATOR_256_U16(3) }
  3206. U16x16 inline vectorExtract_4(const U16x16 &a, const U16x16 &b) { VECTOR_EXTRACT_GENERATOR_256_U16(4) }
U16x16 inline vectorExtract_5(const U16x16 &a, const U16x16 &b) { VECTOR_EXTRACT_GENERATOR_256_U16(5) }
U16x16 inline vectorExtract_6(const U16x16 &a, const U16x16 &b) { VECTOR_EXTRACT_GENERATOR_256_U16(6) }
U16x16 inline vectorExtract_7(const U16x16 &a, const U16x16 &b) { VECTOR_EXTRACT_GENERATOR_256_U16(7) }
U16x16 inline vectorExtract_8(const U16x16 &a, const U16x16 &b) { VECTOR_EXTRACT_GENERATOR_256_U16(8) }
U16x16 inline vectorExtract_9(const U16x16 &a, const U16x16 &b) { VECTOR_EXTRACT_GENERATOR_256_U16(9) }
U16x16 inline vectorExtract_10(const U16x16 &a, const U16x16 &b) { VECTOR_EXTRACT_GENERATOR_256_U16(10) }
U16x16 inline vectorExtract_11(const U16x16 &a, const U16x16 &b) { VECTOR_EXTRACT_GENERATOR_256_U16(11) }
U16x16 inline vectorExtract_12(const U16x16 &a, const U16x16 &b) { VECTOR_EXTRACT_GENERATOR_256_U16(12) }
U16x16 inline vectorExtract_13(const U16x16 &a, const U16x16 &b) { VECTOR_EXTRACT_GENERATOR_256_U16(13) }
U16x16 inline vectorExtract_14(const U16x16 &a, const U16x16 &b) { VECTOR_EXTRACT_GENERATOR_256_U16(14) }
U16x16 inline vectorExtract_15(const U16x16 &a, const U16x16 &b) { VECTOR_EXTRACT_GENERATOR_256_U16(15) }
U16x16 inline vectorExtract_16(const U16x16 &a, const U16x16 &b) { return b; }
U32x8 inline vectorExtract_0(const U32x8 &a, const U32x8 &b) { return a; }
U32x8 inline vectorExtract_1(const U32x8 &a, const U32x8 &b) { VECTOR_EXTRACT_GENERATOR_256_U32(1) }
U32x8 inline vectorExtract_2(const U32x8 &a, const U32x8 &b) { VECTOR_EXTRACT_GENERATOR_256_U32(2) }
U32x8 inline vectorExtract_3(const U32x8 &a, const U32x8 &b) { VECTOR_EXTRACT_GENERATOR_256_U32(3) }
U32x8 inline vectorExtract_4(const U32x8 &a, const U32x8 &b) { VECTOR_EXTRACT_GENERATOR_256_U32(4) }
U32x8 inline vectorExtract_5(const U32x8 &a, const U32x8 &b) { VECTOR_EXTRACT_GENERATOR_256_U32(5) }
U32x8 inline vectorExtract_6(const U32x8 &a, const U32x8 &b) { VECTOR_EXTRACT_GENERATOR_256_U32(6) }
U32x8 inline vectorExtract_7(const U32x8 &a, const U32x8 &b) { VECTOR_EXTRACT_GENERATOR_256_U32(7) }
U32x8 inline vectorExtract_8(const U32x8 &a, const U32x8 &b) { return b; }
I32x8 inline vectorExtract_0(const I32x8 &a, const I32x8 &b) { return a; }
I32x8 inline vectorExtract_1(const I32x8 &a, const I32x8 &b) { VECTOR_EXTRACT_GENERATOR_256_I32(1) }
I32x8 inline vectorExtract_2(const I32x8 &a, const I32x8 &b) { VECTOR_EXTRACT_GENERATOR_256_I32(2) }
I32x8 inline vectorExtract_3(const I32x8 &a, const I32x8 &b) { VECTOR_EXTRACT_GENERATOR_256_I32(3) }
I32x8 inline vectorExtract_4(const I32x8 &a, const I32x8 &b) { VECTOR_EXTRACT_GENERATOR_256_I32(4) }
I32x8 inline vectorExtract_5(const I32x8 &a, const I32x8 &b) { VECTOR_EXTRACT_GENERATOR_256_I32(5) }
I32x8 inline vectorExtract_6(const I32x8 &a, const I32x8 &b) { VECTOR_EXTRACT_GENERATOR_256_I32(6) }
I32x8 inline vectorExtract_7(const I32x8 &a, const I32x8 &b) { VECTOR_EXTRACT_GENERATOR_256_I32(7) }
I32x8 inline vectorExtract_8(const I32x8 &a, const I32x8 &b) { return b; }
F32x8 inline vectorExtract_0(const F32x8 &a, const F32x8 &b) { return a; }
F32x8 inline vectorExtract_1(const F32x8 &a, const F32x8 &b) { VECTOR_EXTRACT_GENERATOR_256_F32(1) }
F32x8 inline vectorExtract_2(const F32x8 &a, const F32x8 &b) { VECTOR_EXTRACT_GENERATOR_256_F32(2) }
F32x8 inline vectorExtract_3(const F32x8 &a, const F32x8 &b) { VECTOR_EXTRACT_GENERATOR_256_F32(3) }
F32x8 inline vectorExtract_4(const F32x8 &a, const F32x8 &b) { VECTOR_EXTRACT_GENERATOR_256_F32(4) }
F32x8 inline vectorExtract_5(const F32x8 &a, const F32x8 &b) { VECTOR_EXTRACT_GENERATOR_256_F32(5) }
F32x8 inline vectorExtract_6(const F32x8 &a, const F32x8 &b) { VECTOR_EXTRACT_GENERATOR_256_F32(6) }
F32x8 inline vectorExtract_7(const F32x8 &a, const F32x8 &b) { VECTOR_EXTRACT_GENERATOR_256_F32(7) }
F32x8 inline vectorExtract_8(const F32x8 &a, const F32x8 &b) { return b; }
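// Usage sketch (hypothetical helper, assuming vectorExtract_N reads lanes N..N+7 from the
// concatenation of a and b, which is consistent with vectorExtract_0 returning a and
// vectorExtract_8 returning b): slide a window three lanes into the next register.
static inline U32x8 example_slideWindowByThree(const U32x8 &a, const U32x8 &b) {
	// Expected to return lanes 3..7 of a followed by lanes 0..2 of b.
	return vectorExtract_3(a, b);
}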
// Gather instructions load elements from a base pointer at multiple element offsets at the same time.
// The given pointers should be aligned with 4 bytes, so that the scalar fallback also works on machines with strict alignment requirements.
#if defined USE_AVX2
#define GATHER_I32x8_AVX2(SOURCE, EIGHT_OFFSETS, SCALE) _mm256_i32gather_epi32((const int32_t*)(SOURCE), EIGHT_OFFSETS, SCALE)
#define GATHER_U32x8_AVX2(SOURCE, EIGHT_OFFSETS, SCALE) _mm256_i32gather_epi32((const int32_t*)(SOURCE), EIGHT_OFFSETS, SCALE)
#define GATHER_F32x8_AVX2(SOURCE, EIGHT_OFFSETS, SCALE) _mm256_i32gather_ps((const float*)(SOURCE), EIGHT_OFFSETS, SCALE)
#endif
static inline U32x8 gather(const dsr::SafePointer<uint32_t> data, const U32x8 &elementOffset) {
#if defined USE_AVX2
	// TODO: Implement safety checks for debug mode.
	return U32x8(GATHER_U32x8_AVX2(data.getUnsafe(), elementOffset.v, 4));
#else
	ALIGN32 uint32_t elementOffsets[8];
	elementOffset.writeAlignedUnsafe(elementOffsets);
	return U32x8(
		*(data + elementOffsets[0]),
		*(data + elementOffsets[1]),
		*(data + elementOffsets[2]),
		*(data + elementOffsets[3]),
		*(data + elementOffsets[4]),
		*(data + elementOffsets[5]),
		*(data + elementOffsets[6]),
		*(data + elementOffsets[7])
	);
#endif
}
static inline I32x8 gather(const dsr::SafePointer<int32_t> data, const U32x8 &elementOffset) {
#if defined USE_AVX2
	// TODO: Implement safety checks for debug mode.
	return I32x8(GATHER_I32x8_AVX2(data.getUnsafe(), elementOffset.v, 4));
#else
	ALIGN32 uint32_t elementOffsets[8];
	elementOffset.writeAlignedUnsafe(elementOffsets);
	return I32x8(
		*(data + elementOffsets[0]),
		*(data + elementOffsets[1]),
		*(data + elementOffsets[2]),
		*(data + elementOffsets[3]),
		*(data + elementOffsets[4]),
		*(data + elementOffsets[5]),
		*(data + elementOffsets[6]),
		*(data + elementOffsets[7])
	);
#endif
}
static inline F32x8 gather(const dsr::SafePointer<float> data, const U32x8 &elementOffset) {
#if defined USE_AVX2
	// TODO: Implement safety checks for debug mode.
	return F32x8(GATHER_F32x8_AVX2(data.getUnsafe(), elementOffset.v, 4));
#else
	ALIGN32 uint32_t elementOffsets[8];
	elementOffset.writeAlignedUnsafe(elementOffsets);
	return F32x8(
		*(data + elementOffsets[0]),
		*(data + elementOffsets[1]),
		*(data + elementOffsets[2]),
		*(data + elementOffsets[3]),
		*(data + elementOffsets[4]),
		*(data + elementOffsets[5]),
		*(data + elementOffsets[6]),
		*(data + elementOffsets[7])
	);
#endif
}
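// Usage sketch (hypothetical helper): gather eight floats that are four elements apart in one call.
// The caller must make sure that table has at least 29 readable elements.
static inline F32x8 example_gatherEveryFourth(const dsr::SafePointer<float> table) {
	// Lane i receives table[i * 4], using the element offsets 0, 4, 8, ..., 28.
	return gather(table, U32x8(0u, 4u, 8u, 12u, 16u, 20u, 24u, 28u));
}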
// Wrapper functions for explicitly expanding scalars into vectors during math operations.
#define NUMERICAL_SCALAR_OPERATIONS(VECTOR_TYPE, ELEMENT_TYPE, LANE_COUNT) \
	inline VECTOR_TYPE operator+(const VECTOR_TYPE& left, ELEMENT_TYPE right) { return left + VECTOR_TYPE(right); } \
	inline VECTOR_TYPE operator+(ELEMENT_TYPE left, const VECTOR_TYPE& right) { return VECTOR_TYPE(left) + right; } \
	inline VECTOR_TYPE operator-(const VECTOR_TYPE& left, ELEMENT_TYPE right) { return left - VECTOR_TYPE(right); } \
	inline VECTOR_TYPE operator-(ELEMENT_TYPE left, const VECTOR_TYPE& right) { return VECTOR_TYPE(left) - right; }
FOR_ALL_VECTOR_TYPES(NUMERICAL_SCALAR_OPERATIONS)
#undef NUMERICAL_SCALAR_OPERATIONS
#define MULTIPLY_SCALAR_OPERATIONS(VECTOR_TYPE, ELEMENT_TYPE, LANE_COUNT) \
	inline VECTOR_TYPE operator*(const VECTOR_TYPE& left, ELEMENT_TYPE right) { return left * VECTOR_TYPE(right); } \
	inline VECTOR_TYPE operator*(ELEMENT_TYPE left, const VECTOR_TYPE& right) { return VECTOR_TYPE(left) * right; }
// TODO: Implement multiplication for U8x16 and U8x32.
//FOR_ALL_VECTOR_TYPES(MULTIPLY_SCALAR_OPERATIONS)
MULTIPLY_SCALAR_OPERATIONS(F32x4, float, 4)
MULTIPLY_SCALAR_OPERATIONS(F32x8, float, 8)
MULTIPLY_SCALAR_OPERATIONS(U32x4, uint32_t, 4)
MULTIPLY_SCALAR_OPERATIONS(U32x8, uint32_t, 8)
MULTIPLY_SCALAR_OPERATIONS(I32x4, int32_t, 4)
MULTIPLY_SCALAR_OPERATIONS(I32x8, int32_t, 8)
MULTIPLY_SCALAR_OPERATIONS(U16x8, uint16_t, 8)
MULTIPLY_SCALAR_OPERATIONS(U16x16, uint16_t, 16)
#undef MULTIPLY_SCALAR_OPERATIONS
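// Usage sketch (hypothetical helper): the overloads above let a plain scalar stand in for a whole vector
// on either side of + - *, assuming that the single-scalar constructor broadcasts the value to all lanes.
static inline F32x4 example_scaleAndOffset(const F32x4 &values) {
	// Same as (values + F32x4(0.5f)) * F32x4(2.0f).
	return (values + 0.5f) * 2.0f;
}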
// Wrapper functions for explicitly broadcasting scalar bit masks to every lane in bitwise operations.
#define BITWISE_SCALAR_OPERATIONS(VECTOR_TYPE, ELEMENT_TYPE, LANE_COUNT) \
	inline VECTOR_TYPE operator&(const VECTOR_TYPE& left, ELEMENT_TYPE right) { return left & VECTOR_TYPE(right); } \
	inline VECTOR_TYPE operator&(ELEMENT_TYPE left, const VECTOR_TYPE& right) { return VECTOR_TYPE(left) & right; } \
	inline VECTOR_TYPE operator|(const VECTOR_TYPE& left, ELEMENT_TYPE right) { return left | VECTOR_TYPE(right); } \
	inline VECTOR_TYPE operator|(ELEMENT_TYPE left, const VECTOR_TYPE& right) { return VECTOR_TYPE(left) | right; } \
	inline VECTOR_TYPE operator^(const VECTOR_TYPE& left, ELEMENT_TYPE right) { return left ^ VECTOR_TYPE(right); } \
	inline VECTOR_TYPE operator^(ELEMENT_TYPE left, const VECTOR_TYPE& right) { return VECTOR_TYPE(left) ^ right; }
// TODO: Implement bitwise operations for all unsigned SIMD vectors.
//FOR_UNSIGNED_VECTOR_TYPES(BITWISE_SCALAR_OPERATIONS)
BITWISE_SCALAR_OPERATIONS(U32x4, uint32_t, 4)
BITWISE_SCALAR_OPERATIONS(U32x8, uint32_t, 8)
#undef BITWISE_SCALAR_OPERATIONS
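// Usage sketch (hypothetical helper): mask every lane with the same scalar constant.
static inline U32x8 example_keepLowBytes(const U32x8 &colors) {
	// Same as colors & U32x8(0xFFu), keeping only the lowest byte of each 32-bit lane.
	return colors & 0xFFu;
}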
// Cleaning up temporary macro definitions to avoid cluttering the namespace.
#undef FOR_ALL_VECTOR_TYPES
#undef FOR_FLOAT_VECTOR_TYPES
#undef FOR_INTEGER_VECTOR_TYPES
#undef FOR_SIGNED_VECTOR_TYPES
#undef FOR_UNSIGNED_VECTOR_TYPES
// The X vectors use the longest SIMD length that is efficient for both floating-point and integer types.
// DSR_DEFAULT_ALIGNMENT
//   The number of bytes that memory should be aligned to by default when creating buffers and images.
// F32xX
//   The longest available SIMD vector for storing 32-bit float values. Iterate laneCountX_32Bit floats at a time.
// I32xX
//   The longest available SIMD vector for storing signed 32-bit integer values. Iterate laneCountX_32Bit integers at a time.
// U32xX
//   The longest available SIMD vector for storing unsigned 32-bit integer values. Iterate laneCountX_32Bit integers at a time.
// U16xX
//   The longest available SIMD vector for storing unsigned 16-bit integer values. Iterate laneCountX_16Bit integers at a time.
// U8xX
//   The longest available SIMD vector for storing unsigned 8-bit integer values. Iterate laneCountX_8Bit integers at a time.
#if defined USE_256BIT_X_SIMD || defined EMULATE_256BIT_X_SIMD
// Using 256-bit SIMD.
#define DSR_DEFAULT_VECTOR_SIZE 32
#define DSR_DEFAULT_ALIGNMENT 32
using F32xX = F32x8;
using I32xX = I32x8;
using U32xX = U32x8;
using U16xX = U16x16;
using U8xX = U8x32;
// Align memory with 256 bits so that filters may overwrite the padding at the end of each pixel row.
// Otherwise every filter would need slow, bloated duplicated code to preserve the data at the end of each row.
#else
// Without hardware support for 256-bit vectors, explicitly used emulated 256-bit vectors only need to be aligned with 128 bits.
#define DSR_DEFAULT_VECTOR_SIZE 16
#define DSR_DEFAULT_ALIGNMENT 16
using F32xX = F32x4;
using I32xX = I32x4;
using U32xX = U32x4;
using U16xX = U16x8;
using U8xX = U8x16;
#endif
// How many lanes the longest available X vector has for each lane size.
// Used to iterate indices and pointers using whole elements.
static const int laneCountX_32Bit = DSR_DEFAULT_VECTOR_SIZE / 4;
static const int laneCountX_16Bit = DSR_DEFAULT_VECTOR_SIZE / 2;
static const int laneCountX_8Bit = DSR_DEFAULT_VECTOR_SIZE;
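// Usage sketch: the lane counts follow directly from DSR_DEFAULT_VECTOR_SIZE, so with 256-bit X vectors
// laneCountX_32Bit is 8 and with 128-bit X vectors it is 4. A typical loop over 32-bit elements
// (elementCount is a hypothetical name, assumed to be padded up to a whole number of vectors):
//   for (int i = 0; i < elementCount; i += laneCountX_32Bit) {
//       // Process laneCountX_32Bit elements with F32xX, I32xX or U32xX starting at index i.
//   }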
// The F vector uses the longest SIMD length that is efficient when only processing float values, even if no integer types are available in the same size.
// Used when you know that your algorithm only works with float types and you need the extra performance.
// Some processors have AVX but not AVX2, meaning that they have 256-bit SIMD for floats but only 128-bit SIMD for integers.
// F32xF
//   The longest available SIMD vector for storing 32-bit float values. Iterate laneCountF floats at a time.
#if defined USE_256BIT_F_SIMD || defined EMULATE_256BIT_F_SIMD
#define DSR_FLOAT_VECTOR_SIZE 32
#define DSR_FLOAT_ALIGNMENT 32
using F32xF = F32x8;
#else
// F vectors are 128 bits.
#define DSR_FLOAT_VECTOR_SIZE 16
#define DSR_FLOAT_ALIGNMENT 16
using F32xF = F32x4;
#endif
// Used to iterate over float pointers when using F32xF.
static const int laneCountF = DSR_FLOAT_VECTOR_SIZE / 4;
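// Usage sketch: a float-only loop can use F32xF and laneCountF to get 256-bit SIMD even on processors
// that have AVX but not AVX2 (floatCount is a hypothetical name, assumed to be padded up to a whole number of vectors):
//   for (int i = 0; i < floatCount; i += laneCountF) {
//       // Process laneCountF floats starting at index i using F32xF.
//   }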
#endif