  1. // zlib open source license
  2. //
  3. // Copyright (c) 2017 to 2025 David Forsgren Piuva
  4. //
  5. // This software is provided 'as-is', without any express or implied
  6. // warranty. In no event will the authors be held liable for any damages
  7. // arising from the use of this software.
  8. //
  9. // Permission is granted to anyone to use this software for any purpose,
  10. // including commercial applications, and to alter it and redistribute it
  11. // freely, subject to the following restrictions:
  12. //
  13. // 1. The origin of this software must not be misrepresented; you must not
  14. // claim that you wrote the original software. If you use this software
  15. // in a product, an acknowledgment in the product documentation would be
  16. // appreciated but is not required.
  17. //
  18. // 2. Altered source versions must be plainly marked as such, and must not be
  19. // misrepresented as being the original software.
  20. //
  21. // 3. This notice may not be removed or altered from any source
  22. // distribution.
  23. // Hardware abstraction layer for portable SIMD math.
  24. // Used to make calculations faster without writing hardware-specific assembly code or intrinsic functions.
  25. // You get the performance you need today and the ability to compile with automatic scalar emulation when building for a future processor that this module has not yet been ported to.
  26. // When you can generate the vectorized code using the same template function as your non-vectorized code, you don't even need to write a reference implementation.
  27. // Using the 128-bit SIMD types: (beginner friendly, test once, compile anywhere, no compiler flags; a usage sketch follows the type list below)
  28. // If you are new to vectorization or only plan to work with 128-bit extensions such as ARM NEON, you can keep it simple by only using the 128-bit vector types.
  29. // Pros and cons:
  30. // + Most target platforms (excluding older systems such as ARMv6) have 128-bit SIMD extensions such as Intel SSE2 or ARM NEON enabled by default.
  31. // ARMv6 does not support ARM NEON, but most ARMv7 processors do, so compilers usually enable NEON by default for ARMv7.
  32. // All 64-bit ARM processors have ARM NEON, because it stopped being optional in ARMv8.
  33. // Builds for 64-bit Intel processors usually have SSE2 enabled by default, so you don't have to change any compiler flags when building on a different system.
  34. // + One build for all computers of the same instruction set.
  35. // Great when your application is not very resource heavy, because the least powerful systems don't have the fancy extensions anyway.
  36. // - You might end up enabling the additional SIMD extensions anyway, because the library already uses them internally to become faster.
  37. // Types:
  38. // * Use F32x4, I32x4 and U32x4 for 4 elements at a time
  39. // * U16x8 for 8 elements at a time
  40. // * U8x16 for 16 elements at a time
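// A minimal usage sketch (editor's illustration, not part of the original documentation). It assumes the
// element-wise arithmetic operators that are declared further down in this header; brightness, weights and
// opaque are made-up names:
//   F32x4 brightness = F32x4::createGradient(0.0f, 0.1f); // {0.0, 0.1, 0.2, 0.3}
//   F32x4 weights(0.5f, 1.0f, 1.0f, 0.5f);                // Four floats in one register
//   F32x4 weighted = brightness * weights;                // Element-wise multiplication
//   U8x16 opaque = U8x16(uint8_t(255));                   // Sixteen bytes at a time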
  41. // Using the X vector size: (advanced, requires testing with different build flags or emulation; a loop sketch follows the type list below)
  42. // If you want more performance, you can use variable length type aliases.
  43. // Pros and cons:
  44. // + For heavy calculations where memory access is not the bottleneck, using larger SIMD vectors when enabled saves energy and increases performance.
  45. // - If you forget to test with longer vector lengths (compiling with -mavx2 or defining EMULATE_256BIT_X_SIMD), you might get bugs from not iterating or aligning memory correctly.
  46. // Types:
  47. // * Use F32xX, I32xX and U32xX for laneCountX_32Bit elements at a time
  48. // * U16xX for laneCountX_16Bit elements at a time
  49. // * U8xX for laneCountX_8Bit elements at a time
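// A hedged loop sketch for the X vector size (editor's illustration). It assumes that F32xX mirrors the
// readAlignedUnsafe and writeAlignedUnsafe methods shown for F32x4 below, that the element-wise operator+
// is declared later in this header, and that source, target and elementCount are provided by the caller
// with X-vector alignment and an element count that is a multiple of laneCountX_32Bit:
//   for (intptr_t i = 0; i < elementCount; i += laneCountX_32Bit) {
//     F32xX value = F32xX::readAlignedUnsafe(source + i);
//     (value + value).writeAlignedUnsafe(target + i);
//   }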
  50. // Using the F vector size: (very dangerous, no test can confirm that memory alignment is correct; an alignment sketch follows the type list below)
  51. // If you want even more performance, you can let float operations use the longest available F vector size, which might exceed the X vector size.
  52. // Pros and cons:
  53. // - You have to manually align buffers to DSR_FLOAT_ALIGNMENT to prevent crashes.
  54. // If the default buffer alignment followed the size of F vectors, the more commonly used X vectors would be slowed down by cache misses caused by padding larger than the X vector size.
  55. // AlignedImageF32 and sound backends are already aligned with the F vector size, because they are not generic like Buffer.
  56. // - It can be difficult to detect incorrect memory alignment, because a pointer can accidentally be aligned to more than what was requested.
  57. // If accidentally aligning to 128 bits instead of 256 bits, there is a 50% risk of failing to detect it at runtime, only to fail later on another computer.
  58. // If sticking with 128-bit or X vectors, all buffers will be correctly aligned automatically.
  59. // + For heavy calculations where memory access is not the bottleneck, using larger SIMD vectors when enabled saves energy and increases performance.
  60. // - If you forget to test with longer vector lengths (compiling with -mavx2 or defining EMULATE_256BIT_F_SIMD), you might get bugs from not iterating or aligning memory correctly.
  61. // Types:
  62. // * Use F32xF for laneCountF_32Bit elements at a time
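// A minimal sketch of manually aligned storage for the F vector size (editor's illustration; it assumes
// that DSR_FLOAT_ALIGNMENT expands to a byte count that can be passed to alignas):
//   alignas(DSR_FLOAT_ALIGNMENT) float samples[256]; // Safe for the longest float vectors
// Generic Buffer allocations are only guaranteed to be aligned for 128-bit and X vectors, as explained above.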
  63. // Compiler extensions (example build commands at the end of this section)
  64. // On Intel/AMD processors:
  65. // SSE2 is usually enabled by default, because SSE2 is mandatory in the 64-bit x86 instruction set.
  66. // Use -mavx as a G++ compiler flag to enable the AVX extension, enabling the USE_AVX and USE_256BIT_F_SIMD macros.
  67. // If AVX is not available on your computer, you can still test your algorithm with 256-bit float SIMD by defining EMULATE_256BIT_F_SIMD, but only if you use the F vector size.
  68. // Use -mavx2 as a G++ compiler flag to enable the AVX2 extension, enabling the USE_AVX2 and USE_256BIT_X_SIMD macros.
  69. // If AVX2 is not available on your computer, you can still test your algorithm with 256-bit float and integer SIMD by defining EMULATE_256BIT_X_SIMD, but only if you use the X vector size.
  70. // On ARMv6 processors:
  71. // Scalar emulation is used when compiling for ARMv6, because it does not have NEON and VFP is not supported in this abstraction.
  72. // On ARMv7 processors:
  73. // NEON is usually enabled by default for ARMv7, because most ARMv7 processors have the extension.
  74. // On ARMv8 processors:
  75. // NEON cannot be disabled when building for ARMv8, because the extension is mandatory in ARMv8.
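// Example build commands (editor's illustration; the source file name, the C++ standard flag and defining
// the emulation macro on the command line are assumptions, not requirements stated by this header):
//   g++ -std=c++14 -c yourCode.cpp                          (128-bit baseline, SSE2/NEON where enabled by default)
//   g++ -std=c++14 -mavx -c yourCode.cpp                    (enables USE_AVX and USE_256BIT_F_SIMD)
//   g++ -std=c++14 -mavx2 -c yourCode.cpp                   (enables USE_AVX2 and USE_256BIT_X_SIMD)
//   g++ -std=c++14 -DEMULATE_256BIT_X_SIMD -c yourCode.cpp  (tests X vector code without AVX2 hardware)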
  76. #ifndef DFPSR_SIMD
  77. #define DFPSR_SIMD
  78. #include <cstdint>
  79. #include <cassert>
  80. #include "SafePointer.h"
  81. #include "../math/FVector.h"
  82. #include "../math/IVector.h"
  83. #include "../math/UVector.h"
  84. #include "DsrTraits.h"
  85. #include "../settings.h"
  86. #include "../base/noSimd.h"
  87. namespace dsr {
  88. // Alignment in bytes
  89. #define ALIGN_BYTES(SIZE) __attribute__((aligned(SIZE)))
  90. #define ALIGN16 ALIGN_BYTES(16) // 128-bit alignment
  91. #define ALIGN32 ALIGN_BYTES(32) // 256-bit alignment
  92. #define ALIGN64 ALIGN_BYTES(64) // 512-bit alignment
  93. #define ALIGN128 ALIGN_BYTES(128) // 1024-bit alignment
  94. #define ALIGN256 ALIGN_BYTES(256) // 2048-bit alignment
  95. // Everything declared here handles things specific to SSE.
  96. // Direct use of the macros will not provide portability to all hardware.
  97. #ifdef USE_SSE2
  98. #define USE_DIRECT_SIMD_MEMORY_ACCESS
  99. #include <emmintrin.h> // SSE2
  100. #ifdef USE_SSSE3
  101. #include <tmmintrin.h> // SSSE3
  102. #endif
  103. #ifdef USE_AVX
  104. #include <immintrin.h> // AVX / AVX2
  105. #endif
  106. // Vector types
  107. #define SIMD_F32x4 __m128
  108. #define SIMD_U8x16 __m128i
  109. #define SIMD_U16x8 __m128i
  110. #define SIMD_U32x4 __m128i
  111. #define SIMD_I32x4 __m128i
  112. // Vector uploads in address order
  113. #define LOAD_VECTOR_F32_SIMD(A, B, C, D) _mm_set_ps(D, C, B, A)
  114. #define LOAD_SCALAR_F32_SIMD(A) _mm_set1_ps(A)
  115. #define LOAD_VECTOR_U8_SIMD(A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P) _mm_set_epi8(P, O, N, M, L, K, J, I, H, G, F, E, D, C, B, A)
  116. #define LOAD_SCALAR_U8_SIMD(A) _mm_set1_epi8(A)
  117. #define LOAD_VECTOR_U16_SIMD(A, B, C, D, E, F, G, H) _mm_set_epi16(H, G, F, E, D, C, B, A)
  118. #define LOAD_SCALAR_U16_SIMD(A) _mm_set1_epi16(A)
  119. #define LOAD_VECTOR_U32_SIMD(A, B, C, D) _mm_set_epi32(D, C, B, A)
  120. #define LOAD_SCALAR_U32_SIMD(A) _mm_set1_epi32(A)
  121. #define LOAD_VECTOR_I32_SIMD(A, B, C, D) _mm_set_epi32(D, C, B, A)
  122. #define LOAD_SCALAR_I32_SIMD(A) _mm_set1_epi32(A)
  123. // Conversions
  124. #define F32_TO_I32_SIMD(A) _mm_cvttps_epi32(A)
  125. #define F32_TO_U32_SIMD(A) _mm_cvttps_epi32(A)
  126. #define I32_TO_F32_SIMD(A) _mm_cvtepi32_ps(A)
  127. #define U32_TO_F32_SIMD(A) _mm_cvtepi32_ps(A)
  128. // Unpacking conversions
  129. #define U8_LOW_TO_U16_SIMD(A) _mm_unpacklo_epi8(A, _mm_set1_epi8(0))
  130. #define U8_HIGH_TO_U16_SIMD(A) _mm_unpackhi_epi8(A, _mm_set1_epi8(0))
  131. #define U16_LOW_TO_U32_SIMD(A) _mm_unpacklo_epi16(A, _mm_set1_epi16(0))
  132. #define U16_HIGH_TO_U32_SIMD(A) _mm_unpackhi_epi16(A, _mm_set1_epi16(0))
  133. // Saturated packing
  134. // Credit: Using ideas from Victoria Zhislina's NEON_2_SSE.h header from the Intel corporation, but not trying to emulate NEON
  135. inline SIMD_U8x16 PACK_SAT_U16_TO_U8_SIMD(const SIMD_U16x8& a, const SIMD_U16x8& b) {
  136. SIMD_U16x8 mask, a2, b2;
  137. mask = _mm_set1_epi16(0b0111111111111111); // 0x7FFF, every bit except the sign bit
  138. a2 = _mm_and_si128(a, mask); // Clear the sign bit, so that a2 is never negative in the signed comparison
  139. a2 = _mm_or_si128(a2, _mm_and_si128(_mm_cmpgt_epi16(a2, a), mask)); // Lanes of a at or above 0x8000 become 0x7FFF, which still saturates to 255 below
  140. b2 = _mm_and_si128(b, mask); // Same treatment for the second input
  141. b2 = _mm_or_si128(b2, _mm_and_si128(_mm_cmpgt_epi16(b2, b), mask));
  142. return _mm_packus_epi16(a2, b2); // Pack with unsigned saturation, now safe because a2 and b2 fit in the signed 16-bit range
  143. }
  144. // Reinterpret casting
  145. #define REINTERPRET_U32_TO_U8_SIMD(A) (A)
  146. #define REINTERPRET_U32_TO_U16_SIMD(A) (A)
  147. #define REINTERPRET_U8_TO_U32_SIMD(A) (A)
  148. #define REINTERPRET_U16_TO_U32_SIMD(A) (A)
  149. #define REINTERPRET_U32_TO_I32_SIMD(A) (A)
  150. #define REINTERPRET_I32_TO_U32_SIMD(A) (A)
  151. // Vector float operations returning SIMD_F32x4
  152. #define ADD_F32_SIMD(A, B) _mm_add_ps(A, B)
  153. #define SUB_F32_SIMD(A, B) _mm_sub_ps(A, B)
  154. #define MUL_F32_SIMD(A, B) _mm_mul_ps(A, B)
  155. // Vector integer operations returning SIMD_I32x4
  156. #define ADD_I32_SIMD(A, B) _mm_add_epi32(A, B)
  157. #define SUB_I32_SIMD(A, B) _mm_sub_epi32(A, B)
  158. // 32-bit integer multiplications are not available on SSE2.
  159. // Vector integer operations returning SIMD_U32x4
  160. #define ADD_U32_SIMD(A, B) _mm_add_epi32(A, B)
  161. #define SUB_U32_SIMD(A, B) _mm_sub_epi32(A, B)
  162. // 32-bit integer multiplications are not available on SSE2.
  163. // Vector integer operations returning SIMD_U16x8
  164. #define ADD_U16_SIMD(A, B) _mm_add_epi16(A, B)
  165. #define SUB_U16_SIMD(A, B) _mm_sub_epi16(A, B)
  166. #define MUL_U16_SIMD(A, B) _mm_mullo_epi16(A, B)
  167. // Vector integer operations returning SIMD_U8x16
  168. #define ADD_U8_SIMD(A, B) _mm_add_epi8(A, B)
  169. #define ADD_SAT_U8_SIMD(A, B) _mm_adds_epu8(A, B) // Saturated addition
  170. #define SUB_U8_SIMD(A, B) _mm_sub_epi8(A, B)
  171. #define SUB_SAT_U8_SIMD(A, B) _mm_subs_epu8(A, B) // Saturated subtraction
  172. // No 8-bit multiplications
  173. // Statistics
  174. #define MIN_F32_SIMD(A, B) _mm_min_ps(A, B)
  175. #define MAX_F32_SIMD(A, B) _mm_max_ps(A, B)
  176. // TODO: Implement minimum and maximum for integer vectors, so that all operations exist for all applicable types:
  177. // Using _mm256_max_epu16... in AVX2 for 256-bit versions
  178. // Using comparisons and masking in SSE2 when _mm_max_epu16... in SSE4.1 is not available
  179. // Bitwise
  180. #define BITWISE_AND_U32_SIMD(A, B) _mm_and_si128(A, B)
  181. #define BITWISE_OR_U32_SIMD(A, B) _mm_or_si128(A, B)
  182. #define BITWISE_XOR_U32_SIMD(A, B) _mm_xor_si128(A, B)
  183. #ifdef USE_AVX
  184. // 256-bit vector types
  185. #define SIMD_F32x8 __m256
  186. // Vector uploads in address order
  187. #define LOAD_VECTOR_F32_SIMD256(A, B, C, D, E, F, G, H) _mm256_set_ps(H, G, F, E, D, C, B, A)
  188. #define LOAD_SCALAR_F32_SIMD256(A) _mm256_set1_ps(A)
  189. // Vector float operations returning SIMD_F32x4
  190. #define ADD_F32_SIMD256(A, B) _mm256_add_ps(A, B)
  191. #define SUB_F32_SIMD256(A, B) _mm256_sub_ps(A, B)
  192. #define MUL_F32_SIMD256(A, B) _mm256_mul_ps(A, B)
  193. // Statistics
  194. #define MIN_F32_SIMD256(A, B) _mm256_min_ps(A, B)
  195. #define MAX_F32_SIMD256(A, B) _mm256_max_ps(A, B)
  196. #ifdef USE_AVX2
  197. // 256-bit vector types
  198. #define SIMD_U8x32 __m256i
  199. #define SIMD_U16x16 __m256i
  200. #define SIMD_U32x8 __m256i
  201. #define SIMD_I32x8 __m256i
  202. // Vector uploads in address order
  203. #define LOAD_VECTOR_U8_SIMD256(A1, B1, C1, D1, E1, F1, G1, H1, I1, J1, K1, L1, M1, N1, O1, P1, Q1, R1, S1, T1, U1, V1, W1, X1, Y1, Z1, A2, B2, C2, D2, E2, F2) _mm256_set_epi8(F2, E2, D2, C2, B2, A2, Z1, Y1, X1, W1, V1, U1, T1, S1, R1, Q1, P1, O1, N1, M1, L1, K1, J1, I1, H1, G1, F1, E1, D1, C1, B1, A1)
  204. #define LOAD_SCALAR_U8_SIMD256(A) _mm256_set1_epi8(A)
  205. #define LOAD_VECTOR_U16_SIMD256(A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P) _mm256_set_epi16(P, O, N, M, L, K, J, I, H, G, F, E, D, C, B, A)
  206. #define LOAD_SCALAR_U16_SIMD256(A) _mm256_set1_epi16(A)
  207. #define LOAD_VECTOR_U32_SIMD256(A, B, C, D, E, F, G, H) _mm256_set_epi32(H, G, F, E, D, C, B, A)
  208. #define LOAD_SCALAR_U32_SIMD256(A) _mm256_set1_epi32(A)
  209. #define LOAD_VECTOR_I32_SIMD256(A, B, C, D, E, F, G, H) _mm256_set_epi32(H, G, F, E, D, C, B, A)
  210. #define LOAD_SCALAR_I32_SIMD256(A) _mm256_set1_epi32(A)
  211. // Conversions
  212. #define F32_TO_I32_SIMD256(A) _mm256_cvttps_epi32(A)
  213. #define F32_TO_U32_SIMD256(A) _mm256_cvttps_epi32(A)
  214. #define I32_TO_F32_SIMD256(A) _mm256_cvtepi32_ps(A)
  215. #define U32_TO_F32_SIMD256(A) _mm256_cvtepi32_ps(A)
  216. // Unpacking conversions
  217. #define U8_LOW_TO_U16_SIMD256(A) _mm256_unpacklo_epi8(_mm256_permute4x64_epi64(A, 0b11011000), _mm256_set1_epi8(0))
  218. #define U8_HIGH_TO_U16_SIMD256(A) _mm256_unpackhi_epi8(_mm256_permute4x64_epi64(A, 0b11011000), _mm256_set1_epi8(0))
  219. #define U16_LOW_TO_U32_SIMD256(A) _mm256_unpacklo_epi16(_mm256_permute4x64_epi64(A, 0b11011000), _mm256_set1_epi16(0))
  220. #define U16_HIGH_TO_U32_SIMD256(A) _mm256_unpackhi_epi16(_mm256_permute4x64_epi64(A, 0b11011000), _mm256_set1_epi16(0))
  221. // Saturated packing
  222. // Credit: Using ideas from Victoria Zhislina's NEON_2_SSE.h header from the Intel corporation, but not trying to emulate NEON
  223. inline SIMD_U8x32 PACK_SAT_U16_TO_U8_SIMD256(const SIMD_U16x16& a, const SIMD_U16x16& b) {
  224. SIMD_U16x16 mask, a2, b2;
  225. mask = _mm256_set1_epi16(0b0111111111111111);
  226. a2 = _mm256_and_si256(a, mask);
  227. a2 = _mm256_or_si256(a2, _mm256_and_si256(_mm256_cmpgt_epi16(a2, a), mask));
  228. b2 = _mm256_and_si256(b, mask);
  229. b2 = _mm256_or_si256(b2, _mm256_and_si256(_mm256_cmpgt_epi16(b2, b), mask));
  230. // The 256-bit pack instruction _mm256_packus_epi16 packs within each 128-bit lane instead of across the whole register, so the result has to be permuted into the correct order.
  231. // 0 2 1 3
  232. // | X |
  233. // 0 1 2 3
  234. return _mm256_permute4x64_epi64(_mm256_packus_epi16(a2, b2), 0b11011000);
  235. }
  236. // Reinterpret casting
  237. #define REINTERPRET_U32_TO_U8_SIMD256(A) (A)
  238. #define REINTERPRET_U32_TO_U16_SIMD256(A) (A)
  239. #define REINTERPRET_U8_TO_U32_SIMD256(A) (A)
  240. #define REINTERPRET_U16_TO_U32_SIMD256(A) (A)
  241. #define REINTERPRET_U32_TO_I32_SIMD256(A) (A)
  242. #define REINTERPRET_I32_TO_U32_SIMD256(A) (A)
  243. // Vector integer operations returning SIMD_I32x4
  244. #define ADD_I32_SIMD256(A, B) _mm256_add_epi32(A, B)
  245. #define SUB_I32_SIMD256(A, B) _mm256_sub_epi32(A, B)
  246. #define MUL_I32_SIMD256(A, B) _mm256_mullo_epi32(A, B)
  247. // Vector integer operations returning SIMD_U32x4
  248. #define ADD_U32_SIMD256(A, B) _mm256_add_epi32(A, B)
  249. #define SUB_U32_SIMD256(A, B) _mm256_sub_epi32(A, B)
  250. #define MUL_U32_SIMD256(A, B) _mm256_mullo_epi32(A, B)
  251. // Vector integer operations returning SIMD_U16x8
  252. #define ADD_U16_SIMD256(A, B) _mm256_add_epi16(A, B)
  253. #define SUB_U16_SIMD256(A, B) _mm256_sub_epi16(A, B)
  254. #define MUL_U16_SIMD256(A, B) _mm256_mullo_epi16(A, B)
  255. // Vector integer operations returning SIMD_U8x16
  256. #define ADD_U8_SIMD256(A, B) _mm256_add_epi8(A, B)
  257. #define ADD_SAT_U8_SIMD256(A, B) _mm256_adds_epu8(A, B) // Saturated addition
  258. #define SUB_U8_SIMD256(A, B) _mm256_sub_epi8(A, B)
  259. #define SUB_SAT_U8_SIMD256(A, B) _mm256_subs_epu8(A, B) // Saturated subtraction
  260. // No 8-bit multiplications
  261. // Bitwise
  262. #define BITWISE_AND_U32_SIMD256(A, B) _mm256_and_si256(A, B)
  263. #define BITWISE_OR_U32_SIMD256(A, B) _mm256_or_si256(A, B)
  264. #define BITWISE_XOR_U32_SIMD256(A, B) _mm256_xor_si256(A, B)
  265. #endif
  266. #endif
  267. #endif
  268. // Everything declared here handles things specific to NEON.
  269. // Direct use of the macros will not provide portability to all hardware.
  270. #ifdef USE_NEON
  271. #include <arm_neon.h> // NEON
  272. // Vector types
  273. #define SIMD_F32x4 float32x4_t
  274. #define SIMD_U8x16 uint8x16_t
  275. #define SIMD_U16x8 uint16x8_t
  276. #define SIMD_U32x4 uint32x4_t
  277. #define SIMD_I32x4 int32x4_t
  278. // Vector uploads in address order
  279. inline SIMD_F32x4 LOAD_VECTOR_F32_SIMD(float a, float b, float c, float d) {
  280. float data[4] ALIGN16 = {a, b, c, d};
  281. return vld1q_f32(data);
  282. }
  283. inline SIMD_F32x4 LOAD_SCALAR_F32_SIMD(float a) {
  284. return vdupq_n_f32(a);
  285. }
  286. inline SIMD_U8x16 LOAD_VECTOR_U8_SIMD(uint8_t a, uint8_t b, uint8_t c, uint8_t d, uint8_t e, uint8_t f, uint8_t g, uint8_t h,
  287. uint8_t i, uint8_t j, uint8_t k, uint8_t l, uint8_t m, uint8_t n, uint8_t o, uint8_t p) {
  288. uint8_t data[16] ALIGN16 = {a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p};
  289. return vld1q_u8(data);
  290. }
  291. inline SIMD_U8x16 LOAD_SCALAR_U8_SIMD(uint8_t a) {
  292. return vdupq_n_u8(a);
  293. }
  294. inline SIMD_U16x8 LOAD_VECTOR_U16_SIMD(uint16_t a, uint16_t b, uint16_t c, uint16_t d, uint16_t e, uint16_t f, uint16_t g, uint16_t h) {
  295. uint16_t data[8] ALIGN16 = {a, b, c, d, e, f, g, h};
  296. return vld1q_u16(data);
  297. }
  298. inline SIMD_U16x8 LOAD_SCALAR_U16_SIMD(uint16_t a) {
  299. return vdupq_n_u16(a);
  300. }
  301. inline SIMD_U32x4 LOAD_VECTOR_U32_SIMD(uint32_t a, uint32_t b, uint32_t c, uint32_t d) {
  302. uint32_t data[4] ALIGN16 = {a, b, c, d};
  303. return vld1q_u32(data);
  304. }
  305. inline SIMD_U32x4 LOAD_SCALAR_U32_SIMD(uint32_t a) {
  306. return vdupq_n_u32(a);
  307. }
  308. inline SIMD_I32x4 LOAD_VECTOR_I32_SIMD(int32_t a, int32_t b, int32_t c, int32_t d) {
  309. int32_t data[4] ALIGN16 = {a, b, c, d};
  310. return vld1q_s32(data);
  311. }
  312. inline SIMD_I32x4 LOAD_SCALAR_I32_SIMD(int32_t a) {
  313. return vdupq_n_s32(a);
  314. }
  315. // Conversions
  316. #define F32_TO_I32_SIMD(A) vcvtq_s32_f32(A)
  317. #define F32_TO_U32_SIMD(A) vcvtq_u32_f32(A)
  318. #define I32_TO_F32_SIMD(A) vcvtq_f32_s32(A)
  319. #define U32_TO_F32_SIMD(A) vcvtq_f32_u32(A)
  320. // Unpacking conversions
  321. #define U8_LOW_TO_U16_SIMD(A) vmovl_u8(vget_low_u8(A))
  322. #define U8_HIGH_TO_U16_SIMD(A) vmovl_u8(vget_high_u8(A))
  323. #define U16_LOW_TO_U32_SIMD(A) vmovl_u16(vget_low_u16(A))
  324. #define U16_HIGH_TO_U32_SIMD(A) vmovl_u16(vget_high_u16(A))
  325. // Saturated packing
  326. #define PACK_SAT_U16_TO_U8_SIMD(A, B) vcombine_u8(vqmovn_u16(A), vqmovn_u16(B))
  327. // Reinterpret casting
  328. #define REINTERPRET_U32_TO_U8_SIMD(A) vreinterpretq_u8_u32(A)
  329. #define REINTERPRET_U32_TO_U16_SIMD(A) vreinterpretq_u16_u32(A)
  330. #define REINTERPRET_U8_TO_U32_SIMD(A) vreinterpretq_u32_u8(A)
  331. #define REINTERPRET_U16_TO_U32_SIMD(A) vreinterpretq_u32_u16(A)
  332. #define REINTERPRET_U32_TO_I32_SIMD(A) vreinterpretq_s32_u32(A)
  333. #define REINTERPRET_I32_TO_U32_SIMD(A) vreinterpretq_u32_s32(A)
  334. // Vector float operations returning SIMD_F32x4
  335. #define ADD_F32_SIMD(A, B) vaddq_f32(A, B)
  336. #define SUB_F32_SIMD(A, B) vsubq_f32(A, B)
  337. #define MUL_F32_SIMD(A, B) vmulq_f32(A, B)
  338. // Vector integer operations returning SIMD_I32x4
  339. #define ADD_I32_SIMD(A, B) vaddq_s32(A, B)
  340. #define SUB_I32_SIMD(A, B) vsubq_s32(A, B)
  341. #define MUL_I32_NEON(A, B) vmulq_s32(A, B)
  342. // Vector integer operations returning SIMD_U32x4
  343. #define ADD_U32_SIMD(A, B) vaddq_u32(A, B)
  344. #define SUB_U32_SIMD(A, B) vsubq_u32(A, B)
  345. #define MUL_U32_NEON(A, B) vmulq_u32(A, B)
  346. // Vector integer operations returning SIMD_U16x8
  347. #define ADD_U16_SIMD(A, B) vaddq_u16(A, B)
  348. #define SUB_U16_SIMD(A, B) vsubq_u16(A, B)
  349. #define MUL_U16_SIMD(A, B) vmulq_u16(A, B)
  350. // Vector integer operations returning SIMD_U8x16
  351. #define ADD_U8_SIMD(A, B) vaddq_u8(A, B)
  352. #define ADD_SAT_U8_SIMD(A, B) vqaddq_u8(A, B) // Saturated addition
  353. #define SUB_U8_SIMD(A, B) vsubq_u8(A, B)
  354. #define SUB_SAT_U8_SIMD(A, B) vqsubq_u8(A, B) // Saturated subtraction
  355. // No 8-bit multiplications
  356. // Statistics
  357. #define MIN_F32_SIMD(A, B) vminq_f32(A, B)
  358. #define MAX_F32_SIMD(A, B) vmaxq_f32(A, B)
  359. // Bitwise
  360. #define BITWISE_AND_U32_SIMD(A, B) vandq_u32(A, B)
  361. #define BITWISE_OR_U32_SIMD(A, B) vorrq_u32(A, B)
  362. #define BITWISE_XOR_U32_SIMD(A, B) veorq_u32(A, B)
  363. #endif
  364. /*
  365. The vector types below are supposed to be portable across different CPU architectures.
  366. When mixed with handwritten SIMD intrinsics:
  367. Use "USE_SSE2" instead of "__SSE2__"
  368. Use "USE_AVX2" instead of "__AVX2__"
  369. Use "USE_NEON" instead of "__ARM_NEON"
  370. // That way, any new variations of the macro names given by the compiler can be added to simd.h instead of being duplicated everywhere.
  371. Portability exceptions:
  372. * The "v" variable is the native backend, which is only defined when SIMD is supported by hardware.
  373. Only use when USE_BASIC_SIMD is defined.
  374. // Will not work on scalar emulation. (a commented sketch of guarded intrinsic use follows this comment block)
  375. // * The "scalars" array is available when emulating a type that does not exist in hardware, or when the SIMD vector allows direct memory access.
  376. // Do not rely on it for accessing elements, because then your code will not compile for ARM NEON.
  377. */
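// A hedged sketch of mixing the portable types with a handwritten intrinsic, following the guard advice
// above (editor's illustration; swapAdjacentPairs is a made-up name and not part of this library):
//   #if defined(USE_SSE2)
//     inline F32x4 swapAdjacentPairs(const F32x4 &x) {
//       // x.v may be used here, because hardware SIMD implies USE_BASIC_SIMD.
//       return F32x4(_mm_shuffle_ps(x.v, x.v, _MM_SHUFFLE(2, 3, 0, 1))); // {x1, x0, x3, x2}
//     }
//   #endif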
  378. union F32x4 {
  379. private:
  380. // The uninitialized default constructor is private for safety reasons.
  381. F32x4() {}
  382. public:
  383. // When the uninitialized constructor is needed for performance, use this named constructor instead.
  384. static inline F32x4 create_dangerous_uninitialized() { return F32x4(); }
  385. #ifdef USE_BASIC_SIMD
  386. public:
  387. #ifdef USE_DIRECT_SIMD_MEMORY_ACCESS
  388. // Only use if USE_DIRECT_SIMD_MEMORY_ACCESS is defined!
  389. // Direct access cannot be done on NEON!
  390. float scalars[4];
  391. #endif
  392. // The SIMD vector of undefined type
  393. // Not accessible while emulating!
  394. SIMD_F32x4 v;
  395. // Construct a portable vector from a native SIMD vector
  396. explicit F32x4(const SIMD_F32x4& v) : v(v) {}
  397. // Construct a portable vector from a set of scalars
  398. F32x4(float a1, float a2, float a3, float a4) : v(LOAD_VECTOR_F32_SIMD(a1, a2, a3, a4)) {}
  399. // Construct a portable vector from a single duplicated scalar
  400. explicit F32x4(float scalar) : v(LOAD_SCALAR_F32_SIMD(scalar)) {}
  401. #else
  402. public:
  403. // Emulate a SIMD vector as an array of scalars without hardware support.
  404. // Only accessible while emulating!
  405. float scalars[4];
  406. // Construct a portable vector from a set of scalars
  407. F32x4(float a1, float a2, float a3, float a4) {
  408. this->scalars[0] = a1;
  409. this->scalars[1] = a2;
  410. this->scalars[2] = a3;
  411. this->scalars[3] = a4;
  412. }
  413. // Construct a portable vector from a single duplicated scalar
  414. explicit F32x4(float scalar) {
  415. this->scalars[0] = scalar;
  416. this->scalars[1] = scalar;
  417. this->scalars[2] = scalar;
  418. this->scalars[3] = scalar;
  419. }
  420. #endif
  421. // Create a gradient vector using start and increment, so that arbitrary length vectors have a way to initialize linear iterations.
  422. static inline F32x4 createGradient(float start, float increment) {
  423. return F32x4(start, start + increment, start + increment * 2.0f, start + increment * 3.0f);
  424. }
  425. // Construct a portable SIMD vector from a pointer to aligned data
  426. // data must be aligned to at least 8 bytes, but preferably 16 bytes
  427. static inline F32x4 readAlignedUnsafe(const float* data) {
  428. #ifdef USE_BASIC_SIMD
  429. #if defined USE_SSE2
  430. return F32x4(_mm_load_ps(data));
  431. #elif defined USE_NEON
  432. return F32x4(vld1q_f32(data));
  433. #endif
  434. #else
  435. return F32x4(data[0], data[1], data[2], data[3]);
  436. #endif
  437. }
  438. // Write to aligned memory from the existing vector
  439. // data must be aligned to at least 8 bytes, but preferably 16 bytes
  440. inline void writeAlignedUnsafe(float* data) const {
  441. #if defined USE_BASIC_SIMD
  442. #if defined USE_SSE2
  443. _mm_store_ps(data, this->v);
  444. #elif defined USE_NEON
  445. vst1q_f32(data, this->v);
  446. #endif
  447. #else
  448. data[0] = this->scalars[0];
  449. data[1] = this->scalars[1];
  450. data[2] = this->scalars[2];
  451. data[3] = this->scalars[3];
  452. #endif
  453. }
  454. #if defined DFPSR_GEOMETRY_FVECTOR
  455. dsr::FVector4D get() const {
  456. float data[4] ALIGN16;
  457. this->writeAlignedUnsafe(data);
  458. return dsr::FVector4D(data[0], data[1], data[2], data[3]);
  459. }
  460. #endif
  461. // Bound and alignment checked reading
  462. static inline F32x4 readAligned(dsr::SafePointer<const float> data, const char* methodName) {
  463. const float* pointer = data.getUnsafe();
  464. assert(((uintptr_t)pointer & 15) == 0);
  465. #if defined SAFE_POINTER_CHECKS
  466. data.assertInside(methodName, pointer, 16);
  467. #endif
  468. return F32x4::readAlignedUnsafe(pointer);
  469. }
  470. // Bound and alignment checked writing
  471. inline void writeAligned(dsr::SafePointer<float> data, const char* methodName) const {
  472. float* pointer = data.getUnsafe();
  473. assert(((uintptr_t)pointer & 15) == 0);
  474. #if defined SAFE_POINTER_CHECKS
  475. data.assertInside(methodName, pointer, 16);
  476. #endif
  477. this->writeAlignedUnsafe(pointer);
  478. }
  479. };
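// A short usage sketch for the checked accessors above (editor's illustration; src and dst are assumed to be
// 16-byte aligned dsr::SafePointer values provided by the caller, and "blurRow" is a made-up method name):
//   F32x4 value = F32x4::readAligned(src, "blurRow");
//   value.writeAligned(dst, "blurRow");
// The method name is only used for error reporting when SAFE_POINTER_CHECKS is enabled.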
  480. // 1 / value
  481. inline F32x4 reciprocal(const F32x4 &value) {
  482. #if defined USE_BASIC_SIMD
  483. #if defined USE_SSE2
  484. // Approximate
  485. SIMD_F32x4 lowQ = _mm_rcp_ps(value.v);
  486. // Refine with one Newton-Raphson step: x1 = 2 * x0 - v * x0 * x0, where x0 approximates 1 / v
  487. return F32x4(SUB_F32_SIMD(ADD_F32_SIMD(lowQ, lowQ), MUL_F32_SIMD(value.v, MUL_F32_SIMD(lowQ, lowQ))));
  488. #elif defined USE_NEON
  489. // Approximate
  490. SIMD_F32x4 result = vrecpeq_f32(value.v);
  491. // Refine
  492. result = MUL_F32_SIMD(vrecpsq_f32(value.v, result), result);
  493. return F32x4(MUL_F32_SIMD(vrecpsq_f32(value.v, result), result));
  494. #else
  495. assert(false);
  496. return F32x4(0);
  497. #endif
  498. #else
  499. return F32x4(1.0f / value.scalars[0], 1.0f / value.scalars[1], 1.0f / value.scalars[2], 1.0f / value.scalars[3]);
  500. #endif
  501. }
  502. // 1 / sqrt(value)
  503. inline F32x4 reciprocalSquareRoot(const F32x4 &value) {
  504. #if defined USE_BASIC_SIMD
  505. #if defined USE_SSE2
  506. SIMD_F32x4 reciRoot = _mm_rsqrt_ps(value.v); // Approximate x0 close to 1 / sqrt(v)
  507. SIMD_F32x4 mul = MUL_F32_SIMD(MUL_F32_SIMD(value.v, reciRoot), reciRoot); // v * x0 * x0
  508. reciRoot = MUL_F32_SIMD(MUL_F32_SIMD(LOAD_SCALAR_F32_SIMD(0.5f), reciRoot), SUB_F32_SIMD(LOAD_SCALAR_F32_SIMD(3.0f), mul)); // One Newton-Raphson step: x1 = 0.5 * x0 * (3 - v * x0 * x0)
  509. return F32x4(reciRoot);
  510. #elif defined USE_NEON
  511. // Approximate
  512. SIMD_F32x4 reciRoot = vrsqrteq_f32(value.v);
  513. // Refine
  514. reciRoot = MUL_F32_SIMD(vrsqrtsq_f32(MUL_F32_SIMD(value.v, reciRoot), reciRoot), reciRoot);
  515. return F32x4(reciRoot);
  516. #else
  517. assert(false);
  518. return F32x4(0);
  519. #endif
  520. #else
  521. return F32x4(1.0f / sqrt(value.scalars[0]), 1.0f / sqrt(value.scalars[1]), 1.0f / sqrt(value.scalars[2]), 1.0f / sqrt(value.scalars[3]));
  522. #endif
  523. }
  524. // sqrt(value)
  525. inline F32x4 squareRoot(const F32x4 &value) {
  526. #if defined USE_BASIC_SIMD
  527. #if defined USE_SSE2
  528. SIMD_F32x4 half = LOAD_SCALAR_F32_SIMD(0.5f);
  529. // Approximate
  530. SIMD_F32x4 root = _mm_sqrt_ps(value.v);
  531. // Refine
  532. root = _mm_mul_ps(_mm_add_ps(root, _mm_div_ps(value.v, root)), half); // One Heron step: root = (root + value / root) * 0.5
  533. return F32x4(root);
  534. #elif defined USE_NEON
  535. return F32x4(MUL_F32_SIMD(value.v, reciprocalSquareRoot(value).v));
  536. #else
  537. assert(false);
  538. return F32x4(0);
  539. #endif
  540. #else
  541. return F32x4(sqrt(value.scalars[0]), sqrt(value.scalars[1]), sqrt(value.scalars[2]), sqrt(value.scalars[3]));
  542. #endif
  543. }
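// A small sketch using the float helpers above (editor's illustration):
//   F32x4 lengthSquared(25.0f);                                // 25 in all four lanes
//   F32x4 length = squareRoot(lengthSquared);                  // Approximately 5
//   F32x4 inverseLength = reciprocalSquareRoot(lengthSquared); // Approximately 0.2
// The results are refined from fast hardware approximations, so expect small rounding differences compared
// to calling sqrt on each scalar.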
  544. union I32x4 {
  545. private:
  546. // The uninitialized default constructor is private for safety reasons.
  547. I32x4() {}
  548. public:
  549. // When the uninitialized constructor is needed for performance, use this named constructor instead.
  550. static inline I32x4 create_dangerous_uninitialized() { return I32x4(); }
  551. #if defined USE_BASIC_SIMD
  552. public:
  553. #if defined USE_DIRECT_SIMD_MEMORY_ACCESS
  554. // Only use if USE_DIRECT_SIMD_MEMORY_ACCESS is defined!
  555. // Direct access cannot be done on NEON!
  556. int32_t scalars[4];
  557. #endif
  558. // The SIMD vector of undefined type
  559. // Not accessible while emulating!
  560. SIMD_I32x4 v;
  561. // Construct a portable vector from a native SIMD vector
  562. explicit I32x4(const SIMD_I32x4& v) : v(v) {}
  563. // Construct a portable vector from a set of scalars
  564. I32x4(int32_t a1, int32_t a2, int32_t a3, int32_t a4) : v(LOAD_VECTOR_I32_SIMD(a1, a2, a3, a4)) {}
  565. // Construct a portable vector from a single duplicated scalar
  566. explicit I32x4(int32_t scalar) : v(LOAD_SCALAR_I32_SIMD(scalar)) {}
  567. #else
  568. public:
  569. // Emulate a SIMD vector as an array of scalars without hardware support.
  570. // Only accessible while emulating!
  571. int32_t scalars[4];
  572. // Construct a portable vector from a set of scalars
  573. I32x4(int32_t a1, int32_t a2, int32_t a3, int32_t a4) {
  574. this->scalars[0] = a1;
  575. this->scalars[1] = a2;
  576. this->scalars[2] = a3;
  577. this->scalars[3] = a4;
  578. }
  579. // Construct a portable vector from a single duplicated scalar
  580. explicit I32x4(int32_t scalar) {
  581. this->scalars[0] = scalar;
  582. this->scalars[1] = scalar;
  583. this->scalars[2] = scalar;
  584. this->scalars[3] = scalar;
  585. }
  586. #endif
  587. // Create a gradient vector using start and increment, so that arbitrary length vectors have a way to initialize linear iterations.
  588. static inline I32x4 createGradient(int32_t start, int32_t increment) {
  589. return I32x4(start, start + increment, start + increment * 2, start + increment * 3);
  590. }
  591. // Construct a portable SIMD vector from a pointer to aligned data
  592. // data must be aligned to at least 8 bytes, but preferably 16 bytes
  593. static inline I32x4 readAlignedUnsafe(const int32_t* data) {
  594. #if defined USE_BASIC_SIMD
  595. #if defined USE_SSE2
  596. return I32x4(_mm_load_si128((const __m128i*)data));
  597. #elif defined USE_NEON
  598. return I32x4(vld1q_s32(data));
  599. #endif
  600. #else
  601. return I32x4(data[0], data[1], data[2], data[3]);
  602. #endif
  603. }
  604. // Write to aligned memory from the existing vector
  605. // data must be aligned to at least 8 bytes, but preferably 16 bytes
  606. inline void writeAlignedUnsafe(int32_t* data) const {
  607. #if defined USE_BASIC_SIMD
  608. #if defined USE_SSE2
  609. _mm_store_si128((__m128i*)data, this->v);
  610. #elif defined USE_NEON
  611. vst1q_s32(data, this->v);
  612. #endif
  613. #else
  614. data[0] = this->scalars[0];
  615. data[1] = this->scalars[1];
  616. data[2] = this->scalars[2];
  617. data[3] = this->scalars[3];
  618. #endif
  619. }
  620. #if defined DFPSR_GEOMETRY_IVECTOR
  621. dsr::IVector4D get() const {
  622. int32_t data[4] ALIGN16;
  623. this->writeAlignedUnsafe(data);
  624. return dsr::IVector4D(data[0], data[1], data[2], data[3]);
  625. }
  626. #endif
  627. // Bound and alignment checked reading
  628. static inline I32x4 readAligned(dsr::SafePointer<const int32_t> data, const char* methodName) {
  629. const int32_t* pointer = data.getUnsafe();
  630. assert(((uintptr_t)pointer & 15) == 0);
  631. #if defined SAFE_POINTER_CHECKS
  632. data.assertInside(methodName, pointer, 16);
  633. #endif
  634. return I32x4::readAlignedUnsafe(pointer);
  635. }
  636. // Bound and alignment checked writing
  637. inline void writeAligned(dsr::SafePointer<int32_t> data, const char* methodName) const {
  638. int32_t* pointer = data.getUnsafe();
  639. assert(((uintptr_t)pointer & 15) == 0);
  640. #if defined SAFE_POINTER_CHECKS
  641. data.assertInside(methodName, pointer, 16);
  642. #endif
  643. this->writeAlignedUnsafe(pointer);
  644. }
  645. };
  646. union U32x4 {
  647. #if defined USE_BASIC_SIMD
  648. public:
  649. #if defined USE_DIRECT_SIMD_MEMORY_ACCESS
  650. // Only use if USE_DIRECT_SIMD_MEMORY_ACCESS is defined!
  651. // Direct access cannot be done on NEON!
  652. uint32_t scalars[4];
  653. #endif
  654. // The SIMD vector of undefined type
  655. // Not accessible while emulating!
  656. SIMD_U32x4 v;
  657. // Construct a portable vector from a native SIMD vector
  658. explicit U32x4(const SIMD_U32x4& v) : v(v) {}
  659. // Construct a portable vector from a set of scalars
  660. U32x4(uint32_t a1, uint32_t a2, uint32_t a3, uint32_t a4) : v(LOAD_VECTOR_U32_SIMD(a1, a2, a3, a4)) {}
  661. // Construct a portable vector from a single duplicated scalar
  662. explicit U32x4(uint32_t scalar) : v(LOAD_SCALAR_U32_SIMD(scalar)) {}
  663. #else
  664. public:
  665. // Emulate a SIMD vector as an array of scalars without hardware support.
  666. // Only accessible while emulating!
  667. uint32_t scalars[4];
  668. // Construct a portable vector from a set of scalars
  669. U32x4(uint32_t a1, uint32_t a2, uint32_t a3, uint32_t a4) {
  670. this->scalars[0] = a1;
  671. this->scalars[1] = a2;
  672. this->scalars[2] = a3;
  673. this->scalars[3] = a4;
  674. }
  675. // Construct a portable vector from a single duplicated scalar
  676. explicit U32x4(uint32_t scalar) {
  677. this->scalars[0] = scalar;
  678. this->scalars[1] = scalar;
  679. this->scalars[2] = scalar;
  680. this->scalars[3] = scalar;
  681. }
  682. #endif
  683. // Create a gradient vector using start and increment, so that arbitrary length vectors have a way to initialize linear iterations.
  684. static inline U32x4 createGradient(uint32_t start, uint32_t increment) {
  685. return U32x4(start, start + increment, start + increment * 2, start + increment * 3);
  686. }
  687. // Construct a portable SIMD vector from a pointer to aligned data
  688. // data must be aligned to at least 8 bytes, but preferably 16 bytes
  689. static inline U32x4 readAlignedUnsafe(const uint32_t* data) {
  690. #if defined USE_BASIC_SIMD
  691. #if defined USE_SSE2
  692. return U32x4(_mm_load_si128((const __m128i*)data));
  693. #elif defined USE_NEON
  694. return U32x4(vld1q_u32(data));
  695. #endif
  696. #else
  697. return U32x4(data[0], data[1], data[2], data[3]);
  698. #endif
  699. }
  700. // Write to aligned memory from the existing vector
  701. // data must be aligned to at least 8 bytes, but preferably 16 bytes
  702. inline void writeAlignedUnsafe(uint32_t* data) const {
  703. #if defined USE_BASIC_SIMD
  704. #if defined USE_SSE2
  705. _mm_store_si128((__m128i*)data, this->v);
  706. #elif defined USE_NEON
  707. vst1q_u32(data, this->v);
  708. #endif
  709. #else
  710. data[0] = this->scalars[0];
  711. data[1] = this->scalars[1];
  712. data[2] = this->scalars[2];
  713. data[3] = this->scalars[3];
  714. #endif
  715. }
  716. #if defined DFPSR_GEOMETRY_UVECTOR
  717. dsr::UVector4D get() const {
  718. uint32_t data[4] ALIGN16;
  719. this->writeAlignedUnsafe(data);
  720. return dsr::UVector4D(data[0], data[1], data[2], data[3]);
  721. }
  722. #endif
  723. // Bound and alignment checked reading
  724. static inline U32x4 readAligned(dsr::SafePointer<const uint32_t> data, const char* methodName) {
  725. const uint32_t* pointer = data.getUnsafe();
  726. assert(((uintptr_t)pointer & 15) == 0);
  727. #if defined SAFE_POINTER_CHECKS
  728. data.assertInside(methodName, pointer, 16);
  729. #endif
  730. return U32x4::readAlignedUnsafe(pointer);
  731. }
  732. // Bound and alignment checked writing
  733. inline void writeAligned(dsr::SafePointer<uint32_t> data, const char* methodName) const {
  734. uint32_t* pointer = data.getUnsafe();
  735. assert(((uintptr_t)pointer & 15) == 0);
  736. #if defined SAFE_POINTER_CHECKS
  737. data.assertInside(methodName, pointer, 16);
  738. #endif
  739. this->writeAlignedUnsafe(pointer);
  740. }
  741. };
  742. union U16x8 {
  743. private:
  744. // The uninitialized default constructor is private for safety reasons.
  745. U16x8() {}
  746. public:
  747. // When the uninitialized constructor is needed for performance, use this named constructor instead.
  748. static inline U16x8 create_dangerous_uninitialized() { return U16x8(); }
  749. #if defined USE_BASIC_SIMD
  750. public:
  751. #if defined USE_DIRECT_SIMD_MEMORY_ACCESS
  752. // Only use if USE_DIRECT_SIMD_MEMORY_ACCESS is defined!
  753. // Direct access cannot be done on NEON!
  754. uint16_t scalars[8];
  755. #endif
  756. // The SIMD vector of undefined type
  757. // Not accessible while emulating!
  758. SIMD_U16x8 v;
  759. // Construct a portable vector from a native SIMD vector
  760. explicit U16x8(const SIMD_U16x8& v) : v(v) {}
  761. // Construct a vector of 8 x 16-bit unsigned integers from a vector of 4 x 32-bit unsigned integers
  762. // Reinterpret casting is used
  763. explicit U16x8(const U32x4& vector) : v(REINTERPRET_U32_TO_U16_SIMD(vector.v)) {}
  764. // Construct a portable vector from a set of scalars
  765. U16x8(uint16_t a1, uint16_t a2, uint16_t a3, uint16_t a4, uint16_t a5, uint16_t a6, uint16_t a7, uint16_t a8) : v(LOAD_VECTOR_U16_SIMD(a1, a2, a3, a4, a5, a6, a7, a8)) {}
  766. // Construct a vector of 8 x 16-bit unsigned integers from a single duplicated 32-bit unsigned integer
  767. // Reinterpret casting is used
  768. // TODO: Remove all reinterprets from constructors to improve readability
  769. explicit U16x8(uint32_t scalar) : v(REINTERPRET_U32_TO_U16_SIMD(LOAD_SCALAR_U32_SIMD(scalar))) {}
  770. // Construct a portable vector from a single duplicated scalar
  771. explicit U16x8(uint16_t scalar) : v(LOAD_SCALAR_U16_SIMD(scalar)) {}
  772. // Reinterpret cast to a vector of 4 x 32-bit unsigned integers
  773. U32x4 get_U32() const {
  774. return U32x4(REINTERPRET_U16_TO_U32_SIMD(this->v));
  775. }
  776. #else
  777. public:
  778. // Emulate a SIMD vector as an array of scalars without hardware support.
  779. // Only accessible while emulating!
  780. uint16_t scalars[8];
  781. // Construct a vector of 8 x 16-bit unsigned integers from a vector of 4 x 32-bit unsigned integers
  782. // Reinterpret casting is used
  783. explicit U16x8(const U32x4& vector) {
  784. uint64_t *target = (uint64_t*)this->scalars;
  785. uint64_t *source = (uint64_t*)vector.scalars;
  786. target[0] = source[0];
  787. target[1] = source[1];
  788. }
  789. // Construct a portable vector from a set of scalars
  790. U16x8(uint16_t a1, uint16_t a2, uint16_t a3, uint16_t a4, uint16_t a5, uint16_t a6, uint16_t a7, uint16_t a8) {
  791. this->scalars[0] = a1;
  792. this->scalars[1] = a2;
  793. this->scalars[2] = a3;
  794. this->scalars[3] = a4;
  795. this->scalars[4] = a5;
  796. this->scalars[5] = a6;
  797. this->scalars[6] = a7;
  798. this->scalars[7] = a8;
  799. }
  800. // Construct a vector of 8 x 16-bit unsigned integers from a single duplicated 32-bit unsigned integer
  801. // Reinterpret casting is used
  802. explicit U16x8(uint32_t scalar) {
  803. uint32_t *target = (uint32_t*)this->scalars;
  804. target[0] = scalar;
  805. target[1] = scalar;
  806. target[2] = scalar;
  807. target[3] = scalar;
  808. }
  809. // Construct a portable vector from a single duplicated scalar
  810. explicit U16x8(uint16_t scalar) {
  811. this->scalars[0] = scalar;
  812. this->scalars[1] = scalar;
  813. this->scalars[2] = scalar;
  814. this->scalars[3] = scalar;
  815. this->scalars[4] = scalar;
  816. this->scalars[5] = scalar;
  817. this->scalars[6] = scalar;
  818. this->scalars[7] = scalar;
  819. }
  820. // Reinterpret cast to a vector of 4 x 32-bit unsigned integers
  821. U32x4 get_U32() const {
  822. U32x4 result(0);
  823. uint64_t *target = (uint64_t*)result.scalars;
  824. uint64_t *source = (uint64_t*)this->scalars;
  825. target[0] = source[0];
  826. target[1] = source[1];
  827. return result;
  828. }
  829. #endif
  830. // Create a gradient vector using start and increment, so that arbitrary length vectors have a way to initialize linear iterations.
  831. static inline U16x8 createGradient(uint16_t start, uint16_t increment) {
  832. return U16x8(
  833. start,
  834. start + increment,
  835. start + increment * 2,
  836. start + increment * 3,
  837. start + increment * 4,
  838. start + increment * 5,
  839. start + increment * 6,
  840. start + increment * 7
  841. );
  842. }
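// Usage sketch (illustrative only, assuming the element-wise U16x8 operator+ defined later in this header):
// a gradient seeds per-lane indices for a loop that processes 8 elements at a time.
//   U16x8 laneIndex = U16x8::createGradient(0, 1); // Lanes hold 0, 1, 2, ..., 7.
//   // ... use laneIndex in the loop body, then advance all lanes to the next block of 8:
//   laneIndex = laneIndex + U16x8(uint16_t(8));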
  843. static inline U16x8 readAlignedUnsafe(const uint16_t* data) {
  844. #if defined USE_BASIC_SIMD
  845. #if defined USE_SSE2
  846. return U16x8(_mm_load_si128((const __m128i*)data));
  847. #elif defined USE_NEON
  848. return U16x8(vld1q_u16(data));
  849. #endif
  850. #else
  851. return U16x8(data[0], data[1], data[2], data[3], data[4], data[5], data[6], data[7]);
  852. #endif
  853. }
854. // data must be aligned to 16 bytes
  855. inline void writeAlignedUnsafe(uint16_t* data) const {
  856. #if defined USE_BASIC_SIMD
  857. #if defined USE_SSE2
  858. _mm_store_si128((__m128i*)data, this->v);
  859. #elif defined USE_NEON
  860. vst1q_u16(data, this->v);
  861. #endif
  862. #else
  863. data[0] = this->scalars[0];
  864. data[1] = this->scalars[1];
  865. data[2] = this->scalars[2];
  866. data[3] = this->scalars[3];
  867. data[4] = this->scalars[4];
  868. data[5] = this->scalars[5];
  869. data[6] = this->scalars[6];
  870. data[7] = this->scalars[7];
  871. #endif
  872. }
  873. // Bound and alignment checked reading
  874. static inline U16x8 readAligned(dsr::SafePointer<const uint16_t> data, const char* methodName) {
  875. const uint16_t* pointer = data.getUnsafe();
  876. assert(((uintptr_t)pointer & 15) == 0);
  877. #if defined SAFE_POINTER_CHECKS
  878. data.assertInside(methodName, pointer, 16);
  879. #endif
  880. return U16x8::readAlignedUnsafe(pointer);
  881. }
  882. // Bound and alignment checked writing
  883. inline void writeAligned(dsr::SafePointer<uint16_t> data, const char* methodName) const {
  884. uint16_t* pointer = data.getUnsafe();
  885. assert(((uintptr_t)pointer & 15) == 0);
  886. #if defined SAFE_POINTER_CHECKS
  887. data.assertInside(methodName, pointer, 16);
  888. #endif
  889. this->writeAlignedUnsafe(pointer);
  890. }
  891. };
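// Usage sketch for the checked aligned I/O above (illustrative only; "source" and "target" are hypothetical
// dsr::SafePointer<const uint16_t> and dsr::SafePointer<uint16_t> into 16-byte aligned buffers):
//   U16x8 row = U16x8::readAligned(source, "exampleFilter"); // Asserts alignment and bounds before loading.
//   row.writeAligned(target, "exampleFilter"); // Writes all 8 lanes back with the same checks.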
  892. union U8x16 {
  893. private:
  894. // The uninitialized default constructor is private for safety reasons.
  895. U8x16() {}
  896. public:
  897. // When the uninitialized constructor is needed for performance, use this named constructor instead.
  898. static inline U8x16 create_dangerous_uninitialized() { return U8x16(); }
  899. #if defined USE_BASIC_SIMD
  900. public:
  901. #if defined USE_DIRECT_SIMD_MEMORY_ACCESS
  902. // Only use if USE_DIRECT_SIMD_MEMORY_ACCESS is defined!
  903. // Direct access cannot be done on NEON!
  904. uint8_t scalars[16];
  905. #endif
  906. // The SIMD vector of undefined type
  907. // Not accessible while emulating!
  908. SIMD_U8x16 v;
  909. // Construct a portable vector from a native SIMD vector
  910. explicit U8x16(const SIMD_U8x16& v) : v(v) {}
  911. // Construct a portable vector from a set of scalars
  912. U8x16(uint8_t a1, uint8_t a2, uint8_t a3, uint8_t a4, uint8_t a5, uint8_t a6, uint8_t a7, uint8_t a8,
  913. uint8_t a9, uint8_t a10, uint8_t a11, uint8_t a12, uint8_t a13, uint8_t a14, uint8_t a15, uint8_t a16)
  914. : v(LOAD_VECTOR_U8_SIMD(a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11, a12, a13, a14, a15, a16)) {}
  915. // Construct a portable vector from a single duplicated scalar
  916. explicit U8x16(uint8_t scalar) : v(LOAD_SCALAR_U8_SIMD(scalar)) {}
  917. #else
  918. public:
  919. // Emulate a SIMD vector as an array of scalars without hardware support.
  920. // Only accessible while emulating!
  921. uint8_t scalars[16];
  922. // Construct a portable vector from a set of scalars
  923. U8x16(uint8_t a1, uint8_t a2, uint8_t a3, uint8_t a4, uint8_t a5, uint8_t a6, uint8_t a7, uint8_t a8,
  924. uint8_t a9, uint8_t a10, uint8_t a11, uint8_t a12, uint8_t a13, uint8_t a14, uint8_t a15, uint8_t a16) {
  925. this->scalars[0] = a1;
  926. this->scalars[1] = a2;
  927. this->scalars[2] = a3;
  928. this->scalars[3] = a4;
  929. this->scalars[4] = a5;
  930. this->scalars[5] = a6;
  931. this->scalars[6] = a7;
  932. this->scalars[7] = a8;
  933. this->scalars[8] = a9;
  934. this->scalars[9] = a10;
  935. this->scalars[10] = a11;
  936. this->scalars[11] = a12;
  937. this->scalars[12] = a13;
  938. this->scalars[13] = a14;
  939. this->scalars[14] = a15;
  940. this->scalars[15] = a16;
  941. }
  942. // Construct a portable vector from a single duplicated scalar
  943. explicit U8x16(uint8_t scalar) {
  944. this->scalars[0] = scalar;
  945. this->scalars[1] = scalar;
  946. this->scalars[2] = scalar;
  947. this->scalars[3] = scalar;
  948. this->scalars[4] = scalar;
  949. this->scalars[5] = scalar;
  950. this->scalars[6] = scalar;
  951. this->scalars[7] = scalar;
  952. this->scalars[8] = scalar;
  953. this->scalars[9] = scalar;
  954. this->scalars[10] = scalar;
  955. this->scalars[11] = scalar;
  956. this->scalars[12] = scalar;
  957. this->scalars[13] = scalar;
  958. this->scalars[14] = scalar;
  959. this->scalars[15] = scalar;
  960. }
  961. #endif
  962. // Create a gradient vector using start and increment, so that arbitrary length vectors have a way to initialize linear iterations.
  963. static inline U8x16 createGradient(uint8_t start, uint8_t increment) {
  964. return U8x16(
  965. start,
  966. start + increment,
  967. start + increment * 2,
  968. start + increment * 3,
  969. start + increment * 4,
  970. start + increment * 5,
  971. start + increment * 6,
  972. start + increment * 7,
  973. start + increment * 8,
  974. start + increment * 9,
  975. start + increment * 10,
  976. start + increment * 11,
  977. start + increment * 12,
  978. start + increment * 13,
  979. start + increment * 14,
  980. start + increment * 15
  981. );
  982. }
  983. static inline U8x16 readAlignedUnsafe(const uint8_t* data) {
  984. #if defined USE_BASIC_SIMD
  985. #if defined USE_SSE2
  986. return U8x16(_mm_load_si128((const __m128i*)data));
  987. #elif defined USE_NEON
  988. return U8x16(vld1q_u8(data));
  989. #endif
  990. #else
  991. return U8x16(
  992. data[0], data[1], data[2], data[3], data[4], data[5], data[6], data[7],
  993. data[8], data[9], data[10], data[11], data[12], data[13], data[14], data[15]
  994. );
  995. #endif
  996. }
997. // data must be aligned to 16 bytes
  998. inline void writeAlignedUnsafe(uint8_t* data) const {
  999. #if defined USE_BASIC_SIMD
  1000. #if defined USE_SSE2
  1001. _mm_store_si128((__m128i*)data, this->v);
  1002. #elif defined USE_NEON
  1003. vst1q_u8(data, this->v);
  1004. #endif
  1005. #else
  1006. data[0] = this->scalars[0];
  1007. data[1] = this->scalars[1];
  1008. data[2] = this->scalars[2];
  1009. data[3] = this->scalars[3];
  1010. data[4] = this->scalars[4];
  1011. data[5] = this->scalars[5];
  1012. data[6] = this->scalars[6];
  1013. data[7] = this->scalars[7];
  1014. data[8] = this->scalars[8];
  1015. data[9] = this->scalars[9];
  1016. data[10] = this->scalars[10];
  1017. data[11] = this->scalars[11];
  1018. data[12] = this->scalars[12];
  1019. data[13] = this->scalars[13];
  1020. data[14] = this->scalars[14];
  1021. data[15] = this->scalars[15];
  1022. #endif
  1023. }
  1024. // Bound and alignment checked reading
  1025. static inline U8x16 readAligned(dsr::SafePointer<const uint8_t> data, const char* methodName) {
  1026. const uint8_t* pointer = data.getUnsafe();
  1027. assert(((uintptr_t)pointer & 15) == 0);
  1028. #if defined SAFE_POINTER_CHECKS
  1029. data.assertInside(methodName, pointer, 16);
  1030. #endif
  1031. return U8x16::readAlignedUnsafe(pointer);
  1032. }
  1033. // Bound and alignment checked writing
  1034. inline void writeAligned(dsr::SafePointer<uint8_t> data, const char* methodName) const {
  1035. uint8_t* pointer = data.getUnsafe();
  1036. assert(((uintptr_t)pointer & 15) == 0);
  1037. #if defined SAFE_POINTER_CHECKS
  1038. data.assertInside(methodName, pointer, 16);
  1039. #endif
  1040. this->writeAlignedUnsafe(pointer);
  1041. }
  1042. };
  1043. union F32x8 {
  1044. private:
  1045. // The uninitialized default constructor is private for safety reasons.
  1046. F32x8() {}
  1047. public:
  1048. // When the uninitialized constructor is needed for performance, use this named constructor instead.
  1049. static inline F32x8 create_dangerous_uninitialized() { return F32x8(); }
  1050. #if defined USE_256BIT_F_SIMD
  1051. public:
  1052. #if defined USE_DIRECT_SIMD_MEMORY_ACCESS
  1053. // Only use if USE_DIRECT_SIMD_MEMORY_ACCESS is defined!
  1054. float scalars[8];
  1055. #endif
  1056. // The SIMD vector of undefined type
  1057. // Not accessible while emulating!
  1058. SIMD_F32x8 v;
  1059. // Construct a portable vector from a native SIMD vector
  1060. explicit F32x8(const SIMD_F32x8& v) : v(v) {}
  1061. // Construct a portable vector from a set of scalars
  1062. F32x8(float a1, float a2, float a3, float a4, float a5, float a6, float a7, float a8)
  1063. : v(LOAD_VECTOR_F32_SIMD256(a1, a2, a3, a4, a5, a6, a7, a8)) {}
  1064. // Construct a portable vector from a single duplicated scalar
  1065. explicit F32x8(float scalar) : v(LOAD_SCALAR_F32_SIMD256(scalar)) {}
  1066. #else
  1067. public:
  1068. // Emulate a SIMD vector as an array of scalars without hardware support.
  1069. // Only accessible while emulating!
  1070. float scalars[8];
  1071. // Construct a portable vector from a set of scalars
  1072. F32x8(float a1, float a2, float a3, float a4, float a5, float a6, float a7, float a8) {
  1073. this->scalars[0] = a1;
  1074. this->scalars[1] = a2;
  1075. this->scalars[2] = a3;
  1076. this->scalars[3] = a4;
  1077. this->scalars[4] = a5;
  1078. this->scalars[5] = a6;
  1079. this->scalars[6] = a7;
  1080. this->scalars[7] = a8;
  1081. }
  1082. // Construct a portable vector from a single duplicated scalar
  1083. explicit F32x8(float scalar) {
  1084. this->scalars[0] = scalar;
  1085. this->scalars[1] = scalar;
  1086. this->scalars[2] = scalar;
  1087. this->scalars[3] = scalar;
  1088. this->scalars[4] = scalar;
  1089. this->scalars[5] = scalar;
  1090. this->scalars[6] = scalar;
  1091. this->scalars[7] = scalar;
  1092. }
  1093. #endif
  1094. // Create a gradient vector using start and increment, so that arbitrary length vectors have a way to initialize linear iterations.
  1095. static inline F32x8 createGradient(float start, float increment) {
  1096. return F32x8(
  1097. start,
  1098. start + increment,
  1099. start + increment * 2.0f,
  1100. start + increment * 3.0f,
  1101. start + increment * 4.0f,
  1102. start + increment * 5.0f,
  1103. start + increment * 6.0f,
  1104. start + increment * 7.0f
  1105. );
  1106. }
  1107. // Construct a portable SIMD vector from a pointer to aligned data
1108. // data must be aligned to 32 bytes
  1109. static inline F32x8 readAlignedUnsafe(const float* data) {
  1110. #if defined USE_AVX2
  1111. return F32x8(_mm256_load_ps(data));
  1112. #else
  1113. return F32x8(data[0], data[1], data[2], data[3], data[4], data[5], data[6], data[7]);
  1114. #endif
  1115. }
  1116. // Write to aligned memory from the existing vector
1117. // data must be aligned to 32 bytes
  1118. inline void writeAlignedUnsafe(float* data) const {
  1119. #if defined USE_AVX2
  1120. _mm256_store_ps(data, this->v);
  1121. #else
  1122. data[0] = this->scalars[0];
  1123. data[1] = this->scalars[1];
  1124. data[2] = this->scalars[2];
  1125. data[3] = this->scalars[3];
  1126. data[4] = this->scalars[4];
  1127. data[5] = this->scalars[5];
  1128. data[6] = this->scalars[6];
  1129. data[7] = this->scalars[7];
  1130. #endif
  1131. }
  1132. // Bound and alignment checked reading
  1133. static inline F32x8 readAligned(dsr::SafePointer<const float> data, const char* methodName) {
  1134. const float* pointer = data.getUnsafe();
  1135. assert(((uintptr_t)pointer & 31) == 0);
  1136. #if defined SAFE_POINTER_CHECKS
  1137. data.assertInside(methodName, pointer, 32);
  1138. #endif
  1139. return F32x8::readAlignedUnsafe(pointer);
  1140. }
  1141. // Bound and alignment checked writing
  1142. inline void writeAligned(dsr::SafePointer<float> data, const char* methodName) const {
  1143. float* pointer = data.getUnsafe();
  1144. assert(((uintptr_t)pointer & 31) == 0);
  1145. #if defined SAFE_POINTER_CHECKS
  1146. data.assertInside(methodName, pointer, 32);
  1147. #endif
  1148. this->writeAlignedUnsafe(pointer);
  1149. }
  1150. };
  1151. // 1 / value
  1152. inline F32x8 reciprocal(const F32x8 &value) {
  1153. #if defined USE_AVX2
  1154. // Approximate
  1155. SIMD_F32x8 lowQ = _mm256_rcp_ps(value.v);
  1156. // Refine
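// One Newton-Raphson step: x1 = 2 * x0 - value * x0 * x0, which roughly doubles the precision of the _mm256_rcp_ps estimate.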
  1157. return F32x8(SUB_F32_SIMD256(ADD_F32_SIMD256(lowQ, lowQ), MUL_F32_SIMD256(value.v, MUL_F32_SIMD256(lowQ, lowQ))));
  1158. #else
  1159. return F32x8(
  1160. 1.0f / value.scalars[0],
  1161. 1.0f / value.scalars[1],
  1162. 1.0f / value.scalars[2],
  1163. 1.0f / value.scalars[3],
  1164. 1.0f / value.scalars[4],
  1165. 1.0f / value.scalars[5],
  1166. 1.0f / value.scalars[6],
  1167. 1.0f / value.scalars[7]
  1168. );
  1169. #endif
  1170. }
  1171. // 1 / sqrt(value)
  1172. inline F32x8 reciprocalSquareRoot(const F32x8 &value) {
  1173. #if defined USE_AVX2
  1174. SIMD_F32x8 reciRoot = _mm256_rsqrt_ps(value.v);
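// One Newton-Raphson step for the reciprocal square root: y1 = 0.5 * y0 * (3 - value * y0 * y0), refining the _mm256_rsqrt_ps estimate.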
  1175. SIMD_F32x8 mul = MUL_F32_SIMD256(MUL_F32_SIMD256(value.v, reciRoot), reciRoot);
  1176. reciRoot = MUL_F32_SIMD256(MUL_F32_SIMD256(LOAD_SCALAR_F32_SIMD256(0.5f), reciRoot), SUB_F32_SIMD256(LOAD_SCALAR_F32_SIMD256(3.0f), mul));
  1177. return F32x8(reciRoot);
  1178. #else
  1179. return F32x8(
  1180. 1.0f / sqrt(value.scalars[0]),
  1181. 1.0f / sqrt(value.scalars[1]),
  1182. 1.0f / sqrt(value.scalars[2]),
  1183. 1.0f / sqrt(value.scalars[3]),
  1184. 1.0f / sqrt(value.scalars[4]),
  1185. 1.0f / sqrt(value.scalars[5]),
  1186. 1.0f / sqrt(value.scalars[6]),
  1187. 1.0f / sqrt(value.scalars[7])
  1188. );
  1189. #endif
  1190. }
  1191. // sqrt(value)
  1192. inline F32x8 squareRoot(const F32x8 &value) {
  1193. #if defined USE_AVX2
  1194. SIMD_F32x8 half = LOAD_SCALAR_F32_SIMD256(0.5f);
  1195. // Approximate
  1196. SIMD_F32x8 root = _mm256_sqrt_ps(value.v);
  1197. // Refine
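// One step of Heron's method: root1 = 0.5 * (root0 + value / root0).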
  1198. root = _mm256_mul_ps(_mm256_add_ps(root, _mm256_div_ps(value.v, root)), half);
  1199. return F32x8(root);
  1200. #else
  1201. return F32x8(
  1202. sqrt(value.scalars[0]),
  1203. sqrt(value.scalars[1]),
  1204. sqrt(value.scalars[2]),
  1205. sqrt(value.scalars[3]),
  1206. sqrt(value.scalars[4]),
  1207. sqrt(value.scalars[5]),
  1208. sqrt(value.scalars[6]),
  1209. sqrt(value.scalars[7]));
  1210. #endif
  1211. }
  1212. union I32x8 {
  1213. private:
  1214. // The uninitialized default constructor is private for safety reasons.
  1215. I32x8() {}
  1216. public:
  1217. // When the uninitialized constructor is needed for performance, use this named constructor instead.
  1218. static inline I32x8 create_dangerous_uninitialized() { return I32x8(); }
  1219. #if defined USE_256BIT_X_SIMD
  1220. public:
  1221. #if defined USE_DIRECT_SIMD_MEMORY_ACCESS
  1222. // Only use if USE_DIRECT_SIMD_MEMORY_ACCESS is defined!
  1223. int32_t scalars[8];
  1224. #endif
  1225. // The SIMD vector of undefined type
  1226. // Not accessible while emulating!
  1227. SIMD_I32x8 v;
  1228. // Construct a portable vector from a native SIMD vector
  1229. explicit I32x8(const SIMD_I32x8& v) : v(v) {}
  1230. // Construct a portable vector from a set of scalars
  1231. I32x8(int32_t a1, int32_t a2, int32_t a3, int32_t a4, int32_t a5, int32_t a6, int32_t a7, int32_t a8)
  1232. : v(LOAD_VECTOR_I32_SIMD256(a1, a2, a3, a4, a5, a6, a7, a8)) {}
  1233. // Construct a portable vector from a single duplicated scalar
  1234. explicit I32x8(int32_t scalar) : v(LOAD_SCALAR_I32_SIMD256(scalar)) {}
  1235. #else
  1236. public:
  1237. // Emulate a SIMD vector as an array of scalars without hardware support.
  1238. // Only accessible while emulating!
  1239. int32_t scalars[8];
  1240. // Construct a portable vector from a set of scalars
  1241. I32x8(int32_t a1, int32_t a2, int32_t a3, int32_t a4, int32_t a5, int32_t a6, int32_t a7, int32_t a8) {
  1242. this->scalars[0] = a1;
  1243. this->scalars[1] = a2;
  1244. this->scalars[2] = a3;
  1245. this->scalars[3] = a4;
  1246. this->scalars[4] = a5;
  1247. this->scalars[5] = a6;
  1248. this->scalars[6] = a7;
  1249. this->scalars[7] = a8;
  1250. }
  1251. // Construct a portable vector from a single duplicated scalar
  1252. explicit I32x8(int32_t scalar) {
  1253. this->scalars[0] = scalar;
  1254. this->scalars[1] = scalar;
  1255. this->scalars[2] = scalar;
  1256. this->scalars[3] = scalar;
  1257. this->scalars[4] = scalar;
  1258. this->scalars[5] = scalar;
  1259. this->scalars[6] = scalar;
  1260. this->scalars[7] = scalar;
  1261. }
  1262. #endif
  1263. // Create a gradient vector using start and increment, so that arbitrary length vectors have a way to initialize linear iterations.
  1264. static inline I32x8 createGradient(int32_t start, int32_t increment) {
  1265. return I32x8(
  1266. start,
  1267. start + increment,
  1268. start + increment * 2,
  1269. start + increment * 3,
  1270. start + increment * 4,
  1271. start + increment * 5,
  1272. start + increment * 6,
  1273. start + increment * 7
  1274. );
  1275. }
  1276. // Construct a portable SIMD vector from a pointer to aligned data
1277. // data must be aligned to 32 bytes
  1278. static inline I32x8 readAlignedUnsafe(const int32_t* data) {
  1279. #if defined USE_AVX2
  1280. return I32x8(_mm256_load_si256((const __m256i*)data));
  1281. #else
  1282. return I32x8(data[0], data[1], data[2], data[3], data[4], data[5], data[6], data[7]);
  1283. #endif
  1284. }
  1285. // Write to aligned memory from the existing vector
1286. // data must be aligned to 32 bytes
  1287. inline void writeAlignedUnsafe(int32_t* data) const {
  1288. #if defined USE_AVX2
  1289. _mm256_store_si256((__m256i*)data, this->v);
  1290. #else
  1291. data[0] = this->scalars[0];
  1292. data[1] = this->scalars[1];
  1293. data[2] = this->scalars[2];
  1294. data[3] = this->scalars[3];
  1295. data[4] = this->scalars[4];
  1296. data[5] = this->scalars[5];
  1297. data[6] = this->scalars[6];
  1298. data[7] = this->scalars[7];
  1299. #endif
  1300. }
  1301. // Bound and alignment checked reading
  1302. static inline I32x8 readAligned(dsr::SafePointer<const int32_t> data, const char* methodName) {
  1303. const int32_t* pointer = data.getUnsafe();
  1304. assert(((uintptr_t)pointer & 31) == 0);
  1305. #if defined SAFE_POINTER_CHECKS
  1306. data.assertInside(methodName, pointer, 32);
  1307. #endif
  1308. return I32x8::readAlignedUnsafe(pointer);
  1309. }
  1310. // Bound and alignment checked writing
  1311. inline void writeAligned(dsr::SafePointer<int32_t> data, const char* methodName) const {
  1312. int32_t* pointer = data.getUnsafe();
  1313. assert(((uintptr_t)pointer & 31) == 0);
  1314. #if defined SAFE_POINTER_CHECKS
  1315. data.assertInside(methodName, pointer, 32);
  1316. #endif
  1317. this->writeAlignedUnsafe(pointer);
  1318. }
  1319. };
  1320. union U32x8 {
  1321. private:
  1322. // The uninitialized default constructor is private for safety reasons.
  1323. U32x8() {}
  1324. public:
  1325. // When the uninitialized constructor is needed for performance, use this named constructor instead.
  1326. static inline U32x8 create_dangerous_uninitialized() { return U32x8(); }
  1327. #if defined USE_256BIT_X_SIMD
  1328. public:
  1329. #if defined USE_DIRECT_SIMD_MEMORY_ACCESS
  1330. // Only use if USE_DIRECT_SIMD_MEMORY_ACCESS is defined!
  1331. uint32_t scalars[8];
  1332. #endif
  1333. // The SIMD vector of undefined type
  1334. // Not accessible while emulating!
  1335. SIMD_U32x8 v;
  1336. // Construct a portable vector from a native SIMD vector
  1337. explicit U32x8(const SIMD_U32x8& v) : v(v) {}
  1338. // Construct a portable vector from a set of scalars
  1339. U32x8(uint32_t a1, uint32_t a2, uint32_t a3, uint32_t a4, uint32_t a5, uint32_t a6, uint32_t a7, uint32_t a8)
  1340. : v(LOAD_VECTOR_U32_SIMD256(a1, a2, a3, a4, a5, a6, a7, a8)) {}
  1341. // Construct a portable vector from a single duplicated scalar
  1342. explicit U32x8(uint32_t scalar) : v(LOAD_SCALAR_U32_SIMD256(scalar)) {}
  1343. #else
  1344. public:
  1345. // Emulate a SIMD vector as an array of scalars without hardware support.
  1346. // Only accessible while emulating!
  1347. uint32_t scalars[8];
  1348. // Construct a portable vector from a set of scalars
  1349. U32x8(uint32_t a1, uint32_t a2, uint32_t a3, uint32_t a4, uint32_t a5, uint32_t a6, uint32_t a7, uint32_t a8) {
  1350. this->scalars[0] = a1;
  1351. this->scalars[1] = a2;
  1352. this->scalars[2] = a3;
  1353. this->scalars[3] = a4;
  1354. this->scalars[4] = a5;
  1355. this->scalars[5] = a6;
  1356. this->scalars[6] = a7;
  1357. this->scalars[7] = a8;
  1358. }
  1359. // Construct a portable vector from a single duplicated scalar
  1360. explicit U32x8(uint32_t scalar) {
  1361. this->scalars[0] = scalar;
  1362. this->scalars[1] = scalar;
  1363. this->scalars[2] = scalar;
  1364. this->scalars[3] = scalar;
  1365. this->scalars[4] = scalar;
  1366. this->scalars[5] = scalar;
  1367. this->scalars[6] = scalar;
  1368. this->scalars[7] = scalar;
  1369. }
  1370. #endif
  1371. // Create a gradient vector using start and increment, so that arbitrary length vectors have a way to initialize linear iterations.
  1372. static inline U32x8 createGradient(uint32_t start, uint32_t increment) {
  1373. return U32x8(
  1374. start,
  1375. start + increment,
  1376. start + increment * 2,
  1377. start + increment * 3,
  1378. start + increment * 4,
  1379. start + increment * 5,
  1380. start + increment * 6,
  1381. start + increment * 7
  1382. );
  1383. }
  1384. // Construct a portable SIMD vector from a pointer to aligned data
1385. // data must be aligned to 32 bytes
  1386. static inline U32x8 readAlignedUnsafe(const uint32_t* data) {
  1387. #if defined USE_AVX2
  1388. return U32x8(_mm256_load_si256((const __m256i*)data));
  1389. #else
  1390. return U32x8(data[0], data[1], data[2], data[3], data[4], data[5], data[6], data[7]);
  1391. #endif
  1392. }
  1393. // Write to aligned memory from the existing vector
1394. // data must be aligned to 32 bytes
  1395. inline void writeAlignedUnsafe(uint32_t* data) const {
  1396. #if defined USE_AVX2
  1397. _mm256_store_si256((__m256i*)data, this->v);
  1398. #else
  1399. data[0] = this->scalars[0];
  1400. data[1] = this->scalars[1];
  1401. data[2] = this->scalars[2];
  1402. data[3] = this->scalars[3];
  1403. data[4] = this->scalars[4];
  1404. data[5] = this->scalars[5];
  1405. data[6] = this->scalars[6];
  1406. data[7] = this->scalars[7];
  1407. #endif
  1408. }
  1409. // Bound and alignment checked reading
  1410. static inline U32x8 readAligned(dsr::SafePointer<const uint32_t> data, const char* methodName) {
  1411. const uint32_t* pointer = data.getUnsafe();
  1412. assert(((uintptr_t)pointer & 31) == 0);
  1413. #if defined SAFE_POINTER_CHECKS
  1414. data.assertInside(methodName, pointer, 32);
  1415. #endif
  1416. return U32x8::readAlignedUnsafe(pointer);
  1417. }
  1418. // Bound and alignment checked writing
  1419. inline void writeAligned(dsr::SafePointer<uint32_t> data, const char* methodName) const {
  1420. uint32_t* pointer = data.getUnsafe();
  1421. assert(((uintptr_t)pointer & 31) == 0);
  1422. #if defined SAFE_POINTER_CHECKS
  1423. data.assertInside(methodName, pointer, 32);
  1424. #endif
  1425. this->writeAlignedUnsafe(pointer);
  1426. }
  1427. };
  1428. union U16x16 {
  1429. private:
  1430. // The uninitialized default constructor is private for safety reasons.
  1431. U16x16() {}
  1432. public:
  1433. // When the uninitialized constructor is needed for performance, use this named constructor instead.
  1434. static inline U16x16 create_dangerous_uninitialized() { return U16x16(); }
  1435. #if defined USE_256BIT_X_SIMD
  1436. public:
  1437. #if defined USE_DIRECT_SIMD_MEMORY_ACCESS
  1438. // Only use if USE_DIRECT_SIMD_MEMORY_ACCESS is defined!
  1439. uint16_t scalars[16];
  1440. #endif
  1441. // The SIMD vector of undefined type
  1442. // Not accessible while emulating!
  1443. SIMD_U16x16 v;
  1444. // Construct a portable vector from a native SIMD vector
  1445. explicit U16x16(const SIMD_U16x16& v) : v(v) {}
  1446. // Construct a vector of 16 x 16-bit unsigned integers from a vector of 8 x 32-bit unsigned integers
  1447. // Reinterpret casting is used
  1448. explicit U16x16(const U32x8& vector) : v(REINTERPRET_U32_TO_U16_SIMD256(vector.v)) {}
  1449. // Construct a portable vector from a set of scalars
  1450. U16x16(uint16_t a1, uint16_t a2, uint16_t a3, uint16_t a4, uint16_t a5, uint16_t a6, uint16_t a7, uint16_t a8,
  1451. uint16_t a9, uint16_t a10, uint16_t a11, uint16_t a12, uint16_t a13, uint16_t a14, uint16_t a15, uint16_t a16)
  1452. : v(LOAD_VECTOR_U16_SIMD256(a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11, a12, a13, a14, a15, a16)) {}
1453. // Construct a vector of 16 x 16-bit unsigned integers from a single duplicated 32-bit unsigned integer
1454. // Reinterpret casting is used
1455. // TODO: Remove all reinterpret casts from constructors to improve readability
  1456. explicit U16x16(uint32_t scalar) : v(REINTERPRET_U32_TO_U16_SIMD256(LOAD_SCALAR_U32_SIMD256(scalar))) {}
  1457. // Construct a portable vector from a single duplicated scalar
  1458. explicit U16x16(uint16_t scalar) : v(LOAD_SCALAR_U16_SIMD256(scalar)) {}
1459. // Reinterpret cast to a vector of 8 x 32-bit unsigned integers
  1460. U32x8 get_U32() const {
  1461. return U32x8(REINTERPRET_U16_TO_U32_SIMD256(this->v));
  1462. }
  1463. #else
  1464. public:
  1465. // Emulate a SIMD vector as an array of scalars without hardware support.
  1466. // Only accessible while emulating!
  1467. uint16_t scalars[16];
1468. // Construct a vector of 16 x 16-bit unsigned integers from a vector of 8 x 32-bit unsigned integers
  1469. // Reinterpret casting is used
  1470. explicit U16x16(const U32x8& vector) {
  1471. uint64_t *target = (uint64_t*)this->scalars;
  1472. uint64_t *source = (uint64_t*)vector.scalars;
  1473. target[0] = source[0];
  1474. target[1] = source[1];
  1475. target[2] = source[2];
  1476. target[3] = source[3];
  1477. }
  1478. // Construct a portable vector from a set of scalars
  1479. U16x16(uint16_t a1, uint16_t a2, uint16_t a3, uint16_t a4, uint16_t a5, uint16_t a6, uint16_t a7, uint16_t a8,
  1480. uint16_t a9, uint16_t a10, uint16_t a11, uint16_t a12, uint16_t a13, uint16_t a14, uint16_t a15, uint16_t a16) {
  1481. this->scalars[0] = a1;
  1482. this->scalars[1] = a2;
  1483. this->scalars[2] = a3;
  1484. this->scalars[3] = a4;
  1485. this->scalars[4] = a5;
  1486. this->scalars[5] = a6;
  1487. this->scalars[6] = a7;
  1488. this->scalars[7] = a8;
  1489. this->scalars[8] = a9;
  1490. this->scalars[9] = a10;
  1491. this->scalars[10] = a11;
  1492. this->scalars[11] = a12;
  1493. this->scalars[12] = a13;
  1494. this->scalars[13] = a14;
  1495. this->scalars[14] = a15;
  1496. this->scalars[15] = a16;
  1497. }
1498. // Construct a vector of 16 x 16-bit unsigned integers from a single duplicated 32-bit unsigned integer
  1499. // Reinterpret casting is used
  1500. explicit U16x16(uint32_t scalar) {
  1501. uint32_t *target = (uint32_t*)this->scalars;
  1502. target[0] = scalar;
  1503. target[1] = scalar;
  1504. target[2] = scalar;
  1505. target[3] = scalar;
  1506. target[4] = scalar;
  1507. target[5] = scalar;
  1508. target[6] = scalar;
  1509. target[7] = scalar;
  1510. }
  1511. // Construct a portable vector from a single duplicated scalar
  1512. explicit U16x16(uint16_t scalar) {
  1513. this->scalars[0] = scalar;
  1514. this->scalars[1] = scalar;
  1515. this->scalars[2] = scalar;
  1516. this->scalars[3] = scalar;
  1517. this->scalars[4] = scalar;
  1518. this->scalars[5] = scalar;
  1519. this->scalars[6] = scalar;
  1520. this->scalars[7] = scalar;
  1521. this->scalars[8] = scalar;
  1522. this->scalars[9] = scalar;
  1523. this->scalars[10] = scalar;
  1524. this->scalars[11] = scalar;
  1525. this->scalars[12] = scalar;
  1526. this->scalars[13] = scalar;
  1527. this->scalars[14] = scalar;
  1528. this->scalars[15] = scalar;
  1529. }
  1530. // Reinterpret cast to a vector of 8 x 32-bit unsigned integers
  1531. U32x8 get_U32() const {
  1532. U32x8 result(0);
  1533. uint64_t *target = (uint64_t*)result.scalars;
  1534. uint64_t *source = (uint64_t*)this->scalars;
  1535. target[0] = source[0];
  1536. target[1] = source[1];
  1537. target[2] = source[2];
  1538. target[3] = source[3];
  1539. return result;
  1540. }
  1541. #endif
  1542. // Create a gradient vector using start and increment, so that arbitrary length vectors have a way to initialize linear iterations.
  1543. static inline U16x16 createGradient(uint16_t start, uint16_t increment) {
  1544. return U16x16(
  1545. start,
  1546. start + increment,
  1547. start + increment * 2,
  1548. start + increment * 3,
  1549. start + increment * 4,
  1550. start + increment * 5,
  1551. start + increment * 6,
  1552. start + increment * 7,
  1553. start + increment * 8,
  1554. start + increment * 9,
  1555. start + increment * 10,
  1556. start + increment * 11,
  1557. start + increment * 12,
  1558. start + increment * 13,
  1559. start + increment * 14,
  1560. start + increment * 15
  1561. );
  1562. }
1563. // data must be aligned to 32 bytes
  1564. //static inline U16x16 readSlow(uint16_t* data) {
  1565. // return U16x16(data[0], data[1], data[2], data[3], data[4], data[5], data[6], data[7]);
  1566. //}
  1567. static inline U16x16 readAlignedUnsafe(const uint16_t* data) {
  1568. #if defined USE_AVX2
  1569. return U16x16(_mm256_load_si256((const __m256i*)data));
  1570. #else
  1571. return U16x16(
  1572. data[0],
  1573. data[1],
  1574. data[2],
  1575. data[3],
  1576. data[4],
  1577. data[5],
  1578. data[6],
  1579. data[7],
  1580. data[8],
  1581. data[9],
  1582. data[10],
  1583. data[11],
  1584. data[12],
  1585. data[13],
  1586. data[14],
  1587. data[15]
  1588. );
  1589. #endif
  1590. }
1591. // data must be aligned to 32 bytes
  1592. inline void writeAlignedUnsafe(uint16_t* data) const {
  1593. #if defined USE_AVX2
  1594. _mm256_store_si256((__m256i*)data, this->v);
  1595. #else
  1596. data[0] = this->scalars[0];
  1597. data[1] = this->scalars[1];
  1598. data[2] = this->scalars[2];
  1599. data[3] = this->scalars[3];
  1600. data[4] = this->scalars[4];
  1601. data[5] = this->scalars[5];
  1602. data[6] = this->scalars[6];
  1603. data[7] = this->scalars[7];
  1604. data[8] = this->scalars[8];
  1605. data[9] = this->scalars[9];
  1606. data[10] = this->scalars[10];
  1607. data[11] = this->scalars[11];
  1608. data[12] = this->scalars[12];
  1609. data[13] = this->scalars[13];
  1610. data[14] = this->scalars[14];
  1611. data[15] = this->scalars[15];
  1612. #endif
  1613. }
  1614. // Bound and alignment checked reading
  1615. static inline U16x16 readAligned(dsr::SafePointer<const uint16_t> data, const char* methodName) {
  1616. const uint16_t* pointer = data.getUnsafe();
  1617. assert(((uintptr_t)pointer & 31) == 0);
  1618. #if defined SAFE_POINTER_CHECKS
  1619. data.assertInside(methodName, pointer, 32);
  1620. #endif
  1621. return U16x16::readAlignedUnsafe(pointer);
  1622. }
  1623. // Bound and alignment checked writing
  1624. inline void writeAligned(dsr::SafePointer<uint16_t> data, const char* methodName) const {
  1625. uint16_t* pointer = data.getUnsafe();
  1626. assert(((uintptr_t)pointer & 31) == 0);
  1627. #if defined SAFE_POINTER_CHECKS
  1628. data.assertInside(methodName, pointer, 32);
  1629. #endif
  1630. this->writeAlignedUnsafe(pointer);
  1631. }
  1632. };
  1633. union U8x32 {
  1634. private:
  1635. // The uninitialized default constructor is private for safety reasons.
  1636. U8x32() {}
  1637. public:
  1638. // When the uninitialized constructor is needed for performance, use this named constructor instead.
  1639. static inline U8x32 create_dangerous_uninitialized() { return U8x32(); }
  1640. #if defined USE_256BIT_X_SIMD
  1641. public:
  1642. #if defined USE_DIRECT_SIMD_MEMORY_ACCESS
  1643. // Only use if USE_DIRECT_SIMD_MEMORY_ACCESS is defined!
  1644. uint8_t scalars[32];
  1645. #endif
  1646. // The SIMD vector of undefined type
  1647. // Not accessible while emulating!
  1648. SIMD_U8x32 v;
  1649. // Construct a portable vector from a native SIMD vector
  1650. explicit U8x32(const SIMD_U8x32& v) : v(v) {}
  1651. // Construct a portable vector from a set of scalars
  1652. U8x32(uint8_t a1, uint8_t a2, uint8_t a3, uint8_t a4, uint8_t a5, uint8_t a6, uint8_t a7, uint8_t a8,
  1653. uint8_t a9, uint8_t a10, uint8_t a11, uint8_t a12, uint8_t a13, uint8_t a14, uint8_t a15, uint8_t a16,
  1654. uint8_t a17, uint8_t a18, uint8_t a19, uint8_t a20, uint8_t a21, uint8_t a22, uint8_t a23, uint8_t a24,
  1655. uint8_t a25, uint8_t a26, uint8_t a27, uint8_t a28, uint8_t a29, uint8_t a30, uint8_t a31, uint8_t a32)
  1656. : v(LOAD_VECTOR_U8_SIMD256(
  1657. a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11, a12, a13, a14, a15, a16,
  1658. a17, a18, a19, a20, a21, a22, a23, a24, a25, a26, a27, a28, a29, a30, a31, a32
  1659. )) {}
  1660. // Construct a portable vector from a single duplicated scalar
  1661. explicit U8x32(uint8_t scalar) : v(LOAD_SCALAR_U8_SIMD256(scalar)) {}
  1662. #else
  1663. public:
  1664. // Emulate a SIMD vector as an array of scalars without hardware support.
  1665. // Only accessible while emulating!
  1666. uint8_t scalars[32];
  1667. // Construct a portable vector from a set of scalars
  1668. U8x32(uint8_t a1, uint8_t a2, uint8_t a3, uint8_t a4, uint8_t a5, uint8_t a6, uint8_t a7, uint8_t a8,
  1669. uint8_t a9, uint8_t a10, uint8_t a11, uint8_t a12, uint8_t a13, uint8_t a14, uint8_t a15, uint8_t a16,
  1670. uint8_t a17, uint8_t a18, uint8_t a19, uint8_t a20, uint8_t a21, uint8_t a22, uint8_t a23, uint8_t a24,
  1671. uint8_t a25, uint8_t a26, uint8_t a27, uint8_t a28, uint8_t a29, uint8_t a30, uint8_t a31, uint8_t a32) {
  1672. this->scalars[0] = a1;
  1673. this->scalars[1] = a2;
  1674. this->scalars[2] = a3;
  1675. this->scalars[3] = a4;
  1676. this->scalars[4] = a5;
  1677. this->scalars[5] = a6;
  1678. this->scalars[6] = a7;
  1679. this->scalars[7] = a8;
  1680. this->scalars[8] = a9;
  1681. this->scalars[9] = a10;
  1682. this->scalars[10] = a11;
  1683. this->scalars[11] = a12;
  1684. this->scalars[12] = a13;
  1685. this->scalars[13] = a14;
  1686. this->scalars[14] = a15;
  1687. this->scalars[15] = a16;
  1688. this->scalars[16] = a17;
  1689. this->scalars[17] = a18;
  1690. this->scalars[18] = a19;
  1691. this->scalars[19] = a20;
  1692. this->scalars[20] = a21;
  1693. this->scalars[21] = a22;
  1694. this->scalars[22] = a23;
  1695. this->scalars[23] = a24;
  1696. this->scalars[24] = a25;
  1697. this->scalars[25] = a26;
  1698. this->scalars[26] = a27;
  1699. this->scalars[27] = a28;
  1700. this->scalars[28] = a29;
  1701. this->scalars[29] = a30;
  1702. this->scalars[30] = a31;
  1703. this->scalars[31] = a32;
  1704. }
  1705. // Construct a portable vector from a single duplicated scalar
  1706. explicit U8x32(uint8_t scalar) {
  1707. for (int i = 0; i < 32; i++) {
  1708. this->scalars[i] = scalar;
  1709. }
  1710. }
  1711. #endif
  1712. // Create a gradient vector using start and increment, so that arbitrary length vectors have a way to initialize linear iterations.
  1713. static inline U8x32 createGradient(uint8_t start, uint8_t increment) {
  1714. return U8x32(
  1715. start,
  1716. start + increment,
  1717. start + increment * 2,
  1718. start + increment * 3,
  1719. start + increment * 4,
  1720. start + increment * 5,
  1721. start + increment * 6,
  1722. start + increment * 7,
  1723. start + increment * 8,
  1724. start + increment * 9,
  1725. start + increment * 10,
  1726. start + increment * 11,
  1727. start + increment * 12,
  1728. start + increment * 13,
  1729. start + increment * 14,
  1730. start + increment * 15,
  1731. start + increment * 16,
  1732. start + increment * 17,
  1733. start + increment * 18,
  1734. start + increment * 19,
  1735. start + increment * 20,
  1736. start + increment * 21,
  1737. start + increment * 22,
  1738. start + increment * 23,
  1739. start + increment * 24,
  1740. start + increment * 25,
  1741. start + increment * 26,
  1742. start + increment * 27,
  1743. start + increment * 28,
  1744. start + increment * 29,
  1745. start + increment * 30,
  1746. start + increment * 31
  1747. );
  1748. }
  1749. static inline U8x32 readAlignedUnsafe(const uint8_t* data) {
  1750. #if defined USE_AVX2
  1751. return U8x32(_mm256_load_si256((const __m256i*)data));
  1752. #else
  1753. U8x32 result;
  1754. for (int i = 0; i < 32; i++) {
  1755. result.scalars[i] = data[i];
  1756. }
  1757. return result;
  1758. #endif
  1759. }
1760. // data must be aligned to 32 bytes
  1761. inline void writeAlignedUnsafe(uint8_t* data) const {
  1762. #if defined USE_AVX2
  1763. _mm256_store_si256((__m256i*)data, this->v);
  1764. #else
  1765. for (int i = 0; i < 32; i++) {
  1766. data[i] = this->scalars[i];
  1767. }
  1768. #endif
  1769. }
  1770. // Bound and alignment checked reading
  1771. static inline U8x32 readAligned(dsr::SafePointer<const uint8_t> data, const char* methodName) {
  1772. const uint8_t* pointer = data.getUnsafe();
  1773. assert(((uintptr_t)pointer & 31) == 0);
  1774. #if defined SAFE_POINTER_CHECKS
  1775. data.assertInside(methodName, pointer, 32);
  1776. #endif
  1777. return U8x32::readAlignedUnsafe(pointer);
  1778. }
  1779. // Bound and alignment checked writing
  1780. inline void writeAligned(dsr::SafePointer<uint8_t> data, const char* methodName) const {
  1781. uint8_t* pointer = data.getUnsafe();
  1782. assert(((uintptr_t)pointer & 31) == 0);
  1783. #if defined SAFE_POINTER_CHECKS
  1784. data.assertInside(methodName, pointer, 32);
  1785. #endif
  1786. this->writeAlignedUnsafe(pointer);
  1787. }
  1788. };
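// Usage sketch (illustrative only): create_dangerous_uninitialized is intended for vectors that are
// fully overwritten before being read, avoiding the cost of initializing the lanes twice.
//   U8x32 block = U8x32::create_dangerous_uninitialized();
//   block = U8x32::readAlignedUnsafe(sourcePointer); // "sourcePointer" is a hypothetical const uint8_t*.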
1789. // Helper macros that apply an operation to selected sets of SIMD vector types.
1790. // Each macro expands DO(vector_type, element_type, lane_count) once per listed type.
  1791. #define FOR_ALL_VECTOR_TYPES(DO) \
  1792. DO(F32x4, float, 4) \
  1793. DO(I32x4, int32_t, 4) \
  1794. DO(U32x4, uint32_t, 4) \
  1795. DO(U16x8, uint16_t, 8) \
  1796. DO(U8x16, uint8_t, 16) \
  1797. DO(F32x8, float, 8) \
  1798. DO(I32x8, int32_t, 8) \
  1799. DO(U32x8, uint32_t, 8) \
  1800. DO(U16x16, uint16_t, 16) \
  1801. DO(U8x32, uint8_t, 32)
  1802. #define FOR_FLOAT_VECTOR_TYPES(DO) \
  1803. DO(F32x4, float, 4) \
  1804. DO(F32x8, float, 8)
  1805. #define FOR_INTEGER_VECTOR_TYPES(DO) \
  1806. DO(I32x4, int32_t, 4) \
  1807. DO(U32x4, uint32_t, 4) \
  1808. DO(U16x8, uint16_t, 8) \
  1809. DO(U8x16, uint8_t, 16) \
  1810. DO(I32x8, int32_t, 8) \
  1811. DO(U32x8, uint32_t, 8) \
  1812. DO(U16x16, uint16_t, 16) \
  1813. DO(U8x32, uint8_t, 32)
  1814. #define FOR_SIGNED_VECTOR_TYPES(DO) \
  1815. DO(F32x4, float, 4) \
  1816. DO(I32x4, int32_t, 4) \
  1817. DO(F32x8, float, 8) \
  1818. DO(I32x8, int32_t, 8)
  1819. #define FOR_UNSIGNED_VECTOR_TYPES(DO) \
  1820. DO(U32x4, uint32_t, 4) \
  1821. DO(U16x8, uint16_t, 8) \
  1822. DO(U8x16, uint8_t, 16) \
  1823. DO(U32x8, uint32_t, 8) \
  1824. DO(U16x16, uint16_t, 16) \
  1825. DO(U8x32, uint8_t, 32)
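// Expansion sketch (illustrative only): a hypothetical per-type helper could be generated like this,
// by defining a DO macro and handing it to one of the lists above.
//   #define CREATE_LANE_COUNT(VECTOR_TYPE, ELEMENT_TYPE, LANE_COUNT) \
//     inline int laneCount(const VECTOR_TYPE&) { return LANE_COUNT; }
//   FOR_ALL_VECTOR_TYPES(CREATE_LANE_COUNT)
//   #undef CREATE_LANE_COUNT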
  1826. // Print SIMD vectors to the terminal or append them to strings.
  1827. #define CREATE_METHOD_PRINT(VECTOR_TYPE, ELEMENT_TYPE, LANE_COUNT) \
  1828. inline dsr::String& string_toStreamIndented(dsr::String& target, const VECTOR_TYPE& source, const dsr::ReadableString& indentation) { \
  1829. ALIGN_BYTES(sizeof(VECTOR_TYPE)) ELEMENT_TYPE a[LANE_COUNT]; \
  1830. source.writeAlignedUnsafe(a); \
  1831. dsr::string_append(target, indentation, a[0]); \
  1832. for (int i = 1; i < LANE_COUNT; i++) { \
  1833. string_append(target, U", ", a[i]); \
  1834. } \
  1835. return target; \
  1836. }
  1837. // All SIMD vectors can be printed.
  1838. FOR_ALL_VECTOR_TYPES(CREATE_METHOD_PRINT)
  1839. #undef CREATE_METHOD_PRINT
1840. // Integer equality returns true iff every pair of lanes is exactly equal.
  1841. #define CREATE_EXACT_EQUALITY(VECTOR_TYPE, ELEMENT_TYPE, LANE_COUNT) \
  1842. inline bool allLanesEqual(const VECTOR_TYPE& left, const VECTOR_TYPE& right) { \
  1843. ALIGN_BYTES(sizeof(VECTOR_TYPE)) ELEMENT_TYPE a[LANE_COUNT]; \
  1844. ALIGN_BYTES(sizeof(VECTOR_TYPE)) ELEMENT_TYPE b[LANE_COUNT]; \
  1845. left.writeAlignedUnsafe(a); \
  1846. right.writeAlignedUnsafe(b); \
  1847. for (int i = 0; i < LANE_COUNT; i++) { \
  1848. if (a[i] != b[i]) return false; \
  1849. } \
  1850. return true; \
  1851. } \
  1852. inline bool allLanesNotEqual(const VECTOR_TYPE& left, const VECTOR_TYPE& right) { \
  1853. ALIGN_BYTES(sizeof(VECTOR_TYPE)) ELEMENT_TYPE a[LANE_COUNT]; \
  1854. ALIGN_BYTES(sizeof(VECTOR_TYPE)) ELEMENT_TYPE b[LANE_COUNT]; \
  1855. left.writeAlignedUnsafe(a); \
  1856. right.writeAlignedUnsafe(b); \
  1857. for (int i = 0; i < LANE_COUNT; i++) { \
  1858. if (a[i] == b[i]) return false; \
  1859. } \
  1860. return true; \
  1861. }
1862. // Integer SIMD vectors have exact equality.
  1863. FOR_INTEGER_VECTOR_TYPES(CREATE_EXACT_EQUALITY)
  1864. #undef CREATE_EXACT_EQUALITY
1865. // Float equality returns true iff every pair of lanes is approximately equal (within a small tolerance).
  1866. #define CREATE_TOLERANT_EQUALITY(VECTOR_TYPE, ELEMENT_TYPE, LANE_COUNT) \
  1867. inline bool allLanesEqual(const VECTOR_TYPE& left, const VECTOR_TYPE& right) { \
  1868. ALIGN_BYTES(sizeof(VECTOR_TYPE)) ELEMENT_TYPE a[LANE_COUNT]; \
  1869. ALIGN_BYTES(sizeof(VECTOR_TYPE)) ELEMENT_TYPE b[LANE_COUNT]; \
  1870. left.writeAlignedUnsafe(a); \
  1871. right.writeAlignedUnsafe(b); \
  1872. for (int i = 0; i < LANE_COUNT; i++) { \
  1873. if (fabs(a[i] - b[i]) >= 0.0001f) return false; \
  1874. } \
  1875. return true; \
  1876. } \
  1877. inline bool allLanesNotEqual(const VECTOR_TYPE& left, const VECTOR_TYPE& right) { \
  1878. ALIGN_BYTES(sizeof(VECTOR_TYPE)) ELEMENT_TYPE a[LANE_COUNT]; \
  1879. ALIGN_BYTES(sizeof(VECTOR_TYPE)) ELEMENT_TYPE b[LANE_COUNT]; \
  1880. left.writeAlignedUnsafe(a); \
  1881. right.writeAlignedUnsafe(b); \
  1882. for (int i = 0; i < LANE_COUNT; i++) { \
  1883. if (fabs(a[i] - b[i]) < 0.0001f) return false; \
  1884. } \
  1885. return true; \
  1886. }
  1887. // Float SIMD vectors have inexact equality.
  1888. FOR_FLOAT_VECTOR_TYPES(CREATE_TOLERANT_EQUALITY)
  1889. #undef CREATE_TOLERANT_EQUALITY
  1890. #define CREATE_COMPARISONS(VECTOR_TYPE, ELEMENT_TYPE, LANE_COUNT) \
  1891. inline bool allLanesGreater(const VECTOR_TYPE& left, const VECTOR_TYPE& right) { \
  1892. ALIGN_BYTES(sizeof(VECTOR_TYPE)) ELEMENT_TYPE a[LANE_COUNT]; \
  1893. ALIGN_BYTES(sizeof(VECTOR_TYPE)) ELEMENT_TYPE b[LANE_COUNT]; \
  1894. left.writeAlignedUnsafe(a); \
  1895. right.writeAlignedUnsafe(b); \
  1896. for (int i = 0; i < LANE_COUNT; i++) { \
  1897. if (a[i] <= b[i]) return false; \
  1898. } \
  1899. return true; \
  1900. } \
  1901. inline bool allLanesGreaterOrEqual(const VECTOR_TYPE& left, const VECTOR_TYPE& right) { \
  1902. ALIGN_BYTES(sizeof(VECTOR_TYPE)) ELEMENT_TYPE a[LANE_COUNT]; \
  1903. ALIGN_BYTES(sizeof(VECTOR_TYPE)) ELEMENT_TYPE b[LANE_COUNT]; \
  1904. left.writeAlignedUnsafe(a); \
  1905. right.writeAlignedUnsafe(b); \
  1906. for (int i = 0; i < LANE_COUNT; i++) { \
  1907. if (a[i] < b[i]) return false; \
  1908. } \
  1909. return true; \
  1910. } \
  1911. inline bool allLanesLesser(const VECTOR_TYPE& left, const VECTOR_TYPE& right) { \
  1912. ALIGN_BYTES(sizeof(VECTOR_TYPE)) ELEMENT_TYPE a[LANE_COUNT]; \
  1913. ALIGN_BYTES(sizeof(VECTOR_TYPE)) ELEMENT_TYPE b[LANE_COUNT]; \
  1914. left.writeAlignedUnsafe(a); \
  1915. right.writeAlignedUnsafe(b); \
  1916. for (int i = 0; i < LANE_COUNT; i++) { \
  1917. if (a[i] >= b[i]) return false; \
  1918. } \
  1919. return true; \
  1920. } \
  1921. inline bool allLanesLesserOrEqual(const VECTOR_TYPE& left, const VECTOR_TYPE& right) { \
  1922. ALIGN_BYTES(sizeof(VECTOR_TYPE)) ELEMENT_TYPE a[LANE_COUNT]; \
  1923. ALIGN_BYTES(sizeof(VECTOR_TYPE)) ELEMENT_TYPE b[LANE_COUNT]; \
  1924. left.writeAlignedUnsafe(a); \
  1925. right.writeAlignedUnsafe(b); \
  1926. for (int i = 0; i < LANE_COUNT; i++) { \
  1927. if (a[i] > b[i]) return false; \
  1928. } \
  1929. return true; \
  1930. }
  1931. FOR_ALL_VECTOR_TYPES(CREATE_COMPARISONS)
  1932. #undef CREATE_COMPARISONS
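// Usage sketch (illustrative only): the generated comparisons reduce a lane-wise test to a single bool,
// which is how the shift operators below validate their bit offsets.
//   assert(allLanesLesser(U32x4(1u, 2u, 3u, 4u), U32x4(32u))); // True, since every lane is below 32.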
  1933. inline F32x4 operator+(const F32x4& left, const F32x4& right) {
  1934. #if defined USE_BASIC_SIMD
  1935. return F32x4(ADD_F32_SIMD(left.v, right.v));
  1936. #else
  1937. return F32x4(left.scalars[0] + right.scalars[0], left.scalars[1] + right.scalars[1], left.scalars[2] + right.scalars[2], left.scalars[3] + right.scalars[3]);
  1938. #endif
  1939. }
  1940. inline F32x4 operator-(const F32x4& left, const F32x4& right) {
  1941. #if defined USE_BASIC_SIMD
  1942. return F32x4(SUB_F32_SIMD(left.v, right.v));
  1943. #else
  1944. return F32x4(left.scalars[0] - right.scalars[0], left.scalars[1] - right.scalars[1], left.scalars[2] - right.scalars[2], left.scalars[3] - right.scalars[3]);
  1945. #endif
  1946. }
  1947. inline F32x4 operator*(const F32x4& left, const F32x4& right) {
  1948. #if defined USE_BASIC_SIMD
  1949. return F32x4(MUL_F32_SIMD(left.v, right.v));
  1950. #else
  1951. return F32x4(left.scalars[0] * right.scalars[0], left.scalars[1] * right.scalars[1], left.scalars[2] * right.scalars[2], left.scalars[3] * right.scalars[3]);
  1952. #endif
  1953. }
  1954. inline F32x4 min(const F32x4& left, const F32x4& right) {
  1955. #if defined USE_BASIC_SIMD
  1956. return F32x4(MIN_F32_SIMD(left.v, right.v));
  1957. #else
  1958. float v0 = left.scalars[0];
  1959. float v1 = left.scalars[1];
  1960. float v2 = left.scalars[2];
  1961. float v3 = left.scalars[3];
  1962. float r0 = right.scalars[0];
  1963. float r1 = right.scalars[1];
  1964. float r2 = right.scalars[2];
  1965. float r3 = right.scalars[3];
  1966. if (r0 < v0) { v0 = r0; }
  1967. if (r1 < v1) { v1 = r1; }
  1968. if (r2 < v2) { v2 = r2; }
  1969. if (r3 < v3) { v3 = r3; }
  1970. return F32x4(v0, v1, v2, v3);
  1971. #endif
  1972. }
  1973. inline F32x4 max(const F32x4& left, const F32x4& right) {
  1974. #if defined USE_BASIC_SIMD
  1975. return F32x4(MAX_F32_SIMD(left.v, right.v));
  1976. #else
  1977. float v0 = left.scalars[0];
  1978. float v1 = left.scalars[1];
  1979. float v2 = left.scalars[2];
  1980. float v3 = left.scalars[3];
  1981. float r0 = right.scalars[0];
  1982. float r1 = right.scalars[1];
  1983. float r2 = right.scalars[2];
  1984. float r3 = right.scalars[3];
  1985. if (r0 > v0) { v0 = r0; }
  1986. if (r1 > v1) { v1 = r1; }
  1987. if (r2 > v2) { v2 = r2; }
  1988. if (r3 > v3) { v3 = r3; }
  1989. return F32x4(v0, v1, v2, v3);
  1990. #endif
  1991. }
  1992. inline I32x4 operator+(const I32x4& left, const I32x4& right) {
  1993. #if defined USE_BASIC_SIMD
  1994. return I32x4(ADD_I32_SIMD(left.v, right.v));
  1995. #else
  1996. return I32x4(left.scalars[0] + right.scalars[0], left.scalars[1] + right.scalars[1], left.scalars[2] + right.scalars[2], left.scalars[3] + right.scalars[3]);
  1997. #endif
  1998. }
  1999. inline I32x4 operator-(const I32x4& left, const I32x4& right) {
  2000. #if defined USE_BASIC_SIMD
  2001. return I32x4(SUB_I32_SIMD(left.v, right.v));
  2002. #else
  2003. return I32x4(left.scalars[0] - right.scalars[0], left.scalars[1] - right.scalars[1], left.scalars[2] - right.scalars[2], left.scalars[3] - right.scalars[3]);
  2004. #endif
  2005. }
  2006. inline I32x4 operator*(const I32x4& left, const I32x4& right) {
  2007. #if defined USE_BASIC_SIMD
  2008. #if defined USE_SSE2
2009. // TODO: Use _mm_mullo_epi32 for 32-bit integer multiplication when building with SSE4.1 or AVX2.
  2010. return I32x4(left.scalars[0] * right.scalars[0], left.scalars[1] * right.scalars[1], left.scalars[2] * right.scalars[2], left.scalars[3] * right.scalars[3]);
  2011. #elif defined USE_NEON
  2012. return I32x4(MUL_I32_NEON(left.v, right.v));
  2013. #endif
  2014. #else
  2015. return I32x4(left.scalars[0] * right.scalars[0], left.scalars[1] * right.scalars[1], left.scalars[2] * right.scalars[2], left.scalars[3] * right.scalars[3]);
  2016. #endif
  2017. }
  2018. // TODO: Specify the behavior of truncated unsigned integer overflow and add it to the tests.
  2019. inline U32x4 operator+(const U32x4& left, const U32x4& right) {
  2020. #if defined USE_BASIC_SIMD
  2021. return U32x4(ADD_U32_SIMD(left.v, right.v));
  2022. #else
  2023. return U32x4(left.scalars[0] + right.scalars[0], left.scalars[1] + right.scalars[1], left.scalars[2] + right.scalars[2], left.scalars[3] + right.scalars[3]);
  2024. #endif
  2025. }
  2026. inline U32x4 operator-(const U32x4& left, const U32x4& right) {
  2027. #if defined USE_BASIC_SIMD
  2028. return U32x4(SUB_U32_SIMD(left.v, right.v));
  2029. #else
  2030. return U32x4(left.scalars[0] - right.scalars[0], left.scalars[1] - right.scalars[1], left.scalars[2] - right.scalars[2], left.scalars[3] - right.scalars[3]);
  2031. #endif
  2032. }
  2033. inline U32x4 operator*(const U32x4& left, const U32x4& right) {
  2034. #if defined USE_BASIC_SIMD
  2035. #if defined USE_SSE2
  2036. // Emulate a NEON instruction on SSE2 registers
  2037. return U32x4(left.scalars[0] * right.scalars[0], left.scalars[1] * right.scalars[1], left.scalars[2] * right.scalars[2], left.scalars[3] * right.scalars[3]);
  2038. #else // NEON
  2039. return U32x4(MUL_U32_NEON(left.v, right.v));
  2040. #endif
  2041. #else
  2042. return U32x4(left.scalars[0] * right.scalars[0], left.scalars[1] * right.scalars[1], left.scalars[2] * right.scalars[2], left.scalars[3] * right.scalars[3]);
  2043. #endif
  2044. }
  2045. // Bitwise and
  2046. inline U32x4 operator&(const U32x4& left, const U32x4& right) {
  2047. #if defined USE_BASIC_SIMD
  2048. return U32x4(BITWISE_AND_U32_SIMD(left.v, right.v));
  2049. #else
  2050. return U32x4(left.scalars[0] & right.scalars[0], left.scalars[1] & right.scalars[1], left.scalars[2] & right.scalars[2], left.scalars[3] & right.scalars[3]);
  2051. #endif
  2052. }
  2053. // Bitwise or
  2054. inline U32x4 operator|(const U32x4& left, const U32x4& right) {
  2055. #if defined USE_BASIC_SIMD
  2056. return U32x4(BITWISE_OR_U32_SIMD(left.v, right.v));
  2057. #else
  2058. return U32x4(left.scalars[0] | right.scalars[0], left.scalars[1] | right.scalars[1], left.scalars[2] | right.scalars[2], left.scalars[3] | right.scalars[3]);
  2059. #endif
  2060. }
  2061. // Bitwise xor
  2062. inline U32x4 operator^(const U32x4& left, const U32x4& right) {
  2063. #if defined USE_BASIC_SIMD
  2064. return U32x4(BITWISE_XOR_U32_SIMD(left.v, right.v));
  2065. #else
  2066. return U32x4(left.scalars[0] ^ right.scalars[0], left.scalars[1] ^ right.scalars[1], left.scalars[2] ^ right.scalars[2], left.scalars[3] ^ right.scalars[3]);
  2067. #endif
  2068. }
  2069. // Bitwise negation
  2070. inline U32x4 operator~(const U32x4& value) {
  2071. #if defined USE_NEON
  2072. return U32x4(vmvnq_u32(value.v));
  2073. #elif defined USE_BASIC_SIMD
  2074. // Fall back on xor against all ones.
  2075. return value ^ U32x4(~uint32_t(0));
  2076. #else
  2077. return U32x4(~value.scalars[0], ~value.scalars[1], ~value.scalars[2], ~value.scalars[3]);
  2078. #endif
  2079. }
  2080. inline U32x4 operator<<(const U32x4& left, const U32x4 &bitOffsets) {
  2081. assert(allLanesLesser(bitOffsets, U32x4(32u)));
  2082. #if defined USE_NEON
  2083. return U32x4(vshlq_u32(left.v, vreinterpretq_s32_u32(bitOffsets.v)));
  2084. #else
  2085. return U32x4(
  2086. left.scalars[0] << bitOffsets.scalars[0],
  2087. left.scalars[1] << bitOffsets.scalars[1],
  2088. left.scalars[2] << bitOffsets.scalars[2],
  2089. left.scalars[3] << bitOffsets.scalars[3]
  2090. );
  2091. #endif
  2092. }
  2093. inline U32x4 operator>>(const U32x4& left, const U32x4 &bitOffsets) {
  2094. assert(allLanesLesser(bitOffsets, U32x4(32u)));
  2095. #if defined USE_NEON
  2096. //return U32x4(vshrq_u32(left.v, vreinterpretq_s32_u32(bitOffsets.v)));
  2097. return U32x4(vshlq_u32(left.v, vnegq_s32(vreinterpretq_s32_u32(bitOffsets.v))));
  2098. #else
  2099. return U32x4(
  2100. left.scalars[0] >> bitOffsets.scalars[0],
  2101. left.scalars[1] >> bitOffsets.scalars[1],
  2102. left.scalars[2] >> bitOffsets.scalars[2],
  2103. left.scalars[3] >> bitOffsets.scalars[3]
  2104. );
  2105. #endif
  2106. }
  2107. // bitOffset must be an immediate constant, so a template argument is used.
  2108. template <uint32_t bitOffset>
  2109. inline U32x4 bitShiftLeftImmediate(const U32x4& left) {
  2110. static_assert(bitOffset < 32u, "Immediate left shift of 32-bit values may not shift more than 31 bits!");
  2111. #if defined USE_SSE2
  2112. return U32x4(_mm_slli_epi32(left.v, bitOffset));
  2113. #else
  2114. #if defined USE_NEON
  2115. return U32x4(vshlq_u32(left.v, LOAD_SCALAR_I32_SIMD(bitOffset)));
  2116. #else
  2117. return U32x4(left.scalars[0] << bitOffset, left.scalars[1] << bitOffset, left.scalars[2] << bitOffset, left.scalars[3] << bitOffset);
  2118. #endif
  2119. #endif
  2120. }
  2121. // bitOffset must be an immediate constant.
  2122. template <uint32_t bitOffset>
  2123. inline U32x4 bitShiftRightImmediate(const U32x4& left) {
  2124. static_assert(bitOffset < 32u, "Immediate right shift of 32-bit values may not shift more than 31 bits!");
  2125. #if defined USE_SSE2
  2126. return U32x4(_mm_srli_epi32(left.v, bitOffset));
  2127. #else
  2128. #if defined USE_NEON
2129. // NEON has no variable right-shift intrinsic (only the immediate form vshrq_n_u32), so shift left by a negated offset instead.
  2130. //return U32x4(vshrq_u32(left.v, LOAD_SCALAR_I32_SIMD(bitOffset)));
  2131. return U32x4(vshlq_u32(left.v, LOAD_SCALAR_I32_SIMD(-(int32_t)bitOffset)));
  2132. #else
  2133. return U32x4(left.scalars[0] >> bitOffset, left.scalars[1] >> bitOffset, left.scalars[2] >> bitOffset, left.scalars[3] >> bitOffset);
  2134. #endif
  2135. #endif
  2136. }
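// Usage sketch (illustrative only): the shift amount is a template argument so that the compiler can
// emit the immediate form of the shift instruction.
//   U32x4 packed(0x01020304u); // Broadcasts the same pattern to all four lanes.
//   U32x4 topByte = bitShiftRightImmediate<24>(packed); // Every lane becomes 0x00000001.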
  2137. inline U16x8 operator<<(const U16x8& left, const U16x8 &bitOffsets) {
  2138. assert(allLanesLesser(bitOffsets, U16x8(16u)));
  2139. #if defined USE_NEON
  2140. return U16x8(vshlq_u16(left.v, vreinterpretq_s16_u16(bitOffsets.v)));
  2141. #else
  2142. return U16x8(
  2143. left.scalars[0] << bitOffsets.scalars[0],
  2144. left.scalars[1] << bitOffsets.scalars[1],
  2145. left.scalars[2] << bitOffsets.scalars[2],
  2146. left.scalars[3] << bitOffsets.scalars[3],
  2147. left.scalars[4] << bitOffsets.scalars[4],
  2148. left.scalars[5] << bitOffsets.scalars[5],
  2149. left.scalars[6] << bitOffsets.scalars[6],
  2150. left.scalars[7] << bitOffsets.scalars[7]
  2151. );
  2152. #endif
  2153. }
  2154. inline U16x8 operator>>(const U16x8& left, const U16x8 &bitOffsets) {
  2155. assert(allLanesLesser(bitOffsets, U16x8(16u)));
  2156. #if defined USE_NEON
  2157. //return U16x8(vshrq_u16(left.v, vreinterpretq_s16_u16(bitOffsets.v)));
  2158. return U16x8(vshlq_u16(left.v, vnegq_s16(vreinterpretq_s16_u16(bitOffsets.v))));
  2159. #else
  2160. return U16x8(
  2161. left.scalars[0] >> bitOffsets.scalars[0],
  2162. left.scalars[1] >> bitOffsets.scalars[1],
  2163. left.scalars[2] >> bitOffsets.scalars[2],
  2164. left.scalars[3] >> bitOffsets.scalars[3],
  2165. left.scalars[4] >> bitOffsets.scalars[4],
  2166. left.scalars[5] >> bitOffsets.scalars[5],
  2167. left.scalars[6] >> bitOffsets.scalars[6],
  2168. left.scalars[7] >> bitOffsets.scalars[7]
  2169. );
  2170. #endif
  2171. }
  2172. // bitOffset must be an immediate constant, so a template argument is used.
  2173. template <uint32_t bitOffset>
  2174. inline U16x8 bitShiftLeftImmediate(const U16x8& left) {
  2175. static_assert(bitOffset < 16u, "Immediate left shift of 16-bit values may not shift more than 15 bits!");
  2176. #if defined USE_SSE2
  2177. return U16x8(_mm_slli_epi16(left.v, bitOffset));
  2178. #else
  2179. #if defined USE_NEON
2180. return U16x8(vshlq_u16(left.v, vdupq_n_s16(bitOffset)));
  2181. #else
  2182. return U16x8(
  2183. left.scalars[0] << bitOffset,
  2184. left.scalars[1] << bitOffset,
  2185. left.scalars[2] << bitOffset,
  2186. left.scalars[3] << bitOffset,
  2187. left.scalars[4] << bitOffset,
  2188. left.scalars[5] << bitOffset,
  2189. left.scalars[6] << bitOffset,
  2190. left.scalars[7] << bitOffset
  2191. );
  2192. #endif
  2193. #endif
  2194. }
  2195. // bitOffset must be an immediate constant.
  2196. template <uint32_t bitOffset>
  2197. inline U16x8 bitShiftRightImmediate(const U16x8& left) {
  2198. static_assert(bitOffset < 16u, "Immediate right shift of 16-bit values may not shift more than 15 bits!");
  2199. #if defined USE_SSE2
  2200. return U16x8(_mm_srli_epi16(left.v, bitOffset));
  2201. #else
  2202. #if defined USE_NEON
// NEON has no intrinsic for right shifting by a vector, so shift left by the negated offset instead.
return U16x8(vshlq_u16(left.v, vdupq_n_s16(-(int32_t)bitOffset)));
  2205. #else
  2206. return U16x8(
  2207. left.scalars[0] >> bitOffset,
  2208. left.scalars[1] >> bitOffset,
  2209. left.scalars[2] >> bitOffset,
  2210. left.scalars[3] >> bitOffset,
  2211. left.scalars[4] >> bitOffset,
  2212. left.scalars[5] >> bitOffset,
  2213. left.scalars[6] >> bitOffset,
  2214. left.scalars[7] >> bitOffset
  2215. );
  2216. #endif
  2217. #endif
  2218. }
  2219. inline U8x16 operator<<(const U8x16& left, const U8x16 &bitOffsets) {
  2220. assert(allLanesLesser(bitOffsets, U8x16(8u)));
  2221. #if defined USE_NEON
return U8x16(vshlq_u8(left.v, vreinterpretq_s8_u8(bitOffsets.v)));
  2223. #else
  2224. return U8x16(
  2225. left.scalars[ 0] << bitOffsets.scalars[ 0],
  2226. left.scalars[ 1] << bitOffsets.scalars[ 1],
  2227. left.scalars[ 2] << bitOffsets.scalars[ 2],
  2228. left.scalars[ 3] << bitOffsets.scalars[ 3],
  2229. left.scalars[ 4] << bitOffsets.scalars[ 4],
  2230. left.scalars[ 5] << bitOffsets.scalars[ 5],
  2231. left.scalars[ 6] << bitOffsets.scalars[ 6],
  2232. left.scalars[ 7] << bitOffsets.scalars[ 7],
  2233. left.scalars[ 8] << bitOffsets.scalars[ 8],
  2234. left.scalars[ 9] << bitOffsets.scalars[ 9],
  2235. left.scalars[10] << bitOffsets.scalars[10],
  2236. left.scalars[11] << bitOffsets.scalars[11],
  2237. left.scalars[12] << bitOffsets.scalars[12],
  2238. left.scalars[13] << bitOffsets.scalars[13],
  2239. left.scalars[14] << bitOffsets.scalars[14],
  2240. left.scalars[15] << bitOffsets.scalars[15]
  2241. );
  2242. #endif
  2243. }
  2244. inline U8x16 operator>>(const U8x16& left, const U8x16 &bitOffsets) {
  2245. assert(allLanesLesser(bitOffsets, U8x16(8u)));
  2246. #if defined USE_NEON
// NEON has no intrinsic for right shifting by a vector, so shift left by the negated offsets instead.
return U8x16(vshlq_u8(left.v, vnegq_s8(vreinterpretq_s8_u8(bitOffsets.v))));
  2249. #else
  2250. return U8x16(
  2251. left.scalars[ 0] >> bitOffsets.scalars[ 0],
  2252. left.scalars[ 1] >> bitOffsets.scalars[ 1],
  2253. left.scalars[ 2] >> bitOffsets.scalars[ 2],
  2254. left.scalars[ 3] >> bitOffsets.scalars[ 3],
  2255. left.scalars[ 4] >> bitOffsets.scalars[ 4],
  2256. left.scalars[ 5] >> bitOffsets.scalars[ 5],
  2257. left.scalars[ 6] >> bitOffsets.scalars[ 6],
  2258. left.scalars[ 7] >> bitOffsets.scalars[ 7],
  2259. left.scalars[ 8] >> bitOffsets.scalars[ 8],
  2260. left.scalars[ 9] >> bitOffsets.scalars[ 9],
  2261. left.scalars[10] >> bitOffsets.scalars[10],
  2262. left.scalars[11] >> bitOffsets.scalars[11],
  2263. left.scalars[12] >> bitOffsets.scalars[12],
  2264. left.scalars[13] >> bitOffsets.scalars[13],
  2265. left.scalars[14] >> bitOffsets.scalars[14],
  2266. left.scalars[15] >> bitOffsets.scalars[15]
  2267. );
  2268. #endif
  2269. }
  2270. // bitOffset must be an immediate constant, so a template argument is used.
  2271. template <uint32_t bitOffset>
  2272. inline U8x16 bitShiftLeftImmediate(const U8x16& left) {
  2273. static_assert(bitOffset < 8u, "Immediate left shift of 8-bit values may not shift more than 7 bits!");
  2274. #if defined USE_NEON
return U8x16(vshlq_u8(left.v, vdupq_n_s8(bitOffset)));
  2276. #else
  2277. return U8x16(
  2278. left.scalars[ 0] << bitOffset,
  2279. left.scalars[ 1] << bitOffset,
  2280. left.scalars[ 2] << bitOffset,
  2281. left.scalars[ 3] << bitOffset,
  2282. left.scalars[ 4] << bitOffset,
  2283. left.scalars[ 5] << bitOffset,
  2284. left.scalars[ 6] << bitOffset,
  2285. left.scalars[ 7] << bitOffset,
  2286. left.scalars[ 8] << bitOffset,
  2287. left.scalars[ 9] << bitOffset,
  2288. left.scalars[10] << bitOffset,
  2289. left.scalars[11] << bitOffset,
  2290. left.scalars[12] << bitOffset,
  2291. left.scalars[13] << bitOffset,
  2292. left.scalars[14] << bitOffset,
  2293. left.scalars[15] << bitOffset
  2294. );
  2295. #endif
  2296. }
  2297. // bitOffset must be an immediate constant.
  2298. template <uint32_t bitOffset>
  2299. inline U8x16 bitShiftRightImmediate(const U8x16& left) {
  2300. static_assert(bitOffset < 8u, "Immediate right shift of 8-bit values may not shift more than 7 bits!");
  2301. #if defined USE_NEON
// NEON has no intrinsic for right shifting by a vector, so shift left by the negated offset instead.
return U8x16(vshlq_u8(left.v, vdupq_n_s8(-(int32_t)bitOffset)));
  2304. #else
  2305. return U8x16(
  2306. left.scalars[ 0] >> bitOffset,
  2307. left.scalars[ 1] >> bitOffset,
  2308. left.scalars[ 2] >> bitOffset,
  2309. left.scalars[ 3] >> bitOffset,
  2310. left.scalars[ 4] >> bitOffset,
  2311. left.scalars[ 5] >> bitOffset,
  2312. left.scalars[ 6] >> bitOffset,
  2313. left.scalars[ 7] >> bitOffset,
  2314. left.scalars[ 8] >> bitOffset,
  2315. left.scalars[ 9] >> bitOffset,
  2316. left.scalars[10] >> bitOffset,
  2317. left.scalars[11] >> bitOffset,
  2318. left.scalars[12] >> bitOffset,
  2319. left.scalars[13] >> bitOffset,
  2320. left.scalars[14] >> bitOffset,
  2321. left.scalars[15] >> bitOffset
  2322. );
  2323. #endif
  2324. }
  2325. inline U16x8 operator+(const U16x8& left, const U16x8& right) {
  2326. #if defined USE_BASIC_SIMD
  2327. return U16x8(ADD_U16_SIMD(left.v, right.v));
  2328. #else
  2329. return U16x8(left.scalars[0] + right.scalars[0], left.scalars[1] + right.scalars[1], left.scalars[2] + right.scalars[2], left.scalars[3] + right.scalars[3],
  2330. left.scalars[4] + right.scalars[4], left.scalars[5] + right.scalars[5], left.scalars[6] + right.scalars[6], left.scalars[7] + right.scalars[7]);
  2331. #endif
  2332. }
  2333. inline U16x8 operator-(const U16x8& left, const U16x8& right) {
  2334. #if defined USE_BASIC_SIMD
  2335. return U16x8(SUB_U16_SIMD(left.v, right.v));
  2336. #else
  2337. return U16x8(left.scalars[0] - right.scalars[0], left.scalars[1] - right.scalars[1], left.scalars[2] - right.scalars[2], left.scalars[3] - right.scalars[3],
  2338. left.scalars[4] - right.scalars[4], left.scalars[5] - right.scalars[5], left.scalars[6] - right.scalars[6], left.scalars[7] - right.scalars[7]);
  2339. #endif
  2340. }
  2341. inline U16x8 operator*(const U16x8& left, const U16x8& right) {
  2342. #if defined USE_BASIC_SIMD
  2343. return U16x8(MUL_U16_SIMD(left.v, right.v));
  2344. #else
  2345. return U16x8(left.scalars[0] * right.scalars[0], left.scalars[1] * right.scalars[1], left.scalars[2] * right.scalars[2], left.scalars[3] * right.scalars[3],
  2346. left.scalars[4] * right.scalars[4], left.scalars[5] * right.scalars[5], left.scalars[6] * right.scalars[6], left.scalars[7] * right.scalars[7]);
  2347. #endif
  2348. }
  2349. inline U8x16 operator+(const U8x16& left, const U8x16& right) {
  2350. #if defined USE_BASIC_SIMD
  2351. return U8x16(ADD_U8_SIMD(left.v, right.v));
  2352. #else
  2353. return U8x16(
  2354. left.scalars[0] + right.scalars[0],
  2355. left.scalars[1] + right.scalars[1],
  2356. left.scalars[2] + right.scalars[2],
  2357. left.scalars[3] + right.scalars[3],
  2358. left.scalars[4] + right.scalars[4],
  2359. left.scalars[5] + right.scalars[5],
  2360. left.scalars[6] + right.scalars[6],
  2361. left.scalars[7] + right.scalars[7],
  2362. left.scalars[8] + right.scalars[8],
  2363. left.scalars[9] + right.scalars[9],
  2364. left.scalars[10] + right.scalars[10],
  2365. left.scalars[11] + right.scalars[11],
  2366. left.scalars[12] + right.scalars[12],
  2367. left.scalars[13] + right.scalars[13],
  2368. left.scalars[14] + right.scalars[14],
  2369. left.scalars[15] + right.scalars[15]
  2370. );
  2371. #endif
  2372. }
  2373. inline U8x16 operator-(const U8x16& left, const U8x16& right) {
  2374. #if defined USE_BASIC_SIMD
  2375. return U8x16(SUB_U8_SIMD(left.v, right.v));
  2376. #else
  2377. return U8x16(
  2378. left.scalars[0] - right.scalars[0],
  2379. left.scalars[1] - right.scalars[1],
  2380. left.scalars[2] - right.scalars[2],
  2381. left.scalars[3] - right.scalars[3],
  2382. left.scalars[4] - right.scalars[4],
  2383. left.scalars[5] - right.scalars[5],
  2384. left.scalars[6] - right.scalars[6],
  2385. left.scalars[7] - right.scalars[7],
  2386. left.scalars[8] - right.scalars[8],
  2387. left.scalars[9] - right.scalars[9],
  2388. left.scalars[10] - right.scalars[10],
  2389. left.scalars[11] - right.scalars[11],
  2390. left.scalars[12] - right.scalars[12],
  2391. left.scalars[13] - right.scalars[13],
  2392. left.scalars[14] - right.scalars[14],
  2393. left.scalars[15] - right.scalars[15]
  2394. );
  2395. #endif
  2396. }
  2397. inline uint8_t impl_limit0(int32_t x) { return x < 0 ? 0 : x; }
  2398. inline uint8_t impl_limit255(uint32_t x) { return x > 255 ? 255 : x; }
  2399. inline U8x16 saturatedAddition(const U8x16& left, const U8x16& right) {
  2400. #if defined USE_BASIC_SIMD
  2401. return U8x16(ADD_SAT_U8_SIMD(left.v, right.v));
  2402. #else
  2403. return U8x16(
  2404. impl_limit255((uint32_t)left.scalars[0] + (uint32_t)right.scalars[0]),
  2405. impl_limit255((uint32_t)left.scalars[1] + (uint32_t)right.scalars[1]),
  2406. impl_limit255((uint32_t)left.scalars[2] + (uint32_t)right.scalars[2]),
  2407. impl_limit255((uint32_t)left.scalars[3] + (uint32_t)right.scalars[3]),
  2408. impl_limit255((uint32_t)left.scalars[4] + (uint32_t)right.scalars[4]),
  2409. impl_limit255((uint32_t)left.scalars[5] + (uint32_t)right.scalars[5]),
  2410. impl_limit255((uint32_t)left.scalars[6] + (uint32_t)right.scalars[6]),
  2411. impl_limit255((uint32_t)left.scalars[7] + (uint32_t)right.scalars[7]),
  2412. impl_limit255((uint32_t)left.scalars[8] + (uint32_t)right.scalars[8]),
  2413. impl_limit255((uint32_t)left.scalars[9] + (uint32_t)right.scalars[9]),
  2414. impl_limit255((uint32_t)left.scalars[10] + (uint32_t)right.scalars[10]),
  2415. impl_limit255((uint32_t)left.scalars[11] + (uint32_t)right.scalars[11]),
  2416. impl_limit255((uint32_t)left.scalars[12] + (uint32_t)right.scalars[12]),
  2417. impl_limit255((uint32_t)left.scalars[13] + (uint32_t)right.scalars[13]),
  2418. impl_limit255((uint32_t)left.scalars[14] + (uint32_t)right.scalars[14]),
  2419. impl_limit255((uint32_t)left.scalars[15] + (uint32_t)right.scalars[15])
  2420. );
  2421. #endif
  2422. }
  2423. inline U8x16 saturatedSubtraction(const U8x16& left, const U8x16& right) {
  2424. #if defined USE_BASIC_SIMD
  2425. return U8x16(SUB_SAT_U8_SIMD(left.v, right.v));
  2426. #else
  2427. return U8x16(
  2428. impl_limit0((int32_t)left.scalars[0] - (int32_t)right.scalars[0]),
  2429. impl_limit0((int32_t)left.scalars[1] - (int32_t)right.scalars[1]),
  2430. impl_limit0((int32_t)left.scalars[2] - (int32_t)right.scalars[2]),
  2431. impl_limit0((int32_t)left.scalars[3] - (int32_t)right.scalars[3]),
  2432. impl_limit0((int32_t)left.scalars[4] - (int32_t)right.scalars[4]),
  2433. impl_limit0((int32_t)left.scalars[5] - (int32_t)right.scalars[5]),
  2434. impl_limit0((int32_t)left.scalars[6] - (int32_t)right.scalars[6]),
  2435. impl_limit0((int32_t)left.scalars[7] - (int32_t)right.scalars[7]),
  2436. impl_limit0((int32_t)left.scalars[8] - (int32_t)right.scalars[8]),
  2437. impl_limit0((int32_t)left.scalars[9] - (int32_t)right.scalars[9]),
  2438. impl_limit0((int32_t)left.scalars[10] - (int32_t)right.scalars[10]),
  2439. impl_limit0((int32_t)left.scalars[11] - (int32_t)right.scalars[11]),
  2440. impl_limit0((int32_t)left.scalars[12] - (int32_t)right.scalars[12]),
  2441. impl_limit0((int32_t)left.scalars[13] - (int32_t)right.scalars[13]),
  2442. impl_limit0((int32_t)left.scalars[14] - (int32_t)right.scalars[14]),
  2443. impl_limit0((int32_t)left.scalars[15] - (int32_t)right.scalars[15])
  2444. );
  2445. #endif
  2446. }
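// Usage sketch for saturated arithmetic (assuming pixels is an existing U8x16): results are clamped to 0..255 instead of wrapping around.
//   U8x16 brighter = saturatedAddition(pixels, U8x16(40u));    // A lane holding 250 becomes 255, not 34.
//   U8x16 darker   = saturatedSubtraction(pixels, U8x16(40u)); // A lane holding 10 becomes 0, not 226.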
  2447. inline I32x4 truncateToI32(const F32x4& vector) {
  2448. #if defined USE_BASIC_SIMD
  2449. return I32x4(F32_TO_I32_SIMD(vector.v));
  2450. #else
  2451. return I32x4((int32_t)vector.scalars[0], (int32_t)vector.scalars[1], (int32_t)vector.scalars[2], (int32_t)vector.scalars[3]);
  2452. #endif
  2453. }
  2454. inline U32x4 truncateToU32(const F32x4& vector) {
  2455. #if defined USE_BASIC_SIMD
  2456. return U32x4(F32_TO_U32_SIMD(vector.v));
  2457. #else
  2458. return U32x4((uint32_t)vector.scalars[0], (uint32_t)vector.scalars[1], (uint32_t)vector.scalars[2], (uint32_t)vector.scalars[3]);
  2459. #endif
  2460. }
  2461. inline F32x4 floatFromI32(const I32x4& vector) {
  2462. #if defined USE_BASIC_SIMD
  2463. return F32x4(I32_TO_F32_SIMD(vector.v));
  2464. #else
  2465. return F32x4((float)vector.scalars[0], (float)vector.scalars[1], (float)vector.scalars[2], (float)vector.scalars[3]);
  2466. #endif
  2467. }
  2468. inline F32x4 floatFromU32(const U32x4& vector) {
  2469. #if defined USE_BASIC_SIMD
  2470. return F32x4(U32_TO_F32_SIMD(vector.v));
  2471. #else
  2472. return F32x4((float)vector.scalars[0], (float)vector.scalars[1], (float)vector.scalars[2], (float)vector.scalars[3]);
  2473. #endif
  2474. }
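// Usage sketch for the conversions above (hypothetical values): truncation rounds toward zero.
//   I32x4 truncated = truncateToI32(F32x4(3.7f, -2.5f, 0.9f, -0.1f)); // 3, -2, 0, 0
//   F32x4 restored  = floatFromI32(truncated);                        // 3.0f, -2.0f, 0.0f, 0.0f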
  2475. inline I32x4 I32FromU32(const U32x4& vector) {
  2476. #if defined USE_BASIC_SIMD
  2477. return I32x4(REINTERPRET_U32_TO_I32_SIMD(vector.v));
  2478. #else
  2479. return I32x4((int32_t)vector.scalars[0], (int32_t)vector.scalars[1], (int32_t)vector.scalars[2], (int32_t)vector.scalars[3]);
  2480. #endif
  2481. }
  2482. inline U32x4 U32FromI32(const I32x4& vector) {
  2483. #if defined USE_BASIC_SIMD
  2484. return U32x4(REINTERPRET_I32_TO_U32_SIMD(vector.v));
  2485. #else
  2486. return U32x4((uint32_t)vector.scalars[0], (uint32_t)vector.scalars[1], (uint32_t)vector.scalars[2], (uint32_t)vector.scalars[3]);
  2487. #endif
  2488. }
  2489. // Warning! Behavior depends on endianness.
  2490. inline U8x16 reinterpret_U8FromU32(const U32x4& vector) {
  2491. #if defined USE_BASIC_SIMD
  2492. return U8x16(REINTERPRET_U32_TO_U8_SIMD(vector.v));
  2493. #else
  2494. const uint8_t *source = (const uint8_t*)vector.scalars;
  2495. return U8x16(
  2496. source[0], source[1], source[2], source[3], source[4], source[5], source[6], source[7],
  2497. source[8], source[9], source[10], source[11], source[12], source[13], source[14], source[15]
  2498. );
  2499. #endif
  2500. }
  2501. // Warning! Behavior depends on endianness.
  2502. inline U32x4 reinterpret_U32FromU8(const U8x16& vector) {
  2503. #if defined USE_BASIC_SIMD
  2504. return U32x4(REINTERPRET_U8_TO_U32_SIMD(vector.v));
  2505. #else
  2506. const uint32_t *source = (const uint32_t*)vector.scalars;
  2507. return U32x4(source[0], source[1], source[2], source[3]);
  2508. #endif
  2509. }
  2510. // Warning! Behavior depends on endianness.
  2511. inline U16x8 reinterpret_U16FromU32(const U32x4& vector) {
  2512. #if defined USE_BASIC_SIMD
  2513. return U16x8(REINTERPRET_U32_TO_U16_SIMD(vector.v));
  2514. #else
  2515. const uint16_t *source = (const uint16_t*)vector.scalars;
  2516. return U16x8(
  2517. source[0], source[1], source[2], source[3], source[4], source[5], source[6], source[7]
  2518. );
  2519. #endif
  2520. }
  2521. // Warning! Behavior depends on endianness.
  2522. inline U32x4 reinterpret_U32FromU16(const U16x8& vector) {
  2523. #if defined USE_BASIC_SIMD
  2524. return U32x4(REINTERPRET_U16_TO_U32_SIMD(vector.v));
  2525. #else
  2526. const uint32_t *source = (const uint32_t*)vector.scalars;
  2527. return U32x4(source[0], source[1], source[2], source[3]);
  2528. #endif
  2529. }
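// Usage sketch for the reinterpret casts above (hypothetical values): the bytes are kept as they are,
// so the result depends on the machine's byte order. On a little-endian machine:
//   U8x16 bytes = reinterpret_U8FromU32(U32x4(0x04030201u, 0x08070605u, 0x0C0B0A09u, 0x100F0E0Du));
//   // bytes holds 1, 2, 3, ..., 16 in order on little-endian targets.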
  2530. // Unpacking to larger integers
  2531. inline U32x4 lowerToU32(const U16x8& vector) {
  2532. #if defined USE_BASIC_SIMD
  2533. return U32x4(U16_LOW_TO_U32_SIMD(vector.v));
  2534. #else
  2535. return U32x4(vector.scalars[0], vector.scalars[1], vector.scalars[2], vector.scalars[3]);
  2536. #endif
  2537. }
  2538. inline U32x4 higherToU32(const U16x8& vector) {
  2539. #if defined USE_BASIC_SIMD
  2540. return U32x4(U16_HIGH_TO_U32_SIMD(vector.v));
  2541. #else
  2542. return U32x4(vector.scalars[4], vector.scalars[5], vector.scalars[6], vector.scalars[7]);
  2543. #endif
  2544. }
  2545. inline U16x8 lowerToU16(const U8x16& vector) {
  2546. #if defined USE_BASIC_SIMD
  2547. return U16x8(U8_LOW_TO_U16_SIMD(vector.v));
  2548. #else
  2549. return U16x8(
  2550. vector.scalars[0], vector.scalars[1], vector.scalars[2], vector.scalars[3],
  2551. vector.scalars[4], vector.scalars[5], vector.scalars[6], vector.scalars[7]
  2552. );
  2553. #endif
  2554. }
  2555. inline U16x8 higherToU16(const U8x16& vector) {
  2556. #if defined USE_BASIC_SIMD
  2557. return U16x8(U8_HIGH_TO_U16_SIMD(vector.v));
  2558. #else
  2559. return U16x8(
  2560. vector.scalars[8], vector.scalars[9], vector.scalars[10], vector.scalars[11],
  2561. vector.scalars[12], vector.scalars[13], vector.scalars[14], vector.scalars[15]
  2562. );
  2563. #endif
  2564. }
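// Usage sketch for unpacking (assuming bytes is an existing U8x16): each half is zero-extended into a wider vector.
//   U16x8 lowHalf  = lowerToU16(bytes);  // The first 8 of the 16 bytes, as 16-bit lanes.
//   U16x8 highHalf = higherToU16(bytes); // The last 8 of the 16 bytes, as 16-bit lanes.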
  2565. // Saturated packing
  2566. inline U8x16 saturateToU8(const U16x8& lower, const U16x8& upper) {
  2567. #if defined USE_BASIC_SIMD
  2568. return U8x16(PACK_SAT_U16_TO_U8_SIMD(lower.v, upper.v));
  2569. #else
  2570. return U8x16(
  2571. impl_limit255(lower.scalars[0]),
  2572. impl_limit255(lower.scalars[1]),
  2573. impl_limit255(lower.scalars[2]),
  2574. impl_limit255(lower.scalars[3]),
  2575. impl_limit255(lower.scalars[4]),
  2576. impl_limit255(lower.scalars[5]),
  2577. impl_limit255(lower.scalars[6]),
  2578. impl_limit255(lower.scalars[7]),
  2579. impl_limit255(upper.scalars[0]),
  2580. impl_limit255(upper.scalars[1]),
  2581. impl_limit255(upper.scalars[2]),
  2582. impl_limit255(upper.scalars[3]),
  2583. impl_limit255(upper.scalars[4]),
  2584. impl_limit255(upper.scalars[5]),
  2585. impl_limit255(upper.scalars[6]),
  2586. impl_limit255(upper.scalars[7])
  2587. );
  2588. #endif
  2589. }
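// Usage sketch for saturated packing (hypothetical values): 16-bit lanes above 255 are clamped before packing.
//   U8x16 packed = saturateToU8(U16x8(300u), U16x8(7u)); // The first 8 bytes become 255 and the last 8 become 7.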
// Unary negation for convenience and code readability.
// Before using unary negation, check whether:
// * An addition can be turned into a subtraction instead:
//     x = -a + b  -->  x = b - a
// * A multiplying constant or scalar can be negated instead:
//     x = -b * 2  -->  x = b * -2
  2598. inline F32x4 operator-(const F32x4& value) {
  2599. #if defined USE_BASIC_SIMD
  2600. return F32x4(0.0f) - value;
  2601. #else
  2602. return F32x4(-value.scalars[0], -value.scalars[1], -value.scalars[2], -value.scalars[3]);
  2603. #endif
  2604. }
  2605. inline I32x4 operator-(const I32x4& value) {
  2606. #if defined USE_BASIC_SIMD
  2607. return I32x4(0) - value;
  2608. #else
  2609. return I32x4(-value.scalars[0], -value.scalars[1], -value.scalars[2], -value.scalars[3]);
  2610. #endif
  2611. }
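// Usage sketch (hypothetical F32x4 values a and b): prefer the rewritten forms from the comment above.
//   F32x4 x = b - a; // Equivalent to -a + b, but avoids the extra negation.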
  2612. // Helper macros for generating the vector extract functions.
  2613. // Having one function for each type and offset makes sure that the compiler gets an immediate integer within the valid range.
  2614. #if defined USE_BASIC_SIMD
  2615. #if defined USE_SSE2
  2616. #if defined USE_SSSE3
  2617. #define _MM_ALIGNR_EPI8(A, B, OFFSET) _mm_alignr_epi8(A, B, OFFSET)
  2618. #else
  2619. // If SSSE3 is not used, emulate it using stack memory and unaligned reading of data.
  2620. static inline SIMD_U8x16 _MM_ALIGNR_EPI8(SIMD_U8x16 a, SIMD_U8x16 b, int offset) {
  2621. ALIGN16 uint8_t vectorBuffer[32];
  2622. _mm_store_si128((SIMD_U8x16*)(vectorBuffer), b);
  2623. _mm_store_si128((SIMD_U8x16*)(vectorBuffer + 16), a);
  2624. return _mm_loadu_si128((SIMD_U8x16*)(vectorBuffer + offset));
  2625. }
  2626. #endif
  2627. #define VECTOR_EXTRACT_GENERATOR_U8(OFFSET, FALLBACK_RESULT) return U8x16(_MM_ALIGNR_EPI8(b.v, a.v, OFFSET));
  2628. #define VECTOR_EXTRACT_GENERATOR_U16(OFFSET, FALLBACK_RESULT) return U16x8(_MM_ALIGNR_EPI8(b.v, a.v, OFFSET * 2));
  2629. #define VECTOR_EXTRACT_GENERATOR_U32(OFFSET, FALLBACK_RESULT) return U32x4(_MM_ALIGNR_EPI8(b.v, a.v, OFFSET * 4));
  2630. #define VECTOR_EXTRACT_GENERATOR_I32(OFFSET, FALLBACK_RESULT) return I32x4(_MM_ALIGNR_EPI8(b.v, a.v, OFFSET * 4));
  2631. #define VECTOR_EXTRACT_GENERATOR_F32(OFFSET, FALLBACK_RESULT) return F32x4(SIMD_F32x4(_MM_ALIGNR_EPI8(SIMD_U32x4(b.v), SIMD_U32x4(a.v), OFFSET * 4)));
  2632. #elif defined USE_NEON
  2633. #define VECTOR_EXTRACT_GENERATOR_U8(OFFSET, FALLBACK_RESULT) return U8x16(vextq_u8(a.v, b.v, OFFSET));
  2634. #define VECTOR_EXTRACT_GENERATOR_U16(OFFSET, FALLBACK_RESULT) return U16x8(vextq_u16(a.v, b.v, OFFSET));
  2635. #define VECTOR_EXTRACT_GENERATOR_U32(OFFSET, FALLBACK_RESULT) return U32x4(vextq_u32(a.v, b.v, OFFSET));
  2636. #define VECTOR_EXTRACT_GENERATOR_I32(OFFSET, FALLBACK_RESULT) return I32x4(vextq_s32(a.v, b.v, OFFSET));
  2637. #define VECTOR_EXTRACT_GENERATOR_F32(OFFSET, FALLBACK_RESULT) return F32x4(vextq_f32(a.v, b.v, OFFSET));
  2638. #endif
  2639. #else
  2640. #define VECTOR_EXTRACT_GENERATOR_U8(OFFSET, FALLBACK_RESULT) return FALLBACK_RESULT;
  2641. #define VECTOR_EXTRACT_GENERATOR_U16(OFFSET, FALLBACK_RESULT) return FALLBACK_RESULT;
  2642. #define VECTOR_EXTRACT_GENERATOR_U32(OFFSET, FALLBACK_RESULT) return FALLBACK_RESULT;
  2643. #define VECTOR_EXTRACT_GENERATOR_I32(OFFSET, FALLBACK_RESULT) return FALLBACK_RESULT;
  2644. #define VECTOR_EXTRACT_GENERATOR_F32(OFFSET, FALLBACK_RESULT) return FALLBACK_RESULT;
  2645. #endif
// Vector extraction concatenates two input vectors and reads a full vector from them at the given element offset.
// The first and last offsets, which simply return one of the inputs, exist for readability; they are inlined and optimized away by the compiler.
// To get elements from the right side, combine the center vector with the right vector and shift one element to the left using vectorExtract_1 for the given type.
// To get elements from the left side, combine the left vector with the center vector and shift one element to the right using vectorExtract_15 for 16 lanes, vectorExtract_7 for 8 lanes, or vectorExtract_3 for 4 lanes.
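// Usage sketch (hypothetical U32x4 vectors leftV, centerV, rightV loaded from adjacent memory):
//   U32x4 previous = vectorExtract_3(leftV, centerV);  // Each lane's left neighbor.
//   U32x4 next     = vectorExtract_1(centerV, rightV); // Each lane's right neighbor.
// This is useful for sliding-window filters that need neighboring elements without unaligned loads.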
  2650. U8x16 inline vectorExtract_0(const U8x16 &a, const U8x16 &b) { return a; }
  2651. U8x16 inline vectorExtract_1(const U8x16 &a, const U8x16 &b) { VECTOR_EXTRACT_GENERATOR_U8(1, U8x16(a.scalars[1], a.scalars[2], a.scalars[3], a.scalars[4], a.scalars[5], a.scalars[6], a.scalars[7], a.scalars[8], a.scalars[9], a.scalars[10], a.scalars[11], a.scalars[12], a.scalars[13], a.scalars[14], a.scalars[15], b.scalars[0])) }
  2652. U8x16 inline vectorExtract_2(const U8x16 &a, const U8x16 &b) { VECTOR_EXTRACT_GENERATOR_U8(2, U8x16(a.scalars[2], a.scalars[3], a.scalars[4], a.scalars[5], a.scalars[6], a.scalars[7], a.scalars[8], a.scalars[9], a.scalars[10], a.scalars[11], a.scalars[12], a.scalars[13], a.scalars[14], a.scalars[15], b.scalars[0], b.scalars[1])) }
  2653. U8x16 inline vectorExtract_3(const U8x16 &a, const U8x16 &b) { VECTOR_EXTRACT_GENERATOR_U8(3, U8x16(a.scalars[3], a.scalars[4], a.scalars[5], a.scalars[6], a.scalars[7], a.scalars[8], a.scalars[9], a.scalars[10], a.scalars[11], a.scalars[12], a.scalars[13], a.scalars[14], a.scalars[15], b.scalars[0], b.scalars[1], b.scalars[2])) }
  2654. U8x16 inline vectorExtract_4(const U8x16 &a, const U8x16 &b) { VECTOR_EXTRACT_GENERATOR_U8(4, U8x16(a.scalars[4], a.scalars[5], a.scalars[6], a.scalars[7], a.scalars[8], a.scalars[9], a.scalars[10], a.scalars[11], a.scalars[12], a.scalars[13], a.scalars[14], a.scalars[15], b.scalars[0], b.scalars[1], b.scalars[2], b.scalars[3])) }
  2655. U8x16 inline vectorExtract_5(const U8x16 &a, const U8x16 &b) { VECTOR_EXTRACT_GENERATOR_U8(5, U8x16(a.scalars[5], a.scalars[6], a.scalars[7], a.scalars[8], a.scalars[9], a.scalars[10], a.scalars[11], a.scalars[12], a.scalars[13], a.scalars[14], a.scalars[15], b.scalars[0], b.scalars[1], b.scalars[2], b.scalars[3], b.scalars[4])) }
  2656. U8x16 inline vectorExtract_6(const U8x16 &a, const U8x16 &b) { VECTOR_EXTRACT_GENERATOR_U8(6, U8x16(a.scalars[6], a.scalars[7], a.scalars[8], a.scalars[9], a.scalars[10], a.scalars[11], a.scalars[12], a.scalars[13], a.scalars[14], a.scalars[15], b.scalars[0], b.scalars[1], b.scalars[2], b.scalars[3], b.scalars[4], b.scalars[5])) }
  2657. U8x16 inline vectorExtract_7(const U8x16 &a, const U8x16 &b) { VECTOR_EXTRACT_GENERATOR_U8(7, U8x16(a.scalars[7], a.scalars[8], a.scalars[9], a.scalars[10], a.scalars[11], a.scalars[12], a.scalars[13], a.scalars[14], a.scalars[15], b.scalars[0], b.scalars[1], b.scalars[2], b.scalars[3], b.scalars[4], b.scalars[5], b.scalars[6])) }
  2658. U8x16 inline vectorExtract_8(const U8x16 &a, const U8x16 &b) { VECTOR_EXTRACT_GENERATOR_U8(8, U8x16(a.scalars[8], a.scalars[9], a.scalars[10], a.scalars[11], a.scalars[12], a.scalars[13], a.scalars[14], a.scalars[15], b.scalars[0], b.scalars[1], b.scalars[2], b.scalars[3], b.scalars[4], b.scalars[5], b.scalars[6], b.scalars[7])) }
  2659. U8x16 inline vectorExtract_9(const U8x16 &a, const U8x16 &b) { VECTOR_EXTRACT_GENERATOR_U8(9, U8x16(a.scalars[9], a.scalars[10], a.scalars[11], a.scalars[12], a.scalars[13], a.scalars[14], a.scalars[15], b.scalars[0], b.scalars[1], b.scalars[2], b.scalars[3], b.scalars[4], b.scalars[5], b.scalars[6], b.scalars[7], b.scalars[8])) }
  2660. U8x16 inline vectorExtract_10(const U8x16 &a, const U8x16 &b) { VECTOR_EXTRACT_GENERATOR_U8(10, U8x16(a.scalars[10], a.scalars[11], a.scalars[12], a.scalars[13], a.scalars[14], a.scalars[15], b.scalars[0], b.scalars[1], b.scalars[2], b.scalars[3], b.scalars[4], b.scalars[5], b.scalars[6], b.scalars[7], b.scalars[8], b.scalars[9])) }
  2661. U8x16 inline vectorExtract_11(const U8x16 &a, const U8x16 &b) { VECTOR_EXTRACT_GENERATOR_U8(11, U8x16(a.scalars[11], a.scalars[12], a.scalars[13], a.scalars[14], a.scalars[15], b.scalars[0], b.scalars[1], b.scalars[2], b.scalars[3], b.scalars[4], b.scalars[5], b.scalars[6], b.scalars[7], b.scalars[8], b.scalars[9], b.scalars[10])) }
  2662. U8x16 inline vectorExtract_12(const U8x16 &a, const U8x16 &b) { VECTOR_EXTRACT_GENERATOR_U8(12, U8x16(a.scalars[12], a.scalars[13], a.scalars[14], a.scalars[15], b.scalars[0], b.scalars[1], b.scalars[2], b.scalars[3], b.scalars[4], b.scalars[5], b.scalars[6], b.scalars[7], b.scalars[8], b.scalars[9], b.scalars[10], b.scalars[11])) }
  2663. U8x16 inline vectorExtract_13(const U8x16 &a, const U8x16 &b) { VECTOR_EXTRACT_GENERATOR_U8(13, U8x16(a.scalars[13], a.scalars[14], a.scalars[15], b.scalars[0], b.scalars[1], b.scalars[2], b.scalars[3], b.scalars[4], b.scalars[5], b.scalars[6], b.scalars[7], b.scalars[8], b.scalars[9], b.scalars[10], b.scalars[11], b.scalars[12])) }
  2664. U8x16 inline vectorExtract_14(const U8x16 &a, const U8x16 &b) { VECTOR_EXTRACT_GENERATOR_U8(14, U8x16(a.scalars[14], a.scalars[15], b.scalars[0], b.scalars[1], b.scalars[2], b.scalars[3], b.scalars[4], b.scalars[5], b.scalars[6], b.scalars[7], b.scalars[8], b.scalars[9], b.scalars[10], b.scalars[11], b.scalars[12], b.scalars[13])) }
  2665. U8x16 inline vectorExtract_15(const U8x16 &a, const U8x16 &b) { VECTOR_EXTRACT_GENERATOR_U8(15, U8x16(a.scalars[15], b.scalars[0], b.scalars[1], b.scalars[2], b.scalars[3], b.scalars[4], b.scalars[5], b.scalars[6], b.scalars[7], b.scalars[8], b.scalars[9], b.scalars[10], b.scalars[11], b.scalars[12], b.scalars[13], b.scalars[14])) }
  2666. U8x16 inline vectorExtract_16(const U8x16 &a, const U8x16 &b) { return b; }
  2667. U16x8 inline vectorExtract_0(const U16x8 &a, const U16x8 &b) { return a; }
  2668. U16x8 inline vectorExtract_1(const U16x8 &a, const U16x8 &b) { VECTOR_EXTRACT_GENERATOR_U16(1, U16x8(a.scalars[1], a.scalars[2], a.scalars[3], a.scalars[4], a.scalars[5], a.scalars[6], a.scalars[7], b.scalars[0])) }
  2669. U16x8 inline vectorExtract_2(const U16x8 &a, const U16x8 &b) { VECTOR_EXTRACT_GENERATOR_U16(2, U16x8(a.scalars[2], a.scalars[3], a.scalars[4], a.scalars[5], a.scalars[6], a.scalars[7], b.scalars[0], b.scalars[1])) }
  2670. U16x8 inline vectorExtract_3(const U16x8 &a, const U16x8 &b) { VECTOR_EXTRACT_GENERATOR_U16(3, U16x8(a.scalars[3], a.scalars[4], a.scalars[5], a.scalars[6], a.scalars[7], b.scalars[0], b.scalars[1], b.scalars[2])) }
  2671. U16x8 inline vectorExtract_4(const U16x8 &a, const U16x8 &b) { VECTOR_EXTRACT_GENERATOR_U16(4, U16x8(a.scalars[4], a.scalars[5], a.scalars[6], a.scalars[7], b.scalars[0], b.scalars[1], b.scalars[2], b.scalars[3])) }
  2672. U16x8 inline vectorExtract_5(const U16x8 &a, const U16x8 &b) { VECTOR_EXTRACT_GENERATOR_U16(5, U16x8(a.scalars[5], a.scalars[6], a.scalars[7], b.scalars[0], b.scalars[1], b.scalars[2], b.scalars[3], b.scalars[4])) }
  2673. U16x8 inline vectorExtract_6(const U16x8 &a, const U16x8 &b) { VECTOR_EXTRACT_GENERATOR_U16(6, U16x8(a.scalars[6], a.scalars[7], b.scalars[0], b.scalars[1], b.scalars[2], b.scalars[3], b.scalars[4], b.scalars[5])) }
  2674. U16x8 inline vectorExtract_7(const U16x8 &a, const U16x8 &b) { VECTOR_EXTRACT_GENERATOR_U16(7, U16x8(a.scalars[7], b.scalars[0], b.scalars[1], b.scalars[2], b.scalars[3], b.scalars[4], b.scalars[5], b.scalars[6])) }
  2675. U16x8 inline vectorExtract_8(const U16x8 &a, const U16x8 &b) { return b; }
  2676. U32x4 inline vectorExtract_0(const U32x4 &a, const U32x4 &b) { return a; }
  2677. U32x4 inline vectorExtract_1(const U32x4 &a, const U32x4 &b) { VECTOR_EXTRACT_GENERATOR_U32(1, U32x4(a.scalars[1], a.scalars[2], a.scalars[3], b.scalars[0])) }
  2678. U32x4 inline vectorExtract_2(const U32x4 &a, const U32x4 &b) { VECTOR_EXTRACT_GENERATOR_U32(2, U32x4(a.scalars[2], a.scalars[3], b.scalars[0], b.scalars[1])) }
  2679. U32x4 inline vectorExtract_3(const U32x4 &a, const U32x4 &b) { VECTOR_EXTRACT_GENERATOR_U32(3, U32x4(a.scalars[3], b.scalars[0], b.scalars[1], b.scalars[2])) }
  2680. U32x4 inline vectorExtract_4(const U32x4 &a, const U32x4 &b) { return b; }
  2681. I32x4 inline vectorExtract_0(const I32x4 &a, const I32x4 &b) { return a; }
  2682. I32x4 inline vectorExtract_1(const I32x4 &a, const I32x4 &b) { VECTOR_EXTRACT_GENERATOR_I32(1, I32x4(a.scalars[1], a.scalars[2], a.scalars[3], b.scalars[0])) }
  2683. I32x4 inline vectorExtract_2(const I32x4 &a, const I32x4 &b) { VECTOR_EXTRACT_GENERATOR_I32(2, I32x4(a.scalars[2], a.scalars[3], b.scalars[0], b.scalars[1])) }
  2684. I32x4 inline vectorExtract_3(const I32x4 &a, const I32x4 &b) { VECTOR_EXTRACT_GENERATOR_I32(3, I32x4(a.scalars[3], b.scalars[0], b.scalars[1], b.scalars[2])) }
  2685. I32x4 inline vectorExtract_4(const I32x4 &a, const I32x4 &b) { return b; }
  2686. F32x4 inline vectorExtract_0(const F32x4 &a, const F32x4 &b) { return a; }
  2687. F32x4 inline vectorExtract_1(const F32x4 &a, const F32x4 &b) { VECTOR_EXTRACT_GENERATOR_F32(1, F32x4(a.scalars[1], a.scalars[2], a.scalars[3], b.scalars[0])) }
  2688. F32x4 inline vectorExtract_2(const F32x4 &a, const F32x4 &b) { VECTOR_EXTRACT_GENERATOR_F32(2, F32x4(a.scalars[2], a.scalars[3], b.scalars[0], b.scalars[1])) }
  2689. F32x4 inline vectorExtract_3(const F32x4 &a, const F32x4 &b) { VECTOR_EXTRACT_GENERATOR_F32(3, F32x4(a.scalars[3], b.scalars[0], b.scalars[1], b.scalars[2])) }
  2690. F32x4 inline vectorExtract_4(const F32x4 &a, const F32x4 &b) { return b; }
// Gather instructions load memory from a pointer at multiple element offsets at the same time.
// The given pointers should be aligned to 4 bytes, so that the fallback solution works on machines with strict alignment requirements.
  2693. #if defined USE_AVX2
  2694. #define GATHER_I32x4_AVX2(SOURCE, FOUR_OFFSETS, SCALE) _mm_i32gather_epi32((const int32_t*)(SOURCE), FOUR_OFFSETS, SCALE)
  2695. #define GATHER_U32x4_AVX2(SOURCE, FOUR_OFFSETS, SCALE) _mm_i32gather_epi32((const int32_t*)(SOURCE), FOUR_OFFSETS, SCALE)
  2696. #define GATHER_F32x4_AVX2(SOURCE, FOUR_OFFSETS, SCALE) _mm_i32gather_ps((const float*)(SOURCE), FOUR_OFFSETS, SCALE)
  2697. #endif
  2698. static inline U32x4 gather_U32(dsr::SafePointer<const uint32_t> data, const U32x4 &elementOffset) {
  2699. #if defined USE_AVX2
  2700. // TODO: Implement safety checks for debug mode.
  2701. return U32x4(GATHER_U32x4_AVX2(data.getUnsafe(), elementOffset.v, 4));
  2702. #else
  2703. ALIGN16 uint32_t elementOffsets[4];
  2704. elementOffset.writeAlignedUnsafe(elementOffsets);
  2705. return U32x4(
  2706. *(data + elementOffsets[0]),
  2707. *(data + elementOffsets[1]),
  2708. *(data + elementOffsets[2]),
  2709. *(data + elementOffsets[3])
  2710. );
  2711. #endif
  2712. }
  2713. static inline I32x4 gather_I32(dsr::SafePointer<const int32_t> data, const U32x4 &elementOffset) {
  2714. #if defined USE_AVX2
  2715. // TODO: Implement safety checks for debug mode.
  2716. return I32x4(GATHER_U32x4_AVX2(data.getUnsafe(), elementOffset.v, 4));
  2717. #else
  2718. ALIGN16 uint32_t elementOffsets[4];
  2719. elementOffset.writeAlignedUnsafe(elementOffsets);
  2720. return I32x4(
  2721. *(data + elementOffsets[0]),
  2722. *(data + elementOffsets[1]),
  2723. *(data + elementOffsets[2]),
  2724. *(data + elementOffsets[3])
  2725. );
  2726. #endif
  2727. }
  2728. static inline F32x4 gather_F32(dsr::SafePointer<const float> data, const U32x4 &elementOffset) {
  2729. #if defined USE_AVX2
  2730. // TODO: Implement safety checks for debug mode.
  2731. return F32x4(GATHER_F32x4_AVX2(data.getUnsafe(), elementOffset.v, 4));
  2732. #else
  2733. ALIGN16 uint32_t elementOffsets[4];
  2734. elementOffset.writeAlignedUnsafe(elementOffsets);
  2735. return F32x4(
  2736. *(data + elementOffsets[0]),
  2737. *(data + elementOffsets[1]),
  2738. *(data + elementOffsets[2]),
  2739. *(data + elementOffsets[3])
  2740. );
  2741. #endif
  2742. }
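// Usage sketch (assuming table is a dsr::SafePointer<const float> with at least seven readable elements):
//   F32x4 samples = gather_F32(table, U32x4(0u, 2u, 4u, 6u)); // Reads table[0], table[2], table[4] and table[6].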
  2743. inline F32x8 operator+(const F32x8& left, const F32x8& right) {
  2744. #if defined USE_256BIT_F_SIMD
  2745. return F32x8(ADD_F32_SIMD256(left.v, right.v));
  2746. #else
  2747. return F32x8(
  2748. left.scalars[0] + right.scalars[0],
  2749. left.scalars[1] + right.scalars[1],
  2750. left.scalars[2] + right.scalars[2],
  2751. left.scalars[3] + right.scalars[3],
  2752. left.scalars[4] + right.scalars[4],
  2753. left.scalars[5] + right.scalars[5],
  2754. left.scalars[6] + right.scalars[6],
  2755. left.scalars[7] + right.scalars[7]
  2756. );
  2757. #endif
  2758. }
  2759. inline F32x8 operator-(const F32x8& left, const F32x8& right) {
  2760. #if defined USE_256BIT_F_SIMD
  2761. return F32x8(SUB_F32_SIMD256(left.v, right.v));
  2762. #else
  2763. return F32x8(
  2764. left.scalars[0] - right.scalars[0],
  2765. left.scalars[1] - right.scalars[1],
  2766. left.scalars[2] - right.scalars[2],
  2767. left.scalars[3] - right.scalars[3],
  2768. left.scalars[4] - right.scalars[4],
  2769. left.scalars[5] - right.scalars[5],
  2770. left.scalars[6] - right.scalars[6],
  2771. left.scalars[7] - right.scalars[7]
  2772. );
  2773. #endif
  2774. }
  2775. inline F32x8 operator*(const F32x8& left, const F32x8& right) {
  2776. #if defined USE_256BIT_F_SIMD
  2777. return F32x8(MUL_F32_SIMD256(left.v, right.v));
  2778. #else
  2779. return F32x8(
  2780. left.scalars[0] * right.scalars[0],
  2781. left.scalars[1] * right.scalars[1],
  2782. left.scalars[2] * right.scalars[2],
  2783. left.scalars[3] * right.scalars[3],
  2784. left.scalars[4] * right.scalars[4],
  2785. left.scalars[5] * right.scalars[5],
  2786. left.scalars[6] * right.scalars[6],
  2787. left.scalars[7] * right.scalars[7]
  2788. );
  2789. #endif
  2790. }
  2791. inline F32x8 min(const F32x8& left, const F32x8& right) {
  2792. #if defined USE_256BIT_F_SIMD
  2793. return F32x8(MIN_F32_SIMD256(left.v, right.v));
  2794. #else
  2795. float v0 = left.scalars[0];
  2796. float v1 = left.scalars[1];
  2797. float v2 = left.scalars[2];
  2798. float v3 = left.scalars[3];
  2799. float v4 = left.scalars[4];
  2800. float v5 = left.scalars[5];
  2801. float v6 = left.scalars[6];
  2802. float v7 = left.scalars[7];
  2803. float r0 = right.scalars[0];
  2804. float r1 = right.scalars[1];
  2805. float r2 = right.scalars[2];
  2806. float r3 = right.scalars[3];
  2807. float r4 = right.scalars[4];
  2808. float r5 = right.scalars[5];
  2809. float r6 = right.scalars[6];
  2810. float r7 = right.scalars[7];
  2811. if (r0 < v0) { v0 = r0; }
  2812. if (r1 < v1) { v1 = r1; }
  2813. if (r2 < v2) { v2 = r2; }
  2814. if (r3 < v3) { v3 = r3; }
  2815. if (r4 < v4) { v4 = r4; }
  2816. if (r5 < v5) { v5 = r5; }
  2817. if (r6 < v6) { v6 = r6; }
  2818. if (r7 < v7) { v7 = r7; }
  2819. return F32x8(v0, v1, v2, v3, v4, v5, v6, v7);
  2820. #endif
  2821. }
  2822. inline F32x8 max(const F32x8& left, const F32x8& right) {
  2823. #if defined USE_256BIT_F_SIMD
  2824. return F32x8(MAX_F32_SIMD256(left.v, right.v));
  2825. #else
  2826. float v0 = left.scalars[0];
  2827. float v1 = left.scalars[1];
  2828. float v2 = left.scalars[2];
  2829. float v3 = left.scalars[3];
  2830. float v4 = left.scalars[4];
  2831. float v5 = left.scalars[5];
  2832. float v6 = left.scalars[6];
  2833. float v7 = left.scalars[7];
  2834. float r0 = right.scalars[0];
  2835. float r1 = right.scalars[1];
  2836. float r2 = right.scalars[2];
  2837. float r3 = right.scalars[3];
  2838. float r4 = right.scalars[4];
  2839. float r5 = right.scalars[5];
  2840. float r6 = right.scalars[6];
  2841. float r7 = right.scalars[7];
  2842. if (r0 > v0) { v0 = r0; }
  2843. if (r1 > v1) { v1 = r1; }
  2844. if (r2 > v2) { v2 = r2; }
  2845. if (r3 > v3) { v3 = r3; }
  2846. if (r4 > v4) { v4 = r4; }
  2847. if (r5 > v5) { v5 = r5; }
  2848. if (r6 > v6) { v6 = r6; }
  2849. if (r7 > v7) { v7 = r7; }
  2850. return F32x8(v0, v1, v2, v3, v4, v5, v6, v7);
  2851. #endif
  2852. }
  2853. inline I32x8 operator+(const I32x8& left, const I32x8& right) {
  2854. #if defined USE_256BIT_X_SIMD
  2855. return I32x8(ADD_I32_SIMD256(left.v, right.v));
  2856. #else
  2857. return I32x8(
  2858. left.scalars[0] + right.scalars[0],
  2859. left.scalars[1] + right.scalars[1],
  2860. left.scalars[2] + right.scalars[2],
  2861. left.scalars[3] + right.scalars[3],
  2862. left.scalars[4] + right.scalars[4],
  2863. left.scalars[5] + right.scalars[5],
  2864. left.scalars[6] + right.scalars[6],
  2865. left.scalars[7] + right.scalars[7]);
  2866. #endif
  2867. }
  2868. inline I32x8 operator-(const I32x8& left, const I32x8& right) {
  2869. #if defined USE_256BIT_X_SIMD
  2870. return I32x8(SUB_I32_SIMD256(left.v, right.v));
  2871. #else
  2872. return I32x8(
  2873. left.scalars[0] - right.scalars[0],
  2874. left.scalars[1] - right.scalars[1],
  2875. left.scalars[2] - right.scalars[2],
  2876. left.scalars[3] - right.scalars[3],
  2877. left.scalars[4] - right.scalars[4],
  2878. left.scalars[5] - right.scalars[5],
  2879. left.scalars[6] - right.scalars[6],
  2880. left.scalars[7] - right.scalars[7]);
  2881. #endif
  2882. }
  2883. inline I32x8 operator*(const I32x8& left, const I32x8& right) {
  2884. #if defined USE_AVX2
  2885. return I32x8(MUL_I32_SIMD256(left.v, right.v));
  2886. #else
  2887. return I32x8(
  2888. left.scalars[0] * right.scalars[0],
  2889. left.scalars[1] * right.scalars[1],
  2890. left.scalars[2] * right.scalars[2],
  2891. left.scalars[3] * right.scalars[3],
  2892. left.scalars[4] * right.scalars[4],
  2893. left.scalars[5] * right.scalars[5],
  2894. left.scalars[6] * right.scalars[6],
  2895. left.scalars[7] * right.scalars[7]
  2896. );
  2897. #endif
  2898. }
  2899. inline U32x8 operator+(const U32x8& left, const U32x8& right) {
  2900. #if defined USE_256BIT_X_SIMD
  2901. return U32x8(ADD_U32_SIMD256(left.v, right.v));
  2902. #else
  2903. return U32x8(
  2904. left.scalars[0] + right.scalars[0],
  2905. left.scalars[1] + right.scalars[1],
  2906. left.scalars[2] + right.scalars[2],
  2907. left.scalars[3] + right.scalars[3],
  2908. left.scalars[4] + right.scalars[4],
  2909. left.scalars[5] + right.scalars[5],
  2910. left.scalars[6] + right.scalars[6],
  2911. left.scalars[7] + right.scalars[7]
  2912. );
  2913. #endif
  2914. }
  2915. inline U32x8 operator-(const U32x8& left, const U32x8& right) {
  2916. #if defined USE_256BIT_X_SIMD
  2917. return U32x8(SUB_U32_SIMD256(left.v, right.v));
  2918. #else
  2919. return U32x8(
  2920. left.scalars[0] - right.scalars[0],
  2921. left.scalars[1] - right.scalars[1],
  2922. left.scalars[2] - right.scalars[2],
  2923. left.scalars[3] - right.scalars[3],
  2924. left.scalars[4] - right.scalars[4],
  2925. left.scalars[5] - right.scalars[5],
  2926. left.scalars[6] - right.scalars[6],
  2927. left.scalars[7] - right.scalars[7]
  2928. );
  2929. #endif
  2930. }
  2931. inline U32x8 operator*(const U32x8& left, const U32x8& right) {
  2932. #if defined USE_256BIT_X_SIMD
  2933. return U32x8(MUL_U32_SIMD256(left.v, right.v));
  2934. #else
  2935. return U32x8(
  2936. left.scalars[0] * right.scalars[0],
  2937. left.scalars[1] * right.scalars[1],
  2938. left.scalars[2] * right.scalars[2],
  2939. left.scalars[3] * right.scalars[3],
  2940. left.scalars[4] * right.scalars[4],
  2941. left.scalars[5] * right.scalars[5],
  2942. left.scalars[6] * right.scalars[6],
  2943. left.scalars[7] * right.scalars[7]
  2944. );
  2945. #endif
  2946. }
  2947. inline U32x8 operator&(const U32x8& left, const U32x8& right) {
  2948. #if defined USE_256BIT_X_SIMD
  2949. return U32x8(BITWISE_AND_U32_SIMD256(left.v, right.v));
  2950. #else
  2951. return U32x8(
  2952. left.scalars[0] & right.scalars[0],
  2953. left.scalars[1] & right.scalars[1],
  2954. left.scalars[2] & right.scalars[2],
  2955. left.scalars[3] & right.scalars[3],
  2956. left.scalars[4] & right.scalars[4],
  2957. left.scalars[5] & right.scalars[5],
  2958. left.scalars[6] & right.scalars[6],
  2959. left.scalars[7] & right.scalars[7]
  2960. );
  2961. #endif
  2962. }
  2963. inline U32x8 operator|(const U32x8& left, const U32x8& right) {
  2964. #if defined USE_256BIT_X_SIMD
  2965. return U32x8(BITWISE_OR_U32_SIMD256(left.v, right.v));
  2966. #else
  2967. return U32x8(
  2968. left.scalars[0] | right.scalars[0],
  2969. left.scalars[1] | right.scalars[1],
  2970. left.scalars[2] | right.scalars[2],
  2971. left.scalars[3] | right.scalars[3],
  2972. left.scalars[4] | right.scalars[4],
  2973. left.scalars[5] | right.scalars[5],
  2974. left.scalars[6] | right.scalars[6],
  2975. left.scalars[7] | right.scalars[7]
  2976. );
  2977. #endif
  2978. }
  2979. inline U32x8 operator^(const U32x8& left, const U32x8& right) {
  2980. #if defined USE_256BIT_X_SIMD
  2981. return U32x8(BITWISE_XOR_U32_SIMD256(left.v, right.v));
  2982. #else
  2983. return U32x8(
  2984. left.scalars[0] ^ right.scalars[0],
  2985. left.scalars[1] ^ right.scalars[1],
  2986. left.scalars[2] ^ right.scalars[2],
  2987. left.scalars[3] ^ right.scalars[3],
  2988. left.scalars[4] ^ right.scalars[4],
  2989. left.scalars[5] ^ right.scalars[5],
  2990. left.scalars[6] ^ right.scalars[6],
  2991. left.scalars[7] ^ right.scalars[7]
  2992. );
  2993. #endif
  2994. }
  2995. inline U32x8 operator~(const U32x8& value) {
  2996. #if defined USE_BASIC_SIMD
  2997. return value ^ U32x8(~uint32_t(0));
  2998. #else
  2999. return U32x8(
  3000. ~value.scalars[0],
  3001. ~value.scalars[1],
  3002. ~value.scalars[2],
  3003. ~value.scalars[3],
  3004. ~value.scalars[4],
  3005. ~value.scalars[5],
  3006. ~value.scalars[6],
  3007. ~value.scalars[7]
  3008. );
  3009. #endif
  3010. }
// ARM NEON does not support 256-bit vectors, and AVX2 does not support variable shift amounts for all lane widths, so these operators fall back to scalar code.
  3012. inline U32x8 operator<<(const U32x8& left, const U32x8 &bitOffsets) {
  3013. assert(allLanesLesser(bitOffsets, U32x8(32u)));
  3014. return U32x8(
  3015. left.scalars[0] << bitOffsets.scalars[0],
  3016. left.scalars[1] << bitOffsets.scalars[1],
  3017. left.scalars[2] << bitOffsets.scalars[2],
  3018. left.scalars[3] << bitOffsets.scalars[3],
  3019. left.scalars[4] << bitOffsets.scalars[4],
  3020. left.scalars[5] << bitOffsets.scalars[5],
  3021. left.scalars[6] << bitOffsets.scalars[6],
  3022. left.scalars[7] << bitOffsets.scalars[7]
  3023. );
  3024. }
  3025. inline U32x8 operator>>(const U32x8& left, const U32x8 &bitOffsets) {
  3026. assert(allLanesLesser(bitOffsets, U32x8(32u)));
  3027. return U32x8(
  3028. left.scalars[0] >> bitOffsets.scalars[0],
  3029. left.scalars[1] >> bitOffsets.scalars[1],
  3030. left.scalars[2] >> bitOffsets.scalars[2],
  3031. left.scalars[3] >> bitOffsets.scalars[3],
  3032. left.scalars[4] >> bitOffsets.scalars[4],
  3033. left.scalars[5] >> bitOffsets.scalars[5],
  3034. left.scalars[6] >> bitOffsets.scalars[6],
  3035. left.scalars[7] >> bitOffsets.scalars[7]
  3036. );
  3037. }
  3038. // bitOffset must be an immediate constant from 0 to 31, so a template argument is used.
  3039. template <uint32_t bitOffset>
  3040. inline U32x8 bitShiftLeftImmediate(const U32x8& left) {
  3041. static_assert(bitOffset < 32u, "Immediate left shift of 32-bit values may not shift more than 31 bits!");
  3042. #if defined USE_AVX2
  3043. return U32x8(_mm256_slli_epi32(left.v, bitOffset));
  3044. #else
  3045. return U32x8(
  3046. left.scalars[0] << bitOffset,
  3047. left.scalars[1] << bitOffset,
  3048. left.scalars[2] << bitOffset,
  3049. left.scalars[3] << bitOffset,
  3050. left.scalars[4] << bitOffset,
  3051. left.scalars[5] << bitOffset,
  3052. left.scalars[6] << bitOffset,
  3053. left.scalars[7] << bitOffset
  3054. );
  3055. #endif
  3056. }
  3057. // bitOffset must be an immediate constant from 0 to 31, so a template argument is used.
  3058. template <uint32_t bitOffset>
  3059. inline U32x8 bitShiftRightImmediate(const U32x8& left) {
  3060. static_assert(bitOffset < 32u, "Immediate right shift of 32-bit values may not shift more than 31 bits!");
  3061. #if defined USE_AVX2
  3062. return U32x8(_mm256_srli_epi32(left.v, bitOffset));
  3063. #else
  3064. return U32x8(
  3065. left.scalars[0] >> bitOffset,
  3066. left.scalars[1] >> bitOffset,
  3067. left.scalars[2] >> bitOffset,
  3068. left.scalars[3] >> bitOffset,
  3069. left.scalars[4] >> bitOffset,
  3070. left.scalars[5] >> bitOffset,
  3071. left.scalars[6] >> bitOffset,
  3072. left.scalars[7] >> bitOffset
  3073. );
  3074. #endif
  3075. }
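// Usage sketch (hypothetical U32x8 packedColors): immediate shifts combine well with masking to unpack bit fields.
//   U32x8 green = bitShiftRightImmediate<8>(packedColors) & U32x8(0xFFu); // Second byte of each of the 8 lanes.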
  3076. inline U16x16 operator<<(const U16x16& left, const U16x16 &bitOffsets) {
  3077. assert(allLanesLesser(bitOffsets, U16x16(16u)));
  3078. return U16x16(
  3079. left.scalars[ 0] << bitOffsets.scalars[ 0],
  3080. left.scalars[ 1] << bitOffsets.scalars[ 1],
  3081. left.scalars[ 2] << bitOffsets.scalars[ 2],
  3082. left.scalars[ 3] << bitOffsets.scalars[ 3],
  3083. left.scalars[ 4] << bitOffsets.scalars[ 4],
  3084. left.scalars[ 5] << bitOffsets.scalars[ 5],
  3085. left.scalars[ 6] << bitOffsets.scalars[ 6],
  3086. left.scalars[ 7] << bitOffsets.scalars[ 7],
  3087. left.scalars[ 8] << bitOffsets.scalars[ 8],
  3088. left.scalars[ 9] << bitOffsets.scalars[ 9],
  3089. left.scalars[10] << bitOffsets.scalars[10],
  3090. left.scalars[11] << bitOffsets.scalars[11],
  3091. left.scalars[12] << bitOffsets.scalars[12],
  3092. left.scalars[13] << bitOffsets.scalars[13],
  3093. left.scalars[14] << bitOffsets.scalars[14],
  3094. left.scalars[15] << bitOffsets.scalars[15]
  3095. );
  3096. }
  3097. inline U16x16 operator>>(const U16x16& left, const U16x16 &bitOffsets) {
  3098. assert(allLanesLesser(bitOffsets, U16x16(16u)));
  3099. return U16x16(
  3100. left.scalars[ 0] >> bitOffsets.scalars[ 0],
  3101. left.scalars[ 1] >> bitOffsets.scalars[ 1],
  3102. left.scalars[ 2] >> bitOffsets.scalars[ 2],
  3103. left.scalars[ 3] >> bitOffsets.scalars[ 3],
  3104. left.scalars[ 4] >> bitOffsets.scalars[ 4],
  3105. left.scalars[ 5] >> bitOffsets.scalars[ 5],
  3106. left.scalars[ 6] >> bitOffsets.scalars[ 6],
  3107. left.scalars[ 7] >> bitOffsets.scalars[ 7],
  3108. left.scalars[ 8] >> bitOffsets.scalars[ 8],
  3109. left.scalars[ 9] >> bitOffsets.scalars[ 9],
  3110. left.scalars[10] >> bitOffsets.scalars[10],
  3111. left.scalars[11] >> bitOffsets.scalars[11],
  3112. left.scalars[12] >> bitOffsets.scalars[12],
  3113. left.scalars[13] >> bitOffsets.scalars[13],
  3114. left.scalars[14] >> bitOffsets.scalars[14],
  3115. left.scalars[15] >> bitOffsets.scalars[15]
  3116. );
  3117. }
// bitOffset must be an immediate constant from 0 to 15, so a template argument is used.
  3119. template <uint32_t bitOffset>
  3120. inline U16x16 bitShiftLeftImmediate(const U16x16& left) {
  3121. static_assert(bitOffset < 16u, "Immediate left shift of 16-bit values may not shift more than 15 bits!");
  3122. #if defined USE_AVX2
  3123. return U16x16(_mm256_slli_epi16(left.v, bitOffset));
  3124. #else
  3125. return U16x16(
  3126. left.scalars[ 0] << bitOffset,
  3127. left.scalars[ 1] << bitOffset,
  3128. left.scalars[ 2] << bitOffset,
  3129. left.scalars[ 3] << bitOffset,
  3130. left.scalars[ 4] << bitOffset,
  3131. left.scalars[ 5] << bitOffset,
  3132. left.scalars[ 6] << bitOffset,
  3133. left.scalars[ 7] << bitOffset,
  3134. left.scalars[ 8] << bitOffset,
  3135. left.scalars[ 9] << bitOffset,
  3136. left.scalars[10] << bitOffset,
  3137. left.scalars[11] << bitOffset,
  3138. left.scalars[12] << bitOffset,
  3139. left.scalars[13] << bitOffset,
  3140. left.scalars[14] << bitOffset,
  3141. left.scalars[15] << bitOffset
  3142. );
  3143. #endif
  3144. }
// bitOffset must be an immediate constant from 0 to 15, so a template argument is used.
  3146. template <uint32_t bitOffset>
  3147. inline U16x16 bitShiftRightImmediate(const U16x16& left) {
  3148. static_assert(bitOffset < 16u, "Immediate right shift of 16-bit values may not shift more than 15 bits!");
  3149. #if defined USE_AVX2
  3150. return U16x16(_mm256_srli_epi16(left.v, bitOffset));
  3151. #else
  3152. return U16x16(
  3153. left.scalars[ 0] >> bitOffset,
  3154. left.scalars[ 1] >> bitOffset,
  3155. left.scalars[ 2] >> bitOffset,
  3156. left.scalars[ 3] >> bitOffset,
  3157. left.scalars[ 4] >> bitOffset,
  3158. left.scalars[ 5] >> bitOffset,
  3159. left.scalars[ 6] >> bitOffset,
  3160. left.scalars[ 7] >> bitOffset,
  3161. left.scalars[ 8] >> bitOffset,
  3162. left.scalars[ 9] >> bitOffset,
  3163. left.scalars[10] >> bitOffset,
  3164. left.scalars[11] >> bitOffset,
  3165. left.scalars[12] >> bitOffset,
  3166. left.scalars[13] >> bitOffset,
  3167. left.scalars[14] >> bitOffset,
  3168. left.scalars[15] >> bitOffset
  3169. );
  3170. #endif
  3171. }
  3172. inline U8x32 operator<<(const U8x32& left, const U8x32 &bitOffsets) {
assert(allLanesLesser(bitOffsets, U8x32(8u)));
  3174. return U8x32(
  3175. left.scalars[ 0] << bitOffsets.scalars[ 0],
  3176. left.scalars[ 1] << bitOffsets.scalars[ 1],
  3177. left.scalars[ 2] << bitOffsets.scalars[ 2],
  3178. left.scalars[ 3] << bitOffsets.scalars[ 3],
  3179. left.scalars[ 4] << bitOffsets.scalars[ 4],
  3180. left.scalars[ 5] << bitOffsets.scalars[ 5],
  3181. left.scalars[ 6] << bitOffsets.scalars[ 6],
  3182. left.scalars[ 7] << bitOffsets.scalars[ 7],
  3183. left.scalars[ 8] << bitOffsets.scalars[ 8],
  3184. left.scalars[ 9] << bitOffsets.scalars[ 9],
  3185. left.scalars[10] << bitOffsets.scalars[10],
  3186. left.scalars[11] << bitOffsets.scalars[11],
  3187. left.scalars[12] << bitOffsets.scalars[12],
  3188. left.scalars[13] << bitOffsets.scalars[13],
  3189. left.scalars[14] << bitOffsets.scalars[14],
  3190. left.scalars[15] << bitOffsets.scalars[15],
  3191. left.scalars[16] << bitOffsets.scalars[16],
  3192. left.scalars[17] << bitOffsets.scalars[17],
  3193. left.scalars[18] << bitOffsets.scalars[18],
  3194. left.scalars[19] << bitOffsets.scalars[19],
  3195. left.scalars[20] << bitOffsets.scalars[20],
  3196. left.scalars[21] << bitOffsets.scalars[21],
  3197. left.scalars[22] << bitOffsets.scalars[22],
  3198. left.scalars[23] << bitOffsets.scalars[23],
  3199. left.scalars[24] << bitOffsets.scalars[24],
  3200. left.scalars[25] << bitOffsets.scalars[25],
  3201. left.scalars[26] << bitOffsets.scalars[26],
  3202. left.scalars[27] << bitOffsets.scalars[27],
  3203. left.scalars[28] << bitOffsets.scalars[28],
  3204. left.scalars[29] << bitOffsets.scalars[29],
  3205. left.scalars[30] << bitOffsets.scalars[30],
  3206. left.scalars[31] << bitOffsets.scalars[31]
  3207. );
  3208. }
  3209. inline U8x32 operator>>(const U8x32& left, const U8x32 &bitOffsets) {
assert(allLanesLesser(bitOffsets, U8x32(8u)));
  3211. return U8x32(
  3212. left.scalars[ 0] >> bitOffsets.scalars[ 0],
  3213. left.scalars[ 1] >> bitOffsets.scalars[ 1],
  3214. left.scalars[ 2] >> bitOffsets.scalars[ 2],
  3215. left.scalars[ 3] >> bitOffsets.scalars[ 3],
  3216. left.scalars[ 4] >> bitOffsets.scalars[ 4],
  3217. left.scalars[ 5] >> bitOffsets.scalars[ 5],
  3218. left.scalars[ 6] >> bitOffsets.scalars[ 6],
  3219. left.scalars[ 7] >> bitOffsets.scalars[ 7],
  3220. left.scalars[ 8] >> bitOffsets.scalars[ 8],
  3221. left.scalars[ 9] >> bitOffsets.scalars[ 9],
  3222. left.scalars[10] >> bitOffsets.scalars[10],
  3223. left.scalars[11] >> bitOffsets.scalars[11],
  3224. left.scalars[12] >> bitOffsets.scalars[12],
  3225. left.scalars[13] >> bitOffsets.scalars[13],
  3226. left.scalars[14] >> bitOffsets.scalars[14],
  3227. left.scalars[15] >> bitOffsets.scalars[15],
  3228. left.scalars[16] >> bitOffsets.scalars[16],
  3229. left.scalars[17] >> bitOffsets.scalars[17],
  3230. left.scalars[18] >> bitOffsets.scalars[18],
  3231. left.scalars[19] >> bitOffsets.scalars[19],
  3232. left.scalars[20] >> bitOffsets.scalars[20],
  3233. left.scalars[21] >> bitOffsets.scalars[21],
  3234. left.scalars[22] >> bitOffsets.scalars[22],
  3235. left.scalars[23] >> bitOffsets.scalars[23],
  3236. left.scalars[24] >> bitOffsets.scalars[24],
  3237. left.scalars[25] >> bitOffsets.scalars[25],
  3238. left.scalars[26] >> bitOffsets.scalars[26],
  3239. left.scalars[27] >> bitOffsets.scalars[27],
  3240. left.scalars[28] >> bitOffsets.scalars[28],
  3241. left.scalars[29] >> bitOffsets.scalars[29],
  3242. left.scalars[30] >> bitOffsets.scalars[30],
  3243. left.scalars[31] >> bitOffsets.scalars[31]
  3244. );
  3245. }
// bitOffset must be an immediate constant from 0 to 7, so a template argument is used.
template <uint32_t bitOffset>
inline U8x32 bitShiftLeftImmediate(const U8x32& left) {
static_assert(bitOffset < 8u, "Immediate left shift of 8-bit values may not shift more than 7 bits!");
  3250. // TODO: Use a larger lane and a mask generated in compile time.
  3251. return U8x32(
  3252. left.scalars[ 0] << bitOffset,
  3253. left.scalars[ 1] << bitOffset,
  3254. left.scalars[ 2] << bitOffset,
  3255. left.scalars[ 3] << bitOffset,
  3256. left.scalars[ 4] << bitOffset,
  3257. left.scalars[ 5] << bitOffset,
  3258. left.scalars[ 6] << bitOffset,
  3259. left.scalars[ 7] << bitOffset,
  3260. left.scalars[ 8] << bitOffset,
  3261. left.scalars[ 9] << bitOffset,
  3262. left.scalars[10] << bitOffset,
  3263. left.scalars[11] << bitOffset,
  3264. left.scalars[12] << bitOffset,
  3265. left.scalars[13] << bitOffset,
  3266. left.scalars[14] << bitOffset,
  3267. left.scalars[15] << bitOffset,
  3268. left.scalars[16] << bitOffset,
  3269. left.scalars[17] << bitOffset,
  3270. left.scalars[18] << bitOffset,
  3271. left.scalars[19] << bitOffset,
  3272. left.scalars[20] << bitOffset,
  3273. left.scalars[21] << bitOffset,
  3274. left.scalars[22] << bitOffset,
  3275. left.scalars[23] << bitOffset,
  3276. left.scalars[24] << bitOffset,
  3277. left.scalars[25] << bitOffset,
  3278. left.scalars[26] << bitOffset,
  3279. left.scalars[27] << bitOffset,
  3280. left.scalars[28] << bitOffset,
  3281. left.scalars[29] << bitOffset,
  3282. left.scalars[30] << bitOffset,
  3283. left.scalars[31] << bitOffset
  3284. );
  3285. }
// bitOffset must be an immediate constant from 0 to 7, so a template argument is used.
template <uint32_t bitOffset>
inline U8x32 bitShiftRightImmediate(const U8x32& left) {
static_assert(bitOffset < 8u, "Immediate right shift of 8-bit values may not shift more than 7 bits!");
  3290. // TODO: Use a larger lane and a mask generated in compile time.
  3291. return U8x32(
  3292. left.scalars[ 0] >> bitOffset,
  3293. left.scalars[ 1] >> bitOffset,
  3294. left.scalars[ 2] >> bitOffset,
  3295. left.scalars[ 3] >> bitOffset,
  3296. left.scalars[ 4] >> bitOffset,
  3297. left.scalars[ 5] >> bitOffset,
  3298. left.scalars[ 6] >> bitOffset,
  3299. left.scalars[ 7] >> bitOffset,
  3300. left.scalars[ 8] >> bitOffset,
  3301. left.scalars[ 9] >> bitOffset,
  3302. left.scalars[10] >> bitOffset,
  3303. left.scalars[11] >> bitOffset,
  3304. left.scalars[12] >> bitOffset,
  3305. left.scalars[13] >> bitOffset,
  3306. left.scalars[14] >> bitOffset,
  3307. left.scalars[15] >> bitOffset,
  3308. left.scalars[16] >> bitOffset,
  3309. left.scalars[17] >> bitOffset,
  3310. left.scalars[18] >> bitOffset,
  3311. left.scalars[19] >> bitOffset,
  3312. left.scalars[20] >> bitOffset,
  3313. left.scalars[21] >> bitOffset,
  3314. left.scalars[22] >> bitOffset,
  3315. left.scalars[23] >> bitOffset,
  3316. left.scalars[24] >> bitOffset,
  3317. left.scalars[25] >> bitOffset,
  3318. left.scalars[26] >> bitOffset,
  3319. left.scalars[27] >> bitOffset,
  3320. left.scalars[28] >> bitOffset,
  3321. left.scalars[29] >> bitOffset,
  3322. left.scalars[30] >> bitOffset,
  3323. left.scalars[31] >> bitOffset
  3324. );
  3325. }
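// Usage sketch (an illustrative example, assuming pixels is an existing U8x32 value):
//   U8x32 halved  = bitShiftRightImmediate<1>(pixels); // divides each 8-bit lane by two
//   U8x32 doubled = bitShiftLeftImmediate<1>(pixels);  // multiplies each 8-bit lane by two, wrapping on overflow
//   bitShiftLeftImmediate<8>(pixels);                  // rejected at compile time by the static_assert above
// The shift amount is a template argument, so it must be known at compile time.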
inline U16x16 operator+(const U16x16& left, const U16x16& right) {
#if defined USE_256BIT_X_SIMD
	return U16x16(ADD_U16_SIMD256(left.v, right.v));
#else
	return U16x16(
		left.scalars[0] + right.scalars[0],
		left.scalars[1] + right.scalars[1],
		left.scalars[2] + right.scalars[2],
		left.scalars[3] + right.scalars[3],
		left.scalars[4] + right.scalars[4],
		left.scalars[5] + right.scalars[5],
		left.scalars[6] + right.scalars[6],
		left.scalars[7] + right.scalars[7],
		left.scalars[8] + right.scalars[8],
		left.scalars[9] + right.scalars[9],
		left.scalars[10] + right.scalars[10],
		left.scalars[11] + right.scalars[11],
		left.scalars[12] + right.scalars[12],
		left.scalars[13] + right.scalars[13],
		left.scalars[14] + right.scalars[14],
		left.scalars[15] + right.scalars[15]
	);
#endif
}
inline U16x16 operator-(const U16x16& left, const U16x16& right) {
#if defined USE_256BIT_X_SIMD
	return U16x16(SUB_U16_SIMD256(left.v, right.v));
#else
	return U16x16(
		left.scalars[0] - right.scalars[0],
		left.scalars[1] - right.scalars[1],
		left.scalars[2] - right.scalars[2],
		left.scalars[3] - right.scalars[3],
		left.scalars[4] - right.scalars[4],
		left.scalars[5] - right.scalars[5],
		left.scalars[6] - right.scalars[6],
		left.scalars[7] - right.scalars[7],
		left.scalars[8] - right.scalars[8],
		left.scalars[9] - right.scalars[9],
		left.scalars[10] - right.scalars[10],
		left.scalars[11] - right.scalars[11],
		left.scalars[12] - right.scalars[12],
		left.scalars[13] - right.scalars[13],
		left.scalars[14] - right.scalars[14],
		left.scalars[15] - right.scalars[15]
	);
#endif
}
inline U16x16 operator*(const U16x16& left, const U16x16& right) {
#if defined USE_256BIT_X_SIMD
	return U16x16(MUL_U16_SIMD256(left.v, right.v));
#else
	return U16x16(
		left.scalars[0] * right.scalars[0],
		left.scalars[1] * right.scalars[1],
		left.scalars[2] * right.scalars[2],
		left.scalars[3] * right.scalars[3],
		left.scalars[4] * right.scalars[4],
		left.scalars[5] * right.scalars[5],
		left.scalars[6] * right.scalars[6],
		left.scalars[7] * right.scalars[7],
		left.scalars[8] * right.scalars[8],
		left.scalars[9] * right.scalars[9],
		left.scalars[10] * right.scalars[10],
		left.scalars[11] * right.scalars[11],
		left.scalars[12] * right.scalars[12],
		left.scalars[13] * right.scalars[13],
		left.scalars[14] * right.scalars[14],
		left.scalars[15] * right.scalars[15]
	);
#endif
}
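// Usage sketch (an illustrative example, assuming color and other are existing U16x16 values and that the
// single-value U16x16 constructor declared earlier in this header broadcasts one scalar to all lanes):
//   U16x16 weighted = color * U16x16(3u) + other; // per-lane a * 3 + b, wrapping like ordinary uint16_t arithmetic
// 16-bit lanes are often used as temporary precision for math that would overflow 8-bit lanes.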
inline U8x32 operator+(const U8x32& left, const U8x32& right) {
#if defined USE_256BIT_X_SIMD
	return U8x32(ADD_U8_SIMD256(left.v, right.v));
#else
	U8x32 result = U8x32::create_dangerous_uninitialized();
	for (int i = 0; i < 32; i++) {
		result.scalars[i] = left.scalars[i] + right.scalars[i];
	}
	return result;
#endif
}
inline U8x32 operator-(const U8x32& left, const U8x32& right) {
#if defined USE_256BIT_X_SIMD
	return U8x32(SUB_U8_SIMD256(left.v, right.v));
#else
	U8x32 result = U8x32::create_dangerous_uninitialized();
	for (int i = 0; i < 32; i++) {
		result.scalars[i] = left.scalars[i] - right.scalars[i];
	}
	return result;
#endif
}
inline U8x32 saturatedAddition(const U8x32& left, const U8x32& right) {
#if defined USE_256BIT_X_SIMD
	return U8x32(ADD_SAT_U8_SIMD256(left.v, right.v));
#else
	U8x32 result = U8x32::create_dangerous_uninitialized();
	for (int i = 0; i < 32; i++) {
		result.scalars[i] = impl_limit255((uint32_t)left.scalars[i] + (uint32_t)right.scalars[i]);
	}
	return result;
#endif
}
inline U8x32 saturatedSubtraction(const U8x32& left, const U8x32& right) {
#if defined USE_256BIT_X_SIMD
	return U8x32(SUB_SAT_U8_SIMD256(left.v, right.v));
#else
	U8x32 result = U8x32::create_dangerous_uninitialized();
	for (int i = 0; i < 32; i++) {
		result.scalars[i] = impl_limit0((int32_t)left.scalars[i] - (int32_t)right.scalars[i]);
	}
	return result;
#endif
}
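// Usage sketch (an illustrative example, assuming bright, dark and offset are existing U8x32 values):
//   U8x32 wrapped = bright + offset;                    // a lane holding 250 plus a lane holding 10 wraps around to 4
//   U8x32 clamped = saturatedAddition(bright, offset);  // the same lanes clamp to 255 instead
//   U8x32 darker  = saturatedSubtraction(dark, offset); // a lane holding 5 minus 10 clamps to 0 instead of wrapping to 251
// Saturation is usually what image filters want, because wrapping turns bright pixels into dark ones.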
inline I32x8 truncateToI32(const F32x8& vector) {
#if defined USE_256BIT_X_SIMD
	return I32x8(F32_TO_I32_SIMD256(vector.v));
#else
	return I32x8(
		(int32_t)vector.scalars[0], (int32_t)vector.scalars[1], (int32_t)vector.scalars[2], (int32_t)vector.scalars[3],
		(int32_t)vector.scalars[4], (int32_t)vector.scalars[5], (int32_t)vector.scalars[6], (int32_t)vector.scalars[7]
	);
#endif
}
inline U32x8 truncateToU32(const F32x8& vector) {
#if defined USE_256BIT_X_SIMD
	return U32x8(F32_TO_U32_SIMD256(vector.v));
#else
	return U32x8(
		(uint32_t)vector.scalars[0], (uint32_t)vector.scalars[1], (uint32_t)vector.scalars[2], (uint32_t)vector.scalars[3],
		(uint32_t)vector.scalars[4], (uint32_t)vector.scalars[5], (uint32_t)vector.scalars[6], (uint32_t)vector.scalars[7]
	);
#endif
}
inline F32x8 floatFromI32(const I32x8& vector) {
#if defined USE_256BIT_X_SIMD
	return F32x8(I32_TO_F32_SIMD256(vector.v));
#else
	return F32x8(
		(float)vector.scalars[0], (float)vector.scalars[1], (float)vector.scalars[2], (float)vector.scalars[3],
		(float)vector.scalars[4], (float)vector.scalars[5], (float)vector.scalars[6], (float)vector.scalars[7]
	);
#endif
}
inline F32x8 floatFromU32(const U32x8& vector) {
#if defined USE_256BIT_X_SIMD
	return F32x8(U32_TO_F32_SIMD256(vector.v));
#else
	return F32x8(
		(float)vector.scalars[0], (float)vector.scalars[1], (float)vector.scalars[2], (float)vector.scalars[3],
		(float)vector.scalars[4], (float)vector.scalars[5], (float)vector.scalars[6], (float)vector.scalars[7]
	);
#endif
}
inline I32x8 I32FromU32(const U32x8& vector) {
#if defined USE_256BIT_X_SIMD
	return I32x8(REINTERPRET_U32_TO_I32_SIMD256(vector.v));
#else
	return I32x8(
		(int32_t)vector.scalars[0], (int32_t)vector.scalars[1], (int32_t)vector.scalars[2], (int32_t)vector.scalars[3],
		(int32_t)vector.scalars[4], (int32_t)vector.scalars[5], (int32_t)vector.scalars[6], (int32_t)vector.scalars[7]
	);
#endif
}
inline U32x8 U32FromI32(const I32x8& vector) {
#if defined USE_256BIT_X_SIMD
	return U32x8(REINTERPRET_I32_TO_U32_SIMD256(vector.v));
#else
	return U32x8(
		(uint32_t)vector.scalars[0], (uint32_t)vector.scalars[1], (uint32_t)vector.scalars[2], (uint32_t)vector.scalars[3],
		(uint32_t)vector.scalars[4], (uint32_t)vector.scalars[5], (uint32_t)vector.scalars[6], (uint32_t)vector.scalars[7]
	);
#endif
}
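// Usage sketch (an illustrative example, assuming values is an existing F32x8 value and that F32x8 addition is
// declared earlier in this header):
//   I32x8 truncated = truncateToI32(values);               // 3.9f becomes 3, truncation rounds toward zero
//   I32x8 rounded   = truncateToI32(values + F32x8(0.5f)); // 3.9f becomes 4, only valid for non-negative input
//   F32x8 restored  = floatFromI32(truncated);             // widen back to float lanes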
// Warning! Behavior depends on endianness.
inline U8x32 reinterpret_U8FromU32(const U32x8& vector) {
#if defined USE_256BIT_X_SIMD
	return U8x32(REINTERPRET_U32_TO_U8_SIMD256(vector.v));
#else
	const uint8_t *source = (const uint8_t*)vector.scalars;
	U8x32 result = U8x32::create_dangerous_uninitialized();
	for (int i = 0; i < 32; i++) {
		result.scalars[i] = source[i];
	}
	return result;
#endif
}
// Warning! Behavior depends on endianness.
inline U32x8 reinterpret_U32FromU8(const U8x32& vector) {
#if defined USE_256BIT_X_SIMD
	return U32x8(REINTERPRET_U8_TO_U32_SIMD256(vector.v));
#else
	const uint32_t *source = (const uint32_t*)vector.scalars;
	return U32x8(source[0], source[1], source[2], source[3], source[4], source[5], source[6], source[7]);
#endif
}
// Warning! Behavior depends on endianness.
inline U16x16 reinterpret_U16FromU32(const U32x8& vector) {
#if defined USE_256BIT_X_SIMD
	return U16x16(REINTERPRET_U32_TO_U16_SIMD256(vector.v));
#else
	const uint16_t *source = (const uint16_t*)vector.scalars;
	return U16x16(
		source[0], source[1], source[2] , source[3] , source[4] , source[5] , source[6] , source[7] ,
		source[8], source[9], source[10], source[11], source[12], source[13], source[14], source[15]
	);
#endif
}
// Warning! Behavior depends on endianness.
inline U32x8 reinterpret_U32FromU16(const U16x16& vector) {
#if defined USE_256BIT_X_SIMD
	return U32x8(REINTERPRET_U16_TO_U32_SIMD256(vector.v));
#else
	const uint32_t *source = (const uint32_t*)vector.scalars;
	return U32x8(source[0], source[1], source[2], source[3], source[4], source[5], source[6], source[7]);
#endif
}
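// Usage sketch (an illustrative example, assuming colors is an existing U32x8 value):
//   U8x32 bytes = reinterpret_U8FromU32(colors); // the same 256 bits, just relabeled as 32 byte lanes
//   U32x8 back  = reinterpret_U32FromU8(bytes);  // restores the original value on the same machine
// On a little-endian machine the first byte lane holds the least significant byte of the first 32-bit lane,
// so any code that depends on the resulting byte order has to handle each endianness explicitly.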
// Unpacking to larger integers
inline U32x8 lowerToU32(const U16x16& vector) {
#if defined USE_256BIT_X_SIMD
	return U32x8(U16_LOW_TO_U32_SIMD256(vector.v));
#else
	return U32x8(vector.scalars[0], vector.scalars[1], vector.scalars[2], vector.scalars[3], vector.scalars[4], vector.scalars[5], vector.scalars[6], vector.scalars[7]);
#endif
}
inline U32x8 higherToU32(const U16x16& vector) {
#if defined USE_256BIT_X_SIMD
	return U32x8(U16_HIGH_TO_U32_SIMD256(vector.v));
#else
	return U32x8(vector.scalars[8], vector.scalars[9], vector.scalars[10], vector.scalars[11], vector.scalars[12], vector.scalars[13], vector.scalars[14], vector.scalars[15]);
#endif
}
inline U16x16 lowerToU16(const U8x32& vector) {
#if defined USE_256BIT_X_SIMD
	return U16x16(U8_LOW_TO_U16_SIMD256(vector.v));
#else
	return U16x16(
		vector.scalars[0], vector.scalars[1], vector.scalars[2], vector.scalars[3],
		vector.scalars[4], vector.scalars[5], vector.scalars[6], vector.scalars[7],
		vector.scalars[8], vector.scalars[9], vector.scalars[10], vector.scalars[11],
		vector.scalars[12], vector.scalars[13], vector.scalars[14], vector.scalars[15]
	);
#endif
}
inline U16x16 higherToU16(const U8x32& vector) {
#if defined USE_256BIT_X_SIMD
	return U16x16(U8_HIGH_TO_U16_SIMD256(vector.v));
#else
	return U16x16(
		vector.scalars[16], vector.scalars[17], vector.scalars[18], vector.scalars[19],
		vector.scalars[20], vector.scalars[21], vector.scalars[22], vector.scalars[23],
		vector.scalars[24], vector.scalars[25], vector.scalars[26], vector.scalars[27],
		vector.scalars[28], vector.scalars[29], vector.scalars[30], vector.scalars[31]
	);
#endif
}
// Saturated packing
inline U8x32 saturateToU8(const U16x16& lower, const U16x16& upper) {
#if defined USE_256BIT_X_SIMD
	return U8x32(PACK_SAT_U16_TO_U8_SIMD256(lower.v, upper.v));
#else
	U8x32 result = U8x32::create_dangerous_uninitialized();
	for (int i = 0; i < 16; i++) {
		result.scalars[i] = impl_limit255(lower.scalars[i]);
	}
	for (int i = 0; i < 16; i++) {
		result.scalars[i + 16] = impl_limit255(upper.scalars[i]);
	}
	return result;
#endif
}
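// Usage sketch (an illustrative example, assuming pixels is an existing U8x32 value): widen to 16-bit lanes,
// do arithmetic that would overflow a byte, then pack back with saturation.
//   U16x16 low     = lowerToU16(pixels);
//   U16x16 high    = higherToU16(pixels);
//   U16x16 lowSum  = low + low;   // any 16-bit processing goes here
//   U16x16 highSum = high + high;
//   U8x32 packed   = saturateToU8(lowSum, highSum); // clamps each lane to the 0..255 range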
// Unary negation for convenience and code readability.
// Before using unary negation, always check whether:
// * An addition can be turned into a subtraction:
//     x = -a + b
//     x = b - a
// * A multiplying constant or scalar can be negated instead:
//     x = -b * 2
//     x = b * -2
inline F32x8 operator-(const F32x8& value) {
#if defined USE_256BIT_F_SIMD
	return F32x8(0.0f) - value;
#else
	return F32x8(
		-value.scalars[0], -value.scalars[1], -value.scalars[2], -value.scalars[3],
		-value.scalars[4], -value.scalars[5], -value.scalars[6], -value.scalars[7]
	);
#endif
}
inline I32x8 operator-(const I32x8& value) {
#if defined USE_256BIT_X_SIMD
	return I32x8(0) - value;
#else
	return I32x8(
		-value.scalars[0], -value.scalars[1], -value.scalars[2], -value.scalars[3],
		-value.scalars[4], -value.scalars[5], -value.scalars[6], -value.scalars[7]
	);
#endif
}
// Helper macros for generating the vector extract functions.
// Having one function for each type and offset makes sure that the compiler gets an immediate integer within the valid range.
#if defined USE_AVX2
// AVX2 does not offer any 256-bit element extraction, only two 128-bit shifts done in parallel, so we might as well use two separate 128-bit extractions.
template <int OFFSET>
__m256i impl_extractBytes_AVX2(const __m256i leftInput, const __m256i rightInput) {
	// Extract three halves depending on which ones overlap with the offset.
	__m128i leftPart = _mm256_extractf128_si256(leftInput, (OFFSET < 16) ? 0 : 1);
	__m128i middlePart = _mm256_extractf128_si256(OFFSET < 16 ? rightInput : leftInput, (OFFSET < 16) ? 1 : 0);
	__m128i rightPart = _mm256_extractf128_si256(rightInput, (OFFSET < 16) ? 0 : 1);
	// Combine two 128-bit extracts into a whole 256-bit extract.
	return _mm256_set_m128i(
		_mm_alignr_epi8(leftPart, middlePart, OFFSET - ((OFFSET < 16) ? 0 : 16)),
		_mm_alignr_epi8(middlePart, rightPart, OFFSET - ((OFFSET < 16) ? 0 : 16))
	);
}
#define VECTOR_EXTRACT_GENERATOR_256_U8(OFFSET) return U8x32(impl_extractBytes_AVX2<OFFSET>(b.v, a.v));
#define VECTOR_EXTRACT_GENERATOR_256_U16(OFFSET) return U16x16(impl_extractBytes_AVX2<OFFSET * 2>(b.v, a.v));
#define VECTOR_EXTRACT_GENERATOR_256_U32(OFFSET) return U32x8(impl_extractBytes_AVX2<OFFSET * 4>(b.v, a.v));
#define VECTOR_EXTRACT_GENERATOR_256_I32(OFFSET) return I32x8(impl_extractBytes_AVX2<OFFSET * 4>(b.v, a.v));
#define VECTOR_EXTRACT_GENERATOR_256_F32(OFFSET) return F32x8(SIMD_F32x8(impl_extractBytes_AVX2<OFFSET * 4>(SIMD_U32x8(b.v), SIMD_U32x8(a.v))));
#else
template<typename T, int elementCount>
T vectorExtract_emulated(const T &a, const T &b, int offset) {
	// For safety reasons, uninitialized default construction of SIMD vectors is only available during full emulation, not partial AVX support.
	T result = T::create_dangerous_uninitialized();
	int t = 0;
	for (int s = offset; s < elementCount; s++) {
		result.scalars[t] = a.scalars[s];
		t++;
	}
	for (int s = 0; s < offset; s++) {
		result.scalars[t] = b.scalars[s];
		t++;
	}
	return result;
}
#define VECTOR_EXTRACT_GENERATOR_256_U8(OFFSET) return vectorExtract_emulated<U8x32, 32>(a, b, OFFSET);
#define VECTOR_EXTRACT_GENERATOR_256_U16(OFFSET) return vectorExtract_emulated<U16x16, 16>(a, b, OFFSET);
#define VECTOR_EXTRACT_GENERATOR_256_U32(OFFSET) return vectorExtract_emulated<U32x8, 8>(a, b, OFFSET);
#define VECTOR_EXTRACT_GENERATOR_256_I32(OFFSET) return vectorExtract_emulated<I32x8, 8>(a, b, OFFSET);
#define VECTOR_EXTRACT_GENERATOR_256_F32(OFFSET) return vectorExtract_emulated<F32x8, 8>(a, b, OFFSET);
#endif
// Vector extraction concatenates two input vectors and reads a vector between them using an offset.
// The first and last offsets, which only return one of the inputs, can be used for readability, because they will be inlined and removed by the compiler.
// To get elements from the right side, combine the center vector with the right vector and shift one element to the left using vectorExtract_1 for the given type.
// To get elements from the left side, combine the left vector with the center vector and shift one element to the right using vectorExtract_15 for 16 lanes, vectorExtract_7 for 8 lanes, or vectorExtract_3 for 4 lanes.
// A usage sketch follows the extraction functions below.
// TODO: Also allow using a template argument as the element offset with a static assert for the offset, which might be useful in template programming.
U8x32 inline vectorExtract_0(const U8x32 &a, const U8x32 &b) { return a; }
U8x32 inline vectorExtract_1(const U8x32 &a, const U8x32 &b) { VECTOR_EXTRACT_GENERATOR_256_U8(1) }
U8x32 inline vectorExtract_2(const U8x32 &a, const U8x32 &b) { VECTOR_EXTRACT_GENERATOR_256_U8(2) }
U8x32 inline vectorExtract_3(const U8x32 &a, const U8x32 &b) { VECTOR_EXTRACT_GENERATOR_256_U8(3) }
U8x32 inline vectorExtract_4(const U8x32 &a, const U8x32 &b) { VECTOR_EXTRACT_GENERATOR_256_U8(4) }
U8x32 inline vectorExtract_5(const U8x32 &a, const U8x32 &b) { VECTOR_EXTRACT_GENERATOR_256_U8(5) }
U8x32 inline vectorExtract_6(const U8x32 &a, const U8x32 &b) { VECTOR_EXTRACT_GENERATOR_256_U8(6) }
U8x32 inline vectorExtract_7(const U8x32 &a, const U8x32 &b) { VECTOR_EXTRACT_GENERATOR_256_U8(7) }
U8x32 inline vectorExtract_8(const U8x32 &a, const U8x32 &b) { VECTOR_EXTRACT_GENERATOR_256_U8(8) }
U8x32 inline vectorExtract_9(const U8x32 &a, const U8x32 &b) { VECTOR_EXTRACT_GENERATOR_256_U8(9) }
U8x32 inline vectorExtract_10(const U8x32 &a, const U8x32 &b) { VECTOR_EXTRACT_GENERATOR_256_U8(10) }
U8x32 inline vectorExtract_11(const U8x32 &a, const U8x32 &b) { VECTOR_EXTRACT_GENERATOR_256_U8(11) }
U8x32 inline vectorExtract_12(const U8x32 &a, const U8x32 &b) { VECTOR_EXTRACT_GENERATOR_256_U8(12) }
U8x32 inline vectorExtract_13(const U8x32 &a, const U8x32 &b) { VECTOR_EXTRACT_GENERATOR_256_U8(13) }
U8x32 inline vectorExtract_14(const U8x32 &a, const U8x32 &b) { VECTOR_EXTRACT_GENERATOR_256_U8(14) }
U8x32 inline vectorExtract_15(const U8x32 &a, const U8x32 &b) { VECTOR_EXTRACT_GENERATOR_256_U8(15) }
U8x32 inline vectorExtract_16(const U8x32 &a, const U8x32 &b) { VECTOR_EXTRACT_GENERATOR_256_U8(16) }
U8x32 inline vectorExtract_17(const U8x32 &a, const U8x32 &b) { VECTOR_EXTRACT_GENERATOR_256_U8(17) }
U8x32 inline vectorExtract_18(const U8x32 &a, const U8x32 &b) { VECTOR_EXTRACT_GENERATOR_256_U8(18) }
U8x32 inline vectorExtract_19(const U8x32 &a, const U8x32 &b) { VECTOR_EXTRACT_GENERATOR_256_U8(19) }
U8x32 inline vectorExtract_20(const U8x32 &a, const U8x32 &b) { VECTOR_EXTRACT_GENERATOR_256_U8(20) }
U8x32 inline vectorExtract_21(const U8x32 &a, const U8x32 &b) { VECTOR_EXTRACT_GENERATOR_256_U8(21) }
U8x32 inline vectorExtract_22(const U8x32 &a, const U8x32 &b) { VECTOR_EXTRACT_GENERATOR_256_U8(22) }
U8x32 inline vectorExtract_23(const U8x32 &a, const U8x32 &b) { VECTOR_EXTRACT_GENERATOR_256_U8(23) }
U8x32 inline vectorExtract_24(const U8x32 &a, const U8x32 &b) { VECTOR_EXTRACT_GENERATOR_256_U8(24) }
U8x32 inline vectorExtract_25(const U8x32 &a, const U8x32 &b) { VECTOR_EXTRACT_GENERATOR_256_U8(25) }
U8x32 inline vectorExtract_26(const U8x32 &a, const U8x32 &b) { VECTOR_EXTRACT_GENERATOR_256_U8(26) }
U8x32 inline vectorExtract_27(const U8x32 &a, const U8x32 &b) { VECTOR_EXTRACT_GENERATOR_256_U8(27) }
U8x32 inline vectorExtract_28(const U8x32 &a, const U8x32 &b) { VECTOR_EXTRACT_GENERATOR_256_U8(28) }
U8x32 inline vectorExtract_29(const U8x32 &a, const U8x32 &b) { VECTOR_EXTRACT_GENERATOR_256_U8(29) }
U8x32 inline vectorExtract_30(const U8x32 &a, const U8x32 &b) { VECTOR_EXTRACT_GENERATOR_256_U8(30) }
U8x32 inline vectorExtract_31(const U8x32 &a, const U8x32 &b) { VECTOR_EXTRACT_GENERATOR_256_U8(31) }
U8x32 inline vectorExtract_32(const U8x32 &a, const U8x32 &b) { return b; }
U16x16 inline vectorExtract_0(const U16x16 &a, const U16x16 &b) { return a; }
U16x16 inline vectorExtract_1(const U16x16 &a, const U16x16 &b) { VECTOR_EXTRACT_GENERATOR_256_U16(1) }
U16x16 inline vectorExtract_2(const U16x16 &a, const U16x16 &b) { VECTOR_EXTRACT_GENERATOR_256_U16(2) }
U16x16 inline vectorExtract_3(const U16x16 &a, const U16x16 &b) { VECTOR_EXTRACT_GENERATOR_256_U16(3) }
U16x16 inline vectorExtract_4(const U16x16 &a, const U16x16 &b) { VECTOR_EXTRACT_GENERATOR_256_U16(4) }
U16x16 inline vectorExtract_5(const U16x16 &a, const U16x16 &b) { VECTOR_EXTRACT_GENERATOR_256_U16(5) }
U16x16 inline vectorExtract_6(const U16x16 &a, const U16x16 &b) { VECTOR_EXTRACT_GENERATOR_256_U16(6) }
U16x16 inline vectorExtract_7(const U16x16 &a, const U16x16 &b) { VECTOR_EXTRACT_GENERATOR_256_U16(7) }
U16x16 inline vectorExtract_8(const U16x16 &a, const U16x16 &b) { VECTOR_EXTRACT_GENERATOR_256_U16(8) }
U16x16 inline vectorExtract_9(const U16x16 &a, const U16x16 &b) { VECTOR_EXTRACT_GENERATOR_256_U16(9) }
U16x16 inline vectorExtract_10(const U16x16 &a, const U16x16 &b) { VECTOR_EXTRACT_GENERATOR_256_U16(10) }
U16x16 inline vectorExtract_11(const U16x16 &a, const U16x16 &b) { VECTOR_EXTRACT_GENERATOR_256_U16(11) }
U16x16 inline vectorExtract_12(const U16x16 &a, const U16x16 &b) { VECTOR_EXTRACT_GENERATOR_256_U16(12) }
U16x16 inline vectorExtract_13(const U16x16 &a, const U16x16 &b) { VECTOR_EXTRACT_GENERATOR_256_U16(13) }
U16x16 inline vectorExtract_14(const U16x16 &a, const U16x16 &b) { VECTOR_EXTRACT_GENERATOR_256_U16(14) }
U16x16 inline vectorExtract_15(const U16x16 &a, const U16x16 &b) { VECTOR_EXTRACT_GENERATOR_256_U16(15) }
U16x16 inline vectorExtract_16(const U16x16 &a, const U16x16 &b) { return b; }
U32x8 inline vectorExtract_0(const U32x8 &a, const U32x8 &b) { return a; }
U32x8 inline vectorExtract_1(const U32x8 &a, const U32x8 &b) { VECTOR_EXTRACT_GENERATOR_256_U32(1) }
U32x8 inline vectorExtract_2(const U32x8 &a, const U32x8 &b) { VECTOR_EXTRACT_GENERATOR_256_U32(2) }
U32x8 inline vectorExtract_3(const U32x8 &a, const U32x8 &b) { VECTOR_EXTRACT_GENERATOR_256_U32(3) }
U32x8 inline vectorExtract_4(const U32x8 &a, const U32x8 &b) { VECTOR_EXTRACT_GENERATOR_256_U32(4) }
U32x8 inline vectorExtract_5(const U32x8 &a, const U32x8 &b) { VECTOR_EXTRACT_GENERATOR_256_U32(5) }
U32x8 inline vectorExtract_6(const U32x8 &a, const U32x8 &b) { VECTOR_EXTRACT_GENERATOR_256_U32(6) }
U32x8 inline vectorExtract_7(const U32x8 &a, const U32x8 &b) { VECTOR_EXTRACT_GENERATOR_256_U32(7) }
U32x8 inline vectorExtract_8(const U32x8 &a, const U32x8 &b) { return b; }
I32x8 inline vectorExtract_0(const I32x8 &a, const I32x8 &b) { return a; }
I32x8 inline vectorExtract_1(const I32x8 &a, const I32x8 &b) { VECTOR_EXTRACT_GENERATOR_256_I32(1) }
I32x8 inline vectorExtract_2(const I32x8 &a, const I32x8 &b) { VECTOR_EXTRACT_GENERATOR_256_I32(2) }
I32x8 inline vectorExtract_3(const I32x8 &a, const I32x8 &b) { VECTOR_EXTRACT_GENERATOR_256_I32(3) }
I32x8 inline vectorExtract_4(const I32x8 &a, const I32x8 &b) { VECTOR_EXTRACT_GENERATOR_256_I32(4) }
I32x8 inline vectorExtract_5(const I32x8 &a, const I32x8 &b) { VECTOR_EXTRACT_GENERATOR_256_I32(5) }
I32x8 inline vectorExtract_6(const I32x8 &a, const I32x8 &b) { VECTOR_EXTRACT_GENERATOR_256_I32(6) }
I32x8 inline vectorExtract_7(const I32x8 &a, const I32x8 &b) { VECTOR_EXTRACT_GENERATOR_256_I32(7) }
I32x8 inline vectorExtract_8(const I32x8 &a, const I32x8 &b) { return b; }
F32x8 inline vectorExtract_0(const F32x8 &a, const F32x8 &b) { return a; }
F32x8 inline vectorExtract_1(const F32x8 &a, const F32x8 &b) { VECTOR_EXTRACT_GENERATOR_256_F32(1) }
F32x8 inline vectorExtract_2(const F32x8 &a, const F32x8 &b) { VECTOR_EXTRACT_GENERATOR_256_F32(2) }
F32x8 inline vectorExtract_3(const F32x8 &a, const F32x8 &b) { VECTOR_EXTRACT_GENERATOR_256_F32(3) }
F32x8 inline vectorExtract_4(const F32x8 &a, const F32x8 &b) { VECTOR_EXTRACT_GENERATOR_256_F32(4) }
F32x8 inline vectorExtract_5(const F32x8 &a, const F32x8 &b) { VECTOR_EXTRACT_GENERATOR_256_F32(5) }
F32x8 inline vectorExtract_6(const F32x8 &a, const F32x8 &b) { VECTOR_EXTRACT_GENERATOR_256_F32(6) }
F32x8 inline vectorExtract_7(const F32x8 &a, const F32x8 &b) { VECTOR_EXTRACT_GENERATOR_256_F32(7) }
F32x8 inline vectorExtract_8(const F32x8 &a, const F32x8 &b) { return b; }
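// Usage sketch (an illustrative example, assuming leftVector, centerVector and rightVector are three consecutive
// F32x8 values from the same buffer, and that F32x8 addition and multiplication are declared earlier in this header):
//   F32x8 previous = vectorExtract_7(leftVector, centerVector);  // last lane of leftVector followed by the first 7 lanes of centerVector
//   F32x8 next     = vectorExtract_1(centerVector, rightVector); // last 7 lanes of centerVector followed by the first lane of rightVector
//   F32x8 blurred  = (previous + centerVector + next) * F32x8(1.0f / 3.0f); // a simple 3-tap average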
// Gather instructions load memory from a pointer at multiple index offsets at the same time.
// The given pointers should be aligned with 4 bytes, so that the fallback solution works on machines with strict alignment requirements.
#if defined USE_AVX2
#define GATHER_I32x8_AVX2(SOURCE, EIGHT_OFFSETS, SCALE) _mm256_i32gather_epi32((const int32_t*)(SOURCE), EIGHT_OFFSETS, SCALE)
#define GATHER_U32x8_AVX2(SOURCE, EIGHT_OFFSETS, SCALE) _mm256_i32gather_epi32((const int32_t*)(SOURCE), EIGHT_OFFSETS, SCALE)
#define GATHER_F32x8_AVX2(SOURCE, EIGHT_OFFSETS, SCALE) _mm256_i32gather_ps((const float*)(SOURCE), EIGHT_OFFSETS, SCALE)
#endif
static inline U32x8 gather_U32(dsr::SafePointer<const uint32_t> data, const U32x8 &elementOffset) {
#if defined USE_AVX2
	// TODO: Implement safety checks for debug mode.
	return U32x8(GATHER_U32x8_AVX2(data.getUnsafe(), elementOffset.v, 4));
#else
	ALIGN32 uint32_t elementOffsets[8];
	elementOffset.writeAlignedUnsafe(elementOffsets);
	return U32x8(
		*(data + elementOffsets[0]),
		*(data + elementOffsets[1]),
		*(data + elementOffsets[2]),
		*(data + elementOffsets[3]),
		*(data + elementOffsets[4]),
		*(data + elementOffsets[5]),
		*(data + elementOffsets[6]),
		*(data + elementOffsets[7])
	);
#endif
}
static inline I32x8 gather_I32(dsr::SafePointer<const int32_t> data, const U32x8 &elementOffset) {
#if defined USE_AVX2
	// TODO: Implement safety checks for debug mode.
	return I32x8(GATHER_I32x8_AVX2(data.getUnsafe(), elementOffset.v, 4));
#else
	ALIGN32 uint32_t elementOffsets[8];
	elementOffset.writeAlignedUnsafe(elementOffsets);
	return I32x8(
		*(data + elementOffsets[0]),
		*(data + elementOffsets[1]),
		*(data + elementOffsets[2]),
		*(data + elementOffsets[3]),
		*(data + elementOffsets[4]),
		*(data + elementOffsets[5]),
		*(data + elementOffsets[6]),
		*(data + elementOffsets[7])
	);
#endif
}
static inline F32x8 gather_F32(dsr::SafePointer<const float> data, const U32x8 &elementOffset) {
#if defined USE_AVX2
	// TODO: Implement safety checks for debug mode.
	return F32x8(GATHER_F32x8_AVX2(data.getUnsafe(), elementOffset.v, 4));
#else
	ALIGN32 uint32_t elementOffsets[8];
	elementOffset.writeAlignedUnsafe(elementOffsets);
	return F32x8(
		*(data + elementOffsets[0]),
		*(data + elementOffsets[1]),
		*(data + elementOffsets[2]),
		*(data + elementOffsets[3]),
		*(data + elementOffsets[4]),
		*(data + elementOffsets[5]),
		*(data + elementOffsets[6]),
		*(data + elementOffsets[7])
	);
#endif
}
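// Usage sketch (an illustrative example, assuming table is an existing dsr::SafePointer<const uint32_t> to a
// 4-byte aligned lookup table and indices is a U32x8 of element offsets within its bounds):
//   U32x8 looked = gather_U32(table, indices); // reads table[indices[i]] for each of the eight lanes in one call
// Gathering replaces eight scalar loads when the indices are data dependent, such as palette lookups.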
// Wrapper functions for explicitly expanding scalars into vectors during math operations.
#define NUMERICAL_SCALAR_OPERATIONS(VECTOR_TYPE, ELEMENT_TYPE, LANE_COUNT) \
	inline VECTOR_TYPE operator+(const VECTOR_TYPE& left, ELEMENT_TYPE right) { return left + VECTOR_TYPE(right); } \
	inline VECTOR_TYPE operator+(ELEMENT_TYPE left, const VECTOR_TYPE& right) { return VECTOR_TYPE(left) + right; } \
	inline VECTOR_TYPE operator-(const VECTOR_TYPE& left, ELEMENT_TYPE right) { return left - VECTOR_TYPE(right); } \
	inline VECTOR_TYPE operator-(ELEMENT_TYPE left, const VECTOR_TYPE& right) { return VECTOR_TYPE(left) - right; }
FOR_ALL_VECTOR_TYPES(NUMERICAL_SCALAR_OPERATIONS)
#undef NUMERICAL_SCALAR_OPERATIONS
#define MULTIPLY_SCALAR_OPERATIONS(VECTOR_TYPE, ELEMENT_TYPE, LANE_COUNT) \
	inline VECTOR_TYPE operator*(const VECTOR_TYPE& left, ELEMENT_TYPE right) { return left * VECTOR_TYPE(right); } \
	inline VECTOR_TYPE operator*(ELEMENT_TYPE left, const VECTOR_TYPE& right) { return VECTOR_TYPE(left) * right; }
// TODO: Implement multiplication for U8x16 and U8x32.
//FOR_ALL_VECTOR_TYPES(MULTIPLY_SCALAR_OPERATIONS)
MULTIPLY_SCALAR_OPERATIONS(F32x4, float, 4)
MULTIPLY_SCALAR_OPERATIONS(F32x8, float, 8)
MULTIPLY_SCALAR_OPERATIONS(U32x4, uint32_t, 4)
MULTIPLY_SCALAR_OPERATIONS(U32x8, uint32_t, 8)
MULTIPLY_SCALAR_OPERATIONS(I32x4, int32_t, 4)
MULTIPLY_SCALAR_OPERATIONS(I32x8, int32_t, 8)
MULTIPLY_SCALAR_OPERATIONS(U16x8, uint16_t, 8)
MULTIPLY_SCALAR_OPERATIONS(U16x16, uint16_t, 16)
#undef MULTIPLY_SCALAR_OPERATIONS
// Wrapper functions for explicitly duplicating bit masks into the same lane count.
#define BITWISE_SCALAR_OPERATIONS(VECTOR_TYPE, ELEMENT_TYPE, LANE_COUNT) \
	inline VECTOR_TYPE operator&(const VECTOR_TYPE& left, ELEMENT_TYPE right) { return left & VECTOR_TYPE(right); } \
	inline VECTOR_TYPE operator&(ELEMENT_TYPE left, const VECTOR_TYPE& right) { return VECTOR_TYPE(left) & right; } \
	inline VECTOR_TYPE operator|(const VECTOR_TYPE& left, ELEMENT_TYPE right) { return left | VECTOR_TYPE(right); } \
	inline VECTOR_TYPE operator|(ELEMENT_TYPE left, const VECTOR_TYPE& right) { return VECTOR_TYPE(left) | right; } \
	inline VECTOR_TYPE operator^(const VECTOR_TYPE& left, ELEMENT_TYPE right) { return left ^ VECTOR_TYPE(right); } \
	inline VECTOR_TYPE operator^(ELEMENT_TYPE left, const VECTOR_TYPE& right) { return VECTOR_TYPE(left) ^ right; }
// TODO: Implement bitwise operations for all unsigned SIMD vectors.
//FOR_UNSIGNED_VECTOR_TYPES(BITWISE_SCALAR_OPERATIONS)
BITWISE_SCALAR_OPERATIONS(U32x4, uint32_t, 4)
BITWISE_SCALAR_OPERATIONS(U32x8, uint32_t, 8)
#undef BITWISE_SCALAR_OPERATIONS
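// Usage sketch (an illustrative example, assuming colors is an existing U32x8 value and positions is an existing F32x8 value):
//   U32x8 lowBytes = colors & 255u;    // the scalar mask is broadcast into all eight lanes before the bitwise AND
//   F32x8 scaled   = positions * 2.0f; // the scalar factor is broadcast into all eight lanes before the multiplication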
// Cleaning up temporary macro definitions to avoid cluttering the namespace.
#undef FOR_ALL_VECTOR_TYPES
#undef FOR_FLOAT_VECTOR_TYPES
#undef FOR_INTEGER_VECTOR_TYPES
#undef FOR_SIGNED_VECTOR_TYPES
#undef FOR_UNSIGNED_VECTOR_TYPES
// TODO: Let SVE define completely separate types for dynamic vectors.
// The X vectors use the longest SIMD length that is efficient for both floating-point and integer types.
// DSR_DEFAULT_ALIGNMENT
//   The number of bytes memory should be aligned with by default when creating buffers and images.
// F32xX
//   The longest available SIMD vector for storing 32-bit float values. Iterating laneCountX_32Bit floats at a time.
// I32xX
//   The longest available SIMD vector for storing signed 32-bit integer values. Iterating laneCountX_32Bit integers at a time.
// U32xX
//   The longest available SIMD vector for storing unsigned 32-bit integer values. Iterating laneCountX_32Bit integers at a time.
// U16xX
//   The longest available SIMD vector for storing unsigned 16-bit integer values. Iterating laneCountX_16Bit integers at a time.
// U8xX
//   The longest available SIMD vector for storing unsigned 8-bit integer values. Iterating laneCountX_8Bit integers at a time.
#if defined USE_256BIT_X_SIMD || defined EMULATE_256BIT_X_SIMD
// Using 256-bit SIMD.
#define DSR_DEFAULT_VECTOR_SIZE 32
#define DSR_DEFAULT_ALIGNMENT 32
using F32xX = F32x8;
using I32xX = I32x8;
using U32xX = U32x8;
using U16xX = U16x16;
using U8xX = U8x32;
// Align memory with 256 bits to allow overwriting padding at the end of each pixel row.
// Otherwise you would have to preserve data at the end of each row with slow and bloated duplicated code in every filter.
#else
// If there is no hardware support for 256-bit vectors, emulated 256-bit vectors that are used explicitly only need to be aligned with 128 bits.
#define DSR_DEFAULT_VECTOR_SIZE 16
#define DSR_DEFAULT_ALIGNMENT 16
using F32xX = F32x4;
using I32xX = I32x4;
using U32xX = U32x4;
using U16xX = U16x8;
using U8xX = U8x16;
#endif
// How many lanes the longest available vectors have for a specified lane size.
// Used to iterate indices and pointers using whole elements.
static const int laneCountX_32Bit = DSR_DEFAULT_VECTOR_SIZE / 4;
static const int laneCountX_16Bit = DSR_DEFAULT_VECTOR_SIZE / 2;
static const int laneCountX_8Bit = DSR_DEFAULT_VECTOR_SIZE;
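// Usage sketch (an illustrative example): the lane count constants keep loops independent of the vector size
// selected above, so the same code runs with either 128-bit or 256-bit vectors. The load and store steps are
// placeholders for whichever aligned read and write helpers this header declares elsewhere.
//   for (int i = 0; i < pixelCount; i += laneCountX_8Bit) {
//       U8xX pixels = /* aligned load of laneCountX_8Bit bytes from row + i */;
//       U8xX brighter = saturatedAddition(pixels, offset); // assuming the matching U8x16 overload exists for 128-bit builds
//       /* aligned store of brighter back to row + i */
//   }
// Allocating rows with DSR_DEFAULT_ALIGNMENT lets the last vector write into padding instead of needing a scalar tail loop.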
// TODO: Let SVE define completely separate types for dynamic vectors.
// The F vectors use the longest SIMD length that is efficient when only processing float values, even if no integer types are available in the same size.
// Used when you know that your algorithm is only going to work with float types and you need the extra performance.
// Some processors have AVX but not AVX2, meaning that they have 256-bit SIMD for floats but only 128-bit SIMD for integers.
// F32xF
//   The longest available SIMD vector for storing 32-bit float values. Iterating laneCountF floats at a time.
#if defined USE_256BIT_F_SIMD || defined EMULATE_256BIT_F_SIMD
#define DSR_FLOAT_VECTOR_SIZE 32
#define DSR_FLOAT_ALIGNMENT 32
using F32xF = F32x8;
#else
// F vectors are 128 bits.
#define DSR_FLOAT_VECTOR_SIZE 16
#define DSR_FLOAT_ALIGNMENT 16
using F32xF = F32x4;
#endif
// Used to iterate over float pointers when using F32xF.
static const int laneCountF = DSR_FLOAT_VECTOR_SIZE / 4;
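// Usage sketch (an illustrative example): prefer F32xF and laneCountF over F32xX for float-only math, because a
// processor with AVX but without AVX2 still gets 256-bit float vectors while the X types fall back to 128 bits.
// The load and store steps are placeholders for whichever aligned read and write helpers this header declares elsewhere.
//   for (int i = 0; i < valueCount; i += laneCountF) {
//       F32xF values = /* aligned load of laneCountF floats from buffer + i */;
//       values = values * 0.5f + 1.0f; // the scalar wrappers above are defined for both F32x4 and F32x8
//       /* aligned store of values back to buffer + i */
//   }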
// Define traits.
DSR_APPLY_PROPERTY(DsrTrait_Any_U8 , U8x16)
DSR_APPLY_PROPERTY(DsrTrait_Any_U8 , U8x32)
DSR_APPLY_PROPERTY(DsrTrait_Any_U16, U16x8)
DSR_APPLY_PROPERTY(DsrTrait_Any_U16, U16x16)
DSR_APPLY_PROPERTY(DsrTrait_Any_U32, U32x4)
DSR_APPLY_PROPERTY(DsrTrait_Any_U32, U32x8)
DSR_APPLY_PROPERTY(DsrTrait_Any_I32, I32x4)
DSR_APPLY_PROPERTY(DsrTrait_Any_I32, I32x8)
DSR_APPLY_PROPERTY(DsrTrait_Any_F32, F32x4)
DSR_APPLY_PROPERTY(DsrTrait_Any_F32, F32x8)
DSR_APPLY_PROPERTY(DsrTrait_Any    , U8x16)
DSR_APPLY_PROPERTY(DsrTrait_Any    , U8x32)
DSR_APPLY_PROPERTY(DsrTrait_Any    , U16x8)
DSR_APPLY_PROPERTY(DsrTrait_Any    , U16x16)
DSR_APPLY_PROPERTY(DsrTrait_Any    , U32x4)
DSR_APPLY_PROPERTY(DsrTrait_Any    , U32x8)
DSR_APPLY_PROPERTY(DsrTrait_Any    , I32x4)
DSR_APPLY_PROPERTY(DsrTrait_Any    , I32x8)
DSR_APPLY_PROPERTY(DsrTrait_Any    , F32x4)
DSR_APPLY_PROPERTY(DsrTrait_Any    , F32x8)
// TODO: Use as independent types when the largest vector lengths are not known in compile time on ARM SVE.
//DSR_APPLY_PROPERTY(DsrTrait_Any_U8 , U8xX)
//DSR_APPLY_PROPERTY(DsrTrait_Any_U16, U16xX)
//DSR_APPLY_PROPERTY(DsrTrait_Any_U32, U32xX)
//DSR_APPLY_PROPERTY(DsrTrait_Any_I32, I32xX)
//DSR_APPLY_PROPERTY(DsrTrait_Any_F32, F32xX)
//DSR_APPLY_PROPERTY(DsrTrait_Any_F32, F32xF)
//DSR_APPLY_PROPERTY(DsrTrait_Any    , U8xX)
//DSR_APPLY_PROPERTY(DsrTrait_Any    , U16xX)
//DSR_APPLY_PROPERTY(DsrTrait_Any    , U32xX)
//DSR_APPLY_PROPERTY(DsrTrait_Any    , I32xX)
//DSR_APPLY_PROPERTY(DsrTrait_Any    , F32xX)
//DSR_APPLY_PROPERTY(DsrTrait_Any    , F32xF)
}
#endif