- // zlib open source license
- //
- // Copyright (c) 2017 to 2023 David Forsgren Piuva
- //
- // This software is provided 'as-is', without any express or implied
- // warranty. In no event will the authors be held liable for any damages
- // arising from the use of this software.
- //
- // Permission is granted to anyone to use this software for any purpose,
- // including commercial applications, and to alter it and redistribute it
- // freely, subject to the following restrictions:
- //
- // 1. The origin of this software must not be misrepresented; you must not
- // claim that you wrote the original software. If you use this software
- // in a product, an acknowledgment in the product documentation would be
- // appreciated but is not required.
- //
- // 2. Altered source versions must be plainly marked as such, and must not be
- // misrepresented as being the original software.
- //
- // 3. This notice may not be removed or altered from any source
- // distribution.
- // Hardware abstraction layer for portable SIMD math.
- // Used to make calculations faster without having to mess around with hardware-specific assembly code or intrinsic functions.
- // You get the performance you need today and the ability to compile with automatic scalar emulation when building for a future processor that this module has not yet been ported to.
- // When you can generate the vectorized code using the same template function as your non-vectorized code, you don't even need to write a reference implementation.
- // Using with 128-bit SIMD: (beginner friendly, test once, compile anywhere, no compiler flags)
- // If you are new to vectorization or only plan to work with 128-bit extensions such as ARM NEON, you can keep it simple by only using the 128-bit vector types.
- // Pros and cons:
- // + Most target platforms (excluding older systems such as ARMv6) have 128-bit SIMD extensions such as Intel SSE2 or ARM NEON enabled by default.
- //   ARMv6 does not support ARM NEON, but most ARMv7 processors do, so compilers usually enable NEON by default for ARMv7.
- //   All 64-bit ARM processors have ARM NEON, because it stopped being optional in ARMv8.
- //   Builds for 64-bit Intel processors usually have SSE2 enabled by default, so you don't have to change any compiler flags when building on a different system.
- // + One build for all computers of the same instruction set.
- // Great when your application is not so resource heavy, because the least powerful systems don't have the fancy extensions anyway.
- // - You might end up enabling the additional SIMD extensions anyway, because the library itself already uses them to run faster.
- // Types:
- // * Use F32x4, I32x4 and U32x4 for 4 elements at a time
- // * U16x8 for 8 elements at a time
- // * U8x16 for 16 elements at a time
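- // Example: a minimal usage sketch of the 128-bit types, assuming only what is declared further down in this header (ALIGN16, F32x4).
- //   float input[4] ALIGN16 = {0.1f, -0.5f, 1.2f, 0.8f};
- //   float output[4] ALIGN16;
- //   F32x4 color = F32x4::readAlignedUnsafe(input);       // Load four floats at once.
- //   color.clamp(0.0f, 1.0f).writeAlignedUnsafe(output);  // Clamp all four lanes and store them.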
- // Using the X vector size: (advanced, requires testing with different build flags or emulation)
- // If you want more performance, you can use variable length type aliases.
- // Pros and cons:
- //   + For heavy calculations where memory access is not the bottleneck, using larger SIMD vectors when enabled allows saving energy and increasing performance.
- //   - If you forget to test with longer vector lengths (compiling with -mavx2 or defining EMULATE_256BIT_X_SIMD), you might find bugs from not iterating or aligning memory correctly.
- // Types:
- // * Use F32xX, I32xX and U32xX for laneCountX_32Bit elements at a time
- // * U16xX for laneCountX_16Bit elements at a time
- // * U8xX for laneCountX_8Bit elements at a time
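- // Example: an informal sketch of iterating with the X vector size, assuming F32xX and laneCountX_32Bit are
- // defined later in this header as described above, that elementCount is a multiple of the lane count, and
- // that the F32x4 methods shown below are mirrored for every enabled vector length.
- //   for (int i = 0; i < elementCount; i += laneCountX_32Bit) {
- //       F32xX value = F32xX::readAlignedUnsafe(source + i); // Same loop body for any enabled vector length.
- //       value.clampLower(0.0f).writeAlignedUnsafe(target + i);
- //   }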
- // Using the F vector size: (very dangerous, no test can confirm that memory alignment is correct)
- // If you want even more performance, you can let float operations use the longest available F vector size, which might exceed the X vector size.
- // Pros and cons:
- // - Have to manually set the alignment of buffers to DSR_FLOAT_ALIGNMENT to prevent crashing.
- //   If the default alignment for buffers followed the F vector size, the more commonly used X vectors would be slowed down by cache misses from padding larger than the X vector size.
- // AlignedImageF32 and sound backends are already aligned with the F vector size, because they are not generic like Buffer.
- // - It can be difficult to detect incorrect memory alignment, because a pointer can accidentally be aligned to more than what was requested.
- //   If accidentally aligning to 128 bits instead of 256 bits, there is a 50% risk of failing to detect it at runtime, only to fail later on another computer.
- // If sticking with 128-bit or X vectors, all buffers will be correctly aligned automatically.
- //   + For heavy calculations where memory access is not the bottleneck, using larger SIMD vectors when enabled allows saving energy and increasing performance.
- //   - If you forget to test with longer vector lengths (compiling with -mavx2 or defining EMULATE_256BIT_F_SIMD), you might find bugs from not iterating or aligning memory correctly.
- // Types:
- //   * Use F32xF for the largest available number of 32-bit float elements at a time
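- // Example: an informal sketch of allocating data for the F vector size, assuming DSR_FLOAT_ALIGNMENT is
- // provided by settings.h as mentioned above and ALIGN_BYTES is the macro defined further down.
- //   float samples[256] ALIGN_BYTES(DSR_FLOAT_ALIGNMENT); // Safe to read with the largest float vector.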
- // Compiler extensions
- // On Intel/AMD processors:
- // SSE2 is usually enabled by default, because SSE2 is mandatory for 64-bit Intel instructions.
- // Use -mavx as a G++ compiler flag to enable the AVX extension, enabling the USE_AVX and USE_256BIT_F_SIMD macros.
- // If not available on your computer, you can test your algorithm for 256-bit float SIMD using EMULATE_256BIT_F_SIMD, but only if you use the F vector size.
- // Use -mavx2 as a G++ compiler flag to enable the AVX2 extension, enabling the USE_AVX2 and USE_256BIT_X_SIMD macros.
- // If not available on your computer, you can test your algorithm for 256-bit float and integer SIMD using EMULATE_256BIT_X_SIMD, but only if you use the X vector size.
- // On ARMv6 processors:
- // Scalar emulation is used when compiling for ARMv6, because it does not have NEON and VFP is not supported in this abstraction.
- // On ARMv7 processors:
- // NEON is usually enabled by default for ARMv7, because most of them have the extension.
- // On ARMv8 processors:
- //   NEON cannot be disabled when building for ARMv8, because the extension is mandatory in ARMv8.
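- // Example G++ commands (hypothetical file names) matching the flags described above:
- //   g++ -O2 main.cpp            # 64-bit x86: SSE2 and the 128-bit types, no extra flags needed
- //   g++ -O2 -mavx main.cpp      # also enables USE_AVX and the 256-bit F vector size
- //   g++ -O2 -mavx2 main.cpp     # also enables USE_AVX2 and the 256-bit X vector size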
- #ifndef DFPSR_SIMD
- #define DFPSR_SIMD
- #include <cstdint>
- #include <cassert>
- #include <cmath> // Used by the scalar fallbacks for square roots.
- #include "SafePointer.h"
- #include "../math/FVector.h"
- #include "../math/IVector.h"
- #include "../math/UVector.h"
- // Get settings from here.
- #include "../settings.h"
- // Alignment in bytes
- #define ALIGN_BYTES(SIZE) __attribute__((aligned(SIZE)))
- #define ALIGN16 ALIGN_BYTES(16) // 128-bit alignment
- #define ALIGN32 ALIGN_BYTES(32) // 256-bit alignment
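- // Example: float row[4] ALIGN16 = {0.0f, 0.0f, 0.0f, 0.0f}; // A 16-byte aligned stack array (GCC/Clang attribute syntax).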
- // Everything declared in here is specific to SSE.
- // Direct use of the macros will not provide portability to all hardware.
- #ifdef USE_SSE2
- #define USE_DIRECT_SIMD_MEMORY_ACCESS
- #include <emmintrin.h> // SSE2
- #ifdef USE_SSSE3
- #include <tmmintrin.h> // SSSE3
- #endif
- #ifdef USE_AVX
- #include <immintrin.h> // AVX / AVX2
- #endif
- // Vector types
- #define SIMD_F32x4 __m128
- #define SIMD_U8x16 __m128i
- #define SIMD_U16x8 __m128i
- #define SIMD_U32x4 __m128i
- #define SIMD_I32x4 __m128i
- // Vector uploads in address order
- #define LOAD_VECTOR_F32_SIMD(A, B, C, D) _mm_set_ps(D, C, B, A)
- #define LOAD_SCALAR_F32_SIMD(A) _mm_set1_ps(A)
- #define LOAD_VECTOR_U8_SIMD(A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P) _mm_set_epi8(P, O, N, M, L, K, J, I, H, G, F, E, D, C, B, A)
- #define LOAD_SCALAR_U8_SIMD(A) _mm_set1_epi8(A)
- #define LOAD_VECTOR_U16_SIMD(A, B, C, D, E, F, G, H) _mm_set_epi16(H, G, F, E, D, C, B, A)
- #define LOAD_SCALAR_U16_SIMD(A) _mm_set1_epi16(A)
- #define LOAD_VECTOR_U32_SIMD(A, B, C, D) _mm_set_epi32(D, C, B, A)
- #define LOAD_SCALAR_U32_SIMD(A) _mm_set1_epi32(A)
- #define LOAD_VECTOR_I32_SIMD(A, B, C, D) _mm_set_epi32(D, C, B, A)
- #define LOAD_SCALAR_I32_SIMD(A) _mm_set1_epi32(A)
- // Conversions
- #define F32_TO_I32_SIMD(A) _mm_cvttps_epi32(A)
- #define F32_TO_U32_SIMD(A) _mm_cvttps_epi32(A)
- #define I32_TO_F32_SIMD(A) _mm_cvtepi32_ps(A)
- #define U32_TO_F32_SIMD(A) _mm_cvtepi32_ps(A)
- // Unpacking conversions
- #define U8_LOW_TO_U16_SIMD(A) _mm_unpacklo_epi8(A, _mm_set1_epi8(0))
- #define U8_HIGH_TO_U16_SIMD(A) _mm_unpackhi_epi8(A, _mm_set1_epi8(0))
- #define U16_LOW_TO_U32_SIMD(A) _mm_unpacklo_epi16(A, _mm_set1_epi16(0))
- #define U16_HIGH_TO_U32_SIMD(A) _mm_unpackhi_epi16(A, _mm_set1_epi16(0))
- // Saturated packing
- // Credit: Using ideas from Victoria Zhislina's NEON_2_SSE.h header from the Intel corporation, but not trying to emulate NEON
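- // How it works: the mask clears the sign bit, and _mm_cmpgt_epi16 flags lanes whose sign bit was set
- // (the masked copy compares greater than the original as signed). Those lanes are forced to 0x7FFF, so the
- // signed saturation in _mm_packus_epi16 behaves like an unsigned 16-bit to 8-bit saturation.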
- inline SIMD_U8x16 PACK_SAT_U16_TO_U8_SIMD(const SIMD_U16x8& a, const SIMD_U16x8& b) {
- SIMD_U16x8 mask, a2, b2;
- mask = _mm_set1_epi16(0b0111111111111111);
- a2 = _mm_and_si128(a, mask);
- a2 = _mm_or_si128(a2, _mm_and_si128(_mm_cmpgt_epi16(a2, a), mask));
- b2 = _mm_and_si128(b, mask);
- b2 = _mm_or_si128(b2, _mm_and_si128(_mm_cmpgt_epi16(b2, b), mask));
- return _mm_packus_epi16(a2, b2);
- }
- // Reinterpret casting
- #define REINTERPRET_U32_TO_U8_SIMD(A) (A)
- #define REINTERPRET_U32_TO_U16_SIMD(A) (A)
- #define REINTERPRET_U8_TO_U32_SIMD(A) (A)
- #define REINTERPRET_U16_TO_U32_SIMD(A) (A)
- #define REINTERPRET_U32_TO_I32_SIMD(A) (A)
- #define REINTERPRET_I32_TO_U32_SIMD(A) (A)
- // Vector float operations returning SIMD_F32x4
- #define ADD_F32_SIMD(A, B) _mm_add_ps(A, B)
- #define SUB_F32_SIMD(A, B) _mm_sub_ps(A, B)
- #define MUL_F32_SIMD(A, B) _mm_mul_ps(A, B)
- // Vector integer operations returning SIMD_I32x4
- #define ADD_I32_SIMD(A, B) _mm_add_epi32(A, B)
- #define SUB_I32_SIMD(A, B) _mm_sub_epi32(A, B)
- // 32-bit integer multiplications are not available on SSE2.
- // Vector integer operations returning SIMD_U32x4
- #define ADD_U32_SIMD(A, B) _mm_add_epi32(A, B)
- #define SUB_U32_SIMD(A, B) _mm_sub_epi32(A, B)
- // 32-bit integer multiplications are not available on SSE2.
- // Vector integer operations returning SIMD_U16x8
- #define ADD_U16_SIMD(A, B) _mm_add_epi16(A, B)
- #define SUB_U16_SIMD(A, B) _mm_sub_epi16(A, B)
- #define MUL_U16_SIMD(A, B) _mm_mullo_epi16(A, B)
- // Vector integer operations returning SIMD_U8x16
- #define ADD_U8_SIMD(A, B) _mm_add_epi8(A, B)
- #define ADD_SAT_U8_SIMD(A, B) _mm_adds_epu8(A, B) // Saturated addition
- #define SUB_U8_SIMD(A, B) _mm_sub_epi8(A, B)
- #define SUB_SAT_U8_SIMD(A, B) _mm_subs_epu8(A, B) // Saturated subtraction
- // No 8-bit multiplications
- // Statistics
- #define MIN_F32_SIMD(A, B) _mm_min_ps(A, B)
- #define MAX_F32_SIMD(A, B) _mm_max_ps(A, B)
- // TODO: Implement minimum and maximum for integer vectors, so that all operations exist for all applicable types:
- // Using _mm256_max_epu16... in AVX2 for 256-bit versions
- // Using comparisons and masking in SSE2 when _mm_max_epu16... in SSE4.1 is not available
- // Bitwise
- #define BITWISE_AND_U32_SIMD(A, B) _mm_and_si128(A, B)
- #define BITWISE_OR_U32_SIMD(A, B) _mm_or_si128(A, B)
- #define BITWISE_XOR_U32_SIMD(A, B) _mm_xor_si128(A, B)
- #ifdef USE_AVX
- // 256-bit vector types
- #define SIMD_F32x8 __m256
- // Vector uploads in address order
- #define LOAD_VECTOR_F32_SIMD256(A, B, C, D, E, F, G, H) _mm256_set_ps(H, G, F, E, D, C, B, A)
- #define LOAD_SCALAR_F32_SIMD256(A) _mm256_set1_ps(A)
- // Vector float operations returning SIMD_F32x8
- #define ADD_F32_SIMD256(A, B) _mm256_add_ps(A, B)
- #define SUB_F32_SIMD256(A, B) _mm256_sub_ps(A, B)
- #define MUL_F32_SIMD256(A, B) _mm256_mul_ps(A, B)
- // Statistics
- #define MIN_F32_SIMD256(A, B) _mm256_min_ps(A, B)
- #define MAX_F32_SIMD256(A, B) _mm256_max_ps(A, B)
- #ifdef USE_AVX2
- // 256-bit vector types
- #define SIMD_U8x32 __m256i
- #define SIMD_U16x16 __m256i
- #define SIMD_U32x8 __m256i
- #define SIMD_I32x8 __m256i
- // Vector uploads in address order
- #define LOAD_VECTOR_U8_SIMD256(A1, B1, C1, D1, E1, F1, G1, H1, I1, J1, K1, L1, M1, N1, O1, P1, Q1, R1, S1, T1, U1, V1, W1, X1, Y1, Z1, A2, B2, C2, D2, E2, F2) _mm256_set_epi8(F2, E2, D2, C2, B2, A2, Z1, Y1, X1, W1, V1, U1, T1, S1, R1, Q1, P1, O1, N1, M1, L1, K1, J1, I1, H1, G1, F1, E1, D1, C1, B1, A1)
- #define LOAD_SCALAR_U8_SIMD256(A) _mm256_set1_epi8(A)
- #define LOAD_VECTOR_U16_SIMD256(A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P) _mm256_set_epi16(P, O, N, M, L, K, J, I, H, G, F, E, D, C, B, A)
- #define LOAD_SCALAR_U16_SIMD256(A) _mm256_set1_epi16(A)
- #define LOAD_VECTOR_U32_SIMD256(A, B, C, D, E, F, G, H) _mm256_set_epi32(H, G, F, E, D, C, B, A)
- #define LOAD_SCALAR_U32_SIMD256(A) _mm256_set1_epi32(A)
- #define LOAD_VECTOR_I32_SIMD256(A, B, C, D, E, F, G, H) _mm256_set_epi32(H, G, F, E, D, C, B, A)
- #define LOAD_SCALAR_I32_SIMD256(A) _mm256_set1_epi32(A)
- // Conversions
- #define F32_TO_I32_SIMD256(A) _mm256_cvttps_epi32(A)
- #define F32_TO_U32_SIMD256(A) _mm256_cvttps_epi32(A)
- #define I32_TO_F32_SIMD256(A) _mm256_cvtepi32_ps(A)
- #define U32_TO_F32_SIMD256(A) _mm256_cvtepi32_ps(A)
- // Unpacking conversions
- #define U8_LOW_TO_U16_SIMD256(A) _mm256_unpacklo_epi8(_mm256_permute4x64_epi64(A, 0b11011000), _mm256_set1_epi8(0))
- #define U8_HIGH_TO_U16_SIMD256(A) _mm256_unpackhi_epi8(_mm256_permute4x64_epi64(A, 0b11011000), _mm256_set1_epi8(0))
- #define U16_LOW_TO_U32_SIMD256(A) _mm256_unpacklo_epi16(_mm256_permute4x64_epi64(A, 0b11011000), _mm256_set1_epi16(0))
- #define U16_HIGH_TO_U32_SIMD256(A) _mm256_unpackhi_epi16(_mm256_permute4x64_epi64(A, 0b11011000), _mm256_set1_epi16(0))
- // Saturated packing
- // Credit: Using ideas from Victoria Zhislina's NEON_2_SSE.h header from the Intel corporation, but not trying to emulate NEON
- inline SIMD_U8x32 PACK_SAT_U16_TO_U8_SIMD256(const SIMD_U16x16& a, const SIMD_U16x16& b) {
- SIMD_U16x16 mask, a2, b2;
- mask = _mm256_set1_epi16(0b0111111111111111);
- a2 = _mm256_and_si256(a, mask);
- a2 = _mm256_or_si256(a2, _mm256_and_si256(_mm256_cmpgt_epi16(a2, a), mask));
- b2 = _mm256_and_si256(b, mask);
- b2 = _mm256_or_si256(b2, _mm256_and_si256(_mm256_cmpgt_epi16(b2, b), mask));
- // The 256-bit pack instruction _mm256_packus_epi16 works within each 128-bit lane instead of across the whole register, so the result has to be permuted into the correct order.
- // 0 2 1 3
- // | X |
- // 0 1 2 3
- return _mm256_permute4x64_epi64(_mm256_packus_epi16(a2, b2), 0b11011000);
- }
- // Reinterpret casting
- #define REINTERPRET_U32_TO_U8_SIMD256(A) (A)
- #define REINTERPRET_U32_TO_U16_SIMD256(A) (A)
- #define REINTERPRET_U8_TO_U32_SIMD256(A) (A)
- #define REINTERPRET_U16_TO_U32_SIMD256(A) (A)
- #define REINTERPRET_U32_TO_I32_SIMD256(A) (A)
- #define REINTERPRET_I32_TO_U32_SIMD256(A) (A)
- // Vector integer operations returning SIMD_I32x8
- #define ADD_I32_SIMD256(A, B) _mm256_add_epi32(A, B)
- #define SUB_I32_SIMD256(A, B) _mm256_sub_epi32(A, B)
- #define MUL_I32_SIMD256(A, B) _mm256_mullo_epi32(A, B)
- // Vector integer operations returning SIMD_U32x8
- #define ADD_U32_SIMD256(A, B) _mm256_add_epi32(A, B)
- #define SUB_U32_SIMD256(A, B) _mm256_sub_epi32(A, B)
- #define MUL_U32_SIMD256(A, B) _mm256_mullo_epi32(A, B)
- // Vector integer operations returning SIMD_U16x16
- #define ADD_U16_SIMD256(A, B) _mm256_add_epi16(A, B)
- #define SUB_U16_SIMD256(A, B) _mm256_sub_epi16(A, B)
- #define MUL_U16_SIMD256(A, B) _mm256_mullo_epi16(A, B)
- // Vector integer operations returning SIMD_U8x32
- #define ADD_U8_SIMD256(A, B) _mm256_add_epi8(A, B)
- #define ADD_SAT_U8_SIMD256(A, B) _mm256_adds_epu8(A, B) // Saturated addition
- #define SUB_U8_SIMD256(A, B) _mm256_sub_epi8(A, B)
- #define SUB_SAT_U8_SIMD256(A, B) _mm256_subs_epu8(A, B) // Saturated subtraction
- // No 8-bit multiplications
- // Bitwise
- #define BITWISE_AND_U32_SIMD256(A, B) _mm256_and_si256(A, B)
- #define BITWISE_OR_U32_SIMD256(A, B) _mm256_or_si256(A, B)
- #define BITWISE_XOR_U32_SIMD256(A, B) _mm256_xor_si256(A, B)
- #endif
- #endif
- #endif
- // Everything declared in here is specific to NEON.
- // Direct use of the macros will not provide portability to all hardware.
- #ifdef USE_NEON
- #include <arm_neon.h> // NEON
- // Vector types
- #define SIMD_F32x4 float32x4_t
- #define SIMD_U8x16 uint8x16_t
- #define SIMD_U16x8 uint16x8_t
- #define SIMD_U32x4 uint32x4_t
- #define SIMD_I32x4 int32x4_t
- // Vector uploads in address order
- inline SIMD_F32x4 LOAD_VECTOR_F32_SIMD(float a, float b, float c, float d) {
- float data[4] ALIGN16 = {a, b, c, d};
- return vld1q_f32(data);
- }
- inline SIMD_F32x4 LOAD_SCALAR_F32_SIMD(float a) {
- return vdupq_n_f32(a);
- }
- inline SIMD_U8x16 LOAD_VECTOR_U8_SIMD(uint8_t a, uint8_t b, uint8_t c, uint8_t d, uint8_t e, uint8_t f, uint8_t g, uint8_t h,
- uint8_t i, uint8_t j, uint8_t k, uint8_t l, uint8_t m, uint8_t n, uint8_t o, uint8_t p) {
- uint8_t data[16] ALIGN16 = {a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p};
- return vld1q_u8(data);
- }
- inline SIMD_U8x16 LOAD_SCALAR_U8_SIMD(uint8_t a) {
- return vdupq_n_u8(a);
- }
- inline SIMD_U16x8 LOAD_VECTOR_U16_SIMD(uint16_t a, uint16_t b, uint16_t c, uint16_t d, uint16_t e, uint16_t f, uint16_t g, uint16_t h) {
- uint16_t data[8] ALIGN16 = {a, b, c, d, e, f, g, h};
- return vld1q_u16(data);
- }
- inline SIMD_U16x8 LOAD_SCALAR_U16_SIMD(uint16_t a) {
- return vdupq_n_u16(a);
- }
- inline SIMD_U32x4 LOAD_VECTOR_U32_SIMD(uint32_t a, uint32_t b, uint32_t c, uint32_t d) {
- uint32_t data[4] ALIGN16 = {a, b, c, d};
- return vld1q_u32(data);
- }
- inline SIMD_U32x4 LOAD_SCALAR_U32_SIMD(uint32_t a) {
- return vdupq_n_u32(a);
- }
- inline SIMD_I32x4 LOAD_VECTOR_I32_SIMD(int32_t a, int32_t b, int32_t c, int32_t d) {
- int32_t data[4] ALIGN16 = {a, b, c, d};
- return vld1q_s32(data);
- }
- inline SIMD_I32x4 LOAD_SCALAR_I32_SIMD(int32_t a) {
- return vdupq_n_s32(a);
- }
- // Conversions
- #define F32_TO_I32_SIMD(A) vcvtq_s32_f32(A)
- #define F32_TO_U32_SIMD(A) vcvtq_u32_f32(A)
- #define I32_TO_F32_SIMD(A) vcvtq_f32_s32(A)
- #define U32_TO_F32_SIMD(A) vcvtq_f32_u32(A)
- // Unpacking conversions
- #define U8_LOW_TO_U16_SIMD(A) vmovl_u8(vget_low_u8(A))
- #define U8_HIGH_TO_U16_SIMD(A) vmovl_u8(vget_high_u8(A))
- #define U16_LOW_TO_U32_SIMD(A) vmovl_u16(vget_low_u16(A))
- #define U16_HIGH_TO_U32_SIMD(A) vmovl_u16(vget_high_u16(A))
- // Saturated packing
- #define PACK_SAT_U16_TO_U8_SIMD(A, B) vcombine_u8(vqmovn_u16(A), vqmovn_u16(B))
- // Reinterpret casting
- #define REINTERPRET_U32_TO_U8_SIMD(A) vreinterpretq_u8_u32(A)
- #define REINTERPRET_U32_TO_U16_SIMD(A) vreinterpretq_u16_u32(A)
- #define REINTERPRET_U8_TO_U32_SIMD(A) vreinterpretq_u32_u8(A)
- #define REINTERPRET_U16_TO_U32_SIMD(A) vreinterpretq_u32_u16(A)
- #define REINTERPRET_U32_TO_I32_SIMD(A) vreinterpretq_s32_u32(A)
- #define REINTERPRET_I32_TO_U32_SIMD(A) vreinterpretq_u32_s32(A)
- // Vector float operations returning SIMD_F32x4
- #define ADD_F32_SIMD(A, B) vaddq_f32(A, B)
- #define SUB_F32_SIMD(A, B) vsubq_f32(A, B)
- #define MUL_F32_SIMD(A, B) vmulq_f32(A, B)
- // Vector integer operations returning SIMD_I32x4
- #define ADD_I32_SIMD(A, B) vaddq_s32(A, B)
- #define SUB_I32_SIMD(A, B) vsubq_s32(A, B)
- #define MUL_I32_NEON(A, B) vmulq_s32(A, B)
- // Vector integer operations returning SIMD_U32x4
- #define ADD_U32_SIMD(A, B) vaddq_u32(A, B)
- #define SUB_U32_SIMD(A, B) vsubq_u32(A, B)
- #define MUL_U32_NEON(A, B) vmulq_u32(A, B)
- // Vector integer operations returning SIMD_U16x8
- #define ADD_U16_SIMD(A, B) vaddq_u16(A, B)
- #define SUB_U16_SIMD(A, B) vsubq_u16(A, B)
- #define MUL_U16_SIMD(A, B) vmulq_u16(A, B)
- // Vector integer operations returning SIMD_U8x16
- #define ADD_U8_SIMD(A, B) vaddq_u8(A, B)
- #define ADD_SAT_U8_SIMD(A, B) vqaddq_u8(A, B) // Saturated addition
- #define SUB_U8_SIMD(A, B) vsubq_u8(A, B)
- #define SUB_SAT_U8_SIMD(A, B) vqsubq_u8(A, B) // Saturated subtraction
- // No 8-bit multiplications
- // Statistics
- #define MIN_F32_SIMD(A, B) vminq_f32(A, B)
- #define MAX_F32_SIMD(A, B) vmaxq_f32(A, B)
- // Bitwise
- #define BITWISE_AND_U32_SIMD(A, B) vandq_u32(A, B)
- #define BITWISE_OR_U32_SIMD(A, B) vorrq_u32(A, B)
- #define BITWISE_XOR_U32_SIMD(A, B) veorq_u32(A, B)
- #endif
- /*
- The vector types below are supposed to be portable across different CPU architectures.
- When mixed with handwritten SIMD intrinsics:
- Use "USE_SSE2" instead of "__SSE2__"
- Use "USE_AVX2" instead of "__AVX2__"
- Use "USE_NEON" instead of "__ARM_NEON"
- So that any new variations of the macro names given by the compiler can be added to simd.h instead of being duplicated everywhere.
- Portability exceptions:
- * The "v" variable is the native backend, which is only defined when SIMD is supported by hardware.
- Only use when USE_BASIC_SIMD is defined.
- Will not work on scalar emulation.
- * The "scalars" array is available when emulating a type without hardware support, or when the SIMD vector allows direct memory access.
-   Do not rely on it for accessing elements, because such code will not compile for ARM NEON.
- */
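- // Example: an informal sketch of guarding a handwritten intrinsic path with the portable macro names,
- // using a hypothetical helper for illustration only.
- //   inline F32x4 addFast(const F32x4 &a, const F32x4 &b) {
- //       #if defined USE_SSE2
- //           return F32x4(_mm_add_ps(a.v, b.v));   // Native backend, requires USE_BASIC_SIMD.
- //       #elif defined USE_NEON
- //           return F32x4(vaddq_f32(a.v, b.v));
- //       #else
- //           return F32x4(a.scalars[0] + b.scalars[0], a.scalars[1] + b.scalars[1],
- //                        a.scalars[2] + b.scalars[2], a.scalars[3] + b.scalars[3]); // Scalar emulation.
- //       #endif
- //   }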
- union F32x4 {
- private:
- // The uninitialized default constructor is private for safety reasons.
- F32x4() {}
- public:
- // When the uninitialized constructor is needed for performance, use this named constructor instead.
- static inline F32x4 create_dangerous_uninitialized() { return F32x4(); }
- #ifdef USE_BASIC_SIMD
- public:
- #ifdef USE_DIRECT_SIMD_MEMORY_ACCESS
- // Only use if USE_DIRECT_SIMD_MEMORY_ACCESS is defined!
- // Direct access cannot be done on NEON!
- float scalars[4];
- #endif
- // The SIMD vector of undefined type
- // Not accessible while emulating!
- SIMD_F32x4 v;
- // Construct a portable vector from a native SIMD vector
- explicit F32x4(const SIMD_F32x4& v) : v(v) {}
- // Construct a portable vector from a set of scalars
- F32x4(float a1, float a2, float a3, float a4) : v(LOAD_VECTOR_F32_SIMD(a1, a2, a3, a4)) {}
- // Construct a portable vector from a single duplicated scalar
- explicit F32x4(float scalar) : v(LOAD_SCALAR_F32_SIMD(scalar)) {}
- #else
- public:
- // Emulate a SIMD vector as an array of scalars without hardware support.
- // Only accessible while emulating!
- float scalars[4];
- // Construct a portable vector from a set of scalars
- F32x4(float a1, float a2, float a3, float a4) {
- this->scalars[0] = a1;
- this->scalars[1] = a2;
- this->scalars[2] = a3;
- this->scalars[3] = a4;
- }
- // Construct a portable vector from a single duplicated scalar
- explicit F32x4(float scalar) {
- this->scalars[0] = scalar;
- this->scalars[1] = scalar;
- this->scalars[2] = scalar;
- this->scalars[3] = scalar;
- }
- #endif
- // Create a gradient vector using start and increment, so that arbitrary length vectors have a way to initialize linear iterations.
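- // For example, createGradient(2.0f, 0.5f) returns (2.0, 2.5, 3.0, 3.5).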
- static inline F32x4 createGradient(float start, float increment) {
- return F32x4(start, start + increment, start + increment * 2.0f, start + increment * 3.0f);
- }
- // Construct a portable SIMD vector from a pointer to aligned data
- // data must be aligned to at least 8 bytes, but preferably 16 bytes
- static inline F32x4 readAlignedUnsafe(const float* data) {
- #ifdef USE_BASIC_SIMD
- #if defined USE_SSE2
- return F32x4(_mm_load_ps(data));
- #elif defined USE_NEON
- return F32x4(vld1q_f32(data));
- #endif
- #else
- return F32x4(data[0], data[1], data[2], data[3]);
- #endif
- }
- // Write to aligned memory from the existing vector
- // data must be aligned to at least 8 bytes, but preferably 16 bytes
- inline void writeAlignedUnsafe(float* data) const {
- #if defined USE_BASIC_SIMD
- #if defined USE_SSE2
- _mm_store_ps(data, this->v);
- #elif defined USE_NEON
- vst1q_f32(data, this->v);
- #endif
- #else
- data[0] = this->scalars[0];
- data[1] = this->scalars[1];
- data[2] = this->scalars[2];
- data[3] = this->scalars[3];
- #endif
- }
- #if defined DFPSR_GEOMETRY_FVECTOR
- dsr::FVector4D get() const {
- float data[4] ALIGN16;
- this->writeAlignedUnsafe(data);
- return dsr::FVector4D(data[0], data[1], data[2], data[3]);
- }
- #endif
- // Bound and alignment checked reading
- static inline F32x4 readAligned(const dsr::SafePointer<float> data, const char* methodName) {
- const float* pointer = data.getUnsafe();
- assert(((uintptr_t)pointer & 15) == 0);
- #if defined SAFE_POINTER_CHECKS
- data.assertInside(methodName, pointer, 16);
- #endif
- return F32x4::readAlignedUnsafe(pointer);
- }
- // Bound and alignment checked writing
- inline void writeAligned(dsr::SafePointer<float> data, const char* methodName) const {
- float* pointer = data.getUnsafe();
- assert(((uintptr_t)pointer & 15) == 0);
- #if defined SAFE_POINTER_CHECKS
- data.assertInside(methodName, pointer, 16);
- #endif
- this->writeAlignedUnsafe(pointer);
- }
- // 1 / x
- // Useful for multiple divisions with the same denominator
- // Useless if the denominator is a constant
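- // The hardware estimate is refined with Newton-Raphson steps, x1 = x0 * (2 - x * x0).
- // The SSE2 path writes this as (x0 + x0) - x * x0 * x0, and the NEON path applies vrecpsq_f32, which computes 2 - a * b, twice.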
- F32x4 reciprocal() const {
- #if defined USE_BASIC_SIMD
- #if defined USE_SSE2
- // Approximate
- SIMD_F32x4 lowQ = _mm_rcp_ps(this->v);
- // Refine
- return F32x4(SUB_F32_SIMD(ADD_F32_SIMD(lowQ, lowQ), MUL_F32_SIMD(this->v, MUL_F32_SIMD(lowQ, lowQ))));
- #elif defined USE_NEON
- // Approximate
- SIMD_F32x4 result = vrecpeq_f32(this->v);
- // Refine
- result = MUL_F32_SIMD(vrecpsq_f32(this->v, result), result);
- return F32x4(MUL_F32_SIMD(vrecpsq_f32(this->v, result), result));
- #else
- assert(false);
- return F32x4(0);
- #endif
- #else
- return F32x4(1.0f / this->scalars[0], 1.0f / this->scalars[1], 1.0f / this->scalars[2], 1.0f / this->scalars[3]);
- #endif
- }
- // 1 / sqrt(x)
- // Useful for normalizing vectors
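- // The hardware estimate is refined with one Newton-Raphson step, y1 = y0 * (3 - x * y0 * y0) * 0.5.
- // The SSE2 path spells this out after _mm_rsqrt_ps, while the NEON path uses vrsqrtsq_f32, which computes (3 - a * b) / 2.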
- F32x4 reciprocalSquareRoot() const {
- #if defined USE_BASIC_SIMD
- #if defined USE_SSE2
- SIMD_F32x4 reciRoot = _mm_rsqrt_ps(this->v);
- SIMD_F32x4 mul = MUL_F32_SIMD(MUL_F32_SIMD(this->v, reciRoot), reciRoot);
- reciRoot = MUL_F32_SIMD(MUL_F32_SIMD(LOAD_SCALAR_F32_SIMD(0.5f), reciRoot), SUB_F32_SIMD(LOAD_SCALAR_F32_SIMD(3.0f), mul));
- return F32x4(reciRoot);
- #elif defined USE_NEON
- // Approximate
- SIMD_F32x4 reciRoot = vrsqrteq_f32(this->v);
- // Refine
- reciRoot = MUL_F32_SIMD(vrsqrtsq_f32(MUL_F32_SIMD(this->v, reciRoot), reciRoot), reciRoot);
- return F32x4(reciRoot);
- #else
- assert(false);
- return F32x4(0);
- #endif
- #else
- return F32x4(1.0f / sqrt(this->scalars[0]), 1.0f / sqrt(this->scalars[1]), 1.0f / sqrt(this->scalars[2]), 1.0f / sqrt(this->scalars[3]));
- #endif
- }
- // sqrt(x)
- // Useful for getting lengths of vectors
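- // The SSE2 path follows _mm_sqrt_ps with one Heron step, root = (root + x / root) * 0.5,
- // and the NEON path computes x * (1 / sqrt(x)) using reciprocalSquareRoot().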
- F32x4 squareRoot() const {
- #if defined USE_BASIC_SIMD
- #if defined USE_SSE2
- SIMD_F32x4 half = LOAD_SCALAR_F32_SIMD(0.5f);
- // Approximate
- SIMD_F32x4 root = _mm_sqrt_ps(this->v);
- // Refine
- root = _mm_mul_ps(_mm_add_ps(root, _mm_div_ps(this->v, root)), half);
- return F32x4(root);
- #elif defined USE_NEON
- return F32x4(MUL_F32_SIMD(this->v, this->reciprocalSquareRoot().v));
- #else
- assert(false);
- return F32x4(0);
- #endif
- #else
- return F32x4(sqrt(this->scalars[0]), sqrt(this->scalars[1]), sqrt(this->scalars[2]), sqrt(this->scalars[3]));
- #endif
- }
- F32x4 clamp(float minimum, float maximum) const {
- #if defined USE_BASIC_SIMD
- return F32x4(MIN_F32_SIMD(MAX_F32_SIMD(this->v, LOAD_SCALAR_F32_SIMD(minimum)), LOAD_SCALAR_F32_SIMD(maximum)));
- #else
- float val0 = this->scalars[0];
- float val1 = this->scalars[1];
- float val2 = this->scalars[2];
- float val3 = this->scalars[3];
- if (minimum > val0) { val0 = minimum; }
- if (maximum < val0) { val0 = maximum; }
- if (minimum > val1) { val1 = minimum; }
- if (maximum < val1) { val1 = maximum; }
- if (minimum > val2) { val2 = minimum; }
- if (maximum < val2) { val2 = maximum; }
- if (minimum > val3) { val3 = minimum; }
- if (maximum < val3) { val3 = maximum; }
- return F32x4(val0, val1, val2, val3);
- #endif
- }
- F32x4 clampLower(float minimum) const {
- #if defined USE_BASIC_SIMD
- return F32x4(MAX_F32_SIMD(this->v, LOAD_SCALAR_F32_SIMD(minimum)));
- #else
- float val0 = this->scalars[0];
- float val1 = this->scalars[1];
- float val2 = this->scalars[2];
- float val3 = this->scalars[3];
- if (minimum > val0) { val0 = minimum; }
- if (minimum > val1) { val1 = minimum; }
- if (minimum > val2) { val2 = minimum; }
- if (minimum > val3) { val3 = minimum; }
- return F32x4(val0, val1, val2, val3);
- #endif
- }
- F32x4 clampUpper(float maximum) const {
- #if defined USE_BASIC_SIMD
- return F32x4(MIN_F32_SIMD(this->v, LOAD_SCALAR_F32_SIMD(maximum)));
- #else
- float val0 = this->scalars[0];
- float val1 = this->scalars[1];
- float val2 = this->scalars[2];
- float val3 = this->scalars[3];
- if (maximum < val0) { val0 = maximum; }
- if (maximum < val1) { val1 = maximum; }
- if (maximum < val2) { val2 = maximum; }
- if (maximum < val3) { val3 = maximum; }
- return F32x4(val0, val1, val2, val3);
- #endif
- }
- };
- union I32x4 {
- private:
- // The uninitialized default constructor is private for safety reasons.
- I32x4() {}
- public:
- // When the uninitialized constructor is needed for performance, use this named constructor instead.
- static inline I32x4 create_dangerous_uninitialized() { return I32x4(); }
- #if defined USE_BASIC_SIMD
- public:
- #if defined USE_DIRECT_SIMD_MEMORY_ACCESS
- // Only use if USE_DIRECT_SIMD_MEMORY_ACCESS is defined!
- // Direct access cannot be done on NEON!
- int32_t scalars[4];
- #endif
- // The SIMD vector of undefined type
- // Not accessible while emulating!
- SIMD_I32x4 v;
- // Construct a portable vector from a native SIMD vector
- explicit I32x4(const SIMD_I32x4& v) : v(v) {}
- // Construct a portable vector from a set of scalars
- I32x4(int32_t a1, int32_t a2, int32_t a3, int32_t a4) : v(LOAD_VECTOR_I32_SIMD(a1, a2, a3, a4)) {}
- // Construct a portable vector from a single duplicated scalar
- explicit I32x4(int32_t scalar) : v(LOAD_SCALAR_I32_SIMD(scalar)) {}
- #else
- public:
- // Emulate a SIMD vector as an array of scalars without hardware support.
- // Only accessible while emulating!
- int32_t scalars[4];
- // Construct a portable vector from a set of scalars
- I32x4(int32_t a1, int32_t a2, int32_t a3, int32_t a4) {
- this->scalars[0] = a1;
- this->scalars[1] = a2;
- this->scalars[2] = a3;
- this->scalars[3] = a4;
- }
- // Construct a portable vector from a single duplicated scalar
- explicit I32x4(int32_t scalar) {
- this->scalars[0] = scalar;
- this->scalars[1] = scalar;
- this->scalars[2] = scalar;
- this->scalars[3] = scalar;
- }
- #endif
- // Create a gradient vector using start and increment, so that arbitrary length vectors have a way to initialize linear iterations.
- static inline I32x4 createGradient(int32_t start, int32_t increment) {
- return I32x4(start, start + increment, start + increment * 2, start + increment * 3);
- }
- // Construct a portable SIMD vector from a pointer to aligned data
- // data must be aligned to at least 8 bytes, but preferably 16 bytes
- static inline I32x4 readAlignedUnsafe(const int32_t* data) {
- #if defined USE_BASIC_SIMD
- #if defined USE_SSE2
- return I32x4(_mm_load_si128((const __m128i*)data));
- #elif defined USE_NEON
- return I32x4(vld1q_s32(data));
- #endif
- #else
- return I32x4(data[0], data[1], data[2], data[3]);
- #endif
- }
- // Write to aligned memory from the existing vector
- // data must be aligned to at least 8 bytes, but preferably 16 bytes
- inline void writeAlignedUnsafe(int32_t* data) const {
- #if defined USE_BASIC_SIMD
- #if defined USE_SSE2
- _mm_store_si128((__m128i*)data, this->v);
- #elif defined USE_NEON
- vst1q_s32(data, this->v);
- #endif
- #else
- data[0] = this->scalars[0];
- data[1] = this->scalars[1];
- data[2] = this->scalars[2];
- data[3] = this->scalars[3];
- #endif
- }
- #if defined DFPSR_GEOMETRY_IVECTOR
- dsr::IVector4D get() const {
- int32_t data[4] ALIGN16;
- this->writeAlignedUnsafe(data);
- return dsr::IVector4D(data[0], data[1], data[2], data[3]);
- }
- #endif
- // Bound and alignment checked reading
- static inline I32x4 readAligned(const dsr::SafePointer<int32_t> data, const char* methodName) {
- const int32_t* pointer = data.getUnsafe();
- assert(((uintptr_t)pointer & 15) == 0);
- #if defined SAFE_POINTER_CHECKS
- data.assertInside(methodName, pointer, 16);
- #endif
- return I32x4::readAlignedUnsafe(pointer);
- }
- // Bound and alignment checked writing
- inline void writeAligned(dsr::SafePointer<int32_t> data, const char* methodName) const {
- int32_t* pointer = data.getUnsafe();
- assert(((uintptr_t)pointer & 15) == 0);
- #if defined SAFE_POINTER_CHECKS
- data.assertInside(methodName, pointer, 16);
- #endif
- this->writeAlignedUnsafe(pointer);
- }
- };
- union U32x4 {
- #if defined USE_BASIC_SIMD
- public:
- #if defined USE_DIRECT_SIMD_MEMORY_ACCESS
- // Only use if USE_DIRECT_SIMD_MEMORY_ACCESS is defined!
- // Direct access cannot be done on NEON!
- uint32_t scalars[4];
- #endif
- // The SIMD vector of undefined type
- // Not accessible while emulating!
- SIMD_U32x4 v;
- // Construct a portable vector from a native SIMD vector
- explicit U32x4(const SIMD_U32x4& v) : v(v) {}
- // Construct a portable vector from a set of scalars
- U32x4(uint32_t a1, uint32_t a2, uint32_t a3, uint32_t a4) : v(LOAD_VECTOR_U32_SIMD(a1, a2, a3, a4)) {}
- // Construct a portable vector from a single duplicated scalar
- explicit U32x4(uint32_t scalar) : v(LOAD_SCALAR_U32_SIMD(scalar)) {}
- #else
- public:
- // Emulate a SIMD vector as an array of scalars without hardware support.
- // Only accessible while emulating!
- uint32_t scalars[4];
- // Construct a portable vector from a set of scalars
- U32x4(uint32_t a1, uint32_t a2, uint32_t a3, uint32_t a4) {
- this->scalars[0] = a1;
- this->scalars[1] = a2;
- this->scalars[2] = a3;
- this->scalars[3] = a4;
- }
- // Construct a portable vector from a single duplicated scalar
- explicit U32x4(uint32_t scalar) {
- this->scalars[0] = scalar;
- this->scalars[1] = scalar;
- this->scalars[2] = scalar;
- this->scalars[3] = scalar;
- }
- #endif
- // Create a gradient vector using start and increment, so that arbitrary length vectors have a way to initialize linear iterations.
- static inline U32x4 createGradient(uint32_t start, uint32_t increment) {
- return U32x4(start, start + increment, start + increment * 2, start + increment * 3);
- }
- // Construct a portable SIMD vector from a pointer to aligned data
- // data must be aligned to at least 8 bytes, but preferably 16 bytes
- static inline U32x4 readAlignedUnsafe(const uint32_t* data) {
- #if defined USE_BASIC_SIMD
- #if defined USE_SSE2
- return U32x4(_mm_load_si128((const __m128i*)data));
- #elif defined USE_NEON
- return U32x4(vld1q_u32(data));
- #endif
- #else
- return U32x4(data[0], data[1], data[2], data[3]);
- #endif
- }
- // Write to aligned memory from the existing vector
- // data must be aligned to at least 8 bytes, but preferably 16 bytes
- inline void writeAlignedUnsafe(uint32_t* data) const {
- #if defined USE_BASIC_SIMD
- #if defined USE_SSE2
- _mm_store_si128((__m128i*)data, this->v);
- #elif defined USE_NEON
- vst1q_u32(data, this->v);
- #endif
- #else
- data[0] = this->scalars[0];
- data[1] = this->scalars[1];
- data[2] = this->scalars[2];
- data[3] = this->scalars[3];
- #endif
- }
- #if defined DFPSR_GEOMETRY_UVECTOR
- dsr::UVector4D get() const {
- uint32_t data[4] ALIGN16;
- this->writeAlignedUnsafe(data);
- return dsr::UVector4D(data[0], data[1], data[2], data[3]);
- }
- #endif
- // Bound and alignment checked reading
- static inline U32x4 readAligned(const dsr::SafePointer<uint32_t> data, const char* methodName) {
- const uint32_t* pointer = data.getUnsafe();
- assert(((uintptr_t)pointer & 15) == 0);
- #if defined SAFE_POINTER_CHECKS
- data.assertInside(methodName, pointer, 16);
- #endif
- return U32x4::readAlignedUnsafe(pointer);
- }
- // Bound and alignment checked writing
- inline void writeAligned(dsr::SafePointer<uint32_t> data, const char* methodName) const {
- uint32_t* pointer = data.getUnsafe();
- assert(((uintptr_t)pointer & 15) == 0);
- #if defined SAFE_POINTER_CHECKS
- data.assertInside(methodName, pointer, 16);
- #endif
- this->writeAlignedUnsafe(pointer);
- }
- };
- union U16x8 {
- private:
- // The uninitialized default constructor is private for safety reasons.
- U16x8() {}
- public:
- // When the uninitialized constructor is needed for performance, use this named constructor instead.
- static inline U16x8 create_dangerous_uninitialized() { return U16x8(); }
- #if defined USE_BASIC_SIMD
- public:
- #if defined USE_DIRECT_SIMD_MEMORY_ACCESS
- // Only use if USE_DIRECT_SIMD_MEMORY_ACCESS is defined!
- // Direct access cannot be done on NEON!
- uint16_t scalars[8];
- #endif
- // The SIMD vector of undefined type
- // Not accessible while emulating!
- SIMD_U16x8 v;
- // Construct a portable vector from a native SIMD vector
- explicit U16x8(const SIMD_U16x8& v) : v(v) {}
- // Construct a vector of 8 x 16-bit unsigned integers from a vector of 4 x 32-bit unsigned integers
- // Reinterpret casting is used
- explicit U16x8(const U32x4& vector) : v(REINTERPRET_U32_TO_U16_SIMD(vector.v)) {}
- // Construct a portable vector from a set of scalars
- U16x8(uint16_t a1, uint16_t a2, uint16_t a3, uint16_t a4, uint16_t a5, uint16_t a6, uint16_t a7, uint16_t a8) : v(LOAD_VECTOR_U16_SIMD(a1, a2, a3, a4, a5, a6, a7, a8)) {}
- // Construct a vector of 8 x 16-bit unsigned integers from a single duplicated 32-bit unsigned integer
- // Reinterpret casting is used
- // TODO: Remove all reinterprets from constructors to improve readability
- explicit U16x8(uint32_t scalar) : v(REINTERPRET_U32_TO_U16_SIMD(LOAD_SCALAR_U32_SIMD(scalar))) {}
- // Construct a portable vector from a single duplicated scalar
- explicit U16x8(uint16_t scalar) : v(LOAD_SCALAR_U16_SIMD(scalar)) {}
- // Reinterpret cast to a vector of 4 x 32-bit unsigned integers
- U32x4 get_U32() const {
- return U32x4(REINTERPRET_U16_TO_U32_SIMD(this->v));
- }
- #else
- public:
- // Emulate a SIMD vector as an array of scalars without hardware support.
- // Only accessible while emulating!
- uint16_t scalars[8];
- // Construct a vector of 8 x 16-bit unsigned integers from a vector of 4 x 32-bit unsigned integers
- // Reinterpret casting is used
- explicit U16x8(const U32x4& vector) {
- uint64_t *target = (uint64_t*)this->scalars;
- uint64_t *source = (uint64_t*)vector.scalars;
- target[0] = source[0];
- target[1] = source[1];
- }
- // Construct a portable vector from a set of scalars
- U16x8(uint16_t a1, uint16_t a2, uint16_t a3, uint16_t a4, uint16_t a5, uint16_t a6, uint16_t a7, uint16_t a8) {
- this->scalars[0] = a1;
- this->scalars[1] = a2;
- this->scalars[2] = a3;
- this->scalars[3] = a4;
- this->scalars[4] = a5;
- this->scalars[5] = a6;
- this->scalars[6] = a7;
- this->scalars[7] = a8;
- }
- // Construct a vector of 8 x 16-bit unsigned integers from a single duplicated 32-bit unsigned integer
- // Reinterpret casting is used
- explicit U16x8(uint32_t scalar) {
- uint32_t *target = (uint32_t*)this->scalars;
- target[0] = scalar;
- target[1] = scalar;
- target[2] = scalar;
- target[3] = scalar;
- }
- // Construct a portable vector from a single duplicated scalar
- explicit U16x8(uint16_t scalar) {
- this->scalars[0] = scalar;
- this->scalars[1] = scalar;
- this->scalars[2] = scalar;
- this->scalars[3] = scalar;
- this->scalars[4] = scalar;
- this->scalars[5] = scalar;
- this->scalars[6] = scalar;
- this->scalars[7] = scalar;
- }
- // Reinterpret cast to a vector of 4 x 32-bit unsigned integers
- U32x4 get_U32() const {
- U32x4 result(0);
- uint64_t *target = (uint64_t*)result.scalars;
- uint64_t *source = (uint64_t*)this->scalars;
- target[0] = source[0];
- target[1] = source[1];
- return result;
- }
- #endif
- // Create a gradient vector using start and increment, so that arbitrary length vectors have a way to initialize linear iterations.
- static inline U16x8 createGradient(uint16_t start, uint16_t increment) {
- return U16x8(
- start,
- start + increment,
- start + increment * 2,
- start + increment * 3,
- start + increment * 4,
- start + increment * 5,
- start + increment * 6,
- start + increment * 7
- );
- }
- static inline U16x8 readAlignedUnsafe(const uint16_t* data) {
- #if defined USE_BASIC_SIMD
- #if defined USE_SSE2
- return U16x8(_mm_load_si128((const __m128i*)data));
- #elif defined USE_NEON
- return U16x8(vld1q_u16(data));
- #endif
- #else
- return U16x8(data[0], data[1], data[2], data[3], data[4], data[5], data[6], data[7]);
- #endif
- }
- // data must be aligned to at least 8 bytes, but preferably 16 bytes
- inline void writeAlignedUnsafe(uint16_t* data) const {
- #if defined USE_BASIC_SIMD
- #if defined USE_SSE2
- _mm_store_si128((__m128i*)data, this->v);
- #elif defined USE_NEON
- vst1q_u16(data, this->v);
- #endif
- #else
- data[0] = this->scalars[0];
- data[1] = this->scalars[1];
- data[2] = this->scalars[2];
- data[3] = this->scalars[3];
- data[4] = this->scalars[4];
- data[5] = this->scalars[5];
- data[6] = this->scalars[6];
- data[7] = this->scalars[7];
- #endif
- }
- // Bound and alignment checked reading
- static inline U16x8 readAligned(const dsr::SafePointer<uint16_t> data, const char* methodName) {
- const uint16_t* pointer = data.getUnsafe();
- assert(((uintptr_t)pointer & 15) == 0);
- #if defined SAFE_POINTER_CHECKS
- data.assertInside(methodName, pointer, 16);
- #endif
- return U16x8::readAlignedUnsafe(pointer);
- }
- // Bound and alignment checked writing
- inline void writeAligned(dsr::SafePointer<uint16_t> data, const char* methodName) const {
- uint16_t* pointer = data.getUnsafe();
- assert(((uintptr_t)pointer & 15) == 0);
- #if defined SAFE_POINTER_CHECKS
- data.assertInside(methodName, pointer, 16);
- #endif
- this->writeAlignedUnsafe(pointer);
- }
- };
- union U8x16 {
- private:
- // The uninitialized default constructor is private for safety reasons.
- U8x16() {}
- public:
- // When the uninitialized constructor is needed for performance, use this named constructor instead.
- static inline U8x16 create_dangerous_uninitialized() { return U8x16(); }
- #if defined USE_BASIC_SIMD
- public:
- #if defined USE_DIRECT_SIMD_MEMORY_ACCESS
- // Only use if USE_DIRECT_SIMD_MEMORY_ACCESS is defined!
- // Direct access cannot be done on NEON!
- uint8_t scalars[16];
- #endif
- // The SIMD vector of undefined type
- // Not accessible while emulating!
- SIMD_U8x16 v;
- // Construct a portable vector from a native SIMD vector
- explicit U8x16(const SIMD_U8x16& v) : v(v) {}
- // Construct a portable vector from a set of scalars
- U8x16(uint8_t a1, uint8_t a2, uint8_t a3, uint8_t a4, uint8_t a5, uint8_t a6, uint8_t a7, uint8_t a8,
- uint8_t a9, uint8_t a10, uint8_t a11, uint8_t a12, uint8_t a13, uint8_t a14, uint8_t a15, uint8_t a16)
- : v(LOAD_VECTOR_U8_SIMD(a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11, a12, a13, a14, a15, a16)) {}
- // Construct a portable vector from a single duplicated scalar
- explicit U8x16(uint8_t scalar) : v(LOAD_SCALAR_U8_SIMD(scalar)) {}
- #else
- public:
- // Emulate a SIMD vector as an array of scalars without hardware support.
- // Only accessible while emulating!
- uint8_t scalars[16];
- // Construct a portable vector from a set of scalars
- U8x16(uint8_t a1, uint8_t a2, uint8_t a3, uint8_t a4, uint8_t a5, uint8_t a6, uint8_t a7, uint8_t a8,
- uint8_t a9, uint8_t a10, uint8_t a11, uint8_t a12, uint8_t a13, uint8_t a14, uint8_t a15, uint8_t a16) {
- this->scalars[0] = a1;
- this->scalars[1] = a2;
- this->scalars[2] = a3;
- this->scalars[3] = a4;
- this->scalars[4] = a5;
- this->scalars[5] = a6;
- this->scalars[6] = a7;
- this->scalars[7] = a8;
- this->scalars[8] = a9;
- this->scalars[9] = a10;
- this->scalars[10] = a11;
- this->scalars[11] = a12;
- this->scalars[12] = a13;
- this->scalars[13] = a14;
- this->scalars[14] = a15;
- this->scalars[15] = a16;
- }
- // Construct a portable vector from a single duplicated scalar
- explicit U8x16(uint8_t scalar) {
- this->scalars[0] = scalar;
- this->scalars[1] = scalar;
- this->scalars[2] = scalar;
- this->scalars[3] = scalar;
- this->scalars[4] = scalar;
- this->scalars[5] = scalar;
- this->scalars[6] = scalar;
- this->scalars[7] = scalar;
- this->scalars[8] = scalar;
- this->scalars[9] = scalar;
- this->scalars[10] = scalar;
- this->scalars[11] = scalar;
- this->scalars[12] = scalar;
- this->scalars[13] = scalar;
- this->scalars[14] = scalar;
- this->scalars[15] = scalar;
- }
- #endif
- // Create a gradient vector using start and increment, so that arbitrary length vectors have a way to initialize linear iterations.
- static inline U8x16 createGradient(uint8_t start, uint8_t increment) {
- return U8x16(
- start,
- start + increment,
- start + increment * 2,
- start + increment * 3,
- start + increment * 4,
- start + increment * 5,
- start + increment * 6,
- start + increment * 7,
- start + increment * 8,
- start + increment * 9,
- start + increment * 10,
- start + increment * 11,
- start + increment * 12,
- start + increment * 13,
- start + increment * 14,
- start + increment * 15
- );
- }
- static inline U8x16 readAlignedUnsafe(const uint8_t* data) {
- #if defined USE_BASIC_SIMD
- #if defined USE_SSE2
- return U8x16(_mm_load_si128((const __m128i*)data));
- #elif defined USE_NEON
- return U8x16(vld1q_u8(data));
- #endif
- #else
- return U8x16(
- data[0], data[1], data[2], data[3], data[4], data[5], data[6], data[7],
- data[8], data[9], data[10], data[11], data[12], data[13], data[14], data[15]
- );
- #endif
- }
- // data must be aligned to 16 bytes
- inline void writeAlignedUnsafe(uint8_t* data) const {
- #if defined USE_BASIC_SIMD
- #if defined USE_SSE2
- _mm_store_si128((__m128i*)data, this->v);
- #elif defined USE_NEON
- vst1q_u8(data, this->v);
- #endif
- #else
- data[0] = this->scalars[0];
- data[1] = this->scalars[1];
- data[2] = this->scalars[2];
- data[3] = this->scalars[3];
- data[4] = this->scalars[4];
- data[5] = this->scalars[5];
- data[6] = this->scalars[6];
- data[7] = this->scalars[7];
- data[8] = this->scalars[8];
- data[9] = this->scalars[9];
- data[10] = this->scalars[10];
- data[11] = this->scalars[11];
- data[12] = this->scalars[12];
- data[13] = this->scalars[13];
- data[14] = this->scalars[14];
- data[15] = this->scalars[15];
- #endif
- }
- // Bound and alignment checked reading
- static inline U8x16 readAligned(const dsr::SafePointer<uint8_t> data, const char* methodName) {
- const uint8_t* pointer = data.getUnsafe();
- assert(((uintptr_t)pointer & 15) == 0);
- #if defined SAFE_POINTER_CHECKS
- data.assertInside(methodName, pointer, 16);
- #endif
- return U8x16::readAlignedUnsafe(pointer);
- }
- // Bound and alignment checked writing
- inline void writeAligned(dsr::SafePointer<uint8_t> data, const char* methodName) const {
- uint8_t* pointer = data.getUnsafe();
- assert(((uintptr_t)pointer & 15) == 0);
- #if defined SAFE_POINTER_CHECKS
- data.assertInside(methodName, pointer, 16);
- #endif
- this->writeAlignedUnsafe(pointer);
- }
- };
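- // Illustrative usage sketch (not part of the API): reading, processing and writing a 16-byte aligned buffer.
- // The buffer name and the brightening offset are made up for the example; it assumes the ALIGN16 macro
- // from this library, and saturatedAddition is declared further down in this header.
- //   ALIGN16 uint8_t pixels[16] = {0};
- //   U8x16 color = U8x16::readAlignedUnsafe(pixels);       // load 16 bytes at once
- //   color = saturatedAddition(color, U8x16(uint8_t(16)));  // brighten without wrapping past 255
- //   color.writeAlignedUnsafe(pixels);                      // store the result back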
- union F32x8 {
- private:
- // The uninitialized default constructor is private for safety reasons.
- F32x8() {}
- public:
- // When the uninitialized constructor is needed for performance, use this named constructor instead.
- static inline F32x8 create_dangerous_uninitialized() { return F32x8(); }
- #if defined USE_256BIT_F_SIMD
- public:
- #if defined USE_DIRECT_SIMD_MEMORY_ACCESS
- // Only use if USE_DIRECT_SIMD_MEMORY_ACCESS is defined!
- float scalars[8];
- #endif
- // The SIMD vector of undefined type
- // Not accessible while emulating!
- SIMD_F32x8 v;
- // Construct a portable vector from a native SIMD vector
- explicit F32x8(const SIMD_F32x8& v) : v(v) {}
- // Construct a portable vector from a set of scalars
- F32x8(float a1, float a2, float a3, float a4, float a5, float a6, float a7, float a8)
- : v(LOAD_VECTOR_F32_SIMD256(a1, a2, a3, a4, a5, a6, a7, a8)) {}
- // Construct a portable vector from a single duplicated scalar
- explicit F32x8(float scalar) : v(LOAD_SCALAR_F32_SIMD256(scalar)) {}
- #else
- public:
- // Emulate a SIMD vector as an array of scalars without hardware support.
- // Only accessible while emulating!
- float scalars[8];
- // Construct a portable vector from a set of scalars
- F32x8(float a1, float a2, float a3, float a4, float a5, float a6, float a7, float a8) {
- this->scalars[0] = a1;
- this->scalars[1] = a2;
- this->scalars[2] = a3;
- this->scalars[3] = a4;
- this->scalars[4] = a5;
- this->scalars[5] = a6;
- this->scalars[6] = a7;
- this->scalars[7] = a8;
- }
- // Construct a portable vector from a single duplicated scalar
- explicit F32x8(float scalar) {
- this->scalars[0] = scalar;
- this->scalars[1] = scalar;
- this->scalars[2] = scalar;
- this->scalars[3] = scalar;
- this->scalars[4] = scalar;
- this->scalars[5] = scalar;
- this->scalars[6] = scalar;
- this->scalars[7] = scalar;
- }
- #endif
- // Create a gradient vector using start and increment, so that arbitrary length vectors have a way to initialize linear iterations.
- static inline F32x8 createGradient(float start, float increment) {
- return F32x8(
- start,
- start + increment,
- start + increment * 2.0f,
- start + increment * 3.0f,
- start + increment * 4.0f,
- start + increment * 5.0f,
- start + increment * 6.0f,
- start + increment * 7.0f
- );
- }
- // Construct a portable SIMD vector from a pointer to aligned data
- // data must be aligned to 32 bytes
- static inline F32x8 readAlignedUnsafe(const float* data) {
- #if defined USE_AVX2
- return F32x8(_mm256_load_ps(data));
- #else
- return F32x8(data[0], data[1], data[2], data[3], data[4], data[5], data[6], data[7]);
- #endif
- }
- // Write to aligned memory from the existing vector
- // data must be aligned to 32 bytes
- inline void writeAlignedUnsafe(float* data) const {
- #if defined USE_AVX2
- _mm256_store_ps(data, this->v);
- #else
- data[0] = this->scalars[0];
- data[1] = this->scalars[1];
- data[2] = this->scalars[2];
- data[3] = this->scalars[3];
- data[4] = this->scalars[4];
- data[5] = this->scalars[5];
- data[6] = this->scalars[6];
- data[7] = this->scalars[7];
- #endif
- }
- // Bound and alignment checked reading
- static inline F32x8 readAligned(const dsr::SafePointer<float> data, const char* methodName) {
- const float* pointer = data.getUnsafe();
- assert(((uintptr_t)pointer & 31) == 0);
- #if defined SAFE_POINTER_CHECKS
- data.assertInside(methodName, pointer, 32);
- #endif
- return F32x8::readAlignedUnsafe(pointer);
- }
- // Bound and alignment checked writing
- inline void writeAligned(dsr::SafePointer<float> data, const char* methodName) const {
- float* pointer = data.getUnsafe();
- assert(((uintptr_t)pointer & 31) == 0);
- #if defined SAFE_POINTER_CHECKS
- data.assertInside(methodName, pointer, 32);
- #endif
- this->writeAlignedUnsafe(pointer);
- }
- // 1 / x
- // Useful for multiple divisions with the same denominator
- // Useless if the denominator is a constant
- F32x8 reciprocal() const {
- #if defined USE_AVX2
- // Approximate
- SIMD_F32x8 lowQ = _mm256_rcp_ps(this->v);
- // Refine
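- // One Newton-Raphson step for y = 1/x: y1 = y0 * (2 - x * y0) = 2*y0 - x*y0*y0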
- return F32x8(SUB_F32_SIMD256(ADD_F32_SIMD256(lowQ, lowQ), MUL_F32_SIMD256(this->v, MUL_F32_SIMD256(lowQ, lowQ))));
- #else
- return F32x8(
- 1.0f / this->scalars[0],
- 1.0f / this->scalars[1],
- 1.0f / this->scalars[2],
- 1.0f / this->scalars[3],
- 1.0f / this->scalars[4],
- 1.0f / this->scalars[5],
- 1.0f / this->scalars[6],
- 1.0f / this->scalars[7]
- );
- #endif
- }
- // 1 / sqrt(x)
- // Useful for normalizing vectors
- F32x8 reciprocalSquareRoot() const {
- #if defined USE_AVX2
- SIMD_F32x8 reciRoot = _mm256_rsqrt_ps(this->v);
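- // Refine the approximation with one Newton-Raphson step for y = 1/sqrt(x): y1 = 0.5 * y0 * (3 - x * y0 * y0)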
- SIMD_F32x8 mul = MUL_F32_SIMD256(MUL_F32_SIMD256(this->v, reciRoot), reciRoot);
- reciRoot = MUL_F32_SIMD256(MUL_F32_SIMD256(LOAD_SCALAR_F32_SIMD256(0.5f), reciRoot), SUB_F32_SIMD256(LOAD_SCALAR_F32_SIMD256(3.0f), mul));
- return F32x8(reciRoot);
- #else
- return F32x8(
- 1.0f / sqrt(this->scalars[0]),
- 1.0f / sqrt(this->scalars[1]),
- 1.0f / sqrt(this->scalars[2]),
- 1.0f / sqrt(this->scalars[3]),
- 1.0f / sqrt(this->scalars[4]),
- 1.0f / sqrt(this->scalars[5]),
- 1.0f / sqrt(this->scalars[6]),
- 1.0f / sqrt(this->scalars[7])
- );
- #endif
- }
- // sqrt(x)
- // Useful for getting lengths of vectors
- F32x8 squareRoot() const {
- #if defined USE_AVX2
- SIMD_F32x8 half = LOAD_SCALAR_F32_SIMD256(0.5f);
- // Approximate
- SIMD_F32x8 root = _mm256_sqrt_ps(this->v);
- // Refine
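- // One Heron (Newton-Raphson) step for sqrt(x): r1 = 0.5 * (r0 + x / r0)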
- root = _mm256_mul_ps(_mm256_add_ps(root, _mm256_div_ps(this->v, root)), half);
- return F32x8(root);
- #else
- return F32x8(
- sqrt(this->scalars[0]),
- sqrt(this->scalars[1]),
- sqrt(this->scalars[2]),
- sqrt(this->scalars[3]),
- sqrt(this->scalars[4]),
- sqrt(this->scalars[5]),
- sqrt(this->scalars[6]),
- sqrt(this->scalars[7]));
- #endif
- }
- F32x8 clamp(float minimum, float maximum) const {
- #if defined USE_256BIT_F_SIMD
- return F32x8(MIN_F32_SIMD256(MAX_F32_SIMD256(this->v, LOAD_SCALAR_F32_SIMD256(minimum)), LOAD_SCALAR_F32_SIMD256(maximum)));
- #else
- float val0 = this->scalars[0];
- float val1 = this->scalars[1];
- float val2 = this->scalars[2];
- float val3 = this->scalars[3];
- float val4 = this->scalars[4];
- float val5 = this->scalars[5];
- float val6 = this->scalars[6];
- float val7 = this->scalars[7];
- if (minimum > val0) { val0 = minimum; }
- if (maximum < val0) { val0 = maximum; }
- if (minimum > val1) { val1 = minimum; }
- if (maximum < val1) { val1 = maximum; }
- if (minimum > val2) { val2 = minimum; }
- if (maximum < val2) { val2 = maximum; }
- if (minimum > val3) { val3 = minimum; }
- if (maximum < val3) { val3 = maximum; }
- if (minimum > val4) { val4 = minimum; }
- if (maximum < val4) { val4 = maximum; }
- if (minimum > val5) { val5 = minimum; }
- if (maximum < val5) { val5 = maximum; }
- if (minimum > val6) { val6 = minimum; }
- if (maximum < val6) { val6 = maximum; }
- if (minimum > val7) { val7 = minimum; }
- if (maximum < val7) { val7 = maximum; }
- return F32x8(val0, val1, val2, val3, val4, val5, val6, val7);
- #endif
- }
- F32x8 clampLower(float minimum) const {
- #if defined USE_256BIT_F_SIMD
- return F32x8(MAX_F32_SIMD256(this->v, LOAD_SCALAR_F32_SIMD256(minimum)));
- #else
- float val0 = this->scalars[0];
- float val1 = this->scalars[1];
- float val2 = this->scalars[2];
- float val3 = this->scalars[3];
- float val4 = this->scalars[4];
- float val5 = this->scalars[5];
- float val6 = this->scalars[6];
- float val7 = this->scalars[7];
- if (minimum > val0) { val0 = minimum; }
- if (minimum > val1) { val1 = minimum; }
- if (minimum > val2) { val2 = minimum; }
- if (minimum > val3) { val3 = minimum; }
- if (minimum > val4) { val4 = minimum; }
- if (minimum > val5) { val5 = minimum; }
- if (minimum > val6) { val6 = minimum; }
- if (minimum > val7) { val7 = minimum; }
- return F32x8(val0, val1, val2, val3, val4, val5, val6, val7);
- #endif
- }
- F32x8 clampUpper(float maximum) const {
- #if defined USE_256BIT_F_SIMD
- return F32x8(MIN_F32_SIMD256(this->v, LOAD_SCALAR_F32_SIMD256(maximum)));
- #else
- float val0 = this->scalars[0];
- float val1 = this->scalars[1];
- float val2 = this->scalars[2];
- float val3 = this->scalars[3];
- float val4 = this->scalars[4];
- float val5 = this->scalars[5];
- float val6 = this->scalars[6];
- float val7 = this->scalars[7];
- if (maximum < val0) { val0 = maximum; }
- if (maximum < val1) { val1 = maximum; }
- if (maximum < val2) { val2 = maximum; }
- if (maximum < val3) { val3 = maximum; }
- if (maximum < val4) { val4 = maximum; }
- if (maximum < val5) { val5 = maximum; }
- if (maximum < val6) { val6 = maximum; }
- if (maximum < val7) { val7 = maximum; }
- return F32x8(val0, val1, val2, val3, val4, val5, val6, val7);
- #endif
- }
- };
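- // Illustrative usage sketch (not part of the API): createGradient combined with the member methods above.
- // The buffer name is made up for the example and it assumes the ALIGN_BYTES macro from this library.
- //   ALIGN_BYTES(32) float results[8];
- //   F32x8 x = F32x8::createGradient(0.0f, 1.0f);          // 0, 1, 2, ..., 7
- //   F32x8 clampedRoots = x.squareRoot().clamp(0.0f, 2.0f);
- //   clampedRoots.writeAlignedUnsafe(results);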
- union I32x8 {
- private:
- // The uninitialized default constructor is private for safety reasons.
- I32x8() {}
- public:
- // When the uninitialized constructor is needed for performance, use this named constructor instead.
- static inline I32x8 create_dangerous_uninitialized() { return I32x8(); }
- #if defined USE_256BIT_X_SIMD
- public:
- #if defined USE_DIRECT_SIMD_MEMORY_ACCESS
- // Only use if USE_DIRECT_SIMD_MEMORY_ACCESS is defined!
- int32_t scalars[8];
- #endif
- // The SIMD vector of undefined type
- // Not accessible while emulating!
- SIMD_I32x8 v;
- // Construct a portable vector from a native SIMD vector
- explicit I32x8(const SIMD_I32x8& v) : v(v) {}
- // Construct a portable vector from a set of scalars
- I32x8(int32_t a1, int32_t a2, int32_t a3, int32_t a4, int32_t a5, int32_t a6, int32_t a7, int32_t a8)
- : v(LOAD_VECTOR_I32_SIMD256(a1, a2, a3, a4, a5, a6, a7, a8)) {}
- // Construct a portable vector from a single duplicated scalar
- explicit I32x8(int32_t scalar) : v(LOAD_SCALAR_I32_SIMD256(scalar)) {}
- #else
- public:
- // Emulate a SIMD vector as an array of scalars without hardware support.
- // Only accessible while emulating!
- int32_t scalars[8];
- // Construct a portable vector from a set of scalars
- I32x8(int32_t a1, int32_t a2, int32_t a3, int32_t a4, int32_t a5, int32_t a6, int32_t a7, int32_t a8) {
- this->scalars[0] = a1;
- this->scalars[1] = a2;
- this->scalars[2] = a3;
- this->scalars[3] = a4;
- this->scalars[4] = a5;
- this->scalars[5] = a6;
- this->scalars[6] = a7;
- this->scalars[7] = a8;
- }
- // Construct a portable vector from a single duplicated scalar
- explicit I32x8(int32_t scalar) {
- this->scalars[0] = scalar;
- this->scalars[1] = scalar;
- this->scalars[2] = scalar;
- this->scalars[3] = scalar;
- this->scalars[4] = scalar;
- this->scalars[5] = scalar;
- this->scalars[6] = scalar;
- this->scalars[7] = scalar;
- }
- #endif
- // Create a gradient vector using start and increment, so that arbitrary length vectors have a way to initialize linear iterations.
- static inline I32x8 createGradient(int32_t start, int32_t increment) {
- return I32x8(
- start,
- start + increment,
- start + increment * 2,
- start + increment * 3,
- start + increment * 4,
- start + increment * 5,
- start + increment * 6,
- start + increment * 7
- );
- }
- // Construct a portable SIMD vector from a pointer to aligned data
- // data must be aligned to 32 bytes
- static inline I32x8 readAlignedUnsafe(const int32_t* data) {
- #if defined USE_AVX2
- return I32x8(_mm256_load_si256((const __m256i*)data));
- #else
- return I32x8(data[0], data[1], data[2], data[3], data[4], data[5], data[6], data[7]);
- #endif
- }
- // Write to aligned memory from the existing vector
- // data must be aligned to 32 bytes
- inline void writeAlignedUnsafe(int32_t* data) const {
- #if defined USE_AVX2
- _mm256_store_si256((__m256i*)data, this->v);
- #else
- data[0] = this->scalars[0];
- data[1] = this->scalars[1];
- data[2] = this->scalars[2];
- data[3] = this->scalars[3];
- data[4] = this->scalars[4];
- data[5] = this->scalars[5];
- data[6] = this->scalars[6];
- data[7] = this->scalars[7];
- #endif
- }
- // Bound and alignment checked reading
- static inline I32x8 readAligned(const dsr::SafePointer<int32_t> data, const char* methodName) {
- const int32_t* pointer = data.getUnsafe();
- assert(((uintptr_t)pointer & 31) == 0);
- #if defined SAFE_POINTER_CHECKS
- data.assertInside(methodName, pointer, 32);
- #endif
- return I32x8::readAlignedUnsafe(pointer);
- }
- // Bound and alignment checked writing
- inline void writeAligned(dsr::SafePointer<int32_t> data, const char* methodName) const {
- int32_t* pointer = data.getUnsafe();
- assert(((uintptr_t)pointer & 31) == 0);
- #if defined SAFE_POINTER_CHECKS
- data.assertInside(methodName, pointer, 32);
- #endif
- this->writeAlignedUnsafe(pointer);
- }
- };
- union U32x8 {
- private:
- // The uninitialized default constructor is private for safety reasons.
- U32x8() {}
- public:
- // When the uninitialized constructor is needed for performance, use this named constructor instead.
- static inline U32x8 create_dangerous_uninitialized() { return U32x8(); }
- #if defined USE_256BIT_X_SIMD
- public:
- #if defined USE_DIRECT_SIMD_MEMORY_ACCESS
- // Only use if USE_DIRECT_SIMD_MEMORY_ACCESS is defined!
- uint32_t scalars[8];
- #endif
- // The SIMD vector of undefined type
- // Not accessible while emulating!
- SIMD_U32x8 v;
- // Construct a portable vector from a native SIMD vector
- explicit U32x8(const SIMD_U32x8& v) : v(v) {}
- // Construct a portable vector from a set of scalars
- U32x8(uint32_t a1, uint32_t a2, uint32_t a3, uint32_t a4, uint32_t a5, uint32_t a6, uint32_t a7, uint32_t a8)
- : v(LOAD_VECTOR_U32_SIMD256(a1, a2, a3, a4, a5, a6, a7, a8)) {}
- // Construct a portable vector from a single duplicated scalar
- explicit U32x8(uint32_t scalar) : v(LOAD_SCALAR_U32_SIMD256(scalar)) {}
- #else
- public:
- // Emulate a SIMD vector as an array of scalars without hardware support.
- // Only accessible while emulating!
- uint32_t scalars[8];
- // Construct a portable vector from a set of scalars
- U32x8(uint32_t a1, uint32_t a2, uint32_t a3, uint32_t a4, uint32_t a5, uint32_t a6, uint32_t a7, uint32_t a8) {
- this->scalars[0] = a1;
- this->scalars[1] = a2;
- this->scalars[2] = a3;
- this->scalars[3] = a4;
- this->scalars[4] = a5;
- this->scalars[5] = a6;
- this->scalars[6] = a7;
- this->scalars[7] = a8;
- }
- // Construct a portable vector from a single duplicated scalar
- explicit U32x8(uint32_t scalar) {
- this->scalars[0] = scalar;
- this->scalars[1] = scalar;
- this->scalars[2] = scalar;
- this->scalars[3] = scalar;
- this->scalars[4] = scalar;
- this->scalars[5] = scalar;
- this->scalars[6] = scalar;
- this->scalars[7] = scalar;
- }
- #endif
- // Create a gradient vector using start and increment, so that arbitrary length vectors have a way to initialize linear iterations.
- static inline U32x8 createGradient(uint32_t start, uint32_t increment) {
- return U32x8(
- start,
- start + increment,
- start + increment * 2,
- start + increment * 3,
- start + increment * 4,
- start + increment * 5,
- start + increment * 6,
- start + increment * 7
- );
- }
- // Construct a portable SIMD vector from a pointer to aligned data
- // data must be aligned to 32 bytes
- static inline U32x8 readAlignedUnsafe(const uint32_t* data) {
- #if defined USE_AVX2
- return U32x8(_mm256_load_si256((const __m256i*)data));
- #else
- return U32x8(data[0], data[1], data[2], data[3], data[4], data[5], data[6], data[7]);
- #endif
- }
- // Write to aligned memory from the existing vector
- // data must be aligned to 32 bytes
- inline void writeAlignedUnsafe(uint32_t* data) const {
- #if defined USE_AVX2
- _mm256_store_si256((__m256i*)data, this->v);
- #else
- data[0] = this->scalars[0];
- data[1] = this->scalars[1];
- data[2] = this->scalars[2];
- data[3] = this->scalars[3];
- data[4] = this->scalars[4];
- data[5] = this->scalars[5];
- data[6] = this->scalars[6];
- data[7] = this->scalars[7];
- #endif
- }
- // Bound and alignment checked reading
- static inline U32x8 readAligned(const dsr::SafePointer<uint32_t> data, const char* methodName) {
- const uint32_t* pointer = data.getUnsafe();
- assert(((uintptr_t)pointer & 31) == 0);
- #if defined SAFE_POINTER_CHECKS
- data.assertInside(methodName, pointer, 32);
- #endif
- return U32x8::readAlignedUnsafe(pointer);
- }
- // Bound and alignment checked writing
- inline void writeAligned(dsr::SafePointer<uint32_t> data, const char* methodName) const {
- uint32_t* pointer = data.getUnsafe();
- assert(((uintptr_t)pointer & 31) == 0);
- #if defined SAFE_POINTER_CHECKS
- data.assertInside(methodName, pointer, 32);
- #endif
- this->writeAlignedUnsafe(pointer);
- }
- };
- union U16x16 {
- private:
- // The uninitialized default constructor is private for safety reasons.
- U16x16() {}
- public:
- // When the uninitialized constructor is needed for performance, use this named constructor instead.
- static inline U16x16 create_dangerous_uninitialized() { return U16x16(); }
- #if defined USE_256BIT_X_SIMD
- public:
- #if defined USE_DIRECT_SIMD_MEMORY_ACCESS
- // Only use if USE_DIRECT_SIMD_MEMORY_ACCESS is defined!
- uint16_t scalars[16];
- #endif
- // The SIMD vector of undefined type
- // Not accessible while emulating!
- SIMD_U16x16 v;
- // Construct a portable vector from a native SIMD vector
- explicit U16x16(const SIMD_U16x16& v) : v(v) {}
- // Construct a vector of 16 x 16-bit unsigned integers from a vector of 8 x 32-bit unsigned integers
- // Reinterpret casting is used
- explicit U16x16(const U32x8& vector) : v(REINTERPRET_U32_TO_U16_SIMD256(vector.v)) {}
- // Construct a portable vector from a set of scalars
- U16x16(uint16_t a1, uint16_t a2, uint16_t a3, uint16_t a4, uint16_t a5, uint16_t a6, uint16_t a7, uint16_t a8,
- uint16_t a9, uint16_t a10, uint16_t a11, uint16_t a12, uint16_t a13, uint16_t a14, uint16_t a15, uint16_t a16)
- : v(LOAD_VECTOR_U16_SIMD256(a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11, a12, a13, a14, a15, a16)) {}
- // Construct a vector of 16 x 16-bit unsigned integers from a single duplicated 32-bit unsigned integer
- // Reinterpret casting is used
- // TODO: Remove all reinterprets from constructors to improve readability
- explicit U16x16(uint32_t scalar) : v(REINTERPRET_U32_TO_U16_SIMD256(LOAD_SCALAR_U32_SIMD256(scalar))) {}
- // Construct a portable vector from a single duplicated scalar
- explicit U16x16(uint16_t scalar) : v(LOAD_SCALAR_U16_SIMD256(scalar)) {}
- // Reinterpret cast to a vector of 8 x 32-bit unsigned integers
- U32x8 get_U32() const {
- return U32x8(REINTERPRET_U16_TO_U32_SIMD256(this->v));
- }
- #else
- public:
- // Emulate a SIMD vector as an array of scalars without hardware support.
- // Only accessible while emulating!
- uint16_t scalars[16];
- // Construct a vector of 16 x 16-bit unsigned integers from a vector of 8 x 32-bit unsigned integers
- // Reinterpret casting is used
- explicit U16x16(const U32x8& vector) {
- uint64_t *target = (uint64_t*)this->scalars;
- uint64_t *source = (uint64_t*)vector.scalars;
- target[0] = source[0];
- target[1] = source[1];
- target[2] = source[2];
- target[3] = source[3];
- }
- // Construct a portable vector from a set of scalars
- U16x16(uint16_t a1, uint16_t a2, uint16_t a3, uint16_t a4, uint16_t a5, uint16_t a6, uint16_t a7, uint16_t a8,
- uint16_t a9, uint16_t a10, uint16_t a11, uint16_t a12, uint16_t a13, uint16_t a14, uint16_t a15, uint16_t a16) {
- this->scalars[0] = a1;
- this->scalars[1] = a2;
- this->scalars[2] = a3;
- this->scalars[3] = a4;
- this->scalars[4] = a5;
- this->scalars[5] = a6;
- this->scalars[6] = a7;
- this->scalars[7] = a8;
- this->scalars[8] = a9;
- this->scalars[9] = a10;
- this->scalars[10] = a11;
- this->scalars[11] = a12;
- this->scalars[12] = a13;
- this->scalars[13] = a14;
- this->scalars[14] = a15;
- this->scalars[15] = a16;
- }
- // Construct a vector of 16 x 16-bit unsigned integers from a single duplicated 32-bit unsigned integer
- // Reinterpret casting is used
- explicit U16x16(uint32_t scalar) {
- uint32_t *target = (uint32_t*)this->scalars;
- target[0] = scalar;
- target[1] = scalar;
- target[2] = scalar;
- target[3] = scalar;
- target[4] = scalar;
- target[5] = scalar;
- target[6] = scalar;
- target[7] = scalar;
- }
- // Construct a portable vector from a single duplicated scalar
- explicit U16x16(uint16_t scalar) {
- this->scalars[0] = scalar;
- this->scalars[1] = scalar;
- this->scalars[2] = scalar;
- this->scalars[3] = scalar;
- this->scalars[4] = scalar;
- this->scalars[5] = scalar;
- this->scalars[6] = scalar;
- this->scalars[7] = scalar;
- this->scalars[8] = scalar;
- this->scalars[9] = scalar;
- this->scalars[10] = scalar;
- this->scalars[11] = scalar;
- this->scalars[12] = scalar;
- this->scalars[13] = scalar;
- this->scalars[14] = scalar;
- this->scalars[15] = scalar;
- }
- // Reinterpret cast to a vector of 8 x 32-bit unsigned integers
- U32x8 get_U32() const {
- U32x8 result(0);
- uint64_t *target = (uint64_t*)result.scalars;
- uint64_t *source = (uint64_t*)this->scalars;
- target[0] = source[0];
- target[1] = source[1];
- target[2] = source[2];
- target[3] = source[3];
- return result;
- }
- #endif
- // Create a gradient vector using start and increment, so that arbitrary length vectors have a way to initialize linear iterations.
- static inline U16x16 createGradient(uint16_t start, uint16_t increment) {
- return U16x16(
- start,
- start + increment,
- start + increment * 2,
- start + increment * 3,
- start + increment * 4,
- start + increment * 5,
- start + increment * 6,
- start + increment * 7,
- start + increment * 8,
- start + increment * 9,
- start + increment * 10,
- start + increment * 11,
- start + increment * 12,
- start + increment * 13,
- start + increment * 14,
- start + increment * 15
- );
- }
- // data must be aligned to 32 bytes
- static inline U16x16 readAlignedUnsafe(const uint16_t* data) {
- #if defined USE_AVX2
- return U16x16(_mm256_load_si256((const __m256i*)data));
- #else
- return U16x16(
- data[0],
- data[1],
- data[2],
- data[3],
- data[4],
- data[5],
- data[6],
- data[7],
- data[8],
- data[9],
- data[10],
- data[11],
- data[12],
- data[13],
- data[14],
- data[15]
- );
- #endif
- }
- // data must be aligned to 32 bytes
- inline void writeAlignedUnsafe(uint16_t* data) const {
- #if defined USE_AVX2
- _mm256_store_si256((__m256i*)data, this->v);
- #else
- data[0] = this->scalars[0];
- data[1] = this->scalars[1];
- data[2] = this->scalars[2];
- data[3] = this->scalars[3];
- data[4] = this->scalars[4];
- data[5] = this->scalars[5];
- data[6] = this->scalars[6];
- data[7] = this->scalars[7];
- data[8] = this->scalars[8];
- data[9] = this->scalars[9];
- data[10] = this->scalars[10];
- data[11] = this->scalars[11];
- data[12] = this->scalars[12];
- data[13] = this->scalars[13];
- data[14] = this->scalars[14];
- data[15] = this->scalars[15];
- #endif
- }
- // Bound and alignment checked reading
- static inline U16x16 readAligned(const dsr::SafePointer<uint16_t> data, const char* methodName) {
- const uint16_t* pointer = data.getUnsafe();
- assert(((uintptr_t)pointer & 31) == 0);
- #if defined SAFE_POINTER_CHECKS
- data.assertInside(methodName, pointer, 32);
- #endif
- return U16x16::readAlignedUnsafe(pointer);
- }
- // Bound and alignment checked writing
- inline void writeAligned(dsr::SafePointer<uint16_t> data, const char* methodName) const {
- uint16_t* pointer = data.getUnsafe();
- assert(((uintptr_t)pointer & 31) == 0);
- #if defined SAFE_POINTER_CHECKS
- data.assertInside(methodName, pointer, 32);
- #endif
- this->writeAlignedUnsafe(pointer);
- }
- };
- union U8x32 {
- private:
- // The uninitialized default constructor is private for safety reasons.
- U8x32() {}
- public:
- // When the uninitialized constructor is needed for performance, use this named constructor instead.
- static inline U8x32 create_dangerous_uninitialized() { return U8x32(); }
- #if defined USE_256BIT_X_SIMD
- public:
- #if defined USE_DIRECT_SIMD_MEMORY_ACCESS
- // Only use if USE_DIRECT_SIMD_MEMORY_ACCESS is defined!
- uint8_t scalars[32];
- #endif
- // The SIMD vector of undefined type
- // Not accessible while emulating!
- SIMD_U8x32 v;
- // Construct a portable vector from a native SIMD vector
- explicit U8x32(const SIMD_U8x32& v) : v(v) {}
- // Construct a portable vector from a set of scalars
- U8x32(uint8_t a1, uint8_t a2, uint8_t a3, uint8_t a4, uint8_t a5, uint8_t a6, uint8_t a7, uint8_t a8,
- uint8_t a9, uint8_t a10, uint8_t a11, uint8_t a12, uint8_t a13, uint8_t a14, uint8_t a15, uint8_t a16,
- uint8_t a17, uint8_t a18, uint8_t a19, uint8_t a20, uint8_t a21, uint8_t a22, uint8_t a23, uint8_t a24,
- uint8_t a25, uint8_t a26, uint8_t a27, uint8_t a28, uint8_t a29, uint8_t a30, uint8_t a31, uint8_t a32)
- : v(LOAD_VECTOR_U8_SIMD256(
- a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11, a12, a13, a14, a15, a16,
- a17, a18, a19, a20, a21, a22, a23, a24, a25, a26, a27, a28, a29, a30, a31, a32
- )) {}
- // Construct a portable vector from a single duplicated scalar
- explicit U8x32(uint8_t scalar) : v(LOAD_SCALAR_U8_SIMD256(scalar)) {}
- #else
- public:
- // Emulate a SIMD vector as an array of scalars without hardware support.
- // Only accessible while emulating!
- uint8_t scalars[32];
- // Construct a portable vector from a set of scalars
- U8x32(uint8_t a1, uint8_t a2, uint8_t a3, uint8_t a4, uint8_t a5, uint8_t a6, uint8_t a7, uint8_t a8,
- uint8_t a9, uint8_t a10, uint8_t a11, uint8_t a12, uint8_t a13, uint8_t a14, uint8_t a15, uint8_t a16,
- uint8_t a17, uint8_t a18, uint8_t a19, uint8_t a20, uint8_t a21, uint8_t a22, uint8_t a23, uint8_t a24,
- uint8_t a25, uint8_t a26, uint8_t a27, uint8_t a28, uint8_t a29, uint8_t a30, uint8_t a31, uint8_t a32) {
- this->scalars[0] = a1;
- this->scalars[1] = a2;
- this->scalars[2] = a3;
- this->scalars[3] = a4;
- this->scalars[4] = a5;
- this->scalars[5] = a6;
- this->scalars[6] = a7;
- this->scalars[7] = a8;
- this->scalars[8] = a9;
- this->scalars[9] = a10;
- this->scalars[10] = a11;
- this->scalars[11] = a12;
- this->scalars[12] = a13;
- this->scalars[13] = a14;
- this->scalars[14] = a15;
- this->scalars[15] = a16;
- this->scalars[16] = a17;
- this->scalars[17] = a18;
- this->scalars[18] = a19;
- this->scalars[19] = a20;
- this->scalars[20] = a21;
- this->scalars[21] = a22;
- this->scalars[22] = a23;
- this->scalars[23] = a24;
- this->scalars[24] = a25;
- this->scalars[25] = a26;
- this->scalars[26] = a27;
- this->scalars[27] = a28;
- this->scalars[28] = a29;
- this->scalars[29] = a30;
- this->scalars[30] = a31;
- this->scalars[31] = a32;
- }
- // Construct a portable vector from a single duplicated scalar
- explicit U8x32(uint8_t scalar) {
- for (int i = 0; i < 32; i++) {
- this->scalars[i] = scalar;
- }
- }
- #endif
- // Create a gradient vector using start and increment, so that arbitrary length vectors have a way to initialize linear iterations.
- static inline U8x32 createGradient(uint8_t start, uint8_t increment) {
- return U8x32(
- start,
- start + increment,
- start + increment * 2,
- start + increment * 3,
- start + increment * 4,
- start + increment * 5,
- start + increment * 6,
- start + increment * 7,
- start + increment * 8,
- start + increment * 9,
- start + increment * 10,
- start + increment * 11,
- start + increment * 12,
- start + increment * 13,
- start + increment * 14,
- start + increment * 15,
- start + increment * 16,
- start + increment * 17,
- start + increment * 18,
- start + increment * 19,
- start + increment * 20,
- start + increment * 21,
- start + increment * 22,
- start + increment * 23,
- start + increment * 24,
- start + increment * 25,
- start + increment * 26,
- start + increment * 27,
- start + increment * 28,
- start + increment * 29,
- start + increment * 30,
- start + increment * 31
- );
- }
- static inline U8x32 readAlignedUnsafe(const uint8_t* data) {
- #if defined USE_AVX2
- return U8x32(_mm256_load_si256((const __m256i*)data));
- #else
- U8x32 result;
- for (int i = 0; i < 32; i++) {
- result.scalars[i] = data[i];
- }
- return result;
- #endif
- }
- // data must be aligned to 32 bytes
- inline void writeAlignedUnsafe(uint8_t* data) const {
- #if defined USE_AVX2
- _mm256_store_si256((__m256i*)data, this->v);
- #else
- for (int i = 0; i < 32; i++) {
- data[i] = this->scalars[i];
- }
- #endif
- }
- // Bound and alignment checked reading
- static inline U8x32 readAligned(const dsr::SafePointer<uint8_t> data, const char* methodName) {
- const uint8_t* pointer = data.getUnsafe();
- assert(((uintptr_t)pointer & 31) == 0);
- #if defined SAFE_POINTER_CHECKS
- data.assertInside(methodName, pointer, 32);
- #endif
- return U8x32::readAlignedUnsafe(pointer);
- }
- // Bound and alignment checked writing
- inline void writeAligned(dsr::SafePointer<uint8_t> data, const char* methodName) const {
- uint8_t* pointer = data.getUnsafe();
- assert(((uintptr_t)pointer & 31) == 0);
- #if defined SAFE_POINTER_CHECKS
- data.assertInside(methodName, pointer, 32);
- #endif
- this->writeAlignedUnsafe(pointer);
- }
- };
- // Helper macros for applying an operation to selected sets of SIMD vector types.
- // Each list expands DO(vector_type, element_type, lane_count) once per type in the set.
- #define FOR_ALL_VECTOR_TYPES(DO) \
- DO(F32x4, float, 4) \
- DO(I32x4, int32_t, 4) \
- DO(U32x4, uint32_t, 4) \
- DO(U16x8, uint16_t, 8) \
- DO(U8x16, uint8_t, 16) \
- DO(F32x8, float, 8) \
- DO(I32x8, int32_t, 8) \
- DO(U32x8, uint32_t, 8) \
- DO(U16x16, uint16_t, 16) \
- DO(U8x32, uint8_t, 32)
- #define FOR_FLOAT_VECTOR_TYPES(DO) \
- DO(F32x4, float, 4) \
- DO(F32x8, float, 8)
- #define FOR_INTEGER_VECTOR_TYPES(DO) \
- DO(I32x4, int32_t, 4) \
- DO(U32x4, uint32_t, 4) \
- DO(U16x8, uint16_t, 8) \
- DO(U8x16, uint8_t, 16) \
- DO(I32x8, int32_t, 8) \
- DO(U32x8, uint32_t, 8) \
- DO(U16x16, uint16_t, 16) \
- DO(U8x32, uint8_t, 32)
- #define FOR_SIGNED_VECTOR_TYPES(DO) \
- DO(F32x4, float, 4) \
- DO(I32x4, int32_t, 4) \
- DO(F32x8, float, 8) \
- DO(I32x8, int32_t, 8)
- #define FOR_UNSIGNED_VECTOR_TYPES(DO) \
- DO(U32x4, uint32_t, 4) \
- DO(U16x8, uint16_t, 8) \
- DO(U8x16, uint8_t, 16) \
- DO(U32x8, uint32_t, 8) \
- DO(U16x16, uint16_t, 16) \
- DO(U8x32, uint8_t, 32)
- // Print SIMD vectors to the terminal or append them to strings.
- #define CREATE_METHOD_PRINT(VECTOR_TYPE, ELEMENT_TYPE, LANE_COUNT) \
- inline dsr::String& string_toStreamIndented(dsr::String& target, const VECTOR_TYPE& source, const dsr::ReadableString& indentation) { \
- ALIGN_BYTES(sizeof(VECTOR_TYPE)) ELEMENT_TYPE a[LANE_COUNT]; \
- source.writeAlignedUnsafe(a); \
- dsr::string_append(target, indentation, a[0]); \
- for (int i = 1; i < LANE_COUNT; i++) { \
- string_append(target, U", ", a[i]); \
- } \
- return target; \
- }
- // All SIMD vectors can be printed.
- FOR_ALL_VECTOR_TYPES(CREATE_METHOD_PRINT)
- #undef CREATE_METHOD_PRINT
- // Whole comparisons returning a single boolean, mainly for regression tests.
- #define CREATE_EXACT_EQUALITY(VECTOR_TYPE, ELEMENT_TYPE, LANE_COUNT) \
- inline bool operator==(const VECTOR_TYPE& left, const VECTOR_TYPE& right) { \
- ALIGN_BYTES(sizeof(VECTOR_TYPE)) ELEMENT_TYPE a[LANE_COUNT]; \
- ALIGN_BYTES(sizeof(VECTOR_TYPE)) ELEMENT_TYPE b[LANE_COUNT]; \
- left.writeAlignedUnsafe(a); \
- right.writeAlignedUnsafe(b); \
- for (int i = 0; i < LANE_COUNT; i++) { \
- if (a[i] != b[i]) return false; \
- } \
- return true; \
- }
- // Integer SIMD vectors have exact equality.
- FOR_INTEGER_VECTOR_TYPES(CREATE_EXACT_EQUALITY)
- #undef CREATE_EXACT_EQUALITY
- #define CREATE_TOLERANT_EQUALITY(VECTOR_TYPE, ELEMENT_TYPE, LANE_COUNT) \
- inline bool operator==(const VECTOR_TYPE& left, const VECTOR_TYPE& right) { \
- ALIGN_BYTES(sizeof(VECTOR_TYPE)) ELEMENT_TYPE a[LANE_COUNT]; \
- ALIGN_BYTES(sizeof(VECTOR_TYPE)) ELEMENT_TYPE b[LANE_COUNT]; \
- left.writeAlignedUnsafe(a); \
- right.writeAlignedUnsafe(b); \
- for (int i = 0; i < LANE_COUNT; i++) { \
- if (fabs(a[i] - b[i]) >= 0.0001f) return false; \
- } \
- return true; \
- }
- // Float SIMD vectors have inexact equality.
- FOR_FLOAT_VECTOR_TYPES(CREATE_TOLERANT_EQUALITY)
- #undef CREATE_TOLERANT_EQUALITY
- #define CREATE_INEQUALITY(VECTOR_TYPE, ELEMENT_TYPE, LANE_COUNT) \
- inline bool operator!=(const VECTOR_TYPE& left, const VECTOR_TYPE& right) { \
- return !(left == right); \
- }
- // All SIMD vectors have inequality.
- FOR_ALL_VECTOR_TYPES(CREATE_INEQUALITY)
- #undef CREATE_INEQUALITY
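- // Illustrative sketch (not part of the API): how a regression test might use whole-vector comparison.
- //   U32x4 expected(1u, 2u, 3u, 4u);
- //   U32x4 result = U32x4(0u, 1u, 2u, 3u) + U32x4(1u);  // operator+ is defined below
- //   if (result != expected) { /* report a test failure */ }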
- inline F32x4 operator+(const F32x4& left, const F32x4& right) {
- #if defined USE_BASIC_SIMD
- return F32x4(ADD_F32_SIMD(left.v, right.v));
- #else
- return F32x4(left.scalars[0] + right.scalars[0], left.scalars[1] + right.scalars[1], left.scalars[2] + right.scalars[2], left.scalars[3] + right.scalars[3]);
- #endif
- }
- inline F32x4 operator-(const F32x4& left, const F32x4& right) {
- #if defined USE_BASIC_SIMD
- return F32x4(SUB_F32_SIMD(left.v, right.v));
- #else
- return F32x4(left.scalars[0] - right.scalars[0], left.scalars[1] - right.scalars[1], left.scalars[2] - right.scalars[2], left.scalars[3] - right.scalars[3]);
- #endif
- }
- inline F32x4 operator*(const F32x4& left, const F32x4& right) {
- #if defined USE_BASIC_SIMD
- return F32x4(MUL_F32_SIMD(left.v, right.v));
- #else
- return F32x4(left.scalars[0] * right.scalars[0], left.scalars[1] * right.scalars[1], left.scalars[2] * right.scalars[2], left.scalars[3] * right.scalars[3]);
- #endif
- }
- inline F32x4 min(const F32x4& left, const F32x4& right) {
- #if defined USE_BASIC_SIMD
- return F32x4(MIN_F32_SIMD(left.v, right.v));
- #else
- float v0 = left.scalars[0];
- float v1 = left.scalars[1];
- float v2 = left.scalars[2];
- float v3 = left.scalars[3];
- float r0 = right.scalars[0];
- float r1 = right.scalars[1];
- float r2 = right.scalars[2];
- float r3 = right.scalars[3];
- if (r0 < v0) { v0 = r0; }
- if (r1 < v1) { v1 = r1; }
- if (r2 < v2) { v2 = r2; }
- if (r3 < v3) { v3 = r3; }
- return F32x4(v0, v1, v2, v3);
- #endif
- }
- inline F32x4 max(const F32x4& left, const F32x4& right) {
- #if defined USE_BASIC_SIMD
- return F32x4(MAX_F32_SIMD(left.v, right.v));
- #else
- float v0 = left.scalars[0];
- float v1 = left.scalars[1];
- float v2 = left.scalars[2];
- float v3 = left.scalars[3];
- float r0 = right.scalars[0];
- float r1 = right.scalars[1];
- float r2 = right.scalars[2];
- float r3 = right.scalars[3];
- if (r0 > v0) { v0 = r0; }
- if (r1 > v1) { v1 = r1; }
- if (r2 > v2) { v2 = r2; }
- if (r3 > v3) { v3 = r3; }
- return F32x4(v0, v1, v2, v3);
- #endif
- }
- inline I32x4 operator+(const I32x4& left, const I32x4& right) {
- #if defined USE_BASIC_SIMD
- return I32x4(ADD_I32_SIMD(left.v, right.v));
- #else
- return I32x4(left.scalars[0] + right.scalars[0], left.scalars[1] + right.scalars[1], left.scalars[2] + right.scalars[2], left.scalars[3] + right.scalars[3]);
- #endif
- }
- inline I32x4 operator-(const I32x4& left, const I32x4& right) {
- #if defined USE_BASIC_SIMD
- return I32x4(SUB_I32_SIMD(left.v, right.v));
- #else
- return I32x4(left.scalars[0] - right.scalars[0], left.scalars[1] - right.scalars[1], left.scalars[2] - right.scalars[2], left.scalars[3] - right.scalars[3]);
- #endif
- }
- inline I32x4 operator*(const I32x4& left, const I32x4& right) {
- #if defined USE_BASIC_SIMD
- #if defined USE_SSE2
- // SSE2 has no lane-wise 32-bit integer multiply, so emulate NEON's behavior with scalar math
- return I32x4(left.scalars[0] * right.scalars[0], left.scalars[1] * right.scalars[1], left.scalars[2] * right.scalars[2], left.scalars[3] * right.scalars[3]);
- #elif defined USE_NEON
- return I32x4(MUL_I32_NEON(left.v, right.v));
- #endif
- #else
- return I32x4(left.scalars[0] * right.scalars[0], left.scalars[1] * right.scalars[1], left.scalars[2] * right.scalars[2], left.scalars[3] * right.scalars[3]);
- #endif
- }
- inline U32x4 operator+(const U32x4& left, const U32x4& right) {
- #if defined USE_BASIC_SIMD
- return U32x4(ADD_U32_SIMD(left.v, right.v));
- #else
- return U32x4(left.scalars[0] + right.scalars[0], left.scalars[1] + right.scalars[1], left.scalars[2] + right.scalars[2], left.scalars[3] + right.scalars[3]);
- #endif
- }
- inline U32x4 operator-(const U32x4& left, const U32x4& right) {
- #if defined USE_BASIC_SIMD
- return U32x4(SUB_U32_SIMD(left.v, right.v));
- #else
- return U32x4(left.scalars[0] - right.scalars[0], left.scalars[1] - right.scalars[1], left.scalars[2] - right.scalars[2], left.scalars[3] - right.scalars[3]);
- #endif
- }
- inline U32x4 operator*(const U32x4& left, const U32x4& right) {
- #if defined USE_BASIC_SIMD
- #if defined USE_SSE2
- // SSE2 has no lane-wise 32-bit integer multiply, so emulate NEON's behavior with scalar math
- return U32x4(left.scalars[0] * right.scalars[0], left.scalars[1] * right.scalars[1], left.scalars[2] * right.scalars[2], left.scalars[3] * right.scalars[3]);
- #else // NEON
- return U32x4(MUL_U32_NEON(left.v, right.v));
- #endif
- #else
- return U32x4(left.scalars[0] * right.scalars[0], left.scalars[1] * right.scalars[1], left.scalars[2] * right.scalars[2], left.scalars[3] * right.scalars[3]);
- #endif
- }
- inline U32x4 operator&(const U32x4& left, const U32x4& right) {
- #if defined USE_BASIC_SIMD
- return U32x4(BITWISE_AND_U32_SIMD(left.v, right.v));
- #else
- return U32x4(left.scalars[0] & right.scalars[0], left.scalars[1] & right.scalars[1], left.scalars[2] & right.scalars[2], left.scalars[3] & right.scalars[3]);
- #endif
- }
- inline U32x4 operator|(const U32x4& left, const U32x4& right) {
- #if defined USE_BASIC_SIMD
- return U32x4(BITWISE_OR_U32_SIMD(left.v, right.v));
- #else
- return U32x4(left.scalars[0] | right.scalars[0], left.scalars[1] | right.scalars[1], left.scalars[2] | right.scalars[2], left.scalars[3] | right.scalars[3]);
- #endif
- }
- inline U32x4 operator^(const U32x4& left, const U32x4& right) {
- #if defined USE_BASIC_SIMD
- return U32x4(BITWISE_XOR_U32_SIMD(left.v, right.v));
- #else
- return U32x4(left.scalars[0] ^ right.scalars[0], left.scalars[1] ^ right.scalars[1], left.scalars[2] ^ right.scalars[2], left.scalars[3] ^ right.scalars[3]);
- #endif
- }
- inline U32x4 operator<<(const U32x4& left, uint32_t bitOffset) {
- #if defined USE_SSE2
- return U32x4(_mm_slli_epi32(left.v, bitOffset));
- #else
- #if defined USE_NEON
- return U32x4(vshlq_u32(left.v, LOAD_SCALAR_I32_SIMD(bitOffset)));
- #else
- return U32x4(left.scalars[0] << bitOffset, left.scalars[1] << bitOffset, left.scalars[2] << bitOffset, left.scalars[3] << bitOffset);
- #endif
- #endif
- }
- inline U32x4 operator>>(const U32x4& left, uint32_t bitOffset) {
- #if defined USE_SSE2
- return U32x4(_mm_srli_epi32(left.v, bitOffset));
- #else
- #if defined USE_NEON
- return U32x4(vshlq_u32(left.v, LOAD_SCALAR_I32_SIMD(-bitOffset)));
- #else
- return U32x4(left.scalars[0] >> bitOffset, left.scalars[1] >> bitOffset, left.scalars[2] >> bitOffset, left.scalars[3] >> bitOffset);
- #endif
- #endif
- }
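- // Illustrative sketch (not part of the API): shifting and masking can unpack 8-bit channels from
- // packed 32-bit pixels, assuming a hypothetical 0x00RRGGBB packing.
- //   U32x4 packed(0x00FF8040u, 0x00FF8040u, 0x00FF8040u, 0x00FF8040u);
- //   U32x4 red   = (packed >> 16) & U32x4(255u);
- //   U32x4 green = (packed >> 8) & U32x4(255u);
- //   U32x4 blue  = packed & U32x4(255u);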
- inline U16x8 operator+(const U16x8& left, const U16x8& right) {
- #if defined USE_BASIC_SIMD
- return U16x8(ADD_U16_SIMD(left.v, right.v));
- #else
- return U16x8(left.scalars[0] + right.scalars[0], left.scalars[1] + right.scalars[1], left.scalars[2] + right.scalars[2], left.scalars[3] + right.scalars[3],
- left.scalars[4] + right.scalars[4], left.scalars[5] + right.scalars[5], left.scalars[6] + right.scalars[6], left.scalars[7] + right.scalars[7]);
- #endif
- }
- inline U16x8 operator-(const U16x8& left, const U16x8& right) {
- #if defined USE_BASIC_SIMD
- return U16x8(SUB_U16_SIMD(left.v, right.v));
- #else
- return U16x8(left.scalars[0] - right.scalars[0], left.scalars[1] - right.scalars[1], left.scalars[2] - right.scalars[2], left.scalars[3] - right.scalars[3],
- left.scalars[4] - right.scalars[4], left.scalars[5] - right.scalars[5], left.scalars[6] - right.scalars[6], left.scalars[7] - right.scalars[7]);
- #endif
- }
- inline U16x8 operator*(const U16x8& left, const U16x8& right) {
- #if defined USE_BASIC_SIMD
- return U16x8(MUL_U16_SIMD(left.v, right.v));
- #else
- return U16x8(left.scalars[0] * right.scalars[0], left.scalars[1] * right.scalars[1], left.scalars[2] * right.scalars[2], left.scalars[3] * right.scalars[3],
- left.scalars[4] * right.scalars[4], left.scalars[5] * right.scalars[5], left.scalars[6] * right.scalars[6], left.scalars[7] * right.scalars[7]);
- #endif
- }
- inline U8x16 operator+(const U8x16& left, const U8x16& right) {
- #if defined USE_BASIC_SIMD
- return U8x16(ADD_U8_SIMD(left.v, right.v));
- #else
- return U8x16(
- left.scalars[0] + right.scalars[0],
- left.scalars[1] + right.scalars[1],
- left.scalars[2] + right.scalars[2],
- left.scalars[3] + right.scalars[3],
- left.scalars[4] + right.scalars[4],
- left.scalars[5] + right.scalars[5],
- left.scalars[6] + right.scalars[6],
- left.scalars[7] + right.scalars[7],
- left.scalars[8] + right.scalars[8],
- left.scalars[9] + right.scalars[9],
- left.scalars[10] + right.scalars[10],
- left.scalars[11] + right.scalars[11],
- left.scalars[12] + right.scalars[12],
- left.scalars[13] + right.scalars[13],
- left.scalars[14] + right.scalars[14],
- left.scalars[15] + right.scalars[15]
- );
- #endif
- }
- inline U8x16 operator-(const U8x16& left, const U8x16& right) {
- #if defined USE_BASIC_SIMD
- return U8x16(SUB_U8_SIMD(left.v, right.v));
- #else
- return U8x16(
- left.scalars[0] - right.scalars[0],
- left.scalars[1] - right.scalars[1],
- left.scalars[2] - right.scalars[2],
- left.scalars[3] - right.scalars[3],
- left.scalars[4] - right.scalars[4],
- left.scalars[5] - right.scalars[5],
- left.scalars[6] - right.scalars[6],
- left.scalars[7] - right.scalars[7],
- left.scalars[8] - right.scalars[8],
- left.scalars[9] - right.scalars[9],
- left.scalars[10] - right.scalars[10],
- left.scalars[11] - right.scalars[11],
- left.scalars[12] - right.scalars[12],
- left.scalars[13] - right.scalars[13],
- left.scalars[14] - right.scalars[14],
- left.scalars[15] - right.scalars[15]
- );
- #endif
- }
- inline uint8_t impl_limit0(int32_t x) { return x < 0 ? 0 : x; }
- inline uint8_t impl_limit255(uint32_t x) { return x > 255 ? 255 : x; }
- inline U8x16 saturatedAddition(const U8x16& left, const U8x16& right) {
- #if defined USE_BASIC_SIMD
- return U8x16(ADD_SAT_U8_SIMD(left.v, right.v));
- #else
- return U8x16(
- impl_limit255((uint32_t)left.scalars[0] + (uint32_t)right.scalars[0]),
- impl_limit255((uint32_t)left.scalars[1] + (uint32_t)right.scalars[1]),
- impl_limit255((uint32_t)left.scalars[2] + (uint32_t)right.scalars[2]),
- impl_limit255((uint32_t)left.scalars[3] + (uint32_t)right.scalars[3]),
- impl_limit255((uint32_t)left.scalars[4] + (uint32_t)right.scalars[4]),
- impl_limit255((uint32_t)left.scalars[5] + (uint32_t)right.scalars[5]),
- impl_limit255((uint32_t)left.scalars[6] + (uint32_t)right.scalars[6]),
- impl_limit255((uint32_t)left.scalars[7] + (uint32_t)right.scalars[7]),
- impl_limit255((uint32_t)left.scalars[8] + (uint32_t)right.scalars[8]),
- impl_limit255((uint32_t)left.scalars[9] + (uint32_t)right.scalars[9]),
- impl_limit255((uint32_t)left.scalars[10] + (uint32_t)right.scalars[10]),
- impl_limit255((uint32_t)left.scalars[11] + (uint32_t)right.scalars[11]),
- impl_limit255((uint32_t)left.scalars[12] + (uint32_t)right.scalars[12]),
- impl_limit255((uint32_t)left.scalars[13] + (uint32_t)right.scalars[13]),
- impl_limit255((uint32_t)left.scalars[14] + (uint32_t)right.scalars[14]),
- impl_limit255((uint32_t)left.scalars[15] + (uint32_t)right.scalars[15])
- );
- #endif
- }
- inline U8x16 saturatedSubtraction(const U8x16& left, const U8x16& right) {
- #if defined USE_BASIC_SIMD
- return U8x16(SUB_SAT_U8_SIMD(left.v, right.v));
- #else
- return U8x16(
- impl_limit0((int32_t)left.scalars[0] - (int32_t)right.scalars[0]),
- impl_limit0((int32_t)left.scalars[1] - (int32_t)right.scalars[1]),
- impl_limit0((int32_t)left.scalars[2] - (int32_t)right.scalars[2]),
- impl_limit0((int32_t)left.scalars[3] - (int32_t)right.scalars[3]),
- impl_limit0((int32_t)left.scalars[4] - (int32_t)right.scalars[4]),
- impl_limit0((int32_t)left.scalars[5] - (int32_t)right.scalars[5]),
- impl_limit0((int32_t)left.scalars[6] - (int32_t)right.scalars[6]),
- impl_limit0((int32_t)left.scalars[7] - (int32_t)right.scalars[7]),
- impl_limit0((int32_t)left.scalars[8] - (int32_t)right.scalars[8]),
- impl_limit0((int32_t)left.scalars[9] - (int32_t)right.scalars[9]),
- impl_limit0((int32_t)left.scalars[10] - (int32_t)right.scalars[10]),
- impl_limit0((int32_t)left.scalars[11] - (int32_t)right.scalars[11]),
- impl_limit0((int32_t)left.scalars[12] - (int32_t)right.scalars[12]),
- impl_limit0((int32_t)left.scalars[13] - (int32_t)right.scalars[13]),
- impl_limit0((int32_t)left.scalars[14] - (int32_t)right.scalars[14]),
- impl_limit0((int32_t)left.scalars[15] - (int32_t)right.scalars[15])
- );
- #endif
- }
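- // Illustrative sketch (not part of the API): the per-lane absolute difference |a - b| can be computed
- // without branches, because one of the two saturated differences is always zero.
- //   U8x16 difference = saturatedSubtraction(a, b) + saturatedSubtraction(b, a);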
- inline I32x4 truncateToI32(const F32x4& vector) {
- #if defined USE_BASIC_SIMD
- return I32x4(F32_TO_I32_SIMD(vector.v));
- #else
- return I32x4((int32_t)vector.scalars[0], (int32_t)vector.scalars[1], (int32_t)vector.scalars[2], (int32_t)vector.scalars[3]);
- #endif
- }
- inline U32x4 truncateToU32(const F32x4& vector) {
- #if defined USE_BASIC_SIMD
- return U32x4(F32_TO_U32_SIMD(vector.v));
- #else
- return U32x4((uint32_t)vector.scalars[0], (uint32_t)vector.scalars[1], (uint32_t)vector.scalars[2], (uint32_t)vector.scalars[3]);
- #endif
- }
- inline F32x4 floatFromI32(const I32x4& vector) {
- #if defined USE_BASIC_SIMD
- return F32x4(I32_TO_F32_SIMD(vector.v));
- #else
- return F32x4((float)vector.scalars[0], (float)vector.scalars[1], (float)vector.scalars[2], (float)vector.scalars[3]);
- #endif
- }
- inline F32x4 floatFromU32(const U32x4& vector) {
- #if defined USE_BASIC_SIMD
- return F32x4(U32_TO_F32_SIMD(vector.v));
- #else
- return F32x4((float)vector.scalars[0], (float)vector.scalars[1], (float)vector.scalars[2], (float)vector.scalars[3]);
- #endif
- }
- inline I32x4 I32FromU32(const U32x4& vector) {
- #if defined USE_BASIC_SIMD
- return I32x4(REINTERPRET_U32_TO_I32_SIMD(vector.v));
- #else
- return I32x4((int32_t)vector.scalars[0], (int32_t)vector.scalars[1], (int32_t)vector.scalars[2], (int32_t)vector.scalars[3]);
- #endif
- }
- inline U32x4 U32FromI32(const I32x4& vector) {
- #if defined USE_BASIC_SIMD
- return U32x4(REINTERPRET_I32_TO_U32_SIMD(vector.v));
- #else
- return U32x4((uint32_t)vector.scalars[0], (uint32_t)vector.scalars[1], (uint32_t)vector.scalars[2], (uint32_t)vector.scalars[3]);
- #endif
- }
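- // Illustrative sketch (not part of the API): truncateToI32 rounds toward zero, so adding 0.5 first
- // gives round-to-nearest for non-negative values. Here "positions" is a hypothetical F32x4.
- //   I32x4 rounded = truncateToI32(positions + F32x4(0.5f));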
- // Warning! Behavior depends on endianness.
- inline U8x16 reinterpret_U8FromU32(const U32x4& vector) {
- #if defined USE_BASIC_SIMD
- return U8x16(REINTERPRET_U32_TO_U8_SIMD(vector.v));
- #else
- const uint8_t *source = (const uint8_t*)vector.scalars;
- return U8x16(
- source[0], source[1], source[2], source[3], source[4], source[5], source[6], source[7],
- source[8], source[9], source[10], source[11], source[12], source[13], source[14], source[15]
- );
- #endif
- }
- // Warning! Behavior depends on endianness.
- inline U32x4 reinterpret_U32FromU8(const U8x16& vector) {
- #if defined USE_BASIC_SIMD
- return U32x4(REINTERPRET_U8_TO_U32_SIMD(vector.v));
- #else
- const uint32_t *source = (const uint32_t*)vector.scalars;
- return U32x4(source[0], source[1], source[2], source[3]);
- #endif
- }
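- // Worked example of the endianness warning above: reinterpreting U32x4(0x04030201u, ...) as bytes
- // yields 0x01, 0x02, 0x03, 0x04, ... in memory order on a little-endian machine, but
- // 0x04, 0x03, 0x02, 0x01, ... on a big-endian machine, so code relying on a specific byte order is not portable.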
- // Unpacking to larger integers
- inline U32x4 lowerToU32(const U16x8& vector) {
- #if defined USE_BASIC_SIMD
- return U32x4(U16_LOW_TO_U32_SIMD(vector.v));
- #else
- return U32x4(vector.scalars[0], vector.scalars[1], vector.scalars[2], vector.scalars[3]);
- #endif
- }
- inline U32x4 higherToU32(const U16x8& vector) {
- #if defined USE_BASIC_SIMD
- return U32x4(U16_HIGH_TO_U32_SIMD(vector.v));
- #else
- return U32x4(vector.scalars[4], vector.scalars[5], vector.scalars[6], vector.scalars[7]);
- #endif
- }
- inline U16x8 lowerToU16(const U8x16& vector) {
- #if defined USE_BASIC_SIMD
- return U16x8(U8_LOW_TO_U16_SIMD(vector.v));
- #else
- return U16x8(
- vector.scalars[0], vector.scalars[1], vector.scalars[2], vector.scalars[3],
- vector.scalars[4], vector.scalars[5], vector.scalars[6], vector.scalars[7]
- );
- #endif
- }
- inline U16x8 higherToU16(const U8x16& vector) {
- #if defined USE_BASIC_SIMD
- return U16x8(U8_HIGH_TO_U16_SIMD(vector.v));
- #else
- return U16x8(
- vector.scalars[8], vector.scalars[9], vector.scalars[10], vector.scalars[11],
- vector.scalars[12], vector.scalars[13], vector.scalars[14], vector.scalars[15]
- );
- #endif
- }
- // Saturated packing
- inline U8x16 saturateToU8(const U16x8& lower, const U16x8& upper) {
- #if defined USE_BASIC_SIMD
- return U8x16(PACK_SAT_U16_TO_U8_SIMD(lower.v, upper.v));
- #else
- return U8x16(
- impl_limit255(lower.scalars[0]),
- impl_limit255(lower.scalars[1]),
- impl_limit255(lower.scalars[2]),
- impl_limit255(lower.scalars[3]),
- impl_limit255(lower.scalars[4]),
- impl_limit255(lower.scalars[5]),
- impl_limit255(lower.scalars[6]),
- impl_limit255(lower.scalars[7]),
- impl_limit255(upper.scalars[0]),
- impl_limit255(upper.scalars[1]),
- impl_limit255(upper.scalars[2]),
- impl_limit255(upper.scalars[3]),
- impl_limit255(upper.scalars[4]),
- impl_limit255(upper.scalars[5]),
- impl_limit255(upper.scalars[6]),
- impl_limit255(upper.scalars[7])
- );
- #endif
- }
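- // Illustrative sketch (not part of the API): a common pattern is to widen 8-bit data to 16 bits,
- // do the arithmetic without overflow, and pack back with saturation. Here "pixels" is a hypothetical
- // U8x16, and a single-scalar U16x8 constructor is assumed.
- //   U16x8 low  = lowerToU16(pixels) * U16x8(uint16_t(3));
- //   U16x8 high = higherToU16(pixels) * U16x8(uint16_t(3));
- //   U8x16 scaled = saturateToU8(low, high);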
- // Unary negation for convenience and code readability.
- // Before using unary negation, check whether:
- // * the addition can be turned into a subtraction instead:
- //     x = -a + b  becomes  x = b - a
- // * a multiplying constant or scalar can be negated instead:
- //     x = -b * 2  becomes  x = b * -2
- inline F32x4 operator-(const F32x4& value) {
- #if defined USE_BASIC_SIMD
- return F32x4(0.0f) - value;
- #else
- return F32x4(-value.scalars[0], -value.scalars[1], -value.scalars[2], -value.scalars[3]);
- #endif
- }
- inline I32x4 operator-(const I32x4& value) {
- #if defined USE_BASIC_SIMD
- return I32x4(0) - value;
- #else
- return I32x4(-value.scalars[0], -value.scalars[1], -value.scalars[2], -value.scalars[3]);
- #endif
- }
- // Helper macros for generating the vector extract functions.
- // Having one function for each type and offset makes sure that the compiler gets an immediate integer within the valid range.
- #if defined USE_BASIC_SIMD
- #if defined USE_SSE2
- #if defined USE_SSSE3
- #define _MM_ALIGNR_EPI8(A, B, OFFSET) _mm_alignr_epi8(A, B, OFFSET)
- #else
- // If SSSE3 is not used, emulate it using stack memory and unaligned reading of data.
- static inline SIMD_U8x16 _MM_ALIGNR_EPI8(SIMD_U8x16 a, SIMD_U8x16 b, int offset) {
- ALIGN16 uint8_t vectorBuffer[32];
- _mm_store_si128((SIMD_U8x16*)(vectorBuffer), b);
- _mm_store_si128((SIMD_U8x16*)(vectorBuffer + 16), a);
- return _mm_loadu_si128((SIMD_U8x16*)(vectorBuffer + offset));
- }
- #endif
- #define VECTOR_EXTRACT_GENERATOR_U8(OFFSET, FALLBACK_RESULT) return U8x16(_MM_ALIGNR_EPI8(b.v, a.v, OFFSET));
- #define VECTOR_EXTRACT_GENERATOR_U16(OFFSET, FALLBACK_RESULT) return U16x8(_MM_ALIGNR_EPI8(b.v, a.v, OFFSET * 2));
- #define VECTOR_EXTRACT_GENERATOR_U32(OFFSET, FALLBACK_RESULT) return U32x4(_MM_ALIGNR_EPI8(b.v, a.v, OFFSET * 4));
- #define VECTOR_EXTRACT_GENERATOR_I32(OFFSET, FALLBACK_RESULT) return I32x4(_MM_ALIGNR_EPI8(b.v, a.v, OFFSET * 4));
- #define VECTOR_EXTRACT_GENERATOR_F32(OFFSET, FALLBACK_RESULT) return F32x4(SIMD_F32x4(_MM_ALIGNR_EPI8(SIMD_U32x4(b.v), SIMD_U32x4(a.v), OFFSET * 4)));
- #elif defined USE_NEON
- #define VECTOR_EXTRACT_GENERATOR_U8(OFFSET, FALLBACK_RESULT) return U8x16(vextq_u8(a.v, b.v, OFFSET));
- #define VECTOR_EXTRACT_GENERATOR_U16(OFFSET, FALLBACK_RESULT) return U16x8(vextq_u16(a.v, b.v, OFFSET));
- #define VECTOR_EXTRACT_GENERATOR_U32(OFFSET, FALLBACK_RESULT) return U32x4(vextq_u32(a.v, b.v, OFFSET));
- #define VECTOR_EXTRACT_GENERATOR_I32(OFFSET, FALLBACK_RESULT) return I32x4(vextq_s32(a.v, b.v, OFFSET));
- #define VECTOR_EXTRACT_GENERATOR_F32(OFFSET, FALLBACK_RESULT) return F32x4(vextq_f32(a.v, b.v, OFFSET));
- #endif
- #else
- #define VECTOR_EXTRACT_GENERATOR_U8(OFFSET, FALLBACK_RESULT) return FALLBACK_RESULT;
- #define VECTOR_EXTRACT_GENERATOR_U16(OFFSET, FALLBACK_RESULT) return FALLBACK_RESULT;
- #define VECTOR_EXTRACT_GENERATOR_U32(OFFSET, FALLBACK_RESULT) return FALLBACK_RESULT;
- #define VECTOR_EXTRACT_GENERATOR_I32(OFFSET, FALLBACK_RESULT) return FALLBACK_RESULT;
- #define VECTOR_EXTRACT_GENERATOR_F32(OFFSET, FALLBACK_RESULT) return FALLBACK_RESULT;
- #endif
- // Vector extraction concatenates two input vectors and reads a vector between them using an offset.
- // The first and last offsets that only return one of the inputs can be used for readability, because they will be inlined and removed by the compiler.
- // To get elements from the right side, combine the center vector with the right vector and shift one element to the left using vectorExtract_1 for the given type.
- // To get elements from the left side, combine the left vector with the center vector and shift one element to the right using vectorExtract_15 for 16 lanes, vectorExtract_7 for 8 lanes, or vectorExtract_3 for 4 lanes.
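- // Illustrative sketch (not part of the API): a 3-tap horizontal sum over a row of 16-bit values,
- // where left, center and right are three consecutive aligned U16x8 vectors loaded from the row.
- //   U16x8 before = vectorExtract_7(left, center);  // each lane's left neighbor
- //   U16x8 after  = vectorExtract_1(center, right); // each lane's right neighbor
- //   U16x8 sums   = before + center + after;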
- U8x16 inline vectorExtract_0(const U8x16 &a, const U8x16 &b) { return a; }
- U8x16 inline vectorExtract_1(const U8x16 &a, const U8x16 &b) { VECTOR_EXTRACT_GENERATOR_U8(1, U8x16(a.scalars[1], a.scalars[2], a.scalars[3], a.scalars[4], a.scalars[5], a.scalars[6], a.scalars[7], a.scalars[8], a.scalars[9], a.scalars[10], a.scalars[11], a.scalars[12], a.scalars[13], a.scalars[14], a.scalars[15], b.scalars[0])) }
- U8x16 inline vectorExtract_2(const U8x16 &a, const U8x16 &b) { VECTOR_EXTRACT_GENERATOR_U8(2, U8x16(a.scalars[2], a.scalars[3], a.scalars[4], a.scalars[5], a.scalars[6], a.scalars[7], a.scalars[8], a.scalars[9], a.scalars[10], a.scalars[11], a.scalars[12], a.scalars[13], a.scalars[14], a.scalars[15], b.scalars[0], b.scalars[1])) }
- U8x16 inline vectorExtract_3(const U8x16 &a, const U8x16 &b) { VECTOR_EXTRACT_GENERATOR_U8(3, U8x16(a.scalars[3], a.scalars[4], a.scalars[5], a.scalars[6], a.scalars[7], a.scalars[8], a.scalars[9], a.scalars[10], a.scalars[11], a.scalars[12], a.scalars[13], a.scalars[14], a.scalars[15], b.scalars[0], b.scalars[1], b.scalars[2])) }
- U8x16 inline vectorExtract_4(const U8x16 &a, const U8x16 &b) { VECTOR_EXTRACT_GENERATOR_U8(4, U8x16(a.scalars[4], a.scalars[5], a.scalars[6], a.scalars[7], a.scalars[8], a.scalars[9], a.scalars[10], a.scalars[11], a.scalars[12], a.scalars[13], a.scalars[14], a.scalars[15], b.scalars[0], b.scalars[1], b.scalars[2], b.scalars[3])) }
- U8x16 inline vectorExtract_5(const U8x16 &a, const U8x16 &b) { VECTOR_EXTRACT_GENERATOR_U8(5, U8x16(a.scalars[5], a.scalars[6], a.scalars[7], a.scalars[8], a.scalars[9], a.scalars[10], a.scalars[11], a.scalars[12], a.scalars[13], a.scalars[14], a.scalars[15], b.scalars[0], b.scalars[1], b.scalars[2], b.scalars[3], b.scalars[4])) }
- U8x16 inline vectorExtract_6(const U8x16 &a, const U8x16 &b) { VECTOR_EXTRACT_GENERATOR_U8(6, U8x16(a.scalars[6], a.scalars[7], a.scalars[8], a.scalars[9], a.scalars[10], a.scalars[11], a.scalars[12], a.scalars[13], a.scalars[14], a.scalars[15], b.scalars[0], b.scalars[1], b.scalars[2], b.scalars[3], b.scalars[4], b.scalars[5])) }
- U8x16 inline vectorExtract_7(const U8x16 &a, const U8x16 &b) { VECTOR_EXTRACT_GENERATOR_U8(7, U8x16(a.scalars[7], a.scalars[8], a.scalars[9], a.scalars[10], a.scalars[11], a.scalars[12], a.scalars[13], a.scalars[14], a.scalars[15], b.scalars[0], b.scalars[1], b.scalars[2], b.scalars[3], b.scalars[4], b.scalars[5], b.scalars[6])) }
- U8x16 inline vectorExtract_8(const U8x16 &a, const U8x16 &b) { VECTOR_EXTRACT_GENERATOR_U8(8, U8x16(a.scalars[8], a.scalars[9], a.scalars[10], a.scalars[11], a.scalars[12], a.scalars[13], a.scalars[14], a.scalars[15], b.scalars[0], b.scalars[1], b.scalars[2], b.scalars[3], b.scalars[4], b.scalars[5], b.scalars[6], b.scalars[7])) }
- U8x16 inline vectorExtract_9(const U8x16 &a, const U8x16 &b) { VECTOR_EXTRACT_GENERATOR_U8(9, U8x16(a.scalars[9], a.scalars[10], a.scalars[11], a.scalars[12], a.scalars[13], a.scalars[14], a.scalars[15], b.scalars[0], b.scalars[1], b.scalars[2], b.scalars[3], b.scalars[4], b.scalars[5], b.scalars[6], b.scalars[7], b.scalars[8])) }
- U8x16 inline vectorExtract_10(const U8x16 &a, const U8x16 &b) { VECTOR_EXTRACT_GENERATOR_U8(10, U8x16(a.scalars[10], a.scalars[11], a.scalars[12], a.scalars[13], a.scalars[14], a.scalars[15], b.scalars[0], b.scalars[1], b.scalars[2], b.scalars[3], b.scalars[4], b.scalars[5], b.scalars[6], b.scalars[7], b.scalars[8], b.scalars[9])) }
- U8x16 inline vectorExtract_11(const U8x16 &a, const U8x16 &b) { VECTOR_EXTRACT_GENERATOR_U8(11, U8x16(a.scalars[11], a.scalars[12], a.scalars[13], a.scalars[14], a.scalars[15], b.scalars[0], b.scalars[1], b.scalars[2], b.scalars[3], b.scalars[4], b.scalars[5], b.scalars[6], b.scalars[7], b.scalars[8], b.scalars[9], b.scalars[10])) }
- U8x16 inline vectorExtract_12(const U8x16 &a, const U8x16 &b) { VECTOR_EXTRACT_GENERATOR_U8(12, U8x16(a.scalars[12], a.scalars[13], a.scalars[14], a.scalars[15], b.scalars[0], b.scalars[1], b.scalars[2], b.scalars[3], b.scalars[4], b.scalars[5], b.scalars[6], b.scalars[7], b.scalars[8], b.scalars[9], b.scalars[10], b.scalars[11])) }
- U8x16 inline vectorExtract_13(const U8x16 &a, const U8x16 &b) { VECTOR_EXTRACT_GENERATOR_U8(13, U8x16(a.scalars[13], a.scalars[14], a.scalars[15], b.scalars[0], b.scalars[1], b.scalars[2], b.scalars[3], b.scalars[4], b.scalars[5], b.scalars[6], b.scalars[7], b.scalars[8], b.scalars[9], b.scalars[10], b.scalars[11], b.scalars[12])) }
- U8x16 inline vectorExtract_14(const U8x16 &a, const U8x16 &b) { VECTOR_EXTRACT_GENERATOR_U8(14, U8x16(a.scalars[14], a.scalars[15], b.scalars[0], b.scalars[1], b.scalars[2], b.scalars[3], b.scalars[4], b.scalars[5], b.scalars[6], b.scalars[7], b.scalars[8], b.scalars[9], b.scalars[10], b.scalars[11], b.scalars[12], b.scalars[13])) }
- U8x16 inline vectorExtract_15(const U8x16 &a, const U8x16 &b) { VECTOR_EXTRACT_GENERATOR_U8(15, U8x16(a.scalars[15], b.scalars[0], b.scalars[1], b.scalars[2], b.scalars[3], b.scalars[4], b.scalars[5], b.scalars[6], b.scalars[7], b.scalars[8], b.scalars[9], b.scalars[10], b.scalars[11], b.scalars[12], b.scalars[13], b.scalars[14])) }
- U8x16 inline vectorExtract_16(const U8x16 &a, const U8x16 &b) { return b; }
- U16x8 inline vectorExtract_0(const U16x8 &a, const U16x8 &b) { return a; }
- U16x8 inline vectorExtract_1(const U16x8 &a, const U16x8 &b) { VECTOR_EXTRACT_GENERATOR_U16(1, U16x8(a.scalars[1], a.scalars[2], a.scalars[3], a.scalars[4], a.scalars[5], a.scalars[6], a.scalars[7], b.scalars[0])) }
- U16x8 inline vectorExtract_2(const U16x8 &a, const U16x8 &b) { VECTOR_EXTRACT_GENERATOR_U16(2, U16x8(a.scalars[2], a.scalars[3], a.scalars[4], a.scalars[5], a.scalars[6], a.scalars[7], b.scalars[0], b.scalars[1])) }
- U16x8 inline vectorExtract_3(const U16x8 &a, const U16x8 &b) { VECTOR_EXTRACT_GENERATOR_U16(3, U16x8(a.scalars[3], a.scalars[4], a.scalars[5], a.scalars[6], a.scalars[7], b.scalars[0], b.scalars[1], b.scalars[2])) }
- U16x8 inline vectorExtract_4(const U16x8 &a, const U16x8 &b) { VECTOR_EXTRACT_GENERATOR_U16(4, U16x8(a.scalars[4], a.scalars[5], a.scalars[6], a.scalars[7], b.scalars[0], b.scalars[1], b.scalars[2], b.scalars[3])) }
- U16x8 inline vectorExtract_5(const U16x8 &a, const U16x8 &b) { VECTOR_EXTRACT_GENERATOR_U16(5, U16x8(a.scalars[5], a.scalars[6], a.scalars[7], b.scalars[0], b.scalars[1], b.scalars[2], b.scalars[3], b.scalars[4])) }
- U16x8 inline vectorExtract_6(const U16x8 &a, const U16x8 &b) { VECTOR_EXTRACT_GENERATOR_U16(6, U16x8(a.scalars[6], a.scalars[7], b.scalars[0], b.scalars[1], b.scalars[2], b.scalars[3], b.scalars[4], b.scalars[5])) }
- U16x8 inline vectorExtract_7(const U16x8 &a, const U16x8 &b) { VECTOR_EXTRACT_GENERATOR_U16(7, U16x8(a.scalars[7], b.scalars[0], b.scalars[1], b.scalars[2], b.scalars[3], b.scalars[4], b.scalars[5], b.scalars[6])) }
- U16x8 inline vectorExtract_8(const U16x8 &a, const U16x8 &b) { return b; }
- U32x4 inline vectorExtract_0(const U32x4 &a, const U32x4 &b) { return a; }
- U32x4 inline vectorExtract_1(const U32x4 &a, const U32x4 &b) { VECTOR_EXTRACT_GENERATOR_U32(1, U32x4(a.scalars[1], a.scalars[2], a.scalars[3], b.scalars[0])) }
- U32x4 inline vectorExtract_2(const U32x4 &a, const U32x4 &b) { VECTOR_EXTRACT_GENERATOR_U32(2, U32x4(a.scalars[2], a.scalars[3], b.scalars[0], b.scalars[1])) }
- U32x4 inline vectorExtract_3(const U32x4 &a, const U32x4 &b) { VECTOR_EXTRACT_GENERATOR_U32(3, U32x4(a.scalars[3], b.scalars[0], b.scalars[1], b.scalars[2])) }
- U32x4 inline vectorExtract_4(const U32x4 &a, const U32x4 &b) { return b; }
- I32x4 inline vectorExtract_0(const I32x4 &a, const I32x4 &b) { return a; }
- I32x4 inline vectorExtract_1(const I32x4 &a, const I32x4 &b) { VECTOR_EXTRACT_GENERATOR_I32(1, I32x4(a.scalars[1], a.scalars[2], a.scalars[3], b.scalars[0])) }
- I32x4 inline vectorExtract_2(const I32x4 &a, const I32x4 &b) { VECTOR_EXTRACT_GENERATOR_I32(2, I32x4(a.scalars[2], a.scalars[3], b.scalars[0], b.scalars[1])) }
- I32x4 inline vectorExtract_3(const I32x4 &a, const I32x4 &b) { VECTOR_EXTRACT_GENERATOR_I32(3, I32x4(a.scalars[3], b.scalars[0], b.scalars[1], b.scalars[2])) }
- I32x4 inline vectorExtract_4(const I32x4 &a, const I32x4 &b) { return b; }
- F32x4 inline vectorExtract_0(const F32x4 &a, const F32x4 &b) { return a; }
- F32x4 inline vectorExtract_1(const F32x4 &a, const F32x4 &b) { VECTOR_EXTRACT_GENERATOR_F32(1, F32x4(a.scalars[1], a.scalars[2], a.scalars[3], b.scalars[0])) }
- F32x4 inline vectorExtract_2(const F32x4 &a, const F32x4 &b) { VECTOR_EXTRACT_GENERATOR_F32(2, F32x4(a.scalars[2], a.scalars[3], b.scalars[0], b.scalars[1])) }
- F32x4 inline vectorExtract_3(const F32x4 &a, const F32x4 &b) { VECTOR_EXTRACT_GENERATOR_F32(3, F32x4(a.scalars[3], b.scalars[0], b.scalars[1], b.scalars[2])) }
- F32x4 inline vectorExtract_4(const F32x4 &a, const F32x4 &b) { return b; }
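- // Usage sketch (not part of the original header, variable names are only examples): a three-tap average
- // over consecutive float lanes, where previousVector, centerVector and nextVector are three adjacent
- // F32x4 loads from the same buffer. vectorExtract_3 shifts one element to the right and vectorExtract_1
- // shifts one element to the left, as described above.
- //   F32x4 leftNeighbor = vectorExtract_3(previousVector, centerVector);
- //   F32x4 rightNeighbor = vectorExtract_1(centerVector, nextVector);
- //   F32x4 average = (leftNeighbor + centerVector + rightNeighbor) * F32x4(1.0f / 3.0f);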
- // Gather instructions load memory from a pointer at multiple index offsets at the same time.
- // The given pointers should be aligned to 4 bytes, so that the fallback solution works on machines with strict alignment requirements.
- // A usage sketch follows the gather functions below.
- #if defined USE_AVX2
- #define GATHER_I32x4_AVX2(SOURCE, FOUR_OFFSETS, SCALE) _mm_i32gather_epi32((const int32_t*)(SOURCE), FOUR_OFFSETS, SCALE)
- #define GATHER_U32x4_AVX2(SOURCE, FOUR_OFFSETS, SCALE) _mm_i32gather_epi32((const int32_t*)(SOURCE), FOUR_OFFSETS, SCALE)
- #define GATHER_F32x4_AVX2(SOURCE, FOUR_OFFSETS, SCALE) _mm_i32gather_ps((const float*)(SOURCE), FOUR_OFFSETS, SCALE)
- #endif
- static inline U32x4 gather(const dsr::SafePointer<uint32_t> data, const U32x4 &elementOffset) {
- #if defined USE_AVX2
- // TODO: Implement safety checks for debug mode.
- return U32x4(GATHER_U32x4_AVX2(data.getUnsafe(), elementOffset.v, 4));
- #else
- ALIGN16 uint32_t elementOffsets[4];
- elementOffset.writeAlignedUnsafe(elementOffsets);
- return U32x4(
- *(data + elementOffsets[0]),
- *(data + elementOffsets[1]),
- *(data + elementOffsets[2]),
- *(data + elementOffsets[3])
- );
- #endif
- }
- static inline I32x4 gather(const dsr::SafePointer<int32_t> data, const U32x4 &elementOffset) {
- #if defined USE_AVX2
- // TODO: Implement safety checks for debug mode.
- return I32x4(GATHER_I32x4_AVX2(data.getUnsafe(), elementOffset.v, 4));
- #else
- ALIGN16 uint32_t elementOffsets[4];
- elementOffset.writeAlignedUnsafe(elementOffsets);
- return I32x4(
- *(data + elementOffsets[0]),
- *(data + elementOffsets[1]),
- *(data + elementOffsets[2]),
- *(data + elementOffsets[3])
- );
- #endif
- }
- static inline F32x4 gather(const dsr::SafePointer<float> data, const U32x4 &elementOffset) {
- #if defined USE_AVX2
- // TODO: Implement safety checks for debug mode.
- return F32x4(GATHER_F32x4_AVX2(data.getUnsafe(), elementOffset.v, 4));
- #else
- ALIGN16 uint32_t elementOffsets[4];
- elementOffset.writeAlignedUnsafe(elementOffsets);
- return F32x4(
- *(data + elementOffsets[0]),
- *(data + elementOffsets[1]),
- *(data + elementOffsets[2]),
- *(data + elementOffsets[3])
- );
- #endif
- }
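- // Usage sketch (not part of the original header, tablePointer is an assumed dsr::SafePointer<uint32_t>):
- // gathering four lookup table entries selected by per-lane indices in one call.
- //   U32x4 indices(2u, 7u, 7u, 0u);
- //   U32x4 colors = gather(tablePointer, indices); // colors = {table[2], table[7], table[7], table[0]}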
- inline F32x8 operator+(const F32x8& left, const F32x8& right) {
- #if defined USE_256BIT_F_SIMD
- return F32x8(ADD_F32_SIMD256(left.v, right.v));
- #else
- return F32x8(
- left.scalars[0] + right.scalars[0],
- left.scalars[1] + right.scalars[1],
- left.scalars[2] + right.scalars[2],
- left.scalars[3] + right.scalars[3],
- left.scalars[4] + right.scalars[4],
- left.scalars[5] + right.scalars[5],
- left.scalars[6] + right.scalars[6],
- left.scalars[7] + right.scalars[7]
- );
- #endif
- }
- inline F32x8 operator-(const F32x8& left, const F32x8& right) {
- #if defined USE_256BIT_F_SIMD
- return F32x8(SUB_F32_SIMD256(left.v, right.v));
- #else
- return F32x8(
- left.scalars[0] - right.scalars[0],
- left.scalars[1] - right.scalars[1],
- left.scalars[2] - right.scalars[2],
- left.scalars[3] - right.scalars[3],
- left.scalars[4] - right.scalars[4],
- left.scalars[5] - right.scalars[5],
- left.scalars[6] - right.scalars[6],
- left.scalars[7] - right.scalars[7]
- );
- #endif
- }
- inline F32x8 operator*(const F32x8& left, const F32x8& right) {
- #if defined USE_256BIT_F_SIMD
- return F32x8(MUL_F32_SIMD256(left.v, right.v));
- #else
- return F32x8(
- left.scalars[0] * right.scalars[0],
- left.scalars[1] * right.scalars[1],
- left.scalars[2] * right.scalars[2],
- left.scalars[3] * right.scalars[3],
- left.scalars[4] * right.scalars[4],
- left.scalars[5] * right.scalars[5],
- left.scalars[6] * right.scalars[6],
- left.scalars[7] * right.scalars[7]
- );
- #endif
- }
- inline F32x8 min(const F32x8& left, const F32x8& right) {
- #if defined USE_256BIT_F_SIMD
- return F32x8(MIN_F32_SIMD256(left.v, right.v));
- #else
- float v0 = left.scalars[0];
- float v1 = left.scalars[1];
- float v2 = left.scalars[2];
- float v3 = left.scalars[3];
- float v4 = left.scalars[4];
- float v5 = left.scalars[5];
- float v6 = left.scalars[6];
- float v7 = left.scalars[7];
- float r0 = right.scalars[0];
- float r1 = right.scalars[1];
- float r2 = right.scalars[2];
- float r3 = right.scalars[3];
- float r4 = right.scalars[4];
- float r5 = right.scalars[5];
- float r6 = right.scalars[6];
- float r7 = right.scalars[7];
- if (r0 < v0) { v0 = r0; }
- if (r1 < v1) { v1 = r1; }
- if (r2 < v2) { v2 = r2; }
- if (r3 < v3) { v3 = r3; }
- if (r4 < v4) { v4 = r4; }
- if (r5 < v5) { v5 = r5; }
- if (r6 < v6) { v6 = r6; }
- if (r7 < v7) { v7 = r7; }
- return F32x8(v0, v1, v2, v3, v4, v5, v6, v7);
- #endif
- }
- inline F32x8 max(const F32x8& left, const F32x8& right) {
- #if defined USE_256BIT_F_SIMD
- return F32x8(MAX_F32_SIMD256(left.v, right.v));
- #else
- float v0 = left.scalars[0];
- float v1 = left.scalars[1];
- float v2 = left.scalars[2];
- float v3 = left.scalars[3];
- float v4 = left.scalars[4];
- float v5 = left.scalars[5];
- float v6 = left.scalars[6];
- float v7 = left.scalars[7];
- float r0 = right.scalars[0];
- float r1 = right.scalars[1];
- float r2 = right.scalars[2];
- float r3 = right.scalars[3];
- float r4 = right.scalars[4];
- float r5 = right.scalars[5];
- float r6 = right.scalars[6];
- float r7 = right.scalars[7];
- if (r0 > v0) { v0 = r0; }
- if (r1 > v1) { v1 = r1; }
- if (r2 > v2) { v2 = r2; }
- if (r3 > v3) { v3 = r3; }
- if (r4 > v4) { v4 = r4; }
- if (r5 > v5) { v5 = r5; }
- if (r6 > v6) { v6 = r6; }
- if (r7 > v7) { v7 = r7; }
- return F32x8(v0, v1, v2, v3, v4, v5, v6, v7);
- #endif
- }
- inline I32x8 operator+(const I32x8& left, const I32x8& right) {
- #if defined USE_256BIT_X_SIMD
- return I32x8(ADD_I32_SIMD256(left.v, right.v));
- #else
- return I32x8(
- left.scalars[0] + right.scalars[0],
- left.scalars[1] + right.scalars[1],
- left.scalars[2] + right.scalars[2],
- left.scalars[3] + right.scalars[3],
- left.scalars[4] + right.scalars[4],
- left.scalars[5] + right.scalars[5],
- left.scalars[6] + right.scalars[6],
- left.scalars[7] + right.scalars[7]);
- #endif
- }
- inline I32x8 operator-(const I32x8& left, const I32x8& right) {
- #if defined USE_256BIT_X_SIMD
- return I32x8(SUB_I32_SIMD256(left.v, right.v));
- #else
- return I32x8(
- left.scalars[0] - right.scalars[0],
- left.scalars[1] - right.scalars[1],
- left.scalars[2] - right.scalars[2],
- left.scalars[3] - right.scalars[3],
- left.scalars[4] - right.scalars[4],
- left.scalars[5] - right.scalars[5],
- left.scalars[6] - right.scalars[6],
- left.scalars[7] - right.scalars[7]);
- #endif
- }
- inline I32x8 operator*(const I32x8& left, const I32x8& right) {
- #if defined USE_AVX2
- return I32x8(MUL_I32_SIMD256(left.v, right.v));
- #else
- return I32x8(
- left.scalars[0] * right.scalars[0],
- left.scalars[1] * right.scalars[1],
- left.scalars[2] * right.scalars[2],
- left.scalars[3] * right.scalars[3],
- left.scalars[4] * right.scalars[4],
- left.scalars[5] * right.scalars[5],
- left.scalars[6] * right.scalars[6],
- left.scalars[7] * right.scalars[7]
- );
- #endif
- }
- inline U32x8 operator+(const U32x8& left, const U32x8& right) {
- #if defined USE_256BIT_X_SIMD
- return U32x8(ADD_U32_SIMD256(left.v, right.v));
- #else
- return U32x8(
- left.scalars[0] + right.scalars[0],
- left.scalars[1] + right.scalars[1],
- left.scalars[2] + right.scalars[2],
- left.scalars[3] + right.scalars[3],
- left.scalars[4] + right.scalars[4],
- left.scalars[5] + right.scalars[5],
- left.scalars[6] + right.scalars[6],
- left.scalars[7] + right.scalars[7]
- );
- #endif
- }
- inline U32x8 operator-(const U32x8& left, const U32x8& right) {
- #if defined USE_256BIT_X_SIMD
- return U32x8(SUB_U32_SIMD256(left.v, right.v));
- #else
- return U32x8(
- left.scalars[0] - right.scalars[0],
- left.scalars[1] - right.scalars[1],
- left.scalars[2] - right.scalars[2],
- left.scalars[3] - right.scalars[3],
- left.scalars[4] - right.scalars[4],
- left.scalars[5] - right.scalars[5],
- left.scalars[6] - right.scalars[6],
- left.scalars[7] - right.scalars[7]
- );
- #endif
- }
- inline U32x8 operator*(const U32x8& left, const U32x8& right) {
- #if defined USE_AVX2
- return U32x8(MUL_U32_SIMD256(left.v, right.v));
- #else
- return U32x8(
- left.scalars[0] * right.scalars[0],
- left.scalars[1] * right.scalars[1],
- left.scalars[2] * right.scalars[2],
- left.scalars[3] * right.scalars[3],
- left.scalars[4] * right.scalars[4],
- left.scalars[5] * right.scalars[5],
- left.scalars[6] * right.scalars[6],
- left.scalars[7] * right.scalars[7]
- );
- #endif
- }
- inline U32x8 operator&(const U32x8& left, const U32x8& right) {
- #if defined USE_256BIT_X_SIMD
- return U32x8(BITWISE_AND_U32_SIMD256(left.v, right.v));
- #else
- return U32x8(
- left.scalars[0] & right.scalars[0],
- left.scalars[1] & right.scalars[1],
- left.scalars[2] & right.scalars[2],
- left.scalars[3] & right.scalars[3],
- left.scalars[4] & right.scalars[4],
- left.scalars[5] & right.scalars[5],
- left.scalars[6] & right.scalars[6],
- left.scalars[7] & right.scalars[7]
- );
- #endif
- }
- inline U32x8 operator|(const U32x8& left, const U32x8& right) {
- #if defined USE_256BIT_X_SIMD
- return U32x8(BITWISE_OR_U32_SIMD256(left.v, right.v));
- #else
- return U32x8(
- left.scalars[0] | right.scalars[0],
- left.scalars[1] | right.scalars[1],
- left.scalars[2] | right.scalars[2],
- left.scalars[3] | right.scalars[3],
- left.scalars[4] | right.scalars[4],
- left.scalars[5] | right.scalars[5],
- left.scalars[6] | right.scalars[6],
- left.scalars[7] | right.scalars[7]
- );
- #endif
- }
- inline U32x8 operator^(const U32x8& left, const U32x8& right) {
- #if defined USE_256BIT_X_SIMD
- return U32x8(BITWISE_XOR_U32_SIMD256(left.v, right.v));
- #else
- return U32x8(
- left.scalars[0] ^ right.scalars[0],
- left.scalars[1] ^ right.scalars[1],
- left.scalars[2] ^ right.scalars[2],
- left.scalars[3] ^ right.scalars[3],
- left.scalars[4] ^ right.scalars[4],
- left.scalars[5] ^ right.scalars[5],
- left.scalars[6] ^ right.scalars[6],
- left.scalars[7] ^ right.scalars[7]
- );
- #endif
- }
- inline U32x8 operator<<(const U32x8& left, uint32_t bitOffset) {
- #if defined USE_AVX2
- return U32x8(_mm256_slli_epi32(left.v, bitOffset));
- #else
- return U32x8(
- left.scalars[0] << bitOffset,
- left.scalars[1] << bitOffset,
- left.scalars[2] << bitOffset,
- left.scalars[3] << bitOffset,
- left.scalars[4] << bitOffset,
- left.scalars[5] << bitOffset,
- left.scalars[6] << bitOffset,
- left.scalars[7] << bitOffset
- );
- #endif
- }
- inline U32x8 operator>>(const U32x8& left, uint32_t bitOffset) {
- #if defined USE_AVX2
- return U32x8(_mm256_srli_epi32(left.v, bitOffset));
- #else
- return U32x8(
- left.scalars[0] >> bitOffset,
- left.scalars[1] >> bitOffset,
- left.scalars[2] >> bitOffset,
- left.scalars[3] >> bitOffset,
- left.scalars[4] >> bitOffset,
- left.scalars[5] >> bitOffset,
- left.scalars[6] >> bitOffset,
- left.scalars[7] >> bitOffset
- );
- #endif
- }
- inline U16x16 operator+(const U16x16& left, const U16x16& right) {
- #if defined USE_256BIT_X_SIMD
- return U16x16(ADD_U16_SIMD256(left.v, right.v));
- #else
- return U16x16(
- left.scalars[0] + right.scalars[0],
- left.scalars[1] + right.scalars[1],
- left.scalars[2] + right.scalars[2],
- left.scalars[3] + right.scalars[3],
- left.scalars[4] + right.scalars[4],
- left.scalars[5] + right.scalars[5],
- left.scalars[6] + right.scalars[6],
- left.scalars[7] + right.scalars[7],
- left.scalars[8] + right.scalars[8],
- left.scalars[9] + right.scalars[9],
- left.scalars[10] + right.scalars[10],
- left.scalars[11] + right.scalars[11],
- left.scalars[12] + right.scalars[12],
- left.scalars[13] + right.scalars[13],
- left.scalars[14] + right.scalars[14],
- left.scalars[15] + right.scalars[15]
- );
- #endif
- }
- inline U16x16 operator-(const U16x16& left, const U16x16& right) {
- #if defined USE_256BIT_X_SIMD
- return U16x16(SUB_U16_SIMD256(left.v, right.v));
- #else
- return U16x16(
- left.scalars[0] - right.scalars[0],
- left.scalars[1] - right.scalars[1],
- left.scalars[2] - right.scalars[2],
- left.scalars[3] - right.scalars[3],
- left.scalars[4] - right.scalars[4],
- left.scalars[5] - right.scalars[5],
- left.scalars[6] - right.scalars[6],
- left.scalars[7] - right.scalars[7],
- left.scalars[8] - right.scalars[8],
- left.scalars[9] - right.scalars[9],
- left.scalars[10] - right.scalars[10],
- left.scalars[11] - right.scalars[11],
- left.scalars[12] - right.scalars[12],
- left.scalars[13] - right.scalars[13],
- left.scalars[14] - right.scalars[14],
- left.scalars[15] - right.scalars[15]
- );
- #endif
- }
- inline U16x16 operator*(const U16x16& left, const U16x16& right) {
- #if defined USE_256BIT_X_SIMD
- return U16x16(MUL_U16_SIMD256(left.v, right.v));
- #else
- return U16x16(
- left.scalars[0] * right.scalars[0],
- left.scalars[1] * right.scalars[1],
- left.scalars[2] * right.scalars[2],
- left.scalars[3] * right.scalars[3],
- left.scalars[4] * right.scalars[4],
- left.scalars[5] * right.scalars[5],
- left.scalars[6] * right.scalars[6],
- left.scalars[7] * right.scalars[7],
- left.scalars[8] * right.scalars[8],
- left.scalars[9] * right.scalars[9],
- left.scalars[10] * right.scalars[10],
- left.scalars[11] * right.scalars[11],
- left.scalars[12] * right.scalars[12],
- left.scalars[13] * right.scalars[13],
- left.scalars[14] * right.scalars[14],
- left.scalars[15] * right.scalars[15]
- );
- #endif
- }
- inline U8x32 operator+(const U8x32& left, const U8x32& right) {
- #if defined USE_256BIT_X_SIMD
- return U8x32(ADD_U8_SIMD256(left.v, right.v));
- #else
- U8x32 result = U8x32::create_dangerous_uninitialized();
- for (int i = 0; i < 32; i++) {
- result.scalars[i] = left.scalars[i] + right.scalars[i];
- }
- return result;
- #endif
- }
- inline U8x32 operator-(const U8x32& left, const U8x32& right) {
- #if defined USE_256BIT_X_SIMD
- return U8x32(SUB_U8_SIMD256(left.v, right.v));
- #else
- U8x32 result = U8x32::create_dangerous_uninitialized();
- for (int i = 0; i < 32; i++) {
- result.scalars[i] = left.scalars[i] - right.scalars[i];
- }
- return result;
- #endif
- }
- inline U8x32 saturatedAddition(const U8x32& left, const U8x32& right) {
- #if defined USE_256BIT_X_SIMD
- return U8x32(ADD_SAT_U8_SIMD256(left.v, right.v));
- #else
- U8x32 result = U8x32::create_dangerous_uninitialized();
- for (int i = 0; i < 32; i++) {
- result.scalars[i] = impl_limit255((uint32_t)left.scalars[i] + (uint32_t)right.scalars[i]);
- }
- return result;
- #endif
- }
- inline U8x32 saturatedSubtraction(const U8x32& left, const U8x32& right) {
- #if defined USE_256BIT_X_SIMD
- return U8x32(SUB_SAT_U8_SIMD256(left.v, right.v));
- #else
- U8x32 result = U8x32::create_dangerous_uninitialized();
- for (int i = 0; i < 32; i++) {
- result.scalars[i] = impl_limit0((int32_t)left.scalars[i] - (int32_t)right.scalars[i]);
- }
- return result;
- #endif
- }
- inline I32x8 truncateToI32(const F32x8& vector) {
- #if defined USE_256BIT_X_SIMD
- return I32x8(F32_TO_I32_SIMD256(vector.v));
- #else
- return I32x8(
- (int32_t)vector.scalars[0], (int32_t)vector.scalars[1], (int32_t)vector.scalars[2], (int32_t)vector.scalars[3],
- (int32_t)vector.scalars[4], (int32_t)vector.scalars[5], (int32_t)vector.scalars[6], (int32_t)vector.scalars[7]
- );
- #endif
- }
- inline U32x8 truncateToU32(const F32x8& vector) {
- #if defined USE_256BIT_X_SIMD
- return U32x8(F32_TO_U32_SIMD256(vector.v));
- #else
- return U32x8(
- (uint32_t)vector.scalars[0], (uint32_t)vector.scalars[1], (uint32_t)vector.scalars[2], (uint32_t)vector.scalars[3],
- (uint32_t)vector.scalars[4], (uint32_t)vector.scalars[5], (uint32_t)vector.scalars[6], (uint32_t)vector.scalars[7]
- );
- #endif
- }
- inline F32x8 floatFromI32(const I32x8& vector) {
- #if defined USE_256BIT_X_SIMD
- return F32x8(I32_TO_F32_SIMD256(vector.v));
- #else
- return F32x8(
- (float)vector.scalars[0], (float)vector.scalars[1], (float)vector.scalars[2], (float)vector.scalars[3],
- (float)vector.scalars[4], (float)vector.scalars[5], (float)vector.scalars[6], (float)vector.scalars[7]
- );
- #endif
- }
- inline F32x8 floatFromU32(const U32x8& vector) {
- #if defined USE_256BIT_X_SIMD
- return F32x8(U32_TO_F32_SIMD256(vector.v));
- #else
- return F32x8(
- (float)vector.scalars[0], (float)vector.scalars[1], (float)vector.scalars[2], (float)vector.scalars[3],
- (float)vector.scalars[4], (float)vector.scalars[5], (float)vector.scalars[6], (float)vector.scalars[7]
- );
- #endif
- }
- inline I32x8 I32FromU32(const U32x8& vector) {
- #if defined USE_256BIT_X_SIMD
- return I32x8(REINTERPRET_U32_TO_I32_SIMD256(vector.v));
- #else
- return I32x8(
- (int32_t)vector.scalars[0], (int32_t)vector.scalars[1], (int32_t)vector.scalars[2], (int32_t)vector.scalars[3],
- (int32_t)vector.scalars[4], (int32_t)vector.scalars[5], (int32_t)vector.scalars[6], (int32_t)vector.scalars[7]
- );
- #endif
- }
- inline U32x8 U32FromI32(const I32x8& vector) {
- #if defined USE_256BIT_X_SIMD
- return U32x8(REINTERPRET_I32_TO_U32_SIMD256(vector.v));
- #else
- return U32x8(
- (uint32_t)vector.scalars[0], (uint32_t)vector.scalars[1], (uint32_t)vector.scalars[2], (uint32_t)vector.scalars[3],
- (uint32_t)vector.scalars[4], (uint32_t)vector.scalars[5], (uint32_t)vector.scalars[6], (uint32_t)vector.scalars[7]
- );
- #endif
- }
- // Warning! Behavior depends on endianness.
- inline U8x32 reinterpret_U8FromU32(const U32x8& vector) {
- #if defined USE_256BIT_X_SIMD
- return U8x32(REINTERPRET_U32_TO_U8_SIMD256(vector.v));
- #else
- const uint8_t *source = (const uint8_t*)vector.scalars;
- U8x32 result = U8x32::create_dangerous_uninitialized();
- for (int i = 0; i < 32; i++) {
- result.scalars[i] = source[i];
- }
- return result;
- #endif
- }
- // Warning! Behavior depends on endianness.
- inline U32x8 reinterpret_U32FromU8(const U8x32& vector) {
- #if defined USE_256BIT_X_SIMD
- return U32x8(REINTERPRET_U8_TO_U32_SIMD256(vector.v));
- #else
- const uint32_t *source = (const uint32_t*)vector.scalars;
- return U32x8(source[0], source[1], source[2], source[3], source[4], source[5], source[6], source[7]);
- #endif
- }
- // Unpacking to larger integers
- inline U32x8 lowerToU32(const U16x16& vector) {
- #if defined USE_256BIT_X_SIMD
- return U32x8(U16_LOW_TO_U32_SIMD256(vector.v));
- #else
- return U32x8(vector.scalars[0], vector.scalars[1], vector.scalars[2], vector.scalars[3], vector.scalars[4], vector.scalars[5], vector.scalars[6], vector.scalars[7]);
- #endif
- }
- inline U32x8 higherToU32(const U16x16& vector) {
- #if defined USE_256BIT_X_SIMD
- return U32x8(U16_HIGH_TO_U32_SIMD256(vector.v));
- #else
- return U32x8(vector.scalars[8], vector.scalars[9], vector.scalars[10], vector.scalars[11], vector.scalars[12], vector.scalars[13], vector.scalars[14], vector.scalars[15]);
- #endif
- }
- inline U16x16 lowerToU16(const U8x32& vector) {
- #if defined USE_256BIT_X_SIMD
- return U16x16(U8_LOW_TO_U16_SIMD256(vector.v));
- #else
- return U16x16(
- vector.scalars[0], vector.scalars[1], vector.scalars[2], vector.scalars[3],
- vector.scalars[4], vector.scalars[5], vector.scalars[6], vector.scalars[7],
- vector.scalars[8], vector.scalars[9], vector.scalars[10], vector.scalars[11],
- vector.scalars[12], vector.scalars[13], vector.scalars[14], vector.scalars[15]
- );
- #endif
- }
- inline U16x16 higherToU16(const U8x32& vector) {
- #if defined USE_256BIT_X_SIMD
- return U16x16(U8_HIGH_TO_U16_SIMD256(vector.v));
- #else
- return U16x16(
- vector.scalars[16], vector.scalars[17], vector.scalars[18], vector.scalars[19],
- vector.scalars[20], vector.scalars[21], vector.scalars[22], vector.scalars[23],
- vector.scalars[24], vector.scalars[25], vector.scalars[26], vector.scalars[27],
- vector.scalars[28], vector.scalars[29], vector.scalars[30], vector.scalars[31]
- );
- #endif
- }
- // Saturated packing
- inline U8x32 saturateToU8(const U16x16& lower, const U16x16& upper) {
- #if defined USE_256BIT_X_SIMD
- return U8x32(PACK_SAT_U16_TO_U8_SIMD256(lower.v, upper.v));
- #else
- U8x32 result = U8x32::create_dangerous_uninitialized();
- for (int i = 0; i < 16; i++) {
- result.scalars[i] = impl_limit255(lower.scalars[i]);
- }
- for (int i = 0; i < 16; i++) {
- result.scalars[i + 16] = impl_limit255(upper.scalars[i]);
- }
- return result;
- #endif
- }
- // Unary negation for convenience and code readability.
- // Before using unary negation, always check if:
- // * An addition can be turned into a subtraction?
- // x = -a + b
- // x = b - a
- // * A multiplying constant or scalar can be negated instead?
- // x = -b * 2
- // x = b * -2
- inline F32x8 operator-(const F32x8& value) {
- #if defined USE_256BIT_F_SIMD
- return F32x8(0.0f) - value;
- #else
- return F32x8(
- -value.scalars[0], -value.scalars[1], -value.scalars[2], -value.scalars[3],
- -value.scalars[4], -value.scalars[5], -value.scalars[6], -value.scalars[7]
- );
- #endif
- }
- inline I32x8 operator-(const I32x8& value) {
- #if defined USE_256BIT_X_SIMD
- return I32x8(0) - value;
- #else
- return I32x8(
- -value.scalars[0], -value.scalars[1], -value.scalars[2], -value.scalars[3],
- -value.scalars[4], -value.scalars[5], -value.scalars[6], -value.scalars[7]
- );
- #endif
- }
- // Helper macros for generating the vector extract functions.
- // Having one function for each type and offset makes sure that the compiler gets an immediate integer within the valid range.
- #if defined USE_AVX2
- // AVX2 does not offer any 256-bit element extraction, only two 128-bit shifts done in parallel, so we might as well use two separate 128-bit extractions.
- template <int OFFSET>
- __m256i impl_extractBytes_AVX2(const __m256i leftInput, const __m256i rightInput) {
- // Extract the three 128-bit halves of the two inputs that can overlap with the selected offset.
- __m128i leftPart = _mm256_extractf128_si256(leftInput, (OFFSET < 16) ? 0 : 1);
- __m128i middlePart = _mm256_extractf128_si256(OFFSET < 16 ? rightInput : leftInput, (OFFSET < 16) ? 1 : 0);
- __m128i rightPart = _mm256_extractf128_si256(rightInput, (OFFSET < 16) ? 0 : 1);
- // Combine two 128-bit extracts into a whole 256-bit extract.
- return _mm256_set_m128i(
- _mm_alignr_epi8(leftPart, middlePart, OFFSET - ((OFFSET < 16) ? 0 : 16)),
- _mm_alignr_epi8(middlePart, rightPart, OFFSET - ((OFFSET < 16) ? 0 : 16))
- );
- }
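- // Example of the recombination for OFFSET = 4 with byte lanes, where the generator macros below pass
- // leftInput = b and rightInput = a for vectorExtract_4(a, b):
- //   low half  = alignr(a[16..31], a[0..15], 4) -> bytes a[4..19]
- //   high half = alignr(b[0..15], a[16..31], 4) -> bytes a[20..31], b[0..3]
- // Concatenated, this gives a[4..31] followed by b[0..3], which is the expected extraction result.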
- #define VECTOR_EXTRACT_GENERATOR_256_U8(OFFSET) return U8x32(impl_extractBytes_AVX2<OFFSET>(b.v, a.v));
- #define VECTOR_EXTRACT_GENERATOR_256_U16(OFFSET) return U16x16(impl_extractBytes_AVX2<OFFSET * 2>(b.v, a.v));
- #define VECTOR_EXTRACT_GENERATOR_256_U32(OFFSET) return U32x8(impl_extractBytes_AVX2<OFFSET * 4>(b.v, a.v));
- #define VECTOR_EXTRACT_GENERATOR_256_I32(OFFSET) return I32x8(impl_extractBytes_AVX2<OFFSET * 4>(b.v, a.v));
- #define VECTOR_EXTRACT_GENERATOR_256_F32(OFFSET) return F32x8(SIMD_F32x8(impl_extractBytes_AVX2<OFFSET * 4>(SIMD_U32x8(b.v), SIMD_U32x8(a.v))));
- #else
- template<typename T, int elementCount>
- T vectorExtract_emulated(const T &a, const T &b, int offset) {
- // For safety reasons, uninitialized default construction of SIMD vectors is only available during full emulation, not partial AVX support.
- T result = T::create_dangerous_uninitialized();
- int t = 0;
- for (int s = offset; s < elementCount; s++) {
- result.scalars[t] = a.scalars[s];
- t++;
- }
- for (int s = 0; s < offset; s++) {
- result.scalars[t] = b.scalars[s];
- t++;
- }
- return result;
- }
- #define VECTOR_EXTRACT_GENERATOR_256_U8(OFFSET) return vectorExtract_emulated<U8x32, 32>(a, b, OFFSET);
- #define VECTOR_EXTRACT_GENERATOR_256_U16(OFFSET) return vectorExtract_emulated<U16x16, 16>(a, b, OFFSET);
- #define VECTOR_EXTRACT_GENERATOR_256_U32(OFFSET) return vectorExtract_emulated<U32x8, 8>(a, b, OFFSET);
- #define VECTOR_EXTRACT_GENERATOR_256_I32(OFFSET) return vectorExtract_emulated<I32x8, 8>(a, b, OFFSET);
- #define VECTOR_EXTRACT_GENERATOR_256_F32(OFFSET) return vectorExtract_emulated<F32x8, 8>(a, b, OFFSET);
- #endif
- // Vector extraction concatenates two input vectors and reads a vector between them using an element offset.
- // The first and last offsets, which simply return one of the inputs, can be used for readability, because the calls are inlined and optimized away by the compiler.
- // To get elements from the right side, combine the center vector with the right vector and shift one element to the left using vectorExtract_1 for the given type.
- // To get elements from the left side, combine the left vector with the center vector and shift one element to the right using vectorExtract_31 for 32 lanes, vectorExtract_15 for 16 lanes, or vectorExtract_7 for 8 lanes.
- // A usage sketch follows the extraction functions below.
- U8x32 inline vectorExtract_0(const U8x32 &a, const U8x32 &b) { return a; }
- U8x32 inline vectorExtract_1(const U8x32 &a, const U8x32 &b) { VECTOR_EXTRACT_GENERATOR_256_U8(1) }
- U8x32 inline vectorExtract_2(const U8x32 &a, const U8x32 &b) { VECTOR_EXTRACT_GENERATOR_256_U8(2) }
- U8x32 inline vectorExtract_3(const U8x32 &a, const U8x32 &b) { VECTOR_EXTRACT_GENERATOR_256_U8(3) }
- U8x32 inline vectorExtract_4(const U8x32 &a, const U8x32 &b) { VECTOR_EXTRACT_GENERATOR_256_U8(4) }
- U8x32 inline vectorExtract_5(const U8x32 &a, const U8x32 &b) { VECTOR_EXTRACT_GENERATOR_256_U8(5) }
- U8x32 inline vectorExtract_6(const U8x32 &a, const U8x32 &b) { VECTOR_EXTRACT_GENERATOR_256_U8(6) }
- U8x32 inline vectorExtract_7(const U8x32 &a, const U8x32 &b) { VECTOR_EXTRACT_GENERATOR_256_U8(7) }
- U8x32 inline vectorExtract_8(const U8x32 &a, const U8x32 &b) { VECTOR_EXTRACT_GENERATOR_256_U8(8) }
- U8x32 inline vectorExtract_9(const U8x32 &a, const U8x32 &b) { VECTOR_EXTRACT_GENERATOR_256_U8(9) }
- U8x32 inline vectorExtract_10(const U8x32 &a, const U8x32 &b) { VECTOR_EXTRACT_GENERATOR_256_U8(10) }
- U8x32 inline vectorExtract_11(const U8x32 &a, const U8x32 &b) { VECTOR_EXTRACT_GENERATOR_256_U8(11) }
- U8x32 inline vectorExtract_12(const U8x32 &a, const U8x32 &b) { VECTOR_EXTRACT_GENERATOR_256_U8(12) }
- U8x32 inline vectorExtract_13(const U8x32 &a, const U8x32 &b) { VECTOR_EXTRACT_GENERATOR_256_U8(13) }
- U8x32 inline vectorExtract_14(const U8x32 &a, const U8x32 &b) { VECTOR_EXTRACT_GENERATOR_256_U8(14) }
- U8x32 inline vectorExtract_15(const U8x32 &a, const U8x32 &b) { VECTOR_EXTRACT_GENERATOR_256_U8(15) }
- U8x32 inline vectorExtract_16(const U8x32 &a, const U8x32 &b) { VECTOR_EXTRACT_GENERATOR_256_U8(16) }
- U8x32 inline vectorExtract_17(const U8x32 &a, const U8x32 &b) { VECTOR_EXTRACT_GENERATOR_256_U8(17) }
- U8x32 inline vectorExtract_18(const U8x32 &a, const U8x32 &b) { VECTOR_EXTRACT_GENERATOR_256_U8(18) }
- U8x32 inline vectorExtract_19(const U8x32 &a, const U8x32 &b) { VECTOR_EXTRACT_GENERATOR_256_U8(19) }
- U8x32 inline vectorExtract_20(const U8x32 &a, const U8x32 &b) { VECTOR_EXTRACT_GENERATOR_256_U8(20) }
- U8x32 inline vectorExtract_21(const U8x32 &a, const U8x32 &b) { VECTOR_EXTRACT_GENERATOR_256_U8(21) }
- U8x32 inline vectorExtract_22(const U8x32 &a, const U8x32 &b) { VECTOR_EXTRACT_GENERATOR_256_U8(22) }
- U8x32 inline vectorExtract_23(const U8x32 &a, const U8x32 &b) { VECTOR_EXTRACT_GENERATOR_256_U8(23) }
- U8x32 inline vectorExtract_24(const U8x32 &a, const U8x32 &b) { VECTOR_EXTRACT_GENERATOR_256_U8(24) }
- U8x32 inline vectorExtract_25(const U8x32 &a, const U8x32 &b) { VECTOR_EXTRACT_GENERATOR_256_U8(25) }
- U8x32 inline vectorExtract_26(const U8x32 &a, const U8x32 &b) { VECTOR_EXTRACT_GENERATOR_256_U8(26) }
- U8x32 inline vectorExtract_27(const U8x32 &a, const U8x32 &b) { VECTOR_EXTRACT_GENERATOR_256_U8(27) }
- U8x32 inline vectorExtract_28(const U8x32 &a, const U8x32 &b) { VECTOR_EXTRACT_GENERATOR_256_U8(28) }
- U8x32 inline vectorExtract_29(const U8x32 &a, const U8x32 &b) { VECTOR_EXTRACT_GENERATOR_256_U8(29) }
- U8x32 inline vectorExtract_30(const U8x32 &a, const U8x32 &b) { VECTOR_EXTRACT_GENERATOR_256_U8(30) }
- U8x32 inline vectorExtract_31(const U8x32 &a, const U8x32 &b) { VECTOR_EXTRACT_GENERATOR_256_U8(31) }
- U8x32 inline vectorExtract_32(const U8x32 &a, const U8x32 &b) { return b; }
- U16x16 inline vectorExtract_0(const U16x16 &a, const U16x16 &b) { return a; }
- U16x16 inline vectorExtract_1(const U16x16 &a, const U16x16 &b) { VECTOR_EXTRACT_GENERATOR_256_U16(1) }
- U16x16 inline vectorExtract_2(const U16x16 &a, const U16x16 &b) { VECTOR_EXTRACT_GENERATOR_256_U16(2) }
- U16x16 inline vectorExtract_3(const U16x16 &a, const U16x16 &b) { VECTOR_EXTRACT_GENERATOR_256_U16(3) }
- U16x16 inline vectorExtract_4(const U16x16 &a, const U16x16 &b) { VECTOR_EXTRACT_GENERATOR_256_U16(4) }
- U16x16 inline vectorExtract_5(const U16x16 &a, const U16x16 &b) { VECTOR_EXTRACT_GENERATOR_256_U16(5) }
- U16x16 inline vectorExtract_6(const U16x16 &a, const U16x16 &b) { VECTOR_EXTRACT_GENERATOR_256_U16(6) }
- U16x16 inline vectorExtract_7(const U16x16 &a, const U16x16 &b) { VECTOR_EXTRACT_GENERATOR_256_U16(7) }
- U16x16 inline vectorExtract_8(const U16x16 &a, const U16x16 &b) { VECTOR_EXTRACT_GENERATOR_256_U16(8) }
- U16x16 inline vectorExtract_9(const U16x16 &a, const U16x16 &b) { VECTOR_EXTRACT_GENERATOR_256_U16(9) }
- U16x16 inline vectorExtract_10(const U16x16 &a, const U16x16 &b) { VECTOR_EXTRACT_GENERATOR_256_U16(10) }
- U16x16 inline vectorExtract_11(const U16x16 &a, const U16x16 &b) { VECTOR_EXTRACT_GENERATOR_256_U16(11) }
- U16x16 inline vectorExtract_12(const U16x16 &a, const U16x16 &b) { VECTOR_EXTRACT_GENERATOR_256_U16(12) }
- U16x16 inline vectorExtract_13(const U16x16 &a, const U16x16 &b) { VECTOR_EXTRACT_GENERATOR_256_U16(13) }
- U16x16 inline vectorExtract_14(const U16x16 &a, const U16x16 &b) { VECTOR_EXTRACT_GENERATOR_256_U16(14) }
- U16x16 inline vectorExtract_15(const U16x16 &a, const U16x16 &b) { VECTOR_EXTRACT_GENERATOR_256_U16(15) }
- U16x16 inline vectorExtract_16(const U16x16 &a, const U16x16 &b) { return b; }
- U32x8 inline vectorExtract_0(const U32x8 &a, const U32x8 &b) { return a; }
- U32x8 inline vectorExtract_1(const U32x8 &a, const U32x8 &b) { VECTOR_EXTRACT_GENERATOR_256_U32(1) }
- U32x8 inline vectorExtract_2(const U32x8 &a, const U32x8 &b) { VECTOR_EXTRACT_GENERATOR_256_U32(2) }
- U32x8 inline vectorExtract_3(const U32x8 &a, const U32x8 &b) { VECTOR_EXTRACT_GENERATOR_256_U32(3) }
- U32x8 inline vectorExtract_4(const U32x8 &a, const U32x8 &b) { VECTOR_EXTRACT_GENERATOR_256_U32(4) }
- U32x8 inline vectorExtract_5(const U32x8 &a, const U32x8 &b) { VECTOR_EXTRACT_GENERATOR_256_U32(5) }
- U32x8 inline vectorExtract_6(const U32x8 &a, const U32x8 &b) { VECTOR_EXTRACT_GENERATOR_256_U32(6) }
- U32x8 inline vectorExtract_7(const U32x8 &a, const U32x8 &b) { VECTOR_EXTRACT_GENERATOR_256_U32(7) }
- U32x8 inline vectorExtract_8(const U32x8 &a, const U32x8 &b) { return b; }
- I32x8 inline vectorExtract_0(const I32x8 &a, const I32x8 &b) { return a; }
- I32x8 inline vectorExtract_1(const I32x8 &a, const I32x8 &b) { VECTOR_EXTRACT_GENERATOR_256_I32(1) }
- I32x8 inline vectorExtract_2(const I32x8 &a, const I32x8 &b) { VECTOR_EXTRACT_GENERATOR_256_I32(2) }
- I32x8 inline vectorExtract_3(const I32x8 &a, const I32x8 &b) { VECTOR_EXTRACT_GENERATOR_256_I32(3) }
- I32x8 inline vectorExtract_4(const I32x8 &a, const I32x8 &b) { VECTOR_EXTRACT_GENERATOR_256_I32(4) }
- I32x8 inline vectorExtract_5(const I32x8 &a, const I32x8 &b) { VECTOR_EXTRACT_GENERATOR_256_I32(5) }
- I32x8 inline vectorExtract_6(const I32x8 &a, const I32x8 &b) { VECTOR_EXTRACT_GENERATOR_256_I32(6) }
- I32x8 inline vectorExtract_7(const I32x8 &a, const I32x8 &b) { VECTOR_EXTRACT_GENERATOR_256_I32(7) }
- I32x8 inline vectorExtract_8(const I32x8 &a, const I32x8 &b) { return b; }
- F32x8 inline vectorExtract_0(const F32x8 &a, const F32x8 &b) { return a; }
- F32x8 inline vectorExtract_1(const F32x8 &a, const F32x8 &b) { VECTOR_EXTRACT_GENERATOR_256_F32(1) }
- F32x8 inline vectorExtract_2(const F32x8 &a, const F32x8 &b) { VECTOR_EXTRACT_GENERATOR_256_F32(2) }
- F32x8 inline vectorExtract_3(const F32x8 &a, const F32x8 &b) { VECTOR_EXTRACT_GENERATOR_256_F32(3) }
- F32x8 inline vectorExtract_4(const F32x8 &a, const F32x8 &b) { VECTOR_EXTRACT_GENERATOR_256_F32(4) }
- F32x8 inline vectorExtract_5(const F32x8 &a, const F32x8 &b) { VECTOR_EXTRACT_GENERATOR_256_F32(5) }
- F32x8 inline vectorExtract_6(const F32x8 &a, const F32x8 &b) { VECTOR_EXTRACT_GENERATOR_256_F32(6) }
- F32x8 inline vectorExtract_7(const F32x8 &a, const F32x8 &b) { VECTOR_EXTRACT_GENERATOR_256_F32(7) }
- F32x8 inline vectorExtract_8(const F32x8 &a, const F32x8 &b) { return b; }
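- // Usage sketch (not part of the original header, variable names are only examples): stepping a sliding
- // window one byte at a time across two adjacent U8x32 loads from the same buffer.
- //   U8x32 window1 = vectorExtract_1(currentVector, nextVector); // bytes 1..32 of the concatenation
- //   U8x32 window2 = vectorExtract_2(currentVector, nextVector); // bytes 2..33 of the concatenation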
- // Gather instructions load memory from a pointer at multiple index offsets at the same time.
- // The given pointers should be aligned to 4 bytes, so that the fallback solution works on machines with strict alignment requirements.
- // A usage sketch follows the gather functions below.
- #if defined USE_AVX2
- #define GATHER_I32x8_AVX2(SOURCE, EIGHT_OFFSETS, SCALE) _mm256_i32gather_epi32((const int32_t*)(SOURCE), EIGHT_OFFSETS, SCALE)
- #define GATHER_U32x8_AVX2(SOURCE, EIGHT_OFFSETS, SCALE) _mm256_i32gather_epi32((const int32_t*)(SOURCE), EIGHT_OFFSETS, SCALE)
- #define GATHER_F32x8_AVX2(SOURCE, EIGHT_OFFSETS, SCALE) _mm256_i32gather_ps((const float*)(SOURCE), EIGHT_OFFSETS, SCALE)
- #endif
- static inline U32x8 gather(const dsr::SafePointer<uint32_t> data, const U32x8 &elementOffset) {
- #if defined USE_AVX2
- // TODO: Implement safety checks for debug mode.
- return U32x8(GATHER_U32x8_AVX2(data.getUnsafe(), elementOffset.v, 4));
- #else
- ALIGN32 uint32_t elementOffsets[8];
- elementOffset.writeAlignedUnsafe(elementOffsets);
- return U32x8(
- *(data + elementOffsets[0]),
- *(data + elementOffsets[1]),
- *(data + elementOffsets[2]),
- *(data + elementOffsets[3]),
- *(data + elementOffsets[4]),
- *(data + elementOffsets[5]),
- *(data + elementOffsets[6]),
- *(data + elementOffsets[7])
- );
- #endif
- }
- static inline I32x8 gather(const dsr::SafePointer<int32_t> data, const U32x8 &elementOffset) {
- #if defined USE_AVX2
- // TODO: Implement safety checks for debug mode.
- return I32x8(GATHER_I32x8_AVX2(data.getUnsafe(), elementOffset.v, 4));
- #else
- ALIGN32 uint32_t elementOffsets[8];
- elementOffset.writeAlignedUnsafe(elementOffsets);
- return I32x8(
- *(data + elementOffsets[0]),
- *(data + elementOffsets[1]),
- *(data + elementOffsets[2]),
- *(data + elementOffsets[3]),
- *(data + elementOffsets[4]),
- *(data + elementOffsets[5]),
- *(data + elementOffsets[6]),
- *(data + elementOffsets[7])
- );
- #endif
- }
- static inline F32x8 gather(const dsr::SafePointer<float> data, const U32x8 &elementOffset) {
- #if defined USE_AVX2
- // TODO: Implement safety checks for debug mode.
- return F32x8(GATHER_F32x8_AVX2(data.getUnsafe(), elementOffset.v, 4));
- #else
- ALIGN32 uint32_t elementOffsets[8];
- elementOffset.writeAlignedUnsafe(elementOffsets);
- return F32x8(
- *(data + elementOffsets[0]),
- *(data + elementOffsets[1]),
- *(data + elementOffsets[2]),
- *(data + elementOffsets[3]),
- *(data + elementOffsets[4]),
- *(data + elementOffsets[5]),
- *(data + elementOffsets[6]),
- *(data + elementOffsets[7])
- );
- #endif
- }
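- // Usage sketch (not part of the original header, floatCoordinates is an assumed F32x8 and floatTable
- // an assumed dsr::SafePointer<float>): fetching eight elements per call, with offsets taken from
- // earlier SIMD math such as truncated coordinates.
- //   U32x8 offsets = truncateToU32(floatCoordinates);
- //   F32x8 samples = gather(floatTable, offsets);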
- // Wrapper functions for explicitly expanding scalars into vectors during math operations.
- // A combined usage sketch follows the bitwise wrappers below.
- #define NUMERICAL_SCALAR_OPERATIONS(VECTOR_TYPE, ELEMENT_TYPE, LANE_COUNT) \
- inline VECTOR_TYPE operator+(const VECTOR_TYPE& left, ELEMENT_TYPE right) { return left + VECTOR_TYPE(right); } \
- inline VECTOR_TYPE operator+(ELEMENT_TYPE left, const VECTOR_TYPE& right) { return VECTOR_TYPE(left) + right; } \
- inline VECTOR_TYPE operator-(const VECTOR_TYPE& left, ELEMENT_TYPE right) { return left - VECTOR_TYPE(right); } \
- inline VECTOR_TYPE operator-(ELEMENT_TYPE left, const VECTOR_TYPE& right) { return VECTOR_TYPE(left) - right; }
- FOR_ALL_VECTOR_TYPES(NUMERICAL_SCALAR_OPERATIONS)
- #undef NUMERICAL_SCALAR_OPERATIONS
- #define MULTIPLY_SCALAR_OPERATIONS(VECTOR_TYPE, ELEMENT_TYPE, LANE_COUNT) \
- inline VECTOR_TYPE operator*(const VECTOR_TYPE& left, ELEMENT_TYPE right) { return left * VECTOR_TYPE(right); } \
- inline VECTOR_TYPE operator*(ELEMENT_TYPE left, const VECTOR_TYPE& right) { return VECTOR_TYPE(left) * right; }
- // TODO: Implement multiplication for U8x16 and U8x32.
- //FOR_ALL_VECTOR_TYPES(MULTIPLY_SCALAR_OPERATIONS)
- MULTIPLY_SCALAR_OPERATIONS(F32x4, float, 4)
- MULTIPLY_SCALAR_OPERATIONS(F32x8, float, 8)
- MULTIPLY_SCALAR_OPERATIONS(U32x4, uint32_t, 4)
- MULTIPLY_SCALAR_OPERATIONS(U32x8, uint32_t, 8)
- MULTIPLY_SCALAR_OPERATIONS(I32x4, int32_t, 4)
- MULTIPLY_SCALAR_OPERATIONS(I32x8, int32_t, 8)
- MULTIPLY_SCALAR_OPERATIONS(U16x8, uint16_t, 8)
- MULTIPLY_SCALAR_OPERATIONS(U16x16, uint16_t, 16)
- #undef MULTIPLY_SCALAR_OPERATIONS
- // Wrapper functions for explicitly duplicating bit masks into the same lane count.
- #define BITWISE_SCALAR_OPERATIONS(VECTOR_TYPE, ELEMENT_TYPE, LANE_COUNT) \
- inline VECTOR_TYPE operator&(const VECTOR_TYPE& left, ELEMENT_TYPE right) { return left & VECTOR_TYPE(right); } \
- inline VECTOR_TYPE operator&(ELEMENT_TYPE left, const VECTOR_TYPE& right) { return VECTOR_TYPE(left) & right; } \
- inline VECTOR_TYPE operator|(const VECTOR_TYPE& left, ELEMENT_TYPE right) { return left | VECTOR_TYPE(right); } \
- inline VECTOR_TYPE operator|(ELEMENT_TYPE left, const VECTOR_TYPE& right) { return VECTOR_TYPE(left) | right; } \
- inline VECTOR_TYPE operator^(const VECTOR_TYPE& left, ELEMENT_TYPE right) { return left ^ VECTOR_TYPE(right); } \
- inline VECTOR_TYPE operator^(ELEMENT_TYPE left, const VECTOR_TYPE& right) { return VECTOR_TYPE(left) ^ right; }
- // TODO: Implement bitwise operations for all unsigned SIMD vectors.
- //FOR_UNSIGNED_VECTOR_TYPES(BITWISE_SCALAR_OPERATIONS)
- BITWISE_SCALAR_OPERATIONS(U32x4, uint32_t, 4)
- BITWISE_SCALAR_OPERATIONS(U32x8, uint32_t, 8)
- #undef BITWISE_SCALAR_OPERATIONS
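- // Usage sketch (not part of the original header, color and packedPixels are assumed variables):
- // the wrappers above let scalars mix with vectors without writing the broadcast explicitly.
- //   F32x8 brightened = color * 1.5f + 0.25f;    // same as color * F32x8(1.5f) + F32x8(0.25f)
- //   U32x4 masked = packedPixels & 0x00FF00FFu;  // same as packedPixels & U32x4(0x00FF00FFu)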
- // Cleaning up temporary macro definitions to avoid cluttering the namespace.
- #undef FOR_ALL_VECTOR_TYPES
- #undef FOR_FLOAT_VECTOR_TYPES
- #undef FOR_INTEGER_VECTOR_TYPES
- #undef FOR_SIGNED_VECTOR_TYPES
- #undef FOR_UNSIGNED_VECTOR_TYPES
- // The X vectors use the longest SIMD length that is efficient for both floating-point and integer types.
- // An iteration sketch follows the lane count constants below.
- // DSR_DEFAULT_ALIGNMENT
- // The number of bytes memory should be aligned with by default when creating buffers and images.
- // F32xX
- // The longest available SIMD vector for storing 32-bit float values. Iterating laneCountX_32Bit floats at a time.
- // I32xX
- // The longest available SIMD vector for storing signed 32-bit integer values. Iterating laneCountX_32Bit integers at a time.
- // U32xX
- // The longest available SIMD vector for storing unsigned 32-bit integer values. Iterating laneCountX_32Bit integers at a time.
- // U16xX
- // The longest available SIMD vector for storing unsigned 16-bit integer values. Iterating laneCountX_16Bit integers at a time.
- // U8xX
- // The longest available SIMD vector for storing unsigned 8-bit integer values. Iterating laneCountX_8Bit integers at a time.
- #if defined USE_256BIT_X_SIMD || defined EMULATE_256BIT_X_SIMD
- // Using 256-bit SIMD
- #define DSR_DEFAULT_VECTOR_SIZE 32
- #define DSR_DEFAULT_ALIGNMENT 32
- using F32xX = F32x8;
- using I32xX = I32x8;
- using U32xX = U32x8;
- using U16xX = U16x16;
- using U8xX = U8x32;
- // Memory is aligned to 256 bits so that padding at the end of each pixel row can be overwritten freely.
- // Otherwise every filter would need slow and bloated duplicated code to preserve the data at the end of each row.
- #else
- // If there is no hardware support for 256-bit vectors, explicitly used emulated 256-bit vectors only need to be aligned to 128 bits.
- #define DSR_DEFAULT_VECTOR_SIZE 16
- #define DSR_DEFAULT_ALIGNMENT 16
- using F32xX = F32x4;
- using I32xX = I32x4;
- using U32xX = U32x4;
- using U16xX = U16x8;
- using U8xX = U8x16;
- #endif
- // How many lanes the longest available vector has for a specified lane size.
- // Used to iterate indices and pointers using whole elements.
- static const int laneCountX_32Bit = DSR_DEFAULT_VECTOR_SIZE / 4;
- static const int laneCountX_16Bit = DSR_DEFAULT_VECTOR_SIZE / 2;
- static const int laneCountX_8Bit = DSR_DEFAULT_VECTOR_SIZE;
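- // Usage sketch (not part of the original header, source and elementCount are assumed to describe a float
- // buffer padded and aligned to DSR_DEFAULT_ALIGNMENT): stepping through the buffer in chunks of
- // laneCountX_32Bit, so that the same loop uses 256-bit vectors when available and 128-bit vectors otherwise.
- //   for (int i = 0; i < elementCount; i += laneCountX_32Bit) {
- //       F32xX values(0.0f); // replace with an aligned load of laneCountX_32Bit floats at element i
- //       values = values * 2.0f + 1.0f;
- //       // replace with an aligned store of values back to element i
- //   }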
- // The F vector uses the longest SIMD length that is efficient when only processing float values, even if no integer types are available in the same size.
- // Used when you know that your algorithm only works with float types and you need the extra performance (see the sketch at the end of this section).
- // Some processors have AVX but not AVX2, meaning that they have 256-bit SIMD for floats but only 128-bit SIMD for integers.
- // F32xF
- // The longest available SIMD vector for storing 32-bit float values. Iterating laneCountF_32Bit floats at a time.
- #if defined USE_256BIT_F_SIMD || defined EMULATE_256BIT_F_SIMD
- #define DSR_FLOAT_VECTOR_SIZE 32
- #define DSR_FLOAT_ALIGNMENT 32
- using F32xF = F32x8;
- #else
- // F vectors are 128 bits.
- #define DSR_FLOAT_VECTOR_SIZE 16
- #define DSR_FLOAT_ALIGNMENT 16
- using F32xF = F32x4;
- #endif
- // Used to iterate over float pointers when using F32xF.
- static const int laneCountF = DSR_FLOAT_VECTOR_SIZE / 4;
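- // Usage sketch (not part of the original header, elementCount is an assumed buffer length): float-only
- // loops can step by laneCountF and use F32xF instead of F32xX, gaining 256-bit vectors on processors
- // with AVX but without AVX2.
- //   for (int i = 0; i < elementCount; i += laneCountF) {
- //       // process laneCountF floats at a time using F32xF here
- //   }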
- #endif