- // zlib open source license
- //
- // Copyright (c) 2017 to 2025 David Forsgren Piuva
- //
- // This software is provided 'as-is', without any express or implied
- // warranty. In no event will the authors be held liable for any damages
- // arising from the use of this software.
- //
- // Permission is granted to anyone to use this software for any purpose,
- // including commercial applications, and to alter it and redistribute it
- // freely, subject to the following restrictions:
- //
- // 1. The origin of this software must not be misrepresented; you must not
- // claim that you wrote the original software. If you use this software
- // in a product, an acknowledgment in the product documentation would be
- // appreciated but is not required.
- //
- // 2. Altered source versions must be plainly marked as such, and must not be
- // misrepresented as being the original software.
- //
- // 3. This notice may not be removed or altered from any source
- // distribution.
- // Hardware abstraction layer for portable SIMD math.
- // Used to make calculations faster without having to mess around with hardware-specific assembly code or intrinsic functions.
- // You get the performance you need today and the ability to compile with automatic scalar emulation when building for a future processor that this module has not yet been ported to.
- // When you can generate the vectorized code using the same template function as your non-vectorized code, you don't even need to write a reference implementation.
- // Using with 128-bit SIMD: (beginner friendly, test once, compile anywhere, no compiler flags)
- // If you are new to vectorization or only plan to work with 128-bit extensions such as ARM NEON, you can keep it simple by only using the 128-bit vector types.
- // Pros and cons:
- // + Most target platforms (excluding older systems such as ARMv6) have 128-bit SIMD extensions such as Intel SSE2 or ARM NEON enabled by default.
- // ARMv6 does not support ARM NEON, but most ARMv7 processors do, so compilers usually enable NEON by default.
- // All 64-bit ARM processors have ARM NEON, because it stopped being optional in ARMv8.
- // Builds targeting 64-bit Intel processors usually have SSE2 enabled by default, so you don't have to change any compiler flags when building on a different system.
- // + One build for all computers of the same instruction set.
- // Great when your application is not so resource heavy, because the least powerful systems don't have the fancy extensions anyway.
- // - You might end up enabling the additional SIMD extensions anyway, because the library is already using them to become faster.
- // Types:
- // * Use F32x4, I32x4 and U32x4 for 4 elements at a time
- // * U16x8 for 8 elements at a time
- // * U8x16 for 16 elements at a time
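- // Example of the 128-bit types (a minimal sketch, not part of the library; assumes data is 16-byte aligned and elementCount is a multiple of 4):
- //     void clampBuffer(float* data, int elementCount) {
- //         for (int i = 0; i < elementCount; i += 4) {
- //             dsr::F32x4 v = dsr::F32x4::readAlignedUnsafe(data + i);
- //             v.clamp(0.0f, 1.0f).writeAlignedUnsafe(data + i);
- //         }
- //     }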
- // Using the X vector size: (advanced, requires testing with different build flags or emulation)
- // If you want more performance, you can use variable length type aliases.
- // Pros and cons:
- // + For heavy calculations where memory access is not the bottleneck, using larger SIMD vectors when enabled allows saving energy and increasing performance.
- // - If you forget to test with longer vector lengths (compiling with -mavx2 or -D EMULATE_256BIT_X_SIMD), you might find bugs from not iterating or aligning memory correctly.
- // Types:
- // * Use F32xX, I32xX and U32xX for laneCountX_32Bit elements at a time
- // * U16xX for laneCountX_16Bit elements at a time
- // * U8xX for laneCountX_8Bit elements at a time
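- // Example of the X vector size (a hedged sketch; assumes F32xX mirrors the F32x4 interface shown below, that laneCountX_32Bit is in the dsr namespace, that elementCount is a multiple of laneCountX_32Bit, and that data is aligned for the X vector size):
- //     void clampBuffer(float* data, int elementCount) {
- //         for (int i = 0; i < elementCount; i += dsr::laneCountX_32Bit) {
- //             dsr::F32xX v = dsr::F32xX::readAlignedUnsafe(data + i);
- //             v.clampLower(0.0f).writeAlignedUnsafe(data + i);
- //         }
- //     }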
- // Using the F vector size: (very dangerous, no test can confirm that memory alignment is correct)
- // If you want even more performance, you can let float operations use the longest available F vector size, which might exceed the X vector size.
- // Pros and cons:
- // - Have to manually set the alignment of buffers to DSR_FLOAT_ALIGNMENT to prevent crashing.
- // If the default buffer alignment changed based on the size of F vectors, the more commonly used X vectors would be slowed down by cache misses caused by padding larger than the X vector size.
- // AlignedImageF32 and sound backends are already aligned with the F vector size, because they are not generic like Buffer.
- // - It can be difficult to detect incorrect memory alignment, because a pointer can accidentally be aligned to more than what was requested.
- // If accidentally aligning to 128 bits instead of 256 bits, there is a 50% risk of failing to detect it at runtime and later failing on another computer.
- // If you stick with 128-bit or X vectors, all buffers will be correctly aligned automatically.
- // + For heavy calculations where memory access is not the bottleneck, using larger SIMD vectors when enabled allows saving energy and increasing performance.
- // - If you forget to test with longer vector lengths (compiling with -mavx or -D EMULATE_256BIT_F_SIMD), you might find bugs from not iterating or aligning memory correctly.
- // Types:
- // * Use F32xX, I32xX and U32xX for laneCountX_32Bit elements at a time
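- // Example of aligning for the F vector size (a hedged sketch; assumes DSR_FLOAT_ALIGNMENT is a byte count usable with the ALIGN_BYTES macro defined further down):
- //     float buffer[256] ALIGN_BYTES(DSR_FLOAT_ALIGNMENT);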
- // Compiler extensions
- // On Intel/AMD processors:
- // SSE2 is usually enabled by default, because SSE2 is mandatory for 64-bit Intel instructions.
- // Use -mavx as a G++ compiler flag to enable the AVX extension, enabling the USE_AVX and USE_256BIT_F_SIMD macros.
- // If not available on your computer, you can test your algorithm for 256-bit float SIMD using EMULATE_256BIT_F_SIMD, but only if you use the F vector size.
- // Use -mavx2 as a G++ compiler flag to enable the AVX2 extension, enabling the USE_AVX2 and USE_256BIT_X_SIMD macros.
- // If not available on your computer, you can test your algorithm for 256-bit float and integer SIMD using EMULATE_256BIT_X_SIMD, but only if you use the X vector size.
- // On ARMv6 processors:
- // Scalar emulation is used when compiling for ARMv6, because it does not have NEON and VFP is not supported in this abstraction.
- // On ARMv7 processors:
- // NEON is usually enabled by default for ARMv7, because most of them have the extension.
- // On ARMv8 processors:
- // NEON cannot be disabled for ARMv8, because it is mandatory in ARMv8.
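- // Example build commands (a sketch; "main.cpp" is a placeholder and library paths are omitted):
- //     g++ -O2 main.cpp                              # 128-bit SIMD only (SSE2/NEON are usually enabled by default)
- //     g++ -O2 -mavx2 main.cpp                       # enables USE_AVX2 and USE_256BIT_X_SIMD
- //     g++ -O2 -D EMULATE_256BIT_X_SIMD main.cpp     # test X vector code with emulated 256-bit vectors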
- #ifndef DFPSR_SIMD
- #define DFPSR_SIMD
- #include <cstdint>
- #include <cassert>
- #include "SafePointer.h"
- #include "../math/FVector.h"
- #include "../math/IVector.h"
- #include "../math/UVector.h"
- #include "DsrTraits.h"
- #include "../settings.h"
- #include "../base/noSimd.h"
- // Alignment in bytes
- #define ALIGN_BYTES(SIZE) __attribute__((aligned(SIZE)))
- #define ALIGN16 ALIGN_BYTES(16) // 128-bit alignment
- #define ALIGN32 ALIGN_BYTES(32) // 256-bit alignment
- #define ALIGN64 ALIGN_BYTES(64) // 512-bit alignment
- #define ALIGN128 ALIGN_BYTES(128) // 1024-bit alignment
- #define ALIGN256 ALIGN_BYTES(256) // 2048-bit alignment
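- // Example: "float data[4] ALIGN16;" declares a stack array aligned to 16 bytes, as used by the NEON load helpers below.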
- namespace dsr {
- // Everything declared in here handles things specific to SSE.
- // Direct use of the macros will not provide portability to all hardware.
- #ifdef USE_SSE2
- #define USE_DIRECT_SIMD_MEMORY_ACCESS
- #include <emmintrin.h> // SSE2
- #ifdef USE_SSSE3
- #include <tmmintrin.h> // SSSE3
- #endif
- #ifdef USE_AVX
- #include <immintrin.h> // AVX / AVX2
- #endif
- // Vector types
- #define SIMD_F32x4 __m128
- #define SIMD_U8x16 __m128i
- #define SIMD_U16x8 __m128i
- #define SIMD_U32x4 __m128i
- #define SIMD_I32x4 __m128i
- // Vector uploads in address order
- #define LOAD_VECTOR_F32_SIMD(A, B, C, D) _mm_set_ps(D, C, B, A)
- #define LOAD_SCALAR_F32_SIMD(A) _mm_set1_ps(A)
- #define LOAD_VECTOR_U8_SIMD(A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P) _mm_set_epi8(P, O, N, M, L, K, J, I, H, G, F, E, D, C, B, A)
- #define LOAD_SCALAR_U8_SIMD(A) _mm_set1_epi8(A)
- #define LOAD_VECTOR_U16_SIMD(A, B, C, D, E, F, G, H) _mm_set_epi16(H, G, F, E, D, C, B, A)
- #define LOAD_SCALAR_U16_SIMD(A) _mm_set1_epi16(A)
- #define LOAD_VECTOR_U32_SIMD(A, B, C, D) _mm_set_epi32(D, C, B, A)
- #define LOAD_SCALAR_U32_SIMD(A) _mm_set1_epi32(A)
- #define LOAD_VECTOR_I32_SIMD(A, B, C, D) _mm_set_epi32(D, C, B, A)
- #define LOAD_SCALAR_I32_SIMD(A) _mm_set1_epi32(A)
- // Conversions
- #define F32_TO_I32_SIMD(A) _mm_cvttps_epi32(A)
- #define F32_TO_U32_SIMD(A) _mm_cvttps_epi32(A)
- #define I32_TO_F32_SIMD(A) _mm_cvtepi32_ps(A)
- #define U32_TO_F32_SIMD(A) _mm_cvtepi32_ps(A)
- // Unpacking conversions
- #define U8_LOW_TO_U16_SIMD(A) _mm_unpacklo_epi8(A, _mm_set1_epi8(0))
- #define U8_HIGH_TO_U16_SIMD(A) _mm_unpackhi_epi8(A, _mm_set1_epi8(0))
- #define U16_LOW_TO_U32_SIMD(A) _mm_unpacklo_epi16(A, _mm_set1_epi16(0))
- #define U16_HIGH_TO_U32_SIMD(A) _mm_unpackhi_epi16(A, _mm_set1_epi16(0))
- // Saturated packing
- // Credit: Using ideas from Victoria Zhislina's NEON_2_SSE.h header from the Intel corporation, but not trying to emulate NEON
- inline SIMD_U8x16 PACK_SAT_U16_TO_U8_SIMD(const SIMD_U16x8& a, const SIMD_U16x8& b) {
- SIMD_U16x8 mask, a2, b2;
- mask = _mm_set1_epi16(0b0111111111111111);
- a2 = _mm_and_si128(a, mask);
- a2 = _mm_or_si128(a2, _mm_and_si128(_mm_cmpgt_epi16(a2, a), mask));
- b2 = _mm_and_si128(b, mask);
- b2 = _mm_or_si128(b2, _mm_and_si128(_mm_cmpgt_epi16(b2, b), mask));
- return _mm_packus_epi16(a2, b2);
- }
- // Reinterpret casting
- #define REINTERPRET_U32_TO_U8_SIMD(A) (A)
- #define REINTERPRET_U32_TO_U16_SIMD(A) (A)
- #define REINTERPRET_U8_TO_U32_SIMD(A) (A)
- #define REINTERPRET_U16_TO_U32_SIMD(A) (A)
- #define REINTERPRET_U32_TO_I32_SIMD(A) (A)
- #define REINTERPRET_I32_TO_U32_SIMD(A) (A)
- // Vector float operations returning SIMD_F32x4
- #define ADD_F32_SIMD(A, B) _mm_add_ps(A, B)
- #define SUB_F32_SIMD(A, B) _mm_sub_ps(A, B)
- #define MUL_F32_SIMD(A, B) _mm_mul_ps(A, B)
- // Vector integer operations returning SIMD_I32x4
- #define ADD_I32_SIMD(A, B) _mm_add_epi32(A, B)
- #define SUB_I32_SIMD(A, B) _mm_sub_epi32(A, B)
- // 32-bit integer multiplications are not available on SSE2.
- // Vector integer operations returning SIMD_U32x4
- #define ADD_U32_SIMD(A, B) _mm_add_epi32(A, B)
- #define SUB_U32_SIMD(A, B) _mm_sub_epi32(A, B)
- // 32-bit integer multiplications are not available on SSE2.
- // Vector integer operations returning SIMD_U16x8
- #define ADD_U16_SIMD(A, B) _mm_add_epi16(A, B)
- #define SUB_U16_SIMD(A, B) _mm_sub_epi16(A, B)
- #define MUL_U16_SIMD(A, B) _mm_mullo_epi16(A, B)
- // Vector integer operations returning SIMD_U8x16
- #define ADD_U8_SIMD(A, B) _mm_add_epi8(A, B)
- #define ADD_SAT_U8_SIMD(A, B) _mm_adds_epu8(A, B) // Saturated addition
- #define SUB_U8_SIMD(A, B) _mm_sub_epi8(A, B)
- #define SUB_SAT_U8_SIMD(A, B) _mm_subs_epu8(A, B) // Saturated subtraction
- // No 8-bit multiplications
- // Statistics
- #define MIN_F32_SIMD(A, B) _mm_min_ps(A, B)
- #define MAX_F32_SIMD(A, B) _mm_max_ps(A, B)
- // TODO: Implement minimum and maximum for integer vectors, so that all operations exist for all applicable types:
- // Using _mm256_max_epu16... in AVX2 for 256-bit versions
- // Using comparisons and masking in SSE2 when _mm_max_epu16... in SSE4.1 is not available
- // Bitwise
- #define BITWISE_AND_U32_SIMD(A, B) _mm_and_si128(A, B)
- #define BITWISE_OR_U32_SIMD(A, B) _mm_or_si128(A, B)
- #define BITWISE_XOR_U32_SIMD(A, B) _mm_xor_si128(A, B)
- #ifdef USE_AVX
- // 256-bit vector types
- #define SIMD_F32x8 __m256
- // Vector uploads in address order
- #define LOAD_VECTOR_F32_SIMD256(A, B, C, D, E, F, G, H) _mm256_set_ps(H, G, F, E, D, C, B, A)
- #define LOAD_SCALAR_F32_SIMD256(A) _mm256_set1_ps(A)
- // Vector float operations returning SIMD_F32x8
- #define ADD_F32_SIMD256(A, B) _mm256_add_ps(A, B)
- #define SUB_F32_SIMD256(A, B) _mm256_sub_ps(A, B)
- #define MUL_F32_SIMD256(A, B) _mm256_mul_ps(A, B)
- // Statistics
- #define MIN_F32_SIMD256(A, B) _mm256_min_ps(A, B)
- #define MAX_F32_SIMD256(A, B) _mm256_max_ps(A, B)
- #ifdef USE_AVX2
- // 256-bit vector types
- #define SIMD_U8x32 __m256i
- #define SIMD_U16x16 __m256i
- #define SIMD_U32x8 __m256i
- #define SIMD_I32x8 __m256i
- // Vector uploads in address order
- #define LOAD_VECTOR_U8_SIMD256(A1, B1, C1, D1, E1, F1, G1, H1, I1, J1, K1, L1, M1, N1, O1, P1, Q1, R1, S1, T1, U1, V1, W1, X1, Y1, Z1, A2, B2, C2, D2, E2, F2) _mm256_set_epi8(F2, E2, D2, C2, B2, A2, Z1, Y1, X1, W1, V1, U1, T1, S1, R1, Q1, P1, O1, N1, M1, L1, K1, J1, I1, H1, G1, F1, E1, D1, C1, B1, A1)
- #define LOAD_SCALAR_U8_SIMD256(A) _mm256_set1_epi8(A)
- #define LOAD_VECTOR_U16_SIMD256(A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P) _mm256_set_epi16(P, O, N, M, L, K, J, I, H, G, F, E, D, C, B, A)
- #define LOAD_SCALAR_U16_SIMD256(A) _mm256_set1_epi16(A)
- #define LOAD_VECTOR_U32_SIMD256(A, B, C, D, E, F, G, H) _mm256_set_epi32(H, G, F, E, D, C, B, A)
- #define LOAD_SCALAR_U32_SIMD256(A) _mm256_set1_epi32(A)
- #define LOAD_VECTOR_I32_SIMD256(A, B, C, D, E, F, G, H) _mm256_set_epi32(H, G, F, E, D, C, B, A)
- #define LOAD_SCALAR_I32_SIMD256(A) _mm256_set1_epi32(A)
- // Conversions
- #define F32_TO_I32_SIMD256(A) _mm256_cvttps_epi32(A)
- #define F32_TO_U32_SIMD256(A) _mm256_cvttps_epi32(A)
- #define I32_TO_F32_SIMD256(A) _mm256_cvtepi32_ps(A)
- #define U32_TO_F32_SIMD256(A) _mm256_cvtepi32_ps(A)
- // Unpacking conversions
- #define U8_LOW_TO_U16_SIMD256(A) _mm256_unpacklo_epi8(_mm256_permute4x64_epi64(A, 0b11011000), _mm256_set1_epi8(0))
- #define U8_HIGH_TO_U16_SIMD256(A) _mm256_unpackhi_epi8(_mm256_permute4x64_epi64(A, 0b11011000), _mm256_set1_epi8(0))
- #define U16_LOW_TO_U32_SIMD256(A) _mm256_unpacklo_epi16(_mm256_permute4x64_epi64(A, 0b11011000), _mm256_set1_epi16(0))
- #define U16_HIGH_TO_U32_SIMD256(A) _mm256_unpackhi_epi16(_mm256_permute4x64_epi64(A, 0b11011000), _mm256_set1_epi16(0))
- // Saturated packing
- // Credit: Using ideas from Victoria Zhislina's NEON_2_SSE.h header from the Intel corporation, but not trying to emulate NEON
- inline SIMD_U8x32 PACK_SAT_U16_TO_U8_SIMD256(const SIMD_U16x16& a, const SIMD_U16x16& b) {
- SIMD_U16x16 mask, a2, b2;
- mask = _mm256_set1_epi16(0b0111111111111111);
- a2 = _mm256_and_si256(a, mask);
- a2 = _mm256_or_si256(a2, _mm256_and_si256(_mm256_cmpgt_epi16(a2, a), mask));
- b2 = _mm256_and_si256(b, mask);
- b2 = _mm256_or_si256(b2, _mm256_and_si256(_mm256_cmpgt_epi16(b2, b), mask));
- // The 256-bit pack instruction _mm256_packus_epi16 is not serial, so the result has to be permuted into the correct order.
- // 0 2 1 3
- // | X |
- // 0 1 2 3
- return _mm256_permute4x64_epi64(_mm256_packus_epi16(a2, b2), 0b11011000);
- }
- // Reinterpret casting
- #define REINTERPRET_U32_TO_U8_SIMD256(A) (A)
- #define REINTERPRET_U32_TO_U16_SIMD256(A) (A)
- #define REINTERPRET_U8_TO_U32_SIMD256(A) (A)
- #define REINTERPRET_U16_TO_U32_SIMD256(A) (A)
- #define REINTERPRET_U32_TO_I32_SIMD256(A) (A)
- #define REINTERPRET_I32_TO_U32_SIMD256(A) (A)
- // Vector integer operations returning SIMD_I32x8
- #define ADD_I32_SIMD256(A, B) _mm256_add_epi32(A, B)
- #define SUB_I32_SIMD256(A, B) _mm256_sub_epi32(A, B)
- #define MUL_I32_SIMD256(A, B) _mm256_mullo_epi32(A, B)
- // Vector integer operations returning SIMD_U32x8
- #define ADD_U32_SIMD256(A, B) _mm256_add_epi32(A, B)
- #define SUB_U32_SIMD256(A, B) _mm256_sub_epi32(A, B)
- #define MUL_U32_SIMD256(A, B) _mm256_mullo_epi32(A, B)
- // Vector integer operations returning SIMD_U16x16
- #define ADD_U16_SIMD256(A, B) _mm256_add_epi16(A, B)
- #define SUB_U16_SIMD256(A, B) _mm256_sub_epi16(A, B)
- #define MUL_U16_SIMD256(A, B) _mm256_mullo_epi16(A, B)
- // Vector integer operations returning SIMD_U8x32
- #define ADD_U8_SIMD256(A, B) _mm256_add_epi8(A, B)
- #define ADD_SAT_U8_SIMD256(A, B) _mm256_adds_epu8(A, B) // Saturated addition
- #define SUB_U8_SIMD256(A, B) _mm256_sub_epi8(A, B)
- #define SUB_SAT_U8_SIMD256(A, B) _mm256_subs_epu8(A, B) // Saturated subtraction
- // No 8-bit multiplications
- // Bitwise
- #define BITWISE_AND_U32_SIMD256(A, B) _mm256_and_si256(A, B)
- #define BITWISE_OR_U32_SIMD256(A, B) _mm256_or_si256(A, B)
- #define BITWISE_XOR_U32_SIMD256(A, B) _mm256_xor_si256(A, B)
- #endif
- #endif
- #endif
- // Everything declared in here handles things specific to NEON.
- // Direct use of the macros will not provide portability to all hardware.
- #ifdef USE_NEON
- #include <arm_neon.h> // NEON
- // Vector types
- #define SIMD_F32x4 float32x4_t
- #define SIMD_U8x16 uint8x16_t
- #define SIMD_U16x8 uint16x8_t
- #define SIMD_U32x4 uint32x4_t
- #define SIMD_I32x4 int32x4_t
- // Vector uploads in address order
- inline SIMD_F32x4 LOAD_VECTOR_F32_SIMD(float a, float b, float c, float d) {
- float data[4] ALIGN16 = {a, b, c, d};
- return vld1q_f32(data);
- }
- inline SIMD_F32x4 LOAD_SCALAR_F32_SIMD(float a) {
- return vdupq_n_f32(a);
- }
- inline SIMD_U8x16 LOAD_VECTOR_U8_SIMD(uint8_t a, uint8_t b, uint8_t c, uint8_t d, uint8_t e, uint8_t f, uint8_t g, uint8_t h,
- uint8_t i, uint8_t j, uint8_t k, uint8_t l, uint8_t m, uint8_t n, uint8_t o, uint8_t p) {
- uint8_t data[16] ALIGN16 = {a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p};
- return vld1q_u8(data);
- }
- inline SIMD_U8x16 LOAD_SCALAR_U8_SIMD(uint8_t a) {
- return vdupq_n_u8(a);
- }
- inline SIMD_U16x8 LOAD_VECTOR_U16_SIMD(uint16_t a, uint16_t b, uint16_t c, uint16_t d, uint16_t e, uint16_t f, uint16_t g, uint16_t h) {
- uint16_t data[8] ALIGN16 = {a, b, c, d, e, f, g, h};
- return vld1q_u16(data);
- }
- inline SIMD_U16x8 LOAD_SCALAR_U16_SIMD(uint16_t a) {
- return vdupq_n_u16(a);
- }
- inline SIMD_U32x4 LOAD_VECTOR_U32_SIMD(uint32_t a, uint32_t b, uint32_t c, uint32_t d) {
- uint32_t data[4] ALIGN16 = {a, b, c, d};
- return vld1q_u32(data);
- }
- inline SIMD_U32x4 LOAD_SCALAR_U32_SIMD(uint32_t a) {
- return vdupq_n_u32(a);
- }
- inline SIMD_I32x4 LOAD_VECTOR_I32_SIMD(int32_t a, int32_t b, int32_t c, int32_t d) {
- int32_t data[4] ALIGN16 = {a, b, c, d};
- return vld1q_s32(data);
- }
- inline SIMD_I32x4 LOAD_SCALAR_I32_SIMD(int32_t a) {
- return vdupq_n_s32(a);
- }
- // Conversions
- #define F32_TO_I32_SIMD(A) vcvtq_s32_f32(A)
- #define F32_TO_U32_SIMD(A) vcvtq_u32_f32(A)
- #define I32_TO_F32_SIMD(A) vcvtq_f32_s32(A)
- #define U32_TO_F32_SIMD(A) vcvtq_f32_u32(A)
- // Unpacking conversions
- #define U8_LOW_TO_U16_SIMD(A) vmovl_u8(vget_low_u8(A))
- #define U8_HIGH_TO_U16_SIMD(A) vmovl_u8(vget_high_u8(A))
- #define U16_LOW_TO_U32_SIMD(A) vmovl_u16(vget_low_u16(A))
- #define U16_HIGH_TO_U32_SIMD(A) vmovl_u16(vget_high_u16(A))
- // Saturated packing
- #define PACK_SAT_U16_TO_U8_SIMD(A, B) vcombine_u8(vqmovn_u16(A), vqmovn_u16(B))
- // Reinterpret casting
- #define REINTERPRET_U32_TO_U8_SIMD(A) vreinterpretq_u8_u32(A)
- #define REINTERPRET_U32_TO_U16_SIMD(A) vreinterpretq_u16_u32(A)
- #define REINTERPRET_U8_TO_U32_SIMD(A) vreinterpretq_u32_u8(A)
- #define REINTERPRET_U16_TO_U32_SIMD(A) vreinterpretq_u32_u16(A)
- #define REINTERPRET_U32_TO_I32_SIMD(A) vreinterpretq_s32_u32(A)
- #define REINTERPRET_I32_TO_U32_SIMD(A) vreinterpretq_u32_s32(A)
- // Vector float operations returning SIMD_F32x4
- #define ADD_F32_SIMD(A, B) vaddq_f32(A, B)
- #define SUB_F32_SIMD(A, B) vsubq_f32(A, B)
- #define MUL_F32_SIMD(A, B) vmulq_f32(A, B)
- // Vector integer operations returning SIMD_I32x4
- #define ADD_I32_SIMD(A, B) vaddq_s32(A, B)
- #define SUB_I32_SIMD(A, B) vsubq_s32(A, B)
- #define MUL_I32_NEON(A, B) vmulq_s32(A, B)
- // Vector integer operations returning SIMD_U32x4
- #define ADD_U32_SIMD(A, B) vaddq_u32(A, B)
- #define SUB_U32_SIMD(A, B) vsubq_u32(A, B)
- #define MUL_U32_NEON(A, B) vmulq_u32(A, B)
- // Vector integer operations returning SIMD_U16x8
- #define ADD_U16_SIMD(A, B) vaddq_u16(A, B)
- #define SUB_U16_SIMD(A, B) vsubq_u16(A, B)
- #define MUL_U16_SIMD(A, B) vmulq_u16(A, B)
- // Vector integer operations returning SIMD_U8x16
- #define ADD_U8_SIMD(A, B) vaddq_u8(A, B)
- #define ADD_SAT_U8_SIMD(A, B) vqaddq_u8(A, B) // Saturated addition
- #define SUB_U8_SIMD(A, B) vsubq_u8(A, B)
- #define SUB_SAT_U8_SIMD(A, B) vqsubq_u8(A, B) // Saturated subtraction
- // No 8-bit multiplications
- // Statistics
- #define MIN_F32_SIMD(A, B) vminq_f32(A, B)
- #define MAX_F32_SIMD(A, B) vmaxq_f32(A, B)
- // Bitwise
- #define BITWISE_AND_U32_SIMD(A, B) vandq_u32(A, B)
- #define BITWISE_OR_U32_SIMD(A, B) vorrq_u32(A, B)
- #define BITWISE_XOR_U32_SIMD(A, B) veorq_u32(A, B)
- #endif
- /*
- The vector types below are supposed to be portable across different CPU architectures.
- When mixed with handwritten SIMD intrinsics:
- Use "USE_SSE2" instead of "__SSE2__"
- Use "USE_AVX2" instead of "__AVX2__"
- Use "USE_NEON" instead of "__ARM_NEON"
- So that any new variations of the macro names given by the compiler can be added to simd.h instead of being duplicated everywhere.
- Portability exceptions:
- * The "v" variable is the native backend, which is only defined when SIMD is supported by hardware.
- Only use when USE_BASIC_SIMD is defined.
- Will not work on scalar emulation.
- * The "scalars" array is available when emulating a type that does not exist or the SIMD vector has direct access to the memory.
- Do not rely on these for accessing elements, because otherwise your code will not be able to compile for ARM NEON.
- */
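- // Example (a hedged sketch) of mixing a handwritten SSE intrinsic with the portable types, following the rules above:
- //     #ifdef USE_SSE2
- //         x = dsr::F32x4(_mm_rcp_ps(x.v)); // USE_SSE2 implies hardware SIMD, so both the intrinsic and the native member v exist.
- //     #endif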
- union F32x4 {
- private:
- // The uninitialized default constructor is private for safety reasons.
- F32x4() {}
- public:
- // When the uninitialized constructor is needed for performance, use this named constructor instead.
- static inline F32x4 create_dangerous_uninitialized() { return F32x4(); }
- #ifdef USE_BASIC_SIMD
- public:
- #ifdef USE_DIRECT_SIMD_MEMORY_ACCESS
- // Only use if USE_DIRECT_SIMD_MEMORY_ACCESS is defined!
- // Direct access cannot be done on NEON!
- float scalars[4];
- #endif
- // The SIMD vector of undefined type
- // Not accessible while emulating!
- SIMD_F32x4 v;
- // Construct a portable vector from a native SIMD vector
- explicit F32x4(const SIMD_F32x4& v) : v(v) {}
- // Construct a portable vector from a set of scalars
- F32x4(float a1, float a2, float a3, float a4) : v(LOAD_VECTOR_F32_SIMD(a1, a2, a3, a4)) {}
- // Construct a portable vector from a single duplicated scalar
- explicit F32x4(float scalar) : v(LOAD_SCALAR_F32_SIMD(scalar)) {}
- #else
- public:
- // Emulate a SIMD vector as an array of scalars without hardware support.
- // Only accessible while emulating!
- float scalars[4];
- // Construct a portable vector from a set of scalars
- F32x4(float a1, float a2, float a3, float a4) {
- this->scalars[0] = a1;
- this->scalars[1] = a2;
- this->scalars[2] = a3;
- this->scalars[3] = a4;
- }
- // Construct a portable vector from a single duplicated scalar
- explicit F32x4(float scalar) {
- this->scalars[0] = scalar;
- this->scalars[1] = scalar;
- this->scalars[2] = scalar;
- this->scalars[3] = scalar;
- }
- #endif
- // Create a gradient vector using start and increment, so that arbitrary length vectors have a way to initialize linear iterations.
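- // For example, createGradient(10.0f, 2.0f) gives (10.0f, 12.0f, 14.0f, 16.0f).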
- static inline F32x4 createGradient(float start, float increment) {
- return F32x4(start, start + increment, start + increment * 2.0f, start + increment * 3.0f);
- }
- // Construct a portable SIMD vector from a pointer to aligned data
- // data must be aligned to 16 bytes
- static inline F32x4 readAlignedUnsafe(const float* data) {
- #ifdef USE_BASIC_SIMD
- #if defined USE_SSE2
- return F32x4(_mm_load_ps(data));
- #elif defined USE_NEON
- return F32x4(vld1q_f32(data));
- #endif
- #else
- return F32x4(data[0], data[1], data[2], data[3]);
- #endif
- }
- // Write to aligned memory from the existing vector
- // data must be aligned to 16 bytes
- inline void writeAlignedUnsafe(float* data) const {
- #if defined USE_BASIC_SIMD
- #if defined USE_SSE2
- _mm_store_ps(data, this->v);
- #elif defined USE_NEON
- vst1q_f32(data, this->v);
- #endif
- #else
- data[0] = this->scalars[0];
- data[1] = this->scalars[1];
- data[2] = this->scalars[2];
- data[3] = this->scalars[3];
- #endif
- }
- #if defined DFPSR_GEOMETRY_FVECTOR
- dsr::FVector4D get() const {
- float data[4] ALIGN16;
- this->writeAlignedUnsafe(data);
- return dsr::FVector4D(data[0], data[1], data[2], data[3]);
- }
- #endif
- // Bound and alignment checked reading
- static inline F32x4 readAligned(dsr::SafePointer<const float> data, const char* methodName) {
- const float* pointer = data.getUnsafe();
- assert(((uintptr_t)pointer & 15) == 0);
- #if defined SAFE_POINTER_CHECKS
- data.assertInside(methodName, pointer, 16);
- #endif
- return F32x4::readAlignedUnsafe(pointer);
- }
- // Bound and alignment checked writing
- inline void writeAligned(dsr::SafePointer<float> data, const char* methodName) const {
- float* pointer = data.getUnsafe();
- assert(((uintptr_t)pointer & 15) == 0);
- #if defined SAFE_POINTER_CHECKS
- data.assertInside(methodName, pointer, 16);
- #endif
- this->writeAlignedUnsafe(pointer);
- }
- // 1 / x
- // Useful for multiple divisions with the same denominator
- // Useless if the denominator is a constant
- F32x4 reciprocal() const {
- #if defined USE_BASIC_SIMD
- #if defined USE_SSE2
- // Approximate
- SIMD_F32x4 lowQ = _mm_rcp_ps(this->v);
- // Refine
- return F32x4(SUB_F32_SIMD(ADD_F32_SIMD(lowQ, lowQ), MUL_F32_SIMD(this->v, MUL_F32_SIMD(lowQ, lowQ))));
- #elif defined USE_NEON
- // Approximate
- SIMD_F32x4 result = vrecpeq_f32(this->v);
- // Refine
- result = MUL_F32_SIMD(vrecpsq_f32(this->v, result), result);
- return F32x4(MUL_F32_SIMD(vrecpsq_f32(this->v, result), result));
- #else
- assert(false);
- return F32x4(0);
- #endif
- #else
- return F32x4(1.0f / this->scalars[0], 1.0f / this->scalars[1], 1.0f / this->scalars[2], 1.0f / this->scalars[3]);
- #endif
- }
- // 1 / sqrt(x)
- // Useful for normalizing vectors
- F32x4 reciprocalSquareRoot() const {
- #if defined USE_BASIC_SIMD
- #if defined USE_SSE2
- SIMD_F32x4 reciRoot = _mm_rsqrt_ps(this->v);
- SIMD_F32x4 mul = MUL_F32_SIMD(MUL_F32_SIMD(this->v, reciRoot), reciRoot);
- reciRoot = MUL_F32_SIMD(MUL_F32_SIMD(LOAD_SCALAR_F32_SIMD(0.5f), reciRoot), SUB_F32_SIMD(LOAD_SCALAR_F32_SIMD(3.0f), mul));
- return F32x4(reciRoot);
- #elif defined USE_NEON
- // Approximate
- SIMD_F32x4 reciRoot = vrsqrteq_f32(this->v);
- // Refine
- reciRoot = MUL_F32_SIMD(vrsqrtsq_f32(MUL_F32_SIMD(this->v, reciRoot), reciRoot), reciRoot);
- return F32x4(reciRoot);
- #else
- assert(false);
- return F32x4(0);
- #endif
- #else
- return F32x4(1.0f / sqrt(this->scalars[0]), 1.0f / sqrt(this->scalars[1]), 1.0f / sqrt(this->scalars[2]), 1.0f / sqrt(this->scalars[3]));
- #endif
- }
- // sqrt(x)
- // Useful for getting lengths of vectors
- F32x4 squareRoot() const {
- #if defined USE_BASIC_SIMD
- #if defined USE_SSE2
- SIMD_F32x4 half = LOAD_SCALAR_F32_SIMD(0.5f);
- // Approximate
- SIMD_F32x4 root = _mm_sqrt_ps(this->v);
- // Refine
- root = _mm_mul_ps(_mm_add_ps(root, _mm_div_ps(this->v, root)), half);
- return F32x4(root);
- #elif defined USE_NEON
- return F32x4(MUL_F32_SIMD(this->v, this->reciprocalSquareRoot().v));
- #else
- assert(false);
- return F32x4(0);
- #endif
- #else
- return F32x4(sqrt(this->scalars[0]), sqrt(this->scalars[1]), sqrt(this->scalars[2]), sqrt(this->scalars[3]));
- #endif
- }
- F32x4 clamp(float minimum, float maximum) const {
- #if defined USE_BASIC_SIMD
- return F32x4(MIN_F32_SIMD(MAX_F32_SIMD(this->v, LOAD_SCALAR_F32_SIMD(minimum)), LOAD_SCALAR_F32_SIMD(maximum)));
- #else
- float val0 = this->scalars[0];
- float val1 = this->scalars[1];
- float val2 = this->scalars[2];
- float val3 = this->scalars[3];
- if (minimum > val0) { val0 = minimum; }
- if (maximum < val0) { val0 = maximum; }
- if (minimum > val1) { val1 = minimum; }
- if (maximum < val1) { val1 = maximum; }
- if (minimum > val2) { val2 = minimum; }
- if (maximum < val2) { val2 = maximum; }
- if (minimum > val3) { val3 = minimum; }
- if (maximum < val3) { val3 = maximum; }
- return F32x4(val0, val1, val2, val3);
- #endif
- }
- F32x4 clampLower(float minimum) const {
- #if defined USE_BASIC_SIMD
- return F32x4(MAX_F32_SIMD(this->v, LOAD_SCALAR_F32_SIMD(minimum)));
- #else
- float val0 = this->scalars[0];
- float val1 = this->scalars[1];
- float val2 = this->scalars[2];
- float val3 = this->scalars[3];
- if (minimum > val0) { val0 = minimum; }
- if (minimum > val1) { val1 = minimum; }
- if (minimum > val2) { val2 = minimum; }
- if (minimum > val3) { val3 = minimum; }
- return F32x4(val0, val1, val2, val3);
- #endif
- }
- F32x4 clampUpper(float maximum) const {
- #if defined USE_BASIC_SIMD
- return F32x4(MIN_F32_SIMD(this->v, LOAD_SCALAR_F32_SIMD(maximum)));
- #else
- float val0 = this->scalars[0];
- float val1 = this->scalars[1];
- float val2 = this->scalars[2];
- float val3 = this->scalars[3];
- if (maximum < val0) { val0 = maximum; }
- if (maximum < val1) { val1 = maximum; }
- if (maximum < val2) { val2 = maximum; }
- if (maximum < val3) { val3 = maximum; }
- return F32x4(val0, val1, val2, val3);
- #endif
- }
- };
- union I32x4 {
- private:
- // The uninitialized default constructor is private for safety reasons.
- I32x4() {}
- public:
- // When the uninitialized constructor is needed for performance, use this named constructor instead.
- static inline I32x4 create_dangerous_uninitialized() { return I32x4(); }
- #if defined USE_BASIC_SIMD
- public:
- #if defined USE_DIRECT_SIMD_MEMORY_ACCESS
- // Only use if USE_DIRECT_SIMD_MEMORY_ACCESS is defined!
- // Direct access cannot be done on NEON!
- int32_t scalars[4];
- #endif
- // The SIMD vector of undefined type
- // Not accessible while emulating!
- SIMD_I32x4 v;
- // Construct a portable vector from a native SIMD vector
- explicit I32x4(const SIMD_I32x4& v) : v(v) {}
- // Construct a portable vector from a set of scalars
- I32x4(int32_t a1, int32_t a2, int32_t a3, int32_t a4) : v(LOAD_VECTOR_I32_SIMD(a1, a2, a3, a4)) {}
- // Construct a portable vector from a single duplicated scalar
- explicit I32x4(int32_t scalar) : v(LOAD_SCALAR_I32_SIMD(scalar)) {}
- #else
- public:
- // Emulate a SIMD vector as an array of scalars without hardware support.
- // Only accessible while emulating!
- int32_t scalars[4];
- // Construct a portable vector from a set of scalars
- I32x4(int32_t a1, int32_t a2, int32_t a3, int32_t a4) {
- this->scalars[0] = a1;
- this->scalars[1] = a2;
- this->scalars[2] = a3;
- this->scalars[3] = a4;
- }
- // Construct a portable vector from a single duplicated scalar
- explicit I32x4(int32_t scalar) {
- this->scalars[0] = scalar;
- this->scalars[1] = scalar;
- this->scalars[2] = scalar;
- this->scalars[3] = scalar;
- }
- #endif
- // Create a gradient vector using start and increment, so that arbitrary length vectors have a way to initialize linear iterations.
- static inline I32x4 createGradient(int32_t start, int32_t increment) {
- return I32x4(start, start + increment, start + increment * 2, start + increment * 3);
- }
- // Construct a portable SIMD vector from a pointer to aligned data
- // data must be aligned to 16 bytes
- static inline I32x4 readAlignedUnsafe(const int32_t* data) {
- #if defined USE_BASIC_SIMD
- #if defined USE_SSE2
- return I32x4(_mm_load_si128((const __m128i*)data));
- #elif defined USE_NEON
- return I32x4(vld1q_s32(data));
- #endif
- #else
- return I32x4(data[0], data[1], data[2], data[3]);
- #endif
- }
- // Write to aligned memory from the existing vector
- // data must be aligned to 16 bytes
- inline void writeAlignedUnsafe(int32_t* data) const {
- #if defined USE_BASIC_SIMD
- #if defined USE_SSE2
- _mm_store_si128((__m128i*)data, this->v);
- #elif defined USE_NEON
- vst1q_s32(data, this->v);
- #endif
- #else
- data[0] = this->scalars[0];
- data[1] = this->scalars[1];
- data[2] = this->scalars[2];
- data[3] = this->scalars[3];
- #endif
- }
- #if defined DFPSR_GEOMETRY_IVECTOR
- dsr::IVector4D get() const {
- int32_t data[4] ALIGN16;
- this->writeAlignedUnsafe(data);
- return dsr::IVector4D(data[0], data[1], data[2], data[3]);
- }
- #endif
- // Bound and alignment checked reading
- static inline I32x4 readAligned(dsr::SafePointer<const int32_t> data, const char* methodName) {
- const int32_t* pointer = data.getUnsafe();
- assert(((uintptr_t)pointer & 15) == 0);
- #if defined SAFE_POINTER_CHECKS
- data.assertInside(methodName, pointer, 16);
- #endif
- return I32x4::readAlignedUnsafe(pointer);
- }
- // Bound and alignment checked writing
- inline void writeAligned(dsr::SafePointer<int32_t> data, const char* methodName) const {
- int32_t* pointer = data.getUnsafe();
- assert(((uintptr_t)pointer & 15) == 0);
- #if defined SAFE_POINTER_CHECKS
- data.assertInside(methodName, pointer, 16);
- #endif
- this->writeAlignedUnsafe(pointer);
- }
- };
- union U32x4 {
- #if defined USE_BASIC_SIMD
- public:
- #if defined USE_DIRECT_SIMD_MEMORY_ACCESS
- // Only use if USE_DIRECT_SIMD_MEMORY_ACCESS is defined!
- // Direct access cannot be done on NEON!
- uint32_t scalars[4];
- #endif
- // The SIMD vector of undefined type
- // Not accessible while emulating!
- SIMD_U32x4 v;
- // Construct a portable vector from a native SIMD vector
- explicit U32x4(const SIMD_U32x4& v) : v(v) {}
- // Construct a portable vector from a set of scalars
- U32x4(uint32_t a1, uint32_t a2, uint32_t a3, uint32_t a4) : v(LOAD_VECTOR_U32_SIMD(a1, a2, a3, a4)) {}
- // Construct a portable vector from a single duplicated scalar
- explicit U32x4(uint32_t scalar) : v(LOAD_SCALAR_U32_SIMD(scalar)) {}
- #else
- public:
- // Emulate a SIMD vector as an array of scalars without hardware support.
- // Only accessible while emulating!
- uint32_t scalars[4];
- // Construct a portable vector from a set of scalars
- U32x4(uint32_t a1, uint32_t a2, uint32_t a3, uint32_t a4) {
- this->scalars[0] = a1;
- this->scalars[1] = a2;
- this->scalars[2] = a3;
- this->scalars[3] = a4;
- }
- // Construct a portable vector from a single duplicated scalar
- explicit U32x4(uint32_t scalar) {
- this->scalars[0] = scalar;
- this->scalars[1] = scalar;
- this->scalars[2] = scalar;
- this->scalars[3] = scalar;
- }
- #endif
- // Create a gradient vector using start and increment, so that arbitrary length vectors have a way to initialize linear iterations.
- static inline U32x4 createGradient(uint32_t start, uint32_t increment) {
- return U32x4(start, start + increment, start + increment * 2, start + increment * 3);
- }
- // Construct a portable SIMD vector from a pointer to aligned data
- // data must be aligned to 16 bytes
- static inline U32x4 readAlignedUnsafe(const uint32_t* data) {
- #if defined USE_BASIC_SIMD
- #if defined USE_SSE2
- return U32x4(_mm_load_si128((const __m128i*)data));
- #elif defined USE_NEON
- return U32x4(vld1q_u32(data));
- #endif
- #else
- return U32x4(data[0], data[1], data[2], data[3]);
- #endif
- }
- // Write to aligned memory from the existing vector
- // data must be aligned to 16 bytes
- inline void writeAlignedUnsafe(uint32_t* data) const {
- #if defined USE_BASIC_SIMD
- #if defined USE_SSE2
- _mm_store_si128((__m128i*)data, this->v);
- #elif defined USE_NEON
- vst1q_u32(data, this->v);
- #endif
- #else
- data[0] = this->scalars[0];
- data[1] = this->scalars[1];
- data[2] = this->scalars[2];
- data[3] = this->scalars[3];
- #endif
- }
- #if defined DFPSR_GEOMETRY_UVECTOR
- dsr::UVector4D get() const {
- uint32_t data[4] ALIGN16;
- this->writeAlignedUnsafe(data);
- return dsr::UVector4D(data[0], data[1], data[2], data[3]);
- }
- #endif
- // Bound and alignment checked reading
- static inline U32x4 readAligned(dsr::SafePointer<const uint32_t> data, const char* methodName) {
- const uint32_t* pointer = data.getUnsafe();
- assert(((uintptr_t)pointer & 15) == 0);
- #if defined SAFE_POINTER_CHECKS
- data.assertInside(methodName, pointer, 16);
- #endif
- return U32x4::readAlignedUnsafe(pointer);
- }
- // Bound and alignment checked writing
- inline void writeAligned(dsr::SafePointer<uint32_t> data, const char* methodName) const {
- uint32_t* pointer = data.getUnsafe();
- assert(((uintptr_t)pointer & 15) == 0);
- #if defined SAFE_POINTER_CHECKS
- data.assertInside(methodName, pointer, 16);
- #endif
- this->writeAlignedUnsafe(pointer);
- }
- };
- union U16x8 {
- private:
- // The uninitialized default constructor is private for safety reasons.
- U16x8() {}
- public:
- // When the uninitialized constructor is needed for performance, use this named constructor instead.
- static inline U16x8 create_dangerous_uninitialized() { return U16x8(); }
- #if defined USE_BASIC_SIMD
- public:
- #if defined USE_DIRECT_SIMD_MEMORY_ACCESS
- // Only use if USE_DIRECT_SIMD_MEMORY_ACCESS is defined!
- // Direct access cannot be done on NEON!
- uint16_t scalars[8];
- #endif
- // The SIMD vector of undefined type
- // Not accessible while emulating!
- SIMD_U16x8 v;
- // Construct a portable vector from a native SIMD vector
- explicit U16x8(const SIMD_U16x8& v) : v(v) {}
- // Construct a vector of 8 x 16-bit unsigned integers from a vector of 4 x 32-bit unsigned integers
- // Reinterpret casting is used
- explicit U16x8(const U32x4& vector) : v(REINTERPRET_U32_TO_U16_SIMD(vector.v)) {}
- // Construct a portable vector from a set of scalars
- U16x8(uint16_t a1, uint16_t a2, uint16_t a3, uint16_t a4, uint16_t a5, uint16_t a6, uint16_t a7, uint16_t a8) : v(LOAD_VECTOR_U16_SIMD(a1, a2, a3, a4, a5, a6, a7, a8)) {}
- // Construct a vector of 8 x 16-bit unsigned integers from a single duplicated 32-bit unsigned integer
- // Reinterpret casting is used
- // TODO: Remove all reinterprets from constructors to improve readability
- explicit U16x8(uint32_t scalar) : v(REINTERPRET_U32_TO_U16_SIMD(LOAD_SCALAR_U32_SIMD(scalar))) {}
- // Construct a portable vector from a single duplicated scalar
- explicit U16x8(uint16_t scalar) : v(LOAD_SCALAR_U16_SIMD(scalar)) {}
- // Reinterpret cast to a vector of 4 x 32-bit unsigned integers
- U32x4 get_U32() const {
- return U32x4(REINTERPRET_U16_TO_U32_SIMD(this->v));
- }
- #else
- public:
- // Emulate a SIMD vector as an array of scalars without hardware support.
- // Only accessible while emulating!
- uint16_t scalars[8];
- // Construct a vector of 8 x 16-bit unsigned integers from a vector of 4 x 32-bit unsigned integers
- // Reinterpret casting is used
- explicit U16x8(const U32x4& vector) {
- uint64_t *target = (uint64_t*)this->scalars;
- uint64_t *source = (uint64_t*)vector.scalars;
- target[0] = source[0];
- target[1] = source[1];
- }
- // Construct a portable vector from a set of scalars
- U16x8(uint16_t a1, uint16_t a2, uint16_t a3, uint16_t a4, uint16_t a5, uint16_t a6, uint16_t a7, uint16_t a8) {
- this->scalars[0] = a1;
- this->scalars[1] = a2;
- this->scalars[2] = a3;
- this->scalars[3] = a4;
- this->scalars[4] = a5;
- this->scalars[5] = a6;
- this->scalars[6] = a7;
- this->scalars[7] = a8;
- }
- // Construct a vector of 8 x 16-bit unsigned integers from a single duplicated 32-bit unsigned integer
- // Reinterpret casting is used
- explicit U16x8(uint32_t scalar) {
- uint32_t *target = (uint32_t*)this->scalars;
- target[0] = scalar;
- target[1] = scalar;
- target[2] = scalar;
- target[3] = scalar;
- }
- // Construct a portable vector from a single duplicated scalar
- explicit U16x8(uint16_t scalar) {
- this->scalars[0] = scalar;
- this->scalars[1] = scalar;
- this->scalars[2] = scalar;
- this->scalars[3] = scalar;
- this->scalars[4] = scalar;
- this->scalars[5] = scalar;
- this->scalars[6] = scalar;
- this->scalars[7] = scalar;
- }
- // Reinterpret cast to a vector of 4 x 32-bit unsigned integers
- U32x4 get_U32() const {
- U32x4 result(0);
- uint64_t *target = (uint64_t*)result.scalars;
- uint64_t *source = (uint64_t*)this->scalars;
- target[0] = source[0];
- target[1] = source[1];
- return result;
- }
- #endif
- // Create a gradient vector using start and increment, so that arbitrary length vectors have a way to initialize linear iterations.
- static inline U16x8 createGradient(uint16_t start, uint16_t increment) {
- return U16x8(
- start,
- start + increment,
- start + increment * 2,
- start + increment * 3,
- start + increment * 4,
- start + increment * 5,
- start + increment * 6,
- start + increment * 7
- );
- }
- static inline U16x8 readAlignedUnsafe(const uint16_t* data) {
- #if defined USE_BASIC_SIMD
- #if defined USE_SSE2
- return U16x8(_mm_load_si128((const __m128i*)data));
- #elif defined USE_NEON
- return U16x8(vld1q_u16(data));
- #endif
- #else
- return U16x8(data[0], data[1], data[2], data[3], data[4], data[5], data[6], data[7]);
- #endif
- }
- // data must be aligned to 16 bytes
- inline void writeAlignedUnsafe(uint16_t* data) const {
- #if defined USE_BASIC_SIMD
- #if defined USE_SSE2
- _mm_store_si128((__m128i*)data, this->v);
- #elif defined USE_NEON
- vst1q_u16(data, this->v);
- #endif
- #else
- data[0] = this->scalars[0];
- data[1] = this->scalars[1];
- data[2] = this->scalars[2];
- data[3] = this->scalars[3];
- data[4] = this->scalars[4];
- data[5] = this->scalars[5];
- data[6] = this->scalars[6];
- data[7] = this->scalars[7];
- #endif
- }
- // Bound and alignment checked reading
- static inline U16x8 readAligned(dsr::SafePointer<const uint16_t> data, const char* methodName) {
- const uint16_t* pointer = data.getUnsafe();
- assert(((uintptr_t)pointer & 15) == 0);
- #if defined SAFE_POINTER_CHECKS
- data.assertInside(methodName, pointer, 16);
- #endif
- return U16x8::readAlignedUnsafe(pointer);
- }
- // Bound and alignment checked writing
- inline void writeAligned(dsr::SafePointer<uint16_t> data, const char* methodName) const {
- uint16_t* pointer = data.getUnsafe();
- assert(((uintptr_t)pointer & 15) == 0);
- #if defined SAFE_POINTER_CHECKS
- data.assertInside(methodName, pointer, 16);
- #endif
- this->writeAlignedUnsafe(pointer);
- }
- };
- union U8x16 {
- private:
- // The uninitialized default constructor is private for safety reasons.
- U8x16() {}
- public:
- // When the uninitialized constructor is needed for performance, use this named constructor instead.
- static inline U8x16 create_dangerous_uninitialized() { return U8x16(); }
- #if defined USE_BASIC_SIMD
- public:
- #if defined USE_DIRECT_SIMD_MEMORY_ACCESS
- // Only use if USE_DIRECT_SIMD_MEMORY_ACCESS is defined!
- // Direct access cannot be done on NEON!
- uint8_t scalars[16];
- #endif
- // The SIMD vector of undefined type
- // Not accessible while emulating!
- SIMD_U8x16 v;
- // Construct a portable vector from a native SIMD vector
- explicit U8x16(const SIMD_U8x16& v) : v(v) {}
- // Construct a portable vector from a set of scalars
- U8x16(uint8_t a1, uint8_t a2, uint8_t a3, uint8_t a4, uint8_t a5, uint8_t a6, uint8_t a7, uint8_t a8,
- uint8_t a9, uint8_t a10, uint8_t a11, uint8_t a12, uint8_t a13, uint8_t a14, uint8_t a15, uint8_t a16)
- : v(LOAD_VECTOR_U8_SIMD(a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11, a12, a13, a14, a15, a16)) {}
- // Construct a portable vector from a single duplicated scalar
- explicit U8x16(uint8_t scalar) : v(LOAD_SCALAR_U8_SIMD(scalar)) {}
- #else
- public:
- // Emulate a SIMD vector as an array of scalars without hardware support.
- // Only accessible while emulating!
- uint8_t scalars[16];
- // Construct a portable vector from a set of scalars
- U8x16(uint8_t a1, uint8_t a2, uint8_t a3, uint8_t a4, uint8_t a5, uint8_t a6, uint8_t a7, uint8_t a8,
- uint8_t a9, uint8_t a10, uint8_t a11, uint8_t a12, uint8_t a13, uint8_t a14, uint8_t a15, uint8_t a16) {
- this->scalars[0] = a1;
- this->scalars[1] = a2;
- this->scalars[2] = a3;
- this->scalars[3] = a4;
- this->scalars[4] = a5;
- this->scalars[5] = a6;
- this->scalars[6] = a7;
- this->scalars[7] = a8;
- this->scalars[8] = a9;
- this->scalars[9] = a10;
- this->scalars[10] = a11;
- this->scalars[11] = a12;
- this->scalars[12] = a13;
- this->scalars[13] = a14;
- this->scalars[14] = a15;
- this->scalars[15] = a16;
- }
- // Construct a portable vector from a single duplicated scalar
- explicit U8x16(uint8_t scalar) {
- this->scalars[0] = scalar;
- this->scalars[1] = scalar;
- this->scalars[2] = scalar;
- this->scalars[3] = scalar;
- this->scalars[4] = scalar;
- this->scalars[5] = scalar;
- this->scalars[6] = scalar;
- this->scalars[7] = scalar;
- this->scalars[8] = scalar;
- this->scalars[9] = scalar;
- this->scalars[10] = scalar;
- this->scalars[11] = scalar;
- this->scalars[12] = scalar;
- this->scalars[13] = scalar;
- this->scalars[14] = scalar;
- this->scalars[15] = scalar;
- }
- #endif
- // Create a gradient vector using start and increment, so that arbitrary length vectors have a way to initialize linear iterations.
- static inline U8x16 createGradient(uint8_t start, uint8_t increment) {
- return U8x16(
- start,
- start + increment,
- start + increment * 2,
- start + increment * 3,
- start + increment * 4,
- start + increment * 5,
- start + increment * 6,
- start + increment * 7,
- start + increment * 8,
- start + increment * 9,
- start + increment * 10,
- start + increment * 11,
- start + increment * 12,
- start + increment * 13,
- start + increment * 14,
- start + increment * 15
- );
- }
- static inline U8x16 readAlignedUnsafe(const uint8_t* data) {
- #if defined USE_BASIC_SIMD
- #if defined USE_SSE2
- return U8x16(_mm_load_si128((const __m128i*)data));
- #elif defined USE_NEON
- return U8x16(vld1q_u8(data));
- #endif
- #else
- return U8x16(
- data[0], data[1], data[2], data[3], data[4], data[5], data[6], data[7],
- data[8], data[9], data[10], data[11], data[12], data[13], data[14], data[15]
- );
- #endif
- }
- // data must be aligned to 16 bytes for the aligned SIMD store
- inline void writeAlignedUnsafe(uint8_t* data) const {
- #if defined USE_BASIC_SIMD
- #if defined USE_SSE2
- _mm_store_si128((__m128i*)data, this->v);
- #elif defined USE_NEON
- vst1q_u8(data, this->v);
- #endif
- #else
- data[0] = this->scalars[0];
- data[1] = this->scalars[1];
- data[2] = this->scalars[2];
- data[3] = this->scalars[3];
- data[4] = this->scalars[4];
- data[5] = this->scalars[5];
- data[6] = this->scalars[6];
- data[7] = this->scalars[7];
- data[8] = this->scalars[8];
- data[9] = this->scalars[9];
- data[10] = this->scalars[10];
- data[11] = this->scalars[11];
- data[12] = this->scalars[12];
- data[13] = this->scalars[13];
- data[14] = this->scalars[14];
- data[15] = this->scalars[15];
- #endif
- }
- // Bound and alignment checked reading
- static inline U8x16 readAligned(dsr::SafePointer<const uint8_t> data, const char* methodName) {
- const uint8_t* pointer = data.getUnsafe();
- assert(((uintptr_t)pointer & 15) == 0);
- #if defined SAFE_POINTER_CHECKS
- data.assertInside(methodName, pointer, 16);
- #endif
- return U8x16::readAlignedUnsafe(pointer);
- }
- // Bound and alignment checked writing
- inline void writeAligned(dsr::SafePointer<uint8_t> data, const char* methodName) const {
- uint8_t* pointer = data.getUnsafe();
- assert(((uintptr_t)pointer & 15) == 0);
- #if defined SAFE_POINTER_CHECKS
- data.assertInside(methodName, pointer, 16);
- #endif
- this->writeAlignedUnsafe(pointer);
- }
- };
- union F32x8 {
- private:
- // The uninitialized default constructor is private for safety reasons.
- F32x8() {}
- public:
- // When the uninitialized constructor is needed for performance, use this named constructor instead.
- static inline F32x8 create_dangerous_uninitialized() { return F32x8(); }
- #if defined USE_256BIT_F_SIMD
- public:
- #if defined USE_DIRECT_SIMD_MEMORY_ACCESS
- // Only use if USE_DIRECT_SIMD_MEMORY_ACCESS is defined!
- float scalars[8];
- #endif
- // The SIMD vector of undefined type
- // Not accessible while emulating!
- SIMD_F32x8 v;
- // Construct a portable vector from a native SIMD vector
- explicit F32x8(const SIMD_F32x8& v) : v(v) {}
- // Construct a portable vector from a set of scalars
- F32x8(float a1, float a2, float a3, float a4, float a5, float a6, float a7, float a8)
- : v(LOAD_VECTOR_F32_SIMD256(a1, a2, a3, a4, a5, a6, a7, a8)) {}
- // Construct a portable vector from a single duplicated scalar
- explicit F32x8(float scalar) : v(LOAD_SCALAR_F32_SIMD256(scalar)) {}
- #else
- public:
- // Emulate a SIMD vector as an array of scalars without hardware support.
- // Only accessible while emulating!
- float scalars[8];
- // Construct a portable vector from a set of scalars
- F32x8(float a1, float a2, float a3, float a4, float a5, float a6, float a7, float a8) {
- this->scalars[0] = a1;
- this->scalars[1] = a2;
- this->scalars[2] = a3;
- this->scalars[3] = a4;
- this->scalars[4] = a5;
- this->scalars[5] = a6;
- this->scalars[6] = a7;
- this->scalars[7] = a8;
- }
- // Construct a portable vector from a single duplicated scalar
- explicit F32x8(float scalar) {
- this->scalars[0] = scalar;
- this->scalars[1] = scalar;
- this->scalars[2] = scalar;
- this->scalars[3] = scalar;
- this->scalars[4] = scalar;
- this->scalars[5] = scalar;
- this->scalars[6] = scalar;
- this->scalars[7] = scalar;
- }
- #endif
- // Create a gradient vector using start and increment, so that arbitrary length vectors have a way to initialize linear iterations.
- static inline F32x8 createGradient(float start, float increment) {
- return F32x8(
- start,
- start + increment,
- start + increment * 2.0f,
- start + increment * 3.0f,
- start + increment * 4.0f,
- start + increment * 5.0f,
- start + increment * 6.0f,
- start + increment * 7.0f
- );
- }
- // Construct a portable SIMD vector from a pointer to aligned data
- // data must be aligned to 32 bytes for the aligned SIMD load
- static inline F32x8 readAlignedUnsafe(const float* data) {
- #if defined USE_AVX2
- return F32x8(_mm256_load_ps(data));
- #else
- return F32x8(data[0], data[1], data[2], data[3], data[4], data[5], data[6], data[7]);
- #endif
- }
- // Write to aligned memory from the existing vector
- // data must be aligned to 32 bytes for the aligned SIMD store
- inline void writeAlignedUnsafe(float* data) const {
- #if defined USE_AVX2
- _mm256_store_ps(data, this->v);
- #else
- data[0] = this->scalars[0];
- data[1] = this->scalars[1];
- data[2] = this->scalars[2];
- data[3] = this->scalars[3];
- data[4] = this->scalars[4];
- data[5] = this->scalars[5];
- data[6] = this->scalars[6];
- data[7] = this->scalars[7];
- #endif
- }
- // Bound and alignment checked reading
- static inline F32x8 readAligned(dsr::SafePointer<const float> data, const char* methodName) {
- const float* pointer = data.getUnsafe();
- assert(((uintptr_t)pointer & 31) == 0);
- #if defined SAFE_POINTER_CHECKS
- data.assertInside(methodName, pointer, 32);
- #endif
- return F32x8::readAlignedUnsafe(pointer);
- }
- // Bound and alignment checked writing
- inline void writeAligned(dsr::SafePointer<float> data, const char* methodName) const {
- float* pointer = data.getUnsafe();
- assert(((uintptr_t)pointer & 31) == 0);
- #if defined SAFE_POINTER_CHECKS
- data.assertInside(methodName, pointer, 32);
- #endif
- this->writeAlignedUnsafe(pointer);
- }
- // 1 / x
- // Useful for multiple divisions with the same denominator
- // Useless if the denominator is a constant
- F32x8 reciprocal() const {
- #if defined USE_AVX2
- // Approximate
- SIMD_F32x8 lowQ = _mm256_rcp_ps(this->v);
- // Refine
- return F32x8(SUB_F32_SIMD256(ADD_F32_SIMD256(lowQ, lowQ), MUL_F32_SIMD256(this->v, MUL_F32_SIMD256(lowQ, lowQ))));
- #else
- return F32x8(
- 1.0f / this->scalars[0],
- 1.0f / this->scalars[1],
- 1.0f / this->scalars[2],
- 1.0f / this->scalars[3],
- 1.0f / this->scalars[4],
- 1.0f / this->scalars[5],
- 1.0f / this->scalars[6],
- 1.0f / this->scalars[7]
- );
- #endif
- }
- // 1 / sqrt(x)
- // Useful for normalizing vectors
- F32x8 reciprocalSquareRoot() const {
- #if defined USE_AVX2
- SIMD_F32x8 reciRoot = _mm256_rsqrt_ps(this->v);
- SIMD_F32x8 mul = MUL_F32_SIMD256(MUL_F32_SIMD256(this->v, reciRoot), reciRoot);
- reciRoot = MUL_F32_SIMD256(MUL_F32_SIMD256(LOAD_SCALAR_F32_SIMD256(0.5f), reciRoot), SUB_F32_SIMD256(LOAD_SCALAR_F32_SIMD256(3.0f), mul));
- return F32x8(reciRoot);
- #else
- return F32x8(
- 1.0f / sqrt(this->scalars[0]),
- 1.0f / sqrt(this->scalars[1]),
- 1.0f / sqrt(this->scalars[2]),
- 1.0f / sqrt(this->scalars[3]),
- 1.0f / sqrt(this->scalars[4]),
- 1.0f / sqrt(this->scalars[5]),
- 1.0f / sqrt(this->scalars[6]),
- 1.0f / sqrt(this->scalars[7])
- );
- #endif
- }
- // sqrt(x)
- // Useful for getting lengths of vectors
- F32x8 squareRoot() const {
- #if defined USE_AVX2
- SIMD_F32x8 half = LOAD_SCALAR_F32_SIMD256(0.5f);
- // Approximate
- SIMD_F32x8 root = _mm256_sqrt_ps(this->v);
- // Refine
- root = _mm256_mul_ps(_mm256_add_ps(root, _mm256_div_ps(this->v, root)), half);
- return F32x8(root);
- #else
- return F32x8(
- sqrt(this->scalars[0]),
- sqrt(this->scalars[1]),
- sqrt(this->scalars[2]),
- sqrt(this->scalars[3]),
- sqrt(this->scalars[4]),
- sqrt(this->scalars[5]),
- sqrt(this->scalars[6]),
- sqrt(this->scalars[7]));
- #endif
- }
- F32x8 clamp(float minimum, float maximum) const {
- #if defined USE_256BIT_F_SIMD
- return F32x8(MIN_F32_SIMD256(MAX_F32_SIMD256(this->v, LOAD_SCALAR_F32_SIMD256(minimum)), LOAD_SCALAR_F32_SIMD256(maximum)));
- #else
- float val0 = this->scalars[0];
- float val1 = this->scalars[1];
- float val2 = this->scalars[2];
- float val3 = this->scalars[3];
- float val4 = this->scalars[4];
- float val5 = this->scalars[5];
- float val6 = this->scalars[6];
- float val7 = this->scalars[7];
- if (minimum > val0) { val0 = minimum; }
- if (maximum < val0) { val0 = maximum; }
- if (minimum > val1) { val1 = minimum; }
- if (maximum < val1) { val1 = maximum; }
- if (minimum > val2) { val2 = minimum; }
- if (maximum < val2) { val2 = maximum; }
- if (minimum > val3) { val3 = minimum; }
- if (maximum < val3) { val3 = maximum; }
- if (minimum > val4) { val4 = minimum; }
- if (maximum < val4) { val4 = maximum; }
- if (minimum > val5) { val5 = minimum; }
- if (maximum < val5) { val5 = maximum; }
- if (minimum > val6) { val6 = minimum; }
- if (maximum < val6) { val6 = maximum; }
- if (minimum > val7) { val7 = minimum; }
- if (maximum < val7) { val7 = maximum; }
- return F32x8(val0, val1, val2, val3, val4, val5, val6, val7);
- #endif
- }
- F32x8 clampLower(float minimum) const {
- #if defined USE_256BIT_F_SIMD
- return F32x8(MAX_F32_SIMD256(this->v, LOAD_SCALAR_F32_SIMD256(minimum)));
- #else
- float val0 = this->scalars[0];
- float val1 = this->scalars[1];
- float val2 = this->scalars[2];
- float val3 = this->scalars[3];
- float val4 = this->scalars[4];
- float val5 = this->scalars[5];
- float val6 = this->scalars[6];
- float val7 = this->scalars[7];
- if (minimum > val0) { val0 = minimum; }
- if (minimum > val1) { val1 = minimum; }
- if (minimum > val2) { val2 = minimum; }
- if (minimum > val3) { val3 = minimum; }
- if (minimum > val4) { val4 = minimum; }
- if (minimum > val5) { val5 = minimum; }
- if (minimum > val6) { val6 = minimum; }
- if (minimum > val7) { val7 = minimum; }
- return F32x8(val0, val1, val2, val3, val4, val5, val6, val7);
- #endif
- }
- F32x8 clampUpper(float maximum) const {
- #if defined USE_256BIT_F_SIMD
- return F32x8(MIN_F32_SIMD256(this->v, LOAD_SCALAR_F32_SIMD256(maximum)));
- #else
- float val0 = this->scalars[0];
- float val1 = this->scalars[1];
- float val2 = this->scalars[2];
- float val3 = this->scalars[3];
- float val4 = this->scalars[4];
- float val5 = this->scalars[5];
- float val6 = this->scalars[6];
- float val7 = this->scalars[7];
- if (maximum < val0) { val0 = maximum; }
- if (maximum < val1) { val1 = maximum; }
- if (maximum < val2) { val2 = maximum; }
- if (maximum < val3) { val3 = maximum; }
- if (maximum < val4) { val4 = maximum; }
- if (maximum < val5) { val5 = maximum; }
- if (maximum < val6) { val6 = maximum; }
- if (maximum < val7) { val7 = maximum; }
- return F32x8(val0, val1, val2, val3, val4, val5, val6, val7);
- #endif
- }
- };
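- // Usage sketch (added example): clamp eight samples to [0, 1] and take their square roots in one
- // pass, e.g. as a crude gamma-like adjustment. Only members declared above are used; the function
- // name is hypothetical.
- inline F32x8 example_clampAndRoot_F32x8(const F32x8& samples) {
- 	return samples.clamp(0.0f, 1.0f).squareRoot();
- }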
- union I32x8 {
- private:
- // The uninitialized default constructor is private for safety reasons.
- I32x8() {}
- public:
- // When the uninitialized constructor is needed for performance, use this named constructor instead.
- static inline I32x8 create_dangerous_uninitialized() { return I32x8(); }
- #if defined USE_256BIT_X_SIMD
- public:
- #if defined USE_DIRECT_SIMD_MEMORY_ACCESS
- // Only use if USE_DIRECT_SIMD_MEMORY_ACCESS is defined!
- int32_t scalars[8];
- #endif
- // The SIMD vector of undefined type
- // Not accessible while emulating!
- SIMD_I32x8 v;
- // Construct a portable vector from a native SIMD vector
- explicit I32x8(const SIMD_I32x8& v) : v(v) {}
- // Construct a portable vector from a set of scalars
- I32x8(int32_t a1, int32_t a2, int32_t a3, int32_t a4, int32_t a5, int32_t a6, int32_t a7, int32_t a8)
- : v(LOAD_VECTOR_I32_SIMD256(a1, a2, a3, a4, a5, a6, a7, a8)) {}
- // Construct a portable vector from a single duplicated scalar
- explicit I32x8(int32_t scalar) : v(LOAD_SCALAR_I32_SIMD256(scalar)) {}
- #else
- public:
- // Emulate a SIMD vector as an array of scalars without hardware support.
- // Only accessible while emulating!
- int32_t scalars[8];
- // Construct a portable vector from a set of scalars
- I32x8(int32_t a1, int32_t a2, int32_t a3, int32_t a4, int32_t a5, int32_t a6, int32_t a7, int32_t a8) {
- this->scalars[0] = a1;
- this->scalars[1] = a2;
- this->scalars[2] = a3;
- this->scalars[3] = a4;
- this->scalars[4] = a5;
- this->scalars[5] = a6;
- this->scalars[6] = a7;
- this->scalars[7] = a8;
- }
- // Construct a portable vector from a single duplicated scalar
- explicit I32x8(int32_t scalar) {
- this->scalars[0] = scalar;
- this->scalars[1] = scalar;
- this->scalars[2] = scalar;
- this->scalars[3] = scalar;
- this->scalars[4] = scalar;
- this->scalars[5] = scalar;
- this->scalars[6] = scalar;
- this->scalars[7] = scalar;
- }
- #endif
- // Create a gradient vector using start and increment, so that arbitrary length vectors have a way to initialize linear iterations.
- static inline I32x8 createGradient(int32_t start, int32_t increment) {
- return I32x8(
- start,
- start + increment,
- start + increment * 2,
- start + increment * 3,
- start + increment * 4,
- start + increment * 5,
- start + increment * 6,
- start + increment * 7
- );
- }
- // Construct a portable SIMD vector from a pointer to aligned data
- // data must be aligned to 32 bytes for the aligned SIMD load
- static inline I32x8 readAlignedUnsafe(const int32_t* data) {
- #if defined USE_AVX2
- return I32x8(_mm256_load_si256((const __m256i*)data));
- #else
- return I32x8(data[0], data[1], data[2], data[3], data[4], data[5], data[6], data[7]);
- #endif
- }
- // Write to aligned memory from the existing vector
- // data must be aligned to 32 bytes for the aligned SIMD store
- inline void writeAlignedUnsafe(int32_t* data) const {
- #if defined USE_AVX2
- _mm256_store_si256((__m256i*)data, this->v);
- #else
- data[0] = this->scalars[0];
- data[1] = this->scalars[1];
- data[2] = this->scalars[2];
- data[3] = this->scalars[3];
- data[4] = this->scalars[4];
- data[5] = this->scalars[5];
- data[6] = this->scalars[6];
- data[7] = this->scalars[7];
- #endif
- }
- // Bound and alignment checked reading
- static inline I32x8 readAligned(dsr::SafePointer<const int32_t> data, const char* methodName) {
- const int32_t* pointer = data.getUnsafe();
- assert(((uintptr_t)pointer & 31) == 0);
- #if defined SAFE_POINTER_CHECKS
- data.assertInside(methodName, pointer, 32);
- #endif
- return I32x8::readAlignedUnsafe(pointer);
- }
- // Bound and alignment checked writing
- inline void writeAligned(dsr::SafePointer<int32_t> data, const char* methodName) const {
- int32_t* pointer = data.getUnsafe();
- assert(((uintptr_t)pointer & 31) == 0);
- #if defined SAFE_POINTER_CHECKS
- data.assertInside(methodName, pointer, 32);
- #endif
- this->writeAlignedUnsafe(pointer);
- }
- };
- union U32x8 {
- private:
- // The uninitialized default constructor is private for safety reasons.
- U32x8() {}
- public:
- // When the uninitialized constructor is needed for performance, use this named constructor instead.
- static inline U32x8 create_dangerous_uninitialized() { return U32x8(); }
- #if defined USE_256BIT_X_SIMD
- public:
- #if defined USE_DIRECT_SIMD_MEMORY_ACCESS
- // Only use if USE_DIRECT_SIMD_MEMORY_ACCESS is defined!
- uint32_t scalars[8];
- #endif
- // The SIMD vector of undefined type
- // Not accessible while emulating!
- SIMD_U32x8 v;
- // Construct a portable vector from a native SIMD vector
- explicit U32x8(const SIMD_U32x8& v) : v(v) {}
- // Construct a portable vector from a set of scalars
- U32x8(uint32_t a1, uint32_t a2, uint32_t a3, uint32_t a4, uint32_t a5, uint32_t a6, uint32_t a7, uint32_t a8)
- : v(LOAD_VECTOR_U32_SIMD256(a1, a2, a3, a4, a5, a6, a7, a8)) {}
- // Construct a portable vector from a single duplicated scalar
- explicit U32x8(uint32_t scalar) : v(LOAD_SCALAR_U32_SIMD256(scalar)) {}
- #else
- public:
- // Emulate a SIMD vector as an array of scalars without hardware support.
- // Only accessible while emulating!
- uint32_t scalars[8];
- // Construct a portable vector from a set of scalars
- U32x8(uint32_t a1, uint32_t a2, uint32_t a3, uint32_t a4, uint32_t a5, uint32_t a6, uint32_t a7, uint32_t a8) {
- this->scalars[0] = a1;
- this->scalars[1] = a2;
- this->scalars[2] = a3;
- this->scalars[3] = a4;
- this->scalars[4] = a5;
- this->scalars[5] = a6;
- this->scalars[6] = a7;
- this->scalars[7] = a8;
- }
- // Construct a portable vector from a single duplicated scalar
- explicit U32x8(uint32_t scalar) {
- this->scalars[0] = scalar;
- this->scalars[1] = scalar;
- this->scalars[2] = scalar;
- this->scalars[3] = scalar;
- this->scalars[4] = scalar;
- this->scalars[5] = scalar;
- this->scalars[6] = scalar;
- this->scalars[7] = scalar;
- }
- #endif
- // Create a gradient vector using start and increment, so that arbitrary length vectors have a way to initialize linear iterations.
- static inline U32x8 createGradient(uint32_t start, uint32_t increment) {
- return U32x8(
- start,
- start + increment,
- start + increment * 2,
- start + increment * 3,
- start + increment * 4,
- start + increment * 5,
- start + increment * 6,
- start + increment * 7
- );
- }
- // Construct a portable SIMD vector from a pointer to aligned data
- // data must be aligned to 32 bytes for the aligned SIMD load
- static inline U32x8 readAlignedUnsafe(const uint32_t* data) {
- #if defined USE_AVX2
- return U32x8(_mm256_load_si256((const __m256i*)data));
- #else
- return U32x8(data[0], data[1], data[2], data[3], data[4], data[5], data[6], data[7]);
- #endif
- }
- // Write to aligned memory from the existing vector
- // data must be aligned to 32 bytes for the aligned SIMD store
- inline void writeAlignedUnsafe(uint32_t* data) const {
- #if defined USE_AVX2
- _mm256_store_si256((__m256i*)data, this->v);
- #else
- data[0] = this->scalars[0];
- data[1] = this->scalars[1];
- data[2] = this->scalars[2];
- data[3] = this->scalars[3];
- data[4] = this->scalars[4];
- data[5] = this->scalars[5];
- data[6] = this->scalars[6];
- data[7] = this->scalars[7];
- #endif
- }
- // Bound and alignment checked reading
- static inline U32x8 readAligned(dsr::SafePointer<const uint32_t> data, const char* methodName) {
- const uint32_t* pointer = data.getUnsafe();
- assert(((uintptr_t)pointer & 31) == 0);
- #if defined SAFE_POINTER_CHECKS
- data.assertInside(methodName, pointer, 32);
- #endif
- return U32x8::readAlignedUnsafe(pointer);
- }
- // Bound and alignment checked writing
- inline void writeAligned(dsr::SafePointer<uint32_t> data, const char* methodName) const {
- uint32_t* pointer = data.getUnsafe();
- assert(((uintptr_t)pointer & 31) == 0);
- #if defined SAFE_POINTER_CHECKS
- data.assertInside(methodName, pointer, 32);
- #endif
- this->writeAlignedUnsafe(pointer);
- }
- };
- union U16x16 {
- private:
- // The uninitialized default constructor is private for safety reasons.
- U16x16() {}
- public:
- // When the uninitialized constructor is needed for performance, use this named constructor instead.
- static inline U16x16 create_dangerous_uninitialized() { return U16x16(); }
- #if defined USE_256BIT_X_SIMD
- public:
- #if defined USE_DIRECT_SIMD_MEMORY_ACCESS
- // Only use if USE_DIRECT_SIMD_MEMORY_ACCESS is defined!
- uint16_t scalars[16];
- #endif
- // The SIMD vector of undefined type
- // Not accessible while emulating!
- SIMD_U16x16 v;
- // Construct a portable vector from a native SIMD vector
- explicit U16x16(const SIMD_U16x16& v) : v(v) {}
- // Construct a vector of 16 x 16-bit unsigned integers from a vector of 8 x 32-bit unsigned integers
- // Reinterpret casting is used
- explicit U16x16(const U32x8& vector) : v(REINTERPRET_U32_TO_U16_SIMD256(vector.v)) {}
- // Construct a portable vector from a set of scalars
- U16x16(uint16_t a1, uint16_t a2, uint16_t a3, uint16_t a4, uint16_t a5, uint16_t a6, uint16_t a7, uint16_t a8,
- uint16_t a9, uint16_t a10, uint16_t a11, uint16_t a12, uint16_t a13, uint16_t a14, uint16_t a15, uint16_t a16)
- : v(LOAD_VECTOR_U16_SIMD256(a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11, a12, a13, a14, a15, a16)) {}
- // Construct a vector of 16 x 16-bit unsigned integers from a single duplicated 32-bit unsigned integer
- // Reinterpret casting is used
- // TODO: Remove all reinterprets from constructors to improve readability
- explicit U16x16(uint32_t scalar) : v(REINTERPRET_U32_TO_U16_SIMD256(LOAD_SCALAR_U32_SIMD256(scalar))) {}
- // Construct a portable vector from a single duplicated scalar
- explicit U16x16(uint16_t scalar) : v(LOAD_SCALAR_U16_SIMD256(scalar)) {}
- // Reinterpret cast to a vector of 8 x 32-bit unsigned integers
- U32x8 get_U32() const {
- return U32x8(REINTERPRET_U16_TO_U32_SIMD256(this->v));
- }
- #else
- public:
- // Emulate a SIMD vector as an array of scalars without hardware support.
- // Only accessible while emulating!
- uint16_t scalars[16];
- // Construct a vector of 16 x 16-bit unsigned integers from a vector of 8 x 32-bit unsigned integers
- // Reinterpret casting is used
- explicit U16x16(const U32x8& vector) {
- uint64_t *target = (uint64_t*)this->scalars;
- uint64_t *source = (uint64_t*)vector.scalars;
- target[0] = source[0];
- target[1] = source[1];
- target[2] = source[2];
- target[3] = source[3];
- }
- // Construct a portable vector from a set of scalars
- U16x16(uint16_t a1, uint16_t a2, uint16_t a3, uint16_t a4, uint16_t a5, uint16_t a6, uint16_t a7, uint16_t a8,
- uint16_t a9, uint16_t a10, uint16_t a11, uint16_t a12, uint16_t a13, uint16_t a14, uint16_t a15, uint16_t a16) {
- this->scalars[0] = a1;
- this->scalars[1] = a2;
- this->scalars[2] = a3;
- this->scalars[3] = a4;
- this->scalars[4] = a5;
- this->scalars[5] = a6;
- this->scalars[6] = a7;
- this->scalars[7] = a8;
- this->scalars[8] = a9;
- this->scalars[9] = a10;
- this->scalars[10] = a11;
- this->scalars[11] = a12;
- this->scalars[12] = a13;
- this->scalars[13] = a14;
- this->scalars[14] = a15;
- this->scalars[15] = a16;
- }
- // Construct a vector of 16 x 16-bit unsigned integers from a single duplicated 32-bit unsigned integer
- // Reinterpret casting is used
- explicit U16x16(uint32_t scalar) {
- uint32_t *target = (uint32_t*)this->scalars;
- target[0] = scalar;
- target[1] = scalar;
- target[2] = scalar;
- target[3] = scalar;
- target[4] = scalar;
- target[5] = scalar;
- target[6] = scalar;
- target[7] = scalar;
- }
- // Construct a portable vector from a single duplicated scalar
- explicit U16x16(uint16_t scalar) {
- this->scalars[0] = scalar;
- this->scalars[1] = scalar;
- this->scalars[2] = scalar;
- this->scalars[3] = scalar;
- this->scalars[4] = scalar;
- this->scalars[5] = scalar;
- this->scalars[6] = scalar;
- this->scalars[7] = scalar;
- this->scalars[8] = scalar;
- this->scalars[9] = scalar;
- this->scalars[10] = scalar;
- this->scalars[11] = scalar;
- this->scalars[12] = scalar;
- this->scalars[13] = scalar;
- this->scalars[14] = scalar;
- this->scalars[15] = scalar;
- }
- // Reinterpret cast to a vector of 8 x 32-bit unsigned integers
- U32x8 get_U32() const {
- U32x8 result(0);
- uint64_t *target = (uint64_t*)result.scalars;
- uint64_t *source = (uint64_t*)this->scalars;
- target[0] = source[0];
- target[1] = source[1];
- target[2] = source[2];
- target[3] = source[3];
- return result;
- }
- #endif
- // Create a gradient vector using start and increment, so that arbitrary length vectors have a way to initialize linear iterations.
- static inline U16x16 createGradient(uint16_t start, uint16_t increment) {
- return U16x16(
- start,
- start + increment,
- start + increment * 2,
- start + increment * 3,
- start + increment * 4,
- start + increment * 5,
- start + increment * 6,
- start + increment * 7,
- start + increment * 8,
- start + increment * 9,
- start + increment * 10,
- start + increment * 11,
- start + increment * 12,
- start + increment * 13,
- start + increment * 14,
- start + increment * 15
- );
- }
- // data must be aligned to 32 bytes for the aligned SIMD load
- static inline U16x16 readAlignedUnsafe(const uint16_t* data) {
- #if defined USE_AVX2
- return U16x16(_mm256_load_si256((const __m256i*)data));
- #else
- return U16x16(
- data[0],
- data[1],
- data[2],
- data[3],
- data[4],
- data[5],
- data[6],
- data[7],
- data[8],
- data[9],
- data[10],
- data[11],
- data[12],
- data[13],
- data[14],
- data[15]
- );
- #endif
- }
- // data must be aligned to 32 bytes for the aligned SIMD store
- inline void writeAlignedUnsafe(uint16_t* data) const {
- #if defined USE_AVX2
- _mm256_store_si256((__m256i*)data, this->v);
- #else
- data[0] = this->scalars[0];
- data[1] = this->scalars[1];
- data[2] = this->scalars[2];
- data[3] = this->scalars[3];
- data[4] = this->scalars[4];
- data[5] = this->scalars[5];
- data[6] = this->scalars[6];
- data[7] = this->scalars[7];
- data[8] = this->scalars[8];
- data[9] = this->scalars[9];
- data[10] = this->scalars[10];
- data[11] = this->scalars[11];
- data[12] = this->scalars[12];
- data[13] = this->scalars[13];
- data[14] = this->scalars[14];
- data[15] = this->scalars[15];
- #endif
- }
- // Bound and alignment checked reading
- static inline U16x16 readAligned(dsr::SafePointer<const uint16_t> data, const char* methodName) {
- const uint16_t* pointer = data.getUnsafe();
- assert(((uintptr_t)pointer & 31) == 0);
- #if defined SAFE_POINTER_CHECKS
- data.assertInside(methodName, pointer, 32);
- #endif
- return U16x16::readAlignedUnsafe(pointer);
- }
- // Bound and alignment checked writing
- inline void writeAligned(dsr::SafePointer<uint16_t> data, const char* methodName) const {
- uint16_t* pointer = data.getUnsafe();
- assert(((uintptr_t)pointer & 31) == 0);
- #if defined SAFE_POINTER_CHECKS
- data.assertInside(methodName, pointer, 32);
- #endif
- this->writeAlignedUnsafe(pointer);
- }
- };
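- // Usage sketch (added example): the reinterpret constructor and get_U32 preserve the raw bits, so a
- // U32x8 can be round-tripped through its 16-bit view unchanged. The function name is hypothetical.
- inline U32x8 example_reinterpretRoundTrip_U16x16(const U32x8& packed) {
- 	U16x16 asHalves(packed); // View the 8 x 32-bit lanes as 16 x 16-bit lanes.
- 	return asHalves.get_U32(); // View them as 8 x 32-bit lanes again, with the bits unchanged.
- }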
- union U8x32 {
- private:
- // The uninitialized default constructor is private for safety reasons.
- U8x32() {}
- public:
- // When the uninitialized constructor is needed for performance, use this named constructor instead.
- static inline U8x32 create_dangerous_uninitialized() { return U8x32(); }
- #if defined USE_256BIT_X_SIMD
- public:
- #if defined USE_DIRECT_SIMD_MEMORY_ACCESS
- // Only use if USE_DIRECT_SIMD_MEMORY_ACCESS is defined!
- uint8_t scalars[32];
- #endif
- // The SIMD vector of undefined type
- // Not accessible while emulating!
- SIMD_U8x32 v;
- // Construct a portable vector from a native SIMD vector
- explicit U8x32(const SIMD_U8x32& v) : v(v) {}
- // Construct a portable vector from a set of scalars
- U8x32(uint8_t a1, uint8_t a2, uint8_t a3, uint8_t a4, uint8_t a5, uint8_t a6, uint8_t a7, uint8_t a8,
- uint8_t a9, uint8_t a10, uint8_t a11, uint8_t a12, uint8_t a13, uint8_t a14, uint8_t a15, uint8_t a16,
- uint8_t a17, uint8_t a18, uint8_t a19, uint8_t a20, uint8_t a21, uint8_t a22, uint8_t a23, uint8_t a24,
- uint8_t a25, uint8_t a26, uint8_t a27, uint8_t a28, uint8_t a29, uint8_t a30, uint8_t a31, uint8_t a32)
- : v(LOAD_VECTOR_U8_SIMD256(
- a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11, a12, a13, a14, a15, a16,
- a17, a18, a19, a20, a21, a22, a23, a24, a25, a26, a27, a28, a29, a30, a31, a32
- )) {}
- // Construct a portable vector from a single duplicated scalar
- explicit U8x32(uint8_t scalar) : v(LOAD_SCALAR_U8_SIMD256(scalar)) {}
- #else
- public:
- // Emulate a SIMD vector as an array of scalars without hardware support.
- // Only accessible while emulating!
- uint8_t scalars[32];
- // Construct a portable vector from a set of scalars
- U8x32(uint8_t a1, uint8_t a2, uint8_t a3, uint8_t a4, uint8_t a5, uint8_t a6, uint8_t a7, uint8_t a8,
- uint8_t a9, uint8_t a10, uint8_t a11, uint8_t a12, uint8_t a13, uint8_t a14, uint8_t a15, uint8_t a16,
- uint8_t a17, uint8_t a18, uint8_t a19, uint8_t a20, uint8_t a21, uint8_t a22, uint8_t a23, uint8_t a24,
- uint8_t a25, uint8_t a26, uint8_t a27, uint8_t a28, uint8_t a29, uint8_t a30, uint8_t a31, uint8_t a32) {
- this->scalars[0] = a1;
- this->scalars[1] = a2;
- this->scalars[2] = a3;
- this->scalars[3] = a4;
- this->scalars[4] = a5;
- this->scalars[5] = a6;
- this->scalars[6] = a7;
- this->scalars[7] = a8;
- this->scalars[8] = a9;
- this->scalars[9] = a10;
- this->scalars[10] = a11;
- this->scalars[11] = a12;
- this->scalars[12] = a13;
- this->scalars[13] = a14;
- this->scalars[14] = a15;
- this->scalars[15] = a16;
- this->scalars[16] = a17;
- this->scalars[17] = a18;
- this->scalars[18] = a19;
- this->scalars[19] = a20;
- this->scalars[20] = a21;
- this->scalars[21] = a22;
- this->scalars[22] = a23;
- this->scalars[23] = a24;
- this->scalars[24] = a25;
- this->scalars[25] = a26;
- this->scalars[26] = a27;
- this->scalars[27] = a28;
- this->scalars[28] = a29;
- this->scalars[29] = a30;
- this->scalars[30] = a31;
- this->scalars[31] = a32;
- }
- // Construct a portable vector from a single duplicated scalar
- explicit U8x32(uint8_t scalar) {
- for (int i = 0; i < 32; i++) {
- this->scalars[i] = scalar;
- }
- }
- #endif
- // Create a gradient vector using start and increment, so that arbitrary length vectors have a way to initialize linear iterations.
- static inline U8x32 createGradient(uint8_t start, uint8_t increment) {
- return U8x32(
- start,
- start + increment,
- start + increment * 2,
- start + increment * 3,
- start + increment * 4,
- start + increment * 5,
- start + increment * 6,
- start + increment * 7,
- start + increment * 8,
- start + increment * 9,
- start + increment * 10,
- start + increment * 11,
- start + increment * 12,
- start + increment * 13,
- start + increment * 14,
- start + increment * 15,
- start + increment * 16,
- start + increment * 17,
- start + increment * 18,
- start + increment * 19,
- start + increment * 20,
- start + increment * 21,
- start + increment * 22,
- start + increment * 23,
- start + increment * 24,
- start + increment * 25,
- start + increment * 26,
- start + increment * 27,
- start + increment * 28,
- start + increment * 29,
- start + increment * 30,
- start + increment * 31
- );
- }
- static inline U8x32 readAlignedUnsafe(const uint8_t* data) {
- #if defined USE_AVX2
- return U8x32(_mm256_load_si256((const __m256i*)data));
- #else
- U8x32 result;
- for (int i = 0; i < 32; i++) {
- result.scalars[i] = data[i];
- }
- return result;
- #endif
- }
- // data must be aligned to 32 bytes for the aligned SIMD store
- inline void writeAlignedUnsafe(uint8_t* data) const {
- #if defined USE_AVX2
- _mm256_store_si256((__m256i*)data, this->v);
- #else
- for (int i = 0; i < 32; i++) {
- data[i] = this->scalars[i];
- }
- #endif
- }
- // Bound and alignment checked reading
- static inline U8x32 readAligned(dsr::SafePointer<const uint8_t> data, const char* methodName) {
- const uint8_t* pointer = data.getUnsafe();
- assert(((uintptr_t)pointer & 31) == 0);
- #if defined SAFE_POINTER_CHECKS
- data.assertInside(methodName, pointer, 32);
- #endif
- return U8x32::readAlignedUnsafe(pointer);
- }
- // Bound and alignment checked writing
- inline void writeAligned(dsr::SafePointer<uint8_t> data, const char* methodName) const {
- uint8_t* pointer = data.getUnsafe();
- assert(((uintptr_t)pointer & 31) == 0);
- #if defined SAFE_POINTER_CHECKS
- data.assertInside(methodName, pointer, 32);
- #endif
- this->writeAlignedUnsafe(pointer);
- }
- };
- // Helper macros for applying an operation to selected sets of SIMD vector types
- // Each list expands DO(vector_type, element_type, lane_count) once per type
- #define FOR_ALL_VECTOR_TYPES(DO) \
- DO(F32x4, float, 4) \
- DO(I32x4, int32_t, 4) \
- DO(U32x4, uint32_t, 4) \
- DO(U16x8, uint16_t, 8) \
- DO(U8x16, uint8_t, 16) \
- DO(F32x8, float, 8) \
- DO(I32x8, int32_t, 8) \
- DO(U32x8, uint32_t, 8) \
- DO(U16x16, uint16_t, 16) \
- DO(U8x32, uint8_t, 32)
- #define FOR_FLOAT_VECTOR_TYPES(DO) \
- DO(F32x4, float, 4) \
- DO(F32x8, float, 8)
- #define FOR_INTEGER_VECTOR_TYPES(DO) \
- DO(I32x4, int32_t, 4) \
- DO(U32x4, uint32_t, 4) \
- DO(U16x8, uint16_t, 8) \
- DO(U8x16, uint8_t, 16) \
- DO(I32x8, int32_t, 8) \
- DO(U32x8, uint32_t, 8) \
- DO(U16x16, uint16_t, 16) \
- DO(U8x32, uint8_t, 32)
- #define FOR_SIGNED_VECTOR_TYPES(DO) \
- DO(F32x4, float, 4) \
- DO(I32x4, int32_t, 4) \
- DO(F32x8, float, 8) \
- DO(I32x8, int32_t, 8)
- #define FOR_UNSIGNED_VECTOR_TYPES(DO) \
- DO(U32x4, uint32_t, 4) \
- DO(U16x8, uint16_t, 8) \
- DO(U8x16, uint8_t, 16) \
- DO(U32x8, uint32_t, 8) \
- DO(U16x16, uint16_t, 16) \
- DO(U8x32, uint8_t, 32)
- // Print SIMD vectors to the terminal or append them to strings.
- #define CREATE_METHOD_PRINT(VECTOR_TYPE, ELEMENT_TYPE, LANE_COUNT) \
- inline dsr::String& string_toStreamIndented(dsr::String& target, const VECTOR_TYPE& source, const dsr::ReadableString& indentation) { \
- ALIGN_BYTES(sizeof(VECTOR_TYPE)) ELEMENT_TYPE a[LANE_COUNT]; \
- source.writeAlignedUnsafe(a); \
- dsr::string_append(target, indentation, a[0]); \
- for (int i = 1; i < LANE_COUNT; i++) { \
- string_append(target, U", ", a[i]); \
- } \
- return target; \
- }
- // All SIMD vectors can be printed.
- FOR_ALL_VECTOR_TYPES(CREATE_METHOD_PRINT)
- #undef CREATE_METHOD_PRINT
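- // Usage sketch (added example): append the lanes of a vector to a string for debugging. It assumes
- // that U32x4::createGradient is declared earlier in this header and that the indentation argument
- // accepts the same U"" literals that string_append uses above; the function name is hypothetical.
- inline dsr::String example_printLanes() {
- 	dsr::String text;
- 	string_toStreamIndented(text, U32x4::createGradient(0u, 1u), U"  ");
- 	return text; // Expected to hold "  0, 1, 2, 3".
- }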
- // Integer equality: allLanesEqual returns true only when every lane matches exactly, and allLanesNotEqual only when every lane differs.
- #define CREATE_EXACT_EQUALITY(VECTOR_TYPE, ELEMENT_TYPE, LANE_COUNT) \
- inline bool allLanesEqual(const VECTOR_TYPE& left, const VECTOR_TYPE& right) { \
- ALIGN_BYTES(sizeof(VECTOR_TYPE)) ELEMENT_TYPE a[LANE_COUNT]; \
- ALIGN_BYTES(sizeof(VECTOR_TYPE)) ELEMENT_TYPE b[LANE_COUNT]; \
- left.writeAlignedUnsafe(a); \
- right.writeAlignedUnsafe(b); \
- for (int i = 0; i < LANE_COUNT; i++) { \
- if (a[i] != b[i]) return false; \
- } \
- return true; \
- } \
- inline bool allLanesNotEqual(const VECTOR_TYPE& left, const VECTOR_TYPE& right) { \
- ALIGN_BYTES(sizeof(VECTOR_TYPE)) ELEMENT_TYPE a[LANE_COUNT]; \
- ALIGN_BYTES(sizeof(VECTOR_TYPE)) ELEMENT_TYPE b[LANE_COUNT]; \
- left.writeAlignedUnsafe(a); \
- right.writeAlignedUnsafe(b); \
- for (int i = 0; i < LANE_COUNT; i++) { \
- if (a[i] == b[i]) return false; \
- } \
- return true; \
- }
- // Integer SIMD vectors have exact equality.
- FOR_INTEGER_VECTOR_TYPES(CREATE_EXACT_EQUALITY)
- #undef CREATE_EXACT_EQUALITY
- // Float equality: allLanesEqual returns true only when every lane pair is within the 0.0001 tolerance, and allLanesNotEqual only when every lane pair differs by at least that much.
- #define CREATE_TOLERANT_EQUALITY(VECTOR_TYPE, ELEMENT_TYPE, LANE_COUNT) \
- inline bool allLanesEqual(const VECTOR_TYPE& left, const VECTOR_TYPE& right) { \
- ALIGN_BYTES(sizeof(VECTOR_TYPE)) ELEMENT_TYPE a[LANE_COUNT]; \
- ALIGN_BYTES(sizeof(VECTOR_TYPE)) ELEMENT_TYPE b[LANE_COUNT]; \
- left.writeAlignedUnsafe(a); \
- right.writeAlignedUnsafe(b); \
- for (int i = 0; i < LANE_COUNT; i++) { \
- if (fabs(a[i] - b[i]) >= 0.0001f) return false; \
- } \
- return true; \
- } \
- inline bool allLanesNotEqual(const VECTOR_TYPE& left, const VECTOR_TYPE& right) { \
- ALIGN_BYTES(sizeof(VECTOR_TYPE)) ELEMENT_TYPE a[LANE_COUNT]; \
- ALIGN_BYTES(sizeof(VECTOR_TYPE)) ELEMENT_TYPE b[LANE_COUNT]; \
- left.writeAlignedUnsafe(a); \
- right.writeAlignedUnsafe(b); \
- for (int i = 0; i < LANE_COUNT; i++) { \
- if (fabs(a[i] - b[i]) < 0.0001f) return false; \
- } \
- return true; \
- }
- // Float SIMD vectors have inexact equality.
- FOR_FLOAT_VECTOR_TYPES(CREATE_TOLERANT_EQUALITY)
- #undef CREATE_TOLERANT_EQUALITY
- #define CREATE_COMPARISONS(VECTOR_TYPE, ELEMENT_TYPE, LANE_COUNT) \
- inline bool allLanesGreater(const VECTOR_TYPE& left, const VECTOR_TYPE& right) { \
- ALIGN_BYTES(sizeof(VECTOR_TYPE)) ELEMENT_TYPE a[LANE_COUNT]; \
- ALIGN_BYTES(sizeof(VECTOR_TYPE)) ELEMENT_TYPE b[LANE_COUNT]; \
- left.writeAlignedUnsafe(a); \
- right.writeAlignedUnsafe(b); \
- for (int i = 0; i < LANE_COUNT; i++) { \
- if (a[i] <= b[i]) return false; \
- } \
- return true; \
- } \
- inline bool allLanesGreaterOrEqual(const VECTOR_TYPE& left, const VECTOR_TYPE& right) { \
- ALIGN_BYTES(sizeof(VECTOR_TYPE)) ELEMENT_TYPE a[LANE_COUNT]; \
- ALIGN_BYTES(sizeof(VECTOR_TYPE)) ELEMENT_TYPE b[LANE_COUNT]; \
- left.writeAlignedUnsafe(a); \
- right.writeAlignedUnsafe(b); \
- for (int i = 0; i < LANE_COUNT; i++) { \
- if (a[i] < b[i]) return false; \
- } \
- return true; \
- } \
- inline bool allLanesLesser(const VECTOR_TYPE& left, const VECTOR_TYPE& right) { \
- ALIGN_BYTES(sizeof(VECTOR_TYPE)) ELEMENT_TYPE a[LANE_COUNT]; \
- ALIGN_BYTES(sizeof(VECTOR_TYPE)) ELEMENT_TYPE b[LANE_COUNT]; \
- left.writeAlignedUnsafe(a); \
- right.writeAlignedUnsafe(b); \
- for (int i = 0; i < LANE_COUNT; i++) { \
- if (a[i] >= b[i]) return false; \
- } \
- return true; \
- } \
- inline bool allLanesLesserOrEqual(const VECTOR_TYPE& left, const VECTOR_TYPE& right) { \
- ALIGN_BYTES(sizeof(VECTOR_TYPE)) ELEMENT_TYPE a[LANE_COUNT]; \
- ALIGN_BYTES(sizeof(VECTOR_TYPE)) ELEMENT_TYPE b[LANE_COUNT]; \
- left.writeAlignedUnsafe(a); \
- right.writeAlignedUnsafe(b); \
- for (int i = 0; i < LANE_COUNT; i++) { \
- if (a[i] > b[i]) return false; \
- } \
- return true; \
- }
- FOR_ALL_VECTOR_TYPES(CREATE_COMPARISONS)
- #undef CREATE_COMPARISONS
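- // Usage sketch (added example): the allLanes* helpers reduce a lane-wise comparison to a single bool,
- // which makes them handy in asserts and regression tests. It assumes the single-scalar F32x4
- // constructor declared earlier in this header; the function name is hypothetical.
- inline bool example_allLanesInUnitRange(const F32x4& value) {
- 	return allLanesGreaterOrEqual(value, F32x4(0.0f)) && allLanesLesserOrEqual(value, F32x4(1.0f));
- }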
- inline F32x4 operator+(const F32x4& left, const F32x4& right) {
- #if defined USE_BASIC_SIMD
- return F32x4(ADD_F32_SIMD(left.v, right.v));
- #else
- return F32x4(left.scalars[0] + right.scalars[0], left.scalars[1] + right.scalars[1], left.scalars[2] + right.scalars[2], left.scalars[3] + right.scalars[3]);
- #endif
- }
- inline F32x4 operator-(const F32x4& left, const F32x4& right) {
- #if defined USE_BASIC_SIMD
- return F32x4(SUB_F32_SIMD(left.v, right.v));
- #else
- return F32x4(left.scalars[0] - right.scalars[0], left.scalars[1] - right.scalars[1], left.scalars[2] - right.scalars[2], left.scalars[3] - right.scalars[3]);
- #endif
- }
- inline F32x4 operator*(const F32x4& left, const F32x4& right) {
- #if defined USE_BASIC_SIMD
- return F32x4(MUL_F32_SIMD(left.v, right.v));
- #else
- return F32x4(left.scalars[0] * right.scalars[0], left.scalars[1] * right.scalars[1], left.scalars[2] * right.scalars[2], left.scalars[3] * right.scalars[3]);
- #endif
- }
- inline F32x4 min(const F32x4& left, const F32x4& right) {
- #if defined USE_BASIC_SIMD
- return F32x4(MIN_F32_SIMD(left.v, right.v));
- #else
- float v0 = left.scalars[0];
- float v1 = left.scalars[1];
- float v2 = left.scalars[2];
- float v3 = left.scalars[3];
- float r0 = right.scalars[0];
- float r1 = right.scalars[1];
- float r2 = right.scalars[2];
- float r3 = right.scalars[3];
- if (r0 < v0) { v0 = r0; }
- if (r1 < v1) { v1 = r1; }
- if (r2 < v2) { v2 = r2; }
- if (r3 < v3) { v3 = r3; }
- return F32x4(v0, v1, v2, v3);
- #endif
- }
- inline F32x4 max(const F32x4& left, const F32x4& right) {
- #if defined USE_BASIC_SIMD
- return F32x4(MAX_F32_SIMD(left.v, right.v));
- #else
- float v0 = left.scalars[0];
- float v1 = left.scalars[1];
- float v2 = left.scalars[2];
- float v3 = left.scalars[3];
- float r0 = right.scalars[0];
- float r1 = right.scalars[1];
- float r2 = right.scalars[2];
- float r3 = right.scalars[3];
- if (r0 > v0) { v0 = r0; }
- if (r1 > v1) { v1 = r1; }
- if (r2 > v2) { v2 = r2; }
- if (r3 > v3) { v3 = r3; }
- return F32x4(v0, v1, v2, v3);
- #endif
- }
- inline I32x4 operator+(const I32x4& left, const I32x4& right) {
- #if defined USE_BASIC_SIMD
- return I32x4(ADD_I32_SIMD(left.v, right.v));
- #else
- return I32x4(left.scalars[0] + right.scalars[0], left.scalars[1] + right.scalars[1], left.scalars[2] + right.scalars[2], left.scalars[3] + right.scalars[3]);
- #endif
- }
- inline I32x4 operator-(const I32x4& left, const I32x4& right) {
- #if defined USE_BASIC_SIMD
- return I32x4(SUB_I32_SIMD(left.v, right.v));
- #else
- return I32x4(left.scalars[0] - right.scalars[0], left.scalars[1] - right.scalars[1], left.scalars[2] - right.scalars[2], left.scalars[3] - right.scalars[3]);
- #endif
- }
- inline I32x4 operator*(const I32x4& left, const I32x4& right) {
- #if defined USE_BASIC_SIMD
- #if defined USE_SSE2
- // TODO: Use AVX2 for 32-bit integer multiplication when available.
- return I32x4(left.scalars[0] * right.scalars[0], left.scalars[1] * right.scalars[1], left.scalars[2] * right.scalars[2], left.scalars[3] * right.scalars[3]);
- #elif defined USE_NEON
- return I32x4(MUL_I32_NEON(left.v, right.v));
- #endif
- #else
- return I32x4(left.scalars[0] * right.scalars[0], left.scalars[1] * right.scalars[1], left.scalars[2] * right.scalars[2], left.scalars[3] * right.scalars[3]);
- #endif
- }
- // TODO: Specify the behavior of truncated unsigned integer overflow and add it to the tests.
- inline U32x4 operator+(const U32x4& left, const U32x4& right) {
- #if defined USE_BASIC_SIMD
- return U32x4(ADD_U32_SIMD(left.v, right.v));
- #else
- return U32x4(left.scalars[0] + right.scalars[0], left.scalars[1] + right.scalars[1], left.scalars[2] + right.scalars[2], left.scalars[3] + right.scalars[3]);
- #endif
- }
- inline U32x4 operator-(const U32x4& left, const U32x4& right) {
- #if defined USE_BASIC_SIMD
- return U32x4(SUB_U32_SIMD(left.v, right.v));
- #else
- return U32x4(left.scalars[0] - right.scalars[0], left.scalars[1] - right.scalars[1], left.scalars[2] - right.scalars[2], left.scalars[3] - right.scalars[3]);
- #endif
- }
- inline U32x4 operator*(const U32x4& left, const U32x4& right) {
- #if defined USE_BASIC_SIMD
- #if defined USE_SSE2
- // SSE2 has no 32-bit lane multiplication, so fall back on scalar multiplication of the individual lanes.
- return U32x4(left.scalars[0] * right.scalars[0], left.scalars[1] * right.scalars[1], left.scalars[2] * right.scalars[2], left.scalars[3] * right.scalars[3]);
- #else // NEON
- return U32x4(MUL_U32_NEON(left.v, right.v));
- #endif
- #else
- return U32x4(left.scalars[0] * right.scalars[0], left.scalars[1] * right.scalars[1], left.scalars[2] * right.scalars[2], left.scalars[3] * right.scalars[3]);
- #endif
- }
- // Bitwise and
- inline U32x4 operator&(const U32x4& left, const U32x4& right) {
- #if defined USE_BASIC_SIMD
- return U32x4(BITWISE_AND_U32_SIMD(left.v, right.v));
- #else
- return U32x4(left.scalars[0] & right.scalars[0], left.scalars[1] & right.scalars[1], left.scalars[2] & right.scalars[2], left.scalars[3] & right.scalars[3]);
- #endif
- }
- // Bitwise or
- inline U32x4 operator|(const U32x4& left, const U32x4& right) {
- #if defined USE_BASIC_SIMD
- return U32x4(BITWISE_OR_U32_SIMD(left.v, right.v));
- #else
- return U32x4(left.scalars[0] | right.scalars[0], left.scalars[1] | right.scalars[1], left.scalars[2] | right.scalars[2], left.scalars[3] | right.scalars[3]);
- #endif
- }
- // Bitwise xor
- inline U32x4 operator^(const U32x4& left, const U32x4& right) {
- #if defined USE_BASIC_SIMD
- return U32x4(BITWISE_XOR_U32_SIMD(left.v, right.v));
- #else
- return U32x4(left.scalars[0] ^ right.scalars[0], left.scalars[1] ^ right.scalars[1], left.scalars[2] ^ right.scalars[2], left.scalars[3] ^ right.scalars[3]);
- #endif
- }
- // Bitwise negation
- inline U32x4 operator~(const U32x4& value) {
- #if defined USE_NEON
- return U32x4(vmvnq_u32(value.v));
- #elif defined USE_BASIC_SIMD
- // Fall back on xor against all ones.
- return value ^ U32x4(~uint32_t(0));
- #else
- return U32x4(~value.scalars[0], ~value.scalars[1], ~value.scalars[2], ~value.scalars[3]);
- #endif
- }
- inline U32x4 operator<<(const U32x4& left, const U32x4 &bitOffsets) {
- assert(allLanesLesser(bitOffsets, U32x4(32u)));
- #if defined USE_NEON
- return U32x4(vshlq_u32(left.v, vreinterpretq_s32_u32(bitOffsets.v)));
- #else
- return U32x4(
- left.scalars[0] << bitOffsets.scalars[0],
- left.scalars[1] << bitOffsets.scalars[1],
- left.scalars[2] << bitOffsets.scalars[2],
- left.scalars[3] << bitOffsets.scalars[3]
- );
- #endif
- }
- inline U32x4 operator>>(const U32x4& left, const U32x4 &bitOffsets) {
- assert(allLanesLesser(bitOffsets, U32x4(32u)));
- #if defined USE_NEON
- // NEON has no variable right-shift intrinsic, so shift left by negated offsets instead.
- return U32x4(vshlq_u32(left.v, vnegq_s32(vreinterpretq_s32_u32(bitOffsets.v))));
- #else
- return U32x4(
- left.scalars[0] >> bitOffsets.scalars[0],
- left.scalars[1] >> bitOffsets.scalars[1],
- left.scalars[2] >> bitOffsets.scalars[2],
- left.scalars[3] >> bitOffsets.scalars[3]
- );
- #endif
- }
- // bitOffset must be an immediate constant, so a template argument is used.
- template <uint32_t bitOffset>
- inline U32x4 bitShiftLeftImmediate(const U32x4& left) {
- static_assert(bitOffset < 32u);
- #if defined USE_SSE2
- return U32x4(_mm_slli_epi32(left.v, bitOffset));
- #else
- #if defined USE_NEON
- return U32x4(vshlq_u32(left.v, LOAD_SCALAR_I32_SIMD(bitOffset)));
- #else
- return U32x4(left.scalars[0] << bitOffset, left.scalars[1] << bitOffset, left.scalars[2] << bitOffset, left.scalars[3] << bitOffset);
- #endif
- #endif
- }
- // bitOffset must be an immediate constant.
- template <uint32_t bitOffset>
- inline U32x4 bitShiftRightImmediate(const U32x4& left) {
- static_assert(bitOffset < 32u);
- #if defined USE_SSE2
- return U32x4(_mm_srli_epi32(left.v, bitOffset));
- #else
- #if defined USE_NEON
- // Shift left by a negated offset to shift right on NEON.
- return U32x4(vshlq_u32(left.v, LOAD_SCALAR_I32_SIMD(-int32_t(bitOffset))));
- #else
- return U32x4(left.scalars[0] >> bitOffset, left.scalars[1] >> bitOffset, left.scalars[2] >> bitOffset, left.scalars[3] >> bitOffset);
- #endif
- #endif
- }
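- // Usage sketch (added example): extract the second 8-bit channel from four packed 32-bit pixels,
- // assuming the channel occupies bits 8..15 of each lane. The function name and the packing layout
- // are hypothetical.
- inline U32x4 example_extractSecondChannel(const U32x4& packedPixels) {
- 	return bitShiftRightImmediate<8>(packedPixels) & U32x4(0xFFu);
- }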
-
- inline U16x8 operator<<(const U16x8& left, const U16x8 &bitOffsets) {
- assert(allLanesLesser(bitOffsets, U16x8(16u)));
- #if defined USE_NEON
- return U16x8(vshlq_u16(left.v, vreinterpretq_s16_u16(bitOffsets.v)));
- #else
- return U16x8(
- left.scalars[0] << bitOffsets.scalars[0],
- left.scalars[1] << bitOffsets.scalars[1],
- left.scalars[2] << bitOffsets.scalars[2],
- left.scalars[3] << bitOffsets.scalars[3],
- left.scalars[4] << bitOffsets.scalars[4],
- left.scalars[5] << bitOffsets.scalars[5],
- left.scalars[6] << bitOffsets.scalars[6],
- left.scalars[7] << bitOffsets.scalars[7]
- );
- #endif
- }
- inline U16x8 operator>>(const U16x8& left, const U16x8 &bitOffsets) {
- assert(allLanesLesser(bitOffsets, U16x8(16u)));
- #if defined USE_NEON
- // NEON has no variable right-shift intrinsic, so shift left by negated offsets instead.
- return U16x8(vshlq_u16(left.v, vnegq_s16(vreinterpretq_s16_u16(bitOffsets.v))));
- #else
- return U16x8(
- left.scalars[0] >> bitOffsets.scalars[0],
- left.scalars[1] >> bitOffsets.scalars[1],
- left.scalars[2] >> bitOffsets.scalars[2],
- left.scalars[3] >> bitOffsets.scalars[3],
- left.scalars[4] >> bitOffsets.scalars[4],
- left.scalars[5] >> bitOffsets.scalars[5],
- left.scalars[6] >> bitOffsets.scalars[6],
- left.scalars[7] >> bitOffsets.scalars[7]
- );
- #endif
- }
- // bitOffset must be an immediate constant, so a template argument is used.
- template <uint32_t bitOffset>
- inline U16x8 bitShiftLeftImmediate(const U16x8& left) {
- static_assert(bitOffset < 16u);
- #if defined USE_SSE2
- return U16x8(_mm_slli_epi16(left.v, bitOffset));
- #else
- #if defined USE_NEON
- return U16x8(vshlq_u16(left.v, vdupq_n_s16(int16_t(bitOffset))));
- #else
- return U16x8(
- left.scalars[0] << bitOffset,
- left.scalars[1] << bitOffset,
- left.scalars[2] << bitOffset,
- left.scalars[3] << bitOffset,
- left.scalars[4] << bitOffset,
- left.scalars[5] << bitOffset,
- left.scalars[6] << bitOffset,
- left.scalars[7] << bitOffset
- );
- #endif
- #endif
- }
- // bitOffset must be an immediate constant.
- template <uint32_t bitOffset>
- inline U16x8 bitShiftRightImmediate(const U16x8& left) {
- static_assert(bitOffset < 16u);
- #if defined USE_SSE2
- return U16x8(_mm_srli_epi16(left.v, bitOffset));
- #else
- #if defined USE_NEON
- // Shift left by a negated offset to shift right on NEON.
- return U16x8(vshlq_u16(left.v, vdupq_n_s16(-int16_t(bitOffset))));
- #else
- return U16x8(
- left.scalars[0] >> bitOffset,
- left.scalars[1] >> bitOffset,
- left.scalars[2] >> bitOffset,
- left.scalars[3] >> bitOffset,
- left.scalars[4] >> bitOffset,
- left.scalars[5] >> bitOffset,
- left.scalars[6] >> bitOffset,
- left.scalars[7] >> bitOffset
- );
- #endif
- #endif
- }
- inline U8x16 operator<<(const U8x16& left, const U8x16 &bitOffsets) {
- assert(allLanesLesser(bitOffsets, U8x16(8u)));
- #if defined USE_NEON
- return U8x16(vshlq_u8(left.v, vreinterpretq_s8_u8(bitOffsets.v)));
- #else
- return U8x16(
- left.scalars[ 0] << bitOffsets.scalars[ 0],
- left.scalars[ 1] << bitOffsets.scalars[ 1],
- left.scalars[ 2] << bitOffsets.scalars[ 2],
- left.scalars[ 3] << bitOffsets.scalars[ 3],
- left.scalars[ 4] << bitOffsets.scalars[ 4],
- left.scalars[ 5] << bitOffsets.scalars[ 5],
- left.scalars[ 6] << bitOffsets.scalars[ 6],
- left.scalars[ 7] << bitOffsets.scalars[ 7],
- left.scalars[ 8] << bitOffsets.scalars[ 8],
- left.scalars[ 9] << bitOffsets.scalars[ 9],
- left.scalars[10] << bitOffsets.scalars[10],
- left.scalars[11] << bitOffsets.scalars[11],
- left.scalars[12] << bitOffsets.scalars[12],
- left.scalars[13] << bitOffsets.scalars[13],
- left.scalars[14] << bitOffsets.scalars[14],
- left.scalars[15] << bitOffsets.scalars[15]
- );
- #endif
- }
- inline U8x16 operator>>(const U8x16& left, const U8x16 &bitOffsets) {
- assert(allLanesLesser(bitOffsets, U8x16(8u)));
- #if defined USE_NEON
- // NEON has no variable right-shift intrinsic, so shift left by negated offsets instead.
- return U8x16(vshlq_u8(left.v, vnegq_s8(vreinterpretq_s8_u8(bitOffsets.v))));
- #else
- return U8x16(
- left.scalars[ 0] >> bitOffsets.scalars[ 0],
- left.scalars[ 1] >> bitOffsets.scalars[ 1],
- left.scalars[ 2] >> bitOffsets.scalars[ 2],
- left.scalars[ 3] >> bitOffsets.scalars[ 3],
- left.scalars[ 4] >> bitOffsets.scalars[ 4],
- left.scalars[ 5] >> bitOffsets.scalars[ 5],
- left.scalars[ 6] >> bitOffsets.scalars[ 6],
- left.scalars[ 7] >> bitOffsets.scalars[ 7],
- left.scalars[ 8] >> bitOffsets.scalars[ 8],
- left.scalars[ 9] >> bitOffsets.scalars[ 9],
- left.scalars[10] >> bitOffsets.scalars[10],
- left.scalars[11] >> bitOffsets.scalars[11],
- left.scalars[12] >> bitOffsets.scalars[12],
- left.scalars[13] >> bitOffsets.scalars[13],
- left.scalars[14] >> bitOffsets.scalars[14],
- left.scalars[15] >> bitOffsets.scalars[15]
- );
- #endif
- }
- // bitOffset must be an immediate constant, so a template argument is used.
- template <uint32_t bitOffset>
- inline U8x16 bitShiftLeftImmediate(const U8x16& left) {
- static_assert(bitOffset < 8u);
- #if defined USE_SSE2
- // _mm_slli_epi16 shifts 16-bit lanes, so mask away bits that spilled in from the lower byte of each pair.
- return U8x16(_mm_and_si128(_mm_slli_epi16(left.v, bitOffset), _mm_set1_epi8((char)(uint8_t)(0xFFu << bitOffset))));
- #elif defined USE_NEON
- return U8x16(vshlq_u8(left.v, vdupq_n_s8(int8_t(bitOffset))));
- #else
- return U8x16(
- left.scalars[ 0] << bitOffset,
- left.scalars[ 1] << bitOffset,
- left.scalars[ 2] << bitOffset,
- left.scalars[ 3] << bitOffset,
- left.scalars[ 4] << bitOffset,
- left.scalars[ 5] << bitOffset,
- left.scalars[ 6] << bitOffset,
- left.scalars[ 7] << bitOffset,
- left.scalars[ 8] << bitOffset,
- left.scalars[ 9] << bitOffset,
- left.scalars[10] << bitOffset,
- left.scalars[11] << bitOffset,
- left.scalars[12] << bitOffset,
- left.scalars[13] << bitOffset,
- left.scalars[14] << bitOffset,
- left.scalars[15] << bitOffset
- );
- #endif
- }
- // bitOffset must be an immediate constant.
- template <uint32_t bitOffset>
- inline U8x16 bitShiftRightImmediate(const U8x16& left) {
- static_assert(bitOffset < 8u);
- #if defined USE_SSE2
- // _mm_srli_epi16 shifts 16-bit lanes, so mask away bits that spilled in from the upper byte of each pair.
- return U8x16(_mm_and_si128(_mm_srli_epi16(left.v, bitOffset), _mm_set1_epi8((char)(uint8_t)(0xFFu >> bitOffset))));
- #elif defined USE_NEON
- // Shift left by a negated offset to shift right on NEON.
- return U8x16(vshlq_u8(left.v, vdupq_n_s8(-int8_t(bitOffset))));
- #else
- return U8x16(
- left.scalars[ 0] >> bitOffset,
- left.scalars[ 1] >> bitOffset,
- left.scalars[ 2] >> bitOffset,
- left.scalars[ 3] >> bitOffset,
- left.scalars[ 4] >> bitOffset,
- left.scalars[ 5] >> bitOffset,
- left.scalars[ 6] >> bitOffset,
- left.scalars[ 7] >> bitOffset,
- left.scalars[ 8] >> bitOffset,
- left.scalars[ 9] >> bitOffset,
- left.scalars[10] >> bitOffset,
- left.scalars[11] >> bitOffset,
- left.scalars[12] >> bitOffset,
- left.scalars[13] >> bitOffset,
- left.scalars[14] >> bitOffset,
- left.scalars[15] >> bitOffset
- );
- #endif
- }
- inline U16x8 operator+(const U16x8& left, const U16x8& right) {
- #if defined USE_BASIC_SIMD
- return U16x8(ADD_U16_SIMD(left.v, right.v));
- #else
- return U16x8(left.scalars[0] + right.scalars[0], left.scalars[1] + right.scalars[1], left.scalars[2] + right.scalars[2], left.scalars[3] + right.scalars[3],
- left.scalars[4] + right.scalars[4], left.scalars[5] + right.scalars[5], left.scalars[6] + right.scalars[6], left.scalars[7] + right.scalars[7]);
- #endif
- }
- inline U16x8 operator-(const U16x8& left, const U16x8& right) {
- #if defined USE_BASIC_SIMD
- return U16x8(SUB_U16_SIMD(left.v, right.v));
- #else
- return U16x8(left.scalars[0] - right.scalars[0], left.scalars[1] - right.scalars[1], left.scalars[2] - right.scalars[2], left.scalars[3] - right.scalars[3],
- left.scalars[4] - right.scalars[4], left.scalars[5] - right.scalars[5], left.scalars[6] - right.scalars[6], left.scalars[7] - right.scalars[7]);
- #endif
- }
- inline U16x8 operator*(const U16x8& left, const U16x8& right) {
- #if defined USE_BASIC_SIMD
- return U16x8(MUL_U16_SIMD(left.v, right.v));
- #else
- return U16x8(left.scalars[0] * right.scalars[0], left.scalars[1] * right.scalars[1], left.scalars[2] * right.scalars[2], left.scalars[3] * right.scalars[3],
- left.scalars[4] * right.scalars[4], left.scalars[5] * right.scalars[5], left.scalars[6] * right.scalars[6], left.scalars[7] * right.scalars[7]);
- #endif
- }
- inline U8x16 operator+(const U8x16& left, const U8x16& right) {
- #if defined USE_BASIC_SIMD
- return U8x16(ADD_U8_SIMD(left.v, right.v));
- #else
- return U8x16(
- left.scalars[0] + right.scalars[0],
- left.scalars[1] + right.scalars[1],
- left.scalars[2] + right.scalars[2],
- left.scalars[3] + right.scalars[3],
- left.scalars[4] + right.scalars[4],
- left.scalars[5] + right.scalars[5],
- left.scalars[6] + right.scalars[6],
- left.scalars[7] + right.scalars[7],
- left.scalars[8] + right.scalars[8],
- left.scalars[9] + right.scalars[9],
- left.scalars[10] + right.scalars[10],
- left.scalars[11] + right.scalars[11],
- left.scalars[12] + right.scalars[12],
- left.scalars[13] + right.scalars[13],
- left.scalars[14] + right.scalars[14],
- left.scalars[15] + right.scalars[15]
- );
- #endif
- }
- inline U8x16 operator-(const U8x16& left, const U8x16& right) {
- #if defined USE_BASIC_SIMD
- return U8x16(SUB_U8_SIMD(left.v, right.v));
- #else
- return U8x16(
- left.scalars[0] - right.scalars[0],
- left.scalars[1] - right.scalars[1],
- left.scalars[2] - right.scalars[2],
- left.scalars[3] - right.scalars[3],
- left.scalars[4] - right.scalars[4],
- left.scalars[5] - right.scalars[5],
- left.scalars[6] - right.scalars[6],
- left.scalars[7] - right.scalars[7],
- left.scalars[8] - right.scalars[8],
- left.scalars[9] - right.scalars[9],
- left.scalars[10] - right.scalars[10],
- left.scalars[11] - right.scalars[11],
- left.scalars[12] - right.scalars[12],
- left.scalars[13] - right.scalars[13],
- left.scalars[14] - right.scalars[14],
- left.scalars[15] - right.scalars[15]
- );
- #endif
- }
- inline uint8_t impl_limit0(int32_t x) { return x < 0 ? 0 : x; }
- inline uint8_t impl_limit255(uint32_t x) { return x > 255 ? 255 : x; }
- inline U8x16 saturatedAddition(const U8x16& left, const U8x16& right) {
- #if defined USE_BASIC_SIMD
- return U8x16(ADD_SAT_U8_SIMD(left.v, right.v));
- #else
- return U8x16(
- impl_limit255((uint32_t)left.scalars[0] + (uint32_t)right.scalars[0]),
- impl_limit255((uint32_t)left.scalars[1] + (uint32_t)right.scalars[1]),
- impl_limit255((uint32_t)left.scalars[2] + (uint32_t)right.scalars[2]),
- impl_limit255((uint32_t)left.scalars[3] + (uint32_t)right.scalars[3]),
- impl_limit255((uint32_t)left.scalars[4] + (uint32_t)right.scalars[4]),
- impl_limit255((uint32_t)left.scalars[5] + (uint32_t)right.scalars[5]),
- impl_limit255((uint32_t)left.scalars[6] + (uint32_t)right.scalars[6]),
- impl_limit255((uint32_t)left.scalars[7] + (uint32_t)right.scalars[7]),
- impl_limit255((uint32_t)left.scalars[8] + (uint32_t)right.scalars[8]),
- impl_limit255((uint32_t)left.scalars[9] + (uint32_t)right.scalars[9]),
- impl_limit255((uint32_t)left.scalars[10] + (uint32_t)right.scalars[10]),
- impl_limit255((uint32_t)left.scalars[11] + (uint32_t)right.scalars[11]),
- impl_limit255((uint32_t)left.scalars[12] + (uint32_t)right.scalars[12]),
- impl_limit255((uint32_t)left.scalars[13] + (uint32_t)right.scalars[13]),
- impl_limit255((uint32_t)left.scalars[14] + (uint32_t)right.scalars[14]),
- impl_limit255((uint32_t)left.scalars[15] + (uint32_t)right.scalars[15])
- );
- #endif
- }
- inline U8x16 saturatedSubtraction(const U8x16& left, const U8x16& right) {
- #if defined USE_BASIC_SIMD
- return U8x16(SUB_SAT_U8_SIMD(left.v, right.v));
- #else
- return U8x16(
- impl_limit0((int32_t)left.scalars[0] - (int32_t)right.scalars[0]),
- impl_limit0((int32_t)left.scalars[1] - (int32_t)right.scalars[1]),
- impl_limit0((int32_t)left.scalars[2] - (int32_t)right.scalars[2]),
- impl_limit0((int32_t)left.scalars[3] - (int32_t)right.scalars[3]),
- impl_limit0((int32_t)left.scalars[4] - (int32_t)right.scalars[4]),
- impl_limit0((int32_t)left.scalars[5] - (int32_t)right.scalars[5]),
- impl_limit0((int32_t)left.scalars[6] - (int32_t)right.scalars[6]),
- impl_limit0((int32_t)left.scalars[7] - (int32_t)right.scalars[7]),
- impl_limit0((int32_t)left.scalars[8] - (int32_t)right.scalars[8]),
- impl_limit0((int32_t)left.scalars[9] - (int32_t)right.scalars[9]),
- impl_limit0((int32_t)left.scalars[10] - (int32_t)right.scalars[10]),
- impl_limit0((int32_t)left.scalars[11] - (int32_t)right.scalars[11]),
- impl_limit0((int32_t)left.scalars[12] - (int32_t)right.scalars[12]),
- impl_limit0((int32_t)left.scalars[13] - (int32_t)right.scalars[13]),
- impl_limit0((int32_t)left.scalars[14] - (int32_t)right.scalars[14]),
- impl_limit0((int32_t)left.scalars[15] - (int32_t)right.scalars[15])
- );
- #endif
- }
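- // A minimal usage sketch (the function and variable names below are illustrative, not part of the API):
- // saturated byte arithmetic suits brightness offsets on packed 8-bit pixels, because the wrapping
- // operator+ above would turn near-white values into near-black ones.
- static inline U8x16 exampleSketch_adjustBrightness(const U8x16 &pixels, const U8x16 &offset, bool brighten) {
- // saturatedAddition clamps each lane at 255 and saturatedSubtraction clamps at 0, so no lane wraps around.
- return brighten ? saturatedAddition(pixels, offset) : saturatedSubtraction(pixels, offset);
- }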
- inline I32x4 truncateToI32(const F32x4& vector) {
- #if defined USE_BASIC_SIMD
- return I32x4(F32_TO_I32_SIMD(vector.v));
- #else
- return I32x4((int32_t)vector.scalars[0], (int32_t)vector.scalars[1], (int32_t)vector.scalars[2], (int32_t)vector.scalars[3]);
- #endif
- }
- inline U32x4 truncateToU32(const F32x4& vector) {
- #if defined USE_BASIC_SIMD
- return U32x4(F32_TO_U32_SIMD(vector.v));
- #else
- return U32x4((uint32_t)vector.scalars[0], (uint32_t)vector.scalars[1], (uint32_t)vector.scalars[2], (uint32_t)vector.scalars[3]);
- #endif
- }
- inline F32x4 floatFromI32(const I32x4& vector) {
- #if defined USE_BASIC_SIMD
- return F32x4(I32_TO_F32_SIMD(vector.v));
- #else
- return F32x4((float)vector.scalars[0], (float)vector.scalars[1], (float)vector.scalars[2], (float)vector.scalars[3]);
- #endif
- }
- inline F32x4 floatFromU32(const U32x4& vector) {
- #if defined USE_BASIC_SIMD
- return F32x4(U32_TO_F32_SIMD(vector.v));
- #else
- return F32x4((float)vector.scalars[0], (float)vector.scalars[1], (float)vector.scalars[2], (float)vector.scalars[3]);
- #endif
- }
- inline I32x4 I32FromU32(const U32x4& vector) {
- #if defined USE_BASIC_SIMD
- return I32x4(REINTERPRET_U32_TO_I32_SIMD(vector.v));
- #else
- return I32x4((int32_t)vector.scalars[0], (int32_t)vector.scalars[1], (int32_t)vector.scalars[2], (int32_t)vector.scalars[3]);
- #endif
- }
- inline U32x4 U32FromI32(const I32x4& vector) {
- #if defined USE_BASIC_SIMD
- return U32x4(REINTERPRET_I32_TO_U32_SIMD(vector.v));
- #else
- return U32x4((uint32_t)vector.scalars[0], (uint32_t)vector.scalars[1], (uint32_t)vector.scalars[2], (uint32_t)vector.scalars[3]);
- #endif
- }
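- // A minimal usage sketch (illustrative name): truncateToI32 rounds each lane towards zero, so pairing it
- // with floatFromI32 splits a float vector into whole and fractional parts.
- static inline F32x4 exampleSketch_fractionalPart(const F32x4 &values) {
- // For the input 1.75f, -2.25f, 3.5f, 4.0f this yields 0.75f, -0.25f, 0.5f, 0.0f.
- return values - floatFromI32(truncateToI32(values));
- }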
- // Warning! Behavior depends on endianness.
- inline U8x16 reinterpret_U8FromU32(const U32x4& vector) {
- #if defined USE_BASIC_SIMD
- return U8x16(REINTERPRET_U32_TO_U8_SIMD(vector.v));
- #else
- const uint8_t *source = (const uint8_t*)vector.scalars;
- return U8x16(
- source[0], source[1], source[2], source[3], source[4], source[5], source[6], source[7],
- source[8], source[9], source[10], source[11], source[12], source[13], source[14], source[15]
- );
- #endif
- }
- // Warning! Behavior depends on endianness.
- inline U32x4 reinterpret_U32FromU8(const U8x16& vector) {
- #if defined USE_BASIC_SIMD
- return U32x4(REINTERPRET_U8_TO_U32_SIMD(vector.v));
- #else
- const uint32_t *source = (const uint32_t*)vector.scalars;
- return U32x4(source[0], source[1], source[2], source[3]);
- #endif
- }
- // Warning! Behavior depends on endianness.
- inline U16x8 reinterpret_U16FromU32(const U32x4& vector) {
- #if defined USE_BASIC_SIMD
- return U16x8(REINTERPRET_U32_TO_U16_SIMD(vector.v));
- #else
- const uint16_t *source = (const uint16_t*)vector.scalars;
- return U16x8(
- source[0], source[1], source[2], source[3], source[4], source[5], source[6], source[7]
- );
- #endif
- }
- // Warning! Behavior depends on endianness.
- inline U32x4 reinterpret_U32FromU16(const U16x8& vector) {
- #if defined USE_BASIC_SIMD
- return U32x4(REINTERPRET_U16_TO_U32_SIMD(vector.v));
- #else
- const uint32_t *source = (const uint32_t*)vector.scalars;
- return U32x4(source[0], source[1], source[2], source[3]);
- #endif
- }
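- // A minimal usage sketch (illustrative name): reinterpret_U8FromU32 exposes the bytes of each 32-bit lane
- // in memory order, so on a little-endian target a lane holding 0x11223344u becomes 0x44, 0x33, 0x22, 0x11.
- static inline U8x16 exampleSketch_bytesOfLanes(const U32x4 &packed) {
- // The result depends on endianness, as the warnings above say, so only use this on data whose byte order
- // the caller has already decided.
- return reinterpret_U8FromU32(packed);
- }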
- // Unpacking to larger integers
- inline U32x4 lowerToU32(const U16x8& vector) {
- #if defined USE_BASIC_SIMD
- return U32x4(U16_LOW_TO_U32_SIMD(vector.v));
- #else
- return U32x4(vector.scalars[0], vector.scalars[1], vector.scalars[2], vector.scalars[3]);
- #endif
- }
- inline U32x4 higherToU32(const U16x8& vector) {
- #if defined USE_BASIC_SIMD
- return U32x4(U16_HIGH_TO_U32_SIMD(vector.v));
- #else
- return U32x4(vector.scalars[4], vector.scalars[5], vector.scalars[6], vector.scalars[7]);
- #endif
- }
- inline U16x8 lowerToU16(const U8x16& vector) {
- #if defined USE_BASIC_SIMD
- return U16x8(U8_LOW_TO_U16_SIMD(vector.v));
- #else
- return U16x8(
- vector.scalars[0], vector.scalars[1], vector.scalars[2], vector.scalars[3],
- vector.scalars[4], vector.scalars[5], vector.scalars[6], vector.scalars[7]
- );
- #endif
- }
- inline U16x8 higherToU16(const U8x16& vector) {
- #if defined USE_BASIC_SIMD
- return U16x8(U8_HIGH_TO_U16_SIMD(vector.v));
- #else
- return U16x8(
- vector.scalars[8], vector.scalars[9], vector.scalars[10], vector.scalars[11],
- vector.scalars[12], vector.scalars[13], vector.scalars[14], vector.scalars[15]
- );
- #endif
- }
- // Saturated packing
- inline U8x16 saturateToU8(const U16x8& lower, const U16x8& upper) {
- #if defined USE_BASIC_SIMD
- return U8x16(PACK_SAT_U16_TO_U8_SIMD(lower.v, upper.v));
- #else
- return U8x16(
- impl_limit255(lower.scalars[0]),
- impl_limit255(lower.scalars[1]),
- impl_limit255(lower.scalars[2]),
- impl_limit255(lower.scalars[3]),
- impl_limit255(lower.scalars[4]),
- impl_limit255(lower.scalars[5]),
- impl_limit255(lower.scalars[6]),
- impl_limit255(lower.scalars[7]),
- impl_limit255(upper.scalars[0]),
- impl_limit255(upper.scalars[1]),
- impl_limit255(upper.scalars[2]),
- impl_limit255(upper.scalars[3]),
- impl_limit255(upper.scalars[4]),
- impl_limit255(upper.scalars[5]),
- impl_limit255(upper.scalars[6]),
- impl_limit255(upper.scalars[7])
- );
- #endif
- }
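- // A minimal usage sketch (illustrative names, assuming weights small enough to keep products within 16 bits):
- // widening to U16x8, doing the arithmetic, and saturating back to U8x16 avoids the overflow that direct
- // 8-bit multiplication would cause.
- static inline U8x16 exampleSketch_scalePixels(const U8x16 &pixels, const U16x8 &weight) {
- // Lanes 0..7 and 8..15 are processed as two 16-bit vectors and packed back with clamping to 0..255.
- U16x8 lowProduct = lowerToU16(pixels) * weight;
- U16x8 highProduct = higherToU16(pixels) * weight;
- return saturateToU8(lowProduct, highProduct);
- }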
- // Unary negation for convenience and code readability.
- // Before using unary negation, always check whether:
- // * The addition can be rewritten as a subtraction: x = -a + b becomes x = b - a.
- // * A multiplying constant or scalar can be negated instead: x = -b * 2 becomes x = b * -2.
- inline F32x4 operator-(const F32x4& value) {
- #if defined USE_BASIC_SIMD
- return F32x4(0.0f) - value;
- #else
- return F32x4(-value.scalars[0], -value.scalars[1], -value.scalars[2], -value.scalars[3]);
- #endif
- }
- inline I32x4 operator-(const I32x4& value) {
- #if defined USE_BASIC_SIMD
- return I32x4(0) - value;
- #else
- return I32x4(-value.scalars[0], -value.scalars[1], -value.scalars[2], -value.scalars[3]);
- #endif
- }
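- // A minimal usage sketch (illustrative names): the rewrite recommended above removes the zero vector that
- // unary negation creates behind the scenes.
- static inline F32x4 exampleSketch_offsetFromCenter(const F32x4 &centers, const F32x4 &offsets) {
- // Equivalent to -offsets + centers, but with a single subtraction and no extra zero constant.
- return centers - offsets;
- }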
- // Helper macros for generating the vector extract functions.
- // Having one function for each type and offset makes sure that the compiler gets an immediate integer within the valid range.
- #if defined USE_BASIC_SIMD
- #if defined USE_SSE2
- #if defined USE_SSSE3
- #define _MM_ALIGNR_EPI8(A, B, OFFSET) _mm_alignr_epi8(A, B, OFFSET)
- #else
- // If SSSE3 is not used, emulate it using stack memory and unaligned reading of data.
- static inline SIMD_U8x16 _MM_ALIGNR_EPI8(SIMD_U8x16 a, SIMD_U8x16 b, int offset) {
- ALIGN16 uint8_t vectorBuffer[32];
- _mm_store_si128((SIMD_U8x16*)(vectorBuffer), b);
- _mm_store_si128((SIMD_U8x16*)(vectorBuffer + 16), a);
- return _mm_loadu_si128((SIMD_U8x16*)(vectorBuffer + offset));
- }
- #endif
- #define VECTOR_EXTRACT_GENERATOR_U8(OFFSET, FALLBACK_RESULT) return U8x16(_MM_ALIGNR_EPI8(b.v, a.v, OFFSET));
- #define VECTOR_EXTRACT_GENERATOR_U16(OFFSET, FALLBACK_RESULT) return U16x8(_MM_ALIGNR_EPI8(b.v, a.v, OFFSET * 2));
- #define VECTOR_EXTRACT_GENERATOR_U32(OFFSET, FALLBACK_RESULT) return U32x4(_MM_ALIGNR_EPI8(b.v, a.v, OFFSET * 4));
- #define VECTOR_EXTRACT_GENERATOR_I32(OFFSET, FALLBACK_RESULT) return I32x4(_MM_ALIGNR_EPI8(b.v, a.v, OFFSET * 4));
- #define VECTOR_EXTRACT_GENERATOR_F32(OFFSET, FALLBACK_RESULT) return F32x4(SIMD_F32x4(_MM_ALIGNR_EPI8(SIMD_U32x4(b.v), SIMD_U32x4(a.v), OFFSET * 4)));
- #elif defined USE_NEON
- #define VECTOR_EXTRACT_GENERATOR_U8(OFFSET, FALLBACK_RESULT) return U8x16(vextq_u8(a.v, b.v, OFFSET));
- #define VECTOR_EXTRACT_GENERATOR_U16(OFFSET, FALLBACK_RESULT) return U16x8(vextq_u16(a.v, b.v, OFFSET));
- #define VECTOR_EXTRACT_GENERATOR_U32(OFFSET, FALLBACK_RESULT) return U32x4(vextq_u32(a.v, b.v, OFFSET));
- #define VECTOR_EXTRACT_GENERATOR_I32(OFFSET, FALLBACK_RESULT) return I32x4(vextq_s32(a.v, b.v, OFFSET));
- #define VECTOR_EXTRACT_GENERATOR_F32(OFFSET, FALLBACK_RESULT) return F32x4(vextq_f32(a.v, b.v, OFFSET));
- #endif
- #else
- #define VECTOR_EXTRACT_GENERATOR_U8(OFFSET, FALLBACK_RESULT) return FALLBACK_RESULT;
- #define VECTOR_EXTRACT_GENERATOR_U16(OFFSET, FALLBACK_RESULT) return FALLBACK_RESULT;
- #define VECTOR_EXTRACT_GENERATOR_U32(OFFSET, FALLBACK_RESULT) return FALLBACK_RESULT;
- #define VECTOR_EXTRACT_GENERATOR_I32(OFFSET, FALLBACK_RESULT) return FALLBACK_RESULT;
- #define VECTOR_EXTRACT_GENERATOR_F32(OFFSET, FALLBACK_RESULT) return FALLBACK_RESULT;
- #endif
- // Vector extraction concatenates two input vectors and reads a vector between them using an offset.
- // The first and last offsets that only return one of the inputs can be used for readability, because they will be inlined and removed by the compiler.
- // To get elements from the right side, combine the center vector with the right vector and shift one element to the left using vectorExtract_1 for the given type.
- // To get elements from the left side, combine the left vector with the center vector and shift one element to the right using vectorExtract_15 for 16 lanes, vectorExtract_7 for 8 lanes, or vectorExtract_3 for 4 lanes.
- U8x16 inline vectorExtract_0(const U8x16 &a, const U8x16 &b) { return a; }
- U8x16 inline vectorExtract_1(const U8x16 &a, const U8x16 &b) { VECTOR_EXTRACT_GENERATOR_U8(1, U8x16(a.scalars[1], a.scalars[2], a.scalars[3], a.scalars[4], a.scalars[5], a.scalars[6], a.scalars[7], a.scalars[8], a.scalars[9], a.scalars[10], a.scalars[11], a.scalars[12], a.scalars[13], a.scalars[14], a.scalars[15], b.scalars[0])) }
- U8x16 inline vectorExtract_2(const U8x16 &a, const U8x16 &b) { VECTOR_EXTRACT_GENERATOR_U8(2, U8x16(a.scalars[2], a.scalars[3], a.scalars[4], a.scalars[5], a.scalars[6], a.scalars[7], a.scalars[8], a.scalars[9], a.scalars[10], a.scalars[11], a.scalars[12], a.scalars[13], a.scalars[14], a.scalars[15], b.scalars[0], b.scalars[1])) }
- U8x16 inline vectorExtract_3(const U8x16 &a, const U8x16 &b) { VECTOR_EXTRACT_GENERATOR_U8(3, U8x16(a.scalars[3], a.scalars[4], a.scalars[5], a.scalars[6], a.scalars[7], a.scalars[8], a.scalars[9], a.scalars[10], a.scalars[11], a.scalars[12], a.scalars[13], a.scalars[14], a.scalars[15], b.scalars[0], b.scalars[1], b.scalars[2])) }
- U8x16 inline vectorExtract_4(const U8x16 &a, const U8x16 &b) { VECTOR_EXTRACT_GENERATOR_U8(4, U8x16(a.scalars[4], a.scalars[5], a.scalars[6], a.scalars[7], a.scalars[8], a.scalars[9], a.scalars[10], a.scalars[11], a.scalars[12], a.scalars[13], a.scalars[14], a.scalars[15], b.scalars[0], b.scalars[1], b.scalars[2], b.scalars[3])) }
- U8x16 inline vectorExtract_5(const U8x16 &a, const U8x16 &b) { VECTOR_EXTRACT_GENERATOR_U8(5, U8x16(a.scalars[5], a.scalars[6], a.scalars[7], a.scalars[8], a.scalars[9], a.scalars[10], a.scalars[11], a.scalars[12], a.scalars[13], a.scalars[14], a.scalars[15], b.scalars[0], b.scalars[1], b.scalars[2], b.scalars[3], b.scalars[4])) }
- U8x16 inline vectorExtract_6(const U8x16 &a, const U8x16 &b) { VECTOR_EXTRACT_GENERATOR_U8(6, U8x16(a.scalars[6], a.scalars[7], a.scalars[8], a.scalars[9], a.scalars[10], a.scalars[11], a.scalars[12], a.scalars[13], a.scalars[14], a.scalars[15], b.scalars[0], b.scalars[1], b.scalars[2], b.scalars[3], b.scalars[4], b.scalars[5])) }
- U8x16 inline vectorExtract_7(const U8x16 &a, const U8x16 &b) { VECTOR_EXTRACT_GENERATOR_U8(7, U8x16(a.scalars[7], a.scalars[8], a.scalars[9], a.scalars[10], a.scalars[11], a.scalars[12], a.scalars[13], a.scalars[14], a.scalars[15], b.scalars[0], b.scalars[1], b.scalars[2], b.scalars[3], b.scalars[4], b.scalars[5], b.scalars[6])) }
- U8x16 inline vectorExtract_8(const U8x16 &a, const U8x16 &b) { VECTOR_EXTRACT_GENERATOR_U8(8, U8x16(a.scalars[8], a.scalars[9], a.scalars[10], a.scalars[11], a.scalars[12], a.scalars[13], a.scalars[14], a.scalars[15], b.scalars[0], b.scalars[1], b.scalars[2], b.scalars[3], b.scalars[4], b.scalars[5], b.scalars[6], b.scalars[7])) }
- U8x16 inline vectorExtract_9(const U8x16 &a, const U8x16 &b) { VECTOR_EXTRACT_GENERATOR_U8(9, U8x16(a.scalars[9], a.scalars[10], a.scalars[11], a.scalars[12], a.scalars[13], a.scalars[14], a.scalars[15], b.scalars[0], b.scalars[1], b.scalars[2], b.scalars[3], b.scalars[4], b.scalars[5], b.scalars[6], b.scalars[7], b.scalars[8])) }
- U8x16 inline vectorExtract_10(const U8x16 &a, const U8x16 &b) { VECTOR_EXTRACT_GENERATOR_U8(10, U8x16(a.scalars[10], a.scalars[11], a.scalars[12], a.scalars[13], a.scalars[14], a.scalars[15], b.scalars[0], b.scalars[1], b.scalars[2], b.scalars[3], b.scalars[4], b.scalars[5], b.scalars[6], b.scalars[7], b.scalars[8], b.scalars[9])) }
- U8x16 inline vectorExtract_11(const U8x16 &a, const U8x16 &b) { VECTOR_EXTRACT_GENERATOR_U8(11, U8x16(a.scalars[11], a.scalars[12], a.scalars[13], a.scalars[14], a.scalars[15], b.scalars[0], b.scalars[1], b.scalars[2], b.scalars[3], b.scalars[4], b.scalars[5], b.scalars[6], b.scalars[7], b.scalars[8], b.scalars[9], b.scalars[10])) }
- U8x16 inline vectorExtract_12(const U8x16 &a, const U8x16 &b) { VECTOR_EXTRACT_GENERATOR_U8(12, U8x16(a.scalars[12], a.scalars[13], a.scalars[14], a.scalars[15], b.scalars[0], b.scalars[1], b.scalars[2], b.scalars[3], b.scalars[4], b.scalars[5], b.scalars[6], b.scalars[7], b.scalars[8], b.scalars[9], b.scalars[10], b.scalars[11])) }
- U8x16 inline vectorExtract_13(const U8x16 &a, const U8x16 &b) { VECTOR_EXTRACT_GENERATOR_U8(13, U8x16(a.scalars[13], a.scalars[14], a.scalars[15], b.scalars[0], b.scalars[1], b.scalars[2], b.scalars[3], b.scalars[4], b.scalars[5], b.scalars[6], b.scalars[7], b.scalars[8], b.scalars[9], b.scalars[10], b.scalars[11], b.scalars[12])) }
- U8x16 inline vectorExtract_14(const U8x16 &a, const U8x16 &b) { VECTOR_EXTRACT_GENERATOR_U8(14, U8x16(a.scalars[14], a.scalars[15], b.scalars[0], b.scalars[1], b.scalars[2], b.scalars[3], b.scalars[4], b.scalars[5], b.scalars[6], b.scalars[7], b.scalars[8], b.scalars[9], b.scalars[10], b.scalars[11], b.scalars[12], b.scalars[13])) }
- U8x16 inline vectorExtract_15(const U8x16 &a, const U8x16 &b) { VECTOR_EXTRACT_GENERATOR_U8(15, U8x16(a.scalars[15], b.scalars[0], b.scalars[1], b.scalars[2], b.scalars[3], b.scalars[4], b.scalars[5], b.scalars[6], b.scalars[7], b.scalars[8], b.scalars[9], b.scalars[10], b.scalars[11], b.scalars[12], b.scalars[13], b.scalars[14])) }
- U8x16 inline vectorExtract_16(const U8x16 &a, const U8x16 &b) { return b; }
- U16x8 inline vectorExtract_0(const U16x8 &a, const U16x8 &b) { return a; }
- U16x8 inline vectorExtract_1(const U16x8 &a, const U16x8 &b) { VECTOR_EXTRACT_GENERATOR_U16(1, U16x8(a.scalars[1], a.scalars[2], a.scalars[3], a.scalars[4], a.scalars[5], a.scalars[6], a.scalars[7], b.scalars[0])) }
- U16x8 inline vectorExtract_2(const U16x8 &a, const U16x8 &b) { VECTOR_EXTRACT_GENERATOR_U16(2, U16x8(a.scalars[2], a.scalars[3], a.scalars[4], a.scalars[5], a.scalars[6], a.scalars[7], b.scalars[0], b.scalars[1])) }
- U16x8 inline vectorExtract_3(const U16x8 &a, const U16x8 &b) { VECTOR_EXTRACT_GENERATOR_U16(3, U16x8(a.scalars[3], a.scalars[4], a.scalars[5], a.scalars[6], a.scalars[7], b.scalars[0], b.scalars[1], b.scalars[2])) }
- U16x8 inline vectorExtract_4(const U16x8 &a, const U16x8 &b) { VECTOR_EXTRACT_GENERATOR_U16(4, U16x8(a.scalars[4], a.scalars[5], a.scalars[6], a.scalars[7], b.scalars[0], b.scalars[1], b.scalars[2], b.scalars[3])) }
- U16x8 inline vectorExtract_5(const U16x8 &a, const U16x8 &b) { VECTOR_EXTRACT_GENERATOR_U16(5, U16x8(a.scalars[5], a.scalars[6], a.scalars[7], b.scalars[0], b.scalars[1], b.scalars[2], b.scalars[3], b.scalars[4])) }
- U16x8 inline vectorExtract_6(const U16x8 &a, const U16x8 &b) { VECTOR_EXTRACT_GENERATOR_U16(6, U16x8(a.scalars[6], a.scalars[7], b.scalars[0], b.scalars[1], b.scalars[2], b.scalars[3], b.scalars[4], b.scalars[5])) }
- U16x8 inline vectorExtract_7(const U16x8 &a, const U16x8 &b) { VECTOR_EXTRACT_GENERATOR_U16(7, U16x8(a.scalars[7], b.scalars[0], b.scalars[1], b.scalars[2], b.scalars[3], b.scalars[4], b.scalars[5], b.scalars[6])) }
- U16x8 inline vectorExtract_8(const U16x8 &a, const U16x8 &b) { return b; }
- U32x4 inline vectorExtract_0(const U32x4 &a, const U32x4 &b) { return a; }
- U32x4 inline vectorExtract_1(const U32x4 &a, const U32x4 &b) { VECTOR_EXTRACT_GENERATOR_U32(1, U32x4(a.scalars[1], a.scalars[2], a.scalars[3], b.scalars[0])) }
- U32x4 inline vectorExtract_2(const U32x4 &a, const U32x4 &b) { VECTOR_EXTRACT_GENERATOR_U32(2, U32x4(a.scalars[2], a.scalars[3], b.scalars[0], b.scalars[1])) }
- U32x4 inline vectorExtract_3(const U32x4 &a, const U32x4 &b) { VECTOR_EXTRACT_GENERATOR_U32(3, U32x4(a.scalars[3], b.scalars[0], b.scalars[1], b.scalars[2])) }
- U32x4 inline vectorExtract_4(const U32x4 &a, const U32x4 &b) { return b; }
- I32x4 inline vectorExtract_0(const I32x4 &a, const I32x4 &b) { return a; }
- I32x4 inline vectorExtract_1(const I32x4 &a, const I32x4 &b) { VECTOR_EXTRACT_GENERATOR_I32(1, I32x4(a.scalars[1], a.scalars[2], a.scalars[3], b.scalars[0])) }
- I32x4 inline vectorExtract_2(const I32x4 &a, const I32x4 &b) { VECTOR_EXTRACT_GENERATOR_I32(2, I32x4(a.scalars[2], a.scalars[3], b.scalars[0], b.scalars[1])) }
- I32x4 inline vectorExtract_3(const I32x4 &a, const I32x4 &b) { VECTOR_EXTRACT_GENERATOR_I32(3, I32x4(a.scalars[3], b.scalars[0], b.scalars[1], b.scalars[2])) }
- I32x4 inline vectorExtract_4(const I32x4 &a, const I32x4 &b) { return b; }
- F32x4 inline vectorExtract_0(const F32x4 &a, const F32x4 &b) { return a; }
- F32x4 inline vectorExtract_1(const F32x4 &a, const F32x4 &b) { VECTOR_EXTRACT_GENERATOR_F32(1, F32x4(a.scalars[1], a.scalars[2], a.scalars[3], b.scalars[0])) }
- F32x4 inline vectorExtract_2(const F32x4 &a, const F32x4 &b) { VECTOR_EXTRACT_GENERATOR_F32(2, F32x4(a.scalars[2], a.scalars[3], b.scalars[0], b.scalars[1])) }
- F32x4 inline vectorExtract_3(const F32x4 &a, const F32x4 &b) { VECTOR_EXTRACT_GENERATOR_F32(3, F32x4(a.scalars[3], b.scalars[0], b.scalars[1], b.scalars[2])) }
- F32x4 inline vectorExtract_4(const F32x4 &a, const F32x4 &b) { return b; }
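- // A minimal usage sketch (illustrative names) of the recipe in the comment above: a three-tap horizontal
- // average where previous, center, and next hold twelve consecutive floats.
- static inline F32x4 exampleSketch_horizontalAverage(const F32x4 &previous, const F32x4 &center, const F32x4 &next) {
- // vectorExtract_3 shifts one element to the right to fetch each lane's left neighbor,
- // and vectorExtract_1 shifts one element to the left to fetch each lane's right neighbor.
- F32x4 leftNeighbor = vectorExtract_3(previous, center);
- F32x4 rightNeighbor = vectorExtract_1(center, next);
- return (leftNeighbor + center + rightNeighbor) * F32x4(1.0f / 3.0f);
- }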
- // Gather instructions load memory from a pointer at multiple index offsets at the same time.
- // The given pointers should be aligned to 4 bytes, so that the fallback solution works on machines with strict alignment requirements.
- #if defined USE_AVX2
- #define GATHER_I32x4_AVX2(SOURCE, FOUR_OFFSETS, SCALE) _mm_i32gather_epi32((const int32_t*)(SOURCE), FOUR_OFFSETS, SCALE)
- #define GATHER_U32x4_AVX2(SOURCE, FOUR_OFFSETS, SCALE) _mm_i32gather_epi32((const int32_t*)(SOURCE), FOUR_OFFSETS, SCALE)
- #define GATHER_F32x4_AVX2(SOURCE, FOUR_OFFSETS, SCALE) _mm_i32gather_ps((const float*)(SOURCE), FOUR_OFFSETS, SCALE)
- #endif
- static inline U32x4 gather_U32(dsr::SafePointer<const uint32_t> data, const U32x4 &elementOffset) {
- #if defined USE_AVX2
- // TODO: Implement safety checks for debug mode.
- return U32x4(GATHER_U32x4_AVX2(data.getUnsafe(), elementOffset.v, 4));
- #else
- ALIGN16 uint32_t elementOffsets[4];
- elementOffset.writeAlignedUnsafe(elementOffsets);
- return U32x4(
- *(data + elementOffsets[0]),
- *(data + elementOffsets[1]),
- *(data + elementOffsets[2]),
- *(data + elementOffsets[3])
- );
- #endif
- }
- static inline I32x4 gather_I32(dsr::SafePointer<const int32_t> data, const U32x4 &elementOffset) {
- #if defined USE_AVX2
- // TODO: Implement safety checks for debug mode.
- return I32x4(GATHER_I32x4_AVX2(data.getUnsafe(), elementOffset.v, 4));
- #else
- ALIGN16 uint32_t elementOffsets[4];
- elementOffset.writeAlignedUnsafe(elementOffsets);
- return I32x4(
- *(data + elementOffsets[0]),
- *(data + elementOffsets[1]),
- *(data + elementOffsets[2]),
- *(data + elementOffsets[3])
- );
- #endif
- }
- static inline F32x4 gather_F32(dsr::SafePointer<const float> data, const U32x4 &elementOffset) {
- #if defined USE_AVX2
- // TODO: Implement safety checks for debug mode.
- return F32x4(GATHER_F32x4_AVX2(data.getUnsafe(), elementOffset.v, 4));
- #else
- ALIGN16 uint32_t elementOffsets[4];
- elementOffset.writeAlignedUnsafe(elementOffsets);
- return F32x4(
- *(data + elementOffsets[0]),
- *(data + elementOffsets[1]),
- *(data + elementOffsets[2]),
- *(data + elementOffsets[3])
- );
- #endif
- }
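- // A minimal usage sketch (illustrative names): a gather is a four-lane table lookup, for example applying a
- // 256-entry gamma table to four color values at once.
- static inline F32x4 exampleSketch_applyGammaTable(dsr::SafePointer<const float> gammaTable, const U32x4 &indices) {
- // Each lane of indices selects one float from the table, in one instruction on AVX2 and element by element otherwise.
- return gather_F32(gammaTable, indices);
- }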
- inline F32x8 operator+(const F32x8& left, const F32x8& right) {
- #if defined USE_256BIT_F_SIMD
- return F32x8(ADD_F32_SIMD256(left.v, right.v));
- #else
- return F32x8(
- left.scalars[0] + right.scalars[0],
- left.scalars[1] + right.scalars[1],
- left.scalars[2] + right.scalars[2],
- left.scalars[3] + right.scalars[3],
- left.scalars[4] + right.scalars[4],
- left.scalars[5] + right.scalars[5],
- left.scalars[6] + right.scalars[6],
- left.scalars[7] + right.scalars[7]
- );
- #endif
- }
- inline F32x8 operator-(const F32x8& left, const F32x8& right) {
- #if defined USE_256BIT_F_SIMD
- return F32x8(SUB_F32_SIMD256(left.v, right.v));
- #else
- return F32x8(
- left.scalars[0] - right.scalars[0],
- left.scalars[1] - right.scalars[1],
- left.scalars[2] - right.scalars[2],
- left.scalars[3] - right.scalars[3],
- left.scalars[4] - right.scalars[4],
- left.scalars[5] - right.scalars[5],
- left.scalars[6] - right.scalars[6],
- left.scalars[7] - right.scalars[7]
- );
- #endif
- }
- inline F32x8 operator*(const F32x8& left, const F32x8& right) {
- #if defined USE_256BIT_F_SIMD
- return F32x8(MUL_F32_SIMD256(left.v, right.v));
- #else
- return F32x8(
- left.scalars[0] * right.scalars[0],
- left.scalars[1] * right.scalars[1],
- left.scalars[2] * right.scalars[2],
- left.scalars[3] * right.scalars[3],
- left.scalars[4] * right.scalars[4],
- left.scalars[5] * right.scalars[5],
- left.scalars[6] * right.scalars[6],
- left.scalars[7] * right.scalars[7]
- );
- #endif
- }
- inline F32x8 min(const F32x8& left, const F32x8& right) {
- #if defined USE_256BIT_F_SIMD
- return F32x8(MIN_F32_SIMD256(left.v, right.v));
- #else
- float v0 = left.scalars[0];
- float v1 = left.scalars[1];
- float v2 = left.scalars[2];
- float v3 = left.scalars[3];
- float v4 = left.scalars[4];
- float v5 = left.scalars[5];
- float v6 = left.scalars[6];
- float v7 = left.scalars[7];
- float r0 = right.scalars[0];
- float r1 = right.scalars[1];
- float r2 = right.scalars[2];
- float r3 = right.scalars[3];
- float r4 = right.scalars[4];
- float r5 = right.scalars[5];
- float r6 = right.scalars[6];
- float r7 = right.scalars[7];
- if (r0 < v0) { v0 = r0; }
- if (r1 < v1) { v1 = r1; }
- if (r2 < v2) { v2 = r2; }
- if (r3 < v3) { v3 = r3; }
- if (r4 < v4) { v4 = r4; }
- if (r5 < v5) { v5 = r5; }
- if (r6 < v6) { v6 = r6; }
- if (r7 < v7) { v7 = r7; }
- return F32x8(v0, v1, v2, v3, v4, v5, v6, v7);
- #endif
- }
- inline F32x8 max(const F32x8& left, const F32x8& right) {
- #if defined USE_256BIT_F_SIMD
- return F32x8(MAX_F32_SIMD256(left.v, right.v));
- #else
- float v0 = left.scalars[0];
- float v1 = left.scalars[1];
- float v2 = left.scalars[2];
- float v3 = left.scalars[3];
- float v4 = left.scalars[4];
- float v5 = left.scalars[5];
- float v6 = left.scalars[6];
- float v7 = left.scalars[7];
- float r0 = right.scalars[0];
- float r1 = right.scalars[1];
- float r2 = right.scalars[2];
- float r3 = right.scalars[3];
- float r4 = right.scalars[4];
- float r5 = right.scalars[5];
- float r6 = right.scalars[6];
- float r7 = right.scalars[7];
- if (r0 > v0) { v0 = r0; }
- if (r1 > v1) { v1 = r1; }
- if (r2 > v2) { v2 = r2; }
- if (r3 > v3) { v3 = r3; }
- if (r4 > v4) { v4 = r4; }
- if (r5 > v5) { v5 = r5; }
- if (r6 > v6) { v6 = r6; }
- if (r7 > v7) { v7 = r7; }
- return F32x8(v0, v1, v2, v3, v4, v5, v6, v7);
- #endif
- }
- inline I32x8 operator+(const I32x8& left, const I32x8& right) {
- #if defined USE_256BIT_X_SIMD
- return I32x8(ADD_I32_SIMD256(left.v, right.v));
- #else
- return I32x8(
- left.scalars[0] + right.scalars[0],
- left.scalars[1] + right.scalars[1],
- left.scalars[2] + right.scalars[2],
- left.scalars[3] + right.scalars[3],
- left.scalars[4] + right.scalars[4],
- left.scalars[5] + right.scalars[5],
- left.scalars[6] + right.scalars[6],
- left.scalars[7] + right.scalars[7]);
- #endif
- }
- inline I32x8 operator-(const I32x8& left, const I32x8& right) {
- #if defined USE_256BIT_X_SIMD
- return I32x8(SUB_I32_SIMD256(left.v, right.v));
- #else
- return I32x8(
- left.scalars[0] - right.scalars[0],
- left.scalars[1] - right.scalars[1],
- left.scalars[2] - right.scalars[2],
- left.scalars[3] - right.scalars[3],
- left.scalars[4] - right.scalars[4],
- left.scalars[5] - right.scalars[5],
- left.scalars[6] - right.scalars[6],
- left.scalars[7] - right.scalars[7]);
- #endif
- }
- inline I32x8 operator*(const I32x8& left, const I32x8& right) {
- #if defined USE_AVX2
- return I32x8(MUL_I32_SIMD256(left.v, right.v));
- #else
- return I32x8(
- left.scalars[0] * right.scalars[0],
- left.scalars[1] * right.scalars[1],
- left.scalars[2] * right.scalars[2],
- left.scalars[3] * right.scalars[3],
- left.scalars[4] * right.scalars[4],
- left.scalars[5] * right.scalars[5],
- left.scalars[6] * right.scalars[6],
- left.scalars[7] * right.scalars[7]
- );
- #endif
- }
- inline U32x8 operator+(const U32x8& left, const U32x8& right) {
- #if defined USE_256BIT_X_SIMD
- return U32x8(ADD_U32_SIMD256(left.v, right.v));
- #else
- return U32x8(
- left.scalars[0] + right.scalars[0],
- left.scalars[1] + right.scalars[1],
- left.scalars[2] + right.scalars[2],
- left.scalars[3] + right.scalars[3],
- left.scalars[4] + right.scalars[4],
- left.scalars[5] + right.scalars[5],
- left.scalars[6] + right.scalars[6],
- left.scalars[7] + right.scalars[7]
- );
- #endif
- }
- inline U32x8 operator-(const U32x8& left, const U32x8& right) {
- #if defined USE_256BIT_X_SIMD
- return U32x8(SUB_U32_SIMD256(left.v, right.v));
- #else
- return U32x8(
- left.scalars[0] - right.scalars[0],
- left.scalars[1] - right.scalars[1],
- left.scalars[2] - right.scalars[2],
- left.scalars[3] - right.scalars[3],
- left.scalars[4] - right.scalars[4],
- left.scalars[5] - right.scalars[5],
- left.scalars[6] - right.scalars[6],
- left.scalars[7] - right.scalars[7]
- );
- #endif
- }
- inline U32x8 operator*(const U32x8& left, const U32x8& right) {
- #if defined USE_256BIT_X_SIMD
- return U32x8(MUL_U32_SIMD256(left.v, right.v));
- #else
- return U32x8(
- left.scalars[0] * right.scalars[0],
- left.scalars[1] * right.scalars[1],
- left.scalars[2] * right.scalars[2],
- left.scalars[3] * right.scalars[3],
- left.scalars[4] * right.scalars[4],
- left.scalars[5] * right.scalars[5],
- left.scalars[6] * right.scalars[6],
- left.scalars[7] * right.scalars[7]
- );
- #endif
- }
- inline U32x8 operator&(const U32x8& left, const U32x8& right) {
- #if defined USE_256BIT_X_SIMD
- return U32x8(BITWISE_AND_U32_SIMD256(left.v, right.v));
- #else
- return U32x8(
- left.scalars[0] & right.scalars[0],
- left.scalars[1] & right.scalars[1],
- left.scalars[2] & right.scalars[2],
- left.scalars[3] & right.scalars[3],
- left.scalars[4] & right.scalars[4],
- left.scalars[5] & right.scalars[5],
- left.scalars[6] & right.scalars[6],
- left.scalars[7] & right.scalars[7]
- );
- #endif
- }
- inline U32x8 operator|(const U32x8& left, const U32x8& right) {
- #if defined USE_256BIT_X_SIMD
- return U32x8(BITWISE_OR_U32_SIMD256(left.v, right.v));
- #else
- return U32x8(
- left.scalars[0] | right.scalars[0],
- left.scalars[1] | right.scalars[1],
- left.scalars[2] | right.scalars[2],
- left.scalars[3] | right.scalars[3],
- left.scalars[4] | right.scalars[4],
- left.scalars[5] | right.scalars[5],
- left.scalars[6] | right.scalars[6],
- left.scalars[7] | right.scalars[7]
- );
- #endif
- }
- inline U32x8 operator^(const U32x8& left, const U32x8& right) {
- #if defined USE_256BIT_X_SIMD
- return U32x8(BITWISE_XOR_U32_SIMD256(left.v, right.v));
- #else
- return U32x8(
- left.scalars[0] ^ right.scalars[0],
- left.scalars[1] ^ right.scalars[1],
- left.scalars[2] ^ right.scalars[2],
- left.scalars[3] ^ right.scalars[3],
- left.scalars[4] ^ right.scalars[4],
- left.scalars[5] ^ right.scalars[5],
- left.scalars[6] ^ right.scalars[6],
- left.scalars[7] ^ right.scalars[7]
- );
- #endif
- }
- inline U32x8 operator~(const U32x8& value) {
- #if defined USE_256BIT_X_SIMD
- return value ^ U32x8(~uint32_t(0));
- #else
- return U32x8(
- ~value.scalars[0],
- ~value.scalars[1],
- ~value.scalars[2],
- ~value.scalars[3],
- ~value.scalars[4],
- ~value.scalars[5],
- ~value.scalars[6],
- ~value.scalars[7]
- );
- #endif
- }
- // Variable per-lane shifting is implemented with scalar operations, because ARM NEON does not support 256-bit vectors.
- inline U32x8 operator<<(const U32x8& left, const U32x8 &bitOffsets) {
- assert(allLanesLesser(bitOffsets, U32x8(32u)));
- return U32x8(
- left.scalars[0] << bitOffsets.scalars[0],
- left.scalars[1] << bitOffsets.scalars[1],
- left.scalars[2] << bitOffsets.scalars[2],
- left.scalars[3] << bitOffsets.scalars[3],
- left.scalars[4] << bitOffsets.scalars[4],
- left.scalars[5] << bitOffsets.scalars[5],
- left.scalars[6] << bitOffsets.scalars[6],
- left.scalars[7] << bitOffsets.scalars[7]
- );
- }
- inline U32x8 operator>>(const U32x8& left, const U32x8 &bitOffsets) {
- assert(allLanesLesser(bitOffsets, U32x8(32u)));
- return U32x8(
- left.scalars[0] >> bitOffsets.scalars[0],
- left.scalars[1] >> bitOffsets.scalars[1],
- left.scalars[2] >> bitOffsets.scalars[2],
- left.scalars[3] >> bitOffsets.scalars[3],
- left.scalars[4] >> bitOffsets.scalars[4],
- left.scalars[5] >> bitOffsets.scalars[5],
- left.scalars[6] >> bitOffsets.scalars[6],
- left.scalars[7] >> bitOffsets.scalars[7]
- );
- }
- // bitOffset must be an immediate constant from 0 to 31, so a template argument is used.
- template <uint32_t bitOffset>
- inline U32x8 bitShiftLeftImmediate(const U32x8& left) {
- static_assert(bitOffset < 32u);
- #if defined USE_AVX2
- return U32x8(_mm256_slli_epi32(left.v, bitOffset));
- #else
- return U32x8(
- left.scalars[0] << bitOffset,
- left.scalars[1] << bitOffset,
- left.scalars[2] << bitOffset,
- left.scalars[3] << bitOffset,
- left.scalars[4] << bitOffset,
- left.scalars[5] << bitOffset,
- left.scalars[6] << bitOffset,
- left.scalars[7] << bitOffset
- );
- #endif
- }
- // bitOffset must be an immediate constant from 0 to 31, so a template argument is used.
- template <uint32_t bitOffset>
- inline U32x8 bitShiftRightImmediate(const U32x8& left) {
- static_assert(bitOffset < 32u);
- #if defined USE_AVX2
- return U32x8(_mm256_srli_epi32(left.v, bitOffset));
- #else
- return U32x8(
- left.scalars[0] >> bitOffset,
- left.scalars[1] >> bitOffset,
- left.scalars[2] >> bitOffset,
- left.scalars[3] >> bitOffset,
- left.scalars[4] >> bitOffset,
- left.scalars[5] >> bitOffset,
- left.scalars[6] >> bitOffset,
- left.scalars[7] >> bitOffset
- );
- #endif
- }
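- // A minimal usage sketch (illustrative name): immediate shifts and a mask unpack one channel from packed
- // 32-bit colors, here assuming the green channel sits in bits 8..15 of each lane.
- static inline U32x8 exampleSketch_extractGreen(const U32x8 &packedColors) {
- // Shift the channel down to the lowest byte and mask away the other channels.
- return bitShiftRightImmediate<8>(packedColors) & U32x8(255u);
- }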
- inline U16x16 operator+(const U16x16& left, const U16x16& right) {
- #if defined USE_256BIT_X_SIMD
- return U16x16(ADD_U16_SIMD256(left.v, right.v));
- #else
- return U16x16(
- left.scalars[0] + right.scalars[0],
- left.scalars[1] + right.scalars[1],
- left.scalars[2] + right.scalars[2],
- left.scalars[3] + right.scalars[3],
- left.scalars[4] + right.scalars[4],
- left.scalars[5] + right.scalars[5],
- left.scalars[6] + right.scalars[6],
- left.scalars[7] + right.scalars[7],
- left.scalars[8] + right.scalars[8],
- left.scalars[9] + right.scalars[9],
- left.scalars[10] + right.scalars[10],
- left.scalars[11] + right.scalars[11],
- left.scalars[12] + right.scalars[12],
- left.scalars[13] + right.scalars[13],
- left.scalars[14] + right.scalars[14],
- left.scalars[15] + right.scalars[15]
- );
- #endif
- }
- inline U16x16 operator-(const U16x16& left, const U16x16& right) {
- #if defined USE_256BIT_X_SIMD
- return U16x16(SUB_U16_SIMD256(left.v, right.v));
- #else
- return U16x16(
- left.scalars[0] - right.scalars[0],
- left.scalars[1] - right.scalars[1],
- left.scalars[2] - right.scalars[2],
- left.scalars[3] - right.scalars[3],
- left.scalars[4] - right.scalars[4],
- left.scalars[5] - right.scalars[5],
- left.scalars[6] - right.scalars[6],
- left.scalars[7] - right.scalars[7],
- left.scalars[8] - right.scalars[8],
- left.scalars[9] - right.scalars[9],
- left.scalars[10] - right.scalars[10],
- left.scalars[11] - right.scalars[11],
- left.scalars[12] - right.scalars[12],
- left.scalars[13] - right.scalars[13],
- left.scalars[14] - right.scalars[14],
- left.scalars[15] - right.scalars[15]
- );
- #endif
- }
- inline U16x16 operator*(const U16x16& left, const U16x16& right) {
- #if defined USE_256BIT_X_SIMD
- return U16x16(MUL_U16_SIMD256(left.v, right.v));
- #else
- return U16x16(
- left.scalars[0] * right.scalars[0],
- left.scalars[1] * right.scalars[1],
- left.scalars[2] * right.scalars[2],
- left.scalars[3] * right.scalars[3],
- left.scalars[4] * right.scalars[4],
- left.scalars[5] * right.scalars[5],
- left.scalars[6] * right.scalars[6],
- left.scalars[7] * right.scalars[7],
- left.scalars[8] * right.scalars[8],
- left.scalars[9] * right.scalars[9],
- left.scalars[10] * right.scalars[10],
- left.scalars[11] * right.scalars[11],
- left.scalars[12] * right.scalars[12],
- left.scalars[13] * right.scalars[13],
- left.scalars[14] * right.scalars[14],
- left.scalars[15] * right.scalars[15]
- );
- #endif
- }
- inline U8x32 operator+(const U8x32& left, const U8x32& right) {
- #if defined USE_256BIT_X_SIMD
- return U8x32(ADD_U8_SIMD256(left.v, right.v));
- #else
- U8x32 result = U8x32::create_dangerous_uninitialized();
- for (int i = 0; i < 32; i++) {
- result.scalars[i] = left.scalars[i] + right.scalars[i];
- }
- return result;
- #endif
- }
- inline U8x32 operator-(const U8x32& left, const U8x32& right) {
- #if defined USE_256BIT_X_SIMD
- return U8x32(SUB_U8_SIMD256(left.v, right.v));
- #else
- U8x32 result = U8x32::create_dangerous_uninitialized();
- for (int i = 0; i < 32; i++) {
- result.scalars[i] = left.scalars[i] - right.scalars[i];
- }
- return result;
- #endif
- }
- inline U8x32 saturatedAddition(const U8x32& left, const U8x32& right) {
- #if defined USE_256BIT_X_SIMD
- return U8x32(ADD_SAT_U8_SIMD256(left.v, right.v));
- #else
- U8x32 result = U8x32::create_dangerous_uninitialized();
- for (int i = 0; i < 32; i++) {
- result.scalars[i] = impl_limit255((uint32_t)left.scalars[i] + (uint32_t)right.scalars[i]);
- }
- return result;
- #endif
- }
- inline U8x32 saturatedSubtraction(const U8x32& left, const U8x32& right) {
- #if defined USE_256BIT_X_SIMD
- return U8x32(SUB_SAT_U8_SIMD256(left.v, right.v));
- #else
- U8x32 result = U8x32::create_dangerous_uninitialized();
- for (int i = 0; i < 32; i++) {
- result.scalars[i] = impl_limit0((int32_t)left.scalars[i] - (int32_t)right.scalars[i]);
- }
- return result;
- #endif
- }
- inline I32x8 truncateToI32(const F32x8& vector) {
- #if defined USE_256BIT_X_SIMD
- return I32x8(F32_TO_I32_SIMD256(vector.v));
- #else
- return I32x8(
- (int32_t)vector.scalars[0], (int32_t)vector.scalars[1], (int32_t)vector.scalars[2], (int32_t)vector.scalars[3],
- (int32_t)vector.scalars[4], (int32_t)vector.scalars[5], (int32_t)vector.scalars[6], (int32_t)vector.scalars[7]
- );
- #endif
- }
- inline U32x8 truncateToU32(const F32x8& vector) {
- #if defined USE_256BIT_X_SIMD
- return U32x8(F32_TO_U32_SIMD256(vector.v));
- #else
- return U32x8(
- (uint32_t)vector.scalars[0], (uint32_t)vector.scalars[1], (uint32_t)vector.scalars[2], (uint32_t)vector.scalars[3],
- (uint32_t)vector.scalars[4], (uint32_t)vector.scalars[5], (uint32_t)vector.scalars[6], (uint32_t)vector.scalars[7]
- );
- #endif
- }
- inline F32x8 floatFromI32(const I32x8& vector) {
- #if defined USE_256BIT_X_SIMD
- return F32x8(I32_TO_F32_SIMD256(vector.v));
- #else
- return F32x8(
- (float)vector.scalars[0], (float)vector.scalars[1], (float)vector.scalars[2], (float)vector.scalars[3],
- (float)vector.scalars[4], (float)vector.scalars[5], (float)vector.scalars[6], (float)vector.scalars[7]
- );
- #endif
- }
- inline F32x8 floatFromU32(const U32x8& vector) {
- #if defined USE_256BIT_X_SIMD
- return F32x8(U32_TO_F32_SIMD256(vector.v));
- #else
- return F32x8(
- (float)vector.scalars[0], (float)vector.scalars[1], (float)vector.scalars[2], (float)vector.scalars[3],
- (float)vector.scalars[4], (float)vector.scalars[5], (float)vector.scalars[6], (float)vector.scalars[7]
- );
- #endif
- }
- inline I32x8 I32FromU32(const U32x8& vector) {
- #if defined USE_256BIT_X_SIMD
- return I32x8(REINTERPRET_U32_TO_I32_SIMD256(vector.v));
- #else
- return I32x8(
- (int32_t)vector.scalars[0], (int32_t)vector.scalars[1], (int32_t)vector.scalars[2], (int32_t)vector.scalars[3],
- (int32_t)vector.scalars[4], (int32_t)vector.scalars[5], (int32_t)vector.scalars[6], (int32_t)vector.scalars[7]
- );
- #endif
- }
- inline U32x8 U32FromI32(const I32x8& vector) {
- #if defined USE_256BIT_X_SIMD
- return U32x8(REINTERPRET_I32_TO_U32_SIMD256(vector.v));
- #else
- return U32x8(
- (uint32_t)vector.scalars[0], (uint32_t)vector.scalars[1], (uint32_t)vector.scalars[2], (uint32_t)vector.scalars[3],
- (uint32_t)vector.scalars[4], (uint32_t)vector.scalars[5], (uint32_t)vector.scalars[6], (uint32_t)vector.scalars[7]
- );
- #endif
- }
- // Warning! Behavior depends on endianness.
- inline U8x32 reinterpret_U8FromU32(const U32x8& vector) {
- #if defined USE_256BIT_X_SIMD
- return U8x32(REINTERPRET_U32_TO_U8_SIMD256(vector.v));
- #else
- const uint8_t *source = (const uint8_t*)vector.scalars;
- U8x32 result = U8x32::create_dangerous_uninitialized();
- for (int i = 0; i < 32; i++) {
- result.scalars[i] = source[i];
- }
- return result;
- #endif
- }
- // Warning! Behavior depends on endianness.
- inline U32x8 reinterpret_U32FromU8(const U8x32& vector) {
- #if defined USE_256BIT_X_SIMD
- return U32x8(REINTERPRET_U8_TO_U32_SIMD256(vector.v));
- #else
- const uint32_t *source = (const uint32_t*)vector.scalars;
- return U32x8(source[0], source[1], source[2], source[3], source[4], source[5], source[6], source[7]);
- #endif
- }
- // Warning! Behavior depends on endianness.
- inline U16x16 reinterpret_U16FromU32(const U32x8& vector) {
- #if defined USE_256BIT_X_SIMD
- return U16x16(REINTERPRET_U32_TO_U16_SIMD256(vector.v));
- #else
- const uint16_t *source = (const uint16_t*)vector.scalars;
- return U16x16(
- source[0], source[1], source[2] , source[3] , source[4] , source[5] , source[6] , source[7] ,
- source[8], source[9], source[10], source[11], source[12], source[13], source[14], source[15]
- );
- #endif
- }
- // Warning! Behavior depends on endianness.
- inline U32x8 reinterpret_U32FromU16(const U16x16& vector) {
- #if defined USE_256BIT_X_SIMD
- return U32x8(REINTERPRET_U16_TO_U32_SIMD256(vector.v));
- #else
- const uint32_t *source = (const uint32_t*)vector.scalars;
- return U32x8(source[0], source[1], source[2], source[3], source[4], source[5], source[6], source[7]);
- #endif
- }
- // Unpacking to larger integers
- inline U32x8 lowerToU32(const U16x16& vector) {
- #if defined USE_256BIT_X_SIMD
- return U32x8(U16_LOW_TO_U32_SIMD256(vector.v));
- #else
- return U32x8(vector.scalars[0], vector.scalars[1], vector.scalars[2], vector.scalars[3], vector.scalars[4], vector.scalars[5], vector.scalars[6], vector.scalars[7]);
- #endif
- }
- inline U32x8 higherToU32(const U16x16& vector) {
- #if defined USE_256BIT_X_SIMD
- return U32x8(U16_HIGH_TO_U32_SIMD256(vector.v));
- #else
- return U32x8(vector.scalars[8], vector.scalars[9], vector.scalars[10], vector.scalars[11], vector.scalars[12], vector.scalars[13], vector.scalars[14], vector.scalars[15]);
- #endif
- }
- inline U16x16 lowerToU16(const U8x32& vector) {
- #if defined USE_256BIT_X_SIMD
- return U16x16(U8_LOW_TO_U16_SIMD256(vector.v));
- #else
- return U16x16(
- vector.scalars[0], vector.scalars[1], vector.scalars[2], vector.scalars[3],
- vector.scalars[4], vector.scalars[5], vector.scalars[6], vector.scalars[7],
- vector.scalars[8], vector.scalars[9], vector.scalars[10], vector.scalars[11],
- vector.scalars[12], vector.scalars[13], vector.scalars[14], vector.scalars[15]
- );
- #endif
- }
- inline U16x16 higherToU16(const U8x32& vector) {
- #if defined USE_256BIT_X_SIMD
- return U16x16(U8_HIGH_TO_U16_SIMD256(vector.v));
- #else
- return U16x16(
- vector.scalars[16], vector.scalars[17], vector.scalars[18], vector.scalars[19],
- vector.scalars[20], vector.scalars[21], vector.scalars[22], vector.scalars[23],
- vector.scalars[24], vector.scalars[25], vector.scalars[26], vector.scalars[27],
- vector.scalars[28], vector.scalars[29], vector.scalars[30], vector.scalars[31]
- );
- #endif
- }
- // Saturated packing
- inline U8x32 saturateToU8(const U16x16& lower, const U16x16& upper) {
- #if defined USE_256BIT_X_SIMD
- return U8x32(PACK_SAT_U16_TO_U8_SIMD256(lower.v, upper.v));
- #else
- U8x32 result = U8x32::create_dangerous_uninitialized();
- for (int i = 0; i < 16; i++) {
- result.scalars[i] = impl_limit255(lower.scalars[i]);
- }
- for (int i = 0; i < 16; i++) {
- result.scalars[i + 16] = impl_limit255(upper.scalars[i]);
- }
- return result;
- #endif
- }
- // Unary negation for convenience and code readability.
- // Before using unary negation, always check whether:
- // * The addition can be rewritten as a subtraction: x = -a + b becomes x = b - a.
- // * A multiplying constant or scalar can be negated instead: x = -b * 2 becomes x = b * -2.
- inline F32x8 operator-(const F32x8& value) {
- #if defined USE_256BIT_F_SIMD
- return F32x8(0.0f) - value;
- #else
- return F32x8(
- -value.scalars[0], -value.scalars[1], -value.scalars[2], -value.scalars[3],
- -value.scalars[4], -value.scalars[5], -value.scalars[6], -value.scalars[7]
- );
- #endif
- }
- inline I32x8 operator-(const I32x8& value) {
- #if defined USE_256BIT_X_SIMD
- return I32x8(0) - value;
- #else
- return I32x8(
- -value.scalars[0], -value.scalars[1], -value.scalars[2], -value.scalars[3],
- -value.scalars[4], -value.scalars[5], -value.scalars[6], -value.scalars[7]
- );
- #endif
- }
- // Helper macros for generating the vector extract functions.
- // Having one function for each type and offset makes sure that the compiler gets an immediate integer within the valid range.
- #if defined USE_AVX2
- // AVX2 does not offer any 256-bit element extraction, only two 128-bit shifts done in parallel, so we might as well use two separate 128-bit extractions.
- template <int OFFSET>
- __m256i impl_extractBytes_AVX2(const __m256i leftInput, const __m256i rightInput) {
- // Extract the three 128-bit halves that can overlap with the extracted region, depending on the offset.
- __m128i leftPart = _mm256_extractf128_si256(leftInput, (OFFSET < 16) ? 0 : 1);
- __m128i middlePart = _mm256_extractf128_si256(OFFSET < 16 ? rightInput : leftInput, (OFFSET < 16) ? 1 : 0);
- __m128i rightPart = _mm256_extractf128_si256(rightInput, (OFFSET < 16) ? 0 : 1);
- // Combine two 128-bit extracts into a whole 256-bit extract.
- return _mm256_set_m128i(
- _mm_alignr_epi8(leftPart, middlePart, OFFSET - ((OFFSET < 16) ? 0 : 16)),
- _mm_alignr_epi8(middlePart, rightPart, OFFSET - ((OFFSET < 16) ? 0 : 16))
- );
- }
- #define VECTOR_EXTRACT_GENERATOR_256_U8(OFFSET) return U8x32(impl_extractBytes_AVX2<OFFSET>(b.v, a.v));
- #define VECTOR_EXTRACT_GENERATOR_256_U16(OFFSET) return U16x16(impl_extractBytes_AVX2<OFFSET * 2>(b.v, a.v));
- #define VECTOR_EXTRACT_GENERATOR_256_U32(OFFSET) return U32x8(impl_extractBytes_AVX2<OFFSET * 4>(b.v, a.v));
- #define VECTOR_EXTRACT_GENERATOR_256_I32(OFFSET) return I32x8(impl_extractBytes_AVX2<OFFSET * 4>(b.v, a.v));
- #define VECTOR_EXTRACT_GENERATOR_256_F32(OFFSET) return F32x8(SIMD_F32x8(impl_extractBytes_AVX2<OFFSET * 4>(SIMD_U32x8(b.v), SIMD_U32x8(a.v))));
- #else
- template<typename T, int elementCount>
- T vectorExtract_emulated(const T &a, const T &b, int offset) {
- // For safety reasons, uninitialized default construction of SIMD vectors is only available during full emulation, not partial AVX support.
- T result = T::create_dangerous_uninitialized();
- int t = 0;
- for (int s = offset; s < elementCount; s++) {
- result.scalars[t] = a.scalars[s];
- t++;
- }
- for (int s = 0; s < offset; s++) {
- result.scalars[t] = b.scalars[s];
- t++;
- }
- return result;
- }
- #define VECTOR_EXTRACT_GENERATOR_256_U8(OFFSET) return vectorExtract_emulated<U8x32, 32>(a, b, OFFSET);
- #define VECTOR_EXTRACT_GENERATOR_256_U16(OFFSET) return vectorExtract_emulated<U16x16, 16>(a, b, OFFSET);
- #define VECTOR_EXTRACT_GENERATOR_256_U32(OFFSET) return vectorExtract_emulated<U32x8, 8>(a, b, OFFSET);
- #define VECTOR_EXTRACT_GENERATOR_256_I32(OFFSET) return vectorExtract_emulated<I32x8, 8>(a, b, OFFSET);
- #define VECTOR_EXTRACT_GENERATOR_256_F32(OFFSET) return vectorExtract_emulated<F32x8, 8>(a, b, OFFSET);
- #endif
- // Vector extraction concatenates two input vectors and reads a vector between them using an offset.
- // The first and last offsets that only return one of the inputs can be used for readability, because they will be inlined and removed by the compiler.
- // To get elements from the right side, combine the center vector with the right vector and shift one element to the left using vectorExtract_1 for the given type.
- // To get elements from the left side, combine the left vector with the center vector and shift one element to the right using vectorExtract_15 for 16 lanes, vectorExtract_7 for 8 lanes, or vectorExtract_3 for 4 lanes.
- // TODO: Also allow using a template argument as the element offset with a static assert for the offset, which might be useful in template programming.
- U8x32 inline vectorExtract_0(const U8x32 &a, const U8x32 &b) { return a; }
- U8x32 inline vectorExtract_1(const U8x32 &a, const U8x32 &b) { VECTOR_EXTRACT_GENERATOR_256_U8(1) }
- U8x32 inline vectorExtract_2(const U8x32 &a, const U8x32 &b) { VECTOR_EXTRACT_GENERATOR_256_U8(2) }
- U8x32 inline vectorExtract_3(const U8x32 &a, const U8x32 &b) { VECTOR_EXTRACT_GENERATOR_256_U8(3) }
- U8x32 inline vectorExtract_4(const U8x32 &a, const U8x32 &b) { VECTOR_EXTRACT_GENERATOR_256_U8(4) }
- U8x32 inline vectorExtract_5(const U8x32 &a, const U8x32 &b) { VECTOR_EXTRACT_GENERATOR_256_U8(5) }
- U8x32 inline vectorExtract_6(const U8x32 &a, const U8x32 &b) { VECTOR_EXTRACT_GENERATOR_256_U8(6) }
- U8x32 inline vectorExtract_7(const U8x32 &a, const U8x32 &b) { VECTOR_EXTRACT_GENERATOR_256_U8(7) }
- U8x32 inline vectorExtract_8(const U8x32 &a, const U8x32 &b) { VECTOR_EXTRACT_GENERATOR_256_U8(8) }
- U8x32 inline vectorExtract_9(const U8x32 &a, const U8x32 &b) { VECTOR_EXTRACT_GENERATOR_256_U8(9) }
- U8x32 inline vectorExtract_10(const U8x32 &a, const U8x32 &b) { VECTOR_EXTRACT_GENERATOR_256_U8(10) }
- U8x32 inline vectorExtract_11(const U8x32 &a, const U8x32 &b) { VECTOR_EXTRACT_GENERATOR_256_U8(11) }
- U8x32 inline vectorExtract_12(const U8x32 &a, const U8x32 &b) { VECTOR_EXTRACT_GENERATOR_256_U8(12) }
- U8x32 inline vectorExtract_13(const U8x32 &a, const U8x32 &b) { VECTOR_EXTRACT_GENERATOR_256_U8(13) }
- U8x32 inline vectorExtract_14(const U8x32 &a, const U8x32 &b) { VECTOR_EXTRACT_GENERATOR_256_U8(14) }
- U8x32 inline vectorExtract_15(const U8x32 &a, const U8x32 &b) { VECTOR_EXTRACT_GENERATOR_256_U8(15) }
- U8x32 inline vectorExtract_16(const U8x32 &a, const U8x32 &b) { VECTOR_EXTRACT_GENERATOR_256_U8(16) }
- U8x32 inline vectorExtract_17(const U8x32 &a, const U8x32 &b) { VECTOR_EXTRACT_GENERATOR_256_U8(17) }
- U8x32 inline vectorExtract_18(const U8x32 &a, const U8x32 &b) { VECTOR_EXTRACT_GENERATOR_256_U8(18) }
- U8x32 inline vectorExtract_19(const U8x32 &a, const U8x32 &b) { VECTOR_EXTRACT_GENERATOR_256_U8(19) }
- U8x32 inline vectorExtract_20(const U8x32 &a, const U8x32 &b) { VECTOR_EXTRACT_GENERATOR_256_U8(20) }
- U8x32 inline vectorExtract_21(const U8x32 &a, const U8x32 &b) { VECTOR_EXTRACT_GENERATOR_256_U8(21) }
- U8x32 inline vectorExtract_22(const U8x32 &a, const U8x32 &b) { VECTOR_EXTRACT_GENERATOR_256_U8(22) }
- U8x32 inline vectorExtract_23(const U8x32 &a, const U8x32 &b) { VECTOR_EXTRACT_GENERATOR_256_U8(23) }
- U8x32 inline vectorExtract_24(const U8x32 &a, const U8x32 &b) { VECTOR_EXTRACT_GENERATOR_256_U8(24) }
- U8x32 inline vectorExtract_25(const U8x32 &a, const U8x32 &b) { VECTOR_EXTRACT_GENERATOR_256_U8(25) }
- U8x32 inline vectorExtract_26(const U8x32 &a, const U8x32 &b) { VECTOR_EXTRACT_GENERATOR_256_U8(26) }
- U8x32 inline vectorExtract_27(const U8x32 &a, const U8x32 &b) { VECTOR_EXTRACT_GENERATOR_256_U8(27) }
- U8x32 inline vectorExtract_28(const U8x32 &a, const U8x32 &b) { VECTOR_EXTRACT_GENERATOR_256_U8(28) }
- U8x32 inline vectorExtract_29(const U8x32 &a, const U8x32 &b) { VECTOR_EXTRACT_GENERATOR_256_U8(29) }
- U8x32 inline vectorExtract_30(const U8x32 &a, const U8x32 &b) { VECTOR_EXTRACT_GENERATOR_256_U8(30) }
- U8x32 inline vectorExtract_31(const U8x32 &a, const U8x32 &b) { VECTOR_EXTRACT_GENERATOR_256_U8(31) }
- U8x32 inline vectorExtract_32(const U8x32 &a, const U8x32 &b) { return b; }
- U16x16 inline vectorExtract_0(const U16x16 &a, const U16x16 &b) { return a; }
- U16x16 inline vectorExtract_1(const U16x16 &a, const U16x16 &b) { VECTOR_EXTRACT_GENERATOR_256_U16(1) }
- U16x16 inline vectorExtract_2(const U16x16 &a, const U16x16 &b) { VECTOR_EXTRACT_GENERATOR_256_U16(2) }
- U16x16 inline vectorExtract_3(const U16x16 &a, const U16x16 &b) { VECTOR_EXTRACT_GENERATOR_256_U16(3) }
- U16x16 inline vectorExtract_4(const U16x16 &a, const U16x16 &b) { VECTOR_EXTRACT_GENERATOR_256_U16(4) }
- U16x16 inline vectorExtract_5(const U16x16 &a, const U16x16 &b) { VECTOR_EXTRACT_GENERATOR_256_U16(5) }
- U16x16 inline vectorExtract_6(const U16x16 &a, const U16x16 &b) { VECTOR_EXTRACT_GENERATOR_256_U16(6) }
- U16x16 inline vectorExtract_7(const U16x16 &a, const U16x16 &b) { VECTOR_EXTRACT_GENERATOR_256_U16(7) }
- U16x16 inline vectorExtract_8(const U16x16 &a, const U16x16 &b) { VECTOR_EXTRACT_GENERATOR_256_U16(8) }
- U16x16 inline vectorExtract_9(const U16x16 &a, const U16x16 &b) { VECTOR_EXTRACT_GENERATOR_256_U16(9) }
- U16x16 inline vectorExtract_10(const U16x16 &a, const U16x16 &b) { VECTOR_EXTRACT_GENERATOR_256_U16(10) }
- U16x16 inline vectorExtract_11(const U16x16 &a, const U16x16 &b) { VECTOR_EXTRACT_GENERATOR_256_U16(11) }
- U16x16 inline vectorExtract_12(const U16x16 &a, const U16x16 &b) { VECTOR_EXTRACT_GENERATOR_256_U16(12) }
- U16x16 inline vectorExtract_13(const U16x16 &a, const U16x16 &b) { VECTOR_EXTRACT_GENERATOR_256_U16(13) }
- U16x16 inline vectorExtract_14(const U16x16 &a, const U16x16 &b) { VECTOR_EXTRACT_GENERATOR_256_U16(14) }
- U16x16 inline vectorExtract_15(const U16x16 &a, const U16x16 &b) { VECTOR_EXTRACT_GENERATOR_256_U16(15) }
- U16x16 inline vectorExtract_16(const U16x16 &a, const U16x16 &b) { return b; }
- U32x8 inline vectorExtract_0(const U32x8 &a, const U32x8 &b) { return a; }
- U32x8 inline vectorExtract_1(const U32x8 &a, const U32x8 &b) { VECTOR_EXTRACT_GENERATOR_256_U32(1) }
- U32x8 inline vectorExtract_2(const U32x8 &a, const U32x8 &b) { VECTOR_EXTRACT_GENERATOR_256_U32(2) }
- U32x8 inline vectorExtract_3(const U32x8 &a, const U32x8 &b) { VECTOR_EXTRACT_GENERATOR_256_U32(3) }
- U32x8 inline vectorExtract_4(const U32x8 &a, const U32x8 &b) { VECTOR_EXTRACT_GENERATOR_256_U32(4) }
- U32x8 inline vectorExtract_5(const U32x8 &a, const U32x8 &b) { VECTOR_EXTRACT_GENERATOR_256_U32(5) }
- U32x8 inline vectorExtract_6(const U32x8 &a, const U32x8 &b) { VECTOR_EXTRACT_GENERATOR_256_U32(6) }
- U32x8 inline vectorExtract_7(const U32x8 &a, const U32x8 &b) { VECTOR_EXTRACT_GENERATOR_256_U32(7) }
- U32x8 inline vectorExtract_8(const U32x8 &a, const U32x8 &b) { return b; }
- I32x8 inline vectorExtract_0(const I32x8 &a, const I32x8 &b) { return a; }
- I32x8 inline vectorExtract_1(const I32x8 &a, const I32x8 &b) { VECTOR_EXTRACT_GENERATOR_256_I32(1) }
- I32x8 inline vectorExtract_2(const I32x8 &a, const I32x8 &b) { VECTOR_EXTRACT_GENERATOR_256_I32(2) }
- I32x8 inline vectorExtract_3(const I32x8 &a, const I32x8 &b) { VECTOR_EXTRACT_GENERATOR_256_I32(3) }
- I32x8 inline vectorExtract_4(const I32x8 &a, const I32x8 &b) { VECTOR_EXTRACT_GENERATOR_256_I32(4) }
- I32x8 inline vectorExtract_5(const I32x8 &a, const I32x8 &b) { VECTOR_EXTRACT_GENERATOR_256_I32(5) }
- I32x8 inline vectorExtract_6(const I32x8 &a, const I32x8 &b) { VECTOR_EXTRACT_GENERATOR_256_I32(6) }
- I32x8 inline vectorExtract_7(const I32x8 &a, const I32x8 &b) { VECTOR_EXTRACT_GENERATOR_256_I32(7) }
- I32x8 inline vectorExtract_8(const I32x8 &a, const I32x8 &b) { return b; }
- F32x8 inline vectorExtract_0(const F32x8 &a, const F32x8 &b) { return a; }
- F32x8 inline vectorExtract_1(const F32x8 &a, const F32x8 &b) { VECTOR_EXTRACT_GENERATOR_256_F32(1) }
- F32x8 inline vectorExtract_2(const F32x8 &a, const F32x8 &b) { VECTOR_EXTRACT_GENERATOR_256_F32(2) }
- F32x8 inline vectorExtract_3(const F32x8 &a, const F32x8 &b) { VECTOR_EXTRACT_GENERATOR_256_F32(3) }
- F32x8 inline vectorExtract_4(const F32x8 &a, const F32x8 &b) { VECTOR_EXTRACT_GENERATOR_256_F32(4) }
- F32x8 inline vectorExtract_5(const F32x8 &a, const F32x8 &b) { VECTOR_EXTRACT_GENERATOR_256_F32(5) }
- F32x8 inline vectorExtract_6(const F32x8 &a, const F32x8 &b) { VECTOR_EXTRACT_GENERATOR_256_F32(6) }
- F32x8 inline vectorExtract_7(const F32x8 &a, const F32x8 &b) { VECTOR_EXTRACT_GENERATOR_256_F32(7) }
- F32x8 inline vectorExtract_8(const F32x8 &a, const F32x8 &b) { return b; }
- // Gather instructions load memory from a pointer at multiple index offsets at the same time.
- // The given pointers should be aligned to 4 bytes, so that the fallback solution works on machines with strict alignment requirements.
- #if defined USE_AVX2
- #define GATHER_I32x8_AVX2(SOURCE, EIGHT_OFFSETS, SCALE) _mm256_i32gather_epi32((const int32_t*)(SOURCE), EIGHT_OFFSETS, SCALE)
- #define GATHER_U32x8_AVX2(SOURCE, EIGHT_OFFSETS, SCALE) _mm256_i32gather_epi32((const int32_t*)(SOURCE), EIGHT_OFFSETS, SCALE)
- #define GATHER_F32x8_AVX2(SOURCE, EIGHT_OFFSETS, SCALE) _mm256_i32gather_ps((const float*)(SOURCE), EIGHT_OFFSETS, SCALE)
- #endif
- static inline U32x8 gather_U32(dsr::SafePointer<const uint32_t> data, const U32x8 &elementOffset) {
- #if defined USE_AVX2
- // TODO: Implement safety checks for debug mode.
- return U32x8(GATHER_U32x8_AVX2(data.getUnsafe(), elementOffset.v, 4));
- #else
- ALIGN32 uint32_t elementOffsets[8];
- elementOffset.writeAlignedUnsafe(elementOffsets);
- return U32x8(
- *(data + elementOffsets[0]),
- *(data + elementOffsets[1]),
- *(data + elementOffsets[2]),
- *(data + elementOffsets[3]),
- *(data + elementOffsets[4]),
- *(data + elementOffsets[5]),
- *(data + elementOffsets[6]),
- *(data + elementOffsets[7])
- );
- #endif
- }
- static inline I32x8 gather_I32(dsr::SafePointer<const int32_t> data, const U32x8 &elementOffset) {
- #if defined USE_AVX2
- // TODO: Implement safety checks for debug mode.
- return I32x8(GATHER_I32x8_AVX2(data.getUnsafe(), elementOffset.v, 4));
- #else
- ALIGN32 uint32_t elementOffsets[8];
- elementOffset.writeAlignedUnsafe(elementOffsets);
- return I32x8(
- *(data + elementOffsets[0]),
- *(data + elementOffsets[1]),
- *(data + elementOffsets[2]),
- *(data + elementOffsets[3]),
- *(data + elementOffsets[4]),
- *(data + elementOffsets[5]),
- *(data + elementOffsets[6]),
- *(data + elementOffsets[7])
- );
- #endif
- }
- static inline F32x8 gather_F32(dsr::SafePointer<const float> data, const U32x8 &elementOffset) {
- #if defined USE_AVX2
- // TODO: Implement safety checks for debug mode.
- return F32x8(GATHER_F32x8_AVX2(data.getUnsafe(), elementOffset.v, 4));
- #else
- ALIGN32 uint32_t elementOffsets[8];
- elementOffset.writeAlignedUnsafe(elementOffsets);
- return F32x8(
- *(data + elementOffsets[0]),
- *(data + elementOffsets[1]),
- *(data + elementOffsets[2]),
- *(data + elementOffsets[3]),
- *(data + elementOffsets[4]),
- *(data + elementOffsets[5]),
- *(data + elementOffsets[6]),
- *(data + elementOffsets[7])
- );
- #endif
- }
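- // A minimal usage sketch (not part of the library API; the table and index names are hypothetical):
- // each lane i of the result is read from weightTable[indices[i]], either with one AVX2 gather
- // instruction or with eight scalar reads in the fallback.
- static inline F32x8 gatherExample_sampleWeights(dsr::SafePointer<const float> weightTable, const U32x8 &leftIndices, const U32x8 &rightIndices) {
- // Each lane i reads weightTable[leftIndices[i]] and weightTable[rightIndices[i]] and adds them.
- return gather_F32(weightTable, leftIndices) + gather_F32(weightTable, rightIndices);
- }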
- // Wrapper functions for explicitly expanding scalars into vectors during math operations.
- #define NUMERICAL_SCALAR_OPERATIONS(VECTOR_TYPE, ELEMENT_TYPE, LANE_COUNT) \
- inline VECTOR_TYPE operator+(const VECTOR_TYPE& left, ELEMENT_TYPE right) { return left + VECTOR_TYPE(right); } \
- inline VECTOR_TYPE operator+(ELEMENT_TYPE left, const VECTOR_TYPE& right) { return VECTOR_TYPE(left) + right; } \
- inline VECTOR_TYPE operator-(const VECTOR_TYPE& left, ELEMENT_TYPE right) { return left - VECTOR_TYPE(right); } \
- inline VECTOR_TYPE operator-(ELEMENT_TYPE left, const VECTOR_TYPE& right) { return VECTOR_TYPE(left) - right; }
- FOR_ALL_VECTOR_TYPES(NUMERICAL_SCALAR_OPERATIONS)
- #undef NUMERICAL_SCALAR_OPERATIONS
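- // A minimal sketch (hypothetical function and argument names): the scalar operand is expanded
- // into a full vector by the wrappers above, so the expression means positions + F32x4(1.0f).
- static inline F32x4 scalarAddExample_offsetByOne(const F32x4 &positions) {
- return positions + 1.0f;
- }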
- #define MULTIPLY_SCALAR_OPERATIONS(VECTOR_TYPE, ELEMENT_TYPE, LANE_COUNT) \
- inline VECTOR_TYPE operator*(const VECTOR_TYPE& left, ELEMENT_TYPE right) { return left * VECTOR_TYPE(right); } \
- inline VECTOR_TYPE operator*(ELEMENT_TYPE left, const VECTOR_TYPE& right) { return VECTOR_TYPE(left) * right; }
- // TODO: Implement multiplication for U8x16 and U8x32.
- //FOR_ALL_VECTOR_TYPES(MULTIPLY_SCALAR_OPERATIONS)
- MULTIPLY_SCALAR_OPERATIONS(F32x4, float, 4)
- MULTIPLY_SCALAR_OPERATIONS(F32x8, float, 8)
- MULTIPLY_SCALAR_OPERATIONS(U32x4, uint32_t, 4)
- MULTIPLY_SCALAR_OPERATIONS(U32x8, uint32_t, 8)
- MULTIPLY_SCALAR_OPERATIONS(I32x4, int32_t, 4)
- MULTIPLY_SCALAR_OPERATIONS(I32x8, int32_t, 8)
- MULTIPLY_SCALAR_OPERATIONS(U16x8, uint16_t, 8)
- MULTIPLY_SCALAR_OPERATIONS(U16x16, uint16_t, 16)
- #undef MULTIPLY_SCALAR_OPERATIONS
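- // A minimal sketch (hypothetical names): scalar multiplication is only generated for the element
- // types listed above, so this compiles for F32x8 but not yet for U8x32.
- static inline F32x8 scalarMultiplyExample_scale(const F32x8 &values, float scale) {
- return values * scale;
- }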
- // Wrapper functions for explicitly expanding a scalar bit mask into all lanes during bitwise operations.
- #define BITWISE_SCALAR_OPERATIONS(VECTOR_TYPE, ELEMENT_TYPE, LANE_COUNT) \
- inline VECTOR_TYPE operator&(const VECTOR_TYPE& left, ELEMENT_TYPE right) { return left & VECTOR_TYPE(right); } \
- inline VECTOR_TYPE operator&(ELEMENT_TYPE left, const VECTOR_TYPE& right) { return VECTOR_TYPE(left) & right; } \
- inline VECTOR_TYPE operator|(const VECTOR_TYPE& left, ELEMENT_TYPE right) { return left | VECTOR_TYPE(right); } \
- inline VECTOR_TYPE operator|(ELEMENT_TYPE left, const VECTOR_TYPE& right) { return VECTOR_TYPE(left) | right; } \
- inline VECTOR_TYPE operator^(const VECTOR_TYPE& left, ELEMENT_TYPE right) { return left ^ VECTOR_TYPE(right); } \
- inline VECTOR_TYPE operator^(ELEMENT_TYPE left, const VECTOR_TYPE& right) { return VECTOR_TYPE(left) ^ right; }
- // TODO: Implement bitwise operations for all unsigned SIMD vectors.
- //FOR_UNSIGNED_VECTOR_TYPES(BITWISE_SCALAR_OPERATIONS)
- BITWISE_SCALAR_OPERATIONS(U32x4, uint32_t, 4)
- BITWISE_SCALAR_OPERATIONS(U32x8, uint32_t, 8)
- #undef BITWISE_SCALAR_OPERATIONS
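- // A minimal sketch (hypothetical names, assuming the alpha channel is stored in the most significant
- // byte): masking out the alpha channel of eight packed RGBA pixels, where the scalar constant is
- // expanded into a full U32x8 mask by the wrappers above.
- static inline U32x8 bitwiseMaskExample_removeAlpha(const U32x8 &packedColors) {
- return packedColors & uint32_t(0x00FFFFFF);
- }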
- // Cleaning up temporary macro definitions to avoid cluttering the namespace.
- #undef FOR_ALL_VECTOR_TYPES
- #undef FOR_FLOAT_VECTOR_TYPES
- #undef FOR_INTEGER_VECTOR_TYPES
- #undef FOR_SIGNED_VECTOR_TYPES
- #undef FOR_UNSIGNED_VECTOR_TYPES
- // TODO: Let SVE define completely separate types for dynamic vectors.
- // The X vectors use the longest SIMD length that is efficient for both floating-point and integer types.
- // DSR_DEFAULT_ALIGNMENT
- // The number of bytes memory should be aligned with by default when creating buffers and images.
- // F32xX
- // The longest available SIMD vector for storing 32-bit float values. Iterating laneCountX_32Bit floats at a time.
- // I32xX
- // The longest available SIMD vector for storing signed 32-bit integer values. Iterating laneCountX_32Bit integers at a time.
- // U32xX
- // The longest available SIMD vector for storing unsigned 32-bit integer values. Iterating laneCountX_32Bit integers at a time.
- // U16xX
- // The longest available SIMD vector for storing unsigned 16-bit integer values. Iterating laneCountX_16Bit integers at a time.
- // U8xX
- // The longest available SIMD vector for storing unsigned 8-bit integer values. Iterating laneCountX_8Bit integers at a time.
- #if defined USE_256BIT_X_SIMD || defined EMULATE_256BIT_X_SIMD
- // Using 256-bit SIMD
- #define DSR_DEFAULT_VECTOR_SIZE 32
- #define DSR_DEFAULT_ALIGNMENT 32
- using F32xX = F32x8;
- using I32xX = I32x8;
- using U32xX = U32x8;
- using U16xX = U16x16;
- using U8xX = U8x32;
- // Memory is aligned to 256 bits so that filters can overwrite the padding at the end of each pixel row.
- // Otherwise every filter would need slow, bloated duplicated code to preserve the data at the end of each row.
- #else
- // If there is no hardware support for 256-bit vectors, emulated 256-bit vectors that are used explicitly only need to be aligned to 128 bits.
- #define DSR_DEFAULT_VECTOR_SIZE 16
- #define DSR_DEFAULT_ALIGNMENT 16
- using F32xX = F32x4;
- using I32xX = I32x4;
- using U32xX = U32x4;
- using U16xX = U16x8;
- using U8xX = U8x16;
- #endif
- // The number of lanes that the longest available vector has for a specified lane size.
- // Used to iterate indices and pointers using whole elements.
- static const int laneCountX_32Bit = DSR_DEFAULT_VECTOR_SIZE / 4;
- static const int laneCountX_16Bit = DSR_DEFAULT_VECTOR_SIZE / 2;
- static const int laneCountX_8Bit = DSR_DEFAULT_VECTOR_SIZE;
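- // A minimal sketch (hypothetical buffer and element count): filters typically step
- // laneCountX_32Bit elements per iteration, so the same loop works whether F32xX is
- // F32x4 or F32x8, provided that the buffer is padded to a whole number of vectors.
- //   for (int i = 0; i < paddedElementCount; i += laneCountX_32Bit) {
- //       // Load laneCountX_32Bit floats into an F32xX, process them, and store them back.
- //   }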
- // TODO: Let SVE define completely separate types for dynamic vectors.
- // The F vector uses the longest SIMD length that is efficient when only processing float values, even if no integer types are available in the same size.
- // Used when you know that your algorithm only works with float types and you need the extra performance.
- // Some processors have AVX but not AVX2, meaning that they have 256-bit SIMD for floats but only 128-bit SIMD for integers.
- // F32xF
- // The longest available SIMD vector for storing 32-bit float values. Iterating laneCountF_32Bit floats at a time.
- #if defined USE_256BIT_F_SIMD || defined EMULATE_256BIT_F_SIMD
- #define DSR_FLOAT_VECTOR_SIZE 32
- #define DSR_FLOAT_ALIGNMENT 32
- using F32xF = F32x8;
- #else
- // F vectors are 128 bits.
- #define DSR_FLOAT_VECTOR_SIZE 16
- #define DSR_FLOAT_ALIGNMENT 16
- using F32xF = F32x4;
- #endif
- // Used to iterate over float pointers when using F32xF.
- static const int laneCountF = DSR_FLOAT_VECTOR_SIZE / 4;
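- // A minimal sketch (hypothetical names): float-only code can step laneCountF elements per
- // iteration with F32xF, which may be wider than F32xX on processors with AVX but not AVX2.
- //   for (int i = 0; i < paddedFloatCount; i += laneCountF) {
- //       // Load laneCountF floats into an F32xF and process them using only float operations.
- //   }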
- // Define traits.
- DSR_APPLY_PROPERTY(DsrTrait_Any_U8 , U8x16)
- DSR_APPLY_PROPERTY(DsrTrait_Any_U8 , U8x32)
- DSR_APPLY_PROPERTY(DsrTrait_Any_U16, U16x8)
- DSR_APPLY_PROPERTY(DsrTrait_Any_U16, U16x16)
- DSR_APPLY_PROPERTY(DsrTrait_Any_U32, U32x4)
- DSR_APPLY_PROPERTY(DsrTrait_Any_U32, U32x8)
- DSR_APPLY_PROPERTY(DsrTrait_Any_I32, I32x4)
- DSR_APPLY_PROPERTY(DsrTrait_Any_I32, I32x8)
- DSR_APPLY_PROPERTY(DsrTrait_Any_F32, F32x4)
- DSR_APPLY_PROPERTY(DsrTrait_Any_F32, F32x8)
- // TODO: Use as independent types when the largest vector lengths are not known at compile time on ARM SVE.
- //DSR_APPLY_PROPERTY(DsrTrait_Any_U8 , U8xX)
- //DSR_APPLY_PROPERTY(DsrTrait_Any_U16, U16xX)
- //DSR_APPLY_PROPERTY(DsrTrait_Any_U32, U32xX)
- //DSR_APPLY_PROPERTY(DsrTrait_Any_I32, I32xX)
- //DSR_APPLY_PROPERTY(DsrTrait_Any_F32, F32xX)
- //DSR_APPLY_PROPERTY(DsrTrait_Any_F32, F32xF)
- }
- #endif