- // zlib open source license
- //
- // Copyright (c) 2017 to 2025 David Forsgren Piuva
- //
- // This software is provided 'as-is', without any express or implied
- // warranty. In no event will the authors be held liable for any damages
- // arising from the use of this software.
- //
- // Permission is granted to anyone to use this software for any purpose,
- // including commercial applications, and to alter it and redistribute it
- // freely, subject to the following restrictions:
- //
- // 1. The origin of this software must not be misrepresented; you must not
- // claim that you wrote the original software. If you use this software
- // in a product, an acknowledgment in the product documentation would be
- // appreciated but is not required.
- //
- // 2. Altered source versions must be plainly marked as such, and must not be
- // misrepresented as being the original software.
- //
- // 3. This notice may not be removed or altered from any source
- // distribution.
- // Hardware abstraction layer for portable SIMD math.
- // Used to make calculations faster without having to mess around with hardware specific assembly code or intrinsic functions.
- // You get the performance you need today and the ability to compile with automatic scalar emulation when building for a future processor that this module has not yet been ported to.
- // When you can generate the vectorized code using the same template function as your non-vectorized code, you don't even need to write a reference implementation.
- // Using with 128-bit SIMD: (beginner friendly, test once, compile anywhere, no compiler flags)
- // If you are new to vectorization or only plan to work with 128-bit extensions such as ARM NEON, you can keep it simple by only using the 128-bit vector types.
- // Pros and cons:
- // + Most target platforms (excluding older systems such as ARMv6) have 128-bit SIMD extensions such as Intel SSE2 or ARM NEON enabled by default.
- // ARMv6 does not support ARM NEON, but most ARMv7 processors do, so compilers usually enable NEON by default for ARMv7.
- // All 64-bit ARM processors have ARM NEON, because it stopped being optional in ARMv8.
- // Builds for 64-bit Intel processors usually have SSE2 enabled by default, so you don't have to change any compiler flags when building on a different system.
- // + One build for all computers of the same instruction set.
- // Great when your application is not so resource heavy, because the least powerful systems don't have the fancy extensions anyway.
- // - You might end up enabling the additional SIMD extensions anyway, because the library already uses them to run faster.
- // Types:
- // * Use F32x4, I32x4 and U32x4 for 4 elements at a time
- // * U16x8 for 8 elements at a time
- // * U8x16 for 16 elements at a time
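- // Example: (a minimal sketch using only functions declared further down in this header)
- //   float data[4] ALIGN16 = {1.0f, 4.0f, 9.0f, 16.0f};
- //   F32x4 values = F32x4::readAlignedUnsafe(data);
- //   squareRoot(values).writeAlignedUnsafe(data); // data now holds approximately {1, 2, 3, 4}.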
- // Using the X vector size: (advanced, requires testing with different build flags or emulation)
- // If you want more performance, you can use variable length type aliases.
- // Pros and cons:
- // + For heavy calculations where memory access is not the bottleneck, using larger SIMD vectors when enabled allows saving energy and increasing performance.
- // - If you forget to test with longer vector lengths (compiling with -mavx2 or -D EMULATE_256BIT_X_SIMD), you might find bugs from not iterating or aligning memory correctly.
- // Types:
- // * Use F32xX, I32xX and U32xX for laneCountX_32Bit elements at a time
- // * U16xX for laneCountX_16Bit elements at a time
- // * U8xX for laneCountX_8Bit elements at a time
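- // Example: (a hedged sketch; "data" and "paddedCount" are assumed names for an aligned float array whose length is padded to a whole number of X vectors, and squareRoot is assumed to be overloaded for F32xX further down in this header)
- //   for (int i = 0; i < paddedCount; i += laneCountX_32Bit) {
- //     F32xX values = F32xX::readAlignedUnsafe(&data[i]);
- //     squareRoot(values).writeAlignedUnsafe(&data[i]);
- //   }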
- // Using the F vector size: (very dangerous, no test can confirm that memory alignment is correct)
- // If you want even more performance, you can let float operations use the longest available F vector size, which might exceed the X vector size.
- // Pros and cons:
- // - Have to manually set the alignment of buffers to DSR_FLOAT_ALIGNMENT to prevent crashing.
- // If the default buffer alignment followed the size of F vectors, the more commonly used X vectors would be slowed down by cache misses from padding larger than an X vector.
- // AlignedImageF32 and sound backends are already aligned with the F vector size, because they are not generic like Buffer.
- // - It can be difficult to detect incorrect memory alignment, because a pointer can accidentally be aligned to more than what was requested.
- // If you accidentally align to 128 bits instead of 256 bits, there is a 50% chance that the error goes undetected at runtime, only to fail later on another computer.
- // If sticking with 128-bit or X vectors, all buffers will be correctly aligned automatically.
- // + For heavy calculations where memory access is not the bottleneck, using larger SIMD vectors when enabled allows saving energy and increasing performance.
- // - If you forget to test with longer vector lengths (compiling with -mavx or -D EMULATE_256BIT_F_SIMD), you might find bugs from not iterating or aligning memory correctly.
- // Types:
- // * Use F32xF for the longest available number of float elements at a time
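- // Example: (a hedged sketch; assumes DSR_FLOAT_ALIGNMENT is an alignment in bytes that can be passed to the ALIGN_BYTES macro defined below)
- //   float data[64] ALIGN_BYTES(DSR_FLOAT_ALIGNMENT) = {};
- //   // Iterate over data in steps of the F vector's lane count and load each step with F32xF::readAlignedUnsafe(&data[i]).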
- // Compiler extensions
- // On Intel/AMD processors:
- // SSE2 is usually enabled by default, because SSE2 is mandatory for 64-bit Intel instructions.
- // Use -mavx as a G++ compiler flag to enable the AVX extension, enabling the USE_AVX and USE_256BIT_F_SIMD macros.
- // If not available on your computer, you can test your algorithm for 256-bit float SIMD using EMULATE_256BIT_F_SIMD, but only if you use the F vector size.
- // Use -mavx2 as a G++ compiler flag to enable the AVX2 extension, enabling the USE_AVX2 and USE_256BIT_X_SIMD macros.
- // If not available on your computer, you can test your algorithm for 256-bit float and integer SIMD using EMULATE_256BIT_X_SIMD, but only if you use the X vector size.
- // On ARMv6 processors:
- // Scalar emulation is used when compiling for ARMv6, because it does not have NEON and VFP is not supported in this abstraction.
- // On ARMv7 processors:
- // NEON is usually enabled by default for ARMv7, because most of them have the extension.
- // On ARMv8 processors:
- // NEON cannot be disabled when building for ARMv8, because it is mandatory in ARMv8.
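- // Example: (a hedged sketch of build commands; the file name is hypothetical and passing the emulation macro with -D is an assumption)
- //   g++ -mavx2 main.cpp                    // Enables AVX2, defining USE_AVX2 and USE_256BIT_X_SIMD.
- //   g++ -D EMULATE_256BIT_X_SIMD main.cpp  // Emulates 256-bit X vectors for testing on a machine without AVX2.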
- #ifndef DFPSR_SIMD
- #define DFPSR_SIMD
- #include <cstdint>
- #include <cassert>
- #include "SafePointer.h"
- #include "../math/FVector.h"
- #include "../math/IVector.h"
- #include "../math/UVector.h"
- #include "DsrTraits.h"
- #include "../settings.h"
- #include "../base/noSimd.h"
- namespace dsr {
- // Alignment in bytes
- #define ALIGN_BYTES(SIZE) __attribute__((aligned(SIZE)))
- #define ALIGN16 ALIGN_BYTES(16) // 128-bit alignment
- #define ALIGN32 ALIGN_BYTES(32) // 256-bit alignment
- #define ALIGN64 ALIGN_BYTES(64) // 512-bit alignment
- #define ALIGN128 ALIGN_BYTES(128) // 1024-bit alignment
- #define ALIGN256 ALIGN_BYTES(256) // 2048-bit alignment
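- // Example: declaring a 16-byte aligned array on the stack, the same pattern used by the NEON load functions below.
- //   float data[4] ALIGN16 = {0.0f, 0.0f, 0.0f, 0.0f};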
- // Everything declared in here handles things specific to SSE.
- // Direct use of the macros will not provide portability to all hardware.
- #ifdef USE_SSE2
- #define USE_DIRECT_SIMD_MEMORY_ACCESS
- #include <emmintrin.h> // SSE2
- #ifdef USE_SSSE3
- #include <tmmintrin.h> // SSSE3
- #endif
- #ifdef USE_AVX
- #include <immintrin.h> // AVX / AVX2
- #endif
- // Vector types
- #define SIMD_F32x4 __m128
- #define SIMD_U8x16 __m128i
- #define SIMD_U16x8 __m128i
- #define SIMD_U32x4 __m128i
- #define SIMD_I32x4 __m128i
- // Vector uploads in address order
- #define LOAD_VECTOR_F32_SIMD(A, B, C, D) _mm_set_ps(D, C, B, A)
- #define LOAD_SCALAR_F32_SIMD(A) _mm_set1_ps(A)
- #define LOAD_VECTOR_U8_SIMD(A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P) _mm_set_epi8(P, O, N, M, L, K, J, I, H, G, F, E, D, C, B, A)
- #define LOAD_SCALAR_U8_SIMD(A) _mm_set1_epi8(A)
- #define LOAD_VECTOR_U16_SIMD(A, B, C, D, E, F, G, H) _mm_set_epi16(H, G, F, E, D, C, B, A)
- #define LOAD_SCALAR_U16_SIMD(A) _mm_set1_epi16(A)
- #define LOAD_VECTOR_U32_SIMD(A, B, C, D) _mm_set_epi32(D, C, B, A)
- #define LOAD_SCALAR_U32_SIMD(A) _mm_set1_epi32(A)
- #define LOAD_VECTOR_I32_SIMD(A, B, C, D) _mm_set_epi32(D, C, B, A)
- #define LOAD_SCALAR_I32_SIMD(A) _mm_set1_epi32(A)
- // Conversions
- #define F32_TO_I32_SIMD(A) _mm_cvttps_epi32(A)
- #define F32_TO_U32_SIMD(A) _mm_cvttps_epi32(A)
- #define I32_TO_F32_SIMD(A) _mm_cvtepi32_ps(A)
- #define U32_TO_F32_SIMD(A) _mm_cvtepi32_ps(A)
- // Unpacking conversions
- #define U8_LOW_TO_U16_SIMD(A) _mm_unpacklo_epi8(A, _mm_set1_epi8(0))
- #define U8_HIGH_TO_U16_SIMD(A) _mm_unpackhi_epi8(A, _mm_set1_epi8(0))
- #define U16_LOW_TO_U32_SIMD(A) _mm_unpacklo_epi16(A, _mm_set1_epi16(0))
- #define U16_HIGH_TO_U32_SIMD(A) _mm_unpackhi_epi16(A, _mm_set1_epi16(0))
- // Saturated packing
- // Credit: Using ideas from Victoria Zhislina's NEON_2_SSE.h header from the Intel corporation, but not trying to emulate NEON
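- // The mask clears the sign bit, and the signed comparison detects lanes that were at least 0x8000 so they get forced to 0x7FFF.
- // With every lane clamped to at most 0x7FFF, the signed saturation in _mm_packus_epi16 then packs the values correctly to unsigned 8 bits.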
- inline SIMD_U8x16 PACK_SAT_U16_TO_U8_SIMD(const SIMD_U16x8& a, const SIMD_U16x8& b) {
- SIMD_U16x8 mask, a2, b2;
- mask = _mm_set1_epi16(0b0111111111111111);
- a2 = _mm_and_si128(a, mask);
- a2 = _mm_or_si128(a2, _mm_and_si128(_mm_cmpgt_epi16(a2, a), mask));
- b2 = _mm_and_si128(b, mask);
- b2 = _mm_or_si128(b2, _mm_and_si128(_mm_cmpgt_epi16(b2, b), mask));
- return _mm_packus_epi16(a2, b2);
- }
- // Reinterpret casting
- #define REINTERPRET_U32_TO_U8_SIMD(A) (A)
- #define REINTERPRET_U32_TO_U16_SIMD(A) (A)
- #define REINTERPRET_U8_TO_U32_SIMD(A) (A)
- #define REINTERPRET_U16_TO_U32_SIMD(A) (A)
- #define REINTERPRET_U32_TO_I32_SIMD(A) (A)
- #define REINTERPRET_I32_TO_U32_SIMD(A) (A)
- // Vector float operations returning SIMD_F32x4
- #define ADD_F32_SIMD(A, B) _mm_add_ps(A, B)
- #define SUB_F32_SIMD(A, B) _mm_sub_ps(A, B)
- #define MUL_F32_SIMD(A, B) _mm_mul_ps(A, B)
- // Vector integer operations returning SIMD_I32x4
- #define ADD_I32_SIMD(A, B) _mm_add_epi32(A, B)
- #define SUB_I32_SIMD(A, B) _mm_sub_epi32(A, B)
- // 32-bit integer multiplications are not available on SSE2.
- // Vector integer operations returning SIMD_U32x4
- #define ADD_U32_SIMD(A, B) _mm_add_epi32(A, B)
- #define SUB_U32_SIMD(A, B) _mm_sub_epi32(A, B)
- // 32-bit integer multiplications are not available on SSE2.
- // Vector integer operations returning SIMD_U16x8
- #define ADD_U16_SIMD(A, B) _mm_add_epi16(A, B)
- #define SUB_U16_SIMD(A, B) _mm_sub_epi16(A, B)
- #define MUL_U16_SIMD(A, B) _mm_mullo_epi16(A, B)
- // Vector integer operations returning SIMD_U8x16
- #define ADD_U8_SIMD(A, B) _mm_add_epi8(A, B)
- #define ADD_SAT_U8_SIMD(A, B) _mm_adds_epu8(A, B) // Saturated addition
- #define SUB_U8_SIMD(A, B) _mm_sub_epi8(A, B)
- #define SUB_SAT_U8_SIMD(A, B) _mm_subs_epu8(A, B) // Saturated subtraction
- // No 8-bit multiplications
- // Statistics
- #define MIN_F32_SIMD(A, B) _mm_min_ps(A, B)
- #define MAX_F32_SIMD(A, B) _mm_max_ps(A, B)
- // TODO: Implement minimum and maximum for integer vectors, so that all operations exist for all applicable types:
- // Using _mm256_max_epu16... in AVX2 for 256-bit versions
- // Using comparisons and masking in SSE2 when _mm_max_epu16... in SSE4.1 is not available
- // Bitwise
- #define BITWISE_AND_U32_SIMD(A, B) _mm_and_si128(A, B)
- #define BITWISE_OR_U32_SIMD(A, B) _mm_or_si128(A, B)
- #define BITWISE_XOR_U32_SIMD(A, B) _mm_xor_si128(A, B)
- #ifdef USE_AVX
- // 256-bit vector types
- #define SIMD_F32x8 __m256
- // Vector uploads in address order
- #define LOAD_VECTOR_F32_SIMD256(A, B, C, D, E, F, G, H) _mm256_set_ps(H, G, F, E, D, C, B, A)
- #define LOAD_SCALAR_F32_SIMD256(A) _mm256_set1_ps(A)
- // Vector float operations returning SIMD_F32x8
- #define ADD_F32_SIMD256(A, B) _mm256_add_ps(A, B)
- #define SUB_F32_SIMD256(A, B) _mm256_sub_ps(A, B)
- #define MUL_F32_SIMD256(A, B) _mm256_mul_ps(A, B)
- // Statistics
- #define MIN_F32_SIMD256(A, B) _mm256_min_ps(A, B)
- #define MAX_F32_SIMD256(A, B) _mm256_max_ps(A, B)
- #ifdef USE_AVX2
- // 256-bit vector types
- #define SIMD_U8x32 __m256i
- #define SIMD_U16x16 __m256i
- #define SIMD_U32x8 __m256i
- #define SIMD_I32x8 __m256i
- // Vector uploads in address order
- #define LOAD_VECTOR_U8_SIMD256(A1, B1, C1, D1, E1, F1, G1, H1, I1, J1, K1, L1, M1, N1, O1, P1, Q1, R1, S1, T1, U1, V1, W1, X1, Y1, Z1, A2, B2, C2, D2, E2, F2) _mm256_set_epi8(F2, E2, D2, C2, B2, A2, Z1, Y1, X1, W1, V1, U1, T1, S1, R1, Q1, P1, O1, N1, M1, L1, K1, J1, I1, H1, G1, F1, E1, D1, C1, B1, A1)
- #define LOAD_SCALAR_U8_SIMD256(A) _mm256_set1_epi8(A)
- #define LOAD_VECTOR_U16_SIMD256(A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P) _mm256_set_epi16(P, O, N, M, L, K, J, I, H, G, F, E, D, C, B, A)
- #define LOAD_SCALAR_U16_SIMD256(A) _mm256_set1_epi16(A)
- #define LOAD_VECTOR_U32_SIMD256(A, B, C, D, E, F, G, H) _mm256_set_epi32(H, G, F, E, D, C, B, A)
- #define LOAD_SCALAR_U32_SIMD256(A) _mm256_set1_epi32(A)
- #define LOAD_VECTOR_I32_SIMD256(A, B, C, D, E, F, G, H) _mm256_set_epi32(H, G, F, E, D, C, B, A)
- #define LOAD_SCALAR_I32_SIMD256(A) _mm256_set1_epi32(A)
- // Conversions
- #define F32_TO_I32_SIMD256(A) _mm256_cvttps_epi32(A)
- #define F32_TO_U32_SIMD256(A) _mm256_cvttps_epi32(A)
- #define I32_TO_F32_SIMD256(A) _mm256_cvtepi32_ps(A)
- #define U32_TO_F32_SIMD256(A) _mm256_cvtepi32_ps(A)
- // Unpacking conversions
- #define U8_LOW_TO_U16_SIMD256(A) _mm256_unpacklo_epi8(_mm256_permute4x64_epi64(A, 0b11011000), _mm256_set1_epi8(0))
- #define U8_HIGH_TO_U16_SIMD256(A) _mm256_unpackhi_epi8(_mm256_permute4x64_epi64(A, 0b11011000), _mm256_set1_epi8(0))
- #define U16_LOW_TO_U32_SIMD256(A) _mm256_unpacklo_epi16(_mm256_permute4x64_epi64(A, 0b11011000), _mm256_set1_epi16(0))
- #define U16_HIGH_TO_U32_SIMD256(A) _mm256_unpackhi_epi16(_mm256_permute4x64_epi64(A, 0b11011000), _mm256_set1_epi16(0))
- // Saturated packing
- // Credit: Using ideas from Victoria Zhislina's NEON_2_SSE.h header from the Intel corporation, but not trying to emulate NEON
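- // Same clamping trick as the 128-bit PACK_SAT_U16_TO_U8_SIMD above, followed by a 64-bit lane permutation to restore the element order.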
- inline SIMD_U8x32 PACK_SAT_U16_TO_U8_SIMD256(const SIMD_U16x16& a, const SIMD_U16x16& b) {
- SIMD_U16x16 mask, a2, b2;
- mask = _mm256_set1_epi16(0b0111111111111111);
- a2 = _mm256_and_si256(a, mask);
- a2 = _mm256_or_si256(a2, _mm256_and_si256(_mm256_cmpgt_epi16(a2, a), mask));
- b2 = _mm256_and_si256(b, mask);
- b2 = _mm256_or_si256(b2, _mm256_and_si256(_mm256_cmpgt_epi16(b2, b), mask));
- // The 256-bit pack instruction _mm256_packus_epi16 is not serial, so the result has to be permuted into the correct order.
- // 0 2 1 3
- // | X |
- // 0 1 2 3
- return _mm256_permute4x64_epi64(_mm256_packus_epi16(a2, b2), 0b11011000);
- }
- // Reinterpret casting
- #define REINTERPRET_U32_TO_U8_SIMD256(A) (A)
- #define REINTERPRET_U32_TO_U16_SIMD256(A) (A)
- #define REINTERPRET_U8_TO_U32_SIMD256(A) (A)
- #define REINTERPRET_U16_TO_U32_SIMD256(A) (A)
- #define REINTERPRET_U32_TO_I32_SIMD256(A) (A)
- #define REINTERPRET_I32_TO_U32_SIMD256(A) (A)
- // Vector integer operations returning SIMD_I32x8
- #define ADD_I32_SIMD256(A, B) _mm256_add_epi32(A, B)
- #define SUB_I32_SIMD256(A, B) _mm256_sub_epi32(A, B)
- #define MUL_I32_SIMD256(A, B) _mm256_mullo_epi32(A, B)
- // Vector integer operations returning SIMD_U32x8
- #define ADD_U32_SIMD256(A, B) _mm256_add_epi32(A, B)
- #define SUB_U32_SIMD256(A, B) _mm256_sub_epi32(A, B)
- #define MUL_U32_SIMD256(A, B) _mm256_mullo_epi32(A, B)
- // Vector integer operations returning SIMD_U16x16
- #define ADD_U16_SIMD256(A, B) _mm256_add_epi16(A, B)
- #define SUB_U16_SIMD256(A, B) _mm256_sub_epi16(A, B)
- #define MUL_U16_SIMD256(A, B) _mm256_mullo_epi16(A, B)
- // Vector integer operations returning SIMD_U8x32
- #define ADD_U8_SIMD256(A, B) _mm256_add_epi8(A, B)
- #define ADD_SAT_U8_SIMD256(A, B) _mm256_adds_epu8(A, B) // Saturated addition
- #define SUB_U8_SIMD256(A, B) _mm256_sub_epi8(A, B)
- #define SUB_SAT_U8_SIMD256(A, B) _mm256_subs_epu8(A, B) // Saturated subtraction
- // No 8-bit multiplications
- // Bitwise
- #define BITWISE_AND_U32_SIMD256(A, B) _mm256_and_si256(A, B)
- #define BITWISE_OR_U32_SIMD256(A, B) _mm256_or_si256(A, B)
- #define BITWISE_XOR_U32_SIMD256(A, B) _mm256_xor_si256(A, B)
- #endif
- #endif
- #endif
- // Everything declared in here handles things specific to NEON.
- // Direct use of the macros will not provide portability to all hardware.
- #ifdef USE_NEON
- #include <arm_neon.h> // NEON
- // Vector types
- #define SIMD_F32x4 float32x4_t
- #define SIMD_U8x16 uint8x16_t
- #define SIMD_U16x8 uint16x8_t
- #define SIMD_U32x4 uint32x4_t
- #define SIMD_I32x4 int32x4_t
- // Vector uploads in address order
- inline SIMD_F32x4 LOAD_VECTOR_F32_SIMD(float a, float b, float c, float d) {
- float data[4] ALIGN16 = {a, b, c, d};
- #ifdef SAFE_POINTER_CHECKS
- if (uintptr_t((void*)data) & 15u) { throwError(U"Unaligned stack memory detected in LOAD_VECTOR_F32_SIMD for NEON!\n"); }
- #endif
- return vld1q_f32(data);
- }
- inline SIMD_F32x4 LOAD_SCALAR_F32_SIMD(float a) {
- return vdupq_n_f32(a);
- }
- inline SIMD_U8x16 LOAD_VECTOR_U8_SIMD(uint8_t a, uint8_t b, uint8_t c, uint8_t d, uint8_t e, uint8_t f, uint8_t g, uint8_t h,
- uint8_t i, uint8_t j, uint8_t k, uint8_t l, uint8_t m, uint8_t n, uint8_t o, uint8_t p) {
- uint8_t data[16] ALIGN16 = {a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p};
- #ifdef SAFE_POINTER_CHECKS
- if (uintptr_t((void*)data) & 15u) { throwError(U"Unaligned stack memory detected in LOAD_VECTOR_U8_SIMD for NEON!\n"); }
- #endif
- return vld1q_u8(data);
- }
- inline SIMD_U8x16 LOAD_SCALAR_U8_SIMD(uint8_t a) {
- return vdupq_n_u8(a);
- }
- inline SIMD_U16x8 LOAD_VECTOR_U16_SIMD(uint16_t a, uint16_t b, uint16_t c, uint16_t d, uint16_t e, uint16_t f, uint16_t g, uint16_t h) {
- uint16_t data[8] ALIGN16 = {a, b, c, d, e, f, g, h};
- #ifdef SAFE_POINTER_CHECKS
- if (uintptr_t((void*)data) & 15u) { throwError(U"Unaligned stack memory detected in LOAD_VECTOR_U16_SIMD for NEON!\n"); }
- #endif
- return vld1q_u16(data);
- }
- inline SIMD_U16x8 LOAD_SCALAR_U16_SIMD(uint16_t a) {
- return vdupq_n_u16(a);
- }
- inline SIMD_U32x4 LOAD_VECTOR_U32_SIMD(uint32_t a, uint32_t b, uint32_t c, uint32_t d) {
- uint32_t data[4] ALIGN16 = {a, b, c, d};
- #ifdef SAFE_POINTER_CHECKS
- if (uintptr_t((void*)data) & 15u) { throwError(U"Unaligned stack memory detected in LOAD_VECTOR_U32_SIMD for NEON!\n"); }
- #endif
- return vld1q_u32(data);
- }
- inline SIMD_U32x4 LOAD_SCALAR_U32_SIMD(uint32_t a) {
- return vdupq_n_u32(a);
- }
- inline SIMD_I32x4 LOAD_VECTOR_I32_SIMD(int32_t a, int32_t b, int32_t c, int32_t d) {
- int32_t data[4] ALIGN16 = {a, b, c, d};
- #ifdef SAFE_POINTER_CHECKS
- if (uintptr_t((void*)data) & 15u) { throwError(U"Unaligned stack memory detected in LOAD_VECTOR_I32_SIMD for NEON!\n"); }
- #endif
- return vld1q_s32(data);
- }
- inline SIMD_I32x4 LOAD_SCALAR_I32_SIMD(int32_t a) {
- return vdupq_n_s32(a);
- }
- // Conversions
- #define F32_TO_I32_SIMD(A) vcvtq_s32_f32(A)
- #define F32_TO_U32_SIMD(A) vcvtq_u32_f32(A)
- #define I32_TO_F32_SIMD(A) vcvtq_f32_s32(A)
- #define U32_TO_F32_SIMD(A) vcvtq_f32_u32(A)
- // Unpacking conversions
- #define U8_LOW_TO_U16_SIMD(A) vmovl_u8(vget_low_u8(A))
- #define U8_HIGH_TO_U16_SIMD(A) vmovl_u8(vget_high_u8(A))
- #define U16_LOW_TO_U32_SIMD(A) vmovl_u16(vget_low_u16(A))
- #define U16_HIGH_TO_U32_SIMD(A) vmovl_u16(vget_high_u16(A))
- // Saturated packing
- #define PACK_SAT_U16_TO_U8_SIMD(A, B) vcombine_u8(vqmovn_u16(A), vqmovn_u16(B))
- // Reinterpret casting
- #define REINTERPRET_U32_TO_U8_SIMD(A) vreinterpretq_u8_u32(A)
- #define REINTERPRET_U32_TO_U16_SIMD(A) vreinterpretq_u16_u32(A)
- #define REINTERPRET_U8_TO_U32_SIMD(A) vreinterpretq_u32_u8(A)
- #define REINTERPRET_U16_TO_U32_SIMD(A) vreinterpretq_u32_u16(A)
- #define REINTERPRET_U32_TO_I32_SIMD(A) vreinterpretq_s32_u32(A)
- #define REINTERPRET_I32_TO_U32_SIMD(A) vreinterpretq_u32_s32(A)
- // Vector float operations returning SIMD_F32x4
- #define ADD_F32_SIMD(A, B) vaddq_f32(A, B)
- #define SUB_F32_SIMD(A, B) vsubq_f32(A, B)
- #define MUL_F32_SIMD(A, B) vmulq_f32(A, B)
- // Vector integer operations returning SIMD_I32x4
- #define ADD_I32_SIMD(A, B) vaddq_s32(A, B)
- #define SUB_I32_SIMD(A, B) vsubq_s32(A, B)
- #define MUL_I32_NEON(A, B) vmulq_s32(A, B)
- // Vector integer operations returning SIMD_U32x4
- #define ADD_U32_SIMD(A, B) vaddq_u32(A, B)
- #define SUB_U32_SIMD(A, B) vsubq_u32(A, B)
- #define MUL_U32_NEON(A, B) vmulq_u32(A, B)
- // Vector integer operations returning SIMD_U16x8
- #define ADD_U16_SIMD(A, B) vaddq_u16(A, B)
- #define SUB_U16_SIMD(A, B) vsubq_u16(A, B)
- #define MUL_U16_SIMD(A, B) vmulq_u16(A, B)
- // Vector integer operations returning SIMD_U8x16
- #define ADD_U8_SIMD(A, B) vaddq_u8(A, B)
- #define ADD_SAT_U8_SIMD(A, B) vqaddq_u8(A, B) // Saturated addition
- #define SUB_U8_SIMD(A, B) vsubq_u8(A, B)
- #define SUB_SAT_U8_SIMD(A, B) vqsubq_u8(A, B) // Saturated subtraction
- // No 8-bit multiplications
- // Statistics
- #define MIN_F32_SIMD(A, B) vminq_f32(A, B)
- #define MAX_F32_SIMD(A, B) vmaxq_f32(A, B)
- // Bitwise
- #define BITWISE_AND_U32_SIMD(A, B) vandq_u32(A, B)
- #define BITWISE_OR_U32_SIMD(A, B) vorrq_u32(A, B)
- #define BITWISE_XOR_U32_SIMD(A, B) veorq_u32(A, B)
- #endif
- /*
- The vector types below are supposed to be portable across different CPU architectures.
- When mixed with handwritten SIMD intrinsics:
- Use "USE_SSE2" instead of "__SSE2__"
- Use "USE_AVX2" instead of "__AVX2__"
- Use "USE_NEON" instead of "__ARM_NEON"
- // So that any new variations of the macro names given by the compiler can be added to simd.h instead of being duplicated everywhere.
- Portability exceptions:
- * The "v" variable is the native backend, which is only defined when SIMD is supported by hardware.
- Only use when USE_BASIC_SIMD is defined.
- Will not work on scalar emulation.
- * The "scalars" array is available when emulating a type that does not exist or the SIMD vector has direct access to the memory.
- Do not rely on these for accessing elements, because otherwise your code will not be able to compile for ARM NEON.
- */
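- // Example of mixing handwritten intrinsics with the portable types: (a hedged sketch; "exampleNegate" is not part of this header)
- //   inline F32x4 exampleNegate(const F32x4 &value) {
- //     #if defined USE_BASIC_SIMD && defined USE_SSE2
- //       return F32x4(_mm_sub_ps(_mm_set1_ps(0.0f), value.v)); // Handwritten SSE2 through the native "v" member.
- //     #elif defined USE_BASIC_SIMD && defined USE_NEON
- //       return F32x4(vnegq_f32(value.v)); // Handwritten NEON through the native "v" member.
- //     #else
- //       return F32x4(-value.scalars[0], -value.scalars[1], -value.scalars[2], -value.scalars[3]); // Scalar emulation fallback.
- //     #endif
- //   }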
- union F32x4 {
- private:
- // The uninitialized default constructor is private for safety reasons.
- F32x4() {}
- public:
- // When the uninitialized constructor is needed for performance, use this named constructor instead.
- static inline F32x4 create_dangerous_uninitialized() { return F32x4(); }
- #ifdef USE_BASIC_SIMD
- public:
- #ifdef USE_DIRECT_SIMD_MEMORY_ACCESS
- // Only use if USE_DIRECT_SIMD_MEMORY_ACCESS is defined!
- // Direct access cannot be done on NEON!
- float scalars[4];
- #endif
- // The SIMD vector of undefined type
- // Not accessible while emulating!
- SIMD_F32x4 v;
- // Construct a portable vector from a native SIMD vector
- explicit F32x4(const SIMD_F32x4& v) : v(v) {}
- // Construct a portable vector from a set of scalars
- F32x4(float a1, float a2, float a3, float a4) : v(LOAD_VECTOR_F32_SIMD(a1, a2, a3, a4)) {}
- // Construct a portable vector from a single duplicated scalar
- explicit F32x4(float scalar) : v(LOAD_SCALAR_F32_SIMD(scalar)) {}
- #else
- public:
- // Emulate a SIMD vector as an array of scalars without hardware support.
- // Only accessible while emulating!
- float scalars[4];
- // Construct a portable vector from a set of scalars
- F32x4(float a1, float a2, float a3, float a4) {
- this->scalars[0] = a1;
- this->scalars[1] = a2;
- this->scalars[2] = a3;
- this->scalars[3] = a4;
- }
- // Construct a portable vector from a single duplicated scalar
- explicit F32x4(float scalar) {
- this->scalars[0] = scalar;
- this->scalars[1] = scalar;
- this->scalars[2] = scalar;
- this->scalars[3] = scalar;
- }
- #endif
- // Create a gradient vector using start and increment, so that arbitrary length vectors have a way to initialize linear iterations.
- static inline F32x4 createGradient(float start, float increment) {
- return F32x4(start, start + increment, start + increment * 2.0f, start + increment * 3.0f);
- }
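- // For example, createGradient(10.0f, 2.0f) gives (10.0f, 12.0f, 14.0f, 16.0f).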
- // Construct a portable SIMD vector from a pointer to aligned data.
- // Data must be aligned with at least 16 bytes.
- static inline F32x4 readAlignedUnsafe(const float* data) {
- #ifdef SAFE_POINTER_CHECKS
- if (uintptr_t((const void*)data) & 15u) { throwError(U"Unaligned pointer detected in F32x4::readAlignedUnsafe!\n"); }
- #endif
- #ifdef USE_BASIC_SIMD
- #if defined USE_SSE2
- return F32x4(_mm_load_ps(data));
- #elif defined USE_NEON
- return F32x4(vld1q_f32(data));
- #endif
- #else
- return F32x4(data[0], data[1], data[2], data[3]);
- #endif
- }
- // Write to aligned memory from the existing vector
- // Data must be aligned with at least 16 bytes.
- inline void writeAlignedUnsafe(float* data) const {
- #ifdef SAFE_POINTER_CHECKS
- if (uintptr_t((void*)data) & 15u) { throwError(U"Unaligned pointer detected in F32x4::writeAlignedUnsafe!\n"); }
- #endif
- #if defined USE_BASIC_SIMD
- #if defined USE_SSE2
- _mm_store_ps(data, this->v);
- #elif defined USE_NEON
- vst1q_f32(data, this->v);
- #endif
- #else
- data[0] = this->scalars[0];
- data[1] = this->scalars[1];
- data[2] = this->scalars[2];
- data[3] = this->scalars[3];
- #endif
- }
- #if defined DFPSR_GEOMETRY_FVECTOR
- dsr::FVector4D get() const {
- float data[4] ALIGN16;
- #ifdef SAFE_POINTER_CHECKS
- if (uintptr_t(data) & 15u) { throwError(U"Unaligned stack memory detected in FVector4D F32x4::get!\n"); }
- #endif
- this->writeAlignedUnsafe(data);
- return dsr::FVector4D(data[0], data[1], data[2], data[3]);
- }
- #endif
- // Bound and alignment checked reading
- static inline F32x4 readAligned(dsr::SafePointer<const float> data, const char* methodName) {
- const float* pointer = data.getUnsafe();
- #if defined SAFE_POINTER_CHECKS
- data.assertInside(methodName, pointer, 16);
- #endif
- return F32x4::readAlignedUnsafe(pointer);
- }
- // Bound and alignment checked writing
- inline void writeAligned(dsr::SafePointer<float> data, const char* methodName) const {
- float* pointer = data.getUnsafe();
- #if defined SAFE_POINTER_CHECKS
- data.assertInside(methodName, pointer, 16);
- #endif
- this->writeAlignedUnsafe(pointer);
- }
- };
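- // Example: (a hedged sketch; "source" and "target" are assumed to be a dsr::SafePointer<const float> and a dsr::SafePointer<float> referring to 16-byte aligned data)
- //   F32x4 values = F32x4::readAligned(source, "example filter");
- //   values.writeAligned(target, "example filter");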
- // 1 / value
- inline F32x4 reciprocal(const F32x4 &value) {
- #if defined USE_BASIC_SIMD
- #if defined USE_SSE2
- // Approximate
- SIMD_F32x4 lowQ = _mm_rcp_ps(value.v);
- // Refine
- return F32x4(SUB_F32_SIMD(ADD_F32_SIMD(lowQ, lowQ), MUL_F32_SIMD(value.v, MUL_F32_SIMD(lowQ, lowQ))));
- #elif defined USE_NEON
- // Approximate
- SIMD_F32x4 result = vrecpeq_f32(value.v);
- // Refine
- result = MUL_F32_SIMD(vrecpsq_f32(value.v, result), result);
- return F32x4(MUL_F32_SIMD(vrecpsq_f32(value.v, result), result));
- #else
- assert(false);
- return F32x4(0);
- #endif
- #else
- return F32x4(1.0f / value.scalars[0], 1.0f / value.scalars[1], 1.0f / value.scalars[2], 1.0f / value.scalars[3]);
- #endif
- }
- // 1 / sqrt(value)
- inline F32x4 reciprocalSquareRoot(const F32x4 &value) {
- #if defined USE_BASIC_SIMD
- #if defined USE_SSE2
- SIMD_F32x4 reciRoot = _mm_rsqrt_ps(value.v);
- SIMD_F32x4 mul = MUL_F32_SIMD(MUL_F32_SIMD(value.v, reciRoot), reciRoot);
- reciRoot = MUL_F32_SIMD(MUL_F32_SIMD(LOAD_SCALAR_F32_SIMD(0.5f), reciRoot), SUB_F32_SIMD(LOAD_SCALAR_F32_SIMD(3.0f), mul));
- return F32x4(reciRoot);
- #elif defined USE_NEON
- // Approximate
- SIMD_F32x4 reciRoot = vrsqrteq_f32(value.v);
- // Refine
- reciRoot = MUL_F32_SIMD(vrsqrtsq_f32(MUL_F32_SIMD(value.v, reciRoot), reciRoot), reciRoot);
- return F32x4(reciRoot);
- #else
- assert(false);
- return F32x4(0);
- #endif
- #else
- return F32x4(1.0f / sqrt(value.scalars[0]), 1.0f / sqrt(value.scalars[1]), 1.0f / sqrt(value.scalars[2]), 1.0f / sqrt(value.scalars[3]));
- #endif
- }
- // sqrt(value)
- inline F32x4 squareRoot(const F32x4 &value) {
- #if defined USE_BASIC_SIMD
- #if defined USE_SSE2
- SIMD_F32x4 half = LOAD_SCALAR_F32_SIMD(0.5f);
- // Approximate
- SIMD_F32x4 root = _mm_sqrt_ps(value.v);
- // Refine
- root = _mm_mul_ps(_mm_add_ps(root, _mm_div_ps(value.v, root)), half);
- return F32x4(root);
- #elif defined USE_NEON
- return F32x4(MUL_F32_SIMD(value.v, reciprocalSquareRoot(value).v));
- #else
- assert(false);
- return F32x4(0);
- #endif
- #else
- return F32x4(sqrt(value.scalars[0]), sqrt(value.scalars[1]), sqrt(value.scalars[2]), sqrt(value.scalars[3]));
- #endif
- }
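- // Example: squareRoot(F32x4(9.0f)) holds approximately 3.0f in each lane. The reciprocal, reciprocalSquareRoot and squareRoot
- // functions above use approximations refined with Newton-Raphson style steps, so results may differ slightly between backends.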
- union I32x4 {
- private:
- // The uninitialized default constructor is private for safety reasons.
- I32x4() {}
- public:
- // When the uninitialized constructor is needed for performance, use this named constructor instead.
- static inline I32x4 create_dangerous_uninitialized() { return I32x4(); }
- #if defined USE_BASIC_SIMD
- public:
- #if defined USE_DIRECT_SIMD_MEMORY_ACCESS
- // Only use if USE_DIRECT_SIMD_MEMORY_ACCESS is defined!
- // Direct access cannot be done on NEON!
- int32_t scalars[4];
- #endif
- // The SIMD vector of undefined type
- // Not accessible while emulating!
- SIMD_I32x4 v;
- // Construct a portable vector from a native SIMD vector
- explicit I32x4(const SIMD_I32x4& v) : v(v) {}
- // Construct a portable vector from a set of scalars
- I32x4(int32_t a1, int32_t a2, int32_t a3, int32_t a4) : v(LOAD_VECTOR_I32_SIMD(a1, a2, a3, a4)) {}
- // Construct a portable vector from a single duplicated scalar
- explicit I32x4(int32_t scalar) : v(LOAD_SCALAR_I32_SIMD(scalar)) {}
- #else
- public:
- // Emulate a SIMD vector as an array of scalars without hardware support.
- // Only accessible while emulating!
- int32_t scalars[4];
- // Construct a portable vector from a set of scalars
- I32x4(int32_t a1, int32_t a2, int32_t a3, int32_t a4) {
- this->scalars[0] = a1;
- this->scalars[1] = a2;
- this->scalars[2] = a3;
- this->scalars[3] = a4;
- }
- // Construct a portable vector from a single duplicated scalar
- explicit I32x4(int32_t scalar) {
- this->scalars[0] = scalar;
- this->scalars[1] = scalar;
- this->scalars[2] = scalar;
- this->scalars[3] = scalar;
- }
- #endif
- // Create a gradient vector using start and increment, so that arbitrary length vectors have a way to initialize linear iterations.
- static inline I32x4 createGradient(int32_t start, int32_t increment) {
- return I32x4(start, start + increment, start + increment * 2, start + increment * 3);
- }
- // Construct a portable SIMD vector from a pointer to aligned data.
- // Data must be aligned with at least 16 bytes.
- static inline I32x4 readAlignedUnsafe(const int32_t* data) {
- #ifdef SAFE_POINTER_CHECKS
- if (uintptr_t(data) & 15u) { throwError(U"Unaligned pointer detected in I32x4::readAlignedUnsafe!\n"); }
- #endif
- #if defined USE_BASIC_SIMD
- #if defined USE_SSE2
- return I32x4(_mm_load_si128((const __m128i*)data));
- #elif defined USE_NEON
- return I32x4(vld1q_s32(data));
- #endif
- #else
- return I32x4(data[0], data[1], data[2], data[3]);
- #endif
- }
- // Write to aligned memory from the existing vector
- // Data must be aligned with at least 16 bytes.
- inline void writeAlignedUnsafe(int32_t* data) const {
- #ifdef SAFE_POINTER_CHECKS
- if (uintptr_t(data) & 15u) { throwError(U"Unaligned pointer detected in I32x4::writeAlignedUnsafe!\n"); }
- #endif
- #if defined USE_BASIC_SIMD
- #if defined USE_SSE2
- _mm_store_si128((__m128i*)data, this->v);
- #elif defined USE_NEON
- vst1q_s32(data, this->v);
- #endif
- #else
- data[0] = this->scalars[0];
- data[1] = this->scalars[1];
- data[2] = this->scalars[2];
- data[3] = this->scalars[3];
- #endif
- }
- #if defined DFPSR_GEOMETRY_IVECTOR
- dsr::IVector4D get() const {
- int32_t data[4] ALIGN16;
- #ifdef SAFE_POINTER_CHECKS
- if (uintptr_t(data) & 15u) { throwError(U"Unaligned stack memory detected in IVector4D I32x4::get!\n"); }
- #endif
- this->writeAlignedUnsafe(data);
- return dsr::IVector4D(data[0], data[1], data[2], data[3]);
- }
- #endif
- // Bound and alignment checked reading
- static inline I32x4 readAligned(dsr::SafePointer<const int32_t> data, const char* methodName) {
- const int32_t* pointer = data.getUnsafe();
- #if defined SAFE_POINTER_CHECKS
- data.assertInside(methodName, pointer, 16);
- #endif
- return I32x4::readAlignedUnsafe(pointer);
- }
- // Bound and alignment checked writing
- inline void writeAligned(dsr::SafePointer<int32_t> data, const char* methodName) const {
- int32_t* pointer = data.getUnsafe();
- #if defined SAFE_POINTER_CHECKS
- data.assertInside(methodName, pointer, 16);
- #endif
- this->writeAlignedUnsafe(pointer);
- }
- };
- union U32x4 {
- #if defined USE_BASIC_SIMD
- public:
- #if defined USE_DIRECT_SIMD_MEMORY_ACCESS
- // Only use if USE_DIRECT_SIMD_MEMORY_ACCESS is defined!
- // Direct access cannot be done on NEON!
- uint32_t scalars[4];
- #endif
- // The SIMD vector of undefined type
- // Not accessible while emulating!
- SIMD_U32x4 v;
- // Construct a portable vector from a native SIMD vector
- explicit U32x4(const SIMD_U32x4& v) : v(v) {}
- // Construct a portable vector from a set of scalars
- U32x4(uint32_t a1, uint32_t a2, uint32_t a3, uint32_t a4) : v(LOAD_VECTOR_U32_SIMD(a1, a2, a3, a4)) {}
- // Construct a portable vector from a single duplicated scalar
- explicit U32x4(uint32_t scalar) : v(LOAD_SCALAR_U32_SIMD(scalar)) {}
- #else
- public:
- // Emulate a SIMD vector as an array of scalars without hardware support.
- // Only accessible while emulating!
- uint32_t scalars[4];
- // Construct a portable vector from a set of scalars
- U32x4(uint32_t a1, uint32_t a2, uint32_t a3, uint32_t a4) {
- this->scalars[0] = a1;
- this->scalars[1] = a2;
- this->scalars[2] = a3;
- this->scalars[3] = a4;
- }
- // Construct a portable vector from a single duplicated scalar
- explicit U32x4(uint32_t scalar) {
- this->scalars[0] = scalar;
- this->scalars[1] = scalar;
- this->scalars[2] = scalar;
- this->scalars[3] = scalar;
- }
- #endif
- // Create a gradient vector using start and increment, so that arbitrary length vectors have a way to initialize linear iterations.
- static inline U32x4 createGradient(uint32_t start, uint32_t increment) {
- return U32x4(start, start + increment, start + increment * 2, start + increment * 3);
- }
- // Construct a portable SIMD vector from a pointer to aligned data
- // Data must be aligned with at least 16 bytes.
- static inline U32x4 readAlignedUnsafe(const uint32_t* data) {
- #ifdef SAFE_POINTER_CHECKS
- if (uintptr_t(data) & 15u) { throwError(U"Unaligned pointer detected in U32x4::readAlignedUnsafe!\n"); }
- #endif
- #if defined USE_BASIC_SIMD
- #if defined USE_SSE2
- return U32x4(_mm_load_si128((const __m128i*)data));
- #elif defined USE_NEON
- return U32x4(vld1q_u32(data));
- #endif
- #else
- return U32x4(data[0], data[1], data[2], data[3]);
- #endif
- }
- // Write to aligned memory from the existing vector
- // Data must be aligned with at least 16 bytes.
- inline void writeAlignedUnsafe(uint32_t* data) const {
- #ifdef SAFE_POINTER_CHECKS
- if (uintptr_t(data) & 15u) { throwError(U"Unaligned pointer detected in U32x4::writeAlignedUnsafe!\n"); }
- #endif
- #if defined USE_BASIC_SIMD
- #if defined USE_SSE2
- _mm_store_si128((__m128i*)data, this->v);
- #elif defined USE_NEON
- vst1q_u32(data, this->v);
- #endif
- #else
- data[0] = this->scalars[0];
- data[1] = this->scalars[1];
- data[2] = this->scalars[2];
- data[3] = this->scalars[3];
- #endif
- }
- #if defined DFPSR_GEOMETRY_UVECTOR
- dsr::UVector4D get() const {
- uint32_t data[4] ALIGN16;
- #ifdef SAFE_POINTER_CHECKS
- if (uintptr_t(data) & 15u) { throwError(U"Unaligned stack memory detected in UVector4D U32x4::get!\n"); }
- #endif
- this->writeAlignedUnsafe(data);
- return dsr::UVector4D(data[0], data[1], data[2], data[3]);
- }
- #endif
- // Bound and alignment checked reading
- static inline U32x4 readAligned(dsr::SafePointer<const uint32_t> data, const char* methodName) {
- const uint32_t* pointer = data.getUnsafe();
- #if defined SAFE_POINTER_CHECKS
- data.assertInside(methodName, pointer, 16);
- #endif
- return U32x4::readAlignedUnsafe(pointer);
- }
- // Bound and alignment checked writing
- inline void writeAligned(dsr::SafePointer<uint32_t> data, const char* methodName) const {
- uint32_t* pointer = data.getUnsafe();
- #if defined SAFE_POINTER_CHECKS
- data.assertInside(methodName, pointer, 16);
- #endif
- this->writeAlignedUnsafe(pointer);
- }
- };
- union U16x8 {
- private:
- // The uninitialized default constructor is private for safety reasons.
- U16x8() {}
- public:
- // When the uninitialized constructor is needed for performance, use this named constructor instead.
- static inline U16x8 create_dangerous_uninitialized() { return U16x8(); }
- #if defined USE_BASIC_SIMD
- public:
- #if defined USE_DIRECT_SIMD_MEMORY_ACCESS
- // Only use if USE_DIRECT_SIMD_MEMORY_ACCESS is defined!
- // Direct access cannot be done on NEON!
- uint16_t scalars[8];
- #endif
- // The SIMD vector of undefined type
- // Not accessible while emulating!
- SIMD_U16x8 v;
- // Construct a portable vector from a native SIMD vector
- explicit U16x8(const SIMD_U16x8& v) : v(v) {}
- // Construct a vector of 8 x 16-bit unsigned integers from a vector of 4 x 32-bit unsigned integers
- // Reinterpret casting is used
- explicit U16x8(const U32x4& vector) : v(REINTERPRET_U32_TO_U16_SIMD(vector.v)) {}
- // Construct a portable vector from a set of scalars
- U16x8(uint16_t a1, uint16_t a2, uint16_t a3, uint16_t a4, uint16_t a5, uint16_t a6, uint16_t a7, uint16_t a8) : v(LOAD_VECTOR_U16_SIMD(a1, a2, a3, a4, a5, a6, a7, a8)) {}
- // Construct a vector of 8 x 16-bit unsigned integers from a single duplicated 32-bit unsigned integer
- // Reinterpret casting is used
- // TODO: Remove all reinterprets from constructors to improve readability
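- // Example: on a little-endian target, U16x8(uint32_t(0x00020001)) holds (1, 2, 1, 2, 1, 2, 1, 2).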
- explicit U16x8(uint32_t scalar) : v(REINTERPRET_U32_TO_U16_SIMD(LOAD_SCALAR_U32_SIMD(scalar))) {}
- // Construct a portable vector from a single duplicated scalar
- explicit U16x8(uint16_t scalar) : v(LOAD_SCALAR_U16_SIMD(scalar)) {}
- // Reinterpret cast to a vector of 4 x 32-bit unsigned integers
- U32x4 get_U32() const {
- return U32x4(REINTERPRET_U16_TO_U32_SIMD(this->v));
- }
- #else
- public:
- // Emulate a SIMD vector as an array of scalars without hardware support.
- // Only accessible while emulating!
- uint16_t scalars[8];
- // Construct a vector of 8 x 16-bit unsigned integers from a vector of 4 x 32-bit unsigned integers
- // Reinterpret casting is used
- explicit U16x8(const U32x4& vector) {
- uint64_t *target = (uint64_t*)this->scalars;
- uint64_t *source = (uint64_t*)vector.scalars;
- target[0] = source[0];
- target[1] = source[1];
- }
- // Construct a portable vector from a set of scalars
- U16x8(uint16_t a1, uint16_t a2, uint16_t a3, uint16_t a4, uint16_t a5, uint16_t a6, uint16_t a7, uint16_t a8) {
- this->scalars[0] = a1;
- this->scalars[1] = a2;
- this->scalars[2] = a3;
- this->scalars[3] = a4;
- this->scalars[4] = a5;
- this->scalars[5] = a6;
- this->scalars[6] = a7;
- this->scalars[7] = a8;
- }
- // Construct a vector of 8 x 16-bit unsigned integers from a single duplicated 32-bit unsigned integer
- // Reinterpret casting is used
- explicit U16x8(uint32_t scalar) {
- uint32_t *target = (uint32_t*)this->scalars;
- target[0] = scalar;
- target[1] = scalar;
- target[2] = scalar;
- target[3] = scalar;
- }
- // Construct a portable vector from a single duplicated scalar
- explicit U16x8(uint16_t scalar) {
- this->scalars[0] = scalar;
- this->scalars[1] = scalar;
- this->scalars[2] = scalar;
- this->scalars[3] = scalar;
- this->scalars[4] = scalar;
- this->scalars[5] = scalar;
- this->scalars[6] = scalar;
- this->scalars[7] = scalar;
- }
- // Reinterpret cast to a vector of 4 x 32-bit unsigned integers
- U32x4 get_U32() const {
- U32x4 result(0);
- uint64_t *target = (uint64_t*)result.scalars;
- uint64_t *source = (uint64_t*)this->scalars;
- target[0] = source[0];
- target[1] = source[1];
- return result;
- }
- #endif
- // Create a gradient vector using start and increment, so that arbitrary length vectors have a way to initialize linear iterations.
- static inline U16x8 createGradient(uint16_t start, uint16_t increment) {
- return U16x8(
- start,
- start + increment,
- start + increment * 2,
- start + increment * 3,
- start + increment * 4,
- start + increment * 5,
- start + increment * 6,
- start + increment * 7
- );
- }
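- // Construct a portable SIMD vector from a pointer to aligned data.
- // Data must be aligned with at least 16 bytes.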
- static inline U16x8 readAlignedUnsafe(const uint16_t* data) {
- #ifdef SAFE_POINTER_CHECKS
- if (uintptr_t(data) & 15u) { throwError(U"Unaligned pointer detected in U16x8::readAlignedUnsafe!\n"); }
- #endif
- #if defined USE_BASIC_SIMD
- #if defined USE_SSE2
- return U16x8(_mm_load_si128((const __m128i*)data));
- #elif defined USE_NEON
- return U16x8(vld1q_u16(data));
- #endif
- #else
- return U16x8(data[0], data[1], data[2], data[3], data[4], data[5], data[6], data[7]);
- #endif
- }
- // Data must be aligned with at least 16 bytes.
- inline void writeAlignedUnsafe(uint16_t* data) const {
- #ifdef SAFE_POINTER_CHECKS
- if (uintptr_t(data) & 15u) { throwError(U"Unaligned pointer detected in U16x8::writeAlignedUnsafe!\n"); }
- #endif
- #if defined USE_BASIC_SIMD
- #if defined USE_SSE2
- _mm_store_si128((__m128i*)data, this->v);
- #elif defined USE_NEON
- vst1q_u16(data, this->v);
- #endif
- #else
- data[0] = this->scalars[0];
- data[1] = this->scalars[1];
- data[2] = this->scalars[2];
- data[3] = this->scalars[3];
- data[4] = this->scalars[4];
- data[5] = this->scalars[5];
- data[6] = this->scalars[6];
- data[7] = this->scalars[7];
- #endif
- }
- // Bound and alignment checked reading
- static inline U16x8 readAligned(dsr::SafePointer<const uint16_t> data, const char* methodName) {
- const uint16_t* pointer = data.getUnsafe();
- #if defined SAFE_POINTER_CHECKS
- data.assertInside(methodName, pointer, 16);
- #endif
- return U16x8::readAlignedUnsafe(pointer);
- }
- // Bound and alignment checked writing
- inline void writeAligned(dsr::SafePointer<uint16_t> data, const char* methodName) const {
- uint16_t* pointer = data.getUnsafe();
- #if defined SAFE_POINTER_CHECKS
- data.assertInside(methodName, pointer, 16);
- #endif
- this->writeAlignedUnsafe(pointer);
- }
- };
- union U8x16 {
- private:
- // The uninitialized default constructor is private for safety reasons.
- U8x16() {}
- public:
- // When the uninitialized constructor is needed for performance, use this named constructor instead.
- static inline U8x16 create_dangerous_uninitialized() { return U8x16(); }
- #if defined USE_BASIC_SIMD
- public:
- #if defined USE_DIRECT_SIMD_MEMORY_ACCESS
- // Only use if USE_DIRECT_SIMD_MEMORY_ACCESS is defined!
- // Direct access cannot be done on NEON!
- uint8_t scalars[16];
- #endif
- // The SIMD vector of undefined type
- // Not accessible while emulating!
- SIMD_U8x16 v;
- // Construct a portable vector from a native SIMD vector
- explicit U8x16(const SIMD_U8x16& v) : v(v) {}
- // Construct a portable vector from a set of scalars
- U8x16(uint8_t a1, uint8_t a2, uint8_t a3, uint8_t a4, uint8_t a5, uint8_t a6, uint8_t a7, uint8_t a8,
- uint8_t a9, uint8_t a10, uint8_t a11, uint8_t a12, uint8_t a13, uint8_t a14, uint8_t a15, uint8_t a16)
- : v(LOAD_VECTOR_U8_SIMD(a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11, a12, a13, a14, a15, a16)) {}
- // Construct a portable vector from a single duplicated scalar
- explicit U8x16(uint8_t scalar) : v(LOAD_SCALAR_U8_SIMD(scalar)) {}
- #else
- public:
- // Emulate a SIMD vector as an array of scalars without hardware support.
- // Only accessible while emulating!
- uint8_t scalars[16];
- // Construct a portable vector from a set of scalars
- U8x16(uint8_t a1, uint8_t a2, uint8_t a3, uint8_t a4, uint8_t a5, uint8_t a6, uint8_t a7, uint8_t a8,
- uint8_t a9, uint8_t a10, uint8_t a11, uint8_t a12, uint8_t a13, uint8_t a14, uint8_t a15, uint8_t a16) {
- this->scalars[0] = a1;
- this->scalars[1] = a2;
- this->scalars[2] = a3;
- this->scalars[3] = a4;
- this->scalars[4] = a5;
- this->scalars[5] = a6;
- this->scalars[6] = a7;
- this->scalars[7] = a8;
- this->scalars[8] = a9;
- this->scalars[9] = a10;
- this->scalars[10] = a11;
- this->scalars[11] = a12;
- this->scalars[12] = a13;
- this->scalars[13] = a14;
- this->scalars[14] = a15;
- this->scalars[15] = a16;
- }
- // Construct a portable vector from a single duplicated scalar
- explicit U8x16(uint8_t scalar) {
- this->scalars[0] = scalar;
- this->scalars[1] = scalar;
- this->scalars[2] = scalar;
- this->scalars[3] = scalar;
- this->scalars[4] = scalar;
- this->scalars[5] = scalar;
- this->scalars[6] = scalar;
- this->scalars[7] = scalar;
- this->scalars[8] = scalar;
- this->scalars[9] = scalar;
- this->scalars[10] = scalar;
- this->scalars[11] = scalar;
- this->scalars[12] = scalar;
- this->scalars[13] = scalar;
- this->scalars[14] = scalar;
- this->scalars[15] = scalar;
- }
- #endif
- // Create a gradient vector using start and increment, so that arbitrary length vectors have a way to initialize linear iterations.
- static inline U8x16 createGradient(uint8_t start, uint8_t increment) {
- return U8x16(
- start,
- start + increment,
- start + increment * 2,
- start + increment * 3,
- start + increment * 4,
- start + increment * 5,
- start + increment * 6,
- start + increment * 7,
- start + increment * 8,
- start + increment * 9,
- start + increment * 10,
- start + increment * 11,
- start + increment * 12,
- start + increment * 13,
- start + increment * 14,
- start + increment * 15
- );
- }
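- // The lane values above are computed as int and truncated to uint8_t, so gradients exceeding 255 wrap around modulo 256.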
- static inline U8x16 readAlignedUnsafe(const uint8_t* data) {
- #ifdef SAFE_POINTER_CHECKS
- if (uintptr_t(data) & 15u) { throwError(U"Unaligned pointer detected in U8x16::readAlignedUnsafe!\n"); }
- #endif
- #if defined USE_BASIC_SIMD
- #if defined USE_SSE2
- return U8x16(_mm_load_si128((const __m128i*)data));
- #elif defined USE_NEON
- return U8x16(vld1q_u8(data));
- #endif
- #else
- return U8x16(
- data[0], data[1], data[2], data[3], data[4], data[5], data[6], data[7],
- data[8], data[9], data[10], data[11], data[12], data[13], data[14], data[15]
- );
- #endif
- }
- // Data must be aligned to at least 16 bytes.
- inline void writeAlignedUnsafe(uint8_t* data) const {
- #ifdef SAFE_POINTER_CHECKS
- if (uintptr_t(data) & 15u) { throwError(U"Unaligned pointer detected in U8x16::writeAlignedUnsafe!\n"); }
- #endif
- #if defined USE_BASIC_SIMD
- #if defined USE_SSE2
- _mm_store_si128((__m128i*)data, this->v);
- #elif defined USE_NEON
- vst1q_u8(data, this->v);
- #endif
- #else
- data[0] = this->scalars[0];
- data[1] = this->scalars[1];
- data[2] = this->scalars[2];
- data[3] = this->scalars[3];
- data[4] = this->scalars[4];
- data[5] = this->scalars[5];
- data[6] = this->scalars[6];
- data[7] = this->scalars[7];
- data[8] = this->scalars[8];
- data[9] = this->scalars[9];
- data[10] = this->scalars[10];
- data[11] = this->scalars[11];
- data[12] = this->scalars[12];
- data[13] = this->scalars[13];
- data[14] = this->scalars[14];
- data[15] = this->scalars[15];
- #endif
- }
- // Bound and alignment checked reading
- static inline U8x16 readAligned(dsr::SafePointer<const uint8_t> data, const char* methodName) {
- const uint8_t* pointer = data.getUnsafe();
- #if defined SAFE_POINTER_CHECKS
- data.assertInside(methodName, pointer, 16);
- #endif
- return U8x16::readAlignedUnsafe(pointer);
- }
- // Bound and alignment checked writing
- inline void writeAligned(dsr::SafePointer<uint8_t> data, const char* methodName) const {
- uint8_t* pointer = data.getUnsafe();
- #if defined SAFE_POINTER_CHECKS
- data.assertInside(methodName, pointer, 16);
- #endif
- this->writeAlignedUnsafe(pointer);
- }
- };
- union F32x8 {
- private:
- // The uninitialized default constructor is private for safety reasons.
- F32x8() {}
- public:
- // When the uninitialized constructor is needed for performance, use this named constructor instead.
- static inline F32x8 create_dangerous_uninitialized() { return F32x8(); }
- #if defined USE_256BIT_F_SIMD
- public:
- #if defined USE_DIRECT_SIMD_MEMORY_ACCESS
- // Only use if USE_DIRECT_SIMD_MEMORY_ACCESS is defined!
- float scalars[8];
- #endif
- // The SIMD vector of undefined type
- // Not accessible while emulating!
- SIMD_F32x8 v;
- // Construct a portable vector from a native SIMD vector
- explicit F32x8(const SIMD_F32x8& v) : v(v) {}
- // Construct a portable vector from a set of scalars
- F32x8(float a1, float a2, float a3, float a4, float a5, float a6, float a7, float a8)
- : v(LOAD_VECTOR_F32_SIMD256(a1, a2, a3, a4, a5, a6, a7, a8)) {}
- // Construct a portable vector from a single duplicated scalar
- explicit F32x8(float scalar) : v(LOAD_SCALAR_F32_SIMD256(scalar)) {}
- #else
- public:
- // Emulate a SIMD vector as an array of scalars without hardware support.
- // Only accessible while emulating!
- float scalars[8];
- // Construct a portable vector from a set of scalars
- F32x8(float a1, float a2, float a3, float a4, float a5, float a6, float a7, float a8) {
- this->scalars[0] = a1;
- this->scalars[1] = a2;
- this->scalars[2] = a3;
- this->scalars[3] = a4;
- this->scalars[4] = a5;
- this->scalars[5] = a6;
- this->scalars[6] = a7;
- this->scalars[7] = a8;
- }
- // Construct a portable vector from a single duplicated scalar
- explicit F32x8(float scalar) {
- this->scalars[0] = scalar;
- this->scalars[1] = scalar;
- this->scalars[2] = scalar;
- this->scalars[3] = scalar;
- this->scalars[4] = scalar;
- this->scalars[5] = scalar;
- this->scalars[6] = scalar;
- this->scalars[7] = scalar;
- }
- #endif
- // Create a gradient vector using start and increment, so that arbitrary length vectors have a way to initialize linear iterations.
- static inline F32x8 createGradient(float start, float increment) {
- return F32x8(
- start,
- start + increment,
- start + increment * 2.0f,
- start + increment * 3.0f,
- start + increment * 4.0f,
- start + increment * 5.0f,
- start + increment * 6.0f,
- start + increment * 7.0f
- );
- }
- // Construct a portable SIMD vector from a pointer to aligned data.
- static inline F32x8 readAlignedUnsafe(const float* data) {
- #ifdef SAFE_POINTER_CHECKS
- if (uintptr_t(data) & 31u) { throwError(U"Unaligned pointer detected in F32x8::readAlignedUnsafe!\n"); }
- #endif
- #if defined USE_AVX2
- return F32x8(_mm256_load_ps(data));
- #else
- return F32x8(data[0], data[1], data[2], data[3], data[4], data[5], data[6], data[7]);
- #endif
- }
- // Write to aligned memory from the existing vector.
- inline void writeAlignedUnsafe(float* data) const {
- #ifdef SAFE_POINTER_CHECKS
- if (uintptr_t(data) & 31u) { throwError(U"Unaligned pointer detected in F32x8::writeAlignedUnsafe!\n"); }
- #endif
- #if defined USE_AVX2
- _mm256_store_ps(data, this->v);
- #else
- data[0] = this->scalars[0];
- data[1] = this->scalars[1];
- data[2] = this->scalars[2];
- data[3] = this->scalars[3];
- data[4] = this->scalars[4];
- data[5] = this->scalars[5];
- data[6] = this->scalars[6];
- data[7] = this->scalars[7];
- #endif
- }
- // Bound and alignment checked reading
- static inline F32x8 readAligned(dsr::SafePointer<const float> data, const char* methodName) {
- const float* pointer = data.getUnsafe();
- #if defined SAFE_POINTER_CHECKS
- data.assertInside(methodName, pointer, 32);
- #endif
- return F32x8::readAlignedUnsafe(pointer);
- }
- // Bound and alignment checked writing
- inline void writeAligned(dsr::SafePointer<float> data, const char* methodName) const {
- float* pointer = data.getUnsafe();
- #if defined SAFE_POINTER_CHECKS
- data.assertInside(methodName, pointer, 32);
- #endif
- this->writeAlignedUnsafe(pointer);
- }
- };
- // 1 / value
- inline F32x8 reciprocal(const F32x8 &value) {
- #if defined USE_AVX2
- // Approximate
- SIMD_F32x8 lowQ = _mm256_rcp_ps(value.v);
- // Refine
- return F32x8(SUB_F32_SIMD256(ADD_F32_SIMD256(lowQ, lowQ), MUL_F32_SIMD256(value.v, MUL_F32_SIMD256(lowQ, lowQ))));
- #else
- return F32x8(
- 1.0f / value.scalars[0],
- 1.0f / value.scalars[1],
- 1.0f / value.scalars[2],
- 1.0f / value.scalars[3],
- 1.0f / value.scalars[4],
- 1.0f / value.scalars[5],
- 1.0f / value.scalars[6],
- 1.0f / value.scalars[7]
- );
- #endif
- }
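- // The refinement above is one Newton-Raphson step x1 = 2 * x0 - v * x0 * x0, which roughly doubles the precision of the hardware estimate.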
- // 1 / sqrt(value)
- inline F32x8 reciprocalSquareRoot(const F32x8 &value) {
- #if defined USE_AVX2
- SIMD_F32x8 reciRoot = _mm256_rsqrt_ps(value.v);
- SIMD_F32x8 mul = MUL_F32_SIMD256(MUL_F32_SIMD256(value.v, reciRoot), reciRoot);
- reciRoot = MUL_F32_SIMD256(MUL_F32_SIMD256(LOAD_SCALAR_F32_SIMD256(0.5f), reciRoot), SUB_F32_SIMD256(LOAD_SCALAR_F32_SIMD256(3.0f), mul));
- return F32x8(reciRoot);
- #else
- return F32x8(
- 1.0f / sqrt(value.scalars[0]),
- 1.0f / sqrt(value.scalars[1]),
- 1.0f / sqrt(value.scalars[2]),
- 1.0f / sqrt(value.scalars[3]),
- 1.0f / sqrt(value.scalars[4]),
- 1.0f / sqrt(value.scalars[5]),
- 1.0f / sqrt(value.scalars[6]),
- 1.0f / sqrt(value.scalars[7])
- );
- #endif
- }
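- // The refinement above is one Newton-Raphson step x1 = 0.5 * x0 * (3 - v * x0 * x0) applied to the hardware estimate of 1 / sqrt(v).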
- // sqrt(value)
- inline F32x8 squareRoot(const F32x8 &value) {
- #if defined USE_AVX2
- SIMD_F32x8 half = LOAD_SCALAR_F32_SIMD256(0.5f);
- // Approximate
- SIMD_F32x8 root = _mm256_sqrt_ps(value.v);
- // Refine
- root = _mm256_mul_ps(_mm256_add_ps(root, _mm256_div_ps(value.v, root)), half);
- return F32x8(root);
- #else
- return F32x8(
- sqrt(value.scalars[0]),
- sqrt(value.scalars[1]),
- sqrt(value.scalars[2]),
- sqrt(value.scalars[3]),
- sqrt(value.scalars[4]),
- sqrt(value.scalars[5]),
- sqrt(value.scalars[6]),
- sqrt(value.scalars[7]));
- #endif
- }
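- // The refinement above is one step of Heron's method: root = (root + value / root) * 0.5.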
- union I32x8 {
- private:
- // The uninitialized default constructor is private for safety reasons.
- I32x8() {}
- public:
- // When the uninitialized constructor is needed for performance, use this named constructor instead.
- static inline I32x8 create_dangerous_uninitialized() { return I32x8(); }
- #if defined USE_256BIT_X_SIMD
- public:
- #if defined USE_DIRECT_SIMD_MEMORY_ACCESS
- // Only use if USE_DIRECT_SIMD_MEMORY_ACCESS is defined!
- int32_t scalars[8];
- #endif
- // The SIMD vector of undefined type
- // Not accessible while emulating!
- SIMD_I32x8 v;
- // Construct a portable vector from a native SIMD vector
- explicit I32x8(const SIMD_I32x8& v) : v(v) {}
- // Construct a portable vector from a set of scalars
- I32x8(int32_t a1, int32_t a2, int32_t a3, int32_t a4, int32_t a5, int32_t a6, int32_t a7, int32_t a8)
- : v(LOAD_VECTOR_I32_SIMD256(a1, a2, a3, a4, a5, a6, a7, a8)) {}
- // Construct a portable vector from a single duplicated scalar
- explicit I32x8(int32_t scalar) : v(LOAD_SCALAR_I32_SIMD256(scalar)) {}
- #else
- public:
- // Emulate a SIMD vector as an array of scalars without hardware support.
- // Only accessible while emulating!
- int32_t scalars[8];
- // Construct a portable vector from a set of scalars
- I32x8(int32_t a1, int32_t a2, int32_t a3, int32_t a4, int32_t a5, int32_t a6, int32_t a7, int32_t a8) {
- this->scalars[0] = a1;
- this->scalars[1] = a2;
- this->scalars[2] = a3;
- this->scalars[3] = a4;
- this->scalars[4] = a5;
- this->scalars[5] = a6;
- this->scalars[6] = a7;
- this->scalars[7] = a8;
- }
- // Construct a portable vector from a single duplicated scalar
- explicit I32x8(int32_t scalar) {
- this->scalars[0] = scalar;
- this->scalars[1] = scalar;
- this->scalars[2] = scalar;
- this->scalars[3] = scalar;
- this->scalars[4] = scalar;
- this->scalars[5] = scalar;
- this->scalars[6] = scalar;
- this->scalars[7] = scalar;
- }
- #endif
- // Create a gradient vector using start and increment, so that arbitrary length vectors have a way to initialize linear iterations.
- static inline I32x8 createGradient(int32_t start, int32_t increment) {
- return I32x8(
- start,
- start + increment,
- start + increment * 2,
- start + increment * 3,
- start + increment * 4,
- start + increment * 5,
- start + increment * 6,
- start + increment * 7
- );
- }
- // Construct a portable SIMD vector from a pointer to aligned data.
- static inline I32x8 readAlignedUnsafe(const int32_t* data) {
- #ifdef SAFE_POINTER_CHECKS
- if (uintptr_t(data) & 31u) { throwError(U"Unaligned pointer detected in I32x8::readAlignedUnsafe!\n"); }
- #endif
- #if defined USE_AVX2
- return I32x8(_mm256_load_si256((const __m256i*)data));
- #else
- return I32x8(data[0], data[1], data[2], data[3], data[4], data[5], data[6], data[7]);
- #endif
- }
- // Write to aligned memory from the existing vector.
- inline void writeAlignedUnsafe(int32_t* data) const {
- #ifdef SAFE_POINTER_CHECKS
- if (uintptr_t(data) & 31u) { throwError(U"Unaligned pointer detected in I32x8::writeAlignedUnsafe!\n"); }
- #endif
- #if defined USE_AVX2
- _mm256_store_si256((__m256i*)data, this->v);
- #else
- data[0] = this->scalars[0];
- data[1] = this->scalars[1];
- data[2] = this->scalars[2];
- data[3] = this->scalars[3];
- data[4] = this->scalars[4];
- data[5] = this->scalars[5];
- data[6] = this->scalars[6];
- data[7] = this->scalars[7];
- #endif
- }
- // Bound and alignment checked reading
- static inline I32x8 readAligned(dsr::SafePointer<const int32_t> data, const char* methodName) {
- const int32_t* pointer = data.getUnsafe();
- #if defined SAFE_POINTER_CHECKS
- data.assertInside(methodName, pointer, 32);
- #endif
- return I32x8::readAlignedUnsafe(pointer);
- }
- // Bound and alignment checked writing
- inline void writeAligned(dsr::SafePointer<int32_t> data, const char* methodName) const {
- int32_t* pointer = data.getUnsafe();
- #if defined SAFE_POINTER_CHECKS
- data.assertInside(methodName, pointer, 32);
- #endif
- this->writeAlignedUnsafe(pointer);
- }
- };
- union U32x8 {
- private:
- // The uninitialized default constructor is private for safety reasons.
- U32x8() {}
- public:
- // When the uninitialized constructor is needed for performance, use this named constructor instead.
- static inline U32x8 create_dangerous_uninitialized() { return U32x8(); }
- #if defined USE_256BIT_X_SIMD
- public:
- #if defined USE_DIRECT_SIMD_MEMORY_ACCESS
- // Only use if USE_DIRECT_SIMD_MEMORY_ACCESS is defined!
- uint32_t scalars[8];
- #endif
- // The SIMD vector of undefined type
- // Not accessible while emulating!
- SIMD_U32x8 v;
- // Construct a portable vector from a native SIMD vector
- explicit U32x8(const SIMD_U32x8& v) : v(v) {}
- // Construct a portable vector from a set of scalars
- U32x8(uint32_t a1, uint32_t a2, uint32_t a3, uint32_t a4, uint32_t a5, uint32_t a6, uint32_t a7, uint32_t a8)
- : v(LOAD_VECTOR_U32_SIMD256(a1, a2, a3, a4, a5, a6, a7, a8)) {}
- // Construct a portable vector from a single duplicated scalar
- explicit U32x8(uint32_t scalar) : v(LOAD_SCALAR_U32_SIMD256(scalar)) {}
- #else
- public:
- // Emulate a SIMD vector as an array of scalars without hardware support.
- // Only accessible while emulating!
- uint32_t scalars[8];
- // Construct a portable vector from a set of scalars
- U32x8(uint32_t a1, uint32_t a2, uint32_t a3, uint32_t a4, uint32_t a5, uint32_t a6, uint32_t a7, uint32_t a8) {
- this->scalars[0] = a1;
- this->scalars[1] = a2;
- this->scalars[2] = a3;
- this->scalars[3] = a4;
- this->scalars[4] = a5;
- this->scalars[5] = a6;
- this->scalars[6] = a7;
- this->scalars[7] = a8;
- }
- // Construct a portable vector from a single duplicated scalar
- explicit U32x8(uint32_t scalar) {
- this->scalars[0] = scalar;
- this->scalars[1] = scalar;
- this->scalars[2] = scalar;
- this->scalars[3] = scalar;
- this->scalars[4] = scalar;
- this->scalars[5] = scalar;
- this->scalars[6] = scalar;
- this->scalars[7] = scalar;
- }
- #endif
- // Create a gradient vector using start and increment, so that arbitrary length vectors have a way to initialize linear iterations.
- static inline U32x8 createGradient(uint32_t start, uint32_t increment) {
- return U32x8(
- start,
- start + increment,
- start + increment * 2,
- start + increment * 3,
- start + increment * 4,
- start + increment * 5,
- start + increment * 6,
- start + increment * 7
- );
- }
- // Construct a portable SIMD vector from a pointer to aligned data.
- // Data must be aligned to at least 32 bytes.
- static inline U32x8 readAlignedUnsafe(const uint32_t* data) {
- #ifdef SAFE_POINTER_CHECKS
- if (uintptr_t(data) & 31u) { throwError(U"Unaligned pointer detected in U32x8::readAlignedUnsafe!\n"); }
- #endif
- #if defined USE_AVX2
- return U32x8(_mm256_load_si256((const __m256i*)data));
- #else
- return U32x8(data[0], data[1], data[2], data[3], data[4], data[5], data[6], data[7]);
- #endif
- }
- // Write to aligned memory from the existing vector.
- // Data must be aligned to at least 32 bytes.
- inline void writeAlignedUnsafe(uint32_t* data) const {
- #ifdef SAFE_POINTER_CHECKS
- if (uintptr_t(data) & 31u) { throwError(U"Unaligned pointer detected in U32x8::writeAlignedUnsafe!\n"); }
- #endif
- #if defined USE_AVX2
- _mm256_store_si256((__m256i*)data, this->v);
- #else
- data[0] = this->scalars[0];
- data[1] = this->scalars[1];
- data[2] = this->scalars[2];
- data[3] = this->scalars[3];
- data[4] = this->scalars[4];
- data[5] = this->scalars[5];
- data[6] = this->scalars[6];
- data[7] = this->scalars[7];
- #endif
- }
- // Bound and alignment checked reading
- static inline U32x8 readAligned(dsr::SafePointer<const uint32_t> data, const char* methodName) {
- const uint32_t* pointer = data.getUnsafe();
- #if defined SAFE_POINTER_CHECKS
- data.assertInside(methodName, pointer, 32);
- #endif
- return U32x8::readAlignedUnsafe(pointer);
- }
- // Bound and alignment checked writing
- inline void writeAligned(dsr::SafePointer<uint32_t> data, const char* methodName) const {
- uint32_t* pointer = data.getUnsafe();
- #if defined SAFE_POINTER_CHECKS
- data.assertInside(methodName, pointer, 32);
- #endif
- this->writeAlignedUnsafe(pointer);
- }
- };
- union U16x16 {
- private:
- // The uninitialized default constructor is private for safety reasons.
- U16x16() {}
- public:
- // When the uninitialized constructor is needed for performance, use this named constructor instead.
- static inline U16x16 create_dangerous_uninitialized() { return U16x16(); }
- #if defined USE_256BIT_X_SIMD
- public:
- #if defined USE_DIRECT_SIMD_MEMORY_ACCESS
- // Only use if USE_DIRECT_SIMD_MEMORY_ACCESS is defined!
- uint16_t scalars[16];
- #endif
- // The SIMD vector of undefined type
- // Not accessible while emulating!
- SIMD_U16x16 v;
- // Construct a portable vector from a native SIMD vector
- explicit U16x16(const SIMD_U16x16& v) : v(v) {}
- // Construct a vector of 16 x 16-bit unsigned integers from a vector of 8 x 32-bit unsigned integers
- // Reinterpret casting is used
- explicit U16x16(const U32x8& vector) : v(REINTERPRET_U32_TO_U16_SIMD256(vector.v)) {}
- // Construct a portable vector from a set of scalars
- U16x16(uint16_t a1, uint16_t a2, uint16_t a3, uint16_t a4, uint16_t a5, uint16_t a6, uint16_t a7, uint16_t a8,
- uint16_t a9, uint16_t a10, uint16_t a11, uint16_t a12, uint16_t a13, uint16_t a14, uint16_t a15, uint16_t a16)
- : v(LOAD_VECTOR_U16_SIMD256(a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11, a12, a13, a14, a15, a16)) {}
- // Construct a vector of 16 x 16-bit unsigned integers from a single duplicated 32-bit unsigned integer
- // Reinterpret casting is used
- // TODO: Remove all reinterprets from constructors to improve readability
- explicit U16x16(uint32_t scalar) : v(REINTERPRET_U32_TO_U16_SIMD256(LOAD_SCALAR_U32_SIMD256(scalar))) {}
- // Construct a portable vector from a single duplicated scalar
- explicit U16x16(uint16_t scalar) : v(LOAD_SCALAR_U16_SIMD256(scalar)) {}
- // Reinterpret cast to a vector of 8 x 32-bit unsigned integers
- U32x8 get_U32() const {
- return U32x8(REINTERPRET_U16_TO_U32_SIMD256(this->v));
- }
- #else
- public:
- // Emulate a SIMD vector as an array of scalars without hardware support.
- // Only accessible while emulating!
- uint16_t scalars[16];
- // Construct a vector of 16 x 16-bit unsigned integers from a vector of 8 x 32-bit unsigned integers
- // Reinterpret casting is used
- explicit U16x16(const U32x8& vector) {
- uint64_t *target = (uint64_t*)this->scalars;
- uint64_t *source = (uint64_t*)vector.scalars;
- target[0] = source[0];
- target[1] = source[1];
- target[2] = source[2];
- target[3] = source[3];
- }
- // Construct a portable vector from a set of scalars
- U16x16(uint16_t a1, uint16_t a2, uint16_t a3, uint16_t a4, uint16_t a5, uint16_t a6, uint16_t a7, uint16_t a8,
- uint16_t a9, uint16_t a10, uint16_t a11, uint16_t a12, uint16_t a13, uint16_t a14, uint16_t a15, uint16_t a16) {
- this->scalars[0] = a1;
- this->scalars[1] = a2;
- this->scalars[2] = a3;
- this->scalars[3] = a4;
- this->scalars[4] = a5;
- this->scalars[5] = a6;
- this->scalars[6] = a7;
- this->scalars[7] = a8;
- this->scalars[8] = a9;
- this->scalars[9] = a10;
- this->scalars[10] = a11;
- this->scalars[11] = a12;
- this->scalars[12] = a13;
- this->scalars[13] = a14;
- this->scalars[14] = a15;
- this->scalars[15] = a16;
- }
- // Construct a vector of 16 x 16-bit unsigned integers from a single duplicated 32-bit unsigned integer
- // Reinterpret casting is used
- explicit U16x16(uint32_t scalar) {
- uint32_t *target = (uint32_t*)this->scalars;
- target[0] = scalar;
- target[1] = scalar;
- target[2] = scalar;
- target[3] = scalar;
- target[4] = scalar;
- target[5] = scalar;
- target[6] = scalar;
- target[7] = scalar;
- }
- // Construct a portable vector from a single duplicated scalar
- explicit U16x16(uint16_t scalar) {
- this->scalars[0] = scalar;
- this->scalars[1] = scalar;
- this->scalars[2] = scalar;
- this->scalars[3] = scalar;
- this->scalars[4] = scalar;
- this->scalars[5] = scalar;
- this->scalars[6] = scalar;
- this->scalars[7] = scalar;
- this->scalars[8] = scalar;
- this->scalars[9] = scalar;
- this->scalars[10] = scalar;
- this->scalars[11] = scalar;
- this->scalars[12] = scalar;
- this->scalars[13] = scalar;
- this->scalars[14] = scalar;
- this->scalars[15] = scalar;
- }
- // Reinterpret cast to a vector of 8 x 32-bit unsigned integers
- U32x8 get_U32() const {
- U32x8 result(0);
- uint64_t *target = (uint64_t*)result.scalars;
- uint64_t *source = (uint64_t*)this->scalars;
- target[0] = source[0];
- target[1] = source[1];
- target[2] = source[2];
- target[3] = source[3];
- return result;
- }
- #endif
- // Create a gradient vector using start and increment, so that arbitrary length vectors have a way to initialize linear iterations.
- static inline U16x16 createGradient(uint16_t start, uint16_t increment) {
- return U16x16(
- start,
- start + increment,
- start + increment * 2,
- start + increment * 3,
- start + increment * 4,
- start + increment * 5,
- start + increment * 6,
- start + increment * 7,
- start + increment * 8,
- start + increment * 9,
- start + increment * 10,
- start + increment * 11,
- start + increment * 12,
- start + increment * 13,
- start + increment * 14,
- start + increment * 15
- );
- }
- // Data must be aligned to at least 32 bytes.
- static inline U16x16 readAlignedUnsafe(const uint16_t* data) {
- #ifdef SAFE_POINTER_CHECKS
- if (uintptr_t(data) & 31u) { throwError(U"Unaligned pointer detected in U16x16::readAlignedUnsafe!\n"); }
- #endif
- #if defined USE_AVX2
- return U16x16(_mm256_load_si256((const __m256i*)data));
- #else
- return U16x16(
- data[0],
- data[1],
- data[2],
- data[3],
- data[4],
- data[5],
- data[6],
- data[7],
- data[8],
- data[9],
- data[10],
- data[11],
- data[12],
- data[13],
- data[14],
- data[15]
- );
- #endif
- }
- // Data must be aligned to at least 32 bytes.
- inline void writeAlignedUnsafe(uint16_t* data) const {
- #ifdef SAFE_POINTER_CHECKS
- if (uintptr_t(data) & 31u) { throwError(U"Unaligned pointer detected in U16x16::writeAlignedUnsafe!\n"); }
- #endif
- #if defined USE_AVX2
- _mm256_store_si256((__m256i*)data, this->v);
- #else
- data[0] = this->scalars[0];
- data[1] = this->scalars[1];
- data[2] = this->scalars[2];
- data[3] = this->scalars[3];
- data[4] = this->scalars[4];
- data[5] = this->scalars[5];
- data[6] = this->scalars[6];
- data[7] = this->scalars[7];
- data[8] = this->scalars[8];
- data[9] = this->scalars[9];
- data[10] = this->scalars[10];
- data[11] = this->scalars[11];
- data[12] = this->scalars[12];
- data[13] = this->scalars[13];
- data[14] = this->scalars[14];
- data[15] = this->scalars[15];
- #endif
- }
- // Bound and alignment checked reading
- static inline U16x16 readAligned(dsr::SafePointer<const uint16_t> data, const char* methodName) {
- const uint16_t* pointer = data.getUnsafe();
- #if defined SAFE_POINTER_CHECKS
- data.assertInside(methodName, pointer, 32);
- #endif
- return U16x16::readAlignedUnsafe(pointer);
- }
- // Bound and alignment checked writing
- inline void writeAligned(dsr::SafePointer<uint16_t> data, const char* methodName) const {
- uint16_t* pointer = data.getUnsafe();
- #if defined SAFE_POINTER_CHECKS
- data.assertInside(methodName, pointer, 32);
- #endif
- this->writeAlignedUnsafe(pointer);
- }
- };
- union U8x32 {
- private:
- // The uninitialized default constructor is private for safety reasons.
- U8x32() {}
- public:
- // When the uninitialized constructor is needed for performance, use this named constructor instead.
- static inline U8x32 create_dangerous_uninitialized() { return U8x32(); }
- #if defined USE_256BIT_X_SIMD
- public:
- #if defined USE_DIRECT_SIMD_MEMORY_ACCESS
- // Only use if USE_DIRECT_SIMD_MEMORY_ACCESS is defined!
- uint8_t scalars[32];
- #endif
- // The SIMD vector of undefined type
- // Not accessible while emulating!
- SIMD_U8x32 v;
- // Construct a portable vector from a native SIMD vector
- explicit U8x32(const SIMD_U8x32& v) : v(v) {}
- // Construct a portable vector from a set of scalars
- U8x32(uint8_t a1, uint8_t a2, uint8_t a3, uint8_t a4, uint8_t a5, uint8_t a6, uint8_t a7, uint8_t a8,
- uint8_t a9, uint8_t a10, uint8_t a11, uint8_t a12, uint8_t a13, uint8_t a14, uint8_t a15, uint8_t a16,
- uint8_t a17, uint8_t a18, uint8_t a19, uint8_t a20, uint8_t a21, uint8_t a22, uint8_t a23, uint8_t a24,
- uint8_t a25, uint8_t a26, uint8_t a27, uint8_t a28, uint8_t a29, uint8_t a30, uint8_t a31, uint8_t a32)
- : v(LOAD_VECTOR_U8_SIMD256(
- a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11, a12, a13, a14, a15, a16,
- a17, a18, a19, a20, a21, a22, a23, a24, a25, a26, a27, a28, a29, a30, a31, a32
- )) {}
- // Construct a portable vector from a single duplicated scalar
- explicit U8x32(uint8_t scalar) : v(LOAD_SCALAR_U8_SIMD256(scalar)) {}
- #else
- public:
- // Emulate a SIMD vector as an array of scalars without hardware support.
- // Only accessible while emulating!
- uint8_t scalars[32];
- // Construct a portable vector from a set of scalars
- U8x32(uint8_t a1, uint8_t a2, uint8_t a3, uint8_t a4, uint8_t a5, uint8_t a6, uint8_t a7, uint8_t a8,
- uint8_t a9, uint8_t a10, uint8_t a11, uint8_t a12, uint8_t a13, uint8_t a14, uint8_t a15, uint8_t a16,
- uint8_t a17, uint8_t a18, uint8_t a19, uint8_t a20, uint8_t a21, uint8_t a22, uint8_t a23, uint8_t a24,
- uint8_t a25, uint8_t a26, uint8_t a27, uint8_t a28, uint8_t a29, uint8_t a30, uint8_t a31, uint8_t a32) {
- this->scalars[0] = a1;
- this->scalars[1] = a2;
- this->scalars[2] = a3;
- this->scalars[3] = a4;
- this->scalars[4] = a5;
- this->scalars[5] = a6;
- this->scalars[6] = a7;
- this->scalars[7] = a8;
- this->scalars[8] = a9;
- this->scalars[9] = a10;
- this->scalars[10] = a11;
- this->scalars[11] = a12;
- this->scalars[12] = a13;
- this->scalars[13] = a14;
- this->scalars[14] = a15;
- this->scalars[15] = a16;
- this->scalars[16] = a17;
- this->scalars[17] = a18;
- this->scalars[18] = a19;
- this->scalars[19] = a20;
- this->scalars[20] = a21;
- this->scalars[21] = a22;
- this->scalars[22] = a23;
- this->scalars[23] = a24;
- this->scalars[24] = a25;
- this->scalars[25] = a26;
- this->scalars[26] = a27;
- this->scalars[27] = a28;
- this->scalars[28] = a29;
- this->scalars[29] = a30;
- this->scalars[30] = a31;
- this->scalars[31] = a32;
- }
- // Construct a portable vector from a single duplicated scalar
- explicit U8x32(uint8_t scalar) {
- for (int i = 0; i < 32; i++) {
- this->scalars[i] = scalar;
- }
- }
- #endif
- // Create a gradient vector using start and increment, so that arbitrary length vectors have a way to initialize linear iterations.
- static inline U8x32 createGradient(uint8_t start, uint8_t increment) {
- return U8x32(
- start,
- start + increment,
- start + increment * 2,
- start + increment * 3,
- start + increment * 4,
- start + increment * 5,
- start + increment * 6,
- start + increment * 7,
- start + increment * 8,
- start + increment * 9,
- start + increment * 10,
- start + increment * 11,
- start + increment * 12,
- start + increment * 13,
- start + increment * 14,
- start + increment * 15,
- start + increment * 16,
- start + increment * 17,
- start + increment * 18,
- start + increment * 19,
- start + increment * 20,
- start + increment * 21,
- start + increment * 22,
- start + increment * 23,
- start + increment * 24,
- start + increment * 25,
- start + increment * 26,
- start + increment * 27,
- start + increment * 28,
- start + increment * 29,
- start + increment * 30,
- start + increment * 31
- );
- }
- // Data must be aligned to at least 32 bytes.
- static inline U8x32 readAlignedUnsafe(const uint8_t* data) {
- #ifdef SAFE_POINTER_CHECKS
- if (uintptr_t(data) & 31u) { throwError(U"Unaligned pointer detected in U8x32::readAlignedUnsafe!\n"); }
- #endif
- #if defined USE_AVX2
- return U8x32(_mm256_load_si256((const __m256i*)data));
- #else
- U8x32 result;
- for (int i = 0; i < 32; i++) {
- result.scalars[i] = data[i];
- }
- return result;
- #endif
- }
- // Data must be aligned to at least 32 bytes.
- inline void writeAlignedUnsafe(uint8_t* data) const {
- #ifdef SAFE_POINTER_CHECKS
- if (uintptr_t(data) & 31u) { throwError(U"Unaligned pointer detected in U8x32::writeAlignedUnsafe!\n"); }
- #endif
- #if defined USE_AVX2
- _mm256_store_si256((__m256i*)data, this->v);
- #else
- for (int i = 0; i < 32; i++) {
- data[i] = this->scalars[i];
- }
- #endif
- }
- // Bound and alignment checked reading
- static inline U8x32 readAligned(dsr::SafePointer<const uint8_t> data, const char* methodName) {
- const uint8_t* pointer = data.getUnsafe();
- #if defined SAFE_POINTER_CHECKS
- data.assertInside(methodName, pointer, 32);
- #endif
- return U8x32::readAlignedUnsafe(pointer);
- }
- // Bound and alignment checked writing
- inline void writeAligned(dsr::SafePointer<uint8_t> data, const char* methodName) const {
- uint8_t* pointer = data.getUnsafe();
- #if defined SAFE_POINTER_CHECKS
- data.assertInside(methodName, pointer, 32);
- #endif
- this->writeAlignedUnsafe(pointer);
- }
- };
- // Helper macros for applying an operation to selected sets of SIMD vector types
- // Each list expands DO(vector_type, element_type, lane_count) once per type in the set
- #define FOR_ALL_VECTOR_TYPES(DO) \
- DO(F32x4, float, 4) \
- DO(I32x4, int32_t, 4) \
- DO(U32x4, uint32_t, 4) \
- DO(U16x8, uint16_t, 8) \
- DO(U8x16, uint8_t, 16) \
- DO(F32x8, float, 8) \
- DO(I32x8, int32_t, 8) \
- DO(U32x8, uint32_t, 8) \
- DO(U16x16, uint16_t, 16) \
- DO(U8x32, uint8_t, 32)
- #define FOR_FLOAT_VECTOR_TYPES(DO) \
- DO(F32x4, float, 4) \
- DO(F32x8, float, 8)
- #define FOR_INTEGER_VECTOR_TYPES(DO) \
- DO(I32x4, int32_t, 4) \
- DO(U32x4, uint32_t, 4) \
- DO(U16x8, uint16_t, 8) \
- DO(U8x16, uint8_t, 16) \
- DO(I32x8, int32_t, 8) \
- DO(U32x8, uint32_t, 8) \
- DO(U16x16, uint16_t, 16) \
- DO(U8x32, uint8_t, 32)
- #define FOR_SIGNED_VECTOR_TYPES(DO) \
- DO(F32x4, float, 4) \
- DO(I32x4, int32_t, 4) \
- DO(F32x8, float, 8) \
- DO(I32x8, int32_t, 8)
- #define FOR_UNSIGNED_VECTOR_TYPES(DO) \
- DO(U32x4, uint32_t, 4) \
- DO(U16x8, uint16_t, 8) \
- DO(U8x16, uint8_t, 16) \
- DO(U32x8, uint32_t, 8) \
- DO(U16x16, uint16_t, 16) \
- DO(U8x32, uint8_t, 32)
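- // For example, FOR_FLOAT_VECTOR_TYPES(DO) expands into DO(F32x4, float, 4) DO(F32x8, float, 8), so one DO macro generates a definition for each float vector type.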
- // Print SIMD vectors to the terminal or append them to strings.
- #define CREATE_METHOD_PRINT(VECTOR_TYPE, ELEMENT_TYPE, LANE_COUNT) \
- inline dsr::String& string_toStreamIndented(dsr::String& target, const VECTOR_TYPE& source, const dsr::ReadableString& indentation) { \
- ALIGN_BYTES(sizeof(VECTOR_TYPE)) ELEMENT_TYPE a[LANE_COUNT]; \
- source.writeAlignedUnsafe(a); \
- dsr::string_append(target, indentation, a[0]); \
- for (int i = 1; i < LANE_COUNT; i++) { \
- string_append(target, U", ", a[i]); \
- } \
- return target; \
- }
- // All SIMD vectors can be printed.
- FOR_ALL_VECTOR_TYPES(CREATE_METHOD_PRINT)
- #undef CREATE_METHOD_PRINT
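- // For example, string_toStreamIndented(target, U32x4(1u, 2u, 3u, 4u), U"") is expected to append "1, 2, 3, 4" to target.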
- // Integer equality returns true iff every pair of lanes is identical.
- #define CREATE_EXACT_EQUALITY(VECTOR_TYPE, ELEMENT_TYPE, LANE_COUNT) \
- inline bool allLanesEqual(const VECTOR_TYPE& left, const VECTOR_TYPE& right) { \
- ALIGN_BYTES(sizeof(VECTOR_TYPE)) ELEMENT_TYPE a[LANE_COUNT]; \
- ALIGN_BYTES(sizeof(VECTOR_TYPE)) ELEMENT_TYPE b[LANE_COUNT]; \
- left.writeAlignedUnsafe(a); \
- right.writeAlignedUnsafe(b); \
- for (int i = 0; i < LANE_COUNT; i++) { \
- if (a[i] != b[i]) return false; \
- } \
- return true; \
- } \
- inline bool allLanesNotEqual(const VECTOR_TYPE& left, const VECTOR_TYPE& right) { \
- ALIGN_BYTES(sizeof(VECTOR_TYPE)) ELEMENT_TYPE a[LANE_COUNT]; \
- ALIGN_BYTES(sizeof(VECTOR_TYPE)) ELEMENT_TYPE b[LANE_COUNT]; \
- left.writeAlignedUnsafe(a); \
- right.writeAlignedUnsafe(b); \
- for (int i = 0; i < LANE_COUNT; i++) { \
- if (a[i] == b[i]) return false; \
- } \
- return true; \
- }
- // Integer SIMD vectors have exact equality.
- FOR_INTEGER_VECTOR_TYPES(CREATE_EXACT_EQUALITY)
- #undef CREATE_EXACT_EQUALITY
- // Float equality returns true iff every pair of lanes differs by less than an absolute tolerance of 0.0001.
- #define CREATE_TOLERANT_EQUALITY(VECTOR_TYPE, ELEMENT_TYPE, LANE_COUNT) \
- inline bool allLanesEqual(const VECTOR_TYPE& left, const VECTOR_TYPE& right) { \
- ALIGN_BYTES(sizeof(VECTOR_TYPE)) ELEMENT_TYPE a[LANE_COUNT]; \
- ALIGN_BYTES(sizeof(VECTOR_TYPE)) ELEMENT_TYPE b[LANE_COUNT]; \
- left.writeAlignedUnsafe(a); \
- right.writeAlignedUnsafe(b); \
- for (int i = 0; i < LANE_COUNT; i++) { \
- if (fabs(a[i] - b[i]) >= 0.0001f) return false; \
- } \
- return true; \
- } \
- inline bool allLanesNotEqual(const VECTOR_TYPE& left, const VECTOR_TYPE& right) { \
- ALIGN_BYTES(sizeof(VECTOR_TYPE)) ELEMENT_TYPE a[LANE_COUNT]; \
- ALIGN_BYTES(sizeof(VECTOR_TYPE)) ELEMENT_TYPE b[LANE_COUNT]; \
- left.writeAlignedUnsafe(a); \
- right.writeAlignedUnsafe(b); \
- for (int i = 0; i < LANE_COUNT; i++) { \
- if (fabs(a[i] - b[i]) < 0.0001f) return false; \
- } \
- return true; \
- }
- // Float SIMD vectors have inexact equality.
- FOR_FLOAT_VECTOR_TYPES(CREATE_TOLERANT_EQUALITY)
- #undef CREATE_TOLERANT_EQUALITY
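- // For example, allLanesEqual(F32x4(1.0f), F32x4(1.00005f)) is true, because every lane differs by less than the 0.0001 tolerance.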
- #define CREATE_COMPARISONS(VECTOR_TYPE, ELEMENT_TYPE, LANE_COUNT) \
- inline bool allLanesGreater(const VECTOR_TYPE& left, const VECTOR_TYPE& right) { \
- ALIGN_BYTES(sizeof(VECTOR_TYPE)) ELEMENT_TYPE a[LANE_COUNT]; \
- ALIGN_BYTES(sizeof(VECTOR_TYPE)) ELEMENT_TYPE b[LANE_COUNT]; \
- left.writeAlignedUnsafe(a); \
- right.writeAlignedUnsafe(b); \
- for (int i = 0; i < LANE_COUNT; i++) { \
- if (a[i] <= b[i]) return false; \
- } \
- return true; \
- } \
- inline bool allLanesGreaterOrEqual(const VECTOR_TYPE& left, const VECTOR_TYPE& right) { \
- ALIGN_BYTES(sizeof(VECTOR_TYPE)) ELEMENT_TYPE a[LANE_COUNT]; \
- ALIGN_BYTES(sizeof(VECTOR_TYPE)) ELEMENT_TYPE b[LANE_COUNT]; \
- left.writeAlignedUnsafe(a); \
- right.writeAlignedUnsafe(b); \
- for (int i = 0; i < LANE_COUNT; i++) { \
- if (a[i] < b[i]) return false; \
- } \
- return true; \
- } \
- inline bool allLanesLesser(const VECTOR_TYPE& left, const VECTOR_TYPE& right) { \
- ALIGN_BYTES(sizeof(VECTOR_TYPE)) ELEMENT_TYPE a[LANE_COUNT]; \
- ALIGN_BYTES(sizeof(VECTOR_TYPE)) ELEMENT_TYPE b[LANE_COUNT]; \
- left.writeAlignedUnsafe(a); \
- right.writeAlignedUnsafe(b); \
- for (int i = 0; i < LANE_COUNT; i++) { \
- if (a[i] >= b[i]) return false; \
- } \
- return true; \
- } \
- inline bool allLanesLesserOrEqual(const VECTOR_TYPE& left, const VECTOR_TYPE& right) { \
- ALIGN_BYTES(sizeof(VECTOR_TYPE)) ELEMENT_TYPE a[LANE_COUNT]; \
- ALIGN_BYTES(sizeof(VECTOR_TYPE)) ELEMENT_TYPE b[LANE_COUNT]; \
- left.writeAlignedUnsafe(a); \
- right.writeAlignedUnsafe(b); \
- for (int i = 0; i < LANE_COUNT; i++) { \
- if (a[i] > b[i]) return false; \
- } \
- return true; \
- }
- FOR_ALL_VECTOR_TYPES(CREATE_COMPARISONS)
- #undef CREATE_COMPARISONS
- inline F32x4 operator+(const F32x4& left, const F32x4& right) {
- #if defined USE_BASIC_SIMD
- return F32x4(ADD_F32_SIMD(left.v, right.v));
- #else
- return F32x4(left.scalars[0] + right.scalars[0], left.scalars[1] + right.scalars[1], left.scalars[2] + right.scalars[2], left.scalars[3] + right.scalars[3]);
- #endif
- }
- inline F32x4 operator-(const F32x4& left, const F32x4& right) {
- #if defined USE_BASIC_SIMD
- return F32x4(SUB_F32_SIMD(left.v, right.v));
- #else
- return F32x4(left.scalars[0] - right.scalars[0], left.scalars[1] - right.scalars[1], left.scalars[2] - right.scalars[2], left.scalars[3] - right.scalars[3]);
- #endif
- }
- inline F32x4 operator*(const F32x4& left, const F32x4& right) {
- #if defined USE_BASIC_SIMD
- return F32x4(MUL_F32_SIMD(left.v, right.v));
- #else
- return F32x4(left.scalars[0] * right.scalars[0], left.scalars[1] * right.scalars[1], left.scalars[2] * right.scalars[2], left.scalars[3] * right.scalars[3]);
- #endif
- }
- inline F32x4 min(const F32x4& left, const F32x4& right) {
- #if defined USE_BASIC_SIMD
- return F32x4(MIN_F32_SIMD(left.v, right.v));
- #else
- float v0 = left.scalars[0];
- float v1 = left.scalars[1];
- float v2 = left.scalars[2];
- float v3 = left.scalars[3];
- float r0 = right.scalars[0];
- float r1 = right.scalars[1];
- float r2 = right.scalars[2];
- float r3 = right.scalars[3];
- if (r0 < v0) { v0 = r0; }
- if (r1 < v1) { v1 = r1; }
- if (r2 < v2) { v2 = r2; }
- if (r3 < v3) { v3 = r3; }
- return F32x4(v0, v1, v2, v3);
- #endif
- }
- inline F32x4 max(const F32x4& left, const F32x4& right) {
- #if defined USE_BASIC_SIMD
- return F32x4(MAX_F32_SIMD(left.v, right.v));
- #else
- float v0 = left.scalars[0];
- float v1 = left.scalars[1];
- float v2 = left.scalars[2];
- float v3 = left.scalars[3];
- float r0 = right.scalars[0];
- float r1 = right.scalars[1];
- float r2 = right.scalars[2];
- float r3 = right.scalars[3];
- if (r0 > v0) { v0 = r0; }
- if (r1 > v1) { v1 = r1; }
- if (r2 > v2) { v2 = r2; }
- if (r3 > v3) { v3 = r3; }
- return F32x4(v0, v1, v2, v3);
- #endif
- }
- inline I32x4 operator+(const I32x4& left, const I32x4& right) {
- #if defined USE_BASIC_SIMD
- return I32x4(ADD_I32_SIMD(left.v, right.v));
- #else
- return I32x4(left.scalars[0] + right.scalars[0], left.scalars[1] + right.scalars[1], left.scalars[2] + right.scalars[2], left.scalars[3] + right.scalars[3]);
- #endif
- }
- inline I32x4 operator-(const I32x4& left, const I32x4& right) {
- #if defined USE_BASIC_SIMD
- return I32x4(SUB_I32_SIMD(left.v, right.v));
- #else
- return I32x4(left.scalars[0] - right.scalars[0], left.scalars[1] - right.scalars[1], left.scalars[2] - right.scalars[2], left.scalars[3] - right.scalars[3]);
- #endif
- }
- inline I32x4 operator*(const I32x4& left, const I32x4& right) {
- #if defined USE_BASIC_SIMD
- #if defined USE_SSE2
- // TODO: Use _mm_mullo_epi32 for 32-bit integer multiplication when SSE4.1 or AVX2 is available; SSE2 has no packed 32-bit multiply.
- return I32x4(left.scalars[0] * right.scalars[0], left.scalars[1] * right.scalars[1], left.scalars[2] * right.scalars[2], left.scalars[3] * right.scalars[3]);
- #elif defined USE_NEON
- return I32x4(MUL_I32_NEON(left.v, right.v));
- #endif
- #else
- return I32x4(left.scalars[0] * right.scalars[0], left.scalars[1] * right.scalars[1], left.scalars[2] * right.scalars[2], left.scalars[3] * right.scalars[3]);
- #endif
- }
- // TODO: Specify the behavior of truncated unsigned integer overflow and add it to the tests.
- inline U32x4 operator+(const U32x4& left, const U32x4& right) {
- #if defined USE_BASIC_SIMD
- return U32x4(ADD_U32_SIMD(left.v, right.v));
- #else
- return U32x4(left.scalars[0] + right.scalars[0], left.scalars[1] + right.scalars[1], left.scalars[2] + right.scalars[2], left.scalars[3] + right.scalars[3]);
- #endif
- }
- inline U32x4 operator-(const U32x4& left, const U32x4& right) {
- #if defined USE_BASIC_SIMD
- return U32x4(SUB_U32_SIMD(left.v, right.v));
- #else
- return U32x4(left.scalars[0] - right.scalars[0], left.scalars[1] - right.scalars[1], left.scalars[2] - right.scalars[2], left.scalars[3] - right.scalars[3]);
- #endif
- }
- inline U32x4 operator*(const U32x4& left, const U32x4& right) {
- #if defined USE_BASIC_SIMD
- #if defined USE_SSE2
- // SSE2 has no packed 32-bit multiply, so the NEON behavior is emulated with scalar multiplications.
- return U32x4(left.scalars[0] * right.scalars[0], left.scalars[1] * right.scalars[1], left.scalars[2] * right.scalars[2], left.scalars[3] * right.scalars[3]);
- #else // NEON
- return U32x4(MUL_U32_NEON(left.v, right.v));
- #endif
- #else
- return U32x4(left.scalars[0] * right.scalars[0], left.scalars[1] * right.scalars[1], left.scalars[2] * right.scalars[2], left.scalars[3] * right.scalars[3]);
- #endif
- }
- // Bitwise and
- inline U32x4 operator&(const U32x4& left, const U32x4& right) {
- #if defined USE_BASIC_SIMD
- return U32x4(BITWISE_AND_U32_SIMD(left.v, right.v));
- #else
- return U32x4(left.scalars[0] & right.scalars[0], left.scalars[1] & right.scalars[1], left.scalars[2] & right.scalars[2], left.scalars[3] & right.scalars[3]);
- #endif
- }
- // Bitwise or
- inline U32x4 operator|(const U32x4& left, const U32x4& right) {
- #if defined USE_BASIC_SIMD
- return U32x4(BITWISE_OR_U32_SIMD(left.v, right.v));
- #else
- return U32x4(left.scalars[0] | right.scalars[0], left.scalars[1] | right.scalars[1], left.scalars[2] | right.scalars[2], left.scalars[3] | right.scalars[3]);
- #endif
- }
- // Bitwise xor
- inline U32x4 operator^(const U32x4& left, const U32x4& right) {
- #if defined USE_BASIC_SIMD
- return U32x4(BITWISE_XOR_U32_SIMD(left.v, right.v));
- #else
- return U32x4(left.scalars[0] ^ right.scalars[0], left.scalars[1] ^ right.scalars[1], left.scalars[2] ^ right.scalars[2], left.scalars[3] ^ right.scalars[3]);
- #endif
- }
- // Bitwise negation
- inline U32x4 operator~(const U32x4& value) {
- #if defined USE_NEON
- return U32x4(vmvnq_u32(value.v));
- #elif defined USE_BASIC_SIMD
- // Fall back on xor against all ones.
- return value ^ U32x4(~uint32_t(0));
- #else
- return U32x4(~value.scalars[0], ~value.scalars[1], ~value.scalars[2], ~value.scalars[3]);
- #endif
- }
- inline U32x4 operator<<(const U32x4& left, const U32x4 &bitOffsets) {
- assert(allLanesLesser(bitOffsets, U32x4(32u)));
- #if defined USE_NEON
- return U32x4(vshlq_u32(left.v, vreinterpretq_s32_u32(bitOffsets.v)));
- #else
- return U32x4(
- left.scalars[0] << bitOffsets.scalars[0],
- left.scalars[1] << bitOffsets.scalars[1],
- left.scalars[2] << bitOffsets.scalars[2],
- left.scalars[3] << bitOffsets.scalars[3]
- );
- #endif
- }
- inline U32x4 operator>>(const U32x4& left, const U32x4 &bitOffsets) {
- assert(allLanesLesser(bitOffsets, U32x4(32u)));
- #if defined USE_NEON
- // NEON only supports register-specified shifts through vshlq, so a right shift is a left shift by the negated offsets.
- return U32x4(vshlq_u32(left.v, vnegq_s32(vreinterpretq_s32_u32(bitOffsets.v))));
- #else
- return U32x4(
- left.scalars[0] >> bitOffsets.scalars[0],
- left.scalars[1] >> bitOffsets.scalars[1],
- left.scalars[2] >> bitOffsets.scalars[2],
- left.scalars[3] >> bitOffsets.scalars[3]
- );
- #endif
- }
- // bitOffset must be an immediate constant, so a template argument is used.
- template <uint32_t bitOffset>
- inline U32x4 bitShiftLeftImmediate(const U32x4& left) {
- static_assert(bitOffset < 32u, "Immediate left shift of 32-bit values may not shift more than 31 bits!");
- #if defined USE_SSE2
- return U32x4(_mm_slli_epi32(left.v, bitOffset));
- #else
- #if defined USE_NEON
- return U32x4(vshlq_u32(left.v, LOAD_SCALAR_I32_SIMD(bitOffset)));
- #else
- return U32x4(left.scalars[0] << bitOffset, left.scalars[1] << bitOffset, left.scalars[2] << bitOffset, left.scalars[3] << bitOffset);
- #endif
- #endif
- }
- // bitOffset must be an immediate constant.
- template <uint32_t bitOffset>
- inline U32x4 bitShiftRightImmediate(const U32x4& left) {
- static_assert(bitOffset < 32u, "Immediate right shift of 32-bit values may not shift more than 31 bits!");
- #if defined USE_SSE2
- return U32x4(_mm_srli_epi32(left.v, bitOffset));
- #else
- #if defined USE_NEON
- // NEON has no vshrq_u32 taking a register offset, and the immediate form vshrq_n_u32 rejects a zero offset, so shift left by the negated offset instead.
- return U32x4(vshlq_u32(left.v, LOAD_SCALAR_I32_SIMD(-(int32_t)bitOffset)));
- #else
- return U32x4(left.scalars[0] >> bitOffset, left.scalars[1] >> bitOffset, left.scalars[2] >> bitOffset, left.scalars[3] >> bitOffset);
- #endif
- #endif
- }
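- // For example, bitShiftLeftImmediate<4>(U32x4(1u)) yields 16 in every lane and bitShiftRightImmediate<4>(U32x4(256u)) yields 16 in every lane.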
-
- inline U16x8 operator<<(const U16x8& left, const U16x8 &bitOffsets) {
- // The uint16_t cast selects the per-lane constructor rather than the 32-bit reinterpreting constructor.
- assert(allLanesLesser(bitOffsets, U16x8(uint16_t(16))));
- #if defined USE_NEON
- return U16x8(vshlq_u16(left.v, vreinterpretq_s16_u16(bitOffsets.v)));
- #else
- return U16x8(
- left.scalars[0] << bitOffsets.scalars[0],
- left.scalars[1] << bitOffsets.scalars[1],
- left.scalars[2] << bitOffsets.scalars[2],
- left.scalars[3] << bitOffsets.scalars[3],
- left.scalars[4] << bitOffsets.scalars[4],
- left.scalars[5] << bitOffsets.scalars[5],
- left.scalars[6] << bitOffsets.scalars[6],
- left.scalars[7] << bitOffsets.scalars[7]
- );
- #endif
- }
- inline U16x8 operator>>(const U16x8& left, const U16x8 &bitOffsets) {
- // The uint16_t cast selects the per-lane constructor rather than the 32-bit reinterpreting constructor.
- assert(allLanesLesser(bitOffsets, U16x8(uint16_t(16))));
- #if defined USE_NEON
- // NEON only supports register-specified shifts through vshlq, so a right shift is a left shift by the negated offsets.
- return U16x8(vshlq_u16(left.v, vnegq_s16(vreinterpretq_s16_u16(bitOffsets.v))));
- #else
- return U16x8(
- left.scalars[0] >> bitOffsets.scalars[0],
- left.scalars[1] >> bitOffsets.scalars[1],
- left.scalars[2] >> bitOffsets.scalars[2],
- left.scalars[3] >> bitOffsets.scalars[3],
- left.scalars[4] >> bitOffsets.scalars[4],
- left.scalars[5] >> bitOffsets.scalars[5],
- left.scalars[6] >> bitOffsets.scalars[6],
- left.scalars[7] >> bitOffsets.scalars[7]
- );
- #endif
- }
- // bitOffset must be an immediate constant, so a template argument is used.
- template <uint32_t bitOffset>
- inline U16x8 bitShiftLeftImmediate(const U16x8& left) {
- static_assert(bitOffset < 16u, "Immediate left shift of 16-bit values may not shift more than 15 bits!");
- #if defined USE_SSE2
- return U16x8(_mm_slli_epi16(left.v, bitOffset));
- #else
- #if defined USE_NEON
- return U16x8(vshlq_u16(left.v, vdupq_n_s16(bitOffset)));
- #else
- return U16x8(
- left.scalars[0] << bitOffset,
- left.scalars[1] << bitOffset,
- left.scalars[2] << bitOffset,
- left.scalars[3] << bitOffset,
- left.scalars[4] << bitOffset,
- left.scalars[5] << bitOffset,
- left.scalars[6] << bitOffset,
- left.scalars[7] << bitOffset
- );
- #endif
- #endif
- }
- // bitOffset must be an immediate constant.
- template <uint32_t bitOffset>
- inline U16x8 bitShiftRightImmediate(const U16x8& left) {
- static_assert(bitOffset < 16u, "Immediate right shift of 16-bit values may not shift more than 15 bits!");
- #if defined USE_SSE2
- return U16x8(_mm_srli_epi16(left.v, bitOffset));
- #else
- #if defined USE_NEON
- // NEON's vshrq_n_u16 rejects a zero offset, so shift left by the negated offset instead.
- return U16x8(vshlq_u16(left.v, vdupq_n_s16(-(int32_t)bitOffset)));
- #else
- return U16x8(
- left.scalars[0] >> bitOffset,
- left.scalars[1] >> bitOffset,
- left.scalars[2] >> bitOffset,
- left.scalars[3] >> bitOffset,
- left.scalars[4] >> bitOffset,
- left.scalars[5] >> bitOffset,
- left.scalars[6] >> bitOffset,
- left.scalars[7] >> bitOffset
- );
- #endif
- #endif
- }
- inline U8x16 operator<<(const U8x16& left, const U8x16 &bitOffsets) {
- assert(allLanesLesser(bitOffsets, U8x16(8u)));
- #if defined USE_NEON
- return U8x16(vshlq_u8(left.v, vreinterpretq_s8_u8(bitOffsets.v)));
- #else
- return U8x16(
- left.scalars[ 0] << bitOffsets.scalars[ 0],
- left.scalars[ 1] << bitOffsets.scalars[ 1],
- left.scalars[ 2] << bitOffsets.scalars[ 2],
- left.scalars[ 3] << bitOffsets.scalars[ 3],
- left.scalars[ 4] << bitOffsets.scalars[ 4],
- left.scalars[ 5] << bitOffsets.scalars[ 5],
- left.scalars[ 6] << bitOffsets.scalars[ 6],
- left.scalars[ 7] << bitOffsets.scalars[ 7],
- left.scalars[ 8] << bitOffsets.scalars[ 8],
- left.scalars[ 9] << bitOffsets.scalars[ 9],
- left.scalars[10] << bitOffsets.scalars[10],
- left.scalars[11] << bitOffsets.scalars[11],
- left.scalars[12] << bitOffsets.scalars[12],
- left.scalars[13] << bitOffsets.scalars[13],
- left.scalars[14] << bitOffsets.scalars[14],
- left.scalars[15] << bitOffsets.scalars[15]
- );
- #endif
- }
- inline U8x16 operator>>(const U8x16& left, const U8x16 &bitOffsets) {
- assert(allLanesLesser(bitOffsets, U8x16(8u)));
- #if defined USE_NEON
- // NEON only supports register-specified shifts through vshlq, so a right shift is a left shift by the negated offsets.
- return U8x16(vshlq_u8(left.v, vnegq_s8(vreinterpretq_s8_u8(bitOffsets.v))));
- #else
- return U8x16(
- left.scalars[ 0] >> bitOffsets.scalars[ 0],
- left.scalars[ 1] >> bitOffsets.scalars[ 1],
- left.scalars[ 2] >> bitOffsets.scalars[ 2],
- left.scalars[ 3] >> bitOffsets.scalars[ 3],
- left.scalars[ 4] >> bitOffsets.scalars[ 4],
- left.scalars[ 5] >> bitOffsets.scalars[ 5],
- left.scalars[ 6] >> bitOffsets.scalars[ 6],
- left.scalars[ 7] >> bitOffsets.scalars[ 7],
- left.scalars[ 8] >> bitOffsets.scalars[ 8],
- left.scalars[ 9] >> bitOffsets.scalars[ 9],
- left.scalars[10] >> bitOffsets.scalars[10],
- left.scalars[11] >> bitOffsets.scalars[11],
- left.scalars[12] >> bitOffsets.scalars[12],
- left.scalars[13] >> bitOffsets.scalars[13],
- left.scalars[14] >> bitOffsets.scalars[14],
- left.scalars[15] >> bitOffsets.scalars[15]
- );
- #endif
- }
- // bitOffset must be an immediate constant, so a template argument is used.
- template <uint32_t bitOffset>
- inline U8x16 bitShiftLeftImmediate(const U8x16& left) {
- static_assert(bitOffset < 8u, "Immediate left shift of 8-bit values may not shift more than 7 bits!");
- #if defined USE_NEON
- return U8x16(vshlq_u8(left.v, vdupq_n_s8(bitOffset)));
- #else
- return U8x16(
- left.scalars[ 0] << bitOffset,
- left.scalars[ 1] << bitOffset,
- left.scalars[ 2] << bitOffset,
- left.scalars[ 3] << bitOffset,
- left.scalars[ 4] << bitOffset,
- left.scalars[ 5] << bitOffset,
- left.scalars[ 6] << bitOffset,
- left.scalars[ 7] << bitOffset,
- left.scalars[ 8] << bitOffset,
- left.scalars[ 9] << bitOffset,
- left.scalars[10] << bitOffset,
- left.scalars[11] << bitOffset,
- left.scalars[12] << bitOffset,
- left.scalars[13] << bitOffset,
- left.scalars[14] << bitOffset,
- left.scalars[15] << bitOffset
- );
- #endif
- }
- // bitOffset must be an immediate constant.
- template <uint32_t bitOffset>
- inline U8x16 bitShiftRightImmediate(const U8x16& left) {
- static_assert(bitOffset < 8u, "Immediate right shift of 8-bit values may not shift more than 7 bits!");
- #if defined USE_NEON
- // NEON's vshrq_n_u8 rejects a zero offset, so shift left by the negated offset instead.
- return U8x16(vshlq_u8(left.v, vdupq_n_s8(-(int32_t)bitOffset)));
- #else
- return U8x16(
- left.scalars[ 0] >> bitOffset,
- left.scalars[ 1] >> bitOffset,
- left.scalars[ 2] >> bitOffset,
- left.scalars[ 3] >> bitOffset,
- left.scalars[ 4] >> bitOffset,
- left.scalars[ 5] >> bitOffset,
- left.scalars[ 6] >> bitOffset,
- left.scalars[ 7] >> bitOffset,
- left.scalars[ 8] >> bitOffset,
- left.scalars[ 9] >> bitOffset,
- left.scalars[10] >> bitOffset,
- left.scalars[11] >> bitOffset,
- left.scalars[12] >> bitOffset,
- left.scalars[13] >> bitOffset,
- left.scalars[14] >> bitOffset,
- left.scalars[15] >> bitOffset
- );
- #endif
- }
- inline U16x8 operator+(const U16x8& left, const U16x8& right) {
- #if defined USE_BASIC_SIMD
- return U16x8(ADD_U16_SIMD(left.v, right.v));
- #else
- return U16x8(left.scalars[0] + right.scalars[0], left.scalars[1] + right.scalars[1], left.scalars[2] + right.scalars[2], left.scalars[3] + right.scalars[3],
- left.scalars[4] + right.scalars[4], left.scalars[5] + right.scalars[5], left.scalars[6] + right.scalars[6], left.scalars[7] + right.scalars[7]);
- #endif
- }
- inline U16x8 operator-(const U16x8& left, const U16x8& right) {
- #if defined USE_BASIC_SIMD
- return U16x8(SUB_U16_SIMD(left.v, right.v));
- #else
- return U16x8(left.scalars[0] - right.scalars[0], left.scalars[1] - right.scalars[1], left.scalars[2] - right.scalars[2], left.scalars[3] - right.scalars[3],
- left.scalars[4] - right.scalars[4], left.scalars[5] - right.scalars[5], left.scalars[6] - right.scalars[6], left.scalars[7] - right.scalars[7]);
- #endif
- }
- inline U16x8 operator*(const U16x8& left, const U16x8& right) {
- #if defined USE_BASIC_SIMD
- return U16x8(MUL_U16_SIMD(left.v, right.v));
- #else
- return U16x8(left.scalars[0] * right.scalars[0], left.scalars[1] * right.scalars[1], left.scalars[2] * right.scalars[2], left.scalars[3] * right.scalars[3],
- left.scalars[4] * right.scalars[4], left.scalars[5] * right.scalars[5], left.scalars[6] * right.scalars[6], left.scalars[7] * right.scalars[7]);
- #endif
- }
- inline U8x16 operator+(const U8x16& left, const U8x16& right) {
- #if defined USE_BASIC_SIMD
- return U8x16(ADD_U8_SIMD(left.v, right.v));
- #else
- return U8x16(
- left.scalars[0] + right.scalars[0],
- left.scalars[1] + right.scalars[1],
- left.scalars[2] + right.scalars[2],
- left.scalars[3] + right.scalars[3],
- left.scalars[4] + right.scalars[4],
- left.scalars[5] + right.scalars[5],
- left.scalars[6] + right.scalars[6],
- left.scalars[7] + right.scalars[7],
- left.scalars[8] + right.scalars[8],
- left.scalars[9] + right.scalars[9],
- left.scalars[10] + right.scalars[10],
- left.scalars[11] + right.scalars[11],
- left.scalars[12] + right.scalars[12],
- left.scalars[13] + right.scalars[13],
- left.scalars[14] + right.scalars[14],
- left.scalars[15] + right.scalars[15]
- );
- #endif
- }
- inline U8x16 operator-(const U8x16& left, const U8x16& right) {
- #if defined USE_BASIC_SIMD
- return U8x16(SUB_U8_SIMD(left.v, right.v));
- #else
- return U8x16(
- left.scalars[0] - right.scalars[0],
- left.scalars[1] - right.scalars[1],
- left.scalars[2] - right.scalars[2],
- left.scalars[3] - right.scalars[3],
- left.scalars[4] - right.scalars[4],
- left.scalars[5] - right.scalars[5],
- left.scalars[6] - right.scalars[6],
- left.scalars[7] - right.scalars[7],
- left.scalars[8] - right.scalars[8],
- left.scalars[9] - right.scalars[9],
- left.scalars[10] - right.scalars[10],
- left.scalars[11] - right.scalars[11],
- left.scalars[12] - right.scalars[12],
- left.scalars[13] - right.scalars[13],
- left.scalars[14] - right.scalars[14],
- left.scalars[15] - right.scalars[15]
- );
- #endif
- }
- // Clamp a result that may be negative to the lower bound of an 8-bit unsigned integer.
- inline uint8_t impl_limit0(int32_t x) { return x < 0 ? 0 : x; }
- // Clamp a result that may exceed 255 to the upper bound of an 8-bit unsigned integer.
- inline uint8_t impl_limit255(uint32_t x) { return x > 255 ? 255 : x; }
- inline U8x16 saturatedAddition(const U8x16& left, const U8x16& right) {
- #if defined USE_BASIC_SIMD
- return U8x16(ADD_SAT_U8_SIMD(left.v, right.v));
- #else
- return U8x16(
- impl_limit255((uint32_t)left.scalars[0] + (uint32_t)right.scalars[0]),
- impl_limit255((uint32_t)left.scalars[1] + (uint32_t)right.scalars[1]),
- impl_limit255((uint32_t)left.scalars[2] + (uint32_t)right.scalars[2]),
- impl_limit255((uint32_t)left.scalars[3] + (uint32_t)right.scalars[3]),
- impl_limit255((uint32_t)left.scalars[4] + (uint32_t)right.scalars[4]),
- impl_limit255((uint32_t)left.scalars[5] + (uint32_t)right.scalars[5]),
- impl_limit255((uint32_t)left.scalars[6] + (uint32_t)right.scalars[6]),
- impl_limit255((uint32_t)left.scalars[7] + (uint32_t)right.scalars[7]),
- impl_limit255((uint32_t)left.scalars[8] + (uint32_t)right.scalars[8]),
- impl_limit255((uint32_t)left.scalars[9] + (uint32_t)right.scalars[9]),
- impl_limit255((uint32_t)left.scalars[10] + (uint32_t)right.scalars[10]),
- impl_limit255((uint32_t)left.scalars[11] + (uint32_t)right.scalars[11]),
- impl_limit255((uint32_t)left.scalars[12] + (uint32_t)right.scalars[12]),
- impl_limit255((uint32_t)left.scalars[13] + (uint32_t)right.scalars[13]),
- impl_limit255((uint32_t)left.scalars[14] + (uint32_t)right.scalars[14]),
- impl_limit255((uint32_t)left.scalars[15] + (uint32_t)right.scalars[15])
- );
- #endif
- }
- inline U8x16 saturatedSubtraction(const U8x16& left, const U8x16& right) {
- #if defined USE_BASIC_SIMD
- return U8x16(SUB_SAT_U8_SIMD(left.v, right.v));
- #else
- return U8x16(
- impl_limit0((int32_t)left.scalars[0] - (int32_t)right.scalars[0]),
- impl_limit0((int32_t)left.scalars[1] - (int32_t)right.scalars[1]),
- impl_limit0((int32_t)left.scalars[2] - (int32_t)right.scalars[2]),
- impl_limit0((int32_t)left.scalars[3] - (int32_t)right.scalars[3]),
- impl_limit0((int32_t)left.scalars[4] - (int32_t)right.scalars[4]),
- impl_limit0((int32_t)left.scalars[5] - (int32_t)right.scalars[5]),
- impl_limit0((int32_t)left.scalars[6] - (int32_t)right.scalars[6]),
- impl_limit0((int32_t)left.scalars[7] - (int32_t)right.scalars[7]),
- impl_limit0((int32_t)left.scalars[8] - (int32_t)right.scalars[8]),
- impl_limit0((int32_t)left.scalars[9] - (int32_t)right.scalars[9]),
- impl_limit0((int32_t)left.scalars[10] - (int32_t)right.scalars[10]),
- impl_limit0((int32_t)left.scalars[11] - (int32_t)right.scalars[11]),
- impl_limit0((int32_t)left.scalars[12] - (int32_t)right.scalars[12]),
- impl_limit0((int32_t)left.scalars[13] - (int32_t)right.scalars[13]),
- impl_limit0((int32_t)left.scalars[14] - (int32_t)right.scalars[14]),
- impl_limit0((int32_t)left.scalars[15] - (int32_t)right.scalars[15])
- );
- #endif
- }
- inline I32x4 truncateToI32(const F32x4& vector) {
- #if defined USE_BASIC_SIMD
- return I32x4(F32_TO_I32_SIMD(vector.v));
- #else
- return I32x4((int32_t)vector.scalars[0], (int32_t)vector.scalars[1], (int32_t)vector.scalars[2], (int32_t)vector.scalars[3]);
- #endif
- }
- inline U32x4 truncateToU32(const F32x4& vector) {
- #if defined USE_BASIC_SIMD
- return U32x4(F32_TO_U32_SIMD(vector.v));
- #else
- return U32x4((uint32_t)vector.scalars[0], (uint32_t)vector.scalars[1], (uint32_t)vector.scalars[2], (uint32_t)vector.scalars[3]);
- #endif
- }
- inline F32x4 floatFromI32(const I32x4& vector) {
- #if defined USE_BASIC_SIMD
- return F32x4(I32_TO_F32_SIMD(vector.v));
- #else
- return F32x4((float)vector.scalars[0], (float)vector.scalars[1], (float)vector.scalars[2], (float)vector.scalars[3]);
- #endif
- }
- inline F32x4 floatFromU32(const U32x4& vector) {
- #if defined USE_BASIC_SIMD
- return F32x4(U32_TO_F32_SIMD(vector.v));
- #else
- return F32x4((float)vector.scalars[0], (float)vector.scalars[1], (float)vector.scalars[2], (float)vector.scalars[3]);
- #endif
- }
- inline I32x4 I32FromU32(const U32x4& vector) {
- #if defined USE_BASIC_SIMD
- return I32x4(REINTERPRET_U32_TO_I32_SIMD(vector.v));
- #else
- return I32x4((int32_t)vector.scalars[0], (int32_t)vector.scalars[1], (int32_t)vector.scalars[2], (int32_t)vector.scalars[3]);
- #endif
- }
- inline U32x4 U32FromI32(const I32x4& vector) {
- #if defined USE_BASIC_SIMD
- return U32x4(REINTERPRET_I32_TO_U32_SIMD(vector.v));
- #else
- return U32x4((uint32_t)vector.scalars[0], (uint32_t)vector.scalars[1], (uint32_t)vector.scalars[2], (uint32_t)vector.scalars[3]);
- #endif
- }
- // Warning! Behavior depends on endianness.
- inline U8x16 reinterpret_U8FromU32(const U32x4& vector) {
- #if defined USE_BASIC_SIMD
- return U8x16(REINTERPRET_U32_TO_U8_SIMD(vector.v));
- #else
- const uint8_t *source = (const uint8_t*)vector.scalars;
- return U8x16(
- source[0], source[1], source[2], source[3], source[4], source[5], source[6], source[7],
- source[8], source[9], source[10], source[11], source[12], source[13], source[14], source[15]
- );
- #endif
- }
- // Warning! Behavior depends on endianness.
- inline U32x4 reinterpret_U32FromU8(const U8x16& vector) {
- #if defined USE_BASIC_SIMD
- return U32x4(REINTERPRET_U8_TO_U32_SIMD(vector.v));
- #else
- const uint32_t *source = (const uint32_t*)vector.scalars;
- return U32x4(source[0], source[1], source[2], source[3]);
- #endif
- }
- // Warning! Behavior depends on endianness.
- inline U16x8 reinterpret_U16FromU32(const U32x4& vector) {
- #if defined USE_BASIC_SIMD
- return U16x8(REINTERPRET_U32_TO_U16_SIMD(vector.v));
- #else
- const uint16_t *source = (const uint16_t*)vector.scalars;
- return U16x8(
- source[0], source[1], source[2], source[3], source[4], source[5], source[6], source[7]
- );
- #endif
- }
- // Warning! Behavior depends on endianness.
- inline U32x4 reinterpret_U32FromU16(const U16x8& vector) {
- #if defined USE_BASIC_SIMD
- return U32x4(REINTERPRET_U16_TO_U32_SIMD(vector.v));
- #else
- const uint32_t *source = (const uint32_t*)vector.scalars;
- return U32x4(source[0], source[1], source[2], source[3]);
- #endif
- }
- // Unpacking to larger integers
- inline U32x4 lowerToU32(const U16x8& vector) {
- #if defined USE_BASIC_SIMD
- return U32x4(U16_LOW_TO_U32_SIMD(vector.v));
- #else
- return U32x4(vector.scalars[0], vector.scalars[1], vector.scalars[2], vector.scalars[3]);
- #endif
- }
- inline U32x4 higherToU32(const U16x8& vector) {
- #if defined USE_BASIC_SIMD
- return U32x4(U16_HIGH_TO_U32_SIMD(vector.v));
- #else
- return U32x4(vector.scalars[4], vector.scalars[5], vector.scalars[6], vector.scalars[7]);
- #endif
- }
- inline U16x8 lowerToU16(const U8x16& vector) {
- #if defined USE_BASIC_SIMD
- return U16x8(U8_LOW_TO_U16_SIMD(vector.v));
- #else
- return U16x8(
- vector.scalars[0], vector.scalars[1], vector.scalars[2], vector.scalars[3],
- vector.scalars[4], vector.scalars[5], vector.scalars[6], vector.scalars[7]
- );
- #endif
- }
- inline U16x8 higherToU16(const U8x16& vector) {
- #if defined USE_BASIC_SIMD
- return U16x8(U8_HIGH_TO_U16_SIMD(vector.v));
- #else
- return U16x8(
- vector.scalars[8], vector.scalars[9], vector.scalars[10], vector.scalars[11],
- vector.scalars[12], vector.scalars[13], vector.scalars[14], vector.scalars[15]
- );
- #endif
- }
- // Saturated packing
- inline U8x16 saturateToU8(const U16x8& lower, const U16x8& upper) {
- #if defined USE_BASIC_SIMD
- return U8x16(PACK_SAT_U16_TO_U8_SIMD(lower.v, upper.v));
- #else
- return U8x16(
- impl_limit255(lower.scalars[0]),
- impl_limit255(lower.scalars[1]),
- impl_limit255(lower.scalars[2]),
- impl_limit255(lower.scalars[3]),
- impl_limit255(lower.scalars[4]),
- impl_limit255(lower.scalars[5]),
- impl_limit255(lower.scalars[6]),
- impl_limit255(lower.scalars[7]),
- impl_limit255(upper.scalars[0]),
- impl_limit255(upper.scalars[1]),
- impl_limit255(upper.scalars[2]),
- impl_limit255(upper.scalars[3]),
- impl_limit255(upper.scalars[4]),
- impl_limit255(upper.scalars[5]),
- impl_limit255(upper.scalars[6]),
- impl_limit255(upper.scalars[7])
- );
- #endif
- }
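- // A minimal usage sketch (hypothetical, not part of this header), assuming only the helpers defined above:
- // widen 8-bit lanes to 16 bits, do the arithmetic there, and pack back down with saturation.
- inline U8x16 example_doubleSaturated(const U8x16 &pixels) {
- U16x8 low = lowerToU16(pixels); // Elements 0..7 widened to 16 bits.
- U16x8 high = higherToU16(pixels); // Elements 8..15 widened to 16 bits.
- low = low + low; // Doubling an 8-bit input cannot overflow 16 bits.
- high = high + high;
- return saturateToU8(low, high); // Clamp each 16-bit result back into 0..255.
- }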
- // Unary negation for convenience and code readability.
- // Before using unary negation, always check if:
- // * An addition can be turned into a subtraction?
- // x = -a + b
- // x = b - a
- // * A multiplying constant or scalar can be negated instead?
- // x = -b * 2
- // x = b * -2
- inline F32x4 operator-(const F32x4& value) {
- #if defined USE_BASIC_SIMD
- return F32x4(0.0f) - value;
- #else
- return F32x4(-value.scalars[0], -value.scalars[1], -value.scalars[2], -value.scalars[3]);
- #endif
- }
- inline I32x4 operator-(const I32x4& value) {
- #if defined USE_BASIC_SIMD
- return I32x4(0) - value;
- #else
- return I32x4(-value.scalars[0], -value.scalars[1], -value.scalars[2], -value.scalars[3]);
- #endif
- }
- // Helper macros for generating the vector extract functions.
- // Having one function for each type and offset makes sure that the compiler gets an immediate integer within the valid range.
- #if defined USE_BASIC_SIMD
- #if defined USE_SSE2
- #if defined USE_SSSE3
- #define _MM_ALIGNR_EPI8(A, B, OFFSET) _mm_alignr_epi8(A, B, OFFSET)
- #else
- // If SSSE3 is not used, emulate it using stack memory and unaligned reading of data.
- static inline SIMD_U8x16 _MM_ALIGNR_EPI8(SIMD_U8x16 a, SIMD_U8x16 b, int offset) {
- ALIGN16 uint8_t vectorBuffer[32];
- #ifdef SAFE_POINTER_CHECKS
- if (uintptr_t((void*)vectorBuffer) & 15u) { throwError(U"Unaligned stack memory detected in 128-bit VECTOR_EXTRACT_GENERATOR!\n"); }
- #endif
- _mm_store_si128((SIMD_U8x16*)(vectorBuffer), b);
- _mm_store_si128((SIMD_U8x16*)(vectorBuffer + 16), a);
- return _mm_loadu_si128((SIMD_U8x16*)(vectorBuffer + offset));
- }
- #endif
- #define VECTOR_EXTRACT_GENERATOR_U8(OFFSET, FALLBACK_RESULT) return U8x16(_MM_ALIGNR_EPI8(b.v, a.v, OFFSET));
- #define VECTOR_EXTRACT_GENERATOR_U16(OFFSET, FALLBACK_RESULT) return U16x8(_MM_ALIGNR_EPI8(b.v, a.v, OFFSET * 2));
- #define VECTOR_EXTRACT_GENERATOR_U32(OFFSET, FALLBACK_RESULT) return U32x4(_MM_ALIGNR_EPI8(b.v, a.v, OFFSET * 4));
- #define VECTOR_EXTRACT_GENERATOR_I32(OFFSET, FALLBACK_RESULT) return I32x4(_MM_ALIGNR_EPI8(b.v, a.v, OFFSET * 4));
- #define VECTOR_EXTRACT_GENERATOR_F32(OFFSET, FALLBACK_RESULT) return F32x4(SIMD_F32x4(_MM_ALIGNR_EPI8(SIMD_U32x4(b.v), SIMD_U32x4(a.v), OFFSET * 4)));
- #elif defined USE_NEON
- #define VECTOR_EXTRACT_GENERATOR_U8(OFFSET, FALLBACK_RESULT) return U8x16(vextq_u8(a.v, b.v, OFFSET));
- #define VECTOR_EXTRACT_GENERATOR_U16(OFFSET, FALLBACK_RESULT) return U16x8(vextq_u16(a.v, b.v, OFFSET));
- #define VECTOR_EXTRACT_GENERATOR_U32(OFFSET, FALLBACK_RESULT) return U32x4(vextq_u32(a.v, b.v, OFFSET));
- #define VECTOR_EXTRACT_GENERATOR_I32(OFFSET, FALLBACK_RESULT) return I32x4(vextq_s32(a.v, b.v, OFFSET));
- #define VECTOR_EXTRACT_GENERATOR_F32(OFFSET, FALLBACK_RESULT) return F32x4(vextq_f32(a.v, b.v, OFFSET));
- #endif
- #else
- #define VECTOR_EXTRACT_GENERATOR_U8(OFFSET, FALLBACK_RESULT) return FALLBACK_RESULT;
- #define VECTOR_EXTRACT_GENERATOR_U16(OFFSET, FALLBACK_RESULT) return FALLBACK_RESULT;
- #define VECTOR_EXTRACT_GENERATOR_U32(OFFSET, FALLBACK_RESULT) return FALLBACK_RESULT;
- #define VECTOR_EXTRACT_GENERATOR_I32(OFFSET, FALLBACK_RESULT) return FALLBACK_RESULT;
- #define VECTOR_EXTRACT_GENERATOR_F32(OFFSET, FALLBACK_RESULT) return FALLBACK_RESULT;
- #endif
- // Vector extraction concatenates two input vectors and reads a vector between them using an offset.
- // The first and last offsets that only return one of the inputs can be used for readability, because they will be inlined and removed by the compiler.
- // To get elements from the right side, combine the center vector with the right vector and shift one element to the left using vectorExtract_1 for the given type.
- // To get elements from the left side, combine the left vector with the center vector and shift one element to the right using vectorExtract_15 for 16 lanes, vectorExtract_7 for 8 lanes, or vectorExtract_3 for 4 lanes.
- // A usage sketch follows the extraction functions below.
- U8x16 inline vectorExtract_0(const U8x16 &a, const U8x16 &b) { return a; }
- U8x16 inline vectorExtract_1(const U8x16 &a, const U8x16 &b) { VECTOR_EXTRACT_GENERATOR_U8(1, U8x16(a.scalars[1], a.scalars[2], a.scalars[3], a.scalars[4], a.scalars[5], a.scalars[6], a.scalars[7], a.scalars[8], a.scalars[9], a.scalars[10], a.scalars[11], a.scalars[12], a.scalars[13], a.scalars[14], a.scalars[15], b.scalars[0])) }
- U8x16 inline vectorExtract_2(const U8x16 &a, const U8x16 &b) { VECTOR_EXTRACT_GENERATOR_U8(2, U8x16(a.scalars[2], a.scalars[3], a.scalars[4], a.scalars[5], a.scalars[6], a.scalars[7], a.scalars[8], a.scalars[9], a.scalars[10], a.scalars[11], a.scalars[12], a.scalars[13], a.scalars[14], a.scalars[15], b.scalars[0], b.scalars[1])) }
- U8x16 inline vectorExtract_3(const U8x16 &a, const U8x16 &b) { VECTOR_EXTRACT_GENERATOR_U8(3, U8x16(a.scalars[3], a.scalars[4], a.scalars[5], a.scalars[6], a.scalars[7], a.scalars[8], a.scalars[9], a.scalars[10], a.scalars[11], a.scalars[12], a.scalars[13], a.scalars[14], a.scalars[15], b.scalars[0], b.scalars[1], b.scalars[2])) }
- U8x16 inline vectorExtract_4(const U8x16 &a, const U8x16 &b) { VECTOR_EXTRACT_GENERATOR_U8(4, U8x16(a.scalars[4], a.scalars[5], a.scalars[6], a.scalars[7], a.scalars[8], a.scalars[9], a.scalars[10], a.scalars[11], a.scalars[12], a.scalars[13], a.scalars[14], a.scalars[15], b.scalars[0], b.scalars[1], b.scalars[2], b.scalars[3])) }
- U8x16 inline vectorExtract_5(const U8x16 &a, const U8x16 &b) { VECTOR_EXTRACT_GENERATOR_U8(5, U8x16(a.scalars[5], a.scalars[6], a.scalars[7], a.scalars[8], a.scalars[9], a.scalars[10], a.scalars[11], a.scalars[12], a.scalars[13], a.scalars[14], a.scalars[15], b.scalars[0], b.scalars[1], b.scalars[2], b.scalars[3], b.scalars[4])) }
- U8x16 inline vectorExtract_6(const U8x16 &a, const U8x16 &b) { VECTOR_EXTRACT_GENERATOR_U8(6, U8x16(a.scalars[6], a.scalars[7], a.scalars[8], a.scalars[9], a.scalars[10], a.scalars[11], a.scalars[12], a.scalars[13], a.scalars[14], a.scalars[15], b.scalars[0], b.scalars[1], b.scalars[2], b.scalars[3], b.scalars[4], b.scalars[5])) }
- U8x16 inline vectorExtract_7(const U8x16 &a, const U8x16 &b) { VECTOR_EXTRACT_GENERATOR_U8(7, U8x16(a.scalars[7], a.scalars[8], a.scalars[9], a.scalars[10], a.scalars[11], a.scalars[12], a.scalars[13], a.scalars[14], a.scalars[15], b.scalars[0], b.scalars[1], b.scalars[2], b.scalars[3], b.scalars[4], b.scalars[5], b.scalars[6])) }
- U8x16 inline vectorExtract_8(const U8x16 &a, const U8x16 &b) { VECTOR_EXTRACT_GENERATOR_U8(8, U8x16(a.scalars[8], a.scalars[9], a.scalars[10], a.scalars[11], a.scalars[12], a.scalars[13], a.scalars[14], a.scalars[15], b.scalars[0], b.scalars[1], b.scalars[2], b.scalars[3], b.scalars[4], b.scalars[5], b.scalars[6], b.scalars[7])) }
- U8x16 inline vectorExtract_9(const U8x16 &a, const U8x16 &b) { VECTOR_EXTRACT_GENERATOR_U8(9, U8x16(a.scalars[9], a.scalars[10], a.scalars[11], a.scalars[12], a.scalars[13], a.scalars[14], a.scalars[15], b.scalars[0], b.scalars[1], b.scalars[2], b.scalars[3], b.scalars[4], b.scalars[5], b.scalars[6], b.scalars[7], b.scalars[8])) }
- U8x16 inline vectorExtract_10(const U8x16 &a, const U8x16 &b) { VECTOR_EXTRACT_GENERATOR_U8(10, U8x16(a.scalars[10], a.scalars[11], a.scalars[12], a.scalars[13], a.scalars[14], a.scalars[15], b.scalars[0], b.scalars[1], b.scalars[2], b.scalars[3], b.scalars[4], b.scalars[5], b.scalars[6], b.scalars[7], b.scalars[8], b.scalars[9])) }
- U8x16 inline vectorExtract_11(const U8x16 &a, const U8x16 &b) { VECTOR_EXTRACT_GENERATOR_U8(11, U8x16(a.scalars[11], a.scalars[12], a.scalars[13], a.scalars[14], a.scalars[15], b.scalars[0], b.scalars[1], b.scalars[2], b.scalars[3], b.scalars[4], b.scalars[5], b.scalars[6], b.scalars[7], b.scalars[8], b.scalars[9], b.scalars[10])) }
- U8x16 inline vectorExtract_12(const U8x16 &a, const U8x16 &b) { VECTOR_EXTRACT_GENERATOR_U8(12, U8x16(a.scalars[12], a.scalars[13], a.scalars[14], a.scalars[15], b.scalars[0], b.scalars[1], b.scalars[2], b.scalars[3], b.scalars[4], b.scalars[5], b.scalars[6], b.scalars[7], b.scalars[8], b.scalars[9], b.scalars[10], b.scalars[11])) }
- U8x16 inline vectorExtract_13(const U8x16 &a, const U8x16 &b) { VECTOR_EXTRACT_GENERATOR_U8(13, U8x16(a.scalars[13], a.scalars[14], a.scalars[15], b.scalars[0], b.scalars[1], b.scalars[2], b.scalars[3], b.scalars[4], b.scalars[5], b.scalars[6], b.scalars[7], b.scalars[8], b.scalars[9], b.scalars[10], b.scalars[11], b.scalars[12])) }
- U8x16 inline vectorExtract_14(const U8x16 &a, const U8x16 &b) { VECTOR_EXTRACT_GENERATOR_U8(14, U8x16(a.scalars[14], a.scalars[15], b.scalars[0], b.scalars[1], b.scalars[2], b.scalars[3], b.scalars[4], b.scalars[5], b.scalars[6], b.scalars[7], b.scalars[8], b.scalars[9], b.scalars[10], b.scalars[11], b.scalars[12], b.scalars[13])) }
- U8x16 inline vectorExtract_15(const U8x16 &a, const U8x16 &b) { VECTOR_EXTRACT_GENERATOR_U8(15, U8x16(a.scalars[15], b.scalars[0], b.scalars[1], b.scalars[2], b.scalars[3], b.scalars[4], b.scalars[5], b.scalars[6], b.scalars[7], b.scalars[8], b.scalars[9], b.scalars[10], b.scalars[11], b.scalars[12], b.scalars[13], b.scalars[14])) }
- U8x16 inline vectorExtract_16(const U8x16 &a, const U8x16 &b) { return b; }
- U16x8 inline vectorExtract_0(const U16x8 &a, const U16x8 &b) { return a; }
- U16x8 inline vectorExtract_1(const U16x8 &a, const U16x8 &b) { VECTOR_EXTRACT_GENERATOR_U16(1, U16x8(a.scalars[1], a.scalars[2], a.scalars[3], a.scalars[4], a.scalars[5], a.scalars[6], a.scalars[7], b.scalars[0])) }
- U16x8 inline vectorExtract_2(const U16x8 &a, const U16x8 &b) { VECTOR_EXTRACT_GENERATOR_U16(2, U16x8(a.scalars[2], a.scalars[3], a.scalars[4], a.scalars[5], a.scalars[6], a.scalars[7], b.scalars[0], b.scalars[1])) }
- U16x8 inline vectorExtract_3(const U16x8 &a, const U16x8 &b) { VECTOR_EXTRACT_GENERATOR_U16(3, U16x8(a.scalars[3], a.scalars[4], a.scalars[5], a.scalars[6], a.scalars[7], b.scalars[0], b.scalars[1], b.scalars[2])) }
- U16x8 inline vectorExtract_4(const U16x8 &a, const U16x8 &b) { VECTOR_EXTRACT_GENERATOR_U16(4, U16x8(a.scalars[4], a.scalars[5], a.scalars[6], a.scalars[7], b.scalars[0], b.scalars[1], b.scalars[2], b.scalars[3])) }
- U16x8 inline vectorExtract_5(const U16x8 &a, const U16x8 &b) { VECTOR_EXTRACT_GENERATOR_U16(5, U16x8(a.scalars[5], a.scalars[6], a.scalars[7], b.scalars[0], b.scalars[1], b.scalars[2], b.scalars[3], b.scalars[4])) }
- U16x8 inline vectorExtract_6(const U16x8 &a, const U16x8 &b) { VECTOR_EXTRACT_GENERATOR_U16(6, U16x8(a.scalars[6], a.scalars[7], b.scalars[0], b.scalars[1], b.scalars[2], b.scalars[3], b.scalars[4], b.scalars[5])) }
- U16x8 inline vectorExtract_7(const U16x8 &a, const U16x8 &b) { VECTOR_EXTRACT_GENERATOR_U16(7, U16x8(a.scalars[7], b.scalars[0], b.scalars[1], b.scalars[2], b.scalars[3], b.scalars[4], b.scalars[5], b.scalars[6])) }
- U16x8 inline vectorExtract_8(const U16x8 &a, const U16x8 &b) { return b; }
- U32x4 inline vectorExtract_0(const U32x4 &a, const U32x4 &b) { return a; }
- U32x4 inline vectorExtract_1(const U32x4 &a, const U32x4 &b) { VECTOR_EXTRACT_GENERATOR_U32(1, U32x4(a.scalars[1], a.scalars[2], a.scalars[3], b.scalars[0])) }
- U32x4 inline vectorExtract_2(const U32x4 &a, const U32x4 &b) { VECTOR_EXTRACT_GENERATOR_U32(2, U32x4(a.scalars[2], a.scalars[3], b.scalars[0], b.scalars[1])) }
- U32x4 inline vectorExtract_3(const U32x4 &a, const U32x4 &b) { VECTOR_EXTRACT_GENERATOR_U32(3, U32x4(a.scalars[3], b.scalars[0], b.scalars[1], b.scalars[2])) }
- U32x4 inline vectorExtract_4(const U32x4 &a, const U32x4 &b) { return b; }
- I32x4 inline vectorExtract_0(const I32x4 &a, const I32x4 &b) { return a; }
- I32x4 inline vectorExtract_1(const I32x4 &a, const I32x4 &b) { VECTOR_EXTRACT_GENERATOR_I32(1, I32x4(a.scalars[1], a.scalars[2], a.scalars[3], b.scalars[0])) }
- I32x4 inline vectorExtract_2(const I32x4 &a, const I32x4 &b) { VECTOR_EXTRACT_GENERATOR_I32(2, I32x4(a.scalars[2], a.scalars[3], b.scalars[0], b.scalars[1])) }
- I32x4 inline vectorExtract_3(const I32x4 &a, const I32x4 &b) { VECTOR_EXTRACT_GENERATOR_I32(3, I32x4(a.scalars[3], b.scalars[0], b.scalars[1], b.scalars[2])) }
- I32x4 inline vectorExtract_4(const I32x4 &a, const I32x4 &b) { return b; }
- F32x4 inline vectorExtract_0(const F32x4 &a, const F32x4 &b) { return a; }
- F32x4 inline vectorExtract_1(const F32x4 &a, const F32x4 &b) { VECTOR_EXTRACT_GENERATOR_F32(1, F32x4(a.scalars[1], a.scalars[2], a.scalars[3], b.scalars[0])) }
- F32x4 inline vectorExtract_2(const F32x4 &a, const F32x4 &b) { VECTOR_EXTRACT_GENERATOR_F32(2, F32x4(a.scalars[2], a.scalars[3], b.scalars[0], b.scalars[1])) }
- F32x4 inline vectorExtract_3(const F32x4 &a, const F32x4 &b) { VECTOR_EXTRACT_GENERATOR_F32(3, F32x4(a.scalars[3], b.scalars[0], b.scalars[1], b.scalars[2])) }
- F32x4 inline vectorExtract_4(const F32x4 &a, const F32x4 &b) { return b; }
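- // A minimal usage sketch (hypothetical, not part of this header): reading each byte's left and right
- // neighbors for a 3-tap operation, where left, center and right are three consecutive 16-byte blocks of a row.
- inline U8x16 example_neighborSum(const U8x16 &left, const U8x16 &center, const U8x16 &right) {
- U8x16 fromLeft = vectorExtract_15(left, center); // The last byte of left followed by the first 15 bytes of center.
- U8x16 fromRight = vectorExtract_1(center, right); // The last 15 bytes of center followed by the first byte of right.
- // Saturated additions keep each sum within 0..255 instead of wrapping around.
- return saturatedAddition(saturatedAddition(fromLeft, center), fromRight);
- }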
- // Gather instructions load memory from a pointer at multiple index offsets at the same time.
- // The given pointers should be aligned to 4 bytes, so that the fallback solution works on machines with strict alignment requirements.
- // A usage sketch follows the gather functions below.
- #if defined USE_AVX2
- #define GATHER_I32x4_AVX2(SOURCE, FOUR_OFFSETS, SCALE) _mm_i32gather_epi32((const int32_t*)(SOURCE), FOUR_OFFSETS, SCALE)
- #define GATHER_U32x4_AVX2(SOURCE, FOUR_OFFSETS, SCALE) _mm_i32gather_epi32((const int32_t*)(SOURCE), FOUR_OFFSETS, SCALE)
- #define GATHER_F32x4_AVX2(SOURCE, FOUR_OFFSETS, SCALE) _mm_i32gather_ps((const float*)(SOURCE), FOUR_OFFSETS, SCALE)
- #endif
- static inline U32x4 gather_U32(dsr::SafePointer<const uint32_t> data, const U32x4 &elementOffset) {
- #ifdef SAFE_POINTER_CHECKS
- ALIGN16 uint32_t elementOffsets[4];
- if (uintptr_t((void*)elementOffsets) & 15u) { throwError(U"Unaligned stack memory detected in 128-bit gather_U32!\n"); }
- elementOffset.writeAlignedUnsafe(elementOffsets);
- data.assertInside("U32x4 gather_U32 lane 0", (data + elementOffsets[0]).getUnchecked());
- data.assertInside("U32x4 gather_U32 lane 1", (data + elementOffsets[1]).getUnchecked());
- data.assertInside("U32x4 gather_U32 lane 2", (data + elementOffsets[2]).getUnchecked());
- data.assertInside("U32x4 gather_U32 lane 3", (data + elementOffsets[3]).getUnchecked());
- #endif
- #if defined USE_AVX2
- return U32x4(GATHER_U32x4_AVX2(data.getUnsafe(), elementOffset.v, 4));
- #else
- #ifndef SAFE_POINTER_CHECKS
- ALIGN16 uint32_t elementOffsets[4];
- elementOffset.writeAlignedUnsafe(elementOffsets);
- #endif
- return U32x4(
- *(data + elementOffsets[0]),
- *(data + elementOffsets[1]),
- *(data + elementOffsets[2]),
- *(data + elementOffsets[3])
- );
- #endif
- }
- static inline I32x4 gather_I32(dsr::SafePointer<const int32_t> data, const U32x4 &elementOffset) {
- #ifdef SAFE_POINTER_CHECKS
- ALIGN16 uint32_t elementOffsets[4];
- if (uintptr_t((void*)elementOffsets) & 15u) { throwError(U"Unaligned stack memory detected in 128-bit gather_I32!\n"); }
- elementOffset.writeAlignedUnsafe(elementOffsets);
- data.assertInside("I32x4 gather_I32 lane 0", (data + elementOffsets[0]).getUnchecked());
- data.assertInside("I32x4 gather_I32 lane 1", (data + elementOffsets[1]).getUnchecked());
- data.assertInside("I32x4 gather_I32 lane 2", (data + elementOffsets[2]).getUnchecked());
- data.assertInside("I32x4 gather_I32 lane 3", (data + elementOffsets[3]).getUnchecked());
- #endif
- #if defined USE_AVX2
- return I32x4(GATHER_I32x4_AVX2(data.getUnsafe(), elementOffset.v, 4));
- #else
- #ifndef SAFE_POINTER_CHECKS
- ALIGN16 uint32_t elementOffsets[4];
- elementOffset.writeAlignedUnsafe(elementOffsets);
- #endif
- return I32x4(
- *(data + elementOffsets[0]),
- *(data + elementOffsets[1]),
- *(data + elementOffsets[2]),
- *(data + elementOffsets[3])
- );
- #endif
- }
- static inline F32x4 gather_F32(dsr::SafePointer<const float> data, const U32x4 &elementOffset) {
- #ifdef SAFE_POINTER_CHECKS
- ALIGN16 uint32_t elementOffsets[4];
- if (uintptr_t((void*)elementOffsets) & 15u) { throwError(U"Unaligned stack memory detected in 128-bit gather_F32!\n"); }
- elementOffset.writeAlignedUnsafe(elementOffsets);
- data.assertInside("F32x4 gather_F32 lane 0", (data + elementOffsets[0]).getUnchecked());
- data.assertInside("F32x4 gather_F32 lane 1", (data + elementOffsets[1]).getUnchecked());
- data.assertInside("F32x4 gather_F32 lane 2", (data + elementOffsets[2]).getUnchecked());
- data.assertInside("F32x4 gather_F32 lane 3", (data + elementOffsets[3]).getUnchecked());
- #endif
- #if defined USE_AVX2
- return F32x4(GATHER_F32x4_AVX2(data.getUnsafe(), elementOffset.v, 4));
- #else
- #ifndef SAFE_POINTER_CHECKS
- ALIGN16 uint32_t elementOffsets[4];
- elementOffset.writeAlignedUnsafe(elementOffsets);
- #endif
- return F32x4(
- *(data + elementOffsets[0]),
- *(data + elementOffsets[1]),
- *(data + elementOffsets[2]),
- *(data + elementOffsets[3])
- );
- #endif
- }
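- // A minimal usage sketch (hypothetical, not part of this header): using gather_U32 as a four-lane table lookup.
- // It assumes a valid dsr::SafePointer<const uint32_t> to the table has been obtained elsewhere; its construction is not shown here.
- static inline U32x4 example_lookupColors(dsr::SafePointer<const uint32_t> palette, const U32x4 &indices) {
- // Each lane of indices selects one 32-bit element from palette, replacing four scalar loads.
- return gather_U32(palette, indices);
- }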
- inline F32x8 operator+(const F32x8& left, const F32x8& right) {
- #if defined USE_256BIT_F_SIMD
- return F32x8(ADD_F32_SIMD256(left.v, right.v));
- #else
- return F32x8(
- left.scalars[0] + right.scalars[0],
- left.scalars[1] + right.scalars[1],
- left.scalars[2] + right.scalars[2],
- left.scalars[3] + right.scalars[3],
- left.scalars[4] + right.scalars[4],
- left.scalars[5] + right.scalars[5],
- left.scalars[6] + right.scalars[6],
- left.scalars[7] + right.scalars[7]
- );
- #endif
- }
- inline F32x8 operator-(const F32x8& left, const F32x8& right) {
- #if defined USE_256BIT_F_SIMD
- return F32x8(SUB_F32_SIMD256(left.v, right.v));
- #else
- return F32x8(
- left.scalars[0] - right.scalars[0],
- left.scalars[1] - right.scalars[1],
- left.scalars[2] - right.scalars[2],
- left.scalars[3] - right.scalars[3],
- left.scalars[4] - right.scalars[4],
- left.scalars[5] - right.scalars[5],
- left.scalars[6] - right.scalars[6],
- left.scalars[7] - right.scalars[7]
- );
- #endif
- }
- inline F32x8 operator*(const F32x8& left, const F32x8& right) {
- #if defined USE_256BIT_F_SIMD
- return F32x8(MUL_F32_SIMD256(left.v, right.v));
- #else
- return F32x8(
- left.scalars[0] * right.scalars[0],
- left.scalars[1] * right.scalars[1],
- left.scalars[2] * right.scalars[2],
- left.scalars[3] * right.scalars[3],
- left.scalars[4] * right.scalars[4],
- left.scalars[5] * right.scalars[5],
- left.scalars[6] * right.scalars[6],
- left.scalars[7] * right.scalars[7]
- );
- #endif
- }
- inline F32x8 min(const F32x8& left, const F32x8& right) {
- #if defined USE_256BIT_F_SIMD
- return F32x8(MIN_F32_SIMD256(left.v, right.v));
- #else
- float v0 = left.scalars[0];
- float v1 = left.scalars[1];
- float v2 = left.scalars[2];
- float v3 = left.scalars[3];
- float v4 = left.scalars[4];
- float v5 = left.scalars[5];
- float v6 = left.scalars[6];
- float v7 = left.scalars[7];
- float r0 = right.scalars[0];
- float r1 = right.scalars[1];
- float r2 = right.scalars[2];
- float r3 = right.scalars[3];
- float r4 = right.scalars[4];
- float r5 = right.scalars[5];
- float r6 = right.scalars[6];
- float r7 = right.scalars[7];
- if (r0 < v0) { v0 = r0; }
- if (r1 < v1) { v1 = r1; }
- if (r2 < v2) { v2 = r2; }
- if (r3 < v3) { v3 = r3; }
- if (r4 < v4) { v4 = r4; }
- if (r5 < v5) { v5 = r5; }
- if (r6 < v6) { v6 = r6; }
- if (r7 < v7) { v7 = r7; }
- return F32x8(v0, v1, v2, v3, v4, v5, v6, v7);
- #endif
- }
- inline F32x8 max(const F32x8& left, const F32x8& right) {
- #if defined USE_256BIT_F_SIMD
- return F32x8(MAX_F32_SIMD256(left.v, right.v));
- #else
- float v0 = left.scalars[0];
- float v1 = left.scalars[1];
- float v2 = left.scalars[2];
- float v3 = left.scalars[3];
- float v4 = left.scalars[4];
- float v5 = left.scalars[5];
- float v6 = left.scalars[6];
- float v7 = left.scalars[7];
- float r0 = right.scalars[0];
- float r1 = right.scalars[1];
- float r2 = right.scalars[2];
- float r3 = right.scalars[3];
- float r4 = right.scalars[4];
- float r5 = right.scalars[5];
- float r6 = right.scalars[6];
- float r7 = right.scalars[7];
- if (r0 > v0) { v0 = r0; }
- if (r1 > v1) { v1 = r1; }
- if (r2 > v2) { v2 = r2; }
- if (r3 > v3) { v3 = r3; }
- if (r4 > v4) { v4 = r4; }
- if (r5 > v5) { v5 = r5; }
- if (r6 > v6) { v6 = r6; }
- if (r7 > v7) { v7 = r7; }
- return F32x8(v0, v1, v2, v3, v4, v5, v6, v7);
- #endif
- }
- inline I32x8 operator+(const I32x8& left, const I32x8& right) {
- #if defined USE_256BIT_X_SIMD
- return I32x8(ADD_I32_SIMD256(left.v, right.v));
- #else
- return I32x8(
- left.scalars[0] + right.scalars[0],
- left.scalars[1] + right.scalars[1],
- left.scalars[2] + right.scalars[2],
- left.scalars[3] + right.scalars[3],
- left.scalars[4] + right.scalars[4],
- left.scalars[5] + right.scalars[5],
- left.scalars[6] + right.scalars[6],
- left.scalars[7] + right.scalars[7]);
- #endif
- }
- inline I32x8 operator-(const I32x8& left, const I32x8& right) {
- #if defined USE_256BIT_X_SIMD
- return I32x8(SUB_I32_SIMD256(left.v, right.v));
- #else
- return I32x8(
- left.scalars[0] - right.scalars[0],
- left.scalars[1] - right.scalars[1],
- left.scalars[2] - right.scalars[2],
- left.scalars[3] - right.scalars[3],
- left.scalars[4] - right.scalars[4],
- left.scalars[5] - right.scalars[5],
- left.scalars[6] - right.scalars[6],
- left.scalars[7] - right.scalars[7]);
- #endif
- }
- inline I32x8 operator*(const I32x8& left, const I32x8& right) {
- #if defined USE_AVX2
- return I32x8(MUL_I32_SIMD256(left.v, right.v));
- #else
- return I32x8(
- left.scalars[0] * right.scalars[0],
- left.scalars[1] * right.scalars[1],
- left.scalars[2] * right.scalars[2],
- left.scalars[3] * right.scalars[3],
- left.scalars[4] * right.scalars[4],
- left.scalars[5] * right.scalars[5],
- left.scalars[6] * right.scalars[6],
- left.scalars[7] * right.scalars[7]
- );
- #endif
- }
- inline U32x8 operator+(const U32x8& left, const U32x8& right) {
- #if defined USE_256BIT_X_SIMD
- return U32x8(ADD_U32_SIMD256(left.v, right.v));
- #else
- return U32x8(
- left.scalars[0] + right.scalars[0],
- left.scalars[1] + right.scalars[1],
- left.scalars[2] + right.scalars[2],
- left.scalars[3] + right.scalars[3],
- left.scalars[4] + right.scalars[4],
- left.scalars[5] + right.scalars[5],
- left.scalars[6] + right.scalars[6],
- left.scalars[7] + right.scalars[7]
- );
- #endif
- }
- inline U32x8 operator-(const U32x8& left, const U32x8& right) {
- #if defined USE_256BIT_X_SIMD
- return U32x8(SUB_U32_SIMD256(left.v, right.v));
- #else
- return U32x8(
- left.scalars[0] - right.scalars[0],
- left.scalars[1] - right.scalars[1],
- left.scalars[2] - right.scalars[2],
- left.scalars[3] - right.scalars[3],
- left.scalars[4] - right.scalars[4],
- left.scalars[5] - right.scalars[5],
- left.scalars[6] - right.scalars[6],
- left.scalars[7] - right.scalars[7]
- );
- #endif
- }
- inline U32x8 operator*(const U32x8& left, const U32x8& right) {
- #if defined USE_256BIT_X_SIMD
- return U32x8(MUL_U32_SIMD256(left.v, right.v));
- #else
- return U32x8(
- left.scalars[0] * right.scalars[0],
- left.scalars[1] * right.scalars[1],
- left.scalars[2] * right.scalars[2],
- left.scalars[3] * right.scalars[3],
- left.scalars[4] * right.scalars[4],
- left.scalars[5] * right.scalars[5],
- left.scalars[6] * right.scalars[6],
- left.scalars[7] * right.scalars[7]
- );
- #endif
- }
- inline U32x8 operator&(const U32x8& left, const U32x8& right) {
- #if defined USE_256BIT_X_SIMD
- return U32x8(BITWISE_AND_U32_SIMD256(left.v, right.v));
- #else
- return U32x8(
- left.scalars[0] & right.scalars[0],
- left.scalars[1] & right.scalars[1],
- left.scalars[2] & right.scalars[2],
- left.scalars[3] & right.scalars[3],
- left.scalars[4] & right.scalars[4],
- left.scalars[5] & right.scalars[5],
- left.scalars[6] & right.scalars[6],
- left.scalars[7] & right.scalars[7]
- );
- #endif
- }
- inline U32x8 operator|(const U32x8& left, const U32x8& right) {
- #if defined USE_256BIT_X_SIMD
- return U32x8(BITWISE_OR_U32_SIMD256(left.v, right.v));
- #else
- return U32x8(
- left.scalars[0] | right.scalars[0],
- left.scalars[1] | right.scalars[1],
- left.scalars[2] | right.scalars[2],
- left.scalars[3] | right.scalars[3],
- left.scalars[4] | right.scalars[4],
- left.scalars[5] | right.scalars[5],
- left.scalars[6] | right.scalars[6],
- left.scalars[7] | right.scalars[7]
- );
- #endif
- }
- inline U32x8 operator^(const U32x8& left, const U32x8& right) {
- #if defined USE_256BIT_X_SIMD
- return U32x8(BITWISE_XOR_U32_SIMD256(left.v, right.v));
- #else
- return U32x8(
- left.scalars[0] ^ right.scalars[0],
- left.scalars[1] ^ right.scalars[1],
- left.scalars[2] ^ right.scalars[2],
- left.scalars[3] ^ right.scalars[3],
- left.scalars[4] ^ right.scalars[4],
- left.scalars[5] ^ right.scalars[5],
- left.scalars[6] ^ right.scalars[6],
- left.scalars[7] ^ right.scalars[7]
- );
- #endif
- }
- inline U32x8 operator~(const U32x8& value) {
- #if defined USE_BASIC_SIMD
- return value ^ U32x8(~uint32_t(0));
- #else
- return U32x8(
- ~value.scalars[0],
- ~value.scalars[1],
- ~value.scalars[2],
- ~value.scalars[3],
- ~value.scalars[4],
- ~value.scalars[5],
- ~value.scalars[6],
- ~value.scalars[7]
- );
- #endif
- }
- // ARM NEON does not support 256-bit vectors, and AVX2 only provides variable per-lane shifts for 32-bit and 64-bit elements, so these variable shifts fall back to scalar code.
- inline U32x8 operator<<(const U32x8& left, const U32x8 &bitOffsets) {
- assert(allLanesLesser(bitOffsets, U32x8(32u)));
- return U32x8(
- left.scalars[0] << bitOffsets.scalars[0],
- left.scalars[1] << bitOffsets.scalars[1],
- left.scalars[2] << bitOffsets.scalars[2],
- left.scalars[3] << bitOffsets.scalars[3],
- left.scalars[4] << bitOffsets.scalars[4],
- left.scalars[5] << bitOffsets.scalars[5],
- left.scalars[6] << bitOffsets.scalars[6],
- left.scalars[7] << bitOffsets.scalars[7]
- );
- }
- inline U32x8 operator>>(const U32x8& left, const U32x8 &bitOffsets) {
- assert(allLanesLesser(bitOffsets, U32x8(32u)));
- return U32x8(
- left.scalars[0] >> bitOffsets.scalars[0],
- left.scalars[1] >> bitOffsets.scalars[1],
- left.scalars[2] >> bitOffsets.scalars[2],
- left.scalars[3] >> bitOffsets.scalars[3],
- left.scalars[4] >> bitOffsets.scalars[4],
- left.scalars[5] >> bitOffsets.scalars[5],
- left.scalars[6] >> bitOffsets.scalars[6],
- left.scalars[7] >> bitOffsets.scalars[7]
- );
- }
- // bitOffset must be an immediate constant from 0 to 31, so a template argument is used.
- template <uint32_t bitOffset>
- inline U32x8 bitShiftLeftImmediate(const U32x8& left) {
- static_assert(bitOffset < 32u, "Immediate left shift of 32-bit values may not shift more than 31 bits!");
- #if defined USE_AVX2
- return U32x8(_mm256_slli_epi32(left.v, bitOffset));
- #else
- return U32x8(
- left.scalars[0] << bitOffset,
- left.scalars[1] << bitOffset,
- left.scalars[2] << bitOffset,
- left.scalars[3] << bitOffset,
- left.scalars[4] << bitOffset,
- left.scalars[5] << bitOffset,
- left.scalars[6] << bitOffset,
- left.scalars[7] << bitOffset
- );
- #endif
- }
- // bitOffset must be an immediate constant from 0 to 31, so a template argument is used.
- template <uint32_t bitOffset>
- inline U32x8 bitShiftRightImmediate(const U32x8& left) {
- static_assert(bitOffset < 32u, "Immediate right shift of 32-bit values may not shift more than 31 bits!");
- #if defined USE_AVX2
- return U32x8(_mm256_srli_epi32(left.v, bitOffset));
- #else
- return U32x8(
- left.scalars[0] >> bitOffset,
- left.scalars[1] >> bitOffset,
- left.scalars[2] >> bitOffset,
- left.scalars[3] >> bitOffset,
- left.scalars[4] >> bitOffset,
- left.scalars[5] >> bitOffset,
- left.scalars[6] >> bitOffset,
- left.scalars[7] >> bitOffset
- );
- #endif
- }
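- // A minimal usage sketch (hypothetical, not part of this header): isolating one byte-wide channel from
- // packed 32-bit colors, assuming the channel occupies bits 8..15. The shift amount is known at compile
- // time, so the immediate version above is used and AVX2 can emit a single shift instruction per vector.
- inline U32x8 example_extractChannel(const U32x8 &colors) {
- return bitShiftRightImmediate<8>(colors) & U32x8(255u);
- }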
- inline U16x16 operator<<(const U16x16& left, const U16x16 &bitOffsets) {
- assert(allLanesLesser(bitOffsets, U16x16(16u)));
- return U16x16(
- left.scalars[ 0] << bitOffsets.scalars[ 0],
- left.scalars[ 1] << bitOffsets.scalars[ 1],
- left.scalars[ 2] << bitOffsets.scalars[ 2],
- left.scalars[ 3] << bitOffsets.scalars[ 3],
- left.scalars[ 4] << bitOffsets.scalars[ 4],
- left.scalars[ 5] << bitOffsets.scalars[ 5],
- left.scalars[ 6] << bitOffsets.scalars[ 6],
- left.scalars[ 7] << bitOffsets.scalars[ 7],
- left.scalars[ 8] << bitOffsets.scalars[ 8],
- left.scalars[ 9] << bitOffsets.scalars[ 9],
- left.scalars[10] << bitOffsets.scalars[10],
- left.scalars[11] << bitOffsets.scalars[11],
- left.scalars[12] << bitOffsets.scalars[12],
- left.scalars[13] << bitOffsets.scalars[13],
- left.scalars[14] << bitOffsets.scalars[14],
- left.scalars[15] << bitOffsets.scalars[15]
- );
- }
- inline U16x16 operator>>(const U16x16& left, const U16x16 &bitOffsets) {
- assert(allLanesLesser(bitOffsets, U16x16(16u)));
- return U16x16(
- left.scalars[ 0] >> bitOffsets.scalars[ 0],
- left.scalars[ 1] >> bitOffsets.scalars[ 1],
- left.scalars[ 2] >> bitOffsets.scalars[ 2],
- left.scalars[ 3] >> bitOffsets.scalars[ 3],
- left.scalars[ 4] >> bitOffsets.scalars[ 4],
- left.scalars[ 5] >> bitOffsets.scalars[ 5],
- left.scalars[ 6] >> bitOffsets.scalars[ 6],
- left.scalars[ 7] >> bitOffsets.scalars[ 7],
- left.scalars[ 8] >> bitOffsets.scalars[ 8],
- left.scalars[ 9] >> bitOffsets.scalars[ 9],
- left.scalars[10] >> bitOffsets.scalars[10],
- left.scalars[11] >> bitOffsets.scalars[11],
- left.scalars[12] >> bitOffsets.scalars[12],
- left.scalars[13] >> bitOffsets.scalars[13],
- left.scalars[14] >> bitOffsets.scalars[14],
- left.scalars[15] >> bitOffsets.scalars[15]
- );
- }
- // bitOffset must be an immediate constant from 0 to 15, so a template argument is used.
- template <uint32_t bitOffset>
- inline U16x16 bitShiftLeftImmediate(const U16x16& left) {
- static_assert(bitOffset < 16u, "Immediate left shift of 16-bit values may not shift more than 15 bits!");
- #if defined USE_AVX2
- return U16x16(_mm256_slli_epi16(left.v, bitOffset));
- #else
- return U16x16(
- left.scalars[ 0] << bitOffset,
- left.scalars[ 1] << bitOffset,
- left.scalars[ 2] << bitOffset,
- left.scalars[ 3] << bitOffset,
- left.scalars[ 4] << bitOffset,
- left.scalars[ 5] << bitOffset,
- left.scalars[ 6] << bitOffset,
- left.scalars[ 7] << bitOffset,
- left.scalars[ 8] << bitOffset,
- left.scalars[ 9] << bitOffset,
- left.scalars[10] << bitOffset,
- left.scalars[11] << bitOffset,
- left.scalars[12] << bitOffset,
- left.scalars[13] << bitOffset,
- left.scalars[14] << bitOffset,
- left.scalars[15] << bitOffset
- );
- #endif
- }
- // bitOffset must be an immediate constant from 0 to 15, so a template argument is used.
- template <uint32_t bitOffset>
- inline U16x16 bitShiftRightImmediate(const U16x16& left) {
- static_assert(bitOffset < 16u, "Immediate right shift of 16-bit values may not shift more than 15 bits!");
- #if defined USE_AVX2
- return U16x16(_mm256_srli_epi16(left.v, bitOffset));
- #else
- return U16x16(
- left.scalars[ 0] >> bitOffset,
- left.scalars[ 1] >> bitOffset,
- left.scalars[ 2] >> bitOffset,
- left.scalars[ 3] >> bitOffset,
- left.scalars[ 4] >> bitOffset,
- left.scalars[ 5] >> bitOffset,
- left.scalars[ 6] >> bitOffset,
- left.scalars[ 7] >> bitOffset,
- left.scalars[ 8] >> bitOffset,
- left.scalars[ 9] >> bitOffset,
- left.scalars[10] >> bitOffset,
- left.scalars[11] >> bitOffset,
- left.scalars[12] >> bitOffset,
- left.scalars[13] >> bitOffset,
- left.scalars[14] >> bitOffset,
- left.scalars[15] >> bitOffset
- );
- #endif
- }
- inline U8x32 operator<<(const U8x32& left, const U8x32 &bitOffsets) {
- assert(allLanesLesser(bitOffsets, U8x32(8u)));
- return U8x32(
- left.scalars[ 0] << bitOffsets.scalars[ 0],
- left.scalars[ 1] << bitOffsets.scalars[ 1],
- left.scalars[ 2] << bitOffsets.scalars[ 2],
- left.scalars[ 3] << bitOffsets.scalars[ 3],
- left.scalars[ 4] << bitOffsets.scalars[ 4],
- left.scalars[ 5] << bitOffsets.scalars[ 5],
- left.scalars[ 6] << bitOffsets.scalars[ 6],
- left.scalars[ 7] << bitOffsets.scalars[ 7],
- left.scalars[ 8] << bitOffsets.scalars[ 8],
- left.scalars[ 9] << bitOffsets.scalars[ 9],
- left.scalars[10] << bitOffsets.scalars[10],
- left.scalars[11] << bitOffsets.scalars[11],
- left.scalars[12] << bitOffsets.scalars[12],
- left.scalars[13] << bitOffsets.scalars[13],
- left.scalars[14] << bitOffsets.scalars[14],
- left.scalars[15] << bitOffsets.scalars[15],
- left.scalars[16] << bitOffsets.scalars[16],
- left.scalars[17] << bitOffsets.scalars[17],
- left.scalars[18] << bitOffsets.scalars[18],
- left.scalars[19] << bitOffsets.scalars[19],
- left.scalars[20] << bitOffsets.scalars[20],
- left.scalars[21] << bitOffsets.scalars[21],
- left.scalars[22] << bitOffsets.scalars[22],
- left.scalars[23] << bitOffsets.scalars[23],
- left.scalars[24] << bitOffsets.scalars[24],
- left.scalars[25] << bitOffsets.scalars[25],
- left.scalars[26] << bitOffsets.scalars[26],
- left.scalars[27] << bitOffsets.scalars[27],
- left.scalars[28] << bitOffsets.scalars[28],
- left.scalars[29] << bitOffsets.scalars[29],
- left.scalars[30] << bitOffsets.scalars[30],
- left.scalars[31] << bitOffsets.scalars[31]
- );
- }
- inline U8x32 operator>>(const U8x32& left, const U8x32 &bitOffsets) {
- assert(allLanesLesser(bitOffsets, U8x32(8u)));
- return U8x32(
- left.scalars[ 0] >> bitOffsets.scalars[ 0],
- left.scalars[ 1] >> bitOffsets.scalars[ 1],
- left.scalars[ 2] >> bitOffsets.scalars[ 2],
- left.scalars[ 3] >> bitOffsets.scalars[ 3],
- left.scalars[ 4] >> bitOffsets.scalars[ 4],
- left.scalars[ 5] >> bitOffsets.scalars[ 5],
- left.scalars[ 6] >> bitOffsets.scalars[ 6],
- left.scalars[ 7] >> bitOffsets.scalars[ 7],
- left.scalars[ 8] >> bitOffsets.scalars[ 8],
- left.scalars[ 9] >> bitOffsets.scalars[ 9],
- left.scalars[10] >> bitOffsets.scalars[10],
- left.scalars[11] >> bitOffsets.scalars[11],
- left.scalars[12] >> bitOffsets.scalars[12],
- left.scalars[13] >> bitOffsets.scalars[13],
- left.scalars[14] >> bitOffsets.scalars[14],
- left.scalars[15] >> bitOffsets.scalars[15],
- left.scalars[16] >> bitOffsets.scalars[16],
- left.scalars[17] >> bitOffsets.scalars[17],
- left.scalars[18] >> bitOffsets.scalars[18],
- left.scalars[19] >> bitOffsets.scalars[19],
- left.scalars[20] >> bitOffsets.scalars[20],
- left.scalars[21] >> bitOffsets.scalars[21],
- left.scalars[22] >> bitOffsets.scalars[22],
- left.scalars[23] >> bitOffsets.scalars[23],
- left.scalars[24] >> bitOffsets.scalars[24],
- left.scalars[25] >> bitOffsets.scalars[25],
- left.scalars[26] >> bitOffsets.scalars[26],
- left.scalars[27] >> bitOffsets.scalars[27],
- left.scalars[28] >> bitOffsets.scalars[28],
- left.scalars[29] >> bitOffsets.scalars[29],
- left.scalars[30] >> bitOffsets.scalars[30],
- left.scalars[31] >> bitOffsets.scalars[31]
- );
- }
- // bitOffset must be an immediate constant from 0 to 7, so a template argument is used.
- template <uint32_t bitOffset>
- inline U8x32 bitShiftLeftImmediate(const U8x32& left) {
- static_assert(bitOffset < 8u, "Immediate left shift of 8-bit values may not shift more than 7 bits!");
- // TODO: Use a larger lane and a mask generated at compile time.
- return U8x32(
- left.scalars[ 0] << bitOffset,
- left.scalars[ 1] << bitOffset,
- left.scalars[ 2] << bitOffset,
- left.scalars[ 3] << bitOffset,
- left.scalars[ 4] << bitOffset,
- left.scalars[ 5] << bitOffset,
- left.scalars[ 6] << bitOffset,
- left.scalars[ 7] << bitOffset,
- left.scalars[ 8] << bitOffset,
- left.scalars[ 9] << bitOffset,
- left.scalars[10] << bitOffset,
- left.scalars[11] << bitOffset,
- left.scalars[12] << bitOffset,
- left.scalars[13] << bitOffset,
- left.scalars[14] << bitOffset,
- left.scalars[15] << bitOffset,
- left.scalars[16] << bitOffset,
- left.scalars[17] << bitOffset,
- left.scalars[18] << bitOffset,
- left.scalars[19] << bitOffset,
- left.scalars[20] << bitOffset,
- left.scalars[21] << bitOffset,
- left.scalars[22] << bitOffset,
- left.scalars[23] << bitOffset,
- left.scalars[24] << bitOffset,
- left.scalars[25] << bitOffset,
- left.scalars[26] << bitOffset,
- left.scalars[27] << bitOffset,
- left.scalars[28] << bitOffset,
- left.scalars[29] << bitOffset,
- left.scalars[30] << bitOffset,
- left.scalars[31] << bitOffset
- );
- }
- // bitOffset must be an immediate constant from 0 to 7, so a template argument is used.
- template <uint32_t bitOffset>
- inline U8x32 bitShiftRightImmediate(const U8x32& left) {
- static_assert(bitOffset < 8u, "Immediate right shift of 8-bit values may not shift more than 7 bits!");
- // TODO: Use a larger lane and a mask generated at compile time.
- return U8x32(
- left.scalars[ 0] >> bitOffset,
- left.scalars[ 1] >> bitOffset,
- left.scalars[ 2] >> bitOffset,
- left.scalars[ 3] >> bitOffset,
- left.scalars[ 4] >> bitOffset,
- left.scalars[ 5] >> bitOffset,
- left.scalars[ 6] >> bitOffset,
- left.scalars[ 7] >> bitOffset,
- left.scalars[ 8] >> bitOffset,
- left.scalars[ 9] >> bitOffset,
- left.scalars[10] >> bitOffset,
- left.scalars[11] >> bitOffset,
- left.scalars[12] >> bitOffset,
- left.scalars[13] >> bitOffset,
- left.scalars[14] >> bitOffset,
- left.scalars[15] >> bitOffset,
- left.scalars[16] >> bitOffset,
- left.scalars[17] >> bitOffset,
- left.scalars[18] >> bitOffset,
- left.scalars[19] >> bitOffset,
- left.scalars[20] >> bitOffset,
- left.scalars[21] >> bitOffset,
- left.scalars[22] >> bitOffset,
- left.scalars[23] >> bitOffset,
- left.scalars[24] >> bitOffset,
- left.scalars[25] >> bitOffset,
- left.scalars[26] >> bitOffset,
- left.scalars[27] >> bitOffset,
- left.scalars[28] >> bitOffset,
- left.scalars[29] >> bitOffset,
- left.scalars[30] >> bitOffset,
- left.scalars[31] >> bitOffset
- );
- }
- inline U16x16 operator+(const U16x16& left, const U16x16& right) {
- #if defined USE_256BIT_X_SIMD
- return U16x16(ADD_U16_SIMD256(left.v, right.v));
- #else
- return U16x16(
- left.scalars[0] + right.scalars[0],
- left.scalars[1] + right.scalars[1],
- left.scalars[2] + right.scalars[2],
- left.scalars[3] + right.scalars[3],
- left.scalars[4] + right.scalars[4],
- left.scalars[5] + right.scalars[5],
- left.scalars[6] + right.scalars[6],
- left.scalars[7] + right.scalars[7],
- left.scalars[8] + right.scalars[8],
- left.scalars[9] + right.scalars[9],
- left.scalars[10] + right.scalars[10],
- left.scalars[11] + right.scalars[11],
- left.scalars[12] + right.scalars[12],
- left.scalars[13] + right.scalars[13],
- left.scalars[14] + right.scalars[14],
- left.scalars[15] + right.scalars[15]
- );
- #endif
- }
- inline U16x16 operator-(const U16x16& left, const U16x16& right) {
- #if defined USE_256BIT_X_SIMD
- return U16x16(SUB_U16_SIMD256(left.v, right.v));
- #else
- return U16x16(
- left.scalars[0] - right.scalars[0],
- left.scalars[1] - right.scalars[1],
- left.scalars[2] - right.scalars[2],
- left.scalars[3] - right.scalars[3],
- left.scalars[4] - right.scalars[4],
- left.scalars[5] - right.scalars[5],
- left.scalars[6] - right.scalars[6],
- left.scalars[7] - right.scalars[7],
- left.scalars[8] - right.scalars[8],
- left.scalars[9] - right.scalars[9],
- left.scalars[10] - right.scalars[10],
- left.scalars[11] - right.scalars[11],
- left.scalars[12] - right.scalars[12],
- left.scalars[13] - right.scalars[13],
- left.scalars[14] - right.scalars[14],
- left.scalars[15] - right.scalars[15]
- );
- #endif
- }
- inline U16x16 operator*(const U16x16& left, const U16x16& right) {
- #if defined USE_256BIT_X_SIMD
- return U16x16(MUL_U16_SIMD256(left.v, right.v));
- #else
- return U16x16(
- left.scalars[0] * right.scalars[0],
- left.scalars[1] * right.scalars[1],
- left.scalars[2] * right.scalars[2],
- left.scalars[3] * right.scalars[3],
- left.scalars[4] * right.scalars[4],
- left.scalars[5] * right.scalars[5],
- left.scalars[6] * right.scalars[6],
- left.scalars[7] * right.scalars[7],
- left.scalars[8] * right.scalars[8],
- left.scalars[9] * right.scalars[9],
- left.scalars[10] * right.scalars[10],
- left.scalars[11] * right.scalars[11],
- left.scalars[12] * right.scalars[12],
- left.scalars[13] * right.scalars[13],
- left.scalars[14] * right.scalars[14],
- left.scalars[15] * right.scalars[15]
- );
- #endif
- }
- inline U8x32 operator+(const U8x32& left, const U8x32& right) {
- #if defined USE_256BIT_X_SIMD
- return U8x32(ADD_U8_SIMD256(left.v, right.v));
- #else
- U8x32 result = U8x32::create_dangerous_uninitialized();
- for (int i = 0; i < 32; i++) {
- result.scalars[i] = left.scalars[i] + right.scalars[i];
- }
- return result;
- #endif
- }
- inline U8x32 operator-(const U8x32& left, const U8x32& right) {
- #if defined USE_256BIT_X_SIMD
- return U8x32(SUB_U8_SIMD256(left.v, right.v));
- #else
- U8x32 result = U8x32::create_dangerous_uninitialized();
- for (int i = 0; i < 32; i++) {
- result.scalars[i] = left.scalars[i] - right.scalars[i];
- }
- return result;
- #endif
- }
- inline U8x32 saturatedAddition(const U8x32& left, const U8x32& right) {
- #if defined USE_256BIT_X_SIMD
- return U8x32(ADD_SAT_U8_SIMD256(left.v, right.v));
- #else
- U8x32 result = U8x32::create_dangerous_uninitialized();
- for (int i = 0; i < 32; i++) {
- result.scalars[i] = impl_limit255((uint32_t)left.scalars[i] + (uint32_t)right.scalars[i]);
- }
- return result;
- #endif
- }
- inline U8x32 saturatedSubtraction(const U8x32& left, const U8x32& right) {
- #if defined USE_256BIT_X_SIMD
- return U8x32(SUB_SAT_U8_SIMD256(left.v, right.v));
- #else
- U8x32 result = U8x32::create_dangerous_uninitialized();
- for (int i = 0; i < 32; i++) {
- result.scalars[i] = impl_limit0((int32_t)left.scalars[i] - (int32_t)right.scalars[i]);
- }
- return result;
- #endif
- }
- inline I32x8 truncateToI32(const F32x8& vector) {
- #if defined USE_256BIT_X_SIMD
- return I32x8(F32_TO_I32_SIMD256(vector.v));
- #else
- return I32x8(
- (int32_t)vector.scalars[0], (int32_t)vector.scalars[1], (int32_t)vector.scalars[2], (int32_t)vector.scalars[3],
- (int32_t)vector.scalars[4], (int32_t)vector.scalars[5], (int32_t)vector.scalars[6], (int32_t)vector.scalars[7]
- );
- #endif
- }
- inline U32x8 truncateToU32(const F32x8& vector) {
- #if defined USE_256BIT_X_SIMD
- return U32x8(F32_TO_U32_SIMD256(vector.v));
- #else
- return U32x8(
- (uint32_t)vector.scalars[0], (uint32_t)vector.scalars[1], (uint32_t)vector.scalars[2], (uint32_t)vector.scalars[3],
- (uint32_t)vector.scalars[4], (uint32_t)vector.scalars[5], (uint32_t)vector.scalars[6], (uint32_t)vector.scalars[7]
- );
- #endif
- }
- inline F32x8 floatFromI32(const I32x8& vector) {
- #if defined USE_256BIT_X_SIMD
- return F32x8(I32_TO_F32_SIMD256(vector.v));
- #else
- return F32x8(
- (float)vector.scalars[0], (float)vector.scalars[1], (float)vector.scalars[2], (float)vector.scalars[3],
- (float)vector.scalars[4], (float)vector.scalars[5], (float)vector.scalars[6], (float)vector.scalars[7]
- );
- #endif
- }
- inline F32x8 floatFromU32(const U32x8& vector) {
- #if defined USE_256BIT_X_SIMD
- return F32x8(U32_TO_F32_SIMD256(vector.v));
- #else
- return F32x8(
- (float)vector.scalars[0], (float)vector.scalars[1], (float)vector.scalars[2], (float)vector.scalars[3],
- (float)vector.scalars[4], (float)vector.scalars[5], (float)vector.scalars[6], (float)vector.scalars[7]
- );
- #endif
- }
- inline I32x8 I32FromU32(const U32x8& vector) {
- #if defined USE_256BIT_X_SIMD
- return I32x8(REINTERPRET_U32_TO_I32_SIMD256(vector.v));
- #else
- return I32x8(
- (int32_t)vector.scalars[0], (int32_t)vector.scalars[1], (int32_t)vector.scalars[2], (int32_t)vector.scalars[3],
- (int32_t)vector.scalars[4], (int32_t)vector.scalars[5], (int32_t)vector.scalars[6], (int32_t)vector.scalars[7]
- );
- #endif
- }
- inline U32x8 U32FromI32(const I32x8& vector) {
- #if defined USE_256BIT_X_SIMD
- return U32x8(REINTERPRET_I32_TO_U32_SIMD256(vector.v));
- #else
- return U32x8(
- (uint32_t)vector.scalars[0], (uint32_t)vector.scalars[1], (uint32_t)vector.scalars[2], (uint32_t)vector.scalars[3],
- (uint32_t)vector.scalars[4], (uint32_t)vector.scalars[5], (uint32_t)vector.scalars[6], (uint32_t)vector.scalars[7]
- );
- #endif
- }
- // Warning! Behavior depends on endianness.
- inline U8x32 reinterpret_U8FromU32(const U32x8& vector) {
- #if defined USE_256BIT_X_SIMD
- return U8x32(REINTERPRET_U32_TO_U8_SIMD256(vector.v));
- #else
- const uint8_t *source = (const uint8_t*)vector.scalars;
- U8x32 result = U8x32::create_dangerous_uninitialized();
- for (int i = 0; i < 32; i++) {
- result.scalars[i] = source[i];
- }
- return result;
- #endif
- }
- // Warning! Behavior depends on endianness.
- inline U32x8 reinterpret_U32FromU8(const U8x32& vector) {
- #if defined USE_256BIT_X_SIMD
- return U32x8(REINTERPRET_U8_TO_U32_SIMD256(vector.v));
- #else
- const uint32_t *source = (const uint32_t*)vector.scalars;
- return U32x8(source[0], source[1], source[2], source[3], source[4], source[5], source[6], source[7]);
- #endif
- }
- // Warning! Behavior depends on endianness.
- inline U16x16 reinterpret_U16FromU32(const U32x8& vector) {
- #if defined USE_256BIT_X_SIMD
- return U16x16(REINTERPRET_U32_TO_U16_SIMD256(vector.v));
- #else
- const uint16_t *source = (const uint16_t*)vector.scalars;
- return U16x16(
- source[0], source[1], source[2] , source[3] , source[4] , source[5] , source[6] , source[7] ,
- source[8], source[9], source[10], source[11], source[12], source[13], source[14], source[15]
- );
- #endif
- }
- // Warning! Behavior depends on endianness.
- inline U32x8 reinterpret_U32FromU16(const U16x16& vector) {
- #if defined USE_256BIT_X_SIMD
- return U32x8(REINTERPRET_U16_TO_U32_SIMD256(vector.v));
- #else
- const uint32_t *source = (const uint32_t*)vector.scalars;
- return U32x8(source[0], source[1], source[2], source[3], source[4], source[5], source[6], source[7]);
- #endif
- }
- // Unpacking to larger integers
- inline U32x8 lowerToU32(const U16x16& vector) {
- #if defined USE_256BIT_X_SIMD
- return U32x8(U16_LOW_TO_U32_SIMD256(vector.v));
- #else
- return U32x8(vector.scalars[0], vector.scalars[1], vector.scalars[2], vector.scalars[3], vector.scalars[4], vector.scalars[5], vector.scalars[6], vector.scalars[7]);
- #endif
- }
- inline U32x8 higherToU32(const U16x16& vector) {
- #if defined USE_256BIT_X_SIMD
- return U32x8(U16_HIGH_TO_U32_SIMD256(vector.v));
- #else
- return U32x8(vector.scalars[8], vector.scalars[9], vector.scalars[10], vector.scalars[11], vector.scalars[12], vector.scalars[13], vector.scalars[14], vector.scalars[15]);
- #endif
- }
- inline U16x16 lowerToU16(const U8x32& vector) {
- #if defined USE_256BIT_X_SIMD
- return U16x16(U8_LOW_TO_U16_SIMD256(vector.v));
- #else
- return U16x16(
- vector.scalars[0], vector.scalars[1], vector.scalars[2], vector.scalars[3],
- vector.scalars[4], vector.scalars[5], vector.scalars[6], vector.scalars[7],
- vector.scalars[8], vector.scalars[9], vector.scalars[10], vector.scalars[11],
- vector.scalars[12], vector.scalars[13], vector.scalars[14], vector.scalars[15]
- );
- #endif
- }
- inline U16x16 higherToU16(const U8x32& vector) {
- #if defined USE_256BIT_X_SIMD
- return U16x16(U8_HIGH_TO_U16_SIMD256(vector.v));
- #else
- return U16x16(
- vector.scalars[16], vector.scalars[17], vector.scalars[18], vector.scalars[19],
- vector.scalars[20], vector.scalars[21], vector.scalars[22], vector.scalars[23],
- vector.scalars[24], vector.scalars[25], vector.scalars[26], vector.scalars[27],
- vector.scalars[28], vector.scalars[29], vector.scalars[30], vector.scalars[31]
- );
- #endif
- }
- // Saturated packing
- inline U8x32 saturateToU8(const U16x16& lower, const U16x16& upper) {
- #if defined USE_256BIT_X_SIMD
- return U8x32(PACK_SAT_U16_TO_U8_SIMD256(lower.v, upper.v));
- #else
- U8x32 result = U8x32::create_dangerous_uninitialized();
- for (int i = 0; i < 16; i++) {
- result.scalars[i] = impl_limit255(lower.scalars[i]);
- }
- for (int i = 0; i < 16; i++) {
- result.scalars[i + 16] = impl_limit255(upper.scalars[i]);
- }
- return result;
- #endif
- }
- // Unary negation for convenience and code readability.
- // Before using unary negation, always check if:
- // * An addition can be turned into a subtraction?
- // x = -a + b
- // x = b - a
- // * A multiplying constant or scalar can be negated instead?
- // x = -b * 2
- // x = b * -2
- inline F32x8 operator-(const F32x8& value) {
- #if defined USE_256BIT_F_SIMD
- return F32x8(0.0f) - value;
- #else
- return F32x8(
- -value.scalars[0], -value.scalars[1], -value.scalars[2], -value.scalars[3],
- -value.scalars[4], -value.scalars[5], -value.scalars[6], -value.scalars[7]
- );
- #endif
- }
- inline I32x8 operator-(const I32x8& value) {
- #if defined USE_256BIT_X_SIMD
- return I32x8(0) - value;
- #else
- return I32x8(
- -value.scalars[0], -value.scalars[1], -value.scalars[2], -value.scalars[3],
- -value.scalars[4], -value.scalars[5], -value.scalars[6], -value.scalars[7]
- );
- #endif
- }
- // Helper macros for generating the vector extract functions.
- // Having one function for each type and offset makes sure that the compiler gets an immediate integer within the valid range.
- #if defined USE_AVX2
- // AVX2 does not offer a full 256-bit element extraction, only alignment shifts performed independently within each 128-bit lane, so we might as well use two separate 128-bit extractions.
- template <int OFFSET>
- __m256i impl_extractBytes_AVX2(const __m256i leftInput, const __m256i rightInput) {
- // Extract three halves depending on which ones overlap with the offset.
- __m128i leftPart = _mm256_extractf128_si256(leftInput, (OFFSET < 16) ? 0 : 1);
- __m128i middlePart = _mm256_extractf128_si256(OFFSET < 16 ? rightInput : leftInput, (OFFSET < 16) ? 1 : 0);
- __m128i rightPart = _mm256_extractf128_si256(rightInput, (OFFSET < 16) ? 0 : 1);
- // Combine two 128-bit extracts into a whole 256-bit extract.
- return _mm256_set_m128i(
- _mm_alignr_epi8(leftPart, middlePart, OFFSET - ((OFFSET < 16) ? 0 : 16)),
- _mm_alignr_epi8(middlePart, rightPart, OFFSET - ((OFFSET < 16) ? 0 : 16))
- );
- }
- #define VECTOR_EXTRACT_GENERATOR_256_U8(OFFSET) return U8x32(impl_extractBytes_AVX2<OFFSET>(b.v, a.v));
- #define VECTOR_EXTRACT_GENERATOR_256_U16(OFFSET) return U16x16(impl_extractBytes_AVX2<OFFSET * 2>(b.v, a.v));
- #define VECTOR_EXTRACT_GENERATOR_256_U32(OFFSET) return U32x8(impl_extractBytes_AVX2<OFFSET * 4>(b.v, a.v));
- #define VECTOR_EXTRACT_GENERATOR_256_I32(OFFSET) return I32x8(impl_extractBytes_AVX2<OFFSET * 4>(b.v, a.v));
- #define VECTOR_EXTRACT_GENERATOR_256_F32(OFFSET) return F32x8(SIMD_F32x8(impl_extractBytes_AVX2<OFFSET * 4>(SIMD_U32x8(b.v), SIMD_U32x8(a.v))));
- #else
- template<typename T, int elementCount>
- T vectorExtract_emulated(const T &a, const T &b, int offset) {
- // For safety reasons, uninitialized default construction of SIMD vectors is only available during full emulation, not partial AVX support.
- T result = T::create_dangerous_uninitialized();
- int t = 0;
- for (int s = offset; s < elementCount; s++) {
- result.scalars[t] = a.scalars[s];
- t++;
- }
- for (int s = 0; s < offset; s++) {
- result.scalars[t] = b.scalars[s];
- t++;
- }
- return result;
- }
- #define VECTOR_EXTRACT_GENERATOR_256_U8(OFFSET) return vectorExtract_emulated<U8x32, 32>(a, b, OFFSET);
- #define VECTOR_EXTRACT_GENERATOR_256_U16(OFFSET) return vectorExtract_emulated<U16x16, 16>(a, b, OFFSET);
- #define VECTOR_EXTRACT_GENERATOR_256_U32(OFFSET) return vectorExtract_emulated<U32x8, 8>(a, b, OFFSET);
- #define VECTOR_EXTRACT_GENERATOR_256_I32(OFFSET) return vectorExtract_emulated<I32x8, 8>(a, b, OFFSET);
- #define VECTOR_EXTRACT_GENERATOR_256_F32(OFFSET) return vectorExtract_emulated<F32x8, 8>(a, b, OFFSET);
- #endif
- // Vector extraction concatenates two input vectors and reads a vector between them using an offset.
- // The first and last offsets, which simply return one of the inputs, can be used for readability, because they will be inlined and removed by the compiler.
- // To get elements from the right side, combine the center vector with the right vector and shift one element to the left using vectorExtract_1 for the given type.
- // To get elements from the left side, combine the left vector with the center vector and shift one element to the right using vectorExtract_31 for 32 lanes, vectorExtract_15 for 16 lanes, or vectorExtract_7 for 8 lanes.
- // A usage sketch is included after these overloads.
- // TODO: Also allow using a template argument as the element offset with a static assert for the offset, which might be useful in template programming.
- U8x32 inline vectorExtract_0(const U8x32 &a, const U8x32 &b) { return a; }
- U8x32 inline vectorExtract_1(const U8x32 &a, const U8x32 &b) { VECTOR_EXTRACT_GENERATOR_256_U8(1) }
- U8x32 inline vectorExtract_2(const U8x32 &a, const U8x32 &b) { VECTOR_EXTRACT_GENERATOR_256_U8(2) }
- U8x32 inline vectorExtract_3(const U8x32 &a, const U8x32 &b) { VECTOR_EXTRACT_GENERATOR_256_U8(3) }
- U8x32 inline vectorExtract_4(const U8x32 &a, const U8x32 &b) { VECTOR_EXTRACT_GENERATOR_256_U8(4) }
- U8x32 inline vectorExtract_5(const U8x32 &a, const U8x32 &b) { VECTOR_EXTRACT_GENERATOR_256_U8(5) }
- U8x32 inline vectorExtract_6(const U8x32 &a, const U8x32 &b) { VECTOR_EXTRACT_GENERATOR_256_U8(6) }
- U8x32 inline vectorExtract_7(const U8x32 &a, const U8x32 &b) { VECTOR_EXTRACT_GENERATOR_256_U8(7) }
- U8x32 inline vectorExtract_8(const U8x32 &a, const U8x32 &b) { VECTOR_EXTRACT_GENERATOR_256_U8(8) }
- U8x32 inline vectorExtract_9(const U8x32 &a, const U8x32 &b) { VECTOR_EXTRACT_GENERATOR_256_U8(9) }
- U8x32 inline vectorExtract_10(const U8x32 &a, const U8x32 &b) { VECTOR_EXTRACT_GENERATOR_256_U8(10) }
- U8x32 inline vectorExtract_11(const U8x32 &a, const U8x32 &b) { VECTOR_EXTRACT_GENERATOR_256_U8(11) }
- U8x32 inline vectorExtract_12(const U8x32 &a, const U8x32 &b) { VECTOR_EXTRACT_GENERATOR_256_U8(12) }
- U8x32 inline vectorExtract_13(const U8x32 &a, const U8x32 &b) { VECTOR_EXTRACT_GENERATOR_256_U8(13) }
- U8x32 inline vectorExtract_14(const U8x32 &a, const U8x32 &b) { VECTOR_EXTRACT_GENERATOR_256_U8(14) }
- U8x32 inline vectorExtract_15(const U8x32 &a, const U8x32 &b) { VECTOR_EXTRACT_GENERATOR_256_U8(15) }
- U8x32 inline vectorExtract_16(const U8x32 &a, const U8x32 &b) { VECTOR_EXTRACT_GENERATOR_256_U8(16) }
- U8x32 inline vectorExtract_17(const U8x32 &a, const U8x32 &b) { VECTOR_EXTRACT_GENERATOR_256_U8(17) }
- U8x32 inline vectorExtract_18(const U8x32 &a, const U8x32 &b) { VECTOR_EXTRACT_GENERATOR_256_U8(18) }
- U8x32 inline vectorExtract_19(const U8x32 &a, const U8x32 &b) { VECTOR_EXTRACT_GENERATOR_256_U8(19) }
- U8x32 inline vectorExtract_20(const U8x32 &a, const U8x32 &b) { VECTOR_EXTRACT_GENERATOR_256_U8(20) }
- U8x32 inline vectorExtract_21(const U8x32 &a, const U8x32 &b) { VECTOR_EXTRACT_GENERATOR_256_U8(21) }
- U8x32 inline vectorExtract_22(const U8x32 &a, const U8x32 &b) { VECTOR_EXTRACT_GENERATOR_256_U8(22) }
- U8x32 inline vectorExtract_23(const U8x32 &a, const U8x32 &b) { VECTOR_EXTRACT_GENERATOR_256_U8(23) }
- U8x32 inline vectorExtract_24(const U8x32 &a, const U8x32 &b) { VECTOR_EXTRACT_GENERATOR_256_U8(24) }
- U8x32 inline vectorExtract_25(const U8x32 &a, const U8x32 &b) { VECTOR_EXTRACT_GENERATOR_256_U8(25) }
- U8x32 inline vectorExtract_26(const U8x32 &a, const U8x32 &b) { VECTOR_EXTRACT_GENERATOR_256_U8(26) }
- U8x32 inline vectorExtract_27(const U8x32 &a, const U8x32 &b) { VECTOR_EXTRACT_GENERATOR_256_U8(27) }
- U8x32 inline vectorExtract_28(const U8x32 &a, const U8x32 &b) { VECTOR_EXTRACT_GENERATOR_256_U8(28) }
- U8x32 inline vectorExtract_29(const U8x32 &a, const U8x32 &b) { VECTOR_EXTRACT_GENERATOR_256_U8(29) }
- U8x32 inline vectorExtract_30(const U8x32 &a, const U8x32 &b) { VECTOR_EXTRACT_GENERATOR_256_U8(30) }
- U8x32 inline vectorExtract_31(const U8x32 &a, const U8x32 &b) { VECTOR_EXTRACT_GENERATOR_256_U8(31) }
- U8x32 inline vectorExtract_32(const U8x32 &a, const U8x32 &b) { return b; }
- U16x16 inline vectorExtract_0(const U16x16 &a, const U16x16 &b) { return a; }
- U16x16 inline vectorExtract_1(const U16x16 &a, const U16x16 &b) { VECTOR_EXTRACT_GENERATOR_256_U16(1) }
- U16x16 inline vectorExtract_2(const U16x16 &a, const U16x16 &b) { VECTOR_EXTRACT_GENERATOR_256_U16(2) }
- U16x16 inline vectorExtract_3(const U16x16 &a, const U16x16 &b) { VECTOR_EXTRACT_GENERATOR_256_U16(3) }
- U16x16 inline vectorExtract_4(const U16x16 &a, const U16x16 &b) { VECTOR_EXTRACT_GENERATOR_256_U16(4) }
- U16x16 inline vectorExtract_5(const U16x16 &a, const U16x16 &b) { VECTOR_EXTRACT_GENERATOR_256_U16(5) }
- U16x16 inline vectorExtract_6(const U16x16 &a, const U16x16 &b) { VECTOR_EXTRACT_GENERATOR_256_U16(6) }
- U16x16 inline vectorExtract_7(const U16x16 &a, const U16x16 &b) { VECTOR_EXTRACT_GENERATOR_256_U16(7) }
- U16x16 inline vectorExtract_8(const U16x16 &a, const U16x16 &b) { VECTOR_EXTRACT_GENERATOR_256_U16(8) }
- U16x16 inline vectorExtract_9(const U16x16 &a, const U16x16 &b) { VECTOR_EXTRACT_GENERATOR_256_U16(9) }
- U16x16 inline vectorExtract_10(const U16x16 &a, const U16x16 &b) { VECTOR_EXTRACT_GENERATOR_256_U16(10) }
- U16x16 inline vectorExtract_11(const U16x16 &a, const U16x16 &b) { VECTOR_EXTRACT_GENERATOR_256_U16(11) }
- U16x16 inline vectorExtract_12(const U16x16 &a, const U16x16 &b) { VECTOR_EXTRACT_GENERATOR_256_U16(12) }
- U16x16 inline vectorExtract_13(const U16x16 &a, const U16x16 &b) { VECTOR_EXTRACT_GENERATOR_256_U16(13) }
- U16x16 inline vectorExtract_14(const U16x16 &a, const U16x16 &b) { VECTOR_EXTRACT_GENERATOR_256_U16(14) }
- U16x16 inline vectorExtract_15(const U16x16 &a, const U16x16 &b) { VECTOR_EXTRACT_GENERATOR_256_U16(15) }
- U16x16 inline vectorExtract_16(const U16x16 &a, const U16x16 &b) { return b; }
- U32x8 inline vectorExtract_0(const U32x8 &a, const U32x8 &b) { return a; }
- U32x8 inline vectorExtract_1(const U32x8 &a, const U32x8 &b) { VECTOR_EXTRACT_GENERATOR_256_U32(1) }
- U32x8 inline vectorExtract_2(const U32x8 &a, const U32x8 &b) { VECTOR_EXTRACT_GENERATOR_256_U32(2) }
- U32x8 inline vectorExtract_3(const U32x8 &a, const U32x8 &b) { VECTOR_EXTRACT_GENERATOR_256_U32(3) }
- U32x8 inline vectorExtract_4(const U32x8 &a, const U32x8 &b) { VECTOR_EXTRACT_GENERATOR_256_U32(4) }
- U32x8 inline vectorExtract_5(const U32x8 &a, const U32x8 &b) { VECTOR_EXTRACT_GENERATOR_256_U32(5) }
- U32x8 inline vectorExtract_6(const U32x8 &a, const U32x8 &b) { VECTOR_EXTRACT_GENERATOR_256_U32(6) }
- U32x8 inline vectorExtract_7(const U32x8 &a, const U32x8 &b) { VECTOR_EXTRACT_GENERATOR_256_U32(7) }
- U32x8 inline vectorExtract_8(const U32x8 &a, const U32x8 &b) { return b; }
- I32x8 inline vectorExtract_0(const I32x8 &a, const I32x8 &b) { return a; }
- I32x8 inline vectorExtract_1(const I32x8 &a, const I32x8 &b) { VECTOR_EXTRACT_GENERATOR_256_I32(1) }
- I32x8 inline vectorExtract_2(const I32x8 &a, const I32x8 &b) { VECTOR_EXTRACT_GENERATOR_256_I32(2) }
- I32x8 inline vectorExtract_3(const I32x8 &a, const I32x8 &b) { VECTOR_EXTRACT_GENERATOR_256_I32(3) }
- I32x8 inline vectorExtract_4(const I32x8 &a, const I32x8 &b) { VECTOR_EXTRACT_GENERATOR_256_I32(4) }
- I32x8 inline vectorExtract_5(const I32x8 &a, const I32x8 &b) { VECTOR_EXTRACT_GENERATOR_256_I32(5) }
- I32x8 inline vectorExtract_6(const I32x8 &a, const I32x8 &b) { VECTOR_EXTRACT_GENERATOR_256_I32(6) }
- I32x8 inline vectorExtract_7(const I32x8 &a, const I32x8 &b) { VECTOR_EXTRACT_GENERATOR_256_I32(7) }
- I32x8 inline vectorExtract_8(const I32x8 &a, const I32x8 &b) { return b; }
- F32x8 inline vectorExtract_0(const F32x8 &a, const F32x8 &b) { return a; }
- F32x8 inline vectorExtract_1(const F32x8 &a, const F32x8 &b) { VECTOR_EXTRACT_GENERATOR_256_F32(1) }
- F32x8 inline vectorExtract_2(const F32x8 &a, const F32x8 &b) { VECTOR_EXTRACT_GENERATOR_256_F32(2) }
- F32x8 inline vectorExtract_3(const F32x8 &a, const F32x8 &b) { VECTOR_EXTRACT_GENERATOR_256_F32(3) }
- F32x8 inline vectorExtract_4(const F32x8 &a, const F32x8 &b) { VECTOR_EXTRACT_GENERATOR_256_F32(4) }
- F32x8 inline vectorExtract_5(const F32x8 &a, const F32x8 &b) { VECTOR_EXTRACT_GENERATOR_256_F32(5) }
- F32x8 inline vectorExtract_6(const F32x8 &a, const F32x8 &b) { VECTOR_EXTRACT_GENERATOR_256_F32(6) }
- F32x8 inline vectorExtract_7(const F32x8 &a, const F32x8 &b) { VECTOR_EXTRACT_GENERATOR_256_F32(7) }
- F32x8 inline vectorExtract_8(const F32x8 &a, const F32x8 &b) { return b; }
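- // Usage sketch (illustrative only, not part of the API): fetching the lane-wise left and right
- // neighbors of an aligned center block, as described above. leftBlock, centerBlock and rightBlock
- // are three consecutive aligned U8x32 blocks from the same row.
- inline void example_getNeighbors(const U8x32& leftBlock, const U8x32& centerBlock, const U8x32& rightBlock, U8x32& previousOut, U8x32& nextOut) {
- 	// Lane i gets row element i - 1, borrowing the last element of leftBlock for lane 0.
- 	previousOut = vectorExtract_31(leftBlock, centerBlock);
- 	// Lane i gets row element i + 1, borrowing the first element of rightBlock for lane 31.
- 	nextOut = vectorExtract_1(centerBlock, rightBlock);
- }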
- // Gather instructions load memory from a pointer at multiple index offsets at the same time.
- // The given pointers should be aligned with 4 bytes, so that the fallback solution works on machines with strict alignment requirements.
- #if defined USE_AVX2
- #define GATHER_I32x8_AVX2(SOURCE, EIGHT_OFFSETS, SCALE) _mm256_i32gather_epi32((const int32_t*)(SOURCE), EIGHT_OFFSETS, SCALE)
- #define GATHER_U32x8_AVX2(SOURCE, EIGHT_OFFSETS, SCALE) _mm256_i32gather_epi32((const int32_t*)(SOURCE), EIGHT_OFFSETS, SCALE)
- #define GATHER_F32x8_AVX2(SOURCE, EIGHT_OFFSETS, SCALE) _mm256_i32gather_ps((const float*)(SOURCE), EIGHT_OFFSETS, SCALE)
- #endif
- static inline U32x8 gather_U32(dsr::SafePointer<const uint32_t> data, const U32x8 &elementOffset) {
- #ifdef SAFE_POINTER_CHECKS
- ALIGN32 uint32_t elementOffsets[8];
- if (uintptr_t((void*)elementOffsets) & 31u) { throwError(U"Unaligned stack memory detected in 256-bit gather_U32!\n"); }
- elementOffset.writeAlignedUnsafe(elementOffsets);
- data.assertInside("U32x4 gather_U32 lane 0", (data + elementOffsets[0]).getUnchecked());
- data.assertInside("U32x4 gather_U32 lane 1", (data + elementOffsets[1]).getUnchecked());
- data.assertInside("U32x4 gather_U32 lane 2", (data + elementOffsets[2]).getUnchecked());
- data.assertInside("U32x4 gather_U32 lane 3", (data + elementOffsets[3]).getUnchecked());
- data.assertInside("U32x4 gather_U32 lane 4", (data + elementOffsets[4]).getUnchecked());
- data.assertInside("U32x4 gather_U32 lane 5", (data + elementOffsets[5]).getUnchecked());
- data.assertInside("U32x4 gather_U32 lane 6", (data + elementOffsets[6]).getUnchecked());
- data.assertInside("U32x4 gather_U32 lane 7", (data + elementOffsets[7]).getUnchecked());
- #endif
- #if defined USE_AVX2
- return U32x8(GATHER_U32x8_AVX2(data.getUnsafe(), elementOffset.v, 4));
- #else
- #ifndef SAFE_POINTER_CHECKS
- ALIGN32 uint32_t elementOffsets[8];
- elementOffset.writeAlignedUnsafe(elementOffsets);
- #endif
- return U32x8(
- *(data + elementOffsets[0]),
- *(data + elementOffsets[1]),
- *(data + elementOffsets[2]),
- *(data + elementOffsets[3]),
- *(data + elementOffsets[4]),
- *(data + elementOffsets[5]),
- *(data + elementOffsets[6]),
- *(data + elementOffsets[7])
- );
- #endif
- }
- static inline I32x8 gather_I32(dsr::SafePointer<const int32_t> data, const U32x8 &elementOffset) {
- #ifdef SAFE_POINTER_CHECKS
- ALIGN32 uint32_t elementOffsets[8];
- if (uintptr_t((void*)elementOffsets) & 31u) { throwError(U"Unaligned stack memory detected in 256-bit gather_I32!\n"); }
- elementOffset.writeAlignedUnsafe(elementOffsets);
- data.assertInside("I32x4 gather_I32 lane 0", (data + elementOffsets[0]).getUnchecked());
- data.assertInside("I32x4 gather_I32 lane 1", (data + elementOffsets[1]).getUnchecked());
- data.assertInside("I32x4 gather_I32 lane 2", (data + elementOffsets[2]).getUnchecked());
- data.assertInside("I32x4 gather_I32 lane 3", (data + elementOffsets[3]).getUnchecked());
- data.assertInside("I32x4 gather_I32 lane 4", (data + elementOffsets[4]).getUnchecked());
- data.assertInside("I32x4 gather_I32 lane 5", (data + elementOffsets[5]).getUnchecked());
- data.assertInside("I32x4 gather_I32 lane 6", (data + elementOffsets[6]).getUnchecked());
- data.assertInside("I32x4 gather_I32 lane 7", (data + elementOffsets[7]).getUnchecked());
- #endif
- #if defined USE_AVX2
- return I32x8(GATHER_I32x8_AVX2(data.getUnsafe(), elementOffset.v, 4));
- #else
- #ifndef SAFE_POINTER_CHECKS
- ALIGN32 uint32_t elementOffsets[8];
- elementOffset.writeAlignedUnsafe(elementOffsets);
- #endif
- return I32x8(
- *(data + elementOffsets[0]),
- *(data + elementOffsets[1]),
- *(data + elementOffsets[2]),
- *(data + elementOffsets[3]),
- *(data + elementOffsets[4]),
- *(data + elementOffsets[5]),
- *(data + elementOffsets[6]),
- *(data + elementOffsets[7])
- );
- #endif
- }
- static inline F32x8 gather_F32(dsr::SafePointer<const float> data, const U32x8 &elementOffset) {
- #ifdef SAFE_POINTER_CHECKS
- ALIGN32 uint32_t elementOffsets[8];
- if (uintptr_t((void*)elementOffsets) & 31u) { throwError(U"Unaligned stack memory detected in 256-bit gather_F32!\n"); }
- elementOffset.writeAlignedUnsafe(elementOffsets);
- data.assertInside("F32x4 gather_F32 lane 0", (data + elementOffsets[0]).getUnchecked());
- data.assertInside("F32x4 gather_F32 lane 1", (data + elementOffsets[1]).getUnchecked());
- data.assertInside("F32x4 gather_F32 lane 2", (data + elementOffsets[2]).getUnchecked());
- data.assertInside("F32x4 gather_F32 lane 3", (data + elementOffsets[3]).getUnchecked());
- data.assertInside("F32x4 gather_I32 lane 4", (data + elementOffsets[4]).getUnchecked());
- data.assertInside("F32x4 gather_F32 lane 5", (data + elementOffsets[5]).getUnchecked());
- data.assertInside("F32x4 gather_F32 lane 6", (data + elementOffsets[6]).getUnchecked());
- data.assertInside("F32x4 gather_F32 lane 7", (data + elementOffsets[7]).getUnchecked());
- #endif
- #if defined USE_AVX2
- return F32x8(GATHER_F32x8_AVX2(data.getUnsafe(), elementOffset.v, 4));
- #else
- #ifndef SAFE_POINTER_CHECKS
- ALIGN32 uint32_t elementOffsets[8];
- elementOffset.writeAlignedUnsafe(elementOffsets);
- #endif
- return F32x8(
- *(data + elementOffsets[0]),
- *(data + elementOffsets[1]),
- *(data + elementOffsets[2]),
- *(data + elementOffsets[3]),
- *(data + elementOffsets[4]),
- *(data + elementOffsets[5]),
- *(data + elementOffsets[6]),
- *(data + elementOffsets[7])
- );
- #endif
- }
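- // Usage sketch (illustrative only, not part of the API): using gather_U32 as an eight element
- // palette lookup. Each lane of indices is an element offset into palette, not a byte offset, and
- // every index must stay inside the allocation for the bound checks above to pass.
- inline U32x8 example_applyPalette(dsr::SafePointer<const uint32_t> palette, const U32x8& indices) {
- 	return gather_U32(palette, indices);
- }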
- // Wrapper functions for explicitly expanding scalars into vectors during math operations.
- #define NUMERICAL_SCALAR_OPERATIONS(VECTOR_TYPE, ELEMENT_TYPE, LANE_COUNT) \
- inline VECTOR_TYPE operator+(const VECTOR_TYPE& left, ELEMENT_TYPE right) { return left + VECTOR_TYPE(right); } \
- inline VECTOR_TYPE operator+(ELEMENT_TYPE left, const VECTOR_TYPE& right) { return VECTOR_TYPE(left) + right; } \
- inline VECTOR_TYPE operator-(const VECTOR_TYPE& left, ELEMENT_TYPE right) { return left - VECTOR_TYPE(right); } \
- inline VECTOR_TYPE operator-(ELEMENT_TYPE left, const VECTOR_TYPE& right) { return VECTOR_TYPE(left) - right; }
- FOR_ALL_VECTOR_TYPES(NUMERICAL_SCALAR_OPERATIONS)
- #undef NUMERICAL_SCALAR_OPERATIONS
- #define MULTIPLY_SCALAR_OPERATIONS(VECTOR_TYPE, ELEMENT_TYPE, LANE_COUNT) \
- inline VECTOR_TYPE operator*(const VECTOR_TYPE& left, ELEMENT_TYPE right) { return left * VECTOR_TYPE(right); } \
- inline VECTOR_TYPE operator*(ELEMENT_TYPE left, const VECTOR_TYPE& right) { return VECTOR_TYPE(left) * right; }
- // TODO: Implement multiplication for U8x16 and U8x32.
- //FOR_ALL_VECTOR_TYPES(MULTIPLY_SCALAR_OPERATIONS)
- MULTIPLY_SCALAR_OPERATIONS(F32x4, float, 4)
- MULTIPLY_SCALAR_OPERATIONS(F32x8, float, 8)
- MULTIPLY_SCALAR_OPERATIONS(U32x4, uint32_t, 4)
- MULTIPLY_SCALAR_OPERATIONS(U32x8, uint32_t, 8)
- MULTIPLY_SCALAR_OPERATIONS(I32x4, int32_t, 4)
- MULTIPLY_SCALAR_OPERATIONS(I32x8, int32_t, 8)
- MULTIPLY_SCALAR_OPERATIONS(U16x8, uint16_t, 8)
- MULTIPLY_SCALAR_OPERATIONS(U16x16, uint16_t, 16)
- #undef MULTIPLY_SCALAR_OPERATIONS
- // Wrapper functions for explicitly duplicating bit masks into the same lane count.
- #define BITWISE_SCALAR_OPERATIONS(VECTOR_TYPE, ELEMENT_TYPE, LANE_COUNT) \
- inline VECTOR_TYPE operator&(const VECTOR_TYPE& left, ELEMENT_TYPE right) { return left & VECTOR_TYPE(right); } \
- inline VECTOR_TYPE operator&(ELEMENT_TYPE left, const VECTOR_TYPE& right) { return VECTOR_TYPE(left) & right; } \
- inline VECTOR_TYPE operator|(const VECTOR_TYPE& left, ELEMENT_TYPE right) { return left | VECTOR_TYPE(right); } \
- inline VECTOR_TYPE operator|(ELEMENT_TYPE left, const VECTOR_TYPE& right) { return VECTOR_TYPE(left) | right; } \
- inline VECTOR_TYPE operator^(const VECTOR_TYPE& left, ELEMENT_TYPE right) { return left ^ VECTOR_TYPE(right); } \
- inline VECTOR_TYPE operator^(ELEMENT_TYPE left, const VECTOR_TYPE& right) { return VECTOR_TYPE(left) ^ right; }
- // TODO: Implement bitwise operations for all unsigned SIMD vectors.
- //FOR_UNSIGNED_VECTOR_TYPES(BITWISE_SCALAR_OPERATIONS)
- BITWISE_SCALAR_OPERATIONS(U32x4, uint32_t, 4)
- BITWISE_SCALAR_OPERATIONS(U32x8, uint32_t, 8)
- #undef BITWISE_SCALAR_OPERATIONS
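- // Usage sketch (illustrative only, not part of the API): with the wrappers above, scalars mix
- // freely with vectors, so constants do not have to be broadcast by hand.
- inline F32x8 example_scaleAndBias(const F32x8& values) {
- 	return values * 2.0f + 1.0f; // 2.0f and 1.0f are expanded into F32x8 by the operators above.
- }
- inline U32x8 example_maskLowByte(const U32x8& values) {
- 	return values & 0xFFu; // The mask is duplicated into all eight lanes.
- }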
- // Cleaning up temporary macro definitions to avoid cluttering the namespace.
- #undef FOR_ALL_VECTOR_TYPES
- #undef FOR_FLOAT_VECTOR_TYPES
- #undef FOR_INTEGER_VECTOR_TYPES
- #undef FOR_SIGNED_VECTOR_TYPES
- #undef FOR_UNSIGNED_VECTOR_TYPES
- // TODO: Let SVE define completely separate types for dynamic vectors.
- // The X vectors use the longest SIMD length that is efficient for both floating-point and integer types.
- // DSR_DEFAULT_ALIGNMENT
- // The number of bytes memory should be aligned with by default when creating buffers and images.
- // F32xX
- // The longest available SIMD vector for storing 32-bit float values. Iterating laneCountX_32Bit floats at a time.
- // I32xX
- // The longest available SIMD vector for storing signed 32-bit integer values. Iterating laneCountX_32Bit integers at a time.
- // U32xX
- // The longest available SIMD vector for storing unsigned 32-bit integer values. Iterating laneCountX_32Bit integers at a time.
- // U16xX
- // The longest available SIMD vector for storing unsigned 16-bit integer values. Iterating laneCountX_16Bit integers at a time.
- // U8xX
- // The longest available SIMD vector for storing unsigned 8-bit integer values. Iterating laneCountX_8Bit integers at a time.
- #if defined USE_256BIT_X_SIMD || defined EMULATE_256BIT_X_SIMD
- // Using 256-bit SIMD
- #define DSR_DEFAULT_VECTOR_SIZE 32
- #define DSR_DEFAULT_ALIGNMENT 32
- using F32xX = F32x8;
- using I32xX = I32x8;
- using U32xX = U32x8;
- using U16xX = U16x16;
- using U8xX = U8x32;
- // Align memory to 256 bits to allow overwriting padding at the end of each pixel row.
- // Otherwise every filter would need slow and bloated duplicated code to preserve the data at the end of each row.
- #else
- // If there is no hardware support for 256-bit vectors, explicitly used 256-bit vectors are emulated and only need to be aligned to 128 bits.
- #define DSR_DEFAULT_VECTOR_SIZE 16
- #define DSR_DEFAULT_ALIGNMENT 16
- using F32xX = F32x4;
- using I32xX = I32x4;
- using U32xX = U32x4;
- using U16xX = U16x8;
- using U8xX = U8x16;
- #endif
- // How many lanes the longest available vector has for a specified lane size.
- // Used to iterate indices and pointers using whole elements.
- static const int laneCountX_32Bit = DSR_DEFAULT_VECTOR_SIZE / 4;
- static const int laneCountX_16Bit = DSR_DEFAULT_VECTOR_SIZE / 2;
- static const int laneCountX_8Bit = DSR_DEFAULT_VECTOR_SIZE;
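- // Usage sketch (illustrative only, not part of the API): the same code compiles to 4 or 8 lanes
- // depending on which SIMD extensions are enabled, and a loop over 32-bit elements would advance
- // its index by laneCountX_32Bit elements per iteration.
- inline F32xX example_average(const F32xX& a, const F32xX& b) {
- 	return (a + b) * 0.5f;
- }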
- // TODO: Let SVE define completely separate types for dynamic vectors.
- // The F vector uses the longest SIMD length that is efficient when only processing float values, even if no integer types are available in the same size.
- // Used when you know that your algorithm is only going to work with float types and you need the extra performance.
- // Some processors have AVX but not AVX2, meaning that they have 256-bit SIMD for floats, but only 128-bit SIMD for integers.
- // F32xF
- // The longest available SIMD vector for storing 32-bit float values. Iterating laneCountF_32Bit floats at a time.
- #if defined USE_256BIT_F_SIMD || defined EMULATE_256BIT_F_SIMD
- #define DSR_FLOAT_VECTOR_SIZE 32
- #define DSR_FLOAT_ALIGNMENT 32
- using F32xF = F32x8;
- #else
- // F vectors are 128-bits.
- #define DSR_FLOAT_VECTOR_SIZE 16
- #define DSR_FLOAT_ALIGNMENT 16
- using F32xF = F32x4;
- #endif
- // Used to iterate over float pointers when using F32xF.
- static const int laneCountF = DSR_FLOAT_VECTOR_SIZE / 4;
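- // Usage sketch (illustrative only, not part of the API): float-only math can use F32xF, which may
- // be wider than F32xX on processors that have AVX but not AVX2. Loops over float pointers would
- // then step by laneCountF elements per iteration.
- inline F32xF example_lerp(const F32xF& a, const F32xF& b, float ratio) {
- 	return a + (b - a) * ratio;
- }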
- // Define traits.
- DSR_APPLY_PROPERTY(DsrTrait_Any_U8 , U8x16)
- DSR_APPLY_PROPERTY(DsrTrait_Any_U8 , U8x32)
- DSR_APPLY_PROPERTY(DsrTrait_Any_U16, U16x8)
- DSR_APPLY_PROPERTY(DsrTrait_Any_U16, U16x16)
- DSR_APPLY_PROPERTY(DsrTrait_Any_U32, U32x4)
- DSR_APPLY_PROPERTY(DsrTrait_Any_U32, U32x8)
- DSR_APPLY_PROPERTY(DsrTrait_Any_I32, I32x4)
- DSR_APPLY_PROPERTY(DsrTrait_Any_I32, I32x8)
- DSR_APPLY_PROPERTY(DsrTrait_Any_F32, F32x4)
- DSR_APPLY_PROPERTY(DsrTrait_Any_F32, F32x8)
- DSR_APPLY_PROPERTY(DsrTrait_Any , U8x16)
- DSR_APPLY_PROPERTY(DsrTrait_Any , U8x32)
- DSR_APPLY_PROPERTY(DsrTrait_Any, U16x8)
- DSR_APPLY_PROPERTY(DsrTrait_Any, U16x16)
- DSR_APPLY_PROPERTY(DsrTrait_Any, U32x4)
- DSR_APPLY_PROPERTY(DsrTrait_Any, U32x8)
- DSR_APPLY_PROPERTY(DsrTrait_Any, I32x4)
- DSR_APPLY_PROPERTY(DsrTrait_Any, I32x8)
- DSR_APPLY_PROPERTY(DsrTrait_Any, F32x4)
- DSR_APPLY_PROPERTY(DsrTrait_Any, F32x8)
- // TODO: Use as independent types when the largest vector lengths are not known in compile time on ARM SVE.
- //DSR_APPLY_PROPERTY(DsrTrait_Any_U8 , U8xX)
- //DSR_APPLY_PROPERTY(DsrTrait_Any_U16, U16xX)
- //DSR_APPLY_PROPERTY(DsrTrait_Any_U32, U32xX)
- //DSR_APPLY_PROPERTY(DsrTrait_Any_I32, I32xX)
- //DSR_APPLY_PROPERTY(DsrTrait_Any_F32, F32xX)
- //DSR_APPLY_PROPERTY(DsrTrait_Any_F32, F32xF)
- //DSR_APPLY_PROPERTY(DsrTrait_Any , U8xX)
- //DSR_APPLY_PROPERTY(DsrTrait_Any, U16xX)
- //DSR_APPLY_PROPERTY(DsrTrait_Any, U32xX)
- //DSR_APPLY_PROPERTY(DsrTrait_Any, I32xX)
- //DSR_APPLY_PROPERTY(DsrTrait_Any, F32xX)
- //DSR_APPLY_PROPERTY(DsrTrait_Any, F32xF)
- }
- #endif