simd.h
  1. // zlib open source license
  2. //
  3. // Copyright (c) 2017 to 2025 David Forsgren Piuva
  4. //
  5. // This software is provided 'as-is', without any express or implied
  6. // warranty. In no event will the authors be held liable for any damages
  7. // arising from the use of this software.
  8. //
  9. // Permission is granted to anyone to use this software for any purpose,
  10. // including commercial applications, and to alter it and redistribute it
  11. // freely, subject to the following restrictions:
  12. //
  13. // 1. The origin of this software must not be misrepresented; you must not
  14. // claim that you wrote the original software. If you use this software
  15. // in a product, an acknowledgment in the product documentation would be
  16. // appreciated but is not required.
  17. //
  18. // 2. Altered source versions must be plainly marked as such, and must not be
  19. // misrepresented as being the original software.
  20. //
  21. // 3. This notice may not be removed or altered from any source
  22. // distribution.
  23. // Hardware abstraction layer for portable SIMD math.
  24. // Used to make calculations faster without having to mess around with any hardware-specific assembly code or intrinsic functions.
  25. // You get the performance you need today and the ability to compile with automatic scalar emulation when building for a future processor that this module has not yet been ported to.
  26. // When you can generate the vectorized code using the same template function as your non-vectorized code, you don't even need to write a reference implementation.
  27. // Using 128-bit SIMD: (beginner friendly, test once, compile anywhere, no compiler flags)
  28. // If you are new to vectorization or only plan to work with 128-bit extensions such as ARM NEON, you can keep it simple by only using the 128-bit vector types.
  29. // Pros and cons:
  30. // + Most target platforms (excluding older systems such as ARMv6) have 128-bit SIMD extensions such as Intel SSE2 or ARM NEON enabled by default.
  31. // ARMv6 does not support ARM NEON, but most ARMv7 processors support it, so compilers enable NEON by default for ARMv7.
  32. // All 64-bit ARM processors have ARM NEON, because it stopped being optional in ARMv8.
  33. // Builds for 64-bit Intel processors usually have SSE2 enabled by default, so you don't have to change any compiler flags when building on a different system.
  34. // + One build for all computers of the same instruction set.
  35. // Great when your application is not so resource heavy, because the least powerful systems don't have the fancy extensions anyway.
  36. // - You might end up enabling the additional SIMD extensions anyway, because the library already uses them internally to run faster.
  37. // Types:
  38. // * Use F32x4, I32x4 and U32x4 for 4 elements at a time
  39. // * U16x8 for 8 elements at a time
  40. // * U8x16 for 16 elements at a time
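// A minimal usage sketch for the 128-bit types listed above, assuming 16-byte aligned buffers whose length is a multiple of 4,
// and using only readAlignedUnsafe, squareRoot and writeAlignedUnsafe as declared further down in this file:
//   float input[8] ALIGN16 = {1.0f, 4.0f, 9.0f, 16.0f, 25.0f, 36.0f, 49.0f, 64.0f};
//   float output[8] ALIGN16;
//   for (int i = 0; i < 8; i += 4) {
//       F32x4 value = F32x4::readAlignedUnsafe(input + i);
//       squareRoot(value).writeAlignedUnsafe(output + i);
//   }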
  41. // Using the X vector size: (advanced, requires testing with different build flags or emulation)
  42. // If you want more performance, you can use variable length type aliases.
  43. // Pros and cons:
  44. // + For heavy calculations where memory access is not the bottleneck, using larger SIMD vectors when enabled allows saving energy and increasing performance.
  45. // - If you forget to test with longer vector lengths (compiling with -mavx2 or defining EMULATE_256BIT_X_SIMD), you might find bugs from not iterating or aligning memory correctly.
  46. // Types:
  47. // * Use F32xX, I32xX and U32xX for laneCountX_32Bit elements at a time
  48. // * U16xX for laneCountX_16Bit elements at a time
  49. // * U8xX for laneCountX_8Bit elements at a time
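// A minimal iteration sketch for the X vector size, assuming that elementCount is a multiple of laneCountX_32Bit,
// that the buffers are aligned for the X vector size, and that the squareRoot function and the
// readAlignedUnsafe/writeAlignedUnsafe methods shown for F32x4 below also exist for the F32xX alias:
//   for (int i = 0; i < elementCount; i += laneCountX_32Bit) {
//       F32xX value = F32xX::readAlignedUnsafe(input + i);
//       squareRoot(value).writeAlignedUnsafe(output + i);
//   }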
  50. // Using the F vector size: (very dangerous, no test can confirm that memory alignment is correct)
  51. // If you want even more performance, you can let float operations use the longest available F vector size, which might exceed the X vector size.
  52. // Pros and cons:
  53. // - You have to manually set the alignment of buffers to DSR_FLOAT_ALIGNMENT to prevent crashes.
  54. // If the default buffer alignment changed based on the size of F vectors, the more commonly used X vectors would be slowed down by cache misses caused by padding larger than the X vector size.
  55. // AlignedImageF32 and sound backends are already aligned with the F vector size, because they are not generic like Buffer.
  56. // - It can be difficult to detect incorrect memory alignment, because a pointer can accidentally be aligned to more than what was requested.
  57. // If accidentally aligning to 128 bits instead of 256 bits, there is a 50% risk of failing to detect it at runtime and later failing on another computer.
  58. // If sticking with 128-bit or X vectors, all buffers will be correctly aligned automatically.
  59. // + For heavy calculations where memory access is not the bottleneck, using larger SIMD vectors when enabled allows saving energy and increasing performance.
  60. // - If you forget to test with longer vector lengths (compiling with -mavx2 or defining EMULATE_256BIT_F_SIMD), you might find bugs from not iterating or aligning memory correctly.
  61. // Types:
  62. // * Use F32xX, I32xX and U32xX for laneCountX_32Bit elements at a time
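// A hedged sketch of aligning a stack buffer for the F vector size, assuming that DSR_FLOAT_ALIGNMENT is a byte
// count that can be passed to the ALIGN_BYTES macro defined further down in this file:
//   float temporary[64] ALIGN_BYTES(DSR_FLOAT_ALIGNMENT);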
  63. // Compiler extensions
  64. // On Intel/AMD processors:
  65. // SSE2 is usually enabled by default, because SSE2 is mandatory for 64-bit Intel instructions.
  66. // Use -mavx as a G++ compiler flag to enable the AVX extension, enabling the USE_AVX and USE_256BIT_F_SIMD macros.
  67. // If not available on your computer, you can test your algorithm for 256-bit float SIMD using EMULATE_256BIT_F_SIMD, but only if you use the F vector size.
  68. // Use -mavx2 as a G++ compiler flag to enable the AVX2 extension, enabling the USE_AVX2 and USE_256BIT_X_SIMD macros.
  69. // If not available on your computer, you can test your algorithm for 256-bit float and integer SIMD using EMULATE_256BIT_X_SIMD, but only if you use the X vector size.
  70. // On ARMv6 processors:
  71. // Scalar emulation is used when compiling for ARMv6, because it does not have NEON and VFP is not supported in this abstraction.
  72. // On ARMv7 processors:
  73. // NEON is usually enabled by default for ARMv7, because most of them have the extension.
  74. // On ARMv8 processors:
  75. // NEON cannot be disabled when building for ARMv8, because it is mandatory in ARMv8.
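// Example build commands (a sketch, assuming g++ and that the emulation macros are enabled by defining them on the command line):
//   g++ -O2 main.cpp                            builds with the default 128-bit extensions (SSE2 or NEON)
//   g++ -O2 -mavx2 main.cpp                     enables USE_AVX2 and the 256-bit X vector size
//   g++ -O2 -D EMULATE_256BIT_X_SIMD main.cpp   emulates 256-bit X vectors for testing without AVX2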
  76. #ifndef DFPSR_SIMD
  77. #define DFPSR_SIMD
  78. #include <cstdint>
  79. #include <cassert>
  80. #include "SafePointer.h"
  81. #include "../math/FVector.h"
  82. #include "../math/IVector.h"
  83. #include "../math/UVector.h"
  84. #include "DsrTraits.h"
  85. #include "../settings.h"
  86. #include "../base/noSimd.h"
  87. namespace dsr {
  88. // Alignment in bytes
  89. #define ALIGN_BYTES(SIZE) __attribute__((aligned(SIZE)))
  90. #define ALIGN16 ALIGN_BYTES(16) // 128-bit alignment
  91. #define ALIGN32 ALIGN_BYTES(32) // 256-bit alignment
  92. #define ALIGN64 ALIGN_BYTES(64) // 512-bit alignment
  93. #define ALIGN128 ALIGN_BYTES(128) // 1024-bit alignment
  94. #define ALIGN256 ALIGN_BYTES(256) // 2048-bit alignment
  95. // Everything declared in here handles things specific to SSE.
  96. // Direct use of the macros will not provide portability to all hardware.
  97. #ifdef USE_SSE2
  98. #define USE_DIRECT_SIMD_MEMORY_ACCESS
  99. #include <emmintrin.h> // SSE2
  100. #ifdef USE_SSSE3
  101. #include <tmmintrin.h> // SSSE3
  102. #endif
  103. #ifdef USE_AVX
  104. #include <immintrin.h> // AVX / AVX2
  105. #endif
  106. // Vector types
  107. #define SIMD_F32x4 __m128
  108. #define SIMD_U8x16 __m128i
  109. #define SIMD_U16x8 __m128i
  110. #define SIMD_U32x4 __m128i
  111. #define SIMD_I32x4 __m128i
  112. // Vector uploads in address order
  113. #define LOAD_VECTOR_F32_SIMD(A, B, C, D) _mm_set_ps(D, C, B, A)
  114. #define LOAD_SCALAR_F32_SIMD(A) _mm_set1_ps(A)
  115. #define LOAD_VECTOR_U8_SIMD(A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P) _mm_set_epi8(P, O, N, M, L, K, J, I, H, G, F, E, D, C, B, A)
  116. #define LOAD_SCALAR_U8_SIMD(A) _mm_set1_epi8(A)
  117. #define LOAD_VECTOR_U16_SIMD(A, B, C, D, E, F, G, H) _mm_set_epi16(H, G, F, E, D, C, B, A)
  118. #define LOAD_SCALAR_U16_SIMD(A) _mm_set1_epi16(A)
  119. #define LOAD_VECTOR_U32_SIMD(A, B, C, D) _mm_set_epi32(D, C, B, A)
  120. #define LOAD_SCALAR_U32_SIMD(A) _mm_set1_epi32(A)
  121. #define LOAD_VECTOR_I32_SIMD(A, B, C, D) _mm_set_epi32(D, C, B, A)
  122. #define LOAD_SCALAR_I32_SIMD(A) _mm_set1_epi32(A)
  123. // Conversions
  124. #define F32_TO_I32_SIMD(A) _mm_cvttps_epi32(A)
  125. #define F32_TO_U32_SIMD(A) _mm_cvttps_epi32(A)
  126. #define I32_TO_F32_SIMD(A) _mm_cvtepi32_ps(A)
  127. #define U32_TO_F32_SIMD(A) _mm_cvtepi32_ps(A)
  128. // Unpacking conversions
  129. #define U8_LOW_TO_U16_SIMD(A) _mm_unpacklo_epi8(A, _mm_set1_epi8(0))
  130. #define U8_HIGH_TO_U16_SIMD(A) _mm_unpackhi_epi8(A, _mm_set1_epi8(0))
  131. #define U16_LOW_TO_U32_SIMD(A) _mm_unpacklo_epi16(A, _mm_set1_epi16(0))
  132. #define U16_HIGH_TO_U32_SIMD(A) _mm_unpackhi_epi16(A, _mm_set1_epi16(0))
  133. // Saturated packing
  134. // Credit: Using ideas from Victoria Zhislina's NEON_2_SSE.h header from the Intel corporation, but not trying to emulate NEON
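// _mm_packus_epi16 interprets its 16-bit inputs as signed, so lanes with the sign bit set would be treated as negative instead of saturating.
// Masking away the sign bit and then OR:ing in 0x7FFF wherever the masked value compares greater than the original (meaning the sign bit was set)
// clamps those lanes to 0x7FFF, which _mm_packus_epi16 then saturates to 255 as expected for unsigned input.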
  135. inline SIMD_U8x16 PACK_SAT_U16_TO_U8_SIMD(const SIMD_U16x8& a, const SIMD_U16x8& b) {
  136. SIMD_U16x8 mask, a2, b2;
  137. mask = _mm_set1_epi16(0b0111111111111111);
  138. a2 = _mm_and_si128(a, mask);
  139. a2 = _mm_or_si128(a2, _mm_and_si128(_mm_cmpgt_epi16(a2, a), mask));
  140. b2 = _mm_and_si128(b, mask);
  141. b2 = _mm_or_si128(b2, _mm_and_si128(_mm_cmpgt_epi16(b2, b), mask));
  142. return _mm_packus_epi16(a2, b2);
  143. }
  144. // Reinterpret casting
  145. #define REINTERPRET_U32_TO_U8_SIMD(A) (A)
  146. #define REINTERPRET_U32_TO_U16_SIMD(A) (A)
  147. #define REINTERPRET_U8_TO_U32_SIMD(A) (A)
  148. #define REINTERPRET_U16_TO_U32_SIMD(A) (A)
  149. #define REINTERPRET_U32_TO_I32_SIMD(A) (A)
  150. #define REINTERPRET_I32_TO_U32_SIMD(A) (A)
  151. // Vector float operations returning SIMD_F32x4
  152. #define ADD_F32_SIMD(A, B) _mm_add_ps(A, B)
  153. #define SUB_F32_SIMD(A, B) _mm_sub_ps(A, B)
  154. #define MUL_F32_SIMD(A, B) _mm_mul_ps(A, B)
  155. // Vector integer operations returning SIMD_I32x4
  156. #define ADD_I32_SIMD(A, B) _mm_add_epi32(A, B)
  157. #define SUB_I32_SIMD(A, B) _mm_sub_epi32(A, B)
  158. // 32-bit integer multiplications are not available on SSE2.
  159. // Vector integer operations returning SIMD_U32x4
  160. #define ADD_U32_SIMD(A, B) _mm_add_epi32(A, B)
  161. #define SUB_U32_SIMD(A, B) _mm_sub_epi32(A, B)
  162. // 32-bit integer multiplications are not available on SSE2.
  163. // Vector integer operations returning SIMD_U16x8
  164. #define ADD_U16_SIMD(A, B) _mm_add_epi16(A, B)
  165. #define SUB_U16_SIMD(A, B) _mm_sub_epi16(A, B)
  166. #define MUL_U16_SIMD(A, B) _mm_mullo_epi16(A, B)
  167. // Vector integer operations returning SIMD_U8x16
  168. #define ADD_U8_SIMD(A, B) _mm_add_epi8(A, B)
  169. #define ADD_SAT_U8_SIMD(A, B) _mm_adds_epu8(A, B) // Saturated addition
  170. #define SUB_U8_SIMD(A, B) _mm_sub_epi8(A, B)
  171. #define SUB_SAT_U8_SIMD(A, B) _mm_subs_epu8(A, B) // Saturated subtraction
  172. // No 8-bit multiplications
  173. // Statistics
  174. #define MIN_F32_SIMD(A, B) _mm_min_ps(A, B)
  175. #define MAX_F32_SIMD(A, B) _mm_max_ps(A, B)
  176. // TODO: Implement minimum and maximum for integer vectors, so that all operations exist for all applicable types:
  177. // Using _mm256_max_epu16... in AVX2 for 256-bit versions
  178. // Using comparisons and masking in SSE2 when _mm_max_epu16... in SSE4.1 is not available
  179. // Bitwise
  180. #define BITWISE_AND_U32_SIMD(A, B) _mm_and_si128(A, B)
  181. #define BITWISE_OR_U32_SIMD(A, B) _mm_or_si128(A, B)
  182. #define BITWISE_XOR_U32_SIMD(A, B) _mm_xor_si128(A, B)
  183. #ifdef USE_AVX
  184. // 256-bit vector types
  185. #define SIMD_F32x8 __m256
  186. // Vector uploads in address order
  187. #define LOAD_VECTOR_F32_SIMD256(A, B, C, D, E, F, G, H) _mm256_set_ps(H, G, F, E, D, C, B, A)
  188. #define LOAD_SCALAR_F32_SIMD256(A) _mm256_set1_ps(A)
  189. // Vector float operations returning SIMD_F32x8
  190. #define ADD_F32_SIMD256(A, B) _mm256_add_ps(A, B)
  191. #define SUB_F32_SIMD256(A, B) _mm256_sub_ps(A, B)
  192. #define MUL_F32_SIMD256(A, B) _mm256_mul_ps(A, B)
  193. // Statistics
  194. #define MIN_F32_SIMD256(A, B) _mm256_min_ps(A, B)
  195. #define MAX_F32_SIMD256(A, B) _mm256_max_ps(A, B)
  196. #ifdef USE_AVX2
  197. // 256-bit vector types
  198. #define SIMD_U8x32 __m256i
  199. #define SIMD_U16x16 __m256i
  200. #define SIMD_U32x8 __m256i
  201. #define SIMD_I32x8 __m256i
  202. // Vector uploads in address order
  203. #define LOAD_VECTOR_U8_SIMD256(A1, B1, C1, D1, E1, F1, G1, H1, I1, J1, K1, L1, M1, N1, O1, P1, Q1, R1, S1, T1, U1, V1, W1, X1, Y1, Z1, A2, B2, C2, D2, E2, F2) _mm256_set_epi8(F2, E2, D2, C2, B2, A2, Z1, Y1, X1, W1, V1, U1, T1, S1, R1, Q1, P1, O1, N1, M1, L1, K1, J1, I1, H1, G1, F1, E1, D1, C1, B1, A1)
  204. #define LOAD_SCALAR_U8_SIMD256(A) _mm256_set1_epi8(A)
  205. #define LOAD_VECTOR_U16_SIMD256(A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P) _mm256_set_epi16(P, O, N, M, L, K, J, I, H, G, F, E, D, C, B, A)
  206. #define LOAD_SCALAR_U16_SIMD256(A) _mm256_set1_epi16(A)
  207. #define LOAD_VECTOR_U32_SIMD256(A, B, C, D, E, F, G, H) _mm256_set_epi32(H, G, F, E, D, C, B, A)
  208. #define LOAD_SCALAR_U32_SIMD256(A) _mm256_set1_epi32(A)
  209. #define LOAD_VECTOR_I32_SIMD256(A, B, C, D, E, F, G, H) _mm256_set_epi32(H, G, F, E, D, C, B, A)
  210. #define LOAD_SCALAR_I32_SIMD256(A) _mm256_set1_epi32(A)
  211. // Conversions
  212. #define F32_TO_I32_SIMD256(A) _mm256_cvttps_epi32(A)
  213. #define F32_TO_U32_SIMD256(A) _mm256_cvttps_epi32(A)
  214. #define I32_TO_F32_SIMD256(A) _mm256_cvtepi32_ps(A)
  215. #define U32_TO_F32_SIMD256(A) _mm256_cvtepi32_ps(A)
  216. // Unpacking conversions
  217. #define U8_LOW_TO_U16_SIMD256(A) _mm256_unpacklo_epi8(_mm256_permute4x64_epi64(A, 0b11011000), _mm256_set1_epi8(0))
  218. #define U8_HIGH_TO_U16_SIMD256(A) _mm256_unpackhi_epi8(_mm256_permute4x64_epi64(A, 0b11011000), _mm256_set1_epi8(0))
  219. #define U16_LOW_TO_U32_SIMD256(A) _mm256_unpacklo_epi16(_mm256_permute4x64_epi64(A, 0b11011000), _mm256_set1_epi16(0))
  220. #define U16_HIGH_TO_U32_SIMD256(A) _mm256_unpackhi_epi16(_mm256_permute4x64_epi64(A, 0b11011000), _mm256_set1_epi16(0))
  221. // Saturated packing
  222. // Credit: Using ideas from Victoria Zhislina's NEON_2_SSE.h header from the Intel corporation, but not trying to emulate NEON
  223. inline SIMD_U8x32 PACK_SAT_U16_TO_U8_SIMD256(const SIMD_U16x16& a, const SIMD_U16x16& b) {
  224. SIMD_U16x16 mask, a2, b2;
  225. mask = _mm256_set1_epi16(0b0111111111111111);
  226. a2 = _mm256_and_si256(a, mask);
  227. a2 = _mm256_or_si256(a2, _mm256_and_si256(_mm256_cmpgt_epi16(a2, a), mask));
  228. b2 = _mm256_and_si256(b, mask);
  229. b2 = _mm256_or_si256(b2, _mm256_and_si256(_mm256_cmpgt_epi16(b2, b), mask));
  230. // The 256-bit pack instruction _mm256_packus_epi16 is not serial, so the result has to be permuted into the correct order.
  231. // 0 2 1 3
  232. // | X |
  233. // 0 1 2 3
  234. return _mm256_permute4x64_epi64(_mm256_packus_epi16(a2, b2), 0b11011000);
  235. }
  236. // Reinterpret casting
  237. #define REINTERPRET_U32_TO_U8_SIMD256(A) (A)
  238. #define REINTERPRET_U32_TO_U16_SIMD256(A) (A)
  239. #define REINTERPRET_U8_TO_U32_SIMD256(A) (A)
  240. #define REINTERPRET_U16_TO_U32_SIMD256(A) (A)
  241. #define REINTERPRET_U32_TO_I32_SIMD256(A) (A)
  242. #define REINTERPRET_I32_TO_U32_SIMD256(A) (A)
  243. // Vector integer operations returning SIMD_I32x8
  244. #define ADD_I32_SIMD256(A, B) _mm256_add_epi32(A, B)
  245. #define SUB_I32_SIMD256(A, B) _mm256_sub_epi32(A, B)
  246. #define MUL_I32_SIMD256(A, B) _mm256_mullo_epi32(A, B)
  247. // Vector integer operations returning SIMD_U32x8
  248. #define ADD_U32_SIMD256(A, B) _mm256_add_epi32(A, B)
  249. #define SUB_U32_SIMD256(A, B) _mm256_sub_epi32(A, B)
  250. #define MUL_U32_SIMD256(A, B) _mm256_mullo_epi32(A, B)
  251. // Vector integer operations returning SIMD_U16x16
  252. #define ADD_U16_SIMD256(A, B) _mm256_add_epi16(A, B)
  253. #define SUB_U16_SIMD256(A, B) _mm256_sub_epi16(A, B)
  254. #define MUL_U16_SIMD256(A, B) _mm256_mullo_epi16(A, B)
  255. // Vector integer operations returning SIMD_U8x32
  256. #define ADD_U8_SIMD256(A, B) _mm256_add_epi8(A, B)
  257. #define ADD_SAT_U8_SIMD256(A, B) _mm256_adds_epu8(A, B) // Saturated addition
  258. #define SUB_U8_SIMD256(A, B) _mm256_sub_epi8(A, B)
  259. #define SUB_SAT_U8_SIMD256(A, B) _mm256_subs_epu8(A, B) // Saturated subtraction
  260. // No 8-bit multiplications
  261. // Bitwise
  262. #define BITWISE_AND_U32_SIMD256(A, B) _mm256_and_si256(A, B)
  263. #define BITWISE_OR_U32_SIMD256(A, B) _mm256_or_si256(A, B)
  264. #define BITWISE_XOR_U32_SIMD256(A, B) _mm256_xor_si256(A, B)
  265. #endif
  266. #endif
  267. #endif
  268. // Everything declared in here handles things specific to NEON.
  269. // Direct use of the macros will not provide portability to all hardware.
  270. #ifdef USE_NEON
  271. #include <arm_neon.h> // NEON
  272. // Vector types
  273. #define SIMD_F32x4 float32x4_t
  274. #define SIMD_U8x16 uint8x16_t
  275. #define SIMD_U16x8 uint16x8_t
  276. #define SIMD_U32x4 uint32x4_t
  277. #define SIMD_I32x4 int32x4_t
  278. // Vector uploads in address order
  279. inline SIMD_F32x4 LOAD_VECTOR_F32_SIMD(float a, float b, float c, float d) {
  280. float data[4] ALIGN16 = {a, b, c, d};
  281. #ifdef SAFE_POINTER_CHECKS
  282. if (uintptr_t((void*)data) & 15u) { throwError(U"Unaligned stack memory detected in LOAD_VECTOR_F32_SIMD for NEON!\n"); }
  283. #endif
  284. return vld1q_f32(data);
  285. }
  286. inline SIMD_F32x4 LOAD_SCALAR_F32_SIMD(float a) {
  287. return vdupq_n_f32(a);
  288. }
  289. inline SIMD_U8x16 LOAD_VECTOR_U8_SIMD(uint8_t a, uint8_t b, uint8_t c, uint8_t d, uint8_t e, uint8_t f, uint8_t g, uint8_t h,
  290. uint8_t i, uint8_t j, uint8_t k, uint8_t l, uint8_t m, uint8_t n, uint8_t o, uint8_t p) {
  291. uint8_t data[16] ALIGN16 = {a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p};
  292. #ifdef SAFE_POINTER_CHECKS
  293. if (uintptr_t((void*)data) & 15u) { throwError(U"Unaligned stack memory detected in LOAD_VECTOR_U8_SIMD for NEON!\n"); }
  294. #endif
  295. return vld1q_u8(data);
  296. }
  297. inline SIMD_U8x16 LOAD_SCALAR_U8_SIMD(uint8_t a) {
  298. return vdupq_n_u8(a);
  299. }
  300. inline SIMD_U16x8 LOAD_VECTOR_U16_SIMD(uint16_t a, uint16_t b, uint16_t c, uint16_t d, uint16_t e, uint16_t f, uint16_t g, uint16_t h) {
  301. uint16_t data[8] ALIGN16 = {a, b, c, d, e, f, g, h};
  302. #ifdef SAFE_POINTER_CHECKS
  303. if (uintptr_t((void*)data) & 15u) { throwError(U"Unaligned stack memory detected in LOAD_VECTOR_U16_SIMD for NEON!\n"); }
  304. #endif
  305. return vld1q_u16(data);
  306. }
  307. inline SIMD_U16x8 LOAD_SCALAR_U16_SIMD(uint16_t a) {
  308. return vdupq_n_u16(a);
  309. }
  310. inline SIMD_U32x4 LOAD_VECTOR_U32_SIMD(uint32_t a, uint32_t b, uint32_t c, uint32_t d) {
  311. uint32_t data[4] ALIGN16 = {a, b, c, d};
  312. #ifdef SAFE_POINTER_CHECKS
  313. if (uintptr_t((void*)data) & 15u) { throwError(U"Unaligned stack memory detected in LOAD_VECTOR_U32_SIMD for NEON!\n"); }
  314. #endif
  315. return vld1q_u32(data);
  316. }
  317. inline SIMD_U32x4 LOAD_SCALAR_U32_SIMD(uint32_t a) {
  318. return vdupq_n_u32(a);
  319. }
  320. inline SIMD_I32x4 LOAD_VECTOR_I32_SIMD(int32_t a, int32_t b, int32_t c, int32_t d) {
  321. int32_t data[4] ALIGN16 = {a, b, c, d};
  322. #ifdef SAFE_POINTER_CHECKS
  323. if (uintptr_t((void*)data) & 15u) { throwError(U"Unaligned stack memory detected in LOAD_VECTOR_I32_SIMD for NEON!\n"); }
  324. #endif
  325. return vld1q_s32(data);
  326. }
  327. inline SIMD_I32x4 LOAD_SCALAR_I32_SIMD(int32_t a) {
  328. return vdupq_n_s32(a);
  329. }
  330. // Conversions
  331. #define F32_TO_I32_SIMD(A) vcvtq_s32_f32(A)
  332. #define F32_TO_U32_SIMD(A) vcvtq_u32_f32(A)
  333. #define I32_TO_F32_SIMD(A) vcvtq_f32_s32(A)
  334. #define U32_TO_F32_SIMD(A) vcvtq_f32_u32(A)
  335. // Unpacking conversions
  336. #define U8_LOW_TO_U16_SIMD(A) vmovl_u8(vget_low_u8(A))
  337. #define U8_HIGH_TO_U16_SIMD(A) vmovl_u8(vget_high_u8(A))
  338. #define U16_LOW_TO_U32_SIMD(A) vmovl_u16(vget_low_u16(A))
  339. #define U16_HIGH_TO_U32_SIMD(A) vmovl_u16(vget_high_u16(A))
  340. // Saturated packing
  341. #define PACK_SAT_U16_TO_U8_SIMD(A, B) vcombine_u8(vqmovn_u16(A), vqmovn_u16(B))
  342. // Reinterpret casting
  343. #define REINTERPRET_U32_TO_U8_SIMD(A) vreinterpretq_u8_u32(A)
  344. #define REINTERPRET_U32_TO_U16_SIMD(A) vreinterpretq_u16_u32(A)
  345. #define REINTERPRET_U8_TO_U32_SIMD(A) vreinterpretq_u32_u8(A)
  346. #define REINTERPRET_U16_TO_U32_SIMD(A) vreinterpretq_u32_u16(A)
  347. #define REINTERPRET_U32_TO_I32_SIMD(A) vreinterpretq_s32_u32(A)
  348. #define REINTERPRET_I32_TO_U32_SIMD(A) vreinterpretq_u32_s32(A)
  349. // Vector float operations returning SIMD_F32x4
  350. #define ADD_F32_SIMD(A, B) vaddq_f32(A, B)
  351. #define SUB_F32_SIMD(A, B) vsubq_f32(A, B)
  352. #define MUL_F32_SIMD(A, B) vmulq_f32(A, B)
  353. // Vector integer operations returning SIMD_I32x4
  354. #define ADD_I32_SIMD(A, B) vaddq_s32(A, B)
  355. #define SUB_I32_SIMD(A, B) vsubq_s32(A, B)
  356. #define MUL_I32_NEON(A, B) vmulq_s32(A, B)
  357. // Vector integer operations returning SIMD_U32x4
  358. #define ADD_U32_SIMD(A, B) vaddq_u32(A, B)
  359. #define SUB_U32_SIMD(A, B) vsubq_u32(A, B)
  360. #define MUL_U32_NEON(A, B) vmulq_u32(A, B)
  361. // Vector integer operations returning SIMD_U16x8
  362. #define ADD_U16_SIMD(A, B) vaddq_u16(A, B)
  363. #define SUB_U16_SIMD(A, B) vsubq_u16(A, B)
  364. #define MUL_U16_SIMD(A, B) vmulq_u16(A, B)
  365. // Vector integer operations returning SIMD_U8x16
  366. #define ADD_U8_SIMD(A, B) vaddq_u8(A, B)
  367. #define ADD_SAT_U8_SIMD(A, B) vqaddq_u8(A, B) // Saturated addition
  368. #define SUB_U8_SIMD(A, B) vsubq_u8(A, B)
  369. #define SUB_SAT_U8_SIMD(A, B) vqsubq_u8(A, B) // Saturated subtraction
  370. // No 8-bit multiplications
  371. // Statistics
  372. #define MIN_F32_SIMD(A, B) vminq_f32(A, B)
  373. #define MAX_F32_SIMD(A, B) vmaxq_f32(A, B)
  374. // Bitwise
  375. #define BITWISE_AND_U32_SIMD(A, B) vandq_u32(A, B)
  376. #define BITWISE_OR_U32_SIMD(A, B) vorrq_u32(A, B)
  377. #define BITWISE_XOR_U32_SIMD(A, B) veorq_u32(A, B)
  378. #endif
  379. /*
  380. The vector types below are supposed to be portable across different CPU architectures.
  381. When mixed with handwritten SIMD intrinsics:
  382. Use "USE_SSE2" instead of "__SSE2__"
  383. Use "USE_AVX2" instead of "__AVX2__"
  384. Use "USE_NEON" instead of "__ARM_NEON"
  385. So that any new variations of the macro names given by the compiler can be added to simd.h instead of being duplicated everywhere.
  386. Portability exceptions:
  387. * The "v" variable is the native backend, which is only defined when SIMD is supported by hardware.
  388. Only use when USE_BASIC_SIMD is defined.
  389. Will not work on scalar emulation.
  390. * The "scalars" array is available when emulating a type that does not exist in hardware, or when the SIMD vector allows direct memory access.
  391. Do not rely on it for accessing elements, because code that does will not compile for ARM NEON, which does not allow direct access.
  392. */
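// A minimal sketch of mixing the portable types with handwritten intrinsics (a hypothetical helper, relying only on the members described above):
//   inline F32x4 addNative(const F32x4 &a, const F32x4 &b) {
//   #if defined USE_SSE2
//       return F32x4(_mm_add_ps(a.v, b.v));      // "v" is the native __m128 backend.
//   #elif defined USE_NEON
//       return F32x4(vaddq_f32(a.v, b.v));       // "v" is the native float32x4_t backend.
//   #else
//       return F32x4(a.scalars[0] + b.scalars[0], a.scalars[1] + b.scalars[1],
//                    a.scalars[2] + b.scalars[2], a.scalars[3] + b.scalars[3]);
//   #endif
//   }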
  393. union F32x4 {
  394. private:
  395. // The uninitialized default constructor is private for safety reasons.
  396. F32x4() {}
  397. public:
  398. // When the uninitialized constructor is needed for performance, use this named constructor instead.
  399. static inline F32x4 create_dangerous_uninitialized() { return F32x4(); }
  400. #ifdef USE_BASIC_SIMD
  401. public:
  402. #ifdef USE_DIRECT_SIMD_MEMORY_ACCESS
  403. // Only use if USE_DIRECT_SIMD_MEMORY_ACCESS is defined!
  404. // Direct access cannot be done on NEON!
  405. float scalars[4];
  406. #endif
  407. // The SIMD vector of undefined type
  408. // Not accessible while emulating!
  409. SIMD_F32x4 v;
  410. // Construct a portable vector from a native SIMD vector
  411. explicit F32x4(const SIMD_F32x4& v) : v(v) {}
  412. // Construct a portable vector from a set of scalars
  413. F32x4(float a1, float a2, float a3, float a4) : v(LOAD_VECTOR_F32_SIMD(a1, a2, a3, a4)) {}
  414. // Construct a portable vector from a single duplicated scalar
  415. explicit F32x4(float scalar) : v(LOAD_SCALAR_F32_SIMD(scalar)) {}
  416. #else
  417. public:
  418. // Emulate a SIMD vector as an array of scalars without hardware support.
  419. // Only accessible while emulating!
  420. float scalars[4];
  421. // Construct a portable vector from a set of scalars
  422. F32x4(float a1, float a2, float a3, float a4) {
  423. this->scalars[0] = a1;
  424. this->scalars[1] = a2;
  425. this->scalars[2] = a3;
  426. this->scalars[3] = a4;
  427. }
  428. // Construct a portable vector from a single duplicated scalar
  429. explicit F32x4(float scalar) {
  430. this->scalars[0] = scalar;
  431. this->scalars[1] = scalar;
  432. this->scalars[2] = scalar;
  433. this->scalars[3] = scalar;
  434. }
  435. #endif
  436. // Create a gradient vector using start and increment, so that arbitrary length vectors have a way to initialize linear iterations.
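// For example, createGradient(2.0f, 0.5f) evaluates to (2.0, 2.5, 3.0, 3.5), which can serve as the coordinates of four consecutive elements in a loop.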
  437. static inline F32x4 createGradient(float start, float increment) {
  438. return F32x4(start, start + increment, start + increment * 2.0f, start + increment * 3.0f);
  439. }
  440. // Construct a portable SIMD vector from a pointer to aligned data.
  441. // Data must be aligned with at least 16 bytes.
  442. static inline F32x4 readAlignedUnsafe(const float* data) {
  443. #ifdef SAFE_POINTER_CHECKS
  444. if (uintptr_t((const void*)data) & 15u) { throwError(U"Unaligned pointer detected in F32x4::readAlignedUnsafe!\n"); }
  445. #endif
  446. #ifdef USE_BASIC_SIMD
  447. #if defined USE_SSE2
  448. return F32x4(_mm_load_ps(data));
  449. #elif defined USE_NEON
  450. return F32x4(vld1q_f32(data));
  451. #endif
  452. #else
  453. return F32x4(data[0], data[1], data[2], data[3]);
  454. #endif
  455. }
  456. // Write to aligned memory from the existing vector
  457. // Data must be aligned with at least 16 bytes.
  458. inline void writeAlignedUnsafe(float* data) const {
  459. #ifdef SAFE_POINTER_CHECKS
  460. if (uintptr_t((void*)data) & 15u) { throwError(U"Unaligned pointer detected in F32x4::writeAlignedUnsafe!\n"); }
  461. #endif
  462. #if defined USE_BASIC_SIMD
  463. #if defined USE_SSE2
  464. _mm_store_ps(data, this->v);
  465. #elif defined USE_NEON
  466. vst1q_f32(data, this->v);
  467. #endif
  468. #else
  469. data[0] = this->scalars[0];
  470. data[1] = this->scalars[1];
  471. data[2] = this->scalars[2];
  472. data[3] = this->scalars[3];
  473. #endif
  474. }
  475. #if defined DFPSR_GEOMETRY_FVECTOR
  476. dsr::FVector4D get() const {
  477. float data[4] ALIGN16;
  478. #ifdef SAFE_POINTER_CHECKS
  479. if (uintptr_t(data) & 15u) { throwError(U"Unaligned stack memory detected in FVector4D F32x4::get!\n"); }
  480. #endif
  481. this->writeAlignedUnsafe(data);
  482. return dsr::FVector4D(data[0], data[1], data[2], data[3]);
  483. }
  484. #endif
  485. // Bound and alignment checked reading
  486. static inline F32x4 readAligned(dsr::SafePointer<const float> data, const char* methodName) {
  487. const float* pointer = data.getUnsafe();
  488. #if defined SAFE_POINTER_CHECKS
  489. data.assertInside(methodName, pointer, 16);
  490. #endif
  491. return F32x4::readAlignedUnsafe(pointer);
  492. }
  493. // Bound and alignment checked writing
  494. inline void writeAligned(dsr::SafePointer<float> data, const char* methodName) const {
  495. float* pointer = data.getUnsafe();
  496. #if defined SAFE_POINTER_CHECKS
  497. data.assertInside(methodName, pointer, 16);
  498. #endif
  499. this->writeAlignedUnsafe(pointer);
  500. }
  501. };
  502. // 1 / value
  503. inline F32x4 reciprocal(const F32x4 &value) {
  504. #if defined USE_BASIC_SIMD
  505. #if defined USE_SSE2
  506. // Approximate
  507. SIMD_F32x4 lowQ = _mm_rcp_ps(value.v);
  508. // Refine
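// One Newton-Raphson step for the reciprocal: x' = 2*x - value*x*x = x*(2 - value*x), which refines the rough estimate from _mm_rcp_ps.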
  509. return F32x4(SUB_F32_SIMD(ADD_F32_SIMD(lowQ, lowQ), MUL_F32_SIMD(value.v, MUL_F32_SIMD(lowQ, lowQ))));
  510. #elif defined USE_NEON
  511. // Approximate
  512. SIMD_F32x4 result = vrecpeq_f32(value.v);
  513. // Refine
  514. result = MUL_F32_SIMD(vrecpsq_f32(value.v, result), result);
  515. return F32x4(MUL_F32_SIMD(vrecpsq_f32(value.v, result), result));
  516. #else
  517. assert(false);
  518. return F32x4(0);
  519. #endif
  520. #else
  521. return F32x4(1.0f / value.scalars[0], 1.0f / value.scalars[1], 1.0f / value.scalars[2], 1.0f / value.scalars[3]);
  522. #endif
  523. }
  524. // 1 / sqrt(value)
  525. inline F32x4 reciprocalSquareRoot(const F32x4 &value) {
  526. #if defined USE_BASIC_SIMD
  527. #if defined USE_SSE2
  528. SIMD_F32x4 reciRoot = _mm_rsqrt_ps(value.v);
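// Refine the estimate with one Newton-Raphson step for the inverse square root: x' = 0.5 * x * (3 - value * x * x).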
  529. SIMD_F32x4 mul = MUL_F32_SIMD(MUL_F32_SIMD(value.v, reciRoot), reciRoot);
  530. reciRoot = MUL_F32_SIMD(MUL_F32_SIMD(LOAD_SCALAR_F32_SIMD(0.5f), reciRoot), SUB_F32_SIMD(LOAD_SCALAR_F32_SIMD(3.0f), mul));
  531. return F32x4(reciRoot);
  532. #elif defined USE_NEON
  533. // Approximate
  534. SIMD_F32x4 reciRoot = vrsqrteq_f32(value.v);
  535. // Refine
  536. reciRoot = MUL_F32_SIMD(vrsqrtsq_f32(MUL_F32_SIMD(value.v, reciRoot), reciRoot), reciRoot);
  537. return F32x4(reciRoot);
  538. #else
  539. assert(false);
  540. return F32x4(0);
  541. #endif
  542. #else
  543. return F32x4(1.0f / sqrt(value.scalars[0]), 1.0f / sqrt(value.scalars[1]), 1.0f / sqrt(value.scalars[2]), 1.0f / sqrt(value.scalars[3]));
  544. #endif
  545. }
  546. // sqrt(value)
  547. inline F32x4 squareRoot(const F32x4 &value) {
  548. #if defined USE_BASIC_SIMD
  549. #if defined USE_SSE2
  550. SIMD_F32x4 half = LOAD_SCALAR_F32_SIMD(0.5f);
  551. // Approximate
  552. SIMD_F32x4 root = _mm_sqrt_ps(value.v);
  553. // Refine
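// One Heron (Babylonian) iteration: root' = 0.5 * (root + value / root).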
  554. root = _mm_mul_ps(_mm_add_ps(root, _mm_div_ps(value.v, root)), half);
  555. return F32x4(root);
  556. #elif defined USE_NEON
  557. return F32x4(MUL_F32_SIMD(value.v, reciprocalSquareRoot(value).v));
  558. #else
  559. assert(false);
  560. return F32x4(0);
  561. #endif
  562. #else
  563. return F32x4(sqrt(value.scalars[0]), sqrt(value.scalars[1]), sqrt(value.scalars[2]), sqrt(value.scalars[3]));
  564. #endif
  565. }
  566. union I32x4 {
  567. private:
  568. // The uninitialized default constructor is private for safety reasons.
  569. I32x4() {}
  570. public:
  571. // When the uninitialized constructor is needed for performance, use this named constructor instead.
  572. static inline I32x4 create_dangerous_uninitialized() { return I32x4(); }
  573. #if defined USE_BASIC_SIMD
  574. public:
  575. #if defined USE_DIRECT_SIMD_MEMORY_ACCESS
  576. // Only use if USE_DIRECT_SIMD_MEMORY_ACCESS is defined!
  577. // Direct access cannot be done on NEON!
  578. int32_t scalars[4];
  579. #endif
  580. // The SIMD vector of undefined type
  581. // Not accessible while emulating!
  582. SIMD_I32x4 v;
  583. // Construct a portable vector from a native SIMD vector
  584. explicit I32x4(const SIMD_I32x4& v) : v(v) {}
  585. // Construct a portable vector from a set of scalars
  586. I32x4(int32_t a1, int32_t a2, int32_t a3, int32_t a4) : v(LOAD_VECTOR_I32_SIMD(a1, a2, a3, a4)) {}
  587. // Construct a portable vector from a single duplicated scalar
  588. explicit I32x4(int32_t scalar) : v(LOAD_SCALAR_I32_SIMD(scalar)) {}
  589. #else
  590. public:
  591. // Emulate a SIMD vector as an array of scalars without hardware support.
  592. // Only accessible while emulating!
  593. int32_t scalars[4];
  594. // Construct a portable vector from a set of scalars
  595. I32x4(int32_t a1, int32_t a2, int32_t a3, int32_t a4) {
  596. this->scalars[0] = a1;
  597. this->scalars[1] = a2;
  598. this->scalars[2] = a3;
  599. this->scalars[3] = a4;
  600. }
  601. // Construct a portable vector from a single duplicated scalar
  602. explicit I32x4(int32_t scalar) {
  603. this->scalars[0] = scalar;
  604. this->scalars[1] = scalar;
  605. this->scalars[2] = scalar;
  606. this->scalars[3] = scalar;
  607. }
  608. #endif
  609. // Create a gradient vector using start and increment, so that arbitrary length vectors have a way to initialize linear iterations.
  610. static inline I32x4 createGradient(int32_t start, int32_t increment) {
  611. return I32x4(start, start + increment, start + increment * 2, start + increment * 3);
  612. }
  613. // Construct a portable SIMD vector from a pointer to aligned data.
  614. // Data must be aligned with at least 16 bytes.
  615. static inline I32x4 readAlignedUnsafe(const int32_t* data) {
  616. #ifdef SAFE_POINTER_CHECKS
  617. if (uintptr_t(data) & 15u) { throwError(U"Unaligned pointer detected in I32x4::readAlignedUnsafe!\n"); }
  618. #endif
  619. #if defined USE_BASIC_SIMD
  620. #if defined USE_SSE2
  621. return I32x4(_mm_load_si128((const __m128i*)data));
  622. #elif defined USE_NEON
  623. return I32x4(vld1q_s32(data));
  624. #endif
  625. #else
  626. return I32x4(data[0], data[1], data[2], data[3]);
  627. #endif
  628. }
  629. // Write to aligned memory from the existing vector
  630. // Data must be aligned with at least 16 bytes.
  631. inline void writeAlignedUnsafe(int32_t* data) const {
  632. #ifdef SAFE_POINTER_CHECKS
  633. if (uintptr_t(data) & 15u) { throwError(U"Unaligned pointer detected in I32x4::writeAlignedUnsafe!\n"); }
  634. #endif
  635. #if defined USE_BASIC_SIMD
  636. #if defined USE_SSE2
  637. _mm_store_si128((__m128i*)data, this->v);
  638. #elif defined USE_NEON
  639. vst1q_s32(data, this->v);
  640. #endif
  641. #else
  642. data[0] = this->scalars[0];
  643. data[1] = this->scalars[1];
  644. data[2] = this->scalars[2];
  645. data[3] = this->scalars[3];
  646. #endif
  647. }
  648. #if defined DFPSR_GEOMETRY_IVECTOR
  649. dsr::IVector4D get() const {
  650. int32_t data[4] ALIGN16;
  651. #ifdef SAFE_POINTER_CHECKS
  652. if (uintptr_t(data) & 15u) { throwError(U"Unaligned stack memory detected in IVector4D I32x4::get!\n"); }
  653. #endif
  654. this->writeAlignedUnsafe(data);
  655. return dsr::IVector4D(data[0], data[1], data[2], data[3]);
  656. }
  657. #endif
  658. // Bound and alignment checked reading
  659. static inline I32x4 readAligned(dsr::SafePointer<const int32_t> data, const char* methodName) {
  660. const int32_t* pointer = data.getUnsafe();
  661. #if defined SAFE_POINTER_CHECKS
  662. data.assertInside(methodName, pointer, 16);
  663. #endif
  664. return I32x4::readAlignedUnsafe(pointer);
  665. }
  666. // Bound and alignment checked writing
  667. inline void writeAligned(dsr::SafePointer<int32_t> data, const char* methodName) const {
  668. int32_t* pointer = data.getUnsafe();
  669. #if defined SAFE_POINTER_CHECKS
  670. data.assertInside(methodName, pointer, 16);
  671. #endif
  672. this->writeAlignedUnsafe(pointer);
  673. }
  674. };
  675. union U32x4 {
  676. #if defined USE_BASIC_SIMD
  677. public:
  678. #if defined USE_DIRECT_SIMD_MEMORY_ACCESS
  679. // Only use if USE_DIRECT_SIMD_MEMORY_ACCESS is defined!
  680. // Direct access cannot be done on NEON!
  681. uint32_t scalars[4];
  682. #endif
  683. // The SIMD vector of undefined type
  684. // Not accessible while emulating!
  685. SIMD_U32x4 v;
  686. // Construct a portable vector from a native SIMD vector
  687. explicit U32x4(const SIMD_U32x4& v) : v(v) {}
  688. // Construct a portable vector from a set of scalars
  689. U32x4(uint32_t a1, uint32_t a2, uint32_t a3, uint32_t a4) : v(LOAD_VECTOR_U32_SIMD(a1, a2, a3, a4)) {}
  690. // Construct a portable vector from a single duplicated scalar
  691. explicit U32x4(uint32_t scalar) : v(LOAD_SCALAR_U32_SIMD(scalar)) {}
  692. #else
  693. public:
  694. // Emulate a SIMD vector as an array of scalars without hardware support.
  695. // Only accessible while emulating!
  696. uint32_t scalars[4];
  697. // Construct a portable vector from a set of scalars
  698. U32x4(uint32_t a1, uint32_t a2, uint32_t a3, uint32_t a4) {
  699. this->scalars[0] = a1;
  700. this->scalars[1] = a2;
  701. this->scalars[2] = a3;
  702. this->scalars[3] = a4;
  703. }
  704. // Construct a portable vector from a single duplicated scalar
  705. explicit U32x4(uint32_t scalar) {
  706. this->scalars[0] = scalar;
  707. this->scalars[1] = scalar;
  708. this->scalars[2] = scalar;
  709. this->scalars[3] = scalar;
  710. }
  711. #endif
  712. // Create a gradient vector using start and increment, so that arbitrary length vectors have a way to initialize linear iterations.
  713. static inline U32x4 createGradient(uint32_t start, uint32_t increment) {
  714. return U32x4(start, start + increment, start + increment * 2, start + increment * 3);
  715. }
  716. // Construct a portable SIMD vector from a pointer to aligned data
  717. // Data must be aligned with at least 16 bytes.
  718. static inline U32x4 readAlignedUnsafe(const uint32_t* data) {
  719. #ifdef SAFE_POINTER_CHECKS
  720. if (uintptr_t(data) & 15u) { throwError(U"Unaligned pointer detected in U32x4::readAlignedUnsafe!\n"); }
  721. #endif
  722. #if defined USE_BASIC_SIMD
  723. #if defined USE_SSE2
  724. return U32x4(_mm_load_si128((const __m128i*)data));
  725. #elif defined USE_NEON
  726. return U32x4(vld1q_u32(data));
  727. #endif
  728. #else
  729. return U32x4(data[0], data[1], data[2], data[3]);
  730. #endif
  731. }
  732. // Write to aligned memory from the existing vector
  733. // Data must be aligned with at least 16 bytes.
  734. inline void writeAlignedUnsafe(uint32_t* data) const {
  735. #ifdef SAFE_POINTER_CHECKS
  736. if (uintptr_t(data) & 15u) { throwError(U"Unaligned pointer detected in U32x4::writeAlignedUnsafe!\n"); }
  737. #endif
  738. #if defined USE_BASIC_SIMD
  739. #if defined USE_SSE2
  740. _mm_store_si128((__m128i*)data, this->v);
  741. #elif defined USE_NEON
  742. vst1q_u32(data, this->v);
  743. #endif
  744. #else
  745. data[0] = this->scalars[0];
  746. data[1] = this->scalars[1];
  747. data[2] = this->scalars[2];
  748. data[3] = this->scalars[3];
  749. #endif
  750. }
  751. #if defined DFPSR_GEOMETRY_UVECTOR
  752. dsr::UVector4D get() const {
  753. uint32_t data[4] ALIGN16;
  754. #ifdef SAFE_POINTER_CHECKS
  755. if (uintptr_t(data) & 15u) { throwError(U"Unaligned stack memory detected in UVector4D U32x4::get!\n"); }
  756. #endif
  757. this->writeAlignedUnsafe(data);
  758. return dsr::UVector4D(data[0], data[1], data[2], data[3]);
  759. }
  760. #endif
  761. // Bound and alignment checked reading
  762. static inline U32x4 readAligned(dsr::SafePointer<const uint32_t> data, const char* methodName) {
  763. const uint32_t* pointer = data.getUnsafe();
  764. #if defined SAFE_POINTER_CHECKS
  765. data.assertInside(methodName, pointer, 16);
  766. #endif
  767. return U32x4::readAlignedUnsafe(pointer);
  768. }
  769. // Bound and alignment checked writing
  770. inline void writeAligned(dsr::SafePointer<uint32_t> data, const char* methodName) const {
  771. uint32_t* pointer = data.getUnsafe();
  772. #if defined SAFE_POINTER_CHECKS
  773. data.assertInside(methodName, pointer, 16);
  774. #endif
  775. this->writeAlignedUnsafe(pointer);
  776. }
  777. };
  778. union U16x8 {
  779. private:
  780. // The uninitialized default constructor is private for safety reasons.
  781. U16x8() {}
  782. public:
  783. // When the uninitialized constructor is needed for performance, use this named constructor instead.
  784. static inline U16x8 create_dangerous_uninitialized() { return U16x8(); }
  785. #if defined USE_BASIC_SIMD
  786. public:
  787. #if defined USE_DIRECT_SIMD_MEMORY_ACCESS
  788. // Only use if USE_DIRECT_SIMD_MEMORY_ACCESS is defined!
  789. // Direct access cannot be done on NEON!
  790. uint16_t scalars[8];
  791. #endif
  792. // The SIMD vector of undefined type
  793. // Not accessible while emulating!
  794. SIMD_U16x8 v;
  795. // Construct a portable vector from a native SIMD vector
  796. explicit U16x8(const SIMD_U16x8& v) : v(v) {}
  797. // Construct a vector of 8 x 16-bit unsigned integers from a vector of 4 x 32-bit unsigned integers
  798. // Reinterpret casting is used
  799. explicit U16x8(const U32x4& vector) : v(REINTERPRET_U32_TO_U16_SIMD(vector.v)) {}
  800. // Construct a portable vector from a set of scalars
  801. U16x8(uint16_t a1, uint16_t a2, uint16_t a3, uint16_t a4, uint16_t a5, uint16_t a6, uint16_t a7, uint16_t a8) : v(LOAD_VECTOR_U16_SIMD(a1, a2, a3, a4, a5, a6, a7, a8)) {}
  802. // Construct a vector of 8 x 16-bit unsigned integers from a single duplicated 32-bit unsigned integer
  803. // Reinterpret casting is used
  804. // TODO: Remove all reinterprets from constructors to improve readability
  805. explicit U16x8(uint32_t scalar) : v(REINTERPRET_U32_TO_U16_SIMD(LOAD_SCALAR_U32_SIMD(scalar))) {}
  806. // Construct a portable vector from a single duplicated scalar
  807. explicit U16x8(uint16_t scalar) : v(LOAD_SCALAR_U16_SIMD(scalar)) {}
  808. // Reinterpret cast to a vector of 4 x 32-bit unsigned integers
  809. U32x4 get_U32() const {
  810. return U32x4(REINTERPRET_U16_TO_U32_SIMD(this->v));
  811. }
  812. #else
  813. public:
  814. // Emulate a SIMD vector as an array of scalars without hardware support.
  815. // Only accessible while emulating!
  816. uint16_t scalars[8];
  817. // Construct a vector of 8 x 16-bit unsigned integers from a vector of 4 x 32-bit unsigned integers
  818. // Reinterpret casting is used
  819. explicit U16x8(const U32x4& vector) {
  820. uint64_t *target = (uint64_t*)this->scalars;
  821. uint64_t *source = (uint64_t*)vector.scalars;
  822. target[0] = source[0];
  823. target[1] = source[1];
  824. }
  825. // Construct a portable vector from a set of scalars
  826. U16x8(uint16_t a1, uint16_t a2, uint16_t a3, uint16_t a4, uint16_t a5, uint16_t a6, uint16_t a7, uint16_t a8) {
  827. this->scalars[0] = a1;
  828. this->scalars[1] = a2;
  829. this->scalars[2] = a3;
  830. this->scalars[3] = a4;
  831. this->scalars[4] = a5;
  832. this->scalars[5] = a6;
  833. this->scalars[6] = a7;
  834. this->scalars[7] = a8;
  835. }
  836. // Construct a vector of 8 x 16-bit unsigned integers from a single duplicated 32-bit unsigned integer
  837. // Reinterpret casting is used
  838. explicit U16x8(uint32_t scalar) {
  839. uint32_t *target = (uint32_t*)this->scalars;
  840. target[0] = scalar;
  841. target[1] = scalar;
  842. target[2] = scalar;
  843. target[3] = scalar;
  844. }
  845. // Construct a portable vector from a single duplicated scalar
  846. explicit U16x8(uint16_t scalar) {
  847. this->scalars[0] = scalar;
  848. this->scalars[1] = scalar;
  849. this->scalars[2] = scalar;
  850. this->scalars[3] = scalar;
  851. this->scalars[4] = scalar;
  852. this->scalars[5] = scalar;
  853. this->scalars[6] = scalar;
  854. this->scalars[7] = scalar;
  855. }
  856. // Reinterpret cast to a vector of 4 x 32-bit unsigned integers
  857. U32x4 get_U32() const {
  858. U32x4 result(0);
  859. uint64_t *target = (uint64_t*)result.scalars;
  860. uint64_t *source = (uint64_t*)this->scalars;
  861. target[0] = source[0];
  862. target[1] = source[1];
  863. return result;
  864. }
  865. #endif
  866. // Create a gradient vector using start and increment, so that arbitrary length vectors have a way to initialize linear iterations.
  867. static inline U16x8 createGradient(uint16_t start, uint16_t increment) {
  868. return U16x8(
  869. start,
  870. start + increment,
  871. start + increment * 2,
  872. start + increment * 3,
  873. start + increment * 4,
  874. start + increment * 5,
  875. start + increment * 6,
  876. start + increment * 7
  877. );
  878. }
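// Illustrative sketch of how createGradient seeds a strided loop, so that each
// iteration handles 8 columns at once. The width variable is hypothetical.
//
//     for (uint16_t x = 0; x < width; x += 8) {
//         U16x8 columnIndex = U16x8::createGradient(x, 1);   // x, x+1, ..., x+7
//         // ... process 8 columns per iteration ...
//     }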
  879. static inline U16x8 readAlignedUnsafe(const uint16_t* data) {
  880. #ifdef SAFE_POINTER_CHECKS
  881. if (uintptr_t(data) & 15u) { throwError(U"Unaligned pointer detected in U16x8::readAlignedUnsafe!\n"); }
  882. #endif
  883. #if defined USE_BASIC_SIMD
  884. #if defined USE_SSE2
  885. return U16x8(_mm_load_si128((const __m128i*)data));
  886. #elif defined USE_NEON
  887. return U16x8(vld1q_u16(data));
  888. #endif
  889. #else
  890. return U16x8(data[0], data[1], data[2], data[3], data[4], data[5], data[6], data[7]);
  891. #endif
  892. }
  893. // Data must be aligned with at least 16 bytes.
  894. inline void writeAlignedUnsafe(uint16_t* data) const {
  895. #ifdef SAFE_POINTER_CHECKS
  896. if (uintptr_t(data) & 15u) { throwError(U"Unaligned pointer detected in U16x8::writeAlignedUnsafe!\n"); }
  897. #endif
  898. #if defined USE_BASIC_SIMD
  899. #if defined USE_SSE2
  900. _mm_store_si128((__m128i*)data, this->v);
  901. #elif defined USE_NEON
  902. vst1q_u16(data, this->v);
  903. #endif
  904. #else
  905. data[0] = this->scalars[0];
  906. data[1] = this->scalars[1];
  907. data[2] = this->scalars[2];
  908. data[3] = this->scalars[3];
  909. data[4] = this->scalars[4];
  910. data[5] = this->scalars[5];
  911. data[6] = this->scalars[6];
  912. data[7] = this->scalars[7];
  913. #endif
  914. }
  915. // Bound and alignment checked reading
  916. static inline U16x8 readAligned(dsr::SafePointer<const uint16_t> data, const char* methodName) {
  917. const uint16_t* pointer = data.getUnsafe();
  918. #if defined SAFE_POINTER_CHECKS
  919. data.assertInside(methodName, pointer, 16);
  920. #endif
  921. return U16x8::readAlignedUnsafe(pointer);
  922. }
  923. // Bound and alignment checked writing
  924. inline void writeAligned(dsr::SafePointer<uint16_t> data, const char* methodName) const {
  925. uint16_t* pointer = data.getUnsafe();
  926. #if defined SAFE_POINTER_CHECKS
  927. data.assertInside(methodName, pointer, 16);
  928. #endif
  929. this->writeAlignedUnsafe(pointer);
  930. }
  931. };
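// Illustrative sketch: U16x8 and U32x4 share the same 128 bits, so the conversions
// above are pure reinterpret casts with no lane shuffling. Variable names are hypothetical.
//
//     U32x4 pixels(0x00FF00FFu);
//     U16x8 channels(pixels);          // view 4 x 32-bit as 8 x 16-bit
//     U32x4 restored = channels.get_U32();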
  932. union U8x16 {
  933. private:
  934. // The uninitialized default constructor is private for safety reasons.
  935. U8x16() {}
  936. public:
  937. // When the uninitialized constructor is needed for performance, use this named constructor instead.
  938. static inline U8x16 create_dangerous_uninitialized() { return U8x16(); }
  939. #if defined USE_BASIC_SIMD
  940. public:
  941. #if defined USE_DIRECT_SIMD_MEMORY_ACCESS
  942. // Only use if USE_DIRECT_SIMD_MEMORY_ACCESS is defined!
  943. // Direct access cannot be done on NEON!
  944. uint8_t scalars[16];
  945. #endif
  946. // The SIMD vector of undefined type
  947. // Not accessible while emulating!
  948. SIMD_U8x16 v;
  949. // Construct a portable vector from a native SIMD vector
  950. explicit U8x16(const SIMD_U8x16& v) : v(v) {}
  951. // Construct a portable vector from a set of scalars
  952. U8x16(uint8_t a1, uint8_t a2, uint8_t a3, uint8_t a4, uint8_t a5, uint8_t a6, uint8_t a7, uint8_t a8,
  953. uint8_t a9, uint8_t a10, uint8_t a11, uint8_t a12, uint8_t a13, uint8_t a14, uint8_t a15, uint8_t a16)
  954. : v(LOAD_VECTOR_U8_SIMD(a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11, a12, a13, a14, a15, a16)) {}
  955. // Construct a portable vector from a single duplicated scalar
  956. explicit U8x16(uint8_t scalar) : v(LOAD_SCALAR_U8_SIMD(scalar)) {}
  957. #else
  958. public:
  959. // Emulate a SIMD vector as an array of scalars without hardware support.
  960. // Only accessible while emulating!
  961. uint8_t scalars[16];
  962. // Construct a portable vector from a set of scalars
  963. U8x16(uint8_t a1, uint8_t a2, uint8_t a3, uint8_t a4, uint8_t a5, uint8_t a6, uint8_t a7, uint8_t a8,
  964. uint8_t a9, uint8_t a10, uint8_t a11, uint8_t a12, uint8_t a13, uint8_t a14, uint8_t a15, uint8_t a16) {
  965. this->scalars[0] = a1;
  966. this->scalars[1] = a2;
  967. this->scalars[2] = a3;
  968. this->scalars[3] = a4;
  969. this->scalars[4] = a5;
  970. this->scalars[5] = a6;
  971. this->scalars[6] = a7;
  972. this->scalars[7] = a8;
  973. this->scalars[8] = a9;
  974. this->scalars[9] = a10;
  975. this->scalars[10] = a11;
  976. this->scalars[11] = a12;
  977. this->scalars[12] = a13;
  978. this->scalars[13] = a14;
  979. this->scalars[14] = a15;
  980. this->scalars[15] = a16;
  981. }
  982. // Construct a portable vector from a single duplicated scalar
  983. explicit U8x16(uint8_t scalar) {
  984. this->scalars[0] = scalar;
  985. this->scalars[1] = scalar;
  986. this->scalars[2] = scalar;
  987. this->scalars[3] = scalar;
  988. this->scalars[4] = scalar;
  989. this->scalars[5] = scalar;
  990. this->scalars[6] = scalar;
  991. this->scalars[7] = scalar;
  992. this->scalars[8] = scalar;
  993. this->scalars[9] = scalar;
  994. this->scalars[10] = scalar;
  995. this->scalars[11] = scalar;
  996. this->scalars[12] = scalar;
  997. this->scalars[13] = scalar;
  998. this->scalars[14] = scalar;
  999. this->scalars[15] = scalar;
  1000. }
  1001. #endif
  1002. // Create a gradient vector using start and increment, so that arbitrary length vectors have a way to initialize linear iterations.
  1003. static inline U8x16 createGradient(uint8_t start, uint8_t increment) {
  1004. return U8x16(
  1005. start,
  1006. start + increment,
  1007. start + increment * 2,
  1008. start + increment * 3,
  1009. start + increment * 4,
  1010. start + increment * 5,
  1011. start + increment * 6,
  1012. start + increment * 7,
  1013. start + increment * 8,
  1014. start + increment * 9,
  1015. start + increment * 10,
  1016. start + increment * 11,
  1017. start + increment * 12,
  1018. start + increment * 13,
  1019. start + increment * 14,
  1020. start + increment * 15
  1021. );
  1022. }
  1023. static inline U8x16 readAlignedUnsafe(const uint8_t* data) {
  1024. #ifdef SAFE_POINTER_CHECKS
  1025. if (uintptr_t(data) & 15u) { throwError(U"Unaligned pointer detected in U8x16::readAlignedUnsafe!\n"); }
  1026. #endif
  1027. #if defined USE_BASIC_SIMD
  1028. #if defined USE_SSE2
  1029. return U8x16(_mm_load_si128((const __m128i*)data));
  1030. #elif defined USE_NEON
  1031. return U8x16(vld1q_u8(data));
  1032. #endif
  1033. #else
  1034. return U8x16(
  1035. data[0], data[1], data[2], data[3], data[4], data[5], data[6], data[7],
  1036. data[8], data[9], data[10], data[11], data[12], data[13], data[14], data[15]
  1037. );
  1038. #endif
  1039. }
  1040. // Data must be aligned with at least 16 bytes.
  1041. inline void writeAlignedUnsafe(uint8_t* data) const {
  1042. #ifdef SAFE_POINTER_CHECKS
  1043. if (uintptr_t(data) & 15u) { throwError(U"Unaligned pointer detected in U8x16::writeAlignedUnsafe!\n"); }
  1044. #endif
  1045. #if defined USE_BASIC_SIMD
  1046. #if defined USE_SSE2
  1047. _mm_store_si128((__m128i*)data, this->v);
  1048. #elif defined USE_NEON
  1049. vst1q_u8(data, this->v);
  1050. #endif
  1051. #else
  1052. data[0] = this->scalars[0];
  1053. data[1] = this->scalars[1];
  1054. data[2] = this->scalars[2];
  1055. data[3] = this->scalars[3];
  1056. data[4] = this->scalars[4];
  1057. data[5] = this->scalars[5];
  1058. data[6] = this->scalars[6];
  1059. data[7] = this->scalars[7];
  1060. data[8] = this->scalars[8];
  1061. data[9] = this->scalars[9];
  1062. data[10] = this->scalars[10];
  1063. data[11] = this->scalars[11];
  1064. data[12] = this->scalars[12];
  1065. data[13] = this->scalars[13];
  1066. data[14] = this->scalars[14];
  1067. data[15] = this->scalars[15];
  1068. #endif
  1069. }
  1070. // Bound and alignment checked reading
  1071. static inline U8x16 readAligned(dsr::SafePointer<const uint8_t> data, const char* methodName) {
  1072. const uint8_t* pointer = data.getUnsafe();
  1073. #if defined SAFE_POINTER_CHECKS
  1074. data.assertInside(methodName, pointer, 16);
  1075. #endif
  1076. return U8x16::readAlignedUnsafe(pointer);
  1077. }
  1078. // Bound and alignment checked writing
  1079. inline void writeAligned(dsr::SafePointer<uint8_t> data, const char* methodName) const {
  1080. uint8_t* pointer = data.getUnsafe();
  1081. #if defined SAFE_POINTER_CHECKS
  1082. data.assertInside(methodName, pointer, 16);
  1083. #endif
  1084. this->writeAlignedUnsafe(pointer);
  1085. }
  1086. };
  1087. union F32x8 {
  1088. private:
  1089. // The uninitialized default constructor is private for safety reasons.
  1090. F32x8() {}
  1091. public:
  1092. // When the uninitialized constructor is needed for performance, use this named constructor instead.
  1093. static inline F32x8 create_dangerous_uninitialized() { return F32x8(); }
  1094. #if defined USE_256BIT_F_SIMD
  1095. public:
  1096. #if defined USE_DIRECT_SIMD_MEMORY_ACCESS
  1097. // Only use if USE_DIRECT_SIMD_MEMORY_ACCESS is defined!
  1098. float scalars[8];
  1099. #endif
  1100. // The SIMD vector of undefined type
  1101. // Not accessible while emulating!
  1102. SIMD_F32x8 v;
  1103. // Construct a portable vector from a native SIMD vector
  1104. explicit F32x8(const SIMD_F32x8& v) : v(v) {}
  1105. // Construct a portable vector from a set of scalars
  1106. F32x8(float a1, float a2, float a3, float a4, float a5, float a6, float a7, float a8)
  1107. : v(LOAD_VECTOR_F32_SIMD256(a1, a2, a3, a4, a5, a6, a7, a8)) {}
  1108. // Construct a portable vector from a single duplicated scalar
  1109. explicit F32x8(float scalar) : v(LOAD_SCALAR_F32_SIMD256(scalar)) {}
  1110. #else
  1111. public:
  1112. // Emulate a SIMD vector as an array of scalars without hardware support.
  1113. // Only accessible while emulating!
  1114. float scalars[8];
  1115. // Construct a portable vector from a set of scalars
  1116. F32x8(float a1, float a2, float a3, float a4, float a5, float a6, float a7, float a8) {
  1117. this->scalars[0] = a1;
  1118. this->scalars[1] = a2;
  1119. this->scalars[2] = a3;
  1120. this->scalars[3] = a4;
  1121. this->scalars[4] = a5;
  1122. this->scalars[5] = a6;
  1123. this->scalars[6] = a7;
  1124. this->scalars[7] = a8;
  1125. }
  1126. // Construct a portable vector from a single duplicated scalar
  1127. explicit F32x8(float scalar) {
  1128. this->scalars[0] = scalar;
  1129. this->scalars[1] = scalar;
  1130. this->scalars[2] = scalar;
  1131. this->scalars[3] = scalar;
  1132. this->scalars[4] = scalar;
  1133. this->scalars[5] = scalar;
  1134. this->scalars[6] = scalar;
  1135. this->scalars[7] = scalar;
  1136. }
  1137. #endif
  1138. // Create a gradient vector using start and increment, so that arbitrary length vectors have a way to initialize linear iterations.
  1139. static inline F32x8 createGradient(float start, float increment) {
  1140. return F32x8(
  1141. start,
  1142. start + increment,
  1143. start + increment * 2.0f,
  1144. start + increment * 3.0f,
  1145. start + increment * 4.0f,
  1146. start + increment * 5.0f,
  1147. start + increment * 6.0f,
  1148. start + increment * 7.0f
  1149. );
  1150. }
  1151. // Construct a portable SIMD vector from a pointer to aligned data.
  1152. static inline F32x8 readAlignedUnsafe(const float* data) {
  1153. #ifdef SAFE_POINTER_CHECKS
  1154. if (uintptr_t(data) & 31u) { throwError(U"Unaligned pointer detected in F32x8::readAlignedUnsafe!\n"); }
  1155. #endif
  1156. #if defined USE_AVX2
  1157. return F32x8(_mm256_load_ps(data));
  1158. #else
  1159. return F32x8(data[0], data[1], data[2], data[3], data[4], data[5], data[6], data[7]);
  1160. #endif
  1161. }
  1162. // Write to aligned memory from the existing vector.
  1163. inline void writeAlignedUnsafe(float* data) const {
  1164. #ifdef SAFE_POINTER_CHECKS
  1165. if (uintptr_t(data) & 31u) { throwError(U"Unaligned pointer detected in F32x8::writeAlignedUnsafe!\n"); }
  1166. #endif
  1167. #if defined USE_AVX2
  1168. _mm256_store_ps(data, this->v);
  1169. #else
  1170. data[0] = this->scalars[0];
  1171. data[1] = this->scalars[1];
  1172. data[2] = this->scalars[2];
  1173. data[3] = this->scalars[3];
  1174. data[4] = this->scalars[4];
  1175. data[5] = this->scalars[5];
  1176. data[6] = this->scalars[6];
  1177. data[7] = this->scalars[7];
  1178. #endif
  1179. }
  1180. // Bound and alignment checked reading
  1181. static inline F32x8 readAligned(dsr::SafePointer<const float> data, const char* methodName) {
  1182. const float* pointer = data.getUnsafe();
  1183. #if defined SAFE_POINTER_CHECKS
  1184. data.assertInside(methodName, pointer, 32);
  1185. #endif
  1186. return F32x8::readAlignedUnsafe(pointer);
  1187. }
  1188. // Bound and alignment checked writing
  1189. inline void writeAligned(dsr::SafePointer<float> data, const char* methodName) const {
  1190. float* pointer = data.getUnsafe();
  1191. #if defined SAFE_POINTER_CHECKS
  1192. data.assertInside(methodName, pointer, 32);
  1193. #endif
  1194. this->writeAlignedUnsafe(pointer);
  1195. }
  1196. };
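// Illustrative sketch: the 256-bit types require 32-byte alignment, twice that of the
// 128-bit types above. Assumes an ALIGN_BYTES(32) attribute, like the ALIGN_BYTES macro
// used by the helper macros further down in this header.
//
//     ALIGN_BYTES(32) float buffer[8];
//     F32x8 weights(0.125f);
//     weights.writeAlignedUnsafe(buffer);   // requires 32-byte alignment
//     F32x8 copy = F32x8::readAlignedUnsafe(buffer);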
  1197. // 1 / value
  1198. inline F32x8 reciprocal(const F32x8 &value) {
  1199. #if defined USE_AVX2
  1200. // Approximate
  1201. SIMD_F32x8 lowQ = _mm256_rcp_ps(value.v);
  1202. // Refine
  1203. return F32x8(SUB_F32_SIMD256(ADD_F32_SIMD256(lowQ, lowQ), MUL_F32_SIMD256(value.v, MUL_F32_SIMD256(lowQ, lowQ))));
  1204. #else
  1205. return F32x8(
  1206. 1.0f / value.scalars[0],
  1207. 1.0f / value.scalars[1],
  1208. 1.0f / value.scalars[2],
  1209. 1.0f / value.scalars[3],
  1210. 1.0f / value.scalars[4],
  1211. 1.0f / value.scalars[5],
  1212. 1.0f / value.scalars[6],
  1213. 1.0f / value.scalars[7]
  1214. );
  1215. #endif
  1216. }
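// The AVX2 branch above refines the roughly 12-bit _mm256_rcp_ps estimate with one
// Newton-Raphson step for 1/v:
//     x1 = x0 * (2 - v * x0) = 2 * x0 - v * x0 * x0
// which is exactly the SUB_F32_SIMD256(ADD_F32_SIMD256(lowQ, lowQ), ...) expression.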
  1217. // 1 / sqrt(value)
  1218. inline F32x8 reciprocalSquareRoot(const F32x8 &value) {
  1219. #if defined USE_AVX2
  1220. SIMD_F32x8 reciRoot = _mm256_rsqrt_ps(value.v);
  1221. SIMD_F32x8 mul = MUL_F32_SIMD256(MUL_F32_SIMD256(value.v, reciRoot), reciRoot);
  1222. reciRoot = MUL_F32_SIMD256(MUL_F32_SIMD256(LOAD_SCALAR_F32_SIMD256(0.5f), reciRoot), SUB_F32_SIMD256(LOAD_SCALAR_F32_SIMD256(3.0f), mul));
  1223. return F32x8(reciRoot);
  1224. #else
  1225. return F32x8(
  1226. 1.0f / sqrt(value.scalars[0]),
  1227. 1.0f / sqrt(value.scalars[1]),
  1228. 1.0f / sqrt(value.scalars[2]),
  1229. 1.0f / sqrt(value.scalars[3]),
  1230. 1.0f / sqrt(value.scalars[4]),
  1231. 1.0f / sqrt(value.scalars[5]),
  1232. 1.0f / sqrt(value.scalars[6]),
  1233. 1.0f / sqrt(value.scalars[7])
  1234. );
  1235. #endif
  1236. }
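// The AVX2 branch above refines _mm256_rsqrt_ps with one Newton-Raphson step for 1/sqrt(v):
//     x1 = 0.5 * x0 * (3 - v * x0 * x0)
// where mul holds v * reciRoot * reciRoot before the final multiplication.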
  1237. // sqrt(value)
  1238. inline F32x8 squareRoot(const F32x8 &value) {
  1239. #if defined USE_AVX2
  1240. SIMD_F32x8 half = LOAD_SCALAR_F32_SIMD256(0.5f);
  1241. // Approximate
  1242. SIMD_F32x8 root = _mm256_sqrt_ps(value.v);
  1243. // Refine
  1244. root = _mm256_mul_ps(_mm256_add_ps(root, _mm256_div_ps(value.v, root)), half);
  1245. return F32x8(root);
  1246. #else
  1247. return F32x8(
  1248. sqrt(value.scalars[0]),
  1249. sqrt(value.scalars[1]),
  1250. sqrt(value.scalars[2]),
  1251. sqrt(value.scalars[3]),
  1252. sqrt(value.scalars[4]),
  1253. sqrt(value.scalars[5]),
  1254. sqrt(value.scalars[6]),
  1255. sqrt(value.scalars[7]));
  1256. #endif
  1257. }
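// The refinement above is one Heron/Newton step for sqrt(v):
//     root' = 0.5 * (root + v / root)
// applied to the result of _mm256_sqrt_ps before returning it.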
  1258. union I32x8 {
  1259. private:
  1260. // The uninitialized default constructor is private for safety reasons.
  1261. I32x8() {}
  1262. public:
  1263. // When the uninitialized constructor is needed for performance, use this named constructor instead.
  1264. static inline I32x8 create_dangerous_uninitialized() { return I32x8(); }
  1265. #if defined USE_256BIT_X_SIMD
  1266. public:
  1267. #if defined USE_DIRECT_SIMD_MEMORY_ACCESS
  1268. // Only use if USE_DIRECT_SIMD_MEMORY_ACCESS is defined!
  1269. int32_t scalars[8];
  1270. #endif
  1271. // The SIMD vector of undefined type
  1272. // Not accessible while emulating!
  1273. SIMD_I32x8 v;
  1274. // Construct a portable vector from a native SIMD vector
  1275. explicit I32x8(const SIMD_I32x8& v) : v(v) {}
  1276. // Construct a portable vector from a set of scalars
  1277. I32x8(int32_t a1, int32_t a2, int32_t a3, int32_t a4, int32_t a5, int32_t a6, int32_t a7, int32_t a8)
  1278. : v(LOAD_VECTOR_I32_SIMD256(a1, a2, a3, a4, a5, a6, a7, a8)) {}
  1279. // Construct a portable vector from a single duplicated scalar
  1280. explicit I32x8(int32_t scalar) : v(LOAD_SCALAR_I32_SIMD256(scalar)) {}
  1281. #else
  1282. public:
  1283. // Emulate a SIMD vector as an array of scalars without hardware support.
  1284. // Only accessible while emulating!
  1285. int32_t scalars[8];
  1286. // Construct a portable vector from a set of scalars
  1287. I32x8(int32_t a1, int32_t a2, int32_t a3, int32_t a4, int32_t a5, int32_t a6, int32_t a7, int32_t a8) {
  1288. this->scalars[0] = a1;
  1289. this->scalars[1] = a2;
  1290. this->scalars[2] = a3;
  1291. this->scalars[3] = a4;
  1292. this->scalars[4] = a5;
  1293. this->scalars[5] = a6;
  1294. this->scalars[6] = a7;
  1295. this->scalars[7] = a8;
  1296. }
  1297. // Construct a portable vector from a single duplicated scalar
  1298. explicit I32x8(int32_t scalar) {
  1299. this->scalars[0] = scalar;
  1300. this->scalars[1] = scalar;
  1301. this->scalars[2] = scalar;
  1302. this->scalars[3] = scalar;
  1303. this->scalars[4] = scalar;
  1304. this->scalars[5] = scalar;
  1305. this->scalars[6] = scalar;
  1306. this->scalars[7] = scalar;
  1307. }
  1308. #endif
  1309. // Create a gradient vector using start and increment, so that arbitrary length vectors have a way to initialize linear iterations.
  1310. static inline I32x8 createGradient(int32_t start, int32_t increment) {
  1311. return I32x8(
  1312. start,
  1313. start + increment,
  1314. start + increment * 2,
  1315. start + increment * 3,
  1316. start + increment * 4,
  1317. start + increment * 5,
  1318. start + increment * 6,
  1319. start + increment * 7
  1320. );
  1321. }
  1322. // Construct a portable SIMD vector from a pointer to aligned data.
  1323. static inline I32x8 readAlignedUnsafe(const int32_t* data) {
  1324. #ifdef SAFE_POINTER_CHECKS
  1325. if (uintptr_t(data) & 31u) { throwError(U"Unaligned pointer detected in I32x8::readAlignedUnsafe!\n"); }
  1326. #endif
  1327. #if defined USE_AVX2
  1328. return I32x8(_mm256_load_si256((const __m256i*)data));
  1329. #else
  1330. return I32x8(data[0], data[1], data[2], data[3], data[4], data[5], data[6], data[7]);
  1331. #endif
  1332. }
  1333. // Write to aligned memory from the existing vector.
  1334. inline void writeAlignedUnsafe(int32_t* data) const {
  1335. #ifdef SAFE_POINTER_CHECKS
  1336. if (uintptr_t(data) & 31u) { throwError(U"Unaligned pointer detected in I32x8::writeAlignedUnsafe!\n"); }
  1337. #endif
  1338. #if defined USE_AVX2
  1339. _mm256_store_si256((__m256i*)data, this->v);
  1340. #else
  1341. data[0] = this->scalars[0];
  1342. data[1] = this->scalars[1];
  1343. data[2] = this->scalars[2];
  1344. data[3] = this->scalars[3];
  1345. data[4] = this->scalars[4];
  1346. data[5] = this->scalars[5];
  1347. data[6] = this->scalars[6];
  1348. data[7] = this->scalars[7];
  1349. #endif
  1350. }
  1351. // Bound and alignment checked reading
  1352. static inline I32x8 readAligned(dsr::SafePointer<const int32_t> data, const char* methodName) {
  1353. const int32_t* pointer = data.getUnsafe();
  1354. #if defined SAFE_POINTER_CHECKS
  1355. data.assertInside(methodName, pointer, 32);
  1356. #endif
  1357. return I32x8::readAlignedUnsafe(pointer);
  1358. }
  1359. // Bound and alignment checked writing
  1360. inline void writeAligned(dsr::SafePointer<int32_t> data, const char* methodName) const {
  1361. int32_t* pointer = data.getUnsafe();
  1362. #if defined SAFE_POINTER_CHECKS
  1363. data.assertInside(methodName, pointer, 32);
  1364. #endif
  1365. this->writeAlignedUnsafe(pointer);
  1366. }
  1367. };
  1368. union U32x8 {
  1369. private:
  1370. // The uninitialized default constructor is private for safety reasons.
  1371. U32x8() {}
  1372. public:
  1373. // When the uninitialized constructor is needed for performance, use this named constructor instead.
  1374. static inline U32x8 create_dangerous_uninitialized() { return U32x8(); }
  1375. #if defined USE_256BIT_X_SIMD
  1376. public:
  1377. #if defined USE_DIRECT_SIMD_MEMORY_ACCESS
  1378. // Only use if USE_DIRECT_SIMD_MEMORY_ACCESS is defined!
  1379. uint32_t scalars[8];
  1380. #endif
  1381. // The SIMD vector of undefined type
  1382. // Not accessible while emulating!
  1383. SIMD_U32x8 v;
  1384. // Construct a portable vector from a native SIMD vector
  1385. explicit U32x8(const SIMD_U32x8& v) : v(v) {}
  1386. // Construct a portable vector from a set of scalars
  1387. U32x8(uint32_t a1, uint32_t a2, uint32_t a3, uint32_t a4, uint32_t a5, uint32_t a6, uint32_t a7, uint32_t a8)
  1388. : v(LOAD_VECTOR_U32_SIMD256(a1, a2, a3, a4, a5, a6, a7, a8)) {}
  1389. // Construct a portable vector from a single duplicated scalar
  1390. explicit U32x8(uint32_t scalar) : v(LOAD_SCALAR_U32_SIMD256(scalar)) {}
  1391. #else
  1392. public:
  1393. // Emulate a SIMD vector as an array of scalars without hardware support.
  1394. // Only accessible while emulating!
  1395. uint32_t scalars[8];
  1396. // Construct a portable vector from a set of scalars
  1397. U32x8(uint32_t a1, uint32_t a2, uint32_t a3, uint32_t a4, uint32_t a5, uint32_t a6, uint32_t a7, uint32_t a8) {
  1398. this->scalars[0] = a1;
  1399. this->scalars[1] = a2;
  1400. this->scalars[2] = a3;
  1401. this->scalars[3] = a4;
  1402. this->scalars[4] = a5;
  1403. this->scalars[5] = a6;
  1404. this->scalars[6] = a7;
  1405. this->scalars[7] = a8;
  1406. }
  1407. // Construct a portable vector from a single duplicated scalar
  1408. explicit U32x8(uint32_t scalar) {
  1409. this->scalars[0] = scalar;
  1410. this->scalars[1] = scalar;
  1411. this->scalars[2] = scalar;
  1412. this->scalars[3] = scalar;
  1413. this->scalars[4] = scalar;
  1414. this->scalars[5] = scalar;
  1415. this->scalars[6] = scalar;
  1416. this->scalars[7] = scalar;
  1417. }
  1418. #endif
  1419. // Create a gradient vector using start and increment, so that arbitrary length vectors have a way to initialize linear iterations.
  1420. static inline U32x8 createGradient(uint32_t start, uint32_t increment) {
  1421. return U32x8(
  1422. start,
  1423. start + increment,
  1424. start + increment * 2,
  1425. start + increment * 3,
  1426. start + increment * 4,
  1427. start + increment * 5,
  1428. start + increment * 6,
  1429. start + increment * 7
  1430. );
  1431. }
  1432. // Construct a portable SIMD vector from a pointer to aligned data.
  1433. // Data must be aligned with at least 32 bytes.
  1434. static inline U32x8 readAlignedUnsafe(const uint32_t* data) {
  1435. #ifdef SAFE_POINTER_CHECKS
  1436. if (uintptr_t(data) & 31u) { throwError(U"Unaligned pointer detected in U32x8::readAlignedUnsafe!\n"); }
  1437. #endif
  1438. #if defined USE_AVX2
  1439. return U32x8(_mm256_load_si256((const __m256i*)data));
  1440. #else
  1441. return U32x8(data[0], data[1], data[2], data[3], data[4], data[5], data[6], data[7]);
  1442. #endif
  1443. }
  1444. // Write to aligned memory from the existing vector.
1445. // Data must be aligned with at least 32 bytes.
  1446. inline void writeAlignedUnsafe(uint32_t* data) const {
  1447. #ifdef SAFE_POINTER_CHECKS
  1448. if (uintptr_t(data) & 31u) { throwError(U"Unaligned pointer detected in U32x8::writeAlignedUnsafe!\n"); }
  1449. #endif
  1450. #if defined USE_AVX2
  1451. _mm256_store_si256((__m256i*)data, this->v);
  1452. #else
  1453. data[0] = this->scalars[0];
  1454. data[1] = this->scalars[1];
  1455. data[2] = this->scalars[2];
  1456. data[3] = this->scalars[3];
  1457. data[4] = this->scalars[4];
  1458. data[5] = this->scalars[5];
  1459. data[6] = this->scalars[6];
  1460. data[7] = this->scalars[7];
  1461. #endif
  1462. }
  1463. // Bound and alignment checked reading
  1464. static inline U32x8 readAligned(dsr::SafePointer<const uint32_t> data, const char* methodName) {
  1465. const uint32_t* pointer = data.getUnsafe();
  1466. #if defined SAFE_POINTER_CHECKS
  1467. data.assertInside(methodName, pointer, 32);
  1468. #endif
  1469. return U32x8::readAlignedUnsafe(pointer);
  1470. }
  1471. // Bound and alignment checked writing
  1472. inline void writeAligned(dsr::SafePointer<uint32_t> data, const char* methodName) const {
  1473. uint32_t* pointer = data.getUnsafe();
  1474. #if defined SAFE_POINTER_CHECKS
  1475. data.assertInside(methodName, pointer, 32);
  1476. #endif
  1477. this->writeAlignedUnsafe(pointer);
  1478. }
  1479. };
  1480. union U16x16 {
  1481. private:
  1482. // The uninitialized default constructor is private for safety reasons.
  1483. U16x16() {}
  1484. public:
  1485. // When the uninitialized constructor is needed for performance, use this named constructor instead.
  1486. static inline U16x16 create_dangerous_uninitialized() { return U16x16(); }
  1487. #if defined USE_256BIT_X_SIMD
  1488. public:
  1489. #if defined USE_DIRECT_SIMD_MEMORY_ACCESS
  1490. // Only use if USE_DIRECT_SIMD_MEMORY_ACCESS is defined!
  1491. uint16_t scalars[16];
  1492. #endif
  1493. // The SIMD vector of undefined type
  1494. // Not accessible while emulating!
  1495. SIMD_U16x16 v;
  1496. // Construct a portable vector from a native SIMD vector
  1497. explicit U16x16(const SIMD_U16x16& v) : v(v) {}
  1498. // Construct a vector of 16 x 16-bit unsigned integers from a vector of 8 x 32-bit unsigned integers
  1499. // Reinterpret casting is used
  1500. explicit U16x16(const U32x8& vector) : v(REINTERPRET_U32_TO_U16_SIMD256(vector.v)) {}
  1501. // Construct a portable vector from a set of scalars
  1502. U16x16(uint16_t a1, uint16_t a2, uint16_t a3, uint16_t a4, uint16_t a5, uint16_t a6, uint16_t a7, uint16_t a8,
  1503. uint16_t a9, uint16_t a10, uint16_t a11, uint16_t a12, uint16_t a13, uint16_t a14, uint16_t a15, uint16_t a16)
  1504. : v(LOAD_VECTOR_U16_SIMD256(a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11, a12, a13, a14, a15, a16)) {}
1505. // Construct a vector of 16 x 16-bit unsigned integers from a single duplicated 32-bit unsigned integer
1506. // Reinterpret casting is used
1507. // TODO: Remove all reinterprets from constructors to improve readability
  1508. explicit U16x16(uint32_t scalar) : v(REINTERPRET_U32_TO_U16_SIMD256(LOAD_SCALAR_U32_SIMD256(scalar))) {}
  1509. // Construct a portable vector from a single duplicated scalar
  1510. explicit U16x16(uint16_t scalar) : v(LOAD_SCALAR_U16_SIMD256(scalar)) {}
1511. // Reinterpret cast to a vector of 8 x 32-bit unsigned integers
  1512. U32x8 get_U32() const {
  1513. return U32x8(REINTERPRET_U16_TO_U32_SIMD256(this->v));
  1514. }
  1515. #else
  1516. public:
  1517. // Emulate a SIMD vector as an array of scalars without hardware support.
  1518. // Only accessible while emulating!
  1519. uint16_t scalars[16];
1520. // Construct a vector of 16 x 16-bit unsigned integers from a vector of 8 x 32-bit unsigned integers
  1521. // Reinterpret casting is used
  1522. explicit U16x16(const U32x8& vector) {
  1523. uint64_t *target = (uint64_t*)this->scalars;
  1524. uint64_t *source = (uint64_t*)vector.scalars;
  1525. target[0] = source[0];
  1526. target[1] = source[1];
  1527. target[2] = source[2];
  1528. target[3] = source[3];
  1529. }
  1530. // Construct a portable vector from a set of scalars
  1531. U16x16(uint16_t a1, uint16_t a2, uint16_t a3, uint16_t a4, uint16_t a5, uint16_t a6, uint16_t a7, uint16_t a8,
  1532. uint16_t a9, uint16_t a10, uint16_t a11, uint16_t a12, uint16_t a13, uint16_t a14, uint16_t a15, uint16_t a16) {
  1533. this->scalars[0] = a1;
  1534. this->scalars[1] = a2;
  1535. this->scalars[2] = a3;
  1536. this->scalars[3] = a4;
  1537. this->scalars[4] = a5;
  1538. this->scalars[5] = a6;
  1539. this->scalars[6] = a7;
  1540. this->scalars[7] = a8;
  1541. this->scalars[8] = a9;
  1542. this->scalars[9] = a10;
  1543. this->scalars[10] = a11;
  1544. this->scalars[11] = a12;
  1545. this->scalars[12] = a13;
  1546. this->scalars[13] = a14;
  1547. this->scalars[14] = a15;
  1548. this->scalars[15] = a16;
  1549. }
1550. // Construct a vector of 16 x 16-bit unsigned integers from a single duplicated 32-bit unsigned integer
  1551. // Reinterpret casting is used
  1552. explicit U16x16(uint32_t scalar) {
  1553. uint32_t *target = (uint32_t*)this->scalars;
  1554. target[0] = scalar;
  1555. target[1] = scalar;
  1556. target[2] = scalar;
  1557. target[3] = scalar;
  1558. target[4] = scalar;
  1559. target[5] = scalar;
  1560. target[6] = scalar;
  1561. target[7] = scalar;
  1562. }
  1563. // Construct a portable vector from a single duplicated scalar
  1564. explicit U16x16(uint16_t scalar) {
  1565. this->scalars[0] = scalar;
  1566. this->scalars[1] = scalar;
  1567. this->scalars[2] = scalar;
  1568. this->scalars[3] = scalar;
  1569. this->scalars[4] = scalar;
  1570. this->scalars[5] = scalar;
  1571. this->scalars[6] = scalar;
  1572. this->scalars[7] = scalar;
  1573. this->scalars[8] = scalar;
  1574. this->scalars[9] = scalar;
  1575. this->scalars[10] = scalar;
  1576. this->scalars[11] = scalar;
  1577. this->scalars[12] = scalar;
  1578. this->scalars[13] = scalar;
  1579. this->scalars[14] = scalar;
  1580. this->scalars[15] = scalar;
  1581. }
  1582. // Reinterpret cast to a vector of 8 x 32-bit unsigned integers
  1583. U32x8 get_U32() const {
  1584. U32x8 result(0);
  1585. uint64_t *target = (uint64_t*)result.scalars;
  1586. uint64_t *source = (uint64_t*)this->scalars;
  1587. target[0] = source[0];
  1588. target[1] = source[1];
  1589. target[2] = source[2];
  1590. target[3] = source[3];
  1591. return result;
  1592. }
  1593. #endif
  1594. // Create a gradient vector using start and increment, so that arbitrary length vectors have a way to initialize linear iterations.
  1595. static inline U16x16 createGradient(uint16_t start, uint16_t increment) {
  1596. return U16x16(
  1597. start,
  1598. start + increment,
  1599. start + increment * 2,
  1600. start + increment * 3,
  1601. start + increment * 4,
  1602. start + increment * 5,
  1603. start + increment * 6,
  1604. start + increment * 7,
  1605. start + increment * 8,
  1606. start + increment * 9,
  1607. start + increment * 10,
  1608. start + increment * 11,
  1609. start + increment * 12,
  1610. start + increment * 13,
  1611. start + increment * 14,
  1612. start + increment * 15
  1613. );
  1614. }
  1615. // Data must be aligned with at least 32 bytes.
  1616. static inline U16x16 readAlignedUnsafe(const uint16_t* data) {
  1617. #ifdef SAFE_POINTER_CHECKS
  1618. if (uintptr_t(data) & 31u) { throwError(U"Unaligned pointer detected in U16x16::readAlignedUnsafe!\n"); }
  1619. #endif
  1620. #if defined USE_AVX2
  1621. return U16x16(_mm256_load_si256((const __m256i*)data));
  1622. #else
  1623. return U16x16(
  1624. data[0],
  1625. data[1],
  1626. data[2],
  1627. data[3],
  1628. data[4],
  1629. data[5],
  1630. data[6],
  1631. data[7],
  1632. data[8],
  1633. data[9],
  1634. data[10],
  1635. data[11],
  1636. data[12],
  1637. data[13],
  1638. data[14],
  1639. data[15]
  1640. );
  1641. #endif
  1642. }
  1643. // Data must be aligned with at least 32 bytes.
  1644. inline void writeAlignedUnsafe(uint16_t* data) const {
  1645. #ifdef SAFE_POINTER_CHECKS
  1646. if (uintptr_t(data) & 31u) { throwError(U"Unaligned pointer detected in U16x16::writeAlignedUnsafe!\n"); }
  1647. #endif
  1648. #if defined USE_AVX2
  1649. _mm256_store_si256((__m256i*)data, this->v);
  1650. #else
  1651. data[0] = this->scalars[0];
  1652. data[1] = this->scalars[1];
  1653. data[2] = this->scalars[2];
  1654. data[3] = this->scalars[3];
  1655. data[4] = this->scalars[4];
  1656. data[5] = this->scalars[5];
  1657. data[6] = this->scalars[6];
  1658. data[7] = this->scalars[7];
  1659. data[8] = this->scalars[8];
  1660. data[9] = this->scalars[9];
  1661. data[10] = this->scalars[10];
  1662. data[11] = this->scalars[11];
  1663. data[12] = this->scalars[12];
  1664. data[13] = this->scalars[13];
  1665. data[14] = this->scalars[14];
  1666. data[15] = this->scalars[15];
  1667. #endif
  1668. }
  1669. // Bound and alignment checked reading
  1670. static inline U16x16 readAligned(dsr::SafePointer<const uint16_t> data, const char* methodName) {
  1671. const uint16_t* pointer = data.getUnsafe();
  1672. #if defined SAFE_POINTER_CHECKS
  1673. data.assertInside(methodName, pointer, 32);
  1674. #endif
  1675. return U16x16::readAlignedUnsafe(pointer);
  1676. }
  1677. // Bound and alignment checked writing
  1678. inline void writeAligned(dsr::SafePointer<uint16_t> data, const char* methodName) const {
  1679. uint16_t* pointer = data.getUnsafe();
  1680. #if defined SAFE_POINTER_CHECKS
  1681. data.assertInside(methodName, pointer, 32);
  1682. #endif
  1683. this->writeAlignedUnsafe(pointer);
  1684. }
  1685. };
  1686. union U8x32 {
  1687. private:
  1688. // The uninitialized default constructor is private for safety reasons.
  1689. U8x32() {}
  1690. public:
  1691. // When the uninitialized constructor is needed for performance, use this named constructor instead.
  1692. static inline U8x32 create_dangerous_uninitialized() { return U8x32(); }
  1693. #if defined USE_256BIT_X_SIMD
  1694. public:
  1695. #if defined USE_DIRECT_SIMD_MEMORY_ACCESS
  1696. // Only use if USE_DIRECT_SIMD_MEMORY_ACCESS is defined!
  1697. uint8_t scalars[32];
  1698. #endif
  1699. // The SIMD vector of undefined type
  1700. // Not accessible while emulating!
  1701. SIMD_U8x32 v;
  1702. // Construct a portable vector from a native SIMD vector
  1703. explicit U8x32(const SIMD_U8x32& v) : v(v) {}
  1704. // Construct a portable vector from a set of scalars
  1705. U8x32(uint8_t a1, uint8_t a2, uint8_t a3, uint8_t a4, uint8_t a5, uint8_t a6, uint8_t a7, uint8_t a8,
  1706. uint8_t a9, uint8_t a10, uint8_t a11, uint8_t a12, uint8_t a13, uint8_t a14, uint8_t a15, uint8_t a16,
  1707. uint8_t a17, uint8_t a18, uint8_t a19, uint8_t a20, uint8_t a21, uint8_t a22, uint8_t a23, uint8_t a24,
  1708. uint8_t a25, uint8_t a26, uint8_t a27, uint8_t a28, uint8_t a29, uint8_t a30, uint8_t a31, uint8_t a32)
  1709. : v(LOAD_VECTOR_U8_SIMD256(
  1710. a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11, a12, a13, a14, a15, a16,
  1711. a17, a18, a19, a20, a21, a22, a23, a24, a25, a26, a27, a28, a29, a30, a31, a32
  1712. )) {}
  1713. // Construct a portable vector from a single duplicated scalar
  1714. explicit U8x32(uint8_t scalar) : v(LOAD_SCALAR_U8_SIMD256(scalar)) {}
  1715. #else
  1716. public:
  1717. // Emulate a SIMD vector as an array of scalars without hardware support.
  1718. // Only accessible while emulating!
  1719. uint8_t scalars[32];
  1720. // Construct a portable vector from a set of scalars
  1721. U8x32(uint8_t a1, uint8_t a2, uint8_t a3, uint8_t a4, uint8_t a5, uint8_t a6, uint8_t a7, uint8_t a8,
  1722. uint8_t a9, uint8_t a10, uint8_t a11, uint8_t a12, uint8_t a13, uint8_t a14, uint8_t a15, uint8_t a16,
  1723. uint8_t a17, uint8_t a18, uint8_t a19, uint8_t a20, uint8_t a21, uint8_t a22, uint8_t a23, uint8_t a24,
  1724. uint8_t a25, uint8_t a26, uint8_t a27, uint8_t a28, uint8_t a29, uint8_t a30, uint8_t a31, uint8_t a32) {
  1725. this->scalars[0] = a1;
  1726. this->scalars[1] = a2;
  1727. this->scalars[2] = a3;
  1728. this->scalars[3] = a4;
  1729. this->scalars[4] = a5;
  1730. this->scalars[5] = a6;
  1731. this->scalars[6] = a7;
  1732. this->scalars[7] = a8;
  1733. this->scalars[8] = a9;
  1734. this->scalars[9] = a10;
  1735. this->scalars[10] = a11;
  1736. this->scalars[11] = a12;
  1737. this->scalars[12] = a13;
  1738. this->scalars[13] = a14;
  1739. this->scalars[14] = a15;
  1740. this->scalars[15] = a16;
  1741. this->scalars[16] = a17;
  1742. this->scalars[17] = a18;
  1743. this->scalars[18] = a19;
  1744. this->scalars[19] = a20;
  1745. this->scalars[20] = a21;
  1746. this->scalars[21] = a22;
  1747. this->scalars[22] = a23;
  1748. this->scalars[23] = a24;
  1749. this->scalars[24] = a25;
  1750. this->scalars[25] = a26;
  1751. this->scalars[26] = a27;
  1752. this->scalars[27] = a28;
  1753. this->scalars[28] = a29;
  1754. this->scalars[29] = a30;
  1755. this->scalars[30] = a31;
  1756. this->scalars[31] = a32;
  1757. }
  1758. // Construct a portable vector from a single duplicated scalar
  1759. explicit U8x32(uint8_t scalar) {
  1760. for (int i = 0; i < 32; i++) {
  1761. this->scalars[i] = scalar;
  1762. }
  1763. }
  1764. #endif
  1765. // Create a gradient vector using start and increment, so that arbitrary length vectors have a way to initialize linear iterations.
  1766. static inline U8x32 createGradient(uint8_t start, uint8_t increment) {
  1767. return U8x32(
  1768. start,
  1769. start + increment,
  1770. start + increment * 2,
  1771. start + increment * 3,
  1772. start + increment * 4,
  1773. start + increment * 5,
  1774. start + increment * 6,
  1775. start + increment * 7,
  1776. start + increment * 8,
  1777. start + increment * 9,
  1778. start + increment * 10,
  1779. start + increment * 11,
  1780. start + increment * 12,
  1781. start + increment * 13,
  1782. start + increment * 14,
  1783. start + increment * 15,
  1784. start + increment * 16,
  1785. start + increment * 17,
  1786. start + increment * 18,
  1787. start + increment * 19,
  1788. start + increment * 20,
  1789. start + increment * 21,
  1790. start + increment * 22,
  1791. start + increment * 23,
  1792. start + increment * 24,
  1793. start + increment * 25,
  1794. start + increment * 26,
  1795. start + increment * 27,
  1796. start + increment * 28,
  1797. start + increment * 29,
  1798. start + increment * 30,
  1799. start + increment * 31
  1800. );
  1801. }
  1802. // Data must be aligned with at least 32 bytes.
  1803. static inline U8x32 readAlignedUnsafe(const uint8_t* data) {
  1804. #ifdef SAFE_POINTER_CHECKS
  1805. if (uintptr_t(data) & 31u) { throwError(U"Unaligned pointer detected in U8x32::readAlignedUnsafe!\n"); }
  1806. #endif
  1807. #if defined USE_AVX2
  1808. return U8x32(_mm256_load_si256((const __m256i*)data));
  1809. #else
  1810. U8x32 result;
  1811. for (int i = 0; i < 32; i++) {
  1812. result.scalars[i] = data[i];
  1813. }
  1814. return result;
  1815. #endif
  1816. }
  1817. // Data must be aligned with at least 32 bytes.
  1818. inline void writeAlignedUnsafe(uint8_t* data) const {
  1819. #ifdef SAFE_POINTER_CHECKS
  1820. if (uintptr_t(data) & 31u) { throwError(U"Unaligned pointer detected in U8x32::writeAlignedUnsafe!\n"); }
  1821. #endif
  1822. #if defined USE_AVX2
  1823. _mm256_store_si256((__m256i*)data, this->v);
  1824. #else
  1825. for (int i = 0; i < 32; i++) {
  1826. data[i] = this->scalars[i];
  1827. }
  1828. #endif
  1829. }
  1830. // Bound and alignment checked reading
  1831. static inline U8x32 readAligned(dsr::SafePointer<const uint8_t> data, const char* methodName) {
  1832. const uint8_t* pointer = data.getUnsafe();
  1833. #if defined SAFE_POINTER_CHECKS
  1834. data.assertInside(methodName, pointer, 32);
  1835. #endif
  1836. return U8x32::readAlignedUnsafe(pointer);
  1837. }
  1838. // Bound and alignment checked writing
  1839. inline void writeAligned(dsr::SafePointer<uint8_t> data, const char* methodName) const {
  1840. uint8_t* pointer = data.getUnsafe();
  1841. #if defined SAFE_POINTER_CHECKS
  1842. data.assertInside(methodName, pointer, 32);
  1843. #endif
  1844. this->writeAlignedUnsafe(pointer);
  1845. }
  1846. };
  1847. // Helper macros for doing things to certain sets of SIMD vector types
  1848. // Performing do(vector_type, element_type, lane_count)
  1849. #define FOR_ALL_VECTOR_TYPES(DO) \
  1850. DO(F32x4, float, 4) \
  1851. DO(I32x4, int32_t, 4) \
  1852. DO(U32x4, uint32_t, 4) \
  1853. DO(U16x8, uint16_t, 8) \
  1854. DO(U8x16, uint8_t, 16) \
  1855. DO(F32x8, float, 8) \
  1856. DO(I32x8, int32_t, 8) \
  1857. DO(U32x8, uint32_t, 8) \
  1858. DO(U16x16, uint16_t, 16) \
  1859. DO(U8x32, uint8_t, 32)
  1860. #define FOR_FLOAT_VECTOR_TYPES(DO) \
  1861. DO(F32x4, float, 4) \
  1862. DO(F32x8, float, 8)
  1863. #define FOR_INTEGER_VECTOR_TYPES(DO) \
  1864. DO(I32x4, int32_t, 4) \
  1865. DO(U32x4, uint32_t, 4) \
  1866. DO(U16x8, uint16_t, 8) \
  1867. DO(U8x16, uint8_t, 16) \
  1868. DO(I32x8, int32_t, 8) \
  1869. DO(U32x8, uint32_t, 8) \
  1870. DO(U16x16, uint16_t, 16) \
  1871. DO(U8x32, uint8_t, 32)
  1872. #define FOR_SIGNED_VECTOR_TYPES(DO) \
  1873. DO(F32x4, float, 4) \
  1874. DO(I32x4, int32_t, 4) \
  1875. DO(F32x8, float, 8) \
  1876. DO(I32x8, int32_t, 8)
  1877. #define FOR_UNSIGNED_VECTOR_TYPES(DO) \
  1878. DO(U32x4, uint32_t, 4) \
  1879. DO(U16x8, uint16_t, 8) \
  1880. DO(U8x16, uint8_t, 16) \
  1881. DO(U32x8, uint32_t, 8) \
  1882. DO(U16x16, uint16_t, 16) \
  1883. DO(U8x32, uint8_t, 32)
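// Illustrative sketch of the X-macro pattern used below: define a DO(vectorType, elementType,
// laneCount) macro, expand it for a family of vector types, then undefine it.
// CREATE_LANE_COUNT is a hypothetical name, not part of this header.
//
//     #define CREATE_LANE_COUNT(VECTOR_TYPE, ELEMENT_TYPE, LANE_COUNT) \
//         constexpr int laneCountOf(const VECTOR_TYPE&) { return LANE_COUNT; }
//     FOR_ALL_VECTOR_TYPES(CREATE_LANE_COUNT)
//     #undef CREATE_LANE_COUNT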
  1884. // Print SIMD vectors to the terminal or append them to strings.
  1885. #define CREATE_METHOD_PRINT(VECTOR_TYPE, ELEMENT_TYPE, LANE_COUNT) \
  1886. inline dsr::String& string_toStreamIndented(dsr::String& target, const VECTOR_TYPE& source, const dsr::ReadableString& indentation) { \
  1887. ALIGN_BYTES(sizeof(VECTOR_TYPE)) ELEMENT_TYPE a[LANE_COUNT]; \
  1888. source.writeAlignedUnsafe(a); \
  1889. dsr::string_append(target, indentation, a[0]); \
  1890. for (int i = 1; i < LANE_COUNT; i++) { \
  1891. string_append(target, U", ", a[i]); \
  1892. } \
  1893. return target; \
  1894. }
  1895. // All SIMD vectors can be printed.
  1896. FOR_ALL_VECTOR_TYPES(CREATE_METHOD_PRINT)
  1897. #undef CREATE_METHOD_PRINT
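// Illustrative sketch of the generated print helpers, using the signature expanded above.
// The text variable is hypothetical and the exact formatting depends on dsr::string_append.
//
//     dsr::String text;
//     string_toStreamIndented(text, U32x4(42u), U"lanes: ");
//     // text should now contain something like "lanes: 42, 42, 42, 42"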
  1898. // Integer equality returns true iff all comparisons are identical.
  1899. #define CREATE_EXACT_EQUALITY(VECTOR_TYPE, ELEMENT_TYPE, LANE_COUNT) \
  1900. inline bool allLanesEqual(const VECTOR_TYPE& left, const VECTOR_TYPE& right) { \
  1901. ALIGN_BYTES(sizeof(VECTOR_TYPE)) ELEMENT_TYPE a[LANE_COUNT]; \
  1902. ALIGN_BYTES(sizeof(VECTOR_TYPE)) ELEMENT_TYPE b[LANE_COUNT]; \
  1903. left.writeAlignedUnsafe(a); \
  1904. right.writeAlignedUnsafe(b); \
  1905. for (int i = 0; i < LANE_COUNT; i++) { \
  1906. if (a[i] != b[i]) return false; \
  1907. } \
  1908. return true; \
  1909. } \
  1910. inline bool allLanesNotEqual(const VECTOR_TYPE& left, const VECTOR_TYPE& right) { \
  1911. ALIGN_BYTES(sizeof(VECTOR_TYPE)) ELEMENT_TYPE a[LANE_COUNT]; \
  1912. ALIGN_BYTES(sizeof(VECTOR_TYPE)) ELEMENT_TYPE b[LANE_COUNT]; \
  1913. left.writeAlignedUnsafe(a); \
  1914. right.writeAlignedUnsafe(b); \
  1915. for (int i = 0; i < LANE_COUNT; i++) { \
  1916. if (a[i] == b[i]) return false; \
  1917. } \
  1918. return true; \
  1919. }
1920. // Integer SIMD vectors have exact equality.
  1921. FOR_INTEGER_VECTOR_TYPES(CREATE_EXACT_EQUALITY)
  1922. #undef CREATE_EXACT_EQUALITY
  1923. // Float equality returns true iff all comparisons are near.
  1924. #define CREATE_TOLERANT_EQUALITY(VECTOR_TYPE, ELEMENT_TYPE, LANE_COUNT) \
  1925. inline bool allLanesEqual(const VECTOR_TYPE& left, const VECTOR_TYPE& right) { \
  1926. ALIGN_BYTES(sizeof(VECTOR_TYPE)) ELEMENT_TYPE a[LANE_COUNT]; \
  1927. ALIGN_BYTES(sizeof(VECTOR_TYPE)) ELEMENT_TYPE b[LANE_COUNT]; \
  1928. left.writeAlignedUnsafe(a); \
  1929. right.writeAlignedUnsafe(b); \
  1930. for (int i = 0; i < LANE_COUNT; i++) { \
  1931. if (fabs(a[i] - b[i]) >= 0.0001f) return false; \
  1932. } \
  1933. return true; \
  1934. } \
  1935. inline bool allLanesNotEqual(const VECTOR_TYPE& left, const VECTOR_TYPE& right) { \
  1936. ALIGN_BYTES(sizeof(VECTOR_TYPE)) ELEMENT_TYPE a[LANE_COUNT]; \
  1937. ALIGN_BYTES(sizeof(VECTOR_TYPE)) ELEMENT_TYPE b[LANE_COUNT]; \
  1938. left.writeAlignedUnsafe(a); \
  1939. right.writeAlignedUnsafe(b); \
  1940. for (int i = 0; i < LANE_COUNT; i++) { \
  1941. if (fabs(a[i] - b[i]) < 0.0001f) return false; \
  1942. } \
  1943. return true; \
  1944. }
  1945. // Float SIMD vectors have inexact equality.
  1946. FOR_FLOAT_VECTOR_TYPES(CREATE_TOLERANT_EQUALITY)
  1947. #undef CREATE_TOLERANT_EQUALITY
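// Illustrative sketch of the generated comparisons: integer vectors compare exactly, while
// float vectors compare within the 0.0001 tolerance above. Assumes the single-scalar
// constructors declared earlier in this header.
//
//     allLanesEqual(U32x4(5u), U32x4(5u));            // true
//     allLanesEqual(F32x4(1.0f), F32x4(1.00001f));    // true, within tolerance
//     allLanesNotEqual(F32x4(1.0f), F32x4(2.0f));     // true, every lane differs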
  1948. #define CREATE_COMPARISONS(VECTOR_TYPE, ELEMENT_TYPE, LANE_COUNT) \
  1949. inline bool allLanesGreater(const VECTOR_TYPE& left, const VECTOR_TYPE& right) { \
  1950. ALIGN_BYTES(sizeof(VECTOR_TYPE)) ELEMENT_TYPE a[LANE_COUNT]; \
  1951. ALIGN_BYTES(sizeof(VECTOR_TYPE)) ELEMENT_TYPE b[LANE_COUNT]; \
  1952. left.writeAlignedUnsafe(a); \
  1953. right.writeAlignedUnsafe(b); \
  1954. for (int i = 0; i < LANE_COUNT; i++) { \
  1955. if (a[i] <= b[i]) return false; \
  1956. } \
  1957. return true; \
  1958. } \
  1959. inline bool allLanesGreaterOrEqual(const VECTOR_TYPE& left, const VECTOR_TYPE& right) { \
  1960. ALIGN_BYTES(sizeof(VECTOR_TYPE)) ELEMENT_TYPE a[LANE_COUNT]; \
  1961. ALIGN_BYTES(sizeof(VECTOR_TYPE)) ELEMENT_TYPE b[LANE_COUNT]; \
  1962. left.writeAlignedUnsafe(a); \
  1963. right.writeAlignedUnsafe(b); \
  1964. for (int i = 0; i < LANE_COUNT; i++) { \
  1965. if (a[i] < b[i]) return false; \
  1966. } \
  1967. return true; \
  1968. } \
  1969. inline bool allLanesLesser(const VECTOR_TYPE& left, const VECTOR_TYPE& right) { \
  1970. ALIGN_BYTES(sizeof(VECTOR_TYPE)) ELEMENT_TYPE a[LANE_COUNT]; \
  1971. ALIGN_BYTES(sizeof(VECTOR_TYPE)) ELEMENT_TYPE b[LANE_COUNT]; \
  1972. left.writeAlignedUnsafe(a); \
  1973. right.writeAlignedUnsafe(b); \
  1974. for (int i = 0; i < LANE_COUNT; i++) { \
  1975. if (a[i] >= b[i]) return false; \
  1976. } \
  1977. return true; \
  1978. } \
  1979. inline bool allLanesLesserOrEqual(const VECTOR_TYPE& left, const VECTOR_TYPE& right) { \
  1980. ALIGN_BYTES(sizeof(VECTOR_TYPE)) ELEMENT_TYPE a[LANE_COUNT]; \
  1981. ALIGN_BYTES(sizeof(VECTOR_TYPE)) ELEMENT_TYPE b[LANE_COUNT]; \
  1982. left.writeAlignedUnsafe(a); \
  1983. right.writeAlignedUnsafe(b); \
  1984. for (int i = 0; i < LANE_COUNT; i++) { \
  1985. if (a[i] > b[i]) return false; \
  1986. } \
  1987. return true; \
  1988. }
  1989. FOR_ALL_VECTOR_TYPES(CREATE_COMPARISONS)
  1990. #undef CREATE_COMPARISONS
  1991. inline F32x4 operator+(const F32x4& left, const F32x4& right) {
  1992. #if defined USE_BASIC_SIMD
  1993. return F32x4(ADD_F32_SIMD(left.v, right.v));
  1994. #else
  1995. return F32x4(left.scalars[0] + right.scalars[0], left.scalars[1] + right.scalars[1], left.scalars[2] + right.scalars[2], left.scalars[3] + right.scalars[3]);
  1996. #endif
  1997. }
  1998. inline F32x4 operator-(const F32x4& left, const F32x4& right) {
  1999. #if defined USE_BASIC_SIMD
  2000. return F32x4(SUB_F32_SIMD(left.v, right.v));
  2001. #else
  2002. return F32x4(left.scalars[0] - right.scalars[0], left.scalars[1] - right.scalars[1], left.scalars[2] - right.scalars[2], left.scalars[3] - right.scalars[3]);
  2003. #endif
  2004. }
  2005. inline F32x4 operator*(const F32x4& left, const F32x4& right) {
  2006. #if defined USE_BASIC_SIMD
  2007. return F32x4(MUL_F32_SIMD(left.v, right.v));
  2008. #else
  2009. return F32x4(left.scalars[0] * right.scalars[0], left.scalars[1] * right.scalars[1], left.scalars[2] * right.scalars[2], left.scalars[3] * right.scalars[3]);
  2010. #endif
  2011. }
  2012. inline F32x4 min(const F32x4& left, const F32x4& right) {
  2013. #if defined USE_BASIC_SIMD
  2014. return F32x4(MIN_F32_SIMD(left.v, right.v));
  2015. #else
  2016. float v0 = left.scalars[0];
  2017. float v1 = left.scalars[1];
  2018. float v2 = left.scalars[2];
  2019. float v3 = left.scalars[3];
  2020. float r0 = right.scalars[0];
  2021. float r1 = right.scalars[1];
  2022. float r2 = right.scalars[2];
  2023. float r3 = right.scalars[3];
  2024. if (r0 < v0) { v0 = r0; }
  2025. if (r1 < v1) { v1 = r1; }
  2026. if (r2 < v2) { v2 = r2; }
  2027. if (r3 < v3) { v3 = r3; }
  2028. return F32x4(v0, v1, v2, v3);
  2029. #endif
  2030. }
  2031. inline F32x4 max(const F32x4& left, const F32x4& right) {
  2032. #if defined USE_BASIC_SIMD
  2033. return F32x4(MAX_F32_SIMD(left.v, right.v));
  2034. #else
  2035. float v0 = left.scalars[0];
  2036. float v1 = left.scalars[1];
  2037. float v2 = left.scalars[2];
  2038. float v3 = left.scalars[3];
  2039. float r0 = right.scalars[0];
  2040. float r1 = right.scalars[1];
  2041. float r2 = right.scalars[2];
  2042. float r3 = right.scalars[3];
  2043. if (r0 > v0) { v0 = r0; }
  2044. if (r1 > v1) { v1 = r1; }
  2045. if (r2 > v2) { v2 = r2; }
  2046. if (r3 > v3) { v3 = r3; }
  2047. return F32x4(v0, v1, v2, v3);
  2048. #endif
  2049. }
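// Illustrative sketch: min and max compose into a per-lane clamp. clampF32 is a hypothetical
// helper, not part of this header.
//
//     inline F32x4 clampF32(const F32x4& value, const F32x4& low, const F32x4& high) {
//         return min(max(value, low), high);
//     }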
  2050. inline I32x4 operator+(const I32x4& left, const I32x4& right) {
  2051. #if defined USE_BASIC_SIMD
  2052. return I32x4(ADD_I32_SIMD(left.v, right.v));
  2053. #else
  2054. return I32x4(left.scalars[0] + right.scalars[0], left.scalars[1] + right.scalars[1], left.scalars[2] + right.scalars[2], left.scalars[3] + right.scalars[3]);
  2055. #endif
  2056. }
  2057. inline I32x4 operator-(const I32x4& left, const I32x4& right) {
  2058. #if defined USE_BASIC_SIMD
  2059. return I32x4(SUB_I32_SIMD(left.v, right.v));
  2060. #else
  2061. return I32x4(left.scalars[0] - right.scalars[0], left.scalars[1] - right.scalars[1], left.scalars[2] - right.scalars[2], left.scalars[3] - right.scalars[3]);
  2062. #endif
  2063. }
  2064. inline I32x4 operator*(const I32x4& left, const I32x4& right) {
  2065. #if defined USE_BASIC_SIMD
  2066. #if defined USE_SSE2
2067. // SSE2 has no packed 32-bit low multiply, so fall back on scalar multiplication. TODO: use _mm_mullo_epi32 when SSE4.1 or AVX2 is available.
  2068. return I32x4(left.scalars[0] * right.scalars[0], left.scalars[1] * right.scalars[1], left.scalars[2] * right.scalars[2], left.scalars[3] * right.scalars[3]);
  2069. #elif defined USE_NEON
  2070. return I32x4(MUL_I32_NEON(left.v, right.v));
  2071. #endif
  2072. #else
  2073. return I32x4(left.scalars[0] * right.scalars[0], left.scalars[1] * right.scalars[1], left.scalars[2] * right.scalars[2], left.scalars[3] * right.scalars[3]);
  2074. #endif
  2075. }
  2076. // TODO: Specify the behavior of truncated unsigned integer overflow and add it to the tests.
  2077. inline U32x4 operator+(const U32x4& left, const U32x4& right) {
  2078. #if defined USE_BASIC_SIMD
  2079. return U32x4(ADD_U32_SIMD(left.v, right.v));
  2080. #else
  2081. return U32x4(left.scalars[0] + right.scalars[0], left.scalars[1] + right.scalars[1], left.scalars[2] + right.scalars[2], left.scalars[3] + right.scalars[3]);
  2082. #endif
  2083. }
  2084. inline U32x4 operator-(const U32x4& left, const U32x4& right) {
  2085. #if defined USE_BASIC_SIMD
  2086. return U32x4(SUB_U32_SIMD(left.v, right.v));
  2087. #else
  2088. return U32x4(left.scalars[0] - right.scalars[0], left.scalars[1] - right.scalars[1], left.scalars[2] - right.scalars[2], left.scalars[3] - right.scalars[3]);
  2089. #endif
  2090. }
  2091. inline U32x4 operator*(const U32x4& left, const U32x4& right) {
  2092. #if defined USE_BASIC_SIMD
  2093. #if defined USE_SSE2
2094. // SSE2 has no packed 32-bit low multiply, so fall back on scalar multiplication.
  2095. return U32x4(left.scalars[0] * right.scalars[0], left.scalars[1] * right.scalars[1], left.scalars[2] * right.scalars[2], left.scalars[3] * right.scalars[3]);
  2096. #else // NEON
  2097. return U32x4(MUL_U32_NEON(left.v, right.v));
  2098. #endif
  2099. #else
  2100. return U32x4(left.scalars[0] * right.scalars[0], left.scalars[1] * right.scalars[1], left.scalars[2] * right.scalars[2], left.scalars[3] * right.scalars[3]);
  2101. #endif
  2102. }
  2103. // Bitwise and
  2104. inline U32x4 operator&(const U32x4& left, const U32x4& right) {
  2105. #if defined USE_BASIC_SIMD
  2106. return U32x4(BITWISE_AND_U32_SIMD(left.v, right.v));
  2107. #else
  2108. return U32x4(left.scalars[0] & right.scalars[0], left.scalars[1] & right.scalars[1], left.scalars[2] & right.scalars[2], left.scalars[3] & right.scalars[3]);
  2109. #endif
  2110. }
  2111. // Bitwise or
  2112. inline U32x4 operator|(const U32x4& left, const U32x4& right) {
  2113. #if defined USE_BASIC_SIMD
  2114. return U32x4(BITWISE_OR_U32_SIMD(left.v, right.v));
  2115. #else
  2116. return U32x4(left.scalars[0] | right.scalars[0], left.scalars[1] | right.scalars[1], left.scalars[2] | right.scalars[2], left.scalars[3] | right.scalars[3]);
  2117. #endif
  2118. }
  2119. // Bitwise xor
  2120. inline U32x4 operator^(const U32x4& left, const U32x4& right) {
  2121. #if defined USE_BASIC_SIMD
  2122. return U32x4(BITWISE_XOR_U32_SIMD(left.v, right.v));
  2123. #else
  2124. return U32x4(left.scalars[0] ^ right.scalars[0], left.scalars[1] ^ right.scalars[1], left.scalars[2] ^ right.scalars[2], left.scalars[3] ^ right.scalars[3]);
  2125. #endif
  2126. }
  2127. // Bitwise negation
  2128. inline U32x4 operator~(const U32x4& value) {
  2129. #if defined USE_NEON
  2130. return U32x4(vmvnq_u32(value.v));
  2131. #elif defined USE_BASIC_SIMD
  2132. // Fall back on xor against all ones.
  2133. return value ^ U32x4(~uint32_t(0));
  2134. #else
  2135. return U32x4(~value.scalars[0], ~value.scalars[1], ~value.scalars[2], ~value.scalars[3]);
  2136. #endif
  2137. }
  2138. inline U32x4 operator<<(const U32x4& left, const U32x4 &bitOffsets) {
  2139. assert(allLanesLesser(bitOffsets, U32x4(32u)));
  2140. #if defined USE_NEON
  2141. return U32x4(vshlq_u32(left.v, vreinterpretq_s32_u32(bitOffsets.v)));
  2142. #else
  2143. return U32x4(
  2144. left.scalars[0] << bitOffsets.scalars[0],
  2145. left.scalars[1] << bitOffsets.scalars[1],
  2146. left.scalars[2] << bitOffsets.scalars[2],
  2147. left.scalars[3] << bitOffsets.scalars[3]
  2148. );
  2149. #endif
  2150. }
  2151. inline U32x4 operator>>(const U32x4& left, const U32x4 &bitOffsets) {
  2152. assert(allLanesLesser(bitOffsets, U32x4(32u)));
  2153. #if defined USE_NEON
2154. // NEON has no variable right shift instruction, so shift left by negated offsets instead.
  2155. return U32x4(vshlq_u32(left.v, vnegq_s32(vreinterpretq_s32_u32(bitOffsets.v))));
  2156. #else
  2157. return U32x4(
  2158. left.scalars[0] >> bitOffsets.scalars[0],
  2159. left.scalars[1] >> bitOffsets.scalars[1],
  2160. left.scalars[2] >> bitOffsets.scalars[2],
  2161. left.scalars[3] >> bitOffsets.scalars[3]
  2162. );
  2163. #endif
  2164. }
  2165. // bitOffset must be an immediate constant, so a template argument is used.
  2166. template <uint32_t bitOffset>
  2167. inline U32x4 bitShiftLeftImmediate(const U32x4& left) {
  2168. static_assert(bitOffset < 32u, "Immediate left shift of 32-bit values may not shift more than 31 bits!");
  2169. #if defined USE_SSE2
  2170. return U32x4(_mm_slli_epi32(left.v, bitOffset));
  2171. #else
  2172. #if defined USE_NEON
  2173. return U32x4(vshlq_u32(left.v, LOAD_SCALAR_I32_SIMD(bitOffset)));
  2174. #else
  2175. return U32x4(left.scalars[0] << bitOffset, left.scalars[1] << bitOffset, left.scalars[2] << bitOffset, left.scalars[3] << bitOffset);
  2176. #endif
  2177. #endif
  2178. }
  2179. // bitOffset must be an immediate constant.
  2180. template <uint32_t bitOffset>
  2181. inline U32x4 bitShiftRightImmediate(const U32x4& left) {
  2182. static_assert(bitOffset < 32u, "Immediate right shift of 32-bit values may not shift more than 31 bits!");
  2183. #if defined USE_SSE2
  2184. return U32x4(_mm_srli_epi32(left.v, bitOffset));
  2185. #else
  2186. #if defined USE_NEON
2187. // NEON has no vshrq_u32 that takes a vector count, and the immediate form vshrq_n_u32 cannot encode a shift of zero.
2188. // Shifting left by the negated count with vshlq_u32 gives the same result for any offset from 0 to 31.
  2189. return U32x4(vshlq_u32(left.v, LOAD_SCALAR_I32_SIMD(-(int32_t)bitOffset)));
  2190. #else
  2191. return U32x4(left.scalars[0] >> bitOffset, left.scalars[1] >> bitOffset, left.scalars[2] >> bitOffset, left.scalars[3] >> bitOffset);
  2192. #endif
  2193. #endif
  2194. }
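// Example (illustrative only): because the shift amount is a template argument, it remains a true
// immediate for _mm_srli_epi32. Extracting one byte channel from hypothetical packed 32-bit pixels:
//   U32x4 channel = bitShiftRightImmediate<8>(pixels) & U32x4(255u);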
  2195. inline U16x8 operator<<(const U16x8& left, const U16x8 &bitOffsets) {
  2196. assert(allLanesLesser(bitOffsets, U16x8(16u)));
  2197. #if defined USE_NEON
  2198. return U16x8(vshlq_u16(left.v, vreinterpretq_s16_u16(bitOffsets.v)));
  2199. #else
  2200. return U16x8(
  2201. left.scalars[0] << bitOffsets.scalars[0],
  2202. left.scalars[1] << bitOffsets.scalars[1],
  2203. left.scalars[2] << bitOffsets.scalars[2],
  2204. left.scalars[3] << bitOffsets.scalars[3],
  2205. left.scalars[4] << bitOffsets.scalars[4],
  2206. left.scalars[5] << bitOffsets.scalars[5],
  2207. left.scalars[6] << bitOffsets.scalars[6],
  2208. left.scalars[7] << bitOffsets.scalars[7]
  2209. );
  2210. #endif
  2211. }
  2212. inline U16x8 operator>>(const U16x8& left, const U16x8 &bitOffsets) {
  2213. assert(allLanesLesser(bitOffsets, U16x8(16u)));
  2214. #if defined USE_NEON
  2215. //return U16x8(vshrq_u16(left.v, vreinterpretq_s16_u16(bitOffsets.v)));
  2216. return U16x8(vshlq_u16(left.v, vnegq_s16(vreinterpretq_s16_u16(bitOffsets.v))));
  2217. #else
  2218. return U16x8(
  2219. left.scalars[0] >> bitOffsets.scalars[0],
  2220. left.scalars[1] >> bitOffsets.scalars[1],
  2221. left.scalars[2] >> bitOffsets.scalars[2],
  2222. left.scalars[3] >> bitOffsets.scalars[3],
  2223. left.scalars[4] >> bitOffsets.scalars[4],
  2224. left.scalars[5] >> bitOffsets.scalars[5],
  2225. left.scalars[6] >> bitOffsets.scalars[6],
  2226. left.scalars[7] >> bitOffsets.scalars[7]
  2227. );
  2228. #endif
  2229. }
  2230. // bitOffset must be an immediate constant, so a template argument is used.
  2231. template <uint32_t bitOffset>
  2232. inline U16x8 bitShiftLeftImmediate(const U16x8& left) {
  2233. static_assert(bitOffset < 16u, "Immediate left shift of 16-bit values may not shift more than 15 bits!");
  2234. #if defined USE_SSE2
  2235. return U16x8(_mm_slli_epi16(left.v, bitOffset));
  2236. #else
  2237. #if defined USE_NEON
2238. return U16x8(vshlq_u16(left.v, vdupq_n_s16(bitOffset)));
  2239. #else
  2240. return U16x8(
  2241. left.scalars[0] << bitOffset,
  2242. left.scalars[1] << bitOffset,
  2243. left.scalars[2] << bitOffset,
  2244. left.scalars[3] << bitOffset,
  2245. left.scalars[4] << bitOffset,
  2246. left.scalars[5] << bitOffset,
  2247. left.scalars[6] << bitOffset,
  2248. left.scalars[7] << bitOffset
  2249. );
  2250. #endif
  2251. #endif
  2252. }
  2253. // bitOffset must be an immediate constant.
  2254. template <uint32_t bitOffset>
  2255. inline U16x8 bitShiftRightImmediate(const U16x8& left) {
  2256. static_assert(bitOffset < 16u, "Immediate right shift of 16-bit values may not shift more than 15 bits!");
  2257. #if defined USE_SSE2
  2258. return U16x8(_mm_srli_epi16(left.v, bitOffset));
  2259. #else
  2260. #if defined USE_NEON
  2261. //return U16x8(vshrq_u16(left.v, vdupq_n_s16(bitOffset)));
  2262. return U16x8(vshlq_u16(left.v, vdupq_n_s16(-(int32_t)bitOffset)));
  2263. #else
  2264. return U16x8(
  2265. left.scalars[0] >> bitOffset,
  2266. left.scalars[1] >> bitOffset,
  2267. left.scalars[2] >> bitOffset,
  2268. left.scalars[3] >> bitOffset,
  2269. left.scalars[4] >> bitOffset,
  2270. left.scalars[5] >> bitOffset,
  2271. left.scalars[6] >> bitOffset,
  2272. left.scalars[7] >> bitOffset
  2273. );
  2274. #endif
  2275. #endif
  2276. }
  2277. inline U8x16 operator<<(const U8x16& left, const U8x16 &bitOffsets) {
  2278. assert(allLanesLesser(bitOffsets, U8x16(8u)));
  2279. #if defined USE_NEON
2280. return U8x16(vshlq_u8(left.v, vreinterpretq_s8_u8(bitOffsets.v)));
  2281. #else
  2282. return U8x16(
  2283. left.scalars[ 0] << bitOffsets.scalars[ 0],
  2284. left.scalars[ 1] << bitOffsets.scalars[ 1],
  2285. left.scalars[ 2] << bitOffsets.scalars[ 2],
  2286. left.scalars[ 3] << bitOffsets.scalars[ 3],
  2287. left.scalars[ 4] << bitOffsets.scalars[ 4],
  2288. left.scalars[ 5] << bitOffsets.scalars[ 5],
  2289. left.scalars[ 6] << bitOffsets.scalars[ 6],
  2290. left.scalars[ 7] << bitOffsets.scalars[ 7],
  2291. left.scalars[ 8] << bitOffsets.scalars[ 8],
  2292. left.scalars[ 9] << bitOffsets.scalars[ 9],
  2293. left.scalars[10] << bitOffsets.scalars[10],
  2294. left.scalars[11] << bitOffsets.scalars[11],
  2295. left.scalars[12] << bitOffsets.scalars[12],
  2296. left.scalars[13] << bitOffsets.scalars[13],
  2297. left.scalars[14] << bitOffsets.scalars[14],
  2298. left.scalars[15] << bitOffsets.scalars[15]
  2299. );
  2300. #endif
  2301. }
  2302. inline U8x16 operator>>(const U8x16& left, const U8x16 &bitOffsets) {
  2303. assert(allLanesLesser(bitOffsets, U8x16(8u)));
  2304. #if defined USE_NEON
  2305. //return U8x16(vshrq_u16(left.v, vreinterpretq_s8_u8(bitOffsets.v)));
2306. return U8x16(vshlq_u8(left.v, vnegq_s8(vreinterpretq_s8_u8(bitOffsets.v))));
  2307. #else
  2308. return U8x16(
  2309. left.scalars[ 0] >> bitOffsets.scalars[ 0],
  2310. left.scalars[ 1] >> bitOffsets.scalars[ 1],
  2311. left.scalars[ 2] >> bitOffsets.scalars[ 2],
  2312. left.scalars[ 3] >> bitOffsets.scalars[ 3],
  2313. left.scalars[ 4] >> bitOffsets.scalars[ 4],
  2314. left.scalars[ 5] >> bitOffsets.scalars[ 5],
  2315. left.scalars[ 6] >> bitOffsets.scalars[ 6],
  2316. left.scalars[ 7] >> bitOffsets.scalars[ 7],
  2317. left.scalars[ 8] >> bitOffsets.scalars[ 8],
  2318. left.scalars[ 9] >> bitOffsets.scalars[ 9],
  2319. left.scalars[10] >> bitOffsets.scalars[10],
  2320. left.scalars[11] >> bitOffsets.scalars[11],
  2321. left.scalars[12] >> bitOffsets.scalars[12],
  2322. left.scalars[13] >> bitOffsets.scalars[13],
  2323. left.scalars[14] >> bitOffsets.scalars[14],
  2324. left.scalars[15] >> bitOffsets.scalars[15]
  2325. );
  2326. #endif
  2327. }
  2328. // bitOffset must be an immediate constant, so a template argument is used.
  2329. template <uint32_t bitOffset>
  2330. inline U8x16 bitShiftLeftImmediate(const U8x16& left) {
  2331. static_assert(bitOffset < 8u, "Immediate left shift of 8-bit values may not shift more than 7 bits!");
  2332. #if defined USE_NEON
2333. return U8x16(vshlq_u8(left.v, vdupq_n_s8(bitOffset)));
  2334. #else
  2335. return U8x16(
  2336. left.scalars[ 0] << bitOffset,
  2337. left.scalars[ 1] << bitOffset,
  2338. left.scalars[ 2] << bitOffset,
  2339. left.scalars[ 3] << bitOffset,
  2340. left.scalars[ 4] << bitOffset,
  2341. left.scalars[ 5] << bitOffset,
  2342. left.scalars[ 6] << bitOffset,
  2343. left.scalars[ 7] << bitOffset,
  2344. left.scalars[ 8] << bitOffset,
  2345. left.scalars[ 9] << bitOffset,
  2346. left.scalars[10] << bitOffset,
  2347. left.scalars[11] << bitOffset,
  2348. left.scalars[12] << bitOffset,
  2349. left.scalars[13] << bitOffset,
  2350. left.scalars[14] << bitOffset,
  2351. left.scalars[15] << bitOffset
  2352. );
  2353. #endif
  2354. }
  2355. // bitOffset must be an immediate constant.
  2356. template <uint32_t bitOffset>
  2357. inline U8x16 bitShiftRightImmediate(const U8x16& left) {
  2358. static_assert(bitOffset < 8u, "Immediate right shift of 8-bit values may not shift more than 7 bits!");
  2359. #if defined USE_NEON
  2360. //return U8x16(vshrq_u32(left.v, vdupq_n_s8(bitOffset)));
2361. return U8x16(vshlq_u8(left.v, vdupq_n_s8(-(int32_t)bitOffset)));
  2362. #else
  2363. return U8x16(
  2364. left.scalars[ 0] >> bitOffset,
  2365. left.scalars[ 1] >> bitOffset,
  2366. left.scalars[ 2] >> bitOffset,
  2367. left.scalars[ 3] >> bitOffset,
  2368. left.scalars[ 4] >> bitOffset,
  2369. left.scalars[ 5] >> bitOffset,
  2370. left.scalars[ 6] >> bitOffset,
  2371. left.scalars[ 7] >> bitOffset,
  2372. left.scalars[ 8] >> bitOffset,
  2373. left.scalars[ 9] >> bitOffset,
  2374. left.scalars[10] >> bitOffset,
  2375. left.scalars[11] >> bitOffset,
  2376. left.scalars[12] >> bitOffset,
  2377. left.scalars[13] >> bitOffset,
  2378. left.scalars[14] >> bitOffset,
  2379. left.scalars[15] >> bitOffset
  2380. );
  2381. #endif
  2382. }
  2383. inline U16x8 operator+(const U16x8& left, const U16x8& right) {
  2384. #if defined USE_BASIC_SIMD
  2385. return U16x8(ADD_U16_SIMD(left.v, right.v));
  2386. #else
  2387. return U16x8(left.scalars[0] + right.scalars[0], left.scalars[1] + right.scalars[1], left.scalars[2] + right.scalars[2], left.scalars[3] + right.scalars[3],
  2388. left.scalars[4] + right.scalars[4], left.scalars[5] + right.scalars[5], left.scalars[6] + right.scalars[6], left.scalars[7] + right.scalars[7]);
  2389. #endif
  2390. }
  2391. inline U16x8 operator-(const U16x8& left, const U16x8& right) {
  2392. #if defined USE_BASIC_SIMD
  2393. return U16x8(SUB_U16_SIMD(left.v, right.v));
  2394. #else
  2395. return U16x8(left.scalars[0] - right.scalars[0], left.scalars[1] - right.scalars[1], left.scalars[2] - right.scalars[2], left.scalars[3] - right.scalars[3],
  2396. left.scalars[4] - right.scalars[4], left.scalars[5] - right.scalars[5], left.scalars[6] - right.scalars[6], left.scalars[7] - right.scalars[7]);
  2397. #endif
  2398. }
  2399. inline U16x8 operator*(const U16x8& left, const U16x8& right) {
  2400. #if defined USE_BASIC_SIMD
  2401. return U16x8(MUL_U16_SIMD(left.v, right.v));
  2402. #else
  2403. return U16x8(left.scalars[0] * right.scalars[0], left.scalars[1] * right.scalars[1], left.scalars[2] * right.scalars[2], left.scalars[3] * right.scalars[3],
  2404. left.scalars[4] * right.scalars[4], left.scalars[5] * right.scalars[5], left.scalars[6] * right.scalars[6], left.scalars[7] * right.scalars[7]);
  2405. #endif
  2406. }
  2407. inline U8x16 operator+(const U8x16& left, const U8x16& right) {
  2408. #if defined USE_BASIC_SIMD
  2409. return U8x16(ADD_U8_SIMD(left.v, right.v));
  2410. #else
  2411. return U8x16(
  2412. left.scalars[0] + right.scalars[0],
  2413. left.scalars[1] + right.scalars[1],
  2414. left.scalars[2] + right.scalars[2],
  2415. left.scalars[3] + right.scalars[3],
  2416. left.scalars[4] + right.scalars[4],
  2417. left.scalars[5] + right.scalars[5],
  2418. left.scalars[6] + right.scalars[6],
  2419. left.scalars[7] + right.scalars[7],
  2420. left.scalars[8] + right.scalars[8],
  2421. left.scalars[9] + right.scalars[9],
  2422. left.scalars[10] + right.scalars[10],
  2423. left.scalars[11] + right.scalars[11],
  2424. left.scalars[12] + right.scalars[12],
  2425. left.scalars[13] + right.scalars[13],
  2426. left.scalars[14] + right.scalars[14],
  2427. left.scalars[15] + right.scalars[15]
  2428. );
  2429. #endif
  2430. }
  2431. inline U8x16 operator-(const U8x16& left, const U8x16& right) {
  2432. #if defined USE_BASIC_SIMD
  2433. return U8x16(SUB_U8_SIMD(left.v, right.v));
  2434. #else
  2435. return U8x16(
  2436. left.scalars[0] - right.scalars[0],
  2437. left.scalars[1] - right.scalars[1],
  2438. left.scalars[2] - right.scalars[2],
  2439. left.scalars[3] - right.scalars[3],
  2440. left.scalars[4] - right.scalars[4],
  2441. left.scalars[5] - right.scalars[5],
  2442. left.scalars[6] - right.scalars[6],
  2443. left.scalars[7] - right.scalars[7],
  2444. left.scalars[8] - right.scalars[8],
  2445. left.scalars[9] - right.scalars[9],
  2446. left.scalars[10] - right.scalars[10],
  2447. left.scalars[11] - right.scalars[11],
  2448. left.scalars[12] - right.scalars[12],
  2449. left.scalars[13] - right.scalars[13],
  2450. left.scalars[14] - right.scalars[14],
  2451. left.scalars[15] - right.scalars[15]
  2452. );
  2453. #endif
  2454. }
  2455. inline uint8_t impl_limit0(int32_t x) { return x < 0 ? 0 : x; }
  2456. inline uint8_t impl_limit255(uint32_t x) { return x > 255 ? 255 : x; }
  2457. inline U8x16 saturatedAddition(const U8x16& left, const U8x16& right) {
  2458. #if defined USE_BASIC_SIMD
  2459. return U8x16(ADD_SAT_U8_SIMD(left.v, right.v));
  2460. #else
  2461. return U8x16(
  2462. impl_limit255((uint32_t)left.scalars[0] + (uint32_t)right.scalars[0]),
  2463. impl_limit255((uint32_t)left.scalars[1] + (uint32_t)right.scalars[1]),
  2464. impl_limit255((uint32_t)left.scalars[2] + (uint32_t)right.scalars[2]),
  2465. impl_limit255((uint32_t)left.scalars[3] + (uint32_t)right.scalars[3]),
  2466. impl_limit255((uint32_t)left.scalars[4] + (uint32_t)right.scalars[4]),
  2467. impl_limit255((uint32_t)left.scalars[5] + (uint32_t)right.scalars[5]),
  2468. impl_limit255((uint32_t)left.scalars[6] + (uint32_t)right.scalars[6]),
  2469. impl_limit255((uint32_t)left.scalars[7] + (uint32_t)right.scalars[7]),
  2470. impl_limit255((uint32_t)left.scalars[8] + (uint32_t)right.scalars[8]),
  2471. impl_limit255((uint32_t)left.scalars[9] + (uint32_t)right.scalars[9]),
  2472. impl_limit255((uint32_t)left.scalars[10] + (uint32_t)right.scalars[10]),
  2473. impl_limit255((uint32_t)left.scalars[11] + (uint32_t)right.scalars[11]),
  2474. impl_limit255((uint32_t)left.scalars[12] + (uint32_t)right.scalars[12]),
  2475. impl_limit255((uint32_t)left.scalars[13] + (uint32_t)right.scalars[13]),
  2476. impl_limit255((uint32_t)left.scalars[14] + (uint32_t)right.scalars[14]),
  2477. impl_limit255((uint32_t)left.scalars[15] + (uint32_t)right.scalars[15])
  2478. );
  2479. #endif
  2480. }
  2481. inline U8x16 saturatedSubtraction(const U8x16& left, const U8x16& right) {
  2482. #if defined USE_BASIC_SIMD
  2483. return U8x16(SUB_SAT_U8_SIMD(left.v, right.v));
  2484. #else
  2485. return U8x16(
  2486. impl_limit0((int32_t)left.scalars[0] - (int32_t)right.scalars[0]),
  2487. impl_limit0((int32_t)left.scalars[1] - (int32_t)right.scalars[1]),
  2488. impl_limit0((int32_t)left.scalars[2] - (int32_t)right.scalars[2]),
  2489. impl_limit0((int32_t)left.scalars[3] - (int32_t)right.scalars[3]),
  2490. impl_limit0((int32_t)left.scalars[4] - (int32_t)right.scalars[4]),
  2491. impl_limit0((int32_t)left.scalars[5] - (int32_t)right.scalars[5]),
  2492. impl_limit0((int32_t)left.scalars[6] - (int32_t)right.scalars[6]),
  2493. impl_limit0((int32_t)left.scalars[7] - (int32_t)right.scalars[7]),
  2494. impl_limit0((int32_t)left.scalars[8] - (int32_t)right.scalars[8]),
  2495. impl_limit0((int32_t)left.scalars[9] - (int32_t)right.scalars[9]),
  2496. impl_limit0((int32_t)left.scalars[10] - (int32_t)right.scalars[10]),
  2497. impl_limit0((int32_t)left.scalars[11] - (int32_t)right.scalars[11]),
  2498. impl_limit0((int32_t)left.scalars[12] - (int32_t)right.scalars[12]),
  2499. impl_limit0((int32_t)left.scalars[13] - (int32_t)right.scalars[13]),
  2500. impl_limit0((int32_t)left.scalars[14] - (int32_t)right.scalars[14]),
  2501. impl_limit0((int32_t)left.scalars[15] - (int32_t)right.scalars[15])
  2502. );
  2503. #endif
  2504. }
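// Example (illustrative only): unlike the wrapping operator+ and operator-, the saturated versions
// clamp each lane to the 0..255 range, which is usually what image code wants.
//   saturatedAddition(U8x16(200u), U8x16(100u));   // every lane becomes 255 instead of wrapping to 44
//   saturatedSubtraction(U8x16(50u), U8x16(100u)); // every lane becomes 0 instead of wrapping to 206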
  2505. inline I32x4 truncateToI32(const F32x4& vector) {
  2506. #if defined USE_BASIC_SIMD
  2507. return I32x4(F32_TO_I32_SIMD(vector.v));
  2508. #else
  2509. return I32x4((int32_t)vector.scalars[0], (int32_t)vector.scalars[1], (int32_t)vector.scalars[2], (int32_t)vector.scalars[3]);
  2510. #endif
  2511. }
  2512. inline U32x4 truncateToU32(const F32x4& vector) {
  2513. #if defined USE_BASIC_SIMD
  2514. return U32x4(F32_TO_U32_SIMD(vector.v));
  2515. #else
  2516. return U32x4((uint32_t)vector.scalars[0], (uint32_t)vector.scalars[1], (uint32_t)vector.scalars[2], (uint32_t)vector.scalars[3]);
  2517. #endif
  2518. }
  2519. inline F32x4 floatFromI32(const I32x4& vector) {
  2520. #if defined USE_BASIC_SIMD
  2521. return F32x4(I32_TO_F32_SIMD(vector.v));
  2522. #else
  2523. return F32x4((float)vector.scalars[0], (float)vector.scalars[1], (float)vector.scalars[2], (float)vector.scalars[3]);
  2524. #endif
  2525. }
  2526. inline F32x4 floatFromU32(const U32x4& vector) {
  2527. #if defined USE_BASIC_SIMD
  2528. return F32x4(U32_TO_F32_SIMD(vector.v));
  2529. #else
  2530. return F32x4((float)vector.scalars[0], (float)vector.scalars[1], (float)vector.scalars[2], (float)vector.scalars[3]);
  2531. #endif
  2532. }
  2533. inline I32x4 I32FromU32(const U32x4& vector) {
  2534. #if defined USE_BASIC_SIMD
  2535. return I32x4(REINTERPRET_U32_TO_I32_SIMD(vector.v));
  2536. #else
  2537. return I32x4((int32_t)vector.scalars[0], (int32_t)vector.scalars[1], (int32_t)vector.scalars[2], (int32_t)vector.scalars[3]);
  2538. #endif
  2539. }
  2540. inline U32x4 U32FromI32(const I32x4& vector) {
  2541. #if defined USE_BASIC_SIMD
  2542. return U32x4(REINTERPRET_I32_TO_U32_SIMD(vector.v));
  2543. #else
  2544. return U32x4((uint32_t)vector.scalars[0], (uint32_t)vector.scalars[1], (uint32_t)vector.scalars[2], (uint32_t)vector.scalars[3]);
  2545. #endif
  2546. }
  2547. // Warning! Behavior depends on endianness.
  2548. inline U8x16 reinterpret_U8FromU32(const U32x4& vector) {
  2549. #if defined USE_BASIC_SIMD
  2550. return U8x16(REINTERPRET_U32_TO_U8_SIMD(vector.v));
  2551. #else
  2552. const uint8_t *source = (const uint8_t*)vector.scalars;
  2553. return U8x16(
  2554. source[0], source[1], source[2], source[3], source[4], source[5], source[6], source[7],
  2555. source[8], source[9], source[10], source[11], source[12], source[13], source[14], source[15]
  2556. );
  2557. #endif
  2558. }
  2559. // Warning! Behavior depends on endianness.
  2560. inline U32x4 reinterpret_U32FromU8(const U8x16& vector) {
  2561. #if defined USE_BASIC_SIMD
  2562. return U32x4(REINTERPRET_U8_TO_U32_SIMD(vector.v));
  2563. #else
  2564. const uint32_t *source = (const uint32_t*)vector.scalars;
  2565. return U32x4(source[0], source[1], source[2], source[3]);
  2566. #endif
  2567. }
  2568. // Warning! Behavior depends on endianness.
  2569. inline U16x8 reinterpret_U16FromU32(const U32x4& vector) {
  2570. #if defined USE_BASIC_SIMD
  2571. return U16x8(REINTERPRET_U32_TO_U16_SIMD(vector.v));
  2572. #else
  2573. const uint16_t *source = (const uint16_t*)vector.scalars;
  2574. return U16x8(
  2575. source[0], source[1], source[2], source[3], source[4], source[5], source[6], source[7]
  2576. );
  2577. #endif
  2578. }
  2579. // Warning! Behavior depends on endianness.
  2580. inline U32x4 reinterpret_U32FromU16(const U16x8& vector) {
  2581. #if defined USE_BASIC_SIMD
  2582. return U32x4(REINTERPRET_U16_TO_U32_SIMD(vector.v));
  2583. #else
  2584. const uint32_t *source = (const uint32_t*)vector.scalars;
  2585. return U32x4(source[0], source[1], source[2], source[3]);
  2586. #endif
  2587. }
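// Example (illustrative only): the reinterpret casts keep the raw bytes unchanged, so the lane order
// of the result depends on byte order. On a little-endian machine:
//   reinterpret_U8FromU32(U32x4(0x04030201u, 0x08070605u, 0x0C0B0A09u, 0x100F0E0Du))
//   // yields U8x16(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16)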
  2588. // Unpacking to larger integers
  2589. inline U32x4 lowerToU32(const U16x8& vector) {
  2590. #if defined USE_BASIC_SIMD
  2591. return U32x4(U16_LOW_TO_U32_SIMD(vector.v));
  2592. #else
  2593. return U32x4(vector.scalars[0], vector.scalars[1], vector.scalars[2], vector.scalars[3]);
  2594. #endif
  2595. }
  2596. inline U32x4 higherToU32(const U16x8& vector) {
  2597. #if defined USE_BASIC_SIMD
  2598. return U32x4(U16_HIGH_TO_U32_SIMD(vector.v));
  2599. #else
  2600. return U32x4(vector.scalars[4], vector.scalars[5], vector.scalars[6], vector.scalars[7]);
  2601. #endif
  2602. }
  2603. inline U16x8 lowerToU16(const U8x16& vector) {
  2604. #if defined USE_BASIC_SIMD
  2605. return U16x8(U8_LOW_TO_U16_SIMD(vector.v));
  2606. #else
  2607. return U16x8(
  2608. vector.scalars[0], vector.scalars[1], vector.scalars[2], vector.scalars[3],
  2609. vector.scalars[4], vector.scalars[5], vector.scalars[6], vector.scalars[7]
  2610. );
  2611. #endif
  2612. }
  2613. inline U16x8 higherToU16(const U8x16& vector) {
  2614. #if defined USE_BASIC_SIMD
  2615. return U16x8(U8_HIGH_TO_U16_SIMD(vector.v));
  2616. #else
  2617. return U16x8(
  2618. vector.scalars[8], vector.scalars[9], vector.scalars[10], vector.scalars[11],
  2619. vector.scalars[12], vector.scalars[13], vector.scalars[14], vector.scalars[15]
  2620. );
  2621. #endif
  2622. }
  2623. // Saturated packing
  2624. inline U8x16 saturateToU8(const U16x8& lower, const U16x8& upper) {
  2625. #if defined USE_BASIC_SIMD
  2626. return U8x16(PACK_SAT_U16_TO_U8_SIMD(lower.v, upper.v));
  2627. #else
  2628. return U8x16(
  2629. impl_limit255(lower.scalars[0]),
  2630. impl_limit255(lower.scalars[1]),
  2631. impl_limit255(lower.scalars[2]),
  2632. impl_limit255(lower.scalars[3]),
  2633. impl_limit255(lower.scalars[4]),
  2634. impl_limit255(lower.scalars[5]),
  2635. impl_limit255(lower.scalars[6]),
  2636. impl_limit255(lower.scalars[7]),
  2637. impl_limit255(upper.scalars[0]),
  2638. impl_limit255(upper.scalars[1]),
  2639. impl_limit255(upper.scalars[2]),
  2640. impl_limit255(upper.scalars[3]),
  2641. impl_limit255(upper.scalars[4]),
  2642. impl_limit255(upper.scalars[5]),
  2643. impl_limit255(upper.scalars[6]),
  2644. impl_limit255(upper.scalars[7])
  2645. );
  2646. #endif
  2647. }
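// Example (illustrative only): a common pattern is to unpack 8-bit data to 16 bits, compute with the
// extra headroom, and pack back with saturation. Averaging two hypothetical U8x16 values a and b:
//   U16x8 lowSum  = lowerToU16(a)  + lowerToU16(b);
//   U16x8 highSum = higherToU16(a) + higherToU16(b);
//   U8x16 average = saturateToU8(bitShiftRightImmediate<1>(lowSum), bitShiftRightImmediate<1>(highSum));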
  2648. // Unary negation for convenience and code readability.
2649. // Before using unary negation, check whether:
2650. // * the addition can be rewritten as a subtraction instead:
2651. //     x = -a + b
2652. //     x = b - a
2653. // * a multiplying constant or scalar can be negated instead:
2654. //     x = -b * 2
2655. //     x = b * -2
  2656. inline F32x4 operator-(const F32x4& value) {
  2657. #if defined USE_BASIC_SIMD
  2658. return F32x4(0.0f) - value;
  2659. #else
  2660. return F32x4(-value.scalars[0], -value.scalars[1], -value.scalars[2], -value.scalars[3]);
  2661. #endif
  2662. }
  2663. inline I32x4 operator-(const I32x4& value) {
  2664. #if defined USE_BASIC_SIMD
  2665. return I32x4(0) - value;
  2666. #else
  2667. return I32x4(-value.scalars[0], -value.scalars[1], -value.scalars[2], -value.scalars[3]);
  2668. #endif
  2669. }
  2670. // Helper macros for generating the vector extract functions.
  2671. // Having one function for each type and offset makes sure that the compiler gets an immediate integer within the valid range.
  2672. #if defined USE_BASIC_SIMD
  2673. #if defined USE_SSE2
  2674. #if defined USE_SSSE3
  2675. #define _MM_ALIGNR_EPI8(A, B, OFFSET) _mm_alignr_epi8(A, B, OFFSET)
  2676. #else
  2677. // If SSSE3 is not used, emulate it using stack memory and unaligned reading of data.
  2678. static inline SIMD_U8x16 _MM_ALIGNR_EPI8(SIMD_U8x16 a, SIMD_U8x16 b, int offset) {
  2679. ALIGN16 uint8_t vectorBuffer[32];
  2680. #ifdef SAFE_POINTER_CHECKS
  2681. if (uintptr_t((void*)vectorBuffer) & 15u) { throwError(U"Unaligned stack memory detected in 128-bit VECTOR_EXTRACT_GENERATOR!\n"); }
  2682. #endif
  2683. _mm_store_si128((SIMD_U8x16*)(vectorBuffer), b);
  2684. _mm_store_si128((SIMD_U8x16*)(vectorBuffer + 16), a);
  2685. return _mm_loadu_si128((SIMD_U8x16*)(vectorBuffer + offset));
  2686. }
  2687. #endif
  2688. #define VECTOR_EXTRACT_GENERATOR_U8(OFFSET, FALLBACK_RESULT) return U8x16(_MM_ALIGNR_EPI8(b.v, a.v, OFFSET));
  2689. #define VECTOR_EXTRACT_GENERATOR_U16(OFFSET, FALLBACK_RESULT) return U16x8(_MM_ALIGNR_EPI8(b.v, a.v, OFFSET * 2));
  2690. #define VECTOR_EXTRACT_GENERATOR_U32(OFFSET, FALLBACK_RESULT) return U32x4(_MM_ALIGNR_EPI8(b.v, a.v, OFFSET * 4));
  2691. #define VECTOR_EXTRACT_GENERATOR_I32(OFFSET, FALLBACK_RESULT) return I32x4(_MM_ALIGNR_EPI8(b.v, a.v, OFFSET * 4));
  2692. #define VECTOR_EXTRACT_GENERATOR_F32(OFFSET, FALLBACK_RESULT) return F32x4(SIMD_F32x4(_MM_ALIGNR_EPI8(SIMD_U32x4(b.v), SIMD_U32x4(a.v), OFFSET * 4)));
  2693. #elif defined USE_NEON
  2694. #define VECTOR_EXTRACT_GENERATOR_U8(OFFSET, FALLBACK_RESULT) return U8x16(vextq_u8(a.v, b.v, OFFSET));
  2695. #define VECTOR_EXTRACT_GENERATOR_U16(OFFSET, FALLBACK_RESULT) return U16x8(vextq_u16(a.v, b.v, OFFSET));
  2696. #define VECTOR_EXTRACT_GENERATOR_U32(OFFSET, FALLBACK_RESULT) return U32x4(vextq_u32(a.v, b.v, OFFSET));
  2697. #define VECTOR_EXTRACT_GENERATOR_I32(OFFSET, FALLBACK_RESULT) return I32x4(vextq_s32(a.v, b.v, OFFSET));
  2698. #define VECTOR_EXTRACT_GENERATOR_F32(OFFSET, FALLBACK_RESULT) return F32x4(vextq_f32(a.v, b.v, OFFSET));
  2699. #endif
  2700. #else
  2701. #define VECTOR_EXTRACT_GENERATOR_U8(OFFSET, FALLBACK_RESULT) return FALLBACK_RESULT;
  2702. #define VECTOR_EXTRACT_GENERATOR_U16(OFFSET, FALLBACK_RESULT) return FALLBACK_RESULT;
  2703. #define VECTOR_EXTRACT_GENERATOR_U32(OFFSET, FALLBACK_RESULT) return FALLBACK_RESULT;
  2704. #define VECTOR_EXTRACT_GENERATOR_I32(OFFSET, FALLBACK_RESULT) return FALLBACK_RESULT;
  2705. #define VECTOR_EXTRACT_GENERATOR_F32(OFFSET, FALLBACK_RESULT) return FALLBACK_RESULT;
  2706. #endif
2707. // Vector extraction concatenates two input vectors and reads a vector from the combined result at the given element offset.
  2708. // The first and last offsets that only return one of the inputs can be used for readability, because they will be inlined and removed by the compiler.
  2709. // To get elements from the right side, combine the center vector with the right vector and shift one element to the left using vectorExtract_1 for the given type.
  2710. // To get elements from the left side, combine the left vector with the center vector and shift one element to the right using vectorExtract_15 for 16 lanes, vectorExtract_7 for 8 lanes, or vectorExtract_3 for 4 lanes.
  2711. U8x16 inline vectorExtract_0(const U8x16 &a, const U8x16 &b) { return a; }
  2712. U8x16 inline vectorExtract_1(const U8x16 &a, const U8x16 &b) { VECTOR_EXTRACT_GENERATOR_U8(1, U8x16(a.scalars[1], a.scalars[2], a.scalars[3], a.scalars[4], a.scalars[5], a.scalars[6], a.scalars[7], a.scalars[8], a.scalars[9], a.scalars[10], a.scalars[11], a.scalars[12], a.scalars[13], a.scalars[14], a.scalars[15], b.scalars[0])) }
  2713. U8x16 inline vectorExtract_2(const U8x16 &a, const U8x16 &b) { VECTOR_EXTRACT_GENERATOR_U8(2, U8x16(a.scalars[2], a.scalars[3], a.scalars[4], a.scalars[5], a.scalars[6], a.scalars[7], a.scalars[8], a.scalars[9], a.scalars[10], a.scalars[11], a.scalars[12], a.scalars[13], a.scalars[14], a.scalars[15], b.scalars[0], b.scalars[1])) }
  2714. U8x16 inline vectorExtract_3(const U8x16 &a, const U8x16 &b) { VECTOR_EXTRACT_GENERATOR_U8(3, U8x16(a.scalars[3], a.scalars[4], a.scalars[5], a.scalars[6], a.scalars[7], a.scalars[8], a.scalars[9], a.scalars[10], a.scalars[11], a.scalars[12], a.scalars[13], a.scalars[14], a.scalars[15], b.scalars[0], b.scalars[1], b.scalars[2])) }
  2715. U8x16 inline vectorExtract_4(const U8x16 &a, const U8x16 &b) { VECTOR_EXTRACT_GENERATOR_U8(4, U8x16(a.scalars[4], a.scalars[5], a.scalars[6], a.scalars[7], a.scalars[8], a.scalars[9], a.scalars[10], a.scalars[11], a.scalars[12], a.scalars[13], a.scalars[14], a.scalars[15], b.scalars[0], b.scalars[1], b.scalars[2], b.scalars[3])) }
  2716. U8x16 inline vectorExtract_5(const U8x16 &a, const U8x16 &b) { VECTOR_EXTRACT_GENERATOR_U8(5, U8x16(a.scalars[5], a.scalars[6], a.scalars[7], a.scalars[8], a.scalars[9], a.scalars[10], a.scalars[11], a.scalars[12], a.scalars[13], a.scalars[14], a.scalars[15], b.scalars[0], b.scalars[1], b.scalars[2], b.scalars[3], b.scalars[4])) }
  2717. U8x16 inline vectorExtract_6(const U8x16 &a, const U8x16 &b) { VECTOR_EXTRACT_GENERATOR_U8(6, U8x16(a.scalars[6], a.scalars[7], a.scalars[8], a.scalars[9], a.scalars[10], a.scalars[11], a.scalars[12], a.scalars[13], a.scalars[14], a.scalars[15], b.scalars[0], b.scalars[1], b.scalars[2], b.scalars[3], b.scalars[4], b.scalars[5])) }
  2718. U8x16 inline vectorExtract_7(const U8x16 &a, const U8x16 &b) { VECTOR_EXTRACT_GENERATOR_U8(7, U8x16(a.scalars[7], a.scalars[8], a.scalars[9], a.scalars[10], a.scalars[11], a.scalars[12], a.scalars[13], a.scalars[14], a.scalars[15], b.scalars[0], b.scalars[1], b.scalars[2], b.scalars[3], b.scalars[4], b.scalars[5], b.scalars[6])) }
  2719. U8x16 inline vectorExtract_8(const U8x16 &a, const U8x16 &b) { VECTOR_EXTRACT_GENERATOR_U8(8, U8x16(a.scalars[8], a.scalars[9], a.scalars[10], a.scalars[11], a.scalars[12], a.scalars[13], a.scalars[14], a.scalars[15], b.scalars[0], b.scalars[1], b.scalars[2], b.scalars[3], b.scalars[4], b.scalars[5], b.scalars[6], b.scalars[7])) }
  2720. U8x16 inline vectorExtract_9(const U8x16 &a, const U8x16 &b) { VECTOR_EXTRACT_GENERATOR_U8(9, U8x16(a.scalars[9], a.scalars[10], a.scalars[11], a.scalars[12], a.scalars[13], a.scalars[14], a.scalars[15], b.scalars[0], b.scalars[1], b.scalars[2], b.scalars[3], b.scalars[4], b.scalars[5], b.scalars[6], b.scalars[7], b.scalars[8])) }
  2721. U8x16 inline vectorExtract_10(const U8x16 &a, const U8x16 &b) { VECTOR_EXTRACT_GENERATOR_U8(10, U8x16(a.scalars[10], a.scalars[11], a.scalars[12], a.scalars[13], a.scalars[14], a.scalars[15], b.scalars[0], b.scalars[1], b.scalars[2], b.scalars[3], b.scalars[4], b.scalars[5], b.scalars[6], b.scalars[7], b.scalars[8], b.scalars[9])) }
  2722. U8x16 inline vectorExtract_11(const U8x16 &a, const U8x16 &b) { VECTOR_EXTRACT_GENERATOR_U8(11, U8x16(a.scalars[11], a.scalars[12], a.scalars[13], a.scalars[14], a.scalars[15], b.scalars[0], b.scalars[1], b.scalars[2], b.scalars[3], b.scalars[4], b.scalars[5], b.scalars[6], b.scalars[7], b.scalars[8], b.scalars[9], b.scalars[10])) }
  2723. U8x16 inline vectorExtract_12(const U8x16 &a, const U8x16 &b) { VECTOR_EXTRACT_GENERATOR_U8(12, U8x16(a.scalars[12], a.scalars[13], a.scalars[14], a.scalars[15], b.scalars[0], b.scalars[1], b.scalars[2], b.scalars[3], b.scalars[4], b.scalars[5], b.scalars[6], b.scalars[7], b.scalars[8], b.scalars[9], b.scalars[10], b.scalars[11])) }
  2724. U8x16 inline vectorExtract_13(const U8x16 &a, const U8x16 &b) { VECTOR_EXTRACT_GENERATOR_U8(13, U8x16(a.scalars[13], a.scalars[14], a.scalars[15], b.scalars[0], b.scalars[1], b.scalars[2], b.scalars[3], b.scalars[4], b.scalars[5], b.scalars[6], b.scalars[7], b.scalars[8], b.scalars[9], b.scalars[10], b.scalars[11], b.scalars[12])) }
  2725. U8x16 inline vectorExtract_14(const U8x16 &a, const U8x16 &b) { VECTOR_EXTRACT_GENERATOR_U8(14, U8x16(a.scalars[14], a.scalars[15], b.scalars[0], b.scalars[1], b.scalars[2], b.scalars[3], b.scalars[4], b.scalars[5], b.scalars[6], b.scalars[7], b.scalars[8], b.scalars[9], b.scalars[10], b.scalars[11], b.scalars[12], b.scalars[13])) }
  2726. U8x16 inline vectorExtract_15(const U8x16 &a, const U8x16 &b) { VECTOR_EXTRACT_GENERATOR_U8(15, U8x16(a.scalars[15], b.scalars[0], b.scalars[1], b.scalars[2], b.scalars[3], b.scalars[4], b.scalars[5], b.scalars[6], b.scalars[7], b.scalars[8], b.scalars[9], b.scalars[10], b.scalars[11], b.scalars[12], b.scalars[13], b.scalars[14])) }
  2727. U8x16 inline vectorExtract_16(const U8x16 &a, const U8x16 &b) { return b; }
  2728. U16x8 inline vectorExtract_0(const U16x8 &a, const U16x8 &b) { return a; }
  2729. U16x8 inline vectorExtract_1(const U16x8 &a, const U16x8 &b) { VECTOR_EXTRACT_GENERATOR_U16(1, U16x8(a.scalars[1], a.scalars[2], a.scalars[3], a.scalars[4], a.scalars[5], a.scalars[6], a.scalars[7], b.scalars[0])) }
  2730. U16x8 inline vectorExtract_2(const U16x8 &a, const U16x8 &b) { VECTOR_EXTRACT_GENERATOR_U16(2, U16x8(a.scalars[2], a.scalars[3], a.scalars[4], a.scalars[5], a.scalars[6], a.scalars[7], b.scalars[0], b.scalars[1])) }
  2731. U16x8 inline vectorExtract_3(const U16x8 &a, const U16x8 &b) { VECTOR_EXTRACT_GENERATOR_U16(3, U16x8(a.scalars[3], a.scalars[4], a.scalars[5], a.scalars[6], a.scalars[7], b.scalars[0], b.scalars[1], b.scalars[2])) }
  2732. U16x8 inline vectorExtract_4(const U16x8 &a, const U16x8 &b) { VECTOR_EXTRACT_GENERATOR_U16(4, U16x8(a.scalars[4], a.scalars[5], a.scalars[6], a.scalars[7], b.scalars[0], b.scalars[1], b.scalars[2], b.scalars[3])) }
  2733. U16x8 inline vectorExtract_5(const U16x8 &a, const U16x8 &b) { VECTOR_EXTRACT_GENERATOR_U16(5, U16x8(a.scalars[5], a.scalars[6], a.scalars[7], b.scalars[0], b.scalars[1], b.scalars[2], b.scalars[3], b.scalars[4])) }
  2734. U16x8 inline vectorExtract_6(const U16x8 &a, const U16x8 &b) { VECTOR_EXTRACT_GENERATOR_U16(6, U16x8(a.scalars[6], a.scalars[7], b.scalars[0], b.scalars[1], b.scalars[2], b.scalars[3], b.scalars[4], b.scalars[5])) }
  2735. U16x8 inline vectorExtract_7(const U16x8 &a, const U16x8 &b) { VECTOR_EXTRACT_GENERATOR_U16(7, U16x8(a.scalars[7], b.scalars[0], b.scalars[1], b.scalars[2], b.scalars[3], b.scalars[4], b.scalars[5], b.scalars[6])) }
  2736. U16x8 inline vectorExtract_8(const U16x8 &a, const U16x8 &b) { return b; }
  2737. U32x4 inline vectorExtract_0(const U32x4 &a, const U32x4 &b) { return a; }
  2738. U32x4 inline vectorExtract_1(const U32x4 &a, const U32x4 &b) { VECTOR_EXTRACT_GENERATOR_U32(1, U32x4(a.scalars[1], a.scalars[2], a.scalars[3], b.scalars[0])) }
  2739. U32x4 inline vectorExtract_2(const U32x4 &a, const U32x4 &b) { VECTOR_EXTRACT_GENERATOR_U32(2, U32x4(a.scalars[2], a.scalars[3], b.scalars[0], b.scalars[1])) }
  2740. U32x4 inline vectorExtract_3(const U32x4 &a, const U32x4 &b) { VECTOR_EXTRACT_GENERATOR_U32(3, U32x4(a.scalars[3], b.scalars[0], b.scalars[1], b.scalars[2])) }
  2741. U32x4 inline vectorExtract_4(const U32x4 &a, const U32x4 &b) { return b; }
  2742. I32x4 inline vectorExtract_0(const I32x4 &a, const I32x4 &b) { return a; }
  2743. I32x4 inline vectorExtract_1(const I32x4 &a, const I32x4 &b) { VECTOR_EXTRACT_GENERATOR_I32(1, I32x4(a.scalars[1], a.scalars[2], a.scalars[3], b.scalars[0])) }
  2744. I32x4 inline vectorExtract_2(const I32x4 &a, const I32x4 &b) { VECTOR_EXTRACT_GENERATOR_I32(2, I32x4(a.scalars[2], a.scalars[3], b.scalars[0], b.scalars[1])) }
  2745. I32x4 inline vectorExtract_3(const I32x4 &a, const I32x4 &b) { VECTOR_EXTRACT_GENERATOR_I32(3, I32x4(a.scalars[3], b.scalars[0], b.scalars[1], b.scalars[2])) }
  2746. I32x4 inline vectorExtract_4(const I32x4 &a, const I32x4 &b) { return b; }
  2747. F32x4 inline vectorExtract_0(const F32x4 &a, const F32x4 &b) { return a; }
  2748. F32x4 inline vectorExtract_1(const F32x4 &a, const F32x4 &b) { VECTOR_EXTRACT_GENERATOR_F32(1, F32x4(a.scalars[1], a.scalars[2], a.scalars[3], b.scalars[0])) }
  2749. F32x4 inline vectorExtract_2(const F32x4 &a, const F32x4 &b) { VECTOR_EXTRACT_GENERATOR_F32(2, F32x4(a.scalars[2], a.scalars[3], b.scalars[0], b.scalars[1])) }
  2750. F32x4 inline vectorExtract_3(const F32x4 &a, const F32x4 &b) { VECTOR_EXTRACT_GENERATOR_F32(3, F32x4(a.scalars[3], b.scalars[0], b.scalars[1], b.scalars[2])) }
  2751. F32x4 inline vectorExtract_4(const F32x4 &a, const F32x4 &b) { return b; }
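// Example (illustrative only): fetching the neighbors of each lane for a filter kernel, assuming
// hypothetical U32x4 values leftSide, center and rightSide that are adjacent in memory.
//   U32x4 rightNeighbors = vectorExtract_1(center, rightSide); // lane i holds element i + 1
//   U32x4 leftNeighbors  = vectorExtract_3(leftSide, center);  // lane i holds element i - 1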
  2752. // Gather instructions load memory from a pointer at multiple index offsets at the same time.
2753. // The given pointer should be aligned to 4 bytes, so that the scalar fallback also works on machines with strict alignment requirements.
  2754. #if defined USE_AVX2
  2755. #define GATHER_I32x4_AVX2(SOURCE, FOUR_OFFSETS, SCALE) _mm_i32gather_epi32((const int32_t*)(SOURCE), FOUR_OFFSETS, SCALE)
  2756. #define GATHER_U32x4_AVX2(SOURCE, FOUR_OFFSETS, SCALE) _mm_i32gather_epi32((const int32_t*)(SOURCE), FOUR_OFFSETS, SCALE)
  2757. #define GATHER_F32x4_AVX2(SOURCE, FOUR_OFFSETS, SCALE) _mm_i32gather_ps((const float*)(SOURCE), FOUR_OFFSETS, SCALE)
  2758. #endif
  2759. static inline U32x4 gather_U32(dsr::SafePointer<const uint32_t> data, const U32x4 &elementOffset) {
  2760. #ifdef SAFE_POINTER_CHECKS
  2761. ALIGN16 uint32_t elementOffsets[4];
  2762. if (uintptr_t((void*)elementOffsets) & 15u) { throwError(U"Unaligned stack memory detected in 128-bit gather_U32!\n"); }
  2763. elementOffset.writeAlignedUnsafe(elementOffsets);
  2764. data.assertInside("U32x4 gather_U32 lane 0", (data + elementOffsets[0]).getUnchecked());
  2765. data.assertInside("U32x4 gather_U32 lane 1", (data + elementOffsets[1]).getUnchecked());
  2766. data.assertInside("U32x4 gather_U32 lane 2", (data + elementOffsets[2]).getUnchecked());
  2767. data.assertInside("U32x4 gather_U32 lane 3", (data + elementOffsets[3]).getUnchecked());
  2768. #endif
  2769. #if defined USE_AVX2
  2770. return U32x4(GATHER_U32x4_AVX2(data.getUnsafe(), elementOffset.v, 4));
  2771. #else
  2772. #ifndef SAFE_POINTER_CHECKS
  2773. ALIGN16 uint32_t elementOffsets[4];
  2774. elementOffset.writeAlignedUnsafe(elementOffsets);
  2775. #endif
  2776. return U32x4(
  2777. *(data + elementOffsets[0]),
  2778. *(data + elementOffsets[1]),
  2779. *(data + elementOffsets[2]),
  2780. *(data + elementOffsets[3])
  2781. );
  2782. #endif
  2783. }
  2784. static inline I32x4 gather_I32(dsr::SafePointer<const int32_t> data, const U32x4 &elementOffset) {
  2785. #ifdef SAFE_POINTER_CHECKS
  2786. ALIGN16 uint32_t elementOffsets[4];
  2787. if (uintptr_t((void*)elementOffsets) & 15u) { throwError(U"Unaligned stack memory detected in 128-bit gather_I32!\n"); }
  2788. elementOffset.writeAlignedUnsafe(elementOffsets);
  2789. data.assertInside("I32x4 gather_I32 lane 0", (data + elementOffsets[0]).getUnchecked());
  2790. data.assertInside("I32x4 gather_I32 lane 1", (data + elementOffsets[1]).getUnchecked());
  2791. data.assertInside("I32x4 gather_I32 lane 2", (data + elementOffsets[2]).getUnchecked());
  2792. data.assertInside("I32x4 gather_I32 lane 3", (data + elementOffsets[3]).getUnchecked());
  2793. #endif
  2794. #if defined USE_AVX2
2795. return I32x4(GATHER_I32x4_AVX2(data.getUnsafe(), elementOffset.v, 4));
  2796. #else
  2797. #ifndef SAFE_POINTER_CHECKS
  2798. ALIGN16 uint32_t elementOffsets[4];
  2799. elementOffset.writeAlignedUnsafe(elementOffsets);
  2800. #endif
  2801. return I32x4(
  2802. *(data + elementOffsets[0]),
  2803. *(data + elementOffsets[1]),
  2804. *(data + elementOffsets[2]),
  2805. *(data + elementOffsets[3])
  2806. );
  2807. #endif
  2808. }
  2809. static inline F32x4 gather_F32(dsr::SafePointer<const float> data, const U32x4 &elementOffset) {
  2810. #ifdef SAFE_POINTER_CHECKS
  2811. ALIGN16 uint32_t elementOffsets[4];
  2812. if (uintptr_t((void*)elementOffsets) & 15u) { throwError(U"Unaligned stack memory detected in 128-bit gather_F32!\n"); }
  2813. elementOffset.writeAlignedUnsafe(elementOffsets);
  2814. data.assertInside("F32x4 gather_F32 lane 0", (data + elementOffsets[0]).getUnchecked());
  2815. data.assertInside("F32x4 gather_F32 lane 1", (data + elementOffsets[1]).getUnchecked());
  2816. data.assertInside("F32x4 gather_F32 lane 2", (data + elementOffsets[2]).getUnchecked());
  2817. data.assertInside("F32x4 gather_F32 lane 3", (data + elementOffsets[3]).getUnchecked());
  2818. #endif
  2819. #if defined USE_AVX2
  2820. return F32x4(GATHER_F32x4_AVX2(data.getUnsafe(), elementOffset.v, 4));
  2821. #else
  2822. #ifndef SAFE_POINTER_CHECKS
  2823. ALIGN16 uint32_t elementOffsets[4];
  2824. elementOffset.writeAlignedUnsafe(elementOffsets);
  2825. #endif
  2826. return F32x4(
  2827. *(data + elementOffsets[0]),
  2828. *(data + elementOffsets[1]),
  2829. *(data + elementOffsets[2]),
  2830. *(data + elementOffsets[3])
  2831. );
  2832. #endif
  2833. }
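// Example (illustrative only): gathers are handy for table lookups, such as applying a hypothetical
// 256-entry tone curve to four indices at once.
//   F32x4 adjusted = gather_F32(toneCurve, pixelIndices); // toneCurve points to 256 aligned floats,
//                                                         // pixelIndices is a U32x4 of values in 0..255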
  2834. inline F32x8 operator+(const F32x8& left, const F32x8& right) {
  2835. #if defined USE_256BIT_F_SIMD
  2836. return F32x8(ADD_F32_SIMD256(left.v, right.v));
  2837. #else
  2838. return F32x8(
  2839. left.scalars[0] + right.scalars[0],
  2840. left.scalars[1] + right.scalars[1],
  2841. left.scalars[2] + right.scalars[2],
  2842. left.scalars[3] + right.scalars[3],
  2843. left.scalars[4] + right.scalars[4],
  2844. left.scalars[5] + right.scalars[5],
  2845. left.scalars[6] + right.scalars[6],
  2846. left.scalars[7] + right.scalars[7]
  2847. );
  2848. #endif
  2849. }
  2850. inline F32x8 operator-(const F32x8& left, const F32x8& right) {
  2851. #if defined USE_256BIT_F_SIMD
  2852. return F32x8(SUB_F32_SIMD256(left.v, right.v));
  2853. #else
  2854. return F32x8(
  2855. left.scalars[0] - right.scalars[0],
  2856. left.scalars[1] - right.scalars[1],
  2857. left.scalars[2] - right.scalars[2],
  2858. left.scalars[3] - right.scalars[3],
  2859. left.scalars[4] - right.scalars[4],
  2860. left.scalars[5] - right.scalars[5],
  2861. left.scalars[6] - right.scalars[6],
  2862. left.scalars[7] - right.scalars[7]
  2863. );
  2864. #endif
  2865. }
  2866. inline F32x8 operator*(const F32x8& left, const F32x8& right) {
  2867. #if defined USE_256BIT_F_SIMD
  2868. return F32x8(MUL_F32_SIMD256(left.v, right.v));
  2869. #else
  2870. return F32x8(
  2871. left.scalars[0] * right.scalars[0],
  2872. left.scalars[1] * right.scalars[1],
  2873. left.scalars[2] * right.scalars[2],
  2874. left.scalars[3] * right.scalars[3],
  2875. left.scalars[4] * right.scalars[4],
  2876. left.scalars[5] * right.scalars[5],
  2877. left.scalars[6] * right.scalars[6],
  2878. left.scalars[7] * right.scalars[7]
  2879. );
  2880. #endif
  2881. }
  2882. inline F32x8 min(const F32x8& left, const F32x8& right) {
  2883. #if defined USE_256BIT_F_SIMD
  2884. return F32x8(MIN_F32_SIMD256(left.v, right.v));
  2885. #else
  2886. float v0 = left.scalars[0];
  2887. float v1 = left.scalars[1];
  2888. float v2 = left.scalars[2];
  2889. float v3 = left.scalars[3];
  2890. float v4 = left.scalars[4];
  2891. float v5 = left.scalars[5];
  2892. float v6 = left.scalars[6];
  2893. float v7 = left.scalars[7];
  2894. float r0 = right.scalars[0];
  2895. float r1 = right.scalars[1];
  2896. float r2 = right.scalars[2];
  2897. float r3 = right.scalars[3];
  2898. float r4 = right.scalars[4];
  2899. float r5 = right.scalars[5];
  2900. float r6 = right.scalars[6];
  2901. float r7 = right.scalars[7];
  2902. if (r0 < v0) { v0 = r0; }
  2903. if (r1 < v1) { v1 = r1; }
  2904. if (r2 < v2) { v2 = r2; }
  2905. if (r3 < v3) { v3 = r3; }
  2906. if (r4 < v4) { v4 = r4; }
  2907. if (r5 < v5) { v5 = r5; }
  2908. if (r6 < v6) { v6 = r6; }
  2909. if (r7 < v7) { v7 = r7; }
  2910. return F32x8(v0, v1, v2, v3, v4, v5, v6, v7);
  2911. #endif
  2912. }
  2913. inline F32x8 max(const F32x8& left, const F32x8& right) {
  2914. #if defined USE_256BIT_F_SIMD
  2915. return F32x8(MAX_F32_SIMD256(left.v, right.v));
  2916. #else
  2917. float v0 = left.scalars[0];
  2918. float v1 = left.scalars[1];
  2919. float v2 = left.scalars[2];
  2920. float v3 = left.scalars[3];
  2921. float v4 = left.scalars[4];
  2922. float v5 = left.scalars[5];
  2923. float v6 = left.scalars[6];
  2924. float v7 = left.scalars[7];
  2925. float r0 = right.scalars[0];
  2926. float r1 = right.scalars[1];
  2927. float r2 = right.scalars[2];
  2928. float r3 = right.scalars[3];
  2929. float r4 = right.scalars[4];
  2930. float r5 = right.scalars[5];
  2931. float r6 = right.scalars[6];
  2932. float r7 = right.scalars[7];
  2933. if (r0 > v0) { v0 = r0; }
  2934. if (r1 > v1) { v1 = r1; }
  2935. if (r2 > v2) { v2 = r2; }
  2936. if (r3 > v3) { v3 = r3; }
  2937. if (r4 > v4) { v4 = r4; }
  2938. if (r5 > v5) { v5 = r5; }
  2939. if (r6 > v6) { v6 = r6; }
  2940. if (r7 > v7) { v7 = r7; }
  2941. return F32x8(v0, v1, v2, v3, v4, v5, v6, v7);
  2942. #endif
  2943. }
  2944. inline I32x8 operator+(const I32x8& left, const I32x8& right) {
  2945. #if defined USE_256BIT_X_SIMD
  2946. return I32x8(ADD_I32_SIMD256(left.v, right.v));
  2947. #else
  2948. return I32x8(
  2949. left.scalars[0] + right.scalars[0],
  2950. left.scalars[1] + right.scalars[1],
  2951. left.scalars[2] + right.scalars[2],
  2952. left.scalars[3] + right.scalars[3],
  2953. left.scalars[4] + right.scalars[4],
  2954. left.scalars[5] + right.scalars[5],
  2955. left.scalars[6] + right.scalars[6],
  2956. left.scalars[7] + right.scalars[7]);
  2957. #endif
  2958. }
  2959. inline I32x8 operator-(const I32x8& left, const I32x8& right) {
  2960. #if defined USE_256BIT_X_SIMD
  2961. return I32x8(SUB_I32_SIMD256(left.v, right.v));
  2962. #else
  2963. return I32x8(
  2964. left.scalars[0] - right.scalars[0],
  2965. left.scalars[1] - right.scalars[1],
  2966. left.scalars[2] - right.scalars[2],
  2967. left.scalars[3] - right.scalars[3],
  2968. left.scalars[4] - right.scalars[4],
  2969. left.scalars[5] - right.scalars[5],
  2970. left.scalars[6] - right.scalars[6],
  2971. left.scalars[7] - right.scalars[7]);
  2972. #endif
  2973. }
  2974. inline I32x8 operator*(const I32x8& left, const I32x8& right) {
  2975. #if defined USE_AVX2
  2976. return I32x8(MUL_I32_SIMD256(left.v, right.v));
  2977. #else
  2978. return I32x8(
  2979. left.scalars[0] * right.scalars[0],
  2980. left.scalars[1] * right.scalars[1],
  2981. left.scalars[2] * right.scalars[2],
  2982. left.scalars[3] * right.scalars[3],
  2983. left.scalars[4] * right.scalars[4],
  2984. left.scalars[5] * right.scalars[5],
  2985. left.scalars[6] * right.scalars[6],
  2986. left.scalars[7] * right.scalars[7]
  2987. );
  2988. #endif
  2989. }
  2990. inline U32x8 operator+(const U32x8& left, const U32x8& right) {
  2991. #if defined USE_256BIT_X_SIMD
  2992. return U32x8(ADD_U32_SIMD256(left.v, right.v));
  2993. #else
  2994. return U32x8(
  2995. left.scalars[0] + right.scalars[0],
  2996. left.scalars[1] + right.scalars[1],
  2997. left.scalars[2] + right.scalars[2],
  2998. left.scalars[3] + right.scalars[3],
  2999. left.scalars[4] + right.scalars[4],
  3000. left.scalars[5] + right.scalars[5],
  3001. left.scalars[6] + right.scalars[6],
  3002. left.scalars[7] + right.scalars[7]
  3003. );
  3004. #endif
  3005. }
  3006. inline U32x8 operator-(const U32x8& left, const U32x8& right) {
  3007. #if defined USE_256BIT_X_SIMD
  3008. return U32x8(SUB_U32_SIMD256(left.v, right.v));
  3009. #else
  3010. return U32x8(
  3011. left.scalars[0] - right.scalars[0],
  3012. left.scalars[1] - right.scalars[1],
  3013. left.scalars[2] - right.scalars[2],
  3014. left.scalars[3] - right.scalars[3],
  3015. left.scalars[4] - right.scalars[4],
  3016. left.scalars[5] - right.scalars[5],
  3017. left.scalars[6] - right.scalars[6],
  3018. left.scalars[7] - right.scalars[7]
  3019. );
  3020. #endif
  3021. }
  3022. inline U32x8 operator*(const U32x8& left, const U32x8& right) {
  3023. #if defined USE_256BIT_X_SIMD
  3024. return U32x8(MUL_U32_SIMD256(left.v, right.v));
  3025. #else
  3026. return U32x8(
  3027. left.scalars[0] * right.scalars[0],
  3028. left.scalars[1] * right.scalars[1],
  3029. left.scalars[2] * right.scalars[2],
  3030. left.scalars[3] * right.scalars[3],
  3031. left.scalars[4] * right.scalars[4],
  3032. left.scalars[5] * right.scalars[5],
  3033. left.scalars[6] * right.scalars[6],
  3034. left.scalars[7] * right.scalars[7]
  3035. );
  3036. #endif
  3037. }
  3038. inline U32x8 operator&(const U32x8& left, const U32x8& right) {
  3039. #if defined USE_256BIT_X_SIMD
  3040. return U32x8(BITWISE_AND_U32_SIMD256(left.v, right.v));
  3041. #else
  3042. return U32x8(
  3043. left.scalars[0] & right.scalars[0],
  3044. left.scalars[1] & right.scalars[1],
  3045. left.scalars[2] & right.scalars[2],
  3046. left.scalars[3] & right.scalars[3],
  3047. left.scalars[4] & right.scalars[4],
  3048. left.scalars[5] & right.scalars[5],
  3049. left.scalars[6] & right.scalars[6],
  3050. left.scalars[7] & right.scalars[7]
  3051. );
  3052. #endif
  3053. }
  3054. inline U32x8 operator|(const U32x8& left, const U32x8& right) {
  3055. #if defined USE_256BIT_X_SIMD
  3056. return U32x8(BITWISE_OR_U32_SIMD256(left.v, right.v));
  3057. #else
  3058. return U32x8(
  3059. left.scalars[0] | right.scalars[0],
  3060. left.scalars[1] | right.scalars[1],
  3061. left.scalars[2] | right.scalars[2],
  3062. left.scalars[3] | right.scalars[3],
  3063. left.scalars[4] | right.scalars[4],
  3064. left.scalars[5] | right.scalars[5],
  3065. left.scalars[6] | right.scalars[6],
  3066. left.scalars[7] | right.scalars[7]
  3067. );
  3068. #endif
  3069. }
  3070. inline U32x8 operator^(const U32x8& left, const U32x8& right) {
  3071. #if defined USE_256BIT_X_SIMD
  3072. return U32x8(BITWISE_XOR_U32_SIMD256(left.v, right.v));
  3073. #else
  3074. return U32x8(
  3075. left.scalars[0] ^ right.scalars[0],
  3076. left.scalars[1] ^ right.scalars[1],
  3077. left.scalars[2] ^ right.scalars[2],
  3078. left.scalars[3] ^ right.scalars[3],
  3079. left.scalars[4] ^ right.scalars[4],
  3080. left.scalars[5] ^ right.scalars[5],
  3081. left.scalars[6] ^ right.scalars[6],
  3082. left.scalars[7] ^ right.scalars[7]
  3083. );
  3084. #endif
  3085. }
  3086. inline U32x8 operator~(const U32x8& value) {
  3087. #if defined USE_BASIC_SIMD
  3088. return value ^ U32x8(~uint32_t(0));
  3089. #else
  3090. return U32x8(
  3091. ~value.scalars[0],
  3092. ~value.scalars[1],
  3093. ~value.scalars[2],
  3094. ~value.scalars[3],
  3095. ~value.scalars[4],
  3096. ~value.scalars[5],
  3097. ~value.scalars[6],
  3098. ~value.scalars[7]
  3099. );
  3100. #endif
  3101. }
3102. // ARM NEON does not support 256-bit vectors, and AVX2 only provides per-lane variable shifts for some lane widths, so these operators fall back to scalar code.
  3103. inline U32x8 operator<<(const U32x8& left, const U32x8 &bitOffsets) {
  3104. assert(allLanesLesser(bitOffsets, U32x8(32u)));
  3105. return U32x8(
  3106. left.scalars[0] << bitOffsets.scalars[0],
  3107. left.scalars[1] << bitOffsets.scalars[1],
  3108. left.scalars[2] << bitOffsets.scalars[2],
  3109. left.scalars[3] << bitOffsets.scalars[3],
  3110. left.scalars[4] << bitOffsets.scalars[4],
  3111. left.scalars[5] << bitOffsets.scalars[5],
  3112. left.scalars[6] << bitOffsets.scalars[6],
  3113. left.scalars[7] << bitOffsets.scalars[7]
  3114. );
  3115. }
  3116. inline U32x8 operator>>(const U32x8& left, const U32x8 &bitOffsets) {
  3117. assert(allLanesLesser(bitOffsets, U32x8(32u)));
  3118. return U32x8(
  3119. left.scalars[0] >> bitOffsets.scalars[0],
  3120. left.scalars[1] >> bitOffsets.scalars[1],
  3121. left.scalars[2] >> bitOffsets.scalars[2],
  3122. left.scalars[3] >> bitOffsets.scalars[3],
  3123. left.scalars[4] >> bitOffsets.scalars[4],
  3124. left.scalars[5] >> bitOffsets.scalars[5],
  3125. left.scalars[6] >> bitOffsets.scalars[6],
  3126. left.scalars[7] >> bitOffsets.scalars[7]
  3127. );
  3128. }
  3129. // bitOffset must be an immediate constant from 0 to 31, so a template argument is used.
  3130. template <uint32_t bitOffset>
  3131. inline U32x8 bitShiftLeftImmediate(const U32x8& left) {
  3132. static_assert(bitOffset < 32u, "Immediate left shift of 32-bit values may not shift more than 31 bits!");
  3133. #if defined USE_AVX2
  3134. return U32x8(_mm256_slli_epi32(left.v, bitOffset));
  3135. #else
  3136. return U32x8(
  3137. left.scalars[0] << bitOffset,
  3138. left.scalars[1] << bitOffset,
  3139. left.scalars[2] << bitOffset,
  3140. left.scalars[3] << bitOffset,
  3141. left.scalars[4] << bitOffset,
  3142. left.scalars[5] << bitOffset,
  3143. left.scalars[6] << bitOffset,
  3144. left.scalars[7] << bitOffset
  3145. );
  3146. #endif
  3147. }
  3148. // bitOffset must be an immediate constant from 0 to 31, so a template argument is used.
  3149. template <uint32_t bitOffset>
  3150. inline U32x8 bitShiftRightImmediate(const U32x8& left) {
  3151. static_assert(bitOffset < 32u, "Immediate right shift of 32-bit values may not shift more than 31 bits!");
  3152. #if defined USE_AVX2
  3153. return U32x8(_mm256_srli_epi32(left.v, bitOffset));
  3154. #else
  3155. return U32x8(
  3156. left.scalars[0] >> bitOffset,
  3157. left.scalars[1] >> bitOffset,
  3158. left.scalars[2] >> bitOffset,
  3159. left.scalars[3] >> bitOffset,
  3160. left.scalars[4] >> bitOffset,
  3161. left.scalars[5] >> bitOffset,
  3162. left.scalars[6] >> bitOffset,
  3163. left.scalars[7] >> bitOffset
  3164. );
  3165. #endif
  3166. }
  3167. inline U16x16 operator<<(const U16x16& left, const U16x16 &bitOffsets) {
  3168. assert(allLanesLesser(bitOffsets, U16x16(16u)));
  3169. return U16x16(
  3170. left.scalars[ 0] << bitOffsets.scalars[ 0],
  3171. left.scalars[ 1] << bitOffsets.scalars[ 1],
  3172. left.scalars[ 2] << bitOffsets.scalars[ 2],
  3173. left.scalars[ 3] << bitOffsets.scalars[ 3],
  3174. left.scalars[ 4] << bitOffsets.scalars[ 4],
  3175. left.scalars[ 5] << bitOffsets.scalars[ 5],
  3176. left.scalars[ 6] << bitOffsets.scalars[ 6],
  3177. left.scalars[ 7] << bitOffsets.scalars[ 7],
  3178. left.scalars[ 8] << bitOffsets.scalars[ 8],
  3179. left.scalars[ 9] << bitOffsets.scalars[ 9],
  3180. left.scalars[10] << bitOffsets.scalars[10],
  3181. left.scalars[11] << bitOffsets.scalars[11],
  3182. left.scalars[12] << bitOffsets.scalars[12],
  3183. left.scalars[13] << bitOffsets.scalars[13],
  3184. left.scalars[14] << bitOffsets.scalars[14],
  3185. left.scalars[15] << bitOffsets.scalars[15]
  3186. );
  3187. }
  3188. inline U16x16 operator>>(const U16x16& left, const U16x16 &bitOffsets) {
  3189. assert(allLanesLesser(bitOffsets, U16x16(16u)));
  3190. return U16x16(
  3191. left.scalars[ 0] >> bitOffsets.scalars[ 0],
  3192. left.scalars[ 1] >> bitOffsets.scalars[ 1],
  3193. left.scalars[ 2] >> bitOffsets.scalars[ 2],
  3194. left.scalars[ 3] >> bitOffsets.scalars[ 3],
  3195. left.scalars[ 4] >> bitOffsets.scalars[ 4],
  3196. left.scalars[ 5] >> bitOffsets.scalars[ 5],
  3197. left.scalars[ 6] >> bitOffsets.scalars[ 6],
  3198. left.scalars[ 7] >> bitOffsets.scalars[ 7],
  3199. left.scalars[ 8] >> bitOffsets.scalars[ 8],
  3200. left.scalars[ 9] >> bitOffsets.scalars[ 9],
  3201. left.scalars[10] >> bitOffsets.scalars[10],
  3202. left.scalars[11] >> bitOffsets.scalars[11],
  3203. left.scalars[12] >> bitOffsets.scalars[12],
  3204. left.scalars[13] >> bitOffsets.scalars[13],
  3205. left.scalars[14] >> bitOffsets.scalars[14],
  3206. left.scalars[15] >> bitOffsets.scalars[15]
  3207. );
  3208. }
3209. // bitOffset must be an immediate constant from 0 to 15, so a template argument is used.
  3210. template <uint32_t bitOffset>
  3211. inline U16x16 bitShiftLeftImmediate(const U16x16& left) {
  3212. static_assert(bitOffset < 16u, "Immediate left shift of 16-bit values may not shift more than 15 bits!");
  3213. #if defined USE_AVX2
  3214. return U16x16(_mm256_slli_epi16(left.v, bitOffset));
  3215. #else
  3216. return U16x16(
  3217. left.scalars[ 0] << bitOffset,
  3218. left.scalars[ 1] << bitOffset,
  3219. left.scalars[ 2] << bitOffset,
  3220. left.scalars[ 3] << bitOffset,
  3221. left.scalars[ 4] << bitOffset,
  3222. left.scalars[ 5] << bitOffset,
  3223. left.scalars[ 6] << bitOffset,
  3224. left.scalars[ 7] << bitOffset,
  3225. left.scalars[ 8] << bitOffset,
  3226. left.scalars[ 9] << bitOffset,
  3227. left.scalars[10] << bitOffset,
  3228. left.scalars[11] << bitOffset,
  3229. left.scalars[12] << bitOffset,
  3230. left.scalars[13] << bitOffset,
  3231. left.scalars[14] << bitOffset,
  3232. left.scalars[15] << bitOffset
  3233. );
  3234. #endif
  3235. }
3236. // bitOffset must be an immediate constant from 0 to 15, so a template argument is used.
  3237. template <uint32_t bitOffset>
  3238. inline U16x16 bitShiftRightImmediate(const U16x16& left) {
  3239. static_assert(bitOffset < 16u, "Immediate right shift of 16-bit values may not shift more than 15 bits!");
  3240. #if defined USE_AVX2
  3241. return U16x16(_mm256_srli_epi16(left.v, bitOffset));
  3242. #else
  3243. return U16x16(
  3244. left.scalars[ 0] >> bitOffset,
  3245. left.scalars[ 1] >> bitOffset,
  3246. left.scalars[ 2] >> bitOffset,
  3247. left.scalars[ 3] >> bitOffset,
  3248. left.scalars[ 4] >> bitOffset,
  3249. left.scalars[ 5] >> bitOffset,
  3250. left.scalars[ 6] >> bitOffset,
  3251. left.scalars[ 7] >> bitOffset,
  3252. left.scalars[ 8] >> bitOffset,
  3253. left.scalars[ 9] >> bitOffset,
  3254. left.scalars[10] >> bitOffset,
  3255. left.scalars[11] >> bitOffset,
  3256. left.scalars[12] >> bitOffset,
  3257. left.scalars[13] >> bitOffset,
  3258. left.scalars[14] >> bitOffset,
  3259. left.scalars[15] >> bitOffset
  3260. );
  3261. #endif
  3262. }
  3263. inline U8x32 operator<<(const U8x32& left, const U8x32 &bitOffsets) {
3264. assert(allLanesLesser(bitOffsets, U8x32(8u)));
  3265. return U8x32(
  3266. left.scalars[ 0] << bitOffsets.scalars[ 0],
  3267. left.scalars[ 1] << bitOffsets.scalars[ 1],
  3268. left.scalars[ 2] << bitOffsets.scalars[ 2],
  3269. left.scalars[ 3] << bitOffsets.scalars[ 3],
  3270. left.scalars[ 4] << bitOffsets.scalars[ 4],
  3271. left.scalars[ 5] << bitOffsets.scalars[ 5],
  3272. left.scalars[ 6] << bitOffsets.scalars[ 6],
  3273. left.scalars[ 7] << bitOffsets.scalars[ 7],
  3274. left.scalars[ 8] << bitOffsets.scalars[ 8],
  3275. left.scalars[ 9] << bitOffsets.scalars[ 9],
  3276. left.scalars[10] << bitOffsets.scalars[10],
  3277. left.scalars[11] << bitOffsets.scalars[11],
  3278. left.scalars[12] << bitOffsets.scalars[12],
  3279. left.scalars[13] << bitOffsets.scalars[13],
  3280. left.scalars[14] << bitOffsets.scalars[14],
  3281. left.scalars[15] << bitOffsets.scalars[15],
  3282. left.scalars[16] << bitOffsets.scalars[16],
  3283. left.scalars[17] << bitOffsets.scalars[17],
  3284. left.scalars[18] << bitOffsets.scalars[18],
  3285. left.scalars[19] << bitOffsets.scalars[19],
  3286. left.scalars[20] << bitOffsets.scalars[20],
  3287. left.scalars[21] << bitOffsets.scalars[21],
  3288. left.scalars[22] << bitOffsets.scalars[22],
  3289. left.scalars[23] << bitOffsets.scalars[23],
  3290. left.scalars[24] << bitOffsets.scalars[24],
  3291. left.scalars[25] << bitOffsets.scalars[25],
  3292. left.scalars[26] << bitOffsets.scalars[26],
  3293. left.scalars[27] << bitOffsets.scalars[27],
  3294. left.scalars[28] << bitOffsets.scalars[28],
  3295. left.scalars[29] << bitOffsets.scalars[29],
  3296. left.scalars[30] << bitOffsets.scalars[30],
  3297. left.scalars[31] << bitOffsets.scalars[31]
  3298. );
  3299. }
  3300. inline U8x32 operator>>(const U8x32& left, const U8x32 &bitOffsets) {
  3301. assert(allLanesLesser(bitOffsets, U8x32(8u)));
  3302. return U8x32(
  3303. left.scalars[ 0] >> bitOffsets.scalars[ 0],
  3304. left.scalars[ 1] >> bitOffsets.scalars[ 1],
  3305. left.scalars[ 2] >> bitOffsets.scalars[ 2],
  3306. left.scalars[ 3] >> bitOffsets.scalars[ 3],
  3307. left.scalars[ 4] >> bitOffsets.scalars[ 4],
  3308. left.scalars[ 5] >> bitOffsets.scalars[ 5],
  3309. left.scalars[ 6] >> bitOffsets.scalars[ 6],
  3310. left.scalars[ 7] >> bitOffsets.scalars[ 7],
  3311. left.scalars[ 8] >> bitOffsets.scalars[ 8],
  3312. left.scalars[ 9] >> bitOffsets.scalars[ 9],
  3313. left.scalars[10] >> bitOffsets.scalars[10],
  3314. left.scalars[11] >> bitOffsets.scalars[11],
  3315. left.scalars[12] >> bitOffsets.scalars[12],
  3316. left.scalars[13] >> bitOffsets.scalars[13],
  3317. left.scalars[14] >> bitOffsets.scalars[14],
  3318. left.scalars[15] >> bitOffsets.scalars[15],
  3319. left.scalars[16] >> bitOffsets.scalars[16],
  3320. left.scalars[17] >> bitOffsets.scalars[17],
  3321. left.scalars[18] >> bitOffsets.scalars[18],
  3322. left.scalars[19] >> bitOffsets.scalars[19],
  3323. left.scalars[20] >> bitOffsets.scalars[20],
  3324. left.scalars[21] >> bitOffsets.scalars[21],
  3325. left.scalars[22] >> bitOffsets.scalars[22],
  3326. left.scalars[23] >> bitOffsets.scalars[23],
  3327. left.scalars[24] >> bitOffsets.scalars[24],
  3328. left.scalars[25] >> bitOffsets.scalars[25],
  3329. left.scalars[26] >> bitOffsets.scalars[26],
  3330. left.scalars[27] >> bitOffsets.scalars[27],
  3331. left.scalars[28] >> bitOffsets.scalars[28],
  3332. left.scalars[29] >> bitOffsets.scalars[29],
  3333. left.scalars[30] >> bitOffsets.scalars[30],
  3334. left.scalars[31] >> bitOffsets.scalars[31]
  3335. );
  3336. }
  3337. // bitOffset must be an immediate constant from 0 to 7, so a template argument is used.
  3338. template <uint32_t bitOffset>
  3339. inline U8x32 bitShiftLeftImmediate(const U8x32& left) {
  3340. static_assert(bitOffset < 8u, "Immediate left shift of 8-bit values may not shift more than 7 bits!");
  3341. // TODO: Use a larger lane and a mask generated in compile time.
  3342. return U8x32(
  3343. left.scalars[ 0] << bitOffset,
  3344. left.scalars[ 1] << bitOffset,
  3345. left.scalars[ 2] << bitOffset,
  3346. left.scalars[ 3] << bitOffset,
  3347. left.scalars[ 4] << bitOffset,
  3348. left.scalars[ 5] << bitOffset,
  3349. left.scalars[ 6] << bitOffset,
  3350. left.scalars[ 7] << bitOffset,
  3351. left.scalars[ 8] << bitOffset,
  3352. left.scalars[ 9] << bitOffset,
  3353. left.scalars[10] << bitOffset,
  3354. left.scalars[11] << bitOffset,
  3355. left.scalars[12] << bitOffset,
  3356. left.scalars[13] << bitOffset,
  3357. left.scalars[14] << bitOffset,
  3358. left.scalars[15] << bitOffset,
  3359. left.scalars[16] << bitOffset,
  3360. left.scalars[17] << bitOffset,
  3361. left.scalars[18] << bitOffset,
  3362. left.scalars[19] << bitOffset,
  3363. left.scalars[20] << bitOffset,
  3364. left.scalars[21] << bitOffset,
  3365. left.scalars[22] << bitOffset,
  3366. left.scalars[23] << bitOffset,
  3367. left.scalars[24] << bitOffset,
  3368. left.scalars[25] << bitOffset,
  3369. left.scalars[26] << bitOffset,
  3370. left.scalars[27] << bitOffset,
  3371. left.scalars[28] << bitOffset,
  3372. left.scalars[29] << bitOffset,
  3373. left.scalars[30] << bitOffset,
  3374. left.scalars[31] << bitOffset
  3375. );
  3376. }
  3377. // bitOffset must be an immediate constant from 0 to 7, so a template argument is used.
  3378. template <uint32_t bitOffset>
  3379. inline U8x32 bitShiftRightImmediate(const U8x32& left) {
  3380. static_assert(bitOffset < 8u, "Immediate right shift of 8-bit values may not shift more than 7 bits!");
  3381. // TODO: Use a larger lane and a mask generated in compile time.
  3382. return U8x32(
  3383. left.scalars[ 0] >> bitOffset,
  3384. left.scalars[ 1] >> bitOffset,
  3385. left.scalars[ 2] >> bitOffset,
  3386. left.scalars[ 3] >> bitOffset,
  3387. left.scalars[ 4] >> bitOffset,
  3388. left.scalars[ 5] >> bitOffset,
  3389. left.scalars[ 6] >> bitOffset,
  3390. left.scalars[ 7] >> bitOffset,
  3391. left.scalars[ 8] >> bitOffset,
  3392. left.scalars[ 9] >> bitOffset,
  3393. left.scalars[10] >> bitOffset,
  3394. left.scalars[11] >> bitOffset,
  3395. left.scalars[12] >> bitOffset,
  3396. left.scalars[13] >> bitOffset,
  3397. left.scalars[14] >> bitOffset,
  3398. left.scalars[15] >> bitOffset,
  3399. left.scalars[16] >> bitOffset,
  3400. left.scalars[17] >> bitOffset,
  3401. left.scalars[18] >> bitOffset,
  3402. left.scalars[19] >> bitOffset,
  3403. left.scalars[20] >> bitOffset,
  3404. left.scalars[21] >> bitOffset,
  3405. left.scalars[22] >> bitOffset,
  3406. left.scalars[23] >> bitOffset,
  3407. left.scalars[24] >> bitOffset,
  3408. left.scalars[25] >> bitOffset,
  3409. left.scalars[26] >> bitOffset,
  3410. left.scalars[27] >> bitOffset,
  3411. left.scalars[28] >> bitOffset,
  3412. left.scalars[29] >> bitOffset,
  3413. left.scalars[30] >> bitOffset,
  3414. left.scalars[31] >> bitOffset
  3415. );
  3416. }
  3417. inline U16x16 operator+(const U16x16& left, const U16x16& right) {
  3418. #if defined USE_256BIT_X_SIMD
  3419. return U16x16(ADD_U16_SIMD256(left.v, right.v));
  3420. #else
  3421. return U16x16(
  3422. left.scalars[0] + right.scalars[0],
  3423. left.scalars[1] + right.scalars[1],
  3424. left.scalars[2] + right.scalars[2],
  3425. left.scalars[3] + right.scalars[3],
  3426. left.scalars[4] + right.scalars[4],
  3427. left.scalars[5] + right.scalars[5],
  3428. left.scalars[6] + right.scalars[6],
  3429. left.scalars[7] + right.scalars[7],
  3430. left.scalars[8] + right.scalars[8],
  3431. left.scalars[9] + right.scalars[9],
  3432. left.scalars[10] + right.scalars[10],
  3433. left.scalars[11] + right.scalars[11],
  3434. left.scalars[12] + right.scalars[12],
  3435. left.scalars[13] + right.scalars[13],
  3436. left.scalars[14] + right.scalars[14],
  3437. left.scalars[15] + right.scalars[15]
  3438. );
  3439. #endif
  3440. }
  3441. inline U16x16 operator-(const U16x16& left, const U16x16& right) {
  3442. #if defined USE_256BIT_X_SIMD
  3443. return U16x16(SUB_U16_SIMD256(left.v, right.v));
  3444. #else
  3445. return U16x16(
  3446. left.scalars[0] - right.scalars[0],
  3447. left.scalars[1] - right.scalars[1],
  3448. left.scalars[2] - right.scalars[2],
  3449. left.scalars[3] - right.scalars[3],
  3450. left.scalars[4] - right.scalars[4],
  3451. left.scalars[5] - right.scalars[5],
  3452. left.scalars[6] - right.scalars[6],
  3453. left.scalars[7] - right.scalars[7],
  3454. left.scalars[8] - right.scalars[8],
  3455. left.scalars[9] - right.scalars[9],
  3456. left.scalars[10] - right.scalars[10],
  3457. left.scalars[11] - right.scalars[11],
  3458. left.scalars[12] - right.scalars[12],
  3459. left.scalars[13] - right.scalars[13],
  3460. left.scalars[14] - right.scalars[14],
  3461. left.scalars[15] - right.scalars[15]
  3462. );
  3463. #endif
  3464. }
  3465. inline U16x16 operator*(const U16x16& left, const U16x16& right) {
  3466. #if defined USE_256BIT_X_SIMD
  3467. return U16x16(MUL_U16_SIMD256(left.v, right.v));
  3468. #else
  3469. return U16x16(
  3470. left.scalars[0] * right.scalars[0],
  3471. left.scalars[1] * right.scalars[1],
  3472. left.scalars[2] * right.scalars[2],
  3473. left.scalars[3] * right.scalars[3],
  3474. left.scalars[4] * right.scalars[4],
  3475. left.scalars[5] * right.scalars[5],
  3476. left.scalars[6] * right.scalars[6],
  3477. left.scalars[7] * right.scalars[7],
  3478. left.scalars[8] * right.scalars[8],
  3479. left.scalars[9] * right.scalars[9],
  3480. left.scalars[10] * right.scalars[10],
  3481. left.scalars[11] * right.scalars[11],
  3482. left.scalars[12] * right.scalars[12],
  3483. left.scalars[13] * right.scalars[13],
  3484. left.scalars[14] * right.scalars[14],
  3485. left.scalars[15] * right.scalars[15]
  3486. );
  3487. #endif
  3488. }
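// Example (illustrative sketch, not part of the library API): integer lane arithmetic wraps
// around modulo 65536, which these operators rely on. The helper name is hypothetical.
static inline U16x16 example_multiplyAdd(const U16x16 &a, const U16x16 &b, const U16x16 &c) {
	// Each 16-bit lane independently computes a * b + c with wrap-around on overflow.
	return a * b + c;
}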
  3489. inline U8x32 operator+(const U8x32& left, const U8x32& right) {
  3490. #if defined USE_256BIT_X_SIMD
  3491. return U8x32(ADD_U8_SIMD256(left.v, right.v));
  3492. #else
  3493. U8x32 result = U8x32::create_dangerous_uninitialized();
  3494. for (int i = 0; i < 32; i++) {
  3495. result.scalars[i] = left.scalars[i] + right.scalars[i];
  3496. }
  3497. return result;
  3498. #endif
  3499. }
  3500. inline U8x32 operator-(const U8x32& left, const U8x32& right) {
  3501. #if defined USE_256BIT_X_SIMD
  3502. return U8x32(SUB_U8_SIMD256(left.v, right.v));
  3503. #else
  3504. U8x32 result = U8x32::create_dangerous_uninitialized();
  3505. for (int i = 0; i < 32; i++) {
  3506. result.scalars[i] = left.scalars[i] - right.scalars[i];
  3507. }
  3508. return result;
  3509. #endif
  3510. }
  3511. inline U8x32 saturatedAddition(const U8x32& left, const U8x32& right) {
  3512. #if defined USE_256BIT_X_SIMD
  3513. return U8x32(ADD_SAT_U8_SIMD256(left.v, right.v));
  3514. #else
  3515. U8x32 result = U8x32::create_dangerous_uninitialized();
  3516. for (int i = 0; i < 32; i++) {
  3517. result.scalars[i] = impl_limit255((uint32_t)left.scalars[i] + (uint32_t)right.scalars[i]);
  3518. }
  3519. return result;
  3520. #endif
  3521. }
  3522. inline U8x32 saturatedSubtraction(const U8x32& left, const U8x32& right) {
  3523. #if defined USE_256BIT_X_SIMD
  3524. return U8x32(SUB_SAT_U8_SIMD256(left.v, right.v));
  3525. #else
  3526. U8x32 result = U8x32::create_dangerous_uninitialized();
  3527. for (int i = 0; i < 32; i++) {
  3528. result.scalars[i] = impl_limit0((int32_t)left.scalars[i] - (int32_t)right.scalars[i]);
  3529. }
  3530. return result;
  3531. #endif
  3532. }
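// Example (illustrative sketch, not part of the library API): brightening 8-bit pixels without
// wrap-around. With the plain + operator, 250 + 20 would wrap to 14, while saturatedAddition
// clamps the result to 255. The helper name and the offset of 20 are only for illustration.
static inline U8x32 example_brightenClamped(const U8x32 &pixels) {
	return saturatedAddition(pixels, U8x32(20u));
}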
  3533. inline I32x8 truncateToI32(const F32x8& vector) {
  3534. #if defined USE_256BIT_X_SIMD
  3535. return I32x8(F32_TO_I32_SIMD256(vector.v));
  3536. #else
  3537. return I32x8(
  3538. (int32_t)vector.scalars[0], (int32_t)vector.scalars[1], (int32_t)vector.scalars[2], (int32_t)vector.scalars[3],
  3539. (int32_t)vector.scalars[4], (int32_t)vector.scalars[5], (int32_t)vector.scalars[6], (int32_t)vector.scalars[7]
  3540. );
  3541. #endif
  3542. }
  3543. inline U32x8 truncateToU32(const F32x8& vector) {
  3544. #if defined USE_256BIT_X_SIMD
  3545. return U32x8(F32_TO_U32_SIMD256(vector.v));
  3546. #else
  3547. return U32x8(
  3548. (uint32_t)vector.scalars[0], (uint32_t)vector.scalars[1], (uint32_t)vector.scalars[2], (uint32_t)vector.scalars[3],
  3549. (uint32_t)vector.scalars[4], (uint32_t)vector.scalars[5], (uint32_t)vector.scalars[6], (uint32_t)vector.scalars[7]
  3550. );
  3551. #endif
  3552. }
  3553. inline F32x8 floatFromI32(const I32x8& vector) {
  3554. #if defined USE_256BIT_X_SIMD
  3555. return F32x8(I32_TO_F32_SIMD256(vector.v));
  3556. #else
  3557. return F32x8(
  3558. (float)vector.scalars[0], (float)vector.scalars[1], (float)vector.scalars[2], (float)vector.scalars[3],
  3559. (float)vector.scalars[4], (float)vector.scalars[5], (float)vector.scalars[6], (float)vector.scalars[7]
  3560. );
  3561. #endif
  3562. }
  3563. inline F32x8 floatFromU32(const U32x8& vector) {
  3564. #if defined USE_256BIT_X_SIMD
  3565. return F32x8(U32_TO_F32_SIMD256(vector.v));
  3566. #else
  3567. return F32x8(
  3568. (float)vector.scalars[0], (float)vector.scalars[1], (float)vector.scalars[2], (float)vector.scalars[3],
  3569. (float)vector.scalars[4], (float)vector.scalars[5], (float)vector.scalars[6], (float)vector.scalars[7]
  3570. );
  3571. #endif
  3572. }
  3573. inline I32x8 I32FromU32(const U32x8& vector) {
  3574. #if defined USE_256BIT_X_SIMD
  3575. return I32x8(REINTERPRET_U32_TO_I32_SIMD256(vector.v));
  3576. #else
  3577. return I32x8(
  3578. (int32_t)vector.scalars[0], (int32_t)vector.scalars[1], (int32_t)vector.scalars[2], (int32_t)vector.scalars[3],
  3579. (int32_t)vector.scalars[4], (int32_t)vector.scalars[5], (int32_t)vector.scalars[6], (int32_t)vector.scalars[7]
  3580. );
  3581. #endif
  3582. }
  3583. inline U32x8 U32FromI32(const I32x8& vector) {
  3584. #if defined USE_256BIT_X_SIMD
  3585. return U32x8(REINTERPRET_I32_TO_U32_SIMD256(vector.v));
  3586. #else
  3587. return U32x8(
  3588. (uint32_t)vector.scalars[0], (uint32_t)vector.scalars[1], (uint32_t)vector.scalars[2], (uint32_t)vector.scalars[3],
  3589. (uint32_t)vector.scalars[4], (uint32_t)vector.scalars[5], (uint32_t)vector.scalars[6], (uint32_t)vector.scalars[7]
  3590. );
  3591. #endif
  3592. }
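// Example (illustrative sketch, not part of the library API): truncateToI32 rounds towards zero,
// so converting to integers and back removes the fractional part of each lane. The helper name
// is hypothetical.
static inline F32x8 example_truncateFractions(const F32x8 &values) {
	return floatFromI32(truncateToI32(values));
}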
  3593. // Warning! Behavior depends on endianness.
  3594. inline U8x32 reinterpret_U8FromU32(const U32x8& vector) {
  3595. #if defined USE_256BIT_X_SIMD
  3596. return U8x32(REINTERPRET_U32_TO_U8_SIMD256(vector.v));
  3597. #else
  3598. const uint8_t *source = (const uint8_t*)vector.scalars;
  3599. U8x32 result = U8x32::create_dangerous_uninitialized();
  3600. for (int i = 0; i < 32; i++) {
  3601. result.scalars[i] = source[i];
  3602. }
  3603. return result;
  3604. #endif
  3605. }
  3606. // Warning! Behavior depends on endianness.
  3607. inline U32x8 reinterpret_U32FromU8(const U8x32& vector) {
  3608. #if defined USE_256BIT_X_SIMD
  3609. return U32x8(REINTERPRET_U8_TO_U32_SIMD256(vector.v));
  3610. #else
  3611. const uint32_t *source = (const uint32_t*)vector.scalars;
  3612. return U32x8(source[0], source[1], source[2], source[3], source[4], source[5], source[6], source[7]);
  3613. #endif
  3614. }
  3615. // Warning! Behavior depends on endianness.
  3616. inline U16x16 reinterpret_U16FromU32(const U32x8& vector) {
  3617. #if defined USE_256BIT_X_SIMD
  3618. return U16x16(REINTERPRET_U32_TO_U16_SIMD256(vector.v));
  3619. #else
  3620. const uint16_t *source = (const uint16_t*)vector.scalars;
  3621. return U16x16(
  3622. source[0], source[1], source[2] , source[3] , source[4] , source[5] , source[6] , source[7] ,
  3623. source[8], source[9], source[10], source[11], source[12], source[13], source[14], source[15]
  3624. );
  3625. #endif
  3626. }
  3627. // Warning! Behavior depends on endianness.
  3628. inline U32x8 reinterpret_U32FromU16(const U16x16& vector) {
  3629. #if defined USE_256BIT_X_SIMD
  3630. return U32x8(REINTERPRET_U16_TO_U32_SIMD256(vector.v));
  3631. #else
  3632. const uint32_t *source = (const uint32_t*)vector.scalars;
  3633. return U32x8(source[0], source[1], source[2], source[3], source[4], source[5], source[6], source[7]);
  3634. #endif
  3635. }
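// Example (illustrative sketch, not part of the library API): reinterpreting 32-bit lanes as
// bytes to process packed 8-bit channels, then packing them back. Which byte lands in which
// position within each 32-bit lane depends on endianness, as warned above. The helper name
// is hypothetical.
static inline U32x8 example_addBytewise(const U32x8 &packedColors, const U8x32 &offsets) {
	U8x32 bytes = reinterpret_U8FromU32(packedColors);
	return reinterpret_U32FromU8(saturatedAddition(bytes, offsets));
}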
  3636. // Unpacking to larger integers
  3637. inline U32x8 lowerToU32(const U16x16& vector) {
  3638. #if defined USE_256BIT_X_SIMD
  3639. return U32x8(U16_LOW_TO_U32_SIMD256(vector.v));
  3640. #else
  3641. return U32x8(vector.scalars[0], vector.scalars[1], vector.scalars[2], vector.scalars[3], vector.scalars[4], vector.scalars[5], vector.scalars[6], vector.scalars[7]);
  3642. #endif
  3643. }
  3644. inline U32x8 higherToU32(const U16x16& vector) {
  3645. #if defined USE_256BIT_X_SIMD
  3646. return U32x8(U16_HIGH_TO_U32_SIMD256(vector.v));
  3647. #else
  3648. return U32x8(vector.scalars[8], vector.scalars[9], vector.scalars[10], vector.scalars[11], vector.scalars[12], vector.scalars[13], vector.scalars[14], vector.scalars[15]);
  3649. #endif
  3650. }
  3651. inline U16x16 lowerToU16(const U8x32& vector) {
  3652. #if defined USE_256BIT_X_SIMD
  3653. return U16x16(U8_LOW_TO_U16_SIMD256(vector.v));
  3654. #else
  3655. return U16x16(
  3656. vector.scalars[0], vector.scalars[1], vector.scalars[2], vector.scalars[3],
  3657. vector.scalars[4], vector.scalars[5], vector.scalars[6], vector.scalars[7],
  3658. vector.scalars[8], vector.scalars[9], vector.scalars[10], vector.scalars[11],
  3659. vector.scalars[12], vector.scalars[13], vector.scalars[14], vector.scalars[15]
  3660. );
  3661. #endif
  3662. }
  3663. inline U16x16 higherToU16(const U8x32& vector) {
  3664. #if defined USE_256BIT_X_SIMD
  3665. return U16x16(U8_HIGH_TO_U16_SIMD256(vector.v));
  3666. #else
  3667. return U16x16(
  3668. vector.scalars[16], vector.scalars[17], vector.scalars[18], vector.scalars[19],
  3669. vector.scalars[20], vector.scalars[21], vector.scalars[22], vector.scalars[23],
  3670. vector.scalars[24], vector.scalars[25], vector.scalars[26], vector.scalars[27],
  3671. vector.scalars[28], vector.scalars[29], vector.scalars[30], vector.scalars[31]
  3672. );
  3673. #endif
  3674. }
  3675. // Saturated packing
  3676. inline U8x32 saturateToU8(const U16x16& lower, const U16x16& upper) {
  3677. #if defined USE_256BIT_X_SIMD
  3678. return U8x32(PACK_SAT_U16_TO_U8_SIMD256(lower.v, upper.v));
  3679. #else
  3680. U8x32 result = U8x32::create_dangerous_uninitialized();
  3681. for (int i = 0; i < 16; i++) {
  3682. result.scalars[i] = impl_limit255(lower.scalars[i]);
  3683. }
  3684. for (int i = 0; i < 16; i++) {
  3685. result.scalars[i + 16] = impl_limit255(upper.scalars[i]);
  3686. }
  3687. return result;
  3688. #endif
  3689. }
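// Example (illustrative sketch, not part of the library API): a common pattern is to unpack
// 8-bit lanes to 16 bits, do arithmetic with headroom, and pack the result back with saturation.
// The helper name is hypothetical.
static inline U8x32 example_doubleWithSaturation(const U8x32 &pixels) {
	// Doubling in 16 bits cannot overflow; saturateToU8 then clamps anything above 255.
	U16x16 low = bitShiftLeftImmediate<1>(lowerToU16(pixels));
	U16x16 high = bitShiftLeftImmediate<1>(higherToU16(pixels));
	return saturateToU8(low, high);
}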
  3690. // Unary negation for convenience and code readability.
  3691. // Before using unary negation, always check whether:
  3692. // * An addition can be turned into a subtraction:
  3693. // x = -a + b
  3694. // x = b - a
  3695. // * A multiplying constant or scalar can be negated instead:
  3696. // x = -b * 2
  3697. // x = b * -2
  3698. inline F32x8 operator-(const F32x8& value) {
  3699. #if defined USE_256BIT_F_SIMD
  3700. return F32x8(0.0f) - value;
  3701. #else
  3702. return F32x8(
  3703. -value.scalars[0], -value.scalars[1], -value.scalars[2], -value.scalars[3],
  3704. -value.scalars[4], -value.scalars[5], -value.scalars[6], -value.scalars[7]
  3705. );
  3706. #endif
  3707. }
  3708. inline I32x8 operator-(const I32x8& value) {
  3709. #if defined USE_256BIT_X_SIMD
  3710. return I32x8(0) - value;
  3711. #else
  3712. return I32x8(
  3713. -value.scalars[0], -value.scalars[1], -value.scalars[2], -value.scalars[3],
  3714. -value.scalars[4], -value.scalars[5], -value.scalars[6], -value.scalars[7]
  3715. );
  3716. #endif
  3717. }
  3718. // Helper macros for generating the vector extract functions.
  3719. // Having one function for each type and offset makes sure that the compiler gets an immediate integer within the valid range.
  3720. #if defined USE_AVX2
  3721. // AVX2 does not offer any 256-bit element extraction, only two 128-bit shifts done in parallel, so we might as well use two separate 128-bit extractions.
  3722. template <int OFFSET>
  3723. __m256i impl_extractBytes_AVX2(const __m256i leftInput, const __m256i rightInput) {
  3724. // Extract three halves depending on which ones overlap with the offset.
  3725. __m128i leftPart = _mm256_extractf128_si256(leftInput, (OFFSET < 16) ? 0 : 1);
  3726. __m128i middlePart = _mm256_extractf128_si256(OFFSET < 16 ? rightInput : leftInput, (OFFSET < 16) ? 1 : 0);
  3727. __m128i rightPart = _mm256_extractf128_si256(rightInput, (OFFSET < 16) ? 0 : 1);
  3728. // Combine two 128-bit extracts into a whole 256-bit extract.
  3729. return _mm256_set_m128i(
  3730. _mm_alignr_epi8(leftPart, middlePart, OFFSET - ((OFFSET < 16) ? 0 : 16)),
  3731. _mm_alignr_epi8(middlePart, rightPart, OFFSET - ((OFFSET < 16) ? 0 : 16))
  3732. );
  3733. }
  3734. #define VECTOR_EXTRACT_GENERATOR_256_U8(OFFSET) return U8x32(impl_extractBytes_AVX2<OFFSET>(b.v, a.v));
  3735. #define VECTOR_EXTRACT_GENERATOR_256_U16(OFFSET) return U16x16(impl_extractBytes_AVX2<OFFSET * 2>(b.v, a.v));
  3736. #define VECTOR_EXTRACT_GENERATOR_256_U32(OFFSET) return U32x8(impl_extractBytes_AVX2<OFFSET * 4>(b.v, a.v));
  3737. #define VECTOR_EXTRACT_GENERATOR_256_I32(OFFSET) return I32x8(impl_extractBytes_AVX2<OFFSET * 4>(b.v, a.v));
  3738. #define VECTOR_EXTRACT_GENERATOR_256_F32(OFFSET) return F32x8(SIMD_F32x8(impl_extractBytes_AVX2<OFFSET * 4>(SIMD_U32x8(b.v), SIMD_U32x8(a.v))));
  3739. #else
  3740. template<typename T, int elementCount>
  3741. T vectorExtract_emulated(const T &a, const T &b, int offset) {
  3742. // For safety reasons, uninitialized default construction of SIMD vectors is only available during full emulation, not partial AVX support.
  3743. T result = T::create_dangerous_uninitialized();
  3744. int t = 0;
  3745. for (int s = offset; s < elementCount; s++) {
  3746. result.scalars[t] = a.scalars[s];
  3747. t++;
  3748. }
  3749. for (int s = 0; s < offset; s++) {
  3750. result.scalars[t] = b.scalars[s];
  3751. t++;
  3752. }
  3753. return result;
  3754. }
  3755. #define VECTOR_EXTRACT_GENERATOR_256_U8(OFFSET) return vectorExtract_emulated<U8x32, 32>(a, b, OFFSET);
  3756. #define VECTOR_EXTRACT_GENERATOR_256_U16(OFFSET) return vectorExtract_emulated<U16x16, 16>(a, b, OFFSET);
  3757. #define VECTOR_EXTRACT_GENERATOR_256_U32(OFFSET) return vectorExtract_emulated<U32x8, 8>(a, b, OFFSET);
  3758. #define VECTOR_EXTRACT_GENERATOR_256_I32(OFFSET) return vectorExtract_emulated<I32x8, 8>(a, b, OFFSET);
  3759. #define VECTOR_EXTRACT_GENERATOR_256_F32(OFFSET) return vectorExtract_emulated<F32x8, 8>(a, b, OFFSET);
  3760. #endif
  3761. // Vector extraction concatenates two input vectors and reads a vector between them using an offset.
  3762. // The first and last offsets that only return one of the inputs can be used for readability, because they will be inlined and removed by the compiler.
  3763. // To get elements from the right side, combine the center vector with the right vector and shift one element to the left using vectorExtract_1 for the given type.
  3764. // To get elements from the left side, combine the left vector with the center vector and shift one element to the right using vectorExtract_31 for 32 lanes, vectorExtract_15 for 16 lanes, or vectorExtract_7 for 8 lanes.
  3765. // TODO: Also allow using a template argument as the element offset with a static assert for the offset, which might be useful in template programming.
  3766. U8x32 inline vectorExtract_0(const U8x32 &a, const U8x32 &b) { return a; }
  3767. U8x32 inline vectorExtract_1(const U8x32 &a, const U8x32 &b) { VECTOR_EXTRACT_GENERATOR_256_U8(1) }
  3768. U8x32 inline vectorExtract_2(const U8x32 &a, const U8x32 &b) { VECTOR_EXTRACT_GENERATOR_256_U8(2) }
  3769. U8x32 inline vectorExtract_3(const U8x32 &a, const U8x32 &b) { VECTOR_EXTRACT_GENERATOR_256_U8(3) }
  3770. U8x32 inline vectorExtract_4(const U8x32 &a, const U8x32 &b) { VECTOR_EXTRACT_GENERATOR_256_U8(4) }
  3771. U8x32 inline vectorExtract_5(const U8x32 &a, const U8x32 &b) { VECTOR_EXTRACT_GENERATOR_256_U8(5) }
  3772. U8x32 inline vectorExtract_6(const U8x32 &a, const U8x32 &b) { VECTOR_EXTRACT_GENERATOR_256_U8(6) }
  3773. U8x32 inline vectorExtract_7(const U8x32 &a, const U8x32 &b) { VECTOR_EXTRACT_GENERATOR_256_U8(7) }
  3774. U8x32 inline vectorExtract_8(const U8x32 &a, const U8x32 &b) { VECTOR_EXTRACT_GENERATOR_256_U8(8) }
  3775. U8x32 inline vectorExtract_9(const U8x32 &a, const U8x32 &b) { VECTOR_EXTRACT_GENERATOR_256_U8(9) }
  3776. U8x32 inline vectorExtract_10(const U8x32 &a, const U8x32 &b) { VECTOR_EXTRACT_GENERATOR_256_U8(10) }
  3777. U8x32 inline vectorExtract_11(const U8x32 &a, const U8x32 &b) { VECTOR_EXTRACT_GENERATOR_256_U8(11) }
  3778. U8x32 inline vectorExtract_12(const U8x32 &a, const U8x32 &b) { VECTOR_EXTRACT_GENERATOR_256_U8(12) }
  3779. U8x32 inline vectorExtract_13(const U8x32 &a, const U8x32 &b) { VECTOR_EXTRACT_GENERATOR_256_U8(13) }
  3780. U8x32 inline vectorExtract_14(const U8x32 &a, const U8x32 &b) { VECTOR_EXTRACT_GENERATOR_256_U8(14) }
  3781. U8x32 inline vectorExtract_15(const U8x32 &a, const U8x32 &b) { VECTOR_EXTRACT_GENERATOR_256_U8(15) }
  3782. U8x32 inline vectorExtract_16(const U8x32 &a, const U8x32 &b) { VECTOR_EXTRACT_GENERATOR_256_U8(16) }
  3783. U8x32 inline vectorExtract_17(const U8x32 &a, const U8x32 &b) { VECTOR_EXTRACT_GENERATOR_256_U8(17) }
  3784. U8x32 inline vectorExtract_18(const U8x32 &a, const U8x32 &b) { VECTOR_EXTRACT_GENERATOR_256_U8(18) }
  3785. U8x32 inline vectorExtract_19(const U8x32 &a, const U8x32 &b) { VECTOR_EXTRACT_GENERATOR_256_U8(19) }
  3786. U8x32 inline vectorExtract_20(const U8x32 &a, const U8x32 &b) { VECTOR_EXTRACT_GENERATOR_256_U8(20) }
  3787. U8x32 inline vectorExtract_21(const U8x32 &a, const U8x32 &b) { VECTOR_EXTRACT_GENERATOR_256_U8(21) }
  3788. U8x32 inline vectorExtract_22(const U8x32 &a, const U8x32 &b) { VECTOR_EXTRACT_GENERATOR_256_U8(22) }
  3789. U8x32 inline vectorExtract_23(const U8x32 &a, const U8x32 &b) { VECTOR_EXTRACT_GENERATOR_256_U8(23) }
  3790. U8x32 inline vectorExtract_24(const U8x32 &a, const U8x32 &b) { VECTOR_EXTRACT_GENERATOR_256_U8(24) }
  3791. U8x32 inline vectorExtract_25(const U8x32 &a, const U8x32 &b) { VECTOR_EXTRACT_GENERATOR_256_U8(25) }
  3792. U8x32 inline vectorExtract_26(const U8x32 &a, const U8x32 &b) { VECTOR_EXTRACT_GENERATOR_256_U8(26) }
  3793. U8x32 inline vectorExtract_27(const U8x32 &a, const U8x32 &b) { VECTOR_EXTRACT_GENERATOR_256_U8(27) }
  3794. U8x32 inline vectorExtract_28(const U8x32 &a, const U8x32 &b) { VECTOR_EXTRACT_GENERATOR_256_U8(28) }
  3795. U8x32 inline vectorExtract_29(const U8x32 &a, const U8x32 &b) { VECTOR_EXTRACT_GENERATOR_256_U8(29) }
  3796. U8x32 inline vectorExtract_30(const U8x32 &a, const U8x32 &b) { VECTOR_EXTRACT_GENERATOR_256_U8(30) }
  3797. U8x32 inline vectorExtract_31(const U8x32 &a, const U8x32 &b) { VECTOR_EXTRACT_GENERATOR_256_U8(31) }
  3798. U8x32 inline vectorExtract_32(const U8x32 &a, const U8x32 &b) { return b; }
  3799. U16x16 inline vectorExtract_0(const U16x16 &a, const U16x16 &b) { return a; }
  3800. U16x16 inline vectorExtract_1(const U16x16 &a, const U16x16 &b) { VECTOR_EXTRACT_GENERATOR_256_U16(1) }
  3801. U16x16 inline vectorExtract_2(const U16x16 &a, const U16x16 &b) { VECTOR_EXTRACT_GENERATOR_256_U16(2) }
  3802. U16x16 inline vectorExtract_3(const U16x16 &a, const U16x16 &b) { VECTOR_EXTRACT_GENERATOR_256_U16(3) }
  3803. U16x16 inline vectorExtract_4(const U16x16 &a, const U16x16 &b) { VECTOR_EXTRACT_GENERATOR_256_U16(4) }
  3804. U16x16 inline vectorExtract_5(const U16x16 &a, const U16x16 &b) { VECTOR_EXTRACT_GENERATOR_256_U16(5) }
  3805. U16x16 inline vectorExtract_6(const U16x16 &a, const U16x16 &b) { VECTOR_EXTRACT_GENERATOR_256_U16(6) }
  3806. U16x16 inline vectorExtract_7(const U16x16 &a, const U16x16 &b) { VECTOR_EXTRACT_GENERATOR_256_U16(7) }
  3807. U16x16 inline vectorExtract_8(const U16x16 &a, const U16x16 &b) { VECTOR_EXTRACT_GENERATOR_256_U16(8) }
  3808. U16x16 inline vectorExtract_9(const U16x16 &a, const U16x16 &b) { VECTOR_EXTRACT_GENERATOR_256_U16(9) }
  3809. U16x16 inline vectorExtract_10(const U16x16 &a, const U16x16 &b) { VECTOR_EXTRACT_GENERATOR_256_U16(10) }
  3810. U16x16 inline vectorExtract_11(const U16x16 &a, const U16x16 &b) { VECTOR_EXTRACT_GENERATOR_256_U16(11) }
  3811. U16x16 inline vectorExtract_12(const U16x16 &a, const U16x16 &b) { VECTOR_EXTRACT_GENERATOR_256_U16(12) }
  3812. U16x16 inline vectorExtract_13(const U16x16 &a, const U16x16 &b) { VECTOR_EXTRACT_GENERATOR_256_U16(13) }
  3813. U16x16 inline vectorExtract_14(const U16x16 &a, const U16x16 &b) { VECTOR_EXTRACT_GENERATOR_256_U16(14) }
  3814. U16x16 inline vectorExtract_15(const U16x16 &a, const U16x16 &b) { VECTOR_EXTRACT_GENERATOR_256_U16(15) }
  3815. U16x16 inline vectorExtract_16(const U16x16 &a, const U16x16 &b) { return b; }
  3816. U32x8 inline vectorExtract_0(const U32x8 &a, const U32x8 &b) { return a; }
  3817. U32x8 inline vectorExtract_1(const U32x8 &a, const U32x8 &b) { VECTOR_EXTRACT_GENERATOR_256_U32(1) }
  3818. U32x8 inline vectorExtract_2(const U32x8 &a, const U32x8 &b) { VECTOR_EXTRACT_GENERATOR_256_U32(2) }
  3819. U32x8 inline vectorExtract_3(const U32x8 &a, const U32x8 &b) { VECTOR_EXTRACT_GENERATOR_256_U32(3) }
  3820. U32x8 inline vectorExtract_4(const U32x8 &a, const U32x8 &b) { VECTOR_EXTRACT_GENERATOR_256_U32(4) }
  3821. U32x8 inline vectorExtract_5(const U32x8 &a, const U32x8 &b) { VECTOR_EXTRACT_GENERATOR_256_U32(5) }
  3822. U32x8 inline vectorExtract_6(const U32x8 &a, const U32x8 &b) { VECTOR_EXTRACT_GENERATOR_256_U32(6) }
  3823. U32x8 inline vectorExtract_7(const U32x8 &a, const U32x8 &b) { VECTOR_EXTRACT_GENERATOR_256_U32(7) }
  3824. U32x8 inline vectorExtract_8(const U32x8 &a, const U32x8 &b) { return b; }
  3825. I32x8 inline vectorExtract_0(const I32x8 &a, const I32x8 &b) { return a; }
  3826. I32x8 inline vectorExtract_1(const I32x8 &a, const I32x8 &b) { VECTOR_EXTRACT_GENERATOR_256_I32(1) }
  3827. I32x8 inline vectorExtract_2(const I32x8 &a, const I32x8 &b) { VECTOR_EXTRACT_GENERATOR_256_I32(2) }
  3828. I32x8 inline vectorExtract_3(const I32x8 &a, const I32x8 &b) { VECTOR_EXTRACT_GENERATOR_256_I32(3) }
  3829. I32x8 inline vectorExtract_4(const I32x8 &a, const I32x8 &b) { VECTOR_EXTRACT_GENERATOR_256_I32(4) }
  3830. I32x8 inline vectorExtract_5(const I32x8 &a, const I32x8 &b) { VECTOR_EXTRACT_GENERATOR_256_I32(5) }
  3831. I32x8 inline vectorExtract_6(const I32x8 &a, const I32x8 &b) { VECTOR_EXTRACT_GENERATOR_256_I32(6) }
  3832. I32x8 inline vectorExtract_7(const I32x8 &a, const I32x8 &b) { VECTOR_EXTRACT_GENERATOR_256_I32(7) }
  3833. I32x8 inline vectorExtract_8(const I32x8 &a, const I32x8 &b) { return b; }
  3834. F32x8 inline vectorExtract_0(const F32x8 &a, const F32x8 &b) { return a; }
  3835. F32x8 inline vectorExtract_1(const F32x8 &a, const F32x8 &b) { VECTOR_EXTRACT_GENERATOR_256_F32(1) }
  3836. F32x8 inline vectorExtract_2(const F32x8 &a, const F32x8 &b) { VECTOR_EXTRACT_GENERATOR_256_F32(2) }
  3837. F32x8 inline vectorExtract_3(const F32x8 &a, const F32x8 &b) { VECTOR_EXTRACT_GENERATOR_256_F32(3) }
  3838. F32x8 inline vectorExtract_4(const F32x8 &a, const F32x8 &b) { VECTOR_EXTRACT_GENERATOR_256_F32(4) }
  3839. F32x8 inline vectorExtract_5(const F32x8 &a, const F32x8 &b) { VECTOR_EXTRACT_GENERATOR_256_F32(5) }
  3840. F32x8 inline vectorExtract_6(const F32x8 &a, const F32x8 &b) { VECTOR_EXTRACT_GENERATOR_256_F32(6) }
  3841. F32x8 inline vectorExtract_7(const F32x8 &a, const F32x8 &b) { VECTOR_EXTRACT_GENERATOR_256_F32(7) }
  3842. F32x8 inline vectorExtract_8(const F32x8 &a, const F32x8 &b) { return b; }
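// Example (illustrative sketch, not part of the library API): vector extraction reads a window
// that straddles two adjacent vectors, which is useful for convolution-like filters that need
// neighboring elements. The helper name is hypothetical.
static inline U8x32 example_shiftWindowOneByteLeft(const U8x32 &center, const U8x32 &right) {
	// Returns bytes 1..31 of center followed by byte 0 of right.
	return vectorExtract_1(center, right);
}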
  3843. // Gather instructions load memory from a pointer at multiple index offsets at the same time.
  3844. // The given pointers should be aligned with 4 bytes, so that the fallback solution works on machines with strict alignment requirements.
  3845. #if defined USE_AVX2
  3846. #define GATHER_I32x8_AVX2(SOURCE, EIGHT_OFFSETS, SCALE) _mm256_i32gather_epi32((const int32_t*)(SOURCE), EIGHT_OFFSETS, SCALE)
  3847. #define GATHER_U32x8_AVX2(SOURCE, EIGHT_OFFSETS, SCALE) _mm256_i32gather_epi32((const int32_t*)(SOURCE), EIGHT_OFFSETS, SCALE)
  3848. #define GATHER_F32x8_AVX2(SOURCE, EIGHT_OFFSETS, SCALE) _mm256_i32gather_ps((const float*)(SOURCE), EIGHT_OFFSETS, SCALE)
  3849. #endif
  3850. static inline U32x8 gather_U32(dsr::SafePointer<const uint32_t> data, const U32x8 &elementOffset) {
  3851. #ifdef SAFE_POINTER_CHECKS
  3852. ALIGN32 uint32_t elementOffsets[8];
  3853. if (uintptr_t((void*)elementOffsets) & 31u) { throwError(U"Unaligned stack memory detected in 256-bit gather_U32!\n"); }
  3854. elementOffset.writeAlignedUnsafe(elementOffsets);
  3855. data.assertInside("U32x4 gather_U32 lane 0", (data + elementOffsets[0]).getUnchecked());
  3856. data.assertInside("U32x4 gather_U32 lane 1", (data + elementOffsets[1]).getUnchecked());
  3857. data.assertInside("U32x4 gather_U32 lane 2", (data + elementOffsets[2]).getUnchecked());
  3858. data.assertInside("U32x4 gather_U32 lane 3", (data + elementOffsets[3]).getUnchecked());
  3859. data.assertInside("U32x4 gather_U32 lane 4", (data + elementOffsets[4]).getUnchecked());
  3860. data.assertInside("U32x4 gather_U32 lane 5", (data + elementOffsets[5]).getUnchecked());
  3861. data.assertInside("U32x4 gather_U32 lane 6", (data + elementOffsets[6]).getUnchecked());
  3862. data.assertInside("U32x4 gather_U32 lane 7", (data + elementOffsets[7]).getUnchecked());
  3863. #endif
  3864. #if defined USE_AVX2
  3865. return U32x8(GATHER_U32x8_AVX2(data.getUnsafe(), elementOffset.v, 4));
  3866. #else
  3867. #ifndef SAFE_POINTER_CHECKS
  3868. ALIGN32 uint32_t elementOffsets[8];
  3869. elementOffset.writeAlignedUnsafe(elementOffsets);
  3870. #endif
  3871. return U32x8(
  3872. *(data + elementOffsets[0]),
  3873. *(data + elementOffsets[1]),
  3874. *(data + elementOffsets[2]),
  3875. *(data + elementOffsets[3]),
  3876. *(data + elementOffsets[4]),
  3877. *(data + elementOffsets[5]),
  3878. *(data + elementOffsets[6]),
  3879. *(data + elementOffsets[7])
  3880. );
  3881. #endif
  3882. }
  3883. static inline I32x8 gather_I32(dsr::SafePointer<const int32_t> data, const U32x8 &elementOffset) {
  3884. #ifdef SAFE_POINTER_CHECKS
  3885. ALIGN32 uint32_t elementOffsets[8];
  3886. if (uintptr_t((void*)elementOffsets) & 31u) { throwError(U"Unaligned stack memory detected in 256-bit gather_I32!\n"); }
  3887. elementOffset.writeAlignedUnsafe(elementOffsets);
  3888. data.assertInside("I32x4 gather_I32 lane 0", (data + elementOffsets[0]).getUnchecked());
  3889. data.assertInside("I32x4 gather_I32 lane 1", (data + elementOffsets[1]).getUnchecked());
  3890. data.assertInside("I32x4 gather_I32 lane 2", (data + elementOffsets[2]).getUnchecked());
  3891. data.assertInside("I32x4 gather_I32 lane 3", (data + elementOffsets[3]).getUnchecked());
  3892. data.assertInside("I32x4 gather_I32 lane 4", (data + elementOffsets[4]).getUnchecked());
  3893. data.assertInside("I32x4 gather_I32 lane 5", (data + elementOffsets[5]).getUnchecked());
  3894. data.assertInside("I32x4 gather_I32 lane 6", (data + elementOffsets[6]).getUnchecked());
  3895. data.assertInside("I32x4 gather_I32 lane 7", (data + elementOffsets[7]).getUnchecked());
  3896. #endif
  3897. #if defined USE_AVX2
  3898. return I32x8(GATHER_I32x8_AVX2(data.getUnsafe(), elementOffset.v, 4));
  3899. #else
  3900. #ifndef SAFE_POINTER_CHECKS
  3901. ALIGN32 uint32_t elementOffsets[8];
  3902. elementOffset.writeAlignedUnsafe(elementOffsets);
  3903. #endif
  3904. return I32x8(
  3905. *(data + elementOffsets[0]),
  3906. *(data + elementOffsets[1]),
  3907. *(data + elementOffsets[2]),
  3908. *(data + elementOffsets[3]),
  3909. *(data + elementOffsets[4]),
  3910. *(data + elementOffsets[5]),
  3911. *(data + elementOffsets[6]),
  3912. *(data + elementOffsets[7])
  3913. );
  3914. #endif
  3915. }
  3916. static inline F32x8 gather_F32(dsr::SafePointer<const float> data, const U32x8 &elementOffset) {
  3917. #ifdef SAFE_POINTER_CHECKS
  3918. ALIGN32 uint32_t elementOffsets[8];
  3919. if (uintptr_t((void*)elementOffsets) & 31u) { throwError(U"Unaligned stack memory detected in 256-bit gather_F32!\n"); }
  3920. elementOffset.writeAlignedUnsafe(elementOffsets);
  3921. data.assertInside("F32x4 gather_F32 lane 0", (data + elementOffsets[0]).getUnchecked());
  3922. data.assertInside("F32x4 gather_F32 lane 1", (data + elementOffsets[1]).getUnchecked());
  3923. data.assertInside("F32x4 gather_F32 lane 2", (data + elementOffsets[2]).getUnchecked());
  3924. data.assertInside("F32x4 gather_F32 lane 3", (data + elementOffsets[3]).getUnchecked());
  3925. data.assertInside("F32x4 gather_I32 lane 4", (data + elementOffsets[4]).getUnchecked());
  3926. data.assertInside("F32x4 gather_F32 lane 5", (data + elementOffsets[5]).getUnchecked());
  3927. data.assertInside("F32x4 gather_F32 lane 6", (data + elementOffsets[6]).getUnchecked());
  3928. data.assertInside("F32x4 gather_F32 lane 7", (data + elementOffsets[7]).getUnchecked());
  3929. #endif
  3930. #if defined USE_AVX2
  3931. return F32x8(GATHER_F32x8_AVX2(data.getUnsafe(), elementOffset.v, 4));
  3932. #else
  3933. #ifndef SAFE_POINTER_CHECKS
  3934. ALIGN32 uint32_t elementOffsets[8];
  3935. elementOffset.writeAlignedUnsafe(elementOffsets);
  3936. #endif
  3937. return F32x8(
  3938. *(data + elementOffsets[0]),
  3939. *(data + elementOffsets[1]),
  3940. *(data + elementOffsets[2]),
  3941. *(data + elementOffsets[3]),
  3942. *(data + elementOffsets[4]),
  3943. *(data + elementOffsets[5]),
  3944. *(data + elementOffsets[6]),
  3945. *(data + elementOffsets[7])
  3946. );
  3947. #endif
  3948. }
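// Example (illustrative sketch, not part of the library API): using gather_F32 as a SIMD table
// lookup. The offsets are element indices rather than byte offsets, and the table pointer and
// helper name are hypothetical.
static inline F32x8 example_lookupTable(dsr::SafePointer<const float> table, const U32x8 &indices) {
	return gather_F32(table, indices);
}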
  3949. // Wrapper functions for explicitly expanding scalars into vectors during math operations.
  3950. #define NUMERICAL_SCALAR_OPERATIONS(VECTOR_TYPE, ELEMENT_TYPE, LANE_COUNT) \
  3951. inline VECTOR_TYPE operator+(const VECTOR_TYPE& left, ELEMENT_TYPE right) { return left + VECTOR_TYPE(right); } \
  3952. inline VECTOR_TYPE operator+(ELEMENT_TYPE left, const VECTOR_TYPE& right) { return VECTOR_TYPE(left) + right; } \
  3953. inline VECTOR_TYPE operator-(const VECTOR_TYPE& left, ELEMENT_TYPE right) { return left - VECTOR_TYPE(right); } \
  3954. inline VECTOR_TYPE operator-(ELEMENT_TYPE left, const VECTOR_TYPE& right) { return VECTOR_TYPE(left) - right; }
  3955. FOR_ALL_VECTOR_TYPES(NUMERICAL_SCALAR_OPERATIONS)
  3956. #undef NUMERICAL_SCALAR_OPERATIONS
  3957. #define MULTIPLY_SCALAR_OPERATIONS(VECTOR_TYPE, ELEMENT_TYPE, LANE_COUNT) \
  3958. inline VECTOR_TYPE operator*(const VECTOR_TYPE& left, ELEMENT_TYPE right) { return left * VECTOR_TYPE(right); } \
  3959. inline VECTOR_TYPE operator*(ELEMENT_TYPE left, const VECTOR_TYPE& right) { return VECTOR_TYPE(left) * right; }
  3960. // TODO: Implement multiplication for U8x16 and U8x32.
  3961. //FOR_ALL_VECTOR_TYPES(MULTIPLY_SCALAR_OPERATIONS)
  3962. MULTIPLY_SCALAR_OPERATIONS(F32x4, float, 4)
  3963. MULTIPLY_SCALAR_OPERATIONS(F32x8, float, 8)
  3964. MULTIPLY_SCALAR_OPERATIONS(U32x4, uint32_t, 4)
  3965. MULTIPLY_SCALAR_OPERATIONS(U32x8, uint32_t, 8)
  3966. MULTIPLY_SCALAR_OPERATIONS(I32x4, int32_t, 4)
  3967. MULTIPLY_SCALAR_OPERATIONS(I32x8, int32_t, 8)
  3968. MULTIPLY_SCALAR_OPERATIONS(U16x8, uint16_t, 8)
  3969. MULTIPLY_SCALAR_OPERATIONS(U16x16, uint16_t, 16)
  3970. #undef MULTIPLY_SCALAR_OPERATIONS
  3971. // Wrapper functions for explicitly duplicating bit masks into the same lane count.
  3972. #define BITWISE_SCALAR_OPERATIONS(VECTOR_TYPE, ELEMENT_TYPE, LANE_COUNT) \
  3973. inline VECTOR_TYPE operator&(const VECTOR_TYPE& left, ELEMENT_TYPE right) { return left & VECTOR_TYPE(right); } \
  3974. inline VECTOR_TYPE operator&(ELEMENT_TYPE left, const VECTOR_TYPE& right) { return VECTOR_TYPE(left) & right; } \
  3975. inline VECTOR_TYPE operator|(const VECTOR_TYPE& left, ELEMENT_TYPE right) { return left | VECTOR_TYPE(right); } \
  3976. inline VECTOR_TYPE operator|(ELEMENT_TYPE left, const VECTOR_TYPE& right) { return VECTOR_TYPE(left) | right; } \
  3977. inline VECTOR_TYPE operator^(const VECTOR_TYPE& left, ELEMENT_TYPE right) { return left ^ VECTOR_TYPE(right); } \
  3978. inline VECTOR_TYPE operator^(ELEMENT_TYPE left, const VECTOR_TYPE& right) { return VECTOR_TYPE(left) ^ right; }
  3979. // TODO: Implement bitwise operations for all unsigned SIMD vectors.
  3980. //FOR_UNSIGNED_VECTOR_TYPES(BITWISE_SCALAR_OPERATIONS)
  3981. BITWISE_SCALAR_OPERATIONS(U32x4, uint32_t, 4)
  3982. BITWISE_SCALAR_OPERATIONS(U32x8, uint32_t, 8)
  3983. #undef BITWISE_SCALAR_OPERATIONS
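// Example (illustrative sketch, not part of the library API): with the wrappers above, scalars
// can be mixed directly with vectors without writing out the broadcast by hand. The helper name
// is hypothetical.
static inline F32x8 example_scaleAndBias(const F32x8 &values) {
	// Equivalent to values * F32x8(0.5f) + F32x8(1.0f).
	return values * 0.5f + 1.0f;
}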
  3984. // Cleaning up temporary macro definitions to avoid cluttering the namespace.
  3985. #undef FOR_ALL_VECTOR_TYPES
  3986. #undef FOR_FLOAT_VECTOR_TYPES
  3987. #undef FOR_INTEGER_VECTOR_TYPES
  3988. #undef FOR_SIGNED_VECTOR_TYPES
  3989. #undef FOR_UNSIGNED_VECTOR_TYPES
  3990. // TODO: Let SVE define completely separate types for dynamic vectors.
  3991. // The X vectors use the longest SIMD length that is efficient for both floating-point and integer types.
  3992. // DSR_DEFAULT_ALIGNMENT
  3993. // The number of bytes memory should be aligned with by default when creating buffers and images.
  3994. // F32xX
  3995. // The longest available SIMD vector for storing 32-bit float values. Iterating laneCountX_32Bit floats at a time.
  3996. // I32xX
  3997. // The longest available SIMD vector for storing signed 32-bit integer values. Iterating laneCountX_32Bit integers at a time.
  3998. // U32xX
  3999. // The longest available SIMD vector for storing unsigned 32-bit integer values. Iterating laneCountX_32Bit integers at a time.
  4000. // U16xX
  4001. // The longest available SIMD vector for storing unsigned 16-bit integer values. Iterating laneCountX_16Bit integers at a time.
  4002. // U8xX
  4003. // The longest available SIMD vector for storing unsigned 8-bit integer values. Iterating laneCountX_8Bit integers at a time.
  4004. #if defined USE_256BIT_X_SIMD || defined EMULATE_256BIT_X_SIMD
  4005. // Using 256-bit SIMD
  4006. #define DSR_DEFAULT_VECTOR_SIZE 32
  4007. #define DSR_DEFAULT_ALIGNMENT 32
  4008. using F32xX = F32x8;
  4009. using I32xX = I32x8;
  4010. using U32xX = U32x8;
  4011. using U16xX = U16x16;
  4012. using U8xX = U8x32;
  4013. // Align memory with 256 bits to allow overwriting padding at the end of each pixel row.
  4014. // Otherwise you would have to preserve data at the end of each row with slow and bloated duplicated code in every filter.
  4015. #else
  4016. // If there is no hardware support for 256-bit vectors, the emulated 256-bit vectors (when used explicitly) are allowed to be aligned with just 128 bits.
  4017. #define DSR_DEFAULT_VECTOR_SIZE 16
  4018. #define DSR_DEFAULT_ALIGNMENT 16
  4019. using F32xX = F32x4;
  4020. using I32xX = I32x4;
  4021. using U32xX = U32x4;
  4022. using U16xX = U16x8;
  4023. using U8xX = U8x16;
  4024. #endif
  4025. // How many lanes the longest available vector has for a specified lane size.
  4026. // Used to iterate indices and pointers using whole elements.
  4027. static const int laneCountX_32Bit = DSR_DEFAULT_VECTOR_SIZE / 4;
  4028. static const int laneCountX_16Bit = DSR_DEFAULT_VECTOR_SIZE / 2;
  4029. static const int laneCountX_8Bit = DSR_DEFAULT_VECTOR_SIZE;
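// Example (illustrative sketch, not part of the library API): buffers processed with the X
// vectors are usually padded to a whole number of vectors, so that a loop can step
// laneCountX_32Bit floats at a time without a scalar tail. The helper name is hypothetical.
static inline int example_roundUpToWholeVectors(int floatCount) {
	return ((floatCount + laneCountX_32Bit - 1) / laneCountX_32Bit) * laneCountX_32Bit;
}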
  4030. // TODO: Let SVE define completely separate types for dynamic vectors.
  4031. // The F vector uses the longest SIMD length that is efficient when only processing float values, even if no integer types are available in the same size.
  4032. // Used when you know that your algorithm is only going to work with float types and you need the extra performance.
  4033. // Some processors have AVX but not AVX2, meaning that they have 256-bit SIMD for floats but only 128-bit SIMD for integers.
  4034. // F32xF
  4035. // The longest available SIMD vector for storing 32-bit float values. Iterating laneCountF_32Bit floats at a time.
  4036. #if defined USE_256BIT_F_SIMD || defined EMULATE_256BIT_F_SIMD
  4037. #define DSR_FLOAT_VECTOR_SIZE 32
  4038. #define DSR_FLOAT_ALIGNMENT 32
  4039. using F32xF = F32x8;
  4040. #else
  4041. // F vectors are 128-bits.
  4042. #define DSR_FLOAT_VECTOR_SIZE 16
  4043. #define DSR_FLOAT_ALIGNMENT 16
  4044. using F32xF = F32x4;
  4045. #endif
  4046. // Used to iterate over float pointers when using F32xF.
  4047. static const int laneCountF = DSR_FLOAT_VECTOR_SIZE / 4;
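// Example (illustrative sketch, not part of the library API): float-only algorithms can use
// F32xF to get the widest float vectors even on processors with AVX but not AVX2. The helper
// name is hypothetical.
static inline F32xF example_average(const F32xF &a, const F32xF &b) {
	return (a + b) * 0.5f;
}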
  4048. // Define traits.
  4049. DSR_APPLY_PROPERTY(DsrTrait_Any_U8 , U8x16)
  4050. DSR_APPLY_PROPERTY(DsrTrait_Any_U8 , U8x32)
  4051. DSR_APPLY_PROPERTY(DsrTrait_Any_U16, U16x8)
  4052. DSR_APPLY_PROPERTY(DsrTrait_Any_U16, U16x16)
  4053. DSR_APPLY_PROPERTY(DsrTrait_Any_U32, U32x4)
  4054. DSR_APPLY_PROPERTY(DsrTrait_Any_U32, U32x8)
  4055. DSR_APPLY_PROPERTY(DsrTrait_Any_I32, I32x4)
  4056. DSR_APPLY_PROPERTY(DsrTrait_Any_I32, I32x8)
  4057. DSR_APPLY_PROPERTY(DsrTrait_Any_F32, F32x4)
  4058. DSR_APPLY_PROPERTY(DsrTrait_Any_F32, F32x8)
  4059. DSR_APPLY_PROPERTY(DsrTrait_Any , U8x16)
  4060. DSR_APPLY_PROPERTY(DsrTrait_Any , U8x32)
  4061. DSR_APPLY_PROPERTY(DsrTrait_Any, U16x8)
  4062. DSR_APPLY_PROPERTY(DsrTrait_Any, U16x16)
  4063. DSR_APPLY_PROPERTY(DsrTrait_Any, U32x4)
  4064. DSR_APPLY_PROPERTY(DsrTrait_Any, U32x8)
  4065. DSR_APPLY_PROPERTY(DsrTrait_Any, I32x4)
  4066. DSR_APPLY_PROPERTY(DsrTrait_Any, I32x8)
  4067. DSR_APPLY_PROPERTY(DsrTrait_Any, F32x4)
  4068. DSR_APPLY_PROPERTY(DsrTrait_Any, F32x8)
  4069. // TODO: Use as independent types when the largest vector lengths are not known in compile time on ARM SVE.
  4070. //DSR_APPLY_PROPERTY(DsrTrait_Any_U8 , U8xX)
  4071. //DSR_APPLY_PROPERTY(DsrTrait_Any_U16, U16xX)
  4072. //DSR_APPLY_PROPERTY(DsrTrait_Any_U32, U32xX)
  4073. //DSR_APPLY_PROPERTY(DsrTrait_Any_I32, I32xX)
  4074. //DSR_APPLY_PROPERTY(DsrTrait_Any_F32, F32xX)
  4075. //DSR_APPLY_PROPERTY(DsrTrait_Any_F32, F32xF)
  4076. //DSR_APPLY_PROPERTY(DsrTrait_Any , U8xX)
  4077. //DSR_APPLY_PROPERTY(DsrTrait_Any, U16xX)
  4078. //DSR_APPLY_PROPERTY(DsrTrait_Any, U32xX)
  4079. //DSR_APPLY_PROPERTY(DsrTrait_Any, I32xX)
  4080. //DSR_APPLY_PROPERTY(DsrTrait_Any, F32xX)
  4081. //DSR_APPLY_PROPERTY(DsrTrait_Any, F32xF)
  4082. }
  4083. #endif