- //-------------------------------------------------------------------------------------
- // DirectXMathConvert.inl -- SIMD C++ Math library
- //
- // THIS CODE AND INFORMATION IS PROVIDED "AS IS" WITHOUT WARRANTY OF
- // ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING BUT NOT LIMITED TO
- // THE IMPLIED WARRANTIES OF MERCHANTABILITY AND/OR FITNESS FOR A
- // PARTICULAR PURPOSE.
- //
- // Copyright (c) Microsoft Corporation. All rights reserved.
- //
- // http://go.microsoft.com/fwlink/?LinkID=615560
- //-------------------------------------------------------------------------------------
- #pragma once
- /****************************************************************************
- *
- * Data conversion
- *
- ****************************************************************************/
- //------------------------------------------------------------------------------
- #pragma warning(push)
- #pragma warning(disable:4701)
- // C4701: false positives
- inline XMVECTOR XM_CALLCONV XMConvertVectorIntToFloat
- (
- FXMVECTOR VInt,
- uint32_t DivExponent
- )
- {
- assert(DivExponent<32);
- #if defined(_XM_NO_INTRINSICS_)
- float fScale = 1.0f / (float)(1U << DivExponent);
- uint32_t ElementIndex = 0;
- XMVECTOR Result;
- do {
- int32_t iTemp = (int32_t)VInt.vector4_u32[ElementIndex];
- Result.vector4_f32[ElementIndex] = ((float)iTemp) * fScale;
- } while (++ElementIndex<4);
- return Result;
- #elif defined(_XM_ARM_NEON_INTRINSICS_)
- float fScale = 1.0f / (float)(1U << DivExponent);
- float32x4_t vResult = vcvtq_f32_s32( VInt );
- return vmulq_n_f32( vResult, fScale );
- #else // _XM_SSE_INTRINSICS_
- // Convert to floats
- XMVECTOR vResult = _mm_cvtepi32_ps(_mm_castps_si128(VInt));
- // Convert DivExponent into 1.0f/(1<<DivExponent)
- uint32_t uScale = 0x3F800000U - (DivExponent << 23);
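- // 0x3F800000 is the IEEE-754 encoding of 1.0f; subtracting DivExponent from its exponent
- // field yields the bit pattern of 1.0f/(1<<DivExponent) directly. For example, with
- // DivExponent = 16 this gives 0x37800000, which is exactly 1.0f/65536.0f.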
- // Splat the scalar value
- __m128i vScale = _mm_set1_epi32(uScale);
- vResult = _mm_mul_ps(vResult,_mm_castsi128_ps(vScale));
- return vResult;
- #endif
- }
- //------------------------------------------------------------------------------
- inline XMVECTOR XM_CALLCONV XMConvertVectorFloatToInt
- (
- FXMVECTOR VFloat,
- uint32_t MulExponent
- )
- {
- assert(MulExponent<32);
- #if defined(_XM_NO_INTRINSICS_)
- // Get the scalar factor.
- float fScale = (float)(1U << MulExponent);
- uint32_t ElementIndex = 0;
- XMVECTOR Result;
- do {
- int32_t iResult;
- float fTemp = VFloat.vector4_f32[ElementIndex]*fScale;
- if (fTemp <= -(65536.0f*32768.0f)) {
- iResult = (-0x7FFFFFFF)-1;
- } else if (fTemp > (65536.0f*32768.0f)-128.0f) {
- iResult = 0x7FFFFFFF;
- } else {
- iResult = (int32_t)fTemp;
- }
- Result.vector4_u32[ElementIndex] = (uint32_t)iResult;
- } while (++ElementIndex<4);
- return Result;
- #elif defined(_XM_ARM_NEON_INTRINSICS_)
- float32x4_t vResult = vmulq_n_f32(VFloat, (float)(1U << MulExponent));
- // In case of positive overflow, detect it
- uint32x4_t vOverflow = vcgtq_f32(vResult,g_XMMaxInt);
- // Float to int conversion
- int32x4_t vResulti = vcvtq_s32_f32(vResult);
- // If there was positive overflow, set to 0x7FFFFFFF
- vResult = vandq_u32(vOverflow,g_XMAbsMask);
- vOverflow = vbicq_u32(vResulti,vOverflow);
- vOverflow = vorrq_u32(vOverflow,vResult);
- return vOverflow;
- #else // _XM_SSE_INTRINSICS_
- XMVECTOR vResult = _mm_set_ps1((float)(1U << MulExponent));
- vResult = _mm_mul_ps(vResult,VFloat);
- // In case of positive overflow, detect it
- XMVECTOR vOverflow = _mm_cmpgt_ps(vResult,g_XMMaxInt);
- // Float to int conversion
- __m128i vResulti = _mm_cvttps_epi32(vResult);
- // If there was positive overflow, set to 0x7FFFFFFF
- vResult = _mm_and_ps(vOverflow,g_XMAbsMask);
- vOverflow = _mm_andnot_ps(vOverflow,_mm_castsi128_ps(vResulti));
- vOverflow = _mm_or_ps(vOverflow,vResult);
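- // The and/andnot/or sequence above is a branchless select: lanes that overflowed take
- // 0x7FFFFFFF (from g_XMAbsMask), all other lanes keep the truncated conversion result.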
- return vOverflow;
- #endif
- }
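- // Illustrative sketch (not part of the original header): round-tripping Q16.16 fixed point
- // with the two conversion routines above.
- //   XMVECTOR v     = XMVectorSet( 1.5f, -2.25f, 0.0f, 4.0f );
- //   XMVECTOR fixed = XMConvertVectorFloatToInt( v, 16 );    // each lane holds value*65536 as int32
- //   XMVECTOR back  = XMConvertVectorIntToFloat( fixed, 16 ); // recovers 1.5f, -2.25f, 0.0f, 4.0f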
- //------------------------------------------------------------------------------
- inline XMVECTOR XM_CALLCONV XMConvertVectorUIntToFloat
- (
- FXMVECTOR VUInt,
- uint32_t DivExponent
- )
- {
- assert(DivExponent<32);
- #if defined(_XM_NO_INTRINSICS_)
- float fScale = 1.0f / (float)(1U << DivExponent);
- uint32_t ElementIndex = 0;
- XMVECTOR Result;
- do {
- Result.vector4_f32[ElementIndex] = (float)VUInt.vector4_u32[ElementIndex] * fScale;
- } while (++ElementIndex<4);
- return Result;
- #elif defined(_XM_ARM_NEON_INTRINSICS_)
- float fScale = 1.0f / (float)(1U << DivExponent);
- float32x4_t vResult = vcvtq_f32_u32( VUInt );
- return vmulq_n_f32( vResult, fScale );
- #else // _XM_SSE_INTRINSICS_
- // For the values that are higher than 0x7FFFFFFF, a fixup is needed
- // Determine which ones need the fix.
- XMVECTOR vMask = _mm_and_ps(VUInt,g_XMNegativeZero);
- // Force all values positive
- XMVECTOR vResult = _mm_xor_ps(VUInt,vMask);
- // Convert to floats
- vResult = _mm_cvtepi32_ps(_mm_castps_si128(vResult));
- // Convert 0x80000000 -> 0xFFFFFFFF
- __m128i iMask = _mm_srai_epi32(_mm_castps_si128(vMask),31);
- // For only the ones that are too big, add the fixup
- vMask = _mm_and_ps(_mm_castsi128_ps(iMask),g_XMFixUnsigned);
- vResult = _mm_add_ps(vResult,vMask);
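- // Worked example of the fixup: 0xC0000000 (3221225472) has its sign bit cleared to
- // 0x40000000, converts to 1073741824.0f, and adding the 2^31 in g_XMFixUnsigned restores
- // 3221225472.0f exactly.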
- // Convert DivExponent into 1.0f/(1<<DivExponent)
- uint32_t uScale = 0x3F800000U - (DivExponent << 23);
- // Splat
- iMask = _mm_set1_epi32(uScale);
- vResult = _mm_mul_ps(vResult,_mm_castsi128_ps(iMask));
- return vResult;
- #endif
- }
- //------------------------------------------------------------------------------
- inline XMVECTOR XM_CALLCONV XMConvertVectorFloatToUInt
- (
- FXMVECTOR VFloat,
- uint32_t MulExponent
- )
- {
- assert(MulExponent<32);
- #if defined(_XM_NO_INTRINSICS_)
- // Get the scalar factor.
- float fScale = (float)(1U << MulExponent);
- uint32_t ElementIndex = 0;
- XMVECTOR Result;
- do {
- uint32_t uResult;
- float fTemp = VFloat.vector4_f32[ElementIndex]*fScale;
- if (fTemp <= 0.0f) {
- uResult = 0;
- } else if (fTemp >= (65536.0f*65536.0f)) {
- uResult = 0xFFFFFFFFU;
- } else {
- uResult = (uint32_t)fTemp;
- }
- Result.vector4_u32[ElementIndex] = uResult;
- } while (++ElementIndex<4);
- return Result;
- #elif defined(_XM_ARM_NEON_INTRINSICS_)
- float32x4_t vResult = vmulq_n_f32(VFloat,(float)(1U << MulExponent));
- // In case of overflow, detect it
- uint32x4_t vOverflow = vcgtq_f32(vResult,g_XMMaxUInt);
- // Float to int conversion
- uint32x4_t vResulti = vcvtq_u32_f32(vResult);
- // If there was overflow, set to 0xFFFFFFFFU
- vResult = vbicq_u32(vResulti,vOverflow);
- vOverflow = vorrq_u32(vOverflow,vResult);
- return vOverflow;
- #else // _XM_SSE_INTRINSICS_
- XMVECTOR vResult = _mm_set_ps1(static_cast<float>(1U << MulExponent));
- vResult = _mm_mul_ps(vResult,VFloat);
- // Clamp to >=0
- vResult = _mm_max_ps(vResult,g_XMZero);
- // Any numbers that are too big, set to 0xFFFFFFFFU
- XMVECTOR vOverflow = _mm_cmpgt_ps(vResult,g_XMMaxUInt);
- XMVECTOR vValue = g_XMUnsignedFix;
- // Too large for a signed integer?
- XMVECTOR vMask = _mm_cmpge_ps(vResult,vValue);
- // Zero for numbers lower than 0x80000000, 32768.0f*65536.0f otherwise
- vValue = _mm_and_ps(vValue,vMask);
- // Perform fixup only on numbers too large (Keeps low bit precision)
- vResult = _mm_sub_ps(vResult,vValue);
- __m128i vResulti = _mm_cvttps_epi32(vResult);
- // Convert from signed to unsigned only if greater than 0x80000000
- vMask = _mm_and_ps(vMask,g_XMNegativeZero);
- vResult = _mm_xor_ps(_mm_castsi128_ps(vResulti),vMask);
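- // Note: _mm_cvttps_epi32 only handles the signed range, so inputs at or above 2^31 are
- // reduced by 2^31 before the conversion and the sign bit is xor'ed back in afterwards,
- // which re-adds the 2^31 in integer form without losing the low bits.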
- // On those that are too large, set to 0xFFFFFFFF
- vResult = _mm_or_ps(vResult,vOverflow);
- return vResult;
- #endif
- }
- #pragma warning(pop)
- /****************************************************************************
- *
- * Vector and matrix load operations
- *
- ****************************************************************************/
- //------------------------------------------------------------------------------
- _Use_decl_annotations_
- inline XMVECTOR XM_CALLCONV XMLoadInt(const uint32_t* pSource)
- {
- assert(pSource);
- #if defined(_XM_NO_INTRINSICS_)
- XMVECTOR V;
- V.vector4_u32[0] = *pSource;
- V.vector4_u32[1] = 0;
- V.vector4_u32[2] = 0;
- V.vector4_u32[3] = 0;
- return V;
- #elif defined(_XM_ARM_NEON_INTRINSICS_)
- uint32x4_t zero = vdupq_n_u32(0);
- return vld1q_lane_u32( pSource, zero, 0 );
- #elif defined(_XM_SSE_INTRINSICS_)
- return _mm_load_ss( reinterpret_cast<const float*>(pSource) );
- #endif
- }
- //------------------------------------------------------------------------------
- _Use_decl_annotations_
- inline XMVECTOR XM_CALLCONV XMLoadFloat(const float* pSource)
- {
- assert(pSource);
- #if defined(_XM_NO_INTRINSICS_)
- XMVECTOR V;
- V.vector4_f32[0] = *pSource;
- V.vector4_f32[1] = 0.f;
- V.vector4_f32[2] = 0.f;
- V.vector4_f32[3] = 0.f;
- return V;
- #elif defined(_XM_ARM_NEON_INTRINSICS_)
- float32x4_t zero = vdupq_n_f32(0);
- return vld1q_lane_f32( pSource, zero, 0 );
- #elif defined(_XM_SSE_INTRINSICS_)
- return _mm_load_ss( pSource );
- #endif
- }
- //------------------------------------------------------------------------------
- _Use_decl_annotations_
- inline XMVECTOR XM_CALLCONV XMLoadInt2
- (
- const uint32_t* pSource
- )
- {
- assert(pSource);
- #if defined(_XM_NO_INTRINSICS_)
- XMVECTOR V;
- V.vector4_u32[0] = pSource[0];
- V.vector4_u32[1] = pSource[1];
- V.vector4_u32[2] = 0;
- V.vector4_u32[3] = 0;
- return V;
- #elif defined(_XM_ARM_NEON_INTRINSICS_)
- uint32x2_t x = vld1_u32( pSource );
- uint32x2_t zero = vdup_n_u32(0);
- return vcombine_u32( x, zero );
- #elif defined(_XM_SSE_INTRINSICS_)
- __m128 x = _mm_load_ss( reinterpret_cast<const float*>(pSource) );
- __m128 y = _mm_load_ss( reinterpret_cast<const float*>(pSource+1) );
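- // _mm_unpacklo_ps interleaves the two scalars into lanes 0 and 1; lanes 2 and 3 stay zero
- // because _mm_load_ss clears the upper elements.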
- return _mm_unpacklo_ps( x, y );
- #endif
- }
- //------------------------------------------------------------------------------
- _Use_decl_annotations_
- inline XMVECTOR XM_CALLCONV XMLoadInt2A
- (
- const uint32_t* pSource
- )
- {
- assert(pSource);
- assert(((uintptr_t)pSource & 0xF) == 0);
- #if defined(_XM_NO_INTRINSICS_)
- XMVECTOR V;
- V.vector4_u32[0] = pSource[0];
- V.vector4_u32[1] = pSource[1];
- V.vector4_u32[2] = 0;
- V.vector4_u32[3] = 0;
- return V;
- #elif defined(_XM_ARM_NEON_INTRINSICS_)
- uint32x2_t x = vld1_u32_ex( pSource, 64 );
- uint32x2_t zero = vdup_n_u32(0);
- return vcombine_u32( x, zero );
- #elif defined(_XM_SSE_INTRINSICS_)
- __m128i V = _mm_loadl_epi64( reinterpret_cast<const __m128i*>(pSource) );
- return _mm_castsi128_ps(V);
- #endif
- }
- //------------------------------------------------------------------------------
- _Use_decl_annotations_
- inline XMVECTOR XM_CALLCONV XMLoadFloat2
- (
- const XMFLOAT2* pSource
- )
- {
- assert(pSource);
- #if defined(_XM_NO_INTRINSICS_)
- XMVECTOR V;
- V.vector4_f32[0] = pSource->x;
- V.vector4_f32[1] = pSource->y;
- V.vector4_f32[2] = 0.f;
- V.vector4_f32[3] = 0.f;
- return V;
- #elif defined(_XM_ARM_NEON_INTRINSICS_)
- float32x2_t x = vld1_f32( reinterpret_cast<const float*>(pSource) );
- float32x2_t zero = vdup_n_f32(0);
- return vcombine_f32( x, zero );
- #elif defined(_XM_SSE_INTRINSICS_)
- __m128 x = _mm_load_ss( &pSource->x );
- __m128 y = _mm_load_ss( &pSource->y );
- return _mm_unpacklo_ps( x, y );
- #endif
- }
- //------------------------------------------------------------------------------
- _Use_decl_annotations_
- inline XMVECTOR XM_CALLCONV XMLoadFloat2A
- (
- const XMFLOAT2A* pSource
- )
- {
- assert(pSource);
- assert(((uintptr_t)pSource & 0xF) == 0);
- #if defined(_XM_NO_INTRINSICS_)
- XMVECTOR V;
- V.vector4_f32[0] = pSource->x;
- V.vector4_f32[1] = pSource->y;
- V.vector4_f32[2] = 0.f;
- V.vector4_f32[3] = 0.f;
- return V;
- #elif defined(_XM_ARM_NEON_INTRINSICS_)
- float32x2_t x = vld1_f32_ex( reinterpret_cast<const float*>(pSource), 64 );
- float32x2_t zero = vdup_n_f32(0);
- return vcombine_f32( x, zero );
- #elif defined(_XM_SSE_INTRINSICS_)
- __m128i V = _mm_loadl_epi64( reinterpret_cast<const __m128i*>(pSource) );
- return _mm_castsi128_ps(V);
- #endif
- }
- //------------------------------------------------------------------------------
- _Use_decl_annotations_
- inline XMVECTOR XM_CALLCONV XMLoadSInt2
- (
- const XMINT2* pSource
- )
- {
- assert(pSource);
- #if defined(_XM_NO_INTRINSICS_)
- XMVECTOR V;
- V.vector4_f32[0] = (float)pSource->x;
- V.vector4_f32[1] = (float)pSource->y;
- V.vector4_f32[2] = 0.f;
- V.vector4_f32[3] = 0.f;
- return V;
- #elif defined(_XM_ARM_NEON_INTRINSICS_)
- int32x2_t x = vld1_s32( reinterpret_cast<const int32_t*>(pSource) );
- float32x2_t v = vcvt_f32_s32( x );
- float32x2_t zero = vdup_n_f32(0);
- return vcombine_f32( v, zero );
- #elif defined(_XM_SSE_INTRINSICS_)
- __m128 x = _mm_load_ss( reinterpret_cast<const float*>(&pSource->x) );
- __m128 y = _mm_load_ss( reinterpret_cast<const float*>(&pSource->y) );
- __m128 V = _mm_unpacklo_ps( x, y );
- return _mm_cvtepi32_ps(_mm_castps_si128(V));
- #endif
- }
- //------------------------------------------------------------------------------
- _Use_decl_annotations_
- inline XMVECTOR XM_CALLCONV XMLoadUInt2
- (
- const XMUINT2* pSource
- )
- {
- assert(pSource);
- #if defined(_XM_NO_INTRINSICS_)
- XMVECTOR V;
- V.vector4_f32[0] = (float)pSource->x;
- V.vector4_f32[1] = (float)pSource->y;
- V.vector4_f32[2] = 0.f;
- V.vector4_f32[3] = 0.f;
- return V;
- #elif defined(_XM_ARM_NEON_INTRINSICS_)
- uint32x2_t x = vld1_u32( reinterpret_cast<const uint32_t*>(pSource) );
- float32x2_t v = vcvt_f32_u32( x );
- float32x2_t zero = vdup_n_f32(0);
- return vcombine_f32( v, zero );
- #elif defined(_XM_SSE_INTRINSICS_)
- __m128 x = _mm_load_ss( reinterpret_cast<const float*>(&pSource->x) );
- __m128 y = _mm_load_ss( reinterpret_cast<const float*>(&pSource->y) );
- __m128 V = _mm_unpacklo_ps( x, y );
- // For the values that are higher than 0x7FFFFFFF, a fixup is needed
- // Determine which ones need the fix.
- XMVECTOR vMask = _mm_and_ps(V,g_XMNegativeZero);
- // Force all values positive
- XMVECTOR vResult = _mm_xor_ps(V,vMask);
- // Convert to floats
- vResult = _mm_cvtepi32_ps(_mm_castps_si128(vResult));
- // Convert 0x80000000 -> 0xFFFFFFFF
- __m128i iMask = _mm_srai_epi32(_mm_castps_si128(vMask),31);
- // For only the ones that are too big, add the fixup
- vMask = _mm_and_ps(_mm_castsi128_ps(iMask),g_XMFixUnsigned);
- vResult = _mm_add_ps(vResult,vMask);
- return vResult;
- #endif
- }
- //------------------------------------------------------------------------------
- _Use_decl_annotations_
- inline XMVECTOR XM_CALLCONV XMLoadInt3
- (
- const uint32_t* pSource
- )
- {
- assert(pSource);
- #if defined(_XM_NO_INTRINSICS_)
- XMVECTOR V;
- V.vector4_u32[0] = pSource[0];
- V.vector4_u32[1] = pSource[1];
- V.vector4_u32[2] = pSource[2];
- V.vector4_u32[3] = 0;
- return V;
- #elif defined(_XM_ARM_NEON_INTRINSICS_)
- uint32x2_t x = vld1_u32( pSource );
- uint32x2_t zero = vdup_n_u32(0);
- uint32x2_t y = vld1_lane_u32( pSource+2, zero, 0 );
- return vcombine_u32( x, y );
- #elif defined(_XM_SSE_INTRINSICS_)
- __m128 x = _mm_load_ss( reinterpret_cast<const float*>(pSource) );
- __m128 y = _mm_load_ss( reinterpret_cast<const float*>(pSource+1) );
- __m128 z = _mm_load_ss( reinterpret_cast<const float*>(pSource+2) );
- __m128 xy = _mm_unpacklo_ps( x, y );
- return _mm_movelh_ps( xy, z );
- #endif
- }
- //------------------------------------------------------------------------------
- _Use_decl_annotations_
- inline XMVECTOR XM_CALLCONV XMLoadInt3A
- (
- const uint32_t* pSource
- )
- {
- assert(pSource);
- assert(((uintptr_t)pSource & 0xF) == 0);
- #if defined(_XM_NO_INTRINSICS_)
- XMVECTOR V;
- V.vector4_u32[0] = pSource[0];
- V.vector4_u32[1] = pSource[1];
- V.vector4_u32[2] = pSource[2];
- V.vector4_u32[3] = 0;
- return V;
- #elif defined(_XM_ARM_NEON_INTRINSICS_)
- // Reads an extra integer which is zero'd
- uint32x4_t V = vld1q_u32_ex( pSource, 128 );
- return vsetq_lane_u32( 0, V, 3 );
- #elif defined(_XM_SSE_INTRINSICS_)
- // Reads an extra integer which is zero'd
- __m128i V = _mm_load_si128( reinterpret_cast<const __m128i*>(pSource) );
- V = _mm_and_si128( V, g_XMMask3 );
- return _mm_castsi128_ps(V);
- #endif
- }
- //------------------------------------------------------------------------------
- _Use_decl_annotations_
- inline XMVECTOR XM_CALLCONV XMLoadFloat3
- (
- const XMFLOAT3* pSource
- )
- {
- assert(pSource);
- #if defined(_XM_NO_INTRINSICS_)
- XMVECTOR V;
- V.vector4_f32[0] = pSource->x;
- V.vector4_f32[1] = pSource->y;
- V.vector4_f32[2] = pSource->z;
- V.vector4_f32[3] = 0.f;
- return V;
- #elif defined(_XM_ARM_NEON_INTRINSICS_)
- float32x2_t x = vld1_f32( reinterpret_cast<const float*>(pSource) );
- float32x2_t zero = vdup_n_f32(0);
- float32x2_t y = vld1_lane_f32( reinterpret_cast<const float*>(pSource)+2, zero, 0 );
- return vcombine_f32( x, y );
- #elif defined(_XM_SSE_INTRINSICS_)
- __m128 x = _mm_load_ss( &pSource->x );
- __m128 y = _mm_load_ss( &pSource->y );
- __m128 z = _mm_load_ss( &pSource->z );
- __m128 xy = _mm_unpacklo_ps( x, y );
- return _mm_movelh_ps( xy, z );
- #endif
- }
- //------------------------------------------------------------------------------
- _Use_decl_annotations_
- inline XMVECTOR XM_CALLCONV XMLoadFloat3A
- (
- const XMFLOAT3A* pSource
- )
- {
- assert(pSource);
- assert(((uintptr_t)pSource & 0xF) == 0);
- #if defined(_XM_NO_INTRINSICS_)
- XMVECTOR V;
- V.vector4_f32[0] = pSource->x;
- V.vector4_f32[1] = pSource->y;
- V.vector4_f32[2] = pSource->z;
- V.vector4_f32[3] = 0.f;
- return V;
- #elif defined(_XM_ARM_NEON_INTRINSICS_)
- // Reads an extra float which is zero'd
- float32x4_t V = vld1q_f32_ex( reinterpret_cast<const float*>(pSource), 128 );
- return vsetq_lane_f32( 0, V, 3 );
- #elif defined(_XM_SSE_INTRINSICS_)
- // Reads an extra float which is zero'd
- __m128 V = _mm_load_ps( &pSource->x );
- return _mm_and_ps( V, g_XMMask3 );
- #endif
- }
- //------------------------------------------------------------------------------
- _Use_decl_annotations_
- inline XMVECTOR XM_CALLCONV XMLoadSInt3
- (
- const XMINT3* pSource
- )
- {
- assert(pSource);
- #if defined(_XM_NO_INTRINSICS_)
- XMVECTOR V;
- V.vector4_f32[0] = (float)pSource->x;
- V.vector4_f32[1] = (float)pSource->y;
- V.vector4_f32[2] = (float)pSource->z;
- V.vector4_f32[3] = 0.f;
- return V;
- #elif defined(_XM_ARM_NEON_INTRINSICS_)
- int32x2_t x = vld1_s32( reinterpret_cast<const int32_t*>(pSource) );
- int32x2_t zero = vdup_n_s32(0);
- int32x2_t y = vld1_lane_s32( reinterpret_cast<const int32_t*>(pSource)+2, zero, 0 );
- int32x4_t v = vcombine_s32( x, y );
- return vcvtq_f32_s32( v );
- #elif defined(_XM_SSE_INTRINSICS_)
- __m128 x = _mm_load_ss( reinterpret_cast<const float*>(&pSource->x) );
- __m128 y = _mm_load_ss( reinterpret_cast<const float*>(&pSource->y) );
- __m128 z = _mm_load_ss( reinterpret_cast<const float*>(&pSource->z) );
- __m128 xy = _mm_unpacklo_ps( x, y );
- __m128 V = _mm_movelh_ps( xy, z );
- return _mm_cvtepi32_ps(_mm_castps_si128(V));
- #endif
- }
- //------------------------------------------------------------------------------
- _Use_decl_annotations_
- inline XMVECTOR XM_CALLCONV XMLoadUInt3
- (
- const XMUINT3* pSource
- )
- {
- assert(pSource);
- #if defined(_XM_NO_INTRINSICS_)
- XMVECTOR V;
- V.vector4_f32[0] = (float)pSource->x;
- V.vector4_f32[1] = (float)pSource->y;
- V.vector4_f32[2] = (float)pSource->z;
- V.vector4_f32[3] = 0.f;
- return V;
- #elif defined(_XM_ARM_NEON_INTRINSICS_)
- uint32x2_t x = vld1_u32( reinterpret_cast<const uint32_t*>(pSource) );
- uint32x2_t zero = vdup_n_u32(0);
- uint32x2_t y = vld1_lane_u32( reinterpret_cast<const uint32_t*>(pSource)+2, zero, 0 );
- uint32x4_t v = vcombine_u32( x, y );
- return vcvtq_f32_u32( v );
- #elif defined(_XM_SSE_INTRINSICS_)
- __m128 x = _mm_load_ss( reinterpret_cast<const float*>(&pSource->x) );
- __m128 y = _mm_load_ss( reinterpret_cast<const float*>(&pSource->y) );
- __m128 z = _mm_load_ss( reinterpret_cast<const float*>(&pSource->z) );
- __m128 xy = _mm_unpacklo_ps( x, y );
- __m128 V = _mm_movelh_ps( xy, z );
- // For the values that are higher than 0x7FFFFFFF, a fixup is needed
- // Determine which ones need the fix.
- XMVECTOR vMask = _mm_and_ps(V,g_XMNegativeZero);
- // Force all values positive
- XMVECTOR vResult = _mm_xor_ps(V,vMask);
- // Convert to floats
- vResult = _mm_cvtepi32_ps(_mm_castps_si128(vResult));
- // Convert 0x80000000 -> 0xFFFFFFFF
- __m128i iMask = _mm_srai_epi32(_mm_castps_si128(vMask),31);
- // For only the ones that are too big, add the fixup
- vMask = _mm_and_ps(_mm_castsi128_ps(iMask),g_XMFixUnsigned);
- vResult = _mm_add_ps(vResult,vMask);
- return vResult;
- #endif
- }
- //------------------------------------------------------------------------------
- _Use_decl_annotations_
- inline XMVECTOR XM_CALLCONV XMLoadInt4
- (
- const uint32_t* pSource
- )
- {
- assert(pSource);
- #if defined(_XM_NO_INTRINSICS_)
- XMVECTOR V;
- V.vector4_u32[0] = pSource[0];
- V.vector4_u32[1] = pSource[1];
- V.vector4_u32[2] = pSource[2];
- V.vector4_u32[3] = pSource[3];
- return V;
- #elif defined(_XM_ARM_NEON_INTRINSICS_)
- return vld1q_u32( pSource );
- #elif defined(_XM_SSE_INTRINSICS_)
- __m128i V = _mm_loadu_si128( reinterpret_cast<const __m128i*>(pSource) );
- return _mm_castsi128_ps(V);
- #endif
- }
- //------------------------------------------------------------------------------
- _Use_decl_annotations_
- inline XMVECTOR XM_CALLCONV XMLoadInt4A
- (
- const uint32_t* pSource
- )
- {
- assert(pSource);
- assert(((uintptr_t)pSource & 0xF) == 0);
- #if defined(_XM_NO_INTRINSICS_)
- XMVECTOR V;
- V.vector4_u32[0] = pSource[0];
- V.vector4_u32[1] = pSource[1];
- V.vector4_u32[2] = pSource[2];
- V.vector4_u32[3] = pSource[3];
- return V;
- #elif defined(_XM_ARM_NEON_INTRINSICS_)
- return vld1q_u32_ex( pSource, 128 );
- #elif defined(_XM_SSE_INTRINSICS_)
- __m128i V = _mm_load_si128( reinterpret_cast<const __m128i*>(pSource) );
- return _mm_castsi128_ps(V);
- #endif
- }
- //------------------------------------------------------------------------------
- _Use_decl_annotations_
- inline XMVECTOR XM_CALLCONV XMLoadFloat4
- (
- const XMFLOAT4* pSource
- )
- {
- assert(pSource);
- #if defined(_XM_NO_INTRINSICS_)
- XMVECTOR V;
- V.vector4_f32[0] = pSource->x;
- V.vector4_f32[1] = pSource->y;
- V.vector4_f32[2] = pSource->z;
- V.vector4_f32[3] = pSource->w;
- return V;
- #elif defined(_XM_ARM_NEON_INTRINSICS_)
- return vld1q_f32( reinterpret_cast<const float*>(pSource) );
- #elif defined(_XM_SSE_INTRINSICS_)
- return _mm_loadu_ps( &pSource->x );
- #endif
- }
- //------------------------------------------------------------------------------
- _Use_decl_annotations_
- inline XMVECTOR XM_CALLCONV XMLoadFloat4A
- (
- const XMFLOAT4A* pSource
- )
- {
- assert(pSource);
- assert(((uintptr_t)pSource & 0xF) == 0);
- #if defined(_XM_NO_INTRINSICS_)
- XMVECTOR V;
- V.vector4_f32[0] = pSource->x;
- V.vector4_f32[1] = pSource->y;
- V.vector4_f32[2] = pSource->z;
- V.vector4_f32[3] = pSource->w;
- return V;
- #elif defined(_XM_ARM_NEON_INTRINSICS_)
- return vld1q_f32_ex( reinterpret_cast<const float*>(pSource), 128 );
- #elif defined(_XM_SSE_INTRINSICS_)
- return _mm_load_ps( &pSource->x );
- #endif
- }
- //------------------------------------------------------------------------------
- _Use_decl_annotations_
- inline XMVECTOR XM_CALLCONV XMLoadSInt4
- (
- const XMINT4* pSource
- )
- {
- assert(pSource);
- #if defined(_XM_NO_INTRINSICS_)
- XMVECTOR V;
- V.vector4_f32[0] = (float)pSource->x;
- V.vector4_f32[1] = (float)pSource->y;
- V.vector4_f32[2] = (float)pSource->z;
- V.vector4_f32[3] = (float)pSource->w;
- return V;
- #elif defined(_XM_ARM_NEON_INTRINSICS_)
- int32x4_t v = vld1q_s32( reinterpret_cast<const int32_t*>(pSource) );
- return vcvtq_f32_s32( v );
- #elif defined(_XM_SSE_INTRINSICS_)
- __m128i V = _mm_loadu_si128( reinterpret_cast<const __m128i*>(pSource) );
- return _mm_cvtepi32_ps(V);
- #endif
- }
- //------------------------------------------------------------------------------
- _Use_decl_annotations_
- inline XMVECTOR XM_CALLCONV XMLoadUInt4
- (
- const XMUINT4* pSource
- )
- {
- assert(pSource);
- #if defined(_XM_NO_INTRINSICS_)
- XMVECTOR V;
- V.vector4_f32[0] = (float)pSource->x;
- V.vector4_f32[1] = (float)pSource->y;
- V.vector4_f32[2] = (float)pSource->z;
- V.vector4_f32[3] = (float)pSource->w;
- return V;
- #elif defined(_XM_ARM_NEON_INTRINSICS_)
- uint32x4_t v = vld1q_u32( reinterpret_cast<const uint32_t*>(pSource) );
- return vcvtq_f32_u32( v );
- #elif defined(_XM_SSE_INTRINSICS_)
- __m128i V = _mm_loadu_si128( reinterpret_cast<const __m128i*>(pSource) );
- // For the values that are higher than 0x7FFFFFFF, a fixup is needed
- // Determine which ones need the fix.
- XMVECTOR vMask = _mm_and_ps(_mm_castsi128_ps(V),g_XMNegativeZero);
- // Force all values positive
- XMVECTOR vResult = _mm_xor_ps(_mm_castsi128_ps(V),vMask);
- // Convert to floats
- vResult = _mm_cvtepi32_ps(_mm_castps_si128(vResult));
- // Convert 0x80000000 -> 0xFFFFFFFF
- __m128i iMask = _mm_srai_epi32(_mm_castps_si128(vMask),31);
- // For only the ones that are too big, add the fixup
- vMask = _mm_and_ps(_mm_castsi128_ps(iMask),g_XMFixUnsigned);
- vResult = _mm_add_ps(vResult,vMask);
- return vResult;
- #endif
- }
- //------------------------------------------------------------------------------
- _Use_decl_annotations_
- inline XMMATRIX XM_CALLCONV XMLoadFloat3x3
- (
- const XMFLOAT3X3* pSource
- )
- {
- assert(pSource);
- #if defined(_XM_NO_INTRINSICS_)
- XMMATRIX M;
- M.r[0].vector4_f32[0] = pSource->m[0][0];
- M.r[0].vector4_f32[1] = pSource->m[0][1];
- M.r[0].vector4_f32[2] = pSource->m[0][2];
- M.r[0].vector4_f32[3] = 0.0f;
- M.r[1].vector4_f32[0] = pSource->m[1][0];
- M.r[1].vector4_f32[1] = pSource->m[1][1];
- M.r[1].vector4_f32[2] = pSource->m[1][2];
- M.r[1].vector4_f32[3] = 0.0f;
- M.r[2].vector4_f32[0] = pSource->m[2][0];
- M.r[2].vector4_f32[1] = pSource->m[2][1];
- M.r[2].vector4_f32[2] = pSource->m[2][2];
- M.r[2].vector4_f32[3] = 0.0f;
- M.r[3].vector4_f32[0] = 0.0f;
- M.r[3].vector4_f32[1] = 0.0f;
- M.r[3].vector4_f32[2] = 0.0f;
- M.r[3].vector4_f32[3] = 1.0f;
- return M;
- #elif defined(_XM_ARM_NEON_INTRINSICS_)
- float32x4_t v0 = vld1q_f32( &pSource->m[0][0] );
- float32x4_t v1 = vld1q_f32( &pSource->m[1][1] );
- float32x2_t v2 = vcreate_f32( (uint64_t)*(const uint32_t*)&pSource->m[2][2] );
- float32x4_t T = vextq_f32( v0, v1, 3 );
- XMMATRIX M;
- M.r[0] = vandq_u32( v0, g_XMMask3 );
- M.r[1] = vandq_u32( T, g_XMMask3 );
- M.r[2] = vcombine_f32( vget_high_f32(v1), v2 );
- M.r[3] = g_XMIdentityR3;
- return M;
- #elif defined(_XM_SSE_INTRINSICS_)
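- // The nine floats are fetched with three overlapping loads:
- //   V1 = x1,y1,z1,x2   V2 = y2,z2,x3,y3   V3 = z3,0,0,0
- // and shuffled into rows with a zeroed w component:
- //   r[0] = x1,y1,z1,0   r[1] = x2,y2,z2,0   r[2] = x3,y3,z3,0   r[3] = 0,0,0,1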
- __m128 Z = _mm_setzero_ps();
- __m128 V1 = _mm_loadu_ps( &pSource->m[0][0] );
- __m128 V2 = _mm_loadu_ps( &pSource->m[1][1] );
- __m128 V3 = _mm_load_ss( &pSource->m[2][2] );
- __m128 T1 = _mm_unpackhi_ps( V1, Z );
- __m128 T2 = _mm_unpacklo_ps( V2, Z );
- __m128 T3 = _mm_shuffle_ps( V3, T2, _MM_SHUFFLE( 0, 1, 0, 0 ) );
- __m128 T4 = _mm_movehl_ps( T2, T3 );
- __m128 T5 = _mm_movehl_ps( Z, T1 );
- XMMATRIX M;
- M.r[0] = _mm_movelh_ps( V1, T1 );
- M.r[1] = _mm_add_ps( T4, T5 );
- M.r[2] = _mm_shuffle_ps( V2, V3, _MM_SHUFFLE(1, 0, 3, 2) );
- M.r[3] = g_XMIdentityR3;
- return M;
- #endif
- }
- //------------------------------------------------------------------------------
- _Use_decl_annotations_
- inline XMMATRIX XM_CALLCONV XMLoadFloat4x3
- (
- const XMFLOAT4X3* pSource
- )
- {
- assert(pSource);
- #if defined(_XM_NO_INTRINSICS_)
- XMMATRIX M;
- M.r[0].vector4_f32[0] = pSource->m[0][0];
- M.r[0].vector4_f32[1] = pSource->m[0][1];
- M.r[0].vector4_f32[2] = pSource->m[0][2];
- M.r[0].vector4_f32[3] = 0.0f;
- M.r[1].vector4_f32[0] = pSource->m[1][0];
- M.r[1].vector4_f32[1] = pSource->m[1][1];
- M.r[1].vector4_f32[2] = pSource->m[1][2];
- M.r[1].vector4_f32[3] = 0.0f;
- M.r[2].vector4_f32[0] = pSource->m[2][0];
- M.r[2].vector4_f32[1] = pSource->m[2][1];
- M.r[2].vector4_f32[2] = pSource->m[2][2];
- M.r[2].vector4_f32[3] = 0.0f;
- M.r[3].vector4_f32[0] = pSource->m[3][0];
- M.r[3].vector4_f32[1] = pSource->m[3][1];
- M.r[3].vector4_f32[2] = pSource->m[3][2];
- M.r[3].vector4_f32[3] = 1.0f;
- return M;
- #elif defined(_XM_ARM_NEON_INTRINSICS_)
- float32x4_t v0 = vld1q_f32( &pSource->m[0][0] );
- float32x4_t v1 = vld1q_f32( &pSource->m[1][1] );
- float32x4_t v2 = vld1q_f32( &pSource->m[2][2] );
- float32x4_t T1 = vextq_f32( v0, v1, 3 );
- float32x4_t T2 = vcombine_f32( vget_high_f32(v1), vget_low_f32(v2) );
- float32x4_t T3 = vextq_f32( v2, v2, 1 );
- XMMATRIX M;
- M.r[0] = vandq_u32( v0, g_XMMask3 );
- M.r[1] = vandq_u32( T1, g_XMMask3 );
- M.r[2] = vandq_u32( T2, g_XMMask3 );
- M.r[3] = vsetq_lane_f32( 1.f, T3, 3 );
- return M;
- #elif defined(_XM_SSE_INTRINSICS_)
- // Use unaligned load instructions to
- // load the 12 floats
- // vTemp1 = x1,y1,z1,x2
- XMVECTOR vTemp1 = _mm_loadu_ps(&pSource->m[0][0]);
- // vTemp2 = y2,z2,x3,y3
- XMVECTOR vTemp2 = _mm_loadu_ps(&pSource->m[1][1]);
- // vTemp4 = z3,x4,y4,z4
- XMVECTOR vTemp4 = _mm_loadu_ps(&pSource->m[2][2]);
- // vTemp3 = x3,y3,z3,z3
- XMVECTOR vTemp3 = _mm_shuffle_ps(vTemp2,vTemp4,_MM_SHUFFLE(0,0,3,2));
- // vTemp2 = y2,z2,x2,x2
- vTemp2 = _mm_shuffle_ps(vTemp2,vTemp1,_MM_SHUFFLE(3,3,1,0));
- // vTemp2 = x2,y2,z2,z2
- vTemp2 = XM_PERMUTE_PS(vTemp2,_MM_SHUFFLE(1,1,0,2));
- // vTemp1 = x1,y1,z1,0
- vTemp1 = _mm_and_ps(vTemp1,g_XMMask3);
- // vTemp2 = x2,y2,z2,0
- vTemp2 = _mm_and_ps(vTemp2,g_XMMask3);
- // vTemp3 = x3,y3,z3,0
- vTemp3 = _mm_and_ps(vTemp3,g_XMMask3);
- // vTemp4i = x4,y4,z4,0
- __m128i vTemp4i = _mm_srli_si128(_mm_castps_si128(vTemp4),32/8);
- // vTemp4i = x4,y4,z4,1.0f
- vTemp4i = _mm_or_si128(vTemp4i,g_XMIdentityR3);
- XMMATRIX M(vTemp1,
- vTemp2,
- vTemp3,
- _mm_castsi128_ps(vTemp4i));
- return M;
- #endif
- }
- //------------------------------------------------------------------------------
- _Use_decl_annotations_
- inline XMMATRIX XM_CALLCONV XMLoadFloat4x3A
- (
- const XMFLOAT4X3A* pSource
- )
- {
- assert(pSource);
- assert(((uintptr_t)pSource & 0xF) == 0);
- #if defined(_XM_NO_INTRINSICS_)
- XMMATRIX M;
- M.r[0].vector4_f32[0] = pSource->m[0][0];
- M.r[0].vector4_f32[1] = pSource->m[0][1];
- M.r[0].vector4_f32[2] = pSource->m[0][2];
- M.r[0].vector4_f32[3] = 0.0f;
- M.r[1].vector4_f32[0] = pSource->m[1][0];
- M.r[1].vector4_f32[1] = pSource->m[1][1];
- M.r[1].vector4_f32[2] = pSource->m[1][2];
- M.r[1].vector4_f32[3] = 0.0f;
- M.r[2].vector4_f32[0] = pSource->m[2][0];
- M.r[2].vector4_f32[1] = pSource->m[2][1];
- M.r[2].vector4_f32[2] = pSource->m[2][2];
- M.r[2].vector4_f32[3] = 0.0f;
- M.r[3].vector4_f32[0] = pSource->m[3][0];
- M.r[3].vector4_f32[1] = pSource->m[3][1];
- M.r[3].vector4_f32[2] = pSource->m[3][2];
- M.r[3].vector4_f32[3] = 1.0f;
- return M;
- #elif defined(_XM_ARM_NEON_INTRINSICS_)
- float32x4_t v0 = vld1q_f32_ex( &pSource->m[0][0], 128 );
- float32x4_t v1 = vld1q_f32_ex( &pSource->m[1][1], 128 );
- float32x4_t v2 = vld1q_f32_ex( &pSource->m[2][2], 128 );
- float32x4_t T1 = vextq_f32( v0, v1, 3 );
- float32x4_t T2 = vcombine_f32( vget_high_f32(v1), vget_low_f32(v2) );
- float32x4_t T3 = vextq_f32( v2, v2, 1 );
- XMMATRIX M;
- M.r[0] = vandq_u32( v0, g_XMMask3 );
- M.r[1] = vandq_u32( T1, g_XMMask3 );
- M.r[2] = vandq_u32( T2, g_XMMask3 );
- M.r[3] = vsetq_lane_f32( 1.f, T3, 3 );
- return M;
- #elif defined(_XM_SSE_INTRINSICS_)
- // Use aligned load instructions to
- // load the 12 floats
- // vTemp1 = x1,y1,z1,x2
- XMVECTOR vTemp1 = _mm_load_ps(&pSource->m[0][0]);
- // vTemp2 = y2,z2,x3,y3
- XMVECTOR vTemp2 = _mm_load_ps(&pSource->m[1][1]);
- // vTemp4 = z3,x4,y4,z4
- XMVECTOR vTemp4 = _mm_load_ps(&pSource->m[2][2]);
- // vTemp3 = x3,y3,z3,z3
- XMVECTOR vTemp3 = _mm_shuffle_ps(vTemp2,vTemp4,_MM_SHUFFLE(0,0,3,2));
- // vTemp2 = y2,z2,x2,x2
- vTemp2 = _mm_shuffle_ps(vTemp2,vTemp1,_MM_SHUFFLE(3,3,1,0));
- // vTemp2 = x2,y2,z2,z2
- vTemp2 = XM_PERMUTE_PS(vTemp2,_MM_SHUFFLE(1,1,0,2));
- // vTemp1 = x1,y1,z1,0
- vTemp1 = _mm_and_ps(vTemp1,g_XMMask3);
- // vTemp2 = x2,y2,z2,0
- vTemp2 = _mm_and_ps(vTemp2,g_XMMask3);
- // vTemp3 = x3,y3,z3,0
- vTemp3 = _mm_and_ps(vTemp3,g_XMMask3);
- // vTemp4i = x4,y4,z4,0
- __m128i vTemp4i = _mm_srli_si128(_mm_castps_si128(vTemp4),32/8);
- // vTemp4i = x4,y4,z4,1.0f
- vTemp4i = _mm_or_si128(vTemp4i,g_XMIdentityR3);
- XMMATRIX M(vTemp1,
- vTemp2,
- vTemp3,
- _mm_castsi128_ps(vTemp4i));
- return M;
- #endif
- }
- //------------------------------------------------------------------------------
- _Use_decl_annotations_
- inline XMMATRIX XM_CALLCONV XMLoadFloat4x4
- (
- const XMFLOAT4X4* pSource
- )
- {
- assert(pSource);
- #if defined(_XM_NO_INTRINSICS_)
- XMMATRIX M;
- M.r[0].vector4_f32[0] = pSource->m[0][0];
- M.r[0].vector4_f32[1] = pSource->m[0][1];
- M.r[0].vector4_f32[2] = pSource->m[0][2];
- M.r[0].vector4_f32[3] = pSource->m[0][3];
- M.r[1].vector4_f32[0] = pSource->m[1][0];
- M.r[1].vector4_f32[1] = pSource->m[1][1];
- M.r[1].vector4_f32[2] = pSource->m[1][2];
- M.r[1].vector4_f32[3] = pSource->m[1][3];
- M.r[2].vector4_f32[0] = pSource->m[2][0];
- M.r[2].vector4_f32[1] = pSource->m[2][1];
- M.r[2].vector4_f32[2] = pSource->m[2][2];
- M.r[2].vector4_f32[3] = pSource->m[2][3];
- M.r[3].vector4_f32[0] = pSource->m[3][0];
- M.r[3].vector4_f32[1] = pSource->m[3][1];
- M.r[3].vector4_f32[2] = pSource->m[3][2];
- M.r[3].vector4_f32[3] = pSource->m[3][3];
- return M;
- #elif defined(_XM_ARM_NEON_INTRINSICS_)
- XMMATRIX M;
- M.r[0] = vld1q_f32( reinterpret_cast<const float*>(&pSource->_11) );
- M.r[1] = vld1q_f32( reinterpret_cast<const float*>(&pSource->_21) );
- M.r[2] = vld1q_f32( reinterpret_cast<const float*>(&pSource->_31) );
- M.r[3] = vld1q_f32( reinterpret_cast<const float*>(&pSource->_41) );
- return M;
- #elif defined(_XM_SSE_INTRINSICS_)
- XMMATRIX M;
- M.r[0] = _mm_loadu_ps( &pSource->_11 );
- M.r[1] = _mm_loadu_ps( &pSource->_21 );
- M.r[2] = _mm_loadu_ps( &pSource->_31 );
- M.r[3] = _mm_loadu_ps( &pSource->_41 );
- return M;
- #endif
- }
- //------------------------------------------------------------------------------
- _Use_decl_annotations_
- inline XMMATRIX XM_CALLCONV XMLoadFloat4x4A
- (
- const XMFLOAT4X4A* pSource
- )
- {
- assert(pSource);
- assert(((uintptr_t)pSource & 0xF) == 0);
- #if defined(_XM_NO_INTRINSICS_)
- XMMATRIX M;
- M.r[0].vector4_f32[0] = pSource->m[0][0];
- M.r[0].vector4_f32[1] = pSource->m[0][1];
- M.r[0].vector4_f32[2] = pSource->m[0][2];
- M.r[0].vector4_f32[3] = pSource->m[0][3];
- M.r[1].vector4_f32[0] = pSource->m[1][0];
- M.r[1].vector4_f32[1] = pSource->m[1][1];
- M.r[1].vector4_f32[2] = pSource->m[1][2];
- M.r[1].vector4_f32[3] = pSource->m[1][3];
- M.r[2].vector4_f32[0] = pSource->m[2][0];
- M.r[2].vector4_f32[1] = pSource->m[2][1];
- M.r[2].vector4_f32[2] = pSource->m[2][2];
- M.r[2].vector4_f32[3] = pSource->m[2][3];
- M.r[3].vector4_f32[0] = pSource->m[3][0];
- M.r[3].vector4_f32[1] = pSource->m[3][1];
- M.r[3].vector4_f32[2] = pSource->m[3][2];
- M.r[3].vector4_f32[3] = pSource->m[3][3];
- return M;
- #elif defined(_XM_ARM_NEON_INTRINSICS_)
- XMMATRIX M;
- M.r[0] = vld1q_f32_ex( reinterpret_cast<const float*>(&pSource->_11), 128 );
- M.r[1] = vld1q_f32_ex( reinterpret_cast<const float*>(&pSource->_21), 128 );
- M.r[2] = vld1q_f32_ex( reinterpret_cast<const float*>(&pSource->_31), 128 );
- M.r[3] = vld1q_f32_ex( reinterpret_cast<const float*>(&pSource->_41), 128 );
- return M;
- #elif defined(_XM_SSE_INTRINSICS_)
- XMMATRIX M;
- M.r[0] = _mm_load_ps( &pSource->_11 );
- M.r[1] = _mm_load_ps( &pSource->_21 );
- M.r[2] = _mm_load_ps( &pSource->_31 );
- M.r[3] = _mm_load_ps( &pSource->_41 );
- return M;
- #endif
- }
- /****************************************************************************
- *
- * Vector and matrix store operations
- *
- ****************************************************************************/
- _Use_decl_annotations_
- inline void XM_CALLCONV XMStoreInt
- (
- uint32_t* pDestination,
- FXMVECTOR V
- )
- {
- assert(pDestination);
- #if defined(_XM_NO_INTRINSICS_)
- *pDestination = XMVectorGetIntX( V );
- #elif defined(_XM_ARM_NEON_INTRINSICS_)
- vst1q_lane_u32( pDestination, *reinterpret_cast<const uint32x4_t*>(&V), 0 );
- #elif defined(_XM_SSE_INTRINSICS_)
- _mm_store_ss( reinterpret_cast<float*>(pDestination), V );
- #endif
- }
- //------------------------------------------------------------------------------
- _Use_decl_annotations_
- inline void XM_CALLCONV XMStoreFloat
- (
- float* pDestination,
- FXMVECTOR V
- )
- {
- assert(pDestination);
- #if defined(_XM_NO_INTRINSICS_)
- *pDestination = XMVectorGetX( V );
- #elif defined(_XM_ARM_NEON_INTRINSICS_)
- vst1q_lane_f32( pDestination, V, 0 );
- #elif defined(_XM_SSE_INTRINSICS_)
- _mm_store_ss( pDestination, V );
- #endif
- }
- //------------------------------------------------------------------------------
- _Use_decl_annotations_
- inline void XM_CALLCONV XMStoreInt2
- (
- uint32_t* pDestination,
- FXMVECTOR V
- )
- {
- assert(pDestination);
- #if defined(_XM_NO_INTRINSICS_)
- pDestination[0] = V.vector4_u32[0];
- pDestination[1] = V.vector4_u32[1];
- #elif defined(_XM_ARM_NEON_INTRINSICS_)
- uint32x2_t VL = vget_low_u32(V);
- vst1_u32( pDestination, VL );
- #elif defined(_XM_SSE_INTRINSICS_)
- XMVECTOR T = XM_PERMUTE_PS( V, _MM_SHUFFLE( 1, 1, 1, 1 ) );
- _mm_store_ss( reinterpret_cast<float*>(&pDestination[0]), V );
- _mm_store_ss( reinterpret_cast<float*>(&pDestination[1]), T );
- #endif
- }
- //------------------------------------------------------------------------------
- _Use_decl_annotations_
- inline void XM_CALLCONV XMStoreInt2A
- (
- uint32_t* pDestination,
- FXMVECTOR V
- )
- {
- assert(pDestination);
- assert(((uintptr_t)pDestination & 0xF) == 0);
- #if defined(_XM_NO_INTRINSICS_)
- pDestination[0] = V.vector4_u32[0];
- pDestination[1] = V.vector4_u32[1];
- #elif defined(_XM_ARM_NEON_INTRINSICS_)
- uint32x2_t VL = vget_low_u32(V);
- vst1_u32_ex( pDestination, VL, 64 );
- #elif defined(_XM_SSE_INTRINSICS_)
- _mm_storel_epi64( reinterpret_cast<__m128i*>(pDestination), _mm_castps_si128(V) );
- #endif
- }
- //------------------------------------------------------------------------------
- _Use_decl_annotations_
- inline void XM_CALLCONV XMStoreFloat2
- (
- XMFLOAT2* pDestination,
- FXMVECTOR V
- )
- {
- assert(pDestination);
- #if defined(_XM_NO_INTRINSICS_)
- pDestination->x = V.vector4_f32[0];
- pDestination->y = V.vector4_f32[1];
- #elif defined(_XM_ARM_NEON_INTRINSICS_)
- float32x2_t VL = vget_low_f32(V);
- vst1_f32( reinterpret_cast<float*>(pDestination), VL );
- #elif defined(_XM_SSE_INTRINSICS_)
- XMVECTOR T = XM_PERMUTE_PS( V, _MM_SHUFFLE( 1, 1, 1, 1 ) );
- _mm_store_ss( &pDestination->x, V );
- _mm_store_ss( &pDestination->y, T );
- #endif
- }
- //------------------------------------------------------------------------------
- _Use_decl_annotations_
- inline void XM_CALLCONV XMStoreFloat2A
- (
- XMFLOAT2A* pDestination,
- FXMVECTOR V
- )
- {
- assert(pDestination);
- assert(((uintptr_t)pDestination & 0xF) == 0);
- #if defined(_XM_NO_INTRINSICS_)
- pDestination->x = V.vector4_f32[0];
- pDestination->y = V.vector4_f32[1];
- #elif defined(_XM_ARM_NEON_INTRINSICS_)
- float32x2_t VL = vget_low_f32(V);
- vst1_f32_ex( reinterpret_cast<float*>(pDestination), VL, 64 );
- #elif defined(_XM_SSE_INTRINSICS_)
- _mm_storel_epi64( reinterpret_cast<__m128i*>(pDestination), _mm_castps_si128(V) );
- #endif
- }
- //------------------------------------------------------------------------------
- _Use_decl_annotations_
- inline void XM_CALLCONV XMStoreSInt2
- (
- XMINT2* pDestination,
- FXMVECTOR V
- )
- {
- assert(pDestination);
- #if defined(_XM_NO_INTRINSICS_)
- pDestination->x = (int32_t)V.vector4_f32[0];
- pDestination->y = (int32_t)V.vector4_f32[1];
- #elif defined(_XM_ARM_NEON_INTRINSICS_)
- int32x2_t v = vget_low_s32(V);
- v = vcvt_s32_f32( v );
- vst1_s32( reinterpret_cast<int32_t*>(pDestination), v );
- #elif defined(_XM_SSE_INTRINSICS_)
- // In case of positive overflow, detect it
- XMVECTOR vOverflow = _mm_cmpgt_ps(V,g_XMMaxInt);
- // Float to int conversion
- __m128i vResulti = _mm_cvttps_epi32(V);
- // If there was positive overflow, set to 0x7FFFFFFF
- XMVECTOR vResult = _mm_and_ps(vOverflow,g_XMAbsMask);
- vOverflow = _mm_andnot_ps(vOverflow,_mm_castsi128_ps(vResulti));
- vOverflow = _mm_or_ps(vOverflow,vResult);
- // Write two ints
- XMVECTOR T = XM_PERMUTE_PS( vOverflow, _MM_SHUFFLE( 1, 1, 1, 1 ) );
- _mm_store_ss( reinterpret_cast<float*>(&pDestination->x), vOverflow );
- _mm_store_ss( reinterpret_cast<float*>(&pDestination->y), T );
- #endif
- }
- //------------------------------------------------------------------------------
- _Use_decl_annotations_
- inline void XM_CALLCONV XMStoreUInt2
- (
- XMUINT2* pDestination,
- FXMVECTOR V
- )
- {
- assert(pDestination);
- #if defined(_XM_NO_INTRINSICS_)
- pDestination->x = (uint32_t)V.vector4_f32[0];
- pDestination->y = (uint32_t)V.vector4_f32[1];
- #elif defined(_XM_ARM_NEON_INTRINSICS_)
- float32x2_t v = vget_low_f32(V);
- uint32x2_t iv = vcvt_u32_f32( v );
- vst1_u32( reinterpret_cast<uint32_t*>(pDestination), iv );
- #elif defined(_XM_SSE_INTRINSICS_)
- // Clamp to >=0
- XMVECTOR vResult = _mm_max_ps(V,g_XMZero);
- // Any numbers that are too big, set to 0xFFFFFFFFU
- XMVECTOR vOverflow = _mm_cmpgt_ps(vResult,g_XMMaxUInt);
- XMVECTOR vValue = g_XMUnsignedFix;
- // Too large for a signed integer?
- XMVECTOR vMask = _mm_cmpge_ps(vResult,vValue);
- // Zero for numbers lower than 0x80000000, 32768.0f*65536.0f otherwise
- vValue = _mm_and_ps(vValue,vMask);
- // Perform fixup only on numbers too large (Keeps low bit precision)
- vResult = _mm_sub_ps(vResult,vValue);
- __m128i vResulti = _mm_cvttps_epi32(vResult);
- // Convert from signed to unsigned only if greater than 0x80000000
- vMask = _mm_and_ps(vMask,g_XMNegativeZero);
- vResult = _mm_xor_ps(_mm_castsi128_ps(vResulti),vMask);
- // On those that are too large, set to 0xFFFFFFFF
- vResult = _mm_or_ps(vResult,vOverflow);
- // Write two uints
- XMVECTOR T = XM_PERMUTE_PS( vResult, _MM_SHUFFLE( 1, 1, 1, 1 ) );
- _mm_store_ss( reinterpret_cast<float*>(&pDestination->x), vResult );
- _mm_store_ss( reinterpret_cast<float*>(&pDestination->y), T );
- #endif
- }
- //------------------------------------------------------------------------------
- _Use_decl_annotations_
- inline void XM_CALLCONV XMStoreInt3
- (
- uint32_t* pDestination,
- FXMVECTOR V
- )
- {
- assert(pDestination);
- #if defined(_XM_NO_INTRINSICS_)
- pDestination[0] = V.vector4_u32[0];
- pDestination[1] = V.vector4_u32[1];
- pDestination[2] = V.vector4_u32[2];
- #elif defined(_XM_ARM_NEON_INTRINSICS_)
- uint32x2_t VL = vget_low_u32(V);
- vst1_u32( pDestination, VL );
- vst1q_lane_u32( pDestination+2, *reinterpret_cast<const uint32x4_t*>(&V), 2 );
- #elif defined(_XM_SSE_INTRINSICS_)
- XMVECTOR T1 = XM_PERMUTE_PS(V,_MM_SHUFFLE(1,1,1,1));
- XMVECTOR T2 = XM_PERMUTE_PS(V,_MM_SHUFFLE(2,2,2,2));
- _mm_store_ss( reinterpret_cast<float*>(pDestination), V );
- _mm_store_ss( reinterpret_cast<float*>(&pDestination[1]), T1 );
- _mm_store_ss( reinterpret_cast<float*>(&pDestination[2]), T2 );
- #endif
- }
- //------------------------------------------------------------------------------
- _Use_decl_annotations_
- inline void XM_CALLCONV XMStoreInt3A
- (
- uint32_t* pDestination,
- FXMVECTOR V
- )
- {
- assert(pDestination);
- assert(((uintptr_t)pDestination & 0xF) == 0);
- #if defined(_XM_NO_INTRINSICS_)
- pDestination[0] = V.vector4_u32[0];
- pDestination[1] = V.vector4_u32[1];
- pDestination[2] = V.vector4_u32[2];
- #elif defined(_XM_ARM_NEON_INTRINSICS_)
- uint32x2_t VL = vget_low_u32(V);
- vst1_u32_ex( pDestination, VL, 64 );
- vst1q_lane_u32( pDestination+2, *reinterpret_cast<const uint32x4_t*>(&V), 2 );
- #elif defined(_XM_SSE_INTRINSICS_)
- XMVECTOR T = XM_PERMUTE_PS(V,_MM_SHUFFLE(2,2,2,2));
- _mm_storel_epi64( reinterpret_cast<__m128i*>(pDestination), _mm_castps_si128(V) );
- _mm_store_ss( reinterpret_cast<float*>(&pDestination[2]), T );
- #endif
- }
- //------------------------------------------------------------------------------
- _Use_decl_annotations_
- inline void XM_CALLCONV XMStoreFloat3
- (
- XMFLOAT3* pDestination,
- FXMVECTOR V
- )
- {
- assert(pDestination);
- #if defined(_XM_NO_INTRINSICS_)
- pDestination->x = V.vector4_f32[0];
- pDestination->y = V.vector4_f32[1];
- pDestination->z = V.vector4_f32[2];
- #elif defined(_XM_ARM_NEON_INTRINSICS_)
- float32x2_t VL = vget_low_f32(V);
- vst1_f32( reinterpret_cast<float*>(pDestination), VL );
- vst1q_lane_f32( reinterpret_cast<float*>(pDestination)+2, V, 2 );
- #elif defined(_XM_SSE_INTRINSICS_)
- XMVECTOR T1 = XM_PERMUTE_PS(V,_MM_SHUFFLE(1,1,1,1));
- XMVECTOR T2 = XM_PERMUTE_PS(V,_MM_SHUFFLE(2,2,2,2));
- _mm_store_ss( &pDestination->x, V );
- _mm_store_ss( &pDestination->y, T1 );
- _mm_store_ss( &pDestination->z, T2 );
- #endif
- }
- //------------------------------------------------------------------------------
- _Use_decl_annotations_
- inline void XM_CALLCONV XMStoreFloat3A
- (
- XMFLOAT3A* pDestination,
- FXMVECTOR V
- )
- {
- assert(pDestination);
- assert(((uintptr_t)pDestination & 0xF) == 0);
- #if defined(_XM_NO_INTRINSICS_)
- pDestination->x = V.vector4_f32[0];
- pDestination->y = V.vector4_f32[1];
- pDestination->z = V.vector4_f32[2];
- #elif defined(_XM_ARM_NEON_INTRINSICS_)
- float32x2_t VL = vget_low_f32(V);
- vst1_f32_ex( reinterpret_cast<float*>(pDestination), VL, 64 );
- vst1q_lane_f32( reinterpret_cast<float*>(pDestination)+2, V, 2 );
- #elif defined(_XM_SSE_INTRINSICS_)
- XMVECTOR T = XM_PERMUTE_PS(V,_MM_SHUFFLE(2,2,2,2));
- _mm_storel_epi64( reinterpret_cast<__m128i*>(pDestination), _mm_castps_si128(V) );
- _mm_store_ss( &pDestination->z, T );
- #endif
- }
- //------------------------------------------------------------------------------
- _Use_decl_annotations_
- inline void XM_CALLCONV XMStoreSInt3
- (
- XMINT3* pDestination,
- FXMVECTOR V
- )
- {
- assert(pDestination);
- #if defined(_XM_NO_INTRINSICS_)
- pDestination->x = (int32_t)V.vector4_f32[0];
- pDestination->y = (int32_t)V.vector4_f32[1];
- pDestination->z = (int32_t)V.vector4_f32[2];
- #elif defined(_XM_ARM_NEON_INTRINSICS_)
- int32x4_t v = vcvtq_s32_f32(V);
- int32x2_t vL = vget_low_s32(v);
- vst1_s32( reinterpret_cast<int32_t*>(pDestination), vL );
- vst1q_lane_s32( reinterpret_cast<int32_t*>(pDestination)+2, v, 2 );
- #elif defined(_XM_SSE_INTRINSICS_)
- // In case of positive overflow, detect it
- XMVECTOR vOverflow = _mm_cmpgt_ps(V,g_XMMaxInt);
- // Float to int conversion
- __m128i vResulti = _mm_cvttps_epi32(V);
- // If there was positive overflow, set to 0x7FFFFFFF
- XMVECTOR vResult = _mm_and_ps(vOverflow,g_XMAbsMask);
- vOverflow = _mm_andnot_ps(vOverflow,_mm_castsi128_ps(vResulti));
- vOverflow = _mm_or_ps(vOverflow,vResult);
- // Write 3 uints
- XMVECTOR T1 = XM_PERMUTE_PS(vOverflow,_MM_SHUFFLE(1,1,1,1));
- XMVECTOR T2 = XM_PERMUTE_PS(vOverflow,_MM_SHUFFLE(2,2,2,2));
- _mm_store_ss( reinterpret_cast<float*>(&pDestination->x), vOverflow );
- _mm_store_ss( reinterpret_cast<float*>(&pDestination->y), T1 );
- _mm_store_ss( reinterpret_cast<float*>(&pDestination->z), T2 );
- #endif
- }
- //------------------------------------------------------------------------------
- _Use_decl_annotations_
- inline void XM_CALLCONV XMStoreUInt3
- (
- XMUINT3* pDestination,
- FXMVECTOR V
- )
- {
- assert(pDestination);
- #if defined(_XM_NO_INTRINSICS_)
- pDestination->x = (uint32_t)V.vector4_f32[0];
- pDestination->y = (uint32_t)V.vector4_f32[1];
- pDestination->z = (uint32_t)V.vector4_f32[2];
- #elif defined(_XM_ARM_NEON_INTRINSICS_)
- uint32x4_t v = vcvtq_u32_f32(V);
- uint32x2_t vL = vget_low_u32(v);
- vst1_u32( reinterpret_cast<uint32_t*>(pDestination), vL );
- vst1q_lane_u32( reinterpret_cast<uint32_t*>(pDestination)+2, v, 2 );
- #elif defined(_XM_SSE_INTRINSICS_)
- // Clamp to >=0
- XMVECTOR vResult = _mm_max_ps(V,g_XMZero);
- // Any numbers that are too big, set to 0xFFFFFFFFU
- XMVECTOR vOverflow = _mm_cmpgt_ps(vResult,g_XMMaxUInt);
- XMVECTOR vValue = g_XMUnsignedFix;
- // Too large for a signed integer?
- XMVECTOR vMask = _mm_cmpge_ps(vResult,vValue);
- // Zero for numbers lower than 0x80000000, 32768.0f*65536.0f otherwise
- vValue = _mm_and_ps(vValue,vMask);
- // Perform fixup only on numbers too large (Keeps low bit precision)
- vResult = _mm_sub_ps(vResult,vValue);
- __m128i vResulti = _mm_cvttps_epi32(vResult);
- // Convert from signed to unsigned only if greater than 0x80000000
- vMask = _mm_and_ps(vMask,g_XMNegativeZero);
- vResult = _mm_xor_ps(_mm_castsi128_ps(vResulti),vMask);
- // On those that are too large, set to 0xFFFFFFFF
- vResult = _mm_or_ps(vResult,vOverflow);
- // Write 3 uints
- XMVECTOR T1 = XM_PERMUTE_PS(vResult,_MM_SHUFFLE(1,1,1,1));
- XMVECTOR T2 = XM_PERMUTE_PS(vResult,_MM_SHUFFLE(2,2,2,2));
- _mm_store_ss( reinterpret_cast<float*>(&pDestination->x), vResult );
- _mm_store_ss( reinterpret_cast<float*>(&pDestination->y), T1 );
- _mm_store_ss( reinterpret_cast<float*>(&pDestination->z), T2 );
- #endif
- }
- //------------------------------------------------------------------------------
- _Use_decl_annotations_
- inline void XM_CALLCONV XMStoreInt4
- (
- uint32_t* pDestination,
- FXMVECTOR V
- )
- {
- assert(pDestination);
- #if defined(_XM_NO_INTRINSICS_)
- pDestination[0] = V.vector4_u32[0];
- pDestination[1] = V.vector4_u32[1];
- pDestination[2] = V.vector4_u32[2];
- pDestination[3] = V.vector4_u32[3];
- #elif defined(_XM_ARM_NEON_INTRINSICS_)
- vst1q_u32( pDestination, V );
- #elif defined(_XM_SSE_INTRINSICS_)
- _mm_storeu_si128( reinterpret_cast<__m128i*>(pDestination), _mm_castps_si128(V) );
- #endif
- }
- //------------------------------------------------------------------------------
- _Use_decl_annotations_
- inline void XM_CALLCONV XMStoreInt4A
- (
- uint32_t* pDestination,
- FXMVECTOR V
- )
- {
- assert(pDestination);
- assert(((uintptr_t)pDestination & 0xF) == 0);
- #if defined(_XM_NO_INTRINSICS_)
- pDestination[0] = V.vector4_u32[0];
- pDestination[1] = V.vector4_u32[1];
- pDestination[2] = V.vector4_u32[2];
- pDestination[3] = V.vector4_u32[3];
- #elif defined(_XM_ARM_NEON_INTRINSICS_)
- vst1q_u32_ex( pDestination, V, 128 );
- #elif defined(_XM_SSE_INTRINSICS_)
- _mm_store_si128( reinterpret_cast<__m128i*>(pDestination), _mm_castps_si128(V) );
- #endif
- }
- //------------------------------------------------------------------------------
- _Use_decl_annotations_
- inline void XM_CALLCONV XMStoreFloat4
- (
- XMFLOAT4* pDestination,
- FXMVECTOR V
- )
- {
- assert(pDestination);
- #if defined(_XM_NO_INTRINSICS_)
- pDestination->x = V.vector4_f32[0];
- pDestination->y = V.vector4_f32[1];
- pDestination->z = V.vector4_f32[2];
- pDestination->w = V.vector4_f32[3];
- #elif defined(_XM_ARM_NEON_INTRINSICS_)
- vst1q_f32( reinterpret_cast<float*>(pDestination), V );
- #elif defined(_XM_SSE_INTRINSICS_)
- _mm_storeu_ps( &pDestination->x, V );
- #endif
- }
- //------------------------------------------------------------------------------
- _Use_decl_annotations_
- inline void XM_CALLCONV XMStoreFloat4A
- (
- XMFLOAT4A* pDestination,
- FXMVECTOR V
- )
- {
- assert(pDestination);
- assert(((uintptr_t)pDestination & 0xF) == 0);
- #if defined(_XM_NO_INTRINSICS_)
- pDestination->x = V.vector4_f32[0];
- pDestination->y = V.vector4_f32[1];
- pDestination->z = V.vector4_f32[2];
- pDestination->w = V.vector4_f32[3];
- #elif defined(_XM_ARM_NEON_INTRINSICS_)
- vst1q_f32_ex( reinterpret_cast<float*>(pDestination), V, 128 );
- #elif defined(_XM_SSE_INTRINSICS_)
- _mm_store_ps( &pDestination->x, V );
- #endif
- }
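- // Usage illustration (not part of the library): a minimal sketch of choosing
- // between the unaligned and aligned stores defined above. XMFLOAT4A carries a
- // 16-byte alignment guarantee, which is what lets XMStoreFloat4A use
- // _mm_store_ps / vst1q_f32_ex; a plain XMFLOAT4 must go through XMStoreFloat4.
- // The function and variable names here are hypothetical.
- inline void XM_CALLCONV StoreFloat4UsageSketch(FXMVECTOR V)
- {
-     XMFLOAT4  unaligned;                // only float-aligned
-     XMFLOAT4A aligned;                  // 16-byte aligned by declaration
-     XMStoreFloat4 ( &unaligned, V );    // unaligned-safe store
-     XMStoreFloat4A( &aligned,   V );    // aligned fast path
- }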
- //------------------------------------------------------------------------------
- _Use_decl_annotations_
- inline void XM_CALLCONV XMStoreSInt4
- (
- XMINT4* pDestination,
- FXMVECTOR V
- )
- {
- assert(pDestination);
- #if defined(_XM_NO_INTRINSICS_)
- pDestination->x = (int32_t)V.vector4_f32[0];
- pDestination->y = (int32_t)V.vector4_f32[1];
- pDestination->z = (int32_t)V.vector4_f32[2];
- pDestination->w = (int32_t)V.vector4_f32[3];
- #elif defined(_XM_ARM_NEON_INTRINSICS_)
- int32x4_t v = vcvtq_s32_f32(V);
- vst1q_s32( reinterpret_cast<int32_t*>(pDestination), v );
- #elif defined(_XM_SSE_INTRINSICS_)
- // In case of positive overflow, detect it
- XMVECTOR vOverflow = _mm_cmpgt_ps(V,g_XMMaxInt);
- // Float to int conversion
- __m128i vResulti = _mm_cvttps_epi32(V);
- // If there was positive overflow, set to 0x7FFFFFFF
- XMVECTOR vResult = _mm_and_ps(vOverflow,g_XMAbsMask);
- vOverflow = _mm_andnot_ps(vOverflow,_mm_castsi128_ps(vResulti));
- vOverflow = _mm_or_ps(vOverflow,vResult);
- _mm_storeu_si128( reinterpret_cast<__m128i*>(pDestination), _mm_castps_si128(vOverflow) );
- #endif
- }
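- // Illustration (not part of the library): a scalar sketch of the SSE path
- // above, under the assumption that g_XMMaxInt is the largest float below 2^31
- // (2147483520.0f). Positive overflow clamps to 0x7FFFFFFF; negative overflow
- // and NaN keep the 0x80000000 "indefinite" value that _mm_cvttps_epi32
- // produces. The helper name StoreSInt32Sketch is hypothetical.
- inline int32_t StoreSInt32Sketch(float f)
- {
-     if (f > 2147483520.0f)
-         return 0x7FFFFFFF;                          // positive overflow: clamp
-     if (f < -2147483648.0f || f != f)
-         return static_cast<int32_t>(0x80000000U);   // negative overflow or NaN: indefinite
-     return static_cast<int32_t>(f);                 // in-range truncating conversion
- }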
- //------------------------------------------------------------------------------
- _Use_decl_annotations_
- inline void XM_CALLCONV XMStoreUInt4
- (
- XMUINT4* pDestination,
- FXMVECTOR V
- )
- {
- assert(pDestination);
- #if defined(_XM_NO_INTRINSICS_)
- pDestination->x = (uint32_t)V.vector4_f32[0];
- pDestination->y = (uint32_t)V.vector4_f32[1];
- pDestination->z = (uint32_t)V.vector4_f32[2];
- pDestination->w = (uint32_t)V.vector4_f32[3];
- #elif defined(_XM_ARM_NEON_INTRINSICS_)
- uint32x4_t v = vcvtq_u32_f32(V);
- vst1q_u32( reinterpret_cast<uint32_t*>(pDestination), v );
- #elif defined(_XM_SSE_INTRINSICS_)
- // Clamp to >=0
- XMVECTOR vResult = _mm_max_ps(V,g_XMZero);
- // Any numbers that are too big, set to 0xFFFFFFFFU
- XMVECTOR vOverflow = _mm_cmpgt_ps(vResult,g_XMMaxUInt);
- XMVECTOR vValue = g_XMUnsignedFix;
- // Too large for a signed integer?
- XMVECTOR vMask = _mm_cmpge_ps(vResult,vValue);
- // Zero for numbers lower than 0x80000000, 32768.0f*65536.0f otherwise
- vValue = _mm_and_ps(vValue,vMask);
- // Perform fixup only on numbers too large (Keeps low bit precision)
- vResult = _mm_sub_ps(vResult,vValue);
- __m128i vResulti = _mm_cvttps_epi32(vResult);
- // Convert from signed to unsigned only if greater than 0x80000000
- vMask = _mm_and_ps(vMask,g_XMNegativeZero);
- vResult = _mm_xor_ps(_mm_castsi128_ps(vResulti),vMask);
- // On those that are too large, set to 0xFFFFFFFF
- vResult = _mm_or_ps(vResult,vOverflow);
- _mm_storeu_si128( reinterpret_cast<__m128i*>(pDestination), _mm_castps_si128(vResult) );
- #endif
- }
- //------------------------------------------------------------------------------
- _Use_decl_annotations_
- inline void XM_CALLCONV XMStoreFloat3x3
- (
- XMFLOAT3X3* pDestination,
- FXMMATRIX M
- )
- {
- assert(pDestination);
- #if defined(_XM_NO_INTRINSICS_)
- pDestination->m[0][0] = M.r[0].vector4_f32[0];
- pDestination->m[0][1] = M.r[0].vector4_f32[1];
- pDestination->m[0][2] = M.r[0].vector4_f32[2];
- pDestination->m[1][0] = M.r[1].vector4_f32[0];
- pDestination->m[1][1] = M.r[1].vector4_f32[1];
- pDestination->m[1][2] = M.r[1].vector4_f32[2];
- pDestination->m[2][0] = M.r[2].vector4_f32[0];
- pDestination->m[2][1] = M.r[2].vector4_f32[1];
- pDestination->m[2][2] = M.r[2].vector4_f32[2];
- #elif defined(_XM_ARM_NEON_INTRINSICS_)
- float32x4_t T1 = vextq_f32( M.r[0], M.r[1], 1 );
- float32x4_t T2 = vbslq_f32( g_XMMask3, M.r[0], T1 );
- vst1q_f32( &pDestination->m[0][0], T2 );
- T1 = vextq_f32( M.r[1], M.r[1], 1 );
- T2 = vcombine_f32( vget_low_f32(T1), vget_low_f32(M.r[2]) );
- vst1q_f32( &pDestination->m[1][1], T2 );
- vst1q_lane_f32( &pDestination->m[2][2], M.r[2], 2 );
- #elif defined(_XM_SSE_INTRINSICS_)
- XMVECTOR vTemp1 = M.r[0];
- XMVECTOR vTemp2 = M.r[1];
- XMVECTOR vTemp3 = M.r[2];
- // z1,z1,x2,x2
- XMVECTOR vWork = _mm_shuffle_ps(vTemp1,vTemp2,_MM_SHUFFLE(0,0,2,2));
- // x1,y1,z1,x2 (Final)
- vTemp1 = _mm_shuffle_ps(vTemp1,vWork,_MM_SHUFFLE(2,0,1,0));
- _mm_storeu_ps(&pDestination->m[0][0],vTemp1);
- // y2,z2,x3,y3 (Final)
- vTemp2 = _mm_shuffle_ps(vTemp2,vTemp3,_MM_SHUFFLE(1,0,2,1));
- _mm_storeu_ps(&pDestination->m[1][1],vTemp2);
- // z3,z3,z3,z3
- vTemp3 = XM_PERMUTE_PS(vTemp3,_MM_SHUFFLE(2,2,2,2));
- _mm_store_ss(&pDestination->m[2][2],vTemp3);
- #endif
- }
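- // Illustration (not part of the library): a scalar sketch of what the vector
- // paths above produce. An XMFLOAT3X3 is nine contiguous floats, so the upper
- // 3x3 block of M is copied row by row and M.r[3] is ignored. The helper name
- // StoreFloat3x3Sketch is hypothetical.
- inline void XM_CALLCONV StoreFloat3x3Sketch(XMFLOAT3X3* pDestination, FXMMATRIX M)
- {
-     for (size_t r = 0; r < 3; ++r)
-     {
-         XMFLOAT4 row;
-         XMStoreFloat4( &row, M.r[r] );      // spill one row
-         pDestination->m[r][0] = row.x;
-         pDestination->m[r][1] = row.y;
-         pDestination->m[r][2] = row.z;      // the row's w component is dropped
-     }
- }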
- //------------------------------------------------------------------------------
- _Use_decl_annotations_
- inline void XM_CALLCONV XMStoreFloat4x3
- (
- XMFLOAT4X3* pDestination,
- FXMMATRIX M
- )
- {
- assert(pDestination);
- #if defined(_XM_NO_INTRINSICS_)
- pDestination->m[0][0] = M.r[0].vector4_f32[0];
- pDestination->m[0][1] = M.r[0].vector4_f32[1];
- pDestination->m[0][2] = M.r[0].vector4_f32[2];
- pDestination->m[1][0] = M.r[1].vector4_f32[0];
- pDestination->m[1][1] = M.r[1].vector4_f32[1];
- pDestination->m[1][2] = M.r[1].vector4_f32[2];
- pDestination->m[2][0] = M.r[2].vector4_f32[0];
- pDestination->m[2][1] = M.r[2].vector4_f32[1];
- pDestination->m[2][2] = M.r[2].vector4_f32[2];
- pDestination->m[3][0] = M.r[3].vector4_f32[0];
- pDestination->m[3][1] = M.r[3].vector4_f32[1];
- pDestination->m[3][2] = M.r[3].vector4_f32[2];
- #elif defined(_XM_ARM_NEON_INTRINSICS_)
- float32x4_t T1 = vextq_f32( M.r[0], M.r[1], 1 );
- float32x4_t T2 = vbslq_f32( g_XMMask3, M.r[0], T1 );
- vst1q_f32( &pDestination->m[0][0], T2 );
- T1 = vextq_f32( M.r[1], M.r[1], 1 );
- T2 = vcombine_f32( vget_low_f32(T1), vget_low_f32(M.r[2]) );
- vst1q_f32( &pDestination->m[1][1], T2 );
- T1 = vdupq_lane_f32( vget_high_f32( M.r[2] ), 0 );
- T2 = vextq_f32( T1, M.r[3], 3 );
- vst1q_f32( &pDestination->m[2][2], T2 );
- #elif defined(_XM_SSE_INTRINSICS_)
- XMVECTOR vTemp1 = M.r[0];
- XMVECTOR vTemp2 = M.r[1];
- XMVECTOR vTemp3 = M.r[2];
- XMVECTOR vTemp4 = M.r[3];
- // y2,z2,x3,y3 (Final)
- XMVECTOR vTemp2x = _mm_shuffle_ps(vTemp2,vTemp3,_MM_SHUFFLE(1,0,2,1));
- // x2,x2,z1,z1
- vTemp2 = _mm_shuffle_ps(vTemp2,vTemp1,_MM_SHUFFLE(2,2,0,0));
- // x1,y1,z1,x2 (Final)
- vTemp1 = _mm_shuffle_ps(vTemp1,vTemp2,_MM_SHUFFLE(0,2,1,0));
- // z3,z3,x4,x4
- vTemp3 = _mm_shuffle_ps(vTemp3,vTemp4,_MM_SHUFFLE(0,0,2,2));
- // z3,x4,y4,z4 (Final)
- vTemp3 = _mm_shuffle_ps(vTemp3,vTemp4,_MM_SHUFFLE(2,1,2,0));
- // Store in 3 operations
- _mm_storeu_ps(&pDestination->m[0][0],vTemp1);
- _mm_storeu_ps(&pDestination->m[1][1],vTemp2x);
- _mm_storeu_ps(&pDestination->m[2][2],vTemp3);
- #endif
- }
- //------------------------------------------------------------------------------
- _Use_decl_annotations_
- inline void XM_CALLCONV XMStoreFloat4x3A
- (
- XMFLOAT4X3A* pDestination,
- FXMMATRIX M
- )
- {
- assert(pDestination);
- assert(((uintptr_t)pDestination & 0xF) == 0);
- #if defined(_XM_NO_INTRINSICS_)
- pDestination->m[0][0] = M.r[0].vector4_f32[0];
- pDestination->m[0][1] = M.r[0].vector4_f32[1];
- pDestination->m[0][2] = M.r[0].vector4_f32[2];
- pDestination->m[1][0] = M.r[1].vector4_f32[0];
- pDestination->m[1][1] = M.r[1].vector4_f32[1];
- pDestination->m[1][2] = M.r[1].vector4_f32[2];
- pDestination->m[2][0] = M.r[2].vector4_f32[0];
- pDestination->m[2][1] = M.r[2].vector4_f32[1];
- pDestination->m[2][2] = M.r[2].vector4_f32[2];
- pDestination->m[3][0] = M.r[3].vector4_f32[0];
- pDestination->m[3][1] = M.r[3].vector4_f32[1];
- pDestination->m[3][2] = M.r[3].vector4_f32[2];
- #elif defined(_XM_ARM_NEON_INTRINSICS_)
- float32x4_t T1 = vextq_f32( M.r[0], M.r[1], 1 );
- float32x4_t T2 = vbslq_f32( g_XMMask3, M.r[0], T1 );
- vst1q_f32_ex( &pDestination->m[0][0], T2, 128 );
- T1 = vextq_f32( M.r[1], M.r[1], 1 );
- T2 = vcombine_f32( vget_low_f32(T1), vget_low_f32(M.r[2]) );
- vst1q_f32_ex( &pDestination->m[1][1], T2, 128 );
- T1 = vdupq_lane_f32( vget_high_f32( M.r[2] ), 0 );
- T2 = vextq_f32( T1, M.r[3], 3 );
- vst1q_f32_ex( &pDestination->m[2][2], T2, 128 );
- #elif defined(_XM_SSE_INTRINSICS_)
- // x1,y1,z1,w1
- XMVECTOR vTemp1 = M.r[0];
- // x2,y2,z2,w2
- XMVECTOR vTemp2 = M.r[1];
- // x3,y3,z3,w3
- XMVECTOR vTemp3 = M.r[2];
- // x4,y4,z4,w4
- XMVECTOR vTemp4 = M.r[3];
- // z1,z1,x2,y2
- XMVECTOR vTemp = _mm_shuffle_ps(vTemp1,vTemp2,_MM_SHUFFLE(1,0,2,2));
- // y2,z2,x3,y3 (Final)
- vTemp2 = _mm_shuffle_ps(vTemp2,vTemp3,_MM_SHUFFLE(1,0,2,1));
- // x1,y1,z1,x2 (Final)
- vTemp1 = _mm_shuffle_ps(vTemp1,vTemp,_MM_SHUFFLE(2,0,1,0));
- // z3,z3,x4,x4
- vTemp3 = _mm_shuffle_ps(vTemp3,vTemp4,_MM_SHUFFLE(0,0,2,2));
- // z3,x4,y4,z4 (Final)
- vTemp3 = _mm_shuffle_ps(vTemp3,vTemp4,_MM_SHUFFLE(2,1,2,0));
- // Store in 3 operations
- _mm_store_ps(&pDestination->m[0][0],vTemp1);
- _mm_store_ps(&pDestination->m[1][1],vTemp2);
- _mm_store_ps(&pDestination->m[2][2],vTemp3);
- #endif
- }
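- // Illustration (not part of the library): a scalar sketch of the packed 4x3
- // layout written above. The twelve destination floats are filled as three
- // 16-byte blocks, {x1,y1,z1,x2}, {y2,z2,x3,y3}, {z3,x4,y4,z4}, dropping each
- // row's w component; the A variant can use aligned stores because XMFLOAT4X3A
- // guarantees 16-byte alignment. The helper name StoreFloat4x3Sketch is
- // hypothetical.
- inline void XM_CALLCONV StoreFloat4x3Sketch(XMFLOAT4X3* pDestination, FXMMATRIX M)
- {
-     for (size_t r = 0; r < 4; ++r)
-     {
-         XMFLOAT4 row;
-         XMStoreFloat4( &row, M.r[r] );      // spill one row
-         pDestination->m[r][0] = row.x;
-         pDestination->m[r][1] = row.y;
-         pDestination->m[r][2] = row.z;      // the row's w component is dropped
-     }
- }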
- //------------------------------------------------------------------------------
- _Use_decl_annotations_
- inline void XM_CALLCONV XMStoreFloat4x4
- (
- XMFLOAT4X4* pDestination,
- FXMMATRIX M
- )
- {
- assert(pDestination);
- #if defined(_XM_NO_INTRINSICS_)
- pDestination->m[0][0] = M.r[0].vector4_f32[0];
- pDestination->m[0][1] = M.r[0].vector4_f32[1];
- pDestination->m[0][2] = M.r[0].vector4_f32[2];
- pDestination->m[0][3] = M.r[0].vector4_f32[3];
- pDestination->m[1][0] = M.r[1].vector4_f32[0];
- pDestination->m[1][1] = M.r[1].vector4_f32[1];
- pDestination->m[1][2] = M.r[1].vector4_f32[2];
- pDestination->m[1][3] = M.r[1].vector4_f32[3];
- pDestination->m[2][0] = M.r[2].vector4_f32[0];
- pDestination->m[2][1] = M.r[2].vector4_f32[1];
- pDestination->m[2][2] = M.r[2].vector4_f32[2];
- pDestination->m[2][3] = M.r[2].vector4_f32[3];
- pDestination->m[3][0] = M.r[3].vector4_f32[0];
- pDestination->m[3][1] = M.r[3].vector4_f32[1];
- pDestination->m[3][2] = M.r[3].vector4_f32[2];
- pDestination->m[3][3] = M.r[3].vector4_f32[3];
- #elif defined(_XM_ARM_NEON_INTRINSICS_)
- vst1q_f32( reinterpret_cast<float*>(&pDestination->_11), M.r[0] );
- vst1q_f32( reinterpret_cast<float*>(&pDestination->_21), M.r[1] );
- vst1q_f32( reinterpret_cast<float*>(&pDestination->_31), M.r[2] );
- vst1q_f32( reinterpret_cast<float*>(&pDestination->_41), M.r[3] );
- #elif defined(_XM_SSE_INTRINSICS_)
- _mm_storeu_ps( &pDestination->_11, M.r[0] );
- _mm_storeu_ps( &pDestination->_21, M.r[1] );
- _mm_storeu_ps( &pDestination->_31, M.r[2] );
- _mm_storeu_ps( &pDestination->_41, M.r[3] );
- #endif
- }
- //------------------------------------------------------------------------------
- _Use_decl_annotations_
- inline void XM_CALLCONV XMStoreFloat4x4A
- (
- XMFLOAT4X4A* pDestination,
- FXMMATRIX M
- )
- {
- assert(pDestination);
- assert(((uintptr_t)pDestination & 0xF) == 0);
- #if defined(_XM_NO_INTRINSICS_)
- pDestination->m[0][0] = M.r[0].vector4_f32[0];
- pDestination->m[0][1] = M.r[0].vector4_f32[1];
- pDestination->m[0][2] = M.r[0].vector4_f32[2];
- pDestination->m[0][3] = M.r[0].vector4_f32[3];
- pDestination->m[1][0] = M.r[1].vector4_f32[0];
- pDestination->m[1][1] = M.r[1].vector4_f32[1];
- pDestination->m[1][2] = M.r[1].vector4_f32[2];
- pDestination->m[1][3] = M.r[1].vector4_f32[3];
- pDestination->m[2][0] = M.r[2].vector4_f32[0];
- pDestination->m[2][1] = M.r[2].vector4_f32[1];
- pDestination->m[2][2] = M.r[2].vector4_f32[2];
- pDestination->m[2][3] = M.r[2].vector4_f32[3];
- pDestination->m[3][0] = M.r[3].vector4_f32[0];
- pDestination->m[3][1] = M.r[3].vector4_f32[1];
- pDestination->m[3][2] = M.r[3].vector4_f32[2];
- pDestination->m[3][3] = M.r[3].vector4_f32[3];
- #elif defined(_XM_ARM_NEON_INTRINSICS_)
- vst1q_f32_ex( reinterpret_cast<float*>(&pDestination->_11), M.r[0], 128 );
- vst1q_f32_ex( reinterpret_cast<float*>(&pDestination->_21), M.r[1], 128 );
- vst1q_f32_ex( reinterpret_cast<float*>(&pDestination->_31), M.r[2], 128 );
- vst1q_f32_ex( reinterpret_cast<float*>(&pDestination->_41), M.r[3], 128 );
- #elif defined(_XM_SSE_INTRINSICS_)
- _mm_store_ps( &pDestination->_11, M.r[0] );
- _mm_store_ps( &pDestination->_21, M.r[1] );
- _mm_store_ps( &pDestination->_31, M.r[2] );
- _mm_store_ps( &pDestination->_41, M.r[3] );
- #endif
- }