DirectXMathConvert.inl

  1. //-------------------------------------------------------------------------------------
  2. // DirectXMathConvert.inl -- SIMD C++ Math library
  3. //
  4. // THIS CODE AND INFORMATION IS PROVIDED "AS IS" WITHOUT WARRANTY OF
  5. // ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING BUT NOT LIMITED TO
  6. // THE IMPLIED WARRANTIES OF MERCHANTABILITY AND/OR FITNESS FOR A
  7. // PARTICULAR PURPOSE.
  8. //
  9. // Copyright (c) Microsoft Corporation. All rights reserved.
  10. //
  11. // http://go.microsoft.com/fwlink/?LinkID=615560
  12. //-------------------------------------------------------------------------------------
  13. #pragma once
  14. /****************************************************************************
  15. *
  16. * Data conversion
  17. *
  18. ****************************************************************************/
  19. //------------------------------------------------------------------------------
  20. #pragma warning(push)
  21. #pragma warning(disable:4701)
  22. // C4701: false positives
  23. inline XMVECTOR XM_CALLCONV XMConvertVectorIntToFloat
  24. (
  25. FXMVECTOR VInt,
  26. uint32_t DivExponent
  27. )
  28. {
  29. assert(DivExponent<32);
  30. #if defined(_XM_NO_INTRINSICS_)
  31. float fScale = 1.0f / (float)(1U << DivExponent);
  32. uint32_t ElementIndex = 0;
  33. XMVECTOR Result;
  34. do {
  35. int32_t iTemp = (int32_t)VInt.vector4_u32[ElementIndex];
  36. Result.vector4_f32[ElementIndex] = ((float)iTemp) * fScale;
  37. } while (++ElementIndex<4);
  38. return Result;
  39. #elif defined(_XM_ARM_NEON_INTRINSICS_)
  40. float fScale = 1.0f / (float)(1U << DivExponent);
  41. float32x4_t vResult = vcvtq_f32_s32( VInt );
  42. return vmulq_n_f32( vResult, fScale );
  43. #else // _XM_SSE_INTRINSICS_
  44. // Convert to floats
  45. XMVECTOR vResult = _mm_cvtepi32_ps(_mm_castps_si128(VInt));
  46. // Convert DivExponent into 1.0f/(1<<DivExponent)
  47. uint32_t uScale = 0x3F800000U - (DivExponent << 23);
  48. // Splat the scalar value
  49. __m128i vScale = _mm_set1_epi32(uScale);
  50. vResult = _mm_mul_ps(vResult,_mm_castsi128_ps(vScale));
  51. return vResult;
  52. #endif
  53. }
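//------------------------------------------------------------------------------
// Usage sketch (illustrative; not part of the original header): converting four
// Q16.16 fixed-point values to floats. The helper name and data layout are
// hypothetical. Note how the SSE path above builds the scale 1/(1<<DivExponent)
// directly in the float bit pattern: 0x3F800000 is 1.0f, and subtracting
// (DivExponent << 23) lowers the 8-bit exponent field by DivExponent, giving
// 2^-DivExponent without a divide.
inline XMVECTOR XM_CALLCONV LoadQ16Dot16Example(const uint32_t* pFixedPoint)
{
    XMVECTOR vInt = XMLoadInt4(pFixedPoint);        // four raw 32-bit lanes
    return XMConvertVectorIntToFloat(vInt, 16);     // scale by 1.0f / 65536.0f
}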
  54. //------------------------------------------------------------------------------
  55. inline XMVECTOR XM_CALLCONV XMConvertVectorFloatToInt
  56. (
  57. FXMVECTOR VFloat,
  58. uint32_t MulExponent
  59. )
  60. {
  61. assert(MulExponent<32);
  62. #if defined(_XM_NO_INTRINSICS_)
  63. // Get the scalar factor.
  64. float fScale = (float)(1U << MulExponent);
  65. uint32_t ElementIndex = 0;
  66. XMVECTOR Result;
  67. do {
  68. int32_t iResult;
  69. float fTemp = VFloat.vector4_f32[ElementIndex]*fScale;
  70. if (fTemp <= -(65536.0f*32768.0f)) {
  71. iResult = (-0x7FFFFFFF)-1;
  72. } else if (fTemp > (65536.0f*32768.0f)-128.0f) {
  73. iResult = 0x7FFFFFFF;
  74. } else {
  75. iResult = (int32_t)fTemp;
  76. }
  77. Result.vector4_u32[ElementIndex] = (uint32_t)iResult;
  78. } while (++ElementIndex<4);
  79. return Result;
  80. #elif defined(_XM_ARM_NEON_INTRINSICS_)
  81. float32x4_t vResult = vmulq_n_f32(VFloat, (float)(1U << MulExponent));
  82. // In case of positive overflow, detect it
  83. uint32x4_t vOverflow = vcgtq_f32(vResult,g_XMMaxInt);
  84. // Float to int conversion
  85. int32x4_t vResulti = vcvtq_s32_f32(vResult);
  86. // If there was positive overflow, set to 0x7FFFFFFF
  87. vResult = vandq_u32(vOverflow,g_XMAbsMask);
  88. vOverflow = vbicq_u32(vResulti,vOverflow);
  89. vOverflow = vorrq_u32(vOverflow,vResult);
  90. return vOverflow;
  91. #else // _XM_SSE_INTRINSICS_
  92. XMVECTOR vResult = _mm_set_ps1((float)(1U << MulExponent));
  93. vResult = _mm_mul_ps(vResult,VFloat);
  94. // In case of positive overflow, detect it
  95. XMVECTOR vOverflow = _mm_cmpgt_ps(vResult,g_XMMaxInt);
  96. // Float to int conversion
  97. __m128i vResulti = _mm_cvttps_epi32(vResult);
  98. // If there was positive overflow, set to 0x7FFFFFFF
  99. vResult = _mm_and_ps(vOverflow,g_XMAbsMask);
  100. vOverflow = _mm_andnot_ps(vOverflow,_mm_castsi128_ps(vResulti));
  101. vOverflow = _mm_or_ps(vOverflow,vResult);
  102. return vOverflow;
  103. #endif
  104. }
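//------------------------------------------------------------------------------
// Usage sketch (illustrative; not part of the original header): a Q16.16 round
// trip through the two conversions. Lanes whose scaled value exceeds the int32
// range saturate to 0x7FFFFFFF (positive) or INT32_MIN (negative), as the
// overflow handling above shows. The helper name is hypothetical.
inline XMVECTOR XM_CALLCONV RoundTripQ16Dot16Example(FXMVECTOR vFloats)
{
    XMVECTOR vFixed = XMConvertVectorFloatToInt(vFloats, 16);  // *65536, truncate toward zero
    return XMConvertVectorIntToFloat(vFixed, 16);               // /65536 back to float
}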
  105. //------------------------------------------------------------------------------
  106. inline XMVECTOR XM_CALLCONV XMConvertVectorUIntToFloat
  107. (
  108. FXMVECTOR VUInt,
  109. uint32_t DivExponent
  110. )
  111. {
  112. assert(DivExponent<32);
  113. #if defined(_XM_NO_INTRINSICS_)
  114. float fScale = 1.0f / (float)(1U << DivExponent);
  115. uint32_t ElementIndex = 0;
  116. XMVECTOR Result;
  117. do {
  118. Result.vector4_f32[ElementIndex] = (float)VUInt.vector4_u32[ElementIndex] * fScale;
  119. } while (++ElementIndex<4);
  120. return Result;
  121. #elif defined(_XM_ARM_NEON_INTRINSICS_)
  122. float fScale = 1.0f / (float)(1U << DivExponent);
  123. float32x4_t vResult = vcvtq_f32_u32( VUInt );
  124. return vmulq_n_f32( vResult, fScale );
  125. #else // _XM_SSE_INTRINSICS_
  126. // For the values that are higher than 0x7FFFFFFF, a fixup is needed
  127. // Determine which ones need the fix.
  128. XMVECTOR vMask = _mm_and_ps(VUInt,g_XMNegativeZero);
  129. // Force all values positive
  130. XMVECTOR vResult = _mm_xor_ps(VUInt,vMask);
  131. // Convert to floats
  132. vResult = _mm_cvtepi32_ps(_mm_castps_si128(vResult));
  133. // Convert 0x80000000 -> 0xFFFFFFFF
  134. __m128i iMask = _mm_srai_epi32(_mm_castps_si128(vMask),31);
  135. // For only the ones that are too big, add the fixup
  136. vMask = _mm_and_ps(_mm_castsi128_ps(iMask),g_XMFixUnsigned);
  137. vResult = _mm_add_ps(vResult,vMask);
  138. // Convert DivExponent into 1.0f/(1<<DivExponent)
  139. uint32_t uScale = 0x3F800000U - (DivExponent << 23);
  140. // Splat
  141. iMask = _mm_set1_epi32(uScale);
  142. vResult = _mm_mul_ps(vResult,_mm_castsi128_ps(iMask));
  143. return vResult;
  144. #endif
  145. }
  146. //------------------------------------------------------------------------------
  147. inline XMVECTOR XM_CALLCONV XMConvertVectorFloatToUInt
  148. (
  149. FXMVECTOR VFloat,
  150. uint32_t MulExponent
  151. )
  152. {
  153. assert(MulExponent<32);
  154. #if defined(_XM_NO_INTRINSICS_)
  155. // Get the scalar factor.
  156. float fScale = (float)(1U << MulExponent);
  157. uint32_t ElementIndex = 0;
  158. XMVECTOR Result;
  159. do {
  160. uint32_t uResult;
  161. float fTemp = VFloat.vector4_f32[ElementIndex]*fScale;
  162. if (fTemp <= 0.0f) {
  163. uResult = 0;
  164. } else if (fTemp >= (65536.0f*65536.0f)) {
  165. uResult = 0xFFFFFFFFU;
  166. } else {
  167. uResult = (uint32_t)fTemp;
  168. }
  169. Result.vector4_u32[ElementIndex] = uResult;
  170. } while (++ElementIndex<4);
  171. return Result;
  172. #elif defined(_XM_ARM_NEON_INTRINSICS_)
  173. float32x4_t vResult = vmulq_n_f32(VFloat,(float)(1U << MulExponent));
  174. // In case of overflow, detect it
  175. uint32x4_t vOverflow = vcgtq_f32(vResult,g_XMMaxUInt);
  176. // Float to int conversion
  177. uint32x4_t vResulti = vcvtq_u32_f32(vResult);
  178. // If there was overflow, set to 0xFFFFFFFFU
  179. vResult = vbicq_u32(vResulti,vOverflow);
  180. vOverflow = vorrq_u32(vOverflow,vResult);
  181. return vOverflow;
  182. #else // _XM_SSE_INTRINSICS_
  183. XMVECTOR vResult = _mm_set_ps1(static_cast<float>(1U << MulExponent));
  184. vResult = _mm_mul_ps(vResult,VFloat);
  185. // Clamp to >=0
  186. vResult = _mm_max_ps(vResult,g_XMZero);
  187. // Any numbers that are too big, set to 0xFFFFFFFFU
  188. XMVECTOR vOverflow = _mm_cmpgt_ps(vResult,g_XMMaxUInt);
  189. XMVECTOR vValue = g_XMUnsignedFix;
  190. // Too large for a signed integer?
  191. XMVECTOR vMask = _mm_cmpge_ps(vResult,vValue);
192. // Zero for numbers lower than 0x80000000, 32768.0f*65536.0f otherwise
  193. vValue = _mm_and_ps(vValue,vMask);
  194. // Perform fixup only on numbers too large (Keeps low bit precision)
  195. vResult = _mm_sub_ps(vResult,vValue);
  196. __m128i vResulti = _mm_cvttps_epi32(vResult);
197. // Convert from signed to unsigned only if greater than 0x80000000
  198. vMask = _mm_and_ps(vMask,g_XMNegativeZero);
  199. vResult = _mm_xor_ps(_mm_castsi128_ps(vResulti),vMask);
  200. // On those that are too large, set to 0xFFFFFFFF
  201. vResult = _mm_or_ps(vResult,vOverflow);
  202. return vResult;
  203. #endif
  204. }
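//------------------------------------------------------------------------------
// Usage sketch (illustrative; not part of the original header): the unsigned
// conversion clamps negative lanes to 0 and saturates lanes at or above the
// uint32 range to 0xFFFFFFFF. The input values here are hypothetical.
inline XMVECTOR XM_CALLCONV ConvertToUIntExample()
{
    XMVECTOR v = XMVectorSet(-1.0f, 0.5f, 3.0f, 5.0e9f);
    // With MulExponent = 0 the lanes become 0, 0, 3, 0xFFFFFFFF.
    return XMConvertVectorFloatToUInt(v, 0);
}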
  205. #pragma warning(pop)
  206. /****************************************************************************
  207. *
  208. * Vector and matrix load operations
  209. *
  210. ****************************************************************************/
  211. //------------------------------------------------------------------------------
  212. _Use_decl_annotations_
  213. inline XMVECTOR XM_CALLCONV XMLoadInt(const uint32_t* pSource)
  214. {
  215. assert(pSource);
  216. #if defined(_XM_NO_INTRINSICS_)
  217. XMVECTOR V;
  218. V.vector4_u32[0] = *pSource;
  219. V.vector4_u32[1] = 0;
  220. V.vector4_u32[2] = 0;
  221. V.vector4_u32[3] = 0;
  222. return V;
  223. #elif defined(_XM_ARM_NEON_INTRINSICS_)
  224. uint32x4_t zero = vdupq_n_u32(0);
  225. return vld1q_lane_u32( pSource, zero, 0 );
  226. #elif defined(_XM_SSE_INTRINSICS_)
  227. return _mm_load_ss( reinterpret_cast<const float*>(pSource) );
  228. #endif
  229. }
  230. //------------------------------------------------------------------------------
  231. _Use_decl_annotations_
  232. inline XMVECTOR XM_CALLCONV XMLoadFloat(const float* pSource)
  233. {
  234. assert(pSource);
  235. #if defined(_XM_NO_INTRINSICS_)
  236. XMVECTOR V;
  237. V.vector4_f32[0] = *pSource;
  238. V.vector4_f32[1] = 0.f;
  239. V.vector4_f32[2] = 0.f;
  240. V.vector4_f32[3] = 0.f;
  241. return V;
  242. #elif defined(_XM_ARM_NEON_INTRINSICS_)
  243. float32x4_t zero = vdupq_n_f32(0);
  244. return vld1q_lane_f32( pSource, zero, 0 );
  245. #elif defined(_XM_SSE_INTRINSICS_)
  246. return _mm_load_ss( pSource );
  247. #endif
  248. }
  249. //------------------------------------------------------------------------------
  250. _Use_decl_annotations_
  251. inline XMVECTOR XM_CALLCONV XMLoadInt2
  252. (
  253. const uint32_t* pSource
  254. )
  255. {
  256. assert(pSource);
  257. #if defined(_XM_NO_INTRINSICS_)
  258. XMVECTOR V;
  259. V.vector4_u32[0] = pSource[0];
  260. V.vector4_u32[1] = pSource[1];
  261. V.vector4_u32[2] = 0;
  262. V.vector4_u32[3] = 0;
  263. return V;
  264. #elif defined(_XM_ARM_NEON_INTRINSICS_)
  265. uint32x2_t x = vld1_u32( pSource );
  266. uint32x2_t zero = vdup_n_u32(0);
  267. return vcombine_u32( x, zero );
  268. #elif defined(_XM_SSE_INTRINSICS_)
  269. __m128 x = _mm_load_ss( reinterpret_cast<const float*>(pSource) );
  270. __m128 y = _mm_load_ss( reinterpret_cast<const float*>(pSource+1) );
  271. return _mm_unpacklo_ps( x, y );
  272. #endif
  273. }
  274. //------------------------------------------------------------------------------
  275. _Use_decl_annotations_
  276. inline XMVECTOR XM_CALLCONV XMLoadInt2A
  277. (
  278. const uint32_t* pSource
  279. )
  280. {
  281. assert(pSource);
  282. assert(((uintptr_t)pSource & 0xF) == 0);
  283. #if defined(_XM_NO_INTRINSICS_)
  284. XMVECTOR V;
  285. V.vector4_u32[0] = pSource[0];
  286. V.vector4_u32[1] = pSource[1];
  287. V.vector4_u32[2] = 0;
  288. V.vector4_u32[3] = 0;
  289. return V;
  290. #elif defined(_XM_ARM_NEON_INTRINSICS_)
  291. uint32x2_t x = vld1_u32_ex( pSource, 64 );
  292. uint32x2_t zero = vdup_n_u32(0);
  293. return vcombine_u32( x, zero );
  294. #elif defined(_XM_SSE_INTRINSICS_)
  295. __m128i V = _mm_loadl_epi64( reinterpret_cast<const __m128i*>(pSource) );
  296. return _mm_castsi128_ps(V);
  297. #endif
  298. }
  299. //------------------------------------------------------------------------------
  300. _Use_decl_annotations_
  301. inline XMVECTOR XM_CALLCONV XMLoadFloat2
  302. (
  303. const XMFLOAT2* pSource
  304. )
  305. {
  306. assert(pSource);
  307. #if defined(_XM_NO_INTRINSICS_)
  308. XMVECTOR V;
  309. V.vector4_f32[0] = pSource->x;
  310. V.vector4_f32[1] = pSource->y;
  311. V.vector4_f32[2] = 0.f;
  312. V.vector4_f32[3] = 0.f;
  313. return V;
  314. #elif defined(_XM_ARM_NEON_INTRINSICS_)
  315. float32x2_t x = vld1_f32( reinterpret_cast<const float*>(pSource) );
  316. float32x2_t zero = vdup_n_f32(0);
  317. return vcombine_f32( x, zero );
  318. #elif defined(_XM_SSE_INTRINSICS_)
  319. __m128 x = _mm_load_ss( &pSource->x );
  320. __m128 y = _mm_load_ss( &pSource->y );
  321. return _mm_unpacklo_ps( x, y );
  322. #endif
  323. }
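//------------------------------------------------------------------------------
// Usage sketch (illustrative; not part of the original header): loading a 2D
// point. All three code paths above leave the z and w lanes zeroed. The helper
// name is hypothetical.
inline XMVECTOR XM_CALLCONV LoadPoint2Example(const XMFLOAT2& point)
{
    return XMLoadFloat2(&point);    // lanes: x, y, 0, 0
}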
  324. //------------------------------------------------------------------------------
  325. _Use_decl_annotations_
  326. inline XMVECTOR XM_CALLCONV XMLoadFloat2A
  327. (
  328. const XMFLOAT2A* pSource
  329. )
  330. {
  331. assert(pSource);
  332. assert(((uintptr_t)pSource & 0xF) == 0);
  333. #if defined(_XM_NO_INTRINSICS_)
  334. XMVECTOR V;
  335. V.vector4_f32[0] = pSource->x;
  336. V.vector4_f32[1] = pSource->y;
  337. V.vector4_f32[2] = 0.f;
  338. V.vector4_f32[3] = 0.f;
  339. return V;
  340. #elif defined(_XM_ARM_NEON_INTRINSICS_)
  341. float32x2_t x = vld1_f32_ex( reinterpret_cast<const float*>(pSource), 64 );
  342. float32x2_t zero = vdup_n_f32(0);
  343. return vcombine_f32( x, zero );
  344. #elif defined(_XM_SSE_INTRINSICS_)
  345. __m128i V = _mm_loadl_epi64( reinterpret_cast<const __m128i*>(pSource) );
  346. return _mm_castsi128_ps(V);
  347. #endif
  348. }
  349. //------------------------------------------------------------------------------
  350. _Use_decl_annotations_
  351. inline XMVECTOR XM_CALLCONV XMLoadSInt2
  352. (
  353. const XMINT2* pSource
  354. )
  355. {
  356. assert(pSource);
  357. #if defined(_XM_NO_INTRINSICS_)
  358. XMVECTOR V;
  359. V.vector4_f32[0] = (float)pSource->x;
  360. V.vector4_f32[1] = (float)pSource->y;
  361. V.vector4_f32[2] = 0.f;
  362. V.vector4_f32[3] = 0.f;
  363. return V;
  364. #elif defined(_XM_ARM_NEON_INTRINSICS_)
  365. int32x2_t x = vld1_s32( reinterpret_cast<const int32_t*>(pSource) );
  366. float32x2_t v = vcvt_f32_s32( x );
  367. float32x2_t zero = vdup_n_f32(0);
  368. return vcombine_f32( v, zero );
  369. #elif defined(_XM_SSE_INTRINSICS_)
  370. __m128 x = _mm_load_ss( reinterpret_cast<const float*>(&pSource->x) );
  371. __m128 y = _mm_load_ss( reinterpret_cast<const float*>(&pSource->y) );
  372. __m128 V = _mm_unpacklo_ps( x, y );
  373. return _mm_cvtepi32_ps(_mm_castps_si128(V));
  374. #endif
  375. }
  376. //------------------------------------------------------------------------------
  377. _Use_decl_annotations_
  378. inline XMVECTOR XM_CALLCONV XMLoadUInt2
  379. (
  380. const XMUINT2* pSource
  381. )
  382. {
  383. assert(pSource);
  384. #if defined(_XM_NO_INTRINSICS_)
  385. XMVECTOR V;
  386. V.vector4_f32[0] = (float)pSource->x;
  387. V.vector4_f32[1] = (float)pSource->y;
  388. V.vector4_f32[2] = 0.f;
  389. V.vector4_f32[3] = 0.f;
  390. return V;
  391. #elif defined(_XM_ARM_NEON_INTRINSICS_)
  392. uint32x2_t x = vld1_u32( reinterpret_cast<const uint32_t*>(pSource) );
  393. float32x2_t v = vcvt_f32_u32( x );
  394. float32x2_t zero = vdup_n_f32(0);
  395. return vcombine_f32( v, zero );
  396. #elif defined(_XM_SSE_INTRINSICS_)
  397. __m128 x = _mm_load_ss( reinterpret_cast<const float*>(&pSource->x) );
  398. __m128 y = _mm_load_ss( reinterpret_cast<const float*>(&pSource->y) );
  399. __m128 V = _mm_unpacklo_ps( x, y );
  400. // For the values that are higher than 0x7FFFFFFF, a fixup is needed
  401. // Determine which ones need the fix.
  402. XMVECTOR vMask = _mm_and_ps(V,g_XMNegativeZero);
  403. // Force all values positive
  404. XMVECTOR vResult = _mm_xor_ps(V,vMask);
  405. // Convert to floats
  406. vResult = _mm_cvtepi32_ps(_mm_castps_si128(vResult));
  407. // Convert 0x80000000 -> 0xFFFFFFFF
  408. __m128i iMask = _mm_srai_epi32(_mm_castps_si128(vMask),31);
  409. // For only the ones that are too big, add the fixup
  410. vMask = _mm_and_ps(_mm_castsi128_ps(iMask),g_XMFixUnsigned);
  411. vResult = _mm_add_ps(vResult,vMask);
  412. return vResult;
  413. #endif
  414. }
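//------------------------------------------------------------------------------
// Scalar sketch (illustrative; not part of the original header) of the unsigned
// fixup used in the SSE path above: _mm_cvtepi32_ps treats each lane as a
// signed int32, so values with the top bit set would convert as negatives.
// The code clears the sign bit, converts, then adds 2^31 (g_XMFixUnsigned)
// back to just those lanes. The equivalent per-element logic:
inline float ConvertUIntScalarExample(uint32_t u)
{
    uint32_t topBit = u & 0x80000000U;                                   // lane needs fixup?
    float f = static_cast<float>(static_cast<int32_t>(u & 0x7FFFFFFFU)); // convert as signed
    return topBit ? (f + 2147483648.0f) : f;                             // add 2^31 back
}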
  415. //------------------------------------------------------------------------------
  416. _Use_decl_annotations_
  417. inline XMVECTOR XM_CALLCONV XMLoadInt3
  418. (
  419. const uint32_t* pSource
  420. )
  421. {
  422. assert(pSource);
  423. #if defined(_XM_NO_INTRINSICS_)
  424. XMVECTOR V;
  425. V.vector4_u32[0] = pSource[0];
  426. V.vector4_u32[1] = pSource[1];
  427. V.vector4_u32[2] = pSource[2];
  428. V.vector4_u32[3] = 0;
  429. return V;
  430. #elif defined(_XM_ARM_NEON_INTRINSICS_)
  431. uint32x2_t x = vld1_u32( pSource );
  432. uint32x2_t zero = vdup_n_u32(0);
  433. uint32x2_t y = vld1_lane_u32( pSource+2, zero, 0 );
  434. return vcombine_u32( x, y );
  435. #elif defined(_XM_SSE_INTRINSICS_)
  436. __m128 x = _mm_load_ss( reinterpret_cast<const float*>(pSource) );
  437. __m128 y = _mm_load_ss( reinterpret_cast<const float*>(pSource+1) );
  438. __m128 z = _mm_load_ss( reinterpret_cast<const float*>(pSource+2) );
  439. __m128 xy = _mm_unpacklo_ps( x, y );
  440. return _mm_movelh_ps( xy, z );
  441. #endif
  442. }
  443. //------------------------------------------------------------------------------
  444. _Use_decl_annotations_
  445. inline XMVECTOR XM_CALLCONV XMLoadInt3A
  446. (
  447. const uint32_t* pSource
  448. )
  449. {
  450. assert(pSource);
  451. assert(((uintptr_t)pSource & 0xF) == 0);
  452. #if defined(_XM_NO_INTRINSICS_)
  453. XMVECTOR V;
  454. V.vector4_u32[0] = pSource[0];
  455. V.vector4_u32[1] = pSource[1];
  456. V.vector4_u32[2] = pSource[2];
  457. V.vector4_u32[3] = 0;
  458. return V;
  459. #elif defined(_XM_ARM_NEON_INTRINSICS_)
460. // Reads an extra integer which is zeroed
  461. uint32x4_t V = vld1q_u32_ex( pSource, 128 );
  462. return vsetq_lane_u32( 0, V, 3 );
  463. #elif defined(_XM_SSE_INTRINSICS_)
464. // Reads an extra integer which is zeroed
  465. __m128i V = _mm_load_si128( reinterpret_cast<const __m128i*>(pSource) );
  466. V = _mm_and_si128( V, g_XMMask3 );
  467. return _mm_castsi128_ps(V);
  468. #endif
  469. }
  470. //------------------------------------------------------------------------------
  471. _Use_decl_annotations_
  472. inline XMVECTOR XM_CALLCONV XMLoadFloat3
  473. (
  474. const XMFLOAT3* pSource
  475. )
  476. {
  477. assert(pSource);
  478. #if defined(_XM_NO_INTRINSICS_)
  479. XMVECTOR V;
  480. V.vector4_f32[0] = pSource->x;
  481. V.vector4_f32[1] = pSource->y;
  482. V.vector4_f32[2] = pSource->z;
  483. V.vector4_f32[3] = 0.f;
  484. return V;
  485. #elif defined(_XM_ARM_NEON_INTRINSICS_)
  486. float32x2_t x = vld1_f32( reinterpret_cast<const float*>(pSource) );
  487. float32x2_t zero = vdup_n_f32(0);
  488. float32x2_t y = vld1_lane_f32( reinterpret_cast<const float*>(pSource)+2, zero, 0 );
  489. return vcombine_f32( x, y );
  490. #elif defined(_XM_SSE_INTRINSICS_)
  491. __m128 x = _mm_load_ss( &pSource->x );
  492. __m128 y = _mm_load_ss( &pSource->y );
  493. __m128 z = _mm_load_ss( &pSource->z );
  494. __m128 xy = _mm_unpacklo_ps( x, y );
  495. return _mm_movelh_ps( xy, z );
  496. #endif
  497. }
  498. //------------------------------------------------------------------------------
  499. _Use_decl_annotations_
  500. inline XMVECTOR XM_CALLCONV XMLoadFloat3A
  501. (
  502. const XMFLOAT3A* pSource
  503. )
  504. {
  505. assert(pSource);
  506. assert(((uintptr_t)pSource & 0xF) == 0);
  507. #if defined(_XM_NO_INTRINSICS_)
  508. XMVECTOR V;
  509. V.vector4_f32[0] = pSource->x;
  510. V.vector4_f32[1] = pSource->y;
  511. V.vector4_f32[2] = pSource->z;
  512. V.vector4_f32[3] = 0.f;
  513. return V;
  514. #elif defined(_XM_ARM_NEON_INTRINSICS_)
515. // Reads an extra float which is zeroed
  516. float32x4_t V = vld1q_f32_ex( reinterpret_cast<const float*>(pSource), 128 );
  517. return vsetq_lane_f32( 0, V, 3 );
  518. #elif defined(_XM_SSE_INTRINSICS_)
519. // Reads an extra float which is zeroed
  520. __m128 V = _mm_load_ps( &pSource->x );
  521. return _mm_and_ps( V, g_XMMask3 );
  522. #endif
  523. }
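//------------------------------------------------------------------------------
// Usage sketch (illustrative; not part of the original header): the aligned
// 3-element loads above read a full 16 bytes, so the source must be 16-byte
// aligned and safe to read one element past the end; XMFLOAT3A provides both.
// The variable name is hypothetical.
inline XMVECTOR XM_CALLCONV LoadAlignedPositionExample()
{
    XMFLOAT3A position(1.0f, 2.0f, 3.0f);   // 16-byte aligned, padded to 16 bytes
    return XMLoadFloat3A(&position);        // lanes: x, y, z, 0
}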
  524. //------------------------------------------------------------------------------
  525. _Use_decl_annotations_
  526. inline XMVECTOR XM_CALLCONV XMLoadSInt3
  527. (
  528. const XMINT3* pSource
  529. )
  530. {
  531. assert(pSource);
  532. #if defined(_XM_NO_INTRINSICS_)
  533. XMVECTOR V;
  534. V.vector4_f32[0] = (float)pSource->x;
  535. V.vector4_f32[1] = (float)pSource->y;
  536. V.vector4_f32[2] = (float)pSource->z;
  537. V.vector4_f32[3] = 0.f;
  538. return V;
  539. #elif defined(_XM_ARM_NEON_INTRINSICS_)
  540. int32x2_t x = vld1_s32( reinterpret_cast<const int32_t*>(pSource) );
  541. int32x2_t zero = vdup_n_s32(0);
  542. int32x2_t y = vld1_lane_s32( reinterpret_cast<const int32_t*>(pSource)+2, zero, 0 );
  543. int32x4_t v = vcombine_s32( x, y );
  544. return vcvtq_f32_s32( v );
  545. #elif defined(_XM_SSE_INTRINSICS_)
  546. __m128 x = _mm_load_ss( reinterpret_cast<const float*>(&pSource->x) );
  547. __m128 y = _mm_load_ss( reinterpret_cast<const float*>(&pSource->y) );
  548. __m128 z = _mm_load_ss( reinterpret_cast<const float*>(&pSource->z) );
  549. __m128 xy = _mm_unpacklo_ps( x, y );
  550. __m128 V = _mm_movelh_ps( xy, z );
  551. return _mm_cvtepi32_ps(_mm_castps_si128(V));
  552. #endif
  553. }
  554. //------------------------------------------------------------------------------
  555. _Use_decl_annotations_
  556. inline XMVECTOR XM_CALLCONV XMLoadUInt3
  557. (
  558. const XMUINT3* pSource
  559. )
  560. {
  561. assert(pSource);
  562. #if defined(_XM_NO_INTRINSICS_)
  563. XMVECTOR V;
  564. V.vector4_f32[0] = (float)pSource->x;
  565. V.vector4_f32[1] = (float)pSource->y;
  566. V.vector4_f32[2] = (float)pSource->z;
  567. V.vector4_f32[3] = 0.f;
  568. return V;
  569. #elif defined(_XM_ARM_NEON_INTRINSICS_)
  570. uint32x2_t x = vld1_u32( reinterpret_cast<const uint32_t*>(pSource) );
  571. uint32x2_t zero = vdup_n_u32(0);
  572. uint32x2_t y = vld1_lane_u32( reinterpret_cast<const uint32_t*>(pSource)+2, zero, 0 );
  573. uint32x4_t v = vcombine_u32( x, y );
  574. return vcvtq_f32_u32( v );
  575. #elif defined(_XM_SSE_INTRINSICS_)
  576. __m128 x = _mm_load_ss( reinterpret_cast<const float*>(&pSource->x) );
  577. __m128 y = _mm_load_ss( reinterpret_cast<const float*>(&pSource->y) );
  578. __m128 z = _mm_load_ss( reinterpret_cast<const float*>(&pSource->z) );
  579. __m128 xy = _mm_unpacklo_ps( x, y );
  580. __m128 V = _mm_movelh_ps( xy, z );
  581. // For the values that are higher than 0x7FFFFFFF, a fixup is needed
  582. // Determine which ones need the fix.
  583. XMVECTOR vMask = _mm_and_ps(V,g_XMNegativeZero);
  584. // Force all values positive
  585. XMVECTOR vResult = _mm_xor_ps(V,vMask);
  586. // Convert to floats
  587. vResult = _mm_cvtepi32_ps(_mm_castps_si128(vResult));
  588. // Convert 0x80000000 -> 0xFFFFFFFF
  589. __m128i iMask = _mm_srai_epi32(_mm_castps_si128(vMask),31);
  590. // For only the ones that are too big, add the fixup
  591. vMask = _mm_and_ps(_mm_castsi128_ps(iMask),g_XMFixUnsigned);
  592. vResult = _mm_add_ps(vResult,vMask);
  593. return vResult;
  594. #endif
  595. }
  596. //------------------------------------------------------------------------------
  597. _Use_decl_annotations_
  598. inline XMVECTOR XM_CALLCONV XMLoadInt4
  599. (
  600. const uint32_t* pSource
  601. )
  602. {
  603. assert(pSource);
  604. #if defined(_XM_NO_INTRINSICS_)
  605. XMVECTOR V;
  606. V.vector4_u32[0] = pSource[0];
  607. V.vector4_u32[1] = pSource[1];
  608. V.vector4_u32[2] = pSource[2];
  609. V.vector4_u32[3] = pSource[3];
  610. return V;
  611. #elif defined(_XM_ARM_NEON_INTRINSICS_)
  612. return vld1q_u32( pSource );
  613. #elif defined(_XM_SSE_INTRINSICS_)
  614. __m128i V = _mm_loadu_si128( reinterpret_cast<const __m128i*>(pSource) );
  615. return _mm_castsi128_ps(V);
  616. #endif
  617. }
  618. //------------------------------------------------------------------------------
  619. _Use_decl_annotations_
  620. inline XMVECTOR XM_CALLCONV XMLoadInt4A
  621. (
  622. const uint32_t* pSource
  623. )
  624. {
  625. assert(pSource);
  626. assert(((uintptr_t)pSource & 0xF) == 0);
  627. #if defined(_XM_NO_INTRINSICS_)
  628. XMVECTOR V;
  629. V.vector4_u32[0] = pSource[0];
  630. V.vector4_u32[1] = pSource[1];
  631. V.vector4_u32[2] = pSource[2];
  632. V.vector4_u32[3] = pSource[3];
  633. return V;
  634. #elif defined(_XM_ARM_NEON_INTRINSICS_)
  635. return vld1q_u32_ex( pSource, 128 );
  636. #elif defined(_XM_SSE_INTRINSICS_)
  637. __m128i V = _mm_load_si128( reinterpret_cast<const __m128i*>(pSource) );
  638. return _mm_castsi128_ps(V);
  639. #endif
  640. }
  641. //------------------------------------------------------------------------------
  642. _Use_decl_annotations_
  643. inline XMVECTOR XM_CALLCONV XMLoadFloat4
  644. (
  645. const XMFLOAT4* pSource
  646. )
  647. {
  648. assert(pSource);
  649. #if defined(_XM_NO_INTRINSICS_)
  650. XMVECTOR V;
  651. V.vector4_f32[0] = pSource->x;
  652. V.vector4_f32[1] = pSource->y;
  653. V.vector4_f32[2] = pSource->z;
  654. V.vector4_f32[3] = pSource->w;
  655. return V;
  656. #elif defined(_XM_ARM_NEON_INTRINSICS_)
  657. return vld1q_f32( reinterpret_cast<const float*>(pSource) );
  658. #elif defined(_XM_SSE_INTRINSICS_)
  659. return _mm_loadu_ps( &pSource->x );
  660. #endif
  661. }
  662. //------------------------------------------------------------------------------
  663. _Use_decl_annotations_
  664. inline XMVECTOR XM_CALLCONV XMLoadFloat4A
  665. (
  666. const XMFLOAT4A* pSource
  667. )
  668. {
  669. assert(pSource);
  670. assert(((uintptr_t)pSource & 0xF) == 0);
  671. #if defined(_XM_NO_INTRINSICS_)
  672. XMVECTOR V;
  673. V.vector4_f32[0] = pSource->x;
  674. V.vector4_f32[1] = pSource->y;
  675. V.vector4_f32[2] = pSource->z;
  676. V.vector4_f32[3] = pSource->w;
  677. return V;
  678. #elif defined(_XM_ARM_NEON_INTRINSICS_)
  679. return vld1q_f32_ex( reinterpret_cast<const float*>(pSource), 128 );
  680. #elif defined(_XM_SSE_INTRINSICS_)
  681. return _mm_load_ps( &pSource->x );
  682. #endif
  683. }
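//------------------------------------------------------------------------------
// Usage sketch (illustrative; not part of the original header): XMLoadFloat4
// accepts any address (unaligned load), while XMLoadFloat4A asserts 16-byte
// alignment and uses the aligned load instruction. The parameter names are
// hypothetical.
inline XMVECTOR XM_CALLCONV LoadColorsExample(const XMFLOAT4A& alignedColor, const XMFLOAT4& unalignedColor)
{
    XMVECTOR a = XMLoadFloat4A(&alignedColor);      // aligned path (_mm_load_ps on SSE)
    XMVECTOR b = XMLoadFloat4(&unalignedColor);     // unaligned path (_mm_loadu_ps on SSE)
    return XMVectorAdd(a, b);
}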
  684. //------------------------------------------------------------------------------
  685. _Use_decl_annotations_
  686. inline XMVECTOR XM_CALLCONV XMLoadSInt4
  687. (
  688. const XMINT4* pSource
  689. )
  690. {
  691. assert(pSource);
  692. #if defined(_XM_NO_INTRINSICS_)
  693. XMVECTOR V;
  694. V.vector4_f32[0] = (float)pSource->x;
  695. V.vector4_f32[1] = (float)pSource->y;
  696. V.vector4_f32[2] = (float)pSource->z;
  697. V.vector4_f32[3] = (float)pSource->w;
  698. return V;
  699. #elif defined(_XM_ARM_NEON_INTRINSICS_)
  700. int32x4_t v = vld1q_s32( reinterpret_cast<const int32_t*>(pSource) );
  701. return vcvtq_f32_s32( v );
  702. #elif defined(_XM_SSE_INTRINSICS_)
  703. __m128i V = _mm_loadu_si128( reinterpret_cast<const __m128i*>(pSource) );
  704. return _mm_cvtepi32_ps(V);
  705. #endif
  706. }
  707. //------------------------------------------------------------------------------
  708. _Use_decl_annotations_
  709. inline XMVECTOR XM_CALLCONV XMLoadUInt4
  710. (
  711. const XMUINT4* pSource
  712. )
  713. {
  714. assert(pSource);
  715. #if defined(_XM_NO_INTRINSICS_)
  716. XMVECTOR V;
  717. V.vector4_f32[0] = (float)pSource->x;
  718. V.vector4_f32[1] = (float)pSource->y;
  719. V.vector4_f32[2] = (float)pSource->z;
  720. V.vector4_f32[3] = (float)pSource->w;
  721. return V;
  722. #elif defined(_XM_ARM_NEON_INTRINSICS_)
  723. uint32x4_t v = vld1q_u32( reinterpret_cast<const uint32_t*>(pSource) );
  724. return vcvtq_f32_u32( v );
  725. #elif defined(_XM_SSE_INTRINSICS_)
  726. __m128i V = _mm_loadu_si128( reinterpret_cast<const __m128i*>(pSource) );
  727. // For the values that are higher than 0x7FFFFFFF, a fixup is needed
  728. // Determine which ones need the fix.
  729. XMVECTOR vMask = _mm_and_ps(_mm_castsi128_ps(V),g_XMNegativeZero);
  730. // Force all values positive
  731. XMVECTOR vResult = _mm_xor_ps(_mm_castsi128_ps(V),vMask);
  732. // Convert to floats
  733. vResult = _mm_cvtepi32_ps(_mm_castps_si128(vResult));
  734. // Convert 0x80000000 -> 0xFFFFFFFF
  735. __m128i iMask = _mm_srai_epi32(_mm_castps_si128(vMask),31);
  736. // For only the ones that are too big, add the fixup
  737. vMask = _mm_and_ps(_mm_castsi128_ps(iMask),g_XMFixUnsigned);
  738. vResult = _mm_add_ps(vResult,vMask);
  739. return vResult;
  740. #endif
  741. }
  742. //------------------------------------------------------------------------------
  743. _Use_decl_annotations_
  744. inline XMMATRIX XM_CALLCONV XMLoadFloat3x3
  745. (
  746. const XMFLOAT3X3* pSource
  747. )
  748. {
  749. assert(pSource);
  750. #if defined(_XM_NO_INTRINSICS_)
  751. XMMATRIX M;
  752. M.r[0].vector4_f32[0] = pSource->m[0][0];
  753. M.r[0].vector4_f32[1] = pSource->m[0][1];
  754. M.r[0].vector4_f32[2] = pSource->m[0][2];
  755. M.r[0].vector4_f32[3] = 0.0f;
  756. M.r[1].vector4_f32[0] = pSource->m[1][0];
  757. M.r[1].vector4_f32[1] = pSource->m[1][1];
  758. M.r[1].vector4_f32[2] = pSource->m[1][2];
  759. M.r[1].vector4_f32[3] = 0.0f;
  760. M.r[2].vector4_f32[0] = pSource->m[2][0];
  761. M.r[2].vector4_f32[1] = pSource->m[2][1];
  762. M.r[2].vector4_f32[2] = pSource->m[2][2];
  763. M.r[2].vector4_f32[3] = 0.0f;
  764. M.r[3].vector4_f32[0] = 0.0f;
  765. M.r[3].vector4_f32[1] = 0.0f;
  766. M.r[3].vector4_f32[2] = 0.0f;
  767. M.r[3].vector4_f32[3] = 1.0f;
  768. return M;
  769. #elif defined(_XM_ARM_NEON_INTRINSICS_)
  770. float32x4_t v0 = vld1q_f32( &pSource->m[0][0] );
  771. float32x4_t v1 = vld1q_f32( &pSource->m[1][1] );
  772. float32x2_t v2 = vcreate_f32( (uint64_t)*(const uint32_t*)&pSource->m[2][2] );
  773. float32x4_t T = vextq_f32( v0, v1, 3 );
  774. XMMATRIX M;
  775. M.r[0] = vandq_u32( v0, g_XMMask3 );
  776. M.r[1] = vandq_u32( T, g_XMMask3 );
  777. M.r[2] = vcombine_f32( vget_high_f32(v1), v2 );
  778. M.r[3] = g_XMIdentityR3;
  779. return M;
  780. #elif defined(_XM_SSE_INTRINSICS_)
  781. __m128 Z = _mm_setzero_ps();
  782. __m128 V1 = _mm_loadu_ps( &pSource->m[0][0] );
  783. __m128 V2 = _mm_loadu_ps( &pSource->m[1][1] );
  784. __m128 V3 = _mm_load_ss( &pSource->m[2][2] );
  785. __m128 T1 = _mm_unpackhi_ps( V1, Z );
  786. __m128 T2 = _mm_unpacklo_ps( V2, Z );
  787. __m128 T3 = _mm_shuffle_ps( V3, T2, _MM_SHUFFLE( 0, 1, 0, 0 ) );
  788. __m128 T4 = _mm_movehl_ps( T2, T3 );
  789. __m128 T5 = _mm_movehl_ps( Z, T1 );
  790. XMMATRIX M;
  791. M.r[0] = _mm_movelh_ps( V1, T1 );
  792. M.r[1] = _mm_add_ps( T4, T5 );
  793. M.r[2] = _mm_shuffle_ps( V2, V3, _MM_SHUFFLE(1, 0, 3, 2) );
  794. M.r[3] = g_XMIdentityR3;
  795. return M;
  796. #endif
  797. }
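//------------------------------------------------------------------------------
// Usage sketch (illustrative; not part of the original header): a 3x3 rotation
// stored as XMFLOAT3X3 expands to a full XMMATRIX whose fourth row and column
// come from the identity, so it can feed the 4x4 transform routines directly.
// The names are hypothetical.
inline XMVECTOR XM_CALLCONV RotateByFloat3x3Example(FXMVECTOR v, const XMFLOAT3X3& rotation)
{
    XMMATRIX m = XMLoadFloat3x3(&rotation);     // rows padded with 0, last row = (0,0,0,1)
    return XMVector3Transform(v, m);
}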
  798. //------------------------------------------------------------------------------
  799. _Use_decl_annotations_
  800. inline XMMATRIX XM_CALLCONV XMLoadFloat4x3
  801. (
  802. const XMFLOAT4X3* pSource
  803. )
  804. {
  805. assert(pSource);
  806. #if defined(_XM_NO_INTRINSICS_)
  807. XMMATRIX M;
  808. M.r[0].vector4_f32[0] = pSource->m[0][0];
  809. M.r[0].vector4_f32[1] = pSource->m[0][1];
  810. M.r[0].vector4_f32[2] = pSource->m[0][2];
  811. M.r[0].vector4_f32[3] = 0.0f;
  812. M.r[1].vector4_f32[0] = pSource->m[1][0];
  813. M.r[1].vector4_f32[1] = pSource->m[1][1];
  814. M.r[1].vector4_f32[2] = pSource->m[1][2];
  815. M.r[1].vector4_f32[3] = 0.0f;
  816. M.r[2].vector4_f32[0] = pSource->m[2][0];
  817. M.r[2].vector4_f32[1] = pSource->m[2][1];
  818. M.r[2].vector4_f32[2] = pSource->m[2][2];
  819. M.r[2].vector4_f32[3] = 0.0f;
  820. M.r[3].vector4_f32[0] = pSource->m[3][0];
  821. M.r[3].vector4_f32[1] = pSource->m[3][1];
  822. M.r[3].vector4_f32[2] = pSource->m[3][2];
  823. M.r[3].vector4_f32[3] = 1.0f;
  824. return M;
  825. #elif defined(_XM_ARM_NEON_INTRINSICS_)
  826. float32x4_t v0 = vld1q_f32( &pSource->m[0][0] );
  827. float32x4_t v1 = vld1q_f32( &pSource->m[1][1] );
  828. float32x4_t v2 = vld1q_f32( &pSource->m[2][2] );
  829. float32x4_t T1 = vextq_f32( v0, v1, 3 );
  830. float32x4_t T2 = vcombine_f32( vget_high_f32(v1), vget_low_f32(v2) );
  831. float32x4_t T3 = vextq_f32( v2, v2, 1 );
  832. XMMATRIX M;
  833. M.r[0] = vandq_u32( v0, g_XMMask3 );
  834. M.r[1] = vandq_u32( T1, g_XMMask3 );
  835. M.r[2] = vandq_u32( T2, g_XMMask3 );
  836. M.r[3] = vsetq_lane_f32( 1.f, T3, 3 );
  837. return M;
  838. #elif defined(_XM_SSE_INTRINSICS_)
  839. // Use unaligned load instructions to
  840. // load the 12 floats
  841. // vTemp1 = x1,y1,z1,x2
  842. XMVECTOR vTemp1 = _mm_loadu_ps(&pSource->m[0][0]);
  843. // vTemp2 = y2,z2,x3,y3
  844. XMVECTOR vTemp2 = _mm_loadu_ps(&pSource->m[1][1]);
  845. // vTemp4 = z3,x4,y4,z4
  846. XMVECTOR vTemp4 = _mm_loadu_ps(&pSource->m[2][2]);
  847. // vTemp3 = x3,y3,z3,z3
  848. XMVECTOR vTemp3 = _mm_shuffle_ps(vTemp2,vTemp4,_MM_SHUFFLE(0,0,3,2));
  849. // vTemp2 = y2,z2,x2,x2
  850. vTemp2 = _mm_shuffle_ps(vTemp2,vTemp1,_MM_SHUFFLE(3,3,1,0));
  851. // vTemp2 = x2,y2,z2,z2
  852. vTemp2 = XM_PERMUTE_PS(vTemp2,_MM_SHUFFLE(1,1,0,2));
  853. // vTemp1 = x1,y1,z1,0
  854. vTemp1 = _mm_and_ps(vTemp1,g_XMMask3);
  855. // vTemp2 = x2,y2,z2,0
  856. vTemp2 = _mm_and_ps(vTemp2,g_XMMask3);
  857. // vTemp3 = x3,y3,z3,0
  858. vTemp3 = _mm_and_ps(vTemp3,g_XMMask3);
  859. // vTemp4i = x4,y4,z4,0
  860. __m128i vTemp4i = _mm_srli_si128(_mm_castps_si128(vTemp4),32/8);
  861. // vTemp4i = x4,y4,z4,1.0f
  862. vTemp4i = _mm_or_si128(vTemp4i,g_XMIdentityR3);
  863. XMMATRIX M(vTemp1,
  864. vTemp2,
  865. vTemp3,
  866. _mm_castsi128_ps(vTemp4i));
  867. return M;
  868. #endif
  869. }
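//------------------------------------------------------------------------------
// Usage sketch (illustrative; not part of the original header): XMFLOAT4X3
// packs a 4x3 matrix as 12 consecutive floats (three rotation/scale rows plus a
// translation row); the load above rebuilds the implicit (0,0,0,1) fourth
// column. The names are hypothetical.
inline XMVECTOR XM_CALLCONV TransformByFloat4x3Example(FXMVECTOR point, const XMFLOAT4X3& world)
{
    XMMATRIX m = XMLoadFloat4x3(&world);        // 12 floats -> four XMVECTOR rows
    return XMVector3TransformCoord(point, m);   // rotation/scale plus translation
}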
  870. //------------------------------------------------------------------------------
  871. _Use_decl_annotations_
  872. inline XMMATRIX XM_CALLCONV XMLoadFloat4x3A
  873. (
  874. const XMFLOAT4X3A* pSource
  875. )
  876. {
  877. assert(pSource);
  878. assert(((uintptr_t)pSource & 0xF) == 0);
  879. #if defined(_XM_NO_INTRINSICS_)
  880. XMMATRIX M;
  881. M.r[0].vector4_f32[0] = pSource->m[0][0];
  882. M.r[0].vector4_f32[1] = pSource->m[0][1];
  883. M.r[0].vector4_f32[2] = pSource->m[0][2];
  884. M.r[0].vector4_f32[3] = 0.0f;
  885. M.r[1].vector4_f32[0] = pSource->m[1][0];
  886. M.r[1].vector4_f32[1] = pSource->m[1][1];
  887. M.r[1].vector4_f32[2] = pSource->m[1][2];
  888. M.r[1].vector4_f32[3] = 0.0f;
  889. M.r[2].vector4_f32[0] = pSource->m[2][0];
  890. M.r[2].vector4_f32[1] = pSource->m[2][1];
  891. M.r[2].vector4_f32[2] = pSource->m[2][2];
  892. M.r[2].vector4_f32[3] = 0.0f;
  893. M.r[3].vector4_f32[0] = pSource->m[3][0];
  894. M.r[3].vector4_f32[1] = pSource->m[3][1];
  895. M.r[3].vector4_f32[2] = pSource->m[3][2];
  896. M.r[3].vector4_f32[3] = 1.0f;
  897. return M;
  898. #elif defined(_XM_ARM_NEON_INTRINSICS_)
  899. float32x4_t v0 = vld1q_f32_ex( &pSource->m[0][0], 128 );
  900. float32x4_t v1 = vld1q_f32_ex( &pSource->m[1][1], 128 );
  901. float32x4_t v2 = vld1q_f32_ex( &pSource->m[2][2], 128 );
  902. float32x4_t T1 = vextq_f32( v0, v1, 3 );
  903. float32x4_t T2 = vcombine_f32( vget_high_f32(v1), vget_low_f32(v2) );
  904. float32x4_t T3 = vextq_f32( v2, v2, 1 );
  905. XMMATRIX M;
  906. M.r[0] = vandq_u32( v0, g_XMMask3 );
  907. M.r[1] = vandq_u32( T1, g_XMMask3 );
  908. M.r[2] = vandq_u32( T2, g_XMMask3 );
  909. M.r[3] = vsetq_lane_f32( 1.f, T3, 3 );
  910. return M;
  911. #elif defined(_XM_SSE_INTRINSICS_)
  912. // Use aligned load instructions to
  913. // load the 12 floats
  914. // vTemp1 = x1,y1,z1,x2
  915. XMVECTOR vTemp1 = _mm_load_ps(&pSource->m[0][0]);
  916. // vTemp2 = y2,z2,x3,y3
  917. XMVECTOR vTemp2 = _mm_load_ps(&pSource->m[1][1]);
  918. // vTemp4 = z3,x4,y4,z4
  919. XMVECTOR vTemp4 = _mm_load_ps(&pSource->m[2][2]);
  920. // vTemp3 = x3,y3,z3,z3
  921. XMVECTOR vTemp3 = _mm_shuffle_ps(vTemp2,vTemp4,_MM_SHUFFLE(0,0,3,2));
  922. // vTemp2 = y2,z2,x2,x2
  923. vTemp2 = _mm_shuffle_ps(vTemp2,vTemp1,_MM_SHUFFLE(3,3,1,0));
  924. // vTemp2 = x2,y2,z2,z2
  925. vTemp2 = XM_PERMUTE_PS(vTemp2,_MM_SHUFFLE(1,1,0,2));
  926. // vTemp1 = x1,y1,z1,0
  927. vTemp1 = _mm_and_ps(vTemp1,g_XMMask3);
  928. // vTemp2 = x2,y2,z2,0
  929. vTemp2 = _mm_and_ps(vTemp2,g_XMMask3);
  930. // vTemp3 = x3,y3,z3,0
  931. vTemp3 = _mm_and_ps(vTemp3,g_XMMask3);
  932. // vTemp4i = x4,y4,z4,0
  933. __m128i vTemp4i = _mm_srli_si128(_mm_castps_si128(vTemp4),32/8);
  934. // vTemp4i = x4,y4,z4,1.0f
  935. vTemp4i = _mm_or_si128(vTemp4i,g_XMIdentityR3);
  936. XMMATRIX M(vTemp1,
  937. vTemp2,
  938. vTemp3,
  939. _mm_castsi128_ps(vTemp4i));
  940. return M;
  941. #endif
  942. }
  943. //------------------------------------------------------------------------------
  944. _Use_decl_annotations_
  945. inline XMMATRIX XM_CALLCONV XMLoadFloat4x4
  946. (
  947. const XMFLOAT4X4* pSource
  948. )
  949. {
  950. assert(pSource);
  951. #if defined(_XM_NO_INTRINSICS_)
  952. XMMATRIX M;
  953. M.r[0].vector4_f32[0] = pSource->m[0][0];
  954. M.r[0].vector4_f32[1] = pSource->m[0][1];
  955. M.r[0].vector4_f32[2] = pSource->m[0][2];
  956. M.r[0].vector4_f32[3] = pSource->m[0][3];
  957. M.r[1].vector4_f32[0] = pSource->m[1][0];
  958. M.r[1].vector4_f32[1] = pSource->m[1][1];
  959. M.r[1].vector4_f32[2] = pSource->m[1][2];
  960. M.r[1].vector4_f32[3] = pSource->m[1][3];
  961. M.r[2].vector4_f32[0] = pSource->m[2][0];
  962. M.r[2].vector4_f32[1] = pSource->m[2][1];
  963. M.r[2].vector4_f32[2] = pSource->m[2][2];
  964. M.r[2].vector4_f32[3] = pSource->m[2][3];
  965. M.r[3].vector4_f32[0] = pSource->m[3][0];
  966. M.r[3].vector4_f32[1] = pSource->m[3][1];
  967. M.r[3].vector4_f32[2] = pSource->m[3][2];
  968. M.r[3].vector4_f32[3] = pSource->m[3][3];
  969. return M;
  970. #elif defined(_XM_ARM_NEON_INTRINSICS_)
  971. XMMATRIX M;
  972. M.r[0] = vld1q_f32( reinterpret_cast<const float*>(&pSource->_11) );
  973. M.r[1] = vld1q_f32( reinterpret_cast<const float*>(&pSource->_21) );
  974. M.r[2] = vld1q_f32( reinterpret_cast<const float*>(&pSource->_31) );
  975. M.r[3] = vld1q_f32( reinterpret_cast<const float*>(&pSource->_41) );
  976. return M;
  977. #elif defined(_XM_SSE_INTRINSICS_)
  978. XMMATRIX M;
  979. M.r[0] = _mm_loadu_ps( &pSource->_11 );
  980. M.r[1] = _mm_loadu_ps( &pSource->_21 );
  981. M.r[2] = _mm_loadu_ps( &pSource->_31 );
  982. M.r[3] = _mm_loadu_ps( &pSource->_41 );
  983. return M;
  984. #endif
  985. }
  986. //------------------------------------------------------------------------------
  987. _Use_decl_annotations_
  988. inline XMMATRIX XM_CALLCONV XMLoadFloat4x4A
  989. (
  990. const XMFLOAT4X4A* pSource
  991. )
  992. {
  993. assert(pSource);
  994. assert(((uintptr_t)pSource & 0xF) == 0);
  995. #if defined(_XM_NO_INTRINSICS_)
  996. XMMATRIX M;
  997. M.r[0].vector4_f32[0] = pSource->m[0][0];
  998. M.r[0].vector4_f32[1] = pSource->m[0][1];
  999. M.r[0].vector4_f32[2] = pSource->m[0][2];
  1000. M.r[0].vector4_f32[3] = pSource->m[0][3];
  1001. M.r[1].vector4_f32[0] = pSource->m[1][0];
  1002. M.r[1].vector4_f32[1] = pSource->m[1][1];
  1003. M.r[1].vector4_f32[2] = pSource->m[1][2];
  1004. M.r[1].vector4_f32[3] = pSource->m[1][3];
  1005. M.r[2].vector4_f32[0] = pSource->m[2][0];
  1006. M.r[2].vector4_f32[1] = pSource->m[2][1];
  1007. M.r[2].vector4_f32[2] = pSource->m[2][2];
  1008. M.r[2].vector4_f32[3] = pSource->m[2][3];
  1009. M.r[3].vector4_f32[0] = pSource->m[3][0];
  1010. M.r[3].vector4_f32[1] = pSource->m[3][1];
  1011. M.r[3].vector4_f32[2] = pSource->m[3][2];
  1012. M.r[3].vector4_f32[3] = pSource->m[3][3];
  1013. return M;
  1014. #elif defined(_XM_ARM_NEON_INTRINSICS_)
  1015. XMMATRIX M;
  1016. M.r[0] = vld1q_f32_ex( reinterpret_cast<const float*>(&pSource->_11), 128 );
  1017. M.r[1] = vld1q_f32_ex( reinterpret_cast<const float*>(&pSource->_21), 128 );
  1018. M.r[2] = vld1q_f32_ex( reinterpret_cast<const float*>(&pSource->_31), 128 );
  1019. M.r[3] = vld1q_f32_ex( reinterpret_cast<const float*>(&pSource->_41), 128 );
  1020. return M;
  1021. #elif defined(_XM_SSE_INTRINSICS_)
  1022. XMMATRIX M;
  1023. M.r[0] = _mm_load_ps( &pSource->_11 );
  1024. M.r[1] = _mm_load_ps( &pSource->_21 );
  1025. M.r[2] = _mm_load_ps( &pSource->_31 );
  1026. M.r[3] = _mm_load_ps( &pSource->_41 );
  1027. return M;
  1028. #endif
  1029. }
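//------------------------------------------------------------------------------
// Usage sketch (illustrative; not part of the original header): matrices are
// typically loaded into XMMATRIX registers for math and stored back to a
// 16-byte-aligned XMFLOAT4X4A; the A variants assert the alignment. The names
// are hypothetical.
inline void XM_CALLCONV ConcatenateWorldExample(XMFLOAT4X4A& inOutWorld, FXMMATRIX delta)
{
    XMMATRIX world = XMLoadFloat4x4A(&inOutWorld);
    world = XMMatrixMultiply(world, delta);
    XMStoreFloat4x4A(&inOutWorld, world);
}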
  1030. /****************************************************************************
  1031. *
  1032. * Vector and matrix store operations
  1033. *
  1034. ****************************************************************************/
  1035. _Use_decl_annotations_
  1036. inline void XM_CALLCONV XMStoreInt
  1037. (
  1038. uint32_t* pDestination,
  1039. FXMVECTOR V
  1040. )
  1041. {
  1042. assert(pDestination);
  1043. #if defined(_XM_NO_INTRINSICS_)
  1044. *pDestination = XMVectorGetIntX( V );
  1045. #elif defined(_XM_ARM_NEON_INTRINSICS_)
  1046. vst1q_lane_u32( pDestination, *reinterpret_cast<const uint32x4_t*>(&V), 0 );
  1047. #elif defined(_XM_SSE_INTRINSICS_)
  1048. _mm_store_ss( reinterpret_cast<float*>(pDestination), V );
  1049. #endif
  1050. }
  1051. //------------------------------------------------------------------------------
  1052. _Use_decl_annotations_
  1053. inline void XM_CALLCONV XMStoreFloat
  1054. (
  1055. float* pDestination,
  1056. FXMVECTOR V
  1057. )
  1058. {
  1059. assert(pDestination);
  1060. #if defined(_XM_NO_INTRINSICS_)
  1061. *pDestination = XMVectorGetX( V );
  1062. #elif defined(_XM_ARM_NEON_INTRINSICS_)
  1063. vst1q_lane_f32( pDestination, V, 0 );
  1064. #elif defined(_XM_SSE_INTRINSICS_)
  1065. _mm_store_ss( pDestination, V );
  1066. #endif
  1067. }
  1068. //------------------------------------------------------------------------------
  1069. _Use_decl_annotations_
  1070. inline void XM_CALLCONV XMStoreInt2
  1071. (
  1072. uint32_t* pDestination,
  1073. FXMVECTOR V
  1074. )
  1075. {
  1076. assert(pDestination);
  1077. #if defined(_XM_NO_INTRINSICS_)
  1078. pDestination[0] = V.vector4_u32[0];
  1079. pDestination[1] = V.vector4_u32[1];
  1080. #elif defined(_XM_ARM_NEON_INTRINSICS_)
  1081. uint32x2_t VL = vget_low_u32(V);
  1082. vst1_u32( pDestination, VL );
  1083. #elif defined(_XM_SSE_INTRINSICS_)
  1084. XMVECTOR T = XM_PERMUTE_PS( V, _MM_SHUFFLE( 1, 1, 1, 1 ) );
  1085. _mm_store_ss( reinterpret_cast<float*>(&pDestination[0]), V );
  1086. _mm_store_ss( reinterpret_cast<float*>(&pDestination[1]), T );
  1087. #endif
  1088. }
  1089. //------------------------------------------------------------------------------
  1090. _Use_decl_annotations_
  1091. inline void XM_CALLCONV XMStoreInt2A
  1092. (
  1093. uint32_t* pDestination,
  1094. FXMVECTOR V
  1095. )
  1096. {
  1097. assert(pDestination);
  1098. assert(((uintptr_t)pDestination & 0xF) == 0);
  1099. #if defined(_XM_NO_INTRINSICS_)
  1100. pDestination[0] = V.vector4_u32[0];
  1101. pDestination[1] = V.vector4_u32[1];
  1102. #elif defined(_XM_ARM_NEON_INTRINSICS_)
  1103. uint32x2_t VL = vget_low_u32(V);
  1104. vst1_u32_ex( pDestination, VL, 64 );
  1105. #elif defined(_XM_SSE_INTRINSICS_)
  1106. _mm_storel_epi64( reinterpret_cast<__m128i*>(pDestination), _mm_castps_si128(V) );
  1107. #endif
  1108. }
  1109. //------------------------------------------------------------------------------
  1110. _Use_decl_annotations_
  1111. inline void XM_CALLCONV XMStoreFloat2
  1112. (
  1113. XMFLOAT2* pDestination,
  1114. FXMVECTOR V
  1115. )
  1116. {
  1117. assert(pDestination);
  1118. #if defined(_XM_NO_INTRINSICS_)
  1119. pDestination->x = V.vector4_f32[0];
  1120. pDestination->y = V.vector4_f32[1];
  1121. #elif defined(_XM_ARM_NEON_INTRINSICS_)
  1122. float32x2_t VL = vget_low_f32(V);
  1123. vst1_f32( reinterpret_cast<float*>(pDestination), VL );
  1124. #elif defined(_XM_SSE_INTRINSICS_)
  1125. XMVECTOR T = XM_PERMUTE_PS( V, _MM_SHUFFLE( 1, 1, 1, 1 ) );
  1126. _mm_store_ss( &pDestination->x, V );
  1127. _mm_store_ss( &pDestination->y, T );
  1128. #endif
  1129. }
  1130. //------------------------------------------------------------------------------
  1131. _Use_decl_annotations_
  1132. inline void XM_CALLCONV XMStoreFloat2A
  1133. (
  1134. XMFLOAT2A* pDestination,
  1135. FXMVECTOR V
  1136. )
  1137. {
  1138. assert(pDestination);
  1139. assert(((uintptr_t)pDestination & 0xF) == 0);
  1140. #if defined(_XM_NO_INTRINSICS_)
  1141. pDestination->x = V.vector4_f32[0];
  1142. pDestination->y = V.vector4_f32[1];
  1143. #elif defined(_XM_ARM_NEON_INTRINSICS_)
  1144. float32x2_t VL = vget_low_f32(V);
  1145. vst1_f32_ex( reinterpret_cast<float*>(pDestination), VL, 64 );
  1146. #elif defined(_XM_SSE_INTRINSICS_)
  1147. _mm_storel_epi64( reinterpret_cast<__m128i*>(pDestination), _mm_castps_si128(V) );
  1148. #endif
  1149. }
  1150. //------------------------------------------------------------------------------
  1151. _Use_decl_annotations_
  1152. inline void XM_CALLCONV XMStoreSInt2
  1153. (
  1154. XMINT2* pDestination,
  1155. FXMVECTOR V
  1156. )
  1157. {
  1158. assert(pDestination);
  1159. #if defined(_XM_NO_INTRINSICS_)
  1160. pDestination->x = (int32_t)V.vector4_f32[0];
  1161. pDestination->y = (int32_t)V.vector4_f32[1];
  1162. #elif defined(_XM_ARM_NEON_INTRINSICS_)
  1163. int32x2_t v = vget_low_s32(V);
  1164. v = vcvt_s32_f32( v );
  1165. vst1_s32( reinterpret_cast<int32_t*>(pDestination), v );
  1166. #elif defined(_XM_SSE_INTRINSICS_)
  1167. // In case of positive overflow, detect it
  1168. XMVECTOR vOverflow = _mm_cmpgt_ps(V,g_XMMaxInt);
  1169. // Float to int conversion
  1170. __m128i vResulti = _mm_cvttps_epi32(V);
  1171. // If there was positive overflow, set to 0x7FFFFFFF
  1172. XMVECTOR vResult = _mm_and_ps(vOverflow,g_XMAbsMask);
  1173. vOverflow = _mm_andnot_ps(vOverflow,_mm_castsi128_ps(vResulti));
  1174. vOverflow = _mm_or_ps(vOverflow,vResult);
  1175. // Write two ints
  1176. XMVECTOR T = XM_PERMUTE_PS( vOverflow, _MM_SHUFFLE( 1, 1, 1, 1 ) );
  1177. _mm_store_ss( reinterpret_cast<float*>(&pDestination->x), vOverflow );
  1178. _mm_store_ss( reinterpret_cast<float*>(&pDestination->y), T );
  1179. #endif
  1180. }
  1181. //------------------------------------------------------------------------------
  1182. _Use_decl_annotations_
  1183. inline void XM_CALLCONV XMStoreUInt2
  1184. (
  1185. XMUINT2* pDestination,
  1186. FXMVECTOR V
  1187. )
  1188. {
  1189. assert(pDestination);
  1190. #if defined(_XM_NO_INTRINSICS_)
  1191. pDestination->x = (uint32_t)V.vector4_f32[0];
  1192. pDestination->y = (uint32_t)V.vector4_f32[1];
  1193. #elif defined(_XM_ARM_NEON_INTRINSICS_)
  1194. float32x2_t v = vget_low_f32(V);
  1195. uint32x2_t iv = vcvt_u32_f32( v );
  1196. vst1_u32( reinterpret_cast<uint32_t*>(pDestination), iv );
  1197. #elif defined(_XM_SSE_INTRINSICS_)
  1198. // Clamp to >=0
  1199. XMVECTOR vResult = _mm_max_ps(V,g_XMZero);
  1200. // Any numbers that are too big, set to 0xFFFFFFFFU
  1201. XMVECTOR vOverflow = _mm_cmpgt_ps(vResult,g_XMMaxUInt);
  1202. XMVECTOR vValue = g_XMUnsignedFix;
  1203. // Too large for a signed integer?
  1204. XMVECTOR vMask = _mm_cmpge_ps(vResult,vValue);
1205. // Zero for numbers lower than 0x80000000, 32768.0f*65536.0f otherwise
  1206. vValue = _mm_and_ps(vValue,vMask);
  1207. // Perform fixup only on numbers too large (Keeps low bit precision)
  1208. vResult = _mm_sub_ps(vResult,vValue);
  1209. __m128i vResulti = _mm_cvttps_epi32(vResult);
1210. // Convert from signed to unsigned only if greater than 0x80000000
  1211. vMask = _mm_and_ps(vMask,g_XMNegativeZero);
  1212. vResult = _mm_xor_ps(_mm_castsi128_ps(vResulti),vMask);
  1213. // On those that are too large, set to 0xFFFFFFFF
  1214. vResult = _mm_or_ps(vResult,vOverflow);
  1215. // Write two uints
  1216. XMVECTOR T = XM_PERMUTE_PS( vResult, _MM_SHUFFLE( 1, 1, 1, 1 ) );
  1217. _mm_store_ss( reinterpret_cast<float*>(&pDestination->x), vResult );
  1218. _mm_store_ss( reinterpret_cast<float*>(&pDestination->y), T );
  1219. #endif
  1220. }
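//------------------------------------------------------------------------------
// Usage sketch (illustrative; not part of the original header): storing floats
// to XMUINT2 clamps negative lanes to 0 and saturates lanes beyond the uint32
// range to 0xFFFFFFFF, matching the overflow handling above. The values are
// hypothetical.
inline XMUINT2 StoreUInt2Example()
{
    XMUINT2 result;
    XMVECTOR v = XMVectorSet(-5.0f, 4.0e9f, 0.0f, 0.0f);
    XMStoreUInt2(&result, v);   // result.x == 0, result.y == 4000000000
    return result;
}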
  1221. //------------------------------------------------------------------------------
  1222. _Use_decl_annotations_
  1223. inline void XM_CALLCONV XMStoreInt3
  1224. (
  1225. uint32_t* pDestination,
  1226. FXMVECTOR V
  1227. )
  1228. {
  1229. assert(pDestination);
  1230. #if defined(_XM_NO_INTRINSICS_)
  1231. pDestination[0] = V.vector4_u32[0];
  1232. pDestination[1] = V.vector4_u32[1];
  1233. pDestination[2] = V.vector4_u32[2];
  1234. #elif defined(_XM_ARM_NEON_INTRINSICS_)
  1235. uint32x2_t VL = vget_low_u32(V);
  1236. vst1_u32( pDestination, VL );
  1237. vst1q_lane_u32( pDestination+2, *reinterpret_cast<const uint32x4_t*>(&V), 2 );
  1238. #elif defined(_XM_SSE_INTRINSICS_)
  1239. XMVECTOR T1 = XM_PERMUTE_PS(V,_MM_SHUFFLE(1,1,1,1));
  1240. XMVECTOR T2 = XM_PERMUTE_PS(V,_MM_SHUFFLE(2,2,2,2));
  1241. _mm_store_ss( reinterpret_cast<float*>(pDestination), V );
  1242. _mm_store_ss( reinterpret_cast<float*>(&pDestination[1]), T1 );
  1243. _mm_store_ss( reinterpret_cast<float*>(&pDestination[2]), T2 );
  1244. #endif
  1245. }
  1246. //------------------------------------------------------------------------------
  1247. _Use_decl_annotations_
  1248. inline void XM_CALLCONV XMStoreInt3A
  1249. (
  1250. uint32_t* pDestination,
  1251. FXMVECTOR V
  1252. )
  1253. {
  1254. assert(pDestination);
  1255. assert(((uintptr_t)pDestination & 0xF) == 0);
  1256. #if defined(_XM_NO_INTRINSICS_)
  1257. pDestination[0] = V.vector4_u32[0];
  1258. pDestination[1] = V.vector4_u32[1];
  1259. pDestination[2] = V.vector4_u32[2];
  1260. #elif defined(_XM_ARM_NEON_INTRINSICS_)
  1261. uint32x2_t VL = vget_low_u32(V);
  1262. vst1_u32_ex( pDestination, VL, 64 );
  1263. vst1q_lane_u32( pDestination+2, *reinterpret_cast<const uint32x4_t*>(&V), 2 );
  1264. #elif defined(_XM_SSE_INTRINSICS_)
  1265. XMVECTOR T = XM_PERMUTE_PS(V,_MM_SHUFFLE(2,2,2,2));
  1266. _mm_storel_epi64( reinterpret_cast<__m128i*>(pDestination), _mm_castps_si128(V) );
  1267. _mm_store_ss( reinterpret_cast<float*>(&pDestination[2]), T );
  1268. #endif
  1269. }
  1270. //------------------------------------------------------------------------------
_Use_decl_annotations_
inline void XM_CALLCONV XMStoreFloat3
(
    XMFLOAT3* pDestination,
    FXMVECTOR V
)
{
    assert(pDestination);
#if defined(_XM_NO_INTRINSICS_)
    pDestination->x = V.vector4_f32[0];
    pDestination->y = V.vector4_f32[1];
    pDestination->z = V.vector4_f32[2];
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    float32x2_t VL = vget_low_f32(V);
    vst1_f32( reinterpret_cast<float*>(pDestination), VL );
    vst1q_lane_f32( reinterpret_cast<float*>(pDestination)+2, V, 2 );
#elif defined(_XM_SSE_INTRINSICS_)
    XMVECTOR T1 = XM_PERMUTE_PS(V,_MM_SHUFFLE(1,1,1,1));
    XMVECTOR T2 = XM_PERMUTE_PS(V,_MM_SHUFFLE(2,2,2,2));
    _mm_store_ss( &pDestination->x, V );
    _mm_store_ss( &pDestination->y, T1 );
    _mm_store_ss( &pDestination->z, T2 );
#endif
}
//------------------------------------------------------------------------------
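// XMStoreFloat3A requires a 16-byte aligned destination (enforced by the assert
// below). A minimal sketch, assuming the declared alignment of XMFLOAT3A is
// sufficient for a stack variable:
//
//     XMFLOAT3A pa;                          // XMFLOAT3A is 16-byte aligned
//     XMStoreFloat3A(&pa, XMVectorReplicate(1.0f));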
_Use_decl_annotations_
inline void XM_CALLCONV XMStoreFloat3A
(
    XMFLOAT3A* pDestination,
    FXMVECTOR V
)
{
    assert(pDestination);
    assert(((uintptr_t)pDestination & 0xF) == 0);
#if defined(_XM_NO_INTRINSICS_)
    pDestination->x = V.vector4_f32[0];
    pDestination->y = V.vector4_f32[1];
    pDestination->z = V.vector4_f32[2];
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    float32x2_t VL = vget_low_f32(V);
    vst1_f32_ex( reinterpret_cast<float*>(pDestination), VL, 64 );
    vst1q_lane_f32( reinterpret_cast<float*>(pDestination)+2, V, 2 );
#elif defined(_XM_SSE_INTRINSICS_)
    XMVECTOR T = XM_PERMUTE_PS(V,_MM_SHUFFLE(2,2,2,2));
    _mm_storel_epi64( reinterpret_cast<__m128i*>(pDestination), _mm_castps_si128(V) );
    _mm_store_ss( &pDestination->z, T );
#endif
}
//------------------------------------------------------------------------------
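// In the SSE path of XMStoreSInt3 below, inputs above INT32_MAX are detected with
// a compare against g_XMMaxInt and forced to 0x7FFFFFFF; negative overflow and
// NaN follow the _mm_cvttps_epi32 behavior and produce 0x80000000.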
_Use_decl_annotations_
inline void XM_CALLCONV XMStoreSInt3
(
    XMINT3* pDestination,
    FXMVECTOR V
)
{
    assert(pDestination);
#if defined(_XM_NO_INTRINSICS_)
    pDestination->x = (int32_t)V.vector4_f32[0];
    pDestination->y = (int32_t)V.vector4_f32[1];
    pDestination->z = (int32_t)V.vector4_f32[2];
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    int32x4_t v = vcvtq_s32_f32(V);
    int32x2_t vL = vget_low_s32(v);
    vst1_s32( reinterpret_cast<int32_t*>(pDestination), vL );
    vst1q_lane_s32( reinterpret_cast<int32_t*>(pDestination)+2, v, 2 );
#elif defined(_XM_SSE_INTRINSICS_)
    // In case of positive overflow, detect it
    XMVECTOR vOverflow = _mm_cmpgt_ps(V,g_XMMaxInt);
    // Float to int conversion
    __m128i vResulti = _mm_cvttps_epi32(V);
    // If there was positive overflow, set to 0x7FFFFFFF
    XMVECTOR vResult = _mm_and_ps(vOverflow,g_XMAbsMask);
    vOverflow = _mm_andnot_ps(vOverflow,_mm_castsi128_ps(vResulti));
    vOverflow = _mm_or_ps(vOverflow,vResult);
    // Write 3 ints
    XMVECTOR T1 = XM_PERMUTE_PS(vOverflow,_MM_SHUFFLE(1,1,1,1));
    XMVECTOR T2 = XM_PERMUTE_PS(vOverflow,_MM_SHUFFLE(2,2,2,2));
    _mm_store_ss( reinterpret_cast<float*>(&pDestination->x), vOverflow );
    _mm_store_ss( reinterpret_cast<float*>(&pDestination->y), T1 );
    _mm_store_ss( reinterpret_cast<float*>(&pDestination->z), T2 );
#endif
}
//------------------------------------------------------------------------------
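// XMStoreUInt3 uses the same unsigned-conversion fixup as the 2-element store
// above: bias by 2^31 before the signed conversion, restore the high bit, and
// saturate anything too large for a uint32 to 0xFFFFFFFF.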
_Use_decl_annotations_
inline void XM_CALLCONV XMStoreUInt3
(
    XMUINT3* pDestination,
    FXMVECTOR V
)
{
    assert(pDestination);
#if defined(_XM_NO_INTRINSICS_)
    pDestination->x = (uint32_t)V.vector4_f32[0];
    pDestination->y = (uint32_t)V.vector4_f32[1];
    pDestination->z = (uint32_t)V.vector4_f32[2];
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    uint32x4_t v = vcvtq_u32_f32(V);
    uint32x2_t vL = vget_low_u32(v);
    vst1_u32( reinterpret_cast<uint32_t*>(pDestination), vL );
    vst1q_lane_u32( reinterpret_cast<uint32_t*>(pDestination)+2, v, 2 );
#elif defined(_XM_SSE_INTRINSICS_)
    // Clamp to >=0
    XMVECTOR vResult = _mm_max_ps(V,g_XMZero);
    // Any numbers that are too big are set to 0xFFFFFFFFU
    XMVECTOR vOverflow = _mm_cmpgt_ps(vResult,g_XMMaxUInt);
    XMVECTOR vValue = g_XMUnsignedFix;
    // Too large for a signed integer?
    XMVECTOR vMask = _mm_cmpge_ps(vResult,vValue);
    // Zero for numbers lower than 0x80000000, 32768.0f*65536.0f otherwise
    vValue = _mm_and_ps(vValue,vMask);
    // Perform fixup only on numbers too large (keeps low-bit precision)
    vResult = _mm_sub_ps(vResult,vValue);
    __m128i vResulti = _mm_cvttps_epi32(vResult);
    // Convert from signed to unsigned only if greater than 0x80000000
    vMask = _mm_and_ps(vMask,g_XMNegativeZero);
    vResult = _mm_xor_ps(_mm_castsi128_ps(vResulti),vMask);
    // On those that are too large, set to 0xFFFFFFFF
    vResult = _mm_or_ps(vResult,vOverflow);
    // Write 3 uints
    XMVECTOR T1 = XM_PERMUTE_PS(vResult,_MM_SHUFFLE(1,1,1,1));
    XMVECTOR T2 = XM_PERMUTE_PS(vResult,_MM_SHUFFLE(2,2,2,2));
    _mm_store_ss( reinterpret_cast<float*>(&pDestination->x), vResult );
    _mm_store_ss( reinterpret_cast<float*>(&pDestination->y), T1 );
    _mm_store_ss( reinterpret_cast<float*>(&pDestination->z), T2 );
#endif
}
//------------------------------------------------------------------------------
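// XMStoreInt4 performs an unaligned 16-byte store (vst1q_u32 / _mm_storeu_si128);
// use XMStoreInt4A when the destination is known to be 16-byte aligned.
// A minimal sketch (illustrative only):
//
//     uint32_t dest[4];
//     XMStoreInt4(dest, XMVectorSetInt(1u, 2u, 3u, 4u));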
_Use_decl_annotations_
inline void XM_CALLCONV XMStoreInt4
(
    uint32_t* pDestination,
    FXMVECTOR V
)
{
    assert(pDestination);
#if defined(_XM_NO_INTRINSICS_)
    pDestination[0] = V.vector4_u32[0];
    pDestination[1] = V.vector4_u32[1];
    pDestination[2] = V.vector4_u32[2];
    pDestination[3] = V.vector4_u32[3];
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    vst1q_u32( pDestination, V );
#elif defined(_XM_SSE_INTRINSICS_)
    _mm_storeu_si128( reinterpret_cast<__m128i*>(pDestination), _mm_castps_si128(V) );
#endif
}
//------------------------------------------------------------------------------
_Use_decl_annotations_
inline void XM_CALLCONV XMStoreInt4A
(
    uint32_t* pDestination,
    FXMVECTOR V
)
{
    assert(pDestination);
    assert(((uintptr_t)pDestination & 0xF) == 0);
#if defined(_XM_NO_INTRINSICS_)
    pDestination[0] = V.vector4_u32[0];
    pDestination[1] = V.vector4_u32[1];
    pDestination[2] = V.vector4_u32[2];
    pDestination[3] = V.vector4_u32[3];
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    vst1q_u32_ex( pDestination, V, 128 );
#elif defined(_XM_SSE_INTRINSICS_)
    _mm_store_si128( reinterpret_cast<__m128i*>(pDestination), _mm_castps_si128(V) );
#endif
}
//------------------------------------------------------------------------------
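// Usage sketch for XMStoreFloat4 (illustrative only):
//
//     XMFLOAT4 q;
//     XMStoreFloat4(&q, XMQuaternionIdentity());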
_Use_decl_annotations_
inline void XM_CALLCONV XMStoreFloat4
(
    XMFLOAT4* pDestination,
    FXMVECTOR V
)
{
    assert(pDestination);
#if defined(_XM_NO_INTRINSICS_)
    pDestination->x = V.vector4_f32[0];
    pDestination->y = V.vector4_f32[1];
    pDestination->z = V.vector4_f32[2];
    pDestination->w = V.vector4_f32[3];
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    vst1q_f32( reinterpret_cast<float*>(pDestination), V );
#elif defined(_XM_SSE_INTRINSICS_)
    _mm_storeu_ps( &pDestination->x, V );
#endif
}
//------------------------------------------------------------------------------
_Use_decl_annotations_
inline void XM_CALLCONV XMStoreFloat4A
(
    XMFLOAT4A* pDestination,
    FXMVECTOR V
)
{
    assert(pDestination);
    assert(((uintptr_t)pDestination & 0xF) == 0);
#if defined(_XM_NO_INTRINSICS_)
    pDestination->x = V.vector4_f32[0];
    pDestination->y = V.vector4_f32[1];
    pDestination->z = V.vector4_f32[2];
    pDestination->w = V.vector4_f32[3];
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    vst1q_f32_ex( reinterpret_cast<float*>(pDestination), V, 128 );
#elif defined(_XM_SSE_INTRINSICS_)
    _mm_store_ps( &pDestination->x, V );
#endif
}
//------------------------------------------------------------------------------
_Use_decl_annotations_
inline void XM_CALLCONV XMStoreSInt4
(
    XMINT4* pDestination,
    FXMVECTOR V
)
{
    assert(pDestination);
#if defined(_XM_NO_INTRINSICS_)
    pDestination->x = (int32_t)V.vector4_f32[0];
    pDestination->y = (int32_t)V.vector4_f32[1];
    pDestination->z = (int32_t)V.vector4_f32[2];
    pDestination->w = (int32_t)V.vector4_f32[3];
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    int32x4_t v = vcvtq_s32_f32(V);
    vst1q_s32( reinterpret_cast<int32_t*>(pDestination), v );
#elif defined(_XM_SSE_INTRINSICS_)
    // In case of positive overflow, detect it
    XMVECTOR vOverflow = _mm_cmpgt_ps(V,g_XMMaxInt);
    // Float to int conversion
    __m128i vResulti = _mm_cvttps_epi32(V);
    // If there was positive overflow, set to 0x7FFFFFFF
    XMVECTOR vResult = _mm_and_ps(vOverflow,g_XMAbsMask);
    vOverflow = _mm_andnot_ps(vOverflow,_mm_castsi128_ps(vResulti));
    vOverflow = _mm_or_ps(vOverflow,vResult);
    _mm_storeu_si128( reinterpret_cast<__m128i*>(pDestination), _mm_castps_si128(vOverflow) );
#endif
}
//------------------------------------------------------------------------------
_Use_decl_annotations_
inline void XM_CALLCONV XMStoreUInt4
(
    XMUINT4* pDestination,
    FXMVECTOR V
)
{
    assert(pDestination);
#if defined(_XM_NO_INTRINSICS_)
    pDestination->x = (uint32_t)V.vector4_f32[0];
    pDestination->y = (uint32_t)V.vector4_f32[1];
    pDestination->z = (uint32_t)V.vector4_f32[2];
    pDestination->w = (uint32_t)V.vector4_f32[3];
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    uint32x4_t v = vcvtq_u32_f32(V);
    vst1q_u32( reinterpret_cast<uint32_t*>(pDestination), v );
#elif defined(_XM_SSE_INTRINSICS_)
    // Clamp to >=0
    XMVECTOR vResult = _mm_max_ps(V,g_XMZero);
    // Any numbers that are too big are set to 0xFFFFFFFFU
    XMVECTOR vOverflow = _mm_cmpgt_ps(vResult,g_XMMaxUInt);
    XMVECTOR vValue = g_XMUnsignedFix;
    // Too large for a signed integer?
    XMVECTOR vMask = _mm_cmpge_ps(vResult,vValue);
    // Zero for numbers lower than 0x80000000, 32768.0f*65536.0f otherwise
    vValue = _mm_and_ps(vValue,vMask);
    // Perform fixup only on numbers too large (keeps low-bit precision)
    vResult = _mm_sub_ps(vResult,vValue);
    __m128i vResulti = _mm_cvttps_epi32(vResult);
    // Convert from signed to unsigned only if greater than 0x80000000
    vMask = _mm_and_ps(vMask,g_XMNegativeZero);
    vResult = _mm_xor_ps(_mm_castsi128_ps(vResulti),vMask);
    // On those that are too large, set to 0xFFFFFFFF
    vResult = _mm_or_ps(vResult,vOverflow);
    _mm_storeu_si128( reinterpret_cast<__m128i*>(pDestination), _mm_castps_si128(vResult) );
#endif
}
//------------------------------------------------------------------------------
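// XMStoreFloat3x3 packs the upper 3x3 of the row-major matrix into 9 contiguous
// floats. In the SSE path the shuffles build (m00,m01,m02,m10), (m11,m12,m20,m21)
// and m22, so two unaligned 4-float stores plus one scalar store cover all nine
// elements without writing past the structure.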
_Use_decl_annotations_
inline void XM_CALLCONV XMStoreFloat3x3
(
    XMFLOAT3X3* pDestination,
    FXMMATRIX M
)
{
    assert(pDestination);
#if defined(_XM_NO_INTRINSICS_)
    pDestination->m[0][0] = M.r[0].vector4_f32[0];
    pDestination->m[0][1] = M.r[0].vector4_f32[1];
    pDestination->m[0][2] = M.r[0].vector4_f32[2];
    pDestination->m[1][0] = M.r[1].vector4_f32[0];
    pDestination->m[1][1] = M.r[1].vector4_f32[1];
    pDestination->m[1][2] = M.r[1].vector4_f32[2];
    pDestination->m[2][0] = M.r[2].vector4_f32[0];
    pDestination->m[2][1] = M.r[2].vector4_f32[1];
    pDestination->m[2][2] = M.r[2].vector4_f32[2];
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    float32x4_t T1 = vextq_f32( M.r[0], M.r[1], 1 );
    float32x4_t T2 = vbslq_f32( g_XMMask3, M.r[0], T1 );
    vst1q_f32( &pDestination->m[0][0], T2 );
    T1 = vextq_f32( M.r[1], M.r[1], 1 );
    T2 = vcombine_f32( vget_low_f32(T1), vget_low_f32(M.r[2]) );
    vst1q_f32( &pDestination->m[1][1], T2 );
    vst1q_lane_f32( &pDestination->m[2][2], M.r[2], 2 );
#elif defined(_XM_SSE_INTRINSICS_)
    XMVECTOR vTemp1 = M.r[0];
    XMVECTOR vTemp2 = M.r[1];
    XMVECTOR vTemp3 = M.r[2];
    XMVECTOR vWork = _mm_shuffle_ps(vTemp1,vTemp2,_MM_SHUFFLE(0,0,2,2));
    vTemp1 = _mm_shuffle_ps(vTemp1,vWork,_MM_SHUFFLE(2,0,1,0));
    _mm_storeu_ps(&pDestination->m[0][0],vTemp1);
    vTemp2 = _mm_shuffle_ps(vTemp2,vTemp3,_MM_SHUFFLE(1,0,2,1));
    _mm_storeu_ps(&pDestination->m[1][1],vTemp2);
    vTemp3 = XM_PERMUTE_PS(vTemp3,_MM_SHUFFLE(2,2,2,2));
    _mm_store_ss(&pDestination->m[2][2],vTemp3);
#endif
}
//------------------------------------------------------------------------------
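// XMStoreFloat4x3 stores the 4x3 (12-float) form: each row's x,y,z is kept and
// the w components are dropped, so three 4-float stores cover rows 0-3 exactly.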
_Use_decl_annotations_
inline void XM_CALLCONV XMStoreFloat4x3
(
    XMFLOAT4X3* pDestination,
    FXMMATRIX M
)
{
    assert(pDestination);
#if defined(_XM_NO_INTRINSICS_)
    pDestination->m[0][0] = M.r[0].vector4_f32[0];
    pDestination->m[0][1] = M.r[0].vector4_f32[1];
    pDestination->m[0][2] = M.r[0].vector4_f32[2];
    pDestination->m[1][0] = M.r[1].vector4_f32[0];
    pDestination->m[1][1] = M.r[1].vector4_f32[1];
    pDestination->m[1][2] = M.r[1].vector4_f32[2];
    pDestination->m[2][0] = M.r[2].vector4_f32[0];
    pDestination->m[2][1] = M.r[2].vector4_f32[1];
    pDestination->m[2][2] = M.r[2].vector4_f32[2];
    pDestination->m[3][0] = M.r[3].vector4_f32[0];
    pDestination->m[3][1] = M.r[3].vector4_f32[1];
    pDestination->m[3][2] = M.r[3].vector4_f32[2];
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    float32x4_t T1 = vextq_f32( M.r[0], M.r[1], 1 );
    float32x4_t T2 = vbslq_f32( g_XMMask3, M.r[0], T1 );
    vst1q_f32( &pDestination->m[0][0], T2 );
    T1 = vextq_f32( M.r[1], M.r[1], 1 );
    T2 = vcombine_f32( vget_low_f32(T1), vget_low_f32(M.r[2]) );
    vst1q_f32( &pDestination->m[1][1], T2 );
    T1 = vdupq_lane_f32( vget_high_f32( M.r[2] ), 0 );
    T2 = vextq_f32( T1, M.r[3], 3 );
    vst1q_f32( &pDestination->m[2][2], T2 );
#elif defined(_XM_SSE_INTRINSICS_)
    XMVECTOR vTemp1 = M.r[0];
    XMVECTOR vTemp2 = M.r[1];
    XMVECTOR vTemp3 = M.r[2];
    XMVECTOR vTemp4 = M.r[3];
    XMVECTOR vTemp2x = _mm_shuffle_ps(vTemp2,vTemp3,_MM_SHUFFLE(1,0,2,1));
    vTemp2 = _mm_shuffle_ps(vTemp2,vTemp1,_MM_SHUFFLE(2,2,0,0));
    vTemp1 = _mm_shuffle_ps(vTemp1,vTemp2,_MM_SHUFFLE(0,2,1,0));
    vTemp3 = _mm_shuffle_ps(vTemp3,vTemp4,_MM_SHUFFLE(0,0,2,2));
    vTemp3 = _mm_shuffle_ps(vTemp3,vTemp4,_MM_SHUFFLE(2,1,2,0));
    _mm_storeu_ps(&pDestination->m[0][0],vTemp1);
    _mm_storeu_ps(&pDestination->m[1][1],vTemp2x);
    _mm_storeu_ps(&pDestination->m[2][2],vTemp3);
#endif
}
//------------------------------------------------------------------------------
_Use_decl_annotations_
inline void XM_CALLCONV XMStoreFloat4x3A
(
    XMFLOAT4X3A* pDestination,
    FXMMATRIX M
)
{
    assert(pDestination);
    assert(((uintptr_t)pDestination & 0xF) == 0);
#if defined(_XM_NO_INTRINSICS_)
    pDestination->m[0][0] = M.r[0].vector4_f32[0];
    pDestination->m[0][1] = M.r[0].vector4_f32[1];
    pDestination->m[0][2] = M.r[0].vector4_f32[2];
    pDestination->m[1][0] = M.r[1].vector4_f32[0];
    pDestination->m[1][1] = M.r[1].vector4_f32[1];
    pDestination->m[1][2] = M.r[1].vector4_f32[2];
    pDestination->m[2][0] = M.r[2].vector4_f32[0];
    pDestination->m[2][1] = M.r[2].vector4_f32[1];
    pDestination->m[2][2] = M.r[2].vector4_f32[2];
    pDestination->m[3][0] = M.r[3].vector4_f32[0];
    pDestination->m[3][1] = M.r[3].vector4_f32[1];
    pDestination->m[3][2] = M.r[3].vector4_f32[2];
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    float32x4_t T1 = vextq_f32( M.r[0], M.r[1], 1 );
    float32x4_t T2 = vbslq_f32( g_XMMask3, M.r[0], T1 );
    vst1q_f32_ex( &pDestination->m[0][0], T2, 128 );
    T1 = vextq_f32( M.r[1], M.r[1], 1 );
    T2 = vcombine_f32( vget_low_f32(T1), vget_low_f32(M.r[2]) );
    vst1q_f32_ex( &pDestination->m[1][1], T2, 128 );
    T1 = vdupq_lane_f32( vget_high_f32( M.r[2] ), 0 );
    T2 = vextq_f32( T1, M.r[3], 3 );
    vst1q_f32_ex( &pDestination->m[2][2], T2, 128 );
#elif defined(_XM_SSE_INTRINSICS_)
    // x1,y1,z1,w1
    XMVECTOR vTemp1 = M.r[0];
    // x2,y2,z2,w2
    XMVECTOR vTemp2 = M.r[1];
    // x3,y3,z3,w3
    XMVECTOR vTemp3 = M.r[2];
    // x4,y4,z4,w4
    XMVECTOR vTemp4 = M.r[3];
    // z1,z1,x2,y2
    XMVECTOR vTemp = _mm_shuffle_ps(vTemp1,vTemp2,_MM_SHUFFLE(1,0,2,2));
    // y2,z2,x3,y3 (Final)
    vTemp2 = _mm_shuffle_ps(vTemp2,vTemp3,_MM_SHUFFLE(1,0,2,1));
    // x1,y1,z1,x2 (Final)
    vTemp1 = _mm_shuffle_ps(vTemp1,vTemp,_MM_SHUFFLE(2,0,1,0));
    // z3,z3,x4,x4
    vTemp3 = _mm_shuffle_ps(vTemp3,vTemp4,_MM_SHUFFLE(0,0,2,2));
    // z3,x4,y4,z4 (Final)
    vTemp3 = _mm_shuffle_ps(vTemp3,vTemp4,_MM_SHUFFLE(2,1,2,0));
    // Store in 3 operations
    _mm_store_ps(&pDestination->m[0][0],vTemp1);
    _mm_store_ps(&pDestination->m[1][1],vTemp2);
    _mm_store_ps(&pDestination->m[2][2],vTemp3);
#endif
}
//------------------------------------------------------------------------------
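// Usage sketch for XMStoreFloat4x4 (illustrative only):
//
//     XMFLOAT4X4 world;
//     XMStoreFloat4x4(&world, XMMatrixIdentity());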
_Use_decl_annotations_
inline void XM_CALLCONV XMStoreFloat4x4
(
    XMFLOAT4X4* pDestination,
    FXMMATRIX M
)
{
    assert(pDestination);
#if defined(_XM_NO_INTRINSICS_)
    pDestination->m[0][0] = M.r[0].vector4_f32[0];
    pDestination->m[0][1] = M.r[0].vector4_f32[1];
    pDestination->m[0][2] = M.r[0].vector4_f32[2];
    pDestination->m[0][3] = M.r[0].vector4_f32[3];
    pDestination->m[1][0] = M.r[1].vector4_f32[0];
    pDestination->m[1][1] = M.r[1].vector4_f32[1];
    pDestination->m[1][2] = M.r[1].vector4_f32[2];
    pDestination->m[1][3] = M.r[1].vector4_f32[3];
    pDestination->m[2][0] = M.r[2].vector4_f32[0];
    pDestination->m[2][1] = M.r[2].vector4_f32[1];
    pDestination->m[2][2] = M.r[2].vector4_f32[2];
    pDestination->m[2][3] = M.r[2].vector4_f32[3];
    pDestination->m[3][0] = M.r[3].vector4_f32[0];
    pDestination->m[3][1] = M.r[3].vector4_f32[1];
    pDestination->m[3][2] = M.r[3].vector4_f32[2];
    pDestination->m[3][3] = M.r[3].vector4_f32[3];
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    vst1q_f32( reinterpret_cast<float*>(&pDestination->_11), M.r[0] );
    vst1q_f32( reinterpret_cast<float*>(&pDestination->_21), M.r[1] );
    vst1q_f32( reinterpret_cast<float*>(&pDestination->_31), M.r[2] );
    vst1q_f32( reinterpret_cast<float*>(&pDestination->_41), M.r[3] );
#elif defined(_XM_SSE_INTRINSICS_)
    _mm_storeu_ps( &pDestination->_11, M.r[0] );
    _mm_storeu_ps( &pDestination->_21, M.r[1] );
    _mm_storeu_ps( &pDestination->_31, M.r[2] );
    _mm_storeu_ps( &pDestination->_41, M.r[3] );
#endif
}
//------------------------------------------------------------------------------
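// XMStoreFloat4x4A is the aligned variant: the destination must be 16-byte
// aligned, which lets the SSE path use _mm_store_ps instead of _mm_storeu_ps.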
_Use_decl_annotations_
inline void XM_CALLCONV XMStoreFloat4x4A
(
    XMFLOAT4X4A* pDestination,
    FXMMATRIX M
)
{
    assert(pDestination);
    assert(((uintptr_t)pDestination & 0xF) == 0);
#if defined(_XM_NO_INTRINSICS_)
    pDestination->m[0][0] = M.r[0].vector4_f32[0];
    pDestination->m[0][1] = M.r[0].vector4_f32[1];
    pDestination->m[0][2] = M.r[0].vector4_f32[2];
    pDestination->m[0][3] = M.r[0].vector4_f32[3];
    pDestination->m[1][0] = M.r[1].vector4_f32[0];
    pDestination->m[1][1] = M.r[1].vector4_f32[1];
    pDestination->m[1][2] = M.r[1].vector4_f32[2];
    pDestination->m[1][3] = M.r[1].vector4_f32[3];
    pDestination->m[2][0] = M.r[2].vector4_f32[0];
    pDestination->m[2][1] = M.r[2].vector4_f32[1];
    pDestination->m[2][2] = M.r[2].vector4_f32[2];
    pDestination->m[2][3] = M.r[2].vector4_f32[3];
    pDestination->m[3][0] = M.r[3].vector4_f32[0];
    pDestination->m[3][1] = M.r[3].vector4_f32[1];
    pDestination->m[3][2] = M.r[3].vector4_f32[2];
    pDestination->m[3][3] = M.r[3].vector4_f32[3];
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    vst1q_f32_ex( reinterpret_cast<float*>(&pDestination->_11), M.r[0], 128 );
    vst1q_f32_ex( reinterpret_cast<float*>(&pDestination->_21), M.r[1], 128 );
    vst1q_f32_ex( reinterpret_cast<float*>(&pDestination->_31), M.r[2], 128 );
    vst1q_f32_ex( reinterpret_cast<float*>(&pDestination->_41), M.r[3], 128 );
#elif defined(_XM_SSE_INTRINSICS_)
    _mm_store_ps( &pDestination->_11, M.r[0] );
    _mm_store_ps( &pDestination->_21, M.r[1] );
    _mm_store_ps( &pDestination->_31, M.r[2] );
    _mm_store_ps( &pDestination->_41, M.r[3] );
#endif
}