  1. //-------------------------------------------------------------------------------------
  2. // DirectXPackedVector.inl -- SIMD C++ Math library
  3. //
  4. // THIS CODE AND INFORMATION IS PROVIDED "AS IS" WITHOUT WARRANTY OF
  5. // ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING BUT NOT LIMITED TO
  6. // THE IMPLIED WARRANTIES OF MERCHANTABILITY AND/OR FITNESS FOR A
  7. // PARTICULAR PURPOSE.
  8. //
  9. // Copyright (c) Microsoft Corporation. All rights reserved.
  10. //
  11. // http://go.microsoft.com/fwlink/?LinkID=615560
  12. //-------------------------------------------------------------------------------------
  13. #pragma once
  14. /****************************************************************************
  15. *
  16. * Data conversion
  17. *
  18. ****************************************************************************/
  19. //------------------------------------------------------------------------------
  20. inline float XMConvertHalfToFloat
  21. (
  22. HALF Value
  23. )
  24. {
  25. #if defined(_XM_F16C_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
  26. __m128i V1 = _mm_cvtsi32_si128( static_cast<uint32_t>(Value) );
  27. __m128 V2 = _mm_cvtph_ps( V1 );
  28. return _mm_cvtss_f32( V2 );
  29. #elif defined(_XM_ARM_NEON_INTRINSICS_) && (defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64)) && !defined(_XM_NO_INTRINSICS_)
  30. uint16x4_t vHalf = vdup_n_u16(Value);
  31. float32x4_t vFloat = vcvt_f32_f16(vreinterpret_f16_u16(vHalf));
  32. return vgetq_lane_f32(vFloat, 0);
  33. #else
  34. uint32_t Mantissa = (uint32_t)(Value & 0x03FF);
  35. uint32_t Exponent = (Value & 0x7C00);
  36. if ( Exponent == 0x7C00 ) // INF/NAN
  37. {
  38. Exponent = (uint32_t)0x8f;
  39. }
  40. else if (Exponent != 0) // The value is normalized
  41. {
  42. Exponent = (uint32_t)((Value >> 10) & 0x1F);
  43. }
  44. else if (Mantissa != 0) // The value is denormalized
  45. {
  46. // Normalize the value in the resulting float
  47. Exponent = 1;
  48. do
  49. {
  50. Exponent--;
  51. Mantissa <<= 1;
  52. } while ((Mantissa & 0x0400) == 0);
  53. Mantissa &= 0x03FF;
  54. }
  55. else // The value is zero
  56. {
  57. Exponent = (uint32_t)-112;
  58. }
  59. uint32_t Result = ((Value & 0x8000) << 16) | // Sign
  60. ((Exponent + 112) << 23) | // Exponent
  61. (Mantissa << 13); // Mantissa
  62. return reinterpret_cast<float*>(&Result)[0];
  63. #endif // !_XM_F16C_INTRINSICS_
  64. }
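// Illustrative check (not part of DirectXMath; ExampleHalfToFloatScalar is a
// hypothetical helper added only as a usage sketch). The scalar path above rebases
// the half exponent (bias 15) to the float exponent (bias 127) by adding 112 before
// shifting it into bit 23, so 1.0 stored as HALF 0x3C00 converts back to exactly 1.0f.
inline void ExampleHalfToFloatScalar()
{
    float one = XMConvertHalfToFloat(static_cast<HALF>(0x3C00));   // 1.0f
    float half = XMConvertHalfToFloat(static_cast<HALF>(0x3800));  // 0.5f
    assert(one == 1.0f && half == 0.5f);
    (void)one; (void)half;
}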
  65. //------------------------------------------------------------------------------
  66. #ifdef _PREFAST_
  67. #pragma prefast(push)
  68. #pragma prefast(disable : 26015 26019, "PREfast noise: Esp:1307" )
  69. #endif
  70. _Use_decl_annotations_
  71. inline float* XMConvertHalfToFloatStream
  72. (
  73. float* pOutputStream,
  74. size_t OutputStride,
  75. const HALF* pInputStream,
  76. size_t InputStride,
  77. size_t HalfCount
  78. )
  79. {
  80. assert(pOutputStream);
  81. assert(pInputStream);
  82. assert(InputStride >= sizeof(HALF));
  83. _Analysis_assume_(InputStride >= sizeof(HALF));
  84. assert(OutputStride >= sizeof(float));
  85. _Analysis_assume_(OutputStride >= sizeof(float));
  86. #if defined(_XM_F16C_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
  87. const uint8_t* pHalf = reinterpret_cast<const uint8_t*>(pInputStream);
  88. uint8_t* pFloat = reinterpret_cast<uint8_t*>(pOutputStream);
  89. size_t i = 0;
  90. size_t four = HalfCount >> 2;
  91. if ( four > 0 )
  92. {
  93. if (InputStride == sizeof(HALF))
  94. {
  95. if (OutputStride == sizeof(float))
  96. {
  97. if ( ((uintptr_t)pFloat & 0xF) == 0)
  98. {
  99. // Packed input, aligned & packed output
  100. for (size_t j = 0; j < four; ++j)
  101. {
  102. __m128i HV = _mm_loadl_epi64( reinterpret_cast<const __m128i*>(pHalf) );
  103. pHalf += InputStride*4;
  104. __m128 FV = _mm_cvtph_ps( HV );
  105. XM_STREAM_PS( reinterpret_cast<float*>(pFloat), FV );
  106. pFloat += OutputStride*4;
  107. i += 4;
  108. }
  109. }
  110. else
  111. {
  112. // Packed input, packed output
  113. for (size_t j = 0; j < four; ++j)
  114. {
  115. __m128i HV = _mm_loadl_epi64( reinterpret_cast<const __m128i*>(pHalf) );
  116. pHalf += InputStride*4;
  117. __m128 FV = _mm_cvtph_ps( HV );
  118. _mm_storeu_ps( reinterpret_cast<float*>(pFloat), FV );
  119. pFloat += OutputStride*4;
  120. i += 4;
  121. }
  122. }
  123. }
  124. else
  125. {
  126. // Packed input, scattered output
  127. for (size_t j = 0; j < four; ++j)
  128. {
  129. __m128i HV = _mm_loadl_epi64( reinterpret_cast<const __m128i*>(pHalf) );
  130. pHalf += InputStride*4;
  131. __m128 FV = _mm_cvtph_ps( HV );
  132. _mm_store_ss( reinterpret_cast<float*>(pFloat), FV );
  133. pFloat += OutputStride;
  134. *reinterpret_cast<int*>(pFloat) = _mm_extract_ps( FV, 1 );
  135. pFloat += OutputStride;
  136. *reinterpret_cast<int*>(pFloat) = _mm_extract_ps( FV, 2 );
  137. pFloat += OutputStride;
  138. *reinterpret_cast<int*>(pFloat) = _mm_extract_ps( FV, 3 );
  139. pFloat += OutputStride;
  140. i += 4;
  141. }
  142. }
  143. }
  144. else if (OutputStride == sizeof(float))
  145. {
  146. if ( ((uintptr_t)pFloat & 0xF) == 0)
  147. {
  148. // Scattered input, aligned & packed output
  149. for (size_t j = 0; j < four; ++j)
  150. {
  151. uint16_t H1 = *reinterpret_cast<const HALF*>(pHalf);
  152. pHalf += InputStride;
  153. uint16_t H2 = *reinterpret_cast<const HALF*>(pHalf);
  154. pHalf += InputStride;
  155. uint16_t H3 = *reinterpret_cast<const HALF*>(pHalf);
  156. pHalf += InputStride;
  157. uint16_t H4 = *reinterpret_cast<const HALF*>(pHalf);
  158. pHalf += InputStride;
  159. __m128i HV = _mm_setzero_si128();
  160. HV = _mm_insert_epi16( HV, H1, 0 );
  161. HV = _mm_insert_epi16( HV, H2, 1 );
  162. HV = _mm_insert_epi16( HV, H3, 2 );
  163. HV = _mm_insert_epi16( HV, H4, 3 );
  164. __m128 FV = _mm_cvtph_ps( HV );
  165. XM_STREAM_PS( reinterpret_cast<float*>(pFloat ), FV );
  166. pFloat += OutputStride*4;
  167. i += 4;
  168. }
  169. }
  170. else
  171. {
  172. // Scattered input, packed output
  173. for (size_t j = 0; j < four; ++j)
  174. {
  175. uint16_t H1 = *reinterpret_cast<const HALF*>(pHalf);
  176. pHalf += InputStride;
  177. uint16_t H2 = *reinterpret_cast<const HALF*>(pHalf);
  178. pHalf += InputStride;
  179. uint16_t H3 = *reinterpret_cast<const HALF*>(pHalf);
  180. pHalf += InputStride;
  181. uint16_t H4 = *reinterpret_cast<const HALF*>(pHalf);
  182. pHalf += InputStride;
  183. __m128i HV = _mm_setzero_si128();
  184. HV = _mm_insert_epi16( HV, H1, 0 );
  185. HV = _mm_insert_epi16( HV, H2, 1 );
  186. HV = _mm_insert_epi16( HV, H3, 2 );
  187. HV = _mm_insert_epi16( HV, H4, 3 );
  188. __m128 FV = _mm_cvtph_ps( HV );
  189. _mm_storeu_ps( reinterpret_cast<float*>(pFloat ), FV );
  190. pFloat += OutputStride*4;
  191. i += 4;
  192. }
  193. }
  194. }
  195. else
  196. {
  197. // Scattered input, scattered output
  198. for (size_t j = 0; j < four; ++j)
  199. {
  200. uint16_t H1 = *reinterpret_cast<const HALF*>(pHalf);
  201. pHalf += InputStride;
  202. uint16_t H2 = *reinterpret_cast<const HALF*>(pHalf);
  203. pHalf += InputStride;
  204. uint16_t H3 = *reinterpret_cast<const HALF*>(pHalf);
  205. pHalf += InputStride;
  206. uint16_t H4 = *reinterpret_cast<const HALF*>(pHalf);
  207. pHalf += InputStride;
  208. __m128i HV = _mm_setzero_si128();
  209. HV = _mm_insert_epi16(HV, H1, 0);
  210. HV = _mm_insert_epi16(HV, H2, 1);
  211. HV = _mm_insert_epi16(HV, H3, 2);
  212. HV = _mm_insert_epi16(HV, H4, 3);
  213. __m128 FV = _mm_cvtph_ps(HV);
  214. _mm_store_ss(reinterpret_cast<float*>(pFloat), FV);
  215. pFloat += OutputStride;
  216. *reinterpret_cast<int*>(pFloat) = _mm_extract_ps(FV, 1);
  217. pFloat += OutputStride;
  218. *reinterpret_cast<int*>(pFloat) = _mm_extract_ps(FV, 2);
  219. pFloat += OutputStride;
  220. *reinterpret_cast<int*>(pFloat) = _mm_extract_ps(FV, 3);
  221. pFloat += OutputStride;
  222. i += 4;
  223. }
  224. }
  225. }
  226. for (; i < HalfCount; ++i)
  227. {
  228. *reinterpret_cast<float*>(pFloat) = XMConvertHalfToFloat(reinterpret_cast<const HALF*>(pHalf)[0]);
  229. pHalf += InputStride;
  230. pFloat += OutputStride;
  231. }
  232. XM_SFENCE();
  233. return pOutputStream;
  234. #elif defined(_XM_ARM_NEON_INTRINSICS_) && (defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64)) && !defined(_XM_NO_INTRINSICS_)
  235. const uint8_t* pHalf = reinterpret_cast<const uint8_t*>(pInputStream);
  236. uint8_t* pFloat = reinterpret_cast<uint8_t*>(pOutputStream);
  237. size_t i = 0;
  238. size_t four = HalfCount >> 2;
  239. if (four > 0)
  240. {
  241. if (InputStride == sizeof(HALF))
  242. {
  243. if (OutputStride == sizeof(float))
  244. {
  245. // Packed input, packed output
  246. for (size_t j = 0; j < four; ++j)
  247. {
  248. uint16x4_t vHalf = vld1_u16(reinterpret_cast<const uint16_t*>(pHalf));
  249. pHalf += InputStride * 4;
  250. float32x4_t vFloat = vcvt_f32_f16(vreinterpret_f16_u16(vHalf));
  251. vst1q_f32(reinterpret_cast<float*>(pFloat), vFloat);
  252. pFloat += OutputStride * 4;
  253. i += 4;
  254. }
  255. }
  256. else
  257. {
  258. // Packed input, scattered output
  259. for (size_t j = 0; j < four; ++j)
  260. {
  261. uint16x4_t vHalf = vld1_u16(reinterpret_cast<const uint16_t*>(pHalf));
  262. pHalf += InputStride * 4;
  263. float32x4_t vFloat = vcvt_f32_f16(vreinterpret_f16_u16(vHalf));
  264. vst1q_lane_f32(reinterpret_cast<float*>(pFloat), vFloat, 0);
  265. pFloat += OutputStride;
  266. vst1q_lane_f32(reinterpret_cast<float*>(pFloat), vFloat, 1);
  267. pFloat += OutputStride;
  268. vst1q_lane_f32(reinterpret_cast<float*>(pFloat), vFloat, 2);
  269. pFloat += OutputStride;
  270. vst1q_lane_f32(reinterpret_cast<float*>(pFloat), vFloat, 3);
  271. pFloat += OutputStride;
  272. i += 4;
  273. }
  274. }
  275. }
  276. else if (OutputStride == sizeof(float))
  277. {
  278. // Scattered input, packed output
  279. for (size_t j = 0; j < four; ++j)
  280. {
  281. uint16_t H1 = *reinterpret_cast<const HALF*>(pHalf);
  282. pHalf += InputStride;
  283. uint16_t H2 = *reinterpret_cast<const HALF*>(pHalf);
  284. pHalf += InputStride;
  285. uint16_t H3 = *reinterpret_cast<const HALF*>(pHalf);
  286. pHalf += InputStride;
  287. uint16_t H4 = *reinterpret_cast<const HALF*>(pHalf);
  288. pHalf += InputStride;
  289. uint64_t iHalf = uint64_t(H1) | (uint64_t(H2) << 16) | (uint64_t(H3) << 32) | (uint64_t(H4) << 48);
  290. uint16x4_t vHalf = vcreate_u16(iHalf);
  291. float32x4_t vFloat = vcvt_f32_f16(vreinterpret_f16_u16(vHalf));
  292. vst1q_f32(reinterpret_cast<float*>(pFloat), vFloat);
  293. pFloat += OutputStride * 4;
  294. i += 4;
  295. }
  296. }
  297. else
  298. {
  299. // Scattered input, scattered output
  300. for (size_t j = 0; j < four; ++j)
  301. {
  302. uint16_t H1 = *reinterpret_cast<const HALF*>(pHalf);
  303. pHalf += InputStride;
  304. uint16_t H2 = *reinterpret_cast<const HALF*>(pHalf);
  305. pHalf += InputStride;
  306. uint16_t H3 = *reinterpret_cast<const HALF*>(pHalf);
  307. pHalf += InputStride;
  308. uint16_t H4 = *reinterpret_cast<const HALF*>(pHalf);
  309. pHalf += InputStride;
  310. uint64_t iHalf = uint64_t(H1) | (uint64_t(H2) << 16) | (uint64_t(H3) << 32) | (uint64_t(H4) << 48);
  311. uint16x4_t vHalf = vcreate_u16(iHalf);
  312. float32x4_t vFloat = vcvt_f32_f16(vreinterpret_f16_u16(vHalf));
  313. vst1q_lane_f32(reinterpret_cast<float*>(pFloat), vFloat, 0);
  314. pFloat += OutputStride;
  315. vst1q_lane_f32(reinterpret_cast<float*>(pFloat), vFloat, 1);
  316. pFloat += OutputStride;
  317. vst1q_lane_f32(reinterpret_cast<float*>(pFloat), vFloat, 2);
  318. pFloat += OutputStride;
  319. vst1q_lane_f32(reinterpret_cast<float*>(pFloat), vFloat, 3);
  320. pFloat += OutputStride;
  321. i += 4;
  322. }
  323. }
  324. }
  325. for (; i < HalfCount; ++i)
  326. {
  327. *reinterpret_cast<float*>(pFloat) = XMConvertHalfToFloat(reinterpret_cast<const HALF*>(pHalf)[0]);
  328. pHalf += InputStride;
  329. pFloat += OutputStride;
  330. }
  331. return pOutputStream;
  332. #else
  333. const uint8_t* pHalf = reinterpret_cast<const uint8_t*>(pInputStream);
  334. uint8_t* pFloat = reinterpret_cast<uint8_t*>(pOutputStream);
  335. for (size_t i = 0; i < HalfCount; i++)
  336. {
  337. *reinterpret_cast<float*>(pFloat) = XMConvertHalfToFloat(reinterpret_cast<const HALF*>(pHalf)[0]);
  338. pHalf += InputStride;
  339. pFloat += OutputStride;
  340. }
  341. return pOutputStream;
  342. #endif // !_XM_F16C_INTRINSICS_
  343. }
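// Usage sketch (not part of DirectXMath): gather the HALF field out of an
// interleaved array into a packed float array. HalfVertex and
// ExampleHalfToFloatGather are hypothetical names used only for illustration; the
// stride arguments are byte distances between consecutive elements, which is what
// lets the stream routine walk scattered input.
struct HalfVertex
{
    HALF     intensity;
    uint16_t flags;
};

inline void ExampleHalfToFloatGather(const HalfVertex* verts, float* out, size_t count)
{
    XMConvertHalfToFloatStream(out, sizeof(float),
                               &verts[0].intensity, sizeof(HalfVertex),
                               count);
}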
  344. //------------------------------------------------------------------------------
  345. inline HALF XMConvertFloatToHalf
  346. (
  347. float Value
  348. )
  349. {
  350. #if defined(_XM_F16C_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
  351. __m128 V1 = _mm_set_ss( Value );
  352. __m128i V2 = _mm_cvtps_ph( V1, 0 );
  353. return static_cast<HALF>( _mm_cvtsi128_si32(V2) );
  354. #elif defined(_XM_ARM_NEON_INTRINSICS_) && (defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64)) && !defined(_XM_NO_INTRINSICS_)
  355. float32x4_t vFloat = vdupq_n_f32(Value);
  356. float16x4_t vHalf = vcvt_f16_f32(vFloat);
  357. return vget_lane_u16(vreinterpret_u16_f16(vHalf), 0);
  358. #else
  359. uint32_t Result;
  360. uint32_t IValue = reinterpret_cast<uint32_t *>(&Value)[0];
  361. uint32_t Sign = (IValue & 0x80000000U) >> 16U;
  362. IValue = IValue & 0x7FFFFFFFU; // Hack off the sign
  363. if (IValue > 0x477FE000U)
  364. {
  365. // The number is too large to be represented as a half. Saturate to infinity.
  366. if (((IValue & 0x7F800000) == 0x7F800000) && ((IValue & 0x7FFFFF ) != 0))
  367. {
  368. Result = 0x7FFF; // NAN
  369. }
  370. else
  371. {
  372. Result = 0x7C00U; // INF
  373. }
  374. }
  375. else
  376. {
  377. if (IValue < 0x38800000U)
  378. {
  379. // The number is too small to be represented as a normalized half.
  380. // Convert it to a denormalized value.
  381. uint32_t Shift = 113U - (IValue >> 23U);
  382. IValue = (0x800000U | (IValue & 0x7FFFFFU)) >> Shift;
  383. }
  384. else
  385. {
  386. // Rebias the exponent to represent the value as a normalized half.
  387. IValue += 0xC8000000U;
  388. }
  389. Result = ((IValue + 0x0FFFU + ((IValue >> 13U) & 1U)) >> 13U)&0x7FFFU;
  390. }
  391. return (HALF)(Result|Sign);
  392. #endif // !_XM_F16C_INTRINSICS_
  393. }
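// Illustrative check (not part of DirectXMath; ExampleFloatToHalfScalar is a
// hypothetical helper). On the scalar path the ((IValue >> 13U) & 1U) term gives
// round-to-nearest-even, and anything above the largest finite half (65504.0f)
// saturates to the infinity pattern 0x7C00; the intrinsic paths behave the same way.
inline void ExampleFloatToHalfScalar()
{
    HALF one = XMConvertFloatToHalf(1.0f);      // 0x3C00
    HALF inf = XMConvertFloatToHalf(1.0e6f);    // 0x7C00 (saturated to +INF)
    assert(one == 0x3C00 && inf == 0x7C00);
    (void)one; (void)inf;
}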
  394. //------------------------------------------------------------------------------
  395. _Use_decl_annotations_
  396. inline HALF* XMConvertFloatToHalfStream
  397. (
  398. HALF* pOutputStream,
  399. size_t OutputStride,
  400. const float* pInputStream,
  401. size_t InputStride,
  402. size_t FloatCount
  403. )
  404. {
  405. assert(pOutputStream);
  406. assert(pInputStream);
  407. assert(InputStride >= sizeof(float));
  408. _Analysis_assume_(InputStride >= sizeof(float));
  409. assert(OutputStride >= sizeof(HALF));
  410. _Analysis_assume_(OutputStride >= sizeof(HALF));
  411. #if defined(_XM_F16C_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
  412. const uint8_t* pFloat = reinterpret_cast<const uint8_t*>(pInputStream);
  413. uint8_t* pHalf = reinterpret_cast<uint8_t*>(pOutputStream);
  414. size_t i = 0;
  415. size_t four = FloatCount >> 2;
  416. if (four > 0)
  417. {
  418. if (InputStride == sizeof(float))
  419. {
  420. if (OutputStride == sizeof(HALF))
  421. {
  422. if ( ((uintptr_t)pFloat & 0xF) == 0)
  423. {
  424. // Aligned and packed input, packed output
  425. for (size_t j = 0; j < four; ++j)
  426. {
  427. __m128 FV = _mm_load_ps( reinterpret_cast<const float*>(pFloat) );
  428. pFloat += InputStride*4;
  429. __m128i HV = _mm_cvtps_ph( FV, 0 );
  430. _mm_storel_epi64( reinterpret_cast<__m128i*>(pHalf), HV );
  431. pHalf += OutputStride*4;
  432. i += 4;
  433. }
  434. }
  435. else
  436. {
  437. // Packed input, packed output
  438. for (size_t j = 0; j < four; ++j)
  439. {
  440. __m128 FV = _mm_loadu_ps( reinterpret_cast<const float*>(pFloat) );
  441. pFloat += InputStride*4;
  442. __m128i HV = _mm_cvtps_ph( FV, 0 );
  443. _mm_storel_epi64( reinterpret_cast<__m128i*>(pHalf), HV );
  444. pHalf += OutputStride*4;
  445. i += 4;
  446. }
  447. }
  448. }
  449. else
  450. {
  451. if ( ((uintptr_t)pFloat & 0xF) == 0)
  452. {
  453. // Aligned & packed input, scattered output
  454. for (size_t j = 0; j < four; ++j)
  455. {
  456. __m128 FV = _mm_load_ps( reinterpret_cast<const float*>(pFloat) );
  457. pFloat += InputStride*4;
  458. __m128i HV = _mm_cvtps_ph( FV, 0 );
  459. *reinterpret_cast<HALF*>(pHalf) = static_cast<HALF>( _mm_extract_epi16( HV, 0 ) );
  460. pHalf += OutputStride;
  461. *reinterpret_cast<HALF*>(pHalf) = static_cast<HALF>( _mm_extract_epi16( HV, 1 ) );
  462. pHalf += OutputStride;
  463. *reinterpret_cast<HALF*>(pHalf) = static_cast<HALF>( _mm_extract_epi16( HV, 2 ) );
  464. pHalf += OutputStride;
  465. *reinterpret_cast<HALF*>(pHalf) = static_cast<HALF>( _mm_extract_epi16( HV, 3 ) );
  466. pHalf += OutputStride;
  467. i += 4;
  468. }
  469. }
  470. else
  471. {
  472. // Packed input, scattered output
  473. for (size_t j = 0; j < four; ++j)
  474. {
  475. __m128 FV = _mm_loadu_ps( reinterpret_cast<const float*>(pFloat) );
  476. pFloat += InputStride*4;
  477. __m128i HV = _mm_cvtps_ph( FV, 0 );
  478. *reinterpret_cast<HALF*>(pHalf) = static_cast<HALF>( _mm_extract_epi16( HV, 0 ) );
  479. pHalf += OutputStride;
  480. *reinterpret_cast<HALF*>(pHalf) = static_cast<HALF>( _mm_extract_epi16( HV, 1 ) );
  481. pHalf += OutputStride;
  482. *reinterpret_cast<HALF*>(pHalf) = static_cast<HALF>( _mm_extract_epi16( HV, 2 ) );
  483. pHalf += OutputStride;
  484. *reinterpret_cast<HALF*>(pHalf) = static_cast<HALF>( _mm_extract_epi16( HV, 3 ) );
  485. pHalf += OutputStride;
  486. i += 4;
  487. }
  488. }
  489. }
  490. }
  491. else if (OutputStride == sizeof(HALF))
  492. {
  493. // Scattered input, packed output
  494. for (size_t j = 0; j < four; ++j)
  495. {
  496. __m128 FV1 = _mm_load_ss( reinterpret_cast<const float*>(pFloat) );
  497. pFloat += InputStride;
  498. __m128 FV2 = _mm_broadcast_ss( reinterpret_cast<const float*>(pFloat) );
  499. pFloat += InputStride;
  500. __m128 FV3 = _mm_broadcast_ss( reinterpret_cast<const float*>(pFloat) );
  501. pFloat += InputStride;
  502. __m128 FV4 = _mm_broadcast_ss( reinterpret_cast<const float*>(pFloat) );
  503. pFloat += InputStride;
  504. __m128 FV = _mm_blend_ps( FV1, FV2, 0x2 );
  505. __m128 FT = _mm_blend_ps( FV3, FV4, 0x8 );
  506. FV = _mm_blend_ps( FV, FT, 0xC );
  507. __m128i HV = _mm_cvtps_ph( FV, 0 );
  508. _mm_storel_epi64( reinterpret_cast<__m128i*>(pHalf), HV );
  509. pHalf += OutputStride*4;
  510. i += 4;
  511. }
  512. }
  513. else
  514. {
  515. // Scattered input, scattered output
  516. for (size_t j = 0; j < four; ++j)
  517. {
  518. __m128 FV1 = _mm_load_ss(reinterpret_cast<const float*>(pFloat));
  519. pFloat += InputStride;
  520. __m128 FV2 = _mm_broadcast_ss(reinterpret_cast<const float*>(pFloat));
  521. pFloat += InputStride;
  522. __m128 FV3 = _mm_broadcast_ss(reinterpret_cast<const float*>(pFloat));
  523. pFloat += InputStride;
  524. __m128 FV4 = _mm_broadcast_ss(reinterpret_cast<const float*>(pFloat));
  525. pFloat += InputStride;
  526. __m128 FV = _mm_blend_ps(FV1, FV2, 0x2);
  527. __m128 FT = _mm_blend_ps(FV3, FV4, 0x8);
  528. FV = _mm_blend_ps(FV, FT, 0xC);
  529. __m128i HV = _mm_cvtps_ph(FV, 0);
  530. *reinterpret_cast<HALF*>(pHalf) = static_cast<HALF>(_mm_extract_epi16(HV, 0));
  531. pHalf += OutputStride;
  532. *reinterpret_cast<HALF*>(pHalf) = static_cast<HALF>(_mm_extract_epi16(HV, 1));
  533. pHalf += OutputStride;
  534. *reinterpret_cast<HALF*>(pHalf) = static_cast<HALF>(_mm_extract_epi16(HV, 2));
  535. pHalf += OutputStride;
  536. *reinterpret_cast<HALF*>(pHalf) = static_cast<HALF>(_mm_extract_epi16(HV, 3));
  537. pHalf += OutputStride;
  538. i += 4;
  539. }
  540. }
  541. }
  542. for (; i < FloatCount; ++i)
  543. {
  544. *reinterpret_cast<HALF*>(pHalf) = XMConvertFloatToHalf(reinterpret_cast<const float*>(pFloat)[0]);
  545. pFloat += InputStride;
  546. pHalf += OutputStride;
  547. }
  548. return pOutputStream;
  549. #elif defined(_XM_ARM_NEON_INTRINSICS_) && (defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64)) && !defined(_XM_NO_INTRINSICS_)
  550. const uint8_t* pFloat = reinterpret_cast<const uint8_t*>(pInputStream);
  551. uint8_t* pHalf = reinterpret_cast<uint8_t*>(pOutputStream);
  552. size_t i = 0;
  553. size_t four = FloatCount >> 2;
  554. if (four > 0)
  555. {
  556. if (InputStride == sizeof(float))
  557. {
  558. if (OutputStride == sizeof(HALF))
  559. {
  560. // Packed input, packed output
  561. for (size_t j = 0; j < four; ++j)
  562. {
  563. float32x4_t vFloat = vld1q_f32(reinterpret_cast<const float*>(pFloat));
  564. pFloat += InputStride*4;
  565. uint16x4_t vHalf = vreinterpret_u16_f16(vcvt_f16_f32(vFloat));
  566. vst1_u16(reinterpret_cast<uint16_t*>(pHalf), vHalf);
  567. pHalf += OutputStride*4;
  568. i += 4;
  569. }
  570. }
  571. else
  572. {
  573. // Packed input, scattered output
  574. for (size_t j = 0; j < four; ++j)
  575. {
  576. float32x4_t vFloat = vld1q_f32(reinterpret_cast<const float*>(pFloat));
  577. pFloat += InputStride*4;
  578. uint16x4_t vHalf = vreinterpret_u16_f16(vcvt_f16_f32(vFloat));
579. vst1_lane_u16(reinterpret_cast<uint16_t*>(pHalf), vHalf, 0);
580. pHalf += OutputStride;
581. vst1_lane_u16(reinterpret_cast<uint16_t*>(pHalf), vHalf, 1);
582. pHalf += OutputStride;
583. vst1_lane_u16(reinterpret_cast<uint16_t*>(pHalf), vHalf, 2);
584. pHalf += OutputStride;
585. vst1_lane_u16(reinterpret_cast<uint16_t*>(pHalf), vHalf, 3);
  586. pHalf += OutputStride;
  587. i += 4;
  588. }
  589. }
  590. }
  591. else if (OutputStride == sizeof(HALF))
  592. {
  593. // Scattered input, packed output
  594. for (size_t j = 0; j < four; ++j)
  595. {
  596. float32x4_t vFloat = vdupq_n_f32(0);
  597. vFloat = vld1q_lane_f32(reinterpret_cast<const float*>(pFloat), vFloat, 0);
  598. pFloat += InputStride;
  599. vFloat = vld1q_lane_f32(reinterpret_cast<const float*>(pFloat), vFloat, 1);
  600. pFloat += InputStride;
  601. vFloat = vld1q_lane_f32(reinterpret_cast<const float*>(pFloat), vFloat, 2);
  602. pFloat += InputStride;
  603. vFloat = vld1q_lane_f32(reinterpret_cast<const float*>(pFloat), vFloat, 3);
  604. pFloat += InputStride;
  605. uint16x4_t vHalf = vreinterpret_u16_f16(vcvt_f16_f32(vFloat));
  606. vst1_u16(reinterpret_cast<uint16_t*>(pHalf), vHalf);
  607. pHalf += OutputStride*4;
  608. i += 4;
  609. }
  610. }
  611. else
  612. {
  613. // Scattered input, scattered output
  614. for (size_t j = 0; j < four; ++j)
  615. {
  616. float32x4_t vFloat = vdupq_n_f32(0);
  617. vFloat = vld1q_lane_f32(reinterpret_cast<const float*>(pFloat), vFloat, 0);
  618. pFloat += InputStride;
  619. vFloat = vld1q_lane_f32(reinterpret_cast<const float*>(pFloat), vFloat, 1);
  620. pFloat += InputStride;
  621. vFloat = vld1q_lane_f32(reinterpret_cast<const float*>(pFloat), vFloat, 2);
  622. pFloat += InputStride;
  623. vFloat = vld1q_lane_f32(reinterpret_cast<const float*>(pFloat), vFloat, 3);
  624. pFloat += InputStride;
  625. uint16x4_t vHalf = vreinterpret_u16_f16(vcvt_f16_f32(vFloat));
626. vst1_lane_u16(reinterpret_cast<uint16_t*>(pHalf), vHalf, 0);
627. pHalf += OutputStride;
628. vst1_lane_u16(reinterpret_cast<uint16_t*>(pHalf), vHalf, 1);
629. pHalf += OutputStride;
630. vst1_lane_u16(reinterpret_cast<uint16_t*>(pHalf), vHalf, 2);
631. pHalf += OutputStride;
632. vst1_lane_u16(reinterpret_cast<uint16_t*>(pHalf), vHalf, 3);
  633. pHalf += OutputStride;
  634. i += 4;
  635. }
  636. }
  637. }
  638. for (; i < FloatCount; ++i)
  639. {
  640. *reinterpret_cast<HALF*>(pHalf) = XMConvertFloatToHalf(reinterpret_cast<const float*>(pFloat)[0]);
  641. pFloat += InputStride;
  642. pHalf += OutputStride;
  643. }
  644. return pOutputStream;
  645. #else
  646. const uint8_t* pFloat = reinterpret_cast<const uint8_t*>(pInputStream);
  647. uint8_t* pHalf = reinterpret_cast<uint8_t*>(pOutputStream);
  648. for (size_t i = 0; i < FloatCount; i++)
  649. {
  650. *reinterpret_cast<HALF*>(pHalf) = XMConvertFloatToHalf(reinterpret_cast<const float*>(pFloat)[0]);
  651. pFloat += InputStride;
  652. pHalf += OutputStride;
  653. }
  654. return pOutputStream;
  655. #endif // !_XM_F16C_INTRINSICS_
  656. }
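// Usage sketch (not part of DirectXMath; ExampleFloatToHalfPack is a hypothetical
// helper). With both strides equal to the element sizes the routine above takes its
// fully packed fast paths, converting four values per iteration where intrinsics
// are available.
inline void ExampleFloatToHalfPack(const float* src, HALF* dst, size_t count)
{
    XMConvertFloatToHalfStream(dst, sizeof(HALF), src, sizeof(float), count);
}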
  657. #ifdef _PREFAST_
  658. #pragma prefast(pop)
  659. #endif
  660. /****************************************************************************
  661. *
  662. * Vector and matrix load operations
  663. *
  664. ****************************************************************************/
  665. #ifdef _PREFAST_
  666. #pragma prefast(push)
  667. #pragma prefast(disable:28931, "PREfast noise: Esp:1266")
  668. #endif
  669. _Use_decl_annotations_
  670. inline XMVECTOR XM_CALLCONV XMLoadColor
  671. (
  672. const XMCOLOR* pSource
  673. )
  674. {
  675. assert(pSource);
  676. #if defined(_XM_NO_INTRINSICS_)
  677. // int32_t -> Float conversions are done in one instruction.
  678. // uint32_t -> Float calls a runtime function. Keep in int32_t
  679. int32_t iColor = (int32_t)(pSource->c);
  680. XMVECTORF32 vColor = { { {
  681. (float) ((iColor >> 16) & 0xFF) * (1.0f / 255.0f),
  682. (float) ((iColor >> 8) & 0xFF) * (1.0f / 255.0f),
  683. (float) (iColor & 0xFF) * (1.0f / 255.0f),
  684. (float) ((iColor >> 24) & 0xFF) * (1.0f / 255.0f)
  685. } } };
  686. return vColor.v;
  687. #elif defined(_XM_ARM_NEON_INTRINSICS_)
  688. uint32_t bgra = pSource->c;
  689. uint32_t rgba = (bgra & 0xFF00FF00) | ((bgra >> 16) & 0xFF) | ((bgra << 16) & 0xFF0000);
  690. uint32x2_t vInt8 = vdup_n_u32(rgba);
  691. uint16x8_t vInt16 = vmovl_u8( vreinterpret_u8_u32(vInt8) );
  692. uint32x4_t vInt = vmovl_u16( vget_low_u16(vInt16) );
  693. float32x4_t R = vcvtq_f32_u32(vInt);
  694. return vmulq_n_f32( R, 1.0f/255.0f );
  695. #elif defined(_XM_SSE_INTRINSICS_)
  696. // Splat the color in all four entries
  697. __m128i vInt = _mm_set1_epi32(pSource->c);
  698. // Shift R&0xFF0000, G&0xFF00, B&0xFF, A&0xFF000000
  699. vInt = _mm_and_si128(vInt,g_XMMaskA8R8G8B8);
  700. // a is unsigned! Flip the bit to convert the order to signed
  701. vInt = _mm_xor_si128(vInt,g_XMFlipA8R8G8B8);
  702. // Convert to floating point numbers
  703. XMVECTOR vTemp = _mm_cvtepi32_ps(vInt);
  704. // RGB + 0, A + 0x80000000.f to undo the signed order.
  705. vTemp = _mm_add_ps(vTemp,g_XMFixAA8R8G8B8);
  706. // Convert 0-255 to 0.0f-1.0f
  707. return _mm_mul_ps(vTemp,g_XMNormalizeA8R8G8B8);
  708. #endif
  709. }
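// Usage sketch (not part of DirectXMath; ExampleLoadColor is a hypothetical helper).
// XMCOLOR packs a color as 0xAARRGGBB, so 0xFF0000FF decodes to RGBA
// (0.0f, 0.0f, 1.0f, 1.0f) once each channel is scaled by 1/255.
inline XMVECTOR ExampleLoadColor()
{
    XMCOLOR blue(0xFF0000FFu);      // A=255, R=0, G=0, B=255
    return XMLoadColor(&blue);      // (0.0f, 0.0f, 1.0f, 1.0f)
}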
  710. //------------------------------------------------------------------------------
  711. _Use_decl_annotations_
  712. inline XMVECTOR XM_CALLCONV XMLoadHalf2
  713. (
  714. const XMHALF2* pSource
  715. )
  716. {
  717. assert(pSource);
  718. #if defined(_XM_F16C_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
  719. __m128 V = _mm_load_ss( reinterpret_cast<const float*>(pSource) );
  720. return _mm_cvtph_ps( _mm_castps_si128( V ) );
  721. #else
  722. XMVECTORF32 vResult = { { {
  723. XMConvertHalfToFloat(pSource->x),
  724. XMConvertHalfToFloat(pSource->y),
  725. 0.0f,
  726. 0.0f
  727. } } };
  728. return vResult.v;
  729. #endif // !_XM_F16C_INTRINSICS_
  730. }
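// Usage sketch (not part of DirectXMath; ExampleLoadHalf2 is a hypothetical helper).
// XMLoadHalf2 expands the two HALF fields into the x/y lanes; z and w come back as
// zero on every code path above.
inline XMVECTOR ExampleLoadHalf2()
{
    XMHALF2 h;
    h.x = XMConvertFloatToHalf(0.25f);
    h.y = XMConvertFloatToHalf(-2.0f);
    return XMLoadHalf2(&h);         // (0.25f, -2.0f, 0.0f, 0.0f)
}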
  731. //------------------------------------------------------------------------------
  732. _Use_decl_annotations_
  733. inline XMVECTOR XM_CALLCONV XMLoadShortN2
  734. (
  735. const XMSHORTN2* pSource
  736. )
  737. {
  738. assert(pSource);
  739. #if defined(_XM_NO_INTRINSICS_)
  740. XMVECTORF32 vResult = { { {
  741. (pSource->x == -32768) ? -1.f : ((float) pSource->x * (1.0f / 32767.0f)),
  742. (pSource->y == -32768) ? -1.f : ((float) pSource->y * (1.0f / 32767.0f)),
  743. 0.0f,
  744. 0.0f
  745. } } };
  746. return vResult.v;
  747. #elif defined(_XM_ARM_NEON_INTRINSICS_)
  748. uint32x2_t vInt16 = vld1_dup_u32( reinterpret_cast<const uint32_t*>( pSource ) );
  749. int32x4_t vInt = vmovl_s16( vreinterpret_s16_u32(vInt16) );
  750. vInt = vandq_s32( vInt, g_XMMaskXY );
  751. float32x4_t R = vcvtq_f32_s32(vInt);
  752. R = vmulq_n_f32( R, 1.0f/32767.0f );
  753. return vmaxq_f32( R, vdupq_n_f32(-1.f) );
  754. #elif defined(_XM_SSE_INTRINSICS_)
  755. // Splat the two shorts in all four entries (WORD alignment okay,
  756. // DWORD alignment preferred)
  757. __m128 vTemp = _mm_load_ps1(reinterpret_cast<const float *>(&pSource->x));
  758. // Mask x&0xFFFF, y&0xFFFF0000,z&0,w&0
  759. vTemp = _mm_and_ps(vTemp,g_XMMaskX16Y16);
  760. // x needs to be sign extended
  761. vTemp = _mm_xor_ps(vTemp,g_XMFlipX16Y16);
  762. // Convert to floating point numbers
  763. vTemp = _mm_cvtepi32_ps(_mm_castps_si128(vTemp));
  764. // x - 0x8000 to undo the signed order.
  765. vTemp = _mm_add_ps(vTemp,g_XMFixX16Y16);
  766. // Convert -1.0f - 1.0f
  767. vTemp = _mm_mul_ps(vTemp,g_XMNormalizeX16Y16);
  768. // Clamp result (for case of -32768)
  769. return _mm_max_ps( vTemp, g_XMNegativeOne );
  770. #endif
  771. }
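// Usage sketch (not part of DirectXMath; ExampleLoadShortN2 is a hypothetical
// helper). SNORM16 decoding divides by 32767 and clamps, so -32768 and -32767 both
// load as exactly -1.0f while 32767 loads as 1.0f.
inline XMVECTOR ExampleLoadShortN2()
{
    XMSHORTN2 sn;
    sn.x = -32768;
    sn.y = 32767;
    return XMLoadShortN2(&sn);      // approximately (-1.0f, 1.0f, 0.0f, 0.0f)
}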
  772. //------------------------------------------------------------------------------
  773. _Use_decl_annotations_
  774. inline XMVECTOR XM_CALLCONV XMLoadShort2
  775. (
  776. const XMSHORT2* pSource
  777. )
  778. {
  779. assert(pSource);
  780. #if defined(_XM_NO_INTRINSICS_)
  781. XMVECTORF32 vResult = { { {
  782. (float) pSource->x,
  783. (float) pSource->y,
  784. 0.f,
  785. 0.f
  786. } } };
  787. return vResult.v;
  788. #elif defined(_XM_ARM_NEON_INTRINSICS_)
  789. uint32x2_t vInt16 = vld1_dup_u32( reinterpret_cast<const uint32_t*>( pSource ) );
  790. int32x4_t vInt = vmovl_s16( vreinterpret_s16_u32(vInt16) );
  791. vInt = vandq_s32( vInt, g_XMMaskXY );
  792. return vcvtq_f32_s32(vInt);
  793. #elif defined(_XM_SSE_INTRINSICS_)
  794. // Splat the two shorts in all four entries (WORD alignment okay,
  795. // DWORD alignment preferred)
  796. __m128 vTemp = _mm_load_ps1(reinterpret_cast<const float *>(&pSource->x));
  797. // Mask x&0xFFFF, y&0xFFFF0000,z&0,w&0
  798. vTemp = _mm_and_ps(vTemp,g_XMMaskX16Y16);
  799. // x needs to be sign extended
  800. vTemp = _mm_xor_ps(vTemp,g_XMFlipX16Y16);
  801. // Convert to floating point numbers
  802. vTemp = _mm_cvtepi32_ps(_mm_castps_si128(vTemp));
  803. // x - 0x8000 to undo the signed order.
  804. vTemp = _mm_add_ps(vTemp,g_XMFixX16Y16);
  805. // Y is 65536 too large
  806. return _mm_mul_ps(vTemp,g_XMFixupY16);
  807. #endif
  808. }
  809. //------------------------------------------------------------------------------
  810. _Use_decl_annotations_
  811. inline XMVECTOR XM_CALLCONV XMLoadUShortN2
  812. (
  813. const XMUSHORTN2* pSource
  814. )
  815. {
  816. assert(pSource);
  817. #if defined(_XM_NO_INTRINSICS_)
  818. XMVECTORF32 vResult = { { {
  819. (float) pSource->x / 65535.0f,
  820. (float) pSource->y / 65535.0f,
  821. 0.f,
  822. 0.f
  823. } } };
  824. return vResult.v;
  825. #elif defined(_XM_ARM_NEON_INTRINSICS_)
  826. uint32x2_t vInt16 = vld1_dup_u32( reinterpret_cast<const uint32_t*>( pSource ) );
  827. uint32x4_t vInt = vmovl_u16( vreinterpret_u16_u32(vInt16) );
  828. vInt = vandq_u32( vInt, g_XMMaskXY );
  829. float32x4_t R = vcvtq_f32_u32(vInt);
  830. R = vmulq_n_f32( R, 1.0f/65535.0f );
  831. return vmaxq_f32( R, vdupq_n_f32(-1.f) );
  832. #elif defined(_XM_SSE_INTRINSICS_)
  833. static const XMVECTORF32 FixupY16 = { { { 1.0f / 65535.0f, 1.0f / (65535.0f*65536.0f), 0.0f, 0.0f } } };
  834. static const XMVECTORF32 FixaddY16 = { { { 0, 32768.0f*65536.0f, 0, 0 } } };
  835. // Splat the two shorts in all four entries (WORD alignment okay,
  836. // DWORD alignment preferred)
  837. __m128 vTemp = _mm_load_ps1(reinterpret_cast<const float *>(&pSource->x));
  838. // Mask x&0xFFFF, y&0xFFFF0000,z&0,w&0
  839. vTemp = _mm_and_ps(vTemp,g_XMMaskX16Y16);
  840. // y needs to be sign flipped
  841. vTemp = _mm_xor_ps(vTemp,g_XMFlipY);
  842. // Convert to floating point numbers
  843. vTemp = _mm_cvtepi32_ps(_mm_castps_si128(vTemp));
  844. // y + 0x8000 to undo the signed order.
  845. vTemp = _mm_add_ps(vTemp,FixaddY16);
  846. // Y is 65536 times too large
  847. vTemp = _mm_mul_ps(vTemp,FixupY16);
  848. return vTemp;
  849. #endif
  850. }
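// Usage sketch (not part of DirectXMath; ExampleLoadUShortN2 is a hypothetical
// helper). UNORM16 decoding divides by 65535, so 0xFFFF loads as 1.0f and 0x8000
// loads as 32768/65535, a little over one half.
inline XMVECTOR ExampleLoadUShortN2()
{
    XMUSHORTN2 un;
    un.x = 0xFFFFu;
    un.y = 0x8000u;
    return XMLoadUShortN2(&un);     // approximately (1.0f, 0.500008f, 0.0f, 0.0f)
}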
  851. //------------------------------------------------------------------------------
  852. _Use_decl_annotations_
  853. inline XMVECTOR XM_CALLCONV XMLoadUShort2
  854. (
  855. const XMUSHORT2* pSource
  856. )
  857. {
  858. assert(pSource);
  859. #if defined(_XM_NO_INTRINSICS_)
  860. XMVECTORF32 vResult = { { {
  861. (float) pSource->x,
  862. (float) pSource->y,
  863. 0.f,
  864. 0.f
  865. } } };
  866. return vResult.v;
  867. #elif defined(_XM_ARM_NEON_INTRINSICS_)
  868. uint32x2_t vInt16 = vld1_dup_u32( reinterpret_cast<const uint32_t*>( pSource ) );
  869. uint32x4_t vInt = vmovl_u16( vreinterpret_u16_u32(vInt16) );
  870. vInt = vandq_u32( vInt, g_XMMaskXY );
  871. return vcvtq_f32_u32(vInt);
  872. #elif defined(_XM_SSE_INTRINSICS_)
  873. static const XMVECTORF32 FixaddY16 = { { { 0, 32768.0f, 0, 0 } } };
  874. // Splat the two shorts in all four entries (WORD alignment okay,
  875. // DWORD alignment preferred)
  876. __m128 vTemp = _mm_load_ps1(reinterpret_cast<const float *>(&pSource->x));
  877. // Mask x&0xFFFF, y&0xFFFF0000,z&0,w&0
  878. vTemp = _mm_and_ps(vTemp,g_XMMaskX16Y16);
  879. // y needs to be sign flipped
  880. vTemp = _mm_xor_ps(vTemp,g_XMFlipY);
  881. // Convert to floating point numbers
  882. vTemp = _mm_cvtepi32_ps(_mm_castps_si128(vTemp));
  883. // Y is 65536 times too large
  884. vTemp = _mm_mul_ps(vTemp,g_XMFixupY16);
  885. // y + 0x8000 to undo the signed order.
  886. vTemp = _mm_add_ps(vTemp,FixaddY16);
  887. return vTemp;
  888. #endif
  889. }
  890. //------------------------------------------------------------------------------
  891. _Use_decl_annotations_
  892. inline XMVECTOR XM_CALLCONV XMLoadByteN2
  893. (
  894. const XMBYTEN2* pSource
  895. )
  896. {
  897. assert(pSource);
  898. #if defined(_XM_NO_INTRINSICS_)
  899. XMVECTORF32 vResult = { { {
  900. (pSource->x == -128) ? -1.f : ((float) pSource->x * (1.0f / 127.0f)),
  901. (pSource->y == -128) ? -1.f : ((float) pSource->y * (1.0f / 127.0f)),
  902. 0.0f,
  903. 0.0f
  904. } } };
  905. return vResult.v;
  906. #elif defined(_XM_ARM_NEON_INTRINSICS_)
  907. uint16x4_t vInt8 = vld1_dup_u16( reinterpret_cast<const uint16_t*>( pSource ) );
  908. int16x8_t vInt16 = vmovl_s8( vreinterpret_s8_u16(vInt8) );
  909. int32x4_t vInt = vmovl_s16( vget_low_s16( vInt16 ) );
  910. vInt = vandq_s32( vInt, g_XMMaskXY );
  911. float32x4_t R = vcvtq_f32_s32(vInt);
  912. R = vmulq_n_f32( R, 1.0f/127.0f );
  913. return vmaxq_f32( R, vdupq_n_f32(-1.f) );
  914. #elif defined(_XM_SSE_INTRINSICS_)
  915. static const XMVECTORF32 Scale = { { { 1.0f / 127.0f, 1.0f / (127.0f*256.0f), 0, 0 } } };
  916. static const XMVECTORU32 Mask = { { { 0xFF, 0xFF00, 0, 0 } } };
  917. // Splat the color in all four entries (x,z,y,w)
  918. XMVECTOR vTemp = _mm_load1_ps(reinterpret_cast<const float *>(&pSource->x));
  919. // Mask
  920. vTemp = _mm_and_ps(vTemp,Mask);
  921. // x,y and z are unsigned! Flip the bits to convert the order to signed
  922. vTemp = _mm_xor_ps(vTemp,g_XMXorByte4);
  923. // Convert to floating point numbers
  924. vTemp = _mm_cvtepi32_ps(_mm_castps_si128(vTemp));
  925. // x, y and z - 0x80 to complete the conversion
  926. vTemp = _mm_add_ps(vTemp,g_XMAddByte4);
  927. // Fix y, z and w because they are too large
  928. vTemp = _mm_mul_ps(vTemp,Scale);
  929. // Clamp result (for case of -128)
  930. return _mm_max_ps( vTemp, g_XMNegativeOne );
  931. #endif
  932. }
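// Usage sketch (not part of DirectXMath; ExampleLoadByteN2 is a hypothetical
// helper). SNORM8 decoding divides by 127 and clamps, so -128 and -127 both load as
// -1.0f while 127 loads as 1.0f.
inline XMVECTOR ExampleLoadByteN2()
{
    XMBYTEN2 bn;
    bn.x = -128;
    bn.y = 127;
    return XMLoadByteN2(&bn);       // approximately (-1.0f, 1.0f, 0.0f, 0.0f)
}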
  933. //------------------------------------------------------------------------------
  934. _Use_decl_annotations_
  935. inline XMVECTOR XM_CALLCONV XMLoadByte2
  936. (
  937. const XMBYTE2* pSource
  938. )
  939. {
  940. assert(pSource);
  941. #if defined(_XM_NO_INTRINSICS_)
  942. XMVECTORF32 vResult = { { {
  943. (float) pSource->x,
  944. (float) pSource->y,
  945. 0.0f,
  946. 0.0f
  947. } } };
  948. return vResult.v;
  949. #elif defined(_XM_ARM_NEON_INTRINSICS_)
  950. uint16x4_t vInt8 = vld1_dup_u16( reinterpret_cast<const uint16_t*>( pSource ) );
  951. int16x8_t vInt16 = vmovl_s8( vreinterpret_s8_u16(vInt8) );
  952. int32x4_t vInt = vmovl_s16( vget_low_s16(vInt16) );
  953. vInt = vandq_s32( vInt, g_XMMaskXY );
  954. return vcvtq_f32_s32(vInt);
  955. #elif defined(_XM_SSE_INTRINSICS_)
  956. static const XMVECTORF32 Scale = { { { 1.0f, 1.0f / 256.0f, 1.0f / 65536.0f, 1.0f / (65536.0f*256.0f) } } };
  957. static const XMVECTORU32 Mask = { { { 0xFF, 0xFF00, 0, 0 } } };
  958. // Splat the color in all four entries (x,z,y,w)
  959. XMVECTOR vTemp = _mm_load1_ps(reinterpret_cast<const float *>(&pSource->x));
  960. // Mask
  961. vTemp = _mm_and_ps(vTemp,Mask);
  962. // x,y and z are unsigned! Flip the bits to convert the order to signed
  963. vTemp = _mm_xor_ps(vTemp,g_XMXorByte4);
  964. // Convert to floating point numbers
  965. vTemp = _mm_cvtepi32_ps(_mm_castps_si128(vTemp));
  966. // x, y and z - 0x80 to complete the conversion
  967. vTemp = _mm_add_ps(vTemp,g_XMAddByte4);
  968. // Fix y, z and w because they are too large
  969. return _mm_mul_ps(vTemp,Scale);
  970. #endif
  971. }
  972. //------------------------------------------------------------------------------
  973. _Use_decl_annotations_
  974. inline XMVECTOR XM_CALLCONV XMLoadUByteN2
  975. (
  976. const XMUBYTEN2* pSource
  977. )
  978. {
  979. assert(pSource);
  980. #if defined(_XM_NO_INTRINSICS_)
  981. XMVECTORF32 vResult = { { {
  982. (float) pSource->x * (1.0f / 255.0f),
  983. (float) pSource->y * (1.0f / 255.0f),
  984. 0.0f,
  985. 0.0f
  986. } } };
  987. return vResult.v;
  988. #elif defined(_XM_ARM_NEON_INTRINSICS_)
  989. uint16x4_t vInt8 = vld1_dup_u16( reinterpret_cast<const uint16_t*>( pSource ) );
  990. uint16x8_t vInt16 = vmovl_u8( vreinterpret_u8_u16(vInt8) );
  991. uint32x4_t vInt = vmovl_u16( vget_low_u16(vInt16) );
  992. vInt = vandq_u32( vInt, g_XMMaskXY );
  993. float32x4_t R = vcvtq_f32_u32(vInt);
  994. return vmulq_n_f32( R, 1.0f/255.0f );
  995. #elif defined(_XM_SSE_INTRINSICS_)
  996. static const XMVECTORF32 Scale = { { { 1.0f / 255.0f, 1.0f / (255.0f*256.0f), 0, 0 } } };
  997. static const XMVECTORU32 Mask = { { { 0xFF, 0xFF00, 0, 0 } } };
  998. // Splat the color in all four entries (x,z,y,w)
  999. XMVECTOR vTemp = _mm_load1_ps(reinterpret_cast<const float *>(&pSource->x));
  1000. // Mask
  1001. vTemp = _mm_and_ps(vTemp,Mask);
  1002. // w is signed! Flip the bits to convert the order to unsigned
  1003. vTemp = _mm_xor_ps(vTemp,g_XMFlipW);
  1004. // Convert to floating point numbers
  1005. vTemp = _mm_cvtepi32_ps(_mm_castps_si128(vTemp));
  1006. // w + 0x80 to complete the conversion
  1007. vTemp = _mm_add_ps(vTemp,g_XMAddUDec4);
  1008. // Fix y, z and w because they are too large
  1009. return _mm_mul_ps(vTemp,Scale);
  1010. #endif
  1011. }
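// Usage sketch (not part of DirectXMath; ExampleLoadUByteN2 is a hypothetical
// helper). UNORM8 decoding divides by 255, so the byte pair (255, 64) loads as
// roughly (1.0f, 0.25f).
inline XMVECTOR ExampleLoadUByteN2()
{
    XMUBYTEN2 ubn;
    ubn.x = 255;
    ubn.y = 64;
    return XMLoadUByteN2(&ubn);     // approximately (1.0f, 0.251f, 0.0f, 0.0f)
}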
  1012. //------------------------------------------------------------------------------
_Use_decl_annotations_
inline XMVECTOR XM_CALLCONV XMLoadUByte2
(
    const XMUBYTE2* pSource
)
{
    assert(pSource);
#if defined(_XM_NO_INTRINSICS_)
    XMVECTORF32 vResult = { { {
        (float) pSource->x,
        (float) pSource->y,
        0.0f,
        0.0f
    } } };
    return vResult.v;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    uint16x4_t vInt8 = vld1_dup_u16( reinterpret_cast<const uint16_t*>( pSource ) );
    uint16x8_t vInt16 = vmovl_u8( vreinterpret_u8_u16(vInt8) );
    uint32x4_t vInt = vmovl_u16( vget_low_u16(vInt16) );
    vInt = vandq_u32( vInt, g_XMMaskXY );
    return vcvtq_f32_u32(vInt);
#elif defined(_XM_SSE_INTRINSICS_)
    static const XMVECTORF32 Scale = { { { 1.0f, 1.0f / 256.0f, 0, 0 } } };
    static const XMVECTORU32 Mask = { { { 0xFF, 0xFF00, 0, 0 } } };
    // Splat the color in all four entries (x,z,y,w)
    XMVECTOR vTemp = _mm_load1_ps(reinterpret_cast<const float *>(&pSource->x));
    // Mask
    vTemp = _mm_and_ps(vTemp,Mask);
    // w is signed! Flip the bits to convert the order to unsigned
    vTemp = _mm_xor_ps(vTemp,g_XMFlipW);
    // Convert to floating point numbers
    vTemp = _mm_cvtepi32_ps(_mm_castps_si128(vTemp));
    // w + 0x80 to complete the conversion
    vTemp = _mm_add_ps(vTemp,g_XMAddUDec4);
    // Fix y, z and w because they are too large
    return _mm_mul_ps(vTemp,Scale);
#endif
}
//------------------------------------------------------------------------------
_Use_decl_annotations_
inline XMVECTOR XM_CALLCONV XMLoadU565
(
    const XMU565* pSource
)
{
    assert(pSource);
#if defined(_XM_NO_INTRINSICS_)
    XMVECTORF32 vResult = { { {
        float(pSource->v & 0x1F),
        float((pSource->v >> 5) & 0x3F),
        float((pSource->v >> 11) & 0x1F),
        0.f,
    } } };
    return vResult.v;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    static const XMVECTORI32 U565And = { { { 0x1F, 0x3F << 5, 0x1F << 11, 0 } } };
    static const XMVECTORF32 U565Mul = { { { 1.0f, 1.0f / 32.0f, 1.0f / 2048.f, 0 } } };
    uint16x4_t vInt16 = vld1_dup_u16( reinterpret_cast<const uint16_t*>( pSource ) );
    uint32x4_t vInt = vmovl_u16( vInt16 );
    vInt = vandq_u32(vInt,U565And);
    float32x4_t R = vcvtq_f32_u32(vInt);
    return vmulq_f32(R,U565Mul);
#elif defined(_XM_SSE_INTRINSICS_)
    static const XMVECTORI32 U565And = { { { 0x1F, 0x3F << 5, 0x1F << 11, 0 } } };
    static const XMVECTORF32 U565Mul = { { { 1.0f, 1.0f / 32.0f, 1.0f / 2048.f, 0 } } };
    // Get the 32 bit value and splat it
    XMVECTOR vResult = _mm_load_ps1(reinterpret_cast<const float *>(&pSource->v));
    // Mask off x, y and z
    vResult = _mm_and_ps(vResult,U565And);
    // Convert to float
    vResult = _mm_cvtepi32_ps(_mm_castps_si128(vResult));
    // Normalize x, y, and z
    vResult = _mm_mul_ps(vResult,U565Mul);
    return vResult;
#endif
}
//------------------------------------------------------------------------------
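// Example usage of XMLoadU565 above (illustrative sketch, not part of the
// original header; the literal value is an assumption for the example):
//
//     XMU565 packed;
//     packed.v = 0xF81F;                   // x = 0x1F, y = 0x00, z = 0x1F
//     XMVECTOR v = XMLoadU565(&packed);    // (31.0f, 0.0f, 31.0f, 0.0f)
//
// Note the loaded components are the raw 5/6/5-bit integers; dividing by 31,
// 63 and 31 respectively would be required to normalize them to [0, 1].
//------------------------------------------------------------------------------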
_Use_decl_annotations_
inline XMVECTOR XM_CALLCONV XMLoadFloat3PK
(
    const XMFLOAT3PK* pSource
)
{
    assert(pSource);
    ALIGN(16) uint32_t ALIGN_END(16) Result[4];
    uint32_t Mantissa;
    uint32_t Exponent;
    // X Channel (6-bit mantissa)
    Mantissa = pSource->xm;
    if ( pSource->xe == 0x1f ) // INF or NAN
    {
        Result[0] = 0x7f800000 | (pSource->xm << 17);
    }
    else
    {
        if ( pSource->xe != 0 ) // The value is normalized
        {
            Exponent = pSource->xe;
        }
        else if (Mantissa != 0) // The value is denormalized
        {
            // Normalize the value in the resulting float
            Exponent = 1;
            do
            {
                Exponent--;
                Mantissa <<= 1;
            } while ((Mantissa & 0x40) == 0);
            Mantissa &= 0x3F;
        }
        else // The value is zero
        {
            Exponent = (uint32_t)-112;
        }
        Result[0] = ((Exponent + 112) << 23) | (Mantissa << 17);
    }
    // Y Channel (6-bit mantissa)
    Mantissa = pSource->ym;
    if ( pSource->ye == 0x1f ) // INF or NAN
    {
        Result[1] = 0x7f800000 | (pSource->ym << 17);
    }
    else
    {
        if ( pSource->ye != 0 ) // The value is normalized
        {
            Exponent = pSource->ye;
        }
        else if (Mantissa != 0) // The value is denormalized
        {
            // Normalize the value in the resulting float
            Exponent = 1;
            do
            {
                Exponent--;
                Mantissa <<= 1;
            } while ((Mantissa & 0x40) == 0);
            Mantissa &= 0x3F;
        }
        else // The value is zero
        {
            Exponent = (uint32_t)-112;
        }
        Result[1] = ((Exponent + 112) << 23) | (Mantissa << 17);
    }
    // Z Channel (5-bit mantissa)
    Mantissa = pSource->zm;
    if ( pSource->ze == 0x1f ) // INF or NAN
    {
        Result[2] = 0x7f800000 | (pSource->zm << 17);
    }
    else
    {
        if ( pSource->ze != 0 ) // The value is normalized
        {
            Exponent = pSource->ze;
        }
        else if (Mantissa != 0) // The value is denormalized
        {
            // Normalize the value in the resulting float
            Exponent = 1;
            do
            {
                Exponent--;
                Mantissa <<= 1;
            } while ((Mantissa & 0x20) == 0);
            Mantissa &= 0x1F;
        }
        else // The value is zero
        {
            Exponent = (uint32_t)-112;
        }
        Result[2] = ((Exponent + 112) << 23) | (Mantissa << 18);
    }
    return XMLoadFloat3A( reinterpret_cast<const XMFLOAT3A*>(&Result) );
}
//------------------------------------------------------------------------------
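// Worked example for the X channel above (illustrative, not part of the
// original header): xe = 15 with xm = 0 decodes to 1.0f, because the float11
// exponent bias is 15 and the float32 bias is 127, so the +112 rebias gives
// ((15 + 112) << 23) == 0x3F800000, which is the float32 encoding of 1.0f.
//------------------------------------------------------------------------------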
_Use_decl_annotations_
inline XMVECTOR XM_CALLCONV XMLoadFloat3SE
(
    const XMFLOAT3SE* pSource
)
{
    assert(pSource);
    union { float f; int32_t i; } fi;
    fi.i = 0x33800000 + (pSource->e << 23);
    float Scale = fi.f;
    XMVECTORF32 v = { { {
        Scale * float(pSource->xm),
        Scale * float(pSource->ym),
        Scale * float(pSource->zm),
        1.0f } } };
    return v;
}
//------------------------------------------------------------------------------
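// Worked example for XMLoadFloat3SE above (illustrative, not part of the
// original header): 0x33800000 is the float32 encoding of 2^-24, so
// Scale == 2^(e - 24) and each channel decodes as mantissa * 2^(e - 24).
// With a shared exponent of e = 24 the 9-bit mantissas come through
// unscaled, e.g. xm = 3 decodes to 3.0f.
//------------------------------------------------------------------------------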
_Use_decl_annotations_
inline XMVECTOR XM_CALLCONV XMLoadHalf4
(
    const XMHALF4* pSource
)
{
    assert(pSource);
#if defined(_XM_F16C_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
    __m128i V = _mm_loadl_epi64( reinterpret_cast<const __m128i*>(pSource) );
    return _mm_cvtph_ps( V );
#else
    XMVECTORF32 vResult = { { {
        XMConvertHalfToFloat(pSource->x),
        XMConvertHalfToFloat(pSource->y),
        XMConvertHalfToFloat(pSource->z),
        XMConvertHalfToFloat(pSource->w)
    } } };
    return vResult.v;
#endif // !_XM_F16C_INTRINSICS_
}
//------------------------------------------------------------------------------
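// Example usage of XMLoadHalf4 above (illustrative sketch, not part of the
// original header; the input values are assumptions for the example):
//
//     XMHALF4 h(1.0f, 0.5f, -2.0f, 0.0f);   // packs to four 16-bit halves
//     XMVECTOR v = XMLoadHalf4(&h);         // (1.0f, 0.5f, -2.0f, 0.0f)
//
//------------------------------------------------------------------------------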
_Use_decl_annotations_
inline XMVECTOR XM_CALLCONV XMLoadShortN4
(
    const XMSHORTN4* pSource
)
{
    assert(pSource);
#if defined(_XM_NO_INTRINSICS_)
    XMVECTORF32 vResult = { { {
        (pSource->x == -32768) ? -1.f : ((float) pSource->x * (1.0f / 32767.0f)),
        (pSource->y == -32768) ? -1.f : ((float) pSource->y * (1.0f / 32767.0f)),
        (pSource->z == -32768) ? -1.f : ((float) pSource->z * (1.0f / 32767.0f)),
        (pSource->w == -32768) ? -1.f : ((float) pSource->w * (1.0f / 32767.0f))
    } } };
    return vResult.v;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    int16x4_t vInt = vld1_s16( (const int16_t*)pSource );
    int32x4_t V = vmovl_s16( vInt );
    V = vcvtq_f32_s32( V );
    V = vmulq_n_f32( V, 1.0f/32767.0f );
    return vmaxq_f32( V, vdupq_n_f32(-1.f) );
#elif defined(_XM_SSE_INTRINSICS_)
    // Splat the color in all four entries (x,z,y,w)
    __m128d vIntd = _mm_load1_pd(reinterpret_cast<const double *>(&pSource->x));
    // Shift x&0ffff,z&0xffff,y&0xffff0000,w&0xffff0000
    __m128 vTemp = _mm_and_ps(_mm_castpd_ps(vIntd),g_XMMaskX16Y16Z16W16);
    // x and z are unsigned! Flip the bits to convert the order to signed
    vTemp = _mm_xor_ps(vTemp,g_XMFlipX16Y16Z16W16);
    // Convert to floating point numbers
    vTemp = _mm_cvtepi32_ps(_mm_castps_si128(vTemp));
    // x and z - 0x8000 to complete the conversion
    vTemp = _mm_add_ps(vTemp,g_XMFixX16Y16Z16W16);
    // Convert to -1.0f - 1.0f
    vTemp = _mm_mul_ps(vTemp,g_XMNormalizeX16Y16Z16W16);
    // Very important! The entries are x,z,y,w, flip it to x,y,z,w
    vTemp = XM_PERMUTE_PS(vTemp,_MM_SHUFFLE(3,1,2,0));
    // Clamp result (for case of -32768)
    return _mm_max_ps( vTemp, g_XMNegativeOne );
#endif
}
//------------------------------------------------------------------------------
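// Example usage of XMLoadShortN4 above (illustrative sketch, not part of the
// original header; the field values are assumptions for the example):
//
//     XMSHORTN4 packed;
//     packed.x = 32767;  packed.y = -32768;
//     packed.z = 0;      packed.w = 16384;
//     XMVECTOR v = XMLoadShortN4(&packed);  // ~(1.0f, -1.0f, 0.0f, 0.5f)
//
// Both -32768 and -32767 decode to -1.0f, matching the SNORM convention.
//------------------------------------------------------------------------------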
_Use_decl_annotations_
inline XMVECTOR XM_CALLCONV XMLoadShort4
(
    const XMSHORT4* pSource
)
{
    assert(pSource);
#if defined(_XM_NO_INTRINSICS_)
    XMVECTORF32 vResult = { { {
        (float) pSource->x,
        (float) pSource->y,
        (float) pSource->z,
        (float) pSource->w
    } } };
    return vResult.v;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    int16x4_t vInt = vld1_s16( (const int16_t*)pSource );
    int32x4_t V = vmovl_s16( vInt );
    return vcvtq_f32_s32( V );
#elif defined(_XM_SSE_INTRINSICS_)
    // Splat the color in all four entries (x,z,y,w)
    __m128d vIntd = _mm_load1_pd(reinterpret_cast<const double *>(&pSource->x));
    // Shift x&0ffff,z&0xffff,y&0xffff0000,w&0xffff0000
    __m128 vTemp = _mm_and_ps(_mm_castpd_ps(vIntd),g_XMMaskX16Y16Z16W16);
    // x and z are unsigned! Flip the bits to convert the order to signed
    vTemp = _mm_xor_ps(vTemp,g_XMFlipX16Y16Z16W16);
    // Convert to floating point numbers
    vTemp = _mm_cvtepi32_ps(_mm_castps_si128(vTemp));
    // x and z - 0x8000 to complete the conversion
    vTemp = _mm_add_ps(vTemp,g_XMFixX16Y16Z16W16);
    // Fix y and w because they are 65536 too large
    vTemp = _mm_mul_ps(vTemp,g_XMFixupY16W16);
    // Very important! The entries are x,z,y,w, flip it to x,y,z,w
    return XM_PERMUTE_PS(vTemp,_MM_SHUFFLE(3,1,2,0));
#endif
}
//------------------------------------------------------------------------------
_Use_decl_annotations_
inline XMVECTOR XM_CALLCONV XMLoadUShortN4
(
    const XMUSHORTN4* pSource
)
{
    assert(pSource);
#if defined(_XM_NO_INTRINSICS_)
    XMVECTORF32 vResult = { { {
        (float) pSource->x / 65535.0f,
        (float) pSource->y / 65535.0f,
        (float) pSource->z / 65535.0f,
        (float) pSource->w / 65535.0f
    } } };
    return vResult.v;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    uint16x4_t vInt = vld1_u16( (const uint16_t*)pSource );
    uint32x4_t V = vmovl_u16( vInt );
    V = vcvtq_f32_u32( V );
    return vmulq_n_f32( V, 1.0f/65535.0f );
#elif defined(_XM_SSE_INTRINSICS_)
    static const XMVECTORF32 FixupY16W16 = { { { 1.0f / 65535.0f, 1.0f / 65535.0f, 1.0f / (65535.0f*65536.0f), 1.0f / (65535.0f*65536.0f) } } };
    static const XMVECTORF32 FixaddY16W16 = { { { 0, 0, 32768.0f*65536.0f, 32768.0f*65536.0f } } };
    // Splat the color in all four entries (x,z,y,w)
    __m128d vIntd = _mm_load1_pd(reinterpret_cast<const double *>(&pSource->x));
    // Shift x&0ffff,z&0xffff,y&0xffff0000,w&0xffff0000
    __m128 vTemp = _mm_and_ps(_mm_castpd_ps(vIntd),g_XMMaskX16Y16Z16W16);
    // y and w are signed! Flip the bits to convert the order to unsigned
    vTemp = _mm_xor_ps(vTemp,g_XMFlipZW);
    // Convert to floating point numbers
    vTemp = _mm_cvtepi32_ps(_mm_castps_si128(vTemp));
    // y and w + 0x8000 to complete the conversion
    vTemp = _mm_add_ps(vTemp,FixaddY16W16);
    // Fix y and w because they are 65536 too large
    vTemp = _mm_mul_ps(vTemp,FixupY16W16);
    // Very important! The entries are x,z,y,w, flip it to x,y,z,w
    return XM_PERMUTE_PS(vTemp,_MM_SHUFFLE(3,1,2,0));
#endif
}
//------------------------------------------------------------------------------
_Use_decl_annotations_
inline XMVECTOR XM_CALLCONV XMLoadUShort4
(
    const XMUSHORT4* pSource
)
{
    assert(pSource);
#if defined(_XM_NO_INTRINSICS_)
    XMVECTORF32 vResult = { { {
        (float) pSource->x,
        (float) pSource->y,
        (float) pSource->z,
        (float) pSource->w
    } } };
    return vResult.v;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    uint16x4_t vInt = vld1_u16( (const uint16_t*)pSource );
    uint32x4_t V = vmovl_u16( vInt );
    return vcvtq_f32_u32( V );
#elif defined(_XM_SSE_INTRINSICS_)
    static const XMVECTORF32 FixaddY16W16 = { { { 0, 0, 32768.0f, 32768.0f } } };
    // Splat the color in all four entries (x,z,y,w)
    __m128d vIntd = _mm_load1_pd(reinterpret_cast<const double *>(&pSource->x));
    // Shift x&0ffff,z&0xffff,y&0xffff0000,w&0xffff0000
    __m128 vTemp = _mm_and_ps(_mm_castpd_ps(vIntd),g_XMMaskX16Y16Z16W16);
    // y and w are signed! Flip the bits to convert the order to unsigned
    vTemp = _mm_xor_ps(vTemp,g_XMFlipZW);
    // Convert to floating point numbers
    vTemp = _mm_cvtepi32_ps(_mm_castps_si128(vTemp));
    // Fix y and w because they are 65536 too large
    vTemp = _mm_mul_ps(vTemp,g_XMFixupY16W16);
    // y and w + 0x8000 to complete the conversion
    vTemp = _mm_add_ps(vTemp,FixaddY16W16);
    // Very important! The entries are x,z,y,w, flip it to x,y,z,w
    return XM_PERMUTE_PS(vTemp,_MM_SHUFFLE(3,1,2,0));
#endif
}
//------------------------------------------------------------------------------
_Use_decl_annotations_
inline XMVECTOR XM_CALLCONV XMLoadXDecN4
(
    const XMXDECN4* pSource
)
{
    assert(pSource);
#if defined(_XM_NO_INTRINSICS_)
    static const uint32_t SignExtend[] = {0x00000000, 0xFFFFFC00};
    uint32_t ElementX = pSource->v & 0x3FF;
    uint32_t ElementY = (pSource->v >> 10) & 0x3FF;
    uint32_t ElementZ = (pSource->v >> 20) & 0x3FF;
    XMVECTORF32 vResult = { { {
        (ElementX == 0x200) ? -1.f : ((float) (int16_t) (ElementX | SignExtend[ElementX >> 9]) / 511.0f),
        (ElementY == 0x200) ? -1.f : ((float) (int16_t) (ElementY | SignExtend[ElementY >> 9]) / 511.0f),
        (ElementZ == 0x200) ? -1.f : ((float) (int16_t) (ElementZ | SignExtend[ElementZ >> 9]) / 511.0f),
        (float) (pSource->v >> 30) / 3.0f
    } } };
    return vResult.v;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    uint32x4_t vInt = vld1q_dup_u32( reinterpret_cast<const uint32_t*>( pSource ) );
    vInt = vandq_u32(vInt,g_XMMaskA2B10G10R10);
    vInt = veorq_u32(vInt,g_XMFlipA2B10G10R10);
    float32x4_t R = vcvtq_f32_s32( vreinterpretq_s32_u32(vInt) );
    R = vaddq_f32(R,g_XMFixAA2B10G10R10);
    R = vmulq_f32(R,g_XMNormalizeA2B10G10R10);
    return vmaxq_f32( R, vdupq_n_f32(-1.0f) );
#elif defined(_XM_SSE_INTRINSICS_)
    // Splat the color in all four entries
    __m128 vTemp = _mm_load_ps1(reinterpret_cast<const float *>(&pSource->v));
    // Shift R&0xFF0000, G&0xFF00, B&0xFF, A&0xFF000000
    vTemp = _mm_and_ps(vTemp,g_XMMaskA2B10G10R10);
    // a is unsigned! Flip the bit to convert the order to signed
    vTemp = _mm_xor_ps(vTemp,g_XMFlipA2B10G10R10);
    // Convert to floating point numbers
    vTemp = _mm_cvtepi32_ps(_mm_castps_si128(vTemp));
    // RGB + 0, A + 0x80000000.f to undo the signed order.
    vTemp = _mm_add_ps(vTemp,g_XMFixAA2B10G10R10);
    // Convert 0-255 to 0.0f-1.0f
    vTemp = _mm_mul_ps(vTemp,g_XMNormalizeA2B10G10R10);
    // Clamp result (for case of -512)
    return _mm_max_ps( vTemp, g_XMNegativeOne );
#endif
}
//------------------------------------------------------------------------------
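// Worked example for XMLoadXDecN4 above (illustrative, not part of the
// original header): a packed value of 0xC00001FF has ElementX = 0x1FF,
// ElementY = 0, ElementZ = 0 and a 2-bit w field of 3, which decodes to
// approximately (1.0f, 0.0f, 0.0f, 1.0f), since 511/511 == 1 and 3/3 == 1.
//------------------------------------------------------------------------------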
#pragma warning(push)
#pragma warning(disable : 4996)
// C4996: ignore deprecation warning
_Use_decl_annotations_
inline XMVECTOR XM_CALLCONV XMLoadXDec4
(
    const XMXDEC4* pSource
)
{
    assert(pSource);
#if defined(_XM_NO_INTRINSICS_)
    static const uint32_t SignExtend[] = {0x00000000, 0xFFFFFC00};
    uint32_t ElementX = pSource->v & 0x3FF;
    uint32_t ElementY = (pSource->v >> 10) & 0x3FF;
    uint32_t ElementZ = (pSource->v >> 20) & 0x3FF;
    XMVECTORF32 vResult = { { {
        (float) (int16_t) (ElementX | SignExtend[ElementX >> 9]),
        (float) (int16_t) (ElementY | SignExtend[ElementY >> 9]),
        (float) (int16_t) (ElementZ | SignExtend[ElementZ >> 9]),
        (float) (pSource->v >> 30)
    } } };
    return vResult.v;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    static const XMVECTORU32 XDec4Xor = { { { 0x200, 0x200 << 10, 0x200 << 20, 0x80000000 } } };
    static const XMVECTORF32 XDec4Add = { { { -512.0f, -512.0f*1024.0f, -512.0f*1024.0f*1024.0f, 32768 * 65536.0f } } };
    uint32x4_t vInt = vld1q_dup_u32( reinterpret_cast<const uint32_t*>( pSource ) );
    vInt = vandq_u32(vInt,g_XMMaskDec4);
    vInt = veorq_u32(vInt,XDec4Xor);
    float32x4_t R = vcvtq_f32_s32( vreinterpretq_s32_u32(vInt) );
    R = vaddq_f32(R,XDec4Add);
    return vmulq_f32(R,g_XMMulDec4);
#elif defined(_XM_SSE_INTRINSICS_)
    static const XMVECTORU32 XDec4Xor = { { { 0x200, 0x200 << 10, 0x200 << 20, 0x80000000 } } };
    static const XMVECTORF32 XDec4Add = { { { -512.0f, -512.0f*1024.0f, -512.0f*1024.0f*1024.0f, 32768 * 65536.0f } } };
    // Splat the color in all four entries
    XMVECTOR vTemp = _mm_load_ps1(reinterpret_cast<const float *>(&pSource->v));
    // Shift R&0xFF0000, G&0xFF00, B&0xFF, A&0xFF000000
    vTemp = _mm_and_ps(vTemp,g_XMMaskDec4);
    // a is unsigned! Flip the bit to convert the order to signed
    vTemp = _mm_xor_ps(vTemp,XDec4Xor);
    // Convert to floating point numbers
    vTemp = _mm_cvtepi32_ps(_mm_castps_si128(vTemp));
    // RGB + 0, A + 0x80000000.f to undo the signed order.
    vTemp = _mm_add_ps(vTemp,XDec4Add);
    // Convert 0-255 to 0.0f-1.0f
    vTemp = _mm_mul_ps(vTemp,g_XMMulDec4);
    return vTemp;
#endif
}
#pragma warning(pop)
//------------------------------------------------------------------------------
_Use_decl_annotations_
inline XMVECTOR XM_CALLCONV XMLoadUDecN4
(
    const XMUDECN4* pSource
)
{
    assert(pSource);
#if defined(_XM_NO_INTRINSICS_)
    uint32_t ElementX = pSource->v & 0x3FF;
    uint32_t ElementY = (pSource->v >> 10) & 0x3FF;
    uint32_t ElementZ = (pSource->v >> 20) & 0x3FF;
    XMVECTORF32 vResult = { { {
        (float) ElementX / 1023.0f,
        (float) ElementY / 1023.0f,
        (float) ElementZ / 1023.0f,
        (float) (pSource->v >> 30) / 3.0f
    } } };
    return vResult.v;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    static const XMVECTORF32 UDecN4Mul = { { { 1.0f / 1023.0f, 1.0f / (1023.0f*1024.0f), 1.0f / (1023.0f*1024.0f*1024.0f), 1.0f / (3.0f*1024.0f*1024.0f*1024.0f) } } };
    uint32x4_t vInt = vld1q_dup_u32( reinterpret_cast<const uint32_t*>( pSource ) );
    vInt = vandq_u32(vInt,g_XMMaskDec4);
    float32x4_t R = vcvtq_f32_u32( vInt );
    return vmulq_f32(R,UDecN4Mul);
#elif defined(_XM_SSE_INTRINSICS_)
    static const XMVECTORF32 UDecN4Mul = { { { 1.0f / 1023.0f, 1.0f / (1023.0f*1024.0f), 1.0f / (1023.0f*1024.0f*1024.0f), 1.0f / (3.0f*1024.0f*1024.0f*1024.0f) } } };
    // Splat the color in all four entries
    XMVECTOR vTemp = _mm_load_ps1(reinterpret_cast<const float *>(&pSource->v));
    // Shift R&0xFF0000, G&0xFF00, B&0xFF, A&0xFF000000
    vTemp = _mm_and_ps(vTemp,g_XMMaskDec4);
    // a is unsigned! Flip the bit to convert the order to signed
    vTemp = _mm_xor_ps(vTemp,g_XMFlipW);
    // Convert to floating point numbers
    vTemp = _mm_cvtepi32_ps(_mm_castps_si128(vTemp));
    // RGB + 0, A + 0x80000000.f to undo the signed order.
    vTemp = _mm_add_ps(vTemp,g_XMAddUDec4);
    // Convert 0-255 to 0.0f-1.0f
    vTemp = _mm_mul_ps(vTemp,UDecN4Mul);
    return vTemp;
#endif
}
//------------------------------------------------------------------------------
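// Worked example for XMLoadUDecN4 above (illustrative, not part of the
// original header): a packed value of 0xFFFFFFFF decodes to
// (1.0f, 1.0f, 1.0f, 1.0f), since each 10-bit channel is 1023/1023 and the
// 2-bit alpha is 3/3.
//------------------------------------------------------------------------------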
_Use_decl_annotations_
inline XMVECTOR XM_CALLCONV XMLoadUDecN4_XR
(
    const XMUDECN4* pSource
)
{
    assert(pSource);
#if defined(_XM_NO_INTRINSICS_)
    int32_t ElementX = pSource->v & 0x3FF;
    int32_t ElementY = (pSource->v >> 10) & 0x3FF;
    int32_t ElementZ = (pSource->v >> 20) & 0x3FF;
    XMVECTORF32 vResult = { { {
        (float) (ElementX - 0x180) / 510.0f,
        (float) (ElementY - 0x180) / 510.0f,
        (float) (ElementZ - 0x180) / 510.0f,
        (float) (pSource->v >> 30) / 3.0f
    } } };
    return vResult.v;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    static const XMVECTORF32 XRMul = { { { 1.0f / 510.0f, 1.0f / (510.0f*1024.0f), 1.0f / (510.0f*1024.0f*1024.0f), 1.0f / (3.0f*1024.0f*1024.0f*1024.0f) } } };
    static const XMVECTORI32 XRBias = { { { 0x180, 0x180 * 1024, 0x180 * 1024 * 1024, 0 } } };
    uint32x4_t vInt = vld1q_dup_u32( reinterpret_cast<const uint32_t*>( pSource ) );
    vInt = vandq_u32(vInt,g_XMMaskDec4);
    int32x4_t vTemp = vsubq_s32( vreinterpretq_s32_u32(vInt), XRBias );
    vTemp = veorq_u32( vTemp, g_XMFlipW );
    float32x4_t R = vcvtq_f32_s32( vTemp );
    R = vaddq_f32(R,g_XMAddUDec4);
    return vmulq_f32(R,XRMul);
#elif defined(_XM_SSE_INTRINSICS_)
    static const XMVECTORF32 XRMul = { { { 1.0f / 510.0f, 1.0f / (510.0f*1024.0f), 1.0f / (510.0f*1024.0f*1024.0f), 1.0f / (3.0f*1024.0f*1024.0f*1024.0f) } } };
    static const XMVECTORI32 XRBias = { { { 0x180, 0x180 * 1024, 0x180 * 1024 * 1024, 0 } } };
    // Splat the color in all four entries
    XMVECTOR vTemp = _mm_load_ps1(reinterpret_cast<const float *>(&pSource->v));
    // Mask channels
    vTemp = _mm_and_ps(vTemp,g_XMMaskDec4);
    // Subtract bias
    vTemp = _mm_castsi128_ps( _mm_sub_epi32( _mm_castps_si128(vTemp), XRBias ) );
    // a is unsigned! Flip the bit to convert the order to signed
    vTemp = _mm_xor_ps(vTemp,g_XMFlipW);
    // Convert to floating point numbers
    vTemp = _mm_cvtepi32_ps(_mm_castps_si128(vTemp));
    // RGB + 0, A + 0x80000000.f to undo the signed order.
    vTemp = _mm_add_ps(vTemp,g_XMAddUDec4);
    // Convert to 0.0f-1.0f
    return _mm_mul_ps(vTemp,XRMul);
#endif
}
//------------------------------------------------------------------------------
_Use_decl_annotations_
inline XMVECTOR XM_CALLCONV XMLoadUDec4
(
    const XMUDEC4* pSource
)
{
    assert(pSource);
#if defined(_XM_NO_INTRINSICS_)
    uint32_t ElementX = pSource->v & 0x3FF;
    uint32_t ElementY = (pSource->v >> 10) & 0x3FF;
    uint32_t ElementZ = (pSource->v >> 20) & 0x3FF;
    XMVECTORF32 vResult = { { {
        (float) ElementX,
        (float) ElementY,
        (float) ElementZ,
        (float) (pSource->v >> 30)
    } } };
    return vResult.v;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    uint32x4_t vInt = vld1q_dup_u32( reinterpret_cast<const uint32_t*>( pSource ) );
    vInt = vandq_u32(vInt,g_XMMaskDec4);
    float32x4_t R = vcvtq_f32_u32( vInt );
    return vmulq_f32(R,g_XMMulDec4);
#elif defined(_XM_SSE_INTRINSICS_)
    // Splat the color in all four entries
    XMVECTOR vTemp = _mm_load_ps1(reinterpret_cast<const float *>(&pSource->v));
    // Shift R&0xFF0000, G&0xFF00, B&0xFF, A&0xFF000000
    vTemp = _mm_and_ps(vTemp,g_XMMaskDec4);
    // a is unsigned! Flip the bit to convert the order to signed
    vTemp = _mm_xor_ps(vTemp,g_XMFlipW);
    // Convert to floating point numbers
    vTemp = _mm_cvtepi32_ps(_mm_castps_si128(vTemp));
    // RGB + 0, A + 0x80000000.f to undo the signed order.
    vTemp = _mm_add_ps(vTemp,g_XMAddUDec4);
    // Convert 0-255 to 0.0f-1.0f
    vTemp = _mm_mul_ps(vTemp,g_XMMulDec4);
    return vTemp;
#endif
}
//------------------------------------------------------------------------------
#pragma warning(push)
#pragma warning(disable : 4996)
// C4996: ignore deprecation warning
_Use_decl_annotations_
inline XMVECTOR XM_CALLCONV XMLoadDecN4
(
    const XMDECN4* pSource
)
{
    assert(pSource);
#if defined(_XM_NO_INTRINSICS_)
    static const uint32_t SignExtend[] = {0x00000000, 0xFFFFFC00};
    static const uint32_t SignExtendW[] = {0x00000000, 0xFFFFFFFC};
    uint32_t ElementX = pSource->v & 0x3FF;
    uint32_t ElementY = (pSource->v >> 10) & 0x3FF;
    uint32_t ElementZ = (pSource->v >> 20) & 0x3FF;
    uint32_t ElementW = pSource->v >> 30;
    XMVECTORF32 vResult = { { {
        (ElementX == 0x200) ? -1.f : ((float) (int16_t) (ElementX | SignExtend[ElementX >> 9]) / 511.0f),
        (ElementY == 0x200) ? -1.f : ((float) (int16_t) (ElementY | SignExtend[ElementY >> 9]) / 511.0f),
        (ElementZ == 0x200) ? -1.f : ((float) (int16_t) (ElementZ | SignExtend[ElementZ >> 9]) / 511.0f),
        (ElementW == 0x2) ? -1.f : ((float) (int16_t) (ElementW | SignExtendW[(ElementW >> 1) & 1]))
    } } };
    return vResult.v;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    static const XMVECTORF32 DecN4Mul = { { { 1.0f / 511.0f, 1.0f / (511.0f*1024.0f), 1.0f / (511.0f*1024.0f*1024.0f), 1.0f / (1024.0f*1024.0f*1024.0f) } } };
    uint32x4_t vInt = vld1q_dup_u32( reinterpret_cast<const uint32_t*>( pSource ) );
    vInt = vandq_u32(vInt,g_XMMaskDec4);
    vInt = veorq_u32(vInt,g_XMXorDec4);
    float32x4_t R = vcvtq_f32_s32( vreinterpretq_s32_u32(vInt) );
    R = vaddq_f32(R,g_XMAddDec4);
    R = vmulq_f32(R,DecN4Mul);
    return vmaxq_f32( R, vdupq_n_f32(-1.0f) );
#elif defined(_XM_SSE_INTRINSICS_)
    static const XMVECTORF32 DecN4Mul = { { { 1.0f / 511.0f, 1.0f / (511.0f*1024.0f), 1.0f / (511.0f*1024.0f*1024.0f), 1.0f / (1024.0f*1024.0f*1024.0f) } } };
    // Splat the color in all four entries
    XMVECTOR vTemp = _mm_load_ps1(reinterpret_cast<const float *>(&pSource->v));
    // Shift R&0xFF0000, G&0xFF00, B&0xFF, A&0xFF000000
    vTemp = _mm_and_ps(vTemp,g_XMMaskDec4);
    // a is unsigned! Flip the bit to convert the order to signed
    vTemp = _mm_xor_ps(vTemp,g_XMXorDec4);
    // Convert to floating point numbers
    vTemp = _mm_cvtepi32_ps(_mm_castps_si128(vTemp));
    // RGB + 0, A + 0x80000000.f to undo the signed order.
    vTemp = _mm_add_ps(vTemp,g_XMAddDec4);
    // Convert 0-255 to 0.0f-1.0f
    vTemp = _mm_mul_ps(vTemp,DecN4Mul);
    // Clamp result (for case of -512/-1)
    return _mm_max_ps( vTemp, g_XMNegativeOne );
#endif
}
//------------------------------------------------------------------------------
_Use_decl_annotations_
inline XMVECTOR XM_CALLCONV XMLoadDec4
(
    const XMDEC4* pSource
)
{
    assert(pSource);
#if defined(_XM_NO_INTRINSICS_)
    static const uint32_t SignExtend[] = {0x00000000, 0xFFFFFC00};
    static const uint32_t SignExtendW[] = {0x00000000, 0xFFFFFFFC};
    uint32_t ElementX = pSource->v & 0x3FF;
    uint32_t ElementY = (pSource->v >> 10) & 0x3FF;
    uint32_t ElementZ = (pSource->v >> 20) & 0x3FF;
    uint32_t ElementW = pSource->v >> 30;
    XMVECTORF32 vResult = { { {
        (float) (int16_t) (ElementX | SignExtend[ElementX >> 9]),
        (float) (int16_t) (ElementY | SignExtend[ElementY >> 9]),
        (float) (int16_t) (ElementZ | SignExtend[ElementZ >> 9]),
        (float) (int16_t) (ElementW | SignExtendW[ElementW >> 1])
    } } };
    return vResult.v;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    uint32x4_t vInt = vld1q_dup_u32( reinterpret_cast<const uint32_t*>( pSource ) );
    vInt = vandq_u32(vInt,g_XMMaskDec4);
    vInt = veorq_u32(vInt,g_XMXorDec4);
    float32x4_t R = vcvtq_f32_s32( vreinterpretq_s32_u32(vInt) );
    R = vaddq_f32(R,g_XMAddDec4);
    return vmulq_f32(R,g_XMMulDec4);
#elif defined(_XM_SSE_INTRINSICS_)
    // Splat the color in all four entries
    XMVECTOR vTemp = _mm_load_ps1(reinterpret_cast<const float *>(&pSource->v));
    // Shift R&0xFF0000, G&0xFF00, B&0xFF, A&0xFF000000
    vTemp = _mm_and_ps(vTemp,g_XMMaskDec4);
    // a is unsigned! Flip the bit to convert the order to signed
    vTemp = _mm_xor_ps(vTemp,g_XMXorDec4);
    // Convert to floating point numbers
    vTemp = _mm_cvtepi32_ps(_mm_castps_si128(vTemp));
    // RGB + 0, A + 0x80000000.f to undo the signed order.
    vTemp = _mm_add_ps(vTemp,g_XMAddDec4);
    // Convert 0-255 to 0.0f-1.0f
    vTemp = _mm_mul_ps(vTemp,g_XMMulDec4);
    return vTemp;
#endif
}
#pragma warning(pop)
//------------------------------------------------------------------------------
_Use_decl_annotations_
inline XMVECTOR XM_CALLCONV XMLoadUByteN4
(
    const XMUBYTEN4* pSource
)
{
    assert(pSource);
#if defined(_XM_NO_INTRINSICS_)
    XMVECTORF32 vResult = { { {
        (float) pSource->x / 255.0f,
        (float) pSource->y / 255.0f,
        (float) pSource->z / 255.0f,
        (float) pSource->w / 255.0f
    } } };
    return vResult.v;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    uint32x2_t vInt8 = vld1_dup_u32( reinterpret_cast<const uint32_t*>( pSource ) );
    uint16x8_t vInt16 = vmovl_u8( vreinterpret_u8_u32(vInt8) );
    uint32x4_t vInt = vmovl_u16( vget_low_u16(vInt16) );
    float32x4_t R = vcvtq_f32_u32(vInt);
    return vmulq_n_f32( R, 1.0f/255.0f );
#elif defined(_XM_SSE_INTRINSICS_)
    static const XMVECTORF32 LoadUByteN4Mul = { { { 1.0f / 255.0f, 1.0f / (255.0f*256.0f), 1.0f / (255.0f*65536.0f), 1.0f / (255.0f*65536.0f*256.0f) } } };
    // Splat the color in all four entries (x,z,y,w)
    XMVECTOR vTemp = _mm_load1_ps(reinterpret_cast<const float *>(&pSource->x));
    // Mask x&0ff,y&0xff00,z&0xff0000,w&0xff000000
    vTemp = _mm_and_ps(vTemp,g_XMMaskByte4);
    // w is signed! Flip the bits to convert the order to unsigned
    vTemp = _mm_xor_ps(vTemp,g_XMFlipW);
    // Convert to floating point numbers
    vTemp = _mm_cvtepi32_ps(_mm_castps_si128(vTemp));
    // w + 0x80 to complete the conversion
    vTemp = _mm_add_ps(vTemp,g_XMAddUDec4);
    // Fix y, z and w because they are too large
    vTemp = _mm_mul_ps(vTemp,LoadUByteN4Mul);
    return vTemp;
#endif
}
//------------------------------------------------------------------------------
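// Example usage of XMLoadUByteN4 above (illustrative sketch, not part of the
// original header; the field values are assumptions for the example):
//
//     XMUBYTEN4 rgba;
//     rgba.x = 255; rgba.y = 128; rgba.z = 0; rgba.w = 255;
//     XMVECTOR v = XMLoadUByteN4(&rgba);    // ~(1.0f, 0.502f, 0.0f, 1.0f)
//
//------------------------------------------------------------------------------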
_Use_decl_annotations_
inline XMVECTOR XM_CALLCONV XMLoadUByte4
(
    const XMUBYTE4* pSource
)
{
    assert(pSource);
#if defined(_XM_NO_INTRINSICS_)
    XMVECTORF32 vResult = { { {
        (float) pSource->x,
        (float) pSource->y,
        (float) pSource->z,
        (float) pSource->w
    } } };
    return vResult.v;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    uint32x2_t vInt8 = vld1_dup_u32( reinterpret_cast<const uint32_t*>( pSource ) );
    uint16x8_t vInt16 = vmovl_u8( vreinterpret_u8_u32(vInt8) );
    uint32x4_t vInt = vmovl_u16( vget_low_u16(vInt16) );
    return vcvtq_f32_u32(vInt);
#elif defined(_XM_SSE_INTRINSICS_)
    static const XMVECTORF32 LoadUByte4Mul = { { { 1.0f, 1.0f / 256.0f, 1.0f / 65536.0f, 1.0f / (65536.0f*256.0f) } } };
    // Splat the color in all four entries (x,z,y,w)
    XMVECTOR vTemp = _mm_load1_ps(reinterpret_cast<const float *>(&pSource->x));
    // Mask x&0ff,y&0xff00,z&0xff0000,w&0xff000000
    vTemp = _mm_and_ps(vTemp,g_XMMaskByte4);
    // w is signed! Flip the bits to convert the order to unsigned
    vTemp = _mm_xor_ps(vTemp,g_XMFlipW);
    // Convert to floating point numbers
    vTemp = _mm_cvtepi32_ps(_mm_castps_si128(vTemp));
    // w + 0x80 to complete the conversion
    vTemp = _mm_add_ps(vTemp,g_XMAddUDec4);
    // Fix y, z and w because they are too large
    vTemp = _mm_mul_ps(vTemp,LoadUByte4Mul);
    return vTemp;
#endif
}
//------------------------------------------------------------------------------
_Use_decl_annotations_
inline XMVECTOR XM_CALLCONV XMLoadByteN4
(
    const XMBYTEN4* pSource
)
{
    assert(pSource);
#if defined(_XM_NO_INTRINSICS_)
    XMVECTORF32 vResult = { { {
        (pSource->x == -128) ? -1.f : ((float) pSource->x / 127.0f),
        (pSource->y == -128) ? -1.f : ((float) pSource->y / 127.0f),
        (pSource->z == -128) ? -1.f : ((float) pSource->z / 127.0f),
        (pSource->w == -128) ? -1.f : ((float) pSource->w / 127.0f)
    } } };
    return vResult.v;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    uint32x2_t vInt8 = vld1_dup_u32( reinterpret_cast<const uint32_t*>( pSource ) );
    int16x8_t vInt16 = vmovl_s8( vreinterpret_s8_u32(vInt8) );
    int32x4_t vInt = vmovl_s16( vget_low_s16(vInt16) );
    float32x4_t R = vcvtq_f32_s32(vInt);
    R = vmulq_n_f32( R, 1.0f/127.0f );
    return vmaxq_f32( R, vdupq_n_f32(-1.f) );
#elif defined(_XM_SSE_INTRINSICS_)
    static const XMVECTORF32 LoadByteN4Mul = { { { 1.0f / 127.0f, 1.0f / (127.0f*256.0f), 1.0f / (127.0f*65536.0f), 1.0f / (127.0f*65536.0f*256.0f) } } };
    // Splat the color in all four entries (x,z,y,w)
    XMVECTOR vTemp = _mm_load1_ps(reinterpret_cast<const float *>(&pSource->x));
    // Mask x&0ff,y&0xff00,z&0xff0000,w&0xff000000
    vTemp = _mm_and_ps(vTemp,g_XMMaskByte4);
    // x,y and z are unsigned! Flip the bits to convert the order to signed
    vTemp = _mm_xor_ps(vTemp,g_XMXorByte4);
    // Convert to floating point numbers
    vTemp = _mm_cvtepi32_ps(_mm_castps_si128(vTemp));
    // x, y and z - 0x80 to complete the conversion
    vTemp = _mm_add_ps(vTemp,g_XMAddByte4);
    // Fix y, z and w because they are too large
    vTemp = _mm_mul_ps(vTemp,LoadByteN4Mul);
    // Clamp result (for case of -128)
    return _mm_max_ps( vTemp, g_XMNegativeOne );
#endif
}
//------------------------------------------------------------------------------
_Use_decl_annotations_
inline XMVECTOR XM_CALLCONV XMLoadByte4
(
    const XMBYTE4* pSource
)
{
    assert(pSource);
#if defined(_XM_NO_INTRINSICS_)
    XMVECTORF32 vResult = { { {
        (float) pSource->x,
        (float) pSource->y,
        (float) pSource->z,
        (float) pSource->w
    } } };
    return vResult.v;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    uint32x2_t vInt8 = vld1_dup_u32( reinterpret_cast<const uint32_t*>( pSource ) );
    int16x8_t vInt16 = vmovl_s8( vreinterpret_s8_u32(vInt8) );
    int32x4_t vInt = vmovl_s16( vget_low_s16(vInt16) );
    return vcvtq_f32_s32(vInt);
#elif defined(_XM_SSE_INTRINSICS_)
    static const XMVECTORF32 LoadByte4Mul = { { { 1.0f, 1.0f / 256.0f, 1.0f / 65536.0f, 1.0f / (65536.0f*256.0f) } } };
    // Splat the color in all four entries (x,z,y,w)
    XMVECTOR vTemp = _mm_load1_ps(reinterpret_cast<const float *>(&pSource->x));
    // Mask x&0ff,y&0xff00,z&0xff0000,w&0xff000000
    vTemp = _mm_and_ps(vTemp,g_XMMaskByte4);
    // x,y and z are unsigned! Flip the bits to convert the order to signed
    vTemp = _mm_xor_ps(vTemp,g_XMXorByte4);
    // Convert to floating point numbers
    vTemp = _mm_cvtepi32_ps(_mm_castps_si128(vTemp));
    // x, y and z - 0x80 to complete the conversion
    vTemp = _mm_add_ps(vTemp,g_XMAddByte4);
    // Fix y, z and w because they are too large
    vTemp = _mm_mul_ps(vTemp,LoadByte4Mul);
    return vTemp;
#endif
}
//------------------------------------------------------------------------------
_Use_decl_annotations_
inline XMVECTOR XM_CALLCONV XMLoadUNibble4
(
    const XMUNIBBLE4* pSource
)
{
    assert(pSource);
#if defined(_XM_NO_INTRINSICS_)
    XMVECTORF32 vResult = { { {
        float(pSource->v & 0xF),
        float((pSource->v >> 4) & 0xF),
        float((pSource->v >> 8) & 0xF),
        float((pSource->v >> 12) & 0xF)
    } } };
    return vResult.v;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    static const XMVECTORI32 UNibble4And = { { { 0xF, 0xF0, 0xF00, 0xF000 } } };
    static const XMVECTORF32 UNibble4Mul = { { { 1.0f, 1.0f / 16.f, 1.0f / 256.f, 1.0f / 4096.f } } };
    uint16x4_t vInt16 = vld1_dup_u16( reinterpret_cast<const uint16_t*>( pSource ) );
    uint32x4_t vInt = vmovl_u16( vInt16 );
    vInt = vandq_u32(vInt,UNibble4And);
    float32x4_t R = vcvtq_f32_u32(vInt);
    return vmulq_f32(R,UNibble4Mul);
#elif defined(_XM_SSE_INTRINSICS_)
    static const XMVECTORI32 UNibble4And = { { { 0xF, 0xF0, 0xF00, 0xF000 } } };
    static const XMVECTORF32 UNibble4Mul = { { { 1.0f, 1.0f / 16.f, 1.0f / 256.f, 1.0f / 4096.f } } };
    // Get the 32 bit value and splat it
    XMVECTOR vResult = _mm_load_ps1(reinterpret_cast<const float *>(&pSource->v));
    // Mask off x, y and z
    vResult = _mm_and_ps(vResult,UNibble4And);
    // Convert to float
    vResult = _mm_cvtepi32_ps(_mm_castps_si128(vResult));
    // Normalize x, y, and z
    vResult = _mm_mul_ps(vResult,UNibble4Mul);
    return vResult;
#endif
}
//------------------------------------------------------------------------------
_Use_decl_annotations_
inline XMVECTOR XM_CALLCONV XMLoadU555
(
    const XMU555* pSource
)
{
    assert(pSource);
#if defined(_XM_NO_INTRINSICS_)
    XMVECTORF32 vResult = { { {
        float(pSource->v & 0x1F),
        float((pSource->v >> 5) & 0x1F),
        float((pSource->v >> 10) & 0x1F),
        float((pSource->v >> 15) & 0x1)
    } } };
    return vResult.v;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    static const XMVECTORI32 U555And = { { { 0x1F, 0x1F << 5, 0x1F << 10, 0x8000 } } };
    static const XMVECTORF32 U555Mul = { { { 1.0f, 1.0f / 32.f, 1.0f / 1024.f, 1.0f / 32768.f } } };
    uint16x4_t vInt16 = vld1_dup_u16( reinterpret_cast<const uint16_t*>( pSource ) );
    uint32x4_t vInt = vmovl_u16( vInt16 );
    vInt = vandq_u32(vInt,U555And);
    float32x4_t R = vcvtq_f32_u32(vInt);
    return vmulq_f32(R,U555Mul);
#elif defined(_XM_SSE_INTRINSICS_)
    static const XMVECTORI32 U555And = { { { 0x1F, 0x1F << 5, 0x1F << 10, 0x8000 } } };
    static const XMVECTORF32 U555Mul = { { { 1.0f, 1.0f / 32.f, 1.0f / 1024.f, 1.0f / 32768.f } } };
    // Get the 32 bit value and splat it
    XMVECTOR vResult = _mm_load_ps1(reinterpret_cast<const float *>(&pSource->v));
    // Mask off x, y and z
    vResult = _mm_and_ps(vResult,U555And);
    // Convert to float
    vResult = _mm_cvtepi32_ps(_mm_castps_si128(vResult));
    // Normalize x, y, and z
    vResult = _mm_mul_ps(vResult,U555Mul);
    return vResult;
#endif
}

#ifdef _PREFAST_
#pragma prefast(pop)
#endif

/****************************************************************************
 *
 * Vector and matrix store operations
 *
 ****************************************************************************/
_Use_decl_annotations_
inline void XM_CALLCONV XMStoreColor
(
    XMCOLOR* pDestination,
    FXMVECTOR V
)
{
    assert(pDestination);
#if defined(_XM_NO_INTRINSICS_)
    XMVECTOR N = XMVectorSaturate(V);
    N = XMVectorMultiply(N, g_UByteMax);
    N = XMVectorRound(N);
    XMFLOAT4A tmp;
    XMStoreFloat4A( &tmp, N );
    pDestination->c = ((uint32_t)tmp.w << 24) |
                      ((uint32_t)tmp.x << 16) |
                      ((uint32_t)tmp.y << 8) |
                      ((uint32_t)tmp.z);
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    float32x4_t R = vmaxq_f32(V, vdupq_n_f32(0) );
    R = vminq_f32(R, vdupq_n_f32(1.0f));
    R = vmulq_n_f32( R, 255.0f );
    R = XMVectorRound(R);
    uint32x4_t vInt32 = vcvtq_u32_f32(R);
    uint16x4_t vInt16 = vqmovn_u32( vInt32 );
    uint8x8_t vInt8 = vqmovn_u16( vcombine_u16(vInt16,vInt16) );
    uint32_t rgba = vget_lane_u32( vreinterpret_u32_u8(vInt8), 0 );
    pDestination->c = (rgba & 0xFF00FF00) | ((rgba >> 16) & 0xFF) | ((rgba << 16) & 0xFF0000);
#elif defined(_XM_SSE_INTRINSICS_)
    // Set <0 to 0
    XMVECTOR vResult = _mm_max_ps(V,g_XMZero);
    // Set >1 to 1
    vResult = _mm_min_ps(vResult,g_XMOne);
    // Convert to 0-255
    vResult = _mm_mul_ps(vResult,g_UByteMax);
    // Shuffle RGBA to ARGB
    vResult = XM_PERMUTE_PS(vResult,_MM_SHUFFLE(3,0,1,2));
    // Convert to int
    __m128i vInt = _mm_cvtps_epi32(vResult);
    // Mash to shorts
    vInt = _mm_packs_epi32(vInt,vInt);
    // Mash to bytes
    vInt = _mm_packus_epi16(vInt,vInt);
    // Store the color
    _mm_store_ss(reinterpret_cast<float *>(&pDestination->c),_mm_castsi128_ps(vInt));
#endif
}
//------------------------------------------------------------------------------
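// Example usage of XMStoreColor above (illustrative sketch, not part of the
// original header; the input values are assumptions for the example):
//
//     XMCOLOR packed;
//     XMVECTOR c = XMVectorSet(1.0f, 0.5f, 0.0f, 1.0f);   // R, G, B, A
//     XMStoreColor(&packed, c);                           // packed.c == 0xFFFF8000 (A8R8G8B8)
//
//------------------------------------------------------------------------------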
_Use_decl_annotations_
inline void XM_CALLCONV XMStoreHalf2
(
    XMHALF2* pDestination,
    FXMVECTOR V
)
{
    assert(pDestination);
#if defined(_XM_F16C_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
    __m128i V1 = _mm_cvtps_ph( V, 0 );
    _mm_store_ss( reinterpret_cast<float*>(pDestination), _mm_castsi128_ps(V1) );
#else
    pDestination->x = XMConvertFloatToHalf(XMVectorGetX(V));
    pDestination->y = XMConvertFloatToHalf(XMVectorGetY(V));
#endif // !_XM_F16C_INTRINSICS_
}
//------------------------------------------------------------------------------
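// Example usage of XMStoreHalf2 above (illustrative sketch, not part of the
// original header; the input values are assumptions for the example):
//
//     XMHALF2 packed;
//     XMStoreHalf2(&packed, XMVectorSet(1.0f, -0.5f, 0.0f, 0.0f));
//     // packed.x == 0x3C00 (1.0 as half), packed.y == 0xB800 (-0.5 as half)
//
//------------------------------------------------------------------------------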
_Use_decl_annotations_
inline void XM_CALLCONV XMStoreShortN2
(
    XMSHORTN2* pDestination,
    FXMVECTOR V
)
{
    assert(pDestination);
#if defined(_XM_NO_INTRINSICS_)
    XMVECTOR N = XMVectorClamp(V, g_XMNegativeOne.v, g_XMOne.v);
    N = XMVectorMultiply(N, g_ShortMax);
    N = XMVectorRound(N);
    XMFLOAT4A tmp;
    XMStoreFloat4A( &tmp, N );
    pDestination->x = (int16_t)tmp.x;
    pDestination->y = (int16_t)tmp.y;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    float32x4_t R = vmaxq_f32(V, vdupq_n_f32(-1.f) );
    R = vminq_f32(R, vdupq_n_f32(1.0f));
    R = vmulq_n_f32( R, 32767.0f );
    int32x4_t vInt32 = vcvtq_s32_f32(R);
    int16x4_t vInt16 = vqmovn_s32( vInt32 );
    vst1_lane_u32( &pDestination->v, vreinterpret_u32_s16(vInt16), 0 );
#elif defined(_XM_SSE_INTRINSICS_)
    XMVECTOR vResult = _mm_max_ps(V,g_XMNegativeOne);
    vResult = _mm_min_ps(vResult,g_XMOne);
    vResult = _mm_mul_ps(vResult,g_ShortMax);
    __m128i vResulti = _mm_cvtps_epi32(vResult);
    vResulti = _mm_packs_epi32(vResulti,vResulti);
    _mm_store_ss(reinterpret_cast<float *>(&pDestination->x),_mm_castsi128_ps(vResulti));
#endif
}
//------------------------------------------------------------------------------
_Use_decl_annotations_
inline void XM_CALLCONV XMStoreShort2
(
    XMSHORT2* pDestination,
    FXMVECTOR V
)
{
    assert(pDestination);
#if defined(_XM_NO_INTRINSICS_)
    XMVECTOR N = XMVectorClamp(V, g_ShortMin, g_ShortMax);
    N = XMVectorRound(N);
    XMFLOAT4A tmp;
    XMStoreFloat4A( &tmp, N );
    pDestination->x = (int16_t)tmp.x;
    pDestination->y = (int16_t)tmp.y;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    float32x4_t R = vmaxq_f32(V, vdupq_n_f32(-32767.f) );
    R = vminq_f32(R, vdupq_n_f32(32767.0f));
    int32x4_t vInt32 = vcvtq_s32_f32(R);
    int16x4_t vInt16 = vqmovn_s32( vInt32 );
    vst1_lane_u32( &pDestination->v, vreinterpret_u32_s16(vInt16), 0 );
#elif defined(_XM_SSE_INTRINSICS_)
    // Bounds check
    XMVECTOR vResult = _mm_max_ps(V,g_ShortMin);
    vResult = _mm_min_ps(vResult,g_ShortMax);
    // Convert to int with rounding
    __m128i vInt = _mm_cvtps_epi32(vResult);
    // Pack the ints into shorts
    vInt = _mm_packs_epi32(vInt,vInt);
    _mm_store_ss(reinterpret_cast<float *>(&pDestination->x),_mm_castsi128_ps(vInt));
#endif
}
//------------------------------------------------------------------------------
_Use_decl_annotations_
inline void XM_CALLCONV XMStoreUShortN2
(
    XMUSHORTN2* pDestination,
    FXMVECTOR V
)
{
    assert(pDestination);
#if defined(_XM_NO_INTRINSICS_)
    XMVECTOR N = XMVectorSaturate(V);
    N = XMVectorMultiplyAdd(N, g_UShortMax, g_XMOneHalf.v);
    N = XMVectorTruncate(N);
    XMFLOAT4A tmp;
    XMStoreFloat4A( &tmp, N );
    pDestination->x = (uint16_t)tmp.x;
    pDestination->y = (uint16_t)tmp.y;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    float32x4_t R = vmaxq_f32(V, vdupq_n_f32(0.f) );
    R = vminq_f32(R, vdupq_n_f32(1.0f));
    R = vmulq_n_f32( R, 65535.0f );
    R = vaddq_f32( R, g_XMOneHalf );
    uint32x4_t vInt32 = vcvtq_u32_f32(R);
    uint16x4_t vInt16 = vqmovn_u32( vInt32 );
    vst1_lane_u32( &pDestination->v, vreinterpret_u32_u16(vInt16), 0 );
#elif defined(_XM_SSE_INTRINSICS_)
    // Bounds check
    XMVECTOR vResult = _mm_max_ps(V,g_XMZero);
    vResult = _mm_min_ps(vResult,g_XMOne);
    vResult = _mm_mul_ps(vResult,g_UShortMax);
    vResult = _mm_add_ps(vResult,g_XMOneHalf);
    // Convert to int
    __m128i vInt = _mm_cvttps_epi32(vResult);
    // Since the SSE pack instruction clamps using signed rules,
    // manually extract the values to store them to memory
    pDestination->x = static_cast<uint16_t>(_mm_extract_epi16(vInt,0));
    pDestination->y = static_cast<uint16_t>(_mm_extract_epi16(vInt,2));
#endif
}
//------------------------------------------------------------------------------
_Use_decl_annotations_
inline void XM_CALLCONV XMStoreUShort2
(
    XMUSHORT2* pDestination,
    FXMVECTOR V
)
{
    assert(pDestination);
#if defined(_XM_NO_INTRINSICS_)
    XMVECTOR N = XMVectorClamp(V, XMVectorZero(), g_UShortMax);
    N = XMVectorRound(N);
    XMFLOAT4A tmp;
    XMStoreFloat4A( &tmp, N );
    pDestination->x = (uint16_t)tmp.x;
    pDestination->y = (uint16_t)tmp.y;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    float32x4_t R = vmaxq_f32(V, vdupq_n_f32(0.f) );
    R = vminq_f32(R, vdupq_n_f32(65535.0f));
    uint32x4_t vInt32 = vcvtq_u32_f32(R);
    uint16x4_t vInt16 = vqmovn_u32( vInt32 );
    vst1_lane_u32( &pDestination->v, vreinterpret_u32_u16(vInt16), 0 );
#elif defined(_XM_SSE_INTRINSICS_)
    // Bounds check
    XMVECTOR vResult = _mm_max_ps(V,g_XMZero);
    vResult = _mm_min_ps(vResult,g_UShortMax);
    // Convert to int with rounding
    __m128i vInt = _mm_cvtps_epi32(vResult);
    // Since the SSE pack instruction clamps using signed rules,
    // manually extract the values to store them to memory
    pDestination->x = static_cast<uint16_t>(_mm_extract_epi16(vInt,0));
    pDestination->y = static_cast<uint16_t>(_mm_extract_epi16(vInt,2));
#endif
}
//------------------------------------------------------------------------------
_Use_decl_annotations_
inline void XM_CALLCONV XMStoreByteN2
(
    XMBYTEN2* pDestination,
    FXMVECTOR V
)
{
    assert(pDestination);
#if defined(_XM_NO_INTRINSICS_)
    XMVECTOR N = XMVectorClamp(V, g_XMNegativeOne.v, g_XMOne.v);
    N = XMVectorMultiply(N, g_ByteMax);
    N = XMVectorRound(N);
    XMFLOAT4A tmp;
    XMStoreFloat4A( &tmp, N );
    pDestination->x = (int8_t)tmp.x;
    pDestination->y = (int8_t)tmp.y;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    float32x4_t R = vmaxq_f32(V, vdupq_n_f32(-1.f) );
    R = vminq_f32(R, vdupq_n_f32(1.0f));
    R = vmulq_n_f32( R, 127.0f );
    int32x4_t vInt32 = vcvtq_s32_f32(R);
    int16x4_t vInt16 = vqmovn_s32( vInt32 );
    int8x8_t vInt8 = vqmovn_s16( vcombine_s16(vInt16,vInt16) );
    vst1_lane_u16( reinterpret_cast<uint16_t*>( pDestination ), vreinterpret_u16_s8(vInt8), 0 );
#elif defined(_XM_SSE_INTRINSICS_)
    // Clamp to bounds
    XMVECTOR vResult = _mm_max_ps(V,g_XMNegativeOne);
    vResult = _mm_min_ps(vResult,g_XMOne);
    // Scale by multiplication
    vResult = _mm_mul_ps(vResult,g_ByteMax);
    // Convert to int by rounding
    __m128i vInt = _mm_cvtps_epi32(vResult);
    // No SSE operations will write to 16-bit values, so we have to extract them manually
    uint16_t x = static_cast<uint16_t>(_mm_extract_epi16(vInt,0));
    uint16_t y = static_cast<uint16_t>(_mm_extract_epi16(vInt,2));
    pDestination->v = ((y & 0xFF) << 8) | (x & 0xFF);
#endif
}
//------------------------------------------------------------------------------
_Use_decl_annotations_
inline void XM_CALLCONV XMStoreByte2
(
    XMBYTE2* pDestination,
    FXMVECTOR V
)
{
    assert(pDestination);
#if defined(_XM_NO_INTRINSICS_)
    XMVECTOR N = XMVectorClamp(V, g_ByteMin, g_ByteMax);
    N = XMVectorRound(N);
    XMFLOAT4A tmp;
    XMStoreFloat4A( &tmp, N );
    pDestination->x = (int8_t)tmp.x;
    pDestination->y = (int8_t)tmp.y;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    float32x4_t R = vmaxq_f32(V, vdupq_n_f32(-127.f) );
    R = vminq_f32(R, vdupq_n_f32(127.0f));
    int32x4_t vInt32 = vcvtq_s32_f32(R);
    int16x4_t vInt16 = vqmovn_s32( vInt32 );
    int8x8_t vInt8 = vqmovn_s16( vcombine_s16(vInt16,vInt16) );
    vst1_lane_u16( reinterpret_cast<uint16_t*>( pDestination ), vreinterpret_u16_s8(vInt8), 0 );
#elif defined(_XM_SSE_INTRINSICS_)
    // Clamp to bounds
    XMVECTOR vResult = _mm_max_ps(V,g_ByteMin);
    vResult = _mm_min_ps(vResult,g_ByteMax);
    // Convert to int by rounding
    __m128i vInt = _mm_cvtps_epi32(vResult);
    // No SSE operations will write to 16-bit values, so we have to extract them manually
    uint16_t x = static_cast<uint16_t>(_mm_extract_epi16(vInt,0));
    uint16_t y = static_cast<uint16_t>(_mm_extract_epi16(vInt,2));
    pDestination->v = ((y & 0xFF) << 8) | (x & 0xFF);
#endif
}
//------------------------------------------------------------------------------
_Use_decl_annotations_
inline void XM_CALLCONV XMStoreUByteN2
(
    XMUBYTEN2* pDestination,
    FXMVECTOR V
)
{
    assert(pDestination);
#if defined(_XM_NO_INTRINSICS_)
    XMVECTOR N = XMVectorSaturate(V);
    N = XMVectorMultiplyAdd(N, g_UByteMax, g_XMOneHalf.v);
    N = XMVectorTruncate(N);
    XMFLOAT4A tmp;
    XMStoreFloat4A( &tmp, N );
    pDestination->x = (uint8_t)tmp.x;
    pDestination->y = (uint8_t)tmp.y;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    float32x4_t R = vmaxq_f32(V, vdupq_n_f32(0.f) );
    R = vminq_f32(R, vdupq_n_f32(1.0f));
    R = vmulq_n_f32( R, 255.0f );
    R = vaddq_f32( R, g_XMOneHalf );
    uint32x4_t vInt32 = vcvtq_u32_f32(R);
    uint16x4_t vInt16 = vqmovn_u32( vInt32 );
    uint8x8_t vInt8 = vqmovn_u16( vcombine_u16(vInt16,vInt16) );
    vst1_lane_u16( reinterpret_cast<uint16_t*>( pDestination ), vreinterpret_u16_u8(vInt8), 0 );
#elif defined(_XM_SSE_INTRINSICS_)
    // Clamp to bounds
    XMVECTOR vResult = _mm_max_ps(V,g_XMZero);
    vResult = _mm_min_ps(vResult,g_XMOne);
    // Scale by multiplication
    vResult = _mm_mul_ps(vResult,g_UByteMax);
    vResult = _mm_add_ps(vResult,g_XMOneHalf);
    // Convert to int
    __m128i vInt = _mm_cvttps_epi32(vResult);
    // No SSE operations will write to 16-bit values, so we have to extract them manually
    uint16_t x = static_cast<uint16_t>(_mm_extract_epi16(vInt,0));
    uint16_t y = static_cast<uint16_t>(_mm_extract_epi16(vInt,2));
    pDestination->v = ((y & 0xFF) << 8) | (x & 0xFF);
#endif
}
//------------------------------------------------------------------------------
_Use_decl_annotations_
inline void XM_CALLCONV XMStoreUByte2
(
    XMUBYTE2* pDestination,
    FXMVECTOR V
)
{
    assert(pDestination);
#if defined(_XM_NO_INTRINSICS_)
    XMVECTOR N = XMVectorClamp(V, XMVectorZero(), g_UByteMax);
    N = XMVectorRound(N);
    XMFLOAT4A tmp;
    XMStoreFloat4A( &tmp, N );
    pDestination->x = (uint8_t)tmp.x;
    pDestination->y = (uint8_t)tmp.y;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    float32x4_t R = vmaxq_f32(V, vdupq_n_f32(0.f) );
    R = vminq_f32(R, vdupq_n_f32(255.0f));
    uint32x4_t vInt32 = vcvtq_u32_f32(R);
    uint16x4_t vInt16 = vqmovn_u32( vInt32 );
    uint8x8_t vInt8 = vqmovn_u16( vcombine_u16(vInt16,vInt16) );
    vst1_lane_u16( reinterpret_cast<uint16_t*>( pDestination ), vreinterpret_u16_u8(vInt8), 0 );
#elif defined(_XM_SSE_INTRINSICS_)
    // Clamp to bounds
    XMVECTOR vResult = _mm_max_ps(V,g_XMZero);
    vResult = _mm_min_ps(vResult,g_UByteMax);
    // Convert to int by rounding
    __m128i vInt = _mm_cvtps_epi32(vResult);
    // No SSE operations will write to 16-bit values, so we have to extract them manually
    uint16_t x = static_cast<uint16_t>(_mm_extract_epi16(vInt,0));
    uint16_t y = static_cast<uint16_t>(_mm_extract_epi16(vInt,2));
    pDestination->v = ((y & 0xFF) << 8) | (x & 0xFF);
#endif
}
//------------------------------------------------------------------------------
  2302. _Use_decl_annotations_
  2303. inline void XM_CALLCONV XMStoreU565
  2304. (
  2305. XMU565* pDestination,
  2306. FXMVECTOR V
  2307. )
  2308. {
  2309. assert(pDestination);
  2310. static const XMVECTORF32 Max = { { { 31.0f, 63.0f, 31.0f, 0.0f } } };
  2311. #if defined(_XM_NO_INTRINSICS_)
  2312. XMVECTOR N = XMVectorClamp(V, XMVectorZero(), Max.v);
  2313. N = XMVectorRound(N);
  2314. XMFLOAT4A tmp;
  2315. XMStoreFloat4A( &tmp, N );
  2316. pDestination->v = (((uint16_t)tmp.z & 0x1F) << 11) |
  2317. (((uint16_t)tmp.y & 0x3F) << 5) |
  2318. (((uint16_t)tmp.x & 0x1F));
  2319. #elif defined(_XM_ARM_NEON_INTRINSICS_)
  2320. static const XMVECTORF32 Scale = { { { 1.0f, 32.f, 32.f*64.f, 0.f } } };
  2321. static const XMVECTORU32 Mask = { { { 0x1F, 0x3F << 5, 0x1F << 11, 0 } } };
  2322. float32x4_t vResult = vmaxq_f32(V,vdupq_n_f32(0));
  2323. vResult = vminq_f32(vResult,Max);
  2324. vResult = vmulq_f32(vResult,Scale);
  2325. uint32x4_t vResulti = vcvtq_u32_f32(vResult);
  2326. vResulti = vandq_u32(vResulti,Mask);
  2327. // Do a horizontal or of 4 entries
  2328. uint32x2_t vTemp = vget_low_u32(vResulti);
  2329. uint32x2_t vhi = vget_high_u32(vResulti);
  2330. vTemp = vorr_u32( vTemp, vhi );
  2331. vTemp = vpadd_u32( vTemp, vTemp );
  2332. vst1_lane_u16( &pDestination->v, vreinterpret_u16_u32( vTemp ), 0 );
  2333. #elif defined(_XM_SSE_INTRINSICS_)
  2334. // Bounds check
  2335. XMVECTOR vResult = _mm_max_ps(V,g_XMZero);
  2336. vResult = _mm_min_ps(vResult,Max);
  2337. // Convert to int with rounding
  2338. __m128i vInt = _mm_cvtps_epi32(vResult);
  2339. // No SSE operations will write to 16-bit values, so we have to extract them manually
  2340. uint16_t x = static_cast<uint16_t>(_mm_extract_epi16(vInt,0));
  2341. uint16_t y = static_cast<uint16_t>(_mm_extract_epi16(vInt,2));
  2342. uint16_t z = static_cast<uint16_t>(_mm_extract_epi16(vInt,4));
  2343. pDestination->v = ((z & 0x1F) << 11) |
  2344. ((y & 0x3F) << 5) |
  2345. ((x & 0x1F));
  2346. #endif
  2347. }
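// Illustrative usage sketch (not part of the original source; "dest" is a
// hypothetical local): XMU565 stores x in bits 0-4, y in bits 5-10, and z in
// bits 11-15, so the maximum inputs saturate the whole word.
//
//     XMU565 dest;
//     XMStoreU565(&dest, XMVectorSet(31.f, 63.f, 31.f, 0.f));
//     // dest.v == 0xFFFF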
  2348. //------------------------------------------------------------------------------
  2349. _Use_decl_annotations_
  2350. inline void XM_CALLCONV XMStoreFloat3PK
  2351. (
  2352. XMFLOAT3PK* pDestination,
  2353. FXMVECTOR V
  2354. )
  2355. {
  2356. assert(pDestination);
  2357. ALIGN(16) uint32_t ALIGN_END(16) IValue[4];
  2358. XMStoreFloat3A( reinterpret_cast<XMFLOAT3A*>(&IValue), V );
  2359. uint32_t Result[3];
  2360. // X & Y Channels (5-bit exponent, 6-bit mantissa)
  2361. for(uint32_t j=0; j < 2; ++j)
  2362. {
  2363. uint32_t Sign = IValue[j] & 0x80000000;
  2364. uint32_t I = IValue[j] & 0x7FFFFFFF;
  2365. if ((I & 0x7F800000) == 0x7F800000)
  2366. {
  2367. // INF or NAN
  2368. Result[j] = 0x7c0;
  2369. if (( I & 0x7FFFFF ) != 0)
  2370. {
  2371. Result[j] = 0x7c0 | (((I>>17)|(I>>11)|(I>>6)|(I))&0x3f);
  2372. }
  2373. else if ( Sign )
  2374. {
  2375. // -INF is clamped to 0 since 3PK is positive only
  2376. Result[j] = 0;
  2377. }
  2378. }
  2379. else if ( Sign )
  2380. {
  2381. // 3PK is positive only, so clamp to zero
  2382. Result[j] = 0;
  2383. }
  2384. else if (I > 0x477E0000U)
  2385. {
  2386. // The number is too large to be represented as a float11, set to max
  2387. Result[j] = 0x7BF;
  2388. }
  2389. else
  2390. {
  2391. if (I < 0x38800000U)
  2392. {
  2393. // The number is too small to be represented as a normalized float11
  2394. // Convert it to a denormalized value.
  2395. uint32_t Shift = 113U - (I >> 23U);
  2396. I = (0x800000U | (I & 0x7FFFFFU)) >> Shift;
  2397. }
  2398. else
  2399. {
  2400. // Rebias the exponent to represent the value as a normalized float11
  2401. I += 0xC8000000U;
  2402. }
  2403. Result[j] = ((I + 0xFFFFU + ((I >> 17U) & 1U)) >> 17U)&0x7ffU;
  2404. }
  2405. }
  2406. // Z Channel (5-bit exponent, 5-bit mantissa)
  2407. uint32_t Sign = IValue[2] & 0x80000000;
  2408. uint32_t I = IValue[2] & 0x7FFFFFFF;
  2409. if ((I & 0x7F800000) == 0x7F800000)
  2410. {
  2411. // INF or NAN
  2412. Result[2] = 0x3e0;
  2413. if ( I & 0x7FFFFF )
  2414. {
  2415. Result[2] = 0x3e0 | (((I>>18)|(I>>13)|(I>>3)|(I))&0x1f);
  2416. }
  2417. else if ( Sign )
  2418. {
  2419. // -INF is clamped to 0 since 3PK is positive only
  2420. Result[2] = 0;
  2421. }
  2422. }
  2423. else if ( Sign )
  2424. {
  2425. // 3PK is positive only, so clamp to zero
  2426. Result[2] = 0;
  2427. }
  2428. else if (I > 0x477C0000U)
  2429. {
  2430. // The number is too large to be represented as a float10, set to max
  2431. Result[2] = 0x3df;
  2432. }
  2433. else
  2434. {
  2435. if (I < 0x38800000U)
  2436. {
  2437. // The number is too small to be represented as a normalized float10
  2438. // Convert it to a denormalized value.
  2439. uint32_t Shift = 113U - (I >> 23U);
  2440. I = (0x800000U | (I & 0x7FFFFFU)) >> Shift;
  2441. }
  2442. else
  2443. {
  2444. // Rebias the exponent to represent the value as a normalized float10
  2445. I += 0xC8000000U;
  2446. }
  2447. Result[2] = ((I + 0x1FFFFU + ((I >> 18U) & 1U)) >> 18U)&0x3ffU;
  2448. }
  2449. // Pack Result into memory
  2450. pDestination->v = (Result[0] & 0x7ff)
  2451. | ( (Result[1] & 0x7ff) << 11 )
  2452. | ( (Result[2] & 0x3ff) << 22 );
  2453. }
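// Illustrative usage sketch (not part of the original source; "dest" is a
// hypothetical local): XMFLOAT3PK is the R11G11B10 packed-float layout, with
// 5-bit exponents biased by 15 and 6/6/5-bit mantissas. Storing (1, 1, 1)
// should therefore produce 0x3C0 in each 11-bit channel and 0x1E0 in the
// 10-bit channel:
//
//     XMFLOAT3PK dest;
//     XMStoreFloat3PK(&dest, XMVectorSet(1.f, 1.f, 1.f, 0.f));
//     // dest.v == 0x781E03C0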
  2454. //------------------------------------------------------------------------------
  2455. _Use_decl_annotations_
  2456. inline void XM_CALLCONV XMStoreFloat3SE
  2457. (
  2458. XMFLOAT3SE* pDestination,
  2459. FXMVECTOR V
  2460. )
  2461. {
  2462. assert(pDestination);
  2463. XMFLOAT3A tmp;
  2464. XMStoreFloat3A( &tmp, V );
  2465. static const float maxf9 = float(0x1FF << 7);
  2466. static const float minf9 = float(1.f / (1 << 16));
  2467. float x = (tmp.x >= 0.f) ? ( (tmp.x > maxf9) ? maxf9 : tmp.x ) : 0.f;
  2468. float y = (tmp.y >= 0.f) ? ( (tmp.y > maxf9) ? maxf9 : tmp.y ) : 0.f;
  2469. float z = (tmp.z >= 0.f) ? ( (tmp.z > maxf9) ? maxf9 : tmp.z ) : 0.f;
  2470. const float max_xy = (x > y) ? x : y;
  2471. const float max_xyz = (max_xy > z) ? max_xy : z;
  2472. const float maxColor = (max_xyz > minf9) ? max_xyz : minf9;
  2473. union { float f; int32_t i; } fi;
  2474. fi.f = maxColor;
  2475. fi.i += 0x00004000; // round up leaving 9 bits in fraction (including assumed 1)
  2476. uint32_t exp = fi.i >> 23;
  2477. pDestination->e = exp - 0x6f;
  2478. fi.i = 0x83000000 - (exp << 23);
  2479. float ScaleR = fi.f;
  2480. pDestination->xm = static_cast<uint32_t>( Internal::round_to_nearest(x * ScaleR) );
  2481. pDestination->ym = static_cast<uint32_t>( Internal::round_to_nearest(y * ScaleR) );
  2482. pDestination->zm = static_cast<uint32_t>( Internal::round_to_nearest(z * ScaleR) );
  2483. }
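// Illustrative usage sketch (not part of the original source; "dest" is a
// hypothetical local): XMFLOAT3SE is the shared-exponent RGB9E5 layout, where
// each channel decodes as mantissa * 2^(e - 24). Storing (1, 1, 1) should give
// e == 16 with all three mantissas equal to 256, since 256 * 2^(16 - 24) == 1.0:
//
//     XMFLOAT3SE dest;
//     XMStoreFloat3SE(&dest, XMVectorSet(1.f, 1.f, 1.f, 0.f));
//     // dest.e == 16, dest.xm == dest.ym == dest.zm == 256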
  2484. //------------------------------------------------------------------------------
  2485. _Use_decl_annotations_
  2486. inline void XM_CALLCONV XMStoreHalf4
  2487. (
  2488. XMHALF4* pDestination,
  2489. FXMVECTOR V
  2490. )
  2491. {
  2492. assert(pDestination);
  2493. #if defined(_XM_F16C_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
  2494. __m128i V1 = _mm_cvtps_ph( V, 0 );
  2495. _mm_storel_epi64( reinterpret_cast<__m128i*>(pDestination), V1 );
  2496. #else
  2497. XMFLOAT4A t;
  2498. XMStoreFloat4A(&t, V );
  2499. pDestination->x = XMConvertFloatToHalf(t.x);
  2500. pDestination->y = XMConvertFloatToHalf(t.y);
  2501. pDestination->z = XMConvertFloatToHalf(t.z);
  2502. pDestination->w = XMConvertFloatToHalf(t.w);
  2503. #endif // !_XM_F16C_INTRINSICS_
  2504. }
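// Illustrative usage sketch (not part of the original source; "dest" is a
// hypothetical local): each component is converted to an IEEE-754 half
// (binary16), either with the F16C intrinsic or via XMConvertFloatToHalf.
//
//     XMHALF4 dest;
//     XMStoreHalf4(&dest, XMVectorSet(1.f, 0.5f, -2.f, 0.f));
//     // dest.x == 0x3C00, dest.y == 0x3800, dest.z == 0xC000, dest.w == 0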
  2505. //------------------------------------------------------------------------------
  2506. _Use_decl_annotations_
  2507. inline void XM_CALLCONV XMStoreShortN4
  2508. (
  2509. XMSHORTN4* pDestination,
  2510. FXMVECTOR V
  2511. )
  2512. {
  2513. assert(pDestination);
  2514. #if defined(_XM_NO_INTRINSICS_)
  2515. XMVECTOR N = XMVectorClamp(V, g_XMNegativeOne.v, g_XMOne.v);
  2516. N = XMVectorMultiply(N, g_ShortMax);
  2517. N = XMVectorRound(N);
  2518. XMFLOAT4A tmp;
  2519. XMStoreFloat4A(&tmp, N );
  2520. pDestination->x = (int16_t)tmp.x;
  2521. pDestination->y = (int16_t)tmp.y;
  2522. pDestination->z = (int16_t)tmp.z;
  2523. pDestination->w = (int16_t)tmp.w;
  2524. #elif defined(_XM_ARM_NEON_INTRINSICS_)
  2525. float32x4_t vResult = vmaxq_f32( V, vdupq_n_f32(-1.f) );
  2526. vResult = vminq_f32( vResult, vdupq_n_f32(1.0f) );
  2527. vResult = vmulq_n_f32( vResult, 32767.0f );
int32x4_t vResulti = vcvtq_s32_f32( vResult );
int16x4_t vInt = vmovn_s32( vResulti );
  2530. vst1_s16( reinterpret_cast<int16_t*>(pDestination), vInt );
  2531. #elif defined(_XM_SSE_INTRINSICS_)
  2532. XMVECTOR vResult = _mm_max_ps(V,g_XMNegativeOne);
  2533. vResult = _mm_min_ps(vResult,g_XMOne);
  2534. vResult = _mm_mul_ps(vResult,g_ShortMax);
  2535. __m128i vResulti = _mm_cvtps_epi32(vResult);
  2536. vResulti = _mm_packs_epi32(vResulti,vResulti);
  2537. _mm_store_sd(reinterpret_cast<double *>(&pDestination->x),_mm_castsi128_pd(vResulti));
  2538. #endif
  2539. }
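// Illustrative usage sketch (not part of the original source; "dest" is a
// hypothetical local): XMStoreShortN4 maps [-1, 1] to the signed-normalized
// range [-32767, 32767].
//
//     XMSHORTN4 dest;
//     XMStoreShortN4(&dest, XMVectorSet(1.f, -1.f, 0.f, 1.f));
//     // dest.x == 32767, dest.y == -32767, dest.z == 0, dest.w == 32767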
  2540. //------------------------------------------------------------------------------
  2541. _Use_decl_annotations_
  2542. inline void XM_CALLCONV XMStoreShort4
  2543. (
  2544. XMSHORT4* pDestination,
  2545. FXMVECTOR V
  2546. )
  2547. {
  2548. assert(pDestination);
  2549. #if defined(_XM_NO_INTRINSICS_)
  2550. XMVECTOR N = XMVectorClamp(V, g_ShortMin, g_ShortMax);
  2551. N = XMVectorRound(N);
  2552. XMFLOAT4A tmp;
  2553. XMStoreFloat4A(&tmp, N );
  2554. pDestination->x = (int16_t)tmp.x;
  2555. pDestination->y = (int16_t)tmp.y;
  2556. pDestination->z = (int16_t)tmp.z;
  2557. pDestination->w = (int16_t)tmp.w;
  2558. #elif defined(_XM_ARM_NEON_INTRINSICS_)
  2559. float32x4_t vResult = vmaxq_f32( V, g_ShortMin );
  2560. vResult = vminq_f32( vResult, g_ShortMax );
int32x4_t vResulti = vcvtq_s32_f32( vResult );
int16x4_t vInt = vmovn_s32( vResulti );
  2563. vst1_s16( reinterpret_cast<int16_t*>(pDestination), vInt );
  2564. #elif defined(_XM_SSE_INTRINSICS_)
  2565. // Bounds check
  2566. XMVECTOR vResult = _mm_max_ps(V,g_ShortMin);
  2567. vResult = _mm_min_ps(vResult,g_ShortMax);
  2568. // Convert to int with rounding
  2569. __m128i vInt = _mm_cvtps_epi32(vResult);
  2570. // Pack the ints into shorts
  2571. vInt = _mm_packs_epi32(vInt,vInt);
  2572. _mm_store_sd(reinterpret_cast<double *>(&pDestination->x),_mm_castsi128_pd(vInt));
  2573. #endif
  2574. }
  2575. //------------------------------------------------------------------------------
  2576. _Use_decl_annotations_
  2577. inline void XM_CALLCONV XMStoreUShortN4
  2578. (
  2579. XMUSHORTN4* pDestination,
  2580. FXMVECTOR V
  2581. )
  2582. {
  2583. assert(pDestination);
  2584. #if defined(_XM_NO_INTRINSICS_)
  2585. XMVECTOR N = XMVectorSaturate(V);
  2586. N = XMVectorMultiplyAdd(N, g_UShortMax, g_XMOneHalf.v);
  2587. N = XMVectorTruncate(N);
  2588. XMFLOAT4A tmp;
  2589. XMStoreFloat4A(&tmp, N );
pDestination->x = (uint16_t)tmp.x;
pDestination->y = (uint16_t)tmp.y;
pDestination->z = (uint16_t)tmp.z;
pDestination->w = (uint16_t)tmp.w;
  2594. #elif defined(_XM_ARM_NEON_INTRINSICS_)
  2595. float32x4_t vResult = vmaxq_f32( V, vdupq_n_f32(0) );
  2596. vResult = vminq_f32( vResult, vdupq_n_f32(1.0f) );
  2597. vResult = vmulq_n_f32( vResult, 65535.0f );
  2598. vResult = vaddq_f32( vResult, g_XMOneHalf );
uint32x4_t vResulti = vcvtq_u32_f32( vResult );
uint16x4_t vInt = vmovn_u32( vResulti );
  2601. vst1_u16( reinterpret_cast<uint16_t*>(pDestination), vInt );
  2602. #elif defined(_XM_SSE_INTRINSICS_)
  2603. // Bounds check
  2604. XMVECTOR vResult = _mm_max_ps(V,g_XMZero);
  2605. vResult = _mm_min_ps(vResult,g_XMOne);
  2606. vResult = _mm_mul_ps(vResult,g_UShortMax);
  2607. vResult = _mm_add_ps(vResult,g_XMOneHalf);
  2608. // Convert to int
  2609. __m128i vInt = _mm_cvttps_epi32(vResult);
  2610. // Since the SSE pack instruction clamps using signed rules,
  2611. // manually extract the values to store them to memory
pDestination->x = static_cast<uint16_t>(_mm_extract_epi16(vInt,0));
pDestination->y = static_cast<uint16_t>(_mm_extract_epi16(vInt,2));
pDestination->z = static_cast<uint16_t>(_mm_extract_epi16(vInt,4));
pDestination->w = static_cast<uint16_t>(_mm_extract_epi16(vInt,6));
  2616. #endif
  2617. }
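// Illustrative usage sketch (not part of the original source; "dest" is a
// hypothetical local): XMStoreUShortN4 maps [0, 1] to the unsigned-normalized
// range [0, 65535], adding 0.5 before the truncating conversion so the result
// is rounded.
//
//     XMUSHORTN4 dest;
//     XMStoreUShortN4(&dest, XMVectorSet(1.f, 0.f, 1.f, 0.f));
//     // dest.x == 65535, dest.y == 0, dest.z == 65535, dest.w == 0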
  2618. //------------------------------------------------------------------------------
  2619. _Use_decl_annotations_
  2620. inline void XM_CALLCONV XMStoreUShort4
  2621. (
  2622. XMUSHORT4* pDestination,
  2623. FXMVECTOR V
  2624. )
  2625. {
  2626. assert(pDestination);
  2627. #if defined(_XM_NO_INTRINSICS_)
  2628. XMVECTOR N = XMVectorClamp(V, XMVectorZero(), g_UShortMax);
  2629. N = XMVectorRound(N);
  2630. XMFLOAT4A tmp;
  2631. XMStoreFloat4A(&tmp, N );
pDestination->x = (uint16_t)tmp.x;
pDestination->y = (uint16_t)tmp.y;
pDestination->z = (uint16_t)tmp.z;
pDestination->w = (uint16_t)tmp.w;
  2636. #elif defined(_XM_ARM_NEON_INTRINSICS_)
  2637. float32x4_t vResult = vmaxq_f32( V, vdupq_n_f32(0) );
  2638. vResult = vminq_f32( vResult, g_UShortMax );
uint32x4_t vResulti = vcvtq_u32_f32( vResult );
uint16x4_t vInt = vmovn_u32( vResulti );
  2641. vst1_u16( reinterpret_cast<uint16_t*>(pDestination), vInt );
  2642. #elif defined(_XM_SSE_INTRINSICS_)
  2643. // Bounds check
  2644. XMVECTOR vResult = _mm_max_ps(V,g_XMZero);
  2645. vResult = _mm_min_ps(vResult,g_UShortMax);
  2646. // Convert to int with rounding
  2647. __m128i vInt = _mm_cvtps_epi32(vResult);
  2648. // Since the SSE pack instruction clamps using signed rules,
  2649. // manually extract the values to store them to memory
pDestination->x = static_cast<uint16_t>(_mm_extract_epi16(vInt,0));
pDestination->y = static_cast<uint16_t>(_mm_extract_epi16(vInt,2));
pDestination->z = static_cast<uint16_t>(_mm_extract_epi16(vInt,4));
pDestination->w = static_cast<uint16_t>(_mm_extract_epi16(vInt,6));
  2654. #endif
  2655. }
  2656. //------------------------------------------------------------------------------
  2657. _Use_decl_annotations_
  2658. inline void XM_CALLCONV XMStoreXDecN4
  2659. (
  2660. XMXDECN4* pDestination,
  2661. FXMVECTOR V
  2662. )
  2663. {
  2664. assert(pDestination);
  2665. static const XMVECTORF32 Min = { { { -1.0f, -1.0f, -1.0f, 0.0f } } };
  2666. #if defined(_XM_NO_INTRINSICS_)
  2667. static const XMVECTORF32 Scale = { { { 511.0f, 511.0f, 511.0f, 3.0f } } };
  2668. XMVECTOR N = XMVectorClamp(V, Min.v, g_XMOne.v);
  2669. N = XMVectorMultiply(N, Scale.v);
  2670. N = XMVectorRound(N);
  2671. XMFLOAT4A tmp;
  2672. XMStoreFloat4A(&tmp, N );
  2673. pDestination->v = ((uint32_t)tmp.w << 30) |
  2674. (((int32_t)tmp.z & 0x3FF) << 20) |
  2675. (((int32_t)tmp.y & 0x3FF) << 10) |
  2676. (((int32_t)tmp.x & 0x3FF));
  2677. #elif defined(_XM_ARM_NEON_INTRINSICS_)
  2678. static const XMVECTORF32 Scale = { { { 511.0f, 511.0f*1024.0f, 511.0f*1048576.0f, 3.0f*536870912.0f } } };
  2679. static const XMVECTORI32 ScaleMask = { { { 0x3FF, 0x3FF << 10, 0x3FF << 20, 0x3 << 29 } } };
  2680. float32x4_t vResult = vmaxq_f32(V,Min);
  2681. vResult = vminq_f32(vResult,vdupq_n_f32(1.0f));
  2682. vResult = vmulq_f32(vResult,Scale);
  2683. int32x4_t vResulti = vcvtq_s32_f32(vResult);
  2684. vResulti = vandq_s32(vResulti,ScaleMask);
  2685. int32x4_t vResultw = vandq_s32(vResulti,g_XMMaskW);
  2686. vResulti = vaddq_s32(vResulti,vResultw);
  2687. // Do a horizontal or of all 4 entries
  2688. uint32x2_t vTemp = vget_low_u32(vreinterpret_u32_s32(vResulti));
  2689. uint32x2_t vhi = vget_high_u32(vreinterpret_u32_s32(vResulti));
  2690. vTemp = vorr_u32( vTemp, vhi );
  2691. vTemp = vpadd_u32( vTemp, vTemp );
  2692. vst1_lane_u32( &pDestination->v, vTemp, 0 );
  2693. #elif defined(_XM_SSE_INTRINSICS_)
  2694. static const XMVECTORF32 Scale = { { { 511.0f, 511.0f*1024.0f, 511.0f*1048576.0f, 3.0f*536870912.0f } } };
  2695. static const XMVECTORI32 ScaleMask = { { { 0x3FF, 0x3FF << 10, 0x3FF << 20, 0x3 << 29 } } };
  2696. XMVECTOR vResult = _mm_max_ps(V,Min);
  2697. vResult = _mm_min_ps(vResult,g_XMOne);
  2698. // Scale by multiplication
  2699. vResult = _mm_mul_ps(vResult,Scale);
  2700. // Convert to int (W is unsigned)
  2701. __m128i vResulti = _mm_cvtps_epi32(vResult);
  2702. // Mask off any fraction
  2703. vResulti = _mm_and_si128(vResulti,ScaleMask);
  2704. // To fix W, add itself to shift it up to <<30 instead of <<29
  2705. __m128i vResultw = _mm_and_si128(vResulti,g_XMMaskW);
  2706. vResulti = _mm_add_epi32(vResulti,vResultw);
  2707. // Do a horizontal or of all 4 entries
  2708. vResult = XM_PERMUTE_PS(_mm_castsi128_ps(vResulti),_MM_SHUFFLE(0,3,2,1));
  2709. vResulti = _mm_or_si128(vResulti,_mm_castps_si128(vResult));
  2710. vResult = XM_PERMUTE_PS(vResult,_MM_SHUFFLE(0,3,2,1));
  2711. vResulti = _mm_or_si128(vResulti,_mm_castps_si128(vResult));
  2712. vResult = XM_PERMUTE_PS(vResult,_MM_SHUFFLE(0,3,2,1));
  2713. vResulti = _mm_or_si128(vResulti,_mm_castps_si128(vResult));
  2714. _mm_store_ss(reinterpret_cast<float *>(&pDestination->v),_mm_castsi128_ps(vResulti));
  2715. #endif
  2716. }
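// Illustrative usage sketch (not part of the original source; "dest" is a
// hypothetical local): XMXDECN4 packs three signed-normalized 10-bit channels
// (x, y, z scaled by 511 into bits 0-9, 10-19, and 20-29) plus a 2-bit unsigned
// w scaled by 3 in bits 30-31.
//
//     XMXDECN4 dest;
//     XMStoreXDecN4(&dest, XMVectorSet(1.f, 0.f, 0.f, 1.f));
//     // dest.v == 0xC00001FF  (w == 3, z == 0, y == 0, x == 511)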
  2717. //------------------------------------------------------------------------------
  2718. #pragma warning(push)
  2719. #pragma warning(disable : 4996)
  2720. // C4996: ignore deprecation warning
  2721. _Use_decl_annotations_
  2722. inline void XM_CALLCONV XMStoreXDec4
  2723. (
  2724. XMXDEC4* pDestination,
  2725. FXMVECTOR V
  2726. )
  2727. {
  2728. assert(pDestination);
  2729. static const XMVECTORF32 MinXDec4 = { { { -511.0f, -511.0f, -511.0f, 0.0f } } };
  2730. static const XMVECTORF32 MaxXDec4 = { { { 511.0f, 511.0f, 511.0f, 3.0f } } };
  2731. #if defined(_XM_NO_INTRINSICS_)
  2732. XMVECTOR N = XMVectorClamp(V, MinXDec4, MaxXDec4);
  2733. XMFLOAT4A tmp;
  2734. XMStoreFloat4A(&tmp, N );
  2735. pDestination->v = ((uint32_t)tmp.w << 30) |
  2736. (((int32_t)tmp.z & 0x3FF) << 20) |
  2737. (((int32_t)tmp.y & 0x3FF) << 10) |
  2738. (((int32_t)tmp.x & 0x3FF));
  2739. #elif defined(_XM_ARM_NEON_INTRINSICS_)
  2740. static const XMVECTORF32 ScaleXDec4 = { { { 1.0f, 1024.0f / 2.0f, 1024.0f*1024.0f, 1024.0f*1024.0f*1024.0f / 2.0f } } };
  2741. static const XMVECTORI32 MaskXDec4 = { { { 0x3FF, 0x3FF << (10 - 1), 0x3FF << 20, 0x3 << (30 - 1) } } };
  2742. float32x4_t vResult = vmaxq_f32(V,MinXDec4);
  2743. vResult = vminq_f32(vResult,MaxXDec4);
  2744. vResult = vmulq_f32(vResult,ScaleXDec4);
  2745. int32x4_t vResulti = vcvtq_s32_f32(vResult);
  2746. vResulti = vandq_s32(vResulti,MaskXDec4);
  2747. // Do a horizontal or of 4 entries
  2748. uint32x2_t vTemp = vget_low_u32(vreinterpret_u32_s32(vResulti));
  2749. uint32x2_t vTemp2 = vget_high_u32(vreinterpret_u32_s32(vResulti));
  2750. vTemp = vorr_u32( vTemp, vTemp2 );
  2751. // Perform a single bit left shift on y|w
  2752. vTemp2 = vdup_lane_u32( vTemp, 1 );
vTemp2 = vadd_u32( vTemp2, vTemp2 );
  2754. vTemp = vorr_u32( vTemp, vTemp2 );
  2755. vst1_lane_u32( &pDestination->v, vTemp, 0 );
  2756. #elif defined(_XM_SSE_INTRINSICS_)
  2757. static const XMVECTORF32 ScaleXDec4 = { { { 1.0f, 1024.0f / 2.0f, 1024.0f*1024.0f, 1024.0f*1024.0f*1024.0f / 2.0f } } };
  2758. static const XMVECTORI32 MaskXDec4 = { { { 0x3FF, 0x3FF << (10 - 1), 0x3FF << 20, 0x3 << (30 - 1) } } };
  2759. // Clamp to bounds
  2760. XMVECTOR vResult = _mm_max_ps(V,MinXDec4);
  2761. vResult = _mm_min_ps(vResult,MaxXDec4);
  2762. // Scale by multiplication
  2763. vResult = _mm_mul_ps(vResult,ScaleXDec4);
  2764. // Convert to int
  2765. __m128i vResulti = _mm_cvttps_epi32(vResult);
  2766. // Mask off any fraction
  2767. vResulti = _mm_and_si128(vResulti,MaskXDec4);
  2768. // Do a horizontal or of 4 entries
  2769. __m128i vResulti2 = _mm_shuffle_epi32(vResulti,_MM_SHUFFLE(3,2,3,2));
  2770. // x = x|z, y = y|w
  2771. vResulti = _mm_or_si128(vResulti,vResulti2);
  2772. // Move Z to the x position
  2773. vResulti2 = _mm_shuffle_epi32(vResulti,_MM_SHUFFLE(1,1,1,1));
  2774. // Perform a single bit left shift on y|w
  2775. vResulti2 = _mm_add_epi32(vResulti2,vResulti2);
  2776. // i = x|y|z|w
  2777. vResulti = _mm_or_si128(vResulti,vResulti2);
  2778. _mm_store_ss(reinterpret_cast<float *>(&pDestination->v),_mm_castsi128_ps(vResulti));
  2779. #endif
  2780. }
  2781. #pragma warning(pop)
  2782. //------------------------------------------------------------------------------
  2783. _Use_decl_annotations_
  2784. inline void XM_CALLCONV XMStoreUDecN4
  2785. (
  2786. XMUDECN4* pDestination,
  2787. FXMVECTOR V
  2788. )
  2789. {
  2790. assert(pDestination);
  2791. #if defined(_XM_NO_INTRINSICS_)
  2792. static const XMVECTORF32 Scale = { { { 1023.0f, 1023.0f, 1023.0f, 3.0f } } };
  2793. XMVECTOR N = XMVectorSaturate(V);
  2794. N = XMVectorMultiply(N, Scale.v);
  2795. XMFLOAT4A tmp;
  2796. XMStoreFloat4A(&tmp, N );
  2797. pDestination->v = ((uint32_t)tmp.w << 30) |
  2798. (((uint32_t)tmp.z & 0x3FF) << 20) |
  2799. (((uint32_t)tmp.y & 0x3FF) << 10) |
  2800. (((uint32_t)tmp.x & 0x3FF));
  2801. #elif defined(_XM_ARM_NEON_INTRINSICS_)
  2802. static const XMVECTORF32 ScaleUDecN4 = { { { 1023.0f, 1023.0f*1024.0f*0.5f, 1023.0f*1024.0f*1024.0f, 3.0f*1024.0f*1024.0f*1024.0f*0.5f } } };
  2803. static const XMVECTORI32 MaskUDecN4 = { { { 0x3FF, 0x3FF << (10 - 1), 0x3FF << 20, 0x3 << (30 - 1) } } };
  2804. float32x4_t vResult = vmaxq_f32(V,vdupq_n_f32(0.f));
  2805. vResult = vminq_f32(vResult,vdupq_n_f32(1.f));
  2806. vResult = vmulq_f32(vResult,ScaleUDecN4);
  2807. uint32x4_t vResulti = vcvtq_u32_f32(vResult);
  2808. vResulti = vandq_u32(vResulti,MaskUDecN4);
  2809. // Do a horizontal or of 4 entries
  2810. uint32x2_t vTemp = vget_low_u32(vResulti);
  2811. uint32x2_t vTemp2 = vget_high_u32(vResulti);
  2812. vTemp = vorr_u32( vTemp, vTemp2 );
  2813. // Perform a single bit left shift on y|w
  2814. vTemp2 = vdup_lane_u32( vTemp, 1 );
  2815. vTemp2 = vadd_u32( vTemp2, vTemp2 );
  2816. vTemp = vorr_u32( vTemp, vTemp2 );
  2817. vst1_lane_u32( &pDestination->v, vTemp, 0 );
  2818. #elif defined(_XM_SSE_INTRINSICS_)
  2819. static const XMVECTORF32 ScaleUDecN4 = { { { 1023.0f, 1023.0f*1024.0f*0.5f, 1023.0f*1024.0f*1024.0f, 3.0f*1024.0f*1024.0f*1024.0f*0.5f } } };
  2820. static const XMVECTORI32 MaskUDecN4 = { { { 0x3FF, 0x3FF << (10 - 1), 0x3FF << 20, 0x3 << (30 - 1) } } };
  2821. // Clamp to bounds
  2822. XMVECTOR vResult = _mm_max_ps(V,g_XMZero);
  2823. vResult = _mm_min_ps(vResult,g_XMOne);
  2824. // Scale by multiplication
  2825. vResult = _mm_mul_ps(vResult,ScaleUDecN4);
  2826. // Convert to int
  2827. __m128i vResulti = _mm_cvttps_epi32(vResult);
  2828. // Mask off any fraction
  2829. vResulti = _mm_and_si128(vResulti,MaskUDecN4);
  2830. // Do a horizontal or of 4 entries
  2831. __m128i vResulti2 = _mm_shuffle_epi32(vResulti,_MM_SHUFFLE(3,2,3,2));
  2832. // x = x|z, y = y|w
  2833. vResulti = _mm_or_si128(vResulti,vResulti2);
  2834. // Move Z to the x position
  2835. vResulti2 = _mm_shuffle_epi32(vResulti,_MM_SHUFFLE(1,1,1,1));
  2836. // Perform a left shift by one bit on y|w
  2837. vResulti2 = _mm_add_epi32(vResulti2,vResulti2);
  2838. // i = x|y|z|w
  2839. vResulti = _mm_or_si128(vResulti,vResulti2);
  2840. _mm_store_ss(reinterpret_cast<float *>(&pDestination->v),_mm_castsi128_ps(vResulti));
  2841. #endif
  2842. }
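// Illustrative usage sketch (not part of the original source; "dest" is a
// hypothetical local): XMUDECN4 is the unsigned-normalized 10:10:10:2 layout
// (as used by R10G10B10A2_UNORM data), so storing all ones saturates every field.
//
//     XMUDECN4 dest;
//     XMStoreUDecN4(&dest, XMVectorSet(1.f, 1.f, 1.f, 1.f));
//     // dest.v == 0xFFFFFFFF  (x == y == z == 1023, w == 3)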
  2843. //------------------------------------------------------------------------------
  2844. _Use_decl_annotations_
  2845. inline void XM_CALLCONV XMStoreUDecN4_XR
  2846. (
  2847. XMUDECN4* pDestination,
  2848. FXMVECTOR V
  2849. )
  2850. {
  2851. assert(pDestination);
  2852. static const XMVECTORF32 Scale = { { { 510.0f, 510.0f, 510.0f, 3.0f } } };
  2853. static const XMVECTORF32 Bias = { { { 384.0f, 384.0f, 384.0f, 0.0f } } };
  2854. static const XMVECTORF32 C = { { { 1023.f, 1023.f, 1023.f, 3.f } } };
  2855. #if defined(_XM_NO_INTRINSICS_)
  2856. XMVECTOR N = XMVectorMultiplyAdd( V, Scale, Bias );
  2857. N = XMVectorClamp( N, g_XMZero, C );
  2858. XMFLOAT4A tmp;
  2859. XMStoreFloat4A(&tmp, N );
  2860. pDestination->v = ((uint32_t)tmp.w << 30)
  2861. | (((uint32_t)tmp.z & 0x3FF) << 20)
  2862. | (((uint32_t)tmp.y & 0x3FF) << 10)
  2863. | (((uint32_t)tmp.x & 0x3FF));
  2864. #elif defined(_XM_ARM_NEON_INTRINSICS_)
  2865. static const XMVECTORF32 Shift = { { { 1.0f, 1024.0f*0.5f, 1024.0f*1024.0f, 1024.0f*1024.0f*1024.0f*0.5f } } };
  2866. static const XMVECTORU32 MaskUDecN4 = { { { 0x3FF, 0x3FF << (10 - 1), 0x3FF << 20, 0x3 << (30 - 1) } } };
  2867. float32x4_t vResult = vmlaq_f32( Bias, V, Scale );
  2868. vResult = vmaxq_f32(vResult,vdupq_n_f32(0.f));
  2869. vResult = vminq_f32(vResult,C);
  2870. vResult = vmulq_f32(vResult,Shift);
  2871. uint32x4_t vResulti = vcvtq_u32_f32(vResult);
  2872. vResulti = vandq_u32(vResulti,MaskUDecN4);
  2873. // Do a horizontal or of 4 entries
  2874. uint32x2_t vTemp = vget_low_u32(vResulti);
  2875. uint32x2_t vTemp2 = vget_high_u32(vResulti);
  2876. vTemp = vorr_u32( vTemp, vTemp2 );
  2877. // Perform a single bit left shift on y|w
  2878. vTemp2 = vdup_lane_u32( vTemp, 1 );
  2879. vTemp2 = vadd_u32( vTemp2, vTemp2 );
  2880. vTemp = vorr_u32( vTemp, vTemp2 );
  2881. vst1_lane_u32( &pDestination->v, vTemp, 0 );
  2882. #elif defined(_XM_SSE_INTRINSICS_)
  2883. static const XMVECTORF32 Shift = { { { 1.0f, 1024.0f*0.5f, 1024.0f*1024.0f, 1024.0f*1024.0f*1024.0f*0.5f } } };
  2884. static const XMVECTORU32 MaskUDecN4 = { { { 0x3FF, 0x3FF << (10 - 1), 0x3FF << 20, 0x3 << (30 - 1) } } };
  2885. // Scale & bias
  2886. XMVECTOR vResult = _mm_mul_ps( V, Scale );
  2887. vResult = _mm_add_ps( vResult, Bias );
  2888. // Clamp to bounds
  2889. vResult = _mm_max_ps(vResult,g_XMZero);
  2890. vResult = _mm_min_ps(vResult,C);
  2891. // Scale by shift values
  2892. vResult = _mm_mul_ps(vResult,Shift);
  2893. // Convert to int
  2894. __m128i vResulti = _mm_cvttps_epi32(vResult);
  2895. // Mask off any fraction
  2896. vResulti = _mm_and_si128(vResulti,MaskUDecN4);
  2897. // Do a horizontal or of 4 entries
  2898. __m128i vResulti2 = _mm_shuffle_epi32(vResulti,_MM_SHUFFLE(3,2,3,2));
  2899. // x = x|z, y = y|w
  2900. vResulti = _mm_or_si128(vResulti,vResulti2);
  2901. // Move Z to the x position
  2902. vResulti2 = _mm_shuffle_epi32(vResulti,_MM_SHUFFLE(1,1,1,1));
  2903. // Perform a left shift by one bit on y|w
  2904. vResulti2 = _mm_add_epi32(vResulti2,vResulti2);
  2905. // i = x|y|z|w
  2906. vResulti = _mm_or_si128(vResulti,vResulti2);
  2907. _mm_store_ss(reinterpret_cast<float *>(&pDestination->v),_mm_castsi128_ps(vResulti));
  2908. #endif
  2909. }
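// Illustrative note (not part of the original source; "dest" is a hypothetical
// local): the XR-biased variant stores v * 510 + 384 per color channel, clamped
// to [0, 1023], so 0.0 encodes as 384, 1.0 as 894, and the representable range
// is [-384/510, 639/510], roughly [-0.753, 1.253].
//
//     XMUDECN4 dest;
//     XMStoreUDecN4_XR(&dest, XMVectorSet(0.f, 1.f, 0.f, 1.f));
//     // dest.v encodes x == 384, y == 894, z == 384, w == 3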
  2910. //------------------------------------------------------------------------------
  2911. _Use_decl_annotations_
  2912. inline void XM_CALLCONV XMStoreUDec4
  2913. (
  2914. XMUDEC4* pDestination,
  2915. FXMVECTOR V
  2916. )
  2917. {
  2918. assert(pDestination);
  2919. static const XMVECTORF32 MaxUDec4 = { { { 1023.0f, 1023.0f, 1023.0f, 3.0f } } };
  2920. #if defined(_XM_NO_INTRINSICS_)
  2921. XMVECTOR N = XMVectorClamp(V, XMVectorZero(), MaxUDec4);
  2922. XMFLOAT4A tmp;
  2923. XMStoreFloat4A(&tmp, N );
  2924. pDestination->v = ((uint32_t)tmp.w << 30) |
  2925. (((uint32_t)tmp.z & 0x3FF) << 20) |
  2926. (((uint32_t)tmp.y & 0x3FF) << 10) |
  2927. (((uint32_t)tmp.x & 0x3FF));
  2928. #elif defined(_XM_ARM_NEON_INTRINSICS_)
  2929. static const XMVECTORF32 ScaleUDec4 = { { { 1.0f, 1024.0f / 2.0f, 1024.0f*1024.0f, 1024.0f*1024.0f*1024.0f / 2.0f } } };
  2930. static const XMVECTORI32 MaskUDec4 = { { { 0x3FF, 0x3FF << (10 - 1), 0x3FF << 20, 0x3 << (30 - 1) } } };
  2931. float32x4_t vResult = vmaxq_f32(V,vdupq_n_f32(0.f));
  2932. vResult = vminq_f32(vResult,MaxUDec4);
  2933. vResult = vmulq_f32(vResult,ScaleUDec4);
  2934. uint32x4_t vResulti = vcvtq_u32_f32(vResult);
  2935. vResulti = vandq_u32(vResulti,MaskUDec4);
  2936. // Do a horizontal or of 4 entries
  2937. uint32x2_t vTemp = vget_low_u32(vResulti);
  2938. uint32x2_t vTemp2 = vget_high_u32(vResulti);
  2939. vTemp = vorr_u32( vTemp, vTemp2 );
  2940. // Perform a single bit left shift on y|w
  2941. vTemp2 = vdup_lane_u32( vTemp, 1 );
  2942. vTemp2 = vadd_u32( vTemp2, vTemp2 );
  2943. vTemp = vorr_u32( vTemp, vTemp2 );
  2944. vst1_lane_u32( &pDestination->v, vTemp, 0 );
  2945. #elif defined(_XM_SSE_INTRINSICS_)
  2946. static const XMVECTORF32 ScaleUDec4 = { { { 1.0f, 1024.0f / 2.0f, 1024.0f*1024.0f, 1024.0f*1024.0f*1024.0f / 2.0f } } };
  2947. static const XMVECTORI32 MaskUDec4 = { { { 0x3FF, 0x3FF << (10 - 1), 0x3FF << 20, 0x3 << (30 - 1) } } };
  2948. // Clamp to bounds
  2949. XMVECTOR vResult = _mm_max_ps(V,g_XMZero);
  2950. vResult = _mm_min_ps(vResult,MaxUDec4);
  2951. // Scale by multiplication
  2952. vResult = _mm_mul_ps(vResult,ScaleUDec4);
  2953. // Convert to int
  2954. __m128i vResulti = _mm_cvttps_epi32(vResult);
  2955. // Mask off any fraction
  2956. vResulti = _mm_and_si128(vResulti,MaskUDec4);
  2957. // Do a horizontal or of 4 entries
  2958. __m128i vResulti2 = _mm_shuffle_epi32(vResulti,_MM_SHUFFLE(3,2,3,2));
  2959. // x = x|z, y = y|w
  2960. vResulti = _mm_or_si128(vResulti,vResulti2);
  2961. // Move Z to the x position
  2962. vResulti2 = _mm_shuffle_epi32(vResulti,_MM_SHUFFLE(1,1,1,1));
  2963. // Perform a left shift by one bit on y|w
  2964. vResulti2 = _mm_add_epi32(vResulti2,vResulti2);
  2965. // i = x|y|z|w
  2966. vResulti = _mm_or_si128(vResulti,vResulti2);
  2967. _mm_store_ss(reinterpret_cast<float *>(&pDestination->v),_mm_castsi128_ps(vResulti));
  2968. #endif
  2969. }
  2970. //------------------------------------------------------------------------------
  2971. #pragma warning(push)
  2972. #pragma warning(disable : 4996)
  2973. // C4996: ignore deprecation warning
  2974. _Use_decl_annotations_
  2975. inline void XM_CALLCONV XMStoreDecN4
  2976. (
  2977. XMDECN4* pDestination,
  2978. FXMVECTOR V
  2979. )
  2980. {
  2981. assert(pDestination);
  2982. #if defined(_XM_NO_INTRINSICS_)
  2983. static const XMVECTORF32 Scale = { { { 511.0f, 511.0f, 511.0f, 1.0f } } };
  2984. XMVECTOR N = XMVectorClamp(V, g_XMNegativeOne.v, g_XMOne.v);
  2985. N = XMVectorMultiply(N, Scale.v);
  2986. XMFLOAT4A tmp;
  2987. XMStoreFloat4A(&tmp, N );
  2988. pDestination->v = ((int32_t)tmp.w << 30) |
  2989. (((int32_t)tmp.z & 0x3FF) << 20) |
  2990. (((int32_t)tmp.y & 0x3FF) << 10) |
  2991. (((int32_t)tmp.x & 0x3FF));
  2992. #elif defined(_XM_ARM_NEON_INTRINSICS_)
  2993. static const XMVECTORF32 ScaleDecN4 = { { { 511.0f, 511.0f*1024.0f, 511.0f*1024.0f*1024.0f, 1.0f*1024.0f*1024.0f*1024.0f } } };
  2994. float32x4_t vResult = vmaxq_f32(V,vdupq_n_f32(-1.f));
  2995. vResult = vminq_f32(vResult,vdupq_n_f32(1.f));
  2996. vResult = vmulq_f32(vResult,ScaleDecN4);
  2997. int32x4_t vResulti = vcvtq_s32_f32(vResult);
  2998. vResulti = vandq_s32(vResulti,g_XMMaskDec4);
  2999. // Do a horizontal or of 4 entries
  3000. uint32x2_t vTemp = vget_low_u32(vreinterpret_u32_s32(vResulti));
  3001. uint32x2_t vhi = vget_high_u32(vreinterpret_u32_s32(vResulti));
  3002. vTemp = vorr_u32( vTemp, vhi );
  3003. vTemp = vpadd_u32( vTemp, vTemp );
  3004. vst1_lane_u32( &pDestination->v, vTemp, 0 );
  3005. #elif defined(_XM_SSE_INTRINSICS_)
  3006. static const XMVECTORF32 ScaleDecN4 = { { { 511.0f, 511.0f*1024.0f, 511.0f*1024.0f*1024.0f, 1.0f*1024.0f*1024.0f*1024.0f } } };
  3007. // Clamp to bounds
  3008. XMVECTOR vResult = _mm_max_ps(V,g_XMNegativeOne);
  3009. vResult = _mm_min_ps(vResult,g_XMOne);
  3010. // Scale by multiplication
  3011. vResult = _mm_mul_ps(vResult,ScaleDecN4);
  3012. // Convert to int
  3013. __m128i vResulti = _mm_cvttps_epi32(vResult);
  3014. // Mask off any fraction
  3015. vResulti = _mm_and_si128(vResulti,g_XMMaskDec4);
  3016. // Do a horizontal or of 4 entries
  3017. __m128i vResulti2 = _mm_shuffle_epi32(vResulti,_MM_SHUFFLE(3,2,3,2));
  3018. // x = x|z, y = y|w
  3019. vResulti = _mm_or_si128(vResulti,vResulti2);
  3020. // Move Z to the x position
  3021. vResulti2 = _mm_shuffle_epi32(vResulti,_MM_SHUFFLE(1,1,1,1));
  3022. // i = x|y|z|w
  3023. vResulti = _mm_or_si128(vResulti,vResulti2);
  3024. _mm_store_ss(reinterpret_cast<float *>(&pDestination->v),_mm_castsi128_ps(vResulti));
  3025. #endif
  3026. }
  3027. //------------------------------------------------------------------------------
  3028. _Use_decl_annotations_
  3029. inline void XM_CALLCONV XMStoreDec4
  3030. (
  3031. XMDEC4* pDestination,
  3032. FXMVECTOR V
  3033. )
  3034. {
  3035. assert(pDestination);
  3036. static const XMVECTORF32 MinDec4 = { { { -511.0f, -511.0f, -511.0f, -1.0f } } };
  3037. static const XMVECTORF32 MaxDec4 = { { { 511.0f, 511.0f, 511.0f, 1.0f } } };
  3038. #if defined(_XM_NO_INTRINSICS_)
  3039. XMVECTOR N = XMVectorClamp(V, MinDec4, MaxDec4);
  3040. XMFLOAT4A tmp;
  3041. XMStoreFloat4A(&tmp, N );
  3042. pDestination->v = ((int32_t)tmp.w << 30) |
  3043. (((int32_t)tmp.z & 0x3FF) << 20) |
  3044. (((int32_t)tmp.y & 0x3FF) << 10) |
  3045. (((int32_t)tmp.x & 0x3FF));
  3046. #elif defined(_XM_ARM_NEON_INTRINSICS_)
  3047. static const XMVECTORF32 ScaleDec4 = { { { 1.0f, 1024.0f, 1024.0f*1024.0f, 1024.0f*1024.0f*1024.0f } } };
  3048. float32x4_t vResult = vmaxq_f32(V,MinDec4);
  3049. vResult = vminq_f32(vResult,MaxDec4);
  3050. vResult = vmulq_f32(vResult,ScaleDec4);
  3051. int32x4_t vResulti = vcvtq_s32_f32(vResult);
  3052. vResulti = vandq_s32(vResulti,g_XMMaskDec4);
  3053. // Do a horizontal or of all 4 entries
  3054. uint32x2_t vTemp = vget_low_u32(vreinterpret_u32_s32(vResulti));
  3055. uint32x2_t vhi = vget_high_u32(vreinterpret_u32_s32(vResulti));
  3056. vTemp = vorr_u32( vTemp, vhi );
  3057. vTemp = vpadd_u32( vTemp, vTemp );
  3058. vst1_lane_u32( &pDestination->v, vTemp, 0 );
  3059. #elif defined(_XM_SSE_INTRINSICS_)
  3060. static const XMVECTORF32 ScaleDec4 = { { { 1.0f, 1024.0f, 1024.0f*1024.0f, 1024.0f*1024.0f*1024.0f } } };
  3061. // Clamp to bounds
  3062. XMVECTOR vResult = _mm_max_ps(V,MinDec4);
  3063. vResult = _mm_min_ps(vResult,MaxDec4);
  3064. // Scale by multiplication
  3065. vResult = _mm_mul_ps(vResult,ScaleDec4);
  3066. // Convert to int
  3067. __m128i vResulti = _mm_cvttps_epi32(vResult);
  3068. // Mask off any fraction
  3069. vResulti = _mm_and_si128(vResulti,g_XMMaskDec4);
  3070. // Do a horizontal or of 4 entries
  3071. __m128i vResulti2 = _mm_shuffle_epi32(vResulti,_MM_SHUFFLE(3,2,3,2));
  3072. // x = x|z, y = y|w
  3073. vResulti = _mm_or_si128(vResulti,vResulti2);
  3074. // Move Z to the x position
  3075. vResulti2 = _mm_shuffle_epi32(vResulti,_MM_SHUFFLE(1,1,1,1));
  3076. // i = x|y|z|w
  3077. vResulti = _mm_or_si128(vResulti,vResulti2);
  3078. _mm_store_ss(reinterpret_cast<float *>(&pDestination->v),_mm_castsi128_ps(vResulti));
  3079. #endif
  3080. }
  3081. #pragma warning(pop)
  3082. //------------------------------------------------------------------------------
  3083. _Use_decl_annotations_
  3084. inline void XM_CALLCONV XMStoreUByteN4
  3085. (
  3086. XMUBYTEN4* pDestination,
  3087. FXMVECTOR V
  3088. )
  3089. {
  3090. assert(pDestination);
  3091. #if defined(_XM_NO_INTRINSICS_)
  3092. XMVECTOR N = XMVectorSaturate(V);
  3093. N = XMVectorMultiply(N, g_UByteMax);
  3094. N = XMVectorTruncate(N);
  3095. XMFLOAT4A tmp;
  3096. XMStoreFloat4A(&tmp, N );
  3097. pDestination->x = (uint8_t)tmp.x;
  3098. pDestination->y = (uint8_t)tmp.y;
  3099. pDestination->z = (uint8_t)tmp.z;
  3100. pDestination->w = (uint8_t)tmp.w;
  3101. #elif defined(_XM_ARM_NEON_INTRINSICS_)
  3102. float32x4_t R = vmaxq_f32(V, vdupq_n_f32(0) );
  3103. R = vminq_f32(R, vdupq_n_f32(1.0f));
  3104. R = vmulq_n_f32( R, 255.0f );
  3105. uint32x4_t vInt32 = vcvtq_u32_f32(R);
  3106. uint16x4_t vInt16 = vqmovn_u32( vInt32 );
  3107. uint8x8_t vInt8 = vqmovn_u16( vcombine_u16(vInt16,vInt16) );
  3108. vst1_lane_u32( &pDestination->v, vreinterpret_u32_u8(vInt8), 0 );
  3109. #elif defined(_XM_SSE_INTRINSICS_)
  3110. static const XMVECTORF32 ScaleUByteN4 = { { { 255.0f, 255.0f*256.0f*0.5f, 255.0f*256.0f*256.0f, 255.0f*256.0f*256.0f*256.0f*0.5f } } };
  3111. static const XMVECTORI32 MaskUByteN4 = { { { 0xFF, 0xFF << (8 - 1), 0xFF << 16, 0xFF << (24 - 1) } } };
  3112. // Clamp to bounds
  3113. XMVECTOR vResult = _mm_max_ps(V,g_XMZero);
  3114. vResult = _mm_min_ps(vResult,g_XMOne);
  3115. // Scale by multiplication
  3116. vResult = _mm_mul_ps(vResult,ScaleUByteN4);
  3117. // Convert to int
  3118. __m128i vResulti = _mm_cvttps_epi32(vResult);
  3119. // Mask off any fraction
  3120. vResulti = _mm_and_si128(vResulti,MaskUByteN4);
  3121. // Do a horizontal or of 4 entries
  3122. __m128i vResulti2 = _mm_shuffle_epi32(vResulti,_MM_SHUFFLE(3,2,3,2));
  3123. // x = x|z, y = y|w
  3124. vResulti = _mm_or_si128(vResulti,vResulti2);
  3125. // Move Z to the x position
  3126. vResulti2 = _mm_shuffle_epi32(vResulti,_MM_SHUFFLE(1,1,1,1));
  3127. // Perform a single bit left shift to fix y|w
  3128. vResulti2 = _mm_add_epi32(vResulti2,vResulti2);
  3129. // i = x|y|z|w
  3130. vResulti = _mm_or_si128(vResulti,vResulti2);
  3131. _mm_store_ss(reinterpret_cast<float *>(&pDestination->v),_mm_castsi128_ps(vResulti));
  3132. #endif
  3133. }
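// Illustrative usage sketch (not part of the original source; "dest" is a
// hypothetical local): XMStoreUByteN4 maps [0, 1] to [0, 255] per channel, with
// x in the low byte of v and w in the high byte (8-bit UNORM data laid out
// x, y, z, w in memory).
//
//     XMUBYTEN4 dest;
//     XMStoreUByteN4(&dest, XMVectorSet(1.f, 0.f, 0.f, 1.f));
//     // dest.x == 255, dest.y == 0, dest.z == 0, dest.w == 255, dest.v == 0xFF0000FF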
  3134. //------------------------------------------------------------------------------
  3135. _Use_decl_annotations_
  3136. inline void XM_CALLCONV XMStoreUByte4
  3137. (
  3138. XMUBYTE4* pDestination,
  3139. FXMVECTOR V
  3140. )
  3141. {
  3142. assert(pDestination);
  3143. #if defined(_XM_NO_INTRINSICS_)
  3144. XMVECTOR N = XMVectorClamp(V, XMVectorZero(), g_UByteMax);
  3145. N = XMVectorRound(N);
  3146. XMFLOAT4A tmp;
  3147. XMStoreFloat4A(&tmp, N );
  3148. pDestination->x = (uint8_t)tmp.x;
  3149. pDestination->y = (uint8_t)tmp.y;
  3150. pDestination->z = (uint8_t)tmp.z;
  3151. pDestination->w = (uint8_t)tmp.w;
  3152. #elif defined(_XM_ARM_NEON_INTRINSICS_)
  3153. float32x4_t R = vmaxq_f32(V, vdupq_n_f32(0) );
  3154. R = vminq_f32(R, vdupq_n_f32(255.0f));
  3155. uint32x4_t vInt32 = vcvtq_u32_f32(R);
  3156. uint16x4_t vInt16 = vqmovn_u32( vInt32 );
  3157. uint8x8_t vInt8 = vqmovn_u16( vcombine_u16(vInt16,vInt16) );
  3158. vst1_lane_u32( &pDestination->v, vreinterpret_u32_u8(vInt8), 0 );
  3159. #elif defined(_XM_SSE_INTRINSICS_)
  3160. static const XMVECTORF32 ScaleUByte4 = { { { 1.0f, 256.0f*0.5f, 256.0f*256.0f, 256.0f*256.0f*256.0f*0.5f } } };
  3161. static const XMVECTORI32 MaskUByte4 = { { { 0xFF, 0xFF << (8 - 1), 0xFF << 16, 0xFF << (24 - 1) } } };
  3162. // Clamp to bounds
  3163. XMVECTOR vResult = _mm_max_ps(V,g_XMZero);
  3164. vResult = _mm_min_ps(vResult,g_UByteMax);
  3165. // Scale by multiplication
  3166. vResult = _mm_mul_ps(vResult,ScaleUByte4);
  3167. // Convert to int by rounding
  3168. __m128i vResulti = _mm_cvtps_epi32(vResult);
  3169. // Mask off any fraction
  3170. vResulti = _mm_and_si128(vResulti,MaskUByte4);
  3171. // Do a horizontal or of 4 entries
  3172. __m128i vResulti2 = _mm_shuffle_epi32(vResulti,_MM_SHUFFLE(3,2,3,2));
  3173. // x = x|z, y = y|w
  3174. vResulti = _mm_or_si128(vResulti,vResulti2);
  3175. // Move Z to the x position
  3176. vResulti2 = _mm_shuffle_epi32(vResulti,_MM_SHUFFLE(1,1,1,1));
  3177. // Perform a single bit left shift to fix y|w
  3178. vResulti2 = _mm_add_epi32(vResulti2,vResulti2);
  3179. // i = x|y|z|w
  3180. vResulti = _mm_or_si128(vResulti,vResulti2);
  3181. _mm_store_ss(reinterpret_cast<float *>(&pDestination->v),_mm_castsi128_ps(vResulti));
  3182. #endif
  3183. }
  3184. //------------------------------------------------------------------------------
  3185. _Use_decl_annotations_
  3186. inline void XM_CALLCONV XMStoreByteN4
  3187. (
  3188. XMBYTEN4* pDestination,
  3189. FXMVECTOR V
  3190. )
  3191. {
  3192. assert(pDestination);
  3193. #if defined(_XM_NO_INTRINSICS_)
  3194. XMVECTOR N = XMVectorClamp(V, g_XMNegativeOne.v, g_XMOne.v);
N = XMVectorMultiply(N, g_ByteMax);
  3196. N = XMVectorTruncate(N);
  3197. XMFLOAT4A tmp;
  3198. XMStoreFloat4A(&tmp, N );
  3199. pDestination->x = (int8_t)tmp.x;
  3200. pDestination->y = (int8_t)tmp.y;
  3201. pDestination->z = (int8_t)tmp.z;
  3202. pDestination->w = (int8_t)tmp.w;
  3203. #elif defined(_XM_ARM_NEON_INTRINSICS_)
  3204. float32x4_t R = vmaxq_f32(V, vdupq_n_f32(-1.f) );
  3205. R = vminq_f32(R, vdupq_n_f32(1.0f));
  3206. R = vmulq_n_f32( R, 127.0f );
  3207. int32x4_t vInt32 = vcvtq_s32_f32(R);
  3208. int16x4_t vInt16 = vqmovn_s32( vInt32 );
  3209. int8x8_t vInt8 = vqmovn_s16( vcombine_s16(vInt16,vInt16) );
  3210. vst1_lane_u32( &pDestination->v, vreinterpret_u32_s8(vInt8), 0 );
  3211. #elif defined(_XM_SSE_INTRINSICS_)
  3212. static const XMVECTORF32 ScaleByteN4 = { { { 127.0f, 127.0f*256.0f, 127.0f*256.0f*256.0f, 127.0f*256.0f*256.0f*256.0f } } };
  3213. static const XMVECTORI32 MaskByteN4 = { { { 0xFF, 0xFF << 8, 0xFF << 16, 0xFF << 24 } } };
  3214. // Clamp to bounds
  3215. XMVECTOR vResult = _mm_max_ps(V,g_XMNegativeOne);
  3216. vResult = _mm_min_ps(vResult,g_XMOne);
  3217. // Scale by multiplication
  3218. vResult = _mm_mul_ps(vResult,ScaleByteN4);
  3219. // Convert to int
  3220. __m128i vResulti = _mm_cvttps_epi32(vResult);
  3221. // Mask off any fraction
  3222. vResulti = _mm_and_si128(vResulti,MaskByteN4);
  3223. // Do a horizontal or of 4 entries
  3224. __m128i vResulti2 = _mm_shuffle_epi32(vResulti,_MM_SHUFFLE(3,2,3,2));
  3225. // x = x|z, y = y|w
  3226. vResulti = _mm_or_si128(vResulti,vResulti2);
  3227. // Move Z to the x position
  3228. vResulti2 = _mm_shuffle_epi32(vResulti,_MM_SHUFFLE(1,1,1,1));
  3229. // i = x|y|z|w
  3230. vResulti = _mm_or_si128(vResulti,vResulti2);
  3231. _mm_store_ss(reinterpret_cast<float *>(&pDestination->v),_mm_castsi128_ps(vResulti));
  3232. #endif
  3233. }
  3234. //------------------------------------------------------------------------------
  3235. _Use_decl_annotations_
  3236. inline void XM_CALLCONV XMStoreByte4
  3237. (
  3238. XMBYTE4* pDestination,
  3239. FXMVECTOR V
  3240. )
  3241. {
  3242. assert(pDestination);
  3243. #if defined(_XM_NO_INTRINSICS_)
  3244. XMVECTOR N = XMVectorClamp(V, g_ByteMin, g_ByteMax);
  3245. N = XMVectorRound(N);
  3246. XMFLOAT4A tmp;
  3247. XMStoreFloat4A(&tmp, N );
  3248. pDestination->x = (int8_t)tmp.x;
  3249. pDestination->y = (int8_t)tmp.y;
  3250. pDestination->z = (int8_t)tmp.z;
  3251. pDestination->w = (int8_t)tmp.w;
  3252. #elif defined(_XM_ARM_NEON_INTRINSICS_)
  3253. float32x4_t R = vmaxq_f32(V, vdupq_n_f32(-127.f) );
  3254. R = vminq_f32(R, vdupq_n_f32(127.f));
  3255. int32x4_t vInt32 = vcvtq_s32_f32(R);
  3256. int16x4_t vInt16 = vqmovn_s32( vInt32 );
  3257. int8x8_t vInt8 = vqmovn_s16( vcombine_s16(vInt16,vInt16) );
  3258. vst1_lane_u32( &pDestination->v, vreinterpret_u32_s8(vInt8), 0 );
  3259. #elif defined(_XM_SSE_INTRINSICS_)
  3260. static const XMVECTORF32 ScaleByte4 = { { { 1.0f, 256.0f, 256.0f*256.0f, 256.0f*256.0f*256.0f } } };
  3261. static const XMVECTORI32 MaskByte4 = { { { 0xFF, 0xFF << 8, 0xFF << 16, 0xFF << 24 } } };
  3262. // Clamp to bounds
  3263. XMVECTOR vResult = _mm_max_ps(V,g_ByteMin);
  3264. vResult = _mm_min_ps(vResult,g_ByteMax);
  3265. // Scale by multiplication
  3266. vResult = _mm_mul_ps(vResult,ScaleByte4);
  3267. // Convert to int by rounding
  3268. __m128i vResulti = _mm_cvtps_epi32(vResult);
  3269. // Mask off any fraction
  3270. vResulti = _mm_and_si128(vResulti,MaskByte4);
  3271. // Do a horizontal or of 4 entries
  3272. __m128i vResulti2 = _mm_shuffle_epi32(vResulti,_MM_SHUFFLE(3,2,3,2));
  3273. // x = x|z, y = y|w
  3274. vResulti = _mm_or_si128(vResulti,vResulti2);
  3275. // Move Z to the x position
  3276. vResulti2 = _mm_shuffle_epi32(vResulti,_MM_SHUFFLE(1,1,1,1));
  3277. // i = x|y|z|w
  3278. vResulti = _mm_or_si128(vResulti,vResulti2);
  3279. _mm_store_ss(reinterpret_cast<float *>(&pDestination->v),_mm_castsi128_ps(vResulti));
  3280. #endif
  3281. }
  3282. //------------------------------------------------------------------------------
  3283. _Use_decl_annotations_
  3284. inline void XM_CALLCONV XMStoreUNibble4
  3285. (
  3286. XMUNIBBLE4* pDestination,
  3287. FXMVECTOR V
  3288. )
  3289. {
  3290. assert(pDestination);
  3291. static const XMVECTORF32 Max = { { { 15.0f, 15.0f, 15.0f, 15.0f } } };
  3292. #if defined(_XM_NO_INTRINSICS_)
  3293. XMVECTOR N = XMVectorClamp(V, XMVectorZero(), Max.v);
  3294. N = XMVectorRound(N);
  3295. XMFLOAT4A tmp;
  3296. XMStoreFloat4A(&tmp, N );
  3297. pDestination->v = (((uint16_t)tmp.w & 0xF) << 12) |
  3298. (((uint16_t)tmp.z & 0xF) << 8) |
  3299. (((uint16_t)tmp.y & 0xF) << 4) |
  3300. (((uint16_t)tmp.x & 0xF));
  3301. #elif defined(_XM_ARM_NEON_INTRINSICS_)
  3302. static const XMVECTORF32 Scale = { { { 1.0f, 16.f, 16.f*16.f, 16.f*16.f*16.f } } };
  3303. static const XMVECTORU32 Mask = { { { 0xF, 0xF << 4, 0xF << 8, 0xF << 12 } } };
  3304. float32x4_t vResult = vmaxq_f32(V,vdupq_n_f32(0));
  3305. vResult = vminq_f32(vResult,Max);
  3306. vResult = vmulq_f32(vResult,Scale);
  3307. uint32x4_t vResulti = vcvtq_u32_f32(vResult);
  3308. vResulti = vandq_u32(vResulti,Mask);
  3309. // Do a horizontal or of 4 entries
  3310. uint32x2_t vTemp = vget_low_u32(vResulti);
  3311. uint32x2_t vhi = vget_high_u32(vResulti);
  3312. vTemp = vorr_u32( vTemp, vhi );
  3313. vTemp = vpadd_u32( vTemp, vTemp );
  3314. vst1_lane_u16( &pDestination->v, vreinterpret_u16_u32( vTemp ), 0 );
  3315. #elif defined(_XM_SSE_INTRINSICS_)
  3316. // Bounds check
  3317. XMVECTOR vResult = _mm_max_ps(V,g_XMZero);
  3318. vResult = _mm_min_ps(vResult,Max);
  3319. // Convert to int with rounding
  3320. __m128i vInt = _mm_cvtps_epi32(vResult);
  3321. // No SSE operations will write to 16-bit values, so we have to extract them manually
  3322. uint16_t x = static_cast<uint16_t>(_mm_extract_epi16(vInt,0));
  3323. uint16_t y = static_cast<uint16_t>(_mm_extract_epi16(vInt,2));
  3324. uint16_t z = static_cast<uint16_t>(_mm_extract_epi16(vInt,4));
  3325. uint16_t w = static_cast<uint16_t>(_mm_extract_epi16(vInt,6));
  3326. pDestination->v = ((w & 0xF) << 12) |
  3327. ((z & 0xF) << 8) |
  3328. ((y & 0xF) << 4) |
  3329. ((x & 0xF));
  3330. #endif
  3331. }
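// Illustrative usage sketch (not part of the original source; "dest" is a
// hypothetical local): XMUNIBBLE4 packs four 4-bit fields, x in bits 0-3 through
// w in bits 12-15.
//
//     XMUNIBBLE4 dest;
//     XMStoreUNibble4(&dest, XMVectorSet(15.f, 0.f, 15.f, 0.f));
//     // dest.v == 0x0F0F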
  3332. //------------------------------------------------------------------------------
  3333. _Use_decl_annotations_
  3334. inline void XM_CALLCONV XMStoreU555
  3335. (
  3336. XMU555* pDestination,
  3337. FXMVECTOR V
  3338. )
  3339. {
  3340. assert(pDestination);
  3341. static const XMVECTORF32 Max = { { { 31.0f, 31.0f, 31.0f, 1.0f } } };
  3342. #if defined(_XM_NO_INTRINSICS_)
  3343. XMVECTOR N = XMVectorClamp(V, XMVectorZero(), Max.v);
  3344. N = XMVectorRound(N);
  3345. XMFLOAT4A tmp;
  3346. XMStoreFloat4A(&tmp, N );
  3347. pDestination->v = ((tmp.w > 0.f) ? 0x8000 : 0) |
  3348. (((uint16_t)tmp.z & 0x1F) << 10) |
  3349. (((uint16_t)tmp.y & 0x1F) << 5) |
  3350. (((uint16_t)tmp.x & 0x1F));
  3351. #elif defined(_XM_ARM_NEON_INTRINSICS_)
  3352. static const XMVECTORF32 Scale = { { { 1.0f, 32.f / 2.f, 32.f*32.f, 32.f*32.f*32.f / 2.f } } };
  3353. static const XMVECTORU32 Mask = { { { 0x1F, 0x1F << (5 - 1), 0x1F << 10, 0x1 << (15 - 1) } } };
  3354. float32x4_t vResult = vmaxq_f32(V,vdupq_n_f32(0));
  3355. vResult = vminq_f32(vResult,Max);
  3356. vResult = vmulq_f32(vResult,Scale);
  3357. uint32x4_t vResulti = vcvtq_u32_f32(vResult);
  3358. vResulti = vandq_u32(vResulti,Mask);
  3359. // Do a horizontal or of 4 entries
  3360. uint32x2_t vTemp = vget_low_u32(vResulti);
  3361. uint32x2_t vTemp2 = vget_high_u32(vResulti);
  3362. vTemp = vorr_u32( vTemp, vTemp2 );
  3363. // Perform a single bit left shift on y|w
  3364. vTemp2 = vdup_lane_u32( vTemp, 1 );
vTemp2 = vadd_u32( vTemp2, vTemp2 );
  3366. vTemp = vorr_u32( vTemp, vTemp2 );
  3367. vst1_lane_u16( &pDestination->v, vreinterpret_u16_u32( vTemp ), 0 );
  3368. #elif defined(_XM_SSE_INTRINSICS_)
  3369. // Bounds check
  3370. XMVECTOR vResult = _mm_max_ps(V,g_XMZero);
  3371. vResult = _mm_min_ps(vResult,Max);
  3372. // Convert to int with rounding
  3373. __m128i vInt = _mm_cvtps_epi32(vResult);
  3374. // No SSE operations will write to 16-bit values, so we have to extract them manually
  3375. uint16_t x = static_cast<uint16_t>(_mm_extract_epi16(vInt,0));
  3376. uint16_t y = static_cast<uint16_t>(_mm_extract_epi16(vInt,2));
  3377. uint16_t z = static_cast<uint16_t>(_mm_extract_epi16(vInt,4));
  3378. uint16_t w = static_cast<uint16_t>(_mm_extract_epi16(vInt,6));
  3379. pDestination->v = ((w) ? 0x8000 : 0) |
  3380. ((z & 0x1F) << 10) |
  3381. ((y & 0x1F) << 5) |
  3382. ((x & 0x1F));
  3383. #endif
  3384. }
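// Illustrative usage sketch (not part of the original source; "dest" is a
// hypothetical local): XMU555 packs three 5-bit fields (x, y, z in bits 0-4,
// 5-9, and 10-14) plus a 1-bit w in bit 15.
//
//     XMU555 dest;
//     XMStoreU555(&dest, XMVectorSet(31.f, 31.f, 31.f, 1.f));
//     // dest.v == 0xFFFF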
  3385. /****************************************************************************
  3386. *
  3387. * XMCOLOR operators
  3388. *
  3389. ****************************************************************************/
  3390. //------------------------------------------------------------------------------
  3391. inline XMCOLOR::XMCOLOR
  3392. (
  3393. float _r,
  3394. float _g,
  3395. float _b,
  3396. float _a
  3397. )
  3398. {
  3399. XMStoreColor(this, XMVectorSet(_r, _g, _b, _a));
  3400. }
  3401. //------------------------------------------------------------------------------
  3402. _Use_decl_annotations_
  3403. inline XMCOLOR::XMCOLOR
  3404. (
  3405. const float* pArray
  3406. )
  3407. {
  3408. XMStoreColor(this, XMLoadFloat4(reinterpret_cast<const XMFLOAT4*>(pArray)));
  3409. }
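// Illustrative usage sketch (not part of the original source; "red" is a
// hypothetical local): XMCOLOR stores a D3DCOLOR-style 0xAARRGGBB value, with
// each [0, 1] input scaled to [0, 255].
//
//     XMCOLOR red(1.f, 0.f, 0.f, 1.f);
//     // red.c == 0xFFFF0000  (a == 255, r == 255, g == 0, b == 0)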
  3410. /****************************************************************************
  3411. *
  3412. * XMHALF2 operators
  3413. *
  3414. ****************************************************************************/
  3415. //------------------------------------------------------------------------------
  3416. inline XMHALF2::XMHALF2
  3417. (
  3418. float _x,
  3419. float _y
  3420. )
  3421. {
  3422. x = XMConvertFloatToHalf(_x);
  3423. y = XMConvertFloatToHalf(_y);
  3424. }
  3425. //------------------------------------------------------------------------------
  3426. _Use_decl_annotations_
  3427. inline XMHALF2::XMHALF2
  3428. (
  3429. const float* pArray
  3430. )
  3431. {
  3432. assert( pArray != nullptr );
  3433. x = XMConvertFloatToHalf(pArray[0]);
  3434. y = XMConvertFloatToHalf(pArray[1]);
  3435. }
  3436. /****************************************************************************
  3437. *
  3438. * XMSHORTN2 operators
  3439. *
  3440. ****************************************************************************/
  3441. //------------------------------------------------------------------------------
  3442. inline XMSHORTN2::XMSHORTN2
  3443. (
  3444. float _x,
  3445. float _y
  3446. )
  3447. {
  3448. XMStoreShortN2(this, XMVectorSet(_x, _y, 0.0f, 0.0f));
  3449. }
  3450. //------------------------------------------------------------------------------
  3451. _Use_decl_annotations_
  3452. inline XMSHORTN2::XMSHORTN2
  3453. (
  3454. const float* pArray
  3455. )
  3456. {
  3457. XMStoreShortN2(this, XMLoadFloat2(reinterpret_cast<const XMFLOAT2*>(pArray)));
  3458. }
  3459. /****************************************************************************
  3460. *
  3461. * XMSHORT2 operators
  3462. *
  3463. ****************************************************************************/
  3464. //------------------------------------------------------------------------------
  3465. inline XMSHORT2::XMSHORT2
  3466. (
  3467. float _x,
  3468. float _y
  3469. )
  3470. {
  3471. XMStoreShort2(this, XMVectorSet(_x, _y, 0.0f, 0.0f));
  3472. }
  3473. //------------------------------------------------------------------------------
  3474. _Use_decl_annotations_
  3475. inline XMSHORT2::XMSHORT2
  3476. (
  3477. const float* pArray
  3478. )
  3479. {
  3480. XMStoreShort2(this, XMLoadFloat2(reinterpret_cast<const XMFLOAT2*>(pArray)));
  3481. }
  3482. /****************************************************************************
  3483. *
  3484. * XMUSHORTN2 operators
  3485. *
  3486. ****************************************************************************/
  3487. //------------------------------------------------------------------------------
  3488. inline XMUSHORTN2::XMUSHORTN2
  3489. (
  3490. float _x,
  3491. float _y
  3492. )
  3493. {
  3494. XMStoreUShortN2(this, XMVectorSet(_x, _y, 0.0f, 0.0f));
  3495. }
  3496. //------------------------------------------------------------------------------
  3497. _Use_decl_annotations_
  3498. inline XMUSHORTN2::XMUSHORTN2
  3499. (
  3500. const float* pArray
  3501. )
  3502. {
  3503. XMStoreUShortN2(this, XMLoadFloat2(reinterpret_cast<const XMFLOAT2*>(pArray)));
  3504. }
  3505. /****************************************************************************
  3506. *
  3507. * XMUSHORT2 operators
  3508. *
  3509. ****************************************************************************/
  3510. //------------------------------------------------------------------------------
  3511. inline XMUSHORT2::XMUSHORT2
  3512. (
  3513. float _x,
  3514. float _y
  3515. )
  3516. {
  3517. XMStoreUShort2(this, XMVectorSet(_x, _y, 0.0f, 0.0f));
  3518. }
  3519. //------------------------------------------------------------------------------
  3520. _Use_decl_annotations_
  3521. inline XMUSHORT2::XMUSHORT2
  3522. (
  3523. const float* pArray
  3524. )
  3525. {
  3526. XMStoreUShort2(this, XMLoadFloat2(reinterpret_cast<const XMFLOAT2*>(pArray)));
  3527. }
  3528. /****************************************************************************
  3529. *
  3530. * XMBYTEN2 operators
  3531. *
  3532. ****************************************************************************/
  3533. //------------------------------------------------------------------------------
  3534. inline XMBYTEN2::XMBYTEN2
  3535. (
  3536. float _x,
  3537. float _y
  3538. )
  3539. {
  3540. XMStoreByteN2(this, XMVectorSet(_x, _y, 0.0f, 0.0f));
  3541. }
  3542. //------------------------------------------------------------------------------
  3543. _Use_decl_annotations_
  3544. inline XMBYTEN2::XMBYTEN2
  3545. (
  3546. const float* pArray
  3547. )
  3548. {
  3549. XMStoreByteN2(this, XMLoadFloat2(reinterpret_cast<const XMFLOAT2*>(pArray)));
  3550. }
  3551. /****************************************************************************
  3552. *
  3553. * XMBYTE2 operators
  3554. *
  3555. ****************************************************************************/
  3556. //------------------------------------------------------------------------------
  3557. inline XMBYTE2::XMBYTE2
  3558. (
  3559. float _x,
  3560. float _y
  3561. )
  3562. {
  3563. XMStoreByte2(this, XMVectorSet(_x, _y, 0.0f, 0.0f));
  3564. }
  3565. //------------------------------------------------------------------------------
  3566. _Use_decl_annotations_
  3567. inline XMBYTE2::XMBYTE2
  3568. (
  3569. const float* pArray
  3570. )
  3571. {
  3572. XMStoreByte2(this, XMLoadFloat2(reinterpret_cast<const XMFLOAT2*>(pArray)));
  3573. }
  3574. /****************************************************************************
  3575. *
  3576. * XMUBYTEN2 operators
  3577. *
  3578. ****************************************************************************/
  3579. //------------------------------------------------------------------------------
  3580. inline XMUBYTEN2::XMUBYTEN2
  3581. (
  3582. float _x,
  3583. float _y
  3584. )
  3585. {
  3586. XMStoreUByteN2(this, XMVectorSet(_x, _y, 0.0f, 0.0f));
  3587. }
  3588. //------------------------------------------------------------------------------
  3589. _Use_decl_annotations_
  3590. inline XMUBYTEN2::XMUBYTEN2
  3591. (
  3592. const float* pArray
  3593. )
  3594. {
  3595. XMStoreUByteN2(this, XMLoadFloat2(reinterpret_cast<const XMFLOAT2*>(pArray)));
  3596. }
  3597. /****************************************************************************
  3598. *
  3599. * XMUBYTE2 operators
  3600. *
  3601. ****************************************************************************/
  3602. //------------------------------------------------------------------------------
  3603. inline XMUBYTE2::XMUBYTE2
  3604. (
  3605. float _x,
  3606. float _y
  3607. )
  3608. {
  3609. XMStoreUByte2(this, XMVectorSet(_x, _y, 0.0f, 0.0f));
  3610. }
  3611. //------------------------------------------------------------------------------
  3612. _Use_decl_annotations_
  3613. inline XMUBYTE2::XMUBYTE2
  3614. (
  3615. const float* pArray
  3616. )
  3617. {
  3618. XMStoreUByte2(this, XMLoadFloat2(reinterpret_cast<const XMFLOAT2*>(pArray)));
  3619. }
  3620. /****************************************************************************
  3621. *
  3622. * XMU565 operators
  3623. *
  3624. ****************************************************************************/
  3625. inline XMU565::XMU565
  3626. (
  3627. float _x,
  3628. float _y,
  3629. float _z
  3630. )
  3631. {
  3632. XMStoreU565(this, XMVectorSet( _x, _y, _z, 0.0f ));
  3633. }
  3634. _Use_decl_annotations_
  3635. inline XMU565::XMU565
  3636. (
  3637. const float *pArray
  3638. )
  3639. {
  3640. XMStoreU565(this, XMLoadFloat3(reinterpret_cast<const XMFLOAT3*>(pArray)));
  3641. }
/****************************************************************************
 *
 * XMFLOAT3PK operators
 *
 ****************************************************************************/

inline XMFLOAT3PK::XMFLOAT3PK
(
    float _x,
    float _y,
    float _z
)
{
    XMStoreFloat3PK(this, XMVectorSet( _x, _y, _z, 0.0f ));
}

_Use_decl_annotations_
inline XMFLOAT3PK::XMFLOAT3PK
(
    const float *pArray
)
{
    XMStoreFloat3PK(this, XMLoadFloat3(reinterpret_cast<const XMFLOAT3*>(pArray)));
}

/****************************************************************************
 *
 * XMFLOAT3SE operators
 *
 ****************************************************************************/

inline XMFLOAT3SE::XMFLOAT3SE
(
    float _x,
    float _y,
    float _z
)
{
    XMStoreFloat3SE(this, XMVectorSet( _x, _y, _z, 0.0f ));
}

_Use_decl_annotations_
inline XMFLOAT3SE::XMFLOAT3SE
(
    const float *pArray
)
{
    XMStoreFloat3SE(this, XMLoadFloat3(reinterpret_cast<const XMFLOAT3*>(pArray)));
}
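
// Editor's note (illustrative sketch, not part of the original source):
// XMFLOAT3PK (11:11:10 packed floats) and XMFLOAT3SE (9-bit mantissas with a
// shared 5-bit exponent) are compact 32-bit HDR formats; the constructors
// simply build an XMVECTOR and leave the bit packing to the store routines.
//
//     using namespace DirectX;
//     using namespace DirectX::PackedVector;
//
//     const float hdr[3] = { 1.5f, 120.0f, 0.25f };
//     XMFLOAT3PK pk(hdr);                  // R11G11B10-style packing
//     XMFLOAT3SE se(1.5f, 120.0f, 0.25f);  // shared-exponent packing
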
/****************************************************************************
 *
 * XMHALF4 operators
 *
 ****************************************************************************/

//------------------------------------------------------------------------------
inline XMHALF4::XMHALF4
(
    float _x,
    float _y,
    float _z,
    float _w
)
{
    x = XMConvertFloatToHalf(_x);
    y = XMConvertFloatToHalf(_y);
    z = XMConvertFloatToHalf(_z);
    w = XMConvertFloatToHalf(_w);
}

//------------------------------------------------------------------------------
_Use_decl_annotations_
inline XMHALF4::XMHALF4
(
    const float* pArray
)
{
    XMConvertFloatToHalfStream(&x, sizeof(HALF), pArray, sizeof(float), 4);
}
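
// Editor's note (illustrative sketch, not part of the original source): unlike
// most packed types, XMHALF4 does not round-trip through an XMVECTOR store --
// the float constructor calls XMConvertFloatToHalf per component, and the
// array constructor converts all four values with XMConvertFloatToHalfStream
// using a sizeof(HALF) output stride and a sizeof(float) input stride.
//
//     using namespace DirectX::PackedVector;
//
//     XMHALF4 h(0.5f, 1.0f, -2.0f, 0.0f);
//     const float src[4] = { 0.0f, 0.25f, 0.5f, 1.0f };
//     XMHALF4 hs(src);    // equivalent to converting each element separately
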
/****************************************************************************
 *
 * XMSHORTN4 operators
 *
 ****************************************************************************/

//------------------------------------------------------------------------------
inline XMSHORTN4::XMSHORTN4
(
    float _x,
    float _y,
    float _z,
    float _w
)
{
    XMStoreShortN4(this, XMVectorSet(_x, _y, _z, _w));
}

//------------------------------------------------------------------------------
_Use_decl_annotations_
inline XMSHORTN4::XMSHORTN4
(
    const float* pArray
)
{
    XMStoreShortN4(this, XMLoadFloat4(reinterpret_cast<const XMFLOAT4*>(pArray)));
}

/****************************************************************************
 *
 * XMSHORT4 operators
 *
 ****************************************************************************/

//------------------------------------------------------------------------------
inline XMSHORT4::XMSHORT4
(
    float _x,
    float _y,
    float _z,
    float _w
)
{
    XMStoreShort4(this, XMVectorSet(_x, _y, _z, _w));
}

//------------------------------------------------------------------------------
_Use_decl_annotations_
inline XMSHORT4::XMSHORT4
(
    const float* pArray
)
{
    XMStoreShort4(this, XMLoadFloat4(reinterpret_cast<const XMFLOAT4*>(pArray)));
}

/****************************************************************************
 *
 * XMUSHORTN4 operators
 *
 ****************************************************************************/

//------------------------------------------------------------------------------
inline XMUSHORTN4::XMUSHORTN4
(
    float _x,
    float _y,
    float _z,
    float _w
)
{
    XMStoreUShortN4(this, XMVectorSet(_x, _y, _z, _w));
}

//------------------------------------------------------------------------------
_Use_decl_annotations_
inline XMUSHORTN4::XMUSHORTN4
(
    const float* pArray
)
{
    XMStoreUShortN4(this, XMLoadFloat4(reinterpret_cast<const XMFLOAT4*>(pArray)));
}

/****************************************************************************
 *
 * XMUSHORT4 operators
 *
 ****************************************************************************/

//------------------------------------------------------------------------------
inline XMUSHORT4::XMUSHORT4
(
    float _x,
    float _y,
    float _z,
    float _w
)
{
    XMStoreUShort4(this, XMVectorSet(_x, _y, _z, _w));
}

//------------------------------------------------------------------------------
_Use_decl_annotations_
inline XMUSHORT4::XMUSHORT4
(
    const float* pArray
)
{
    XMStoreUShort4(this, XMLoadFloat4(reinterpret_cast<const XMFLOAT4*>(pArray)));
}
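
// Editor's note (illustrative sketch, not part of the original source): the
// 16-bit constructors above follow the same pattern as the 8-bit ones -- the
// *N types (XMSHORTN4 / XMUSHORTN4) store normalized values, while XMSHORT4 /
// XMUSHORT4 store plain integer values.
//
//     using namespace DirectX::PackedVector;
//
//     XMSHORTN4 n(-1.0f, 0.0f, 0.5f, 1.0f);        // normalized signed shorts
//     XMUSHORT4 i(100.0f, 200.0f, 300.0f, 400.0f); // unsigned integer shorts
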
/****************************************************************************
 *
 * XMXDECN4 operators
 *
 ****************************************************************************/

//------------------------------------------------------------------------------
inline XMXDECN4::XMXDECN4
(
    float _x,
    float _y,
    float _z,
    float _w
)
{
    XMStoreXDecN4(this, XMVectorSet(_x, _y, _z, _w));
}

//------------------------------------------------------------------------------
_Use_decl_annotations_
inline XMXDECN4::XMXDECN4
(
    const float* pArray
)
{
    XMStoreXDecN4(this, XMLoadFloat4(reinterpret_cast<const XMFLOAT4*>(pArray)));
}

/****************************************************************************
 *
 * XMXDEC4 operators
 *
 ****************************************************************************/

#pragma warning(push)
#pragma warning(disable : 4996)
// C4996: ignore deprecation warning

//------------------------------------------------------------------------------
inline XMXDEC4::XMXDEC4
(
    float _x,
    float _y,
    float _z,
    float _w
)
{
    XMStoreXDec4(this, XMVectorSet(_x, _y, _z, _w));
}

//------------------------------------------------------------------------------
_Use_decl_annotations_
inline XMXDEC4::XMXDEC4
(
    const float* pArray
)
{
    XMStoreXDec4(this, XMLoadFloat4(reinterpret_cast<const XMFLOAT4*>(pArray)));
}

/****************************************************************************
 *
 * XMDECN4 operators
 *
 ****************************************************************************/

//------------------------------------------------------------------------------
inline XMDECN4::XMDECN4
(
    float _x,
    float _y,
    float _z,
    float _w
)
{
    XMStoreDecN4(this, XMVectorSet(_x, _y, _z, _w));
}

//------------------------------------------------------------------------------
_Use_decl_annotations_
inline XMDECN4::XMDECN4
(
    const float* pArray
)
{
    XMStoreDecN4(this, XMLoadFloat4(reinterpret_cast<const XMFLOAT4*>(pArray)));
}

/****************************************************************************
 *
 * XMDEC4 operators
 *
 ****************************************************************************/

//------------------------------------------------------------------------------
inline XMDEC4::XMDEC4
(
    float _x,
    float _y,
    float _z,
    float _w
)
{
    XMStoreDec4(this, XMVectorSet(_x, _y, _z, _w));
}

//------------------------------------------------------------------------------
_Use_decl_annotations_
inline XMDEC4::XMDEC4
(
    const float* pArray
)
{
    XMStoreDec4(this, XMLoadFloat4(reinterpret_cast<const XMFLOAT4*>(pArray)));
}

#pragma warning(pop)
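
// Editor's note (illustrative, not part of the original source): the
// warning(push)/warning(disable : 4996)/warning(pop) block above suppresses
// the deprecation diagnostic only for these inline constructor bodies, which
// call store functions for types marked deprecated; code that constructs
// XMXDEC4, XMDECN4 or XMDEC4 elsewhere may still see C4996 and would
// typically prefer the unsigned 10:10:10:2 types that follow.
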
/****************************************************************************
 *
 * XMUDECN4 operators
 *
 ****************************************************************************/

//------------------------------------------------------------------------------
inline XMUDECN4::XMUDECN4
(
    float _x,
    float _y,
    float _z,
    float _w
)
{
    XMStoreUDecN4(this, XMVectorSet(_x, _y, _z, _w));
}

//------------------------------------------------------------------------------
_Use_decl_annotations_
inline XMUDECN4::XMUDECN4
(
    const float* pArray
)
{
    XMStoreUDecN4(this, XMLoadFloat4(reinterpret_cast<const XMFLOAT4*>(pArray)));
}

/****************************************************************************
 *
 * XMUDEC4 operators
 *
 ****************************************************************************/

//------------------------------------------------------------------------------
inline XMUDEC4::XMUDEC4
(
    float _x,
    float _y,
    float _z,
    float _w
)
{
    XMStoreUDec4(this, XMVectorSet(_x, _y, _z, _w));
}

//------------------------------------------------------------------------------
_Use_decl_annotations_
inline XMUDEC4::XMUDEC4
(
    const float* pArray
)
{
    XMStoreUDec4(this, XMLoadFloat4(reinterpret_cast<const XMFLOAT4*>(pArray)));
}
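
// Editor's note (illustrative sketch, not part of the original source):
// XMUDECN4 is the unsigned-normalized 10:10:10:2 layout (as used by
// DXGI_FORMAT_R10G10B10A2_UNORM), while XMUDEC4 stores unsigned integers in
// the same bit layout.
//
//     using namespace DirectX::PackedVector;
//
//     XMUDECN4 color(1.0f, 0.5f, 0.25f, 1.0f);     // normalized; w saturates to 3
//     XMUDEC4  raw(1023.0f, 512.0f, 256.0f, 3.0f); // integer components
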
/****************************************************************************
 *
 * XMBYTEN4 operators
 *
 ****************************************************************************/

//------------------------------------------------------------------------------
inline XMBYTEN4::XMBYTEN4
(
    float _x,
    float _y,
    float _z,
    float _w
)
{
    XMStoreByteN4(this, XMVectorSet(_x, _y, _z, _w));
}

//------------------------------------------------------------------------------
_Use_decl_annotations_
inline XMBYTEN4::XMBYTEN4
(
    const float* pArray
)
{
    XMStoreByteN4(this, XMLoadFloat4(reinterpret_cast<const XMFLOAT4*>(pArray)));
}

/****************************************************************************
 *
 * XMBYTE4 operators
 *
 ****************************************************************************/

//------------------------------------------------------------------------------
inline XMBYTE4::XMBYTE4
(
    float _x,
    float _y,
    float _z,
    float _w
)
{
    XMStoreByte4(this, XMVectorSet(_x, _y, _z, _w));
}

//------------------------------------------------------------------------------
_Use_decl_annotations_
inline XMBYTE4::XMBYTE4
(
    const float* pArray
)
{
    XMStoreByte4(this, XMLoadFloat4(reinterpret_cast<const XMFLOAT4*>(pArray)));
}

/****************************************************************************
 *
 * XMUBYTEN4 operators
 *
 ****************************************************************************/

//------------------------------------------------------------------------------
inline XMUBYTEN4::XMUBYTEN4
(
    float _x,
    float _y,
    float _z,
    float _w
)
{
    XMStoreUByteN4(this, XMVectorSet(_x, _y, _z, _w));
}

//------------------------------------------------------------------------------
_Use_decl_annotations_
inline XMUBYTEN4::XMUBYTEN4
(
    const float* pArray
)
{
    XMStoreUByteN4(this, XMLoadFloat4(reinterpret_cast<const XMFLOAT4*>(pArray)));
}

/****************************************************************************
 *
 * XMUBYTE4 operators
 *
 ****************************************************************************/

//------------------------------------------------------------------------------
inline XMUBYTE4::XMUBYTE4
(
    float _x,
    float _y,
    float _z,
    float _w
)
{
    XMStoreUByte4(this, XMVectorSet(_x, _y, _z, _w));
}

//------------------------------------------------------------------------------
_Use_decl_annotations_
inline XMUBYTE4::XMUBYTE4
(
    const float* pArray
)
{
    XMStoreUByte4(this, XMLoadFloat4(reinterpret_cast<const XMFLOAT4*>(pArray)));
}

/****************************************************************************
 *
 * XMUNIBBLE4 operators
 *
 ****************************************************************************/

//------------------------------------------------------------------------------
inline XMUNIBBLE4::XMUNIBBLE4
(
    float _x,
    float _y,
    float _z,
    float _w
)
{
    XMStoreUNibble4(this, XMVectorSet( _x, _y, _z, _w ));
}

//------------------------------------------------------------------------------
_Use_decl_annotations_
inline XMUNIBBLE4::XMUNIBBLE4
(
    const float *pArray
)
{
    XMStoreUNibble4(this, XMLoadFloat4(reinterpret_cast<const XMFLOAT4*>(pArray)));
}
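
// Editor's note (illustrative sketch, not part of the original source):
// XMUNIBBLE4 packs four 4-bit unsigned integers (0..15 each) into 16 bits,
// while the 8-bit four-component types above mirror the 2-component versions
// (the *N forms are normalized, the plain forms hold integers).
//
//     using namespace DirectX::PackedVector;
//
//     XMBYTEN4   normal(0.0f, 0.0f, 1.0f, 0.0f);   // normalized signed bytes
//     XMUNIBBLE4 nib(15.0f, 8.0f, 4.0f, 1.0f);     // each component in 0..15
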
/****************************************************************************
 *
 * XMU555 operators
 *
 ****************************************************************************/

//------------------------------------------------------------------------------
inline XMU555::XMU555
(
    float _x,
    float _y,
    float _z,
    bool _w
)
{
    XMStoreU555(this, XMVectorSet(_x, _y, _z, ((_w) ? 1.0f : 0.0f) ));
}

//------------------------------------------------------------------------------
_Use_decl_annotations_
inline XMU555::XMU555
(
    const float *pArray,
    bool _w
)
{
    XMVECTOR V = XMLoadFloat3(reinterpret_cast<const XMFLOAT3*>(pArray));
    XMStoreU555(this, XMVectorSetW(V, ((_w) ? 1.0f : 0.0f) ));
}
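
// Editor's note (illustrative sketch, not part of the original source): XMU555
// holds three 5-bit unsigned components plus a single w bit, which is why both
// constructors take a bool for _w and translate it to 1.0f or 0.0f before the
// store.
//
//     using namespace DirectX;
//     using namespace DirectX::PackedVector;
//
//     XMU555 a(31.0f, 16.0f, 8.0f, true);    // w bit set
//
//     const float xyz[3] = { 4.0f, 12.0f, 20.0f };
//     XMU555 b(xyz, false);                  // w bit clear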