DirectXMathVector.inl 489 KB

[File body (14,308 lines) not captured: only the viewer's line-number gutter survived extraction.]
4309143101431114312143131431414315143161431714318143191432014321143221432314324143251432614327143281432914330143311433214333143341433514336143371433814339143401434114342143431434414345143461434714348143491435014351143521435314354143551435614357143581435914360143611436214363143641436514366143671436814369143701437114372143731437414375143761437714378143791438014381143821438314384143851438614387143881438914390143911439214393143941439514396143971439814399144001440114402144031440414405144061440714408144091441014411144121441314414144151441614417144181441914420144211442214423144241442514426144271442814429144301443114432144331443414435144361443714438144391444014441144421444314444144451444614447144481444914450144511445214453144541445514456144571445814459144601446114462144631446414465144661446714468144691447014471144721447314474144751447614477144781447914480144811448214483144841448514486144871448814489144901449114492144931449414495144961449714498144991450014501145021450314504145051450614507145081450914510145111451214513145141451514516145171451814519145201452114522145231452414525145261452714528145291453014531145321453314534145351453614537145381453914540145411454214543145441454514546145471454814549145501455114552145531455414555145561455714558145591456014561145621456314564145651456614567145681456914570145711457214573145741457514576145771457814579145801458114582145831458414585145861458714588145891459014591145921459314594
  1. //-------------------------------------------------------------------------------------
  2. // DirectXMathVector.inl -- SIMD C++ Math library
  3. //
  4. // THIS CODE AND INFORMATION IS PROVIDED "AS IS" WITHOUT WARRANTY OF
  5. // ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING BUT NOT LIMITED TO
  6. // THE IMPLIED WARRANTIES OF MERCHANTABILITY AND/OR FITNESS FOR A
  7. // PARTICULAR PURPOSE.
  8. //
  9. // Copyright (c) Microsoft Corporation. All rights reserved.
  10. //
  11. // http://go.microsoft.com/fwlink/?LinkID=615560
  12. //-------------------------------------------------------------------------------------
  13. #pragma once
  14. #if defined(_XM_NO_INTRINSICS_)
  15. #define XMISNAN(x) ((*(const uint32_t*)&(x) & 0x7F800000) == 0x7F800000 && (*(const uint32_t*)&(x) & 0x7FFFFF) != 0)
  16. #define XMISINF(x) ((*(const uint32_t*)&(x) & 0x7FFFFFFF) == 0x7F800000)
  17. #endif
  18. #if defined(_XM_SSE_INTRINSICS_)
  19. #define XM3UNPACK3INTO4(l1,l2,l3) \
  20. XMVECTOR V3 = _mm_shuffle_ps(l2,l3,_MM_SHUFFLE(0,0,3,2));\
  21. XMVECTOR V2 = _mm_shuffle_ps(l2,l1,_MM_SHUFFLE(3,3,1,0));\
  22. V2 = XM_PERMUTE_PS(V2,_MM_SHUFFLE(1,1,0,2));\
23. XMVECTOR V4 = _mm_castsi128_ps( _mm_srli_si128(_mm_castps_si128(l3),32/8) );
  24. #define XM3PACK4INTO3(v2x) \
  25. v2x = _mm_shuffle_ps(V2,V3,_MM_SHUFFLE(1,0,2,1));\
  26. V2 = _mm_shuffle_ps(V2,V1,_MM_SHUFFLE(2,2,0,0));\
  27. V1 = _mm_shuffle_ps(V1,V2,_MM_SHUFFLE(0,2,1,0));\
  28. V3 = _mm_shuffle_ps(V3,V4,_MM_SHUFFLE(0,0,2,2));\
29. V3 = _mm_shuffle_ps(V3,V4,_MM_SHUFFLE(2,1,2,0));
  30. #endif
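// Editorial note: the two helper macros above are consumed by the packed
// XMFLOAT3 stream routines later in this file. They assume XMVECTOR locals
// named V1 through V4 exist in the caller's scope and convert between three
// registers of tightly packed float3 data and four per-element registers
// (and back again for the store path).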
  31. /****************************************************************************
  32. *
  33. * General Vector
  34. *
  35. ****************************************************************************/
  36. //------------------------------------------------------------------------------
  37. // Assignment operations
  38. //------------------------------------------------------------------------------
  39. //------------------------------------------------------------------------------
  40. // Return a vector with all elements equaling zero
  41. inline XMVECTOR XM_CALLCONV XMVectorZero()
  42. {
  43. #if defined(_XM_NO_INTRINSICS_)
  44. XMVECTORF32 vResult = { { { 0.0f, 0.0f, 0.0f, 0.0f } } };
  45. return vResult.v;
  46. #elif defined(_XM_ARM_NEON_INTRINSICS_)
  47. return vdupq_n_f32(0);
  48. #elif defined(_XM_SSE_INTRINSICS_)
  49. return _mm_setzero_ps();
  50. #endif
  51. }
  52. //------------------------------------------------------------------------------
  53. // Initialize a vector with four floating point values
  54. inline XMVECTOR XM_CALLCONV XMVectorSet
  55. (
  56. float x,
  57. float y,
  58. float z,
  59. float w
  60. )
  61. {
  62. #if defined(_XM_NO_INTRINSICS_)
  63. XMVECTORF32 vResult = { { { x, y, z, w } } };
  64. return vResult.v;
  65. #elif defined(_XM_ARM_NEON_INTRINSICS_)
  66. float32x2_t V0 = vcreate_f32(((uint64_t)*(const uint32_t *)&x) | ((uint64_t)(*(const uint32_t *)&y) << 32));
  67. float32x2_t V1 = vcreate_f32(((uint64_t)*(const uint32_t *)&z) | ((uint64_t)(*(const uint32_t *)&w) << 32));
  68. return vcombine_f32(V0, V1);
  69. #elif defined(_XM_SSE_INTRINSICS_)
  70. return _mm_set_ps( w, z, y, x );
  71. #endif
  72. }
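//------------------------------------------------------------------------------
// Editorial usage sketch (illustrative only; "ExampleMakePoint" is not part of
// the library). XMVectorSet builds a register from four scalars, while
// XMVectorZero is the cheapest way to obtain an all-zero register.
inline XMVECTOR XM_CALLCONV ExampleMakePoint(float x, float y, float z)
{
    XMVECTOR origin = XMVectorZero();             // (0, 0, 0, 0)
    XMVECTOR point  = XMVectorSet(x, y, z, 1.0f); // w = 1 marks a position
    return XMVectorAdd(origin, point);            // component-wise add
}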
  73. //------------------------------------------------------------------------------
  74. // Initialize a vector with four integer values
  75. inline XMVECTOR XM_CALLCONV XMVectorSetInt
  76. (
  77. uint32_t x,
  78. uint32_t y,
  79. uint32_t z,
  80. uint32_t w
  81. )
  82. {
  83. #if defined(_XM_NO_INTRINSICS_)
  84. XMVECTORU32 vResult = { { { x, y, z, w } } };
  85. return vResult.v;
  86. #elif defined(_XM_ARM_NEON_INTRINSICS_)
  87. uint32x2_t V0 = vcreate_u32(((uint64_t)x) | ((uint64_t)y << 32));
  88. uint32x2_t V1 = vcreate_u32(((uint64_t)z) | ((uint64_t)w << 32));
  89. return vcombine_u32(V0, V1);
  90. #elif defined(_XM_SSE_INTRINSICS_)
  91. __m128i V = _mm_set_epi32( w, z, y, x );
  92. return _mm_castsi128_ps(V);
  93. #endif
  94. }
  95. //------------------------------------------------------------------------------
  96. // Initialize a vector with a replicated floating point value
  97. inline XMVECTOR XM_CALLCONV XMVectorReplicate
  98. (
  99. float Value
  100. )
  101. {
  102. #if defined(_XM_NO_INTRINSICS_)
  103. XMVECTORF32 vResult;
  104. vResult.f[0] =
  105. vResult.f[1] =
  106. vResult.f[2] =
  107. vResult.f[3] = Value;
  108. return vResult.v;
  109. #elif defined(_XM_ARM_NEON_INTRINSICS_)
  110. return vdupq_n_f32( Value );
  111. #elif defined(_XM_SSE_INTRINSICS_)
  112. return _mm_set_ps1( Value );
  113. #endif
  114. }
  115. //------------------------------------------------------------------------------
  116. // Initialize a vector with a replicated floating point value passed by pointer
  117. _Use_decl_annotations_
  118. inline XMVECTOR XM_CALLCONV XMVectorReplicatePtr
  119. (
  120. const float *pValue
  121. )
  122. {
  123. #if defined(_XM_NO_INTRINSICS_)
  124. float Value = pValue[0];
  125. XMVECTORF32 vResult;
  126. vResult.f[0] =
  127. vResult.f[1] =
  128. vResult.f[2] =
  129. vResult.f[3] = Value;
  130. return vResult.v;
  131. #elif defined(_XM_ARM_NEON_INTRINSICS_)
  132. return vld1q_dup_f32( pValue );
  133. #elif defined(_XM_AVX_INTRINSICS_)
  134. return _mm_broadcast_ss( pValue );
  135. #elif defined(_XM_SSE_INTRINSICS_)
  136. return _mm_load_ps1( pValue );
  137. #endif
  138. }
  139. //------------------------------------------------------------------------------
  140. // Initialize a vector with a replicated integer value
  141. inline XMVECTOR XM_CALLCONV XMVectorReplicateInt
  142. (
  143. uint32_t Value
  144. )
  145. {
  146. #if defined(_XM_NO_INTRINSICS_)
  147. XMVECTORU32 vResult;
  148. vResult.u[0] =
  149. vResult.u[1] =
  150. vResult.u[2] =
  151. vResult.u[3] = Value;
  152. return vResult.v;
  153. #elif defined(_XM_ARM_NEON_INTRINSICS_)
  154. return vdupq_n_u32( Value );
  155. #elif defined(_XM_SSE_INTRINSICS_)
  156. __m128i vTemp = _mm_set1_epi32( Value );
  157. return _mm_castsi128_ps(vTemp);
  158. #endif
  159. }
  160. //------------------------------------------------------------------------------
  161. // Initialize a vector with a replicated integer value passed by pointer
  162. _Use_decl_annotations_
  163. inline XMVECTOR XM_CALLCONV XMVectorReplicateIntPtr
  164. (
  165. const uint32_t *pValue
  166. )
  167. {
  168. #if defined(_XM_NO_INTRINSICS_)
  169. uint32_t Value = pValue[0];
  170. XMVECTORU32 vResult;
  171. vResult.u[0] =
  172. vResult.u[1] =
  173. vResult.u[2] =
  174. vResult.u[3] = Value;
  175. return vResult.v;
  176. #elif defined(_XM_ARM_NEON_INTRINSICS_)
  177. return vld1q_dup_u32(pValue);
  178. #elif defined(_XM_SSE_INTRINSICS_)
  179. return _mm_load_ps1(reinterpret_cast<const float *>(pValue));
  180. #endif
  181. }
  182. //------------------------------------------------------------------------------
  183. // Initialize a vector with all bits set (true mask)
  184. inline XMVECTOR XM_CALLCONV XMVectorTrueInt()
  185. {
  186. #if defined(_XM_NO_INTRINSICS_)
  187. XMVECTORU32 vResult = { { { 0xFFFFFFFFU, 0xFFFFFFFFU, 0xFFFFFFFFU, 0xFFFFFFFFU } } };
  188. return vResult.v;
  189. #elif defined(_XM_ARM_NEON_INTRINSICS_)
  190. return vdupq_n_s32(-1);
  191. #elif defined(_XM_SSE_INTRINSICS_)
  192. __m128i V = _mm_set1_epi32(-1);
  193. return _mm_castsi128_ps(V);
  194. #endif
  195. }
  196. //------------------------------------------------------------------------------
  197. // Initialize a vector with all bits clear (false mask)
  198. inline XMVECTOR XM_CALLCONV XMVectorFalseInt()
  199. {
  200. #if defined(_XM_NO_INTRINSICS_)
  201. XMVECTORF32 vResult = { { { 0.0f, 0.0f, 0.0f, 0.0f } } };
202. return vResult.v;
  203. #elif defined(_XM_ARM_NEON_INTRINSICS_)
  204. return vdupq_n_u32(0);
  205. #elif defined(_XM_SSE_INTRINSICS_)
  206. return _mm_setzero_ps();
  207. #endif
  208. }
  209. //------------------------------------------------------------------------------
  210. // Replicate the x component of the vector
  211. inline XMVECTOR XM_CALLCONV XMVectorSplatX
  212. (
  213. FXMVECTOR V
  214. )
  215. {
  216. #if defined(_XM_NO_INTRINSICS_)
  217. XMVECTORF32 vResult;
  218. vResult.f[0] =
  219. vResult.f[1] =
  220. vResult.f[2] =
  221. vResult.f[3] = V.vector4_f32[0];
  222. return vResult.v;
  223. #elif defined(_XM_ARM_NEON_INTRINSICS_)
  224. return vdupq_lane_f32( vget_low_f32( V ), 0 );
  225. #elif defined(_XM_AVX2_INTRINSICS_)
  226. return _mm_broadcastss_ps( V );
  227. #elif defined(_XM_SSE_INTRINSICS_)
  228. return XM_PERMUTE_PS( V, _MM_SHUFFLE(0, 0, 0, 0) );
  229. #endif
  230. }
  231. //------------------------------------------------------------------------------
  232. // Replicate the y component of the vector
  233. inline XMVECTOR XM_CALLCONV XMVectorSplatY
  234. (
  235. FXMVECTOR V
  236. )
  237. {
  238. #if defined(_XM_NO_INTRINSICS_)
  239. XMVECTORF32 vResult;
  240. vResult.f[0] =
  241. vResult.f[1] =
  242. vResult.f[2] =
  243. vResult.f[3] = V.vector4_f32[1];
  244. return vResult.v;
  245. #elif defined(_XM_ARM_NEON_INTRINSICS_)
  246. return vdupq_lane_f32( vget_low_f32( V ), 1 );
  247. #elif defined(_XM_SSE_INTRINSICS_)
  248. return XM_PERMUTE_PS( V, _MM_SHUFFLE(1, 1, 1, 1) );
  249. #endif
  250. }
  251. //------------------------------------------------------------------------------
  252. // Replicate the z component of the vector
  253. inline XMVECTOR XM_CALLCONV XMVectorSplatZ
  254. (
  255. FXMVECTOR V
  256. )
  257. {
  258. #if defined(_XM_NO_INTRINSICS_)
  259. XMVECTORF32 vResult;
  260. vResult.f[0] =
  261. vResult.f[1] =
  262. vResult.f[2] =
  263. vResult.f[3] = V.vector4_f32[2];
  264. return vResult.v;
  265. #elif defined(_XM_ARM_NEON_INTRINSICS_)
  266. return vdupq_lane_f32( vget_high_f32( V ), 0 );
  267. #elif defined(_XM_SSE_INTRINSICS_)
  268. return XM_PERMUTE_PS( V, _MM_SHUFFLE(2, 2, 2, 2) );
  269. #endif
  270. }
  271. //------------------------------------------------------------------------------
  272. // Replicate the w component of the vector
  273. inline XMVECTOR XM_CALLCONV XMVectorSplatW
  274. (
  275. FXMVECTOR V
  276. )
  277. {
  278. #if defined(_XM_NO_INTRINSICS_)
  279. XMVECTORF32 vResult;
  280. vResult.f[0] =
  281. vResult.f[1] =
  282. vResult.f[2] =
  283. vResult.f[3] = V.vector4_f32[3];
  284. return vResult.v;
  285. #elif defined(_XM_ARM_NEON_INTRINSICS_)
  286. return vdupq_lane_f32( vget_high_f32( V ), 1 );
  287. #elif defined(_XM_SSE_INTRINSICS_)
  288. return XM_PERMUTE_PS( V, _MM_SHUFFLE(3, 3, 3, 3) );
  289. #endif
  290. }
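//------------------------------------------------------------------------------
// Editorial usage sketch (illustrative only; "ExampleScaleByX" is a
// hypothetical helper). Splatting broadcasts one lane to all four, which is
// the usual way to feed a single component into a vector-wide multiply.
inline XMVECTOR XM_CALLCONV ExampleScaleByX(FXMVECTOR V)
{
    XMVECTOR scale = XMVectorSplatX(V);   // (x, x, x, x)
    return XMVectorMultiply(V, scale);    // (x*x, y*x, z*x, w*x)
}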
  291. //------------------------------------------------------------------------------
  292. // Return a vector of 1.0f,1.0f,1.0f,1.0f
  293. inline XMVECTOR XM_CALLCONV XMVectorSplatOne()
  294. {
  295. #if defined(_XM_NO_INTRINSICS_)
  296. XMVECTORF32 vResult;
  297. vResult.f[0] =
  298. vResult.f[1] =
  299. vResult.f[2] =
  300. vResult.f[3] = 1.0f;
  301. return vResult.v;
  302. #elif defined(_XM_ARM_NEON_INTRINSICS_)
  303. return vdupq_n_f32(1.0f);
  304. #elif defined(_XM_SSE_INTRINSICS_)
  305. return g_XMOne;
  306. #endif
  307. }
  308. //------------------------------------------------------------------------------
  309. // Return a vector of INF,INF,INF,INF
  310. inline XMVECTOR XM_CALLCONV XMVectorSplatInfinity()
  311. {
  312. #if defined(_XM_NO_INTRINSICS_)
  313. XMVECTORU32 vResult;
  314. vResult.u[0] =
  315. vResult.u[1] =
  316. vResult.u[2] =
  317. vResult.u[3] = 0x7F800000;
  318. return vResult.v;
  319. #elif defined(_XM_ARM_NEON_INTRINSICS_)
  320. return vdupq_n_u32(0x7F800000);
  321. #elif defined(_XM_SSE_INTRINSICS_)
  322. return g_XMInfinity;
  323. #endif
  324. }
  325. //------------------------------------------------------------------------------
  326. // Return a vector of Q_NAN,Q_NAN,Q_NAN,Q_NAN
  327. inline XMVECTOR XM_CALLCONV XMVectorSplatQNaN()
  328. {
  329. #if defined(_XM_NO_INTRINSICS_)
  330. XMVECTORU32 vResult;
  331. vResult.u[0] =
  332. vResult.u[1] =
  333. vResult.u[2] =
  334. vResult.u[3] = 0x7FC00000;
  335. return vResult.v;
  336. #elif defined(_XM_ARM_NEON_INTRINSICS_)
  337. return vdupq_n_u32(0x7FC00000);
  338. #elif defined(_XM_SSE_INTRINSICS_)
  339. return g_XMQNaN;
  340. #endif
  341. }
  342. //------------------------------------------------------------------------------
  343. // Return a vector of 1.192092896e-7f,1.192092896e-7f,1.192092896e-7f,1.192092896e-7f
  344. inline XMVECTOR XM_CALLCONV XMVectorSplatEpsilon()
  345. {
  346. #if defined(_XM_NO_INTRINSICS_)
  347. XMVECTORU32 vResult;
  348. vResult.u[0] =
  349. vResult.u[1] =
  350. vResult.u[2] =
  351. vResult.u[3] = 0x34000000;
  352. return vResult.v;
  353. #elif defined(_XM_ARM_NEON_INTRINSICS_)
  354. return vdupq_n_u32(0x34000000);
  355. #elif defined(_XM_SSE_INTRINSICS_)
  356. return g_XMEpsilon;
  357. #endif
  358. }
  359. //------------------------------------------------------------------------------
  360. // Return a vector of -0.0f (0x80000000),-0.0f,-0.0f,-0.0f
  361. inline XMVECTOR XM_CALLCONV XMVectorSplatSignMask()
  362. {
  363. #if defined(_XM_NO_INTRINSICS_)
  364. XMVECTORU32 vResult;
  365. vResult.u[0] =
  366. vResult.u[1] =
  367. vResult.u[2] =
  368. vResult.u[3] = 0x80000000U;
  369. return vResult.v;
  370. #elif defined(_XM_ARM_NEON_INTRINSICS_)
  371. return vdupq_n_u32(0x80000000U);
  372. #elif defined(_XM_SSE_INTRINSICS_)
  373. __m128i V = _mm_set1_epi32( 0x80000000 );
  374. return _mm_castsi128_ps(V);
  375. #endif
  376. }
  377. //------------------------------------------------------------------------------
  378. // Return a floating point value via an index. This is not a recommended
  379. // function to use due to performance loss.
  380. inline float XM_CALLCONV XMVectorGetByIndex(FXMVECTOR V, size_t i)
  381. {
  382. assert( i < 4 );
  383. _Analysis_assume_( i < 4 );
  384. #if defined(_XM_NO_INTRINSICS_)
  385. return V.vector4_f32[i];
  386. #else
  387. XMVECTORF32 U;
  388. U.v = V;
  389. return U.f[i];
  390. #endif
  391. }
  392. //------------------------------------------------------------------------------
  393. // Return the X component in an FPU register.
  394. inline float XM_CALLCONV XMVectorGetX(FXMVECTOR V)
  395. {
  396. #if defined(_XM_NO_INTRINSICS_)
  397. return V.vector4_f32[0];
  398. #elif defined(_XM_ARM_NEON_INTRINSICS_)
  399. return vgetq_lane_f32(V, 0);
  400. #elif defined(_XM_SSE_INTRINSICS_)
  401. return _mm_cvtss_f32(V);
  402. #endif
  403. }
  404. // Return the Y component in an FPU register.
  405. inline float XM_CALLCONV XMVectorGetY(FXMVECTOR V)
  406. {
  407. #if defined(_XM_NO_INTRINSICS_)
  408. return V.vector4_f32[1];
  409. #elif defined(_XM_ARM_NEON_INTRINSICS_)
  410. return vgetq_lane_f32(V, 1);
  411. #elif defined(_XM_SSE_INTRINSICS_)
  412. XMVECTOR vTemp = XM_PERMUTE_PS(V,_MM_SHUFFLE(1,1,1,1));
  413. return _mm_cvtss_f32(vTemp);
  414. #endif
  415. }
  416. // Return the Z component in an FPU register.
  417. inline float XM_CALLCONV XMVectorGetZ(FXMVECTOR V)
  418. {
  419. #if defined(_XM_NO_INTRINSICS_)
  420. return V.vector4_f32[2];
  421. #elif defined(_XM_ARM_NEON_INTRINSICS_)
  422. return vgetq_lane_f32(V, 2);
  423. #elif defined(_XM_SSE_INTRINSICS_)
  424. XMVECTOR vTemp = XM_PERMUTE_PS(V,_MM_SHUFFLE(2,2,2,2));
  425. return _mm_cvtss_f32(vTemp);
  426. #endif
  427. }
  428. // Return the W component in an FPU register.
  429. inline float XM_CALLCONV XMVectorGetW(FXMVECTOR V)
  430. {
  431. #if defined(_XM_NO_INTRINSICS_)
  432. return V.vector4_f32[3];
  433. #elif defined(_XM_ARM_NEON_INTRINSICS_)
  434. return vgetq_lane_f32(V, 3);
  435. #elif defined(_XM_SSE_INTRINSICS_)
  436. XMVECTOR vTemp = XM_PERMUTE_PS(V,_MM_SHUFFLE(3,3,3,3));
  437. return _mm_cvtss_f32(vTemp);
  438. #endif
  439. }
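//------------------------------------------------------------------------------
// Editorial usage sketch (illustrative only; "ExampleHorizontalSum" is a
// hypothetical helper). The fixed accessors compile to a single extract or
// shuffle, whereas XMVectorGetByIndex round-trips through a memory union, so
// prefer GetX/GetY/GetZ/GetW whenever the component is known at compile time.
inline float XM_CALLCONV ExampleHorizontalSum(FXMVECTOR V)
{
    return XMVectorGetX(V) + XMVectorGetY(V) + XMVectorGetZ(V) + XMVectorGetW(V);
}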
  440. //------------------------------------------------------------------------------
  441. // Store a component indexed by i into a 32 bit float location in memory.
  442. _Use_decl_annotations_
  443. inline void XM_CALLCONV XMVectorGetByIndexPtr(float *f, FXMVECTOR V, size_t i)
  444. {
  445. assert( f != nullptr );
  446. assert( i < 4 );
  447. _Analysis_assume_( i < 4 );
  448. #if defined(_XM_NO_INTRINSICS_)
  449. *f = V.vector4_f32[i];
  450. #else
  451. XMVECTORF32 U;
  452. U.v = V;
  453. *f = U.f[i];
  454. #endif
  455. }
  456. //------------------------------------------------------------------------------
  457. // Store the X component into a 32 bit float location in memory.
  458. _Use_decl_annotations_
  459. inline void XM_CALLCONV XMVectorGetXPtr(float *x, FXMVECTOR V)
  460. {
  461. assert( x != nullptr);
  462. #if defined(_XM_NO_INTRINSICS_)
  463. *x = V.vector4_f32[0];
  464. #elif defined(_XM_ARM_NEON_INTRINSICS_)
  465. vst1q_lane_f32(x,V,0);
  466. #elif defined(_XM_SSE_INTRINSICS_)
  467. _mm_store_ss(x,V);
  468. #endif
  469. }
  470. // Store the Y component into a 32 bit float location in memory.
  471. _Use_decl_annotations_
  472. inline void XM_CALLCONV XMVectorGetYPtr(float *y, FXMVECTOR V)
  473. {
  474. assert( y != nullptr );
  475. #if defined(_XM_NO_INTRINSICS_)
  476. *y = V.vector4_f32[1];
  477. #elif defined(_XM_ARM_NEON_INTRINSICS_)
  478. vst1q_lane_f32(y,V,1);
  479. #elif defined(_XM_SSE4_INTRINSICS_)
  480. *((int*)y) = _mm_extract_ps( V, 1 );
  481. #elif defined(_XM_SSE_INTRINSICS_)
  482. XMVECTOR vResult = XM_PERMUTE_PS(V,_MM_SHUFFLE(1,1,1,1));
  483. _mm_store_ss(y,vResult);
  484. #endif
  485. }
  486. // Store the Z component into a 32 bit float location in memory.
  487. _Use_decl_annotations_
  488. inline void XM_CALLCONV XMVectorGetZPtr(float *z, FXMVECTOR V)
  489. {
  490. assert( z != nullptr );
  491. #if defined(_XM_NO_INTRINSICS_)
  492. *z = V.vector4_f32[2];
  493. #elif defined(_XM_ARM_NEON_INTRINSICS_)
  494. vst1q_lane_f32(z,V,2);
  495. #elif defined(_XM_SSE4_INTRINSICS_)
  496. *((int*)z) = _mm_extract_ps( V, 2 );
  497. #elif defined(_XM_SSE_INTRINSICS_)
  498. XMVECTOR vResult = XM_PERMUTE_PS(V,_MM_SHUFFLE(2,2,2,2));
  499. _mm_store_ss(z,vResult);
  500. #endif
  501. }
  502. // Store the W component into a 32 bit float location in memory.
  503. _Use_decl_annotations_
  504. inline void XM_CALLCONV XMVectorGetWPtr(float *w, FXMVECTOR V)
  505. {
  506. assert( w != nullptr );
  507. #if defined(_XM_NO_INTRINSICS_)
  508. *w = V.vector4_f32[3];
  509. #elif defined(_XM_ARM_NEON_INTRINSICS_)
  510. vst1q_lane_f32(w,V,3);
  511. #elif defined(_XM_SSE4_INTRINSICS_)
  512. *((int*)w) = _mm_extract_ps( V, 3 );
  513. #elif defined(_XM_SSE_INTRINSICS_)
  514. XMVECTOR vResult = XM_PERMUTE_PS(V,_MM_SHUFFLE(3,3,3,3));
  515. _mm_store_ss(w,vResult);
  516. #endif
  517. }
  518. //------------------------------------------------------------------------------
  519. // Return an integer value via an index. This is not a recommended
  520. // function to use due to performance loss.
  521. inline uint32_t XM_CALLCONV XMVectorGetIntByIndex(FXMVECTOR V, size_t i)
  522. {
  523. assert( i < 4 );
  524. _Analysis_assume_( i < 4 );
  525. #if defined(_XM_NO_INTRINSICS_)
  526. return V.vector4_u32[i];
  527. #else
  528. XMVECTORU32 U;
  529. U.v = V;
  530. return U.u[i];
  531. #endif
  532. }
  533. //------------------------------------------------------------------------------
  534. // Return the X component in an integer register.
  535. inline uint32_t XM_CALLCONV XMVectorGetIntX(FXMVECTOR V)
  536. {
  537. #if defined(_XM_NO_INTRINSICS_)
  538. return V.vector4_u32[0];
  539. #elif defined(_XM_ARM_NEON_INTRINSICS_)
  540. return vgetq_lane_u32(V, 0);
  541. #elif defined(_XM_SSE_INTRINSICS_)
  542. return static_cast<uint32_t>(_mm_cvtsi128_si32(_mm_castps_si128(V)));
  543. #endif
  544. }
  545. // Return the Y component in an integer register.
  546. inline uint32_t XM_CALLCONV XMVectorGetIntY(FXMVECTOR V)
  547. {
  548. #if defined(_XM_NO_INTRINSICS_)
  549. return V.vector4_u32[1];
  550. #elif defined(_XM_ARM_NEON_INTRINSICS_)
  551. return vgetq_lane_u32(V, 1);
  552. #elif defined(_XM_SSE4_INTRINSICS_)
  553. __m128i V1 = _mm_castps_si128( V );
  554. return static_cast<uint32_t>( _mm_extract_epi32( V1, 1 ) );
  555. #elif defined(_XM_SSE_INTRINSICS_)
  556. __m128i vResulti = _mm_shuffle_epi32(_mm_castps_si128(V),_MM_SHUFFLE(1,1,1,1));
  557. return static_cast<uint32_t>(_mm_cvtsi128_si32(vResulti));
  558. #endif
  559. }
  560. // Return the Z component in an integer register.
  561. inline uint32_t XM_CALLCONV XMVectorGetIntZ(FXMVECTOR V)
  562. {
  563. #if defined(_XM_NO_INTRINSICS_)
  564. return V.vector4_u32[2];
  565. #elif defined(_XM_ARM_NEON_INTRINSICS_)
  566. return vgetq_lane_u32(V, 2);
  567. #elif defined(_XM_SSE4_INTRINSICS_)
  568. __m128i V1 = _mm_castps_si128( V );
  569. return static_cast<uint32_t>( _mm_extract_epi32( V1, 2 ) );
  570. #elif defined(_XM_SSE_INTRINSICS_)
  571. __m128i vResulti = _mm_shuffle_epi32(_mm_castps_si128(V),_MM_SHUFFLE(2,2,2,2));
  572. return static_cast<uint32_t>(_mm_cvtsi128_si32(vResulti));
  573. #endif
  574. }
  575. // Return the W component in an integer register.
  576. inline uint32_t XM_CALLCONV XMVectorGetIntW(FXMVECTOR V)
  577. {
  578. #if defined(_XM_NO_INTRINSICS_)
  579. return V.vector4_u32[3];
  580. #elif defined(_XM_ARM_NEON_INTRINSICS_)
  581. return vgetq_lane_u32(V, 3);
  582. #elif defined(_XM_SSE4_INTRINSICS_)
  583. __m128i V1 = _mm_castps_si128( V );
  584. return static_cast<uint32_t>( _mm_extract_epi32( V1, 3 ) );
  585. #elif defined(_XM_SSE_INTRINSICS_)
  586. __m128i vResulti = _mm_shuffle_epi32(_mm_castps_si128(V),_MM_SHUFFLE(3,3,3,3));
  587. return static_cast<uint32_t>(_mm_cvtsi128_si32(vResulti));
  588. #endif
  589. }
  590. //------------------------------------------------------------------------------
  591. // Store a component indexed by i into a 32 bit integer location in memory.
  592. _Use_decl_annotations_
  593. inline void XM_CALLCONV XMVectorGetIntByIndexPtr(uint32_t *x, FXMVECTOR V, size_t i)
  594. {
  595. assert( x != nullptr );
  596. assert( i < 4 );
  597. _Analysis_assume_( i < 4 );
  598. #if defined(_XM_NO_INTRINSICS_)
  599. *x = V.vector4_u32[i];
  600. #else
  601. XMVECTORU32 U;
  602. U.v = V;
  603. *x = U.u[i];
  604. #endif
  605. }
  606. //------------------------------------------------------------------------------
  607. // Store the X component into a 32 bit integer location in memory.
  608. _Use_decl_annotations_
  609. inline void XM_CALLCONV XMVectorGetIntXPtr(uint32_t *x, FXMVECTOR V)
  610. {
  611. assert( x != nullptr );
  612. #if defined(_XM_NO_INTRINSICS_)
  613. *x = V.vector4_u32[0];
  614. #elif defined(_XM_ARM_NEON_INTRINSICS_)
  615. vst1q_lane_u32(x,*reinterpret_cast<const uint32x4_t*>(&V),0);
  616. #elif defined(_XM_SSE_INTRINSICS_)
  617. _mm_store_ss(reinterpret_cast<float *>(x),V);
  618. #endif
  619. }
  620. // Store the Y component into a 32 bit integer location in memory.
  621. _Use_decl_annotations_
  622. inline void XM_CALLCONV XMVectorGetIntYPtr(uint32_t *y, FXMVECTOR V)
  623. {
  624. assert( y != nullptr );
  625. #if defined(_XM_NO_INTRINSICS_)
  626. *y = V.vector4_u32[1];
  627. #elif defined(_XM_ARM_NEON_INTRINSICS_)
  628. vst1q_lane_u32(y,*reinterpret_cast<const uint32x4_t*>(&V),1);
  629. #elif defined(_XM_SSE4_INTRINSICS_)
  630. __m128i V1 = _mm_castps_si128( V );
  631. *y = static_cast<uint32_t>( _mm_extract_epi32( V1, 1 ) );
  632. #elif defined(_XM_SSE_INTRINSICS_)
  633. XMVECTOR vResult = XM_PERMUTE_PS(V,_MM_SHUFFLE(1,1,1,1));
  634. _mm_store_ss(reinterpret_cast<float *>(y),vResult);
  635. #endif
  636. }
637. // Store the Z component into a 32 bit integer location in memory.
  638. _Use_decl_annotations_
  639. inline void XM_CALLCONV XMVectorGetIntZPtr(uint32_t *z, FXMVECTOR V)
  640. {
  641. assert( z != nullptr );
  642. #if defined(_XM_NO_INTRINSICS_)
  643. *z = V.vector4_u32[2];
  644. #elif defined(_XM_ARM_NEON_INTRINSICS_)
  645. vst1q_lane_u32(z,*reinterpret_cast<const uint32x4_t*>(&V),2);
  646. #elif defined(_XM_SSE4_INTRINSICS_)
  647. __m128i V1 = _mm_castps_si128( V );
  648. *z = static_cast<uint32_t>( _mm_extract_epi32( V1, 2 ) );
  649. #elif defined(_XM_SSE_INTRINSICS_)
  650. XMVECTOR vResult = XM_PERMUTE_PS(V,_MM_SHUFFLE(2,2,2,2));
  651. _mm_store_ss(reinterpret_cast<float *>(z),vResult);
  652. #endif
  653. }
  654. // Store the W component into a 32 bit integer location in memory.
  655. _Use_decl_annotations_
  656. inline void XM_CALLCONV XMVectorGetIntWPtr(uint32_t *w, FXMVECTOR V)
  657. {
  658. assert( w != nullptr );
  659. #if defined(_XM_NO_INTRINSICS_)
  660. *w = V.vector4_u32[3];
  661. #elif defined(_XM_ARM_NEON_INTRINSICS_)
  662. vst1q_lane_u32(w,*reinterpret_cast<const uint32x4_t*>(&V),3);
  663. #elif defined(_XM_SSE4_INTRINSICS_)
  664. __m128i V1 = _mm_castps_si128( V );
  665. *w = static_cast<uint32_t>( _mm_extract_epi32( V1, 3 ) );
  666. #elif defined(_XM_SSE_INTRINSICS_)
  667. XMVECTOR vResult = XM_PERMUTE_PS(V,_MM_SHUFFLE(3,3,3,3));
  668. _mm_store_ss(reinterpret_cast<float *>(w),vResult);
  669. #endif
  670. }
  671. //------------------------------------------------------------------------------
  672. // Set a single indexed floating point component
  673. inline XMVECTOR XM_CALLCONV XMVectorSetByIndex(FXMVECTOR V, float f, size_t i)
  674. {
  675. assert( i < 4 );
  676. _Analysis_assume_( i < 4 );
  677. XMVECTORF32 U;
  678. U.v = V;
  679. U.f[i] = f;
  680. return U.v;
  681. }
  682. //------------------------------------------------------------------------------
  683. // Sets the X component of a vector to a passed floating point value
  684. inline XMVECTOR XM_CALLCONV XMVectorSetX(FXMVECTOR V, float x)
  685. {
  686. #if defined(_XM_NO_INTRINSICS_)
  687. XMVECTORF32 U = { { {
  688. x,
  689. V.vector4_f32[1],
  690. V.vector4_f32[2],
  691. V.vector4_f32[3]
  692. } } };
  693. return U.v;
  694. #elif defined(_XM_ARM_NEON_INTRINSICS_)
  695. return vsetq_lane_f32(x,V,0);
  696. #elif defined(_XM_SSE_INTRINSICS_)
  697. XMVECTOR vResult = _mm_set_ss(x);
  698. vResult = _mm_move_ss(V,vResult);
  699. return vResult;
  700. #endif
  701. }
  702. // Sets the Y component of a vector to a passed floating point value
  703. inline XMVECTOR XM_CALLCONV XMVectorSetY(FXMVECTOR V, float y)
  704. {
  705. #if defined(_XM_NO_INTRINSICS_)
  706. XMVECTORF32 U = { { {
  707. V.vector4_f32[0],
  708. y,
  709. V.vector4_f32[2],
  710. V.vector4_f32[3]
  711. } } };
  712. return U.v;
  713. #elif defined(_XM_ARM_NEON_INTRINSICS_)
  714. return vsetq_lane_f32(y,V,1);
  715. #elif defined(_XM_SSE4_INTRINSICS_)
  716. XMVECTOR vResult = _mm_set_ss(y);
  717. vResult = _mm_insert_ps( V, vResult, 0x10 );
  718. return vResult;
  719. #elif defined(_XM_SSE_INTRINSICS_)
  720. // Swap y and x
  721. XMVECTOR vResult = XM_PERMUTE_PS(V,_MM_SHUFFLE(3,2,0,1));
  722. // Convert input to vector
  723. XMVECTOR vTemp = _mm_set_ss(y);
  724. // Replace the x component
  725. vResult = _mm_move_ss(vResult,vTemp);
  726. // Swap y and x again
  727. vResult = XM_PERMUTE_PS(vResult,_MM_SHUFFLE(3,2,0,1));
  728. return vResult;
  729. #endif
  730. }
  731. // Sets the Z component of a vector to a passed floating point value
  732. inline XMVECTOR XM_CALLCONV XMVectorSetZ(FXMVECTOR V, float z)
  733. {
  734. #if defined(_XM_NO_INTRINSICS_)
  735. XMVECTORF32 U = { { {
  736. V.vector4_f32[0],
  737. V.vector4_f32[1],
  738. z,
  739. V.vector4_f32[3]
  740. } } };
  741. return U.v;
  742. #elif defined(_XM_ARM_NEON_INTRINSICS_)
  743. return vsetq_lane_f32(z,V,2);
  744. #elif defined(_XM_SSE4_INTRINSICS_)
  745. XMVECTOR vResult = _mm_set_ss(z);
  746. vResult = _mm_insert_ps( V, vResult, 0x20 );
  747. return vResult;
  748. #elif defined(_XM_SSE_INTRINSICS_)
  749. // Swap z and x
  750. XMVECTOR vResult = XM_PERMUTE_PS(V,_MM_SHUFFLE(3,0,1,2));
  751. // Convert input to vector
  752. XMVECTOR vTemp = _mm_set_ss(z);
  753. // Replace the x component
  754. vResult = _mm_move_ss(vResult,vTemp);
  755. // Swap z and x again
  756. vResult = XM_PERMUTE_PS(vResult,_MM_SHUFFLE(3,0,1,2));
  757. return vResult;
  758. #endif
  759. }
  760. // Sets the W component of a vector to a passed floating point value
  761. inline XMVECTOR XM_CALLCONV XMVectorSetW(FXMVECTOR V, float w)
  762. {
  763. #if defined(_XM_NO_INTRINSICS_)
  764. XMVECTORF32 U = { { {
  765. V.vector4_f32[0],
  766. V.vector4_f32[1],
  767. V.vector4_f32[2],
  768. w
  769. } } };
  770. return U.v;
  771. #elif defined(_XM_ARM_NEON_INTRINSICS_)
  772. return vsetq_lane_f32(w,V,3);
  773. #elif defined(_XM_SSE4_INTRINSICS_)
  774. XMVECTOR vResult = _mm_set_ss(w);
  775. vResult = _mm_insert_ps( V, vResult, 0x30 );
  776. return vResult;
  777. #elif defined(_XM_SSE_INTRINSICS_)
  778. // Swap w and x
  779. XMVECTOR vResult = XM_PERMUTE_PS(V,_MM_SHUFFLE(0,2,1,3));
  780. // Convert input to vector
  781. XMVECTOR vTemp = _mm_set_ss(w);
  782. // Replace the x component
  783. vResult = _mm_move_ss(vResult,vTemp);
  784. // Swap w and x again
  785. vResult = XM_PERMUTE_PS(vResult,_MM_SHUFFLE(0,2,1,3));
  786. return vResult;
  787. #endif
  788. }
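//------------------------------------------------------------------------------
// Editorial usage sketch (illustrative only; "ExampleToPoint" is hypothetical).
// The per-component setters leave the other three lanes untouched, so forcing
// w = 1 to turn a direction into a homogeneous point is a one-liner.
inline XMVECTOR XM_CALLCONV ExampleToPoint(FXMVECTOR Direction)
{
    return XMVectorSetW(Direction, 1.0f);
}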
  789. //------------------------------------------------------------------------------
  790. // Sets a component of a vector to a floating point value passed by pointer
  791. _Use_decl_annotations_
  792. inline XMVECTOR XM_CALLCONV XMVectorSetByIndexPtr(FXMVECTOR V, const float *f, size_t i)
  793. {
  794. assert( f != nullptr );
  795. assert( i < 4 );
  796. _Analysis_assume_( i < 4 );
  797. XMVECTORF32 U;
  798. U.v = V;
  799. U.f[i] = *f;
  800. return U.v;
  801. }
  802. //------------------------------------------------------------------------------
  803. // Sets the X component of a vector to a floating point value passed by pointer
  804. _Use_decl_annotations_
  805. inline XMVECTOR XM_CALLCONV XMVectorSetXPtr(FXMVECTOR V, const float *x)
  806. {
  807. assert( x != nullptr );
  808. #if defined(_XM_NO_INTRINSICS_)
  809. XMVECTORF32 U = { { {
  810. *x,
  811. V.vector4_f32[1],
  812. V.vector4_f32[2],
  813. V.vector4_f32[3]
  814. } } };
  815. return U.v;
  816. #elif defined(_XM_ARM_NEON_INTRINSICS_)
  817. return vld1q_lane_f32(x,V,0);
  818. #elif defined(_XM_SSE_INTRINSICS_)
  819. XMVECTOR vResult = _mm_load_ss(x);
  820. vResult = _mm_move_ss(V,vResult);
  821. return vResult;
  822. #endif
  823. }
  824. // Sets the Y component of a vector to a floating point value passed by pointer
  825. _Use_decl_annotations_
  826. inline XMVECTOR XM_CALLCONV XMVectorSetYPtr(FXMVECTOR V, const float *y)
  827. {
  828. assert( y != nullptr );
  829. #if defined(_XM_NO_INTRINSICS_)
  830. XMVECTORF32 U = { { {
  831. V.vector4_f32[0],
  832. *y,
  833. V.vector4_f32[2],
  834. V.vector4_f32[3]
  835. } } };
  836. return U.v;
  837. #elif defined(_XM_ARM_NEON_INTRINSICS_)
  838. return vld1q_lane_f32(y,V,1);
  839. #elif defined(_XM_SSE_INTRINSICS_)
  840. // Swap y and x
  841. XMVECTOR vResult = XM_PERMUTE_PS(V,_MM_SHUFFLE(3,2,0,1));
  842. // Convert input to vector
  843. XMVECTOR vTemp = _mm_load_ss(y);
  844. // Replace the x component
  845. vResult = _mm_move_ss(vResult,vTemp);
  846. // Swap y and x again
  847. vResult = XM_PERMUTE_PS(vResult,_MM_SHUFFLE(3,2,0,1));
  848. return vResult;
  849. #endif
  850. }
  851. // Sets the Z component of a vector to a floating point value passed by pointer
  852. _Use_decl_annotations_
  853. inline XMVECTOR XM_CALLCONV XMVectorSetZPtr(FXMVECTOR V, const float *z)
  854. {
  855. assert( z != nullptr );
  856. #if defined(_XM_NO_INTRINSICS_)
  857. XMVECTORF32 U = { { {
  858. V.vector4_f32[0],
  859. V.vector4_f32[1],
  860. *z,
  861. V.vector4_f32[3]
  862. } } };
  863. return U.v;
  864. #elif defined(_XM_ARM_NEON_INTRINSICS_)
  865. return vld1q_lane_f32(z,V,2);
  866. #elif defined(_XM_SSE_INTRINSICS_)
  867. // Swap z and x
  868. XMVECTOR vResult = XM_PERMUTE_PS(V,_MM_SHUFFLE(3,0,1,2));
  869. // Convert input to vector
  870. XMVECTOR vTemp = _mm_load_ss(z);
  871. // Replace the x component
  872. vResult = _mm_move_ss(vResult,vTemp);
  873. // Swap z and x again
  874. vResult = XM_PERMUTE_PS(vResult,_MM_SHUFFLE(3,0,1,2));
  875. return vResult;
  876. #endif
  877. }
  878. // Sets the W component of a vector to a floating point value passed by pointer
  879. _Use_decl_annotations_
  880. inline XMVECTOR XM_CALLCONV XMVectorSetWPtr(FXMVECTOR V, const float *w)
  881. {
  882. assert( w != nullptr );
  883. #if defined(_XM_NO_INTRINSICS_)
  884. XMVECTORF32 U = { { {
  885. V.vector4_f32[0],
  886. V.vector4_f32[1],
  887. V.vector4_f32[2],
  888. *w
  889. } } };
  890. return U.v;
  891. #elif defined(_XM_ARM_NEON_INTRINSICS_)
  892. return vld1q_lane_f32(w,V,3);
  893. #elif defined(_XM_SSE_INTRINSICS_)
  894. // Swap w and x
  895. XMVECTOR vResult = XM_PERMUTE_PS(V,_MM_SHUFFLE(0,2,1,3));
  896. // Convert input to vector
  897. XMVECTOR vTemp = _mm_load_ss(w);
  898. // Replace the x component
  899. vResult = _mm_move_ss(vResult,vTemp);
  900. // Swap w and x again
  901. vResult = XM_PERMUTE_PS(vResult,_MM_SHUFFLE(0,2,1,3));
  902. return vResult;
  903. #endif
  904. }
  905. //------------------------------------------------------------------------------
  906. // Sets a component of a vector to an integer passed by value
  907. inline XMVECTOR XM_CALLCONV XMVectorSetIntByIndex(FXMVECTOR V, uint32_t x, size_t i)
  908. {
  909. assert( i < 4 );
  910. _Analysis_assume_( i < 4 );
  911. XMVECTORU32 tmp;
  912. tmp.v = V;
  913. tmp.u[i] = x;
  914. return tmp;
  915. }
  916. //------------------------------------------------------------------------------
  917. // Sets the X component of a vector to an integer passed by value
  918. inline XMVECTOR XM_CALLCONV XMVectorSetIntX(FXMVECTOR V, uint32_t x)
  919. {
  920. #if defined(_XM_NO_INTRINSICS_)
  921. XMVECTORU32 U = { { {
  922. x,
  923. V.vector4_u32[1],
  924. V.vector4_u32[2],
  925. V.vector4_u32[3]
  926. } } };
  927. return U.v;
  928. #elif defined(_XM_ARM_NEON_INTRINSICS_)
  929. return vsetq_lane_u32(x,V,0);
  930. #elif defined(_XM_SSE_INTRINSICS_)
  931. __m128i vTemp = _mm_cvtsi32_si128(x);
  932. XMVECTOR vResult = _mm_move_ss(V,_mm_castsi128_ps(vTemp));
  933. return vResult;
  934. #endif
  935. }
  936. // Sets the Y component of a vector to an integer passed by value
  937. inline XMVECTOR XM_CALLCONV XMVectorSetIntY(FXMVECTOR V, uint32_t y)
  938. {
  939. #if defined(_XM_NO_INTRINSICS_)
  940. XMVECTORU32 U = { { {
  941. V.vector4_u32[0],
  942. y,
  943. V.vector4_u32[2],
  944. V.vector4_u32[3]
  945. } } };
  946. return U.v;
  947. #elif defined(_XM_ARM_NEON_INTRINSICS_)
  948. return vsetq_lane_u32(y,V,1);
  949. #elif defined(_XM_SSE4_INTRINSICS_)
  950. __m128i vResult = _mm_castps_si128( V );
  951. vResult = _mm_insert_epi32( vResult, static_cast<int>(y), 1 );
  952. return _mm_castsi128_ps( vResult );
  953. #elif defined(_XM_SSE_INTRINSICS_)
  954. // Swap y and x
  955. XMVECTOR vResult = XM_PERMUTE_PS(V,_MM_SHUFFLE(3,2,0,1));
  956. // Convert input to vector
  957. __m128i vTemp = _mm_cvtsi32_si128(y);
  958. // Replace the x component
  959. vResult = _mm_move_ss(vResult,_mm_castsi128_ps(vTemp));
  960. // Swap y and x again
  961. vResult = XM_PERMUTE_PS(vResult,_MM_SHUFFLE(3,2,0,1));
  962. return vResult;
  963. #endif
  964. }
  965. // Sets the Z component of a vector to an integer passed by value
  966. inline XMVECTOR XM_CALLCONV XMVectorSetIntZ(FXMVECTOR V, uint32_t z)
  967. {
  968. #if defined(_XM_NO_INTRINSICS_)
  969. XMVECTORU32 U = { { {
  970. V.vector4_u32[0],
  971. V.vector4_u32[1],
  972. z,
  973. V.vector4_u32[3]
  974. } } };
  975. return U.v;
  976. #elif defined(_XM_ARM_NEON_INTRINSICS_)
  977. return vsetq_lane_u32(z,V,2);
  978. #elif defined(_XM_SSE4_INTRINSICS_)
  979. __m128i vResult = _mm_castps_si128( V );
  980. vResult = _mm_insert_epi32( vResult, static_cast<int>(z), 2 );
  981. return _mm_castsi128_ps( vResult );
  982. #elif defined(_XM_SSE_INTRINSICS_)
  983. // Swap z and x
  984. XMVECTOR vResult = XM_PERMUTE_PS(V,_MM_SHUFFLE(3,0,1,2));
  985. // Convert input to vector
  986. __m128i vTemp = _mm_cvtsi32_si128(z);
  987. // Replace the x component
  988. vResult = _mm_move_ss(vResult,_mm_castsi128_ps(vTemp));
  989. // Swap z and x again
  990. vResult = XM_PERMUTE_PS(vResult,_MM_SHUFFLE(3,0,1,2));
  991. return vResult;
  992. #endif
  993. }
  994. // Sets the W component of a vector to an integer passed by value
  995. inline XMVECTOR XM_CALLCONV XMVectorSetIntW(FXMVECTOR V, uint32_t w)
  996. {
  997. #if defined(_XM_NO_INTRINSICS_)
  998. XMVECTORU32 U = { { {
  999. V.vector4_u32[0],
  1000. V.vector4_u32[1],
  1001. V.vector4_u32[2],
  1002. w
  1003. } } };
  1004. return U.v;
  1005. #elif defined(_XM_ARM_NEON_INTRINSICS_)
  1006. return vsetq_lane_u32(w,V,3);
  1007. #elif defined(_XM_SSE4_INTRINSICS_)
  1008. __m128i vResult = _mm_castps_si128( V );
  1009. vResult = _mm_insert_epi32( vResult, static_cast<int>(w), 3 );
  1010. return _mm_castsi128_ps( vResult );
  1011. #elif defined(_XM_SSE_INTRINSICS_)
  1012. // Swap w and x
  1013. XMVECTOR vResult = XM_PERMUTE_PS(V,_MM_SHUFFLE(0,2,1,3));
  1014. // Convert input to vector
  1015. __m128i vTemp = _mm_cvtsi32_si128(w);
  1016. // Replace the x component
  1017. vResult = _mm_move_ss(vResult,_mm_castsi128_ps(vTemp));
  1018. // Swap w and x again
  1019. vResult = XM_PERMUTE_PS(vResult,_MM_SHUFFLE(0,2,1,3));
  1020. return vResult;
  1021. #endif
  1022. }
  1023. //------------------------------------------------------------------------------
  1024. // Sets a component of a vector to an integer value passed by pointer
  1025. _Use_decl_annotations_
  1026. inline XMVECTOR XM_CALLCONV XMVectorSetIntByIndexPtr(FXMVECTOR V, const uint32_t *x, size_t i)
  1027. {
  1028. assert( x != nullptr );
  1029. assert( i < 4 );
  1030. _Analysis_assume_( i < 4 );
  1031. XMVECTORU32 tmp;
  1032. tmp.v = V;
  1033. tmp.u[i] = *x;
  1034. return tmp;
  1035. }
  1036. //------------------------------------------------------------------------------
  1037. // Sets the X component of a vector to an integer value passed by pointer
  1038. _Use_decl_annotations_
  1039. inline XMVECTOR XM_CALLCONV XMVectorSetIntXPtr(FXMVECTOR V, const uint32_t *x)
  1040. {
  1041. assert( x != nullptr );
  1042. #if defined(_XM_NO_INTRINSICS_)
  1043. XMVECTORU32 U = { { {
  1044. *x,
  1045. V.vector4_u32[1],
  1046. V.vector4_u32[2],
  1047. V.vector4_u32[3]
  1048. } } };
  1049. return U.v;
  1050. #elif defined(_XM_ARM_NEON_INTRINSICS_)
  1051. return vld1q_lane_u32(x,*reinterpret_cast<const uint32x4_t *>(&V),0);
  1052. #elif defined(_XM_SSE_INTRINSICS_)
  1053. XMVECTOR vTemp = _mm_load_ss(reinterpret_cast<const float *>(x));
  1054. XMVECTOR vResult = _mm_move_ss(V,vTemp);
  1055. return vResult;
  1056. #endif
  1057. }
  1058. // Sets the Y component of a vector to an integer value passed by pointer
  1059. _Use_decl_annotations_
  1060. inline XMVECTOR XM_CALLCONV XMVectorSetIntYPtr(FXMVECTOR V, const uint32_t *y)
  1061. {
  1062. assert( y != nullptr );
  1063. #if defined(_XM_NO_INTRINSICS_)
  1064. XMVECTORU32 U = { { {
  1065. V.vector4_u32[0],
  1066. *y,
  1067. V.vector4_u32[2],
  1068. V.vector4_u32[3]
  1069. } } };
  1070. return U.v;
  1071. #elif defined(_XM_ARM_NEON_INTRINSICS_)
  1072. return vld1q_lane_u32(y,*reinterpret_cast<const uint32x4_t *>(&V),1);
  1073. #elif defined(_XM_SSE_INTRINSICS_)
  1074. // Swap y and x
  1075. XMVECTOR vResult = XM_PERMUTE_PS(V,_MM_SHUFFLE(3,2,0,1));
  1076. // Convert input to vector
  1077. XMVECTOR vTemp = _mm_load_ss(reinterpret_cast<const float *>(y));
  1078. // Replace the x component
  1079. vResult = _mm_move_ss(vResult,vTemp);
  1080. // Swap y and x again
  1081. vResult = XM_PERMUTE_PS(vResult,_MM_SHUFFLE(3,2,0,1));
  1082. return vResult;
  1083. #endif
  1084. }
  1085. // Sets the Z component of a vector to an integer value passed by pointer
  1086. _Use_decl_annotations_
  1087. inline XMVECTOR XM_CALLCONV XMVectorSetIntZPtr(FXMVECTOR V, const uint32_t *z)
  1088. {
  1089. assert( z != nullptr );
  1090. #if defined(_XM_NO_INTRINSICS_)
  1091. XMVECTORU32 U = { { {
  1092. V.vector4_u32[0],
  1093. V.vector4_u32[1],
  1094. *z,
  1095. V.vector4_u32[3]
  1096. } } };
  1097. return U.v;
  1098. #elif defined(_XM_ARM_NEON_INTRINSICS_)
  1099. return vld1q_lane_u32(z,*reinterpret_cast<const uint32x4_t *>(&V),2);
  1100. #elif defined(_XM_SSE_INTRINSICS_)
  1101. // Swap z and x
  1102. XMVECTOR vResult = XM_PERMUTE_PS(V,_MM_SHUFFLE(3,0,1,2));
  1103. // Convert input to vector
  1104. XMVECTOR vTemp = _mm_load_ss(reinterpret_cast<const float *>(z));
  1105. // Replace the x component
  1106. vResult = _mm_move_ss(vResult,vTemp);
  1107. // Swap z and x again
  1108. vResult = XM_PERMUTE_PS(vResult,_MM_SHUFFLE(3,0,1,2));
  1109. return vResult;
  1110. #endif
  1111. }
  1112. // Sets the W component of a vector to an integer value passed by pointer
  1113. _Use_decl_annotations_
  1114. inline XMVECTOR XM_CALLCONV XMVectorSetIntWPtr(FXMVECTOR V, const uint32_t *w)
  1115. {
  1116. assert( w != nullptr );
  1117. #if defined(_XM_NO_INTRINSICS_)
  1118. XMVECTORU32 U = { { {
  1119. V.vector4_u32[0],
  1120. V.vector4_u32[1],
  1121. V.vector4_u32[2],
  1122. *w
  1123. } } };
  1124. return U.v;
  1125. #elif defined(_XM_ARM_NEON_INTRINSICS_)
  1126. return vld1q_lane_u32(w,*reinterpret_cast<const uint32x4_t *>(&V),3);
  1127. #elif defined(_XM_SSE_INTRINSICS_)
  1128. // Swap w and x
  1129. XMVECTOR vResult = XM_PERMUTE_PS(V,_MM_SHUFFLE(0,2,1,3));
  1130. // Convert input to vector
  1131. XMVECTOR vTemp = _mm_load_ss(reinterpret_cast<const float *>(w));
  1132. // Replace the x component
  1133. vResult = _mm_move_ss(vResult,vTemp);
  1134. // Swap w and x again
  1135. vResult = XM_PERMUTE_PS(vResult,_MM_SHUFFLE(0,2,1,3));
  1136. return vResult;
  1137. #endif
  1138. }
  1139. //------------------------------------------------------------------------------
  1140. inline XMVECTOR XM_CALLCONV XMVectorSwizzle
  1141. (
  1142. FXMVECTOR V,
  1143. uint32_t E0,
  1144. uint32_t E1,
  1145. uint32_t E2,
  1146. uint32_t E3
  1147. )
  1148. {
  1149. assert( (E0 < 4) && (E1 < 4) && (E2 < 4) && (E3 < 4) );
  1150. _Analysis_assume_( (E0 < 4) && (E1 < 4) && (E2 < 4) && (E3 < 4) );
  1151. #if defined(_XM_NO_INTRINSICS_)
  1152. XMVECTORF32 Result = { { {
  1153. V.vector4_f32[E0],
  1154. V.vector4_f32[E1],
  1155. V.vector4_f32[E2],
  1156. V.vector4_f32[E3]
  1157. } } };
  1158. return Result.v;
  1159. #elif defined(_XM_ARM_NEON_INTRINSICS_)
  1160. static const uint32_t ControlElement[ 4 ] =
  1161. {
  1162. 0x03020100, // XM_SWIZZLE_X
  1163. 0x07060504, // XM_SWIZZLE_Y
  1164. 0x0B0A0908, // XM_SWIZZLE_Z
  1165. 0x0F0E0D0C, // XM_SWIZZLE_W
  1166. };
  1167. int8x8x2_t tbl;
  1168. tbl.val[0] = vget_low_f32(V);
  1169. tbl.val[1] = vget_high_f32(V);
  1170. uint32x2_t idx = vcreate_u32( ((uint64_t)ControlElement[E0]) | (((uint64_t)ControlElement[E1]) << 32) );
  1171. const uint8x8_t rL = vtbl2_u8( tbl, idx );
  1172. idx = vcreate_u32( ((uint64_t)ControlElement[E2]) | (((uint64_t)ControlElement[E3]) << 32) );
  1173. const uint8x8_t rH = vtbl2_u8( tbl, idx );
  1174. return vcombine_f32( rL, rH );
  1175. #elif defined(_XM_AVX_INTRINSICS_)
  1176. unsigned int elem[4] = { E0, E1, E2, E3 };
  1177. __m128i vControl = _mm_loadu_si128( reinterpret_cast<const __m128i *>(&elem[0]) );
  1178. return _mm_permutevar_ps( V, vControl );
  1179. #else
  1180. const uint32_t *aPtr = (const uint32_t* )(&V);
  1181. XMVECTOR Result;
  1182. uint32_t *pWork = (uint32_t*)(&Result);
  1183. pWork[0] = aPtr[E0];
  1184. pWork[1] = aPtr[E1];
  1185. pWork[2] = aPtr[E2];
  1186. pWork[3] = aPtr[E3];
  1187. return Result;
  1188. #endif
  1189. }
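//------------------------------------------------------------------------------
// Editorial usage sketch (illustrative only; "ExampleReverse" is hypothetical).
// The XM_SWIZZLE_* constants keep call sites readable; when the indices are
// compile-time constants the template form XMVectorSwizzle<...>() declared in
// DirectXMath.h is generally preferred.
inline XMVECTOR XM_CALLCONV ExampleReverse(FXMVECTOR V)
{
    // (w, z, y, x)
    return XMVectorSwizzle(V, XM_SWIZZLE_W, XM_SWIZZLE_Z, XM_SWIZZLE_Y, XM_SWIZZLE_X);
}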
  1190. //------------------------------------------------------------------------------
  1191. inline XMVECTOR XM_CALLCONV XMVectorPermute
  1192. (
  1193. FXMVECTOR V1,
  1194. FXMVECTOR V2,
  1195. uint32_t PermuteX,
  1196. uint32_t PermuteY,
  1197. uint32_t PermuteZ,
  1198. uint32_t PermuteW
  1199. )
  1200. {
  1201. assert( PermuteX <= 7 && PermuteY <= 7 && PermuteZ <= 7 && PermuteW <= 7 );
  1202. _Analysis_assume_( PermuteX <= 7 && PermuteY <= 7 && PermuteZ <= 7 && PermuteW <= 7 );
  1203. #if defined(_XM_ARM_NEON_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
  1204. static const uint32_t ControlElement[ 8 ] =
  1205. {
  1206. 0x03020100, // XM_PERMUTE_0X
  1207. 0x07060504, // XM_PERMUTE_0Y
  1208. 0x0B0A0908, // XM_PERMUTE_0Z
  1209. 0x0F0E0D0C, // XM_PERMUTE_0W
  1210. 0x13121110, // XM_PERMUTE_1X
  1211. 0x17161514, // XM_PERMUTE_1Y
  1212. 0x1B1A1918, // XM_PERMUTE_1Z
  1213. 0x1F1E1D1C, // XM_PERMUTE_1W
  1214. };
  1215. int8x8x4_t tbl;
  1216. tbl.val[0] = vget_low_f32(V1);
  1217. tbl.val[1] = vget_high_f32(V1);
  1218. tbl.val[2] = vget_low_f32(V2);
  1219. tbl.val[3] = vget_high_f32(V2);
  1220. uint32x2_t idx = vcreate_u32( ((uint64_t)ControlElement[PermuteX]) | (((uint64_t)ControlElement[PermuteY]) << 32) );
  1221. const uint8x8_t rL = vtbl4_u8( tbl, idx );
  1222. idx = vcreate_u32( ((uint64_t)ControlElement[PermuteZ]) | (((uint64_t)ControlElement[PermuteW]) << 32) );
  1223. const uint8x8_t rH = vtbl4_u8( tbl, idx );
  1224. return vcombine_f32( rL, rH );
  1225. #elif defined(_XM_AVX_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
  1226. static const XMVECTORU32 three = { { { 3, 3, 3, 3 } } };
1227. __declspec(align(16)) unsigned int elem[4] = { PermuteX, PermuteY, PermuteZ, PermuteW };
  1228. __m128i vControl = _mm_load_si128( reinterpret_cast<const __m128i *>(&elem[0]) );
  1229. __m128i vSelect = _mm_cmpgt_epi32( vControl, three );
  1230. vControl = _mm_castps_si128( _mm_and_ps( _mm_castsi128_ps( vControl ), three ) );
  1231. __m128 shuffled1 = _mm_permutevar_ps( V1, vControl );
  1232. __m128 shuffled2 = _mm_permutevar_ps( V2, vControl );
  1233. __m128 masked1 = _mm_andnot_ps( _mm_castsi128_ps( vSelect ), shuffled1 );
  1234. __m128 masked2 = _mm_and_ps( _mm_castsi128_ps( vSelect ), shuffled2 );
  1235. return _mm_or_ps( masked1, masked2 );
  1236. #else
  1237. const uint32_t *aPtr[2];
  1238. aPtr[0] = (const uint32_t* )(&V1);
  1239. aPtr[1] = (const uint32_t* )(&V2);
  1240. XMVECTOR Result;
  1241. uint32_t *pWork = (uint32_t*)(&Result);
  1242. const uint32_t i0 = PermuteX & 3;
  1243. const uint32_t vi0 = PermuteX >> 2;
  1244. pWork[0] = aPtr[vi0][i0];
  1245. const uint32_t i1 = PermuteY & 3;
  1246. const uint32_t vi1 = PermuteY >> 2;
  1247. pWork[1] = aPtr[vi1][i1];
  1248. const uint32_t i2 = PermuteZ & 3;
  1249. const uint32_t vi2 = PermuteZ >> 2;
  1250. pWork[2] = aPtr[vi2][i2];
  1251. const uint32_t i3 = PermuteW & 3;
  1252. const uint32_t vi3 = PermuteW >> 2;
  1253. pWork[3] = aPtr[vi3][i3];
  1254. return Result;
  1255. #endif
  1256. }
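//------------------------------------------------------------------------------
// Editorial usage sketch (illustrative only; the helper name is hypothetical).
// Permute selects from eight source lanes: indices 0-3 address V1 and 4-7
// address V2, which is what the XM_PERMUTE_* constants encode. Taking xy from
// the first vector and zw from the second looks like this.
inline XMVECTOR XM_CALLCONV ExampleTakeXYFromAZWFromB(FXMVECTOR A, FXMVECTOR B)
{
    return XMVectorPermute(A, B, XM_PERMUTE_0X, XM_PERMUTE_0Y, XM_PERMUTE_1Z, XM_PERMUTE_1W);
}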
  1257. //------------------------------------------------------------------------------
  1258. // Define a control vector to be used in XMVectorSelect
  1259. // operations. The four integers specified in XMVectorSelectControl
  1260. // serve as indices to select between components in two vectors.
  1261. // The first index controls selection for the first component of
  1262. // the vectors involved in a select operation, the second index
  1263. // controls selection for the second component etc. A value of
  1264. // zero for an index causes the corresponding component from the first
  1265. // vector to be selected whereas a one causes the component from the
  1266. // second vector to be selected instead.
  1267. inline XMVECTOR XM_CALLCONV XMVectorSelectControl
  1268. (
  1269. uint32_t VectorIndex0,
  1270. uint32_t VectorIndex1,
  1271. uint32_t VectorIndex2,
  1272. uint32_t VectorIndex3
  1273. )
  1274. {
  1275. #if defined(_XM_SSE_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
  1276. // x=Index0,y=Index1,z=Index2,w=Index3
  1277. __m128i vTemp = _mm_set_epi32(VectorIndex3,VectorIndex2,VectorIndex1,VectorIndex0);
  1278. // Any non-zero entries become 0xFFFFFFFF else 0
  1279. vTemp = _mm_cmpgt_epi32(vTemp,g_XMZero);
  1280. return _mm_castsi128_ps(vTemp);
  1281. #elif defined(_XM_ARM_NEON_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
  1282. int32x2_t V0 = vcreate_s32(((uint64_t)VectorIndex0) | ((uint64_t)VectorIndex1 << 32));
  1283. int32x2_t V1 = vcreate_s32(((uint64_t)VectorIndex2) | ((uint64_t)VectorIndex3 << 32));
  1284. int32x4_t vTemp = vcombine_s32(V0, V1);
  1285. // Any non-zero entries become 0xFFFFFFFF else 0
  1286. return vcgtq_s32(vTemp,g_XMZero);
  1287. #else
  1288. XMVECTOR ControlVector;
  1289. const uint32_t ControlElement[] =
  1290. {
  1291. XM_SELECT_0,
  1292. XM_SELECT_1
  1293. };
  1294. assert(VectorIndex0 < 2);
  1295. assert(VectorIndex1 < 2);
  1296. assert(VectorIndex2 < 2);
  1297. assert(VectorIndex3 < 2);
  1298. _Analysis_assume_(VectorIndex0 < 2);
  1299. _Analysis_assume_(VectorIndex1 < 2);
  1300. _Analysis_assume_(VectorIndex2 < 2);
  1301. _Analysis_assume_(VectorIndex3 < 2);
  1302. ControlVector.vector4_u32[0] = ControlElement[VectorIndex0];
  1303. ControlVector.vector4_u32[1] = ControlElement[VectorIndex1];
  1304. ControlVector.vector4_u32[2] = ControlElement[VectorIndex2];
  1305. ControlVector.vector4_u32[3] = ControlElement[VectorIndex3];
  1306. return ControlVector;
  1307. #endif
  1308. }
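//------------------------------------------------------------------------------
// Editorial usage sketch (illustrative only; the helper name is hypothetical).
// The control vector built here is just a per-component mask of all-zero or
// all-one bits, ready to be handed to XMVectorSelect below.
inline XMVECTOR XM_CALLCONV ExampleKeepXYTakeZW(FXMVECTOR V1, FXMVECTOR V2)
{
    // 0 = take the component from V1, 1 = take it from V2
    XMVECTOR control = XMVectorSelectControl(0, 0, 1, 1);
    return XMVectorSelect(V1, V2, control);
}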
  1309. //------------------------------------------------------------------------------
  1310. inline XMVECTOR XM_CALLCONV XMVectorSelect
  1311. (
  1312. FXMVECTOR V1,
  1313. FXMVECTOR V2,
  1314. FXMVECTOR Control
  1315. )
  1316. {
  1317. #if defined(_XM_NO_INTRINSICS_)
  1318. XMVECTORU32 Result = { { {
  1319. (V1.vector4_u32[0] & ~Control.vector4_u32[0]) | (V2.vector4_u32[0] & Control.vector4_u32[0]),
  1320. (V1.vector4_u32[1] & ~Control.vector4_u32[1]) | (V2.vector4_u32[1] & Control.vector4_u32[1]),
  1321. (V1.vector4_u32[2] & ~Control.vector4_u32[2]) | (V2.vector4_u32[2] & Control.vector4_u32[2]),
  1322. (V1.vector4_u32[3] & ~Control.vector4_u32[3]) | (V2.vector4_u32[3] & Control.vector4_u32[3]),
  1323. } } };
  1324. return Result.v;
  1325. #elif defined(_XM_ARM_NEON_INTRINSICS_)
  1326. return vbslq_f32( Control, V2, V1 );
  1327. #elif defined(_XM_SSE_INTRINSICS_)
  1328. XMVECTOR vTemp1 = _mm_andnot_ps(Control,V1);
  1329. XMVECTOR vTemp2 = _mm_and_ps(V2,Control);
  1330. return _mm_or_ps(vTemp1,vTemp2);
  1331. #endif
  1332. }
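//------------------------------------------------------------------------------
// Editorial usage sketch (illustrative only; the helper name is hypothetical).
// Any comparison result (0 or 0xFFFFFFFF per component) can drive
// XMVectorSelect, so a branch-free component-wise maximum can be written this
// way even though the library also provides XMVectorMax.
inline XMVECTOR XM_CALLCONV ExampleSelectMax(FXMVECTOR A, FXMVECTOR B)
{
    XMVECTOR mask = XMVectorGreater(B, A);   // 0xFFFFFFFF where B > A
    return XMVectorSelect(A, B, mask);       // pick B where the mask is set
}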
  1333. //------------------------------------------------------------------------------
  1334. inline XMVECTOR XM_CALLCONV XMVectorMergeXY
  1335. (
  1336. FXMVECTOR V1,
  1337. FXMVECTOR V2
  1338. )
  1339. {
  1340. #if defined(_XM_NO_INTRINSICS_)
  1341. XMVECTORU32 Result = { { {
  1342. V1.vector4_u32[0],
  1343. V2.vector4_u32[0],
  1344. V1.vector4_u32[1],
  1345. V2.vector4_u32[1],
  1346. } } };
  1347. return Result.v;
  1348. #elif defined(_XM_ARM_NEON_INTRINSICS_)
  1349. return vzipq_f32( V1, V2 ).val[0];
  1350. #elif defined(_XM_SSE_INTRINSICS_)
  1351. return _mm_unpacklo_ps( V1, V2 );
  1352. #endif
  1353. }
  1354. //------------------------------------------------------------------------------
  1355. inline XMVECTOR XM_CALLCONV XMVectorMergeZW
  1356. (
  1357. FXMVECTOR V1,
  1358. FXMVECTOR V2
  1359. )
  1360. {
  1361. #if defined(_XM_NO_INTRINSICS_)
  1362. XMVECTORU32 Result = { { {
  1363. V1.vector4_u32[2],
  1364. V2.vector4_u32[2],
  1365. V1.vector4_u32[3],
  1366. V2.vector4_u32[3]
  1367. } } };
  1368. return Result.v;
  1369. #elif defined(_XM_ARM_NEON_INTRINSICS_)
  1370. return vzipq_f32( V1, V2 ).val[1];
  1371. #elif defined(_XM_SSE_INTRINSICS_)
  1372. return _mm_unpackhi_ps( V1, V2 );
  1373. #endif
  1374. }
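// Illustrative results (not part of the library source) for the merge operations above:
//
//      XMVectorMergeXY(A, B) -> (A.x, B.x, A.y, B.y)
//      XMVectorMergeZW(A, B) -> (A.z, B.z, A.w, B.w)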
  1375. //------------------------------------------------------------------------------
  1376. inline XMVECTOR XM_CALLCONV XMVectorShiftLeft(FXMVECTOR V1, FXMVECTOR V2, uint32_t Elements)
  1377. {
  1378. assert( Elements < 4 );
  1379. _Analysis_assume_( Elements < 4 );
  1380. return XMVectorPermute(V1, V2, Elements, ((Elements) + 1), ((Elements) + 2), ((Elements) + 3));
  1381. }
  1382. //------------------------------------------------------------------------------
  1383. inline XMVECTOR XM_CALLCONV XMVectorRotateLeft(FXMVECTOR V, uint32_t Elements)
  1384. {
  1385. assert( Elements < 4 );
  1386. _Analysis_assume_( Elements < 4 );
  1387. return XMVectorSwizzle( V, Elements & 3, (Elements + 1) & 3, (Elements + 2) & 3, (Elements + 3) & 3 );
  1388. }
  1389. //------------------------------------------------------------------------------
  1390. inline XMVECTOR XM_CALLCONV XMVectorRotateRight(FXMVECTOR V, uint32_t Elements)
  1391. {
  1392. assert( Elements < 4 );
  1393. _Analysis_assume_( Elements < 4 );
  1394. return XMVectorSwizzle( V, (4 - (Elements)) & 3, (5 - (Elements)) & 3, (6 - (Elements)) & 3, (7 - (Elements)) & 3 );
  1395. }
  1396. //------------------------------------------------------------------------------
  1397. inline XMVECTOR XM_CALLCONV XMVectorInsert(FXMVECTOR VD, FXMVECTOR VS, uint32_t VSLeftRotateElements,
  1398. uint32_t Select0, uint32_t Select1, uint32_t Select2, uint32_t Select3)
  1399. {
  1400. XMVECTOR Control = XMVectorSelectControl(Select0&1, Select1&1, Select2&1, Select3&1);
  1401. return XMVectorSelect( VD, XMVectorRotateLeft(VS, VSLeftRotateElements), Control );
  1402. }
  1403. //------------------------------------------------------------------------------
  1404. // Comparison operations
  1405. //------------------------------------------------------------------------------
  1406. //------------------------------------------------------------------------------
  1407. inline XMVECTOR XM_CALLCONV XMVectorEqual
  1408. (
  1409. FXMVECTOR V1,
  1410. FXMVECTOR V2
  1411. )
  1412. {
  1413. #if defined(_XM_NO_INTRINSICS_)
  1414. XMVECTORU32 Control = { { {
  1415. (V1.vector4_f32[0] == V2.vector4_f32[0]) ? 0xFFFFFFFF : 0,
  1416. (V1.vector4_f32[1] == V2.vector4_f32[1]) ? 0xFFFFFFFF : 0,
  1417. (V1.vector4_f32[2] == V2.vector4_f32[2]) ? 0xFFFFFFFF : 0,
  1418. (V1.vector4_f32[3] == V2.vector4_f32[3]) ? 0xFFFFFFFF : 0,
  1419. } } };
  1420. return Control.v;
  1421. #elif defined(_XM_ARM_NEON_INTRINSICS_)
  1422. return vceqq_f32( V1, V2 );
  1423. #elif defined(_XM_SSE_INTRINSICS_)
  1424. return _mm_cmpeq_ps( V1, V2 );
  1425. #endif
  1426. }
  1427. //------------------------------------------------------------------------------
  1428. _Use_decl_annotations_
  1429. inline XMVECTOR XM_CALLCONV XMVectorEqualR
  1430. (
  1431. uint32_t* pCR,
  1432. FXMVECTOR V1,
  1433. FXMVECTOR V2
  1434. )
  1435. {
  1436. assert( pCR != nullptr );
  1437. #if defined(_XM_NO_INTRINSICS_)
  1438. uint32_t ux = (V1.vector4_f32[0] == V2.vector4_f32[0]) ? 0xFFFFFFFFU : 0;
  1439. uint32_t uy = (V1.vector4_f32[1] == V2.vector4_f32[1]) ? 0xFFFFFFFFU : 0;
  1440. uint32_t uz = (V1.vector4_f32[2] == V2.vector4_f32[2]) ? 0xFFFFFFFFU : 0;
  1441. uint32_t uw = (V1.vector4_f32[3] == V2.vector4_f32[3]) ? 0xFFFFFFFFU : 0;
  1442. uint32_t CR = 0;
  1443. if (ux&uy&uz&uw)
  1444. {
1445. // All elements are equal
  1446. CR = XM_CRMASK_CR6TRUE;
  1447. }
  1448. else if (!(ux|uy|uz|uw))
  1449. {
1450. // All elements are not equal
  1451. CR = XM_CRMASK_CR6FALSE;
  1452. }
  1453. *pCR = CR;
  1454. XMVECTORU32 Control = { { { ux, uy, uz, uw } } };
1455. return Control.v;
  1456. #elif defined(_XM_ARM_NEON_INTRINSICS_)
  1457. uint32x4_t vResult = vceqq_f32( V1, V2 );
  1458. int8x8x2_t vTemp = vzip_u8(vget_low_u8(vResult), vget_high_u8(vResult));
  1459. vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]);
  1460. uint32_t r = vget_lane_u32(vTemp.val[1], 1);
  1461. uint32_t CR = 0;
  1462. if ( r == 0xFFFFFFFFU )
  1463. {
  1464. // All elements are equal
  1465. CR = XM_CRMASK_CR6TRUE;
  1466. }
  1467. else if ( !r )
  1468. {
  1469. // All elements are not equal
  1470. CR = XM_CRMASK_CR6FALSE;
  1471. }
  1472. *pCR = CR;
  1473. return vResult;
  1474. #elif defined(_XM_SSE_INTRINSICS_)
  1475. XMVECTOR vTemp = _mm_cmpeq_ps(V1,V2);
  1476. uint32_t CR = 0;
  1477. int iTest = _mm_movemask_ps(vTemp);
  1478. if (iTest==0xf)
  1479. {
  1480. CR = XM_CRMASK_CR6TRUE;
  1481. }
  1482. else if (!iTest)
  1483. {
1484. // All elements are not equal
  1485. CR = XM_CRMASK_CR6FALSE;
  1486. }
  1487. *pCR = CR;
  1488. return vTemp;
  1489. #endif
  1490. }
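// Illustrative usage (not part of the library source): the CR value returned through
// pCR is meant to be tested with the XMComparison* helpers rather than read directly:
//
//      uint32_t cr;
//      XMVECTOR mask = XMVectorEqualR(&cr, V1, V2);
//      if (XMComparisonAllTrue(cr))  { /* every component of V1 equals V2 */ }
//      if (XMComparisonAnyFalse(cr)) { /* at least one component differs  */ }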
  1491. //------------------------------------------------------------------------------
  1492. // Treat the components of the vectors as unsigned integers and
  1493. // compare individual bits between the two. This is useful for
  1494. // comparing control vectors and result vectors returned from
  1495. // other comparison operations.
  1496. inline XMVECTOR XM_CALLCONV XMVectorEqualInt
  1497. (
  1498. FXMVECTOR V1,
  1499. FXMVECTOR V2
  1500. )
  1501. {
  1502. #if defined(_XM_NO_INTRINSICS_)
  1503. XMVECTORU32 Control = { { {
  1504. (V1.vector4_u32[0] == V2.vector4_u32[0]) ? 0xFFFFFFFF : 0,
  1505. (V1.vector4_u32[1] == V2.vector4_u32[1]) ? 0xFFFFFFFF : 0,
  1506. (V1.vector4_u32[2] == V2.vector4_u32[2]) ? 0xFFFFFFFF : 0,
  1507. (V1.vector4_u32[3] == V2.vector4_u32[3]) ? 0xFFFFFFFF : 0,
  1508. } } };
  1509. return Control.v;
  1510. #elif defined(_XM_ARM_NEON_INTRINSICS_)
  1511. return vceqq_u32( V1, V2 );
  1512. #elif defined(_XM_SSE_INTRINSICS_)
  1513. __m128i V = _mm_cmpeq_epi32( _mm_castps_si128(V1),_mm_castps_si128(V2) );
  1514. return _mm_castsi128_ps(V);
  1515. #endif
  1516. }
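// Illustrative usage (not part of the library source): because the comparison is bitwise,
// this is the appropriate way to compare two mask vectors produced by other comparisons:
//
//      XMVECTOR m1   = XMVectorGreater(A, B);
//      XMVECTOR m2   = XMVectorLess(B, A);
//      XMVECTOR same = XMVectorEqualInt(m1, m2);   // all-ones in lanes where the masks agree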
  1517. //------------------------------------------------------------------------------
  1518. _Use_decl_annotations_
  1519. inline XMVECTOR XM_CALLCONV XMVectorEqualIntR
  1520. (
  1521. uint32_t* pCR,
  1522. FXMVECTOR V1,
  1523. FXMVECTOR V2
  1524. )
  1525. {
  1526. assert( pCR != nullptr );
  1527. #if defined(_XM_NO_INTRINSICS_)
  1528. XMVECTOR Control = XMVectorEqualInt(V1, V2);
  1529. *pCR = 0;
  1530. if (XMVector4EqualInt(Control, XMVectorTrueInt()))
  1531. {
  1532. // All elements are equal
  1533. *pCR |= XM_CRMASK_CR6TRUE;
  1534. }
  1535. else if (XMVector4EqualInt(Control, XMVectorFalseInt()))
  1536. {
  1537. // All elements are not equal
  1538. *pCR |= XM_CRMASK_CR6FALSE;
  1539. }
  1540. return Control;
  1541. #elif defined(_XM_ARM_NEON_INTRINSICS_)
  1542. uint32x4_t vResult = vceqq_u32( V1, V2 );
  1543. int8x8x2_t vTemp = vzip_u8(vget_low_u8(vResult), vget_high_u8(vResult));
  1544. vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]);
  1545. uint32_t r = vget_lane_u32(vTemp.val[1], 1);
  1546. uint32_t CR = 0;
  1547. if ( r == 0xFFFFFFFFU )
  1548. {
  1549. // All elements are equal
  1550. CR = XM_CRMASK_CR6TRUE;
  1551. }
  1552. else if ( !r )
  1553. {
  1554. // All elements are not equal
  1555. CR = XM_CRMASK_CR6FALSE;
  1556. }
  1557. *pCR = CR;
  1558. return vResult;
  1559. #elif defined(_XM_SSE_INTRINSICS_)
  1560. __m128i V = _mm_cmpeq_epi32( _mm_castps_si128(V1),_mm_castps_si128(V2) );
  1561. int iTemp = _mm_movemask_ps(_mm_castsi128_ps(V));
  1562. uint32_t CR = 0;
  1563. if (iTemp==0x0F)
  1564. {
  1565. CR = XM_CRMASK_CR6TRUE;
  1566. }
  1567. else if (!iTemp)
  1568. {
  1569. CR = XM_CRMASK_CR6FALSE;
  1570. }
  1571. *pCR = CR;
  1572. return _mm_castsi128_ps(V);
  1573. #endif
  1574. }
  1575. //------------------------------------------------------------------------------
  1576. inline XMVECTOR XM_CALLCONV XMVectorNearEqual
  1577. (
  1578. FXMVECTOR V1,
  1579. FXMVECTOR V2,
  1580. FXMVECTOR Epsilon
  1581. )
  1582. {
  1583. #if defined(_XM_NO_INTRINSICS_)
  1584. float fDeltax = V1.vector4_f32[0]-V2.vector4_f32[0];
  1585. float fDeltay = V1.vector4_f32[1]-V2.vector4_f32[1];
  1586. float fDeltaz = V1.vector4_f32[2]-V2.vector4_f32[2];
  1587. float fDeltaw = V1.vector4_f32[3]-V2.vector4_f32[3];
  1588. fDeltax = fabsf(fDeltax);
  1589. fDeltay = fabsf(fDeltay);
  1590. fDeltaz = fabsf(fDeltaz);
  1591. fDeltaw = fabsf(fDeltaw);
  1592. XMVECTORU32 Control = { { {
  1593. (fDeltax <= Epsilon.vector4_f32[0]) ? 0xFFFFFFFFU : 0,
  1594. (fDeltay <= Epsilon.vector4_f32[1]) ? 0xFFFFFFFFU : 0,
  1595. (fDeltaz <= Epsilon.vector4_f32[2]) ? 0xFFFFFFFFU : 0,
  1596. (fDeltaw <= Epsilon.vector4_f32[3]) ? 0xFFFFFFFFU : 0,
  1597. } } };
  1598. return Control.v;
  1599. #elif defined(_XM_ARM_NEON_INTRINSICS_)
  1600. XMVECTOR vDelta = vsubq_f32(V1,V2);
  1601. return vacleq_f32( vDelta, Epsilon );
  1602. #elif defined(_XM_SSE_INTRINSICS_)
  1603. // Get the difference
  1604. XMVECTOR vDelta = _mm_sub_ps(V1,V2);
  1605. // Get the absolute value of the difference
  1606. XMVECTOR vTemp = _mm_setzero_ps();
  1607. vTemp = _mm_sub_ps(vTemp,vDelta);
  1608. vTemp = _mm_max_ps(vTemp,vDelta);
  1609. vTemp = _mm_cmple_ps(vTemp,Epsilon);
  1610. return vTemp;
  1611. #endif
  1612. }
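// Illustrative usage (not part of the library source): Epsilon is a per-component
// tolerance, so a single scalar tolerance is usually replicated:
//
//      XMVECTOR eps   = XMVectorReplicate(1e-4f);
//      XMVECTOR close = XMVectorNearEqual(V1, V2, eps);   // |V1 - V2| <= eps, per component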
  1613. //------------------------------------------------------------------------------
  1614. inline XMVECTOR XM_CALLCONV XMVectorNotEqual
  1615. (
  1616. FXMVECTOR V1,
  1617. FXMVECTOR V2
  1618. )
  1619. {
  1620. #if defined(_XM_NO_INTRINSICS_)
  1621. XMVECTORU32 Control = { { {
  1622. (V1.vector4_f32[0] != V2.vector4_f32[0]) ? 0xFFFFFFFF : 0,
  1623. (V1.vector4_f32[1] != V2.vector4_f32[1]) ? 0xFFFFFFFF : 0,
  1624. (V1.vector4_f32[2] != V2.vector4_f32[2]) ? 0xFFFFFFFF : 0,
  1625. (V1.vector4_f32[3] != V2.vector4_f32[3]) ? 0xFFFFFFFF : 0,
  1626. } } };
  1627. return Control.v;
  1628. #elif defined(_XM_ARM_NEON_INTRINSICS_)
  1629. return vmvnq_u32(vceqq_f32(V1, V2));
  1630. #elif defined(_XM_SSE_INTRINSICS_)
  1631. return _mm_cmpneq_ps( V1, V2 );
  1632. #endif
  1633. }
  1634. //------------------------------------------------------------------------------
  1635. inline XMVECTOR XM_CALLCONV XMVectorNotEqualInt
  1636. (
  1637. FXMVECTOR V1,
  1638. FXMVECTOR V2
  1639. )
  1640. {
  1641. #if defined(_XM_NO_INTRINSICS_)
  1642. XMVECTORU32 Control = { { {
  1643. (V1.vector4_u32[0] != V2.vector4_u32[0]) ? 0xFFFFFFFFU : 0,
  1644. (V1.vector4_u32[1] != V2.vector4_u32[1]) ? 0xFFFFFFFFU : 0,
  1645. (V1.vector4_u32[2] != V2.vector4_u32[2]) ? 0xFFFFFFFFU : 0,
  1646. (V1.vector4_u32[3] != V2.vector4_u32[3]) ? 0xFFFFFFFFU : 0
  1647. } } };
  1648. return Control.v;
  1649. #elif defined(_XM_ARM_NEON_INTRINSICS_)
  1650. return vmvnq_u32(vceqq_u32(V1, V2));
  1651. #elif defined(_XM_SSE_INTRINSICS_)
  1652. __m128i V = _mm_cmpeq_epi32( _mm_castps_si128(V1),_mm_castps_si128(V2) );
  1653. return _mm_xor_ps(_mm_castsi128_ps(V),g_XMNegOneMask);
  1654. #endif
  1655. }
  1656. //------------------------------------------------------------------------------
  1657. inline XMVECTOR XM_CALLCONV XMVectorGreater
  1658. (
  1659. FXMVECTOR V1,
  1660. FXMVECTOR V2
  1661. )
  1662. {
  1663. #if defined(_XM_NO_INTRINSICS_)
  1664. XMVECTORU32 Control = { { {
  1665. (V1.vector4_f32[0] > V2.vector4_f32[0]) ? 0xFFFFFFFF : 0,
  1666. (V1.vector4_f32[1] > V2.vector4_f32[1]) ? 0xFFFFFFFF : 0,
  1667. (V1.vector4_f32[2] > V2.vector4_f32[2]) ? 0xFFFFFFFF : 0,
  1668. (V1.vector4_f32[3] > V2.vector4_f32[3]) ? 0xFFFFFFFF : 0
  1669. } } };
  1670. return Control.v;
  1671. #elif defined(_XM_ARM_NEON_INTRINSICS_)
  1672. return vcgtq_f32( V1, V2 );
  1673. #elif defined(_XM_SSE_INTRINSICS_)
  1674. return _mm_cmpgt_ps( V1, V2 );
  1675. #endif
  1676. }
  1677. //------------------------------------------------------------------------------
  1678. _Use_decl_annotations_
  1679. inline XMVECTOR XM_CALLCONV XMVectorGreaterR
  1680. (
  1681. uint32_t* pCR,
  1682. FXMVECTOR V1,
  1683. FXMVECTOR V2
  1684. )
  1685. {
  1686. assert( pCR != nullptr );
  1687. #if defined(_XM_NO_INTRINSICS_)
  1688. uint32_t ux = (V1.vector4_f32[0] > V2.vector4_f32[0]) ? 0xFFFFFFFFU : 0;
  1689. uint32_t uy = (V1.vector4_f32[1] > V2.vector4_f32[1]) ? 0xFFFFFFFFU : 0;
  1690. uint32_t uz = (V1.vector4_f32[2] > V2.vector4_f32[2]) ? 0xFFFFFFFFU : 0;
  1691. uint32_t uw = (V1.vector4_f32[3] > V2.vector4_f32[3]) ? 0xFFFFFFFFU : 0;
  1692. uint32_t CR = 0;
  1693. if (ux&uy&uz&uw)
  1694. {
  1695. // All elements are greater
  1696. CR = XM_CRMASK_CR6TRUE;
  1697. }
  1698. else if (!(ux|uy|uz|uw))
  1699. {
  1700. // All elements are not greater
  1701. CR = XM_CRMASK_CR6FALSE;
  1702. }
  1703. *pCR = CR;
  1704. XMVECTORU32 Control = { { { ux, uy, uz, uw } } };
  1705. return Control.v;
  1706. #elif defined(_XM_ARM_NEON_INTRINSICS_)
  1707. uint32x4_t vResult = vcgtq_f32( V1, V2 );
  1708. int8x8x2_t vTemp = vzip_u8(vget_low_u8(vResult), vget_high_u8(vResult));
  1709. vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]);
  1710. uint32_t r = vget_lane_u32(vTemp.val[1], 1);
  1711. uint32_t CR = 0;
  1712. if ( r == 0xFFFFFFFFU )
  1713. {
  1714. // All elements are greater
  1715. CR = XM_CRMASK_CR6TRUE;
  1716. }
  1717. else if ( !r )
  1718. {
  1719. // All elements are not greater
  1720. CR = XM_CRMASK_CR6FALSE;
  1721. }
  1722. *pCR = CR;
  1723. return vResult;
  1724. #elif defined(_XM_SSE_INTRINSICS_)
  1725. XMVECTOR vTemp = _mm_cmpgt_ps(V1,V2);
  1726. uint32_t CR = 0;
  1727. int iTest = _mm_movemask_ps(vTemp);
  1728. if (iTest==0xf)
  1729. {
  1730. CR = XM_CRMASK_CR6TRUE;
  1731. }
  1732. else if (!iTest)
  1733. {
  1734. // All elements are not greater
  1735. CR = XM_CRMASK_CR6FALSE;
  1736. }
  1737. *pCR = CR;
  1738. return vTemp;
  1739. #endif
  1740. }
  1741. //------------------------------------------------------------------------------
  1742. inline XMVECTOR XM_CALLCONV XMVectorGreaterOrEqual
  1743. (
  1744. FXMVECTOR V1,
  1745. FXMVECTOR V2
  1746. )
  1747. {
  1748. #if defined(_XM_NO_INTRINSICS_)
  1749. XMVECTORU32 Control = { { {
  1750. (V1.vector4_f32[0] >= V2.vector4_f32[0]) ? 0xFFFFFFFF : 0,
  1751. (V1.vector4_f32[1] >= V2.vector4_f32[1]) ? 0xFFFFFFFF : 0,
  1752. (V1.vector4_f32[2] >= V2.vector4_f32[2]) ? 0xFFFFFFFF : 0,
  1753. (V1.vector4_f32[3] >= V2.vector4_f32[3]) ? 0xFFFFFFFF : 0
  1754. } } };
  1755. return Control.v;
  1756. #elif defined(_XM_ARM_NEON_INTRINSICS_)
  1757. return vcgeq_f32( V1, V2 );
  1758. #elif defined(_XM_SSE_INTRINSICS_)
  1759. return _mm_cmpge_ps( V1, V2 );
  1760. #endif
  1761. }
  1762. //------------------------------------------------------------------------------
  1763. _Use_decl_annotations_
  1764. inline XMVECTOR XM_CALLCONV XMVectorGreaterOrEqualR
  1765. (
  1766. uint32_t* pCR,
  1767. FXMVECTOR V1,
  1768. FXMVECTOR V2
  1769. )
  1770. {
  1771. assert( pCR != nullptr );
  1772. #if defined(_XM_NO_INTRINSICS_)
  1773. uint32_t ux = (V1.vector4_f32[0] >= V2.vector4_f32[0]) ? 0xFFFFFFFFU : 0;
  1774. uint32_t uy = (V1.vector4_f32[1] >= V2.vector4_f32[1]) ? 0xFFFFFFFFU : 0;
  1775. uint32_t uz = (V1.vector4_f32[2] >= V2.vector4_f32[2]) ? 0xFFFFFFFFU : 0;
  1776. uint32_t uw = (V1.vector4_f32[3] >= V2.vector4_f32[3]) ? 0xFFFFFFFFU : 0;
  1777. uint32_t CR = 0;
  1778. if (ux&uy&uz&uw)
  1779. {
1780. // All elements are greater or equal
  1781. CR = XM_CRMASK_CR6TRUE;
  1782. }
  1783. else if (!(ux|uy|uz|uw))
  1784. {
1785. // All elements are not greater or equal
  1786. CR = XM_CRMASK_CR6FALSE;
  1787. }
  1788. *pCR = CR;
  1789. XMVECTORU32 Control = { { { ux, uy, uz, uw } } };
  1790. return Control.v;
  1791. #elif defined(_XM_ARM_NEON_INTRINSICS_)
  1792. uint32x4_t vResult = vcgeq_f32( V1, V2 );
  1793. int8x8x2_t vTemp = vzip_u8(vget_low_u8(vResult), vget_high_u8(vResult));
  1794. vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]);
  1795. uint32_t r = vget_lane_u32(vTemp.val[1], 1);
  1796. uint32_t CR = 0;
  1797. if ( r == 0xFFFFFFFFU )
  1798. {
  1799. // All elements are greater or equal
  1800. CR = XM_CRMASK_CR6TRUE;
  1801. }
  1802. else if ( !r )
  1803. {
  1804. // All elements are not greater or equal
  1805. CR = XM_CRMASK_CR6FALSE;
  1806. }
  1807. *pCR = CR;
  1808. return vResult;
  1809. #elif defined(_XM_SSE_INTRINSICS_)
  1810. XMVECTOR vTemp = _mm_cmpge_ps(V1,V2);
  1811. uint32_t CR = 0;
  1812. int iTest = _mm_movemask_ps(vTemp);
  1813. if (iTest==0xf)
  1814. {
  1815. CR = XM_CRMASK_CR6TRUE;
  1816. }
  1817. else if (!iTest)
  1818. {
1819. // All elements are not greater or equal
  1820. CR = XM_CRMASK_CR6FALSE;
  1821. }
  1822. *pCR = CR;
  1823. return vTemp;
  1824. #endif
  1825. }
  1826. //------------------------------------------------------------------------------
  1827. inline XMVECTOR XM_CALLCONV XMVectorLess
  1828. (
  1829. FXMVECTOR V1,
  1830. FXMVECTOR V2
  1831. )
  1832. {
  1833. #if defined(_XM_NO_INTRINSICS_)
  1834. XMVECTORU32 Control = { { {
  1835. (V1.vector4_f32[0] < V2.vector4_f32[0]) ? 0xFFFFFFFF : 0,
  1836. (V1.vector4_f32[1] < V2.vector4_f32[1]) ? 0xFFFFFFFF : 0,
  1837. (V1.vector4_f32[2] < V2.vector4_f32[2]) ? 0xFFFFFFFF : 0,
  1838. (V1.vector4_f32[3] < V2.vector4_f32[3]) ? 0xFFFFFFFF : 0
  1839. } } };
  1840. return Control.v;
  1841. #elif defined(_XM_ARM_NEON_INTRINSICS_)
  1842. return vcltq_f32( V1, V2 );
  1843. #elif defined(_XM_SSE_INTRINSICS_)
  1844. return _mm_cmplt_ps( V1, V2 );
  1845. #endif
  1846. }
  1847. //------------------------------------------------------------------------------
  1848. inline XMVECTOR XM_CALLCONV XMVectorLessOrEqual
  1849. (
  1850. FXMVECTOR V1,
  1851. FXMVECTOR V2
  1852. )
  1853. {
  1854. #if defined(_XM_NO_INTRINSICS_)
  1855. XMVECTORU32 Control = { { {
  1856. (V1.vector4_f32[0] <= V2.vector4_f32[0]) ? 0xFFFFFFFF : 0,
  1857. (V1.vector4_f32[1] <= V2.vector4_f32[1]) ? 0xFFFFFFFF : 0,
  1858. (V1.vector4_f32[2] <= V2.vector4_f32[2]) ? 0xFFFFFFFF : 0,
  1859. (V1.vector4_f32[3] <= V2.vector4_f32[3]) ? 0xFFFFFFFF : 0
  1860. } } };
  1861. return Control.v;
  1862. #elif defined(_XM_ARM_NEON_INTRINSICS_)
  1863. return vcleq_f32( V1, V2 );
  1864. #elif defined(_XM_SSE_INTRINSICS_)
  1865. return _mm_cmple_ps( V1, V2 );
  1866. #endif
  1867. }
  1868. //------------------------------------------------------------------------------
  1869. inline XMVECTOR XM_CALLCONV XMVectorInBounds
  1870. (
  1871. FXMVECTOR V,
  1872. FXMVECTOR Bounds
  1873. )
  1874. {
  1875. #if defined(_XM_NO_INTRINSICS_)
  1876. XMVECTORU32 Control = { { {
  1877. (V.vector4_f32[0] <= Bounds.vector4_f32[0] && V.vector4_f32[0] >= -Bounds.vector4_f32[0]) ? 0xFFFFFFFF : 0,
  1878. (V.vector4_f32[1] <= Bounds.vector4_f32[1] && V.vector4_f32[1] >= -Bounds.vector4_f32[1]) ? 0xFFFFFFFF : 0,
  1879. (V.vector4_f32[2] <= Bounds.vector4_f32[2] && V.vector4_f32[2] >= -Bounds.vector4_f32[2]) ? 0xFFFFFFFF : 0,
  1880. (V.vector4_f32[3] <= Bounds.vector4_f32[3] && V.vector4_f32[3] >= -Bounds.vector4_f32[3]) ? 0xFFFFFFFF : 0
  1881. } } };
  1882. return Control.v;
  1883. #elif defined(_XM_ARM_NEON_INTRINSICS_)
  1884. // Test if less than or equal
  1885. XMVECTOR vTemp1 = vcleq_f32(V,Bounds);
  1886. // Negate the bounds
  1887. XMVECTOR vTemp2 = vnegq_f32(Bounds);
  1888. // Test if greater or equal (Reversed)
  1889. vTemp2 = vcleq_f32(vTemp2,V);
  1890. // Blend answers
  1891. vTemp1 = vandq_u32(vTemp1,vTemp2);
  1892. return vTemp1;
  1893. #elif defined(_XM_SSE_INTRINSICS_)
  1894. // Test if less than or equal
  1895. XMVECTOR vTemp1 = _mm_cmple_ps(V,Bounds);
  1896. // Negate the bounds
  1897. XMVECTOR vTemp2 = _mm_mul_ps(Bounds,g_XMNegativeOne);
  1898. // Test if greater or equal (Reversed)
  1899. vTemp2 = _mm_cmple_ps(vTemp2,V);
  1900. // Blend answers
  1901. vTemp1 = _mm_and_ps(vTemp1,vTemp2);
  1902. return vTemp1;
  1903. #endif
  1904. }
  1905. //------------------------------------------------------------------------------
  1906. _Use_decl_annotations_
  1907. inline XMVECTOR XM_CALLCONV XMVectorInBoundsR
  1908. (
  1909. uint32_t* pCR,
  1910. FXMVECTOR V,
  1911. FXMVECTOR Bounds
  1912. )
  1913. {
  1914. assert( pCR != nullptr );
  1915. #if defined(_XM_NO_INTRINSICS_)
  1916. uint32_t ux = (V.vector4_f32[0] <= Bounds.vector4_f32[0] && V.vector4_f32[0] >= -Bounds.vector4_f32[0]) ? 0xFFFFFFFFU : 0;
  1917. uint32_t uy = (V.vector4_f32[1] <= Bounds.vector4_f32[1] && V.vector4_f32[1] >= -Bounds.vector4_f32[1]) ? 0xFFFFFFFFU : 0;
  1918. uint32_t uz = (V.vector4_f32[2] <= Bounds.vector4_f32[2] && V.vector4_f32[2] >= -Bounds.vector4_f32[2]) ? 0xFFFFFFFFU : 0;
  1919. uint32_t uw = (V.vector4_f32[3] <= Bounds.vector4_f32[3] && V.vector4_f32[3] >= -Bounds.vector4_f32[3]) ? 0xFFFFFFFFU : 0;
  1920. uint32_t CR = 0;
  1921. if (ux&uy&uz&uw)
  1922. {
  1923. // All elements are in bounds
  1924. CR = XM_CRMASK_CR6BOUNDS;
  1925. }
  1926. *pCR = CR;
  1927. XMVECTORU32 Control = { { { ux, uy, uz, uw } } };
  1928. return Control.v;
  1929. #elif defined(_XM_ARM_NEON_INTRINSICS_)
  1930. // Test if less than or equal
  1931. XMVECTOR vTemp1 = vcleq_f32(V,Bounds);
  1932. // Negate the bounds
  1933. XMVECTOR vTemp2 = vnegq_f32(Bounds);
  1934. // Test if greater or equal (Reversed)
  1935. vTemp2 = vcleq_f32(vTemp2,V);
  1936. // Blend answers
  1937. vTemp1 = vandq_u32(vTemp1,vTemp2);
  1938. int8x8x2_t vTemp = vzip_u8(vget_low_u8(vTemp1), vget_high_u8(vTemp1));
  1939. vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]);
  1940. uint32_t r = vget_lane_u32(vTemp.val[1], 1);
  1941. uint32_t CR = 0;
  1942. if ( r == 0xFFFFFFFFU )
  1943. {
  1944. // All elements are in bounds
  1945. CR = XM_CRMASK_CR6BOUNDS;
  1946. }
  1947. *pCR = CR;
  1948. return vTemp1;
  1949. #elif defined(_XM_SSE_INTRINSICS_)
  1950. // Test if less than or equal
  1951. XMVECTOR vTemp1 = _mm_cmple_ps(V,Bounds);
  1952. // Negate the bounds
  1953. XMVECTOR vTemp2 = _mm_mul_ps(Bounds,g_XMNegativeOne);
  1954. // Test if greater or equal (Reversed)
  1955. vTemp2 = _mm_cmple_ps(vTemp2,V);
  1956. // Blend answers
  1957. vTemp1 = _mm_and_ps(vTemp1,vTemp2);
  1958. uint32_t CR = 0;
  1959. if (_mm_movemask_ps(vTemp1)==0xf) {
  1960. // All elements are in bounds
  1961. CR = XM_CRMASK_CR6BOUNDS;
  1962. }
  1963. *pCR = CR;
  1964. return vTemp1;
  1965. #endif
  1966. }
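// Illustrative usage (not part of the library source): the recorded CR value is tested
// with XMComparisonAllInBounds, e.g. to check a point against a symmetric box extent:
//
//      uint32_t cr;
//      XMVectorInBoundsR(&cr, Point, Extents);            // -Extents <= Point <= Extents ?
//      if (XMComparisonAllInBounds(cr)) { /* inside on every axis */ }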
  1967. //------------------------------------------------------------------------------
  1968. inline XMVECTOR XM_CALLCONV XMVectorIsNaN
  1969. (
  1970. FXMVECTOR V
  1971. )
  1972. {
  1973. #if defined(_XM_NO_INTRINSICS_)
  1974. XMVECTORU32 Control = { { {
  1975. XMISNAN(V.vector4_f32[0]) ? 0xFFFFFFFFU : 0,
  1976. XMISNAN(V.vector4_f32[1]) ? 0xFFFFFFFFU : 0,
  1977. XMISNAN(V.vector4_f32[2]) ? 0xFFFFFFFFU : 0,
  1978. XMISNAN(V.vector4_f32[3]) ? 0xFFFFFFFFU : 0
  1979. } } };
  1980. return Control.v;
  1981. #elif defined(_XM_ARM_NEON_INTRINSICS_)
  1982. // Test against itself. NaN is always not equal
  1983. uint32x4_t vTempNan = vceqq_f32( V, V );
  1984. // Flip results
  1985. return vmvnq_u32( vTempNan );
  1986. #elif defined(_XM_SSE_INTRINSICS_)
  1987. // Test against itself. NaN is always not equal
  1988. return _mm_cmpneq_ps(V,V);
  1989. #endif
  1990. }
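// Illustrative usage (not part of the library source): the self-comparison works because
// NaN compares unequal to everything, including itself.
//
//      XMVECTOR mask = XMVectorIsNaN(V);                  // all-ones in each NaN lane
//      if (XMVector4EqualInt(mask, XMVectorFalseInt()))
//      {
//          // no component of V is NaN
//      }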
  1991. //------------------------------------------------------------------------------
  1992. inline XMVECTOR XM_CALLCONV XMVectorIsInfinite
  1993. (
  1994. FXMVECTOR V
  1995. )
  1996. {
  1997. #if defined(_XM_NO_INTRINSICS_)
  1998. XMVECTORU32 Control = { { {
  1999. XMISINF(V.vector4_f32[0]) ? 0xFFFFFFFFU : 0,
  2000. XMISINF(V.vector4_f32[1]) ? 0xFFFFFFFFU : 0,
  2001. XMISINF(V.vector4_f32[2]) ? 0xFFFFFFFFU : 0,
  2002. XMISINF(V.vector4_f32[3]) ? 0xFFFFFFFFU : 0
  2003. } } };
  2004. return Control.v;
  2005. #elif defined(_XM_ARM_NEON_INTRINSICS_)
  2006. // Mask off the sign bit
  2007. uint32x4_t vTemp = vandq_u32(V,g_XMAbsMask);
  2008. // Compare to infinity
  2009. vTemp = vceqq_f32(vTemp,g_XMInfinity);
2010. // Lanes equal to +/-infinity become 0xFFFFFFFF, all others become 0
  2011. return vTemp;
  2012. #elif defined(_XM_SSE_INTRINSICS_)
  2013. // Mask off the sign bit
  2014. __m128 vTemp = _mm_and_ps(V,g_XMAbsMask);
  2015. // Compare to infinity
  2016. vTemp = _mm_cmpeq_ps(vTemp,g_XMInfinity);
2017. // Lanes equal to +/-infinity become 0xFFFFFFFF, all others become 0
  2018. return vTemp;
  2019. #endif
  2020. }
  2021. //------------------------------------------------------------------------------
  2022. // Rounding and clamping operations
  2023. //------------------------------------------------------------------------------
  2024. //------------------------------------------------------------------------------
  2025. inline XMVECTOR XM_CALLCONV XMVectorMin
  2026. (
  2027. FXMVECTOR V1,
  2028. FXMVECTOR V2
  2029. )
  2030. {
  2031. #if defined(_XM_NO_INTRINSICS_)
  2032. XMVECTORF32 Result = { { {
  2033. (V1.vector4_f32[0] < V2.vector4_f32[0]) ? V1.vector4_f32[0] : V2.vector4_f32[0],
  2034. (V1.vector4_f32[1] < V2.vector4_f32[1]) ? V1.vector4_f32[1] : V2.vector4_f32[1],
  2035. (V1.vector4_f32[2] < V2.vector4_f32[2]) ? V1.vector4_f32[2] : V2.vector4_f32[2],
  2036. (V1.vector4_f32[3] < V2.vector4_f32[3]) ? V1.vector4_f32[3] : V2.vector4_f32[3]
  2037. } } };
  2038. return Result.v;
  2039. #elif defined(_XM_ARM_NEON_INTRINSICS_)
  2040. return vminq_f32( V1, V2 );
  2041. #elif defined(_XM_SSE_INTRINSICS_)
  2042. return _mm_min_ps( V1, V2 );
  2043. #endif
  2044. }
  2045. //------------------------------------------------------------------------------
  2046. inline XMVECTOR XM_CALLCONV XMVectorMax
  2047. (
  2048. FXMVECTOR V1,
  2049. FXMVECTOR V2
  2050. )
  2051. {
  2052. #if defined(_XM_NO_INTRINSICS_)
  2053. XMVECTORF32 Result = { { {
  2054. (V1.vector4_f32[0] > V2.vector4_f32[0]) ? V1.vector4_f32[0] : V2.vector4_f32[0],
  2055. (V1.vector4_f32[1] > V2.vector4_f32[1]) ? V1.vector4_f32[1] : V2.vector4_f32[1],
  2056. (V1.vector4_f32[2] > V2.vector4_f32[2]) ? V1.vector4_f32[2] : V2.vector4_f32[2],
  2057. (V1.vector4_f32[3] > V2.vector4_f32[3]) ? V1.vector4_f32[3] : V2.vector4_f32[3]
  2058. } } };
  2059. return Result.v;
  2060. #elif defined(_XM_ARM_NEON_INTRINSICS_)
  2061. return vmaxq_f32( V1, V2 );
  2062. #elif defined(_XM_SSE_INTRINSICS_)
  2063. return _mm_max_ps( V1, V2 );
  2064. #endif
  2065. }
  2066. //------------------------------------------------------------------------------
  2067. namespace Internal
  2068. {
  2069. // Round to nearest (even) a.k.a. banker's rounding
  2070. inline float round_to_nearest( float x )
  2071. {
  2072. float i = floorf(x);
  2073. x -= i;
  2074. if(x < 0.5f)
  2075. return i;
  2076. if(x > 0.5f)
  2077. return i + 1.f;
  2078. float int_part;
  2079. (void)modff( i / 2.f, &int_part );
  2080. if ( (2.f*int_part) == i )
  2081. {
  2082. return i;
  2083. }
  2084. return i + 1.f;
  2085. }
2086. }
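// Illustrative values (not part of the library source) for the round-to-nearest-even
// helper above: ties go to the even neighbour, unlike roundf():
//
//      Internal::round_to_nearest(0.5f)  == 0.f
//      Internal::round_to_nearest(1.5f)  == 2.f
//      Internal::round_to_nearest(2.5f)  == 2.f
//      Internal::round_to_nearest(-0.5f) == 0.f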
  2087. #if !defined(_XM_NO_INTRINSICS_) && !defined(__clang__)
  2088. #pragma float_control(push)
  2089. #pragma float_control(precise, on)
  2090. #endif
  2091. inline XMVECTOR XM_CALLCONV XMVectorRound
  2092. (
  2093. FXMVECTOR V
  2094. )
  2095. {
  2096. #if defined(_XM_NO_INTRINSICS_)
  2097. XMVECTORF32 Result = { { {
  2098. Internal::round_to_nearest(V.vector4_f32[0]),
  2099. Internal::round_to_nearest(V.vector4_f32[1]),
  2100. Internal::round_to_nearest(V.vector4_f32[2]),
  2101. Internal::round_to_nearest(V.vector4_f32[3])
  2102. } } };
  2103. return Result.v;
  2104. #elif defined(_XM_ARM_NEON_INTRINSICS_)
  2105. #if defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64)
  2106. return vrndnq_f32(V);
  2107. #else
  2108. uint32x4_t sign = vandq_u32( V, g_XMNegativeZero );
  2109. uint32x4_t sMagic = vorrq_u32( g_XMNoFraction, sign );
  2110. float32x4_t R1 = vaddq_f32( V, sMagic );
  2111. R1 = vsubq_f32( R1, sMagic );
  2112. float32x4_t R2 = vabsq_f32( V );
  2113. uint32x4_t mask = vcleq_f32( R2, g_XMNoFraction );
  2114. XMVECTOR vResult = vbslq_f32( mask, R1, V );
  2115. return vResult;
  2116. #endif
  2117. #elif defined(_XM_SSE4_INTRINSICS_)
  2118. return _mm_round_ps( V, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC );
  2119. #elif defined(_XM_SSE_INTRINSICS_)
  2120. __m128 sign = _mm_and_ps( V, g_XMNegativeZero );
  2121. __m128 sMagic = _mm_or_ps( g_XMNoFraction, sign );
  2122. __m128 R1 = _mm_add_ps( V, sMagic );
  2123. R1 = _mm_sub_ps( R1, sMagic );
  2124. __m128 R2 = _mm_and_ps( V, g_XMAbsMask );
  2125. __m128 mask = _mm_cmple_ps( R2, g_XMNoFraction );
  2126. R2 = _mm_andnot_ps(mask,V);
  2127. R1 = _mm_and_ps(R1,mask);
  2128. XMVECTOR vResult = _mm_xor_ps(R1, R2);
  2129. return vResult;
  2130. #endif
  2131. }
  2132. #if !defined(_XM_NO_INTRINSICS_) && !defined(__clang__)
  2133. #pragma float_control(pop)
  2134. #endif
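// Illustrative results (not part of the library source): XMVectorRound rounds half-way
// cases to the nearest even value:
//
//      XMVectorRound(XMVectorSet(0.5f, 1.5f, -2.5f, 3.3f)) -> (0.f, 2.f, -2.f, 3.f)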
  2135. //------------------------------------------------------------------------------
  2136. inline XMVECTOR XM_CALLCONV XMVectorTruncate
  2137. (
  2138. FXMVECTOR V
  2139. )
  2140. {
  2141. #if defined(_XM_NO_INTRINSICS_)
  2142. XMVECTOR Result;
  2143. uint32_t i;
  2144. // Avoid C4701
  2145. Result.vector4_f32[0] = 0.0f;
  2146. for (i = 0; i < 4; i++)
  2147. {
  2148. if (XMISNAN(V.vector4_f32[i]))
  2149. {
  2150. Result.vector4_u32[i] = 0x7FC00000;
  2151. }
  2152. else if (fabsf(V.vector4_f32[i]) < 8388608.0f)
  2153. {
  2154. Result.vector4_f32[i] = (float)((int32_t)V.vector4_f32[i]);
  2155. }
  2156. else
  2157. {
  2158. Result.vector4_f32[i] = V.vector4_f32[i];
  2159. }
  2160. }
  2161. return Result;
  2162. #elif defined(_XM_ARM_NEON_INTRINSICS_)
  2163. #if defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64)
  2164. return vrndq_f32(V);
  2165. #else
  2166. float32x4_t vTest = vabsq_f32( V );
  2167. vTest = vcltq_f32( vTest, g_XMNoFraction );
  2168. int32x4_t vInt = vcvtq_s32_f32( V );
  2169. XMVECTOR vResult = vcvtq_f32_s32( vInt );
  2170. // All numbers less than 8388608 will use the round to int
  2171. // All others, use the ORIGINAL value
  2172. return vbslq_f32( vTest, vResult, V );
  2173. #endif
  2174. #elif defined(_XM_SSE4_INTRINSICS_)
  2175. return _mm_round_ps( V, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC );
  2176. #elif defined(_XM_SSE_INTRINSICS_)
  2177. // To handle NAN, INF and numbers greater than 8388608, use masking
  2178. // Get the abs value
  2179. __m128i vTest = _mm_and_si128(_mm_castps_si128(V),g_XMAbsMask);
2180. // Test for abs(V) < 8388608 (floats at or above 2^23 have no fractional part; NAN and INF also fail this test)
  2181. vTest = _mm_cmplt_epi32(vTest,g_XMNoFraction);
  2182. // Convert to int and back to float for rounding with truncation
  2183. __m128i vInt = _mm_cvttps_epi32(V);
  2184. // Convert back to floats
  2185. XMVECTOR vResult = _mm_cvtepi32_ps(vInt);
  2186. // All numbers less than 8388608 will use the round to int
  2187. vResult = _mm_and_ps(vResult,_mm_castsi128_ps(vTest));
  2188. // All others, use the ORIGINAL value
  2189. vTest = _mm_andnot_si128(vTest,_mm_castps_si128(V));
  2190. vResult = _mm_or_ps(vResult,_mm_castsi128_ps(vTest));
  2191. return vResult;
  2192. #endif
  2193. }
  2194. //------------------------------------------------------------------------------
  2195. inline XMVECTOR XM_CALLCONV XMVectorFloor
  2196. (
  2197. FXMVECTOR V
  2198. )
  2199. {
  2200. #if defined(_XM_NO_INTRINSICS_)
  2201. XMVECTORF32 Result = { { {
  2202. floorf(V.vector4_f32[0]),
  2203. floorf(V.vector4_f32[1]),
  2204. floorf(V.vector4_f32[2]),
  2205. floorf(V.vector4_f32[3])
  2206. } } };
  2207. return Result.v;
  2208. #elif defined(_XM_ARM_NEON_INTRINSICS_)
  2209. #if defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64)
  2210. return vrndmq_f32(V);
  2211. #else
  2212. float32x4_t vTest = vabsq_f32( V );
  2213. vTest = vcltq_f32( vTest, g_XMNoFraction );
  2214. // Truncate
  2215. int32x4_t vInt = vcvtq_s32_f32( V );
  2216. XMVECTOR vResult = vcvtq_f32_s32( vInt );
  2217. XMVECTOR vLarger = vcgtq_f32( vResult, V );
  2218. // 0 -> 0, 0xffffffff -> -1.0f
  2219. vLarger = vcvtq_f32_s32( vLarger );
  2220. vResult = vaddq_f32( vResult, vLarger );
  2221. // All numbers less than 8388608 will use the round to int
  2222. // All others, use the ORIGINAL value
  2223. return vbslq_f32( vTest, vResult, V );
  2224. #endif
  2225. #elif defined(_XM_SSE4_INTRINSICS_)
  2226. return _mm_floor_ps( V );
  2227. #elif defined(_XM_SSE_INTRINSICS_)
  2228. // To handle NAN, INF and numbers greater than 8388608, use masking
  2229. __m128i vTest = _mm_and_si128(_mm_castps_si128(V),g_XMAbsMask);
  2230. vTest = _mm_cmplt_epi32(vTest,g_XMNoFraction);
  2231. // Truncate
  2232. __m128i vInt = _mm_cvttps_epi32(V);
  2233. XMVECTOR vResult = _mm_cvtepi32_ps(vInt);
  2234. __m128 vLarger = _mm_cmpgt_ps( vResult, V );
  2235. // 0 -> 0, 0xffffffff -> -1.0f
  2236. vLarger = _mm_cvtepi32_ps( _mm_castps_si128( vLarger ) );
  2237. vResult = _mm_add_ps( vResult, vLarger );
  2238. // All numbers less than 8388608 will use the round to int
  2239. vResult = _mm_and_ps(vResult,_mm_castsi128_ps(vTest));
  2240. // All others, use the ORIGINAL value
  2241. vTest = _mm_andnot_si128(vTest,_mm_castps_si128(V));
  2242. vResult = _mm_or_ps(vResult,_mm_castsi128_ps(vTest));
  2243. return vResult;
  2244. #endif
  2245. }
  2246. //------------------------------------------------------------------------------
  2247. inline XMVECTOR XM_CALLCONV XMVectorCeiling
  2248. (
  2249. FXMVECTOR V
  2250. )
  2251. {
  2252. #if defined(_XM_NO_INTRINSICS_)
  2253. XMVECTORF32 Result = { { {
  2254. ceilf(V.vector4_f32[0]),
  2255. ceilf(V.vector4_f32[1]),
  2256. ceilf(V.vector4_f32[2]),
  2257. ceilf(V.vector4_f32[3])
  2258. } } };
  2259. return Result.v;
  2260. #elif defined(_XM_ARM_NEON_INTRINSICS_)
  2261. #if defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64)
  2262. return vrndpq_f32(V);
  2263. #else
  2264. float32x4_t vTest = vabsq_f32( V );
  2265. vTest = vcltq_f32( vTest, g_XMNoFraction );
  2266. // Truncate
  2267. int32x4_t vInt = vcvtq_s32_f32( V );
  2268. XMVECTOR vResult = vcvtq_f32_s32( vInt );
  2269. XMVECTOR vSmaller = vcltq_f32( vResult, V );
  2270. // 0 -> 0, 0xffffffff -> -1.0f
  2271. vSmaller = vcvtq_f32_s32( vSmaller );
  2272. vResult = vsubq_f32( vResult, vSmaller );
  2273. // All numbers less than 8388608 will use the round to int
  2274. // All others, use the ORIGINAL value
  2275. return vbslq_f32( vTest, vResult, V );
  2276. #endif
  2277. #elif defined(_XM_SSE4_INTRINSICS_)
  2278. return _mm_ceil_ps( V );
  2279. #elif defined(_XM_SSE_INTRINSICS_)
  2280. // To handle NAN, INF and numbers greater than 8388608, use masking
  2281. __m128i vTest = _mm_and_si128(_mm_castps_si128(V),g_XMAbsMask);
  2282. vTest = _mm_cmplt_epi32(vTest,g_XMNoFraction);
  2283. // Truncate
  2284. __m128i vInt = _mm_cvttps_epi32(V);
  2285. XMVECTOR vResult = _mm_cvtepi32_ps(vInt);
  2286. __m128 vSmaller = _mm_cmplt_ps( vResult, V );
  2287. // 0 -> 0, 0xffffffff -> -1.0f
  2288. vSmaller = _mm_cvtepi32_ps( _mm_castps_si128( vSmaller ) );
  2289. vResult = _mm_sub_ps( vResult, vSmaller );
  2290. // All numbers less than 8388608 will use the round to int
  2291. vResult = _mm_and_ps(vResult,_mm_castsi128_ps(vTest));
  2292. // All others, use the ORIGINAL value
  2293. vTest = _mm_andnot_si128(vTest,_mm_castps_si128(V));
  2294. vResult = _mm_or_ps(vResult,_mm_castsi128_ps(vTest));
  2295. return vResult;
  2296. #endif
  2297. }
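// Illustrative results (not part of the library source) contrasting the three rounding
// modes above for a negative input:
//
//      XMVectorTruncate(XMVectorReplicate(-1.7f)) -> -1.f   (toward zero)
//      XMVectorFloor   (XMVectorReplicate(-1.7f)) -> -2.f   (toward -infinity)
//      XMVectorCeiling (XMVectorReplicate(-1.7f)) -> -1.f   (toward +infinity)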
  2298. //------------------------------------------------------------------------------
  2299. inline XMVECTOR XM_CALLCONV XMVectorClamp
  2300. (
  2301. FXMVECTOR V,
  2302. FXMVECTOR Min,
  2303. FXMVECTOR Max
  2304. )
  2305. {
  2306. assert(XMVector4LessOrEqual(Min, Max));
  2307. #if defined(_XM_NO_INTRINSICS_)
  2308. XMVECTOR Result;
  2309. Result = XMVectorMax(Min, V);
  2310. Result = XMVectorMin(Max, Result);
  2311. return Result;
  2312. #elif defined(_XM_ARM_NEON_INTRINSICS_)
  2313. XMVECTOR vResult;
  2314. vResult = vmaxq_f32(Min,V);
  2315. vResult = vminq_f32(vResult,Max);
  2316. return vResult;
  2317. #elif defined(_XM_SSE_INTRINSICS_)
  2318. XMVECTOR vResult;
  2319. vResult = _mm_max_ps(Min,V);
  2320. vResult = _mm_min_ps(vResult,Max);
  2321. return vResult;
  2322. #endif
  2323. }
  2324. //------------------------------------------------------------------------------
  2325. inline XMVECTOR XM_CALLCONV XMVectorSaturate
  2326. (
  2327. FXMVECTOR V
  2328. )
  2329. {
  2330. #if defined(_XM_NO_INTRINSICS_)
  2331. const XMVECTOR Zero = XMVectorZero();
  2332. return XMVectorClamp(V, Zero, g_XMOne.v);
  2333. #elif defined(_XM_ARM_NEON_INTRINSICS_)
  2334. // Set <0 to 0
  2335. XMVECTOR vResult = vmaxq_f32(V, vdupq_n_f32(0) );
2336. // Set >1 to 1
  2337. return vminq_f32(vResult, vdupq_n_f32(1.0f) );
  2338. #elif defined(_XM_SSE_INTRINSICS_)
  2339. // Set <0 to 0
  2340. XMVECTOR vResult = _mm_max_ps(V,g_XMZero);
2341. // Set >1 to 1
  2342. return _mm_min_ps(vResult,g_XMOne);
  2343. #endif
  2344. }
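// Illustrative result (not part of the library source):
//
//      XMVectorSaturate(XMVectorSet(-0.5f, 0.25f, 1.75f, 1.f)) -> (0.f, 0.25f, 1.f, 1.f)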
  2345. //------------------------------------------------------------------------------
  2346. // Bitwise logical operations
  2347. //------------------------------------------------------------------------------
  2348. inline XMVECTOR XM_CALLCONV XMVectorAndInt
  2349. (
  2350. FXMVECTOR V1,
  2351. FXMVECTOR V2
  2352. )
  2353. {
  2354. #if defined(_XM_NO_INTRINSICS_)
  2355. XMVECTORU32 Result = { { {
  2356. V1.vector4_u32[0] & V2.vector4_u32[0],
  2357. V1.vector4_u32[1] & V2.vector4_u32[1],
  2358. V1.vector4_u32[2] & V2.vector4_u32[2],
  2359. V1.vector4_u32[3] & V2.vector4_u32[3]
  2360. } } };
2361. return Result.v;
  2362. #elif defined(_XM_ARM_NEON_INTRINSICS_)
  2363. return vandq_u32(V1,V2);
  2364. #elif defined(_XM_SSE_INTRINSICS_)
  2365. return _mm_and_ps(V1,V2);
  2366. #endif
  2367. }
  2368. //------------------------------------------------------------------------------
  2369. inline XMVECTOR XM_CALLCONV XMVectorAndCInt
  2370. (
  2371. FXMVECTOR V1,
  2372. FXMVECTOR V2
  2373. )
  2374. {
  2375. #if defined(_XM_NO_INTRINSICS_)
  2376. XMVECTORU32 Result = { { {
  2377. V1.vector4_u32[0] & ~V2.vector4_u32[0],
  2378. V1.vector4_u32[1] & ~V2.vector4_u32[1],
  2379. V1.vector4_u32[2] & ~V2.vector4_u32[2],
  2380. V1.vector4_u32[3] & ~V2.vector4_u32[3]
  2381. } } };
  2382. return Result.v;
  2383. #elif defined(_XM_ARM_NEON_INTRINSICS_)
  2384. return vbicq_u32(V1,V2);
  2385. #elif defined(_XM_SSE_INTRINSICS_)
  2386. __m128i V = _mm_andnot_si128( _mm_castps_si128(V2), _mm_castps_si128(V1) );
  2387. return _mm_castsi128_ps(V);
  2388. #endif
  2389. }
  2390. //------------------------------------------------------------------------------
  2391. inline XMVECTOR XM_CALLCONV XMVectorOrInt
  2392. (
  2393. FXMVECTOR V1,
  2394. FXMVECTOR V2
  2395. )
  2396. {
  2397. #if defined(_XM_NO_INTRINSICS_)
  2398. XMVECTORU32 Result = { { {
  2399. V1.vector4_u32[0] | V2.vector4_u32[0],
  2400. V1.vector4_u32[1] | V2.vector4_u32[1],
  2401. V1.vector4_u32[2] | V2.vector4_u32[2],
  2402. V1.vector4_u32[3] | V2.vector4_u32[3]
  2403. } } };
  2404. return Result.v;
  2405. #elif defined(_XM_ARM_NEON_INTRINSICS_)
  2406. return vorrq_u32(V1,V2);
  2407. #elif defined(_XM_SSE_INTRINSICS_)
  2408. __m128i V = _mm_or_si128( _mm_castps_si128(V1), _mm_castps_si128(V2) );
  2409. return _mm_castsi128_ps(V);
  2410. #endif
  2411. }
  2412. //------------------------------------------------------------------------------
  2413. inline XMVECTOR XM_CALLCONV XMVectorNorInt
  2414. (
  2415. FXMVECTOR V1,
  2416. FXMVECTOR V2
  2417. )
  2418. {
  2419. #if defined(_XM_NO_INTRINSICS_)
  2420. XMVECTORU32 Result = { { {
  2421. ~(V1.vector4_u32[0] | V2.vector4_u32[0]),
  2422. ~(V1.vector4_u32[1] | V2.vector4_u32[1]),
  2423. ~(V1.vector4_u32[2] | V2.vector4_u32[2]),
  2424. ~(V1.vector4_u32[3] | V2.vector4_u32[3])
  2425. } } };
  2426. return Result.v;
  2427. #elif defined(_XM_ARM_NEON_INTRINSICS_)
  2428. uint32x4_t Result = vorrq_u32(V1,V2);
  2429. return vbicq_u32(g_XMNegOneMask, Result);
  2430. #elif defined(_XM_SSE_INTRINSICS_)
  2431. __m128i Result;
  2432. Result = _mm_or_si128( _mm_castps_si128(V1), _mm_castps_si128(V2) );
  2433. Result = _mm_andnot_si128( Result,g_XMNegOneMask);
  2434. return _mm_castsi128_ps(Result);
  2435. #endif
  2436. }
  2437. //------------------------------------------------------------------------------
  2438. inline XMVECTOR XM_CALLCONV XMVectorXorInt
  2439. (
  2440. FXMVECTOR V1,
  2441. FXMVECTOR V2
  2442. )
  2443. {
  2444. #if defined(_XM_NO_INTRINSICS_)
  2445. XMVECTORU32 Result = { { {
  2446. V1.vector4_u32[0] ^ V2.vector4_u32[0],
  2447. V1.vector4_u32[1] ^ V2.vector4_u32[1],
  2448. V1.vector4_u32[2] ^ V2.vector4_u32[2],
  2449. V1.vector4_u32[3] ^ V2.vector4_u32[3]
  2450. } } };
  2451. return Result.v;
  2452. #elif defined(_XM_ARM_NEON_INTRINSICS_)
  2453. return veorq_u32(V1,V2);
  2454. #elif defined(_XM_SSE_INTRINSICS_)
  2455. __m128i V = _mm_xor_si128( _mm_castps_si128(V1), _mm_castps_si128(V2) );
  2456. return _mm_castsi128_ps(V);
  2457. #endif
  2458. }
  2459. //------------------------------------------------------------------------------
  2460. // Computation operations
  2461. //------------------------------------------------------------------------------
  2462. //------------------------------------------------------------------------------
  2463. inline XMVECTOR XM_CALLCONV XMVectorNegate
  2464. (
  2465. FXMVECTOR V
  2466. )
  2467. {
  2468. #if defined(_XM_NO_INTRINSICS_)
  2469. XMVECTORF32 Result = { { {
  2470. -V.vector4_f32[0],
  2471. -V.vector4_f32[1],
  2472. -V.vector4_f32[2],
  2473. -V.vector4_f32[3]
  2474. } } };
  2475. return Result.v;
  2476. #elif defined(_XM_ARM_NEON_INTRINSICS_)
  2477. return vnegq_f32(V);
  2478. #elif defined(_XM_SSE_INTRINSICS_)
  2479. XMVECTOR Z;
  2480. Z = _mm_setzero_ps();
  2481. return _mm_sub_ps( Z, V );
  2482. #endif
  2483. }
  2484. //------------------------------------------------------------------------------
  2485. inline XMVECTOR XM_CALLCONV XMVectorAdd
  2486. (
  2487. FXMVECTOR V1,
  2488. FXMVECTOR V2
  2489. )
  2490. {
  2491. #if defined(_XM_NO_INTRINSICS_)
  2492. XMVECTORF32 Result = { { {
  2493. V1.vector4_f32[0] + V2.vector4_f32[0],
  2494. V1.vector4_f32[1] + V2.vector4_f32[1],
  2495. V1.vector4_f32[2] + V2.vector4_f32[2],
  2496. V1.vector4_f32[3] + V2.vector4_f32[3]
  2497. } } };
  2498. return Result.v;
  2499. #elif defined(_XM_ARM_NEON_INTRINSICS_)
  2500. return vaddq_f32( V1, V2 );
  2501. #elif defined(_XM_SSE_INTRINSICS_)
  2502. return _mm_add_ps( V1, V2 );
  2503. #endif
  2504. }
  2505. //------------------------------------------------------------------------------
  2506. inline XMVECTOR XM_CALLCONV XMVectorSum
  2507. (
  2508. FXMVECTOR V
  2509. )
  2510. {
  2511. #if defined(_XM_NO_INTRINSICS_)
  2512. XMVECTORF32 Result;
  2513. Result.f[0] =
  2514. Result.f[1] =
  2515. Result.f[2] =
  2516. Result.f[3] = V.vector4_f32[0] + V.vector4_f32[1] + V.vector4_f32[2] + V.vector4_f32[3];
  2517. return Result.v;
  2518. #elif defined(_XM_ARM_NEON_INTRINSICS_)
  2519. #if defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64)
  2520. XMVECTOR vTemp = vpaddq_f32(V, V);
  2521. return vpaddq_f32(vTemp,vTemp);
  2522. #else
  2523. float32x2_t v1 = vget_low_f32(V);
  2524. float32x2_t v2 = vget_high_f32(V);
  2525. v1 = vadd_f32(v1, v2);
  2526. v1 = vpadd_f32(v1, v1);
  2527. return vcombine_f32(v1, v1);
  2528. #endif
  2529. #elif defined(_XM_SSE3_INTRINSICS_)
  2530. XMVECTOR vTemp = _mm_hadd_ps(V, V);
  2531. return _mm_hadd_ps(vTemp,vTemp);
  2532. #elif defined(_XM_SSE_INTRINSICS_)
  2533. XMVECTOR vTemp = XM_PERMUTE_PS(V, _MM_SHUFFLE(2, 3, 0, 1));
  2534. XMVECTOR vTemp2 = _mm_add_ps(V, vTemp);
  2535. vTemp = XM_PERMUTE_PS(vTemp2, _MM_SHUFFLE(1, 0, 3, 2));
  2536. return _mm_add_ps(vTemp, vTemp2);
  2537. #endif
  2538. }
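// Illustrative usage (not part of the library source): the horizontal sum is replicated
// into every lane, so a scalar total is read back with XMVectorGetX:
//
//      float total = XMVectorGetX(XMVectorSum(XMVectorSet(1.f, 2.f, 3.f, 4.f)));   // 10.f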
  2539. //------------------------------------------------------------------------------
  2540. inline XMVECTOR XM_CALLCONV XMVectorAddAngles
  2541. (
  2542. FXMVECTOR V1,
  2543. FXMVECTOR V2
  2544. )
  2545. {
  2546. #if defined(_XM_NO_INTRINSICS_)
  2547. const XMVECTOR Zero = XMVectorZero();
  2548. // Add the given angles together. If the range of V1 is such
  2549. // that -Pi <= V1 < Pi and the range of V2 is such that
  2550. // -2Pi <= V2 <= 2Pi, then the range of the resulting angle
  2551. // will be -Pi <= Result < Pi.
  2552. XMVECTOR Result = XMVectorAdd(V1, V2);
  2553. XMVECTOR Mask = XMVectorLess(Result, g_XMNegativePi.v);
  2554. XMVECTOR Offset = XMVectorSelect(Zero, g_XMTwoPi.v, Mask);
  2555. Mask = XMVectorGreaterOrEqual(Result, g_XMPi.v);
  2556. Offset = XMVectorSelect(Offset, g_XMNegativeTwoPi.v, Mask);
  2557. Result = XMVectorAdd(Result, Offset);
  2558. return Result;
  2559. #elif defined(_XM_ARM_NEON_INTRINSICS_)
  2560. // Adjust the angles
  2561. XMVECTOR vResult = vaddq_f32(V1,V2);
  2562. // Less than Pi?
  2563. uint32x4_t vOffset = vcltq_f32(vResult,g_XMNegativePi);
  2564. vOffset = vandq_u32(vOffset,g_XMTwoPi);
  2565. // Add 2Pi to all entries less than -Pi
  2566. vResult = vaddq_f32(vResult,vOffset);
  2567. // Greater than or equal to Pi?
  2568. vOffset = vcgeq_f32(vResult,g_XMPi);
  2569. vOffset = vandq_u32(vOffset,g_XMTwoPi);
  2570. // Sub 2Pi to all entries greater than Pi
  2571. vResult = vsubq_f32(vResult,vOffset);
  2572. return vResult;
  2573. #elif defined(_XM_SSE_INTRINSICS_)
  2574. // Adjust the angles
  2575. XMVECTOR vResult = _mm_add_ps(V1,V2);
  2576. // Less than Pi?
  2577. XMVECTOR vOffset = _mm_cmplt_ps(vResult,g_XMNegativePi);
  2578. vOffset = _mm_and_ps(vOffset,g_XMTwoPi);
  2579. // Add 2Pi to all entries less than -Pi
  2580. vResult = _mm_add_ps(vResult,vOffset);
  2581. // Greater than or equal to Pi?
  2582. vOffset = _mm_cmpge_ps(vResult,g_XMPi);
  2583. vOffset = _mm_and_ps(vOffset,g_XMTwoPi);
  2584. // Sub 2Pi to all entries greater than Pi
  2585. vResult = _mm_sub_ps(vResult,vOffset);
  2586. return vResult;
  2587. #endif
  2588. }
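// Illustrative result (not part of the library source): sums that leave [-Pi, Pi)
// are wrapped back into that range:
//
//      XMVectorAddAngles(XMVectorReplicate(XM_PI * 0.75f), XMVectorReplicate(XM_PIDIV2))
//          -> approximately -0.75f * XM_PI in every lane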
  2589. //------------------------------------------------------------------------------
  2590. inline XMVECTOR XM_CALLCONV XMVectorSubtract
  2591. (
  2592. FXMVECTOR V1,
  2593. FXMVECTOR V2
  2594. )
  2595. {
  2596. #if defined(_XM_NO_INTRINSICS_)
  2597. XMVECTORF32 Result = { { {
  2598. V1.vector4_f32[0] - V2.vector4_f32[0],
  2599. V1.vector4_f32[1] - V2.vector4_f32[1],
  2600. V1.vector4_f32[2] - V2.vector4_f32[2],
  2601. V1.vector4_f32[3] - V2.vector4_f32[3]
  2602. } } };
  2603. return Result.v;
  2604. #elif defined(_XM_ARM_NEON_INTRINSICS_)
  2605. return vsubq_f32( V1, V2 );
  2606. #elif defined(_XM_SSE_INTRINSICS_)
  2607. return _mm_sub_ps( V1, V2 );
  2608. #endif
  2609. }
  2610. //------------------------------------------------------------------------------
  2611. inline XMVECTOR XM_CALLCONV XMVectorSubtractAngles
  2612. (
  2613. FXMVECTOR V1,
  2614. FXMVECTOR V2
  2615. )
  2616. {
  2617. #if defined(_XM_NO_INTRINSICS_)
  2618. const XMVECTOR Zero = XMVectorZero();
  2619. // Subtract the given angles. If the range of V1 is such
  2620. // that -Pi <= V1 < Pi and the range of V2 is such that
  2621. // -2Pi <= V2 <= 2Pi, then the range of the resulting angle
  2622. // will be -Pi <= Result < Pi.
  2623. XMVECTOR Result = XMVectorSubtract(V1, V2);
  2624. XMVECTOR Mask = XMVectorLess(Result, g_XMNegativePi.v);
  2625. XMVECTOR Offset = XMVectorSelect(Zero, g_XMTwoPi.v, Mask);
  2626. Mask = XMVectorGreaterOrEqual(Result, g_XMPi.v);
  2627. Offset = XMVectorSelect(Offset, g_XMNegativeTwoPi.v, Mask);
  2628. Result = XMVectorAdd(Result, Offset);
  2629. return Result;
  2630. #elif defined(_XM_ARM_NEON_INTRINSICS_)
  2631. // Adjust the angles
  2632. XMVECTOR vResult = vsubq_f32(V1,V2);
  2633. // Less than Pi?
  2634. uint32x4_t vOffset = vcltq_f32(vResult,g_XMNegativePi);
  2635. vOffset = vandq_u32(vOffset,g_XMTwoPi);
  2636. // Add 2Pi to all entries less than -Pi
  2637. vResult = vaddq_f32(vResult,vOffset);
  2638. // Greater than or equal to Pi?
  2639. vOffset = vcgeq_f32(vResult,g_XMPi);
  2640. vOffset = vandq_u32(vOffset,g_XMTwoPi);
  2641. // Sub 2Pi to all entries greater than Pi
  2642. vResult = vsubq_f32(vResult,vOffset);
  2643. return vResult;
  2644. #elif defined(_XM_SSE_INTRINSICS_)
  2645. // Adjust the angles
  2646. XMVECTOR vResult = _mm_sub_ps(V1,V2);
  2647. // Less than Pi?
  2648. XMVECTOR vOffset = _mm_cmplt_ps(vResult,g_XMNegativePi);
  2649. vOffset = _mm_and_ps(vOffset,g_XMTwoPi);
  2650. // Add 2Pi to all entries less than -Pi
  2651. vResult = _mm_add_ps(vResult,vOffset);
  2652. // Greater than or equal to Pi?
  2653. vOffset = _mm_cmpge_ps(vResult,g_XMPi);
  2654. vOffset = _mm_and_ps(vOffset,g_XMTwoPi);
  2655. // Sub 2Pi to all entries greater than Pi
  2656. vResult = _mm_sub_ps(vResult,vOffset);
  2657. return vResult;
  2658. #endif
  2659. }
  2660. //------------------------------------------------------------------------------
  2661. inline XMVECTOR XM_CALLCONV XMVectorMultiply
  2662. (
  2663. FXMVECTOR V1,
  2664. FXMVECTOR V2
  2665. )
  2666. {
  2667. #if defined(_XM_NO_INTRINSICS_)
  2668. XMVECTORF32 Result = { { {
  2669. V1.vector4_f32[0] * V2.vector4_f32[0],
  2670. V1.vector4_f32[1] * V2.vector4_f32[1],
  2671. V1.vector4_f32[2] * V2.vector4_f32[2],
  2672. V1.vector4_f32[3] * V2.vector4_f32[3]
  2673. } } };
  2674. return Result.v;
  2675. #elif defined(_XM_ARM_NEON_INTRINSICS_)
  2676. return vmulq_f32( V1, V2 );
  2677. #elif defined(_XM_SSE_INTRINSICS_)
  2678. return _mm_mul_ps( V1, V2 );
  2679. #endif
  2680. }
  2681. //------------------------------------------------------------------------------
  2682. inline XMVECTOR XM_CALLCONV XMVectorMultiplyAdd
  2683. (
  2684. FXMVECTOR V1,
  2685. FXMVECTOR V2,
  2686. FXMVECTOR V3
  2687. )
  2688. {
  2689. #if defined(_XM_NO_INTRINSICS_)
  2690. XMVECTORF32 Result = { { {
  2691. V1.vector4_f32[0] * V2.vector4_f32[0] + V3.vector4_f32[0],
  2692. V1.vector4_f32[1] * V2.vector4_f32[1] + V3.vector4_f32[1],
  2693. V1.vector4_f32[2] * V2.vector4_f32[2] + V3.vector4_f32[2],
  2694. V1.vector4_f32[3] * V2.vector4_f32[3] + V3.vector4_f32[3]
  2695. } } };
  2696. return Result.v;
  2697. #elif defined(_XM_ARM_NEON_INTRINSICS_)
  2698. return vmlaq_f32( V3, V1, V2 );
  2699. #elif defined(_XM_FMA3_INTRINSICS_)
  2700. return _mm_fmadd_ps( V1, V2, V3 );
  2701. #elif defined(_XM_SSE_INTRINSICS_)
  2702. XMVECTOR vResult = _mm_mul_ps( V1, V2 );
  2703. return _mm_add_ps(vResult, V3 );
  2704. #endif
  2705. }
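// Illustrative usage (not part of the library source): V1 * V2 + V3 is the building
// block for a per-component lerp, a + t*(b - a):
//
//      XMVECTOR lerped = XMVectorMultiplyAdd(T, XMVectorSubtract(B, A), A);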
  2706. //------------------------------------------------------------------------------
  2707. inline XMVECTOR XM_CALLCONV XMVectorDivide
  2708. (
  2709. FXMVECTOR V1,
  2710. FXMVECTOR V2
  2711. )
  2712. {
  2713. #if defined(_XM_NO_INTRINSICS_)
  2714. XMVECTORF32 Result = { { {
  2715. V1.vector4_f32[0] / V2.vector4_f32[0],
  2716. V1.vector4_f32[1] / V2.vector4_f32[1],
  2717. V1.vector4_f32[2] / V2.vector4_f32[2],
  2718. V1.vector4_f32[3] / V2.vector4_f32[3]
  2719. } } };
  2720. return Result.v;
  2721. #elif defined(_XM_ARM_NEON_INTRINSICS_)
  2722. #if defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64)
  2723. return vdivq_f32( V1, V2 );
  2724. #else
  2725. // 2 iterations of Newton-Raphson refinement of reciprocal
  2726. float32x4_t Reciprocal = vrecpeq_f32(V2);
  2727. float32x4_t S = vrecpsq_f32( Reciprocal, V2 );
  2728. Reciprocal = vmulq_f32( S, Reciprocal );
  2729. S = vrecpsq_f32( Reciprocal, V2 );
  2730. Reciprocal = vmulq_f32( S, Reciprocal );
  2731. return vmulq_f32( V1, Reciprocal );
  2732. #endif
  2733. #elif defined(_XM_SSE_INTRINSICS_)
  2734. return _mm_div_ps( V1, V2 );
  2735. #endif
  2736. }
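// Note (not part of the library source): on ARM targets without vdivq_f32, the path
// above relies on vrecpsq_f32(x, d) computing 2 - d*x, so each step performs the
// Newton-Raphson reciprocal iteration x' = x * (2 - d*x), roughly doubling the number
// of accurate bits per refinement.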
  2737. //------------------------------------------------------------------------------
  2738. inline XMVECTOR XM_CALLCONV XMVectorNegativeMultiplySubtract
  2739. (
  2740. FXMVECTOR V1,
  2741. FXMVECTOR V2,
  2742. FXMVECTOR V3
  2743. )
  2744. {
  2745. #if defined(_XM_NO_INTRINSICS_)
  2746. XMVECTORF32 Result = { { {
  2747. V3.vector4_f32[0] - (V1.vector4_f32[0] * V2.vector4_f32[0]),
  2748. V3.vector4_f32[1] - (V1.vector4_f32[1] * V2.vector4_f32[1]),
  2749. V3.vector4_f32[2] - (V1.vector4_f32[2] * V2.vector4_f32[2]),
  2750. V3.vector4_f32[3] - (V1.vector4_f32[3] * V2.vector4_f32[3])
  2751. } } };
2752. return Result.v;
  2753. #elif defined(_XM_ARM_NEON_INTRINSICS_)
  2754. return vmlsq_f32( V3, V1, V2 );
  2755. #elif defined(_XM_FMA3_INTRINSICS_)
  2756. return _mm_fnmadd_ps(V1, V2, V3);
  2757. #elif defined(_XM_SSE_INTRINSICS_)
  2758. XMVECTOR R = _mm_mul_ps( V1, V2 );
  2759. return _mm_sub_ps( V3, R );
  2760. #endif
  2761. }
  2762. //------------------------------------------------------------------------------
  2763. inline XMVECTOR XM_CALLCONV XMVectorScale
  2764. (
  2765. FXMVECTOR V,
  2766. float ScaleFactor
  2767. )
  2768. {
  2769. #if defined(_XM_NO_INTRINSICS_)
  2770. XMVECTORF32 Result = { { {
  2771. V.vector4_f32[0] * ScaleFactor,
  2772. V.vector4_f32[1] * ScaleFactor,
  2773. V.vector4_f32[2] * ScaleFactor,
  2774. V.vector4_f32[3] * ScaleFactor
  2775. } } };
  2776. return Result.v;
  2777. #elif defined(_XM_ARM_NEON_INTRINSICS_)
  2778. return vmulq_n_f32( V, ScaleFactor );
  2779. #elif defined(_XM_SSE_INTRINSICS_)
  2780. XMVECTOR vResult = _mm_set_ps1(ScaleFactor);
  2781. return _mm_mul_ps(vResult,V);
  2782. #endif
  2783. }
  2784. //------------------------------------------------------------------------------
  2785. inline XMVECTOR XM_CALLCONV XMVectorReciprocalEst
  2786. (
  2787. FXMVECTOR V
  2788. )
  2789. {
  2790. #if defined(_XM_NO_INTRINSICS_)
  2791. XMVECTORF32 Result = { { {
  2792. 1.f / V.vector4_f32[0],
  2793. 1.f / V.vector4_f32[1],
  2794. 1.f / V.vector4_f32[2],
  2795. 1.f / V.vector4_f32[3]
  2796. } } };
  2797. return Result.v;
  2798. #elif defined(_XM_ARM_NEON_INTRINSICS_)
  2799. return vrecpeq_f32(V);
  2800. #elif defined(_XM_SSE_INTRINSICS_)
  2801. return _mm_rcp_ps(V);
  2802. #endif
  2803. }
  2804. //------------------------------------------------------------------------------
  2805. inline XMVECTOR XM_CALLCONV XMVectorReciprocal
  2806. (
  2807. FXMVECTOR V
  2808. )
  2809. {
  2810. #if defined(_XM_NO_INTRINSICS_)
  2811. XMVECTORF32 Result = { { {
  2812. 1.f / V.vector4_f32[0],
  2813. 1.f / V.vector4_f32[1],
  2814. 1.f / V.vector4_f32[2],
  2815. 1.f / V.vector4_f32[3]
  2816. } } };
  2817. return Result.v;
  2818. #elif defined(_XM_ARM_NEON_INTRINSICS_)
  2819. #if defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64)
  2820. float32x4_t one = vdupq_n_f32(1.0f);
  2821. return vdivq_f32(one,V);
  2822. #else
  2823. // 2 iterations of Newton-Raphson refinement
  2824. float32x4_t Reciprocal = vrecpeq_f32(V);
  2825. float32x4_t S = vrecpsq_f32( Reciprocal, V );
  2826. Reciprocal = vmulq_f32( S, Reciprocal );
  2827. S = vrecpsq_f32( Reciprocal, V );
  2828. return vmulq_f32( S, Reciprocal );
  2829. #endif
  2830. #elif defined(_XM_SSE_INTRINSICS_)
  2831. return _mm_div_ps(g_XMOne,V);
  2832. #endif
  2833. }
  2834. //------------------------------------------------------------------------------
  2835. // Return an estimated square root
  2836. inline XMVECTOR XM_CALLCONV XMVectorSqrtEst
  2837. (
  2838. FXMVECTOR V
  2839. )
  2840. {
  2841. #if defined(_XM_NO_INTRINSICS_)
  2842. XMVECTORF32 Result = { { {
  2843. sqrtf(V.vector4_f32[0]),
  2844. sqrtf(V.vector4_f32[1]),
  2845. sqrtf(V.vector4_f32[2]),
  2846. sqrtf(V.vector4_f32[3])
  2847. } } };
  2848. return Result.v;
  2849. #elif defined(_XM_ARM_NEON_INTRINSICS_)
2850. // 1 iteration of Newton-Raphson refinement of sqrt
  2851. float32x4_t S0 = vrsqrteq_f32(V);
  2852. float32x4_t P0 = vmulq_f32( V, S0 );
  2853. float32x4_t R0 = vrsqrtsq_f32( P0, S0 );
  2854. float32x4_t S1 = vmulq_f32( S0, R0 );
  2855. XMVECTOR VEqualsInfinity = XMVectorEqualInt(V, g_XMInfinity.v);
  2856. XMVECTOR VEqualsZero = XMVectorEqual(V, vdupq_n_f32(0) );
  2857. XMVECTOR Result = vmulq_f32( V, S1 );
  2858. XMVECTOR Select = XMVectorEqualInt(VEqualsInfinity, VEqualsZero);
  2859. return XMVectorSelect(V, Result, Select);
  2860. #elif defined(_XM_SSE_INTRINSICS_)
  2861. return _mm_sqrt_ps(V);
  2862. #endif
  2863. }
  2864. //------------------------------------------------------------------------------
  2865. inline XMVECTOR XM_CALLCONV XMVectorSqrt
  2866. (
  2867. FXMVECTOR V
  2868. )
  2869. {
  2870. #if defined(_XM_NO_INTRINSICS_)
  2871. XMVECTORF32 Result = { { {
  2872. sqrtf(V.vector4_f32[0]),
  2873. sqrtf(V.vector4_f32[1]),
  2874. sqrtf(V.vector4_f32[2]),
  2875. sqrtf(V.vector4_f32[3])
  2876. } } };
  2877. return Result.v;
  2878. #elif defined(_XM_ARM_NEON_INTRINSICS_)
2879. // 3 iterations of Newton-Raphson refinement of sqrt
  2880. float32x4_t S0 = vrsqrteq_f32(V);
  2881. float32x4_t P0 = vmulq_f32( V, S0 );
  2882. float32x4_t R0 = vrsqrtsq_f32( P0, S0 );
  2883. float32x4_t S1 = vmulq_f32( S0, R0 );
  2884. float32x4_t P1 = vmulq_f32( V, S1 );
  2885. float32x4_t R1 = vrsqrtsq_f32( P1, S1 );
  2886. float32x4_t S2 = vmulq_f32( S1, R1 );
  2887. float32x4_t P2 = vmulq_f32( V, S2 );
  2888. float32x4_t R2 = vrsqrtsq_f32( P2, S2 );
  2889. float32x4_t S3 = vmulq_f32( S2, R2 );
  2890. XMVECTOR VEqualsInfinity = XMVectorEqualInt(V, g_XMInfinity.v);
  2891. XMVECTOR VEqualsZero = XMVectorEqual(V, vdupq_n_f32(0) );
  2892. XMVECTOR Result = vmulq_f32( V, S3 );
  2893. XMVECTOR Select = XMVectorEqualInt(VEqualsInfinity, VEqualsZero);
  2894. return XMVectorSelect(V, Result, Select);
  2895. #elif defined(_XM_SSE_INTRINSICS_)
  2896. return _mm_sqrt_ps(V);
  2897. #endif
  2898. }
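// Illustrative note: the NEON paths above compute sqrt(v) as v * (1/sqrt(v)),
// refining the vrsqrteq_f32 estimate with steps s' = s * (3 - v*s*s) / 2
// (vrsqrtsq_f32 returns the (3 - v*s*s)/2 factor). The final select falls back to
// V itself when V is +0 or +infinity, where v * (1/sqrt(v)) would give 0*inf = NaN.
// Scalar sketch (approx_rsqrt is a hypothetical estimate helper):
//     float s = approx_rsqrt(v);
//     s = s * (3.0f - v * s * s) * 0.5f;   // repeated three times in XMVectorSqrt
//     float root = (v == 0.0f || isinf(v)) ? v : v * s;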
  2899. //------------------------------------------------------------------------------
  2900. inline XMVECTOR XM_CALLCONV XMVectorReciprocalSqrtEst
  2901. (
  2902. FXMVECTOR V
  2903. )
  2904. {
  2905. #if defined(_XM_NO_INTRINSICS_)
  2906. XMVECTORF32 Result = { { {
  2907. 1.f / sqrtf(V.vector4_f32[0]),
  2908. 1.f / sqrtf(V.vector4_f32[1]),
  2909. 1.f / sqrtf(V.vector4_f32[2]),
  2910. 1.f / sqrtf(V.vector4_f32[3])
  2911. } } };
  2912. return Result.v;
  2913. #elif defined(_XM_ARM_NEON_INTRINSICS_)
  2914. return vrsqrteq_f32(V);
  2915. #elif defined(_XM_SSE_INTRINSICS_)
  2916. return _mm_rsqrt_ps(V);
  2917. #endif
  2918. }
  2919. //------------------------------------------------------------------------------
  2920. inline XMVECTOR XM_CALLCONV XMVectorReciprocalSqrt
  2921. (
  2922. FXMVECTOR V
  2923. )
  2924. {
  2925. #if defined(_XM_NO_INTRINSICS_)
  2926. XMVECTORF32 Result = { { {
  2927. 1.f / sqrtf(V.vector4_f32[0]),
  2928. 1.f / sqrtf(V.vector4_f32[1]),
  2929. 1.f / sqrtf(V.vector4_f32[2]),
  2930. 1.f / sqrtf(V.vector4_f32[3])
  2931. } } };
2932. return Result.v;
  2933. #elif defined(_XM_ARM_NEON_INTRINSICS_)
2934. // 2 iterations of Newton-Raphson refinement of the reciprocal square root estimate
  2935. float32x4_t S0 = vrsqrteq_f32(V);
  2936. float32x4_t P0 = vmulq_f32( V, S0 );
  2937. float32x4_t R0 = vrsqrtsq_f32( P0, S0 );
  2938. float32x4_t S1 = vmulq_f32( S0, R0 );
  2939. float32x4_t P1 = vmulq_f32( V, S1 );
  2940. float32x4_t R1 = vrsqrtsq_f32( P1, S1 );
  2941. return vmulq_f32( S1, R1 );
  2942. #elif defined(_XM_SSE_INTRINSICS_)
  2943. XMVECTOR vResult = _mm_sqrt_ps(V);
  2944. vResult = _mm_div_ps(g_XMOne,vResult);
  2945. return vResult;
  2946. #endif
  2947. }
  2948. //------------------------------------------------------------------------------
  2949. inline XMVECTOR XM_CALLCONV XMVectorExp2
  2950. (
  2951. FXMVECTOR V
  2952. )
  2953. {
  2954. #if defined(_XM_NO_INTRINSICS_)
  2955. XMVECTORF32 Result = { { {
  2956. powf(2.0f, V.vector4_f32[0]),
  2957. powf(2.0f, V.vector4_f32[1]),
  2958. powf(2.0f, V.vector4_f32[2]),
  2959. powf(2.0f, V.vector4_f32[3])
  2960. } } };
2961. return Result.v;
  2962. #elif defined(_XM_ARM_NEON_INTRINSICS_)
  2963. int32x4_t itrunc = vcvtq_s32_f32(V);
  2964. float32x4_t ftrunc = vcvtq_f32_s32(itrunc);
  2965. float32x4_t y = vsubq_f32(V, ftrunc);
  2966. float32x4_t poly = vmlaq_f32( g_XMExpEst6, g_XMExpEst7, y );
  2967. poly = vmlaq_f32( g_XMExpEst5, poly, y );
  2968. poly = vmlaq_f32( g_XMExpEst4, poly, y );
  2969. poly = vmlaq_f32( g_XMExpEst3, poly, y );
  2970. poly = vmlaq_f32( g_XMExpEst2, poly, y );
  2971. poly = vmlaq_f32( g_XMExpEst1, poly, y );
  2972. poly = vmlaq_f32( g_XMOne, poly, y );
  2973. int32x4_t biased = vaddq_s32(itrunc, g_XMExponentBias);
  2974. biased = vshlq_n_s32(biased, 23);
  2975. float32x4_t result0 = XMVectorDivide(biased, poly);
  2976. biased = vaddq_s32(itrunc, g_XM253);
  2977. biased = vshlq_n_s32(biased, 23);
  2978. float32x4_t result1 = XMVectorDivide(biased, poly);
  2979. result1 = vmulq_f32(g_XMMinNormal.v, result1);
  2980. // Use selection to handle the cases
2981. // if (V is NaN) -> QNaN;
2982. // else if (V sign bit set)
2983. //    if (V > -150)
2984. //       if (V.exponent < -126) -> result1
2985. //       else -> result0
2986. //    else -> +0
2987. // else
2988. //    if (V < 128) -> result0
2989. //    else -> +inf
  2990. int32x4_t comp = vcltq_s32( V, g_XMBin128);
  2991. float32x4_t result2 = vbslq_f32( comp, result0, g_XMInfinity );
  2992. comp = vcltq_s32(itrunc, g_XMSubnormalExponent);
  2993. float32x4_t result3 = vbslq_f32( comp, result1, result0 );
  2994. comp = vcltq_s32(V, g_XMBinNeg150);
  2995. float32x4_t result4 = vbslq_f32( comp, result3, g_XMZero );
  2996. int32x4_t sign = vandq_s32(V, g_XMNegativeZero);
  2997. comp = vceqq_s32(sign, g_XMNegativeZero);
  2998. float32x4_t result5 = vbslq_f32( comp, result4, result2 );
  2999. int32x4_t t0 = vandq_s32(V, g_XMQNaNTest);
  3000. int32x4_t t1 = vandq_s32(V, g_XMInfinity);
  3001. t0 = vceqq_s32(t0, g_XMZero);
  3002. t1 = vceqq_s32(t1, g_XMInfinity);
  3003. int32x4_t isNaN = vbicq_s32( t1,t0);
  3004. float32x4_t vResult = vbslq_f32( isNaN, g_XMQNaN, result5 );
  3005. return vResult;
  3006. #elif defined(_XM_SSE_INTRINSICS_)
  3007. __m128i itrunc = _mm_cvttps_epi32(V);
  3008. __m128 ftrunc = _mm_cvtepi32_ps(itrunc);
  3009. __m128 y = _mm_sub_ps(V, ftrunc);
  3010. __m128 poly = _mm_mul_ps(g_XMExpEst7, y);
  3011. poly = _mm_add_ps(g_XMExpEst6, poly);
  3012. poly = _mm_mul_ps(poly, y);
  3013. poly = _mm_add_ps(g_XMExpEst5, poly);
  3014. poly = _mm_mul_ps(poly, y);
  3015. poly = _mm_add_ps(g_XMExpEst4, poly);
  3016. poly = _mm_mul_ps(poly, y);
  3017. poly = _mm_add_ps(g_XMExpEst3, poly);
  3018. poly = _mm_mul_ps(poly, y);
  3019. poly = _mm_add_ps(g_XMExpEst2, poly);
  3020. poly = _mm_mul_ps(poly, y);
  3021. poly = _mm_add_ps(g_XMExpEst1, poly);
  3022. poly = _mm_mul_ps(poly, y);
  3023. poly = _mm_add_ps(g_XMOne, poly);
  3024. __m128i biased = _mm_add_epi32(itrunc, g_XMExponentBias);
  3025. biased = _mm_slli_epi32(biased, 23);
  3026. __m128 result0 = _mm_div_ps(_mm_castsi128_ps(biased), poly);
  3027. biased = _mm_add_epi32(itrunc, g_XM253);
  3028. biased = _mm_slli_epi32(biased, 23);
  3029. __m128 result1 = _mm_div_ps(_mm_castsi128_ps(biased), poly);
  3030. result1 = _mm_mul_ps(g_XMMinNormal.v, result1);
  3031. // Use selection to handle the cases
3032. // if (V is NaN) -> QNaN;
3033. // else if (V sign bit set)
3034. //    if (V > -150)
3035. //       if (V.exponent < -126) -> result1
3036. //       else -> result0
3037. //    else -> +0
3038. // else
3039. //    if (V < 128) -> result0
3040. //    else -> +inf
  3041. __m128i comp = _mm_cmplt_epi32( _mm_castps_si128(V), g_XMBin128);
  3042. __m128i select0 = _mm_and_si128(comp, _mm_castps_si128(result0));
  3043. __m128i select1 = _mm_andnot_si128(comp, g_XMInfinity);
  3044. __m128i result2 = _mm_or_si128(select0, select1);
  3045. comp = _mm_cmplt_epi32(itrunc, g_XMSubnormalExponent);
  3046. select1 = _mm_and_si128(comp, _mm_castps_si128(result1));
  3047. select0 = _mm_andnot_si128(comp, _mm_castps_si128(result0));
  3048. __m128i result3 = _mm_or_si128(select0, select1);
  3049. comp = _mm_cmplt_epi32(_mm_castps_si128(V), g_XMBinNeg150);
  3050. select0 = _mm_and_si128(comp, result3);
  3051. select1 = _mm_andnot_si128(comp, g_XMZero);
  3052. __m128i result4 = _mm_or_si128(select0, select1);
  3053. __m128i sign = _mm_and_si128(_mm_castps_si128(V), g_XMNegativeZero);
  3054. comp = _mm_cmpeq_epi32(sign, g_XMNegativeZero);
  3055. select0 = _mm_and_si128(comp, result4);
  3056. select1 = _mm_andnot_si128(comp, result2);
  3057. __m128i result5 = _mm_or_si128(select0, select1);
  3058. __m128i t0 = _mm_and_si128(_mm_castps_si128(V), g_XMQNaNTest);
  3059. __m128i t1 = _mm_and_si128(_mm_castps_si128(V), g_XMInfinity);
  3060. t0 = _mm_cmpeq_epi32(t0, g_XMZero);
  3061. t1 = _mm_cmpeq_epi32(t1, g_XMInfinity);
  3062. __m128i isNaN = _mm_andnot_si128(t0, t1);
  3063. select0 = _mm_and_si128(isNaN, g_XMQNaN);
  3064. select1 = _mm_andnot_si128(isNaN, result5);
  3065. __m128i vResult = _mm_or_si128(select0, select1);
  3066. return _mm_castsi128_ps(vResult);
  3067. #endif
  3068. }
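// Sketch of the exp2 decomposition above (illustrative, not library code): split
// V into i = trunc(V) and f = V - i, build 2^i directly by writing (i + 127) into
// the IEEE-754 exponent field, and approximate 2^f as 1 / P(f), where P is the
// degree-7 g_XMExpEst polynomial (P(y) ~= 2^-y = 1 - y*ln2 + (y*ln2)^2/2 - ...).
// Scalar sketch of the in-range case, with from_bits a hypothetical bit-cast helper:
//     int   i = (int)v;  float f = v - (float)i;
//     float two_i = from_bits((uint32_t)(i + 127) << 23);
//     float r = two_i / P(f);          // ~= 2^v
// The remaining result1..result5 selection patches subnormal outputs, overflow to
// +inf, underflow to +0, and NaN inputs.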
  3069. //------------------------------------------------------------------------------
  3070. inline XMVECTOR XM_CALLCONV XMVectorExpE
  3071. (
  3072. FXMVECTOR V
  3073. )
  3074. {
  3075. #if defined(_XM_NO_INTRINSICS_)
  3076. XMVECTORF32 Result = { { {
  3077. expf(V.vector4_f32[0]),
  3078. expf(V.vector4_f32[1]),
  3079. expf(V.vector4_f32[2]),
  3080. expf(V.vector4_f32[3])
  3081. } } };
  3082. return Result.v;
  3083. #elif defined(_XM_ARM_NEON_INTRINSICS_)
3084. // expE(V) = exp2(V * log2(e))
  3085. float32x4_t Ve = vmulq_f32(g_XMLgE, V);
  3086. int32x4_t itrunc = vcvtq_s32_f32(Ve);
  3087. float32x4_t ftrunc = vcvtq_f32_s32(itrunc);
  3088. float32x4_t y = vsubq_f32(Ve, ftrunc);
  3089. float32x4_t poly = vmlaq_f32( g_XMExpEst6, g_XMExpEst7, y );
  3090. poly = vmlaq_f32( g_XMExpEst5, poly, y );
  3091. poly = vmlaq_f32( g_XMExpEst4, poly, y );
  3092. poly = vmlaq_f32( g_XMExpEst3, poly, y );
  3093. poly = vmlaq_f32( g_XMExpEst2, poly, y );
  3094. poly = vmlaq_f32( g_XMExpEst1, poly, y );
  3095. poly = vmlaq_f32( g_XMOne, poly, y );
  3096. int32x4_t biased = vaddq_s32(itrunc, g_XMExponentBias);
  3097. biased = vshlq_n_s32(biased, 23);
  3098. float32x4_t result0 = XMVectorDivide(biased, poly);
  3099. biased = vaddq_s32(itrunc, g_XM253);
  3100. biased = vshlq_n_s32(biased, 23);
  3101. float32x4_t result1 = XMVectorDivide(biased, poly);
  3102. result1 = vmulq_f32(g_XMMinNormal.v, result1);
  3103. // Use selection to handle the cases
3104. // if (V is NaN) -> QNaN;
3105. // else if (V sign bit set)
3106. //    if (V > -150)
3107. //       if (V.exponent < -126) -> result1
3108. //       else -> result0
3109. //    else -> +0
3110. // else
3111. //    if (V < 128) -> result0
3112. //    else -> +inf
  3113. int32x4_t comp = vcltq_s32( Ve, g_XMBin128);
  3114. float32x4_t result2 = vbslq_f32( comp, result0, g_XMInfinity );
  3115. comp = vcltq_s32(itrunc, g_XMSubnormalExponent);
  3116. float32x4_t result3 = vbslq_f32( comp, result1, result0 );
  3117. comp = vcltq_s32(Ve, g_XMBinNeg150);
  3118. float32x4_t result4 = vbslq_f32( comp, result3, g_XMZero );
  3119. int32x4_t sign = vandq_s32(Ve, g_XMNegativeZero);
  3120. comp = vceqq_s32(sign, g_XMNegativeZero);
  3121. float32x4_t result5 = vbslq_f32( comp, result4, result2 );
  3122. int32x4_t t0 = vandq_s32(Ve, g_XMQNaNTest);
  3123. int32x4_t t1 = vandq_s32(Ve, g_XMInfinity);
  3124. t0 = vceqq_s32(t0, g_XMZero);
  3125. t1 = vceqq_s32(t1, g_XMInfinity);
  3126. int32x4_t isNaN = vbicq_s32( t1,t0);
  3127. float32x4_t vResult = vbslq_f32( isNaN, g_XMQNaN, result5 );
  3128. return vResult;
  3129. #elif defined(_XM_SSE_INTRINSICS_)
3130. // expE(V) = exp2(V * log2(e))
  3131. __m128 Ve = _mm_mul_ps(g_XMLgE, V);
  3132. __m128i itrunc = _mm_cvttps_epi32(Ve);
  3133. __m128 ftrunc = _mm_cvtepi32_ps(itrunc);
  3134. __m128 y = _mm_sub_ps(Ve, ftrunc);
  3135. __m128 poly = _mm_mul_ps(g_XMExpEst7, y);
  3136. poly = _mm_add_ps(g_XMExpEst6, poly);
  3137. poly = _mm_mul_ps(poly, y);
  3138. poly = _mm_add_ps(g_XMExpEst5, poly);
  3139. poly = _mm_mul_ps(poly, y);
  3140. poly = _mm_add_ps(g_XMExpEst4, poly);
  3141. poly = _mm_mul_ps(poly, y);
  3142. poly = _mm_add_ps(g_XMExpEst3, poly);
  3143. poly = _mm_mul_ps(poly, y);
  3144. poly = _mm_add_ps(g_XMExpEst2, poly);
  3145. poly = _mm_mul_ps(poly, y);
  3146. poly = _mm_add_ps(g_XMExpEst1, poly);
  3147. poly = _mm_mul_ps(poly, y);
  3148. poly = _mm_add_ps(g_XMOne, poly);
  3149. __m128i biased = _mm_add_epi32(itrunc, g_XMExponentBias);
  3150. biased = _mm_slli_epi32(biased, 23);
  3151. __m128 result0 = _mm_div_ps(_mm_castsi128_ps(biased), poly);
  3152. biased = _mm_add_epi32(itrunc, g_XM253);
  3153. biased = _mm_slli_epi32(biased, 23);
  3154. __m128 result1 = _mm_div_ps(_mm_castsi128_ps(biased), poly);
  3155. result1 = _mm_mul_ps(g_XMMinNormal.v, result1);
  3156. // Use selection to handle the cases
3157. // if (V is NaN) -> QNaN;
3158. // else if (V sign bit set)
3159. //    if (V > -150)
3160. //       if (V.exponent < -126) -> result1
3161. //       else -> result0
3162. //    else -> +0
3163. // else
3164. //    if (V < 128) -> result0
3165. //    else -> +inf
  3166. __m128i comp = _mm_cmplt_epi32( _mm_castps_si128(Ve), g_XMBin128);
  3167. __m128i select0 = _mm_and_si128(comp, _mm_castps_si128(result0));
  3168. __m128i select1 = _mm_andnot_si128(comp, g_XMInfinity);
  3169. __m128i result2 = _mm_or_si128(select0, select1);
  3170. comp = _mm_cmplt_epi32(itrunc, g_XMSubnormalExponent);
  3171. select1 = _mm_and_si128(comp, _mm_castps_si128(result1));
  3172. select0 = _mm_andnot_si128(comp, _mm_castps_si128(result0));
  3173. __m128i result3 = _mm_or_si128(select0, select1);
  3174. comp = _mm_cmplt_epi32(_mm_castps_si128(Ve), g_XMBinNeg150);
  3175. select0 = _mm_and_si128(comp, result3);
  3176. select1 = _mm_andnot_si128(comp, g_XMZero);
  3177. __m128i result4 = _mm_or_si128(select0, select1);
  3178. __m128i sign = _mm_and_si128(_mm_castps_si128(Ve), g_XMNegativeZero);
  3179. comp = _mm_cmpeq_epi32(sign, g_XMNegativeZero);
  3180. select0 = _mm_and_si128(comp, result4);
  3181. select1 = _mm_andnot_si128(comp, result2);
  3182. __m128i result5 = _mm_or_si128(select0, select1);
  3183. __m128i t0 = _mm_and_si128(_mm_castps_si128(Ve), g_XMQNaNTest);
  3184. __m128i t1 = _mm_and_si128(_mm_castps_si128(Ve), g_XMInfinity);
  3185. t0 = _mm_cmpeq_epi32(t0, g_XMZero);
  3186. t1 = _mm_cmpeq_epi32(t1, g_XMInfinity);
  3187. __m128i isNaN = _mm_andnot_si128(t0, t1);
  3188. select0 = _mm_and_si128(isNaN, g_XMQNaN);
  3189. select1 = _mm_andnot_si128(isNaN, result5);
  3190. __m128i vResult = _mm_or_si128(select0, select1);
  3191. return _mm_castsi128_ps(vResult);
  3192. #endif
  3193. }
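// Note (illustrative): XMVectorExpE reuses the exp2 machinery through the identity
// e^V = 2^(V * log2(e)); g_XMLgE holds log2(e) ~= 1.442695. For example,
// e^1 = 2^1.442695 ~= 2.71828.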
  3194. //------------------------------------------------------------------------------
  3195. inline XMVECTOR XM_CALLCONV XMVectorExp
  3196. (
  3197. FXMVECTOR V
  3198. )
  3199. {
  3200. return XMVectorExp2(V);
  3201. }
  3202. //------------------------------------------------------------------------------
  3203. #if defined(_XM_SSE_INTRINSICS_)
  3204. namespace Internal
  3205. {
  3206. inline __m128i multi_sll_epi32(__m128i value, __m128i count)
  3207. {
  3208. __m128i v = _mm_shuffle_epi32(value, _MM_SHUFFLE(0,0,0,0));
  3209. __m128i c = _mm_shuffle_epi32(count, _MM_SHUFFLE(0,0,0,0));
  3210. c = _mm_and_si128(c, g_XMMaskX);
  3211. __m128i r0 = _mm_sll_epi32(v, c);
  3212. v = _mm_shuffle_epi32(value, _MM_SHUFFLE(1,1,1,1));
  3213. c = _mm_shuffle_epi32(count, _MM_SHUFFLE(1,1,1,1));
  3214. c = _mm_and_si128(c, g_XMMaskX);
  3215. __m128i r1 = _mm_sll_epi32(v, c);
  3216. v = _mm_shuffle_epi32(value, _MM_SHUFFLE(2,2,2,2));
  3217. c = _mm_shuffle_epi32(count, _MM_SHUFFLE(2,2,2,2));
  3218. c = _mm_and_si128(c, g_XMMaskX);
  3219. __m128i r2 = _mm_sll_epi32(v, c);
  3220. v = _mm_shuffle_epi32(value, _MM_SHUFFLE(3,3,3,3));
  3221. c = _mm_shuffle_epi32(count, _MM_SHUFFLE(3,3,3,3));
  3222. c = _mm_and_si128(c, g_XMMaskX);
  3223. __m128i r3 = _mm_sll_epi32(v, c);
  3224. // (r0,r0,r1,r1)
  3225. __m128 r01 = _mm_shuffle_ps(_mm_castsi128_ps(r0), _mm_castsi128_ps(r1), _MM_SHUFFLE(0,0,0,0));
  3226. // (r2,r2,r3,r3)
  3227. __m128 r23 = _mm_shuffle_ps(_mm_castsi128_ps(r2), _mm_castsi128_ps(r3), _MM_SHUFFLE(0,0,0,0));
  3228. // (r0,r1,r2,r3)
  3229. __m128 result = _mm_shuffle_ps(r01, r23, _MM_SHUFFLE(2,0,2,0));
  3230. return _mm_castps_si128(result);
  3231. }
  3232. inline __m128i multi_srl_epi32(__m128i value, __m128i count)
  3233. {
  3234. __m128i v = _mm_shuffle_epi32(value, _MM_SHUFFLE(0,0,0,0));
  3235. __m128i c = _mm_shuffle_epi32(count, _MM_SHUFFLE(0,0,0,0));
  3236. c = _mm_and_si128(c, g_XMMaskX);
  3237. __m128i r0 = _mm_srl_epi32(v, c);
  3238. v = _mm_shuffle_epi32(value, _MM_SHUFFLE(1,1,1,1));
  3239. c = _mm_shuffle_epi32(count, _MM_SHUFFLE(1,1,1,1));
  3240. c = _mm_and_si128(c, g_XMMaskX);
  3241. __m128i r1 = _mm_srl_epi32(v, c);
  3242. v = _mm_shuffle_epi32(value, _MM_SHUFFLE(2,2,2,2));
  3243. c = _mm_shuffle_epi32(count, _MM_SHUFFLE(2,2,2,2));
  3244. c = _mm_and_si128(c, g_XMMaskX);
  3245. __m128i r2 = _mm_srl_epi32(v, c);
  3246. v = _mm_shuffle_epi32(value, _MM_SHUFFLE(3,3,3,3));
  3247. c = _mm_shuffle_epi32(count, _MM_SHUFFLE(3,3,3,3));
  3248. c = _mm_and_si128(c, g_XMMaskX);
  3249. __m128i r3 = _mm_srl_epi32(v, c);
  3250. // (r0,r0,r1,r1)
  3251. __m128 r01 = _mm_shuffle_ps(_mm_castsi128_ps(r0), _mm_castsi128_ps(r1), _MM_SHUFFLE(0,0,0,0));
  3252. // (r2,r2,r3,r3)
  3253. __m128 r23 = _mm_shuffle_ps(_mm_castsi128_ps(r2), _mm_castsi128_ps(r3), _MM_SHUFFLE(0,0,0,0));
  3254. // (r0,r1,r2,r3)
  3255. __m128 result = _mm_shuffle_ps(r01, r23, _MM_SHUFFLE(2,0,2,0));
  3256. return _mm_castps_si128(result);
  3257. }
  3258. inline __m128i GetLeadingBit(const __m128i value)
  3259. {
  3260. static const XMVECTORI32 g_XM0000FFFF = { { { 0x0000FFFF, 0x0000FFFF, 0x0000FFFF, 0x0000FFFF } } };
  3261. static const XMVECTORI32 g_XM000000FF = { { { 0x000000FF, 0x000000FF, 0x000000FF, 0x000000FF } } };
  3262. static const XMVECTORI32 g_XM0000000F = { { { 0x0000000F, 0x0000000F, 0x0000000F, 0x0000000F } } };
  3263. static const XMVECTORI32 g_XM00000003 = { { { 0x00000003, 0x00000003, 0x00000003, 0x00000003 } } };
  3264. __m128i v = value, r, c, b, s;
  3265. c = _mm_cmpgt_epi32(v, g_XM0000FFFF); // c = (v > 0xFFFF)
  3266. b = _mm_srli_epi32(c, 31); // b = (c ? 1 : 0)
  3267. r = _mm_slli_epi32(b, 4); // r = (b << 4)
  3268. v = multi_srl_epi32(v, r); // v = (v >> r)
  3269. c = _mm_cmpgt_epi32(v, g_XM000000FF); // c = (v > 0xFF)
  3270. b = _mm_srli_epi32(c, 31); // b = (c ? 1 : 0)
  3271. s = _mm_slli_epi32(b, 3); // s = (b << 3)
  3272. v = multi_srl_epi32(v, s); // v = (v >> s)
  3273. r = _mm_or_si128(r, s); // r = (r | s)
  3274. c = _mm_cmpgt_epi32(v, g_XM0000000F); // c = (v > 0xF)
  3275. b = _mm_srli_epi32(c, 31); // b = (c ? 1 : 0)
  3276. s = _mm_slli_epi32(b, 2); // s = (b << 2)
  3277. v = multi_srl_epi32(v, s); // v = (v >> s)
  3278. r = _mm_or_si128(r, s); // r = (r | s)
  3279. c = _mm_cmpgt_epi32(v, g_XM00000003); // c = (v > 0x3)
  3280. b = _mm_srli_epi32(c, 31); // b = (c ? 1 : 0)
  3281. s = _mm_slli_epi32(b, 1); // s = (b << 1)
  3282. v = multi_srl_epi32(v, s); // v = (v >> s)
  3283. r = _mm_or_si128(r, s); // r = (r | s)
  3284. s = _mm_srli_epi32(v, 1);
  3285. r = _mm_or_si128(r, s);
  3286. return r;
  3287. }
  3288. } // namespace Internal
  3289. #endif // _XM_SSE_INTRINSICS_
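// Scalar reference for Internal::GetLeadingBit above (illustrative): per lane it
// returns the index of the highest set bit via a shift-and-test binary search,
// e.g. GetLeadingBit(0x00500000) -> 22. Equivalent scalar code for one lane:
//     uint32_t v = value;  int r = 0;
//     if (v > 0xFFFF) { v >>= 16; r |= 16; }
//     if (v > 0xFF)   { v >>= 8;  r |= 8;  }
//     if (v > 0xF)    { v >>= 4;  r |= 4;  }
//     if (v > 0x3)    { v >>= 2;  r |= 2;  }
//     r |= (int)(v >> 1);   // v is now 0..3; its top bit is the final index bit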
  3290. #if defined(_XM_ARM_NEON_INTRINSICS_)
  3291. namespace Internal
  3292. {
  3293. inline int32x4_t GetLeadingBit(const int32x4_t value)
  3294. {
  3295. static const XMVECTORI32 g_XM0000FFFF = { { { 0x0000FFFF, 0x0000FFFF, 0x0000FFFF, 0x0000FFFF } } };
  3296. static const XMVECTORI32 g_XM000000FF = { { { 0x000000FF, 0x000000FF, 0x000000FF, 0x000000FF } } };
  3297. static const XMVECTORI32 g_XM0000000F = { { { 0x0000000F, 0x0000000F, 0x0000000F, 0x0000000F } } };
  3298. static const XMVECTORI32 g_XM00000003 = { { { 0x00000003, 0x00000003, 0x00000003, 0x00000003 } } };
  3299. int32x4_t v = value, r, c, b, s;
  3300. c = vcgtq_s32(v, g_XM0000FFFF); // c = (v > 0xFFFF)
  3301. b = vshrq_n_u32(c, 31); // b = (c ? 1 : 0)
  3302. r = vshlq_n_s32(b, 4); // r = (b << 4)
  3303. r = vnegq_s32( r );
  3304. v = vshlq_u32( v, r ); // v = (v >> r)
  3305. c = vcgtq_s32(v, g_XM000000FF); // c = (v > 0xFF)
  3306. b = vshrq_n_u32(c, 31); // b = (c ? 1 : 0)
  3307. s = vshlq_n_s32(b, 3); // s = (b << 3)
  3308. s = vnegq_s32( s );
  3309. v = vshlq_u32(v, s); // v = (v >> s)
  3310. r = vorrq_s32(r, s); // r = (r | s)
  3311. c = vcgtq_s32(v, g_XM0000000F); // c = (v > 0xF)
  3312. b = vshrq_n_u32(c, 31); // b = (c ? 1 : 0)
  3313. s = vshlq_n_s32(b, 2); // s = (b << 2)
  3314. s = vnegq_s32( s );
  3315. v = vshlq_u32(v, s); // v = (v >> s)
  3316. r = vorrq_s32(r, s); // r = (r | s)
  3317. c = vcgtq_s32(v, g_XM00000003); // c = (v > 0x3)
  3318. b = vshrq_n_u32(c, 31); // b = (c ? 1 : 0)
  3319. s = vshlq_n_s32(b, 1); // s = (b << 1)
  3320. s = vnegq_s32( s );
  3321. v = vshlq_u32(v, s); // v = (v >> s)
  3322. r = vorrq_s32(r, s); // r = (r | s)
  3323. s = vshrq_n_u32(v, 1);
  3324. r = vorrq_s32(r, s);
  3325. return r;
  3326. }
  3327. } // namespace Internal
  3328. #endif
  3329. //------------------------------------------------------------------------------
  3330. inline XMVECTOR XM_CALLCONV XMVectorLog2
  3331. (
  3332. FXMVECTOR V
  3333. )
  3334. {
  3335. #if defined(_XM_NO_INTRINSICS_)
  3336. const float fScale = 1.4426950f; // (1.0f / logf(2.0f));
  3337. XMVECTORF32 Result = { { {
  3338. logf(V.vector4_f32[0])*fScale,
  3339. logf(V.vector4_f32[1])*fScale,
  3340. logf(V.vector4_f32[2])*fScale,
  3341. logf(V.vector4_f32[3])*fScale
  3342. } } };
  3343. return Result.v;
  3344. #elif defined(_XM_ARM_NEON_INTRINSICS_)
  3345. int32x4_t rawBiased = vandq_s32(V, g_XMInfinity);
  3346. int32x4_t trailing = vandq_s32(V, g_XMQNaNTest);
  3347. int32x4_t isExponentZero = vceqq_s32(g_XMZero, rawBiased);
  3348. // Compute exponent and significand for normals.
  3349. int32x4_t biased = vshrq_n_u32(rawBiased, 23);
  3350. int32x4_t exponentNor = vsubq_s32(biased, g_XMExponentBias);
  3351. int32x4_t trailingNor = trailing;
  3352. // Compute exponent and significand for subnormals.
  3353. int32x4_t leading = Internal::GetLeadingBit(trailing);
  3354. int32x4_t shift = vsubq_s32(g_XMNumTrailing, leading);
  3355. int32x4_t exponentSub = vsubq_s32(g_XMSubnormalExponent, shift);
  3356. int32x4_t trailingSub = vshlq_u32(trailing, shift);
  3357. trailingSub = vandq_s32(trailingSub, g_XMQNaNTest);
  3358. int32x4_t e = vbslq_f32( isExponentZero, exponentSub, exponentNor );
  3359. int32x4_t t = vbslq_f32( isExponentZero, trailingSub, trailingNor );
  3360. // Compute the approximation.
  3361. int32x4_t tmp = vorrq_s32(g_XMOne, t);
  3362. float32x4_t y = vsubq_f32(tmp, g_XMOne);
  3363. float32x4_t log2 = vmlaq_f32( g_XMLogEst6, g_XMLogEst7, y );
  3364. log2 = vmlaq_f32( g_XMLogEst5, log2, y );
  3365. log2 = vmlaq_f32( g_XMLogEst4, log2, y );
  3366. log2 = vmlaq_f32( g_XMLogEst3, log2, y );
  3367. log2 = vmlaq_f32( g_XMLogEst2, log2, y );
  3368. log2 = vmlaq_f32( g_XMLogEst1, log2, y );
  3369. log2 = vmlaq_f32( g_XMLogEst0, log2, y );
  3370. log2 = vmlaq_f32( vcvtq_f32_s32(e), log2, y );
3371. // if (x is NaN) -> QNaN
3372. // else if (V is positive)
3373. //    if (V is infinite) -> +inf
3374. //    else -> log2(V)
3375. // else
3376. //    if (V is zero) -> -inf
3377. //    else -> -QNaN
  3378. int32x4_t isInfinite = vandq_s32((V), g_XMAbsMask);
  3379. isInfinite = vceqq_s32(isInfinite, g_XMInfinity);
  3380. int32x4_t isGreaterZero = vcgtq_s32((V), g_XMZero);
  3381. int32x4_t isNotFinite = vcgtq_s32((V), g_XMInfinity);
  3382. int32x4_t isPositive = vbicq_s32( isGreaterZero,isNotFinite);
  3383. int32x4_t isZero = vandq_s32((V), g_XMAbsMask);
  3384. isZero = vceqq_s32(isZero, g_XMZero);
  3385. int32x4_t t0 = vandq_s32((V), g_XMQNaNTest);
  3386. int32x4_t t1 = vandq_s32((V), g_XMInfinity);
  3387. t0 = vceqq_s32(t0, g_XMZero);
  3388. t1 = vceqq_s32(t1, g_XMInfinity);
  3389. int32x4_t isNaN = vbicq_s32( t1,t0);
  3390. float32x4_t result = vbslq_f32( isInfinite, g_XMInfinity, log2 );
  3391. tmp = vbslq_f32( isZero, g_XMNegInfinity, g_XMNegQNaN );
  3392. result = vbslq_f32(isPositive, result, tmp);
  3393. result = vbslq_f32(isNaN, g_XMQNaN, result );
  3394. return result;
  3395. #elif defined(_XM_SSE_INTRINSICS_)
  3396. __m128i rawBiased = _mm_and_si128(_mm_castps_si128(V), g_XMInfinity);
  3397. __m128i trailing = _mm_and_si128(_mm_castps_si128(V), g_XMQNaNTest);
  3398. __m128i isExponentZero = _mm_cmpeq_epi32(g_XMZero, rawBiased);
  3399. // Compute exponent and significand for normals.
  3400. __m128i biased = _mm_srli_epi32(rawBiased, 23);
  3401. __m128i exponentNor = _mm_sub_epi32(biased, g_XMExponentBias);
  3402. __m128i trailingNor = trailing;
  3403. // Compute exponent and significand for subnormals.
  3404. __m128i leading = Internal::GetLeadingBit(trailing);
  3405. __m128i shift = _mm_sub_epi32(g_XMNumTrailing, leading);
  3406. __m128i exponentSub = _mm_sub_epi32(g_XMSubnormalExponent, shift);
  3407. __m128i trailingSub = Internal::multi_sll_epi32(trailing, shift);
  3408. trailingSub = _mm_and_si128(trailingSub, g_XMQNaNTest);
  3409. __m128i select0 = _mm_and_si128(isExponentZero, exponentSub);
  3410. __m128i select1 = _mm_andnot_si128(isExponentZero, exponentNor);
  3411. __m128i e = _mm_or_si128(select0, select1);
  3412. select0 = _mm_and_si128(isExponentZero, trailingSub);
  3413. select1 = _mm_andnot_si128(isExponentZero, trailingNor);
  3414. __m128i t = _mm_or_si128(select0, select1);
  3415. // Compute the approximation.
  3416. __m128i tmp = _mm_or_si128(g_XMOne, t);
  3417. __m128 y = _mm_sub_ps(_mm_castsi128_ps(tmp), g_XMOne);
  3418. __m128 log2 = _mm_mul_ps(g_XMLogEst7, y);
  3419. log2 = _mm_add_ps(g_XMLogEst6, log2);
  3420. log2 = _mm_mul_ps(log2, y);
  3421. log2 = _mm_add_ps(g_XMLogEst5, log2);
  3422. log2 = _mm_mul_ps(log2, y);
  3423. log2 = _mm_add_ps(g_XMLogEst4, log2);
  3424. log2 = _mm_mul_ps(log2, y);
  3425. log2 = _mm_add_ps(g_XMLogEst3, log2);
  3426. log2 = _mm_mul_ps(log2, y);
  3427. log2 = _mm_add_ps(g_XMLogEst2, log2);
  3428. log2 = _mm_mul_ps(log2, y);
  3429. log2 = _mm_add_ps(g_XMLogEst1, log2);
  3430. log2 = _mm_mul_ps(log2, y);
  3431. log2 = _mm_add_ps(g_XMLogEst0, log2);
  3432. log2 = _mm_mul_ps(log2, y);
  3433. log2 = _mm_add_ps(log2, _mm_cvtepi32_ps(e));
3434. // if (x is NaN) -> QNaN
3435. // else if (V is positive)
3436. //    if (V is infinite) -> +inf
3437. //    else -> log2(V)
3438. // else
3439. //    if (V is zero) -> -inf
3440. //    else -> -QNaN
  3441. __m128i isInfinite = _mm_and_si128(_mm_castps_si128(V), g_XMAbsMask);
  3442. isInfinite = _mm_cmpeq_epi32(isInfinite, g_XMInfinity);
  3443. __m128i isGreaterZero = _mm_cmpgt_epi32(_mm_castps_si128(V), g_XMZero);
  3444. __m128i isNotFinite = _mm_cmpgt_epi32(_mm_castps_si128(V), g_XMInfinity);
  3445. __m128i isPositive = _mm_andnot_si128(isNotFinite, isGreaterZero);
  3446. __m128i isZero = _mm_and_si128(_mm_castps_si128(V), g_XMAbsMask);
  3447. isZero = _mm_cmpeq_epi32(isZero, g_XMZero);
  3448. __m128i t0 = _mm_and_si128(_mm_castps_si128(V), g_XMQNaNTest);
  3449. __m128i t1 = _mm_and_si128(_mm_castps_si128(V), g_XMInfinity);
  3450. t0 = _mm_cmpeq_epi32(t0, g_XMZero);
  3451. t1 = _mm_cmpeq_epi32(t1, g_XMInfinity);
  3452. __m128i isNaN = _mm_andnot_si128(t0, t1);
  3453. select0 = _mm_and_si128(isInfinite, g_XMInfinity);
  3454. select1 = _mm_andnot_si128(isInfinite, _mm_castps_si128(log2));
  3455. __m128i result = _mm_or_si128(select0, select1);
  3456. select0 = _mm_and_si128(isZero, g_XMNegInfinity);
  3457. select1 = _mm_andnot_si128(isZero, g_XMNegQNaN);
  3458. tmp = _mm_or_si128(select0, select1);
  3459. select0 = _mm_and_si128(isPositive, result);
  3460. select1 = _mm_andnot_si128(isPositive, tmp);
  3461. result = _mm_or_si128(select0, select1);
  3462. select0 = _mm_and_si128(isNaN, g_XMQNaN);
  3463. select1 = _mm_andnot_si128(isNaN, result);
  3464. result = _mm_or_si128(select0, select1);
  3465. return _mm_castsi128_ps(result);
  3466. #endif
  3467. }
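// Sketch of the decomposition above (illustrative): a positive normal float is
// V = 2^e * (1 + m) with m in [0, 1), so log2(V) = e + log2(1 + m). The code pulls
// e out of the exponent bits, rebuilds (1 + m) by OR-ing the trailing significand
// into the bit pattern of 1.0f, and evaluates the g_XMLogEst minimax polynomial in
// y = m for log2(1 + y). Rough scalar equivalent (bits/from_bits are hypothetical
// bit-cast helpers and log2_poly stands for the minimax fit):
//     int   e = (int)((bits(v) >> 23) & 0xFF) - 127;
//     float m = from_bits((bits(v) & 0x007FFFFF) | bits(1.0f)) - 1.0f;
//     float r = (float)e + log2_poly(m);
// Subnormal inputs are renormalized first via GetLeadingBit; zero, negative,
// infinite and NaN inputs are patched in by the final mask selection.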
  3468. //------------------------------------------------------------------------------
  3469. inline XMVECTOR XM_CALLCONV XMVectorLogE
  3470. (
  3471. FXMVECTOR V
  3472. )
  3473. {
  3474. #if defined(_XM_NO_INTRINSICS_)
  3475. XMVECTORF32 Result = { { {
  3476. logf(V.vector4_f32[0]),
  3477. logf(V.vector4_f32[1]),
  3478. logf(V.vector4_f32[2]),
  3479. logf(V.vector4_f32[3])
  3480. } } };
  3481. return Result.v;
  3482. #elif defined(_XM_ARM_NEON_INTRINSICS_)
  3483. int32x4_t rawBiased = vandq_s32(V, g_XMInfinity);
  3484. int32x4_t trailing = vandq_s32(V, g_XMQNaNTest);
  3485. int32x4_t isExponentZero = vceqq_s32(g_XMZero, rawBiased);
  3486. // Compute exponent and significand for normals.
  3487. int32x4_t biased = vshrq_n_u32(rawBiased, 23);
  3488. int32x4_t exponentNor = vsubq_s32(biased, g_XMExponentBias);
  3489. int32x4_t trailingNor = trailing;
  3490. // Compute exponent and significand for subnormals.
  3491. int32x4_t leading = Internal::GetLeadingBit(trailing);
  3492. int32x4_t shift = vsubq_s32(g_XMNumTrailing, leading);
  3493. int32x4_t exponentSub = vsubq_s32(g_XMSubnormalExponent, shift);
  3494. int32x4_t trailingSub = vshlq_u32(trailing, shift);
  3495. trailingSub = vandq_s32(trailingSub, g_XMQNaNTest);
  3496. int32x4_t e = vbslq_f32( isExponentZero, exponentSub, exponentNor );
  3497. int32x4_t t = vbslq_f32( isExponentZero, trailingSub, trailingNor );
  3498. // Compute the approximation.
  3499. int32x4_t tmp = vorrq_s32(g_XMOne, t);
  3500. float32x4_t y = vsubq_f32(tmp, g_XMOne);
  3501. float32x4_t log2 = vmlaq_f32( g_XMLogEst6, g_XMLogEst7, y );
  3502. log2 = vmlaq_f32( g_XMLogEst5, log2, y );
  3503. log2 = vmlaq_f32( g_XMLogEst4, log2, y );
  3504. log2 = vmlaq_f32( g_XMLogEst3, log2, y );
  3505. log2 = vmlaq_f32( g_XMLogEst2, log2, y );
  3506. log2 = vmlaq_f32( g_XMLogEst1, log2, y );
  3507. log2 = vmlaq_f32( g_XMLogEst0, log2, y );
  3508. log2 = vmlaq_f32( vcvtq_f32_s32(e), log2, y );
  3509. log2 = vmulq_f32(g_XMInvLgE, log2);
3510. // if (x is NaN) -> QNaN
3511. // else if (V is positive)
3512. //    if (V is infinite) -> +inf
3513. //    else -> logE(V)
3514. // else
3515. //    if (V is zero) -> -inf
3516. //    else -> -QNaN
  3517. int32x4_t isInfinite = vandq_s32((V), g_XMAbsMask);
  3518. isInfinite = vceqq_s32(isInfinite, g_XMInfinity);
  3519. int32x4_t isGreaterZero = vcgtq_s32((V), g_XMZero);
  3520. int32x4_t isNotFinite = vcgtq_s32((V), g_XMInfinity);
  3521. int32x4_t isPositive = vbicq_s32( isGreaterZero,isNotFinite);
  3522. int32x4_t isZero = vandq_s32((V), g_XMAbsMask);
  3523. isZero = vceqq_s32(isZero, g_XMZero);
  3524. int32x4_t t0 = vandq_s32((V), g_XMQNaNTest);
  3525. int32x4_t t1 = vandq_s32((V), g_XMInfinity);
  3526. t0 = vceqq_s32(t0, g_XMZero);
  3527. t1 = vceqq_s32(t1, g_XMInfinity);
  3528. int32x4_t isNaN = vbicq_s32( t1,t0);
  3529. float32x4_t result = vbslq_f32( isInfinite, g_XMInfinity, log2 );
  3530. tmp = vbslq_f32( isZero, g_XMNegInfinity, g_XMNegQNaN );
  3531. result = vbslq_f32(isPositive, result, tmp);
  3532. result = vbslq_f32(isNaN, g_XMQNaN, result );
  3533. return result;
  3534. #elif defined(_XM_SSE_INTRINSICS_)
  3535. __m128i rawBiased = _mm_and_si128(_mm_castps_si128(V), g_XMInfinity);
  3536. __m128i trailing = _mm_and_si128(_mm_castps_si128(V), g_XMQNaNTest);
  3537. __m128i isExponentZero = _mm_cmpeq_epi32(g_XMZero, rawBiased);
  3538. // Compute exponent and significand for normals.
  3539. __m128i biased = _mm_srli_epi32(rawBiased, 23);
  3540. __m128i exponentNor = _mm_sub_epi32(biased, g_XMExponentBias);
  3541. __m128i trailingNor = trailing;
  3542. // Compute exponent and significand for subnormals.
  3543. __m128i leading = Internal::GetLeadingBit(trailing);
  3544. __m128i shift = _mm_sub_epi32(g_XMNumTrailing, leading);
  3545. __m128i exponentSub = _mm_sub_epi32(g_XMSubnormalExponent, shift);
  3546. __m128i trailingSub = Internal::multi_sll_epi32(trailing, shift);
  3547. trailingSub = _mm_and_si128(trailingSub, g_XMQNaNTest);
  3548. __m128i select0 = _mm_and_si128(isExponentZero, exponentSub);
  3549. __m128i select1 = _mm_andnot_si128(isExponentZero, exponentNor);
  3550. __m128i e = _mm_or_si128(select0, select1);
  3551. select0 = _mm_and_si128(isExponentZero, trailingSub);
  3552. select1 = _mm_andnot_si128(isExponentZero, trailingNor);
  3553. __m128i t = _mm_or_si128(select0, select1);
  3554. // Compute the approximation.
  3555. __m128i tmp = _mm_or_si128(g_XMOne, t);
  3556. __m128 y = _mm_sub_ps(_mm_castsi128_ps(tmp), g_XMOne);
  3557. __m128 log2 = _mm_mul_ps(g_XMLogEst7, y);
  3558. log2 = _mm_add_ps(g_XMLogEst6, log2);
  3559. log2 = _mm_mul_ps(log2, y);
  3560. log2 = _mm_add_ps(g_XMLogEst5, log2);
  3561. log2 = _mm_mul_ps(log2, y);
  3562. log2 = _mm_add_ps(g_XMLogEst4, log2);
  3563. log2 = _mm_mul_ps(log2, y);
  3564. log2 = _mm_add_ps(g_XMLogEst3, log2);
  3565. log2 = _mm_mul_ps(log2, y);
  3566. log2 = _mm_add_ps(g_XMLogEst2, log2);
  3567. log2 = _mm_mul_ps(log2, y);
  3568. log2 = _mm_add_ps(g_XMLogEst1, log2);
  3569. log2 = _mm_mul_ps(log2, y);
  3570. log2 = _mm_add_ps(g_XMLogEst0, log2);
  3571. log2 = _mm_mul_ps(log2, y);
  3572. log2 = _mm_add_ps(log2, _mm_cvtepi32_ps(e));
  3573. log2 = _mm_mul_ps(g_XMInvLgE, log2);
3574. // if (x is NaN) -> QNaN
3575. // else if (V is positive)
3576. //    if (V is infinite) -> +inf
3577. //    else -> logE(V)
3578. // else
3579. //    if (V is zero) -> -inf
3580. //    else -> -QNaN
  3581. __m128i isInfinite = _mm_and_si128(_mm_castps_si128(V), g_XMAbsMask);
  3582. isInfinite = _mm_cmpeq_epi32(isInfinite, g_XMInfinity);
  3583. __m128i isGreaterZero = _mm_cmpgt_epi32(_mm_castps_si128(V), g_XMZero);
  3584. __m128i isNotFinite = _mm_cmpgt_epi32(_mm_castps_si128(V), g_XMInfinity);
  3585. __m128i isPositive = _mm_andnot_si128(isNotFinite, isGreaterZero);
  3586. __m128i isZero = _mm_and_si128(_mm_castps_si128(V), g_XMAbsMask);
  3587. isZero = _mm_cmpeq_epi32(isZero, g_XMZero);
  3588. __m128i t0 = _mm_and_si128(_mm_castps_si128(V), g_XMQNaNTest);
  3589. __m128i t1 = _mm_and_si128(_mm_castps_si128(V), g_XMInfinity);
  3590. t0 = _mm_cmpeq_epi32(t0, g_XMZero);
  3591. t1 = _mm_cmpeq_epi32(t1, g_XMInfinity);
  3592. __m128i isNaN = _mm_andnot_si128(t0, t1);
  3593. select0 = _mm_and_si128(isInfinite, g_XMInfinity);
  3594. select1 = _mm_andnot_si128(isInfinite, _mm_castps_si128(log2));
  3595. __m128i result = _mm_or_si128(select0, select1);
  3596. select0 = _mm_and_si128(isZero, g_XMNegInfinity);
  3597. select1 = _mm_andnot_si128(isZero, g_XMNegQNaN);
  3598. tmp = _mm_or_si128(select0, select1);
  3599. select0 = _mm_and_si128(isPositive, result);
  3600. select1 = _mm_andnot_si128(isPositive, tmp);
  3601. result = _mm_or_si128(select0, select1);
  3602. select0 = _mm_and_si128(isNaN, g_XMQNaN);
  3603. select1 = _mm_andnot_si128(isNaN, result);
  3604. result = _mm_or_si128(select0, select1);
  3605. return _mm_castsi128_ps(result);
  3606. #endif
  3607. }
  3608. //------------------------------------------------------------------------------
  3609. inline XMVECTOR XM_CALLCONV XMVectorLog
  3610. (
  3611. FXMVECTOR V
  3612. )
  3613. {
  3614. return XMVectorLog2(V);
  3615. }
  3616. //------------------------------------------------------------------------------
  3617. inline XMVECTOR XM_CALLCONV XMVectorPow
  3618. (
  3619. FXMVECTOR V1,
  3620. FXMVECTOR V2
  3621. )
  3622. {
  3623. #if defined(_XM_NO_INTRINSICS_)
  3624. XMVECTORF32 Result = { { {
  3625. powf(V1.vector4_f32[0], V2.vector4_f32[0]),
  3626. powf(V1.vector4_f32[1], V2.vector4_f32[1]),
  3627. powf(V1.vector4_f32[2], V2.vector4_f32[2]),
  3628. powf(V1.vector4_f32[3], V2.vector4_f32[3])
  3629. } } };
  3630. return Result.v;
  3631. #elif defined(_XM_ARM_NEON_INTRINSICS_)
  3632. XMVECTORF32 vResult = { { {
  3633. powf(vgetq_lane_f32(V1, 0), vgetq_lane_f32(V2, 0)),
  3634. powf(vgetq_lane_f32(V1, 1), vgetq_lane_f32(V2, 1)),
  3635. powf(vgetq_lane_f32(V1, 2), vgetq_lane_f32(V2, 2)),
  3636. powf(vgetq_lane_f32(V1, 3), vgetq_lane_f32(V2, 3))
  3637. } } };
  3638. return vResult.v;
  3639. #elif defined(_XM_SSE_INTRINSICS_)
  3640. __declspec(align(16)) float a[4];
  3641. __declspec(align(16)) float b[4];
  3642. _mm_store_ps( a, V1 );
  3643. _mm_store_ps( b, V2 );
  3644. XMVECTOR vResult = _mm_setr_ps(
  3645. powf(a[0],b[0]),
  3646. powf(a[1],b[1]),
  3647. powf(a[2],b[2]),
  3648. powf(a[3],b[3]));
  3649. return vResult;
  3650. #endif
  3651. }
  3652. //------------------------------------------------------------------------------
  3653. inline XMVECTOR XM_CALLCONV XMVectorAbs
  3654. (
  3655. FXMVECTOR V
  3656. )
  3657. {
  3658. #if defined(_XM_NO_INTRINSICS_)
  3659. XMVECTORF32 vResult = { { {
  3660. fabsf(V.vector4_f32[0]),
  3661. fabsf(V.vector4_f32[1]),
  3662. fabsf(V.vector4_f32[2]),
  3663. fabsf(V.vector4_f32[3])
  3664. } } };
  3665. return vResult.v;
  3666. #elif defined(_XM_ARM_NEON_INTRINSICS_)
  3667. return vabsq_f32( V );
  3668. #elif defined(_XM_SSE_INTRINSICS_)
  3669. XMVECTOR vResult = _mm_setzero_ps();
  3670. vResult = _mm_sub_ps(vResult,V);
  3671. vResult = _mm_max_ps(vResult,V);
  3672. return vResult;
  3673. #endif
  3674. }
  3675. //------------------------------------------------------------------------------
  3676. inline XMVECTOR XM_CALLCONV XMVectorMod
  3677. (
  3678. FXMVECTOR V1,
  3679. FXMVECTOR V2
  3680. )
  3681. {
  3682. // V1 % V2 = V1 - V2 * truncate(V1 / V2)
  3683. #if defined(_XM_NO_INTRINSICS_)
  3684. XMVECTOR Quotient = XMVectorDivide(V1, V2);
  3685. Quotient = XMVectorTruncate(Quotient);
  3686. XMVECTOR Result = XMVectorNegativeMultiplySubtract(V2, Quotient, V1);
  3687. return Result;
  3688. #elif defined(_XM_ARM_NEON_INTRINSICS_)
  3689. XMVECTOR vResult = XMVectorDivide(V1, V2);
  3690. vResult = XMVectorTruncate(vResult);
  3691. return vmlsq_f32( V1, vResult, V2 );
  3692. #elif defined(_XM_SSE_INTRINSICS_)
  3693. XMVECTOR vResult = _mm_div_ps(V1, V2);
  3694. vResult = XMVectorTruncate(vResult);
  3695. vResult = _mm_mul_ps(vResult,V2);
  3696. vResult = _mm_sub_ps(V1,vResult);
  3697. return vResult;
  3698. #endif
  3699. }
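// Worked example of the identity above (illustrative): because the quotient is
// truncated toward zero, XMVectorMod follows the fmodf sign convention:
//     Mod( 7.5f, 2.0f) =  7.5f - 2.0f*trunc( 3.75f) =  7.5f - 6.0f =  1.5f
//     Mod(-7.5f, 2.0f) = -7.5f - 2.0f*trunc(-3.75f) = -7.5f + 6.0f = -1.5f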
  3700. //------------------------------------------------------------------------------
  3701. inline XMVECTOR XM_CALLCONV XMVectorModAngles
  3702. (
  3703. FXMVECTOR Angles
  3704. )
  3705. {
  3706. #if defined(_XM_NO_INTRINSICS_)
  3707. XMVECTOR V;
  3708. XMVECTOR Result;
  3709. // Modulo the range of the given angles such that -XM_PI <= Angles < XM_PI
  3710. V = XMVectorMultiply(Angles, g_XMReciprocalTwoPi.v);
  3711. V = XMVectorRound(V);
  3712. Result = XMVectorNegativeMultiplySubtract(g_XMTwoPi.v, V, Angles);
  3713. return Result;
  3714. #elif defined(_XM_ARM_NEON_INTRINSICS_)
  3715. // Modulo the range of the given angles such that -XM_PI <= Angles < XM_PI
  3716. XMVECTOR vResult = vmulq_f32(Angles,g_XMReciprocalTwoPi);
3717. // Call the XMVectorRound helper; round-to-nearest is too involved to open-code here
  3718. vResult = XMVectorRound(vResult);
  3719. return vmlsq_f32( Angles, vResult, g_XMTwoPi );
  3720. #elif defined(_XM_SSE_INTRINSICS_)
  3721. // Modulo the range of the given angles such that -XM_PI <= Angles < XM_PI
  3722. XMVECTOR vResult = _mm_mul_ps(Angles,g_XMReciprocalTwoPi);
3723. // Call the XMVectorRound helper; round-to-nearest is too involved to open-code here
  3724. vResult = XMVectorRound(vResult);
  3725. vResult = _mm_mul_ps(vResult,g_XMTwoPi);
  3726. vResult = _mm_sub_ps(Angles,vResult);
  3727. return vResult;
  3728. #endif
  3729. }
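// The wrap above is Angles - XM_2PI * round(Angles / XM_2PI), which maps each lane
// into [-XM_PI, XM_PI]. Illustrative example: for Angles = 3*XM_PI the quotient is
// 1.5, round-to-nearest-even gives 2, and the result is 3*XM_PI - 2*XM_2PI = -XM_PI.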
  3730. //------------------------------------------------------------------------------
  3731. inline XMVECTOR XM_CALLCONV XMVectorSin
  3732. (
  3733. FXMVECTOR V
  3734. )
  3735. {
  3736. // 11-degree minimax approximation
  3737. #if defined(_XM_NO_INTRINSICS_)
  3738. XMVECTORF32 Result = { { {
  3739. sinf(V.vector4_f32[0]),
  3740. sinf(V.vector4_f32[1]),
  3741. sinf(V.vector4_f32[2]),
  3742. sinf(V.vector4_f32[3])
  3743. } } };
  3744. return Result.v;
  3745. #elif defined(_XM_ARM_NEON_INTRINSICS_)
  3746. // Force the value within the bounds of pi
  3747. XMVECTOR x = XMVectorModAngles(V);
  3748. // Map in [-pi/2,pi/2] with sin(y) = sin(x).
  3749. uint32x4_t sign = vandq_u32(x, g_XMNegativeZero);
  3750. uint32x4_t c = vorrq_u32(g_XMPi, sign); // pi when x >= 0, -pi when x < 0
  3751. float32x4_t absx = vabsq_f32( x );
  3752. float32x4_t rflx = vsubq_f32(c, x);
  3753. uint32x4_t comp = vcleq_f32(absx, g_XMHalfPi);
  3754. x = vbslq_f32( comp, x, rflx );
  3755. float32x4_t x2 = vmulq_f32(x, x);
  3756. // Compute polynomial approximation
  3757. const XMVECTOR SC1 = g_XMSinCoefficients1;
  3758. const XMVECTOR SC0 = g_XMSinCoefficients0;
  3759. XMVECTOR vConstants = vdupq_lane_f32(vget_high_f32(SC0), 1);
  3760. XMVECTOR Result = vmlaq_lane_f32(vConstants, x2, vget_low_f32(SC1), 0);
  3761. vConstants = vdupq_lane_f32(vget_high_f32(SC0), 0);
  3762. Result = vmlaq_f32(vConstants, Result, x2);
  3763. vConstants = vdupq_lane_f32(vget_low_f32(SC0), 1);
  3764. Result = vmlaq_f32(vConstants, Result, x2);
  3765. vConstants = vdupq_lane_f32(vget_low_f32(SC0), 0);
  3766. Result = vmlaq_f32(vConstants, Result, x2);
  3767. Result = vmlaq_f32(g_XMOne, Result, x2);
  3768. Result = vmulq_f32(Result, x);
  3769. return Result;
  3770. #elif defined(_XM_SSE_INTRINSICS_)
  3771. // Force the value within the bounds of pi
  3772. XMVECTOR x = XMVectorModAngles(V);
  3773. // Map in [-pi/2,pi/2] with sin(y) = sin(x).
  3774. __m128 sign = _mm_and_ps(x, g_XMNegativeZero);
  3775. __m128 c = _mm_or_ps(g_XMPi, sign); // pi when x >= 0, -pi when x < 0
  3776. __m128 absx = _mm_andnot_ps(sign, x); // |x|
  3777. __m128 rflx = _mm_sub_ps(c, x);
  3778. __m128 comp = _mm_cmple_ps(absx, g_XMHalfPi);
  3779. __m128 select0 = _mm_and_ps(comp, x);
  3780. __m128 select1 = _mm_andnot_ps(comp, rflx);
  3781. x = _mm_or_ps(select0, select1);
  3782. __m128 x2 = _mm_mul_ps(x, x);
  3783. // Compute polynomial approximation
  3784. const XMVECTOR SC1 = g_XMSinCoefficients1;
  3785. XMVECTOR vConstants = XM_PERMUTE_PS( SC1, _MM_SHUFFLE(0, 0, 0, 0) );
  3786. __m128 Result = _mm_mul_ps(vConstants, x2);
  3787. const XMVECTOR SC0 = g_XMSinCoefficients0;
  3788. vConstants = XM_PERMUTE_PS( SC0, _MM_SHUFFLE(3, 3, 3, 3) );
  3789. Result = _mm_add_ps(Result, vConstants);
  3790. Result = _mm_mul_ps(Result, x2);
  3791. vConstants = XM_PERMUTE_PS( SC0, _MM_SHUFFLE(2, 2, 2, 2) );
  3792. Result = _mm_add_ps(Result, vConstants);
  3793. Result = _mm_mul_ps(Result, x2);
  3794. vConstants = XM_PERMUTE_PS( SC0, _MM_SHUFFLE(1, 1, 1, 1) );
  3795. Result = _mm_add_ps(Result, vConstants);
  3796. Result = _mm_mul_ps(Result, x2);
  3797. vConstants = XM_PERMUTE_PS( SC0, _MM_SHUFFLE(0, 0, 0, 0) );
  3798. Result = _mm_add_ps(Result, vConstants);
  3799. Result = _mm_mul_ps(Result, x2);
  3800. Result = _mm_add_ps(Result, g_XMOne);
  3801. Result = _mm_mul_ps(Result, x);
  3802. return Result;
  3803. #endif
  3804. }
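// Range-reduction sketch for the sine path above (illustrative): after wrapping
// into [-pi, pi], arguments with |x| > pi/2 are reflected as x -> (+/-pi) - x,
// which leaves sin unchanged (sin(pi - x) = sin(x), sin(-pi - x) = sin(x)); the
// odd 11-degree minimax polynomial is then evaluated as x * P(x^2). Scalar sketch,
// with wrap_to_pi a hypothetical helper and only the first coefficients shown:
//     float x = wrap_to_pi(v);
//     if (fabsf(x) > XM_PIDIV2) x = copysignf(XM_PI, x) - x;
//     float x2 = x * x;
//     float s = x * (1.0f + x2*(-0.16666667f + x2*(0.0083333310f /* + ... */)));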
  3805. //------------------------------------------------------------------------------
  3806. inline XMVECTOR XM_CALLCONV XMVectorCos
  3807. (
  3808. FXMVECTOR V
  3809. )
  3810. {
  3811. // 10-degree minimax approximation
  3812. #if defined(_XM_NO_INTRINSICS_)
  3813. XMVECTORF32 Result = { { {
  3814. cosf(V.vector4_f32[0]),
  3815. cosf(V.vector4_f32[1]),
  3816. cosf(V.vector4_f32[2]),
  3817. cosf(V.vector4_f32[3])
  3818. } } };
  3819. return Result.v;
  3820. #elif defined(_XM_ARM_NEON_INTRINSICS_)
  3821. // Map V to x in [-pi,pi].
  3822. XMVECTOR x = XMVectorModAngles(V);
  3823. // Map in [-pi/2,pi/2] with cos(y) = sign*cos(x).
  3824. uint32x4_t sign = vandq_u32(x, g_XMNegativeZero);
  3825. uint32x4_t c = vorrq_u32(g_XMPi, sign); // pi when x >= 0, -pi when x < 0
  3826. float32x4_t absx = vabsq_f32( x );
  3827. float32x4_t rflx = vsubq_f32(c, x);
  3828. uint32x4_t comp = vcleq_f32(absx, g_XMHalfPi);
  3829. x = vbslq_f32( comp, x, rflx );
  3830. sign = vbslq_f32( comp, g_XMOne, g_XMNegativeOne );
  3831. float32x4_t x2 = vmulq_f32(x, x);
  3832. // Compute polynomial approximation
  3833. const XMVECTOR CC1 = g_XMCosCoefficients1;
  3834. const XMVECTOR CC0 = g_XMCosCoefficients0;
  3835. XMVECTOR vConstants = vdupq_lane_f32(vget_high_f32(CC0), 1);
  3836. XMVECTOR Result = vmlaq_lane_f32(vConstants, x2, vget_low_f32(CC1), 0 );
  3837. vConstants = vdupq_lane_f32(vget_high_f32(CC0), 0);
  3838. Result = vmlaq_f32(vConstants, Result, x2);
  3839. vConstants = vdupq_lane_f32(vget_low_f32(CC0), 1);
  3840. Result = vmlaq_f32(vConstants, Result, x2);
  3841. vConstants = vdupq_lane_f32(vget_low_f32(CC0), 0);
  3842. Result = vmlaq_f32(vConstants, Result, x2);
  3843. Result = vmlaq_f32(g_XMOne, Result, x2);
  3844. Result = vmulq_f32(Result, sign);
  3845. return Result;
  3846. #elif defined(_XM_SSE_INTRINSICS_)
  3847. // Map V to x in [-pi,pi].
  3848. XMVECTOR x = XMVectorModAngles(V);
  3849. // Map in [-pi/2,pi/2] with cos(y) = sign*cos(x).
  3850. XMVECTOR sign = _mm_and_ps(x, g_XMNegativeZero);
  3851. __m128 c = _mm_or_ps(g_XMPi, sign); // pi when x >= 0, -pi when x < 0
  3852. __m128 absx = _mm_andnot_ps(sign, x); // |x|
  3853. __m128 rflx = _mm_sub_ps(c, x);
  3854. __m128 comp = _mm_cmple_ps(absx, g_XMHalfPi);
  3855. __m128 select0 = _mm_and_ps(comp, x);
  3856. __m128 select1 = _mm_andnot_ps(comp, rflx);
  3857. x = _mm_or_ps(select0, select1);
  3858. select0 = _mm_and_ps(comp, g_XMOne);
  3859. select1 = _mm_andnot_ps(comp, g_XMNegativeOne);
  3860. sign = _mm_or_ps(select0, select1);
  3861. __m128 x2 = _mm_mul_ps(x, x);
  3862. // Compute polynomial approximation
  3863. const XMVECTOR CC1 = g_XMCosCoefficients1;
  3864. XMVECTOR vConstants = XM_PERMUTE_PS( CC1, _MM_SHUFFLE(0, 0, 0, 0) );
  3865. __m128 Result = _mm_mul_ps(vConstants, x2);
  3866. const XMVECTOR CC0 = g_XMCosCoefficients0;
  3867. vConstants = XM_PERMUTE_PS( CC0, _MM_SHUFFLE(3, 3, 3, 3) );
  3868. Result = _mm_add_ps(Result, vConstants);
  3869. Result = _mm_mul_ps(Result, x2);
  3870. vConstants = XM_PERMUTE_PS( CC0, _MM_SHUFFLE(2, 2, 2, 2) );
  3871. Result = _mm_add_ps(Result, vConstants);
  3872. Result = _mm_mul_ps(Result, x2);
  3873. vConstants = XM_PERMUTE_PS( CC0, _MM_SHUFFLE(1, 1, 1, 1) );
  3874. Result = _mm_add_ps(Result, vConstants);
  3875. Result = _mm_mul_ps(Result, x2);
  3876. vConstants = XM_PERMUTE_PS( CC0, _MM_SHUFFLE(0, 0, 0, 0) );
  3877. Result = _mm_add_ps(Result, vConstants);
  3878. Result = _mm_mul_ps(Result, x2);
  3879. Result = _mm_add_ps(Result, g_XMOne);
  3880. Result = _mm_mul_ps(Result, sign);
  3881. return Result;
  3882. #endif
  3883. }
  3884. //------------------------------------------------------------------------------
  3885. _Use_decl_annotations_
  3886. inline void XM_CALLCONV XMVectorSinCos
  3887. (
  3888. XMVECTOR* pSin,
  3889. XMVECTOR* pCos,
  3890. FXMVECTOR V
  3891. )
  3892. {
  3893. assert(pSin != nullptr);
  3894. assert(pCos != nullptr);
  3895. // 11/10-degree minimax approximation
  3896. #if defined(_XM_NO_INTRINSICS_)
  3897. XMVECTORF32 Sin = { { {
  3898. sinf(V.vector4_f32[0]),
  3899. sinf(V.vector4_f32[1]),
  3900. sinf(V.vector4_f32[2]),
  3901. sinf(V.vector4_f32[3])
  3902. } } };
  3903. XMVECTORF32 Cos = { { {
  3904. cosf(V.vector4_f32[0]),
  3905. cosf(V.vector4_f32[1]),
  3906. cosf(V.vector4_f32[2]),
  3907. cosf(V.vector4_f32[3])
  3908. } } };
  3909. *pSin = Sin.v;
  3910. *pCos = Cos.v;
  3911. #elif defined(_XM_ARM_NEON_INTRINSICS_)
  3912. // Force the value within the bounds of pi
  3913. XMVECTOR x = XMVectorModAngles(V);
3914. // Map in [-pi/2,pi/2] with sin(y) = sin(x), cos(y) = sign*cos(x).
  3915. uint32x4_t sign = vandq_u32(x, g_XMNegativeZero);
  3916. uint32x4_t c = vorrq_u32(g_XMPi, sign); // pi when x >= 0, -pi when x < 0
  3917. float32x4_t absx = vabsq_f32( x );
  3918. float32x4_t rflx = vsubq_f32(c, x);
  3919. uint32x4_t comp = vcleq_f32(absx, g_XMHalfPi);
  3920. x = vbslq_f32( comp, x, rflx );
  3921. sign = vbslq_f32( comp, g_XMOne, g_XMNegativeOne );
  3922. float32x4_t x2 = vmulq_f32(x, x);
  3923. // Compute polynomial approximation for sine
  3924. const XMVECTOR SC1 = g_XMSinCoefficients1;
  3925. const XMVECTOR SC0 = g_XMSinCoefficients0;
  3926. XMVECTOR vConstants = vdupq_lane_f32(vget_high_f32(SC0), 1);
  3927. XMVECTOR Result = vmlaq_lane_f32(vConstants, x2, vget_low_f32(SC1), 0);
  3928. vConstants = vdupq_lane_f32(vget_high_f32(SC0), 0);
  3929. Result = vmlaq_f32(vConstants, Result, x2);
  3930. vConstants = vdupq_lane_f32(vget_low_f32(SC0), 1);
  3931. Result = vmlaq_f32(vConstants, Result, x2);
  3932. vConstants = vdupq_lane_f32(vget_low_f32(SC0), 0);
  3933. Result = vmlaq_f32(vConstants, Result, x2);
  3934. Result = vmlaq_f32(g_XMOne, Result, x2);
  3935. *pSin = vmulq_f32(Result, x);
  3936. // Compute polynomial approximation for cosine
  3937. const XMVECTOR CC1 = g_XMCosCoefficients1;
  3938. const XMVECTOR CC0 = g_XMCosCoefficients0;
  3939. vConstants = vdupq_lane_f32(vget_high_f32(CC0), 1);
  3940. Result = vmlaq_lane_f32(vConstants, x2, vget_low_f32(CC1), 0);
  3941. vConstants = vdupq_lane_f32(vget_high_f32(CC0), 0);
  3942. Result = vmlaq_f32(vConstants, Result, x2);
  3943. vConstants = vdupq_lane_f32(vget_low_f32(CC0), 1);
  3944. Result = vmlaq_f32(vConstants, Result, x2);
  3945. vConstants = vdupq_lane_f32(vget_low_f32(CC0), 0);
  3946. Result = vmlaq_f32(vConstants, Result, x2);
  3947. Result = vmlaq_f32(g_XMOne, Result, x2);
  3948. *pCos = vmulq_f32(Result, sign);
  3949. #elif defined(_XM_SSE_INTRINSICS_)
  3950. // Force the value within the bounds of pi
  3951. XMVECTOR x = XMVectorModAngles(V);
  3952. // Map in [-pi/2,pi/2] with sin(y) = sin(x), cos(y) = sign*cos(x).
  3953. XMVECTOR sign = _mm_and_ps(x, g_XMNegativeZero);
  3954. __m128 c = _mm_or_ps(g_XMPi, sign); // pi when x >= 0, -pi when x < 0
  3955. __m128 absx = _mm_andnot_ps(sign, x); // |x|
  3956. __m128 rflx = _mm_sub_ps(c, x);
  3957. __m128 comp = _mm_cmple_ps(absx, g_XMHalfPi);
  3958. __m128 select0 = _mm_and_ps(comp, x);
  3959. __m128 select1 = _mm_andnot_ps(comp, rflx);
  3960. x = _mm_or_ps(select0, select1);
  3961. select0 = _mm_and_ps(comp, g_XMOne);
  3962. select1 = _mm_andnot_ps(comp, g_XMNegativeOne);
  3963. sign = _mm_or_ps(select0, select1);
  3964. __m128 x2 = _mm_mul_ps(x, x);
  3965. // Compute polynomial approximation of sine
  3966. const XMVECTOR SC1 = g_XMSinCoefficients1;
  3967. XMVECTOR vConstants = XM_PERMUTE_PS( SC1, _MM_SHUFFLE(0, 0, 0, 0) );
  3968. __m128 Result = _mm_mul_ps(vConstants, x2);
  3969. const XMVECTOR SC0 = g_XMSinCoefficients0;
  3970. vConstants = XM_PERMUTE_PS( SC0, _MM_SHUFFLE(3, 3, 3, 3) );
  3971. Result = _mm_add_ps(Result, vConstants);
  3972. Result = _mm_mul_ps(Result, x2);
  3973. vConstants = XM_PERMUTE_PS( SC0, _MM_SHUFFLE(2, 2, 2, 2) );
  3974. Result = _mm_add_ps(Result, vConstants);
  3975. Result = _mm_mul_ps(Result, x2);
  3976. vConstants = XM_PERMUTE_PS( SC0, _MM_SHUFFLE(1, 1, 1, 1) );
  3977. Result = _mm_add_ps(Result, vConstants);
  3978. Result = _mm_mul_ps(Result, x2);
  3979. vConstants = XM_PERMUTE_PS( SC0, _MM_SHUFFLE(0, 0, 0, 0) );
  3980. Result = _mm_add_ps(Result, vConstants);
  3981. Result = _mm_mul_ps(Result, x2);
  3982. Result = _mm_add_ps(Result, g_XMOne);
  3983. Result = _mm_mul_ps(Result, x);
  3984. *pSin = Result;
  3985. // Compute polynomial approximation of cosine
  3986. const XMVECTOR CC1 = g_XMCosCoefficients1;
  3987. vConstants = XM_PERMUTE_PS( CC1, _MM_SHUFFLE(0, 0, 0, 0) );
  3988. Result = _mm_mul_ps(vConstants, x2);
  3989. const XMVECTOR CC0 = g_XMCosCoefficients0;
  3990. vConstants = XM_PERMUTE_PS( CC0, _MM_SHUFFLE(3, 3, 3, 3) );
  3991. Result = _mm_add_ps(Result, vConstants);
  3992. Result = _mm_mul_ps(Result, x2);
  3993. vConstants = XM_PERMUTE_PS( CC0, _MM_SHUFFLE(2, 2, 2, 2) );
  3994. Result = _mm_add_ps(Result, vConstants);
  3995. Result = _mm_mul_ps(Result, x2);
  3996. vConstants = XM_PERMUTE_PS( CC0, _MM_SHUFFLE(1, 1, 1, 1) );
  3997. Result = _mm_add_ps(Result, vConstants);
  3998. Result = _mm_mul_ps(Result, x2);
  3999. vConstants = XM_PERMUTE_PS( CC0, _MM_SHUFFLE(0, 0, 0, 0) );
  4000. Result = _mm_add_ps(Result, vConstants);
  4001. Result = _mm_mul_ps(Result, x2);
  4002. Result = _mm_add_ps(Result, g_XMOne);
  4003. Result = _mm_mul_ps(Result, sign);
  4004. *pCos = Result;
  4005. #endif
  4006. }
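// Usage sketch (illustrative): XMVectorSinCos shares a single range reduction
// between both outputs, so it is cheaper than separate XMVectorSin/XMVectorCos
// calls when both values are needed:
//     XMVECTOR s, c;
//     XMVectorSinCos(&s, &c, angles);   // per lane: s = sin(angle), c = cos(angle)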
  4007. //------------------------------------------------------------------------------
  4008. inline XMVECTOR XM_CALLCONV XMVectorTan
  4009. (
  4010. FXMVECTOR V
  4011. )
  4012. {
  4013. // Cody and Waite algorithm to compute tangent.
  4014. #if defined(_XM_NO_INTRINSICS_)
  4015. XMVECTORF32 Result = { { {
  4016. tanf(V.vector4_f32[0]),
  4017. tanf(V.vector4_f32[1]),
  4018. tanf(V.vector4_f32[2]),
  4019. tanf(V.vector4_f32[3])
  4020. } } };
  4021. return Result.v;
  4022. #elif defined(_XM_SSE_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_)
  4023. static const XMVECTORF32 TanCoefficients0 = { { { 1.0f, -4.667168334e-1f, 2.566383229e-2f, -3.118153191e-4f } } };
  4024. static const XMVECTORF32 TanCoefficients1 = { { { 4.981943399e-7f, -1.333835001e-1f, 3.424887824e-3f, -1.786170734e-5f } } };
  4025. static const XMVECTORF32 TanConstants = { { { 1.570796371f, 6.077100628e-11f, 0.000244140625f, 0.63661977228f /*2 / Pi*/ } } };
  4026. static const XMVECTORU32 Mask = { { { 0x1, 0x1, 0x1, 0x1 } } };
  4027. XMVECTOR TwoDivPi = XMVectorSplatW(TanConstants.v);
  4028. XMVECTOR Zero = XMVectorZero();
  4029. XMVECTOR C0 = XMVectorSplatX(TanConstants.v);
  4030. XMVECTOR C1 = XMVectorSplatY(TanConstants.v);
  4031. XMVECTOR Epsilon = XMVectorSplatZ(TanConstants.v);
  4032. XMVECTOR VA = XMVectorMultiply(V, TwoDivPi);
  4033. VA = XMVectorRound(VA);
  4034. XMVECTOR VC = XMVectorNegativeMultiplySubtract(VA, C0, V);
  4035. XMVECTOR VB = XMVectorAbs(VA);
  4036. VC = XMVectorNegativeMultiplySubtract(VA, C1, VC);
  4037. #if defined(_XM_ARM_NEON_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
  4038. VB = vcvtq_u32_f32( VB );
  4039. #elif defined(_XM_SSE_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
  4040. reinterpret_cast<__m128i *>(&VB)[0] = _mm_cvttps_epi32(VB);
  4041. #else
  4042. for (size_t i = 0; i < 4; i++)
  4043. {
  4044. VB.vector4_u32[i] = (uint32_t)VB.vector4_f32[i];
  4045. }
  4046. #endif
  4047. XMVECTOR VC2 = XMVectorMultiply(VC, VC);
  4048. XMVECTOR T7 = XMVectorSplatW(TanCoefficients1.v);
  4049. XMVECTOR T6 = XMVectorSplatZ(TanCoefficients1.v);
  4050. XMVECTOR T4 = XMVectorSplatX(TanCoefficients1.v);
  4051. XMVECTOR T3 = XMVectorSplatW(TanCoefficients0.v);
  4052. XMVECTOR T5 = XMVectorSplatY(TanCoefficients1.v);
  4053. XMVECTOR T2 = XMVectorSplatZ(TanCoefficients0.v);
  4054. XMVECTOR T1 = XMVectorSplatY(TanCoefficients0.v);
  4055. XMVECTOR T0 = XMVectorSplatX(TanCoefficients0.v);
  4056. XMVECTOR VBIsEven = XMVectorAndInt(VB, Mask.v);
  4057. VBIsEven = XMVectorEqualInt(VBIsEven, Zero);
  4058. XMVECTOR N = XMVectorMultiplyAdd(VC2, T7, T6);
  4059. XMVECTOR D = XMVectorMultiplyAdd(VC2, T4, T3);
  4060. N = XMVectorMultiplyAdd(VC2, N, T5);
  4061. D = XMVectorMultiplyAdd(VC2, D, T2);
  4062. N = XMVectorMultiply(VC2, N);
  4063. D = XMVectorMultiplyAdd(VC2, D, T1);
  4064. N = XMVectorMultiplyAdd(VC, N, VC);
  4065. XMVECTOR VCNearZero = XMVectorInBounds(VC, Epsilon);
  4066. D = XMVectorMultiplyAdd(VC2, D, T0);
  4067. N = XMVectorSelect(N, VC, VCNearZero);
  4068. D = XMVectorSelect(D, g_XMOne.v, VCNearZero);
  4069. XMVECTOR R0 = XMVectorNegate(N);
  4070. XMVECTOR R1 = XMVectorDivide(N,D);
  4071. R0 = XMVectorDivide(D,R0);
  4072. XMVECTOR VIsZero = XMVectorEqual(V, Zero);
  4073. XMVECTOR Result = XMVectorSelect(R0, R1, VBIsEven);
  4074. Result = XMVectorSelect(Result, Zero, VIsZero);
  4075. return Result;
  4076. #endif
  4077. }
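// Note on the reduction above (illustrative): VA = round(V * 2/pi) counts
// quarter-turns, and the reduced argument VC is formed by subtracting VA*C0 and
// then VA*C1 in two steps (the Cody-Waite style split held in TanConstants) to
// keep extra precision. The parity of VA (VBIsEven) then selects between the
// rational approximations N/D ~= tan(VC) and D/(-N) = -cot(VC).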
  4078. //------------------------------------------------------------------------------
  4079. inline XMVECTOR XM_CALLCONV XMVectorSinH
  4080. (
  4081. FXMVECTOR V
  4082. )
  4083. {
  4084. #if defined(_XM_NO_INTRINSICS_)
  4085. XMVECTORF32 Result = { { {
  4086. sinhf(V.vector4_f32[0]),
  4087. sinhf(V.vector4_f32[1]),
  4088. sinhf(V.vector4_f32[2]),
  4089. sinhf(V.vector4_f32[3])
  4090. } } };
  4091. return Result.v;
  4092. #elif defined(_XM_ARM_NEON_INTRINSICS_)
  4093. static const XMVECTORF32 Scale = { { { 1.442695040888963f, 1.442695040888963f, 1.442695040888963f, 1.442695040888963f } } }; // 1.0f / ln(2.0f)
  4094. XMVECTOR V1 = vmlaq_f32( g_XMNegativeOne.v, V, Scale.v );
  4095. XMVECTOR V2 = vmlsq_f32( g_XMNegativeOne.v, V, Scale.v );
  4096. XMVECTOR E1 = XMVectorExp(V1);
  4097. XMVECTOR E2 = XMVectorExp(V2);
  4098. return vsubq_f32(E1, E2);
  4099. #elif defined(_XM_SSE_INTRINSICS_)
  4100. static const XMVECTORF32 Scale = { { { 1.442695040888963f, 1.442695040888963f, 1.442695040888963f, 1.442695040888963f } } }; // 1.0f / ln(2.0f)
  4101. XMVECTOR V1 = _mm_mul_ps(V, Scale);
  4102. V1 = _mm_add_ps(V1,g_XMNegativeOne);
  4103. XMVECTOR V2 = _mm_mul_ps(V, Scale);
  4104. V2 = _mm_sub_ps(g_XMNegativeOne,V2);
  4105. XMVECTOR E1 = XMVectorExp(V1);
  4106. XMVECTOR E2 = XMVectorExp(V2);
  4107. return _mm_sub_ps(E1, E2);
  4108. #endif
  4109. }
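// Identity used here and in XMVectorCosH below (illustrative): with
// Scale = 1/ln(2), exp2(V*Scale - 1) = e^V / 2 and exp2(-V*Scale - 1) = e^-V / 2,
// so E1 - E2 = (e^V - e^-V)/2 = sinh(V), and E1 + E2 below gives cosh(V).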
  4110. //------------------------------------------------------------------------------
  4111. inline XMVECTOR XM_CALLCONV XMVectorCosH
  4112. (
  4113. FXMVECTOR V
  4114. )
  4115. {
  4116. #if defined(_XM_NO_INTRINSICS_)
  4117. XMVECTORF32 Result = { { {
  4118. coshf(V.vector4_f32[0]),
  4119. coshf(V.vector4_f32[1]),
  4120. coshf(V.vector4_f32[2]),
  4121. coshf(V.vector4_f32[3])
  4122. } } };
  4123. return Result.v;
  4124. #elif defined(_XM_ARM_NEON_INTRINSICS_)
  4125. static const XMVECTORF32 Scale = { { { 1.442695040888963f, 1.442695040888963f, 1.442695040888963f, 1.442695040888963f } } }; // 1.0f / ln(2.0f)
  4126. XMVECTOR V1 = vmlaq_f32(g_XMNegativeOne.v, V, Scale.v);
  4127. XMVECTOR V2 = vmlsq_f32(g_XMNegativeOne.v, V, Scale.v);
  4128. XMVECTOR E1 = XMVectorExp(V1);
  4129. XMVECTOR E2 = XMVectorExp(V2);
  4130. return vaddq_f32(E1, E2);
  4131. #elif defined(_XM_SSE_INTRINSICS_)
  4132. static const XMVECTORF32 Scale = { { { 1.442695040888963f, 1.442695040888963f, 1.442695040888963f, 1.442695040888963f } } }; // 1.0f / ln(2.0f)
  4133. XMVECTOR V1 = _mm_mul_ps(V,Scale.v);
  4134. V1 = _mm_add_ps(V1,g_XMNegativeOne.v);
  4135. XMVECTOR V2 = _mm_mul_ps(V, Scale.v);
  4136. V2 = _mm_sub_ps(g_XMNegativeOne.v,V2);
  4137. XMVECTOR E1 = XMVectorExp(V1);
  4138. XMVECTOR E2 = XMVectorExp(V2);
  4139. return _mm_add_ps(E1, E2);
  4140. #endif
  4141. }
  4142. //------------------------------------------------------------------------------
  4143. inline XMVECTOR XM_CALLCONV XMVectorTanH
  4144. (
  4145. FXMVECTOR V
  4146. )
  4147. {
  4148. #if defined(_XM_NO_INTRINSICS_)
  4149. XMVECTORF32 Result = { { {
  4150. tanhf(V.vector4_f32[0]),
  4151. tanhf(V.vector4_f32[1]),
  4152. tanhf(V.vector4_f32[2]),
  4153. tanhf(V.vector4_f32[3])
  4154. } } };
  4155. return Result.v;
  4156. #elif defined(_XM_ARM_NEON_INTRINSICS_)
  4157. static const XMVECTORF32 Scale = { { { 2.8853900817779268f, 2.8853900817779268f, 2.8853900817779268f, 2.8853900817779268f } } }; // 2.0f / ln(2.0f)
  4158. XMVECTOR E = vmulq_f32(V, Scale.v);
  4159. E = XMVectorExp(E);
  4160. E = vmlaq_f32( g_XMOneHalf.v, E, g_XMOneHalf.v );
  4161. E = XMVectorReciprocal(E);
  4162. return vsubq_f32(g_XMOne.v, E);
  4163. #elif defined(_XM_SSE_INTRINSICS_)
  4164. static const XMVECTORF32 Scale = { { { 2.8853900817779268f, 2.8853900817779268f, 2.8853900817779268f, 2.8853900817779268f } } }; // 2.0f / ln(2.0f)
  4165. XMVECTOR E = _mm_mul_ps(V, Scale.v);
  4166. E = XMVectorExp(E);
  4167. E = _mm_mul_ps(E,g_XMOneHalf.v);
  4168. E = _mm_add_ps(E,g_XMOneHalf.v);
  4169. E = _mm_div_ps(g_XMOne.v,E);
  4170. return _mm_sub_ps(g_XMOne.v,E);
  4171. #endif
  4172. }
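// Note (illustrative only; the helper name is hypothetical). The intrinsic paths
// above use the identity tanh(x) = 1 - 2 / (e^(2x) + 1): the 2/ln(2) Scale turns
// XMVectorExp into e^(2x), and the half/half multiply-add plus the reciprocal
// supply the remaining terms.
inline float XMExampleTanHScalar(float x)
{
    return XMVectorGetX(XMVectorTanH(XMVectorReplicate(x)));
}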
  4173. //------------------------------------------------------------------------------
  4174. inline XMVECTOR XM_CALLCONV XMVectorASin
  4175. (
  4176. FXMVECTOR V
  4177. )
  4178. {
  4179. // 7-degree minimax approximation
  4180. #if defined(_XM_NO_INTRINSICS_)
  4181. XMVECTORF32 Result = { { {
  4182. asinf(V.vector4_f32[0]),
  4183. asinf(V.vector4_f32[1]),
  4184. asinf(V.vector4_f32[2]),
  4185. asinf(V.vector4_f32[3])
  4186. } } };
  4187. return Result.v;
  4188. #elif defined(_XM_ARM_NEON_INTRINSICS_)
  4189. uint32x4_t nonnegative = vcgeq_f32(V, g_XMZero);
  4190. float32x4_t x = vabsq_f32(V);
  4191. // Compute (1-|V|), clamp to zero to avoid sqrt of negative number.
  4192. float32x4_t oneMValue = vsubq_f32(g_XMOne, x);
  4193. float32x4_t clampOneMValue = vmaxq_f32(g_XMZero, oneMValue);
  4194. float32x4_t root = XMVectorSqrt(clampOneMValue);
  4195. // Compute polynomial approximation
  4196. const XMVECTOR AC1 = g_XMArcCoefficients1;
  4197. XMVECTOR vConstants = vdupq_lane_f32(vget_high_f32(AC1), 0);
  4198. XMVECTOR t0 = vmlaq_lane_f32( vConstants, x, vget_high_f32(AC1), 1 );
  4199. vConstants = vdupq_lane_f32(vget_low_f32(AC1), 1);
  4200. t0 = vmlaq_f32( vConstants, t0, x );
  4201. vConstants = vdupq_lane_f32(vget_low_f32(AC1), 0);
  4202. t0 = vmlaq_f32( vConstants, t0, x );
  4203. const XMVECTOR AC0 = g_XMArcCoefficients0;
  4204. vConstants = vdupq_lane_f32(vget_high_f32(AC0), 1);
  4205. t0 = vmlaq_f32( vConstants, t0, x );
  4206. vConstants = vdupq_lane_f32(vget_high_f32(AC0), 0);
  4207. t0 = vmlaq_f32( vConstants, t0, x );
  4208. vConstants = vdupq_lane_f32(vget_low_f32(AC0), 1);
  4209. t0 = vmlaq_f32( vConstants, t0, x );
  4210. vConstants = vdupq_lane_f32(vget_low_f32(AC0), 0);
  4211. t0 = vmlaq_f32( vConstants, t0, x );
  4212. t0 = vmulq_f32(t0, root);
  4213. float32x4_t t1 = vsubq_f32(g_XMPi, t0);
  4214. t0 = vbslq_f32( nonnegative, t0, t1 );
  4215. t0 = vsubq_f32(g_XMHalfPi, t0);
  4216. return t0;
  4217. #elif defined(_XM_SSE_INTRINSICS_)
  4218. __m128 nonnegative = _mm_cmpge_ps(V, g_XMZero);
  4219. __m128 mvalue = _mm_sub_ps(g_XMZero, V);
  4220. __m128 x = _mm_max_ps(V, mvalue); // |V|
  4221. // Compute (1-|V|), clamp to zero to avoid sqrt of negative number.
  4222. __m128 oneMValue = _mm_sub_ps(g_XMOne, x);
  4223. __m128 clampOneMValue = _mm_max_ps(g_XMZero, oneMValue);
  4224. __m128 root = _mm_sqrt_ps(clampOneMValue); // sqrt(1-|V|)
  4225. // Compute polynomial approximation
  4226. const XMVECTOR AC1 = g_XMArcCoefficients1;
  4227. XMVECTOR vConstants = XM_PERMUTE_PS( AC1, _MM_SHUFFLE(3, 3, 3, 3) );
  4228. __m128 t0 = _mm_mul_ps(vConstants, x);
  4229. vConstants = XM_PERMUTE_PS( AC1, _MM_SHUFFLE(2, 2, 2, 2) );
  4230. t0 = _mm_add_ps(t0, vConstants);
  4231. t0 = _mm_mul_ps(t0, x);
  4232. vConstants = XM_PERMUTE_PS( AC1, _MM_SHUFFLE(1, 1, 1, 1) );
  4233. t0 = _mm_add_ps(t0, vConstants);
  4234. t0 = _mm_mul_ps(t0, x);
  4235. vConstants = XM_PERMUTE_PS( AC1, _MM_SHUFFLE(0, 0, 0, 0) );
  4236. t0 = _mm_add_ps(t0, vConstants);
  4237. t0 = _mm_mul_ps(t0, x);
  4238. const XMVECTOR AC0 = g_XMArcCoefficients0;
  4239. vConstants = XM_PERMUTE_PS( AC0, _MM_SHUFFLE(3, 3, 3, 3) );
  4240. t0 = _mm_add_ps(t0, vConstants);
  4241. t0 = _mm_mul_ps(t0, x);
  4242. vConstants = XM_PERMUTE_PS( AC0,_MM_SHUFFLE(2, 2, 2, 2) );
  4243. t0 = _mm_add_ps(t0, vConstants);
  4244. t0 = _mm_mul_ps(t0, x);
  4245. vConstants = XM_PERMUTE_PS( AC0, _MM_SHUFFLE(1, 1, 1, 1) );
  4246. t0 = _mm_add_ps(t0, vConstants);
  4247. t0 = _mm_mul_ps(t0, x);
  4248. vConstants = XM_PERMUTE_PS( AC0, _MM_SHUFFLE(0, 0, 0, 0) );
  4249. t0 = _mm_add_ps(t0, vConstants);
  4250. t0 = _mm_mul_ps(t0, root);
  4251. __m128 t1 = _mm_sub_ps(g_XMPi, t0);
  4252. t0 = _mm_and_ps(nonnegative, t0);
  4253. t1 = _mm_andnot_ps(nonnegative, t1);
  4254. t0 = _mm_or_ps(t0, t1);
  4255. t0 = _mm_sub_ps(g_XMHalfPi, t0);
  4256. return t0;
  4257. #endif
  4258. }
  4259. //------------------------------------------------------------------------------
  4260. inline XMVECTOR XM_CALLCONV XMVectorACos
  4261. (
  4262. FXMVECTOR V
  4263. )
  4264. {
  4265. // 7-degree minimax approximation
  4266. #if defined(_XM_NO_INTRINSICS_)
  4267. XMVECTORF32 Result = { { {
  4268. acosf(V.vector4_f32[0]),
  4269. acosf(V.vector4_f32[1]),
  4270. acosf(V.vector4_f32[2]),
  4271. acosf(V.vector4_f32[3])
  4272. } } };
  4273. return Result.v;
  4274. #elif defined(_XM_ARM_NEON_INTRINSICS_)
  4275. uint32x4_t nonnegative = vcgeq_f32(V, g_XMZero);
  4276. float32x4_t x = vabsq_f32(V);
  4277. // Compute (1-|V|), clamp to zero to avoid sqrt of negative number.
  4278. float32x4_t oneMValue = vsubq_f32(g_XMOne, x);
  4279. float32x4_t clampOneMValue = vmaxq_f32(g_XMZero, oneMValue);
  4280. float32x4_t root = XMVectorSqrt(clampOneMValue);
  4281. // Compute polynomial approximation
  4282. const XMVECTOR AC1 = g_XMArcCoefficients1;
  4283. XMVECTOR vConstants = vdupq_lane_f32(vget_high_f32(AC1), 0);
  4284. XMVECTOR t0 = vmlaq_lane_f32( vConstants, x, vget_high_f32(AC1), 1 );
  4285. vConstants = vdupq_lane_f32(vget_low_f32(AC1), 1);
  4286. t0 = vmlaq_f32( vConstants, t0, x );
  4287. vConstants = vdupq_lane_f32(vget_low_f32(AC1), 0);
  4288. t0 = vmlaq_f32( vConstants, t0, x );
  4289. const XMVECTOR AC0 = g_XMArcCoefficients0;
  4290. vConstants = vdupq_lane_f32(vget_high_f32(AC0), 1);
  4291. t0 = vmlaq_f32( vConstants, t0, x );
  4292. vConstants = vdupq_lane_f32(vget_high_f32(AC0), 0);
  4293. t0 = vmlaq_f32( vConstants, t0, x );
  4294. vConstants = vdupq_lane_f32(vget_low_f32(AC0), 1);
  4295. t0 = vmlaq_f32( vConstants, t0, x );
  4296. vConstants = vdupq_lane_f32(vget_low_f32(AC0), 0);
  4297. t0 = vmlaq_f32( vConstants, t0, x );
  4298. t0 = vmulq_f32(t0, root);
  4299. float32x4_t t1 = vsubq_f32(g_XMPi, t0);
  4300. t0 = vbslq_f32( nonnegative, t0, t1 );
  4301. return t0;
  4302. #elif defined(_XM_SSE_INTRINSICS_)
  4303. __m128 nonnegative = _mm_cmpge_ps(V, g_XMZero);
  4304. __m128 mvalue = _mm_sub_ps(g_XMZero, V);
  4305. __m128 x = _mm_max_ps(V, mvalue); // |V|
  4306. // Compute (1-|V|), clamp to zero to avoid sqrt of negative number.
  4307. __m128 oneMValue = _mm_sub_ps(g_XMOne, x);
  4308. __m128 clampOneMValue = _mm_max_ps(g_XMZero, oneMValue);
  4309. __m128 root = _mm_sqrt_ps(clampOneMValue); // sqrt(1-|V|)
  4310. // Compute polynomial approximation
  4311. const XMVECTOR AC1 = g_XMArcCoefficients1;
  4312. XMVECTOR vConstants = XM_PERMUTE_PS( AC1, _MM_SHUFFLE(3, 3, 3, 3) );
  4313. __m128 t0 = _mm_mul_ps(vConstants, x);
  4314. vConstants = XM_PERMUTE_PS( AC1, _MM_SHUFFLE(2, 2, 2, 2) );
  4315. t0 = _mm_add_ps(t0, vConstants);
  4316. t0 = _mm_mul_ps(t0, x);
  4317. vConstants = XM_PERMUTE_PS( AC1, _MM_SHUFFLE(1, 1, 1, 1) );
  4318. t0 = _mm_add_ps(t0, vConstants);
  4319. t0 = _mm_mul_ps(t0, x);
  4320. vConstants = XM_PERMUTE_PS( AC1, _MM_SHUFFLE(0, 0, 0, 0) );
  4321. t0 = _mm_add_ps(t0, vConstants);
  4322. t0 = _mm_mul_ps(t0, x);
  4323. const XMVECTOR AC0 = g_XMArcCoefficients0;
  4324. vConstants = XM_PERMUTE_PS( AC0, _MM_SHUFFLE(3, 3, 3, 3) );
  4325. t0 = _mm_add_ps(t0, vConstants);
  4326. t0 = _mm_mul_ps(t0, x);
  4327. vConstants = XM_PERMUTE_PS( AC0, _MM_SHUFFLE(2, 2, 2, 2) );
  4328. t0 = _mm_add_ps(t0, vConstants);
  4329. t0 = _mm_mul_ps(t0, x);
  4330. vConstants = XM_PERMUTE_PS( AC0, _MM_SHUFFLE(1, 1, 1, 1) );
  4331. t0 = _mm_add_ps(t0, vConstants);
  4332. t0 = _mm_mul_ps(t0, x);
  4333. vConstants = XM_PERMUTE_PS( AC0, _MM_SHUFFLE(0, 0, 0, 0) );
  4334. t0 = _mm_add_ps(t0, vConstants);
  4335. t0 = _mm_mul_ps(t0, root);
  4336. __m128 t1 = _mm_sub_ps(g_XMPi, t0);
  4337. t0 = _mm_and_ps(nonnegative, t0);
  4338. t1 = _mm_andnot_ps(nonnegative, t1);
  4339. t0 = _mm_or_ps(t0, t1);
  4340. return t0;
  4341. #endif
  4342. }
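// Usage sketch (illustrative only; the helper name is hypothetical). A common use
// of XMVectorACos is converting the dot product of two unit vectors into the angle
// between them; clamping first guards against dot products that drift slightly
// outside [-1, 1] through rounding.
inline float XM_CALLCONV XMExampleAngleBetweenNormals(FXMVECTOR N1, FXMVECTOR N2)
{
    XMVECTOR dot = XMVectorClamp(XMVector3Dot(N1, N2), g_XMNegativeOne.v, g_XMOne.v);
    return XMVectorGetX(XMVectorACos(dot));
}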
  4343. //------------------------------------------------------------------------------
  4344. inline XMVECTOR XM_CALLCONV XMVectorATan
  4345. (
  4346. FXMVECTOR V
  4347. )
  4348. {
  4349. // 17-degree minimax approximation
  4350. #if defined(_XM_NO_INTRINSICS_)
  4351. XMVECTORF32 Result = { { {
  4352. atanf(V.vector4_f32[0]),
  4353. atanf(V.vector4_f32[1]),
  4354. atanf(V.vector4_f32[2]),
  4355. atanf(V.vector4_f32[3])
  4356. } } };
  4357. return Result.v;
  4358. #elif defined(_XM_ARM_NEON_INTRINSICS_)
  4359. float32x4_t absV = vabsq_f32(V);
  4360. float32x4_t invV = XMVectorReciprocal(V);
  4361. uint32x4_t comp = vcgtq_f32(V, g_XMOne);
  4362. uint32x4_t sign = vbslq_f32(comp, g_XMOne, g_XMNegativeOne);
  4363. comp = vcleq_f32(absV, g_XMOne);
  4364. sign = vbslq_f32(comp, g_XMZero, sign);
  4365. uint32x4_t x = vbslq_f32(comp, V, invV);
  4366. float32x4_t x2 = vmulq_f32(x, x);
  4367. // Compute polynomial approximation
  4368. const XMVECTOR TC1 = g_XMATanCoefficients1;
  4369. XMVECTOR vConstants = vdupq_lane_f32(vget_high_f32(TC1), 0);
  4370. XMVECTOR Result = vmlaq_lane_f32( vConstants, x2, vget_high_f32(TC1), 1 );
  4371. vConstants = vdupq_lane_f32(vget_low_f32(TC1), 1);
  4372. Result = vmlaq_f32( vConstants, Result, x2 );
  4373. vConstants = vdupq_lane_f32(vget_low_f32(TC1), 0);
  4374. Result = vmlaq_f32( vConstants, Result, x2 );
  4375. const XMVECTOR TC0 = g_XMATanCoefficients0;
  4376. vConstants = vdupq_lane_f32(vget_high_f32(TC0), 1);
  4377. Result = vmlaq_f32( vConstants, Result, x2 );
  4378. vConstants = vdupq_lane_f32(vget_high_f32(TC0), 0);
  4379. Result = vmlaq_f32( vConstants, Result, x2 );
  4380. vConstants = vdupq_lane_f32(vget_low_f32(TC0), 1);
  4381. Result = vmlaq_f32( vConstants, Result, x2 );
  4382. vConstants = vdupq_lane_f32(vget_low_f32(TC0), 0);
  4383. Result = vmlaq_f32( vConstants, Result, x2 );
  4384. Result = vmlaq_f32( g_XMOne, Result, x2 );
  4385. Result = vmulq_f32( Result, x );
  4386. float32x4_t result1 = vmulq_f32(sign, g_XMHalfPi);
  4387. result1 = vsubq_f32(result1, Result);
  4388. comp = vceqq_f32(sign, g_XMZero);
  4389. Result = vbslq_f32( comp, Result, result1 );
  4390. return Result;
  4391. #elif defined(_XM_SSE_INTRINSICS_)
  4392. __m128 absV = XMVectorAbs(V);
  4393. __m128 invV = _mm_div_ps(g_XMOne, V);
  4394. __m128 comp = _mm_cmpgt_ps(V, g_XMOne);
  4395. __m128 select0 = _mm_and_ps(comp, g_XMOne);
  4396. __m128 select1 = _mm_andnot_ps(comp, g_XMNegativeOne);
  4397. __m128 sign = _mm_or_ps(select0, select1);
  4398. comp = _mm_cmple_ps(absV, g_XMOne);
  4399. select0 = _mm_and_ps(comp, g_XMZero);
  4400. select1 = _mm_andnot_ps(comp, sign);
  4401. sign = _mm_or_ps(select0, select1);
  4402. select0 = _mm_and_ps(comp, V);
  4403. select1 = _mm_andnot_ps(comp, invV);
  4404. __m128 x = _mm_or_ps(select0, select1);
  4405. __m128 x2 = _mm_mul_ps(x, x);
  4406. // Compute polynomial approximation
  4407. const XMVECTOR TC1 = g_XMATanCoefficients1;
  4408. XMVECTOR vConstants = XM_PERMUTE_PS( TC1, _MM_SHUFFLE(3, 3, 3, 3) );
  4409. __m128 Result = _mm_mul_ps(vConstants, x2);
  4410. vConstants = XM_PERMUTE_PS( TC1, _MM_SHUFFLE(2, 2, 2, 2) );
  4411. Result = _mm_add_ps(Result, vConstants);
  4412. Result = _mm_mul_ps(Result, x2);
  4413. vConstants = XM_PERMUTE_PS( TC1, _MM_SHUFFLE(1, 1, 1, 1) );
  4414. Result = _mm_add_ps(Result, vConstants);
  4415. Result = _mm_mul_ps(Result, x2);
  4416. vConstants = XM_PERMUTE_PS( TC1, _MM_SHUFFLE(0, 0, 0, 0) );
  4417. Result = _mm_add_ps(Result, vConstants);
  4418. Result = _mm_mul_ps(Result, x2);
  4419. const XMVECTOR TC0 = g_XMATanCoefficients0;
  4420. vConstants = XM_PERMUTE_PS( TC0, _MM_SHUFFLE(3, 3, 3, 3) );
  4421. Result = _mm_add_ps(Result, vConstants);
  4422. Result = _mm_mul_ps(Result, x2);
  4423. vConstants = XM_PERMUTE_PS( TC0, _MM_SHUFFLE(2, 2, 2, 2) );
  4424. Result = _mm_add_ps(Result, vConstants);
  4425. Result = _mm_mul_ps(Result, x2);
  4426. vConstants = XM_PERMUTE_PS( TC0, _MM_SHUFFLE(1, 1, 1, 1) );
  4427. Result = _mm_add_ps(Result, vConstants);
  4428. Result = _mm_mul_ps(Result, x2);
  4429. vConstants = XM_PERMUTE_PS( TC0, _MM_SHUFFLE(0, 0, 0, 0) );
  4430. Result = _mm_add_ps(Result, vConstants);
  4431. Result = _mm_mul_ps(Result, x2);
  4432. Result = _mm_add_ps(Result, g_XMOne);
  4433. Result = _mm_mul_ps(Result, x);
  4434. __m128 result1 = _mm_mul_ps(sign, g_XMHalfPi);
  4435. result1 = _mm_sub_ps(result1, Result);
  4436. comp = _mm_cmpeq_ps(sign, g_XMZero);
  4437. select0 = _mm_and_ps(comp, Result);
  4438. select1 = _mm_andnot_ps(comp, result1);
  4439. Result = _mm_or_ps(select0, select1);
  4440. return Result;
  4441. #endif
  4442. }
  4443. //------------------------------------------------------------------------------
  4444. inline XMVECTOR XM_CALLCONV XMVectorATan2
  4445. (
  4446. FXMVECTOR Y,
  4447. FXMVECTOR X
  4448. )
  4449. {
  4450. #if defined(_XM_NO_INTRINSICS_)
  4451. XMVECTORF32 Result = { { {
  4452. atan2f(Y.vector4_f32[0], X.vector4_f32[0]),
  4453. atan2f(Y.vector4_f32[1], X.vector4_f32[1]),
  4454. atan2f(Y.vector4_f32[2], X.vector4_f32[2]),
  4455. atan2f(Y.vector4_f32[3], X.vector4_f32[3])
  4456. } } };
  4457. return Result.v;
  4458. #else
  4459. // Return the inverse tangent of Y / X in the range of -Pi to Pi with the following exceptions:
  4460. // Y == 0 and X is Negative -> Pi with the sign of Y
4461. // Y == 0 and X is Positive -> 0 with the sign of Y
4462. // Y != 0 and X == 0 -> Pi / 2 with the sign of Y
4463. // Y != 0 and X is Negative -> atan(Y/X) + (Pi with the sign of Y)
  4464. // X == -Infinity and Finite Y -> Pi with the sign of Y
  4465. // X == +Infinity and Finite Y -> 0 with the sign of Y
  4466. // Y == Infinity and X is Finite -> Pi / 2 with the sign of Y
  4467. // Y == Infinity and X == -Infinity -> 3Pi / 4 with the sign of Y
  4468. // Y == Infinity and X == +Infinity -> Pi / 4 with the sign of Y
  4469. static const XMVECTORF32 ATan2Constants = { { { XM_PI, XM_PIDIV2, XM_PIDIV4, XM_PI * 3.0f / 4.0f } } };
  4470. XMVECTOR Zero = XMVectorZero();
  4471. XMVECTOR ATanResultValid = XMVectorTrueInt();
  4472. XMVECTOR Pi = XMVectorSplatX(ATan2Constants);
  4473. XMVECTOR PiOverTwo = XMVectorSplatY(ATan2Constants);
  4474. XMVECTOR PiOverFour = XMVectorSplatZ(ATan2Constants);
  4475. XMVECTOR ThreePiOverFour = XMVectorSplatW(ATan2Constants);
  4476. XMVECTOR YEqualsZero = XMVectorEqual(Y, Zero);
  4477. XMVECTOR XEqualsZero = XMVectorEqual(X, Zero);
  4478. XMVECTOR XIsPositive = XMVectorAndInt(X, g_XMNegativeZero.v);
  4479. XIsPositive = XMVectorEqualInt(XIsPositive, Zero);
  4480. XMVECTOR YEqualsInfinity = XMVectorIsInfinite(Y);
  4481. XMVECTOR XEqualsInfinity = XMVectorIsInfinite(X);
  4482. XMVECTOR YSign = XMVectorAndInt(Y, g_XMNegativeZero.v);
  4483. Pi = XMVectorOrInt(Pi, YSign);
  4484. PiOverTwo = XMVectorOrInt(PiOverTwo, YSign);
  4485. PiOverFour = XMVectorOrInt(PiOverFour, YSign);
  4486. ThreePiOverFour = XMVectorOrInt(ThreePiOverFour, YSign);
  4487. XMVECTOR R1 = XMVectorSelect(Pi, YSign, XIsPositive);
  4488. XMVECTOR R2 = XMVectorSelect(ATanResultValid, PiOverTwo, XEqualsZero);
  4489. XMVECTOR R3 = XMVectorSelect(R2, R1, YEqualsZero);
  4490. XMVECTOR R4 = XMVectorSelect(ThreePiOverFour, PiOverFour, XIsPositive);
  4491. XMVECTOR R5 = XMVectorSelect(PiOverTwo, R4, XEqualsInfinity);
  4492. XMVECTOR Result = XMVectorSelect(R3, R5, YEqualsInfinity);
  4493. ATanResultValid = XMVectorEqualInt(Result, ATanResultValid);
  4494. XMVECTOR V = XMVectorDivide(Y, X);
  4495. XMVECTOR R0 = XMVectorATan(V);
  4496. R1 = XMVectorSelect( Pi, g_XMNegativeZero, XIsPositive );
  4497. R2 = XMVectorAdd(R0, R1);
  4498. return XMVectorSelect(Result, R2, ATanResultValid);
  4499. #endif
  4500. }
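// Usage sketch (illustrative only; the helper name is hypothetical). XMVectorATan2
// follows the special-case rules listed above, closely matching atan2f per
// component, so it can recover a heading angle directly from a 2D direction
// vector.
inline float XM_CALLCONV XMExampleHeadingFromDirection(FXMVECTOR Direction)
{
    // atan2(y, x): splat the y and x components and read lane 0 of the result.
    XMVECTOR y = XMVectorSplatY(Direction);
    XMVECTOR x = XMVectorSplatX(Direction);
    return XMVectorGetX(XMVectorATan2(y, x));
}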
  4501. //------------------------------------------------------------------------------
  4502. inline XMVECTOR XM_CALLCONV XMVectorSinEst
  4503. (
  4504. FXMVECTOR V
  4505. )
  4506. {
  4507. // 7-degree minimax approximation
  4508. #if defined(_XM_NO_INTRINSICS_)
  4509. XMVECTORF32 Result = { { {
  4510. sinf(V.vector4_f32[0]),
  4511. sinf(V.vector4_f32[1]),
  4512. sinf(V.vector4_f32[2]),
  4513. sinf(V.vector4_f32[3])
  4514. } } };
  4515. return Result.v;
  4516. #elif defined(_XM_ARM_NEON_INTRINSICS_)
  4517. // Force the value within the bounds of pi
  4518. XMVECTOR x = XMVectorModAngles(V);
  4519. // Map in [-pi/2,pi/2] with sin(y) = sin(x).
  4520. uint32x4_t sign = vandq_u32(x, g_XMNegativeZero);
  4521. uint32x4_t c = vorrq_u32(g_XMPi, sign); // pi when x >= 0, -pi when x < 0
  4522. float32x4_t absx = vabsq_f32( x );
  4523. float32x4_t rflx = vsubq_f32(c, x);
  4524. uint32x4_t comp = vcleq_f32(absx, g_XMHalfPi);
  4525. x = vbslq_f32( comp, x, rflx );
  4526. float32x4_t x2 = vmulq_f32(x, x);
  4527. // Compute polynomial approximation
  4528. const XMVECTOR SEC = g_XMSinCoefficients1;
  4529. XMVECTOR vConstants = vdupq_lane_f32(vget_high_f32(SEC), 0);
  4530. XMVECTOR Result = vmlaq_lane_f32(vConstants, x2, vget_high_f32(SEC), 1);
  4531. vConstants = vdupq_lane_f32(vget_low_f32(SEC), 1);
  4532. Result = vmlaq_f32(vConstants, Result, x2);
  4533. Result = vmlaq_f32(g_XMOne, Result, x2);
  4534. Result = vmulq_f32(Result, x);
  4535. return Result;
  4536. #elif defined(_XM_SSE_INTRINSICS_)
  4537. // Force the value within the bounds of pi
  4538. XMVECTOR x = XMVectorModAngles(V);
  4539. // Map in [-pi/2,pi/2] with sin(y) = sin(x).
  4540. __m128 sign = _mm_and_ps(x, g_XMNegativeZero);
  4541. __m128 c = _mm_or_ps(g_XMPi, sign); // pi when x >= 0, -pi when x < 0
  4542. __m128 absx = _mm_andnot_ps(sign, x); // |x|
  4543. __m128 rflx = _mm_sub_ps(c, x);
  4544. __m128 comp = _mm_cmple_ps(absx, g_XMHalfPi);
  4545. __m128 select0 = _mm_and_ps(comp, x);
  4546. __m128 select1 = _mm_andnot_ps(comp, rflx);
  4547. x = _mm_or_ps(select0, select1);
  4548. __m128 x2 = _mm_mul_ps(x, x);
  4549. // Compute polynomial approximation
  4550. const XMVECTOR SEC = g_XMSinCoefficients1;
  4551. XMVECTOR vConstants = XM_PERMUTE_PS( SEC, _MM_SHUFFLE(3, 3, 3, 3) );
  4552. __m128 Result = _mm_mul_ps(vConstants, x2);
  4553. vConstants = XM_PERMUTE_PS( SEC, _MM_SHUFFLE(2, 2, 2, 2) );
  4554. Result = _mm_add_ps(Result, vConstants);
  4555. Result = _mm_mul_ps(Result, x2);
  4556. vConstants = XM_PERMUTE_PS( SEC, _MM_SHUFFLE(1, 1, 1, 1) );
  4557. Result = _mm_add_ps(Result, vConstants);
  4558. Result = _mm_mul_ps(Result, x2);
  4559. Result = _mm_add_ps(Result, g_XMOne);
  4560. Result = _mm_mul_ps(Result, x);
  4561. return Result;
  4562. #endif
  4563. }
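// Note (illustrative only; the helper name is hypothetical). The Est variants keep
// the same range reduction as their full-precision counterparts but appear to
// evaluate lower-degree polynomials, trading accuracy for speed where a rough
// sine is sufficient.
inline float XMExampleSinEstDegrees(float degrees)
{
    return XMVectorGetX(XMVectorSinEst(XMVectorReplicate(XMConvertToRadians(degrees))));
}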
  4564. //------------------------------------------------------------------------------
  4565. inline XMVECTOR XM_CALLCONV XMVectorCosEst
  4566. (
  4567. FXMVECTOR V
  4568. )
  4569. {
  4570. // 6-degree minimax approximation
  4571. #if defined(_XM_NO_INTRINSICS_)
  4572. XMVECTORF32 Result = { { {
  4573. cosf(V.vector4_f32[0]),
  4574. cosf(V.vector4_f32[1]),
  4575. cosf(V.vector4_f32[2]),
  4576. cosf(V.vector4_f32[3])
  4577. } } };
  4578. return Result.v;
  4579. #elif defined(_XM_ARM_NEON_INTRINSICS_)
  4580. // Map V to x in [-pi,pi].
  4581. XMVECTOR x = XMVectorModAngles(V);
  4582. // Map in [-pi/2,pi/2] with cos(y) = sign*cos(x).
  4583. uint32x4_t sign = vandq_u32(x, g_XMNegativeZero);
  4584. uint32x4_t c = vorrq_u32(g_XMPi, sign); // pi when x >= 0, -pi when x < 0
  4585. float32x4_t absx = vabsq_f32( x );
  4586. float32x4_t rflx = vsubq_f32(c, x);
  4587. uint32x4_t comp = vcleq_f32(absx, g_XMHalfPi);
  4588. x = vbslq_f32( comp, x, rflx );
  4589. sign = vbslq_f32( comp, g_XMOne, g_XMNegativeOne );
  4590. float32x4_t x2 = vmulq_f32(x, x);
  4591. // Compute polynomial approximation
  4592. const XMVECTOR CEC = g_XMCosCoefficients1;
  4593. XMVECTOR vConstants = vdupq_lane_f32(vget_high_f32(CEC), 0);
  4594. XMVECTOR Result = vmlaq_lane_f32(vConstants, x2, vget_high_f32(CEC), 1);
  4595. vConstants = vdupq_lane_f32(vget_low_f32(CEC), 1);
  4596. Result = vmlaq_f32(vConstants, Result, x2);
  4597. Result = vmlaq_f32(g_XMOne, Result, x2);
  4598. Result = vmulq_f32(Result, sign);
  4599. return Result;
  4600. #elif defined(_XM_SSE_INTRINSICS_)
  4601. // Map V to x in [-pi,pi].
  4602. XMVECTOR x = XMVectorModAngles(V);
  4603. // Map in [-pi/2,pi/2] with cos(y) = sign*cos(x).
  4604. XMVECTOR sign = _mm_and_ps(x, g_XMNegativeZero);
  4605. __m128 c = _mm_or_ps(g_XMPi, sign); // pi when x >= 0, -pi when x < 0
  4606. __m128 absx = _mm_andnot_ps(sign, x); // |x|
  4607. __m128 rflx = _mm_sub_ps(c, x);
  4608. __m128 comp = _mm_cmple_ps(absx, g_XMHalfPi);
  4609. __m128 select0 = _mm_and_ps(comp, x);
  4610. __m128 select1 = _mm_andnot_ps(comp, rflx);
  4611. x = _mm_or_ps(select0, select1);
  4612. select0 = _mm_and_ps(comp, g_XMOne);
  4613. select1 = _mm_andnot_ps(comp, g_XMNegativeOne);
  4614. sign = _mm_or_ps(select0, select1);
  4615. __m128 x2 = _mm_mul_ps(x, x);
  4616. // Compute polynomial approximation
  4617. const XMVECTOR CEC = g_XMCosCoefficients1;
  4618. XMVECTOR vConstants = XM_PERMUTE_PS( CEC, _MM_SHUFFLE(3, 3, 3, 3) );
  4619. __m128 Result = _mm_mul_ps(vConstants, x2);
  4620. vConstants = XM_PERMUTE_PS( CEC, _MM_SHUFFLE(2, 2, 2, 2) );
  4621. Result = _mm_add_ps(Result, vConstants);
  4622. Result = _mm_mul_ps(Result, x2);
  4623. vConstants = XM_PERMUTE_PS( CEC, _MM_SHUFFLE(1, 1, 1, 1) );
  4624. Result = _mm_add_ps(Result, vConstants);
  4625. Result = _mm_mul_ps(Result, x2);
  4626. Result = _mm_add_ps(Result, g_XMOne);
  4627. Result = _mm_mul_ps(Result, sign);
  4628. return Result;
  4629. #endif
  4630. }
  4631. //------------------------------------------------------------------------------
  4632. _Use_decl_annotations_
  4633. inline void XM_CALLCONV XMVectorSinCosEst
  4634. (
  4635. XMVECTOR* pSin,
  4636. XMVECTOR* pCos,
  4637. FXMVECTOR V
  4638. )
  4639. {
  4640. assert(pSin != nullptr);
  4641. assert(pCos != nullptr);
  4642. // 7/6-degree minimax approximation
  4643. #if defined(_XM_NO_INTRINSICS_)
  4644. XMVECTORF32 Sin = { { {
  4645. sinf(V.vector4_f32[0]),
  4646. sinf(V.vector4_f32[1]),
  4647. sinf(V.vector4_f32[2]),
  4648. sinf(V.vector4_f32[3])
  4649. } } };
  4650. XMVECTORF32 Cos = { { {
  4651. cosf(V.vector4_f32[0]),
  4652. cosf(V.vector4_f32[1]),
  4653. cosf(V.vector4_f32[2]),
  4654. cosf(V.vector4_f32[3])
  4655. } } };
  4656. *pSin = Sin.v;
  4657. *pCos = Cos.v;
  4658. #elif defined(_XM_ARM_NEON_INTRINSICS_)
  4659. // Force the value within the bounds of pi
  4660. XMVECTOR x = XMVectorModAngles(V);
  4661. // Map in [-pi/2,pi/2] with cos(y) = sign*cos(x).
  4662. uint32x4_t sign = vandq_u32(x, g_XMNegativeZero);
  4663. uint32x4_t c = vorrq_u32(g_XMPi, sign); // pi when x >= 0, -pi when x < 0
  4664. float32x4_t absx = vabsq_f32( x );
  4665. float32x4_t rflx = vsubq_f32(c, x);
  4666. uint32x4_t comp = vcleq_f32(absx, g_XMHalfPi);
  4667. x = vbslq_f32( comp, x, rflx );
  4668. sign = vbslq_f32( comp, g_XMOne, g_XMNegativeOne );
  4669. float32x4_t x2 = vmulq_f32(x, x);
  4670. // Compute polynomial approximation for sine
  4671. const XMVECTOR SEC = g_XMSinCoefficients1;
  4672. XMVECTOR vConstants = vdupq_lane_f32(vget_high_f32(SEC), 0);
  4673. XMVECTOR Result = vmlaq_lane_f32(vConstants, x2, vget_high_f32(SEC), 1);
  4674. vConstants = vdupq_lane_f32(vget_low_f32(SEC), 1);
  4675. Result = vmlaq_f32(vConstants, Result, x2);
  4676. Result = vmlaq_f32(g_XMOne, Result, x2);
  4677. *pSin = vmulq_f32(Result, x);
4678. // Compute polynomial approximation for cosine
  4679. const XMVECTOR CEC = g_XMCosCoefficients1;
  4680. vConstants = vdupq_lane_f32(vget_high_f32(CEC), 0);
  4681. Result = vmlaq_lane_f32(vConstants, x2, vget_high_f32(CEC), 1);
  4682. vConstants = vdupq_lane_f32(vget_low_f32(CEC), 1);
  4683. Result = vmlaq_f32(vConstants, Result, x2);
  4684. Result = vmlaq_f32(g_XMOne, Result, x2);
  4685. *pCos = vmulq_f32(Result, sign);
  4686. #elif defined(_XM_SSE_INTRINSICS_)
  4687. // Force the value within the bounds of pi
  4688. XMVECTOR x = XMVectorModAngles(V);
  4689. // Map in [-pi/2,pi/2] with sin(y) = sin(x), cos(y) = sign*cos(x).
  4690. XMVECTOR sign = _mm_and_ps(x, g_XMNegativeZero);
  4691. __m128 c = _mm_or_ps(g_XMPi, sign); // pi when x >= 0, -pi when x < 0
  4692. __m128 absx = _mm_andnot_ps(sign, x); // |x|
  4693. __m128 rflx = _mm_sub_ps(c, x);
  4694. __m128 comp = _mm_cmple_ps(absx, g_XMHalfPi);
  4695. __m128 select0 = _mm_and_ps(comp, x);
  4696. __m128 select1 = _mm_andnot_ps(comp, rflx);
  4697. x = _mm_or_ps(select0, select1);
  4698. select0 = _mm_and_ps(comp, g_XMOne);
  4699. select1 = _mm_andnot_ps(comp, g_XMNegativeOne);
  4700. sign = _mm_or_ps(select0, select1);
  4701. __m128 x2 = _mm_mul_ps(x, x);
  4702. // Compute polynomial approximation for sine
  4703. const XMVECTOR SEC = g_XMSinCoefficients1;
  4704. XMVECTOR vConstants = XM_PERMUTE_PS( SEC, _MM_SHUFFLE(3, 3, 3, 3) );
  4705. __m128 Result = _mm_mul_ps(vConstants, x2);
  4706. vConstants = XM_PERMUTE_PS( SEC, _MM_SHUFFLE(2, 2, 2, 2) );
  4707. Result = _mm_add_ps(Result, vConstants);
  4708. Result = _mm_mul_ps(Result, x2);
  4709. vConstants = XM_PERMUTE_PS( SEC, _MM_SHUFFLE(1, 1, 1, 1) );
  4710. Result = _mm_add_ps(Result, vConstants);
  4711. Result = _mm_mul_ps(Result, x2);
  4712. Result = _mm_add_ps(Result, g_XMOne);
  4713. Result = _mm_mul_ps(Result, x);
  4714. *pSin = Result;
  4715. // Compute polynomial approximation for cosine
  4716. const XMVECTOR CEC = g_XMCosCoefficients1;
  4717. vConstants = XM_PERMUTE_PS( CEC, _MM_SHUFFLE(3, 3, 3, 3) );
  4718. Result = _mm_mul_ps(vConstants, x2);
  4719. vConstants = XM_PERMUTE_PS( CEC, _MM_SHUFFLE(2, 2, 2, 2) );
  4720. Result = _mm_add_ps(Result, vConstants);
  4721. Result = _mm_mul_ps(Result, x2);
  4722. vConstants = XM_PERMUTE_PS( CEC, _MM_SHUFFLE(1, 1, 1, 1) );
  4723. Result = _mm_add_ps(Result, vConstants);
  4724. Result = _mm_mul_ps(Result, x2);
  4725. Result = _mm_add_ps(Result, g_XMOne);
  4726. Result = _mm_mul_ps(Result, sign);
  4727. *pCos = Result;
  4728. #endif
  4729. }
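// Usage sketch (illustrative only; the helper name is hypothetical).
// XMVectorSinCosEst shares one range reduction and one x*x term between the sine
// and cosine polynomials, so it is cheaper than separate XMVectorSinEst and
// XMVectorCosEst calls when both values are needed, e.g. for building a rotation.
inline void XMExampleSinCosPairEst(float angle, float* pSinOut, float* pCosOut)
{
    XMVECTOR s, c;
    XMVectorSinCosEst(&s, &c, XMVectorReplicate(angle));
    *pSinOut = XMVectorGetX(s);
    *pCosOut = XMVectorGetX(c);
}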
  4730. //------------------------------------------------------------------------------
  4731. inline XMVECTOR XM_CALLCONV XMVectorTanEst
  4732. (
  4733. FXMVECTOR V
  4734. )
  4735. {
  4736. #if defined(_XM_NO_INTRINSICS_)
  4737. XMVECTORF32 Result = { { {
  4738. tanf(V.vector4_f32[0]),
  4739. tanf(V.vector4_f32[1]),
  4740. tanf(V.vector4_f32[2]),
  4741. tanf(V.vector4_f32[3])
  4742. } } };
  4743. return Result.v;
  4744. #else
  4745. XMVECTOR OneOverPi = XMVectorSplatW(g_XMTanEstCoefficients.v);
  4746. XMVECTOR V1 = XMVectorMultiply(V, OneOverPi);
  4747. V1 = XMVectorRound(V1);
  4748. V1 = XMVectorNegativeMultiplySubtract(g_XMPi.v, V1, V);
  4749. XMVECTOR T0 = XMVectorSplatX(g_XMTanEstCoefficients.v);
  4750. XMVECTOR T1 = XMVectorSplatY(g_XMTanEstCoefficients.v);
  4751. XMVECTOR T2 = XMVectorSplatZ(g_XMTanEstCoefficients.v);
  4752. XMVECTOR V2T2 = XMVectorNegativeMultiplySubtract(V1, V1, T2);
  4753. XMVECTOR V2 = XMVectorMultiply(V1, V1);
  4754. XMVECTOR V1T0 = XMVectorMultiply(V1, T0);
  4755. XMVECTOR V1T1 = XMVectorMultiply(V1, T1);
  4756. XMVECTOR D = XMVectorReciprocalEst(V2T2);
  4757. XMVECTOR N = XMVectorMultiplyAdd(V2, V1T1, V1T0);
  4758. return XMVectorMultiply(N, D);
  4759. #endif
  4760. }
  4761. //------------------------------------------------------------------------------
  4762. inline XMVECTOR XM_CALLCONV XMVectorASinEst
  4763. (
  4764. FXMVECTOR V
  4765. )
  4766. {
  4767. // 3-degree minimax approximation
  4768. #if defined(_XM_NO_INTRINSICS_)
  4769. XMVECTORF32 Result;
  4770. Result.f[0] = asinf( V.vector4_f32[0] );
  4771. Result.f[1] = asinf( V.vector4_f32[1] );
  4772. Result.f[2] = asinf( V.vector4_f32[2] );
  4773. Result.f[3] = asinf( V.vector4_f32[3] );
  4774. return Result.v;
  4775. #elif defined(_XM_ARM_NEON_INTRINSICS_)
  4776. uint32x4_t nonnegative = vcgeq_f32(V, g_XMZero);
  4777. float32x4_t x = vabsq_f32(V);
  4778. // Compute (1-|V|), clamp to zero to avoid sqrt of negative number.
  4779. float32x4_t oneMValue = vsubq_f32(g_XMOne, x);
  4780. float32x4_t clampOneMValue = vmaxq_f32(g_XMZero, oneMValue);
  4781. float32x4_t root = XMVectorSqrt(clampOneMValue);
  4782. // Compute polynomial approximation
  4783. const XMVECTOR AEC = g_XMArcEstCoefficients;
  4784. XMVECTOR vConstants = vdupq_lane_f32(vget_high_f32(AEC), 0);
  4785. XMVECTOR t0 = vmlaq_lane_f32( vConstants, x, vget_high_f32(AEC), 1 );
  4786. vConstants = vdupq_lane_f32(vget_low_f32(AEC), 1);
  4787. t0 = vmlaq_f32( vConstants, t0, x );
  4788. vConstants = vdupq_lane_f32(vget_low_f32(AEC), 0);
  4789. t0 = vmlaq_f32( vConstants, t0, x );
  4790. t0 = vmulq_f32(t0, root);
  4791. float32x4_t t1 = vsubq_f32(g_XMPi, t0);
  4792. t0 = vbslq_f32( nonnegative, t0, t1 );
  4793. t0 = vsubq_f32(g_XMHalfPi, t0);
  4794. return t0;
  4795. #elif defined(_XM_SSE_INTRINSICS_)
  4796. __m128 nonnegative = _mm_cmpge_ps(V, g_XMZero);
  4797. __m128 mvalue = _mm_sub_ps(g_XMZero, V);
  4798. __m128 x = _mm_max_ps(V, mvalue); // |V|
  4799. // Compute (1-|V|), clamp to zero to avoid sqrt of negative number.
  4800. __m128 oneMValue = _mm_sub_ps(g_XMOne, x);
  4801. __m128 clampOneMValue = _mm_max_ps(g_XMZero, oneMValue);
  4802. __m128 root = _mm_sqrt_ps(clampOneMValue); // sqrt(1-|V|)
  4803. // Compute polynomial approximation
  4804. const XMVECTOR AEC = g_XMArcEstCoefficients;
  4805. XMVECTOR vConstants = XM_PERMUTE_PS( AEC, _MM_SHUFFLE(3, 3, 3, 3) );
  4806. __m128 t0 = _mm_mul_ps(vConstants, x);
  4807. vConstants = XM_PERMUTE_PS( AEC, _MM_SHUFFLE(2, 2, 2, 2) );
  4808. t0 = _mm_add_ps(t0, vConstants);
  4809. t0 = _mm_mul_ps(t0, x);
  4810. vConstants = XM_PERMUTE_PS( AEC, _MM_SHUFFLE(1, 1, 1, 1) );
  4811. t0 = _mm_add_ps(t0, vConstants);
  4812. t0 = _mm_mul_ps(t0, x);
  4813. vConstants = XM_PERMUTE_PS( AEC, _MM_SHUFFLE(0, 0, 0, 0) );
  4814. t0 = _mm_add_ps(t0, vConstants);
  4815. t0 = _mm_mul_ps(t0, root);
  4816. __m128 t1 = _mm_sub_ps(g_XMPi, t0);
  4817. t0 = _mm_and_ps(nonnegative, t0);
  4818. t1 = _mm_andnot_ps(nonnegative, t1);
  4819. t0 = _mm_or_ps(t0, t1);
  4820. t0 = _mm_sub_ps(g_XMHalfPi, t0);
  4821. return t0;
  4822. #endif
  4823. }
  4824. //------------------------------------------------------------------------------
  4825. inline XMVECTOR XM_CALLCONV XMVectorACosEst
  4826. (
  4827. FXMVECTOR V
  4828. )
  4829. {
  4830. // 3-degree minimax approximation
  4831. #if defined(_XM_NO_INTRINSICS_)
  4832. XMVECTORF32 Result = { { {
  4833. acosf(V.vector4_f32[0]),
  4834. acosf(V.vector4_f32[1]),
  4835. acosf(V.vector4_f32[2]),
  4836. acosf(V.vector4_f32[3])
  4837. } } };
  4838. return Result.v;
  4839. #elif defined(_XM_ARM_NEON_INTRINSICS_)
  4840. uint32x4_t nonnegative = vcgeq_f32(V, g_XMZero);
  4841. float32x4_t x = vabsq_f32(V);
  4842. // Compute (1-|V|), clamp to zero to avoid sqrt of negative number.
  4843. float32x4_t oneMValue = vsubq_f32(g_XMOne, x);
  4844. float32x4_t clampOneMValue = vmaxq_f32(g_XMZero, oneMValue);
  4845. float32x4_t root = XMVectorSqrt(clampOneMValue);
  4846. // Compute polynomial approximation
  4847. const XMVECTOR AEC = g_XMArcEstCoefficients;
  4848. XMVECTOR vConstants = vdupq_lane_f32(vget_high_f32(AEC), 0);
  4849. XMVECTOR t0 = vmlaq_lane_f32( vConstants, x, vget_high_f32(AEC), 1 );
  4850. vConstants = vdupq_lane_f32(vget_low_f32(AEC), 1);
  4851. t0 = vmlaq_f32( vConstants, t0, x );
  4852. vConstants = vdupq_lane_f32(vget_low_f32(AEC), 0);
  4853. t0 = vmlaq_f32( vConstants, t0, x );
  4854. t0 = vmulq_f32(t0, root);
  4855. float32x4_t t1 = vsubq_f32(g_XMPi, t0);
  4856. t0 = vbslq_f32( nonnegative, t0, t1 );
  4857. return t0;
  4858. #elif defined(_XM_SSE_INTRINSICS_)
  4859. __m128 nonnegative = _mm_cmpge_ps(V, g_XMZero);
  4860. __m128 mvalue = _mm_sub_ps(g_XMZero, V);
  4861. __m128 x = _mm_max_ps(V, mvalue); // |V|
  4862. // Compute (1-|V|), clamp to zero to avoid sqrt of negative number.
  4863. __m128 oneMValue = _mm_sub_ps(g_XMOne, x);
  4864. __m128 clampOneMValue = _mm_max_ps(g_XMZero, oneMValue);
  4865. __m128 root = _mm_sqrt_ps(clampOneMValue); // sqrt(1-|V|)
  4866. // Compute polynomial approximation
  4867. const XMVECTOR AEC = g_XMArcEstCoefficients;
  4868. XMVECTOR vConstants = XM_PERMUTE_PS( AEC, _MM_SHUFFLE(3, 3, 3, 3) );
  4869. __m128 t0 = _mm_mul_ps(vConstants, x);
  4870. vConstants = XM_PERMUTE_PS( AEC, _MM_SHUFFLE(2, 2, 2, 2) );
  4871. t0 = _mm_add_ps(t0, vConstants);
  4872. t0 = _mm_mul_ps(t0, x);
  4873. vConstants = XM_PERMUTE_PS( AEC, _MM_SHUFFLE(1, 1, 1, 1) );
  4874. t0 = _mm_add_ps(t0, vConstants);
  4875. t0 = _mm_mul_ps(t0, x);
  4876. vConstants = XM_PERMUTE_PS( AEC, _MM_SHUFFLE(0, 0, 0, 0) );
  4877. t0 = _mm_add_ps(t0, vConstants);
  4878. t0 = _mm_mul_ps(t0, root);
  4879. __m128 t1 = _mm_sub_ps(g_XMPi, t0);
  4880. t0 = _mm_and_ps(nonnegative, t0);
  4881. t1 = _mm_andnot_ps(nonnegative, t1);
  4882. t0 = _mm_or_ps(t0, t1);
  4883. return t0;
  4884. #endif
  4885. }
  4886. //------------------------------------------------------------------------------
  4887. inline XMVECTOR XM_CALLCONV XMVectorATanEst
  4888. (
  4889. FXMVECTOR V
  4890. )
  4891. {
  4892. // 9-degree minimax approximation
  4893. #if defined(_XM_NO_INTRINSICS_)
  4894. XMVECTORF32 Result = { { {
  4895. atanf(V.vector4_f32[0]),
  4896. atanf(V.vector4_f32[1]),
  4897. atanf(V.vector4_f32[2]),
  4898. atanf(V.vector4_f32[3])
  4899. } } };
  4900. return Result.v;
  4901. #elif defined(_XM_ARM_NEON_INTRINSICS_)
  4902. float32x4_t absV = vabsq_f32(V);
  4903. float32x4_t invV = XMVectorReciprocalEst(V);
  4904. uint32x4_t comp = vcgtq_f32(V, g_XMOne);
  4905. uint32x4_t sign = vbslq_f32(comp, g_XMOne, g_XMNegativeOne );
  4906. comp = vcleq_f32(absV, g_XMOne);
  4907. sign = vbslq_f32(comp, g_XMZero, sign );
  4908. uint32x4_t x = vbslq_f32(comp, V, invV );
  4909. float32x4_t x2 = vmulq_f32(x, x);
  4910. // Compute polynomial approximation
  4911. const XMVECTOR AEC = g_XMATanEstCoefficients1;
  4912. XMVECTOR vConstants = vdupq_lane_f32(vget_high_f32(AEC), 0);
  4913. XMVECTOR Result = vmlaq_lane_f32( vConstants, x2, vget_high_f32(AEC), 1 );
  4914. vConstants = vdupq_lane_f32(vget_low_f32(AEC), 1);
  4915. Result = vmlaq_f32( vConstants, Result, x2 );
  4916. vConstants = vdupq_lane_f32(vget_low_f32( AEC), 0);
  4917. Result = vmlaq_f32( vConstants, Result, x2 );
  4918. // ATanEstCoefficients0 is already splatted
  4919. Result = vmlaq_f32( g_XMATanEstCoefficients0, Result, x2 );
  4920. Result = vmulq_f32( Result, x );
  4921. float32x4_t result1 = vmulq_f32(sign, g_XMHalfPi);
  4922. result1 = vsubq_f32(result1, Result);
  4923. comp = vceqq_f32(sign, g_XMZero);
  4924. Result = vbslq_f32( comp, Result, result1 );
  4925. return Result;
  4926. #elif defined(_XM_SSE_INTRINSICS_)
  4927. __m128 absV = XMVectorAbs(V);
  4928. __m128 invV = _mm_div_ps(g_XMOne, V);
  4929. __m128 comp = _mm_cmpgt_ps(V, g_XMOne);
  4930. __m128 select0 = _mm_and_ps(comp, g_XMOne);
  4931. __m128 select1 = _mm_andnot_ps(comp, g_XMNegativeOne);
  4932. __m128 sign = _mm_or_ps(select0, select1);
  4933. comp = _mm_cmple_ps(absV, g_XMOne);
  4934. select0 = _mm_and_ps(comp, g_XMZero);
  4935. select1 = _mm_andnot_ps(comp, sign);
  4936. sign = _mm_or_ps(select0, select1);
  4937. select0 = _mm_and_ps(comp, V);
  4938. select1 = _mm_andnot_ps(comp, invV);
  4939. __m128 x = _mm_or_ps(select0, select1);
  4940. __m128 x2 = _mm_mul_ps(x, x);
  4941. // Compute polynomial approximation
  4942. const XMVECTOR AEC = g_XMATanEstCoefficients1;
  4943. XMVECTOR vConstants = XM_PERMUTE_PS( AEC, _MM_SHUFFLE(3, 3, 3, 3) );
  4944. __m128 Result = _mm_mul_ps(vConstants, x2);
  4945. vConstants = XM_PERMUTE_PS( AEC, _MM_SHUFFLE(2, 2, 2, 2) );
  4946. Result = _mm_add_ps(Result, vConstants);
  4947. Result = _mm_mul_ps(Result, x2);
  4948. vConstants = XM_PERMUTE_PS( AEC, _MM_SHUFFLE(1, 1, 1, 1) );
  4949. Result = _mm_add_ps(Result, vConstants);
  4950. Result = _mm_mul_ps(Result, x2);
  4951. vConstants = XM_PERMUTE_PS( AEC, _MM_SHUFFLE(0, 0, 0, 0) );
  4952. Result = _mm_add_ps(Result, vConstants);
  4953. Result = _mm_mul_ps(Result, x2);
  4954. // ATanEstCoefficients0 is already splatted
  4955. Result = _mm_add_ps(Result, g_XMATanEstCoefficients0);
  4956. Result = _mm_mul_ps(Result, x);
  4957. __m128 result1 = _mm_mul_ps(sign, g_XMHalfPi);
  4958. result1 = _mm_sub_ps(result1, Result);
  4959. comp = _mm_cmpeq_ps(sign, g_XMZero);
  4960. select0 = _mm_and_ps(comp, Result);
  4961. select1 = _mm_andnot_ps(comp, result1);
  4962. Result = _mm_or_ps(select0, select1);
  4963. return Result;
  4964. #endif
  4965. }
  4966. //------------------------------------------------------------------------------
  4967. inline XMVECTOR XM_CALLCONV XMVectorATan2Est
  4968. (
  4969. FXMVECTOR Y,
  4970. FXMVECTOR X
  4971. )
  4972. {
  4973. #if defined(_XM_NO_INTRINSICS_)
  4974. XMVECTORF32 Result = { { {
  4975. atan2f(Y.vector4_f32[0], X.vector4_f32[0]),
  4976. atan2f(Y.vector4_f32[1], X.vector4_f32[1]),
  4977. atan2f(Y.vector4_f32[2], X.vector4_f32[2]),
  4978. atan2f(Y.vector4_f32[3], X.vector4_f32[3]),
  4979. } } };
  4980. return Result.v;
  4981. #else
  4982. static const XMVECTORF32 ATan2Constants = { { { XM_PI, XM_PIDIV2, XM_PIDIV4, 2.3561944905f /* Pi*3/4 */ } } };
  4983. const XMVECTOR Zero = XMVectorZero();
  4984. XMVECTOR ATanResultValid = XMVectorTrueInt();
  4985. XMVECTOR Pi = XMVectorSplatX(ATan2Constants);
  4986. XMVECTOR PiOverTwo = XMVectorSplatY(ATan2Constants);
  4987. XMVECTOR PiOverFour = XMVectorSplatZ(ATan2Constants);
  4988. XMVECTOR ThreePiOverFour = XMVectorSplatW(ATan2Constants);
  4989. XMVECTOR YEqualsZero = XMVectorEqual(Y, Zero);
  4990. XMVECTOR XEqualsZero = XMVectorEqual(X, Zero);
  4991. XMVECTOR XIsPositive = XMVectorAndInt(X, g_XMNegativeZero.v);
  4992. XIsPositive = XMVectorEqualInt(XIsPositive, Zero);
  4993. XMVECTOR YEqualsInfinity = XMVectorIsInfinite(Y);
  4994. XMVECTOR XEqualsInfinity = XMVectorIsInfinite(X);
  4995. XMVECTOR YSign = XMVectorAndInt(Y, g_XMNegativeZero.v);
  4996. Pi = XMVectorOrInt(Pi, YSign);
  4997. PiOverTwo = XMVectorOrInt(PiOverTwo, YSign);
  4998. PiOverFour = XMVectorOrInt(PiOverFour, YSign);
  4999. ThreePiOverFour = XMVectorOrInt(ThreePiOverFour, YSign);
  5000. XMVECTOR R1 = XMVectorSelect(Pi, YSign, XIsPositive);
  5001. XMVECTOR R2 = XMVectorSelect(ATanResultValid, PiOverTwo, XEqualsZero);
  5002. XMVECTOR R3 = XMVectorSelect(R2, R1, YEqualsZero);
  5003. XMVECTOR R4 = XMVectorSelect(ThreePiOverFour, PiOverFour, XIsPositive);
  5004. XMVECTOR R5 = XMVectorSelect(PiOverTwo, R4, XEqualsInfinity);
  5005. XMVECTOR Result = XMVectorSelect(R3, R5, YEqualsInfinity);
  5006. ATanResultValid = XMVectorEqualInt(Result, ATanResultValid);
  5007. XMVECTOR Reciprocal = XMVectorReciprocalEst(X);
  5008. XMVECTOR V = XMVectorMultiply(Y, Reciprocal);
  5009. XMVECTOR R0 = XMVectorATanEst(V);
  5010. R1 = XMVectorSelect( Pi, g_XMNegativeZero, XIsPositive );
  5011. R2 = XMVectorAdd(R0, R1);
  5012. Result = XMVectorSelect(Result, R2, ATanResultValid);
  5013. return Result;
  5014. #endif
  5015. }
  5016. //------------------------------------------------------------------------------
  5017. inline XMVECTOR XM_CALLCONV XMVectorLerp
  5018. (
  5019. FXMVECTOR V0,
  5020. FXMVECTOR V1,
  5021. float t
  5022. )
  5023. {
  5024. // V0 + t * (V1 - V0)
  5025. #if defined(_XM_NO_INTRINSICS_)
  5026. XMVECTOR Scale = XMVectorReplicate(t);
  5027. XMVECTOR Length = XMVectorSubtract(V1, V0);
  5028. return XMVectorMultiplyAdd(Length, Scale, V0);
  5029. #elif defined(_XM_ARM_NEON_INTRINSICS_)
  5030. XMVECTOR L = vsubq_f32( V1, V0 );
  5031. return vmlaq_n_f32( V0, L, t );
  5032. #elif defined(_XM_SSE_INTRINSICS_)
  5033. XMVECTOR L = _mm_sub_ps( V1, V0 );
  5034. XMVECTOR S = _mm_set_ps1( t );
  5035. XMVECTOR Result = _mm_mul_ps( L, S );
  5036. return _mm_add_ps( Result, V0 );
  5037. #endif
  5038. }
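// Usage sketch (illustrative only; the helper name is hypothetical). XMVectorLerp
// computes V0 + t * (V1 - V0) per component, so t = 0 yields V0, t = 1 yields V1,
// and values outside [0, 1] extrapolate along the same line.
inline XMVECTOR XM_CALLCONV XMExampleMidpoint(FXMVECTOR P0, FXMVECTOR P1)
{
    return XMVectorLerp(P0, P1, 0.5f);
}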
  5039. //------------------------------------------------------------------------------
  5040. inline XMVECTOR XM_CALLCONV XMVectorLerpV
  5041. (
  5042. FXMVECTOR V0,
  5043. FXMVECTOR V1,
  5044. FXMVECTOR T
  5045. )
  5046. {
  5047. // V0 + T * (V1 - V0)
  5048. #if defined(_XM_NO_INTRINSICS_)
  5049. XMVECTOR Length = XMVectorSubtract(V1, V0);
  5050. return XMVectorMultiplyAdd(Length, T, V0);
  5051. #elif defined(_XM_ARM_NEON_INTRINSICS_)
  5052. XMVECTOR L = vsubq_f32( V1, V0 );
  5053. return vmlaq_f32( V0, L, T );
  5054. #elif defined(_XM_SSE_INTRINSICS_)
  5055. XMVECTOR Length = _mm_sub_ps( V1, V0 );
  5056. XMVECTOR Result = _mm_mul_ps( Length, T );
  5057. return _mm_add_ps( Result, V0 );
  5058. #endif
  5059. }
  5060. //------------------------------------------------------------------------------
  5061. inline XMVECTOR XM_CALLCONV XMVectorHermite
  5062. (
  5063. FXMVECTOR Position0,
  5064. FXMVECTOR Tangent0,
  5065. FXMVECTOR Position1,
  5066. GXMVECTOR Tangent1,
  5067. float t
  5068. )
  5069. {
  5070. // Result = (2 * t^3 - 3 * t^2 + 1) * Position0 +
  5071. // (t^3 - 2 * t^2 + t) * Tangent0 +
  5072. // (-2 * t^3 + 3 * t^2) * Position1 +
  5073. // (t^3 - t^2) * Tangent1
  5074. #if defined(_XM_NO_INTRINSICS_)
  5075. float t2 = t * t;
  5076. float t3 = t * t2;
  5077. XMVECTOR P0 = XMVectorReplicate(2.0f * t3 - 3.0f * t2 + 1.0f);
  5078. XMVECTOR T0 = XMVectorReplicate(t3 - 2.0f * t2 + t);
  5079. XMVECTOR P1 = XMVectorReplicate(-2.0f * t3 + 3.0f * t2);
  5080. XMVECTOR T1 = XMVectorReplicate(t3 - t2);
  5081. XMVECTOR Result = XMVectorMultiply(P0, Position0);
  5082. Result = XMVectorMultiplyAdd(T0, Tangent0, Result);
  5083. Result = XMVectorMultiplyAdd(P1, Position1, Result);
  5084. Result = XMVectorMultiplyAdd(T1, Tangent1, Result);
  5085. return Result;
  5086. #elif defined(_XM_ARM_NEON_INTRINSICS_)
  5087. float t2 = t * t;
  5088. float t3 = t * t2;
  5089. float p0 = 2.0f * t3 - 3.0f * t2 + 1.0f;
  5090. float t0 = t3 - 2.0f * t2 + t;
  5091. float p1 = -2.0f * t3 + 3.0f * t2;
  5092. float t1 = t3 - t2;
  5093. XMVECTOR vResult = vmulq_n_f32(Position0, p0 );
  5094. vResult = vmlaq_n_f32( vResult, Tangent0, t0 );
  5095. vResult = vmlaq_n_f32( vResult, Position1, p1 );
  5096. vResult = vmlaq_n_f32( vResult, Tangent1, t1 );
  5097. return vResult;
  5098. #elif defined(_XM_SSE_INTRINSICS_)
  5099. float t2 = t * t;
  5100. float t3 = t * t2;
  5101. XMVECTOR P0 = _mm_set_ps1(2.0f * t3 - 3.0f * t2 + 1.0f);
  5102. XMVECTOR T0 = _mm_set_ps1(t3 - 2.0f * t2 + t);
  5103. XMVECTOR P1 = _mm_set_ps1(-2.0f * t3 + 3.0f * t2);
  5104. XMVECTOR T1 = _mm_set_ps1(t3 - t2);
  5105. XMVECTOR vResult = _mm_mul_ps(P0, Position0);
  5106. XMVECTOR vTemp = _mm_mul_ps(T0, Tangent0);
  5107. vResult = _mm_add_ps(vResult,vTemp);
  5108. vTemp = _mm_mul_ps(P1, Position1);
  5109. vResult = _mm_add_ps(vResult,vTemp);
  5110. vTemp = _mm_mul_ps(T1, Tangent1);
  5111. vResult = _mm_add_ps(vResult,vTemp);
  5112. return vResult;
  5113. #endif
  5114. }
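// Usage sketch (illustrative only; the helper name and uniform sampling are
// hypothetical). The basis weights above return Position0 at t = 0 and Position1
// at t = 1, with Tangent0 and Tangent1 setting the departure and arrival
// directions; sampling evenly spaced t values traces points along the segment.
inline void XM_CALLCONV XMExampleSampleHermite
(
    FXMVECTOR Position0,
    FXMVECTOR Tangent0,
    FXMVECTOR Position1,
    GXMVECTOR Tangent1,
    XMFLOAT3* pPoints,
    size_t count
)
{
    for (size_t i = 0; i < count; ++i)
    {
        float t = (count > 1) ? float(i) / float(count - 1) : 0.0f;
        XMStoreFloat3(&pPoints[i], XMVectorHermite(Position0, Tangent0, Position1, Tangent1, t));
    }
}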
  5115. //------------------------------------------------------------------------------
  5116. inline XMVECTOR XM_CALLCONV XMVectorHermiteV
  5117. (
  5118. FXMVECTOR Position0,
  5119. FXMVECTOR Tangent0,
  5120. FXMVECTOR Position1,
  5121. GXMVECTOR Tangent1,
  5122. HXMVECTOR T
  5123. )
  5124. {
  5125. // Result = (2 * t^3 - 3 * t^2 + 1) * Position0 +
  5126. // (t^3 - 2 * t^2 + t) * Tangent0 +
  5127. // (-2 * t^3 + 3 * t^2) * Position1 +
  5128. // (t^3 - t^2) * Tangent1
  5129. #if defined(_XM_NO_INTRINSICS_)
  5130. XMVECTOR T2 = XMVectorMultiply(T, T);
  5131. XMVECTOR T3 = XMVectorMultiply(T , T2);
  5132. XMVECTOR P0 = XMVectorReplicate(2.0f * T3.vector4_f32[0] - 3.0f * T2.vector4_f32[0] + 1.0f);
  5133. XMVECTOR T0 = XMVectorReplicate(T3.vector4_f32[1] - 2.0f * T2.vector4_f32[1] + T.vector4_f32[1]);
  5134. XMVECTOR P1 = XMVectorReplicate(-2.0f * T3.vector4_f32[2] + 3.0f * T2.vector4_f32[2]);
  5135. XMVECTOR T1 = XMVectorReplicate(T3.vector4_f32[3] - T2.vector4_f32[3]);
  5136. XMVECTOR Result = XMVectorMultiply(P0, Position0);
  5137. Result = XMVectorMultiplyAdd(T0, Tangent0, Result);
  5138. Result = XMVectorMultiplyAdd(P1, Position1, Result);
  5139. Result = XMVectorMultiplyAdd(T1, Tangent1, Result);
  5140. return Result;
  5141. #elif defined(_XM_ARM_NEON_INTRINSICS_)
  5142. static const XMVECTORF32 CatMulT2 = { { { -3.0f, -2.0f, 3.0f, -1.0f } } };
  5143. static const XMVECTORF32 CatMulT3 = { { { 2.0f, 1.0f, -2.0f, 1.0f } } };
  5144. XMVECTOR T2 = vmulq_f32(T,T);
  5145. XMVECTOR T3 = vmulq_f32(T,T2);
  5146. // Mul by the constants against t^2
  5147. T2 = vmulq_f32(T2,CatMulT2);
  5148. // Mul by the constants against t^3
  5149. T3 = vmlaq_f32(T2, T3, CatMulT3 );
  5150. // T3 now has the pre-result.
  5151. // I need to add t.y only
  5152. T2 = vandq_u32(T,g_XMMaskY);
  5153. T3 = vaddq_f32(T3,T2);
  5154. // Add 1.0f to x
  5155. T3 = vaddq_f32(T3,g_XMIdentityR0);
  5156. // Now, I have the constants created
  5157. // Mul the x constant to Position0
  5158. XMVECTOR vResult = vmulq_lane_f32( Position0, vget_low_f32( T3 ), 0 ); // T3[0]
  5159. // Mul the y constant to Tangent0
  5160. vResult = vmlaq_lane_f32(vResult, Tangent0, vget_low_f32( T3 ), 1 ); // T3[1]
  5161. // Mul the z constant to Position1
  5162. vResult = vmlaq_lane_f32(vResult, Position1, vget_high_f32( T3 ), 0 ); // T3[2]
  5163. // Mul the w constant to Tangent1
  5164. vResult = vmlaq_lane_f32(vResult, Tangent1, vget_high_f32( T3 ), 1 ); // T3[3]
  5165. return vResult;
  5166. #elif defined(_XM_SSE_INTRINSICS_)
  5167. static const XMVECTORF32 CatMulT2 = { { { -3.0f, -2.0f, 3.0f, -1.0f } } };
  5168. static const XMVECTORF32 CatMulT3 = { { { 2.0f, 1.0f, -2.0f, 1.0f } } };
  5169. XMVECTOR T2 = _mm_mul_ps(T,T);
  5170. XMVECTOR T3 = _mm_mul_ps(T,T2);
  5171. // Mul by the constants against t^2
  5172. T2 = _mm_mul_ps(T2,CatMulT2);
  5173. // Mul by the constants against t^3
  5174. T3 = _mm_mul_ps(T3,CatMulT3);
  5175. // T3 now has the pre-result.
  5176. T3 = _mm_add_ps(T3,T2);
  5177. // I need to add t.y only
  5178. T2 = _mm_and_ps(T,g_XMMaskY);
  5179. T3 = _mm_add_ps(T3,T2);
  5180. // Add 1.0f to x
  5181. T3 = _mm_add_ps(T3,g_XMIdentityR0);
  5182. // Now, I have the constants created
  5183. // Mul the x constant to Position0
  5184. XMVECTOR vResult = XM_PERMUTE_PS(T3,_MM_SHUFFLE(0,0,0,0));
  5185. vResult = _mm_mul_ps(vResult,Position0);
  5186. // Mul the y constant to Tangent0
  5187. T2 = XM_PERMUTE_PS(T3,_MM_SHUFFLE(1,1,1,1));
  5188. T2 = _mm_mul_ps(T2,Tangent0);
  5189. vResult = _mm_add_ps(vResult,T2);
  5190. // Mul the z constant to Position1
  5191. T2 = XM_PERMUTE_PS(T3,_MM_SHUFFLE(2,2,2,2));
  5192. T2 = _mm_mul_ps(T2,Position1);
  5193. vResult = _mm_add_ps(vResult,T2);
  5194. // Mul the w constant to Tangent1
  5195. T3 = XM_PERMUTE_PS(T3,_MM_SHUFFLE(3,3,3,3));
  5196. T3 = _mm_mul_ps(T3,Tangent1);
  5197. vResult = _mm_add_ps(vResult,T3);
  5198. return vResult;
  5199. #endif
  5200. }
  5201. //------------------------------------------------------------------------------
  5202. inline XMVECTOR XM_CALLCONV XMVectorCatmullRom
  5203. (
  5204. FXMVECTOR Position0,
  5205. FXMVECTOR Position1,
  5206. FXMVECTOR Position2,
  5207. GXMVECTOR Position3,
  5208. float t
  5209. )
  5210. {
  5211. // Result = ((-t^3 + 2 * t^2 - t) * Position0 +
  5212. // (3 * t^3 - 5 * t^2 + 2) * Position1 +
  5213. // (-3 * t^3 + 4 * t^2 + t) * Position2 +
  5214. // (t^3 - t^2) * Position3) * 0.5
  5215. #if defined(_XM_NO_INTRINSICS_)
  5216. float t2 = t * t;
  5217. float t3 = t * t2;
  5218. XMVECTOR P0 = XMVectorReplicate((-t3 + 2.0f * t2 - t) * 0.5f);
  5219. XMVECTOR P1 = XMVectorReplicate((3.0f * t3 - 5.0f * t2 + 2.0f) * 0.5f);
  5220. XMVECTOR P2 = XMVectorReplicate((-3.0f * t3 + 4.0f * t2 + t) * 0.5f);
  5221. XMVECTOR P3 = XMVectorReplicate((t3 - t2) * 0.5f);
  5222. XMVECTOR Result = XMVectorMultiply(P0, Position0);
  5223. Result = XMVectorMultiplyAdd(P1, Position1, Result);
  5224. Result = XMVectorMultiplyAdd(P2, Position2, Result);
  5225. Result = XMVectorMultiplyAdd(P3, Position3, Result);
  5226. return Result;
  5227. #elif defined(_XM_ARM_NEON_INTRINSICS_)
  5228. float t2 = t * t;
  5229. float t3 = t * t2;
  5230. float p0 = (-t3 + 2.0f * t2 - t) * 0.5f;
  5231. float p1 = (3.0f * t3 - 5.0f * t2 + 2.0f) * 0.5f;
  5232. float p2 = (-3.0f * t3 + 4.0f * t2 + t) * 0.5f;
  5233. float p3 = (t3 - t2) * 0.5f;
  5234. XMVECTOR P1 = vmulq_n_f32(Position1, p1);
  5235. XMVECTOR P0 = vmlaq_n_f32(P1, Position0, p0);
  5236. XMVECTOR P3 = vmulq_n_f32(Position3, p3);
  5237. XMVECTOR P2 = vmlaq_n_f32(P3, Position2, p2);
  5238. P0 = vaddq_f32(P0,P2);
  5239. return P0;
  5240. #elif defined(_XM_SSE_INTRINSICS_)
  5241. float t2 = t * t;
  5242. float t3 = t * t2;
  5243. XMVECTOR P0 = _mm_set_ps1((-t3 + 2.0f * t2 - t) * 0.5f);
  5244. XMVECTOR P1 = _mm_set_ps1((3.0f * t3 - 5.0f * t2 + 2.0f) * 0.5f);
  5245. XMVECTOR P2 = _mm_set_ps1((-3.0f * t3 + 4.0f * t2 + t) * 0.5f);
  5246. XMVECTOR P3 = _mm_set_ps1((t3 - t2) * 0.5f);
  5247. P0 = _mm_mul_ps(P0, Position0);
  5248. P1 = _mm_mul_ps(P1, Position1);
  5249. P2 = _mm_mul_ps(P2, Position2);
  5250. P3 = _mm_mul_ps(P3, Position3);
  5251. P0 = _mm_add_ps(P0,P1);
  5252. P2 = _mm_add_ps(P2,P3);
  5253. P0 = _mm_add_ps(P0,P2);
  5254. return P0;
  5255. #endif
  5256. }
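// Usage sketch (illustrative only; the helper name is hypothetical). The weights
// above evaluate to Position1 at t = 0 and Position2 at t = 1, so a Catmull-Rom
// segment interpolates its two middle control points while Position0 and
// Position3 only shape the tangents; walking overlapping quadruples of waypoints
// therefore produces a smooth path through every interior waypoint.
inline XMVECTOR XM_CALLCONV XMExampleCatmullRomMidpoint
(
    FXMVECTOR Position0,
    FXMVECTOR Position1,
    FXMVECTOR Position2,
    GXMVECTOR Position3
)
{
    // Evaluate the segment halfway between Position1 and Position2.
    return XMVectorCatmullRom(Position0, Position1, Position2, Position3, 0.5f);
}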
  5257. //------------------------------------------------------------------------------
  5258. inline XMVECTOR XM_CALLCONV XMVectorCatmullRomV
  5259. (
  5260. FXMVECTOR Position0,
  5261. FXMVECTOR Position1,
  5262. FXMVECTOR Position2,
  5263. GXMVECTOR Position3,
  5264. HXMVECTOR T
  5265. )
  5266. {
  5267. #if defined(_XM_NO_INTRINSICS_)
  5268. float fx = T.vector4_f32[0];
  5269. float fy = T.vector4_f32[1];
  5270. float fz = T.vector4_f32[2];
  5271. float fw = T.vector4_f32[3];
  5272. XMVECTORF32 vResult = { { {
  5273. 0.5f*((-fx*fx*fx + 2 * fx*fx - fx)*Position0.vector4_f32[0]
  5274. + (3 * fx*fx*fx - 5 * fx*fx + 2)*Position1.vector4_f32[0]
  5275. + (-3 * fx*fx*fx + 4 * fx*fx + fx)*Position2.vector4_f32[0]
  5276. + (fx*fx*fx - fx*fx)*Position3.vector4_f32[0]),
  5277. 0.5f*((-fy*fy*fy + 2 * fy*fy - fy)*Position0.vector4_f32[1]
  5278. + (3 * fy*fy*fy - 5 * fy*fy + 2)*Position1.vector4_f32[1]
  5279. + (-3 * fy*fy*fy + 4 * fy*fy + fy)*Position2.vector4_f32[1]
  5280. + (fy*fy*fy - fy*fy)*Position3.vector4_f32[1]),
  5281. 0.5f*((-fz*fz*fz + 2 * fz*fz - fz)*Position0.vector4_f32[2]
  5282. + (3 * fz*fz*fz - 5 * fz*fz + 2)*Position1.vector4_f32[2]
  5283. + (-3 * fz*fz*fz + 4 * fz*fz + fz)*Position2.vector4_f32[2]
  5284. + (fz*fz*fz - fz*fz)*Position3.vector4_f32[2]),
  5285. 0.5f*((-fw*fw*fw + 2 * fw*fw - fw)*Position0.vector4_f32[3]
  5286. + (3 * fw*fw*fw - 5 * fw*fw + 2)*Position1.vector4_f32[3]
  5287. + (-3 * fw*fw*fw + 4 * fw*fw + fw)*Position2.vector4_f32[3]
  5288. + (fw*fw*fw - fw*fw)*Position3.vector4_f32[3])
  5289. } } };
  5290. return vResult.v;
  5291. #elif defined(_XM_ARM_NEON_INTRINSICS_)
  5292. static const XMVECTORF32 Catmul2 = { { { 2.0f, 2.0f, 2.0f, 2.0f } } };
  5293. static const XMVECTORF32 Catmul3 = { { { 3.0f, 3.0f, 3.0f, 3.0f } } };
    static const XMVECTORF32 Catmul4 = { { { 4.0f, 4.0f, 4.0f, 4.0f } } };
    static const XMVECTORF32 Catmul5 = { { { 5.0f, 5.0f, 5.0f, 5.0f } } };
    // Cache T^2 and T^3
    XMVECTOR T2 = vmulq_f32(T,T);
    XMVECTOR T3 = vmulq_f32(T,T2);
    // Perform the Position0 term
    XMVECTOR vResult = vaddq_f32(T2,T2);
    vResult = vsubq_f32(vResult,T);
    vResult = vsubq_f32(vResult,T3);
    vResult = vmulq_f32(vResult,Position0);
    // Perform the Position1 term and add
    XMVECTOR vTemp = vmulq_f32(T3,Catmul3);
    vTemp = vmlsq_f32(vTemp, T2, Catmul5);
    vTemp = vaddq_f32(vTemp,Catmul2);
    vResult = vmlaq_f32(vResult, vTemp, Position1);
    // Perform the Position2 term and add
    vTemp = vmulq_f32(T2,Catmul4);
    vTemp = vmlsq_f32(vTemp, T3, Catmul3);
    vTemp = vaddq_f32(vTemp,T);
    vResult = vmlaq_f32(vResult, vTemp, Position2);
    // Position3 is the last term
    T3 = vsubq_f32(T3,T2);
    vResult = vmlaq_f32(vResult, T3, Position3);
    // Multiply by 0.5f and exit
    vResult = vmulq_f32(vResult,g_XMOneHalf);
    return vResult;
#elif defined(_XM_SSE_INTRINSICS_)
    static const XMVECTORF32 Catmul2 = { { { 2.0f, 2.0f, 2.0f, 2.0f } } };
    static const XMVECTORF32 Catmul3 = { { { 3.0f, 3.0f, 3.0f, 3.0f } } };
    static const XMVECTORF32 Catmul4 = { { { 4.0f, 4.0f, 4.0f, 4.0f } } };
    static const XMVECTORF32 Catmul5 = { { { 5.0f, 5.0f, 5.0f, 5.0f } } };
    // Cache T^2 and T^3
    XMVECTOR T2 = _mm_mul_ps(T,T);
    XMVECTOR T3 = _mm_mul_ps(T,T2);
    // Perform the Position0 term
    XMVECTOR vResult = _mm_add_ps(T2,T2);
    vResult = _mm_sub_ps(vResult,T);
    vResult = _mm_sub_ps(vResult,T3);
    vResult = _mm_mul_ps(vResult,Position0);
    // Perform the Position1 term and add
    XMVECTOR vTemp = _mm_mul_ps(T3,Catmul3);
    XMVECTOR vTemp2 = _mm_mul_ps(T2,Catmul5);
    vTemp = _mm_sub_ps(vTemp,vTemp2);
    vTemp = _mm_add_ps(vTemp,Catmul2);
    vTemp = _mm_mul_ps(vTemp,Position1);
    vResult = _mm_add_ps(vResult,vTemp);
    // Perform the Position2 term and add
    vTemp = _mm_mul_ps(T2,Catmul4);
    vTemp2 = _mm_mul_ps(T3,Catmul3);
    vTemp = _mm_sub_ps(vTemp,vTemp2);
    vTemp = _mm_add_ps(vTemp,T);
    vTemp = _mm_mul_ps(vTemp,Position2);
    vResult = _mm_add_ps(vResult,vTemp);
    // Position3 is the last term
    T3 = _mm_sub_ps(T3,T2);
    T3 = _mm_mul_ps(T3,Position3);
    vResult = _mm_add_ps(vResult,T3);
    // Multiply by 0.5f and exit
    vResult = _mm_mul_ps(vResult,g_XMOneHalf);
    return vResult;
#endif
}
//------------------------------------------------------------------------------
inline XMVECTOR XM_CALLCONV XMVectorBaryCentric
(
    FXMVECTOR Position0,
    FXMVECTOR Position1,
    FXMVECTOR Position2,
    float f,
    float g
)
{
    // Result = Position0 + f * (Position1 - Position0) + g * (Position2 - Position0)
#if defined(_XM_NO_INTRINSICS_)
    XMVECTOR P10 = XMVectorSubtract(Position1, Position0);
    XMVECTOR ScaleF = XMVectorReplicate(f);
    XMVECTOR P20 = XMVectorSubtract(Position2, Position0);
    XMVECTOR ScaleG = XMVectorReplicate(g);
    XMVECTOR Result = XMVectorMultiplyAdd(P10, ScaleF, Position0);
    Result = XMVectorMultiplyAdd(P20, ScaleG, Result);
    return Result;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    XMVECTOR R1 = vsubq_f32(Position1,Position0);
    XMVECTOR R2 = vsubq_f32(Position2,Position0);
    R1 = vmlaq_n_f32( Position0, R1, f);
    return vmlaq_n_f32( R1, R2, g );
#elif defined(_XM_SSE_INTRINSICS_)
    XMVECTOR R1 = _mm_sub_ps(Position1,Position0);
    XMVECTOR SF = _mm_set_ps1(f);
    XMVECTOR R2 = _mm_sub_ps(Position2,Position0);
    XMVECTOR SG = _mm_set_ps1(g);
    R1 = _mm_mul_ps(R1,SF);
    R2 = _mm_mul_ps(R2,SG);
    R1 = _mm_add_ps(R1,Position0);
    R1 = _mm_add_ps(R1,R2);
    return R1;
#endif
}
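// Illustrative usage (a minimal sketch, not part of the library; the point names are
// hypothetical): interpolating a position inside a triangle with barycentric weights.
//
//     XMVECTOR A = XMVectorSet(0.0f, 0.0f, 0.0f, 0.0f);
//     XMVECTOR B = XMVectorSet(1.0f, 0.0f, 0.0f, 0.0f);
//     XMVECTOR C = XMVectorSet(0.0f, 1.0f, 0.0f, 0.0f);
//     // f and g weight B and C respectively; the implicit weight on A is 1 - f - g.
//     XMVECTOR P = XMVectorBaryCentric(A, B, C, 0.25f, 0.25f); // P = (0.25, 0.25, 0, 0)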
//------------------------------------------------------------------------------
inline XMVECTOR XM_CALLCONV XMVectorBaryCentricV
(
    FXMVECTOR Position0,
    FXMVECTOR Position1,
    FXMVECTOR Position2,
    GXMVECTOR F,
    HXMVECTOR G
)
{
    // Result = Position0 + f * (Position1 - Position0) + g * (Position2 - Position0)
#if defined(_XM_NO_INTRINSICS_)
    XMVECTOR P10 = XMVectorSubtract(Position1, Position0);
    XMVECTOR P20 = XMVectorSubtract(Position2, Position0);
    XMVECTOR Result = XMVectorMultiplyAdd(P10, F, Position0);
    Result = XMVectorMultiplyAdd(P20, G, Result);
    return Result;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    XMVECTOR R1 = vsubq_f32(Position1,Position0);
    XMVECTOR R2 = vsubq_f32(Position2,Position0);
    R1 = vmlaq_f32( Position0, R1, F );
    return vmlaq_f32( R1, R2, G);
#elif defined(_XM_SSE_INTRINSICS_)
    XMVECTOR R1 = _mm_sub_ps(Position1,Position0);
    XMVECTOR R2 = _mm_sub_ps(Position2,Position0);
    R1 = _mm_mul_ps(R1,F);
    R2 = _mm_mul_ps(R2,G);
    R1 = _mm_add_ps(R1,Position0);
    R1 = _mm_add_ps(R1,R2);
    return R1;
#endif
}
/****************************************************************************
 *
 * 2D Vector
 *
 ****************************************************************************/
//------------------------------------------------------------------------------
// Comparison operations
//------------------------------------------------------------------------------
//------------------------------------------------------------------------------
inline bool XM_CALLCONV XMVector2Equal
(
    FXMVECTOR V1,
    FXMVECTOR V2
)
{
#if defined(_XM_NO_INTRINSICS_)
    return (((V1.vector4_f32[0] == V2.vector4_f32[0]) && (V1.vector4_f32[1] == V2.vector4_f32[1])) != 0);
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    uint32x2_t vTemp = vceq_f32( vget_low_f32(V1), vget_low_f32(V2) );
    return ( vget_lane_u64( vTemp, 0 ) == 0xFFFFFFFFFFFFFFFFU );
#elif defined(_XM_SSE_INTRINSICS_)
    XMVECTOR vTemp = _mm_cmpeq_ps(V1,V2);
    // z and w are don't care
    return (((_mm_movemask_ps(vTemp)&3)==3) != 0);
#endif
}
//------------------------------------------------------------------------------
inline uint32_t XM_CALLCONV XMVector2EqualR
(
    FXMVECTOR V1,
    FXMVECTOR V2
)
{
#if defined(_XM_NO_INTRINSICS_)
    uint32_t CR = 0;
    if ((V1.vector4_f32[0] == V2.vector4_f32[0]) &&
        (V1.vector4_f32[1] == V2.vector4_f32[1]))
    {
        CR = XM_CRMASK_CR6TRUE;
    }
    else if ((V1.vector4_f32[0] != V2.vector4_f32[0]) &&
        (V1.vector4_f32[1] != V2.vector4_f32[1]))
    {
        CR = XM_CRMASK_CR6FALSE;
    }
    return CR;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    uint32x2_t vTemp = vceq_f32( vget_low_f32(V1), vget_low_f32(V2) );
    uint64_t r = vget_lane_u64( vTemp, 0 );
    uint32_t CR = 0;
    if ( r == 0xFFFFFFFFFFFFFFFFU )
    {
        CR = XM_CRMASK_CR6TRUE;
    }
    else if ( !r )
    {
        CR = XM_CRMASK_CR6FALSE;
    }
    return CR;
#elif defined(_XM_SSE_INTRINSICS_)
    XMVECTOR vTemp = _mm_cmpeq_ps(V1,V2);
    // z and w are don't care
    int iTest = _mm_movemask_ps(vTemp)&3;
    uint32_t CR = 0;
    if (iTest==3)
    {
        CR = XM_CRMASK_CR6TRUE;
    }
    else if (!iTest)
    {
        CR = XM_CRMASK_CR6FALSE;
    }
    return CR;
#endif
}
//------------------------------------------------------------------------------
inline bool XM_CALLCONV XMVector2EqualInt
(
    FXMVECTOR V1,
    FXMVECTOR V2
)
{
#if defined(_XM_NO_INTRINSICS_)
    return (((V1.vector4_u32[0] == V2.vector4_u32[0]) && (V1.vector4_u32[1] == V2.vector4_u32[1])) != 0);
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    uint32x2_t vTemp = vceq_u32( vget_low_u32(V1), vget_low_u32(V2) );
    return ( vget_lane_u64( vTemp, 0 ) == 0xFFFFFFFFFFFFFFFFU );
#elif defined(_XM_SSE_INTRINSICS_)
    __m128i vTemp = _mm_cmpeq_epi32(_mm_castps_si128(V1),_mm_castps_si128(V2));
    return (((_mm_movemask_ps(_mm_castsi128_ps(vTemp))&3)==3) != 0);
#endif
}
//------------------------------------------------------------------------------
inline uint32_t XM_CALLCONV XMVector2EqualIntR
(
    FXMVECTOR V1,
    FXMVECTOR V2
)
{
#if defined(_XM_NO_INTRINSICS_)
    uint32_t CR = 0;
    if ((V1.vector4_u32[0] == V2.vector4_u32[0]) &&
        (V1.vector4_u32[1] == V2.vector4_u32[1]))
    {
        CR = XM_CRMASK_CR6TRUE;
    }
    else if ((V1.vector4_u32[0] != V2.vector4_u32[0]) &&
        (V1.vector4_u32[1] != V2.vector4_u32[1]))
    {
        CR = XM_CRMASK_CR6FALSE;
    }
    return CR;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    uint32x2_t vTemp = vceq_u32( vget_low_u32(V1), vget_low_u32(V2) );
    uint64_t r = vget_lane_u64( vTemp, 0 );
    uint32_t CR = 0;
    if ( r == 0xFFFFFFFFFFFFFFFFU )
    {
        CR = XM_CRMASK_CR6TRUE;
    }
    else if ( !r )
    {
        CR = XM_CRMASK_CR6FALSE;
    }
    return CR;
#elif defined(_XM_SSE_INTRINSICS_)
    __m128i vTemp = _mm_cmpeq_epi32(_mm_castps_si128(V1),_mm_castps_si128(V2));
    int iTest = _mm_movemask_ps(_mm_castsi128_ps(vTemp))&3;
    uint32_t CR = 0;
    if (iTest==3)
    {
        CR = XM_CRMASK_CR6TRUE;
    }
    else if (!iTest)
    {
        CR = XM_CRMASK_CR6FALSE;
    }
    return CR;
#endif
}
//------------------------------------------------------------------------------
inline bool XM_CALLCONV XMVector2NearEqual
(
    FXMVECTOR V1,
    FXMVECTOR V2,
    FXMVECTOR Epsilon
)
{
#if defined(_XM_NO_INTRINSICS_)
    float dx = fabsf(V1.vector4_f32[0]-V2.vector4_f32[0]);
    float dy = fabsf(V1.vector4_f32[1]-V2.vector4_f32[1]);
    return ((dx <= Epsilon.vector4_f32[0]) &&
            (dy <= Epsilon.vector4_f32[1]));
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    float32x2_t vDelta = vsub_f32(vget_low_f32(V1), vget_low_f32(V2));
    uint32x2_t vTemp = vacle_f32( vDelta, vget_low_f32(Epsilon) );
    uint64_t r = vget_lane_u64( vTemp, 0 );
    return ( r == 0xFFFFFFFFFFFFFFFFU );
#elif defined(_XM_SSE_INTRINSICS_)
    // Get the difference
    XMVECTOR vDelta = _mm_sub_ps(V1,V2);
    // Get the absolute value of the difference
    XMVECTOR vTemp = _mm_setzero_ps();
    vTemp = _mm_sub_ps(vTemp,vDelta);
    vTemp = _mm_max_ps(vTemp,vDelta);
    vTemp = _mm_cmple_ps(vTemp,Epsilon);
    // z and w are don't care
    return (((_mm_movemask_ps(vTemp)&3)==0x3) != 0);
#endif
}
//------------------------------------------------------------------------------
inline bool XM_CALLCONV XMVector2NotEqual
(
    FXMVECTOR V1,
    FXMVECTOR V2
)
{
#if defined(_XM_NO_INTRINSICS_)
    return (((V1.vector4_f32[0] != V2.vector4_f32[0]) || (V1.vector4_f32[1] != V2.vector4_f32[1])) != 0);
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    uint32x2_t vTemp = vceq_f32( vget_low_f32(V1), vget_low_f32(V2) );
    return ( vget_lane_u64( vTemp, 0 ) != 0xFFFFFFFFFFFFFFFFU );
#elif defined(_XM_SSE_INTRINSICS_)
    XMVECTOR vTemp = _mm_cmpeq_ps(V1,V2);
    // z and w are don't care
    return (((_mm_movemask_ps(vTemp)&3)!=3) != 0);
#endif
}
//------------------------------------------------------------------------------
inline bool XM_CALLCONV XMVector2NotEqualInt
(
    FXMVECTOR V1,
    FXMVECTOR V2
)
{
#if defined(_XM_NO_INTRINSICS_)
    return (((V1.vector4_u32[0] != V2.vector4_u32[0]) || (V1.vector4_u32[1] != V2.vector4_u32[1])) != 0);
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    uint32x2_t vTemp = vceq_u32( vget_low_u32(V1), vget_low_u32(V2) );
    return ( vget_lane_u64( vTemp, 0 ) != 0xFFFFFFFFFFFFFFFFU );
#elif defined(_XM_SSE_INTRINSICS_)
    __m128i vTemp = _mm_cmpeq_epi32(_mm_castps_si128(V1),_mm_castps_si128(V2));
    return (((_mm_movemask_ps(_mm_castsi128_ps(vTemp))&3)!=3) != 0);
#endif
}
//------------------------------------------------------------------------------
inline bool XM_CALLCONV XMVector2Greater
(
    FXMVECTOR V1,
    FXMVECTOR V2
)
{
#if defined(_XM_NO_INTRINSICS_)
    return (((V1.vector4_f32[0] > V2.vector4_f32[0]) && (V1.vector4_f32[1] > V2.vector4_f32[1])) != 0);
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    uint32x2_t vTemp = vcgt_f32( vget_low_f32(V1), vget_low_f32(V2) );
    return ( vget_lane_u64( vTemp, 0 ) == 0xFFFFFFFFFFFFFFFFU );
#elif defined(_XM_SSE_INTRINSICS_)
    XMVECTOR vTemp = _mm_cmpgt_ps(V1,V2);
    // z and w are don't care
    return (((_mm_movemask_ps(vTemp)&3)==3) != 0);
#endif
}
//------------------------------------------------------------------------------
inline uint32_t XM_CALLCONV XMVector2GreaterR
(
    FXMVECTOR V1,
    FXMVECTOR V2
)
{
#if defined(_XM_NO_INTRINSICS_)
    uint32_t CR = 0;
    if ((V1.vector4_f32[0] > V2.vector4_f32[0]) &&
        (V1.vector4_f32[1] > V2.vector4_f32[1]))
    {
        CR = XM_CRMASK_CR6TRUE;
    }
    else if ((V1.vector4_f32[0] <= V2.vector4_f32[0]) &&
        (V1.vector4_f32[1] <= V2.vector4_f32[1]))
    {
        CR = XM_CRMASK_CR6FALSE;
    }
    return CR;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    uint32x2_t vTemp = vcgt_f32( vget_low_f32(V1), vget_low_f32(V2) );
    uint64_t r = vget_lane_u64( vTemp, 0 );
    uint32_t CR = 0;
    if ( r == 0xFFFFFFFFFFFFFFFFU )
    {
        CR = XM_CRMASK_CR6TRUE;
    }
    else if ( !r )
    {
        CR = XM_CRMASK_CR6FALSE;
    }
    return CR;
#elif defined(_XM_SSE_INTRINSICS_)
    XMVECTOR vTemp = _mm_cmpgt_ps(V1,V2);
    int iTest = _mm_movemask_ps(vTemp)&3;
    uint32_t CR = 0;
    if (iTest==3)
    {
        CR = XM_CRMASK_CR6TRUE;
    }
    else if (!iTest)
    {
        CR = XM_CRMASK_CR6FALSE;
    }
    return CR;
#endif
}
//------------------------------------------------------------------------------
inline bool XM_CALLCONV XMVector2GreaterOrEqual
(
    FXMVECTOR V1,
    FXMVECTOR V2
)
{
#if defined(_XM_NO_INTRINSICS_)
    return (((V1.vector4_f32[0] >= V2.vector4_f32[0]) && (V1.vector4_f32[1] >= V2.vector4_f32[1])) != 0);
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    uint32x2_t vTemp = vcge_f32( vget_low_f32(V1), vget_low_f32(V2) );
    return ( vget_lane_u64( vTemp, 0 ) == 0xFFFFFFFFFFFFFFFFU );
#elif defined(_XM_SSE_INTRINSICS_)
    XMVECTOR vTemp = _mm_cmpge_ps(V1,V2);
    return (((_mm_movemask_ps(vTemp)&3)==3) != 0);
#endif
}
//------------------------------------------------------------------------------
inline uint32_t XM_CALLCONV XMVector2GreaterOrEqualR
(
    FXMVECTOR V1,
    FXMVECTOR V2
)
{
#if defined(_XM_NO_INTRINSICS_)
    uint32_t CR = 0;
    if ((V1.vector4_f32[0] >= V2.vector4_f32[0]) &&
        (V1.vector4_f32[1] >= V2.vector4_f32[1]))
    {
        CR = XM_CRMASK_CR6TRUE;
    }
    else if ((V1.vector4_f32[0] < V2.vector4_f32[0]) &&
        (V1.vector4_f32[1] < V2.vector4_f32[1]))
    {
        CR = XM_CRMASK_CR6FALSE;
    }
    return CR;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    uint32x2_t vTemp = vcge_f32( vget_low_f32(V1), vget_low_f32(V2) );
    uint64_t r = vget_lane_u64( vTemp, 0 );
    uint32_t CR = 0;
    if ( r == 0xFFFFFFFFFFFFFFFFU )
    {
        CR = XM_CRMASK_CR6TRUE;
    }
    else if ( !r )
    {
        CR = XM_CRMASK_CR6FALSE;
    }
    return CR;
#elif defined(_XM_SSE_INTRINSICS_)
    XMVECTOR vTemp = _mm_cmpge_ps(V1,V2);
    int iTest = _mm_movemask_ps(vTemp)&3;
    uint32_t CR = 0;
    if (iTest == 3)
    {
        CR = XM_CRMASK_CR6TRUE;
    }
    else if (!iTest)
    {
        CR = XM_CRMASK_CR6FALSE;
    }
    return CR;
#endif
}
//------------------------------------------------------------------------------
inline bool XM_CALLCONV XMVector2Less
(
    FXMVECTOR V1,
    FXMVECTOR V2
)
{
#if defined(_XM_NO_INTRINSICS_)
    return (((V1.vector4_f32[0] < V2.vector4_f32[0]) && (V1.vector4_f32[1] < V2.vector4_f32[1])) != 0);
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    uint32x2_t vTemp = vclt_f32( vget_low_f32(V1), vget_low_f32(V2) );
    return ( vget_lane_u64( vTemp, 0 ) == 0xFFFFFFFFFFFFFFFFU );
#elif defined(_XM_SSE_INTRINSICS_)
    XMVECTOR vTemp = _mm_cmplt_ps(V1,V2);
    return (((_mm_movemask_ps(vTemp)&3)==3) != 0);
#endif
}
//------------------------------------------------------------------------------
inline bool XM_CALLCONV XMVector2LessOrEqual
(
    FXMVECTOR V1,
    FXMVECTOR V2
)
{
#if defined(_XM_NO_INTRINSICS_)
    return (((V1.vector4_f32[0] <= V2.vector4_f32[0]) && (V1.vector4_f32[1] <= V2.vector4_f32[1])) != 0);
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    uint32x2_t vTemp = vcle_f32( vget_low_f32(V1), vget_low_f32(V2) );
    return ( vget_lane_u64( vTemp, 0 ) == 0xFFFFFFFFFFFFFFFFU );
#elif defined(_XM_SSE_INTRINSICS_)
    XMVECTOR vTemp = _mm_cmple_ps(V1,V2);
    return (((_mm_movemask_ps(vTemp)&3)==3) != 0);
#endif
}
//------------------------------------------------------------------------------
inline bool XM_CALLCONV XMVector2InBounds
(
    FXMVECTOR V,
    FXMVECTOR Bounds
)
{
#if defined(_XM_NO_INTRINSICS_)
    return (((V.vector4_f32[0] <= Bounds.vector4_f32[0] && V.vector4_f32[0] >= -Bounds.vector4_f32[0]) &&
        (V.vector4_f32[1] <= Bounds.vector4_f32[1] && V.vector4_f32[1] >= -Bounds.vector4_f32[1])) != 0);
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    float32x2_t VL = vget_low_f32( V );
    float32x2_t B = vget_low_f32( Bounds );
    // Test if less than or equal
    uint32x2_t ivTemp1 = vcle_f32(VL,B);
    // Negate the bounds
    float32x2_t vTemp2 = vneg_f32(B);
    // Test if greater or equal (Reversed)
    uint32x2_t ivTemp2 = vcle_f32(vTemp2,VL);
    // Blend answers
    ivTemp1 = vand_u32(ivTemp1,ivTemp2);
    // x and y in bounds?
    return ( vget_lane_u64( ivTemp1, 0 ) == 0xFFFFFFFFFFFFFFFFU );
#elif defined(_XM_SSE_INTRINSICS_)
    // Test if less than or equal
    XMVECTOR vTemp1 = _mm_cmple_ps(V,Bounds);
    // Negate the bounds
    XMVECTOR vTemp2 = _mm_mul_ps(Bounds,g_XMNegativeOne);
    // Test if greater or equal (Reversed)
    vTemp2 = _mm_cmple_ps(vTemp2,V);
    // Blend answers
    vTemp1 = _mm_and_ps(vTemp1,vTemp2);
    // x and y in bounds? (z and w are don't care)
    return (((_mm_movemask_ps(vTemp1)&0x3)==0x3) != 0);
#endif
}
//------------------------------------------------------------------------------
inline bool XM_CALLCONV XMVector2IsNaN
(
    FXMVECTOR V
)
{
#if defined(_XM_NO_INTRINSICS_)
    return (XMISNAN(V.vector4_f32[0]) ||
            XMISNAN(V.vector4_f32[1]));
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    float32x2_t VL = vget_low_f32( V );
    // Test against itself. NaN is always not equal
    uint32x2_t vTempNan = vceq_f32( VL, VL );
    // If x or y are NaN, the mask is zero
    return ( vget_lane_u64( vTempNan, 0 ) != 0xFFFFFFFFFFFFFFFFU );
#elif defined(_XM_SSE_INTRINSICS_)
    // Test against itself. NaN is always not equal
    XMVECTOR vTempNan = _mm_cmpneq_ps(V,V);
    // If x or y are NaN, the mask is non-zero
    return ((_mm_movemask_ps(vTempNan)&3) != 0);
#endif
}
//------------------------------------------------------------------------------
inline bool XM_CALLCONV XMVector2IsInfinite
(
    FXMVECTOR V
)
{
#if defined(_XM_NO_INTRINSICS_)
    return (XMISINF(V.vector4_f32[0]) ||
            XMISINF(V.vector4_f32[1]));
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    // Mask off the sign bit
    uint32x2_t vTemp = vand_u32( vget_low_f32( V ) , vget_low_f32( g_XMAbsMask ) );
    // Compare to infinity
    vTemp = vceq_f32(vTemp, vget_low_f32( g_XMInfinity) );
    // If any are infinity, the signs are true.
    return vget_lane_u64( vTemp, 0 ) != 0;
#elif defined(_XM_SSE_INTRINSICS_)
    // Mask off the sign bit
    __m128 vTemp = _mm_and_ps(V,g_XMAbsMask);
    // Compare to infinity
    vTemp = _mm_cmpeq_ps(vTemp,g_XMInfinity);
    // If x or y are infinity, the signs are true.
    return ((_mm_movemask_ps(vTemp)&3) != 0);
#endif
}
//------------------------------------------------------------------------------
// Computation operations
//------------------------------------------------------------------------------
//------------------------------------------------------------------------------
inline XMVECTOR XM_CALLCONV XMVector2Dot
(
    FXMVECTOR V1,
    FXMVECTOR V2
)
{
#if defined(_XM_NO_INTRINSICS_)
    XMVECTORF32 Result;
    Result.f[0] =
        Result.f[1] =
        Result.f[2] =
        Result.f[3] = V1.vector4_f32[0] * V2.vector4_f32[0] + V1.vector4_f32[1] * V2.vector4_f32[1];
    return Result.v;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    // Perform the dot product on x and y
    float32x2_t vTemp = vmul_f32( vget_low_f32(V1), vget_low_f32(V2) );
    vTemp = vpadd_f32( vTemp, vTemp );
    return vcombine_f32( vTemp, vTemp );
#elif defined(_XM_SSE4_INTRINSICS_)
    return _mm_dp_ps( V1, V2, 0x3f );
#elif defined(_XM_SSE3_INTRINSICS_)
    XMVECTOR vDot = _mm_mul_ps(V1, V2);
    vDot = _mm_hadd_ps(vDot, vDot);
    vDot = _mm_moveldup_ps(vDot);
    return vDot;
#elif defined(_XM_SSE_INTRINSICS_)
    // Perform the dot product on x and y
    XMVECTOR vLengthSq = _mm_mul_ps(V1,V2);
    // vTemp has y splatted
    XMVECTOR vTemp = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(1,1,1,1));
    // x+y
    vLengthSq = _mm_add_ss(vLengthSq,vTemp);
    vLengthSq = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(0,0,0,0));
    return vLengthSq;
#endif
}
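// Illustrative usage (a minimal sketch; the vectors are hypothetical): the x/y dot product
// is replicated into all four lanes, so XMVectorGetX() recovers the scalar value.
//
//     XMVECTOR a = XMVectorSet(3.0f, 4.0f, 0.0f, 0.0f);
//     XMVECTOR b = XMVectorSet(1.0f, 2.0f, 0.0f, 0.0f);
//     float dot = XMVectorGetX(XMVector2Dot(a, b)); // 3*1 + 4*2 = 11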
//------------------------------------------------------------------------------
inline XMVECTOR XM_CALLCONV XMVector2Cross
(
    FXMVECTOR V1,
    FXMVECTOR V2
)
{
    // [ V1.x*V2.y - V1.y*V2.x, V1.x*V2.y - V1.y*V2.x ]
#if defined(_XM_NO_INTRINSICS_)
    float fCross = (V1.vector4_f32[0] * V2.vector4_f32[1]) - (V1.vector4_f32[1] * V2.vector4_f32[0]);
    XMVECTORF32 vResult;
    vResult.f[0] =
        vResult.f[1] =
        vResult.f[2] =
        vResult.f[3] = fCross;
    return vResult.v;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    static const XMVECTORF32 Negate = { { { 1.f, -1.f, 0, 0 } } };
    float32x2_t vTemp = vmul_f32( vget_low_f32( V1 ), vrev64_f32( vget_low_f32( V2 ) ) );
    vTemp = vmul_f32( vTemp, vget_low_f32( Negate ) );
    vTemp = vpadd_f32( vTemp, vTemp );
    return vcombine_f32( vTemp, vTemp );
#elif defined(_XM_SSE_INTRINSICS_)
    // Swap x and y
    XMVECTOR vResult = XM_PERMUTE_PS(V2,_MM_SHUFFLE(0,1,0,1));
    // Perform the muls
    vResult = _mm_mul_ps(vResult,V1);
    // Splat y
    XMVECTOR vTemp = XM_PERMUTE_PS(vResult,_MM_SHUFFLE(1,1,1,1));
    // Sub the values
    vResult = _mm_sub_ss(vResult,vTemp);
    // Splat the cross product
    vResult = XM_PERMUTE_PS(vResult,_MM_SHUFFLE(0,0,0,0));
    return vResult;
#endif
}
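// Illustrative usage (a minimal sketch; the vectors are hypothetical): the 2D "cross
// product" is the scalar V1.x*V2.y - V1.y*V2.x replicated into every lane; its sign tells
// which side of V1 the vector V2 lies on.
//
//     XMVECTOR xAxis = XMVectorSet(1.0f, 0.0f, 0.0f, 0.0f);
//     XMVECTOR yAxis = XMVectorSet(0.0f, 1.0f, 0.0f, 0.0f);
//     float z = XMVectorGetX(XMVector2Cross(xAxis, yAxis)); // +1: yAxis is counter-clockwise from xAxis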
//------------------------------------------------------------------------------
inline XMVECTOR XM_CALLCONV XMVector2LengthSq
(
    FXMVECTOR V
)
{
    return XMVector2Dot(V, V);
}
//------------------------------------------------------------------------------
inline XMVECTOR XM_CALLCONV XMVector2ReciprocalLengthEst
(
    FXMVECTOR V
)
{
#if defined(_XM_NO_INTRINSICS_)
    XMVECTOR Result;
    Result = XMVector2LengthSq(V);
    Result = XMVectorReciprocalSqrtEst(Result);
    return Result;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    float32x2_t VL = vget_low_f32(V);
    // Dot2
    float32x2_t vTemp = vmul_f32( VL, VL );
    vTemp = vpadd_f32( vTemp, vTemp );
    // Reciprocal sqrt (estimate)
    vTemp = vrsqrte_f32( vTemp );
    return vcombine_f32( vTemp, vTemp );
#elif defined(_XM_SSE4_INTRINSICS_)
    XMVECTOR vTemp = _mm_dp_ps( V, V, 0x3f );
    return _mm_rsqrt_ps( vTemp );
#elif defined(_XM_SSE3_INTRINSICS_)
    XMVECTOR vLengthSq = _mm_mul_ps(V, V);
    XMVECTOR vTemp = _mm_hadd_ps(vLengthSq, vLengthSq);
    vLengthSq = _mm_rsqrt_ss(vTemp);
    vLengthSq = XM_PERMUTE_PS(vLengthSq, _MM_SHUFFLE(0, 0, 0, 0));
    return vLengthSq;
#elif defined(_XM_SSE_INTRINSICS_)
    // Perform the dot product on x and y
    XMVECTOR vLengthSq = _mm_mul_ps(V,V);
    // vTemp has y splatted
    XMVECTOR vTemp = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(1,1,1,1));
    // x+y
    vLengthSq = _mm_add_ss(vLengthSq,vTemp);
    vLengthSq = _mm_rsqrt_ss(vLengthSq);
    vLengthSq = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(0,0,0,0));
    return vLengthSq;
#endif
}
//------------------------------------------------------------------------------
inline XMVECTOR XM_CALLCONV XMVector2ReciprocalLength
(
    FXMVECTOR V
)
{
#if defined(_XM_NO_INTRINSICS_)
    XMVECTOR Result;
    Result = XMVector2LengthSq(V);
    Result = XMVectorReciprocalSqrt(Result);
    return Result;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    float32x2_t VL = vget_low_f32(V);
    // Dot2
    float32x2_t vTemp = vmul_f32( VL, VL );
    vTemp = vpadd_f32( vTemp, vTemp );
    // Reciprocal sqrt
    float32x2_t S0 = vrsqrte_f32(vTemp);
    float32x2_t P0 = vmul_f32( vTemp, S0 );
    float32x2_t R0 = vrsqrts_f32( P0, S0 );
    float32x2_t S1 = vmul_f32( S0, R0 );
    float32x2_t P1 = vmul_f32( vTemp, S1 );
    float32x2_t R1 = vrsqrts_f32( P1, S1 );
    float32x2_t Result = vmul_f32( S1, R1 );
    return vcombine_f32( Result, Result );
#elif defined(_XM_SSE4_INTRINSICS_)
    XMVECTOR vTemp = _mm_dp_ps( V, V, 0x3f );
    XMVECTOR vLengthSq = _mm_sqrt_ps( vTemp );
    return _mm_div_ps( g_XMOne, vLengthSq );
#elif defined(_XM_SSE3_INTRINSICS_)
    XMVECTOR vLengthSq = _mm_mul_ps(V,V);
    XMVECTOR vTemp = _mm_hadd_ps(vLengthSq, vLengthSq);
    vLengthSq = _mm_sqrt_ss(vTemp);
    vLengthSq = _mm_div_ss(g_XMOne, vLengthSq);
    vLengthSq = XM_PERMUTE_PS(vLengthSq, _MM_SHUFFLE(0, 0, 0, 0));
    return vLengthSq;
#elif defined(_XM_SSE_INTRINSICS_)
    // Perform the dot product on x and y
    XMVECTOR vLengthSq = _mm_mul_ps(V,V);
    // vTemp has y splatted
    XMVECTOR vTemp = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(1,1,1,1));
    // x+y
    vLengthSq = _mm_add_ss(vLengthSq,vTemp);
    vLengthSq = _mm_sqrt_ss(vLengthSq);
    vLengthSq = _mm_div_ss(g_XMOne,vLengthSq);
    vLengthSq = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(0,0,0,0));
    return vLengthSq;
#endif
}
//------------------------------------------------------------------------------
inline XMVECTOR XM_CALLCONV XMVector2LengthEst
(
    FXMVECTOR V
)
{
#if defined(_XM_NO_INTRINSICS_)
    XMVECTOR Result;
    Result = XMVector2LengthSq(V);
    Result = XMVectorSqrtEst(Result);
    return Result;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    float32x2_t VL = vget_low_f32(V);
    // Dot2
    float32x2_t vTemp = vmul_f32( VL, VL );
    vTemp = vpadd_f32( vTemp, vTemp );
    const float32x2_t zero = vdup_n_f32(0);
    uint32x2_t VEqualsZero = vceq_f32( vTemp, zero );
    // Sqrt (estimate)
    float32x2_t Result = vrsqrte_f32( vTemp );
    Result = vmul_f32( vTemp, Result );
    Result = vbsl_f32( VEqualsZero, zero, Result );
    return vcombine_f32( Result, Result );
#elif defined(_XM_SSE4_INTRINSICS_)
    XMVECTOR vTemp = _mm_dp_ps( V, V, 0x3f );
    return _mm_sqrt_ps( vTemp );
#elif defined(_XM_SSE3_INTRINSICS_)
    XMVECTOR vLengthSq = _mm_mul_ps(V, V);
    XMVECTOR vTemp = _mm_hadd_ps(vLengthSq, vLengthSq);
    vLengthSq = _mm_sqrt_ss(vTemp);
    vLengthSq = XM_PERMUTE_PS(vLengthSq, _MM_SHUFFLE(0, 0, 0, 0));
    return vLengthSq;
#elif defined(_XM_SSE_INTRINSICS_)
    // Perform the dot product on x and y
    XMVECTOR vLengthSq = _mm_mul_ps(V,V);
    // vTemp has y splatted
    XMVECTOR vTemp = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(1,1,1,1));
    // x+y
    vLengthSq = _mm_add_ss(vLengthSq,vTemp);
    vLengthSq = _mm_sqrt_ss(vLengthSq);
    vLengthSq = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(0,0,0,0));
    return vLengthSq;
#endif
}
//------------------------------------------------------------------------------
inline XMVECTOR XM_CALLCONV XMVector2Length
(
    FXMVECTOR V
)
{
#if defined(_XM_NO_INTRINSICS_)
    XMVECTOR Result;
    Result = XMVector2LengthSq(V);
    Result = XMVectorSqrt(Result);
    return Result;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    float32x2_t VL = vget_low_f32(V);
    // Dot2
    float32x2_t vTemp = vmul_f32( VL, VL );
    vTemp = vpadd_f32( vTemp, vTemp );
    const float32x2_t zero = vdup_n_f32(0);
    uint32x2_t VEqualsZero = vceq_f32( vTemp, zero );
    // Sqrt
    float32x2_t S0 = vrsqrte_f32( vTemp );
    float32x2_t P0 = vmul_f32( vTemp, S0 );
    float32x2_t R0 = vrsqrts_f32( P0, S0 );
    float32x2_t S1 = vmul_f32( S0, R0 );
    float32x2_t P1 = vmul_f32( vTemp, S1 );
    float32x2_t R1 = vrsqrts_f32( P1, S1 );
    float32x2_t Result = vmul_f32( S1, R1 );
    Result = vmul_f32( vTemp, Result );
    Result = vbsl_f32( VEqualsZero, zero, Result );
    return vcombine_f32( Result, Result );
#elif defined(_XM_SSE4_INTRINSICS_)
    XMVECTOR vTemp = _mm_dp_ps( V, V, 0x3f );
    return _mm_sqrt_ps( vTemp );
#elif defined(_XM_SSE3_INTRINSICS_)
    XMVECTOR vLengthSq = _mm_mul_ps(V, V);
    XMVECTOR vTemp = _mm_hadd_ps(vLengthSq, vLengthSq);
    vLengthSq = _mm_sqrt_ss(vTemp);
    vLengthSq = XM_PERMUTE_PS(vLengthSq, _MM_SHUFFLE(0, 0, 0, 0));
    return vLengthSq;
#elif defined(_XM_SSE_INTRINSICS_)
    // Perform the dot product on x and y
    XMVECTOR vLengthSq = _mm_mul_ps(V,V);
    // vTemp has y splatted
    XMVECTOR vTemp = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(1,1,1,1));
    // x+y
    vLengthSq = _mm_add_ss(vLengthSq,vTemp);
    vLengthSq = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(0,0,0,0));
    vLengthSq = _mm_sqrt_ps(vLengthSq);
    return vLengthSq;
#endif
}
//------------------------------------------------------------------------------
// XMVector2NormalizeEst uses a reciprocal estimate and
// returns QNaN on zero and infinite vectors.
inline XMVECTOR XM_CALLCONV XMVector2NormalizeEst
(
    FXMVECTOR V
)
{
#if defined(_XM_NO_INTRINSICS_)
    XMVECTOR Result;
    Result = XMVector2ReciprocalLength(V);
    Result = XMVectorMultiply(V, Result);
    return Result;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    float32x2_t VL = vget_low_f32(V);
    // Dot2
    float32x2_t vTemp = vmul_f32( VL, VL );
    vTemp = vpadd_f32( vTemp, vTemp );
    // Reciprocal sqrt (estimate)
    vTemp = vrsqrte_f32( vTemp );
    // Normalize
    float32x2_t Result = vmul_f32( VL, vTemp );
    return vcombine_f32( Result, Result );
#elif defined(_XM_SSE4_INTRINSICS_)
    XMVECTOR vTemp = _mm_dp_ps( V, V, 0x3f );
    XMVECTOR vResult = _mm_rsqrt_ps( vTemp );
    return _mm_mul_ps(vResult, V);
#elif defined(_XM_SSE3_INTRINSICS_)
    XMVECTOR vLengthSq = _mm_mul_ps(V, V);
    vLengthSq = _mm_hadd_ps(vLengthSq, vLengthSq);
    vLengthSq = _mm_rsqrt_ss(vLengthSq);
    vLengthSq = XM_PERMUTE_PS(vLengthSq, _MM_SHUFFLE(0, 0, 0, 0));
    vLengthSq = _mm_mul_ps(vLengthSq, V);
    return vLengthSq;
#elif defined(_XM_SSE_INTRINSICS_)
    // Perform the dot product on x and y
    XMVECTOR vLengthSq = _mm_mul_ps(V,V);
    // vTemp has y splatted
    XMVECTOR vTemp = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(1,1,1,1));
    // x+y
    vLengthSq = _mm_add_ss(vLengthSq,vTemp);
    vLengthSq = _mm_rsqrt_ss(vLengthSq);
    vLengthSq = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(0,0,0,0));
    vLengthSq = _mm_mul_ps(vLengthSq,V);
    return vLengthSq;
#endif
}
//------------------------------------------------------------------------------
inline XMVECTOR XM_CALLCONV XMVector2Normalize
(
    FXMVECTOR V
)
{
#if defined(_XM_NO_INTRINSICS_)
    XMVECTOR vResult = XMVector2Length( V );
    float fLength = vResult.vector4_f32[0];
    // Prevent divide by zero
    if (fLength > 0) {
        fLength = 1.0f/fLength;
    }
    vResult.vector4_f32[0] = V.vector4_f32[0]*fLength;
    vResult.vector4_f32[1] = V.vector4_f32[1]*fLength;
    vResult.vector4_f32[2] = V.vector4_f32[2]*fLength;
    vResult.vector4_f32[3] = V.vector4_f32[3]*fLength;
    return vResult;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    float32x2_t VL = vget_low_f32(V);
    // Dot2
    float32x2_t vTemp = vmul_f32( VL, VL );
    vTemp = vpadd_f32( vTemp, vTemp );
    uint32x2_t VEqualsZero = vceq_f32( vTemp, vdup_n_f32(0) );
    uint32x2_t VEqualsInf = vceq_f32( vTemp, vget_low_f32(g_XMInfinity) );
    // Reciprocal sqrt (2 iterations of Newton-Raphson)
    float32x2_t S0 = vrsqrte_f32( vTemp );
    float32x2_t P0 = vmul_f32( vTemp, S0 );
    float32x2_t R0 = vrsqrts_f32( P0, S0 );
    float32x2_t S1 = vmul_f32( S0, R0 );
    float32x2_t P1 = vmul_f32( vTemp, S1 );
    float32x2_t R1 = vrsqrts_f32( P1, S1 );
    vTemp = vmul_f32( S1, R1 );
    // Normalize
    float32x2_t Result = vmul_f32( VL, vTemp );
    Result = vbsl_f32( VEqualsZero, vdup_n_f32(0), Result );
    Result = vbsl_f32( VEqualsInf, vget_low_f32(g_XMQNaN), Result );
    return vcombine_f32( Result, Result );
#elif defined(_XM_SSE4_INTRINSICS_)
    XMVECTOR vLengthSq = _mm_dp_ps( V, V, 0x3f );
    // Prepare for the division
    XMVECTOR vResult = _mm_sqrt_ps(vLengthSq);
    // Create zero with a single instruction
    XMVECTOR vZeroMask = _mm_setzero_ps();
    // Test for a divide by zero (Must be FP to detect -0.0)
    vZeroMask = _mm_cmpneq_ps(vZeroMask,vResult);
    // Failsafe on zero (Or epsilon) length planes
    // If the length is infinity, set the elements to zero
    vLengthSq = _mm_cmpneq_ps(vLengthSq,g_XMInfinity);
    // Reciprocal mul to perform the normalization
    vResult = _mm_div_ps(V,vResult);
    // Any that are infinity, set to zero
    vResult = _mm_and_ps(vResult,vZeroMask);
    // Select qnan or result based on infinite length
    XMVECTOR vTemp1 = _mm_andnot_ps(vLengthSq,g_XMQNaN);
    XMVECTOR vTemp2 = _mm_and_ps(vResult,vLengthSq);
    vResult = _mm_or_ps(vTemp1,vTemp2);
    return vResult;
#elif defined(_XM_SSE3_INTRINSICS_)
    // Perform the dot product on x and y only
    XMVECTOR vLengthSq = _mm_mul_ps(V, V);
    vLengthSq = _mm_hadd_ps(vLengthSq, vLengthSq);
    vLengthSq = _mm_moveldup_ps(vLengthSq);
    // Prepare for the division
    XMVECTOR vResult = _mm_sqrt_ps(vLengthSq);
    // Create zero with a single instruction
    XMVECTOR vZeroMask = _mm_setzero_ps();
    // Test for a divide by zero (Must be FP to detect -0.0)
    vZeroMask = _mm_cmpneq_ps(vZeroMask, vResult);
    // Failsafe on zero (Or epsilon) length planes
    // If the length is infinity, set the elements to zero
    vLengthSq = _mm_cmpneq_ps(vLengthSq, g_XMInfinity);
    // Reciprocal mul to perform the normalization
    vResult = _mm_div_ps(V, vResult);
    // Any that are infinity, set to zero
    vResult = _mm_and_ps(vResult, vZeroMask);
    // Select qnan or result based on infinite length
    XMVECTOR vTemp1 = _mm_andnot_ps(vLengthSq, g_XMQNaN);
    XMVECTOR vTemp2 = _mm_and_ps(vResult, vLengthSq);
    vResult = _mm_or_ps(vTemp1, vTemp2);
    return vResult;
#elif defined(_XM_SSE_INTRINSICS_)
    // Perform the dot product on x and y only
    XMVECTOR vLengthSq = _mm_mul_ps(V,V);
    XMVECTOR vTemp = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(1,1,1,1));
    vLengthSq = _mm_add_ss(vLengthSq,vTemp);
    vLengthSq = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(0,0,0,0));
    // Prepare for the division
    XMVECTOR vResult = _mm_sqrt_ps(vLengthSq);
    // Create zero with a single instruction
    XMVECTOR vZeroMask = _mm_setzero_ps();
    // Test for a divide by zero (Must be FP to detect -0.0)
    vZeroMask = _mm_cmpneq_ps(vZeroMask,vResult);
    // Failsafe on zero (Or epsilon) length planes
    // If the length is infinity, set the elements to zero
    vLengthSq = _mm_cmpneq_ps(vLengthSq,g_XMInfinity);
    // Reciprocal mul to perform the normalization
    vResult = _mm_div_ps(V,vResult);
    // Any that are infinity, set to zero
    vResult = _mm_and_ps(vResult,vZeroMask);
    // Select qnan or result based on infinite length
    XMVECTOR vTemp1 = _mm_andnot_ps(vLengthSq,g_XMQNaN);
    XMVECTOR vTemp2 = _mm_and_ps(vResult,vLengthSq);
    vResult = _mm_or_ps(vTemp1,vTemp2);
    return vResult;
#endif
}
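// Illustrative usage (a minimal sketch; the values are hypothetical): a (3,4) vector
// normalizes to (0.6, 0.8); a zero vector yields zero and an infinite-length vector
// yields QNaN, matching the branch logic above.
//
//     XMVECTOR v = XMVectorSet(3.0f, 4.0f, 0.0f, 0.0f);
//     XMVECTOR n = XMVector2Normalize(v); // (0.6, 0.8, 0, 0)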
//------------------------------------------------------------------------------
inline XMVECTOR XM_CALLCONV XMVector2ClampLength
(
    FXMVECTOR V,
    float LengthMin,
    float LengthMax
)
{
    XMVECTOR ClampMax = XMVectorReplicate(LengthMax);
    XMVECTOR ClampMin = XMVectorReplicate(LengthMin);
    return XMVector2ClampLengthV(V, ClampMin, ClampMax);
}
//------------------------------------------------------------------------------
inline XMVECTOR XM_CALLCONV XMVector2ClampLengthV
(
    FXMVECTOR V,
    FXMVECTOR LengthMin,
    FXMVECTOR LengthMax
)
{
    assert((XMVectorGetY(LengthMin) == XMVectorGetX(LengthMin)));
    assert((XMVectorGetY(LengthMax) == XMVectorGetX(LengthMax)));
    assert(XMVector2GreaterOrEqual(LengthMin, g_XMZero));
    assert(XMVector2GreaterOrEqual(LengthMax, g_XMZero));
    assert(XMVector2GreaterOrEqual(LengthMax, LengthMin));
    XMVECTOR LengthSq = XMVector2LengthSq(V);
    const XMVECTOR Zero = XMVectorZero();
    XMVECTOR RcpLength = XMVectorReciprocalSqrt(LengthSq);
    XMVECTOR InfiniteLength = XMVectorEqualInt(LengthSq, g_XMInfinity.v);
    XMVECTOR ZeroLength = XMVectorEqual(LengthSq, Zero);
    XMVECTOR Length = XMVectorMultiply(LengthSq, RcpLength);
    XMVECTOR Normal = XMVectorMultiply(V, RcpLength);
    XMVECTOR Select = XMVectorEqualInt(InfiniteLength, ZeroLength);
    Length = XMVectorSelect(LengthSq, Length, Select);
    Normal = XMVectorSelect(LengthSq, Normal, Select);
    XMVECTOR ControlMax = XMVectorGreater(Length, LengthMax);
    XMVECTOR ControlMin = XMVectorLess(Length, LengthMin);
    XMVECTOR ClampLength = XMVectorSelect(Length, LengthMax, ControlMax);
    ClampLength = XMVectorSelect(ClampLength, LengthMin, ControlMin);
    XMVECTOR Result = XMVectorMultiply(Normal, ClampLength);
    // Preserve the original vector (with no precision loss) if the length falls within the given range
    XMVECTOR Control = XMVectorEqualInt(ControlMax, ControlMin);
    Result = XMVectorSelect(Result, V, Control);
    return Result;
}
//------------------------------------------------------------------------------
inline XMVECTOR XM_CALLCONV XMVector2Reflect
(
    FXMVECTOR Incident,
    FXMVECTOR Normal
)
{
    // Result = Incident - (2 * dot(Incident, Normal)) * Normal
    XMVECTOR Result;
    Result = XMVector2Dot(Incident, Normal);
    Result = XMVectorAdd(Result, Result);
    Result = XMVectorNegativeMultiplySubtract(Result, Normal, Incident);
    return Result;
}
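// Illustrative usage (a minimal sketch; the vectors are hypothetical): reflecting a ray
// travelling down-right off a horizontal surface with an upward unit normal.
//
//     XMVECTOR incident = XMVectorSet(1.0f, -1.0f, 0.0f, 0.0f);
//     XMVECTOR normal   = XMVectorSet(0.0f,  1.0f, 0.0f, 0.0f);
//     XMVECTOR r = XMVector2Reflect(incident, normal); // (1, 1, 0, 0)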
//------------------------------------------------------------------------------
inline XMVECTOR XM_CALLCONV XMVector2Refract
(
    FXMVECTOR Incident,
    FXMVECTOR Normal,
    float RefractionIndex
)
{
    XMVECTOR Index = XMVectorReplicate(RefractionIndex);
    return XMVector2RefractV(Incident, Normal, Index);
}
//------------------------------------------------------------------------------
// Return the refraction of a 2D vector
inline XMVECTOR XM_CALLCONV XMVector2RefractV
(
    FXMVECTOR Incident,
    FXMVECTOR Normal,
    FXMVECTOR RefractionIndex
)
{
    // Result = RefractionIndex * Incident - Normal * (RefractionIndex * dot(Incident, Normal) +
    // sqrt(1 - RefractionIndex * RefractionIndex * (1 - dot(Incident, Normal) * dot(Incident, Normal))))
#if defined(_XM_NO_INTRINSICS_)
    float IDotN = (Incident.vector4_f32[0]*Normal.vector4_f32[0])+(Incident.vector4_f32[1]*Normal.vector4_f32[1]);
    // R = 1.0f - RefractionIndex * RefractionIndex * (1.0f - IDotN * IDotN)
    float RY = 1.0f-(IDotN*IDotN);
    float RX = 1.0f-(RY*RefractionIndex.vector4_f32[0]*RefractionIndex.vector4_f32[0]);
    RY = 1.0f-(RY*RefractionIndex.vector4_f32[1]*RefractionIndex.vector4_f32[1]);
    if (RX>=0.0f) {
        RX = (RefractionIndex.vector4_f32[0]*Incident.vector4_f32[0])-(Normal.vector4_f32[0]*((RefractionIndex.vector4_f32[0]*IDotN)+sqrtf(RX)));
    } else {
        RX = 0.0f;
    }
    if (RY>=0.0f) {
        RY = (RefractionIndex.vector4_f32[1]*Incident.vector4_f32[1])-(Normal.vector4_f32[1]*((RefractionIndex.vector4_f32[1]*IDotN)+sqrtf(RY)));
    } else {
        RY = 0.0f;
    }
    XMVECTOR vResult;
    vResult.vector4_f32[0] = RX;
    vResult.vector4_f32[1] = RY;
    vResult.vector4_f32[2] = 0.0f;
    vResult.vector4_f32[3] = 0.0f;
    return vResult;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    float32x2_t IL = vget_low_f32( Incident );
    float32x2_t NL = vget_low_f32( Normal );
    float32x2_t RIL = vget_low_f32( RefractionIndex );
    // Get the 2D Dot product of Incident-Normal
    float32x2_t vTemp = vmul_f32(IL, NL);
    float32x2_t IDotN = vpadd_f32( vTemp, vTemp );
    // vTemp = 1.0f - RefractionIndex * RefractionIndex * (1.0f - IDotN * IDotN)
    vTemp = vmls_f32( vget_low_f32( g_XMOne ), IDotN, IDotN);
    vTemp = vmul_f32(vTemp,RIL);
    vTemp = vmls_f32(vget_low_f32( g_XMOne ), vTemp, RIL );
    // If any terms are <=0, sqrt() will fail, punt to zero
    uint32x2_t vMask = vcgt_f32(vTemp, vget_low_f32(g_XMZero) );
    // Sqrt(vTemp)
    float32x2_t S0 = vrsqrte_f32(vTemp);
    float32x2_t P0 = vmul_f32( vTemp, S0 );
    float32x2_t R0 = vrsqrts_f32( P0, S0 );
    float32x2_t S1 = vmul_f32( S0, R0 );
    float32x2_t P1 = vmul_f32( vTemp, S1 );
    float32x2_t R1 = vrsqrts_f32( P1, S1 );
    float32x2_t S2 = vmul_f32( S1, R1 );
    vTemp = vmul_f32( vTemp, S2 );
    // R = RefractionIndex * IDotN + sqrt(R)
    vTemp = vmla_f32( vTemp, RIL, IDotN );
    // Result = RefractionIndex * Incident - Normal * R
    float32x2_t vResult = vmul_f32(RIL,IL);
    vResult = vmls_f32( vResult, vTemp, NL );
    vResult = vand_u32(vResult,vMask);
    return vcombine_f32(vResult, vResult);
#elif defined(_XM_SSE_INTRINSICS_)
    // Result = RefractionIndex * Incident - Normal * (RefractionIndex * dot(Incident, Normal) +
    // sqrt(1 - RefractionIndex * RefractionIndex * (1 - dot(Incident, Normal) * dot(Incident, Normal))))
    // Get the 2D Dot product of Incident-Normal
    XMVECTOR IDotN = XMVector2Dot(Incident, Normal);
    // vTemp = 1.0f - RefractionIndex * RefractionIndex * (1.0f - IDotN * IDotN)
    XMVECTOR vTemp = _mm_mul_ps(IDotN,IDotN);
    vTemp = _mm_sub_ps(g_XMOne,vTemp);
    vTemp = _mm_mul_ps(vTemp,RefractionIndex);
    vTemp = _mm_mul_ps(vTemp,RefractionIndex);
    vTemp = _mm_sub_ps(g_XMOne,vTemp);
    // If any terms are <=0, sqrt() will fail, punt to zero
    XMVECTOR vMask = _mm_cmpgt_ps(vTemp,g_XMZero);
    // R = RefractionIndex * IDotN + sqrt(R)
    vTemp = _mm_sqrt_ps(vTemp);
    XMVECTOR vResult = _mm_mul_ps(RefractionIndex,IDotN);
    vTemp = _mm_add_ps(vTemp,vResult);
    // Result = RefractionIndex * Incident - Normal * R
    vResult = _mm_mul_ps(RefractionIndex,Incident);
    vTemp = _mm_mul_ps(vTemp,Normal);
    vResult = _mm_sub_ps(vResult,vTemp);
    vResult = _mm_and_ps(vResult,vMask);
    return vResult;
#endif
}
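// Illustrative usage (a minimal sketch; the index value is hypothetical): the scalar
// XMVector2Refract wrapper above replicates a single refraction index; when the term
// under the square root goes non-positive (total internal reflection) the result is
// punted to a zero vector, matching the mask logic above.
//
//     XMVECTOR incident = XMVector2Normalize(XMVectorSet(1.0f, -1.0f, 0.0f, 0.0f));
//     XMVECTOR normal   = XMVectorSet(0.0f, 1.0f, 0.0f, 0.0f);
//     XMVECTOR r = XMVector2Refract(incident, normal, 1.0f); // index 1.0: direction is unchanged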
//------------------------------------------------------------------------------
inline XMVECTOR XM_CALLCONV XMVector2Orthogonal
(
    FXMVECTOR V
)
{
#if defined(_XM_NO_INTRINSICS_)
    XMVECTORF32 Result = { { {
            -V.vector4_f32[1],
            V.vector4_f32[0],
            0.f,
            0.f
        } } };
    return Result.v;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    static const XMVECTORF32 Negate = { { { -1.f, 1.f, 0, 0 } } };
    const float32x2_t zero = vdup_n_f32(0);
    float32x2_t VL = vget_low_f32( V );
    float32x2_t Result = vmul_f32( vrev64_f32( VL ), vget_low_f32( Negate ) );
    return vcombine_f32( Result, zero );
#elif defined(_XM_SSE_INTRINSICS_)
    XMVECTOR vResult = XM_PERMUTE_PS(V,_MM_SHUFFLE(3,2,0,1));
    vResult = _mm_mul_ps(vResult,g_XMNegateX);
    return vResult;
#endif
}
//------------------------------------------------------------------------------
inline XMVECTOR XM_CALLCONV XMVector2AngleBetweenNormalsEst
(
    FXMVECTOR N1,
    FXMVECTOR N2
)
{
    XMVECTOR Result = XMVector2Dot(N1, N2);
    Result = XMVectorClamp(Result, g_XMNegativeOne.v, g_XMOne.v);
    Result = XMVectorACosEst(Result);
    return Result;
}
//------------------------------------------------------------------------------
inline XMVECTOR XM_CALLCONV XMVector2AngleBetweenNormals
(
    FXMVECTOR N1,
    FXMVECTOR N2
)
{
    XMVECTOR Result = XMVector2Dot(N1, N2);
    Result = XMVectorClamp(Result, g_XMNegativeOne, g_XMOne);
    Result = XMVectorACos(Result);
    return Result;
}
//------------------------------------------------------------------------------
inline XMVECTOR XM_CALLCONV XMVector2AngleBetweenVectors
(
    FXMVECTOR V1,
    FXMVECTOR V2
)
{
    XMVECTOR L1 = XMVector2ReciprocalLength(V1);
    XMVECTOR L2 = XMVector2ReciprocalLength(V2);
    XMVECTOR Dot = XMVector2Dot(V1, V2);
    L1 = XMVectorMultiply(L1, L2);
    XMVECTOR CosAngle = XMVectorMultiply(Dot, L1);
    CosAngle = XMVectorClamp(CosAngle, g_XMNegativeOne.v, g_XMOne.v);
    return XMVectorACos(CosAngle);
}
//------------------------------------------------------------------------------
inline XMVECTOR XM_CALLCONV XMVector2LinePointDistance
(
    FXMVECTOR LinePoint1,
    FXMVECTOR LinePoint2,
    FXMVECTOR Point
)
{
    // Given a vector PointVector from LinePoint1 to Point and a vector
    // LineVector from LinePoint1 to LinePoint2, the scaled distance
    // PointProjectionScale from LinePoint1 to the perpendicular projection
    // of PointVector onto the line is defined as:
    //
    //     PointProjectionScale = dot(PointVector, LineVector) / LengthSq(LineVector)
    XMVECTOR PointVector = XMVectorSubtract(Point, LinePoint1);
    XMVECTOR LineVector = XMVectorSubtract(LinePoint2, LinePoint1);
    XMVECTOR LengthSq = XMVector2LengthSq(LineVector);
    XMVECTOR PointProjectionScale = XMVector2Dot(PointVector, LineVector);
    PointProjectionScale = XMVectorDivide(PointProjectionScale, LengthSq);
    XMVECTOR DistanceVector = XMVectorMultiply(LineVector, PointProjectionScale);
    DistanceVector = XMVectorSubtract(PointVector, DistanceVector);
    return XMVector2Length(DistanceVector);
}
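// Illustrative usage (a minimal sketch; the points are hypothetical): the distance from
// (0,5) to the x-axis (the line through the origin and (1,0)) is 5.
//
//     XMVECTOR p0 = XMVectorSet(0.0f, 0.0f, 0.0f, 0.0f);
//     XMVECTOR p1 = XMVectorSet(1.0f, 0.0f, 0.0f, 0.0f);
//     XMVECTOR pt = XMVectorSet(0.0f, 5.0f, 0.0f, 0.0f);
//     float d = XMVectorGetX(XMVector2LinePointDistance(p0, p1, pt)); // 5.0f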
//------------------------------------------------------------------------------
inline XMVECTOR XM_CALLCONV XMVector2IntersectLine
(
    FXMVECTOR Line1Point1,
    FXMVECTOR Line1Point2,
    FXMVECTOR Line2Point1,
    GXMVECTOR Line2Point2
)
{
#if defined(_XM_NO_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_)
    XMVECTOR V1 = XMVectorSubtract(Line1Point2, Line1Point1);
    XMVECTOR V2 = XMVectorSubtract(Line2Point2, Line2Point1);
    XMVECTOR V3 = XMVectorSubtract(Line1Point1, Line2Point1);
    XMVECTOR C1 = XMVector2Cross(V1, V2);
    XMVECTOR C2 = XMVector2Cross(V2, V3);
    XMVECTOR Result;
    const XMVECTOR Zero = XMVectorZero();
    if (XMVector2NearEqual(C1, Zero, g_XMEpsilon.v))
    {
        if (XMVector2NearEqual(C2, Zero, g_XMEpsilon.v))
        {
            // Coincident
            Result = g_XMInfinity.v;
        }
        else
        {
            // Parallel
            Result = g_XMQNaN.v;
        }
    }
    else
    {
        // Intersection point = Line1Point1 + V1 * (C2 / C1)
        XMVECTOR Scale = XMVectorReciprocal(C1);
        Scale = XMVectorMultiply(C2, Scale);
        Result = XMVectorMultiplyAdd(V1, Scale, Line1Point1);
    }
    return Result;
#elif defined(_XM_SSE_INTRINSICS_)
    XMVECTOR V1 = _mm_sub_ps(Line1Point2, Line1Point1);
    XMVECTOR V2 = _mm_sub_ps(Line2Point2, Line2Point1);
    XMVECTOR V3 = _mm_sub_ps(Line1Point1, Line2Point1);
    // Generate the cross products
    XMVECTOR C1 = XMVector2Cross(V1, V2);
    XMVECTOR C2 = XMVector2Cross(V2, V3);
    // If C1 is not close to epsilon, use the calculated value
    XMVECTOR vResultMask = _mm_setzero_ps();
    vResultMask = _mm_sub_ps(vResultMask,C1);
    vResultMask = _mm_max_ps(vResultMask,C1);
    // 0xFFFFFFFF if the calculated value is to be used
    vResultMask = _mm_cmpgt_ps(vResultMask,g_XMEpsilon);
    // If C1 is close to epsilon, which fail type is it? INFINITY or NAN?
    XMVECTOR vFailMask = _mm_setzero_ps();
    vFailMask = _mm_sub_ps(vFailMask,C2);
    vFailMask = _mm_max_ps(vFailMask,C2);
    vFailMask = _mm_cmple_ps(vFailMask,g_XMEpsilon);
    XMVECTOR vFail = _mm_and_ps(vFailMask,g_XMInfinity);
    vFailMask = _mm_andnot_ps(vFailMask,g_XMQNaN);
    // vFail is NAN or INF
    vFail = _mm_or_ps(vFail,vFailMask);
    // Intersection point = Line1Point1 + V1 * (C2 / C1)
    XMVECTOR vResult = _mm_div_ps(C2,C1);
    vResult = _mm_mul_ps(vResult,V1);
    vResult = _mm_add_ps(vResult,Line1Point1);
    // Use result, or failure value
    vResult = _mm_and_ps(vResult,vResultMask);
    vResultMask = _mm_andnot_ps(vResultMask,vFail);
    vResult = _mm_or_ps(vResult,vResultMask);
    return vResult;
#endif
}
//------------------------------------------------------------------------------
inline XMVECTOR XM_CALLCONV XMVector2Transform
(
    FXMVECTOR V,
    FXMMATRIX M
)
{
#if defined(_XM_NO_INTRINSICS_)
    XMVECTOR Y = XMVectorSplatY(V);
    XMVECTOR X = XMVectorSplatX(V);
    XMVECTOR Result = XMVectorMultiplyAdd(Y, M.r[1], M.r[3]);
    Result = XMVectorMultiplyAdd(X, M.r[0], Result);
    return Result;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    float32x2_t VL = vget_low_f32( V );
    float32x4_t Result = vmlaq_lane_f32( M.r[3], M.r[1], VL, 1 ); // Y
    return vmlaq_lane_f32( Result, M.r[0], VL, 0 ); // X
#elif defined(_XM_SSE_INTRINSICS_)
    XMVECTOR vResult = XM_PERMUTE_PS(V,_MM_SHUFFLE(0,0,0,0));
    vResult = _mm_mul_ps(vResult,M.r[0]);
    XMVECTOR vTemp = XM_PERMUTE_PS(V,_MM_SHUFFLE(1,1,1,1));
    vTemp = _mm_mul_ps(vTemp,M.r[1]);
    vResult = _mm_add_ps(vResult,vTemp);
    vResult = _mm_add_ps(vResult,M.r[3]);
    return vResult;
#endif
}
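// Illustrative usage (a minimal sketch; the matrix is hypothetical): the vector is
// treated as (x, y, 0, 1), so the translation row M.r[3] is always applied.
//
//     XMMATRIX m = XMMatrixTranslation(10.0f, 20.0f, 0.0f);
//     XMVECTOR v = XMVectorSet(1.0f, 2.0f, 0.0f, 0.0f);
//     XMVECTOR t = XMVector2Transform(v, m); // (11, 22, 0, 1)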
//------------------------------------------------------------------------------
_Use_decl_annotations_
inline XMFLOAT4* XM_CALLCONV XMVector2TransformStream
(
    XMFLOAT4* pOutputStream,
    size_t OutputStride,
    const XMFLOAT2* pInputStream,
    size_t InputStride,
    size_t VectorCount,
    FXMMATRIX M
)
{
    assert(pOutputStream != nullptr);
    assert(pInputStream != nullptr);
    assert(InputStride >= sizeof(XMFLOAT2));
    _Analysis_assume_(InputStride >= sizeof(XMFLOAT2));
    assert(OutputStride >= sizeof(XMFLOAT4));
    _Analysis_assume_(OutputStride >= sizeof(XMFLOAT4));
#if defined(_XM_NO_INTRINSICS_)
    const uint8_t* pInputVector = (const uint8_t*)pInputStream;
    uint8_t* pOutputVector = (uint8_t*)pOutputStream;
    const XMVECTOR row0 = M.r[0];
    const XMVECTOR row1 = M.r[1];
    const XMVECTOR row3 = M.r[3];
    for (size_t i = 0; i < VectorCount; i++)
    {
        XMVECTOR V = XMLoadFloat2((const XMFLOAT2*)pInputVector);
        XMVECTOR Y = XMVectorSplatY(V);
        XMVECTOR X = XMVectorSplatX(V);
        XMVECTOR Result = XMVectorMultiplyAdd(Y, row1, row3);
        Result = XMVectorMultiplyAdd(X, row0, Result);
#ifdef _PREFAST_
#pragma prefast(push)
#pragma prefast(disable : 26015, "PREfast noise: Esp:1307" )
#endif
        XMStoreFloat4((XMFLOAT4*)pOutputVector, Result);
#ifdef _PREFAST_
#pragma prefast(pop)
#endif
        pInputVector += InputStride;
        pOutputVector += OutputStride;
    }
    return pOutputStream;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    const uint8_t* pInputVector = (const uint8_t*)pInputStream;
    uint8_t* pOutputVector = (uint8_t*)pOutputStream;
    const XMVECTOR row0 = M.r[0];
    const XMVECTOR row1 = M.r[1];
    const XMVECTOR row3 = M.r[3];
    size_t i = 0;
    size_t four = VectorCount >> 2;
    if ( four > 0 )
    {
        if ((InputStride == sizeof(XMFLOAT2)) && (OutputStride == sizeof(XMFLOAT4)))
        {
            for (size_t j = 0; j < four; ++j)
            {
                float32x4x2_t V = vld2q_f32( reinterpret_cast<const float*>(pInputVector) );
                pInputVector += sizeof(XMFLOAT2)*4;
                float32x2_t r3 = vget_low_f32( row3 );
                float32x2_t r = vget_low_f32( row0 );
                XMVECTOR vResult0 = vmlaq_lane_f32( vdupq_lane_f32( r3, 0 ), V.val[0], r, 0 ); // Ax+M
                XMVECTOR vResult1 = vmlaq_lane_f32( vdupq_lane_f32( r3, 1 ), V.val[0], r, 1 ); // Bx+N
                __prefetch( pInputVector );
                r3 = vget_high_f32( row3 );
                r = vget_high_f32( row0 );
                XMVECTOR vResult2 = vmlaq_lane_f32( vdupq_lane_f32( r3, 0 ), V.val[0], r, 0 ); // Cx+O
                XMVECTOR vResult3 = vmlaq_lane_f32( vdupq_lane_f32( r3, 1 ), V.val[0], r, 1 ); // Dx+P
                __prefetch( pInputVector+XM_CACHE_LINE_SIZE );
                r = vget_low_f32( row1 );
                vResult0 = vmlaq_lane_f32( vResult0, V.val[1], r, 0 ); // Ax+Ey+M
                vResult1 = vmlaq_lane_f32( vResult1, V.val[1], r, 1 ); // Bx+Fy+N
                __prefetch( pInputVector+(XM_CACHE_LINE_SIZE*2) );
                r = vget_high_f32( row1 );
                vResult2 = vmlaq_lane_f32( vResult2, V.val[1], r, 0 ); // Cx+Gy+O
                vResult3 = vmlaq_lane_f32( vResult3, V.val[1], r, 1 ); // Dx+Hy+P
                __prefetch( pInputVector+(XM_CACHE_LINE_SIZE*3) );
                float32x4x4_t R;
                R.val[0] = vResult0;
                R.val[1] = vResult1;
                R.val[2] = vResult2;
                R.val[3] = vResult3;
                vst4q_f32( reinterpret_cast<float*>(pOutputVector), R );
                pOutputVector += sizeof(XMFLOAT4)*4;
                i += 4;
            }
        }
    }
    for (; i < VectorCount; i++)
    {
        float32x2_t V = vld1_f32( reinterpret_cast<const float*>(pInputVector) );
        pInputVector += InputStride;
        XMVECTOR vResult = vmlaq_lane_f32( row3, row0, V, 0 ); // X
        vResult = vmlaq_lane_f32( vResult, row1, V, 1 ); // Y
        vst1q_f32( reinterpret_cast<float*>(pOutputVector), vResult );
        pOutputVector += OutputStride;
    }
    return pOutputStream;
#elif defined(_XM_SSE_INTRINSICS_)
    const uint8_t* pInputVector = (const uint8_t*)pInputStream;
    uint8_t* pOutputVector = (uint8_t*)pOutputStream;
    const XMVECTOR row0 = M.r[0];
    const XMVECTOR row1 = M.r[1];
    const XMVECTOR row3 = M.r[3];
    size_t i = 0;
    size_t two = VectorCount >> 1;
    if ( two > 0 )
    {
        if ( InputStride == sizeof(XMFLOAT2) )
        {
            if ( !((uintptr_t)pOutputStream & 0xF) && !(OutputStride & 0xF) )
            {
                // Packed input, aligned output
                for (size_t j = 0; j < two; ++j)
                {
                    XMVECTOR V = _mm_loadu_ps( reinterpret_cast<const float*>(pInputVector) );
                    pInputVector += sizeof(XMFLOAT2)*2;
                    XMVECTOR Y = XM_PERMUTE_PS(V,_MM_SHUFFLE(1,1,1,1));
                    XMVECTOR X = XM_PERMUTE_PS(V,_MM_SHUFFLE(0,0,0,0));
                    XMVECTOR vTemp = _mm_mul_ps( Y, row1 );
                    XMVECTOR vTemp2 = _mm_mul_ps( X, row0 );
                    vTemp = _mm_add_ps( vTemp, row3 );
                    vTemp = _mm_add_ps( vTemp, vTemp2 );
                    XM_STREAM_PS( reinterpret_cast<float*>(pOutputVector), vTemp );
                    pOutputVector += OutputStride;
                    Y = XM_PERMUTE_PS(V,_MM_SHUFFLE(3,3,3,3));
                    X = XM_PERMUTE_PS(V,_MM_SHUFFLE(2,2,2,2));
                    vTemp = _mm_mul_ps( Y, row1 );
                    vTemp2 = _mm_mul_ps( X, row0 );
                    vTemp = _mm_add_ps( vTemp, row3 );
                    vTemp = _mm_add_ps( vTemp, vTemp2 );
                    XM_STREAM_PS( reinterpret_cast<float*>(pOutputVector), vTemp );
                    pOutputVector += OutputStride;
                    i += 2;
                }
            }
            else
            {
                // Packed input, unaligned output
                for (size_t j = 0; j < two; ++j)
                {
                    XMVECTOR V = _mm_loadu_ps( reinterpret_cast<const float*>(pInputVector) );
                    pInputVector += sizeof(XMFLOAT2)*2;
                    XMVECTOR Y = XM_PERMUTE_PS(V,_MM_SHUFFLE(1,1,1,1));
                    XMVECTOR X = XM_PERMUTE_PS(V,_MM_SHUFFLE(0,0,0,0));
                    XMVECTOR vTemp = _mm_mul_ps( Y, row1 );
                    XMVECTOR vTemp2 = _mm_mul_ps( X, row0 );
                    vTemp = _mm_add_ps( vTemp, row3 );
                    vTemp = _mm_add_ps( vTemp, vTemp2 );
                    _mm_storeu_ps( reinterpret_cast<float*>(pOutputVector), vTemp );
                    pOutputVector += OutputStride;
                    Y = XM_PERMUTE_PS(V,_MM_SHUFFLE(3,3,3,3));
                    X = XM_PERMUTE_PS(V,_MM_SHUFFLE(2,2,2,2));
                    vTemp = _mm_mul_ps( Y, row1 );
                    vTemp2 = _mm_mul_ps( X, row0 );
                    vTemp = _mm_add_ps( vTemp, row3 );
                    vTemp = _mm_add_ps( vTemp, vTemp2 );
                    _mm_storeu_ps( reinterpret_cast<float*>(pOutputVector), vTemp );
                    pOutputVector += OutputStride;
                    i += 2;
                }
            }
        }
    }
    if ( !((uintptr_t)pInputVector & 0xF) && !(InputStride & 0xF) )
    {
        if ( !((uintptr_t)pOutputStream & 0xF) && !(OutputStride & 0xF) )
        {
            // Aligned input, aligned output
            for (; i < VectorCount; i++)
            {
                XMVECTOR V = _mm_castsi128_ps( _mm_loadl_epi64( reinterpret_cast<const __m128i*>(pInputVector) ) );
                pInputVector += InputStride;
                XMVECTOR Y = XM_PERMUTE_PS(V,_MM_SHUFFLE(1,1,1,1));
                XMVECTOR X = XM_PERMUTE_PS(V,_MM_SHUFFLE(0,0,0,0));
                XMVECTOR vTemp = _mm_mul_ps( Y, row1 );
                XMVECTOR vTemp2 = _mm_mul_ps( X, row0 );
                vTemp = _mm_add_ps( vTemp, row3 );
                vTemp = _mm_add_ps( vTemp, vTemp2 );
                XM_STREAM_PS( reinterpret_cast<float*>(pOutputVector), vTemp );
                pOutputVector += OutputStride;
            }
        }
        else
        {
            // Aligned input, unaligned output
            for (; i < VectorCount; i++)
            {
                XMVECTOR V = _mm_castsi128_ps( _mm_loadl_epi64( reinterpret_cast<const __m128i*>(pInputVector) ) );
                pInputVector += InputStride;
                XMVECTOR Y = XM_PERMUTE_PS(V,_MM_SHUFFLE(1,1,1,1));
                XMVECTOR X = XM_PERMUTE_PS(V,_MM_SHUFFLE(0,0,0,0));
                XMVECTOR vTemp = _mm_mul_ps( Y, row1 );
                XMVECTOR vTemp2 = _mm_mul_ps( X, row0 );
                vTemp = _mm_add_ps( vTemp, row3 );
                vTemp = _mm_add_ps( vTemp, vTemp2 );
                _mm_storeu_ps( reinterpret_cast<float*>(pOutputVector), vTemp );
                pOutputVector += OutputStride;
            }
        }
    }
    else
    {
        // Unaligned input
        for (; i < VectorCount; i++)
        {
            __m128 x = _mm_load_ss( reinterpret_cast<const float*>(pInputVector) );
            __m128 y = _mm_load_ss( reinterpret_cast<const float*>(pInputVector+4) );
            pInputVector += InputStride;
            XMVECTOR Y = XM_PERMUTE_PS(y,_MM_SHUFFLE(0,0,0,0));
            XMVECTOR X = XM_PERMUTE_PS(x,_MM_SHUFFLE(0,0,0,0));
            XMVECTOR vTemp = _mm_mul_ps( Y, row1 );
            XMVECTOR vTemp2 = _mm_mul_ps( X, row0 );
            vTemp = _mm_add_ps( vTemp, row3 );
            vTemp = _mm_add_ps( vTemp, vTemp2 );
            _mm_storeu_ps( reinterpret_cast<float*>(pOutputVector), vTemp );
  6857. pOutputVector += OutputStride;
  6858. }
  6859. }
  6860. XM_SFENCE();
  6861. return pOutputStream;
  6862. #endif
  6863. }
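// Illustrative usage sketch (not part of the library): XMVector2TransformStream
// writes one XMFLOAT4 per input XMFLOAT2, as the packed path above assumes
// (input stride sizeof(XMFLOAT2), output stride sizeof(XMFLOAT4)). The matrix
// and counts below are placeholders.
//
//     XMFLOAT2 src[64];                                   // packed input
//     XMFLOAT4 dst[64];                                   // packed output
//     XMVector2TransformStream(dst, sizeof(XMFLOAT4),
//                              src, sizeof(XMFLOAT2),
//                              64, M);                    // M: assumed caller-provided XMMATRIX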
  6864. //------------------------------------------------------------------------------
  6865. inline XMVECTOR XM_CALLCONV XMVector2TransformCoord
  6866. (
  6867. FXMVECTOR V,
  6868. FXMMATRIX M
  6869. )
  6870. {
  6871. XMVECTOR Y = XMVectorSplatY(V);
  6872. XMVECTOR X = XMVectorSplatX(V);
  6873. XMVECTOR Result = XMVectorMultiplyAdd(Y, M.r[1], M.r[3]);
  6874. Result = XMVectorMultiplyAdd(X, M.r[0], Result);
  6875. XMVECTOR W = XMVectorSplatW(Result);
  6876. return XMVectorDivide( Result, W );
  6877. }
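// Illustrative usage sketch (not part of the library): XMVector2TransformCoord
// applies the full 4x4 transform to a 2D point (z = 0, w = 1 implied) and then
// divides by the resulting w, so it is the right call for positions.
//
//     XMMATRIX world = XMMatrixTranslation(10.0f, 5.0f, 0.0f);   // placeholder transform
//     XMVECTOR p     = XMVectorSet(1.0f, 2.0f, 0.0f, 0.0f);
//     XMVECTOR q     = XMVector2TransformCoord(p, world);        // (11, 7, 0, 1) after the divide by w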
  6878. //------------------------------------------------------------------------------
  6879. _Use_decl_annotations_
  6880. inline XMFLOAT2* XM_CALLCONV XMVector2TransformCoordStream
  6881. (
  6882. XMFLOAT2* pOutputStream,
  6883. size_t OutputStride,
  6884. const XMFLOAT2* pInputStream,
  6885. size_t InputStride,
  6886. size_t VectorCount,
  6887. FXMMATRIX M
  6888. )
  6889. {
  6890. assert(pOutputStream != nullptr);
  6891. assert(pInputStream != nullptr);
  6892. assert(InputStride >= sizeof(XMFLOAT2));
  6893. _Analysis_assume_(InputStride >= sizeof(XMFLOAT2));
  6894. assert(OutputStride >= sizeof(XMFLOAT2));
  6895. _Analysis_assume_(OutputStride >= sizeof(XMFLOAT2));
  6896. #if defined(_XM_NO_INTRINSICS_)
  6897. const uint8_t* pInputVector = (const uint8_t*)pInputStream;
  6898. uint8_t* pOutputVector = (uint8_t*)pOutputStream;
  6899. const XMVECTOR row0 = M.r[0];
  6900. const XMVECTOR row1 = M.r[1];
  6901. const XMVECTOR row3 = M.r[3];
  6902. for (size_t i = 0; i < VectorCount; i++)
  6903. {
  6904. XMVECTOR V = XMLoadFloat2((const XMFLOAT2*)pInputVector);
  6905. XMVECTOR Y = XMVectorSplatY(V);
  6906. XMVECTOR X = XMVectorSplatX(V);
  6907. XMVECTOR Result = XMVectorMultiplyAdd(Y, row1, row3);
  6908. Result = XMVectorMultiplyAdd(X, row0, Result);
  6909. XMVECTOR W = XMVectorSplatW(Result);
  6910. Result = XMVectorDivide(Result, W);
  6911. #ifdef _PREFAST_
  6912. #pragma prefast(push)
  6913. #pragma prefast(disable : 26015, "PREfast noise: Esp:1307" )
  6914. #endif
  6915. XMStoreFloat2((XMFLOAT2*)pOutputVector, Result);
  6916. #ifdef _PREFAST_
  6917. #pragma prefast(pop)
  6918. #endif
  6919. pInputVector += InputStride;
  6920. pOutputVector += OutputStride;
  6921. }
  6922. return pOutputStream;
  6923. #elif defined(_XM_ARM_NEON_INTRINSICS_)
  6924. const uint8_t* pInputVector = (const uint8_t*)pInputStream;
  6925. uint8_t* pOutputVector = (uint8_t*)pOutputStream;
  6926. const XMVECTOR row0 = M.r[0];
  6927. const XMVECTOR row1 = M.r[1];
  6928. const XMVECTOR row3 = M.r[3];
  6929. size_t i = 0;
  6930. size_t four = VectorCount >> 2;
  6931. if ( four > 0 )
  6932. {
  6933. if ((InputStride == sizeof(XMFLOAT2)) && (OutputStride == sizeof(XMFLOAT2)))
  6934. {
  6935. for (size_t j = 0; j < four; ++j)
  6936. {
  6937. float32x4x2_t V = vld2q_f32( reinterpret_cast<const float*>(pInputVector) );
  6938. pInputVector += sizeof(XMFLOAT2)*4;
  6939. float32x2_t r3 = vget_low_f32( row3 );
  6940. float32x2_t r = vget_low_f32( row0 );
  6941. XMVECTOR vResult0 = vmlaq_lane_f32( vdupq_lane_f32( r3, 0 ), V.val[0], r, 0 ); // Ax+M
  6942. XMVECTOR vResult1 = vmlaq_lane_f32( vdupq_lane_f32( r3, 1 ), V.val[0], r, 1 ); // Bx+N
  6943. __prefetch( pInputVector );
  6944. r3 = vget_high_f32( row3 );
  6945. r = vget_high_f32( row0 );
  6946. XMVECTOR W = vmlaq_lane_f32( vdupq_lane_f32( r3, 1 ), V.val[0], r, 1 ); // Dx+P
  6947. __prefetch( pInputVector+XM_CACHE_LINE_SIZE );
  6948. r = vget_low_f32( row1 );
  6949. vResult0 = vmlaq_lane_f32( vResult0, V.val[1], r, 0 ); // Ax+Ey+M
  6950. vResult1 = vmlaq_lane_f32( vResult1, V.val[1], r, 1 ); // Bx+Fy+N
  6951. __prefetch( pInputVector+(XM_CACHE_LINE_SIZE*2) );
  6952. r = vget_high_f32( row1 );
  6953. W = vmlaq_lane_f32( W, V.val[1], r, 1 ); // Dx+Hy+P
  6954. __prefetch( pInputVector+(XM_CACHE_LINE_SIZE*3) );
  6955. #if defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64)
  6956. V.val[0] = vdivq_f32( vResult0, W );
  6957. V.val[1] = vdivq_f32( vResult1, W );
  6958. #else
  6959. // 2 iterations of Newton-Raphson refinement of reciprocal
  6960. float32x4_t Reciprocal = vrecpeq_f32(W);
  6961. float32x4_t S = vrecpsq_f32( Reciprocal, W );
  6962. Reciprocal = vmulq_f32( S, Reciprocal );
  6963. S = vrecpsq_f32( Reciprocal, W );
  6964. Reciprocal = vmulq_f32( S, Reciprocal );
  6965. V.val[0] = vmulq_f32( vResult0, Reciprocal );
  6966. V.val[1] = vmulq_f32( vResult1, Reciprocal );
  6967. #endif
  6968. vst2q_f32( reinterpret_cast<float*>(pOutputVector),V );
  6969. pOutputVector += sizeof(XMFLOAT2)*4;
  6970. i += 4;
  6971. }
  6972. }
  6973. }
  6974. for (; i < VectorCount; i++)
  6975. {
  6976. float32x2_t V = vld1_f32( reinterpret_cast<const float*>(pInputVector) );
  6977. pInputVector += InputStride;
  6978. XMVECTOR vResult = vmlaq_lane_f32( row3, row0, V, 0 ); // X
  6979. vResult = vmlaq_lane_f32( vResult, row1, V, 1 ); // Y
  6980. V = vget_high_f32( vResult );
  6981. float32x2_t W = vdup_lane_f32( V, 1 );
  6982. #if defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64)
  6983. V = vget_low_f32( vResult );
  6984. V = vdiv_f32( V, W );
  6985. #else
  6986. // 2 iterations of Newton-Raphson refinement of reciprocal for W
  6987. float32x2_t Reciprocal = vrecpe_f32( W );
  6988. float32x2_t S = vrecps_f32( Reciprocal, W );
  6989. Reciprocal = vmul_f32( S, Reciprocal );
  6990. S = vrecps_f32( Reciprocal, W );
  6991. Reciprocal = vmul_f32( S, Reciprocal );
  6992. V = vget_low_f32( vResult );
  6993. V = vmul_f32( V, Reciprocal );
  6994. #endif
  6995. vst1_f32( reinterpret_cast<float*>(pOutputVector), V );
  6996. pOutputVector += OutputStride;
  6997. }
  6998. return pOutputStream;
  6999. #elif defined(_XM_SSE_INTRINSICS_)
  7000. const uint8_t* pInputVector = (const uint8_t*)pInputStream;
  7001. uint8_t* pOutputVector = (uint8_t*)pOutputStream;
  7002. const XMVECTOR row0 = M.r[0];
  7003. const XMVECTOR row1 = M.r[1];
  7004. const XMVECTOR row3 = M.r[3];
  7005. size_t i = 0;
  7006. size_t two = VectorCount >> 1;
  7007. if ( two > 0 )
  7008. {
  7009. if ( InputStride == sizeof(XMFLOAT2) )
  7010. {
  7011. if ( OutputStride == sizeof(XMFLOAT2) )
  7012. {
  7013. if ( !((uintptr_t)pOutputStream & 0xF) )
  7014. {
  7015. // Packed input, aligned & packed output
  7016. for (size_t j = 0; j < two; ++j)
  7017. {
  7018. XMVECTOR V = _mm_loadu_ps( reinterpret_cast<const float*>(pInputVector) );
  7019. pInputVector += sizeof(XMFLOAT2)*2;
  7020. // Result 1
  7021. XMVECTOR Y = XM_PERMUTE_PS( V, _MM_SHUFFLE(1, 1, 1, 1) );
  7022. XMVECTOR X = XM_PERMUTE_PS( V, _MM_SHUFFLE(0, 0, 0, 0) );
  7023. XMVECTOR vTemp = _mm_mul_ps( Y, row1 );
  7024. XMVECTOR vTemp2 = _mm_mul_ps( X, row0 );
  7025. vTemp = _mm_add_ps( vTemp, row3 );
  7026. vTemp = _mm_add_ps( vTemp, vTemp2 );
  7027. XMVECTOR W = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(3, 3, 3, 3) );
  7028. XMVECTOR V1 = _mm_div_ps( vTemp, W );
  7029. // Result 2
  7030. Y = XM_PERMUTE_PS( V, _MM_SHUFFLE(3, 3, 3, 3) );
  7031. X = XM_PERMUTE_PS( V, _MM_SHUFFLE(2, 2, 2, 2) );
  7032. vTemp = _mm_mul_ps( Y, row1 );
  7033. vTemp2 = _mm_mul_ps( X, row0 );
  7034. vTemp = _mm_add_ps( vTemp, row3 );
  7035. vTemp = _mm_add_ps( vTemp, vTemp2 );
  7036. W = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(3, 3, 3, 3) );
  7037. XMVECTOR V2 = _mm_div_ps( vTemp, W );
  7038. vTemp = _mm_movelh_ps( V1, V2 );
  7039. XM_STREAM_PS( reinterpret_cast<float*>(pOutputVector), vTemp );
  7040. pOutputVector += sizeof(XMFLOAT2)*2;
  7041. i += 2;
  7042. }
  7043. }
  7044. else
  7045. {
  7046. // Packed input, unaligned & packed output
  7047. for (size_t j = 0; j < two; ++j)
  7048. {
  7049. XMVECTOR V = _mm_loadu_ps( reinterpret_cast<const float*>(pInputVector) );
  7050. pInputVector += sizeof(XMFLOAT2)*2;
  7051. // Result 1
  7052. XMVECTOR Y = XM_PERMUTE_PS( V, _MM_SHUFFLE(1, 1, 1, 1) );
  7053. XMVECTOR X = XM_PERMUTE_PS( V, _MM_SHUFFLE(0, 0, 0, 0) );
  7054. XMVECTOR vTemp = _mm_mul_ps( Y, row1 );
  7055. XMVECTOR vTemp2 = _mm_mul_ps( X, row0 );
  7056. vTemp = _mm_add_ps( vTemp, row3 );
  7057. vTemp = _mm_add_ps( vTemp, vTemp2 );
  7058. XMVECTOR W = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(3, 3, 3, 3) );
  7059. XMVECTOR V1 = _mm_div_ps( vTemp, W );
  7060. // Result 2
  7061. Y = XM_PERMUTE_PS( V, _MM_SHUFFLE(3, 3, 3, 3) );
  7062. X = XM_PERMUTE_PS( V, _MM_SHUFFLE(2, 2, 2, 2) );
  7063. vTemp = _mm_mul_ps( Y, row1 );
  7064. vTemp2 = _mm_mul_ps( X, row0 );
  7065. vTemp = _mm_add_ps( vTemp, row3 );
  7066. vTemp = _mm_add_ps( vTemp, vTemp2 );
  7067. W = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(3, 3, 3, 3) );
  7068. XMVECTOR V2 = _mm_div_ps( vTemp, W );
  7069. vTemp = _mm_movelh_ps( V1, V2 );
  7070. _mm_storeu_ps( reinterpret_cast<float*>(pOutputVector), vTemp );
  7071. pOutputVector += sizeof(XMFLOAT2)*2;
  7072. i += 2;
  7073. }
  7074. }
  7075. }
  7076. else
  7077. {
  7078. // Packed input, unpacked output
  7079. for (size_t j = 0; j < two; ++j)
  7080. {
  7081. XMVECTOR V = _mm_loadu_ps( reinterpret_cast<const float*>(pInputVector) );
  7082. pInputVector += sizeof(XMFLOAT2)*2;
  7083. // Result 1
  7084. XMVECTOR Y = XM_PERMUTE_PS( V, _MM_SHUFFLE(1, 1, 1, 1) );
  7085. XMVECTOR X = XM_PERMUTE_PS( V, _MM_SHUFFLE(0, 0, 0, 0) );
  7086. XMVECTOR vTemp = _mm_mul_ps( Y, row1 );
  7087. XMVECTOR vTemp2 = _mm_mul_ps( X, row0 );
  7088. vTemp = _mm_add_ps( vTemp, row3 );
  7089. vTemp = _mm_add_ps( vTemp, vTemp2 );
  7090. XMVECTOR W = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(3, 3, 3, 3) );
  7091. vTemp = _mm_div_ps( vTemp, W );
  7092. vTemp2 = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(1, 1, 1, 1) );
  7093. _mm_store_ss( reinterpret_cast<float*>(pOutputVector), vTemp );
  7094. _mm_store_ss( reinterpret_cast<float*>(pOutputVector+4), vTemp2 );
  7095. pOutputVector += OutputStride;
  7096. // Result 2
  7097. Y = XM_PERMUTE_PS( V, _MM_SHUFFLE(3, 3, 3, 3) );
  7098. X = XM_PERMUTE_PS( V, _MM_SHUFFLE(2, 2, 2, 2) );
  7099. vTemp = _mm_mul_ps( Y, row1 );
  7100. vTemp2 = _mm_mul_ps( X, row0 );
  7101. vTemp = _mm_add_ps( vTemp, row3 );
  7102. vTemp = _mm_add_ps( vTemp, vTemp2 );
  7103. W = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(3, 3, 3, 3) );
  7104. vTemp = _mm_div_ps( vTemp, W );
  7105. vTemp2 = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(1, 1, 1, 1) );
  7106. _mm_store_ss( reinterpret_cast<float*>(pOutputVector), vTemp );
  7107. _mm_store_ss( reinterpret_cast<float*>(pOutputVector+4), vTemp2 );
  7108. pOutputVector += OutputStride;
  7109. i += 2;
  7110. }
  7111. }
  7112. }
  7113. }
  7114. if ( !((uintptr_t)pInputVector & 0xF) && !(InputStride & 0xF) )
  7115. {
  7116. // Aligned input
  7117. for (; i < VectorCount; i++)
  7118. {
  7119. XMVECTOR V = _mm_castsi128_ps( _mm_loadl_epi64( reinterpret_cast<const __m128i*>(pInputVector) ) );
  7120. pInputVector += InputStride;
  7121. XMVECTOR Y = XM_PERMUTE_PS( V, _MM_SHUFFLE(1, 1, 1, 1) );
  7122. XMVECTOR X = XM_PERMUTE_PS( V, _MM_SHUFFLE(0, 0, 0, 0) );
  7123. XMVECTOR vTemp = _mm_mul_ps( Y, row1 );
  7124. XMVECTOR vTemp2 = _mm_mul_ps( X, row0 );
  7125. vTemp = _mm_add_ps( vTemp, row3 );
  7126. vTemp = _mm_add_ps( vTemp, vTemp2 );
  7127. XMVECTOR W = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(3, 3, 3, 3) );
  7128. vTemp = _mm_div_ps( vTemp, W );
  7129. vTemp2 = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(1, 1, 1, 1) );
  7130. _mm_store_ss( reinterpret_cast<float*>(pOutputVector), vTemp );
  7131. _mm_store_ss( reinterpret_cast<float*>(pOutputVector+4), vTemp2 );
  7132. pOutputVector += OutputStride;
  7133. }
  7134. }
  7135. else
  7136. {
  7137. // Unaligned input
  7138. for (; i < VectorCount; i++)
  7139. {
  7140. __m128 x = _mm_load_ss( reinterpret_cast<const float*>(pInputVector) );
  7141. __m128 y = _mm_load_ss( reinterpret_cast<const float*>(pInputVector+4) );
  7142. pInputVector += InputStride;
  7143. XMVECTOR Y = XM_PERMUTE_PS( y, _MM_SHUFFLE(0, 0, 0, 0) );
  7144. XMVECTOR X = XM_PERMUTE_PS( x, _MM_SHUFFLE(0, 0, 0, 0) );
  7145. XMVECTOR vTemp = _mm_mul_ps( Y, row1 );
  7146. XMVECTOR vTemp2 = _mm_mul_ps( X, row0 );
  7147. vTemp = _mm_add_ps( vTemp, row3 );
  7148. vTemp = _mm_add_ps( vTemp, vTemp2 );
  7149. XMVECTOR W = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(3, 3, 3, 3) );
  7150. vTemp = _mm_div_ps( vTemp, W );
  7151. vTemp2 = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(1, 1, 1, 1) );
  7152. _mm_store_ss( reinterpret_cast<float*>(pOutputVector), vTemp );
  7153. _mm_store_ss( reinterpret_cast<float*>(pOutputVector+4), vTemp2 );
  7154. pOutputVector += OutputStride;
  7155. }
  7156. }
  7157. XM_SFENCE();
  7158. return pOutputStream;
  7159. #endif
  7160. }
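// Illustrative usage sketch (not part of the library): transforming a packed
// array of 2D points with XMFLOAT2 output. Using sizeof(XMFLOAT2) for both
// strides lets the loop above process two points per iteration; the array size
// and matrix are placeholders.
//
//     XMFLOAT2 pts[32];
//     XMFLOAT2 out[32];
//     XMVector2TransformCoordStream(out, sizeof(XMFLOAT2),
//                                   pts, sizeof(XMFLOAT2),
//                                   32, M);               // M: assumed caller-provided XMMATRIX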
  7161. //------------------------------------------------------------------------------
  7162. inline XMVECTOR XM_CALLCONV XMVector2TransformNormal
  7163. (
  7164. FXMVECTOR V,
  7165. FXMMATRIX M
  7166. )
  7167. {
  7168. #if defined(_XM_NO_INTRINSICS_)
  7169. XMVECTOR Y = XMVectorSplatY(V);
  7170. XMVECTOR X = XMVectorSplatX(V);
  7171. XMVECTOR Result = XMVectorMultiply(Y, M.r[1]);
  7172. Result = XMVectorMultiplyAdd(X, M.r[0], Result);
  7173. return Result;
  7174. #elif defined(_XM_ARM_NEON_INTRINSICS_)
  7175. float32x2_t VL = vget_low_f32( V );
  7176. float32x4_t Result = vmulq_lane_f32( M.r[1], VL, 1 ); // Y
  7177. return vmlaq_lane_f32( Result, M.r[0], VL, 0 ); // X
  7178. #elif defined(_XM_SSE_INTRINSICS_)
  7179. XMVECTOR vResult = XM_PERMUTE_PS(V,_MM_SHUFFLE(0,0,0,0));
  7180. vResult = _mm_mul_ps(vResult,M.r[0]);
  7181. XMVECTOR vTemp = XM_PERMUTE_PS(V,_MM_SHUFFLE(1,1,1,1));
  7182. vTemp = _mm_mul_ps(vTemp,M.r[1]);
  7183. vResult = _mm_add_ps(vResult,vTemp);
  7184. return vResult;
  7185. #endif
  7186. }
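// Illustrative usage sketch (not part of the library): unlike TransformCoord,
// TransformNormal uses only rows 0 and 1, so the translation row of the matrix
// is ignored, which is what a direction vector needs.
//
//     XMMATRIX rot = XMMatrixRotationZ(XMConvertToRadians(90.0f));
//     XMVECTOR d   = XMVectorSet(1.0f, 0.0f, 0.0f, 0.0f);
//     XMVECTOR n   = XMVector2TransformNormal(d, rot);    // approximately (0, 1, 0, 0)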
  7187. //------------------------------------------------------------------------------
  7188. _Use_decl_annotations_
  7189. inline XMFLOAT2* XM_CALLCONV XMVector2TransformNormalStream
  7190. (
  7191. XMFLOAT2* pOutputStream,
  7192. size_t OutputStride,
  7193. const XMFLOAT2* pInputStream,
  7194. size_t InputStride,
  7195. size_t VectorCount,
  7196. FXMMATRIX M
  7197. )
  7198. {
  7199. assert(pOutputStream != nullptr);
  7200. assert(pInputStream != nullptr);
  7201. assert(InputStride >= sizeof(XMFLOAT2));
  7202. _Analysis_assume_(InputStride >= sizeof(XMFLOAT2));
  7203. assert(OutputStride >= sizeof(XMFLOAT2));
  7204. _Analysis_assume_(OutputStride >= sizeof(XMFLOAT2));
  7205. #if defined(_XM_NO_INTRINSICS_)
  7206. const uint8_t* pInputVector = (const uint8_t*)pInputStream;
  7207. uint8_t* pOutputVector = (uint8_t*)pOutputStream;
  7208. const XMVECTOR row0 = M.r[0];
  7209. const XMVECTOR row1 = M.r[1];
  7210. for (size_t i = 0; i < VectorCount; i++)
  7211. {
  7212. XMVECTOR V = XMLoadFloat2((const XMFLOAT2*)pInputVector);
  7213. XMVECTOR Y = XMVectorSplatY(V);
  7214. XMVECTOR X = XMVectorSplatX(V);
  7215. XMVECTOR Result = XMVectorMultiply(Y, row1);
  7216. Result = XMVectorMultiplyAdd(X, row0, Result);
  7217. #ifdef _PREFAST_
  7218. #pragma prefast(push)
  7219. #pragma prefast(disable : 26015, "PREfast noise: Esp:1307" )
  7220. #endif
  7221. XMStoreFloat2((XMFLOAT2*)pOutputVector, Result);
  7222. #ifdef _PREFAST_
  7223. #pragma prefast(pop)
  7224. #endif
  7225. pInputVector += InputStride;
  7226. pOutputVector += OutputStride;
  7227. }
  7228. return pOutputStream;
  7229. #elif defined(_XM_ARM_NEON_INTRINSICS_)
  7230. const uint8_t* pInputVector = (const uint8_t*)pInputStream;
  7231. uint8_t* pOutputVector = (uint8_t*)pOutputStream;
  7232. const XMVECTOR row0 = M.r[0];
  7233. const XMVECTOR row1 = M.r[1];
  7234. size_t i = 0;
  7235. size_t four = VectorCount >> 2;
  7236. if ( four > 0 )
  7237. {
  7238. if ((InputStride == sizeof(XMFLOAT2)) && (OutputStride == sizeof(XMFLOAT2)))
  7239. {
  7240. for (size_t j = 0; j < four; ++j)
  7241. {
  7242. float32x4x2_t V = vld2q_f32( reinterpret_cast<const float*>(pInputVector) );
  7243. pInputVector += sizeof(XMFLOAT2)*4;
  7244. float32x2_t r = vget_low_f32( row0 );
  7245. XMVECTOR vResult0 = vmulq_lane_f32( V.val[0], r, 0 ); // Ax
  7246. XMVECTOR vResult1 = vmulq_lane_f32( V.val[0], r, 1 ); // Bx
  7247. __prefetch( pInputVector );
  7248. __prefetch( pInputVector+XM_CACHE_LINE_SIZE );
  7249. r = vget_low_f32( row1 );
  7250. vResult0 = vmlaq_lane_f32( vResult0, V.val[1], r, 0 ); // Ax+Ey
  7251. vResult1 = vmlaq_lane_f32( vResult1, V.val[1], r, 1 ); // Bx+Fy
  7252. __prefetch( pInputVector+(XM_CACHE_LINE_SIZE*2) );
  7253. __prefetch( pInputVector+(XM_CACHE_LINE_SIZE*3) );
  7254. V.val[0] = vResult0;
  7255. V.val[1] = vResult1;
  7256. vst2q_f32( reinterpret_cast<float*>(pOutputVector), V );
  7257. pOutputVector += sizeof(XMFLOAT2)*4;
  7258. i += 4;
  7259. }
  7260. }
  7261. }
  7262. for (; i < VectorCount; i++)
  7263. {
  7264. float32x2_t V = vld1_f32( reinterpret_cast<const float*>(pInputVector) );
  7265. pInputVector += InputStride;
  7266. XMVECTOR vResult = vmulq_lane_f32( row0, V, 0 ); // X
  7267. vResult = vmlaq_lane_f32( vResult, row1, V, 1 ); // Y
  7268. V = vget_low_f32( vResult );
  7269. vst1_f32( reinterpret_cast<float*>(pOutputVector), V );
  7270. pOutputVector += OutputStride;
  7271. }
  7272. return pOutputStream;
  7273. #elif defined(_XM_SSE_INTRINSICS_)
  7274. const uint8_t* pInputVector = (const uint8_t*)pInputStream;
  7275. uint8_t* pOutputVector = (uint8_t*)pOutputStream;
  7276. const XMVECTOR row0 = M.r[0];
  7277. const XMVECTOR row1 = M.r[1];
  7278. size_t i = 0;
  7279. size_t two = VectorCount >> 1;
  7280. if ( two > 0 )
  7281. {
  7282. if ( InputStride == sizeof(XMFLOAT2) )
  7283. {
  7284. if ( OutputStride == sizeof(XMFLOAT2) )
  7285. {
  7286. if ( !((uintptr_t)pOutputStream & 0xF) )
  7287. {
  7288. // Packed input, aligned & packed output
  7289. for (size_t j = 0; j < two; ++j)
  7290. {
  7291. XMVECTOR V = _mm_loadu_ps( reinterpret_cast<const float*>(pInputVector) );
  7292. pInputVector += sizeof(XMFLOAT2)*2;
  7293. // Result 1
  7294. XMVECTOR Y = XM_PERMUTE_PS( V, _MM_SHUFFLE(1, 1, 1, 1) );
  7295. XMVECTOR X = XM_PERMUTE_PS( V, _MM_SHUFFLE(0, 0, 0, 0) );
  7296. XMVECTOR vTemp = _mm_mul_ps( Y, row1 );
  7297. XMVECTOR vTemp2 = _mm_mul_ps( X, row0 );
  7298. XMVECTOR V1 = _mm_add_ps( vTemp, vTemp2 );
  7299. // Result 2
  7300. Y = XM_PERMUTE_PS( V, _MM_SHUFFLE(3, 3, 3, 3) );
  7301. X = XM_PERMUTE_PS( V, _MM_SHUFFLE(2, 2, 2, 2) );
  7302. vTemp = _mm_mul_ps( Y, row1 );
  7303. vTemp2 = _mm_mul_ps( X, row0 );
  7304. XMVECTOR V2 = _mm_add_ps( vTemp, vTemp2 );
  7305. vTemp = _mm_movelh_ps( V1, V2 );
  7306. XM_STREAM_PS( reinterpret_cast<float*>(pOutputVector), vTemp );
  7307. pOutputVector += sizeof(XMFLOAT2)*2;
  7308. i += 2;
  7309. }
  7310. }
  7311. else
  7312. {
  7313. // Packed input, unaligned & packed output
  7314. for (size_t j = 0; j < two; ++j)
  7315. {
  7316. XMVECTOR V = _mm_loadu_ps( reinterpret_cast<const float*>(pInputVector) );
  7317. pInputVector += sizeof(XMFLOAT2)*2;
  7318. // Result 1
  7319. XMVECTOR Y = XM_PERMUTE_PS( V, _MM_SHUFFLE(1, 1, 1, 1) );
  7320. XMVECTOR X = XM_PERMUTE_PS( V, _MM_SHUFFLE(0, 0, 0, 0) );
  7321. XMVECTOR vTemp = _mm_mul_ps( Y, row1 );
  7322. XMVECTOR vTemp2 = _mm_mul_ps( X, row0 );
  7323. XMVECTOR V1 = _mm_add_ps( vTemp, vTemp2 );
  7324. // Result 2
  7325. Y = XM_PERMUTE_PS( V, _MM_SHUFFLE(3, 3, 3, 3) );
  7326. X = XM_PERMUTE_PS( V, _MM_SHUFFLE(2, 2, 2, 2) );
  7327. vTemp = _mm_mul_ps( Y, row1 );
  7328. vTemp2 = _mm_mul_ps( X, row0 );
  7329. XMVECTOR V2 = _mm_add_ps( vTemp, vTemp2 );
  7330. vTemp = _mm_movelh_ps( V1, V2 );
  7331. _mm_storeu_ps( reinterpret_cast<float*>(pOutputVector), vTemp );
  7332. pOutputVector += sizeof(XMFLOAT2)*2;
  7333. i += 2;
  7334. }
  7335. }
  7336. }
  7337. else
  7338. {
  7339. // Packed input, unpacked output
  7340. for (size_t j = 0; j < two; ++j)
  7341. {
  7342. XMVECTOR V = _mm_loadu_ps( reinterpret_cast<const float*>(pInputVector) );
  7343. pInputVector += sizeof(XMFLOAT2)*2;
  7344. // Result 1
  7345. XMVECTOR Y = XM_PERMUTE_PS( V, _MM_SHUFFLE(1, 1, 1, 1) );
  7346. XMVECTOR X = XM_PERMUTE_PS( V, _MM_SHUFFLE(0, 0, 0, 0) );
  7347. XMVECTOR vTemp = _mm_mul_ps( Y, row1 );
  7348. XMVECTOR vTemp2 = _mm_mul_ps( X, row0 );
  7349. vTemp = _mm_add_ps( vTemp, vTemp2 );
  7350. vTemp2 = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(1, 1, 1, 1) );
  7351. _mm_store_ss( reinterpret_cast<float*>(pOutputVector), vTemp );
  7352. _mm_store_ss( reinterpret_cast<float*>(pOutputVector+4), vTemp2 );
  7353. pOutputVector += OutputStride;
  7354. // Result 2
  7355. Y = XM_PERMUTE_PS( V, _MM_SHUFFLE(3, 3, 3, 3) );
  7356. X = XM_PERMUTE_PS( V, _MM_SHUFFLE(2, 2, 2, 2) );
  7357. vTemp = _mm_mul_ps( Y, row1 );
  7358. vTemp2 = _mm_mul_ps( X, row0 );
  7359. vTemp = _mm_add_ps( vTemp, vTemp2 );
  7360. vTemp2 = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(1, 1, 1, 1) );
  7361. _mm_store_ss( reinterpret_cast<float*>(pOutputVector), vTemp );
  7362. _mm_store_ss( reinterpret_cast<float*>(pOutputVector+4), vTemp2 );
  7363. pOutputVector += OutputStride;
  7364. i += 2;
  7365. }
  7366. }
  7367. }
  7368. }
  7369. if ( !((uintptr_t)pInputVector & 0xF) && !(InputStride & 0xF) )
  7370. {
  7371. // Aligned input
  7372. for (; i < VectorCount; i++)
  7373. {
  7374. XMVECTOR V = _mm_castsi128_ps( _mm_loadl_epi64( reinterpret_cast<const __m128i*>(pInputVector) ) );
  7375. pInputVector += InputStride;
  7376. XMVECTOR Y = XM_PERMUTE_PS( V, _MM_SHUFFLE(1, 1, 1, 1) );
  7377. XMVECTOR X = XM_PERMUTE_PS( V, _MM_SHUFFLE(0, 0, 0, 0) );
  7378. XMVECTOR vTemp = _mm_mul_ps( Y, row1 );
  7379. XMVECTOR vTemp2 = _mm_mul_ps( X, row0 );
  7380. vTemp = _mm_add_ps( vTemp, vTemp2 );
  7381. vTemp2 = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(1, 1, 1, 1) );
  7382. _mm_store_ss( reinterpret_cast<float*>(pOutputVector), vTemp );
  7383. _mm_store_ss( reinterpret_cast<float*>(pOutputVector+4), vTemp2 );
  7384. pOutputVector += OutputStride;
  7385. }
  7386. }
  7387. else
  7388. {
  7389. // Unaligned input
  7390. for (; i < VectorCount; i++)
  7391. {
  7392. __m128 x = _mm_load_ss( reinterpret_cast<const float*>(pInputVector) );
  7393. __m128 y = _mm_load_ss( reinterpret_cast<const float*>(pInputVector+4) );
  7394. pInputVector += InputStride;
  7395. XMVECTOR Y = XM_PERMUTE_PS( y, _MM_SHUFFLE(0, 0, 0, 0) );
  7396. XMVECTOR X = XM_PERMUTE_PS( x, _MM_SHUFFLE(0, 0, 0, 0) );
  7397. XMVECTOR vTemp = _mm_mul_ps( Y, row1 );
  7398. XMVECTOR vTemp2 = _mm_mul_ps( X, row0 );
  7399. vTemp = _mm_add_ps( vTemp, vTemp2 );
  7400. vTemp2 = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(1, 1, 1, 1) );
  7401. _mm_store_ss( reinterpret_cast<float*>(pOutputVector), vTemp );
  7402. _mm_store_ss( reinterpret_cast<float*>(pOutputVector+4), vTemp2 );
  7403. pOutputVector += OutputStride;
  7404. }
  7405. }
  7406. XM_SFENCE();
  7407. return pOutputStream;
  7408. #endif
  7409. }
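// Illustrative usage sketch (not part of the library): the stride parameters
// allow reading normals embedded in a larger vertex structure. The Vertex type
// below is hypothetical.
//
//     struct Vertex { XMFLOAT2 pos; XMFLOAT2 normal; };
//     Vertex   verts[16];
//     XMFLOAT2 normalsOut[16];
//     XMVector2TransformNormalStream(normalsOut, sizeof(XMFLOAT2),
//                                    &verts[0].normal, sizeof(Vertex),
//                                    16, M);              // M: assumed caller-provided XMMATRIX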
  7410. /****************************************************************************
  7411. *
  7412. * 3D Vector
  7413. *
  7414. ****************************************************************************/
  7415. //------------------------------------------------------------------------------
  7416. // Comparison operations
  7417. //------------------------------------------------------------------------------
  7418. //------------------------------------------------------------------------------
  7419. inline bool XM_CALLCONV XMVector3Equal
  7420. (
  7421. FXMVECTOR V1,
  7422. FXMVECTOR V2
  7423. )
  7424. {
  7425. #if defined(_XM_NO_INTRINSICS_)
  7426. return (((V1.vector4_f32[0] == V2.vector4_f32[0]) && (V1.vector4_f32[1] == V2.vector4_f32[1]) && (V1.vector4_f32[2] == V2.vector4_f32[2])) != 0);
  7427. #elif defined(_XM_ARM_NEON_INTRINSICS_)
  7428. uint32x4_t vResult = vceqq_f32( V1, V2 );
  7429. int8x8x2_t vTemp = vzip_u8(vget_low_u8(vResult), vget_high_u8(vResult));
  7430. vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]);
  7431. return ( (vget_lane_u32(vTemp.val[1], 1) & 0xFFFFFFU) == 0xFFFFFFU );
  7432. #elif defined(_XM_SSE_INTRINSICS_)
  7433. XMVECTOR vTemp = _mm_cmpeq_ps(V1,V2);
  7434. return (((_mm_movemask_ps(vTemp)&7)==7) != 0);
  7435. #endif
  7436. }
  7437. //------------------------------------------------------------------------------
  7438. inline uint32_t XM_CALLCONV XMVector3EqualR
  7439. (
  7440. FXMVECTOR V1,
  7441. FXMVECTOR V2
  7442. )
  7443. {
  7444. #if defined(_XM_NO_INTRINSICS_)
  7445. uint32_t CR = 0;
  7446. if ((V1.vector4_f32[0] == V2.vector4_f32[0]) &&
  7447. (V1.vector4_f32[1] == V2.vector4_f32[1]) &&
  7448. (V1.vector4_f32[2] == V2.vector4_f32[2]))
  7449. {
  7450. CR = XM_CRMASK_CR6TRUE;
  7451. }
  7452. else if ((V1.vector4_f32[0] != V2.vector4_f32[0]) &&
  7453. (V1.vector4_f32[1] != V2.vector4_f32[1]) &&
  7454. (V1.vector4_f32[2] != V2.vector4_f32[2]))
  7455. {
  7456. CR = XM_CRMASK_CR6FALSE;
  7457. }
  7458. return CR;
  7459. #elif defined(_XM_ARM_NEON_INTRINSICS_)
  7460. uint32x4_t vResult = vceqq_f32( V1, V2 );
  7461. int8x8x2_t vTemp = vzip_u8(vget_low_u8(vResult), vget_high_u8(vResult));
  7462. vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]);
  7463. uint32_t r = vget_lane_u32(vTemp.val[1], 1) & 0xFFFFFFU;
  7464. uint32_t CR = 0;
  7465. if ( r == 0xFFFFFFU )
  7466. {
  7467. CR = XM_CRMASK_CR6TRUE;
  7468. }
  7469. else if ( !r )
  7470. {
  7471. CR = XM_CRMASK_CR6FALSE;
  7472. }
  7473. return CR;
  7474. #elif defined(_XM_SSE_INTRINSICS_)
  7475. XMVECTOR vTemp = _mm_cmpeq_ps(V1,V2);
  7476. int iTest = _mm_movemask_ps(vTemp)&7;
  7477. uint32_t CR = 0;
  7478. if (iTest==7)
  7479. {
  7480. CR = XM_CRMASK_CR6TRUE;
  7481. }
  7482. else if (!iTest)
  7483. {
  7484. CR = XM_CRMASK_CR6FALSE;
  7485. }
  7486. return CR;
  7487. #endif
  7488. }
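// Illustrative usage sketch (not part of the library): the R variants return a
// CR6-style mask rather than a bool, which the XMComparison* helpers decode.
//
//     uint32_t cr = XMVector3EqualR(a, b);                // a, b: assumed XMVECTORs
//     if (XMComparisonAllTrue(cr))  { /* x, y and z are all equal */ }
//     if (XMComparisonAnyFalse(cr)) { /* at least one component differs */ }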
  7489. //------------------------------------------------------------------------------
  7490. inline bool XM_CALLCONV XMVector3EqualInt
  7491. (
  7492. FXMVECTOR V1,
  7493. FXMVECTOR V2
  7494. )
  7495. {
  7496. #if defined(_XM_NO_INTRINSICS_)
  7497. return (((V1.vector4_u32[0] == V2.vector4_u32[0]) && (V1.vector4_u32[1] == V2.vector4_u32[1]) && (V1.vector4_u32[2] == V2.vector4_u32[2])) != 0);
  7498. #elif defined(_XM_ARM_NEON_INTRINSICS_)
  7499. uint32x4_t vResult = vceqq_u32( V1, V2 );
  7500. int8x8x2_t vTemp = vzip_u8(vget_low_u8(vResult), vget_high_u8(vResult));
  7501. vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]);
  7502. return ( (vget_lane_u32(vTemp.val[1], 1) & 0xFFFFFFU) == 0xFFFFFFU );
  7503. #elif defined(_XM_SSE_INTRINSICS_)
  7504. __m128i vTemp = _mm_cmpeq_epi32(_mm_castps_si128(V1),_mm_castps_si128(V2));
  7505. return (((_mm_movemask_ps(_mm_castsi128_ps(vTemp))&7)==7) != 0);
  7506. #endif
  7507. }
  7508. //------------------------------------------------------------------------------
  7509. inline uint32_t XM_CALLCONV XMVector3EqualIntR
  7510. (
  7511. FXMVECTOR V1,
  7512. FXMVECTOR V2
  7513. )
  7514. {
  7515. #if defined(_XM_NO_INTRINSICS_)
  7516. uint32_t CR = 0;
  7517. if ((V1.vector4_u32[0] == V2.vector4_u32[0]) &&
  7518. (V1.vector4_u32[1] == V2.vector4_u32[1]) &&
  7519. (V1.vector4_u32[2] == V2.vector4_u32[2]))
  7520. {
  7521. CR = XM_CRMASK_CR6TRUE;
  7522. }
  7523. else if ((V1.vector4_u32[0] != V2.vector4_u32[0]) &&
  7524. (V1.vector4_u32[1] != V2.vector4_u32[1]) &&
  7525. (V1.vector4_u32[2] != V2.vector4_u32[2]))
  7526. {
  7527. CR = XM_CRMASK_CR6FALSE;
  7528. }
  7529. return CR;
  7530. #elif defined(_XM_ARM_NEON_INTRINSICS_)
  7531. uint32x4_t vResult = vceqq_u32( V1, V2 );
  7532. int8x8x2_t vTemp = vzip_u8(vget_low_u8(vResult), vget_high_u8(vResult));
  7533. vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]);
  7534. uint32_t r = vget_lane_u32(vTemp.val[1], 1) & 0xFFFFFFU;
  7535. uint32_t CR = 0;
  7536. if ( r == 0xFFFFFFU )
  7537. {
  7538. CR = XM_CRMASK_CR6TRUE;
  7539. }
  7540. else if ( !r )
  7541. {
  7542. CR = XM_CRMASK_CR6FALSE;
  7543. }
  7544. return CR;
  7545. #elif defined(_XM_SSE_INTRINSICS_)
  7546. __m128i vTemp = _mm_cmpeq_epi32(_mm_castps_si128(V1),_mm_castps_si128(V2));
  7547. int iTemp = _mm_movemask_ps(_mm_castsi128_ps(vTemp))&7;
  7548. uint32_t CR = 0;
  7549. if (iTemp==7)
  7550. {
  7551. CR = XM_CRMASK_CR6TRUE;
  7552. }
  7553. else if (!iTemp)
  7554. {
  7555. CR = XM_CRMASK_CR6FALSE;
  7556. }
  7557. return CR;
  7558. #endif
  7559. }
  7560. //------------------------------------------------------------------------------
  7561. inline bool XM_CALLCONV XMVector3NearEqual
  7562. (
  7563. FXMVECTOR V1,
  7564. FXMVECTOR V2,
  7565. FXMVECTOR Epsilon
  7566. )
  7567. {
  7568. #if defined(_XM_NO_INTRINSICS_)
  7569. float dx, dy, dz;
  7570. dx = fabsf(V1.vector4_f32[0]-V2.vector4_f32[0]);
  7571. dy = fabsf(V1.vector4_f32[1]-V2.vector4_f32[1]);
  7572. dz = fabsf(V1.vector4_f32[2]-V2.vector4_f32[2]);
  7573. return (((dx <= Epsilon.vector4_f32[0]) &&
  7574. (dy <= Epsilon.vector4_f32[1]) &&
  7575. (dz <= Epsilon.vector4_f32[2])) != 0);
  7576. #elif defined(_XM_ARM_NEON_INTRINSICS_)
  7577. float32x4_t vDelta = vsubq_f32( V1, V2 );
  7578. uint32x4_t vResult = vacleq_f32( vDelta, Epsilon );
  7579. int8x8x2_t vTemp = vzip_u8(vget_low_u8(vResult), vget_high_u8(vResult));
  7580. vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]);
  7581. return ( (vget_lane_u32(vTemp.val[1], 1) & 0xFFFFFFU) == 0xFFFFFFU );
  7582. #elif defined(_XM_SSE_INTRINSICS_)
  7583. // Get the difference
  7584. XMVECTOR vDelta = _mm_sub_ps(V1,V2);
  7585. // Get the absolute value of the difference
  7586. XMVECTOR vTemp = _mm_setzero_ps();
  7587. vTemp = _mm_sub_ps(vTemp,vDelta);
  7588. vTemp = _mm_max_ps(vTemp,vDelta);
  7589. vTemp = _mm_cmple_ps(vTemp,Epsilon);
  7590. // w is don't care
  7591. return (((_mm_movemask_ps(vTemp)&7)==0x7) != 0);
  7592. #endif
  7593. }
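// Illustrative usage sketch (not part of the library): tolerance comparison of
// the x, y and z components only; the epsilon may differ per component.
//
//     XMVECTOR eps = XMVectorReplicate(1.0e-4f);          // placeholder tolerance
//     if (XMVector3NearEqual(a, b, eps))                  // a, b: assumed XMVECTORs
//     {
//         // treat the two positions as coincident
//     }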
  7594. //------------------------------------------------------------------------------
  7595. inline bool XM_CALLCONV XMVector3NotEqual
  7596. (
  7597. FXMVECTOR V1,
  7598. FXMVECTOR V2
  7599. )
  7600. {
  7601. #if defined(_XM_NO_INTRINSICS_)
  7602. return (((V1.vector4_f32[0] != V2.vector4_f32[0]) || (V1.vector4_f32[1] != V2.vector4_f32[1]) || (V1.vector4_f32[2] != V2.vector4_f32[2])) != 0);
  7603. #elif defined(_XM_ARM_NEON_INTRINSICS_)
  7604. uint32x4_t vResult = vceqq_f32( V1, V2 );
  7605. int8x8x2_t vTemp = vzip_u8(vget_low_u8(vResult), vget_high_u8(vResult));
  7606. vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]);
  7607. return ( (vget_lane_u32(vTemp.val[1], 1) & 0xFFFFFFU) != 0xFFFFFFU );
  7608. #elif defined(_XM_SSE_INTRINSICS_)
  7609. XMVECTOR vTemp = _mm_cmpeq_ps(V1,V2);
  7610. return (((_mm_movemask_ps(vTemp)&7)!=7) != 0);
  7611. #endif
  7612. }
  7613. //------------------------------------------------------------------------------
  7614. inline bool XM_CALLCONV XMVector3NotEqualInt
  7615. (
  7616. FXMVECTOR V1,
  7617. FXMVECTOR V2
  7618. )
  7619. {
  7620. #if defined(_XM_NO_INTRINSICS_)
  7621. return (((V1.vector4_u32[0] != V2.vector4_u32[0]) || (V1.vector4_u32[1] != V2.vector4_u32[1]) || (V1.vector4_u32[2] != V2.vector4_u32[2])) != 0);
  7622. #elif defined(_XM_ARM_NEON_INTRINSICS_)
  7623. uint32x4_t vResult = vceqq_u32( V1, V2 );
  7624. int8x8x2_t vTemp = vzip_u8(vget_low_u8(vResult), vget_high_u8(vResult));
  7625. vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]);
  7626. return ( (vget_lane_u32(vTemp.val[1], 1) & 0xFFFFFFU) != 0xFFFFFFU );
  7627. #elif defined(_XM_SSE_INTRINSICS_)
  7628. __m128i vTemp = _mm_cmpeq_epi32(_mm_castps_si128(V1),_mm_castps_si128(V2));
  7629. return (((_mm_movemask_ps(_mm_castsi128_ps(vTemp))&7)!=7) != 0);
  7630. #endif
  7631. }
  7632. //------------------------------------------------------------------------------
  7633. inline bool XM_CALLCONV XMVector3Greater
  7634. (
  7635. FXMVECTOR V1,
  7636. FXMVECTOR V2
  7637. )
  7638. {
  7639. #if defined(_XM_NO_INTRINSICS_)
  7640. return (((V1.vector4_f32[0] > V2.vector4_f32[0]) && (V1.vector4_f32[1] > V2.vector4_f32[1]) && (V1.vector4_f32[2] > V2.vector4_f32[2])) != 0);
  7641. #elif defined(_XM_ARM_NEON_INTRINSICS_)
  7642. uint32x4_t vResult = vcgtq_f32( V1, V2 );
  7643. int8x8x2_t vTemp = vzip_u8(vget_low_u8(vResult), vget_high_u8(vResult));
  7644. vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]);
  7645. return ( (vget_lane_u32(vTemp.val[1], 1) & 0xFFFFFFU) == 0xFFFFFFU );
  7646. #elif defined(_XM_SSE_INTRINSICS_)
  7647. XMVECTOR vTemp = _mm_cmpgt_ps(V1,V2);
  7648. return (((_mm_movemask_ps(vTemp)&7)==7) != 0);
  7649. #endif
  7650. }
  7651. //------------------------------------------------------------------------------
  7652. inline uint32_t XM_CALLCONV XMVector3GreaterR
  7653. (
  7654. FXMVECTOR V1,
  7655. FXMVECTOR V2
  7656. )
  7657. {
  7658. #if defined(_XM_NO_INTRINSICS_)
  7659. uint32_t CR = 0;
  7660. if ((V1.vector4_f32[0] > V2.vector4_f32[0]) &&
  7661. (V1.vector4_f32[1] > V2.vector4_f32[1]) &&
  7662. (V1.vector4_f32[2] > V2.vector4_f32[2]))
  7663. {
  7664. CR = XM_CRMASK_CR6TRUE;
  7665. }
  7666. else if ((V1.vector4_f32[0] <= V2.vector4_f32[0]) &&
  7667. (V1.vector4_f32[1] <= V2.vector4_f32[1]) &&
  7668. (V1.vector4_f32[2] <= V2.vector4_f32[2]))
  7669. {
  7670. CR = XM_CRMASK_CR6FALSE;
  7671. }
  7672. return CR;
  7673. #elif defined(_XM_ARM_NEON_INTRINSICS_)
  7674. uint32x4_t vResult = vcgtq_f32( V1, V2 );
  7675. int8x8x2_t vTemp = vzip_u8(vget_low_u8(vResult), vget_high_u8(vResult));
  7676. vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]);
  7677. uint32_t r = vget_lane_u32(vTemp.val[1], 1) & 0xFFFFFFU;
  7678. uint32_t CR = 0;
  7679. if ( r == 0xFFFFFFU )
  7680. {
  7681. CR = XM_CRMASK_CR6TRUE;
  7682. }
  7683. else if ( !r )
  7684. {
  7685. CR = XM_CRMASK_CR6FALSE;
  7686. }
  7687. return CR;
  7688. #elif defined(_XM_SSE_INTRINSICS_)
  7689. XMVECTOR vTemp = _mm_cmpgt_ps(V1,V2);
  7690. uint32_t CR = 0;
  7691. int iTest = _mm_movemask_ps(vTemp)&7;
  7692. if (iTest==7)
  7693. {
  7694. CR = XM_CRMASK_CR6TRUE;
  7695. }
  7696. else if (!iTest)
  7697. {
  7698. CR = XM_CRMASK_CR6FALSE;
  7699. }
  7700. return CR;
  7701. #endif
  7702. }
  7703. //------------------------------------------------------------------------------
  7704. inline bool XM_CALLCONV XMVector3GreaterOrEqual
  7705. (
  7706. FXMVECTOR V1,
  7707. FXMVECTOR V2
  7708. )
  7709. {
  7710. #if defined(_XM_NO_INTRINSICS_)
  7711. return (((V1.vector4_f32[0] >= V2.vector4_f32[0]) && (V1.vector4_f32[1] >= V2.vector4_f32[1]) && (V1.vector4_f32[2] >= V2.vector4_f32[2])) != 0);
  7712. #elif defined(_XM_ARM_NEON_INTRINSICS_)
  7713. uint32x4_t vResult = vcgeq_f32( V1, V2 );
  7714. int8x8x2_t vTemp = vzip_u8(vget_low_u8(vResult), vget_high_u8(vResult));
  7715. vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]);
  7716. return ( (vget_lane_u32(vTemp.val[1], 1) & 0xFFFFFFU) == 0xFFFFFFU );
  7717. #elif defined(_XM_SSE_INTRINSICS_)
  7718. XMVECTOR vTemp = _mm_cmpge_ps(V1,V2);
  7719. return (((_mm_movemask_ps(vTemp)&7)==7) != 0);
  7720. #endif
  7721. }
  7722. //------------------------------------------------------------------------------
  7723. inline uint32_t XM_CALLCONV XMVector3GreaterOrEqualR
  7724. (
  7725. FXMVECTOR V1,
  7726. FXMVECTOR V2
  7727. )
  7728. {
  7729. #if defined(_XM_NO_INTRINSICS_)
  7730. uint32_t CR = 0;
  7731. if ((V1.vector4_f32[0] >= V2.vector4_f32[0]) &&
  7732. (V1.vector4_f32[1] >= V2.vector4_f32[1]) &&
  7733. (V1.vector4_f32[2] >= V2.vector4_f32[2]))
  7734. {
  7735. CR = XM_CRMASK_CR6TRUE;
  7736. }
  7737. else if ((V1.vector4_f32[0] < V2.vector4_f32[0]) &&
  7738. (V1.vector4_f32[1] < V2.vector4_f32[1]) &&
  7739. (V1.vector4_f32[2] < V2.vector4_f32[2]))
  7740. {
  7741. CR = XM_CRMASK_CR6FALSE;
  7742. }
  7743. return CR;
  7744. #elif defined(_XM_ARM_NEON_INTRINSICS_)
  7745. uint32x4_t vResult = vcgeq_f32( V1, V2 );
  7746. int8x8x2_t vTemp = vzip_u8(vget_low_u8(vResult), vget_high_u8(vResult));
  7747. vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]);
  7748. uint32_t r = vget_lane_u32(vTemp.val[1], 1) & 0xFFFFFFU;
  7749. uint32_t CR = 0;
  7750. if ( r == 0xFFFFFFU )
  7751. {
  7752. CR = XM_CRMASK_CR6TRUE;
  7753. }
  7754. else if ( !r )
  7755. {
  7756. CR = XM_CRMASK_CR6FALSE;
  7757. }
  7758. return CR;
  7759. #elif defined(_XM_SSE_INTRINSICS_)
  7760. XMVECTOR vTemp = _mm_cmpge_ps(V1,V2);
  7761. uint32_t CR = 0;
  7762. int iTest = _mm_movemask_ps(vTemp)&7;
  7763. if (iTest==7)
  7764. {
  7765. CR = XM_CRMASK_CR6TRUE;
  7766. }
  7767. else if (!iTest)
  7768. {
  7769. CR = XM_CRMASK_CR6FALSE;
  7770. }
  7771. return CR;
  7772. #endif
  7773. }
  7774. //------------------------------------------------------------------------------
  7775. inline bool XM_CALLCONV XMVector3Less
  7776. (
  7777. FXMVECTOR V1,
  7778. FXMVECTOR V2
  7779. )
  7780. {
  7781. #if defined(_XM_NO_INTRINSICS_)
  7782. return (((V1.vector4_f32[0] < V2.vector4_f32[0]) && (V1.vector4_f32[1] < V2.vector4_f32[1]) && (V1.vector4_f32[2] < V2.vector4_f32[2])) != 0);
  7783. #elif defined(_XM_ARM_NEON_INTRINSICS_)
  7784. uint32x4_t vResult = vcltq_f32( V1, V2 );
  7785. int8x8x2_t vTemp = vzip_u8(vget_low_u8(vResult), vget_high_u8(vResult));
  7786. vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]);
  7787. return ( (vget_lane_u32(vTemp.val[1], 1) & 0xFFFFFFU) == 0xFFFFFFU );
  7788. #elif defined(_XM_SSE_INTRINSICS_)
  7789. XMVECTOR vTemp = _mm_cmplt_ps(V1,V2);
  7790. return (((_mm_movemask_ps(vTemp)&7)==7) != 0);
  7791. #endif
  7792. }
  7793. //------------------------------------------------------------------------------
  7794. inline bool XM_CALLCONV XMVector3LessOrEqual
  7795. (
  7796. FXMVECTOR V1,
  7797. FXMVECTOR V2
  7798. )
  7799. {
  7800. #if defined(_XM_NO_INTRINSICS_)
  7801. return (((V1.vector4_f32[0] <= V2.vector4_f32[0]) && (V1.vector4_f32[1] <= V2.vector4_f32[1]) && (V1.vector4_f32[2] <= V2.vector4_f32[2])) != 0);
  7802. #elif defined(_XM_ARM_NEON_INTRINSICS_)
  7803. uint32x4_t vResult = vcleq_f32( V1, V2 );
  7804. int8x8x2_t vTemp = vzip_u8(vget_low_u8(vResult), vget_high_u8(vResult));
  7805. vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]);
  7806. return ( (vget_lane_u32(vTemp.val[1], 1) & 0xFFFFFFU) == 0xFFFFFFU );
  7807. #elif defined(_XM_SSE_INTRINSICS_)
  7808. XMVECTOR vTemp = _mm_cmple_ps(V1,V2);
  7809. return (((_mm_movemask_ps(vTemp)&7)==7) != 0);
  7810. #endif
  7811. }
  7812. //------------------------------------------------------------------------------
  7813. inline bool XM_CALLCONV XMVector3InBounds
  7814. (
  7815. FXMVECTOR V,
  7816. FXMVECTOR Bounds
  7817. )
  7818. {
  7819. #if defined(_XM_NO_INTRINSICS_)
  7820. return (((V.vector4_f32[0] <= Bounds.vector4_f32[0] && V.vector4_f32[0] >= -Bounds.vector4_f32[0]) &&
  7821. (V.vector4_f32[1] <= Bounds.vector4_f32[1] && V.vector4_f32[1] >= -Bounds.vector4_f32[1]) &&
  7822. (V.vector4_f32[2] <= Bounds.vector4_f32[2] && V.vector4_f32[2] >= -Bounds.vector4_f32[2])) != 0);
  7823. #elif defined(_XM_ARM_NEON_INTRINSICS_)
  7824. // Test if less than or equal
  7825. uint32x4_t ivTemp1 = vcleq_f32(V,Bounds);
  7826. // Negate the bounds
  7827. float32x4_t vTemp2 = vnegq_f32(Bounds);
  7828. // Test if greater or equal (Reversed)
  7829. uint32x4_t ivTemp2 = vcleq_f32(vTemp2,V);
  7830. // Blend answers
  7831. ivTemp1 = vandq_u32(ivTemp1,ivTemp2);
  7832. // in bounds?
  7833. int8x8x2_t vTemp = vzip_u8(vget_low_u8(ivTemp1), vget_high_u8(ivTemp1));
  7834. vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]);
  7835. return ( (vget_lane_u32(vTemp.val[1], 1) & 0xFFFFFFU) == 0xFFFFFFU );
  7836. #elif defined(_XM_SSE_INTRINSICS_)
  7837. // Test if less than or equal
  7838. XMVECTOR vTemp1 = _mm_cmple_ps(V,Bounds);
  7839. // Negate the bounds
  7840. XMVECTOR vTemp2 = _mm_mul_ps(Bounds,g_XMNegativeOne);
  7841. // Test if greater or equal (Reversed)
  7842. vTemp2 = _mm_cmple_ps(vTemp2,V);
  7843. // Blend answers
  7844. vTemp1 = _mm_and_ps(vTemp1,vTemp2);
  7845. // x,y and z in bounds? (w is don't care)
  7846. return (((_mm_movemask_ps(vTemp1)&0x7)==0x7) != 0);
  7847. #else
  7848. return XMComparisonAllInBounds(XMVector3InBoundsR(V, Bounds));
  7849. #endif
  7850. }
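// Illustrative usage sketch (not part of the library): Bounds holds positive
// half-extents, so this tests |x| <= bx, |y| <= by and |z| <= bz (w is ignored).
//
//     XMVECTOR halfExtents = XMVectorSet(1.0f, 2.0f, 3.0f, 0.0f);
//     bool inside = XMVector3InBounds(p, halfExtents);    // p: assumed point relative to the box center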
  7851. //------------------------------------------------------------------------------
  7852. inline bool XM_CALLCONV XMVector3IsNaN
  7853. (
  7854. FXMVECTOR V
  7855. )
  7856. {
  7857. #if defined(_XM_NO_INTRINSICS_)
  7858. return (XMISNAN(V.vector4_f32[0]) ||
  7859. XMISNAN(V.vector4_f32[1]) ||
  7860. XMISNAN(V.vector4_f32[2]));
  7861. #elif defined(_XM_ARM_NEON_INTRINSICS_)
  7862. // Test against itself. NaN is always not equal
  7863. uint32x4_t vTempNan = vceqq_f32( V, V );
  7864. int8x8x2_t vTemp = vzip_u8(vget_low_u8(vTempNan), vget_high_u8(vTempNan));
  7865. vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]);
  7866. // If x or y or z are NaN, the mask is zero
  7867. return ( (vget_lane_u32(vTemp.val[1], 1) & 0xFFFFFFU) != 0xFFFFFFU );
  7868. #elif defined(_XM_SSE_INTRINSICS_)
  7869. // Test against itself. NaN is always not equal
  7870. XMVECTOR vTempNan = _mm_cmpneq_ps(V,V);
  7871. // If x or y or z are NaN, the mask is non-zero
  7872. return ((_mm_movemask_ps(vTempNan)&7) != 0);
  7873. #endif
  7874. }
  7875. //------------------------------------------------------------------------------
  7876. inline bool XM_CALLCONV XMVector3IsInfinite
  7877. (
  7878. FXMVECTOR V
  7879. )
  7880. {
  7881. #if defined(_XM_NO_INTRINSICS_)
  7882. return (XMISINF(V.vector4_f32[0]) ||
  7883. XMISINF(V.vector4_f32[1]) ||
  7884. XMISINF(V.vector4_f32[2]));
  7885. #elif defined(_XM_ARM_NEON_INTRINSICS_)
  7886. // Mask off the sign bit
  7887. uint32x4_t vTempInf = vandq_u32( V, g_XMAbsMask );
  7888. // Compare to infinity
  7889. vTempInf = vceqq_f32(vTempInf, g_XMInfinity );
7890. // If any are infinity, the corresponding mask lanes are all ones.
  7891. int8x8x2_t vTemp = vzip_u8(vget_low_u8(vTempInf), vget_high_u8(vTempInf));
  7892. vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]);
  7893. return ( (vget_lane_u32(vTemp.val[1], 1) & 0xFFFFFFU) != 0 );
  7894. #elif defined(_XM_SSE_INTRINSICS_)
  7895. // Mask off the sign bit
  7896. __m128 vTemp = _mm_and_ps(V,g_XMAbsMask);
  7897. // Compare to infinity
  7898. vTemp = _mm_cmpeq_ps(vTemp,g_XMInfinity);
  7899. // If x,y or z are infinity, the signs are true.
  7900. return ((_mm_movemask_ps(vTemp)&7) != 0);
  7901. #endif
  7902. }
  7903. //------------------------------------------------------------------------------
  7904. // Computation operations
  7905. //------------------------------------------------------------------------------
  7906. //------------------------------------------------------------------------------
  7907. inline XMVECTOR XM_CALLCONV XMVector3Dot
  7908. (
  7909. FXMVECTOR V1,
  7910. FXMVECTOR V2
  7911. )
  7912. {
  7913. #if defined(_XM_NO_INTRINSICS_)
  7914. float fValue = V1.vector4_f32[0] * V2.vector4_f32[0] + V1.vector4_f32[1] * V2.vector4_f32[1] + V1.vector4_f32[2] * V2.vector4_f32[2];
  7915. XMVECTORF32 vResult;
  7916. vResult.f[0] =
  7917. vResult.f[1] =
  7918. vResult.f[2] =
  7919. vResult.f[3] = fValue;
  7920. return vResult.v;
  7921. #elif defined(_XM_ARM_NEON_INTRINSICS_)
  7922. float32x4_t vTemp = vmulq_f32( V1, V2 );
  7923. float32x2_t v1 = vget_low_f32( vTemp );
  7924. float32x2_t v2 = vget_high_f32( vTemp );
  7925. v1 = vpadd_f32( v1, v1 );
  7926. v2 = vdup_lane_f32( v2, 0 );
  7927. v1 = vadd_f32( v1, v2 );
  7928. return vcombine_f32( v1, v1 );
  7929. #elif defined(_XM_SSE4_INTRINSICS_)
  7930. return _mm_dp_ps( V1, V2, 0x7f );
  7931. #elif defined(_XM_SSE3_INTRINSICS_)
  7932. XMVECTOR vTemp = _mm_mul_ps(V1,V2);
  7933. vTemp = _mm_and_ps(vTemp, g_XMMask3);
  7934. vTemp = _mm_hadd_ps(vTemp,vTemp);
  7935. return _mm_hadd_ps(vTemp,vTemp);
  7936. #elif defined(_XM_SSE_INTRINSICS_)
  7937. // Perform the dot product
  7938. XMVECTOR vDot = _mm_mul_ps(V1,V2);
  7939. // x=Dot.vector4_f32[1], y=Dot.vector4_f32[2]
  7940. XMVECTOR vTemp = XM_PERMUTE_PS(vDot,_MM_SHUFFLE(2,1,2,1));
  7941. // Result.vector4_f32[0] = x+y
  7942. vDot = _mm_add_ss(vDot,vTemp);
  7943. // x=Dot.vector4_f32[2]
  7944. vTemp = XM_PERMUTE_PS(vTemp,_MM_SHUFFLE(1,1,1,1));
  7945. // Result.vector4_f32[0] = (x+y)+z
  7946. vDot = _mm_add_ss(vDot,vTemp);
  7947. // Splat x
  7948. return XM_PERMUTE_PS(vDot,_MM_SHUFFLE(0,0,0,0));
  7949. #endif
  7950. }
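// Illustrative usage sketch (not part of the library): the 3D dot product is
// replicated into all four lanes, so any component can be read back as the
// scalar result.
//
//     XMVECTOR a = XMVectorSet(1.0f, 2.0f, 3.0f, 99.0f);  // w is ignored by the 3D dot
//     XMVECTOR b = XMVectorSet(4.0f, 5.0f, 6.0f, 99.0f);
//     float    d = XMVectorGetX(XMVector3Dot(a, b));      // 1*4 + 2*5 + 3*6 = 32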
  7951. //------------------------------------------------------------------------------
  7952. inline XMVECTOR XM_CALLCONV XMVector3Cross
  7953. (
  7954. FXMVECTOR V1,
  7955. FXMVECTOR V2
  7956. )
  7957. {
  7958. // [ V1.y*V2.z - V1.z*V2.y, V1.z*V2.x - V1.x*V2.z, V1.x*V2.y - V1.y*V2.x ]
  7959. #if defined(_XM_NO_INTRINSICS_)
  7960. XMVECTORF32 vResult = { { {
  7961. (V1.vector4_f32[1] * V2.vector4_f32[2]) - (V1.vector4_f32[2] * V2.vector4_f32[1]),
  7962. (V1.vector4_f32[2] * V2.vector4_f32[0]) - (V1.vector4_f32[0] * V2.vector4_f32[2]),
  7963. (V1.vector4_f32[0] * V2.vector4_f32[1]) - (V1.vector4_f32[1] * V2.vector4_f32[0]),
  7964. 0.0f
  7965. } } };
  7966. return vResult.v;
  7967. #elif defined(_XM_ARM_NEON_INTRINSICS_)
  7968. float32x2_t v1xy = vget_low_f32(V1);
  7969. float32x2_t v2xy = vget_low_f32(V2);
  7970. float32x2_t v1yx = vrev64_f32( v1xy );
  7971. float32x2_t v2yx = vrev64_f32( v2xy );
  7972. float32x2_t v1zz = vdup_lane_f32( vget_high_f32(V1), 0 );
  7973. float32x2_t v2zz = vdup_lane_f32( vget_high_f32(V2), 0 );
  7974. XMVECTOR vResult = vmulq_f32( vcombine_f32(v1yx,v1xy), vcombine_f32(v2zz,v2yx) );
  7975. vResult = vmlsq_f32( vResult, vcombine_f32(v1zz,v1yx), vcombine_f32(v2yx,v2xy) );
  7976. vResult = veorq_u32( vResult, g_XMFlipY );
  7977. return vandq_u32( vResult, g_XMMask3 );
  7978. #elif defined(_XM_SSE_INTRINSICS_)
  7979. // y1,z1,x1,w1
  7980. XMVECTOR vTemp1 = XM_PERMUTE_PS(V1,_MM_SHUFFLE(3,0,2,1));
  7981. // z2,x2,y2,w2
  7982. XMVECTOR vTemp2 = XM_PERMUTE_PS(V2,_MM_SHUFFLE(3,1,0,2));
  7983. // Perform the left operation
  7984. XMVECTOR vResult = _mm_mul_ps(vTemp1,vTemp2);
  7985. // z1,x1,y1,w1
  7986. vTemp1 = XM_PERMUTE_PS(vTemp1,_MM_SHUFFLE(3,0,2,1));
  7987. // y2,z2,x2,w2
  7988. vTemp2 = XM_PERMUTE_PS(vTemp2,_MM_SHUFFLE(3,1,0,2));
  7989. // Perform the right operation
  7990. vTemp1 = _mm_mul_ps(vTemp1,vTemp2);
7991. // Subtract the right from left, and return answer
  7992. vResult = _mm_sub_ps(vResult,vTemp1);
  7993. // Set w to zero
  7994. return _mm_and_ps(vResult,g_XMMask3);
  7995. #endif
  7996. }
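// Illustrative usage sketch (not part of the library): the cross product of the
// x and y axes yields the z axis, and the w component of the result is forced
// to zero by the final mask.
//
//     XMVECTOR xAxis = XMVectorSet(1.0f, 0.0f, 0.0f, 0.0f);
//     XMVECTOR yAxis = XMVectorSet(0.0f, 1.0f, 0.0f, 0.0f);
//     XMVECTOR zAxis = XMVector3Cross(xAxis, yAxis);      // (0, 0, 1, 0)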
  7997. //------------------------------------------------------------------------------
  7998. inline XMVECTOR XM_CALLCONV XMVector3LengthSq
  7999. (
  8000. FXMVECTOR V
  8001. )
  8002. {
  8003. return XMVector3Dot(V, V);
  8004. }
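// Illustrative usage sketch (not part of the library): squared lengths are
// enough for distance comparisons and avoid the square root entirely.
//
//     XMVECTOR toTarget = XMVectorSubtract(target, pos);  // target, pos: assumed XMVECTORs
//     if (XMVectorGetX(XMVector3LengthSq(toTarget)) < radius * radius)
//     {
//         // within 'radius' of the target
//     }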
  8005. //------------------------------------------------------------------------------
  8006. inline XMVECTOR XM_CALLCONV XMVector3ReciprocalLengthEst
  8007. (
  8008. FXMVECTOR V
  8009. )
  8010. {
  8011. #if defined(_XM_NO_INTRINSICS_)
  8012. XMVECTOR Result;
  8013. Result = XMVector3LengthSq(V);
  8014. Result = XMVectorReciprocalSqrtEst(Result);
  8015. return Result;
  8016. #elif defined(_XM_ARM_NEON_INTRINSICS_)
  8017. // Dot3
  8018. float32x4_t vTemp = vmulq_f32( V, V );
  8019. float32x2_t v1 = vget_low_f32( vTemp );
  8020. float32x2_t v2 = vget_high_f32( vTemp );
  8021. v1 = vpadd_f32( v1, v1 );
  8022. v2 = vdup_lane_f32( v2, 0 );
  8023. v1 = vadd_f32( v1, v2 );
  8024. // Reciprocal sqrt (estimate)
  8025. v2 = vrsqrte_f32( v1 );
  8026. return vcombine_f32(v2, v2);
  8027. #elif defined(_XM_SSE4_INTRINSICS_)
  8028. XMVECTOR vTemp = _mm_dp_ps( V, V, 0x7f );
  8029. return _mm_rsqrt_ps( vTemp );
  8030. #elif defined(_XM_SSE3_INTRINSICS_)
  8031. XMVECTOR vLengthSq = _mm_mul_ps(V, V);
  8032. vLengthSq = _mm_and_ps(vLengthSq, g_XMMask3);
  8033. vLengthSq = _mm_hadd_ps(vLengthSq, vLengthSq);
  8034. vLengthSq = _mm_hadd_ps(vLengthSq, vLengthSq);
  8035. vLengthSq = _mm_rsqrt_ps(vLengthSq);
  8036. return vLengthSq;
  8037. #elif defined(_XM_SSE_INTRINSICS_)
  8038. // Perform the dot product on x,y and z
  8039. XMVECTOR vLengthSq = _mm_mul_ps(V,V);
  8040. // vTemp has z and y
  8041. XMVECTOR vTemp = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(1,2,1,2));
  8042. // x+z, y
  8043. vLengthSq = _mm_add_ss(vLengthSq,vTemp);
  8044. // y,y,y,y
  8045. vTemp = XM_PERMUTE_PS(vTemp,_MM_SHUFFLE(1,1,1,1));
  8046. // x+z+y,??,??,??
  8047. vLengthSq = _mm_add_ss(vLengthSq,vTemp);
  8048. // Splat the length squared
  8049. vLengthSq = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(0,0,0,0));
8050. // Get the reciprocal sqrt (estimate)
  8051. vLengthSq = _mm_rsqrt_ps(vLengthSq);
  8052. return vLengthSq;
  8053. #endif
  8054. }
  8055. //------------------------------------------------------------------------------
  8056. inline XMVECTOR XM_CALLCONV XMVector3ReciprocalLength
  8057. (
  8058. FXMVECTOR V
  8059. )
  8060. {
  8061. #if defined(_XM_NO_INTRINSICS_)
  8062. XMVECTOR Result;
  8063. Result = XMVector3LengthSq(V);
  8064. Result = XMVectorReciprocalSqrt(Result);
  8065. return Result;
  8066. #elif defined(_XM_ARM_NEON_INTRINSICS_)
  8067. // Dot3
  8068. float32x4_t vTemp = vmulq_f32( V, V );
  8069. float32x2_t v1 = vget_low_f32( vTemp );
  8070. float32x2_t v2 = vget_high_f32( vTemp );
  8071. v1 = vpadd_f32( v1, v1 );
  8072. v2 = vdup_lane_f32( v2, 0 );
  8073. v1 = vadd_f32( v1, v2 );
  8074. // Reciprocal sqrt
  8075. float32x2_t S0 = vrsqrte_f32(v1);
  8076. float32x2_t P0 = vmul_f32( v1, S0 );
  8077. float32x2_t R0 = vrsqrts_f32( P0, S0 );
  8078. float32x2_t S1 = vmul_f32( S0, R0 );
  8079. float32x2_t P1 = vmul_f32( v1, S1 );
  8080. float32x2_t R1 = vrsqrts_f32( P1, S1 );
  8081. float32x2_t Result = vmul_f32( S1, R1 );
  8082. return vcombine_f32( Result, Result );
  8083. #elif defined(_XM_SSE4_INTRINSICS_)
  8084. XMVECTOR vTemp = _mm_dp_ps( V, V, 0x7f );
  8085. XMVECTOR vLengthSq = _mm_sqrt_ps( vTemp );
  8086. return _mm_div_ps( g_XMOne, vLengthSq );
  8087. #elif defined(_XM_SSE3_INTRINSICS_)
  8088. XMVECTOR vDot = _mm_mul_ps(V, V);
  8089. vDot = _mm_and_ps(vDot, g_XMMask3);
  8090. vDot = _mm_hadd_ps(vDot, vDot);
  8091. vDot = _mm_hadd_ps(vDot, vDot);
  8092. vDot = _mm_sqrt_ps(vDot);
  8093. vDot = _mm_div_ps(g_XMOne,vDot);
  8094. return vDot;
  8095. #elif defined(_XM_SSE_INTRINSICS_)
  8096. // Perform the dot product
  8097. XMVECTOR vDot = _mm_mul_ps(V,V);
  8098. // x=Dot.y, y=Dot.z
  8099. XMVECTOR vTemp = XM_PERMUTE_PS(vDot,_MM_SHUFFLE(2,1,2,1));
  8100. // Result.x = x+y
  8101. vDot = _mm_add_ss(vDot,vTemp);
  8102. // x=Dot.z
  8103. vTemp = XM_PERMUTE_PS(vTemp,_MM_SHUFFLE(1,1,1,1));
  8104. // Result.x = (x+y)+z
  8105. vDot = _mm_add_ss(vDot,vTemp);
  8106. // Splat x
  8107. vDot = XM_PERMUTE_PS(vDot,_MM_SHUFFLE(0,0,0,0));
8108. // Get the length
  8109. vDot = _mm_sqrt_ps(vDot);
  8110. // Get the reciprocal
  8111. vDot = _mm_div_ps(g_XMOne,vDot);
  8112. return vDot;
  8113. #endif
  8114. }
  8115. //------------------------------------------------------------------------------
  8116. inline XMVECTOR XM_CALLCONV XMVector3LengthEst
  8117. (
  8118. FXMVECTOR V
  8119. )
  8120. {
  8121. #if defined(_XM_NO_INTRINSICS_)
  8122. XMVECTOR Result;
  8123. Result = XMVector3LengthSq(V);
  8124. Result = XMVectorSqrtEst(Result);
  8125. return Result;
  8126. #elif defined(_XM_ARM_NEON_INTRINSICS_)
  8127. // Dot3
  8128. float32x4_t vTemp = vmulq_f32( V, V );
  8129. float32x2_t v1 = vget_low_f32( vTemp );
  8130. float32x2_t v2 = vget_high_f32( vTemp );
  8131. v1 = vpadd_f32( v1, v1 );
  8132. v2 = vdup_lane_f32( v2, 0 );
  8133. v1 = vadd_f32( v1, v2 );
  8134. const float32x2_t zero = vdup_n_f32(0);
  8135. uint32x2_t VEqualsZero = vceq_f32( v1, zero );
  8136. // Sqrt (estimate)
  8137. float32x2_t Result = vrsqrte_f32( v1 );
  8138. Result = vmul_f32( v1, Result );
  8139. Result = vbsl_f32( VEqualsZero, zero, Result );
  8140. return vcombine_f32( Result, Result );
  8141. #elif defined(_XM_SSE4_INTRINSICS_)
  8142. XMVECTOR vTemp = _mm_dp_ps( V, V, 0x7f );
  8143. return _mm_sqrt_ps( vTemp );
  8144. #elif defined(_XM_SSE3_INTRINSICS_)
  8145. XMVECTOR vLengthSq = _mm_mul_ps(V, V);
  8146. vLengthSq = _mm_and_ps(vLengthSq, g_XMMask3);
  8147. vLengthSq = _mm_hadd_ps(vLengthSq, vLengthSq);
  8148. vLengthSq = _mm_hadd_ps(vLengthSq, vLengthSq);
  8149. vLengthSq = _mm_sqrt_ps(vLengthSq);
  8150. return vLengthSq;
  8151. #elif defined(_XM_SSE_INTRINSICS_)
  8152. // Perform the dot product on x,y and z
  8153. XMVECTOR vLengthSq = _mm_mul_ps(V,V);
  8154. // vTemp has z and y
  8155. XMVECTOR vTemp = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(1,2,1,2));
  8156. // x+z, y
  8157. vLengthSq = _mm_add_ss(vLengthSq,vTemp);
  8158. // y,y,y,y
  8159. vTemp = XM_PERMUTE_PS(vTemp,_MM_SHUFFLE(1,1,1,1));
  8160. // x+z+y,??,??,??
  8161. vLengthSq = _mm_add_ss(vLengthSq,vTemp);
  8162. // Splat the length squared
  8163. vLengthSq = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(0,0,0,0));
  8164. // Get the length
  8165. vLengthSq = _mm_sqrt_ps(vLengthSq);
  8166. return vLengthSq;
  8167. #endif
  8168. }
  8169. //------------------------------------------------------------------------------
  8170. inline XMVECTOR XM_CALLCONV XMVector3Length
  8171. (
  8172. FXMVECTOR V
  8173. )
  8174. {
  8175. #if defined(_XM_NO_INTRINSICS_)
  8176. XMVECTOR Result;
  8177. Result = XMVector3LengthSq(V);
  8178. Result = XMVectorSqrt(Result);
  8179. return Result;
  8180. #elif defined(_XM_ARM_NEON_INTRINSICS_)
  8181. // Dot3
  8182. float32x4_t vTemp = vmulq_f32( V, V );
  8183. float32x2_t v1 = vget_low_f32( vTemp );
  8184. float32x2_t v2 = vget_high_f32( vTemp );
  8185. v1 = vpadd_f32( v1, v1 );
  8186. v2 = vdup_lane_f32( v2, 0 );
  8187. v1 = vadd_f32( v1, v2 );
  8188. const float32x2_t zero = vdup_n_f32(0);
  8189. uint32x2_t VEqualsZero = vceq_f32( v1, zero );
  8190. // Sqrt
  8191. float32x2_t S0 = vrsqrte_f32( v1 );
  8192. float32x2_t P0 = vmul_f32( v1, S0 );
  8193. float32x2_t R0 = vrsqrts_f32( P0, S0 );
  8194. float32x2_t S1 = vmul_f32( S0, R0 );
  8195. float32x2_t P1 = vmul_f32( v1, S1 );
  8196. float32x2_t R1 = vrsqrts_f32( P1, S1 );
  8197. float32x2_t Result = vmul_f32( S1, R1 );
  8198. Result = vmul_f32( v1, Result );
  8199. Result = vbsl_f32( VEqualsZero, zero, Result );
  8200. return vcombine_f32( Result, Result );
  8201. #elif defined(_XM_SSE4_INTRINSICS_)
  8202. XMVECTOR vTemp = _mm_dp_ps( V, V, 0x7f );
  8203. return _mm_sqrt_ps( vTemp );
  8204. #elif defined(_XM_SSE3_INTRINSICS_)
  8205. XMVECTOR vLengthSq = _mm_mul_ps(V, V);
  8206. vLengthSq = _mm_and_ps(vLengthSq, g_XMMask3);
  8207. vLengthSq = _mm_hadd_ps(vLengthSq, vLengthSq);
  8208. vLengthSq = _mm_hadd_ps(vLengthSq, vLengthSq);
  8209. vLengthSq = _mm_sqrt_ps(vLengthSq);
  8210. return vLengthSq;
  8211. #elif defined(_XM_SSE_INTRINSICS_)
  8212. // Perform the dot product on x,y and z
  8213. XMVECTOR vLengthSq = _mm_mul_ps(V,V);
  8214. // vTemp has z and y
  8215. XMVECTOR vTemp = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(1,2,1,2));
  8216. // x+z, y
  8217. vLengthSq = _mm_add_ss(vLengthSq,vTemp);
  8218. // y,y,y,y
  8219. vTemp = XM_PERMUTE_PS(vTemp,_MM_SHUFFLE(1,1,1,1));
  8220. // x+z+y,??,??,??
  8221. vLengthSq = _mm_add_ss(vLengthSq,vTemp);
  8222. // Splat the length squared
  8223. vLengthSq = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(0,0,0,0));
  8224. // Get the length
  8225. vLengthSq = _mm_sqrt_ps(vLengthSq);
  8226. return vLengthSq;
  8227. #endif
  8228. }
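// Example (illustrative sketch, not part of the library): measuring the length of a
// 3D vector loaded from an XMFLOAT3. Assumes <DirectXMath.h> is included and
// using namespace DirectX is in effect; the names below are hypothetical.
//
//     XMFLOAT3 p(3.0f, 4.0f, 0.0f);
//     XMVECTOR v = XMLoadFloat3(&p);
//     float len   = XMVectorGetX(XMVector3Length(v));    // 5.0f
//     float lenSq = XMVectorGetX(XMVector3LengthSq(v));  // 25.0f, no square root
//
// XMVector3Length splats the result across all four lanes, so any component
// accessor returns the same scalar.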
  8229. //------------------------------------------------------------------------------
  8230. // XMVector3NormalizeEst uses a reciprocal estimate and
  8231. // returns QNaN on zero and infinite vectors.
  8232. inline XMVECTOR XM_CALLCONV XMVector3NormalizeEst
  8233. (
  8234. FXMVECTOR V
  8235. )
  8236. {
  8237. #if defined(_XM_NO_INTRINSICS_)
  8238. XMVECTOR Result;
  8239. Result = XMVector3ReciprocalLength(V);
  8240. Result = XMVectorMultiply(V, Result);
  8241. return Result;
  8242. #elif defined(_XM_ARM_NEON_INTRINSICS_)
  8243. // Dot3
  8244. float32x4_t vTemp = vmulq_f32( V, V );
  8245. float32x2_t v1 = vget_low_f32( vTemp );
  8246. float32x2_t v2 = vget_high_f32( vTemp );
  8247. v1 = vpadd_f32( v1, v1 );
  8248. v2 = vdup_lane_f32( v2, 0 );
  8249. v1 = vadd_f32( v1, v2 );
  8250. // Reciprocal sqrt (estimate)
  8251. v2 = vrsqrte_f32( v1 );
  8252. // Normalize
  8253. return vmulq_f32( V, vcombine_f32(v2,v2) );
  8254. #elif defined(_XM_SSE4_INTRINSICS_)
  8255. XMVECTOR vTemp = _mm_dp_ps( V, V, 0x7f );
  8256. XMVECTOR vResult = _mm_rsqrt_ps( vTemp );
  8257. return _mm_mul_ps(vResult, V);
  8258. #elif defined(_XM_SSE3_INTRINSICS_)
  8259. XMVECTOR vDot = _mm_mul_ps(V, V);
  8260. vDot = _mm_and_ps(vDot, g_XMMask3);
  8261. vDot = _mm_hadd_ps(vDot, vDot);
  8262. vDot = _mm_hadd_ps(vDot, vDot);
  8263. vDot = _mm_rsqrt_ps(vDot);
  8264. vDot = _mm_mul_ps(vDot,V);
  8265. return vDot;
  8266. #elif defined(_XM_SSE_INTRINSICS_)
  8267. // Perform the dot product
  8268. XMVECTOR vDot = _mm_mul_ps(V,V);
  8269. // x=Dot.y, y=Dot.z
  8270. XMVECTOR vTemp = XM_PERMUTE_PS(vDot,_MM_SHUFFLE(2,1,2,1));
  8271. // Result.x = x+y
  8272. vDot = _mm_add_ss(vDot,vTemp);
  8273. // x=Dot.z
  8274. vTemp = XM_PERMUTE_PS(vTemp,_MM_SHUFFLE(1,1,1,1));
  8275. // Result.x = (x+y)+z
  8276. vDot = _mm_add_ss(vDot,vTemp);
  8277. // Splat x
  8278. vDot = XM_PERMUTE_PS(vDot,_MM_SHUFFLE(0,0,0,0));
  8279. // Get the reciprocal
  8280. vDot = _mm_rsqrt_ps(vDot);
  8281. // Perform the normalization
  8282. vDot = _mm_mul_ps(vDot,V);
  8283. return vDot;
  8284. #endif
  8285. }
  8286. //------------------------------------------------------------------------------
  8287. inline XMVECTOR XM_CALLCONV XMVector3Normalize
  8288. (
  8289. FXMVECTOR V
  8290. )
  8291. {
  8292. #if defined(_XM_NO_INTRINSICS_)
  8293. float fLength;
  8294. XMVECTOR vResult;
  8295. vResult = XMVector3Length( V );
  8296. fLength = vResult.vector4_f32[0];
  8297. // Prevent divide by zero
  8298. if (fLength > 0) {
  8299. fLength = 1.0f/fLength;
  8300. }
  8301. vResult.vector4_f32[0] = V.vector4_f32[0]*fLength;
  8302. vResult.vector4_f32[1] = V.vector4_f32[1]*fLength;
  8303. vResult.vector4_f32[2] = V.vector4_f32[2]*fLength;
  8304. vResult.vector4_f32[3] = V.vector4_f32[3]*fLength;
  8305. return vResult;
  8306. #elif defined(_XM_ARM_NEON_INTRINSICS_)
  8307. // Dot3
  8308. float32x4_t vTemp = vmulq_f32( V, V );
  8309. float32x2_t v1 = vget_low_f32( vTemp );
  8310. float32x2_t v2 = vget_high_f32( vTemp );
  8311. v1 = vpadd_f32( v1, v1 );
  8312. v2 = vdup_lane_f32( v2, 0 );
  8313. v1 = vadd_f32( v1, v2 );
  8314. uint32x2_t VEqualsZero = vceq_f32( v1, vdup_n_f32(0) );
  8315. uint32x2_t VEqualsInf = vceq_f32( v1, vget_low_f32(g_XMInfinity) );
  8316. // Reciprocal sqrt (2 iterations of Newton-Raphson)
  8317. float32x2_t S0 = vrsqrte_f32( v1 );
  8318. float32x2_t P0 = vmul_f32( v1, S0 );
  8319. float32x2_t R0 = vrsqrts_f32( P0, S0 );
  8320. float32x2_t S1 = vmul_f32( S0, R0 );
  8321. float32x2_t P1 = vmul_f32( v1, S1 );
  8322. float32x2_t R1 = vrsqrts_f32( P1, S1 );
  8323. v2 = vmul_f32( S1, R1 );
  8324. // Normalize
  8325. XMVECTOR vResult = vmulq_f32( V, vcombine_f32(v2,v2) );
  8326. vResult = vbslq_f32( vcombine_f32(VEqualsZero,VEqualsZero), vdupq_n_f32(0), vResult );
  8327. return vbslq_f32( vcombine_f32(VEqualsInf,VEqualsInf), g_XMQNaN, vResult );
  8328. #elif defined(_XM_SSE4_INTRINSICS_)
  8329. XMVECTOR vLengthSq = _mm_dp_ps( V, V, 0x7f );
  8330. // Prepare for the division
  8331. XMVECTOR vResult = _mm_sqrt_ps(vLengthSq);
  8332. // Create zero with a single instruction
  8333. XMVECTOR vZeroMask = _mm_setzero_ps();
  8334. // Test for a divide by zero (Must be FP to detect -0.0)
  8335. vZeroMask = _mm_cmpneq_ps(vZeroMask,vResult);
8336. // Failsafe on zero (or epsilon) length vectors
  8337. // If the length is infinity, set the elements to zero
  8338. vLengthSq = _mm_cmpneq_ps(vLengthSq,g_XMInfinity);
  8339. // Divide to perform the normalization
  8340. vResult = _mm_div_ps(V,vResult);
  8341. // Any that are infinity, set to zero
  8342. vResult = _mm_and_ps(vResult,vZeroMask);
  8343. // Select qnan or result based on infinite length
  8344. XMVECTOR vTemp1 = _mm_andnot_ps(vLengthSq,g_XMQNaN);
  8345. XMVECTOR vTemp2 = _mm_and_ps(vResult,vLengthSq);
  8346. vResult = _mm_or_ps(vTemp1,vTemp2);
  8347. return vResult;
  8348. #elif defined(_XM_SSE3_INTRINSICS_)
  8349. // Perform the dot product on x,y and z only
  8350. XMVECTOR vLengthSq = _mm_mul_ps(V, V);
  8351. vLengthSq = _mm_and_ps(vLengthSq, g_XMMask3);
  8352. vLengthSq = _mm_hadd_ps(vLengthSq, vLengthSq);
  8353. vLengthSq = _mm_hadd_ps(vLengthSq, vLengthSq);
  8354. // Prepare for the division
  8355. XMVECTOR vResult = _mm_sqrt_ps(vLengthSq);
  8356. // Create zero with a single instruction
  8357. XMVECTOR vZeroMask = _mm_setzero_ps();
  8358. // Test for a divide by zero (Must be FP to detect -0.0)
  8359. vZeroMask = _mm_cmpneq_ps(vZeroMask,vResult);
8360. // Failsafe on zero (or epsilon) length vectors
  8361. // If the length is infinity, set the elements to zero
  8362. vLengthSq = _mm_cmpneq_ps(vLengthSq,g_XMInfinity);
  8363. // Divide to perform the normalization
  8364. vResult = _mm_div_ps(V,vResult);
  8365. // Any that are infinity, set to zero
  8366. vResult = _mm_and_ps(vResult,vZeroMask);
  8367. // Select qnan or result based on infinite length
  8368. XMVECTOR vTemp1 = _mm_andnot_ps(vLengthSq,g_XMQNaN);
  8369. XMVECTOR vTemp2 = _mm_and_ps(vResult,vLengthSq);
  8370. vResult = _mm_or_ps(vTemp1,vTemp2);
  8371. return vResult;
  8372. #elif defined(_XM_SSE_INTRINSICS_)
  8373. // Perform the dot product on x,y and z only
  8374. XMVECTOR vLengthSq = _mm_mul_ps(V,V);
  8375. XMVECTOR vTemp = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(2,1,2,1));
  8376. vLengthSq = _mm_add_ss(vLengthSq,vTemp);
  8377. vTemp = XM_PERMUTE_PS(vTemp,_MM_SHUFFLE(1,1,1,1));
  8378. vLengthSq = _mm_add_ss(vLengthSq,vTemp);
  8379. vLengthSq = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(0,0,0,0));
  8380. // Prepare for the division
  8381. XMVECTOR vResult = _mm_sqrt_ps(vLengthSq);
  8382. // Create zero with a single instruction
  8383. XMVECTOR vZeroMask = _mm_setzero_ps();
  8384. // Test for a divide by zero (Must be FP to detect -0.0)
  8385. vZeroMask = _mm_cmpneq_ps(vZeroMask,vResult);
8386. // Failsafe on zero (or epsilon) length vectors
  8387. // If the length is infinity, set the elements to zero
  8388. vLengthSq = _mm_cmpneq_ps(vLengthSq,g_XMInfinity);
  8389. // Divide to perform the normalization
  8390. vResult = _mm_div_ps(V,vResult);
  8391. // Any that are infinity, set to zero
  8392. vResult = _mm_and_ps(vResult,vZeroMask);
  8393. // Select qnan or result based on infinite length
  8394. XMVECTOR vTemp1 = _mm_andnot_ps(vLengthSq,g_XMQNaN);
  8395. XMVECTOR vTemp2 = _mm_and_ps(vResult,vLengthSq);
  8396. vResult = _mm_or_ps(vTemp1,vTemp2);
  8397. return vResult;
  8398. #endif
  8399. }
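// Example (illustrative sketch, not part of the library): normalizing a direction and
// the special cases handled above. Assumes using namespace DirectX; names are
// hypothetical.
//
//     XMVECTOR v = XMVectorSet(0.0f, 3.0f, 4.0f, 0.0f);
//     XMVECTOR n = XMVector3Normalize(v);               // (0, 0.6, 0.8, 0)
//
//     XMVECTOR z = XMVector3Normalize(XMVectorZero());  // zero stays zero
//
// A zero-length input yields zero rather than NaN, and an infinite-length input
// yields QNaN, as the zero/infinity handling in each path above ensures.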
  8400. //------------------------------------------------------------------------------
  8401. inline XMVECTOR XM_CALLCONV XMVector3ClampLength
  8402. (
  8403. FXMVECTOR V,
  8404. float LengthMin,
  8405. float LengthMax
  8406. )
  8407. {
  8408. XMVECTOR ClampMax = XMVectorReplicate(LengthMax);
  8409. XMVECTOR ClampMin = XMVectorReplicate(LengthMin);
  8410. return XMVector3ClampLengthV(V, ClampMin, ClampMax);
  8411. }
  8412. //------------------------------------------------------------------------------
  8413. inline XMVECTOR XM_CALLCONV XMVector3ClampLengthV
  8414. (
  8415. FXMVECTOR V,
  8416. FXMVECTOR LengthMin,
  8417. FXMVECTOR LengthMax
  8418. )
  8419. {
  8420. assert((XMVectorGetY(LengthMin) == XMVectorGetX(LengthMin)) && (XMVectorGetZ(LengthMin) == XMVectorGetX(LengthMin)));
  8421. assert((XMVectorGetY(LengthMax) == XMVectorGetX(LengthMax)) && (XMVectorGetZ(LengthMax) == XMVectorGetX(LengthMax)));
  8422. assert(XMVector3GreaterOrEqual(LengthMin, XMVectorZero()));
  8423. assert(XMVector3GreaterOrEqual(LengthMax, XMVectorZero()));
  8424. assert(XMVector3GreaterOrEqual(LengthMax, LengthMin));
  8425. XMVECTOR LengthSq = XMVector3LengthSq(V);
  8426. const XMVECTOR Zero = XMVectorZero();
  8427. XMVECTOR RcpLength = XMVectorReciprocalSqrt(LengthSq);
  8428. XMVECTOR InfiniteLength = XMVectorEqualInt(LengthSq, g_XMInfinity.v);
  8429. XMVECTOR ZeroLength = XMVectorEqual(LengthSq, Zero);
  8430. XMVECTOR Normal = XMVectorMultiply(V, RcpLength);
  8431. XMVECTOR Length = XMVectorMultiply(LengthSq, RcpLength);
  8432. XMVECTOR Select = XMVectorEqualInt(InfiniteLength, ZeroLength);
  8433. Length = XMVectorSelect(LengthSq, Length, Select);
  8434. Normal = XMVectorSelect(LengthSq, Normal, Select);
  8435. XMVECTOR ControlMax = XMVectorGreater(Length, LengthMax);
  8436. XMVECTOR ControlMin = XMVectorLess(Length, LengthMin);
  8437. XMVECTOR ClampLength = XMVectorSelect(Length, LengthMax, ControlMax);
  8438. ClampLength = XMVectorSelect(ClampLength, LengthMin, ControlMin);
  8439. XMVECTOR Result = XMVectorMultiply(Normal, ClampLength);
  8440. // Preserve the original vector (with no precision loss) if the length falls within the given range
  8441. XMVECTOR Control = XMVectorEqualInt(ControlMax, ControlMin);
  8442. Result = XMVectorSelect(Result, V, Control);
  8443. return Result;
  8444. }
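// Example (illustrative sketch, not part of the library): clamping a velocity to a
// speed range with the scalar wrapper above. Names are hypothetical.
//
//     XMVECTOR velocity = XMVectorSet(10.0f, 0.0f, 0.0f, 0.0f);
//     XMVECTOR limited  = XMVector3ClampLength(velocity, 1.0f, 5.0f);  // (5, 0, 0, 0)
//
// XMVector3ClampLengthV expects LengthMin/LengthMax to be replicated vectors
// (x == y == z), which is exactly what the XMVectorReplicate calls in
// XMVector3ClampLength produce.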
  8445. //------------------------------------------------------------------------------
  8446. inline XMVECTOR XM_CALLCONV XMVector3Reflect
  8447. (
  8448. FXMVECTOR Incident,
  8449. FXMVECTOR Normal
  8450. )
  8451. {
  8452. // Result = Incident - (2 * dot(Incident, Normal)) * Normal
  8453. XMVECTOR Result = XMVector3Dot(Incident, Normal);
  8454. Result = XMVectorAdd(Result, Result);
  8455. Result = XMVectorNegativeMultiplySubtract(Result, Normal, Incident);
  8456. return Result;
  8457. }
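// Example (illustrative sketch, not part of the library): bouncing an incident ray off
// a surface with unit normal N, using the identity in the comment above. Names are
// hypothetical.
//
//     XMVECTOR incident = XMVectorSet(1.0f, -1.0f, 0.0f, 0.0f);
//     XMVECTOR normal   = XMVectorSet(0.0f,  1.0f, 0.0f, 0.0f);  // must be normalized
//     XMVECTOR bounced  = XMVector3Reflect(incident, normal);    // (1, 1, 0, 0)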
  8458. //------------------------------------------------------------------------------
  8459. inline XMVECTOR XM_CALLCONV XMVector3Refract
  8460. (
  8461. FXMVECTOR Incident,
  8462. FXMVECTOR Normal,
  8463. float RefractionIndex
  8464. )
  8465. {
  8466. XMVECTOR Index = XMVectorReplicate(RefractionIndex);
  8467. return XMVector3RefractV(Incident, Normal, Index);
  8468. }
  8469. //------------------------------------------------------------------------------
  8470. inline XMVECTOR XM_CALLCONV XMVector3RefractV
  8471. (
  8472. FXMVECTOR Incident,
  8473. FXMVECTOR Normal,
  8474. FXMVECTOR RefractionIndex
  8475. )
  8476. {
  8477. // Result = RefractionIndex * Incident - Normal * (RefractionIndex * dot(Incident, Normal) +
  8478. // sqrt(1 - RefractionIndex * RefractionIndex * (1 - dot(Incident, Normal) * dot(Incident, Normal))))
  8479. #if defined(_XM_NO_INTRINSICS_)
  8480. const XMVECTOR Zero = XMVectorZero();
  8481. XMVECTOR IDotN = XMVector3Dot(Incident, Normal);
  8482. // R = 1.0f - RefractionIndex * RefractionIndex * (1.0f - IDotN * IDotN)
  8483. XMVECTOR R = XMVectorNegativeMultiplySubtract(IDotN, IDotN, g_XMOne.v);
  8484. R = XMVectorMultiply(R, RefractionIndex);
  8485. R = XMVectorNegativeMultiplySubtract(R, RefractionIndex, g_XMOne.v);
  8486. if (XMVector4LessOrEqual(R, Zero))
  8487. {
  8488. // Total internal reflection
  8489. return Zero;
  8490. }
  8491. else
  8492. {
  8493. // R = RefractionIndex * IDotN + sqrt(R)
  8494. R = XMVectorSqrt(R);
  8495. R = XMVectorMultiplyAdd(RefractionIndex, IDotN, R);
  8496. // Result = RefractionIndex * Incident - Normal * R
  8497. XMVECTOR Result = XMVectorMultiply(RefractionIndex, Incident);
  8498. Result = XMVectorNegativeMultiplySubtract(Normal, R, Result);
  8499. return Result;
  8500. }
  8501. #elif defined(_XM_ARM_NEON_INTRINSICS_)
  8502. XMVECTOR IDotN = XMVector3Dot(Incident,Normal);
  8503. // R = 1.0f - RefractionIndex * RefractionIndex * (1.0f - IDotN * IDotN)
  8504. float32x4_t R = vmlsq_f32( g_XMOne, IDotN, IDotN);
  8505. R = vmulq_f32(R, RefractionIndex);
  8506. R = vmlsq_f32(g_XMOne, R, RefractionIndex );
  8507. uint32x4_t vResult = vcleq_f32(R,g_XMZero);
  8508. int8x8x2_t vTemp = vzip_u8(vget_low_u8(vResult), vget_high_u8(vResult));
  8509. vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]);
  8510. if ( vget_lane_u32(vTemp.val[1], 1) == 0xFFFFFFFFU )
  8511. {
  8512. // Total internal reflection
  8513. vResult = g_XMZero;
  8514. }
  8515. else
  8516. {
  8517. // Sqrt(R)
  8518. float32x4_t S0 = vrsqrteq_f32(R);
  8519. float32x4_t P0 = vmulq_f32( R, S0 );
  8520. float32x4_t R0 = vrsqrtsq_f32( P0, S0 );
  8521. float32x4_t S1 = vmulq_f32( S0, R0 );
  8522. float32x4_t P1 = vmulq_f32( R, S1 );
  8523. float32x4_t R1 = vrsqrtsq_f32( P1, S1 );
  8524. float32x4_t S2 = vmulq_f32( S1, R1 );
  8525. R = vmulq_f32( R, S2 );
  8526. // R = RefractionIndex * IDotN + sqrt(R)
  8527. R = vmlaq_f32( R, RefractionIndex, IDotN );
  8528. // Result = RefractionIndex * Incident - Normal * R
  8529. vResult = vmulq_f32(RefractionIndex, Incident);
  8530. vResult = vmlsq_f32( vResult, R, Normal );
  8531. }
  8532. return vResult;
  8533. #elif defined(_XM_SSE_INTRINSICS_)
  8534. // Result = RefractionIndex * Incident - Normal * (RefractionIndex * dot(Incident, Normal) +
  8535. // sqrt(1 - RefractionIndex * RefractionIndex * (1 - dot(Incident, Normal) * dot(Incident, Normal))))
  8536. XMVECTOR IDotN = XMVector3Dot(Incident, Normal);
  8537. // R = 1.0f - RefractionIndex * RefractionIndex * (1.0f - IDotN * IDotN)
  8538. XMVECTOR R = _mm_mul_ps(IDotN, IDotN);
  8539. R = _mm_sub_ps(g_XMOne,R);
  8540. R = _mm_mul_ps(R, RefractionIndex);
  8541. R = _mm_mul_ps(R, RefractionIndex);
  8542. R = _mm_sub_ps(g_XMOne,R);
  8543. XMVECTOR vResult = _mm_cmple_ps(R,g_XMZero);
  8544. if (_mm_movemask_ps(vResult)==0x0f)
  8545. {
  8546. // Total internal reflection
  8547. vResult = g_XMZero;
  8548. }
  8549. else
  8550. {
  8551. // R = RefractionIndex * IDotN + sqrt(R)
  8552. R = _mm_sqrt_ps(R);
  8553. vResult = _mm_mul_ps(RefractionIndex,IDotN);
  8554. R = _mm_add_ps(R,vResult);
  8555. // Result = RefractionIndex * Incident - Normal * R
  8556. vResult = _mm_mul_ps(RefractionIndex, Incident);
  8557. R = _mm_mul_ps(R,Normal);
  8558. vResult = _mm_sub_ps(vResult,R);
  8559. }
  8560. return vResult;
  8561. #endif
  8562. }
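// Example (illustrative sketch, not part of the library): refracting a ray at an
// interface. The scalar passed to XMVector3Refract is the ratio of refraction indices
// n1/n2 (about 0.75 for air into water); both vectors should be normalized. Names are
// hypothetical.
//
//     XMVECTOR incident = XMVector3Normalize(XMVectorSet(1.0f, -1.0f, 0.0f, 0.0f));
//     XMVECTOR normal   = XMVectorSet(0.0f, 1.0f, 0.0f, 0.0f);
//     XMVECTOR bent     = XMVector3Refract(incident, normal, 1.0f / 1.33f);
//
// When the term under the square root is not positive, the early-out branch above
// returns zero, signalling total internal reflection.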
  8563. //------------------------------------------------------------------------------
  8564. inline XMVECTOR XM_CALLCONV XMVector3Orthogonal
  8565. (
  8566. FXMVECTOR V
  8567. )
  8568. {
  8569. XMVECTOR Zero = XMVectorZero();
  8570. XMVECTOR Z = XMVectorSplatZ(V);
  8571. XMVECTOR YZYY = XMVectorSwizzle<XM_SWIZZLE_Y, XM_SWIZZLE_Z, XM_SWIZZLE_Y, XM_SWIZZLE_Y>(V);
  8572. XMVECTOR NegativeV = XMVectorSubtract(Zero, V);
  8573. XMVECTOR ZIsNegative = XMVectorLess(Z, Zero);
  8574. XMVECTOR YZYYIsNegative = XMVectorLess(YZYY, Zero);
  8575. XMVECTOR S = XMVectorAdd(YZYY, Z);
  8576. XMVECTOR D = XMVectorSubtract(YZYY, Z);
  8577. XMVECTOR Select = XMVectorEqualInt(ZIsNegative, YZYYIsNegative);
  8578. XMVECTOR R0 = XMVectorPermute<XM_PERMUTE_1X, XM_PERMUTE_0X, XM_PERMUTE_0X, XM_PERMUTE_0X>(NegativeV, S);
  8579. XMVECTOR R1 = XMVectorPermute<XM_PERMUTE_1X, XM_PERMUTE_0X, XM_PERMUTE_0X, XM_PERMUTE_0X>(V, D);
  8580. return XMVectorSelect(R1, R0, Select);
  8581. }
  8582. //------------------------------------------------------------------------------
  8583. inline XMVECTOR XM_CALLCONV XMVector3AngleBetweenNormalsEst
  8584. (
  8585. FXMVECTOR N1,
  8586. FXMVECTOR N2
  8587. )
  8588. {
  8589. XMVECTOR Result = XMVector3Dot(N1, N2);
  8590. Result = XMVectorClamp(Result, g_XMNegativeOne.v, g_XMOne.v);
  8591. Result = XMVectorACosEst(Result);
  8592. return Result;
  8593. }
  8594. //------------------------------------------------------------------------------
  8595. inline XMVECTOR XM_CALLCONV XMVector3AngleBetweenNormals
  8596. (
  8597. FXMVECTOR N1,
  8598. FXMVECTOR N2
  8599. )
  8600. {
  8601. XMVECTOR Result = XMVector3Dot(N1, N2);
  8602. Result = XMVectorClamp(Result, g_XMNegativeOne.v, g_XMOne.v);
  8603. Result = XMVectorACos(Result);
  8604. return Result;
  8605. }
  8606. //------------------------------------------------------------------------------
  8607. inline XMVECTOR XM_CALLCONV XMVector3AngleBetweenVectors
  8608. (
  8609. FXMVECTOR V1,
  8610. FXMVECTOR V2
  8611. )
  8612. {
  8613. XMVECTOR L1 = XMVector3ReciprocalLength(V1);
  8614. XMVECTOR L2 = XMVector3ReciprocalLength(V2);
  8615. XMVECTOR Dot = XMVector3Dot(V1, V2);
  8616. L1 = XMVectorMultiply(L1, L2);
  8617. XMVECTOR CosAngle = XMVectorMultiply(Dot, L1);
  8618. CosAngle = XMVectorClamp(CosAngle, g_XMNegativeOne.v, g_XMOne.v);
  8619. return XMVectorACos(CosAngle);
  8620. }
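// Example (illustrative sketch, not part of the library): the angle between two
// arbitrary (not necessarily unit length) vectors. Names are hypothetical.
//
//     XMVECTOR a = XMVectorSet(2.0f, 0.0f, 0.0f, 0.0f);
//     XMVECTOR b = XMVectorSet(0.0f, 5.0f, 0.0f, 0.0f);
//     float radians = XMVectorGetX(XMVector3AngleBetweenVectors(a, b));  // XM_PIDIV2
//
// For inputs that are already normalized, XMVector3AngleBetweenNormals skips the
// two reciprocal-length computations.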
  8621. //------------------------------------------------------------------------------
  8622. inline XMVECTOR XM_CALLCONV XMVector3LinePointDistance
  8623. (
  8624. FXMVECTOR LinePoint1,
  8625. FXMVECTOR LinePoint2,
  8626. FXMVECTOR Point
  8627. )
  8628. {
  8629. // Given a vector PointVector from LinePoint1 to Point and a vector
  8630. // LineVector from LinePoint1 to LinePoint2, the scaled distance
  8631. // PointProjectionScale from LinePoint1 to the perpendicular projection
  8632. // of PointVector onto the line is defined as:
  8633. //
  8634. // PointProjectionScale = dot(PointVector, LineVector) / LengthSq(LineVector)
  8635. XMVECTOR PointVector = XMVectorSubtract(Point, LinePoint1);
  8636. XMVECTOR LineVector = XMVectorSubtract(LinePoint2, LinePoint1);
  8637. XMVECTOR LengthSq = XMVector3LengthSq(LineVector);
  8638. XMVECTOR PointProjectionScale = XMVector3Dot(PointVector, LineVector);
  8639. PointProjectionScale = XMVectorDivide(PointProjectionScale, LengthSq);
  8640. XMVECTOR DistanceVector = XMVectorMultiply(LineVector, PointProjectionScale);
  8641. DistanceVector = XMVectorSubtract(PointVector, DistanceVector);
  8642. return XMVector3Length(DistanceVector);
  8643. }
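// Example (illustrative sketch, not part of the library): distance from a point to the
// infinite line through two points (here, the x axis). Names are hypothetical.
//
//     XMVECTOR a = XMVectorSet(0.0f, 0.0f, 0.0f, 0.0f);
//     XMVECTOR b = XMVectorSet(1.0f, 0.0f, 0.0f, 0.0f);
//     XMVECTOR p = XMVectorSet(2.0f, 3.0f, 4.0f, 0.0f);
//     float d = XMVectorGetX(XMVector3LinePointDistance(a, b, p));  // 5.0f
//
// Note the distance is to the infinite line, not to the segment between the two points.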
  8644. //------------------------------------------------------------------------------
  8645. _Use_decl_annotations_
  8646. inline void XM_CALLCONV XMVector3ComponentsFromNormal
  8647. (
  8648. XMVECTOR* pParallel,
  8649. XMVECTOR* pPerpendicular,
  8650. FXMVECTOR V,
  8651. FXMVECTOR Normal
  8652. )
  8653. {
  8654. assert(pParallel != nullptr);
  8655. assert(pPerpendicular != nullptr);
  8656. XMVECTOR Scale = XMVector3Dot(V, Normal);
  8657. XMVECTOR Parallel = XMVectorMultiply(Normal, Scale);
  8658. *pParallel = Parallel;
  8659. *pPerpendicular = XMVectorSubtract(V, Parallel);
  8660. }
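// Example (illustrative sketch, not part of the library): splitting a velocity into the
// part along a unit surface normal and the part sliding across the surface. Normal is
// expected to be normalized; names are hypothetical.
//
//     XMVECTOR v = XMVectorSet(1.0f, 2.0f, 3.0f, 0.0f);
//     XMVECTOR n = XMVectorSet(0.0f, 1.0f, 0.0f, 0.0f);
//     XMVECTOR alongN, acrossN;
//     XMVector3ComponentsFromNormal(&alongN, &acrossN, v, n);
//     // alongN == (0, 2, 0, 0), acrossN == (1, 0, 3, 0), and alongN + acrossN == v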
  8661. //------------------------------------------------------------------------------
  8662. // Transform a vector using a rotation expressed as a unit quaternion
  8663. inline XMVECTOR XM_CALLCONV XMVector3Rotate
  8664. (
  8665. FXMVECTOR V,
  8666. FXMVECTOR RotationQuaternion
  8667. )
  8668. {
  8669. XMVECTOR A = XMVectorSelect(g_XMSelect1110.v, V, g_XMSelect1110.v);
  8670. XMVECTOR Q = XMQuaternionConjugate(RotationQuaternion);
  8671. XMVECTOR Result = XMQuaternionMultiply(Q, A);
  8672. return XMQuaternionMultiply(Result, RotationQuaternion);
  8673. }
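// Example (illustrative sketch, not part of the library): rotating a vector by a unit
// quaternion, then undoing it with XMVector3InverseRotate below. Names are
// hypothetical.
//
//     XMVECTOR q = XMQuaternionRotationAxis(XMVectorSet(0.0f, 0.0f, 1.0f, 0.0f), XM_PIDIV2);
//     XMVECTOR v = XMVectorSet(1.0f, 0.0f, 0.0f, 0.0f);
//     XMVECTOR r    = XMVector3Rotate(v, q);         // approximately (0, 1, 0, 0)
//     XMVECTOR back = XMVector3InverseRotate(r, q);  // approximately v again
//
// RotationQuaternion must be normalized; the conjugate used here is only the inverse
// for unit quaternions.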
  8674. //------------------------------------------------------------------------------
  8675. // Transform a vector using the inverse of a rotation expressed as a unit quaternion
  8676. inline XMVECTOR XM_CALLCONV XMVector3InverseRotate
  8677. (
  8678. FXMVECTOR V,
  8679. FXMVECTOR RotationQuaternion
  8680. )
  8681. {
  8682. XMVECTOR A = XMVectorSelect(g_XMSelect1110.v, V, g_XMSelect1110.v);
  8683. XMVECTOR Result = XMQuaternionMultiply(RotationQuaternion, A);
  8684. XMVECTOR Q = XMQuaternionConjugate(RotationQuaternion);
  8685. return XMQuaternionMultiply(Result, Q);
  8686. }
  8687. //------------------------------------------------------------------------------
  8688. inline XMVECTOR XM_CALLCONV XMVector3Transform
  8689. (
  8690. FXMVECTOR V,
  8691. FXMMATRIX M
  8692. )
  8693. {
  8694. #if defined(_XM_NO_INTRINSICS_)
  8695. XMVECTOR Z = XMVectorSplatZ(V);
  8696. XMVECTOR Y = XMVectorSplatY(V);
  8697. XMVECTOR X = XMVectorSplatX(V);
  8698. XMVECTOR Result = XMVectorMultiplyAdd(Z, M.r[2], M.r[3]);
  8699. Result = XMVectorMultiplyAdd(Y, M.r[1], Result);
  8700. Result = XMVectorMultiplyAdd(X, M.r[0], Result);
  8701. return Result;
  8702. #elif defined(_XM_ARM_NEON_INTRINSICS_)
  8703. float32x2_t VL = vget_low_f32( V );
  8704. XMVECTOR vResult = vmlaq_lane_f32( M.r[3], M.r[0], VL, 0 ); // X
  8705. vResult = vmlaq_lane_f32( vResult, M.r[1], VL, 1 ); // Y
  8706. return vmlaq_lane_f32( vResult, M.r[2], vget_high_f32( V ), 0 ); // Z
  8707. #elif defined(_XM_SSE_INTRINSICS_)
  8708. XMVECTOR vResult = XM_PERMUTE_PS(V,_MM_SHUFFLE(0,0,0,0));
  8709. vResult = _mm_mul_ps(vResult,M.r[0]);
  8710. XMVECTOR vTemp = XM_PERMUTE_PS(V,_MM_SHUFFLE(1,1,1,1));
  8711. vTemp = _mm_mul_ps(vTemp,M.r[1]);
  8712. vResult = _mm_add_ps(vResult,vTemp);
  8713. vTemp = XM_PERMUTE_PS(V,_MM_SHUFFLE(2,2,2,2));
  8714. vTemp = _mm_mul_ps(vTemp,M.r[2]);
  8715. vResult = _mm_add_ps(vResult,vTemp);
  8716. vResult = _mm_add_ps(vResult,M.r[3]);
  8717. return vResult;
  8718. #endif
  8719. }
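// Example (illustrative sketch, not part of the library): transforming a position by a
// matrix. The input's w component is ignored and treated as 1, so the translation row
// is applied; the result's w is not divided through (see XMVector3TransformCoord for
// that). Names are hypothetical.
//
//     XMMATRIX world = XMMatrixTranslation(10.0f, 0.0f, 0.0f);
//     XMVECTOR p = XMVectorSet(1.0f, 2.0f, 3.0f, 0.0f);
//     XMVECTOR r = XMVector3Transform(p, world);   // (11, 2, 3, 1)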
  8720. //------------------------------------------------------------------------------
  8721. #ifdef _PREFAST_
  8722. #pragma prefast(push)
  8723. #pragma prefast(disable : 26015 26019, "PREfast noise: Esp:1307" )
  8724. #endif
  8725. _Use_decl_annotations_
  8726. inline XMFLOAT4* XM_CALLCONV XMVector3TransformStream
  8727. (
  8728. XMFLOAT4* pOutputStream,
  8729. size_t OutputStride,
  8730. const XMFLOAT3* pInputStream,
  8731. size_t InputStride,
  8732. size_t VectorCount,
  8733. FXMMATRIX M
  8734. )
  8735. {
  8736. assert(pOutputStream != nullptr);
  8737. assert(pInputStream != nullptr);
  8738. assert(InputStride >= sizeof(XMFLOAT3));
  8739. _Analysis_assume_(InputStride >= sizeof(XMFLOAT3));
  8740. assert(OutputStride >= sizeof(XMFLOAT4));
  8741. _Analysis_assume_(OutputStride >= sizeof(XMFLOAT4));
  8742. #if defined(_XM_NO_INTRINSICS_)
  8743. const uint8_t* pInputVector = (const uint8_t*)pInputStream;
  8744. uint8_t* pOutputVector = (uint8_t*)pOutputStream;
  8745. const XMVECTOR row0 = M.r[0];
  8746. const XMVECTOR row1 = M.r[1];
  8747. const XMVECTOR row2 = M.r[2];
  8748. const XMVECTOR row3 = M.r[3];
  8749. for (size_t i = 0; i < VectorCount; i++)
  8750. {
  8751. XMVECTOR V = XMLoadFloat3((const XMFLOAT3*)pInputVector);
  8752. XMVECTOR Z = XMVectorSplatZ(V);
  8753. XMVECTOR Y = XMVectorSplatY(V);
  8754. XMVECTOR X = XMVectorSplatX(V);
  8755. XMVECTOR Result = XMVectorMultiplyAdd(Z, row2, row3);
  8756. Result = XMVectorMultiplyAdd(Y, row1, Result);
  8757. Result = XMVectorMultiplyAdd(X, row0, Result);
  8758. XMStoreFloat4((XMFLOAT4*)pOutputVector, Result);
  8759. pInputVector += InputStride;
  8760. pOutputVector += OutputStride;
  8761. }
  8762. return pOutputStream;
  8763. #elif defined(_XM_ARM_NEON_INTRINSICS_)
  8764. const uint8_t* pInputVector = (const uint8_t*)pInputStream;
  8765. uint8_t* pOutputVector = (uint8_t*)pOutputStream;
  8766. const XMVECTOR row0 = M.r[0];
  8767. const XMVECTOR row1 = M.r[1];
  8768. const XMVECTOR row2 = M.r[2];
  8769. const XMVECTOR row3 = M.r[3];
  8770. size_t i = 0;
  8771. size_t four = VectorCount >> 2;
  8772. if ( four > 0 )
  8773. {
  8774. if ((InputStride == sizeof(XMFLOAT3)) && (OutputStride == sizeof(XMFLOAT4)))
  8775. {
  8776. for (size_t j = 0; j < four; ++j)
  8777. {
  8778. float32x4x3_t V = vld3q_f32( reinterpret_cast<const float*>(pInputVector) );
  8779. pInputVector += sizeof(XMFLOAT3)*4;
  8780. float32x2_t r3 = vget_low_f32( row3 );
  8781. float32x2_t r = vget_low_f32( row0 );
  8782. XMVECTOR vResult0 = vmlaq_lane_f32( vdupq_lane_f32( r3, 0 ), V.val[0], r, 0 ); // Ax+M
  8783. XMVECTOR vResult1 = vmlaq_lane_f32( vdupq_lane_f32( r3, 1 ), V.val[0], r, 1 ); // Bx+N
  8784. __prefetch( pInputVector );
  8785. r3 = vget_high_f32( row3 );
  8786. r = vget_high_f32( row0 );
  8787. XMVECTOR vResult2 = vmlaq_lane_f32( vdupq_lane_f32( r3, 0 ), V.val[0], r, 0 ); // Cx+O
  8788. XMVECTOR vResult3 = vmlaq_lane_f32( vdupq_lane_f32( r3, 1 ), V.val[0], r, 1 ); // Dx+P
  8789. __prefetch( pInputVector+XM_CACHE_LINE_SIZE );
  8790. r = vget_low_f32( row1 );
  8791. vResult0 = vmlaq_lane_f32( vResult0, V.val[1], r, 0 ); // Ax+Ey+M
  8792. vResult1 = vmlaq_lane_f32( vResult1, V.val[1], r, 1 ); // Bx+Fy+N
  8793. __prefetch( pInputVector+(XM_CACHE_LINE_SIZE*2) );
  8794. r = vget_high_f32( row1 );
  8795. vResult2 = vmlaq_lane_f32( vResult2, V.val[1], r, 0 ); // Cx+Gy+O
  8796. vResult3 = vmlaq_lane_f32( vResult3, V.val[1], r, 1 ); // Dx+Hy+P
  8797. __prefetch( pInputVector+(XM_CACHE_LINE_SIZE*3) );
  8798. r = vget_low_f32( row2 );
  8799. vResult0 = vmlaq_lane_f32( vResult0, V.val[2], r, 0 ); // Ax+Ey+Iz+M
  8800. vResult1 = vmlaq_lane_f32( vResult1, V.val[2], r, 1 ); // Bx+Fy+Jz+N
  8801. __prefetch( pInputVector+(XM_CACHE_LINE_SIZE*4) );
  8802. r = vget_high_f32( row2 );
  8803. vResult2 = vmlaq_lane_f32( vResult2, V.val[2], r, 0 ); // Cx+Gy+Kz+O
  8804. vResult3 = vmlaq_lane_f32( vResult3, V.val[2], r, 1 ); // Dx+Hy+Lz+P
  8805. __prefetch( pInputVector+(XM_CACHE_LINE_SIZE*5) );
  8806. float32x4x4_t R;
  8807. R.val[0] = vResult0;
  8808. R.val[1] = vResult1;
  8809. R.val[2] = vResult2;
  8810. R.val[3] = vResult3;
  8811. vst4q_f32( reinterpret_cast<float*>(pOutputVector), R );
  8812. pOutputVector += sizeof(XMFLOAT4)*4;
  8813. i += 4;
  8814. }
  8815. }
  8816. }
  8817. for (; i < VectorCount; i++)
  8818. {
  8819. float32x2_t VL = vld1_f32( reinterpret_cast<const float*>(pInputVector) );
  8820. float32x2_t zero = vdup_n_f32(0);
  8821. float32x2_t VH = vld1_lane_f32( reinterpret_cast<const float*>(pInputVector)+2, zero, 0 );
  8822. pInputVector += InputStride;
  8823. XMVECTOR vResult = vmlaq_lane_f32( row3, row0, VL, 0 ); // X
  8824. vResult = vmlaq_lane_f32( vResult, row1, VL, 1); // Y
  8825. vResult = vmlaq_lane_f32( vResult, row2, VH, 0 ); // Z
  8826. vst1q_f32( reinterpret_cast<float*>(pOutputVector), vResult );
  8827. pOutputVector += OutputStride;
  8828. }
  8829. return pOutputStream;
  8830. #elif defined(_XM_SSE_INTRINSICS_)
  8831. const uint8_t* pInputVector = (const uint8_t*)pInputStream;
  8832. uint8_t* pOutputVector = (uint8_t*)pOutputStream;
  8833. const XMVECTOR row0 = M.r[0];
  8834. const XMVECTOR row1 = M.r[1];
  8835. const XMVECTOR row2 = M.r[2];
  8836. const XMVECTOR row3 = M.r[3];
  8837. size_t i = 0;
  8838. size_t four = VectorCount >> 2;
  8839. if ( four > 0 )
  8840. {
  8841. if (InputStride == sizeof(XMFLOAT3))
  8842. {
  8843. if ( !((uintptr_t)pOutputStream & 0xF) && !(OutputStride & 0xF) )
  8844. {
  8845. // Packed input, aligned output
  8846. for (size_t j = 0; j < four; ++j)
  8847. {
  8848. __m128 V1 = _mm_loadu_ps( reinterpret_cast<const float*>(pInputVector) );
  8849. __m128 L2 = _mm_loadu_ps( reinterpret_cast<const float*>(pInputVector+16) );
  8850. __m128 L3 = _mm_loadu_ps( reinterpret_cast<const float*>(pInputVector+32) );
  8851. pInputVector += sizeof(XMFLOAT3)*4;
  8852. // Unpack the 4 vectors (.w components are junk)
  8853. XM3UNPACK3INTO4(V1,L2,L3);
  8854. // Result 1
  8855. XMVECTOR Z = XM_PERMUTE_PS( V1, _MM_SHUFFLE(2, 2, 2, 2) );
  8856. XMVECTOR Y = XM_PERMUTE_PS( V1, _MM_SHUFFLE(1, 1, 1, 1) );
  8857. XMVECTOR X = XM_PERMUTE_PS( V1, _MM_SHUFFLE(0, 0, 0, 0) );
  8858. XMVECTOR vTemp = _mm_mul_ps( Z, row2 );
  8859. XMVECTOR vTemp2 = _mm_mul_ps( Y, row1 );
  8860. XMVECTOR vTemp3 = _mm_mul_ps( X, row0 );
  8861. vTemp = _mm_add_ps( vTemp, row3 );
  8862. vTemp = _mm_add_ps( vTemp, vTemp2 );
  8863. vTemp = _mm_add_ps( vTemp, vTemp3 );
  8864. XM_STREAM_PS( reinterpret_cast<float*>(pOutputVector), vTemp );
  8865. pOutputVector += OutputStride;
  8866. // Result 2
  8867. Z = XM_PERMUTE_PS( V2, _MM_SHUFFLE(2, 2, 2, 2) );
  8868. Y = XM_PERMUTE_PS( V2, _MM_SHUFFLE(1, 1, 1, 1) );
  8869. X = XM_PERMUTE_PS( V2, _MM_SHUFFLE(0, 0, 0, 0) );
  8870. vTemp = _mm_mul_ps( Z, row2 );
  8871. vTemp2 = _mm_mul_ps( Y, row1 );
  8872. vTemp3 = _mm_mul_ps( X, row0 );
  8873. vTemp = _mm_add_ps( vTemp, row3 );
  8874. vTemp = _mm_add_ps( vTemp, vTemp2 );
  8875. vTemp = _mm_add_ps( vTemp, vTemp3 );
  8876. XM_STREAM_PS( reinterpret_cast<float*>(pOutputVector), vTemp );
  8877. pOutputVector += OutputStride;
  8878. // Result 3
  8879. Z = XM_PERMUTE_PS( V3, _MM_SHUFFLE(2, 2, 2, 2) );
  8880. Y = XM_PERMUTE_PS( V3, _MM_SHUFFLE(1, 1, 1, 1) );
  8881. X = XM_PERMUTE_PS( V3, _MM_SHUFFLE(0, 0, 0, 0) );
  8882. vTemp = _mm_mul_ps( Z, row2 );
  8883. vTemp2 = _mm_mul_ps( Y, row1 );
  8884. vTemp3 = _mm_mul_ps( X, row0 );
  8885. vTemp = _mm_add_ps( vTemp, row3 );
  8886. vTemp = _mm_add_ps( vTemp, vTemp2 );
  8887. vTemp = _mm_add_ps( vTemp, vTemp3 );
  8888. XM_STREAM_PS( reinterpret_cast<float*>(pOutputVector), vTemp );
  8889. pOutputVector += OutputStride;
  8890. // Result 4
  8891. Z = XM_PERMUTE_PS( V4, _MM_SHUFFLE(2, 2, 2, 2) );
  8892. Y = XM_PERMUTE_PS( V4, _MM_SHUFFLE(1, 1, 1, 1) );
  8893. X = XM_PERMUTE_PS( V4, _MM_SHUFFLE(0, 0, 0, 0) );
  8894. vTemp = _mm_mul_ps( Z, row2 );
  8895. vTemp2 = _mm_mul_ps( Y, row1 );
  8896. vTemp3 = _mm_mul_ps( X, row0 );
  8897. vTemp = _mm_add_ps( vTemp, row3 );
  8898. vTemp = _mm_add_ps( vTemp, vTemp2 );
  8899. vTemp = _mm_add_ps( vTemp, vTemp3 );
  8900. XM_STREAM_PS( reinterpret_cast<float*>(pOutputVector), vTemp );
  8901. pOutputVector += OutputStride;
  8902. i += 4;
  8903. }
  8904. }
  8905. else
  8906. {
  8907. // Packed input, unaligned output
  8908. for (size_t j = 0; j < four; ++j)
  8909. {
  8910. __m128 V1 = _mm_loadu_ps( reinterpret_cast<const float*>(pInputVector) );
  8911. __m128 L2 = _mm_loadu_ps( reinterpret_cast<const float*>(pInputVector+16) );
  8912. __m128 L3 = _mm_loadu_ps( reinterpret_cast<const float*>(pInputVector+32) );
  8913. pInputVector += sizeof(XMFLOAT3)*4;
  8914. // Unpack the 4 vectors (.w components are junk)
  8915. XM3UNPACK3INTO4(V1,L2,L3);
  8916. // Result 1
  8917. XMVECTOR Z = XM_PERMUTE_PS( V1, _MM_SHUFFLE(2, 2, 2, 2) );
  8918. XMVECTOR Y = XM_PERMUTE_PS( V1, _MM_SHUFFLE(1, 1, 1, 1) );
  8919. XMVECTOR X = XM_PERMUTE_PS( V1, _MM_SHUFFLE(0, 0, 0, 0) );
  8920. XMVECTOR vTemp = _mm_mul_ps( Z, row2 );
  8921. XMVECTOR vTemp2 = _mm_mul_ps( Y, row1 );
  8922. XMVECTOR vTemp3 = _mm_mul_ps( X, row0 );
  8923. vTemp = _mm_add_ps( vTemp, row3 );
  8924. vTemp = _mm_add_ps( vTemp, vTemp2 );
  8925. vTemp = _mm_add_ps( vTemp, vTemp3 );
  8926. _mm_storeu_ps( reinterpret_cast<float*>(pOutputVector), vTemp );
  8927. pOutputVector += OutputStride;
  8928. // Result 2
  8929. Z = XM_PERMUTE_PS( V2, _MM_SHUFFLE(2, 2, 2, 2) );
  8930. Y = XM_PERMUTE_PS( V2, _MM_SHUFFLE(1, 1, 1, 1) );
  8931. X = XM_PERMUTE_PS( V2, _MM_SHUFFLE(0, 0, 0, 0) );
  8932. vTemp = _mm_mul_ps( Z, row2 );
  8933. vTemp2 = _mm_mul_ps( Y, row1 );
  8934. vTemp3 = _mm_mul_ps( X, row0 );
  8935. vTemp = _mm_add_ps( vTemp, row3 );
  8936. vTemp = _mm_add_ps( vTemp, vTemp2 );
  8937. vTemp = _mm_add_ps( vTemp, vTemp3 );
  8938. _mm_storeu_ps( reinterpret_cast<float*>(pOutputVector), vTemp );
  8939. pOutputVector += OutputStride;
  8940. // Result 3
  8941. Z = XM_PERMUTE_PS( V3, _MM_SHUFFLE(2, 2, 2, 2) );
  8942. Y = XM_PERMUTE_PS( V3, _MM_SHUFFLE(1, 1, 1, 1) );
  8943. X = XM_PERMUTE_PS( V3, _MM_SHUFFLE(0, 0, 0, 0) );
  8944. vTemp = _mm_mul_ps( Z, row2 );
  8945. vTemp2 = _mm_mul_ps( Y, row1 );
  8946. vTemp3 = _mm_mul_ps( X, row0 );
  8947. vTemp = _mm_add_ps( vTemp, row3 );
  8948. vTemp = _mm_add_ps( vTemp, vTemp2 );
  8949. vTemp = _mm_add_ps( vTemp, vTemp3 );
  8950. _mm_storeu_ps( reinterpret_cast<float*>(pOutputVector), vTemp );
  8951. pOutputVector += OutputStride;
  8952. // Result 4
  8953. Z = XM_PERMUTE_PS( V4, _MM_SHUFFLE(2, 2, 2, 2) );
  8954. Y = XM_PERMUTE_PS( V4, _MM_SHUFFLE(1, 1, 1, 1) );
  8955. X = XM_PERMUTE_PS( V4, _MM_SHUFFLE(0, 0, 0, 0) );
  8956. vTemp = _mm_mul_ps( Z, row2 );
  8957. vTemp2 = _mm_mul_ps( Y, row1 );
  8958. vTemp3 = _mm_mul_ps( X, row0 );
  8959. vTemp = _mm_add_ps( vTemp, row3 );
  8960. vTemp = _mm_add_ps( vTemp, vTemp2 );
  8961. vTemp = _mm_add_ps( vTemp, vTemp3 );
  8962. _mm_storeu_ps( reinterpret_cast<float*>(pOutputVector), vTemp );
  8963. pOutputVector += OutputStride;
  8964. i += 4;
  8965. }
  8966. }
  8967. }
  8968. }
  8969. if ( !((uintptr_t)pOutputStream & 0xF) && !(OutputStride & 0xF) )
  8970. {
  8971. // Aligned output
  8972. for (; i < VectorCount; ++i)
  8973. {
  8974. XMVECTOR V = XMLoadFloat3(reinterpret_cast<const XMFLOAT3*>(pInputVector));
  8975. pInputVector += InputStride;
  8976. XMVECTOR Z = XM_PERMUTE_PS( V, _MM_SHUFFLE(2, 2, 2, 2) );
  8977. XMVECTOR Y = XM_PERMUTE_PS( V, _MM_SHUFFLE(1, 1, 1, 1) );
  8978. XMVECTOR X = XM_PERMUTE_PS( V, _MM_SHUFFLE(0, 0, 0, 0) );
  8979. XMVECTOR vTemp = _mm_mul_ps( Z, row2 );
  8980. XMVECTOR vTemp2 = _mm_mul_ps( Y, row1 );
  8981. XMVECTOR vTemp3 = _mm_mul_ps( X, row0 );
  8982. vTemp = _mm_add_ps( vTemp, row3 );
  8983. vTemp = _mm_add_ps( vTemp, vTemp2 );
  8984. vTemp = _mm_add_ps( vTemp, vTemp3 );
  8985. XM_STREAM_PS( reinterpret_cast<float*>(pOutputVector), vTemp );
  8986. pOutputVector += OutputStride;
  8987. }
  8988. }
  8989. else
  8990. {
  8991. // Unaligned output
  8992. for (; i < VectorCount; ++i)
  8993. {
  8994. XMVECTOR V = XMLoadFloat3(reinterpret_cast<const XMFLOAT3*>(pInputVector));
  8995. pInputVector += InputStride;
  8996. XMVECTOR Z = XM_PERMUTE_PS( V, _MM_SHUFFLE(2, 2, 2, 2) );
  8997. XMVECTOR Y = XM_PERMUTE_PS( V, _MM_SHUFFLE(1, 1, 1, 1) );
  8998. XMVECTOR X = XM_PERMUTE_PS( V, _MM_SHUFFLE(0, 0, 0, 0) );
  8999. XMVECTOR vTemp = _mm_mul_ps( Z, row2 );
  9000. XMVECTOR vTemp2 = _mm_mul_ps( Y, row1 );
  9001. XMVECTOR vTemp3 = _mm_mul_ps( X, row0 );
  9002. vTemp = _mm_add_ps( vTemp, row3 );
  9003. vTemp = _mm_add_ps( vTemp, vTemp2 );
  9004. vTemp = _mm_add_ps( vTemp, vTemp3 );
  9005. _mm_storeu_ps( reinterpret_cast<float*>(pOutputVector), vTemp );
  9006. pOutputVector += OutputStride;
  9007. }
  9008. }
  9009. XM_SFENCE();
  9010. return pOutputStream;
  9011. #endif
  9012. }
  9013. #ifdef _PREFAST_
  9014. #pragma prefast(pop)
  9015. #endif
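// Example (illustrative sketch, not part of the library): transforming an array of
// packed XMFLOAT3 positions into XMFLOAT4 results in one call. The stride
// parameters are the byte distance between consecutive elements, which for tightly
// packed arrays is simply sizeof the element type. Names are hypothetical.
//
//     XMFLOAT3 src[64] = { /* ... */ };
//     XMFLOAT4 dst[64];
//     XMMATRIX world = XMMatrixRotationY(XM_PIDIV4);
//     XMVector3TransformStream(dst, sizeof(XMFLOAT4),
//                              src, sizeof(XMFLOAT3),
//                              64, world);
//
// The SSE path above takes its faster, streaming branch when the output pointer and
// stride are 16-byte aligned, so aligning dst (e.g. alignas(16)) helps.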
  9016. //------------------------------------------------------------------------------
  9017. inline XMVECTOR XM_CALLCONV XMVector3TransformCoord
  9018. (
  9019. FXMVECTOR V,
  9020. FXMMATRIX M
  9021. )
  9022. {
  9023. XMVECTOR Z = XMVectorSplatZ(V);
  9024. XMVECTOR Y = XMVectorSplatY(V);
  9025. XMVECTOR X = XMVectorSplatX(V);
  9026. XMVECTOR Result = XMVectorMultiplyAdd(Z, M.r[2], M.r[3]);
  9027. Result = XMVectorMultiplyAdd(Y, M.r[1], Result);
  9028. Result = XMVectorMultiplyAdd(X, M.r[0], Result);
  9029. XMVECTOR W = XMVectorSplatW(Result);
  9030. return XMVectorDivide( Result, W );
  9031. }
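// Example (illustrative sketch, not part of the library): projecting a view-space point
// with a perspective matrix. Unlike XMVector3Transform, the result is divided by its
// w component, so the returned x/y/z are the perspective-divided coordinates and w
// is 1. Names are hypothetical.
//
//     XMMATRIX proj = XMMatrixPerspectiveFovLH(XM_PIDIV4, 16.0f / 9.0f, 0.1f, 100.0f);
//     XMVECTOR viewPos = XMVectorSet(0.0f, 1.0f, 10.0f, 0.0f);
//     XMVECTOR projected = XMVector3TransformCoord(viewPos, proj);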
  9032. //------------------------------------------------------------------------------
  9033. #ifdef _PREFAST_
  9034. #pragma prefast(push)
  9035. #pragma prefast(disable : 26015 26019, "PREfast noise: Esp:1307" )
  9036. #endif
  9037. _Use_decl_annotations_
  9038. inline XMFLOAT3* XM_CALLCONV XMVector3TransformCoordStream
  9039. (
  9040. XMFLOAT3* pOutputStream,
  9041. size_t OutputStride,
  9042. const XMFLOAT3* pInputStream,
  9043. size_t InputStride,
  9044. size_t VectorCount,
  9045. FXMMATRIX M
  9046. )
  9047. {
  9048. assert(pOutputStream != nullptr);
  9049. assert(pInputStream != nullptr);
  9050. assert(InputStride >= sizeof(XMFLOAT3));
  9051. _Analysis_assume_(InputStride >= sizeof(XMFLOAT3));
  9052. assert(OutputStride >= sizeof(XMFLOAT3));
  9053. _Analysis_assume_(OutputStride >= sizeof(XMFLOAT3));
  9054. #if defined(_XM_NO_INTRINSICS_)
  9055. const uint8_t* pInputVector = (const uint8_t*)pInputStream;
  9056. uint8_t* pOutputVector = (uint8_t*)pOutputStream;
  9057. const XMVECTOR row0 = M.r[0];
  9058. const XMVECTOR row1 = M.r[1];
  9059. const XMVECTOR row2 = M.r[2];
  9060. const XMVECTOR row3 = M.r[3];
  9061. for (size_t i = 0; i < VectorCount; i++)
  9062. {
  9063. XMVECTOR V = XMLoadFloat3((const XMFLOAT3*)pInputVector);
  9064. XMVECTOR Z = XMVectorSplatZ(V);
  9065. XMVECTOR Y = XMVectorSplatY(V);
  9066. XMVECTOR X = XMVectorSplatX(V);
  9067. XMVECTOR Result = XMVectorMultiplyAdd(Z, row2, row3);
  9068. Result = XMVectorMultiplyAdd(Y, row1, Result);
  9069. Result = XMVectorMultiplyAdd(X, row0, Result);
  9070. XMVECTOR W = XMVectorSplatW(Result);
  9071. Result = XMVectorDivide(Result, W);
  9072. XMStoreFloat3((XMFLOAT3*)pOutputVector, Result);
  9073. pInputVector += InputStride;
  9074. pOutputVector += OutputStride;
  9075. }
  9076. return pOutputStream;
  9077. #elif defined(_XM_ARM_NEON_INTRINSICS_)
  9078. const uint8_t* pInputVector = (const uint8_t*)pInputStream;
  9079. uint8_t* pOutputVector = (uint8_t*)pOutputStream;
  9080. const XMVECTOR row0 = M.r[0];
  9081. const XMVECTOR row1 = M.r[1];
  9082. const XMVECTOR row2 = M.r[2];
  9083. const XMVECTOR row3 = M.r[3];
  9084. size_t i = 0;
  9085. size_t four = VectorCount >> 2;
  9086. if ( four > 0 )
  9087. {
  9088. if ((InputStride == sizeof(XMFLOAT3)) && (OutputStride == sizeof(XMFLOAT3)))
  9089. {
  9090. for (size_t j = 0; j < four; ++j)
  9091. {
  9092. float32x4x3_t V = vld3q_f32( reinterpret_cast<const float*>(pInputVector) );
  9093. pInputVector += sizeof(XMFLOAT3)*4;
  9094. float32x2_t r3 = vget_low_f32( row3 );
  9095. float32x2_t r = vget_low_f32( row0 );
  9096. XMVECTOR vResult0 = vmlaq_lane_f32( vdupq_lane_f32( r3, 0 ), V.val[0], r, 0 ); // Ax+M
  9097. XMVECTOR vResult1 = vmlaq_lane_f32( vdupq_lane_f32( r3, 1 ), V.val[0], r, 1 ); // Bx+N
  9098. __prefetch( pInputVector );
  9099. r3 = vget_high_f32( row3 );
  9100. r = vget_high_f32( row0 );
  9101. XMVECTOR vResult2 = vmlaq_lane_f32( vdupq_lane_f32( r3, 0 ), V.val[0], r, 0 ); // Cx+O
  9102. XMVECTOR W = vmlaq_lane_f32( vdupq_lane_f32( r3, 1 ), V.val[0], r, 1 ); // Dx+P
  9103. __prefetch( pInputVector+XM_CACHE_LINE_SIZE );
  9104. r = vget_low_f32( row1 );
  9105. vResult0 = vmlaq_lane_f32( vResult0, V.val[1], r, 0 ); // Ax+Ey+M
  9106. vResult1 = vmlaq_lane_f32( vResult1, V.val[1], r, 1 ); // Bx+Fy+N
  9107. __prefetch( pInputVector+(XM_CACHE_LINE_SIZE*2) );
  9108. r = vget_high_f32( row1 );
  9109. vResult2 = vmlaq_lane_f32( vResult2, V.val[1], r, 0 ); // Cx+Gy+O
  9110. W = vmlaq_lane_f32( W, V.val[1], r, 1 ); // Dx+Hy+P
  9111. __prefetch( pInputVector+(XM_CACHE_LINE_SIZE*3) );
  9112. r = vget_low_f32( row2 );
  9113. vResult0 = vmlaq_lane_f32( vResult0, V.val[2], r, 0 ); // Ax+Ey+Iz+M
  9114. vResult1 = vmlaq_lane_f32( vResult1, V.val[2], r, 1 ); // Bx+Fy+Jz+N
  9115. __prefetch( pInputVector+(XM_CACHE_LINE_SIZE*4) );
  9116. r = vget_high_f32( row2 );
  9117. vResult2 = vmlaq_lane_f32( vResult2, V.val[2], r, 0 ); // Cx+Gy+Kz+O
  9118. W = vmlaq_lane_f32( W, V.val[2], r, 1 ); // Dx+Hy+Lz+P
  9119. __prefetch( pInputVector+(XM_CACHE_LINE_SIZE*5) );
  9120. #if defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64)
  9121. V.val[0] = vdivq_f32( vResult0, W );
  9122. V.val[1] = vdivq_f32( vResult1, W );
  9123. V.val[2] = vdivq_f32( vResult2, W );
  9124. #else
  9125. // 2 iterations of Newton-Raphson refinement of reciprocal
  9126. float32x4_t Reciprocal = vrecpeq_f32(W);
  9127. float32x4_t S = vrecpsq_f32( Reciprocal, W );
  9128. Reciprocal = vmulq_f32( S, Reciprocal );
  9129. S = vrecpsq_f32( Reciprocal, W );
  9130. Reciprocal = vmulq_f32( S, Reciprocal );
  9131. V.val[0] = vmulq_f32( vResult0, Reciprocal );
  9132. V.val[1] = vmulq_f32( vResult1, Reciprocal );
  9133. V.val[2] = vmulq_f32( vResult2, Reciprocal );
  9134. #endif
  9135. vst3q_f32( reinterpret_cast<float*>(pOutputVector),V );
  9136. pOutputVector += sizeof(XMFLOAT3)*4;
  9137. i += 4;
  9138. }
  9139. }
  9140. }
  9141. for (; i < VectorCount; i++)
  9142. {
  9143. float32x2_t VL = vld1_f32( reinterpret_cast<const float*>(pInputVector) );
  9144. float32x2_t zero = vdup_n_f32(0);
  9145. float32x2_t VH = vld1_lane_f32( reinterpret_cast<const float*>(pInputVector)+2, zero, 0 );
  9146. pInputVector += InputStride;
  9147. XMVECTOR vResult = vmlaq_lane_f32( row3, row0, VL, 0 ); // X
  9148. vResult = vmlaq_lane_f32( vResult, row1, VL, 1 ); // Y
  9149. vResult = vmlaq_lane_f32( vResult, row2, VH, 0 ); // Z
  9150. VH = vget_high_f32(vResult);
  9151. XMVECTOR W = vdupq_lane_f32( VH, 1 );
  9152. #if defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64)
  9153. vResult = vdivq_f32( vResult, W );
  9154. #else
  9155. // 2 iterations of Newton-Raphson refinement of reciprocal for W
  9156. float32x4_t Reciprocal = vrecpeq_f32( W );
  9157. float32x4_t S = vrecpsq_f32( Reciprocal, W );
  9158. Reciprocal = vmulq_f32( S, Reciprocal );
  9159. S = vrecpsq_f32( Reciprocal, W );
  9160. Reciprocal = vmulq_f32( S, Reciprocal );
  9161. vResult = vmulq_f32( vResult, Reciprocal );
  9162. #endif
  9163. VL = vget_low_f32( vResult );
  9164. vst1_f32( reinterpret_cast<float*>(pOutputVector), VL );
  9165. vst1q_lane_f32( reinterpret_cast<float*>(pOutputVector)+2, vResult, 2 );
  9166. pOutputVector += OutputStride;
  9167. }
  9168. return pOutputStream;
  9169. #elif defined(_XM_SSE_INTRINSICS_)
  9170. const uint8_t* pInputVector = (const uint8_t*)pInputStream;
  9171. uint8_t* pOutputVector = (uint8_t*)pOutputStream;
  9172. const XMVECTOR row0 = M.r[0];
  9173. const XMVECTOR row1 = M.r[1];
  9174. const XMVECTOR row2 = M.r[2];
  9175. const XMVECTOR row3 = M.r[3];
  9176. size_t i = 0;
  9177. size_t four = VectorCount >> 2;
  9178. if ( four > 0 )
  9179. {
  9180. if (InputStride == sizeof(XMFLOAT3))
  9181. {
  9182. if (OutputStride == sizeof(XMFLOAT3))
  9183. {
  9184. if ( !((uintptr_t)pOutputStream & 0xF) )
  9185. {
  9186. // Packed input, aligned & packed output
  9187. for (size_t j = 0; j < four; ++j)
  9188. {
  9189. __m128 V1 = _mm_loadu_ps( reinterpret_cast<const float*>(pInputVector) );
  9190. __m128 L2 = _mm_loadu_ps( reinterpret_cast<const float*>(pInputVector+16) );
  9191. __m128 L3 = _mm_loadu_ps( reinterpret_cast<const float*>(pInputVector+32) );
  9192. pInputVector += sizeof(XMFLOAT3)*4;
  9193. // Unpack the 4 vectors (.w components are junk)
  9194. XM3UNPACK3INTO4(V1,L2,L3);
  9195. // Result 1
  9196. XMVECTOR Z = XM_PERMUTE_PS( V1, _MM_SHUFFLE(2, 2, 2, 2) );
  9197. XMVECTOR Y = XM_PERMUTE_PS( V1, _MM_SHUFFLE(1, 1, 1, 1) );
  9198. XMVECTOR X = XM_PERMUTE_PS( V1, _MM_SHUFFLE(0, 0, 0, 0) );
  9199. XMVECTOR vTemp = _mm_mul_ps( Z, row2 );
  9200. XMVECTOR vTemp2 = _mm_mul_ps( Y, row1 );
  9201. XMVECTOR vTemp3 = _mm_mul_ps( X, row0 );
  9202. vTemp = _mm_add_ps( vTemp, row3 );
  9203. vTemp = _mm_add_ps( vTemp, vTemp2 );
  9204. vTemp = _mm_add_ps( vTemp, vTemp3 );
  9205. XMVECTOR W = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(3, 3, 3, 3) );
  9206. V1 = _mm_div_ps( vTemp, W );
  9207. // Result 2
  9208. Z = XM_PERMUTE_PS( V2, _MM_SHUFFLE(2, 2, 2, 2) );
  9209. Y = XM_PERMUTE_PS( V2, _MM_SHUFFLE(1, 1, 1, 1) );
  9210. X = XM_PERMUTE_PS( V2, _MM_SHUFFLE(0, 0, 0, 0) );
  9211. vTemp = _mm_mul_ps( Z, row2 );
  9212. vTemp2 = _mm_mul_ps( Y, row1 );
  9213. vTemp3 = _mm_mul_ps( X, row0 );
  9214. vTemp = _mm_add_ps( vTemp, row3 );
  9215. vTemp = _mm_add_ps( vTemp, vTemp2 );
  9216. vTemp = _mm_add_ps( vTemp, vTemp3 );
  9217. W = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(3, 3, 3, 3) );
  9218. V2 = _mm_div_ps( vTemp, W );
  9219. // Result 3
  9220. Z = XM_PERMUTE_PS( V3, _MM_SHUFFLE(2, 2, 2, 2) );
  9221. Y = XM_PERMUTE_PS( V3, _MM_SHUFFLE(1, 1, 1, 1) );
  9222. X = XM_PERMUTE_PS( V3, _MM_SHUFFLE(0, 0, 0, 0) );
  9223. vTemp = _mm_mul_ps( Z, row2 );
  9224. vTemp2 = _mm_mul_ps( Y, row1 );
  9225. vTemp3 = _mm_mul_ps( X, row0 );
  9226. vTemp = _mm_add_ps( vTemp, row3 );
  9227. vTemp = _mm_add_ps( vTemp, vTemp2 );
  9228. vTemp = _mm_add_ps( vTemp, vTemp3 );
  9229. W = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(3, 3, 3, 3) );
  9230. V3 = _mm_div_ps( vTemp, W );
  9231. // Result 4
  9232. Z = XM_PERMUTE_PS( V4, _MM_SHUFFLE(2, 2, 2, 2) );
  9233. Y = XM_PERMUTE_PS( V4, _MM_SHUFFLE(1, 1, 1, 1) );
  9234. X = XM_PERMUTE_PS( V4, _MM_SHUFFLE(0, 0, 0, 0) );
  9235. vTemp = _mm_mul_ps( Z, row2 );
  9236. vTemp2 = _mm_mul_ps( Y, row1 );
  9237. vTemp3 = _mm_mul_ps( X, row0 );
  9238. vTemp = _mm_add_ps( vTemp, row3 );
  9239. vTemp = _mm_add_ps( vTemp, vTemp2 );
  9240. vTemp = _mm_add_ps( vTemp, vTemp3 );
  9241. W = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(3, 3, 3, 3) );
  9242. V4 = _mm_div_ps( vTemp, W );
  9243. // Pack and store the vectors
  9244. XM3PACK4INTO3(vTemp);
  9245. XM_STREAM_PS( reinterpret_cast<float*>(pOutputVector), V1 );
  9246. XM_STREAM_PS( reinterpret_cast<float*>(pOutputVector+16), vTemp );
  9247. XM_STREAM_PS( reinterpret_cast<float*>(pOutputVector+32), V3 );
  9248. pOutputVector += sizeof(XMFLOAT3)*4;
  9249. i += 4;
  9250. }
  9251. }
  9252. else
  9253. {
  9254. // Packed input, unaligned & packed output
  9255. for (size_t j = 0; j < four; ++j)
  9256. {
  9257. __m128 V1 = _mm_loadu_ps( reinterpret_cast<const float*>(pInputVector) );
  9258. __m128 L2 = _mm_loadu_ps( reinterpret_cast<const float*>(pInputVector+16) );
  9259. __m128 L3 = _mm_loadu_ps( reinterpret_cast<const float*>(pInputVector+32) );
  9260. pInputVector += sizeof(XMFLOAT3)*4;
  9261. // Unpack the 4 vectors (.w components are junk)
  9262. XM3UNPACK3INTO4(V1,L2,L3);
  9263. // Result 1
  9264. XMVECTOR Z = XM_PERMUTE_PS( V1, _MM_SHUFFLE(2, 2, 2, 2) );
  9265. XMVECTOR Y = XM_PERMUTE_PS( V1, _MM_SHUFFLE(1, 1, 1, 1) );
  9266. XMVECTOR X = XM_PERMUTE_PS( V1, _MM_SHUFFLE(0, 0, 0, 0) );
  9267. XMVECTOR vTemp = _mm_mul_ps( Z, row2 );
  9268. XMVECTOR vTemp2 = _mm_mul_ps( Y, row1 );
  9269. XMVECTOR vTemp3 = _mm_mul_ps( X, row0 );
  9270. vTemp = _mm_add_ps( vTemp, row3 );
  9271. vTemp = _mm_add_ps( vTemp, vTemp2 );
  9272. vTemp = _mm_add_ps( vTemp, vTemp3 );
  9273. XMVECTOR W = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(3, 3, 3, 3) );
  9274. V1 = _mm_div_ps( vTemp, W );
  9275. // Result 2
  9276. Z = XM_PERMUTE_PS( V2, _MM_SHUFFLE(2, 2, 2, 2) );
  9277. Y = XM_PERMUTE_PS( V2, _MM_SHUFFLE(1, 1, 1, 1) );
  9278. X = XM_PERMUTE_PS( V2, _MM_SHUFFLE(0, 0, 0, 0) );
  9279. vTemp = _mm_mul_ps( Z, row2 );
  9280. vTemp2 = _mm_mul_ps( Y, row1 );
  9281. vTemp3 = _mm_mul_ps( X, row0 );
  9282. vTemp = _mm_add_ps( vTemp, row3 );
  9283. vTemp = _mm_add_ps( vTemp, vTemp2 );
  9284. vTemp = _mm_add_ps( vTemp, vTemp3 );
  9285. W = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(3, 3, 3, 3) );
  9286. V2 = _mm_div_ps( vTemp, W );
  9287. // Result 3
  9288. Z = XM_PERMUTE_PS( V3, _MM_SHUFFLE(2, 2, 2, 2) );
  9289. Y = XM_PERMUTE_PS( V3, _MM_SHUFFLE(1, 1, 1, 1) );
  9290. X = XM_PERMUTE_PS( V3, _MM_SHUFFLE(0, 0, 0, 0) );
  9291. vTemp = _mm_mul_ps( Z, row2 );
  9292. vTemp2 = _mm_mul_ps( Y, row1 );
  9293. vTemp3 = _mm_mul_ps( X, row0 );
  9294. vTemp = _mm_add_ps( vTemp, row3 );
  9295. vTemp = _mm_add_ps( vTemp, vTemp2 );
  9296. vTemp = _mm_add_ps( vTemp, vTemp3 );
  9297. W = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(3, 3, 3, 3) );
  9298. V3 = _mm_div_ps( vTemp, W );
  9299. // Result 4
  9300. Z = XM_PERMUTE_PS( V4, _MM_SHUFFLE(2, 2, 2, 2) );
  9301. Y = XM_PERMUTE_PS( V4, _MM_SHUFFLE(1, 1, 1, 1) );
  9302. X = XM_PERMUTE_PS( V4, _MM_SHUFFLE(0, 0, 0, 0) );
  9303. vTemp = _mm_mul_ps( Z, row2 );
  9304. vTemp2 = _mm_mul_ps( Y, row1 );
  9305. vTemp3 = _mm_mul_ps( X, row0 );
  9306. vTemp = _mm_add_ps( vTemp, row3 );
  9307. vTemp = _mm_add_ps( vTemp, vTemp2 );
  9308. vTemp = _mm_add_ps( vTemp, vTemp3 );
  9309. W = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(3, 3, 3, 3) );
  9310. V4 = _mm_div_ps( vTemp, W );
  9311. // Pack and store the vectors
  9312. XM3PACK4INTO3(vTemp);
  9313. _mm_storeu_ps( reinterpret_cast<float*>(pOutputVector), V1 );
  9314. _mm_storeu_ps( reinterpret_cast<float*>(pOutputVector+16), vTemp );
  9315. _mm_storeu_ps( reinterpret_cast<float*>(pOutputVector+32), V3 );
  9316. pOutputVector += sizeof(XMFLOAT3)*4;
  9317. i += 4;
  9318. }
  9319. }
  9320. }
  9321. else
  9322. {
  9323. // Packed input, unpacked output
  9324. for (size_t j = 0; j < four; ++j)
  9325. {
  9326. __m128 V1 = _mm_loadu_ps( reinterpret_cast<const float*>(pInputVector) );
  9327. __m128 L2 = _mm_loadu_ps( reinterpret_cast<const float*>(pInputVector+16) );
  9328. __m128 L3 = _mm_loadu_ps( reinterpret_cast<const float*>(pInputVector+32) );
  9329. pInputVector += sizeof(XMFLOAT3)*4;
  9330. // Unpack the 4 vectors (.w components are junk)
  9331. XM3UNPACK3INTO4(V1,L2,L3);
  9332. // Result 1
  9333. XMVECTOR Z = XM_PERMUTE_PS( V1, _MM_SHUFFLE(2, 2, 2, 2) );
  9334. XMVECTOR Y = XM_PERMUTE_PS( V1, _MM_SHUFFLE(1, 1, 1, 1) );
  9335. XMVECTOR X = XM_PERMUTE_PS( V1, _MM_SHUFFLE(0, 0, 0, 0) );
  9336. XMVECTOR vTemp = _mm_mul_ps( Z, row2 );
  9337. XMVECTOR vTemp2 = _mm_mul_ps( Y, row1 );
  9338. XMVECTOR vTemp3 = _mm_mul_ps( X, row0 );
  9339. vTemp = _mm_add_ps( vTemp, row3 );
  9340. vTemp = _mm_add_ps( vTemp, vTemp2 );
  9341. vTemp = _mm_add_ps( vTemp, vTemp3 );
  9342. XMVECTOR W = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(3, 3, 3, 3) );
  9343. vTemp = _mm_div_ps( vTemp, W );
  9344. XMStoreFloat3(reinterpret_cast<XMFLOAT3*>(pOutputVector), vTemp);
  9345. pOutputVector += OutputStride;
  9346. // Result 2
  9347. Z = XM_PERMUTE_PS( V2, _MM_SHUFFLE(2, 2, 2, 2) );
  9348. Y = XM_PERMUTE_PS( V2, _MM_SHUFFLE(1, 1, 1, 1) );
  9349. X = XM_PERMUTE_PS( V2, _MM_SHUFFLE(0, 0, 0, 0) );
  9350. vTemp = _mm_mul_ps( Z, row2 );
  9351. vTemp2 = _mm_mul_ps( Y, row1 );
  9352. vTemp3 = _mm_mul_ps( X, row0 );
  9353. vTemp = _mm_add_ps( vTemp, row3 );
  9354. vTemp = _mm_add_ps( vTemp, vTemp2 );
  9355. vTemp = _mm_add_ps( vTemp, vTemp3 );
  9356. W = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(3, 3, 3, 3) );
  9357. vTemp = _mm_div_ps( vTemp, W );
  9358. XMStoreFloat3(reinterpret_cast<XMFLOAT3*>(pOutputVector), vTemp);
  9359. pOutputVector += OutputStride;
  9360. // Result 3
  9361. Z = XM_PERMUTE_PS( V3, _MM_SHUFFLE(2, 2, 2, 2) );
  9362. Y = XM_PERMUTE_PS( V3, _MM_SHUFFLE(1, 1, 1, 1) );
  9363. X = XM_PERMUTE_PS( V3, _MM_SHUFFLE(0, 0, 0, 0) );
  9364. vTemp = _mm_mul_ps( Z, row2 );
  9365. vTemp2 = _mm_mul_ps( Y, row1 );
  9366. vTemp3 = _mm_mul_ps( X, row0 );
  9367. vTemp = _mm_add_ps( vTemp, row3 );
  9368. vTemp = _mm_add_ps( vTemp, vTemp2 );
  9369. vTemp = _mm_add_ps( vTemp, vTemp3 );
  9370. W = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(3, 3, 3, 3) );
  9371. vTemp = _mm_div_ps( vTemp, W );
  9372. XMStoreFloat3(reinterpret_cast<XMFLOAT3*>(pOutputVector), vTemp);
  9373. pOutputVector += OutputStride;
  9374. // Result 4
  9375. Z = XM_PERMUTE_PS( V4, _MM_SHUFFLE(2, 2, 2, 2) );
  9376. Y = XM_PERMUTE_PS( V4, _MM_SHUFFLE(1, 1, 1, 1) );
  9377. X = XM_PERMUTE_PS( V4, _MM_SHUFFLE(0, 0, 0, 0) );
  9378. vTemp = _mm_mul_ps( Z, row2 );
  9379. vTemp2 = _mm_mul_ps( Y, row1 );
  9380. vTemp3 = _mm_mul_ps( X, row0 );
  9381. vTemp = _mm_add_ps( vTemp, row3 );
  9382. vTemp = _mm_add_ps( vTemp, vTemp2 );
  9383. vTemp = _mm_add_ps( vTemp, vTemp3 );
  9384. W = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(3, 3, 3, 3) );
  9385. vTemp = _mm_div_ps( vTemp, W );
  9386. XMStoreFloat3(reinterpret_cast<XMFLOAT3*>(pOutputVector), vTemp);
  9387. pOutputVector += OutputStride;
  9388. i += 4;
  9389. }
  9390. }
  9391. }
  9392. }
  9393. for (; i < VectorCount; i++)
  9394. {
  9395. XMVECTOR V = XMLoadFloat3(reinterpret_cast<const XMFLOAT3*>(pInputVector));
  9396. pInputVector += InputStride;
  9397. XMVECTOR Z = XM_PERMUTE_PS( V, _MM_SHUFFLE(2, 2, 2, 2) );
  9398. XMVECTOR Y = XM_PERMUTE_PS( V, _MM_SHUFFLE(1, 1, 1, 1) );
  9399. XMVECTOR X = XM_PERMUTE_PS( V, _MM_SHUFFLE(0, 0, 0, 0) );
  9400. XMVECTOR vTemp = _mm_mul_ps( Z, row2 );
  9401. XMVECTOR vTemp2 = _mm_mul_ps( Y, row1 );
  9402. XMVECTOR vTemp3 = _mm_mul_ps( X, row0 );
  9403. vTemp = _mm_add_ps( vTemp, row3 );
  9404. vTemp = _mm_add_ps( vTemp, vTemp2 );
  9405. vTemp = _mm_add_ps( vTemp, vTemp3 );
  9406. XMVECTOR W = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(3, 3, 3, 3) );
  9407. vTemp = _mm_div_ps( vTemp, W );
  9408. XMStoreFloat3(reinterpret_cast<XMFLOAT3*>(pOutputVector), vTemp);
  9409. pOutputVector += OutputStride;
  9410. }
  9411. XM_SFENCE();
  9412. return pOutputStream;
  9413. #endif
  9414. }
  9415. #ifdef _PREFAST_
  9416. #pragma prefast(pop)
  9417. #endif
  9418. //------------------------------------------------------------------------------
  9419. inline XMVECTOR XM_CALLCONV XMVector3TransformNormal
  9420. (
  9421. FXMVECTOR V,
  9422. FXMMATRIX M
  9423. )
  9424. {
  9425. #if defined(_XM_NO_INTRINSICS_)
  9426. XMVECTOR Z = XMVectorSplatZ(V);
  9427. XMVECTOR Y = XMVectorSplatY(V);
  9428. XMVECTOR X = XMVectorSplatX(V);
  9429. XMVECTOR Result = XMVectorMultiply(Z, M.r[2]);
  9430. Result = XMVectorMultiplyAdd(Y, M.r[1], Result);
  9431. Result = XMVectorMultiplyAdd(X, M.r[0], Result);
  9432. return Result;
  9433. #elif defined(_XM_ARM_NEON_INTRINSICS_)
  9434. float32x2_t VL = vget_low_f32( V );
  9435. XMVECTOR vResult = vmulq_lane_f32( M.r[0], VL, 0 ); // X
  9436. vResult = vmlaq_lane_f32( vResult, M.r[1], VL, 1 ); // Y
  9437. return vmlaq_lane_f32( vResult, M.r[2], vget_high_f32( V ), 0 ); // Z
  9438. #elif defined(_XM_SSE_INTRINSICS_)
  9439. XMVECTOR vResult = XM_PERMUTE_PS(V,_MM_SHUFFLE(0,0,0,0));
  9440. vResult = _mm_mul_ps(vResult,M.r[0]);
  9441. XMVECTOR vTemp = XM_PERMUTE_PS(V,_MM_SHUFFLE(1,1,1,1));
  9442. vTemp = _mm_mul_ps(vTemp,M.r[1]);
  9443. vResult = _mm_add_ps(vResult,vTemp);
  9444. vTemp = XM_PERMUTE_PS(V,_MM_SHUFFLE(2,2,2,2));
  9445. vTemp = _mm_mul_ps(vTemp,M.r[2]);
  9446. vResult = _mm_add_ps(vResult,vTemp);
  9447. return vResult;
  9448. #endif
  9449. }
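// Example (illustrative sketch, not part of the library): transforming a direction
// rather than a position. Only the first three rows of M are used, so the translation
// row has no effect; for matrices with non-uniform scale a normal should instead be
// transformed by the inverse-transpose and re-normalized. Names are hypothetical.
//
//     XMMATRIX world = XMMatrixMultiply(XMMatrixRotationY(XM_PIDIV2),
//                                       XMMatrixTranslation(5.0f, 0.0f, 0.0f));
//     XMVECTOR dir = XMVectorSet(1.0f, 0.0f, 0.0f, 0.0f);
//     XMVECTOR rotated = XMVector3TransformNormal(dir, world);  // translation ignored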
  9450. //------------------------------------------------------------------------------
  9451. #ifdef _PREFAST_
  9452. #pragma prefast(push)
  9453. #pragma prefast(disable : 26015 26019, "PREfast noise: Esp:1307" )
  9454. #endif
  9455. _Use_decl_annotations_
  9456. inline XMFLOAT3* XM_CALLCONV XMVector3TransformNormalStream
  9457. (
  9458. XMFLOAT3* pOutputStream,
  9459. size_t OutputStride,
  9460. const XMFLOAT3* pInputStream,
  9461. size_t InputStride,
  9462. size_t VectorCount,
  9463. FXMMATRIX M
  9464. )
  9465. {
  9466. assert(pOutputStream != nullptr);
  9467. assert(pInputStream != nullptr);
  9468. assert(InputStride >= sizeof(XMFLOAT3));
  9469. _Analysis_assume_(InputStride >= sizeof(XMFLOAT3));
  9470. assert(OutputStride >= sizeof(XMFLOAT3));
  9471. _Analysis_assume_(OutputStride >= sizeof(XMFLOAT3));
  9472. #if defined(_XM_NO_INTRINSICS_)
  9473. const uint8_t* pInputVector = (const uint8_t*)pInputStream;
  9474. uint8_t* pOutputVector = (uint8_t*)pOutputStream;
  9475. const XMVECTOR row0 = M.r[0];
  9476. const XMVECTOR row1 = M.r[1];
  9477. const XMVECTOR row2 = M.r[2];
  9478. for (size_t i = 0; i < VectorCount; i++)
  9479. {
  9480. XMVECTOR V = XMLoadFloat3((const XMFLOAT3*)pInputVector);
  9481. XMVECTOR Z = XMVectorSplatZ(V);
  9482. XMVECTOR Y = XMVectorSplatY(V);
  9483. XMVECTOR X = XMVectorSplatX(V);
  9484. XMVECTOR Result = XMVectorMultiply(Z, row2);
  9485. Result = XMVectorMultiplyAdd(Y, row1, Result);
  9486. Result = XMVectorMultiplyAdd(X, row0, Result);
  9487. XMStoreFloat3((XMFLOAT3*)pOutputVector, Result);
  9488. pInputVector += InputStride;
  9489. pOutputVector += OutputStride;
  9490. }
  9491. return pOutputStream;
  9492. #elif defined(_XM_ARM_NEON_INTRINSICS_)
  9493. const uint8_t* pInputVector = (const uint8_t*)pInputStream;
  9494. uint8_t* pOutputVector = (uint8_t*)pOutputStream;
  9495. const XMVECTOR row0 = M.r[0];
  9496. const XMVECTOR row1 = M.r[1];
  9497. const XMVECTOR row2 = M.r[2];
  9498. size_t i = 0;
  9499. size_t four = VectorCount >> 2;
  9500. if ( four > 0 )
  9501. {
  9502. if ((InputStride == sizeof(XMFLOAT3)) && (OutputStride == sizeof(XMFLOAT3)))
  9503. {
  9504. for (size_t j = 0; j < four; ++j)
  9505. {
  9506. float32x4x3_t V = vld3q_f32( reinterpret_cast<const float*>(pInputVector) );
  9507. pInputVector += sizeof(XMFLOAT3)*4;
  9508. float32x2_t r = vget_low_f32( row0 );
  9509. XMVECTOR vResult0 = vmulq_lane_f32( V.val[0], r, 0 ); // Ax
  9510. XMVECTOR vResult1 = vmulq_lane_f32( V.val[0], r, 1 ); // Bx
  9511. __prefetch( pInputVector );
  9512. r = vget_high_f32( row0 );
  9513. XMVECTOR vResult2 = vmulq_lane_f32( V.val[0], r, 0 ); // Cx
  9514. __prefetch( pInputVector+XM_CACHE_LINE_SIZE );
  9515. r = vget_low_f32( row1 );
  9516. vResult0 = vmlaq_lane_f32( vResult0, V.val[1], r, 0 ); // Ax+Ey
  9517. vResult1 = vmlaq_lane_f32( vResult1, V.val[1], r, 1 ); // Bx+Fy
  9518. __prefetch( pInputVector+(XM_CACHE_LINE_SIZE*2) );
  9519. r = vget_high_f32( row1 );
  9520. vResult2 = vmlaq_lane_f32( vResult2, V.val[1], r, 0 ); // Cx+Gy
  9521. __prefetch( pInputVector+(XM_CACHE_LINE_SIZE*3) );
  9522. r = vget_low_f32( row2 );
  9523. vResult0 = vmlaq_lane_f32( vResult0, V.val[2], r, 0 ); // Ax+Ey+Iz
  9524. vResult1 = vmlaq_lane_f32( vResult1, V.val[2], r, 1 ); // Bx+Fy+Jz
  9525. __prefetch( pInputVector+(XM_CACHE_LINE_SIZE*4) );
  9526. r = vget_high_f32( row2 );
  9527. vResult2 = vmlaq_lane_f32( vResult2, V.val[2], r, 0 ); // Cx+Gy+Kz
  9528. __prefetch( pInputVector+(XM_CACHE_LINE_SIZE*5) );
  9529. V.val[0] = vResult0;
  9530. V.val[1] = vResult1;
  9531. V.val[2] = vResult2;
  9532. vst3q_f32( reinterpret_cast<float*>(pOutputVector), V );
  9533. pOutputVector += sizeof(XMFLOAT3)*4;
  9534. i += 4;
  9535. }
  9536. }
  9537. }
  9538. for (; i < VectorCount; i++)
  9539. {
  9540. float32x2_t VL = vld1_f32( reinterpret_cast<const float*>(pInputVector) );
  9541. float32x2_t zero = vdup_n_f32(0);
  9542. float32x2_t VH = vld1_lane_f32( reinterpret_cast<const float*>(pInputVector)+2, zero, 0 );
  9543. pInputVector += InputStride;
  9544. XMVECTOR vResult = vmulq_lane_f32( row0, VL, 0 ); // X
  9545. vResult = vmlaq_lane_f32( vResult, row1, VL, 1 ); // Y
  9546. vResult = vmlaq_lane_f32( vResult, row2, VH, 0 ); // Z
  9547. VL = vget_low_f32( vResult );
  9548. vst1_f32( reinterpret_cast<float*>(pOutputVector), VL );
  9549. vst1q_lane_f32( reinterpret_cast<float*>(pOutputVector)+2, vResult, 2 );
  9550. pOutputVector += OutputStride;
  9551. }
  9552. return pOutputStream;
  9553. #elif defined(_XM_SSE_INTRINSICS_)
  9554. const uint8_t* pInputVector = (const uint8_t*)pInputStream;
  9555. uint8_t* pOutputVector = (uint8_t*)pOutputStream;
  9556. const XMVECTOR row0 = M.r[0];
  9557. const XMVECTOR row1 = M.r[1];
  9558. const XMVECTOR row2 = M.r[2];
  9559. size_t i = 0;
  9560. size_t four = VectorCount >> 2;
  9561. if ( four > 0 )
  9562. {
  9563. if (InputStride == sizeof(XMFLOAT3))
  9564. {
  9565. if (OutputStride == sizeof(XMFLOAT3))
  9566. {
  9567. if ( !((uintptr_t)pOutputStream & 0xF) )
  9568. {
  9569. // Packed input, aligned & packed output
  9570. for (size_t j = 0; j < four; ++j)
  9571. {
  9572. __m128 V1 = _mm_loadu_ps( reinterpret_cast<const float*>(pInputVector) );
  9573. __m128 L2 = _mm_loadu_ps( reinterpret_cast<const float*>(pInputVector+16) );
  9574. __m128 L3 = _mm_loadu_ps( reinterpret_cast<const float*>(pInputVector+32) );
  9575. pInputVector += sizeof(XMFLOAT3)*4;
  9576. // Unpack the 4 vectors (.w components are junk)
  9577. XM3UNPACK3INTO4(V1,L2,L3);
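// (XM3UNPACK3INTO4 also declares V2, V3 and V4, which the code below consumes.)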
  9578. // Result 1
  9579. XMVECTOR Z = XM_PERMUTE_PS( V1, _MM_SHUFFLE(2, 2, 2, 2) );
  9580. XMVECTOR Y = XM_PERMUTE_PS( V1, _MM_SHUFFLE(1, 1, 1, 1) );
  9581. XMVECTOR X = XM_PERMUTE_PS( V1, _MM_SHUFFLE(0, 0, 0, 0) );
  9582. XMVECTOR vTemp = _mm_mul_ps( Z, row2 );
  9583. XMVECTOR vTemp2 = _mm_mul_ps( Y, row1 );
  9584. XMVECTOR vTemp3 = _mm_mul_ps( X, row0 );
  9585. vTemp = _mm_add_ps( vTemp, vTemp2 );
  9586. V1 = _mm_add_ps( vTemp, vTemp3 );
  9587. // Result 2
  9588. Z = XM_PERMUTE_PS( V2, _MM_SHUFFLE(2, 2, 2, 2) );
  9589. Y = XM_PERMUTE_PS( V2, _MM_SHUFFLE(1, 1, 1, 1) );
  9590. X = XM_PERMUTE_PS( V2, _MM_SHUFFLE(0, 0, 0, 0) );
  9591. vTemp = _mm_mul_ps( Z, row2 );
  9592. vTemp2 = _mm_mul_ps( Y, row1 );
  9593. vTemp3 = _mm_mul_ps( X, row0 );
  9594. vTemp = _mm_add_ps( vTemp, vTemp2 );
  9595. V2 = _mm_add_ps( vTemp, vTemp3 );
  9596. // Result 3
  9597. Z = XM_PERMUTE_PS( V3, _MM_SHUFFLE(2, 2, 2, 2) );
  9598. Y = XM_PERMUTE_PS( V3, _MM_SHUFFLE(1, 1, 1, 1) );
  9599. X = XM_PERMUTE_PS( V3, _MM_SHUFFLE(0, 0, 0, 0) );
  9600. vTemp = _mm_mul_ps( Z, row2 );
  9601. vTemp2 = _mm_mul_ps( Y, row1 );
  9602. vTemp3 = _mm_mul_ps( X, row0 );
  9603. vTemp = _mm_add_ps( vTemp, vTemp2 );
  9604. V3 = _mm_add_ps( vTemp, vTemp3 );
  9605. // Result 4
  9606. Z = XM_PERMUTE_PS( V4, _MM_SHUFFLE(2, 2, 2, 2) );
  9607. Y = XM_PERMUTE_PS( V4, _MM_SHUFFLE(1, 1, 1, 1) );
  9608. X = XM_PERMUTE_PS( V4, _MM_SHUFFLE(0, 0, 0, 0) );
  9609. vTemp = _mm_mul_ps( Z, row2 );
  9610. vTemp2 = _mm_mul_ps( Y, row1 );
  9611. vTemp3 = _mm_mul_ps( X, row0 );
  9612. vTemp = _mm_add_ps( vTemp, vTemp2 );
  9613. V4 = _mm_add_ps( vTemp, vTemp3 );
  9614. // Pack and store the vectors
  9615. XM3PACK4INTO3(vTemp);
  9616. XM_STREAM_PS( reinterpret_cast<float*>(pOutputVector), V1 );
  9617. XM_STREAM_PS( reinterpret_cast<float*>(pOutputVector+16), vTemp );
  9618. XM_STREAM_PS( reinterpret_cast<float*>(pOutputVector+32), V3 );
  9619. pOutputVector += sizeof(XMFLOAT3)*4;
  9620. i += 4;
  9621. }
  9622. }
  9623. else
  9624. {
  9625. // Packed input, unaligned & packed output
  9626. for (size_t j = 0; j < four; ++j)
  9627. {
  9628. __m128 V1 = _mm_loadu_ps( reinterpret_cast<const float*>(pInputVector) );
  9629. __m128 L2 = _mm_loadu_ps( reinterpret_cast<const float*>(pInputVector+16) );
  9630. __m128 L3 = _mm_loadu_ps( reinterpret_cast<const float*>(pInputVector+32) );
  9631. pInputVector += sizeof(XMFLOAT3)*4;
  9632. // Unpack the 4 vectors (.w components are junk)
  9633. XM3UNPACK3INTO4(V1,L2,L3);
  9634. // Result 1
  9635. XMVECTOR Z = XM_PERMUTE_PS( V1, _MM_SHUFFLE(2, 2, 2, 2) );
  9636. XMVECTOR Y = XM_PERMUTE_PS( V1, _MM_SHUFFLE(1, 1, 1, 1) );
  9637. XMVECTOR X = XM_PERMUTE_PS( V1, _MM_SHUFFLE(0, 0, 0, 0) );
  9638. XMVECTOR vTemp = _mm_mul_ps( Z, row2 );
  9639. XMVECTOR vTemp2 = _mm_mul_ps( Y, row1 );
  9640. XMVECTOR vTemp3 = _mm_mul_ps( X, row0 );
  9641. vTemp = _mm_add_ps( vTemp, vTemp2 );
  9642. V1 = _mm_add_ps( vTemp, vTemp3 );
  9643. // Result 2
  9644. Z = XM_PERMUTE_PS( V2, _MM_SHUFFLE(2, 2, 2, 2) );
  9645. Y = XM_PERMUTE_PS( V2, _MM_SHUFFLE(1, 1, 1, 1) );
  9646. X = XM_PERMUTE_PS( V2, _MM_SHUFFLE(0, 0, 0, 0) );
  9647. vTemp = _mm_mul_ps( Z, row2 );
  9648. vTemp2 = _mm_mul_ps( Y, row1 );
  9649. vTemp3 = _mm_mul_ps( X, row0 );
  9650. vTemp = _mm_add_ps( vTemp, vTemp2 );
  9651. V2 = _mm_add_ps( vTemp, vTemp3 );
  9652. // Result 3
  9653. Z = XM_PERMUTE_PS( V3, _MM_SHUFFLE(2, 2, 2, 2) );
  9654. Y = XM_PERMUTE_PS( V3, _MM_SHUFFLE(1, 1, 1, 1) );
  9655. X = XM_PERMUTE_PS( V3, _MM_SHUFFLE(0, 0, 0, 0) );
  9656. vTemp = _mm_mul_ps( Z, row2 );
  9657. vTemp2 = _mm_mul_ps( Y, row1 );
  9658. vTemp3 = _mm_mul_ps( X, row0 );
  9659. vTemp = _mm_add_ps( vTemp, vTemp2 );
  9660. V3 = _mm_add_ps( vTemp, vTemp3 );
  9661. // Result 4
  9662. Z = XM_PERMUTE_PS( V4, _MM_SHUFFLE(2, 2, 2, 2) );
  9663. Y = XM_PERMUTE_PS( V4, _MM_SHUFFLE(1, 1, 1, 1) );
  9664. X = XM_PERMUTE_PS( V4, _MM_SHUFFLE(0, 0, 0, 0) );
  9665. vTemp = _mm_mul_ps( Z, row2 );
  9666. vTemp2 = _mm_mul_ps( Y, row1 );
  9667. vTemp3 = _mm_mul_ps( X, row0 );
  9668. vTemp = _mm_add_ps( vTemp, vTemp2 );
  9669. V4 = _mm_add_ps( vTemp, vTemp3 );
  9670. // Pack and store the vectors
  9671. XM3PACK4INTO3(vTemp);
  9672. _mm_storeu_ps( reinterpret_cast<float*>(pOutputVector), V1 );
  9673. _mm_storeu_ps( reinterpret_cast<float*>(pOutputVector+16), vTemp );
  9674. _mm_storeu_ps( reinterpret_cast<float*>(pOutputVector+32), V3 );
  9675. pOutputVector += sizeof(XMFLOAT3)*4;
  9676. i += 4;
  9677. }
  9678. }
  9679. }
  9680. else
  9681. {
  9682. // Packed input, unpacked output
  9683. for (size_t j = 0; j < four; ++j)
  9684. {
  9685. __m128 V1 = _mm_loadu_ps( reinterpret_cast<const float*>(pInputVector) );
  9686. __m128 L2 = _mm_loadu_ps( reinterpret_cast<const float*>(pInputVector+16) );
  9687. __m128 L3 = _mm_loadu_ps( reinterpret_cast<const float*>(pInputVector+32) );
  9688. pInputVector += sizeof(XMFLOAT3)*4;
  9689. // Unpack the 4 vectors (.w components are junk)
  9690. XM3UNPACK3INTO4(V1,L2,L3);
  9691. // Result 1
  9692. XMVECTOR Z = XM_PERMUTE_PS( V1, _MM_SHUFFLE(2, 2, 2, 2) );
  9693. XMVECTOR Y = XM_PERMUTE_PS( V1, _MM_SHUFFLE(1, 1, 1, 1) );
  9694. XMVECTOR X = XM_PERMUTE_PS( V1, _MM_SHUFFLE(0, 0, 0, 0) );
  9695. XMVECTOR vTemp = _mm_mul_ps( Z, row2 );
  9696. XMVECTOR vTemp2 = _mm_mul_ps( Y, row1 );
  9697. XMVECTOR vTemp3 = _mm_mul_ps( X, row0 );
  9698. vTemp = _mm_add_ps( vTemp, vTemp2 );
  9699. vTemp = _mm_add_ps( vTemp, vTemp3 );
  9700. XMStoreFloat3(reinterpret_cast<XMFLOAT3*>(pOutputVector), vTemp);
  9701. pOutputVector += OutputStride;
  9702. // Result 2
  9703. Z = XM_PERMUTE_PS( V2, _MM_SHUFFLE(2, 2, 2, 2) );
  9704. Y = XM_PERMUTE_PS( V2, _MM_SHUFFLE(1, 1, 1, 1) );
  9705. X = XM_PERMUTE_PS( V2, _MM_SHUFFLE(0, 0, 0, 0) );
  9706. vTemp = _mm_mul_ps( Z, row2 );
  9707. vTemp2 = _mm_mul_ps( Y, row1 );
  9708. vTemp3 = _mm_mul_ps( X, row0 );
  9709. vTemp = _mm_add_ps( vTemp, vTemp2 );
  9710. vTemp = _mm_add_ps( vTemp, vTemp3 );
  9711. XMStoreFloat3(reinterpret_cast<XMFLOAT3*>(pOutputVector), vTemp);
  9712. pOutputVector += OutputStride;
  9713. // Result 3
  9714. Z = XM_PERMUTE_PS( V3, _MM_SHUFFLE(2, 2, 2, 2) );
  9715. Y = XM_PERMUTE_PS( V3, _MM_SHUFFLE(1, 1, 1, 1) );
  9716. X = XM_PERMUTE_PS( V3, _MM_SHUFFLE(0, 0, 0, 0) );
  9717. vTemp = _mm_mul_ps( Z, row2 );
  9718. vTemp2 = _mm_mul_ps( Y, row1 );
  9719. vTemp3 = _mm_mul_ps( X, row0 );
  9720. vTemp = _mm_add_ps( vTemp, vTemp2 );
  9721. vTemp = _mm_add_ps( vTemp, vTemp3 );
  9722. XMStoreFloat3(reinterpret_cast<XMFLOAT3*>(pOutputVector), vTemp);
  9723. pOutputVector += OutputStride;
  9724. // Result 4
  9725. Z = XM_PERMUTE_PS( V4, _MM_SHUFFLE(2, 2, 2, 2) );
  9726. Y = XM_PERMUTE_PS( V4, _MM_SHUFFLE(1, 1, 1, 1) );
  9727. X = XM_PERMUTE_PS( V4, _MM_SHUFFLE(0, 0, 0, 0) );
  9728. vTemp = _mm_mul_ps( Z, row2 );
  9729. vTemp2 = _mm_mul_ps( Y, row1 );
  9730. vTemp3 = _mm_mul_ps( X, row0 );
  9731. vTemp = _mm_add_ps( vTemp, vTemp2 );
  9732. vTemp = _mm_add_ps( vTemp, vTemp3 );
  9733. XMStoreFloat3(reinterpret_cast<XMFLOAT3*>(pOutputVector), vTemp);
  9734. pOutputVector += OutputStride;
  9735. i += 4;
  9736. }
  9737. }
  9738. }
  9739. }
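// Process any vectors the four-at-a-time paths above did not cover.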
  9740. for (; i < VectorCount; i++)
  9741. {
  9742. XMVECTOR V = XMLoadFloat3(reinterpret_cast<const XMFLOAT3*>(pInputVector));
  9743. pInputVector += InputStride;
  9744. XMVECTOR Z = XM_PERMUTE_PS( V, _MM_SHUFFLE(2, 2, 2, 2) );
  9745. XMVECTOR Y = XM_PERMUTE_PS( V, _MM_SHUFFLE(1, 1, 1, 1) );
  9746. XMVECTOR X = XM_PERMUTE_PS( V, _MM_SHUFFLE(0, 0, 0, 0) );
  9747. XMVECTOR vTemp = _mm_mul_ps( Z, row2 );
  9748. XMVECTOR vTemp2 = _mm_mul_ps( Y, row1 );
  9749. XMVECTOR vTemp3 = _mm_mul_ps( X, row0 );
  9750. vTemp = _mm_add_ps( vTemp, vTemp2 );
  9751. vTemp = _mm_add_ps( vTemp, vTemp3 );
  9752. XMStoreFloat3(reinterpret_cast<XMFLOAT3*>(pOutputVector), vTemp);
  9753. pOutputVector += OutputStride;
  9754. }
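// XM_STREAM_PS in the aligned path uses non-temporal stores; XM_SFENCE orders
// them before the function returns.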
  9755. XM_SFENCE();
  9756. return pOutputStream;
  9757. #endif
  9758. }
  9759. #ifdef _PREFAST_
  9760. #pragma prefast(pop)
  9761. #endif
  9762. //------------------------------------------------------------------------------
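// XMVector3Project takes an object-space point through World*View*Projection,
// performs the perspective divide, and maps the result into the given viewport
// (x/y in pixels, z in [ViewportMinZ, ViewportMaxZ]). Illustrative call using
// caller-supplied names:
//     XMVECTOR screen = XMVector3Project(worldPos, 0.f, 0.f, 1280.f, 720.f,
//                                        0.f, 1.f, proj, view, XMMatrixIdentity());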
  9763. inline XMVECTOR XM_CALLCONV XMVector3Project
  9764. (
  9765. FXMVECTOR V,
  9766. float ViewportX,
  9767. float ViewportY,
  9768. float ViewportWidth,
  9769. float ViewportHeight,
  9770. float ViewportMinZ,
  9771. float ViewportMaxZ,
  9772. FXMMATRIX Projection,
  9773. CXMMATRIX View,
  9774. CXMMATRIX World
  9775. )
  9776. {
  9777. const float HalfViewportWidth = ViewportWidth * 0.5f;
  9778. const float HalfViewportHeight = ViewportHeight * 0.5f;
  9779. XMVECTOR Scale = XMVectorSet(HalfViewportWidth, -HalfViewportHeight, ViewportMaxZ - ViewportMinZ, 0.0f);
  9780. XMVECTOR Offset = XMVectorSet(ViewportX + HalfViewportWidth, ViewportY + HalfViewportHeight, ViewportMinZ, 0.0f);
  9781. XMMATRIX Transform = XMMatrixMultiply(World, View);
  9782. Transform = XMMatrixMultiply(Transform, Projection);
  9783. XMVECTOR Result = XMVector3TransformCoord(V, Transform);
  9784. Result = XMVectorMultiplyAdd(Result, Scale, Offset);
  9785. return Result;
  9786. }
  9787. //------------------------------------------------------------------------------
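// XMVector3ProjectStream is the batched form of XMVector3Project: it projects
// VectorCount strided XMFLOAT3 points and returns pOutputStream. The combined
// World*View*Projection transform is computed once before the loop.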
  9788. #ifdef _PREFAST_
  9789. #pragma prefast(push)
  9790. #pragma prefast(disable : 26015 26019, "PREfast noise: Esp:1307" )
  9791. #endif
  9792. _Use_decl_annotations_
  9793. inline XMFLOAT3* XM_CALLCONV XMVector3ProjectStream
  9794. (
  9795. XMFLOAT3* pOutputStream,
  9796. size_t OutputStride,
  9797. const XMFLOAT3* pInputStream,
  9798. size_t InputStride,
  9799. size_t VectorCount,
  9800. float ViewportX,
  9801. float ViewportY,
  9802. float ViewportWidth,
  9803. float ViewportHeight,
  9804. float ViewportMinZ,
  9805. float ViewportMaxZ,
  9806. FXMMATRIX Projection,
  9807. CXMMATRIX View,
  9808. CXMMATRIX World
  9809. )
  9810. {
  9811. assert(pOutputStream != nullptr);
  9812. assert(pInputStream != nullptr);
  9813. assert(InputStride >= sizeof(XMFLOAT3));
  9814. _Analysis_assume_(InputStride >= sizeof(XMFLOAT3));
  9815. assert(OutputStride >= sizeof(XMFLOAT3));
  9816. _Analysis_assume_(OutputStride >= sizeof(XMFLOAT3));
  9817. #if defined(_XM_NO_INTRINSICS_)
  9818. const float HalfViewportWidth = ViewportWidth * 0.5f;
  9819. const float HalfViewportHeight = ViewportHeight * 0.5f;
  9820. XMVECTOR Scale = XMVectorSet(HalfViewportWidth, -HalfViewportHeight, ViewportMaxZ - ViewportMinZ, 1.0f);
  9821. XMVECTOR Offset = XMVectorSet(ViewportX + HalfViewportWidth, ViewportY + HalfViewportHeight, ViewportMinZ, 0.0f);
  9822. XMMATRIX Transform = XMMatrixMultiply(World, View);
  9823. Transform = XMMatrixMultiply(Transform, Projection);
  9824. const uint8_t* pInputVector = (const uint8_t*)pInputStream;
  9825. uint8_t* pOutputVector = (uint8_t*)pOutputStream;
  9826. for (size_t i = 0; i < VectorCount; i++)
  9827. {
  9828. XMVECTOR V = XMLoadFloat3((const XMFLOAT3*)pInputVector);
  9829. XMVECTOR Result = XMVector3TransformCoord(V, Transform);
  9830. Result = XMVectorMultiplyAdd(Result, Scale, Offset);
  9831. XMStoreFloat3((XMFLOAT3*)pOutputVector, Result);
  9832. pInputVector += InputStride;
  9833. pOutputVector += OutputStride;
  9834. }
  9835. return pOutputStream;
  9836. #elif defined(_XM_ARM_NEON_INTRINSICS_)
  9837. const float HalfViewportWidth = ViewportWidth * 0.5f;
  9838. const float HalfViewportHeight = ViewportHeight * 0.5f;
  9839. XMMATRIX Transform = XMMatrixMultiply(World, View);
  9840. Transform = XMMatrixMultiply(Transform, Projection);
  9841. const uint8_t* pInputVector = (const uint8_t*)pInputStream;
  9842. uint8_t* pOutputVector = (uint8_t*)pOutputStream;
  9843. size_t i = 0;
  9844. size_t four = VectorCount >> 2;
  9845. if ( four > 0 )
  9846. {
  9847. if ((InputStride == sizeof(XMFLOAT3)) && (OutputStride == sizeof(XMFLOAT3)))
  9848. {
  9849. XMVECTOR ScaleX = vdupq_n_f32(HalfViewportWidth);
  9850. XMVECTOR ScaleY = vdupq_n_f32(-HalfViewportHeight);
  9851. XMVECTOR ScaleZ = vdupq_n_f32(ViewportMaxZ - ViewportMinZ);
  9852. XMVECTOR OffsetX = vdupq_n_f32(ViewportX + HalfViewportWidth);
  9853. XMVECTOR OffsetY = vdupq_n_f32(ViewportY + HalfViewportHeight);
  9854. XMVECTOR OffsetZ = vdupq_n_f32(ViewportMinZ);
  9855. for (size_t j = 0; j < four; ++j)
  9856. {
  9857. float32x4x3_t V = vld3q_f32( reinterpret_cast<const float*>(pInputVector) );
  9858. pInputVector += sizeof(XMFLOAT3)*4;
  9859. float32x2_t r3 = vget_low_f32( Transform.r[3] );
  9860. float32x2_t r = vget_low_f32( Transform.r[0] );
  9861. XMVECTOR vResult0 = vmlaq_lane_f32( vdupq_lane_f32( r3, 0 ), V.val[0], r, 0 ); // Ax+M
  9862. XMVECTOR vResult1 = vmlaq_lane_f32( vdupq_lane_f32( r3, 1 ), V.val[0], r, 1 ); // Bx+N
  9863. __prefetch( pInputVector );
  9864. r3 = vget_high_f32( Transform.r[3] );
  9865. r = vget_high_f32( Transform.r[0] );
  9866. XMVECTOR vResult2 = vmlaq_lane_f32( vdupq_lane_f32( r3, 0 ), V.val[0], r, 0 ); // Cx+O
  9867. XMVECTOR W = vmlaq_lane_f32( vdupq_lane_f32( r3, 1 ), V.val[0], r, 1 ); // Dx+P
  9868. __prefetch( pInputVector+XM_CACHE_LINE_SIZE );
  9869. r = vget_low_f32( Transform.r[1] );
  9870. vResult0 = vmlaq_lane_f32( vResult0, V.val[1], r, 0 ); // Ax+Ey+M
  9871. vResult1 = vmlaq_lane_f32( vResult1, V.val[1], r, 1 ); // Bx+Fy+N
  9872. __prefetch( pInputVector+(XM_CACHE_LINE_SIZE*2) );
  9873. r = vget_high_f32( Transform.r[1] );
  9874. vResult2 = vmlaq_lane_f32( vResult2, V.val[1], r, 0 ); // Cx+Gy+O
  9875. W = vmlaq_lane_f32( W, V.val[1], r, 1 ); // Dx+Hy+P
  9876. __prefetch( pInputVector+(XM_CACHE_LINE_SIZE*3) );
  9877. r = vget_low_f32( Transform.r[2] );
  9878. vResult0 = vmlaq_lane_f32( vResult0, V.val[2], r, 0 ); // Ax+Ey+Iz+M
  9879. vResult1 = vmlaq_lane_f32( vResult1, V.val[2], r, 1 ); // Bx+Fy+Jz+N
  9880. __prefetch( pInputVector+(XM_CACHE_LINE_SIZE*4) );
  9881. r = vget_high_f32( Transform.r[2] );
  9882. vResult2 = vmlaq_lane_f32( vResult2, V.val[2], r, 0 ); // Cx+Gy+Kz+O
  9883. W = vmlaq_lane_f32( W, V.val[2], r, 1 ); // Dx+Hy+Lz+P
  9884. __prefetch( pInputVector+(XM_CACHE_LINE_SIZE*5) );
  9885. #if defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64)
  9886. vResult0 = vdivq_f32( vResult0, W );
  9887. vResult1 = vdivq_f32( vResult1, W );
  9888. vResult2 = vdivq_f32( vResult2, W );
  9889. #else
  9890. // 2 iterations of Newton-Raphson refinement of reciprocal
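// (vrecpeq_f32 yields only a rough estimate of 1/W; each vrecpsq_f32 + vmulq_f32
// pair below is one refinement step.)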
  9891. float32x4_t Reciprocal = vrecpeq_f32(W);
  9892. float32x4_t S = vrecpsq_f32( Reciprocal, W );
  9893. Reciprocal = vmulq_f32( S, Reciprocal );
  9894. S = vrecpsq_f32( Reciprocal, W );
  9895. Reciprocal = vmulq_f32( S, Reciprocal );
  9896. vResult0 = vmulq_f32( vResult0, Reciprocal );
  9897. vResult1 = vmulq_f32( vResult1, Reciprocal );
  9898. vResult2 = vmulq_f32( vResult2, Reciprocal );
  9899. #endif
  9900. V.val[0] = vmlaq_f32( OffsetX, vResult0, ScaleX );
  9901. V.val[1] = vmlaq_f32( OffsetY, vResult1, ScaleY );
  9902. V.val[2] = vmlaq_f32( OffsetZ, vResult2, ScaleZ );
  9903. vst3q_f32( reinterpret_cast<float*>(pOutputVector),V );
  9904. pOutputVector += sizeof(XMFLOAT3)*4;
  9905. i += 4;
  9906. }
  9907. }
  9908. }
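// Handle any remaining vectors that the four-at-a-time loop did not cover.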
  9909. if ( i < VectorCount)
  9910. {
  9911. XMVECTOR Scale = XMVectorSet(HalfViewportWidth, -HalfViewportHeight, ViewportMaxZ - ViewportMinZ, 1.0f);
  9912. XMVECTOR Offset = XMVectorSet(ViewportX + HalfViewportWidth, ViewportY + HalfViewportHeight, ViewportMinZ, 0.0f);
  9913. for (; i < VectorCount; i++)
  9914. {
  9915. float32x2_t VL = vld1_f32( reinterpret_cast<const float*>(pInputVector) );
  9916. float32x2_t zero = vdup_n_f32(0);
  9917. float32x2_t VH = vld1_lane_f32( reinterpret_cast<const float*>(pInputVector)+2, zero, 0 );
  9918. pInputVector += InputStride;
  9919. XMVECTOR vResult = vmlaq_lane_f32( Transform.r[3], Transform.r[0], VL, 0 ); // X
  9920. vResult = vmlaq_lane_f32( vResult, Transform.r[1], VL, 1 ); // Y
  9921. vResult = vmlaq_lane_f32( vResult, Transform.r[2], VH, 0 ); // Z
  9922. VH = vget_high_f32(vResult);
  9923. XMVECTOR W = vdupq_lane_f32( VH, 1 );
  9924. #if defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64)
  9925. vResult = vdivq_f32( vResult, W );
  9926. #else
  9927. // 2 iterations of Newton-Raphson refinement of reciprocal for W
  9928. float32x4_t Reciprocal = vrecpeq_f32( W );
  9929. float32x4_t S = vrecpsq_f32( Reciprocal, W );
  9930. Reciprocal = vmulq_f32( S, Reciprocal );
  9931. S = vrecpsq_f32( Reciprocal, W );
  9932. Reciprocal = vmulq_f32( S, Reciprocal );
  9933. vResult = vmulq_f32( vResult, Reciprocal );
  9934. #endif
  9935. vResult = vmlaq_f32( Offset, vResult, Scale );
  9936. VL = vget_low_f32( vResult );
  9937. vst1_f32( reinterpret_cast<float*>(pOutputVector), VL );
  9938. vst1q_lane_f32( reinterpret_cast<float*>(pOutputVector)+2, vResult, 2 );
  9939. pOutputVector += OutputStride;
  9940. }
  9941. }
  9942. return pOutputStream;
  9943. #elif defined(_XM_SSE_INTRINSICS_)
  9944. const float HalfViewportWidth = ViewportWidth * 0.5f;
  9945. const float HalfViewportHeight = ViewportHeight * 0.5f;
  9946. XMVECTOR Scale = XMVectorSet(HalfViewportWidth, -HalfViewportHeight, ViewportMaxZ - ViewportMinZ, 1.0f);
  9947. XMVECTOR Offset = XMVectorSet(ViewportX + HalfViewportWidth, ViewportY + HalfViewportHeight, ViewportMinZ, 0.0f);
  9948. XMMATRIX Transform = XMMatrixMultiply(World, View);
  9949. Transform = XMMatrixMultiply(Transform, Projection);
  9950. const uint8_t* pInputVector = (const uint8_t*)pInputStream;
  9951. uint8_t* pOutputVector = (uint8_t*)pOutputStream;
  9952. size_t i = 0;
  9953. size_t four = VectorCount >> 2;
  9954. if ( four > 0 )
  9955. {
  9956. if (InputStride == sizeof(XMFLOAT3))
  9957. {
  9958. if (OutputStride == sizeof(XMFLOAT3))
  9959. {
  9960. if ( !((uintptr_t)pOutputStream & 0xF) )
  9961. {
  9962. // Packed input, aligned & packed output
  9963. for (size_t j = 0; j < four; ++j)
  9964. {
  9965. __m128 V1 = _mm_loadu_ps( reinterpret_cast<const float*>(pInputVector) );
  9966. __m128 L2 = _mm_loadu_ps( reinterpret_cast<const float*>(pInputVector+16) );
  9967. __m128 L3 = _mm_loadu_ps( reinterpret_cast<const float*>(pInputVector+32) );
  9968. pInputVector += sizeof(XMFLOAT3)*4;
  9969. // Unpack the 4 vectors (.w components are junk)
  9970. XM3UNPACK3INTO4(V1,L2,L3);
  9971. // Result 1
  9972. XMVECTOR Z = XM_PERMUTE_PS( V1, _MM_SHUFFLE(2, 2, 2, 2) );
  9973. XMVECTOR Y = XM_PERMUTE_PS( V1, _MM_SHUFFLE(1, 1, 1, 1) );
  9974. XMVECTOR X = XM_PERMUTE_PS( V1, _MM_SHUFFLE(0, 0, 0, 0) );
  9975. XMVECTOR vTemp = _mm_mul_ps( Z, Transform.r[2] );
  9976. XMVECTOR vTemp2 = _mm_mul_ps( Y, Transform.r[1] );
  9977. XMVECTOR vTemp3 = _mm_mul_ps( X, Transform.r[0] );
  9978. vTemp = _mm_add_ps( vTemp, Transform.r[3] );
  9979. vTemp = _mm_add_ps( vTemp, vTemp2 );
  9980. vTemp = _mm_add_ps( vTemp, vTemp3 );
  9981. XMVECTOR W = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(3, 3, 3, 3) );
  9982. vTemp = _mm_div_ps( vTemp, W );
  9983. vTemp = _mm_mul_ps( vTemp, Scale );
  9984. V1 = _mm_add_ps( vTemp, Offset );
  9985. // Result 2
  9986. Z = XM_PERMUTE_PS( V2, _MM_SHUFFLE(2, 2, 2, 2) );
  9987. Y = XM_PERMUTE_PS( V2, _MM_SHUFFLE(1, 1, 1, 1) );
  9988. X = XM_PERMUTE_PS( V2, _MM_SHUFFLE(0, 0, 0, 0) );
  9989. vTemp = _mm_mul_ps( Z, Transform.r[2] );
  9990. vTemp2 = _mm_mul_ps( Y, Transform.r[1] );
  9991. vTemp3 = _mm_mul_ps( X, Transform.r[0] );
  9992. vTemp = _mm_add_ps( vTemp, Transform.r[3] );
  9993. vTemp = _mm_add_ps( vTemp, vTemp2 );
  9994. vTemp = _mm_add_ps( vTemp, vTemp3 );
  9995. W = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(3, 3, 3, 3) );
  9996. vTemp = _mm_div_ps( vTemp, W );
  9997. vTemp = _mm_mul_ps( vTemp, Scale );
  9998. V2 = _mm_add_ps( vTemp, Offset );
  9999. // Result 3
  10000. Z = XM_PERMUTE_PS( V3, _MM_SHUFFLE(2, 2, 2, 2) );
  10001. Y = XM_PERMUTE_PS( V3, _MM_SHUFFLE(1, 1, 1, 1) );
  10002. X = XM_PERMUTE_PS( V3, _MM_SHUFFLE(0, 0, 0, 0) );
  10003. vTemp = _mm_mul_ps( Z, Transform.r[2] );
  10004. vTemp2 = _mm_mul_ps( Y, Transform.r[1] );
  10005. vTemp3 = _mm_mul_ps( X, Transform.r[0] );
  10006. vTemp = _mm_add_ps( vTemp, Transform.r[3] );
  10007. vTemp = _mm_add_ps( vTemp, vTemp2 );
  10008. vTemp = _mm_add_ps( vTemp, vTemp3 );
  10009. W = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(3, 3, 3, 3) );
  10010. vTemp = _mm_div_ps( vTemp, W );
  10011. vTemp = _mm_mul_ps( vTemp, Scale );
  10012. V3 = _mm_add_ps( vTemp, Offset );
  10013. // Result 4
  10014. Z = XM_PERMUTE_PS( V4, _MM_SHUFFLE(2, 2, 2, 2) );
  10015. Y = XM_PERMUTE_PS( V4, _MM_SHUFFLE(1, 1, 1, 1) );
  10016. X = XM_PERMUTE_PS( V4, _MM_SHUFFLE(0, 0, 0, 0) );
  10017. vTemp = _mm_mul_ps( Z, Transform.r[2] );
  10018. vTemp2 = _mm_mul_ps( Y, Transform.r[1] );
  10019. vTemp3 = _mm_mul_ps( X, Transform.r[0] );
  10020. vTemp = _mm_add_ps( vTemp, Transform.r[3] );
  10021. vTemp = _mm_add_ps( vTemp, vTemp2 );
  10022. vTemp = _mm_add_ps( vTemp, vTemp3 );
  10023. W = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(3, 3, 3, 3) );
  10024. vTemp = _mm_div_ps( vTemp, W );
  10025. vTemp = _mm_mul_ps( vTemp, Scale );
  10026. V4 = _mm_add_ps( vTemp, Offset );
  10027. // Pack and store the vectors
  10028. XM3PACK4INTO3(vTemp);
  10029. XM_STREAM_PS( reinterpret_cast<float*>(pOutputVector), V1 );
  10030. XM_STREAM_PS( reinterpret_cast<float*>(pOutputVector+16), vTemp );
  10031. XM_STREAM_PS( reinterpret_cast<float*>(pOutputVector+32), V3 );
  10032. pOutputVector += sizeof(XMFLOAT3)*4;
  10033. i += 4;
  10034. }
  10035. }
  10036. else
  10037. {
  10038. // Packed input, unaligned & packed output
  10039. for (size_t j = 0; j < four; ++j)
  10040. {
  10041. __m128 V1 = _mm_loadu_ps( reinterpret_cast<const float*>(pInputVector) );
  10042. __m128 L2 = _mm_loadu_ps( reinterpret_cast<const float*>(pInputVector+16) );
  10043. __m128 L3 = _mm_loadu_ps( reinterpret_cast<const float*>(pInputVector+32) );
  10044. pInputVector += sizeof(XMFLOAT3)*4;
  10045. // Unpack the 4 vectors (.w components are junk)
  10046. XM3UNPACK3INTO4(V1,L2,L3);
  10047. // Result 1
  10048. XMVECTOR Z = XM_PERMUTE_PS( V1, _MM_SHUFFLE(2, 2, 2, 2) );
  10049. XMVECTOR Y = XM_PERMUTE_PS( V1, _MM_SHUFFLE(1, 1, 1, 1) );
  10050. XMVECTOR X = XM_PERMUTE_PS( V1, _MM_SHUFFLE(0, 0, 0, 0) );
  10051. XMVECTOR vTemp = _mm_mul_ps( Z, Transform.r[2] );
  10052. XMVECTOR vTemp2 = _mm_mul_ps( Y, Transform.r[1] );
  10053. XMVECTOR vTemp3 = _mm_mul_ps( X, Transform.r[0] );
  10054. vTemp = _mm_add_ps( vTemp, Transform.r[3] );
  10055. vTemp = _mm_add_ps( vTemp, vTemp2 );
  10056. vTemp = _mm_add_ps( vTemp, vTemp3 );
  10057. XMVECTOR W = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(3, 3, 3, 3) );
  10058. vTemp = _mm_div_ps( vTemp, W );
  10059. vTemp = _mm_mul_ps( vTemp, Scale );
  10060. V1 = _mm_add_ps( vTemp, Offset );
  10061. // Result 2
  10062. Z = XM_PERMUTE_PS( V2, _MM_SHUFFLE(2, 2, 2, 2) );
  10063. Y = XM_PERMUTE_PS( V2, _MM_SHUFFLE(1, 1, 1, 1) );
  10064. X = XM_PERMUTE_PS( V2, _MM_SHUFFLE(0, 0, 0, 0) );
  10065. vTemp = _mm_mul_ps( Z, Transform.r[2] );
  10066. vTemp2 = _mm_mul_ps( Y, Transform.r[1] );
  10067. vTemp3 = _mm_mul_ps( X, Transform.r[0] );
  10068. vTemp = _mm_add_ps( vTemp, Transform.r[3] );
  10069. vTemp = _mm_add_ps( vTemp, vTemp2 );
  10070. vTemp = _mm_add_ps( vTemp, vTemp3 );
  10071. W = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(3, 3, 3, 3) );
  10072. vTemp = _mm_div_ps( vTemp, W );
  10073. vTemp = _mm_mul_ps( vTemp, Scale );
  10074. V2 = _mm_add_ps( vTemp, Offset );
  10075. // Result 3
  10076. Z = XM_PERMUTE_PS( V3, _MM_SHUFFLE(2, 2, 2, 2) );
  10077. Y = XM_PERMUTE_PS( V3, _MM_SHUFFLE(1, 1, 1, 1) );
  10078. X = XM_PERMUTE_PS( V3, _MM_SHUFFLE(0, 0, 0, 0) );
  10079. vTemp = _mm_mul_ps( Z, Transform.r[2] );
  10080. vTemp2 = _mm_mul_ps( Y, Transform.r[1] );
  10081. vTemp3 = _mm_mul_ps( X, Transform.r[0] );
  10082. vTemp = _mm_add_ps( vTemp, Transform.r[3] );
  10083. vTemp = _mm_add_ps( vTemp, vTemp2 );
  10084. vTemp = _mm_add_ps( vTemp, vTemp3 );
  10085. W = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(3, 3, 3, 3) );
  10086. vTemp = _mm_div_ps( vTemp, W );
  10087. vTemp = _mm_mul_ps( vTemp, Scale );
  10088. V3 = _mm_add_ps( vTemp, Offset );
  10089. // Result 4
  10090. Z = XM_PERMUTE_PS( V4, _MM_SHUFFLE(2, 2, 2, 2) );
  10091. Y = XM_PERMUTE_PS( V4, _MM_SHUFFLE(1, 1, 1, 1) );
  10092. X = XM_PERMUTE_PS( V4, _MM_SHUFFLE(0, 0, 0, 0) );
  10093. vTemp = _mm_mul_ps( Z, Transform.r[2] );
  10094. vTemp2 = _mm_mul_ps( Y, Transform.r[1] );
  10095. vTemp3 = _mm_mul_ps( X, Transform.r[0] );
  10096. vTemp = _mm_add_ps( vTemp, Transform.r[3] );
  10097. vTemp = _mm_add_ps( vTemp, vTemp2 );
  10098. vTemp = _mm_add_ps( vTemp, vTemp3 );
  10099. W = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(3, 3, 3, 3) );
  10100. vTemp = _mm_div_ps( vTemp, W );
  10101. vTemp = _mm_mul_ps( vTemp, Scale );
  10102. V4 = _mm_add_ps( vTemp, Offset );
  10103. // Pack and store the vectors
  10104. XM3PACK4INTO3(vTemp);
  10105. _mm_storeu_ps( reinterpret_cast<float*>(pOutputVector), V1 );
  10106. _mm_storeu_ps( reinterpret_cast<float*>(pOutputVector+16), vTemp );
  10107. _mm_storeu_ps( reinterpret_cast<float*>(pOutputVector+32), V3 );
  10108. pOutputVector += sizeof(XMFLOAT3)*4;
  10109. i += 4;
  10110. }
  10111. }
  10112. }
  10113. else
  10114. {
  10115. // Packed input, unpacked output
  10116. for (size_t j = 0; j < four; ++j)
  10117. {
  10118. __m128 V1 = _mm_loadu_ps( reinterpret_cast<const float*>(pInputVector) );
  10119. __m128 L2 = _mm_loadu_ps( reinterpret_cast<const float*>(pInputVector+16) );
  10120. __m128 L3 = _mm_loadu_ps( reinterpret_cast<const float*>(pInputVector+32) );
  10121. pInputVector += sizeof(XMFLOAT3)*4;
  10122. // Unpack the 4 vectors (.w components are junk)
  10123. XM3UNPACK3INTO4(V1,L2,L3);
  10124. // Result 1
  10125. XMVECTOR Z = XM_PERMUTE_PS( V1, _MM_SHUFFLE(2, 2, 2, 2) );
  10126. XMVECTOR Y = XM_PERMUTE_PS( V1, _MM_SHUFFLE(1, 1, 1, 1) );
  10127. XMVECTOR X = XM_PERMUTE_PS( V1, _MM_SHUFFLE(0, 0, 0, 0) );
  10128. XMVECTOR vTemp = _mm_mul_ps( Z, Transform.r[2] );
  10129. XMVECTOR vTemp2 = _mm_mul_ps( Y, Transform.r[1] );
  10130. XMVECTOR vTemp3 = _mm_mul_ps( X, Transform.r[0] );
  10131. vTemp = _mm_add_ps( vTemp, Transform.r[3] );
  10132. vTemp = _mm_add_ps( vTemp, vTemp2 );
  10133. vTemp = _mm_add_ps( vTemp, vTemp3 );
  10134. XMVECTOR W = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(3, 3, 3, 3) );
  10135. vTemp = _mm_div_ps( vTemp, W );
  10136. vTemp = _mm_mul_ps( vTemp, Scale );
  10137. vTemp = _mm_add_ps( vTemp, Offset );
  10138. XMStoreFloat3(reinterpret_cast<XMFLOAT3*>(pOutputVector), vTemp);
  10139. pOutputVector += OutputStride;
  10140. // Result 2
  10141. Z = XM_PERMUTE_PS( V2, _MM_SHUFFLE(2, 2, 2, 2) );
  10142. Y = XM_PERMUTE_PS( V2, _MM_SHUFFLE(1, 1, 1, 1) );
  10143. X = XM_PERMUTE_PS( V2, _MM_SHUFFLE(0, 0, 0, 0) );
  10144. vTemp = _mm_mul_ps( Z, Transform.r[2] );
  10145. vTemp2 = _mm_mul_ps( Y, Transform.r[1] );
  10146. vTemp3 = _mm_mul_ps( X, Transform.r[0] );
  10147. vTemp = _mm_add_ps( vTemp, Transform.r[3] );
  10148. vTemp = _mm_add_ps( vTemp, vTemp2 );
  10149. vTemp = _mm_add_ps( vTemp, vTemp3 );
  10150. W = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(3, 3, 3, 3) );
  10151. vTemp = _mm_div_ps( vTemp, W );
  10152. vTemp = _mm_mul_ps( vTemp, Scale );
  10153. vTemp = _mm_add_ps( vTemp, Offset );
  10154. XMStoreFloat3(reinterpret_cast<XMFLOAT3*>(pOutputVector), vTemp);
  10155. pOutputVector += OutputStride;
  10156. // Result 3
  10157. Z = XM_PERMUTE_PS( V3, _MM_SHUFFLE(2, 2, 2, 2) );
  10158. Y = XM_PERMUTE_PS( V3, _MM_SHUFFLE(1, 1, 1, 1) );
  10159. X = XM_PERMUTE_PS( V3, _MM_SHUFFLE(0, 0, 0, 0) );
  10160. vTemp = _mm_mul_ps( Z, Transform.r[2] );
  10161. vTemp2 = _mm_mul_ps( Y, Transform.r[1] );
  10162. vTemp3 = _mm_mul_ps( X, Transform.r[0] );
  10163. vTemp = _mm_add_ps( vTemp, Transform.r[3] );
  10164. vTemp = _mm_add_ps( vTemp, vTemp2 );
  10165. vTemp = _mm_add_ps( vTemp, vTemp3 );
  10166. W = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(3, 3, 3, 3) );
  10167. vTemp = _mm_div_ps( vTemp, W );
  10168. vTemp = _mm_mul_ps( vTemp, Scale );
  10169. vTemp = _mm_add_ps( vTemp, Offset );
  10170. XMStoreFloat3(reinterpret_cast<XMFLOAT3*>(pOutputVector), vTemp);
  10171. pOutputVector += OutputStride;
  10172. // Result 4
  10173. Z = XM_PERMUTE_PS( V4, _MM_SHUFFLE(2, 2, 2, 2) );
  10174. Y = XM_PERMUTE_PS( V4, _MM_SHUFFLE(1, 1, 1, 1) );
  10175. X = XM_PERMUTE_PS( V4, _MM_SHUFFLE(0, 0, 0, 0) );
  10176. vTemp = _mm_mul_ps( Z, Transform.r[2] );
  10177. vTemp2 = _mm_mul_ps( Y, Transform.r[1] );
  10178. vTemp3 = _mm_mul_ps( X, Transform.r[0] );
  10179. vTemp = _mm_add_ps( vTemp, Transform.r[3] );
  10180. vTemp = _mm_add_ps( vTemp, vTemp2 );
  10181. vTemp = _mm_add_ps( vTemp, vTemp3 );
  10182. W = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(3, 3, 3, 3) );
  10183. vTemp = _mm_div_ps( vTemp, W );
  10184. vTemp = _mm_mul_ps( vTemp, Scale );
  10185. vTemp = _mm_add_ps( vTemp, Offset );
  10186. XMStoreFloat3(reinterpret_cast<XMFLOAT3*>(pOutputVector), vTemp);
  10187. pOutputVector += OutputStride;
  10188. i += 4;
  10189. }
  10190. }
  10191. }
  10192. }
  10193. for (; i < VectorCount; i++)
  10194. {
  10195. XMVECTOR V = XMLoadFloat3(reinterpret_cast<const XMFLOAT3*>(pInputVector));
  10196. pInputVector += InputStride;
  10197. XMVECTOR Z = XM_PERMUTE_PS( V, _MM_SHUFFLE(2, 2, 2, 2) );
  10198. XMVECTOR Y = XM_PERMUTE_PS( V, _MM_SHUFFLE(1, 1, 1, 1) );
  10199. XMVECTOR X = XM_PERMUTE_PS( V, _MM_SHUFFLE(0, 0, 0, 0) );
  10200. XMVECTOR vTemp = _mm_mul_ps( Z, Transform.r[2] );
  10201. XMVECTOR vTemp2 = _mm_mul_ps( Y, Transform.r[1] );
  10202. XMVECTOR vTemp3 = _mm_mul_ps( X, Transform.r[0] );
  10203. vTemp = _mm_add_ps( vTemp, Transform.r[3] );
  10204. vTemp = _mm_add_ps( vTemp, vTemp2 );
  10205. vTemp = _mm_add_ps( vTemp, vTemp3 );
  10206. XMVECTOR W = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(3, 3, 3, 3) );
  10207. vTemp = _mm_div_ps( vTemp, W );
  10208. vTemp = _mm_mul_ps( vTemp, Scale );
  10209. vTemp = _mm_add_ps( vTemp, Offset );
  10210. XMStoreFloat3(reinterpret_cast<XMFLOAT3*>(pOutputVector), vTemp);
  10211. pOutputVector += OutputStride;
  10212. }
  10213. XM_SFENCE();
  10214. return pOutputStream;
  10215. #endif
  10216. }
  10217. #ifdef _PREFAST_
  10218. #pragma prefast(pop)
  10219. #endif
  10220. //------------------------------------------------------------------------------
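// XMVector3Unproject is the inverse of XMVector3Project: it maps a viewport-space
// point back through the inverse of World*View*Projection into object space.
// A common use is building a picking ray from a mouse position (illustrative,
// caller-supplied names):
//     XMVECTOR nearPt = XMVector3Unproject(XMVectorSet(mx, my, 0.f, 0.f),
//                           0.f, 0.f, width, height, 0.f, 1.f, proj, view, world);
//     XMVECTOR farPt  = XMVector3Unproject(XMVectorSet(mx, my, 1.f, 0.f),
//                           0.f, 0.f, width, height, 0.f, 1.f, proj, view, world);
//     XMVECTOR rayDir = XMVector3Normalize(XMVectorSubtract(farPt, nearPt));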
  10221. inline XMVECTOR XM_CALLCONV XMVector3Unproject
  10222. (
  10223. FXMVECTOR V,
  10224. float ViewportX,
  10225. float ViewportY,
  10226. float ViewportWidth,
  10227. float ViewportHeight,
  10228. float ViewportMinZ,
  10229. float ViewportMaxZ,
  10230. FXMMATRIX Projection,
  10231. CXMMATRIX View,
  10232. CXMMATRIX World
  10233. )
  10234. {
  10235. static const XMVECTORF32 D = { { { -1.0f, 1.0f, 0.0f, 0.0f } } };
  10236. XMVECTOR Scale = XMVectorSet(ViewportWidth * 0.5f, -ViewportHeight * 0.5f, ViewportMaxZ - ViewportMinZ, 1.0f);
  10237. Scale = XMVectorReciprocal(Scale);
  10238. XMVECTOR Offset = XMVectorSet(-ViewportX, -ViewportY, -ViewportMinZ, 0.0f);
  10239. Offset = XMVectorMultiplyAdd(Scale, Offset, D.v);
  10240. XMMATRIX Transform = XMMatrixMultiply(World, View);
  10241. Transform = XMMatrixMultiply(Transform, Projection);
  10242. Transform = XMMatrixInverse(nullptr, Transform);
  10243. XMVECTOR Result = XMVectorMultiplyAdd(V, Scale, Offset);
  10244. return XMVector3TransformCoord(Result, Transform);
  10245. }
  10246. //------------------------------------------------------------------------------
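// XMVector3UnprojectStream applies the same unprojection to a strided array of
// XMFLOAT3 values and returns pOutputStream; the inverse of World*View*Projection
// is computed once and reused for every element.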
  10247. #ifdef _PREFAST_
  10248. #pragma prefast(push)
  10249. #pragma prefast(disable : 26015 26019, "PREfast noise: Esp:1307" )
  10250. #endif
  10251. _Use_decl_annotations_
  10252. inline XMFLOAT3* XM_CALLCONV XMVector3UnprojectStream
  10253. (
  10254. XMFLOAT3* pOutputStream,
  10255. size_t OutputStride,
  10256. const XMFLOAT3* pInputStream,
  10257. size_t InputStride,
  10258. size_t VectorCount,
  10259. float ViewportX,
  10260. float ViewportY,
  10261. float ViewportWidth,
  10262. float ViewportHeight,
  10263. float ViewportMinZ,
  10264. float ViewportMaxZ,
  10265. FXMMATRIX Projection,
  10266. CXMMATRIX View,
  10267. CXMMATRIX World)
  10268. {
  10269. assert(pOutputStream != nullptr);
  10270. assert(pInputStream != nullptr);
  10271. assert(InputStride >= sizeof(XMFLOAT3));
  10272. _Analysis_assume_(InputStride >= sizeof(XMFLOAT3));
  10273. assert(OutputStride >= sizeof(XMFLOAT3));
  10274. _Analysis_assume_(OutputStride >= sizeof(XMFLOAT3));
  10275. #if defined(_XM_NO_INTRINSICS_)
  10276. static const XMVECTORF32 D = { { { -1.0f, 1.0f, 0.0f, 0.0f } } };
  10277. XMVECTOR Scale = XMVectorSet(ViewportWidth * 0.5f, -ViewportHeight * 0.5f, ViewportMaxZ - ViewportMinZ, 1.0f);
  10278. Scale = XMVectorReciprocal(Scale);
  10279. XMVECTOR Offset = XMVectorSet(-ViewportX, -ViewportY, -ViewportMinZ, 0.0f);
  10280. Offset = XMVectorMultiplyAdd(Scale, Offset, D.v);
  10281. XMMATRIX Transform = XMMatrixMultiply(World, View);
  10282. Transform = XMMatrixMultiply(Transform, Projection);
  10283. Transform = XMMatrixInverse(nullptr, Transform);
  10284. const uint8_t* pInputVector = (const uint8_t*)pInputStream;
  10285. uint8_t* pOutputVector = (uint8_t*)pOutputStream;
  10286. for (size_t i = 0; i < VectorCount; i++)
  10287. {
  10288. XMVECTOR V = XMLoadFloat3((const XMFLOAT3*)pInputVector);
  10289. XMVECTOR Result = XMVectorMultiplyAdd(V, Scale, Offset);
  10290. Result = XMVector3TransformCoord(Result, Transform);
  10291. XMStoreFloat3((XMFLOAT3*)pOutputVector, Result);
  10292. pInputVector += InputStride;
  10293. pOutputVector += OutputStride;
  10294. }
  10295. return pOutputStream;
  10296. #elif defined(_XM_ARM_NEON_INTRINSICS_)
  10297. XMMATRIX Transform = XMMatrixMultiply(World, View);
  10298. Transform = XMMatrixMultiply(Transform, Projection);
  10299. Transform = XMMatrixInverse(nullptr, Transform);
  10300. const uint8_t* pInputVector = (const uint8_t*)pInputStream;
  10301. uint8_t* pOutputVector = (uint8_t*)pOutputStream;
  10302. float sx = 1.f / (ViewportWidth * 0.5f);
  10303. float sy = 1.f / (-ViewportHeight * 0.5f);
  10304. float sz = 1.f / (ViewportMaxZ - ViewportMinZ);
  10305. float ox = (-ViewportX * sx) - 1.f;
  10306. float oy = (-ViewportY * sy) + 1.f;
  10307. float oz = (-ViewportMinZ * sz);
  10308. size_t i = 0;
  10309. size_t four = VectorCount >> 2;
  10310. if ( four > 0 )
  10311. {
  10312. if ((InputStride == sizeof(XMFLOAT3)) && (OutputStride == sizeof(XMFLOAT3)))
  10313. {
  10314. for (size_t j = 0; j < four; ++j)
  10315. {
  10316. float32x4x3_t V = vld3q_f32( reinterpret_cast<const float*>(pInputVector) );
  10317. pInputVector += sizeof(XMFLOAT3)*4;
  10318. XMVECTOR ScaleX = vdupq_n_f32(sx);
  10319. XMVECTOR OffsetX = vdupq_n_f32(ox);
  10320. XMVECTOR VX = vmlaq_f32( OffsetX, ScaleX, V.val[0] );
  10321. float32x2_t r3 = vget_low_f32( Transform.r[3] );
  10322. float32x2_t r = vget_low_f32( Transform.r[0] );
  10323. XMVECTOR vResult0 = vmlaq_lane_f32( vdupq_lane_f32( r3, 0 ), VX, r, 0 ); // Ax+M
  10324. XMVECTOR vResult1 = vmlaq_lane_f32( vdupq_lane_f32( r3, 1 ), VX, r, 1 ); // Bx+N
  10325. __prefetch( pInputVector );
  10326. r3 = vget_high_f32( Transform.r[3] );
  10327. r = vget_high_f32( Transform.r[0] );
  10328. XMVECTOR vResult2 = vmlaq_lane_f32( vdupq_lane_f32( r3, 0 ), VX, r, 0 ); // Cx+O
  10329. XMVECTOR W = vmlaq_lane_f32( vdupq_lane_f32( r3, 1 ), VX, r, 1 ); // Dx+P
  10330. __prefetch( pInputVector+XM_CACHE_LINE_SIZE );
  10331. XMVECTOR ScaleY = vdupq_n_f32(sy);
  10332. XMVECTOR OffsetY = vdupq_n_f32(oy);
  10333. XMVECTOR VY = vmlaq_f32( OffsetY, ScaleY, V.val[1] );
  10334. r = vget_low_f32( Transform.r[1] );
  10335. vResult0 = vmlaq_lane_f32( vResult0, VY, r, 0 ); // Ax+Ey+M
  10336. vResult1 = vmlaq_lane_f32( vResult1, VY, r, 1 ); // Bx+Fy+N
  10337. __prefetch( pInputVector+(XM_CACHE_LINE_SIZE*2) );
  10338. r = vget_high_f32( Transform.r[1] );
  10339. vResult2 = vmlaq_lane_f32( vResult2, VY, r, 0 ); // Cx+Gy+O
  10340. W = vmlaq_lane_f32( W, VY, r, 1 ); // Dx+Hy+P
  10341. __prefetch( pInputVector+(XM_CACHE_LINE_SIZE*3) );
  10342. XMVECTOR ScaleZ = vdupq_n_f32(sz);
  10343. XMVECTOR OffsetZ = vdupq_n_f32(oz);
  10344. XMVECTOR VZ = vmlaq_f32( OffsetZ, ScaleZ, V.val[2] );
  10345. r = vget_low_f32( Transform.r[2] );
  10346. vResult0 = vmlaq_lane_f32( vResult0, VZ, r, 0 ); // Ax+Ey+Iz+M
  10347. vResult1 = vmlaq_lane_f32( vResult1, VZ, r, 1 ); // Bx+Fy+Jz+N
  10348. __prefetch( pInputVector+(XM_CACHE_LINE_SIZE*4) );
  10349. r = vget_high_f32( Transform.r[2] );
  10350. vResult2 = vmlaq_lane_f32( vResult2, VZ, r, 0 ); // Cx+Gy+Kz+O
  10351. W = vmlaq_lane_f32( W, VZ, r, 1 ); // Dx+Hy+Lz+P
  10352. __prefetch( pInputVector+(XM_CACHE_LINE_SIZE*5) );
  10353. #if defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64)
  10354. V.val[0] = vdivq_f32( vResult0, W );
  10355. V.val[1] = vdivq_f32( vResult1, W );
  10356. V.val[2] = vdivq_f32( vResult2, W );
  10357. #else
  10358. // 2 iterations of Newton-Raphson refinement of reciprocal
  10359. float32x4_t Reciprocal = vrecpeq_f32(W);
  10360. float32x4_t S = vrecpsq_f32( Reciprocal, W );
  10361. Reciprocal = vmulq_f32( S, Reciprocal );
  10362. S = vrecpsq_f32( Reciprocal, W );
  10363. Reciprocal = vmulq_f32( S, Reciprocal );
  10364. V.val[0] = vmulq_f32( vResult0, Reciprocal );
  10365. V.val[1] = vmulq_f32( vResult1, Reciprocal );
  10366. V.val[2] = vmulq_f32( vResult2, Reciprocal );
  10367. #endif
  10368. vst3q_f32( reinterpret_cast<float*>(pOutputVector),V );
  10369. pOutputVector += sizeof(XMFLOAT3)*4;
  10370. i += 4;
  10371. }
  10372. }
  10373. }
  10374. if (i < VectorCount)
  10375. {
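// Pack (sx,sy), sz, (ox,oy) and oz into float32x2_t registers by bit-copying the
// floats into the 64-bit payload vcreate_f32 expects.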
  10376. float32x2_t ScaleL = vcreate_f32(((uint64_t)*(const uint32_t *)&sx) | ((uint64_t)(*(const uint32_t *)&sy) << 32));
  10377. float32x2_t ScaleH = vcreate_f32((uint64_t)*(const uint32_t *)&sz);
  10378. float32x2_t OffsetL = vcreate_f32(((uint64_t)*(const uint32_t *)&ox) | ((uint64_t)(*(const uint32_t *)&oy) << 32));
  10379. float32x2_t OffsetH = vcreate_f32((uint64_t)*(const uint32_t *)&oz);
  10380. for (; i < VectorCount; i++)
  10381. {
  10382. float32x2_t VL = vld1_f32( reinterpret_cast<const float*>(pInputVector) );
  10383. float32x2_t zero = vdup_n_f32(0);
  10384. float32x2_t VH = vld1_lane_f32( reinterpret_cast<const float*>(pInputVector)+2, zero, 0 );
  10385. pInputVector += InputStride;
  10386. VL = vmla_f32( OffsetL, VL, ScaleL );
  10387. VH = vmla_f32( OffsetH, VH, ScaleH );
  10388. XMVECTOR vResult = vmlaq_lane_f32( Transform.r[3], Transform.r[0], VL, 0 ); // X
  10389. vResult = vmlaq_lane_f32( vResult, Transform.r[1], VL, 1 ); // Y
  10390. vResult = vmlaq_lane_f32( vResult, Transform.r[2], VH, 0 ); // Z
  10391. VH = vget_high_f32(vResult);
  10392. XMVECTOR W = vdupq_lane_f32( VH, 1 );
  10393. #if defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64)
  10394. vResult = vdivq_f32( vResult, W );
  10395. #else
  10396. // 2 iterations of Newton-Raphson refinement of reciprocal for W
  10397. float32x4_t Reciprocal = vrecpeq_f32( W );
  10398. float32x4_t S = vrecpsq_f32( Reciprocal, W );
  10399. Reciprocal = vmulq_f32( S, Reciprocal );
  10400. S = vrecpsq_f32( Reciprocal, W );
  10401. Reciprocal = vmulq_f32( S, Reciprocal );
  10402. vResult = vmulq_f32( vResult, Reciprocal );
  10403. #endif
  10404. VL = vget_low_f32( vResult );
  10405. vst1_f32( reinterpret_cast<float*>(pOutputVector), VL );
  10406. vst1q_lane_f32( reinterpret_cast<float*>(pOutputVector)+2, vResult, 2 );
  10407. pOutputVector += OutputStride;
  10408. }
  10409. }
  10410. return pOutputStream;
  10411. #elif defined(_XM_SSE_INTRINSICS_)
  10412. static const XMVECTORF32 D = { { { -1.0f, 1.0f, 0.0f, 0.0f } } };
  10413. XMVECTOR Scale = XMVectorSet(ViewportWidth * 0.5f, -ViewportHeight * 0.5f, ViewportMaxZ - ViewportMinZ, 1.0f);
  10414. Scale = XMVectorReciprocal(Scale);
  10415. XMVECTOR Offset = XMVectorSet(-ViewportX, -ViewportY, -ViewportMinZ, 0.0f);
  10416. Offset = _mm_mul_ps(Scale, Offset);
  10417. Offset = _mm_add_ps(Offset, D);
  10418. XMMATRIX Transform = XMMatrixMultiply(World, View);
  10419. Transform = XMMatrixMultiply(Transform, Projection);
  10420. Transform = XMMatrixInverse(nullptr, Transform);
  10421. const uint8_t* pInputVector = (const uint8_t*)pInputStream;
  10422. uint8_t* pOutputVector = (uint8_t*)pOutputStream;
  10423. size_t i = 0;
  10424. size_t four = VectorCount >> 2;
  10425. if ( four > 0 )
  10426. {
  10427. if (InputStride == sizeof(XMFLOAT3))
  10428. {
  10429. if (OutputStride == sizeof(XMFLOAT3))
  10430. {
  10431. if ( !((uintptr_t)pOutputStream & 0xF) )
  10432. {
  10433. // Packed input, aligned & packed output
  10434. for (size_t j = 0; j < four; ++j)
  10435. {
  10436. __m128 V1 = _mm_loadu_ps( reinterpret_cast<const float*>(pInputVector) );
  10437. __m128 L2 = _mm_loadu_ps( reinterpret_cast<const float*>(pInputVector+16) );
  10438. __m128 L3 = _mm_loadu_ps( reinterpret_cast<const float*>(pInputVector+32) );
  10439. pInputVector += sizeof(XMFLOAT3)*4;
  10440. // Unpack the 4 vectors (.w components are junk)
  10441. XM3UNPACK3INTO4(V1,L2,L3);
  10442. // Result 1
  10443. V1 = _mm_mul_ps( V1, Scale );
  10444. V1 = _mm_add_ps( V1, Offset );
  10445. XMVECTOR Z = XM_PERMUTE_PS( V1, _MM_SHUFFLE(2, 2, 2, 2) );
  10446. XMVECTOR Y = XM_PERMUTE_PS( V1, _MM_SHUFFLE(1, 1, 1, 1) );
  10447. XMVECTOR X = XM_PERMUTE_PS( V1, _MM_SHUFFLE(0, 0, 0, 0) );
  10448. XMVECTOR vTemp = _mm_mul_ps( Z, Transform.r[2] );
  10449. XMVECTOR vTemp2 = _mm_mul_ps( Y, Transform.r[1] );
  10450. XMVECTOR vTemp3 = _mm_mul_ps( X, Transform.r[0] );
  10451. vTemp = _mm_add_ps( vTemp, Transform.r[3] );
  10452. vTemp = _mm_add_ps( vTemp, vTemp2 );
  10453. vTemp = _mm_add_ps( vTemp, vTemp3 );
  10454. XMVECTOR W = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(3, 3, 3, 3) );
  10455. V1 = _mm_div_ps( vTemp, W );
  10456. // Result 2
  10457. V2 = _mm_mul_ps( V2, Scale );
  10458. V2 = _mm_add_ps( V2, Offset );
  10459. Z = XM_PERMUTE_PS( V2, _MM_SHUFFLE(2, 2, 2, 2) );
  10460. Y = XM_PERMUTE_PS( V2, _MM_SHUFFLE(1, 1, 1, 1) );
  10461. X = XM_PERMUTE_PS( V2, _MM_SHUFFLE(0, 0, 0, 0) );
  10462. vTemp = _mm_mul_ps( Z, Transform.r[2] );
  10463. vTemp2 = _mm_mul_ps( Y, Transform.r[1] );
  10464. vTemp3 = _mm_mul_ps( X, Transform.r[0] );
  10465. vTemp = _mm_add_ps( vTemp, Transform.r[3] );
  10466. vTemp = _mm_add_ps( vTemp, vTemp2 );
  10467. vTemp = _mm_add_ps( vTemp, vTemp3 );
  10468. W = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(3, 3, 3, 3) );
  10469. V2 = _mm_div_ps( vTemp, W );
  10470. // Result 3
  10471. V3 = _mm_mul_ps( V3, Scale );
  10472. V3 = _mm_add_ps( V3, Offset );
  10473. Z = XM_PERMUTE_PS( V3, _MM_SHUFFLE(2, 2, 2, 2) );
  10474. Y = XM_PERMUTE_PS( V3, _MM_SHUFFLE(1, 1, 1, 1) );
  10475. X = XM_PERMUTE_PS( V3, _MM_SHUFFLE(0, 0, 0, 0) );
  10476. vTemp = _mm_mul_ps( Z, Transform.r[2] );
  10477. vTemp2 = _mm_mul_ps( Y, Transform.r[1] );
  10478. vTemp3 = _mm_mul_ps( X, Transform.r[0] );
  10479. vTemp = _mm_add_ps( vTemp, Transform.r[3] );
  10480. vTemp = _mm_add_ps( vTemp, vTemp2 );
  10481. vTemp = _mm_add_ps( vTemp, vTemp3 );
  10482. W = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(3, 3, 3, 3) );
  10483. V3 = _mm_div_ps( vTemp, W );
  10484. // Result 4
  10485. V4 = _mm_mul_ps( V4, Scale );
  10486. V4 = _mm_add_ps( V4, Offset );
  10487. Z = XM_PERMUTE_PS( V4, _MM_SHUFFLE(2, 2, 2, 2) );
  10488. Y = XM_PERMUTE_PS( V4, _MM_SHUFFLE(1, 1, 1, 1) );
  10489. X = XM_PERMUTE_PS( V4, _MM_SHUFFLE(0, 0, 0, 0) );
  10490. vTemp = _mm_mul_ps( Z, Transform.r[2] );
  10491. vTemp2 = _mm_mul_ps( Y, Transform.r[1] );
  10492. vTemp3 = _mm_mul_ps( X, Transform.r[0] );
  10493. vTemp = _mm_add_ps( vTemp, Transform.r[3] );
  10494. vTemp = _mm_add_ps( vTemp, vTemp2 );
  10495. vTemp = _mm_add_ps( vTemp, vTemp3 );
  10496. W = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(3, 3, 3, 3) );
  10497. V4 = _mm_div_ps( vTemp, W );
  10498. // Pack and store the vectors
  10499. XM3PACK4INTO3(vTemp);
  10500. XM_STREAM_PS( reinterpret_cast<float*>(pOutputVector), V1 );
  10501. XM_STREAM_PS( reinterpret_cast<float*>(pOutputVector+16), vTemp );
  10502. XM_STREAM_PS( reinterpret_cast<float*>(pOutputVector+32), V3 );
  10503. pOutputVector += sizeof(XMFLOAT3)*4;
  10504. i += 4;
  10505. }
  10506. }
  10507. else
  10508. {
  10509. // Packed input, unaligned & packed output
  10510. for (size_t j = 0; j < four; ++j)
  10511. {
  10512. __m128 V1 = _mm_loadu_ps( reinterpret_cast<const float*>(pInputVector) );
  10513. __m128 L2 = _mm_loadu_ps( reinterpret_cast<const float*>(pInputVector+16) );
  10514. __m128 L3 = _mm_loadu_ps( reinterpret_cast<const float*>(pInputVector+32) );
  10515. pInputVector += sizeof(XMFLOAT3)*4;
  10516. // Unpack the 4 vectors (.w components are junk)
  10517. XM3UNPACK3INTO4(V1,L2,L3);
  10518. // Result 1
  10519. V1 = _mm_mul_ps( V1, Scale );
  10520. V1 = _mm_add_ps( V1, Offset );
  10521. XMVECTOR Z = XM_PERMUTE_PS( V1, _MM_SHUFFLE(2, 2, 2, 2) );
  10522. XMVECTOR Y = XM_PERMUTE_PS( V1, _MM_SHUFFLE(1, 1, 1, 1) );
  10523. XMVECTOR X = XM_PERMUTE_PS( V1, _MM_SHUFFLE(0, 0, 0, 0) );
  10524. XMVECTOR vTemp = _mm_mul_ps( Z, Transform.r[2] );
  10525. XMVECTOR vTemp2 = _mm_mul_ps( Y, Transform.r[1] );
  10526. XMVECTOR vTemp3 = _mm_mul_ps( X, Transform.r[0] );
  10527. vTemp = _mm_add_ps( vTemp, Transform.r[3] );
  10528. vTemp = _mm_add_ps( vTemp, vTemp2 );
  10529. vTemp = _mm_add_ps( vTemp, vTemp3 );
  10530. XMVECTOR W = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(3, 3, 3, 3) );
  10531. V1 = _mm_div_ps( vTemp, W );
  10532. // Result 2
  10533. V2 = _mm_mul_ps( V2, Scale );
  10534. V2 = _mm_add_ps( V2, Offset );
  10535. Z = XM_PERMUTE_PS( V2, _MM_SHUFFLE(2, 2, 2, 2) );
  10536. Y = XM_PERMUTE_PS( V2, _MM_SHUFFLE(1, 1, 1, 1) );
  10537. X = XM_PERMUTE_PS( V2, _MM_SHUFFLE(0, 0, 0, 0) );
  10538. vTemp = _mm_mul_ps( Z, Transform.r[2] );
  10539. vTemp2 = _mm_mul_ps( Y, Transform.r[1] );
  10540. vTemp3 = _mm_mul_ps( X, Transform.r[0] );
  10541. vTemp = _mm_add_ps( vTemp, Transform.r[3] );
  10542. vTemp = _mm_add_ps( vTemp, vTemp2 );
  10543. vTemp = _mm_add_ps( vTemp, vTemp3 );
  10544. W = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(3, 3, 3, 3) );
  10545. V2 = _mm_div_ps( vTemp, W );
  10546. // Result 3
  10547. V3 = _mm_mul_ps( V3, Scale );
  10548. V3 = _mm_add_ps( V3, Offset );
  10549. Z = XM_PERMUTE_PS( V3, _MM_SHUFFLE(2, 2, 2, 2) );
  10550. Y = XM_PERMUTE_PS( V3, _MM_SHUFFLE(1, 1, 1, 1) );
  10551. X = XM_PERMUTE_PS( V3, _MM_SHUFFLE(0, 0, 0, 0) );
  10552. vTemp = _mm_mul_ps( Z, Transform.r[2] );
  10553. vTemp2 = _mm_mul_ps( Y, Transform.r[1] );
  10554. vTemp3 = _mm_mul_ps( X, Transform.r[0] );
  10555. vTemp = _mm_add_ps( vTemp, Transform.r[3] );
  10556. vTemp = _mm_add_ps( vTemp, vTemp2 );
  10557. vTemp = _mm_add_ps( vTemp, vTemp3 );
  10558. W = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(3, 3, 3, 3) );
  10559. V3 = _mm_div_ps( vTemp, W );
  10560. // Result 4
  10561. V4 = _mm_mul_ps( V4, Scale );
  10562. V4 = _mm_add_ps( V4, Offset );
  10563. Z = XM_PERMUTE_PS( V4, _MM_SHUFFLE(2, 2, 2, 2) );
  10564. Y = XM_PERMUTE_PS( V4, _MM_SHUFFLE(1, 1, 1, 1) );
  10565. X = XM_PERMUTE_PS( V4, _MM_SHUFFLE(0, 0, 0, 0) );
  10566. vTemp = _mm_mul_ps( Z, Transform.r[2] );
  10567. vTemp2 = _mm_mul_ps( Y, Transform.r[1] );
  10568. vTemp3 = _mm_mul_ps( X, Transform.r[0] );
  10569. vTemp = _mm_add_ps( vTemp, Transform.r[3] );
  10570. vTemp = _mm_add_ps( vTemp, vTemp2 );
  10571. vTemp = _mm_add_ps( vTemp, vTemp3 );
  10572. W = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(3, 3, 3, 3) );
  10573. V4 = _mm_div_ps( vTemp, W );
  10574. // Pack and store the vectors
  10575. XM3PACK4INTO3(vTemp);
  10576. _mm_storeu_ps( reinterpret_cast<float*>(pOutputVector), V1 );
  10577. _mm_storeu_ps( reinterpret_cast<float*>(pOutputVector+16), vTemp );
  10578. _mm_storeu_ps( reinterpret_cast<float*>(pOutputVector+32), V3 );
  10579. pOutputVector += sizeof(XMFLOAT3)*4;
  10580. i += 4;
  10581. }
  10582. }
  10583. }
  10584. else
  10585. {
  10586. // Packed input, unpacked output
  10587. for (size_t j = 0; j < four; ++j)
  10588. {
  10589. __m128 V1 = _mm_loadu_ps( reinterpret_cast<const float*>(pInputVector) );
  10590. __m128 L2 = _mm_loadu_ps( reinterpret_cast<const float*>(pInputVector+16) );
  10591. __m128 L3 = _mm_loadu_ps( reinterpret_cast<const float*>(pInputVector+32) );
  10592. pInputVector += sizeof(XMFLOAT3)*4;
  10593. // Unpack the 4 vectors (.w components are junk)
  10594. XM3UNPACK3INTO4(V1,L2,L3);
  10595. // Result 1
  10596. V1 = _mm_mul_ps( V1, Scale );
  10597. V1 = _mm_add_ps( V1, Offset );
  10598. XMVECTOR Z = XM_PERMUTE_PS( V1, _MM_SHUFFLE(2, 2, 2, 2) );
  10599. XMVECTOR Y = XM_PERMUTE_PS( V1, _MM_SHUFFLE(1, 1, 1, 1) );
  10600. XMVECTOR X = XM_PERMUTE_PS( V1, _MM_SHUFFLE(0, 0, 0, 0) );
  10601. XMVECTOR vTemp = _mm_mul_ps( Z, Transform.r[2] );
  10602. XMVECTOR vTemp2 = _mm_mul_ps( Y, Transform.r[1] );
  10603. XMVECTOR vTemp3 = _mm_mul_ps( X, Transform.r[0] );
  10604. vTemp = _mm_add_ps( vTemp, Transform.r[3] );
  10605. vTemp = _mm_add_ps( vTemp, vTemp2 );
  10606. vTemp = _mm_add_ps( vTemp, vTemp3 );
  10607. XMVECTOR W = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(3, 3, 3, 3) );
  10608. vTemp = _mm_div_ps( vTemp, W );
  10609. XMStoreFloat3(reinterpret_cast<XMFLOAT3*>(pOutputVector), vTemp);
  10610. pOutputVector += OutputStride;
  10611. // Result 2
  10612. V2 = _mm_mul_ps( V2, Scale );
  10613. V2 = _mm_add_ps( V2, Offset );
  10614. Z = XM_PERMUTE_PS( V2, _MM_SHUFFLE(2, 2, 2, 2) );
  10615. Y = XM_PERMUTE_PS( V2, _MM_SHUFFLE(1, 1, 1, 1) );
  10616. X = XM_PERMUTE_PS( V2, _MM_SHUFFLE(0, 0, 0, 0) );
  10617. vTemp = _mm_mul_ps( Z, Transform.r[2] );
  10618. vTemp2 = _mm_mul_ps( Y, Transform.r[1] );
  10619. vTemp3 = _mm_mul_ps( X, Transform.r[0] );
  10620. vTemp = _mm_add_ps( vTemp, Transform.r[3] );
  10621. vTemp = _mm_add_ps( vTemp, vTemp2 );
  10622. vTemp = _mm_add_ps( vTemp, vTemp3 );
  10623. W = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(3, 3, 3, 3) );
  10624. vTemp = _mm_div_ps( vTemp, W );
  10625. XMStoreFloat3(reinterpret_cast<XMFLOAT3*>(pOutputVector), vTemp);
  10626. pOutputVector += OutputStride;
  10627. // Result 3
  10628. V3 = _mm_mul_ps( V3, Scale );
  10629. V3 = _mm_add_ps( V3, Offset );
  10630. Z = XM_PERMUTE_PS( V3, _MM_SHUFFLE(2, 2, 2, 2) );
  10631. Y = XM_PERMUTE_PS( V3, _MM_SHUFFLE(1, 1, 1, 1) );
  10632. X = XM_PERMUTE_PS( V3, _MM_SHUFFLE(0, 0, 0, 0) );
  10633. vTemp = _mm_mul_ps( Z, Transform.r[2] );
  10634. vTemp2 = _mm_mul_ps( Y, Transform.r[1] );
  10635. vTemp3 = _mm_mul_ps( X, Transform.r[0] );
  10636. vTemp = _mm_add_ps( vTemp, Transform.r[3] );
  10637. vTemp = _mm_add_ps( vTemp, vTemp2 );
  10638. vTemp = _mm_add_ps( vTemp, vTemp3 );
  10639. W = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(3, 3, 3, 3) );
  10640. vTemp = _mm_div_ps( vTemp, W );
  10641. XMStoreFloat3(reinterpret_cast<XMFLOAT3*>(pOutputVector), vTemp);
  10642. pOutputVector += OutputStride;
  10643. // Result 4
  10644. V4 = _mm_mul_ps( V4, Scale );
  10645. V4 = _mm_add_ps( V4, Offset );
  10646. Z = XM_PERMUTE_PS( V4, _MM_SHUFFLE(2, 2, 2, 2) );
  10647. Y = XM_PERMUTE_PS( V4, _MM_SHUFFLE(1, 1, 1, 1) );
  10648. X = XM_PERMUTE_PS( V4, _MM_SHUFFLE(0, 0, 0, 0) );
  10649. vTemp = _mm_mul_ps( Z, Transform.r[2] );
  10650. vTemp2 = _mm_mul_ps( Y, Transform.r[1] );
  10651. vTemp3 = _mm_mul_ps( X, Transform.r[0] );
  10652. vTemp = _mm_add_ps( vTemp, Transform.r[3] );
  10653. vTemp = _mm_add_ps( vTemp, vTemp2 );
  10654. vTemp = _mm_add_ps( vTemp, vTemp3 );
  10655. W = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(3, 3, 3, 3) );
  10656. vTemp = _mm_div_ps( vTemp, W );
  10657. XMStoreFloat3(reinterpret_cast<XMFLOAT3*>(pOutputVector), vTemp);
  10658. pOutputVector += OutputStride;
  10659. i += 4;
  10660. }
  10661. }
  10662. }
  10663. }
  10664. for (; i < VectorCount; i++)
  10665. {
  10666. XMVECTOR V = XMLoadFloat3(reinterpret_cast<const XMFLOAT3*>(pInputVector));
  10667. pInputVector += InputStride;
  10668. V = _mm_mul_ps( V, Scale );
  10669. V = _mm_add_ps( V, Offset );
  10670. XMVECTOR Z = XM_PERMUTE_PS( V, _MM_SHUFFLE(2, 2, 2, 2) );
  10671. XMVECTOR Y = XM_PERMUTE_PS( V, _MM_SHUFFLE(1, 1, 1, 1) );
  10672. XMVECTOR X = XM_PERMUTE_PS( V, _MM_SHUFFLE(0, 0, 0, 0) );
  10673. XMVECTOR vTemp = _mm_mul_ps( Z, Transform.r[2] );
  10674. XMVECTOR vTemp2 = _mm_mul_ps( Y, Transform.r[1] );
  10675. XMVECTOR vTemp3 = _mm_mul_ps( X, Transform.r[0] );
  10676. vTemp = _mm_add_ps( vTemp, Transform.r[3] );
  10677. vTemp = _mm_add_ps( vTemp, vTemp2 );
  10678. vTemp = _mm_add_ps( vTemp, vTemp3 );
  10679. XMVECTOR W = XM_PERMUTE_PS( vTemp, _MM_SHUFFLE(3, 3, 3, 3) );
  10680. vTemp = _mm_div_ps( vTemp, W );
  10681. XMStoreFloat3(reinterpret_cast<XMFLOAT3*>(pOutputVector), vTemp);
  10682. pOutputVector += OutputStride;
  10683. }
  10684. XM_SFENCE();
  10685. return pOutputStream;
  10686. #endif
  10687. }
  10688. #ifdef _PREFAST_
  10689. #pragma prefast(pop)
  10690. #endif
  10691. /****************************************************************************
  10692. *
  10693. * 4D Vector
  10694. *
  10695. ****************************************************************************/
  10696. //------------------------------------------------------------------------------
  10697. // Comparison operations
  10698. //------------------------------------------------------------------------------
  10699. //------------------------------------------------------------------------------
  10700. inline bool XM_CALLCONV XMVector4Equal
  10701. (
  10702. FXMVECTOR V1,
  10703. FXMVECTOR V2
  10704. )
  10705. {
  10706. #if defined(_XM_NO_INTRINSICS_)
  10707. return (((V1.vector4_f32[0] == V2.vector4_f32[0]) && (V1.vector4_f32[1] == V2.vector4_f32[1]) && (V1.vector4_f32[2] == V2.vector4_f32[2]) && (V1.vector4_f32[3] == V2.vector4_f32[3])) != 0);
  10708. #elif defined(_XM_ARM_NEON_INTRINSICS_)
  10709. uint32x4_t vResult = vceqq_f32( V1, V2 );
  10710. int8x8x2_t vTemp = vzip_u8(vget_low_u8(vResult), vget_high_u8(vResult));
  10711. vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]);
  10712. return ( vget_lane_u32(vTemp.val[1], 1) == 0xFFFFFFFFU );
  10713. #elif defined(_XM_SSE_INTRINSICS_)
  10714. XMVECTOR vTemp = _mm_cmpeq_ps(V1,V2);
  10715. return ((_mm_movemask_ps(vTemp)==0x0f) != 0);
  10716. #else
  10717. return XMComparisonAllTrue(XMVector4EqualR(V1, V2));
  10718. #endif
  10719. }
  10720. //------------------------------------------------------------------------------
  10721. inline uint32_t XM_CALLCONV XMVector4EqualR
  10722. (
  10723. FXMVECTOR V1,
  10724. FXMVECTOR V2
  10725. )
  10726. {
  10727. #if defined(_XM_NO_INTRINSICS_)
  10728. uint32_t CR = 0;
  10729. if ((V1.vector4_f32[0] == V2.vector4_f32[0]) &&
  10730. (V1.vector4_f32[1] == V2.vector4_f32[1]) &&
  10731. (V1.vector4_f32[2] == V2.vector4_f32[2]) &&
  10732. (V1.vector4_f32[3] == V2.vector4_f32[3]))
  10733. {
  10734. CR = XM_CRMASK_CR6TRUE;
  10735. }
  10736. else if ((V1.vector4_f32[0] != V2.vector4_f32[0]) &&
  10737. (V1.vector4_f32[1] != V2.vector4_f32[1]) &&
  10738. (V1.vector4_f32[2] != V2.vector4_f32[2]) &&
  10739. (V1.vector4_f32[3] != V2.vector4_f32[3]))
  10740. {
  10741. CR = XM_CRMASK_CR6FALSE;
  10742. }
  10743. return CR;
  10744. #elif defined(_XM_ARM_NEON_INTRINSICS_)
  10745. uint32x4_t vResult = vceqq_f32( V1, V2 );
  10746. int8x8x2_t vTemp = vzip_u8(vget_low_u8(vResult), vget_high_u8(vResult));
  10747. vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]);
  10748. uint32_t r = vget_lane_u32(vTemp.val[1], 1);
  10749. uint32_t CR = 0;
  10750. if ( r == 0xFFFFFFFFU )
  10751. {
  10752. CR = XM_CRMASK_CR6TRUE;
  10753. }
  10754. else if ( !r )
  10755. {
  10756. CR = XM_CRMASK_CR6FALSE;
  10757. }
  10758. return CR;
  10759. #elif defined(_XM_SSE_INTRINSICS_)
  10760. XMVECTOR vTemp = _mm_cmpeq_ps(V1,V2);
  10761. int iTest = _mm_movemask_ps(vTemp);
  10762. uint32_t CR = 0;
  10763. if (iTest==0xf) // All equal?
  10764. {
  10765. CR = XM_CRMASK_CR6TRUE;
  10766. }
  10767. else if (iTest==0) // All not equal?
  10768. {
  10769. CR = XM_CRMASK_CR6FALSE;
  10770. }
  10771. return CR;
  10772. #endif
  10773. }
  10774. //------------------------------------------------------------------------------
  10775. inline bool XM_CALLCONV XMVector4EqualInt
  10776. (
  10777. FXMVECTOR V1,
  10778. FXMVECTOR V2
  10779. )
  10780. {
  10781. #if defined(_XM_NO_INTRINSICS_)
  10782. return (((V1.vector4_u32[0] == V2.vector4_u32[0]) && (V1.vector4_u32[1] == V2.vector4_u32[1]) && (V1.vector4_u32[2] == V2.vector4_u32[2]) && (V1.vector4_u32[3] == V2.vector4_u32[3])) != 0);
  10783. #elif defined(_XM_ARM_NEON_INTRINSICS_)
  10784. uint32x4_t vResult = vceqq_u32( V1, V2 );
  10785. int8x8x2_t vTemp = vzip_u8(vget_low_u8(vResult), vget_high_u8(vResult));
  10786. vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]);
  10787. return ( vget_lane_u32(vTemp.val[1], 1) == 0xFFFFFFFFU );
  10788. #elif defined(_XM_SSE_INTRINSICS_)
  10789. __m128i vTemp = _mm_cmpeq_epi32(_mm_castps_si128(V1),_mm_castps_si128(V2));
  10790. return ((_mm_movemask_ps(_mm_castsi128_ps(vTemp))==0xf) != 0);
  10791. #else
  10792. return XMComparisonAllTrue(XMVector4EqualIntR(V1, V2));
  10793. #endif
  10794. }
  10795. //------------------------------------------------------------------------------
  10796. inline uint32_t XM_CALLCONV XMVector4EqualIntR
  10797. (
  10798. FXMVECTOR V1,
  10799. FXMVECTOR V2
  10800. )
  10801. {
  10802. #if defined(_XM_NO_INTRINSICS_)
  10803. uint32_t CR = 0;
  10804. if (V1.vector4_u32[0] == V2.vector4_u32[0] &&
  10805. V1.vector4_u32[1] == V2.vector4_u32[1] &&
  10806. V1.vector4_u32[2] == V2.vector4_u32[2] &&
  10807. V1.vector4_u32[3] == V2.vector4_u32[3])
  10808. {
  10809. CR = XM_CRMASK_CR6TRUE;
  10810. }
  10811. else if (V1.vector4_u32[0] != V2.vector4_u32[0] &&
  10812. V1.vector4_u32[1] != V2.vector4_u32[1] &&
  10813. V1.vector4_u32[2] != V2.vector4_u32[2] &&
  10814. V1.vector4_u32[3] != V2.vector4_u32[3])
  10815. {
  10816. CR = XM_CRMASK_CR6FALSE;
  10817. }
  10818. return CR;
  10819. #elif defined(_XM_ARM_NEON_INTRINSICS_)
  10820. uint32x4_t vResult = vceqq_u32( V1, V2 );
  10821. int8x8x2_t vTemp = vzip_u8(vget_low_u8(vResult), vget_high_u8(vResult));
  10822. vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]);
  10823. uint32_t r = vget_lane_u32(vTemp.val[1], 1);
  10824. uint32_t CR = 0;
  10825. if ( r == 0xFFFFFFFFU )
  10826. {
  10827. CR = XM_CRMASK_CR6TRUE;
  10828. }
  10829. else if ( !r )
  10830. {
  10831. CR = XM_CRMASK_CR6FALSE;
  10832. }
  10833. return CR;
  10834. #elif defined(_XM_SSE_INTRINSICS_)
  10835. __m128i vTemp = _mm_cmpeq_epi32(_mm_castps_si128(V1),_mm_castps_si128(V2));
  10836. int iTest = _mm_movemask_ps(_mm_castsi128_ps(vTemp));
  10837. uint32_t CR = 0;
  10838. if (iTest==0xf) // All equal?
  10839. {
  10840. CR = XM_CRMASK_CR6TRUE;
  10841. }
  10842. else if (iTest==0) // All not equal?
  10843. {
  10844. CR = XM_CRMASK_CR6FALSE;
  10845. }
  10846. return CR;
  10847. #endif
  10848. }
  10849. inline bool XM_CALLCONV XMVector4NearEqual
  10850. (
  10851. FXMVECTOR V1,
  10852. FXMVECTOR V2,
  10853. FXMVECTOR Epsilon
  10854. )
  10855. {
  10856. #if defined(_XM_NO_INTRINSICS_)
  10857. float dx, dy, dz, dw;
  10858. dx = fabsf(V1.vector4_f32[0]-V2.vector4_f32[0]);
  10859. dy = fabsf(V1.vector4_f32[1]-V2.vector4_f32[1]);
  10860. dz = fabsf(V1.vector4_f32[2]-V2.vector4_f32[2]);
  10861. dw = fabsf(V1.vector4_f32[3]-V2.vector4_f32[3]);
  10862. return (((dx <= Epsilon.vector4_f32[0]) &&
  10863. (dy <= Epsilon.vector4_f32[1]) &&
  10864. (dz <= Epsilon.vector4_f32[2]) &&
  10865. (dw <= Epsilon.vector4_f32[3])) != 0);
  10866. #elif defined(_XM_ARM_NEON_INTRINSICS_)
  10867. float32x4_t vDelta = vsubq_f32( V1, V2 );
  10868. uint32x4_t vResult = vacleq_f32( vDelta, Epsilon );
  10869. int8x8x2_t vTemp = vzip_u8(vget_low_u8(vResult), vget_high_u8(vResult));
  10870. vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]);
  10871. return ( vget_lane_u32(vTemp.val[1], 1) == 0xFFFFFFFFU );
  10872. #elif defined(_XM_SSE_INTRINSICS_)
  10873. // Get the difference
  10874. XMVECTOR vDelta = _mm_sub_ps(V1,V2);
  10875. // Get the absolute value of the difference
  10876. XMVECTOR vTemp = _mm_setzero_ps();
  10877. vTemp = _mm_sub_ps(vTemp,vDelta);
  10878. vTemp = _mm_max_ps(vTemp,vDelta);
  10879. vTemp = _mm_cmple_ps(vTemp,Epsilon);
  10880. return ((_mm_movemask_ps(vTemp)==0xf) != 0);
  10881. #endif
  10882. }
  10883. //------------------------------------------------------------------------------
  10884. inline bool XM_CALLCONV XMVector4NotEqual
  10885. (
  10886. FXMVECTOR V1,
  10887. FXMVECTOR V2
  10888. )
  10889. {
  10890. #if defined(_XM_NO_INTRINSICS_)
  10891. return (((V1.vector4_f32[0] != V2.vector4_f32[0]) || (V1.vector4_f32[1] != V2.vector4_f32[1]) || (V1.vector4_f32[2] != V2.vector4_f32[2]) || (V1.vector4_f32[3] != V2.vector4_f32[3])) != 0);
  10892. #elif defined(_XM_ARM_NEON_INTRINSICS_)
  10893. uint32x4_t vResult = vceqq_f32( V1, V2 );
  10894. int8x8x2_t vTemp = vzip_u8(vget_low_u8(vResult), vget_high_u8(vResult));
  10895. vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]);
  10896. return ( vget_lane_u32(vTemp.val[1], 1) != 0xFFFFFFFFU );
  10897. #elif defined(_XM_SSE_INTRINSICS_)
  10898. XMVECTOR vTemp = _mm_cmpneq_ps(V1,V2);
  10899. return ((_mm_movemask_ps(vTemp)) != 0);
  10900. #else
  10901. return XMComparisonAnyFalse(XMVector4EqualR(V1, V2));
  10902. #endif
  10903. }
  10904. //------------------------------------------------------------------------------
  10905. inline bool XM_CALLCONV XMVector4NotEqualInt
  10906. (
  10907. FXMVECTOR V1,
  10908. FXMVECTOR V2
  10909. )
  10910. {
  10911. #if defined(_XM_NO_INTRINSICS_)
  10912. return (((V1.vector4_u32[0] != V2.vector4_u32[0]) || (V1.vector4_u32[1] != V2.vector4_u32[1]) || (V1.vector4_u32[2] != V2.vector4_u32[2]) || (V1.vector4_u32[3] != V2.vector4_u32[3])) != 0);
  10913. #elif defined(_XM_ARM_NEON_INTRINSICS_)
  10914. uint32x4_t vResult = vceqq_u32( V1, V2 );
  10915. int8x8x2_t vTemp = vzip_u8(vget_low_u8(vResult), vget_high_u8(vResult));
  10916. vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]);
  10917. return ( vget_lane_u32(vTemp.val[1], 1) != 0xFFFFFFFFU );
  10918. #elif defined(_XM_SSE_INTRINSICS_)
  10919. __m128i vTemp = _mm_cmpeq_epi32(_mm_castps_si128(V1),_mm_castps_si128(V2));
  10920. return ((_mm_movemask_ps(_mm_castsi128_ps(vTemp))!=0xF) != 0);
  10921. #else
  10922. return XMComparisonAnyFalse(XMVector4EqualIntR(V1, V2));
  10923. #endif
  10924. }
  10925. //------------------------------------------------------------------------------
  10926. inline bool XM_CALLCONV XMVector4Greater
  10927. (
  10928. FXMVECTOR V1,
  10929. FXMVECTOR V2
  10930. )
  10931. {
  10932. #if defined(_XM_NO_INTRINSICS_)
  10933. return (((V1.vector4_f32[0] > V2.vector4_f32[0]) && (V1.vector4_f32[1] > V2.vector4_f32[1]) && (V1.vector4_f32[2] > V2.vector4_f32[2]) && (V1.vector4_f32[3] > V2.vector4_f32[3])) != 0);
  10934. #elif defined(_XM_ARM_NEON_INTRINSICS_)
  10935. uint32x4_t vResult = vcgtq_f32( V1, V2 );
  10936. int8x8x2_t vTemp = vzip_u8(vget_low_u8(vResult), vget_high_u8(vResult));
  10937. vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]);
  10938. return ( vget_lane_u32(vTemp.val[1], 1) == 0xFFFFFFFFU );
  10939. #elif defined(_XM_SSE_INTRINSICS_)
  10940. XMVECTOR vTemp = _mm_cmpgt_ps(V1,V2);
  10941. return ((_mm_movemask_ps(vTemp)==0x0f) != 0);
  10942. #else
  10943. return XMComparisonAllTrue(XMVector4GreaterR(V1, V2));
  10944. #endif
  10945. }
  10946. //------------------------------------------------------------------------------
  10947. inline uint32_t XM_CALLCONV XMVector4GreaterR
  10948. (
  10949. FXMVECTOR V1,
  10950. FXMVECTOR V2
  10951. )
  10952. {
  10953. #if defined(_XM_NO_INTRINSICS_)
  10954. uint32_t CR = 0;
  10955. if (V1.vector4_f32[0] > V2.vector4_f32[0] &&
  10956. V1.vector4_f32[1] > V2.vector4_f32[1] &&
  10957. V1.vector4_f32[2] > V2.vector4_f32[2] &&
  10958. V1.vector4_f32[3] > V2.vector4_f32[3])
  10959. {
  10960. CR = XM_CRMASK_CR6TRUE;
  10961. }
  10962. else if (V1.vector4_f32[0] <= V2.vector4_f32[0] &&
  10963. V1.vector4_f32[1] <= V2.vector4_f32[1] &&
  10964. V1.vector4_f32[2] <= V2.vector4_f32[2] &&
  10965. V1.vector4_f32[3] <= V2.vector4_f32[3])
  10966. {
  10967. CR = XM_CRMASK_CR6FALSE;
  10968. }
  10969. return CR;
  10970. #elif defined(_XM_ARM_NEON_INTRINSICS_)
  10971. uint32x4_t vResult = vcgtq_f32( V1, V2 );
  10972. int8x8x2_t vTemp = vzip_u8(vget_low_u8(vResult), vget_high_u8(vResult));
  10973. vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]);
  10974. uint32_t r = vget_lane_u32(vTemp.val[1], 1);
  10975. uint32_t CR = 0;
  10976. if ( r == 0xFFFFFFFFU )
  10977. {
  10978. CR = XM_CRMASK_CR6TRUE;
  10979. }
  10980. else if ( !r )
  10981. {
  10982. CR = XM_CRMASK_CR6FALSE;
  10983. }
  10984. return CR;
  10985. #elif defined(_XM_SSE_INTRINSICS_)
  10986. uint32_t CR = 0;
  10987. XMVECTOR vTemp = _mm_cmpgt_ps(V1,V2);
  10988. int iTest = _mm_movemask_ps(vTemp);
  10989. if (iTest==0xf) {
  10990. CR = XM_CRMASK_CR6TRUE;
  10991. }
  10992. else if (!iTest)
  10993. {
  10994. CR = XM_CRMASK_CR6FALSE;
  10995. }
  10996. return CR;
  10997. #endif
  10998. }
  10999. //------------------------------------------------------------------------------
  11000. inline bool XM_CALLCONV XMVector4GreaterOrEqual
  11001. (
  11002. FXMVECTOR V1,
  11003. FXMVECTOR V2
  11004. )
  11005. {
  11006. #if defined(_XM_NO_INTRINSICS_)
  11007. return (((V1.vector4_f32[0] >= V2.vector4_f32[0]) && (V1.vector4_f32[1] >= V2.vector4_f32[1]) && (V1.vector4_f32[2] >= V2.vector4_f32[2]) && (V1.vector4_f32[3] >= V2.vector4_f32[3])) != 0);
  11008. #elif defined(_XM_ARM_NEON_INTRINSICS_)
  11009. uint32x4_t vResult = vcgeq_f32( V1, V2 );
  11010. int8x8x2_t vTemp = vzip_u8(vget_low_u8(vResult), vget_high_u8(vResult));
  11011. vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]);
  11012. return ( vget_lane_u32(vTemp.val[1], 1) == 0xFFFFFFFFU );
  11013. #elif defined(_XM_SSE_INTRINSICS_)
  11014. XMVECTOR vTemp = _mm_cmpge_ps(V1,V2);
  11015. return ((_mm_movemask_ps(vTemp)==0x0f) != 0);
  11016. #else
  11017. return XMComparisonAllTrue(XMVector4GreaterOrEqualR(V1, V2));
  11018. #endif
  11019. }
  11020. //------------------------------------------------------------------------------
  11021. inline uint32_t XM_CALLCONV XMVector4GreaterOrEqualR
  11022. (
  11023. FXMVECTOR V1,
  11024. FXMVECTOR V2
  11025. )
  11026. {
  11027. #if defined(_XM_NO_INTRINSICS_)
  11028. uint32_t CR = 0;
  11029. if ((V1.vector4_f32[0] >= V2.vector4_f32[0]) &&
  11030. (V1.vector4_f32[1] >= V2.vector4_f32[1]) &&
  11031. (V1.vector4_f32[2] >= V2.vector4_f32[2]) &&
  11032. (V1.vector4_f32[3] >= V2.vector4_f32[3]))
  11033. {
  11034. CR = XM_CRMASK_CR6TRUE;
  11035. }
  11036. else if ((V1.vector4_f32[0] < V2.vector4_f32[0]) &&
  11037. (V1.vector4_f32[1] < V2.vector4_f32[1]) &&
  11038. (V1.vector4_f32[2] < V2.vector4_f32[2]) &&
  11039. (V1.vector4_f32[3] < V2.vector4_f32[3]))
  11040. {
  11041. CR = XM_CRMASK_CR6FALSE;
  11042. }
  11043. return CR;
  11044. #elif defined(_XM_ARM_NEON_INTRINSICS_)
  11045. uint32x4_t vResult = vcgeq_f32( V1, V2 );
  11046. int8x8x2_t vTemp = vzip_u8(vget_low_u8(vResult), vget_high_u8(vResult));
  11047. vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]);
  11048. uint32_t r = vget_lane_u32(vTemp.val[1], 1);
  11049. uint32_t CR = 0;
  11050. if ( r == 0xFFFFFFFFU )
  11051. {
  11052. CR = XM_CRMASK_CR6TRUE;
  11053. }
  11054. else if ( !r )
  11055. {
  11056. CR = XM_CRMASK_CR6FALSE;
  11057. }
  11058. return CR;
  11059. #elif defined(_XM_SSE_INTRINSICS_)
  11060. uint32_t CR = 0;
  11061. XMVECTOR vTemp = _mm_cmpge_ps(V1,V2);
  11062. int iTest = _mm_movemask_ps(vTemp);
  11063. if (iTest==0x0f)
  11064. {
  11065. CR = XM_CRMASK_CR6TRUE;
  11066. }
  11067. else if (!iTest)
  11068. {
  11069. CR = XM_CRMASK_CR6FALSE;
  11070. }
  11071. return CR;
  11072. #endif
  11073. }
  11074. //------------------------------------------------------------------------------
  11075. inline bool XM_CALLCONV XMVector4Less
  11076. (
  11077. FXMVECTOR V1,
  11078. FXMVECTOR V2
  11079. )
  11080. {
  11081. #if defined(_XM_NO_INTRINSICS_)
  11082. return (((V1.vector4_f32[0] < V2.vector4_f32[0]) && (V1.vector4_f32[1] < V2.vector4_f32[1]) && (V1.vector4_f32[2] < V2.vector4_f32[2]) && (V1.vector4_f32[3] < V2.vector4_f32[3])) != 0);
  11083. #elif defined(_XM_ARM_NEON_INTRINSICS_)
  11084. uint32x4_t vResult = vcltq_f32( V1, V2 );
  11085. int8x8x2_t vTemp = vzip_u8(vget_low_u8(vResult), vget_high_u8(vResult));
  11086. vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]);
  11087. return ( vget_lane_u32(vTemp.val[1], 1) == 0xFFFFFFFFU );
  11088. #elif defined(_XM_SSE_INTRINSICS_)
  11089. XMVECTOR vTemp = _mm_cmplt_ps(V1,V2);
  11090. return ((_mm_movemask_ps(vTemp)==0x0f) != 0);
  11091. #else
  11092. return XMComparisonAllTrue(XMVector4GreaterR(V2, V1));
  11093. #endif
  11094. }
  11095. //------------------------------------------------------------------------------
  11096. inline bool XM_CALLCONV XMVector4LessOrEqual
  11097. (
  11098. FXMVECTOR V1,
  11099. FXMVECTOR V2
  11100. )
  11101. {
  11102. #if defined(_XM_NO_INTRINSICS_)
  11103. return (((V1.vector4_f32[0] <= V2.vector4_f32[0]) && (V1.vector4_f32[1] <= V2.vector4_f32[1]) && (V1.vector4_f32[2] <= V2.vector4_f32[2]) && (V1.vector4_f32[3] <= V2.vector4_f32[3])) != 0);
  11104. #elif defined(_XM_ARM_NEON_INTRINSICS_)
  11105. uint32x4_t vResult = vcleq_f32( V1, V2 );
  11106. int8x8x2_t vTemp = vzip_u8(vget_low_u8(vResult), vget_high_u8(vResult));
  11107. vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]);
  11108. return ( vget_lane_u32(vTemp.val[1], 1) == 0xFFFFFFFFU );
  11109. #elif defined(_XM_SSE_INTRINSICS_)
  11110. XMVECTOR vTemp = _mm_cmple_ps(V1,V2);
  11111. return ((_mm_movemask_ps(vTemp)==0x0f) != 0);
  11112. #else
  11113. return XMComparisonAllTrue(XMVector4GreaterOrEqualR(V2, V1));
  11114. #endif
  11115. }
  11116. //------------------------------------------------------------------------------
  11117. inline bool XM_CALLCONV XMVector4InBounds
  11118. (
  11119. FXMVECTOR V,
  11120. FXMVECTOR Bounds
  11121. )
  11122. {
  11123. #if defined(_XM_NO_INTRINSICS_)
  11124. return (((V.vector4_f32[0] <= Bounds.vector4_f32[0] && V.vector4_f32[0] >= -Bounds.vector4_f32[0]) &&
  11125. (V.vector4_f32[1] <= Bounds.vector4_f32[1] && V.vector4_f32[1] >= -Bounds.vector4_f32[1]) &&
  11126. (V.vector4_f32[2] <= Bounds.vector4_f32[2] && V.vector4_f32[2] >= -Bounds.vector4_f32[2]) &&
  11127. (V.vector4_f32[3] <= Bounds.vector4_f32[3] && V.vector4_f32[3] >= -Bounds.vector4_f32[3])) != 0);
  11128. #elif defined(_XM_ARM_NEON_INTRINSICS_)
  11129. // Test if less than or equal
  11130. uint32x4_t ivTemp1 = vcleq_f32(V,Bounds);
  11131. // Negate the bounds
  11132. float32x4_t vTemp2 = vnegq_f32(Bounds);
  11133. // Test if greater or equal (Reversed)
  11134. uint32x4_t ivTemp2 = vcleq_f32(vTemp2,V);
  11135. // Blend answers
  11136. ivTemp1 = vandq_u32(ivTemp1,ivTemp2);
  11137. // in bounds?
  11138. int8x8x2_t vTemp = vzip_u8(vget_low_u8(ivTemp1), vget_high_u8(ivTemp1));
  11139. vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]);
  11140. return ( vget_lane_u32(vTemp.val[1], 1) == 0xFFFFFFFFU );
  11141. #elif defined(_XM_SSE_INTRINSICS_)
  11142. // Test if less than or equal
  11143. XMVECTOR vTemp1 = _mm_cmple_ps(V,Bounds);
  11144. // Negate the bounds
  11145. XMVECTOR vTemp2 = _mm_mul_ps(Bounds,g_XMNegativeOne);
  11146. // Test if greater or equal (Reversed)
  11147. vTemp2 = _mm_cmple_ps(vTemp2,V);
  11148. // Blend answers
  11149. vTemp1 = _mm_and_ps(vTemp1,vTemp2);
  11150. // All in bounds?
  11151. return ((_mm_movemask_ps(vTemp1)==0x0f) != 0);
  11152. #else
  11153. return XMComparisonAllInBounds(XMVector4InBoundsR(V, Bounds));
  11154. #endif
  11155. }
  11156. //------------------------------------------------------------------------------
  11157. inline bool XM_CALLCONV XMVector4IsNaN
  11158. (
  11159. FXMVECTOR V
  11160. )
  11161. {
  11162. #if defined(_XM_NO_INTRINSICS_)
  11163. return (XMISNAN(V.vector4_f32[0]) ||
  11164. XMISNAN(V.vector4_f32[1]) ||
  11165. XMISNAN(V.vector4_f32[2]) ||
  11166. XMISNAN(V.vector4_f32[3]));
  11167. #elif defined(_XM_ARM_NEON_INTRINSICS_)
  11168. // Test against itself. NaN is always not equal
  11169. uint32x4_t vTempNan = vceqq_f32( V, V );
  11170. int8x8x2_t vTemp = vzip_u8(vget_low_u8(vTempNan), vget_high_u8(vTempNan));
  11171. vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]);
  11172. // If any are NaN, the mask is zero
  11173. return ( vget_lane_u32(vTemp.val[1], 1) != 0xFFFFFFFFU );
  11174. #elif defined(_XM_SSE_INTRINSICS_)
  11175. // Test against itself. NaN is always not equal
  11176. XMVECTOR vTempNan = _mm_cmpneq_ps(V,V);
  11177. // If any are NaN, the mask is non-zero
  11178. return (_mm_movemask_ps(vTempNan)!=0);
  11179. #endif
  11180. }
  11181. //------------------------------------------------------------------------------
  11182. inline bool XM_CALLCONV XMVector4IsInfinite
  11183. (
  11184. FXMVECTOR V
  11185. )
  11186. {
  11187. #if defined(_XM_NO_INTRINSICS_)
  11188. return (XMISINF(V.vector4_f32[0]) ||
  11189. XMISINF(V.vector4_f32[1]) ||
  11190. XMISINF(V.vector4_f32[2]) ||
  11191. XMISINF(V.vector4_f32[3]));
  11192. #elif defined(_XM_ARM_NEON_INTRINSICS_)
  11193. // Mask off the sign bit
  11194. uint32x4_t vTempInf = vandq_u32( V, g_XMAbsMask );
  11195. // Compare to infinity
  11196. vTempInf = vceqq_f32(vTempInf, g_XMInfinity );
  11197. // If any are infinity, the signs are true.
  11198. int8x8x2_t vTemp = vzip_u8(vget_low_u8(vTempInf), vget_high_u8(vTempInf));
  11199. vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]);
  11200. return ( vget_lane_u32(vTemp.val[1], 1) != 0 );
  11201. #elif defined(_XM_SSE_INTRINSICS_)
  11202. // Mask off the sign bit
  11203. XMVECTOR vTemp = _mm_and_ps(V,g_XMAbsMask);
  11204. // Compare to infinity
  11205. vTemp = _mm_cmpeq_ps(vTemp,g_XMInfinity);
  11206. // If any are infinity, the signs are true.
  11207. return (_mm_movemask_ps(vTemp) != 0);
  11208. #endif
  11209. }
  11210. //------------------------------------------------------------------------------
  11211. // Computation operations
  11212. //------------------------------------------------------------------------------
  11213. //------------------------------------------------------------------------------
  11214. inline XMVECTOR XM_CALLCONV XMVector4Dot
  11215. (
  11216. FXMVECTOR V1,
  11217. FXMVECTOR V2
  11218. )
  11219. {
  11220. #if defined(_XM_NO_INTRINSICS_)
  11221. XMVECTORF32 Result;
  11222. Result.f[0] =
  11223. Result.f[1] =
  11224. Result.f[2] =
  11225. Result.f[3] = V1.vector4_f32[0] * V2.vector4_f32[0] + V1.vector4_f32[1] * V2.vector4_f32[1] + V1.vector4_f32[2] * V2.vector4_f32[2] + V1.vector4_f32[3] * V2.vector4_f32[3];
  11226. return Result.v;
  11227. #elif defined(_XM_ARM_NEON_INTRINSICS_)
  11228. float32x4_t vTemp = vmulq_f32( V1, V2 );
  11229. float32x2_t v1 = vget_low_f32( vTemp );
  11230. float32x2_t v2 = vget_high_f32( vTemp );
  11231. v1 = vadd_f32( v1, v2 );
  11232. v1 = vpadd_f32( v1, v1 );
  11233. return vcombine_f32( v1, v1 );
  11234. #elif defined(_XM_SSE4_INTRINSICS_)
  11235. return _mm_dp_ps( V1, V2, 0xff );
  11236. #elif defined(_XM_SSE3_INTRINSICS_)
  11237. XMVECTOR vTemp = _mm_mul_ps(V1, V2);
  11238. vTemp = _mm_hadd_ps(vTemp, vTemp);
  11239. return _mm_hadd_ps(vTemp, vTemp);
  11240. #elif defined(_XM_SSE_INTRINSICS_)
  11241. XMVECTOR vTemp2 = V2;
  11242. XMVECTOR vTemp = _mm_mul_ps(V1,vTemp2);
  11243. vTemp2 = _mm_shuffle_ps(vTemp2,vTemp,_MM_SHUFFLE(1,0,0,0)); // Copy X to the Z position and Y to the W position
  11244. vTemp2 = _mm_add_ps(vTemp2,vTemp); // Add Z = X+Z; W = Y+W;
  11245. vTemp = _mm_shuffle_ps(vTemp,vTemp2,_MM_SHUFFLE(0,3,0,0)); // Copy W to the Z position
  11246. vTemp = _mm_add_ps(vTemp,vTemp2); // Add Z and W together
  11247. return XM_PERMUTE_PS(vTemp,_MM_SHUFFLE(2,2,2,2)); // Splat Z and return
  11248. #endif
  11249. }
  11250. //------------------------------------------------------------------------------
  11251. inline XMVECTOR XM_CALLCONV XMVector4Cross
  11252. (
  11253. FXMVECTOR V1,
  11254. FXMVECTOR V2,
  11255. FXMVECTOR V3
  11256. )
  11257. {
  11258. // [ ((v2.z*v3.w-v2.w*v3.z)*v1.y)-((v2.y*v3.w-v2.w*v3.y)*v1.z)+((v2.y*v3.z-v2.z*v3.y)*v1.w),
  11259. // ((v2.w*v3.z-v2.z*v3.w)*v1.x)-((v2.w*v3.x-v2.x*v3.w)*v1.z)+((v2.z*v3.x-v2.x*v3.z)*v1.w),
  11260. // ((v2.y*v3.w-v2.w*v3.y)*v1.x)-((v2.x*v3.w-v2.w*v3.x)*v1.y)+((v2.x*v3.y-v2.y*v3.x)*v1.w),
  11261. // ((v2.z*v3.y-v2.y*v3.z)*v1.x)-((v2.z*v3.x-v2.x*v3.z)*v1.y)+((v2.y*v3.x-v2.x*v3.y)*v1.z) ]
  11262. #if defined(_XM_NO_INTRINSICS_)
  11263. XMVECTORF32 Result = { { {
  11264. (((V2.vector4_f32[2] * V3.vector4_f32[3]) - (V2.vector4_f32[3] * V3.vector4_f32[2]))*V1.vector4_f32[1]) - (((V2.vector4_f32[1] * V3.vector4_f32[3]) - (V2.vector4_f32[3] * V3.vector4_f32[1]))*V1.vector4_f32[2]) + (((V2.vector4_f32[1] * V3.vector4_f32[2]) - (V2.vector4_f32[2] * V3.vector4_f32[1]))*V1.vector4_f32[3]),
  11265. (((V2.vector4_f32[3] * V3.vector4_f32[2]) - (V2.vector4_f32[2] * V3.vector4_f32[3]))*V1.vector4_f32[0]) - (((V2.vector4_f32[3] * V3.vector4_f32[0]) - (V2.vector4_f32[0] * V3.vector4_f32[3]))*V1.vector4_f32[2]) + (((V2.vector4_f32[2] * V3.vector4_f32[0]) - (V2.vector4_f32[0] * V3.vector4_f32[2]))*V1.vector4_f32[3]),
  11266. (((V2.vector4_f32[1] * V3.vector4_f32[3]) - (V2.vector4_f32[3] * V3.vector4_f32[1]))*V1.vector4_f32[0]) - (((V2.vector4_f32[0] * V3.vector4_f32[3]) - (V2.vector4_f32[3] * V3.vector4_f32[0]))*V1.vector4_f32[1]) + (((V2.vector4_f32[0] * V3.vector4_f32[1]) - (V2.vector4_f32[1] * V3.vector4_f32[0]))*V1.vector4_f32[3]),
  11267. (((V2.vector4_f32[2] * V3.vector4_f32[1]) - (V2.vector4_f32[1] * V3.vector4_f32[2]))*V1.vector4_f32[0]) - (((V2.vector4_f32[2] * V3.vector4_f32[0]) - (V2.vector4_f32[0] * V3.vector4_f32[2]))*V1.vector4_f32[1]) + (((V2.vector4_f32[1] * V3.vector4_f32[0]) - (V2.vector4_f32[0] * V3.vector4_f32[1]))*V1.vector4_f32[2]),
  11268. } } };
  11269. return Result.v;
  11270. #elif defined(_XM_ARM_NEON_INTRINSICS_)
  11271. const float32x2_t select = vget_low_f32( g_XMMaskX );
  11272. // Term1: V2zwyz * V3wzwy
  11273. const float32x2_t v2xy = vget_low_f32(V2);
  11274. const float32x2_t v2zw = vget_high_f32(V2);
  11275. const float32x2_t v2yx = vrev64_f32(v2xy);
  11276. const float32x2_t v2wz = vrev64_f32(v2zw);
  11277. const float32x2_t v2yz = vbsl_f32( select, v2yx, v2wz );
  11278. const float32x2_t v3zw = vget_high_f32(V3);
  11279. const float32x2_t v3wz = vrev64_f32(v3zw);
  11280. const float32x2_t v3xy = vget_low_f32(V3);
  11281. const float32x2_t v3wy = vbsl_f32( select, v3wz, v3xy );
  11282. float32x4_t vTemp1 = vcombine_f32(v2zw,v2yz);
  11283. float32x4_t vTemp2 = vcombine_f32(v3wz,v3wy);
  11284. XMVECTOR vResult = vmulq_f32( vTemp1, vTemp2 );
  11285. // - V2wzwy * V3zwyz
  11286. const float32x2_t v2wy = vbsl_f32( select, v2wz, v2xy );
  11287. const float32x2_t v3yx = vrev64_f32(v3xy);
  11288. const float32x2_t v3yz = vbsl_f32( select, v3yx, v3wz );
  11289. vTemp1 = vcombine_f32(v2wz,v2wy);
  11290. vTemp2 = vcombine_f32(v3zw,v3yz);
  11291. vResult = vmlsq_f32( vResult, vTemp1, vTemp2 );
  11292. // term1 * V1yxxx
  11293. const float32x2_t v1xy = vget_low_f32(V1);
  11294. const float32x2_t v1yx = vrev64_f32(v1xy);
  11295. vTemp1 = vcombine_f32( v1yx, vdup_lane_f32( v1yx, 1 ) );
  11296. vResult = vmulq_f32( vResult, vTemp1 );
  11297. // Term2: V2ywxz * V3wxwx
  11298. const float32x2_t v2yw = vrev64_f32(v2wy);
  11299. const float32x2_t v2xz = vbsl_f32( select, v2xy, v2wz );
  11300. const float32x2_t v3wx = vbsl_f32( select, v3wz, v3yx );
  11301. vTemp1 = vcombine_f32(v2yw,v2xz);
  11302. vTemp2 = vcombine_f32(v3wx,v3wx);
  11303. float32x4_t vTerm = vmulq_f32( vTemp1, vTemp2 );
  11304. // - V2wxwx * V3ywxz
  11305. const float32x2_t v2wx = vbsl_f32( select, v2wz, v2yx );
  11306. const float32x2_t v3yw = vrev64_f32(v3wy);
  11307. const float32x2_t v3xz = vbsl_f32( select, v3xy, v3wz );
  11308. vTemp1 = vcombine_f32(v2wx,v2wx);
  11309. vTemp2 = vcombine_f32(v3yw,v3xz);
  11310. vTerm = vmlsq_f32( vTerm, vTemp1, vTemp2 );
  11311. // vResult - term2 * V1zzyy
  11312. const float32x2_t v1zw = vget_high_f32(V1);
  11313. vTemp1 = vcombine_f32( vdup_lane_f32(v1zw, 0), vdup_lane_f32(v1yx, 0) );
  11314. vResult = vmlsq_f32( vResult, vTerm, vTemp1 );
  11315. // Term3: V2yzxy * V3zxyx
  11316. const float32x2_t v3zx = vrev64_f32(v3xz);
  11317. vTemp1 = vcombine_f32(v2yz,v2xy);
  11318. vTemp2 = vcombine_f32(v3zx,v3yx);
  11319. vTerm = vmulq_f32( vTemp1, vTemp2 );
  11320. // - V2zxyx * V3yzxy
  11321. const float32x2_t v2zx = vrev64_f32(v2xz);
  11322. vTemp1 = vcombine_f32(v2zx,v2yx);
  11323. vTemp2 = vcombine_f32(v3yz,v3xy);
  11324. vTerm = vmlsq_f32( vTerm, vTemp1, vTemp2 );
  11325. // vResult + term3 * V1wwwz
  11326. const float32x2_t v1wz = vrev64_f32(v1zw);
  11327. vTemp1 = vcombine_f32( vdup_lane_f32( v1wz, 0 ), v1wz );
  11328. return vmlaq_f32( vResult, vTerm, vTemp1 );
  11329. #elif defined(_XM_SSE_INTRINSICS_)
  11330. // V2zwyz * V3wzwy
  11331. XMVECTOR vResult = XM_PERMUTE_PS(V2,_MM_SHUFFLE(2,1,3,2));
  11332. XMVECTOR vTemp3 = XM_PERMUTE_PS(V3,_MM_SHUFFLE(1,3,2,3));
  11333. vResult = _mm_mul_ps(vResult,vTemp3);
  11334. // - V2wzwy * V3zwyz
  11335. XMVECTOR vTemp2 = XM_PERMUTE_PS(V2,_MM_SHUFFLE(1,3,2,3));
  11336. vTemp3 = XM_PERMUTE_PS(vTemp3,_MM_SHUFFLE(1,3,0,1));
  11337. vTemp2 = _mm_mul_ps(vTemp2,vTemp3);
  11338. vResult = _mm_sub_ps(vResult,vTemp2);
  11339. // term1 * V1yxxx
  11340. XMVECTOR vTemp1 = XM_PERMUTE_PS(V1,_MM_SHUFFLE(0,0,0,1));
  11341. vResult = _mm_mul_ps(vResult,vTemp1);
  11342. // V2ywxz * V3wxwx
  11343. vTemp2 = XM_PERMUTE_PS(V2,_MM_SHUFFLE(2,0,3,1));
  11344. vTemp3 = XM_PERMUTE_PS(V3,_MM_SHUFFLE(0,3,0,3));
  11345. vTemp3 = _mm_mul_ps(vTemp3,vTemp2);
  11346. // - V2wxwx * V3ywxz
  11347. vTemp2 = XM_PERMUTE_PS(vTemp2,_MM_SHUFFLE(2,1,2,1));
  11348. vTemp1 = XM_PERMUTE_PS(V3,_MM_SHUFFLE(2,0,3,1));
  11349. vTemp2 = _mm_mul_ps(vTemp2,vTemp1);
  11350. vTemp3 = _mm_sub_ps(vTemp3,vTemp2);
  11351. // vResult - temp * V1zzyy
  11352. vTemp1 = XM_PERMUTE_PS(V1,_MM_SHUFFLE(1,1,2,2));
  11353. vTemp1 = _mm_mul_ps(vTemp1,vTemp3);
  11354. vResult = _mm_sub_ps(vResult,vTemp1);
  11355. // V2yzxy * V3zxyx
  11356. vTemp2 = XM_PERMUTE_PS(V2,_MM_SHUFFLE(1,0,2,1));
  11357. vTemp3 = XM_PERMUTE_PS(V3,_MM_SHUFFLE(0,1,0,2));
  11358. vTemp3 = _mm_mul_ps(vTemp3,vTemp2);
  11359. // - V2zxyx * V3yzxy
  11360. vTemp2 = XM_PERMUTE_PS(vTemp2,_MM_SHUFFLE(2,0,2,1));
  11361. vTemp1 = XM_PERMUTE_PS(V3,_MM_SHUFFLE(1,0,2,1));
  11362. vTemp1 = _mm_mul_ps(vTemp1,vTemp2);
  11363. vTemp3 = _mm_sub_ps(vTemp3,vTemp1);
  11364. // vResult + term * V1wwwz
  11365. vTemp1 = XM_PERMUTE_PS(V1,_MM_SHUFFLE(2,3,3,3));
  11366. vTemp3 = _mm_mul_ps(vTemp3,vTemp1);
  11367. vResult = _mm_add_ps(vResult,vTemp3);
  11368. return vResult;
  11369. #endif
  11370. }
  11371. //------------------------------------------------------------------------------
  11372. inline XMVECTOR XM_CALLCONV XMVector4LengthSq
  11373. (
  11374. FXMVECTOR V
  11375. )
  11376. {
  11377. return XMVector4Dot(V, V);
  11378. }
  11379. //------------------------------------------------------------------------------
  11380. inline XMVECTOR XM_CALLCONV XMVector4ReciprocalLengthEst
  11381. (
  11382. FXMVECTOR V
  11383. )
  11384. {
  11385. #if defined(_XM_NO_INTRINSICS_)
  11386. XMVECTOR Result;
  11387. Result = XMVector4LengthSq(V);
  11388. Result = XMVectorReciprocalSqrtEst(Result);
  11389. return Result;
  11390. #elif defined(_XM_ARM_NEON_INTRINSICS_)
  11391. // Dot4
  11392. float32x4_t vTemp = vmulq_f32( V, V );
  11393. float32x2_t v1 = vget_low_f32( vTemp );
  11394. float32x2_t v2 = vget_high_f32( vTemp );
  11395. v1 = vadd_f32( v1, v2 );
  11396. v1 = vpadd_f32( v1, v1 );
  11397. // Reciprocal sqrt (estimate)
  11398. v2 = vrsqrte_f32( v1 );
  11399. return vcombine_f32(v2, v2);
  11400. #elif defined(_XM_SSE4_INTRINSICS_)
  11401. XMVECTOR vTemp = _mm_dp_ps( V, V, 0xff );
  11402. return _mm_rsqrt_ps( vTemp );
  11403. #elif defined(_XM_SSE3_INTRINSICS_)
  11404. XMVECTOR vLengthSq = _mm_mul_ps(V, V);
  11405. vLengthSq = _mm_hadd_ps(vLengthSq, vLengthSq);
  11406. vLengthSq = _mm_hadd_ps(vLengthSq, vLengthSq);
  11407. vLengthSq = _mm_rsqrt_ps(vLengthSq);
  11408. return vLengthSq;
  11409. #elif defined(_XM_SSE_INTRINSICS_)
  11410. // Perform the dot product on x,y,z and w
  11411. XMVECTOR vLengthSq = _mm_mul_ps(V,V);
  11412. // vTemp has z and w
  11413. XMVECTOR vTemp = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(3,2,3,2));
  11414. // x+z, y+w
  11415. vLengthSq = _mm_add_ps(vLengthSq,vTemp);
  11416. // x+z,x+z,x+z,y+w
  11417. vLengthSq = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(1,0,0,0));
  11418. // ??,??,y+w,y+w
  11419. vTemp = _mm_shuffle_ps(vTemp,vLengthSq,_MM_SHUFFLE(3,3,0,0));
  11420. // ??,??,x+z+y+w,??
  11421. vLengthSq = _mm_add_ps(vLengthSq,vTemp);
  11422. // Splat the length
  11423. vLengthSq = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(2,2,2,2));
  11424. // Get the reciprocal
  11425. vLengthSq = _mm_rsqrt_ps(vLengthSq);
  11426. return vLengthSq;
  11427. #endif
  11428. }
  11429. //------------------------------------------------------------------------------
  11430. inline XMVECTOR XM_CALLCONV XMVector4ReciprocalLength
  11431. (
  11432. FXMVECTOR V
  11433. )
  11434. {
  11435. #if defined(_XM_NO_INTRINSICS_)
  11436. XMVECTOR Result;
  11437. Result = XMVector4LengthSq(V);
  11438. Result = XMVectorReciprocalSqrt(Result);
  11439. return Result;
  11440. #elif defined(_XM_ARM_NEON_INTRINSICS_)
  11441. // Dot4
  11442. float32x4_t vTemp = vmulq_f32( V, V );
  11443. float32x2_t v1 = vget_low_f32( vTemp );
  11444. float32x2_t v2 = vget_high_f32( vTemp );
  11445. v1 = vadd_f32( v1, v2 );
  11446. v1 = vpadd_f32( v1, v1 );
  11447. // Reciprocal sqrt
  11448. float32x2_t S0 = vrsqrte_f32(v1);
  11449. float32x2_t P0 = vmul_f32( v1, S0 );
  11450. float32x2_t R0 = vrsqrts_f32( P0, S0 );
  11451. float32x2_t S1 = vmul_f32( S0, R0 );
  11452. float32x2_t P1 = vmul_f32( v1, S1 );
  11453. float32x2_t R1 = vrsqrts_f32( P1, S1 );
  11454. float32x2_t Result = vmul_f32( S1, R1 );
  11455. return vcombine_f32( Result, Result );
  11456. #elif defined(_XM_SSE4_INTRINSICS_)
  11457. XMVECTOR vTemp = _mm_dp_ps( V, V, 0xff );
  11458. XMVECTOR vLengthSq = _mm_sqrt_ps( vTemp );
  11459. return _mm_div_ps( g_XMOne, vLengthSq );
  11460. #elif defined(_XM_SSE3_INTRINSICS_)
  11461. XMVECTOR vLengthSq = _mm_mul_ps(V, V);
  11462. vLengthSq = _mm_hadd_ps(vLengthSq, vLengthSq);
  11463. vLengthSq = _mm_hadd_ps(vLengthSq, vLengthSq);
  11464. vLengthSq = _mm_sqrt_ps(vLengthSq);
  11465. vLengthSq = _mm_div_ps(g_XMOne, vLengthSq);
  11466. return vLengthSq;
  11467. #elif defined(_XM_SSE_INTRINSICS_)
  11468. // Perform the dot product on x,y,z and w
  11469. XMVECTOR vLengthSq = _mm_mul_ps(V,V);
  11470. // vTemp has z and w
  11471. XMVECTOR vTemp = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(3,2,3,2));
  11472. // x+z, y+w
  11473. vLengthSq = _mm_add_ps(vLengthSq,vTemp);
  11474. // x+z,x+z,x+z,y+w
  11475. vLengthSq = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(1,0,0,0));
  11476. // ??,??,y+w,y+w
  11477. vTemp = _mm_shuffle_ps(vTemp,vLengthSq,_MM_SHUFFLE(3,3,0,0));
  11478. // ??,??,x+z+y+w,??
  11479. vLengthSq = _mm_add_ps(vLengthSq,vTemp);
  11480. // Splat the length
  11481. vLengthSq = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(2,2,2,2));
  11482. // Get the reciprocal
  11483. vLengthSq = _mm_sqrt_ps(vLengthSq);
  11484. // Accurate!
  11485. vLengthSq = _mm_div_ps(g_XMOne,vLengthSq);
  11486. return vLengthSq;
  11487. #endif
  11488. }
  11489. //------------------------------------------------------------------------------
  11490. inline XMVECTOR XM_CALLCONV XMVector4LengthEst
  11491. (
  11492. FXMVECTOR V
  11493. )
  11494. {
  11495. #if defined(_XM_NO_INTRINSICS_)
  11496. XMVECTOR Result;
  11497. Result = XMVector4LengthSq(V);
  11498. Result = XMVectorSqrtEst(Result);
  11499. return Result;
  11500. #elif defined(_XM_ARM_NEON_INTRINSICS_)
  11501. // Dot4
  11502. float32x4_t vTemp = vmulq_f32( V, V );
  11503. float32x2_t v1 = vget_low_f32( vTemp );
  11504. float32x2_t v2 = vget_high_f32( vTemp );
  11505. v1 = vadd_f32( v1, v2 );
  11506. v1 = vpadd_f32( v1, v1 );
  11507. const float32x2_t zero = vdup_n_f32(0);
  11508. uint32x2_t VEqualsZero = vceq_f32( v1, zero );
  11509. // Sqrt (estimate)
  11510. float32x2_t Result = vrsqrte_f32( v1 );
  11511. Result = vmul_f32( v1, Result );
  11512. Result = vbsl_f32( VEqualsZero, zero, Result );
  11513. return vcombine_f32( Result, Result );
  11514. #elif defined(_XM_SSE4_INTRINSICS_)
  11515. XMVECTOR vTemp = _mm_dp_ps( V, V, 0xff );
  11516. return _mm_sqrt_ps( vTemp );
  11517. #elif defined(_XM_SSE3_INTRINSICS_)
  11518. XMVECTOR vLengthSq = _mm_mul_ps(V, V);
  11519. vLengthSq = _mm_hadd_ps(vLengthSq, vLengthSq);
  11520. vLengthSq = _mm_hadd_ps(vLengthSq, vLengthSq);
  11521. vLengthSq = _mm_sqrt_ps(vLengthSq);
  11522. return vLengthSq;
  11523. #elif defined(_XM_SSE_INTRINSICS_)
  11524. // Perform the dot product on x,y,z and w
  11525. XMVECTOR vLengthSq = _mm_mul_ps(V,V);
  11526. // vTemp has z and w
  11527. XMVECTOR vTemp = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(3,2,3,2));
  11528. // x+z, y+w
  11529. vLengthSq = _mm_add_ps(vLengthSq,vTemp);
  11530. // x+z,x+z,x+z,y+w
  11531. vLengthSq = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(1,0,0,0));
  11532. // ??,??,y+w,y+w
  11533. vTemp = _mm_shuffle_ps(vTemp,vLengthSq,_MM_SHUFFLE(3,3,0,0));
  11534. // ??,??,x+z+y+w,??
  11535. vLengthSq = _mm_add_ps(vLengthSq,vTemp);
  11536. // Splat the length
  11537. vLengthSq = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(2,2,2,2));
  11538. // Get the length
  11539. vLengthSq = _mm_sqrt_ps(vLengthSq);
  11540. return vLengthSq;
  11541. #endif
  11542. }
  11543. //------------------------------------------------------------------------------
  11544. inline XMVECTOR XM_CALLCONV XMVector4Length
  11545. (
  11546. FXMVECTOR V
  11547. )
  11548. {
  11549. #if defined(_XM_NO_INTRINSICS_)
  11550. XMVECTOR Result;
  11551. Result = XMVector4LengthSq(V);
  11552. Result = XMVectorSqrt(Result);
  11553. return Result;
  11554. #elif defined(_XM_ARM_NEON_INTRINSICS_)
  11555. // Dot4
  11556. float32x4_t vTemp = vmulq_f32( V, V );
  11557. float32x2_t v1 = vget_low_f32( vTemp );
  11558. float32x2_t v2 = vget_high_f32( vTemp );
  11559. v1 = vadd_f32( v1, v2 );
  11560. v1 = vpadd_f32( v1, v1 );
  11561. const float32x2_t zero = vdup_n_f32(0);
  11562. uint32x2_t VEqualsZero = vceq_f32( v1, zero );
  11563. // Sqrt
  11564. float32x2_t S0 = vrsqrte_f32( v1 );
  11565. float32x2_t P0 = vmul_f32( v1, S0 );
  11566. float32x2_t R0 = vrsqrts_f32( P0, S0 );
  11567. float32x2_t S1 = vmul_f32( S0, R0 );
  11568. float32x2_t P1 = vmul_f32( v1, S1 );
  11569. float32x2_t R1 = vrsqrts_f32( P1, S1 );
  11570. float32x2_t Result = vmul_f32( S1, R1 );
  11571. Result = vmul_f32( v1, Result );
  11572. Result = vbsl_f32( VEqualsZero, zero, Result );
  11573. return vcombine_f32( Result, Result );
  11574. #elif defined(_XM_SSE4_INTRINSICS_)
  11575. XMVECTOR vTemp = _mm_dp_ps( V, V, 0xff );
  11576. return _mm_sqrt_ps( vTemp );
  11577. #elif defined(_XM_SSE3_INTRINSICS_)
  11578. XMVECTOR vLengthSq = _mm_mul_ps(V, V);
  11579. vLengthSq = _mm_hadd_ps(vLengthSq, vLengthSq);
  11580. vLengthSq = _mm_hadd_ps(vLengthSq, vLengthSq);
  11581. vLengthSq = _mm_sqrt_ps(vLengthSq);
  11582. return vLengthSq;
  11583. #elif defined(_XM_SSE_INTRINSICS_)
  11584. // Perform the dot product on x,y,z and w
  11585. XMVECTOR vLengthSq = _mm_mul_ps(V,V);
  11586. // vTemp has z and w
  11587. XMVECTOR vTemp = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(3,2,3,2));
  11588. // x+z, y+w
  11589. vLengthSq = _mm_add_ps(vLengthSq,vTemp);
  11590. // x+z,x+z,x+z,y+w
  11591. vLengthSq = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(1,0,0,0));
  11592. // ??,??,y+w,y+w
  11593. vTemp = _mm_shuffle_ps(vTemp,vLengthSq,_MM_SHUFFLE(3,3,0,0));
  11594. // ??,??,x+z+y+w,??
  11595. vLengthSq = _mm_add_ps(vLengthSq,vTemp);
  11596. // Splat the length
  11597. vLengthSq = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(2,2,2,2));
  11598. // Get the length
  11599. vLengthSq = _mm_sqrt_ps(vLengthSq);
  11600. return vLengthSq;
  11601. #endif
  11602. }
  11603. //------------------------------------------------------------------------------
  11604. // XMVector4NormalizeEst uses a reciprocal estimate and
  11605. // returns QNaN on zero and infinite vectors.
  11606. inline XMVECTOR XM_CALLCONV XMVector4NormalizeEst
  11607. (
  11608. FXMVECTOR V
  11609. )
  11610. {
  11611. #if defined(_XM_NO_INTRINSICS_)
  11612. XMVECTOR Result;
  11613. Result = XMVector4ReciprocalLength(V);
  11614. Result = XMVectorMultiply(V, Result);
  11615. return Result;
  11616. #elif defined(_XM_ARM_NEON_INTRINSICS_)
  11617. // Dot4
  11618. float32x4_t vTemp = vmulq_f32( V, V );
  11619. float32x2_t v1 = vget_low_f32( vTemp );
  11620. float32x2_t v2 = vget_high_f32( vTemp );
  11621. v1 = vadd_f32( v1, v2 );
  11622. v1 = vpadd_f32( v1, v1 );
  11623. // Reciprocal sqrt (estimate)
  11624. v2 = vrsqrte_f32( v1 );
  11625. // Normalize
  11626. return vmulq_f32( V, vcombine_f32(v2,v2) );
  11627. #elif defined(_XM_SSE4_INTRINSICS_)
  11628. XMVECTOR vTemp = _mm_dp_ps( V, V, 0xff );
  11629. XMVECTOR vResult = _mm_rsqrt_ps( vTemp );
  11630. return _mm_mul_ps(vResult, V);
  11631. #elif defined(_XM_SSE3_INTRINSICS_)
  11632. XMVECTOR vDot = _mm_mul_ps(V, V);
  11633. vDot = _mm_hadd_ps(vDot, vDot);
  11634. vDot = _mm_hadd_ps(vDot, vDot);
  11635. vDot = _mm_rsqrt_ps(vDot);
  11636. vDot = _mm_mul_ps(vDot, V);
  11637. return vDot;
  11638. #elif defined(_XM_SSE_INTRINSICS_)
  11639. // Perform the dot product on x,y,z and w
  11640. XMVECTOR vLengthSq = _mm_mul_ps(V,V);
  11641. // vTemp has z and w
  11642. XMVECTOR vTemp = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(3,2,3,2));
  11643. // x+z, y+w
  11644. vLengthSq = _mm_add_ps(vLengthSq,vTemp);
  11645. // x+z,x+z,x+z,y+w
  11646. vLengthSq = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(1,0,0,0));
  11647. // ??,??,y+w,y+w
  11648. vTemp = _mm_shuffle_ps(vTemp,vLengthSq,_MM_SHUFFLE(3,3,0,0));
  11649. // ??,??,x+z+y+w,??
  11650. vLengthSq = _mm_add_ps(vLengthSq,vTemp);
  11651. // Splat the length
  11652. vLengthSq = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(2,2,2,2));
  11653. // Get the reciprocal
  11654. XMVECTOR vResult = _mm_rsqrt_ps(vLengthSq);
  11655. // Reciprocal mul to perform the normalization
  11656. vResult = _mm_mul_ps(vResult,V);
  11657. return vResult;
  11658. #endif
  11659. }
  11660. //------------------------------------------------------------------------------
  11661. inline XMVECTOR XM_CALLCONV XMVector4Normalize
  11662. (
  11663. FXMVECTOR V
  11664. )
  11665. {
  11666. #if defined(_XM_NO_INTRINSICS_)
  11667. float fLength;
  11668. XMVECTOR vResult;
  11669. vResult = XMVector4Length( V );
  11670. fLength = vResult.vector4_f32[0];
  11671. // Prevent divide by zero
  11672. if (fLength > 0) {
  11673. fLength = 1.0f/fLength;
  11674. }
  11675. vResult.vector4_f32[0] = V.vector4_f32[0]*fLength;
  11676. vResult.vector4_f32[1] = V.vector4_f32[1]*fLength;
  11677. vResult.vector4_f32[2] = V.vector4_f32[2]*fLength;
  11678. vResult.vector4_f32[3] = V.vector4_f32[3]*fLength;
  11679. return vResult;
  11680. #elif defined(_XM_ARM_NEON_INTRINSICS_)
  11681. // Dot4
  11682. float32x4_t vTemp = vmulq_f32( V, V );
  11683. float32x2_t v1 = vget_low_f32( vTemp );
  11684. float32x2_t v2 = vget_high_f32( vTemp );
  11685. v1 = vadd_f32( v1, v2 );
  11686. v1 = vpadd_f32( v1, v1 );
  11687. uint32x2_t VEqualsZero = vceq_f32( v1, vdup_n_f32(0) );
  11688. uint32x2_t VEqualsInf = vceq_f32( v1, vget_low_f32(g_XMInfinity) );
  11689. // Reciprocal sqrt (2 iterations of Newton-Raphson)
  11690. float32x2_t S0 = vrsqrte_f32( v1 );
  11691. float32x2_t P0 = vmul_f32( v1, S0 );
  11692. float32x2_t R0 = vrsqrts_f32( P0, S0 );
  11693. float32x2_t S1 = vmul_f32( S0, R0 );
  11694. float32x2_t P1 = vmul_f32( v1, S1 );
  11695. float32x2_t R1 = vrsqrts_f32( P1, S1 );
  11696. v2 = vmul_f32( S1, R1 );
  11697. // Normalize
  11698. XMVECTOR vResult = vmulq_f32( V, vcombine_f32(v2,v2) );
  11699. vResult = vbslq_f32( vcombine_f32(VEqualsZero,VEqualsZero), vdupq_n_f32(0), vResult );
  11700. return vbslq_f32( vcombine_f32(VEqualsInf,VEqualsInf), g_XMQNaN, vResult );
  11701. #elif defined(_XM_SSE4_INTRINSICS_)
  11702. XMVECTOR vLengthSq = _mm_dp_ps( V, V, 0xff );
  11703. // Prepare for the division
  11704. XMVECTOR vResult = _mm_sqrt_ps(vLengthSq);
  11705. // Create zero with a single instruction
  11706. XMVECTOR vZeroMask = _mm_setzero_ps();
  11707. // Test for a divide by zero (Must be FP to detect -0.0)
  11708. vZeroMask = _mm_cmpneq_ps(vZeroMask,vResult);
  11709. // Failsafe on zero (Or epsilon) length planes
  11710. // If the length is infinity, set the elements to zero
  11711. vLengthSq = _mm_cmpneq_ps(vLengthSq,g_XMInfinity);
  11712. // Divide to perform the normalization
  11713. vResult = _mm_div_ps(V,vResult);
  11714. // Any that are infinity, set to zero
  11715. vResult = _mm_and_ps(vResult,vZeroMask);
  11716. // Select qnan or result based on infinite length
  11717. XMVECTOR vTemp1 = _mm_andnot_ps(vLengthSq,g_XMQNaN);
  11718. XMVECTOR vTemp2 = _mm_and_ps(vResult,vLengthSq);
  11719. vResult = _mm_or_ps(vTemp1,vTemp2);
  11720. return vResult;
  11721. #elif defined(_XM_SSE3_INTRINSICS_)
  11722. // Perform the dot product on x,y,z and w
  11723. XMVECTOR vLengthSq = _mm_mul_ps(V, V);
  11724. vLengthSq = _mm_hadd_ps(vLengthSq, vLengthSq);
  11725. vLengthSq = _mm_hadd_ps(vLengthSq, vLengthSq);
  11726. // Prepare for the division
  11727. XMVECTOR vResult = _mm_sqrt_ps(vLengthSq);
  11728. // Create zero with a single instruction
  11729. XMVECTOR vZeroMask = _mm_setzero_ps();
  11730. // Test for a divide by zero (Must be FP to detect -0.0)
  11731. vZeroMask = _mm_cmpneq_ps(vZeroMask,vResult);
  11732. // Failsafe on zero (Or epsilon) length planes
  11733. // If the length is infinity, set the elements to zero
  11734. vLengthSq = _mm_cmpneq_ps(vLengthSq,g_XMInfinity);
  11735. // Divide to perform the normalization
  11736. vResult = _mm_div_ps(V,vResult);
  11737. // Any that are infinity, set to zero
  11738. vResult = _mm_and_ps(vResult,vZeroMask);
  11739. // Select qnan or result based on infinite length
  11740. XMVECTOR vTemp1 = _mm_andnot_ps(vLengthSq,g_XMQNaN);
  11741. XMVECTOR vTemp2 = _mm_and_ps(vResult,vLengthSq);
  11742. vResult = _mm_or_ps(vTemp1,vTemp2);
  11743. return vResult;
  11744. #elif defined(_XM_SSE_INTRINSICS_)
  11745. // Perform the dot product on x,y,z and w
  11746. XMVECTOR vLengthSq = _mm_mul_ps(V,V);
  11747. // vTemp has z and w
  11748. XMVECTOR vTemp = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(3,2,3,2));
  11749. // x+z, y+w
  11750. vLengthSq = _mm_add_ps(vLengthSq,vTemp);
  11751. // x+z,x+z,x+z,y+w
  11752. vLengthSq = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(1,0,0,0));
  11753. // ??,??,y+w,y+w
  11754. vTemp = _mm_shuffle_ps(vTemp,vLengthSq,_MM_SHUFFLE(3,3,0,0));
  11755. // ??,??,x+z+y+w,??
  11756. vLengthSq = _mm_add_ps(vLengthSq,vTemp);
  11757. // Splat the length
  11758. vLengthSq = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(2,2,2,2));
  11759. // Prepare for the division
  11760. XMVECTOR vResult = _mm_sqrt_ps(vLengthSq);
  11761. // Create zero with a single instruction
  11762. XMVECTOR vZeroMask = _mm_setzero_ps();
  11763. // Test for a divide by zero (Must be FP to detect -0.0)
  11764. vZeroMask = _mm_cmpneq_ps(vZeroMask,vResult);
  11765. // Failsafe on zero (Or epsilon) length planes
  11766. // If the length is infinity, set the elements to zero
  11767. vLengthSq = _mm_cmpneq_ps(vLengthSq,g_XMInfinity);
  11768. // Divide to perform the normalization
  11769. vResult = _mm_div_ps(V,vResult);
  11770. // Any that are infinity, set to zero
  11771. vResult = _mm_and_ps(vResult,vZeroMask);
  11772. // Select qnan or result based on infinite length
  11773. XMVECTOR vTemp1 = _mm_andnot_ps(vLengthSq,g_XMQNaN);
  11774. XMVECTOR vTemp2 = _mm_and_ps(vResult,vLengthSq);
  11775. vResult = _mm_or_ps(vTemp1,vTemp2);
  11776. return vResult;
  11777. #endif
  11778. }
  11779. //------------------------------------------------------------------------------
  11780. inline XMVECTOR XM_CALLCONV XMVector4ClampLength
  11781. (
  11782. FXMVECTOR V,
  11783. float LengthMin,
  11784. float LengthMax
  11785. )
  11786. {
  11787. XMVECTOR ClampMax = XMVectorReplicate(LengthMax);
  11788. XMVECTOR ClampMin = XMVectorReplicate(LengthMin);
  11789. return XMVector4ClampLengthV(V, ClampMin, ClampMax);
  11790. }
  11791. //------------------------------------------------------------------------------
  11792. inline XMVECTOR XM_CALLCONV XMVector4ClampLengthV
  11793. (
  11794. FXMVECTOR V,
  11795. FXMVECTOR LengthMin,
  11796. FXMVECTOR LengthMax
  11797. )
  11798. {
  11799. assert((XMVectorGetY(LengthMin) == XMVectorGetX(LengthMin)) && (XMVectorGetZ(LengthMin) == XMVectorGetX(LengthMin)) && (XMVectorGetW(LengthMin) == XMVectorGetX(LengthMin)));
  11800. assert((XMVectorGetY(LengthMax) == XMVectorGetX(LengthMax)) && (XMVectorGetZ(LengthMax) == XMVectorGetX(LengthMax)) && (XMVectorGetW(LengthMax) == XMVectorGetX(LengthMax)));
  11801. assert(XMVector4GreaterOrEqual(LengthMin, XMVectorZero()));
  11802. assert(XMVector4GreaterOrEqual(LengthMax, XMVectorZero()));
  11803. assert(XMVector4GreaterOrEqual(LengthMax, LengthMin));
  11804. XMVECTOR LengthSq = XMVector4LengthSq(V);
  11805. const XMVECTOR Zero = XMVectorZero();
  11806. XMVECTOR RcpLength = XMVectorReciprocalSqrt(LengthSq);
  11807. XMVECTOR InfiniteLength = XMVectorEqualInt(LengthSq, g_XMInfinity.v);
  11808. XMVECTOR ZeroLength = XMVectorEqual(LengthSq, Zero);
  11809. XMVECTOR Normal = XMVectorMultiply(V, RcpLength);
  11810. XMVECTOR Length = XMVectorMultiply(LengthSq, RcpLength);
  11811. XMVECTOR Select = XMVectorEqualInt(InfiniteLength, ZeroLength);
  11812. Length = XMVectorSelect(LengthSq, Length, Select);
  11813. Normal = XMVectorSelect(LengthSq, Normal, Select);
  11814. XMVECTOR ControlMax = XMVectorGreater(Length, LengthMax);
  11815. XMVECTOR ControlMin = XMVectorLess(Length, LengthMin);
  11816. XMVECTOR ClampLength = XMVectorSelect(Length, LengthMax, ControlMax);
  11817. ClampLength = XMVectorSelect(ClampLength, LengthMin, ControlMin);
  11818. XMVECTOR Result = XMVectorMultiply(Normal, ClampLength);
  11819. // Preserve the original vector (with no precision loss) if the length falls within the given range
  11820. XMVECTOR Control = XMVectorEqualInt(ControlMax, ControlMin);
  11821. Result = XMVectorSelect(Result, V, Control);
  11822. return Result;
  11823. }
  11824. //------------------------------------------------------------------------------
  11825. inline XMVECTOR XM_CALLCONV XMVector4Reflect
  11826. (
  11827. FXMVECTOR Incident,
  11828. FXMVECTOR Normal
  11829. )
  11830. {
  11831. // Result = Incident - (2 * dot(Incident, Normal)) * Normal
  11832. XMVECTOR Result = XMVector4Dot(Incident, Normal);
  11833. Result = XMVectorAdd(Result, Result);
  11834. Result = XMVectorNegativeMultiplySubtract(Result, Normal, Incident);
  11835. return Result;
  11836. }
  11837. //------------------------------------------------------------------------------
  11838. inline XMVECTOR XM_CALLCONV XMVector4Refract
  11839. (
  11840. FXMVECTOR Incident,
  11841. FXMVECTOR Normal,
  11842. float RefractionIndex
  11843. )
  11844. {
  11845. XMVECTOR Index = XMVectorReplicate(RefractionIndex);
  11846. return XMVector4RefractV(Incident, Normal, Index);
  11847. }
  11848. //------------------------------------------------------------------------------
  11849. inline XMVECTOR XM_CALLCONV XMVector4RefractV
  11850. (
  11851. FXMVECTOR Incident,
  11852. FXMVECTOR Normal,
  11853. FXMVECTOR RefractionIndex
  11854. )
  11855. {
  11856. #if defined(_XM_NO_INTRINSICS_)
  11857. XMVECTOR IDotN;
  11858. XMVECTOR R;
  11859. const XMVECTOR Zero = XMVectorZero();
  11860. // Result = RefractionIndex * Incident - Normal * (RefractionIndex * dot(Incident, Normal) +
  11861. // sqrt(1 - RefractionIndex * RefractionIndex * (1 - dot(Incident, Normal) * dot(Incident, Normal))))
  11862. IDotN = XMVector4Dot(Incident, Normal);
  11863. // R = 1.0f - RefractionIndex * RefractionIndex * (1.0f - IDotN * IDotN)
  11864. R = XMVectorNegativeMultiplySubtract(IDotN, IDotN, g_XMOne.v);
  11865. R = XMVectorMultiply(R, RefractionIndex);
  11866. R = XMVectorNegativeMultiplySubtract(R, RefractionIndex, g_XMOne.v);
  11867. if (XMVector4LessOrEqual(R, Zero))
  11868. {
  11869. // Total internal reflection
  11870. return Zero;
  11871. }
  11872. else
  11873. {
  11874. XMVECTOR Result;
  11875. // R = RefractionIndex * IDotN + sqrt(R)
  11876. R = XMVectorSqrt(R);
  11877. R = XMVectorMultiplyAdd(RefractionIndex, IDotN, R);
  11878. // Result = RefractionIndex * Incident - Normal * R
  11879. Result = XMVectorMultiply(RefractionIndex, Incident);
  11880. Result = XMVectorNegativeMultiplySubtract(Normal, R, Result);
  11881. return Result;
  11882. }
  11883. #elif defined(_XM_ARM_NEON_INTRINSICS_)
  11884. XMVECTOR IDotN = XMVector4Dot(Incident,Normal);
  11885. // R = 1.0f - RefractionIndex * RefractionIndex * (1.0f - IDotN * IDotN)
  11886. float32x4_t R = vmlsq_f32( g_XMOne, IDotN, IDotN);
  11887. R = vmulq_f32(R, RefractionIndex);
  11888. R = vmlsq_f32(g_XMOne, R, RefractionIndex );
  11889. uint32x4_t vResult = vcleq_f32(R,g_XMZero);
  11890. int8x8x2_t vTemp = vzip_u8(vget_low_u8(vResult), vget_high_u8(vResult));
  11891. vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]);
  11892. if ( vget_lane_u32(vTemp.val[1], 1) == 0xFFFFFFFFU )
  11893. {
  11894. // Total internal reflection
  11895. vResult = g_XMZero;
  11896. }
  11897. else
  11898. {
  11899. // Sqrt(R)
  11900. float32x4_t S0 = vrsqrteq_f32(R);
  11901. float32x4_t P0 = vmulq_f32( R, S0 );
  11902. float32x4_t R0 = vrsqrtsq_f32( P0, S0 );
  11903. float32x4_t S1 = vmulq_f32( S0, R0 );
  11904. float32x4_t P1 = vmulq_f32( R, S1 );
  11905. float32x4_t R1 = vrsqrtsq_f32( P1, S1 );
  11906. float32x4_t S2 = vmulq_f32( S1, R1 );
  11907. R = vmulq_f32( R, S2 );
  11908. // R = RefractionIndex * IDotN + sqrt(R)
  11909. R = vmlaq_f32( R, RefractionIndex, IDotN );
  11910. // Result = RefractionIndex * Incident - Normal * R
  11911. vResult = vmulq_f32(RefractionIndex, Incident);
  11912. vResult = vmlsq_f32( vResult, R, Normal );
  11913. }
  11914. return vResult;
  11915. #elif defined(_XM_SSE_INTRINSICS_)
  11916. XMVECTOR IDotN = XMVector4Dot(Incident,Normal);
  11917. // R = 1.0f - RefractionIndex * RefractionIndex * (1.0f - IDotN * IDotN)
  11918. XMVECTOR R = _mm_mul_ps(IDotN,IDotN);
  11919. R = _mm_sub_ps(g_XMOne,R);
  11920. R = _mm_mul_ps(R, RefractionIndex);
  11921. R = _mm_mul_ps(R, RefractionIndex);
  11922. R = _mm_sub_ps(g_XMOne,R);
  11923. XMVECTOR vResult = _mm_cmple_ps(R,g_XMZero);
  11924. if (_mm_movemask_ps(vResult)==0x0f)
  11925. {
  11926. // Total internal reflection
  11927. vResult = g_XMZero;
  11928. }
  11929. else
  11930. {
  11931. // R = RefractionIndex * IDotN + sqrt(R)
  11932. R = _mm_sqrt_ps(R);
  11933. vResult = _mm_mul_ps(RefractionIndex, IDotN);
  11934. R = _mm_add_ps(R,vResult);
  11935. // Result = RefractionIndex * Incident - Normal * R
  11936. vResult = _mm_mul_ps(RefractionIndex, Incident);
  11937. R = _mm_mul_ps(R,Normal);
  11938. vResult = _mm_sub_ps(vResult,R);
  11939. }
  11940. return vResult;
  11941. #endif
  11942. }
  11943. //------------------------------------------------------------------------------
  11944. inline XMVECTOR XM_CALLCONV XMVector4Orthogonal
  11945. (
  11946. FXMVECTOR V
  11947. )
  11948. {
  11949. #if defined(_XM_NO_INTRINSICS_)
  11950. XMVECTORF32 Result = { { {
  11951. V.vector4_f32[2],
  11952. V.vector4_f32[3],
  11953. -V.vector4_f32[0],
  11954. -V.vector4_f32[1]
  11955. } } };
  11956. return Result.v;
  11957. #elif defined(_XM_ARM_NEON_INTRINSICS_)
  11958. static const XMVECTORF32 Negate = { { { 1.f, 1.f, -1.f, -1.f } } };
  11959. float32x4_t Result = vcombine_f32( vget_high_f32( V ), vget_low_f32( V ) );
  11960. return vmulq_f32( Result, Negate );
  11961. #elif defined(_XM_SSE_INTRINSICS_)
  11962. static const XMVECTORF32 FlipZW = { { { 1.0f, 1.0f, -1.0f, -1.0f } } };
  11963. XMVECTOR vResult = XM_PERMUTE_PS(V,_MM_SHUFFLE(1,0,3,2));
  11964. vResult = _mm_mul_ps(vResult,FlipZW);
  11965. return vResult;
  11966. #endif
  11967. }
//------------------------------------------------------------------------------

inline XMVECTOR XM_CALLCONV XMVector4AngleBetweenNormalsEst
(
    FXMVECTOR N1,
    FXMVECTOR N2
)
{
    XMVECTOR Result = XMVector4Dot(N1, N2);
    Result = XMVectorClamp(Result, g_XMNegativeOne.v, g_XMOne.v);
    Result = XMVectorACosEst(Result);
    return Result;
}
//------------------------------------------------------------------------------

inline XMVECTOR XM_CALLCONV XMVector4AngleBetweenNormals
(
    FXMVECTOR N1,
    FXMVECTOR N2
)
{
    XMVECTOR Result = XMVector4Dot(N1, N2);
    Result = XMVectorClamp(Result, g_XMNegativeOne.v, g_XMOne.v);
    Result = XMVectorACos(Result);
    return Result;
}
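
// Usage sketch (illustrative comment, not part of the original header): both
// inputs must already be normalized; the Est variant above trades accuracy for
// speed by using the arccosine estimate. The result is replicated into all lanes.
//
//     XMVECTOR n1 = XMVectorSet(1.0f, 0.0f, 0.0f, 0.0f);
//     XMVECTOR n2 = XMVectorSet(0.0f, 1.0f, 0.0f, 0.0f);
//     float radians = XMVectorGetX(XMVector4AngleBetweenNormals(n1, n2));   // ~XM_PIDIV2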
//------------------------------------------------------------------------------

inline XMVECTOR XM_CALLCONV XMVector4AngleBetweenVectors
(
    FXMVECTOR V1,
    FXMVECTOR V2
)
{
    XMVECTOR L1 = XMVector4ReciprocalLength(V1);
    XMVECTOR L2 = XMVector4ReciprocalLength(V2);

    XMVECTOR Dot = XMVector4Dot(V1, V2);

    L1 = XMVectorMultiply(L1, L2);

    XMVECTOR CosAngle = XMVectorMultiply(Dot, L1);
    CosAngle = XMVectorClamp(CosAngle, g_XMNegativeOne.v, g_XMOne.v);

    return XMVectorACos(CosAngle);
}
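
// Usage sketch (illustrative comment, not part of the original header): unlike
// the *Normals variants, this function normalizes internally via the reciprocal
// lengths, so the inputs may have any non-zero length.
//
//     XMVECTOR a = XMVectorSet(2.0f, 0.0f, 0.0f, 0.0f);
//     XMVECTOR b = XMVectorSet(5.0f, 5.0f, 0.0f, 0.0f);
//     float radians = XMVectorGetX(XMVector4AngleBetweenVectors(a, b));   // ~XM_PIDIV4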
//------------------------------------------------------------------------------

inline XMVECTOR XM_CALLCONV XMVector4Transform
(
    FXMVECTOR V,
    FXMMATRIX M
)
{
#if defined(_XM_NO_INTRINSICS_)
    float fX = (M.m[0][0]*V.vector4_f32[0])+(M.m[1][0]*V.vector4_f32[1])+(M.m[2][0]*V.vector4_f32[2])+(M.m[3][0]*V.vector4_f32[3]);
    float fY = (M.m[0][1]*V.vector4_f32[0])+(M.m[1][1]*V.vector4_f32[1])+(M.m[2][1]*V.vector4_f32[2])+(M.m[3][1]*V.vector4_f32[3]);
    float fZ = (M.m[0][2]*V.vector4_f32[0])+(M.m[1][2]*V.vector4_f32[1])+(M.m[2][2]*V.vector4_f32[2])+(M.m[3][2]*V.vector4_f32[3]);
    float fW = (M.m[0][3]*V.vector4_f32[0])+(M.m[1][3]*V.vector4_f32[1])+(M.m[2][3]*V.vector4_f32[2])+(M.m[3][3]*V.vector4_f32[3]);
    XMVECTORF32 vResult = { { { fX, fY, fZ, fW } } };
    return vResult.v;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    float32x2_t VL = vget_low_f32(V);
    XMVECTOR vResult = vmulq_lane_f32(M.r[0], VL, 0); // X
    vResult = vmlaq_lane_f32(vResult, M.r[1], VL, 1); // Y
    float32x2_t VH = vget_high_f32(V);
    vResult = vmlaq_lane_f32(vResult, M.r[2], VH, 0); // Z
    return vmlaq_lane_f32(vResult, M.r[3], VH, 1); // W
#elif defined(_XM_SSE_INTRINSICS_)
    // Splat x, y, z and w
    XMVECTOR vTempX = XM_PERMUTE_PS(V, _MM_SHUFFLE(0, 0, 0, 0));
    XMVECTOR vTempY = XM_PERMUTE_PS(V, _MM_SHUFFLE(1, 1, 1, 1));
    XMVECTOR vTempZ = XM_PERMUTE_PS(V, _MM_SHUFFLE(2, 2, 2, 2));
    XMVECTOR vTempW = XM_PERMUTE_PS(V, _MM_SHUFFLE(3, 3, 3, 3));
    // Mul by the matrix
    vTempX = _mm_mul_ps(vTempX, M.r[0]);
    vTempY = _mm_mul_ps(vTempY, M.r[1]);
    vTempZ = _mm_mul_ps(vTempZ, M.r[2]);
    vTempW = _mm_mul_ps(vTempW, M.r[3]);
    // Add them all together
    vTempX = _mm_add_ps(vTempX, vTempY);
    vTempZ = _mm_add_ps(vTempZ, vTempW);
    vTempX = _mm_add_ps(vTempX, vTempZ);
    return vTempX;
#endif
}
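
// Usage sketch (illustrative comment, not part of the original header): the
// vector is treated as a row vector and multiplied by the matrix, V * M, and w
// is transformed like the other components, so a point should carry w = 1 and a
// direction w = 0.
//
//     XMMATRIX m = XMMatrixTranslation(10.0f, 0.0f, 0.0f);
//     XMVECTOR p = XMVectorSet(1.0f, 2.0f, 3.0f, 1.0f);
//     XMVECTOR q = XMVector4Transform(p, m);   // (11, 2, 3, 1)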
//------------------------------------------------------------------------------

_Use_decl_annotations_
inline XMFLOAT4* XM_CALLCONV XMVector4TransformStream
(
    XMFLOAT4* pOutputStream,
    size_t OutputStride,
    const XMFLOAT4* pInputStream,
    size_t InputStride,
    size_t VectorCount,
    FXMMATRIX M
)
{
    assert(pOutputStream != nullptr);
    assert(pInputStream != nullptr);

    assert(InputStride >= sizeof(XMFLOAT4));
    _Analysis_assume_(InputStride >= sizeof(XMFLOAT4));

    assert(OutputStride >= sizeof(XMFLOAT4));
    _Analysis_assume_(OutputStride >= sizeof(XMFLOAT4));

#if defined(_XM_NO_INTRINSICS_)
    const uint8_t* pInputVector = (const uint8_t*)pInputStream;
    uint8_t* pOutputVector = (uint8_t*)pOutputStream;

    const XMVECTOR row0 = M.r[0];
    const XMVECTOR row1 = M.r[1];
    const XMVECTOR row2 = M.r[2];
    const XMVECTOR row3 = M.r[3];

    for (size_t i = 0; i < VectorCount; i++)
    {
        XMVECTOR V = XMLoadFloat4((const XMFLOAT4*)pInputVector);
        XMVECTOR W = XMVectorSplatW(V);
        XMVECTOR Z = XMVectorSplatZ(V);
        XMVECTOR Y = XMVectorSplatY(V);
        XMVECTOR X = XMVectorSplatX(V);

        XMVECTOR Result = XMVectorMultiply(W, row3);
        Result = XMVectorMultiplyAdd(Z, row2, Result);
        Result = XMVectorMultiplyAdd(Y, row1, Result);
        Result = XMVectorMultiplyAdd(X, row0, Result);

#ifdef _PREFAST_
#pragma prefast(push)
#pragma prefast(disable : 26015, "PREfast noise: Esp:1307" )
#endif

        XMStoreFloat4((XMFLOAT4*)pOutputVector, Result);

#ifdef _PREFAST_
#pragma prefast(pop)
#endif

        pInputVector += InputStride;
        pOutputVector += OutputStride;
    }

    return pOutputStream;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    const uint8_t* pInputVector = (const uint8_t*)pInputStream;
    uint8_t* pOutputVector = (uint8_t*)pOutputStream;

    const XMVECTOR row0 = M.r[0];
    const XMVECTOR row1 = M.r[1];
    const XMVECTOR row2 = M.r[2];
    const XMVECTOR row3 = M.r[3];

    size_t i = 0;
    size_t four = VectorCount >> 2;
    if (four > 0)
    {
        if ((InputStride == sizeof(XMFLOAT4)) && (OutputStride == sizeof(XMFLOAT4)))
        {
            for (size_t j = 0; j < four; ++j)
            {
                float32x4x4_t V = vld4q_f32(reinterpret_cast<const float*>(pInputVector));
                pInputVector += sizeof(XMFLOAT4)*4;

                float32x2_t r = vget_low_f32(row0);
                XMVECTOR vResult0 = vmulq_lane_f32(V.val[0], r, 0); // Ax
                XMVECTOR vResult1 = vmulq_lane_f32(V.val[0], r, 1); // Bx

                __prefetch(pInputVector);

                r = vget_high_f32(row0);
                XMVECTOR vResult2 = vmulq_lane_f32(V.val[0], r, 0); // Cx
                XMVECTOR vResult3 = vmulq_lane_f32(V.val[0], r, 1); // Dx

                __prefetch(pInputVector+XM_CACHE_LINE_SIZE);

                r = vget_low_f32(row1);
                vResult0 = vmlaq_lane_f32(vResult0, V.val[1], r, 0); // Ax+Ey
                vResult1 = vmlaq_lane_f32(vResult1, V.val[1], r, 1); // Bx+Fy

                __prefetch(pInputVector+(XM_CACHE_LINE_SIZE*2));

                r = vget_high_f32(row1);
                vResult2 = vmlaq_lane_f32(vResult2, V.val[1], r, 0); // Cx+Gy
                vResult3 = vmlaq_lane_f32(vResult3, V.val[1], r, 1); // Dx+Hy

                __prefetch(pInputVector+(XM_CACHE_LINE_SIZE*3));

                r = vget_low_f32(row2);
                vResult0 = vmlaq_lane_f32(vResult0, V.val[2], r, 0); // Ax+Ey+Iz
                vResult1 = vmlaq_lane_f32(vResult1, V.val[2], r, 1); // Bx+Fy+Jz

                __prefetch(pInputVector+(XM_CACHE_LINE_SIZE*4));

                r = vget_high_f32(row2);
                vResult2 = vmlaq_lane_f32(vResult2, V.val[2], r, 0); // Cx+Gy+Kz
                vResult3 = vmlaq_lane_f32(vResult3, V.val[2], r, 1); // Dx+Hy+Lz

                __prefetch(pInputVector+(XM_CACHE_LINE_SIZE*5));

                r = vget_low_f32(row3);
                vResult0 = vmlaq_lane_f32(vResult0, V.val[3], r, 0); // Ax+Ey+Iz+Mw
                vResult1 = vmlaq_lane_f32(vResult1, V.val[3], r, 1); // Bx+Fy+Jz+Nw

                __prefetch(pInputVector+(XM_CACHE_LINE_SIZE*6));

                r = vget_high_f32(row3);
                vResult2 = vmlaq_lane_f32(vResult2, V.val[3], r, 0); // Cx+Gy+Kz+Ow
                vResult3 = vmlaq_lane_f32(vResult3, V.val[3], r, 1); // Dx+Hy+Lz+Pw

                __prefetch(pInputVector+(XM_CACHE_LINE_SIZE*7));

                V.val[0] = vResult0;
                V.val[1] = vResult1;
                V.val[2] = vResult2;
                V.val[3] = vResult3;

                vst4q_f32(reinterpret_cast<float*>(pOutputVector), V);
                pOutputVector += sizeof(XMFLOAT4)*4;

                i += 4;
            }
        }
    }

    for (; i < VectorCount; i++)
    {
        XMVECTOR V = vld1q_f32(reinterpret_cast<const float*>(pInputVector));
        pInputVector += InputStride;

        float32x2_t VL = vget_low_f32(V);
        XMVECTOR vResult = vmulq_lane_f32(row0, VL, 0); // X
        vResult = vmlaq_lane_f32(vResult, row1, VL, 1); // Y
        float32x2_t VH = vget_high_f32(V);
        vResult = vmlaq_lane_f32(vResult, row2, VH, 0); // Z
        vResult = vmlaq_lane_f32(vResult, row3, VH, 1); // W

        vst1q_f32(reinterpret_cast<float*>(pOutputVector), vResult);
        pOutputVector += OutputStride;
    }

    return pOutputStream;
#elif defined(_XM_SSE_INTRINSICS_)
    const uint8_t* pInputVector = (const uint8_t*)pInputStream;
    uint8_t* pOutputVector = (uint8_t*)pOutputStream;

    const XMVECTOR row0 = M.r[0];
    const XMVECTOR row1 = M.r[1];
    const XMVECTOR row2 = M.r[2];
    const XMVECTOR row3 = M.r[3];

    if (!((uintptr_t)pOutputStream & 0xF) && !(OutputStride & 0xF))
    {
        if (!((uintptr_t)pInputStream & 0xF) && !(InputStride & 0xF))
        {
            // Aligned input, aligned output
            for (size_t i = 0; i < VectorCount; i++)
            {
                __m128 V = _mm_load_ps(reinterpret_cast<const float*>(pInputVector));
                pInputVector += InputStride;

                XMVECTOR vTempX = XM_PERMUTE_PS(V, _MM_SHUFFLE(0, 0, 0, 0));
                XMVECTOR vTempY = XM_PERMUTE_PS(V, _MM_SHUFFLE(1, 1, 1, 1));
                XMVECTOR vTempZ = XM_PERMUTE_PS(V, _MM_SHUFFLE(2, 2, 2, 2));
                XMVECTOR vTempW = XM_PERMUTE_PS(V, _MM_SHUFFLE(3, 3, 3, 3));

                vTempX = _mm_mul_ps(vTempX, row0);
                vTempY = _mm_mul_ps(vTempY, row1);
                vTempZ = _mm_mul_ps(vTempZ, row2);
                vTempW = _mm_mul_ps(vTempW, row3);

                vTempX = _mm_add_ps(vTempX, vTempY);
                vTempZ = _mm_add_ps(vTempZ, vTempW);
                vTempX = _mm_add_ps(vTempX, vTempZ);

                XM_STREAM_PS(reinterpret_cast<float*>(pOutputVector), vTempX);
                pOutputVector += OutputStride;
            }
        }
        else
        {
            // Unaligned input, aligned output
            for (size_t i = 0; i < VectorCount; i++)
            {
                __m128 V = _mm_loadu_ps(reinterpret_cast<const float*>(pInputVector));
                pInputVector += InputStride;

                XMVECTOR vTempX = XM_PERMUTE_PS(V, _MM_SHUFFLE(0, 0, 0, 0));
                XMVECTOR vTempY = XM_PERMUTE_PS(V, _MM_SHUFFLE(1, 1, 1, 1));
                XMVECTOR vTempZ = XM_PERMUTE_PS(V, _MM_SHUFFLE(2, 2, 2, 2));
                XMVECTOR vTempW = XM_PERMUTE_PS(V, _MM_SHUFFLE(3, 3, 3, 3));

                vTempX = _mm_mul_ps(vTempX, row0);
                vTempY = _mm_mul_ps(vTempY, row1);
                vTempZ = _mm_mul_ps(vTempZ, row2);
                vTempW = _mm_mul_ps(vTempW, row3);

                vTempX = _mm_add_ps(vTempX, vTempY);
                vTempZ = _mm_add_ps(vTempZ, vTempW);
                vTempX = _mm_add_ps(vTempX, vTempZ);

                XM_STREAM_PS(reinterpret_cast<float*>(pOutputVector), vTempX);
                pOutputVector += OutputStride;
            }
        }
    }
    else
    {
        if (!((uintptr_t)pInputStream & 0xF) && !(InputStride & 0xF))
        {
            // Aligned input, unaligned output
            for (size_t i = 0; i < VectorCount; i++)
            {
                __m128 V = _mm_load_ps(reinterpret_cast<const float*>(pInputVector));
                pInputVector += InputStride;

                XMVECTOR vTempX = XM_PERMUTE_PS(V, _MM_SHUFFLE(0, 0, 0, 0));
                XMVECTOR vTempY = XM_PERMUTE_PS(V, _MM_SHUFFLE(1, 1, 1, 1));
                XMVECTOR vTempZ = XM_PERMUTE_PS(V, _MM_SHUFFLE(2, 2, 2, 2));
                XMVECTOR vTempW = XM_PERMUTE_PS(V, _MM_SHUFFLE(3, 3, 3, 3));

                vTempX = _mm_mul_ps(vTempX, row0);
                vTempY = _mm_mul_ps(vTempY, row1);
                vTempZ = _mm_mul_ps(vTempZ, row2);
                vTempW = _mm_mul_ps(vTempW, row3);

                vTempX = _mm_add_ps(vTempX, vTempY);
                vTempZ = _mm_add_ps(vTempZ, vTempW);
                vTempX = _mm_add_ps(vTempX, vTempZ);

                _mm_storeu_ps(reinterpret_cast<float*>(pOutputVector), vTempX);
                pOutputVector += OutputStride;
            }
        }
        else
        {
            // Unaligned input, unaligned output
            for (size_t i = 0; i < VectorCount; i++)
            {
                __m128 V = _mm_loadu_ps(reinterpret_cast<const float*>(pInputVector));
                pInputVector += InputStride;

                XMVECTOR vTempX = XM_PERMUTE_PS(V, _MM_SHUFFLE(0, 0, 0, 0));
                XMVECTOR vTempY = XM_PERMUTE_PS(V, _MM_SHUFFLE(1, 1, 1, 1));
                XMVECTOR vTempZ = XM_PERMUTE_PS(V, _MM_SHUFFLE(2, 2, 2, 2));
                XMVECTOR vTempW = XM_PERMUTE_PS(V, _MM_SHUFFLE(3, 3, 3, 3));

                vTempX = _mm_mul_ps(vTempX, row0);
                vTempY = _mm_mul_ps(vTempY, row1);
                vTempZ = _mm_mul_ps(vTempZ, row2);
                vTempW = _mm_mul_ps(vTempW, row3);

                vTempX = _mm_add_ps(vTempX, vTempY);
                vTempZ = _mm_add_ps(vTempZ, vTempW);
                vTempX = _mm_add_ps(vTempX, vTempZ);

                _mm_storeu_ps(reinterpret_cast<float*>(pOutputVector), vTempX);
                pOutputVector += OutputStride;
            }
        }
    }

    XM_SFENCE();

    return pOutputStream;
#endif
}
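
// Usage sketch (illustrative comment, not part of the original header; the array
// size below is an arbitrary example value): transform a tightly packed array of
// XMFLOAT4 values. Keeping both streams 16-byte aligned lets the SSE path use the
// aligned-load and streaming-store branch above.
//
//     XMFLOAT4 src[64];
//     XMFLOAT4 dst[64];
//     // ... fill src ...
//     XMMATRIX m = XMMatrixRotationZ(XM_PIDIV2);
//     XMVector4TransformStream(dst, sizeof(XMFLOAT4),
//                              src, sizeof(XMFLOAT4),
//                              64, m);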

/****************************************************************************
 *
 * XMVECTOR operators
 *
 ****************************************************************************/

#ifndef _XM_NO_XMVECTOR_OVERLOADS_

//------------------------------------------------------------------------------

inline XMVECTOR XM_CALLCONV operator+ (FXMVECTOR V)
{
    return V;
}

//------------------------------------------------------------------------------

inline XMVECTOR XM_CALLCONV operator- (FXMVECTOR V)
{
    return XMVectorNegate(V);
}

//------------------------------------------------------------------------------

inline XMVECTOR& XM_CALLCONV operator+=
(
    XMVECTOR& V1,
    FXMVECTOR V2
)
{
    V1 = XMVectorAdd(V1, V2);
    return V1;
}

//------------------------------------------------------------------------------

inline XMVECTOR& XM_CALLCONV operator-=
(
    XMVECTOR& V1,
    FXMVECTOR V2
)
{
    V1 = XMVectorSubtract(V1, V2);
    return V1;
}

//------------------------------------------------------------------------------

inline XMVECTOR& XM_CALLCONV operator*=
(
    XMVECTOR& V1,
    FXMVECTOR V2
)
{
    V1 = XMVectorMultiply(V1, V2);
    return V1;
}

//------------------------------------------------------------------------------

inline XMVECTOR& XM_CALLCONV operator/=
(
    XMVECTOR& V1,
    FXMVECTOR V2
)
{
    V1 = XMVectorDivide(V1, V2);
    return V1;
}

//------------------------------------------------------------------------------

inline XMVECTOR& operator*=
(
    XMVECTOR& V,
    const float S
)
{
    V = XMVectorScale(V, S);
    return V;
}

//------------------------------------------------------------------------------

inline XMVECTOR& operator/=
(
    XMVECTOR& V,
    const float S
)
{
    XMVECTOR vS = XMVectorReplicate(S);
    V = XMVectorDivide(V, vS);
    return V;
}
//------------------------------------------------------------------------------

inline XMVECTOR XM_CALLCONV operator+
(
    FXMVECTOR V1,
    FXMVECTOR V2
)
{
    return XMVectorAdd(V1, V2);
}

//------------------------------------------------------------------------------

inline XMVECTOR XM_CALLCONV operator-
(
    FXMVECTOR V1,
    FXMVECTOR V2
)
{
    return XMVectorSubtract(V1, V2);
}

//------------------------------------------------------------------------------

inline XMVECTOR XM_CALLCONV operator*
(
    FXMVECTOR V1,
    FXMVECTOR V2
)
{
    return XMVectorMultiply(V1, V2);
}

//------------------------------------------------------------------------------

inline XMVECTOR XM_CALLCONV operator/
(
    FXMVECTOR V1,
    FXMVECTOR V2
)
{
    return XMVectorDivide(V1, V2);
}

//------------------------------------------------------------------------------

inline XMVECTOR XM_CALLCONV operator*
(
    FXMVECTOR V,
    const float S
)
{
    return XMVectorScale(V, S);
}

//------------------------------------------------------------------------------

inline XMVECTOR XM_CALLCONV operator/
(
    FXMVECTOR V,
    const float S
)
{
    XMVECTOR vS = XMVectorReplicate(S);
    return XMVectorDivide(V, vS);
}

//------------------------------------------------------------------------------

inline XMVECTOR XM_CALLCONV operator*
(
    float S,
    FXMVECTOR V
)
{
    return XMVectorScale(V, S);
}

#endif /* !_XM_NO_XMVECTOR_OVERLOADS_ */
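
// Usage sketch (illustrative comment, not part of the original header): when
// _XM_NO_XMVECTOR_OVERLOADS_ is not defined, the overloads above let vector
// expressions be written infix instead of through the XMVector* calls they
// forward to. All arithmetic is component-wise.
//
//     XMVECTOR a = XMVectorSet(1.0f, 2.0f, 3.0f, 4.0f);
//     XMVECTOR b = XMVectorSet(0.5f, 0.5f, 0.5f, 0.5f);
//     XMVECTOR c = (a + b) * 2.0f - a / b;
//     c *= 0.25f;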

#if defined(_XM_NO_INTRINSICS_)
#undef XMISNAN
#undef XMISINF
#endif

#if defined(_XM_SSE_INTRINSICS_)
#undef XM3UNPACK3INTO4
#undef XM3PACK4INTO3
#endif