//===================================================================================
// Copyright (c) 2021 Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.
//
//==================================================================================
//----------------------------------------------------------------------------------
// File: BC7Encode.hlsl
//
// The Compute Shader for BC7 Encoder
//
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
//----------------------------------------------------------------------------------
#ifdef ASPM_GPU
#pragma warning(disable : 3078) // "loop control variable conflicts with a previous declaration in the outer scope"
#else // using CPU
#include "common_def.h"
#include "bcn_common_api.h"
#include <algorithm>
#include <cstring> // memset/memcpy used by the HLSL host simulation
#endif
// TryMode456CS
#define ENABLE_MODE4
#define ENABLE_MODE5
#define ENABLE_MODE6
// TryMode02CS
#define ENABLE_MODE0
#define ENABLE_MODE2
// TryMode137CS
#define ENABLE_MODE1
#define ENABLE_MODE3
#define ENABLE_MODE7
//#define ENABLE_CMP_MODE0
//#define ENABLE_CMP_MODE1
//#define ENABLE_CMP_MODE2
//#define ENABLE_CMP_MODE3
//#define ENABLE_CMP_MODE4
//#define ENABLE_CMP_MODE5
#define ENABLE_CMP_MODE6
//#define ENABLE_CMP_MODE7
#define ENABLE_CMP_API
#define USE_NEW_SP_ERR_IDX
#define ENABLE_CMP_REFINE_MODE6_API // API to improve mode 6 quality
#define MAX_TRY_SHAKER 1            // used in cmp_ep_shaker
//====================================================================================
// HLSL Host Simulation
//====================================================================================
// Simulating the HLSL compute code on a CPU host must run single threaded.
// On the CPU the code simulates a single compute unit as used by the CMP DXC host.
// Enable SIMULATE_GPU to run the simulation on the CPU using HPC in the CMP GUI or CMP CLI.
// Note: some bcn_encode_kernel.cpp files have specific code to simulate with; enable
// the define USE_NEW_SINGLE_HEADER_INTERFACES and pick the external or local codec
// to run with.
//===========================================================================
// Prototype to debug a simple simulation of the shader using shared global
// data, run as a single thread on the CPU
// #define SIMULATE_GPU
//===========================================================================
#if !defined(ASPM_GPU)
#define THREAD_GROUP_SIZE 64
#define BLOCK_SIZE_X 4
#define BLOCK_SIZE_Y 4
#define MAX_UINT 0xFFFFFFFF
#define MIN_UINT 0x00000000
// Source Texture to process
// Texture2D g_Input;
// Normalized 0..1
struct Texture2D
{
    CGU_Vec4f Texture[16];
    CGU_Vec4f Load(CGU_Vec3ui index)
    {
        CGU_INT offset;
        offset = (index.x + (index.y * 4)) & 0x0F;
        return Texture[offset];
    };
    CGU_Vec4f Load(CGU_Vec3ui index, CGU_UINT32 z)
    {
        CMP_UNUSED(z);
        CGU_INT offset;
        offset = (index.x + (index.y * 4)) & 0x0F;
        return Texture[offset];
    };
    // Ignoring z in Texture2D load
    CGU_Vec4ui Load(CGU_Vec4ui index)
    {
        CGU_INT offset;
        offset = (index.x + (index.y * 4)) & 0x0F;
        // implicit conversion of float to uint
        CGU_Vec4ui res;
        res.x = Texture[offset].x;
        res.y = Texture[offset].y;
        res.z = Texture[offset].z;
        res.w = Texture[offset].w;
        return res;
    };
};
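// A minimal usage sketch (added illustration, not in the original source):
// the simulated Texture2D wraps a single 4x4 block in row-major order, so
// Load() maps a texel coordinate (x, y) to linear offset (x + y*4) & 0x0F.
//
//     Texture2D tex;                   // holds one 4x4 block
//     CGU_Vec3ui coord = {2, 3, 0};    // x = 2, y = 3, z (mip) ignored
//     CGU_Vec4f px = tex.Load(coord);  // reads tex.Texture[14]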
// matches GPU struct in HLSL
struct BufferShared
{
    CGU_Vec4ui pixel;
    CGU_UINT32 error;
    CGU_UINT32 mode;
    CGU_UINT32 partition;
    CGU_UINT32 index_selector;
    CGU_UINT32 rotation;
    CGU_UINT32 pbit;
    CGU_Vec4ui endPoint_low;
    CGU_Vec4ui endPoint_high;
    CGU_Vec4ui endPoint_low_quantized;
    CGU_Vec4ui endPoint_high_quantized;
    CGU_UINT32 colorindex;
    CGU_UINT32 alphaindex;
};
struct SharedIOData
{
    CGU_UINT32 error;
    CGU_UINT32 mode;
    CGU_UINT32 index_selector;
    CGU_UINT32 rotation;
    CGU_UINT32 partition;
    CGU_Vec4ui data2;
};
CMP_STATIC BufferShared shared_temp[THREAD_GROUP_SIZE];
CMP_STATIC Texture2D g_Input;
// cbuffer input: on the CPU we use 1 block
CMP_STATIC CGU_UINT32 g_tex_width; // Not used in HLSLHost simulation code
CMP_STATIC CGU_UINT32 g_num_block_x = 1;
CMP_STATIC CGU_UINT32 g_format; // Not used in HLSLHost simulation code
CMP_STATIC CGU_UINT32 g_mode_id = 1;
CMP_STATIC CGU_UINT32 g_start_block_id = 0;
CMP_STATIC CGU_UINT32 g_num_total_blocks;
CMP_STATIC CGU_FLOAT g_alpha_weight = 1.0f;
CMP_STATIC CGU_FLOAT g_quality = 1.0f;
CMP_STATIC SharedIOData g_InBuff[THREAD_GROUP_SIZE];
CMP_STATIC CGU_Vec4ui g_OutBuff[THREAD_GROUP_SIZE];    // Used by EncodeBlocks & TryMode...
CMP_STATIC SharedIOData g_OutBuff1[THREAD_GROUP_SIZE]; // Used by TryMode...
// Forward declarations
void TryMode456CS(CGU_UINT32 GI CMP_SVGROUPINDEX, CGU_Vec3ui groupID CMP_SVGROUPID);
void TryMode137CS(CGU_UINT32 GI CMP_SVGROUPINDEX, CGU_Vec3ui groupID CMP_SVGROUPID);
void TryMode02CS(CGU_UINT32 GI CMP_SVGROUPINDEX, CGU_Vec3ui groupID CMP_SVGROUPID);
void EncodeBlocks(CGU_UINT32 GI CMP_SVGROUPINDEX, CGU_Vec3ui groupID CMP_SVGROUPID);
CMP_STATIC void HLSLHost(CGU_Vec4f image_src[16])
{
    //====================================
    // Simulate a single block CS
    //====================================
    // Load image_src, normalized to 0..1
    for (CGU_INT i = 0; i < 16; i++)
    {
        g_Input.Texture[i].x = image_src[i].x / 255.0f;
        g_Input.Texture[i].y = image_src[i].y / 255.0f;
        g_Input.Texture[i].z = image_src[i].z / 255.0f;
        g_Input.Texture[i].w = image_src[i].w / 255.0f;
    }
    // Init global buffers for first-time use
    for (CGU_INT i = 0; i < THREAD_GROUP_SIZE; i++)
    {
        memset(&shared_temp[i], 0, sizeof(BufferShared));
        memset(&g_InBuff[i], 0, sizeof(SharedIOData));
        memset(&g_OutBuff1[i], 0, sizeof(SharedIOData));
    }
    // First shader call
    CGU_Vec3ui SV_GroupID = {0, 0, 0}; // from Dispatch(n, 1, 1), where n = number of 4x4 blocks in the image
    CGU_Vec3ui SV_GroupThreadID = {0, 0, 0};
    g_start_block_id = 0;
    // // Global group memory sync for pixel
    // for (CGU_INT i = 0; i < 16; i++)
    // {
    //     CGU_Vec4f px = g_Input.Load(CGU_Vec3ui(i % 4, i / 4, 0));
    //     px = cmp_clampVec4f(px * 255.0f, 0.0f, 255.0f);
    //     //printf("in px[%2d] %3.0f %3.0f %3.0f\n", i, px.x, px.y, px.z);
    //     shared_temp[i].pixel.r = (CGU_UINT32)px.r;
    //     shared_temp[i].pixel.g = (CGU_UINT32)px.g;
    //     shared_temp[i].pixel.b = (CGU_UINT32)px.b;
    //     shared_temp[i].pixel.a = (CGU_UINT32)px.a;
    // }
    g_mode_id = 6;
    for (CGU_INT SV_GroupIndex = 15; SV_GroupIndex >= 0; SV_GroupIndex--)
    {
        TryMode456CS(SV_GroupIndex, SV_GroupID);
    }
    // Return OutBuff back to InBuff for next CS use
    for (CGU_INT i = 0; i < THREAD_GROUP_SIZE; i++)
    {
        memcpy(&g_InBuff[i], &g_OutBuff1[i], sizeof(SharedIOData));
    }
    // Global group memory sync for pixel
    //for (CGU_INT i = 0; i < 16; i++)
    //{
    //    CGU_Vec4f px = g_Input.Load(CGU_Vec3ui(i % 4, i / 4, 0));
    //    px = cmp_clampVec4f(px * 255.0f, 0.0f, 255.0f);
    //    shared_temp[i].pixel.r = (CGU_UINT32)px.r;
    //    shared_temp[i].pixel.g = (CGU_UINT32)px.g;
    //    shared_temp[i].pixel.b = (CGU_UINT32)px.b;
    //    shared_temp[i].pixel.a = (CGU_UINT32)px.a;
    //}
    // Next shader call
    g_mode_id = 1;
    for (CGU_INT SV_GroupIndex = 63; SV_GroupIndex >= 0; SV_GroupIndex--)
    {
        TryMode137CS(SV_GroupIndex, SV_GroupID);
    }
    // Return OutBuff back to InBuff for next shader call
    for (CGU_INT i = 0; i < THREAD_GROUP_SIZE; i++)
    {
        memcpy(&g_InBuff[i], &g_OutBuff1[i], sizeof(SharedIOData));
    }
    // Final shader call
    for (CGU_INT SV_GroupIndex = 15; SV_GroupIndex >= 0; SV_GroupIndex--)
    {
        EncodeBlocks(SV_GroupIndex, SV_GroupID);
    }
}
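// A minimal driver sketch (an assumption, not in the original source): feed
// HLSLHost() one 4x4 block of 8-bit RGBA values (0..255, stored as floats);
// it runs the simulated passes (TryMode456CS, TryMode137CS, EncodeBlocks),
// and the compressed 128-bit BC7 block is expected in g_OutBuff[0].
//
//     CGU_Vec4f block[16];
//     for (CGU_INT i = 0; i < 16; i++)
//         block[i] = src_rgba8[i];   // src_rgba8: hypothetical source pixels
//     HLSLHost(block);
//     CGU_Vec4ui bc7 = g_OutBuff[0]; // 4 x 32 bits = one BC7 block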
#endif
#ifdef ENABLE_CMP_API
// Change this to CGU_Vec4ui par_vectors42_nd[4][2];
CMP_STATIC CMP_CONSTANT CGU_UINT32 par_vectors42_nd[4][2][4] = {
    // type = 2
    {{0, 0, 0, 0}, {0, 0, 0, 0}}, // 0 {0,0}
    {{0, 0, 0, 0}, {1, 1, 1, 1}}, // 1 {0,1}
    {{1, 1, 1, 1}, {0, 0, 0, 0}}, // 2 {1,0}
    {{1, 1, 1, 1}, {1, 1, 1, 1}}  // 3 {1,1}
};
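// Note: entry k of par_vectors42_nd encodes the endpoint p-bit pair
// {k >> 1, k & 1} broadcast across all four channels, i.e. the four
// combinations {0,0}, {0,1}, {1,0}, {1,1} listed in the row comments.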
#define COMP_RED 0
#define COMP_GREEN 1
#define COMP_BLUE 2
#define COMP_ALPHA 3
typedef struct
{
    CGU_UINT32 numPartitionModes;
    CGU_UINT32 maxSubSets;
    CGU_UINT32 channels3or4;
    CGU_UINT32 bits;
    CGU_UINT32 clusters;
    CGU_UINT32 componentBits;
    CGU_UINT32 partitionBits;
    CGU_UINT32 indexBits;
} MODESETTINGS;
CMP_STATIC CMP_CONSTANT MODESETTINGS g_modesettings[8] = {
    // numPartitionModes, maxSubSets, channels3or4, bits, clusters, componentBits, partitionBits, indexBits
    {16, 3, 3, 26,  8, 4, 4, 3}, // Mode 0
    {64, 2, 3, 37,  8, 6, 6, 3}, // Mode 1
    {64, 3, 3, 30,  4, 5, 6, 2}, // Mode 2
    {64, 2, 3, 44,  4, 7, 6, 2}, // Mode 3
    { 0, 0, 0,  0,  0, 0, 0, 2}, // Mode 4
    { 0, 0, 0,  0,  0, 0, 0, 2}, // Mode 5
    { 0, 0, 4, 58, 16, 7, 0, 4}, // Mode 6
    {64, 2, 4, 42,  4, 5, 6, 2}  // Mode 7
};
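// Sanity check on the table above: for each mode, bits equals the per-subset
// endpoint payload componentBits * channels3or4 * 2 endpoints, plus that
// mode's p-bits (2 per subset in modes 0, 3, 6 and 7; 1 shared p-bit in
// mode 1; none in mode 2). E.g. mode 6: 7 * 4 * 2 + 2 = 58.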
#ifndef ASPM_HLSL //=======================================================
CMP_STATIC CMP_CONSTANT CGU_UINT32 subset_mask_table2[128] = {
    // 2 subset region patterns
    0x0000CCCCu, // 0  1100 1100 1100 1100 (MSB..LSB)
    0x00008888u, // 1  1000 1000 1000 1000
    0x0000EEEEu, // 2  1110 1110 1110 1110
    0x0000ECC8u, // 3  1110 1100 1100 1000
    0x0000C880u, // 4  1100 1000 1000 0000
    0x0000FEECu, // 5  1111 1110 1110 1100
    0x0000FEC8u, // 6  1111 1110 1100 1000
    0x0000EC80u, // 7  1110 1100 1000 0000
    0x0000C800u, // 8  1100 1000 0000 0000
    0x0000FFECu, // 9  1111 1111 1110 1100
    0x0000FE80u, // 10 1111 1110 1000 0000
    0x0000E800u, // 11 1110 1000 0000 0000
    0x0000FFE8u, // 12 1111 1111 1110 1000
    0x0000FF00u, // 13 1111 1111 0000 0000
    0x0000FFF0u, // 14 1111 1111 1111 0000
    0x0000F000u, // 15 1111 0000 0000 0000
    0x0000F710u, // 16 1111 0111 0001 0000
    0x0000008Eu, // 17 0000 0000 1000 1110
    0x00007100u, // 18 0111 0001 0000 0000
    0x000008CEu, // 19 0000 1000 1100 1110
    0x0000008Cu, // 20 0000 0000 1000 1100
    0x00007310u, // 21 0111 0011 0001 0000
    0x00003100u, // 22 0011 0001 0000 0000
    0x00008CCEu, // 23 1000 1100 1100 1110
    0x0000088Cu, // 24 0000 1000 1000 1100
    0x00003110u, // 25 0011 0001 0001 0000
    0x00006666u, // 26 0110 0110 0110 0110
    0x0000366Cu, // 27 0011 0110 0110 1100
    0x000017E8u, // 28 0001 0111 1110 1000
    0x00000FF0u, // 29 0000 1111 1111 0000
    0x0000718Eu, // 30 0111 0001 1000 1110
    0x0000399Cu, // 31 0011 1001 1001 1100
    0x0000AAAAu, // 32 1010 1010 1010 1010
    0x0000F0F0u, // 33 1111 0000 1111 0000
    0x00005A5Au, // 34 0101 1010 0101 1010
    0x000033CCu, // 35 0011 0011 1100 1100
    0x00003C3Cu, // 36 0011 1100 0011 1100
    0x000055AAu, // 37 0101 0101 1010 1010
    0x00009696u, // 38 1001 0110 1001 0110
    0x0000A55Au, // 39 1010 0101 0101 1010
    0x000073CEu, // 40 0111 0011 1100 1110
    0x000013C8u, // 41 0001 0011 1100 1000
    0x0000324Cu, // 42 0011 0010 0100 1100
    0x00003BDCu, // 43 0011 1011 1101 1100
    0x00006996u, // 44 0110 1001 1001 0110
    0x0000C33Cu, // 45 1100 0011 0011 1100
    0x00009966u, // 46 1001 1001 0110 0110
    0x00000660u, // 47 0000 0110 0110 0000
    0x00000272u, // 48 0000 0010 0111 0010
    0x000004E4u, // 49 0000 0100 1110 0100
    0x00004E40u, // 50 0100 1110 0100 0000
    0x00002720u, // 51 0010 0111 0010 0000
    0x0000C936u, // 52 1100 1001 0011 0110
    0x0000936Cu, // 53 1001 0011 0110 1100
    0x000039C6u, // 54 0011 1001 1100 0110
    0x0000639Cu, // 55 0110 0011 1001 1100
    0x00009336u, // 56 1001 0011 0011 0110
    0x00009CC6u, // 57 1001 1100 1100 0110
    0x0000817Eu, // 58 1000 0001 0111 1110
    0x0000E718u, // 59 1110 0111 0001 1000
    0x0000CCF0u, // 60 1100 1100 1111 0000
    0x00000FCCu, // 61 0000 1111 1100 1100
    0x00007744u, // 62 0111 0111 0100 0100
    0x0000EE22u, // 63 1110 1110 0010 0010
    // 3 subset region patterns
    0xF60008CCu, // 0  1111 0110 0000 0000 : 0000 1000 1100 1100 = 2222122011001100 (MSB...LSB)
    0x73008CC8u, // 1  0111 0011 0000 0000 : 1000 1100 1100 1000 = 1222112211001000
    0x3310CC80u, // 2  0011 0011 0001 0000 : 1100 1100 1000 0000 = 1122112210020000
    0x00CEEC00u, // 3  0000 0000 1100 1110 : 1110 1100 0000 0000 = 1110110022002220
    0xCC003300u, // 4  1100 1100 0000 0000 : 0011 0011 0000 0000 = 2211221100000000
    0xCC0000CCu, // 5  1100 1100 0000 0000 : 0000 0000 1100 1100 = 2200220011001100
    0x00CCFF00u, // 6  0000 0000 1100 1100 : 1111 1111 0000 0000 = 1111111122002200
    0x3300CCCCu, // 7  0011 0011 0000 0000 : 1100 1100 1100 1100 = 1122112211001100
    0xF0000F00u, // 8  1111 0000 0000 0000 : 0000 1111 0000 0000 = 2222111100000000
    0xF0000FF0u, // 9  1111 0000 0000 0000 : 0000 1111 1111 0000 = 2222111111110000
    0xFF0000F0u, // 10 1111 1111 0000 0000 : 0000 0000 1111 0000 = 2222222211110000
    0x88884444u, // 11 1000 1000 1000 1000 : 0100 0100 0100 0100 = 2100210021002100
    0x88886666u, // 12 1000 1000 1000 1000 : 0110 0110 0110 0110 = 2110211021102110
    0xCCCC2222u, // 13 1100 1100 1100 1100 : 0010 0010 0010 0010 = 2210221022102210
    0xEC80136Cu, // 14 1110 1100 1000 0000 : 0001 0011 0110 1100 = 2221221121101100
    0x7310008Cu, // 15 0111 0011 0001 0000 : 0000 0000 1000 1100 = 0222002210021100
    0xC80036C8u, // 16 1100 1000 0000 0000 : 0011 0110 1100 1000 = 2211211011001000
    0x310008CEu, // 17 0011 0001 0000 0000 : 0000 1000 1100 1110 = 0022100211001110
    0xCCC03330u, // 18 1100 1100 1100 0000 : 0011 0011 0011 0000 = 2211221122110000
    0x0CCCF000u, // 19 0000 1100 1100 1100 : 1111 0000 0000 0000 = 1111220022002200
    0xEE0000EEu, // 20 1110 1110 0000 0000 : 0000 0000 1110 1110 = 2220222011101110
    0x77008888u, // 21 0111 0111 0000 0000 : 1000 1000 1000 1000 = 1222122210001000
    0xCC0022C0u, // 22 1100 1100 0000 0000 : 0010 0010 1100 0000 = 2210221011000000
    0x33004430u, // 23 0011 0011 0000 0000 : 0100 0100 0011 0000 = 0122012200110000
    0x00CC0C22u, // 24 0000 0000 1100 1100 : 0000 1100 0010 0010 = 0000110022102210
    0xFC880344u, // 25 1111 1100 1000 1000 : 0000 0011 0100 0100 = 2222221121002100
    0x06606996u, // 26 0000 0110 0110 0000 : 0110 1001 1001 0110 = 0110122112210110
    0x66009960u, // 27 0110 0110 0000 0000 : 1001 1001 0110 0000 = 1221122101100000
    0xC88C0330u, // 28 1100 1000 1000 1100 : 0000 0011 0011 0000 = 2200201120112200
    0xF9000066u, // 29 1111 1001 0000 0000 : 0000 0000 0110 0110 = 2222200201100110
    0x0CC0C22Cu, // 30 0000 1100 1100 0000 : 1100 0010 0010 1100 = 1100221022101100
    0x73108C00u, // 31 0111 0011 0001 0000 : 1000 1100 0000 0000 = 1222112200020000
    0xEC801300u, // 32 1110 1100 1000 0000 : 0001 0011 0000 0000 = 2221221120000000
    0x08CEC400u, // 33 0000 1000 1100 1110 : 1100 0100 0000 0000 = 1100210022002220
    0xEC80004Cu, // 34 1110 1100 1000 0000 : 0000 0000 0100 1100 = 2220220021001100
    0x44442222u, // 35 0100 0100 0100 0100 : 0010 0010 0010 0010 = 0210021002100210
    0x0F0000F0u, // 36 0000 1111 0000 0000 : 0000 0000 1111 0000 = 0000222211110000
    0x49242492u, // 37 0100 1001 0010 0100 : 0010 0100 1001 0010 = 0210210210210210
    0x42942942u, // 38 0100 0010 1001 0100 : 0010 1001 0100 0010 = 0210102121020210
    0x0C30C30Cu, // 39 0000 1100 0011 0000 : 1100 0011 0000 1100 = 1100221100221100
    0x03C0C03Cu, // 40 0000 0011 1100 0000 : 1100 0000 0011 1100 = 1100002222111100
    0xFF0000AAu, // 41 1111 1111 0000 0000 : 0000 0000 1010 1010 = 2222222210101010
    0x5500AA00u, // 42 0101 0101 0000 0000 : 1010 1010 0000 0000 = 1212121200000000
    0xCCCC3030u, // 43 1100 1100 1100 1100 : 0011 0000 0011 0000 = 2211220022112200
    0x0C0CC0C0u, // 44 0000 1100 0000 1100 : 1100 0000 1100 0000 = 1100220011002200
    0x66669090u, // 45 0110 0110 0110 0110 : 1001 0000 1001 0000 = 1221022012210220
    0x0FF0A00Au, // 46 0000 1111 1111 0000 : 1010 0000 0000 1010 = 1010222222221010
    0x5550AAA0u, // 47 0101 0101 0101 0000 : 1010 1010 1010 0000 = 1212121212120000
    0xF0000AAAu, // 48 1111 0000 0000 0000 : 0000 1010 1010 1010 = 2222101010101010
    0x0E0EE0E0u, // 49 0000 1110 0000 1110 : 1110 0000 1110 0000 = 1110222011102220
    0x88887070u, // 50 1000 1000 1000 1000 : 0111 0000 0111 0000 = 2111200021112000
    0x99906660u, // 51 1001 1001 1001 0000 : 0110 0110 0110 0000 = 2112211221120000
    0xE00E0EE0u, // 52 1110 0000 0000 1110 : 0000 1110 1110 0000 = 2220111011102220
    0x88880770u, // 53 1000 1000 1000 1000 : 0000 0111 0111 0000 = 2000211121112000
    0xF0000666u, // 54 1111 0000 0000 0000 : 0000 0110 0110 0110 = 2222011001100110
    0x99006600u, // 55 1001 1001 0000 0000 : 0110 0110 0000 0000 = 2112211200000000
    0xFF000066u, // 56 1111 1111 0000 0000 : 0000 0000 0110 0110 = 2222222201100110
    0xC00C0CC0u, // 57 1100 0000 0000 1100 : 0000 1100 1100 0000 = 2200110011002200
    0xCCCC0330u, // 58 1100 1100 1100 1100 : 0000 0011 0011 0000 = 2200221122112200
    0x90006000u, // 59 1001 0000 0000 0000 : 0110 0000 0000 0000 = 2112000000000000
    0x08088080u, // 60 0000 1000 0000 1000 : 1000 0000 1000 0000 = 1000200010002000
    0xEEEE1010u, // 61 1110 1110 1110 1110 : 0001 0000 0001 0000 = 2221222022212220
    0xFFF0000Au, // 62 1111 1111 1111 0000 : 0000 0000 0000 1010 = 2222222222221010
    0x731008CEu, // 63 0111 0011 0001 0000 : 0000 1000 1100 1110 = 0222102211021110
};
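// A minimal decoding sketch (an assumption inferred from the digit strings
// above, not a function from the original source): the 64 two-subset patterns
// keep each pixel's subset in the low 16 bits (bit i = pixel i); the
// three-subset patterns add a second bit plane in the high 16 bits, giving
// subset = 2*hi + lo.
//
//     CGU_UINT32 cmp_subset_of_pixel(CGU_UINT32 mask, CGU_UINT32 pixel) // hypothetical helper
//     {
//         CGU_UINT32 lo = (mask >> pixel) & 1;        // 2-subset bit plane
//         CGU_UINT32 hi = (mask >> (pixel + 16)) & 1; // extra 3-subset bit plane
//         return (hi << 1) | lo;                      // subset 0, 1 or 2
//     }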
CMP_STATIC CMP_CONSTANT CGU_UINT8 cmp_npv_nd[2][8] = {
    {1, 2, 4, 8, 16, 32, 0, 0}, // 3
    {1, 2, 4, 0, 0, 0, 0, 0}    // 4
};
CMP_STATIC CMP_CONSTANT CGU_UINT8 cmp_par_vectors_nd[2][8][64][2][4] = {
    {
        // 3D
        {{{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}},
        {{{0, 0, 0, 0}, {0, 0, 0, 0}}, {{1, 1, 1, 0}, {1, 1, 1, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}},
        {{{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {1, 1, 1, 0}}, {{1, 1, 1, 0}, {0, 0, 0, 0}}, {{1, 1, 1, 0}, {1, 1, 1, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}},
        {{{0, 0, 0, 0}, {0, 0, 0, 0}}, {{1, 1, 0, 0}, {1, 1, 0, 0}}, {{1, 0, 1, 0}, {1, 0, 1, 0}}, {{0, 1, 1, 0}, {0, 1, 1, 0}}, {{0, 0, 0, 0}, {1, 1, 1, 0}},
         {{1, 1, 1, 0}, {0, 0, 0, 0}}, {{0, 1, 0, 0}, {0, 1, 0, 0}}, {{1, 1, 1, 0}, {1, 1, 1, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}},
        {{{0, 0, 0, 0}, {0, 0, 0, 0}}, {{1, 1, 0, 0}, {0, 0, 0, 0}}, {{1, 0, 1, 0}, {0, 0, 0, 0}}, {{0, 1, 1, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {1, 1, 0, 0}},
         {{1, 1, 0, 0}, {1, 1, 0, 0}}, {{1, 0, 1, 0}, {1, 1, 0, 0}}, {{0, 1, 1, 0}, {1, 1, 0, 0}}, {{0, 0, 0, 0}, {1, 0, 1, 0}}, {{1, 1, 0, 0}, {1, 0, 1, 0}},
         {{1, 0, 1, 0}, {1, 0, 1, 0}}, {{0, 1, 1, 0}, {1, 0, 1, 0}}, {{0, 0, 0, 0}, {0, 1, 1, 0}}, {{1, 1, 0, 0}, {0, 1, 1, 0}}, {{1, 0, 1, 0}, {0, 1, 1, 0}},
         {{0, 1, 1, 0}, {0, 1, 1, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}},
        {{{0, 0, 0, 0}, {0, 0, 0, 0}}, {{1, 1, 0, 0}, {0, 0, 0, 0}}, {{1, 0, 1, 0}, {0, 0, 0, 0}}, {{0, 1, 1, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {1, 1, 0, 0}},
         {{1, 1, 0, 0}, {1, 1, 0, 0}}, {{1, 0, 1, 0}, {1, 1, 0, 0}}, {{0, 1, 1, 0}, {1, 1, 0, 0}}, {{0, 0, 0, 0}, {1, 0, 1, 0}}, {{1, 1, 0, 0}, {1, 0, 1, 0}},
         {{1, 0, 1, 0}, {1, 0, 1, 0}}, {{0, 1, 1, 0}, {1, 0, 1, 0}}, {{0, 0, 0, 0}, {0, 1, 1, 0}}, {{1, 1, 0, 0}, {0, 1, 1, 0}}, {{1, 0, 1, 0}, {0, 1, 1, 0}},
         {{0, 1, 1, 0}, {0, 1, 1, 0}}, {{1, 0, 0, 0}, {1, 1, 1, 0}}, {{0, 1, 0, 0}, {1, 1, 1, 0}}, {{0, 0, 1, 0}, {1, 1, 1, 0}}, {{1, 1, 1, 0}, {1, 1, 1, 0}},
         {{1, 0, 0, 0}, {0, 0, 1, 0}}, {{0, 1, 0, 0}, {0, 0, 1, 0}}, {{0, 0, 1, 0}, {0, 0, 1, 0}}, {{1, 1, 1, 0}, {0, 0, 1, 0}}, {{1, 0, 0, 0}, {1, 0, 0, 0}},
         {{0, 1, 0, 0}, {1, 0, 0, 0}}, {{0, 0, 1, 0}, {1, 0, 0, 0}}, {{1, 1, 1, 0}, {1, 0, 0, 0}}, {{1, 0, 0, 0}, {0, 1, 0, 0}}, {{0, 1, 0, 0}, {0, 1, 0, 0}},
         {{0, 0, 1, 0}, {0, 1, 0, 0}}, {{1, 1, 1, 0}, {0, 1, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}},
        {{{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}},
        {{{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}},
    },
    {
        // 4D
        {{{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
  521. {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
  522. {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
  523. {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
  524. {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
  525. {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
  526. {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
  527. {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}},
  528. {{{0, 0, 0, 0}, {0, 0, 0, 0}}, {{1, 1, 1, 1}, {1, 1, 1, 1}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
  529. {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
  530. {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
  531. {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
  532. {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
  533. {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
  534. {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
  535. {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
  536. {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
  537. {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
  538. {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
  539. {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
  540. {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}},
  541. {{{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {1, 1, 1, 1}}, {{1, 1, 1, 1}, {0, 0, 0, 0}}, {{1, 1, 1, 1}, {1, 1, 1, 1}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
  542. {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
  543. {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
  544. {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
  545. {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
  546. {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
  547. {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
  548. {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
  549. {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
  550. {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
  551. {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
  552. {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
  553. {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}},
  554. {{{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 1, 1, 1}}, {{0, 1, 1, 1}, {0, 0, 0, 0}}, {{0, 1, 1, 1}, {0, 1, 1, 1}}, {{1, 0, 0, 0}, {1, 0, 0, 0}},
  555. {{1, 0, 0, 0}, {1, 1, 1, 1}}, {{1, 1, 1, 1}, {1, 0, 0, 0}}, {{1, 1, 1, 1}, {1, 1, 1, 1}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
  556. {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
  557. {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
  558. {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
  559. {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
  560. {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
  561. {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
  562. {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
  563. {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
  564. {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
  565. {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
  566. {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}},
  567. {{{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 1, 1, 1}}, {{0, 1, 1, 1}, {0, 0, 0, 0}}, {{0, 1, 1, 1}, {0, 1, 1, 1}}, {{1, 0, 0, 0}, {1, 0, 0, 0}},
  568. {{1, 0, 0, 0}, {1, 1, 1, 1}}, {{1, 1, 1, 1}, {1, 0, 0, 0}}, {{1, 1, 1, 1}, {1, 1, 1, 1}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 1, 1}},
  569. {{0, 0, 1, 1}, {0, 0, 0, 0}}, {{0, 1, 0, 1}, {0, 1, 0, 1}}, {{1, 0, 0, 0}, {1, 0, 0, 0}}, {{1, 0, 0, 0}, {1, 0, 1, 1}}, {{1, 0, 1, 1}, {1, 0, 0, 0}},
  570. {{1, 1, 0, 1}, {1, 1, 0, 1}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
  571. {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
  572. {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
  573. {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
  574. {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
  575. {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
  576. {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
  577. {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
  578. {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
  579. {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}},
  580. {{{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
  581. {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
  582. {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
  583. {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
  584. {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
  585. {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
  586. {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
  587. {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
  588. {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
  589. {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
  590. {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
  591. {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
  592. {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}},
  593. {{{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
  594. {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
  595. {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
  596. {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
  597. {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
  598. {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
  599. {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
  600. {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
  601. {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
  602. {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
  603. {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
  604. {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
  605. {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}},
  606. {{{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
  607. {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
  608. {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
  609. {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
  610. {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
  611. {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
  612. {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
  613. {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
  614. {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
  615. {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
  616. {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
  617. {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
  618. {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}},
  619. },
  620. };
CMP_STATIC CMP_CONSTANT CGU_UINT8 cmp_rampI[3][16] = {
    {0, 21, 43, 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},           // 2 bit index
    {0, 9, 18, 27, 37, 46, 55, 64, 0, 0, 0, 0, 0, 0, 0, 0},        // 3 bit index
    {0, 4, 9, 13, 17, 21, 26, 30, 34, 38, 43, 47, 51, 55, 60, 64}  // 4 bit index
};
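// Each row of cmp_rampI holds the standard BC7 interpolation weights in 1/64
// units; a hardware decoder blends two endpoints as
// ((64 - w) * e0 + w * e1 + 32) >> 6 with w = cmp_rampI[index_bits - 2][index].
// cmp_GetRamp() below computes the equivalent value in floating point.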
// The data is saved as a packed INT = (BC7_FIXUPINDEX1 << 4) + BC7_FIXUPINDEX2
CMP_STATIC CMP_CONSTANT CGU_UINT32 CMPFIXUPINDEX[128] = {
    // 2 subset partitions 0..63
    0xf0u,0xf0u,0xf0u,0xf0u,0xf0u,0xf0u,0xf0u,0xf0u,
    0xf0u,0xf0u,0xf0u,0xf0u,0xf0u,0xf0u,0xf0u,0xf0u,
    0xf0u,0x20u,0x80u,0x20u,0x20u,0x80u,0x80u,0xf0u,
    0x20u,0x80u,0x20u,0x20u,0x80u,0x80u,0x20u,0x20u,
    0xf0u,0xf0u,0x60u,0x80u,0x20u,0x80u,0xf0u,0xf0u,
    0x20u,0x80u,0x20u,0x20u,0x20u,0xf0u,0xf0u,0x60u,
    0x60u,0x20u,0x60u,0x80u,0xf0u,0xf0u,0x20u,0x20u,
    0xf0u,0xf0u,0xf0u,0xf0u,0xf0u,0x20u,0x20u,0xf0u,
    // 3 subset partitions 64..127
    0x3fu,0x38u,0xf8u,0xf3u,0x8fu,0x3fu,0xf3u,0xf8u,
    0x8fu,0x8fu,0x6fu,0x6fu,0x6fu,0x5fu,0x3fu,0x38u,
    0x3fu,0x38u,0x8fu,0xf3u,0x3fu,0x38u,0x6fu,0xa8u,
    0x53u,0x8fu,0x86u,0x6au,0x8fu,0x5fu,0xfau,0xf8u,
    0x8fu,0xf3u,0x3fu,0x5au,0x6au,0xa8u,0x89u,0xfau,
    0xf6u,0x3fu,0xf8u,0x5fu,0xf3u,0xf6u,0xf6u,0xf8u,
    0x3fu,0xf3u,0x5fu,0x5fu,0x5fu,0x8fu,0x5fu,0xafu,
    0x5fu,0xafu,0x8fu,0xdfu,0xf3u,0xcfu,0x3fu,0x38u};
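// Example: CMPFIXUPINDEX[64] is 0x3f, so for the first 3-subset partition the
// cmp_get_fixuptable() helper below yields fixup = {0, 3, 15}: subset 0 is
// always anchored at pixel 0, subset 1 at pixel 3 and subset 2 at pixel 15.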
INLINE void cmp_get_fixuptable(CMP_INOUT CGU_UINT32 fixup[3], CGU_INT part_id)
{
    CGU_UINT32 skip_packed = CMPFIXUPINDEX[part_id];  // gather_int2(FIXUPINDEX, part_id);
    fixup[0] = 0;
    fixup[1] = skip_packed >> 4;
    fixup[2] = skip_packed & 15;
}
INLINE CGU_UINT8 shift_right_epocode2(CMP_IN CGU_UINT8 v, CMP_IN CGU_INT bits)
{
    return v >> bits;  // (perf warning expected)
}
INLINE CGU_UINT8 expand_epocode2(CMP_IN CGU_UINT8 v, CMP_IN CGU_INT bits)
{
    CGU_UINT8 vv = v << (8 - bits);
    return vv + shift_right_epocode2(vv, bits);
}
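// expand_epocode2() widens a 'bits'-wide endpoint to 8 bits by replicating its
// high bits into the vacated low bits; e.g. expand_epocode2(0x16, 5) maps
// 10110b to 10110101b (0xB5).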
INLINE CGV_FLOAT cmp_GetRamp(CMP_IN CGU_INT index_bits,  // ramp bits Valid range 2..4
                             CMP_IN CGU_INT bits,        // Component Valid range 5..8
                             CMP_IN CGU_INT p1,          // 0..255
                             CMP_IN CGU_INT p2,          // 0..255
                             CMP_IN CGU_UINT8 index)
{
    CGU_INT   e1    = expand_epocode2(p1, bits);
    CGU_INT   e2    = expand_epocode2(p2, bits);
    CGV_FLOAT ramp  = cmp_rampI[index_bits - 2][index] / 64.0F;
    CGV_FLOAT rampf = floor(e1 + ramp * (e2 - e1) + 0.5F);
    return rampf;
}
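// Illustrative check (not part of the encoder): cmp_GetRamp(2, 5, 0, 31, 1)
// expands the endpoints to 0 and 255, applies the weight 21/64 = 0.328125,
// and returns floor(0.328125 * 255 + 0.5) = 84.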
#if defined(USE_NEW_SP_ERR_IDX)
#ifndef ASPM_GPU
struct BC7_EncodeRamps2
{
    CGU_INT   ep_d[4][256];
    CGU_UINT8 sp_err[3 * 4 * 256 * 2 * 2 * 16];
    CGU_INT   sp_idx[3 * 4 * 256 * 2 * 2 * 16 * 2];
    CGU_BOOL  ramp_init;
};
BC7_EncodeRamps2 BC7EncodeRamps2;
#define LOG_CL_RANGE2 5
#define LOG_CL_BASE2 2
#define BIT_BASE2 5
#define BIT_RANGE2 9
#define BTT2(bits) (bits - BIT_BASE2)
#define CLT2(cl) (cl - LOG_CL_BASE2)
#define SOURCE_BLOCK_SIZE 16
CMP_CONSTANT CGU_FLOAT rampWeights2[5][SOURCE_BLOCK_SIZE] = {
    {0.000000f,0.000000f,0.000000f,0.000000f,0.000000f,0.000000f,0.000000f,0.000000f,0.000000f,0.000000f,0.000000f,0.000000f,0.000000f,0.000000f,0.000000f,0.000000f},  // 0 bit index
    {0.000000f,1.000000f,0.000000f,0.000000f,0.000000f,0.000000f,0.000000f,0.000000f,0.000000f,0.000000f,0.000000f,0.000000f,0.000000f,0.000000f,0.000000f,0.000000f},  // 1 bit index
    {0.000000f,0.328125f,0.671875f,1.000000f,0.000000f,0.000000f,0.000000f,0.000000f,0.000000f,0.000000f,0.000000f,0.000000f,0.000000f,0.000000f,0.000000f,0.000000f},  // 2 bit index
    {0.000000f,0.140625f,0.281250f,0.421875f,0.578125f,0.718750f,0.859375f,1.000000f,0.000000f,0.000000f,0.000000f,0.000000f,0.000000f,0.000000f,0.000000f,0.000000f},  // 3 bit index
    {0.000000f,0.062500f,0.140625f,0.203125f,0.265625f,0.328125f,0.406250f,0.468750f,0.531250f,0.593750f,0.671875f,0.734375f,0.796875f,0.859375f,0.937500f,1.000000f}   // 4 bit index
};
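// The nonzero rows of rampWeights2 are the cmp_rampI integer weights divided
// by 64 (e.g. the 2-bit row is {0, 21, 43, 64} / 64 = {0, 0.328125, 0.671875, 1}),
// extended with degenerate rows for 0- and 1-bit indices.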
CGU_INT old_expandbits(CGU_INT bits, CGU_INT v)
{
    return (v << (8 - bits) | v >> (2 * bits - 8));
}
void old_init_BC7ramps()
{
    CMP_STATIC CGU_BOOL g_rampsInitialized = FALSE;
    if (g_rampsInitialized == TRUE)
        return;
    g_rampsInitialized = TRUE;
    BC7EncodeRamps2.ramp_init = TRUE;
    //bc7_isa(); ASPM_PRINT((" INIT Ramps\n"));
    CGU_INT bits;
    CGU_INT p1;
    CGU_INT p2;
    CGU_INT clogBC7;
    CGU_INT index;
    CGU_INT j;
    CGU_INT o1;
    CGU_INT o2;
    for (bits = BIT_BASE2; bits < BIT_RANGE2; bits++)
    {
        for (p1 = 0; p1 < (1 << bits); p1++)
        {
            BC7EncodeRamps2.ep_d[BTT2(bits)][p1] = old_expandbits(bits, p1);
        }  //p1
    }  //bits<BIT_RANGE
    for (clogBC7 = LOG_CL_BASE2; clogBC7 < LOG_CL_RANGE2; clogBC7++)
    {
        for (bits = BIT_BASE2; bits < BIT_RANGE2; bits++)
        {
            // SP_ERR_IDX : Init
            for (j = 0; j < 256; j++)
            {
                for (o1 = 0; o1 < 2; o1++)
                {
                    for (o2 = 0; o2 < 2; o2++)
                    {
                        for (index = 0; index < 16; index++)
                        {
                            BC7EncodeRamps2.sp_idx[(CLT2(clogBC7) * 4 * 256 * 2 * 2 * 16 * 2) + (BTT2(bits) * 256 * 2 * 2 * 16 * 2) + (j * 2 * 2 * 16 * 2) +
                                                   (o1 * 2 * 16 * 2) + (o2 * 16 * 2) + (index * 2) + 0] = 0;
                            BC7EncodeRamps2.sp_idx[(CLT2(clogBC7) * 4 * 256 * 2 * 2 * 16 * 2) + (BTT2(bits) * 256 * 2 * 2 * 16 * 2) + (j * 2 * 2 * 16 * 2) +
                                                   (o1 * 2 * 16 * 2) + (o2 * 16 * 2) + (index * 2) + 1] = 255;
                            BC7EncodeRamps2.sp_err[(CLT2(clogBC7) * 4 * 256 * 2 * 2 * 16) + (BTT2(bits) * 256 * 2 * 2 * 16) + (j * 2 * 2 * 16) + (o1 * 2 * 16) +
                                                   (o2 * 16) + index] = 255;
                        }  // i<16
                    }  //o2<2
                }  //o1<2
            }  //j<256
            // SP_ERR_IDX : calc
            for (p1 = 0; p1 < (1 << bits); p1++)
            {
                for (p2 = 0; p2 < (1 << bits); p2++)
                {
                    for (index = 0; index < (1 << clogBC7); index++)
                    {
                        CGV_INT floatf =
                            floor((CGV_FLOAT)BC7EncodeRamps2.ep_d[BTT2(bits)][p1] +
                                  rampWeights2[clogBC7][index] * (CGV_FLOAT)(BC7EncodeRamps2.ep_d[BTT2(bits)][p2] - BC7EncodeRamps2.ep_d[BTT2(bits)][p1]) + 0.5F);
                        BC7EncodeRamps2.sp_idx[(CLT2(clogBC7) * 4 * 256 * 2 * 2 * 16 * 2) + (BTT2(bits) * 256 * 2 * 2 * 16 * 2) + (floatf * 2 * 2 * 16 * 2) +
                                               ((p1 & 0x1) * 2 * 16 * 2) + ((p2 & 0x1) * 16 * 2) + (index * 2) + 0] = p1;
                        BC7EncodeRamps2.sp_idx[(CLT2(clogBC7) * 4 * 256 * 2 * 2 * 16 * 2) + (BTT2(bits) * 256 * 2 * 2 * 16 * 2) + (floatf * 2 * 2 * 16 * 2) +
                                               ((p1 & 0x1) * 2 * 16 * 2) + ((p2 & 0x1) * 16 * 2) + (index * 2) + 1] = p2;
                        // Note: (p2 & 0x1) must be parenthesized; writing "p2 & 0x1 * 16"
                        // binds as p2 & 16 and indexes the wrong sp_err element.
                        BC7EncodeRamps2.sp_err[(CLT2(clogBC7) * 4 * 256 * 2 * 2 * 16) + (BTT2(bits) * 256 * 2 * 2 * 16) + (floatf * 2 * 2 * 16) +
                                               ((p1 & 0x1) * 2 * 16) + ((p2 & 0x1) * 16) + index] = 0;
                    }  //i<(1 << clogBC7)
                }  //p2
            }  //p1<(1 << bits)
            for (j = 0; j < 256; j++)
            {
                for (o1 = 0; o1 < 2; o1++)
                {
                    for (o2 = 0; o2 < 2; o2++)
                    {
                        for (index = 0; index < (1 << clogBC7); index++)
                        {
                            if (  // check for uninitialized sp_idx
                                (BC7EncodeRamps2.sp_idx[(CLT2(clogBC7) * 4 * 256 * 2 * 2 * 16 * 2) + (BTT2(bits) * 256 * 2 * 2 * 16 * 2) + (j * 2 * 2 * 16 * 2) +
                                                        (o1 * 2 * 16 * 2) + (o2 * 16 * 2) + (index * 2) + 0] == 0) &&
                                (BC7EncodeRamps2.sp_idx[(CLT2(clogBC7) * 4 * 256 * 2 * 2 * 16 * 2) + (BTT2(bits) * 256 * 2 * 2 * 16 * 2) + (j * 2 * 2 * 16 * 2) +
                                                        (o1 * 2 * 16 * 2) + (o2 * 16 * 2) + (index * 2) + 1] == 255))
                            {
                                CGU_INT k;
                                CGU_INT tf;
                                CGU_INT tc;
                                for (k = 1; k < 256; k++)
                                {
                                    tf = j - k;
                                    tc = j + k;
                                    if ((tf >= 0 && BC7EncodeRamps2.sp_err[(CLT2(clogBC7) * 4 * 256 * 2 * 2 * 16) + (BTT2(bits) * 256 * 2 * 2 * 16) +
                                                                           (tf * 2 * 2 * 16) + (o1 * 2 * 16) + (o2 * 16) + index] == 0))
                                    {
                                        BC7EncodeRamps2.sp_idx[(CLT2(clogBC7) * 4 * 256 * 2 * 2 * 16 * 2) + (BTT2(bits) * 256 * 2 * 2 * 16 * 2) +
                                                               (j * 2 * 2 * 16 * 2) + (o1 * 2 * 16 * 2) + (o2 * 16 * 2) + (index * 2) + 0] =
                                            BC7EncodeRamps2.sp_idx[(CLT2(clogBC7) * 4 * 256 * 2 * 2 * 16 * 2) + (BTT2(bits) * 256 * 2 * 2 * 16 * 2) +
                                                                   (tf * 2 * 2 * 16 * 2) + (o1 * 2 * 16 * 2) + (o2 * 16 * 2) + (index * 2) + 0];
                                        BC7EncodeRamps2.sp_idx[(CLT2(clogBC7) * 4 * 256 * 2 * 2 * 16 * 2) + (BTT2(bits) * 256 * 2 * 2 * 16 * 2) +
                                                               (j * 2 * 2 * 16 * 2) + (o1 * 2 * 16 * 2) + (o2 * 16 * 2) + (index * 2) + 1] =
                                            BC7EncodeRamps2.sp_idx[(CLT2(clogBC7) * 4 * 256 * 2 * 2 * 16 * 2) + (BTT2(bits) * 256 * 2 * 2 * 16 * 2) +
                                                                   (tf * 2 * 2 * 16 * 2) + (o1 * 2 * 16 * 2) + (o2 * 16 * 2) + (index * 2) + 1];
                                        break;
                                    }
                                    else if ((tc < 256 && BC7EncodeRamps2.sp_err[(CLT2(clogBC7) * 4 * 256 * 2 * 2 * 16) + (BTT2(bits) * 256 * 2 * 2 * 16) +
                                                                                 (tc * 2 * 2 * 16) + (o1 * 2 * 16) + (o2 * 16) + index] == 0))
                                    {
                                        BC7EncodeRamps2.sp_idx[(CLT2(clogBC7) * 4 * 256 * 2 * 2 * 16 * 2) + (BTT2(bits) * 256 * 2 * 2 * 16 * 2) +
                                                               (j * 2 * 2 * 16 * 2) + (o1 * 2 * 16 * 2) + (o2 * 16 * 2) + (index * 2) + 0] =
                                            BC7EncodeRamps2.sp_idx[(CLT2(clogBC7) * 4 * 256 * 2 * 2 * 16 * 2) + (BTT2(bits) * 256 * 2 * 2 * 16 * 2) +
                                                                   (tc * 2 * 2 * 16 * 2) + (o1 * 2 * 16 * 2) + (o2 * 16 * 2) + (index * 2) + 0];
                                        break;
                                    }
                                }
                                BC7EncodeRamps2.sp_err[(CLT2(clogBC7) * 4 * 256 * 2 * 2 * 16) + (BTT2(bits) * 256 * 2 * 2 * 16) + (j * 2 * 2 * 16) +
                                                       (o1 * 2 * 16) + (o2 * 16) + index] = (CGU_UINT8)k;
                            }  //sp_idx < 0
                        }  //i<(1 << clogBC7)
                    }  //o2
                }  //o1
            }  //j
        }  //bits<BIT_RANGE
    }  //clogBC7<LOG_CL_RANGE
}
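// sp_idx and sp_err are flattened multi-dimensional tables. For an index size
// clog, component precision bits, reconstructed value v, endpoint low-bit
// parities o1/o2, and ramp index i, the sp_err offset used above factors as
//   ((((CLT2(clog) * 4 + BTT2(bits)) * 256 + v) * 2 + o1) * 2 + o2) * 16 + i
// with sp_idx using the same offset times 2, plus 0 or 1 to select the stored
// endpoint of the pair.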
CGV_FLOAT old_img_absf(CGV_FLOAT a)
{
    return a > 0.0F ? a : -a;
}
INLINE CGV_FLOAT old_get_sperr(CGU_INT clogBC7,  // ramp bits Valid range 2..4
                               CGU_INT bits,     // Component Valid range 5..8
                               CGV_INT p1,       // 0..255
                               CGU_INT t1,
                               CGU_INT t2,
                               CGV_UINT8 index)
{
    if (BC7EncodeRamps2.ramp_init)
        return BC7EncodeRamps2.sp_err[(CLT2(clogBC7) * 4 * 256 * 2 * 2 * 16) + (BTT2(bits) * 256 * 2 * 2 * 16) + (p1 * 2 * 2 * 16) + (t1 * 2 * 16) +
                                      (t2 * 16) + index];
    else
        return 0.0f;
}
#endif
#endif
#endif  // Not ASPM_HLSL
#endif  // ENABLE_CMP_API
#define get_end_point_l(subset) shared_temp[threadBase + subset].endPoint_low_quantized
#define get_end_point_h(subset) shared_temp[threadBase + subset].endPoint_high_quantized
#define get_color_index(index) shared_temp[threadBase + index].error
#define get_alpha_index(index) shared_temp[threadBase + index].mode
//4 bit index: 0, 4, 9, 13, 17, 21, 26, 30, 34, 38, 43, 47, 51, 55, 60, 64
CMP_STATIC CMP_CONSTANT CGU_UINT32 aStep[3][64] = {
    {0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 6, 6, 6, 7, 7, 7,
     7, 8, 8, 8, 8, 9, 9, 9, 9, 10, 10, 10, 10, 10, 11, 11, 11, 11, 12, 12, 12, 12, 13, 13, 13, 13, 14, 14, 14, 14, 15, 15},
    //3 bit index: 0, 9, 18, 27, 37, 46, 55, 64
    {0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3,
     3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 7, 7, 7, 7},
    //2 bit index: 0, 21, 43, 64
    {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
     1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3}};
CMP_STATIC CMP_CONSTANT CGU_UINT32 aWeight[3][16] = {{0, 4, 9, 13, 17, 21, 26, 30, 34, 38, 43, 47, 51, 55, 60, 64},
                                                     {0, 9, 18, 27, 37, 46, 55, 64, 0, 0, 0, 0, 0, 0, 0, 0},
                                                     {0, 21, 43, 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}};
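// aStep[k][w] maps a target interpolation weight w (0..63) to the entry of
// aWeight[k] that is nearest, with ties going to the lower index; e.g. in the
// 2-bit row (weights 0, 21, 43, 64) every w in 11..32 maps to index 1.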
//Associated to partition 0-63
CMP_STATIC CMP_CONSTANT CGU_UINT32 blockPartitions[64] = {
    0xCCCC, 0x8888, 0xEEEE, 0xECC8, 0xC880, 0xFEEC, 0xFEC8, 0xEC80, 0xC800, 0xFFEC, 0xFE80, 0xE800, 0xFFE8, 0xFF00, 0xFFF0, 0xF000,
    0xF710, 0x008E, 0x7100, 0x08CE, 0x008C, 0x7310, 0x3100, 0x8CCE, 0x088C, 0x3110, 0x6666, 0x366C, 0x17E8, 0x0FF0, 0x718E, 0x399C,
    0xaaaa, 0xf0f0, 0x5a5a, 0x33cc, 0x3c3c, 0x55aa, 0x9696, 0xa55a, 0x73ce, 0x13c8, 0x324c, 0x3bdc, 0x6996, 0xc33c, 0x9966, 0x660,
    0x272, 0x4e4, 0x4e40, 0x2720, 0xc936, 0x936c, 0x39c6, 0x639c, 0x9336, 0x9cc6, 0x817e, 0xe718, 0xccf0, 0xfcc, 0x7744, 0xee22,
};
//Associated to partition 64-127
CMP_STATIC CMP_CONSTANT CGU_UINT32 blockPartitions2[64] = {
    0xaa685050, 0x6a5a5040, 0x5a5a4200, 0x5450a0a8, 0xa5a50000, 0xa0a05050, 0x5555a0a0, 0x5a5a5050, 0xaa550000, 0xaa555500, 0xaaaa5500,
    0x90909090, 0x94949494, 0xa4a4a4a4, 0xa9a59450, 0x2a0a4250, 0xa5945040, 0x0a425054, 0xa5a5a500, 0x55a0a0a0, 0xa8a85454, 0x6a6a4040,
    0xa4a45000, 0x1a1a0500, 0x0050a4a4, 0xaaa59090, 0x14696914, 0x69691400, 0xa08585a0, 0xaa821414, 0x50a4a450, 0x6a5a0200, 0xa9a58000,
    0x5090a0a8, 0xa8a09050, 0x24242424, 0x00aa5500, 0x24924924, 0x24499224, 0x50a50a50, 0x500aa550, 0xaaaa4444, 0x66660000, 0xa5a0a5a0,
    0x50a050a0, 0x69286928, 0x44aaaa44, 0x66666600, 0xaa444444, 0x54a854a8, 0x95809580, 0x96969600, 0xa85454a8, 0x80959580, 0xaa141414,
    0x96960000, 0xaaaa1414, 0xa05050a0, 0xa0a5a5a0, 0x96000000, 0x40804080, 0xa9a8a9a8, 0xaaaaaa44, 0x2a4a5254,
};
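// Each blockPartitions entry packs a two-subset assignment at one bit per
// pixel (bit i gives the subset of pixel i), and each blockPartitions2 entry
// packs a three-subset assignment at two bits per pixel; e.g. partition 0
// (0xCCCC) places pixels 2, 3, 6, 7, 10, 11, 14, 15 in subset 1.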
CMP_STATIC CMP_CONSTANT CGU_Vec2ui candidateFixUpIndex1D[128] = {
    {15, 0},{15, 0},{15, 0},{15, 0},
    {15, 0},{15, 0},{15, 0},{15, 0},
    {15, 0},{15, 0},{15, 0},{15, 0},
    {15, 0},{15, 0},{15, 0},{15, 0},
    {15, 0},{ 2, 0},{ 8, 0},{ 2, 0},
    { 2, 0},{ 8, 0},{ 8, 0},{15, 0},
    { 2, 0},{ 8, 0},{ 2, 0},{ 2, 0},
    { 8, 0},{ 8, 0},{ 2, 0},{ 2, 0},
    {15, 0},{15, 0},{ 6, 0},{ 8, 0},
    { 2, 0},{ 8, 0},{15, 0},{15, 0},
    { 2, 0},{ 8, 0},{ 2, 0},{ 2, 0},
    { 2, 0},{15, 0},{15, 0},{ 6, 0},
    { 6, 0},{ 2, 0},{ 6, 0},{ 8, 0},
    {15, 0},{15, 0},{ 2, 0},{ 2, 0},
    {15, 0},{15, 0},{15, 0},{15, 0},
    {15, 0},{ 2, 0},{ 2, 0},{15, 0},
    //candidateFixUpIndex1D[i][1], i < 64 should not be used
    { 3,15},{ 3, 8},{15, 8},{15, 3},
    { 8,15},{ 3,15},{15, 3},{15, 8},
    { 8,15},{ 8,15},{ 6,15},{ 6,15},
    { 6,15},{ 5,15},{ 3,15},{ 3, 8},
    { 3,15},{ 3, 8},{ 8,15},{15, 3},
    { 3,15},{ 3, 8},{ 6,15},{10, 8},
    { 5, 3},{ 8,15},{ 8, 6},{ 6,10},
    { 8,15},{ 5,15},{15,10},{15, 8},
    { 8,15},{15, 3},{ 3,15},{ 5,10},
    { 6,10},{10, 8},{ 8, 9},{15,10},
    {15, 6},{ 3,15},{15, 8},{ 5,15},
    {15, 3},{15, 6},{15, 6},{15, 8},  // The spec doesn't mark the first fixup index in this row; 15 is applied here and appears to be correct
    { 3,15},{15, 3},{ 5,15},{ 5,15},
    { 5,15},{ 8,15},{ 5,15},{10,15},
    { 5,15},{10,15},{ 8,15},{13,15},
    {15, 3},{12,15},{ 3,15},{ 3, 8},
};
CMP_STATIC CMP_CONSTANT CGU_Vec2ui candidateFixUpIndex1DOrdered[128] = {
    {15, 0},{15, 0},{15, 0},{15, 0},
    {15, 0},{15, 0},{15, 0},{15, 0},
    {15, 0},{15, 0},{15, 0},{15, 0},
    {15, 0},{15, 0},{15, 0},{15, 0},
    {15, 0},{ 2, 0},{ 8, 0},{ 2, 0},
    { 2, 0},{ 8, 0},{ 8, 0},{15, 0},
    { 2, 0},{ 8, 0},{ 2, 0},{ 2, 0},
    { 8, 0},{ 8, 0},{ 2, 0},{ 2, 0},
    {15, 0},{15, 0},{ 6, 0},{ 8, 0},
    { 2, 0},{ 8, 0},{15, 0},{15, 0},
    { 2, 0},{ 8, 0},{ 2, 0},{ 2, 0},
    { 2, 0},{15, 0},{15, 0},{ 6, 0},
    { 6, 0},{ 2, 0},{ 6, 0},{ 8, 0},
    {15, 0},{15, 0},{ 2, 0},{ 2, 0},
    {15, 0},{15, 0},{15, 0},{15, 0},
    {15, 0},{ 2, 0},{ 2, 0},{15, 0},
    //candidateFixUpIndex1DOrdered[i][1], i < 64 should not be used
    { 3,15},{ 3, 8},{ 8,15},{ 3,15},
    { 8,15},{ 3,15},{ 3,15},{ 8,15},
    { 8,15},{ 8,15},{ 6,15},{ 6,15},
    { 6,15},{ 5,15},{ 3,15},{ 3, 8},
    { 3,15},{ 3, 8},{ 8,15},{ 3,15},
    { 3,15},{ 3, 8},{ 6,15},{ 8,10},
    { 3, 5},{ 8,15},{ 6, 8},{ 6,10},
    { 8,15},{ 5,15},{10,15},{ 8,15},
    { 8,15},{ 3,15},{ 3,15},{ 5,10},
    { 6,10},{ 8,10},{ 8, 9},{10,15},
    { 6,15},{ 3,15},{ 8,15},{ 5,15},
    { 3,15},{ 6,15},{ 6,15},{ 8,15},  // The spec doesn't mark the first fixup index in this row; 15 is applied here and appears to be correct
    { 3,15},{ 3,15},{ 5,15},{ 5,15},
    { 5,15},{ 8,15},{ 5,15},{10,15},
    { 5,15},{10,15},{ 8,15},{13,15},
    { 3,15},{12,15},{ 3,15},{ 3, 8}
};
CGU_Vec4ui quantize(CGU_Vec4ui color, CGU_UINT32 uPrec)
{
    return (((color << 8) + color) * ((1 << uPrec) - 1) + 32768U) >> 16;
}
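// quantize() computes round(color * ((1 << uPrec) - 1) / 255) without a
// divide: (color << 8) + color equals color * 257, which places the product in
// 16.16 fixed point (257/65536 is very close to 1/255), and adding 32768
// rounds before the final shift. E.g. for color = 255 and uPrec = 5 this is
// (65535 * 31 + 32768) >> 16 = 31.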
CGU_Vec4ui unquantize(CGU_Vec4ui color, CGU_UINT32 uPrec)
{
#ifdef ASPM_GPU
    color = color << (8 - uPrec);
    return color | (color >> uPrec);
#else
    CGU_Vec4ui res;
    color.x = color.x << (8 - uPrec);
    color.y = color.y << (8 - uPrec);
    color.z = color.z << (8 - uPrec);
    color.w = color.w << (8 - uPrec);
    res.x = color.x | (color.x >> uPrec);
    res.y = color.y | (color.y >> uPrec);
    res.z = color.z | (color.z >> uPrec);
    res.w = color.w | (color.w >> uPrec);
    return res;
#endif
}
void swap(CMP_INOUT CGU_Vec4ui CMP_REFINOUT lhs, CMP_INOUT CGU_Vec4ui CMP_REFINOUT rhs)
{
    CGU_Vec4ui tmp = lhs;
    lhs = rhs;
    rhs = tmp;
}
void swap(CMP_INOUT CGU_Vec3ui CMP_REFINOUT lhs, CMP_INOUT CGU_Vec3ui CMP_REFINOUT rhs)
{
    CGU_Vec3ui tmp = lhs;
    lhs = rhs;
    rhs = tmp;
}
void swap(CMP_INOUT CGU_UINT32 CMP_REFINOUT lhs, CMP_INOUT CGU_UINT32 CMP_REFINOUT rhs)
{
    CGU_UINT32 tmp = lhs;
    lhs = rhs;
    rhs = tmp;
}
CGU_UINT32 ComputeError(CMP_IN CGU_Vec4ui a, CMP_IN CGU_Vec4ui b)
{
    return dot(a.rgb, b.rgb) + (g_alpha_weight * a.a * b.a);
}
void Ensure_A_Is_Larger(CMP_INOUT CGU_Vec4ui CMP_REFINOUT a, CMP_INOUT CGU_Vec4ui CMP_REFINOUT b)
{
    if (a.x < b.x)
        swap(a.x, b.x);
    if (a.y < b.y)
        swap(a.y, b.y);
    if (a.z < b.z)
        swap(a.z, b.z);
    if (a.w < b.w)
        swap(a.w, b.w);
}
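// compress_endpoints0..7 quantize one endpoint pair per BC7 mode and then
// dequantize it again so the caller can measure the induced error. Per the
// BC7 format spec, endpoint precision is: mode 0 = RGB 4.4.4 + P-bit,
// mode 1 = 6.6.6 + shared P-bit, mode 2 = 5.5.5, mode 3 = 7.7.7 + P-bit,
// mode 4 = 5.5.5 with 6-bit alpha, mode 5 = 7.7.7 with 8-bit alpha,
// mode 6 = 7.7.7.7 + P-bit, mode 7 = 5.5.5.5 + P-bit. The quantized values
// are returned left-shifted to an 8-bit-aligned form that the block_package*
// routines mask back down.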
void compress_endpoints0(CMP_INOUT CGU_Vec4ui endPoint[2], CMP_INOUT CGU_Vec4ui quantized[2], CGU_Vec2ui P)
{
#ifdef ASPM_GPU
    CMP_UNROLL for (CGU_UINT32 j = 0; j < 2; j++)
    {
        quantized[j].rgb = quantize(endPoint[j].rgbb, 5).rgb & 0xFFFFFFFE;
        quantized[j].rgb |= P[j];
        quantized[j].a = 0xFF;
        endPoint[j].rgb = unquantize(quantized[j].rgbb, 5).rgb;
        endPoint[j].a = 0xFF;
        quantized[j] <<= 3;
    }
#else
    CGU_Vec4ui rgbb;
    CMP_UNROLL for (CGU_UINT32 j = 0; j < 2; j++)
    {
        rgbb.r = endPoint[j].r;
        rgbb.g = endPoint[j].g;
        rgbb.b = endPoint[j].b;
        rgbb.a = endPoint[j].b;
        quantized[j].rgb = quantize(rgbb, 5).rgb;
        quantized[j].r &= 0xFFFFFFFE;
        quantized[j].g &= 0xFFFFFFFE;
        quantized[j].b &= 0xFFFFFFFE;
        quantized[j].r |= P[j];
        quantized[j].g |= P[j];
        quantized[j].b |= P[j];
        quantized[j].a = 0xFF;
        rgbb.r = quantized[j].r;
        rgbb.g = quantized[j].g;
        rgbb.b = quantized[j].b;
        rgbb.a = quantized[j].b;
        endPoint[j].rgb = unquantize(rgbb, 5).rgb;
        endPoint[j].a = 0xFF;
        quantized[j].r <<= 3;
        quantized[j].g <<= 3;
        quantized[j].b <<= 3;
        quantized[j].a <<= 3;
    }
#endif
}
void compress_endpoints1(CMP_INOUT CGU_Vec4ui endPoint[2], CMP_OUT CGU_Vec4ui quantized[2], CGU_Vec2ui P)
{
#ifdef ASPM_GPU
    CMP_UNROLL for (CGU_UINT32 j = 0; j < 2; j++)
    {
        quantized[j].rgb = quantize(endPoint[j].rgbb, 7).rgb & 0xFFFFFFFE;
        quantized[j].rgb |= P[j];
        quantized[j].a = 0xFF;
        endPoint[j].rgb = unquantize(quantized[j].rgbb, 7).rgb;
        endPoint[j].a = 0xFF;
        quantized[j] <<= 1;
    }
#else
    CGU_Vec4ui rgbb;
    CMP_UNROLL for (CGU_UINT32 j = 0; j < 2; j++)
    {
        rgbb.r = endPoint[j].r;
        rgbb.g = endPoint[j].g;
        rgbb.b = endPoint[j].b;
        rgbb.a = endPoint[j].b;
        quantized[j].rgb = quantize(rgbb, 7).rgb;
        quantized[j].r &= 0xFFFFFFFE;
        quantized[j].g &= 0xFFFFFFFE;
        quantized[j].b &= 0xFFFFFFFE;
        quantized[j].r |= P[j];
        quantized[j].g |= P[j];
        quantized[j].b |= P[j];
        quantized[j].a = 0xFF;
        rgbb.r = quantized[j].r;
        rgbb.g = quantized[j].g;
        rgbb.b = quantized[j].b;
        rgbb.a = quantized[j].b;
        endPoint[j].rgb = unquantize(rgbb, 7).rgb;
        endPoint[j].a = 0xFF;
        quantized[j].r = quantized[j].r << 1;
        quantized[j].g = quantized[j].g << 1;
        quantized[j].b = quantized[j].b << 1;
        quantized[j].a = quantized[j].a << 1;
    }
#endif
}
void compress_endpoints2(CMP_INOUT CGU_Vec4ui endPoint[2], CMP_INOUT CGU_Vec4ui quantized[2])
{
#ifdef ASPM_GPU
    CMP_UNROLL for (CGU_UINT32 j = 0; j < 2; j++)
    {
        quantized[j].rgb = quantize(endPoint[j].rgbb, 5).rgb;
        quantized[j].a = 0xFF;
        endPoint[j].rgb = unquantize(quantized[j].rgbb, 5).rgb;
        endPoint[j].a = 0xFF;
        quantized[j] <<= 3;
    }
#else
    CGU_Vec4ui rgbb;
    CMP_UNROLL for (CGU_UINT32 j = 0; j < 2; j++)
    {
        rgbb.r = endPoint[j].r;
        rgbb.g = endPoint[j].g;
        rgbb.b = endPoint[j].b;
        rgbb.a = endPoint[j].b;
        quantized[j].rgb = quantize(rgbb, 5).rgb;
        quantized[j].a = 0xFF;
        rgbb.r = quantized[j].r;
        rgbb.g = quantized[j].g;
        rgbb.b = quantized[j].b;
        rgbb.a = quantized[j].b;
        endPoint[j].rgb = unquantize(rgbb, 5).rgb;
        endPoint[j].a = 0xFF;
        quantized[j].r <<= 3;
        quantized[j].g <<= 3;
        quantized[j].b <<= 3;
        quantized[j].a <<= 3;
    }
#endif
}
void compress_endpoints3(CMP_INOUT CGU_Vec4ui endPoint[2], CMP_INOUT CGU_Vec4ui quantized[2], CGU_Vec2ui P)
{
    for (CGU_UINT32 j = 0; j < 2; j++)
    {
        quantized[j].r = endPoint[j].x & 0xFFFFFFFE;
        quantized[j].g = endPoint[j].y & 0xFFFFFFFE;
        quantized[j].b = endPoint[j].z & 0xFFFFFFFE;
        quantized[j].a = 0xFF;
        quantized[j].r |= P[j];
        quantized[j].g |= P[j];
        quantized[j].b |= P[j];
        endPoint[j].r = quantized[j].r;
        endPoint[j].g = quantized[j].g;
        endPoint[j].b = quantized[j].b;
        endPoint[j].a = 0xFF;
    }
}
void compress_endpoints4(CMP_INOUT CGU_Vec4ui endPoint[2], CMP_INOUT CGU_Vec4ui quantized[2])
{
#ifdef ASPM_HLSL
    [unroll] for (uint j = 0; j < 2; j++)
    {
        quantized[j].rgb = quantize(endPoint[j].rgbb, 5).rgb;
        quantized[j].a = quantize(endPoint[j].a, 6).r;
        endPoint[j].rgb = unquantize(quantized[j].rgbb, 5).rgb;
        endPoint[j].a = unquantize(quantized[j].a, 6).r;
        quantized[j].rgb <<= 3;
        quantized[j].a <<= 2;
    }
#else
    CGU_Vec4ui rgbb;
    CMP_UNROLL for (CGU_UINT32 j = 0; j < 2; j++)
    {
        rgbb.r = endPoint[j].r;
        rgbb.g = endPoint[j].g;
        rgbb.b = endPoint[j].b;
        rgbb.a = endPoint[j].b;
        quantized[j].rgb = quantize(rgbb, 5).rgb;
        quantized[j].a = quantize(endPoint[j].a, 6).r;
        rgbb.r = quantized[j].r;
        rgbb.g = quantized[j].g;
        rgbb.b = quantized[j].b;
        rgbb.a = quantized[j].b;
        endPoint[j].rgb = unquantize(rgbb, 5).rgb;
        endPoint[j].a = unquantize(quantized[j].a, 6).r;
        quantized[j].r <<= 3;
        quantized[j].g <<= 3;
        quantized[j].b <<= 3;
        quantized[j].a <<= 2;
    }
#endif
}
void compress_endpoints5(CMP_INOUT CGU_Vec4ui endPoint[2], CMP_INOUT CGU_Vec4ui quantized[2])
{
#ifdef ASPM_HLSL
    CMP_UNROLL for (uint j = 0; j < 2; j++)
    {
        quantized[j].rgb = quantize(endPoint[j].rgbb, 7).rgb;
        quantized[j].a = endPoint[j].a;
        endPoint[j].rgb = unquantize(quantized[j].rgbb, 7).rgb;
        // endPoint[j].a Alpha is full precision
        quantized[j].rgb <<= 1;
    }
#else
    CGU_Vec4ui rgbb;
    CMP_UNROLL for (CGU_UINT32 j = 0; j < 2; j++)
    {
        rgbb.r = endPoint[j].r;
        rgbb.g = endPoint[j].g;
        rgbb.b = endPoint[j].b;
        rgbb.a = endPoint[j].b;
        quantized[j].rgb = quantize(rgbb, 7).rgb;
        quantized[j].a = endPoint[j].a;
        rgbb.r = quantized[j].r;
        rgbb.g = quantized[j].g;
        rgbb.b = quantized[j].b;
        rgbb.a = quantized[j].b;
        endPoint[j].rgb = unquantize(rgbb, 7).rgb;
        quantized[j].r <<= 1;
        quantized[j].g <<= 1;
        quantized[j].b <<= 1;
    }
#endif
}
void compress_endpoints6(CMP_INOUT CGU_Vec4ui endPoint[2], CMP_OUT CGU_Vec4ui quantized[2], CGU_Vec2ui P)
{
    for (CGU_UINT32 j = 0; j < 2; j++)
    {
        quantized[j].x = endPoint[j].x & 0xFFFFFFFE;
        quantized[j].y = endPoint[j].y & 0xFFFFFFFE;
        quantized[j].z = endPoint[j].z & 0xFFFFFFFE;
        quantized[j].w = endPoint[j].w & 0xFFFFFFFE;
        quantized[j].x = quantized[j].x | P[j];
        quantized[j].y = quantized[j].y | P[j];
        quantized[j].z = quantized[j].z | P[j];
        quantized[j].w = quantized[j].w | P[j];
        endPoint[j] = quantized[j];
    }
}
void compress_endpoints7(CMP_INOUT CGU_Vec4ui endPoint[2], CMP_INOUT CGU_Vec4ui quantized[2], CGU_Vec2ui P)
{
    CMP_UNROLL for (CGU_UINT32 j = 0; j < 2; j++)
    {
        quantized[j] = quantize(endPoint[j], 6);
        quantized[j].x = (quantized[j].x & 0xFFFFFFFE) | P[j];
        quantized[j].y = (quantized[j].y & 0xFFFFFFFE) | P[j];
        quantized[j].z = (quantized[j].z & 0xFFFFFFFE) | P[j];
        quantized[j].w = (quantized[j].w & 0xFFFFFFFE) | P[j];
        endPoint[j] = unquantize(quantized[j], 6);
    }
    CMP_UNROLL for (CGU_UINT32 j = 0; j < 2; j++)
    {
        quantized[j].x = quantized[j].x << 2;
        quantized[j].y = quantized[j].y << 2;
        quantized[j].z = quantized[j].z << 2;
        quantized[j].w = quantized[j].w << 2;
    }
}
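// block_package0..7 assemble the final 128-bit block for each mode into a
// CGU_Vec4ui: block.x holds bits 0..31 (the unary mode prefix sits in the
// lowest bits), block.y bits 32..63, block.z bits 64..95 and block.w
// bits 96..127.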
void block_package0(CMP_OUT CGU_Vec4ui CMP_REFINOUT block, CGU_UINT32 partition, CGU_UINT32 threadBase)
{
    block.x = 0x01 | ((partition - 64) << 1) | ((get_end_point_l(0).r & 0xF0) << 1) | ((get_end_point_h(0).r & 0xF0) << 5) |
              ((get_end_point_l(1).r & 0xF0) << 9) | ((get_end_point_h(1).r & 0xF0) << 13) | ((get_end_point_l(2).r & 0xF0) << 17) |
              ((get_end_point_h(2).r & 0xF0) << 21) | ((get_end_point_l(0).g & 0xF0) << 25);
    block.y = ((get_end_point_l(0).g & 0xF0) >> 7) | ((get_end_point_h(0).g & 0xF0) >> 3) | ((get_end_point_l(1).g & 0xF0) << 1) |
              ((get_end_point_h(1).g & 0xF0) << 5) | ((get_end_point_l(2).g & 0xF0) << 9) | ((get_end_point_h(2).g & 0xF0) << 13) |
              ((get_end_point_l(0).b & 0xF0) << 17) | ((get_end_point_h(0).b & 0xF0) << 21) | ((get_end_point_l(1).b & 0xF0) << 25);
    block.z = ((get_end_point_l(1).b & 0xF0) >> 7) | ((get_end_point_h(1).b & 0xF0) >> 3) | ((get_end_point_l(2).b & 0xF0) << 1) |
              ((get_end_point_h(2).b & 0xF0) << 5) | ((get_end_point_l(0).r & 0x08) << 10) | ((get_end_point_h(0).r & 0x08) << 11) |
              ((get_end_point_l(1).r & 0x08) << 12) | ((get_end_point_h(1).r & 0x08) << 13) | ((get_end_point_l(2).r & 0x08) << 14) |
              ((get_end_point_h(2).r & 0x08) << 15) | (get_color_index(0) << 19);
    block.w = 0;
    CGU_UINT32 i = 1;
    for (; i <= cmp_min(candidateFixUpIndex1DOrdered[partition][0], 4); i++)
    {
        block.z |= get_color_index(i) << (i * 3 + 18);
    }
    if (candidateFixUpIndex1DOrdered[partition][0] < 4)  //i = 4
    {
        block.z |= get_color_index(4) << 29;
        i += 1;
    }
    else  //i = 5
    {
        block.w |= (get_color_index(4) & 0x04) >> 2;
        for (; i <= candidateFixUpIndex1DOrdered[partition][0]; i++)
            block.w |= get_color_index(i) << (i * 3 - 14);
    }
    for (; i <= candidateFixUpIndex1DOrdered[partition][1]; i++)
    {
        block.w |= get_color_index(i) << (i * 3 - 15);
    }
    for (; i < 16; i++)
    {
        block.w |= get_color_index(i) << (i * 3 - 16);
    }
}
void block_package1(CMP_OUT CGU_Vec4ui CMP_REFINOUT block, CGU_UINT32 partition, CGU_UINT32 threadBase)
{
    block.x = 0x02 | (partition << 2) | ((get_end_point_l(0).r & 0xFC) << 6) | ((get_end_point_h(0).r & 0xFC) << 12) | ((get_end_point_l(1).r & 0xFC) << 18) |
              ((get_end_point_h(1).r & 0xFC) << 24);
    block.y = ((get_end_point_l(0).g & 0xFC) >> 2) | ((get_end_point_h(0).g & 0xFC) << 4) | ((get_end_point_l(1).g & 0xFC) << 10) |
              ((get_end_point_h(1).g & 0xFC) << 16) | ((get_end_point_l(0).b & 0xFC) << 22) | ((get_end_point_h(0).b & 0xFC) << 28);
    block.z = ((get_end_point_h(0).b & 0xFC) >> 4) | ((get_end_point_l(1).b & 0xFC) << 2) | ((get_end_point_h(1).b & 0xFC) << 8) |
              ((get_end_point_l(0).r & 0x02) << 15) | ((get_end_point_l(1).r & 0x02) << 16) | (get_color_index(0) << 18);
    if (candidateFixUpIndex1DOrdered[partition][0] == 15)
    {
        block.w = (get_color_index(15) << 30) | (get_color_index(14) << 27) | (get_color_index(13) << 24) | (get_color_index(12) << 21) |
                  (get_color_index(11) << 18) | (get_color_index(10) << 15) | (get_color_index(9) << 12) | (get_color_index(8) << 9) |
                  (get_color_index(7) << 6) | (get_color_index(6) << 3) | get_color_index(5);
        block.z |=
            (get_color_index(4) << 29) | (get_color_index(3) << 26) | (get_color_index(2) << 23) | (get_color_index(1) << 20) | (get_color_index(0) << 18);
    }
    else if (candidateFixUpIndex1DOrdered[partition][0] == 2)
    {
        block.w = (get_color_index(15) << 29) | (get_color_index(14) << 26) | (get_color_index(13) << 23) | (get_color_index(12) << 20) |
                  (get_color_index(11) << 17) | (get_color_index(10) << 14) | (get_color_index(9) << 11) | (get_color_index(8) << 8) |
                  (get_color_index(7) << 5) | (get_color_index(6) << 2) | (get_color_index(5) >> 1);
        block.z |= (get_color_index(5) << 31) | (get_color_index(4) << 28) | (get_color_index(3) << 25) | (get_color_index(2) << 23) |
                   (get_color_index(1) << 20) | (get_color_index(0) << 18);
    }
    else if (candidateFixUpIndex1DOrdered[partition][0] == 8)
    {
        block.w = (get_color_index(15) << 29) | (get_color_index(14) << 26) | (get_color_index(13) << 23) | (get_color_index(12) << 20) |
                  (get_color_index(11) << 17) | (get_color_index(10) << 14) | (get_color_index(9) << 11) | (get_color_index(8) << 9) |
                  (get_color_index(7) << 6) | (get_color_index(6) << 3) | get_color_index(5);
        block.z |=
            (get_color_index(4) << 29) | (get_color_index(3) << 26) | (get_color_index(2) << 23) | (get_color_index(1) << 20) | (get_color_index(0) << 18);
    }
    else  // candidateFixUpIndex1DOrdered[partition][0] == 6
    {
        block.w = (get_color_index(15) << 29) | (get_color_index(14) << 26) | (get_color_index(13) << 23) | (get_color_index(12) << 20) |
                  (get_color_index(11) << 17) | (get_color_index(10) << 14) | (get_color_index(9) << 11) | (get_color_index(8) << 8) |
                  (get_color_index(7) << 5) | (get_color_index(6) << 3) | get_color_index(5);
        block.z |=
            (get_color_index(4) << 29) | (get_color_index(3) << 26) | (get_color_index(2) << 23) | (get_color_index(1) << 20) | (get_color_index(0) << 18);
    }
}
  1312. void block_package2(CMP_OUT CGU_Vec4ui CMP_REFINOUT block, CGU_UINT32 partition, CGU_UINT32 threadBase)
  1313. {
  1314. block.x = 0x04 | ((partition - 64) << 3) | ((get_end_point_l(0).r & 0xF8) << 6) | ((get_end_point_h(0).r & 0xF8) << 11) |
  1315. ((get_end_point_l(1).r & 0xF8) << 16) | ((get_end_point_h(1).r & 0xF8) << 21) | ((get_end_point_l(2).r & 0xF8) << 26);
  1316. block.y = ((get_end_point_l(2).r & 0xF8) >> 6) | ((get_end_point_h(2).r & 0xF8) >> 1) | ((get_end_point_l(0).g & 0xF8) << 4) |
  1317. ((get_end_point_h(0).g & 0xF8) << 9) | ((get_end_point_l(1).g & 0xF8) << 14) | ((get_end_point_h(1).g & 0xF8) << 19) |
  1318. ((get_end_point_l(2).g & 0xF8) << 24);
  1319. block.z = ((get_end_point_h(2).g & 0xF8) >> 3) | ((get_end_point_l(0).b & 0xF8) << 2) | ((get_end_point_h(0).b & 0xF8) << 7) |
  1320. ((get_end_point_l(1).b & 0xF8) << 12) | ((get_end_point_h(1).b & 0xF8) << 17) | ((get_end_point_l(2).b & 0xF8) << 22) |
  1321. ((get_end_point_h(2).b & 0xF8) << 27);
  1322. block.w = ((get_end_point_h(2).b & 0xF8) >> 5) | (get_color_index(0) << 3);
  1323. CGU_UINT32 i = 1;
  1324. for (; i <= candidateFixUpIndex1DOrdered[partition][0]; i++)
  1325. {
  1326. block.w |= get_color_index(i) << (i * 2 + 2);
  1327. }
  1328. for (; i <= candidateFixUpIndex1DOrdered[partition][1]; i++)
  1329. {
  1330. block.w |= get_color_index(i) << (i * 2 + 1);
  1331. }
  1332. for (; i < 16; i++)
  1333. {
  1334. block.w |= get_color_index(i) << (i * 2);
  1335. }
  1336. }
  1337. void block_package3(CMP_OUT CGU_Vec4ui CMP_REFINOUT block, CGU_UINT32 partition, CGU_UINT32 threadBase)
  1338. {
  1339. block.x = 0x08 | (partition << 4) | ((get_end_point_l(0).r & 0xFE) << 9) | ((get_end_point_h(0).r & 0xFE) << 16) | ((get_end_point_l(1).r & 0xFE) << 23) |
  1340. ((get_end_point_h(1).r & 0xFE) << 30);
  1341. block.y = ((get_end_point_h(1).r & 0xFE) >> 2) | ((get_end_point_l(0).g & 0xFE) << 5) | ((get_end_point_h(0).g & 0xFE) << 12) |
  1342. ((get_end_point_l(1).g & 0xFE) << 19) | ((get_end_point_h(1).g & 0xFE) << 26);
  1343. block.z = ((get_end_point_h(1).g & 0xFE) >> 6) | ((get_end_point_l(0).b & 0xFE) << 1) | ((get_end_point_h(0).b & 0xFE) << 8) |
  1344. ((get_end_point_l(1).b & 0xFE) << 15) | ((get_end_point_h(1).b & 0xFE) << 22) | ((get_end_point_l(0).r & 0x01) << 30) |
  1345. ((get_end_point_h(0).r & 0x01) << 31);
  1346. block.w = ((get_end_point_l(1).r & 0x01) << 0) | ((get_end_point_h(1).r & 0x01) << 1) | (get_color_index(0) << 2);
  1347. CGU_UINT32 i = 1;
  1348. for (; i <= candidateFixUpIndex1DOrdered[partition][0]; i++)
  1349. {
  1350. block.w |= get_color_index(i) << (i * 2 + 1);
  1351. }
  1352. for (; i < 16; i++)
  1353. {
  1354. block.w |= get_color_index(i) << (i * 2);
  1355. }
  1356. }
  1357. void block_package4(CMP_OUT CGU_Vec4ui CMP_REFINOUT block, CGU_UINT32 rotation, CGU_UINT32 index_selector, CGU_UINT32 threadBase)
  1358. {
  1359. block.x = 0x10 | ((rotation & 3) << 5) | ((index_selector & 1) << 7) | ((get_end_point_l(0).r & 0xF8) << 5) | ((get_end_point_h(0).r & 0xF8) << 10) |
  1360. ((get_end_point_l(0).g & 0xF8) << 15) | ((get_end_point_h(0).g & 0xF8) << 20) | ((get_end_point_l(0).b & 0xF8) << 25);
  1361. block.y = ((get_end_point_l(0).b & 0xF8) >> 7) | ((get_end_point_h(0).b & 0xF8) >> 2) | ((get_end_point_l(0).a & 0xFC) << 4) |
  1362. ((get_end_point_h(0).a & 0xFC) << 10) | ((get_color_index(0) & 1) << 18) | (get_color_index(1) << 19) | (get_color_index(2) << 21) |
  1363. (get_color_index(3) << 23) | (get_color_index(4) << 25) | (get_color_index(5) << 27) | (get_color_index(6) << 29) | (get_color_index(7) << 31);
  1364. block.z = (get_color_index(7) >> 1) | (get_color_index(8) << 1) | (get_color_index(9) << 3) | (get_color_index(10) << 5) | (get_color_index(11) << 7) |
  1365. (get_color_index(12) << 9) | (get_color_index(13) << 11) | (get_color_index(14) << 13) | (get_color_index(15) << 15) |
  1366. ((get_alpha_index(0) & 3) << 17) | (get_alpha_index(1) << 19) | (get_alpha_index(2) << 22) | (get_alpha_index(3) << 25) |
  1367. (get_alpha_index(4) << 28) | (get_alpha_index(5) << 31);
  1368. block.w = (get_alpha_index(5) >> 1) | (get_alpha_index(6) << 2) | (get_alpha_index(7) << 5) | (get_alpha_index(8) << 8) | (get_alpha_index(9) << 11) |
  1369. (get_alpha_index(10) << 14) | (get_alpha_index(11) << 17) | (get_alpha_index(12) << 20) | (get_alpha_index(13) << 23) |
  1370. (get_alpha_index(14) << 26) | (get_alpha_index(15) << 29);
  1371. }
void block_package5(CMP_OUT CGU_Vec4ui CMP_REFINOUT block, CGU_UINT32 rotation, CGU_UINT32 threadBase)
{
    block.x = 0x20 | (rotation << 6) | ((get_end_point_l(0).r & 0xFE) << 7) | ((get_end_point_h(0).r & 0xFE) << 14) | ((get_end_point_l(0).g & 0xFE) << 21) |
              ((get_end_point_h(0).g & 0xFE) << 28);
    block.y = ((get_end_point_h(0).g & 0xFE) >> 4) | ((get_end_point_l(0).b & 0xFE) << 3) | ((get_end_point_h(0).b & 0xFE) << 10) |
              (get_end_point_l(0).a << 18) | (get_end_point_h(0).a << 26);
    block.z = (get_end_point_h(0).a >> 6) | (get_color_index(0) << 2) | (get_color_index(1) << 3) | (get_color_index(2) << 5) | (get_color_index(3) << 7) |
              (get_color_index(4) << 9) | (get_color_index(5) << 11) | (get_color_index(6) << 13) | (get_color_index(7) << 15) | (get_color_index(8) << 17) |
              (get_color_index(9) << 19) | (get_color_index(10) << 21) | (get_color_index(11) << 23) | (get_color_index(12) << 25) |
              (get_color_index(13) << 27) | (get_color_index(14) << 29) | (get_color_index(15) << 31);
    block.w = (get_color_index(15) >> 1) | (get_alpha_index(0) << 1) | (get_alpha_index(1) << 2) | (get_alpha_index(2) << 4) | (get_alpha_index(3) << 6) |
              (get_alpha_index(4) << 8) | (get_alpha_index(5) << 10) | (get_alpha_index(6) << 12) | (get_alpha_index(7) << 14) | (get_alpha_index(8) << 16) |
              (get_alpha_index(9) << 18) | (get_alpha_index(10) << 20) | (get_alpha_index(11) << 22) | (get_alpha_index(12) << 24) |
              (get_alpha_index(13) << 26) | (get_alpha_index(14) << 28) | (get_alpha_index(15) << 30);
}
void block_package6(CMP_OUT CGU_Vec4ui CMP_REFINOUT block, CGU_UINT32 threadBase)
{
    block.x = 0x40 | ((get_end_point_l(0).r & 0xFE) << 6) | ((get_end_point_h(0).r & 0xFE) << 13) | ((get_end_point_l(0).g & 0xFE) << 20) |
              ((get_end_point_h(0).g & 0xFE) << 27);
    block.y = ((get_end_point_h(0).g & 0xFE) >> 5) | ((get_end_point_l(0).b & 0xFE) << 2) | ((get_end_point_h(0).b & 0xFE) << 9) |
              ((get_end_point_l(0).a & 0xFE) << 16) | ((get_end_point_h(0).a & 0xFE) << 23) | (get_end_point_l(0).r & 0x01) << 31;
    block.z = (get_end_point_h(0).r & 0x01) | (get_color_index(0) << 1) | (get_color_index(1) << 4) | (get_color_index(2) << 8) | (get_color_index(3) << 12) |
              (get_color_index(4) << 16) | (get_color_index(5) << 20) | (get_color_index(6) << 24) | (get_color_index(7) << 28);
    block.w = (get_color_index(8) << 0) | (get_color_index(9) << 4) | (get_color_index(10) << 8) | (get_color_index(11) << 12) | (get_color_index(12) << 16) |
              (get_color_index(13) << 20) | (get_color_index(14) << 24) | (get_color_index(15) << 28);
}
void block_package7(CMP_OUT CGU_Vec4ui CMP_REFINOUT block, CGU_UINT32 partition, CGU_UINT32 threadBase)
{
    block.x = 0x80 | (partition << 8) | ((get_end_point_l(0).r & 0xF8) << 11) | ((get_end_point_h(0).r & 0xF8) << 16) | ((get_end_point_l(1).r & 0xF8) << 21) |
              ((get_end_point_h(1).r & 0xF8) << 26);
    block.y = ((get_end_point_h(1).r & 0xF8) >> 6) | ((get_end_point_l(0).g & 0xF8) >> 1) | ((get_end_point_h(0).g & 0xF8) << 4) |
              ((get_end_point_l(1).g & 0xF8) << 9) | ((get_end_point_h(1).g & 0xF8) << 14) | ((get_end_point_l(0).b & 0xF8) << 19) |
              ((get_end_point_h(0).b & 0xF8) << 24);
    block.z = ((get_end_point_l(1).b & 0xF8) >> 3) | ((get_end_point_h(1).b & 0xF8) << 2) | ((get_end_point_l(0).a & 0xF8) << 7) |
              ((get_end_point_h(0).a & 0xF8) << 12) | ((get_end_point_l(1).a & 0xF8) << 17) | ((get_end_point_h(1).a & 0xF8) << 22) |
              ((get_end_point_l(0).r & 0x04) << 28) | ((get_end_point_h(0).r & 0x04) << 29);
    block.w = ((get_end_point_l(1).r & 0x04) >> 2) | ((get_end_point_h(1).r & 0x04) >> 1) | (get_color_index(0) << 2);
    CGU_UINT32 i = 1;
    for (; i <= candidateFixUpIndex1DOrdered[partition][0]; i++)
    {
        block.w |= get_color_index(i) << (i * 2 + 1);
    }
    for (; i < 16; i++)
    {
        block.w |= get_color_index(i) << (i * 2);
    }
}
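//--------------------------------------------------------------------------------------
// The two trailing loops in the packers above implement BC7 "anchor" index compression:
// the first index of each subset is stored with its most-significant bit dropped (the
// encoder guarantees it is 0 by swapping endpoints), so indices after the fix-up
// position land one bit lower. A minimal standalone sketch of that packing, assuming
// only standard C++ and a hypothetical two-subset, 2-bit-per-index layout:
//--------------------------------------------------------------------------------------
#if 0  // standalone illustration; not compiled into the encoder
#include <cassert>
#include <cstdint>

// Pack 16 two-bit indices; index 0 and `fixup` are anchors stored in 1 bit each.
static uint64_t pack_indices2(const uint8_t idx[16], int fixup)
{
    uint64_t bits = 0;
    int      pos  = 0;
    for (int i = 0; i < 16; i++)
    {
        int n = (i == 0 || i == fixup) ? 1 : 2;  // anchors drop their MSB
        assert(idx[i] < (1u << n));              // encoder must have cleared the MSB
        bits |= (uint64_t)idx[i] << pos;
        pos += n;
    }
    assert(pos == 30);  // 16 * 2 bits - 2 dropped anchor bits
    return bits;
}
#endif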
void GroupSync()
{
#ifdef ASPM_GPU
    GroupMemoryBarrierWithGroupSync();
#endif
}
void set_pixel_rotation(CMP_INOUT CGU_Vec4ui CMP_REFINOUT pixel, CGU_UINT32 rotation)
{
#ifdef ASPM_GPU
    if (1 == rotation)
    {
        pixel.ra = pixel.ar;
    }
    else if (2 == rotation)
    {
        pixel.ga = pixel.ag;
    }
    else if (3 == rotation)
    {
        pixel.ba = pixel.ab;
    }
#else
    CGU_UINT32 r, g, b, a;
    r = pixel.r;
    g = pixel.g;
    b = pixel.b;
    a = pixel.a;
    if (1 == rotation)
    {
        pixel.r = a;
        pixel.a = r;
    }
    else if (2 == rotation)
    {
        pixel.g = a;
        pixel.a = g;
    }
    else if (3 == rotation)
    {
        pixel.b = a;
        pixel.a = b;
    }
#endif
}
CGU_BOOL cmp_ImageHasAlpha(CGU_UINT32 threadBase)
{
#if defined(ENABLED_MODE6) || defined(ENABLE_CMP_MODE6)
    CGU_UINT32 alpha;
    for (CGU_INT ii = 0; ii < 16; ii++)
    {
        alpha = shared_temp[threadBase + ii].pixel.a;
        if (alpha < 255)
            return true;
    }
#endif
    return false;
}
#ifdef ENABLE_CMP_API
CGU_UINT32 GetRamp2(CGU_UINT32 e0, CGU_UINT32 e1, CGU_UINT32 index, CGU_UINT32 indexprecision)
{
    if (indexprecision == 2)
        return (CGU_UINT32)(((64 - aWeight[2][index]) * e0 + aWeight[2][index] * e1 + 32) >> 6);
    else if (indexprecision == 3)
        return (CGU_UINT32)(((64 - aWeight[1][index]) * e0 + aWeight[1][index] * e1 + 32) >> 6);
    else  // indexprecision == 4
        return (CGU_UINT32)(((64 - aWeight[0][index]) * e0 + aWeight[0][index] * e1 + 32) >> 6);
}
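//--------------------------------------------------------------------------------------
// GetRamp2 is the spec-defined BC7 palette interpolation: with 6-bit weights w, the
// palette entry is (e0 * (64 - w) + e1 * w + 32) >> 6. A standalone check against the
// 2-bit weight table from the BC7 specification, assuming only standard C++:
//--------------------------------------------------------------------------------------
#if 0  // standalone illustration; not compiled into the encoder
#include <cassert>
#include <cstdint>

static const int kWeight2[4] = {0, 21, 43, 64};  // BC7 2-bit index weights

static uint32_t ramp(uint32_t e0, uint32_t e1, int w)
{
    return (e0 * (64 - w) + e1 * w + 32) >> 6;
}

static void ramp_demo()
{
    // Endpoints are reproduced exactly at w = 0 and w = 64.
    assert(ramp(0, 255, kWeight2[0]) == 0);
    assert(ramp(0, 255, kWeight2[3]) == 255);
    // Interior entries sit at 21/64 and 43/64 of the span, rounded.
    assert(ramp(0, 255, kWeight2[1]) == 84);
    assert(ramp(0, 255, kWeight2[2]) == 171);
}
#endif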
//====================================== MODE 6 ==========================================
void cmp_encode_apply_swap(CMP_INOUT CGU_Vec4ui epo_code_out[2], CMP_INOUT CGU_UINT32 block_index[2], CMP_IN CGU_INT bits)
{
    CGU_UINT32 levels = 1 << bits;
    if ((block_index[0] & 15) >= levels / 2)
    {
        // swap end points
        CGU_Vec4ui t    = epo_code_out[0];
        epo_code_out[0] = epo_code_out[1];
        epo_code_out[1] = t;
        block_index[0]  = (CGU_UINT32)(0x11111111 * (levels - 1)) - block_index[0];
        block_index[1]  = (CGU_UINT32)(0x11111111 * (levels - 1)) - block_index[1];
    }
}
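//--------------------------------------------------------------------------------------
// The 0x11111111 * (levels - 1) trick flips all eight 4-bit indices packed in one
// 32-bit word at once: subtracting each nibble n from (levels - 1) never borrows,
// because n <= levels - 1, so one wide subtraction replaces eight per-nibble ones.
// A standalone verification sketch, assuming plain C++:
//--------------------------------------------------------------------------------------
#if 0  // standalone illustration; not compiled into the encoder
#include <cassert>
#include <cstdint>

static void nibble_flip_demo()
{
    const uint32_t levels  = 16;                              // 4-bit indices
    const uint32_t packed  = 0x0F3A5C71u;                     // eight packed indices
    const uint32_t flipped = 0x11111111u * (levels - 1) - packed;
    for (int i = 0; i < 8; i++)
    {
        uint32_t n = (packed >> (i * 4)) & 15;
        uint32_t f = (flipped >> (i * 4)) & 15;
        assert(f == (levels - 1) - n);                        // every nibble complemented
    }
}
#endif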
CGU_INT cmp_Write32Bit(CMP_INOUT CGU_UINT32 base[4], CMP_IN CGU_INT offset, CMP_IN CGU_INT bits, CMP_IN CGU_UINT32 bitVal)
{
    base[offset / 32] |= ((CGU_UINT32)bitVal) << (offset % 32);
    if (offset % 32 + bits > 32)
    {
        if ((offset / 32 + 1) < 4)
            base[(offset / 32) + 1] |= cmp_shift_right_uint32(bitVal, 32 - offset % 32);
    }
    offset += bits;
    return offset;
}
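//--------------------------------------------------------------------------------------
// cmp_Write32Bit appends `bits` bits at `offset` inside the 128-bit block, splitting a
// value that straddles a 32-bit word boundary across two words. A standalone sketch of
// the same split, assuming plain C++ (uint64_t stands in for two adjacent block words):
//--------------------------------------------------------------------------------------
#if 0  // standalone illustration; not compiled into the encoder
#include <cassert>
#include <cstdint>

static void split_write_demo()
{
    uint32_t       words[2] = {0, 0};
    const int      offset   = 30;                // 7-bit value crossing the boundary
    const uint32_t v        = 0x5B;              // 1011011b
    words[0] |= v << (offset % 32);              // low 2 bits land in word 0
    words[1] |= v >> (32 - offset % 32);         // high 5 bits land in word 1
    uint64_t combined = ((uint64_t)words[1] << 32) | words[0];
    assert(((combined >> offset) & 0x7F) == v);  // round-trips across the boundary
}
#endif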
void cmp_encode_index2(CMP_INOUT CGU_UINT32 data[4], CMP_IN CGU_INT pPos, CMP_INOUT CGU_UINT32 color_index[2], CMP_IN CGU_INT bits, CMP_IN CGU_INT flips)
{
    CGU_INT levels        = 1 << bits;
    CGU_INT flips_shifted = flips;
    for (CGU_INT k1 = 0; k1 < 2; k1++)
    {
        CGU_UINT32 qbits_shifted = color_index[k1];
        for (CGU_INT k2 = 0; k2 < 8; k2++)
        {
            CGU_UINT32 q = qbits_shifted & 15;
            if ((flips_shifted & 1) > 0)
                q = (levels - 1) - q;
            if (k1 == 0 && k2 == 0)
                pPos = cmp_Write32Bit(data, pPos, bits - 1, q);  // anchor index: MSB is implied 0
            else
                pPos = cmp_Write32Bit(data, pPos, bits, q);
            qbits_shifted >>= 4;
            flips_shifted >>= 1;
        }
    }
}
void cmp_eigen_vector(CMP_INOUT CGV_Vec4f CMP_REFINOUT eigen_vector,
                      CMP_INOUT CGU_Vec4f CMP_REFINOUT image_mean,
                      CMP_IN CGV_Vec4ui image_src[16],
                      CMP_IN CGU_INT numEntries)
{
    CGU_INT k;
    image_mean   = 0.0f;
    eigen_vector = 0.0f;
    CGV_FLOAT vector_covOut[10];
    CGV_FLOAT covar[10] = {0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f};
    CGV_Vec4f rgbasum   = {0.0f, 0.0f, 0.0f, 0.0f};
    for (k = 0; k < numEntries; k++)
    {
        CGV_Vec4f rgba;
        rgba.x = image_src[k].x;
        rgba.y = image_src[k].y;
        rgba.z = image_src[k].z;
        rgba.w = image_src[k].w;
        rgbasum.x += rgba.x;
        rgbasum.y += rgba.y;
        rgbasum.z += rgba.z;
        rgbasum.w += rgba.w;
        // upper triangle of the symmetric 4x4 covariance matrix, packed into 10 floats
        covar[0] += rgba.x * rgba.x;  // covar[0].x => covar[0]
        covar[1] += rgba.x * rgba.y;  // covar[0].y => covar[1]
        covar[2] += rgba.x * rgba.z;  // covar[0].z => covar[2]
        covar[3] += rgba.x * rgba.w;  // covar[0].w => covar[3]
        covar[4] += rgba.y * rgba.y;  // covar[1].y => covar[4]
        covar[5] += rgba.y * rgba.z;  // covar[1].z => covar[5]
        covar[6] += rgba.y * rgba.w;  // covar[1].w => covar[6]
        covar[7] += rgba.z * rgba.z;  // covar[2].z => covar[7]
        covar[8] += rgba.z * rgba.w;  // covar[2].w => covar[8]
        covar[9] += rgba.w * rgba.w;  // covar[3].w => covar[9]
    }
    image_mean = rgbasum / (CGV_FLOAT)numEntries;
    vector_covOut[0] = covar[0] - (rgbasum.x * rgbasum.x / numEntries);
    vector_covOut[1] = covar[1] - (rgbasum.x * rgbasum.y / numEntries);
    vector_covOut[2] = covar[2] - (rgbasum.x * rgbasum.z / numEntries);
    vector_covOut[3] = covar[3] - (rgbasum.x * rgbasum.w / numEntries);
    vector_covOut[4] = covar[4] - (rgbasum.y * rgbasum.y / numEntries);
    vector_covOut[5] = covar[5] - (rgbasum.y * rgbasum.z / numEntries);
    vector_covOut[6] = covar[6] - (rgbasum.y * rgbasum.w / numEntries);
    vector_covOut[7] = covar[7] - (rgbasum.z * rgbasum.z / numEntries);
    vector_covOut[8] = covar[8] - (rgbasum.z * rgbasum.w / numEntries);
    vector_covOut[9] = covar[9] - (rgbasum.w * rgbasum.w / numEntries);
    CGV_FLOAT inv_var = 1.0f / (256 * 256);  // = 1.5258789e-05; a multiply is faster than a divide on GPU
    for (k = 0; k < 10; k++)
    {
        vector_covOut[k] = vector_covOut[k] * inv_var;
    }
    // Compute eigen_vector by power iteration
    CGV_Vec4f vec             = {1.0f, 1.0f, 1.0f, 1.0f};
    CGU_INT powerIterations   = 6;  // 4 is not enough for HQ; the quality setting could vary this from 2..n
    for (k = 0; k < powerIterations; k++)
    {
        eigen_vector.x = vector_covOut[0] * vec.x + vector_covOut[1] * vec.y + vector_covOut[2] * vec.z + vector_covOut[3] * vec.w;
        eigen_vector.y = vector_covOut[1] * vec.x + vector_covOut[4] * vec.y + vector_covOut[5] * vec.z + vector_covOut[6] * vec.w;
        eigen_vector.z = vector_covOut[2] * vec.x + vector_covOut[5] * vec.y + vector_covOut[7] * vec.z + vector_covOut[8] * vec.w;
        eigen_vector.w = vector_covOut[3] * vec.x + vector_covOut[6] * vec.y + vector_covOut[8] * vec.z + vector_covOut[9] * vec.w;
        // renormalize every other iteration
        if (k % 2 == 1)
        {
            CGV_FLOAT norm_sq = cmp_dot4f(eigen_vector, eigen_vector);
            CGV_FLOAT rnorm   = cmp_Image_rsqrt(norm_sq);
            vec = eigen_vector * rnorm;
        }
        else
            vec = eigen_vector;
    }
    eigen_vector = vec;
    //printf("eigen_vector [%1.8f,%1.3f,%1.8f,%1.8f]\n", eigen_vector.x, eigen_vector.y, eigen_vector.z, eigen_vector.w);
}
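//--------------------------------------------------------------------------------------
// The loop above is plain power iteration: repeatedly multiplying a seed vector by the
// covariance matrix converges to the eigenvector of the largest eigenvalue (the block's
// principal color axis), renormalizing occasionally to keep values in range. A minimal
// standalone sketch on a 2x2 symmetric matrix, assuming plain C++:
//--------------------------------------------------------------------------------------
#if 0  // standalone illustration; not compiled into the encoder
#include <cmath>
#include <cstdio>

static void power_iteration_demo()
{
    // Symmetric covariance with dominant axis along (1, 1) / sqrt(2).
    const double c[2][2] = {{2.0, 1.0}, {1.0, 2.0}};
    double v[2] = {1.0, 0.0};
    for (int it = 0; it < 6; it++)
    {
        double nx  = c[0][0] * v[0] + c[0][1] * v[1];
        double ny  = c[1][0] * v[0] + c[1][1] * v[1];
        double inv = 1.0 / std::sqrt(nx * nx + ny * ny);
        v[0] = nx * inv;
        v[1] = ny * inv;
    }
    std::printf("eigen vector ~ (%.4f, %.4f)\n", v[0], v[1]);  // ~ (0.7071, 0.7071)
}
#endif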
void cmp_endpoints2(CMP_INOUT CGU_Vec4ui end_points_out[2], CMP_IN CGV_Vec4f ext[2], CMP_IN CGV_Vec4f eigen_vector, CMP_IN CGV_Vec4f image_mean)
{
    CGV_FLOAT levelHigh = 255;  // Mode 6: levels = 1 << 7 = 128, so high = (levels * 2) - 1 = 255
    CGV_FLOAT levelLow  = 254;  // and low = (levels * 2) - 2 = 254
    CGV_Vec4f qep_b[2];
    CGV_FLOAT err0 = 0.0f;
    CGV_FLOAT err1 = 0.0f;
    CGV_Vec4f block_endpoints[2];
    block_endpoints[0] = ext[0] * eigen_vector + image_mean;
    block_endpoints[1] = ext[1] * eigen_vector + image_mean;
    for (CGU_INT subset = 0; subset < 2; subset++)
    {
        // this code affects quality: quantize to the nearest even code (qep_b[0]) and nearest odd code (qep_b[1])
        qep_b[0].x = cmp_clampf((CGV_INT)((block_endpoints[subset].x / 255.0f * levelHigh) / 2.0f + 0.5f) * 2.0f, 0, levelLow);
        qep_b[0].y = cmp_clampf((CGV_INT)((block_endpoints[subset].y / 255.0f * levelHigh) / 2.0f + 0.5f) * 2.0f, 0, levelLow);
        qep_b[0].z = cmp_clampf((CGV_INT)((block_endpoints[subset].z / 255.0f * levelHigh) / 2.0f + 0.5f) * 2.0f, 0, levelLow);
        qep_b[0].w = cmp_clampf((CGV_INT)((block_endpoints[subset].w / 255.0f * levelHigh) / 2.0f + 0.5f) * 2.0f, 0, levelLow);
        qep_b[1].x = cmp_clampf((CGV_INT)((block_endpoints[subset].x / 255.0f * levelHigh - 1) / 2.0f + 0.5f) * 2 + 1, 1, levelHigh);
        qep_b[1].y = cmp_clampf((CGV_INT)((block_endpoints[subset].y / 255.0f * levelHigh - 1) / 2.0f + 0.5f) * 2 + 1, 1, levelHigh);
        qep_b[1].z = cmp_clampf((CGV_INT)((block_endpoints[subset].z / 255.0f * levelHigh - 1) / 2.0f + 0.5f) * 2 + 1, 1, levelHigh);
        qep_b[1].w = cmp_clampf((CGV_INT)((block_endpoints[subset].w / 255.0f * levelHigh - 1) / 2.0f + 0.5f) * 2 + 1, 1, levelHigh);
        err0 = cmp_dot4f(block_endpoints[subset] - qep_b[0], block_endpoints[subset] - qep_b[0]);
        err1 = cmp_dot4f(block_endpoints[subset] - qep_b[1], block_endpoints[subset] - qep_b[1]);
        if (subset == 0)
        {
            end_points_out[1].x = (err0 < err1) ? qep_b[0].x : qep_b[1].x;
            end_points_out[1].y = (err0 < err1) ? qep_b[0].y : qep_b[1].y;
            end_points_out[1].z = (err0 < err1) ? qep_b[0].z : qep_b[1].z;
            end_points_out[1].w = (err0 < err1) ? qep_b[0].w : qep_b[1].w;
        }
        else
        {
            end_points_out[0].x = ((err0 < err1) ? qep_b[0].x : qep_b[1].x);
            end_points_out[0].y = ((err0 < err1) ? qep_b[0].y : qep_b[1].y);
            end_points_out[0].z = ((err0 < err1) ? qep_b[0].z : qep_b[1].z);
            end_points_out[0].w = ((err0 < err1) ? qep_b[0].w : qep_b[1].w);
        }
    }
}
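//--------------------------------------------------------------------------------------
// Mode 6 stores 7-bit endpoint channels plus a shared p-bit, so every channel of one
// endpoint is either even (p = 0) or odd (p = 1) on the 0..255 scale. cmp_endpoints2
// therefore quantizes each endpoint twice, once to the nearest even and once to the
// nearest odd value, and keeps the candidate with the smaller squared error. A
// standalone sketch of that per-channel rounding, assuming plain C++17:
//--------------------------------------------------------------------------------------
#if 0  // standalone illustration; not compiled into the encoder
#include <algorithm>
#include <cmath>

// Round c (0..255) to the nearest even and the nearest odd representable value.
static void quant_pbit_demo(float c, int& even, int& odd)
{
    even = std::clamp((int)std::floor(c / 2.0f + 0.5f) * 2, 0, 254);
    odd  = std::clamp((int)std::floor((c - 1.0f) / 2.0f + 0.5f) * 2 + 1, 1, 255);
    // e.g. c = 200.7f -> even = 200, odd = 201; the encoder keeps whichever is closer.
}
#endif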
void cmp_block_endpoints(CMP_INOUT CGU_Vec4ui end_points_out[2],
                         CMP_IN CGV_Vec4f eigen_vector,
                         CMP_IN CGV_Vec4f image_mean,
                         CMP_IN CGU_Vec4ui image_src[16],
                         CMP_IN CGU_INT numEntries,     // IN: range 0..15 (MAX_SUBSET_SIZE)
                         CMP_IN CGU_INT partition_mask  // 0xFFFF:FFFF
)
{
    CGV_Vec4f ext[2] = {{255.0f, 255.0f, 255.0f, 255.0f}, {0.0f, 0.0f, 0.0f, 0.0f}};
    // find the min/max projections of the masked pixels onto the principal axis
    CGV_INT mask_shifted = partition_mask << 1;
    for (CGU_INT k3 = 0; k3 <= numEntries; k3++)
    {
        mask_shifted >>= 1;
        if ((mask_shifted & 1) == 0)
            continue;
        CGV_FLOAT dot = 0;
        CGV_Vec4f diff;
        diff.x = image_src[k3].x - image_mean.x;
        diff.y = image_src[k3].y - image_mean.y;
        diff.z = image_src[k3].z - image_mean.z;
        diff.w = image_src[k3].w - image_mean.w;
        dot += cmp_dot4f(eigen_vector, diff);
        ext[0].x = cmp_minf(ext[0].x, dot);
        ext[0].y = cmp_minf(ext[0].y, dot);
        ext[0].z = cmp_minf(ext[0].z, dot);
        ext[0].w = cmp_minf(ext[0].w, dot);
        ext[1].x = cmp_maxf(ext[1].x, dot);
        ext[1].y = cmp_maxf(ext[1].y, dot);
        ext[1].z = cmp_maxf(ext[1].z, dot);
        ext[1].w = cmp_maxf(ext[1].w, dot);
    }
    // create some distance if the endpoints collapse
    if (ext[1].x - ext[0].x < 1.0f)
    {
        ext[0] -= 0.5f;
        ext[1] += 0.5f;
    }
    cmp_endpoints2(end_points_out, ext, eigen_vector, image_mean);
}
CGV_UINT8 clampIndex2(CGV_UINT8 v, CGV_UINT8 a, CGV_UINT8 b)
{
    if (v < a)
        return a;
    else if (v > b)
        return b;
    return v;
}
void cmp_block_index(CMP_INOUT CGU_UINT32 index_out[16],
                     CMP_IN CGV_Vec4f eigen_vector,
                     CMP_IN CGV_Vec4f image_mean,
                     CMP_IN CGU_Vec4ui image_src[16],
                     CMP_IN CGU_UINT32 numEntries  // Range 0..15 (MAX_SUBSET_SIZE)
)
{
    //=====================
    // Get Projected Index
    //=====================
    CGV_FLOAT image_projected[16];
    CGV_FLOAT image_v[16];
    CGV_FLOAT image_z[16];
    CGV_FLOAT projected_high;  // Values are +ve about the centered image projection
    CGV_FLOAT projected_low;   // Values are -ve about the centered image projection
    CGV_FLOAT image_s;
    //====================================================================
    // Center the image to new coordinate axis centered at the mean value
    //====================================================================
    CGV_Vec4f image_centered[16];
    CGV_Vec4f diff;
    for (CGU_UINT32 k1 = 0; k1 <= numEntries; k1++)
    {
        diff.x = image_src[k1].x - image_mean.x;
        diff.y = image_src[k1].y - image_mean.y;
        diff.z = image_src[k1].z - image_mean.z;
        diff.w = image_src[k1].w - image_mean.w;
        image_centered[k1]  = diff * eigen_vector;
        image_projected[k1] = image_centered[k1].x + image_centered[k1].y + image_centered[k1].z + image_centered[k1].w;
    }
    projected_high = image_projected[0];
    projected_low  = image_projected[0];
    for (CGU_UINT32 i1 = 1; i1 <= numEntries; i1++)
    {
        if (projected_high < image_projected[i1])
            projected_high = image_projected[i1];
        if (projected_low > image_projected[i1])
            projected_low = image_projected[i1];
    }
    // note: img_diff is negative by construction, so image_s maps higher projections
    // to lower indices; cmp_endpoints2 writes its endpoints in swapped order to match
    CGV_FLOAT img_diff = projected_low - projected_high;
    if (img_diff == 0.0f)
        return;
    image_s = numEntries / img_diff;
    // Get initial index projection
    for (CGU_UINT32 idx = 0; idx <= numEntries; idx++)
    {
        image_v[idx]   = image_projected[idx] * image_s;
        image_z[idx]   = floor(image_v[idx] + 0.5F - projected_high * image_s);
        index_out[idx] = (CGV_UINT32)image_z[idx];
    }
    // get minimum index
    CGU_UINT32 index_min = index_out[0];
    for (CGU_UINT32 i3 = 1; i3 <= numEntries; i3++)
    {
        if (index_out[i3] < index_min)
            index_min = index_out[i3];
    }
    // Reposition all indices relative to the minimum (so the minimum becomes 0)
    //printf("index : ");
    for (CGU_UINT32 i4 = 0; i4 <= numEntries; i4++)
    {
        index_out[i4] = clampIndex2(index_out[i4] - index_min, 0, 15);
        //printf("%02x,", index_out[i4]);
    }
    //printf("\n");
}
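//--------------------------------------------------------------------------------------
// cmp_block_index quantizes each pixel's projection onto the principal axis into an
// index: scale so the projected span covers the index range, round, then shift so the
// smallest index is 0. A standalone 1-D sketch of that mapping, assuming plain C++
// (with a positive scale for readability; the encoder's negative scale only reverses
// the ordering, which the swapped endpoint order compensates for):
//--------------------------------------------------------------------------------------
#if 0  // standalone illustration; not compiled into the encoder
#include <cassert>
#include <cmath>

static void projection_index_demo()
{
    const float proj[4] = {-12.0f, -2.0f, 3.0f, 18.0f};  // centered projections
    const int   levels  = 4;                             // 2-bit indices
    const float lo = -12.0f, hi = 18.0f;
    int idx[4];
    for (int i = 0; i < 4; i++)
        idx[i] = (int)std::floor((proj[i] - lo) * (levels - 1) / (hi - lo) + 0.5f);
    assert(idx[0] == 0 && idx[1] == 1 && idx[2] == 2 && idx[3] == 3);
}
#endif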
CGU_UINT32 cmp_calcblockerr(CGU_Vec4ui endPoint_in[2], CGU_Vec4ui image_src[16])
{
    CGU_UINT32 error = 0;
    CGU_Vec4ui pixel = image_src[0];
    CGU_Vec4ui endPoint[2];
    CGU_Vec4i  pixelDiff;
    endPoint[0] = endPoint_in[0];
    endPoint[1] = endPoint_in[1];
    pixelDiff.x = pixel.x - endPoint[0].x;
    pixelDiff.y = pixel.y - endPoint[0].y;
    pixelDiff.z = pixel.z - endPoint[0].z;
    pixelDiff.w = pixel.w - endPoint[0].w;
    CGU_Vec4i span;
    CGU_Vec2i span_norm_sqr;
    CGU_Vec2i dotProduct;
    span.x = endPoint[1].x - endPoint[0].x;
    span.y = endPoint[1].y - endPoint[0].y;
    span.z = endPoint[1].z - endPoint[0].z;
    span.w = endPoint[1].w - endPoint[0].w;
    span_norm_sqr = cmp_dotVec4i(span, span);
    dotProduct    = cmp_dotVec4i(span, pixelDiff);
    if (span_norm_sqr.x > 0 && dotProduct.x >= 0 && CGU_UINT32(dotProduct.x * 63.49999) > CGU_UINT32(32 * span_norm_sqr.x))
    {
        span.x = -span.x;
        span.y = -span.y;
        span.z = -span.z;
        span.w = -span.w;
        swap(endPoint[0], endPoint[1]);
    }
    CGU_UINT32 color_index;
    CGU_Vec4ui pixel_r;
    for (CGU_UINT32 i = 0; i < 16; i++)
    {
        pixel       = image_src[i];
        pixelDiff.x = pixel.x - endPoint[0].x;
        pixelDiff.y = pixel.y - endPoint[0].y;
        pixelDiff.z = pixel.z - endPoint[0].z;
        pixelDiff.w = pixel.w - endPoint[0].w;
        dotProduct.x = cmp_dotVec4i(span, pixelDiff);
        color_index  = (span_norm_sqr.x <= 0 || dotProduct.x <= 0)
                           ? 0
                           : ((dotProduct.x < span_norm_sqr.x) ? aStep[0][CGU_UINT32(dotProduct.x * 63.49999 / span_norm_sqr.x)] : aStep[0][63]);
        pixel_r = (endPoint[0] * (64 - aWeight[0][color_index]) + endPoint[1] * aWeight[0][color_index] + 32u) >> 6;
        Ensure_A_Is_Larger(pixel_r, pixel);
        pixel_r -= pixel;
        error += ComputeError(pixel_r, pixel_r);
    }
    return error;
}
CGU_FLOAT cmp_GetIndexedEndPoints(CMP_INOUT CGU_Vec4ui epo_code_out[2],
                                  CMP_INOUT CGU_UINT32 index_out[16],
                                  CMP_IN CGU_Vec4ui image_src[16],
                                  CMP_IN CGU_INT numEntries,
                                  CMP_IN CGU_INT partition_mask)
{
    CGV_Vec4f image_mean = {0.0f, 0.0f, 0.0f, 0.0f};
    CGV_Vec4f eigen_vector;
    for (CGU_INT i0 = 0; i0 < 16; i0++)
        index_out[i0] = 0;
    cmp_eigen_vector(eigen_vector, image_mean, image_src, numEntries);
    cmp_block_endpoints(epo_code_out, eigen_vector, image_mean, image_src, numEntries, partition_mask);
    cmp_block_index(index_out, eigen_vector, image_mean, image_src, numEntries);
    CGU_UINT32 besterr = cmp_calcblockerr(epo_code_out, image_src);
    return besterr;
}
void cmp_encode_mode6(CMP_INOUT CGU_UINT32 cmp_out[4], CMP_IN CGU_Vec4ui epo_code_out[2], CMP_IN CGU_UINT32 packed_color_index[2])
{
    cmp_encode_apply_swap(epo_code_out, packed_color_index, 4);
    CGU_INT k;
    for (k = 0; k < 4; k++)
        cmp_out[k] = 0;
    CGU_INT pos = 0;
    // mode 6: the mode prefix has bit 6 set (value 64 written as 7 bits)
    pos = cmp_Write32Bit(cmp_out, pos, 7, 64);
    // endpoints (7 bits per channel; the low bit goes into the p-bits below)
    pos = cmp_Write32Bit(cmp_out, pos, 7, epo_code_out[0].x >> 1);
    pos = cmp_Write32Bit(cmp_out, pos, 7, epo_code_out[1].x >> 1);
    pos = cmp_Write32Bit(cmp_out, pos, 7, epo_code_out[0].y >> 1);
    pos = cmp_Write32Bit(cmp_out, pos, 7, epo_code_out[1].y >> 1);
    pos = cmp_Write32Bit(cmp_out, pos, 7, epo_code_out[0].z >> 1);
    pos = cmp_Write32Bit(cmp_out, pos, 7, epo_code_out[1].z >> 1);
    pos = cmp_Write32Bit(cmp_out, pos, 7, epo_code_out[0].w >> 1);
    pos = cmp_Write32Bit(cmp_out, pos, 7, epo_code_out[1].w >> 1);
    // p bits
    pos = cmp_Write32Bit(cmp_out, pos, 1, epo_code_out[0].x & 1);
    pos = cmp_Write32Bit(cmp_out, pos, 1, epo_code_out[1].x & 1);
    // quantized values
    cmp_encode_index2(cmp_out, pos, packed_color_index, 4, 0);
}
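//--------------------------------------------------------------------------------------
// Sanity check on the mode 6 layout written above: 7 mode bits, 2 endpoints x 4
// channels x 7 bits, 2 p-bits, and 16 4-bit indices with one anchor bit dropped add up
// to exactly one 128-bit BC7 block. A standalone sketch, assuming plain C++11:
//--------------------------------------------------------------------------------------
#if 0  // standalone illustration; not compiled into the encoder
static_assert(7                   // mode prefix (bit 6 set)
                  + 2 * 4 * 7     // RGBA endpoint pairs, 7 bits per channel
                  + 2             // per-endpoint p-bits
                  + (16 * 4 - 1)  // 4-bit indices; the anchor stores only 3 bits
              == 128,
              "BC7 mode 6 must fill the block exactly");
#endif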
//====================================== MODES 01237 ==========================================
CGU_UINT32 index_collapse2(CMP_INOUT CGU_UINT32 index[16], CGU_UINT32 numEntries)
{
    CGU_UINT32 minIndex = index[0];
    CGU_UINT32 MaxIndex = index[0];
    for (CGU_UINT32 km = 1; km < numEntries; km++)
    {
        if (index[km] < minIndex)
            minIndex = index[km];
        if (index[km] > MaxIndex)
            MaxIndex = index[km];
    }
    if (MaxIndex == 0)
        return 0;
    // find the largest common stride D that divides every (index - minIndex)
    CGU_UINT32 D = 1;
    for (CGU_UINT32 d = 2; d <= MaxIndex - minIndex; d++)
    {
        CGU_UINT32 ent = 0U;
        for (; ent < numEntries; ent++)
        {
            if (fmod((CGU_FLOAT)(index[ent] - minIndex), (CGU_FLOAT)d) > 0.0f)
                break;  // d does not divide this offset
        }
        if (ent >= numEntries)  // every offset is divisible by d
            D = d;
    }
    CGU_FLOAT invD = 1.0f / D;
    for (CGU_UINT32 ki = 0; ki < numEntries; ki++)
    {
        index[ki] = (CGU_UINT32)((index[ki] - minIndex) * invD);
    }
    // recompute the max over the collapsed indices
    MaxIndex = index[0];
    for (CGU_UINT32 k = 1; k < numEntries; k++)
    {
        if (index[k] > MaxIndex)
            MaxIndex = index[k];
    }
    return (MaxIndex);
}
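//--------------------------------------------------------------------------------------
// index_collapse2 maps a sparse index set onto the smallest equivalent ramp: it
// subtracts the minimum and divides by the largest common stride. For example
// {2, 6, 4, 2} -> offsets {0, 4, 2, 0} -> stride D = 2 -> {0, 2, 1, 0}, MaxIndex 2.
// A standalone integer version of the same idea, assuming plain C++17:
//--------------------------------------------------------------------------------------
#if 0  // standalone illustration; not compiled into the encoder
#include <cassert>
#include <numeric>

static void index_collapse_demo()
{
    unsigned idx[4] = {2, 6, 4, 2};
    unsigned lo = 2, stride = 0;
    for (unsigned v : idx)
        stride = std::gcd(stride, v - lo);  // gcd of offsets = largest common stride
    for (unsigned& v : idx)
        v = (v - lo) / (stride ? stride : 1);
    assert(idx[0] == 0 && idx[1] == 2 && idx[2] == 1 && idx[3] == 0);
}
#endif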
INLINE void GetClusterMean2(CMP_INOUT CGV_Vec4f image_cluster_mean[16],
                            CMP_IN CGU_Vec4ui image_src[16],
                            CMP_IN CGU_UINT32 index_cluster[16],
                            CMP_IN CGU_UINT32 numEntries,     // < 16
                            CMP_IN CGU_UINT32 channels3or4)   // 3 = RGB or 4 = RGBA (4 = MAX_CHANNELS)
{
    // unused index values are undefined
    CGU_UINT32 i_cnt[16];
    CGU_UINT32 i_comp[16];
    CGU_UINT32 idx;
    for (CGU_UINT32 i0 = 0; i0 < numEntries; i0++)
    {
        idx                     = index_cluster[i0] & 0x0F;
        i_cnt[idx]              = 0;
        image_cluster_mean[idx] = 0.0f;
    }
    CGU_UINT32 ic = 0;
    for (CGU_UINT32 i1 = 0; i1 < numEntries; i1++)
    {
        idx = index_cluster[i1] & 0x0F;
        if (i_cnt[idx] == 0)
            i_comp[ic++] = idx;
        i_cnt[idx]++;
        image_cluster_mean[idx].x += image_src[i1].x;
        image_cluster_mean[idx].y += image_src[i1].y;
        image_cluster_mean[idx].z += image_src[i1].z;
        image_cluster_mean[idx].w += image_src[i1].w;
    }
    for (CGU_UINT32 i = 0; i < ic; i++)
    {
        CGU_UINT32 icmp = i_comp[i];
        if (i_cnt[icmp] != 0)
        {
            image_cluster_mean[icmp].x = (CGV_FLOAT)floor((image_cluster_mean[icmp].x / (CGV_FLOAT)i_cnt[icmp]) + 0.5F);
            image_cluster_mean[icmp].y = (CGV_FLOAT)floor((image_cluster_mean[icmp].y / (CGV_FLOAT)i_cnt[icmp]) + 0.5F);
            image_cluster_mean[icmp].z = (CGV_FLOAT)floor((image_cluster_mean[icmp].z / (CGV_FLOAT)i_cnt[icmp]) + 0.5F);
            if (channels3or4 == 4)
                image_cluster_mean[icmp].w = (CGV_FLOAT)floor((image_cluster_mean[icmp].w / (CGV_FLOAT)i_cnt[icmp]) + 0.5F);
            else
                image_cluster_mean[icmp].w = 0.0f;
        }
    }
}
#ifndef ASPM_HLSL  // CPU Version
#define USE_OLDCODE
INLINE CGU_UINT8 cmp_get_partition_subset2(CMP_IN CGU_INT part_id, CMP_IN CGU_INT maxSubsets, CMP_IN CGU_INT index)
{
    if (maxSubsets == 2)
    {
        CGU_UINT32 mask_packed = subset_mask_table2[part_id];
        return ((mask_packed & (0x01 << index)) ? 1 : 0);  // This can be moved to caller, just return mask!!
    }
    // 3 region subsets
    part_id += 64;
    CGU_UINT32 mask0 = subset_mask_table2[part_id] & 0xFFFF;
    CGU_UINT32 mask1 = subset_mask_table2[part_id] >> 16;
    CGU_UINT32 mask  = 0x01 << index;
    // subset 2 if the bit is set in mask1, subset 1 if set in mask0, else subset 0
    return ((mask1 & mask) ? 2 : ((mask0 & mask) ? 1 : 0));  // This can be moved to caller, just return mask!!
}
void cmp_GetPartitionSubSet2_mode01237(CMP_INOUT CGV_Vec4ui image_subsets[3][16],  // OUT: Subset pattern mapped with image src colors
                                       CMP_INOUT CGU_INT entryCount_out[3],        // OUT: Number of entries per subset
                                       CMP_IN CGU_UINT8 partition,                 // Partition Shape 0..63
                                       CMP_IN CGV_Vec4ui image_src[16],            // Image colors
                                       CMP_IN CGU_INT blockMode,                   // [0,1,2,3 or 7]
                                       CMP_IN CGU_UINT8 channels3or4)              // 3 = RGB or 4 = RGBA (4 = MAX_CHANNELS)
{
    CGU_UINT8 maxSubsets = 2;
    if (blockMode == 0 || blockMode == 2)
        maxSubsets = 3;
    entryCount_out[0] = 0;
    entryCount_out[1] = 0;
    entryCount_out[2] = 0;
    for (CGU_INT i = 0; i < 16; i++)
    {
        CGU_UINT8 subset = cmp_get_partition_subset2(partition, maxSubsets, i);
        image_subsets[subset][entryCount_out[subset]].x = image_src[i].x;
        image_subsets[subset][entryCount_out[subset]].y = image_src[i].y;
        image_subsets[subset][entryCount_out[subset]].z = image_src[i].z;
        // if we have only 3 channels then set the alpha subset to 0
        if (channels3or4 == 3)
            image_subsets[subset][entryCount_out[subset]].w = 0;
        else
            image_subsets[subset][entryCount_out[subset]].w = image_src[i].w;
        entryCount_out[subset]++;
    }
}
void cmp_GetImageCentered(CMP_INOUT CGV_Vec4f image_centered[16],
                          CMP_INOUT CGV_Vec4f CMP_REFINOUT mean_out,
                          CMP_IN CGV_Vec4ui image_src[16],
                          CMP_IN CGU_INT numEntries,
                          CMP_IN CGU_UINT8 channels3or4)
{
    mean_out = 0.0f;
    CGU_INT k;
    for (k = 0; k < numEntries; k++)
    {
        mean_out.x = mean_out.x + image_src[k].x;
        mean_out.y = mean_out.y + image_src[k].y;
        mean_out.z = mean_out.z + image_src[k].z;
        if (channels3or4 == 4)
            mean_out.w = mean_out.w + image_src[k].w;
    }
    mean_out /= (CGV_FLOAT)numEntries;
    for (k = 0; k < numEntries; k++)
    {
        image_centered[k].x = image_src[k].x - mean_out.x;
        image_centered[k].y = image_src[k].y - mean_out.y;
        image_centered[k].z = image_src[k].z - mean_out.z;
        if (channels3or4 == 4)
            image_centered[k].w = image_src[k].w - mean_out.w;
    }
}
void cmp_GetCovarianceVector(CMP_INOUT CGV_FLOAT covariance_out[16],
                             CMP_IN CGV_Vec4f image_centered[16],
                             CMP_IN CGU_INT numEntries,
                             CMP_IN CGU_UINT8 channels3or4)
{
    CGU_UINT8 ch1;
    CGU_UINT8 ch2;
    CGU_INT   k;
    // accumulate the lower triangle, then mirror it (the matrix is symmetric)
    for (ch1 = 0; ch1 < channels3or4; ch1++)
        for (ch2 = 0; ch2 <= ch1; ch2++)
        {
            covariance_out[ch1 + ch2 * 4] = 0;
            for (k = 0; k < numEntries; k++)
                covariance_out[ch1 + ch2 * 4] += image_centered[k][ch1] * image_centered[k][ch2];
        }
    for (ch1 = 0; ch1 < channels3or4; ch1++)
        for (ch2 = ch1 + 1; ch2 < channels3or4; ch2++)
            covariance_out[ch1 + ch2 * 4] = covariance_out[ch2 + ch1 * 4];
}
void cmp_GetEigenVector(CMP_INOUT CGV_Vec4f CMP_REFINOUT EigenVector_out,  // Normalized Eigen Vector output
                        CMP_IN CGV_FLOAT CovarianceVector[16],             // Covariance Vector
                        CMP_IN CGU_UINT8 channels3or4)
{
    CGV_FLOAT vector_covIn[16];
    CGV_FLOAT vector_covOut[16];
    CGV_FLOAT vector_maxCovariance;
    CGU_UINT8 ch1;
    CGU_UINT8 ch2;
    CGU_UINT8 ch3;
    for (ch1 = 0; ch1 < channels3or4; ch1++)
        for (ch2 = 0; ch2 < channels3or4; ch2++)
        {
            vector_covIn[ch1 + ch2 * 4] = CovarianceVector[ch1 + ch2 * 4];
        }
    vector_maxCovariance = 0;
    for (ch1 = 0; ch1 < channels3or4; ch1++)
    {
        if (vector_covIn[ch1 + ch1 * 4] > vector_maxCovariance)
            vector_maxCovariance = vector_covIn[ch1 + ch1 * 4];
    }
    // Normalize Input Covariance Vector
    for (ch1 = 0; ch1 < channels3or4; ch1++)
        for (ch2 = 0; ch2 < channels3or4; ch2++)
        {
            if (vector_maxCovariance > 0)
                vector_covIn[ch1 + ch2 * 4] = vector_covIn[ch1 + ch2 * 4] / vector_maxCovariance;
        }
    // Square the covariance matrix: this sharpens the dominant eigenvalue, so the
    // row with the largest diagonal is a good estimate of the principal eigenvector
    for (ch1 = 0; ch1 < channels3or4; ch1++)
    {
        for (ch2 = 0; ch2 < channels3or4; ch2++)
        {
            CGV_FLOAT vector_temp_cov = 0;
            for (ch3 = 0; ch3 < channels3or4; ch3++)
            {
                vector_temp_cov = vector_temp_cov + vector_covIn[ch1 + ch3 * 4] * vector_covIn[ch3 + ch2 * 4];
            }
            vector_covOut[ch1 + ch2 * 4] = vector_temp_cov;
        }
    }
    vector_maxCovariance = 0;
    CGU_INT maxCovariance_channel = 0;
    for (ch1 = 0; ch1 < channels3or4; ch1++)
    {
        if (vector_covOut[ch1 + ch1 * 4] > vector_maxCovariance)
        {
            maxCovariance_channel = ch1;
            vector_maxCovariance  = vector_covOut[ch1 + ch1 * 4];
        }
    }
    CGV_FLOAT vector_t = 0;
    for (ch1 = 0; ch1 < channels3or4; ch1++)
    {
        vector_t = vector_t + vector_covOut[maxCovariance_channel + ch1 * 4] * vector_covOut[maxCovariance_channel + ch1 * 4];
        EigenVector_out[ch1] = vector_covOut[maxCovariance_channel + ch1 * 4];
    }
    // Normalize the Eigen Vector
    vector_t = sqrt(vector_t);
    for (ch1 = 0; ch1 < channels3or4; ch1++)
    {
        if (vector_t > 0)
            EigenVector_out[ch1] = EigenVector_out[ch1] / vector_t;
    }
}
void cmp_GetProjecedImage(CMP_INOUT CGV_FLOAT projection_out[16],
                          CMP_IN CGV_Vec4f image_centered[16],
                          CMP_IN CGU_INT numEntries,
                          CMP_IN CGV_Vec4f EigenVector,
                          CMP_IN CGU_UINT8 channels3or4)
{
    // EigenVector must be normalized
    for (CGU_INT k = 0; k < numEntries; k++)
    {
        projection_out[k] = 0.0F;
        projection_out[k] = projection_out[k] + (image_centered[k].x * EigenVector.x);
        projection_out[k] = projection_out[k] + (image_centered[k].y * EigenVector.y);
        projection_out[k] = projection_out[k] + (image_centered[k].z * EigenVector.z);
        if (channels3or4 == 4)
            projection_out[k] = projection_out[k] + (image_centered[k].w * EigenVector.w);
    }
}
typedef struct
{
    CGV_FLOAT image;
    CGU_UINT8 index;
} CMP_di2;
void cmp_GetProjectedIndex(CMP_INOUT CGU_UINT8 projected_index_out[16],  // output: index, uncentered, in the range 0..clusters-1
                           CMP_IN CGV_FLOAT image_projected[16],         // image_block points, might be uncentered
                           CMP_IN CGU_INT clusters,                      // clusters: number of points in the ramp (max 16)
                           CMP_IN CGU_INT numEntries)
{
    CMP_di2   what[16];
    CGV_FLOAT image_v[16];
    CGV_FLOAT image_z[16];
    CGV_FLOAT image_l;
    CGV_FLOAT image_mm;
    CGV_FLOAT image_r  = 0.0F;
    CGV_FLOAT image_dm = 0.0F;
    CGV_FLOAT image_min;
    CGV_FLOAT image_max;
    CGV_FLOAT image_s;
    CGU_INT   i;
    CGU_INT   j;
    for (i = 0; i < 16; i++)
        projected_index_out[i] = 0;
    // note: image_min ends up holding the largest projection and image_max the
    // smallest, so image_s below is negative; the floor() mapping compensates,
    // mirroring cmp_block_index
    image_min = image_projected[0];
    image_max = image_projected[0];
    for (i = 1; i < numEntries; i++)
    {
        if (image_min < image_projected[i])
            image_min = image_projected[i];
        if (image_max > image_projected[i])
            image_max = image_projected[i];
    }
    CGV_FLOAT img_diff = image_max - image_min;
    if (img_diff == 0.0f)
        return;
    if (cmp_isnan(img_diff))
        return;
    image_s = (clusters - 1) / img_diff;
    for (i = 0; i < numEntries; i++)
    {
        image_v[i] = image_projected[i] * image_s;
        image_z[i] = floor(image_v[i] + 0.5F - image_min * image_s);
        projected_index_out[i] = (CGU_UINT8)image_z[i];
        what[i].image = image_v[i] - image_z[i] - image_min * image_s;
        what[i].index = i;
        image_dm += what[i].image;
        image_r += what[i].image * what[i].image;
    }
    if (numEntries * image_r - image_dm * image_dm >= (CGV_FLOAT)(numEntries - 1) / 8)
    {
        image_dm /= numEntries;
        for (i = 0; i < numEntries; i++)
            what[i].image -= image_dm;
        // insertion sort by fractional residual
        CGU_UINT8 tmp_index;
        CGV_FLOAT tmp_image;
        for (i = 1; i < numEntries; i++)
        {
            for (j = i; j > 0; j--)
            {
                if (what[j - 1].image > what[j].image)
                {
                    tmp_index = what[j].index;
                    tmp_image = what[j].image;
                    what[j].index = what[j - 1].index;
                    what[j].image = what[j - 1].image;
                    what[j - 1].index = tmp_index;
                    what[j - 1].image = tmp_image;
                }
            }
        }
        // we are now in the fundamental simplex;
        // move the coordinate system origin to its center
        // i = 0 < numEntries avoids a varying integer division by 0
        for (i = 0; i < numEntries; i++)
        {
            what[i].image = what[i].image - (CGV_FLOAT)(((2.0f * i + 1) - numEntries) / (2.0f * numEntries));
        }
        image_mm = 0.0F;
        image_l  = 0.0F;
        j = -1;
        for (i = 0; i < numEntries; i++)
        {
            image_l += what[i].image;
            if (image_l < image_mm)
            {
                image_mm = image_l;
                j = i;
            }
        }
        j = j + 1;
        // avoid j = j % numEntries; use this instead
        while (j > numEntries)
            j = j - numEntries;
        for (i = j; i < numEntries; i++)
        {
            CGU_UINT8 idx  = what[i].index;
            CGU_UINT8 pidx = projected_index_out[idx] + 1;  // gather_index(projected_index_out, idx) + 1;
            projected_index_out[idx] = pidx;                // scatter_index(projected_index_out, idx, pidx);
        }
    }
    // get minimum index
    CGU_UINT8 index_min = projected_index_out[0];
    for (i = 1; i < numEntries; i++)
    {
        if (projected_index_out[i] < index_min)
            index_min = projected_index_out[i];
    }
    // reposition all indices relative to the minimum (so the minimum becomes 0)
    for (i = 0; i < numEntries; i++)
    {
        projected_index_out[i] = cmp_clampi(projected_index_out[i] - index_min, 0, 15);
    }
}
CGV_FLOAT cmp_err_Total(CMP_IN CGV_Vec4ui image_src1[16], CMP_IN CGV_Vec4f image_src2[16], CMP_IN CGU_INT numEntries, CMP_IN CGU_UINT8 channels3or4)
{
    CGV_FLOAT err_t = 0.0F;
    for (CGU_INT k = 0; k < numEntries; k++)
    {
        err_t = err_t + cmp_squaref(image_src1[k].x - image_src2[k].x);
        err_t = err_t + cmp_squaref(image_src1[k].y - image_src2[k].y);
        err_t = err_t + cmp_squaref(image_src1[k].z - image_src2[k].z);
        if (channels3or4 == 4)
            err_t = err_t + cmp_squaref(image_src1[k].w - image_src2[k].w);
    }
    return err_t;
}
CGV_FLOAT cmp_GetQuantizeIndex_old(CMP_INOUT CGU_UINT8 index_out[16],
                                   CMP_IN CGV_Vec4ui image_src[16],
                                   CMP_IN CGU_INT numEntries,
                                   CMP_IN CGU_INT numClusters,
                                   CMP_IN CGU_UINT8 channels3or4)
{
    CGV_FLOAT covariance_vector[16];
    CGV_Vec4f image_centered[16];
    CGV_FLOAT image_projected[16];
    CGV_Vec4f image_mean   = 0.0f;
    CGV_Vec4f eigen_vector = 0.0f;
    // Init vars
    for (CGU_INT ik = 0; ik < 16; ik++)
    {
        covariance_vector[ik] = 0.0f;
        image_centered[ik]    = 0.0f;
        image_projected[ik]   = 0.0f;
    }
    cmp_GetImageCentered(image_centered, image_mean, image_src, numEntries, channels3or4);
    cmp_GetCovarianceVector(covariance_vector, image_centered, numEntries, channels3or4);
    //-----------------------------------------------------
    // If all covariances are (near) zero the block is flat:
    // set every index to 0 and return. The trace is compared
    // against a small epsilon (1/256) as the "all same" limit.
    //-----------------------------------------------------
    CGV_FLOAT image_covt = 0.0F;
    image_covt = covariance_vector[0];
    image_covt = image_covt + covariance_vector[5];
    image_covt = image_covt + covariance_vector[10];
    if (channels3or4 == 4)
        image_covt = image_covt + covariance_vector[15];
    if (image_covt < 0.00390625f)
    {
        for (CGU_INT i = 0; i < 16; i++)
            index_out[i] = 0;
        return 0.0f;
    }
    cmp_GetEigenVector(eigen_vector, covariance_vector, channels3or4);
    cmp_GetProjecedImage(image_projected, image_centered, numEntries, eigen_vector, channels3or4);
    cmp_GetProjectedIndex(index_out, image_projected, numClusters, numEntries);
    //==========================================
    // Refine: re-derive the direction from the
    // indices and project once more
    //==========================================
    CGV_FLOAT image_q = 0.0F;
    eigen_vector = 0.0f;
    for (CGU_INT k = 0; k < numEntries; k++)
    {
        eigen_vector.x = eigen_vector.x + image_centered[k].x * index_out[k];
        eigen_vector.y = eigen_vector.y + image_centered[k].y * index_out[k];
        eigen_vector.z = eigen_vector.z + image_centered[k].z * index_out[k];
        if (channels3or4 == 4)
            eigen_vector.w = eigen_vector.w + image_centered[k].w * index_out[k];
    }
    image_q = image_q + eigen_vector.x * eigen_vector.x;
    image_q = image_q + eigen_vector.y * eigen_vector.y;
    image_q = image_q + eigen_vector.z * eigen_vector.z;
    if (channels3or4 == 4)
        image_q = image_q + eigen_vector.w * eigen_vector.w;
    image_q = sqrt(image_q);
    // direction needs to be normalized
    if (image_q != 0.0F)
        eigen_vector = eigen_vector / image_q;
    // Get new projected data
    cmp_GetProjecedImage(image_projected, image_centered, numEntries, eigen_vector, channels3or4);
    cmp_GetProjectedIndex(index_out, image_projected, numClusters, numEntries);
    // Calc Error
    CGV_FLOAT image_t       = 0.0F;
    CGV_FLOAT index_average = 0.0F;
    for (CGU_INT ik = 0; ik < numEntries; ik++)
    {
        index_average = index_average + index_out[ik];
        image_t       = image_t + index_out[ik] * index_out[ik];
    }
    index_average = index_average / (CGV_FLOAT)numEntries;
    image_t       = image_t - index_average * index_average * (CGV_FLOAT)numEntries;
    if (image_t != 0.0F)
        image_t = 1.0F / image_t;
    eigen_vector = 0.0f;
    for (CGU_INT nk = 0; nk < numEntries; nk++)
    {
        eigen_vector.x = eigen_vector.x + image_centered[nk].x * index_out[nk];
        eigen_vector.y = eigen_vector.y + image_centered[nk].y * index_out[nk];
        eigen_vector.z = eigen_vector.z + image_centered[nk].z * index_out[nk];
        if (channels3or4 == 4)
            eigen_vector.w = eigen_vector.w + image_centered[nk].w * index_out[nk];
    }
    CGV_Vec4f image_decomp[SOURCE_BLOCK_SIZE];
    for (CGU_UINT32 ii = 0; ii < SOURCE_BLOCK_SIZE; ii++)
        image_decomp[ii] = 0.0f;
    for (CGU_INT i = 0; i < numEntries; i++)
    {
        image_decomp[i].x = image_mean.x + eigen_vector.x * image_t * (index_out[i] - index_average);
        image_decomp[i].y = image_mean.y + eigen_vector.y * image_t * (index_out[i] - index_average);
        image_decomp[i].z = image_mean.z + eigen_vector.z * image_t * (index_out[i] - index_average);
        if (channels3or4 == 4)
            image_decomp[i].w = image_mean.w + eigen_vector.w * image_t * (index_out[i] - index_average);
    }
    CGV_FLOAT err_1 = cmp_err_Total(image_src, image_decomp, numEntries, channels3or4);
    return err_1;
}
typedef struct
{
    CGV_FLOAT image;
    CGU_UINT8 index;
} CMP_du2;
void cmp_sortPartitionProjection(CMP_IN CGV_FLOAT projection[64],
                                 CMP_INOUT CGU_UINT8 order[64],
                                 CMP_IN CGU_UINT8 numPartitions)  // max 64
{
    CMP_du2   what[64];
    CGU_UINT8 Parti;
    CGU_UINT8 Partj;
    for (Parti = 0; Parti < numPartitions; Parti++)
    {
        what[Parti].index = Parti;
        what[Parti].image = projection[Parti];
    }
    // insertion sort by projection value
    CGU_UINT8 index;
    CGV_FLOAT data;
    for (Parti = 1; Parti < numPartitions; Parti++)
    {
        for (Partj = Parti; Partj > 0; Partj--)
        {
            if (what[Partj - 1].image > what[Partj].image)
            {
                index = what[Partj].index;
                data  = what[Partj].image;
                what[Partj].index = what[Partj - 1].index;
                what[Partj].image = what[Partj - 1].image;
                what[Partj - 1].index = index;
                what[Partj - 1].image = data;
            }
        }
    }
    for (Parti = 0; Parti < numPartitions; Parti++)
        order[Parti] = what[Parti].index;
}
CGU_BOOL cmp_get_ideal_cluster(CMP_INOUT CGV_Vec4f image_cluster[2],
                               CMP_IN CGU_UINT32 index_cluster[16],
                               CMP_IN CGU_INT Mi_,
                               CMP_IN CGV_Vec4ui image_src[16],
                               CMP_IN CGU_INT numEntries,
                               CMP_IN CGU_UINT8 channels3or4)
{
    // get ideal cluster centers
    CGV_Vec4f image_cluster_mean[16];
    for (CGU_INT ii = 0; ii < numEntries; ii++)
    {
        image_cluster_mean[ii] = 0.0f;
    }
    GetClusterMean2(image_cluster_mean, image_src, index_cluster, numEntries, channels3or4);  // unrounded
    CGV_FLOAT image_matrix0[2] = {0, 0};  // matrix / inverse matrix
    CGV_FLOAT image_matrix1[2] = {0, 0};  // matrix / inverse matrix
    CGV_Vec4f image_rp[2];                // right-hand side of the RMS fit problem
    image_rp[0] = 0.0f;
    image_rp[1] = 0.0f;
    // weight with cnt if running on a compacted index
    for (CGU_INT k = 0; k < numEntries; k++)
    {
        image_matrix0[0] += (Mi_ - index_cluster[k]) * (Mi_ - index_cluster[k]);
        image_matrix0[1] += index_cluster[k] * (Mi_ - index_cluster[k]);  // im is symmetric
        image_matrix1[1] += index_cluster[k] * index_cluster[k];
        image_rp[0] += image_cluster_mean[index_cluster[k]] * (Mi_ - index_cluster[k]);
        image_rp[1] += image_cluster_mean[index_cluster[k]] * index_cluster[k];
    }
    CGV_FLOAT matrix_dd = image_matrix0[0] * image_matrix1[1] - image_matrix0[1] * image_matrix0[1];
    // assert(matrix_dd != 0);
    // matrix_dd == 0 means that index_cluster[k] and (Mi_ - index_cluster[k]) are collinear,
    // which implies only one active index; that case is taken care of separately
    if (matrix_dd == 0)
    {
        image_cluster[0] = 0.0f;
        image_cluster[1] = 0.0f;
        return FALSE;
    }
    image_matrix1[0] = image_matrix0[0];
    image_matrix0[0] = image_matrix1[1] / matrix_dd;
    image_matrix1[1] = image_matrix1[0] / matrix_dd;
    image_matrix1[0] = image_matrix0[1] = -image_matrix0[1] / matrix_dd;
    CGV_FLOAT Mif = (CGV_FLOAT)Mi_;
    // values can exceed 255 here; clamping made no difference in quality
    image_cluster[0] = (((image_rp[0] * image_matrix0[0]) + (image_rp[1] * image_matrix0[1])) * Mif);
    image_cluster[1] = (((image_rp[0] * image_matrix1[0]) + (image_rp[1] * image_matrix1[1])) * Mif);
    return TRUE;
}
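//--------------------------------------------------------------------------------------
// cmp_get_ideal_cluster solves a 2x2 least-squares (normal equations) problem: with
// indices k in 0..M the model for each pixel is p ~ (e0*(M - k) + e1*k) / M, and the
// best endpoints come from inverting [[S(M-k)^2, Sk(M-k)], [Sk(M-k), Sk^2]]. A scalar
// standalone sketch of the same solve, assuming plain C++:
//--------------------------------------------------------------------------------------
#if 0  // standalone illustration; not compiled into the encoder
#include <cstdio>

static void ls_endpoints_demo()
{
    const int    M    = 3;                  // 2-bit ramp: indices 0..3
    const double p[4] = {10, 30, 50, 70};   // pixel values along the ramp
    const int    k[4] = {0, 1, 2, 3};       // assigned indices
    double a = 0, b = 0, c = 0, r0 = 0, r1 = 0;
    for (int i = 0; i < 4; i++)
    {
        a  += (double)(M - k[i]) * (M - k[i]);
        b  += (double)k[i] * (M - k[i]);
        c  += (double)k[i] * k[i];
        r0 += p[i] * (M - k[i]);
        r1 += p[i] * k[i];
    }
    double det = a * c - b * b;                // singular when only one index is active
    double e0  = M * (c * r0 - b * r1) / det;  // = 10 for this exactly-linear data
    double e1  = M * (a * r1 - b * r0) / det;  // = 70
    std::printf("e0=%.1f e1=%.1f\n", e0, e1);
}
#endif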
CGV_FLOAT cmp_quant_solid_color(CMP_INOUT CGU_UINT32 index_out[16],
                                CMP_INOUT CGV_Vec4ui epo_code_out[2],
                                CMP_IN CGV_Vec4ui image_src[16],
                                CMP_IN CGU_INT numEntries,
                                CMP_IN CGU_UINT8 Mi_,
                                CMP_IN CGU_UINT8 bits[4],
                                CMP_IN CGU_INT type,
                                CMP_IN CGU_UINT8 channels3or4,
                                CMP_IN CGU_INT blockMode)
{
#ifndef ASPM_GPU
#if defined(USE_NEW_SP_ERR_IDX)
    CGU_INT clogBC7 = 0;
    CGU_INT iv      = Mi_ + 1;
    while (iv >>= 1)
        clogBC7++;
    old_init_BC7ramps();  // first time call inits global
#endif
#endif
    CGU_INT    index_bits = g_modesettings[blockMode].indexBits;
    CGV_Vec4ui epo_0[2];
    epo_0[0] = 0u;
    epo_0[1] = 0u;
    CGU_UINT8 image_log = 0;
    CGU_UINT8 image_idx = 0;
    CGU_BOOL  use_par   = FALSE;
    if (type != 0)
        use_par = TRUE;
    CGV_FLOAT error_1 = CMP_FLOAT_MAX;
    CGU_UINT8 ch1;
    CGU_INT   i;
    for (CGU_INT pn = 0; pn < cmp_npv_nd[channels3or4 - 3][type] && (error_1 != 0.0F); pn++)
    {
        CGU_Vec4ui o1[2] = {{0u, 0u, 0u, 0u}, {2u, 2u, 2u, 2u}};
        CGU_Vec4ui o2[2] = {{0u, 0u, 0u, 0u}, {2u, 2u, 2u, 2u}};
        if (use_par == TRUE)
        {
            // restrict each channel's parity range to the current p-bit candidate
            for (CGU_UINT8 ch = 0; ch < 4; ch++)
            {
                if (cmp_par_vectors_nd[channels3or4 - 3][type][pn][0][ch])
                    o1[0][ch] = 1;
                else
                    o1[1][ch] = 1;
                if (cmp_par_vectors_nd[channels3or4 - 3][type][pn][1][ch])
                    o2[0][ch] = 1;
                else
                    o2[1][ch] = 1;
            }
        }
        CGU_INT   image_tcr[MAX_CHANNELS];
        CGU_INT   epo_dr_0[MAX_CHANNELS];
        CGV_FLOAT error_0 = CMP_FLOAT_MAX;
        for (CGU_UINT8 iclogBC7 = 0; iclogBC7 < (1 << index_bits) && (error_0 != 0); iclogBC7++)
        {
            CGV_FLOAT error_t = 0;
            CGU_INT   t1o[MAX_CHANNELS], t2o[MAX_CHANNELS];
            for (ch1 = 0; ch1 < channels3or4; ch1++)
            {
                // D
                CGV_FLOAT error_ta = CMP_FLOAT_MAX;
                for (CGU_UINT8 t1 = o1[0][ch1]; t1 < o1[1][ch1]; t1++)
                {
                    // C
                    // This is needed for non-integer mean points of "collapsed" sets
                    for (CGU_UINT8 t2 = o2[0][ch1]; t2 < o2[1][ch1]; t2++)
                    {
                        // B
                        CGU_INT image_tf;
                        CGU_INT image_tc;
                        image_tf = (CGU_INT)floor(image_src[0][ch1]);
                        image_tc = (CGU_INT)ceil(image_src[0][ch1]);
#ifndef ASPM_GPU
#ifdef USE_NEW_SP_ERR_IDX
                        CGV_FLOAT err_tf = old_get_sperr(clogBC7, bits[ch1], image_tf, t1, t2, iclogBC7);
                        CGV_FLOAT err_tc = old_get_sperr(clogBC7, bits[ch1], image_tc, t1, t2, iclogBC7);
                        if (err_tf > err_tc)
                            image_tcr[ch1] = image_tc;
                        else if (err_tf < err_tc)
                            image_tcr[ch1] = image_tf;
                        else
                            image_tcr[ch1] = (CGV_INT)floor(image_src[0][ch1] + 0.5F);
                        //===============================
                        // Refine this for better quality!
                        //===============================
                        CGV_FLOAT error_tr;
                        error_tr = old_get_sperr(clogBC7, bits[ch1], image_tcr[ch1], t1, t2, iclogBC7);
                        error_tr = (error_tr * error_tr) + 2 * error_tr * old_img_absf(image_tcr[ch1] - image_src[0][ch1]) +
                                   (image_tcr[ch1] - image_src[0][ch1]) * (image_tcr[ch1] - image_src[0][ch1]);
                        if (error_tr < error_ta)
                        {
                            error_ta      = error_tr;
                            t1o[ch1]      = t1;
                            t2o[ch1]      = t2;
                            epo_dr_0[ch1] = cmp_clampi(image_tcr[ch1], 0, 255);
                        }
#endif
#else
                        image_tcr[ch1] = (CGU_INT)floor(image_src[0][ch1] + 0.5F);
                        error_ta       = 0;
                        t1o[ch1]       = t1;
                        t2o[ch1]       = t2;
                        epo_dr_0[ch1]  = cmp_clampi(image_tcr[ch1], 0, 255);
#endif
                    }  // B
                }  // C
                error_t += error_ta;
            }  // D
            if (error_t <= error_0)
            {
                // We have a solid color: Use image src if on GPU
                image_log = iclogBC7;
                image_idx = image_log;
#ifndef ASPM_GPU
#ifdef USE_BC7_SP_ERR_IDX
                if (BC7EncodeRamps2.ramp_init)
                {
                    for (CGU_UINT8 ch = 0; ch < channels3or4; ch++)
                    {
                        CGV_INT index = (CLT2(clogBC7) * 4 * 256 * 2 * 2 * 16 * 2) + (BTT2(bits[ch]) * 256 * 2 * 2 * 16 * 2) +
                                        (epo_dr_0[ch] * 2 * 2 * 16 * 2) + (t1o[ch] * 2 * 16 * 2) + (t2o[ch] * 16 * 2) + (iclogBC7 * 2);
                        epo_0[0][ch] = BC7EncodeRamps2.sp_idx[index + 0] & 0xFF;
                        epo_0[1][ch] = BC7EncodeRamps2.sp_idx[index + 1] & 0xFF;
                    }
                }
#endif
#else
                CGU_UINT8 ch;
                CGU_UINT8 k;
                // This needs improving
                CGV_FLOAT MinC[4] = {255, 255, 255, 255};
                CGV_FLOAT MaxC[4] = {0, 0, 0, 0};
                // get min max colors
                for (ch = 0; ch < channels3or4; ch++)
                    for (k = 0; k < numEntries; k++)
                    {
                        if (image_src[k][ch] < MinC[ch])
                            MinC[ch] = image_src[k][ch];
                        if (image_src[k][ch] > MaxC[ch])
                            MaxC[ch] = image_src[k][ch];
                    }
                epo_0[0][0] = (CGU_UINT8)MinC[0];
                epo_0[1][0] = (CGU_UINT8)MaxC[0];
                epo_0[0][1] = (CGU_UINT8)MinC[1];
                epo_0[1][1] = (CGU_UINT8)MaxC[1];
                epo_0[0][2] = (CGU_UINT8)MinC[2];
                epo_0[1][2] = (CGU_UINT8)MaxC[2];
                epo_0[0][3] = (CGU_UINT8)MinC[3];
                epo_0[1][3] = (CGU_UINT8)MaxC[3];
#endif
                error_0 = error_t;
            }
        }  // E
        if (error_0 < error_1)
        {
            image_idx       = image_log;
            epo_code_out[0] = epo_0[0];
            epo_code_out[1] = epo_0[1];
            error_1         = error_0;
        }
    }  // pn
    // Get Image error
    CGV_Vec4f image_decomp[16];
    for (i = 0; i < numEntries; i++)
    {
        index_out[i] = image_idx;
        image_decomp[i][0] = cmp_GetRamp(index_bits, bits[0], epo_code_out[0].x, epo_code_out[1].x, i);
        image_decomp[i][1] = cmp_GetRamp(index_bits, bits[1], epo_code_out[0].y, epo_code_out[1].y, i);
        image_decomp[i][2] = cmp_GetRamp(index_bits, bits[2], epo_code_out[0].z, epo_code_out[1].z, i);
        if (channels3or4 == 4)
            image_decomp[i][3] = cmp_GetRamp(index_bits, bits[3], epo_code_out[0].w, epo_code_out[1].w, i);
    }
    // Do we need to do this rather than err_1 * numEntries?
    CGV_FLOAT error_quant;
    error_quant = cmp_err_Total(image_src, image_decomp, numEntries, channels3or4);
    return error_quant;
}
INLINE CGV_FLOAT old_sq_image(CGV_FLOAT v)
{
    return v * v;
}
CGV_FLOAT cmp_shake3(CMP_INOUT CGU_Vec4ui epo_code_shake[2],
                     CMP_IN CGV_Vec4f image_cluster[2],
                     CMP_IN CGU_UINT32 index_cidx[16],
                     CMP_IN CGV_Vec4ui image_src[16],
                     CMP_IN CGU_INT index_bits,
                     CMP_IN CGU_INT type,
                     CMP_IN CGU_UINT8 max_bits[4],
                     CMP_IN CGU_UINT8 use_par,
                     CMP_IN CGU_INT numEntries,  // max 16
                     CMP_IN CGU_UINT8 channels3or4)
{
    CGV_FLOAT best_err   = CMP_FLOAT_MAX;
    CGV_FLOAT err_ed[16] = {0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f};
    CGU_INT   epo_code_par[2][2][2][MAX_CHANNELS];
    for (CGU_UINT8 ch = 0; ch < channels3or4; ch++)
    {
        CGU_UINT8 ppA = 0;
        CGU_UINT8 ppB = 0;
        CGU_UINT8 rr  = (use_par ? 2 : 1);
        CGU_INT   epo_code_epi[2][2];  // first/second endpoint, begin/end of search range
        for (ppA = 0; ppA < rr; ppA++)
        {  // loop max = 2
            for (ppB = 0; ppB < rr; ppB++)
            {  // loop max = 2
                // set default ranges
                epo_code_epi[0][0] = epo_code_epi[0][1] = cmp_ep_find_floor2(image_cluster[0][ch], max_bits[ch], use_par, ppA);
                epo_code_epi[1][0] = epo_code_epi[1][1] = cmp_ep_find_floor2(image_cluster[1][ch], max_bits[ch], use_par, ppB);
                // set begin range
                epo_code_epi[0][0] -= ((epo_code_epi[0][0] < 1 ? epo_code_epi[0][0] : 1)) & (~use_par);
                epo_code_epi[1][0] -= ((epo_code_epi[1][0] < 1 ? epo_code_epi[1][0] : 1)) & (~use_par);
                // set end range
                epo_code_epi[0][1] += ((1 << max_bits[ch]) - 1 - epo_code_epi[0][1] < 2 ? (1 << max_bits[ch]) - 1 - epo_code_epi[0][1] : 2) & (~use_par);
                epo_code_epi[1][1] += ((1 << max_bits[ch]) - 1 - epo_code_epi[1][1] < 2 ? (1 << max_bits[ch]) - 1 - epo_code_epi[1][1] : 2) & (~use_par);
                CGU_INT step = (1 << use_par);
                err_ed[(ppA * 8) + (ppB * 4) + ch] = CMP_FLOAT_MAX;
                for (CGU_INT epo_p1 = epo_code_epi[0][0]; epo_p1 <= epo_code_epi[0][1]; epo_p1 += step)
                {
                    for (CGU_INT epo_p2 = epo_code_epi[1][0]; epo_p2 <= epo_code_epi[1][1]; epo_p2 += step)
                    {
                        CGV_FLOAT image_square_diff = 0.0F;
                        CGU_INT   _mc               = numEntries;
                        CGV_FLOAT image_ramp;
                        while (_mc > 0)
                        {
                            image_ramp = cmp_GetRamp(index_bits, max_bits[ch], epo_p1, epo_p2, index_cidx[_mc - 1]);
                            image_square_diff += cmp_squaref(image_ramp - image_src[(_mc - 1)][ch]);
                            _mc--;
                        }
                        if (image_square_diff < err_ed[(ppA * 8) + (ppB * 4) + ch])
                        {
                            err_ed[(ppA * 8) + (ppB * 4) + ch] = image_square_diff;
                            epo_code_par[ppA][ppB][0][ch]      = epo_p1;
                            epo_code_par[ppA][ppB][1][ch]      = epo_p2;
                        }
                    }
                }
            }  // ppB
        }  // ppA
    }  // ch
    //---------------------------------------------------------
    // pick the p-bit combination with the lowest summed error
    //---------------------------------------------------------
    for (CGU_INT pn = 0; pn < cmp_npv_nd[channels3or4 - 3][type]; pn++)
    {
        CGV_FLOAT err_2 = 0.0F;
        CGU_INT   d1;
        CGU_INT   d2;
        for (CGU_UINT8 ch = 0; ch < channels3or4; ch++)
        {
            d1 = cmp_par_vectors_nd[channels3or4 - 3][type][pn][0][ch];
            d2 = cmp_par_vectors_nd[channels3or4 - 3][type][pn][1][ch];
            err_2 += err_ed[(d1 * 8) + (d2 * 4) + ch];
        }
        if (err_2 < best_err)
        {
            best_err = err_2;
            for (CGU_UINT8 ch = 0; ch < 4; ch++)
            {
                d1 = cmp_par_vectors_nd[channels3or4 - 3][type][pn][0][ch];
                d2 = cmp_par_vectors_nd[channels3or4 - 3][type][pn][1][ch];
                epo_code_shake[0][ch] = epo_code_par[d1][d2][0][ch];
                epo_code_shake[1][ch] = epo_code_par[d1][d2][1][ch];
            }
        }
    }
    return best_err;
}
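//--------------------------------------------------------------------------------------
// The "shake" above is a brute-force local refinement: each quantized endpoint channel
// is nudged through a small window (about -1..+2 codes, or parity-locked steps of 2
// when p-bits are in use) and the pair with the lowest ramp error is kept. A standalone
// one-channel sketch of the same search, assuming plain C++ and a caller-supplied
// ramp_error(e0, e1) cost function (hypothetical):
//--------------------------------------------------------------------------------------
#if 0  // standalone illustration; not compiled into the encoder
#include <cfloat>

template <typename ErrFn>
static float shake1(int& e0, int& e1, int maxCode, ErrFn ramp_error)
{
    float best = FLT_MAX;
    int   b0 = e0, b1 = e1;
    for (int p0 = (e0 > 0 ? e0 - 1 : 0); p0 <= (e0 + 2 < maxCode ? e0 + 2 : maxCode); p0++)
        for (int p1 = (e1 > 0 ? e1 - 1 : 0); p1 <= (e1 + 2 < maxCode ? e1 + 2 : maxCode); p1++)
        {
            float err = ramp_error(p0, p1);  // sum of squared ramp-vs-pixel errors
            if (err < best)
            {
                best = err;
                b0   = p0;
                b1   = p1;
            }
        }
    e0 = b0;
    e1 = b1;
    return best;
}
#endif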
  2711. CGV_FLOAT cmp_requantized_index(CMP_INOUT CGU_UINT8 index_out[16],
  2712. CMP_INOUT CGU_Vec4ui epo_code_best[2],
  2713. CMP_IN CGU_INT index_bits,
  2714. CMP_IN CGU_UINT8 max_bits[4],
  2715. CMP_IN CGV_Vec4ui image_src[16],
  2716. CMP_IN CGU_INT numEntries,
  2717. CMP_IN CGU_UINT8 channels3or4)
  2718. {
  2719. //CGV_Vec4f image_requantize[16];
  2720. //CGV_FLOAT err_r = 0.0F;
  2721. CGU_UINT8 k;
  2722. CGU_UINT8 ch;
  2723. // for (k = 0; k < 16; k++)
  2724. // {
  2725. // image_requantize[k][0] = cmp_GetRamp(index_bits, max_bits[0], epo_code_best[0][0], epo_code_best[1][0], k);
  2726. // image_requantize[k][1] = cmp_GetRamp(index_bits, max_bits[1], epo_code_best[0][1], epo_code_best[1][1], k);
  2727. // image_requantize[k][2] = cmp_GetRamp(index_bits, max_bits[2], epo_code_best[0][2], epo_code_best[1][2], k);
  2728. // if (channels3or4 == 4)
  2729. // image_requantize[k][3] = cmp_GetRamp(index_bits, max_bits[3], epo_code_best[0][3], epo_code_best[1][3], k);
  2730. // else
  2731. // image_requantize[k][3] = 0.0f;
  2732. // }
  2733. //=========================================
  2734. // requantized image based on new epo_code
  2735. //=========================================
    CGV_FLOAT image_requantize[SOURCE_BLOCK_SIZE][MAX_CHANNELS];
    CGV_FLOAT err_r = 0.0F;
    for (ch = 0; ch < channels3or4; ch++)
    {
        for (k = 0; k < SOURCE_BLOCK_SIZE; k++)
        {
            image_requantize[k][ch] = cmp_GetRamp(index_bits, max_bits[ch], epo_code_best[0][ch], epo_code_best[1][ch], k);
        }
    }
    //=========================================
    // Calc the error for the requantized image
    //=========================================
    for (k = 0; k < numEntries; k++)
    {
        CGV_FLOAT err_cmin     = CMP_FLOAT_MAX;
        CGV_INT   hold_index_j = 0;
        for (CGV_INT iclogBC7 = 0; iclogBC7 < (1 << index_bits); iclogBC7++)
        {
            CGV_FLOAT image_err = 0.0F;
            for (ch = 0; ch < channels3or4; ch++)
            {
                image_err += old_sq_image(image_requantize[iclogBC7][ch] - image_src[k][ch]);
            }
            if (image_err < err_cmin)
            {
                err_cmin     = image_err;
                hold_index_j = iclogBC7;
            }
        }
        index_out[k] = (CGV_UINT8)hold_index_j;
        err_r += err_cmin;
    }
    return err_r;
}
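//---------------------------------------------------------------------------------
// cmp_optimize_IndexAndEndPoints: iterative index and endpoint refinement.
// Each pass collapses the current index set to its minimal range (index_collapse2),
// then tries every slope/offset remapping of the collapsed indices that still fits
// within Mi_ clusters; each remapping gets an ideal endpoint fit
// (cmp_get_ideal_cluster) followed by quantized endpoint shaking (cmp_shake3).
// The best candidate is re-scored with cmp_requantized_index and kept only if it
// lowers the overall error. The loop exits early once err_best reaches
// errorThreshold or the index set collapses to a single value (solid color).
//---------------------------------------------------------------------------------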
CGV_FLOAT cmp_optimize_IndexAndEndPoints(CMP_INOUT CGU_Vec4ui epo_code_out[2],
                                         CMP_INOUT CGU_UINT32 index_io[16],
                                         CMP_INOUT CGU_UINT32 index_packed_out[2],
                                         CMP_IN CGV_Vec4ui image_src[16],
                                         CMP_IN CGU_INT numEntries,
                                         CMP_IN CGU_UINT8 Mi_,
                                         CMP_IN CGU_UINT8 bits,
                                         CMP_IN CGU_UINT8 channels3or4,
                                         CMP_IN CGU_FLOAT errorThreshold,
                                         CMP_IN CGU_INT blockMode)
{
    CGV_FLOAT err_best  = CMP_FLOAT_MAX;
    CGU_INT   type;
    CGU_UINT8 channels2 = 2 * channels3or4;
    type                = bits % channels2;
    CGU_UINT8 use_par   = (type != 0);
    CGU_UINT8 max_bits[4] = {0, 0, 0, 0};
    CGU_UINT8 ch;
    CGU_INT   k;
    for (ch = 0; ch < channels3or4; ch++)
        max_bits[ch] = (bits + channels2 - 1) / channels2;
    CGU_INT index_bits  = g_modesettings[blockMode].indexBits;
    CGU_INT clt_clogBC7 = index_bits - 2;
    if (clt_clogBC7 > 3)
        return CMP_FLOAT_MAX;
    Mi_ = Mi_ - 1;
    CGU_UINT32 index_tmp[16];
    CGU_INT    maxTry = MAX_TRY_SHAKER;
    for (k = 0; k < numEntries; k++)
        index_tmp[k] = cmp_clampui8(index_io[k], 0, 15);
    epo_code_out[0] = 0u;
    epo_code_out[1] = 0u;
    CGV_FLOAT err_requant = 0.0F;
    CGU_UINT8 MaxIndex;
    MaxIndex = index_collapse2(index_tmp, numEntries);
    //===============================
    // we have a solid color 4x4 block
    //===============================
    if (MaxIndex == 0)
    {
        return cmp_quant_solid_color(index_io, epo_code_out, image_src, numEntries, Mi_, max_bits, type, channels3or4, blockMode);
    }
    for (CGU_INT ii = 0; ii < maxTry; ii++)
    {
        //===============================
        // We have ramp colors to process
        //===============================
        CGV_FLOAT  err_cluster = CMP_FLOAT_MAX;
        CGV_FLOAT  err_shake;
        CGU_UINT32 index_cluster[16];
        CGU_Vec4ui epo_code_best[2] = {{0, 0, 0, 0}, {0, 0, 0, 0}};
        for (CGU_UINT8 ii2 = 0; ii2 < numEntries; ii2++)
            index_cluster[ii2] = 0;
        CGU_UINT8 mi = Mi_;
        for (CGU_UINT8 index_slope = 1; (index_slope * MaxIndex) <= mi; index_slope++)
        {
            CGV_Vec4f image_cluster[2] = {{0.0f, 0.0f, 0.0f, 0.0f}, {0.0f, 0.0f, 0.0f, 0.0f}};
            for (CGU_UINT8 index_offset = 0; index_offset <= (mi - index_slope * MaxIndex); index_offset++)
            {
                //-------------------------------------
                // set a new index data set to try
                //-------------------------------------
                for (k = 0; k < numEntries; k++)
                    index_cluster[k] = index_tmp[k] * index_slope + index_offset;
                if (cmp_get_ideal_cluster(image_cluster, index_cluster, Mi_, image_src, numEntries, channels3or4))
                {
                    CGU_Vec4ui epo_code_shake[2] = {{0, 0, 0, 0}, {0, 0, 0, 0}};
                    err_shake = cmp_shake3(epo_code_shake,
                                           image_cluster,
                                           index_cluster,
                                           image_src,
                                           index_bits,
                                           type,
                                           max_bits,
                                           use_par,
                                           numEntries,
                                           channels3or4);
                    if (err_shake < err_cluster)
                    {
                        err_cluster      = err_shake;
                        epo_code_best[0] = epo_code_shake[0];
                        epo_code_best[1] = epo_code_shake[1];
                    }
                }
            }
        }
        if (err_cluster != CMP_FLOAT_MAX)
        {
            //=========================
            // test results for quality
            //=========================
            CGU_UINT8 index_best[16] = {0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0};
            err_requant = cmp_requantized_index(index_best,
                                                epo_code_best,
                                                index_bits,
                                                max_bits,
                                                image_src,
                                                numEntries,
                                                channels3or4);
            if (err_requant < err_best)
            {
                for (k = 0; k < numEntries; k++)
                    index_io[k] = index_tmp[k] = index_best[k];
                cmp_pack4bitindex32(index_packed_out, index_io);
                epo_code_out[0] = epo_code_best[0];
                epo_code_out[1] = epo_code_best[1];
                err_best        = err_requant;
            }
        }
        // Early out if we have reached our target error
        if (err_best <= errorThreshold)
            break;
        MaxIndex = index_collapse2(index_tmp, numEntries);
        if (MaxIndex == 0)
            break;
    }
    return err_best;
}
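//---------------------------------------------------------------------------------
// Bit-stream writers: a BC7 block is 128 bits, packed LSB first. cmp_Write8Bit2
// ORs up to 8 bits of bitVal into base[] starting at bit 'offset'; when a field
// straddles a byte boundary the high bits spill into the next byte. It returns the
// advanced offset so calls can be chained. For example, writing the 3-bit value
// 0b101 at offset 6 puts the low two value bits into bits 6..7 of base[0] and the
// high value bit into bit 0 of base[1].
//---------------------------------------------------------------------------------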
CGU_UINT8 cmp_Write8Bit2(CMP_INOUT CGU_UINT8 base[16], CMP_IN CGU_INT offset, CMP_IN CGU_INT bits, CMP_IN CGU_UINT8 bitVal)
{
    base[offset / 8] |= bitVal << (offset % 8);
    if (offset % 8 + bits > 8)
    {
        base[offset / 8 + 1] |= shift_right_uint82(bitVal, 8 - offset % 8);
    }
    return (offset += bits);
}
INLINE CGU_UINT8 shift_right_uint8V2(CMP_IN CGU_UINT8 v, CMP_IN CGU_UINT8 bits)
{
    return v >> bits;  // (perf warning expected)
}
void cmp_Write8BitV2(CMP_INOUT CGU_UINT8 base[16], CMP_IN CGU_INT offset, CMP_IN CGU_INT bits, CMP_IN CGU_UINT8 bitVal)
{
    base[offset / 8] |= bitVal << (offset % 8);
    if (offset % 8 + bits > 8)
    {
        base[offset / 8 + 1] |= shift_right_uint8V2(bitVal, 8 - offset % 8);
    }
}
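//---------------------------------------------------------------------------------
// cmp_Encode_mode01237: serializes a finished mode 0/1/2/3/7 block: the unary mode
// prefix, the partition bits, per-subset endpoints (components packed together),
// the p-bits (two per subset for modes 0/3/7, one shared per subset for mode 1,
// none for mode 2) and finally the index bits. Each subset's fixup index is stored
// with its MSB dropped (implicitly 0), so any subset whose fixup index has the MSB
// set gets its endpoints swapped and its indices inverted first.
//---------------------------------------------------------------------------------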
void cmp_Encode_mode01237(CMP_IN CGU_INT blockMode,
                          CMP_IN CGU_UINT8 bestPartition,
                          CMP_IN CGU_UINT32 packedEndpoints[6],
                          CMP_IN CGU_UINT8 index16[16],
                          CMP_INOUT CGU_UINT8 cmp_out[16])
{
    CGU_UINT8  blockindex[SOURCE_BLOCK_SIZE];
    CGU_UINT32 indexBitsV = g_modesettings[blockMode].indexBits;
    CGU_UINT32 k;
    CGU_UINT32 ch;
    for (k = 0; k < COMPRESSED_BLOCK_SIZE; k++)
        cmp_out[k] = 0;
    // mode 0 = 1, mode 1 = 01, mode 2 = 001, mode 3 = 0001, ...
    CGU_INT bitPosition = blockMode;
    bitPosition = cmp_Write8Bit2(cmp_out, bitPosition, 1, 1);
    // Write partition bits
    bitPosition = cmp_Write8Bit2(cmp_out, bitPosition, g_modesettings[blockMode].partitionBits, bestPartition);
    // Sort out the index set and tag whether we need to flip the
    // endpoints to get the correct state in the implicit index bits
    // The implicitly encoded MSB of the fixup index must be 0
    CGU_UINT32 fixup[3] = {0, 0, 0};
    cmp_get_fixuptable(fixup, (g_modesettings[blockMode].maxSubSets == 2 ? bestPartition : bestPartition + 64));
    // Extract indices and mark subsets that need to have their colours flipped to get the
    // right state for the implicit MSB of the fixup index
    CGU_INT flipColours[3] = {0, 0, 0};
    for (k = 0; k < SOURCE_BLOCK_SIZE; k++)
    {
        blockindex[k] = index16[k];
        for (CGU_UINT8 j = 0; j < g_modesettings[blockMode].maxSubSets; j++)
        {
            if (k == fixup[j])
            {
                if (blockindex[k] & (1 << (indexBitsV - 1)))
                {
                    flipColours[j] = 1;
                }
            }
        }
    }
    // Now we must flip the endpoints where necessary so that the implicitly encoded
    // index bits have the correct state
    for (k = 0; k < g_modesettings[blockMode].maxSubSets; k++)
    {
        if (flipColours[k] == 1)
        {
            CGU_UINT32 temp            = packedEndpoints[k * 2 + 0];
            packedEndpoints[k * 2 + 0] = packedEndpoints[k * 2 + 1];
            packedEndpoints[k * 2 + 1] = temp;
        }
    }
    // ...next flip the indices where necessary
    for (k = 0; k < SOURCE_BLOCK_SIZE; k++)
    {
        CGU_UINT8 partsub = cmp_get_partition_subset2(bestPartition, g_modesettings[blockMode].maxSubSets, k);
        if (flipColours[partsub] == 1)
        {
            blockindex[k] = ((1 << indexBitsV) - 1) - blockindex[k];
        }
    }
    // Endpoints are stored in the following order RRRR GGGG BBBB (AAAA) (PPPP)
    // i.e. components are packed together
    CGU_Vec4ui unpackedColours[MAX_SUBSETS * 2];
    CGU_UINT8  parityBits[MAX_SUBSETS][2];
    // Init
    for (k = 0; k < MAX_SUBSETS * 2; k++)
        unpackedColours[k] = 0;
    // Unpack the colour values for the subsets
    for (k = 0; k < g_modesettings[blockMode].maxSubSets; k++)
    {
        CGU_UINT32 packedColours[2] = {packedEndpoints[k * 2 + 0], packedEndpoints[k * 2 + 1]};
        if (blockMode == 0 || blockMode == 3 || blockMode == 7)
        {  // TWO_PBIT
            parityBits[k][0] = packedColours[0] & 1;
            parityBits[k][1] = packedColours[1] & 1;
            packedColours[0] >>= 1;
            packedColours[1] >>= 1;
        }
        else if (blockMode == 1)
        {  // ONE_PBIT
            parityBits[k][0] = packedColours[1] & 1;
            parityBits[k][1] = packedColours[1] & 1;
            packedColours[0] >>= 1;
            packedColours[1] >>= 1;
        }
        else if (blockMode == 2)
        {
            parityBits[k][0] = 0;
            parityBits[k][1] = 0;
        }
        for (ch = 0; ch < g_modesettings[blockMode].channels3or4; ch++)
        {
            unpackedColours[k * 2][ch]     = packedColours[0] & ((1 << g_modesettings[blockMode].componentBits) - 1);
            unpackedColours[k * 2 + 1][ch] = packedColours[1] & ((1 << g_modesettings[blockMode].componentBits) - 1);
            packedColours[0] >>= g_modesettings[blockMode].componentBits;
            packedColours[1] >>= g_modesettings[blockMode].componentBits;
        }
    }
    // Loop over components
    for (ch = 0; ch < g_modesettings[blockMode].channels3or4; ch++)
    {
        // loop over subsets
        for (k = 0; k < g_modesettings[blockMode].maxSubSets; k++)
        {
            bitPosition = cmp_Write8Bit2(cmp_out, bitPosition, g_modesettings[blockMode].componentBits, unpackedColours[k * 2][ch] & 0xFF);
            bitPosition = cmp_Write8Bit2(cmp_out, bitPosition, g_modesettings[blockMode].componentBits, unpackedColours[k * 2 + 1][ch] & 0xFF);
        }
    }
    // write parity bits
    if (blockMode != 2)
    {
        for (k = 0; k < g_modesettings[blockMode].maxSubSets; k++)
        {
            if (blockMode == 1)
            {  // ONE_PBIT
                bitPosition = cmp_Write8Bit2(cmp_out, bitPosition, 1, parityBits[k][0] & 0x01);
            }
            else
            {  // TWO_PBIT
                bitPosition = cmp_Write8Bit2(cmp_out, bitPosition, 1, parityBits[k][0] & 0x01);
                bitPosition = cmp_Write8Bit2(cmp_out, bitPosition, 1, parityBits[k][1] & 0x01);
            }
        }
    }
    // Encode the index bits
    CGU_INT bitPositionV = bitPosition;
    for (k = 0; k < 16; k++)
    {
        CGU_UINT8 partsub = cmp_get_partition_subset2(bestPartition, g_modesettings[blockMode].maxSubSets, k);
        // If this is a fixup index then drop the MSB which is implicitly 0
        if (k == fixup[partsub])
        {
            cmp_Write8BitV2(cmp_out, bitPositionV, g_modesettings[blockMode].indexBits - 1, blockindex[k] & 0x07F);
            bitPositionV += g_modesettings[blockMode].indexBits - 1;
        }
        else
        {
            cmp_Write8BitV2(cmp_out, bitPositionV, g_modesettings[blockMode].indexBits, blockindex[k]);
            bitPositionV += g_modesettings[blockMode].indexBits;
        }
    }
}
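//---------------------------------------------------------------------------------
// cmp_process_mode: full encode path for the partitioned modes. A first pass
// quantizes all 64 partition shapes and records their errors; the shapes are then
// ranked with cmp_sortPartitionProjection so that only the most promising
// candidates (numShakeAttempts of them, scaled by quality) receive the expensive
// cmp_optimize_IndexAndEndPoints refinement. The winning partition, endpoints and
// indices are serialized by cmp_Encode_mode01237 into the four 32-bit words of
// best_cmp_out.
//---------------------------------------------------------------------------------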
CGV_FLOAT cmp_process_mode(CMP_INOUT CGU_UINT32 best_cmp_out[5], CMP_IN CGU_Vec4ui image_src[16], CMP_IN CGU_INT block_mode)
{
#ifdef USE_OLDCODE
    CGV_FLOAT  best_err = 1e30f;
    CGU_Vec4ui epo_code[6];
    CGU_Vec4ui bestEndpoints[6];
    CGU_UINT8  bestindex[3][16];
    CGU_INT    bestEntryCount[3];
    CGU_UINT8  bestindex16[16];
    CGU_UINT32 packedEndpoints[6] = {0, 0, 0, 0, 0, 0};
    CGU_UINT32 k;
    CGU_UINT32 ch;
    CGU_UINT32 subset;
    // Check for a solid color for a fast encode
    CGV_Vec4ui mean_out = 0;
    for (k = 0; k < 16; k++)
    {
        mean_out       = mean_out + image_src[k];
        bestindex16[k] = 0;
    }
    mean_out = mean_out / 16;
    // Image has alpha (placeholder: no alpha-specific fast path yet)
    if (mean_out.w < 255)
    {
    }
    CGU_UINT8  storedBestindex[64][3][16];
    CGV_FLOAT  storedError[64];
    CGU_UINT8  sortedPartition[64];
    CGV_FLOAT  quality    = 1.0f;
    CGV_FLOAT  opaque_err = 0.0f;
    CGV_Vec4ui image_subsets[3][16];
    CGU_INT    subset_entryCount[MAX_SUBSETS] = {0, 0, 0};
    CGU_UINT8  bestPartition = 0;
    for (CGU_UINT8 mode_blockPartition = 0; mode_blockPartition < 64; mode_blockPartition++)
    {
        cmp_GetPartitionSubSet2_mode01237(
            image_subsets, subset_entryCount, mode_blockPartition, image_src, block_mode, g_modesettings[block_mode].channels3or4);
        CGV_Vec4ui subset_image_src[16];
        CGU_UINT8  index_out1[16];
        CGV_FLOAT  err_quant = 0.0F;
        // Store the quantize error for this partition to be sorted and processed later
        for (subset = 0; subset < g_modesettings[block_mode].maxSubSets; subset++)
        {
            CGU_INT numEntries = subset_entryCount[subset];
            for (CGU_UINT8 ii = 0; ii < 16; ii++)
                subset_image_src[ii] = image_subsets[subset][ii];
            err_quant += cmp_GetQuantizeIndex_old(
                index_out1, subset_image_src, numEntries, g_modesettings[block_mode].clusters, g_modesettings[block_mode].channels3or4);
            for (CGU_UINT8 idx = 0; idx < numEntries; idx++)
                storedBestindex[mode_blockPartition][subset][idx] = index_out1[idx];
        }
        storedError[mode_blockPartition] = err_quant;
    }
    // Sort the results
    cmp_sortPartitionProjection(storedError, sortedPartition, 64);                                // 64 partitions
    CGU_UINT8 numShakeAttempts = cmp_max8(1, cmp_min8((CGU_UINT8)floor(8 * quality + 0.5), 64));  // 64 partitions
    CGV_FLOAT err_best = CMP_FLOAT_MAX;
    // Now do the endpoint shaking
    for (CGU_UINT8 nSA = 0; nSA < numShakeAttempts; nSA++)
    {
        CGV_FLOAT err_optimized = 0.0F;
        CGU_UINT8 sortedBlockPartition;
        sortedBlockPartition = sortedPartition[nSA];
        //********************************************
        // Get the partition shape for the given mode
        //********************************************
        cmp_GetPartitionSubSet2_mode01237(
            image_subsets, subset_entryCount, sortedBlockPartition, image_src, block_mode, g_modesettings[block_mode].channels3or4);
        //*****************************
        // Process the partition shape
        //*****************************
        for (subset = 0; subset < g_modesettings[block_mode].maxSubSets; subset++)
        {
            CGU_INT    numEntries = subset_entryCount[subset];
            CGU_UINT32 index_io[16];
            CGV_Vec4ui src_image_block[16];
            CGU_Vec4ui tmp_epo_code[2] = {{0, 0, 0, 0}, {0, 0, 0, 0}};
            for (k = 0; k < 16; k++)
                src_image_block[k] = image_subsets[subset][k];
            for (k = 0; k < 16; k++)
                index_io[k] = storedBestindex[sortedBlockPartition][subset][k];
            CGU_UINT32 index_packed_out[2] = {0, 0};
            err_optimized += cmp_optimize_IndexAndEndPoints(tmp_epo_code,
                                                            index_io,
                                                            index_packed_out,
                                                            src_image_block,
                                                            numEntries,
                                                            g_modesettings[block_mode].clusters,
                                                            g_modesettings[block_mode].bits,
                                                            g_modesettings[block_mode].channels3or4,
                                                            0.01f,
                                                            1);
            for (k = 0; k < 16; k++)
                storedBestindex[sortedBlockPartition][subset][k] = index_io[k];
            epo_code[subset * 2]     = tmp_epo_code[0];
            epo_code[subset * 2 + 1] = tmp_epo_code[1];
            shared_temp[subset * 2].endPoint_low      = tmp_epo_code[0];
            shared_temp[subset * 2 + 1].endPoint_high = tmp_epo_code[1];
        }
        //****************************************
        // Check if the result is better than the last
        //****************************************
        if (err_optimized < err_best)
        {
            bestPartition = sortedBlockPartition;
            CGU_INT bestIndexCount = 0;
            for (subset = 0; subset < g_modesettings[block_mode].maxSubSets; subset++)
            {
                CGU_UINT32 numEntries  = subset_entryCount[subset];
                bestEntryCount[subset] = numEntries;
                if (numEntries)
                {
                    bestEndpoints[subset * 2]     = epo_code[subset * 2];
                    bestEndpoints[subset * 2 + 1] = epo_code[subset * 2 + 1];
                    shared_temp[subset * 2].endPoint_low      = bestEndpoints[subset * 2];
                    shared_temp[subset * 2 + 1].endPoint_high = bestEndpoints[subset * 2 + 1];
                    for (k = 0; k < numEntries; k++)
                    {
                        bestindex[subset][k]          = storedBestindex[sortedBlockPartition][subset][k];
                        bestindex16[bestIndexCount++] = storedBestindex[sortedBlockPartition][subset][k];
                        shared_temp[k].colorindex     = storedBestindex[sortedBlockPartition][subset][k];
                    }
                }
            }
            err_best = err_optimized;
            // Early out if the error is already below the quality threshold
            if (err_best <= 0.01f)
            {
                break;
            }
        }
    }
    if (block_mode != 7)
        err_best += opaque_err;
    if (err_best > best_err)
        return best_err;
    //**************************
    // Save the encoded block
    //**************************
    best_err = err_best;
    // Now we have all the data needed to encode the block
    // We need to pack the endpoints prior to encoding
    for (subset = 0; subset < g_modesettings[block_mode].maxSubSets; subset++)
    {
        packedEndpoints[subset * 2]     = 0;
        packedEndpoints[subset * 2 + 1] = 0;
        if (bestEntryCount[subset])
        {
            CGU_UINT32 rightAlignment = 0;
            // Sort out parity bits
            if (block_mode != 2)
            {
                // Sort out BCC parity bits
                packedEndpoints[subset * 2]     = bestEndpoints[subset * 2][0] & 1;
                packedEndpoints[subset * 2 + 1] = bestEndpoints[subset * 2 + 1][0] & 1;
                for (ch = 0; ch < g_modesettings[block_mode].channels3or4; ch++)
                {
                    bestEndpoints[subset * 2][ch] >>= 1;
                    bestEndpoints[subset * 2 + 1][ch] >>= 1;
                }
                rightAlignment++;
            }
            // Fixup endpoints
            for (ch = 0; ch < g_modesettings[block_mode].channels3or4; ch++)
            {
                packedEndpoints[subset * 2] |= bestEndpoints[subset * 2][ch] << rightAlignment;
                packedEndpoints[subset * 2 + 1] |= bestEndpoints[subset * 2 + 1][ch] << rightAlignment;
                rightAlignment += g_modesettings[block_mode].componentBits;
            }
        }
    }
    CGU_UINT8 idxCount[3] = {0, 0, 0};
    for (k = 0; k < SOURCE_BLOCK_SIZE; k++)
    {
        CGU_UINT8 partsub = cmp_get_partition_subset2(bestPartition, g_modesettings[block_mode].maxSubSets, k);
        CGU_UINT8 idxC    = idxCount[partsub];
        bestindex16[k]    = bestindex[partsub][idxC];
        idxCount[partsub] = idxC + 1;
        shared_temp[k].colorindex = bestindex16[k];
    }
    CGU_UINT8 cmp_out[COMPRESSED_BLOCK_SIZE];
    cmp_Encode_mode01237(block_mode, bestPartition, packedEndpoints, bestindex16, cmp_out);
    best_cmp_out[0] = (CGU_UINT32)cmp_out[0] + (CGU_UINT32)(cmp_out[1] << 8) + (CGU_UINT32)(cmp_out[2] << 16) + (CGU_UINT32)(cmp_out[3] << 24);
    best_cmp_out[1] = (CGU_UINT32)cmp_out[4] + (CGU_UINT32)(cmp_out[5] << 8) + (CGU_UINT32)(cmp_out[6] << 16) + (CGU_UINT32)(cmp_out[7] << 24);
    best_cmp_out[2] = (CGU_UINT32)cmp_out[8] + (CGU_UINT32)(cmp_out[9] << 8) + (CGU_UINT32)(cmp_out[10] << 16) + (CGU_UINT32)(cmp_out[11] << 24);
    best_cmp_out[3] = (CGU_UINT32)cmp_out[12] + (CGU_UINT32)(cmp_out[13] << 8) + (CGU_UINT32)(cmp_out[14] << 16) + (CGU_UINT32)(cmp_out[15] << 24);
    return best_err;
#else
    CGU_UINT8 bestPartition = 0;
    // Find the best partition
    CGU_UINT32 pbit     = 0;
    CGU_UINT32 error;
    CGU_UINT32 bestErr  = MAX_UINT;
    CGU_UINT32 bestpbit = 0;
    for (CGU_UINT8 mode_blockPartition = 0; mode_blockPartition < 64; mode_blockPartition++)
    {
        error = cmp_GetPartitionError(pbit, mode_blockPartition, image_src);
        if (error < bestErr)
        {
            bestErr       = error;
            bestpbit      = pbit;
            bestPartition = mode_blockPartition;
        }
    }
    // Get the index for the partition
    for (CGU_INT threadInBlock = 15; threadInBlock >= 0; threadInBlock--)
    {
        ProcessBlock(1, bestPartition, 0, bestpbit, 0, threadInBlock, threadInBlock, 0);
    }
    // print results for debug
    printSharedTemp();
    //=======================
    // Encode final block
    //=======================
    {
        // CGU_Vec4ui blockGreen = {0xffe00040, 0xfffe0007, 0x00000001, 0x00000000};
        // CGU_Vec4ui blockBlue  = {0x00000040, 0xfffffff8, 0x00000001, 0x00000000};
        // CGU_Vec4ui block00    = {0xf0617fc0, 0xfffe0c3f, 0xff00fe11, 0xff01ef00};
        CGU_Vec4ui blockRed   = {0x001fffc0, 0xfffe0000, 0x00000001, 0x00000000};
        CGU_Vec4ui block      = {0, 0, 0, 0};
        CGU_UINT32 input_mode = 1;
        switch (input_mode)
        {
        case 1:
            block_package1(block, bestPartition, 0);
            break;
        case 3:
            block_package3(block, bestPartition, 0);
            break;
        case 7:
            block_package7(block, bestPartition, 0);
            break;
        default:  // error: unsupported mode used!
            block = blockRed;
            break;
        }
        best_cmp_out[0] = block[0];
        best_cmp_out[1] = block[1];
        best_cmp_out[2] = block[2];
        best_cmp_out[3] = block[3];
    }
    return 0.0f;
#endif
}
#endif  // Not ASPM_HLSL
//======================================= MODES 45 =============================================
#ifndef ASPM_HLSL
#if defined(ENABLE_CMP_MODE4) || defined(ENABLE_CMP_MODE5)
// Compression results
struct cmp_mode_parameters2
{
    CGV_INT    color_qendpoint[8];
    CGV_INT    alpha_qendpoint[8];
    CGV_UINT8  color_index[16];
    CGV_UINT8  alpha_index[16];
    CGV_UINT32 idxMode;
    CGV_UINT32 rotated_channel;
};
CMP_STATIC CMP_CONSTANT CGU_UINT8 componentRotations2[4][4] = {
    {COMP_ALPHA, COMP_RED, COMP_GREEN, COMP_BLUE},
    {COMP_RED, COMP_ALPHA, COMP_GREEN, COMP_BLUE},
    {COMP_GREEN, COMP_RED, COMP_ALPHA, COMP_BLUE},
    {COMP_BLUE, COMP_RED, COMP_GREEN, COMP_ALPHA}
};
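// componentRotations2[r][0] names the source component routed into the scalar
// (alpha) block for rotation r, and elements 1..3 name the components kept in the
// color block; rotation 0 leaves RGBA unchanged.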
INLINE CGV_UINT8 old_shift_right_uint(CGV_UINT8 v, CGU_UINT8 bits)
{
    return v >> bits;  // (perf warning expected)
}
void old_Write8Bit(CGV_UINT8 base[], CGU_INT* uniform offset, CGU_INT bits, CGV_UINT8 bitVal)
{
    base[*offset / 8] |= bitVal << (*offset % 8);
    if (*offset % 8 + bits > 8)
    {
        base[*offset / 8 + 1] |= old_shift_right_uint(bitVal, 8 - *offset % 8);
    }
    *offset += bits;
}
INLINE void old_swap_index(CGV_UINT8 u[], CGV_UINT8 v[], CGU_INT n)
{
    for (CGU_INT i = 0; i < n; i++)
    {
        CGV_UINT8 t = u[i];
        u[i]        = v[i];
        v[i]        = t;
    }
}
INLINE void old_swap_epo(CGV_INT u[], CGV_INT v[], CGV_INT n)
{
    for (CGU_INT i = 0; i < n; i++)
    {
        CGV_INT t = u[i];
        u[i]      = v[i];
        v[i]      = t;
    }
}
INLINE void old_encode_swap(CGV_INT endpoint[], CGU_INT channels, CGV_UINT8 block_index[MAX_SUBSET_SIZE], CGU_INT bits)
{
    CGU_INT levels = 1 << bits;
    if (block_index[0] >= levels / 2)
    {
        old_swap_epo(&endpoint[0], &endpoint[channels], channels);
        for (CGU_INT k = 0; k < SOURCE_BLOCK_SIZE; k++)
#ifdef ASPM_GPU
            block_index[k] = (levels - 1) - block_index[k];
#else
            block_index[k] = CGV_UINT8(levels - 1) - block_index[k];
#endif
    }
}
void old_encode_index(CGV_UINT8 data[16], CGU_INT* uniform pPos, CGV_UINT8 block_index[MAX_SUBSET_SIZE], CGU_INT bits)
{
    old_Write8Bit(data, pPos, bits - 1, block_index[0]);
    for (CGU_INT j = 1; j < SOURCE_BLOCK_SIZE; j++)
    {
        CGV_UINT8 qbits = block_index[j] & 0xFF;
        old_Write8Bit(data, pPos, bits, qbits);
    }
}
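//---------------------------------------------------------------------------------
// cmp_Encode_mode4 / cmp_Encode_mode5 serialize the single-subset modes: mode
// prefix, 2-bit rotation, the 1-bit index-selector (mode 4 only), quantized color
// and alpha endpoints, then the two index streams. old_encode_swap swaps an
// endpoint pair and inverts the indices whenever the anchor index (index 0) would
// otherwise have its MSB set, since that bit is not stored.
//---------------------------------------------------------------------------------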
void cmp_Encode_mode4(CMP_INOUT CGV_UINT8 cmp_out[COMPRESSED_BLOCK_SIZE], cmp_mode_parameters2 params)
{
    CGU_INT bitPosition = 4;  // skip the four leading 0 bits of the mode 4 prefix (00001)
    for (CGU_INT k = 0; k < COMPRESSED_BLOCK_SIZE; k++)
        cmp_out[k] = 0;
    // mode 4 (5 bits) 00001
    old_Write8Bit(cmp_out, &bitPosition, 1, 1);
    // rotation 2 bits
    old_Write8Bit(cmp_out, &bitPosition, 2, CMP_STATIC_CAST(CGV_UINT8, params.rotated_channel));
    // idxMode 1 bit
    old_Write8Bit(cmp_out, &bitPosition, 1, CMP_STATIC_CAST(CGV_UINT8, params.idxMode));
    CGU_INT idxBits[2] = {2, 3};
    if (params.idxMode)
    {
        idxBits[0] = 3;
        idxBits[1] = 2;
        // Indicate if we need to fix up the index
        old_swap_index(params.color_index, params.alpha_index, 16);
        old_encode_swap(params.alpha_qendpoint, 4, params.color_index, 2);
        old_encode_swap(params.color_qendpoint, 4, params.alpha_index, 3);
    }
    else
    {
        old_encode_swap(params.color_qendpoint, 4, params.color_index, 2);
        old_encode_swap(params.alpha_qendpoint, 4, params.alpha_index, 3);
    }
    // color endpoints (5 bits each)
    // R0 : R1
    // G0 : G1
    // B0 : B1
    for (CGU_INT component = 0; component < 3; component++)
    {
        old_Write8Bit(cmp_out, &bitPosition, 5, CMP_STATIC_CAST(CGV_UINT8, params.color_qendpoint[component]));
        old_Write8Bit(cmp_out, &bitPosition, 5, CMP_STATIC_CAST(CGV_UINT8, params.color_qendpoint[4 + component]));
    }
    // alpha endpoints (6 bits each)
    // A0 : A1
    old_Write8Bit(cmp_out, &bitPosition, 6, CMP_STATIC_CAST(CGV_UINT8, params.alpha_qendpoint[0]));
    old_Write8Bit(cmp_out, &bitPosition, 6, CMP_STATIC_CAST(CGV_UINT8, params.alpha_qendpoint[4]));
    // color index 2 bits each (31 bits total)
    old_encode_index(cmp_out, &bitPosition, params.color_index, 2);
    // alpha index 3 bits each (47 bits total)
    old_encode_index(cmp_out, &bitPosition, params.alpha_index, 3);
}
void cmp_Encode_mode5(CMP_INOUT CGV_UINT8 cmp_out[COMPRESSED_BLOCK_SIZE], cmp_mode_parameters2 params)
{
    for (CGU_INT k = 0; k < COMPRESSED_BLOCK_SIZE; k++)
        cmp_out[k] = 0;
    // mode 5 (6 bits) 000001
    CGU_INT bitPosition = 5;  // skip the five leading 0 bits of the mode 5 prefix
    old_Write8Bit(cmp_out, &bitPosition, 1, 1);
    // Write 2 bit rotation
    old_Write8Bit(cmp_out, &bitPosition, 2, CMP_STATIC_CAST(CGV_UINT8, params.rotated_channel));
    old_encode_swap(params.color_qendpoint, 4, params.color_index, 2);
    old_encode_swap(params.alpha_qendpoint, 4, params.alpha_index, 2);
    // color endpoints (7 bits each)
    // R0 : R1
    // G0 : G1
    // B0 : B1
    for (CGU_INT component = 0; component < 3; component++)
    {
        old_Write8Bit(cmp_out, &bitPosition, 7, CMP_STATIC_CAST(CGV_UINT8, params.color_qendpoint[component]));
        old_Write8Bit(cmp_out, &bitPosition, 7, CMP_STATIC_CAST(CGV_UINT8, params.color_qendpoint[4 + component]));
    }
    // alpha endpoints (8 bits each)
    // A0 : A1
    old_Write8Bit(cmp_out, &bitPosition, 8, CMP_STATIC_CAST(CGV_UINT8, params.alpha_qendpoint[0]));
    old_Write8Bit(cmp_out, &bitPosition, 8, CMP_STATIC_CAST(CGV_UINT8, params.alpha_qendpoint[4]));
    // color index 2 bits each (31 bits total)
    // alpha index 2 bits each (31 bits total)
    old_encode_index(cmp_out, &bitPosition, params.color_index, 2);
    old_encode_index(cmp_out, &bitPosition, params.alpha_index, 2);
}
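//---------------------------------------------------------------------------------
// Compress_mode45: tries every channel rotation (and, for mode 4, both index
// precision assignments via idxMode), quantizes the rotated color and scalar
// blocks, refines the promising configurations with cmp_optimize_IndexAndEndPoints
// and encodes whichever candidate yields the lowest error. The scalar (alpha)
// error is divided by 3 to keep it comparable with the three-channel color error.
//---------------------------------------------------------------------------------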
void Compress_mode45(CMP_INOUT CGU_UINT32 cmp_out[4], CGU_INT blockMode, CGU_Vec4ui image_src[SOURCE_BLOCK_SIZE])
{
    cmp_mode_parameters2 best_candidate;
    CGU_UINT32 channels3or4 = 4;
    CGU_UINT8  numClusters0[2];
    CGU_UINT8  numClusters1[2];
    CGU_INT    modeBits[2];
    CGU_INT    max_idxMode;
    if (blockMode == 4)
    {
        max_idxMode     = 2;
        modeBits[0]     = 30;  // bits = 2 * (Red 5 + Grn 5 + Blu 5)
        modeBits[1]     = 36;  // bits = 2 * (Alpha 6+6+6)
        numClusters0[0] = 4;
        numClusters0[1] = 8;
        numClusters1[0] = 8;
        numClusters1[1] = 4;
    }
    else
    {
        max_idxMode     = 1;
        modeBits[0]     = 42;  // bits = 2 * (Red 7 + Grn 7 + Blu 7)
        modeBits[1]     = 48;  // bits = 2 * (Alpha 8+8+8) = 48
        numClusters0[0] = 4;
        numClusters0[1] = 4;
        numClusters1[0] = 4;
        numClusters1[1] = 4;
    }
    CGU_Vec4ui src_color_Block[SOURCE_BLOCK_SIZE];
    CGU_Vec4ui src_alpha_Block[SOURCE_BLOCK_SIZE];
    CGV_FLOAT  best_err = CMP_FLOAT_MAX;
    // Go through each possible rotation and selection of index precision (rotation bits)
    for (CGU_UINT8 rotated_channel = 0; rotated_channel < channels3or4; rotated_channel++)
    {
        // A: build the rotated color and scalar (alpha) blocks
        for (CGU_INT k = 0; k < SOURCE_BLOCK_SIZE; k++)
        {
            for (CGU_INT p = 0; p < 3; p++)
            {
                src_color_Block[k][p] = image_src[k][componentRotations2[rotated_channel][p + 1]];
                src_alpha_Block[k][p] = image_src[k][componentRotations2[rotated_channel][0]];
            }
            src_color_Block[k][3] = image_src[k][3];
            src_alpha_Block[k][3] = image_src[k][componentRotations2[3][3]];
        }
        CGV_FLOAT err_quantizer;
        CGV_FLOAT err_bestQuantizer = CMP_FLOAT_MAX;
        for (CGU_INT idxMode = 0; idxMode < max_idxMode; idxMode++)
        {
            err_quantizer = cmp_GetQuantizeIndex_old(best_candidate.color_index, src_color_Block, SOURCE_BLOCK_SIZE, numClusters0[idxMode], 3);
            err_quantizer += cmp_GetQuantizeIndex_old(best_candidate.alpha_index, src_alpha_Block, SOURCE_BLOCK_SIZE, numClusters1[idxMode], 3) / 3.0F;
            // If quality is high then run the full shaking for this config and
            // store the result if it beats the best overall error.
            // Otherwise only run the shaking if the error is better than the best
            // quantizer error
            if (err_quantizer <= err_bestQuantizer)
            {
                err_bestQuantizer = err_quantizer;
                // Shake size gives the size of the shake cube
                CGV_FLOAT  err_overallError;
                CGU_Vec4ui color_qendpoint2[2] = {{0, 0, 0, 0}, {0, 0, 0, 0}};
                CGV_Vec4ui src_image_block[16];
                CGU_Vec4ui alpha_qendpoint2[2] = {{0, 0, 0, 0}, {0, 0, 0, 0}};
                CGU_UINT32 alpha_index[16];
                CGU_UINT32 color_index[16];
                for (int k = 0; k < 16; k++)
                {
                    alpha_index[k] = best_candidate.alpha_index[k];
                    color_index[k] = best_candidate.color_index[k];
                }
                CGU_UINT32 color_index_packed_out[2] = {0, 0};
                CGU_UINT32 alpha_index_packed_out[2] = {0, 0};
                err_overallError = cmp_optimize_IndexAndEndPoints(color_qendpoint2,
                                                                  color_index,
                                                                  color_index_packed_out,
                                                                  src_color_Block,
                                                                  16,
                                                                  numClusters0[idxMode],
                                                                  modeBits[0],
                                                                  3,
                                                                  0.01f,
                                                                  blockMode);
                // Alpha scalar block
                err_overallError += cmp_optimize_IndexAndEndPoints(alpha_qendpoint2,
                                                                   alpha_index,
                                                                   alpha_index_packed_out,
                                                                   src_alpha_Block,
                                                                   16,
                                                                   numClusters1[idxMode],
                                                                   modeBits[1],
                                                                   3,
                                                                   0.01f,
                                                                   blockMode) / 3;
                // If we beat the previous best then encode the block
                if (err_overallError < best_err)
                {
                    best_err                       = err_overallError;
                    best_candidate.idxMode         = idxMode;
                    best_candidate.rotated_channel = rotated_channel;
                    best_candidate.alpha_qendpoint[0] = alpha_qendpoint2[0].x;
                    best_candidate.alpha_qendpoint[1] = alpha_qendpoint2[0].y;
                    best_candidate.alpha_qendpoint[2] = alpha_qendpoint2[0].z;
                    best_candidate.alpha_qendpoint[3] = alpha_qendpoint2[0].w;
                    best_candidate.alpha_qendpoint[4] = alpha_qendpoint2[1].x;
                    best_candidate.alpha_qendpoint[5] = alpha_qendpoint2[1].y;
                    best_candidate.alpha_qendpoint[6] = alpha_qendpoint2[1].z;
                    best_candidate.alpha_qendpoint[7] = alpha_qendpoint2[1].w;
                    best_candidate.color_qendpoint[0] = color_qendpoint2[0].x;
                    best_candidate.color_qendpoint[1] = color_qendpoint2[0].y;
                    best_candidate.color_qendpoint[2] = color_qendpoint2[0].z;
                    best_candidate.color_qendpoint[3] = color_qendpoint2[0].w;
                    best_candidate.color_qendpoint[4] = color_qendpoint2[1].x;
                    best_candidate.color_qendpoint[5] = color_qendpoint2[1].y;
                    best_candidate.color_qendpoint[6] = color_qendpoint2[1].z;
                    best_candidate.color_qendpoint[7] = color_qendpoint2[1].w;
                    for (int k = 0; k < 16; k++)
                    {
                        best_candidate.color_index[k] = color_index[k];
                        best_candidate.alpha_index[k] = alpha_index[k];
                    }
                    CGV_UINT8 cmp_out16[COMPRESSED_BLOCK_SIZE];
                    if (blockMode == 4)
                        cmp_Encode_mode4(cmp_out16, best_candidate);
                    else
                        cmp_Encode_mode5(cmp_out16, best_candidate);
                    cmp_out[0] = (CGU_UINT32)cmp_out16[0] + (CGU_UINT32)(cmp_out16[1] << 8) + (CGU_UINT32)(cmp_out16[2] << 16) + (CGU_UINT32)(cmp_out16[3] << 24);
                    cmp_out[1] = (CGU_UINT32)cmp_out16[4] + (CGU_UINT32)(cmp_out16[5] << 8) + (CGU_UINT32)(cmp_out16[6] << 16) + (CGU_UINT32)(cmp_out16[7] << 24);
                    cmp_out[2] = (CGU_UINT32)cmp_out16[8] + (CGU_UINT32)(cmp_out16[9] << 8) + (CGU_UINT32)(cmp_out16[10] << 16) + (CGU_UINT32)(cmp_out16[11] << 24);
                    cmp_out[3] = (CGU_UINT32)cmp_out16[12] + (CGU_UINT32)(cmp_out16[13] << 8) + (CGU_UINT32)(cmp_out16[14] << 16) + (CGU_UINT32)(cmp_out16[15] << 24);
                }
            }
        }  // B: idxMode loop
    }      // A: rotation loop
}
#endif
#endif
#ifdef ENABLE_CMP_REFINE_MODE6_API
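// get_ideal_cluster2: least-squares fit of two endpoints to the clustered pixels.
// With w_k = index_k / Mi_, each pixel is modeled as (1 - w_k) * E0 + w_k * E1;
// the 2x2 normal equations are accumulated in image_matrix0/image_matrix1 and
// solved through the explicit inverse (matrix_dd is the determinant). A zero
// determinant means only one index value is active, and the caller handles that
// case separately.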
CGU_BOOL get_ideal_cluster2(CMP_INOUT CGV_Vec4f image_cluster[2],
                            CMP_IN CGU_UINT32 index_cluster[16],
                            CMP_IN CGU_INT Mi_,
                            CMP_IN CGU_Vec4ui image_src[16],
                            CMP_IN CGU_UINT32 numEntries,
                            CMP_IN CGU_UINT32 channels3or4)
{
    // get ideal cluster centers
    CGV_Vec4f image_cluster_mean[16];
    for (CGU_UINT32 ii = 0; ii < 16; ii++)
    {
        image_cluster_mean[ii] = 0.0f;
    }
    GetClusterMean2(image_cluster_mean, image_src, index_cluster, numEntries, channels3or4);  // unrounded
    CGV_FLOAT image_matrix0[2] = {0, 0};  // matrix / inverse matrix
    CGV_FLOAT image_matrix1[2] = {0, 0};  // matrix / inverse matrix
    CGV_Vec4f image_rp[2];                // right-hand side of the RMS fit problem
    image_rp[0] = 0.0f;
    image_rp[1] = 0.0f;
    // weight with the count if running on a compacted index
    for (CGU_UINT32 k = 0; k < numEntries; k++)
    {
        image_matrix0[0] += (Mi_ - index_cluster[k]) * (Mi_ - index_cluster[k]);
        image_matrix0[1] += index_cluster[k] * (Mi_ - index_cluster[k]);  // im is symmetric
        image_matrix1[1] += index_cluster[k] * index_cluster[k];
        image_rp[0] += image_cluster_mean[index_cluster[k]] * (CGU_FLOAT)(Mi_ - index_cluster[k]);
        image_rp[1] += image_cluster_mean[index_cluster[k]] * (CGU_FLOAT)index_cluster[k];
    }
    CGV_FLOAT matrix_dd = image_matrix0[0] * image_matrix1[1] - image_matrix0[1] * image_matrix0[1];
    // matrix_dd == 0 means that index_cidx[k] and (Mi_ - index_cidx[k]) are collinear,
    // which implies only one active index; that case is taken care of separately
    if (matrix_dd == 0)
    {
        image_cluster[0] = 0.0f;
        image_cluster[1] = 0.0f;
        return FALSE;
    }
    image_matrix1[0] = image_matrix0[0];
    image_matrix0[0] = image_matrix1[1] / matrix_dd;
    image_matrix1[1] = image_matrix1[0] / matrix_dd;
    image_matrix1[0] = image_matrix0[1] = -image_matrix0[1] / matrix_dd;
    CGV_FLOAT Mif = (CGV_FLOAT)Mi_;
    // values can exceed 255 here; clamping made no difference in quality!
    image_cluster[0] = (((image_rp[0] * image_matrix0[0]) + (image_rp[1] * image_matrix0[1])) * Mif);
    image_cluster[1] = (((image_rp[0] * image_matrix1[0]) + (image_rp[1] * image_matrix1[1])) * Mif);
    return TRUE;
}
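// shake2: mode 6 endpoint "shaking". For every channel and every endpoint parity
// combination (the p-bits), it searches a small neighbourhood around the floor of
// the ideal endpoint (SHAKESIZE1 below, SHAKESIZE2 above, stepping by 2 when
// parity is fixed) for the quantized pair with the least per-channel squared
// error, then combines the per-channel results under each legal p-bit vector in
// par_vectors42_nd to pick the best overall endpoint pair.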
CGV_FLOAT shake2(CMP_INOUT CGU_Vec4ui epo_code_shake[2],
                 CMP_IN CGV_Vec4f image_cluster[2],
                 CMP_IN CGU_UINT32 index_cluster[16],
                 CMP_IN CGU_Vec4ui image_src[16],
                 CMP_IN CGU_UINT32 index_bits,
                 CMP_IN CGU_UINT32 mtype,
                 CMP_IN CGU_UINT32 max_bits[4],
                 CMP_IN CGU_UINT32 use_par,
                 CMP_IN CGU_UINT32 numEntries,  // max 16
                 CMP_IN CGU_UINT32 channels3or4)
{
    CMP_UNUSED(mtype);
    CGV_FLOAT best_err = CMP_FLOAT_MAX;
#define SHAKESIZE1 1
#define SHAKESIZE2 2
    // shake single                                     - cartesian
    // shake odd/odd and even/even                      - same parity
    // shake odd/odd, odd/even, even/odd and even/even  - bcc
    CGV_FLOAT  err_ed[2][2][4];
    CGU_UINT32 epo_code_par[2][2][2][4];
    for (CGU_UINT32 ch = 0; ch < channels3or4; ch++)
    {
        CGU_UINT32 ppA = 0;
        CGU_UINT32 ppB = 0;
        CGU_UINT32 rr  = (use_par ? 2 : 1);
        CGU_UINT32 epo_code_epi0[2];  // begin/end of the search range for endpoint 0
        CGU_UINT32 epo_code_epi1[2];  // begin/end of the search range for endpoint 1
        for (ppA = 0; ppA < rr; ppA++)
        {  // loop max = 2
            for (ppB = 0; ppB < rr; ppB++)
            {  // loop max = 2
                // set default ranges
                switch (ch)
                {
                case 0:
                    epo_code_epi0[0] = epo_code_epi0[1] = cmp_ep_find_floor2(image_cluster[0].x, max_bits[0], use_par, ppA);
                    epo_code_epi1[0] = epo_code_epi1[1] = cmp_ep_find_floor2(image_cluster[1].x, max_bits[0], use_par, ppB);
                    break;
                case 1:
                    epo_code_epi0[0] = epo_code_epi0[1] = cmp_ep_find_floor2(image_cluster[0].y, max_bits[1], use_par, ppA);
                    epo_code_epi1[0] = epo_code_epi1[1] = cmp_ep_find_floor2(image_cluster[1].y, max_bits[1], use_par, ppB);
                    break;
                case 2:
                    epo_code_epi0[0] = epo_code_epi0[1] = cmp_ep_find_floor2(image_cluster[0].z, max_bits[2], use_par, ppA);
                    epo_code_epi1[0] = epo_code_epi1[1] = cmp_ep_find_floor2(image_cluster[1].z, max_bits[2], use_par, ppB);
                    break;
                case 3:
                    if (channels3or4 == 4)
                    {
                        epo_code_epi0[0] = epo_code_epi0[1] = cmp_ep_find_floor2(image_cluster[0].w, max_bits[3], use_par, ppA);
                        epo_code_epi1[0] = epo_code_epi1[1] = cmp_ep_find_floor2(image_cluster[1].w, max_bits[3], use_par, ppB);
                    }
                    break;
                }
                // set begin of range
                epo_code_epi0[0] -= ((epo_code_epi0[0] < SHAKESIZE1 ? epo_code_epi0[0] : SHAKESIZE1)) & (~use_par);
                epo_code_epi1[0] -= ((epo_code_epi1[0] < SHAKESIZE1 ? epo_code_epi1[0] : SHAKESIZE1)) & (~use_par);
                // set end of range
                epo_code_epi0[1] +=
                    ((1 << max_bits[ch]) - 1 - epo_code_epi0[1] < SHAKESIZE2 ? (1 << max_bits[ch]) - 1 - epo_code_epi0[1] : SHAKESIZE2) & (~use_par);
                epo_code_epi1[1] +=
                    ((1 << max_bits[ch]) - 1 - epo_code_epi1[1] < SHAKESIZE2 ? (1 << max_bits[ch]) - 1 - epo_code_epi1[1] : SHAKESIZE2) & (~use_par);
                CGU_UINT32 step = (1 << use_par);
                err_ed[ppA][ppB][ch] = CMP_FLOAT_MAX;
                for (CGU_UINT32 epo_p0 = epo_code_epi0[0]; epo_p0 <= epo_code_epi0[1]; epo_p0 += step)
                {
                    for (CGU_UINT32 epo_p1 = epo_code_epi1[0]; epo_p1 <= epo_code_epi1[1]; epo_p1 += step)
                    {
                        CGV_FLOAT image_square_diff = 0.0F;
                        CGV_FLOAT image_ramp;
                        for (CGU_UINT32 _mc = 1; _mc < numEntries; _mc++)
                        {
                            image_ramp = GetRamp2(epo_p0, epo_p1, index_cluster[_mc], index_bits);
                            switch (ch)
                            {
                            case 0:
                                image_square_diff += cmp_squaref(image_ramp - image_src[_mc].x);
                                break;
                            case 1:
                                image_square_diff += cmp_squaref(image_ramp - image_src[_mc].y);
                                break;
                            case 2:
                                image_square_diff += cmp_squaref(image_ramp - image_src[_mc].z);
                                break;
                            case 3:
                                if (channels3or4 == 4)
                                    image_square_diff += cmp_squaref(image_ramp - image_src[_mc].w);
                                break;
                            }
                        }
                        if (image_square_diff < err_ed[ppA][ppB][ch])
                        {
                            err_ed[ppA][ppB][ch]          = image_square_diff;
                            epo_code_par[ppA][ppB][0][ch] = epo_p0;
                            epo_code_par[ppA][ppB][1][ch] = epo_p1;
                        }
                    }
                }
            }  // pp1
        }      // pp0
    }          // j
    //---------------------------------------------------------
    // CMP_CONSTANT CGU_UINT8 npv_nd[2][8] = {
    //     {1, 2, 4, 8, 16, 32, 0, 0},  // 3 channel
    //     {1, 2, 4, 0, 0, 0, 0, 0}     // 4 channel, type index 0..7
    // };
    // for (CGU_INT pn = 0; pn < npv_nd[channels3or4 - 3][type]; pn++)
    CGU_UINT32 bits = 4;  // for mode 6 it is 4
    for (CGU_UINT32 pn = 0; pn < bits; pn++)
    {
        CGV_FLOAT  err_2 = 0.0F;
        CGU_UINT32 d1    = 0;
        CGU_UINT32 d2    = 0;
        for (CGU_UINT32 ch = 0; ch < channels3or4; ch++)
        {
            d1 = par_vectors42_nd[pn][0][ch];
            d2 = par_vectors42_nd[pn][1][ch];
            err_2 += err_ed[d1][d2][ch];
        }
        if (err_2 < best_err)
        {
            best_err = err_2;
            d1 = par_vectors42_nd[pn][0][0];
            d2 = par_vectors42_nd[pn][1][0];
            epo_code_shake[0].x = epo_code_par[d1][d2][0][0];
            epo_code_shake[1].x = epo_code_par[d1][d2][1][0];
            d1 = par_vectors42_nd[pn][0][1];
            d2 = par_vectors42_nd[pn][1][1];
            epo_code_shake[0].y = epo_code_par[d1][d2][0][1];
            epo_code_shake[1].y = epo_code_par[d1][d2][1][1];
            d1 = par_vectors42_nd[pn][0][2];
            d2 = par_vectors42_nd[pn][1][2];
            epo_code_shake[0].z = epo_code_par[d1][d2][0][2];
            epo_code_shake[1].z = epo_code_par[d1][d2][1][2];
            if (channels3or4 == 4)
            {
                d1 = par_vectors42_nd[pn][0][3];
                d2 = par_vectors42_nd[pn][1][3];
                epo_code_shake[0].w = epo_code_par[d1][d2][0][3];
                epo_code_shake[1].w = epo_code_par[d1][d2][1][3];
            }
        }
    }
    return best_err;
}
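// requantized_image_err2: mode 6 counterpart of cmp_requantized_index. Rebuilds
// the ramp from the candidate endpoints with GetRamp2 and reassigns each pixel to
// its closest ramp entry, returning the summed squared error. The per-pixel
// minimum is seeded with 262145.0f = (256 * 256 * 4) + 1, an upper bound on any
// 4-channel squared difference.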
CGV_FLOAT requantized_image_err2(CMP_INOUT CGU_UINT32 index_best[16],
                                 CMP_IN CGU_Vec4ui epo_code_best[2],
                                 CMP_IN CGU_UINT32 index_bits,
                                 CMP_IN CGU_UINT32 max_bits[4],
                                 CMP_IN CGU_Vec4ui image_src[16],
                                 CMP_IN CGU_UINT32 numEntries,  // max 16
                                 CMP_IN CGU_UINT32 channels3or4)
{  // IN: 3 = RGB or 4 = RGBA (4 = MAX_CHANNELS)
    CMP_UNUSED(channels3or4);
    CMP_UNUSED(max_bits);
    //=========================================
    // requantized image based on new epo_code
    //=========================================
    CGV_Vec4f image_requantize[16];
    CGV_FLOAT err_requant = 0.0F;
    for (CGU_UINT32 k = 0; k < numEntries; k++)
    {
        image_requantize[k].x = GetRamp2(epo_code_best[0].x, epo_code_best[1].x, k, index_bits);
        image_requantize[k].y = GetRamp2(epo_code_best[0].y, epo_code_best[1].y, k, index_bits);
        image_requantize[k].z = GetRamp2(epo_code_best[0].z, epo_code_best[1].z, k, index_bits);
        image_requantize[k].w = GetRamp2(epo_code_best[0].w, epo_code_best[1].w, k, index_bits);
    }
    //=========================================
    // Calc the error for the requantized image
    //=========================================
    CGV_FLOAT  err_cmin;
    CGU_UINT32 best_indx;
    CGV_FLOAT  image_err;
    CGV_Vec4f  imageDiff;
    for (CGU_UINT32 k1 = 0; k1 < numEntries; k1++)
    {
        // start with the error seeded as the sum of 4 channels with max pixel
        // value 256 squared, plus 1 for the minimum check = (256 * 256 * 4) + 1
        err_cmin  = 262145.0f;
        best_indx = 0;
        for (CGU_UINT8 k2 = 0; k2 < numEntries; k2++)
        {
            image_err   = 0.0F;
            imageDiff.x = image_requantize[k2].x - image_src[k1].x;
            imageDiff.y = image_requantize[k2].y - image_src[k1].y;
            imageDiff.z = image_requantize[k2].z - image_src[k1].z;
            imageDiff.w = image_requantize[k2].w - image_src[k1].w;
            image_err   = cmp_dot4f(imageDiff, imageDiff);
            if (image_err < err_cmin)
            {
                err_cmin  = image_err;
                best_indx = k2;
            }
        }
        index_best[k1] = best_indx;
        err_requant += err_cmin;
    }
    return err_requant;
}
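// cmp_mode6_optimize_IndexAndEndPoints: mode 6 specialization of
// cmp_optimize_IndexAndEndPoints with the mode constants folded in (4-bit
// indices, 8 endpoint bits per channel, p-bits always in use). It runs the same
// collapse / slope-offset / shake / requantize loop as the generic path.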
CGV_FLOAT cmp_mode6_optimize_IndexAndEndPoints(CMP_INOUT CGU_Vec4ui epo_code_out[2],
                                               CMP_INOUT CGU_UINT32 index_io[16],  // Make sure the input index is in the 0..15 range
                                               CMP_IN CGU_Vec4ui image_src[16],
                                               CMP_IN CGU_UINT32 numEntries,       // max 16
                                               CMP_IN CGU_UINT32 Mi_,              // last cluster; this should be no larger than 16
                                               CMP_IN CGU_UINT32 bits,             // total for all components
                                               CMP_IN CGU_UINT32 channels3or4,     // IN: 3 = RGB or 4 = RGBA (4 = MAX_CHANNELS)
                                               CMP_IN CGU_FLOAT errorThreshold)
{
    CMP_UNUSED(bits);
    CGV_FLOAT  err_best = CMP_FLOAT_MAX;
    CGU_UINT32 type     = 2;                // = bits % (2 * channels3or4); for mode 6 with 58 bits and 4 channels the type is 2
    CGU_UINT32 use_par  = 1;                // as type == 2, use_par is 1 = (type != 0)
    CGU_UINT32 max_bits[4] = {8, 8, 8, 8};  // mode 6 max bits is 8 = (bits + channels2 - 1) / channels2
    CGU_UINT32 index_bits  = 4;             // channel bits !! = 4
    // CGU_INT iv;
    // iv = Mi_;
    // while (iv >>= 1)
    //     index_bits++;
    Mi_ = Mi_ - 1;
    CGU_UINT32 index_tmp[16];
    CGU_UINT32 maxTry = MAX_TRY_SHAKER;  // should be set by quality
    CGV_FLOAT  err_requant = 0.0F;
    // Init best index to input index
    for (CGU_UINT32 k = 0; k < numEntries; k++)
        index_tmp[k] = index_io[k];
    CGU_UINT32 MaxIndex;
    MaxIndex = index_collapse2(index_tmp, numEntries);
    // we have a solid color 4x4 block: no need for optimization!
    if (MaxIndex == 0)
        return 0.0f;
    for (CGU_UINT32 ii = 0; ii < maxTry; ii++)
    {
        //===============================
        // We have ramp colors to process
        //===============================
        CGV_FLOAT  err_cluster = CMP_FLOAT_MAX;
        CGV_FLOAT  err_shake;
        CGU_UINT32 index_cluster[16];
        CGU_Vec4ui epo_code_best[2] = {{0, 0, 0, 0}, {0, 0, 0, 0}};
        for (CGU_UINT32 ii2 = 0; ii2 < numEntries; ii2++)
            index_cluster[ii2] = 0;
        CGU_UINT32 mi = Mi_;
        for (CGU_UINT32 index_slope = 1; (index_slope * MaxIndex) <= mi; index_slope++)
        {
            CGV_Vec4f image_cluster[2] = {{0.0f, 0.0f, 0.0f, 0.0f}, {0.0f, 0.0f, 0.0f, 0.0f}};
            for (CGU_UINT32 index_offset = 0; index_offset <= (mi - index_slope * MaxIndex); index_offset++)
            {
                //-------------------------------------
                // set a new index data set to try
                //-------------------------------------
                for (CGU_UINT32 k = 0; k < numEntries; k++)
                    index_cluster[k] = index_tmp[k] * index_slope + index_offset;
                if (get_ideal_cluster2(image_cluster, index_cluster, Mi_, image_src, numEntries, channels3or4))
                {
                    CGU_Vec4ui epo_code_shake[2] = {{0, 0, 0, 0}, {0, 0, 0, 0}};
                    err_shake = shake2(epo_code_shake,  // return new epo
                                       image_cluster,
                                       index_cluster,
                                       image_src,
                                       index_bits,
                                       type,
                                       max_bits,
                                       use_par,
                                       numEntries,  // max 16
                                       channels3or4);
                    if (err_shake < err_cluster)
                    {
                        err_cluster      = err_shake;
                        epo_code_best[0] = epo_code_shake[0];
                        epo_code_best[1] = epo_code_shake[1];
                    }
                }
            }
        }
        if (err_cluster != CMP_FLOAT_MAX)
        {
            //=========================
            // test results for quality
            //=========================
            CGU_UINT32 index_best[16] = {0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0};
            err_requant = requantized_image_err2(index_best,     // new index results
                                                 epo_code_best,  // prior result input
                                                 index_bits,
                                                 max_bits,
                                                 image_src,
                                                 numEntries,
                                                 channels3or4);
            if (err_requant < err_best)
            {
                for (CGU_UINT32 k = 0; k < numEntries; k++)
                    index_io[k] = index_tmp[k] = index_best[k];
                //cmp_pack4bitindex(index_packed_out, index_io);
                epo_code_out[0] = epo_code_best[0];
                epo_code_out[1] = epo_code_best[1];
                err_best        = err_requant;
            }
        }
        // Early out if we have reached our target error
        if (err_best <= errorThreshold)
            break;
        MaxIndex = index_collapse2(index_tmp, numEntries);
        if (MaxIndex == 0)
            break;
    }
    // Did not find anything better within maxTry attempts
    return err_best;
}
#endif
#endif  // ENABLE_CMP_API : CPU & GPU code block
//=================================================================================
// GPU API Interfaces
// modes 4, 5 and 6 all have 1 subset per block, and the fix-up index is always index 0
//=================================================================================
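// TryMode456CS: each 4x4 block is handled by 16 threads (MAX_USED_THREAD), so a
// thread group processes THREAD_GROUP_SIZE / 16 blocks. The 16 threads first load
// the block's pixels into shared_temp, then a log2(16)-step min/max reduction
// produces the bounding-box endpoints. Threads 0..7 evaluate mode 4 (one
// rotation/index-selector pair each), threads 8..11 mode 5 (one rotation each)
// and threads 12..15 mode 6 (one p-bit combination each).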
CMP_NUMTHREADS(THREAD_GROUP_SIZE, 1, 1) void TryMode456CS(CGU_UINT32 GI CMP_SVGROUPINDEX, CGU_Vec3ui groupID CMP_SVGROUPID)
{
    CMP_CONSTANT CGU_UINT32 MAX_USED_THREAD = 16;
    CGU_UINT32 BLOCK_IN_GROUP = THREAD_GROUP_SIZE / MAX_USED_THREAD;
    CGU_UINT32 blockInGroup   = GI / MAX_USED_THREAD;
    CGU_UINT32 blockID        = g_start_block_id + groupID.x * BLOCK_IN_GROUP + blockInGroup;
    CGU_UINT32 threadBase     = blockInGroup * MAX_USED_THREAD;
    CGU_UINT32 threadInBlock  = GI - threadBase;
    CGU_UINT32 block_y        = blockID / g_num_block_x;
    CGU_UINT32 block_x        = blockID - block_y * g_num_block_x;
    CGU_UINT32 base_x         = block_x * BLOCK_SIZE_X;
    CGU_UINT32 base_y         = block_y * BLOCK_SIZE_Y;
#if (defined(ENABLE_MODE4) || defined(ENABLE_MODE5) || defined(ENABLE_MODE6) || defined(ENABLE_CMP_MODE6))
    if (threadInBlock < 16)
    {
        CGU_Vec4f px = g_Input.Load(CGU_Vec3ui(base_x + threadInBlock % 4, base_y + threadInBlock / 4, 0)) * 255.0f;
        px = clamp(px, 0.0f, 255.0f);
        shared_temp[GI].pixel.r = (CGU_UINT32)px.r;
        shared_temp[GI].pixel.g = (CGU_UINT32)px.g;
        shared_temp[GI].pixel.b = (CGU_UINT32)px.b;
        shared_temp[GI].pixel.a = (CGU_UINT32)px.a;
        shared_temp[GI].endPoint_low  = shared_temp[GI].pixel;
        shared_temp[GI].endPoint_high = shared_temp[GI].pixel;
    }
    GroupSync();
    if (threadInBlock < 8)
    {
        shared_temp[GI].endPoint_low  = cmp_min(shared_temp[GI].endPoint_low, shared_temp[GI + 8].endPoint_low);
        shared_temp[GI].endPoint_high = cmp_max(shared_temp[GI].endPoint_high, shared_temp[GI + 8].endPoint_high);
    }
    GroupSync();
    if (threadInBlock < 4)
    {
        shared_temp[GI].endPoint_low  = cmp_min(shared_temp[GI].endPoint_low, shared_temp[GI + 4].endPoint_low);
        shared_temp[GI].endPoint_high = cmp_max(shared_temp[GI].endPoint_high, shared_temp[GI + 4].endPoint_high);
    }
    GroupSync();
    if (threadInBlock < 2)
    {
        shared_temp[GI].endPoint_low  = cmp_min(shared_temp[GI].endPoint_low, shared_temp[GI + 2].endPoint_low);
        shared_temp[GI].endPoint_high = cmp_max(shared_temp[GI].endPoint_high, shared_temp[GI + 2].endPoint_high);
    }
    GroupSync();
    if (threadInBlock < 1)
    {
        shared_temp[GI].endPoint_low  = cmp_min(shared_temp[GI].endPoint_low, shared_temp[GI + 1].endPoint_low);
        shared_temp[GI].endPoint_high = cmp_max(shared_temp[GI].endPoint_high, shared_temp[GI + 1].endPoint_high);
    }
    GroupSync();
    CGU_Vec4ui endPoint[2];
    endPoint[0] = shared_temp[threadBase].endPoint_low;
    endPoint[1] = shared_temp[threadBase].endPoint_high;
    CGU_UINT32 error          = 0xFFFFFFFF;
    CGU_UINT32 mode           = 0;
    CGU_UINT32 index_selector = 0;
    CGU_UINT32 rotation       = 0;
    CGU_Vec2ui indexPrec;
    if (threadInBlock < 8)  // all threads with threadInBlock < 8 try mode 4, since only mode 4 has an index selector bit
    {
        if (0 == (threadInBlock & 1))  // thread 0, 2, 4, 6
        {
            // 2 represents 2-bit index precision; 1 represents 3-bit index precision
            index_selector = 0;
            indexPrec      = CGU_Vec2ui(2, 1);
        }
        else  // thread 1, 3, 5, 7
        {
            // 2 represents 2-bit index precision; 1 represents 3-bit index precision
            index_selector = 1;
            indexPrec      = CGU_Vec2ui(1, 2);
        }
    }
    else
    {
        // 2 represents 2-bit index precision
        indexPrec = CGU_Vec2ui(2, 2);
    }
    CGU_Vec4ui pixel_r;
    CGU_UINT32 color_index;
    CGU_UINT32 alpha_index;
    CGU_Vec4i  span;
    CGU_Vec2i  span_norm_sqr;
    CGU_Vec2i  dotProduct;
#if defined(ENABLE_MODE4) || defined(ENABLE_MODE5)
    if (threadInBlock < 12)  // Try modes 4 and 5 in threads 0..11
    {
        CGU_Vec4ui ep_quantized[2];
        // modes 4 and 5 have component rotation
        if ((threadInBlock < 2) || (8 == threadInBlock))  // rotation = 0 in threads 0, 1 and 8
        {
            rotation = 0;
        }
        else if ((threadInBlock < 4) || (9 == threadInBlock))  // rotation = 1 in threads 2, 3 and 9
        {
            rotation = 1;
            set_pixel_rotation(endPoint[0], rotation);
            set_pixel_rotation(endPoint[1], rotation);
        }
        else if ((threadInBlock < 6) || (10 == threadInBlock))  // rotation = 2 in threads 4, 5 and 10
        {
            rotation = 2;
            set_pixel_rotation(endPoint[0], rotation);
            set_pixel_rotation(endPoint[1], rotation);
        }
        else if ((threadInBlock < 8) || (11 == threadInBlock))  // rotation = 3 in threads 6, 7 and 11
        {
            rotation = 3;
            set_pixel_rotation(endPoint[0], rotation);
            set_pixel_rotation(endPoint[1], rotation);
        }
        if (threadInBlock < 8)  // try mode 4 in threads 0..7
        {
            // mode 4 thread distribution
            // Thread         0 1 2 3 4 5 6 7
            // Rotation       0 0 1 1 2 2 3 3
            // Index selector 0 1 0 1 0 1 0 1
            mode = 4;
            compress_endpoints4(endPoint, ep_quantized);
        }
        else  // try mode 5 in threads 8..11
        {
            // mode 5 thread distribution
            // Thread   8 9 10 11
            // Rotation 0 1  2  3
            mode = 5;
            compress_endpoints5(endPoint, ep_quantized);
        }
        CGU_Vec4ui pixel = shared_temp[threadBase + 0].pixel;
        set_pixel_rotation(pixel, rotation);
        span          = cmp_castimp(endPoint[1] - endPoint[0]);
        span_norm_sqr = CGU_Vec2i(dot(span.rgb, span.rgb), span.a * span.a);
        // should be the same as above
        CGU_Vec3ui diff0 = pixel.rgb - endPoint[0].rgb;
        CGU_Vec3ui diff1 = pixel.rgb - endPoint[1].rgb;
        dotProduct       = CGU_Vec2i(dot(diff0, diff0), dot(diff1, diff1));
        if (dotProduct.x > dotProduct.y)
        {
            span.rgb.x = -span.rgb.x;
            span.rgb.y = -span.rgb.y;
            span.rgb.z = -span.rgb.z;
            swap(endPoint[0].rgb, endPoint[1].rgb);
        }
        CGU_UINT32 diffa0 = pixel.a - endPoint[0].a;
        CGU_UINT32 diffa1 = pixel.a - endPoint[1].a;
        dotProduct        = CGU_Vec2i(dot(diffa0, diffa0), dot(diffa1, diffa1));
        if (dotProduct.x > dotProduct.y)
        {
            span.a = -span.a;
            swap(endPoint[0].a, endPoint[1].a);
        }
        error = 0;
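        // For each pixel: project it onto the endpoint span, quantize the
        // normalized position dotProduct / span_norm_sqr onto a 0..63 ramp
        // (the 63.49999 factor rounds to nearest), map that through aStep[]
        // to the nearest index at the chosen precision, then accumulate the
        // squared error between the reconstructed and source pixel.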
        for (CGU_UINT32 i = 0; i < 16; i++)
        {
            pixel = shared_temp[threadBase + i].pixel;
            set_pixel_rotation(pixel, rotation);
            diff0        = pixel.rgb - endPoint[0].rgb;
            dotProduct.x = dot(span.rgb, diff0);
            color_index  = (span_norm_sqr.x <= 0 /*endPoint[0] == endPoint[1]*/ || dotProduct.x <= 0 /*pixel == endPoint[0]*/)
                               ? 0
                               : ((dotProduct.x < span_norm_sqr.x) ? aStep[indexPrec.x][CGU_UINT32(dotProduct.x * 63.49999 / span_norm_sqr.x)]
                                                                   : aStep[indexPrec.x][63]);
            diffa0       = pixel.a - endPoint[0].a;
            dotProduct.y = dot(span.a, diffa0);
            alpha_index  = (span_norm_sqr.y <= 0 || dotProduct.y <= 0)
                               ? 0
                               : ((dotProduct.y < span_norm_sqr.y) ? aStep[indexPrec.y][CGU_UINT32(dotProduct.y * 63.49999 / span_norm_sqr.y)]
                                                                   : aStep[indexPrec.y][63]);
            pixel_r.rgb   = (endPoint[0].rgb * (64 - aWeight[indexPrec.x][color_index]) + endPoint[1].rgb * aWeight[indexPrec.x][color_index] + 32U);
            pixel_r.rgb.x = pixel_r.rgb.x >> 6;
            pixel_r.rgb.y = pixel_r.rgb.y >> 6;
            pixel_r.rgb.z = pixel_r.rgb.z >> 6;
            pixel_r.a     = (endPoint[0].a * (64 - aWeight[indexPrec.y][alpha_index]) + endPoint[1].a * aWeight[indexPrec.y][alpha_index] + 32) >> 6;
            Ensure_A_Is_Larger(pixel_r, pixel);
            pixel_r -= pixel;
            set_pixel_rotation(pixel_r, rotation);
            error += ComputeError(pixel_r, pixel_r);
        }
    }
    else
#endif
#ifdef ENABLE_MODE6
    if (threadInBlock < 16)  // Try mode 6 in threads 12..15; among modes 4, 5 and 6, only mode 6 has p bits
    {
        CGU_UINT32 p = threadInBlock - 12;
        CGU_Vec4ui ep_quantized[2];
        compress_endpoints6(endPoint, ep_quantized, CGU_Vec2ui(p & 1, (p >> 1) & 1));
        CGU_Vec4ui pixel = shared_temp[threadBase + 0].pixel;
        span          = cmp_castimp(endPoint[1] - endPoint[0]);
        span_norm_sqr = dot(span, span);
        CGU_Vec4ui diff4 = pixel - endPoint[0];
        dotProduct       = dot(span, diff4);
        if (span_norm_sqr.x > 0 && dotProduct.x >= 0 && CGU_UINT32(dotProduct.x * 63.49999) > CGU_UINT32(32 * span_norm_sqr.x))
        {
            span = -span;
            swap(endPoint[0], endPoint[1]);
        }
        error = 0;
        for (CGU_UINT32 i = 0; i < 16; i++)
        {
            pixel        = shared_temp[threadBase + i].pixel;
            diff4        = pixel - endPoint[0];
            dotProduct.x = dot(span, diff4);
            color_index  = (span_norm_sqr.x <= 0 || dotProduct.x <= 0)
                               ? 0
                               : ((dotProduct.x < span_norm_sqr.x) ? aStep[0][CGU_UINT32(dotProduct.x * 63.49999 / span_norm_sqr.x)] : aStep[0][63]);
            pixel_r = (endPoint[0] * (64 - aWeight[0][color_index]) +
                       endPoint[1] * aWeight[0][color_index] + 32U) >> 6;
            Ensure_A_Is_Larger(pixel_r, pixel);
            pixel_r -= pixel;
            error += ComputeError(pixel_r, pixel_r);
        }
        mode     = 6;
        rotation = p;  // Borrow rotation for p
    }
#endif
    shared_temp[GI].error          = error;
    shared_temp[GI].mode           = mode;
    shared_temp[GI].index_selector = index_selector;
    shared_temp[GI].rotation       = rotation;
    GroupSync();
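    // Min-error reduction across the 16 candidate threads (mode 4 with four
    // rotations x two index selectors, mode 5 with four rotations, mode 6 with
    // four p-bit combinations); thread 0 ends up with the best candidate.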
    if (threadInBlock < 8)
    {
        if (shared_temp[GI].error > shared_temp[GI + 8].error)
        {
            shared_temp[GI].error          = shared_temp[GI + 8].error;
            shared_temp[GI].mode           = shared_temp[GI + 8].mode;
            shared_temp[GI].index_selector = shared_temp[GI + 8].index_selector;
            shared_temp[GI].rotation       = shared_temp[GI + 8].rotation;
        }
    }
    GroupSync();
    if (threadInBlock < 4)
    {
        if (shared_temp[GI].error > shared_temp[GI + 4].error)
        {
            shared_temp[GI].error          = shared_temp[GI + 4].error;
            shared_temp[GI].mode           = shared_temp[GI + 4].mode;
            shared_temp[GI].index_selector = shared_temp[GI + 4].index_selector;
            shared_temp[GI].rotation       = shared_temp[GI + 4].rotation;
        }
    }
    GroupSync();
    if (threadInBlock < 2)
    {
        if (shared_temp[GI].error > shared_temp[GI + 2].error)
        {
            shared_temp[GI].error          = shared_temp[GI + 2].error;
            shared_temp[GI].mode           = shared_temp[GI + 2].mode;
            shared_temp[GI].index_selector = shared_temp[GI + 2].index_selector;
            shared_temp[GI].rotation       = shared_temp[GI + 2].rotation;
        }
    }
    GroupSync();
    if (threadInBlock < 1)
    {
        if (shared_temp[GI].error > shared_temp[GI + 1].error)
        {
            shared_temp[GI].error          = shared_temp[GI + 1].error;
            shared_temp[GI].mode           = shared_temp[GI + 1].mode;
            shared_temp[GI].index_selector = shared_temp[GI + 1].index_selector;
            shared_temp[GI].rotation       = shared_temp[GI + 1].rotation;
        }
        // Save the fast mode settings for modes 4 & 5 (check if q = 0 for mode 6)
        g_OutBuff1[blockID].error          = shared_temp[GI].error;
        g_OutBuff1[blockID].mode           = shared_temp[GI].mode & 0x07;
        g_OutBuff1[blockID].rotation       = shared_temp[GI].rotation;
        g_OutBuff1[blockID].index_selector = shared_temp[GI].index_selector;
        g_OutBuff1[blockID].partition      = 0;
        g_OutBuff1[blockID].data2          = 0;
        // Enable cmp test
#ifdef ENABLE_CMP_MODE6
        if ((g_quality > 0.05f)
#ifdef ENABLE_MODE6
            && (shared_temp[GI].mode == 6)
#endif
        )
        {
            CGU_Vec4ui image_src[16];
            for (int i = 0; i < 16; i++)
            {
                image_src[i].x = shared_temp[threadBase + i].pixel.x;
                image_src[i].y = shared_temp[threadBase + i].pixel.y;
                image_src[i].z = shared_temp[threadBase + i].pixel.z;
                image_src[i].w = shared_temp[threadBase + i].pixel.w;
            }
            CGU_Vec4ui epo_code_out[2]     = {{0, 0, 0, 0}, {0, 0, 0, 0}};
            CGU_UINT32 index_packed_out[2] = {0, 0};
            CGU_UINT32 cmp_out6[4]         = {0, 0, 0, 0};
            CGU_UINT32 best_index_out[16];
            CGU_UINT32 besterr = cmp_GetIndexedEndPoints(epo_code_out,
                                                         best_index_out,
                                                         image_src,
                                                         15,  // numEntries 0..15 (note: this function was changed from using 16)
                                                         0xffffffff);
            // Error calculation needs updating to be consistent throughout
            //if (besterr > shared_temp[GI].error)
            {
                cmp_pack4bitindex32(index_packed_out, best_index_out);
#ifdef ENABLE_CMP_REFINE_MODE6_API
                if (g_quality > 0.5f)
                {
                    // Refine for better quality using the prior best_index_out as initial input
                    besterr = cmp_mode6_optimize_IndexAndEndPoints(epo_code_out,
                                                                   best_index_out,
                                                                   image_src,
                                                                   16,  // numEntries
                                                                   g_modesettings[6].clusters,      // 16,
                                                                   g_modesettings[6].bits,          // 58,
                                                                   g_modesettings[6].channels3or4,  // 4,
                                                                   0.1f);
                    cmp_pack4bitindex32(index_packed_out, best_index_out);
                }
#endif
                cmp_encode_mode6(cmp_out6, epo_code_out, index_packed_out);
                // Add in CMP results
                g_OutBuff1[blockID].error   = besterr;
                g_OutBuff1[blockID].mode    = 6 | 0x10;
                g_OutBuff1[blockID].data2.x = cmp_out6[0];
                g_OutBuff1[blockID].data2.y = cmp_out6[1];
                g_OutBuff1[blockID].data2.z = cmp_out6[2];
                g_OutBuff1[blockID].data2.w = cmp_out6[3];
            }  // if better than fast mode
        }
#endif
    }
#else
    // Init
    if (threadInBlock < 1)
    {
        g_OutBuff1[blockID].error          = MAX_UINT;
        g_OutBuff1[blockID].mode           = 0;
        g_OutBuff1[blockID].rotation       = 0;
        g_OutBuff1[blockID].index_selector = 0;
        g_OutBuff1[blockID].partition      = 0;
        g_OutBuff1[blockID].data2          = 0;
    }
    GroupSync();
#endif
}
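//=================================================
// TryMode137CS: modes 1, 3 and 7 (two subsets per
// block). One thread per candidate partition (64):
// each thread builds per-subset bounding-box
// endpoints, tries every p-bit combination for the
// current g_mode_id, then the group reduces to the
// lowest-error partition, which replaces the prior
// result only if it beats g_InBuff[blockID].error.
//=================================================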
CMP_NUMTHREADS(THREAD_GROUP_SIZE, 1, 1) void TryMode137CS(CGU_UINT32 GI CMP_SVGROUPINDEX, CGU_Vec3ui groupID CMP_SVGROUPID)  // modes 1, 3 and 7 all have 2 subsets per block
{
    const CGU_UINT32 MAX_USED_THREAD = 64;
    CGU_UINT32 BLOCK_IN_GROUP = THREAD_GROUP_SIZE / MAX_USED_THREAD;
    CGU_UINT32 blockInGroup   = GI / MAX_USED_THREAD;
    CGU_UINT32 blockID        = g_start_block_id + groupID.x * BLOCK_IN_GROUP + blockInGroup;
    CGU_UINT32 threadBase     = blockInGroup * MAX_USED_THREAD;
    CGU_UINT32 threadInBlock  = GI - threadBase;
    CGU_UINT32 block_y        = blockID / g_num_block_x;
    CGU_UINT32 block_x        = blockID - block_y * g_num_block_x;
    CGU_UINT32 base_x         = block_x * BLOCK_SIZE_X;
    CGU_UINT32 base_y         = block_y * BLOCK_SIZE_Y;
    if (threadInBlock < 16)
    {
        CGU_Vec4f px = g_Input.Load(CGU_Vec3ui(base_x + threadInBlock % 4, base_y + threadInBlock / 4, 0)) * 255.0f;
        px = clamp(px, 0.0f, 255.0f);
        shared_temp[GI].pixel.r = (CGU_UINT32)px.r;
        shared_temp[GI].pixel.g = (CGU_UINT32)px.g;
        shared_temp[GI].pixel.b = (CGU_UINT32)px.b;
        shared_temp[GI].pixel.a = (CGU_UINT32)px.a;
    }
    GroupSync();
    shared_temp[GI].error = 0xFFFFFFFF;
    // Use this to test only one of modes 1, 3 or 7
    // if (g_mode_id != 7) {
    //     if (threadInBlock == 0)
    //         g_OutBuff1[blockID].error          = g_InBuff[blockID].error;
    //         g_OutBuff1[blockID].mode           = g_InBuff[blockID].mode;
    //         g_OutBuff1[blockID].partition      = g_InBuff[blockID].partition;
    //         g_OutBuff1[blockID].index_selector = g_InBuff[blockID].index_selector;
    //         g_OutBuff1[blockID].rotation       = g_InBuff[blockID].rotation;
    //         g_OutBuff1[blockID].data2          = g_InBuff[blockID].data2;
    //     return;
    // }
#if defined(ENABLE_MODE1) || defined(ENABLE_MODE3) || defined(ENABLE_MODE7)
    CGU_Vec4ui pixel_r;
    CGU_Vec4ui endPoint[2][2];  // endPoint[0..1 for subset id][0..1 for low and high in the subset]
    CGU_Vec4ui endPointBackup[2][2];
    CGU_UINT32 color_index;
    if (threadInBlock < 64)
    {
        CGU_UINT32 partition = threadInBlock;
        CGU_UINT32 i;
        endPoint[0][0] = MAX_UINT;
        endPoint[0][1] = MIN_UINT;
        endPoint[1][0] = MAX_UINT;
        endPoint[1][1] = MIN_UINT;
        CGU_UINT32 bits = blockPartitions[partition];
        for (i = 0; i < 16; i++)
        {
            CGU_Vec4ui pixel = shared_temp[threadBase + i].pixel;
            if (((bits >> i) & 0x01) == 1)
            {
                endPoint[1][0] = cmp_min(endPoint[1][0], pixel);
                endPoint[1][1] = cmp_max(endPoint[1][1], pixel);
            }
            else
            {
                endPoint[0][0] = cmp_min(endPoint[0][0], pixel);
                endPoint[0][1] = cmp_max(endPoint[0][1], pixel);
            }
        }
        endPointBackup[0][0] = endPoint[0][0];
        endPointBackup[0][1] = endPoint[0][1];
        endPointBackup[1][0] = endPoint[1][0];
        endPointBackup[1][1] = endPoint[1][1];
        CGU_UINT32 max_p = 2;  // mode 1
#if defined(ENABLE_MODE3) || defined(ENABLE_MODE7)
        if (g_mode_id != 1)
        {
            // in modes 3 and 7, there are two p bits per subset, one for each endpoint
            max_p = 4;
        }
#endif
        CGU_UINT32 final_p[2] = {0, 0};
        CGU_UINT32 error[2]   = {MAX_UINT, MAX_UINT};
        for (CGU_UINT32 p = 0; p < max_p; p++)
        {
            endPoint[0][0] = endPointBackup[0][0];
            endPoint[0][1] = endPointBackup[0][1];
            endPoint[1][0] = endPointBackup[1][0];
            endPoint[1][1] = endPointBackup[1][1];
            for (i = 0; i < 2; i++)  // loop through 2 subsets
            {
#if defined(ENABLE_MODE1)
                if (g_mode_id == 1)
                {
                    CGU_Vec4ui quantized[2];
                    compress_endpoints1(endPoint[i], quantized, p);
                }
#endif
#if defined(ENABLE_MODE3)
                if (g_mode_id == 3)
                {
                    CGU_Vec4ui quantized[2];
                    compress_endpoints3(endPoint[i], quantized, CGU_Vec2ui(p & 1, (p >> 1) & 1));
                }
#endif
#if defined(ENABLE_MODE7)
                if (g_mode_id == 7)
                {
                    CGU_Vec4ui quantized[2];
                    compress_endpoints7(endPoint[i], quantized, CGU_Vec2ui(p & 1, (p >> 1) & 1));
                }
#endif
            }
            CGU_Vec4i span[2];
            span[0].x = endPoint[0][1].x - endPoint[0][0].x;
            span[0].y = endPoint[0][1].y - endPoint[0][0].y;
            span[0].z = endPoint[0][1].z - endPoint[0][0].z;
            span[0].w = endPoint[0][1].w - endPoint[0][0].w;
            span[1].x = endPoint[1][1].x - endPoint[1][0].x;
            span[1].y = endPoint[1][1].y - endPoint[1][0].y;
            span[1].z = endPoint[1][1].z - endPoint[1][0].z;
            span[1].w = endPoint[1][1].w - endPoint[1][0].w;
#if defined(ENABLE_MODE3)
            if (g_mode_id != 7)
            {
                span[0].w = span[1].w = 0;
            }
#endif
            CGU_INT span_norm_sqr[2];
            span_norm_sqr[0] = dot(span[0], span[0]);
            span_norm_sqr[1] = dot(span[1], span[1]);
            CGU_Vec4i diff;
            diff.x = shared_temp[threadBase + 0].pixel.x - endPoint[0][0].x;
            diff.y = shared_temp[threadBase + 0].pixel.y - endPoint[0][0].y;
            diff.z = shared_temp[threadBase + 0].pixel.z - endPoint[0][0].z;
            diff.w = shared_temp[threadBase + 0].pixel.w - endPoint[0][0].w;
            // TODO: again, this shouldn't be necessary here in error calculation
            CGU_INT dotProduct = dot(span[0], diff);
            if (span_norm_sqr[0] > 0 && dotProduct > 0 && CGU_UINT32(dotProduct * 63.49999) > CGU_UINT32(32 * span_norm_sqr[0]))
            {
                span[0].x = -span[0].x;
                span[0].y = -span[0].y;
                span[0].z = -span[0].z;
                span[0].w = -span[0].w;
                swap(endPoint[0][0], endPoint[0][1]);
            }
            diff.x = shared_temp[threadBase + candidateFixUpIndex1D[partition].x].pixel.x - endPoint[1][0].x;
            diff.y = shared_temp[threadBase + candidateFixUpIndex1D[partition].x].pixel.y - endPoint[1][0].y;
            diff.z = shared_temp[threadBase + candidateFixUpIndex1D[partition].x].pixel.z - endPoint[1][0].z;
            diff.w = shared_temp[threadBase + candidateFixUpIndex1D[partition].x].pixel.w - endPoint[1][0].w;
            dotProduct = dot(span[1], diff);
            if (span_norm_sqr[1] > 0 && dotProduct > 0 && CGU_UINT32(dotProduct * 63.49999) > CGU_UINT32(32 * span_norm_sqr[1]))
            {
                span[1].x = -span[1].x;
                span[1].y = -span[1].y;
                span[1].z = -span[1].z;
                span[1].w = -span[1].w;
                swap(endPoint[1][0], endPoint[1][1]);
            }
            CGU_UINT32 step_selector = 1;  // mode 1 has 3-bit indices
#if defined(ENABLE_MODE3) || defined(ENABLE_MODE7)
            if (g_mode_id != 1)
            {
                step_selector = 2;  // modes 3 and 7 have 2-bit indices
            }
#endif
            CGU_UINT32 p_error[2] = {0, 0};
            for (i = 0; i < 16; i++)
            {
                CGU_UINT32 subset_index = (bits >> i) & 0x01;
                if (subset_index == 1)
                {
                    diff.x = shared_temp[threadBase + i].pixel.x - endPoint[1][0].x;
                    diff.y = shared_temp[threadBase + i].pixel.y - endPoint[1][0].y;
                    diff.z = shared_temp[threadBase + i].pixel.z - endPoint[1][0].z;
                    diff.w = shared_temp[threadBase + i].pixel.w - endPoint[1][0].w;
                    dotProduct  = dot(span[1], diff);
                    color_index = (span_norm_sqr[1] <= 0 || dotProduct <= 0)
                                      ? 0
                                      : ((dotProduct < span_norm_sqr[1]) ? aStep[step_selector][CGU_UINT32(dotProduct * 63.49999 / span_norm_sqr[1])]
                                                                         : aStep[step_selector][63]);
                }
                else
                {
                    diff.x = shared_temp[threadBase + i].pixel.x - endPoint[0][0].x;
                    diff.y = shared_temp[threadBase + i].pixel.y - endPoint[0][0].y;
                    diff.z = shared_temp[threadBase + i].pixel.z - endPoint[0][0].z;
                    diff.w = shared_temp[threadBase + i].pixel.w - endPoint[0][0].w;
                    dotProduct  = dot(span[0], diff);
                    color_index = (span_norm_sqr[0] <= 0 || dotProduct <= 0)
                                      ? 0
                                      : ((dotProduct < span_norm_sqr[0]) ? aStep[step_selector][CGU_UINT32(dotProduct * 63.49999 / span_norm_sqr[0])]
                                                                         : aStep[step_selector][63]);
                }
                pixel_r = (endPoint[subset_index][0] * (64 - aWeight[step_selector][color_index]) +
                           endPoint[subset_index][1] * aWeight[step_selector][color_index] + 32U) >> 6;
                if (g_mode_id != 7)
                {
                    pixel_r.a = 255;
                }
                CGU_Vec4ui pixel = shared_temp[threadBase + i].pixel;
                Ensure_A_Is_Larger(pixel_r, pixel);
                pixel_r -= pixel;
                CGU_UINT32 pixel_error = ComputeError(pixel_r, pixel_r);
                if (subset_index == 1)
                    p_error[1] += pixel_error;
                else
                    p_error[0] += pixel_error;
            }
            for (i = 0; i < 2; i++)
            {
                if (p_error[i] < error[i])
                {
                    error[i]   = p_error[i];
                    final_p[i] = p;
                }
            }
        }
        shared_temp[GI].error     = error[0] + error[1];
        shared_temp[GI].mode      = g_mode_id;
        shared_temp[GI].partition = partition;
        // modes 1, 3 and 7 have no rotation; the rotation field is reused for the p bits
        if (g_mode_id == 1)
            shared_temp[GI].rotation = (final_p[1] << 1) | final_p[0];
        else
            shared_temp[GI].rotation = (final_p[1] << 2) | final_p[0];
    }
    GroupSync();
    if (threadInBlock < 32)
    {
        if (shared_temp[GI].error > shared_temp[GI + 32].error)
        {
            shared_temp[GI].error     = shared_temp[GI + 32].error;
            shared_temp[GI].mode      = shared_temp[GI + 32].mode;
            shared_temp[GI].partition = shared_temp[GI + 32].partition;
            shared_temp[GI].rotation  = shared_temp[GI + 32].rotation;
        }
    }
    GroupSync();
    if (threadInBlock < 16)
    {
        if (shared_temp[GI].error > shared_temp[GI + 16].error)
        {
            shared_temp[GI].error     = shared_temp[GI + 16].error;
            shared_temp[GI].mode      = shared_temp[GI + 16].mode;
            shared_temp[GI].partition = shared_temp[GI + 16].partition;
            shared_temp[GI].rotation  = shared_temp[GI + 16].rotation;
        }
    }
    GroupSync();
    if (threadInBlock < 8)
    {
        if (shared_temp[GI].error > shared_temp[GI + 8].error)
        {
            shared_temp[GI].error     = shared_temp[GI + 8].error;
            shared_temp[GI].mode      = shared_temp[GI + 8].mode;
            shared_temp[GI].partition = shared_temp[GI + 8].partition;
            shared_temp[GI].rotation  = shared_temp[GI + 8].rotation;
        }
    }
    GroupSync();
    if (threadInBlock < 4)
    {
        if (shared_temp[GI].error > shared_temp[GI + 4].error)
        {
            shared_temp[GI].error     = shared_temp[GI + 4].error;
            shared_temp[GI].mode      = shared_temp[GI + 4].mode;
            shared_temp[GI].partition = shared_temp[GI + 4].partition;
            shared_temp[GI].rotation  = shared_temp[GI + 4].rotation;
        }
    }
    GroupSync();
    if (threadInBlock < 2)
    {
        if (shared_temp[GI].error > shared_temp[GI + 2].error)
        {
            shared_temp[GI].error     = shared_temp[GI + 2].error;
            shared_temp[GI].mode      = shared_temp[GI + 2].mode;
            shared_temp[GI].partition = shared_temp[GI + 2].partition;
            shared_temp[GI].rotation  = shared_temp[GI + 2].rotation;
        }
    }
    GroupSync();
    if (threadInBlock < 1)
    {
        if (shared_temp[GI].error > shared_temp[GI + 1].error)
        {
            shared_temp[GI].error     = shared_temp[GI + 1].error;
            shared_temp[GI].mode      = shared_temp[GI + 1].mode;
            shared_temp[GI].partition = shared_temp[GI + 1].partition;
            shared_temp[GI].rotation  = shared_temp[GI + 1].rotation;
        }
        if (g_InBuff[blockID].error > shared_temp[GI].error)
        {
            g_OutBuff1[blockID].error          = shared_temp[GI].error;
            g_OutBuff1[blockID].mode           = shared_temp[GI].mode;
            g_OutBuff1[blockID].partition      = shared_temp[GI].partition;
            g_OutBuff1[blockID].rotation       = shared_temp[GI].rotation;
            g_OutBuff1[blockID].index_selector = 0;
            g_OutBuff1[blockID].data2          = 0;
        }
        else
        {
            g_OutBuff1[blockID].error          = g_InBuff[blockID].error;
            g_OutBuff1[blockID].mode           = g_InBuff[blockID].mode;
            g_OutBuff1[blockID].partition      = g_InBuff[blockID].partition;
            g_OutBuff1[blockID].index_selector = g_InBuff[blockID].index_selector;
            g_OutBuff1[blockID].rotation       = g_InBuff[blockID].rotation;
            g_OutBuff1[blockID].data2          = g_InBuff[blockID].data2;
        }
    }
#else
    GroupSync();
    if (threadInBlock < 1)
    {
        // carry over prior results
        g_OutBuff1[blockID].error          = g_InBuff[blockID].error;
        g_OutBuff1[blockID].mode           = g_InBuff[blockID].mode;
        g_OutBuff1[blockID].partition      = g_InBuff[blockID].partition;
        g_OutBuff1[blockID].index_selector = g_InBuff[blockID].index_selector;
        g_OutBuff1[blockID].rotation       = g_InBuff[blockID].rotation;
        g_OutBuff1[blockID].data2          = g_InBuff[blockID].data2;
    }
#endif
}
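//=================================================
// TryMode02CS: modes 0 and 2 (three subsets per
// block). Mode 0 tries its 16 partitions with four
// p-bit combinations; mode 2 tries 64 partitions
// with no p bits. Partition ids are offset by 64
// into the blockPartitions2 table.
//=================================================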
CMP_NUMTHREADS(THREAD_GROUP_SIZE, 1, 1) void TryMode02CS(CGU_UINT32 GI CMP_SVGROUPINDEX, CGU_Vec3ui groupID CMP_SVGROUPID)  // modes 0 and 2 have 3 subsets per block
{
    const CGU_UINT32 MAX_USED_THREAD = 64;
    CGU_UINT32 BLOCK_IN_GROUP = THREAD_GROUP_SIZE / MAX_USED_THREAD;
    CGU_UINT32 blockInGroup   = GI / MAX_USED_THREAD;
    CGU_UINT32 blockID        = g_start_block_id + groupID.x * BLOCK_IN_GROUP + blockInGroup;
    CGU_UINT32 threadBase     = blockInGroup * MAX_USED_THREAD;
    CGU_UINT32 threadInBlock  = GI - threadBase;
    CGU_UINT32 block_y        = blockID / g_num_block_x;
    CGU_UINT32 block_x        = blockID - block_y * g_num_block_x;
    CGU_UINT32 base_x         = block_x * BLOCK_SIZE_X;
    CGU_UINT32 base_y         = block_y * BLOCK_SIZE_Y;
#if defined(ENABLE_MODE0) || defined(ENABLE_MODE2)
    if (threadInBlock < 16)
    {
        CGU_Vec4f px = g_Input.Load(CGU_Vec3ui(base_x + threadInBlock % 4, base_y + threadInBlock / 4, 0)) * 255.0f;
        px = clamp(px, 0.0f, 255.0f);
        shared_temp[GI].pixel.r = (CGU_UINT32)px.r;
        shared_temp[GI].pixel.g = (CGU_UINT32)px.g;
        shared_temp[GI].pixel.b = (CGU_UINT32)px.b;
        shared_temp[GI].pixel.a = (CGU_UINT32)px.a;
    }
    GroupSync();
    shared_temp[GI].error = 0xFFFFFFFF;
    CGU_UINT32 num_partitions;
    if (0 == g_mode_id)
    {
        num_partitions = 16;
    }
    else
    {
        num_partitions = 64;
    }
    CGU_Vec4ui pixel_r;
    CGU_Vec4ui endPoint[3][2];  // endPoint[0..2 for subset id][0..1 for low and high in the subset]
    CGU_Vec4ui endPointBackup[3][2];
    CGU_UINT32 color_index[16];
    if (threadInBlock < num_partitions)
    {
        CGU_UINT32 partition = threadInBlock + 64;
        endPoint[0][0] = MAX_UINT;
        endPoint[0][1] = MIN_UINT;
        endPoint[1][0] = MAX_UINT;
        endPoint[1][1] = MIN_UINT;
        endPoint[2][0] = MAX_UINT;
        endPoint[2][1] = MIN_UINT;
        CGU_UINT32 bits2 = blockPartitions2[partition - 64];
        CGU_UINT32 i;
        for (i = 0; i < 16; i++)
        {
            CGU_Vec4ui pixel        = shared_temp[threadBase + i].pixel;
            CGU_UINT32 subset_index = (bits2 >> (i * 2)) & 0x03;
            if (subset_index == 2)
            {
                endPoint[2][0] = cmp_min(endPoint[2][0], pixel);
                endPoint[2][1] = cmp_max(endPoint[2][1], pixel);
            }
            else if (subset_index == 1)
            {
                endPoint[1][0] = cmp_min(endPoint[1][0], pixel);
                endPoint[1][1] = cmp_max(endPoint[1][1], pixel);
            }
            else
            {
                endPoint[0][0] = cmp_min(endPoint[0][0], pixel);
                endPoint[0][1] = cmp_max(endPoint[0][1], pixel);
            }
        }
        endPointBackup[0][0] = endPoint[0][0];
        endPointBackup[0][1] = endPoint[0][1];
        endPointBackup[1][0] = endPoint[1][0];
        endPointBackup[1][1] = endPoint[1][1];
        endPointBackup[2][0] = endPoint[2][0];
        endPointBackup[2][1] = endPoint[2][1];
        CGU_UINT32 max_p;
        if (0 == g_mode_id)
        {
            max_p = 4;
        }
        else
        {
            max_p = 1;
        }
        CGU_UINT32 final_p[3] = {0, 0, 0};
        CGU_UINT32 error[3]   = {MAX_UINT, MAX_UINT, MAX_UINT};
        CGU_Vec4ui ep_quantized[2];
        for (CGU_UINT32 p = 0; p < max_p; p++)
        {
            endPoint[0][0] = endPointBackup[0][0];
            endPoint[0][1] = endPointBackup[0][1];
            endPoint[1][0] = endPointBackup[1][0];
            endPoint[1][1] = endPointBackup[1][1];
            endPoint[2][0] = endPointBackup[2][0];
            endPoint[2][1] = endPointBackup[2][1];
            for (i = 0; i < 3; i++)
            {
                if (0 == g_mode_id)
                {
                    compress_endpoints0(endPoint[i], ep_quantized, CGU_Vec2ui(p & 1, (p >> 1) & 1));
                }
                else
                {
                    compress_endpoints2(endPoint[i], ep_quantized);
                }
            }
            CGU_UINT32 step_selector = 1 + (2 == g_mode_id);
            CGU_Vec4i span[3];
            span[0] = cmp_castimp(endPoint[0][1] - endPoint[0][0]);
            span[1] = cmp_castimp(endPoint[1][1] - endPoint[1][0]);
            span[2] = cmp_castimp(endPoint[2][1] - endPoint[2][0]);
            span[0].w = span[1].w = span[2].w = 0;
            CGU_INT span_norm_sqr[3];
            span_norm_sqr[0] = dot(span[0], span[0]);
            span_norm_sqr[1] = dot(span[1], span[1]);
            span_norm_sqr[2] = dot(span[2], span[2]);
            // TODO: again, this shouldn't be necessary here in error calculation
            CGU_UINT32 ci[3] = {0, candidateFixUpIndex1D[partition].x, candidateFixUpIndex1D[partition].y};
            CGU_Vec4ui diff;
            for (i = 0; i < 3; i++)
            {
                diff = shared_temp[threadBase + ci[i]].pixel - endPoint[i][0];
                CGU_INT dotProduct = dot(span[i], diff);
                if (span_norm_sqr[i] > 0 && dotProduct > 0 && CGU_UINT32(dotProduct * 63.49999) > CGU_UINT32(32 * span_norm_sqr[i]))
                {
                    span[i] = -span[i];
                    swap(endPoint[i][0], endPoint[i][1]);
                }
            }
            CGU_UINT32 p_error[3] = {0, 0, 0};
            for (i = 0; i < 16; i++)
            {
                CGU_UINT32 subset_index = (bits2 >> (i * 2)) & 0x03;
                if (subset_index == 2)
                {
                    diff = shared_temp[threadBase + i].pixel - endPoint[2][0];
                    CGU_INT dotProduct = dot(span[2], diff);
                    color_index[i] = (span_norm_sqr[2] <= 0 || dotProduct <= 0)
                                         ? 0
                                         : ((dotProduct < span_norm_sqr[2]) ? aStep[step_selector][CGU_UINT32(dotProduct * 63.49999 / span_norm_sqr[2])]
                                                                            : aStep[step_selector][63]);
                }
                else if (subset_index == 1)
                {
                    diff = shared_temp[threadBase + i].pixel - endPoint[1][0];
                    CGU_INT dotProduct = dot(span[1], diff);
                    color_index[i] = (span_norm_sqr[1] <= 0 || dotProduct <= 0)
                                         ? 0
                                         : ((dotProduct < span_norm_sqr[1]) ? aStep[step_selector][CGU_UINT32(dotProduct * 63.49999 / span_norm_sqr[1])]
                                                                            : aStep[step_selector][63]);
                }
                else
                {
                    diff = shared_temp[threadBase + i].pixel - endPoint[0][0];
                    CGU_INT dotProduct = dot(span[0], diff);
                    color_index[i] = (span_norm_sqr[0] <= 0 || dotProduct <= 0)
                                         ? 0
                                         : ((dotProduct < span_norm_sqr[0]) ? aStep[step_selector][CGU_UINT32(dotProduct * 63.49999 / span_norm_sqr[0])]
                                                                            : aStep[step_selector][63]);
                }
                pixel_r = (endPoint[subset_index][0] * (64 - aWeight[step_selector][color_index[i]]) +
                           endPoint[subset_index][1] * aWeight[step_selector][color_index[i]] + 32U) >> 6;
                pixel_r.a = 255;
                CGU_Vec4ui pixel = shared_temp[threadBase + i].pixel;
                Ensure_A_Is_Larger(pixel_r, pixel);
                pixel_r -= pixel;
                CGU_UINT32 pixel_error = ComputeError(pixel_r, pixel_r);
                if (subset_index == 2)
                    p_error[2] += pixel_error;
                else if (subset_index == 1)
                    p_error[1] += pixel_error;
                else
                    p_error[0] += pixel_error;
            }
            for (i = 0; i < 3; i++)
            {
                if (p_error[i] < error[i])
                {
                    error[i]   = p_error[i];
                    final_p[i] = p;  // Borrow rotation for p
                }
            }
        }
        shared_temp[GI].error     = error[0] + error[1] + error[2];
        shared_temp[GI].partition = partition;
        shared_temp[GI].rotation  = (final_p[2] << 4) | (final_p[1] << 2) | final_p[0];
    }
    GroupSync();
    if (threadInBlock < 32)
    {
        if (shared_temp[GI].error > shared_temp[GI + 32].error)
        {
            shared_temp[GI].error     = shared_temp[GI + 32].error;
            shared_temp[GI].partition = shared_temp[GI + 32].partition;
            shared_temp[GI].rotation  = shared_temp[GI + 32].rotation;
        }
    }
    GroupSync();
    if (threadInBlock < 16)
    {
        if (shared_temp[GI].error > shared_temp[GI + 16].error)
        {
            shared_temp[GI].error     = shared_temp[GI + 16].error;
            shared_temp[GI].partition = shared_temp[GI + 16].partition;
            shared_temp[GI].rotation  = shared_temp[GI + 16].rotation;
        }
    }
    GroupSync();
    if (threadInBlock < 8)
    {
        if (shared_temp[GI].error > shared_temp[GI + 8].error)
        {
            shared_temp[GI].error     = shared_temp[GI + 8].error;
            shared_temp[GI].partition = shared_temp[GI + 8].partition;
            shared_temp[GI].rotation  = shared_temp[GI + 8].rotation;
        }
    }
    GroupSync();
    if (threadInBlock < 4)
    {
        if (shared_temp[GI].error > shared_temp[GI + 4].error)
        {
            shared_temp[GI].error     = shared_temp[GI + 4].error;
            shared_temp[GI].partition = shared_temp[GI + 4].partition;
            shared_temp[GI].rotation  = shared_temp[GI + 4].rotation;
        }
    }
    GroupSync();
    if (threadInBlock < 2)
    {
        if (shared_temp[GI].error > shared_temp[GI + 2].error)
        {
            shared_temp[GI].error     = shared_temp[GI + 2].error;
            shared_temp[GI].partition = shared_temp[GI + 2].partition;
            shared_temp[GI].rotation  = shared_temp[GI + 2].rotation;
        }
    }
    GroupSync();
    if (threadInBlock < 1)
    {
        if (shared_temp[GI].error > shared_temp[GI + 1].error)
        {
            shared_temp[GI].error     = shared_temp[GI + 1].error;
            shared_temp[GI].partition = shared_temp[GI + 1].partition;
            shared_temp[GI].rotation  = shared_temp[GI + 1].rotation;
        }
        if (g_InBuff[blockID].error > shared_temp[GI].error)
        {
            g_OutBuff1[blockID].error     = shared_temp[GI].error;
            g_OutBuff1[blockID].mode      = g_mode_id;
            g_OutBuff1[blockID].partition = shared_temp[GI].partition;
            g_OutBuff1[blockID].rotation  = shared_temp[GI].rotation;
            g_OutBuff1[blockID].data2     = 0;
        }
        else
        {
            g_OutBuff1[blockID].error          = g_InBuff[blockID].error;
            g_OutBuff1[blockID].mode           = g_InBuff[blockID].mode;
            g_OutBuff1[blockID].partition      = g_InBuff[blockID].partition;
            g_OutBuff1[blockID].index_selector = g_InBuff[blockID].index_selector;
            g_OutBuff1[blockID].rotation       = g_InBuff[blockID].rotation;
            g_OutBuff1[blockID].data2          = g_InBuff[blockID].data2;
        }
    }
#endif
}
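//=================================================
// EncodeBlocks: final pass. Re-derives endpoints
// and indices for the winning mode / partition /
// rotation recorded in g_InBuff, then thread 0
// packs the 128-bit BC7 block (or copies the
// precomputed CMP mode 6 block when the 0x10 flag
// is set on the stored mode).
//=================================================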
CMP_NUMTHREADS(THREAD_GROUP_SIZE, 1, 1) void EncodeBlocks(CGU_UINT32 GI CMP_SVGROUPINDEX, CGU_Vec3ui groupID CMP_SVGROUPID)
{
    CMP_CONSTANT CGU_UINT32 MAX_USED_THREAD = 16;
    CGU_UINT32 BLOCK_IN_GROUP = THREAD_GROUP_SIZE / MAX_USED_THREAD;
    CGU_UINT32 blockInGroup   = GI / MAX_USED_THREAD;
    CGU_UINT32 blockID        = g_start_block_id + groupID.x * BLOCK_IN_GROUP + blockInGroup;
    CGU_UINT32 threadBase     = blockInGroup * MAX_USED_THREAD;
    CGU_UINT32 threadInBlock  = GI - threadBase;
    CGU_UINT32 block_y        = blockID / g_num_block_x;
    CGU_UINT32 block_x        = blockID - block_y * g_num_block_x;
    CGU_UINT32 base_x         = block_x * BLOCK_SIZE_X;
    CGU_UINT32 base_y         = block_y * BLOCK_SIZE_Y;
    CGU_UINT32 use_cmp             = g_InBuff[blockID].mode & 0x10;
    CGU_UINT32 best_mode           = g_InBuff[blockID].mode & 0x07;
    CGU_UINT32 best_partition      = g_InBuff[blockID].partition;
    CGU_UINT32 best_index_selector = g_InBuff[blockID].index_selector;
    CGU_UINT32 best_rotation       = g_InBuff[blockID].rotation;
    if (threadInBlock < 16)
    {
        CGU_Vec4f px = g_Input.Load(CGU_Vec3ui(base_x + threadInBlock % 4, base_y + threadInBlock / 4, 0)) * 255.0f;
        px = clamp(px, 0.0f, 255.0f);
        CGU_Vec4ui pixel;
        pixel.r = (CGU_UINT32)px.r;
        pixel.g = (CGU_UINT32)px.g;
        pixel.b = (CGU_UINT32)px.b;
        pixel.a = (CGU_UINT32)px.a;
        if ((4 == best_mode) || (5 == best_mode))
            set_pixel_rotation(pixel, best_rotation);
        shared_temp[GI].pixel = pixel;
    }
    GroupSync();
    CGU_UINT32 bits  = blockPartitions[best_partition];
    CGU_UINT32 bits2 = blockPartitions2[best_partition - 64];
    CGU_Vec4ui ep[2];
    ep[0] = MAX_UINT;
    ep[1] = MIN_UINT;
    CGU_Vec4ui ep_quantized[2];
    CGU_Vec3ui diff3;
    CGU_Vec4ui diff4;
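    // For each subset (ii = 2, 1, 0) gather that subset's bounding box: each
    // pixel thread contributes its pixel only if it belongs to the subset, the
    // group min/max-reduces the candidates, and the thread whose index equals
    // the subset id keeps the result in ep[].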
    CMP_UNROLL for (CGU_INT ii = 2; ii >= 0; --ii)
    {
        if (threadInBlock < 16)
        {
            CGU_Vec4ui epTemp[2];
            epTemp[0] = MAX_UINT;
            epTemp[1] = MIN_UINT;
            CGU_Vec4ui pixel = shared_temp[GI].pixel;
            CGU_UINT32 subset_index  = (bits >> threadInBlock) & 0x01;
            CGU_UINT32 subset_index2 = (bits2 >> (threadInBlock * 2)) & 0x03;
            if (0 == ii)
            {
                if ((0 == best_mode) || (2 == best_mode))
                {
                    if (0 == subset_index2)
                    {
                        epTemp[0] = epTemp[1] = pixel;
                    }
                }
                else if ((1 == best_mode) || (3 == best_mode) || (7 == best_mode))
                {
                    if (0 == subset_index)
                    {
                        epTemp[0] = epTemp[1] = pixel;
                    }
                }
                else if ((4 == best_mode) || (5 == best_mode) || (6 == best_mode))
                {
                    epTemp[0] = epTemp[1] = pixel;
                }
            }
            else if (1 == ii)
            {
                if ((0 == best_mode) || (2 == best_mode))
                {
                    if (1 == subset_index2)
                    {
                        epTemp[0] = epTemp[1] = pixel;
                    }
                }
                else if ((1 == best_mode) || (3 == best_mode) || (7 == best_mode))
                {
                    if (1 == subset_index)
                    {
                        epTemp[0] = epTemp[1] = pixel;
                    }
                }
            }
            else
            {
                if ((0 == best_mode) || (2 == best_mode))
                {
                    if (2 == subset_index2)
                    {
                        epTemp[0] = epTemp[1] = pixel;
                    }
                }
            }
            shared_temp[GI].endPoint_low  = epTemp[0];
            shared_temp[GI].endPoint_high = epTemp[1];
        }
        GroupSync();
        if (threadInBlock < 8)
        {
            shared_temp[GI].endPoint_low  = cmp_min(shared_temp[GI].endPoint_low, shared_temp[GI + 8].endPoint_low);
            shared_temp[GI].endPoint_high = cmp_max(shared_temp[GI].endPoint_high, shared_temp[GI + 8].endPoint_high);
        }
        GroupSync();
        if (threadInBlock < 4)
        {
            shared_temp[GI].endPoint_low  = cmp_min(shared_temp[GI].endPoint_low, shared_temp[GI + 4].endPoint_low);
            shared_temp[GI].endPoint_high = cmp_max(shared_temp[GI].endPoint_high, shared_temp[GI + 4].endPoint_high);
        }
        GroupSync();
        if (threadInBlock < 2)
        {
            shared_temp[GI].endPoint_low  = cmp_min(shared_temp[GI].endPoint_low, shared_temp[GI + 2].endPoint_low);
            shared_temp[GI].endPoint_high = cmp_max(shared_temp[GI].endPoint_high, shared_temp[GI + 2].endPoint_high);
        }
        GroupSync();
        if (threadInBlock < 1)
        {
            shared_temp[GI].endPoint_low  = cmp_min(shared_temp[GI].endPoint_low, shared_temp[GI + 1].endPoint_low);
            shared_temp[GI].endPoint_high = cmp_max(shared_temp[GI].endPoint_high, shared_temp[GI + 1].endPoint_high);
        }
        GroupSync();
        if (ii == (int)threadInBlock)
        {
            ep[0] = shared_temp[threadBase].endPoint_low;
            ep[1] = shared_temp[threadBase].endPoint_high;
        }
    }
    if (threadInBlock < 3)
    {
        CGU_Vec2ui P;
        if (1 == best_mode)
        {
            P = (best_rotation >> threadInBlock) & 1;
        }
        else
        {
            P = CGU_Vec2ui((best_rotation >> (threadInBlock * 2 + 0)) & 1, (best_rotation >> (threadInBlock * 2 + 1)) & 1);
        }
        if (0 == best_mode)
        {
            compress_endpoints0(ep, ep_quantized, P);
        }
        else if (1 == best_mode)
        {
            compress_endpoints1(ep, ep_quantized, P);
        }
        else if (2 == best_mode)
        {
            compress_endpoints2(ep, ep_quantized);
        }
        else if (3 == best_mode)
        {
            compress_endpoints3(ep, ep_quantized, P);
        }
        else if (4 == best_mode)
        {
            compress_endpoints4(ep, ep_quantized);
        }
        else if (5 == best_mode)
        {
            compress_endpoints5(ep, ep_quantized);
        }
        else if (6 == best_mode)
        {
            compress_endpoints6(ep, ep_quantized, P);
        }
        else  //if (7 == best_mode)
        {
            compress_endpoints7(ep, ep_quantized, P);
        }
        CGU_Vec4i span = cmp_castimp(ep[1] - ep[0]);
        if (best_mode < 4)
            span.w = 0;
        if ((4 == best_mode) || (5 == best_mode))
        {
            if (0 == threadInBlock)
            {
                CGU_Vec2i span_norm_sqr = CGU_Vec2i(dot(span.rgb, span.rgb), span.a * span.a);
                diff3 = shared_temp[threadBase + 0].pixel.rgb - ep[0].rgb;
                CGU_Vec2i dotProduct = CGU_Vec2i(dot(span.rgb, diff3), span.a * (shared_temp[threadBase + 0].pixel.a - ep[0].a));
                if (span_norm_sqr.x > 0 && dotProduct.x > 0 && CGU_UINT32(dotProduct.x * 63.49999) > CGU_UINT32(32 * span_norm_sqr.x))
                {
                    swap(ep[0].rgb, ep[1].rgb);
                    swap(ep_quantized[0].rgb, ep_quantized[1].rgb);
                }
                if (span_norm_sqr.y > 0 && dotProduct.y > 0 && CGU_UINT32(dotProduct.y * 63.49999) > CGU_UINT32(32 * span_norm_sqr.y))
                {
                    swap(ep[0].a, ep[1].a);
                    swap(ep_quantized[0].a, ep_quantized[1].a);
                }
            }
        }
        else  //if ((0 == best_mode) || (2 == best_mode) || (1 == best_mode) || (3 == best_mode) || (7 == best_mode) || (6 == best_mode))
        {
            CGU_INT p;
            if (0 == threadInBlock)
            {
                p = 0;
            }
            else if (1 == threadInBlock)
            {
                p = candidateFixUpIndex1D[best_partition].x;
            }
            else  //if (2 == threadInBlock)
            {
                p = candidateFixUpIndex1D[best_partition].y;
            }
            CGU_INT span_norm_sqr = dot(span, span);
            diff4 = shared_temp[threadBase + p].pixel - ep[0];
            CGU_INT dotProduct = dot(span, diff4);
            if (span_norm_sqr > 0 && dotProduct > 0 && CGU_UINT32(dotProduct * 63.49999) > CGU_UINT32(32 * span_norm_sqr))
            {
                swap(ep[0], ep[1]);
                swap(ep_quantized[0], ep_quantized[1]);
            }
        }
        shared_temp[GI].endPoint_low            = ep[0];
        shared_temp[GI].endPoint_high           = ep[1];
        shared_temp[GI].endPoint_low_quantized  = ep_quantized[0];
        shared_temp[GI].endPoint_high_quantized = ep_quantized[1];
    }
    GroupSync();
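    // With the final endpoints chosen per subset, each pixel thread recomputes
    // its color index (and, for modes 4 and 5, its alpha index) against the
    // endpoints of the subset it belongs to.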
    if (threadInBlock < 16)
    {
        CGU_UINT32 color_index = 0;
        CGU_UINT32 alpha_index = 0;
        CGU_Vec4ui epTemp[2];
        CGU_Vec2ui indexPrec;
        if ((0 == best_mode) || (1 == best_mode))
        {
            indexPrec = 1;
        }
        else if (6 == best_mode)
        {
            indexPrec = 0;
        }
        else if (4 == best_mode)
        {
            if (0 == best_index_selector)
            {
                indexPrec = CGU_Vec2ui(2, 1);
            }
            else
            {
                indexPrec = CGU_Vec2ui(1, 2);
            }
        }
        else
        {
            indexPrec = 2;
        }
        CGU_INT subset_index;
        if ((0 == best_mode) || (2 == best_mode))
        {
            subset_index = (bits2 >> (threadInBlock * 2)) & 0x03;
        }
        else if ((1 == best_mode) || (3 == best_mode) || (7 == best_mode))
        {
            subset_index = (bits >> threadInBlock) & 0x01;
        }
        else
        {
            subset_index = 0;
        }
        epTemp[0] = shared_temp[threadBase + subset_index].endPoint_low;
        epTemp[1] = shared_temp[threadBase + subset_index].endPoint_high;
        CGU_Vec4i span = cmp_castimp(epTemp[1] - epTemp[0]);
        if (best_mode < 4)
        {
            span.w = 0;
        }
        if ((4 == best_mode) || (5 == best_mode))
        {
            CGU_Vec2i span_norm_sqr;
            span_norm_sqr.x = dot(span.rgb, span.rgb);
            span_norm_sqr.y = span.a * span.a;
            diff3 = shared_temp[threadBase + threadInBlock].pixel.rgb - epTemp[0].rgb;
            CGU_INT dotProduct = dot(span.rgb, diff3);
            color_index = (span_norm_sqr.x <= 0 || dotProduct <= 0)
                              ? 0
                              : ((dotProduct < span_norm_sqr.x) ? aStep[indexPrec.x][CGU_UINT32(dotProduct * 63.49999 / span_norm_sqr.x)]
                                                                : aStep[indexPrec.x][63]);
            CGU_UINT32 diffa = shared_temp[threadBase + threadInBlock].pixel.a - epTemp[0].a;
            dotProduct       = dot(span.a, diffa);
            alpha_index = (span_norm_sqr.y <= 0 || dotProduct <= 0)
                              ? 0
                              : ((dotProduct < span_norm_sqr.y) ? aStep[indexPrec.y][CGU_UINT32(dotProduct * 63.49999 / span_norm_sqr.y)]
                                                                : aStep[indexPrec.y][63]);
            if (best_index_selector)
            {
                swap(color_index, alpha_index);
            }
        }
        else
        {
            CGU_INT span_norm_sqr = dot(span, span);
            diff4 = shared_temp[threadBase + threadInBlock].pixel - epTemp[0];
            CGU_INT dotProduct = dot(span, diff4);
            color_index = (span_norm_sqr <= 0 || dotProduct <= 0)
                              ? 0
                              : ((dotProduct < span_norm_sqr) ? aStep[indexPrec.x][CGU_UINT32(dotProduct * 63.49999 / span_norm_sqr)]
                                                              : aStep[indexPrec.x][63]);
        }
        shared_temp[GI].error = color_index;
        shared_temp[GI].mode  = alpha_index;
    }
    GroupSync();
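    // Note: shared_temp[].error and shared_temp[].mode are borrowed above to
    // carry each pixel's color_index and alpha_index into the packing step.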
    if (0 == threadInBlock)
    {
        CGU_Vec4ui blockRed  = {0x001fffc0, 0xfffe0000, 0x00000001, 0x00000000};
        CGU_Vec4ui blockBlue = {0x00000040, 0xfffffff8, 0x00000001, 0x00000000};
        CGU_Vec4ui block     = {0, 0, 0, 0};
        switch (best_mode)
        {
        case 0:
            block_package0(block, best_partition, threadBase);
            //block = blockRed;
            break;
        case 1:
            block_package1(block, best_partition, threadBase);
            //block = blockRed;
            break;
        case 2:
            block_package2(block, best_partition, threadBase);
            //block = blockRed;
            break;
        case 3:
            block_package3(block, best_partition, threadBase);
            //block = blockRed;
            break;
        case 4:
            block_package4(block, best_rotation, best_index_selector, threadBase);
            //block = blockRed;
            break;
        case 5:
            block_package5(block, best_rotation, threadBase);
            //block = blockRed;
            break;
        case 6:
            if (use_cmp)
            {
                block = g_InBuff[blockID].data2;
                //block = blockBlue;
            }
            else
            {
                block_package6(block, threadBase);
                //block = blockRed;
            }
            break;
        case 7:
            block_package7(block, best_partition, threadBase);
            //block = blockRed;
            break;
        default:  // error!
            block = blockRed;
            break;
        }
        g_OutBuff[blockID] = block;
    }
}
//=================================================
// This is a prototype API interface to run on the
// CPU; move it to the GPU when completed.
//=================================================
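// Example usage (a minimal sketch; it assumes the caller has already loaded a
// 4x4 block of RGBA pixels, scaled to the 0..255 range, into src[16] -- the
// names src and quality below are illustrative only):
//
//     CGU_Vec4f  src[16];   // row-major 4x4 block: .x = R, .y = G, .z = B, .w = A
//     CGU_FLOAT  quality = 1.0f;
//     CGU_Vec4ui block   = CompressBlockBC7_CMPMSC(src, quality);
//     // block.x .. block.w hold the four 32-bit words of the 128-bit BC7 block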
CMP_STATIC CGU_Vec4ui CompressBlockBC7_CMPMSC(CMP_IN CGU_Vec4f image_src[16], CMP_IN CGU_FLOAT fquality)
{
    CMP_UNUSED(fquality);
    CGU_Vec4ui cmp = {0, 0, 0, 0};
#ifndef ASPM_HLSL
#ifdef SIMULATE_GPU
    HLSLHost(image_src);
    cmp = g_OutBuff[0];
#else
    CGU_Vec4ui image_srcui[16];
    // Transfer local pixel data over to the shared global
    for (CGU_INT ii = 0; ii < 16; ii++)
    {
        image_srcui[ii].x = image_src[ii].x;
        image_srcui[ii].y = image_src[ii].y;
        image_srcui[ii].z = image_src[ii].z;
        image_srcui[ii].w = image_src[ii].w;
    }
#if defined(ENABLE_CMP_MODE6)
    CGU_Vec4ui epo_code_out[2] = {{0, 0, 0, 0}, {0, 0, 0, 0}};
    CGU_UINT32 best_index_out[16];
    CGU_FLOAT  besterr;
    CGU_FLOAT  err;
    // Fast encode of the block
    besterr = cmp_GetIndexedEndPoints(epo_code_out,
                                      best_index_out,
                                      image_srcui,
                                      15,  // numEntries 0..15 (note: this function was changed from using 16)
                                      0xffffffff);
    CGU_UINT32 index_packed_out[2] = {0, 0};
    cmp_pack4bitindex32(index_packed_out, best_index_out);
#ifdef ENABLE_CMP_REFINE_MODE6_API
    // Refine for better quality
    err = cmp_mode6_optimize_IndexAndEndPoints(epo_code_out,
                                               best_index_out,
                                               image_srcui,  // using shared_temp[].pixel with 0 thread offset
                                               16,  // numEntries
                                               g_modesettings[6].clusters,      // 16,
                                               g_modesettings[6].bits,          // 58,
                                               g_modesettings[6].channels3or4,  // 4,
                                               0.1f);
    cmp_pack4bitindex32(index_packed_out, best_index_out);
#endif
    // encode results
    CGU_UINT32 cmp_out6[4] = {0, 0, 0, 0};
    cmp_encode_mode6(cmp_out6, epo_code_out, index_packed_out);
    cmp.x = cmp_out6[0];
    cmp.y = cmp_out6[1];
    cmp.z = cmp_out6[2];
    cmp.w = cmp_out6[3];
#endif
#if defined(ENABLE_CMP_MODE4) || defined(ENABLE_CMP_MODE5)
    {
        CGU_UINT32 cmp_out[4] = {0, 0, 0, 0};
        Compress_mode45(cmp_out, 4, image_srcui);
        cmp.x = cmp_out[0];
        cmp.y = cmp_out[1];
        cmp.z = cmp_out[2];
        cmp.w = cmp_out[3];
    }
#endif
#if defined(ENABLE_CMP_MODE1)
    {
        CGU_UINT32 cmp_out1[5] = {0, 0, 0, 0, 0};
        cmp_process_mode(cmp_out1, image_srcui, 1);
        cmp.x = cmp_out1[0];
        cmp.y = cmp_out1[1];
        cmp.z = cmp_out1[2];
        cmp.w = cmp_out1[3];
    }
#endif
#endif  // SIMULATE_GPU
#endif  // Not HLSL
    return cmp;
}