bc7_encode_kernel.cpp

//===================================================================================
// Copyright (c) 2021 Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files(the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and / or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions :
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.
//
//==================================================================================
// Ref: GPUOpen-Tools/Compressonator
///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Copyright (c) 2016, Intel Corporation
// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated
// documentation files (the "Software"), to deal in the Software without restriction, including without limitation
// the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to
// permit persons to whom the Software is furnished to do so, subject to the following conditions:
// The above copyright notice and this permission notice shall be included in all copies or substantial portions of
// the Software.
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO
// THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
// TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
// SOFTWARE.
///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
//--------------------------------------
// Common BC7 Header
//--------------------------------------
#include "bc7_encode_kernel.h"

//#define USE_ICMP

#ifndef ASPM_OPENCL
//#define USE_NEW_SINGLE_HEADER_INTERFACES
#endif

#ifdef USE_NEW_SINGLE_HEADER_INTERFACES
#define USE_CMPMSC
//#define USE_MSC
//#define USE_INT
//#define USE_RGBCX_RDO
//#define USE_VOLT
//#define USE_ICBC
#endif
#include "bc7_common_encoder.h"

#ifndef ASPM
//---------------------------------------------
// Predefinitions for GPU and CPU compiled code
//---------------------------------------------
INLINE CGU_INT a_compare(const void* arg1, const void* arg2)
{
    if (((CMP_di*)arg1)->image - ((CMP_di*)arg2)->image > 0)
        return 1;
    if (((CMP_di*)arg1)->image - ((CMP_di*)arg2)->image < 0)
        return -1;
    return 0;
};
#endif

#ifndef ASPM_GPU
CMP_GLOBAL BC7_EncodeRamps BC7EncodeRamps
#ifndef ASPM
    = {0}
#endif
;
//---------------------------------------------
// CPU: Computes max of two float values
//---------------------------------------------
float bc7_maxf(float l1, float r1)
{
    return (l1 > r1 ? l1 : r1);
}
//---------------------------------------------
// CPU: Computes min of two float values
//---------------------------------------------
float bc7_minf(float l1, float r1)
{
    return (l1 < r1 ? l1 : r1);
}
#endif
INLINE CGV_INT shift_right_epocode(CGV_INT v, CGU_INT bits)
{
    return v >> bits;  // (perf warning expected)
}
INLINE CGV_INT expand_epocode(CGV_INT v, CGU_INT bits)
{
    CGV_INT vv = v << (8 - bits);
    return vv + shift_right_epocode(vv, bits);
}
// valid bit range is 0..8
CGU_INT expandbits(CGU_INT bits, CGU_INT v)
{
    return (v << (8 - bits) | v >> (2 * bits - 8));
}
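// Worked example of the endpoint expansion above: expandbits() replicates the
// top bits of a b-bit code into the low bits so the code range maps onto the
// full 0..255 range.
//   expandbits(5, 0x1F) = (0x1F << 3) | (0x1F >> 2) = 0xF8 | 0x07 = 0xFF
//   expandbits(5, 0x10) = (0x10 << 3) | (0x10 >> 2) = 0x80 | 0x04 = 0x84
// expand_epocode() computes the same replication arithmetically:
//   vv = v << (8 - bits); result = vv + (vv >> bits).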
CMP_EXPORT CGU_INT bc7_isa()
{
#ifndef ASPM_GPU
#if defined(ISPC_TARGET_SSE2)
    ASPM_PRINT(("SSE2"));
    return 0;
#elif defined(ISPC_TARGET_SSE4)
    ASPM_PRINT(("SSE4"));
    return 1;
#elif defined(ISPC_TARGET_AVX)
    ASPM_PRINT(("AVX"));
    return 2;
#elif defined(ISPC_TARGET_AVX2)
    ASPM_PRINT(("AVX2"));
    return 3;
#else
    ASPM_PRINT(("CPU"));
#endif
#endif
    return -1;
}
CMP_EXPORT void init_BC7ramps()
{
#ifdef ASPM_GPU
#else
    CMP_STATIC CGU_BOOL g_rampsInitialized = FALSE;
    if (g_rampsInitialized == TRUE)
        return;
    g_rampsInitialized = TRUE;
    BC7EncodeRamps.ramp_init = TRUE;
    //bc7_isa(); ASPM_PRINT((" INIT Ramps\n"));
    CGU_INT bits;
    CGU_INT p1;
    CGU_INT p2;
    CGU_INT clogBC7;
    CGU_INT index;
    CGU_INT j;
    CGU_INT o1;
    CGU_INT o2;
    CGU_INT maxi = 0;

    for (bits = BIT_BASE; bits < BIT_RANGE; bits++)
    {
        for (p1 = 0; p1 < (1 << bits); p1++)
        {
            BC7EncodeRamps.ep_d[BTT(bits)][p1] = expandbits(bits, p1);
        }  //p1
    }  //bits<BIT_RANGE

    for (clogBC7 = LOG_CL_BASE; clogBC7 < LOG_CL_RANGE; clogBC7++)
    {
        for (bits = BIT_BASE; bits < BIT_RANGE; bits++)
        {
#ifdef USE_BC7_RAMP
            for (p1 = 0; p1 < (1 << bits); p1++)
            {
                for (p2 = 0; p2 < (1 << bits); p2++)
                {
                    for (index = 0; index < (1 << clogBC7); index++)
                    {
                        if (index > maxi)
                            maxi = index;
                        BC7EncodeRamps.ramp[(CLT(clogBC7) * 4 * 256 * 256 * 16) + (BTT(bits) * 256 * 256 * 16) + (p1 * 256 * 16) + (p2 * 16) + index] =
                            //floor((CGV_FLOAT)BC7EncodeRamps.ep_d[BTT(bits)][p1] + rampWeights[clogBC7][index] * (CGV_FLOAT)((BC7EncodeRamps.ep_d[BTT(bits)][p2] - BC7EncodeRamps.ep_d[BTT(bits)][p1]))+ 0.5F);
                            floor(BC7EncodeRamps.ep_d[BTT(bits)][p1] +
                                  rampWeights[clogBC7][index] * ((BC7EncodeRamps.ep_d[BTT(bits)][p2] - BC7EncodeRamps.ep_d[BTT(bits)][p1])) + 0.5F);
                    }  //index<(1 << clogBC7)
                }  //p2<(1 << bits)
            }  //p1<(1 << bits)
#endif
#ifdef USE_BC7_SP_ERR_IDX
            for (j = 0; j < 256; j++)
            {
                for (o1 = 0; o1 < 2; o1++)
                {
                    for (o2 = 0; o2 < 2; o2++)
                    {
                        for (index = 0; index < 16; index++)
                        {
                            BC7EncodeRamps.sp_idx[(CLT(clogBC7) * 4 * 256 * 2 * 2 * 16 * 2) + (BTT(bits) * 256 * 2 * 2 * 16 * 2) + (j * 2 * 2 * 16 * 2) +
                                                  (o1 * 2 * 16 * 2) + (o2 * 16 * 2) + (index * 2) + 0] = 0;
                            BC7EncodeRamps.sp_idx[(CLT(clogBC7) * 4 * 256 * 2 * 2 * 16 * 2) + (BTT(bits) * 256 * 2 * 2 * 16 * 2) + (j * 2 * 2 * 16 * 2) +
                                                  (o1 * 2 * 16 * 2) + (o2 * 16 * 2) + (index * 2) + 1] = 255;
                            BC7EncodeRamps.sp_err[(CLT(clogBC7) * 4 * 256 * 2 * 2 * 16) + (BTT(bits) * 256 * 2 * 2 * 16) + (j * 2 * 2 * 16) + (o1 * 2 * 16) +
                                                  (o2 * 16) + index] = 255;
                        }  // i<16
                    }  //o2<2
                }  //o1<2
            }  //j<256

            for (p1 = 0; p1 < (1 << bits); p1++)
            {
                for (p2 = 0; p2 < (1 << bits); p2++)
                {
                    for (index = 0; index < (1 << clogBC7); index++)
                    {
#ifdef USE_BC7_RAMP
                        CGV_INT floatf =
                            (CGV_INT)
                                BC7EncodeRamps.ramp[(CLT(clogBC7) * 4 * 256 * 256 * 16) + (BTT(bits) * 256 * 256 * 16) + (p1 * 256 * 16) + (p2 * 16) + index];
#else
                        CGV_INT floatf =
                            floor((CGV_FLOAT)BC7EncodeRamps.ep_d[BTT(bits)][p1] +
                                  rampWeights[clogBC7][index] * (CGV_FLOAT)((BC7EncodeRamps.ep_d[BTT(bits)][p2] - BC7EncodeRamps.ep_d[BTT(bits)][p1])) + 0.5F);
#endif
                        BC7EncodeRamps.sp_idx[(CLT(clogBC7) * 4 * 256 * 2 * 2 * 16 * 2) + (BTT(bits) * 256 * 2 * 2 * 16 * 2) + (floatf * 2 * 2 * 16 * 2) +
                                              ((p1 & 0x1) * 2 * 16 * 2) + ((p2 & 0x1) * 16 * 2) + (index * 2) + 0] = p1;
                        BC7EncodeRamps.sp_idx[(CLT(clogBC7) * 4 * 256 * 2 * 2 * 16 * 2) + (BTT(bits) * 256 * 2 * 2 * 16 * 2) + (floatf * 2 * 2 * 16 * 2) +
                                              ((p1 & 0x1) * 2 * 16 * 2) + ((p2 & 0x1) * 16 * 2) + (index * 2) + 1] = p2;
                        BC7EncodeRamps.sp_err[(CLT(clogBC7) * 4 * 256 * 2 * 2 * 16) + (BTT(bits) * 256 * 2 * 2 * 16) + (floatf * 2 * 2 * 16) +
                                              ((p1 & 0x1) * 2 * 16) + ((p2 & 0x1) * 16) + index] = 0;
                    }  //i<(1 << clogBC7)
                }  //p2
            }  //p1<(1 << bits)

            for (j = 0; j < 256; j++)
            {
                for (o1 = 0; o1 < 2; o1++)
                {
                    for (o2 = 0; o2 < 2; o2++)
                    {
                        for (index = 0; index < (1 << clogBC7); index++)
                        {
                            if (  // check for uninitialized sp_idx
                                (BC7EncodeRamps.sp_idx[(CLT(clogBC7) * 4 * 256 * 2 * 2 * 16 * 2) + (BTT(bits) * 256 * 2 * 2 * 16 * 2) + (j * 2 * 2 * 16 * 2) +
                                                       (o1 * 2 * 16 * 2) + (o2 * 16 * 2) + (index * 2) + 0] == 0) &&
                                (BC7EncodeRamps.sp_idx[(CLT(clogBC7) * 4 * 256 * 2 * 2 * 16 * 2) + (BTT(bits) * 256 * 2 * 2 * 16 * 2) + (j * 2 * 2 * 16 * 2) +
                                                       (o1 * 2 * 16 * 2) + (o2 * 16 * 2) + (index * 2) + 1] == 255))
                            {
                                CGU_INT k;
                                CGU_INT tf;
                                CGU_INT tc;
                                for (k = 1; k < 256; k++)
                                {
                                    tf = j - k;
                                    tc = j + k;
                                    if ((tf >= 0 && BC7EncodeRamps.sp_err[(CLT(clogBC7) * 4 * 256 * 2 * 2 * 16) + (BTT(bits) * 256 * 2 * 2 * 16) +
                                                                          (tf * 2 * 2 * 16) + (o1 * 2 * 16) + (o2 * 16) + index] == 0))
                                    {
                                        BC7EncodeRamps.sp_idx[(CLT(clogBC7) * 4 * 256 * 2 * 2 * 16 * 2) + (BTT(bits) * 256 * 2 * 2 * 16 * 2) +
                                                              (j * 2 * 2 * 16 * 2) + (o1 * 2 * 16 * 2) + (o2 * 16 * 2) + (index * 2) + 0] =
                                            BC7EncodeRamps.sp_idx[(CLT(clogBC7) * 4 * 256 * 2 * 2 * 16 * 2) + (BTT(bits) * 256 * 2 * 2 * 16 * 2) +
                                                                  (tf * 2 * 2 * 16 * 2) + (o1 * 2 * 16 * 2) + (o2 * 16 * 2) + (index * 2) + 0];
                                        BC7EncodeRamps.sp_idx[(CLT(clogBC7) * 4 * 256 * 2 * 2 * 16 * 2) + (BTT(bits) * 256 * 2 * 2 * 16 * 2) +
                                                              (j * 2 * 2 * 16 * 2) + (o1 * 2 * 16 * 2) + (o2 * 16 * 2) + (index * 2) + 1] =
                                            BC7EncodeRamps.sp_idx[(CLT(clogBC7) * 4 * 256 * 2 * 2 * 16 * 2) + (BTT(bits) * 256 * 2 * 2 * 16 * 2) +
                                                                  (tf * 2 * 2 * 16 * 2) + (o1 * 2 * 16 * 2) + (o2 * 16 * 2) + (index * 2) + 1];
                                        break;
                                    }
                                    else if ((tc < 256 && BC7EncodeRamps.sp_err[(CLT(clogBC7) * 4 * 256 * 2 * 2 * 16) + (BTT(bits) * 256 * 2 * 2 * 16) +
                                                                                (tc * 2 * 2 * 16) + (o1 * 2 * 16) + (o2 * 16) + index] == 0))
                                    {
                                        BC7EncodeRamps.sp_idx[(CLT(clogBC7) * 4 * 256 * 2 * 2 * 16 * 2) + (BTT(bits) * 256 * 2 * 2 * 16 * 2) +
                                                              (j * 2 * 2 * 16 * 2) + (o1 * 2 * 16 * 2) + (o2 * 16 * 2) + (index * 2) + 0] =
                                            BC7EncodeRamps.sp_idx[(CLT(clogBC7) * 4 * 256 * 2 * 2 * 16 * 2) + (BTT(bits) * 256 * 2 * 2 * 16 * 2) +
                                                                  (tc * 2 * 2 * 16 * 2) + (o1 * 2 * 16 * 2) + (o2 * 16 * 2) + (index * 2) + 0];
                                        break;
                                    }
                                }
                                //BC7EncodeRamps.sp_err[(CLT(clogBC7)*4*256*2*2*16)+(BTT(bits)*256*2*2*16)+(j*2*2*16)+(o1*2*16)+(o2*16)+index] = (CGV_FLOAT) k;
                                BC7EncodeRamps.sp_err[(CLT(clogBC7) * 4 * 256 * 2 * 2 * 16) + (BTT(bits) * 256 * 2 * 2 * 16) + (j * 2 * 2 * 16) +
                                                      (o1 * 2 * 16) + (o2 * 16) + index] = (CGU_UINT8)k;
                            }  //sp_idx < 0
                        }  //i<(1 << clogBC7)
                    }  //o2
                }  //o1
            }  //j
#endif
        }  //bits<BIT_RANGE
    }  //clogBC7<LOG_CL_RANGE
#endif
}
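// Layout of the tables built above: ramp[], sp_idx[], and sp_err[] are
// flattened multi-dimensional arrays addressed with explicit strides.
// Per CLT(clog) slice:
//   ramp   : [BTT(bits)][p1][p2][index]              dims 4 x 256 x 256 x 16
//   sp_idx : [BTT(bits)][value][o1][o2][index][pair] dims 4 x 256 x 2 x 2 x 16 x 2
//   sp_err : [BTT(bits)][value][o1][o2][index]       dims 4 x 256 x 2 x 2 x 16
// For example, ramp[(CLT(c) * 4*256*256*16) + (BTT(b) * 256*256*16) +
// (p1 * 256*16) + (p2 * 16) + i] is the interpolated value for endpoints
// p1,p2 at ramp position i. The final j-loop backfills every (value, parity)
// slot that no endpoint pair produced with its nearest reachable neighbour,
// recording the search distance k in sp_err.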
//----------------------------------------------------------
//====== Common BC7 ASPM Code used for SPMD (CPU/GPU) ======
//----------------------------------------------------------
#define SOURCE_BLOCK_SIZE 16      // Size of a source block in pixels (each pixel has RGBA:8888 channels)
#define COMPRESSED_BLOCK_SIZE 16  // Size of a compressed block in bytes
#define MAX_CHANNELS 4
#define MAX_SUBSETS 3       // Maximum number of possible subsets
#define MAX_SUBSET_SIZE 16  // Largest possible size for an individual subset

#ifndef ASPM_GPU
extern "C" CGU_INT timerStart(CGU_INT id);
extern "C" CGU_INT timerEnd(CGU_INT id);
#define TIMERSTART(x) timerStart(x)
#define TIMEREND(x) timerEnd(x)
#else
#define TIMERSTART(x)
#define TIMEREND(x)
#endif

#ifdef ASPM_GPU
#define GATHER_UINT8(x, y) x[y]
#else
#define GATHER_UINT8(x, y) gather_uint8(x, y)
#endif

// INLINE CGV_UINT8 gather_uint8 (CMP_CONSTANT CGU_UINT8 * __constant uniform ptr, CGV_INT idx)
// {
//     return ptr[idx]; // (perf warning expected)
// }
//
// INLINE CGV_UINT8 gather_cmpout(CMP_CONSTANT CGV_UINT8 * __constant uniform ptr, CGU_INT idx)
// {
//     return ptr[idx]; // (perf warning expected)
// }
//
//INLINE CGV_UINT8 gather_index(CMP_CONSTANT varying CGV_UINT8* __constant uniform ptr, CGV_INT idx)
//{
//    return ptr[idx]; // (perf warning expected)
//}
//
//INLINE void scatter_index(CGV_UINT8* ptr, CGV_INT idx, CGV_UINT8 value)
//{
//    ptr[idx] = value; // (perf warning expected)
//}
//
#ifdef USE_VARYING
INLINE CGV_INT gather_epocode(CMP_CONSTANT CGV_INT* ptr, CGV_INT idx)
{
    return ptr[idx];  // (perf warning expected)
}
#endif
INLINE CGV_UINT32 gather_partid(CMP_CONSTANT CGV_UINT32* uniform ptr, CGV_INT idx)
{
    return ptr[idx];  // (perf warning expected)
}
//INLINE CGV_UINT8 gather_vuint8(CMP_CONSTANT varying CGV_UINT8* __constant uniform ptr, CGV_INT idx)
//{
//    return ptr[idx]; // (perf warning expected)
//}
INLINE void cmp_swap_epo(CGV_INT u[], CGV_INT v[], CGV_INT n)
{
    for (CGU_INT i = 0; i < n; i++)
    {
        CGV_INT t = u[i];
        u[i] = v[i];
        v[i] = t;
    }
}
INLINE void cmp_swap_index(CGV_UINT8 u[], CGV_UINT8 v[], CGU_INT n)
{
    for (CGU_INT i = 0; i < n; i++)
    {
        CGV_UINT8 t = u[i];
        u[i] = v[i];
        v[i] = t;
    }
}
void cmp_memsetBC7(CGV_UINT8 ptr[], CGV_UINT8 value, CGU_UINT32 size)
{
    for (CGV_UINT32 i = 0; i < size; i++)
    {
        ptr[i] = value;
    }
}
void cmp_memcpy(CMP_GLOBAL CGU_UINT8 dst[], CGU_UINT8 src[], CGU_UINT32 size)
{
#ifdef ASPM_GPU
    for (CGV_INT i = 0; i < size; i++)
    {
        dst[i] = src[i];
    }
#else
    memcpy(dst, src, size);
#endif
}
INLINE CGV_FLOAT sq_image(CGV_FLOAT v)
{
    return v * v;
}
INLINE CGV_INT clampEPO(CGV_INT v, CGV_INT a, CGV_INT b)
{
    if (v < a)
        return a;
    else if (v > b)
        return b;
    return v;
}
INLINE CGV_UINT8 clampIndex(CGV_UINT8 v, CGV_UINT8 a, CGV_UINT8 b)
{
    if (v < a)
        return a;
    else if (v > b)
        return b;
    return v;
}
INLINE CGV_UINT32 shift_right_uint32(CGV_UINT32 v, CGU_INT bits)
{
    return v >> bits;  // (perf warning expected)
}
INLINE CGV_UINT8 shift_right_uint8(CGV_UINT8 v, CGU_UINT8 bits)
{
    return v >> bits;  // (perf warning expected)
}
INLINE CGV_UINT8 shift_right_uint8V(CGV_UINT8 v, CGV_UINT8 bits)
{
    return v >> bits;  // (perf warning expected)
}
// valid bit range is 0..8
INLINE CGV_INT expandEPObits(CGV_INT v, uniform CGV_INT bits)
{
    CGV_INT vv = v << (8 - bits);
    return vv + shift_right_uint32(vv, bits);
}
CGV_FLOAT err_absf(CGV_FLOAT a)
{
    return a > 0.0F ? a : -a;
}
CGV_FLOAT img_absf(CGV_FLOAT a)
{
    return a > 0.0F ? a : -a;
}
CGU_UINT8 min8(CGU_UINT8 a, CGU_UINT8 b)
{
    return a < b ? a : b;
}
CGU_UINT8 max8(CGU_UINT8 a, CGU_UINT8 b)
{
    return a > b ? a : b;
}
void pack_index(CGV_UINT32 packed_index[2], CGV_UINT8 src_index[MAX_SUBSET_SIZE])
{
    // Converts from unpacked index to packed index
    packed_index[0] = 0x0000;
    packed_index[1] = 0x0000;
    CGV_UINT8 shift = 0;
    for (CGU_INT k = 0; k < 16; k++)
    {
        packed_index[k / 8] |= (CGV_UINT32)(src_index[k] & 0x0F) << shift;
        if (k == 7)
            shift = 0;  // restart the shift for the second 32-bit word (a shift of 32+ on a 32-bit value is undefined)
        else
            shift += 4;
    }
}
void unpack_index(CGV_UINT8 unpacked_index[MAX_SUBSET_SIZE], CGV_UINT32 src_packed[2])
{
    // Converts from packed index to unpacked index
    CGV_UINT8 shift = 0;
    for (CGV_UINT8 k = 0; k < 16; k++)
    {
        unpacked_index[k] = (CGV_UINT8)(src_packed[k / 8] >> shift) & 0xF;
        if (k == 7)
            shift = 0;
        else
            shift += 4;
    }
}
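// Index packing example: sixteen 4-bit indices are stored as two 32-bit
// words, eight indices per word, lowest index in the lowest nibble. With
//   src_index = {1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,0}
// pack_index() produces
//   packed_index[0] = 0x87654321
//   packed_index[1] = 0x0FEDCBA9
// and unpack_index() reverses the mapping exactly.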
//====================================== CMP MATH UTILS ============================================
CGV_FLOAT err_Total(CGV_FLOAT image_src1[SOURCE_BLOCK_SIZE * MAX_CHANNELS],
                    CGV_FLOAT image_src2[SOURCE_BLOCK_SIZE * MAX_CHANNELS],
                    CGV_INT numEntries,  // < 16
                    CGU_UINT8 channels3or4)
{  // IN: 3 = RGB or 4 = RGBA (4 = MAX_CHANNELS)
    CGV_FLOAT err_t = 0.0F;
    for (CGU_UINT8 ch = 0; ch < channels3or4; ch++)
        for (CGV_INT k = 0; k < numEntries; k++)
        {
            err_t = err_t + sq_image(image_src1[k + ch * SOURCE_BLOCK_SIZE] - image_src2[k + ch * SOURCE_BLOCK_SIZE]);
        }
    return err_t;
};
void GetImageCentered(CGV_FLOAT image_centered_out[SOURCE_BLOCK_SIZE * MAX_CHANNELS],
                      CGV_FLOAT mean_out[MAX_CHANNELS],
                      CGV_FLOAT image_src[SOURCE_BLOCK_SIZE * MAX_CHANNELS],
                      CGV_INT numEntries,  // < 16
                      CGU_UINT8 channels3or4)
{  // IN: 3 = RGB or 4 = RGBA (4 = MAX_CHANNELS)
    for (CGU_UINT8 ch = 0; ch < channels3or4; ch++)
    {
        mean_out[ch] = 0.0F;
        if (numEntries > 0)
        {
            for (CGV_INT k = 0; k < numEntries; k++)
            {
                mean_out[ch] = mean_out[ch] + image_src[k + (ch * SOURCE_BLOCK_SIZE)];
            }
            mean_out[ch] /= numEntries;
            for (CGV_INT k = 0; k < numEntries; k++)
                image_centered_out[k + (ch * SOURCE_BLOCK_SIZE)] = image_src[k + (ch * SOURCE_BLOCK_SIZE)] - mean_out[ch];
        }
    }
}
void GetCovarianceVector(CGV_FLOAT covariance_out[MAX_CHANNELS * MAX_CHANNELS],  // OUT: Covariance vector
                         CGV_FLOAT image_centered[SOURCE_BLOCK_SIZE * MAX_CHANNELS],
                         CGV_INT numEntries,  // < 16
                         CGU_UINT8 channels3or4)
{  // IN: 3 = RGB or 4 = RGBA (4 = MAX_CHANNELS)
    for (CGU_UINT8 ch1 = 0; ch1 < channels3or4; ch1++)
        for (CGU_UINT8 ch2 = 0; ch2 <= ch1; ch2++)
        {
            covariance_out[ch1 + ch2 * 4] = 0;
            for (CGV_INT k = 0; k < numEntries; k++)
                covariance_out[ch1 + ch2 * 4] += image_centered[k + (ch1 * SOURCE_BLOCK_SIZE)] * image_centered[k + (ch2 * SOURCE_BLOCK_SIZE)];
        }
    for (CGU_UINT8 ch1 = 0; ch1 < channels3or4; ch1++)
        for (CGU_UINT8 ch2 = ch1 + 1; ch2 < channels3or4; ch2++)
            covariance_out[ch1 + ch2 * 4] = covariance_out[ch2 + ch1 * 4];
}
void GetProjecedImage(CGV_FLOAT projection_out[SOURCE_BLOCK_SIZE],  //output projected data
                      CGV_FLOAT image_centered[SOURCE_BLOCK_SIZE * MAX_CHANNELS],
                      CGV_INT numEntries,  // < 16
                      CGV_FLOAT EigenVector[MAX_CHANNELS],
                      CGU_UINT8 channels3or4)
{  // 3 = RGB or 4 = RGBA
    projection_out[0] = 0.0F;
    // EigenVector must be normalized
    for (CGV_INT k = 0; k < numEntries; k++)
    {
        projection_out[k] = 0.0F;
        for (CGU_UINT8 ch = 0; ch < channels3or4; ch++)
        {
            projection_out[k] = projection_out[k] + (image_centered[k + (ch * SOURCE_BLOCK_SIZE)] * EigenVector[ch]);
        }
    }
}
INLINE CGV_UINT8 get_partition_subset(CGV_INT part_id, CGU_INT maxSubsets, CGV_INT index)
{
    if (maxSubsets == 2)
    {
        CGV_UINT32 mask_packed = subset_mask_table[part_id];
        return ((mask_packed & (0x01 << index)) ? 1 : 0);  // This can be moved to caller, just return mask!!
    }
    // 3 region subsets
    part_id += 64;
    CGV_UINT32 mask0 = subset_mask_table[part_id] & 0xFFFF;
    CGV_UINT32 mask1 = subset_mask_table[part_id] >> 16;
    CGV_UINT32 mask = 0x01 << index;
    return ((mask1 & mask) ? 2 : ((mask0 & mask) ? 1 : 0));  // This can be moved to caller, just return mask!!
}
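// Partition mask layout: subset_mask_table[] holds one entry per partition
// shape. For two-subset shapes the entry is a 16-bit mask where bit i gives
// the subset (0 or 1) of pixel i. Three-subset shapes live at table offset
// +64 and pack two 16-bit masks into one 32-bit word: the low half selects
// subset 1 and the high half selects subset 2; pixels in neither mask belong
// to subset 0.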
void GetPartitionSubSet_mode01237(CGV_FLOAT subsets_out[MAX_SUBSETS][SOURCE_BLOCK_SIZE][MAX_CHANNELS],  // OUT: Subset pattern mapped with image src colors
                                  CGV_INT entryCount_out[MAX_SUBSETS],  // OUT: Number of entries per subset
                                  CGV_UINT8 partition,                  // Partition Shape 0..63
                                  CGV_FLOAT image_src[SOURCE_BLOCK_SIZE * MAX_CHANNELS],  // Image colors
                                  CGU_INT blockMode,                    // [0,1,2,3 or 7]
                                  CGU_UINT8 channels3or4)
{  // 3 = RGB or 4 = RGBA (4 = MAX_CHANNELS)
    CGU_UINT8 maxSubsets = 2;
    if (blockMode == 0 || blockMode == 2)
        maxSubsets = 3;
    entryCount_out[0] = 0;
    entryCount_out[1] = 0;
    entryCount_out[2] = 0;
    for (CGV_INT i = 0; i < MAX_SUBSET_SIZE; i++)
    {
        CGV_UINT8 subset = get_partition_subset(partition, maxSubsets, i);
        for (CGU_INT ch = 0; ch < 3; ch++)
            subsets_out[subset][entryCount_out[subset]][ch] = image_src[i + (ch * SOURCE_BLOCK_SIZE)];
        //subsets_out[subset*64+(entryCount_out[subset]*MAX_CHANNELS+ch)] = image_src[i+(ch*SOURCE_BLOCK_SIZE)];
        // if we have only 3 channels then set the alpha subset to 0
        if (channels3or4 == 3)
            subsets_out[subset][entryCount_out[subset]][3] = 0.0F;
        else
            subsets_out[subset][entryCount_out[subset]][3] = image_src[i + (COMP_ALPHA * SOURCE_BLOCK_SIZE)];
        entryCount_out[subset]++;
    }
}
INLINE void GetClusterMean(CGV_FLOAT cluster_mean_out[SOURCE_BLOCK_SIZE][MAX_CHANNELS],
                           CGV_FLOAT image_src[SOURCE_BLOCK_SIZE * MAX_CHANNELS],
                           CGV_UINT8 index_in[MAX_SUBSET_SIZE],
                           CGV_INT numEntries,  // < 16
                           CGU_UINT8 channels3or4)
{  // IN: 3 = RGB or 4 = RGBA (4 = MAX_CHANNELS)
    // unused index values are undefined
    CGV_UINT8 i_cnt[MAX_SUBSET_SIZE];
    CGV_UINT8 i_comp[MAX_SUBSET_SIZE];
    for (CGV_INT i = 0; i < numEntries; i++)
        for (CGU_UINT8 ch = 0; ch < channels3or4; ch++)
        {
            CGV_UINT8 idx = index_in[i] & 0x0F;
            cluster_mean_out[idx][ch] = 0;
            i_cnt[idx] = 0;
        }
    CGV_UINT8 ic = 0;  // was CGV_INT
    for (CGV_INT i = 0; i < numEntries; i++)
    {
        CGV_UINT8 idx = index_in[i] & 0x0F;
        if (i_cnt[idx] == 0)
            i_comp[ic++] = idx;
        i_cnt[idx]++;
        for (CGU_UINT8 ch = 0; ch < channels3or4; ch++)
        {
            cluster_mean_out[idx][ch] += image_src[i + (ch * SOURCE_BLOCK_SIZE)];
        }
    }
    for (CGU_UINT8 ch = 0; ch < channels3or4; ch++)
        for (CGU_INT i = 0; i < ic; i++)
        {
            if (i_cnt[i_comp[i]] != 0)
            {
                CGV_UINT8 icmp = i_comp[i];
                cluster_mean_out[icmp][ch] = (CGV_FLOAT)floor((cluster_mean_out[icmp][ch] / (CGV_FLOAT)i_cnt[icmp]) + 0.5F);
            }
        }
}
INLINE void GetImageMean(CGV_FLOAT image_mean_out[SOURCE_BLOCK_SIZE * MAX_CHANNELS],
                         CGV_FLOAT image_src[SOURCE_BLOCK_SIZE * MAX_CHANNELS],
                         CGV_INT numEntries,
                         CGU_UINT8 channels)
{
    for (CGU_UINT8 ch = 0; ch < channels; ch++)
        image_mean_out[ch] = 0;
    for (CGV_INT i = 0; i < numEntries; i++)
        for (CGU_UINT8 ch = 0; ch < channels; ch++)
            image_mean_out[ch] += image_src[i + ch * SOURCE_BLOCK_SIZE];
    for (CGU_UINT8 ch = 0; ch < channels; ch++)
        image_mean_out[ch] /= (CGV_FLOAT)numEntries;  // Performance Warning: Conversion from unsigned int to float is slow. Use "int" if possible
}
// Calculates an eigenvector corresponding to the largest eigenvalue.
// Works for non-zero, non-negative matrices only.
void GetEigenVector(CGV_FLOAT EigenVector_out[MAX_CHANNELS],  // Normalized Eigen Vector output
                    CGV_FLOAT CovarianceVector[MAX_CHANNELS * MAX_CHANNELS],  // Covariance Vector
                    CGU_UINT8 channels3or4)
{  // IN: 3 = RGB or 4 = RGBA
    CGV_FLOAT vector_covIn[MAX_CHANNELS * MAX_CHANNELS];
    CGV_FLOAT vector_covOut[MAX_CHANNELS * MAX_CHANNELS];
    CGV_FLOAT vector_maxCovariance;
    for (CGU_UINT8 ch1 = 0; ch1 < channels3or4; ch1++)
        for (CGU_UINT8 ch2 = 0; ch2 < channels3or4; ch2++)
        {
            vector_covIn[ch1 + ch2 * 4] = CovarianceVector[ch1 + ch2 * 4];
        }
    vector_maxCovariance = 0;
    for (CGU_UINT8 ch = 0; ch < channels3or4; ch++)
    {
        if (vector_covIn[ch + ch * 4] > vector_maxCovariance)
            vector_maxCovariance = vector_covIn[ch + ch * 4];
    }
    // Normalize Input Covariance Vector
    for (CGU_UINT8 ch1 = 0; ch1 < channels3or4; ch1++)
        for (CGU_UINT8 ch2 = 0; ch2 < channels3or4; ch2++)
        {
            if (vector_maxCovariance > 0)
                vector_covIn[ch1 + ch2 * 4] = vector_covIn[ch1 + ch2 * 4] / vector_maxCovariance;
        }
    for (CGU_UINT8 ch1 = 0; ch1 < channels3or4; ch1++)
    {
        for (CGU_UINT8 ch2 = 0; ch2 < channels3or4; ch2++)
        {
            CGV_FLOAT vector_temp_cov = 0;
            for (CGU_UINT8 ch3 = 0; ch3 < channels3or4; ch3++)
            {
                vector_temp_cov = vector_temp_cov + vector_covIn[ch1 + ch3 * 4] * vector_covIn[ch3 + ch2 * 4];
            }
            vector_covOut[ch1 + ch2 * 4] = vector_temp_cov;
        }
    }
    vector_maxCovariance = 0;
    CGV_INT maxCovariance_channel = 0;
    for (CGU_UINT8 ch = 0; ch < channels3or4; ch++)
    {
        if (vector_covOut[ch + ch * 4] > vector_maxCovariance)
        {
            maxCovariance_channel = ch;
            vector_maxCovariance = vector_covOut[ch + ch * 4];
        }
    }
    CGV_FLOAT vector_t = 0;
    for (CGU_UINT8 ch = 0; ch < channels3or4; ch++)
    {
        vector_t = vector_t + vector_covOut[maxCovariance_channel + ch * 4] * vector_covOut[maxCovariance_channel + ch * 4];
        EigenVector_out[ch] = vector_covOut[maxCovariance_channel + ch * 4];
    }
    // Normalize the Eigen Vector
    vector_t = sqrt(vector_t);
    for (CGU_UINT8 ch = 0; ch < channels3or4; ch++)
    {
        if (vector_t > 0)
            EigenVector_out[ch] = EigenVector_out[ch] / vector_t;
    }
}
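// How the estimate above works: squaring the scaled covariance matrix C is
// equivalent to two steps of power iteration, since each column of C*C is C
// applied to a column of C. The row of C*C with the largest diagonal entry is
// the strongest-responding direction, and after normalization it approximates
// the dominant eigenvector. Exactness is not required here: the vector is
// only used as a projection axis for index selection, and the projection is
// refined again afterwards.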
CGV_UINT8 index_collapse(CGV_UINT8 index[MAX_SUBSET_SIZE], CGV_INT numEntries)
{
    CGV_UINT8 minIndex = index[0];
    CGV_UINT8 MaxIndex = index[0];
    for (CGV_INT k = 1; k < numEntries; k++)
    {
        if (index[k] < minIndex)
            minIndex = index[k];
        if (index[k] > MaxIndex)
            MaxIndex = index[k];
    }
    // Find the largest stride D that evenly divides every (index - minIndex)
    CGV_UINT8 D = 1;
    for (CGV_UINT8 d = 2; d <= MaxIndex - minIndex; d++)
    {
        CGV_INT ent;
        for (ent = 0; ent < numEntries; ent++)
        {
            if ((index[ent] - minIndex) % d != 0)
                break;
        }
        if (ent >= numEntries)
            D = d;
    }
    for (CGV_INT k = 0; k < numEntries; k++)
    {
        index[k] = (index[k] - minIndex) / D;
    }
    MaxIndex = index[0];
    for (CGV_INT k = 1; k < numEntries; k++)
    {
        if (index[k] > MaxIndex)
            MaxIndex = index[k];
    }
    return (MaxIndex);
}
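// Example: index_collapse() maps {2, 4, 6} to {0, 1, 2} and returns 2:
// minIndex = 2, and D = 2 is the largest stride dividing every (index - 2),
// so each entry becomes (index - 2) / 2. Collapsing lets a ramp that only
// uses every D-th step be re-encoded with a shorter index range.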
void sortProjected_indexs(CGV_UINT8 index_ordered[MAX_SUBSET_SIZE],
                          CGV_FLOAT projection[SOURCE_BLOCK_SIZE],
                          CGV_INT numEntries  // max 16
)
{
    CMP_di what[SOURCE_BLOCK_SIZE];
    for (CGV_UINT8 i = 0; i < numEntries; i++)
    {
        what[i].index = i;
        what[i].image = projection[i];
    }
    CGV_UINT8 tmp_index;
    CGV_FLOAT tmp_image;
    for (CGV_INT i = 1; i < numEntries; i++)
    {
        for (CGV_INT j = i; j > 0; j--)
        {
            if (what[j - 1].image > what[j].image)
            {
                tmp_index = what[j].index;
                tmp_image = what[j].image;
                what[j].index = what[j - 1].index;
                what[j].image = what[j - 1].image;
                what[j - 1].index = tmp_index;
                what[j - 1].image = tmp_image;
            }
        }
    }
    for (CGV_INT i = 0; i < numEntries; i++)
        index_ordered[i] = what[i].index;
};
void sortPartitionProjection(CGV_FLOAT projection[MAX_PARTITION_ENTRIES],
                             CGV_UINT8 order[MAX_PARTITION_ENTRIES],
                             CGU_UINT8 numPartitions  // max 64
)
{
    CMP_du what[MAX_PARTITION_ENTRIES];
    for (CGU_UINT8 Parti = 0; Parti < numPartitions; Parti++)
    {
        what[Parti].index = Parti;
        what[Parti].image = projection[Parti];
    }
    CGV_UINT8 index;
    CGV_FLOAT data;
    for (CGU_UINT8 Parti = 1; Parti < numPartitions; Parti++)
    {
        for (CGU_UINT8 Partj = Parti; Partj > 0; Partj--)
        {
            if (what[Partj - 1].image > what[Partj].image)
            {
                index = what[Partj].index;
                data = what[Partj].image;
                what[Partj].index = what[Partj - 1].index;
                what[Partj].image = what[Partj - 1].image;
                what[Partj - 1].index = index;
                what[Partj - 1].image = data;
            }
        }
    }
    for (CGU_UINT8 Parti = 0; Parti < numPartitions; Parti++)
        order[Parti] = what[Parti].index;
};
void cmp_Write8Bit(CGV_UINT8 base[], CGU_INT* uniform offset, CGU_INT bits, CGV_UINT8 bitVal)
{
    base[*offset / 8] |= bitVal << (*offset % 8);
    if (*offset % 8 + bits > 8)
    {
        base[*offset / 8 + 1] |= shift_right_uint8(bitVal, 8 - *offset % 8);
    }
    *offset += bits;
}
void cmp_Write8BitV(CGV_UINT8 base[], CGV_INT offset, CGU_INT bits, CGV_UINT8 bitVal)
{
    base[offset / 8] |= bitVal << (offset % 8);
    if (offset % 8 + bits > 8)
    {
        base[offset / 8 + 1] |= shift_right_uint8V(bitVal, 8 - offset % 8);
    }
}
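// Bit-writer example: the output block is treated as a little-endian bit
// stream, so a field can straddle a byte boundary. Writing bits = 4,
// bitVal = 0xB at offset = 6:
//   base[0] |= 0xB << 6;  // low 2 bits of the field land in byte 0
//   base[1] |= 0xB >> 2;  // remaining 2 bits spill into byte 1
// Callers must pass bitVal with everything above 'bits' already masked off;
// otherwise the spill into the following byte would write stray bits.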
INLINE CGV_INT ep_find_floor(CGV_FLOAT v, CGU_UINT8 bits, CGV_UINT8 use_par, CGV_UINT8 odd)
{
    CGV_INT i1 = 0;
    CGV_INT i2 = 1 << (bits - use_par);
    odd = use_par ? odd : 0;
    while (i2 - i1 > 1)
    {
        CGV_INT j = (i1 + i2) / 2;  // Warning in ASMP code
        CGV_INT ep_d = expandEPObits((j << use_par) + odd, bits);
        if (v >= ep_d)
            i1 = j;
        else
            i2 = j;
    }
    return (i1 << use_par) + odd;
}
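// ep_find_floor() binary-searches the quantized endpoint codes: it returns
// the largest code (carrying the requested parity bit 'odd' when use_par is
// set) whose expanded 8-bit value expandEPObits(code, bits) does not exceed
// v. The search runs over the bits - use_par free bits and the parity bit is
// re-appended to the result, so only codes of the matching parity are
// considered.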
//==========================================================
// Not used for Modes 4&5
INLINE CGV_FLOAT GetRamp(CGU_INT clogBC7,  // ramp bits Valid range 2..4
                         CGU_INT bits,     // Component Valid range 5..8
                         CGV_INT p1,       // 0..255
                         CGV_INT p2,       // 0..255
                         CGV_UINT8 index)
{  // 0..15
#ifdef ASPM_GPU  // GPU Code
    CGV_FLOAT rampf = 0.0F;
    CGV_INT e1 = expand_epocode(p1, bits);
    CGV_INT e2 = expand_epocode(p2, bits);
    CGV_FLOAT ramp = gather_epocode(rampI, clogBC7 * 16 + index) / 64.0F;
    rampf = floor(e1 + ramp * (e2 - e1) + 0.5F);  // returns 0..255 values
    return rampf;
#else  // CPU ASPM Code
#ifdef USE_BC7_RAMP
    CGV_FLOAT rampf = BC7EncodeRamps.ramp[(CLT(clogBC7) * 4 * 256 * 256 * 16) + (BTT(bits) * 256 * 256 * 16) + (p1 * 256 * 16) + (p2 * 16) + index];
    return rampf;
#else
    return (CGV_FLOAT)floor((CGV_FLOAT)BC7EncodeRamps.ep_d[BTT(bits)][p1] +
                            rampWeights[clogBC7][index] * (CGV_FLOAT)((BC7EncodeRamps.ep_d[BTT(bits)][p2] - BC7EncodeRamps.ep_d[BTT(bits)][p1])) + 0.5F);
#endif
#endif
}
// Not used for Modes 4&5
INLINE CGV_FLOAT get_sperr(CGU_INT clogBC7,  // ramp bits Valid range 2..4
                           CGU_INT bits,     // Component Valid range 5..8
                           CGV_INT p1,       // 0..255
                           CGU_INT t1,
                           CGU_INT t2,
                           CGV_UINT8 index)
{
#ifdef ASPM_GPU
    return 0.0F;
#else
#ifdef USE_BC7_SP_ERR_IDX
    if (BC7EncodeRamps.ramp_init)
        return BC7EncodeRamps
            .sp_err[(CLT(clogBC7) * 4 * 256 * 2 * 2 * 16) + (BTT(bits) * 256 * 2 * 2 * 16) + (p1 * 2 * 2 * 16) + (t1 * 2 * 16) + (t2 * 16) + index];
    else
        return 0.0F;
#else
    return 0.0F;
#endif
#endif
}
INLINE void get_fixuptable(CGV_INT fixup[3], CGV_INT part_id)
{
    CGV_INT skip_packed = FIXUPINDEX[part_id];  // gather_int2(FIXUPINDEX, part_id);
    fixup[0] = 0;
    fixup[1] = skip_packed >> 4;
    fixup[2] = skip_packed & 15;
}
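// FIXUPINDEX packs the two non-zero fixup (anchor) indices of a partition
// shape into one byte: the high nibble is the anchor for subset 1 and the low
// nibble the anchor for subset 2 (subset 0 is always anchored at pixel 0).
// For example, an entry of 0xF8 unpacks to fixup[] = {0, 15, 8}. Anchor
// pixels are written with one less index bit, which is why both encoder and
// decoder must agree on them.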
//===================================== COMPRESS CODE =============================================
INLINE void SetDefaultIndex(CGV_UINT8 index_io[MAX_SUBSET_SIZE])
{
    // Use this as a final call
    for (CGU_INT i = 0; i < MAX_SUBSET_SIZE; i++)
        index_io[i] = 0;
}
INLINE void SetDefaultEPOCode(CGV_INT epo_code_io[8], CGV_INT R, CGV_INT G, CGV_INT B, CGV_INT A)
{
    epo_code_io[0] = R;
    epo_code_io[1] = G;
    epo_code_io[2] = B;
    epo_code_io[3] = A;
    epo_code_io[4] = R;
    epo_code_io[5] = G;
    epo_code_io[6] = B;
    epo_code_io[7] = A;
}
void GetProjectedIndex(CGV_UINT8 projected_index_out[MAX_SUBSET_SIZE],  //output: index, uncentered, in the range 0..clusters-1
                       CGV_FLOAT image_projected[SOURCE_BLOCK_SIZE],    // image_block points, might be uncentered
                       CGV_INT clusters,  // clusters: number of points in the ramp (max 16)
                       CGV_INT numEntries)
{  // n - number of points in v_ max 15
    CMP_di what[SOURCE_BLOCK_SIZE];
    CGV_FLOAT image_v[SOURCE_BLOCK_SIZE];
    CGV_FLOAT image_z[SOURCE_BLOCK_SIZE];
    CGV_FLOAT image_l;
    CGV_FLOAT image_mm;
    CGV_FLOAT image_r = 0.0F;
    CGV_FLOAT image_dm = 0.0F;
    CGV_FLOAT image_min;
    CGV_FLOAT image_max;
    CGV_FLOAT image_s;
    SetDefaultIndex(projected_index_out);
    image_min = image_projected[0];
    image_max = image_projected[0];
    for (CGV_INT i = 1; i < numEntries; i++)
    {
        if (image_projected[i] < image_min)
            image_min = image_projected[i];
        if (image_projected[i] > image_max)
            image_max = image_projected[i];
    }
    CGV_FLOAT img_diff = image_max - image_min;
    if (img_diff == 0.0f)
        return;
    if (cmp_isnan(img_diff))
        return;
    image_s = (clusters - 1) / img_diff;
    for (CGV_UINT8 i = 0; i < numEntries; i++)
    {
        image_v[i] = image_projected[i] * image_s;
        image_z[i] = floor(image_v[i] + 0.5F - image_min * image_s);
        projected_index_out[i] = (CGV_UINT8)image_z[i];
        what[i].image = image_v[i] - image_z[i] - image_min * image_s;
        what[i].index = i;
        image_dm += what[i].image;
        image_r += what[i].image * what[i].image;
    }
    if (numEntries * image_r - image_dm * image_dm >= (CGV_FLOAT)(numEntries - 1) / 8)
    {
        image_dm /= numEntries;
        for (CGV_INT i = 0; i < numEntries; i++)
            what[i].image -= image_dm;
        CGV_UINT8 tmp_index;
        CGV_FLOAT tmp_image;
        for (CGV_INT i = 1; i < numEntries; i++)
        {
            for (CGV_INT j = i; j > 0; j--)
            {
                if (what[j - 1].image > what[j].image)
                {
                    tmp_index = what[j].index;
                    tmp_image = what[j].image;
                    what[j].index = what[j - 1].index;
                    what[j].image = what[j - 1].image;
                    what[j - 1].index = tmp_index;
                    what[j - 1].image = tmp_image;
                }
            }
        }
        // got into fundamental simplex
        // move coordinate system origin to its center
        // i=0 < numEntries avoids varying int division by 0
        for (CGV_INT i = 0; i < numEntries; i++)
        {
            what[i].image = what[i].image - (CGV_FLOAT)(((2.0f * i + 1) - numEntries) / (2.0f * numEntries));
        }
        image_mm = 0.0F;
        image_l = 0.0F;
        CGV_INT j = -1;
        for (CGV_INT i = 0; i < numEntries; i++)
        {
            image_l += what[i].image;
            if (image_l < image_mm)
            {
                image_mm = image_l;
                j = i;
            }
        }
        j = j + 1;
        // wrap j into range without a varying modulo (avoids j = j % numEntries)
        while (j > numEntries)
            j = j - numEntries;
        for (CGV_INT i = j; i < numEntries; i++)
        {
            CGV_UINT8 idx = what[i].index;
            CGV_UINT8 pidx = projected_index_out[idx] + 1;  //gather_index(projected_index_out,idx)+1;
            projected_index_out[idx] = pidx;                // scatter_index(projected_index_out,idx,pidx);
        }
    }
    // get minimum index
    CGV_UINT8 index_min = projected_index_out[0];
    for (CGV_INT i = 1; i < numEntries; i++)
    {
        if (projected_index_out[i] < index_min)
            index_min = projected_index_out[i];
    }
    // reposition all index by min index (using min index as 0)
    for (CGV_INT i = 0; i < numEntries; i++)
    {
        projected_index_out[i] = clampIndex(projected_index_out[i] - index_min, 0, 15);
    }
}
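// Outline of GetProjectedIndex(): the projected scalars are first quantized
// uniformly onto 0..clusters-1 (image_z). The rounding residuals (what[].image)
// are then examined; if their variance is large enough, the points are sorted
// by residual and the largest-residual tail is bumped up one index step, which
// is the "fundamental simplex" walk described in the comments above. Finally
// the indices are re-based so the smallest used index becomes 0.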
CGV_FLOAT GetQuantizeIndex(CGV_UINT32 index_packed_out[2],
                           CGV_UINT8 index_out[MAX_SUBSET_SIZE],  // OUT:
                           CGV_FLOAT image_src[SOURCE_BLOCK_SIZE * MAX_CHANNELS],
                           CGV_INT numEntries,  //IN: range 0..15 (MAX_SUBSET_SIZE)
                           CGU_INT numClusters,
                           CGU_UINT8 channels3or4)
{  // IN: 3 = RGB or 4 = RGBA (4 = MAX_CHANNELS)
    CGV_FLOAT image_centered[SOURCE_BLOCK_SIZE * MAX_CHANNELS];
    CGV_FLOAT image_mean[MAX_CHANNELS];
    CGV_FLOAT eigen_vector[MAX_CHANNELS];
    CGV_FLOAT covariance_vector[MAX_CHANNELS * MAX_CHANNELS];
    GetImageCentered(image_centered, image_mean, image_src, numEntries, channels3or4);
    GetCovarianceVector(covariance_vector, image_centered, numEntries, channels3or4);
    //-----------------------------------------------------
    // check if all covariances are the same
    // if so then set all index to same value 0 and return
    // use EPSILON to set the limit for all same limit
    //-----------------------------------------------------
    CGV_FLOAT image_covt = 0.0F;
    for (CGU_UINT8 ch = 0; ch < channels3or4; ch++)
        image_covt = image_covt + covariance_vector[ch + ch * 4];
    if (image_covt < EPSILON)
    {
        SetDefaultIndex(index_out);
        index_packed_out[0] = 0;
        index_packed_out[1] = 0;
        return 0.0F;
    }
    GetEigenVector(eigen_vector, covariance_vector, channels3or4);
    CGV_FLOAT image_projected[SOURCE_BLOCK_SIZE];
    GetProjecedImage(image_projected, image_centered, numEntries, eigen_vector, channels3or4);
    GetProjectedIndex(index_out, image_projected, numClusters, numEntries);
    //==========================================
    // Refine
    //==========================================
    CGV_FLOAT image_q = 0.0F;
    for (CGU_UINT8 ch = 0; ch < channels3or4; ch++)
    {
        eigen_vector[ch] = 0;
        for (CGV_INT k = 0; k < numEntries; k++)
            eigen_vector[ch] = eigen_vector[ch] + image_centered[k + (ch * SOURCE_BLOCK_SIZE)] * index_out[k];
        image_q = image_q + eigen_vector[ch] * eigen_vector[ch];
    }
    image_q = sqrt(image_q);
    // direction needs to be normalized
    if (image_q != 0.0F)
        for (CGU_UINT8 ch = 0; ch < channels3or4; ch++)
            eigen_vector[ch] = eigen_vector[ch] / image_q;
    // Get new projected data
    GetProjecedImage(image_projected, image_centered, numEntries, eigen_vector, channels3or4);
    GetProjectedIndex(index_out, image_projected, numClusters, numEntries);
    // pack the index for use in icmp
    pack_index(index_packed_out, index_out);
    //===========================
    // Calc Error
    //===========================
    // Get the new image based on new index
    CGV_FLOAT image_t = 0.0F;
    CGV_FLOAT index_average = 0.0F;
    for (CGV_INT ik = 0; ik < numEntries; ik++)
    {
        index_average = index_average + index_out[ik];
        image_t = image_t + index_out[ik] * index_out[ik];
    }
    index_average = index_average / (CGV_FLOAT)numEntries;
    image_t = image_t - index_average * index_average * (CGV_FLOAT)numEntries;
    if (image_t != 0.0F)
        image_t = 1.0F / image_t;
    for (CGU_UINT8 ch = 0; ch < channels3or4; ch++)
    {
        eigen_vector[ch] = 0;
        for (CGV_INT nk = 0; nk < numEntries; nk++)
            eigen_vector[ch] = eigen_vector[ch] + image_centered[nk + (ch * SOURCE_BLOCK_SIZE)] * index_out[nk];
    }
    CGV_FLOAT image_decomp[SOURCE_BLOCK_SIZE * MAX_CHANNELS];
    for (CGV_INT i = 0; i < numEntries; i++)
        for (CGU_UINT8 ch = 0; ch < channels3or4; ch++)
            image_decomp[i + (ch * SOURCE_BLOCK_SIZE)] = image_mean[ch] + eigen_vector[ch] * image_t * (index_out[i] - index_average);
    CGV_FLOAT err_1 = err_Total(image_src, image_decomp, numEntries, channels3or4);
    return err_1;
    // return 0.0F;
}
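// GetQuantizeIndex() is the PCA-based index selection: project the centered
// pixels onto the dominant eigenvector, quantize the projection into
// numClusters steps, then refit the axis by least squares against the chosen
// indices and quantize once more. The returned value is the total squared
// error of the block reconstructed from the mean, the refitted axis, and the
// indices (image_decomp), which callers use to rank candidate encodings.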
CGV_FLOAT quant_solid_color(CGV_UINT8 index_out[MAX_SUBSET_SIZE],
                            CGV_INT epo_code_out[2 * MAX_CHANNELS],
                            CGV_FLOAT image_src[SOURCE_BLOCK_SIZE * MAX_CHANNELS],
                            CGV_INT numEntries,
                            CGU_UINT8 Mi_,      // last cluster
                            CGU_UINT8 bits[3],  // including parity
                            CGU_INT type,
                            CGU_UINT8 channels3or4  // IN: 3 = RGB or 4 = RGBA (4 = MAX_CHANNELS)
)
{
    CGU_INT clogBC7 = 0;
    CGU_INT iv = Mi_ + 1;
    while (iv >>= 1)
        clogBC7++;
    // init epo_0
    CGV_INT epo_0[2 * MAX_CHANNELS];
    SetDefaultEPOCode(epo_0, 0xFF, 0, 0, 0);
    CGV_UINT8 image_log = 0;
    CGV_UINT8 image_idx = 0;
    CGU_BOOL use_par = FALSE;
    if (type != 0)
        use_par = TRUE;
    CGV_FLOAT error_1 = CMP_FLOAT_MAX;
    for (CGU_INT pn = 0; pn < npv_nd[channels3or4 - 3][type] && (error_1 != 0.0F); pn++)
    {
        //1
        CGU_INT o1[2 * MAX_CHANNELS];  // = { 0,2 };
        CGU_INT o2[2 * MAX_CHANNELS];  // = { 0,2 };
        for (CGU_UINT8 ch = 0; ch < channels3or4; ch++)
        {
            //A
            o2[ch] = o1[ch] = 0;
            o2[4 + ch] = o1[4 + ch] = 2;
            if (use_par == TRUE)
            {
                if (par_vectors_nd[channels3or4 - 3][type][pn][0][ch])
                    o1[ch] = 1;
                else
                    o1[4 + ch] = 1;
                if (par_vectors_nd[channels3or4 - 3][type][pn][1][ch])
                    o2[ch] = 1;
                else
                    o2[4 + ch] = 1;
            }
        }  //A
        CGV_INT image_tcr[MAX_CHANNELS];
        CGV_INT epo_dr_0[MAX_CHANNELS];
        CGV_FLOAT error_tr;
        CGV_FLOAT error_0 = CMP_FLOAT_MAX;
        for (CGV_UINT8 iclogBC7 = 0; iclogBC7 < (1 << clogBC7) && (error_0 != 0); iclogBC7++)
        {
            //E
            CGV_FLOAT error_t = 0;
            CGV_INT t1o[MAX_CHANNELS], t2o[MAX_CHANNELS];
            for (CGU_UINT8 ch1 = 0; ch1 < channels3or4; ch1++)
            {
                // D
                CGV_FLOAT error_ta = CMP_FLOAT_MAX;
                for (CGU_INT t1 = o1[ch1]; t1 < o1[4 + ch1]; t1++)
                {
                    // C
                    // This is needed for non-integer mean points of "collapsed" sets
                    for (CGU_INT t2 = o2[ch1]; t2 < o2[4 + ch1]; t2++)
                    {
                        // B
                        CGV_INT image_tf;
                        CGV_INT image_tc;
                        image_tf = (CGV_INT)floor(image_src[COMP_RED + (ch1 * SOURCE_BLOCK_SIZE)]);
                        image_tc = (CGV_INT)ceil(image_src[COMP_RED + (ch1 * SOURCE_BLOCK_SIZE)]);
#ifdef USE_BC7_SP_ERR_IDX
                        CGV_FLOAT err_tf = get_sperr(clogBC7, bits[ch1], image_tf, t1, t2, iclogBC7);
                        CGV_FLOAT err_tc = get_sperr(clogBC7, bits[ch1], image_tc, t1, t2, iclogBC7);
                        if (err_tf > err_tc)
                            image_tcr[ch1] = image_tc;
                        else if (err_tf < err_tc)
                            image_tcr[ch1] = image_tf;
                        else
                            image_tcr[ch1] = (CGV_INT)floor(image_src[COMP_RED + (ch1 * SOURCE_BLOCK_SIZE)] + 0.5F);
                        //image_tcr[ch1] = image_tf + (image_tc - image_tf)/2;
                        //===============================
                        // Refine this for better quality!
                        //===============================
                        error_tr = get_sperr(clogBC7, bits[ch1], image_tcr[ch1], t1, t2, iclogBC7);
                        error_tr = (error_tr * error_tr) + 2 * error_tr * img_absf(image_tcr[ch1] - image_src[COMP_RED + (ch1 * SOURCE_BLOCK_SIZE)]) +
                                   (image_tcr[ch1] - image_src[COMP_RED + (ch1 * SOURCE_BLOCK_SIZE)]) *
                                       (image_tcr[ch1] - image_src[COMP_RED + (ch1 * SOURCE_BLOCK_SIZE)]);
                        if (error_tr < error_ta)
                        {
                            error_ta = error_tr;
                            t1o[ch1] = t1;
                            t2o[ch1] = t2;
                            epo_dr_0[ch1] = clampEPO(image_tcr[ch1], 0, 255);
                        }
#else
                        image_tcr[ch1] = floor(image_src[COMP_RED + (ch1 * SOURCE_BLOCK_SIZE)] + 0.5F);
                        error_ta = 0;
                        t1o[ch1] = t1;
                        t2o[ch1] = t2;
                        epo_dr_0[ch1] = clampEPO(image_tcr[ch1], 0, 255);
#endif
                    }  // B
                }  //C
                error_t += error_ta;
            }  // D
            if (error_t < error_0)
            {
                // We have a solid color: Use image src if on GPU
                image_log = iclogBC7;
                image_idx = image_log;
#ifdef ASPM_GPU  // This needs improving
                CGV_FLOAT MinC[4] = {255, 255, 255, 255};
                CGV_FLOAT MaxC[4] = {0, 0, 0, 0};
                // get min max colors
                for (CGU_UINT8 ch = 0; ch < channels3or4; ch++)
                    for (CGV_INT k = 0; k < numEntries; k++)
                    {
                        if (image_src[k + ch * SOURCE_BLOCK_SIZE] < MinC[ch])
                            MinC[ch] = image_src[k + ch * SOURCE_BLOCK_SIZE];
                        if (image_src[k + ch * SOURCE_BLOCK_SIZE] > MaxC[ch])
                            MaxC[ch] = image_src[k + ch * SOURCE_BLOCK_SIZE];
                    }
                for (CGU_UINT8 ch = 0; ch < channels3or4; ch++)
                {
                    epo_0[ch] = MinC[ch];
                    epo_0[4 + ch] = MaxC[ch];
                }
#else  // This is good on CPU
                for (CGU_UINT8 ch = 0; ch < channels3or4; ch++)
                {
#ifdef USE_BC7_SP_ERR_IDX
                    if (BC7EncodeRamps.ramp_init)
                    {
                        CGV_INT index = (CLT(clogBC7) * 4 * 256 * 2 * 2 * 16 * 2) + (BTT(bits[ch]) * 256 * 2 * 2 * 16 * 2) + (epo_dr_0[ch] * 2 * 2 * 16 * 2) +
  1188. (t1o[ch] * 2 * 16 * 2) + (t2o[ch] * 16 * 2) + (iclogBC7 * 2);
  1189. epo_0[ch] = BC7EncodeRamps.sp_idx[index + 0] & 0xFF; // gather_epocode(u_BC7Encode->sp_idx,index+0)&0xFF;
  1190. epo_0[4 + ch] = BC7EncodeRamps.sp_idx[index + 1] & 0xFF; // gather_epocode(u_BC7Encode->sp_idx,index+1)&0xFF;
  1191. }
  1192. else
  1193. {
  1194. epo_0[ch] = 0;
  1195. epo_0[4 + ch] = 0;
  1196. }
  1197. #else
  1198. epo_0[ch] = 0;
  1199. epo_0[4 + ch] = 0;
  1200. #endif
  1201. }
  1202. #endif
  1203. error_0 = error_t;
  1204. }
  1205. //if (error_0 == 0)
  1206. // break;
  1207. } // E
  1208. if (error_0 < error_1)
  1209. {
  1210. image_idx = image_log;
  1211. for (CGU_UINT8 chE = 0; chE < channels3or4; chE++)
  1212. {
  1213. epo_code_out[chE] = epo_0[chE];
  1214. epo_code_out[4 + chE] = epo_0[4 + chE];
  1215. }
  1216. error_1 = error_0;
  1217. }
  1218. } //1
  1219. // Get Image error
  1220. CGV_FLOAT image_decomp[SOURCE_BLOCK_SIZE * MAX_CHANNELS];
  1221. for (CGV_INT i = 0; i < numEntries; i++)
  1222. {
  1223. index_out[i] = image_idx;
  1224. for (CGU_UINT8 ch = 0; ch < channels3or4; ch++)
  1225. {
  1226. image_decomp[i + (ch * SOURCE_BLOCK_SIZE)] = GetRamp(clogBC7, bits[ch], epo_code_out[ch], epo_code_out[4 + ch], image_idx);
  1227. }
  1228. }
// Do we need to do this rather than just err_1 * numEntries?
  1230. CGV_FLOAT error_quant;
  1231. error_quant = err_Total(image_src, image_decomp, numEntries, channels3or4);
  1232. return error_quant;
  1233. //return err_1 * numEntries;
  1234. }
  1235. CGV_FLOAT requantized_image_err(CGV_UINT8 index_out[MAX_SUBSET_SIZE],
  1236. CGV_INT epo_code[2 * MAX_CHANNELS],
  1237. CGU_INT clogBC7,
  1238. CGU_UINT8 max_bits[MAX_CHANNELS],
  1239. CGV_FLOAT image_src[SOURCE_BLOCK_SIZE * MAX_CHANNELS],
  1240. CGV_INT numEntries, // max 16
  1241. CGU_UINT8 channels3or4)
  1242. { // IN: 3 = RGB or 4 = RGBA (4 = MAX_CHANNELS)
  1243. //=========================================
  1244. // requantized image based on new epo_code
  1245. //=========================================
  1246. CGV_FLOAT image_requantize[SOURCE_BLOCK_SIZE][MAX_CHANNELS];
  1247. CGV_FLOAT err_r = 0.0F;
  1248. for (CGU_UINT8 ch = 0; ch < channels3or4; ch++)
  1249. {
  1250. for (CGU_INT k = 0; k < SOURCE_BLOCK_SIZE; k++)
  1251. {
  1252. image_requantize[k][ch] = GetRamp(clogBC7, max_bits[ch], epo_code[ch], epo_code[4 + ch], (CGV_UINT8)k);
  1253. }
  1254. }
  1255. //=========================================
  1256. // Calc the error for the requantized image
  1257. //=========================================
  1258. for (CGV_INT k = 0; k < numEntries; k++)
  1259. {
  1260. CGV_FLOAT err_cmin = CMP_FLOAT_MAX;
  1261. CGV_INT hold_index_j = 0;
  1262. for (CGV_INT iclogBC7 = 0; iclogBC7 < (1 << clogBC7); iclogBC7++)
  1263. {
  1264. CGV_FLOAT image_err = 0.0F;
  1265. for (CGU_UINT8 ch = 0; ch < channels3or4; ch++)
  1266. {
  1267. image_err += sq_image(image_requantize[iclogBC7][ch] - image_src[k + (ch * SOURCE_BLOCK_SIZE)]);
  1268. }
  1269. if (image_err < err_cmin)
  1270. {
  1271. err_cmin = image_err;
  1272. hold_index_j = iclogBC7;
  1273. }
  1274. }
  1275. index_out[k] = (CGV_UINT8)hold_index_j;
  1276. err_r += err_cmin;
  1277. }
  1278. return err_r;
  1279. }
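// A minimal standalone sketch of the per-texel search above, simplified to
// one channel (illustrative only, kept out of the build; assumes plain
// int/float types rather than the CGV_* typedefs): pick the ramp entry with
// the smallest squared distance.
#if 0
static int nearest_ramp_index(const float ramp[], int rampSize, float value)
{
    int   best    = 0;
    float bestErr = (ramp[0] - value) * (ramp[0] - value);
    for (int i = 1; i < rampSize; i++)
    {
        float err = (ramp[i] - value) * (ramp[i] - value);
        if (err < bestErr)
        {
            bestErr = err;
            best    = i;
        }
    }
    return best;
}
#endif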
  1280. CGU_BOOL get_ideal_cluster(CGV_FLOAT image_out[2 * MAX_CHANNELS],
  1281. CGV_UINT8 index_in[MAX_SUBSET_SIZE],
  1282. CGU_INT Mi_,
  1283. CGV_FLOAT image_src[SOURCE_BLOCK_SIZE * MAX_CHANNELS],
  1284. CGV_INT numEntries,
  1285. CGU_UINT8 channels3or4)
  1286. {
  1287. // get ideal cluster centers
  1288. CGV_FLOAT image_cluster_mean[SOURCE_BLOCK_SIZE][MAX_CHANNELS];
  1289. GetClusterMean(image_cluster_mean, image_src, index_in, numEntries, channels3or4); // unrounded
  1290. CGV_FLOAT image_matrix0[2] = {0, 0}; // matrix /inverse matrix
  1291. CGV_FLOAT image_matrix1[2] = {0, 0}; // matrix /inverse matrix
  1292. CGV_FLOAT image_rp[2 * MAX_CHANNELS]; // right part for RMS fit problem
  1293. for (CGU_INT i = 0; i < 2 * MAX_CHANNELS; i++)
  1294. image_rp[i] = 0;
// weight with count if running on a compacted index
  1296. for (CGV_INT k = 0; k < numEntries; k++)
  1297. {
  1298. image_matrix0[0] += (Mi_ - index_in[k]) * (Mi_ - index_in[k]);
  1299. image_matrix0[1] += index_in[k] * (Mi_ - index_in[k]); // im is symmetric
  1300. image_matrix1[1] += index_in[k] * index_in[k];
  1301. for (CGU_UINT8 ch = 0; ch < channels3or4; ch++)
  1302. {
  1303. image_rp[ch] += (Mi_ - index_in[k]) * image_cluster_mean[index_in[k]][ch];
  1304. image_rp[4 + ch] += index_in[k] * image_cluster_mean[index_in[k]][ch];
  1305. }
  1306. }
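// Derivation note: the accumulation above builds the normal equations of a
// least-squares fit. With endpoints e0, e1 and index i_k out of Mi_, each
// cluster mean m_k is modelled as (e0*(Mi_ - i_k) + e1*i_k) / Mi_.
// Minimizing sum_k ||m_k - model_k||^2 over (e0, e1) gives the symmetric
// 2x2 system
//   [ sum (Mi_-i_k)^2    sum i_k*(Mi_-i_k) ] [e0]   [ Mi_ * sum (Mi_-i_k)*m_k ]
//   [ sum i_k*(Mi_-i_k)  sum i_k^2         ] [e1] = [ Mi_ * sum i_k*m_k       ]
// which image_matrix0/1 and image_rp hold (image_rp without the Mi_ factor,
// which is applied at the end); matrix_dd is the determinant.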
  1307. CGV_FLOAT matrix_dd = image_matrix0[0] * image_matrix1[1] - image_matrix0[1] * image_matrix0[1];
// assert(matrix_dd != 0);
// matrix_dd == 0 means index_cidx[k] and (Mi_ - index_cidx[k]) are collinear,
// which implies only one active index; that case is handled separately.
  1311. if (matrix_dd == 0)
  1312. {
  1313. for (CGU_UINT8 ch = 0; ch < channels3or4; ch++)
  1314. {
  1315. image_out[ch] = 0;
  1316. image_out[4 + ch] = 0;
  1317. }
  1318. return FALSE;
  1319. }
  1320. image_matrix1[0] = image_matrix0[0];
  1321. image_matrix0[0] = image_matrix1[1] / matrix_dd;
  1322. image_matrix1[1] = image_matrix1[0] / matrix_dd;
  1323. image_matrix1[0] = image_matrix0[1] = -image_matrix0[1] / matrix_dd;
  1324. CGV_FLOAT Mif = (CGV_FLOAT)Mi_;
  1325. for (CGU_UINT8 ch = 0; ch < channels3or4; ch++)
  1326. {
  1327. image_out[ch] = (image_matrix0[0] * image_rp[ch] + image_matrix0[1] * image_rp[4 + ch]) * Mif;
  1328. image_out[4 + ch] = (image_matrix1[0] * image_rp[ch] + image_matrix1[1] * image_rp[4 + ch]) * Mif;
  1329. }
  1330. return TRUE;
  1331. }
  1332. CGV_FLOAT shake(CGV_INT epo_code_shaker_out[2 * MAX_CHANNELS],
  1333. CGV_FLOAT image_ep[2 * MAX_CHANNELS],
  1334. CGV_UINT8 index_cidx[MAX_SUBSET_SIZE],
  1335. CGV_FLOAT image_src[SOURCE_BLOCK_SIZE * MAX_CHANNELS],
  1336. CGU_INT clogBC7,
  1337. CGU_INT type,
  1338. CGU_UINT8 max_bits[MAX_CHANNELS],
  1339. CGU_UINT8 use_par,
  1340. CGV_INT numEntries, // max 16
  1341. CGU_UINT8 channels3or4)
  1342. {
  1343. #define SHAKESIZE1 1
  1344. #define SHAKESIZE2 2
// shake single - cartesian
// shake odd/odd and even/even - same parity
// shake odd/odd, odd/even, even/odd and even/even - bcc
  1348. CGV_FLOAT best_err = CMP_FLOAT_MAX;
  1349. CGV_FLOAT err_ed[16] = {0};
  1350. CGV_INT epo_code_par[2][2][2][MAX_CHANNELS];
  1351. for (CGU_UINT8 ch = 0; ch < channels3or4; ch++)
  1352. {
  1353. CGU_UINT8 ppA = 0;
  1354. CGU_UINT8 ppB = 0;
  1355. CGU_UINT8 rr = (use_par ? 2 : 1);
CGV_INT epo_code_epi[2][2]; // [first/second endpoint][begin/end of range]
  1357. for (ppA = 0; ppA < rr; ppA++)
  1358. { // loop max =2
  1359. for (ppB = 0; ppB < rr; ppB++)
  1360. { //loop max =2
  1361. // set default ranges
  1362. epo_code_epi[0][0] = epo_code_epi[0][1] = ep_find_floor(image_ep[ch], max_bits[ch], use_par, ppA);
  1363. epo_code_epi[1][0] = epo_code_epi[1][1] = ep_find_floor(image_ep[4 + ch], max_bits[ch], use_par, ppB);
  1364. // set begin range
  1365. epo_code_epi[0][0] -= ((epo_code_epi[0][0] < SHAKESIZE1 ? epo_code_epi[0][0] : SHAKESIZE1)) & (~use_par);
  1366. epo_code_epi[1][0] -= ((epo_code_epi[1][0] < SHAKESIZE1 ? epo_code_epi[1][0] : SHAKESIZE1)) & (~use_par);
  1367. // set end range
  1368. epo_code_epi[0][1] +=
  1369. ((1 << max_bits[ch]) - 1 - epo_code_epi[0][1] < SHAKESIZE2 ? (1 << max_bits[ch]) - 1 - epo_code_epi[0][1] : SHAKESIZE2) & (~use_par);
  1370. epo_code_epi[1][1] +=
  1371. ((1 << max_bits[ch]) - 1 - epo_code_epi[1][1] < SHAKESIZE2 ? (1 << max_bits[ch]) - 1 - epo_code_epi[1][1] : SHAKESIZE2) & (~use_par);
  1372. CGV_INT step = (1 << use_par);
  1373. err_ed[(ppA * 8) + (ppB * 4) + ch] = CMP_FLOAT_MAX;
  1374. for (CGV_INT epo_p1 = epo_code_epi[0][0]; epo_p1 <= epo_code_epi[0][1]; epo_p1 += step)
  1375. {
  1376. for (CGV_INT epo_p2 = epo_code_epi[1][0]; epo_p2 <= epo_code_epi[1][1]; epo_p2 += step)
  1377. {
  1378. CGV_FLOAT image_square_diff = 0.0F;
  1379. CGV_INT _mc = numEntries;
  1380. CGV_FLOAT image_ramp;
  1381. while (_mc > 0)
  1382. {
  1383. image_ramp = GetRamp(clogBC7, max_bits[ch], epo_p1, epo_p2, index_cidx[_mc - 1]);
  1384. image_square_diff += sq_image(image_ramp - image_src[(_mc - 1) + (ch * SOURCE_BLOCK_SIZE)]);
  1385. _mc--;
  1386. }
  1387. if (image_square_diff < err_ed[(ppA * 8) + (ppB * 4) + ch])
  1388. {
  1389. err_ed[(ppA * 8) + (ppB * 4) + ch] = image_square_diff;
  1390. epo_code_par[ppA][ppB][0][ch] = epo_p1;
  1391. epo_code_par[ppA][ppB][1][ch] = epo_p2;
  1392. }
  1393. }
  1394. }
  1395. } // pp1
  1396. } // pp0
  1397. } // j
  1398. //---------------------------------------------------------
  1399. for (CGU_INT pn = 0; pn < npv_nd[channels3or4 - 3][type]; pn++)
  1400. {
  1401. CGV_FLOAT err_2 = 0.0F;
  1402. CGU_INT d1;
  1403. CGU_INT d2;
  1404. for (CGU_UINT8 ch = 0; ch < channels3or4; ch++)
  1405. {
  1406. d1 = par_vectors_nd[channels3or4 - 3][type][pn][0][ch];
  1407. d2 = par_vectors_nd[channels3or4 - 3][type][pn][1][ch];
  1408. err_2 += err_ed[(d1 * 8) + (d2 * 4) + ch];
  1409. }
  1410. if (err_2 < best_err)
  1411. {
  1412. best_err = err_2;
  1413. for (CGU_UINT8 ch = 0; ch < channels3or4; ch++)
  1414. {
  1415. d1 = par_vectors_nd[channels3or4 - 3][type][pn][0][ch];
  1416. d2 = par_vectors_nd[channels3or4 - 3][type][pn][1][ch];
  1417. epo_code_shaker_out[ch] = epo_code_par[d1][d2][0][ch];
  1418. epo_code_shaker_out[4 + ch] = epo_code_par[d1][d2][1][ch];
  1419. }
  1420. }
  1421. }
  1422. return best_err;
  1423. }
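// Illustrative example (assumed values): with max_bits[ch] = 7 and
// use_par = 0, an endpoint whose floor lands at 126 is shaken over the range
// [125, 127] -- up to SHAKESIZE1 = 1 below and SHAKESIZE2 = 2 above, clamped
// to the 7-bit range [0, 127] -- in steps of 1. With use_par = 1 the offsets
// are masked to even values (the -1 becomes 0) and the step becomes 2, so
// both ends keep the parity chosen by ep_find_floor.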
  1424. CGV_FLOAT optimize_IndexAndEndPoints(CGV_UINT8 index_io[MAX_SUBSET_SIZE],
  1425. CGV_INT epo_code_out[8],
  1426. CGV_FLOAT image_src[SOURCE_BLOCK_SIZE * MAX_CHANNELS],
  1427. CGV_INT numEntries, // max 16
  1428. CGU_UINT8 Mi_, // last cluster , This should be no larger than 16
  1429. CGU_UINT8 bits, // total for all components
  1430. CGU_UINT8 channels3or4, // IN: 3 = RGB or 4 = RGBA (4 = MAX_CHANNELS)
  1431. uniform CMP_GLOBAL BC7_Encode u_BC7Encode[])
  1432. {
  1433. CGV_FLOAT err_best = CMP_FLOAT_MAX;
  1434. CGU_INT type;
  1435. CGU_UINT8 channels2 = 2 * channels3or4;
  1436. type = bits % channels2;
  1437. CGU_UINT8 use_par = (type != 0);
  1438. CGU_UINT8 max_bits[MAX_CHANNELS];
  1439. for (CGU_UINT8 ch = 0; ch < channels3or4; ch++)
  1440. max_bits[ch] = (bits + channels2 - 1) / channels2;
  1441. CGU_INT iv;
  1442. CGU_INT clogBC7 = 0;
  1443. iv = Mi_;
  1444. while (iv >>= 1)
  1445. clogBC7++;
  1446. CGU_INT clt_clogBC7 = CLT(clogBC7);
  1447. if (clt_clogBC7 > 3)
  1448. {
  1449. ASPM_PRINT(("Err: optimize_IndexAndEndPoints, clt_clogBC7\n"));
  1450. return CMP_FLOAT_MAX;
  1451. }
  1452. Mi_ = Mi_ - 1;
  1453. CGV_UINT8 MaxIndex;
  1454. CGV_UINT8 index_tmp[MAX_SUBSET_SIZE];
  1455. CGU_INT maxTry = 5;
  1456. CGV_UINT8 index_best[MAX_SUBSET_SIZE];
  1457. for (CGV_INT k = 0; k < numEntries; k++)
  1458. {
  1459. index_best[k] = index_tmp[k] = clampIndex(index_io[k], 0, 15);
  1460. }
  1461. CGV_INT epo_code_best[2 * MAX_CHANNELS];
  1462. SetDefaultEPOCode(epo_code_out, 0xFF, 0, 0, 0);
  1463. SetDefaultEPOCode(epo_code_best, 0, 0, 0, 0);
  1464. CGV_FLOAT err_requant = 0.0F;
  1465. MaxIndex = index_collapse(index_tmp, numEntries);
  1466. //===============================
  1467. // we have a solid color 4x4 block
  1468. //===============================
  1469. if (MaxIndex == 0)
  1470. {
  1471. return quant_solid_color(index_io, epo_code_out, image_src, numEntries, Mi_, max_bits, type, channels3or4);
  1472. }
  1473. do
  1474. {
  1475. //===============================
  1476. // We have ramp colors to process
  1477. //===============================
  1478. CGV_FLOAT err_cluster = CMP_FLOAT_MAX;
  1479. CGV_FLOAT err_shake;
  1480. CGV_UINT8 index_cluster[MAX_PARTITION_ENTRIES];
  1481. for (CGV_UINT8 index_slope = 1; (MaxIndex != 0) && (index_slope * MaxIndex <= Mi_); index_slope++)
  1482. {
  1483. for (CGV_UINT8 index_offset = 0; index_offset <= Mi_ - index_slope * MaxIndex; index_offset++)
  1484. {
  1485. //-------------------------------------
  1486. // set a new index data to try
  1487. //-------------------------------------
  1488. for (CGV_INT k = 0; k < numEntries; k++)
  1489. index_cluster[k] = index_tmp[k] * index_slope + index_offset;
  1490. CGV_FLOAT image_cluster[2 * MAX_CHANNELS];
  1491. CGV_INT epo_code_shake[2 * MAX_CHANNELS];
  1492. SetDefaultEPOCode(epo_code_shake, 0, 0, 0xFF, 0);
  1493. if (get_ideal_cluster(image_cluster, index_cluster, Mi_, image_src, numEntries, channels3or4) == FALSE)
  1494. {
  1495. break;
  1496. }
  1497. err_shake = shake(epo_code_shake, // return new epo
  1498. image_cluster,
  1499. index_cluster,
  1500. image_src,
  1501. clogBC7,
  1502. type,
  1503. max_bits,
  1504. use_par,
  1505. numEntries, // max 16
  1506. channels3or4);
  1507. if (err_shake < err_cluster)
  1508. {
  1509. err_cluster = err_shake;
  1510. for (CGU_UINT8 ch = 0; ch < channels3or4; ch++)
  1511. {
  1512. epo_code_best[ch] = clampEPO(epo_code_shake[ch], 0, 255);
  1513. epo_code_best[4 + ch] = clampEPO(epo_code_shake[4 + ch], 0, 255);
  1514. }
  1515. }
  1516. }
  1517. }
  1518. CGV_INT change = 0;
  1519. CGV_INT better = 0;
  1520. if ((err_cluster != CMP_FLOAT_MAX))
  1521. {
  1522. //=========================
  1523. // test results for quality
  1524. //=========================
  1525. err_requant = requantized_image_err(index_best, // new index results
  1526. epo_code_best, // prior result input
  1527. clogBC7,
  1528. max_bits,
  1529. image_src,
  1530. numEntries,
  1531. channels3or4);
  1532. // change/better
// Have the index values changed from the last set?
  1534. for (CGV_INT k = 0; k < numEntries; k++)
  1535. change = change || (index_cluster[k] != index_best[k]);
  1536. if (err_requant < err_best)
  1537. {
  1538. better = 1;
  1539. for (CGV_INT k = 0; k < numEntries; k++)
  1540. {
  1541. index_io[k] = index_tmp[k] = index_best[k];
  1542. }
  1543. for (CGU_UINT8 ch = 0; ch < channels3or4; ch++)
  1544. {
  1545. epo_code_out[ch] = epo_code_best[0 * 4 + ch];
  1546. epo_code_out[4 + ch] = epo_code_best[1 * 4 + ch];
  1547. }
  1548. err_best = err_requant;
  1549. }
  1550. }
  1551. // Early out if we have our target err
  1552. if (err_best <= u_BC7Encode->errorThreshold)
  1553. {
  1554. break;
  1555. }
  1556. CGV_INT done;
  1557. done = !(change && better);
  1558. if ((maxTry > 0) && (!done))
  1559. {
  1560. maxTry--;
  1561. MaxIndex = index_collapse(index_tmp, numEntries);
  1562. }
  1563. else
  1564. {
  1565. maxTry = 0;
  1566. }
  1567. } while (maxTry);
  1568. if (err_best == CMP_FLOAT_MAX)
  1569. {
  1570. ASPM_PRINT(("Err: requantized_image_err\n"));
  1571. }
  1572. return err_best;
  1573. }
  1574. CGU_UINT8 get_partitionsToTry(uniform CMP_GLOBAL BC7_Encode u_BC7Encode[], CGU_UINT8 maxPartitions)
  1575. {
  1576. CGU_FLOAT u_minPartitionSearchSize = 0.30f;
  1577. if (u_BC7Encode->quality <= BC7_qFAST_THRESHOLD)
  1578. { // Using this to match performance and quality of CPU code
  1579. u_minPartitionSearchSize = u_minPartitionSearchSize + (u_BC7Encode->quality * BC7_qFAST_THRESHOLD);
  1580. }
  1581. else
  1582. {
  1583. u_minPartitionSearchSize = u_BC7Encode->quality;
  1584. }
  1585. return (CGU_UINT8)(maxPartitions * u_minPartitionSearchSize);
  1586. }
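// Worked example: with maxPartitions = 64 and quality above
// BC7_qFAST_THRESHOLD, the search fraction is the quality itself, so
// quality = 0.5 tries 64 * 0.5 = 32 partitions; below the threshold the
// fraction starts at 0.30 and grows slowly with quality, keeping fast-mode
// searches small.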
  1587. INLINE void cmp_encode_swap(CGV_INT endpoint[], CGU_INT channels, CGV_UINT8 block_index[MAX_SUBSET_SIZE], CGU_INT bits)
  1588. {
  1589. CGU_INT levels = 1 << bits;
  1590. if (block_index[0] >= levels / 2)
  1591. {
  1592. cmp_swap_epo(&endpoint[0], &endpoint[channels], channels);
  1593. for (CGU_INT k = 0; k < SOURCE_BLOCK_SIZE; k++)
  1594. #ifdef ASPM_GPU
  1595. block_index[k] = (levels - 1) - block_index[k];
  1596. #else
  1597. block_index[k] = CGV_UINT8(levels - 1) - block_index[k];
  1598. #endif
  1599. }
  1600. }
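// Why the swap: the first index written is an anchor whose MSB is dropped,
// so that bit must be 0. If block_index[0] has its MSB set (>= levels / 2),
// swapping the endpoints and replacing every index i with (levels - 1) - i
// encodes the same ramp with the bit cleared. E.g. with bits = 2
// (levels = 4), index 3 becomes 0 and the endpoints trade places.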
  1601. void cmp_encode_index(CGV_UINT8 data[16], CGU_INT* uniform pPos, CGV_UINT8 block_index[MAX_SUBSET_SIZE], CGU_INT bits)
  1602. {
  1603. cmp_Write8Bit(data, pPos, bits - 1, block_index[0]);
  1604. for (CGU_INT j = 1; j < SOURCE_BLOCK_SIZE; j++)
  1605. {
  1606. CGV_UINT8 qbits = block_index[j] & 0xFF;
  1607. cmp_Write8Bit(data, pPos, bits, qbits);
  1608. }
  1609. }
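// Note: block_index[0] is written with bits - 1 bits because its MSB is
// implicitly 0 (cmp_encode_swap above guarantees this); the remaining
// indices are written with the full bit count.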
  1610. void encode_endpoint(CGV_UINT8 data[16], CGU_INT* uniform pPos, CGV_UINT8 block_index[16], CGU_INT bits, CGV_UINT32 flips)
  1611. {
  1612. CGU_INT levels = 1 << bits;
  1613. CGV_INT flips_shifted = flips;
  1614. for (CGU_INT k1 = 0; k1 < 16; k1++)
  1615. {
  1616. CGV_UINT8 qbits_shifted = block_index[k1];
  1617. for (CGU_INT k2 = 0; k2 < 8; k2++)
  1618. {
  1619. CGV_INT q = qbits_shifted & 15;
  1620. if ((flips_shifted & 1) > 0)
  1621. q = (levels - 1) - q;
  1622. if (k1 == 0 && k2 == 0)
  1623. cmp_Write8Bit(data, pPos, bits - 1, CMP_STATIC_CAST(CGV_UINT8, q));
  1624. else
  1625. cmp_Write8Bit(data, pPos, bits, CMP_STATIC_CAST(CGV_UINT8, q));
  1626. qbits_shifted >>= 4;
  1627. flips_shifted >>= 1;
  1628. }
  1629. }
  1630. }
  1631. INLINE CGV_UINT32 pow32(CGV_UINT32 x)
  1632. {
  1633. return 1 << x;
  1634. }
  1635. void Encode_mode01237(CGU_INT blockMode,
  1636. CGV_UINT8 bestPartition,
  1637. CGV_UINT32 packedEndpoints[6],
  1638. CGV_UINT8 index16[16],
  1639. CGV_UINT8 cmp_out[COMPRESSED_BLOCK_SIZE])
  1640. {
  1641. CGU_INT partitionBits;
  1642. CGU_UINT32 componentBits;
  1643. CGU_UINT8 maxSubsets;
  1644. CGU_INT channels;
  1645. CGU_UINT8 indexBits;
  1646. switch (blockMode)
  1647. {
  1648. case 0:
  1649. componentBits = 4;
  1650. maxSubsets = 3;
  1651. partitionBits = 4;
  1652. channels = 3;
  1653. indexBits = 3;
  1654. break;
  1655. case 2:
  1656. componentBits = 5;
  1657. maxSubsets = 3;
  1658. partitionBits = 6;
  1659. channels = 3;
  1660. indexBits = 2;
  1661. break;
  1662. case 3:
  1663. componentBits = 7;
  1664. maxSubsets = 2;
  1665. partitionBits = 6;
  1666. channels = 3;
  1667. indexBits = 2;
  1668. break;
  1669. case 7:
  1670. componentBits = 5;
  1671. maxSubsets = 2;
  1672. partitionBits = 6;
  1673. channels = 4;
  1674. indexBits = 2;
  1675. break;
  1676. default:
  1677. case 1:
  1678. componentBits = 6;
  1679. maxSubsets = 2;
  1680. partitionBits = 6;
  1681. channels = 3;
  1682. indexBits = 3;
  1683. break;
  1684. }
  1685. CGV_UINT8 blockindex[SOURCE_BLOCK_SIZE];
  1686. CGV_INT indexBitsV = indexBits;
  1687. for (CGU_INT k = 0; k < COMPRESSED_BLOCK_SIZE; k++)
  1688. cmp_out[k] = 0;
  1689. // mode 0 = 1, mode 1 = 01, mode 2 = 001, mode 3 = 0001, ...
  1690. CGU_INT bitPosition = blockMode;
  1691. cmp_Write8Bit(cmp_out, &bitPosition, 1, 1);
  1692. // Write partition bits
  1693. cmp_Write8Bit(cmp_out, &bitPosition, partitionBits, bestPartition);
  1694. // Sort out the index set and tag whether we need to flip the
  1695. // endpoints to get the correct state in the implicit index bits
  1696. // The implicitly encoded MSB of the fixup index must be 0
  1697. CGV_INT fixup[3];
  1698. get_fixuptable(fixup, (maxSubsets == 2 ? bestPartition : bestPartition + 64));
  1699. // Extract indices and mark subsets that need to have their colours flipped to get the
  1700. // right state for the implicit MSB of the fixup index
  1701. CGV_INT flipColours[3] = {0, 0, 0};
  1702. for (CGV_INT k = 0; k < SOURCE_BLOCK_SIZE; k++)
  1703. {
  1704. blockindex[k] = index16[k];
  1705. for (CGU_UINT8 j = 0; j < maxSubsets; j++)
  1706. {
  1707. if (k == fixup[j])
  1708. {
  1709. if (blockindex[k] & (1 << (indexBitsV - 1)))
  1710. {
  1711. flipColours[j] = 1;
  1712. }
  1713. }
  1714. }
  1715. }
  1716. // Now we must flip the endpoints where necessary so that the implicitly encoded
  1717. // index bits have the correct state
  1718. for (CGU_INT subset = 0; subset < maxSubsets; subset++)
  1719. {
  1720. if (flipColours[subset] == 1)
  1721. {
  1722. CGV_UINT32 temp = packedEndpoints[subset * 2 + 0];
  1723. packedEndpoints[subset * 2 + 0] = packedEndpoints[subset * 2 + 1];
  1724. packedEndpoints[subset * 2 + 1] = temp;
  1725. }
  1726. }
  1727. // ...next flip the indices where necessary
  1728. for (CGV_INT k = 0; k < SOURCE_BLOCK_SIZE; k++)
  1729. {
  1730. CGV_UINT8 partsub = get_partition_subset(bestPartition, maxSubsets, k);
  1731. if (flipColours[partsub] == 1)
  1732. {
  1733. blockindex[k] = ((1 << indexBitsV) - 1) - blockindex[k];
  1734. }
  1735. }
  1736. // Endpoints are stored in the following order RRRR GGGG BBBB (AAAA) (PPPP)
  1737. // i.e. components are packed together
  1738. CGV_UINT32 unpackedColours[MAX_SUBSETS * 2 * MAX_CHANNELS];
  1739. CGV_UINT8 parityBits[MAX_SUBSETS][2];
  1740. // Unpack the colour values for the subsets
  1741. for (CGU_INT subset = 0; subset < maxSubsets; subset++)
  1742. {
  1743. CGV_UINT32 packedColours[2] = {packedEndpoints[subset * 2 + 0], packedEndpoints[subset * 2 + 1]};
  1744. if (blockMode == 0 || blockMode == 3 || blockMode == 7)
  1745. { // TWO_PBIT
  1746. parityBits[subset][0] = packedColours[0] & 1;
  1747. parityBits[subset][1] = packedColours[1] & 1;
  1748. packedColours[0] >>= 1;
  1749. packedColours[1] >>= 1;
  1750. }
  1751. else if (blockMode == 1)
  1752. { // ONE_PBIT
  1753. parityBits[subset][0] = packedColours[1] & 1;
  1754. parityBits[subset][1] = packedColours[1] & 1;
  1755. packedColours[0] >>= 1;
  1756. packedColours[1] >>= 1;
  1757. }
  1758. else if (blockMode == 2)
  1759. {
  1760. parityBits[subset][0] = 0;
  1761. parityBits[subset][1] = 0;
  1762. }
  1763. for (CGU_INT ch = 0; ch < channels; ch++)
  1764. {
  1765. unpackedColours[(subset * 2 + 0) * MAX_CHANNELS + ch] = packedColours[0] & ((1 << componentBits) - 1);
  1766. unpackedColours[(subset * 2 + 1) * MAX_CHANNELS + ch] = packedColours[1] & ((1 << componentBits) - 1);
  1767. packedColours[0] >>= componentBits;
  1768. packedColours[1] >>= componentBits;
  1769. }
  1770. }
  1771. // Loop over component
  1772. for (CGU_INT ch = 0; ch < channels; ch++)
  1773. {
  1774. // loop over subsets
  1775. for (CGU_INT subset = 0; subset < maxSubsets; subset++)
  1776. {
  1777. cmp_Write8Bit(cmp_out, &bitPosition, componentBits, unpackedColours[(subset * 2 + 0) * MAX_CHANNELS + ch] & 0xFF);
  1778. cmp_Write8Bit(cmp_out, &bitPosition, componentBits, unpackedColours[(subset * 2 + 1) * MAX_CHANNELS + ch] & 0xFF);
  1779. }
  1780. }
  1781. // write parity bits
  1782. if (blockMode != 2)
  1783. {
  1784. for (CGV_INT subset = 0; subset < maxSubsets; subset++)
  1785. {
  1786. if (blockMode == 1)
  1787. { // ONE_PBIT
  1788. cmp_Write8Bit(cmp_out, &bitPosition, 1, parityBits[subset][0] & 0x01);
  1789. }
  1790. else
  1791. { // TWO_PBIT
  1792. cmp_Write8Bit(cmp_out, &bitPosition, 1, parityBits[subset][0] & 0x01);
  1793. cmp_Write8Bit(cmp_out, &bitPosition, 1, parityBits[subset][1] & 0x01);
  1794. }
  1795. }
  1796. }
  1797. // Encode the index bits
  1798. CGV_INT bitPositionV = bitPosition;
  1799. for (CGV_INT k = 0; k < SOURCE_BLOCK_SIZE; k++)
  1800. {
  1801. CGV_UINT8 partsub = get_partition_subset(bestPartition, maxSubsets, k);
  1802. // If this is a fixup index then drop the MSB which is implicitly 0
  1803. if (k == fixup[partsub])
  1804. {
  1805. cmp_Write8BitV(cmp_out, bitPositionV, indexBits - 1, blockindex[k] & 0x07F);
  1806. bitPositionV += indexBits - 1;
  1807. }
  1808. else
  1809. {
  1810. cmp_Write8BitV(cmp_out, bitPositionV, indexBits, blockindex[k]);
  1811. bitPositionV += indexBits;
  1812. }
  1813. }
  1814. }
  1815. void Encode_mode4(CGV_UINT8 cmp_out[COMPRESSED_BLOCK_SIZE], varying cmp_mode_parameters* uniform params)
  1816. {
  1817. CGU_INT bitPosition = 4; // Position the pointer at the LSB
  1818. for (CGU_INT k = 0; k < COMPRESSED_BLOCK_SIZE; k++)
  1819. cmp_out[k] = 0;
  1820. // mode 4 (5 bits) 00001
  1821. cmp_Write8Bit(cmp_out, &bitPosition, 1, 1);
  1822. // rotation 2 bits
  1823. cmp_Write8Bit(cmp_out, &bitPosition, 2, CMP_STATIC_CAST(CGV_UINT8, params->rotated_channel));
  1824. // idxMode 1 bit
  1825. cmp_Write8Bit(cmp_out, &bitPosition, 1, CMP_STATIC_CAST(CGV_UINT8, params->idxMode));
  1826. CGU_INT idxBits[2] = {2, 3};
  1827. if (params->idxMode)
  1828. {
  1829. idxBits[0] = 3;
  1830. idxBits[1] = 2;
  1831. // Indicate if we need to fixup the index
  1832. cmp_swap_index(params->color_index, params->alpha_index, 16);
  1833. cmp_encode_swap(params->alpha_qendpoint, 4, params->color_index, 2);
  1834. cmp_encode_swap(params->color_qendpoint, 4, params->alpha_index, 3);
  1835. }
  1836. else
  1837. {
  1838. cmp_encode_swap(params->color_qendpoint, 4, params->color_index, 2);
  1839. cmp_encode_swap(params->alpha_qendpoint, 4, params->alpha_index, 3);
  1840. }
  1841. // color endpoints 5 bits each
  1842. // R0 : R1
  1843. // G0 : G1
  1844. // B0 : B1
  1845. for (CGU_INT component = 0; component < 3; component++)
  1846. {
  1847. cmp_Write8Bit(cmp_out, &bitPosition, 5, CMP_STATIC_CAST(CGV_UINT8, params->color_qendpoint[component]));
  1848. cmp_Write8Bit(cmp_out, &bitPosition, 5, CMP_STATIC_CAST(CGV_UINT8, params->color_qendpoint[4 + component]));
  1849. }
  1850. // alpha endpoints (6 bits each)
  1851. // A0 : A1
  1852. cmp_Write8Bit(cmp_out, &bitPosition, 6, CMP_STATIC_CAST(CGV_UINT8, params->alpha_qendpoint[0]));
  1853. cmp_Write8Bit(cmp_out, &bitPosition, 6, CMP_STATIC_CAST(CGV_UINT8, params->alpha_qendpoint[4]));
  1854. // index 2 bits each (31 bits total)
  1855. cmp_encode_index(cmp_out, &bitPosition, params->color_index, 2);
  1856. // index 3 bits each (47 bits total)
  1857. cmp_encode_index(cmp_out, &bitPosition, params->alpha_index, 3);
  1858. }
  1859. void Encode_mode5(CGV_UINT8 cmp_out[COMPRESSED_BLOCK_SIZE], varying cmp_mode_parameters* uniform params)
  1860. {
  1861. for (CGU_INT k = 0; k < COMPRESSED_BLOCK_SIZE; k++)
  1862. cmp_out[k] = 0;
  1863. // mode 5 bits = 000001
  1864. CGU_INT bitPosition = 5; // Position the pointer at the LSB
  1865. cmp_Write8Bit(cmp_out, &bitPosition, 1, 1);
  1866. // Write 2 bit rotation
  1867. cmp_Write8Bit(cmp_out, &bitPosition, 2, CMP_STATIC_CAST(CGV_UINT8, params->rotated_channel));
  1868. cmp_encode_swap(params->color_qendpoint, 4, params->color_index, 2);
  1869. cmp_encode_swap(params->alpha_qendpoint, 4, params->alpha_index, 2);
  1870. // color endpoints (7 bits each)
  1871. // R0 : R1
  1872. // G0 : G1
  1873. // B0 : B1
  1874. for (CGU_INT component = 0; component < 3; component++)
  1875. {
  1876. cmp_Write8Bit(cmp_out, &bitPosition, 7, CMP_STATIC_CAST(CGV_UINT8, params->color_qendpoint[component]));
  1877. cmp_Write8Bit(cmp_out, &bitPosition, 7, CMP_STATIC_CAST(CGV_UINT8, params->color_qendpoint[4 + component]));
  1878. }
  1879. // alpha endpoints (8 bits each)
  1880. // A0 : A1
  1881. cmp_Write8Bit(cmp_out, &bitPosition, 8, CMP_STATIC_CAST(CGV_UINT8, params->alpha_qendpoint[0]));
  1882. cmp_Write8Bit(cmp_out, &bitPosition, 8, CMP_STATIC_CAST(CGV_UINT8, params->alpha_qendpoint[4]));
  1883. // color index 2 bits each (31 bits total)
  1884. // alpha index 2 bits each (31 bits total)
  1885. cmp_encode_index(cmp_out, &bitPosition, params->color_index, 2);
  1886. cmp_encode_index(cmp_out, &bitPosition, params->alpha_index, 2);
  1887. }
  1888. void Encode_mode6(CGV_UINT8 index[MAX_SUBSET_SIZE], CGV_INT epo_code[8], CGV_UINT8 cmp_out[COMPRESSED_BLOCK_SIZE])
  1889. {
  1890. for (CGU_INT k = 0; k < COMPRESSED_BLOCK_SIZE; k++)
  1891. cmp_out[k] = 0;
  1892. cmp_encode_swap(epo_code, 4, index, 4);
  1893. // Mode = 6 bits = 0000001
  1894. CGU_INT bitPosition = 6; // Position the pointer at the LSB
  1895. cmp_Write8Bit(cmp_out, &bitPosition, 1, 1);
  1896. // endpoints
  1897. for (CGU_INT p = 0; p < 4; p++)
  1898. {
  1899. cmp_Write8Bit(cmp_out, &bitPosition, 7, CMP_STATIC_CAST(CGV_UINT8, epo_code[0 + p] >> 1));
  1900. cmp_Write8Bit(cmp_out, &bitPosition, 7, CMP_STATIC_CAST(CGV_UINT8, epo_code[4 + p] >> 1));
  1901. }
  1902. // p bits
  1903. cmp_Write8Bit(cmp_out, &bitPosition, 1, epo_code[0] & 1);
  1904. cmp_Write8Bit(cmp_out, &bitPosition, 1, epo_code[4] & 1);
  1905. // quantized values
  1906. cmp_encode_index(cmp_out, &bitPosition, index, 4);
  1907. }
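// Bit-budget check for mode 6: 7 mode bits + 4*2*7 = 56 endpoint bits
// + 2 p-bits + (16*4 - 1) = 63 index bits (the anchor index drops its
// implicit MSB) = 128 bits, exactly one compressed block.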
  1908. void Compress_mode01237(CGU_INT blockMode, BC7_EncodeState EncodeState[], uniform CMP_GLOBAL BC7_Encode u_BC7Encode[])
  1909. {
  1910. CGV_UINT8 storedBestindex[MAX_PARTITIONS][MAX_SUBSETS][MAX_SUBSET_SIZE];
  1911. CGV_FLOAT storedError[MAX_PARTITIONS];
  1912. CGV_UINT8 sortedPartition[MAX_PARTITIONS];
  1913. EncodeState->numPartitionModes = 64;
  1914. EncodeState->maxSubSets = 2;
  1915. if (blockMode == 0)
  1916. {
  1917. EncodeState->numPartitionModes = 16;
  1918. EncodeState->channels3or4 = 3;
  1919. EncodeState->bits = 26;
  1920. EncodeState->clusters = 8;
  1921. EncodeState->componentBits = 4;
  1922. EncodeState->maxSubSets = 3;
  1923. }
  1924. else if (blockMode == 2)
  1925. {
  1926. EncodeState->channels3or4 = 3;
  1927. EncodeState->bits = 30;
  1928. EncodeState->clusters = 4;
  1929. EncodeState->componentBits = 5;
  1930. EncodeState->maxSubSets = 3;
  1931. }
  1932. else if (blockMode == 1)
  1933. {
  1934. EncodeState->channels3or4 = 3;
  1935. EncodeState->bits = 37;
  1936. EncodeState->clusters = 8;
  1937. EncodeState->componentBits = 6;
  1938. }
  1939. else if (blockMode == 3)
  1940. {
  1941. EncodeState->channels3or4 = 3;
  1942. EncodeState->bits = 44;
  1943. EncodeState->clusters = 4;
  1944. EncodeState->componentBits = 7;
  1945. }
  1946. else if (blockMode == 7)
  1947. {
  1948. EncodeState->channels3or4 = 4;
  1949. EncodeState->bits = 42; // (2* (R 5 + G 5 + B 5 + A 5)) + 2 parity bits
  1950. EncodeState->clusters = 4;
  1951. EncodeState->componentBits = 5; // 5 bit components
  1952. }
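// Per-mode parameters set above, for reference:
//   mode 0: 16 partitions, 3 subsets, 4-bit components, 8 clusters, 26 bits
//   mode 1: 64 partitions, 2 subsets, 6-bit components, 8 clusters, 37 bits
//   mode 2: 64 partitions, 3 subsets, 5-bit components, 4 clusters, 30 bits
//   mode 3: 64 partitions, 2 subsets, 7-bit components, 4 clusters, 44 bits
//   mode 7: 64 partitions, 2 subsets, 5-bit components, 4 clusters, 42 bits (RGBA)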
  1953. CGV_FLOAT image_subsets[MAX_SUBSETS][MAX_SUBSET_SIZE][MAX_CHANNELS];
  1954. CGV_INT subset_entryCount[MAX_SUBSETS] = {0, 0, 0};
  1955. // Loop over the available partitions for the block mode and quantize them
  1956. // to figure out the best candidates for further refinement
  1957. CGU_UINT8 mode_partitionsToTry;
  1958. mode_partitionsToTry = get_partitionsToTry(u_BC7Encode, EncodeState->numPartitionModes);
  1959. CGV_UINT8 bestPartition = 0;
  1960. for (CGU_INT mode_blockPartition = 0; mode_blockPartition < mode_partitionsToTry; mode_blockPartition++)
  1961. {
  1962. GetPartitionSubSet_mode01237(
  1963. image_subsets, subset_entryCount, CMP_STATIC_CAST(CGV_UINT8, mode_blockPartition), EncodeState->image_src, blockMode, EncodeState->channels3or4);
  1964. CGV_FLOAT subset_image_src[SOURCE_BLOCK_SIZE * MAX_CHANNELS];
  1965. CGV_UINT8 index_out1[SOURCE_BLOCK_SIZE];
  1966. CGV_FLOAT err_quant = 0.0F;
// Store the quantized error for this partition, to be sorted and processed later
  1968. for (CGU_INT subset = 0; subset < EncodeState->maxSubSets; subset++)
  1969. {
  1970. CGV_INT numEntries = subset_entryCount[subset];
  1971. for (CGU_INT ii = 0; ii < SOURCE_BLOCK_SIZE; ii++)
  1972. {
  1973. subset_image_src[ii + COMP_RED * SOURCE_BLOCK_SIZE] = image_subsets[subset][ii][0];
  1974. subset_image_src[ii + COMP_GREEN * SOURCE_BLOCK_SIZE] = image_subsets[subset][ii][1];
  1975. subset_image_src[ii + COMP_BLUE * SOURCE_BLOCK_SIZE] = image_subsets[subset][ii][2];
  1976. subset_image_src[ii + COMP_ALPHA * SOURCE_BLOCK_SIZE] = image_subsets[subset][ii][3];
  1977. }
  1978. CGV_UINT32 color_index2[2];
  1979. err_quant += GetQuantizeIndex(color_index2, index_out1, subset_image_src, numEntries, EncodeState->clusters, EncodeState->channels3or4);
  1980. for (CGV_INT idx = 0; idx < numEntries; idx++)
  1981. {
  1982. storedBestindex[mode_blockPartition][subset][idx] = index_out1[idx];
  1983. }
  1984. }
  1985. storedError[mode_blockPartition] = err_quant;
  1986. }
  1987. // Sort the results
  1988. sortPartitionProjection(storedError, sortedPartition, mode_partitionsToTry);
  1989. CGV_INT epo_code[MAX_SUBSETS * 2 * MAX_CHANNELS];
  1990. CGV_INT bestEndpoints[MAX_SUBSETS * 2 * MAX_CHANNELS];
  1991. CGV_UINT8 bestindex[MAX_SUBSETS * MAX_SUBSET_SIZE];
  1992. CGV_INT bestEntryCount[MAX_SUBSETS];
  1993. CGV_UINT8 bestindex16[MAX_SUBSET_SIZE];
// Extensive shaking is most important when the ramp is short and we have
// few indices; on a long ramp the quality of the initial quantizing is
// relatively more important. We modulate the number of shake attempts
// according to the number of ramp indices: the more indices we have, the
// less shaking should be required to find a near-optimal match.
  2000. CGU_UINT8 numShakeAttempts = max8(1, min8((CGU_UINT8)floor(8 * u_BC7Encode->quality + 0.5), mode_partitionsToTry));
  2001. CGV_FLOAT err_best = CMP_FLOAT_MAX;
  2002. // Now do the endpoint shaking
  2003. for (CGU_INT nSA = 0; nSA < numShakeAttempts; nSA++)
  2004. {
  2005. CGV_FLOAT err_optimized = 0.0F;
  2006. CGV_UINT8 sortedBlockPartition;
  2007. sortedBlockPartition = sortedPartition[nSA];
  2008. //********************************************
  2009. // Get the partition shape for the given mode
  2010. //********************************************
  2011. GetPartitionSubSet_mode01237(image_subsets, subset_entryCount, sortedBlockPartition, EncodeState->image_src, blockMode, EncodeState->channels3or4);
  2012. //*****************************
  2013. // Process the partition shape
  2014. //*****************************
  2015. for (CGU_INT subset = 0; subset < EncodeState->maxSubSets; subset++)
  2016. {
  2017. CGV_INT numEntries = subset_entryCount[subset];
  2018. CGV_FLOAT src_image_block[SOURCE_BLOCK_SIZE * MAX_CHANNELS];
  2019. CGV_UINT8 index_io[MAX_SUBSET_SIZE];
  2020. CGV_INT tmp_epo_code[8];
  2021. for (CGU_INT k = 0; k < SOURCE_BLOCK_SIZE; k++)
  2022. {
  2023. src_image_block[k + COMP_RED * SOURCE_BLOCK_SIZE] = image_subsets[subset][k][0];
  2024. src_image_block[k + COMP_GREEN * SOURCE_BLOCK_SIZE] = image_subsets[subset][k][1];
  2025. src_image_block[k + COMP_BLUE * SOURCE_BLOCK_SIZE] = image_subsets[subset][k][2];
  2026. src_image_block[k + COMP_ALPHA * SOURCE_BLOCK_SIZE] = image_subsets[subset][k][3];
  2027. }
  2028. for (CGU_INT k = 0; k < MAX_SUBSET_SIZE; k++)
  2029. {
  2030. index_io[k] = storedBestindex[sortedBlockPartition][subset][k];
  2031. }
  2032. err_optimized += optimize_IndexAndEndPoints(index_io,
  2033. tmp_epo_code,
  2034. src_image_block,
  2035. numEntries,
  2036. CMP_STATIC_CAST(CGU_INT8, EncodeState->clusters), // Mi_
  2037. EncodeState->bits,
  2038. EncodeState->channels3or4,
  2039. u_BC7Encode);
  2040. for (CGU_INT k = 0; k < MAX_SUBSET_SIZE; k++)
  2041. {
  2042. storedBestindex[sortedBlockPartition][subset][k] = index_io[k];
  2043. }
  2044. for (CGU_INT ch = 0; ch < MAX_CHANNELS; ch++)
  2045. {
  2046. epo_code[(subset * 2 + 0) * 4 + ch] = tmp_epo_code[ch];
  2047. epo_code[(subset * 2 + 1) * 4 + ch] = tmp_epo_code[4 + ch];
  2048. }
  2049. }
  2050. //****************************************
  2051. // Check if result is better than the last
  2052. //****************************************
  2053. if (err_optimized < err_best)
  2054. {
  2055. bestPartition = sortedBlockPartition;
  2056. CGV_INT bestIndexCount = 0;
  2057. for (CGU_INT subset = 0; subset < EncodeState->maxSubSets; subset++)
  2058. {
  2059. CGV_INT numEntries = subset_entryCount[subset];
  2060. bestEntryCount[subset] = numEntries;
  2061. if (numEntries)
  2062. {
  2063. for (CGU_INT ch = 0; ch < EncodeState->channels3or4; ch++)
  2064. {
  2065. bestEndpoints[(subset * 2 + 0) * 4 + ch] = epo_code[(subset * 2 + 0) * 4 + ch];
  2066. bestEndpoints[(subset * 2 + 1) * 4 + ch] = epo_code[(subset * 2 + 1) * 4 + ch];
  2067. }
  2068. for (CGV_INT k = 0; k < numEntries; k++)
  2069. {
  2070. bestindex[subset * MAX_SUBSET_SIZE + k] = storedBestindex[sortedBlockPartition][subset][k];
  2071. bestindex16[bestIndexCount++] = storedBestindex[sortedBlockPartition][subset][k];
  2072. }
  2073. }
  2074. }
  2075. err_best = err_optimized;
  2076. // Early out if we found we can compress with error below the quality threshold
  2077. if (err_best <= u_BC7Encode->errorThreshold)
  2078. {
  2079. break;
  2080. }
  2081. }
  2082. }
  2083. if (blockMode != 7)
  2084. err_best += EncodeState->opaque_err;
  2085. if (err_best > EncodeState->best_err)
  2086. return;
  2087. //**************************
  2088. // Save the encoded block
  2089. //**************************
  2090. EncodeState->best_err = err_best;
  2091. // Now we have all the data needed to encode the block
  2092. // We need to pack the endpoints prior to encoding
  2093. CGV_UINT32 packedEndpoints[MAX_SUBSETS * 2] = {0, 0, 0, 0, 0, 0};
  2094. for (CGU_INT subset = 0; subset < EncodeState->maxSubSets; subset++)
  2095. {
  2096. packedEndpoints[(subset * 2) + 0] = 0;
  2097. packedEndpoints[(subset * 2) + 1] = 0;
  2098. if (bestEntryCount[subset])
  2099. {
  2100. CGU_UINT32 rightAlignment = 0;
  2101. // Sort out parity bits
  2102. if (blockMode != 2)
  2103. {
  2104. // Sort out BCC parity bits
  2105. packedEndpoints[(subset * 2) + 0] = bestEndpoints[(subset * 2 + 0) * 4 + 0] & 1;
  2106. packedEndpoints[(subset * 2) + 1] = bestEndpoints[(subset * 2 + 1) * 4 + 0] & 1;
  2107. for (CGU_INT ch = 0; ch < EncodeState->channels3or4; ch++)
  2108. {
  2109. bestEndpoints[(subset * 2 + 0) * 4 + ch] >>= 1;
  2110. bestEndpoints[(subset * 2 + 1) * 4 + ch] >>= 1;
  2111. }
  2112. rightAlignment++;
  2113. }
  2114. // Fixup endpoints
  2115. for (CGU_INT ch = 0; ch < EncodeState->channels3or4; ch++)
  2116. {
  2117. packedEndpoints[(subset * 2) + 0] |= bestEndpoints[((subset * 2) + 0) * 4 + ch] << rightAlignment;
  2118. packedEndpoints[(subset * 2) + 1] |= bestEndpoints[((subset * 2) + 1) * 4 + ch] << rightAlignment;
  2119. rightAlignment += EncodeState->componentBits;
  2120. }
  2121. }
  2122. }
  2123. CGV_UINT8 idxCount[3] = {0, 0, 0};
  2124. for (CGV_INT k = 0; k < SOURCE_BLOCK_SIZE; k++)
  2125. {
  2126. CGV_UINT8 partsub = get_partition_subset(bestPartition, EncodeState->maxSubSets, k);
  2127. CGV_UINT8 idxC = idxCount[partsub];
  2128. bestindex16[k] = bestindex[partsub * MAX_SUBSET_SIZE + idxC];
  2129. idxCount[partsub] = idxC + 1;
  2130. }
  2131. Encode_mode01237(blockMode, bestPartition, packedEndpoints, bestindex16, EncodeState->cmp_out);
  2132. }
  2133. void Compress_mode45(CGU_INT blockMode, BC7_EncodeState EncodeState[], uniform CMP_GLOBAL BC7_Encode u_BC7Encode[])
  2134. {
  2135. cmp_mode_parameters best_candidate;
  2136. EncodeState->channels3or4 = 4;
  2137. cmp_memsetBC7((CGV_UINT8*)&best_candidate, 0, sizeof(cmp_mode_parameters));
  2138. if (blockMode == 4)
  2139. {
  2140. EncodeState->max_idxMode = 2;
  2141. EncodeState->modeBits[0] = 30; // bits = 2 * (Red 5+ Grn 5+ blu 5)
  2142. EncodeState->modeBits[1] = 36; // bits = 2 * (Alpha 6+6+6)
  2143. EncodeState->numClusters0[0] = 4;
  2144. EncodeState->numClusters0[1] = 8;
  2145. EncodeState->numClusters1[0] = 8;
  2146. EncodeState->numClusters1[1] = 4;
  2147. }
  2148. else
  2149. {
  2150. EncodeState->max_idxMode = 1;
  2151. EncodeState->modeBits[0] = 42; // bits = 2 * (Red 7+ Grn 7+ blu 7)
  2152. EncodeState->modeBits[1] = 48; // bits = 2 * (Alpha 8+8+8) = 48
  2153. EncodeState->numClusters0[0] = 4;
  2154. EncodeState->numClusters0[1] = 4;
  2155. EncodeState->numClusters1[0] = 4;
  2156. EncodeState->numClusters1[1] = 4;
  2157. }
  2158. CGV_FLOAT src_color_Block[SOURCE_BLOCK_SIZE * MAX_CHANNELS];
  2159. CGV_FLOAT src_alpha_Block[SOURCE_BLOCK_SIZE * MAX_CHANNELS];
// Go through each possible rotation and selection of index bits
  2161. for (CGU_UINT8 rotated_channel = 0; rotated_channel < EncodeState->channels3or4; rotated_channel++)
  2162. {
  2163. // A
  2164. for (CGU_INT k = 0; k < SOURCE_BLOCK_SIZE; k++)
  2165. {
  2166. for (CGU_INT p = 0; p < 3; p++)
  2167. {
  2168. src_color_Block[k + p * SOURCE_BLOCK_SIZE] = EncodeState->image_src[k + componentRotations[rotated_channel][p + 1] * SOURCE_BLOCK_SIZE];
  2169. src_alpha_Block[k + p * SOURCE_BLOCK_SIZE] = EncodeState->image_src[k + componentRotations[rotated_channel][0] * SOURCE_BLOCK_SIZE];
  2170. }
  2171. }
  2172. CGV_FLOAT err_quantizer;
  2173. CGV_FLOAT err_bestQuantizer = CMP_FLOAT_MAX;
  2174. for (CGU_INT idxMode = 0; idxMode < EncodeState->max_idxMode; idxMode++)
  2175. {
  2176. // B
  2177. CGV_UINT32 color_index2[2]; // reserved .. Not used!
  2178. err_quantizer =
  2179. GetQuantizeIndex(color_index2, best_candidate.color_index, src_color_Block, SOURCE_BLOCK_SIZE, EncodeState->numClusters0[idxMode], 3);
  2180. err_quantizer +=
  2181. GetQuantizeIndex(color_index2, best_candidate.alpha_index, src_alpha_Block, SOURCE_BLOCK_SIZE, EncodeState->numClusters1[idxMode], 3) / 3.0F;
// If quality is high, run the full shaking for this config and store the
// result if it beats the best overall error; otherwise only run the shaking
// if the error is better than the best quantizer error.
  2186. if (err_quantizer <= err_bestQuantizer)
  2187. {
  2188. err_bestQuantizer = err_quantizer;
  2189. // Shake size gives the size of the shake cube
  2190. CGV_FLOAT err_overallError;
  2191. err_overallError = optimize_IndexAndEndPoints(best_candidate.color_index,
  2192. best_candidate.color_qendpoint,
  2193. src_color_Block,
  2194. SOURCE_BLOCK_SIZE,
  2195. EncodeState->numClusters0[idxMode],
  2196. CMP_STATIC_CAST(CGU_UINT8, EncodeState->modeBits[0]),
  2197. 3,
  2198. u_BC7Encode);
  2199. // Alpha scalar block
  2200. err_overallError += optimize_IndexAndEndPoints(best_candidate.alpha_index,
  2201. best_candidate.alpha_qendpoint,
  2202. src_alpha_Block,
  2203. SOURCE_BLOCK_SIZE,
  2204. EncodeState->numClusters1[idxMode],
  2205. CMP_STATIC_CAST(CGU_UINT8, EncodeState->modeBits[1]),
  2206. 3,
  2207. u_BC7Encode) / 3.0f;
  2208. // If we beat the previous best then encode the block
  2209. if (err_overallError < EncodeState->best_err)
  2210. {
  2211. best_candidate.idxMode = idxMode;
  2212. best_candidate.rotated_channel = rotated_channel;
  2213. if (blockMode == 4)
  2214. Encode_mode4(EncodeState->cmp_out, &best_candidate);
  2215. else
  2216. Encode_mode5(EncodeState->cmp_out, &best_candidate);
  2217. EncodeState->best_err = err_overallError;
  2218. }
  2219. }
  2220. } // B
  2221. } // A
  2222. }
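// Note on the weighting in Compress_mode45 above: the alpha block replicates
// the scalar channel across three planes, so its quantize/optimize error is
// divided by 3.0 to stay comparable with the genuine three-channel colour
// error.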
  2223. void Compress_mode6(BC7_EncodeState EncodeState[], uniform CMP_GLOBAL BC7_Encode u_BC7Encode[])
  2224. {
  2225. CGV_FLOAT err;
  2226. CGV_INT epo_code_out[8] = {0};
  2227. CGV_UINT8 best_index_out[MAX_SUBSET_SIZE];
  2228. CGV_UINT32 best_packedindex_out[2];
  2229. // CGV_FLOAT block_endpoints[8];
  2230. // icmp_get_block_endpoints(block_endpoints, EncodeState->image_src, -1, 4);
  2231. // icmp_GetQuantizedEpoCode(epo_code_out, block_endpoints, 6,4);
  2232. // err = icmp_GetQuantizeIndex(best_packedindex_out, best_index_out, EncodeState->image_src, 4, block_endpoints, 0,4);
  2233. err = GetQuantizeIndex(best_packedindex_out,
  2234. best_index_out,
  2235. EncodeState->image_src,
  2236. 16, // numEntries
  2237. 16, // clusters
  2238. 4); // channels3or4
  2239. //*****************************
  2240. // Process the partition shape
  2241. //*****************************
  2242. err = optimize_IndexAndEndPoints(best_index_out,
  2243. epo_code_out,
  2244. EncodeState->image_src,
  2245. 16, //numEntries
  2246. 16, // Mi_ = clusters
  2247. 58, // bits
  2248. 4, // channels3or4
  2249. u_BC7Encode);
  2250. //**************************
  2251. // Save the encoded block
  2252. //**************************
  2253. if (err < EncodeState->best_err)
  2254. {
  2255. EncodeState->best_err = err;
  2256. Encode_mode6(best_index_out, epo_code_out, EncodeState->cmp_out);
  2257. }
  2258. }
  2259. void copy_BC7_Encode_settings(BC7_EncodeState EncodeState[], uniform CMP_GLOBAL BC7_Encode settings[])
  2260. {
  2261. EncodeState->best_err = CMP_FLOAT_MAX;
  2262. EncodeState->validModeMask = settings->validModeMask;
  2263. #ifdef USE_ICMP
  2264. EncodeState->part_count = settings->part_count;
  2265. EncodeState->channels = settings->channels;
  2266. #endif
  2267. }
  2268. //===================================== COMPRESS CODE =============================================
  2269. #ifdef USE_ICMP
  2270. #include "external/bc7_icmp.h"
  2271. #endif
  2272. bool notValidBlockForMode(CGU_UINT32 blockMode, CGU_BOOL blockNeedsAlpha, CGU_BOOL blockAlphaZeroOne, uniform CMP_GLOBAL BC7_Encode u_BC7Encode[])
  2273. {
  2274. // Do we need to skip alpha processing blocks
  2275. if ((blockNeedsAlpha == FALSE) && (blockMode > 3))
  2276. {
  2277. return TRUE;
  2278. }
  2279. // Optional restriction for colour-only blocks so that they
  2280. // don't use modes that have combined colour+alpha - this
  2281. // avoids the possibility that the encoder might choose an
  2282. // alpha other than 1.0 (due to parity) and cause something to
// become accidentally slightly transparent (it's possible that, when
// encoding 3-component textures, applications will assume the 4th component
// is safely 1.0 all the time)
  2286. if ((blockNeedsAlpha == FALSE) && (u_BC7Encode->colourRestrict == TRUE) && ((blockMode == 6) || (blockMode == 7)))
  2287. { // COMBINED_ALPHA
  2288. return TRUE;
  2289. }
  2290. // Optional restriction for blocks with alpha to avoid issues with
  2291. // punch-through or thresholded alpha encoding
  2292. if ((blockNeedsAlpha == TRUE) && (u_BC7Encode->alphaRestrict == TRUE) && (blockAlphaZeroOne == TRUE) && ((blockMode == 6) || (blockMode == 7)))
  2293. { // COMBINED_ALPHA
  2294. return TRUE;
  2295. }
  2296. return FALSE;
  2297. }
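// Example: a fully opaque block (blockNeedsAlpha == FALSE) skips the alpha
// modes 4..7; a block that does need alpha and contains exact 0 or 255
// values additionally skips the combined colour+alpha modes 6 and 7 when
// alphaRestrict is set, avoiding punch-through artefacts.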
  2298. void BC7_CompressBlock(BC7_EncodeState EncodeState[], uniform CMP_GLOBAL BC7_Encode u_BC7Encode[])
  2299. {
  2300. #ifdef USE_NEW_SINGLE_HEADER_INTERFACES
  2301. CGV_Vec4f image_src[16];
  2302. //int px = 0;
  2303. for (int i = 0; i < 16; i++)
  2304. {
  2305. image_src[i].x = EncodeState->image_src[i];
  2306. image_src[i].y = EncodeState->image_src[i + 16];
  2307. image_src[i].z = EncodeState->image_src[i + 32];
  2308. image_src[i].w = EncodeState->image_src[i + 48];
  2309. }
  2310. CGU_Vec4ui cmp = CompressBlockBC7_UNORM(image_src, u_BC7Encode->quality);
  2311. //EncodeState->cmp_isout16Bytes = true;
  2312. //EncodeState->cmp_out[0] = cmp.x & 0xFF;
  2313. //EncodeState->cmp_out[1] = (cmp.x >> 8) & 0xFF;
  2314. //EncodeState->cmp_out[2] = (cmp.x >> 16) & 0xFF;
  2315. //EncodeState->cmp_out[3] = (cmp.x >> 24) & 0xFF;
  2316. //EncodeState->cmp_out[4] = cmp.y & 0xFF;
  2317. //EncodeState->cmp_out[5] = (cmp.y >> 8) & 0xFF;
  2318. //EncodeState->cmp_out[6] = (cmp.y >> 16) & 0xFF;
  2319. //EncodeState->cmp_out[7] = (cmp.y >> 24) & 0xFF;
  2320. //EncodeState->cmp_out[8] = cmp.z & 0xFF;
  2321. //EncodeState->cmp_out[9] = (cmp.z >> 8) & 0xFF;
  2322. //EncodeState->cmp_out[10] = (cmp.z >> 16) & 0xFF;
  2323. //EncodeState->cmp_out[11] = (cmp.z >> 24) & 0xFF;
  2324. //EncodeState->cmp_out[12] = cmp.w & 0xFF;
  2325. //EncodeState->cmp_out[13] = (cmp.w >> 8) & 0xFF;
  2326. //EncodeState->cmp_out[14] = (cmp.w >> 16) & 0xFF;
  2327. //EncodeState->cmp_out[15] = (cmp.w >> 24) & 0xFF;
  2328. EncodeState->cmp_isout16Bytes = false;
  2329. EncodeState->best_cmp_out[0] = cmp.x;
  2330. EncodeState->best_cmp_out[1] = cmp.y;
  2331. EncodeState->best_cmp_out[2] = cmp.z;
  2332. EncodeState->best_cmp_out[3] = cmp.w;
  2333. return;
  2334. #else
  2335. CGU_BOOL blockNeedsAlpha = FALSE;
  2336. CGU_BOOL blockAlphaZeroOne = FALSE;
  2337. CGV_FLOAT alpha_err = 0.0f;
  2338. CGV_FLOAT alpha_min = 255.0F;
  2339. for (CGU_INT k = 0; k < SOURCE_BLOCK_SIZE; k++)
  2340. {
  2341. if (EncodeState->image_src[k + COMP_ALPHA * SOURCE_BLOCK_SIZE] < alpha_min)
  2342. alpha_min = EncodeState->image_src[k + COMP_ALPHA * SOURCE_BLOCK_SIZE];
  2343. alpha_err += sq_image(EncodeState->image_src[k + COMP_ALPHA * SOURCE_BLOCK_SIZE] - 255.0F);
  2344. if (blockAlphaZeroOne == FALSE)
  2345. {
  2346. if ((EncodeState->image_src[k + COMP_ALPHA * SOURCE_BLOCK_SIZE] == 255.0F) || (EncodeState->image_src[k + COMP_ALPHA * SOURCE_BLOCK_SIZE] == 0.0F))
  2347. {
  2348. blockAlphaZeroOne = TRUE;
  2349. }
  2350. }
  2351. }
  2352. if (alpha_min != 255.0F)
  2353. {
  2354. blockNeedsAlpha = TRUE;
  2355. }
  2356. EncodeState->best_err = CMP_FLOAT_MAX;
  2357. EncodeState->opaque_err = alpha_err;
  2358. #ifdef USE_ICMP
  2359. EncodeState->refineIterations = 4;
  2360. EncodeState->fastSkipTreshold = 4;
  2361. EncodeState->channels = 4;
  2362. EncodeState->part_count = 64;
  2363. EncodeState->cmp_isout16Bytes = FALSE;
  2364. #else
  2365. EncodeState->cmp_isout16Bytes = TRUE;
  2366. #endif
  2367. // We change the order in which we visit the block modes to try to maximize the chance
  2368. // that we manage to early out as quickly as possible.
  2369. // This is a significant performance optimization for the lower quality modes where the
  2370. // exit threshold is higher, and also tends to improve quality (as the generally higher quality
  2371. // modes are now enumerated earlier, so the first encoding that passes the threshold will
  2372. // tend to pass by a greater margin than if we used a dumb ordering, and thus overall error will
  2373. // be improved)
  2374. CGU_INT blockModeOrder[NUM_BLOCK_TYPES] = {4, 6, 1, 3, 0, 2, 7, 5};
  2375. // used for debugging and mode tests
  2376. // 76543210
  2377. // u_BC7Encode->validModeMask = 0b01000000;
  2378. for (CGU_INT block = 0; block < NUM_BLOCK_TYPES; block++)
  2379. {
  2380. CGU_INT blockMode = blockModeOrder[block];
  2381. if (u_BC7Encode->quality < BC7_qFAST_THRESHOLD)
  2382. {
  2383. if (notValidBlockForMode(blockMode, blockNeedsAlpha, blockAlphaZeroOne, u_BC7Encode))
  2384. continue;
  2385. }
  2386. CGU_INT Mode = 0x0001 << blockMode;
  2387. if (!(u_BC7Encode->validModeMask & Mode))
  2388. continue;
  2389. switch (blockMode)
  2390. {
  2391. // image processing with no alpha
  2392. case 0:
  2393. #ifdef USE_ICMP
  2394. icmp_mode02(EncodeState);
  2395. #else
  2396. Compress_mode01237(blockMode, EncodeState, u_BC7Encode);
  2397. #endif
  2398. break;
  2399. case 1:
  2400. #ifdef USE_ICMP
  2401. icmp_mode13(EncodeState);
  2402. #else
  2403. Compress_mode01237(blockMode, EncodeState, u_BC7Encode);
  2404. #endif
  2405. break;
  2406. case 2:
  2407. #ifdef USE_ICMP
  2408. icmp_mode13(EncodeState);
  2409. #else
  2410. Compress_mode01237(blockMode, EncodeState, u_BC7Encode);
  2411. #endif
  2412. break;
  2413. case 3:
  2414. #ifdef USE_ICMP
  2415. icmp_mode13(EncodeState);
  2416. #else
  2417. Compress_mode01237(blockMode, EncodeState, u_BC7Encode);
  2418. #endif
  2419. break;
  2420. // image processing with alpha
  2421. case 4:
  2422. #ifdef USE_ICMP
  2423. icmp_mode4(EncodeState);
  2424. #else
  2425. Compress_mode45(blockMode, EncodeState, u_BC7Encode);
  2426. #endif
  2427. break;
  2428. case 5:
  2429. #ifdef USE_ICMP
  2430. icmp_mode5(EncodeState);
  2431. #else
  2432. Compress_mode45(blockMode, EncodeState, u_BC7Encode);
  2433. #endif
  2434. break;
  2435. case 6:
  2436. #ifdef USE_ICMP
  2437. icmp_mode6(EncodeState);
  2438. #else
  2439. Compress_mode6(EncodeState, u_BC7Encode);
  2440. #endif
  2441. break;
  2442. case 7:
  2443. #ifdef USE_ICMP
  2444. icmp_mode7(EncodeState);
  2445. #else
  2446. Compress_mode01237(blockMode, EncodeState, u_BC7Encode);
  2447. #endif
  2448. break;
  2449. }
  2450. // Early out if we found we can compress with error below the quality threshold
  2451. if (EncodeState->best_err <= u_BC7Encode->errorThreshold)
  2452. {
  2453. break;
  2454. }
  2455. }
  2456. #endif
  2457. }
  2458. //====================================== BC7_ENCODECLASS END =============================================
  2459. #ifndef ASPM_GPU
  2460. INLINE void load_block_interleaved_rgba2(CGV_FLOAT image_src[64], uniform texture_surface* uniform src, CGV_INT block_xx, CGU_INT block_yy)
  2461. {
  2462. for (CGU_INT y = 0; y < 4; y++)
  2463. for (CGU_INT x = 0; x < 4; x++)
  2464. {
  2465. CGU_UINT32* uniform src_ptr = (CGV_UINT32*)&src->ptr[(block_yy * 4 + y) * src->stride];
  2466. #ifdef USE_VARYING
  2467. CGV_UINT32 rgba = gather_partid(src_ptr, block_xx * 4 + x);
  2468. image_src[16 * 0 + y * 4 + x] = (CGV_FLOAT)((rgba >> 0) & 255);
  2469. image_src[16 * 1 + y * 4 + x] = (CGV_FLOAT)((rgba >> 8) & 255);
  2470. image_src[16 * 2 + y * 4 + x] = (CGV_FLOAT)((rgba >> 16) & 255);
  2471. image_src[16 * 3 + y * 4 + x] = (CGV_FLOAT)((rgba >> 24) & 255);
  2472. #else
  2473. CGV_UINT32 rgba = src_ptr[block_xx * 4 + x];
  2474. image_src[16 * 0 + y * 4 + x] = (CGU_FLOAT)((rgba >> 0) & 255);
  2475. image_src[16 * 1 + y * 4 + x] = (CGU_FLOAT)((rgba >> 8) & 255);
  2476. image_src[16 * 2 + y * 4 + x] = (CGU_FLOAT)((rgba >> 16) & 255);
  2477. image_src[16 * 3 + y * 4 + x] = (CGU_FLOAT)((rgba >> 24) & 255);
  2478. #endif
  2479. }
  2480. }
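// Layout note: image_src is channel-planar -- 16 red values, then 16 green,
// then 16 blue, then 16 alpha -- matching the k + ch * SOURCE_BLOCK_SIZE
// indexing used throughout the encoder.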
#if defined(CMP_USE_FOREACH_ASPM) || defined(USE_VARYING)
INLINE void scatter_uint2(CGU_UINT32* ptr, CGV_INT idx, CGV_UINT32 value)
{
    ptr[idx] = value;  // (perf warning expected)
}
#endif
INLINE void store_data_uint32(CGU_UINT8 dst[], CGU_INT width, CGV_INT v_xx, CGU_INT yy, CGV_UINT32 data[], CGU_INT data_size)
{
    for (CGU_INT k = 0; k < data_size; k++)
    {
        CGU_UINT32* dst_ptr = (CGU_UINT32*)&dst[(yy)*width * data_size];
#ifdef USE_VARYING
        scatter_uint2(dst_ptr, v_xx * data_size + k, data[k]);
#else
        dst_ptr[v_xx * data_size + k] = data[k];
#endif
    }
}
#ifdef USE_VARYING
INLINE void scatter_uint8(CGU_UINT8* ptr, CGV_UINT32 idx, CGV_UINT8 value)
{
    ptr[idx] = value;  // (perf warning expected)
}
#endif
INLINE void store_data_uint8(CGU_UINT8 u_dstptr[], CGU_INT src_width, CGU_INT block_x, CGU_INT block_y, CGV_UINT8 data[], CGU_INT data_size)
{
    for (CGU_INT k = 0; k < data_size; k++)
    {
#ifdef USE_VARYING
        CGU_UINT8* dst_blockptr = (CGU_UINT8*)&u_dstptr[(block_y * src_width * 4)];
        scatter_uint8(dst_blockptr, k + (block_x * data_size), data[k]);
#else
        u_dstptr[(block_y * src_width * 4) + k + (block_x * data_size)] = data[k];
#endif
    }
}
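// Destination layout note: one row of 4x4 blocks spans (src_width / 4) blocks of
// data_size bytes each, i.e. src_width * 4 bytes when data_size == 16. Block
// (block_x, block_y) therefore starts at byte offset
// block_y * src_width * 4 + block_x * data_size; for example, with
// src_width == 256 and data_size == 16, block (3, 2) lands at 2 * 1024 + 48 = 2096.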
INLINE void store_data_uint32(CGU_UINT8 dst[], CGV_UINT32 width, CGU_INT v_xx, CGU_INT yy, CGV_UINT8 data[], CGU_INT data_size)
{
    for (CGU_INT k = 0; k < data_size; k++)
    {
#if defined(CMP_USE_FOREACH_ASPM) || defined(USE_VARYING)
        CGU_UINT32* dst_ptr = (CGU_UINT32*)&dst[(yy)*width * data_size];
        scatter_uint2(dst_ptr, v_xx * data_size + k, data[k]);
#else
        dst[((yy)*width * data_size) + v_xx * data_size + k] = data[k];
#endif
    }
}
void CompressBlockBC7_XY(uniform texture_surface u_srcptr[], CGU_INT block_x, CGU_INT block_y, CGU_UINT8 u_dst[], uniform BC7_Encode u_settings[])
{
    BC7_EncodeState _state;
    varying BC7_EncodeState* uniform state = &_state;
    copy_BC7_Encode_settings(state, u_settings);
    load_block_interleaved_rgba2(state->image_src, u_srcptr, block_x, block_y);
    BC7_CompressBlock(state, u_settings);
    if (state->cmp_isout16Bytes)
        store_data_uint8(u_dst, u_srcptr->width, block_x, block_y, state->cmp_out, 16);
    else
        store_data_uint32(u_dst, u_srcptr->width, block_x, block_y, state->best_cmp_out, 4);
}
CMP_EXPORT void CompressBlockBC7_encode(uniform texture_surface src[], CGU_UINT8 dst[], uniform BC7_Encode settings[])
{
    // bc7_isa(); ASPM_PRINT(("ASPM encode [%d,%d]\n",bc7_isa(),src->width,src->height));
    for (CGU_INT u_yy = 0; u_yy < src->height / 4; u_yy++)
#ifdef CMP_USE_FOREACH_ASPM
        foreach (v_xx = 0 ... src->width / 4)
        {
#else
        for (CGV_INT v_xx = 0; v_xx < src->width / 4; v_xx++)
        {
#endif
            CompressBlockBC7_XY(src, v_xx, u_yy, dst, settings);
        }
}
#endif
#ifndef ASPM_GPU
#ifndef ASPM
//======================= DECOMPRESS =========================================
#ifndef USE_HIGH_PRECISION_INTERPOLATION_BC7
CGU_UINT16 aWeight2[] = {0, 21, 43, 64};
CGU_UINT16 aWeight3[] = {0, 9, 18, 27, 37, 46, 55, 64};
CGU_UINT16 aWeight4[] = {0, 4, 9, 13, 17, 21, 26, 30, 34, 38, 43, 47, 51, 55, 60, 64};
CGU_UINT8 interpolate(CGU_UINT8 e0, CGU_UINT8 e1, CGU_UINT8 index, CGU_UINT8 indexprecision)
{
    if (indexprecision == 2)
        return (CGU_UINT8)(((64 - aWeight2[index]) * CGU_UINT16(e0) + aWeight2[index] * CGU_UINT16(e1) + 32) >> 6);
    else if (indexprecision == 3)
        return (CGU_UINT8)(((64 - aWeight3[index]) * CGU_UINT16(e0) + aWeight3[index] * CGU_UINT16(e1) + 32) >> 6);
    else  // indexprecision == 4
        return (CGU_UINT8)(((64 - aWeight4[index]) * CGU_UINT16(e0) + aWeight4[index] * CGU_UINT16(e1) + 32) >> 6);
}
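// Worked example: the weights are 6-bit fixed point, so the blend is
// (e0 * (64 - w) + e1 * w + 32) >> 6, with the +32 rounding to nearest.
// For e0 = 0, e1 = 255, index = 1 at 2-bit precision (w = aWeight2[1] = 21):
// (0 * 43 + 255 * 21 + 32) >> 6 = 5387 >> 6 = 84.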
#endif
void GetBC7Ramp(CGU_UINT32 endpoint[][MAX_DIMENSION_BIG],
                CGU_FLOAT ramp[MAX_DIMENSION_BIG][(1 << MAX_INDEX_BITS)],
                CGU_UINT32 clusters[2],
                CGU_UINT32 componentBits[MAX_DIMENSION_BIG])
{
    CGU_UINT32 ep[2][MAX_DIMENSION_BIG];
    CGU_UINT32 i;
    // Expand each endpoint component to 8 bits by shifting the MSB to bit 7
    // and then replicating the high bits to the low bits revealed by
    // the shift
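    // e.g. a 5-bit endpoint value 0b10110 (22) becomes 0b10110000 (176) after
    // the shift; adding 176 >> 5 = 0b101 replicates the top bits, giving 181.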
    for (i = 0; i < MAX_DIMENSION_BIG; i++)
    {
        ep[0][i] = 0;
        ep[1][i] = 0;
        if (componentBits[i])
        {
            ep[0][i] = (CGU_UINT32)(endpoint[0][i] << (8 - componentBits[i]));
            ep[1][i] = (CGU_UINT32)(endpoint[1][i] << (8 - componentBits[i]));
            ep[0][i] += (CGU_UINT32)(ep[0][i] >> componentBits[i]);
            ep[1][i] += (CGU_UINT32)(ep[1][i] >> componentBits[i]);
            ep[0][i] = min8(255, max8(0, CMP_STATIC_CAST(CGU_UINT8, ep[0][i])));
            ep[1][i] = min8(255, max8(0, CMP_STATIC_CAST(CGU_UINT8, ep[1][i])));
        }
    }
    // If this block type has no explicit alpha channel
    // then make sure alpha is 1.0 for all points on the ramp
    if (!componentBits[COMP_ALPHA])
    {
        ep[0][COMP_ALPHA] = ep[1][COMP_ALPHA] = 255;
    }
    CGU_UINT32 rampIndex = clusters[0];
    rampIndex = (CGU_UINT32)(log((double)rampIndex) / log(2.0));  // log2 of the cluster count, i.e. the index precision in bits
    // Generate colours for the RGB ramp
    for (i = 0; i < clusters[0]; i++)
    {
#ifdef USE_HIGH_PRECISION_INTERPOLATION_BC7
        ramp[COMP_RED][i] =
            (CGU_FLOAT)floor((ep[0][COMP_RED] * (1.0 - rampLerpWeightsBC7[rampIndex][i])) + (ep[1][COMP_RED] * rampLerpWeightsBC7[rampIndex][i]) + 0.5);
        ramp[COMP_RED][i] = bc7_minf(255.0, bc7_maxf(0., ramp[COMP_RED][i]));
        ramp[COMP_GREEN][i] =
            (CGU_FLOAT)floor((ep[0][COMP_GREEN] * (1.0 - rampLerpWeightsBC7[rampIndex][i])) + (ep[1][COMP_GREEN] * rampLerpWeightsBC7[rampIndex][i]) + 0.5);
        ramp[COMP_GREEN][i] = bc7_minf(255.0, bc7_maxf(0., ramp[COMP_GREEN][i]));
        ramp[COMP_BLUE][i] =
            (CGU_FLOAT)floor((ep[0][COMP_BLUE] * (1.0 - rampLerpWeightsBC7[rampIndex][i])) + (ep[1][COMP_BLUE] * rampLerpWeightsBC7[rampIndex][i]) + 0.5);
        ramp[COMP_BLUE][i] = bc7_minf(255.0, bc7_maxf(0., ramp[COMP_BLUE][i]));
#else
        ramp[COMP_RED][i] = interpolate(ep[0][COMP_RED], ep[1][COMP_RED], i, rampIndex);
        ramp[COMP_GREEN][i] = interpolate(ep[0][COMP_GREEN], ep[1][COMP_GREEN], i, rampIndex);
        ramp[COMP_BLUE][i] = interpolate(ep[0][COMP_BLUE], ep[1][COMP_BLUE], i, rampIndex);
#endif
    }
    rampIndex = clusters[1];
    rampIndex = (CGU_UINT32)(log((CGU_FLOAT)rampIndex) / log(2.0));
    if (!componentBits[COMP_ALPHA])
    {
        for (i = 0; i < clusters[1]; i++)
        {
            ramp[COMP_ALPHA][i] = 255.;
        }
    }
    else
    {
        // Generate alphas
        for (i = 0; i < clusters[1]; i++)
        {
#ifdef USE_HIGH_PRECISION_INTERPOLATION_BC7
            ramp[COMP_ALPHA][i] =
                (CGU_FLOAT)floor((ep[0][COMP_ALPHA] * (1.0 - rampLerpWeightsBC7[rampIndex][i])) + (ep[1][COMP_ALPHA] * rampLerpWeightsBC7[rampIndex][i]) + 0.5);
            ramp[COMP_ALPHA][i] = bc7_minf(255.0, bc7_maxf(0., ramp[COMP_ALPHA][i]));
#else
            ramp[COMP_ALPHA][i] = interpolate(ep[0][COMP_ALPHA], ep[1][COMP_ALPHA], i, rampIndex);
#endif
        }
    }
}
//
// Bit reader - reads one bit from a buffer at the current bit offset
// and increments the offset
//
CGU_UINT32 ReadBit(const CGU_UINT8 base[], CGU_UINT32& m_bitPosition)
{
    int byteLocation;
    int remainder;
    CGU_UINT32 bit = 0;
    byteLocation = m_bitPosition / 8;
    remainder = m_bitPosition % 8;
    bit = base[byteLocation];
    bit >>= remainder;
    bit &= 0x1;
    // Increment bit position
    m_bitPosition++;
    return (bit);
}
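// Fields wider than one bit are assembled LSB-first by repeated calls, as the
// decoders below do inline. A minimal sketch of that pattern (hypothetical
// helper, not part of this file):
//
//     CGU_UINT32 ReadBits(const CGU_UINT8 base[], CGU_UINT32& bitPos, CGU_UINT32 count)
//     {
//         CGU_UINT32 value = 0;
//         for (CGU_UINT32 k = 0; k < count; k++)
//             value |= ReadBit(base, bitPos) << k;  // bit k of the field
//         return value;
//     }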
void DecompressDualIndexBlock(CGU_UINT8 out[MAX_SUBSET_SIZE][MAX_DIMENSION_BIG],
                              const CGU_UINT8 in[COMPRESSED_BLOCK_SIZE],
                              CGU_UINT32 endpoint[2][MAX_DIMENSION_BIG],
                              CGU_UINT32& m_bitPosition,
                              CGU_UINT32 m_rotation,
                              CGU_UINT32 m_blockMode,
                              CGU_UINT32 m_indexSwap,
                              CGU_UINT32 m_componentBits[MAX_DIMENSION_BIG])
{
    CGU_UINT32 i, j, k;
    CGU_FLOAT ramp[MAX_DIMENSION_BIG][1 << MAX_INDEX_BITS];
    CGU_UINT32 blockIndices[2][MAX_SUBSET_SIZE];
    CGU_UINT32 clusters[2];
    clusters[0] = 1 << bti[m_blockMode].indexBits[0];
    clusters[1] = 1 << bti[m_blockMode].indexBits[1];
    if (m_indexSwap)
    {
        CGU_UINT32 temp = clusters[0];
        clusters[0] = clusters[1];
        clusters[1] = temp;
    }
    GetBC7Ramp(endpoint, ramp, clusters, m_componentBits);
    // Extract the indices
    for (i = 0; i < 2; i++)
    {
        for (j = 0; j < MAX_SUBSET_SIZE; j++)
        {
            blockIndices[i][j] = 0;
            // The fixup index (pixel 0) has an implicit zero MSB, so read one less bit
            if (j == 0)
            {
                blockIndices[i][j] &= ~(1 << (bti[m_blockMode].indexBits[i] - 1U));
                for (k = 0; k < static_cast<CGU_UINT32>(bti[m_blockMode].indexBits[i] - 1); k++)
                {
                    blockIndices[i][j] |= (CGU_UINT32)ReadBit(in, m_bitPosition) << k;
                }
            }
            else
            {
                for (k = 0; k < bti[m_blockMode].indexBits[i]; k++)
                {
                    blockIndices[i][j] |= (CGU_UINT32)ReadBit(in, m_bitPosition) << k;
                }
            }
        }
    }
    // Generate block colours
    for (i = 0; i < MAX_SUBSET_SIZE; i++)
    {
        out[i][COMP_ALPHA] = (CGU_UINT8)ramp[COMP_ALPHA][blockIndices[m_indexSwap ^ 1][i]];
        out[i][COMP_RED] = (CGU_UINT8)ramp[COMP_RED][blockIndices[m_indexSwap][i]];
        out[i][COMP_GREEN] = (CGU_UINT8)ramp[COMP_GREEN][blockIndices[m_indexSwap][i]];
        out[i][COMP_BLUE] = (CGU_UINT8)ramp[COMP_BLUE][blockIndices[m_indexSwap][i]];
    }
    // Resolve the component rotation
    CGU_UINT8 swap;  // unsigned: component values may exceed 127
    for (i = 0; i < MAX_SUBSET_SIZE; i++)
    {
        switch (m_rotation)
        {
        case 0:
            // Do nothing
            break;
        case 1:
            // Swap A and R
            swap = out[i][COMP_ALPHA];
            out[i][COMP_ALPHA] = out[i][COMP_RED];
            out[i][COMP_RED] = swap;
            break;
        case 2:
            // Swap A and G
            swap = out[i][COMP_ALPHA];
            out[i][COMP_ALPHA] = out[i][COMP_GREEN];
            out[i][COMP_GREEN] = swap;
            break;
        case 3:
            // Swap A and B
            swap = out[i][COMP_ALPHA];
            out[i][COMP_ALPHA] = out[i][COMP_BLUE];
            out[i][COMP_BLUE] = swap;
            break;
        }
    }
}
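// Rotation note: modes 4 and 5 may rotate one colour channel into the alpha
// channel before encoding so that channel is coded through the separate scalar
// (alpha) endpoints. The swaps above simply undo that rotation on decode, e.g.
// rotation 1 restores R from the decoded alpha ramp and vice versa.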
void DecompressBC7_internal(CGU_UINT8 out[MAX_SUBSET_SIZE][MAX_DIMENSION_BIG], const CGU_UINT8 in[COMPRESSED_BLOCK_SIZE], const BC7_Encode* u_BC7Encode)
{
    (void)u_BC7Encode;  // settings are not needed for decode; parameter kept for interface symmetry
    CGU_UINT32 i, j;
    CGU_UINT32 blockIndices[MAX_SUBSET_SIZE];
    CGU_UINT32 endpoint[MAX_SUBSETS][2][MAX_DIMENSION_BIG];
    CGU_UINT32 m_blockMode;
    CGU_UINT32 m_partition;
    CGU_UINT32 m_rotation;
    CGU_UINT32 m_indexSwap;
    CGU_UINT32 m_bitPosition;
    CGU_UINT32 m_componentBits[MAX_DIMENSION_BIG];
    m_blockMode = 0;
    m_partition = 0;
    m_rotation = 0;
    m_indexSwap = 0;
    // Position the read pointer at the LSB of the block
    m_bitPosition = 0;
    while (!ReadBit(in, m_bitPosition) && (m_blockMode < 8))
    {
        m_blockMode++;
    }
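    // The mode number is a unary prefix: m zero bits followed by a one bit.
    // e.g. a mode-6 block starts with byte 0x40 (read LSB-first: six zeros,
    // then the set bit), so the loop above leaves m_blockMode == 6.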
    if (m_blockMode > 7)
    {
        // Something really bad happened...
        return;
    }
    for (i = 0; i < bti[m_blockMode].rotationBits; i++)
    {
        m_rotation |= ReadBit(in, m_bitPosition) << i;
    }
    for (i = 0; i < bti[m_blockMode].indexModeBits; i++)
    {
        m_indexSwap |= ReadBit(in, m_bitPosition) << i;
    }
    for (i = 0; i < bti[m_blockMode].partitionBits; i++)
    {
        m_partition |= ReadBit(in, m_bitPosition) << i;
    }
    if (bti[m_blockMode].encodingType == NO_ALPHA)
    {
        m_componentBits[COMP_ALPHA] = 0;
        m_componentBits[COMP_RED] = m_componentBits[COMP_GREEN] = m_componentBits[COMP_BLUE] = bti[m_blockMode].vectorBits / 3;
    }
    else if (bti[m_blockMode].encodingType == COMBINED_ALPHA)
    {
        m_componentBits[COMP_ALPHA] = m_componentBits[COMP_RED] = m_componentBits[COMP_GREEN] = m_componentBits[COMP_BLUE] = bti[m_blockMode].vectorBits / 4;
    }
    else if (bti[m_blockMode].encodingType == SEPARATE_ALPHA)
    {
        m_componentBits[COMP_ALPHA] = bti[m_blockMode].scalarBits;
        m_componentBits[COMP_RED] = m_componentBits[COMP_GREEN] = m_componentBits[COMP_BLUE] = bti[m_blockMode].vectorBits / 3;
    }
    CGU_UINT32 subset, ep, component;
    // Endpoints are stored in the following order RRRR GGGG BBBB (AAAA) (PPPP)
    // i.e. components are packed together
    // Loop over components
    for (component = 0; component < MAX_DIMENSION_BIG; component++)
    {
        // loop over subsets
        for (subset = 0; subset < (int)bti[m_blockMode].subsetCount; subset++)
        {
            // Loop over endpoints
            for (ep = 0; ep < 2; ep++)
            {
                endpoint[subset][ep][component] = 0;
                for (j = 0; j < m_componentBits[component]; j++)
                {
                    endpoint[subset][ep][component] |= ReadBit(in, m_bitPosition) << j;
                }
            }
        }
    }
    // Now get any parity bits
    if (bti[m_blockMode].pBitType != NO_PBIT)
    {
        for (subset = 0; subset < (int)bti[m_blockMode].subsetCount; subset++)
        {
            CGU_UINT32 pBit[2];
            if (bti[m_blockMode].pBitType == ONE_PBIT)
            {
                pBit[0] = ReadBit(in, m_bitPosition);
                pBit[1] = pBit[0];
            }
            else if (bti[m_blockMode].pBitType == TWO_PBIT)
            {
                pBit[0] = ReadBit(in, m_bitPosition);
                pBit[1] = ReadBit(in, m_bitPosition);
            }
            for (component = 0; component < MAX_DIMENSION_BIG; component++)
            {
                if (m_componentBits[component])
                {
                    endpoint[subset][0][component] <<= 1;
                    endpoint[subset][1][component] <<= 1;
                    endpoint[subset][0][component] |= pBit[0];
                    endpoint[subset][1][component] |= pBit[1];
                }
            }
        }
    }
    if (bti[m_blockMode].pBitType != NO_PBIT)
    {
        // Now that we've unpacked the parity bits, update the component size information
        // for the ramp generator
        for (j = 0; j < MAX_DIMENSION_BIG; j++)
        {
            if (m_componentBits[j])
            {
                m_componentBits[j] += 1;
            }
        }
    }
    // If this block has two independent sets of indices then hand it off to the dual-index decoder
    if (bti[m_blockMode].encodingType == SEPARATE_ALPHA)
    {
        DecompressDualIndexBlock(out, in, endpoint[0], m_bitPosition, m_rotation, m_blockMode, m_indexSwap, m_componentBits);
        return;
    }
    CGU_UINT32 fixup[MAX_SUBSETS] = {0, 0, 0};
    switch (bti[m_blockMode].subsetCount)
    {
    case 3:
        fixup[1] = BC7_FIXUPINDICES_LOCAL[2][m_partition][1];
        fixup[2] = BC7_FIXUPINDICES_LOCAL[2][m_partition][2];
        break;
    case 2:
        fixup[1] = BC7_FIXUPINDICES_LOCAL[1][m_partition][1];
        break;
    default:
        break;
    }
    //--------------------------------------------------------------------
    // New Code : Possible replacement for BC7_PARTITIONS for CPU code
    //--------------------------------------------------------------------
    // Extract index bits
    // for (i = 0; i < MAX_SUBSET_SIZE; i++)
    // {
    //     CGV_UINT8 p = get_partition_subset(m_partition, bti[m_blockMode].subsetCount - 1, i);
    //     //CGU_UINT32 p = partitionTable[i];
    //     blockIndices[i] = 0;
    //     CGU_UINT32 bitsToRead = bti[m_blockMode].indexBits[0];
    //
    //     // If this is a fixup index then set the implicit bit
    //     if (i == fixup[p])
    //     {
    //         blockIndices[i] &= ~(1 << (bitsToRead - 1));
    //         bitsToRead--;
    //     }
    //
    //     for (j = 0; j < bitsToRead; j++)
    //     {
    //         blockIndices[i] |= ReadBit(in, m_bitPosition) << j;
    //     }
    // }
    CGU_UINT8* partitionTable = (CGU_UINT8*)BC7_PARTITIONS[bti[m_blockMode].subsetCount - 1][m_partition];
    // Extract index bits
    for (i = 0; i < MAX_SUBSET_SIZE; i++)
    {
        CGU_UINT8 p = partitionTable[i];
        blockIndices[i] = 0;
        CGU_UINT8 bitsToRead = bti[m_blockMode].indexBits[0];
        // A fixup index has an implicit zero MSB, so clear it and read one less bit
        if (i == fixup[p])
        {
            blockIndices[i] &= ~(1 << (bitsToRead - 1));
            bitsToRead--;
        }
        for (j = 0; j < bitsToRead; j++)
        {
            blockIndices[i] |= ReadBit(in, m_bitPosition) << j;
        }
    }
    // Get the ramps
    CGU_UINT32 clusters[2];
    clusters[0] = clusters[1] = 1 << bti[m_blockMode].indexBits[0];
    // Colour Ramps
    CGU_FLOAT c[MAX_SUBSETS][MAX_DIMENSION_BIG][1 << MAX_INDEX_BITS];
    for (i = 0; i < (int)bti[m_blockMode].subsetCount; i++)
    {
        // Unpack the colours
        GetBC7Ramp(endpoint[i], c[i], clusters, m_componentBits);
    }
    //--------------------------------------------------------------------
    // New Code : Possible replacement for BC7_PARTITIONS for CPU code
    //--------------------------------------------------------------------
    // Generate the block colours.
    // for (i = 0; i < MAX_SUBSET_SIZE; i++)
    // {
    //     CGV_UINT8 p = get_partition_subset(m_partition, bti[m_blockMode].subsetCount - 1, i);
    //     out[i][0] = c[p][0][blockIndices[i]];
    //     out[i][1] = c[p][1][blockIndices[i]];
    //     out[i][2] = c[p][2][blockIndices[i]];
    //     out[i][3] = c[p][3][blockIndices[i]];
    // }
    // Generate the block colours.
    for (i = 0; i < MAX_SUBSET_SIZE; i++)
    {
        for (j = 0; j < MAX_DIMENSION_BIG; j++)
        {
            out[i][j] = (CGU_UINT8)c[partitionTable[i]][j][blockIndices[i]];
        }
    }
}
void CompressBlockBC7_Internal(CGU_UINT8 image_src[SOURCE_BLOCK_SIZE][4],
                               CMP_GLOBAL CGV_UINT8 cmp_out[COMPRESSED_BLOCK_SIZE],
                               uniform CMP_GLOBAL BC7_Encode u_BC7Encode[])
{
    BC7_EncodeState _state = {0};
    varying BC7_EncodeState* uniform state = &_state;
    copy_BC7_Encode_settings(state, u_BC7Encode);
    CGU_UINT8 offsetR = 0;
    CGU_UINT8 offsetG = 16;
    CGU_UINT8 offsetB = 32;
    CGU_UINT8 offsetA = 48;
    for (CGU_UINT8 i = 0; i < SOURCE_BLOCK_SIZE; i++)
    {
        state->image_src[offsetR++] = (CGV_FLOAT)image_src[i][0];
        state->image_src[offsetG++] = (CGV_FLOAT)image_src[i][1];
        state->image_src[offsetB++] = (CGV_FLOAT)image_src[i][2];
        state->image_src[offsetA++] = (CGV_FLOAT)image_src[i][3];
    }
    BC7_CompressBlock(state, u_BC7Encode);
    if (state->cmp_isout16Bytes)
    {
        for (CGU_UINT8 i = 0; i < COMPRESSED_BLOCK_SIZE; i++)
        {
            cmp_out[i] = state->cmp_out[i];
        }
    }
    else
    {
#ifdef ASPM_GPU
        cmp_memcpy(cmp_out, (CGU_UINT8*)state->best_cmp_out, 16);
#else
        memcpy(cmp_out, state->best_cmp_out, 16);
#endif
    }
}
//======================= CPU USER INTERFACES ====================================
int CMP_CDECL CreateOptionsBC7(void** options)
{
    if (!options)
        return CGU_CORE_ERR_INVALIDPTR;
    (*options) = new BC7_Encode;
    if (!(*options))
        return CGU_CORE_ERR_NEWMEM;
    init_BC7ramps();
    SetDefaultBC7Options((BC7_Encode*)(*options));
    return CGU_CORE_OK;
}
int CMP_CDECL DestroyOptionsBC7(void* options)
{
    if (!options)
        return CGU_CORE_ERR_INVALIDPTR;
    BC7_Encode* BCOptions = reinterpret_cast<BC7_Encode*>(options);
    delete BCOptions;
    return CGU_CORE_OK;
}
int CMP_CDECL SetErrorThresholdBC7(void* options, CGU_FLOAT minThreshold, CGU_FLOAT maxThreshold)
{
    if (!options)
        return CGU_CORE_ERR_INVALIDPTR;
    BC7_Encode* BC7optionsDefault = (BC7_Encode*)options;
    if (minThreshold < 0.0f)
        minThreshold = 0.0f;
    if (maxThreshold < 0.0f)
        maxThreshold = 0.0f;
    BC7optionsDefault->minThreshold = minThreshold;
    BC7optionsDefault->maxThreshold = maxThreshold;
    return CGU_CORE_OK;
}
int CMP_CDECL SetQualityBC7(void* options, CGU_FLOAT fquality)
{
    if (!options)
        return CGU_CORE_ERR_INVALIDPTR;
    BC7_Encode* BC7optionsDefault = (BC7_Encode*)options;
    if (fquality < 0.0f)
        fquality = 0.0f;
    else if (fquality > 1.0f)
        fquality = 1.0f;
    BC7optionsDefault->quality = fquality;
    // Set Error Thresholds
    BC7optionsDefault->errorThreshold = BC7optionsDefault->maxThreshold * (1.0f - fquality);
    if (fquality > BC7_qFAST_THRESHOLD)
        BC7optionsDefault->errorThreshold += BC7optionsDefault->minThreshold;
    return CGU_CORE_OK;
}
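// Worked example (hypothetical threshold values): with maxThreshold = 10.0f and
// minThreshold = 0.01f, a quality of 0.8 gives errorThreshold = 10.0 * 0.2 = 2.0,
// plus 0.01 if 0.8 exceeds BC7_qFAST_THRESHOLD. Higher quality therefore lowers
// the early-out error bound, forcing the encoder to try more modes.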
int CMP_CDECL SetMaskBC7(void* options, CGU_UINT8 mask)
{
    if (!options)
        return CGU_CORE_ERR_INVALIDPTR;
    BC7_Encode* BC7options = (BC7_Encode*)options;
    BC7options->validModeMask = mask;
    return CGU_CORE_OK;
}
int CMP_CDECL SetAlphaOptionsBC7(void* options, CGU_BOOL imageNeedsAlpha, CGU_BOOL colourRestrict, CGU_BOOL alphaRestrict)
{
    if (!options)
        return CGU_CORE_ERR_INVALIDPTR;
    BC7_Encode* u_BC7Encode = (BC7_Encode*)options;
    u_BC7Encode->imageNeedsAlpha = imageNeedsAlpha;
    u_BC7Encode->colourRestrict = colourRestrict;
    u_BC7Encode->alphaRestrict = alphaRestrict;
    return CGU_CORE_OK;
}
int CMP_CDECL CompressBlockBC7(const unsigned char* srcBlock, unsigned int srcStrideInBytes, CMP_GLOBAL unsigned char cmpBlock[16], const void* options = NULL)
{
    CMP_Vec4uc inBlock[SOURCE_BLOCK_SIZE];
    //----------------------------------
    // Fill the inBlock with source data
    //----------------------------------
    CGU_INT srcpos = 0;
    CGU_INT dstptr = 0;
    for (CGU_UINT8 row = 0; row < 4; row++)
    {
        srcpos = row * srcStrideInBytes;
        for (CGU_UINT8 col = 0; col < 4; col++)
        {
            inBlock[dstptr].x = CGU_UINT8(srcBlock[srcpos++]);
            inBlock[dstptr].y = CGU_UINT8(srcBlock[srcpos++]);
            inBlock[dstptr].z = CGU_UINT8(srcBlock[srcpos++]);
            inBlock[dstptr].w = CGU_UINT8(srcBlock[srcpos++]);
            dstptr++;
        }
    }
    BC7_Encode* u_BC7Encode = (BC7_Encode*)options;
    BC7_Encode BC7EncodeDefault = {0};
    if (u_BC7Encode == NULL)
    {
        u_BC7Encode = &BC7EncodeDefault;
        SetDefaultBC7Options(u_BC7Encode);
        init_BC7ramps();
    }
    BC7_EncodeState EncodeState
#ifndef ASPM
        = {0}
#endif
        ;
    EncodeState.best_err = CMP_FLOAT_MAX;
    EncodeState.validModeMask = u_BC7Encode->validModeMask;
    EncodeState.part_count = u_BC7Encode->part_count;
    EncodeState.channels = CMP_STATIC_CAST(CGU_UINT8, u_BC7Encode->channels);
    CGU_UINT8 offsetR = 0;
    CGU_UINT8 offsetG = 16;
    CGU_UINT8 offsetB = 32;
    CGU_UINT8 offsetA = 48;
    CGU_UINT32 offsetSRC = 0;
    for (CGU_UINT8 i = 0; i < SOURCE_BLOCK_SIZE; i++)
    {
        EncodeState.image_src[offsetR++] = (CGV_FLOAT)inBlock[offsetSRC].x;
        EncodeState.image_src[offsetG++] = (CGV_FLOAT)inBlock[offsetSRC].y;
        EncodeState.image_src[offsetB++] = (CGV_FLOAT)inBlock[offsetSRC].z;
        EncodeState.image_src[offsetA++] = (CGV_FLOAT)inBlock[offsetSRC].w;
        offsetSRC++;
    }
    BC7_CompressBlock(&EncodeState, u_BC7Encode);
    if (EncodeState.cmp_isout16Bytes)
    {
        for (CGU_UINT8 i = 0; i < COMPRESSED_BLOCK_SIZE; i++)
        {
            cmpBlock[i] = EncodeState.cmp_out[i];
        }
    }
    else
    {
        memcpy(cmpBlock, EncodeState.best_cmp_out, 16);
    }
    return CGU_CORE_OK;
}
int CMP_CDECL DecompressBlockBC7(const unsigned char cmpBlock[16], unsigned char srcBlock[64], const void* options = NULL)
{
    BC7_Encode* u_BC7Encode = (BC7_Encode*)options;
    BC7_Encode BC7EncodeDefault = {0};
    if (u_BC7Encode == NULL)
    {
        // No options supplied: fall back to the default encoder settings
        u_BC7Encode = &BC7EncodeDefault;
        SetDefaultBC7Options(u_BC7Encode);
        init_BC7ramps();
    }
    DecompressBC7_internal((CGU_UINT8(*)[4])srcBlock, (CGU_UINT8*)cmpBlock, u_BC7Encode);
    return CGU_CORE_OK;
}
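//
// Example usage of the CPU interface (a minimal sketch; buffers and error
// handling are the caller's responsibility):
//
//     void*         bc7Options = NULL;
//     unsigned char src[64];   // 4x4 RGBA8 texels, 16 bytes per row
//     unsigned char cmp[16];   // one compressed BC7 block
//     if (CreateOptionsBC7(&bc7Options) == CGU_CORE_OK)
//     {
//         SetQualityBC7(bc7Options, 0.8f);
//         CompressBlockBC7(src, 16, cmp, bc7Options);
//         DecompressBlockBC7(cmp, src, bc7Options);
//         DestroyOptionsBC7(bc7Options);
//     }
//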
#endif
#endif
//============================================== OpenCL USER INTERFACE ====================================================
#ifdef ASPM_OPENCL
CMP_STATIC CMP_KERNEL void CMP_GPUEncoder(uniform CMP_GLOBAL const CGU_Vec4uc ImageSource[],
                                          CMP_GLOBAL CGV_UINT8 ImageDestination[],
                                          uniform CMP_GLOBAL Source_Info SourceInfo[],
                                          uniform CMP_GLOBAL BC7_Encode BC7Encode[])
{
    CGU_INT xID = 0;
    CGU_INT yID = 0;
    xID = get_global_id(0);  // ToDo: Define a size_t 32 bit and 64 bit based on clGetDeviceInfo
    yID = get_global_id(1);
    CGU_INT srcWidth = SourceInfo->m_src_width;
    CGU_INT srcHeight = SourceInfo->m_src_height;
    if (xID >= (srcWidth / BlockX))
        return;
    if (yID >= (srcHeight / BlockY))
        return;
    //ASPM_PRINT(("[ASPM_OCL] %d %d size %d\n",xID,yID,sizeof(BC7_Encode)));
    CGU_INT destI = (xID * COMPRESSED_BLOCK_SIZE) + (yID * (srcWidth / BlockX) * COMPRESSED_BLOCK_SIZE);
    CGU_INT srcindex = 4 * (yID * srcWidth + xID);
    CGU_INT blkindex = 0;
    BC7_EncodeState EncodeState;
    cmp_memsetBC7((CGV_UINT8*)&EncodeState, 0, sizeof(EncodeState));
    copy_BC7_Encode_settings(&EncodeState, BC7Encode);
    // Check that this is a complete 4x4 block
    if (((xID + 1) * BlockX <= srcWidth) && ((yID + 1) * BlockY <= srcHeight))
    {
        srcWidth = srcWidth - 4;  // from here on, srcWidth is the per-row advance: stride minus the 4 texels just read
        for (CGU_INT j = 0; j < 4; j++)
        {
            for (CGU_INT i = 0; i < 4; i++)
            {
                EncodeState.image_src[blkindex + 0 * SOURCE_BLOCK_SIZE] = ImageSource[srcindex].x;
                EncodeState.image_src[blkindex + 1 * SOURCE_BLOCK_SIZE] = ImageSource[srcindex].y;
                EncodeState.image_src[blkindex + 2 * SOURCE_BLOCK_SIZE] = ImageSource[srcindex].z;
                EncodeState.image_src[blkindex + 3 * SOURCE_BLOCK_SIZE] = ImageSource[srcindex].w;
                blkindex++;
                srcindex++;
            }
            srcindex += srcWidth;
        }
        BC7_CompressBlock(&EncodeState, BC7Encode);
        //printf("CMP %x %x %x %x %x %x %x %x\n",
        //       EncodeState.cmp_out[0],
        //       EncodeState.cmp_out[1],
        //       EncodeState.cmp_out[2],
        //       EncodeState.cmp_out[3],
        //       EncodeState.cmp_out[4],
        //       EncodeState.cmp_out[5],
        //       EncodeState.cmp_out[6],
        //       EncodeState.cmp_out[7]);
        for (CGU_INT i = 0; i < COMPRESSED_BLOCK_SIZE; i++)
        {
            ImageDestination[destI + i] = EncodeState.cmp_out[i];
        }
    }
    else
    {
        ASPM_PRINT(("[ASPM_GPU] Unable to process, make sure the image size is divisible by 4"));
    }
}
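// Dispatch note: the guards above imply a 2D NDRange of at least
// (srcWidth / BlockX, srcHeight / BlockY) work-items, one per 4x4 block;
// any extra work-items return immediately.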
#endif