bc1_cmp.h 133 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126212721282129213021312132213321342135213621372138213921402141214221432144214521462147214821492150215121522153215421552156215721582159216021612162216321642165216621672168216921702171217221732174217521762177217821792180218121822183218421852186218721882189219021912192219321942195219621972198219922002201220222032204220522062207220822092210221122122213221422152216221722182219222022212222222322242225222622272228222922302231223222332234223522362237223822392240224122422243224422452246224722482249225022512252225322542255225622572258225922602261226222632264226522662267226822692270227122722273227422752276227
72278227922802281228222832284228522862287228822892290229122922293229422952296229722982299230023012302230323042305230623072308230923102311231223132314231523162317231823192320232123222323232423252326232723282329233023312332233323342335233623372338233923402341234223432344234523462347234823492350235123522353235423552356235723582359236023612362236323642365236623672368236923702371237223732374237523762377237823792380238123822383238423852386238723882389239023912392239323942395239623972398239924002401240224032404240524062407240824092410241124122413241424152416241724182419242024212422242324242425242624272428242924302431243224332434243524362437243824392440244124422443244424452446244724482449245024512452245324542455245624572458245924602461246224632464246524662467246824692470247124722473247424752476247724782479248024812482248324842485248624872488248924902491249224932494249524962497249824992500250125022503250425052506250725082509251025112512251325142515251625172518251925202521252225232524252525262527252825292530253125322533253425352536253725382539254025412542254325442545254625472548254925502551255225532554255525562557255825592560256125622563256425652566256725682569257025712572257325742575257625772578257925802581258225832584258525862587258825892590259125922593259425952596259725982599260026012602260326042605260626072608260926102611261226132614261526162617261826192620262126222623262426252626262726282629263026312632263326342635263626372638263926402641264226432644264526462647264826492650265126522653265426552656265726582659266026612662266326642665266626672668266926702671267226732674267526762677267826792680268126822683268426852686268726882689269026912692269326942695269626972698269927002701270227032704270527062707270827092710271127122713271427152716271727182719272027212722272327242725272627272728272927302731273227332734273527362737273827392740274127422743274427452746274727482749275027512752275327542755275627572758275927602761276227632764276527662767276827692770277127722773277427752776277
72778277927802781278227832784278527862787278827892790279127922793279427952796279727982799280028012802280328042805280628072808280928102811281228132814281528162817281828192820282128222823282428252826282728282829283028312832283328342835283628372838283928402841284228432844284528462847284828492850285128522853285428552856285728582859286028612862286328642865286628672868286928702871287228732874287528762877287828792880288128822883288428852886288728882889289028912892289328942895289628972898289929002901290229032904290529062907290829092910291129122913291429152916291729182919292029212922292329242925292629272928292929302931293229332934293529362937293829392940294129422943294429452946294729482949295029512952295329542955295629572958295929602961296229632964296529662967296829692970297129722973297429752976297729782979298029812982298329842985298629872988298929902991299229932994299529962997299829993000300130023003300430053006300730083009301030113012301330143015301630173018301930203021302230233024302530263027302830293030303130323033303430353036303730383039304030413042304330443045304630473048304930503051305230533054305530563057305830593060306130623063306430653066306730683069307030713072307330743075307630773078307930803081308230833084308530863087308830893090309130923093309430953096309730983099310031013102310331043105310631073108310931103111311231133114311531163117311831193120312131223123312431253126312731283129313031313132313331343135313631373138313931403141314231433144314531463147314831493150315131523153315431553156315731583159316031613162316331643165316631673168316931703171317231733174317531763177317831793180318131823183318431853186318731883189319031913192319331943195319631973198319932003201320232033204320532063207320832093210321132123213321432153216321732183219322032213222322332243225322632273228322932303231323232333234323532363237323832393240324132423243324432453246324732483249325032513252325332543255325632573258325932603261326232633264326532663267326832693270
  1. //=====================================================================
  2. // Copyright (c) 2020 Advanced Micro Devices, Inc. All rights reserved.
  3. //
  4. // Permission is hereby granted, free of charge, to any person obtaining a copy
  5. // of this software and associated documentation files(the "Software"), to deal
  6. // in the Software without restriction, including without limitation the rights
  7. // to use, copy, modify, merge, publish, distribute, sublicense, and / or sell
  8. // copies of the Software, and to permit persons to whom the Software is
  9. // furnished to do so, subject to the following conditions :
  10. //
  11. // The above copyright notice and this permission notice shall be included in
  12. // all copies or substantial portions of the Software.
  13. //
  14. // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  15. // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  16. // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.IN NO EVENT SHALL THE
  17. // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  18. // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  19. // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  20. // THE SOFTWARE.
  21. //
  22. // File: BC1_Encode_kernel.hlsl
  23. //--------------------------------------------------------------------------------------
  24. // Copyright (c) Microsoft Corporation. All rights reserved.
  25. // Licensed under the MIT License.
  26. //--------------------------------------------------------------------------------------
  27. #include "common_def.h"
  28. #include "bcn_common_kernel.h"
  29. #include "bcn_common_api.h"
  30. //-----------------------------------------------------------------------
  31. // When build is for CPU, we have some missing API calls common to GPU
  32. // Use CPU CMP_Core replacements
  33. //-----------------------------------------------------------------------
  34. #if defined(ASPM_GPU) || defined(ASPM_HLSL) || defined(ASPM_OPENCL)
  35. #define ALIGN_16
  36. #else
  37. #include INC_cmp_math_func
  38. #if defined(WIN32) || defined(_WIN64)
  39. #define ALIGN_16 __declspec(align(16))
  40. #else // !WIN32 && !_WIN64
  41. #define ALIGN_16
  42. #endif // !WIN32 && !_WIN64
  43. #endif
  // Feature switches for the end-point refinement passes.
  #define USE_REFINE3D
  #define USE_REFINE

  // Starting value for the best error found by the end-point search
  // (may be overridden before this point).
  #ifndef MAX_ERROR
  #define MAX_ERROR 128000.f
  #endif

  #define NUM_CHANNELS 4
  #define NUM_ENDPOINTS 2

  // Quality thresholds; each may be overridden before this point.
  #ifndef CMP_QUALITY0
  #define CMP_QUALITY0 0.25f
  #endif
  #ifndef CMP_QUALITY1
  #define CMP_QUALITY1 0.50f
  #endif
  #ifndef CMP_QUALITY2
  #define CMP_QUALITY2 0.75f
  #endif

  // Per-point variance thresholds used by the "diameter is small" test.
  // The expansions are fully parenthesised so that they behave as a single
  // value in any expression (e.g. `1.f / EPS` or `x - EPS2`).
  #define EPS ((2.f / 255.f) * (2.f / 255.f))
  #define EPS2 (3.f * (2.f / 255.f) * (2.f / 255.f))
  62. static CGU_FLOAT cgu_getRampErr(CGU_FLOAT Prj[BLOCK_SIZE_4X4],
  63. CGU_FLOAT PrjErr[BLOCK_SIZE_4X4],
  64. CGU_FLOAT PreMRep[BLOCK_SIZE_4X4],
  65. CGU_FLOAT StepErr,
  66. CGU_FLOAT lowPosStep,
  67. CGU_FLOAT highPosStep,
  68. CGU_UINT32 dwUniqueColors)
  69. {
  70. CGU_FLOAT error = 0;
  71. CGU_FLOAT step = (highPosStep - lowPosStep) / 3; // using (dwNumChannels=4 - 1);
  72. CGU_FLOAT step_h = step * (CGU_FLOAT)0.5;
  73. CGU_FLOAT rstep = (CGU_FLOAT)1.0f / step;
  74. for (CGU_UINT32 i = 0; i < dwUniqueColors; i++)
  75. {
  76. CGU_FLOAT v;
  77. // Work out which value in the block this select
  78. CGU_FLOAT del;
  79. if ((del = Prj[i] - lowPosStep) <= 0)
  80. v = lowPosStep;
  81. else if (Prj[i] - highPosStep >= 0)
  82. v = highPosStep;
  83. else
  84. v = floor((del + step_h) * rstep) * step + lowPosStep;
  85. // And accumulate the error
  86. CGU_FLOAT d = (Prj[i] - v);
  87. d *= d;
  88. CGU_FLOAT err = PreMRep[i] * d + PrjErr[i];
  89. error += err;
  90. if (StepErr < error)
  91. {
  92. error = StepErr;
  93. break;
  94. }
  95. }
  96. return error;
  97. }
  98. CMP_STATIC CMP_EndPoints cgu_CompressRGBBlockX( CMP_IN CGU_Vec3f BlkInBGRf_UV[BLOCK_SIZE_4X4],
  99. CMP_IN CGU_FLOAT Rpt[BLOCK_SIZE_4X4],
  100. CMP_IN CGU_UINT32 dwUniqueColors,
  101. CMP_IN CGU_Vec3f channelWeightsBGR,
  102. CMP_IN CGU_BOOL b3DRefinement
  103. )
  104. {
  105. CMP_UNUSED(channelWeightsBGR);
  106. CMP_UNUSED(b3DRefinement);
  107. ALIGN_16 CGU_FLOAT Prj0[BLOCK_SIZE_4X4];
  108. ALIGN_16 CGU_FLOAT Prj[BLOCK_SIZE_4X4];
  109. ALIGN_16 CGU_FLOAT PrjErr[BLOCK_SIZE_4X4];
  110. ALIGN_16 CGU_FLOAT RmpIndxs[BLOCK_SIZE_4X4];
  111. CGU_Vec3f LineDirG;
  112. CGU_Vec3f LineDir;
  113. CGU_FLOAT LineDir0[NUM_CHANNELS];
  114. CGU_Vec3f BlkUV[BLOCK_SIZE_4X4];
  115. CGU_Vec3f BlkSh[BLOCK_SIZE_4X4];
  116. CGU_Vec3f Mdl;
  117. CGU_Vec3f rsltC0;
  118. CGU_Vec3f rsltC1;
  119. CGU_Vec3f PosG0 = {0.0f, 0.0f, 0.0f};
  120. CGU_Vec3f PosG1 = {0.0f, 0.0f, 0.0f};
  121. CGU_UINT32 i;
  122. for (i = 0; i < dwUniqueColors; i++)
  123. {
  124. BlkUV[i] = BlkInBGRf_UV[i];
  125. }
  126. // if not more then 2 different colors, we've done
  127. if (dwUniqueColors <= 2)
  128. {
  129. rsltC0 = BlkInBGRf_UV[0] * 255.0f;
  130. rsltC1 = BlkInBGRf_UV[dwUniqueColors - 1] * 255.0f;
  131. }
  132. else
  133. {
  134. // This is our first attempt to find an axis we will go along.
  135. // The cumulation is done to find a line minimizing the MSE from the
  136. // input 3D points.
  137. // While trying to find the axis we found that the diameter of the input
  138. // set is quite small. Do not bother.
  139. // FindAxisIsSmall(BlkSh, LineDir0, Mdl, Blk, Rpt,dwUniqueColors);
  140. {
  141. CGU_UINT32 ii;
  142. CGU_UINT32 jj;
  143. CGU_UINT32 kk;
  144. // These vars cannot be Vec3 as index to them are varying
  145. CGU_FLOAT Crrl[NUM_CHANNELS];
  146. CGU_FLOAT RGB2[NUM_CHANNELS];
  147. LineDir0[0] = LineDir0[1] = LineDir0[2] = RGB2[0] = RGB2[1] = RGB2[2] = Crrl[0] = Crrl[1] = Crrl[2] = Mdl.x = Mdl.y = Mdl.z = 0.f;
  148. // sum position of all points
  149. CGU_FLOAT fNumPoints = 0.0f;
  150. for (ii = 0; ii < dwUniqueColors; ii++)
  151. {
  152. Mdl.x += BlkUV[ii].x * Rpt[ii];
  153. Mdl.y += BlkUV[ii].y * Rpt[ii];
  154. Mdl.z += BlkUV[ii].z * Rpt[ii];
  155. fNumPoints += Rpt[ii];
  156. }
  157. // and then average to calculate center coordinate of block
  158. Mdl /= fNumPoints;
  159. for (ii = 0; ii < dwUniqueColors; ii++)
  160. {
  161. // calculate output block as offsets around block center
  162. BlkSh[ii] = BlkUV[ii] - Mdl;
  163. // compute correlation matrix
  164. // RGB2 = sum of ((distance from point from center) squared)
  165. RGB2[0] += BlkSh[ii].x * BlkSh[ii].x * Rpt[ii];
  166. RGB2[1] += BlkSh[ii].y * BlkSh[ii].y * Rpt[ii];
  167. RGB2[2] += BlkSh[ii].z * BlkSh[ii].z * Rpt[ii];
  168. Crrl[0] += BlkSh[ii].x * BlkSh[ii].y * Rpt[ii];
  169. Crrl[1] += BlkSh[ii].y * BlkSh[ii].z * Rpt[ii];
  170. Crrl[2] += BlkSh[ii].z * BlkSh[ii].x * Rpt[ii];
  171. }
  172. // if set's diameter is small
  173. CGU_UINT32 i0 = 0, i1 = 1;
  174. CGU_FLOAT mxRGB2 = 0.0f;
  175. CGU_FLOAT fEPS = fNumPoints * EPS;
  176. for (kk = 0, jj = 0; jj < 3; jj++)
  177. {
  178. if (RGB2[jj] >= fEPS)
  179. kk++;
  180. else
  181. RGB2[jj] = 0.0f;
  182. if (mxRGB2 < RGB2[jj])
  183. {
  184. mxRGB2 = RGB2[jj];
  185. i0 = jj;
  186. }
  187. }
  188. CGU_FLOAT fEPS2 = fNumPoints * EPS2;
  189. CGU_BOOL AxisIsSmall;
  190. AxisIsSmall = (RGB2[0] < fEPS2);
  191. AxisIsSmall = AxisIsSmall && (RGB2[1] < fEPS2);
  192. AxisIsSmall = AxisIsSmall && (RGB2[2] < fEPS2);
  193. // all are very small to avoid division on the small determinant
  194. if (AxisIsSmall)
  195. {
  196. rsltC0 = BlkInBGRf_UV[0] * 255.0f;
  197. rsltC1 = BlkInBGRf_UV[dwUniqueColors - 1] * 255.0f;
  198. }
  199. else
  200. {
  201. // !AxisIsSmall
  202. if (kk == 1) // really only 1 dimension
  203. LineDir0[i0] = 1.;
  204. else if (kk == 2)
  205. { // really only 2 dimensions
  206. i1 = (RGB2[(i0 + 1) % 3] > 0.f) ? (i0 + 1) % 3 : (i0 + 2) % 3;
  207. CGU_FLOAT Crl = (i1 == (i0 + 1) % 3) ? Crrl[i0] : Crrl[(i0 + 2) % 3];
  208. LineDir0[i1] = Crl / RGB2[i0];
  209. LineDir0[i0] = 1.;
  210. }
  211. else
  212. {
  213. CGU_FLOAT maxDet = 100000.f;
  214. CGU_FLOAT Cs[3];
  215. // select max det for precision
  216. for (jj = 0; jj < 3; jj++)
  217. {
  218. // 3 = nDimensions
  219. CGU_FLOAT Det = RGB2[jj] * RGB2[(jj + 1) % 3] - Crrl[jj] * Crrl[jj];
  220. Cs[jj] = cmp_fabs(Crrl[jj] / sqrt(RGB2[jj] * RGB2[(jj + 1) % 3]));
  221. if (maxDet < Det)
  222. {
  223. maxDet = Det;
  224. i0 = jj;
  225. }
  226. }
  227. // inverse correl matrix
  228. // -- -- -- --
  229. // | A B | | C -B |
  230. // | B C | => | -B A |
  231. // -- -- -- --
  232. CGU_FLOAT mtrx1[2][2];
  233. CGU_FLOAT vc1[2];
  234. CGU_FLOAT vc[2];
  235. vc1[0] = Crrl[(i0 + 2) % 3];
  236. vc1[1] = Crrl[(i0 + 1) % 3];
  237. // C
  238. mtrx1[0][0] = RGB2[(i0 + 1) % 3];
  239. // A
  240. mtrx1[1][1] = RGB2[i0];
  241. // -B
  242. mtrx1[1][0] = mtrx1[0][1] = -Crrl[i0];
  243. // find a solution
  244. vc[0] = mtrx1[0][0] * vc1[0] + mtrx1[0][1] * vc1[1];
  245. vc[1] = mtrx1[1][0] * vc1[0] + mtrx1[1][1] * vc1[1];
  246. // normalize
  247. vc[0] /= maxDet;
  248. vc[1] /= maxDet;
  249. // find a line direction vector
  250. LineDir0[i0] = 1.;
  251. LineDir0[(i0 + 1) % 3] = 1.;
  252. LineDir0[(i0 + 2) % 3] = vc[0] + vc[1];
  253. }
  254. // normalize direction vector
  255. CGU_FLOAT Len = LineDir0[0] * LineDir0[0] + LineDir0[1] * LineDir0[1] + LineDir0[2] * LineDir0[2];
  256. Len = sqrt(Len);
  257. LineDir0[0] = (Len > 0.f) ? LineDir0[0] / Len : 0.0f;
  258. LineDir0[1] = (Len > 0.f) ? LineDir0[1] / Len : 0.0f;
  259. LineDir0[2] = (Len > 0.f) ? LineDir0[2] / Len : 0.0f;
  260. }
  261. } // FindAxisIsSmall
  262. // GCC is being an awful being when it comes to goto-jumps.
  263. // So please bear with this.
  264. CGU_FLOAT ErrG = 10000000.f;
  265. CGU_FLOAT PrjBnd0;
  266. CGU_FLOAT PrjBnd1;
  267. ALIGN_16 CGU_FLOAT PreMRep[BLOCK_SIZE_4X4];
  268. LineDir.x = LineDir0[0];
  269. LineDir.y = LineDir0[1];
  270. LineDir.z = LineDir0[2];
  271. // Here is the main loop.
  272. // 1. Project input set on the axis in consideration.
  273. // 2. Run 1 dimensional search (see scalar case) to find an (sub) optimal pair of end points.
  274. // 3. Compute the vector of indexes (or clusters) for the current approximate ramp.
  275. // 4. Present our color channels as 3 16DIM vectors.
  276. // 5. Find closest approximation of each of 16DIM color vector with the projection of the 16DIM index vector.
  277. // 6. Plug the projections as a new directional vector for the axis.
  278. // 7. Goto 1.
  279. // D - is 16 dim "index" vector (or 16 DIM vector of indexes - {0, 1/3,2/3, 0, ...,}, but shifted and normalized).
  280. // Ci - is a 16 dim vector of color i. for each Ci find a scalar Ai such that (Ai * D - Ci) (Ai * D - Ci) -> min ,
  281. // i.e distance between vector AiD and C is min. You can think of D as a unit interval(vector) "clusterizer", and Ai is a scale
  282. // you need to apply to the clusterizer to approximate the Ci vector instead of the unit vector.
  283. // Solution is
  284. // Ai = (D . Ci) / (D . D); . - is a dot product.
  285. // in 3 dim space Ai(s) represent a line direction, along which
  286. // we again try to find (sub)optimal quantizer.
  287. // That's what our for(;;) loop is about.
  288. for (;;)
  289. {
  290. // 1. Project input set on the axis in consideration.
  291. // From Foley & Van Dam: Closest point of approach of a line (P + v) to a
  292. // point (R) is
  293. // P + ((R-P).v) / (v.v))v
  294. // The distance along v is therefore (R-P).v / (v.v)
  295. // (v.v) is 1 if v is a unit vector.
  296. //
  297. PrjBnd0 = 1000.0f;
  298. PrjBnd1 = -1000.0f;
  299. for (i = 0; i < BLOCK_SIZE_4X4; i++)
  300. Prj0[i] = Prj[i] = PrjErr[i] = PreMRep[i] = 0.f;
  301. for (i = 0; i < dwUniqueColors; i++)
  302. {
  303. Prj0[i] = Prj[i] = dot(BlkSh[i], LineDir);
  304. PrjErr[i] = dot(BlkSh[i] - LineDir * Prj[i], BlkSh[i] - LineDir * Prj[i]);
  305. PrjBnd0 = min(PrjBnd0, Prj[i]);
  306. PrjBnd1 = max(PrjBnd1, Prj[i]);
  307. }
  308. // 2. Run 1 dimensional search (see scalar case) to find an (sub) optimal
  309. // pair of end points.
  310. // min and max of the search interval
  311. CGU_FLOAT Scl0;
  312. CGU_FLOAT Scl1;
  313. Scl0 = PrjBnd0 - (PrjBnd1 - PrjBnd0) * 0.125f;
  314. Scl1 = PrjBnd1 + (PrjBnd1 - PrjBnd0) * 0.125f;
  315. // compute scaling factor to scale down the search interval to [0.,1]
  316. const CGU_FLOAT Scl2 = (Scl1 - Scl0) * (Scl1 - Scl0);
  317. const CGU_FLOAT overScl = 1.f / (Scl1 - Scl0);
  318. for (i = 0; i < dwUniqueColors; i++)
  319. {
  320. // scale them
  321. Prj[i] = (Prj[i] - Scl0) * overScl;
  322. // premultiply the scale square to plug into error computation later
  323. PreMRep[i] = Rpt[i] * Scl2;
  324. }
  325. // scale first approximation of end points
  326. PrjBnd0 = (PrjBnd0 - Scl0) * overScl;
  327. PrjBnd1 = (PrjBnd1 - Scl0) * overScl;
  328. CGU_FLOAT StepErr = MAX_ERROR;
  329. // search step
  330. CGU_FLOAT searchStep = 0.025f;
  331. // low Start/End; high Start/End
  332. const CGU_FLOAT lowStartEnd = (PrjBnd0 - 2.f * searchStep > 0.f) ? PrjBnd0 - 2.f * searchStep : 0.f;
  333. const CGU_FLOAT highStartEnd = (PrjBnd1 + 2.f * searchStep < 1.f) ? PrjBnd1 + 2.f * searchStep : 1.f;
  334. // find the best endpoints
  335. CGU_FLOAT Pos0 = 0;
  336. CGU_FLOAT Pos1 = 0;
  337. CGU_FLOAT lowPosStep, highPosStep;
  338. CGU_FLOAT err;
  339. int l, h;
  340. for (l = 0, lowPosStep = lowStartEnd; l < 8; l++, lowPosStep += searchStep)
  341. {
  342. for (h = 0, highPosStep = highStartEnd; h < 8; h++, highPosStep -= searchStep)
  343. {
  344. // compute an error for the current pair of end points.
  345. err = cgu_getRampErr(Prj, PrjErr, PreMRep, StepErr, lowPosStep, highPosStep, dwUniqueColors);
  346. if (err < StepErr)
  347. {
  348. // save better result
  349. StepErr = err;
  350. Pos0 = lowPosStep;
  351. Pos1 = highPosStep;
  352. }
  353. }
  354. }
  355. // inverse the scaling
  356. Pos0 = Pos0 * (Scl1 - Scl0) + Scl0;
  357. Pos1 = Pos1 * (Scl1 - Scl0) + Scl0;
  358. // did we find somthing better from the previous run?
  359. if (StepErr + 0.001 < ErrG)
  360. {
  361. // yes, remember it
  362. ErrG = StepErr;
  363. LineDirG = LineDir;
  364. PosG0.x = Pos0;
  365. PosG0.y = Pos0;
  366. PosG0.z = Pos0;
  367. PosG1.x = Pos1;
  368. PosG1.y = Pos1;
  369. PosG1.z = Pos1;
  370. // 3. Compute the vector of indexes (or clusters) for the current
  371. // approximate ramp.
  372. // indexes
  373. const CGU_FLOAT step = (Pos1 - Pos0) / 3.0f; // (dwNumChannels=4 - 1);
  374. const CGU_FLOAT step_h = step * (CGU_FLOAT)0.5;
  375. const CGU_FLOAT rstep = (CGU_FLOAT)1.0f / step;
  376. const CGU_FLOAT overBlkTp = 1.f / 3.0f; // (dwNumChannels=4 - 1);
  377. // here the index vector is computed,
  378. // shifted and normalized
  379. CGU_FLOAT indxAvrg = 3.0f / 2.0f; // (dwNumChannels=4 - 1);
  380. for (i = 0; i < dwUniqueColors; i++)
  381. {
  382. CGU_FLOAT del;
  383. // CGU_UINT32 n = (CGU_UINT32)((b - _min_ex + (step*0.5f)) * rstep);
  384. if ((del = Prj0[i] - Pos0) <= 0)
  385. RmpIndxs[i] = 0.f;
  386. else if (Prj0[i] - Pos1 >= 0)
  387. RmpIndxs[i] = 3.0f; // (dwNumChannels=4 - 1);
  388. else
  389. RmpIndxs[i] = floor((del + step_h) * rstep);
  390. // shift and normalization
  391. RmpIndxs[i] = (RmpIndxs[i] - indxAvrg) * overBlkTp;
  392. }
  393. // 4. Present our color channels as 3 16 DIM vectors.
  394. // 5. Find closest aproximation of each of 16DIM color vector with the
  395. // pojection of the 16DIM index vector.
  396. CGU_Vec3f Crs = {0.0f, 0.0f, 0.0f};
  397. CGU_FLOAT Len = 0.0f;
  398. for (i = 0; i < dwUniqueColors; i++)
  399. {
  400. const CGU_FLOAT PreMlt = RmpIndxs[i] * Rpt[i];
  401. Len += RmpIndxs[i] * PreMlt;
  402. Crs.x += BlkSh[i].x * PreMlt;
  403. Crs.y += BlkSh[i].y * PreMlt;
  404. Crs.z += BlkSh[i].z * PreMlt;
  405. }
  406. LineDir.x = LineDir.y = LineDir.z = 0.0f;
  407. if (Len > 0.0f)
  408. {
  409. CGU_FLOAT Len2;
  410. LineDir = Crs / Len;
  411. // 6. Plug the projections as a new directional vector for the axis.
  412. // 7. Goto 1.
  413. Len2 = dot(LineDir, LineDir); // LineDir.x * LineDir.x + LineDir.y * LineDir.y + LineDir.z * LineDir.z;
  414. Len2 = sqrt(Len2);
  415. LineDir /= Len2;
  416. }
  417. }
  418. else // We was not able to find anything better. Drop out.
  419. break;
  420. }
  421. // inverse transform to find end-points of 3-color ramp
  422. rsltC0 = (PosG0 * LineDirG + Mdl) * 255.f;
  423. rsltC1 = (PosG1 * LineDirG + Mdl) * 255.f;
  424. } // !isDone
  425. // We've dealt with (almost) unrestricted full precision realm.
  426. // Now back digital world.
  427. // round the end points to make them look like compressed ones
  428. CGU_Vec3f inpRmpEndPts0 = {0.0f, 255.0f, 0.0f};
  429. CGU_Vec3f inpRmpEndPts1 = {0.0f, 255.0f, 0.0f};
  430. CGU_Vec3f Fctrs0 = {8.0f, 4.0f, 8.0f}; //(1 << (PIX_GRID - BG)); x (1 << (PIX_GRID - GG)); y (1 << (PIX_GRID - RG)); z
  431. CGU_Vec3f Fctrs1 = {32.0f, 64.0f, 32.0f}; //(CGU_FLOAT)(1 << RG); z (CGU_FLOAT)(1 << GG); y (CGU_FLOAT)(1 << BG); x
  432. CGU_FLOAT _Min = 0.0f;
  433. CGU_FLOAT _Max = 255.0f;
  434. {
  435. // MkRmpOnGrid(inpRmpEndPts, rsltC, _Min, _Max);
  436. inpRmpEndPts0 = floor(rsltC0);
  437. if (inpRmpEndPts0.x <= _Min)
  438. inpRmpEndPts0.x = _Min;
  439. else
  440. {
  441. inpRmpEndPts0.x += floor(128.f / Fctrs1.x) - floor(inpRmpEndPts0.x / Fctrs1.x);
  442. inpRmpEndPts0.x = min(inpRmpEndPts0.x, _Max);
  443. }
  444. if (inpRmpEndPts0.y <= _Min)
  445. inpRmpEndPts0.y = _Min;
  446. else
  447. {
  448. inpRmpEndPts0.y += floor(128.f / Fctrs1.y) - floor(inpRmpEndPts0.y / Fctrs1.y);
  449. inpRmpEndPts0.y = min(inpRmpEndPts0.y, _Max);
  450. }
  451. if (inpRmpEndPts0.z <= _Min)
  452. inpRmpEndPts0.z = _Min;
  453. else
  454. {
  455. inpRmpEndPts0.z += floor(128.f / Fctrs1.z) - floor(inpRmpEndPts0.z / Fctrs1.z);
  456. inpRmpEndPts0.z = min(inpRmpEndPts0.z, _Max);
  457. }
  458. inpRmpEndPts0 = floor(inpRmpEndPts0 / Fctrs0) * Fctrs0;
  459. inpRmpEndPts1 = floor(rsltC1);
  460. if (inpRmpEndPts1.x <= _Min)
  461. inpRmpEndPts1.x = _Min;
  462. else
  463. {
  464. inpRmpEndPts1.x += floor(128.f / Fctrs1.x) - floor(inpRmpEndPts1.x / Fctrs1.x);
  465. inpRmpEndPts1.x = min(inpRmpEndPts1.x, _Max);
  466. }
  467. if (inpRmpEndPts1.y <= _Min)
  468. inpRmpEndPts1.y = _Min;
  469. else
  470. {
  471. inpRmpEndPts1.y += floor(128.f / Fctrs1.y) - floor(inpRmpEndPts1.y / Fctrs1.y);
  472. inpRmpEndPts1.y = min(inpRmpEndPts1.y, _Max);
  473. }
  474. if (inpRmpEndPts1.z <= _Min)
  475. inpRmpEndPts1.z = _Min;
  476. else
  477. {
  478. inpRmpEndPts1.z += floor(128.f / Fctrs1.z) - floor(inpRmpEndPts1.z / Fctrs1.z);
  479. inpRmpEndPts1.z = min(inpRmpEndPts1.z, _Max);
  480. }
  481. inpRmpEndPts1 = floor(inpRmpEndPts1 / Fctrs0) * Fctrs0;
  482. } // MkRmpOnGrid
  483. CMP_EndPoints EndPoints;
  484. EndPoints.Color0 = inpRmpEndPts0;
  485. EndPoints.Color1 = inpRmpEndPts1;
  486. return EndPoints;
  487. }
  488. CMP_STATIC CMP_EndPoints cgu_MkRmpOnGridBGR(CMP_IN CGU_Vec3f rsltC0,
  489. CMP_IN CGU_Vec3f rsltC1,
  490. CMP_IN CGU_UINT32 nRedBits,
  491. CMP_IN CGU_UINT32 nGreenBits,
  492. CMP_IN CGU_UINT32 nBlueBits)
  493. {
  494. CGU_Vec3f inpRmpEndPts0 = {0.0f, 255.0f, 0.0f};
  495. CGU_Vec3f inpRmpEndPts1 = {0.0f, 255.0f, 0.0f};
  496. CGU_Vec3f Fctrs0 = {8.0f, 4.0f, 8.0f};
  497. CGU_Vec3f Fctrs1 = {32.0f, 64.0f, 32.0f};
  498. CGU_FLOAT _Min = 0.0f;
  499. CGU_FLOAT _Max = 255.0f;
  500. // user override 565 default setting
  501. if ((nRedBits!=5)||(nGreenBits!=6)||(nBlueBits!=5)) {
  502. Fctrs1[RC] = (CGU_FLOAT)(1 << nRedBits);
  503. Fctrs1[GC] = (CGU_FLOAT)(1 << nGreenBits);
  504. Fctrs1[BC] = (CGU_FLOAT)(1 << nBlueBits);
  505. Fctrs0[RC] = (CGU_FLOAT)(1 << (PIX_GRID-nRedBits));
  506. Fctrs0[GC] = (CGU_FLOAT)(1 << (PIX_GRID-nGreenBits));
  507. Fctrs0[BC] = (CGU_FLOAT)(1 << (PIX_GRID-nBlueBits));
  508. }
  509. inpRmpEndPts0 = cmp_floorVec3f(rsltC0);
  510. if (inpRmpEndPts0.x <= _Min)
  511. inpRmpEndPts0.x = _Min;
  512. else
  513. {
  514. inpRmpEndPts0.x += cmp_floor(128.f / Fctrs1.x) - cmp_floor(inpRmpEndPts0.x / Fctrs1.x);
  515. inpRmpEndPts0.x = cmp_minf(inpRmpEndPts0.x, _Max);
  516. }
  517. if (inpRmpEndPts0.y <= _Min)
  518. inpRmpEndPts0.y = _Min;
  519. else
  520. {
  521. inpRmpEndPts0.y += cmp_floor(128.f / Fctrs1.y) - cmp_floor(inpRmpEndPts0.y / Fctrs1.y);
  522. inpRmpEndPts0.y = cmp_minf(inpRmpEndPts0.y, _Max);
  523. }
  524. if (inpRmpEndPts0.z <= _Min)
  525. inpRmpEndPts0.z = _Min;
  526. else
  527. {
  528. inpRmpEndPts0.z += cmp_floor(128.f / Fctrs1.z) - cmp_floor(inpRmpEndPts0.z / Fctrs1.z);
  529. inpRmpEndPts0.z = cmp_minf(inpRmpEndPts0.z, _Max);
  530. }
  531. inpRmpEndPts0 = cmp_floorVec3f(inpRmpEndPts0 / Fctrs0) * Fctrs0;
  532. inpRmpEndPts1 = cmp_floorVec3f(rsltC1);
  533. if (inpRmpEndPts1.x <= _Min)
  534. inpRmpEndPts1.x = _Min;
  535. else
  536. {
  537. inpRmpEndPts1.x += cmp_floor(128.f / Fctrs1.x) - cmp_floor(inpRmpEndPts1.x / Fctrs1.x);
  538. inpRmpEndPts1.x = cmp_minf(inpRmpEndPts1.x, _Max);
  539. }
  540. if (inpRmpEndPts1.y <= _Min)
  541. inpRmpEndPts1.y = _Min;
  542. else
  543. {
  544. inpRmpEndPts1.y += cmp_floor(128.f / Fctrs1.y) - cmp_floor(inpRmpEndPts1.y / Fctrs1.y);
  545. inpRmpEndPts1.y = cmp_minf(inpRmpEndPts1.y, _Max);
  546. }
  547. if (inpRmpEndPts1.z <= _Min)
  548. inpRmpEndPts1.z = _Min;
  549. else
  550. {
  551. inpRmpEndPts1.z += cmp_floor(128.f / Fctrs1.z) - cmp_floor(inpRmpEndPts1.z / Fctrs1.z);
  552. inpRmpEndPts1.z = cmp_minf(inpRmpEndPts1.z, _Max);
  553. }
  554. inpRmpEndPts1 = cmp_floorVec3f(inpRmpEndPts1 / Fctrs0) * Fctrs0;
  555. CMP_EndPoints EndPoints;
  556. EndPoints.Color0 = inpRmpEndPts0;
  557. EndPoints.Color1 = inpRmpEndPts1;
  558. return EndPoints;
  559. } // MkRmpOnGrid
  560. //===================================================================
  561. // Replaces CompressBlockBC1_RGBA_Internal()
  562. // if ((errLQ > 0.0f) && (fquality > CMP_QUALITY2)) code block
  563. //===================================================================
  564. CMP_STATIC CGU_Vec2ui cgu_CompRGBBlock(CMP_IN CGU_Vec4f src_imageNorm[BLOCK_SIZE_4X4],
  565. CMP_IN CMP_BC15Options BC15Options)
  566. {
  567. //CGU_FLOAT errLQ = 1e6f;
  568. CGU_UINT32 m_nRefinementSteps = BC15Options.m_nRefinementSteps;
  569. CGU_UINT32 dwAlphaThreshold = BC15Options.m_nAlphaThreshold;
  570. CGU_Vec3f channelWeights = {BC15Options.m_fChannelWeights[0],BC15Options.m_fChannelWeights[1],BC15Options.m_fChannelWeights[2]};
  571. CGU_BOOL isSRGB = BC15Options.m_bIsSRGB;
  572. CGU_Vec3f rgbBlock_normal[BLOCK_SIZE_4X4];
  573. CGU_UINT32 nCmpIndices = 0;
  574. CGU_UINT32 c0, c1;
  575. // High Quality
  576. CMP_EndPoints EndPoints = {{0, 0, 0xFF}, {0, 0, 0xFF}};
  577. CGU_UINT32 i;
  578. ALIGN_16 CGU_FLOAT Rpt[BLOCK_SIZE_4X4];
  579. CGU_UINT32 pcIndices = 0;
  580. m_nRefinementSteps = 0;
  581. CGU_Vec3f BlkInBGRf_UV[BLOCK_SIZE_4X4]; // Normalized Block Input (0..1) in BGR channel format
  582. // Default inidices & endpoints for Transparent Block
  583. CGU_Vec3ui nEndpoints0 = {0, 0, 0}; // Endpoints are stored BGR as x,y,z
  584. CGU_Vec3ui nEndpoints1 = {0xFF, 0xFF, 0xFF}; // Endpoints are stored BGR as x,y,z
  585. for (i = 0; i < BLOCK_SIZE_4X4; i++)
  586. {
  587. Rpt[i] = 0.0f;
  588. }
  589. //===============================================================
  590. // Check if we have more then 2 colors and process Alpha block
  591. CGU_UINT32 dwColors = 0;
  592. CGU_UINT32 dwBlk[BLOCK_SIZE_4X4];
  593. CGU_UINT32 R, G, B, A;
  594. for (i = 0; i < BLOCK_SIZE_4X4; i++)
  595. {
  596. // Do any color conversion prior to processing the block
  597. rgbBlock_normal[i] = isSRGB ? cmp_linearToSrgb(src_imageNorm[i].rgb) : src_imageNorm[i].rgb;
  598. R = (CGU_UINT32)(rgbBlock_normal[i].x * 255.0f);
  599. G = (CGU_UINT32)(rgbBlock_normal[i].y * 255.0f);
  600. B = (CGU_UINT32)(rgbBlock_normal[i].z * 255.0f);
  601. //if (dwAlphaThreshold > 0)
  602. // A = (CGU_UINT32)src_imageNorm[i].w * 255.0f;
  603. //else
  604. A = 255;
  605. // Punch Through Alpha in BC1 Codec (1 bit alpha)
  606. //if ((dwAlphaThreshold == 0) || (A >= dwAlphaThreshold))
  607. //{
  608. // copy to local RGB data and have alpha set to 0xFF
  609. dwBlk[dwColors++] = A << 24 | R << 16 | G << 8 | B;
  610. //}
  611. }
  612. if (!dwColors)
  613. {
  614. // All are colors transparent
  615. EndPoints.Color0.x = EndPoints.Color0.y = EndPoints.Color0.z = 0.0f;
  616. EndPoints.Color1.x = EndPoints.Color1.y = EndPoints.Color0.z = 255.0f;
  617. nCmpIndices = 0xFFFFFFFF;
  618. }
  619. else
  620. {
  621. // We have colors to process
  622. nCmpIndices = 0;
  623. // Punch Through Alpha Support ToDo
  624. // CGU_BOOL bHasAlpha = (dwColors != BLOCK_SIZE_4X4);
  625. // bHasAlpha = bHasAlpha && (dwAlphaThreshold > 0); // valid for (dwNumChannels=4);
  626. // if (bHasAlpha) {
  627. // CGU_Vec2ui compBlock = {0xf800f800,0};
  628. // return compBlock;
  629. // }
  630. // Here we are computing an unique number of sorted colors.
  631. // For each unique value we compute the number of it appearences.
  632. // qsort((void *)dwBlk, (size_t)dwColors, sizeof(CGU_UINT32), QSortIntCmp);
  633. {
  634. CGU_UINT32 j;
  635. CMP_di what[BLOCK_SIZE_4X4];
  636. for (i = 0; i < dwColors; i++)
  637. {
  638. what[i].index = i;
  639. what[i].data = dwBlk[i];
  640. }
  641. CGU_UINT32 tmp_index;
  642. CGU_UINT32 tmp_data;
  643. for (i = 1; i < dwColors; i++)
  644. {
  645. for (j = i; j > 0; j--)
  646. {
  647. if (what[j - 1].data > what[j].data)
  648. {
  649. tmp_index = what[j].index;
  650. tmp_data = what[j].data;
  651. what[j].index = what[j - 1].index;
  652. what[j].data = what[j - 1].data;
  653. what[j - 1].index = tmp_index;
  654. what[j - 1].data = tmp_data;
  655. }
  656. }
  657. }
  658. for (i = 0; i < dwColors; i++)
  659. dwBlk[i] = what[i].data;
  660. }
  661. CGU_UINT32 new_p;
  662. CGU_UINT32 dwBlkU[BLOCK_SIZE_4X4];
  663. CGU_UINT32 dwUniqueColors = 0;
  664. new_p = dwBlkU[0] = dwBlk[0];
  665. Rpt[dwUniqueColors] = 1.f;
  666. for (i = 1; i < dwColors; i++)
  667. {
  668. if (new_p != dwBlk[i])
  669. {
  670. dwUniqueColors++;
  671. new_p = dwBlkU[dwUniqueColors] = dwBlk[i];
  672. Rpt[dwUniqueColors] = 1.f;
  673. }
  674. else
  675. Rpt[dwUniqueColors] += 1.f;
  676. }
  677. dwUniqueColors++;
  678. // Simple case of only 2 colors to process
  679. // no need for futher processing as lowest quality methods work best for this case
  680. if (dwUniqueColors <= 2)
  681. {
  682. CGU_Vec3f rsltC0;
  683. CGU_Vec3f rsltC1;
  684. rsltC0.r = rgbBlock_normal[0].b * 255.0f;
  685. rsltC0.g = rgbBlock_normal[0].g * 255.0f;
  686. rsltC0.b = rgbBlock_normal[0].r * 255.0f;
  687. rsltC1.r = rgbBlock_normal[dwUniqueColors - 1].b * 255.0f;
  688. rsltC1.g = rgbBlock_normal[dwUniqueColors - 1].g * 255.0f;
  689. rsltC1.b = rgbBlock_normal[dwUniqueColors - 1].r * 255.0f;
  690. EndPoints = cgu_MkRmpOnGridBGR(rsltC0, rsltC1,5, 6, 5);
  691. }
  692. else
  693. {
  694. // switch from int range back to UV floats
  695. for (i = 0; i < dwUniqueColors; i++)
  696. {
  697. R = (dwBlkU[i] >> 16) & 0xff;
  698. G = (dwBlkU[i] >> 8) & 0xff;
  699. B = (dwBlkU[i] >> 0) & 0xff;
  700. BlkInBGRf_UV[i].z = (CGU_FLOAT)R / 255.0f;
  701. BlkInBGRf_UV[i].y = (CGU_FLOAT)G / 255.0f;
  702. BlkInBGRf_UV[i].x = (CGU_FLOAT)B / 255.0f;
  703. }
  704. CGU_Vec3f channelWeightsBGR;
  705. channelWeightsBGR.x = channelWeights.z;
  706. channelWeightsBGR.y = channelWeights.y;
  707. channelWeightsBGR.z = channelWeights.x;
  708. EndPoints = cgu_CompressRGBBlockX(BlkInBGRf_UV, Rpt, dwUniqueColors, channelWeightsBGR, m_nRefinementSteps);
  709. }
  710. } // colors
  711. //===================================================================
  712. // Process Cluster INPUT is constant EndPointsf OUTPUT is pcIndices
  713. //===================================================================
  714. if (nCmpIndices == 0)
  715. {
  716. R = (CGU_UINT32)(EndPoints.Color0.z);
  717. G = (CGU_UINT32)(EndPoints.Color0.y);
  718. B = (CGU_UINT32)(EndPoints.Color0.x);
  719. CGU_INT32 cluster0 = cmp_constructColor(R, G, B);
  720. R = (CGU_UINT32)(EndPoints.Color1.z);
  721. G = (CGU_UINT32)(EndPoints.Color1.y);
  722. B = (CGU_UINT32)(EndPoints.Color1.x);
  723. CGU_INT32 cluster1 = cmp_constructColor(R, G, B);
  724. CGU_Vec3f InpRmp[NUM_ENDPOINTS];
  725. if ((cluster0 <= cluster1) // valid for 4 channels
  726. // || (cluster0 > cluster1) // valid for 3 channels
  727. )
  728. {
  729. // inverse endpoints
  730. InpRmp[0] = EndPoints.Color1;
  731. InpRmp[1] = EndPoints.Color0;
  732. }
  733. else
  734. {
  735. InpRmp[0] = EndPoints.Color0;
  736. InpRmp[1] = EndPoints.Color1;
  737. }
  738. CGU_Vec3f srcblockBGR[BLOCK_SIZE_4X4];
  739. CGU_FLOAT srcblockA[BLOCK_SIZE_4X4];
  740. // Swizzle the source RGB to BGR for processing
  741. for (i = 0; i < BLOCK_SIZE_4X4; i++)
  742. {
  743. srcblockBGR[i].z = rgbBlock_normal[i].x * 255.0f;
  744. srcblockBGR[i].y = rgbBlock_normal[i].y * 255.0f;
  745. srcblockBGR[i].x = rgbBlock_normal[i].z * 255.0f;
  746. srcblockA[i] = 255.0f;
  747. if (dwAlphaThreshold > 0)
  748. {
  749. CGU_UINT32 alpha = (CGU_UINT32)src_imageNorm[i].w*255.0f;
  750. if (alpha >= dwAlphaThreshold)
  751. srcblockA[i] = alpha;
  752. }
  753. }
  754. // input ramp is on the coarse grid
  755. // make ramp endpoints the way they'll going to be decompressed
  756. CGU_Vec3f InpRmpL[NUM_ENDPOINTS];
  757. CGU_Vec3f Fctrs = {32.0F, 64.0F, 32.0F}; // 1 << RG,1 << GG,1 << BG
  758. {
  759. // ConstantRamp = MkWkRmpPts(InpRmpL, InpRmp);
  760. InpRmpL[0] = InpRmp[0] + floor(InpRmp[0] / Fctrs);
  761. InpRmpL[0] = cmp_clampVec3f(InpRmpL[0], 0.0f, 255.0f);
  762. InpRmpL[1] = InpRmp[1] + floor(InpRmp[1] / Fctrs);
  763. InpRmpL[1] = cmp_clampVec3f(InpRmpL[1], 0.0f, 255.0f);
  764. } // MkWkRmpPts
  765. // build ramp
  766. CGU_Vec3f LerpRmp[4];
  767. CGU_Vec3f offset = {1.0f, 1.0f, 1.0f};
  768. {
  769. //BldRmp(Rmp, InpRmpL, dwNumChannels);
  770. // linear interpolate end points to get the ramp
  771. LerpRmp[0] = InpRmpL[0];
  772. LerpRmp[3] = InpRmpL[1];
  773. LerpRmp[1] = floor((InpRmpL[0] * 2.0f + LerpRmp[3] + offset) / 3.0f);
  774. LerpRmp[2] = floor((InpRmpL[0] + LerpRmp[3] * 2.0f + offset) / 3.0f);
  775. } // BldRmp
  776. //=========================================================================
  777. // Clusterize, Compute error and find DXTC indexes for the current cluster
  778. //=========================================================================
  779. {
  780. // Clusterize
  781. CGU_UINT32 alpha;
  782. // For each colour in the original block assign it
  783. // to the closest cluster and compute the cumulative error
  784. for (i = 0; i < BLOCK_SIZE_4X4; i++)
  785. {
  786. alpha = (CGU_UINT32)srcblockA[i];
  787. if ((dwAlphaThreshold > 0) && alpha == 0)
  788. { //*((CGU_DWORD *)&_Blk[i][AC]) == 0)
  789. pcIndices |= cmp_set2Bit32(4, i); // dwNumChannels 3 or 4 (default is 4)
  790. }
  791. else
  792. {
  793. CGU_FLOAT shortest = 99999999999.f;
  794. CGU_UINT8 shortestIndex = 0;
  795. CGU_Vec3f channelWeightsBGR;
  796. channelWeightsBGR.x = channelWeights.z;
  797. channelWeightsBGR.y = channelWeights.y;
  798. channelWeightsBGR.z = channelWeights.x;
  799. for (CGU_UINT8 rampindex = 0; rampindex < 4; rampindex++)
  800. {
  801. // r is either 1 or 4
  802. // calculate the distance for each component
  803. CGU_FLOAT distance =
  804. dot(((srcblockBGR[i] - LerpRmp[rampindex]) * channelWeightsBGR), ((srcblockBGR[i] - LerpRmp[rampindex]) * channelWeightsBGR));
  805. if (distance < shortest)
  806. {
  807. shortest = distance;
  808. shortestIndex = rampindex;
  809. }
  810. }
  811. // The total is a sum of (error += shortest)
  812. // We have the index of the best cluster, so assign this in the block
  813. // Reorder indices to match correct DXTC ordering
  814. if (shortestIndex == 3) // dwNumChannels - 1
  815. shortestIndex = 1;
  816. else if (shortestIndex)
  817. shortestIndex++;
  818. pcIndices |= cmp_set2Bit32(shortestIndex, i);
  819. }
  820. } // BLOCK_SIZE_4X4
  821. } // Clusterize
  822. } // Process Cluster
  823. //==============================================================
  824. // Generate Compressed Result from nEndpoints & pcIndices
  825. //==============================================================
  826. c0 = cmp_constructColorBGR(EndPoints.Color0);
  827. c1 = cmp_constructColorBGR(EndPoints.Color1);
  828. // Get Processed indices if not set
  829. if (nCmpIndices == 0)
  830. nCmpIndices = pcIndices;
  831. CGU_Vec2ui cmpBlock;
  832. if (c0 <= c1)
  833. {
  834. cmpBlock.x = c1 | (c0 << 16);
  835. }
  836. else
  837. cmpBlock.x = c0 | (c1 << 16);
  838. cmpBlock.y = nCmpIndices;
  839. return cmpBlock;
  840. }
  841. CMP_STATIC void cgu_ProcessColors(CMP_INOUT CGU_Vec3f CMP_PTRINOUT colorMin,
  842. CMP_INOUT CGU_Vec3f CMP_PTRINOUT colorMax,
  843. CMP_INOUT CGU_UINT32 CMP_PTRINOUT c0,
  844. CMP_INOUT CGU_UINT32 CMP_PTRINOUT c1,
  845. CMP_IN CGU_INT setopt,
  846. CMP_IN CGU_BOOL isSRGB)
  847. {
  848. // CGU_UINT32 srbMap[32] = {0,5,8,11,12,13,14,15,16,17,18,19,20,21,22,23,23,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31};
  849. // CGU_UINT32 sgMap[64] = {0,10,14,16,19,20,22,24,25,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,42,43,43,44,45,45,
  850. // 46,47,47,48,48,49,50,50,51,52,52,53,53,54,54,55,55,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63};
  851. CGU_INT32 x, y, z;
  852. CGU_Vec3f scale = {31.0f, 63.0f, 31.0f};
  853. CGU_Vec3f MinColorScaled;
  854. CGU_Vec3f MaxColorScaled;
  855. // Clamp or Transform is needed, the transforms have built in clamps
  856. if (isSRGB)
  857. {
  858. MinColorScaled = cmp_linearToSrgb(CMP_PTRINOUT colorMin);
  859. MaxColorScaled = cmp_linearToSrgb(CMP_PTRINOUT colorMax);
  860. }
  861. else
  862. {
  863. MinColorScaled = cmp_clampVec3f(CMP_PTRINOUT colorMin, 0.0f, 1.0f);
  864. MaxColorScaled = cmp_clampVec3f(CMP_PTRINOUT colorMax, 0.0f, 1.0f);
  865. }
  866. switch (setopt)
  867. {
  868. case 0: // Use Min Max processing
  869. MinColorScaled = cmp_floorVec3f(MinColorScaled * scale);
  870. MaxColorScaled = cmp_ceilVec3f(MaxColorScaled * scale);
  871. CMP_PTRINOUT colorMin = MinColorScaled / scale;
  872. CMP_PTRINOUT colorMax = MaxColorScaled / scale;
  873. break;
  874. default: // Use round processing
  875. MinColorScaled = round(MinColorScaled * scale);
  876. MaxColorScaled = round(MaxColorScaled * scale);
  877. break;
  878. }
  879. x = (CGU_UINT32)(MinColorScaled.x);
  880. y = (CGU_UINT32)(MinColorScaled.y);
  881. z = (CGU_UINT32)(MinColorScaled.z);
  882. //if (isSRGB) {
  883. // // scale RB
  884. // x = srbMap[x]; // &0x1F];
  885. // y = sgMap [y]; // &0x3F];
  886. // z = srbMap[z]; // &0x1F];
  887. // // scale G
  888. //}
  889. CMP_PTRINOUT c0 = (x << 11) | (y << 5) | z;
  890. x = (CGU_UINT32)(MaxColorScaled.x);
  891. y = (CGU_UINT32)(MaxColorScaled.y);
  892. z = (CGU_UINT32)(MaxColorScaled.z);
  893. CMP_PTRINOUT c1 = (x << 11) | (y << 5) | z;
  894. }
  895. CMP_STATIC CGU_FLOAT cgu_getIndicesRGB(CMP_INOUT CGU_UINT32 CMP_PTRINOUT cmpindex,
  896. CMP_IN const CGU_Vec3f block[16],
  897. CMP_IN CGU_Vec3f minColor,
  898. CMP_IN CGU_Vec3f maxColor,
  899. CMP_IN CGU_BOOL getErr)
  900. {
  901. CGU_UINT32 PackedIndices = 0;
  902. CGU_FLOAT err = 0.0f;
  903. CGU_Vec3f cn[4];
  904. CGU_FLOAT minDistance;
  905. if (getErr)
  906. {
  907. // remap to BC1 spec for decoding offsets,
  908. // where cn[0] > cn[1] Max Color = index 0, 2/3 offset =index 2, 1/3 offset = index 3, Min Color = index 1
  909. cn[0] = maxColor;
  910. cn[1] = minColor;
  911. cn[2] = cn[0] * 2.0f / 3.0f + cn[1] * 1.0f / 3.0f;
  912. cn[3] = cn[0] * 1.0f / 3.0f + cn[1] * 2.0f / 3.0f;
  913. }
  914. CGU_FLOAT Scale = 3.f / cmp_dotVec3f(minColor - maxColor, minColor - maxColor);
  915. CGU_Vec3f ScaledRange = (minColor - maxColor) * Scale;
  916. CGU_FLOAT Bias = (cmp_dotVec3f(maxColor, maxColor) - cmp_dotVec3f(maxColor, minColor)) * Scale;
  917. CGU_INT indexMap[4] = {0, 2, 3, 1}; // mapping based on BC1 Spec for color0 > color1
  918. CGU_UINT32 index;
  919. CGU_FLOAT diff;
  920. for (CGU_UINT32 i = 0; i < 16; i++)
  921. {
  922. // Get offset from base scale
  923. diff = cmp_dotVec3f(block[i], ScaledRange) + Bias;
  924. index = ((CGU_UINT32)round(diff)) & 0x3;
  925. // remap linear offset to spec offset
  926. index = indexMap[index];
  927. // use err calc for use in higher quality code
  928. if (getErr)
  929. {
  930. minDistance = cmp_dotVec3f(block[i] - cn[index], block[i] - cn[index]);
  931. err += minDistance;
  932. }
  933. // Map the 2 bit index into compress 32 bit block
  934. if (index)
  935. PackedIndices |= (index << (2 * i));
  936. }
  937. if (getErr)
  938. err = err * 0.0208333f;
  939. CMP_PTRINOUT cmpindex = PackedIndices;
  940. return err;
  941. }
  942. //--------------------------------------------------------------------------------------------------------
  943. // Decompress is RGB (0.0f..255.0f)
  944. //--------------------------------------------------------------------------------------------------------
  945. CMP_STATIC void cgu_decompressRGBBlock(CMP_INOUT CGU_Vec3f rgbBlock[BLOCK_SIZE_4X4], const CGU_Vec2ui compressedBlock)
  946. {
  947. CGU_UINT32 n0 = compressedBlock.x & 0xffff;
  948. CGU_UINT32 n1 = compressedBlock.x >> 16;
  949. CGU_UINT32 index;
  950. //-------------------------------------------------------
  951. // Decode the compressed block 0..255 color range
  952. //-------------------------------------------------------
  953. CGU_Vec3f c0 = cmp_565ToLinear(n0); // max color
  954. CGU_Vec3f c1 = cmp_565ToLinear(n1); // min color
  955. CGU_Vec3f c2;
  956. CGU_Vec3f c3;
  957. if (n0 > n1)
  958. {
  959. c2 = (c0 * 2.0f + c1) / 3.0f;
  960. c3 = (c1 * 2.0f + c0) / 3.0f;
  961. for (CGU_UINT32 i = 0; i < 16; i++)
  962. {
  963. index = (compressedBlock.y >> (2 * i)) & 3;
  964. switch (index)
  965. {
  966. case 0:
  967. rgbBlock[i] = c0;
  968. break;
  969. case 1:
  970. rgbBlock[i] = c1;
  971. break;
  972. case 2:
  973. rgbBlock[i] = c2;
  974. break;
  975. case 3:
  976. rgbBlock[i] = c3;
  977. break;
  978. }
  979. }
  980. }
  981. else
  982. {
  983. // Transparent decode
  984. c2 = (c0 + c1) / 2.0f;
  985. for (CGU_UINT32 i = 0; i < 16; i++)
  986. {
  987. index = (compressedBlock.y >> (2 * i)) & 3;
  988. switch (index)
  989. {
  990. case 0:
  991. rgbBlock[i] = c0;
  992. break;
  993. case 1:
  994. rgbBlock[i] = c1;
  995. break;
  996. case 2:
  997. rgbBlock[i] = c2;
  998. break;
  999. case 3:
  1000. rgbBlock[i] = 0.0f;
  1001. break;
  1002. }
  1003. }
  1004. }
  1005. }
  1006. // The source is 0..255
  1007. CMP_STATIC float cgu_RGBABlockErrorLinear(const CGU_Vec4uc src_rgbBlock[BLOCK_SIZE_4X4], const CGU_Vec2ui compressedBlock)
  1008. {
  1009. CGU_Vec3f rgbBlock[BLOCK_SIZE_4X4];
  1010. // Decompressed block channels are 0..255
  1011. cgu_decompressRGBBlock(rgbBlock, compressedBlock);
  1012. //------------------------------------------------------------------
  1013. // Calculate MSE of the block
  1014. // Note : pow is used as Float type for the code to be usable on CPU
  1015. //------------------------------------------------------------------
  1016. CGU_Vec3f serr;
  1017. serr = 0.0f;
  1018. float sR, sG, sB, R, G, B;
  1019. for (int j = 0; j < 16; j++)
  1020. {
  1021. sR = src_rgbBlock[j].x;
  1022. sG = src_rgbBlock[j].y;
  1023. sB = src_rgbBlock[j].z;
  1024. R = rgbBlock[j].x;
  1025. G = rgbBlock[j].y;
  1026. B = rgbBlock[j].z;
  1027. // Norm colors
  1028. serr.x += pow(sR - R, 2.0f);
  1029. serr.y += pow(sG - G, 2.0f);
  1030. serr.z += pow(sB - B, 2.0f);
  1031. }
  1032. // MSE for 16 texels
  1033. return (serr.x + serr.y + serr.z) / 48.0f;
  1034. }
  1035. // The source is 0..1, decompressed data using cmp_decompressRGBBlock2 is 0..255 which is converted down to 0..1
  1036. CMP_STATIC float cgu_RGBBlockError(const CGU_Vec3f src_rgbBlock[BLOCK_SIZE_4X4], const CGU_Vec2ui compressedBlock, CGU_BOOL isSRGB)
  1037. {
  1038. CGU_Vec3f rgbBlock[BLOCK_SIZE_4X4];
  1039. // Decompressed block channels are 0..255
  1040. cgu_decompressRGBBlock(rgbBlock, compressedBlock);
  1041. //------------------------------------------------------------------
  1042. // Calculate MSE of the block
  1043. // Note : pow is used as Float type for the code to be usable on CPU
  1044. //------------------------------------------------------------------
  1045. CGU_Vec3f serr;
  1046. serr = 0.0f;
  1047. float sR, sG, sB, R, G, B;
  1048. for (int j = 0; j < 16; j++)
  1049. {
  1050. if (isSRGB)
  1051. {
  1052. sR = round(cmp_linearToSrgbf(src_rgbBlock[j].x) * 255.0f);
  1053. sG = round(cmp_linearToSrgbf(src_rgbBlock[j].y) * 255.0f);
  1054. sB = round(cmp_linearToSrgbf(src_rgbBlock[j].z) * 255.0f);
  1055. }
  1056. else
  1057. {
  1058. sR = round(src_rgbBlock[j].x * 255.0f);
  1059. sG = round(src_rgbBlock[j].y * 255.0f);
  1060. sB = round(src_rgbBlock[j].z * 255.0f);
  1061. }
  1062. R = rgbBlock[j].x;
  1063. G = rgbBlock[j].y;
  1064. B = rgbBlock[j].z;
  1065. // Norm colors
  1066. serr.x += pow(sR - R, 2.0f);
  1067. serr.y += pow(sG - G, 2.0f);
  1068. serr.z += pow(sB - B, 2.0f);
  1069. }
  1070. // MSE for 16 texels
  1071. return (serr.x + serr.y + serr.z) / 48.0f;
  1072. }
  1073. CMP_STATIC CGU_Vec2ui cgu_CompressRGBBlock_MinMax(CMP_IN const CGU_Vec3f src_imageRGB[16],
  1074. CMP_IN CGU_FLOAT fquality,
  1075. CMP_IN CGU_BOOL isSRGB,
  1076. CMP_INOUT CGU_Vec3f srcRGB[16], // The list of source colors with blue channel altered
  1077. CMP_INOUT CGU_Vec3f CMP_REFINOUT average_rgb, // The centrepoint of the axis
  1078. CMP_INOUT CGU_FLOAT CMP_REFINOUT errout
  1079. )
  1080. {
  1081. CGU_Vec2ui Q1CompData = {0,0};
  1082. CGU_Vec3f rgb = {0,0,0};
  1083. // -------------------------------------------------------------------------------------
  1084. // (1) Find the array of unique pixel values and sum them to find their average position
  1085. // -------------------------------------------------------------------------------------
  1086. CGU_FLOAT errLQ = 0.0f;
  1087. CGU_BOOL fastProcess = (fquality <= CMP_QUALITY0); // Min Max only
  1088. CGU_Vec3f srcMin = 1.0f; // Min source color
  1089. CGU_Vec3f srcMax = 0.0f; // Max source color
  1090. CGU_Vec2ui Q1compressedBlock = {0, 0};
  1091. CGU_UINT32 c0 = 0;
  1092. CGU_UINT32 c1 = 0;
  1093. average_rgb = 0.0f;
  1094. // Get average and modifed src
  1095. // find average position and save list of pixels as 0F..255F range for processing
  1096. // Note: z (blue) is average of blue+green channels
  1097. for (CGU_INT32 i = 0; i < BLOCK_SIZE_4X4; i++)
  1098. {
  1099. srcMin = cmp_minVec3f(srcMin, src_imageRGB[i]);
  1100. srcMax = cmp_maxVec3f(srcMax, src_imageRGB[i]);
  1101. if (!fastProcess)
  1102. {
  1103. rgb = isSRGB ? cmp_linearToSrgb(src_imageRGB[i]) : cmp_saturate(src_imageRGB[i]);
  1104. rgb.z = (rgb.y + rgb.z) * 0.5F; // Z-axiz => (R+G)/2
  1105. srcRGB[i] = rgb;
  1106. average_rgb = average_rgb + rgb;
  1107. }
  1108. }
  1109. // Process two colors for saving in 565 format as C0 and C1
  1110. cgu_ProcessColors(CMP_REFINOUT srcMin, CMP_REFINOUT srcMax, CMP_REFINOUT c0, CMP_REFINOUT c1, isSRGB ? 1 : 0, isSRGB);
  1111. // Save simple min-max encoding
  1112. if (c0 < c1)
  1113. {
  1114. Q1CompData.x = (c0 << 16) | c1;
  1115. CGU_UINT32 index = 0;
  1116. errLQ = cgu_getIndicesRGB(CMP_REFINOUT index, src_imageRGB, srcMin, srcMax, false);
  1117. Q1CompData.y = index;
  1118. errout = cgu_RGBBlockError(src_imageRGB, Q1CompData, isSRGB);
  1119. }
  1120. else
  1121. {
  1122. // Most simple case all colors are equal or 0.0f
  1123. Q1compressedBlock.x = (c1 << 16) | c0;
  1124. Q1compressedBlock.y = 0;
  1125. errout = 0.0f;
  1126. return Q1compressedBlock;
  1127. }
  1128. // 0.0625F is (1/BLOCK_SIZE_4X4)
  1129. average_rgb = average_rgb * 0.0625F;
  1130. return Q1CompData;
  1131. }
  1132. CMP_STATIC CGU_Vec2ui cgu_CompressRGBBlock_Fast(CMP_IN const CGU_Vec3f src_imageRGB[16],
  1133. CMP_IN CGU_FLOAT fquality,
  1134. CMP_IN CGU_BOOL isSRGB,
  1135. CMP_IN CGU_Vec3f srcRGB[16],
  1136. CMP_IN CGU_Vec3f CMP_REFINOUT average_rgb,
  1137. CMP_INOUT CGU_FLOAT CMP_REFINOUT errout)
  1138. {
  1139. CGU_Vec3f axisVectorRGB = {0.0f, 0.0f, 0.0f}; // The axis vector for index projection
  1140. CGU_FLOAT pos_on_axis[16]; // The distance each unique falls along the compression axis
  1141. CGU_FLOAT axisleft = 0; // The extremities and centre (average of left/right) of srcRGB along the compression axis
  1142. CGU_FLOAT axisright = 0; // The extremities and centre (average of left/right) of srcRGB along the compression axis
  1143. CGU_FLOAT axiscentre = 0; // The extremities and centre (average of left/right) of srcRGB along the compression axis
  1144. CGU_INT32 swap = 0; // Indicator if the RGB values need swapping to generate an opaque result
  1145. CGU_Vec3f srcBlock[16]; // The list of source colors with any color space transforms and clipping
  1146. CGU_UINT32 c0 = 0;
  1147. CGU_UINT32 c1 = 0;
  1148. CGU_Vec2ui compressedBlock = {0, 0};
  1149. CGU_FLOAT Q1CompErr;
  1150. CGU_Vec2ui Q1CompData = {0,0};
  1151. CGU_Vec3f rgb = {0,0,0};
  1152. // -------------------------------------------------------------------------------------
  1153. // (4) For each component, reflect points about the average so all lie on the same side
  1154. // of the average, and compute the new average - this gives a second point that defines the axis
  1155. // To compute the sign of the axis sum the positive differences of G for each of R and B (the
  1156. // G axis is always positive in this implementation
  1157. // -------------------------------------------------------------------------------------
  1158. // An interesting situation occurs if the G axis contains no information, in which case the RB
  1159. // axis is also compared. I am not entirely sure if this is the correct implementation - should
  1160. // the priority axis be determined by magnitude?
  1161. {
  1162. CGU_FLOAT rg_pos = 0.0f;
  1163. CGU_FLOAT bg_pos = 0.0f;
  1164. CGU_FLOAT rb_pos = 0.0f;
  1165. for (CGU_INT32 i = 0; i < BLOCK_SIZE_4X4; i++)
  1166. {
  1167. rgb = srcRGB[i] - average_rgb;
  1168. axisVectorRGB = axisVectorRGB + cmp_fabsVec3f(rgb);
  1169. if (rgb.x > 0)
  1170. {
  1171. rg_pos += rgb.y;
  1172. rb_pos += rgb.z;
  1173. }
  1174. if (rgb.z > 0)
  1175. bg_pos += rgb.y;
  1176. }
  1177. // Average over BLOCK_SIZE_4X4
  1178. axisVectorRGB = axisVectorRGB * 0.0625F;
  1179. // New average position
  1180. if (rg_pos < 0)
  1181. axisVectorRGB.x = -axisVectorRGB.x;
  1182. if (bg_pos < 0)
  1183. axisVectorRGB.z = -axisVectorRGB.z;
  1184. if ((rg_pos == bg_pos) && (rg_pos == 0))
  1185. {
  1186. if (rb_pos < 0)
  1187. axisVectorRGB.z = -axisVectorRGB.z;
  1188. }
  1189. }
  1190. // -------------------------------------------------------------------------------------
  1191. // (5) Axis projection and remapping
  1192. // -------------------------------------------------------------------------------------
  1193. {
  1194. CGU_FLOAT v2_recip;
  1195. // Normalize the axis for simplicity of future calculation
  1196. v2_recip = cmp_dotVec3f(axisVectorRGB, axisVectorRGB);
  1197. if (v2_recip > 0)
  1198. v2_recip = 1.0f / (CGU_FLOAT)cmp_sqrt(v2_recip);
  1199. else
  1200. v2_recip = 1.0f;
  1201. axisVectorRGB = axisVectorRGB * v2_recip;
  1202. }
  1203. // -------------------------------------------------------------------------------------
  1204. // (6) Map the axis
  1205. // -------------------------------------------------------------------------------------
  1206. // the line joining (and extended on either side of) average and axis
  1207. // defines the axis onto which the points will be projected
  1208. // Project all the points onto the axis, calculate the distance along
  1209. // the axis from the centre of the axis (average)
  1210. // From Foley & Van Dam: Closest point of approach of a line (P + v) to a point (R) is
  1211. // P + ((R-P).v) / (v.v))v
  1212. // The distance along v is therefore (R-P).v / (v.v) where (v.v) is 1 if v is a unit vector.
  1213. //
  1214. // Calculate the extremities at the same time - these need to be reasonably accurately
  1215. // represented in all cases
  1216. {
  1217. axisleft = CMP_FLOAT_MAX;
  1218. axisright = -CMP_FLOAT_MAX;
  1219. for (CGU_INT32 i = 0; i < BLOCK_SIZE_4X4; i++)
  1220. {
  1221. // Compute the distance along the axis of the point of closest approach
  1222. CGU_Vec3f temp = (srcRGB[i] - average_rgb);
  1223. pos_on_axis[i] = cmp_dotVec3f(temp, axisVectorRGB);
  1224. // Work out the extremities
  1225. if (pos_on_axis[i] < axisleft)
  1226. axisleft = pos_on_axis[i];
  1227. if (pos_on_axis[i] > axisright)
  1228. axisright = pos_on_axis[i];
  1229. }
  1230. }
  1231. // ---------------------------------------------------------------------------------------------
  1232. // (7) Now we have a good axis and the basic information about how the points are mapped to it
  1233. // Our initial guess is to represent the endpoints accurately, by moving the average
  1234. // to the centre and recalculating the point positions along the line
  1235. // ---------------------------------------------------------------------------------------------
  1236. {
  1237. axiscentre = (axisleft + axisright) * 0.5F;
  1238. average_rgb = average_rgb + (axisVectorRGB * axiscentre);
  1239. for (CGU_INT32 i = 0; i < BLOCK_SIZE_4X4; i++)
  1240. pos_on_axis[i] -= axiscentre;
  1241. axisright -= axiscentre;
  1242. axisleft -= axiscentre;
  1243. }
  1244. // -------------------------------------------------------------------------------------
  1245. // (8) Calculate the high and low output colour values
  1246. // Involved in this is a rounding procedure which is undoubtedly slightly twitchy. A
  1247. // straight rounded average is not correct, as the decompressor 'unrounds' by replicating
  1248. // the top bits to the bottom.
  1249. // In order to take account of this process, we don't just apply a straight rounding correction,
  1250. // but base our rounding on the input value (a straight rounding is actually pretty good in terms of
  1251. // error measure, but creates a visual colour and/or brightness shift relative to the original image)
  1252. // The method used here is to apply a centre-biased rounding dependent on the input value, which was
  1253. // (mostly by experiment) found to give minimum MSE while preserving the visual characteristics of
  1254. // the image.
  1255. // rgb = (average_rgb + (left|right)*axisVectorRGB);
  1256. // -------------------------------------------------------------------------------------
  1257. {
  1258. CGU_Vec3f MinColor, MaxColor;
  1259. MinColor = average_rgb + (axisVectorRGB * axisleft);
  1260. MaxColor = average_rgb + (axisVectorRGB * axisright);
  1261. MinColor.z = (MinColor.z * 2) - MinColor.y;
  1262. MaxColor.z = (MaxColor.z * 2) - MaxColor.y;
  1263. cgu_ProcessColors(CMP_REFINOUT MinColor, CMP_REFINOUT MaxColor, CMP_REFINOUT c0, CMP_REFINOUT c1, 1, false);
  1264. // Force to be a 4-colour opaque block - in which case, c0 is greater than c1
  1265. swap = 0;
  1266. if (c0 < c1)
  1267. {
  1268. CGU_UINT32 t;
  1269. t = c0;
  1270. c0 = c1;
  1271. c1 = t;
  1272. swap = 1;
  1273. }
  1274. else if (c0 == c1)
  1275. {
  1276. // This block will always be encoded in 3-colour mode
  1277. // Need to ensure that only one of the two points gets used,
  1278. // avoiding accidentally setting some transparent pixels into the block
  1279. for (CGU_INT32 i = 0; i < BLOCK_SIZE_4X4; i++)
  1280. pos_on_axis[i] = axisleft;
  1281. }
  1282. compressedBlock.x = c0 | (c1 << 16);
  1283. // -------------------------------------------------------------------------------------
  1284. // (9) Final clustering, creating the 2-bit values that define the output
  1285. // -------------------------------------------------------------------------------------
  1286. CGU_UINT32 index;
  1287. CGU_FLOAT division;
  1288. {
  1289. compressedBlock.y = 0;
  1290. division = axisright * 2.0f / 3.0f;
  1291. axiscentre = (axisleft + axisright) / 2; // Actually, this code only works if centre is 0 or approximately so
  1292. CGU_FLOAT CompMinErr;
  1293. // This feature is work in progress
  1294. // remap to BC1 spec for decoding offsets,
  1295. // where cn[0] > cn[1] Max Color = index 0, 2/3 offset =index 2, 1/3 offset = index 3, Min Color = index 1
  1296. // CGU_Vec3f cn[4];
  1297. // cn[0] = MaxColor;
  1298. // cn[1] = MinColor;
  1299. // cn[2] = cn[0]*2.0f/3.0f + cn[1]*1.0f/3.0f;
  1300. // cn[3] = cn[0]*1.0f/3.0f + cn[1]*2.0f/3.0f;
  1301. for (CGU_INT32 i = 0; i < BLOCK_SIZE_4X4; i++)
  1302. {
  1303. // Endpoints (indicated by block > average) are 0 and 1, while
  1304. // interpolants are 2 and 3
  1305. if (cmp_fabs(pos_on_axis[i]) >= division)
  1306. index = 0;
  1307. else
  1308. index = 2;
  1309. // Positive is in the latter half of the block
  1310. if (pos_on_axis[i] >= axiscentre)
  1311. index += 1;
  1312. index = index ^ swap;
  1313. // Set the output, taking swapping into account
  1314. compressedBlock.y |= (index << (2 * i));
  1315. // use err calc for use in higher quality code
  1316. //CompMinErr += cmp_dotVec3f(srcRGBRef[i] - cn[index],srcRGBRef[i] - cn[index]);
  1317. }
  1318. //CompMinErr = CompMinErr * 0.0208333f;
  1319. CompMinErr = cgu_RGBBlockError(src_imageRGB, compressedBlock, isSRGB);
  1320. Q1CompErr = cgu_RGBBlockError(src_imageRGB, Q1CompData, isSRGB);
  1321. if (CompMinErr > Q1CompErr)
  1322. {
  1323. compressedBlock = Q1CompData;
  1324. errout = Q1CompErr;
  1325. }
  1326. else
  1327. errout = CompMinErr;
  1328. }
  1329. }
  1330. // done
  1331. return compressedBlock;
  1332. }
  1333. CMP_STATIC CGU_UINT8 g_Match5Bit[256][2] = {
  1334. { 0, 0},{ 0, 0},{ 1, 0},{ 1, 0},{ 0, 1},{ 0, 1},{ 0, 1},{ 1, 1},{ 1, 1},{ 1, 1},{ 0, 2},{ 4, 0},{ 1, 2},{ 1, 2},{ 1, 2},{ 2, 2},
  1335. { 2, 2},{ 2, 2},{ 1, 3},{ 5, 1},{ 2, 3},{ 2, 3},{ 0, 4},{ 3, 3},{ 3, 3},{ 3, 3},{ 2, 4},{ 2, 4},{ 2, 4},{ 5, 3},{ 1, 5},{ 1, 5},
  1336. { 2, 5},{ 4, 4},{ 4, 4},{ 3, 5},{ 3, 5},{ 2, 6},{ 2, 6},{ 2, 6},{ 3, 6},{ 5, 5},{ 5, 5},{ 4, 6},{ 8, 4},{ 3, 7},{ 3, 7},{ 3, 7},
  1337. { 6, 6},{ 6, 6},{ 6, 6},{ 5, 7},{ 9, 5},{ 6, 7},{ 6, 7},{ 4, 8},{ 7, 7},{ 7, 7},{ 7, 7},{ 6, 8},{ 6, 8},{ 6, 8},{ 9, 7},{ 5, 9},
  1338. { 5, 9},{ 6, 9},{ 8, 8},{ 8, 8},{ 7, 9},{ 7, 9},{ 6,10},{ 6,10},{ 6,10},{ 7,10},{ 9, 9},{ 9, 9},{ 8,10},{12, 8},{ 7,11},{ 7,11},
  1339. { 7,11},{10,10},{10,10},{10,10},{ 9,11},{13, 9},{10,11},{10,11},{ 8,12},{11,11},{11,11},{11,11},{10,12},{10,12},{10,12},{13,11},
  1340. { 9,13},{ 9,13},{10,13},{12,12},{12,12},{11,13},{11,13},{10,14},{10,14},{10,14},{11,14},{13,13},{13,13},{12,14},{16,12},{11,15},
  1341. {11,15},{11,15},{14,14},{14,14},{14,14},{13,15},{17,13},{14,15},{14,15},{12,16},{15,15},{15,15},{15,15},{14,16},{14,16},{14,16},
  1342. {17,15},{13,17},{13,17},{14,17},{16,16},{16,16},{15,17},{15,17},{14,18},{14,18},{14,18},{15,18},{17,17},{17,17},{16,18},{20,16},
  1343. {15,19},{15,19},{15,19},{18,18},{18,18},{18,18},{17,19},{21,17},{18,19},{18,19},{16,20},{19,19},{19,19},{19,19},{18,20},{18,20},
  1344. {18,20},{21,19},{17,21},{17,21},{18,21},{20,20},{20,20},{19,21},{19,21},{18,22},{18,22},{18,22},{19,22},{21,21},{21,21},{20,22},
  1345. {24,20},{19,23},{19,23},{19,23},{22,22},{22,22},{22,22},{21,23},{25,21},{22,23},{22,23},{20,24},{23,23},{23,23},{23,23},{22,24},
  1346. {22,24},{22,24},{25,23},{21,25},{21,25},{22,25},{24,24},{24,24},{23,25},{23,25},{22,26},{22,26},{22,26},{23,26},{25,25},{25,25},
  1347. {24,26},{28,24},{23,27},{23,27},{23,27},{26,26},{26,26},{26,26},{25,27},{29,25},{26,27},{26,27},{24,28},{27,27},{27,27},{27,27},
  1348. {26,28},{26,28},{26,28},{29,27},{25,29},{25,29},{26,29},{28,28},{28,28},{27,29},{27,29},{26,30},{26,30},{26,30},{27,30},{29,29},
  1349. {29,29},{28,30},{28,30},{27,31},{27,31},{27,31},{30,30},{30,30},{30,30},{29,31},{29,31},{30,31},{30,31},{30,31},{31,31},{31,31}};
  1350. CMP_STATIC CGU_UINT8 g_Match6Bit[256][2] = {
  1351. { 0, 0},{ 1, 0},{ 0, 1},{ 1, 1},{ 1, 1},{ 0, 2},{ 1, 2},{ 2, 2},{ 2, 2},{ 1, 3},{ 0, 4},{ 3, 3},{ 3, 3},{ 0, 5},{ 1, 5},{ 4, 4},
  1352. { 4, 4},{ 1, 6},{ 0, 7},{ 5, 5},{ 5, 5},{ 0, 8},{ 1, 8},{ 6, 6},{ 6, 6},{ 1, 9},{ 2, 9},{ 7, 7},{ 7, 7},{ 2,10},{ 3,10},{ 8, 8},
  1353. { 8, 8},{ 3,11},{ 4,11},{ 9, 9},{ 9, 9},{ 4,12},{ 5,12},{10,10},{10,10},{ 5,13},{ 6,13},{16, 8},{11,11},{ 6,14},{ 7,14},{17, 9},
  1354. {12,12},{ 7,15},{ 8,15},{16,11},{13,13},{10,15},{ 8,16},{ 9,16},{14,14},{13,15},{ 9,17},{10,17},{15,15},{16,15},{10,18},{11,18},
  1355. {12,18},{16,16},{11,19},{12,19},{13,19},{17,17},{12,20},{13,20},{14,20},{18,18},{13,21},{14,21},{15,21},{19,19},{14,22},{15,22},
  1356. {20,20},{20,20},{15,23},{16,23},{21,21},{21,21},{16,24},{17,24},{22,22},{22,22},{17,25},{18,25},{23,23},{23,23},{18,26},{19,26},
  1357. {24,24},{24,24},{19,27},{20,27},{25,25},{25,25},{20,28},{21,28},{26,26},{26,26},{21,29},{22,29},{32,24},{27,27},{22,30},{23,30},
  1358. {33,25},{28,28},{23,31},{24,31},{32,27},{29,29},{26,31},{24,32},{25,32},{30,30},{29,31},{25,33},{26,33},{31,31},{32,31},{26,34},
  1359. {27,34},{28,34},{32,32},{27,35},{28,35},{29,35},{33,33},{28,36},{29,36},{30,36},{34,34},{29,37},{30,37},{31,37},{35,35},{30,38},
  1360. {31,38},{36,36},{36,36},{31,39},{32,39},{37,37},{37,37},{32,40},{33,40},{38,38},{38,38},{33,41},{34,41},{39,39},{39,39},{34,42},
  1361. {35,42},{40,40},{40,40},{35,43},{36,43},{41,41},{41,41},{36,44},{37,44},{42,42},{42,42},{37,45},{38,45},{48,40},{43,43},{38,46},
  1362. {39,46},{49,41},{44,44},{39,47},{40,47},{48,43},{45,45},{42,47},{40,48},{41,48},{46,46},{45,47},{41,49},{42,49},{47,47},{48,47},
  1363. {42,50},{43,50},{44,50},{48,48},{43,51},{44,51},{45,51},{49,49},{44,52},{45,52},{46,52},{50,50},{45,53},{46,53},{47,53},{51,51},
  1364. {46,54},{47,54},{52,52},{52,52},{47,55},{48,55},{53,53},{53,53},{48,56},{49,56},{54,54},{54,54},{49,57},{50,57},{55,55},{55,55},
  1365. {50,58},{51,58},{56,56},{56,56},{51,59},{52,59},{57,57},{57,57},{52,60},{53,60},{58,58},{58,58},{53,61},{54,61},{59,59},{59,59},
  1366. {54,62},{55,62},{60,60},{60,60},{55,63},{56,63},{61,61},{61,61},{58,63},{59,63},{62,62},{62,62},{61,63},{62,63},{63,63},{63,63}};
  1367. CMP_STATIC CGU_Vec2ui cgu_solidColorBlock(CMP_IN CGU_UINT8 Red, CMP_IN CGU_UINT8 Green, CMP_IN CGU_UINT8 Blue)
  1368. {
  1369. CGU_UINT32 maxEndp16;
  1370. CGU_UINT32 minEndp16;
  1371. CGU_UINT32 mask = 0xAAAAAAAAu;
  1372. minEndp16 = g_Match5Bit[Red][0] * 2048U + g_Match6Bit[Green][0] * 32U + g_Match5Bit[Blue][0];
  1373. maxEndp16 = g_Match5Bit[Red][1] * 2048U + g_Match6Bit[Green][1] * 32U + g_Match5Bit[Blue][1];
  1374. // write the color block
  1375. if( maxEndp16 < minEndp16 )
  1376. {
  1377. CGU_UINT32 tmpValue = minEndp16;
  1378. minEndp16 = maxEndp16;
  1379. maxEndp16 = tmpValue;
  1380. mask ^= 0x55555555u;
  1381. }
  1382. CGU_Vec2ui outputBytes;
  1383. outputBytes.x = CGU_UINT32(maxEndp16) | (CGU_UINT32(minEndp16) << 16u);
  1384. outputBytes.y = mask;
  1385. return outputBytes;
  1386. }
  1387. CMP_STATIC void cmp_get_encode_data(CMP_IN CMP_EncodeData CMP_REFINOUT edata, CMP_IN CMP_CONSTANT CGU_Vec4uc src_image[16])
  1388. {
  1389. CMP_CONSTANT CGU_UINT32 fr = src_image[0].r, fg = src_image[0].g, fb = src_image[0].b;
  1390. edata.all_colors_equal = false;
  1391. edata.total.r = fr;
  1392. edata.total.g = fg;
  1393. edata.total.b = fb;
  1394. edata.max.r = fr;
  1395. edata.max.g = fg;
  1396. edata.max.b = fb;
  1397. edata.min.r = fr;
  1398. edata.min.g = fg;
  1399. edata.min.b = fb;
  1400. edata.grayscale_flag = (fr == fg) && (fr == fb);
  1401. edata.any_black_pixels = (fr | fg | fb) < 4;
  1402. for (CGU_UINT32 i = 1; i < 16; i++)
  1403. {
  1404. CMP_CONSTANT CGU_INT r = src_image[i].r, g = src_image[i].g, b = src_image[i].b;
  1405. edata.grayscale_flag &= ((r == g) && (r == b));
  1406. edata.any_black_pixels |= ((r | g | b) < 4);
  1407. edata.max.r = CMP_MAX(edata.max.r, r);
  1408. edata.max.g = CMP_MAX(edata.max.g, g);
  1409. edata.max.b = CMP_MAX(edata.max.b, b);
  1410. edata.min.r = CMP_MIN(edata.min.r, r);
  1411. edata.min.g = CMP_MIN(edata.min.g, g);
  1412. edata.min.b = CMP_MIN(edata.min.b, b);
  1413. edata.total.r += r;
  1414. edata.total.g += g;
  1415. edata.total.b += b;
  1416. }
  1417. edata.avg.r = (edata.total.r + 8) >> 4;
  1418. edata.avg.g = (edata.total.g + 8) >> 4;
  1419. edata.avg.b = (edata.total.b + 8) >> 4;
  1420. }
  1421. #ifndef ASPM_GPU
  1422. /*------------------------------------------------------------------------------------------------
  1423. 1 DIM ramp
  1424. ------------------------------------------------------------------------------------------------*/
  1425. CMP_STATIC inline void cpu_BldClrRmp(CGU_FLOAT _Rmp[MAX_POINTS], CGU_FLOAT _InpRmp[NUM_ENDPOINTS], CGU_UINT32 dwNumPoints)
  1426. {
  1427. CGU_UINT32 dwRndAmount[9] = {0, 0, 0, 0, 1, 1, 2, 2, 3};
  1428. // linear interpolate end points to get the ramp
  1429. _Rmp[0] = _InpRmp[0];
  1430. _Rmp[dwNumPoints - 1] = _InpRmp[1];
  1431. if(dwNumPoints % 2)
  1432. _Rmp[dwNumPoints] = 1000000.f; // for 3 point ramp; not to select the 4th point as min
  1433. for(CGU_UINT32 e = 1; e < dwNumPoints - 1; e++)
  1434. _Rmp[e] = floor((_Rmp[0] * (dwNumPoints - 1 - e) + _Rmp[dwNumPoints - 1] * e + dwRndAmount[dwNumPoints])/ (CGU_FLOAT)(dwNumPoints - 1));
  1435. }
  1436. /*------------------------------------------------------------------------------------------------
  1437. // build 3D ramp
  1438. ------------------------------------------------------------------------------------------------*/
  1439. CMP_STATIC inline void cpu_BldRmp(CGU_FLOAT _Rmp[NUM_CHANNELS][MAX_POINTS], CGU_FLOAT _InpRmp[NUM_CHANNELS][NUM_ENDPOINTS],CGU_UINT32 dwNumPoints) {
  1440. for(CGU_UINT32 j = 0; j < 3; j++)
  1441. cpu_BldClrRmp(_Rmp[j], _InpRmp[j], dwNumPoints);
  1442. }
  1443. /*------------------------------------------------------------------------------------------------
  1444. // this is how the end points is going to be look like when decompressed
  1445. ------------------------------------------------------------------------------------------------*/
  1446. CMP_STATIC inline void cpu_MkWkRmpPts(CMP_INOUT CGU_UINT8 CMP_REFINOUT _bEq,
  1447. CGU_FLOAT _OutRmpPts[NUM_CHANNELS][NUM_ENDPOINTS],
  1448. CGU_FLOAT _InpRmpPts[NUM_CHANNELS][NUM_ENDPOINTS],
  1449. CGU_UINT8 nRedBits,
  1450. CGU_UINT8 nGreenBits,
  1451. CGU_UINT8 nBlueBits)
  1452. {
  1453. CGU_FLOAT Fctrs[3];
  1454. Fctrs[RC] = (CGU_FLOAT)(1 << nRedBits);
  1455. Fctrs[GC] = (CGU_FLOAT)(1 << nGreenBits);
  1456. Fctrs[BC] = (CGU_FLOAT)(1 << nBlueBits);
  1457. CGU_BOOL bEq = true;
  1458. // find whether input ramp is flat
  1459. for(CGU_UINT32 j = 0; j < 3; j++)
  1460. bEq &= (_InpRmpPts[j][0] == _InpRmpPts[j][1]);
  1461. _bEq = bEq?1:0;
  1462. // end points on the integer grid
  1463. for(CGU_UINT32 j = 0; j <3; j++) {
  1464. for(CGU_UINT32 k = 0; k <2; k++) {
  1465. // Apply the lower bit replication to give full dynamic range
  1466. _OutRmpPts[j][k] = _InpRmpPts[j][k] + floor(_InpRmpPts[j][k] / Fctrs[j]);
  1467. _OutRmpPts[j][k] = cmp_max(_OutRmpPts[j][k], 0.f);
  1468. _OutRmpPts[j][k] = cmp_min(_OutRmpPts[j][k], 255.f);
  1469. }
  1470. }
  1471. }
  1472. // Compute error and find DXTC indexes for the current cluster
  1473. CMP_STATIC CGU_FLOAT cpu_ClstrIntnl(CGU_FLOAT _Blk[BLOCK_SIZE_4X4][NUM_CHANNELS],
  1474. CGU_UINT8 pcIndices[BLOCK_SIZE_4X4],
  1475. CGU_FLOAT _Rmp[NUM_CHANNELS][MAX_POINTS],
  1476. int dwBlockSize,
  1477. CGU_UINT8 dwNumPoints,
  1478. bool _ConstRamp,
  1479. CGU_FLOAT _pfWeights[3],
  1480. bool _bUseAlpha)
  1481. {
  1482. CGU_FLOAT Err = 0.f;
  1483. CGU_UINT8 rmp_l = (_ConstRamp) ? 1 : dwNumPoints;
  1484. // For each colour in the original block assign it
  1485. // to the closest cluster and compute the cumulative error
  1486. for(int i=0; i< dwBlockSize; i++) {
  1487. if(_bUseAlpha && *((CGU_UINT32*) &_Blk[i][AC]) == 0)
  1488. pcIndices[i] = dwNumPoints;
  1489. else {
  1490. CGU_FLOAT shortest = 99999999999.f;
  1491. CGU_UINT8 shortestIndex = 0;
  1492. CGU_UINT8 r;
  1493. if ((_pfWeights[0] != 1.0f)||(_pfWeights[1] != 1.0f)||(_pfWeights[2] != 1.0f))
  1494. for(r=0; r < rmp_l; r++) {
  1495. // calculate the distance for each component
  1496. CGU_FLOAT distance = (_Blk[i][RC] - _Rmp[RC][r]) * (_Blk[i][RC] - _Rmp[RC][r]) * _pfWeights[0] +
  1497. (_Blk[i][GC] - _Rmp[GC][r]) * (_Blk[i][GC] - _Rmp[GC][r]) * _pfWeights[1] +
  1498. (_Blk[i][BC] - _Rmp[BC][r]) * (_Blk[i][BC] - _Rmp[BC][r]) * _pfWeights[2];
  1499. if(distance < shortest) {
  1500. shortest = distance;
  1501. shortestIndex = r;
  1502. }
  1503. } else
  1504. for(r=0; r < rmp_l; r++) {
  1505. // calculate the distance for each component
  1506. CGU_FLOAT distance = (_Blk[i][RC] - _Rmp[RC][r]) * (_Blk[i][RC] - _Rmp[RC][r]) +
  1507. (_Blk[i][GC] - _Rmp[GC][r]) * (_Blk[i][GC] - _Rmp[GC][r]) +
  1508. (_Blk[i][BC] - _Rmp[BC][r]) * (_Blk[i][BC] - _Rmp[BC][r]);
  1509. if(distance < shortest) {
  1510. shortest = distance;
  1511. shortestIndex = r;
  1512. }
  1513. }
  1514. Err += shortest;
  1515. // We have the index of the best cluster, so assign this in the block
  1516. // Reorder indices to match correct DXTC ordering
  1517. if(shortestIndex == dwNumPoints - 1)
  1518. shortestIndex = 1;
  1519. else if(shortestIndex)
  1520. shortestIndex++;
  1521. pcIndices[i] = shortestIndex;
  1522. }
  1523. }
  1524. return Err;
  1525. }
  1526. /*------------------------------------------------------------------------------------------------
  1527. // input ramp is on the coarse grid
  1528. ------------------------------------------------------------------------------------------------*/
  1529. CMP_STATIC CGU_FLOAT cpu_ClstrBas( CGU_UINT8 pcIndices[BLOCK_SIZE_4X4],
  1530. CGU_FLOAT _Blk[BLOCK_SIZE_4X4][NUM_CHANNELS],
  1531. CGU_FLOAT _InpRmp[NUM_CHANNELS][NUM_ENDPOINTS],
  1532. int dwBlockSize,
  1533. CGU_UINT8 dwNumPoints,
  1534. CGU_FLOAT _pfWeights[3],
  1535. bool _bUseAlpha,
  1536. CGU_UINT8 nRedBits,
  1537. CGU_UINT8 nGreenBits,
  1538. CGU_UINT8 nBlueBits)
  1539. {
  1540. // make ramp endpoints the way they'll going to be decompressed
  1541. CGU_UINT8 Eq = 1;
  1542. CGU_FLOAT InpRmp[NUM_CHANNELS][NUM_ENDPOINTS];
  1543. cpu_MkWkRmpPts(Eq, InpRmp, _InpRmp, nRedBits, nGreenBits, nBlueBits);
  1544. // build ramp as it would be built by decompressor
  1545. CGU_FLOAT Rmp[NUM_CHANNELS][MAX_POINTS];
  1546. cpu_BldRmp(Rmp, InpRmp, dwNumPoints);
  1547. // clusterize and find a cumulative error
  1548. return cpu_ClstrIntnl(_Blk, pcIndices, Rmp, dwBlockSize, dwNumPoints, Eq, _pfWeights, _bUseAlpha);
  1549. }
  1550. CMP_STATIC CGU_UINT8 nByteBitsMask2[9] = {0x00,0x80,0xc0,0xe0,0xf0,0xf8,0xfc,0xfe,0xff};
  1551. CMP_STATIC CGU_UINT32 cpu_ConstructColor2(CGU_UINT8 R, CGU_UINT8 nRedBits, CGU_UINT8 G, CGU_UINT8 nGreenBits, CGU_UINT8 B, CGU_UINT8 nBlueBits) {
  1552. return ( ((R & nByteBitsMask2[nRedBits]) << (nGreenBits + nBlueBits - (PIX_GRID - nRedBits))) |
  1553. ((G & nByteBitsMask2[nGreenBits])<< (nBlueBits - (PIX_GRID - nGreenBits))) |
  1554. ((B & nByteBitsMask2[nBlueBits]) >> ((PIX_GRID - nBlueBits))));
  1555. }
  1556. CMP_STATIC CGU_FLOAT cpu_Clstr( CGU_UINT32 block_32[BLOCK_SIZE_4X4],
  1557. CGU_UINT32 dwBlockSize,
  1558. CGU_UINT8 nEndpoints[3][NUM_ENDPOINTS],
  1559. CGU_UINT8 pcIndices[BLOCK_SIZE_4X4],
  1560. CGU_UINT8 dwNumPoints,
  1561. CGU_FLOAT _pfWeights[3],
  1562. bool _bUseAlpha,
  1563. CGU_UINT8 _nAlphaThreshold,
  1564. CGU_UINT8 nRedBits,
  1565. CGU_UINT8 nGreenBits,
  1566. CGU_UINT8 nBlueBits)
  1567. {
  1568. CGU_UINT32 c0 = cpu_ConstructColor2(nEndpoints[RC][0], nRedBits, nEndpoints[GC][0], nGreenBits, nEndpoints[BC][0], nBlueBits);
  1569. CGU_UINT32 c1 = cpu_ConstructColor2(nEndpoints[RC][1], nRedBits, nEndpoints[GC][1], nGreenBits, nEndpoints[BC][1], nBlueBits);
  1570. CGU_UINT32 nEndpointIndex0 = 0;
  1571. CGU_UINT32 nEndpointIndex1 = 1;
  1572. if((!(dwNumPoints & 0x1) && c0 <= c1) || ((dwNumPoints & 0x1) && c0 > c1)) {
  1573. nEndpointIndex0 = 1;
  1574. nEndpointIndex1 = 0;
  1575. }
  1576. CGU_FLOAT InpRmp[NUM_CHANNELS][NUM_ENDPOINTS];
  1577. InpRmp[RC][0] = (CGU_FLOAT)nEndpoints[RC][nEndpointIndex0];
  1578. InpRmp[RC][1] = (CGU_FLOAT)nEndpoints[RC][nEndpointIndex1];
  1579. InpRmp[GC][0] = (CGU_FLOAT)nEndpoints[GC][nEndpointIndex0];
  1580. InpRmp[GC][1] = (CGU_FLOAT)nEndpoints[GC][nEndpointIndex1];
  1581. InpRmp[BC][0] = (CGU_FLOAT)nEndpoints[BC][nEndpointIndex0];
  1582. InpRmp[BC][1] = (CGU_FLOAT)nEndpoints[BC][nEndpointIndex1];
  1583. CGU_UINT32 dwAlphaThreshold = _nAlphaThreshold << 24;
  1584. CGU_FLOAT Blk[BLOCK_SIZE_4X4][NUM_CHANNELS];
  1585. for(CGU_UINT32 i = 0; i < dwBlockSize; i++) {
  1586. Blk[i][RC] = (CGU_FLOAT)((block_32[i] & 0xff0000) >> 16);
  1587. Blk[i][GC] = (CGU_FLOAT)((block_32[i] & 0xff00) >> 8);
  1588. Blk[i][BC] = (CGU_FLOAT)(block_32[i] & 0xff);
  1589. if(_bUseAlpha)
  1590. Blk[i][AC] = ((block_32[i] & 0xff000000) >= dwAlphaThreshold) ? 1.f : 0.f;
  1591. }
  1592. return cpu_ClstrBas(pcIndices, Blk, InpRmp, dwBlockSize, dwNumPoints, _pfWeights, _bUseAlpha, nRedBits, nGreenBits, nBlueBits);
  1593. }
  1594. /*------------------------------------------------------------------------------------------------
  1595. Compute cumulative error for the current cluster
  1596. ------------------------------------------------------------------------------------------------*/
  1597. CMP_STATIC CGU_FLOAT cpu_ClstrErr(CGU_FLOAT _Blk[BLOCK_SIZE_4X4][NUM_CHANNELS],
  1598. CGU_FLOAT _Rpt[BLOCK_SIZE_4X4],
  1599. CGU_FLOAT _Rmp[NUM_CHANNELS][MAX_POINTS],
  1600. CGU_UINT32 _NmbClrs,
  1601. CGU_UINT32 _blcktp,
  1602. bool _ConstRamp,
  1603. CGU_Vec3f channelWeights)
  1604. {
  1605. CGU_FLOAT fError = 0.f;
  1606. CGU_UINT32 rmp_l = (_ConstRamp) ? 1 : _blcktp;
  1607. CGU_BOOL useWeights = ((channelWeights[0] != 1.0f) || (channelWeights[1] != 1.0f) || (channelWeights[2] != 1.0f));
  1608. // For each colour in the original block, find the closest cluster
  1609. // and compute the comulative error
  1610. for(CGU_UINT32 i=0; i<_NmbClrs; i++) {
  1611. CGU_FLOAT fShortest = 99999999999.f;
  1612. if(useWeights)
  1613. for(CGU_UINT32 r=0; r < rmp_l; r++) {
  1614. // calculate the distance for each component
  1615. CGU_FLOAT fDistance = (_Blk[i][RC] - _Rmp[RC][r]) * (_Blk[i][RC] - _Rmp[RC][r]) * channelWeights[0] +
  1616. (_Blk[i][GC] - _Rmp[GC][r]) * (_Blk[i][GC] - _Rmp[GC][r]) * channelWeights[1] +
  1617. (_Blk[i][BC] - _Rmp[BC][r]) * (_Blk[i][BC] - _Rmp[BC][r]) * channelWeights[2];
  1618. if(fDistance < fShortest)
  1619. fShortest = fDistance;
  1620. } else
  1621. for(CGU_UINT32 r=0; r < rmp_l; r++) {
  1622. // calculate the distance for each component
  1623. CGU_FLOAT fDistance = (_Blk[i][RC] - _Rmp[RC][r]) * (_Blk[i][RC] - _Rmp[RC][r]) +
  1624. (_Blk[i][GC] - _Rmp[GC][r]) * (_Blk[i][GC] - _Rmp[GC][r]) +
  1625. (_Blk[i][BC] - _Rmp[BC][r]) * (_Blk[i][BC] - _Rmp[BC][r]);
  1626. if(fDistance < fShortest)
  1627. fShortest = fDistance;
  1628. }
  1629. // accumulate the error
  1630. fError += fShortest * _Rpt[i];
  1631. }
  1632. return fError;
  1633. }
  1634. #if defined(USE_REFINE3D)
  1635. CMP_STATIC CGU_FLOAT cmp_Refine3D( CGU_FLOAT _OutRmpPnts[NUM_CHANNELS][NUM_ENDPOINTS],
  1636. CGU_FLOAT _InpRmpPnts[NUM_CHANNELS][NUM_ENDPOINTS],
  1637. CGU_FLOAT _Blk[BLOCK_SIZE_4X4][NUM_CHANNELS],
  1638. CGU_FLOAT _Rpt[BLOCK_SIZE_4X4],
  1639. CGU_UINT32 _NmrClrs,
  1640. CGU_UINT32 dwNumPoints,
  1641. CGU_Vec3f channelWeights,
  1642. CGU_UINT8 nRedBits,
  1643. CGU_UINT8 nGreenBits,
  1644. CGU_UINT8 nBlueBits,
  1645. CGU_UINT32 nRefineSteps)
  1646. {
  1647. ALIGN_16 CGU_FLOAT Rmp[NUM_CHANNELS][MAX_POINTS];
  1648. CGU_FLOAT Blk[BLOCK_SIZE_4X4][NUM_CHANNELS];
  1649. for(CGU_UINT32 i = 0; i < _NmrClrs; i++)
  1650. for(CGU_UINT32 j = 0; j < 3; j++)
  1651. Blk[i][j] = _Blk[i][j];
  1652. CGU_FLOAT fWeightRed = channelWeights.r;
  1653. CGU_FLOAT fWeightGreen = channelWeights.g;
  1654. CGU_FLOAT fWeightBlue = channelWeights.b;
  1655. // here is our grid
  1656. CGU_FLOAT Fctrs[3];
  1657. Fctrs[RC] = (CGU_FLOAT)(1 << (PIX_GRID-nRedBits));
  1658. Fctrs[GC] = (CGU_FLOAT)(1 << (PIX_GRID-nGreenBits));
  1659. Fctrs[BC] = (CGU_FLOAT)(1 << (PIX_GRID-nBlueBits));
  1660. CGU_FLOAT InpRmp0[NUM_CHANNELS][NUM_ENDPOINTS];
  1661. CGU_FLOAT InpRmp[NUM_CHANNELS][NUM_ENDPOINTS];
  1662. for(CGU_UINT32 k = 0; k < 2; k++)
  1663. for(CGU_UINT32 j = 0; j < 3; j++)
  1664. InpRmp0[j][k] = InpRmp[j][k] = _OutRmpPnts[j][k] = _InpRmpPnts[j][k];
  1665. // make ramp endpoints the way they'll going to be decompressed
  1666. // plus check whether the ramp is flat
  1667. CGU_UINT8 Eq;
  1668. CGU_FLOAT WkRmpPts[NUM_CHANNELS][NUM_ENDPOINTS];
  1669. cpu_MkWkRmpPts(Eq, WkRmpPts, InpRmp, nRedBits, nGreenBits, nBlueBits);
  1670. // build ramp for all 3 colors
  1671. cpu_BldRmp(Rmp, WkRmpPts, dwNumPoints);
  1672. // clusterize for the current ramp
  1673. CGU_FLOAT bestE = cpu_ClstrErr(Blk, _Rpt, Rmp, _NmrClrs, dwNumPoints, Eq, channelWeights);
  1674. if(bestE == 0.f) // if exact, we've done
  1675. return bestE;
  1676. // Jitter endpoints in each direction
  1677. CGU_INT nRefineStart = 0 - (cmp_min(nRefineSteps, (CGU_UINT8)8));
  1678. CGU_INT nRefineEnd = cmp_min(nRefineSteps, (CGU_UINT8)8);
  1679. for(CGU_INT nJitterG0 = nRefineStart; nJitterG0 <= nRefineEnd; nJitterG0++) {
  1680. InpRmp[GC][0] = cmp_min(cmp_max(InpRmp0[GC][0] + nJitterG0 * Fctrs[GC], 0.f), 255.f);
  1681. for(CGU_INT nJitterG1 = nRefineStart; nJitterG1 <= nRefineEnd; nJitterG1++) {
  1682. InpRmp[GC][1] = cmp_min(cmp_max(InpRmp0[GC][1] + nJitterG1 * Fctrs[GC], 0.f), 255.f);
  1683. cpu_MkWkRmpPts(Eq, WkRmpPts, InpRmp, nRedBits, nGreenBits, nBlueBits);
  1684. cpu_BldClrRmp(Rmp[GC], WkRmpPts[GC], dwNumPoints);
  1685. CGU_FLOAT RmpErrG[MAX_POINTS][BLOCK_SIZE_4X4];
  1686. for(CGU_UINT32 i=0; i < _NmrClrs; i++) {
  1687. for(CGU_UINT32 r = 0; r < dwNumPoints; r++) {
  1688. CGU_FLOAT DistG = (Rmp[GC][r] - Blk[i][GC]);
  1689. RmpErrG[r][i] = DistG * DistG * fWeightGreen;
  1690. }
  1691. }
  1692. for(CGU_INT nJitterB0 = nRefineStart; nJitterB0 <= nRefineEnd; nJitterB0++) {
  1693. InpRmp[BC][0] = cmp_min(cmp_max(InpRmp0[BC][0] + nJitterB0 * Fctrs[BC], 0.f), 255.f);
  1694. for(CGU_INT nJitterB1 = nRefineStart; nJitterB1 <= nRefineEnd; nJitterB1++) {
  1695. InpRmp[BC][1] = cmp_min(cmp_max(InpRmp0[BC][1] + nJitterB1 * Fctrs[BC], 0.f), 255.f);
  1696. cpu_MkWkRmpPts(Eq, WkRmpPts, InpRmp, nRedBits, nGreenBits, nBlueBits);
  1697. cpu_BldClrRmp(Rmp[BC], WkRmpPts[BC], dwNumPoints);
  1698. CGU_FLOAT RmpErr[MAX_POINTS][BLOCK_SIZE_4X4];
  1699. for(CGU_UINT32 i=0; i < _NmrClrs; i++) {
  1700. for(CGU_UINT32 r = 0; r < dwNumPoints; r++) {
  1701. CGU_FLOAT DistB = (Rmp[BC][r] - Blk[i][BC]);
  1702. RmpErr[r][i] = RmpErrG[r][i] + DistB * DistB * fWeightBlue;
  1703. }
  1704. }
  1705. for(CGU_INT nJitterR0 = nRefineStart; nJitterR0 <= nRefineEnd; nJitterR0++) {
  1706. InpRmp[RC][0] = cmp_min(cmp_max(InpRmp0[RC][0] + nJitterR0 * Fctrs[RC], 0.f), 255.f);
  1707. for(CGU_INT nJitterR1 = nRefineStart; nJitterR1 <= nRefineEnd; nJitterR1++) {
  1708. InpRmp[RC][1] = cmp_min(cmp_max(InpRmp0[RC][1] + nJitterR1 * Fctrs[RC], 0.f), 255.f);
  1709. cpu_MkWkRmpPts(Eq, WkRmpPts, InpRmp, nRedBits, nGreenBits, nBlueBits);
  1710. cpu_BldClrRmp(Rmp[RC], WkRmpPts[RC], dwNumPoints);
  1711. // compute cumulative error
  1712. CGU_FLOAT mse = 0.f;
  1713. CGU_INT rmp_l = (Eq > 0) ? 1 : dwNumPoints;
  1714. for(CGU_UINT32 k = 0; k < _NmrClrs; k++) {
  1715. CGU_FLOAT MinErr = 10000000.f;
  1716. for(CGU_INT r = 0; r < rmp_l; r++) {
  1717. CGU_FLOAT Dist = (Rmp[RC][r] - Blk[k][RC]);
  1718. CGU_FLOAT Err = RmpErr[r][k] + Dist * Dist * fWeightRed;
  1719. MinErr = cmp_min(MinErr, Err);
  1720. }
  1721. mse += MinErr * _Rpt[k];
  1722. }
  1723. // save if we achieve better result
  1724. if(mse < bestE) {
  1725. bestE = mse;
  1726. for(CGU_UINT32 k = 0; k < 2; k++)
  1727. for(CGU_UINT32 j = 0; j < 3; j++)
  1728. _OutRmpPnts[j][k] = InpRmp[j][k];
  1729. }
  1730. }
  1731. }
  1732. }
  1733. }
  1734. }
  1735. }
  1736. return bestE;
  1737. }
  1738. #endif
  1739. #if defined(USE_REFINE)
  1740. CMP_STATIC CGU_FLOAT cmp_Refine(CGU_FLOAT _OutRmpPnts[NUM_CHANNELS][NUM_ENDPOINTS],
  1741. CGU_FLOAT _InpRmpPnts[NUM_CHANNELS][NUM_ENDPOINTS],
  1742. CGU_FLOAT _Blk[BLOCK_SIZE_4X4][NUM_CHANNELS],
  1743. CGU_FLOAT _Rpt[BLOCK_SIZE_4X4],
  1744. CGU_INT _NmrClrs,
  1745. CGU_UINT8 dwNumPoints,
  1746. CGU_Vec3f channelWeights,
  1747. CGU_UINT32 nRedBits,
  1748. CGU_UINT32 nGreenBits,
  1749. CGU_UINT32 nBlueBits,
  1750. CGU_UINT32 nRefineSteps )
  1751. {
  1752. ALIGN_16 CGU_FLOAT Rmp[NUM_CHANNELS][MAX_POINTS];
  1753. if (nRefineSteps == 0) nRefineSteps = 1;
  1754. CGU_FLOAT Blk[BLOCK_SIZE_4X4][NUM_CHANNELS];
  1755. for(CGU_INT i = 0; i < _NmrClrs; i++)
  1756. for(CGU_INT j = 0; j < 3; j++)
  1757. Blk[i][j] = _Blk[i][j];
  1758. CGU_FLOAT fWeightRed = channelWeights.r;
  1759. CGU_FLOAT fWeightGreen = channelWeights.g;
  1760. CGU_FLOAT fWeightBlue = channelWeights.b;
  1761. // here is our grid
  1762. CGU_FLOAT Fctrs[3];
  1763. Fctrs[RC] = (CGU_FLOAT)(1 << (PIX_GRID-nRedBits));
  1764. Fctrs[GC] = (CGU_FLOAT)(1 << (PIX_GRID-nGreenBits));
  1765. Fctrs[BC] = (CGU_FLOAT)(1 << (PIX_GRID-nBlueBits));
  1766. CGU_FLOAT InpRmp0[NUM_CHANNELS][NUM_ENDPOINTS];
  1767. CGU_FLOAT InpRmp[NUM_CHANNELS][NUM_ENDPOINTS];
  1768. for(CGU_INT k = 0; k < 2; k++)
  1769. for(CGU_INT j = 0; j < 3; j++)
  1770. InpRmp0[j][k] = InpRmp[j][k] = _OutRmpPnts[j][k] = _InpRmpPnts[j][k];
  1771. // make ramp endpoints the way they'll going to be decompressed
  1772. // plus check whether the ramp is flat
  1773. CGU_UINT8 Eq;
  1774. CGU_FLOAT WkRmpPts[NUM_CHANNELS][NUM_ENDPOINTS];
  1775. cpu_MkWkRmpPts(Eq, WkRmpPts, InpRmp, nRedBits, nGreenBits, nBlueBits);
  1776. // build ramp for all 3 colors
  1777. cpu_BldRmp(Rmp, WkRmpPts, dwNumPoints);
  1778. // clusterize for the current ramp
  1779. CGU_FLOAT bestE = cpu_ClstrErr(Blk, _Rpt, Rmp, _NmrClrs, dwNumPoints, Eq, channelWeights);
  1780. if(bestE == 0.f) // || !nRefineSteps) // if exact, we've done
  1781. return bestE;
  1782. // Tweak each component in isolation and get the best values
  1783. // precompute ramp errors for Green and Blue
  1784. CGU_FLOAT RmpErr[MAX_POINTS][BLOCK_SIZE_4X4];
  1785. for(CGU_INT i=0; i < _NmrClrs; i++) {
  1786. for(CGU_INT r = 0; r < dwNumPoints; r++) {
  1787. CGU_FLOAT DistG = (Rmp[GC][r] - Blk[i][GC]);
  1788. CGU_FLOAT DistB = (Rmp[BC][r] - Blk[i][BC]);
  1789. RmpErr[r][i] = DistG * DistG * fWeightGreen + DistB * DistB * fWeightBlue;
  1790. }
  1791. }
  1792. // First Red
  1793. CGU_FLOAT bstC0 = InpRmp0[RC][0];
  1794. CGU_FLOAT bstC1 = InpRmp0[RC][1];
  1795. CGU_INT nRefineStart = 0 - (cmp_min(nRefineSteps, (CGU_UINT8)8));
  1796. CGU_INT nRefineEnd = cmp_min(nRefineSteps, (CGU_UINT8)8);
  1797. for(CGU_INT i = nRefineStart; i <= nRefineEnd; i++) {
  1798. for(CGU_INT j = nRefineStart; j <= nRefineEnd; j++) {
  1799. // make a move; both sides of interval.
  1800. InpRmp[RC][0] = cmp_min(cmp_max(InpRmp0[RC][0] + i * Fctrs[RC], 0.f), 255.f);
  1801. InpRmp[RC][1] = cmp_min(cmp_max(InpRmp0[RC][1] + j * Fctrs[RC], 0.f), 255.f);
  1802. // make ramp endpoints the way they'll going to be decompressed
  1803. // plus check whether the ramp is flat
  1804. cpu_MkWkRmpPts(Eq, WkRmpPts, InpRmp, nRedBits, nGreenBits, nBlueBits);
  1805. // build ramp only for red
  1806. cpu_BldClrRmp(Rmp[RC], WkRmpPts[RC], dwNumPoints);
  1807. // compute cumulative error
  1808. CGU_FLOAT mse = 0.f;
  1809. CGU_INT rmp_l = (Eq > 0) ? 1 : dwNumPoints;
  1810. for(CGU_INT k = 0; k < _NmrClrs; k++) {
  1811. CGU_FLOAT MinErr = 10000000.f;
  1812. for(CGU_INT r = 0; r < rmp_l; r++) {
  1813. CGU_FLOAT Dist = (Rmp[RC][r] - Blk[k][RC]);
  1814. CGU_FLOAT Err = RmpErr[r][k] + Dist * Dist * fWeightRed;
  1815. MinErr = cmp_minf(MinErr, Err);
  1816. }
  1817. mse += MinErr * _Rpt[k];
  1818. }
  1819. // save if we achieve better result
  1820. if(mse < bestE) {
  1821. bstC0 = InpRmp[RC][0];
  1822. bstC1 = InpRmp[RC][1];
  1823. bestE = mse;
  1824. }
  1825. }
  1826. }
  1827. // our best REDs
  1828. InpRmp[RC][0] = bstC0;
  1829. InpRmp[RC][1] = bstC1;
  1830. // make ramp endpoints the way they'll going to be decompressed
  1831. // plus check whether the ramp is flat
  1832. cpu_MkWkRmpPts(Eq, WkRmpPts, InpRmp, nRedBits, nGreenBits, nBlueBits);
  1833. // build ramp only for green
  1834. cpu_BldRmp(Rmp, WkRmpPts, dwNumPoints);
  1835. // precompute ramp errors for Red and Blue
  1836. for(CGU_INT i=0; i < _NmrClrs; i++) {
  1837. for(CGU_INT r = 0; r < dwNumPoints; r++) {
  1838. CGU_FLOAT DistR = (Rmp[RC][r] - Blk[i][RC]);
  1839. CGU_FLOAT DistB = (Rmp[BC][r] - Blk[i][BC]);
  1840. RmpErr[r][i] = DistR * DistR * fWeightRed + DistB * DistB * fWeightBlue;
  1841. }
  1842. }
  1843. // Now green
  1844. bstC0 = InpRmp0[GC][0];
  1845. bstC1 = InpRmp0[GC][1];
  1846. for(CGU_INT i = nRefineStart; i <= nRefineEnd; i++) {
  1847. for(CGU_INT j = nRefineStart; j <= nRefineEnd; j++) {
  1848. InpRmp[GC][0] = cmp_minf(cmp_maxf(InpRmp0[GC][0] + i * Fctrs[GC], 0.f), 255.f);
  1849. InpRmp[GC][1] = cmp_minf(cmp_maxf(InpRmp0[GC][1] + j * Fctrs[GC], 0.f), 255.f);
  1850. cpu_MkWkRmpPts(Eq, WkRmpPts, InpRmp, nRedBits, nGreenBits, nBlueBits);
  1851. cpu_BldClrRmp(Rmp[GC], WkRmpPts[GC], dwNumPoints);
  1852. CGU_FLOAT mse = 0.f;
  1853. CGU_INT rmp_l = (Eq > 0) ? 1 : dwNumPoints;
  1854. for(CGU_INT k = 0; k < _NmrClrs; k++) {
  1855. CGU_FLOAT MinErr = 10000000.f;
  1856. for(CGU_INT r = 0; r < rmp_l; r++) {
  1857. CGU_FLOAT Dist = (Rmp[GC][r] - Blk[k][GC]);
  1858. CGU_FLOAT Err = RmpErr[r][k] + Dist * Dist * fWeightGreen;
  1859. MinErr = cmp_minf(MinErr, Err);
  1860. }
  1861. mse += MinErr * _Rpt[k];
  1862. }
  1863. if(mse < bestE) {
  1864. bstC0 = InpRmp[GC][0];
  1865. bstC1 = InpRmp[GC][1];
  1866. bestE = mse;
  1867. }
  1868. }
  1869. }
  1870. // our best GREENs
  1871. InpRmp[GC][0] = bstC0;
  1872. InpRmp[GC][1] = bstC1;
  1873. cpu_MkWkRmpPts(Eq, WkRmpPts, InpRmp, nRedBits, nGreenBits, nBlueBits);
  1874. cpu_BldRmp(Rmp, WkRmpPts, dwNumPoints);
  1875. // ramp err for Red and Green
  1876. for(CGU_INT i=0; i < _NmrClrs; i++) {
  1877. for(CGU_INT r = 0; r < dwNumPoints; r++) {
  1878. CGU_FLOAT DistR = (Rmp[RC][r] - Blk[i][RC]);
  1879. CGU_FLOAT DistG = (Rmp[GC][r] - Blk[i][GC]);
  1880. RmpErr[r][i] = DistR * DistR * fWeightRed + DistG * DistG * fWeightGreen;
  1881. }
  1882. }
  1883. bstC0 = InpRmp0[BC][0];
  1884. bstC1 = InpRmp0[BC][1];
  1885. // Now blue
  1886. for(CGU_INT i = nRefineStart; i <= nRefineEnd; i++) {
  1887. for(CGU_INT j = nRefineStart; j <= nRefineEnd; j++) {
  1888. InpRmp[BC][0] = min(max(InpRmp0[BC][0] + i * Fctrs[BC], 0.f), 255.f);
  1889. InpRmp[BC][1] = min(max(InpRmp0[BC][1] + j * Fctrs[BC], 0.f), 255.f);
  1890. cpu_MkWkRmpPts(Eq, WkRmpPts, InpRmp, nRedBits, nGreenBits, nBlueBits);
  1891. cpu_BldClrRmp(Rmp[BC], WkRmpPts[BC], dwNumPoints);
  1892. CGU_FLOAT mse = 0.f;
  1893. CGU_INT rmp_l = (Eq > 0) ? 1 : dwNumPoints;
  1894. for(CGU_INT k = 0; k < _NmrClrs; k++) {
  1895. CGU_FLOAT MinErr = 10000000.f;
  1896. for(CGU_INT r = 0; r < rmp_l; r++) {
  1897. CGU_FLOAT Dist = (Rmp[BC][r] - Blk[k][BC]);
  1898. CGU_FLOAT Err = RmpErr[r][k] + Dist * Dist * fWeightBlue;
  1899. MinErr = min(MinErr, Err);
  1900. }
  1901. mse += MinErr * _Rpt[k];
  1902. }
  1903. if(mse < bestE) {
  1904. bstC0 = InpRmp[BC][0];
  1905. bstC1 = InpRmp[BC][1];
  1906. bestE = mse;
  1907. }
  1908. }
  1909. }
  1910. // our best BLUEs
  1911. InpRmp[BC][0] = bstC0;
  1912. InpRmp[BC][1] = bstC1;
  1913. // return our best choice
  1914. for(CGU_INT j = 0; j < 3; j++)
  1915. for(CGU_INT k = 0; k < 2; k++)
  1916. _OutRmpPnts[j][k] = InpRmp[j][k];
  1917. return bestE;
  1918. }
  1919. #endif
  1920. //======================================================================================
  1921. // Codec from CompressonatorLib
  1922. //======================================================================================
  // Number of texels in one 4x4 block; sizes the per-texel arrays used by the codec below.
  #define BLOCK_SIZE_4X4 16
  // Bits kept per end-point channel (red, green, blue). cpu_CompRGBBlock passes these as
  // nRedBits / nGreenBits / nBlueBits and shifts 8-bit end points right by (8 - bits).
  #define RG 5
  #define GG 6
  #define BG 5
  1927. /*------------------------------------------------------------------------------------------------
  1928. // this is how the end points is going to be rounded in compressed format
  1929. ------------------------------------------------------------------------------------------------*/
  1930. CMP_STATIC void cpu_MkRmpOnGrid(CGU_FLOAT _RmpF[NUM_CHANNELS][NUM_ENDPOINTS],
  1931. CGU_FLOAT _MnMx[NUM_CHANNELS][NUM_ENDPOINTS],
  1932. CGU_FLOAT _Min,
  1933. CGU_FLOAT _Max,
  1934. CGU_UINT8 nRedBits,
  1935. CGU_UINT8 nGreenBits,
  1936. CGU_UINT8 nBlueBits)
  1937. {
  1938. CGU_FLOAT Fctrs0[3];
  1939. CGU_FLOAT Fctrs1[3];
  1940. Fctrs1[RC] = (CGU_FLOAT)(1 << nRedBits);
  1941. Fctrs1[GC] = (CGU_FLOAT)(1 << nGreenBits);
  1942. Fctrs1[BC] = (CGU_FLOAT)(1 << nBlueBits);
  1943. Fctrs0[RC] = (CGU_FLOAT)(1 << (PIX_GRID-nRedBits));
  1944. Fctrs0[GC] = (CGU_FLOAT)(1 << (PIX_GRID-nGreenBits));
  1945. Fctrs0[BC] = (CGU_FLOAT)(1 << (PIX_GRID-nBlueBits));
  1946. for(int j = 0; j < 3; j++) {
  1947. for(int k = 0; k < 2; k++) {
  1948. _RmpF[j][k] = floor(_MnMx[j][k]);
  1949. if(_RmpF[j][k] <= _Min)
  1950. _RmpF[j][k] = _Min;
  1951. else {
  1952. _RmpF[j][k] += floor(128.f / Fctrs1[j]) - floor(_RmpF[j][k] / Fctrs1[j]);
  1953. _RmpF[j][k] = cmp_minf(_RmpF[j][k], _Max);
  1954. }
  1955. _RmpF[j][k] = floor(_RmpF[j][k] / Fctrs0[j]) * Fctrs0[j];
  1956. }
  1957. }
  1958. }
  1959. // Find the first approximation of the line
  1960. // Assume there is a linear relation
  1961. // Z = a * X_In
  1962. // Z = b * Y_In
  1963. // Find a,b to minimize MSE between Z and Z_In
  1964. CMP_STATIC void cpu_FindAxis(CMP_OUT CGU_FLOAT BlkSh[BLOCK_SIZE_4X4][NUM_CHANNELS],
  1965. CMP_IN CGU_FLOAT LineDir0[NUM_CHANNELS],
  1966. CMP_IN CGU_FLOAT fBlockCenter[NUM_CHANNELS],
  1967. CMP_OUT CGU_UINT8 CMP_REFINOUT AxisIsSmall,
  1968. CMP_IN CGU_FLOAT BlkUV[BLOCK_SIZE_4X4][NUM_CHANNELS],
  1969. CMP_IN CGU_FLOAT _inpRpt[BLOCK_SIZE_4X4],
  1970. CMP_IN int nDimensions,
  1971. CMP_IN int dwUniqueColors)
  1972. {
  1973. CGU_FLOAT Crrl[NUM_CHANNELS];
  1974. CGU_FLOAT RGB2[NUM_CHANNELS];
  1975. CGU_INT i;
  1976. LineDir0[0] = LineDir0[1] = LineDir0[2] = RGB2[0] = RGB2[1] = RGB2[2] =
  1977. Crrl[0] = Crrl[1] = Crrl[2] = fBlockCenter[0] = fBlockCenter[1] = fBlockCenter[2] = 0.f;
  1978. // sum position of all points
  1979. CGU_FLOAT fNumPoints = 0.f;
  1980. for(i=0; i < dwUniqueColors; i++) {
  1981. fBlockCenter[0] += BlkUV[i][0] * _inpRpt[i];
  1982. fBlockCenter[1] += BlkUV[i][1] * _inpRpt[i];
  1983. fBlockCenter[2] += BlkUV[i][2] * _inpRpt[i];
  1984. fNumPoints += _inpRpt[i];
  1985. }
  1986. // and then average to calculate center coordinate of block
  1987. fBlockCenter[0] /= fNumPoints;
  1988. fBlockCenter[1] /= fNumPoints;
  1989. fBlockCenter[2] /= fNumPoints;
  1990. for(i = 0; i < dwUniqueColors; i++) {
  1991. // calculate output block as offsets around block center
  1992. BlkSh[i][0] = BlkUV[i][0] - fBlockCenter[0];
  1993. BlkSh[i][1] = BlkUV[i][1] - fBlockCenter[1];
  1994. BlkSh[i][2] = BlkUV[i][2] - fBlockCenter[2];
  1995. // compute correlation matrix
  1996. // RGB2 = sum of ((distance from point from center) squared)
  1997. // Crrl = ???????. Seems to be be some calculation based on distance from point center in two dimensions
  1998. for(int j = 0; j < nDimensions; j++) {
  1999. RGB2[j] += BlkSh[i][j] * BlkSh[i][j] * _inpRpt[i];
  2000. Crrl[j] += BlkSh[i][j] * BlkSh[i][(j+1)%3] * _inpRpt[i];
  2001. }
  2002. }
  2003. // if set's diameter is small
  2004. int i0 = 0, i1 = 1;
  2005. CGU_FLOAT mxRGB2 = 0.f;
  2006. int k = 0, j = 0;
  2007. CGU_FLOAT fEPS = fNumPoints * EPS;
  2008. for(k = 0, j = 0; j < 3; j++) {
  2009. if(RGB2[j] >= fEPS)
  2010. k++;
  2011. else
  2012. RGB2[j] = 0.f;
  2013. if(mxRGB2 < RGB2[j]) {
  2014. mxRGB2 = RGB2[j];
  2015. i0 = j;
  2016. }
  2017. }
  2018. CGU_FLOAT fEPS2 = fNumPoints * EPS2;
  2019. AxisIsSmall = 1;
  2020. for(j = 0; j < 3; j++)
  2021. {
  2022. AxisIsSmall &= (RGB2[j] < fEPS2);
  2023. }
  2024. if(AxisIsSmall) // all are very small to avoid division on the small determinant
  2025. return;
  2026. if(k == 1) // really only 1 dimension
  2027. LineDir0[i0]= 1.;
  2028. else if(k == 2) { // really only 2 dimensions
  2029. i1 = (RGB2[(i0+1)%3] > 0.f) ? (i0+1)%3 : (i0+2)%3;
  2030. CGU_FLOAT Crl = (i1 == (i0+1)%3) ? Crrl[i0] : Crrl[(i0+2)%3];
  2031. LineDir0[i1] = Crl/ RGB2[i0];
  2032. LineDir0[i0]= 1.;
  2033. } else {
  2034. CGU_FLOAT maxDet = 100000.f;
  2035. CGU_FLOAT Cs[3];
  2036. // select max det for precision
  2037. for(j = 0; j < nDimensions; j++) {
  2038. CGU_FLOAT Det = RGB2[j] * RGB2[(j+1)%3] - Crrl[j] * Crrl[j];
  2039. Cs[j] = abs(Crrl[j]/sqrt(RGB2[j] * RGB2[(j+1)%3]));
  2040. if(maxDet < Det) {
  2041. maxDet = Det;
  2042. i0 = j;
  2043. }
  2044. }
  2045. // inverse correl matrix
  2046. // -- -- -- --
  2047. // | A B | | C -B |
  2048. // | B C | => | -B A |
  2049. // -- -- -- --
  2050. CGU_FLOAT mtrx1[2][2];
  2051. CGU_FLOAT vc1[2];
  2052. CGU_FLOAT vc[2];
  2053. vc1[0] = Crrl[(i0 + 2) %3];
  2054. vc1[1] = Crrl[(i0 + 1) %3];
  2055. // C
  2056. mtrx1[0][0] = RGB2[(i0+1)%3];
  2057. // A
  2058. mtrx1[1][1] = RGB2[i0];
  2059. // -B
  2060. mtrx1[1][0] = mtrx1[0][1] = -Crrl[i0];
  2061. // find a solution
  2062. vc[0] = mtrx1[0][0] * vc1[0] + mtrx1[0][1] * vc1[1];
  2063. vc[1] = mtrx1[1][0] * vc1[0] + mtrx1[1][1] * vc1[1];
  2064. // normalize
  2065. vc[0] /= maxDet;
  2066. vc[1] /= maxDet;
  2067. // find a line direction vector
  2068. LineDir0[i0] = 1.;
  2069. LineDir0[(i0 + 1) %3] = 1.;
  2070. LineDir0[(i0 + 2) %3] = vc[0] + vc[1];
  2071. }
  2072. // normalize direction vector
  2073. CGU_FLOAT Len = LineDir0[0] * LineDir0[0] + LineDir0[1] * LineDir0[1] + LineDir0[2] * LineDir0[2];
  2074. Len = sqrt(Len);
  2075. for(j = 0; j < 3; j++)
  2076. LineDir0[j] = (Len > 0.f) ? LineDir0[j] / Len : 0.f;
  2077. }
  2078. CMP_STATIC CGU_FLOAT cpu_RampSrchW( CGU_FLOAT Prj[BLOCK_SIZE_4X4],
  2079. CGU_FLOAT PrjErr[BLOCK_SIZE_4X4],
  2080. CGU_FLOAT PreMRep[BLOCK_SIZE_4X4],
  2081. CGU_FLOAT StepErr,
  2082. CGU_FLOAT lowPosStep,
  2083. CGU_FLOAT highPosStep,
  2084. int dwUniqueColors,
  2085. int dwNumPoints )
  2086. {
  2087. CGU_FLOAT error = 0;
  2088. CGU_FLOAT step = (highPosStep - lowPosStep)/(dwNumPoints - 1);
  2089. CGU_FLOAT step_h = step * (CGU_FLOAT)0.5;
  2090. CGU_FLOAT rstep = (CGU_FLOAT)1.0f / step;
  2091. CGU_INT i;
  2092. for(i=0; i < dwUniqueColors; i++) {
  2093. CGU_FLOAT v;
  2094. // Work out which value in the block this select
  2095. CGU_FLOAT del;
  2096. if((del = Prj[i] - lowPosStep) <= 0)
  2097. v = lowPosStep;
  2098. else if(Prj[i] - highPosStep >= 0)
  2099. v = highPosStep;
  2100. else
  2101. v = floor((del + step_h) * rstep) * step + lowPosStep;
  2102. // And accumulate the error
  2103. CGU_FLOAT d = (Prj[i] - v);
  2104. d *= d;
  2105. CGU_FLOAT err = PreMRep[i] * d + PrjErr[i];
  2106. error += err;
  2107. if(StepErr < error) {
  2108. error = StepErr;
  2109. break;
  2110. }
  2111. }
  2112. return error;
  2113. }
  2114. // This is a float point-based compression
  2115. // it assumes that the number of unique colors is already known; input is in [0., 255.] range.
  2116. // This is C version.
  2117. CMP_STATIC bool cpu_CompressRGBBlockX( CMP_OUT CGU_FLOAT _RsltRmpPnts[NUM_CHANNELS][NUM_ENDPOINTS],
  2118. CMP_IN CGU_FLOAT src_image[BLOCK_SIZE_4X4][NUM_CHANNELS],
  2119. CMP_IN CGU_FLOAT Rpt[BLOCK_SIZE_4X4],
  2120. CMP_IN int dwUniqueColors,
  2121. CMP_IN CGU_UINT8 dwNumPoints,
  2122. CMP_IN bool b3DRefinement,
  2123. CMP_IN CGU_UINT8 nRefinementSteps,
  2124. CMP_IN CGU_FLOAT pfWeights[3],
  2125. CMP_IN CGU_UINT8 nRedBits,
  2126. CMP_IN CGU_UINT8 nGreenBits,
  2127. CMP_IN CGU_UINT8 nBlueBits,
  2128. CMP_IN CGU_FLOAT fquality )
  2129. {
  2130. ALIGN_16 CGU_FLOAT Prj0[BLOCK_SIZE_4X4];
  2131. ALIGN_16 CGU_FLOAT Prj[BLOCK_SIZE_4X4];
  2132. ALIGN_16 CGU_FLOAT PrjErr[BLOCK_SIZE_4X4];
  2133. ALIGN_16 CGU_FLOAT LineDir[NUM_CHANNELS];
  2134. ALIGN_16 CGU_FLOAT RmpIndxs[BLOCK_SIZE_4X4];
  2135. CMP_UNUSED(fquality);
  2136. CMP_UNUSED(b3DRefinement)
  2137. CGU_FLOAT LineDirG[NUM_CHANNELS];
  2138. CGU_FLOAT PosG[NUM_ENDPOINTS];
  2139. CGU_FLOAT BlkUV[BLOCK_SIZE_4X4][NUM_CHANNELS];
  2140. CGU_FLOAT BlkSh[BLOCK_SIZE_4X4][NUM_CHANNELS];
  2141. CGU_FLOAT LineDir0[NUM_CHANNELS];
  2142. CGU_FLOAT Mdl[NUM_CHANNELS];
  2143. CGU_FLOAT rsltC[NUM_CHANNELS][NUM_ENDPOINTS];
  2144. int i, j, k;
  2145. // down to [0., 1.]
  2146. for(i = 0; i < dwUniqueColors; i++)
  2147. for(j = 0; j < 3; j++)
  2148. BlkUV[i][j] = src_image[i][j] / 255.f;
  2149. bool isDONE = false;
  2150. // as usual if not more then 2 different colors, we've done
  2151. if(dwUniqueColors <= 2) {
  2152. for(j = 0; j < 3; j++) {
  2153. rsltC[j][0] = src_image[0][j];
  2154. rsltC[j][1] = src_image[dwUniqueColors - 1][j];
  2155. }
  2156. isDONE = true;
  2157. }
  2158. if ( !isDONE ) {
  2159. // This is our first attempt to find an axis we will go along.
  2160. // The cumulation is done to find a line minimizing the MSE from the input 3D points.
  2161. CGU_UINT8 bSmall;
  2162. cpu_FindAxis(BlkSh, LineDir0, Mdl, bSmall, BlkUV, Rpt, 3, dwUniqueColors);
  2163. // While trying to find the axis we found that the diameter of the input set is quite small.
  2164. // Do not bother.
  2165. if(bSmall) {
  2166. for(j = 0; j < 3; j++) {
  2167. rsltC[j][0] = src_image[0][j];
  2168. rsltC[j][1] = src_image[dwUniqueColors - 1][j];
  2169. }
  2170. isDONE = true;
  2171. }
  2172. }
  2173. // GCC is being an awful being when it comes to goto-jumps.
  2174. // So please bear with this.
  2175. if ( !isDONE ) {
  2176. CGU_FLOAT ErrG = 10000000.f;
  2177. CGU_FLOAT PrjBnd[NUM_ENDPOINTS];
  2178. ALIGN_16 CGU_FLOAT PreMRep[BLOCK_SIZE_4X4];
  2179. for(j =0; j < 3; j++)
  2180. LineDir[j] = LineDir0[j];
  2181. // Here is the main loop.
  2182. // 1. Project input set on the axis in consideration.
  2183. // 2. Run 1 dimensional search (see scalar case) to find an (sub) optimal pair of end points.
  2184. // 3. Compute the vector of indexes (or clusters) for the current approximate ramp.
  2185. // 4. Present our color channels as 3 16DIM vectors.
  2186. // 5. Find closest approximation of each of 16DIM color vector with the projection of the 16DIM index vector.
  2187. // 6. Plug the projections as a new directional vector for the axis.
  2188. // 7. Goto 1.
  2189. // D - is 16 dim "index" vector (or 16 DIM vector of indexes - {0, 1/3, 2/3, 0, ...,}, but shifted and normalized).
  2190. // Ci - is a 16 dim vector of color i.
  2191. // for each Ci find a scalar Ai such that
  2192. // (Ai * D - Ci) (Ai * D - Ci) -> min , i.e distance between vector AiD and C is min.
  2193. // You can think of D as a unit interval(vector) "clusterizer",
  2194. // and Ai is a scale you need to apply to the clusterizer to
  2195. // approximate the Ci vector instead of the unit vector.
  2196. // Solution is
  2197. // Ai = (D . Ci) / (D . D); . - is a dot product.
  2198. // in 3 dim space Ai(s) represent a line direction, along which
  2199. // we again try to find (sub)optimal quantizer.
  2200. // That's what our for(;;) loop is about.
  2201. for(;;) {
  2202. // 1. Project input set on the axis in consideration.
  2203. // From Foley & Van Dam: Closest point of approach of a line (P + v) to a point (R) is
  2204. // P + ((R-P).v) / (v.v))v
  2205. // The distance along v is therefore (R-P).v / (v.v)
  2206. // (v.v) is 1 if v is a unit vector.
  2207. //
  2208. PrjBnd[0] = 1000.;
  2209. PrjBnd[1] = -1000.;
  2210. for(i = 0; i < BLOCK_SIZE_4X4; i++)
  2211. Prj0[i] = Prj[i] = PrjErr[i] = PreMRep[i] = 0.f;
  2212. for(i = 0; i < dwUniqueColors; i++) {
  2213. Prj0[i] = Prj[i] = BlkSh[i][0] * LineDir[0] + BlkSh[i][1] * LineDir[1] + BlkSh[i][2] * LineDir[2];
  2214. PrjErr[i] = (BlkSh[i][0] - LineDir[0] * Prj[i]) * (BlkSh[i][0] - LineDir[0] * Prj[i])
  2215. + (BlkSh[i][1] - LineDir[1] * Prj[i]) * (BlkSh[i][1] - LineDir[1] * Prj[i])
  2216. + (BlkSh[i][2] - LineDir[2] * Prj[i]) * (BlkSh[i][2] - LineDir[2] * Prj[i]);
  2217. PrjBnd[0] = min(PrjBnd[0], Prj[i]);
  2218. PrjBnd[1] = max(PrjBnd[1], Prj[i]);
  2219. }
  2220. // 2. Run 1 dimensional search (see scalar case) to find an (sub) optimal pair of end points.
  2221. // min and max of the search interval
  2222. CGU_FLOAT stepf = 0.125f;
  2223. CGU_FLOAT Scl[NUM_ENDPOINTS];
  2224. Scl[0] = PrjBnd[0] - (PrjBnd[1] - PrjBnd[0]) * stepf;
  2225. Scl[1] = PrjBnd[1] + (PrjBnd[1] - PrjBnd[0]) * stepf;
  2226. // No range found exit
  2227. if (Scl[0] == Scl[1]) {
  2228. return false;
  2229. }
  2230. // compute scaling factor to scale down the search interval to [0.,1]
  2231. const CGU_FLOAT Scl2 = (Scl[1] - Scl[0]) * (Scl[1] - Scl[0]);
  2232. const CGU_FLOAT overScl = 1.f/(Scl[1] - Scl[0]);
  2233. for(i = 0; i < dwUniqueColors; i++) {
  2234. // scale them
  2235. Prj[i] = (Prj[i] - Scl[0]) * overScl;
  2236. // premultiply the scale squire to plug into error computation later
  2237. PreMRep[i] = Rpt[i] * Scl2;
  2238. }
  2239. // scale first approximation of end points
  2240. for(k = 0; k <2; k++)
  2241. PrjBnd[k] = (PrjBnd[k] - Scl[0]) * overScl;
  2242. CGU_FLOAT StepErr = MAX_ERROR;
  2243. // search step
  2244. static const CGU_FLOAT searchStep = 0.025f;
  2245. // low Start/End; high Start/End
  2246. const CGU_FLOAT lowStartEnd = (PrjBnd[0] - 2.f * searchStep > 0.f) ? PrjBnd[0] - 2.f * searchStep : 0.f;
  2247. const CGU_FLOAT highStartEnd = (PrjBnd[1] + 2.f * searchStep < 1.f) ? PrjBnd[1] + 2.f * searchStep : 1.f;
  2248. // find the best endpoints
  2249. CGU_FLOAT Pos[NUM_ENDPOINTS];
  2250. CGU_FLOAT lowPosStep, highPosStep;
  2251. CGU_FLOAT err;
  2252. int l, h;
  2253. for(l = 0, lowPosStep = lowStartEnd; l < 8; l++, lowPosStep += searchStep) {
  2254. for(h = 0, highPosStep = highStartEnd; h < 8; h++, highPosStep -= searchStep) {
  2255. // compute an error for the current pair of end points.
  2256. err = cpu_RampSrchW(Prj, PrjErr, PreMRep, StepErr, lowPosStep, highPosStep, dwUniqueColors, dwNumPoints);
  2257. if(err < StepErr) {
  2258. // save better result
  2259. StepErr = err;
  2260. Pos[0] = lowPosStep;
  2261. Pos[1] = highPosStep;
  2262. }
  2263. }
  2264. }
  2265. // inverse the scaling
  2266. for(k = 0; k < 2; k++)
  2267. Pos[k] = Pos[k] * (Scl[1] - Scl[0])+ Scl[0];
  2268. // did we find somthing better from the previous run?
  2269. if(StepErr + 0.001 < ErrG) {
  2270. // yes, remember it
  2271. ErrG = StepErr;
  2272. LineDirG[0] = LineDir[0];
  2273. LineDirG[1] = LineDir[1];
  2274. LineDirG[2] = LineDir[2];
  2275. PosG[0] = Pos[0];
  2276. PosG[1] = Pos[1];
  2277. // 3. Compute the vector of indexes (or clusters) for the current approximate ramp.
  2278. // indexes
  2279. const CGU_FLOAT step = (Pos[1] - Pos[0]) / (CGU_FLOAT)(dwNumPoints - 1);
  2280. const CGU_FLOAT step_h = step * (CGU_FLOAT)0.5;
  2281. const CGU_FLOAT rstep = (CGU_FLOAT)1.0f / step;
  2282. const CGU_FLOAT overBlkTp = 1.f/ (CGU_FLOAT)(dwNumPoints - 1) ;
  2283. // here the index vector is computed,
  2284. // shifted and normalized
  2285. CGU_FLOAT indxAvrg = (CGU_FLOAT)(dwNumPoints - 1) / 2.f;
  2286. for(i=0; i < dwUniqueColors; i++) {
  2287. CGU_FLOAT del;
  2288. //int n = (int)((b - _min_ex + (step*0.5f)) * rstep);
  2289. if((del = Prj0[i] - Pos[0]) <= 0)
  2290. RmpIndxs[i] = 0.f;
  2291. else if(Prj0[i] - Pos[1] >= 0)
  2292. RmpIndxs[i] = (CGU_FLOAT)(dwNumPoints - 1);
  2293. else
  2294. RmpIndxs[i] = floor((del + step_h) * rstep);
  2295. // shift and normalization
  2296. RmpIndxs[i] = (RmpIndxs[i] - indxAvrg) * overBlkTp;
  2297. }
  2298. // 4. Present our color channels as 3 16DIM vectors.
  2299. // 5. Find closest aproximation of each of 16DIM color vector with the pojection of the 16DIM index vector.
  2300. CGU_FLOAT Crs[3], Len, Len2;
  2301. for(i = 0, Crs[0] = Crs[1] = Crs[2] = Len = 0.f; i < dwUniqueColors; i++) {
  2302. const CGU_FLOAT PreMlt = RmpIndxs[i] * Rpt[i];
  2303. Len += RmpIndxs[i] * PreMlt;
  2304. for(j = 0; j < 3; j++)
  2305. Crs[j] += BlkSh[i][j] * PreMlt;
  2306. }
  2307. LineDir[0] = LineDir[1] = LineDir[2] = 0.f;
  2308. if(Len > 0.f) {
  2309. LineDir[0] = Crs[0]/ Len;
  2310. LineDir[1] = Crs[1]/ Len;
  2311. LineDir[2] = Crs[2]/ Len;
  2312. // 6. Plug the projections as a new directional vector for the axis.
  2313. // 7. Goto 1.
  2314. Len2 = LineDir[0] * LineDir[0] + LineDir[1] * LineDir[1] + LineDir[2] * LineDir[2];
  2315. Len2 = sqrt(Len2);
  2316. LineDir[0] /= Len2;
  2317. LineDir[1] /= Len2;
  2318. LineDir[2] /= Len2;
  2319. }
  2320. } else // We was not able to find anything better. Drop dead.
  2321. break;
  2322. }
  2323. // inverse transform to find end-points of 3-color ramp
  2324. for(k = 0; k < 2; k++)
  2325. for(j = 0; j < 3; j++)
  2326. rsltC[j][k] = (PosG[k] * LineDirG[j] + Mdl[j]) * 255.f;
  2327. }
  2328. // We've dealt with (almost) unrestricted full precision realm.
  2329. // Now back to the dirty digital world.
  2330. // round the end points to make them look like compressed ones
  2331. CGU_FLOAT inpRmpEndPts[NUM_CHANNELS][NUM_ENDPOINTS];
  2332. cpu_MkRmpOnGrid(inpRmpEndPts, rsltC, 0.f, 255.f, nRedBits, nGreenBits, nBlueBits);
  2333. // Try using this on 3 channels
  2334. // static CGU_Vec2i cmp_getLinearEndPoints(CGU_FLOAT _Blk[BLOCK_SIZE_4X4], CMP_IN CGU_FLOAT fquality, CMP_IN CGU_BOOL isSigned);
  2335. // This not a small procedure squeezes and stretches the ramp along each axis (R,G,B) separately while other 2 are fixed.
  2336. // It does it only over coarse grid - 565 that is. It tries to squeeze more precision for the real world ramp.
  2337. #if defined(USE_REFINE) || defined(USE_REFINE3D)
  2338. switch(nRefinementSteps) {
  2339. case 1:
  2340. cmp_Refine(_RsltRmpPnts, inpRmpEndPts, src_image, Rpt, dwUniqueColors, dwNumPoints, pfWeights, nRedBits, nGreenBits, nBlueBits,3);
  2341. break;
  2342. case 2:
  2343. if (dwUniqueColors > 2)
  2344. cmp_Refine3D(_RsltRmpPnts, inpRmpEndPts, src_image, Rpt, dwUniqueColors, dwNumPoints, pfWeights, nRedBits, nGreenBits, nBlueBits, 1);
  2345. else
  2346. cmp_Refine(_RsltRmpPnts, inpRmpEndPts, src_image, Rpt, dwUniqueColors, dwNumPoints, pfWeights, nRedBits, nGreenBits, nBlueBits,3);
  2347. break;
  2348. default:
  2349. cmp_Refine(_RsltRmpPnts, inpRmpEndPts, src_image, Rpt, dwUniqueColors, dwNumPoints, pfWeights, nRedBits, nGreenBits, nBlueBits, 1);
  2350. break;
  2351. }
  2352. #endif
  2353. return true;
  2354. }
  2355. // CPU: CompRGBBlock()
  2356. CMP_STATIC CGU_FLOAT cpu_CompRGBBlock32(CGU_UINT32 block_32[16],
  2357. CGU_UINT32 compressedBlock[2],
  2358. CGU_UINT32 dwBlockSize,
  2359. CGU_UINT8 nRedBits,
  2360. CGU_UINT8 nGreenBits,
  2361. CGU_UINT8 nBlueBits,
  2362. CGU_UINT8 nEndpoints[3][NUM_ENDPOINTS],
  2363. CGU_UINT8 pcIndices[BLOCK_SIZE_4X4],
  2364. CGU_UINT8 dwNumPoints,
  2365. bool b3DRefinement,
  2366. CGU_UINT8 m_nRefinementSteps,
  2367. CGU_FLOAT _pfChannelWeights[3],
  2368. bool _bUseAlpha,
  2369. CGU_UINT8 _nAlphaThreshold)
  2370. {
  2371. ALIGN_16 CGU_FLOAT Rpt[BLOCK_SIZE_4X4];
  2372. ALIGN_16 CGU_FLOAT BlkIn[BLOCK_SIZE_4X4][NUM_CHANNELS];
  2373. CGU_UINT32 mx;
  2374. for (mx=0; mx < BLOCK_SIZE_4X4; mx++) {
  2375. Rpt[mx] = 0;
  2376. BlkIn[mx][0] = 0;
  2377. BlkIn[mx][1] = 0;
  2378. BlkIn[mx][2] = 0;
  2379. BlkIn[mx][3] = 0;
  2380. }
  2381. compressedBlock[0] = 0;
  2382. CGU_UINT32 dwAlphaThreshold = _nAlphaThreshold << 24;
  2383. CGU_UINT32 dwColors = 0;
  2384. CGU_UINT32 dwBlk[BLOCK_SIZE];
  2385. for(CGU_UINT32 i = 0; i < dwBlockSize; i++)
  2386. if(!_bUseAlpha || (block_32[i] & 0xff000000) >= dwAlphaThreshold)
  2387. dwBlk[dwColors++] = block_32[i] | 0xff000000;
  2388. // Do we have any colors ?
  2389. static int id=0;
  2390. if(dwColors) {
  2391. bool bHasAlpha = (dwColors != dwBlockSize);
  2392. if(bHasAlpha && _bUseAlpha && !(dwNumPoints & 0x1))
  2393. return CMP_FLT_MAX;
  2394. // Here we are computing an unique number of colors.
  2395. // For each unique value we compute the number of it appearences.
  2396. //qsort((void *)dwBlk, (size_t)dwColors, sizeof(CGU_UINT32), QSortIntCmp);
  2397. #ifndef ASPM_GPU // this is here for reminder when code moves to GPU
  2398. std::sort(dwBlk, dwBlk + 15);
  2399. #else
  2400. {
  2401. CGU_UINT32 j;
  2402. CMP_di what[BLOCK_SIZE_4X4];
  2403. for (i = 0; i < dwColors; i++)
  2404. {
  2405. what[i].index = i;
  2406. what[i].data = dwBlk[i];
  2407. }
  2408. CGU_UINT32 tmp_index;
  2409. CGU_UINT32 tmp_data;
  2410. for (i = 1; i < dwColors; i++)
  2411. {
  2412. for (j = i; j > 0; j--)
  2413. {
  2414. if (what[j - 1].data > what[j].data)
  2415. {
  2416. tmp_index = what[j].index;
  2417. tmp_data = what[j].data;
  2418. what[j].index = what[j - 1].index;
  2419. what[j].data = what[j - 1].data;
  2420. what[j - 1].index = tmp_index;
  2421. what[j - 1].data = tmp_data;
  2422. }
  2423. }
  2424. }
  2425. for (i = 0; i < dwColors; i++)
  2426. dwBlk[i] = what[i].data;
  2427. }
  2428. #endif
  2429. CGU_UINT32 new_p;
  2430. CGU_UINT32 dwBlkU[BLOCK_SIZE_4X4];
  2431. CGU_UINT32 dwUniqueColors = 0;
  2432. new_p = dwBlkU[0] = dwBlk[0];
  2433. Rpt[dwUniqueColors] = 1.f;
  2434. CGU_UINT32 i;
  2435. for( i = 1; i < dwColors; i++) {
  2436. if(new_p != dwBlk[i]) {
  2437. dwUniqueColors++;
  2438. new_p = dwBlkU[dwUniqueColors] = dwBlk[i];
  2439. Rpt[dwUniqueColors] = 1.f;
  2440. } else
  2441. Rpt[dwUniqueColors] += 1.f;
  2442. }
  2443. dwUniqueColors++;
  2444. // switch to float
  2445. for( i=0; i<dwUniqueColors; i++) {
  2446. BlkIn[i][RC] = (CGU_FLOAT)((dwBlkU[i] >> 16) & 0xff); // R
  2447. BlkIn[i][GC] = (CGU_FLOAT)((dwBlkU[i] >> 8) & 0xff); // G
  2448. BlkIn[i][BC] = (CGU_FLOAT)((dwBlkU[i] >> 0) & 0xff); // B
  2449. BlkIn[i][AC] = 255.0f;
  2450. }
  2451. CGU_FLOAT rsltC[NUM_CHANNELS][NUM_ENDPOINTS];
  2452. if (cpu_CompressRGBBlockX(rsltC, // CMP_EndPoints = CompressRGBBlock_Slow2 (
  2453. BlkIn, // CGU_Vec3f src_imageNorm[BLOCK_SIZE_4X4]
  2454. Rpt, // CGU_FLOAT Rpt[BLOCK_SIZE_4X4],
  2455. dwUniqueColors, // CGU_UINT32 dwUniqueColors,
  2456. dwNumPoints, // CGU_UINT32 dwNumPoints,
  2457. b3DRefinement, //
  2458. m_nRefinementSteps, // CGU_UINT32 m_nRefinementSteps,
  2459. _pfChannelWeights, // CGU_Vec3f channelWeightsBGR,
  2460. nRedBits, // );
  2461. nGreenBits,
  2462. nBlueBits,
  2463. 1.0f) )
  2464. {
  2465. // return to integer realm
  2466. for(int ch = 0; ch < 3; ch++)
  2467. for(int j = 0; j < 2; j++)
  2468. nEndpoints[ch][j] = (CGU_UINT8 )rsltC[ch][j];
  2469. //printf("Endpoints {%3d,%3d,%3d} {%3d,%3d,%3d} ", nEndpoints[0][0],nEndpoints[1][0],nEndpoints[2][0],
  2470. // nEndpoints[0][1],nEndpoints[1][1],nEndpoints[2][1]);
  2471. // Now get the indices using the new end points
  2472. return cpu_Clstr(block_32, dwBlockSize, nEndpoints, pcIndices, dwNumPoints, _pfChannelWeights, _bUseAlpha,_nAlphaThreshold, nRedBits, nGreenBits, nBlueBits);
  2473. }
  2474. else {
  2475. CGU_FLOAT CompErr = CMP_FLT_MAX;
  2476. if (dwNumPoints < 4) {
  2477. CGU_Vec3f src_imageNorm[BLOCK_SIZE_4X4];
  2478. for (CGU_UINT32 px = 0; px < 16; px++)
  2479. {
  2480. src_imageNorm[px].r = (CGU_FLOAT)((block_32[px] >> 16) & 0xff)/ 255.0f;
  2481. src_imageNorm[px].g = (CGU_FLOAT)((block_32[px] >> 8) & 0xff)/ 255.0f;
  2482. src_imageNorm[px].b = (CGU_FLOAT)((block_32[px] >> 0) & 0xff)/ 255.0f;
  2483. }
  2484. // Do a quick compression test
  2485. CGU_Vec3f srcRGB[16]; // The list of source colors with blue channel altered
  2486. CGU_Vec3f average_rgb; // The centrepoint of the axis
  2487. CGU_FLOAT errLQ = CMP_FLT_MAX;
  2488. cgu_CompressRGBBlock_MinMax(src_imageNorm, 1.0f, false,srcRGB, average_rgb, errLQ);
  2489. CGU_Vec2ui cmp = cgu_CompressRGBBlock_Fast(src_imageNorm, 1.0f, false,srcRGB, average_rgb, CompErr);
  2490. compressedBlock[0] = cmp.x;
  2491. compressedBlock[1] = cmp.y;
  2492. }
  2493. return CompErr;
  2494. }
  2495. } else {
  2496. // All colors transparent
  2497. nEndpoints[0][0] = nEndpoints[1][0] = nEndpoints[2][0] = 0;
  2498. nEndpoints[0][1] = nEndpoints[1][1] = nEndpoints[2][1] = 0xff;
  2499. for (CGU_UINT32 ms=0; ms<dwBlockSize; ms++)
  2500. pcIndices[ms] = 0xff;
  2501. return 0.0;
  2502. }
  2503. }
  2504. CMP_STATIC CGU_Vec2ui cpu_CompRGBBlock(CMP_IN CGU_Vec4uc bgraBlock[BLOCK_SIZE_4X4],
  2505. CMP_IN CMP_BC15Options BC15Options,
  2506. CMP_INOUT CGU_FLOAT CMP_REFINOUT err)
  2507. {
  2508. CGU_Vec2ui cmpBlock = {0U,0U};
  2509. CGU_FLOAT pfChannelWeights[3] = {1.0f,1.0f,1.0f};
  2510. CGU_UINT8 nEndpoints[2][3][2];
  2511. CGU_UINT8 nIndices[2][BLOCK_SIZE_4X4];
  2512. CGU_UINT32 compressedBlock[2] = {0,0};
  2513. CGU_FLOAT fError3 = CMP_FLT_MAX;
  2514. fError3 = cpu_CompRGBBlock32((CGU_UINT32*)bgraBlock,
  2515. compressedBlock,
  2516. BLOCK_SIZE_4X4, RG, GG, BG,
  2517. nEndpoints[0],
  2518. nIndices[0],
  2519. 3,
  2520. BC15Options.m_b3DRefinement,
  2521. BC15Options.m_nRefinementSteps,
  2522. pfChannelWeights,
  2523. BC15Options.m_bUseAlpha,
  2524. BC15Options.m_nAlphaThreshold);
  2525. // use case of small min max ranges
  2526. if (compressedBlock[0] > 0)
  2527. {
  2528. //return cmpBlockBlue;
  2529. cmpBlock.x = compressedBlock[0];
  2530. cmpBlock.y = compressedBlock[1];
  2531. err = fError3;
  2532. }
  2533. else
  2534. {
  2535. CGU_FLOAT fError4 = CMP_FLT_MAX;
  2536. fError4 = (fError3 == 0.0) ? CMP_FLT_MAX :cpu_CompRGBBlock32((CGU_UINT32*)bgraBlock,
  2537. compressedBlock,
  2538. BLOCK_SIZE_4X4, RG, GG, BG,
  2539. nEndpoints[1],
  2540. nIndices[1],
  2541. 4,
  2542. BC15Options.m_b3DRefinement,
  2543. BC15Options.m_nRefinementSteps,
  2544. pfChannelWeights,
  2545. BC15Options.m_bUseAlpha,
  2546. BC15Options.m_nAlphaThreshold);
  2547. CGU_UINT32 nMethod;
  2548. if (fError3 <= fError4) {
  2549. err = fError3;
  2550. nMethod = 0;
  2551. }
  2552. else {
  2553. err = fError4;
  2554. nMethod = 1;
  2555. }
  2556. CGU_UINT32 c0 = BC1ConstructColour((nEndpoints[nMethod][RC][0] >> (8-RG)), (nEndpoints[nMethod][GC][0] >> (8-GG)), (nEndpoints[nMethod][BC][0] >> (8-BG)));
  2557. CGU_UINT32 c1 = BC1ConstructColour((nEndpoints[nMethod][RC][1] >> (8-RG)), (nEndpoints[nMethod][GC][1] >> (8-GG)), (nEndpoints[nMethod][BC][1] >> (8-BG)));
  2558. if(nMethod == 1 && c0 <= c1 || nMethod == 0 && c0 > c1)
  2559. compressedBlock[0] = c1 | (c0<<16);
  2560. else
  2561. compressedBlock[0] = c0 | (c1<<16);
  2562. compressedBlock[1] = 0;
  2563. for(CGU_UINT32 i=0; i<16; i++)
  2564. compressedBlock[1] |= (nIndices[nMethod][i] << (2*i));
  2565. cmpBlock.x = compressedBlock[0];
  2566. cmpBlock.y = compressedBlock[1];
  2567. }
  2568. return cmpBlock;
  2569. }
  2570. #endif
  2571. #ifdef ENABLE_NEW_CODE
  2572. //---------------------------------------- Common Utility Code -------------------------------------------------------
  2573. // 1 - Dim error
  2574. CMP_STATIC CGU_FLOAT cgu_RampSrchW( CGU_FLOAT Prj[BLOCK_SIZE_4X4],
  2575. CGU_FLOAT PrjErr[BLOCK_SIZE_4X4],
  2576. CGU_FLOAT PreMRep[BLOCK_SIZE_4X4],
  2577. CGU_FLOAT StepErr,
  2578. CGU_FLOAT lowPosStep,
  2579. CGU_FLOAT highPosStep,
  2580. CGU_UINT32 dwUniqueColors,
  2581. CGU_UINT32 dwNumPoints)
  2582. {
  2583. CGU_FLOAT error = 0;
  2584. CGU_FLOAT step = (highPosStep - lowPosStep) / (dwNumPoints - 1);
  2585. CGU_FLOAT step_h = step * (CGU_FLOAT)0.5;
  2586. CGU_FLOAT rstep = (CGU_FLOAT)1.0f / step;
  2587. for (CGU_UINT32 i = 0; i < dwUniqueColors; i++)
  2588. {
  2589. CGU_FLOAT v;
  2590. // Work out which value in the block this select
  2591. CGU_FLOAT del;
  2592. if ((del = Prj[i] - lowPosStep) <= 0)
  2593. v = lowPosStep;
  2594. else if (Prj[i] - highPosStep >= 0)
  2595. v = highPosStep;
  2596. else
  2597. v = floor((del + step_h) * rstep) * step + lowPosStep;
  2598. // And accumulate the error
  2599. CGU_FLOAT d = (Prj[i] - v);
  2600. d *= d;
  2601. CGU_FLOAT err = PreMRep[i] * d + PrjErr[i];
  2602. error += err;
  2603. if (StepErr < error)
  2604. {
  2605. error = StepErr;
  2606. break;
  2607. }
  2608. }
  2609. return error;
  2610. }
  2611. CMP_STATIC CGU_UINT32 cgu_processCluster( CMP_IN CMP_EndPoints EndPoints,
  2612. CMP_IN CGU_Vec4f rgbBlock_normal[BLOCK_SIZE_4X4],
  2613. CMP_IN CGU_UINT32 dwAlphaThreshold,
  2614. CMP_IN CGU_Vec3f channelWeights,
  2615. CMP_IN CGU_UINT8 indices[BLOCK_SIZE_4X4],
  2616. CMP_OUT CGU_FLOAT CMP_REFINOUT Err )
  2617. {
  2618. Err = 0.f;
  2619. CGU_UINT32 pcIndices = 0;
  2620. CGU_UINT32 R, G, B;
  2621. R = (CGU_UINT32)(EndPoints.Color0.z);
  2622. G = (CGU_UINT32)(EndPoints.Color0.y);
  2623. B = (CGU_UINT32)(EndPoints.Color0.x);
  2624. CGU_INT32 cluster0 = cmp_constructColor(R, G, B);
  2625. R = (CGU_UINT32)(EndPoints.Color1.z);
  2626. G = (CGU_UINT32)(EndPoints.Color1.y);
  2627. B = (CGU_UINT32)(EndPoints.Color1.x);
  2628. CGU_INT32 cluster1 = cmp_constructColor(R, G, B);
  2629. CGU_Vec3f InpRmp[NUM_ENDPOINTS];
  2630. if ((cluster0 <= cluster1) // valid for 4 channels
  2631. // || (cluster0 > cluster1) // valid for 3 channels
  2632. )
  2633. {
  2634. // inverse endpoints
  2635. InpRmp[0] = EndPoints.Color1;
  2636. InpRmp[1] = EndPoints.Color0;
  2637. }
  2638. else
  2639. {
  2640. InpRmp[0] = EndPoints.Color0;
  2641. InpRmp[1] = EndPoints.Color1;
  2642. }
  2643. CGU_Vec3f srcblockLinear[BLOCK_SIZE_4X4];
  2644. CGU_FLOAT srcblockA[BLOCK_SIZE_4X4];
  2645. // Swizzle the source RGB to BGR for processing
  2646. for (CGU_UINT32 i = 0; i < BLOCK_SIZE_4X4; i++)
  2647. {
  2648. srcblockLinear[i].z = rgbBlock_normal[i].x * 255.0f;
  2649. srcblockLinear[i].y = rgbBlock_normal[i].y * 255.0f;
  2650. srcblockLinear[i].x = rgbBlock_normal[i].z * 255.0f;
  2651. srcblockA[i] = 0.0f;
  2652. //if (dwAlphaThreshold > 0)
  2653. //{
  2654. // CGU_UINT32 alpha = (CGU_UINT32)BlockA[i];
  2655. // if (alpha >= dwAlphaThreshold)
  2656. // srcblockA[i] = BlockA[i];
  2657. //}
  2658. }
  2659. // cmp_ClstrBas2()
  2660. // input ramp is on the coarse grid
  2661. // make ramp endpoints the way they'll going to be decompressed
  2662. CGU_Vec3f InpRmpL[NUM_ENDPOINTS];
  2663. CGU_Vec3f Fctrs = {32.0F, 64.0F, 32.0F}; // 1 << RG,1 << GG,1 << BG
  2664. {
  2665. // ConstantRamp = MkWkRmpPts(InpRmpL, InpRmp);
  2666. InpRmpL[0] = InpRmp[0] + cmp_floorVec3f(InpRmp[0] / Fctrs);
  2667. InpRmpL[0] = cmp_clampVec3f(InpRmpL[0], 0.0f, 255.0f);
  2668. InpRmpL[1] = InpRmp[1] + cmp_floorVec3f(InpRmp[1] / Fctrs);
  2669. InpRmpL[1] = cmp_clampVec3f(InpRmpL[1], 0.0f, 255.0f);
  2670. } // MkWkRmpPts
  2671. // build ramp
  2672. CGU_Vec3f LerpRmp[4];
  2673. CGU_Vec3f offset = {1.0f, 1.0f, 1.0f};
  2674. {
  2675. //BldRmp(Rmp, InpRmpL, dwNumChannels);
  2676. // linear interpolate end points to get the ramp
  2677. LerpRmp[0] = InpRmpL[0];
  2678. LerpRmp[3] = InpRmpL[1];
  2679. LerpRmp[1] = cmp_floorVec3f((InpRmpL[0] * 2.0f + LerpRmp[3] + offset) / 3.0f);
  2680. LerpRmp[2] = cmp_floorVec3f((InpRmpL[0] + LerpRmp[3] * 2.0f + offset) / 3.0f);
  2681. } // BldRmp
  2682. //=========================================================================
  2683. // Clusterize, Compute error and find DXTC indexes for the current cluster
  2684. //=========================================================================
  2685. {
  2686. // Clusterize
  2687. CGU_UINT32 alpha;
  2688. // For each colour in the original block assign it
  2689. // to the closest cluster and compute the cumulative error
  2690. for (CGU_UINT32 i = 0; i < BLOCK_SIZE_4X4; i++)
  2691. {
  2692. alpha = (CGU_UINT32)srcblockA[i];
  2693. if ((dwAlphaThreshold > 0) && alpha == 0)
  2694. { //*((CGU_UINT32 *)&_Blk[i][AC]) == 0)
  2695. pcIndices |= cmp_set2Bit32(4, i); // dwNumChannels 3 or 4 (default is 4)
  2696. indices[i] = 4;
  2697. }
  2698. else
  2699. {
  2700. CGU_FLOAT shortest = 99999999999.f;
  2701. CGU_UINT8 shortestIndex = 0;
  2702. CGU_Vec3f channelWeightsBGR;
  2703. channelWeightsBGR.x = channelWeights.z;
  2704. channelWeightsBGR.y = channelWeights.y;
  2705. channelWeightsBGR.z = channelWeights.x;
  2706. for (CGU_UINT8 rampindex = 0; rampindex < 4; rampindex++)
  2707. {
  2708. // r is either 1 or 4
  2709. // calculate the distance for each component
  2710. CGU_FLOAT distance = cmp_dotVec3f(((srcblockLinear[i] - LerpRmp[rampindex]) * channelWeightsBGR),
  2711. ((srcblockLinear[i] - LerpRmp[rampindex]) * channelWeightsBGR));
  2712. if (distance < shortest)
  2713. {
  2714. shortest = distance;
  2715. shortestIndex = rampindex;
  2716. }
  2717. }
  2718. Err += shortest;
  2719. // The total is a sum of (error += shortest)
  2720. // We have the index of the best cluster, so assign this in the block
  2721. // Reorder indices to match correct DXTC ordering
  2722. if (shortestIndex == 3) // dwNumChannels - 1
  2723. shortestIndex = 1;
  2724. else if (shortestIndex)
  2725. shortestIndex++;
  2726. pcIndices |= cmp_set2Bit32(shortestIndex, i);
  2727. indices[i] = shortestIndex;
  2728. }
  2729. } // BLOCK_SIZE_4X4
  2730. } // Clusterize
  2731. return pcIndices;
  2732. }
  2733. #endif
  2734. // Process a rgbBlock which is normalized (0.0f ... 1.0f), signed normal is not implemented
  2735. CMP_STATIC CGU_Vec2ui CompressBlockBC1_NORMALIZED(CMP_IN CGU_Vec4f src_imageNorm[BLOCK_SIZE_4X4],
  2736. CMP_IN CMP_BC15Options BC15Options)
  2737. {
  2738. bool usingMaxQualityOnly = false;
  2739. #ifndef ASPM_GPU
  2740. if (BC15Options.m_fquality > 0.75)
  2741. usingMaxQualityOnly = true;
  2742. #endif
  2743. CGU_FLOAT CompErr = CMP_FLT_MAX;
  2744. CGU_Vec2ui cmpBlock = {0U,0U};
  2745. CGU_Vec2ui cmpBlockTemp = {0U,0U};
  2746. CGU_FLOAT CompErrTemp;
  2747. // Transfer to RGB Norm from RGBA Norm
  2748. CGU_Vec3f src_imageRGBNorm[16];
  2749. CGU_Vec4uc pixels[16];
  2750. CGU_Vec4uc pixelsBGRA[16];
  2751. for (CGU_UINT32 sr = 0; sr < 16; sr++) {
  2752. src_imageRGBNorm[sr] = src_imageNorm[sr].rgb;
  2753. pixelsBGRA[sr].b = pixels[sr].r = src_imageNorm[sr].r * 255.0f;
  2754. pixelsBGRA[sr].g = pixels[sr].g = src_imageNorm[sr].g * 255.0f;
  2755. pixelsBGRA[sr].r = pixels[sr].b = src_imageNorm[sr].b * 255.0f;
  2756. pixelsBGRA[sr].a = pixels[sr].a = src_imageNorm[sr].a * 255.0f;
  2757. }
  2758. // check for a punch through transparent alpha setting
  2759. if ((BC15Options.m_fquality < 0.75) && (BC15Options.m_bUseAlpha)) {
  2760. CGU_Vec2ui cmpBlockAlpha = {0xffff0000,0xffffffffU};
  2761. for (CGU_UINT32 sr = 0; sr < 16; sr++)
  2762. if (pixels[sr].a < BC15Options.m_nAlphaThreshold) {
  2763. return cmpBlockAlpha;
  2764. }
  2765. }
  2766. //================
  2767. // extern codec
  2768. //================
  2769. // For debugging
  2770. // CGU_Vec2ui cmpBlockRed = {0xF800F800,0x00000000};
  2771. // CGU_Vec2ui cmpBlockGreen = {0x7E007E00,0x00000000};
  2772. // CGU_Vec2ui cmpBlockBlue = {0x1F001F00,0x00000000};
  2773. if (!BC15Options.m_bUseAlpha ) {
  2774. //==========================================
  2775. // Gain +0.3 dB for images with soild blocks
  2776. //==========================================
  2777. bool bAllColoursEqual = true;
  2778. // Load the whole 4x4 block
  2779. for (CGU_UINT32 i = 0u; (i < 16u) && bAllColoursEqual; ++i)
  2780. {
  2781. for (CGU_INT c = 0; c < 3; c++)
  2782. bAllColoursEqual = bAllColoursEqual && (pixels[0][c] == pixels[i][c]);
  2783. }
  2784. if (bAllColoursEqual) {
  2785. cmpBlock = cgu_solidColorBlock(pixels[0].x,pixels[0].y,pixels[0].z);
  2786. CompErr = cgu_RGBABlockErrorLinear(pixels, cmpBlock);
  2787. if (BC15Options.m_nRefinementSteps < 1) return cmpBlock;
  2788. }
  2789. }
  2790. if (!usingMaxQualityOnly) {
  2791. //====================================
  2792. // Get src image data, min,max...
  2793. //=====================================
  2794. //CMP_EncodeData edata;
  2795. //cmp_get_encode_data(edata,pixels);
  2796. if (!BC15Options.m_bUseAlpha) {
  2797. //====================================
  2798. // Fast Compression, low quality
  2799. //=====================================
  2800. CGU_Vec3f srcRGB[16]; // The list of source colors with blue channel altered
  2801. CGU_Vec3f average_rgb; // The centrepoint of the axis
  2802. CGU_FLOAT errLQ = CMP_FLT_MAX;
  2803. cmpBlockTemp = cgu_CompressRGBBlock_MinMax(src_imageRGBNorm, BC15Options.m_fquality, BC15Options.m_bIsSRGB,srcRGB, average_rgb, errLQ);
  2804. if ((BC15Options.m_fquality < CMP_QUALITY0) || (errLQ == 0.0f))
  2805. return cmpBlockTemp;
  2806. if (CompErr > errLQ) {
  2807. CompErr = errLQ;
  2808. cmpBlock = cmpBlockTemp;
  2809. }
  2810. cmpBlockTemp = cgu_CompressRGBBlock_Fast(src_imageRGBNorm, BC15Options.m_fquality, BC15Options.m_bIsSRGB,srcRGB, average_rgb, errLQ);
  2811. if (CompErr > errLQ) {
  2812. CompErr = errLQ;
  2813. cmpBlock = cmpBlockTemp;
  2814. }
  2815. if (BC15Options.m_fquality < CMP_QUALITY1)
  2816. return cmpBlock;
  2817. }
  2818. //========================================
  2819. // use GPU codec lower quality then CPU
  2820. //========================================
  2821. cmpBlockTemp = cgu_CompRGBBlock(src_imageNorm,BC15Options);
  2822. CompErrTemp = cgu_RGBABlockErrorLinear(pixels, cmpBlockTemp);
  2823. if (CompErr > CompErrTemp) {
  2824. CompErr = CompErrTemp;
  2825. cmpBlock = cmpBlockTemp;
  2826. }
  2827. if (BC15Options.m_fquality < CMP_QUALITY2) return cmpBlock;
  2828. }// if useCGUCodecs
  2829. //====================================
  2830. // High Quality Codec CPU only
  2831. //=====================================
  2832. #ifndef ASPM_GPU
  2833. cmpBlockTemp = cpu_CompRGBBlock(pixelsBGRA,BC15Options,CompErrTemp);
  2834. CompErrTemp = cgu_RGBABlockErrorLinear(pixels, cmpBlockTemp);
  2835. if (CompErr > CompErrTemp) {
  2836. CompErr = CompErrTemp;
  2837. cmpBlock = cmpBlockTemp;
  2838. }
  2839. #endif
  2840. return cmpBlock;
  2841. }