bcn_common_kernel.h 106 KB


  1. //=============================================================================
  2. // Copyright (c) 2018-2021 Advanced Micro Devices, Inc. All rights reserved.
  3. //
  4. // Permission is hereby granted, free of charge, to any person obtaining a copy
  5. // of this software and associated documentation files(the "Software"), to deal
  6. // in the Software without restriction, including without limitation the rights
  7. // to use, copy, modify, merge, publish, distribute, sublicense, and / or sell
  8. // copies of the Software, and to permit persons to whom the Software is
  9. // furnished to do so, subject to the following conditions :
  10. //
  11. // The above copyright notice and this permission notice shall be included in
  12. // all copies or substantial portions of the Software.
  13. //
  14. // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  15. // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  16. // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.IN NO EVENT SHALL THE
  17. // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  18. // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  19. // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  20. // THE SOFTWARE.
  21. //
  22. //=====================================================================
  23. //=====================================================================
  24. // Block-compression (BC) functionality ref
  25. // Copyright (c) Microsoft Corporation. All rights reserved.
  26. // Licensed under the MIT License.
  27. //=====================================================================
  28. //************************************************************************************
  29. // ** NOTE **
  30. // Content and data types may change, use CMP_Core.h for interface to your application
  31. //************************************************************************************
  32. #ifndef _BCN_COMMON_KERNEL_H
  33. #define _BCN_COMMON_KERNEL_H
  34. #pragma warning(disable : 4505) // disable warnings on unreferenced local function has been removed
  35. #include "common_def.h"
  36. #include "bcn_common_api.h"
  //-----------------------------------------------------------------------
  // When build is for CPU, we have some missing API calls common to GPU
  // Use CPU CMP_Core replacements
  //-----------------------------------------------------------------------
  // ALIGN_16 is a storage qualifier used in BC1 HiQuality code and on the scratch
  // arrays of cmp_getLinearEndPoints. It expands as follows:
  //   - GPU / HLSL / OpenCL builds : nothing
  //   - CPU builds with WIN32 or _WIN64 defined : __declspec(align(16))
  //   - every other CPU build : nothing (no alignment is requested)
  #if defined(ASPM_GPU) || defined(ASPM_HLSL) || defined(ASPM_OPENCL)
  #define ALIGN_16
  #else
  // CPU build: include the math helper header whose name is supplied by the
  // INC_cmp_math_func macro (defined outside this file).
  #include INC_cmp_math_func
  #if defined(WIN32) || defined(_WIN64)
  #define ALIGN_16 __declspec(align(16))
  #else // !WIN32 && !_WIN64
  #define ALIGN_16
  #endif // !WIN32 && !_WIN64
  #endif
  // Offsets and size of a compressed block.
  // NOTE(review): units (32-bit words vs bytes) are not visible in this part of the file - confirm at the use sites.
  #define DXTC_OFFSET_ALPHA 0
  #define DXTC_OFFSET_RGB 2
  #define BC1CompBlockSize 8
  // Channel indices; the values match the RGBA8888_CHANNEL_* constants below (B = 0, G = 1, R = 2, A = 3).
  #define RC 2
  #define GC 1
  #define BC 0
  #define AC 3
  /*
  Channel Bits
  RGBA8888_CHANNEL_x is the channel index; RGBA8888_OFFSET_x is that index times 8,
  i.e. the bit offset of the channel inside a 32-bit value.
  */
  #define RGBA8888_CHANNEL_A 3
  #define RGBA8888_CHANNEL_R 2
  #define RGBA8888_CHANNEL_G 1
  #define RGBA8888_CHANNEL_B 0
  #define RGBA8888_OFFSET_A (RGBA8888_CHANNEL_A * 8)
  #define RGBA8888_OFFSET_R (RGBA8888_CHANNEL_R * 8)
  #define RGBA8888_OFFSET_G (RGBA8888_CHANNEL_G * 8)
  #define RGBA8888_OFFSET_B (RGBA8888_CHANNEL_B * 8)
  // Starting "worst" error for searches (cmp_getLinearEndPoints seeds its global search with it).
  // May be overridden by defining MAX_ERROR before this header is included.
  #ifndef MAX_ERROR
  #define MAX_ERROR 128000.f
  #endif
  // Array-size constants. BLOCK_SIZE_4X4 (16) is the number of texels in a 4x4 block.
  #define MAX_BLOCK 64
  #define MAX_POINTS 16
  #define BLOCK_SIZE MAX_BLOCK
  #define NUM_CHANNELS 4
  #define NUM_ENDPOINTS 2
  #define BLOCK_SIZE_4X4 16
  #define CMP_ALPHA_RAMP 8 // Number of Ramp Points used for Alpha Channels in BC5
  // Packs three components into one value: r shifted to bit 11, g to bit 5, b at bit 0.
  // Arguments are not masked, so callers must pass values that already fit their fields.
  #define ConstructColour(r, g, b) (((r) << 11) | ((g) << 5) | (b)) // same as BC1ConstructColour in common_api
  #define BYTEPP 4
  // Quality thresholds compared against the fquality argument / m_fquality option.
  // In this file, fquality < CMP_QUALITY2 selects the fast paths of
  // cmp_getLinearEndPoints and cmp_getBlockPackedIndices.
  #define CMP_QUALITY0 0.25f
  #define CMP_QUALITY1 0.50f
  #define CMP_QUALITY2 0.75f
  // Row-major access into a 4-wide array named pos_on_axis, which must be in scope where the macro is used.
  #define POS(x, y) (pos_on_axis[(x) + (y)*4])
  // Find the first approximation of the line
  // Assume there is a linear relation
  //   Z = a * X_In
  //   Z = b * Y_In
  // Find a,b to minimize MSE between Z and Z_In
  //
  // Tolerances used by that fit: EPS is (2/255)^2 and EPS2 is 3 * (2/255)^2.
  // Each expansion is wrapped in its own parentheses so the macro behaves as a
  // single value in every expression context (for example `x / EPS` divides by
  // the whole product instead of dividing by the first factor and multiplying
  // by the second).
  #define EPS ((2.f / 255.f) * (2.f / 255.f))
  #define EPS2 (3.f * (2.f / 255.f) * (2.f / 255.f))
  // Grid precision
  #define PIX_GRID 8
  #define BYTE_MASK 0x00ff
  #define SCH_STPS 3 // number of search steps to make at each end of interval
  // Step multipliers for endpoint moves: 0, then alternating -k / +k for k = 1..8.
  // cmp_linearBlockRefine indexes this table with (mode / SCH_STPS) and (mode % SCH_STPS),
  // so with SCH_STPS == 3 that function only reads the first three entries (0, -1, +1).
  static CMP_CONSTANT CGU_FLOAT sMvF[] = {0.f, -1.f, 1.f, -2.f, 2.f, -3.f, 3.f, -4.f, 4.f, -5.f, 5.f, -6.f, 6.f, -7.f, 7.f, -8.f, 8.f};
  // Search tuning used by cmp_getLinearEndPoints:
  //   GBL_SCH_STEP - increment of the global (coarse) endpoint search
  //   GBL_SCH_EXT  - how far that search extends around the observed min / max
  //   LCL_SCH_STEP - local refinement step, divided by 256 before use
  // Two presets are provided (_MXS and _MXQ); the _MXS values are selected here.
  // Defining GBL_SCH_STEP before including this header skips the whole group, in which
  // case GBL_SCH_EXT and LCL_SCH_STEP must be supplied by the includer as well.
  #ifndef GBL_SCH_STEP
  #define GBL_SCH_STEP_MXS 0.018f
  #define GBL_SCH_EXT_MXS 0.1f
  #define LCL_SCH_STEP_MXS 0.6f
  #define GBL_SCH_STEP_MXQ 0.0175f
  #define GBL_SCH_EXT_MXQ 0.154f
  #define LCL_SCH_STEP_MXQ 0.45f
  #define GBL_SCH_STEP GBL_SCH_STEP_MXS
  #define GBL_SCH_EXT GBL_SCH_EXT_MXS
  #define LCL_SCH_STEP LCL_SCH_STEP_MXS
  #endif
  // Raw views of compressed blocks. Only available in CPU builds (ASPM_GPU not defined).
  #ifndef ASPM_GPU
  // 64-bit block: the same 8 bytes seen as bytes, as two 32-bit words or as one 64-bit word.
  typedef union
  {
      // NOTE(review): this struct has a tag and no declarator, so it declares a nested
      // type rather than an anonymous member - confirm whether col0 / col1 / indices
      // are meant to be reachable directly through the union.
      struct colorblock64U
      {
          CGU_UINT8 col0;
          CGU_UINT8 col1;
          CGU_UINT8 indices[6];
      };
      CGU_INT8   cmp_data8[8];
      CGU_INT32  cmp_data32[2];
      CGU_UINT64 cmp_data64;
  } CMP_BLOCK64_UNORM;
  // Signed-normalised counterpart of CMP_BLOCK64_UNORM.
  typedef union
  {
      // NOTE(review): same tagged-struct remark as in CMP_BLOCK64_UNORM; the fields are
      // also declared unsigned (CGU_UINT8) although the block is named SNORM - confirm.
      struct colorblock64S
      {
          CGU_UINT8 col0;
          CGU_UINT8 col1;
          CGU_UINT8 indices[6];
      };
      CGU_INT8  cmp_data8[8];
      CGU_INT32 cmp_data32[2];
      // NOTE(review): two 64-bit words make this union 16 bytes, unlike the 8-byte
      // byte / word views above and unlike CMP_BLOCK64_UNORM - looks like a copy from
      // the 128-bit block; confirm before relying on sizeof(CMP_BLOCK64_SNORM).
      CGU_UINT64 cmp_data64[2];
  } CMP_BLOCK64_SNORM;
  // 128-bit block: 16 bytes seen as bytes, four 32-bit words or two 64-bit words.
  typedef union
  {
      CGU_INT8   cmp_data8[16];
      CGU_INT32  cmp_data32[4];
      CGU_UINT64 cmp_data64[2];
  } CMP_BLOCK128_UNORM;
  #endif
  // Pair of an unsigned integer value and an index.
  // NOTE(review): presumably used to sort values while remembering their original position,
  // like CMP_df is in cmp_getLinearEndPoints - no use of CMP_di is visible in this part of the file.
  typedef struct
  {
      CGU_UINT32 data;   // the value
      CGU_UINT32 index;  // position associated with the value
  } CMP_di;
  // Pair of a float value and an index. The GPU path of cmp_getLinearEndPoints sorts an
  // array of these by `data`, with `index` holding the element's position before sorting.
  typedef struct
  {
      CGU_FLOAT  data;   // the value being sorted
      CGU_UINT32 index;  // position of the value in the unsorted input
  } CMP_df;
  // Encoder options shared by the BC1..BC5 code paths. In CPU builds,
  // SetDefaultBC15Options assigns the default values.
  typedef struct
  {
      // user settable
      CGU_FLOAT  m_fquality;               // quality level; compared against the CMP_QUALITYn thresholds
      CGU_FLOAT  m_fChannelWeights[3];     // per-channel weights; rewritten by CalculateColourWeightings / CalculateColourWeightings3f
      CGU_BOOL   m_bUseChannelWeighting;   // false: CalculateColourWeightings* sets all three weights to 1.0
      CGU_BOOL   m_bUseAdaptiveWeighting;  // true: CalculateColourWeightings* skews the base weights by the block's mean colour
      CGU_BOOL   m_bUseFloat;
      CGU_BOOL   m_b3DRefinement;
      CGU_BOOL   m_bUseAlpha;
      CGU_BOOL   m_bIsSRGB;  // Use Linear to SRGB color conversion used in BC1, default is false
      CGU_BOOL   m_bIsSNORM;
      CGU_BOOL   m_sintsrc;  // source data pointer is signed data
      CGU_UINT32 m_nRefinementSteps;
      CGU_UINT32 m_nAlphaThreshold;
      CGU_BOOL   m_mapDecodeRGBA;  // default depends on whether CMP_SET_BC13_DECODER_RGBA is defined
      CGU_UINT32 m_src_width;
      CGU_UINT32 m_src_height;
  } CMP_BC15Options;
  // Result record of a BC1 encode attempt: two integer endpoints, one index per
  // texel of the 4x4 block, and a flag for the 3-colour variant.
  // NOTE(review): value ranges of the endpoints and indices are not visible here - confirm at the producers.
  typedef struct
  {
      CGU_Vec3i end_point0;
      CGU_Vec3i end_point1;
      CGU_UINT8 indices[16];  // one entry per texel
      CGU_BOOL  m_3color;
  } CMP_BC1_Encode_Results;
  // used in BC1 LowQuality code
  // Pair of floating-point RGB endpoint colours.
  typedef struct
  {
      CGU_Vec3f Color0;
      CGU_Vec3f Color1;
  } CMP_EndPoints;
  // Common data info used between encoders
  // Defines properties of current 4x4 pixel block
  // NOTE(review): the code that fills this record is outside this part of the file; the
  // per-field notes below restate the field names only.
  typedef struct
  {
      CGU_UINT32 grayscale_flag;
      CGU_UINT32 any_black_pixels;
      CGU_BOOL   all_colors_equal;
      CGU_Vec3i  min;    // per-channel minimum
      CGU_Vec3i  max;    // per-channel maximum
      CGU_Vec3i  total;  // per-channel sum
      CGU_Vec3i  avg;    // per-channel average
  } CMP_EncodeData;
  // One 8-byte BC1 block.
  // CPU builds overlay two views of the same storage: two 32-bit words (colors, indices)
  // or eight bytes (2 low-colour bytes, 2 high-colour bytes, 4 selector bytes).
  // GPU builds only expose the two 32-bit words.
  typedef struct
  {
  // Union struct not supported on GPU
  // 8 Bytes Total
  #ifndef ASPM_GPU
      union {
          struct { // 2 x 32bit
              CGU_UINT32 colors;
              CGU_UINT32 indices;
          };
          struct { // 8 x 8bit
              CGU_UINT8 m_low_color[2];
              CGU_UINT8 m_high_color[2];
              CGU_UINT8 m_selectors[4];
          };
      };
      // Stores a 16-bit colour into m_low_color, least-significant byte first.
      inline void set_low_color(CGU_UINT16 c)
      {
          m_low_color[0] = static_cast<CGU_UINT8>(c & 0xFF);
          m_low_color[1] = static_cast<CGU_UINT8>((c >> 8) & 0xFF);
      }
      // Stores a 16-bit colour into m_high_color, least-significant byte first.
      inline void set_high_color(CGU_UINT16 c)
      {
          m_high_color[0] = static_cast<CGU_UINT8>(c & 0xFF);
          m_high_color[1] = static_cast<CGU_UINT8>((c >> 8) & 0xFF);
      }
  #else
      CGU_UINT32 colors;
      CGU_UINT32 indices;
  #endif
  } CMP_BC1_Block;
  226. // Helper functions to cut precision of floats
  227. // Prec is a power of 10 value from 1,10,100,...,10000... INT MAX power 10
  228. static CGU_BOOL cmp_compareprecision(CGU_FLOAT f1, CGU_FLOAT f2, CGU_INT Prec)
  229. {
  230. CGU_INT scale1 = (CGU_INT)(f1 * Prec);
  231. CGU_INT scale2 = (CGU_INT)(f2 * Prec);
  232. return (scale1 == scale2);
  233. }
  234. // Helper function to compare floats to a set precision
  235. static CGU_FLOAT cmp_getfloatprecision(CGU_FLOAT f1, CGU_INT Prec)
  236. {
  237. CGU_INT scale1 = (CGU_INT)(f1 * Prec);
  238. return ((CGU_FLOAT)(scale1) / Prec);
  239. }
  240. static CGU_FLOAT cmp_getIndicesRGB(CMP_INOUT CGU_UINT32 CMP_PTRINOUT cmpindex,
  241. const CGU_Vec3f block[16],
  242. CGU_Vec3f minColor,
  243. CGU_Vec3f maxColor,
  244. CGU_BOOL getErr)
  245. {
  246. CGU_UINT32 PackedIndices = 0;
  247. CGU_FLOAT err = 0.0f;
  248. CGU_Vec3f cn[4];
  249. CGU_FLOAT minDistance;
  250. if (getErr)
  251. {
  252. // remap to BC1 spec for decoding offsets,
  253. // where cn[0] > cn[1] Max Color = index 0, 2/3 offset =index 2, 1/3 offset = index 3, Min Color = index 1
  254. cn[0] = maxColor;
  255. cn[1] = minColor;
  256. cn[2] = cn[0] * 2.0f / 3.0f + cn[1] * 1.0f / 3.0f;
  257. cn[3] = cn[0] * 1.0f / 3.0f + cn[1] * 2.0f / 3.0f;
  258. }
  259. CGU_FLOAT Scale = 3.f / dot(minColor - maxColor, minColor - maxColor);
  260. CGU_Vec3f ScaledRange = (minColor - maxColor) * Scale;
  261. CGU_FLOAT Bias = (dot(maxColor, maxColor) - dot(maxColor, minColor)) * Scale;
  262. CGU_INT indexMap[4] = {0, 2, 3, 1}; // mapping based on BC1 Spec for color0 > color1
  263. CGU_UINT32 index;
  264. CGU_FLOAT diff;
  265. for (CGU_UINT32 i = 0; i < 16; i++)
  266. {
  267. // Get offset from base scale
  268. diff = dot(block[i], ScaledRange) + Bias;
  269. index = ((CGU_UINT32)round(diff)) & 0x3;
  270. // remap linear offset to spec offset
  271. index = indexMap[index];
  272. // use err calc for use in higher quality code
  273. if (getErr)
  274. {
  275. minDistance = dot(block[i] - cn[index], block[i] - cn[index]);
  276. err += minDistance;
  277. }
  278. // Map the 2 bit index into compress 32 bit block
  279. if (index)
  280. PackedIndices |= (index << (2 * i));
  281. }
  282. if (getErr)
  283. err = err * 0.0208333f;
  284. CMP_PTRINOUT cmpindex = PackedIndices;
  285. return err;
  286. }
  287. //---------------------------------------- BCn Common Utility Code -------------------------------------------------------
  288. #ifndef ASPM_GPU
  289. static void SetDefaultBC15Options(CMP_BC15Options* BC15Options)
  290. {
  291. if (BC15Options)
  292. {
  293. BC15Options->m_fquality = 1.0f;
  294. BC15Options->m_bUseChannelWeighting = false;
  295. BC15Options->m_bUseAdaptiveWeighting = false;
  296. BC15Options->m_fChannelWeights[0] = 0.3086f;
  297. BC15Options->m_fChannelWeights[1] = 0.6094f;
  298. BC15Options->m_fChannelWeights[2] = 0.0820f;
  299. BC15Options->m_nAlphaThreshold = 128;
  300. BC15Options->m_bUseFloat = false;
  301. BC15Options->m_b3DRefinement = false;
  302. BC15Options->m_bUseAlpha = false;
  303. BC15Options->m_bIsSNORM = false;
  304. BC15Options->m_bIsSRGB = false;
  305. BC15Options->m_nRefinementSteps = 0;
  306. BC15Options->m_src_width = 4;
  307. BC15Options->m_src_height = 4;
  308. #ifdef CMP_SET_BC13_DECODER_RGBA
  309. BC15Options->m_mapDecodeRGBA = true;
  310. #else
  311. BC15Options->m_mapDecodeRGBA = false;
  312. #endif
  313. }
  314. }
  315. #endif
  316. static CMP_BC15Options CalculateColourWeightings(CGU_Vec4f rgbaBlock[BLOCK_SIZE_4X4], CMP_BC15Options BC15options)
  317. {
  318. CGU_FLOAT fBaseChannelWeights[3] = {0.3086f, 0.6094f, 0.0820f};
  319. if (!BC15options.m_bUseChannelWeighting)
  320. {
  321. BC15options.m_fChannelWeights[0] = 1.0F;
  322. BC15options.m_fChannelWeights[1] = 1.0F;
  323. BC15options.m_fChannelWeights[2] = 1.0F;
  324. return BC15options;
  325. }
  326. if (BC15options.m_bUseAdaptiveWeighting)
  327. {
  328. float medianR = 0.0f, medianG = 0.0f, medianB = 0.0f;
  329. for (CGU_UINT32 k = 0; k < BLOCK_SIZE_4X4; k++)
  330. {
  331. medianR += rgbaBlock[k].x;
  332. medianG += rgbaBlock[k].y;
  333. medianB += rgbaBlock[k].z;
  334. }
  335. medianR /= BLOCK_SIZE_4X4;
  336. medianG /= BLOCK_SIZE_4X4;
  337. medianB /= BLOCK_SIZE_4X4;
  338. // Now skew the colour weightings based on the gravity center of the block
  339. float largest = max(max(medianR, medianG), medianB);
  340. if (largest > 0)
  341. {
  342. medianR /= largest;
  343. medianG /= largest;
  344. medianB /= largest;
  345. }
  346. else
  347. medianR = medianG = medianB = 1.0f;
  348. // Scale weightings back up to 1.0f
  349. CGU_FLOAT fWeightScale = 1.0f / (fBaseChannelWeights[0] + fBaseChannelWeights[1] + fBaseChannelWeights[2]);
  350. BC15options.m_fChannelWeights[0] = fBaseChannelWeights[0] * fWeightScale;
  351. BC15options.m_fChannelWeights[1] = fBaseChannelWeights[1] * fWeightScale;
  352. BC15options.m_fChannelWeights[2] = fBaseChannelWeights[2] * fWeightScale;
  353. BC15options.m_fChannelWeights[0] = ((BC15options.m_fChannelWeights[0] * 3 * medianR) + BC15options.m_fChannelWeights[0]) * 0.25f;
  354. BC15options.m_fChannelWeights[1] = ((BC15options.m_fChannelWeights[1] * 3 * medianG) + BC15options.m_fChannelWeights[1]) * 0.25f;
  355. BC15options.m_fChannelWeights[2] = ((BC15options.m_fChannelWeights[2] * 3 * medianB) + BC15options.m_fChannelWeights[2]) * 0.25f;
  356. fWeightScale = 1.0f / (BC15options.m_fChannelWeights[0] + BC15options.m_fChannelWeights[1] + BC15options.m_fChannelWeights[2]);
  357. BC15options.m_fChannelWeights[0] *= fWeightScale;
  358. BC15options.m_fChannelWeights[1] *= fWeightScale;
  359. BC15options.m_fChannelWeights[2] *= fWeightScale;
  360. }
  361. else
  362. {
  363. BC15options.m_fChannelWeights[0] = fBaseChannelWeights[0];
  364. BC15options.m_fChannelWeights[1] = fBaseChannelWeights[1];
  365. BC15options.m_fChannelWeights[2] = fBaseChannelWeights[2];
  366. }
  367. return BC15options;
  368. }
  369. static CMP_BC15Options CalculateColourWeightings3f(CGU_Vec3f rgbBlock[BLOCK_SIZE_4X4], CMP_BC15Options BC15options)
  370. {
  371. CGU_FLOAT fBaseChannelWeights[3] = {0.3086f, 0.6094f, 0.0820f};
  372. if (!BC15options.m_bUseChannelWeighting)
  373. {
  374. BC15options.m_fChannelWeights[0] = 1.0F;
  375. BC15options.m_fChannelWeights[1] = 1.0F;
  376. BC15options.m_fChannelWeights[2] = 1.0F;
  377. return BC15options;
  378. }
  379. if (BC15options.m_bUseAdaptiveWeighting)
  380. {
  381. float medianR = 0.0f, medianG = 0.0f, medianB = 0.0f;
  382. for (CGU_UINT32 k = 0; k < BLOCK_SIZE_4X4; k++)
  383. {
  384. medianR += rgbBlock[k].x;
  385. medianG += rgbBlock[k].y;
  386. medianB += rgbBlock[k].z;
  387. }
  388. medianR /= BLOCK_SIZE_4X4;
  389. medianG /= BLOCK_SIZE_4X4;
  390. medianB /= BLOCK_SIZE_4X4;
  391. // Now skew the colour weightings based on the gravity center of the block
  392. float largest = max(max(medianR, medianG), medianB);
  393. if (largest > 0)
  394. {
  395. medianR /= largest;
  396. medianG /= largest;
  397. medianB /= largest;
  398. }
  399. else
  400. medianR = medianG = medianB = 1.0f;
  401. // Scale weightings back up to 1.0f
  402. CGU_FLOAT fWeightScale = 1.0f / (fBaseChannelWeights[0] + fBaseChannelWeights[1] + fBaseChannelWeights[2]);
  403. BC15options.m_fChannelWeights[0] = fBaseChannelWeights[0] * fWeightScale;
  404. BC15options.m_fChannelWeights[1] = fBaseChannelWeights[1] * fWeightScale;
  405. BC15options.m_fChannelWeights[2] = fBaseChannelWeights[2] * fWeightScale;
  406. BC15options.m_fChannelWeights[0] = ((BC15options.m_fChannelWeights[0] * 3 * medianR) + BC15options.m_fChannelWeights[0]) * 0.25f;
  407. BC15options.m_fChannelWeights[1] = ((BC15options.m_fChannelWeights[1] * 3 * medianG) + BC15options.m_fChannelWeights[1]) * 0.25f;
  408. BC15options.m_fChannelWeights[2] = ((BC15options.m_fChannelWeights[2] * 3 * medianB) + BC15options.m_fChannelWeights[2]) * 0.25f;
  409. fWeightScale = 1.0f / (BC15options.m_fChannelWeights[0] + BC15options.m_fChannelWeights[1] + BC15options.m_fChannelWeights[2]);
  410. BC15options.m_fChannelWeights[0] *= fWeightScale;
  411. BC15options.m_fChannelWeights[1] *= fWeightScale;
  412. BC15options.m_fChannelWeights[2] *= fWeightScale;
  413. }
  414. else
  415. {
  416. BC15options.m_fChannelWeights[0] = fBaseChannelWeights[0];
  417. BC15options.m_fChannelWeights[1] = fBaseChannelWeights[1];
  418. BC15options.m_fChannelWeights[2] = fBaseChannelWeights[2];
  419. }
  420. return BC15options;
  421. }
  422. static CGU_FLOAT cmp_getRampErr(CGU_FLOAT Prj[BLOCK_SIZE_4X4],
  423. CGU_FLOAT PrjErr[BLOCK_SIZE_4X4],
  424. CGU_FLOAT PreMRep[BLOCK_SIZE_4X4],
  425. CGU_FLOAT StepErr,
  426. CGU_FLOAT lowPosStep,
  427. CGU_FLOAT highPosStep,
  428. CGU_UINT32 dwUniqueColors)
  429. {
  430. CGU_FLOAT error = 0;
  431. CGU_FLOAT step = (highPosStep - lowPosStep) / 3; // using (dwNumChannels=4 - 1);
  432. CGU_FLOAT step_h = step * (CGU_FLOAT)0.5;
  433. CGU_FLOAT rstep = (CGU_FLOAT)1.0f / step;
  434. for (CGU_UINT32 i = 0; i < dwUniqueColors; i++)
  435. {
  436. CGU_FLOAT v;
  437. // Work out which value in the block this select
  438. CGU_FLOAT del;
  439. if ((del = Prj[i] - lowPosStep) <= 0)
  440. v = lowPosStep;
  441. else if (Prj[i] - highPosStep >= 0)
  442. v = highPosStep;
  443. else
  444. v = floor((del + step_h) * rstep) * step + lowPosStep;
  445. // And accumulate the error
  446. CGU_FLOAT d = (Prj[i] - v);
  447. d *= d;
  448. CGU_FLOAT err = PreMRep[i] * d + PrjErr[i];
  449. error += err;
  450. if (StepErr < error)
  451. {
  452. error = StepErr;
  453. break;
  454. }
  455. }
  456. return error;
  457. }
  458. static CGU_Vec2ui cmp_compressExplicitAlphaBlock(const CGU_FLOAT AlphaBlockUV[16])
  459. {
  460. CGU_Vec2ui compBlock = {0, 0};
  461. CGU_UINT8 i;
  462. for (i = 0; i < 16; i++)
  463. {
  464. CGU_UINT8 v = (CGU_UINT8)(AlphaBlockUV[i] * 255.0F);
  465. v = (v + 7 - (v >> 4));
  466. v >>= 4;
  467. if (v < 0)
  468. v = 0;
  469. else if (v > 0xf)
  470. v = 0xf;
  471. if (i < 8)
  472. compBlock.x |= v << (4 * i);
  473. else
  474. compBlock.y |= v << (4 * (i - 8));
  475. }
  476. return compBlock;
  477. }
  478. static CGU_FLOAT cmp_getRampError(CGU_FLOAT _Blk[BLOCK_SIZE_4X4],
  479. CGU_FLOAT _Rpt[BLOCK_SIZE_4X4],
  480. CGU_FLOAT _maxerror,
  481. CGU_FLOAT _min_ex,
  482. CGU_FLOAT _max_ex,
  483. CGU_INT _NmbrClrs)
  484. { // Max 16
  485. CGU_INT i;
  486. CGU_FLOAT error = 0;
  487. const CGU_FLOAT step = (_max_ex - _min_ex) / 7; // (CGU_FLOAT)(dwNumPoints - 1);
  488. const CGU_FLOAT step_h = step * 0.5f;
  489. const CGU_FLOAT rstep = 1.0f / step;
  490. for (i = 0; i < _NmbrClrs; i++)
  491. {
  492. CGU_FLOAT v;
  493. // Work out which value in the block this select
  494. CGU_FLOAT del;
  495. if ((del = _Blk[i] - _min_ex) <= 0)
  496. v = _min_ex;
  497. else if (_Blk[i] - _max_ex >= 0)
  498. v = _max_ex;
  499. else
  500. v = (floor((del + step_h) * rstep) * step) + _min_ex;
  501. // And accumulate the error
  502. CGU_FLOAT del2 = (_Blk[i] - v);
  503. error += del2 * del2 * _Rpt[i];
  504. // if we've already lost to the previous step bail out
  505. if (_maxerror < error)
  506. {
  507. error = _maxerror;
  508. break;
  509. }
  510. }
  511. return error;
  512. }
  513. static CGU_FLOAT cmp_linearBlockRefine(CGU_FLOAT _Blk[BLOCK_SIZE_4X4],
  514. CGU_FLOAT _Rpt[BLOCK_SIZE_4X4],
  515. CGU_FLOAT _MaxError,
  516. CMP_INOUT CGU_FLOAT CMP_PTRINOUT _min_ex,
  517. CMP_INOUT CGU_FLOAT CMP_PTRINOUT _max_ex,
  518. CGU_FLOAT _m_step,
  519. CGU_FLOAT _min_bnd,
  520. CGU_FLOAT _max_bnd,
  521. CGU_INT _NmbrClrs)
  522. {
  523. // Start out assuming our endpoints are the min and max values we've
  524. // determined
  525. // Attempt a (simple) progressive refinement step to reduce noise in the
  526. // output image by trying to find a better overall match for the endpoints.
  527. CGU_FLOAT maxerror = _MaxError;
  528. CGU_FLOAT min_ex = CMP_PTRINOUT _min_ex;
  529. CGU_FLOAT max_ex = CMP_PTRINOUT _max_ex;
  530. CGU_INT mode, bestmode;
  531. do
  532. {
  533. CGU_FLOAT cr_min0 = min_ex;
  534. CGU_FLOAT cr_max0 = max_ex;
  535. for (bestmode = -1, mode = 0; mode < SCH_STPS * SCH_STPS; mode++)
  536. {
  537. // check each move (see sStep for direction)
  538. CGU_FLOAT cr_min = min_ex + _m_step * sMvF[mode / SCH_STPS];
  539. CGU_FLOAT cr_max = max_ex + _m_step * sMvF[mode % SCH_STPS];
  540. cr_min = max(cr_min, _min_bnd);
  541. cr_max = min(cr_max, _max_bnd);
  542. CGU_FLOAT error;
  543. error = cmp_getRampError(_Blk, _Rpt, maxerror, cr_min, cr_max, _NmbrClrs);
  544. if (error < maxerror)
  545. {
  546. maxerror = error;
  547. bestmode = mode;
  548. cr_min0 = cr_min;
  549. cr_max0 = cr_max;
  550. }
  551. }
  552. if (bestmode != -1)
  553. {
  554. // make move (see sStep for direction)
  555. min_ex = cr_min0;
  556. max_ex = cr_max0;
  557. }
  558. } while (bestmode != -1);
  559. CMP_PTRINOUT _min_ex = min_ex;
  560. CMP_PTRINOUT _max_ex = max_ex;
  561. return maxerror;
  562. }
  563. static CGU_Vec2f cmp_getLinearEndPoints(CGU_FLOAT _Blk[BLOCK_SIZE_4X4], CMP_IN CGU_FLOAT fquality, CMP_IN CGU_BOOL isSigned)
  564. {
  565. CGU_UINT32 i;
  566. CGU_Vec2f cmpMinMax;
  567. //================================================================
  568. // Bounding Box
  569. // lowest quality calculation to get min and max value to use
  570. //================================================================
  571. if (fquality < CMP_QUALITY2)
  572. {
  573. cmpMinMax.x = _Blk[0];
  574. cmpMinMax.y = _Blk[0];
  575. for (i = 1; i < BLOCK_SIZE_4X4; ++i)
  576. {
  577. cmpMinMax.x = min(cmpMinMax.x, _Blk[i]);
  578. cmpMinMax.y = max(cmpMinMax.y, _Blk[i]);
  579. }
  580. return cmpMinMax;
  581. }
  582. //================================================================
  583. // Do more calculations to get the best min and max values to use
  584. //================================================================
  585. CGU_FLOAT Ramp[2];
  586. // Result defaults for SNORM or UNORM
  587. Ramp[0] = isSigned ? -1.0f : 0.0f;
  588. Ramp[1] = 1.0f;
  589. ALIGN_16 CGU_FLOAT afUniqueValues[BLOCK_SIZE_4X4];
  590. ALIGN_16 CGU_FLOAT afValueRepeats[BLOCK_SIZE_4X4];
  591. for (i = 0; i < BLOCK_SIZE_4X4; i++)
  592. afUniqueValues[i] = afValueRepeats[i] = 0.f;
  593. // For each unique value we compute the number of it appearances.
  594. CGU_FLOAT fBlk[BLOCK_SIZE_4X4];
  595. // sort the input
  596. #ifndef ASPM_GPU
  597. memcpy(fBlk, _Blk, BLOCK_SIZE_4X4 * sizeof(CGU_FLOAT));
  598. qsort((void*)fBlk, (size_t)BLOCK_SIZE_4X4, sizeof(CGU_FLOAT), QSortFCmp);
  599. #else
  600. CGU_UINT32 j;
  601. for (i = 0; i < BLOCK_SIZE_4X4; i++)
  602. {
  603. fBlk[i] = _Blk[i];
  604. }
  605. CMP_df what[BLOCK_SIZE];
  606. for (i = 0; i < BLOCK_SIZE_4X4; i++)
  607. {
  608. what[i].index = i;
  609. what[i].data = fBlk[i];
  610. }
  611. CGU_UINT32 tmp_index;
  612. CGU_FLOAT tmp_data;
  613. for (i = 1; i < BLOCK_SIZE_4X4; i++)
  614. {
  615. for (j = i; j > 0; j--)
  616. {
  617. if (what[j - 1].data > what[j].data)
  618. {
  619. tmp_index = what[j].index;
  620. tmp_data = what[j].data;
  621. what[j].index = what[j - 1].index;
  622. what[j].data = what[j - 1].data;
  623. what[j - 1].index = tmp_index;
  624. what[j - 1].data = tmp_data;
  625. }
  626. }
  627. }
  628. for (i = 0; i < BLOCK_SIZE_4X4; i++)
  629. fBlk[i] = what[i].data;
  630. #endif
  631. CGU_FLOAT new_p = -2.0f;
  632. CGU_UINT32 dwUniqueValues = 0;
  633. afUniqueValues[0] = 0.0f;
  634. CGU_BOOL requiresCalculation = true;
  635. {
  636. // Ramp not fixed
  637. for (i = 0; i < BLOCK_SIZE_4X4; i++)
  638. {
  639. if (new_p != fBlk[i])
  640. {
  641. afUniqueValues[dwUniqueValues] = new_p = fBlk[i];
  642. afValueRepeats[dwUniqueValues] = 1.f;
  643. dwUniqueValues++;
  644. }
  645. else if (dwUniqueValues)
  646. afValueRepeats[dwUniqueValues - 1] += 1.f;
  647. }
  648. // if number of unique colors is less or eq 2, we've done
  649. if (dwUniqueValues <= 2)
  650. {
  651. Ramp[0] = floor(afUniqueValues[0] * 255.0f + 0.5f);
  652. if (dwUniqueValues == 1)
  653. Ramp[1] = Ramp[0] + 1.f;
  654. else
  655. Ramp[1] = floor(afUniqueValues[1] * 255.0f + 0.5f);
  656. requiresCalculation = false;
  657. }
  658. } // Ramp not fixed
  659. if (requiresCalculation)
  660. {
  661. CGU_FLOAT min_ex = afUniqueValues[0];
  662. CGU_FLOAT max_ex = afUniqueValues[dwUniqueValues - 1];
  663. CGU_FLOAT min_bnd = 0, max_bnd = 1.;
  664. CGU_FLOAT min_r = min_ex, max_r = max_ex;
  665. CGU_FLOAT gbl_l = 0, gbl_r = 0;
  666. CGU_FLOAT cntr = (min_r + max_r) / 2;
  667. CGU_FLOAT gbl_err = MAX_ERROR;
  668. // Trying to avoid unnecessary calculations. Heuristics: after some analisis
  669. // it appears that in integer case, if the input interval not more then 48
  670. // we won't get much better
  671. bool wantsSearch = !((max_ex - min_ex) <= (48.f / 256.0f));
  672. if (wantsSearch)
  673. {
  674. // Search.
  675. // 1. take the vicinities of both low and high bound of the input
  676. // interval.
  677. // 2. setup some search step
  678. // 3. find the new low and high bound which provides an (sub) optimal
  679. // (infinite precision) clusterization.
  680. CGU_FLOAT gbl_llb = (min_bnd > min_r - GBL_SCH_EXT) ? min_bnd : min_r - GBL_SCH_EXT;
  681. CGU_FLOAT gbl_rrb = (max_bnd < max_r + GBL_SCH_EXT) ? max_bnd : max_r + GBL_SCH_EXT;
  682. CGU_FLOAT gbl_lrb = (cntr < min_r + GBL_SCH_EXT) ? cntr : min_r + GBL_SCH_EXT;
  683. CGU_FLOAT gbl_rlb = (cntr > max_r - GBL_SCH_EXT) ? cntr : max_r - GBL_SCH_EXT;
  684. for (CGU_FLOAT step_l = gbl_llb; step_l < gbl_lrb; step_l += GBL_SCH_STEP)
  685. {
  686. for (CGU_FLOAT step_r = gbl_rrb; gbl_rlb <= step_r; step_r -= GBL_SCH_STEP)
  687. {
  688. CGU_FLOAT sch_err;
  689. // an sse version is avaiable
  690. sch_err = cmp_getRampError(afUniqueValues, afValueRepeats, gbl_err, step_l, step_r, dwUniqueValues);
  691. if (sch_err < gbl_err)
  692. {
  693. gbl_err = sch_err;
  694. gbl_l = step_l;
  695. gbl_r = step_r;
  696. }
  697. }
  698. }
  699. min_r = gbl_l;
  700. max_r = gbl_r;
  701. } // want search
  702. // This is a refinement call. The function tries to make several small
  703. // stretches or squashes to minimize quantization error.
  704. CGU_FLOAT m_step = LCL_SCH_STEP / 256.0f;
  705. cmp_linearBlockRefine(afUniqueValues, afValueRepeats, gbl_err, CMP_REFINOUT min_r, CMP_REFINOUT max_r, m_step, min_bnd, max_bnd, dwUniqueValues);
  706. min_ex = min_r;
  707. max_ex = max_r;
  708. max_ex *= 255.0f;
  709. min_ex *= 255.0f;
  710. Ramp[0] = floor(min_ex + 0.5f);
  711. Ramp[1] = floor(max_ex + 0.5f);
  712. }
  713. // Ensure that the two endpoints are not the same
  714. // This is legal but serves no need & can break some optimizations in the compressor
  715. if (Ramp[0] == Ramp[1])
  716. {
  717. if (Ramp[1] < 255.f)
  718. Ramp[1] = Ramp[1] + 1.0f;
  719. else if (Ramp[1] > 0.0f)
  720. Ramp[1] = Ramp[1] - 1.0f;
  721. }
  722. cmpMinMax.x = Ramp[0];
  723. cmpMinMax.y = Ramp[1];
  724. return cmpMinMax;
  725. }
  726. static CGU_Vec2ui cmp_getBlockPackedIndices(CGU_Vec2f RampMinMax, CGU_FLOAT alphaBlock[BLOCK_SIZE_4X4], CMP_IN CGU_FLOAT fquality)
  727. {
  728. CGU_UINT32 i;
  729. CGU_UINT32 j;
  730. CGU_Vec2ui cmpBlock = {0, 0};
  731. CGU_UINT32 MinRampU;
  732. CGU_UINT32 MaxRampU;
  733. CGU_INT32 pcIndices[BLOCK_SIZE_4X4];
  734. if (fquality < CMP_QUALITY2)
  735. {
  736. CGU_FLOAT Range;
  737. CGU_FLOAT RampSteps; // segments into 0..7 sections
  738. CGU_FLOAT Bias;
  739. if (RampMinMax.x != RampMinMax.y)
  740. Range = RampMinMax.x - RampMinMax.y;
  741. else
  742. Range = 1.0f;
  743. RampSteps = 7.f / Range; // segments into 0..7 sections
  744. Bias = -RampSteps * RampMinMax.y;
  745. for (i = 0; i < 16; ++i)
  746. {
  747. pcIndices[i] = (CGU_UINT32)round(alphaBlock[i] * RampSteps + Bias);
  748. if (i < 5)
  749. {
  750. pcIndices[i] += (pcIndices[i] > 0) - (7 * (pcIndices[i] == 7));
  751. }
  752. else if (i > 5)
  753. {
  754. pcIndices[i] += (pcIndices[i] > 0) - (7 * (pcIndices[i] == 7 ? 1 : 0));
  755. }
  756. else
  757. {
  758. pcIndices[i] += (pcIndices[i] > 0) - (7 * (pcIndices[i] == 7));
  759. }
  760. }
  761. MinRampU = (CGU_UINT32)round(RampMinMax.x * 255.0f);
  762. MaxRampU = (CGU_UINT32)round(RampMinMax.y * 255.0f);
  763. cmpBlock.x = (MinRampU << 8) | MaxRampU;
  764. cmpBlock.y = 0;
  765. for (i = 0; i < 5; ++i)
  766. {
  767. cmpBlock.x |= (pcIndices[i] << (16 + (i * 3)));
  768. }
  769. {
  770. cmpBlock.x |= (pcIndices[5] << 31);
  771. cmpBlock.y |= (pcIndices[5] >> 1);
  772. }
  773. for (i = 6; i < BLOCK_SIZE_4X4; ++i)
  774. {
  775. cmpBlock.y |= (pcIndices[i] << (i * 3 - 16));
  776. }
  777. }
  778. else
  779. {
  780. CGU_UINT32 epoint;
  781. CGU_FLOAT alpha[BLOCK_SIZE_4X4];
  782. CGU_FLOAT OverIntFctr;
  783. CGU_FLOAT shortest;
  784. CGU_FLOAT adist;
  785. for (i = 0; i < BLOCK_SIZE_4X4; i++)
  786. pcIndices[i] = 0;
  787. for (i = 0; i < MAX_POINTS; i++)
  788. alpha[i] = 0;
  789. // GetRmp1
  790. {
  791. if (RampMinMax.x <= RampMinMax.y)
  792. {
  793. CGU_FLOAT t = RampMinMax.x;
  794. RampMinMax.x = RampMinMax.y;
  795. RampMinMax.y = t;
  796. }
  797. //=============================
  798. // final clusterization applied
  799. //=============================
  800. CGU_FLOAT ramp[NUM_ENDPOINTS];
  801. ramp[0] = RampMinMax.x;
  802. ramp[1] = RampMinMax.y;
  803. {
  804. // BldRmp1
  805. alpha[0] = ramp[0];
  806. alpha[1] = ramp[1];
  807. for (epoint = 1; epoint < CMP_ALPHA_RAMP - 1; epoint++)
  808. alpha[epoint + 1] = (alpha[0] * (CMP_ALPHA_RAMP - 1 - epoint) + alpha[1] * epoint) / (CGU_FLOAT)(CMP_ALPHA_RAMP - 1);
  809. for (epoint = CMP_ALPHA_RAMP; epoint < BLOCK_SIZE_4X4; epoint++)
  810. alpha[epoint] = 100000.f;
  811. } // BldRmp1
  812. // FixedRamp
  813. for (i = 0; i < CMP_ALPHA_RAMP; i++)
  814. {
  815. alpha[i] = floor(alpha[i] + 0.5f);
  816. }
  817. } // GetRmp1
  818. OverIntFctr = 1.f / 255.0f;
  819. for (i = 0; i < CMP_ALPHA_RAMP; i++)
  820. alpha[i] *= OverIntFctr;
  821. // For each colour in the original block, calculate its weighted
  822. // distance from each point in the original and assign it
  823. // to the closest cluster
  824. for (i = 0; i < BLOCK_SIZE_4X4; i++)
  825. {
  826. shortest = 10000000.f;
  827. for (j = 0; j < CMP_ALPHA_RAMP; j++)
  828. {
  829. adist = (alphaBlock[i] - alpha[j]);
  830. adist *= adist;
  831. if (adist < shortest)
  832. {
  833. shortest = adist;
  834. pcIndices[i] = j;
  835. }
  836. }
  837. }
  838. //==================================================
  839. // EncodeAlphaBlock
  840. //==================================================
  841. MinRampU = (CGU_UINT32)RampMinMax.x;
  842. MaxRampU = (CGU_UINT32)RampMinMax.y;
  843. cmpBlock.x = (MaxRampU << 8) | MinRampU;
  844. cmpBlock.y = 0;
  845. for (i = 0; i < 5; i++)
  846. {
  847. cmpBlock.x |= (pcIndices[i]) << (16 + (i * 3));
  848. }
  849. {
  850. cmpBlock.x |= (pcIndices[5] & 0x1) << 31;
  851. cmpBlock.y |= (pcIndices[5] & 0x6) >> 1;
  852. }
  853. for (i = 6; i < BLOCK_SIZE_4X4; i++)
  854. {
  855. cmpBlock.y |= (pcIndices[i]) << (i * 3 - 16);
  856. }
  857. }
  858. return cmpBlock;
  859. }
  860. //======================= SNORM CODE ==================================
  861. static CGU_INT8 cmp_snormFloatToSInt(CGU_FLOAT fsnorm)
  862. {
  863. if (isnan(fsnorm))
  864. fsnorm = 0;
  865. else if (fsnorm > 1)
  866. fsnorm = 1; // Clamp to 1
  867. else if (fsnorm < -1)
  868. fsnorm = -1; // Clamp to -1
  869. fsnorm = fsnorm * 127U;
  870. // shift round up or down
  871. if (fsnorm >= 0)
  872. fsnorm += .5f;
  873. else
  874. fsnorm -= .5f;
  875. #ifdef ASPM_GPU
  876. CGU_INT8 res = (CGU_INT8)fsnorm;
  877. #else
  878. CGU_INT8 res = static_cast<CGU_INT8>(fsnorm);
  879. #endif
  880. return (res);
  881. }
  882. static CGU_Vec2f cmp_optimizeEndPoints(CGU_FLOAT pPoints[BLOCK_SIZE_4X4], CGU_INT8 cSteps, CGU_BOOL isSigned)
  883. {
  884. CGU_Vec2f fendpoints;
  885. CGU_FLOAT MAX_VALUE = 1.0f;
  886. CGU_FLOAT MIN_VALUE = isSigned ? -1.0f : 0.0f;
  887. // Find Min and Max points, as starting point
  888. CGU_FLOAT fX = MAX_VALUE;
  889. CGU_FLOAT fY = MIN_VALUE;
  890. if (8 == cSteps)
  891. {
  892. for (CGU_INT8 iPoint = 0; iPoint < BLOCK_SIZE_4X4; iPoint++)
  893. {
  894. if (pPoints[iPoint] < fX)
  895. fX = pPoints[iPoint];
  896. if (pPoints[iPoint] > fY)
  897. fY = pPoints[iPoint];
  898. }
  899. }
  900. else
  901. {
  902. for (CGU_INT8 iPoint = 0; iPoint < BLOCK_SIZE_4X4; iPoint++)
  903. {
  904. if (pPoints[iPoint] < fX && pPoints[iPoint] > MIN_VALUE)
  905. fX = pPoints[iPoint];
  906. if (pPoints[iPoint] > fY && pPoints[iPoint] < MAX_VALUE)
  907. fY = pPoints[iPoint];
  908. }
  909. if (fX == fY)
  910. {
  911. fY = MAX_VALUE;
  912. }
  913. }
  914. //===================
  915. // Use Newton Method
  916. //===================
  917. #ifdef ASPM_GPU
  918. CGU_FLOAT cStepsDiv = (CGU_FLOAT)(cSteps - 1);
  919. #else
  920. CGU_FLOAT cStepsDiv = static_cast<CGU_FLOAT>(cSteps - 1);
  921. #endif
  922. CGU_FLOAT pSteps[8];
  923. CGU_FLOAT fc;
  924. CGU_FLOAT fd;
  925. for (CGU_INT8 iIteration = 0; iIteration < 8; iIteration++)
  926. {
  927. // reach minimum threashold break
  928. if ((fY - fX) < (1.0f / 256.0f))
  929. break;
  930. CGU_FLOAT fScale = cStepsDiv / (fY - fX);
  931. // Calculate new steps
  932. for (CGU_INT8 iStep = 0; iStep < cSteps; iStep++)
  933. {
  934. fc = (cStepsDiv - (CGU_FLOAT)iStep) / cStepsDiv;
  935. fd = (CGU_FLOAT)iStep / cStepsDiv;
  936. pSteps[iStep] = fc * fX + fd * fY;
  937. }
  938. if (6 == cSteps)
  939. {
  940. pSteps[6] = MIN_VALUE;
  941. pSteps[7] = MAX_VALUE;
  942. }
  943. // Evaluate function, and derivatives
  944. CGU_FLOAT dX = 0.0f;
  945. CGU_FLOAT dY = 0.0f;
  946. CGU_FLOAT d2X = 0.0f;
  947. CGU_FLOAT d2Y = 0.0f;
  948. for (CGU_INT8 iPoint = 0; iPoint < BLOCK_SIZE_4X4; iPoint++)
  949. {
  950. CGU_FLOAT fDot = (pPoints[iPoint] - fX) * fScale;
  951. CGU_INT8 iStep;
  952. if (fDot <= 0.0f)
  953. {
  954. iStep = ((6 == cSteps) && (pPoints[iPoint] <= (fX + MIN_VALUE) * 0.5f)) ? 6u : 0u;
  955. }
  956. else if (fDot >= cStepsDiv)
  957. {
  958. iStep = ((6 == cSteps) && (pPoints[iPoint] >= (fY + MAX_VALUE) * 0.5f)) ? 7u : (cSteps - 1);
  959. }
  960. else
  961. {
  962. iStep = (CGU_UINT32)(fDot + 0.5f);
  963. }
  964. // steps to improve quality
  965. if (iStep < cSteps)
  966. {
  967. fc = (cStepsDiv - (CGU_FLOAT)iStep) / cStepsDiv;
  968. fd = (CGU_FLOAT)iStep / cStepsDiv;
  969. CGU_FLOAT fDiff = pSteps[iStep] - pPoints[iPoint];
  970. dX += fc * fDiff;
  971. d2X += fc * fc;
  972. dY += fd * fDiff;
  973. d2Y += fd * fd;
  974. }
  975. }
  976. // Move endpoints
  977. if (d2X > 0.0f)
  978. fX -= dX / d2X;
  979. if (d2Y > 0.0f)
  980. fY -= dY / d2Y;
  981. if (fX > fY)
  982. {
  983. float f = fX;
  984. fX = fY;
  985. fY = f;
  986. }
  987. if ((dX * dX < (1.0f / 64.0f)) && (dY * dY < (1.0f / 64.0f)))
  988. break;
  989. }
  990. fendpoints.x = (fX < MIN_VALUE) ? MIN_VALUE : (fX > MAX_VALUE) ? MAX_VALUE : fX;
  991. fendpoints.y = (fY < MIN_VALUE) ? MIN_VALUE : (fY > MAX_VALUE) ? MAX_VALUE : fY;
  992. return fendpoints;
  993. }
  994. static CGU_Vec2i cmp_findEndpointsAlphaBlockSnorm(CGU_FLOAT alphaBlockSnorm[BLOCK_SIZE_4X4])
  995. {
  996. //================================================================
  997. // Bounding Box
  998. // lowest quality calculation to get min and max value to use
  999. //================================================================
  1000. CGU_Vec2f cmpMinMax;
  1001. cmpMinMax.x = alphaBlockSnorm[0];
  1002. cmpMinMax.y = alphaBlockSnorm[0];
  1003. for (CGU_UINT8 i = 0; i < BLOCK_SIZE_4X4; ++i)
  1004. {
  1005. if (alphaBlockSnorm[i] < cmpMinMax.x)
  1006. {
  1007. cmpMinMax.x = alphaBlockSnorm[i];
  1008. }
  1009. else if (alphaBlockSnorm[i] > cmpMinMax.y)
  1010. {
  1011. cmpMinMax.y = alphaBlockSnorm[i];
  1012. }
  1013. }
  1014. CGU_Vec2i endpoints;
  1015. CGU_Vec2f fendpoints;
  1016. // Are we done for lowest quality setting!
  1017. // CGU_FLOAT fquality = 1.0f;
  1018. //
  1019. // if (fquality < CMP_QUALITY2) {
  1020. // endpoints.x = (CGU_INT8)(cmpMinMax.x);
  1021. // endpoints.y = (CGU_INT8)(cmpMinMax.y);
  1022. // return endpoints;
  1023. // }
  1024. //================================================================
  1025. // Do more calculations to get the best min and max values to use
  1026. //================================================================
  1027. if ((-1.0f == cmpMinMax.x || 1.0f == cmpMinMax.y))
  1028. {
  1029. fendpoints = cmp_optimizeEndPoints(alphaBlockSnorm, 6, true);
  1030. endpoints.x = cmp_snormFloatToSInt(fendpoints.x);
  1031. endpoints.y = cmp_snormFloatToSInt(fendpoints.y);
  1032. }
  1033. else
  1034. {
  1035. fendpoints = cmp_optimizeEndPoints(alphaBlockSnorm, 8, true);
  1036. endpoints.x = cmp_snormFloatToSInt(fendpoints.y);
  1037. endpoints.y = cmp_snormFloatToSInt(fendpoints.x);
  1038. }
  1039. return endpoints;
  1040. }
  1041. #ifndef ASPM_HLSL
  1042. static CGU_UINT64 cmp_getBlockPackedIndicesSNorm(CGU_Vec2f alphaMinMax, CGU_FLOAT alphaBlockSnorm[BLOCK_SIZE_4X4], CGU_UINT64 data)
  1043. {
  1044. CGU_FLOAT alpha[8];
  1045. alpha[0] = alphaMinMax.x;
  1046. alpha[1] = alphaMinMax.y;
  1047. if (alphaMinMax.x > alphaMinMax.y)
  1048. {
  1049. // 8-alpha block: derive the other six alphas.
  1050. // Bit code 000 = alpha_0, 001 = alpha_1, others are interpolated.
  1051. alpha[2] = (alpha[0] * 6.0f + alpha[1]) / 7.0f;
  1052. alpha[3] = (alpha[0] * 5.0f + alpha[1] * 2.0f) / 7.0f;
  1053. alpha[4] = (alpha[0] * 4.0f + alpha[1] * 3.0f) / 7.0f;
  1054. alpha[5] = (alpha[0] * 3.0f + alpha[1] * 4.0f) / 7.0f;
  1055. alpha[6] = (alpha[0] * 2.0f + alpha[1] * 5.0f) / 7.0f;
  1056. alpha[7] = (alpha[0] + alpha[1] * 6.0f) / 7.0f;
  1057. }
  1058. else
  1059. {
  1060. // 6-alpha block.
  1061. // Bit code 000 = alpha_0, 001 = alpha_1, others are interpolated.
  1062. alpha[2] = (alpha[0] * 4.0f + alpha[1]) / 5.0f;
  1063. alpha[3] = (alpha[0] * 3.0f + alpha[1] * 2.0f) / 5.0f;
  1064. alpha[4] = (alpha[0] * 2.0f + alpha[1] * 3.0f) / 5.0f;
  1065. alpha[5] = (alpha[0] + alpha[1] * 4.0f) / 5.0f;
  1066. alpha[6] = -1.0f;
  1067. alpha[7] = 1.0f;
  1068. }
  1069. // Index all colors using best alpha value
  1070. for (CGU_UINT8 i = 0; i < BLOCK_SIZE_4X4; ++i)
  1071. {
  1072. CGU_UINT8 uBestIndex = 0;
  1073. CGU_FLOAT fBestDelta = CMP_FLOAT_MAX;
  1074. for (CGU_INT32 uIndex = 0; uIndex < 8; uIndex++)
  1075. {
  1076. CGU_FLOAT fCurrentDelta = fabs(alpha[uIndex] - alphaBlockSnorm[i]);
  1077. if (fCurrentDelta < fBestDelta)
  1078. {
  1079. uBestIndex = uIndex;
  1080. fBestDelta = fCurrentDelta;
  1081. }
  1082. }
  1083. data &= ~((CGU_UINT64)(0x07) << (3 * i + 16));
  1084. data |= ((CGU_UINT64)(uBestIndex) << (3 * i + 16));
  1085. }
  1086. return data;
  1087. }
  1088. #endif
  1089. static void cmp_getCompressedAlphaRampS(CGU_INT8 alpha[8], const CGU_UINT32 compressedBlock[2])
  1090. {
  1091. alpha[0] = (CGU_INT8)(compressedBlock[0] & 0xff);
  1092. alpha[1] = (CGU_INT8)((compressedBlock[0] >> 8) & 0xff);
  1093. if (alpha[0] > alpha[1])
  1094. {
  1095. // 8-alpha block: derive the other six alphas.
  1096. // Bit code 000 = alpha_0, 001 = alpha_1, others are interpolated.
  1097. #ifdef ASPM_GPU
  1098. alpha[2] = (CGU_UINT8)((6 * alpha[0] + 1 * alpha[1] + 3) / 7); // bit code 010
  1099. alpha[3] = (CGU_UINT8)((5 * alpha[0] + 2 * alpha[1] + 3) / 7); // bit code 011
  1100. alpha[4] = (CGU_UINT8)((4 * alpha[0] + 3 * alpha[1] + 3) / 7); // bit code 100
  1101. alpha[5] = (CGU_UINT8)((3 * alpha[0] + 4 * alpha[1] + 3) / 7); // bit code 101
  1102. alpha[6] = (CGU_UINT8)((2 * alpha[0] + 5 * alpha[1] + 3) / 7); // bit code 110
  1103. alpha[7] = (CGU_UINT8)((1 * alpha[0] + 6 * alpha[1] + 3) / 7); // bit code 111
  1104. #else
  1105. alpha[2] = static_cast<CGU_UINT8>((6 * alpha[0] + 1 * alpha[1] + 3) / 7); // bit code 010
  1106. alpha[3] = static_cast<CGU_UINT8>((5 * alpha[0] + 2 * alpha[1] + 3) / 7); // bit code 011
  1107. alpha[4] = static_cast<CGU_UINT8>((4 * alpha[0] + 3 * alpha[1] + 3) / 7); // bit code 100
  1108. alpha[5] = static_cast<CGU_UINT8>((3 * alpha[0] + 4 * alpha[1] + 3) / 7); // bit code 101
  1109. alpha[6] = static_cast<CGU_UINT8>((2 * alpha[0] + 5 * alpha[1] + 3) / 7); // bit code 110
  1110. alpha[7] = static_cast<CGU_UINT8>((1 * alpha[0] + 6 * alpha[1] + 3) / 7); // bit code 111
  1111. #endif
  1112. }
  1113. else
  1114. {
  1115. // 6-alpha block.
  1116. // Bit code 000 = alpha_0, 001 = alpha_1, others are interpolated.
  1117. #ifdef ASPM_GPU
  1118. alpha[2] = (CGU_UINT8)((4 * alpha[0] + 1 * alpha[1] + 2) / 5); // Bit code 010
  1119. alpha[3] = (CGU_UINT8)((3 * alpha[0] + 2 * alpha[1] + 2) / 5); // Bit code 011
  1120. alpha[4] = (CGU_UINT8)((2 * alpha[0] + 3 * alpha[1] + 2) / 5); // Bit code 100
  1121. alpha[5] = (CGU_UINT8)((1 * alpha[0] + 4 * alpha[1] + 2) / 5); // Bit code 101
  1122. #else
  1123. alpha[2] = static_cast<CGU_UINT8>((4 * alpha[0] + 1 * alpha[1] + 2) / 5); // Bit code 010
  1124. alpha[3] = static_cast<CGU_UINT8>((3 * alpha[0] + 2 * alpha[1] + 2) / 5); // Bit code 011
  1125. alpha[4] = static_cast<CGU_UINT8>((2 * alpha[0] + 3 * alpha[1] + 2) / 5); // Bit code 100
  1126. alpha[5] = static_cast<CGU_UINT8>((1 * alpha[0] + 4 * alpha[1] + 2) / 5); // Bit code 101
  1127. #endif
  1128. alpha[6] = -128; // Bit code 110
  1129. alpha[7] = 127; // Bit code 111
  1130. }
  1131. }
  1132. static void cmp_decompressAlphaBlockS(CGU_INT8 alphaBlock[BLOCK_SIZE_4X4], const CGU_UINT32 compressedBlock[2])
  1133. {
  1134. CGU_UINT32 i;
  1135. CGU_INT8 alpha[8];
  1136. cmp_getCompressedAlphaRampS(alpha, compressedBlock);
  1137. for (i = 0; i < BLOCK_SIZE_4X4; i++)
  1138. {
  1139. CGU_UINT32 index;
  1140. if (i < 5)
  1141. index = (compressedBlock[0] & (0x7 << (16 + (i * 3)))) >> (16 + (i * 3));
  1142. else if (i > 5)
  1143. index = (compressedBlock[1] & (0x7 << (2 + (i - 6) * 3))) >> (2 + (i - 6) * 3);
  1144. else
  1145. {
  1146. index = (compressedBlock[0] & 0x80000000) >> 31;
  1147. index |= (compressedBlock[1] & 0x3) << 1;
  1148. }
  1149. alphaBlock[i] = alpha[index];
  1150. }
  1151. }
  1152. //=============================================================================
  1153. // Processes Alpha Channel either as Unsigned Norm (0..1) or (Signed Norm -1..1)
  1154. static CGU_Vec2ui cmp_compressAlphaBlock(CMP_IN CGU_FLOAT alphaBlock[BLOCK_SIZE_4X4], CMP_IN CGU_FLOAT fquality, CMP_IN CGU_BOOL isSigned)
  1155. {
  1156. CGU_Vec2ui CmpBlock;
  1157. if (isSigned)
  1158. {
  1159. #ifndef ASPM_HLSL
  1160. union
  1161. {
  1162. CGU_INT32 compressedBlock[2];
  1163. struct
  1164. {
  1165. CGU_INT8 red_0;
  1166. CGU_INT8 red_1;
  1167. CGU_UINT8 indices[6];
  1168. };
  1169. CGU_UINT64 data;
  1170. } BC4_Snorm_block;
  1171. #ifndef ASPM_GPU
  1172. BC4_Snorm_block.data = 0LL;
  1173. #else
  1174. BC4_Snorm_block.data = 0;
  1175. #endif
  1176. CGU_Vec2i reds;
  1177. reds = cmp_findEndpointsAlphaBlockSnorm(alphaBlock);
  1178. BC4_Snorm_block.red_0 = reds.x & 0xFF;
  1179. BC4_Snorm_block.red_1 = reds.y & 0xFF;
  1180. // check low end boundaries
  1181. if (BC4_Snorm_block.red_0 == -128)
  1182. BC4_Snorm_block.red_0 = -127;
  1183. if (BC4_Snorm_block.red_1 == -128)
  1184. BC4_Snorm_block.red_1 = -127;
  1185. // Normalize signed int -128..127 to float -1..1
  1186. CGU_Vec2f alphaMinMax;
  1187. alphaMinMax.x = (CGU_FLOAT)(BC4_Snorm_block.red_0) / 127.0f;
  1188. alphaMinMax.y = (CGU_FLOAT)(BC4_Snorm_block.red_1) / 127.0f;
  1189. BC4_Snorm_block.data = cmp_getBlockPackedIndicesSNorm(alphaMinMax, alphaBlock, BC4_Snorm_block.data);
  1190. CmpBlock.x = BC4_Snorm_block.compressedBlock[0];
  1191. CmpBlock.y = BC4_Snorm_block.compressedBlock[1];
  1192. #else
  1193. CGU_Vec2f RampMinMax;
  1194. RampMinMax = cmp_getLinearEndPoints(alphaBlock, fquality, false); // revert code to remove the false param
  1195. CmpBlock = cmp_getBlockPackedIndices(RampMinMax, alphaBlock, fquality);
  1196. #endif
  1197. }
  1198. else
  1199. {
  1200. CGU_Vec2f RampMinMax;
  1201. RampMinMax = cmp_getLinearEndPoints(alphaBlock, fquality, false); // revert code to remove the false param
  1202. CmpBlock = cmp_getBlockPackedIndices(RampMinMax, alphaBlock, fquality);
  1203. }
  1204. return CmpBlock;
  1205. }
  1206. static void cmp_getCompressedAlphaRamp(CGU_UINT8 alpha[8], const CGU_UINT32 compressedBlock[2])
  1207. {
  1208. alpha[0] = (CGU_UINT8)(compressedBlock[0] & 0xff);
  1209. alpha[1] = (CGU_UINT8)((compressedBlock[0] >> 8) & 0xff);
  1210. if (alpha[0] > alpha[1])
  1211. {
  1212. // 8-alpha block: derive the other six alphas.
  1213. // Bit code 000 = alpha_0, 001 = alpha_1, others are interpolated.
  1214. #ifdef ASPM_GPU
  1215. alpha[2] = (CGU_UINT8)((6 * alpha[0] + 1 * alpha[1] + 3) / 7); // bit code 010
  1216. alpha[3] = (CGU_UINT8)((5 * alpha[0] + 2 * alpha[1] + 3) / 7); // bit code 011
  1217. alpha[4] = (CGU_UINT8)((4 * alpha[0] + 3 * alpha[1] + 3) / 7); // bit code 100
  1218. alpha[5] = (CGU_UINT8)((3 * alpha[0] + 4 * alpha[1] + 3) / 7); // bit code 101
  1219. alpha[6] = (CGU_UINT8)((2 * alpha[0] + 5 * alpha[1] + 3) / 7); // bit code 110
  1220. alpha[7] = (CGU_UINT8)((1 * alpha[0] + 6 * alpha[1] + 3) / 7); // bit code 111
  1221. #else
  1222. alpha[2] = static_cast<CGU_UINT8>((6 * alpha[0] + 1 * alpha[1] + 3) / 7); // bit code 010
  1223. alpha[3] = static_cast<CGU_UINT8>((5 * alpha[0] + 2 * alpha[1] + 3) / 7); // bit code 011
  1224. alpha[4] = static_cast<CGU_UINT8>((4 * alpha[0] + 3 * alpha[1] + 3) / 7); // bit code 100
  1225. alpha[5] = static_cast<CGU_UINT8>((3 * alpha[0] + 4 * alpha[1] + 3) / 7); // bit code 101
  1226. alpha[6] = static_cast<CGU_UINT8>((2 * alpha[0] + 5 * alpha[1] + 3) / 7); // bit code 110
  1227. alpha[7] = static_cast<CGU_UINT8>((1 * alpha[0] + 6 * alpha[1] + 3) / 7); // bit code 111
  1228. #endif
  1229. }
  1230. else
  1231. {
  1232. // 6-alpha block.
  1233. // Bit code 000 = alpha_0, 001 = alpha_1, others are interpolated.
  1234. #ifdef ASPM_GPU
  1235. alpha[2] = (CGU_UINT8)((4 * alpha[0] + 1 * alpha[1] + 2) / 5); // Bit code 010
  1236. alpha[3] = (CGU_UINT8)((3 * alpha[0] + 2 * alpha[1] + 2) / 5); // Bit code 011
  1237. alpha[4] = (CGU_UINT8)((2 * alpha[0] + 3 * alpha[1] + 2) / 5); // Bit code 100
  1238. alpha[5] = (CGU_UINT8)((1 * alpha[0] + 4 * alpha[1] + 2) / 5); // Bit code 101
  1239. #else
  1240. alpha[2] = static_cast<CGU_UINT8>((4 * alpha[0] + 1 * alpha[1] + 2) / 5); // Bit code 010
  1241. alpha[3] = static_cast<CGU_UINT8>((3 * alpha[0] + 2 * alpha[1] + 2) / 5); // Bit code 011
  1242. alpha[4] = static_cast<CGU_UINT8>((2 * alpha[0] + 3 * alpha[1] + 2) / 5); // Bit code 100
  1243. alpha[5] = static_cast<CGU_UINT8>((1 * alpha[0] + 4 * alpha[1] + 2) / 5); // Bit code 101
  1244. #endif
  1245. alpha[6] = 0; // Bit code 110
  1246. alpha[7] = 255; // Bit code 111
  1247. }
  1248. }
  1249. static void cmp_decompressAlphaBlock(CGU_UINT8 alphaBlock[BLOCK_SIZE_4X4], const CGU_UINT32 compressedBlock[2])
  1250. {
  1251. CGU_UINT32 i;
  1252. CGU_UINT8 alpha[8];
  1253. cmp_getCompressedAlphaRamp(alpha, compressedBlock);
  1254. for (i = 0; i < BLOCK_SIZE_4X4; i++)
  1255. {
  1256. CGU_UINT32 index;
  1257. if (i < 5)
  1258. index = (compressedBlock[0] & (0x7 << (16 + (i * 3)))) >> (16 + (i * 3));
  1259. else if (i > 5)
  1260. index = (compressedBlock[1] & (0x7 << (2 + (i - 6) * 3))) >> (2 + (i - 6) * 3);
  1261. else
  1262. {
  1263. index = (compressedBlock[0] & 0x80000000) >> 31;
  1264. index |= (compressedBlock[1] & 0x3) << 1;
  1265. }
  1266. alphaBlock[i] = alpha[index];
  1267. }
  1268. }
  1269. static void cmp_ProcessColors(CMP_INOUT CGU_Vec3f CMP_PTRINOUT colorMin,
  1270. CMP_INOUT CGU_Vec3f CMP_PTRINOUT colorMax,
  1271. CMP_INOUT CGU_UINT32 CMP_PTRINOUT c0,
  1272. CMP_INOUT CGU_UINT32 CMP_PTRINOUT c1,
  1273. CGU_INT setopt,
  1274. CGU_BOOL isSRGB)
  1275. {
  1276. // CGU_UINT32 srbMap[32] = {0,5,8,11,12,13,14,15,16,17,18,19,20,21,22,23,23,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31};
  1277. // CGU_UINT32 sgMap[64] = {0,10,14,16,19,20,22,24,25,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,42,43,43,44,45,45,
  1278. // 46,47,47,48,48,49,50,50,51,52,52,53,53,54,54,55,55,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63};
  1279. CGU_INT32 x, y, z;
  1280. CGU_Vec3f scale = {31.0f, 63.0f, 31.0f};
  1281. CGU_Vec3f MinColorScaled;
  1282. CGU_Vec3f MaxColorScaled;
  1283. // Clamp or Transform is needed, the transforms have built in clamps
  1284. if (isSRGB)
  1285. {
  1286. MinColorScaled = cmp_linearToSrgb(CMP_PTRINOUT colorMin);
  1287. MaxColorScaled = cmp_linearToSrgb(CMP_PTRINOUT colorMax);
  1288. }
  1289. else
  1290. {
  1291. MinColorScaled = cmp_clampVec3f(CMP_PTRINOUT colorMin, 0.0f, 1.0f);
  1292. MaxColorScaled = cmp_clampVec3f(CMP_PTRINOUT colorMax, 0.0f, 1.0f);
  1293. }
  1294. switch (setopt)
  1295. {
  1296. case 0: // Use Min Max processing
  1297. MinColorScaled = floor(MinColorScaled * scale);
  1298. MaxColorScaled = ceil(MaxColorScaled * scale);
  1299. CMP_PTRINOUT colorMin = MinColorScaled / scale;
  1300. CMP_PTRINOUT colorMax = MaxColorScaled / scale;
  1301. break;
  1302. default: // Use round processing
  1303. MinColorScaled = round(MinColorScaled * scale);
  1304. MaxColorScaled = round(MaxColorScaled * scale);
  1305. break;
  1306. }
  1307. x = (CGU_UINT32)(MinColorScaled.x);
  1308. y = (CGU_UINT32)(MinColorScaled.y);
  1309. z = (CGU_UINT32)(MinColorScaled.z);
  1310. //if (isSRGB) {
  1311. // // scale RB
  1312. // x = srbMap[x]; // &0x1F];
  1313. // y = sgMap [y]; // &0x3F];
  1314. // z = srbMap[z]; // &0x1F];
  1315. // // scale G
  1316. //}
  1317. CMP_PTRINOUT c0 = (x << 11) | (y << 5) | z;
  1318. x = (CGU_UINT32)(MaxColorScaled.x);
  1319. y = (CGU_UINT32)(MaxColorScaled.y);
  1320. z = (CGU_UINT32)(MaxColorScaled.z);
  1321. CMP_PTRINOUT c1 = (x << 11) | (y << 5) | z;
  1322. }
  1323. #ifndef ASPM_GPU // Used by BC1, BC2 & BC3
  1324. //----------------------------------------------------
  1325. // This function decompresses a DXT colour block
  1326. // The block is decompressed to 8 bits per channel
  1327. // Result buffer is RGBA format, A is set to 255
  1328. //----------------------------------------------------
  1329. static void cmp_decompressDXTRGBA_Internal(CGU_UINT8 rgbBlock[BLOCK_SIZE_4X4X4], const CGU_Vec2ui compressedBlock, const CGU_BOOL mapDecodeRGBA)
  1330. {
  1331. CGU_BOOL bDXT1 = TRUE;
  1332. CGU_UINT32 n0 = compressedBlock.x & 0xffff;
  1333. CGU_UINT32 n1 = compressedBlock.x >> 16;
  1334. CGU_UINT32 r0;
  1335. CGU_UINT32 g0;
  1336. CGU_UINT32 b0;
  1337. CGU_UINT32 r1;
  1338. CGU_UINT32 g1;
  1339. CGU_UINT32 b1;
  1340. r0 = ((n0 & 0xf800) >> 8);
  1341. g0 = ((n0 & 0x07e0) >> 3);
  1342. b0 = ((n0 & 0x001f) << 3);
  1343. r1 = ((n1 & 0xf800) >> 8);
  1344. g1 = ((n1 & 0x07e0) >> 3);
  1345. b1 = ((n1 & 0x001f) << 3);
  1346. // Apply the lower bit replication to give full dynamic range
  1347. r0 += (r0 >> 5);
  1348. r1 += (r1 >> 5);
  1349. g0 += (g0 >> 6);
  1350. g1 += (g1 >> 6);
  1351. b0 += (b0 >> 5);
  1352. b1 += (b1 >> 5);
  1353. if (!mapDecodeRGBA)
  1354. {
  1355. //--------------------------------------------------------------
  1356. // Channel mapping output as BGRA
  1357. //--------------------------------------------------------------
  1358. CGU_UINT32 c0 = 0xff000000 | (r0 << 16) | (g0 << 8) | b0;
  1359. CGU_UINT32 c1 = 0xff000000 | (r1 << 16) | (g1 << 8) | b1;
  1360. if (!bDXT1 || n0 > n1)
  1361. {
  1362. CGU_UINT32 c2 = 0xff000000 | (((2 * r0 + r1) / 3) << 16) | (((2 * g0 + g1) / 3) << 8) | (((2 * b0 + b1) / 3));
  1363. CGU_UINT32 c3 = 0xff000000 | (((2 * r1 + r0) / 3) << 16) | (((2 * g1 + g0) / 3) << 8) | (((2 * b1 + b0) / 3));
  1364. for (int i = 0; i < 16; i++)
  1365. {
  1366. int index = (compressedBlock.y >> (2 * i)) & 3;
  1367. switch (index)
  1368. {
  1369. case 0:
  1370. ((CGU_UINT32*)rgbBlock)[i] = c0;
  1371. break;
  1372. case 1:
  1373. ((CGU_UINT32*)rgbBlock)[i] = c1;
  1374. break;
  1375. case 2:
  1376. ((CGU_UINT32*)rgbBlock)[i] = c2;
  1377. break;
  1378. case 3:
  1379. ((CGU_UINT32*)rgbBlock)[i] = c3;
  1380. break;
  1381. }
  1382. }
  1383. }
  1384. else
  1385. {
  1386. // Transparent decode
  1387. CGU_UINT32 c2 = 0xff000000 | (((r0 + r1) / 2) << 16) | (((g0 + g1) / 2) << 8) | (((b0 + b1) / 2));
  1388. for (int i = 0; i < 16; i++)
  1389. {
  1390. int index = (compressedBlock.y >> (2 * i)) & 3;
  1391. switch (index)
  1392. {
  1393. case 0:
  1394. ((CGU_UINT32*)rgbBlock)[i] = c0;
  1395. break;
  1396. case 1:
  1397. ((CGU_UINT32*)rgbBlock)[i] = c1;
  1398. break;
  1399. case 2:
  1400. ((CGU_UINT32*)rgbBlock)[i] = c2;
  1401. break;
  1402. case 3:
  1403. ((CGU_UINT32*)rgbBlock)[i] = 0x00000000;
  1404. break;
  1405. }
  1406. }
  1407. }
  1408. }
  1409. else
  1410. {
  1411. // MAP_BC15_TO_ABGR
  1412. //--------------------------------------------------------------
  1413. // Channel mapping output as RGBA
  1414. //--------------------------------------------------------------
  1415. CGU_UINT32 c0 = 0xff000000 | (b0 << 16) | (g0 << 8) | r0;
  1416. CGU_UINT32 c1 = 0xff000000 | (b1 << 16) | (g1 << 8) | r1;
  1417. if (!bDXT1 || n0 > n1)
  1418. {
  1419. CGU_UINT32 c2 = 0xff000000 | (((2 * b0 + b1 + 1) / 3) << 16) | (((2 * g0 + g1 + 1) / 3) << 8) | (((2 * r0 + r1 + 1) / 3));
  1420. CGU_UINT32 c3 = 0xff000000 | (((2 * b1 + b0 + 1) / 3) << 16) | (((2 * g1 + g0 + 1) / 3) << 8) | (((2 * r1 + r0 + 1) / 3));
  1421. for (int i = 0; i < 16; i++)
  1422. {
  1423. int index = (compressedBlock.y >> (2 * i)) & 3;
  1424. switch (index)
  1425. {
  1426. case 0:
  1427. ((CMP_GLOBAL CGU_UINT32*)rgbBlock)[i] = c0;
  1428. break;
  1429. case 1:
  1430. ((CMP_GLOBAL CGU_UINT32*)rgbBlock)[i] = c1;
  1431. break;
  1432. case 2:
  1433. ((CMP_GLOBAL CGU_UINT32*)rgbBlock)[i] = c2;
  1434. break;
  1435. case 3:
  1436. ((CMP_GLOBAL CGU_UINT32*)rgbBlock)[i] = c3;
  1437. break;
  1438. }
  1439. }
  1440. }
  1441. else
  1442. {
  1443. // Transparent decode
  1444. CGU_UINT32 c2 = 0xff000000 | (((b0 + b1) / 2) << 16) | (((g0 + g1) / 2) << 8) | (((r0 + r1) / 2));
  1445. for (int i = 0; i < 16; i++)
  1446. {
  1447. int index = (compressedBlock.y >> (2 * i)) & 3;
  1448. switch (index)
  1449. {
  1450. case 0:
  1451. ((CMP_GLOBAL CGU_UINT32*)rgbBlock)[i] = c0;
  1452. break;
  1453. case 1:
  1454. ((CMP_GLOBAL CGU_UINT32*)rgbBlock)[i] = c1;
  1455. break;
  1456. case 2:
  1457. ((CMP_GLOBAL CGU_UINT32*)rgbBlock)[i] = c2;
  1458. break;
  1459. case 3:
  1460. ((CMP_GLOBAL CGU_UINT32*)rgbBlock)[i] = 0x00000000;
  1461. break;
  1462. }
  1463. }
  1464. }
  1465. } //MAP_ABGR
  1466. }
  1467. #endif // !ASPM_GPU
  1468. //--------------------------------------------------------------------------------------------------------
  1469. // Decompress is RGB (0.0f..255.0f)
  1470. //--------------------------------------------------------------------------------------------------------
  1471. static void cmp_decompressRGBBlock(CMP_INOUT CGU_Vec3f rgbBlock[BLOCK_SIZE_4X4], const CGU_Vec2ui compressedBlock)
  1472. {
  1473. CGU_UINT32 n0 = compressedBlock.x & 0xffff;
  1474. CGU_UINT32 n1 = compressedBlock.x >> 16;
  1475. CGU_UINT32 index;
  1476. //-------------------------------------------------------
  1477. // Decode the compressed block 0..255 color range
  1478. //-------------------------------------------------------
  1479. CGU_Vec3f c0 = cmp_565ToLinear(n0); // max color
  1480. CGU_Vec3f c1 = cmp_565ToLinear(n1); // min color
  1481. CGU_Vec3f c2;
  1482. CGU_Vec3f c3;
  1483. if (n0 > n1)
  1484. {
  1485. c2 = (c0 * 2.0f + c1) / 3.0f;
  1486. c3 = (c1 * 2.0f + c0) / 3.0f;
  1487. for (CGU_UINT32 i = 0; i < 16; i++)
  1488. {
  1489. index = (compressedBlock.y >> (2 * i)) & 3;
  1490. switch (index)
  1491. {
  1492. case 0:
  1493. rgbBlock[i] = c0;
  1494. break;
  1495. case 1:
  1496. rgbBlock[i] = c1;
  1497. break;
  1498. case 2:
  1499. rgbBlock[i] = c2;
  1500. break;
  1501. case 3:
  1502. rgbBlock[i] = c3;
  1503. break;
  1504. }
  1505. }
  1506. }
  1507. else
  1508. {
  1509. // Transparent decode
  1510. c2 = (c0 + c1) / 2.0f;
  1511. for (CGU_UINT32 i = 0; i < 16; i++)
  1512. {
  1513. index = (compressedBlock.y >> (2 * i)) & 3;
  1514. switch (index)
  1515. {
  1516. case 0:
  1517. rgbBlock[i] = c0;
  1518. break;
  1519. case 1:
  1520. rgbBlock[i] = c1;
  1521. break;
  1522. case 2:
  1523. rgbBlock[i] = c2;
  1524. break;
  1525. case 3:
  1526. rgbBlock[i] = 0.0f;
  1527. break;
  1528. }
  1529. }
  1530. }
  1531. }
  1532. // The source is 0..1, decompressed data using cmp_decompressRGBBlock is 0..255 which is converted down to 0..1
  1533. static float CMP_RGBBlockError(const CGU_Vec3f src_rgbBlock[BLOCK_SIZE_4X4], const CGU_Vec2ui compressedBlock, CGU_BOOL isSRGB)
  1534. {
  1535. CGU_Vec3f rgbBlock[BLOCK_SIZE_4X4];
  1536. // Decompressed block channels are 0..255
  1537. cmp_decompressRGBBlock(rgbBlock, compressedBlock);
  1538. //------------------------------------------------------------------
  1539. // Calculate MSE of the block
  1540. // Note : pow is used as Float type for the code to be usable on CPU
  1541. //------------------------------------------------------------------
  1542. CGU_Vec3f serr;
  1543. serr = 0.0f;
  1544. float sR, sG, sB, R, G, B;
  1545. for (int j = 0; j < 16; j++)
  1546. {
  1547. if (isSRGB)
  1548. {
  1549. sR = round(cmp_linearToSrgbf(src_rgbBlock[j].x) * 255.0f);
  1550. sG = round(cmp_linearToSrgbf(src_rgbBlock[j].y) * 255.0f);
  1551. sB = round(cmp_linearToSrgbf(src_rgbBlock[j].z) * 255.0f);
  1552. }
  1553. else
  1554. {
  1555. sR = round(src_rgbBlock[j].x * 255.0f);
  1556. sG = round(src_rgbBlock[j].y * 255.0f);
  1557. sB = round(src_rgbBlock[j].z * 255.0f);
  1558. }
  1559. rgbBlock[j] = rgbBlock[j];
  1560. R = rgbBlock[j].x;
  1561. G = rgbBlock[j].y;
  1562. B = rgbBlock[j].z;
  1563. // Norm colors
  1564. serr.x += pow(sR - R, 2.0f);
  1565. serr.y += pow(sG - G, 2.0f);
  1566. serr.z += pow(sB - B, 2.0f);
  1567. }
  1568. // MSE for 16 texels
  1569. return (serr.x + serr.y + serr.z) / 48.0f;
  1570. }
  1571. // Processing input source 0..1.0f)
  1572. static CGU_Vec2ui CompressRGBBlock_FM(const CGU_Vec3f rgbBlockUVf[16], CMP_IN CGU_FLOAT fquality, CGU_BOOL isSRGB, CMP_INOUT CGU_FLOAT CMP_PTRINOUT errout)
  1573. {
  1574. CGU_Vec3f axisVectorRGB = {0.0f, 0.0f, 0.0f}; // The axis vector for index projection
  1575. CGU_FLOAT pos_on_axis[16]; // The distance each unique falls along the compression axis
  1576. CGU_FLOAT axisleft = 0; // The extremities and centre (average of left/right) of srcRGB along the compression axis
  1577. CGU_FLOAT axisright = 0; // The extremities and centre (average of left/right) of srcRGB along the compression axis
  1578. CGU_FLOAT axiscentre = 0; // The extremities and centre (average of left/right) of srcRGB along the compression axis
  1579. CGU_INT32 swap = 0; // Indicator if the RGB values need swapping to generate an opaque result
  1580. CGU_Vec3f average_rgb; // The centrepoint of the axis
  1581. CGU_Vec3f srcRGB[16]; // The list of source colors with blue channel altered
  1582. CGU_Vec3f srcBlock[16]; // The list of source colors with any color space transforms and clipping
  1583. CGU_Vec3f rgb;
  1584. CGU_UINT32 c0 = 0;
  1585. CGU_UINT32 c1 = 0;
  1586. CGU_Vec2ui compressedBlock = {0, 0};
  1587. CGU_FLOAT Q1CompErr = CMP_FLT_MAX;
  1588. CGU_Vec2ui Q1CompData = {0,0};
  1589. // -------------------------------------------------------------------------------------
  1590. // (1) Find the array of unique pixel values and sum them to find their average position
  1591. // -------------------------------------------------------------------------------------
  1592. {
  1593. CGU_FLOAT errLQ = 0.0f;
  1594. CGU_BOOL fastProcess = (fquality <= CMP_QUALITY1);
  1595. CGU_Vec3f srcMin = 1.0f; // Min source color
  1596. CGU_Vec3f srcMax = 0.0f; // Max source color
  1597. CGU_Vec2ui Q1compressedBlock = {0, 0};
  1598. average_rgb = 0.0f;
  1599. // Get average and modified src
  1600. // find average position and save list of pixels as 0F..255F range for processing
  1601. // Note: z (blue) is average of blue+green channels
  1602. for (CGU_INT32 i = 0; i < BLOCK_SIZE_4X4; i++)
  1603. {
  1604. srcMin = cmp_minVec3f(srcMin, rgbBlockUVf[i]);
  1605. srcMax = cmp_maxVec3f(srcMax, rgbBlockUVf[i]);
  1606. if (!fastProcess)
  1607. {
  1608. rgb = isSRGB ? cmp_linearToSrgb(rgbBlockUVf[i]) : cmp_saturate(rgbBlockUVf[i]);
  1609. rgb.z = (rgb.y + rgb.z) * 0.5F; // Z-axiz => (R+G)/2
  1610. srcRGB[i] = rgb;
  1611. average_rgb = average_rgb + rgb;
  1612. }
  1613. }
  1614. // Process two colors for saving in 565 format as C0 and C1
  1615. cmp_ProcessColors(CMP_REFINOUT srcMin, CMP_REFINOUT srcMax, CMP_REFINOUT c0, CMP_REFINOUT c1, isSRGB ? 1 : 0, isSRGB);
  1616. // Save simple min-max encoding
  1617. CGU_UINT32 index = 0;
  1618. if (c0 < c1)
  1619. {
  1620. Q1CompData.x = (c0 << 16) | c1;
  1621. errLQ = cmp_getIndicesRGB(CMP_REFINOUT index, rgbBlockUVf, srcMin, srcMax, false);
  1622. Q1CompData.y = index;
  1623. CMP_PTRINOUT errout = errLQ;
  1624. }
  1625. else
  1626. {
  1627. // Most simple case all colors are equal or 0.0f
  1628. Q1compressedBlock.x = (c1 << 16) | c0;
  1629. Q1compressedBlock.y = 0;
  1630. CMP_PTRINOUT errout = 0.0f;
  1631. return Q1compressedBlock;
  1632. }
  1633. if (fastProcess)
  1634. return Q1CompData;
  1635. // 0.0625F is (1/BLOCK_SIZE_4X4)
  1636. average_rgb = average_rgb * 0.0625F;
  1637. }
  1638. // -------------------------------------------------------------------------------------
  1639. // (4) For each component, reflect points about the average so all lie on the same side
  1640. // of the average, and compute the new average - this gives a second point that defines the axis
  1641. // To compute the sign of the axis sum the positive differences of G for each of R and B (the
  1642. // G axis is always positive in this implementation
  1643. // -------------------------------------------------------------------------------------
  1644. // An interesting situation occurs if the G axis contains no information, in which case the RB
  1645. // axis is also compared. I am not entirely sure if this is the correct implementation - should
  1646. // the priority axis be determined by magnitude?
  1647. {
  1648. CGU_FLOAT rg_pos = 0.0f;
  1649. CGU_FLOAT bg_pos = 0.0f;
  1650. CGU_FLOAT rb_pos = 0.0f;
  1651. for (CGU_INT32 i = 0; i < BLOCK_SIZE_4X4; i++)
  1652. {
  1653. rgb = srcRGB[i] - average_rgb;
  1654. axisVectorRGB = axisVectorRGB + fabs(rgb);
  1655. if (rgb.x > 0)
  1656. {
  1657. rg_pos += rgb.y;
  1658. rb_pos += rgb.z;
  1659. }
  1660. if (rgb.z > 0)
  1661. bg_pos += rgb.y;
  1662. }
  1663. // Average over BLOCK_SIZE_4X4
  1664. axisVectorRGB = axisVectorRGB * 0.0625F;
  1665. // New average position
  1666. if (rg_pos < 0)
  1667. axisVectorRGB.x = -axisVectorRGB.x;
  1668. if (bg_pos < 0)
  1669. axisVectorRGB.z = -axisVectorRGB.z;
  1670. if ((rg_pos == bg_pos) && (rg_pos == 0))
  1671. {
  1672. if (rb_pos < 0)
  1673. axisVectorRGB.z = -axisVectorRGB.z;
  1674. }
  1675. }
  1676. // -------------------------------------------------------------------------------------
  1677. // (5) Axis projection and remapping
  1678. // -------------------------------------------------------------------------------------
  1679. {
  1680. CGU_FLOAT v2_recip;
  1681. // Normalize the axis for simplicity of future calculation
  1682. v2_recip = dot(axisVectorRGB, axisVectorRGB);
  1683. if (v2_recip > 0)
  1684. v2_recip = 1.0f / (CGU_FLOAT)sqrt(v2_recip);
  1685. else
  1686. v2_recip = 1.0f;
  1687. axisVectorRGB = axisVectorRGB * v2_recip;
  1688. }
  1689. // -------------------------------------------------------------------------------------
  1690. // (6) Map the axis
  1691. // -------------------------------------------------------------------------------------
  1692. // the line joining (and extended on either side of) average and axis
  1693. // defines the axis onto which the points will be projected
  1694. // Project all the points onto the axis, calculate the distance along
  1695. // the axis from the centre of the axis (average)
  1696. // From Foley & Van Dam: Closest point of approach of a line (P + v) to a point (R) is
  1697. // P + ((R-P).v) / (v.v))v
  1698. // The distance along v is therefore (R-P).v / (v.v) where (v.v) is 1 if v is a unit vector.
  1699. //
  1700. // Calculate the extremities at the same time - these need to be reasonably accurately
  1701. // represented in all cases
  1702. {
  1703. axisleft = CMP_FLOAT_MAX;
  1704. axisright = -CMP_FLOAT_MAX;
  1705. for (CGU_INT32 i = 0; i < BLOCK_SIZE_4X4; i++)
  1706. {
  1707. // Compute the distance along the axis of the point of closest approach
  1708. CGU_Vec3f temp = (srcRGB[i] - average_rgb);
  1709. pos_on_axis[i] = dot(temp, axisVectorRGB);
  1710. // Work out the extremities
  1711. if (pos_on_axis[i] < axisleft)
  1712. axisleft = pos_on_axis[i];
  1713. if (pos_on_axis[i] > axisright)
  1714. axisright = pos_on_axis[i];
  1715. }
  1716. }
  1717. // ---------------------------------------------------------------------------------------------
  1718. // (7) Now we have a good axis and the basic information about how the points are mapped to it
  1719. // Our initial guess is to represent the endpoints accurately, by moving the average
  1720. // to the centre and recalculating the point positions along the line
  1721. // ---------------------------------------------------------------------------------------------
  1722. {
  1723. axiscentre = (axisleft + axisright) * 0.5F;
  1724. average_rgb = average_rgb + (axisVectorRGB * axiscentre);
  1725. for (CGU_INT32 i = 0; i < BLOCK_SIZE_4X4; i++)
  1726. pos_on_axis[i] -= axiscentre;
  1727. axisright -= axiscentre;
  1728. axisleft -= axiscentre;
  1729. }
  1730. // -------------------------------------------------------------------------------------
  1731. // (8) Calculate the high and low output colour values
  1732. // Involved in this is a rounding procedure which is undoubtedly slightly twitchy. A
  1733. // straight rounded average is not correct, as the decompressor 'unrounds' by replicating
  1734. // the top bits to the bottom.
  1735. // In order to take account of this process, we don't just apply a straight rounding correction,
  1736. // but base our rounding on the input value (a straight rounding is actually pretty good in terms of
  1737. // error measure, but creates a visual colour and/or brightness shift relative to the original image)
  1738. // The method used here is to apply a centre-biased rounding dependent on the input value, which was
  1739. // (mostly by experiment) found to give minimum MSE while preserving the visual characteristics of
  1740. // the image.
  1741. // rgb = (average_rgb + (left|right)*axisVectorRGB);
  1742. // -------------------------------------------------------------------------------------
  1743. {
  1744. CGU_Vec3f MinColor, MaxColor;
  1745. MinColor = average_rgb + (axisVectorRGB * axisleft);
  1746. MaxColor = average_rgb + (axisVectorRGB * axisright);
  1747. MinColor.z = (MinColor.z * 2) - MinColor.y;
  1748. MaxColor.z = (MaxColor.z * 2) - MaxColor.y;
  1749. cmp_ProcessColors(CMP_REFINOUT MinColor, CMP_REFINOUT MaxColor, CMP_REFINOUT c0, CMP_REFINOUT c1, 1, false);
  1750. // Force to be a 4-colour opaque block - in which case, c0 is greater than c1
  1751. swap = 0;
  1752. if (c0 < c1)
  1753. {
  1754. CGU_UINT32 t;
  1755. t = c0;
  1756. c0 = c1;
  1757. c1 = t;
  1758. swap = 1;
  1759. }
  1760. else if (c0 == c1)
  1761. {
  1762. // This block will always be encoded in 3-colour mode
  1763. // Need to ensure that only one of the two points gets used,
  1764. // avoiding accidentally setting some transparent pixels into the block
  1765. for (CGU_INT32 i = 0; i < BLOCK_SIZE_4X4; i++)
  1766. pos_on_axis[i] = axisleft;
  1767. }
  1768. compressedBlock.x = c0 | (c1 << 16);
  1769. // -------------------------------------------------------------------------------------
  1770. // (9) Final clustering, creating the 2-bit values that define the output
  1771. // -------------------------------------------------------------------------------------
  1772. CGU_UINT32 index;
  1773. CGU_FLOAT division;
  1774. {
  1775. compressedBlock.y = 0;
  1776. division = axisright * 2.0f / 3.0f;
  1777. axiscentre = (axisleft + axisright) / 2; // Actually, this code only works if centre is 0 or approximately so
  1778. CGU_FLOAT CompMinErr;
  1779. // This feature is work in progress
  1780. // remap to BC1 spec for decoding offsets,
  1781. // where cn[0] > cn[1] Max Color = index 0, 2/3 offset =index 2, 1/3 offset = index 3, Min Color = index 1
  1782. // CGU_Vec3f cn[4];
  1783. // cn[0] = MaxColor;
  1784. // cn[1] = MinColor;
  1785. // cn[2] = cn[0]*2.0f/3.0f + cn[1]*1.0f/3.0f;
  1786. // cn[3] = cn[0]*1.0f/3.0f + cn[1]*2.0f/3.0f;
  1787. for (CGU_INT32 i = 0; i < BLOCK_SIZE_4X4; i++)
  1788. {
  1789. // Endpoints (indicated by block > average) are 0 and 1, while
  1790. // interpolants are 2 and 3
  1791. if (fabs(pos_on_axis[i]) >= division)
  1792. index = 0;
  1793. else
  1794. index = 2;
  1795. // Positive is in the latter half of the block
  1796. if (pos_on_axis[i] >= axiscentre)
  1797. index += 1;
  1798. index = index ^ swap;
  1799. // Set the output, taking swapping into account
  1800. compressedBlock.y |= (index << (2 * i));
  1801. // use err calc for use in higher quality code
  1802. //CompMinErr += dot(srcRGBRef[i] - cn[index],srcRGBRef[i] - cn[index]);
  1803. }
  1804. //CompMinErr = CompMinErr * 0.0208333f;
  1805. CompMinErr = CMP_RGBBlockError(rgbBlockUVf, compressedBlock, isSRGB);
  1806. Q1CompErr = CMP_RGBBlockError(rgbBlockUVf, Q1CompData, isSRGB);
  1807. if (CompMinErr > Q1CompErr)
  1808. {
  1809. compressedBlock = Q1CompData;
  1810. CMP_PTRINOUT errout = Q1CompErr;
  1811. }
  1812. else
  1813. CMP_PTRINOUT errout = CompMinErr;
  1814. }
  1815. }
  1816. // done
  1817. return compressedBlock;
  1818. }
  1819. #ifndef CMP_USE_LOWQUALITY
  1820. static CMP_EndPoints CompressRGBBlock_Slow(CGU_Vec3f BlkInBGRf_UV[BLOCK_SIZE_4X4],
  1821. CGU_FLOAT Rpt[BLOCK_SIZE_4X4],
  1822. CGU_UINT32 dwUniqueColors,
  1823. CGU_Vec3f channelWeightsBGR,
  1824. CGU_UINT32 m_nRefinementSteps)
  1825. {
  1826. CMP_UNUSED(channelWeightsBGR);
  1827. CMP_UNUSED(m_nRefinementSteps);
  1828. ALIGN_16 CGU_FLOAT Prj0[BLOCK_SIZE_4X4];
  1829. ALIGN_16 CGU_FLOAT Prj[BLOCK_SIZE_4X4];
  1830. ALIGN_16 CGU_FLOAT PrjErr[BLOCK_SIZE_4X4];
  1831. ALIGN_16 CGU_FLOAT RmpIndxs[BLOCK_SIZE_4X4];
  1832. CGU_Vec3f LineDirG;
  1833. CGU_Vec3f LineDir;
  1834. CGU_FLOAT LineDir0[NUM_CHANNELS];
  1835. CGU_Vec3f BlkUV[BLOCK_SIZE_4X4];
  1836. CGU_Vec3f BlkSh[BLOCK_SIZE_4X4];
  1837. CGU_Vec3f Mdl;
  1838. CGU_Vec3f rsltC0;
  1839. CGU_Vec3f rsltC1;
  1840. CGU_Vec3f PosG0 = {0.0f, 0.0f, 0.0f};
  1841. CGU_Vec3f PosG1 = {0.0f, 0.0f, 0.0f};
  1842. CGU_UINT32 i;
  1843. for (i = 0; i < dwUniqueColors; i++)
  1844. {
  1845. BlkUV[i] = BlkInBGRf_UV[i];
  1846. }
  1847. // if not more then 2 different colors, we've done
  1848. if (dwUniqueColors <= 2)
  1849. {
  1850. rsltC0 = BlkInBGRf_UV[0] * 255.0f;
  1851. rsltC1 = BlkInBGRf_UV[dwUniqueColors - 1] * 255.0f;
  1852. }
  1853. else
  1854. {
  1855. // This is our first attempt to find an axis we will go along.
  1856. // The cumulation is done to find a line minimizing the MSE from the
  1857. // input 3D points.
  1858. // While trying to find the axis we found that the diameter of the input
  1859. // set is quite small. Do not bother.
  1860. // FindAxisIsSmall(BlkSh, LineDir0, Mdl, Blk, Rpt,dwUniqueColors);
  1861. {
  1862. CGU_UINT32 ii;
  1863. CGU_UINT32 jj;
  1864. CGU_UINT32 kk;
  1865. // These vars cannot be Vec3 as index to them are varying
  1866. CGU_FLOAT Crrl[NUM_CHANNELS];
  1867. CGU_FLOAT RGB2[NUM_CHANNELS];
  1868. LineDir0[0] = LineDir0[1] = LineDir0[2] = RGB2[0] = RGB2[1] = RGB2[2] = Crrl[0] = Crrl[1] = Crrl[2] = Mdl.x = Mdl.y = Mdl.z = 0.f;
  1869. // sum position of all points
  1870. CGU_FLOAT fNumPoints = 0.0f;
  1871. for (ii = 0; ii < dwUniqueColors; ii++)
  1872. {
  1873. Mdl.x += BlkUV[ii].x * Rpt[ii];
  1874. Mdl.y += BlkUV[ii].y * Rpt[ii];
  1875. Mdl.z += BlkUV[ii].z * Rpt[ii];
  1876. fNumPoints += Rpt[ii];
  1877. }
  1878. // and then average to calculate center coordinate of block
  1879. Mdl /= fNumPoints;
  1880. for (ii = 0; ii < dwUniqueColors; ii++)
  1881. {
  1882. // calculate output block as offsets around block center
  1883. BlkSh[ii] = BlkUV[ii] - Mdl;
  1884. // compute correlation matrix
  1885. // RGB2 = sum of ((distance from point from center) squared)
  1886. RGB2[0] += BlkSh[ii].x * BlkSh[ii].x * Rpt[ii];
  1887. RGB2[1] += BlkSh[ii].y * BlkSh[ii].y * Rpt[ii];
  1888. RGB2[2] += BlkSh[ii].z * BlkSh[ii].z * Rpt[ii];
  1889. Crrl[0] += BlkSh[ii].x * BlkSh[ii].y * Rpt[ii];
  1890. Crrl[1] += BlkSh[ii].y * BlkSh[ii].z * Rpt[ii];
  1891. Crrl[2] += BlkSh[ii].z * BlkSh[ii].x * Rpt[ii];
  1892. }
  1893. // if set's diameter is small
  1894. CGU_UINT32 i0 = 0, i1 = 1;
  1895. CGU_FLOAT mxRGB2 = 0.0f;
  1896. CGU_FLOAT fEPS = fNumPoints * EPS;
  1897. for (kk = 0, jj = 0; jj < 3; jj++)
  1898. {
  1899. if (RGB2[jj] >= fEPS)
  1900. kk++;
  1901. else
  1902. RGB2[jj] = 0.0f;
  1903. if (mxRGB2 < RGB2[jj])
  1904. {
  1905. mxRGB2 = RGB2[jj];
  1906. i0 = jj;
  1907. }
  1908. }
  1909. CGU_FLOAT fEPS2 = fNumPoints * EPS2;
  1910. CGU_BOOL AxisIsSmall;
  1911. AxisIsSmall = (RGB2[0] < fEPS2);
  1912. AxisIsSmall = AxisIsSmall && (RGB2[1] < fEPS2);
  1913. AxisIsSmall = AxisIsSmall && (RGB2[2] < fEPS2);
  1914. // all are very small to avoid division on the small determinant
  1915. if (AxisIsSmall)
  1916. {
  1917. rsltC0 = BlkInBGRf_UV[0] * 255.0f;
  1918. rsltC1 = BlkInBGRf_UV[dwUniqueColors - 1] * 255.0f;
  1919. }
  1920. else
  1921. {
  1922. // !AxisIsSmall
  1923. if (kk == 1) // really only 1 dimension
  1924. LineDir0[i0] = 1.;
  1925. else if (kk == 2)
  1926. { // really only 2 dimensions
  1927. i1 = (RGB2[(i0 + 1) % 3] > 0.f) ? (i0 + 1) % 3 : (i0 + 2) % 3;
  1928. CGU_FLOAT Crl = (i1 == (i0 + 1) % 3) ? Crrl[i0] : Crrl[(i0 + 2) % 3];
  1929. LineDir0[i1] = Crl / RGB2[i0];
  1930. LineDir0[i0] = 1.;
  1931. }
  1932. else
  1933. {
  1934. CGU_FLOAT maxDet = 100000.f;
  1935. CGU_FLOAT Cs[3];
  1936. // select max det for precision
  1937. for (jj = 0; jj < 3; jj++)
  1938. {
  1939. // 3 = nDimensions
  1940. CGU_FLOAT Det = RGB2[jj] * RGB2[(jj + 1) % 3] - Crrl[jj] * Crrl[jj];
  1941. Cs[jj] = fabs(Crrl[jj] / sqrt(RGB2[jj] * RGB2[(jj + 1) % 3]));
  1942. if (maxDet < Det)
  1943. {
  1944. maxDet = Det;
  1945. i0 = jj;
  1946. }
  1947. }
  1948. // inverse correl matrix
  1949. // -- -- -- --
  1950. // | A B | | C -B |
  1951. // | B C | => | -B A |
  1952. // -- -- -- --
  1953. CGU_FLOAT mtrx1[2][2];
  1954. CGU_FLOAT vc1[2];
  1955. CGU_FLOAT vc[2];
  1956. vc1[0] = Crrl[(i0 + 2) % 3];
  1957. vc1[1] = Crrl[(i0 + 1) % 3];
  1958. // C
  1959. mtrx1[0][0] = RGB2[(i0 + 1) % 3];
  1960. // A
  1961. mtrx1[1][1] = RGB2[i0];
  1962. // -B
  1963. mtrx1[1][0] = mtrx1[0][1] = -Crrl[i0];
  1964. // find a solution
  1965. vc[0] = mtrx1[0][0] * vc1[0] + mtrx1[0][1] * vc1[1];
  1966. vc[1] = mtrx1[1][0] * vc1[0] + mtrx1[1][1] * vc1[1];
  1967. // normalize
  1968. vc[0] /= maxDet;
  1969. vc[1] /= maxDet;
  1970. // find a line direction vector
  1971. LineDir0[i0] = 1.;
  1972. LineDir0[(i0 + 1) % 3] = 1.;
  1973. LineDir0[(i0 + 2) % 3] = vc[0] + vc[1];
  1974. }
  1975. // normalize direction vector
  1976. CGU_FLOAT Len = LineDir0[0] * LineDir0[0] + LineDir0[1] * LineDir0[1] + LineDir0[2] * LineDir0[2];
  1977. Len = sqrt(Len);
  1978. LineDir0[0] = (Len > 0.f) ? LineDir0[0] / Len : 0.0f;
  1979. LineDir0[1] = (Len > 0.f) ? LineDir0[1] / Len : 0.0f;
  1980. LineDir0[2] = (Len > 0.f) ? LineDir0[2] / Len : 0.0f;
  1981. }
  1982. } // FindAxisIsSmall
  1983. // GCC is being an awful being when it comes to goto-jumps.
  1984. // So please bear with this.
  1985. CGU_FLOAT ErrG = 10000000.f;
  1986. CGU_FLOAT PrjBnd0;
  1987. CGU_FLOAT PrjBnd1;
  1988. ALIGN_16 CGU_FLOAT PreMRep[BLOCK_SIZE_4X4];
  1989. LineDir.x = LineDir0[0];
  1990. LineDir.y = LineDir0[1];
  1991. LineDir.z = LineDir0[2];
  1992. // Here is the main loop.
  1993. // 1. Project input set on the axis in consideration.
  1994. // 2. Run 1 dimensional search (see scalar case) to find an (sub) optimal pair of end points.
  1995. // 3. Compute the vector of indexes (or clusters) for the current approximate ramp.
  1996. // 4. Present our color channels as 3 16DIM vectors.
  1997. // 5. Find closest approximation of each of 16DIM color vector with the projection of the 16DIM index vector.
  1998. // 6. Plug the projections as a new directional vector for the axis.
  1999. // 7. Goto 1.
  2000. // D - is 16 dim "index" vector (or 16 DIM vector of indexes - {0, 1/3,2/3, 0, ...,}, but shifted and normalized).
  2001. // Ci - is a 16 dim vector of color i. for each Ci find a scalar Ai such that (Ai * D - Ci) (Ai * D - Ci) -> min ,
  2002. // i.e distance between vector AiD and C is min. You can think of D as a unit interval(vector) "clusterizer", and Ai is a scale
  2003. // you need to apply to the clusterizer to approximate the Ci vector instead of the unit vector.
  2004. // Solution is
  2005. // Ai = (D . Ci) / (D . D); . - is a dot product.
  2006. // in 3 dim space Ai(s) represent a line direction, along which
  2007. // we again try to find (sub)optimal quantizer.
  2008. // That's what our for(;;) loop is about.
  2009. for (;;)
  2010. {
  2011. // 1. Project input set on the axis in consideration.
  2012. // From Foley & Van Dam: Closest point of approach of a line (P + v) to a
  2013. // point (R) is
  2014. // P + ((R-P).v) / (v.v))v
  2015. // The distance along v is therefore (R-P).v / (v.v)
  2016. // (v.v) is 1 if v is a unit vector.
  2017. //
  2018. PrjBnd0 = 1000.0f;
  2019. PrjBnd1 = -1000.0f;
  2020. for (i = 0; i < BLOCK_SIZE_4X4; i++)
  2021. Prj0[i] = Prj[i] = PrjErr[i] = PreMRep[i] = 0.f;
  2022. for (i = 0; i < dwUniqueColors; i++)
  2023. {
  2024. Prj0[i] = Prj[i] = dot(BlkSh[i], LineDir);
  2025. PrjErr[i] = dot(BlkSh[i] - LineDir * Prj[i], BlkSh[i] - LineDir * Prj[i]);
  2026. PrjBnd0 = min(PrjBnd0, Prj[i]);
  2027. PrjBnd1 = max(PrjBnd1, Prj[i]);
  2028. }
  2029. // 2. Run 1 dimensional search (see scalar case) to find an (sub) optimal
  2030. // pair of end points.
  2031. // min and max of the search interval
  2032. CGU_FLOAT Scl0;
  2033. CGU_FLOAT Scl1;
  2034. Scl0 = PrjBnd0 - (PrjBnd1 - PrjBnd0) * 0.125f;
  2035. Scl1 = PrjBnd1 + (PrjBnd1 - PrjBnd0) * 0.125f;
  2036. // compute scaling factor to scale down the search interval to [0.,1]
  2037. const CGU_FLOAT Scl2 = (Scl1 - Scl0) * (Scl1 - Scl0);
  2038. const CGU_FLOAT overScl = 1.f / (Scl1 - Scl0);
  2039. for (i = 0; i < dwUniqueColors; i++)
  2040. {
  2041. // scale them
  2042. Prj[i] = (Prj[i] - Scl0) * overScl;
  2043. // premultiply the scale square to plug into error computation later
  2044. PreMRep[i] = Rpt[i] * Scl2;
  2045. }
  2046. // scale first approximation of end points
  2047. PrjBnd0 = (PrjBnd0 - Scl0) * overScl;
  2048. PrjBnd1 = (PrjBnd1 - Scl0) * overScl;
  2049. CGU_FLOAT StepErr = MAX_ERROR;
  2050. // search step
  2051. CGU_FLOAT searchStep = 0.025f;
  2052. // low Start/End; high Start/End
  2053. const CGU_FLOAT lowStartEnd = (PrjBnd0 - 2.f * searchStep > 0.f) ? PrjBnd0 - 2.f * searchStep : 0.f;
  2054. const CGU_FLOAT highStartEnd = (PrjBnd1 + 2.f * searchStep < 1.f) ? PrjBnd1 + 2.f * searchStep : 1.f;
  2055. // find the best endpoints
  2056. CGU_FLOAT Pos0 = 0;
  2057. CGU_FLOAT Pos1 = 0;
  2058. CGU_FLOAT lowPosStep, highPosStep;
  2059. CGU_FLOAT err;
  2060. int l, h;
  2061. for (l = 0, lowPosStep = lowStartEnd; l < 8; l++, lowPosStep += searchStep)
  2062. {
  2063. for (h = 0, highPosStep = highStartEnd; h < 8; h++, highPosStep -= searchStep)
  2064. {
  2065. // compute an error for the current pair of end points.
  2066. err = cmp_getRampErr(Prj, PrjErr, PreMRep, StepErr, lowPosStep, highPosStep, dwUniqueColors);
  2067. if (err < StepErr)
  2068. {
  2069. // save better result
  2070. StepErr = err;
  2071. Pos0 = lowPosStep;
  2072. Pos1 = highPosStep;
  2073. }
  2074. }
  2075. }
  2076. // inverse the scaling
  2077. Pos0 = Pos0 * (Scl1 - Scl0) + Scl0;
  2078. Pos1 = Pos1 * (Scl1 - Scl0) + Scl0;
  2079. // did we find somthing better from the previous run?
  2080. if (StepErr + 0.001 < ErrG)
  2081. {
  2082. // yes, remember it
  2083. ErrG = StepErr;
  2084. LineDirG = LineDir;
  2085. PosG0.x = Pos0;
  2086. PosG0.y = Pos0;
  2087. PosG0.z = Pos0;
  2088. PosG1.x = Pos1;
  2089. PosG1.y = Pos1;
  2090. PosG1.z = Pos1;
  2091. // 3. Compute the vector of indexes (or clusters) for the current
  2092. // approximate ramp.
  2093. // indexes
  2094. const CGU_FLOAT step = (Pos1 - Pos0) / 3.0f; // (dwNumChannels=4 - 1);
  2095. const CGU_FLOAT step_h = step * (CGU_FLOAT)0.5;
  2096. const CGU_FLOAT rstep = (CGU_FLOAT)1.0f / step;
  2097. const CGU_FLOAT overBlkTp = 1.f / 3.0f; // (dwNumChannels=4 - 1);
  2098. // here the index vector is computed,
  2099. // shifted and normalized
  2100. CGU_FLOAT indxAvrg = 3.0f / 2.0f; // (dwNumChannels=4 - 1);
  2101. for (i = 0; i < dwUniqueColors; i++)
  2102. {
  2103. CGU_FLOAT del;
  2104. // CGU_UINT32 n = (CGU_UINT32)((b - _min_ex + (step*0.5f)) * rstep);
  2105. if ((del = Prj0[i] - Pos0) <= 0)
  2106. RmpIndxs[i] = 0.f;
  2107. else if (Prj0[i] - Pos1 >= 0)
  2108. RmpIndxs[i] = 3.0f; // (dwNumChannels=4 - 1);
  2109. else
  2110. RmpIndxs[i] = floor((del + step_h) * rstep);
  2111. // shift and normalization
  2112. RmpIndxs[i] = (RmpIndxs[i] - indxAvrg) * overBlkTp;
  2113. }
  2114. // 4. Present our color channels as 3 16 DIM vectors.
  2115. // 5. Find closest aproximation of each of 16DIM color vector with the
  2116. // pojection of the 16DIM index vector.
  2117. CGU_Vec3f Crs = {0.0f, 0.0f, 0.0f};
  2118. CGU_FLOAT Len = 0.0f;
  2119. for (i = 0; i < dwUniqueColors; i++)
  2120. {
  2121. const CGU_FLOAT PreMlt = RmpIndxs[i] * Rpt[i];
  2122. Len += RmpIndxs[i] * PreMlt;
  2123. Crs.x += BlkSh[i].x * PreMlt;
  2124. Crs.y += BlkSh[i].y * PreMlt;
  2125. Crs.z += BlkSh[i].z * PreMlt;
  2126. }
  2127. LineDir.x = LineDir.y = LineDir.z = 0.0f;
  2128. if (Len > 0.0f)
  2129. {
  2130. CGU_FLOAT Len2;
  2131. LineDir = Crs / Len;
  2132. // 6. Plug the projections as a new directional vector for the axis.
  2133. // 7. Goto 1.
  2134. Len2 = dot(LineDir, LineDir); // LineDir.x * LineDir.x + LineDir.y * LineDir.y + LineDir.z * LineDir.z;
  2135. Len2 = sqrt(Len2);
  2136. LineDir /= Len2;
  2137. }
  2138. }
  2139. else // We was not able to find anything better. Drop out.
  2140. break;
  2141. }
  2142. // inverse transform to find end-points of 3-color ramp
  2143. rsltC0 = (PosG0 * LineDirG + Mdl) * 255.f;
  2144. rsltC1 = (PosG1 * LineDirG + Mdl) * 255.f;
  2145. } // !isDone
  2146. // We've dealt with (almost) unrestricted full precision realm.
  2147. // Now back digital world.
  2148. // round the end points to make them look like compressed ones
  2149. CGU_Vec3f inpRmpEndPts0 = {0.0f, 255.0f, 0.0f};
  2150. CGU_Vec3f inpRmpEndPts1 = {0.0f, 255.0f, 0.0f};
  2151. CGU_Vec3f Fctrs0 = {8.0f, 4.0f, 8.0f}; //(1 << (PIX_GRID - BG)); x (1 << (PIX_GRID - GG)); y (1 << (PIX_GRID - RG)); z
  2152. CGU_Vec3f Fctrs1 = {32.0f, 64.0f, 32.0f}; //(CGU_FLOAT)(1 << RG); z (CGU_FLOAT)(1 << GG); y (CGU_FLOAT)(1 << BG); x
  2153. CGU_FLOAT _Min = 0.0f;
  2154. CGU_FLOAT _Max = 255.0f;
  2155. {
  2156. // MkRmpOnGrid(inpRmpEndPts, rsltC, _Min, _Max);
  2157. inpRmpEndPts0 = floor(rsltC0);
  2158. if (inpRmpEndPts0.x <= _Min)
  2159. inpRmpEndPts0.x = _Min;
  2160. else
  2161. {
  2162. inpRmpEndPts0.x += floor(128.f / Fctrs1.x) - floor(inpRmpEndPts0.x / Fctrs1.x);
  2163. inpRmpEndPts0.x = min(inpRmpEndPts0.x, _Max);
  2164. }
  2165. if (inpRmpEndPts0.y <= _Min)
  2166. inpRmpEndPts0.y = _Min;
  2167. else
  2168. {
  2169. inpRmpEndPts0.y += floor(128.f / Fctrs1.y) - floor(inpRmpEndPts0.y / Fctrs1.y);
  2170. inpRmpEndPts0.y = min(inpRmpEndPts0.y, _Max);
  2171. }
  2172. if (inpRmpEndPts0.z <= _Min)
  2173. inpRmpEndPts0.z = _Min;
  2174. else
  2175. {
  2176. inpRmpEndPts0.z += floor(128.f / Fctrs1.z) - floor(inpRmpEndPts0.z / Fctrs1.z);
  2177. inpRmpEndPts0.z = min(inpRmpEndPts0.z, _Max);
  2178. }
  2179. inpRmpEndPts0 = floor(inpRmpEndPts0 / Fctrs0) * Fctrs0;
  2180. inpRmpEndPts1 = floor(rsltC1);
  2181. if (inpRmpEndPts1.x <= _Min)
  2182. inpRmpEndPts1.x = _Min;
  2183. else
  2184. {
  2185. inpRmpEndPts1.x += floor(128.f / Fctrs1.x) - floor(inpRmpEndPts1.x / Fctrs1.x);
  2186. inpRmpEndPts1.x = min(inpRmpEndPts1.x, _Max);
  2187. }
  2188. if (inpRmpEndPts1.y <= _Min)
  2189. inpRmpEndPts1.y = _Min;
  2190. else
  2191. {
  2192. inpRmpEndPts1.y += floor(128.f / Fctrs1.y) - floor(inpRmpEndPts1.y / Fctrs1.y);
  2193. inpRmpEndPts1.y = min(inpRmpEndPts1.y, _Max);
  2194. }
  2195. if (inpRmpEndPts1.z <= _Min)
  2196. inpRmpEndPts1.z = _Min;
  2197. else
  2198. {
  2199. inpRmpEndPts1.z += floor(128.f / Fctrs1.z) - floor(inpRmpEndPts1.z / Fctrs1.z);
  2200. inpRmpEndPts1.z = min(inpRmpEndPts1.z, _Max);
  2201. }
  2202. inpRmpEndPts1 = floor(inpRmpEndPts1 / Fctrs0) * Fctrs0;
  2203. } // MkRmpOnGrid
  2204. CMP_EndPoints EndPoints;
  2205. EndPoints.Color0 = inpRmpEndPts0;
  2206. EndPoints.Color1 = inpRmpEndPts1;
  2207. return EndPoints;
  2208. }
  2209. #endif
  2210. // Process a rgbBlock which is normalized (0.0f ... 1.0f), signed normal is not implemented
  2211. static CGU_Vec2ui CompressBlockBC1_RGBA_Internal(const CGU_Vec3f rgbBlockUVf[BLOCK_SIZE_4X4],
  2212. const CGU_FLOAT BlockA[BLOCK_SIZE_4X4],
  2213. CGU_Vec3f channelWeights,
  2214. CGU_UINT32 dwAlphaThreshold,
  2215. CGU_UINT32 m_nRefinementSteps,
  2216. CMP_IN CGU_FLOAT fquality,
  2217. CGU_BOOL isSRGB)
  2218. {
  2219. CGU_Vec2ui cmpBlock = {0, 0};
  2220. CGU_FLOAT errLQ = 1e6f;
  2221. cmpBlock = CompressRGBBlock_FM(rgbBlockUVf, fquality, isSRGB, CMP_REFINOUT errLQ);
  2222. #ifndef CMP_USE_LOWQUALITY
  2223. //------------------------------------------------------------------
  2224. // Processing is in 0..255 range, code needs to be normized to 0..1
  2225. //------------------------------------------------------------------
  2226. if ((errLQ > 0.0f) && (fquality > CMP_QUALITY2))
  2227. {
  2228. CGU_Vec3f rgbBlock_normal[BLOCK_SIZE_4X4];
  2229. CGU_UINT32 nCmpIndices = 0;
  2230. CGU_UINT32 c0, c1;
  2231. // High Quality
  2232. CMP_EndPoints EndPoints = {{0, 0, 0xFF}, {0, 0, 0xFF}};
  2233. // Hold a err ref to lowest quality compression, to check if new compression is any better
  2234. CGU_Vec2ui Q1CompData = cmpBlock;
  2235. // High Quality
  2236. CGU_UINT32 i;
  2237. ALIGN_16 CGU_FLOAT Rpt[BLOCK_SIZE_4X4];
  2238. CGU_UINT32 pcIndices = 0;
  2239. m_nRefinementSteps = 0;
  2240. CGU_Vec3f BlkInBGRf_UV[BLOCK_SIZE_4X4]; // Normalized Block Input (0..1) in BGR channel format
  2241. // Default inidices & endpoints for Transparent Block
  2242. CGU_Vec3ui nEndpoints0 = {0, 0, 0}; // Endpoints are stored BGR as x,y,z
  2243. CGU_Vec3ui nEndpoints1 = {0xFF, 0xFF, 0xFF}; // Endpoints are stored BGR as x,y,z
  2244. for (i = 0; i < BLOCK_SIZE_4X4; i++)
  2245. {
  2246. Rpt[i] = 0.0f;
  2247. }
  2248. //===============================================================
  2249. // Check if we have more then 2 colors and process Alpha block
  2250. CGU_UINT32 dwColors = 0;
  2251. CGU_UINT32 dwBlk[BLOCK_SIZE_4X4];
  2252. CGU_UINT32 R, G, B, A;
  2253. for (i = 0; i < BLOCK_SIZE_4X4; i++)
  2254. {
  2255. // Do any color conversion prior to processing the block
  2256. rgbBlock_normal[i] = isSRGB ? cmp_linearToSrgb(rgbBlockUVf[i]) : rgbBlockUVf[i];
  2257. R = (CGU_UINT32)(rgbBlock_normal[i].x * 255.0f);
  2258. G = (CGU_UINT32)(rgbBlock_normal[i].y * 255.0f);
  2259. B = (CGU_UINT32)(rgbBlock_normal[i].z * 255.0f);
  2260. if (dwAlphaThreshold > 0)
  2261. A = (CGU_UINT32)BlockA[i];
  2262. else
  2263. A = 255;
  2264. // Punch Through Alpha in BC1 Codec (1 bit alpha)
  2265. if ((dwAlphaThreshold == 0) || (A >= dwAlphaThreshold))
  2266. {
  2267. // copy to local RGB data and have alpha set to 0xFF
  2268. dwBlk[dwColors++] = A << 24 | R << 16 | G << 8 | B;
  2269. }
  2270. }
  2271. if (!dwColors)
  2272. {
  2273. // All are colors transparent
  2274. EndPoints.Color0.x = EndPoints.Color0.y = EndPoints.Color0.z = 0.0f;
  2275. EndPoints.Color1.x = EndPoints.Color1.y = EndPoints.Color0.z = 255.0f;
  2276. nCmpIndices = 0xFFFFFFFF;
  2277. }
  2278. else
  2279. {
  2280. // We have colors to process
  2281. nCmpIndices = 0;
  2282. // Punch Through Alpha Support ToDo
  2283. // CGU_BOOL bHasAlpha = (dwColors != BLOCK_SIZE_4X4);
  2284. // bHasAlpha = bHasAlpha && (dwAlphaThreshold > 0); // valid for (dwNumChannels=4);
  2285. // if (bHasAlpha) {
  2286. // CGU_Vec2ui compBlock = {0xf800f800,0};
  2287. // return compBlock;
  2288. // }
  2289. // Here we are computing an unique number of sorted colors.
  2290. // For each unique value we compute the number of it appearences.
  2291. // qsort((void *)dwBlk, (size_t)dwColors, sizeof(CGU_UINT32), QSortIntCmp);
  2292. #ifndef ASPM_GPU
  2293. std::sort(dwBlk, dwBlk + 15);
  2294. #else
  2295. {
  2296. CGU_UINT32 j;
  2297. CMP_di what[BLOCK_SIZE_4X4];
  2298. for (i = 0; i < dwColors; i++)
  2299. {
  2300. what[i].index = i;
  2301. what[i].data = dwBlk[i];
  2302. }
  2303. CGU_UINT32 tmp_index;
  2304. CGU_UINT32 tmp_data;
  2305. for (i = 1; i < dwColors; i++)
  2306. {
  2307. for (j = i; j > 0; j--)
  2308. {
  2309. if (what[j - 1].data > what[j].data)
  2310. {
  2311. tmp_index = what[j].index;
  2312. tmp_data = what[j].data;
  2313. what[j].index = what[j - 1].index;
  2314. what[j].data = what[j - 1].data;
  2315. what[j - 1].index = tmp_index;
  2316. what[j - 1].data = tmp_data;
  2317. }
  2318. }
  2319. }
  2320. for (i = 0; i < dwColors; i++)
  2321. dwBlk[i] = what[i].data;
  2322. }
  2323. #endif
  2324. CGU_UINT32 new_p;
  2325. CGU_UINT32 dwBlkU[BLOCK_SIZE_4X4];
  2326. CGU_UINT32 dwUniqueColors = 0;
  2327. new_p = dwBlkU[0] = dwBlk[0];
  2328. Rpt[dwUniqueColors] = 1.f;
  2329. for (i = 1; i < dwColors; i++)
  2330. {
  2331. if (new_p != dwBlk[i])
  2332. {
  2333. dwUniqueColors++;
  2334. new_p = dwBlkU[dwUniqueColors] = dwBlk[i];
  2335. Rpt[dwUniqueColors] = 1.f;
  2336. }
  2337. else
  2338. Rpt[dwUniqueColors] += 1.f;
  2339. }
  2340. dwUniqueColors++;
  2341. // Simple case of only 2 colors to process
  2342. // no need for futher processing as lowest quality methods work best for this case
  2343. if (dwUniqueColors <= 2)
  2344. {
  2345. return Q1CompData;
  2346. }
  2347. else
  2348. {
  2349. // switch from int range back to UV floats
  2350. for (i = 0; i < dwUniqueColors; i++)
  2351. {
  2352. R = (dwBlkU[i] >> 16) & 0xff;
  2353. G = (dwBlkU[i] >> 8) & 0xff;
  2354. B = (dwBlkU[i] >> 0) & 0xff;
  2355. BlkInBGRf_UV[i].z = (CGU_FLOAT)R / 255.0f;
  2356. BlkInBGRf_UV[i].y = (CGU_FLOAT)G / 255.0f;
  2357. BlkInBGRf_UV[i].x = (CGU_FLOAT)B / 255.0f;
  2358. }
  2359. CGU_Vec3f channelWeightsBGR;
  2360. channelWeightsBGR.x = channelWeights.z;
  2361. channelWeightsBGR.y = channelWeights.y;
  2362. channelWeightsBGR.z = channelWeights.x;
  2363. EndPoints = CompressRGBBlock_Slow(BlkInBGRf_UV, Rpt, dwUniqueColors, channelWeightsBGR, m_nRefinementSteps);
  2364. }
  2365. } // colors
  2366. //===================================================================
  2367. // Process Cluster INPUT is constant EndPointsf OUTPUT is pcIndices
  2368. //===================================================================
  2369. if (nCmpIndices == 0)
  2370. {
  2371. R = (CGU_UINT32)(EndPoints.Color0.z);
  2372. G = (CGU_UINT32)(EndPoints.Color0.y);
  2373. B = (CGU_UINT32)(EndPoints.Color0.x);
  2374. CGU_INT32 cluster0 = cmp_constructColor(R, G, B);
  2375. R = (CGU_UINT32)(EndPoints.Color1.z);
  2376. G = (CGU_UINT32)(EndPoints.Color1.y);
  2377. B = (CGU_UINT32)(EndPoints.Color1.x);
  2378. CGU_INT32 cluster1 = cmp_constructColor(R, G, B);
  2379. CGU_Vec3f InpRmp[NUM_ENDPOINTS];
  2380. if ((cluster0 <= cluster1) // valid for 4 channels
  2381. // || (cluster0 > cluster1) // valid for 3 channels
  2382. )
  2383. {
  2384. // inverse endpoints
  2385. InpRmp[0] = EndPoints.Color1;
  2386. InpRmp[1] = EndPoints.Color0;
  2387. }
  2388. else
  2389. {
  2390. InpRmp[0] = EndPoints.Color0;
  2391. InpRmp[1] = EndPoints.Color1;
  2392. }
  2393. CGU_Vec3f srcblockBGR[BLOCK_SIZE_4X4];
  2394. CGU_FLOAT srcblockA[BLOCK_SIZE_4X4];
  2395. // Swizzle the source RGB to BGR for processing
  2396. for (i = 0; i < BLOCK_SIZE_4X4; i++)
  2397. {
  2398. srcblockBGR[i].z = rgbBlock_normal[i].x * 255.0f;
  2399. srcblockBGR[i].y = rgbBlock_normal[i].y * 255.0f;
  2400. srcblockBGR[i].x = rgbBlock_normal[i].z * 255.0f;
  2401. srcblockA[i] = 0.0f;
  2402. if (dwAlphaThreshold > 0)
  2403. {
  2404. CGU_UINT32 alpha = (CGU_UINT32)BlockA[i];
  2405. if (alpha >= dwAlphaThreshold)
  2406. srcblockA[i] = BlockA[i];
  2407. }
  2408. }
  2409. // input ramp is on the coarse grid
  2410. // make ramp endpoints the way they'll going to be decompressed
  2411. CGU_Vec3f InpRmpL[NUM_ENDPOINTS];
  2412. CGU_Vec3f Fctrs = {32.0F, 64.0F, 32.0F}; // 1 << RG,1 << GG,1 << BG
  2413. {
  2414. // ConstantRamp = MkWkRmpPts(InpRmpL, InpRmp);
  2415. InpRmpL[0] = InpRmp[0] + floor(InpRmp[0] / Fctrs);
  2416. InpRmpL[0] = cmp_clampVec3f(InpRmpL[0], 0.0f, 255.0f);
  2417. InpRmpL[1] = InpRmp[1] + floor(InpRmp[1] / Fctrs);
  2418. InpRmpL[1] = cmp_clampVec3f(InpRmpL[1], 0.0f, 255.0f);
  2419. } // MkWkRmpPts
  2420. // build ramp
  2421. CGU_Vec3f LerpRmp[4];
  2422. CGU_Vec3f offset = {1.0f, 1.0f, 1.0f};
  2423. {
  2424. //BldRmp(Rmp, InpRmpL, dwNumChannels);
  2425. // linear interpolate end points to get the ramp
  2426. LerpRmp[0] = InpRmpL[0];
  2427. LerpRmp[3] = InpRmpL[1];
  2428. LerpRmp[1] = floor((InpRmpL[0] * 2.0f + LerpRmp[3] + offset) / 3.0f);
  2429. LerpRmp[2] = floor((InpRmpL[0] + LerpRmp[3] * 2.0f + offset) / 3.0f);
  2430. } // BldRmp
  2431. //=========================================================================
  2432. // Clusterize, Compute error and find DXTC indexes for the current cluster
  2433. //=========================================================================
  2434. {
  2435. // Clusterize
  2436. CGU_UINT32 alpha;
  2437. // For each colour in the original block assign it
  2438. // to the closest cluster and compute the cumulative error
  2439. for (i = 0; i < BLOCK_SIZE_4X4; i++)
  2440. {
  2441. alpha = (CGU_UINT32)srcblockA[i];
  2442. if ((dwAlphaThreshold > 0) && alpha == 0)
  2443. { //*((CGU_DWORD *)&_Blk[i][AC]) == 0)
  2444. pcIndices |= cmp_set2Bit32(4, i); // dwNumChannels 3 or 4 (default is 4)
  2445. }
  2446. else
  2447. {
  2448. CGU_FLOAT shortest = 99999999999.f;
  2449. CGU_UINT8 shortestIndex = 0;
  2450. CGU_Vec3f channelWeightsBGR;
  2451. channelWeightsBGR.x = channelWeights.z;
  2452. channelWeightsBGR.y = channelWeights.y;
  2453. channelWeightsBGR.z = channelWeights.x;
  2454. for (CGU_UINT8 rampindex = 0; rampindex < 4; rampindex++)
  2455. {
  2456. // r is either 1 or 4
  2457. // calculate the distance for each component
  2458. CGU_FLOAT distance =
  2459. dot(((srcblockBGR[i] - LerpRmp[rampindex]) * channelWeightsBGR), ((srcblockBGR[i] - LerpRmp[rampindex]) * channelWeightsBGR));
  2460. if (distance < shortest)
  2461. {
  2462. shortest = distance;
  2463. shortestIndex = rampindex;
  2464. }
  2465. }
  2466. // The total is a sum of (error += shortest)
  2467. // We have the index of the best cluster, so assign this in the block
  2468. // Reorder indices to match correct DXTC ordering
  2469. if (shortestIndex == 3) // dwNumChannels - 1
  2470. shortestIndex = 1;
  2471. else if (shortestIndex)
  2472. shortestIndex++;
  2473. pcIndices |= cmp_set2Bit32(shortestIndex, i);
  2474. }
  2475. } // BLOCK_SIZE_4X4
  2476. } // Clusterize
  2477. } // Process Cluster
  2478. //==============================================================
  2479. // Generate Compressed Result from nEndpoints & pcIndices
  2480. //==============================================================
  2481. R = (CGU_UINT32)(EndPoints.Color0.z);
  2482. G = (CGU_UINT32)(EndPoints.Color0.y);
  2483. B = (CGU_UINT32)(EndPoints.Color0.x);
  2484. c0 = cmp_constructColor(R, G, B);
  2485. R = (CGU_UINT32)(EndPoints.Color1.z);
  2486. G = (CGU_UINT32)(EndPoints.Color1.y);
  2487. B = (CGU_UINT32)(EndPoints.Color1.x);
  2488. c1 = cmp_constructColor(R, G, B);
  2489. // Get Processed indices if not set
  2490. if (nCmpIndices == 0)
  2491. nCmpIndices = pcIndices;
  2492. if (c0 <= c1)
  2493. {
  2494. cmpBlock.x = c1 | (c0 << 16);
  2495. }
  2496. else
  2497. cmpBlock.x = c0 | (c1 << 16);
  2498. cmpBlock.y = nCmpIndices;
  2499. // Select best compression
  2500. CGU_FLOAT CompErr = CMP_RGBBlockError(rgbBlockUVf, cmpBlock, isSRGB);
  2501. if (CompErr > errLQ)
  2502. cmpBlock = Q1CompData;
  2503. }
  2504. #endif
  2505. return cmpBlock;
  2506. }
  2507. //============================= Alpha: New single header interfaces: supports GPU shader interface ==================================================
  2508. // Compress a BC1 block - Use new code in cmp_bc1.h
  2509. static CGU_Vec2ui CompressBlockBC1_UNORM(CGU_Vec3f rgbablockf[BLOCK_SIZE_4X4], CMP_IN CGU_FLOAT fquality, CGU_BOOL isSRGB)
  2510. {
  2511. CGU_FLOAT BlockA[BLOCK_SIZE_4X4]; // Not used but required
  2512. CGU_Vec3f channelWeights = {1.0f, 1.0f, 1.0f};
  2513. return CompressBlockBC1_RGBA_Internal(rgbablockf,
  2514. BlockA, // ToDo support nullptr
  2515. channelWeights,
  2516. 0,
  2517. 1,
  2518. fquality,
  2519. isSRGB);
  2520. }
  2521. // Compress a BC2 block
  2522. static CGU_Vec4ui CompressBlockBC2_UNORM(CMP_IN CGU_Vec3f BlockRGB[BLOCK_SIZE_4X4],
  2523. CMP_IN CGU_FLOAT BlockA[BLOCK_SIZE_4X4],
  2524. CGU_FLOAT fquality,
  2525. CGU_BOOL isSRGB)
  2526. {
  2527. CGU_Vec2ui compressedBlocks;
  2528. CGU_Vec4ui compBlock;
  2529. compressedBlocks = cmp_compressExplicitAlphaBlock(BlockA);
  2530. compBlock.x = compressedBlocks.x;
  2531. compBlock.y = compressedBlocks.y;
  2532. CGU_Vec3f channelWeights = {1.0f, 1.0f, 1.0f};
  2533. compressedBlocks = CompressBlockBC1_RGBA_Internal(BlockRGB, BlockA, channelWeights, 0, 1, fquality, isSRGB);
  2534. compBlock.z = compressedBlocks.x;
  2535. compBlock.w = compressedBlocks.y;
  2536. return compBlock;
  2537. }
  2538. // Compress a BC3 block
  2539. static CGU_Vec4ui CompressBlockBC3_UNORM(CMP_IN CGU_Vec3f BlockRGB[BLOCK_SIZE_4X4],
  2540. CMP_IN CGU_FLOAT BlockA[BLOCK_SIZE_4X4],
  2541. CGU_FLOAT fquality,
  2542. CGU_BOOL isSRGB)
  2543. {
  2544. CGU_Vec4ui compBlock;
  2545. CGU_Vec2ui cmpBlock;
  2546. cmpBlock = cmp_compressAlphaBlock(BlockA, fquality, FALSE);
  2547. compBlock.x = cmpBlock.x;
  2548. compBlock.y = cmpBlock.y;
  2549. CGU_Vec2ui compressedBlocks;
  2550. compressedBlocks = CompressBlockBC1_UNORM(BlockRGB, fquality, isSRGB);
  2551. compBlock.z = compressedBlocks.x;
  2552. compBlock.w = compressedBlocks.y;
  2553. return compBlock;
  2554. }
  2555. // Compress a BC4 block
  2556. static CGU_Vec2ui CompressBlockBC4_UNORM(CMP_IN CGU_FLOAT Block[BLOCK_SIZE_4X4], CGU_FLOAT fquality)
  2557. {
  2558. CGU_Vec2ui cmpBlock;
  2559. cmpBlock = cmp_compressAlphaBlock(Block, fquality, FALSE);
  2560. return cmpBlock;
  2561. }
  2562. // Compress a BC4 block
  2563. static CGU_Vec2ui CompressBlockBC4_SNORM(CMP_IN CGU_FLOAT Block[BLOCK_SIZE_4X4], CGU_FLOAT fquality)
  2564. {
  2565. CGU_Vec2ui cmpBlock;
  2566. cmpBlock = cmp_compressAlphaBlock(Block, fquality, TRUE);
  2567. return cmpBlock;
  2568. }
  2569. // Compress a BC5 block
  2570. static CGU_Vec4ui CompressBlockBC5_UNORM(CMP_IN CGU_FLOAT BlockU[BLOCK_SIZE_4X4], CMP_IN CGU_FLOAT BlockV[BLOCK_SIZE_4X4], CGU_FLOAT fquality)
  2571. {
  2572. CGU_Vec4ui compressedBlock = {0, 0, 0, 0};
  2573. CGU_Vec2ui cmpBlock;
  2574. cmpBlock = cmp_compressAlphaBlock(BlockU, fquality, FALSE);
  2575. compressedBlock.x = cmpBlock.x;
  2576. compressedBlock.y = cmpBlock.y;
  2577. cmpBlock = cmp_compressAlphaBlock(BlockV, fquality, FALSE);
  2578. compressedBlock.z = cmpBlock.x;
  2579. compressedBlock.w = cmpBlock.y;
  2580. return compressedBlock;
  2581. }
  2582. // Compress a BC6 & BC7 UNORM block ToDo
  2583. #endif