basisu_astc_hdr_enc.cpp 95 KB


  1. // basisu_astc_hdr_enc.cpp
  2. #include "basisu_astc_hdr_enc.h"
  3. #include "../transcoder/basisu_transcoder.h"
  4. using namespace basist;
  5. namespace basisu
  6. {
  // Default error scales for the red and green channels. init() copies these into
  // m_r_err_scale / m_g_err_scale, which the error metrics use as per-channel weights
  // (blue is implicitly weighted 1).
  constexpr float DEF_R_ERROR_SCALE = 2.0f;
  constexpr float DEF_G_ERROR_SCALE = 3.0f;
  9. static inline uint32_t get_max_qlog(uint32_t bits)
  10. {
  11. switch (bits)
  12. {
  13. case 7: return MAX_QLOG7;
  14. case 8: return MAX_QLOG8;
  15. case 9: return MAX_QLOG9;
  16. case 10: return MAX_QLOG10;
  17. case 11: return MAX_QLOG11;
  18. case 12: return MAX_QLOG12;
  19. case 16: return MAX_QLOG16;
  20. default: assert(0); break;
  21. }
  22. return 0;
  23. }
  #if 0
  // Disabled: maps a qlog bit width to the matching MAX_QLOG<bits>_VAL constant.
  // Unsupported widths assert in debug builds and yield 0.
  static inline float get_max_qlog_val(uint32_t bits)
  {
      float max_val = 0;

      switch (bits)
      {
      case 7:  max_val = MAX_QLOG7_VAL;  break;
      case 8:  max_val = MAX_QLOG8_VAL;  break;
      case 9:  max_val = MAX_QLOG9_VAL;  break;
      case 10: max_val = MAX_QLOG10_VAL; break;
      case 11: max_val = MAX_QLOG11_VAL; break;
      case 12: max_val = MAX_QLOG12_VAL; break;
      case 16: max_val = MAX_QLOG16_VAL; break;
      default:
          assert(0);
          break;
      }

      return max_val;
  }
  #endif
  // Returns bit `src_bit` of `src_val` (bit 0 is the least significant) as 0 or 1.
  // `src_bit` must be in [0, 31].
  static inline int get_bit(int src_val, int src_bit)
  {
      assert((src_bit >= 0) && (src_bit <= 31));

      return (src_val >> src_bit) & 1;
  }
  // ORs bit `src_bit` of `src_val` into bit position `dst_bit` of `dst`.
  // Bits already set in `dst` are never cleared. Both bit indices must be in [0, 31].
  static inline void pack_bit(int& dst, int dst_bit, int src_val, int src_bit = 0)
  {
      assert((dst_bit >= 0) && (dst_bit <= 31));
      assert((src_bit >= 0) && (src_bit <= 31));

      // Isolate the requested source bit (0 or 1), then move it to the destination position.
      const int extracted = (src_val >> src_bit) & 1;
      dst |= (extracted << dst_bit);
  }
  56. //--------------------------------------------------------------------------------------------------------------------------
  57. astc_hdr_codec_options::astc_hdr_codec_options()
  58. {
  59. init();
  60. }
  61. void astc_hdr_codec_options::init()
  62. {
  63. m_bc6h_err_weight = .85f;
  64. m_r_err_scale = DEF_R_ERROR_SCALE;
  65. m_g_err_scale = DEF_G_ERROR_SCALE;
  66. // Disabling by default to avoid transcoding outliers (try kodim26). The quality lost is very low. TODO: Could include the uber result in the output.
  67. m_allow_uber_mode = false;
  68. // Must set best quality level first to set defaults.
  69. set_quality_best();
  70. set_quality_level(cDefaultLevel);
  71. }
  72. void astc_hdr_codec_options::set_quality_best()
  73. {
  74. m_mode11_direct_only = false;
  75. // highest achievable quality
  76. m_use_solid = true;
  77. m_use_mode11 = true;
  78. m_mode11_uber_mode = true;
  79. m_first_mode11_weight_ise_range = MODE11_FIRST_ISE_RANGE;
  80. m_last_mode11_weight_ise_range = MODE11_LAST_ISE_RANGE;
  81. m_first_mode11_submode = -1;
  82. m_last_mode11_submode = 7;
  83. m_use_mode7_part1 = true;
  84. m_first_mode7_part1_weight_ise_range = MODE7_PART1_FIRST_ISE_RANGE;
  85. m_last_mode7_part1_weight_ise_range = MODE7_PART1_LAST_ISE_RANGE;
  86. m_use_mode7_part2 = true;
  87. m_mode7_part2_part_masks = UINT32_MAX;
  88. m_first_mode7_part2_weight_ise_range = MODE7_PART2_FIRST_ISE_RANGE;
  89. m_last_mode7_part2_weight_ise_range = MODE7_PART2_LAST_ISE_RANGE;
  90. m_use_mode11_part2 = true;
  91. m_mode11_part2_part_masks = UINT32_MAX;
  92. m_first_mode11_part2_weight_ise_range = MODE11_PART2_FIRST_ISE_RANGE;
  93. m_last_mode11_part2_weight_ise_range = MODE11_PART2_LAST_ISE_RANGE;
  94. m_refine_weights = true;
  95. m_use_estimated_partitions = false;
  96. m_max_estimated_partitions = 0;
  97. }
  98. void astc_hdr_codec_options::set_quality_normal()
  99. {
  100. m_use_solid = true;
  101. // We'll allow uber mode in normal if the user allows it.
  102. m_use_mode11 = true;
  103. m_mode11_uber_mode = true;
  104. m_first_mode11_weight_ise_range = 6;
  105. m_last_mode11_weight_ise_range = MODE11_LAST_ISE_RANGE;
  106. m_use_mode7_part1 = true;
  107. m_first_mode7_part1_weight_ise_range = MODE7_PART1_LAST_ISE_RANGE;
  108. m_last_mode7_part1_weight_ise_range = MODE7_PART1_LAST_ISE_RANGE;
  109. m_use_mode7_part2 = true;
  110. m_mode7_part2_part_masks = UINT32_MAX;
  111. m_first_mode7_part2_weight_ise_range = MODE7_PART2_LAST_ISE_RANGE;
  112. m_last_mode7_part2_weight_ise_range = MODE7_PART2_LAST_ISE_RANGE;
  113. m_use_mode11_part2 = true;
  114. m_mode11_part2_part_masks = UINT32_MAX;
  115. m_first_mode11_part2_weight_ise_range = MODE11_PART2_LAST_ISE_RANGE;
  116. m_last_mode11_part2_weight_ise_range = MODE11_PART2_LAST_ISE_RANGE;
  117. m_refine_weights = true;
  118. }
  119. void astc_hdr_codec_options::set_quality_fastest()
  120. {
  121. m_use_solid = true;
  122. m_use_mode11 = true;
  123. m_mode11_uber_mode = false;
  124. m_first_mode11_weight_ise_range = MODE11_LAST_ISE_RANGE;
  125. m_last_mode11_weight_ise_range = MODE11_LAST_ISE_RANGE;
  126. m_use_mode7_part1 = false;
  127. m_use_mode7_part2 = false;
  128. m_use_mode11_part2 = false;
  129. m_refine_weights = false;
  130. }
  131. //--------------------------------------------------------------------------------------------------------------------------
  132. void astc_hdr_codec_options::set_quality_level(int level)
  133. {
  134. level = clamp(level, cMinLevel, cMaxLevel);
  135. m_level = level;
  136. switch (level)
  137. {
  138. case 0:
  139. {
  140. set_quality_fastest();
  141. break;
  142. }
  143. case 1:
  144. {
  145. set_quality_normal();
  146. m_first_mode11_weight_ise_range = MODE11_LAST_ISE_RANGE - 1;
  147. m_last_mode11_weight_ise_range = MODE11_LAST_ISE_RANGE;
  148. m_use_mode7_part1 = false;
  149. m_use_mode7_part2 = false;
  150. m_use_estimated_partitions = true;
  151. m_max_estimated_partitions = 1;
  152. m_mode11_part2_part_masks = 1 | 2;
  153. m_mode7_part2_part_masks = 1 | 2;
  154. break;
  155. }
  156. case 2:
  157. {
  158. set_quality_normal();
  159. m_use_estimated_partitions = true;
  160. m_max_estimated_partitions = 2;
  161. m_mode11_part2_part_masks = 1 | 2;
  162. m_mode7_part2_part_masks = 1 | 2;
  163. break;
  164. }
  165. case 3:
  166. {
  167. set_quality_best();
  168. m_use_estimated_partitions = true;
  169. m_max_estimated_partitions = 2;
  170. m_mode11_part2_part_masks = 1 | 2 | 4 | 8;
  171. m_mode7_part2_part_masks = 1 | 2 | 4 | 8;
  172. break;
  173. }
  174. case 4:
  175. {
  176. set_quality_best();
  177. break;
  178. }
  179. }
  180. }
  181. //--------------------------------------------------------------------------------------------------------------------------
  #if 0
  // Disabled convenience wrapper: forwards a 12-bit qlog value to qlog_to_half_slow().
  static inline half_float qlog12_to_half_slow(uint32_t qlog12)
  {
      const uint32_t num_bits = 12;
      return qlog_to_half_slow(qlog12, num_bits);
  }
  #endif
  188. // max usable qlog8 value is 247, 248=inf, >=249 is nan
  189. // max usable qlog7 value is 123, 124=inf, >=125 is nan
  190. // To go from a smaller qlog to a larger one, shift left by X bits.
  191. //const uint32_t TOTAL_USABLE_QLOG8 = 248; // 0-247 are usable, 0=0, 247=60416.0, 246=55296.0
  192. // for qlog7's shift left by 1
  193. //half_float g_qlog8_to_half[256];
  194. //float g_qlog8_to_float[256];
  195. //half_float g_qlog12_to_half[4096];
  196. //float g_qlog12_to_float[4096];
  197. static half_float g_qlog16_to_half[65536];
  198. inline half_float qlog_to_half(uint32_t val, uint32_t bits)
  199. {
  200. assert((bits >= 5) && (bits <= 16));
  201. assert(val < (1U << bits));
  202. return g_qlog16_to_half[val << (16 - bits)];
  203. }
  // Nearest qlog value for each positive half float bit pattern (positive values only).
  // One table per qlog width from 7 to 12 bits; filled by init_qlog_tables().
  static uint16_t g_half_to_qlog7[32768];
  static uint16_t g_half_to_qlog8[32768];
  static uint16_t g_half_to_qlog9[32768];
  static uint16_t g_half_to_qlog10[32768];
  static uint16_t g_half_to_qlog11[32768];
  static uint16_t g_half_to_qlog12[32768];

  // g_pHalf_to_qlog_tabs[bits - HALF_TO_QLOG_TABS_BASE] is the table for `bits`-wide qlogs.
  // Only the first six of the eight slots are populated.
  const uint32_t HALF_TO_QLOG_TABS_BASE = 7;

  static uint16_t* g_pHalf_to_qlog_tabs[8] =
  {
      g_half_to_qlog7,
      g_half_to_qlog8,
      g_half_to_qlog9,
      g_half_to_qlog10,
      g_half_to_qlog11,
      g_half_to_qlog12
  };
  216. static inline uint32_t half_to_qlog7_12(half_float h, uint32_t bits)
  217. {
  218. assert((bits >= HALF_TO_QLOG_TABS_BASE) && (bits <= 12));
  219. assert(h < 32768);
  220. return g_pHalf_to_qlog_tabs[bits - HALF_TO_QLOG_TABS_BASE][h];
  221. }
  #if 0
  // Disabled reference implementation.
  // Input is the low 11 bits of the qlog
  // Returns the 10-bit mantissa of the half float value
  static int qlog11_to_half_float_mantissa(int M)
  {
      assert(M <= 0x7FF);

      // Three linear segments, selected by the input value.
      int Mt = 4 * M - 512;
      if (M < 512)
          Mt = 3 * M;
      else if (M >= 1536)
          Mt = 5 * M - 2048;

      return (Mt >> 3);
  }
  #endif
  // Input is the 10-bit mantissa of the half float value
  // Output is the 11-bit qlog value
  // Inverse of qlog11_to_half_float_mantissa(): each of the three linear segments is
  // inverted (with rounding) and the result that lands in its own segment is returned.
  static inline int half_float_mantissa_to_qlog11(int hf)
  {
      const int scaled = hf * 8;

      // Lowest segment: results below 512.
      const int low_seg = (scaled + 2) / 3;
      if (low_seg < 512)
          return low_seg;

      // Highest segment: results of 1536 and above.
      const int high_seg = (scaled + 2048 + 4) / 5;
      if (high_seg >= 1536)
          return high_seg;

      // Middle segment.
      return (scaled + 512 + 2) / 4;
  }
  252. static inline int half_to_qlog16(int hf)
  253. {
  254. // extract 5 bits exponent, which is carried through to qlog16 unchanged
  255. const int exp = (hf >> 10) & 0x1F;
  256. // extract and invert the 10 bit mantissa to nearest qlog11 (should be lossless)
  257. const int mantissa = half_float_mantissa_to_qlog11(hf & 0x3FF);
  258. assert(mantissa <= 0x7FF);
  259. // Now combine to qlog16, which is what ASTC HDR interpolates using the [0-64] weights.
  260. uint32_t qlog16 = (exp << 11) | mantissa;
  261. // should be a lossless operation
  262. assert(qlog16_to_half_slow(qlog16) == hf);
  263. return qlog16;
  264. }
  265. static inline uint32_t quant_qlog16(uint32_t q16, uint32_t desired_bits)
  266. {
  267. assert((desired_bits >= 7) && (desired_bits <= 12));
  268. assert(q16 <= 65535);
  269. const uint32_t shift = 16 - desired_bits;
  270. uint32_t e = (q16 + (1U << (shift - 1U)) - 1U) >> shift;
  271. uint32_t max_val = (1U << desired_bits) - 1U;
  272. e = minimum<uint32_t>(e, max_val);
  273. return e;
  274. }
  275. static void compute_half_to_qlog_table(uint32_t bits, uint16_t* pTable, const basisu::vector<float> &qlog16_to_float)
  276. {
  277. assert(bits >= 5 && bits <= 12);
  278. const uint32_t max_val = (1 << bits) - 1;
  279. // For all positive half-floats
  280. for (uint32_t h = 0; h < 32768; h++)
  281. {
  282. // Skip invalid values
  283. if (is_half_inf_or_nan((half_float)h))
  284. continue;
  285. const float desired_val = half_to_float((half_float)h);
  286. float best_err = 1e+30f;
  287. uint32_t best_qlog = 0;
  288. // For all possible qlog's
  289. for (uint32_t i = 0; i <= max_val; i++)
  290. {
  291. // Skip invalid values
  292. float v = qlog16_to_float[i << (16 - bits)];
  293. if (std::isnan(v))
  294. continue;
  295. // Compute error
  296. float err = fabs(v - desired_val);
  297. // Find best
  298. if (err < best_err)
  299. {
  300. best_err = err;
  301. best_qlog = i;
  302. }
  303. }
  304. pTable[h] = (uint16_t)best_qlog;
  305. }
  306. #if 0
  307. uint32_t t = 0;
  308. const uint32_t nb = 12;
  309. int nb_shift = 16 - nb;
  310. for (uint32_t q16 = 0; q16 < 65536; q16++)
  311. {
  312. half_float h = qlog16_to_half_slow(q16);
  313. if (is_half_inf_or_nan(h))
  314. continue;
  315. int q7 = half_to_qlog7_12(h, nb);
  316. uint32_t best_err = UINT32_MAX, best_l = 0;
  317. for (int l = 0; l < (1 << nb); l++)
  318. {
  319. int dec_q16 = l << nb_shift;
  320. int err = iabs(dec_q16 - q16);
  321. if (err < best_err)
  322. {
  323. best_err = err;
  324. best_l = l;
  325. }
  326. }
  327. //int e = (q16 + 253) >> 9; // 345
  328. int e = (q16 + (1 << (nb_shift - 1)) - 1) >> nb_shift; // 285
  329. if (best_l != e)
  330. //if (q7 != best_l)
  331. {
  332. printf("q16=%u, h=%u, q7=%u, e=%u, best_l=%u\n", q16, h, q7, e, best_l);
  333. t++;
  334. }
  335. }
  336. printf("Mismatches: %u\n", t);
  337. exit(0);
  338. #endif
  339. }
  340. static void init_qlog_tables()
  341. {
  342. basisu::vector<float> qlog16_to_float(65536);
  343. // for all possible qlog16, compute the corresponding half float
  344. for (uint32_t i = 0; i <= 65535; i++)
  345. {
  346. half_float h = qlog16_to_half_slow(i);
  347. g_qlog16_to_half[i] = h;
  348. qlog16_to_float[i] = half_to_float(h);
  349. }
  350. // for all possible half floats, find the nearest qlog5-12 float
  351. for (uint32_t bits = HALF_TO_QLOG_TABS_BASE; bits <= 12; bits++)
  352. {
  353. compute_half_to_qlog_table(bits, g_pHalf_to_qlog_tabs[bits - HALF_TO_QLOG_TABS_BASE], qlog16_to_float);
  354. }
  355. }
  // Weight lerp values per ISE weight range, in ASTC order:
  //   [ise_range][0]    = # levels
  //   [ise_range][1...] = lerp value [0,64]
  // The table has rows for ISE ranges 0 to 10 (11 total); range 0 is invalid for 4x4
  // blocks, so the supported ranges are 1 to 10.
  const uint32_t MIN_SUPPORTED_ISE_WEIGHT_INDEX = 1; // ISE 1=3 levels
  const uint32_t MAX_SUPPORTED_ISE_WEIGHT_INDEX = 10; // ISE 10=24 levels

  static const uint8_t g_ise_weight_lerps[MAX_SUPPORTED_ISE_WEIGHT_INDEX + 1][32] =
  {
      // 0: ise range=0 is invalid for 4x4 block sizes (<24 weight bits in the block)
      { 0 },
      // 1
      { 3, 0, 32, 64 },
      // 2
      { 4, 0, 21, 43, 64 },
      // 3
      { 5, 0, 16, 32, 48, 64 },
      // 4
      { 6, 0, 64, 12, 52, 25, 39 },
      // 5
      { 8, 0, 9, 18, 27, 37, 46, 55, 64 },
      // 6
      { 10, 0, 64, 7, 57, 14, 50, 21, 43, 28, 36 },
      // 7
      { 12, 0, 64, 17, 47, 5, 59, 23, 41, 11, 53, 28, 36 },
      // 8
      { 16, 0, 4, 8, 12, 17, 21, 25, 29, 35, 39, 43, 47, 52, 56, 60, 64 },
      // 9
      { 20, 0, 64, 16, 48, 3, 61, 19, 45, 6, 58, 23, 41, 9, 55, 26, 38, 13, 51, 29, 35 },
      // 10
      { 24, 0, 64, 8, 56, 16, 48, 24, 40, 2, 62, 11, 53, 19, 45, 27, 37, 5, 59, 13, 51, 22, 42, 30, 34 }
  };

  //{ 12, 0, 64, 17, 47, 5, 59, 23, 41, 11, 53, 28, 36 }, // 7
  //static const uint8_t g_weight_order_7[12] = { 0, 4, 8, 2, 6, 10, 11, 7, 3, 9, 5, 1 };
  378. static vec3F calc_mean(uint32_t num_pixels, const vec4F* pPixels)
  379. {
  380. vec3F mean(0.0f);
  381. for (uint32_t i = 0; i < num_pixels; i++)
  382. {
  383. const vec4F& p = pPixels[i];
  384. mean[0] += p[0];
  385. mean[1] += p[1];
  386. mean[2] += p[2];
  387. }
  388. return mean / static_cast<float>(num_pixels);
  389. }
  390. static vec3F calc_rgb_pca(uint32_t num_pixels, const vec4F* pPixels, const vec3F& mean_color)
  391. {
  392. float cov[6] = { 0, 0, 0, 0, 0, 0 };
  393. for (uint32_t i = 0; i < num_pixels; i++)
  394. {
  395. const vec4F& v = pPixels[i];
  396. float r = v[0] - mean_color[0];
  397. float g = v[1] - mean_color[1];
  398. float b = v[2] - mean_color[2];
  399. cov[0] += r * r;
  400. cov[1] += r * g;
  401. cov[2] += r * b;
  402. cov[3] += g * g;
  403. cov[4] += g * b;
  404. cov[5] += b * b;
  405. }
  406. float xr = .9f, xg = 1.0f, xb = .7f;
  407. for (uint32_t iter = 0; iter < 3; iter++)
  408. {
  409. float r = xr * cov[0] + xg * cov[1] + xb * cov[2];
  410. float g = xr * cov[1] + xg * cov[3] + xb * cov[4];
  411. float b = xr * cov[2] + xg * cov[4] + xb * cov[5];
  412. float m = maximumf(maximumf(fabsf(r), fabsf(g)), fabsf(b));
  413. if (m > 1e-10f)
  414. {
  415. m = 1.0f / m;
  416. r *= m;
  417. g *= m;
  418. b *= m;
  419. }
  420. xr = r;
  421. xg = g;
  422. xb = b;
  423. }
  424. float len = xr * xr + xg * xg + xb * xb;
  425. vec3F axis;
  426. if (len < 1e-10f)
  427. axis.set(0.0f);
  428. else
  429. {
  430. len = 1.0f / sqrtf(len);
  431. xr *= len;
  432. xg *= len;
  433. xb *= len;
  434. axis.set(xr, xg, xb, 0);
  435. }
  436. if (axis.dot(axis) < .5f)
  437. {
  438. axis.set(1.0f, 1.0f, 1.0f, 0.0f);
  439. axis.normalize_in_place();
  440. }
  441. return axis;
  442. }
  443. static vec3F interp_color(const vec3F& mean, const vec3F& dir, float df, const aabb3F& colorspace_box, const aabb3F& input_box, bool* pInside = nullptr)
  444. {
  445. #if 0
  446. assert(mean[0] >= input_box[0][0]);
  447. assert(mean[1] >= input_box[0][1]);
  448. assert(mean[2] >= input_box[0][2]);
  449. assert(mean[0] <= input_box[1][0]);
  450. assert(mean[1] <= input_box[1][1]);
  451. assert(mean[2] <= input_box[1][2]);
  452. #endif
  453. if (pInside)
  454. *pInside = false;
  455. vec3F k(mean + dir * df);
  456. if (colorspace_box.contains(k))
  457. {
  458. if (pInside)
  459. *pInside = true;
  460. return k;
  461. }
  462. // starts inside
  463. vec3F s(mean);
  464. // ends outside
  465. vec3F e(mean + dir * df);
  466. // a ray guaranteed to go from the outside to inside
  467. ray3F r(e, (s - e).normalize_in_place());
  468. vec3F c;
  469. float t = 0.0f;
  470. intersection::result res = intersection::ray_aabb(c, t, r, input_box);
  471. if (res != intersection::cSuccess)
  472. c = k;
  473. return c;
  474. }
  475. // all in Q16 space, 0-65535
  476. static bool compute_least_squares_endpoints_rgb(
  477. uint32_t N, const uint8_t* pSelectors, const vec4F* pSelector_weights,
  478. vec3F* pXl, vec3F* pXh, const vec4F* pColors, const aabb3F& input_box)
  479. {
  480. // Least squares using normal equations: http://www.cs.cornell.edu/~bindel/class/cs3220-s12/notes/lec10.pdf
  481. // https://web.archive.org/web/20150319232457/http://www.cs.cornell.edu/~bindel/class/cs3220-s12/notes/lec10.pdf
  482. // I did this in matrix form first, expanded out all the ops, then optimized it a bit.
  483. float z00 = 0.0f, z01 = 0.0f, z10 = 0.0f, z11 = 0.0f;
  484. float q00_r = 0.0f, q10_r = 0.0f, t_r = 0.0f;
  485. float q00_g = 0.0f, q10_g = 0.0f, t_g = 0.0f;
  486. float q00_b = 0.0f, q10_b = 0.0f, t_b = 0.0f;
  487. for (uint32_t i = 0; i < N; i++)
  488. {
  489. const uint32_t sel = pSelectors[i];
  490. z00 += pSelector_weights[sel][0];
  491. z10 += pSelector_weights[sel][1];
  492. z11 += pSelector_weights[sel][2];
  493. float w = pSelector_weights[sel][3];
  494. q00_r += w * pColors[i][0];
  495. t_r += pColors[i][0];
  496. q00_g += w * pColors[i][1];
  497. t_g += pColors[i][1];
  498. q00_b += w * pColors[i][2];
  499. t_b += pColors[i][2];
  500. }
  501. q10_r = t_r - q00_r;
  502. q10_g = t_g - q00_g;
  503. q10_b = t_b - q00_b;
  504. z01 = z10;
  505. float det = z00 * z11 - z01 * z10;
  506. if (det == 0.0f)
  507. return false;
  508. det = 1.0f / det;
  509. float iz00, iz01, iz10, iz11;
  510. iz00 = z11 * det;
  511. iz01 = -z01 * det;
  512. iz10 = -z10 * det;
  513. iz11 = z00 * det;
  514. (*pXl)[0] = (float)(iz00 * q00_r + iz01 * q10_r);
  515. (*pXh)[0] = (float)(iz10 * q00_r + iz11 * q10_r);
  516. (*pXl)[1] = (float)(iz00 * q00_g + iz01 * q10_g);
  517. (*pXh)[1] = (float)(iz10 * q00_g + iz11 * q10_g);
  518. (*pXl)[2] = (float)(iz00 * q00_b + iz01 * q10_b);
  519. (*pXh)[2] = (float)(iz10 * q00_b + iz11 * q10_b);
  520. for (uint32_t c = 0; c < 3; c++)
  521. {
  522. float l = (*pXl)[c], h = (*pXh)[c];
  523. if (input_box.get_dim(c) < .0000125f)
  524. {
  525. l = input_box[0][c];
  526. h = input_box[1][c];
  527. }
  528. (*pXl)[c] = l;
  529. (*pXh)[c] = h;
  530. }
  531. vec3F mean((*pXl + *pXh) * .5f);
  532. vec3F dir(*pXh - *pXl);
  533. float ln = dir.length();
  534. if (ln)
  535. {
  536. dir /= ln;
  537. float ld = (*pXl - mean).dot(dir);
  538. float hd = (*pXh - mean).dot(dir);
  539. aabb3F colorspace_box(vec3F(0.0f), vec3F(MAX_QLOG16_VAL));
  540. bool was_inside1 = false;
  541. vec3F l = interp_color(mean, dir, ld, colorspace_box, input_box, &was_inside1);
  542. if (!was_inside1)
  543. *pXl = l;
  544. bool was_inside2 = false;
  545. vec3F h = interp_color(mean, dir, hd, colorspace_box, input_box, &was_inside2);
  546. if (!was_inside2)
  547. *pXh = h;
  548. }
  549. pXl->clamp(0.0f, MAX_QLOG16_VAL);
  550. pXh->clamp(0.0f, MAX_QLOG16_VAL);
  551. return true;
  552. }
  553. static vec4F g_astc_ls_weights_ise[MAX_SUPPORTED_ISE_WEIGHT_INDEX + 1][24];
  554. static uint8_t g_map_astc_to_linear_order[MAX_SUPPORTED_ISE_WEIGHT_INDEX + 1][24]; // [ise_range][astc_index] -> linear index
  555. static uint8_t g_map_linear_to_astc_order[MAX_SUPPORTED_ISE_WEIGHT_INDEX + 1][24]; // [ise_range][linear_index] -> astc_index
  556. static void encode_astc_hdr_init()
  557. {
  558. // Precomputed weight constants used during least fit determination. For each entry: w * w, (1.0f - w) * w, (1.0f - w) * (1.0f - w), w
  559. for (uint32_t range = MIN_SUPPORTED_ISE_WEIGHT_INDEX; range <= MAX_SUPPORTED_ISE_WEIGHT_INDEX; range++)
  560. {
  561. const uint32_t num_levels = g_ise_weight_lerps[range][0];
  562. assert((num_levels >= 3) && (num_levels <= 24));
  563. for (uint32_t i = 0; i < num_levels; i++)
  564. {
  565. float w = g_ise_weight_lerps[range][1 + i] * (1.0f / 64.0f);
  566. g_astc_ls_weights_ise[range][i].set(w * w, (1.0f - w) * w, (1.0f - w) * (1.0f - w), w);
  567. }
  568. }
  569. for (uint32_t ise_range = MIN_SUPPORTED_ISE_WEIGHT_INDEX; ise_range <= MAX_SUPPORTED_ISE_WEIGHT_INDEX; ise_range++)
  570. {
  571. const uint32_t num_levels = g_ise_weight_lerps[ise_range][0];
  572. assert((num_levels >= 3) && (num_levels <= 24));
  573. uint32_t s[32];
  574. for (uint32_t i = 0; i < num_levels; i++)
  575. s[i] = (g_ise_weight_lerps[ise_range][1 + i] << 8) + i;
  576. std::sort(s, s + num_levels);
  577. for (uint32_t i = 0; i < num_levels; i++)
  578. g_map_linear_to_astc_order[ise_range][i] = (uint8_t)(s[i] & 0xFF);
  579. for (uint32_t i = 0; i < num_levels; i++)
  580. g_map_astc_to_linear_order[ise_range][g_map_linear_to_astc_order[ise_range][i]] = (uint8_t)i;
  581. }
  582. }
  583. void interpolate_qlog12_colors(
  584. const int e[2][3],
  585. half_float* pDecoded_half,
  586. vec3F* pDecoded_float,
  587. uint32_t n, uint32_t ise_weight_range)
  588. {
  589. assert((ise_weight_range >= MIN_SUPPORTED_ISE_WEIGHT_INDEX) && (ise_weight_range <= MAX_SUPPORTED_ISE_WEIGHT_INDEX));
  590. for (uint32_t i = 0; i < 2; i++)
  591. {
  592. for (uint32_t j = 0; j < 3; j++)
  593. {
  594. assert(in_range(e[i][j], 0, 0xFFF));
  595. }
  596. }
  597. for (uint32_t i = 0; i < n; i++)
  598. {
  599. const int c = g_ise_weight_lerps[ise_weight_range][1 + i];
  600. assert(c == (int)astc_helpers::dequant_bise_weight(i, ise_weight_range));
  601. half_float rf, gf, bf;
  602. {
  603. uint32_t r0 = e[0][0] << 4;
  604. uint32_t r1 = e[1][0] << 4;
  605. int ri = (r0 * (64 - c) + r1 * c + 32) / 64;
  606. rf = qlog16_to_half_slow(ri);
  607. }
  608. {
  609. uint32_t g0 = e[0][1] << 4;
  610. uint32_t g1 = e[1][1] << 4;
  611. int gi = (g0 * (64 - c) + g1 * c + 32) / 64;
  612. gf = qlog16_to_half_slow(gi);
  613. }
  614. {
  615. uint32_t b0 = e[0][2] << 4;
  616. uint32_t b1 = e[1][2] << 4;
  617. int bi = (b0 * (64 - c) + b1 * c + 32) / 64;
  618. bf = qlog16_to_half_slow(bi);
  619. }
  620. if (pDecoded_half)
  621. {
  622. pDecoded_half[i * 3 + 0] = rf;
  623. pDecoded_half[i * 3 + 1] = gf;
  624. pDecoded_half[i * 3 + 2] = bf;
  625. }
  626. if (pDecoded_float)
  627. {
  628. pDecoded_float[i][0] = half_to_float(rf);
  629. pDecoded_float[i][1] = half_to_float(gf);
  630. pDecoded_float[i][2] = half_to_float(bf);
  631. }
  632. }
  633. }
  634. // decoded in ASTC order, not linear order
  635. // return false if the ISE endpoint quantization leads to non-valid endpoints being decoded
  636. bool get_astc_hdr_mode_11_block_colors(
  637. const uint8_t* pEndpoints,
  638. half_float* pDecoded_half,
  639. vec3F* pDecoded_float,
  640. uint32_t n, uint32_t ise_weight_range, uint32_t ise_endpoint_range)
  641. {
  642. assert((ise_weight_range >= 1) && (ise_weight_range <= MAX_SUPPORTED_ISE_WEIGHT_INDEX));
  643. int e[2][3];
  644. if (!decode_mode11_to_qlog12(pEndpoints, e, ise_endpoint_range))
  645. return false;
  646. interpolate_qlog12_colors(e, pDecoded_half, pDecoded_float, n, ise_weight_range);
  647. return true;
  648. }
  649. // decoded in ASTC order, not linear order
  650. // return false if the ISE endpoint quantization leads to non-valid endpoints being decoded
  651. bool get_astc_hdr_mode_7_block_colors(
  652. const uint8_t* pEndpoints,
  653. half_float* pDecoded_half,
  654. vec3F* pDecoded_float,
  655. uint32_t n, uint32_t ise_weight_range, uint32_t ise_endpoint_range)
  656. {
  657. assert((ise_weight_range >= 1) && (ise_weight_range <= MAX_SUPPORTED_ISE_WEIGHT_INDEX));
  658. int e[2][3];
  659. if (!decode_mode7_to_qlog12(pEndpoints, e, nullptr, ise_endpoint_range))
  660. return false;
  661. interpolate_qlog12_colors(e, pDecoded_half, pDecoded_float, n, ise_weight_range);
  662. return true;
  663. }
  664. // Fast high precision piecewise linear approximation of log2(bias+x).
  665. // Half may be zero, positive or denormal. No NaN/Inf/negative.
  666. static inline double q(half_float x)
  667. {
  668. union { float f; int32_t i; uint32_t u; } fi;
  669. fi.f = fast_half_to_float_pos_not_inf_or_nan(x);
  670. assert(fi.f >= 0.0f);
  671. fi.f += .125f;
  672. return (double)fi.u; // approx log2f(fi.f), need to return double for the precision
  673. }
  674. double eval_selectors(
  675. uint32_t num_pixels,
  676. uint8_t* pWeights,
  677. const half_float* pBlock_pixels_half,
  678. uint32_t num_weight_levels,
  679. const half_float* pDecoded_half,
  680. const astc_hdr_codec_options& coptions,
  681. uint32_t usable_selector_bitmask)
  682. {
  683. assert((num_pixels >= 1) && (num_pixels <= 16));
  684. assert(usable_selector_bitmask);
  685. const float R_WEIGHT = coptions.m_r_err_scale;
  686. const float G_WEIGHT = coptions.m_g_err_scale;
  687. double total_error = 0;
  688. #ifdef _DEBUG
  689. for (uint32_t i = 0; i < num_weight_levels; i++)
  690. {
  691. assert(!is_half_inf_or_nan(pDecoded_half[i * 3 + 0]));
  692. assert(!is_half_inf_or_nan(pDecoded_half[i * 3 + 1]));
  693. assert(!is_half_inf_or_nan(pDecoded_half[i * 3 + 2]));
  694. }
  695. #endif
  696. for (uint32_t p = 0; p < num_pixels; p++)
  697. {
  698. const half_float* pDesired_half = &pBlock_pixels_half[p * 3];
  699. double lowest_e = 1e+30f;
  700. // this is an approximation of MSLE
  701. for (uint32_t i = 0; i < num_weight_levels; i++)
  702. {
  703. if (((1 << i) & usable_selector_bitmask) == 0)
  704. continue;
  705. // compute piecewise linear approximation of log2(a+eps)-log2(b+eps), for each component, then MSLE
  706. double rd = q(pDecoded_half[i * 3 + 0]) - q(pDesired_half[0]);
  707. double gd = q(pDecoded_half[i * 3 + 1]) - q(pDesired_half[1]);
  708. double bd = q(pDecoded_half[i * 3 + 2]) - q(pDesired_half[2]);
  709. double e = R_WEIGHT * (rd * rd) + G_WEIGHT * (gd * gd) + bd * bd;
  710. if (e < lowest_e)
  711. {
  712. lowest_e = e;
  713. pWeights[p] = (uint8_t)i;
  714. }
  715. }
  716. total_error += lowest_e;
  717. } // p
  718. return total_error;
  719. }
  720. //--------------------------------------------------------------------------------------------------------------------------
  721. double compute_block_error(const half_float* pOrig_block, const half_float* pPacked_block, const astc_hdr_codec_options& coptions)
  722. {
  723. const float R_WEIGHT = coptions.m_r_err_scale;
  724. const float G_WEIGHT = coptions.m_g_err_scale;
  725. double total_error = 0;
  726. for (uint32_t p = 0; p < 16; p++)
  727. {
  728. double rd = q(pOrig_block[p * 3 + 0]) - q(pPacked_block[p * 3 + 0]);
  729. double gd = q(pOrig_block[p * 3 + 1]) - q(pPacked_block[p * 3 + 1]);
  730. double bd = q(pOrig_block[p * 3 + 2]) - q(pPacked_block[p * 3 + 2]);
  731. double e = R_WEIGHT * (rd * rd) + G_WEIGHT * (gd * gd) + bd * bd;
  732. total_error += e;
  733. }
  734. return total_error;
  735. }
  736. //--------------------------------------------------------------------------------------------------------------------------
  737. static inline int compute_clamped_val(int v, int l, int h, bool& did_clamp, int& max_clamp_mag)
  738. {
  739. assert(l < h);
  740. if (v < l)
  741. {
  742. max_clamp_mag = basisu::maximum<int>(max_clamp_mag, l - v);
  743. v = l;
  744. did_clamp = true;
  745. }
  746. else if (v > h)
  747. {
  748. max_clamp_mag = basisu::maximum<int>(max_clamp_mag, v - h);
  749. v = h;
  750. did_clamp = true;
  751. }
  752. return v;
  753. }
  754. static bool pack_astc_mode11_submode(uint32_t submode, uint8_t* pEndpoints, const vec3F& low_q16, const vec3F& high_q16, int& max_clamp_mag)
  755. {
  756. assert(submode <= 7);
  757. const uint8_t s_b_bits[8] = { 7, 8, 6, 7, 8, 6, 7, 6 };
  758. const uint8_t s_c_bits[8] = { 6, 6, 7, 7, 6, 7, 7, 7 };
  759. const uint8_t s_d_bits[8] = { 7, 6, 7, 6, 5, 6, 5, 6 };
  760. const uint32_t a_bits = 9 + (submode >> 1);
  761. const uint32_t b_bits = s_b_bits[submode];
  762. const uint32_t c_bits = s_c_bits[submode];
  763. const uint32_t d_bits = s_d_bits[submode];
  764. const int max_a_val = (1 << a_bits) - 1;
  765. const int max_b_val = (1 << b_bits) - 1;
  766. const int max_c_val = (1 << c_bits) - 1;
  767. // The maximum usable value before it turns to NaN/Inf
  768. const int max_a_qlog = get_max_qlog(a_bits);
  769. const int min_d_val = -(1 << (d_bits - 1));
  770. const int max_d_val = -min_d_val - 1;
  771. assert((max_d_val - min_d_val + 1) == (1 << d_bits));
  772. int val_q[2][3];
  773. for (uint32_t c = 0; c < 3; c++)
  774. {
  775. #if 1
  776. // this is better
  777. const half_float l = qlog16_to_half_slow((uint32_t)std::round(low_q16[c]));
  778. val_q[0][c] = half_to_qlog7_12(l, a_bits);
  779. const half_float h = qlog16_to_half_slow((uint32_t)std::round(high_q16[c]));
  780. val_q[1][c] = half_to_qlog7_12(h, a_bits);
  781. #else
  782. val_q[0][c] = quant_qlog16((uint32_t)std::round(low_q16[c]), a_bits);
  783. val_q[1][c] = quant_qlog16((uint32_t)std::round(high_q16[c]), a_bits);
  784. #endif
  785. #if 1
  786. if (val_q[0][c] == val_q[1][c])
  787. {
  788. #if 0
  789. if (l <= h)
  790. #else
  791. if (low_q16[c] < high_q16[c])
  792. #endif
  793. {
  794. if (val_q[0][c])
  795. val_q[0][c]--;
  796. if (val_q[1][c] != max_a_val)
  797. val_q[1][c]++;
  798. }
  799. else
  800. {
  801. if (val_q[0][c] != max_a_val)
  802. val_q[0][c]++;
  803. if (val_q[1][c])
  804. val_q[1][c]--;
  805. }
  806. }
  807. #endif
  808. val_q[0][c] = minimum<uint32_t>(val_q[0][c], max_a_qlog);
  809. val_q[1][c] = minimum<uint32_t>(val_q[1][c], max_a_qlog);
  810. }
  811. int highest_q = -1, highest_val = 0, highest_comp = 0;
  812. for (uint32_t v = 0; v < 2; v++)
  813. {
  814. for (uint32_t c = 0; c < 3; c++)
  815. {
  816. assert(val_q[v][c] >= 0 && val_q[v][c] <= max_a_val);
  817. if (val_q[v][c] > highest_q)
  818. {
  819. highest_q = val_q[v][c];
  820. highest_val = v;
  821. highest_comp = c;
  822. }
  823. }
  824. }
  825. const bool had_tie = (val_q[highest_val ^ 1][highest_comp] == highest_q);
  826. if (highest_val != 1)
  827. {
  828. for (uint32_t c = 0; c < 3; c++)
  829. {
  830. std::swap(val_q[0][c], val_q[1][c]);
  831. }
  832. }
  833. if (highest_comp)
  834. {
  835. std::swap(val_q[0][0], val_q[0][highest_comp]);
  836. std::swap(val_q[1][0], val_q[1][highest_comp]);
  837. }
  838. int orig_q[2][3];
  839. memcpy(orig_q, val_q, sizeof(val_q));
  840. // val[1][0] is now guaranteed to be highest
  841. int best_va = 0, best_vb0 = 0, best_vb1 = 0, best_vc = 0, best_vd0 = 0, best_vd1 = 0;
  842. int best_max_clamp_mag = 0;
  843. bool best_did_clamp = false;
  844. int best_q[2][3] = { { 0, 0, 0}, { 0, 0, 0 } };
  845. BASISU_NOTE_UNUSED(best_q);
  846. uint32_t best_dist = UINT_MAX;
  847. for (uint32_t pass = 0; pass < 2; pass++)
  848. {
  849. int trial_va = val_q[1][0];
  850. assert(trial_va <= max_a_val);
  851. assert(trial_va >= val_q[1][1]);
  852. assert(trial_va >= val_q[1][2]);
  853. assert(trial_va >= val_q[0][0]);
  854. assert(trial_va >= val_q[0][1]);
  855. assert(trial_va >= val_q[0][2]);
  856. bool did_clamp = false;
  857. int trial_max_clamp_mag = 0;
  858. int trial_vb0 = compute_clamped_val(trial_va - val_q[1][1], 0, max_b_val, did_clamp, trial_max_clamp_mag);
  859. int trial_vb1 = compute_clamped_val(trial_va - val_q[1][2], 0, max_b_val, did_clamp, trial_max_clamp_mag);
  860. int trial_vc = compute_clamped_val(trial_va - val_q[0][0], 0, max_c_val, did_clamp, trial_max_clamp_mag);
  861. int trial_vd0 = compute_clamped_val((trial_va - trial_vb0 - trial_vc) - val_q[0][1], min_d_val, max_d_val, did_clamp, trial_max_clamp_mag);
  862. int trial_vd1 = compute_clamped_val((trial_va - trial_vb1 - trial_vc) - val_q[0][2], min_d_val, max_d_val, did_clamp, trial_max_clamp_mag);
  863. if (!did_clamp)
  864. {
  865. // Make sure decoder gets the expected values
  866. assert(trial_va == val_q[1][0]);
  867. assert(trial_va - trial_vb0 == val_q[1][1]);
  868. assert(trial_va - trial_vb1 == val_q[1][2]);
  869. assert((trial_va - trial_vc) == val_q[0][0]);
  870. assert((trial_va - trial_vb0 - trial_vc - trial_vd0) == val_q[0][1]);
  871. assert((trial_va - trial_vb1 - trial_vc - trial_vd1) == val_q[0][2]);
  872. }
  873. const int r_e0 = clamp<int>(trial_va, 0, max_a_val);
  874. const int r_e1 = clamp<int>(trial_va - trial_vb0, 0, max_a_val);
  875. const int r_e2 = clamp<int>(trial_va - trial_vb1, 0, max_a_val);
  876. const int r_f0 = clamp<int>(trial_va - trial_vc, 0, max_a_val);
  877. const int r_f1 = clamp<int>(trial_va - trial_vb0 - trial_vc - trial_vd0, 0, max_a_val);
  878. const int r_f2 = clamp<int>(trial_va - trial_vb1 - trial_vc - trial_vd1, 0, max_a_val);
  879. assert(r_e0 <= max_a_qlog);
  880. assert(r_e1 <= max_a_qlog);
  881. assert(r_e2 <= max_a_qlog);
  882. assert(r_f0 <= max_a_qlog);
  883. assert(r_f1 <= max_a_qlog);
  884. assert(r_f2 <= max_a_qlog);
  885. if ((!did_clamp) || (!had_tie))
  886. {
  887. best_va = trial_va;
  888. best_vb0 = trial_vb0;
  889. best_vb1 = trial_vb1;
  890. best_vc = trial_vc;
  891. best_vd0 = trial_vd0;
  892. best_vd1 = trial_vd1;
  893. best_max_clamp_mag = trial_max_clamp_mag;
  894. best_did_clamp = did_clamp;
  895. best_q[1][0] = r_e0;
  896. best_q[1][1] = r_e1;
  897. best_q[1][2] = r_e2;
  898. best_q[0][0] = r_f0;
  899. best_q[0][1] = r_f1;
  900. best_q[0][2] = r_f2;
  901. break;
  902. }
  903. // we had a tie and it did clamp, try swapping L/H for a potential slight gain
  904. const uint32_t r_dist1 = basisu::square<int>(r_e0 - val_q[1][0]) + basisu::square<int>(r_e1 - val_q[1][1]) + basisu::square<int>(r_e2 - val_q[1][2]);
  905. const uint32_t r_dist0 = basisu::square<int>(r_f0 - val_q[0][0]) + basisu::square<int>(r_f1 - val_q[0][1]) + basisu::square<int>(r_f2 - val_q[0][2]);
  906. const uint32_t total_dist = r_dist1 + r_dist0;
  907. if (total_dist < best_dist)
  908. {
  909. best_dist = total_dist;
  910. best_va = trial_va;
  911. best_vb0 = trial_vb0;
  912. best_vb1 = trial_vb1;
  913. best_vc = trial_vc;
  914. best_vd0 = trial_vd0;
  915. best_vd1 = trial_vd1;
  916. best_did_clamp = did_clamp;
  917. best_q[1][0] = r_e0;
  918. best_q[1][1] = r_e1;
  919. best_q[1][2] = r_e2;
  920. best_q[0][0] = r_f0;
  921. best_q[0][1] = r_f1;
  922. best_q[0][2] = r_f2;
  923. }
  924. for (uint32_t c = 0; c < 3; c++)
  925. std::swap(val_q[0][c], val_q[1][c]);
  926. }
  927. // pack bits now
  928. int v0 = 0, v1 = 0, v2 = 0, v3 = 0, v4 = 0, v5 = 0;
  929. int x0 = 0, x1 = 0, x2 = 0, x3 = 0, x4 = 0, x5 = 0;
  930. switch (submode)
  931. {
  932. case 0:
  933. x0 = get_bit(best_vb0, 6); x1 = get_bit(best_vb1, 6); x2 = get_bit(best_vd0, 6); x3 = get_bit(best_vd1, 6); x4 = get_bit(best_vd0, 5); x5 = get_bit(best_vd1, 5);
  934. break;
  935. case 1:
  936. x0 = get_bit(best_vb0, 6); x1 = get_bit(best_vb1, 6); x2 = get_bit(best_vb0, 7); x3 = get_bit(best_vb1, 7); x4 = get_bit(best_vd0, 5); x5 = get_bit(best_vd1, 5);
  937. break;
  938. case 2:
  939. x0 = get_bit(best_va, 9); x1 = get_bit(best_vc, 6); x2 = get_bit(best_vd0, 6); x3 = get_bit(best_vd1, 6); x4 = get_bit(best_vd0, 5); x5 = get_bit(best_vd1, 5);
  940. break;
  941. case 3:
  942. x0 = get_bit(best_vb0, 6); x1 = get_bit(best_vb1, 6); x2 = get_bit(best_va, 9); x3 = get_bit(best_vc, 6); x4 = get_bit(best_vd0, 5); x5 = get_bit(best_vd1, 5);
  943. break;
  944. case 4:
  945. x0 = get_bit(best_vb0, 6); x1 = get_bit(best_vb1, 6); x2 = get_bit(best_vb0, 7); x3 = get_bit(best_vb1, 7); x4 = get_bit(best_va, 9); x5 = get_bit(best_va, 10);
  946. break;
  947. case 5:
  948. x0 = get_bit(best_va, 9); x1 = get_bit(best_va, 10); x2 = get_bit(best_vc, 7); x3 = get_bit(best_vc, 6); x4 = get_bit(best_vd0, 5); x5 = get_bit(best_vd1, 5);
  949. break;
  950. case 6:
  951. x0 = get_bit(best_vb0, 6); x1 = get_bit(best_vb1, 6); x2 = get_bit(best_va, 11); x3 = get_bit(best_vc, 6); x4 = get_bit(best_va, 9); x5 = get_bit(best_va, 10);
  952. break;
  953. case 7:
  954. x0 = get_bit(best_va, 9); x1 = get_bit(best_va, 10); x2 = get_bit(best_va, 11); x3 = get_bit(best_vc, 6); x4 = get_bit(best_vd0, 5); x5 = get_bit(best_vd1, 5);
  955. break;
  956. default:
  957. break;
  958. }
  959. // write mode
  960. pack_bit(v1, 7, submode, 0);
  961. pack_bit(v2, 7, submode, 1);
  962. pack_bit(v3, 7, submode, 2);
  963. // highest component
  964. pack_bit(v4, 7, highest_comp, 0);
  965. pack_bit(v5, 7, highest_comp, 1);
  966. // write bit 8 of va
  967. pack_bit(v1, 6, best_va, 8);
  968. // extra bits
  969. pack_bit(v2, 6, x0);
  970. pack_bit(v3, 6, x1);
  971. pack_bit(v4, 6, x2);
  972. pack_bit(v5, 6, x3);
  973. pack_bit(v4, 5, x4);
  974. pack_bit(v5, 5, x5);
  975. v0 = best_va & 0xFF;
  976. v1 |= (best_vc & 63);
  977. v2 |= (best_vb0 & 63);
  978. v3 |= (best_vb1 & 63);
  979. v4 |= (best_vd0 & 31);
  980. v5 |= (best_vd1 & 31);
  981. assert(in_range(v0, 0, 255) && in_range(v1, 0, 255) && in_range(v2, 0, 255) && in_range(v3, 0, 255) && in_range(v4, 0, 255) && in_range(v5, 0, 255));
  982. pEndpoints[0] = (uint8_t)v0;
  983. pEndpoints[1] = (uint8_t)v1;
  984. pEndpoints[2] = (uint8_t)v2;
  985. pEndpoints[3] = (uint8_t)v3;
  986. pEndpoints[4] = (uint8_t)v4;
  987. pEndpoints[5] = (uint8_t)v5;
  988. #ifdef _DEBUG
  989. // Test for valid pack by unpacking
  990. {
  991. if (highest_comp)
  992. {
  993. std::swap(best_q[0][0], best_q[0][highest_comp]);
  994. std::swap(best_q[1][0], best_q[1][highest_comp]);
  995. std::swap(orig_q[0][0], orig_q[0][highest_comp]);
  996. std::swap(orig_q[1][0], orig_q[1][highest_comp]);
  997. }
  998. int test_e[2][3];
  999. decode_mode11_to_qlog12(pEndpoints, test_e, astc_helpers::BISE_256_LEVELS);
  1000. for (uint32_t i = 0; i < 2; i++)
  1001. {
  1002. for (uint32_t j = 0; j < 3; j++)
  1003. {
  1004. assert(best_q[i][j] == test_e[i][j] >> (12 - a_bits));
  1005. if (!best_did_clamp)
  1006. {
  1007. assert((orig_q[i][j] == test_e[i][j] >> (12 - a_bits)) ||
  1008. (orig_q[1 - i][j] == test_e[i][j] >> (12 - a_bits)));
  1009. }
  1010. }
  1011. }
  1012. }
  1013. #endif
  1014. max_clamp_mag = best_max_clamp_mag;
  1015. return best_did_clamp;
  1016. }
  1017. //--------------------------------------------------------------------------------------------------------------------------
  1018. static void pack_astc_mode11_direct(uint8_t* pEndpoints, const vec3F& l_q16, const vec3F& h_q16)
  1019. {
  1020. for (uint32_t i = 0; i < 3; i++)
  1021. {
  1022. // TODO: This goes from QLOG16->HALF->QLOG8/7
  1023. half_float l_half = qlog16_to_half_slow(clamp((int)std::round(l_q16[i]), 0, 65535));
  1024. half_float h_half = qlog16_to_half_slow(clamp((int)std::round(h_q16[i]), 0, 65535));
  1025. int l_q, h_q;
  1026. if (i == 2)
  1027. {
  1028. l_q = g_half_to_qlog7[bounds_check((uint32_t)l_half, 0U, 32768U)];
  1029. h_q = g_half_to_qlog7[bounds_check((uint32_t)h_half, 0U, 32768U)];
  1030. l_q = minimum<uint32_t>(l_q, MAX_QLOG7);
  1031. h_q = minimum<uint32_t>(h_q, MAX_QLOG7);
  1032. }
  1033. else
  1034. {
  1035. l_q = g_half_to_qlog8[bounds_check((uint32_t)l_half, 0U, 32768U)];
  1036. h_q = g_half_to_qlog8[bounds_check((uint32_t)h_half, 0U, 32768U)];
  1037. l_q = minimum<uint32_t>(l_q, MAX_QLOG8);
  1038. h_q = minimum<uint32_t>(h_q, MAX_QLOG8);
  1039. }
  1040. #if 1
  1041. if (l_q == h_q)
  1042. {
  1043. const int m = (i == 2) ? MAX_QLOG7 : MAX_QLOG8;
  1044. if (l_q16[i] <= h_q16[i])
  1045. {
  1046. if (l_q)
  1047. l_q--;
  1048. if (h_q != m)
  1049. h_q++;
  1050. }
  1051. else
  1052. {
  1053. if (h_q)
  1054. h_q--;
  1055. if (l_q != m)
  1056. l_q++;
  1057. }
  1058. }
  1059. #endif
  1060. if (i == 2)
  1061. {
  1062. assert(l_q <= (int)MAX_QLOG7 && h_q <= (int)MAX_QLOG7);
  1063. l_q |= 128;
  1064. h_q |= 128;
  1065. }
  1066. else
  1067. {
  1068. assert(l_q <= (int)MAX_QLOG8 && h_q <= (int)MAX_QLOG8);
  1069. }
  1070. pEndpoints[2 * i + 0] = (uint8_t)l_q;
  1071. pEndpoints[2 * i + 1] = (uint8_t)h_q;
  1072. }
  1073. }
  1074. //--------------------------------------------------------------------------------------------------------------------------
  1075. static bool pack_astc_mode7_submode(uint32_t submode, uint8_t* pEndpoints, const vec3F& rgb_q16, float s_q16, int& max_clamp_mag, uint32_t ise_weight_range)
  1076. {
  1077. assert((ise_weight_range >= 1) && (ise_weight_range <= MAX_SUPPORTED_ISE_WEIGHT_INDEX));
  1078. assert(submode <= 5);
  1079. max_clamp_mag = 0;
  1080. static const uint8_t s_r_bits[6] = { 11, 11, 10, 9, 8, 7 };
  1081. static const uint8_t s_g_b_bits[6] = { 5, 6, 5, 6, 7, 7 };
  1082. static const uint8_t s_s_bits[6] = { 7, 5, 8, 7, 6, 7 };
  1083. // The precision of the components
  1084. const uint32_t prec_bits = s_r_bits[submode];
  1085. int qlog[4], pack_bits[4];
  1086. for (uint32_t i = 0; i < 4; i++)
  1087. {
  1088. const float f = (i == 3) ? s_q16 : rgb_q16[i];
  1089. // The # of bits the component is packed into
  1090. if (i == 0)
  1091. pack_bits[i] = s_r_bits[submode];
  1092. else if (i == 3)
  1093. pack_bits[i] = s_s_bits[submode];
  1094. else
  1095. pack_bits[i] = s_g_b_bits[submode];
  1096. #if 0
  1097. // this is slightly worse
  1098. // TODO: going from qlog16 to half loses some precision. Then going from half to qlog 7-12 will have extra error.
  1099. half_float h = qlog_to_half(clamp((int)std::round(f), 0, MAX_QLOG16), 16);
  1100. qlog[i] = half_to_qlog7_12((half_float)bounds_check((uint32_t)h, 0U, 32768U), prec_bits);
  1101. #else
  1102. qlog[i] = quant_qlog16(clamp<int>((int)std::round(f), 0, MAX_QLOG16), prec_bits);
  1103. // Only bias if there are enough texel weights, 4=6 weights
  1104. if (ise_weight_range >= 4)
  1105. {
  1106. // Explictly bias the high color, and the scale up, to better exploit the weights.
  1107. // The quantized range also then encompases the complete input range.
  1108. const uint32_t max_val = (1 << prec_bits) - 1;
  1109. const uint32_t K = 3;
  1110. if (i == 3)
  1111. {
  1112. qlog[i] = minimum<uint32_t>(qlog[i] + K * 2, max_val);
  1113. }
  1114. else
  1115. {
  1116. qlog[i] = minimum<uint32_t>(qlog[i] + K, max_val);
  1117. }
  1118. }
  1119. #endif
  1120. if (i != 3)
  1121. qlog[i] = minimum<uint32_t>(qlog[i], get_max_qlog(prec_bits));
  1122. // If S=0, we lose freedom for the texel weights to add any value.
  1123. if ((i == 3) && (qlog[i] == 0))
  1124. qlog[i] = 1;
  1125. }
  1126. uint32_t maj_index = 0;
  1127. bool did_clamp = false;
  1128. if (submode != 5)
  1129. {
  1130. int largest_qlog = 0;
  1131. for (uint32_t i = 0; i < 3; i++)
  1132. {
  1133. if (qlog[i] > largest_qlog)
  1134. {
  1135. largest_qlog = qlog[i];
  1136. maj_index = i;
  1137. }
  1138. }
  1139. if (maj_index)
  1140. {
  1141. std::swap(qlog[0], qlog[maj_index]);
  1142. }
  1143. assert(qlog[0] >= qlog[1]);
  1144. assert(qlog[0] >= qlog[2]);
  1145. qlog[1] = qlog[0] - qlog[1];
  1146. qlog[2] = qlog[0] - qlog[2];
  1147. for (uint32_t i = 1; i < 4; i++)
  1148. {
  1149. const int max_val = (1 << pack_bits[i]) - 1;
  1150. if (qlog[i] > max_val)
  1151. {
  1152. max_clamp_mag = maximum<int>(max_clamp_mag, qlog[i] - max_val);
  1153. qlog[i] = max_val;
  1154. did_clamp = true;
  1155. }
  1156. }
  1157. }
  1158. for (uint32_t i = 0; i < 4; i++)
  1159. {
  1160. const int max_val = (1 << pack_bits[i]) - 1; (void)max_val;
  1161. assert(qlog[i] <= max_val);
  1162. }
  1163. int mode = 0;
  1164. int r = qlog[0] & 63; // 6-bits
  1165. int g = qlog[1] & 31; // 5-bits
  1166. int b = qlog[2] & 31; // 5-bits
  1167. int s = qlog[3] & 31; // 5-bits
  1168. int x0 = 0, x1 = 0, x2 = 0, x3 = 0, x4 = 0, x5 = 0, x6 = 0;
  1169. switch (submode)
  1170. {
  1171. case 0:
  1172. {
  1173. mode = (maj_index << 2) | 0;
  1174. assert((mode & 0xC) != 0xC);
  1175. x0 = get_bit(qlog[0], 9); // R9
  1176. x1 = get_bit(qlog[0], 8); // R8
  1177. x2 = get_bit(qlog[0], 7); // R7
  1178. x3 = get_bit(qlog[0], 10); // R10
  1179. x4 = get_bit(qlog[0], 6); // R6
  1180. x5 = get_bit(qlog[3], 6); // S6
  1181. x6 = get_bit(qlog[3], 5); // S5
  1182. break;
  1183. }
  1184. case 1:
  1185. {
  1186. mode = (maj_index << 2) | 1;
  1187. assert((mode & 0xC) != 0xC);
  1188. x0 = get_bit(qlog[0], 8); // R8
  1189. x1 = get_bit(qlog[1], 5); // G5
  1190. x2 = get_bit(qlog[0], 7); // R7
  1191. x3 = get_bit(qlog[2], 5); // B5
  1192. x4 = get_bit(qlog[0], 6); // R6
  1193. x5 = get_bit(qlog[0], 10); // R10
  1194. x6 = get_bit(qlog[0], 9); // R9
  1195. break;
  1196. }
  1197. case 2:
  1198. {
  1199. mode = (maj_index << 2) | 2;
  1200. assert((mode & 0xC) != 0xC);
  1201. x0 = get_bit(qlog[0], 9); // R9
  1202. x1 = get_bit(qlog[0], 8); // R8
  1203. x2 = get_bit(qlog[0], 7); // R7
  1204. x3 = get_bit(qlog[0], 6); // R6
  1205. x4 = get_bit(qlog[3], 7); // S7
  1206. x5 = get_bit(qlog[3], 6); // S6
  1207. x6 = get_bit(qlog[3], 5); // S5
  1208. break;
  1209. }
  1210. case 3:
  1211. {
  1212. mode = (maj_index << 2) | 3;
  1213. assert((mode & 0xC) != 0xC);
  1214. x0 = get_bit(qlog[0], 8); // R8
  1215. x1 = get_bit(qlog[1], 5); // G5
  1216. x2 = get_bit(qlog[0], 7); // R7
  1217. x3 = get_bit(qlog[2], 5); // B5
  1218. x4 = get_bit(qlog[0], 6); // R6
  1219. x5 = get_bit(qlog[3], 6); // S6
  1220. x6 = get_bit(qlog[3], 5); // S5
  1221. break;
  1222. }
  1223. case 4:
  1224. {
  1225. mode = maj_index | 0xC; // 0b1100
  1226. assert((mode & 0xC) == 0xC);
  1227. assert(mode != 0xF);
  1228. x0 = get_bit(qlog[1], 6); // G6
  1229. x1 = get_bit(qlog[1], 5); // G5
  1230. x2 = get_bit(qlog[2], 6); // B6
  1231. x3 = get_bit(qlog[2], 5); // B5
  1232. x4 = get_bit(qlog[0], 6); // R6
  1233. x5 = get_bit(qlog[0], 7); // R7
  1234. x6 = get_bit(qlog[3], 5); // S5
  1235. break;
  1236. }
  1237. case 5:
  1238. {
  1239. mode = 0xF;
  1240. x0 = get_bit(qlog[1], 6); // G6
  1241. x1 = get_bit(qlog[1], 5); // G5
  1242. x2 = get_bit(qlog[2], 6); // B6
  1243. x3 = get_bit(qlog[2], 5); // B5
  1244. x4 = get_bit(qlog[0], 6); // R6
  1245. x5 = get_bit(qlog[3], 6); // S6
  1246. x6 = get_bit(qlog[3], 5); // S5
  1247. break;
  1248. }
  1249. default:
  1250. {
  1251. assert(0);
  1252. break;
  1253. }
  1254. }
  1255. pEndpoints[0] = (uint8_t)((get_bit(mode, 1) << 7) | (get_bit(mode, 0) << 6) | r);
  1256. pEndpoints[1] = (uint8_t)((get_bit(mode, 2) << 7) | (x0 << 6) | (x1 << 5) | g);
  1257. pEndpoints[2] = (uint8_t)((get_bit(mode, 3) << 7) | (x2 << 6) | (x3 << 5) | b);
  1258. pEndpoints[3] = (uint8_t)((x4 << 7) | (x5 << 6) | (x6 << 5) | s);
  1259. #ifdef _DEBUG
  1260. // Test for valid pack by unpacking
  1261. {
  1262. const int inv_shift = 12 - prec_bits;
  1263. int unpacked_e[2][3];
  1264. if (submode != 5)
  1265. {
  1266. unpacked_e[1][0] = left_shift32(qlog[0], inv_shift);
  1267. unpacked_e[1][1] = clamp(left_shift32((qlog[0] - qlog[1]), inv_shift), 0, 0xFFF);
  1268. unpacked_e[1][2] = clamp(left_shift32((qlog[0] - qlog[2]), inv_shift), 0, 0xFFF);
  1269. unpacked_e[0][0] = clamp(left_shift32((qlog[0] - qlog[3]), inv_shift), 0, 0xFFF);
  1270. unpacked_e[0][1] = clamp(left_shift32(((qlog[0] - qlog[1]) - qlog[3]), inv_shift), 0, 0xFFF);
  1271. unpacked_e[0][2] = clamp(left_shift32(((qlog[0] - qlog[2]) - qlog[3]), inv_shift), 0, 0xFFF);
  1272. }
  1273. else
  1274. {
  1275. unpacked_e[1][0] = left_shift32(qlog[0], inv_shift);
  1276. unpacked_e[1][1] = left_shift32(qlog[1], inv_shift);
  1277. unpacked_e[1][2] = left_shift32(qlog[2], inv_shift);
  1278. unpacked_e[0][0] = clamp(left_shift32((qlog[0] - qlog[3]), inv_shift), 0, 0xFFF);
  1279. unpacked_e[0][1] = clamp(left_shift32((qlog[1] - qlog[3]), inv_shift), 0, 0xFFF);
  1280. unpacked_e[0][2] = clamp(left_shift32((qlog[2] - qlog[3]), inv_shift), 0, 0xFFF);
  1281. }
  1282. if (maj_index)
  1283. {
  1284. std::swap(unpacked_e[0][0], unpacked_e[0][maj_index]);
  1285. std::swap(unpacked_e[1][0], unpacked_e[1][maj_index]);
  1286. }
  1287. int e[2][3];
  1288. decode_mode7_to_qlog12_ise20(pEndpoints, e, nullptr);
  1289. for (uint32_t i = 0; i < 3; i++)
  1290. {
  1291. assert(unpacked_e[0][i] == e[0][i]);
  1292. assert(unpacked_e[1][i] == e[1][i]);
  1293. }
  1294. }
  1295. #endif
  1296. return did_clamp;
  1297. }
  1298. //--------------------------------------------------------------------------------------------------------------------------
  1299. static void quantize_ise_endpoints(uint32_t ise_endpoint_range, const uint8_t* pSrc_endpoints, uint8_t *pDst_endpoints, uint32_t n)
  1300. {
  1301. assert((ise_endpoint_range >= astc_helpers::FIRST_VALID_ENDPOINT_ISE_RANGE) && (ise_endpoint_range <= astc_helpers::LAST_VALID_ENDPOINT_ISE_RANGE));
  1302. if (ise_endpoint_range == astc_helpers::BISE_256_LEVELS)
  1303. {
  1304. memcpy(pDst_endpoints, pSrc_endpoints, n);
  1305. }
  1306. else
  1307. {
  1308. for (uint32_t i = 0; i < n; i++)
  1309. {
  1310. uint32_t v = pSrc_endpoints[i];
  1311. assert(v <= 255);
  1312. pDst_endpoints[i] = astc_helpers::g_dequant_tables.get_endpoint_tab(ise_endpoint_range).m_val_to_ise[v];
  1313. }
  1314. }
  1315. }
  1316. //--------------------------------------------------------------------------------------------------------------------------
  1317. // Note this could fail to find any valid solution if use_endpoint_range!=20.
  1318. // Returns true if improved.
  1319. static bool try_mode11(uint32_t num_pixels,
  1320. uint8_t* pEndpoints, uint8_t* pWeights, double& cur_block_error, uint32_t& submode_used,
  1321. vec3F& low_color_q16, const vec3F& high_color_q16,
  1322. half_float block_pixels_half[16][3],
  1323. uint32_t num_weight_levels, uint32_t ise_weight_range, const astc_hdr_codec_options& coptions, bool direct_only, uint32_t ise_endpoint_range,
  1324. bool constrain_ise_weight8_selectors,
  1325. int32_t first_submode, int32_t last_submode) // -1, 7
  1326. {
  1327. assert((ise_weight_range >= 1) && (ise_weight_range <= MAX_SUPPORTED_ISE_WEIGHT_INDEX));
  1328. assert((num_weight_levels >= 3) && (num_weight_levels <= 32));
  1329. assert((num_pixels >= 1) && (num_pixels <= 16));
  1330. bool improved_flag = false;
  1331. half_float decoded_half[32][3];
  1332. vec3F decoded_float[32];
  1333. uint8_t orig_trial_endpoints[NUM_MODE11_ENDPOINTS], trial_endpoints[NUM_MODE11_ENDPOINTS], trial_weights[16];
  1334. if (direct_only)
  1335. {
  1336. first_submode = -1;
  1337. last_submode = -1;
  1338. }
  1339. assert(first_submode <= last_submode);
  1340. assert((first_submode >= -1) && (first_submode <= 7));
  1341. assert((last_submode >= -1) && (last_submode <= 7));
  1342. // TODO: First determine if a submode doesn't clamp first. If one is found, encode to that and we're done.
  1343. for (int submode = last_submode; submode >= first_submode; submode--)
  1344. {
  1345. bool did_clamp = false;
  1346. int max_clamp_mag = 0;
  1347. if (submode == -1)
  1348. {
  1349. // If it had to clamp with one of the submodes, try direct which can't clamp, but has low precision.
  1350. pack_astc_mode11_direct(orig_trial_endpoints, low_color_q16, high_color_q16);
  1351. }
  1352. else
  1353. {
  1354. did_clamp = pack_astc_mode11_submode(submode, orig_trial_endpoints, low_color_q16, high_color_q16, max_clamp_mag);
  1355. // If it had to clamp and the clamp was too high, it'll distort the endpoint colors too much, which could lead to noticeable artifacts.
  1356. const int MAX_CLAMP_MAG_ACCEPT_THRESH = 4;
  1357. if ((did_clamp) && (max_clamp_mag > MAX_CLAMP_MAG_ACCEPT_THRESH))
  1358. continue;
  1359. }
  1360. // This will distort the endpoints if the ISE endpoint range isn't 256 levels (20).
  1361. // It could massively distort the endpoints, but still result in a valid encoding.
  1362. quantize_ise_endpoints(ise_endpoint_range, orig_trial_endpoints, trial_endpoints, NUM_MODE11_ENDPOINTS);
  1363. if (!get_astc_hdr_mode_11_block_colors(trial_endpoints, &decoded_half[0][0], decoded_float, num_weight_levels, ise_weight_range, ise_endpoint_range))
  1364. continue;
  1365. uint32_t usable_selector_bitmask = UINT32_MAX;
  1366. if ((constrain_ise_weight8_selectors) && (ise_weight_range == astc_helpers::BISE_16_LEVELS))
  1367. usable_selector_bitmask = (1 << 0) | (1 << 1) | (1 << 4) | (1 << 5) | (1 << 10) | (1 << 11) | (1 << 14) | (1 << 15);
  1368. double trial_blk_error = eval_selectors(num_pixels, trial_weights, &block_pixels_half[0][0], num_weight_levels, &decoded_half[0][0], coptions, usable_selector_bitmask);
  1369. if (trial_blk_error < cur_block_error)
  1370. {
  1371. cur_block_error = trial_blk_error;
  1372. memcpy(pEndpoints, trial_endpoints, NUM_MODE11_ENDPOINTS);
  1373. memcpy(pWeights, trial_weights, num_pixels);
  1374. submode_used = submode + 1;
  1375. improved_flag = true;
  1376. }
  1377. // If it didn't clamp it was a lossless encode at this precision, so we can stop early as there's probably no use trying lower precision submodes.
  1378. // (Although it may be, because a lower precision pack could try nearby voxel coords.)
  1379. // However, at lower levels quantization may cause the decoded endpoints to be very distorted, so we need to evaluate up to direct.
  1380. if (ise_endpoint_range == astc_helpers::BISE_256_LEVELS)
  1381. {
  1382. if (!did_clamp)
  1383. break;
  1384. }
  1385. }
  1386. return improved_flag;
  1387. }
  1388. //--------------------------------------------------------------------------------------------------------------------------
  1389. static bool try_mode7(
  1390. uint32_t num_pixels,
  1391. uint8_t* pEndpoints, uint8_t* pWeights, double& cur_block_error, uint32_t& submode_used,
  1392. vec3F& high_color_q16, const float s_q16,
  1393. half_float block_pixels_half[16][3],
  1394. uint32_t num_weight_levels, uint32_t ise_weight_range, const astc_hdr_codec_options& coptions,
  1395. uint32_t ise_endpoint_range)
  1396. {
  1397. assert((ise_weight_range >= 1) && (ise_weight_range <= MAX_SUPPORTED_ISE_WEIGHT_INDEX));
  1398. assert((num_pixels >= 1) && (num_pixels <= 16));
  1399. bool improved_flag = false;
  1400. half_float decoded_half[24][3];
  1401. vec3F decoded_float[24];
  1402. uint8_t orig_trial_endpoints[NUM_MODE7_ENDPOINTS], trial_endpoints[NUM_MODE7_ENDPOINTS], trial_weights[16];
  1403. // TODO: First determine if a submode doesn't clamp first. If one is found, encode to that and we're done.
  1404. for (int submode = 0; submode <= 5; submode++)
  1405. {
  1406. int max_clamp_mag = 0;
  1407. const bool did_clamp = pack_astc_mode7_submode(submode, orig_trial_endpoints, high_color_q16, s_q16, max_clamp_mag, ise_weight_range);
  1408. if (submode < 5)
  1409. {
  1410. const int MAX_CLAMP_MAG_ACCEPT_THRESH = 4;
  1411. if ((did_clamp) && (max_clamp_mag > MAX_CLAMP_MAG_ACCEPT_THRESH))
  1412. continue;
  1413. }
  1414. // This will distort the endpoints if the ISE endpoint range isn't 256 levels (20).
  1415. // It could massively distort the endpoints, but still result in a valid encoding.
  1416. quantize_ise_endpoints(ise_endpoint_range, orig_trial_endpoints, trial_endpoints, NUM_MODE7_ENDPOINTS);
  1417. if (!get_astc_hdr_mode_7_block_colors(trial_endpoints, &decoded_half[0][0], decoded_float, num_weight_levels, ise_weight_range, ise_endpoint_range))
  1418. continue;
  1419. double trial_blk_error = eval_selectors(num_pixels, trial_weights, &block_pixels_half[0][0], num_weight_levels, &decoded_half[0][0], coptions);
  1420. if (trial_blk_error < cur_block_error)
  1421. {
  1422. cur_block_error = trial_blk_error;
  1423. memcpy(pEndpoints, trial_endpoints, NUM_MODE7_ENDPOINTS);
  1424. memcpy(pWeights, trial_weights, num_pixels);
  1425. submode_used = submode;
  1426. improved_flag = true;
  1427. }
  1428. if (ise_endpoint_range == astc_helpers::BISE_256_LEVELS)
  1429. {
  1430. if (!did_clamp)
  1431. break;
  1432. }
  1433. }
  1434. return improved_flag;
  1435. }
  1436. //--------------------------------------------------------------------------------------------------------------------------
  1437. static double encode_astc_hdr_block_mode_11(
  1438. uint32_t num_pixels,
  1439. const vec4F* pBlock_pixels,
  1440. uint32_t ise_weight_range,
  1441. uint32_t& best_submode,
  1442. double cur_block_error,
  1443. uint8_t* blk_endpoints, uint8_t* blk_weights,
  1444. const astc_hdr_codec_options& coptions,
  1445. bool direct_only,
  1446. uint32_t ise_endpoint_range,
  1447. bool uber_mode,
  1448. bool constrain_ise_weight8_selectors,
  1449. int32_t first_submode, int32_t last_submode)
  1450. {
  1451. assert((ise_weight_range >= 1) && (ise_weight_range <= MAX_SUPPORTED_ISE_WEIGHT_INDEX));
  1452. assert((ise_endpoint_range >= astc_helpers::FIRST_VALID_ENDPOINT_ISE_RANGE) && (ise_endpoint_range <= astc_helpers::LAST_VALID_ENDPOINT_ISE_RANGE));
  1453. assert((num_pixels >= 1) && (num_pixels <= 16));
  1454. best_submode = 0;
  1455. half_float block_pixels_half[16][3];
  1456. vec4F block_pixels_q16[16];
  1457. // TODO: This is done redundantly.
  1458. for (uint32_t i = 0; i < num_pixels; i++)
  1459. {
  1460. block_pixels_half[i][0] = float_to_half_non_neg_no_nan_inf(pBlock_pixels[i][0]);
  1461. block_pixels_q16[i][0] = (float)half_to_qlog16(block_pixels_half[i][0]);
  1462. block_pixels_half[i][1] = float_to_half_non_neg_no_nan_inf(pBlock_pixels[i][1]);
  1463. block_pixels_q16[i][1] = (float)half_to_qlog16(block_pixels_half[i][1]);
  1464. block_pixels_half[i][2] = float_to_half_non_neg_no_nan_inf(pBlock_pixels[i][2]);
  1465. block_pixels_q16[i][2] = (float)half_to_qlog16(block_pixels_half[i][2]);
  1466. block_pixels_q16[i][3] = 0.0f;
  1467. }
  1468. const uint32_t num_weight_levels = astc_helpers::get_ise_levels(ise_weight_range);
  1469. // TODO: should match MAX_SUPPORTED_ISE_WEIGHT_INDEX
  1470. const uint32_t MAX_WEIGHT_LEVELS = 32;
  1471. (void)MAX_WEIGHT_LEVELS;
  1472. assert(num_weight_levels <= MAX_WEIGHT_LEVELS);
  1473. vec3F block_mean_color_q16(calc_mean(num_pixels, block_pixels_q16));
  1474. vec3F block_axis_q16(calc_rgb_pca(num_pixels, block_pixels_q16, block_mean_color_q16));
  1475. aabb3F color_box_q16(cInitExpand);
  1476. float l = 1e+30f, h = -1e+30f;
  1477. vec3F low_color_q16, high_color_q16;
  1478. for (uint32_t i = 0; i < num_pixels; i++)
  1479. {
  1480. color_box_q16.expand(block_pixels_q16[i]);
  1481. vec3F k(vec3F(block_pixels_q16[i]) - block_mean_color_q16);
  1482. float kd = k.dot(block_axis_q16);
  1483. if (kd < l)
  1484. {
  1485. l = kd;
  1486. low_color_q16 = block_pixels_q16[i];
  1487. }
  1488. if (kd > h)
  1489. {
  1490. h = kd;
  1491. high_color_q16 = block_pixels_q16[i];
  1492. }
  1493. }
  1494. vec3F old_low_color_q16(low_color_q16), old_high_color_q16(high_color_q16);
  1495. for (uint32_t i = 0; i < 3; i++)
  1496. {
  1497. low_color_q16[i] = lerp<float>(old_low_color_q16[i], old_high_color_q16[i], 1.0f / 64.0f);
  1498. high_color_q16[i] = lerp<float>(old_low_color_q16[i], old_high_color_q16[i], 63.0f / 64.0f);
  1499. }
  1500. uint8_t trial_blk_endpoints[NUM_MODE11_ENDPOINTS];
  1501. uint8_t trial_blk_weights[16];
  1502. uint32_t trial_best_submode = 0;
  1503. clear_obj(trial_blk_endpoints);
  1504. clear_obj(trial_blk_weights);
  1505. double trial_blk_error = 1e+30f;
  1506. bool did_improve = try_mode11(num_pixels, trial_blk_endpoints, trial_blk_weights, trial_blk_error, trial_best_submode,
  1507. low_color_q16, high_color_q16,
  1508. block_pixels_half, num_weight_levels, ise_weight_range, coptions, direct_only, ise_endpoint_range, constrain_ise_weight8_selectors,
  1509. first_submode, last_submode);
  1510. // If we couldn't find ANY usable solution due to endpoint quantization, just return. There's nothing we can do.
  1511. if (!did_improve)
  1512. return cur_block_error;
  1513. // Did the solution improve?
  1514. if (trial_blk_error < cur_block_error)
  1515. {
  1516. cur_block_error = trial_blk_error;
  1517. memcpy(blk_endpoints, trial_blk_endpoints, NUM_MODE11_ENDPOINTS);
  1518. memcpy(blk_weights, trial_blk_weights, num_pixels);
  1519. best_submode = trial_best_submode;
  1520. }
  1521. #define USE_LEAST_SQUARES (1)
  1522. #if USE_LEAST_SQUARES
  1523. // least squares on the most promising trial weight indices found
  1524. const uint32_t NUM_LS_PASSES = 3;
  1525. for (uint32_t pass = 0; pass < NUM_LS_PASSES; pass++)
  1526. {
  1527. vec3F l_q16, h_q16;
  1528. if (!compute_least_squares_endpoints_rgb(num_pixels, trial_blk_weights, &g_astc_ls_weights_ise[ise_weight_range][0], &l_q16, &h_q16, block_pixels_q16, color_box_q16))
  1529. break;
  1530. bool was_improved = try_mode11(num_pixels, blk_endpoints, blk_weights, cur_block_error, best_submode,
  1531. l_q16, h_q16,
  1532. block_pixels_half, num_weight_levels, ise_weight_range, coptions, direct_only, ise_endpoint_range, constrain_ise_weight8_selectors,
  1533. first_submode, last_submode);
  1534. if (!was_improved)
  1535. break;
  1536. // It's improved, so let's take the new weight indices.
  1537. memcpy(trial_blk_weights, blk_weights, num_pixels);
  1538. } // pass
  1539. #endif
  1540. if (uber_mode)
  1541. {
  1542. // Try varying the current best weight indices. This can be expanded/improved, but at potentially great cost.
  1543. uint8_t temp_astc_weights[16];
  1544. memcpy(temp_astc_weights, trial_blk_weights, num_pixels);
  1545. uint32_t min_lin_sel = 256, max_lin_sel = 0;
  1546. for (uint32_t i = 0; i < num_pixels; i++)
  1547. {
  1548. const uint32_t astc_sel = temp_astc_weights[i];
  1549. const uint32_t lin_sel = g_map_astc_to_linear_order[ise_weight_range][astc_sel];
  1550. assert(lin_sel < num_weight_levels);
  1551. min_lin_sel = minimumu(min_lin_sel, lin_sel);
  1552. max_lin_sel = maximumu(max_lin_sel, lin_sel);
  1553. }
  1554. bool was_improved = false;
  1555. (void)was_improved;
  1556. {
  1557. bool weights_changed = false;
  1558. uint8_t trial_weights[16];
  1559. for (uint32_t i = 0; i < num_pixels; i++)
  1560. {
  1561. uint32_t astc_sel = temp_astc_weights[i];
  1562. uint32_t lin_sel = g_map_astc_to_linear_order[ise_weight_range][astc_sel];
  1563. if ((lin_sel == min_lin_sel) && (lin_sel < (num_weight_levels - 1)))
  1564. {
  1565. lin_sel++;
  1566. weights_changed = true;
  1567. }
  1568. trial_weights[i] = g_map_linear_to_astc_order[ise_weight_range][lin_sel];
  1569. }
  1570. if (weights_changed)
  1571. {
  1572. vec3F l_q16, h_q16;
  1573. if (compute_least_squares_endpoints_rgb(num_pixels, trial_weights, &g_astc_ls_weights_ise[ise_weight_range][0], &l_q16, &h_q16, block_pixels_q16, color_box_q16))
  1574. {
  1575. if (try_mode11(num_pixels, blk_endpoints, blk_weights, cur_block_error, best_submode,
  1576. l_q16, h_q16,
  1577. block_pixels_half, num_weight_levels, ise_weight_range, coptions, direct_only, ise_endpoint_range, constrain_ise_weight8_selectors,
  1578. first_submode, last_submode))
  1579. {
  1580. was_improved = true;
  1581. }
  1582. }
  1583. }
  1584. }
  1585. {
  1586. bool weights_changed = false;
  1587. uint8_t trial_weights[16];
  1588. for (uint32_t i = 0; i < num_pixels; i++)
  1589. {
  1590. uint32_t astc_sel = temp_astc_weights[i];
  1591. uint32_t lin_sel = g_map_astc_to_linear_order[ise_weight_range][astc_sel];
  1592. if ((lin_sel == max_lin_sel) && (lin_sel > 0))
  1593. {
  1594. lin_sel--;
  1595. weights_changed = true;
  1596. }
  1597. trial_weights[i] = g_map_linear_to_astc_order[ise_weight_range][lin_sel];
  1598. }
  1599. if (weights_changed)
  1600. {
  1601. vec3F l_q16, h_q16;
  1602. if (compute_least_squares_endpoints_rgb(num_pixels, trial_weights, &g_astc_ls_weights_ise[ise_weight_range][0], &l_q16, &h_q16, block_pixels_q16, color_box_q16))
  1603. {
  1604. if (try_mode11(num_pixels, blk_endpoints, blk_weights, cur_block_error, best_submode,
  1605. l_q16, h_q16,
  1606. block_pixels_half, num_weight_levels, ise_weight_range, coptions, direct_only, ise_endpoint_range, constrain_ise_weight8_selectors,
  1607. first_submode, last_submode))
  1608. {
  1609. was_improved = true;
  1610. }
  1611. }
  1612. }
  1613. }
  1614. {
  1615. bool weights_changed = false;
  1616. uint8_t trial_weights[16];
  1617. for (uint32_t i = 0; i < num_pixels; i++)
  1618. {
  1619. uint32_t astc_sel = temp_astc_weights[i];
  1620. uint32_t lin_sel = g_map_astc_to_linear_order[ise_weight_range][astc_sel];
  1621. if ((lin_sel == max_lin_sel) && (lin_sel > 0))
  1622. {
  1623. lin_sel--;
  1624. weights_changed = true;
  1625. }
  1626. else if ((lin_sel == min_lin_sel) && (lin_sel < (num_weight_levels - 1)))
  1627. {
  1628. lin_sel++;
  1629. weights_changed = true;
  1630. }
  1631. trial_weights[i] = g_map_linear_to_astc_order[ise_weight_range][lin_sel];
  1632. }
  1633. if (weights_changed)
  1634. {
  1635. vec3F l_q16, h_q16;
  1636. if (compute_least_squares_endpoints_rgb(num_pixels, trial_weights, &g_astc_ls_weights_ise[ise_weight_range][0], &l_q16, &h_q16, block_pixels_q16, color_box_q16))
  1637. {
  1638. if (try_mode11(num_pixels, blk_endpoints, blk_weights, cur_block_error, best_submode,
  1639. l_q16, h_q16,
  1640. block_pixels_half, num_weight_levels, ise_weight_range, coptions, direct_only, ise_endpoint_range, constrain_ise_weight8_selectors,
  1641. first_submode, last_submode))
  1642. {
  1643. was_improved = true;
  1644. }
  1645. }
  1646. }
  1647. }
  1648. } // uber_mode
  1649. return cur_block_error;
  1650. }
  1651. //--------------------------------------------------------------------------------------------------------------------------
  1652. static double encode_astc_hdr_block_mode_7(
  1653. uint32_t num_pixels, const vec4F* pBlock_pixels,
  1654. uint32_t ise_weight_range,
  1655. uint32_t& best_submode,
  1656. double cur_block_error,
  1657. uint8_t* blk_endpoints, //[4]
  1658. uint8_t* blk_weights, // [num_pixels]
  1659. const astc_hdr_codec_options& coptions,
  1660. uint32_t ise_endpoint_range)
  1661. {
  1662. assert((num_pixels >= 1) && (num_pixels <= 16));
  1663. assert((ise_weight_range >= 1) && (ise_weight_range <= 10));
  1664. assert((ise_endpoint_range >= astc_helpers::FIRST_VALID_ENDPOINT_ISE_RANGE) && (ise_endpoint_range <= astc_helpers::LAST_VALID_ENDPOINT_ISE_RANGE));
  1665. const uint32_t num_weight_levels = astc_helpers::get_ise_levels(ise_weight_range);
  1666. const uint32_t MAX_WEIGHT_LEVELS = 24;
  1667. assert(num_weight_levels <= MAX_WEIGHT_LEVELS);
  1668. BASISU_NOTE_UNUSED(MAX_WEIGHT_LEVELS);
  1669. best_submode = 0;
  1670. half_float block_pixels_half[16][3];
  1671. vec4F block_pixels_q16[16];
  1672. for (uint32_t i = 0; i < num_pixels; i++)
  1673. {
  1674. block_pixels_half[i][0] = float_to_half_non_neg_no_nan_inf(pBlock_pixels[i][0]);
  1675. block_pixels_q16[i][0] = (float)half_to_qlog16(block_pixels_half[i][0]);
  1676. block_pixels_half[i][1] = float_to_half_non_neg_no_nan_inf(pBlock_pixels[i][1]);
  1677. block_pixels_q16[i][1] = (float)half_to_qlog16(block_pixels_half[i][1]);
  1678. block_pixels_half[i][2] = float_to_half_non_neg_no_nan_inf(pBlock_pixels[i][2]);
  1679. block_pixels_q16[i][2] = (float)half_to_qlog16(block_pixels_half[i][2]);
  1680. block_pixels_q16[i][3] = 0.0f;
  1681. }
  1682. vec3F block_mean_color_q16(calc_mean(num_pixels, block_pixels_q16));
  1683. vec3F block_axis_q16(0.577350259f);
  1684. aabb3F color_box_q16(cInitExpand);
  1685. float l = 1e+30f, h = -1e+30f;
  1686. for (uint32_t i = 0; i < num_pixels; i++)
  1687. {
  1688. color_box_q16.expand(block_pixels_q16[i]);
  1689. vec3F k(vec3F(block_pixels_q16[i]) - block_mean_color_q16);
  1690. float kd = k.dot(block_axis_q16);
  1691. l = basisu::minimum<float>(l, kd);
  1692. h = basisu::maximum<float>(h, kd);
  1693. }
  1694. vec3F low_color_q16(interp_color(block_mean_color_q16, block_axis_q16, l, color_box_q16, color_box_q16));
  1695. vec3F high_color_q16(interp_color(block_mean_color_q16, block_axis_q16, h, color_box_q16, color_box_q16));
  1696. low_color_q16.clamp(0.0f, MAX_QLOG16_VAL);
  1697. high_color_q16.clamp(0.0f, MAX_QLOG16_VAL);
  1698. vec3F diff(high_color_q16 - low_color_q16);
  1699. float s_q16 = diff.dot(block_axis_q16) * block_axis_q16[0];
  1700. uint8_t trial_blk_endpoints[NUM_MODE7_ENDPOINTS];
  1701. uint8_t trial_blk_weights[16];
  1702. uint32_t trial_best_submode = 0;
  1703. clear_obj(trial_blk_endpoints);
  1704. clear_obj(trial_blk_weights);
  1705. double trial_blk_error = 1e+30f;
  1706. bool did_improve = try_mode7(num_pixels, trial_blk_endpoints, trial_blk_weights, trial_blk_error, trial_best_submode,
  1707. high_color_q16, ceilf(s_q16),
  1708. block_pixels_half, num_weight_levels, ise_weight_range, coptions, ise_endpoint_range);
  1709. // If we couldn't find ANY usable solution due to endpoint quantization, just return. There's nothing we can do.
  1710. if (!did_improve)
  1711. {
  1712. return cur_block_error;
  1713. }
  1714. // Did the solution improve?
  1715. if (trial_blk_error < cur_block_error)
  1716. {
  1717. cur_block_error = trial_blk_error;
  1718. memcpy(blk_endpoints, trial_blk_endpoints, NUM_MODE7_ENDPOINTS);
  1719. memcpy(blk_weights, trial_blk_weights, num_pixels);
  1720. best_submode = trial_best_submode;
  1721. }
  1722. const float one_over_num_pixels = 1.0f / (float)num_pixels;
  1723. const uint32_t NUM_TRIALS = 2;
  1724. for (uint32_t trial = 0; trial < NUM_TRIALS; trial++)
  1725. {
  1726. // Given a set of selectors and S, try to compute a better high color
  1727. vec3F new_high_color_q16(block_mean_color_q16);
  1728. int e[2][3];
  1729. int cur_s = 0;
  1730. if (!decode_mode7_to_qlog12(trial_blk_endpoints, e, &cur_s, ise_endpoint_range))
  1731. break;
  1732. cur_s <<= 4;
  1733. for (uint32_t i = 0; i < num_pixels; i++)
  1734. {
  1735. uint32_t astc_sel = trial_blk_weights[i];
  1736. float lerp = g_ise_weight_lerps[ise_weight_range][astc_sel + 1] * (1.0f / 64.0f);
  1737. float k = (float)cur_s * (1.0f - lerp) * one_over_num_pixels;
  1738. new_high_color_q16[0] += k;
  1739. new_high_color_q16[1] += k;
  1740. new_high_color_q16[2] += k;
  1741. }
  1742. bool improved = try_mode7(num_pixels, blk_endpoints, blk_weights, cur_block_error, best_submode,
  1743. new_high_color_q16, (float)cur_s,
  1744. block_pixels_half, num_weight_levels, ise_weight_range, coptions, ise_endpoint_range);
  1745. if (improved)
  1746. {
  1747. memcpy(trial_blk_endpoints, blk_endpoints, NUM_MODE7_ENDPOINTS);
  1748. memcpy(trial_blk_weights, blk_weights, num_pixels);
  1749. }
  1750. // Given a set of selectors and a high color, try to compute a better S.
  1751. float t = 0.0f;
  1752. for (uint32_t i = 0; i < num_pixels; i++)
  1753. {
  1754. uint32_t astc_sel = trial_blk_weights[i];
  1755. float lerp = g_ise_weight_lerps[ise_weight_range][astc_sel + 1] * (1.0f / 64.0f);
  1756. t += (1.0f) - lerp;
  1757. }
  1758. t *= one_over_num_pixels;
  1759. //int e[2][3];
  1760. if (!decode_mode7_to_qlog12(trial_blk_endpoints, e, nullptr, ise_endpoint_range))
  1761. break;
  1762. vec3F cur_h_q16((float)(e[1][0] << 4), (float)(e[1][1] << 4), (float)(e[1][2] << 4));
  1763. if (fabs(t) > .0000125f)
  1764. {
  1765. float s_r = (cur_h_q16[0] - block_mean_color_q16[0]) / t;
  1766. float s_g = (cur_h_q16[1] - block_mean_color_q16[1]) / t;
  1767. float s_b = (cur_h_q16[2] - block_mean_color_q16[2]) / t;
  1768. // TODO: gather statistics on these
  1769. if (try_mode7(num_pixels, blk_endpoints, blk_weights, cur_block_error, best_submode,
  1770. cur_h_q16, ceilf(s_r),
  1771. block_pixels_half, num_weight_levels, ise_weight_range, coptions, ise_endpoint_range))
  1772. {
  1773. improved = true;
  1774. }
  1775. if (try_mode7(num_pixels, blk_endpoints, blk_weights, cur_block_error, best_submode,
  1776. cur_h_q16, ceilf(s_g),
  1777. block_pixels_half, num_weight_levels, ise_weight_range, coptions, ise_endpoint_range))
  1778. {
  1779. improved = true;
  1780. }
  1781. if (try_mode7(num_pixels, blk_endpoints, blk_weights, cur_block_error, best_submode,
  1782. cur_h_q16, ceilf(s_b),
  1783. block_pixels_half, num_weight_levels, ise_weight_range, coptions, ise_endpoint_range))
  1784. {
  1785. improved = true;
  1786. }
  1787. if (try_mode7(num_pixels, blk_endpoints, blk_weights, cur_block_error, best_submode,
  1788. cur_h_q16, ceilf((s_r + s_g + s_b) / 3.0f),
  1789. block_pixels_half, num_weight_levels, ise_weight_range, coptions, ise_endpoint_range))
  1790. {
  1791. improved = true;
  1792. }
  1793. }
  1794. if (!improved)
  1795. break;
  1796. memcpy(trial_blk_endpoints, blk_endpoints, NUM_MODE7_ENDPOINTS);
  1797. memcpy(trial_blk_weights, blk_weights, num_pixels);
  1798. } // trial
  1799. return cur_block_error;
  1800. }
  1801. //--------------------------------------------------------------------------------------------------------------------------
  1802. static bool pack_solid(const vec4F* pBlock_linear_colors, basisu::vector<astc_hdr_pack_results>& all_results, const astc_hdr_codec_options& coptions)
  1803. {
  1804. float r = 0.0f, g = 0.0f, b = 0.0f;
  1805. const float LOG_BIAS = .125f;
  1806. bool solid_block = true;
  1807. for (uint32_t i = 0; i < 16; i++)
  1808. {
  1809. if ((pBlock_linear_colors[0][0] != pBlock_linear_colors[i][0]) ||
  1810. (pBlock_linear_colors[0][1] != pBlock_linear_colors[i][1]) ||
  1811. (pBlock_linear_colors[0][2] != pBlock_linear_colors[i][2]))
  1812. {
  1813. solid_block = false;
  1814. }
  1815. r += log2f(pBlock_linear_colors[i][0] + LOG_BIAS);
  1816. g += log2f(pBlock_linear_colors[i][1] + LOG_BIAS);
  1817. b += log2f(pBlock_linear_colors[i][2] + LOG_BIAS);
  1818. }
  1819. if (solid_block)
  1820. {
  1821. r = pBlock_linear_colors[0][0];
  1822. g = pBlock_linear_colors[0][1];
  1823. b = pBlock_linear_colors[0][2];
  1824. }
  1825. else
  1826. {
  1827. r = maximum<float>(0.0f, powf(2.0f, r * (1.0f / 16.0f)) - LOG_BIAS);
  1828. g = maximum<float>(0.0f, powf(2.0f, g * (1.0f / 16.0f)) - LOG_BIAS);
  1829. b = maximum<float>(0.0f, powf(2.0f, b * (1.0f / 16.0f)) - LOG_BIAS);
  1830. // for safety
  1831. r = minimum<float>(r, MAX_HALF_FLOAT);
  1832. g = minimum<float>(g, MAX_HALF_FLOAT);
  1833. b = minimum<float>(b, MAX_HALF_FLOAT);
  1834. }
  1835. half_float rh = float_to_half_non_neg_no_nan_inf(r), gh = float_to_half_non_neg_no_nan_inf(g), bh = float_to_half_non_neg_no_nan_inf(b), ah = float_to_half_non_neg_no_nan_inf(1.0f);
  1836. astc_hdr_pack_results results;
  1837. results.clear();
  1838. uint8_t* packed_blk = (uint8_t*)&results.m_solid_blk;
  1839. results.m_is_solid = true;
  1840. packed_blk[0] = 0b11111100;
  1841. packed_blk[1] = 255;
  1842. packed_blk[2] = 255;
  1843. packed_blk[3] = 255;
  1844. packed_blk[4] = 255;
  1845. packed_blk[5] = 255;
  1846. packed_blk[6] = 255;
  1847. packed_blk[7] = 255;
  1848. packed_blk[8] = (uint8_t)rh;
  1849. packed_blk[9] = (uint8_t)(rh >> 8);
  1850. packed_blk[10] = (uint8_t)gh;
  1851. packed_blk[11] = (uint8_t)(gh >> 8);
  1852. packed_blk[12] = (uint8_t)bh;
  1853. packed_blk[13] = (uint8_t)(bh >> 8);
  1854. packed_blk[14] = (uint8_t)ah;
  1855. packed_blk[15] = (uint8_t)(ah >> 8);
  1856. results.m_best_block_error = 0;
  1857. if (!solid_block)
  1858. {
  1859. const float R_WEIGHT = coptions.m_r_err_scale;
  1860. const float G_WEIGHT = coptions.m_g_err_scale;
  1861. // This MUST match how errors are computed in eval_selectors().
  1862. for (uint32_t i = 0; i < 16; i++)
  1863. {
  1864. half_float dr = float_to_half_non_neg_no_nan_inf(pBlock_linear_colors[i][0]), dg = float_to_half_non_neg_no_nan_inf(pBlock_linear_colors[i][1]), db = float_to_half_non_neg_no_nan_inf(pBlock_linear_colors[i][2]);
  1865. double rd = q(rh) - q(dr);
  1866. double gd = q(gh) - q(dg);
  1867. double bd = q(bh) - q(db);
  1868. double e = R_WEIGHT * (rd * rd) + G_WEIGHT * (gd * gd) + bd * bd;
  1869. results.m_best_block_error += e;
  1870. }
  1871. }
  1872. const half_float hc[3] = { rh, gh, bh };
  1873. bc6h_enc_block_solid_color(&results.m_bc6h_block, hc);
  1874. all_results.push_back(results);
  1875. return solid_block;
  1876. }
  1877. //--------------------------------------------------------------------------------------------------------------------------
  1878. static void pack_mode11(
  1879. const vec4F* pBlock_linear_colors,
  1880. basisu::vector<astc_hdr_pack_results>& all_results,
  1881. const astc_hdr_codec_options& coptions,
  1882. uint32_t first_weight_ise_range, uint32_t last_weight_ise_range, bool constrain_ise_weight8_selectors)
  1883. {
  1884. uint8_t trial_endpoints[NUM_MODE11_ENDPOINTS], trial_weights[16];
  1885. uint32_t trial_submode11 = 0;
  1886. clear_obj(trial_endpoints);
  1887. clear_obj(trial_weights);
  1888. for (uint32_t weight_ise_range = first_weight_ise_range; weight_ise_range <= last_weight_ise_range; weight_ise_range++)
  1889. {
  1890. const bool direct_only = coptions.m_mode11_direct_only;
  1891. uint32_t endpoint_ise_range = astc_helpers::BISE_256_LEVELS;
  1892. if (weight_ise_range == astc_helpers::BISE_16_LEVELS)
  1893. endpoint_ise_range = astc_helpers::BISE_192_LEVELS;
  1894. else
  1895. {
  1896. assert(weight_ise_range < astc_helpers::BISE_16_LEVELS);
  1897. }
  1898. double trial_error = encode_astc_hdr_block_mode_11(16, pBlock_linear_colors, weight_ise_range, trial_submode11, 1e+30f, trial_endpoints, trial_weights, coptions, direct_only,
  1899. endpoint_ise_range, coptions.m_mode11_uber_mode && (weight_ise_range >= astc_helpers::BISE_4_LEVELS) && coptions.m_allow_uber_mode, constrain_ise_weight8_selectors, coptions.m_first_mode11_submode, coptions.m_last_mode11_submode);
  1900. if (trial_error < 1e+30f)
  1901. {
  1902. astc_hdr_pack_results results;
  1903. results.clear();
  1904. results.m_best_block_error = trial_error;
  1905. results.m_best_submodes[0] = trial_submode11;
  1906. results.m_constrained_weights = constrain_ise_weight8_selectors;
  1907. results.m_best_blk.m_num_partitions = 1;
  1908. results.m_best_blk.m_color_endpoint_modes[0] = 11;
  1909. results.m_best_blk.m_weight_ise_range = weight_ise_range;
  1910. results.m_best_blk.m_endpoint_ise_range = endpoint_ise_range;
  1911. memcpy(results.m_best_blk.m_endpoints, trial_endpoints, NUM_MODE11_ENDPOINTS);
  1912. memcpy(results.m_best_blk.m_weights, trial_weights, 16);
  1913. #ifdef _DEBUG
  1914. {
  1915. half_float block_pixels_half[16][3];
  1916. vec4F block_pixels_q16[16];
  1917. for (uint32_t i = 0; i < 16; i++)
  1918. {
  1919. block_pixels_half[i][0] = float_to_half_non_neg_no_nan_inf(pBlock_linear_colors[i][0]);
  1920. block_pixels_half[i][1] = float_to_half_non_neg_no_nan_inf(pBlock_linear_colors[i][1]);
  1921. block_pixels_half[i][2] = float_to_half_non_neg_no_nan_inf(pBlock_linear_colors[i][2]);
  1922. }
  1923. half_float unpacked_astc_blk_rgba[4][4][4];
  1924. bool res = astc_helpers::decode_block(results.m_best_blk, unpacked_astc_blk_rgba, 4, 4, astc_helpers::cDecodeModeHDR16);
  1925. assert(res);
  1926. half_float unpacked_astc_blk_rgb[4][4][3];
  1927. for (uint32_t y = 0; y < 4; y++)
  1928. for (uint32_t x = 0; x < 4; x++)
  1929. for (uint32_t c = 0; c < 3; c++)
  1930. unpacked_astc_blk_rgb[y][x][c] = unpacked_astc_blk_rgba[y][x][c];
  1931. double cmp_err = compute_block_error(&block_pixels_half[0][0], &unpacked_astc_blk_rgb[0][0][0], coptions);
  1932. assert(results.m_best_block_error == cmp_err);
  1933. }
  1934. #endif
  1935. // transcode to BC6H
  1936. assert(results.m_best_blk.m_color_endpoint_modes[0] == 11);
  1937. // Get qlog12 endpoints
  1938. int e[2][3];
  1939. bool success = decode_mode11_to_qlog12(results.m_best_blk.m_endpoints, e, results.m_best_blk.m_endpoint_ise_range);
  1940. assert(success);
  1941. BASISU_NOTE_UNUSED(success);
  1942. // Transform endpoints to half float
  1943. half_float h_e[3][2] =
  1944. {
  1945. { qlog_to_half(e[0][0], 12), qlog_to_half(e[1][0], 12) },
  1946. { qlog_to_half(e[0][1], 12), qlog_to_half(e[1][1], 12) },
  1947. { qlog_to_half(e[0][2], 12), qlog_to_half(e[1][2], 12) }
  1948. };
  1949. // Transcode to bc6h
  1950. success = transcode_bc6h_1subset(h_e, results.m_best_blk, results.m_bc6h_block);
  1951. assert(success);
  1952. all_results.push_back(results);
  1953. }
  1954. }
  1955. }
  1956. //--------------------------------------------------------------------------------------------------------------------------
  1957. static void pack_mode7_single_part(const vec4F* pBlock_linear_colors, basisu::vector<astc_hdr_pack_results>& all_results, const astc_hdr_codec_options& coptions)
  1958. {
  1959. uint8_t trial_endpoints[NUM_MODE7_ENDPOINTS], trial_weights[16];
  1960. uint32_t trial_submode7 = 0;
  1961. clear_obj(trial_endpoints);
  1962. clear_obj(trial_weights);
  1963. for (uint32_t weight_ise_range = coptions.m_first_mode7_part1_weight_ise_range; weight_ise_range <= coptions.m_last_mode7_part1_weight_ise_range; weight_ise_range++)
  1964. {
  1965. const uint32_t ise_endpoint_range = astc_helpers::BISE_256_LEVELS;
  1966. double trial_error = encode_astc_hdr_block_mode_7(16, pBlock_linear_colors, weight_ise_range, trial_submode7, 1e+30f, trial_endpoints, trial_weights, coptions, ise_endpoint_range);
  1967. if (trial_error < 1e+30f)
  1968. {
  1969. astc_hdr_pack_results results;
  1970. results.clear();
  1971. results.m_best_block_error = trial_error;
  1972. results.m_best_submodes[0] = trial_submode7;
  1973. results.m_best_blk.m_num_partitions = 1;
  1974. results.m_best_blk.m_color_endpoint_modes[0] = 7;
  1975. results.m_best_blk.m_weight_ise_range = weight_ise_range;
  1976. results.m_best_blk.m_endpoint_ise_range = ise_endpoint_range;
  1977. memcpy(results.m_best_blk.m_endpoints, trial_endpoints, NUM_MODE7_ENDPOINTS);
  1978. memcpy(results.m_best_blk.m_weights, trial_weights, 16);
  1979. // transcode to BC6H
  1980. assert(results.m_best_blk.m_color_endpoint_modes[0] == 7);
  1981. // Get qlog12 endpoints
  1982. int e[2][3];
  1983. if (!decode_mode7_to_qlog12(results.m_best_blk.m_endpoints, e, nullptr, results.m_best_blk.m_endpoint_ise_range))
  1984. continue;
  1985. // Transform endpoints to half float
  1986. half_float h_e[3][2] =
  1987. {
  1988. { qlog_to_half(e[0][0], 12), qlog_to_half(e[1][0], 12) },
  1989. { qlog_to_half(e[0][1], 12), qlog_to_half(e[1][1], 12) },
  1990. { qlog_to_half(e[0][2], 12), qlog_to_half(e[1][2], 12) }
  1991. };
  1992. // Transcode to bc6h
  1993. bool status = transcode_bc6h_1subset(h_e, results.m_best_blk, results.m_bc6h_block);
  1994. assert(status);
  1995. (void)status;
  1996. all_results.push_back(results);
  1997. }
  1998. }
  1999. }
  2000. //--------------------------------------------------------------------------------------------------------------------------
  2001. static bool estimate_partition2(const vec4F* pBlock_pixels, int* pBest_parts, uint32_t num_best_parts)
  2002. {
  2003. assert(num_best_parts <= basist::TOTAL_ASTC_BC6H_COMMON_PARTITIONS2);
  2004. vec3F training_vecs[16], mean(0.0f);
  2005. for (uint32_t i = 0; i < 16; i++)
  2006. {
  2007. vec3F& v = training_vecs[i];
  2008. v[0] = (float)float_to_half_non_neg_no_nan_inf(pBlock_pixels[i][0]);
  2009. v[1] = (float)float_to_half_non_neg_no_nan_inf(pBlock_pixels[i][1]);
  2010. v[2] = (float)float_to_half_non_neg_no_nan_inf(pBlock_pixels[i][2]);
  2011. mean += v;
  2012. }
  2013. mean *= (1.0f / 16.0f);
  2014. vec3F cluster_centroids[2] = { mean - vec3F(.1f), mean + vec3F(.1f) };
  2015. uint32_t cluster_pixels[2][16];
  2016. uint32_t num_cluster_pixels[2];
  2017. vec3F new_cluster_means[2];
  2018. for (uint32_t s = 0; s < 4; s++)
  2019. {
  2020. num_cluster_pixels[0] = 0;
  2021. num_cluster_pixels[1] = 0;
  2022. new_cluster_means[0].clear();
  2023. new_cluster_means[1].clear();
  2024. for (uint32_t i = 0; i < 16; i++)
  2025. {
  2026. float d0 = training_vecs[i].squared_distance(cluster_centroids[0]);
  2027. float d1 = training_vecs[i].squared_distance(cluster_centroids[1]);
  2028. if (d0 < d1)
  2029. {
  2030. cluster_pixels[0][num_cluster_pixels[0]] = i;
  2031. new_cluster_means[0] += training_vecs[i];
  2032. num_cluster_pixels[0]++;
  2033. }
  2034. else
  2035. {
  2036. cluster_pixels[1][num_cluster_pixels[1]] = i;
  2037. new_cluster_means[1] += training_vecs[i];
  2038. num_cluster_pixels[1]++;
  2039. }
  2040. }
  2041. if (!num_cluster_pixels[0] || !num_cluster_pixels[1])
  2042. return false;
  2043. cluster_centroids[0] = new_cluster_means[0] / (float)num_cluster_pixels[0];
  2044. cluster_centroids[1] = new_cluster_means[1] / (float)num_cluster_pixels[1];
  2045. }
  2046. int desired_parts[4][4]; // [y][x]
  2047. for (uint32_t p = 0; p < 2; p++)
  2048. {
  2049. for (uint32_t i = 0; i < num_cluster_pixels[p]; i++)
  2050. {
  2051. const uint32_t pix_index = cluster_pixels[p][i];
  2052. desired_parts[pix_index >> 2][pix_index & 3] = p;
  2053. }
  2054. }
  2055. uint32_t part_similarity[basist::TOTAL_ASTC_BC6H_COMMON_PARTITIONS2];
  2056. for (uint32_t part_index = 0; part_index < basist::TOTAL_ASTC_BC6H_COMMON_PARTITIONS2; part_index++)
  2057. {
  2058. const uint32_t bc7_pattern = basist::g_astc_bc7_common_partitions2[part_index].m_bc7;
  2059. int total_sim_non_inv = 0;
  2060. int total_sim_inv = 0;
  2061. for (uint32_t y = 0; y < 4; y++)
  2062. {
  2063. for (uint32_t x = 0; x < 4; x++)
  2064. {
  2065. int part = basist::g_bc7_partition2[16 * bc7_pattern + x + y * 4];
  2066. if (part == desired_parts[y][x])
  2067. total_sim_non_inv++;
  2068. if ((part ^ 1) == desired_parts[y][x])
  2069. total_sim_inv++;
  2070. }
  2071. }
  2072. int total_sim = maximum(total_sim_non_inv, total_sim_inv);
  2073. part_similarity[part_index] = (total_sim << 8) | part_index;
  2074. } // part_index;
  2075. std::sort(part_similarity, part_similarity + basist::TOTAL_ASTC_BC6H_COMMON_PARTITIONS2);
  2076. for (uint32_t i = 0; i < num_best_parts; i++)
  2077. pBest_parts[i] = part_similarity[(basist::TOTAL_ASTC_BC6H_COMMON_PARTITIONS2 - 1) - i] & 0xFF;
  2078. return true;
  2079. }
  2080. //--------------------------------------------------------------------------------------------------------------------------
  2081. static void pack_mode7_2part(const vec4F* pBlock_linear_colors, basisu::vector<astc_hdr_pack_results>& all_results, const astc_hdr_codec_options& coptions,
  2082. int num_estimated_partitions, const int *pEstimated_partitions,
  2083. uint32_t first_weight_ise_range, uint32_t last_weight_ise_range)
  2084. {
  2085. assert(coptions.m_mode7_part2_part_masks);
  2086. astc_helpers::log_astc_block trial_blk;
  2087. clear_obj(trial_blk);
  2088. trial_blk.m_grid_width = 4;
  2089. trial_blk.m_grid_height = 4;
  2090. trial_blk.m_num_partitions = 2;
  2091. trial_blk.m_color_endpoint_modes[0] = 7;
  2092. trial_blk.m_color_endpoint_modes[1] = 7;
  2093. uint32_t first_part_index = 0, last_part_index = basist::TOTAL_ASTC_BC6H_COMMON_PARTITIONS2;
  2094. if (num_estimated_partitions)
  2095. {
  2096. first_part_index = 0;
  2097. last_part_index = num_estimated_partitions;
  2098. }
  2099. for (uint32_t part_index_iter = first_part_index; part_index_iter < last_part_index; ++part_index_iter)
  2100. {
  2101. uint32_t part_index;
  2102. if (num_estimated_partitions)
  2103. {
  2104. part_index = pEstimated_partitions[part_index_iter];
  2105. assert(part_index < basist::TOTAL_ASTC_BC6H_COMMON_PARTITIONS2);
  2106. }
  2107. else
  2108. {
  2109. part_index = part_index_iter;
  2110. if (((1U << part_index) & coptions.m_mode7_part2_part_masks) == 0)
  2111. continue;
  2112. }
  2113. const uint32_t astc_pattern = basist::g_astc_bc7_common_partitions2[part_index].m_astc;
  2114. const uint32_t bc7_pattern = basist::g_astc_bc7_common_partitions2[part_index].m_bc7;
  2115. const bool invert_flag = basist::g_astc_bc7_common_partitions2[part_index].m_invert;
  2116. vec4F part_pixels[2][16];
  2117. uint32_t pixel_part_index[4][4]; // [y][x]
  2118. uint32_t num_part_pixels[2] = { 0, 0 };
  2119. // Extract each subset's texels for this partition pattern
  2120. for (uint32_t y = 0; y < 4; y++)
  2121. {
  2122. for (uint32_t x = 0; x < 4; x++)
  2123. {
  2124. uint32_t part = basist::g_bc7_partition2[16 * bc7_pattern + x + y * 4];
  2125. if (invert_flag)
  2126. part = 1 - part;
  2127. pixel_part_index[y][x] = part;
  2128. part_pixels[part][num_part_pixels[part]] = pBlock_linear_colors[x + y * 4];
  2129. num_part_pixels[part]++;
  2130. }
  2131. }
  2132. trial_blk.m_partition_id = astc_pattern;
  2133. for (uint32_t weight_ise_range = first_weight_ise_range; weight_ise_range <= last_weight_ise_range; weight_ise_range++)
  2134. {
  2135. assert(weight_ise_range <= astc_helpers::BISE_8_LEVELS);
  2136. uint32_t ise_endpoint_range = astc_helpers::BISE_256_LEVELS;
  2137. if (weight_ise_range == astc_helpers::BISE_5_LEVELS)
  2138. ise_endpoint_range = astc_helpers::BISE_192_LEVELS;
  2139. else if (weight_ise_range == astc_helpers::BISE_6_LEVELS)
  2140. ise_endpoint_range = astc_helpers::BISE_128_LEVELS;
  2141. else if (weight_ise_range == astc_helpers::BISE_8_LEVELS)
  2142. ise_endpoint_range = astc_helpers::BISE_80_LEVELS;
  2143. uint8_t trial_endpoints[2][NUM_MODE7_ENDPOINTS], trial_weights[2][16];
  2144. uint32_t trial_submode7[2];
  2145. clear_obj(trial_endpoints);
  2146. clear_obj(trial_weights);
  2147. clear_obj(trial_submode7);
  2148. double total_trial_err = 0;
  2149. for (uint32_t pack_part_index = 0; pack_part_index < 2; pack_part_index++)
  2150. {
  2151. total_trial_err += encode_astc_hdr_block_mode_7(
  2152. num_part_pixels[pack_part_index], &part_pixels[pack_part_index][0],
  2153. weight_ise_range, trial_submode7[pack_part_index], 1e+30f,
  2154. &trial_endpoints[pack_part_index][0], &trial_weights[pack_part_index][0], coptions, ise_endpoint_range);
  2155. } // pack_part_index
  2156. if (total_trial_err < 1e+30f)
  2157. {
  2158. trial_blk.m_weight_ise_range = weight_ise_range;
  2159. trial_blk.m_endpoint_ise_range = ise_endpoint_range;
  2160. for (uint32_t pack_part_index = 0; pack_part_index < 2; pack_part_index++)
  2161. memcpy(&trial_blk.m_endpoints[pack_part_index * NUM_MODE7_ENDPOINTS], &trial_endpoints[pack_part_index][0], NUM_MODE7_ENDPOINTS);
  2162. uint32_t src_pixel_index[2] = { 0, 0 };
  2163. for (uint32_t y = 0; y < 4; y++)
  2164. {
  2165. for (uint32_t x = 0; x < 4; x++)
  2166. {
  2167. uint32_t p = pixel_part_index[y][x];
  2168. trial_blk.m_weights[x + y * 4] = trial_weights[p][src_pixel_index[p]++];
  2169. }
  2170. }
  2171. astc_hdr_pack_results results;
  2172. results.clear();
  2173. results.m_best_block_error = total_trial_err;
  2174. results.m_best_submodes[0] = trial_submode7[0];
  2175. results.m_best_submodes[1] = trial_submode7[1];
  2176. results.m_best_pat_index = part_index;
  2177. results.m_best_blk = trial_blk;
  2178. bool status = transcode_bc6h_2subsets(part_index, results.m_best_blk, results.m_bc6h_block);
  2179. assert(status);
  2180. BASISU_NOTE_UNUSED(status);
  2181. all_results.push_back(results);
  2182. }
  2183. } // weight_ise_range
  2184. } // part_index
  2185. }
  2186. //--------------------------------------------------------------------------------------------------------------------------
  2187. static void pack_mode11_2part(const vec4F* pBlock_linear_colors, basisu::vector<astc_hdr_pack_results>& all_results, const astc_hdr_codec_options& coptions,
  2188. int num_estimated_partitions, const int* pEstimated_partitions)
  2189. {
  2190. assert(coptions.m_mode11_part2_part_masks);
  2191. astc_helpers::log_astc_block trial_blk;
  2192. clear_obj(trial_blk);
  2193. trial_blk.m_grid_width = 4;
  2194. trial_blk.m_grid_height = 4;
  2195. trial_blk.m_num_partitions = 2;
  2196. trial_blk.m_color_endpoint_modes[0] = 11;
  2197. trial_blk.m_color_endpoint_modes[1] = 11;
  2198. uint32_t first_part_index = 0, last_part_index = basist::TOTAL_ASTC_BC6H_COMMON_PARTITIONS2;
  2199. if (num_estimated_partitions)
  2200. {
  2201. first_part_index = 0;
  2202. last_part_index = num_estimated_partitions;
  2203. }
  2204. for (uint32_t part_index_iter = first_part_index; part_index_iter < last_part_index; ++part_index_iter)
  2205. {
  2206. uint32_t part_index;
  2207. if (num_estimated_partitions)
  2208. {
  2209. part_index = pEstimated_partitions[part_index_iter];
  2210. assert(part_index < basist::TOTAL_ASTC_BC6H_COMMON_PARTITIONS2);
  2211. }
  2212. else
  2213. {
  2214. part_index = part_index_iter;
  2215. if (((1U << part_index) & coptions.m_mode11_part2_part_masks) == 0)
  2216. continue;
  2217. }
  2218. const uint32_t astc_pattern = basist::g_astc_bc7_common_partitions2[part_index].m_astc;
  2219. const uint32_t bc7_pattern = basist::g_astc_bc7_common_partitions2[part_index].m_bc7;
  2220. const bool invert_flag = basist::g_astc_bc7_common_partitions2[part_index].m_invert;
  2221. vec4F part_pixels[2][16];
  2222. uint32_t pixel_part_index[4][4]; // [y][x]
  2223. uint32_t num_part_pixels[2] = { 0, 0 };
  2224. // Extract each subset's texels for this partition pattern
  2225. for (uint32_t y = 0; y < 4; y++)
  2226. {
  2227. for (uint32_t x = 0; x < 4; x++)
  2228. {
  2229. uint32_t part = basist::g_bc7_partition2[16 * bc7_pattern + x + y * 4];
  2230. if (invert_flag)
  2231. part = 1 - part;
  2232. pixel_part_index[y][x] = part;
  2233. part_pixels[part][num_part_pixels[part]] = pBlock_linear_colors[x + y * 4];
  2234. num_part_pixels[part]++;
  2235. }
  2236. }
  2237. trial_blk.m_partition_id = astc_pattern;
  2238. for (uint32_t weight_ise_range = coptions.m_first_mode11_part2_weight_ise_range; weight_ise_range <= coptions.m_last_mode11_part2_weight_ise_range; weight_ise_range++)
  2239. {
  2240. bool direct_only = false;
  2241. uint32_t ise_endpoint_range = astc_helpers::BISE_64_LEVELS;
  2242. if (weight_ise_range == astc_helpers::BISE_4_LEVELS)
  2243. ise_endpoint_range = astc_helpers::BISE_40_LEVELS;
  2244. uint8_t trial_endpoints[2][NUM_MODE11_ENDPOINTS], trial_weights[2][16];
  2245. uint32_t trial_submode11[2];
  2246. clear_obj(trial_endpoints);
  2247. clear_obj(trial_weights);
  2248. clear_obj(trial_submode11);
  2249. double total_trial_err = 0;
  2250. for (uint32_t pack_part_index = 0; pack_part_index < 2; pack_part_index++)
  2251. {
  2252. total_trial_err += encode_astc_hdr_block_mode_11(
  2253. num_part_pixels[pack_part_index], &part_pixels[pack_part_index][0],
  2254. weight_ise_range, trial_submode11[pack_part_index], 1e+30f,
  2255. &trial_endpoints[pack_part_index][0], &trial_weights[pack_part_index][0], coptions,
  2256. direct_only, ise_endpoint_range, coptions.m_mode11_uber_mode && (weight_ise_range >= astc_helpers::BISE_4_LEVELS) && coptions.m_allow_uber_mode, false,
  2257. coptions.m_first_mode11_submode, coptions.m_last_mode11_submode);
  2258. } // pack_part_index
  2259. if (total_trial_err < 1e+30f)
  2260. {
  2261. trial_blk.m_weight_ise_range = weight_ise_range;
  2262. trial_blk.m_endpoint_ise_range = ise_endpoint_range;
  2263. for (uint32_t pack_part_index = 0; pack_part_index < 2; pack_part_index++)
  2264. memcpy(&trial_blk.m_endpoints[pack_part_index * NUM_MODE11_ENDPOINTS], &trial_endpoints[pack_part_index][0], NUM_MODE11_ENDPOINTS);
  2265. uint32_t src_pixel_index[2] = { 0, 0 };
  2266. for (uint32_t y = 0; y < 4; y++)
  2267. {
  2268. for (uint32_t x = 0; x < 4; x++)
  2269. {
  2270. uint32_t p = pixel_part_index[y][x];
  2271. trial_blk.m_weights[x + y * 4] = trial_weights[p][src_pixel_index[p]++];
  2272. }
  2273. }
  2274. astc_hdr_pack_results results;
  2275. results.clear();
  2276. results.m_best_block_error = total_trial_err;
  2277. results.m_best_submodes[0] = trial_submode11[0];
  2278. results.m_best_submodes[1] = trial_submode11[1];
  2279. results.m_best_pat_index = part_index;
  2280. results.m_best_blk = trial_blk;
  2281. bool status = transcode_bc6h_2subsets(part_index, results.m_best_blk, results.m_bc6h_block);
  2282. assert(status);
  2283. BASISU_NOTE_UNUSED(status);
  2284. all_results.push_back(results);
  2285. }
  2286. } // weight_ise_range
  2287. } // part_index
  2288. }
  2289. //--------------------------------------------------------------------------------------------------------------------------
  2290. bool g_astc_hdr_enc_initialized;
  2291. void astc_hdr_enc_init()
  2292. {
  2293. if (g_astc_hdr_enc_initialized)
  2294. return;
  2295. astc_hdr_core_init();
  2296. astc_helpers::init_tables(true);
  2297. init_qlog_tables();
  2298. encode_astc_hdr_init();
  2299. g_astc_hdr_enc_initialized = true;
  2300. }
  2301. bool astc_hdr_enc_block(
  2302. const float* pRGBPixels,
  2303. const astc_hdr_codec_options& coptions,
  2304. basisu::vector<astc_hdr_pack_results>& all_results)
  2305. {
  2306. assert(g_astc_hdr_enc_initialized);
  2307. if (!g_astc_hdr_enc_initialized)
  2308. {
  2309. // astc_hdr_enc_init() MUST be called first.
  2310. assert(0);
  2311. return false;
  2312. }
  2313. all_results.resize(0);
  2314. vec4F block_linear_colors[16];
  2315. // Sanity check the input block.
  2316. for (uint32_t i = 0; i < 16; i++)
  2317. {
  2318. for (uint32_t j = 0; j < 3; j++)
  2319. {
  2320. float v = pRGBPixels[i * 3 + j];
  2321. if (std::isinf(v) || std::isnan(v))
  2322. {
  2323. // Input pixels cannot be NaN or +-Inf.
  2324. assert(0);
  2325. return false;
  2326. }
  2327. if (v < 0.0f)
  2328. {
  2329. // Input pixels cannot be signed.
  2330. assert(0);
  2331. return false;
  2332. }
  2333. if (v > MAX_HALF_FLOAT)
  2334. {
  2335. // Too large for half float.
  2336. assert(0);
  2337. return false;
  2338. }
  2339. block_linear_colors[i][j] = v;
  2340. }
  2341. block_linear_colors[i][3] = 1.0f;
  2342. }
  2343. assert(coptions.m_use_solid || coptions.m_use_mode11 || coptions.m_use_mode7_part2 || coptions.m_use_mode7_part1 || coptions.m_use_mode11_part2);
  2344. bool is_solid = false;
  2345. if (coptions.m_use_solid)
  2346. is_solid = pack_solid(block_linear_colors, all_results, coptions);
  2347. if (!is_solid)
  2348. {
  2349. if (coptions.m_use_mode11)
  2350. {
  2351. const size_t cur_num_results = all_results.size();
  2352. pack_mode11(block_linear_colors, all_results, coptions, coptions.m_first_mode11_weight_ise_range, coptions.m_last_mode11_weight_ise_range, false);
  2353. if (coptions.m_last_mode11_weight_ise_range == astc_helpers::BISE_16_LEVELS)
  2354. {
  2355. pack_mode11(block_linear_colors, all_results, coptions, astc_helpers::BISE_16_LEVELS, astc_helpers::BISE_16_LEVELS, true);
  2356. }
  2357. // If we couldn't get any mode 11 results at all, and we were restricted to just trying weight ISE range 8 (which required endpoint quantization) then
  2358. // fall back to weight ISE range 7 (which doesn't need any endpoint quantization).
  2359. // This is to guarantee we always get at least 1 non-solid result.
  2360. if (all_results.size() == cur_num_results)
  2361. {
  2362. if (coptions.m_first_mode11_weight_ise_range == astc_helpers::BISE_16_LEVELS)
  2363. {
  2364. pack_mode11(block_linear_colors, all_results, coptions, astc_helpers::BISE_12_LEVELS, astc_helpers::BISE_12_LEVELS, false);
  2365. }
  2366. }
  2367. }
  2368. if (coptions.m_use_mode7_part1)
  2369. {
  2370. // Mode 7 1-subset never requires endpoint quantization, so it cannot fail to find at least one usable solution.
  2371. pack_mode7_single_part(block_linear_colors, all_results, coptions);
  2372. }
  2373. bool have_est = false;
  2374. int best_parts[basist::TOTAL_ASTC_BC6H_COMMON_PARTITIONS2];
  2375. if ((coptions.m_use_mode7_part2) || (coptions.m_use_mode11_part2))
  2376. {
  2377. if (coptions.m_use_estimated_partitions)
  2378. have_est = estimate_partition2(block_linear_colors, best_parts, coptions.m_max_estimated_partitions);
  2379. }
  2380. if (coptions.m_use_mode7_part2)
  2381. {
  2382. const size_t cur_num_results = all_results.size();
  2383. pack_mode7_2part(block_linear_colors, all_results, coptions, have_est ? coptions.m_max_estimated_partitions : 0, best_parts,
  2384. coptions.m_first_mode7_part2_weight_ise_range, coptions.m_last_mode7_part2_weight_ise_range);
  2385. // If we couldn't find any packable 2-subset mode 7 results at weight levels >= 5 levels (which always requires endpoint quant), then try falling back to
  2386. // 5 levels which doesn't require endpoint quantization.
  2387. if (all_results.size() == cur_num_results)
  2388. {
  2389. if (coptions.m_first_mode7_part2_weight_ise_range >= astc_helpers::BISE_5_LEVELS)
  2390. {
  2391. pack_mode7_2part(block_linear_colors, all_results, coptions, have_est ? coptions.m_max_estimated_partitions : 0, best_parts,
  2392. astc_helpers::BISE_4_LEVELS, astc_helpers::BISE_4_LEVELS);
  2393. }
  2394. }
  2395. }
  2396. if (coptions.m_use_mode11_part2)
  2397. {
  2398. // This always requires endpoint quant, so it could fail to find any usable solutions.
  2399. pack_mode11_2part(block_linear_colors, all_results, coptions, have_est ? coptions.m_max_estimated_partitions : 0, best_parts);
  2400. }
  2401. }
  2402. if (coptions.m_refine_weights)
  2403. {
  2404. // TODO: Move this above, do it once only.
  2405. basist::half_float rgb_pixels_half[16 * 3];
  2406. for (uint32_t i = 0; i < 16; i++)
  2407. {
  2408. rgb_pixels_half[i * 3 + 0] = float_to_half_non_neg_no_nan_inf(pRGBPixels[i * 3 + 0]);
  2409. rgb_pixels_half[i * 3 + 1] = float_to_half_non_neg_no_nan_inf(pRGBPixels[i * 3 + 1]);
  2410. rgb_pixels_half[i * 3 + 2] = float_to_half_non_neg_no_nan_inf(pRGBPixels[i * 3 + 2]);
  2411. }
  2412. for (uint32_t i = 0; i < all_results.size(); i++)
  2413. {
  2414. bool status = astc_hdr_refine_weights(rgb_pixels_half, all_results[i], coptions, coptions.m_bc6h_err_weight, &all_results[i].m_improved_via_refinement_flag);
  2415. assert(status);
  2416. BASISU_NOTE_UNUSED(status);
  2417. }
  2418. }
  2419. return true;
  2420. }
  2421. bool astc_hdr_pack_results_to_block(astc_blk& dst_blk, const astc_hdr_pack_results& results)
  2422. {
  2423. assert(g_astc_hdr_enc_initialized);
  2424. if (!g_astc_hdr_enc_initialized)
  2425. return false;
  2426. if (results.m_is_solid)
  2427. {
  2428. memcpy(&dst_blk, &results.m_solid_blk, sizeof(results.m_solid_blk));
  2429. }
  2430. else
  2431. {
  2432. bool status = astc_helpers::pack_astc_block((astc_helpers::astc_block&)dst_blk, results.m_best_blk);
  2433. if (!status)
  2434. {
  2435. assert(0);
  2436. return false;
  2437. }
  2438. }
  2439. return true;
  2440. }
  2441. // Refines a block's chosen weight indices, balancing BC6H and ASTC HDR error.
  2442. bool astc_hdr_refine_weights(const half_float *pSource_block, astc_hdr_pack_results& cur_results, const astc_hdr_codec_options& coptions, float bc6h_weight, bool *pImproved_flag)
  2443. {
  2444. if (pImproved_flag)
  2445. *pImproved_flag = false;
  2446. if (cur_results.m_is_solid)
  2447. return true;
  2448. const uint32_t total_weights = astc_helpers::get_ise_levels(cur_results.m_best_blk.m_weight_ise_range);
  2449. assert((total_weights >= 3) && (total_weights <= 16));
  2450. double best_err[4][4];
  2451. uint8_t best_weight[4][4];
  2452. for (uint32_t y = 0; y < 4; y++)
  2453. {
  2454. for (uint32_t x = 0; x < 4; x++)
  2455. {
  2456. best_err[y][x] = 1e+30f;
  2457. best_weight[y][x] = 0;
  2458. }
  2459. }
  2460. astc_hdr_pack_results temp_results;
  2461. const float c_weights[3] = { coptions.m_r_err_scale, coptions.m_g_err_scale, 1.0f };
  2462. for (uint32_t weight_index = 0; weight_index < total_weights; weight_index++)
  2463. {
  2464. temp_results = cur_results;
  2465. for (uint32_t i = 0; i < 16; i++)
  2466. temp_results.m_best_blk.m_weights[i] = (uint8_t)weight_index;
  2467. half_float unpacked_astc_blk_rgba[4][4][4];
  2468. bool res = astc_helpers::decode_block(temp_results.m_best_blk, unpacked_astc_blk_rgba, 4, 4, astc_helpers::cDecodeModeHDR16);
  2469. assert(res);
  2470. basist::bc6h_block trial_bc6h_blk;
  2471. res = basist::astc_hdr_transcode_to_bc6h(temp_results.m_best_blk, trial_bc6h_blk);
  2472. assert(res);
  2473. half_float unpacked_bc6h_blk[4][4][3];
  2474. res = unpack_bc6h(&trial_bc6h_blk, unpacked_bc6h_blk, false);
  2475. assert(res);
  2476. BASISU_NOTE_UNUSED(res);
  2477. for (uint32_t y = 0; y < 4; y++)
  2478. {
  2479. for (uint32_t x = 0; x < 4; x++)
  2480. {
  2481. double total_err = 0.0f;
  2482. for (uint32_t c = 0; c < 3; c++)
  2483. {
  2484. const half_float orig_c = pSource_block[(x + y * 4) * 3 + c];
  2485. const double orig_c_q = q(orig_c);
  2486. const half_float astc_c = unpacked_astc_blk_rgba[y][x][c];
  2487. const double astc_c_q = q(astc_c);
  2488. const double astc_e = square(astc_c_q - orig_c_q) * c_weights[c];
  2489. const half_float bc6h_c = unpacked_bc6h_blk[y][x][c];
  2490. const double bc6h_c_q = q(bc6h_c);
  2491. const double bc6h_e = square(bc6h_c_q - orig_c_q) * c_weights[c];
  2492. const double overall_err = astc_e * (1.0f - bc6h_weight) + bc6h_e * bc6h_weight;
  2493. total_err += overall_err;
  2494. } // c
  2495. if (total_err < best_err[y][x])
  2496. {
  2497. best_err[y][x] = total_err;
  2498. best_weight[y][x] = (uint8_t)weight_index;
  2499. }
  2500. } // x
  2501. } // y
  2502. } // weight_index
  2503. bool any_changed = false;
  2504. for (uint32_t i = 0; i < 16; i++)
  2505. {
  2506. if (cur_results.m_best_blk.m_weights[i] != best_weight[i >> 2][i & 3])
  2507. {
  2508. any_changed = true;
  2509. break;
  2510. }
  2511. }
  2512. if (any_changed)
  2513. {
  2514. memcpy(cur_results.m_best_blk.m_weights, best_weight, 16);
  2515. {
  2516. bool res = basist::astc_hdr_transcode_to_bc6h(cur_results.m_best_blk, cur_results.m_bc6h_block);
  2517. assert(res);
  2518. BASISU_NOTE_UNUSED(res);
  2519. half_float unpacked_astc_blk_rgba[4][4][4];
  2520. res = astc_helpers::decode_block(cur_results.m_best_blk, unpacked_astc_blk_rgba, 4, 4, astc_helpers::cDecodeModeHDR16);
  2521. assert(res);
  2522. half_float unpacked_astc_blk_rgb[4][4][3];
  2523. for (uint32_t y = 0; y < 4; y++)
  2524. for (uint32_t x = 0; x < 4; x++)
  2525. for (uint32_t c = 0; c < 3; c++)
  2526. unpacked_astc_blk_rgb[y][x][c] = unpacked_astc_blk_rgba[y][x][c];
  2527. cur_results.m_best_block_error = compute_block_error(pSource_block, &unpacked_astc_blk_rgb[0][0][0], coptions);
  2528. }
  2529. if (pImproved_flag)
  2530. *pImproved_flag = true;
  2531. }
  2532. return true;
  2533. }
  2534. void astc_hdr_block_stats::update(const astc_hdr_pack_results& log_blk)
  2535. {
  2536. std::lock_guard<std::mutex> lck(m_mutex);
  2537. m_total_blocks++;
  2538. if (log_blk.m_improved_via_refinement_flag)
  2539. m_total_refined++;
  2540. if (log_blk.m_is_solid)
  2541. {
  2542. m_total_solid++;
  2543. }
  2544. else
  2545. {
  2546. int best_weight_range = log_blk.m_best_blk.m_weight_ise_range;
  2547. if (log_blk.m_best_blk.m_color_endpoint_modes[0] == 7)
  2548. {
  2549. m_mode7_submode_hist[bounds_check(log_blk.m_best_submodes[0], 0U, 6U)]++;
  2550. if (log_blk.m_best_blk.m_num_partitions == 2)
  2551. {
  2552. m_total_mode7_2part++;
  2553. m_mode7_submode_hist[bounds_check(log_blk.m_best_submodes[1], 0U, 6U)]++;
  2554. m_total_2part++;
  2555. m_weight_range_hist_7_2part[bounds_check(best_weight_range, 0, 11)]++;
  2556. m_part_hist[bounds_check(log_blk.m_best_pat_index, 0U, 32U)]++;
  2557. }
  2558. else
  2559. {
  2560. m_total_mode7_1part++;
  2561. m_weight_range_hist_7[bounds_check(best_weight_range, 0, 11)]++;
  2562. }
  2563. }
  2564. else
  2565. {
  2566. m_mode11_submode_hist[bounds_check(log_blk.m_best_submodes[0], 0U, 9U)]++;
  2567. if (log_blk.m_constrained_weights)
  2568. m_total_mode11_1part_constrained_weights++;
  2569. if (log_blk.m_best_blk.m_num_partitions == 2)
  2570. {
  2571. m_total_mode11_2part++;
  2572. m_mode11_submode_hist[bounds_check(log_blk.m_best_submodes[1], 0U, 9U)]++;
  2573. m_total_2part++;
  2574. m_weight_range_hist_11_2part[bounds_check(best_weight_range, 0, 11)]++;
  2575. m_part_hist[bounds_check(log_blk.m_best_pat_index, 0U, 32U)]++;
  2576. }
  2577. else
  2578. {
  2579. m_total_mode11_1part++;
  2580. m_weight_range_hist_11[bounds_check(best_weight_range, 0, 11)]++;
  2581. }
  2582. }
  2583. }
  2584. }
  2585. void astc_hdr_block_stats::print()
  2586. {
  2587. std::lock_guard<std::mutex> lck(m_mutex);
  2588. assert(m_total_blocks);
  2589. if (!m_total_blocks)
  2590. return;
  2591. printf("\nLow-level ASTC Encoder Statistics:\n");
  2592. printf("Total blocks: %u\n", m_total_blocks);
  2593. printf("Total solid: %u %3.2f%%\n", m_total_solid, (m_total_solid * 100.0f) / m_total_blocks);
  2594. printf("Total refined: %u %3.2f%%\n", m_total_refined, (m_total_refined * 100.0f) / m_total_blocks);
  2595. printf("Total mode 11, 1 partition: %u %3.2f%%\n", m_total_mode11_1part, (m_total_mode11_1part * 100.0f) / m_total_blocks);
  2596. printf("Total mode 11, 1 partition, constrained weights: %u %3.2f%%\n", m_total_mode11_1part_constrained_weights, (m_total_mode11_1part_constrained_weights * 100.0f) / m_total_blocks);
  2597. printf("Total mode 11, 2 partition: %u %3.2f%%\n", m_total_mode11_2part, (m_total_mode11_2part * 100.0f) / m_total_blocks);
  2598. printf("Total mode 7, 1 partition: %u %3.2f%%\n", m_total_mode7_1part, (m_total_mode7_1part * 100.0f) / m_total_blocks);
  2599. printf("Total mode 7, 2 partition: %u %3.2f%%\n", m_total_mode7_2part, (m_total_mode7_2part * 100.0f) / m_total_blocks);
  2600. printf("Total 2 partitions: %u %3.2f%%\n", m_total_2part, (m_total_2part * 100.0f) / m_total_blocks);
  2601. printf("\n");
  2602. printf("ISE texel weight range histogram mode 11:\n");
  2603. for (uint32_t i = 1; i <= MODE11_LAST_ISE_RANGE; i++)
  2604. printf("%u %u\n", i, m_weight_range_hist_11[i]);
  2605. printf("\n");
  2606. printf("ISE texel weight range histogram mode 11, 2 partition:\n");
  2607. for (uint32_t i = 1; i <= MODE11_PART2_LAST_ISE_RANGE; i++)
  2608. printf("%u %u\n", i, m_weight_range_hist_11_2part[i]);
  2609. printf("\n");
  2610. printf("ISE texel weight range histogram mode 7:\n");
  2611. for (uint32_t i = 1; i <= MODE7_PART1_LAST_ISE_RANGE; i++)
  2612. printf("%u %u\n", i, m_weight_range_hist_7[i]);
  2613. printf("\n");
  2614. printf("ISE texel weight range histogram mode 7, 2 partition:\n");
  2615. for (uint32_t i = 1; i <= MODE7_PART2_LAST_ISE_RANGE; i++)
  2616. printf("%u %u\n", i, m_weight_range_hist_7_2part[i]);
  2617. printf("\n");
  2618. printf("Mode 11 submode histogram:\n");
  2619. for (uint32_t i = 0; i <= MODE11_TOTAL_SUBMODES; i++) // +1 because of the extra direct encoding
  2620. printf("%u %u\n", i, m_mode11_submode_hist[i]);
  2621. printf("\n");
  2622. printf("Mode 7 submode histogram:\n");
  2623. for (uint32_t i = 0; i < MODE7_TOTAL_SUBMODES; i++)
  2624. printf("%u %u\n", i, m_mode7_submode_hist[i]);
  2625. printf("\n");
  2626. printf("Partition pattern table usage histogram:\n");
  2627. for (uint32_t i = 0; i < basist::TOTAL_ASTC_BC7_COMMON_PARTITIONS2; i++)
  2628. printf("%u:%u ", i, m_part_hist[i]);
  2629. printf("\n\n");
  2630. }
  2631. } // namespace basisu