astcenc_decompress_symbolic.cpp 19 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622
  1. // SPDX-License-Identifier: Apache-2.0
  2. // ----------------------------------------------------------------------------
  3. // Copyright 2011-2024 Arm Limited
  4. //
  5. // Licensed under the Apache License, Version 2.0 (the "License"); you may not
  6. // use this file except in compliance with the License. You may obtain a copy
  7. // of the License at:
  8. //
  9. // http://www.apache.org/licenses/LICENSE-2.0
  10. //
  11. // Unless required by applicable law or agreed to in writing, software
  12. // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
  13. // WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
  14. // License for the specific language governing permissions and limitations
  15. // under the License.
  16. // ----------------------------------------------------------------------------
  17. /**
  18. * @brief Functions to decompress a symbolic block.
  19. */
  20. #include "astcenc_internal.h"
  21. #include <stdio.h>
  22. #include <assert.h>
  23. /**
  24. * @brief Compute the integer linear interpolation of two color endpoints.
  25. *
  26. * @param u8_mask The mask for lanes using decode_unorm8 rather than decode_f16.
  27. * @param color0 The endpoint0 color.
  28. * @param color1 The endpoint1 color.
  29. * @param weights The interpolation weight (between 0 and 64).
  30. *
  31. * @return The interpolated color.
  32. */
  33. static vint4 lerp_color_int(
  34. vmask4 u8_mask,
  35. vint4 color0,
  36. vint4 color1,
  37. vint4 weights
  38. ) {
  39. vint4 weight1 = weights;
  40. vint4 weight0 = vint4(64) - weight1;
  41. vint4 color = (color0 * weight0) + (color1 * weight1) + vint4(32);
  42. color = asr<6>(color);
  43. // For decode_unorm8 values force the codec to bit replicate. This allows the
  44. // rest of the codec to assume the full 0xFFFF range for everything and ignore
  45. // the decode_mode setting
  46. vint4 color_u8 = asr<8>(color) * vint4(257);
  47. color = select(color, color_u8, u8_mask);
  48. return color;
  49. }
  50. /**
  51. * @brief Convert integer color value into a float value for the decoder.
  52. *
  53. * @param data The integer color value post-interpolation.
  54. * @param lns_mask If set treat lane as HDR (LNS) else LDR (unorm16).
  55. *
  56. * @return The float color value.
  57. */
  58. static inline vfloat4 decode_texel(
  59. vint4 data,
  60. vmask4 lns_mask
  61. ) {
  62. vint4 color_lns = vint4::zero();
  63. vint4 color_unorm = vint4::zero();
  64. if (any(lns_mask))
  65. {
  66. color_lns = lns_to_sf16(data);
  67. }
  68. if (!all(lns_mask))
  69. {
  70. color_unorm = unorm16_to_sf16(data);
  71. }
  72. // Pick components and then convert to FP16
  73. vint4 datai = select(color_unorm, color_lns, lns_mask);
  74. return float16_to_float(datai);
  75. }
  76. /* See header for documentation. */
  77. void unpack_weights(
  78. const block_size_descriptor& bsd,
  79. const symbolic_compressed_block& scb,
  80. const decimation_info& di,
  81. bool is_dual_plane,
  82. int weights_plane1[BLOCK_MAX_TEXELS],
  83. int weights_plane2[BLOCK_MAX_TEXELS]
  84. ) {
  85. // Safe to overshoot as all arrays are allocated to full size
  86. if (!is_dual_plane)
  87. {
  88. // Build full 64-entry weight lookup table
  89. vtable_64x8 table;
  90. vtable_prepare(table, scb.weights);
  91. for (unsigned int i = 0; i < bsd.texel_count; i += ASTCENC_SIMD_WIDTH)
  92. {
  93. vint summed_value(8);
  94. vint weight_count(di.texel_weight_count + i);
  95. int max_weight_count = hmax_s(weight_count);
  96. promise(max_weight_count > 0);
  97. for (int j = 0; j < max_weight_count; j++)
  98. {
  99. vint texel_weights(di.texel_weights_tr[j] + i);
  100. vint texel_weights_int(di.texel_weight_contribs_int_tr[j] + i);
  101. summed_value += vtable_lookup_32bit(table, texel_weights) * texel_weights_int;
  102. }
  103. store(lsr<4>(summed_value), weights_plane1 + i);
  104. }
  105. }
  106. else
  107. {
  108. // Build a 32-entry weight lookup table per plane
  109. // Plane 1
  110. vtable_32x8 tab_plane1;
  111. vtable_prepare(tab_plane1, scb.weights);
  112. // Plane 2
  113. vtable_32x8 tab_plane2;
  114. vtable_prepare(tab_plane2, scb.weights + 32);
  115. for (unsigned int i = 0; i < bsd.texel_count; i += ASTCENC_SIMD_WIDTH)
  116. {
  117. vint sum_plane1(8);
  118. vint sum_plane2(8);
  119. vint weight_count(di.texel_weight_count + i);
  120. int max_weight_count = hmax_s(weight_count);
  121. promise(max_weight_count > 0);
  122. for (int j = 0; j < max_weight_count; j++)
  123. {
  124. vint texel_weights(di.texel_weights_tr[j] + i);
  125. vint texel_weights_int(di.texel_weight_contribs_int_tr[j] + i);
  126. sum_plane1 += vtable_lookup_32bit(tab_plane1, texel_weights) * texel_weights_int;
  127. sum_plane2 += vtable_lookup_32bit(tab_plane2, texel_weights) * texel_weights_int;
  128. }
  129. store(lsr<4>(sum_plane1), weights_plane1 + i);
  130. store(lsr<4>(sum_plane2), weights_plane2 + i);
  131. }
  132. }
  133. }
  134. /**
  135. * @brief Return an FP32 NaN value for use in error colors.
  136. *
  137. * This NaN encoding will turn into 0xFFFF when converted to an FP16 NaN.
  138. *
  139. * @return The float color value.
  140. */
  141. static float error_color_nan()
  142. {
  143. if32 v;
  144. v.u = 0xFFFFE000U;
  145. return v.f;
  146. }
  147. /* See header for documentation. */
  148. void decompress_symbolic_block(
  149. astcenc_profile decode_mode,
  150. const block_size_descriptor& bsd,
  151. int xpos,
  152. int ypos,
  153. int zpos,
  154. const symbolic_compressed_block& scb,
  155. image_block& blk
  156. ) {
  157. blk.xpos = xpos;
  158. blk.ypos = ypos;
  159. blk.zpos = zpos;
  160. blk.data_min = vfloat4::zero();
  161. blk.data_mean = vfloat4::zero();
  162. blk.data_max = vfloat4::zero();
  163. blk.grayscale = false;
  164. // If we detected an error-block, blow up immediately.
  165. if (scb.block_type == SYM_BTYPE_ERROR)
  166. {
  167. for (unsigned int i = 0; i < bsd.texel_count; i++)
  168. {
  169. blk.data_r[i] = error_color_nan();
  170. blk.data_g[i] = error_color_nan();
  171. blk.data_b[i] = error_color_nan();
  172. blk.data_a[i] = error_color_nan();
  173. blk.rgb_lns[i] = 0;
  174. blk.alpha_lns[i] = 0;
  175. }
  176. return;
  177. }
  178. if ((scb.block_type == SYM_BTYPE_CONST_F16) ||
  179. (scb.block_type == SYM_BTYPE_CONST_U16))
  180. {
  181. vfloat4 color;
  182. uint8_t use_lns = 0;
  183. // UNORM16 constant color block
  184. if (scb.block_type == SYM_BTYPE_CONST_U16)
  185. {
  186. vint4 colori(scb.constant_color);
  187. // Determine the UNORM8 rounding on the decode
  188. vmask4 u8_mask = get_u8_component_mask(decode_mode, blk);
  189. // The real decoder would just use the top 8 bits, but we rescale
  190. // in to a 16-bit value that rounds correctly.
  191. vint4 colori_u8 = asr<8>(colori) * 257;
  192. colori = select(colori, colori_u8, u8_mask);
  193. vint4 colorf16 = unorm16_to_sf16(colori);
  194. color = float16_to_float(colorf16);
  195. }
  196. // FLOAT16 constant color block
  197. else
  198. {
  199. switch (decode_mode)
  200. {
  201. case ASTCENC_PRF_LDR_SRGB:
  202. case ASTCENC_PRF_LDR:
  203. color = vfloat4(error_color_nan());
  204. break;
  205. case ASTCENC_PRF_HDR_RGB_LDR_A:
  206. case ASTCENC_PRF_HDR:
  207. // Constant-color block; unpack from FP16 to FP32.
  208. color = float16_to_float(vint4(scb.constant_color));
  209. use_lns = 1;
  210. break;
  211. }
  212. }
  213. for (unsigned int i = 0; i < bsd.texel_count; i++)
  214. {
  215. blk.data_r[i] = color.lane<0>();
  216. blk.data_g[i] = color.lane<1>();
  217. blk.data_b[i] = color.lane<2>();
  218. blk.data_a[i] = color.lane<3>();
  219. blk.rgb_lns[i] = use_lns;
  220. blk.alpha_lns[i] = use_lns;
  221. }
  222. return;
  223. }
  224. // Get the appropriate partition-table entry
  225. int partition_count = scb.partition_count;
  226. const auto& pi = bsd.get_partition_info(partition_count, scb.partition_index);
  227. // Get the appropriate block descriptors
  228. const auto& bm = bsd.get_block_mode(scb.block_mode);
  229. const auto& di = bsd.get_decimation_info(bm.decimation_mode);
  230. bool is_dual_plane = static_cast<bool>(bm.is_dual_plane);
  231. // Unquantize and undecimate the weights
  232. int plane1_weights[BLOCK_MAX_TEXELS];
  233. int plane2_weights[BLOCK_MAX_TEXELS];
  234. unpack_weights(bsd, scb, di, is_dual_plane, plane1_weights, plane2_weights);
  235. // Now that we have endpoint colors and weights, we can unpack texel colors
  236. int plane2_component = scb.plane2_component;
  237. vmask4 plane2_mask = vint4::lane_id() == vint4(plane2_component);
  238. vmask4 u8_mask = get_u8_component_mask(decode_mode, blk);
  239. for (int i = 0; i < partition_count; i++)
  240. {
  241. // Decode the color endpoints for this partition
  242. vint4 ep0;
  243. vint4 ep1;
  244. bool rgb_lns;
  245. bool a_lns;
  246. unpack_color_endpoints(decode_mode,
  247. scb.color_formats[i],
  248. scb.color_values[i],
  249. rgb_lns, a_lns,
  250. ep0, ep1);
  251. vmask4 lns_mask(rgb_lns, rgb_lns, rgb_lns, a_lns);
  252. int texel_count = pi.partition_texel_count[i];
  253. for (int j = 0; j < texel_count; j++)
  254. {
  255. int tix = pi.texels_of_partition[i][j];
  256. vint4 weight = select(vint4(plane1_weights[tix]), vint4(plane2_weights[tix]), plane2_mask);
  257. vint4 color = lerp_color_int(u8_mask, ep0, ep1, weight);
  258. vfloat4 colorf = decode_texel(color, lns_mask);
  259. blk.data_r[tix] = colorf.lane<0>();
  260. blk.data_g[tix] = colorf.lane<1>();
  261. blk.data_b[tix] = colorf.lane<2>();
  262. blk.data_a[tix] = colorf.lane<3>();
  263. }
  264. }
  265. }
  266. #if !defined(ASTCENC_DECOMPRESS_ONLY)
  267. /* See header for documentation. */
  268. float compute_symbolic_block_difference_2plane(
  269. const astcenc_config& config,
  270. const block_size_descriptor& bsd,
  271. const symbolic_compressed_block& scb,
  272. const image_block& blk
  273. ) {
  274. // If we detected an error-block, blow up immediately.
  275. if (scb.block_type == SYM_BTYPE_ERROR)
  276. {
  277. return ERROR_CALC_DEFAULT;
  278. }
  279. assert(scb.block_mode >= 0);
  280. assert(scb.partition_count == 1);
  281. assert(bsd.get_block_mode(scb.block_mode).is_dual_plane == 1);
  282. // Get the appropriate block descriptor
  283. const block_mode& bm = bsd.get_block_mode(scb.block_mode);
  284. const decimation_info& di = bsd.get_decimation_info(bm.decimation_mode);
  285. // Unquantize and undecimate the weights
  286. int plane1_weights[BLOCK_MAX_TEXELS];
  287. int plane2_weights[BLOCK_MAX_TEXELS];
  288. unpack_weights(bsd, scb, di, true, plane1_weights, plane2_weights);
  289. vmask4 plane2_mask = vint4::lane_id() == vint4(scb.plane2_component);
  290. vfloat4 summa = vfloat4::zero();
  291. // Decode the color endpoints for this partition
  292. vint4 ep0;
  293. vint4 ep1;
  294. bool rgb_lns;
  295. bool a_lns;
  296. unpack_color_endpoints(config.profile,
  297. scb.color_formats[0],
  298. scb.color_values[0],
  299. rgb_lns, a_lns,
  300. ep0, ep1);
  301. vmask4 u8_mask = get_u8_component_mask(config.profile, blk);
  302. // Unpack and compute error for each texel in the partition
  303. unsigned int texel_count = bsd.texel_count;
  304. for (unsigned int i = 0; i < texel_count; i++)
  305. {
  306. vint4 weight = select(vint4(plane1_weights[i]), vint4(plane2_weights[i]), plane2_mask);
  307. vint4 colori = lerp_color_int(u8_mask, ep0, ep1, weight);
  308. vfloat4 color = int_to_float(colori);
  309. vfloat4 oldColor = blk.texel(i);
  310. // Compare error using a perceptual decode metric for RGBM textures
  311. if (config.flags & ASTCENC_FLG_MAP_RGBM)
  312. {
  313. // Fail encodings that result in zero weight M pixels. Note that this can cause
  314. // "interesting" artifacts if we reject all useful encodings - we typically get max
  315. // brightness encodings instead which look just as bad. We recommend users apply a
  316. // bias to their stored M value, limiting the lower value to 16 or 32 to avoid
  317. // getting small M values post-quantization, but we can't prove it would never
  318. // happen, especially at low bit rates ...
  319. if (color.lane<3>() == 0.0f)
  320. {
  321. return -ERROR_CALC_DEFAULT;
  322. }
  323. // Compute error based on decoded RGBM color
  324. color = vfloat4(
  325. color.lane<0>() * color.lane<3>() * config.rgbm_m_scale,
  326. color.lane<1>() * color.lane<3>() * config.rgbm_m_scale,
  327. color.lane<2>() * color.lane<3>() * config.rgbm_m_scale,
  328. 1.0f
  329. );
  330. oldColor = vfloat4(
  331. oldColor.lane<0>() * oldColor.lane<3>() * config.rgbm_m_scale,
  332. oldColor.lane<1>() * oldColor.lane<3>() * config.rgbm_m_scale,
  333. oldColor.lane<2>() * oldColor.lane<3>() * config.rgbm_m_scale,
  334. 1.0f
  335. );
  336. }
  337. vfloat4 error = oldColor - color;
  338. error = min(abs(error), 1e15f);
  339. error = error * error;
  340. summa += min(dot(error, blk.channel_weight), ERROR_CALC_DEFAULT);
  341. }
  342. return summa.lane<0>();
  343. }
  344. /* See header for documentation. */
  345. float compute_symbolic_block_difference_1plane(
  346. const astcenc_config& config,
  347. const block_size_descriptor& bsd,
  348. const symbolic_compressed_block& scb,
  349. const image_block& blk
  350. ) {
  351. assert(bsd.get_block_mode(scb.block_mode).is_dual_plane == 0);
  352. // If we detected an error-block, blow up immediately.
  353. if (scb.block_type == SYM_BTYPE_ERROR)
  354. {
  355. return ERROR_CALC_DEFAULT;
  356. }
  357. assert(scb.block_mode >= 0);
  358. // Get the appropriate partition-table entry
  359. unsigned int partition_count = scb.partition_count;
  360. const auto& pi = bsd.get_partition_info(partition_count, scb.partition_index);
  361. // Get the appropriate block descriptor
  362. const block_mode& bm = bsd.get_block_mode(scb.block_mode);
  363. const decimation_info& di = bsd.get_decimation_info(bm.decimation_mode);
  364. // Unquantize and undecimate the weights
  365. int plane1_weights[BLOCK_MAX_TEXELS];
  366. unpack_weights(bsd, scb, di, false, plane1_weights, nullptr);
  367. vmask4 u8_mask = get_u8_component_mask(config.profile, blk);
  368. vfloat4 summa = vfloat4::zero();
  369. for (unsigned int i = 0; i < partition_count; i++)
  370. {
  371. // Decode the color endpoints for this partition
  372. vint4 ep0;
  373. vint4 ep1;
  374. bool rgb_lns;
  375. bool a_lns;
  376. unpack_color_endpoints(config.profile,
  377. scb.color_formats[i],
  378. scb.color_values[i],
  379. rgb_lns, a_lns,
  380. ep0, ep1);
  381. // Unpack and compute error for each texel in the partition
  382. unsigned int texel_count = pi.partition_texel_count[i];
  383. for (unsigned int j = 0; j < texel_count; j++)
  384. {
  385. unsigned int tix = pi.texels_of_partition[i][j];
  386. vint4 colori = lerp_color_int(u8_mask, ep0, ep1,
  387. vint4(plane1_weights[tix]));
  388. vfloat4 color = int_to_float(colori);
  389. vfloat4 oldColor = blk.texel(tix);
  390. // Compare error using a perceptual decode metric for RGBM textures
  391. if (config.flags & ASTCENC_FLG_MAP_RGBM)
  392. {
  393. // Fail encodings that result in zero weight M pixels. Note that this can cause
  394. // "interesting" artifacts if we reject all useful encodings - we typically get max
  395. // brightness encodings instead which look just as bad. We recommend users apply a
  396. // bias to their stored M value, limiting the lower value to 16 or 32 to avoid
  397. // getting small M values post-quantization, but we can't prove it would never
  398. // happen, especially at low bit rates ...
  399. if (color.lane<3>() == 0.0f)
  400. {
  401. return -ERROR_CALC_DEFAULT;
  402. }
  403. // Compute error based on decoded RGBM color
  404. color = vfloat4(
  405. color.lane<0>() * color.lane<3>() * config.rgbm_m_scale,
  406. color.lane<1>() * color.lane<3>() * config.rgbm_m_scale,
  407. color.lane<2>() * color.lane<3>() * config.rgbm_m_scale,
  408. 1.0f
  409. );
  410. oldColor = vfloat4(
  411. oldColor.lane<0>() * oldColor.lane<3>() * config.rgbm_m_scale,
  412. oldColor.lane<1>() * oldColor.lane<3>() * config.rgbm_m_scale,
  413. oldColor.lane<2>() * oldColor.lane<3>() * config.rgbm_m_scale,
  414. 1.0f
  415. );
  416. }
  417. vfloat4 error = oldColor - color;
  418. error = min(abs(error), 1e15f);
  419. error = error * error;
  420. summa += min(dot(error, blk.channel_weight), ERROR_CALC_DEFAULT);
  421. }
  422. }
  423. return summa.lane<0>();
  424. }
  425. /* See header for documentation. */
  426. float compute_symbolic_block_difference_1plane_1partition(
  427. const astcenc_config& config,
  428. const block_size_descriptor& bsd,
  429. const symbolic_compressed_block& scb,
  430. const image_block& blk
  431. ) {
  432. // If we detected an error-block, blow up immediately.
  433. if (scb.block_type == SYM_BTYPE_ERROR)
  434. {
  435. return ERROR_CALC_DEFAULT;
  436. }
  437. assert(scb.block_mode >= 0);
  438. assert(bsd.get_partition_info(scb.partition_count, scb.partition_index).partition_count == 1);
  439. // Get the appropriate block descriptor
  440. const block_mode& bm = bsd.get_block_mode(scb.block_mode);
  441. const decimation_info& di = bsd.get_decimation_info(bm.decimation_mode);
  442. // Unquantize and undecimate the weights
  443. ASTCENC_ALIGNAS int plane1_weights[BLOCK_MAX_TEXELS];
  444. unpack_weights(bsd, scb, di, false, plane1_weights, nullptr);
  445. // Decode the color endpoints for this partition
  446. vint4 ep0;
  447. vint4 ep1;
  448. bool rgb_lns;
  449. bool a_lns;
  450. unpack_color_endpoints(config.profile,
  451. scb.color_formats[0],
  452. scb.color_values[0],
  453. rgb_lns, a_lns,
  454. ep0, ep1);
  455. vmask4 u8_mask = get_u8_component_mask(config.profile, blk);
  456. // Unpack and compute error for each texel in the partition
  457. vfloatacc summav = vfloatacc::zero();
  458. vint lane_id = vint::lane_id();
  459. unsigned int texel_count = bsd.texel_count;
  460. for (unsigned int i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH)
  461. {
  462. // Compute EP1 contribution
  463. vint weight1 = vint::loada(plane1_weights + i);
  464. vint ep1_r = vint(ep1.lane<0>()) * weight1;
  465. vint ep1_g = vint(ep1.lane<1>()) * weight1;
  466. vint ep1_b = vint(ep1.lane<2>()) * weight1;
  467. vint ep1_a = vint(ep1.lane<3>()) * weight1;
  468. // Compute EP0 contribution
  469. vint weight0 = vint(64) - weight1;
  470. vint ep0_r = vint(ep0.lane<0>()) * weight0;
  471. vint ep0_g = vint(ep0.lane<1>()) * weight0;
  472. vint ep0_b = vint(ep0.lane<2>()) * weight0;
  473. vint ep0_a = vint(ep0.lane<3>()) * weight0;
  474. // Combine contributions
  475. vint colori_r = asr<6>(ep0_r + ep1_r + vint(32));
  476. vint colori_g = asr<6>(ep0_g + ep1_g + vint(32));
  477. vint colori_b = asr<6>(ep0_b + ep1_b + vint(32));
  478. vint colori_a = asr<6>(ep0_a + ep1_a + vint(32));
  479. // If using a U8 decode mode bit replicate top 8 bits
  480. // so rest of codec can assume 0xFFFF max range everywhere
  481. vint colori_r8 = asr<8>(colori_r) * vint(257);
  482. colori_r = select(colori_r, colori_r8, vmask(u8_mask.lane<0>()));
  483. vint colori_g8 = asr<8>(colori_g) * vint(257);
  484. colori_g = select(colori_g, colori_g8, vmask(u8_mask.lane<1>()));
  485. vint colori_b8 = asr<8>(colori_b) * vint(257);
  486. colori_b = select(colori_b, colori_b8, vmask(u8_mask.lane<2>()));
  487. vint colori_a8 = asr<8>(colori_a) * vint(257);
  488. colori_a = select(colori_a, colori_a8, vmask(u8_mask.lane<3>()));
  489. // Compute color diff
  490. vfloat color_r = int_to_float(colori_r);
  491. vfloat color_g = int_to_float(colori_g);
  492. vfloat color_b = int_to_float(colori_b);
  493. vfloat color_a = int_to_float(colori_a);
  494. vfloat color_orig_r = loada(blk.data_r + i);
  495. vfloat color_orig_g = loada(blk.data_g + i);
  496. vfloat color_orig_b = loada(blk.data_b + i);
  497. vfloat color_orig_a = loada(blk.data_a + i);
  498. vfloat color_error_r = min(abs(color_orig_r - color_r), vfloat(1e15f));
  499. vfloat color_error_g = min(abs(color_orig_g - color_g), vfloat(1e15f));
  500. vfloat color_error_b = min(abs(color_orig_b - color_b), vfloat(1e15f));
  501. vfloat color_error_a = min(abs(color_orig_a - color_a), vfloat(1e15f));
  502. // Compute squared error metric
  503. color_error_r = color_error_r * color_error_r;
  504. color_error_g = color_error_g * color_error_g;
  505. color_error_b = color_error_b * color_error_b;
  506. color_error_a = color_error_a * color_error_a;
  507. vfloat metric = color_error_r * blk.channel_weight.lane<0>()
  508. + color_error_g * blk.channel_weight.lane<1>()
  509. + color_error_b * blk.channel_weight.lane<2>()
  510. + color_error_a * blk.channel_weight.lane<3>();
  511. // Mask off bad lanes
  512. vmask mask = lane_id < vint(texel_count);
  513. lane_id += vint(ASTCENC_SIMD_WIDTH);
  514. haccumulate(summav, metric, mask);
  515. }
  516. return hadd_s(summav);
  517. }
  518. #endif