astcenc_ideal_endpoints_and_weights.cpp 51 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022102310241025102610271028102910301031103210331034103510361037103810391040104110421043104410451046104710481049105010511052105310541055105610571058105910601061106210631064106510661067106810691070107110721073107410751076107710781079108010811082108310841085108610871088108910901091109210931094109510961097109810991100110111021103110411051106110711081109111011111112111311141115111611171118111911201121112211231124112511261127112811291130113111321133113411351136113711381139114011411142114311441145114611471148114911501151115211531154115511561157115811591160116111621163116411651166116711681169117011711172117311741175117611771178117911801181118211831184118511861187118811891190119111921193119411951196119711981199120012011202120312041205120612071208120912101211121212131214121512161217121812191220122112221223122412251226122712281229123012311232123312341235123612371238123912401241124212431244124512461247124812491250125112521253125412551256125712581259126012611262126312641265126612671268126912701271127212731274127512761277127812791280128112821283128412851286128712881289129012911292129312941295129612971298129913001301130213031304130513061307130813091310131113121313131413151316131713181319132013211322132313241325132613271328132913301331133213331334133513361337133813391340134113421343134413451346134713481349135013511352135313541355135613571358135913601361136213631364136513661367136813691370137113721373137413751376137713781379138013811382138313841385138613871388138913901391139213931394139513961397139813991400140114021403140414051406140714081409141014111412141314141415141614171418141914201421142214231424142514261427142814291430143114321433143414351436143714381439144014411442144314441445144614471448144914501451145214531454145514561457145814591460146114621463146414651466146714681469147014711472147314741475147614771478147914801481148214831484148514861487148814891490149114921493149414951496149714981499150015011502150315041505150615071508150915101511151215131514151515161517151815191520152115221523152415251526152715281529153015311532153315341535153615371538153915401541154215431544154515461547154815491550155115521553155415551556155715581559156015611562156315641565156615671568156915701571157215731574157515761577157815791580158115821583158415851586158715881589159015911592159315941595159615971598159916001601160216031604160516061607160816091610161116121613161416151616161716181619162016211622162316241625162616271628162916301631163216331634163516361637163816391640164116421643164416451646164716481649165016511652
  1. // SPDX-License-Identifier: Apache-2.0
  2. // ----------------------------------------------------------------------------
  3. // Copyright 2011-2024 Arm Limited
  4. //
  5. // Licensed under the Apache License, Version 2.0 (the "License"); you may not
  6. // use this file except in compliance with the License. You may obtain a copy
  7. // of the License at:
  8. //
  9. // http://www.apache.org/licenses/LICENSE-2.0
  10. //
  11. // Unless required by applicable law or agreed to in writing, software
  12. // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
  13. // WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
  14. // License for the specific language governing permissions and limitations
  15. // under the License.
  16. // ----------------------------------------------------------------------------
  17. #if !defined(ASTCENC_DECOMPRESS_ONLY)
  18. /**
  19. * @brief Functions for computing color endpoints and texel weights.
  20. */
  21. #include <cassert>
  22. #include "astcenc_internal.h"
  23. #include "astcenc_vecmathlib.h"
  24. /**
  25. * @brief Compute the infilled weight for N texel indices in a decimated grid.
  26. *
  27. * @param di The weight grid decimation to use.
  28. * @param weights The decimated weight values to use.
  29. * @param index The first texel index to interpolate.
  30. *
  31. * @return The interpolated weight for the given set of SIMD_WIDTH texels.
  32. */
  33. static vfloat bilinear_infill_vla(
  34. const decimation_info& di,
  35. const float* weights,
  36. unsigned int index
  37. ) {
  38. // Load the bilinear filter texel weight indexes in the decimated grid
  39. const uint8_t* weight_idx0 = di.texel_weights_tr[0] + index;
  40. const uint8_t* weight_idx1 = di.texel_weights_tr[1] + index;
  41. const uint8_t* weight_idx2 = di.texel_weights_tr[2] + index;
  42. const uint8_t* weight_idx3 = di.texel_weights_tr[3] + index;
  43. // Load the bilinear filter weights from the decimated grid
  44. vfloat weight_val0 = gatherf_byte_inds<vfloat>(weights, weight_idx0);
  45. vfloat weight_val1 = gatherf_byte_inds<vfloat>(weights, weight_idx1);
  46. vfloat weight_val2 = gatherf_byte_inds<vfloat>(weights, weight_idx2);
  47. vfloat weight_val3 = gatherf_byte_inds<vfloat>(weights, weight_idx3);
  48. // Load the weight contribution factors for each decimated weight
  49. vfloat tex_weight_float0 = loada(di.texel_weight_contribs_float_tr[0] + index);
  50. vfloat tex_weight_float1 = loada(di.texel_weight_contribs_float_tr[1] + index);
  51. vfloat tex_weight_float2 = loada(di.texel_weight_contribs_float_tr[2] + index);
  52. vfloat tex_weight_float3 = loada(di.texel_weight_contribs_float_tr[3] + index);
  53. // Compute the bilinear interpolation to generate the per-texel weight
  54. return (weight_val0 * tex_weight_float0 + weight_val1 * tex_weight_float1) +
  55. (weight_val2 * tex_weight_float2 + weight_val3 * tex_weight_float3);
  56. }
  57. /**
  58. * @brief Compute the infilled weight for N texel indices in a decimated grid.
  59. *
  60. * This is specialized version which computes only two weights per texel for
  61. * encodings that are only decimated in a single axis.
  62. *
  63. * @param di The weight grid decimation to use.
  64. * @param weights The decimated weight values to use.
  65. * @param index The first texel index to interpolate.
  66. *
  67. * @return The interpolated weight for the given set of SIMD_WIDTH texels.
  68. */
  69. static vfloat bilinear_infill_vla_2(
  70. const decimation_info& di,
  71. const float* weights,
  72. unsigned int index
  73. ) {
  74. // Load the bilinear filter texel weight indexes in the decimated grid
  75. const uint8_t* weight_idx0 = di.texel_weights_tr[0] + index;
  76. const uint8_t* weight_idx1 = di.texel_weights_tr[1] + index;
  77. // Load the bilinear filter weights from the decimated grid
  78. vfloat weight_val0 = gatherf_byte_inds<vfloat>(weights, weight_idx0);
  79. vfloat weight_val1 = gatherf_byte_inds<vfloat>(weights, weight_idx1);
  80. // Load the weight contribution factors for each decimated weight
  81. vfloat tex_weight_float0 = loada(di.texel_weight_contribs_float_tr[0] + index);
  82. vfloat tex_weight_float1 = loada(di.texel_weight_contribs_float_tr[1] + index);
  83. // Compute the bilinear interpolation to generate the per-texel weight
  84. return (weight_val0 * tex_weight_float0 + weight_val1 * tex_weight_float1);
  85. }
  86. /**
  87. * @brief Compute the ideal endpoints and weights for 1 color component.
  88. *
  89. * @param blk The image block color data to compress.
  90. * @param pi The partition info for the current trial.
  91. * @param[out] ei The computed ideal endpoints and weights.
  92. * @param component The color component to compute.
  93. */
  94. static void compute_ideal_colors_and_weights_1_comp(
  95. const image_block& blk,
  96. const partition_info& pi,
  97. endpoints_and_weights& ei,
  98. unsigned int component
  99. ) {
  100. unsigned int partition_count = pi.partition_count;
  101. ei.ep.partition_count = partition_count;
  102. promise(partition_count > 0);
  103. unsigned int texel_count = blk.texel_count;
  104. promise(texel_count > 0);
  105. float error_weight;
  106. const float* data_vr = nullptr;
  107. assert(component < BLOCK_MAX_COMPONENTS);
  108. switch (component)
  109. {
  110. case 0:
  111. error_weight = blk.channel_weight.lane<0>();
  112. data_vr = blk.data_r;
  113. break;
  114. case 1:
  115. error_weight = blk.channel_weight.lane<1>();
  116. data_vr = blk.data_g;
  117. break;
  118. case 2:
  119. error_weight = blk.channel_weight.lane<2>();
  120. data_vr = blk.data_b;
  121. break;
  122. default:
  123. assert(component == 3);
  124. error_weight = blk.channel_weight.lane<3>();
  125. data_vr = blk.data_a;
  126. break;
  127. }
  128. vmask4 sep_mask = vint4::lane_id() == vint4(component);
  129. bool is_constant_wes { true };
  130. float partition0_len_sq { 0.0f };
  131. for (unsigned int i = 0; i < partition_count; i++)
  132. {
  133. float lowvalue { 1e10f };
  134. float highvalue { -1e10f };
  135. unsigned int partition_texel_count = pi.partition_texel_count[i];
  136. for (unsigned int j = 0; j < partition_texel_count; j++)
  137. {
  138. unsigned int tix = pi.texels_of_partition[i][j];
  139. float value = data_vr[tix];
  140. lowvalue = astc::min(value, lowvalue);
  141. highvalue = astc::max(value, highvalue);
  142. }
  143. if (highvalue <= lowvalue)
  144. {
  145. lowvalue = 0.0f;
  146. highvalue = 1e-7f;
  147. }
  148. float length = highvalue - lowvalue;
  149. float length_squared = length * length;
  150. float scale = 1.0f / length;
  151. if (i == 0)
  152. {
  153. partition0_len_sq = length_squared;
  154. }
  155. else
  156. {
  157. is_constant_wes = is_constant_wes && length_squared == partition0_len_sq;
  158. }
  159. for (unsigned int j = 0; j < partition_texel_count; j++)
  160. {
  161. unsigned int tix = pi.texels_of_partition[i][j];
  162. float value = (data_vr[tix] - lowvalue) * scale;
  163. value = astc::clamp1f(value);
  164. ei.weights[tix] = value;
  165. ei.weight_error_scale[tix] = length_squared * error_weight;
  166. assert(!astc::isnan(ei.weight_error_scale[tix]));
  167. }
  168. ei.ep.endpt0[i] = select(blk.data_min, vfloat4(lowvalue), sep_mask);
  169. ei.ep.endpt1[i] = select(blk.data_max, vfloat4(highvalue), sep_mask);
  170. }
  171. // Zero initialize any SIMD over-fetch
  172. size_t texel_count_simd = round_up_to_simd_multiple_vla(texel_count);
  173. for (size_t i = texel_count; i < texel_count_simd; i++)
  174. {
  175. ei.weights[i] = 0.0f;
  176. ei.weight_error_scale[i] = 0.0f;
  177. }
  178. ei.is_constant_weight_error_scale = is_constant_wes;
  179. }
  180. /**
  181. * @brief Compute the ideal endpoints and weights for 2 color components.
  182. *
  183. * @param blk The image block color data to compress.
  184. * @param pi The partition info for the current trial.
  185. * @param[out] ei The computed ideal endpoints and weights.
  186. * @param component1 The first color component to compute.
  187. * @param component2 The second color component to compute.
  188. */
  189. static void compute_ideal_colors_and_weights_2_comp(
  190. const image_block& blk,
  191. const partition_info& pi,
  192. endpoints_and_weights& ei,
  193. int component1,
  194. int component2
  195. ) {
  196. unsigned int partition_count = pi.partition_count;
  197. ei.ep.partition_count = partition_count;
  198. promise(partition_count > 0);
  199. unsigned int texel_count = blk.texel_count;
  200. promise(texel_count > 0);
  201. partition_metrics pms[BLOCK_MAX_PARTITIONS];
  202. float error_weight;
  203. const float* data_vr = nullptr;
  204. const float* data_vg = nullptr;
  205. if (component1 == 0 && component2 == 1)
  206. {
  207. error_weight = hadd_s(blk.channel_weight.swz<0, 1>()) / 2.0f;
  208. data_vr = blk.data_r;
  209. data_vg = blk.data_g;
  210. }
  211. else if (component1 == 0 && component2 == 2)
  212. {
  213. error_weight = hadd_s(blk.channel_weight.swz<0, 2>()) / 2.0f;
  214. data_vr = blk.data_r;
  215. data_vg = blk.data_b;
  216. }
  217. else // (component1 == 1 && component2 == 2)
  218. {
  219. assert(component1 == 1 && component2 == 2);
  220. error_weight = hadd_s(blk.channel_weight.swz<1, 2>()) / 2.0f;
  221. data_vr = blk.data_g;
  222. data_vg = blk.data_b;
  223. }
  224. compute_avgs_and_dirs_2_comp(pi, blk, component1, component2, pms);
  225. bool is_constant_wes { true };
  226. float partition0_len_sq { 0.0f };
  227. vmask4 comp1_mask = vint4::lane_id() == vint4(component1);
  228. vmask4 comp2_mask = vint4::lane_id() == vint4(component2);
  229. for (unsigned int i = 0; i < partition_count; i++)
  230. {
  231. vfloat4 dir = pms[i].dir;
  232. if (hadd_s(dir) < 0.0f)
  233. {
  234. dir = vfloat4::zero() - dir;
  235. }
  236. line2 line { pms[i].avg, normalize_safe(dir, unit2()) };
  237. float lowparam { 1e10f };
  238. float highparam { -1e10f };
  239. unsigned int partition_texel_count = pi.partition_texel_count[i];
  240. for (unsigned int j = 0; j < partition_texel_count; j++)
  241. {
  242. unsigned int tix = pi.texels_of_partition[i][j];
  243. vfloat4 point = vfloat2(data_vr[tix], data_vg[tix]);
  244. float param = dot_s(point - line.a, line.b);
  245. ei.weights[tix] = param;
  246. lowparam = astc::min(param, lowparam);
  247. highparam = astc::max(param, highparam);
  248. }
  249. // It is possible for a uniform-color partition to produce length=0;
  250. // this causes NaN issues so set to small value to avoid this problem
  251. if (highparam <= lowparam)
  252. {
  253. lowparam = 0.0f;
  254. highparam = 1e-7f;
  255. }
  256. float length = highparam - lowparam;
  257. float length_squared = length * length;
  258. float scale = 1.0f / length;
  259. if (i == 0)
  260. {
  261. partition0_len_sq = length_squared;
  262. }
  263. else
  264. {
  265. is_constant_wes = is_constant_wes && length_squared == partition0_len_sq;
  266. }
  267. for (unsigned int j = 0; j < partition_texel_count; j++)
  268. {
  269. unsigned int tix = pi.texels_of_partition[i][j];
  270. float idx = (ei.weights[tix] - lowparam) * scale;
  271. idx = astc::clamp1f(idx);
  272. ei.weights[tix] = idx;
  273. ei.weight_error_scale[tix] = length_squared * error_weight;
  274. assert(!astc::isnan(ei.weight_error_scale[tix]));
  275. }
  276. vfloat4 lowvalue = line.a + line.b * lowparam;
  277. vfloat4 highvalue = line.a + line.b * highparam;
  278. vfloat4 ep0 = select(blk.data_min, vfloat4(lowvalue.lane<0>()), comp1_mask);
  279. vfloat4 ep1 = select(blk.data_max, vfloat4(highvalue.lane<0>()), comp1_mask);
  280. ei.ep.endpt0[i] = select(ep0, vfloat4(lowvalue.lane<1>()), comp2_mask);
  281. ei.ep.endpt1[i] = select(ep1, vfloat4(highvalue.lane<1>()), comp2_mask);
  282. }
  283. // Zero initialize any SIMD over-fetch
  284. size_t texel_count_simd = round_up_to_simd_multiple_vla(texel_count);
  285. for (size_t i = texel_count; i < texel_count_simd; i++)
  286. {
  287. ei.weights[i] = 0.0f;
  288. ei.weight_error_scale[i] = 0.0f;
  289. }
  290. ei.is_constant_weight_error_scale = is_constant_wes;
  291. }
  292. /**
  293. * @brief Compute the ideal endpoints and weights for 3 color components.
  294. *
  295. * @param blk The image block color data to compress.
  296. * @param pi The partition info for the current trial.
  297. * @param[out] ei The computed ideal endpoints and weights.
  298. * @param omitted_component The color component excluded from the calculation.
  299. */
  300. static void compute_ideal_colors_and_weights_3_comp(
  301. const image_block& blk,
  302. const partition_info& pi,
  303. endpoints_and_weights& ei,
  304. unsigned int omitted_component
  305. ) {
  306. unsigned int partition_count = pi.partition_count;
  307. ei.ep.partition_count = partition_count;
  308. promise(partition_count > 0);
  309. unsigned int texel_count = blk.texel_count;
  310. promise(texel_count > 0);
  311. partition_metrics pms[BLOCK_MAX_PARTITIONS];
  312. float error_weight;
  313. const float* data_vr = nullptr;
  314. const float* data_vg = nullptr;
  315. const float* data_vb = nullptr;
  316. if (omitted_component == 0)
  317. {
  318. error_weight = hadd_s(blk.channel_weight.swz<0, 1, 2>());
  319. data_vr = blk.data_g;
  320. data_vg = blk.data_b;
  321. data_vb = blk.data_a;
  322. }
  323. else if (omitted_component == 1)
  324. {
  325. error_weight = hadd_s(blk.channel_weight.swz<0, 2, 3>());
  326. data_vr = blk.data_r;
  327. data_vg = blk.data_b;
  328. data_vb = blk.data_a;
  329. }
  330. else if (omitted_component == 2)
  331. {
  332. error_weight = hadd_s(blk.channel_weight.swz<0, 1, 3>());
  333. data_vr = blk.data_r;
  334. data_vg = blk.data_g;
  335. data_vb = blk.data_a;
  336. }
  337. else
  338. {
  339. assert(omitted_component == 3);
  340. error_weight = hadd_s(blk.channel_weight.swz<0, 1, 2>());
  341. data_vr = blk.data_r;
  342. data_vg = blk.data_g;
  343. data_vb = blk.data_b;
  344. }
  345. error_weight = error_weight * (1.0f / 3.0f);
  346. if (omitted_component == 3)
  347. {
  348. compute_avgs_and_dirs_3_comp_rgb(pi, blk, pms);
  349. }
  350. else
  351. {
  352. compute_avgs_and_dirs_3_comp(pi, blk, omitted_component, pms);
  353. }
  354. bool is_constant_wes { true };
  355. float partition0_len_sq { 0.0f };
  356. for (unsigned int i = 0; i < partition_count; i++)
  357. {
  358. vfloat4 dir = pms[i].dir;
  359. if (hadd_rgb_s(dir) < 0.0f)
  360. {
  361. dir = vfloat4::zero() - dir;
  362. }
  363. line3 line { pms[i].avg, normalize_safe(dir, unit3()) };
  364. float lowparam { 1e10f };
  365. float highparam { -1e10f };
  366. unsigned int partition_texel_count = pi.partition_texel_count[i];
  367. for (unsigned int j = 0; j < partition_texel_count; j++)
  368. {
  369. unsigned int tix = pi.texels_of_partition[i][j];
  370. vfloat4 point = vfloat3(data_vr[tix], data_vg[tix], data_vb[tix]);
  371. float param = dot3_s(point - line.a, line.b);
  372. ei.weights[tix] = param;
  373. lowparam = astc::min(param, lowparam);
  374. highparam = astc::max(param, highparam);
  375. }
  376. // It is possible for a uniform-color partition to produce length=0;
  377. // this causes NaN issues so set to small value to avoid this problem
  378. if (highparam <= lowparam)
  379. {
  380. lowparam = 0.0f;
  381. highparam = 1e-7f;
  382. }
  383. float length = highparam - lowparam;
  384. float length_squared = length * length;
  385. float scale = 1.0f / length;
  386. if (i == 0)
  387. {
  388. partition0_len_sq = length_squared;
  389. }
  390. else
  391. {
  392. is_constant_wes = is_constant_wes && length_squared == partition0_len_sq;
  393. }
  394. for (unsigned int j = 0; j < partition_texel_count; j++)
  395. {
  396. unsigned int tix = pi.texels_of_partition[i][j];
  397. float idx = (ei.weights[tix] - lowparam) * scale;
  398. idx = astc::clamp1f(idx);
  399. ei.weights[tix] = idx;
  400. ei.weight_error_scale[tix] = length_squared * error_weight;
  401. assert(!astc::isnan(ei.weight_error_scale[tix]));
  402. }
  403. vfloat4 ep0 = line.a + line.b * lowparam;
  404. vfloat4 ep1 = line.a + line.b * highparam;
  405. vfloat4 bmin = blk.data_min;
  406. vfloat4 bmax = blk.data_max;
  407. assert(omitted_component < BLOCK_MAX_COMPONENTS);
  408. switch (omitted_component)
  409. {
  410. case 0:
  411. ei.ep.endpt0[i] = vfloat4(bmin.lane<0>(), ep0.lane<0>(), ep0.lane<1>(), ep0.lane<2>());
  412. ei.ep.endpt1[i] = vfloat4(bmax.lane<0>(), ep1.lane<0>(), ep1.lane<1>(), ep1.lane<2>());
  413. break;
  414. case 1:
  415. ei.ep.endpt0[i] = vfloat4(ep0.lane<0>(), bmin.lane<1>(), ep0.lane<1>(), ep0.lane<2>());
  416. ei.ep.endpt1[i] = vfloat4(ep1.lane<0>(), bmax.lane<1>(), ep1.lane<1>(), ep1.lane<2>());
  417. break;
  418. case 2:
  419. ei.ep.endpt0[i] = vfloat4(ep0.lane<0>(), ep0.lane<1>(), bmin.lane<2>(), ep0.lane<2>());
  420. ei.ep.endpt1[i] = vfloat4(ep1.lane<0>(), ep1.lane<1>(), bmax.lane<2>(), ep1.lane<2>());
  421. break;
  422. default:
  423. ei.ep.endpt0[i] = vfloat4(ep0.lane<0>(), ep0.lane<1>(), ep0.lane<2>(), bmin.lane<3>());
  424. ei.ep.endpt1[i] = vfloat4(ep1.lane<0>(), ep1.lane<1>(), ep1.lane<2>(), bmax.lane<3>());
  425. break;
  426. }
  427. }
  428. // Zero initialize any SIMD over-fetch
  429. size_t texel_count_simd = round_up_to_simd_multiple_vla(texel_count);
  430. for (size_t i = texel_count; i < texel_count_simd; i++)
  431. {
  432. ei.weights[i] = 0.0f;
  433. ei.weight_error_scale[i] = 0.0f;
  434. }
  435. ei.is_constant_weight_error_scale = is_constant_wes;
  436. }
  437. /**
  438. * @brief Compute the ideal endpoints and weights for 4 color components.
  439. *
  440. * @param blk The image block color data to compress.
  441. * @param pi The partition info for the current trial.
  442. * @param[out] ei The computed ideal endpoints and weights.
  443. */
  444. static void compute_ideal_colors_and_weights_4_comp(
  445. const image_block& blk,
  446. const partition_info& pi,
  447. endpoints_and_weights& ei
  448. ) {
  449. const float error_weight = hadd_s(blk.channel_weight) / 4.0f;
  450. unsigned int partition_count = pi.partition_count;
  451. unsigned int texel_count = blk.texel_count;
  452. promise(texel_count > 0);
  453. promise(partition_count > 0);
  454. partition_metrics pms[BLOCK_MAX_PARTITIONS];
  455. compute_avgs_and_dirs_4_comp(pi, blk, pms);
  456. bool is_constant_wes { true };
  457. float partition0_len_sq { 0.0f };
  458. for (unsigned int i = 0; i < partition_count; i++)
  459. {
  460. vfloat4 dir = pms[i].dir;
  461. if (hadd_rgb_s(dir) < 0.0f)
  462. {
  463. dir = vfloat4::zero() - dir;
  464. }
  465. line4 line { pms[i].avg, normalize_safe(dir, unit4()) };
  466. float lowparam { 1e10f };
  467. float highparam { -1e10f };
  468. unsigned int partition_texel_count = pi.partition_texel_count[i];
  469. for (unsigned int j = 0; j < partition_texel_count; j++)
  470. {
  471. unsigned int tix = pi.texels_of_partition[i][j];
  472. vfloat4 point = blk.texel(tix);
  473. float param = dot_s(point - line.a, line.b);
  474. ei.weights[tix] = param;
  475. lowparam = astc::min(param, lowparam);
  476. highparam = astc::max(param, highparam);
  477. }
  478. // It is possible for a uniform-color partition to produce length=0;
  479. // this causes NaN issues so set to small value to avoid this problem
  480. if (highparam <= lowparam)
  481. {
  482. lowparam = 0.0f;
  483. highparam = 1e-7f;
  484. }
  485. float length = highparam - lowparam;
  486. float length_squared = length * length;
  487. float scale = 1.0f / length;
  488. if (i == 0)
  489. {
  490. partition0_len_sq = length_squared;
  491. }
  492. else
  493. {
  494. is_constant_wes = is_constant_wes && length_squared == partition0_len_sq;
  495. }
  496. ei.ep.endpt0[i] = line.a + line.b * lowparam;
  497. ei.ep.endpt1[i] = line.a + line.b * highparam;
  498. for (unsigned int j = 0; j < partition_texel_count; j++)
  499. {
  500. unsigned int tix = pi.texels_of_partition[i][j];
  501. float idx = (ei.weights[tix] - lowparam) * scale;
  502. idx = astc::clamp1f(idx);
  503. ei.weights[tix] = idx;
  504. ei.weight_error_scale[tix] = length_squared * error_weight;
  505. assert(!astc::isnan(ei.weight_error_scale[tix]));
  506. }
  507. }
  508. // Zero initialize any SIMD over-fetch
  509. size_t texel_count_simd = round_up_to_simd_multiple_vla(texel_count);
  510. for (size_t i = texel_count; i < texel_count_simd; i++)
  511. {
  512. ei.weights[i] = 0.0f;
  513. ei.weight_error_scale[i] = 0.0f;
  514. }
  515. ei.is_constant_weight_error_scale = is_constant_wes;
  516. }
  517. /* See header for documentation. */
  518. void compute_ideal_colors_and_weights_1plane(
  519. const image_block& blk,
  520. const partition_info& pi,
  521. endpoints_and_weights& ei
  522. ) {
  523. bool uses_alpha = !blk.is_constant_channel(3);
  524. if (uses_alpha)
  525. {
  526. compute_ideal_colors_and_weights_4_comp(blk, pi, ei);
  527. }
  528. else
  529. {
  530. compute_ideal_colors_and_weights_3_comp(blk, pi, ei, 3);
  531. }
  532. }
  533. /* See header for documentation. */
  534. void compute_ideal_colors_and_weights_2planes(
  535. const block_size_descriptor& bsd,
  536. const image_block& blk,
  537. unsigned int plane2_component,
  538. endpoints_and_weights& ei1,
  539. endpoints_and_weights& ei2
  540. ) {
  541. const auto& pi = bsd.get_partition_info(1, 0);
  542. bool uses_alpha = !blk.is_constant_channel(3);
  543. assert(plane2_component < BLOCK_MAX_COMPONENTS);
  544. switch (plane2_component)
  545. {
  546. case 0: // Separate weights for red
  547. if (uses_alpha)
  548. {
  549. compute_ideal_colors_and_weights_3_comp(blk, pi, ei1, 0);
  550. }
  551. else
  552. {
  553. compute_ideal_colors_and_weights_2_comp(blk, pi, ei1, 1, 2);
  554. }
  555. compute_ideal_colors_and_weights_1_comp(blk, pi, ei2, 0);
  556. break;
  557. case 1: // Separate weights for green
  558. if (uses_alpha)
  559. {
  560. compute_ideal_colors_and_weights_3_comp(blk, pi, ei1, 1);
  561. }
  562. else
  563. {
  564. compute_ideal_colors_and_weights_2_comp(blk, pi, ei1, 0, 2);
  565. }
  566. compute_ideal_colors_and_weights_1_comp(blk, pi, ei2, 1);
  567. break;
  568. case 2: // Separate weights for blue
  569. if (uses_alpha)
  570. {
  571. compute_ideal_colors_and_weights_3_comp(blk, pi, ei1, 2);
  572. }
  573. else
  574. {
  575. compute_ideal_colors_and_weights_2_comp(blk, pi, ei1, 0, 1);
  576. }
  577. compute_ideal_colors_and_weights_1_comp(blk, pi, ei2, 2);
  578. break;
  579. default: // Separate weights for alpha
  580. assert(uses_alpha);
  581. compute_ideal_colors_and_weights_3_comp(blk, pi, ei1, 3);
  582. compute_ideal_colors_and_weights_1_comp(blk, pi, ei2, 3);
  583. break;
  584. }
  585. }
  586. /* See header for documentation. */
  587. float compute_error_of_weight_set_1plane(
  588. const endpoints_and_weights& eai,
  589. const decimation_info& di,
  590. const float* dec_weight_quant_uvalue
  591. ) {
  592. vfloatacc error_summav = vfloatacc::zero();
  593. unsigned int texel_count = di.texel_count;
  594. promise(texel_count > 0);
  595. // Process SIMD-width chunks, safe to over-fetch - the extra space is zero initialized
  596. if (di.max_texel_weight_count > 2)
  597. {
  598. for (unsigned int i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH)
  599. {
  600. // Compute the bilinear interpolation of the decimated weight grid
  601. vfloat current_values = bilinear_infill_vla(di, dec_weight_quant_uvalue, i);
  602. // Compute the error between the computed value and the ideal weight
  603. vfloat actual_values = loada(eai.weights + i);
  604. vfloat diff = current_values - actual_values;
  605. vfloat significance = loada(eai.weight_error_scale + i);
  606. vfloat error = diff * diff * significance;
  607. haccumulate(error_summav, error);
  608. }
  609. }
  610. else if (di.max_texel_weight_count > 1)
  611. {
  612. for (unsigned int i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH)
  613. {
  614. // Compute the bilinear interpolation of the decimated weight grid
  615. vfloat current_values = bilinear_infill_vla_2(di, dec_weight_quant_uvalue, i);
  616. // Compute the error between the computed value and the ideal weight
  617. vfloat actual_values = loada(eai.weights + i);
  618. vfloat diff = current_values - actual_values;
  619. vfloat significance = loada(eai.weight_error_scale + i);
  620. vfloat error = diff * diff * significance;
  621. haccumulate(error_summav, error);
  622. }
  623. }
  624. else
  625. {
  626. for (unsigned int i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH)
  627. {
  628. // Load the weight set directly, without interpolation
  629. vfloat current_values = loada(dec_weight_quant_uvalue + i);
  630. // Compute the error between the computed value and the ideal weight
  631. vfloat actual_values = loada(eai.weights + i);
  632. vfloat diff = current_values - actual_values;
  633. vfloat significance = loada(eai.weight_error_scale + i);
  634. vfloat error = diff * diff * significance;
  635. haccumulate(error_summav, error);
  636. }
  637. }
  638. // Resolve the final scalar accumulator sum
  639. return hadd_s(error_summav);
  640. }
  641. /* See header for documentation. */
  642. float compute_error_of_weight_set_2planes(
  643. const endpoints_and_weights& eai1,
  644. const endpoints_and_weights& eai2,
  645. const decimation_info& di,
  646. const float* dec_weight_quant_uvalue_plane1,
  647. const float* dec_weight_quant_uvalue_plane2
  648. ) {
  649. vfloatacc error_summav = vfloatacc::zero();
  650. unsigned int texel_count = di.texel_count;
  651. promise(texel_count > 0);
  652. // Process SIMD-width chunks, safe to over-fetch - the extra space is zero initialized
  653. if (di.max_texel_weight_count > 2)
  654. {
  655. for (unsigned int i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH)
  656. {
  657. // Plane 1
  658. // Compute the bilinear interpolation of the decimated weight grid
  659. vfloat current_values1 = bilinear_infill_vla(di, dec_weight_quant_uvalue_plane1, i);
  660. // Compute the error between the computed value and the ideal weight
  661. vfloat actual_values1 = loada(eai1.weights + i);
  662. vfloat diff = current_values1 - actual_values1;
  663. vfloat error1 = diff * diff * loada(eai1.weight_error_scale + i);
  664. // Plane 2
  665. // Compute the bilinear interpolation of the decimated weight grid
  666. vfloat current_values2 = bilinear_infill_vla(di, dec_weight_quant_uvalue_plane2, i);
  667. // Compute the error between the computed value and the ideal weight
  668. vfloat actual_values2 = loada(eai2.weights + i);
  669. diff = current_values2 - actual_values2;
  670. vfloat error2 = diff * diff * loada(eai2.weight_error_scale + i);
  671. haccumulate(error_summav, error1 + error2);
  672. }
  673. }
  674. else if (di.max_texel_weight_count > 1)
  675. {
  676. for (unsigned int i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH)
  677. {
  678. // Plane 1
  679. // Compute the bilinear interpolation of the decimated weight grid
  680. vfloat current_values1 = bilinear_infill_vla_2(di, dec_weight_quant_uvalue_plane1, i);
  681. // Compute the error between the computed value and the ideal weight
  682. vfloat actual_values1 = loada(eai1.weights + i);
  683. vfloat diff = current_values1 - actual_values1;
  684. vfloat error1 = diff * diff * loada(eai1.weight_error_scale + i);
  685. // Plane 2
  686. // Compute the bilinear interpolation of the decimated weight grid
  687. vfloat current_values2 = bilinear_infill_vla_2(di, dec_weight_quant_uvalue_plane2, i);
  688. // Compute the error between the computed value and the ideal weight
  689. vfloat actual_values2 = loada(eai2.weights + i);
  690. diff = current_values2 - actual_values2;
  691. vfloat error2 = diff * diff * loada(eai2.weight_error_scale + i);
  692. haccumulate(error_summav, error1 + error2);
  693. }
  694. }
  695. else
  696. {
  697. for (unsigned int i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH)
  698. {
  699. // Plane 1
  700. // Load the weight set directly, without interpolation
  701. vfloat current_values1 = loada(dec_weight_quant_uvalue_plane1 + i);
  702. // Compute the error between the computed value and the ideal weight
  703. vfloat actual_values1 = loada(eai1.weights + i);
  704. vfloat diff = current_values1 - actual_values1;
  705. vfloat error1 = diff * diff * loada(eai1.weight_error_scale + i);
  706. // Plane 2
  707. // Load the weight set directly, without interpolation
  708. vfloat current_values2 = loada(dec_weight_quant_uvalue_plane2 + i);
  709. // Compute the error between the computed value and the ideal weight
  710. vfloat actual_values2 = loada(eai2.weights + i);
  711. diff = current_values2 - actual_values2;
  712. vfloat error2 = diff * diff * loada(eai2.weight_error_scale + i);
  713. haccumulate(error_summav, error1 + error2);
  714. }
  715. }
  716. // Resolve the final scalar accumulator sum
  717. return hadd_s(error_summav);
  718. }
  719. /* See header for documentation. */
  720. void compute_ideal_weights_for_decimation(
  721. const endpoints_and_weights& ei,
  722. const decimation_info& di,
  723. float* dec_weight_ideal_value
  724. ) {
  725. unsigned int texel_count = di.texel_count;
  726. unsigned int weight_count = di.weight_count;
  727. bool is_direct = texel_count == weight_count;
  728. promise(texel_count > 0);
  729. promise(weight_count > 0);
  730. // If we have a 1:1 mapping just shortcut the computation. Transfer enough to also copy the
  731. // zero-initialized SIMD over-fetch region
  732. if (is_direct)
  733. {
  734. for (unsigned int i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH)
  735. {
  736. vfloat weight(ei.weights + i);
  737. storea(weight, dec_weight_ideal_value + i);
  738. }
  739. return;
  740. }
  741. // Otherwise compute an estimate and perform single refinement iteration
  742. // Compute an initial average for each decimated weight
  743. bool constant_wes = ei.is_constant_weight_error_scale;
  744. vfloat weight_error_scale(ei.weight_error_scale[0]);
  745. // This overshoots - this is OK as we initialize the array tails in the
  746. // decimation table structures to safe values ...
  747. for (unsigned int i = 0; i < weight_count; i += ASTCENC_SIMD_WIDTH)
  748. {
  749. // Start with a small value to avoid div-by-zero later
  750. vfloat weight_weight(1e-10f);
  751. vfloat initial_weight = vfloat::zero();
  752. // Accumulate error weighting of all the texels using this weight
  753. vint weight_texel_count(di.weight_texel_count + i);
  754. unsigned int max_texel_count = hmax_s(weight_texel_count);
  755. promise(max_texel_count > 0);
  756. for (unsigned int j = 0; j < max_texel_count; j++)
  757. {
  758. const uint8_t* texel = di.weight_texels_tr[j] + i;
  759. vfloat weight = loada(di.weights_texel_contribs_tr[j] + i);
  760. if (!constant_wes)
  761. {
  762. weight_error_scale = gatherf_byte_inds<vfloat>(ei.weight_error_scale, texel);
  763. }
  764. vfloat contrib_weight = weight * weight_error_scale;
  765. weight_weight += contrib_weight;
  766. initial_weight += gatherf_byte_inds<vfloat>(ei.weights, texel) * contrib_weight;
  767. }
  768. storea(initial_weight / weight_weight, dec_weight_ideal_value + i);
  769. }
  770. // Populate the interpolated weight grid based on the initial average
  771. // Process SIMD-width texel coordinates at at time while we can. Safe to
  772. // over-process full SIMD vectors - the tail is zeroed.
  773. ASTCENC_ALIGNAS float infilled_weights[BLOCK_MAX_TEXELS];
  774. if (di.max_texel_weight_count <= 2)
  775. {
  776. for (unsigned int i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH)
  777. {
  778. vfloat weight = bilinear_infill_vla_2(di, dec_weight_ideal_value, i);
  779. storea(weight, infilled_weights + i);
  780. }
  781. }
  782. else
  783. {
  784. for (unsigned int i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH)
  785. {
  786. vfloat weight = bilinear_infill_vla(di, dec_weight_ideal_value, i);
  787. storea(weight, infilled_weights + i);
  788. }
  789. }
  790. // Perform a single iteration of refinement
  791. // Empirically determined step size; larger values don't help but smaller drops image quality
  792. constexpr float stepsize = 0.25f;
  793. constexpr float chd_scale = -WEIGHTS_TEXEL_SUM;
  794. for (unsigned int i = 0; i < weight_count; i += ASTCENC_SIMD_WIDTH)
  795. {
  796. vfloat weight_val = loada(dec_weight_ideal_value + i);
  797. // Accumulate error weighting of all the texels using this weight
  798. // Start with a small value to avoid div-by-zero later
  799. vfloat error_change0(1e-10f);
  800. vfloat error_change1(0.0f);
  801. // Accumulate error weighting of all the texels using this weight
  802. vint weight_texel_count(di.weight_texel_count + i);
  803. unsigned int max_texel_count = hmax_s(weight_texel_count);
  804. promise(max_texel_count > 0);
  805. for (unsigned int j = 0; j < max_texel_count; j++)
  806. {
  807. const uint8_t* texel = di.weight_texels_tr[j] + i;
  808. vfloat contrib_weight = loada(di.weights_texel_contribs_tr[j] + i);
  809. if (!constant_wes)
  810. {
  811. weight_error_scale = gatherf_byte_inds<vfloat>(ei.weight_error_scale, texel);
  812. }
  813. vfloat scale = weight_error_scale * contrib_weight;
  814. vfloat old_weight = gatherf_byte_inds<vfloat>(infilled_weights, texel);
  815. vfloat ideal_weight = gatherf_byte_inds<vfloat>(ei.weights, texel);
  816. error_change0 += contrib_weight * scale;
  817. error_change1 += (old_weight - ideal_weight) * scale;
  818. }
  819. vfloat step = (error_change1 * chd_scale) / error_change0;
  820. step = clamp(-stepsize, stepsize, step);
  821. // Update the weight; note this can store negative values
  822. storea(weight_val + step, dec_weight_ideal_value + i);
  823. }
  824. }
  825. /* See header for documentation. */
  826. void compute_quantized_weights_for_decimation(
  827. const decimation_info& di,
  828. float low_bound,
  829. float high_bound,
  830. const float* dec_weight_ideal_value,
  831. float* weight_set_out,
  832. uint8_t* quantized_weight_set,
  833. quant_method quant_level
  834. ) {
  835. int weight_count = di.weight_count;
  836. promise(weight_count > 0);
  837. const quant_and_transfer_table& qat = quant_and_xfer_tables[quant_level];
  838. // The available quant levels, stored with a minus 1 bias
  839. static const float quant_levels_m1[12] {
  840. 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 7.0f, 9.0f, 11.0f, 15.0f, 19.0f, 23.0f, 31.0f
  841. };
  842. vint steps_m1(get_quant_level(quant_level) - 1);
  843. float quant_level_m1 = quant_levels_m1[quant_level];
  844. // Quantize the weight set using both the specified low/high bounds and standard 0..1 bounds
  845. // TODO: Oddity to investigate; triggered by test in issue #265.
  846. if (high_bound <= low_bound)
  847. {
  848. low_bound = 0.0f;
  849. high_bound = 1.0f;
  850. }
  851. float rscale = high_bound - low_bound;
  852. float scale = 1.0f / rscale;
  853. float scaled_low_bound = low_bound * scale;
  854. rscale *= 1.0f / 64.0f;
  855. vfloat scalev(scale);
  856. vfloat scaled_low_boundv(scaled_low_bound);
  857. vfloat quant_level_m1v(quant_level_m1);
  858. vfloat rscalev(rscale);
  859. vfloat low_boundv(low_bound);
  860. // This runs to the rounded-up SIMD size, which is safe as the loop tail is filled with known
  861. // safe data in compute_ideal_weights_for_decimation and arrays are always 64 elements
  862. if (get_quant_level(quant_level) <= 16)
  863. {
  864. vtable_16x8 table;
  865. vtable_prepare(table, qat.quant_to_unquant);
  866. for (int i = 0; i < weight_count; i += ASTCENC_SIMD_WIDTH)
  867. {
  868. vfloat ix = loada(dec_weight_ideal_value + i) * scalev - scaled_low_boundv;
  869. ix = clampzo(ix);
  870. // Look up the two closest indexes and return the one that was closest
  871. vfloat ix1 = ix * quant_level_m1v;
  872. vint weightl = float_to_int(ix1);
  873. vint weighth = min(weightl + vint(1), steps_m1);
  874. vint ixli = vtable_lookup_32bit(table, weightl);
  875. vint ixhi = vtable_lookup_32bit(table, weighth);
  876. vfloat ixl = int_to_float(ixli);
  877. vfloat ixh = int_to_float(ixhi);
  878. vmask mask = (ixl + ixh) < (vfloat(128.0f) * ix);
  879. vint weight = select(ixli, ixhi, mask);
  880. ixl = select(ixl, ixh, mask);
  881. // Invert the weight-scaling that was done initially
  882. storea(ixl * rscalev + low_boundv, weight_set_out + i);
  883. pack_and_store_low_bytes(weight, quantized_weight_set + i);
  884. }
  885. }
  886. else
  887. {
  888. vtable_32x8 table;
  889. vtable_prepare(table, qat.quant_to_unquant);
  890. for (int i = 0; i < weight_count; i += ASTCENC_SIMD_WIDTH)
  891. {
  892. vfloat ix = loada(dec_weight_ideal_value + i) * scalev - scaled_low_boundv;
  893. ix = clampzo(ix);
  894. // Look up the two closest indexes and return the one that was closest
  895. vfloat ix1 = ix * quant_level_m1v;
  896. vint weightl = float_to_int(ix1);
  897. vint weighth = min(weightl + vint(1), steps_m1);
  898. vint ixli = vtable_lookup_32bit(table, weightl);
  899. vint ixhi = vtable_lookup_32bit(table, weighth);
  900. vfloat ixl = int_to_float(ixli);
  901. vfloat ixh = int_to_float(ixhi);
  902. vmask mask = (ixl + ixh) < (vfloat(128.0f) * ix);
  903. vint weight = select(ixli, ixhi, mask);
  904. ixl = select(ixl, ixh, mask);
  905. // Invert the weight-scaling that was done initially
  906. storea(ixl * rscalev + low_boundv, weight_set_out + i);
  907. pack_and_store_low_bytes(weight, quantized_weight_set + i);
  908. }
  909. }
  910. }
  911. /**
  912. * @brief Compute the RGB + offset for a HDR endpoint mode #7.
  913. *
  914. * Since the matrix needed has a regular structure we can simplify the inverse calculation. This
  915. * gives us ~24 multiplications vs. 96 for a generic inverse.
  916. *
  917. * mat[0] = vfloat4(rgba_ws.x, 0.0f, 0.0f, wght_ws.x);
  918. * mat[1] = vfloat4( 0.0f, rgba_ws.y, 0.0f, wght_ws.y);
  919. * mat[2] = vfloat4( 0.0f, 0.0f, rgba_ws.z, wght_ws.z);
  920. * mat[3] = vfloat4(wght_ws.x, wght_ws.y, wght_ws.z, psum);
  921. * mat = invert(mat);
  922. *
  923. * @param rgba_weight_sum Sum of partition component error weights.
  924. * @param weight_weight_sum Sum of partition component error weights * texel weight.
  925. * @param rgbq_sum Sum of partition component error weights * texel weight * color data.
  926. * @param psum Sum of RGB color weights * texel weight^2.
  927. */
  928. static inline vfloat4 compute_rgbo_vector(
  929. vfloat4 rgba_weight_sum,
  930. vfloat4 weight_weight_sum,
  931. vfloat4 rgbq_sum,
  932. float psum
  933. ) {
  934. float X = rgba_weight_sum.lane<0>();
  935. float Y = rgba_weight_sum.lane<1>();
  936. float Z = rgba_weight_sum.lane<2>();
  937. float P = weight_weight_sum.lane<0>();
  938. float Q = weight_weight_sum.lane<1>();
  939. float R = weight_weight_sum.lane<2>();
  940. float S = psum;
  941. float PP = P * P;
  942. float QQ = Q * Q;
  943. float RR = R * R;
  944. float SZmRR = S * Z - RR;
  945. float DT = SZmRR * Y - Z * QQ;
  946. float YP = Y * P;
  947. float QX = Q * X;
  948. float YX = Y * X;
  949. float mZYP = -Z * YP;
  950. float mZQX = -Z * QX;
  951. float mRYX = -R * YX;
  952. float ZQP = Z * Q * P;
  953. float RYP = R * YP;
  954. float RQX = R * QX;
  955. // Compute the reciprocal of matrix determinant
  956. float rdet = 1.0f / (DT * X + mZYP * P);
  957. // Actually compute the adjugate, and then apply 1/det separately
  958. vfloat4 mat0(DT, ZQP, RYP, mZYP);
  959. vfloat4 mat1(ZQP, SZmRR * X - Z * PP, RQX, mZQX);
  960. vfloat4 mat2(RYP, RQX, (S * Y - QQ) * X - Y * PP, mRYX);
  961. vfloat4 mat3(mZYP, mZQX, mRYX, Z * YX);
  962. vfloat4 vect = rgbq_sum * rdet;
  963. return vfloat4(dot_s(mat0, vect),
  964. dot_s(mat1, vect),
  965. dot_s(mat2, vect),
  966. dot_s(mat3, vect));
  967. }
  968. /* See header for documentation. */
  969. void recompute_ideal_colors_1plane(
  970. const image_block& blk,
  971. const partition_info& pi,
  972. const decimation_info& di,
  973. const uint8_t* dec_weights_uquant,
  974. endpoints& ep,
  975. vfloat4 rgbs_vectors[BLOCK_MAX_PARTITIONS],
  976. vfloat4 rgbo_vectors[BLOCK_MAX_PARTITIONS]
  977. ) {
  978. unsigned int weight_count = di.weight_count;
  979. unsigned int total_texel_count = blk.texel_count;
  980. unsigned int partition_count = pi.partition_count;
  981. promise(weight_count > 0);
  982. promise(total_texel_count > 0);
  983. promise(partition_count > 0);
  984. ASTCENC_ALIGNAS float dec_weight[BLOCK_MAX_WEIGHTS];
  985. for (unsigned int i = 0; i < weight_count; i += ASTCENC_SIMD_WIDTH)
  986. {
  987. vint unquant_value(dec_weights_uquant + i);
  988. vfloat unquant_valuef = int_to_float(unquant_value) * vfloat(1.0f / 64.0f);
  989. storea(unquant_valuef, dec_weight + i);
  990. }
  991. ASTCENC_ALIGNAS float undec_weight[BLOCK_MAX_TEXELS];
  992. float* undec_weight_ref;
  993. if (di.max_texel_weight_count == 1)
  994. {
  995. undec_weight_ref = dec_weight;
  996. }
  997. else if (di.max_texel_weight_count <= 2)
  998. {
  999. for (unsigned int i = 0; i < total_texel_count; i += ASTCENC_SIMD_WIDTH)
  1000. {
  1001. vfloat weight = bilinear_infill_vla_2(di, dec_weight, i);
  1002. storea(weight, undec_weight + i);
  1003. }
  1004. undec_weight_ref = undec_weight;
  1005. }
  1006. else
  1007. {
  1008. for (unsigned int i = 0; i < total_texel_count; i += ASTCENC_SIMD_WIDTH)
  1009. {
  1010. vfloat weight = bilinear_infill_vla(di, dec_weight, i);
  1011. storea(weight, undec_weight + i);
  1012. }
  1013. undec_weight_ref = undec_weight;
  1014. }
  1015. vfloat4 rgba_sum(blk.data_mean * static_cast<float>(blk.texel_count));
  1016. for (unsigned int i = 0; i < partition_count; i++)
  1017. {
  1018. unsigned int texel_count = pi.partition_texel_count[i];
  1019. const uint8_t *texel_indexes = pi.texels_of_partition[i];
  1020. // Only compute a partition mean if more than one partition
  1021. if (partition_count > 1)
  1022. {
  1023. rgba_sum = vfloat4::zero();
  1024. promise(texel_count > 0);
  1025. for (unsigned int j = 0; j < texel_count; j++)
  1026. {
  1027. unsigned int tix = texel_indexes[j];
  1028. rgba_sum += blk.texel(tix);
  1029. }
  1030. }
  1031. rgba_sum = rgba_sum * blk.channel_weight;
  1032. vfloat4 rgba_weight_sum = max(blk.channel_weight * static_cast<float>(texel_count), 1e-17f);
  1033. vfloat4 scale_dir = normalize((rgba_sum / rgba_weight_sum).swz<0, 1, 2>());
  1034. float scale_max = 0.0f;
  1035. float scale_min = 1e10f;
  1036. float wmin1 = 1.0f;
  1037. float wmax1 = 0.0f;
  1038. float left_sum_s = 0.0f;
  1039. float middle_sum_s = 0.0f;
  1040. float right_sum_s = 0.0f;
  1041. vfloat4 color_vec_x = vfloat4::zero();
  1042. vfloat4 color_vec_y = vfloat4::zero();
  1043. vfloat4 scale_vec = vfloat4::zero();
  1044. float weight_weight_sum_s = 1e-17f;
  1045. vfloat4 color_weight = blk.channel_weight;
  1046. float ls_weight = hadd_rgb_s(color_weight);
  1047. for (unsigned int j = 0; j < texel_count; j++)
  1048. {
  1049. unsigned int tix = texel_indexes[j];
  1050. vfloat4 rgba = blk.texel(tix);
  1051. float idx0 = undec_weight_ref[tix];
  1052. float om_idx0 = 1.0f - idx0;
  1053. wmin1 = astc::min(idx0, wmin1);
  1054. wmax1 = astc::max(idx0, wmax1);
  1055. float scale = dot3_s(scale_dir, rgba);
  1056. scale_min = astc::min(scale, scale_min);
  1057. scale_max = astc::max(scale, scale_max);
  1058. left_sum_s += om_idx0 * om_idx0;
  1059. middle_sum_s += om_idx0 * idx0;
  1060. right_sum_s += idx0 * idx0;
  1061. weight_weight_sum_s += idx0;
  1062. vfloat4 color_idx(idx0);
  1063. vfloat4 cwprod = rgba;
  1064. vfloat4 cwiprod = cwprod * color_idx;
  1065. color_vec_y += cwiprod;
  1066. color_vec_x += cwprod - cwiprod;
  1067. scale_vec += vfloat2(om_idx0, idx0) * (scale * ls_weight);
  1068. }
  1069. vfloat4 left_sum = vfloat4(left_sum_s) * color_weight;
  1070. vfloat4 middle_sum = vfloat4(middle_sum_s) * color_weight;
  1071. vfloat4 right_sum = vfloat4(right_sum_s) * color_weight;
  1072. vfloat4 lmrs_sum = vfloat3(left_sum_s, middle_sum_s, right_sum_s) * ls_weight;
  1073. color_vec_x = color_vec_x * color_weight;
  1074. color_vec_y = color_vec_y * color_weight;
  1075. // Initialize the luminance and scale vectors with a reasonable default
  1076. float scalediv = scale_min / astc::max(scale_max, 1e-10f);
  1077. scalediv = astc::clamp1f(scalediv);
  1078. vfloat4 sds = scale_dir * scale_max;
  1079. rgbs_vectors[i] = vfloat4(sds.lane<0>(), sds.lane<1>(), sds.lane<2>(), scalediv);
  1080. if (wmin1 >= wmax1 * 0.999f)
  1081. {
  1082. // If all weights in the partition were equal, then just take average of all colors in
  1083. // the partition and use that as both endpoint colors
  1084. vfloat4 avg = (color_vec_x + color_vec_y) / rgba_weight_sum;
  1085. vmask4 notnan_mask = avg == avg;
  1086. ep.endpt0[i] = select(ep.endpt0[i], avg, notnan_mask);
  1087. ep.endpt1[i] = select(ep.endpt1[i], avg, notnan_mask);
  1088. rgbs_vectors[i] = vfloat4(sds.lane<0>(), sds.lane<1>(), sds.lane<2>(), 1.0f);
  1089. }
  1090. else
  1091. {
  1092. // Otherwise, complete the analytic calculation of ideal-endpoint-values for the given
  1093. // set of texel weights and pixel colors
  1094. vfloat4 color_det1 = (left_sum * right_sum) - (middle_sum * middle_sum);
  1095. vfloat4 color_rdet1 = 1.0f / color_det1;
  1096. float ls_det1 = (lmrs_sum.lane<0>() * lmrs_sum.lane<2>()) - (lmrs_sum.lane<1>() * lmrs_sum.lane<1>());
  1097. float ls_rdet1 = 1.0f / ls_det1;
  1098. vfloat4 color_mss1 = (left_sum * left_sum)
  1099. + (2.0f * middle_sum * middle_sum)
  1100. + (right_sum * right_sum);
  1101. float ls_mss1 = (lmrs_sum.lane<0>() * lmrs_sum.lane<0>())
  1102. + (2.0f * lmrs_sum.lane<1>() * lmrs_sum.lane<1>())
  1103. + (lmrs_sum.lane<2>() * lmrs_sum.lane<2>());
  1104. vfloat4 ep0 = (right_sum * color_vec_x - middle_sum * color_vec_y) * color_rdet1;
  1105. vfloat4 ep1 = (left_sum * color_vec_y - middle_sum * color_vec_x) * color_rdet1;
  1106. vmask4 det_mask = abs(color_det1) > (color_mss1 * 1e-4f);
  1107. vmask4 notnan_mask = (ep0 == ep0) & (ep1 == ep1);
  1108. vmask4 full_mask = det_mask & notnan_mask;
  1109. ep.endpt0[i] = select(ep.endpt0[i], ep0, full_mask);
  1110. ep.endpt1[i] = select(ep.endpt1[i], ep1, full_mask);
  1111. float scale_ep0 = (lmrs_sum.lane<2>() * scale_vec.lane<0>() - lmrs_sum.lane<1>() * scale_vec.lane<1>()) * ls_rdet1;
  1112. float scale_ep1 = (lmrs_sum.lane<0>() * scale_vec.lane<1>() - lmrs_sum.lane<1>() * scale_vec.lane<0>()) * ls_rdet1;
  1113. if (fabsf(ls_det1) > (ls_mss1 * 1e-4f) && scale_ep0 == scale_ep0 && scale_ep1 == scale_ep1 && scale_ep0 < scale_ep1)
  1114. {
  1115. float scalediv2 = scale_ep0 / scale_ep1;
  1116. vfloat4 sdsm = scale_dir * scale_ep1;
  1117. rgbs_vectors[i] = vfloat4(sdsm.lane<0>(), sdsm.lane<1>(), sdsm.lane<2>(), scalediv2);
  1118. }
  1119. }
  1120. // Calculations specific to mode #7, the HDR RGB-scale mode - skip if known LDR
  1121. if (blk.rgb_lns[0] || blk.alpha_lns[0])
  1122. {
  1123. vfloat4 weight_weight_sum = vfloat4(weight_weight_sum_s) * color_weight;
  1124. float psum = right_sum_s * hadd_rgb_s(color_weight);
  1125. vfloat4 rgbq_sum = color_vec_x + color_vec_y;
  1126. rgbq_sum.set_lane<3>(hadd_rgb_s(color_vec_y));
  1127. vfloat4 rgbovec = compute_rgbo_vector(rgba_weight_sum, weight_weight_sum, rgbq_sum, psum);
  1128. rgbo_vectors[i] = rgbovec;
  1129. // We can get a failure due to the use of a singular (non-invertible) matrix
  1130. // If it failed, compute rgbo_vectors[] with a different method ...
  1131. if (astc::isnan(dot_s(rgbovec, rgbovec)))
  1132. {
  1133. vfloat4 v0 = ep.endpt0[i];
  1134. vfloat4 v1 = ep.endpt1[i];
  1135. float avgdif = hadd_rgb_s(v1 - v0) * (1.0f / 3.0f);
  1136. avgdif = astc::max(avgdif, 0.0f);
  1137. vfloat4 avg = (v0 + v1) * 0.5f;
  1138. vfloat4 ep0 = avg - vfloat4(avgdif) * 0.5f;
  1139. rgbo_vectors[i] = vfloat4(ep0.lane<0>(), ep0.lane<1>(), ep0.lane<2>(), avgdif);
  1140. }
  1141. }
  1142. }
  1143. }
  1144. /* See header for documentation. */
  1145. void recompute_ideal_colors_2planes(
  1146. const image_block& blk,
  1147. const block_size_descriptor& bsd,
  1148. const decimation_info& di,
  1149. const uint8_t* dec_weights_uquant_plane1,
  1150. const uint8_t* dec_weights_uquant_plane2,
  1151. endpoints& ep,
  1152. vfloat4& rgbs_vector,
  1153. vfloat4& rgbo_vector,
  1154. int plane2_component
  1155. ) {
  1156. unsigned int weight_count = di.weight_count;
  1157. unsigned int total_texel_count = blk.texel_count;
  1158. promise(total_texel_count > 0);
  1159. promise(weight_count > 0);
  1160. ASTCENC_ALIGNAS float dec_weight_plane1[BLOCK_MAX_WEIGHTS_2PLANE];
  1161. ASTCENC_ALIGNAS float dec_weight_plane2[BLOCK_MAX_WEIGHTS_2PLANE];
  1162. assert(weight_count <= BLOCK_MAX_WEIGHTS_2PLANE);
  1163. for (unsigned int i = 0; i < weight_count; i += ASTCENC_SIMD_WIDTH)
  1164. {
  1165. vint unquant_value1(dec_weights_uquant_plane1 + i);
  1166. vfloat unquant_value1f = int_to_float(unquant_value1) * vfloat(1.0f / 64.0f);
  1167. storea(unquant_value1f, dec_weight_plane1 + i);
  1168. vint unquant_value2(dec_weights_uquant_plane2 + i);
  1169. vfloat unquant_value2f = int_to_float(unquant_value2) * vfloat(1.0f / 64.0f);
  1170. storea(unquant_value2f, dec_weight_plane2 + i);
  1171. }
  1172. ASTCENC_ALIGNAS float undec_weight_plane1[BLOCK_MAX_TEXELS];
  1173. ASTCENC_ALIGNAS float undec_weight_plane2[BLOCK_MAX_TEXELS];
  1174. float* undec_weight_plane1_ref;
  1175. float* undec_weight_plane2_ref;
  1176. if (di.max_texel_weight_count == 1)
  1177. {
  1178. undec_weight_plane1_ref = dec_weight_plane1;
  1179. undec_weight_plane2_ref = dec_weight_plane2;
  1180. }
  1181. else if (di.max_texel_weight_count <= 2)
  1182. {
  1183. for (unsigned int i = 0; i < total_texel_count; i += ASTCENC_SIMD_WIDTH)
  1184. {
  1185. vfloat weight = bilinear_infill_vla_2(di, dec_weight_plane1, i);
  1186. storea(weight, undec_weight_plane1 + i);
  1187. weight = bilinear_infill_vla_2(di, dec_weight_plane2, i);
  1188. storea(weight, undec_weight_plane2 + i);
  1189. }
  1190. undec_weight_plane1_ref = undec_weight_plane1;
  1191. undec_weight_plane2_ref = undec_weight_plane2;
  1192. }
  1193. else
  1194. {
  1195. for (unsigned int i = 0; i < total_texel_count; i += ASTCENC_SIMD_WIDTH)
  1196. {
  1197. vfloat weight = bilinear_infill_vla(di, dec_weight_plane1, i);
  1198. storea(weight, undec_weight_plane1 + i);
  1199. weight = bilinear_infill_vla(di, dec_weight_plane2, i);
  1200. storea(weight, undec_weight_plane2 + i);
  1201. }
  1202. undec_weight_plane1_ref = undec_weight_plane1;
  1203. undec_weight_plane2_ref = undec_weight_plane2;
  1204. }
  1205. unsigned int texel_count = bsd.texel_count;
  1206. vfloat4 rgba_weight_sum = max(blk.channel_weight * static_cast<float>(texel_count), 1e-17f);
  1207. vfloat4 scale_dir = normalize(blk.data_mean.swz<0, 1, 2>());
  1208. float scale_max = 0.0f;
  1209. float scale_min = 1e10f;
  1210. float wmin1 = 1.0f;
  1211. float wmax1 = 0.0f;
  1212. float wmin2 = 1.0f;
  1213. float wmax2 = 0.0f;
  1214. float left1_sum_s = 0.0f;
  1215. float middle1_sum_s = 0.0f;
  1216. float right1_sum_s = 0.0f;
  1217. float left2_sum_s = 0.0f;
  1218. float middle2_sum_s = 0.0f;
  1219. float right2_sum_s = 0.0f;
  1220. vfloat4 color_vec_x = vfloat4::zero();
  1221. vfloat4 color_vec_y = vfloat4::zero();
  1222. vfloat4 scale_vec = vfloat4::zero();
  1223. vfloat4 weight_weight_sum = vfloat4(1e-17f);
  1224. vmask4 p2_mask = vint4::lane_id() == vint4(plane2_component);
  1225. vfloat4 color_weight = blk.channel_weight;
  1226. float ls_weight = hadd_rgb_s(color_weight);
  1227. for (unsigned int j = 0; j < texel_count; j++)
  1228. {
  1229. vfloat4 rgba = blk.texel(j);
  1230. float idx0 = undec_weight_plane1_ref[j];
  1231. float om_idx0 = 1.0f - idx0;
  1232. wmin1 = astc::min(idx0, wmin1);
  1233. wmax1 = astc::max(idx0, wmax1);
  1234. float scale = dot3_s(scale_dir, rgba);
  1235. scale_min = astc::min(scale, scale_min);
  1236. scale_max = astc::max(scale, scale_max);
  1237. left1_sum_s += om_idx0 * om_idx0;
  1238. middle1_sum_s += om_idx0 * idx0;
  1239. right1_sum_s += idx0 * idx0;
  1240. float idx1 = undec_weight_plane2_ref[j];
  1241. float om_idx1 = 1.0f - idx1;
  1242. wmin2 = astc::min(idx1, wmin2);
  1243. wmax2 = astc::max(idx1, wmax2);
  1244. left2_sum_s += om_idx1 * om_idx1;
  1245. middle2_sum_s += om_idx1 * idx1;
  1246. right2_sum_s += idx1 * idx1;
  1247. vfloat4 color_idx = select(vfloat4(idx0), vfloat4(idx1), p2_mask);
  1248. vfloat4 cwprod = rgba;
  1249. vfloat4 cwiprod = cwprod * color_idx;
  1250. color_vec_y += cwiprod;
  1251. color_vec_x += cwprod - cwiprod;
  1252. scale_vec += vfloat2(om_idx0, idx0) * (ls_weight * scale);
  1253. weight_weight_sum += color_idx;
  1254. }
  1255. vfloat4 left1_sum = vfloat4(left1_sum_s) * color_weight;
  1256. vfloat4 middle1_sum = vfloat4(middle1_sum_s) * color_weight;
  1257. vfloat4 right1_sum = vfloat4(right1_sum_s) * color_weight;
  1258. vfloat4 lmrs_sum = vfloat3(left1_sum_s, middle1_sum_s, right1_sum_s) * ls_weight;
  1259. vfloat4 left2_sum = vfloat4(left2_sum_s) * color_weight;
  1260. vfloat4 middle2_sum = vfloat4(middle2_sum_s) * color_weight;
  1261. vfloat4 right2_sum = vfloat4(right2_sum_s) * color_weight;
  1262. color_vec_x = color_vec_x * color_weight;
  1263. color_vec_y = color_vec_y * color_weight;
  1264. // Initialize the luminance and scale vectors with a reasonable default
  1265. float scalediv = scale_min / astc::max(scale_max, 1e-10f);
  1266. scalediv = astc::clamp1f(scalediv);
  1267. vfloat4 sds = scale_dir * scale_max;
  1268. rgbs_vector = vfloat4(sds.lane<0>(), sds.lane<1>(), sds.lane<2>(), scalediv);
  1269. if (wmin1 >= wmax1 * 0.999f)
  1270. {
  1271. // If all weights in the partition were equal, then just take average of all colors in
  1272. // the partition and use that as both endpoint colors
  1273. vfloat4 avg = (color_vec_x + color_vec_y) / rgba_weight_sum;
  1274. vmask4 p1_mask = vint4::lane_id() != vint4(plane2_component);
  1275. vmask4 notnan_mask = avg == avg;
  1276. vmask4 full_mask = p1_mask & notnan_mask;
  1277. ep.endpt0[0] = select(ep.endpt0[0], avg, full_mask);
  1278. ep.endpt1[0] = select(ep.endpt1[0], avg, full_mask);
  1279. rgbs_vector = vfloat4(sds.lane<0>(), sds.lane<1>(), sds.lane<2>(), 1.0f);
  1280. }
  1281. else
  1282. {
  1283. // Otherwise, complete the analytic calculation of ideal-endpoint-values for the given
  1284. // set of texel weights and pixel colors
  1285. vfloat4 color_det1 = (left1_sum * right1_sum) - (middle1_sum * middle1_sum);
  1286. vfloat4 color_rdet1 = 1.0f / color_det1;
  1287. float ls_det1 = (lmrs_sum.lane<0>() * lmrs_sum.lane<2>()) - (lmrs_sum.lane<1>() * lmrs_sum.lane<1>());
  1288. float ls_rdet1 = 1.0f / ls_det1;
  1289. vfloat4 color_mss1 = (left1_sum * left1_sum)
  1290. + (2.0f * middle1_sum * middle1_sum)
  1291. + (right1_sum * right1_sum);
  1292. float ls_mss1 = (lmrs_sum.lane<0>() * lmrs_sum.lane<0>())
  1293. + (2.0f * lmrs_sum.lane<1>() * lmrs_sum.lane<1>())
  1294. + (lmrs_sum.lane<2>() * lmrs_sum.lane<2>());
  1295. vfloat4 ep0 = (right1_sum * color_vec_x - middle1_sum * color_vec_y) * color_rdet1;
  1296. vfloat4 ep1 = (left1_sum * color_vec_y - middle1_sum * color_vec_x) * color_rdet1;
  1297. float scale_ep0 = (lmrs_sum.lane<2>() * scale_vec.lane<0>() - lmrs_sum.lane<1>() * scale_vec.lane<1>()) * ls_rdet1;
  1298. float scale_ep1 = (lmrs_sum.lane<0>() * scale_vec.lane<1>() - lmrs_sum.lane<1>() * scale_vec.lane<0>()) * ls_rdet1;
  1299. vmask4 p1_mask = vint4::lane_id() != vint4(plane2_component);
  1300. vmask4 det_mask = abs(color_det1) > (color_mss1 * 1e-4f);
  1301. vmask4 notnan_mask = (ep0 == ep0) & (ep1 == ep1);
  1302. vmask4 full_mask = p1_mask & det_mask & notnan_mask;
  1303. ep.endpt0[0] = select(ep.endpt0[0], ep0, full_mask);
  1304. ep.endpt1[0] = select(ep.endpt1[0], ep1, full_mask);
  1305. if (fabsf(ls_det1) > (ls_mss1 * 1e-4f) && scale_ep0 == scale_ep0 && scale_ep1 == scale_ep1 && scale_ep0 < scale_ep1)
  1306. {
  1307. float scalediv2 = scale_ep0 / scale_ep1;
  1308. vfloat4 sdsm = scale_dir * scale_ep1;
  1309. rgbs_vector = vfloat4(sdsm.lane<0>(), sdsm.lane<1>(), sdsm.lane<2>(), scalediv2);
  1310. }
  1311. }
  1312. if (wmin2 >= wmax2 * 0.999f)
  1313. {
  1314. // If all weights in the partition were equal, then just take average of all colors in
  1315. // the partition and use that as both endpoint colors
  1316. vfloat4 avg = (color_vec_x + color_vec_y) / rgba_weight_sum;
  1317. vmask4 notnan_mask = avg == avg;
  1318. vmask4 full_mask = p2_mask & notnan_mask;
  1319. ep.endpt0[0] = select(ep.endpt0[0], avg, full_mask);
  1320. ep.endpt1[0] = select(ep.endpt1[0], avg, full_mask);
  1321. }
  1322. else
  1323. {
  1324. // Otherwise, complete the analytic calculation of ideal-endpoint-values for the given
  1325. // set of texel weights and pixel colors
  1326. vfloat4 color_det2 = (left2_sum * right2_sum) - (middle2_sum * middle2_sum);
  1327. vfloat4 color_rdet2 = 1.0f / color_det2;
  1328. vfloat4 color_mss2 = (left2_sum * left2_sum)
  1329. + (2.0f * middle2_sum * middle2_sum)
  1330. + (right2_sum * right2_sum);
  1331. vfloat4 ep0 = (right2_sum * color_vec_x - middle2_sum * color_vec_y) * color_rdet2;
  1332. vfloat4 ep1 = (left2_sum * color_vec_y - middle2_sum * color_vec_x) * color_rdet2;
  1333. vmask4 det_mask = abs(color_det2) > (color_mss2 * 1e-4f);
  1334. vmask4 notnan_mask = (ep0 == ep0) & (ep1 == ep1);
  1335. vmask4 full_mask = p2_mask & det_mask & notnan_mask;
  1336. ep.endpt0[0] = select(ep.endpt0[0], ep0, full_mask);
  1337. ep.endpt1[0] = select(ep.endpt1[0], ep1, full_mask);
  1338. }
  1339. // Calculations specific to mode #7, the HDR RGB-scale mode - skip if known LDR
  1340. if (blk.rgb_lns[0] || blk.alpha_lns[0])
  1341. {
  1342. weight_weight_sum = weight_weight_sum * color_weight;
  1343. float psum = dot3_s(select(right1_sum, right2_sum, p2_mask), color_weight);
  1344. vfloat4 rgbq_sum = color_vec_x + color_vec_y;
  1345. rgbq_sum.set_lane<3>(hadd_rgb_s(color_vec_y));
  1346. rgbo_vector = compute_rgbo_vector(rgba_weight_sum, weight_weight_sum, rgbq_sum, psum);
  1347. // We can get a failure due to the use of a singular (non-invertible) matrix
  1348. // If it failed, compute rgbo_vectors[] with a different method ...
  1349. if (astc::isnan(dot_s(rgbo_vector, rgbo_vector)))
  1350. {
  1351. vfloat4 v0 = ep.endpt0[0];
  1352. vfloat4 v1 = ep.endpt1[0];
  1353. float avgdif = hadd_rgb_s(v1 - v0) * (1.0f / 3.0f);
  1354. avgdif = astc::max(avgdif, 0.0f);
  1355. vfloat4 avg = (v0 + v1) * 0.5f;
  1356. vfloat4 ep0 = avg - vfloat4(avgdif) * 0.5f;
  1357. rgbo_vector = vfloat4(ep0.lane<0>(), ep0.lane<1>(), ep0.lane<2>(), avgdif);
  1358. }
  1359. }
  1360. }
  1361. #endif