basisu_kernels_imp.h 24 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647
  1. // basisu_kernels_imp.h - Do not directly include
  2. // Copyright (C) 2019-2021 Binomial LLC. All Rights Reserved.
  3. //
  4. // Licensed under the Apache License, Version 2.0 (the "License");
  5. // you may not use this file except in compliance with the License.
  6. // You may obtain a copy of the License at
  7. //
  8. // http://www.apache.org/licenses/LICENSE-2.0
  9. //
  10. // Unless required by applicable law or agreed to in writing, software
  11. // distributed under the License is distributed on an "AS IS" BASIS,
  12. // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. // See the License for the specific language governing permissions and
  14. // limitations under the License.
  15. using namespace CPPSPMD;
  16. namespace CPPSPMD_NAME(basisu_kernels_namespace)
  17. {
  18. struct perceptual_distance_rgb_4_N : spmd_kernel
  19. {
  20. void _call(int64_t* pDistance,
  21. const uint8_t* pSelectors,
  22. const color_rgba* pBlock_colors,
  23. const color_rgba* pSrc_pixels, uint32_t n,
  24. int64_t early_out_err)
  25. {
  26. assert(early_out_err >= 0);
  27. *pDistance = 0;
  28. __m128i block_colors[4];
  29. vint block_colors_r[4], block_colors_g[4], block_colors_b[4];
  30. for (uint32_t i = 0; i < 4; i++)
  31. {
  32. block_colors[i] = load_rgba32(&pBlock_colors[i]);
  33. store_all(block_colors_r[i], (int)pBlock_colors[i].r);
  34. store_all(block_colors_g[i], (int)pBlock_colors[i].g);
  35. store_all(block_colors_b[i], (int)pBlock_colors[i].b);
  36. }
  37. uint32_t i;
  38. for (i = 0; (i + 4) <= n; i += 4)
  39. {
  40. __m128i c0 = load_rgba32(&pSrc_pixels[i + 0]), c1 = load_rgba32(&pSrc_pixels[i + 1]), c2 = load_rgba32(&pSrc_pixels[i + 2]), c3 = load_rgba32(&pSrc_pixels[i + 3]);
  41. vint r, g, b, a;
  42. transpose4x4(r.m_value, g.m_value, b.m_value, a.m_value, c0, c1, c2, c3);
  43. int s0 = pSelectors[i], s1 = pSelectors[i + 1], s2 = pSelectors[i + 2], s3 = pSelectors[i + 3];
  44. vint base_r, base_g, base_b, base_a;
  45. if ((s0 == s1) && (s0 == s2) && (s0 == s3))
  46. {
  47. store_all(base_r, block_colors_r[s0]);
  48. store_all(base_g, block_colors_g[s0]);
  49. store_all(base_b, block_colors_b[s0]);
  50. }
  51. else
  52. {
  53. __m128i k0 = block_colors[s0], k1 = block_colors[s1], k2 = block_colors[s2], k3 = block_colors[s3];
  54. transpose4x4(base_r.m_value, base_g.m_value, base_b.m_value, base_a.m_value, k0, k1, k2, k3);
  55. }
  56. vint dr = base_r - r;
  57. vint dg = base_g - g;
  58. vint db = base_b - b;
  59. vint delta_l = dr * 27 + dg * 92 + db * 9;
  60. vint delta_cr = dr * 128 - delta_l;
  61. vint delta_cb = db * 128 - delta_l;
  62. vint id = ((delta_l * delta_l) >> 7) +
  63. ((((delta_cr * delta_cr) >> 7) * 26) >> 7) +
  64. ((((delta_cb * delta_cb) >> 7) * 3) >> 7);
  65. *pDistance += reduce_add(id);
  66. if (*pDistance >= early_out_err)
  67. return;
  68. }
  69. for (; i < n; i++)
  70. {
  71. int r = pSrc_pixels[i].r, g = pSrc_pixels[i].g, b = pSrc_pixels[i].b;
  72. int sel = pSelectors[i];
  73. int base_r = pBlock_colors[sel].r, base_g = pBlock_colors[sel].g, base_b = pBlock_colors[sel].b;
  74. int dr = base_r - r;
  75. int dg = base_g - g;
  76. int db = base_b - b;
  77. int delta_l = dr * 27 + dg * 92 + db * 9;
  78. int delta_cr = dr * 128 - delta_l;
  79. int delta_cb = db * 128 - delta_l;
  80. int id = ((delta_l * delta_l) >> 7) +
  81. ((((delta_cr * delta_cr) >> 7) * 26) >> 7) +
  82. ((((delta_cb * delta_cb) >> 7) * 3) >> 7);
  83. *pDistance += id;
  84. if (*pDistance >= early_out_err)
  85. return;
  86. }
  87. }
  88. };
  89. struct linear_distance_rgb_4_N : spmd_kernel
  90. {
  91. void _call(int64_t* pDistance,
  92. const uint8_t* pSelectors,
  93. const color_rgba* pBlock_colors,
  94. const color_rgba* pSrc_pixels, uint32_t n,
  95. int64_t early_out_err)
  96. {
  97. assert(early_out_err >= 0);
  98. *pDistance = 0;
  99. __m128i block_colors[4];
  100. vint block_colors_r[4], block_colors_g[4], block_colors_b[4];
  101. for (uint32_t i = 0; i < 4; i++)
  102. {
  103. block_colors[i] = load_rgba32(&pBlock_colors[i]);
  104. store_all(block_colors_r[i], (int)pBlock_colors[i].r);
  105. store_all(block_colors_g[i], (int)pBlock_colors[i].g);
  106. store_all(block_colors_b[i], (int)pBlock_colors[i].b);
  107. }
  108. uint32_t i;
  109. for (i = 0; (i + 4) <= n; i += 4)
  110. {
  111. __m128i c0 = load_rgba32(&pSrc_pixels[i + 0]), c1 = load_rgba32(&pSrc_pixels[i + 1]), c2 = load_rgba32(&pSrc_pixels[i + 2]), c3 = load_rgba32(&pSrc_pixels[i + 3]);
  112. vint r, g, b, a;
  113. transpose4x4(r.m_value, g.m_value, b.m_value, a.m_value, c0, c1, c2, c3);
  114. int s0 = pSelectors[i], s1 = pSelectors[i + 1], s2 = pSelectors[i + 2], s3 = pSelectors[i + 3];
  115. vint base_r, base_g, base_b, base_a;
  116. if ((s0 == s1) && (s0 == s2) && (s0 == s3))
  117. {
  118. store_all(base_r, block_colors_r[s0]);
  119. store_all(base_g, block_colors_g[s0]);
  120. store_all(base_b, block_colors_b[s0]);
  121. }
  122. else
  123. {
  124. __m128i k0 = block_colors[s0], k1 = block_colors[s1], k2 = block_colors[s2], k3 = block_colors[s3];
  125. transpose4x4(base_r.m_value, base_g.m_value, base_b.m_value, base_a.m_value, k0, k1, k2, k3);
  126. }
  127. vint dr = base_r - r;
  128. vint dg = base_g - g;
  129. vint db = base_b - b;
  130. vint id = dr * dr + dg * dg + db * db;
  131. *pDistance += reduce_add(id);
  132. if (*pDistance >= early_out_err)
  133. return;
  134. }
  135. for (; i < n; i++)
  136. {
  137. int r = pSrc_pixels[i].r, g = pSrc_pixels[i].g, b = pSrc_pixels[i].b;
  138. int sel = pSelectors[i];
  139. int base_r = pBlock_colors[sel].r, base_g = pBlock_colors[sel].g, base_b = pBlock_colors[sel].b;
  140. int dr = base_r - r;
  141. int dg = base_g - g;
  142. int db = base_b - b;
  143. int id = dr * dr + dg * dg + db * db;
  144. *pDistance += id;
  145. if (*pDistance >= early_out_err)
  146. return;
  147. }
  148. }
  149. };
  150. struct find_selectors_perceptual_rgb_4_N : spmd_kernel
  151. {
  152. inline vint compute_dist(
  153. const vint& base_r, const vint& base_g, const vint& base_b,
  154. const vint& r, const vint& g, const vint& b)
  155. {
  156. vint dr = base_r - r;
  157. vint dg = base_g - g;
  158. vint db = base_b - b;
  159. vint delta_l = dr * 27 + dg * 92 + db * 9;
  160. vint delta_cr = dr * 128 - delta_l;
  161. vint delta_cb = db * 128 - delta_l;
  162. vint id = VINT_SHIFT_RIGHT(delta_l * delta_l, 7) +
  163. VINT_SHIFT_RIGHT(VINT_SHIFT_RIGHT(delta_cr * delta_cr, 7) * 26, 7) +
  164. VINT_SHIFT_RIGHT(VINT_SHIFT_RIGHT(delta_cb * delta_cb, 7) * 3, 7);
  165. return id;
  166. }
  167. void _call(int64_t* pDistance,
  168. uint8_t* pSelectors,
  169. const color_rgba* pBlock_colors,
  170. const color_rgba* pSrc_pixels, uint32_t n,
  171. int64_t early_out_err)
  172. {
  173. assert(early_out_err >= 0);
  174. *pDistance = 0;
  175. vint block_colors_r[4], block_colors_g[4], block_colors_b[4];
  176. for (uint32_t i = 0; i < 4; i++)
  177. {
  178. store_all(block_colors_r[i], (int)pBlock_colors[i].r);
  179. store_all(block_colors_g[i], (int)pBlock_colors[i].g);
  180. store_all(block_colors_b[i], (int)pBlock_colors[i].b);
  181. }
  182. const __m128i shuf = _mm_set_epi8(-128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, 12, 8, 4, 0);
  183. uint32_t i;
  184. for (i = 0; (i + 4) <= n; i += 4)
  185. {
  186. __m128i c0 = load_rgba32(&pSrc_pixels[i + 0]), c1 = load_rgba32(&pSrc_pixels[i + 1]), c2 = load_rgba32(&pSrc_pixels[i + 2]), c3 = load_rgba32(&pSrc_pixels[i + 3]);
  187. vint r, g, b, a;
  188. transpose4x4(r.m_value, g.m_value, b.m_value, a.m_value, c0, c1, c2, c3);
  189. vint dist0 = compute_dist(block_colors_r[0], block_colors_g[0], block_colors_b[0], r, g, b);
  190. vint dist1 = compute_dist(block_colors_r[1], block_colors_g[1], block_colors_b[1], r, g, b);
  191. vint dist2 = compute_dist(block_colors_r[2], block_colors_g[2], block_colors_b[2], r, g, b);
  192. vint dist3 = compute_dist(block_colors_r[3], block_colors_g[3], block_colors_b[3], r, g, b);
  193. vint min_dist = min(min(min(dist0, dist1), dist2), dist3);
  194. vint sels = spmd_ternaryi(min_dist == dist0, 0, spmd_ternaryi(min_dist == dist1, 1, spmd_ternaryi(min_dist == dist2, 2, 3)));
  195. __m128i vsels = shuffle_epi8(sels.m_value, shuf);
  196. storeu_si32((void *)(pSelectors + i), vsels);
  197. *pDistance += reduce_add(min_dist);
  198. if (*pDistance >= early_out_err)
  199. return;
  200. }
  201. for (; i < n; i++)
  202. {
  203. int r = pSrc_pixels[i].r, g = pSrc_pixels[i].g, b = pSrc_pixels[i].b;
  204. int best_err = INT_MAX, best_sel = 0;
  205. for (int sel = 0; sel < 4; sel++)
  206. {
  207. int base_r = pBlock_colors[sel].r, base_g = pBlock_colors[sel].g, base_b = pBlock_colors[sel].b;
  208. int dr = base_r - r;
  209. int dg = base_g - g;
  210. int db = base_b - b;
  211. int delta_l = dr * 27 + dg * 92 + db * 9;
  212. int delta_cr = dr * 128 - delta_l;
  213. int delta_cb = db * 128 - delta_l;
  214. int id = ((delta_l * delta_l) >> 7) +
  215. ((((delta_cr * delta_cr) >> 7) * 26) >> 7) +
  216. ((((delta_cb * delta_cb) >> 7) * 3) >> 7);
  217. if (id < best_err)
  218. {
  219. best_err = id;
  220. best_sel = sel;
  221. }
  222. }
  223. pSelectors[i] = (uint8_t)best_sel;
  224. *pDistance += best_err;
  225. if (*pDistance >= early_out_err)
  226. return;
  227. }
  228. }
  229. };
  230. struct find_selectors_linear_rgb_4_N : spmd_kernel
  231. {
  232. inline vint compute_dist(
  233. const vint& base_r, const vint& base_g, const vint& base_b,
  234. const vint& r, const vint& g, const vint& b)
  235. {
  236. vint dr = base_r - r;
  237. vint dg = base_g - g;
  238. vint db = base_b - b;
  239. vint id = dr * dr + dg * dg + db * db;
  240. return id;
  241. }
  242. void _call(int64_t* pDistance,
  243. uint8_t* pSelectors,
  244. const color_rgba* pBlock_colors,
  245. const color_rgba* pSrc_pixels, uint32_t n,
  246. int64_t early_out_err)
  247. {
  248. assert(early_out_err >= 0);
  249. *pDistance = 0;
  250. vint block_colors_r[4], block_colors_g[4], block_colors_b[4];
  251. for (uint32_t i = 0; i < 4; i++)
  252. {
  253. store_all(block_colors_r[i], (int)pBlock_colors[i].r);
  254. store_all(block_colors_g[i], (int)pBlock_colors[i].g);
  255. store_all(block_colors_b[i], (int)pBlock_colors[i].b);
  256. }
  257. const __m128i shuf = _mm_set_epi8(-128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, 12, 8, 4, 0);
  258. uint32_t i;
  259. for (i = 0; (i + 4) <= n; i += 4)
  260. {
  261. __m128i c0 = load_rgba32(&pSrc_pixels[i + 0]), c1 = load_rgba32(&pSrc_pixels[i + 1]), c2 = load_rgba32(&pSrc_pixels[i + 2]), c3 = load_rgba32(&pSrc_pixels[i + 3]);
  262. vint r, g, b, a;
  263. transpose4x4(r.m_value, g.m_value, b.m_value, a.m_value, c0, c1, c2, c3);
  264. vint dist0 = compute_dist(block_colors_r[0], block_colors_g[0], block_colors_b[0], r, g, b);
  265. vint dist1 = compute_dist(block_colors_r[1], block_colors_g[1], block_colors_b[1], r, g, b);
  266. vint dist2 = compute_dist(block_colors_r[2], block_colors_g[2], block_colors_b[2], r, g, b);
  267. vint dist3 = compute_dist(block_colors_r[3], block_colors_g[3], block_colors_b[3], r, g, b);
  268. vint min_dist = min(min(min(dist0, dist1), dist2), dist3);
  269. vint sels = spmd_ternaryi(min_dist == dist0, 0, spmd_ternaryi(min_dist == dist1, 1, spmd_ternaryi(min_dist == dist2, 2, 3)));
  270. __m128i vsels = shuffle_epi8(sels.m_value, shuf);
  271. storeu_si32((void *)(pSelectors + i), vsels);
  272. *pDistance += reduce_add(min_dist);
  273. if (*pDistance >= early_out_err)
  274. return;
  275. }
  276. for (; i < n; i++)
  277. {
  278. int r = pSrc_pixels[i].r, g = pSrc_pixels[i].g, b = pSrc_pixels[i].b;
  279. int best_err = INT_MAX, best_sel = 0;
  280. for (int sel = 0; sel < 4; sel++)
  281. {
  282. int base_r = pBlock_colors[sel].r, base_g = pBlock_colors[sel].g, base_b = pBlock_colors[sel].b;
  283. int dr = base_r - r;
  284. int dg = base_g - g;
  285. int db = base_b - b;
  286. int id = dr * dr + dg * dg + db * db;
  287. if (id < best_err)
  288. {
  289. best_err = id;
  290. best_sel = sel;
  291. }
  292. }
  293. pSelectors[i] = (uint8_t)best_sel;
  294. *pDistance += best_err;
  295. if (*pDistance >= early_out_err)
  296. return;
  297. }
  298. }
  299. };
  300. struct find_lowest_error_perceptual_rgb_4_N : spmd_kernel
  301. {
  302. inline vint compute_dist(
  303. const vint& base_r, const vint& base_g, const vint& base_b,
  304. const vint& r, const vint& g, const vint& b)
  305. {
  306. vint dr = base_r - r;
  307. vint dg = base_g - g;
  308. vint db = base_b - b;
  309. vint delta_l = dr * 27 + dg * 92 + db * 9;
  310. vint delta_cr = dr * 128 - delta_l;
  311. vint delta_cb = db * 128 - delta_l;
  312. vint id = VINT_SHIFT_RIGHT(delta_l * delta_l, 7) +
  313. VINT_SHIFT_RIGHT(VINT_SHIFT_RIGHT(delta_cr * delta_cr, 7) * 26, 7) +
  314. VINT_SHIFT_RIGHT(VINT_SHIFT_RIGHT(delta_cb * delta_cb, 7) * 3, 7);
  315. return id;
  316. }
  317. void _call(int64_t* pDistance,
  318. const color_rgba* pBlock_colors,
  319. const color_rgba* pSrc_pixels, uint32_t n,
  320. int64_t early_out_error)
  321. {
  322. assert(early_out_error >= 0);
  323. *pDistance = 0;
  324. vint block_colors_r[4], block_colors_g[4], block_colors_b[4];
  325. for (uint32_t i = 0; i < 4; i++)
  326. {
  327. store_all(block_colors_r[i], (int)pBlock_colors[i].r);
  328. store_all(block_colors_g[i], (int)pBlock_colors[i].g);
  329. store_all(block_colors_b[i], (int)pBlock_colors[i].b);
  330. }
  331. uint32_t i;
  332. for (i = 0; (i + 4) <= n; i += 4)
  333. {
  334. __m128i c0 = load_rgba32(&pSrc_pixels[i + 0]), c1 = load_rgba32(&pSrc_pixels[i + 1]), c2 = load_rgba32(&pSrc_pixels[i + 2]), c3 = load_rgba32(&pSrc_pixels[i + 3]);
  335. vint r, g, b, a;
  336. transpose4x4(r.m_value, g.m_value, b.m_value, a.m_value, c0, c1, c2, c3);
  337. vint dist0 = compute_dist(block_colors_r[0], block_colors_g[0], block_colors_b[0], r, g, b);
  338. vint dist1 = compute_dist(block_colors_r[1], block_colors_g[1], block_colors_b[1], r, g, b);
  339. vint dist2 = compute_dist(block_colors_r[2], block_colors_g[2], block_colors_b[2], r, g, b);
  340. vint dist3 = compute_dist(block_colors_r[3], block_colors_g[3], block_colors_b[3], r, g, b);
  341. vint min_dist = min(min(min(dist0, dist1), dist2), dist3);
  342. *pDistance += reduce_add(min_dist);
  343. if (*pDistance > early_out_error)
  344. return;
  345. }
  346. for (; i < n; i++)
  347. {
  348. int r = pSrc_pixels[i].r, g = pSrc_pixels[i].g, b = pSrc_pixels[i].b;
  349. int best_err = INT_MAX;
  350. for (int sel = 0; sel < 4; sel++)
  351. {
  352. int base_r = pBlock_colors[sel].r, base_g = pBlock_colors[sel].g, base_b = pBlock_colors[sel].b;
  353. int dr = base_r - r;
  354. int dg = base_g - g;
  355. int db = base_b - b;
  356. int delta_l = dr * 27 + dg * 92 + db * 9;
  357. int delta_cr = dr * 128 - delta_l;
  358. int delta_cb = db * 128 - delta_l;
  359. int id = ((delta_l * delta_l) >> 7) +
  360. ((((delta_cr * delta_cr) >> 7) * 26) >> 7) +
  361. ((((delta_cb * delta_cb) >> 7) * 3) >> 7);
  362. if (id < best_err)
  363. {
  364. best_err = id;
  365. }
  366. }
  367. *pDistance += best_err;
  368. if (*pDistance > early_out_error)
  369. return;
  370. }
  371. }
  372. };
  373. struct find_lowest_error_linear_rgb_4_N : spmd_kernel
  374. {
  375. inline vint compute_dist(
  376. const vint& base_r, const vint& base_g, const vint& base_b,
  377. const vint& r, const vint& g, const vint& b)
  378. {
  379. vint dr = base_r - r;
  380. vint dg = base_g - g;
  381. vint db = base_b - b;
  382. vint id = dr * dr + dg * dg + db * db;
  383. return id;
  384. }
  385. void _call(int64_t* pDistance,
  386. const color_rgba* pBlock_colors,
  387. const color_rgba* pSrc_pixels, uint32_t n,
  388. int64_t early_out_error)
  389. {
  390. assert(early_out_error >= 0);
  391. *pDistance = 0;
  392. vint block_colors_r[4], block_colors_g[4], block_colors_b[4];
  393. for (uint32_t i = 0; i < 4; i++)
  394. {
  395. store_all(block_colors_r[i], (int)pBlock_colors[i].r);
  396. store_all(block_colors_g[i], (int)pBlock_colors[i].g);
  397. store_all(block_colors_b[i], (int)pBlock_colors[i].b);
  398. }
  399. uint32_t i;
  400. for (i = 0; (i + 4) <= n; i += 4)
  401. {
  402. __m128i c0 = load_rgba32(&pSrc_pixels[i + 0]), c1 = load_rgba32(&pSrc_pixels[i + 1]), c2 = load_rgba32(&pSrc_pixels[i + 2]), c3 = load_rgba32(&pSrc_pixels[i + 3]);
  403. vint r, g, b, a;
  404. transpose4x4(r.m_value, g.m_value, b.m_value, a.m_value, c0, c1, c2, c3);
  405. vint dist0 = compute_dist(block_colors_r[0], block_colors_g[0], block_colors_b[0], r, g, b);
  406. vint dist1 = compute_dist(block_colors_r[1], block_colors_g[1], block_colors_b[1], r, g, b);
  407. vint dist2 = compute_dist(block_colors_r[2], block_colors_g[2], block_colors_b[2], r, g, b);
  408. vint dist3 = compute_dist(block_colors_r[3], block_colors_g[3], block_colors_b[3], r, g, b);
  409. vint min_dist = min(min(min(dist0, dist1), dist2), dist3);
  410. *pDistance += reduce_add(min_dist);
  411. if (*pDistance > early_out_error)
  412. return;
  413. }
  414. for (; i < n; i++)
  415. {
  416. int r = pSrc_pixels[i].r, g = pSrc_pixels[i].g, b = pSrc_pixels[i].b;
  417. int best_err = INT_MAX;
  418. for (int sel = 0; sel < 4; sel++)
  419. {
  420. int base_r = pBlock_colors[sel].r, base_g = pBlock_colors[sel].g, base_b = pBlock_colors[sel].b;
  421. int dr = base_r - r;
  422. int dg = base_g - g;
  423. int db = base_b - b;
  424. int id = dr * dr + dg * dg + db * db;
  425. if (id < best_err)
  426. {
  427. best_err = id;
  428. }
  429. }
  430. *pDistance += best_err;
  431. if (*pDistance > early_out_error)
  432. return;
  433. }
  434. }
  435. };
  436. struct update_covar_matrix_16x16 : spmd_kernel
  437. {
  438. void _call(
  439. uint32_t num_vecs, const void* pWeighted_vecs_void, const void* pOrigin_void, const uint32_t* pVec_indices, void* pMatrix16x16_void)
  440. {
  441. const std::pair<vec16F, uint64_t>* pWeighted_vecs = static_cast< const std::pair<vec16F, uint64_t> *>(pWeighted_vecs_void);
  442. const float* pOrigin = static_cast<const float*>(pOrigin_void);
  443. vfloat org0 = loadu_linear_all(pOrigin), org1 = loadu_linear_all(pOrigin + 4), org2 = loadu_linear_all(pOrigin + 8), org3 = loadu_linear_all(pOrigin + 12);
  444. vfloat mat[16][4];
  445. vfloat vzero(zero_vfloat());
  446. for (uint32_t i = 0; i < 16; i++)
  447. {
  448. store_all(mat[i][0], vzero);
  449. store_all(mat[i][1], vzero);
  450. store_all(mat[i][2], vzero);
  451. store_all(mat[i][3], vzero);
  452. }
  453. for (uint32_t k = 0; k < num_vecs; k++)
  454. {
  455. const uint32_t vec_index = pVec_indices[k];
  456. const float* pW = pWeighted_vecs[vec_index].first.get_ptr();
  457. vfloat weight((float)pWeighted_vecs[vec_index].second);
  458. vfloat vec[4] = { loadu_linear_all(pW) - org0, loadu_linear_all(pW + 4) - org1, loadu_linear_all(pW + 8) - org2, loadu_linear_all(pW + 12) - org3 };
  459. vfloat wvec0 = vec[0] * weight, wvec1 = vec[1] * weight, wvec2 = vec[2] * weight, wvec3 = vec[3] * weight;
  460. for (uint32_t j = 0; j < 16; j++)
  461. {
  462. vfloat vx = ((const float*)vec)[j];
  463. store_all(mat[j][0], mat[j][0] + vx * wvec0);
  464. store_all(mat[j][1], mat[j][1] + vx * wvec1);
  465. store_all(mat[j][2], mat[j][2] + vx * wvec2);
  466. store_all(mat[j][3], mat[j][3] + vx * wvec3);
  467. } // j
  468. } // k
  469. float* pMatrix = static_cast<float*>(pMatrix16x16_void);
  470. float* pDst = pMatrix;
  471. for (uint32_t i = 0; i < 16; i++)
  472. {
  473. storeu_linear_all(pDst, mat[i][0]);
  474. storeu_linear_all(pDst + 4, mat[i][1]);
  475. storeu_linear_all(pDst + 8, mat[i][2]);
  476. storeu_linear_all(pDst + 12, mat[i][3]);
  477. pDst += 16;
  478. }
  479. }
  480. };
  481. } // namespace
  482. using namespace CPPSPMD_NAME(basisu_kernels_namespace);
  483. void CPPSPMD_NAME(perceptual_distance_rgb_4_N)(int64_t* pDistance, const uint8_t* pSelectors, const color_rgba* pBlock_colors, const color_rgba* pSrc_pixels, uint32_t n, int64_t early_out_err)
  484. {
  485. spmd_call< perceptual_distance_rgb_4_N >(pDistance, pSelectors, pBlock_colors, pSrc_pixels, n, early_out_err);
  486. }
  487. void CPPSPMD_NAME(linear_distance_rgb_4_N)(int64_t* pDistance, const uint8_t* pSelectors, const color_rgba* pBlock_colors, const color_rgba* pSrc_pixels, uint32_t n, int64_t early_out_err)
  488. {
  489. spmd_call< linear_distance_rgb_4_N >(pDistance, pSelectors, pBlock_colors, pSrc_pixels, n, early_out_err);
  490. }
  491. void CPPSPMD_NAME(find_selectors_perceptual_rgb_4_N)(int64_t *pDistance, uint8_t* pSelectors, const color_rgba* pBlock_colors, const color_rgba* pSrc_pixels, uint32_t n, int64_t early_out_err)
  492. {
  493. spmd_call< find_selectors_perceptual_rgb_4_N >(pDistance, pSelectors, pBlock_colors, pSrc_pixels, n, early_out_err);
  494. }
  495. void CPPSPMD_NAME(find_selectors_linear_rgb_4_N)(int64_t* pDistance, uint8_t* pSelectors, const color_rgba* pBlock_colors, const color_rgba* pSrc_pixels, uint32_t n, int64_t early_out_err)
  496. {
  497. spmd_call< find_selectors_linear_rgb_4_N >(pDistance, pSelectors, pBlock_colors, pSrc_pixels, n, early_out_err);
  498. }
  499. void CPPSPMD_NAME(find_lowest_error_perceptual_rgb_4_N)(int64_t* pDistance, const color_rgba* pBlock_colors, const color_rgba* pSrc_pixels, uint32_t n, int64_t early_out_error)
  500. {
  501. spmd_call< find_lowest_error_perceptual_rgb_4_N >(pDistance, pBlock_colors, pSrc_pixels, n, early_out_error);
  502. }
  503. void CPPSPMD_NAME(find_lowest_error_linear_rgb_4_N)(int64_t* pDistance, const color_rgba* pBlock_colors, const color_rgba* pSrc_pixels, uint32_t n, int64_t early_out_error)
  504. {
  505. spmd_call< find_lowest_error_linear_rgb_4_N >(pDistance, pBlock_colors, pSrc_pixels, n, early_out_error);
  506. }
  507. void CPPSPMD_NAME(update_covar_matrix_16x16)(uint32_t num_vecs, const void* pWeighted_vecs, const void* pOrigin, const uint32_t *pVec_indices, void* pMatrix16x16)
  508. {
  509. spmd_call < update_covar_matrix_16x16 >(num_vecs, pWeighted_vecs, pOrigin, pVec_indices, pMatrix16x16);
  510. }