123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647 |
- // basisu_kernels_imp.h - Do not directly include
- // Copyright (C) 2019-2021 Binomial LLC. All Rights Reserved.
- //
- // Licensed under the Apache License, Version 2.0 (the "License");
- // you may not use this file except in compliance with the License.
- // You may obtain a copy of the License at
- //
- // http://www.apache.org/licenses/LICENSE-2.0
- //
- // Unless required by applicable law or agreed to in writing, software
- // distributed under the License is distributed on an "AS IS" BASIS,
- // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- // See the License for the specific language governing permissions and
- // limitations under the License.
- using namespace CPPSPMD;
- namespace CPPSPMD_NAME(basisu_kernels_namespace)
- {
- struct perceptual_distance_rgb_4_N : spmd_kernel
- {
- void _call(int64_t* pDistance,
- const uint8_t* pSelectors,
- const color_rgba* pBlock_colors,
- const color_rgba* pSrc_pixels, uint32_t n,
- int64_t early_out_err)
- {
- assert(early_out_err >= 0);
- *pDistance = 0;
- __m128i block_colors[4];
- vint block_colors_r[4], block_colors_g[4], block_colors_b[4];
- for (uint32_t i = 0; i < 4; i++)
- {
- block_colors[i] = load_rgba32(&pBlock_colors[i]);
- store_all(block_colors_r[i], (int)pBlock_colors[i].r);
- store_all(block_colors_g[i], (int)pBlock_colors[i].g);
- store_all(block_colors_b[i], (int)pBlock_colors[i].b);
- }
- uint32_t i;
- for (i = 0; (i + 4) <= n; i += 4)
- {
- __m128i c0 = load_rgba32(&pSrc_pixels[i + 0]), c1 = load_rgba32(&pSrc_pixels[i + 1]), c2 = load_rgba32(&pSrc_pixels[i + 2]), c3 = load_rgba32(&pSrc_pixels[i + 3]);
- vint r, g, b, a;
- transpose4x4(r.m_value, g.m_value, b.m_value, a.m_value, c0, c1, c2, c3);
- int s0 = pSelectors[i], s1 = pSelectors[i + 1], s2 = pSelectors[i + 2], s3 = pSelectors[i + 3];
- vint base_r, base_g, base_b, base_a;
- if ((s0 == s1) && (s0 == s2) && (s0 == s3))
- {
- store_all(base_r, block_colors_r[s0]);
- store_all(base_g, block_colors_g[s0]);
- store_all(base_b, block_colors_b[s0]);
- }
- else
- {
- __m128i k0 = block_colors[s0], k1 = block_colors[s1], k2 = block_colors[s2], k3 = block_colors[s3];
- transpose4x4(base_r.m_value, base_g.m_value, base_b.m_value, base_a.m_value, k0, k1, k2, k3);
- }
- vint dr = base_r - r;
- vint dg = base_g - g;
- vint db = base_b - b;
- vint delta_l = dr * 27 + dg * 92 + db * 9;
- vint delta_cr = dr * 128 - delta_l;
- vint delta_cb = db * 128 - delta_l;
- vint id = ((delta_l * delta_l) >> 7) +
- ((((delta_cr * delta_cr) >> 7) * 26) >> 7) +
- ((((delta_cb * delta_cb) >> 7) * 3) >> 7);
- *pDistance += reduce_add(id);
- if (*pDistance >= early_out_err)
- return;
- }
- for (; i < n; i++)
- {
- int r = pSrc_pixels[i].r, g = pSrc_pixels[i].g, b = pSrc_pixels[i].b;
- int sel = pSelectors[i];
- int base_r = pBlock_colors[sel].r, base_g = pBlock_colors[sel].g, base_b = pBlock_colors[sel].b;
- int dr = base_r - r;
- int dg = base_g - g;
- int db = base_b - b;
- int delta_l = dr * 27 + dg * 92 + db * 9;
- int delta_cr = dr * 128 - delta_l;
- int delta_cb = db * 128 - delta_l;
- int id = ((delta_l * delta_l) >> 7) +
- ((((delta_cr * delta_cr) >> 7) * 26) >> 7) +
- ((((delta_cb * delta_cb) >> 7) * 3) >> 7);
- *pDistance += id;
- if (*pDistance >= early_out_err)
- return;
- }
- }
- };
- struct linear_distance_rgb_4_N : spmd_kernel
- {
- void _call(int64_t* pDistance,
- const uint8_t* pSelectors,
- const color_rgba* pBlock_colors,
- const color_rgba* pSrc_pixels, uint32_t n,
- int64_t early_out_err)
- {
- assert(early_out_err >= 0);
- *pDistance = 0;
- __m128i block_colors[4];
- vint block_colors_r[4], block_colors_g[4], block_colors_b[4];
- for (uint32_t i = 0; i < 4; i++)
- {
- block_colors[i] = load_rgba32(&pBlock_colors[i]);
- store_all(block_colors_r[i], (int)pBlock_colors[i].r);
- store_all(block_colors_g[i], (int)pBlock_colors[i].g);
- store_all(block_colors_b[i], (int)pBlock_colors[i].b);
- }
- uint32_t i;
- for (i = 0; (i + 4) <= n; i += 4)
- {
- __m128i c0 = load_rgba32(&pSrc_pixels[i + 0]), c1 = load_rgba32(&pSrc_pixels[i + 1]), c2 = load_rgba32(&pSrc_pixels[i + 2]), c3 = load_rgba32(&pSrc_pixels[i + 3]);
- vint r, g, b, a;
- transpose4x4(r.m_value, g.m_value, b.m_value, a.m_value, c0, c1, c2, c3);
- int s0 = pSelectors[i], s1 = pSelectors[i + 1], s2 = pSelectors[i + 2], s3 = pSelectors[i + 3];
- vint base_r, base_g, base_b, base_a;
- if ((s0 == s1) && (s0 == s2) && (s0 == s3))
- {
- store_all(base_r, block_colors_r[s0]);
- store_all(base_g, block_colors_g[s0]);
- store_all(base_b, block_colors_b[s0]);
- }
- else
- {
- __m128i k0 = block_colors[s0], k1 = block_colors[s1], k2 = block_colors[s2], k3 = block_colors[s3];
- transpose4x4(base_r.m_value, base_g.m_value, base_b.m_value, base_a.m_value, k0, k1, k2, k3);
- }
- vint dr = base_r - r;
- vint dg = base_g - g;
- vint db = base_b - b;
- vint id = dr * dr + dg * dg + db * db;
- *pDistance += reduce_add(id);
- if (*pDistance >= early_out_err)
- return;
- }
- for (; i < n; i++)
- {
- int r = pSrc_pixels[i].r, g = pSrc_pixels[i].g, b = pSrc_pixels[i].b;
- int sel = pSelectors[i];
- int base_r = pBlock_colors[sel].r, base_g = pBlock_colors[sel].g, base_b = pBlock_colors[sel].b;
- int dr = base_r - r;
- int dg = base_g - g;
- int db = base_b - b;
- int id = dr * dr + dg * dg + db * db;
- *pDistance += id;
- if (*pDistance >= early_out_err)
- return;
- }
- }
- };
- struct find_selectors_perceptual_rgb_4_N : spmd_kernel
- {
- inline vint compute_dist(
- const vint& base_r, const vint& base_g, const vint& base_b,
- const vint& r, const vint& g, const vint& b)
- {
- vint dr = base_r - r;
- vint dg = base_g - g;
- vint db = base_b - b;
- vint delta_l = dr * 27 + dg * 92 + db * 9;
- vint delta_cr = dr * 128 - delta_l;
- vint delta_cb = db * 128 - delta_l;
- vint id = VINT_SHIFT_RIGHT(delta_l * delta_l, 7) +
- VINT_SHIFT_RIGHT(VINT_SHIFT_RIGHT(delta_cr * delta_cr, 7) * 26, 7) +
- VINT_SHIFT_RIGHT(VINT_SHIFT_RIGHT(delta_cb * delta_cb, 7) * 3, 7);
- return id;
- }
- void _call(int64_t* pDistance,
- uint8_t* pSelectors,
- const color_rgba* pBlock_colors,
- const color_rgba* pSrc_pixels, uint32_t n,
- int64_t early_out_err)
- {
- assert(early_out_err >= 0);
- *pDistance = 0;
- vint block_colors_r[4], block_colors_g[4], block_colors_b[4];
- for (uint32_t i = 0; i < 4; i++)
- {
- store_all(block_colors_r[i], (int)pBlock_colors[i].r);
- store_all(block_colors_g[i], (int)pBlock_colors[i].g);
- store_all(block_colors_b[i], (int)pBlock_colors[i].b);
- }
- const __m128i shuf = _mm_set_epi8(-128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, 12, 8, 4, 0);
- uint32_t i;
- for (i = 0; (i + 4) <= n; i += 4)
- {
- __m128i c0 = load_rgba32(&pSrc_pixels[i + 0]), c1 = load_rgba32(&pSrc_pixels[i + 1]), c2 = load_rgba32(&pSrc_pixels[i + 2]), c3 = load_rgba32(&pSrc_pixels[i + 3]);
- vint r, g, b, a;
- transpose4x4(r.m_value, g.m_value, b.m_value, a.m_value, c0, c1, c2, c3);
- vint dist0 = compute_dist(block_colors_r[0], block_colors_g[0], block_colors_b[0], r, g, b);
- vint dist1 = compute_dist(block_colors_r[1], block_colors_g[1], block_colors_b[1], r, g, b);
- vint dist2 = compute_dist(block_colors_r[2], block_colors_g[2], block_colors_b[2], r, g, b);
- vint dist3 = compute_dist(block_colors_r[3], block_colors_g[3], block_colors_b[3], r, g, b);
- vint min_dist = min(min(min(dist0, dist1), dist2), dist3);
- vint sels = spmd_ternaryi(min_dist == dist0, 0, spmd_ternaryi(min_dist == dist1, 1, spmd_ternaryi(min_dist == dist2, 2, 3)));
- __m128i vsels = shuffle_epi8(sels.m_value, shuf);
- storeu_si32((void *)(pSelectors + i), vsels);
- *pDistance += reduce_add(min_dist);
- if (*pDistance >= early_out_err)
- return;
- }
- for (; i < n; i++)
- {
- int r = pSrc_pixels[i].r, g = pSrc_pixels[i].g, b = pSrc_pixels[i].b;
- int best_err = INT_MAX, best_sel = 0;
- for (int sel = 0; sel < 4; sel++)
- {
- int base_r = pBlock_colors[sel].r, base_g = pBlock_colors[sel].g, base_b = pBlock_colors[sel].b;
- int dr = base_r - r;
- int dg = base_g - g;
- int db = base_b - b;
- int delta_l = dr * 27 + dg * 92 + db * 9;
- int delta_cr = dr * 128 - delta_l;
- int delta_cb = db * 128 - delta_l;
- int id = ((delta_l * delta_l) >> 7) +
- ((((delta_cr * delta_cr) >> 7) * 26) >> 7) +
- ((((delta_cb * delta_cb) >> 7) * 3) >> 7);
- if (id < best_err)
- {
- best_err = id;
- best_sel = sel;
- }
- }
- pSelectors[i] = (uint8_t)best_sel;
- *pDistance += best_err;
- if (*pDistance >= early_out_err)
- return;
- }
- }
- };
- struct find_selectors_linear_rgb_4_N : spmd_kernel
- {
- inline vint compute_dist(
- const vint& base_r, const vint& base_g, const vint& base_b,
- const vint& r, const vint& g, const vint& b)
- {
- vint dr = base_r - r;
- vint dg = base_g - g;
- vint db = base_b - b;
- vint id = dr * dr + dg * dg + db * db;
- return id;
- }
- void _call(int64_t* pDistance,
- uint8_t* pSelectors,
- const color_rgba* pBlock_colors,
- const color_rgba* pSrc_pixels, uint32_t n,
- int64_t early_out_err)
- {
- assert(early_out_err >= 0);
- *pDistance = 0;
- vint block_colors_r[4], block_colors_g[4], block_colors_b[4];
- for (uint32_t i = 0; i < 4; i++)
- {
- store_all(block_colors_r[i], (int)pBlock_colors[i].r);
- store_all(block_colors_g[i], (int)pBlock_colors[i].g);
- store_all(block_colors_b[i], (int)pBlock_colors[i].b);
- }
- const __m128i shuf = _mm_set_epi8(-128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, 12, 8, 4, 0);
- uint32_t i;
- for (i = 0; (i + 4) <= n; i += 4)
- {
- __m128i c0 = load_rgba32(&pSrc_pixels[i + 0]), c1 = load_rgba32(&pSrc_pixels[i + 1]), c2 = load_rgba32(&pSrc_pixels[i + 2]), c3 = load_rgba32(&pSrc_pixels[i + 3]);
- vint r, g, b, a;
- transpose4x4(r.m_value, g.m_value, b.m_value, a.m_value, c0, c1, c2, c3);
- vint dist0 = compute_dist(block_colors_r[0], block_colors_g[0], block_colors_b[0], r, g, b);
- vint dist1 = compute_dist(block_colors_r[1], block_colors_g[1], block_colors_b[1], r, g, b);
- vint dist2 = compute_dist(block_colors_r[2], block_colors_g[2], block_colors_b[2], r, g, b);
- vint dist3 = compute_dist(block_colors_r[3], block_colors_g[3], block_colors_b[3], r, g, b);
- vint min_dist = min(min(min(dist0, dist1), dist2), dist3);
- vint sels = spmd_ternaryi(min_dist == dist0, 0, spmd_ternaryi(min_dist == dist1, 1, spmd_ternaryi(min_dist == dist2, 2, 3)));
- __m128i vsels = shuffle_epi8(sels.m_value, shuf);
- storeu_si32((void *)(pSelectors + i), vsels);
- *pDistance += reduce_add(min_dist);
- if (*pDistance >= early_out_err)
- return;
- }
- for (; i < n; i++)
- {
- int r = pSrc_pixels[i].r, g = pSrc_pixels[i].g, b = pSrc_pixels[i].b;
- int best_err = INT_MAX, best_sel = 0;
- for (int sel = 0; sel < 4; sel++)
- {
- int base_r = pBlock_colors[sel].r, base_g = pBlock_colors[sel].g, base_b = pBlock_colors[sel].b;
- int dr = base_r - r;
- int dg = base_g - g;
- int db = base_b - b;
- int id = dr * dr + dg * dg + db * db;
- if (id < best_err)
- {
- best_err = id;
- best_sel = sel;
- }
- }
- pSelectors[i] = (uint8_t)best_sel;
- *pDistance += best_err;
- if (*pDistance >= early_out_err)
- return;
- }
- }
- };
- struct find_lowest_error_perceptual_rgb_4_N : spmd_kernel
- {
- inline vint compute_dist(
- const vint& base_r, const vint& base_g, const vint& base_b,
- const vint& r, const vint& g, const vint& b)
- {
- vint dr = base_r - r;
- vint dg = base_g - g;
- vint db = base_b - b;
- vint delta_l = dr * 27 + dg * 92 + db * 9;
- vint delta_cr = dr * 128 - delta_l;
- vint delta_cb = db * 128 - delta_l;
- vint id = VINT_SHIFT_RIGHT(delta_l * delta_l, 7) +
- VINT_SHIFT_RIGHT(VINT_SHIFT_RIGHT(delta_cr * delta_cr, 7) * 26, 7) +
- VINT_SHIFT_RIGHT(VINT_SHIFT_RIGHT(delta_cb * delta_cb, 7) * 3, 7);
- return id;
- }
- void _call(int64_t* pDistance,
- const color_rgba* pBlock_colors,
- const color_rgba* pSrc_pixels, uint32_t n,
- int64_t early_out_error)
- {
- assert(early_out_error >= 0);
- *pDistance = 0;
- vint block_colors_r[4], block_colors_g[4], block_colors_b[4];
- for (uint32_t i = 0; i < 4; i++)
- {
- store_all(block_colors_r[i], (int)pBlock_colors[i].r);
- store_all(block_colors_g[i], (int)pBlock_colors[i].g);
- store_all(block_colors_b[i], (int)pBlock_colors[i].b);
- }
- uint32_t i;
- for (i = 0; (i + 4) <= n; i += 4)
- {
- __m128i c0 = load_rgba32(&pSrc_pixels[i + 0]), c1 = load_rgba32(&pSrc_pixels[i + 1]), c2 = load_rgba32(&pSrc_pixels[i + 2]), c3 = load_rgba32(&pSrc_pixels[i + 3]);
- vint r, g, b, a;
- transpose4x4(r.m_value, g.m_value, b.m_value, a.m_value, c0, c1, c2, c3);
- vint dist0 = compute_dist(block_colors_r[0], block_colors_g[0], block_colors_b[0], r, g, b);
- vint dist1 = compute_dist(block_colors_r[1], block_colors_g[1], block_colors_b[1], r, g, b);
- vint dist2 = compute_dist(block_colors_r[2], block_colors_g[2], block_colors_b[2], r, g, b);
- vint dist3 = compute_dist(block_colors_r[3], block_colors_g[3], block_colors_b[3], r, g, b);
- vint min_dist = min(min(min(dist0, dist1), dist2), dist3);
- *pDistance += reduce_add(min_dist);
- if (*pDistance > early_out_error)
- return;
- }
- for (; i < n; i++)
- {
- int r = pSrc_pixels[i].r, g = pSrc_pixels[i].g, b = pSrc_pixels[i].b;
- int best_err = INT_MAX;
- for (int sel = 0; sel < 4; sel++)
- {
- int base_r = pBlock_colors[sel].r, base_g = pBlock_colors[sel].g, base_b = pBlock_colors[sel].b;
- int dr = base_r - r;
- int dg = base_g - g;
- int db = base_b - b;
- int delta_l = dr * 27 + dg * 92 + db * 9;
- int delta_cr = dr * 128 - delta_l;
- int delta_cb = db * 128 - delta_l;
- int id = ((delta_l * delta_l) >> 7) +
- ((((delta_cr * delta_cr) >> 7) * 26) >> 7) +
- ((((delta_cb * delta_cb) >> 7) * 3) >> 7);
-
- if (id < best_err)
- {
- best_err = id;
- }
- }
- *pDistance += best_err;
- if (*pDistance > early_out_error)
- return;
- }
- }
- };
- struct find_lowest_error_linear_rgb_4_N : spmd_kernel
- {
- inline vint compute_dist(
- const vint& base_r, const vint& base_g, const vint& base_b,
- const vint& r, const vint& g, const vint& b)
- {
- vint dr = base_r - r;
- vint dg = base_g - g;
- vint db = base_b - b;
- vint id = dr * dr + dg * dg + db * db;
- return id;
- }
- void _call(int64_t* pDistance,
- const color_rgba* pBlock_colors,
- const color_rgba* pSrc_pixels, uint32_t n,
- int64_t early_out_error)
- {
- assert(early_out_error >= 0);
- *pDistance = 0;
- vint block_colors_r[4], block_colors_g[4], block_colors_b[4];
- for (uint32_t i = 0; i < 4; i++)
- {
- store_all(block_colors_r[i], (int)pBlock_colors[i].r);
- store_all(block_colors_g[i], (int)pBlock_colors[i].g);
- store_all(block_colors_b[i], (int)pBlock_colors[i].b);
- }
- uint32_t i;
- for (i = 0; (i + 4) <= n; i += 4)
- {
- __m128i c0 = load_rgba32(&pSrc_pixels[i + 0]), c1 = load_rgba32(&pSrc_pixels[i + 1]), c2 = load_rgba32(&pSrc_pixels[i + 2]), c3 = load_rgba32(&pSrc_pixels[i + 3]);
- vint r, g, b, a;
- transpose4x4(r.m_value, g.m_value, b.m_value, a.m_value, c0, c1, c2, c3);
- vint dist0 = compute_dist(block_colors_r[0], block_colors_g[0], block_colors_b[0], r, g, b);
- vint dist1 = compute_dist(block_colors_r[1], block_colors_g[1], block_colors_b[1], r, g, b);
- vint dist2 = compute_dist(block_colors_r[2], block_colors_g[2], block_colors_b[2], r, g, b);
- vint dist3 = compute_dist(block_colors_r[3], block_colors_g[3], block_colors_b[3], r, g, b);
- vint min_dist = min(min(min(dist0, dist1), dist2), dist3);
- *pDistance += reduce_add(min_dist);
- if (*pDistance > early_out_error)
- return;
- }
- for (; i < n; i++)
- {
- int r = pSrc_pixels[i].r, g = pSrc_pixels[i].g, b = pSrc_pixels[i].b;
- int best_err = INT_MAX;
- for (int sel = 0; sel < 4; sel++)
- {
- int base_r = pBlock_colors[sel].r, base_g = pBlock_colors[sel].g, base_b = pBlock_colors[sel].b;
- int dr = base_r - r;
- int dg = base_g - g;
- int db = base_b - b;
- int id = dr * dr + dg * dg + db * db;
- if (id < best_err)
- {
- best_err = id;
- }
- }
- *pDistance += best_err;
- if (*pDistance > early_out_error)
- return;
- }
- }
- };
- struct update_covar_matrix_16x16 : spmd_kernel
- {
- void _call(
- uint32_t num_vecs, const void* pWeighted_vecs_void, const void* pOrigin_void, const uint32_t* pVec_indices, void* pMatrix16x16_void)
- {
- const std::pair<vec16F, uint64_t>* pWeighted_vecs = static_cast< const std::pair<vec16F, uint64_t> *>(pWeighted_vecs_void);
-
- const float* pOrigin = static_cast<const float*>(pOrigin_void);
- vfloat org0 = loadu_linear_all(pOrigin), org1 = loadu_linear_all(pOrigin + 4), org2 = loadu_linear_all(pOrigin + 8), org3 = loadu_linear_all(pOrigin + 12);
-
- vfloat mat[16][4];
- vfloat vzero(zero_vfloat());
- for (uint32_t i = 0; i < 16; i++)
- {
- store_all(mat[i][0], vzero);
- store_all(mat[i][1], vzero);
- store_all(mat[i][2], vzero);
- store_all(mat[i][3], vzero);
- }
- for (uint32_t k = 0; k < num_vecs; k++)
- {
- const uint32_t vec_index = pVec_indices[k];
- const float* pW = pWeighted_vecs[vec_index].first.get_ptr();
- vfloat weight((float)pWeighted_vecs[vec_index].second);
- vfloat vec[4] = { loadu_linear_all(pW) - org0, loadu_linear_all(pW + 4) - org1, loadu_linear_all(pW + 8) - org2, loadu_linear_all(pW + 12) - org3 };
-
- vfloat wvec0 = vec[0] * weight, wvec1 = vec[1] * weight, wvec2 = vec[2] * weight, wvec3 = vec[3] * weight;
- for (uint32_t j = 0; j < 16; j++)
- {
- vfloat vx = ((const float*)vec)[j];
- store_all(mat[j][0], mat[j][0] + vx * wvec0);
- store_all(mat[j][1], mat[j][1] + vx * wvec1);
- store_all(mat[j][2], mat[j][2] + vx * wvec2);
- store_all(mat[j][3], mat[j][3] + vx * wvec3);
- } // j
- } // k
- float* pMatrix = static_cast<float*>(pMatrix16x16_void);
- float* pDst = pMatrix;
- for (uint32_t i = 0; i < 16; i++)
- {
- storeu_linear_all(pDst, mat[i][0]);
- storeu_linear_all(pDst + 4, mat[i][1]);
- storeu_linear_all(pDst + 8, mat[i][2]);
- storeu_linear_all(pDst + 12, mat[i][3]);
- pDst += 16;
- }
- }
- };
- } // namespace
- using namespace CPPSPMD_NAME(basisu_kernels_namespace);
- void CPPSPMD_NAME(perceptual_distance_rgb_4_N)(int64_t* pDistance, const uint8_t* pSelectors, const color_rgba* pBlock_colors, const color_rgba* pSrc_pixels, uint32_t n, int64_t early_out_err)
- {
- spmd_call< perceptual_distance_rgb_4_N >(pDistance, pSelectors, pBlock_colors, pSrc_pixels, n, early_out_err);
- }
- void CPPSPMD_NAME(linear_distance_rgb_4_N)(int64_t* pDistance, const uint8_t* pSelectors, const color_rgba* pBlock_colors, const color_rgba* pSrc_pixels, uint32_t n, int64_t early_out_err)
- {
- spmd_call< linear_distance_rgb_4_N >(pDistance, pSelectors, pBlock_colors, pSrc_pixels, n, early_out_err);
- }
- void CPPSPMD_NAME(find_selectors_perceptual_rgb_4_N)(int64_t *pDistance, uint8_t* pSelectors, const color_rgba* pBlock_colors, const color_rgba* pSrc_pixels, uint32_t n, int64_t early_out_err)
- {
- spmd_call< find_selectors_perceptual_rgb_4_N >(pDistance, pSelectors, pBlock_colors, pSrc_pixels, n, early_out_err);
- }
- void CPPSPMD_NAME(find_selectors_linear_rgb_4_N)(int64_t* pDistance, uint8_t* pSelectors, const color_rgba* pBlock_colors, const color_rgba* pSrc_pixels, uint32_t n, int64_t early_out_err)
- {
- spmd_call< find_selectors_linear_rgb_4_N >(pDistance, pSelectors, pBlock_colors, pSrc_pixels, n, early_out_err);
- }
- void CPPSPMD_NAME(find_lowest_error_perceptual_rgb_4_N)(int64_t* pDistance, const color_rgba* pBlock_colors, const color_rgba* pSrc_pixels, uint32_t n, int64_t early_out_error)
- {
- spmd_call< find_lowest_error_perceptual_rgb_4_N >(pDistance, pBlock_colors, pSrc_pixels, n, early_out_error);
- }
- void CPPSPMD_NAME(find_lowest_error_linear_rgb_4_N)(int64_t* pDistance, const color_rgba* pBlock_colors, const color_rgba* pSrc_pixels, uint32_t n, int64_t early_out_error)
- {
- spmd_call< find_lowest_error_linear_rgb_4_N >(pDistance, pBlock_colors, pSrc_pixels, n, early_out_error);
- }
- void CPPSPMD_NAME(update_covar_matrix_16x16)(uint32_t num_vecs, const void* pWeighted_vecs, const void* pOrigin, const uint32_t *pVec_indices, void* pMatrix16x16)
- {
- spmd_call < update_covar_matrix_16x16 >(num_vecs, pWeighted_vecs, pOrigin, pVec_indices, pMatrix16x16);
- }
|