// SPDX-License-Identifier: Apache-2.0
// ----------------------------------------------------------------------------
// Copyright 2019-2025 Arm Limited
// Copyright 2008 Jose Fonseca
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
// use this file except in compliance with the License. You may obtain a copy
// of the License at:
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
// License for the specific language governing permissions and limitations
// under the License.
// ----------------------------------------------------------------------------

/*
 * This module implements vector support for floats, ints, and vector lane
 * control masks. It provides access to both explicit vector width types, and
 * flexible N-wide types where N can be determined at compile time.
 *
 * The design of this module encourages use of vector length agnostic code, via
 * the vint, vfloat, and vmask types. These will take on the widest SIMD vector
 * width that is available at compile time. The current vector width is
 * accessible for e.g. loop strides via the ASTCENC_SIMD_WIDTH constant.
 *
 * Explicit scalar types are accessible via the vint1, vfloat1, and vmask1
 * types. These are provided primarily for prototyping and algorithm debug of
 * VLA implementations.
 *
 * Explicit 4-wide types are accessible via the vint4, vfloat4, and vmask4
 * types. These are provided for use by VLA code, but are also expected to be
 * used as a fixed-width type, and are supported by a reference C++ fallback
 * for use on platforms without SIMD intrinsics.
 *
 * Explicit 8-wide types are accessible via the vint8, vfloat8, and vmask8
 * types. These are provided for use by VLA code, and are not expected to be
 * used as a fixed-width type in normal code. No reference C implementation is
 * provided on platforms without underlying SIMD intrinsics.
 *
 * With the current implementation ISA support is provided for:
 *
 *     * 1-wide for scalar reference
 *     * 4-wide for Armv8-A NEON
 *     * 4-wide for x86-64 SSE2
 *     * 4-wide for x86-64 SSE4.1
 *     * 8-wide for Armv8-A SVE
 *     * 8-wide for x86-64 AVX2
 */

#ifndef ASTC_VECMATHLIB_H_INCLUDED
#define ASTC_VECMATHLIB_H_INCLUDED

#if ASTCENC_SSE != 0 || ASTCENC_AVX != 0
    #include <immintrin.h>
#endif

#if ASTCENC_SVE != 0
    #include <arm_sve.h>
    #include <arm_neon_sve_bridge.h>
#endif

#if ASTCENC_NEON != 0
    #include <arm_neon.h>
#endif

#if !defined(__clang__) && defined(_MSC_VER)
    #define ASTCENC_SIMD_INLINE __forceinline
    #define ASTCENC_NO_INLINE
#elif defined(__GNUC__) && !defined(__clang__)
    #define ASTCENC_SIMD_INLINE __attribute__((always_inline)) inline
    #define ASTCENC_NO_INLINE __attribute__ ((noinline))
#else
    #define ASTCENC_SIMD_INLINE __attribute__((always_inline, nodebug)) inline
    #define ASTCENC_NO_INLINE __attribute__ ((noinline))
#endif

template<typename T> T gatherf_byte_inds(const float* base, const uint8_t* indices);

#if ASTCENC_AVX >= 2
    // If we have AVX2 expose 8-wide VLA.
    #include "astcenc_vecmathlib_sse_4.h"
    #include "astcenc_vecmathlib_common_4.h"
    #include "astcenc_vecmathlib_avx2_8.h"

    #define ASTCENC_SIMD_WIDTH 8

    using vfloat = vfloat8;

    #if defined(ASTCENC_NO_INVARIANCE)
        using vfloatacc = vfloat8;
    #else
        using vfloatacc = vfloat4;
    #endif

    using vint = vint8;
    using vmask = vmask8;

    using vtable_16x8 = vtable8_16x8;
    using vtable_32x8 = vtable8_32x8;
    using vtable_64x8 = vtable8_64x8;

    constexpr auto loada = vfloat8::loada;
    constexpr auto load1 = vfloat8::load1;
    constexpr auto vint_from_size = vint8_from_size;

#elif ASTCENC_SSE >= 20
    // If we have SSE expose 4-wide VLA, and 4-wide fixed width.
    #include "astcenc_vecmathlib_sse_4.h"
    #include "astcenc_vecmathlib_common_4.h"

    #define ASTCENC_SIMD_WIDTH 4

    using vfloat = vfloat4;
    using vfloatacc = vfloat4;
    using vint = vint4;
    using vmask = vmask4;

    using vtable_16x8 = vtable4_16x8;
    using vtable_32x8 = vtable4_32x8;
    using vtable_64x8 = vtable4_64x8;

    constexpr auto loada = vfloat4::loada;
    constexpr auto load1 = vfloat4::load1;
    constexpr auto vint_from_size = vint4_from_size;

#elif ASTCENC_SVE == 8
    // Check the compiler is configured with fixed-length 256-bit SVE.
    #if !defined(__ARM_FEATURE_SVE_BITS) || (__ARM_FEATURE_SVE_BITS != 256)
        #error "__ARM_FEATURE_SVE_BITS is not set to 256 bits"
    #endif

    // If we have SVE configured as 8-wide, expose 8-wide VLA.
    #include "astcenc_vecmathlib_neon_4.h"
    #include "astcenc_vecmathlib_common_4.h"
    #include "astcenc_vecmathlib_sve_8.h"

    #define ASTCENC_SIMD_WIDTH 8

    using vfloat = vfloat8;

    #if defined(ASTCENC_NO_INVARIANCE)
        using vfloatacc = vfloat8;
    #else
        using vfloatacc = vfloat4;
    #endif

    using vint = vint8;
    using vmask = vmask8;

    using vtable_16x8 = vtable8_16x8;
    using vtable_32x8 = vtable8_32x8;
    using vtable_64x8 = vtable8_64x8;

    constexpr auto loada = vfloat8::loada;
    constexpr auto load1 = vfloat8::load1;
    constexpr auto vint_from_size = vint8_from_size;

#elif ASTCENC_NEON > 0
    // If we have NEON expose 4-wide VLA.
    #include "astcenc_vecmathlib_neon_4.h"
    #include "astcenc_vecmathlib_common_4.h"

    #define ASTCENC_SIMD_WIDTH 4

    using vfloat = vfloat4;
    using vfloatacc = vfloat4;
    using vint = vint4;
    using vmask = vmask4;

    using vtable_16x8 = vtable4_16x8;
    using vtable_32x8 = vtable4_32x8;
    using vtable_64x8 = vtable4_64x8;

    constexpr auto loada = vfloat4::loada;
    constexpr auto load1 = vfloat4::load1;
    constexpr auto vint_from_size = vint4_from_size;

#else
    // If we have nothing expose 4-wide VLA, and 4-wide fixed width.

    // Note: We no longer expose the 1-wide scalar fallback because it is not
    // invariant with the 4-wide path due to algorithms that use horizontal
    // operations that accumulate a local vector sum before accumulating into
    // a running sum.
    //
    // For 4 items adding into an accumulator using 1-wide vectors the sum is:
    //
    //     result = ((((sum + l0) + l1) + l2) + l3)
    //
    // ... whereas the accumulator for a 4-wide vector sum is:
    //
    //     result = sum + ((l0 + l2) + (l1 + l3))
    //
    // In "normal maths" this is the same, but the floating point reassociation
    // differences mean that these will not produce the same result.
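    //
    // As a concrete illustration of this reassociation sensitivity (values
    // chosen here purely for demonstration, not taken from the codebase):
    // in 32-bit float arithmetic (1e8f + 4.0f) + 4.0f evaluates to 1e8f,
    // because each 4.0f addend is individually rounded away, whereas
    // 1e8f + (4.0f + 4.0f) evaluates to 100000008.0f exactly.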

    #include "astcenc_vecmathlib_none_4.h"
    #include "astcenc_vecmathlib_common_4.h"

    #define ASTCENC_SIMD_WIDTH 4

    using vfloat = vfloat4;
    using vfloatacc = vfloat4;
    using vint = vint4;
    using vmask = vmask4;

    using vtable_16x8 = vtable4_16x8;
    using vtable_32x8 = vtable4_32x8;
    using vtable_64x8 = vtable4_64x8;

    constexpr auto loada = vfloat4::loada;
    constexpr auto load1 = vfloat4::load1;
    constexpr auto vint_from_size = vint4_from_size;
#endif
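
// A minimal sketch of the vector length agnostic pattern these aliases
// enable (illustrative only; "data" and "count" are hypothetical names, and
// count is assumed to already be a multiple of ASTCENC_SIMD_WIDTH):
//
//     vfloat sum = vfloat::zero();
//     for (size_t i = 0; i < count; i += ASTCENC_SIMD_WIDTH)
//     {
//         sum = sum + loada(data + i);
//     }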

/**
 * @brief Round a count down to the largest SIMD width multiple not above it.
 *
 * Assumes that the vector width is a power of two.
 *
 * @param count The unrounded value.
 *
 * @return The rounded value.
 */
ASTCENC_SIMD_INLINE size_t round_down_to_simd_multiple_vla(size_t count)
{
    return count & static_cast<size_t>(~(ASTCENC_SIMD_WIDTH - 1));
}
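
// For example (illustrative): with ASTCENC_SIMD_WIDTH == 4 a count of 10
// gives 10 & ~3 == 8, so the last two elements need separate tail handling.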

/**
 * @brief Round a count up to the smallest SIMD width multiple not below it.
 *
 * Assumes that the vector width is a power of two.
 *
 * @param count The unrounded value.
 *
 * @return The rounded value.
 */
ASTCENC_SIMD_INLINE size_t round_up_to_simd_multiple_vla(size_t count)
{
    size_t multiples = (count + ASTCENC_SIMD_WIDTH - 1) / ASTCENC_SIMD_WIDTH;
    return multiples * ASTCENC_SIMD_WIDTH;
}
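
// For example (illustrative): with ASTCENC_SIMD_WIDTH == 4 a count of 10
// gives ((10 + 3) / 4) * 4 == 12, i.e. capacity for three full vectors.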

/**
 * @brief Return @c a with lanes negated if the @c b lane is negative.
 */
ASTCENC_SIMD_INLINE vfloat change_sign(vfloat a, vfloat b)
{
    vint ia = float_as_int(a);
    vint ib = float_as_int(b);
    vint sign_mask(static_cast<int>(0x80000000));
    vint r = ia ^ (ib & sign_mask);
    return int_as_float(r);
}
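
// For example (illustrative): change_sign(vfloat(2.0f), vfloat(-3.0f))
// yields -2.0f, while a non-negative @c b leaves @c a unchanged. Note that
// a negative zero in @c b also flips the sign, as only the sign bit is used.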

/**
 * @brief Return a fast, but approximate, vector atan(x).
 *
 * Max error of this implementation is 0.004883.
 */
ASTCENC_SIMD_INLINE vfloat atan(vfloat x)
{
    vmask c = abs(x) > vfloat(1.0f);

    // Reduce |x| > 1 via the identity atan(x) = sign(x) * pi/2 - atan(1/x)
    vfloat z = change_sign(vfloat(astc::PI_OVER_TWO), x);
    vfloat y = select(x, vfloat(1.0f) / x, c);
    y = y / (y * y * vfloat(0.28f) + vfloat(1.0f));
    return select(y, z - y, c);
}
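
// For example (illustrative): atan(vfloat(1.0f)) evaluates the rational core
// as 1.0f / 1.28f == 0.78125f, which is about 0.0042 below the true value
// atan(1) = pi/4 = 0.785398f, within the stated error bound.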

/**
 * @brief Return a fast, but approximate, vector atan2(y, x).
 */
ASTCENC_SIMD_INLINE vfloat atan2(vfloat y, vfloat x)
{
    vfloat z = atan(abs(y / x));
    vmask xmask = x < vfloat::zero();
    return change_sign(select(z, vfloat(astc::PI) - z, xmask), y);
}
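
// For example (illustrative): atan2(vfloat(-1.0f), vfloat(-1.0f)) computes
// atan(1) in the first quadrant, reflects it to pi - atan(1) because x is
// negative, then applies the sign of y, giving roughly -3 * pi / 4.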

/**
 * @brief Factory that returns a unit length 4 component vfloat4.
 */
static ASTCENC_SIMD_INLINE vfloat4 unit4()
{
    return vfloat4(0.5f);
}

/**
 * @brief Factory that returns a unit length 3 component vfloat4.
 */
static ASTCENC_SIMD_INLINE vfloat4 unit3()
{
    float val = 0.577350258827209473f;
    return vfloat4(val, val, val, 0.0f);
}

/**
 * @brief Factory that returns a unit length 2 component vfloat4.
 */
static ASTCENC_SIMD_INLINE vfloat4 unit2()
{
    float val = 0.707106769084930420f;
    return vfloat4(val, val, 0.0f, 0.0f);
}

/**
 * @brief Factory that returns a 3 component vfloat4.
 */
static ASTCENC_SIMD_INLINE vfloat4 vfloat3(float a, float b, float c)
{
    return vfloat4(a, b, c, 0.0f);
}

/**
 * @brief Factory that returns a 2 component vfloat4.
 */
static ASTCENC_SIMD_INLINE vfloat4 vfloat2(float a, float b)
{
    return vfloat4(a, b, 0.0f, 0.0f);
}

/**
 * @brief Normalize a non-zero length vector to unit length.
 */
static ASTCENC_SIMD_INLINE vfloat4 normalize(vfloat4 a)
{
    vfloat4 length = dot(a, a);
    return a / sqrt(length);
}

/**
 * @brief Normalize a vector, returning @c safe if the length is zero.
 */
static ASTCENC_SIMD_INLINE vfloat4 normalize_safe(vfloat4 a, vfloat4 safe)
{
    vfloat4 length = dot(a, a);
    if (length.lane<0>() != 0.0f)
    {
        return a / sqrt(length);
    }

    return safe;
}

#define POLY0(x, c0) (c0)
#define POLY1(x, c0, c1) ((POLY0(x, c1) * x) + c0)
#define POLY2(x, c0, c1, c2) ((POLY1(x, c1, c2) * x) + c0)
#define POLY3(x, c0, c1, c2, c3) ((POLY2(x, c1, c2, c3) * x) + c0)
#define POLY4(x, c0, c1, c2, c3, c4) ((POLY3(x, c1, c2, c3, c4) * x) + c0)
#define POLY5(x, c0, c1, c2, c3, c4, c5) ((POLY4(x, c1, c2, c3, c4, c5) * x) + c0)
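
// These macros expand to a Horner-form polynomial evaluation. For example,
// POLY2(x, c0, c1, c2) expands to (((c2 * x) + c1) * x) + c0, which is
// c0 + c1*x + c2*x^2 using one multiply and one add per polynomial degree.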

/**
 * @brief Compute an approximate exp2(x) for each lane in the vector.
 *
 * Based on 5th degree minimax polynomials, ported from this blog
 * https://jrfonseca.blogspot.com/2008/09/fast-sse2-pow-tables-or-polynomials.html
 */
static ASTCENC_SIMD_INLINE vfloat4 exp2(vfloat4 x)
{
    x = clamp(-126.99999f, 129.0f, x);

    vint4 ipart = float_to_int(x - 0.5f);
    vfloat4 fpart = x - int_to_float(ipart);

    // Integer contrib, using 1 << ipart
    vfloat4 iexp = int_as_float(lsl<23>(ipart + 127));

    // Fractional contrib, using polynomial fit of 2^x in range [-0.5, 0.5)
    vfloat4 fexp = POLY5(fpart,
                         9.9999994e-1f,
                         6.9315308e-1f,
                         2.4015361e-1f,
                         5.5826318e-2f,
                         8.9893397e-3f,
                         1.8775767e-3f);

    return iexp * fexp;
}
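
// For example (illustrative): for x == 3.5f the split gives ipart == 3 and
// fpart == 0.5f, so the result is 2^3 == 8.0f scaled by the polynomial's
// estimate of 2^0.5, approximately 1.4142f, giving roughly 11.31f.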

/**
 * @brief Compute an approximate log2(x) for each lane in the vector.
 *
 * Based on 5th degree minimax polynomials, ported from this blog
 * https://jrfonseca.blogspot.com/2008/09/fast-sse2-pow-tables-or-polynomials.html
 */
static ASTCENC_SIMD_INLINE vfloat4 log2(vfloat4 x)
{
    vint4 exp(0x7F800000);
    vint4 mant(0x007FFFFF);
    vint4 one(0x3F800000);

    vint4 i = float_as_int(x);

    vfloat4 e = int_to_float(lsr<23>(i & exp) - 127);
    vfloat4 m = int_as_float((i & mant) | one);

    // Polynomial fit of log2(x)/(x - 1), for x in range [1, 2)
    vfloat4 p = POLY4(m,
                      2.8882704548164776201f,
                      -2.52074962577807006663f,
                      1.48116647521213171641f,
                      -0.465725644288844778798f,
                      0.0596515482674574969533f);

    // Increases the polynomial degree, but ensures that log2(1) == 0
    p = p * (m - 1.0f);

    return p + e;
}
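
// For example (illustrative): for x == 8.0f the exponent term e is 3.0f and
// the mantissa m is exactly 1.0f, so the polynomial term vanishes via the
// (m - 1.0f) factor and the result is exactly 3.0f.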

/**
 * @brief Compute an approximate pow(x, y) for each lane in the vector.
 *
 * Power function based on the exp2(log2(x) * y) transform.
 */
static ASTCENC_SIMD_INLINE vfloat4 pow(vfloat4 x, vfloat4 y)
{
    vmask4 zero_mask = y == vfloat4(0.0f);
    vfloat4 estimate = exp2(log2(x) * y);

    // Guarantee that y == 0 returns exactly 1.0f
    return select(estimate, vfloat4(1.0f), zero_mask);
}
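
// For example (illustrative): pow(vfloat4(2.0f), vfloat4(10.0f)) computes
// exp2(log2(2.0f) * 10.0f), approximately 1024.0f, while any lane with
// y == 0.0f returns exactly 1.0f regardless of x, even for zero or NaN.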

/**
 * @brief Count the leading zeros for each lane in @c a.
 *
 * Valid for all data values of @c a; will return a per-lane value [0, 32].
 */
static ASTCENC_SIMD_INLINE vint4 clz(vint4 a)
{
    // This function is a horrible abuse of floating point exponents to convert
    // the original integer value into a 2^N encoding we can recover easily.

    // Convert to float without risk of rounding up by keeping only top 8 bits.
    // This trick is guaranteed to keep top 8 bits and clear the 9th.
    a = (~lsr<8>(a)) & a;
    a = float_as_int(int_to_float(a));

    // Extract and unbias exponent
    a = vint4(127 + 31) - lsr<23>(a);

    // Clamp result to a valid 32-bit range
    return clamp(0, 32, a);
}
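
// For example (illustrative): clz(vint4(1)) converts 1 to 1.0f, whose biased
// exponent field is 127, giving (127 + 31) - 127 == 31; an input of zero
// yields 158, which the clamp reduces to 32; any negative input yields 0.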

/**
 * @brief Return lanewise 2^a for each lane in @c a.
 *
 * Use of signed int means that this is only valid for values in range [0, 30].
 */
static ASTCENC_SIMD_INLINE vint4 two_to_the_n(vint4 a)
{
    // 2^30 is the largest signed number that can be represented
    assert(all(a < vint4(31)));

    // This function is a horrible abuse of floating point to use the exponent
    // and float conversion to generate a 2^N multiple.

    // Bias the exponent
    vint4 exp = a + 127;
    exp = lsl<23>(exp);

    // Reinterpret the bits as a float, and then convert to an int
    vfloat4 f = int_as_float(exp);
    return float_to_int(f);
}
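
// For example (illustrative): two_to_the_n(vint4(5)) builds the float bit
// pattern with biased exponent 132, which reinterprets as 32.0f, and the
// final conversion returns the integer 32.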

/**
 * @brief Convert unorm16 [0, 65535] to float16 in range [0, 1].
 */
static ASTCENC_SIMD_INLINE vint4 unorm16_to_sf16(vint4 p)
{
    vint4 fp16_one = vint4(0x3C00);
    vint4 fp16_small = lsl<8>(p);

    vmask4 is_one = p == vint4(0xFFFF);
    vmask4 is_small = p < vint4(4);

    // Manually inline clz() on Visual Studio to avoid release build codegen bug
    // see https://github.com/ARM-software/astc-encoder/issues/259
#if !defined(__clang__) && defined(_MSC_VER)
    vint4 a = (~lsr<8>(p)) & p;
    a = float_as_int(int_to_float(a));
    a = vint4(127 + 31) - lsr<23>(a);
    vint4 lz = clamp(0, 32, a) - 16;
#else
    vint4 lz = clz(p) - 16;
#endif

    p = p * two_to_the_n(lz + 1);
    p = p & vint4(0xFFFF);

    p = lsr<6>(p);

    p = p | lsl<10>(vint4(14) - lz);

    vint4 r = select(p, fp16_one, is_one);
    r = select(r, fp16_small, is_small);
    return r;
}
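
// For example (illustrative): an input of 0xFFFF takes the is_one path and
// returns 0x3C00, the float16 bit pattern for 1.0; inputs below 4 take the
// is_small path and map directly onto float16 denormal encodings.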

/**
 * @brief Convert 16-bit LNS to float16.
 */
static ASTCENC_SIMD_INLINE vint4 lns_to_sf16(vint4 p)
{
    vint4 mc = p & 0x7FF;
    vint4 ec = lsr<11>(p);

    vint4 mc_512 = mc * 3;
    vmask4 mask_512 = mc < vint4(512);

    vint4 mc_1536 = mc * 4 - 512;
    vmask4 mask_1536 = mc < vint4(1536);

    vint4 mc_else = mc * 5 - 2048;

    vint4 mt = mc_else;
    mt = select(mt, mc_1536, mask_1536);
    mt = select(mt, mc_512, mask_512);

    vint4 res = lsl<10>(ec) | lsr<3>(mt);
    return min(res, vint4(0x7BFF));
}
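
// The three mantissa cases above form a continuous piecewise-linear remap of
// the 11-bit LNS mantissa: mc * 3 on [0, 512), mc * 4 - 512 on [512, 1536),
// and mc * 5 - 2048 on [1536, 2048). The segments meet exactly at the
// breakpoints: 512 * 3 == 512 * 4 - 512, and 1536 * 4 - 512 == 1536 * 5 - 2048.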

/**
 * @brief Extract the mantissa and exponent of a float value.
 *
 * @param a The input value.
 * @param[out] exp The output exponent.
 *
 * @return The mantissa.
 */
static ASTCENC_SIMD_INLINE vfloat4 frexp(vfloat4 a, vint4& exp)
{
    // Interpret the bits as an integer
    vint4 ai = float_as_int(a);

    // Extract and unbias the exponent
    exp = (lsr<23>(ai) & 0xFF) - 126;

    // Extract and unbias the mantissa
    vint4 manti = (ai & static_cast<int>(0x807FFFFF)) | 0x3F000000;
    return int_as_float(manti);
}
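
// For example (illustrative): frexp(vfloat4(8.0f), exp) returns a mantissa
// of 0.5f and writes an exponent of 4, since 8.0 == 0.5 * 2^4, matching the
// std::frexp convention of a mantissa in [0.5, 1.0).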

/**
 * @brief Convert float to 16-bit LNS.
 */
static ASTCENC_SIMD_INLINE vfloat4 float_to_lns(vfloat4 a)
{
    vint4 exp;
    vfloat4 mant = frexp(a, exp);

    // Do these early before we start messing about ...
    vmask4 mask_underflow_nan = ~(a > vfloat4(1.0f / 67108864.0f));
    vmask4 mask_infinity = a >= vfloat4(65536.0f);

    // If input is smaller than 2^-14, multiply by 2^25 and don't bias.
    vmask4 exp_lt_m13 = exp < vint4(-13);

    vfloat4 a1a = a * 33554432.0f;
    vint4 expa = vint4::zero();

    vfloat4 a1b = (mant - 0.5f) * 4096;
    vint4 expb = exp + 14;

    a = select(a1b, a1a, exp_lt_m13);
    exp = select(expb, expa, exp_lt_m13);

    vmask4 a_lt_384 = a < vfloat4(384.0f);
    vmask4 a_lt_1408 = a <= vfloat4(1408.0f);

    vfloat4 a2a = a * (4.0f / 3.0f);
    vfloat4 a2b = a + 128.0f;
    vfloat4 a2c = (a + 512.0f) * (4.0f / 5.0f);

    a = a2c;
    a = select(a, a2b, a_lt_1408);
    a = select(a, a2a, a_lt_384);

    a = a + (int_to_float(exp) * 2048.0f) + 1.0f;

    a = select(a, vfloat4(65535.0f), mask_infinity);
    a = select(a, vfloat4::zero(), mask_underflow_nan);

    return a;
}
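
// Note that this is the encoding-direction counterpart of lns_to_sf16()
// above: the three-segment select chain applies the piecewise-linear mantissa
// remap in the opposite direction, and the final two selects pin the
// out-of-range inputs, so values at or above 65536.0f saturate to 65535.0f,
// while NaNs and values no larger than 2^-26 collapse to zero.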

namespace astc
{

static ASTCENC_SIMD_INLINE float pow(float x, float y)
{
    return pow(vfloat4(x), vfloat4(y)).lane<0>();
}

}

#endif // #ifndef ASTC_VECMATHLIB_H_INCLUDED