astcenc_mathlib.h 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505
  1. // SPDX-License-Identifier: Apache-2.0
  2. // ----------------------------------------------------------------------------
  3. // Copyright 2011-2025 Arm Limited
  4. //
  5. // Licensed under the Apache License, Version 2.0 (the "License"); you may not
  6. // use this file except in compliance with the License. You may obtain a copy
  7. // of the License at:
  8. //
  9. // http://www.apache.org/licenses/LICENSE-2.0
  10. //
  11. // Unless required by applicable law or agreed to in writing, software
  12. // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
  13. // WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
  14. // License for the specific language governing permissions and limitations
  15. // under the License.
  16. // ----------------------------------------------------------------------------
  17. /*
  18. * This module implements a variety of mathematical data types and library
  19. * functions used by the codec.
  20. */
  21. #ifndef ASTC_MATHLIB_H_INCLUDED
  22. #define ASTC_MATHLIB_H_INCLUDED
#include <cassert>
#include <cmath>
#include <cstdint>
#include <cstring>
  26. #ifndef ASTCENC_POPCNT
  27. #if defined(__POPCNT__)
  28. #define ASTCENC_POPCNT 1
  29. #else
  30. #define ASTCENC_POPCNT 0
  31. #endif
  32. #endif
  33. #ifndef ASTCENC_F16C
  34. #if defined(__F16C__)
  35. #define ASTCENC_F16C 1
  36. #else
  37. #define ASTCENC_F16C 0
  38. #endif
  39. #endif
  40. #ifndef ASTCENC_SSE
  41. #if defined(__SSE4_2__)
  42. #define ASTCENC_SSE 42
  43. #elif defined(__SSE4_1__)
  44. #define ASTCENC_SSE 41
  45. #elif defined(__SSE2__) || (defined(_M_AMD64) && !defined(_M_ARM64EC))
  46. #define ASTCENC_SSE 20
  47. #else
  48. #define ASTCENC_SSE 0
  49. #endif
  50. #endif
  51. #ifndef ASTCENC_AVX
  52. #if defined(__AVX2__)
  53. #define ASTCENC_AVX 2
  54. #define ASTCENC_X86_GATHERS 1
  55. #elif defined(__AVX__)
  56. #define ASTCENC_AVX 1
  57. #define ASTCENC_X86_GATHERS 1
  58. #else
  59. #define ASTCENC_AVX 0
  60. #endif
  61. #endif
  62. #ifndef ASTCENC_NEON
  63. #if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
  64. #define ASTCENC_NEON 1
  65. #else
  66. #define ASTCENC_NEON 0
  67. #endif
  68. #endif
  69. #ifndef ASTCENC_SVE
  70. #if defined(__ARM_FEATURE_SVE)
  71. #if defined(__ARM_FEATURE_SVE_BITS) && __ARM_FEATURE_SVE_BITS == 256
  72. #define ASTCENC_SVE 8
  73. // Auto-detected SVE can only assume vector width of 4 is available, but
  74. // must also allow for hardware being longer and so all use of intrinsics
  75. // must explicitly use predicate masks to limit to 4-wide.
  76. #else
  77. #define ASTCENC_SVE 4
  78. #endif
  79. #else
  80. #define ASTCENC_SVE 0
  81. #endif
  82. #endif
  83. // Force vector-sized SIMD alignment
  84. #if ASTCENC_AVX || ASTCENC_SVE == 8
  85. #define ASTCENC_VECALIGN 32
  86. #elif ASTCENC_SSE || ASTCENC_NEON || ASTCENC_SVE == 4
  87. #define ASTCENC_VECALIGN 16
  88. // Use default alignment for non-SIMD builds
  89. #else
  90. #define ASTCENC_VECALIGN 0
  91. #endif
  92. // C++11 states that alignas(0) should be ignored but GCC doesn't do
  93. // this on some versions, so workaround and avoid emitting alignas(0)
  94. #if ASTCENC_VECALIGN > 0
  95. #define ASTCENC_ALIGNAS alignas(ASTCENC_VECALIGN)
  96. #else
  97. #define ASTCENC_ALIGNAS
  98. #endif
  99. #if ASTCENC_SSE != 0 || ASTCENC_AVX != 0 || ASTCENC_POPCNT != 0
  100. #include <immintrin.h>
  101. #endif
  102. /* ============================================================================
  103. Fast math library; note that many of the higher-order functions in this set
  104. use approximations which are less accurate, but faster, than <cmath> standard
  105. library equivalents.
  106. Note: Many of these are not necessarily faster than simple C versions when
  107. used on a single scalar value, but are included for testing purposes as most
  108. have an option based on SSE intrinsics and therefore provide an obvious route
  109. to future vectorization.
  110. ============================================================================ */
  111. // Union for manipulation of float bit patterns
  112. typedef union
  113. {
  114. uint32_t u;
  115. int32_t s;
  116. float f;
  117. } if32;
  118. // These are namespaced to avoid colliding with C standard library functions.
  119. namespace astc
  120. {
  121. static const float PI = 3.14159265358979323846f;
  122. static const float PI_OVER_TWO = 1.57079632679489661923f;
  123. /**
  124. * @brief SP float absolute value.
  125. *
  126. * @param v The value to make absolute.
  127. *
  128. * @return The absolute value.
  129. */
  130. static inline float fabs(float v)
  131. {
  132. return std::fabs(v);
  133. }
  134. /**
  135. * @brief Test if a float value is a nan.
  136. *
  137. * @param v The value test.
  138. *
  139. * @return Zero is not a NaN, non-zero otherwise.
  140. */
  141. static inline bool isnan(float v)
  142. {
  143. return v != v;
  144. }
  145. /**
  146. * @brief Return the minimum of two values.
  147. *
  148. * For floats, NaNs are turned into @c q.
  149. *
  150. * @param p The first value to compare.
  151. * @param q The second value to compare.
  152. *
  153. * @return The smallest value.
  154. */
  155. template<typename T>
  156. static inline T min(T p, T q)
  157. {
  158. return p < q ? p : q;
  159. }
  160. /**
  161. * @brief Return the minimum of three values.
  162. *
  163. * For floats, NaNs are turned into @c r.
  164. *
  165. * @param p The first value to compare.
  166. * @param q The second value to compare.
  167. * @param r The third value to compare.
  168. *
  169. * @return The smallest value.
  170. */
  171. template<typename T>
  172. static inline T min(T p, T q, T r)
  173. {
  174. return min(min(p, q), r);
  175. }
  176. /**
  177. * @brief Return the minimum of four values.
  178. *
  179. * For floats, NaNs are turned into @c s.
  180. *
  181. * @param p The first value to compare.
  182. * @param q The second value to compare.
  183. * @param r The third value to compare.
  184. * @param s The fourth value to compare.
  185. *
  186. * @return The smallest value.
  187. */
  188. template<typename T>
  189. static inline T min(T p, T q, T r, T s)
  190. {
  191. return min(min(p, q), min(r, s));
  192. }
  193. /**
  194. * @brief Return the maximum of two values.
  195. *
  196. * For floats, NaNs are turned into @c q.
  197. *
  198. * @param p The first value to compare.
  199. * @param q The second value to compare.
  200. *
  201. * @return The largest value.
  202. */
  203. template<typename T>
  204. static inline T max(T p, T q)
  205. {
  206. return p > q ? p : q;
  207. }
  208. /**
  209. * @brief Return the maximum of three values.
  210. *
  211. * For floats, NaNs are turned into @c r.
  212. *
  213. * @param p The first value to compare.
  214. * @param q The second value to compare.
  215. * @param r The third value to compare.
  216. *
  217. * @return The largest value.
  218. */
  219. template<typename T>
  220. static inline T max(T p, T q, T r)
  221. {
  222. return max(max(p, q), r);
  223. }
  224. /**
  225. * @brief Return the maximum of four values.
  226. *
  227. * For floats, NaNs are turned into @c s.
  228. *
  229. * @param p The first value to compare.
  230. * @param q The second value to compare.
  231. * @param r The third value to compare.
  232. * @param s The fourth value to compare.
  233. *
  234. * @return The largest value.
  235. */
  236. template<typename T>
  237. static inline T max(T p, T q, T r, T s)
  238. {
  239. return max(max(p, q), max(r, s));
  240. }
  241. /**
  242. * @brief Clamp a value value between @c mn and @c mx.
  243. *
  244. * For floats, NaNs are turned into @c mn.
  245. *
  246. * @param v The value to clamp.
  247. * @param mn The min value (inclusive).
  248. * @param mx The max value (inclusive).
  249. *
  250. * @return The clamped value.
  251. */
  252. template<typename T>
  253. inline T clamp(T v, T mn, T mx)
  254. {
  255. // Do not reorder; correct NaN handling relies on the fact that comparison
  256. // with NaN returns false and will fall-though to the "min" value.
  257. if (v > mx) return mx;
  258. if (v > mn) return v;
  259. return mn;
  260. }
  261. /**
  262. * @brief Clamp a float value between 0.0f and 1.0f.
  263. *
  264. * NaNs are turned into 0.0f.
  265. *
  266. * @param v The value to clamp.
  267. *
  268. * @return The clamped value.
  269. */
  270. static inline float clamp1f(float v)
  271. {
  272. return astc::clamp(v, 0.0f, 1.0f);
  273. }
  274. /**
  275. * @brief Clamp a float value between 0.0f and 255.0f.
  276. *
  277. * NaNs are turned into 0.0f.
  278. *
  279. * @param v The value to clamp.
  280. *
  281. * @return The clamped value.
  282. */
  283. static inline float clamp255f(float v)
  284. {
  285. return astc::clamp(v, 0.0f, 255.0f);
  286. }
  287. /**
  288. * @brief SP float round-down.
  289. *
  290. * @param v The value to round.
  291. *
  292. * @return The rounded value.
  293. */
  294. static inline float flt_rd(float v)
  295. {
  296. return std::floor(v);
  297. }
  298. /**
  299. * @brief SP float round-to-nearest and convert to integer.
  300. *
  301. * @param v The value to round.
  302. *
  303. * @return The rounded value.
  304. */
  305. static inline int flt2int_rtn(float v)
  306. {
  307. return static_cast<int>(v + 0.5f);
  308. }
  309. /**
  310. * @brief SP float round down and convert to integer.
  311. *
  312. * @param v The value to round.
  313. *
  314. * @return The rounded value.
  315. */
  316. static inline int flt2int_rd(float v)
  317. {
  318. return static_cast<int>(v);
  319. }
  320. /**
  321. * @brief SP float bit-interpreted as an integer.
  322. *
  323. * @param v The value to bitcast.
  324. *
  325. * @return The converted value.
  326. */
  327. static inline int float_as_int(float v)
  328. {
  329. union { int a; float b; } u;
  330. u.b = v;
  331. return u.a;
  332. }
  333. /**
  334. * @brief Integer bit-interpreted as an SP float.
  335. *
  336. * @param v The value to bitcast.
  337. *
  338. * @return The converted value.
  339. */
  340. static inline float int_as_float(int v)
  341. {
  342. union { int a; float b; } u;
  343. u.a = v;
  344. return u.b;
  345. }
  346. /**
  347. * @brief Fast approximation of 1.0 / sqrt(val).
  348. *
  349. * @param v The input value.
  350. *
  351. * @return The approximated result.
  352. */
  353. static inline float rsqrt(float v)
  354. {
  355. return 1.0f / std::sqrt(v);
  356. }
  357. /**
  358. * @brief Fast approximation of sqrt(val).
  359. *
  360. * @param v The input value.
  361. *
  362. * @return The approximated result.
  363. */
  364. static inline float sqrt(float v)
  365. {
  366. return std::sqrt(v);
  367. }
  368. /**
  369. * @brief Extract mantissa and exponent of a float value.
  370. *
  371. * @param v The input value.
  372. * @param[out] expo The output exponent.
  373. *
  374. * @return The mantissa.
  375. */
  376. static inline float frexp(float v, int* expo)
  377. {
  378. if32 p;
  379. p.f = v;
  380. *expo = ((p.u >> 23) & 0xFF) - 126;
  381. p.u = (p.u & 0x807fffff) | 0x3f000000;
  382. return p.f;
  383. }
  384. /**
  385. * @brief Initialize the seed structure for a random number generator.
  386. *
  387. * Important note: For the purposes of ASTC we want sets of random numbers to
  388. * use the codec, but we want the same seed value across instances and threads
  389. * to ensure that image output is stable across compressor runs and across
  390. * platforms. Every PRNG created by this call will therefore return the same
  391. * sequence of values ...
  392. *
  393. * @param state The state structure to initialize.
  394. */
  395. void rand_init(uint64_t state[2]);
  396. /**
  397. * @brief Return the next random number from the generator.
  398. *
  399. * This RNG is an implementation of the "xoroshoro-128+ 1.0" PRNG, based on the
  400. * public-domain implementation given by David Blackman & Sebastiano Vigna at
  401. * http://vigna.di.unimi.it/xorshift/xoroshiro128plus.c
  402. *
  403. * @param state The state structure to use/update.
  404. */
  405. uint64_t rand(uint64_t state[2]);
  406. }
  407. /* ============================================================================
  408. Softfloat library with fp32 and fp16 conversion functionality.
  409. ============================================================================ */
#if (ASTCENC_F16C == 0) && (ASTCENC_NEON == 0)
// Software fp32 <-> fp16 conversions. These are only declared when neither
// the F16C nor the NEON path is enabled.

/**
 * @brief Narrowing conversion from a float to a 16-bit value.
 *
 * NOTE(review): presumably the return value is the IEEE 754 binary16 bit
 * pattern; confirm against the implementation.
 *
 * @param val   The float value to convert.
 *
 * @return The 16-bit result.
 */
uint16_t float_to_sf16(float val);

/**
 * @brief Widening conversion from a 16-bit value to a float.
 *
 * NOTE(review): presumably the input is an IEEE 754 binary16 bit pattern;
 * confirm against the implementation.
 *
 * @param val   The 16-bit value to convert.
 *
 * @return The float result.
 */
float sf16_to_float(uint16_t val);
#endif
  415. /*********************************
  416. Vector library
  417. *********************************/
  418. #include "astcenc_vecmathlib.h"
  419. /*********************************
  420. Declaration of line types
  421. *********************************/
/**
 * @brief Parametric line, 2D: the line is given by line = a + b * t.
 */
struct line2
{
	/** @brief The @c a term of a + b * t, i.e. the point on the line at t = 0. */
	vfloat4 a;

	/** @brief The @c b term of a + b * t, i.e. the change in position per unit t. */
	vfloat4 b;
};
/**
 * @brief Parametric line, 3D.
 *
 * NOTE(review): presumably uses the same line = a + b * t form as line2;
 * confirm against the users of this type.
 */
struct line3
{
	/** @brief The @c a term of the line. */
	vfloat4 a;

	/** @brief The @c b term of the line. */
	vfloat4 b;
};
/**
 * @brief Line type with the same two-term layout as line2 and line3.
 *
 * NOTE(review): presumably the 4D parametric line, line = a + b * t; confirm
 * against the users of this type.
 */
struct line4
{
	/** @brief The @c a term of the line. */
	vfloat4 a;

	/** @brief The @c b term of the line. */
	vfloat4 b;
};
/**
 * @brief Two-term companion type to line2.
 *
 * NOTE(review): the derivation of @c amod and @c bs is not shown in this
 * header; presumably they are precomputed from a line2 - confirm against the
 * code that populates this type.
 */
struct processed_line2
{
	/** @brief First term; TODO confirm meaning. */
	vfloat4 amod;

	/** @brief Second term; TODO confirm meaning. */
	vfloat4 bs;
};
/**
 * @brief Two-term companion type to line3.
 *
 * NOTE(review): the derivation of @c amod and @c bs is not shown in this
 * header; presumably they are precomputed from a line3 - confirm against the
 * code that populates this type.
 */
struct processed_line3
{
	/** @brief First term; TODO confirm meaning. */
	vfloat4 amod;

	/** @brief Second term; TODO confirm meaning. */
	vfloat4 bs;
};
/**
 * @brief Two-term companion type to line4.
 *
 * NOTE(review): the derivation of @c amod and @c bs is not shown in this
 * header; presumably they are precomputed from a line4 - confirm against the
 * code that populates this type.
 */
struct processed_line4
{
	/** @brief First term; TODO confirm meaning. */
	vfloat4 amod;

	/** @brief Second term; TODO confirm meaning. */
	vfloat4 bs;
};
  454. #endif