  1. // SPDX-License-Identifier: Apache-2.0
  2. // ----------------------------------------------------------------------------
  3. // Copyright 2020-2025 Arm Limited
  4. //
  5. // Licensed under the Apache License, Version 2.0 (the "License"); you may not
  6. // use this file except in compliance with the License. You may obtain a copy
  7. // of the License at:
  8. //
  9. // http://www.apache.org/licenses/LICENSE-2.0
  10. //
  11. // Unless required by applicable law or agreed to in writing, software
  12. // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
  13. // WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
  14. // License for the specific language governing permissions and limitations
  15. // under the License.
  16. // ----------------------------------------------------------------------------
  17. /**
  18. * @brief Generic 4x32-bit vector functions.
  19. *
  20. * This module implements generic 4-wide vector functions that are valid for
  21. * all instruction sets, typically implemented using lower level 4-wide
  22. * operations that are ISA-specific.
  23. */
  24. #ifndef ASTC_VECMATHLIB_COMMON_4_H_INCLUDED
  25. #define ASTC_VECMATHLIB_COMMON_4_H_INCLUDED
  26. #ifndef ASTCENC_SIMD_INLINE
  27. #error "Include astcenc_vecmathlib.h, do not include directly"
  28. #endif
#include <cassert>
#include <cstdio>
#include <cstring>
#include <limits>
  31. // ============================================================================
  32. // vint4 operators and functions
  33. // ============================================================================
  34. /**
  35. * @brief Overload: vector by scalar addition.
  36. */
  37. ASTCENC_SIMD_INLINE vint4 operator+(vint4 a, int b)
  38. {
  39. return a + vint4(b);
  40. }
  41. /**
  42. * @brief Overload: vector by vector incremental addition.
  43. */
  44. ASTCENC_SIMD_INLINE vint4& operator+=(vint4& a, const vint4& b)
  45. {
  46. a = a + b;
  47. return a;
  48. }
  49. /**
  50. * @brief Overload: vector by scalar subtraction.
  51. */
  52. ASTCENC_SIMD_INLINE vint4 operator-(vint4 a, int b)
  53. {
  54. return a - vint4(b);
  55. }
  56. /**
  57. * @brief Overload: vector by scalar multiplication.
  58. */
  59. ASTCENC_SIMD_INLINE vint4 operator*(vint4 a, int b)
  60. {
  61. return a * vint4(b);
  62. }
  63. /**
  64. * @brief Overload: vector by scalar bitwise or.
  65. */
  66. ASTCENC_SIMD_INLINE vint4 operator|(vint4 a, int b)
  67. {
  68. return a | vint4(b);
  69. }
  70. /**
  71. * @brief Overload: vector by scalar bitwise and.
  72. */
  73. ASTCENC_SIMD_INLINE vint4 operator&(vint4 a, int b)
  74. {
  75. return a & vint4(b);
  76. }
  77. /**
  78. * @brief Overload: vector by scalar bitwise xor.
  79. */
  80. ASTCENC_SIMD_INLINE vint4 operator^(vint4 a, int b)
  81. {
  82. return a ^ vint4(b);
  83. }
  84. /**
  85. * @brief Return the clamped value between min and max.
  86. */
  87. ASTCENC_SIMD_INLINE vint4 clamp(int minv, int maxv, vint4 a)
  88. {
  89. return min(max(a, vint4(minv)), vint4(maxv));
  90. }
  91. /**
  92. * @brief Return the horizontal sum of RGB vector lanes as a scalar.
  93. */
  94. ASTCENC_SIMD_INLINE int hadd_rgb_s(vint4 a)
  95. {
  96. return a.lane<0>() + a.lane<1>() + a.lane<2>();
  97. }
  98. /**
  99. * @brief Return the horizontal minimum of a vector.
  100. */
  101. ASTCENC_SIMD_INLINE int hmin_s(vint4 a)
  102. {
  103. return hmin(a).lane<0>();
  104. }
  105. /**
  106. * @brief Generate a vint4 from a size_t.
  107. */
  108. ASTCENC_SIMD_INLINE vint4 vint4_from_size(size_t a)
  109. {
  110. assert(a <= std::numeric_limits<int>::max());
  111. return vint4(static_cast<int>(a));
  112. }
  113. /**
  114. * @brief Return the horizontal maximum of a vector.
  115. */
  116. ASTCENC_SIMD_INLINE int hmax_s(vint4 a)
  117. {
  118. return hmax(a).lane<0>();
  119. }
  120. // ============================================================================
  121. // vfloat4 operators and functions
  122. // ============================================================================
  123. /**
  124. * @brief Overload: vector by vector incremental addition.
  125. */
  126. ASTCENC_SIMD_INLINE vfloat4& operator+=(vfloat4& a, const vfloat4& b)
  127. {
  128. a = a + b;
  129. return a;
  130. }
  131. /**
  132. * @brief Overload: vector by scalar addition.
  133. */
  134. ASTCENC_SIMD_INLINE vfloat4 operator+(vfloat4 a, float b)
  135. {
  136. return a + vfloat4(b);
  137. }
  138. /**
  139. * @brief Overload: vector by scalar subtraction.
  140. */
  141. ASTCENC_SIMD_INLINE vfloat4 operator-(vfloat4 a, float b)
  142. {
  143. return a - vfloat4(b);
  144. }
  145. /**
  146. * @brief Overload: vector by scalar multiplication.
  147. */
  148. ASTCENC_SIMD_INLINE vfloat4 operator*(vfloat4 a, float b)
  149. {
  150. return a * vfloat4(b);
  151. }
  152. /**
  153. * @brief Overload: scalar by vector multiplication.
  154. */
  155. ASTCENC_SIMD_INLINE vfloat4 operator*(float a, vfloat4 b)
  156. {
  157. return vfloat4(a) * b;
  158. }
  159. /**
  160. * @brief Overload: vector by scalar division.
  161. */
  162. ASTCENC_SIMD_INLINE vfloat4 operator/(vfloat4 a, float b)
  163. {
  164. return a / vfloat4(b);
  165. }
  166. /**
  167. * @brief Overload: scalar by vector division.
  168. */
  169. ASTCENC_SIMD_INLINE vfloat4 operator/(float a, vfloat4 b)
  170. {
  171. return vfloat4(a) / b;
  172. }
  173. /**
  174. * @brief Return the min vector of a vector and a scalar.
  175. *
  176. * If either lane value is NaN, @c b will be returned for that lane.
  177. */
  178. ASTCENC_SIMD_INLINE vfloat4 min(vfloat4 a, float b)
  179. {
  180. return min(a, vfloat4(b));
  181. }
  182. /**
  183. * @brief Return the max vector of a vector and a scalar.
  184. *
  185. * If either lane value is NaN, @c b will be returned for that lane.
  186. */
  187. ASTCENC_SIMD_INLINE vfloat4 max(vfloat4 a, float b)
  188. {
  189. return max(a, vfloat4(b));
  190. }
  191. /**
  192. * @brief Return the clamped value between min and max.
  193. *
  194. * It is assumed that neither @c min nor @c max are NaN values. If @c a is NaN
  195. * then @c min will be returned for that lane.
  196. */
  197. ASTCENC_SIMD_INLINE vfloat4 clamp(float minv, float maxv, vfloat4 a)
  198. {
  199. // Do not reorder - second operand will return if either is NaN
  200. return min(max(a, minv), maxv);
  201. }
  202. /**
  203. * @brief Return the clamped value between 0.0f and 1.0f.
  204. *
  205. * If @c a is NaN then zero will be returned for that lane.
  206. */
  207. ASTCENC_SIMD_INLINE vfloat4 clampzo(vfloat4 a)
  208. {
  209. // Do not reorder - second operand will return if either is NaN
  210. return min(max(a, vfloat4::zero()), 1.0f);
  211. }
  212. /**
  213. * @brief Return the horizontal minimum of a vector.
  214. */
  215. ASTCENC_SIMD_INLINE float hmin_s(vfloat4 a)
  216. {
  217. return hmin(a).lane<0>();
  218. }
  219. /**
  220. * @brief Return the horizontal min of RGB vector lanes as a scalar.
  221. */
  222. ASTCENC_SIMD_INLINE float hmin_rgb_s(vfloat4 a)
  223. {
  224. a.set_lane<3>(a.lane<0>());
  225. return hmin_s(a);
  226. }
  227. /**
  228. * @brief Return the horizontal maximum of a vector.
  229. */
  230. ASTCENC_SIMD_INLINE float hmax_s(vfloat4 a)
  231. {
  232. return hmax(a).lane<0>();
  233. }
  234. /**
  235. * @brief Accumulate lane-wise sums for a vector.
  236. */
  237. ASTCENC_SIMD_INLINE void haccumulate(vfloat4& accum, vfloat4 a)
  238. {
  239. accum = accum + a;
  240. }
  241. /**
  242. * @brief Accumulate lane-wise sums for a masked vector.
  243. */
  244. ASTCENC_SIMD_INLINE void haccumulate(vfloat4& accum, vfloat4 a, vmask4 m)
  245. {
  246. a = select(vfloat4::zero(), a, m);
  247. haccumulate(accum, a);
  248. }
  249. /**
  250. * @brief Return the horizontal sum of RGB vector lanes as a scalar.
  251. */
  252. ASTCENC_SIMD_INLINE float hadd_rgb_s(vfloat4 a)
  253. {
  254. return a.lane<0>() + a.lane<1>() + a.lane<2>();
  255. }
  256. #if !defined(ASTCENC_USE_NATIVE_DOT_PRODUCT)
  257. /**
  258. * @brief Return the dot product for the full 4 lanes, returning scalar.
  259. */
  260. ASTCENC_SIMD_INLINE float dot_s(vfloat4 a, vfloat4 b)
  261. {
  262. vfloat4 m = a * b;
  263. return hadd_s(m);
  264. }
  265. /**
  266. * @brief Return the dot product for the full 4 lanes, returning vector.
  267. */
  268. ASTCENC_SIMD_INLINE vfloat4 dot(vfloat4 a, vfloat4 b)
  269. {
  270. vfloat4 m = a * b;
  271. return vfloat4(hadd_s(m));
  272. }
  273. /**
  274. * @brief Return the dot product for the bottom 3 lanes, returning scalar.
  275. */
  276. ASTCENC_SIMD_INLINE float dot3_s(vfloat4 a, vfloat4 b)
  277. {
  278. vfloat4 m = a * b;
  279. return hadd_rgb_s(m);
  280. }
  281. /**
  282. * @brief Return the dot product for the bottom 3 lanes, returning vector.
  283. */
  284. ASTCENC_SIMD_INLINE vfloat4 dot3(vfloat4 a, vfloat4 b)
  285. {
  286. vfloat4 m = a * b;
  287. float d3 = hadd_rgb_s(m);
  288. return vfloat4(d3, d3, d3, 0.0f);
  289. }
  290. #endif
  291. #if !defined(ASTCENC_USE_NATIVE_POPCOUNT)
  292. /**
  293. * @brief Population bit count.
  294. *
  295. * @param v The value to population count.
  296. *
  297. * @return The number of 1 bits.
  298. */
  299. static inline int popcount(uint64_t v)
  300. {
  301. uint64_t mask1 = 0x5555555555555555ULL;
  302. uint64_t mask2 = 0x3333333333333333ULL;
  303. uint64_t mask3 = 0x0F0F0F0F0F0F0F0FULL;
  304. v -= (v >> 1) & mask1;
  305. v = (v & mask2) + ((v >> 2) & mask2);
  306. v += v >> 4;
  307. v &= mask3;
  308. v *= 0x0101010101010101ULL;
  309. v >>= 56;
  310. return static_cast<int>(v);
  311. }
  312. #endif
/**
 * @brief Apply signed bit transfer.
 *
 * Shifts both inputs right by one bit, moving bit 7 of @c input0 into the
 * top bit position of @c input1, then sign-extends the remaining 6-bit
 * @c input0 payload into the range [-32, 31].
 *
 * @param input0 The first encoded endpoint.
 * @param input1 The second encoded endpoint.
 */
static ASTCENC_SIMD_INLINE void bit_transfer_signed(
	vint4& input0,
	vint4& input1
) {
	// Transfer bit 7 of input0 into the down-shifted input1; must happen
	// before input0 is overwritten below
	input1 = lsr<1>(input1) | (input0 & 0x80);
	// Keep only the 6 payload bits of the down-shifted input0
	input0 = lsr<1>(input0) & 0x3F;
	// Sign-extend: lanes with bit 5 set represent negative values
	vmask4 mask = (input0 & 0x20) != vint4::zero();
	input0 = select(input0, input0 - 0x40, mask);
}
  328. /**
  329. * @brief Debug function to print a vector of ints.
  330. */
  331. ASTCENC_SIMD_INLINE void print(vint4 a)
  332. {
  333. ASTCENC_ALIGNAS int v[4];
  334. storea(a, v);
  335. printf("v4_i32:\n %8d %8d %8d %8d\n",
  336. v[0], v[1], v[2], v[3]);
  337. }
  338. /**
  339. * @brief Debug function to print a vector of ints.
  340. */
  341. ASTCENC_SIMD_INLINE void printx(vint4 a)
  342. {
  343. ASTCENC_ALIGNAS int v[4];
  344. storea(a, v);
  345. unsigned int uv[4];
  346. std::memcpy(uv, v, sizeof(int) * 4);
  347. printf("v4_i32:\n %08x %08x %08x %08x\n",
  348. uv[0], uv[1], uv[2], uv[3]);
  349. }
  350. /**
  351. * @brief Debug function to print a vector of floats.
  352. */
  353. ASTCENC_SIMD_INLINE void print(vfloat4 a)
  354. {
  355. ASTCENC_ALIGNAS float v[4];
  356. storea(a, v);
  357. printf("v4_f32:\n %0.4f %0.4f %0.4f %0.4f\n",
  358. static_cast<double>(v[0]), static_cast<double>(v[1]),
  359. static_cast<double>(v[2]), static_cast<double>(v[3]));
  360. }
  361. /**
  362. * @brief Debug function to print a vector of masks.
  363. */
  364. ASTCENC_SIMD_INLINE void print(vmask4 a)
  365. {
  366. print(select(vint4(0), vint4(1), a));
  367. }
  368. #endif // #ifndef ASTC_VECMATHLIB_COMMON_4_H_INCLUDED