// SPDX-License-Identifier: Apache-2.0
// ----------------------------------------------------------------------------
// Copyright 2019-2025 Arm Limited
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
// use this file except in compliance with the License. You may obtain a copy
// of the License at:
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
// License for the specific language governing permissions and limitations
// under the License.
// ----------------------------------------------------------------------------

/**
 * @brief 8x32-bit vectors, implemented using AVX2.
 *
 * This module implements 8-wide 32-bit float, int, and mask vectors for x86
 * AVX2.
 *
 * There is a baseline level of functionality provided by all vector widths and
 * implementations. This is implemented using identical function signatures,
 * modulo data type, so we can use them as substitutable implementations in VLA
 * code.
 */

#ifndef ASTC_VECMATHLIB_AVX2_8_H_INCLUDED
#define ASTC_VECMATHLIB_AVX2_8_H_INCLUDED

#ifndef ASTCENC_SIMD_INLINE
    #error "Include astcenc_vecmathlib.h, do not include directly"
#endif

#include <cstdio>

// Define convenience intrinsics that are missing on older compilers
#define astcenc_mm256_set_m128i(m, n) _mm256_insertf128_si256(_mm256_castsi128_si256((n)), (m), 1)

// ============================================================================
// vfloat8 data type
// ============================================================================

/**
 * @brief Data type for 8-wide floats.
 */
struct vfloat8
{
    /**
     * @brief Construct from zero-initialized value.
     */
    ASTCENC_SIMD_INLINE vfloat8() = default;

    /**
     * @brief Construct from 8 values loaded from an unaligned address.
     *
     * Consider using loada() which is better with vectors if data is aligned
     * to vector length.
     */
    ASTCENC_SIMD_INLINE explicit vfloat8(const float *p)
    {
        m = _mm256_loadu_ps(p);
    }

    /**
     * @brief Construct from 1 scalar value replicated across all lanes.
     *
     * Consider using zero() for constexpr zeros.
     */
    ASTCENC_SIMD_INLINE explicit vfloat8(float a)
    {
        m = _mm256_set1_ps(a);
    }

    /**
     * @brief Construct from an existing SIMD register.
     */
    ASTCENC_SIMD_INLINE explicit vfloat8(__m256 a)
    {
        m = a;
    }

    /**
     * @brief Factory that returns a vector of zeros.
     */
    static ASTCENC_SIMD_INLINE vfloat8 zero()
    {
        return vfloat8(_mm256_setzero_ps());
    }

    /**
     * @brief Factory that returns a replicated scalar loaded from memory.
     */
    static ASTCENC_SIMD_INLINE vfloat8 load1(const float* p)
    {
        return vfloat8(_mm256_broadcast_ss(p));
    }

    /**
     * @brief Factory that returns a vector loaded from 32B aligned memory.
     */
    static ASTCENC_SIMD_INLINE vfloat8 loada(const float* p)
    {
        return vfloat8(_mm256_load_ps(p));
    }

    /**
     * @brief The vector value.
     */
    __m256 m;
};
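
// Usage sketch (editorial illustration, not part of the astcenc API; the
// example_* name and parameter names are hypothetical). It shows the three
// load paths for building a vfloat8; loada() assumes the caller guarantees
// 32B alignment of the pointer it is given.
ASTCENC_SIMD_INLINE vfloat8 example_vfloat8_loads(const float* p_unaligned, const float* p_aligned32)
{
    vfloat8 from_unaligned(p_unaligned);                 // 8 floats from any address
    vfloat8 from_aligned = vfloat8::loada(p_aligned32);  // 8 floats from a 32B aligned address
    vfloat8 broadcast = vfloat8::load1(p_unaligned);     // one float replicated to all lanes

    // Only one value is returned; the others just illustrate the factories
    (void)from_aligned;
    (void)broadcast;
    return from_unaligned;
}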
// ============================================================================
// vint8 data type
// ============================================================================

/**
 * @brief Data type for 8-wide ints.
 */
struct vint8
{
    /**
     * @brief Construct from zero-initialized value.
     */
    ASTCENC_SIMD_INLINE vint8() = default;

    /**
     * @brief Construct from 8 values loaded from an unaligned address.
     *
     * Consider using loada() which is better with vectors if data is aligned
     * to vector length.
     */
    ASTCENC_SIMD_INLINE explicit vint8(const int *p)
    {
        m = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(p));
    }

    /**
     * @brief Construct from 8 uint8_t loaded from an unaligned address.
     */
    ASTCENC_SIMD_INLINE explicit vint8(const uint8_t *p)
    {
        // _mm_loadu_si64 would be nicer syntax, but missing on older GCC
        m = _mm256_cvtepu8_epi32(_mm_cvtsi64_si128(*reinterpret_cast<const long long*>(p)));
    }

    /**
     * @brief Construct from 1 scalar value replicated across all lanes.
     *
     * Consider using zero() for constexpr zeros.
     */
    ASTCENC_SIMD_INLINE explicit vint8(int a)
    {
        m = _mm256_set1_epi32(a);
    }

    /**
     * @brief Construct from an existing SIMD register.
     */
    ASTCENC_SIMD_INLINE explicit vint8(__m256i a)
    {
        m = a;
    }

    /**
     * @brief Factory that returns a vector of zeros.
     */
    static ASTCENC_SIMD_INLINE vint8 zero()
    {
        return vint8(_mm256_setzero_si256());
    }

    /**
     * @brief Factory that returns a replicated scalar loaded from memory.
     */
    static ASTCENC_SIMD_INLINE vint8 load1(const int* p)
    {
        __m128i a = _mm_set1_epi32(*p);
        return vint8(_mm256_broadcastd_epi32(a));
    }

    /**
     * @brief Factory that returns a vector loaded from unaligned memory.
     */
    static ASTCENC_SIMD_INLINE vint8 load(const uint8_t* p)
    {
        return vint8(_mm256_lddqu_si256(reinterpret_cast<const __m256i*>(p)));
    }

    /**
     * @brief Factory that returns a vector loaded from 32B aligned memory.
     */
    static ASTCENC_SIMD_INLINE vint8 loada(const int* p)
    {
        return vint8(_mm256_load_si256(reinterpret_cast<const __m256i*>(p)));
    }

    /**
     * @brief Factory that returns a vector containing the lane IDs.
     */
    static ASTCENC_SIMD_INLINE vint8 lane_id()
    {
        return vint8(_mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0));
    }

    /**
     * @brief The vector value.
     */
    __m256i m;
};
// ============================================================================
// vmask8 data type
// ============================================================================

/**
 * @brief Data type for 8-wide control plane masks.
 */
struct vmask8
{
    /**
     * @brief Construct from an existing SIMD register.
     */
    ASTCENC_SIMD_INLINE explicit vmask8(__m256 a)
    {
        m = a;
    }

    /**
     * @brief Construct from an existing SIMD register.
     */
    ASTCENC_SIMD_INLINE explicit vmask8(__m256i a)
    {
        m = _mm256_castsi256_ps(a);
    }

    /**
     * @brief Construct from 1 scalar value.
     */
    ASTCENC_SIMD_INLINE explicit vmask8(bool a)
    {
        vint8 mask(a == false ? 0 : -1);
        m = _mm256_castsi256_ps(mask.m);
    }

    /**
     * @brief The vector value.
     */
    __m256 m;
};

// ============================================================================
// vmask8 operators and functions
// ============================================================================

/**
 * @brief Overload: mask union (or).
 */
ASTCENC_SIMD_INLINE vmask8 operator|(vmask8 a, vmask8 b)
{
    return vmask8(_mm256_or_ps(a.m, b.m));
}

/**
 * @brief Overload: mask intersect (and).
 */
ASTCENC_SIMD_INLINE vmask8 operator&(vmask8 a, vmask8 b)
{
    return vmask8(_mm256_and_ps(a.m, b.m));
}

/**
 * @brief Overload: mask difference (xor).
 */
ASTCENC_SIMD_INLINE vmask8 operator^(vmask8 a, vmask8 b)
{
    return vmask8(_mm256_xor_ps(a.m, b.m));
}

/**
 * @brief Overload: mask invert (not).
 */
ASTCENC_SIMD_INLINE vmask8 operator~(vmask8 a)
{
    return vmask8(_mm256_xor_si256(_mm256_castps_si256(a.m), _mm256_set1_epi32(-1)));
}

/**
 * @brief Return an 8-bit mask code indicating mask status.
 *
 * bit0 = lane 0
 */
ASTCENC_SIMD_INLINE unsigned int mask(vmask8 a)
{
    return static_cast<unsigned int>(_mm256_movemask_ps(a.m));
}

/**
 * @brief True if any lanes are enabled, false otherwise.
 */
ASTCENC_SIMD_INLINE bool any(vmask8 a)
{
    return mask(a) != 0;
}

/**
 * @brief True if all lanes are enabled, false otherwise.
 */
ASTCENC_SIMD_INLINE bool all(vmask8 a)
{
    return mask(a) == 0xFF;
}
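
// Usage sketch (editorial illustration, not part of the astcenc API; the
// example_* name is hypothetical). mask() packs one bit per lane with
// bit0 = lane 0, so an all-true vmask8 yields 0xFF; any() and all() are just
// shorthand tests on that bit code.
ASTCENC_SIMD_INLINE bool example_vmask8_bits()
{
    vmask8 all_on(true);
    vmask8 all_off(false);

    // The union of the two masks is still all-true: 8 set bits = 0xFF
    bool check1 = mask(all_on | all_off) == 0xFFu;

    // The intersection is all-false, so no lanes are enabled
    bool check2 = !any(all_on & all_off) && all(all_on);

    return check1 && check2;
}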
// ============================================================================
// vint8 operators and functions
// ============================================================================

/**
 * @brief Overload: vector by vector addition.
 */
ASTCENC_SIMD_INLINE vint8 operator+(vint8 a, vint8 b)
{
    return vint8(_mm256_add_epi32(a.m, b.m));
}

/**
 * @brief Overload: vector by vector incremental addition.
 */
ASTCENC_SIMD_INLINE vint8& operator+=(vint8& a, const vint8& b)
{
    a = a + b;
    return a;
}

/**
 * @brief Overload: vector by vector subtraction.
 */
ASTCENC_SIMD_INLINE vint8 operator-(vint8 a, vint8 b)
{
    return vint8(_mm256_sub_epi32(a.m, b.m));
}

/**
 * @brief Overload: vector by vector multiplication.
 */
ASTCENC_SIMD_INLINE vint8 operator*(vint8 a, vint8 b)
{
    return vint8(_mm256_mullo_epi32(a.m, b.m));
}

/**
 * @brief Overload: vector bit invert.
 */
ASTCENC_SIMD_INLINE vint8 operator~(vint8 a)
{
    return vint8(_mm256_xor_si256(a.m, _mm256_set1_epi32(-1)));
}

/**
 * @brief Overload: vector by vector bitwise or.
 */
ASTCENC_SIMD_INLINE vint8 operator|(vint8 a, vint8 b)
{
    return vint8(_mm256_or_si256(a.m, b.m));
}

/**
 * @brief Overload: vector by vector bitwise and.
 */
ASTCENC_SIMD_INLINE vint8 operator&(vint8 a, vint8 b)
{
    return vint8(_mm256_and_si256(a.m, b.m));
}

/**
 * @brief Overload: vector by vector bitwise xor.
 */
ASTCENC_SIMD_INLINE vint8 operator^(vint8 a, vint8 b)
{
    return vint8(_mm256_xor_si256(a.m, b.m));
}

/**
 * @brief Overload: vector by vector equality.
 */
ASTCENC_SIMD_INLINE vmask8 operator==(vint8 a, vint8 b)
{
    return vmask8(_mm256_cmpeq_epi32(a.m, b.m));
}

/**
 * @brief Overload: vector by vector inequality.
 */
ASTCENC_SIMD_INLINE vmask8 operator!=(vint8 a, vint8 b)
{
    return ~vmask8(_mm256_cmpeq_epi32(a.m, b.m));
}

/**
 * @brief Overload: vector by vector less than.
 */
ASTCENC_SIMD_INLINE vmask8 operator<(vint8 a, vint8 b)
{
    return vmask8(_mm256_cmpgt_epi32(b.m, a.m));
}

/**
 * @brief Overload: vector by vector greater than.
 */
ASTCENC_SIMD_INLINE vmask8 operator>(vint8 a, vint8 b)
{
    return vmask8(_mm256_cmpgt_epi32(a.m, b.m));
}

/**
 * @brief Logical shift left.
 */
template <int s> ASTCENC_SIMD_INLINE vint8 lsl(vint8 a)
{
    return vint8(_mm256_slli_epi32(a.m, s));
}

/**
 * @brief Arithmetic shift right.
 */
template <int s> ASTCENC_SIMD_INLINE vint8 asr(vint8 a)
{
    return vint8(_mm256_srai_epi32(a.m, s));
}

/**
 * @brief Logical shift right.
 */
template <int s> ASTCENC_SIMD_INLINE vint8 lsr(vint8 a)
{
    return vint8(_mm256_srli_epi32(a.m, s));
}

/**
 * @brief Return the min vector of two vectors.
 */
ASTCENC_SIMD_INLINE vint8 min(vint8 a, vint8 b)
{
    return vint8(_mm256_min_epi32(a.m, b.m));
}

/**
 * @brief Return the max vector of two vectors.
 */
ASTCENC_SIMD_INLINE vint8 max(vint8 a, vint8 b)
{
    return vint8(_mm256_max_epi32(a.m, b.m));
}

/**
 * @brief Return the horizontal minimum of a vector.
 */
ASTCENC_SIMD_INLINE vint8 hmin(vint8 a)
{
    // Build min within groups of 2, then 4, then 8
    __m256i m = _mm256_min_epi32(a.m, _mm256_shuffle_epi32(a.m, _MM_SHUFFLE(2, 3, 0, 1)));
    m = _mm256_min_epi32(m, _mm256_shuffle_epi32(m, _MM_SHUFFLE(1, 0, 3, 2)));
    m = _mm256_min_epi32(m, _mm256_permute2x128_si256(m, m, 0x01));

    vint8 vmin(m);
    return vmin;
}

/**
 * @brief Return the horizontal minimum of a vector.
 */
ASTCENC_SIMD_INLINE int hmin_s(vint8 a)
{
    return _mm256_cvtsi256_si32(hmin(a).m);
}

/**
 * @brief Return the horizontal maximum of a vector.
 */
ASTCENC_SIMD_INLINE vint8 hmax(vint8 a)
{
    // Build max within groups of 2, then 4, then 8
    __m256i m = _mm256_max_epi32(a.m, _mm256_shuffle_epi32(a.m, _MM_SHUFFLE(2, 3, 0, 1)));
    m = _mm256_max_epi32(m, _mm256_shuffle_epi32(m, _MM_SHUFFLE(1, 0, 3, 2)));
    m = _mm256_max_epi32(m, _mm256_permute2x128_si256(m, m, 0x01));

    vint8 vmax(m);
    return vmax;
}

/**
 * @brief Return the horizontal maximum of a vector.
 */
ASTCENC_SIMD_INLINE int hmax_s(vint8 a)
{
    return _mm256_cvtsi256_si32(hmax(a).m);
}

/**
 * @brief Generate a vint8 from a size_t.
 */
ASTCENC_SIMD_INLINE vint8 vint8_from_size(size_t a)
{
    assert(a <= std::numeric_limits<int>::max());
    return vint8(static_cast<int>(a));
}

/**
 * @brief Store a vector to a 32B aligned memory address.
 */
ASTCENC_SIMD_INLINE void storea(vint8 a, int* p)
{
    _mm256_store_si256(reinterpret_cast<__m256i*>(p), a.m);
}

/**
 * @brief Store a vector to an unaligned memory address.
 */
ASTCENC_SIMD_INLINE void store(vint8 a, int* p)
{
    _mm256_storeu_si256(reinterpret_cast<__m256i*>(p), a.m);
}

/**
 * @brief Store lowest N (vector width) bytes into an unaligned address.
 */
ASTCENC_SIMD_INLINE void store_nbytes(vint8 a, uint8_t* p)
{
    // This is the most logical implementation, but the convenience intrinsic
    // is missing on older compilers (supported in g++ 9 and clang++ 9).
    // _mm_storeu_si64(p, _mm256_extracti128_si256(a.m, 0))
    _mm_storel_epi64(reinterpret_cast<__m128i*>(p), _mm256_extracti128_si256(a.m, 0));
}

/**
 * @brief Pack low 8 bits of N (vector width) lanes into bottom of vector.
 */
ASTCENC_SIMD_INLINE void pack_and_store_low_bytes(vint8 v, uint8_t* p)
{
    __m256i shuf = _mm256_set_epi8(0, 0, 0, 0, 0, 0, 0, 0,
                                   0, 0, 0, 0, 28, 24, 20, 16,
                                   0, 0, 0, 0, 0, 0, 0, 0,
                                   0, 0, 0, 0, 12, 8, 4, 0);
    __m256i a = _mm256_shuffle_epi8(v.m, shuf);
    __m128i a0 = _mm256_extracti128_si256(a, 0);
    __m128i a1 = _mm256_extracti128_si256(a, 1);
    __m128i b = _mm_unpacklo_epi32(a0, a1);

    __m256i r = astcenc_mm256_set_m128i(b, b);
    store_nbytes(vint8(r), p);
}
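
// Worked example (editorial illustration, not part of the astcenc API; the
// example_* name is hypothetical). The shuffle above picks byte 0 of each
// 32-bit lane (bytes 0, 4, 8, 12 within each 128-bit half), and the unpack
// merges the two halves, so lanes holding 0..7 are stored as the eight bytes
// 00 01 02 03 04 05 06 07.
ASTCENC_SIMD_INLINE void example_pack_lane_bytes(uint8_t* out8)
{
    // Lane i holds the value i; each fits in the low byte of its lane
    vint8 lanes = vint8::lane_id();
    pack_and_store_low_bytes(lanes, out8);
}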
/**
 * @brief Return lanes from @c b if @c cond is set, else @c a.
 */
ASTCENC_SIMD_INLINE vint8 select(vint8 a, vint8 b, vmask8 cond)
{
    __m256i condi = _mm256_castps_si256(cond.m);
    return vint8(_mm256_blendv_epi8(a.m, b.m, condi));
}
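
// Usage sketch (editorial illustration, not part of the astcenc API; the
// example_* name is hypothetical). select() picks from the second argument
// where the mask is set, so comparing and selecting gives a branchless
// per-lane "cap at threshold", equivalent to min(ids, threshold).
ASTCENC_SIMD_INLINE vint8 example_clamp_above(vint8 threshold)
{
    vint8 ids = vint8::lane_id();
    vmask8 above = ids > threshold;

    // Where the lane ID exceeds the threshold, keep the threshold instead
    return select(ids, threshold, above);
}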
// ============================================================================
// vfloat8 operators and functions
// ============================================================================

/**
 * @brief Overload: vector by vector addition.
 */
ASTCENC_SIMD_INLINE vfloat8 operator+(vfloat8 a, vfloat8 b)
{
    return vfloat8(_mm256_add_ps(a.m, b.m));
}

/**
 * @brief Overload: vector by vector incremental addition.
 */
ASTCENC_SIMD_INLINE vfloat8& operator+=(vfloat8& a, const vfloat8& b)
{
    a = a + b;
    return a;
}

/**
 * @brief Overload: vector by vector subtraction.
 */
ASTCENC_SIMD_INLINE vfloat8 operator-(vfloat8 a, vfloat8 b)
{
    return vfloat8(_mm256_sub_ps(a.m, b.m));
}

/**
 * @brief Overload: vector by vector multiplication.
 */
ASTCENC_SIMD_INLINE vfloat8 operator*(vfloat8 a, vfloat8 b)
{
    return vfloat8(_mm256_mul_ps(a.m, b.m));
}

/**
 * @brief Overload: vector by scalar multiplication.
 */
ASTCENC_SIMD_INLINE vfloat8 operator*(vfloat8 a, float b)
{
    return vfloat8(_mm256_mul_ps(a.m, _mm256_set1_ps(b)));
}

/**
 * @brief Overload: scalar by vector multiplication.
 */
ASTCENC_SIMD_INLINE vfloat8 operator*(float a, vfloat8 b)
{
    return vfloat8(_mm256_mul_ps(_mm256_set1_ps(a), b.m));
}

/**
 * @brief Overload: vector by vector division.
 */
ASTCENC_SIMD_INLINE vfloat8 operator/(vfloat8 a, vfloat8 b)
{
    return vfloat8(_mm256_div_ps(a.m, b.m));
}

/**
 * @brief Overload: vector by scalar division.
 */
ASTCENC_SIMD_INLINE vfloat8 operator/(vfloat8 a, float b)
{
    return vfloat8(_mm256_div_ps(a.m, _mm256_set1_ps(b)));
}

/**
 * @brief Overload: scalar by vector division.
 */
ASTCENC_SIMD_INLINE vfloat8 operator/(float a, vfloat8 b)
{
    return vfloat8(_mm256_div_ps(_mm256_set1_ps(a), b.m));
}

/**
 * @brief Overload: vector by vector equality.
 */
ASTCENC_SIMD_INLINE vmask8 operator==(vfloat8 a, vfloat8 b)
{
    return vmask8(_mm256_cmp_ps(a.m, b.m, _CMP_EQ_OQ));
}

/**
 * @brief Overload: vector by vector inequality.
 */
ASTCENC_SIMD_INLINE vmask8 operator!=(vfloat8 a, vfloat8 b)
{
    return vmask8(_mm256_cmp_ps(a.m, b.m, _CMP_NEQ_OQ));
}

/**
 * @brief Overload: vector by vector less than.
 */
ASTCENC_SIMD_INLINE vmask8 operator<(vfloat8 a, vfloat8 b)
{
    return vmask8(_mm256_cmp_ps(a.m, b.m, _CMP_LT_OQ));
}

/**
 * @brief Overload: vector by vector greater than.
 */
ASTCENC_SIMD_INLINE vmask8 operator>(vfloat8 a, vfloat8 b)
{
    return vmask8(_mm256_cmp_ps(a.m, b.m, _CMP_GT_OQ));
}

/**
 * @brief Overload: vector by vector less than or equal.
 */
ASTCENC_SIMD_INLINE vmask8 operator<=(vfloat8 a, vfloat8 b)
{
    return vmask8(_mm256_cmp_ps(a.m, b.m, _CMP_LE_OQ));
}

/**
 * @brief Overload: vector by vector greater than or equal.
 */
ASTCENC_SIMD_INLINE vmask8 operator>=(vfloat8 a, vfloat8 b)
{
    return vmask8(_mm256_cmp_ps(a.m, b.m, _CMP_GE_OQ));
}

/**
 * @brief Return the min vector of two vectors.
 *
 * If either lane value is NaN, @c b will be returned for that lane.
 */
ASTCENC_SIMD_INLINE vfloat8 min(vfloat8 a, vfloat8 b)
{
    return vfloat8(_mm256_min_ps(a.m, b.m));
}

/**
 * @brief Return the min vector of a vector and a scalar.
 *
 * If either lane value is NaN, @c b will be returned for that lane.
 */
ASTCENC_SIMD_INLINE vfloat8 min(vfloat8 a, float b)
{
    return min(a, vfloat8(b));
}

/**
 * @brief Return the max vector of two vectors.
 *
 * If either lane value is NaN, @c b will be returned for that lane.
 */
ASTCENC_SIMD_INLINE vfloat8 max(vfloat8 a, vfloat8 b)
{
    return vfloat8(_mm256_max_ps(a.m, b.m));
}

/**
 * @brief Return the max vector of a vector and a scalar.
 *
 * If either lane value is NaN, @c b will be returned for that lane.
 */
ASTCENC_SIMD_INLINE vfloat8 max(vfloat8 a, float b)
{
    return max(a, vfloat8(b));
}

/**
 * @brief Return the clamped value between min and max.
 *
 * It is assumed that neither @c min nor @c max are NaN values. If @c a is NaN
 * then @c min will be returned for that lane.
 */
ASTCENC_SIMD_INLINE vfloat8 clamp(float min, float max, vfloat8 a)
{
    // Do not reorder - second operand will return if either is NaN
    a.m = _mm256_max_ps(a.m, _mm256_set1_ps(min));
    a.m = _mm256_min_ps(a.m, _mm256_set1_ps(max));
    return a;
}

/**
 * @brief Return a clamped value between 0.0f and 1.0f.
 *
 * If @c a is NaN then zero will be returned for that lane.
 */
ASTCENC_SIMD_INLINE vfloat8 clampzo(vfloat8 a)
{
    a.m = _mm256_max_ps(a.m, _mm256_setzero_ps());
    a.m = _mm256_min_ps(a.m, _mm256_set1_ps(1.0f));
    return a;
}
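
// Usage sketch (editorial illustration, not part of the astcenc API; the
// example_* name is hypothetical). Because clamp() applies the max() before
// the min(), a NaN lane collapses to the lower bound instead of propagating,
// so the result always lies inside the requested range.
ASTCENC_SIMD_INLINE vfloat8 example_sanitize_unorm8(vfloat8 a)
{
    // NaN lanes become 0.0f; other lanes are limited to the 8-bit UNORM range
    return clamp(0.0f, 255.0f, a);
}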
/**
 * @brief Return the absolute value of the float vector.
 */
ASTCENC_SIMD_INLINE vfloat8 abs(vfloat8 a)
{
    __m256 msk = _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffffff));
    return vfloat8(_mm256_and_ps(a.m, msk));
}

/**
 * @brief Return a float rounded to the nearest integer value.
 */
ASTCENC_SIMD_INLINE vfloat8 round(vfloat8 a)
{
    constexpr int flags = _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC;
    return vfloat8(_mm256_round_ps(a.m, flags));
}

/**
 * @brief Return the horizontal minimum of a vector.
 */
ASTCENC_SIMD_INLINE vfloat8 hmin(vfloat8 a)
{
    // Fold the two 128-bit halves together, then do a horizontal reduction
    __m128 vlow = _mm256_castps256_ps128(a.m);
    __m128 vhigh = _mm256_extractf128_ps(a.m, 1);
    vlow = _mm_min_ps(vlow, vhigh);

    __m128 shuf = _mm_shuffle_ps(vlow, vlow, _MM_SHUFFLE(2, 3, 0, 1));
    __m128 mins = _mm_min_ps(vlow, shuf);
    shuf = _mm_movehl_ps(shuf, mins);
    mins = _mm_min_ss(mins, shuf);

    // This is the most logical implementation, but the convenience intrinsic
    // is missing on older compilers (supported in g++ 9 and clang++ 9).
    // __m256 r = _mm256_set_m128(mins, mins)
    __m256 r = _mm256_insertf128_ps(_mm256_castps128_ps256(mins), mins, 1);
    return vfloat8(_mm256_permute_ps(r, 0));
}

/**
 * @brief Return the horizontal minimum of a vector.
 */
ASTCENC_SIMD_INLINE float hmin_s(vfloat8 a)
{
    return _mm256_cvtss_f32(hmin(a).m);
}

/**
 * @brief Return the horizontal maximum of a vector.
 */
ASTCENC_SIMD_INLINE vfloat8 hmax(vfloat8 a)
{
    // Fold the two 128-bit halves together, then do a horizontal reduction
    __m128 vlow = _mm256_castps256_ps128(a.m);
    __m128 vhigh = _mm256_extractf128_ps(a.m, 1);
    vhigh = _mm_max_ps(vlow, vhigh);

    __m128 shuf = _mm_shuffle_ps(vhigh, vhigh, _MM_SHUFFLE(2, 3, 0, 1));
    __m128 maxs = _mm_max_ps(vhigh, shuf);
    shuf = _mm_movehl_ps(shuf, maxs);
    maxs = _mm_max_ss(maxs, shuf);

    // This is the most logical implementation, but the convenience intrinsic
    // is missing on older compilers (supported in g++ 9 and clang++ 9).
    // __m256 r = _mm256_set_m128(maxs, maxs)
    __m256 r = _mm256_insertf128_ps(_mm256_castps128_ps256(maxs), maxs, 1);
    return vfloat8(_mm256_permute_ps(r, 0));
}

/**
 * @brief Return the horizontal maximum of a vector.
 */
ASTCENC_SIMD_INLINE float hmax_s(vfloat8 a)
{
    return _mm256_cvtss_f32(hmax(a).m);
}

/**
 * @brief Return the horizontal sum of a vector.
 */
ASTCENC_SIMD_INLINE float hadd_s(vfloat8 a)
{
    // Two sequential 4-wide adds gives invariance with 4-wide code
    vfloat4 lo(_mm256_extractf128_ps(a.m, 0));
    vfloat4 hi(_mm256_extractf128_ps(a.m, 1));
    return hadd_s(lo) + hadd_s(hi);
}
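
// Usage sketch (editorial illustration, not part of the astcenc API; the
// example_* name is hypothetical). hadd_s() reduces the two 128-bit halves
// separately and then adds the two partial sums, matching what a 4-wide
// implementation computes for the same 8 values.
ASTCENC_SIMD_INLINE float example_dot8(vfloat8 a, vfloat8 b)
{
    // Lane-wise multiply, then horizontal sum of the 8 products
    return hadd_s(a * b);
}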
/**
 * @brief Return lanes from @c b if @c cond is set, else @c a.
 */
ASTCENC_SIMD_INLINE vfloat8 select(vfloat8 a, vfloat8 b, vmask8 cond)
{
    return vfloat8(_mm256_blendv_ps(a.m, b.m, cond.m));
}

/**
 * @brief Accumulate lane-wise sums for a vector, folded 4-wide.
 *
 * This is invariant with 4-wide implementations.
 */
ASTCENC_SIMD_INLINE void haccumulate(vfloat4& accum, vfloat8 a)
{
    vfloat4 lo(_mm256_extractf128_ps(a.m, 0));
    haccumulate(accum, lo);

    vfloat4 hi(_mm256_extractf128_ps(a.m, 1));
    haccumulate(accum, hi);
}

/**
 * @brief Accumulate lane-wise sums for a vector.
 *
 * This is NOT invariant with 4-wide implementations.
 */
ASTCENC_SIMD_INLINE void haccumulate(vfloat8& accum, vfloat8 a)
{
    accum += a;
}

/**
 * @brief Accumulate masked lane-wise sums for a vector, folded 4-wide.
 *
 * This is invariant with 4-wide implementations.
 */
ASTCENC_SIMD_INLINE void haccumulate(vfloat4& accum, vfloat8 a, vmask8 m)
{
    a = select(vfloat8::zero(), a, m);
    haccumulate(accum, a);
}

/**
 * @brief Accumulate masked lane-wise sums for a vector.
 *
 * This is NOT invariant with 4-wide implementations.
 */
ASTCENC_SIMD_INLINE void haccumulate(vfloat8& accum, vfloat8 a, vmask8 m)
{
    a = select(vfloat8::zero(), a, m);
    haccumulate(accum, a);
}
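
// Usage sketch (editorial illustration, not part of the astcenc API; the
// example_* name is hypothetical). The vfloat4 accumulator overload keeps
// partial sums invariant with 4-wide builds, and the masked overload lets a
// loop tail ignore lanes past the end of the data; real callers typically
// build the tail mask from the active lane count.
ASTCENC_SIMD_INLINE float example_masked_sum(const float* data, vmask8 active)
{
    vfloat4 accum = vfloat4::zero();

    // Accumulate one 8-wide chunk, with inactive lanes contributing zero
    haccumulate(accum, vfloat8(data), active);

    // Final scalar reduction of the 4-wide partial sums
    return hadd_s(accum);
}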
/**
 * @brief Return the sqrt of the lanes in the vector.
 */
ASTCENC_SIMD_INLINE vfloat8 sqrt(vfloat8 a)
{
    return vfloat8(_mm256_sqrt_ps(a.m));
}

/**
 * @brief Load a vector of gathered results from an array.
 */
ASTCENC_SIMD_INLINE vfloat8 gatherf(const float* base, vint8 indices)
{
    return vfloat8(_mm256_i32gather_ps(base, indices.m, 4));
}

/**
 * @brief Load a vector of gathered results from an array using byte indices from memory.
 */
template<>
ASTCENC_SIMD_INLINE vfloat8 gatherf_byte_inds<vfloat8>(const float* base, const uint8_t* indices)
{
#if ASTCENC_X86_GATHERS == 0
    // Perform a manual gather using scalar loads in two separate dependency
    // chains, then merge late. MSVC translates this 1:1, which is OK. Clang
    // turns it into a bunch of memory-operand inserts on 128-bit halves, then
    // merges late, which performs significantly worse in tests.
    __m256 m0 = _mm256_broadcast_ss(base + indices[0]);
    __m256 m1 = _mm256_broadcast_ss(base + indices[1]);
    m0 = _mm256_blend_ps(m0, _mm256_broadcast_ss(base + indices[2]), 1 << 2);
    m1 = _mm256_blend_ps(m1, _mm256_broadcast_ss(base + indices[3]), 1 << 3);
    m0 = _mm256_blend_ps(m0, _mm256_broadcast_ss(base + indices[4]), 1 << 4);
    m1 = _mm256_blend_ps(m1, _mm256_broadcast_ss(base + indices[5]), 1 << 5);
    m0 = _mm256_blend_ps(m0, _mm256_broadcast_ss(base + indices[6]), 1 << 6);
    m1 = _mm256_blend_ps(m1, _mm256_broadcast_ss(base + indices[7]), 1 << 7);
    return vfloat8(_mm256_blend_ps(m0, m1, 0xaa));
#else
    vint8 inds(indices);
    return gatherf(base, inds);
#endif
}
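
// Usage sketch (editorial illustration, not part of the astcenc API; the
// example_* names are hypothetical). A typical use is dereferencing 8 small
// table indices, stored as bytes, into a float table; whether this uses the
// hardware gather or the manual scalar path is decided at build time by
// ASTCENC_X86_GATHERS.
ASTCENC_SIMD_INLINE vfloat8 example_lookup_weights(const float* weight_table, const uint8_t* texel_indices)
{
    return gatherf_byte_inds<vfloat8>(weight_table, texel_indices);
}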
/**
 * @brief Store a vector to an unaligned memory address.
 */
ASTCENC_SIMD_INLINE void store(vfloat8 a, float* p)
{
    _mm256_storeu_ps(p, a.m);
}

/**
 * @brief Store a vector to a 32B aligned memory address.
 */
ASTCENC_SIMD_INLINE void storea(vfloat8 a, float* p)
{
    _mm256_store_ps(p, a.m);
}

/**
 * @brief Return an integer value for a float vector, using truncation.
 */
ASTCENC_SIMD_INLINE vint8 float_to_int(vfloat8 a)
{
    return vint8(_mm256_cvttps_epi32(a.m));
}

/**
 * @brief Return an integer value for a float vector, using round-to-nearest.
 */
ASTCENC_SIMD_INLINE vint8 float_to_int_rtn(vfloat8 a)
{
    a = a + vfloat8(0.5f);
    return vint8(_mm256_cvttps_epi32(a.m));
}

/**
 * @brief Return a float value for an integer vector.
 */
ASTCENC_SIMD_INLINE vfloat8 int_to_float(vint8 a)
{
    return vfloat8(_mm256_cvtepi32_ps(a.m));
}

/**
 * @brief Return a float value as an integer bit pattern (i.e. no conversion).
 *
 * It is a common trick to convert floats into integer bit patterns, perform
 * some bit hackery based on knowledge they are IEEE 754 layout, and then
 * convert them back again. This is the first half of that flip.
 */
ASTCENC_SIMD_INLINE vint8 float_as_int(vfloat8 a)
{
    return vint8(_mm256_castps_si256(a.m));
}

/**
 * @brief Return an integer value as a float bit pattern (i.e. no conversion).
 *
 * It is a common trick to convert floats into integer bit patterns, perform
 * some bit hackery based on knowledge they are IEEE 754 layout, and then
 * convert them back again. This is the second half of that flip.
 */
ASTCENC_SIMD_INLINE vfloat8 int_as_float(vint8 a)
{
    return vfloat8(_mm256_castsi256_ps(a.m));
}
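
// Worked example (editorial illustration, not part of the astcenc API; the
// example_* name is hypothetical). The round trip float -> bits -> float
// enables IEEE 754 bit tricks; clearing bit 31 of every lane removes the
// sign, reproducing the abs() defined earlier in this file.
ASTCENC_SIMD_INLINE vfloat8 example_abs_via_bits(vfloat8 a)
{
    vint8 bits = float_as_int(a);         // reinterpret, no value conversion
    bits = bits & vint8(0x7FFFFFFF);      // clear the sign bit in each lane
    return int_as_float(bits);            // reinterpret back to floats
}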
/*
 * Table structure for a 16x 8-bit entry table.
 */
struct vtable8_16x8 {
    vint8 t0;
};

/*
 * Table structure for a 32x 8-bit entry table.
 */
struct vtable8_32x8 {
    vint8 t0;
    vint8 t1;
};

/*
 * Table structure for a 64x 8-bit entry table.
 */
struct vtable8_64x8 {
    vint8 t0;
    vint8 t1;
    vint8 t2;
    vint8 t3;
};

/**
 * @brief Prepare a vtable lookup table for a 16x 8-bit entry table.
 */
ASTCENC_SIMD_INLINE void vtable_prepare(
    vtable8_16x8& table,
    const uint8_t* data
) {
    // AVX2 tables duplicate table entries in each 128-bit half-register
    vint4 d0 = vint4::load(data);

    table.t0 = vint8(astcenc_mm256_set_m128i(d0.m, d0.m));
}

/**
 * @brief Prepare a vtable lookup table for a 32x 8-bit entry table.
 */
ASTCENC_SIMD_INLINE void vtable_prepare(
    vtable8_32x8& table,
    const uint8_t* data
) {
    // AVX2 tables duplicate table entries in each 128-bit half-register
    vint4 d0 = vint4::load(data);
    vint4 d1 = vint4::load(data + 16);

    table.t0 = vint8(astcenc_mm256_set_m128i(d0.m, d0.m));
    table.t1 = vint8(astcenc_mm256_set_m128i(d1.m, d1.m));

    // XOR chain the high rows to allow table emulation
    table.t1 = table.t1 ^ table.t0;
}

/**
 * @brief Prepare a vtable lookup table for a 64x 8-bit entry table.
 */
ASTCENC_SIMD_INLINE void vtable_prepare(
    vtable8_64x8& table,
    const uint8_t* data
) {
    // AVX2 tables duplicate table entries in each 128-bit half-register
    vint4 d0 = vint4::load(data);
    vint4 d1 = vint4::load(data + 16);
    vint4 d2 = vint4::load(data + 32);
    vint4 d3 = vint4::load(data + 48);

    table.t0 = vint8(astcenc_mm256_set_m128i(d0.m, d0.m));
    table.t1 = vint8(astcenc_mm256_set_m128i(d1.m, d1.m));
    table.t2 = vint8(astcenc_mm256_set_m128i(d2.m, d2.m));
    table.t3 = vint8(astcenc_mm256_set_m128i(d3.m, d3.m));

    // XOR chain the high rows to allow table emulation
    table.t3 = table.t3 ^ table.t2;
    table.t2 = table.t2 ^ table.t1;
    table.t1 = table.t1 ^ table.t0;
}

/**
 * @brief Perform a vtable lookup in a 16x 8-bit table with 32-bit indices.
 */
ASTCENC_SIMD_INLINE vint8 vtable_lookup_32bit(
    const vtable8_16x8& tbl,
    vint8 idx
) {
    // Set index byte MSB to 1 for unused bytes so shuffle returns zero
    __m256i idxx = _mm256_or_si256(idx.m, _mm256_set1_epi32(static_cast<int>(0xFFFFFF00)));

    __m256i result = _mm256_shuffle_epi8(tbl.t0.m, idxx);
    return vint8(result);
}

/**
 * @brief Perform a vtable lookup in a 32x 8-bit table with 32-bit indices.
 */
ASTCENC_SIMD_INLINE vint8 vtable_lookup_32bit(
    const vtable8_32x8& tbl,
    vint8 idx
) {
    // Set index byte MSB to 1 for unused bytes so shuffle returns zero
    __m256i idxx = _mm256_or_si256(idx.m, _mm256_set1_epi32(static_cast<int>(0xFFFFFF00)));

    __m256i result = _mm256_shuffle_epi8(tbl.t0.m, idxx);
    idxx = _mm256_sub_epi8(idxx, _mm256_set1_epi8(16));

    __m256i result2 = _mm256_shuffle_epi8(tbl.t1.m, idxx);
    result = _mm256_xor_si256(result, result2);

    return vint8(result);
}

/**
 * @brief Perform a vtable lookup in a 64x 8-bit table with 32-bit indices.
 */
ASTCENC_SIMD_INLINE vint8 vtable_lookup_32bit(
    const vtable8_64x8& tbl,
    vint8 idx
) {
    // Set index byte MSB to 1 for unused bytes so shuffle returns zero
    __m256i idxx = _mm256_or_si256(idx.m, _mm256_set1_epi32(static_cast<int>(0xFFFFFF00)));

    __m256i result = _mm256_shuffle_epi8(tbl.t0.m, idxx);
    idxx = _mm256_sub_epi8(idxx, _mm256_set1_epi8(16));

    __m256i result2 = _mm256_shuffle_epi8(tbl.t1.m, idxx);
    result = _mm256_xor_si256(result, result2);
    idxx = _mm256_sub_epi8(idxx, _mm256_set1_epi8(16));

    result2 = _mm256_shuffle_epi8(tbl.t2.m, idxx);
    result = _mm256_xor_si256(result, result2);
    idxx = _mm256_sub_epi8(idxx, _mm256_set1_epi8(16));

    result2 = _mm256_shuffle_epi8(tbl.t3.m, idxx);
    result = _mm256_xor_si256(result, result2);

    return vint8(result);
}
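
// Usage sketch (editorial illustration, not part of the astcenc API; the
// example_* name is hypothetical, and data must point at 32 table bytes). The
// XOR chaining in vtable_prepare() pairs with the XOR in the lookups: for an
// index below 16 the second shuffle sees an out-of-range byte with its MSB
// set and returns zero, leaving the t0 result; for an index of 16..31 the
// first shuffle returns t0[idx & 15], and XOR-ing in t1[idx - 16] ^ t0[idx - 16]
// cancels the t0 term, leaving the true t1 entry.
ASTCENC_SIMD_INLINE vint8 example_vtable32_lookup(const uint8_t* data, vint8 idx)
{
    vtable8_32x8 table;
    vtable_prepare(table, data);

    // idx lanes must be in the range 0..31
    return vtable_lookup_32bit(table, idx);
}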
/**
 * @brief Return a vector of interleaved RGBA data.
 *
 * Input vectors have the value stored in the bottom 8 bits of each lane,
 * with high bits set to zero.
 *
 * Output vector stores a single RGBA texel packed in each lane.
 */
ASTCENC_SIMD_INLINE vint8 interleave_rgba8(vint8 r, vint8 g, vint8 b, vint8 a)
{
    return r + lsl<8>(g) + lsl<16>(b) + lsl<24>(a);
}
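
// Worked example (editorial illustration, not part of the astcenc API; the
// example_* name is hypothetical). With per-lane bytes r = 0x12, g = 0x34,
// b = 0x56, a = 0x78 the packed lane value is 0x78563412, i.e. R in the
// lowest byte and A in the highest, which is little-endian RGBA byte order.
ASTCENC_SIMD_INLINE vint8 example_pack_gray_texels(vint8 gray, vint8 alpha)
{
    // Replicate the gray value into R, G, and B, and keep a separate alpha
    return interleave_rgba8(gray, gray, gray, alpha);
}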
/**
 * @brief Store a vector, skipping masked lanes.
 *
 * All masked lanes must be at the end of the vector, after all non-masked lanes.
 */
ASTCENC_SIMD_INLINE void store_lanes_masked(uint8_t* base, vint8 data, vmask8 mask)
{
    _mm256_maskstore_epi32(reinterpret_cast<int*>(base), _mm256_castps_si256(mask.m), data.m);
}

/**
 * @brief Debug function to print a vector of ints.
 */
ASTCENC_SIMD_INLINE void print(vint8 a)
{
    alignas(32) int v[8];
    storea(a, v);
    printf("v8_i32:\n %8d %8d %8d %8d %8d %8d %8d %8d\n",
           v[0], v[1], v[2], v[3], v[4], v[5], v[6], v[7]);
}

/**
 * @brief Debug function to print a vector of ints in hex.
 */
ASTCENC_SIMD_INLINE void printx(vint8 a)
{
    alignas(32) int v[8];
    storea(a, v);

    unsigned int uv[8];
    std::memcpy(uv, v, sizeof(int) * 8);

    printf("v8_i32:\n %08x %08x %08x %08x %08x %08x %08x %08x\n",
           uv[0], uv[1], uv[2], uv[3], uv[4], uv[5], uv[6], uv[7]);
}

/**
 * @brief Debug function to print a vector of floats.
 */
ASTCENC_SIMD_INLINE void print(vfloat8 a)
{
    alignas(32) float v[8];
    storea(a, v);
    printf("v8_f32:\n %0.4f %0.4f %0.4f %0.4f %0.4f %0.4f %0.4f %0.4f\n",
           static_cast<double>(v[0]), static_cast<double>(v[1]),
           static_cast<double>(v[2]), static_cast<double>(v[3]),
           static_cast<double>(v[4]), static_cast<double>(v[5]),
           static_cast<double>(v[6]), static_cast<double>(v[7]));
}

/**
 * @brief Debug function to print a vector of masks.
 */
ASTCENC_SIMD_INLINE void print(vmask8 a)
{
    print(select(vint8(0), vint8(1), a));
}

#endif // #ifndef ASTC_VECMATHLIB_AVX2_8_H_INCLUDED