// SPDX-License-Identifier: Apache-2.0
// ----------------------------------------------------------------------------
// Copyright 2019-2022 Arm Limited
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
// use this file except in compliance with the License. You may obtain a copy
// of the License at:
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
// License for the specific language governing permissions and limitations
// under the License.
// ----------------------------------------------------------------------------

/**
 * @brief 4x32-bit vectors, implemented using SSE.
 *
 * This module implements 4-wide 32-bit float, int, and mask vectors for x86
 * SSE. The implementation requires at least SSE2, but higher levels of SSE can
 * be selected at compile time to improve performance.
 *
 * There is a baseline level of functionality provided by all vector widths and
 * implementations. This is implemented using identical function signatures,
 * modulo data type, so we can use them as substitutable implementations in VLA
 * code.
 *
 * The 4-wide vectors are also used as a fixed-width type, and significantly
 * extend the functionality above that available to VLA code.
 */

#ifndef ASTC_VECMATHLIB_SSE_4_H_INCLUDED
#define ASTC_VECMATHLIB_SSE_4_H_INCLUDED

#ifndef ASTCENC_SIMD_INLINE
    #error "Include astcenc_vecmathlib.h, do not include directly"
#endif

#include <cstdio>

// ============================================================================
// vfloat4 data type
// ============================================================================

/**
 * @brief Data type for 4-wide floats.
 */
struct vfloat4
{
    /**
     * @brief Construct from zero-initialized value.
     */
    ASTCENC_SIMD_INLINE vfloat4() = default;

    /**
     * @brief Construct from 4 values loaded from an unaligned address.
     *
     * Consider using loada() which is better with vectors if data is aligned
     * to vector length.
     */
    ASTCENC_SIMD_INLINE explicit vfloat4(const float *p)
    {
        m = _mm_loadu_ps(p);
    }

    /**
     * @brief Construct from 1 scalar value replicated across all lanes.
     *
     * Consider using zero() for constexpr zeros.
     */
    ASTCENC_SIMD_INLINE explicit vfloat4(float a)
    {
        m = _mm_set1_ps(a);
    }

    /**
     * @brief Construct from 4 scalar values.
     *
     * The value of @c a is stored to lane 0 (LSB) in the SIMD register.
     */
    ASTCENC_SIMD_INLINE explicit vfloat4(float a, float b, float c, float d)
    {
        m = _mm_set_ps(d, c, b, a);
    }

    /**
     * @brief Construct from an existing SIMD register.
     */
    ASTCENC_SIMD_INLINE explicit vfloat4(__m128 a)
    {
        m = a;
    }

    /**
     * @brief Get the scalar value of a single lane.
     */
    template <int l> ASTCENC_SIMD_INLINE float lane() const
    {
        return _mm_cvtss_f32(_mm_shuffle_ps(m, m, l));
    }

    /**
     * @brief Set the scalar value of a single lane.
     */
    template <int l> ASTCENC_SIMD_INLINE void set_lane(float a)
    {
#if ASTCENC_SSE >= 41
        __m128 v = _mm_set1_ps(a);
        m = _mm_insert_ps(m, v, l << 6 | l << 4);
#else
        alignas(16) float idx[4];
        _mm_store_ps(idx, m);
        idx[l] = a;
        m = _mm_load_ps(idx);
#endif
    }

    /**
     * @brief Factory that returns a vector of zeros.
     */
    static ASTCENC_SIMD_INLINE vfloat4 zero()
    {
        return vfloat4(_mm_setzero_ps());
    }

    /**
     * @brief Factory that returns a replicated scalar loaded from memory.
     */
    static ASTCENC_SIMD_INLINE vfloat4 load1(const float* p)
    {
        return vfloat4(_mm_load_ps1(p));
    }

    /**
     * @brief Factory that returns a vector loaded from 16B aligned memory.
     */
    static ASTCENC_SIMD_INLINE vfloat4 loada(const float* p)
    {
        return vfloat4(_mm_load_ps(p));
    }

    /**
     * @brief Factory that returns a vector containing the lane IDs.
     */
    static ASTCENC_SIMD_INLINE vfloat4 lane_id()
    {
        return vfloat4(_mm_set_ps(3, 2, 1, 0));
    }

    /**
     * @brief Return a swizzled float 2.
     */
    template <int l0, int l1> ASTCENC_SIMD_INLINE vfloat4 swz() const
    {
        vfloat4 result(_mm_shuffle_ps(m, m, l0 | l1 << 2));
        result.set_lane<2>(0.0f);
        result.set_lane<3>(0.0f);
        return result;
    }

    /**
     * @brief Return a swizzled float 3.
     */
    template <int l0, int l1, int l2> ASTCENC_SIMD_INLINE vfloat4 swz() const
    {
        vfloat4 result(_mm_shuffle_ps(m, m, l0 | l1 << 2 | l2 << 4));
        result.set_lane<3>(0.0f);
        return result;
    }

    /**
     * @brief Return a swizzled float 4.
     */
    template <int l0, int l1, int l2, int l3> ASTCENC_SIMD_INLINE vfloat4 swz() const
    {
        return vfloat4(_mm_shuffle_ps(m, m, l0 | l1 << 2 | l2 << 4 | l3 << 6));
    }

    /**
     * @brief The vector ...
     */
    __m128 m;
};
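
// Illustrative usage sketch (not part of the library): constructing a vfloat4
// and working with lanes via the members declared above. The values shown in
// the trailing comments assume the constructors behave as documented.
//
//     vfloat4 v(1.0f, 2.0f, 3.0f, 4.0f);   // lane<0>() == 1.0f, lane<3>() == 4.0f
//     vfloat4 zyx = v.swz<2, 1, 0>();      // (3.0f, 2.0f, 1.0f, 0.0f) - lane 3 cleared
//     vfloat4 ids = vfloat4::lane_id();    // (0.0f, 1.0f, 2.0f, 3.0f)
//     v.set_lane<3>(9.0f);                 // v is now (1.0f, 2.0f, 3.0f, 9.0f)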

// ============================================================================
// vint4 data type
// ============================================================================

/**
 * @brief Data type for 4-wide ints.
 */
struct vint4
{
    /**
     * @brief Construct from zero-initialized value.
     */
    ASTCENC_SIMD_INLINE vint4() = default;

    /**
     * @brief Construct from 4 values loaded from an unaligned address.
     *
     * Consider using loada() which is better with vectors if data is aligned
     * to vector length.
     */
    ASTCENC_SIMD_INLINE explicit vint4(const int *p)
    {
        m = _mm_loadu_si128(reinterpret_cast<const __m128i*>(p));
    }

    /**
     * @brief Construct from 4 uint8_t loaded from an unaligned address.
     */
    ASTCENC_SIMD_INLINE explicit vint4(const uint8_t *p)
    {
        // _mm_loadu_si32 would be nicer syntax, but missing on older GCC
        __m128i t = _mm_cvtsi32_si128(*reinterpret_cast<const int*>(p));
#if ASTCENC_SSE >= 41
        m = _mm_cvtepu8_epi32(t);
#else
        t = _mm_unpacklo_epi8(t, _mm_setzero_si128());
        m = _mm_unpacklo_epi16(t, _mm_setzero_si128());
#endif
    }

    /**
     * @brief Construct from 1 scalar value replicated across all lanes.
     *
     * Consider using vint4::zero() for constexpr zeros.
     */
    ASTCENC_SIMD_INLINE explicit vint4(int a)
    {
        m = _mm_set1_epi32(a);
    }

    /**
     * @brief Construct from 4 scalar values.
     *
     * The value of @c a is stored to lane 0 (LSB) in the SIMD register.
     */
    ASTCENC_SIMD_INLINE explicit vint4(int a, int b, int c, int d)
    {
        m = _mm_set_epi32(d, c, b, a);
    }

    /**
     * @brief Construct from an existing SIMD register.
     */
    ASTCENC_SIMD_INLINE explicit vint4(__m128i a)
    {
        m = a;
    }

    /**
     * @brief Get the scalar from a single lane.
     */
    template <int l> ASTCENC_SIMD_INLINE int lane() const
    {
        return _mm_cvtsi128_si32(_mm_shuffle_epi32(m, l));
    }

    /**
     * @brief Set the scalar value of a single lane.
     */
    template <int l> ASTCENC_SIMD_INLINE void set_lane(int a)
    {
#if ASTCENC_SSE >= 41
        m = _mm_insert_epi32(m, a, l);
#else
        alignas(16) int idx[4];
        _mm_store_si128(reinterpret_cast<__m128i*>(idx), m);
        idx[l] = a;
        m = _mm_load_si128(reinterpret_cast<const __m128i*>(idx));
#endif
    }

    /**
     * @brief Factory that returns a vector of zeros.
     */
    static ASTCENC_SIMD_INLINE vint4 zero()
    {
        return vint4(_mm_setzero_si128());
    }

    /**
     * @brief Factory that returns a replicated scalar loaded from memory.
     */
    static ASTCENC_SIMD_INLINE vint4 load1(const int* p)
    {
        return vint4(*p);
    }

    /**
     * @brief Factory that returns a vector loaded from 16B aligned memory.
     */
    static ASTCENC_SIMD_INLINE vint4 loada(const int* p)
    {
        return vint4(_mm_load_si128(reinterpret_cast<const __m128i*>(p)));
    }

    /**
     * @brief Factory that returns a vector containing the lane IDs.
     */
    static ASTCENC_SIMD_INLINE vint4 lane_id()
    {
        return vint4(_mm_set_epi32(3, 2, 1, 0));
    }

    /**
     * @brief The vector ...
     */
    __m128i m;
};
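
// Illustrative usage sketch (not part of the library): expanding four packed
// 8-bit channel values into 32-bit lanes and operating on them. The byte
// values shown are assumed inputs.
//
//     const uint8_t rgba[4] { 255, 128, 64, 0 };
//     vint4 chans(rgba);                   // (255, 128, 64, 0)
//     vint4 doubled = chans + chans;       // (510, 256, 128, 0)
//     int r = chans.lane<0>();             // 255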

// ============================================================================
// vmask4 data type
// ============================================================================

/**
 * @brief Data type for 4-wide control plane masks.
 */
struct vmask4
{
    /**
     * @brief Construct from an existing SIMD register.
     */
    ASTCENC_SIMD_INLINE explicit vmask4(__m128 a)
    {
        m = a;
    }

    /**
     * @brief Construct from an existing SIMD register.
     */
    ASTCENC_SIMD_INLINE explicit vmask4(__m128i a)
    {
        m = _mm_castsi128_ps(a);
    }

    /**
     * @brief Construct from 1 scalar value.
     */
    ASTCENC_SIMD_INLINE explicit vmask4(bool a)
    {
        vint4 mask(a == false ? 0 : -1);
        m = _mm_castsi128_ps(mask.m);
    }

    /**
     * @brief Construct from 4 scalar values.
     *
     * The value of @c a is stored to lane 0 (LSB) in the SIMD register.
     */
    ASTCENC_SIMD_INLINE explicit vmask4(bool a, bool b, bool c, bool d)
    {
        vint4 mask(a == false ? 0 : -1,
                   b == false ? 0 : -1,
                   c == false ? 0 : -1,
                   d == false ? 0 : -1);
        m = _mm_castsi128_ps(mask.m);
    }

    /**
     * @brief Get the scalar value of a single lane.
     */
    template <int l> ASTCENC_SIMD_INLINE float lane() const
    {
        return _mm_cvtss_f32(_mm_shuffle_ps(m, m, l));
    }

    /**
     * @brief The vector ...
     */
    __m128 m;
};
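
// Illustrative usage sketch (not part of the library): masks are normally
// produced by comparisons, but the constructors above allow explicit lane
// patterns to be built directly.
//
//     vmask4 none(false);                        // all lanes clear
//     vmask4 first(true, false, false, false);   // only lane 0 set
//     vmask4 merged = none | first;              // still only lane 0 set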

// ============================================================================
// vmask4 operators and functions
// ============================================================================

/**
 * @brief Overload: mask union (or).
 */
ASTCENC_SIMD_INLINE vmask4 operator|(vmask4 a, vmask4 b)
{
    return vmask4(_mm_or_ps(a.m, b.m));
}

/**
 * @brief Overload: mask intersect (and).
 */
ASTCENC_SIMD_INLINE vmask4 operator&(vmask4 a, vmask4 b)
{
    return vmask4(_mm_and_ps(a.m, b.m));
}

/**
 * @brief Overload: mask difference (xor).
 */
ASTCENC_SIMD_INLINE vmask4 operator^(vmask4 a, vmask4 b)
{
    return vmask4(_mm_xor_ps(a.m, b.m));
}

/**
 * @brief Overload: mask invert (not).
 */
ASTCENC_SIMD_INLINE vmask4 operator~(vmask4 a)
{
    return vmask4(_mm_xor_si128(_mm_castps_si128(a.m), _mm_set1_epi32(-1)));
}

/**
 * @brief Return a 4-bit mask code indicating mask status.
 *
 * bit0 = lane 0
 */
ASTCENC_SIMD_INLINE unsigned int mask(vmask4 a)
{
    return static_cast<unsigned int>(_mm_movemask_ps(a.m));
}
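
// Illustrative usage sketch (not part of the library): mask() packs one bit
// per lane (bit 0 = lane 0), which gives cheap any/all style tests.
//
//     vmask4 m = vint4(1, 2, 3, 4) > vint4(2);   // (false, false, true, true)
//     unsigned int bits = mask(m);               // 0xC
//     bool any_set = bits != 0;                  // true
//     bool all_set = bits == 0xF;                // false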

// ============================================================================
// vint4 operators and functions
// ============================================================================

/**
 * @brief Overload: vector by vector addition.
 */
ASTCENC_SIMD_INLINE vint4 operator+(vint4 a, vint4 b)
{
    return vint4(_mm_add_epi32(a.m, b.m));
}

/**
 * @brief Overload: vector by vector subtraction.
 */
ASTCENC_SIMD_INLINE vint4 operator-(vint4 a, vint4 b)
{
    return vint4(_mm_sub_epi32(a.m, b.m));
}

/**
 * @brief Overload: vector by vector multiplication.
 */
ASTCENC_SIMD_INLINE vint4 operator*(vint4 a, vint4 b)
{
#if ASTCENC_SSE >= 41
    return vint4(_mm_mullo_epi32(a.m, b.m));
#else
    __m128i t1 = _mm_mul_epu32(a.m, b.m);
    __m128i t2 = _mm_mul_epu32(
        _mm_srli_si128(a.m, 4),
        _mm_srli_si128(b.m, 4));
    __m128i r = _mm_unpacklo_epi32(
        _mm_shuffle_epi32(t1, _MM_SHUFFLE(0, 0, 2, 0)),
        _mm_shuffle_epi32(t2, _MM_SHUFFLE(0, 0, 2, 0)));
    return vint4(r);
#endif
}

/**
 * @brief Overload: vector bit invert.
 */
ASTCENC_SIMD_INLINE vint4 operator~(vint4 a)
{
    return vint4(_mm_xor_si128(a.m, _mm_set1_epi32(-1)));
}

/**
 * @brief Overload: vector by vector bitwise or.
 */
ASTCENC_SIMD_INLINE vint4 operator|(vint4 a, vint4 b)
{
    return vint4(_mm_or_si128(a.m, b.m));
}

/**
 * @brief Overload: vector by vector bitwise and.
 */
ASTCENC_SIMD_INLINE vint4 operator&(vint4 a, vint4 b)
{
    return vint4(_mm_and_si128(a.m, b.m));
}

/**
 * @brief Overload: vector by vector bitwise xor.
 */
ASTCENC_SIMD_INLINE vint4 operator^(vint4 a, vint4 b)
{
    return vint4(_mm_xor_si128(a.m, b.m));
}

/**
 * @brief Overload: vector by vector equality.
 */
ASTCENC_SIMD_INLINE vmask4 operator==(vint4 a, vint4 b)
{
    return vmask4(_mm_cmpeq_epi32(a.m, b.m));
}

/**
 * @brief Overload: vector by vector inequality.
 */
ASTCENC_SIMD_INLINE vmask4 operator!=(vint4 a, vint4 b)
{
    return ~vmask4(_mm_cmpeq_epi32(a.m, b.m));
}

/**
 * @brief Overload: vector by vector less than.
 */
ASTCENC_SIMD_INLINE vmask4 operator<(vint4 a, vint4 b)
{
    return vmask4(_mm_cmplt_epi32(a.m, b.m));
}

/**
 * @brief Overload: vector by vector greater than.
 */
ASTCENC_SIMD_INLINE vmask4 operator>(vint4 a, vint4 b)
{
    return vmask4(_mm_cmpgt_epi32(a.m, b.m));
}

/**
 * @brief Logical shift left.
 */
template <int s> ASTCENC_SIMD_INLINE vint4 lsl(vint4 a)
{
    return vint4(_mm_slli_epi32(a.m, s));
}

/**
 * @brief Logical shift right.
 */
template <int s> ASTCENC_SIMD_INLINE vint4 lsr(vint4 a)
{
    return vint4(_mm_srli_epi32(a.m, s));
}

/**
 * @brief Arithmetic shift right.
 */
template <int s> ASTCENC_SIMD_INLINE vint4 asr(vint4 a)
{
    return vint4(_mm_srai_epi32(a.m, s));
}

/**
 * @brief Return the min vector of two vectors.
 */
ASTCENC_SIMD_INLINE vint4 min(vint4 a, vint4 b)
{
#if ASTCENC_SSE >= 41
    return vint4(_mm_min_epi32(a.m, b.m));
#else
    vmask4 d = a < b;
    __m128i ap = _mm_and_si128(_mm_castps_si128(d.m), a.m);
    __m128i bp = _mm_andnot_si128(_mm_castps_si128(d.m), b.m);
    return vint4(_mm_or_si128(ap, bp));
#endif
}

/**
 * @brief Return the max vector of two vectors.
 */
ASTCENC_SIMD_INLINE vint4 max(vint4 a, vint4 b)
{
#if ASTCENC_SSE >= 41
    return vint4(_mm_max_epi32(a.m, b.m));
#else
    vmask4 d = a > b;
    __m128i ap = _mm_and_si128(_mm_castps_si128(d.m), a.m);
    __m128i bp = _mm_andnot_si128(_mm_castps_si128(d.m), b.m);
    return vint4(_mm_or_si128(ap, bp));
#endif
}

/**
 * @brief Return the horizontal minimum of a vector.
 */
ASTCENC_SIMD_INLINE vint4 hmin(vint4 a)
{
    a = min(a, vint4(_mm_shuffle_epi32(a.m, _MM_SHUFFLE(0, 0, 3, 2))));
    a = min(a, vint4(_mm_shuffle_epi32(a.m, _MM_SHUFFLE(0, 0, 0, 1))));
    return vint4(_mm_shuffle_epi32(a.m, _MM_SHUFFLE(0, 0, 0, 0)));
}

/**
 * @brief Return the horizontal maximum of a vector.
 */
ASTCENC_SIMD_INLINE vint4 hmax(vint4 a)
{
    a = max(a, vint4(_mm_shuffle_epi32(a.m, _MM_SHUFFLE(0, 0, 3, 2))));
    a = max(a, vint4(_mm_shuffle_epi32(a.m, _MM_SHUFFLE(0, 0, 0, 1))));
    return vint4(_mm_shuffle_epi32(a.m, _MM_SHUFFLE(0, 0, 0, 0)));
}

/**
 * @brief Return the horizontal sum of a vector as a scalar.
 */
ASTCENC_SIMD_INLINE int hadd_s(vint4 a)
{
    // Add top and bottom halves, lane 1/0
    __m128i fold = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(a.m),
                                                  _mm_castsi128_ps(a.m)));
    __m128i t = _mm_add_epi32(a.m, fold);

    // Add top and bottom halves, lane 0 (_mm_hadd_epi32 exists but is slow)
    t = _mm_add_epi32(t, _mm_shuffle_epi32(t, 0x55));

    return _mm_cvtsi128_si32(t);
}
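
// Illustrative usage sketch (not part of the library): the horizontal
// reductions broadcast their result (hmin/hmax) or return a scalar (hadd_s).
//
//     vint4 v(7, 2, 9, 4);
//     vint4 lo = hmin(v);        // (2, 2, 2, 2)
//     vint4 hi = hmax(v);        // (9, 9, 9, 9)
//     int sum = hadd_s(v);       // 22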

/**
 * @brief Store a vector to a 16B aligned memory address.
 */
ASTCENC_SIMD_INLINE void storea(vint4 a, int* p)
{
    _mm_store_si128(reinterpret_cast<__m128i*>(p), a.m);
}

/**
 * @brief Store a vector to an unaligned memory address.
 */
ASTCENC_SIMD_INLINE void store(vint4 a, int* p)
{
    // Cast due to missing intrinsics
    _mm_storeu_ps(reinterpret_cast<float*>(p), _mm_castsi128_ps(a.m));
}

/**
 * @brief Store lowest N (vector width) bytes into an unaligned address.
 */
ASTCENC_SIMD_INLINE void store_nbytes(vint4 a, uint8_t* p)
{
    // Cast due to missing intrinsics
    _mm_store_ss(reinterpret_cast<float*>(p), _mm_castsi128_ps(a.m));
}

/**
 * @brief Gather N (vector width) indices from the array.
 */
ASTCENC_SIMD_INLINE vint4 gatheri(const int* base, vint4 indices)
{
#if ASTCENC_AVX >= 2
    return vint4(_mm_i32gather_epi32(base, indices.m, 4));
#else
    alignas(16) int idx[4];
    storea(indices, idx);
    return vint4(base[idx[0]], base[idx[1]], base[idx[2]], base[idx[3]]);
#endif
}

/**
 * @brief Pack low 8 bits of N (vector width) lanes into bottom of vector.
 */
ASTCENC_SIMD_INLINE vint4 pack_low_bytes(vint4 a)
{
#if ASTCENC_SSE >= 41
    __m128i shuf = _mm_set_epi8(0,0,0,0, 0,0,0,0, 0,0,0,0, 12,8,4,0);
    return vint4(_mm_shuffle_epi8(a.m, shuf));
#else
    __m128i va = _mm_unpacklo_epi8(a.m, _mm_shuffle_epi32(a.m, _MM_SHUFFLE(1,1,1,1)));
    __m128i vb = _mm_unpackhi_epi8(a.m, _mm_shuffle_epi32(a.m, _MM_SHUFFLE(3,3,3,3)));
    return vint4(_mm_unpacklo_epi16(va, vb));
#endif
}

/**
 * @brief Return lanes from @c b if @c cond is set, else @c a.
 */
ASTCENC_SIMD_INLINE vint4 select(vint4 a, vint4 b, vmask4 cond)
{
    __m128i condi = _mm_castps_si128(cond.m);

#if ASTCENC_SSE >= 41
    return vint4(_mm_blendv_epi8(a.m, b.m, condi));
#else
    return vint4(_mm_or_si128(_mm_and_si128(condi, b.m), _mm_andnot_si128(condi, a.m)));
#endif
}
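
// Illustrative usage sketch (not part of the library): select() implements a
// lane-wise conditional move, taking lanes from b where the mask is set.
//
//     vint4 a(1, 2, 3, 4);
//     vint4 b(10, 20, 30, 40);
//     vmask4 cond = a > vint4(2);          // (false, false, true, true)
//     vint4 r = select(a, b, cond);        // (1, 2, 30, 40)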

// ============================================================================
// vfloat4 operators and functions
// ============================================================================

/**
 * @brief Overload: vector by vector addition.
 */
ASTCENC_SIMD_INLINE vfloat4 operator+(vfloat4 a, vfloat4 b)
{
    return vfloat4(_mm_add_ps(a.m, b.m));
}

/**
 * @brief Overload: vector by vector subtraction.
 */
ASTCENC_SIMD_INLINE vfloat4 operator-(vfloat4 a, vfloat4 b)
{
    return vfloat4(_mm_sub_ps(a.m, b.m));
}

/**
 * @brief Overload: vector by vector multiplication.
 */
ASTCENC_SIMD_INLINE vfloat4 operator*(vfloat4 a, vfloat4 b)
{
    return vfloat4(_mm_mul_ps(a.m, b.m));
}

/**
 * @brief Overload: vector by vector division.
 */
ASTCENC_SIMD_INLINE vfloat4 operator/(vfloat4 a, vfloat4 b)
{
    return vfloat4(_mm_div_ps(a.m, b.m));
}

/**
 * @brief Overload: vector by vector equality.
 */
ASTCENC_SIMD_INLINE vmask4 operator==(vfloat4 a, vfloat4 b)
{
    return vmask4(_mm_cmpeq_ps(a.m, b.m));
}

/**
 * @brief Overload: vector by vector inequality.
 */
ASTCENC_SIMD_INLINE vmask4 operator!=(vfloat4 a, vfloat4 b)
{
    return vmask4(_mm_cmpneq_ps(a.m, b.m));
}

/**
 * @brief Overload: vector by vector less than.
 */
ASTCENC_SIMD_INLINE vmask4 operator<(vfloat4 a, vfloat4 b)
{
    return vmask4(_mm_cmplt_ps(a.m, b.m));
}

/**
 * @brief Overload: vector by vector greater than.
 */
ASTCENC_SIMD_INLINE vmask4 operator>(vfloat4 a, vfloat4 b)
{
    return vmask4(_mm_cmpgt_ps(a.m, b.m));
}

/**
 * @brief Overload: vector by vector less than or equal.
 */
ASTCENC_SIMD_INLINE vmask4 operator<=(vfloat4 a, vfloat4 b)
{
    return vmask4(_mm_cmple_ps(a.m, b.m));
}

/**
 * @brief Overload: vector by vector greater than or equal.
 */
ASTCENC_SIMD_INLINE vmask4 operator>=(vfloat4 a, vfloat4 b)
{
    return vmask4(_mm_cmpge_ps(a.m, b.m));
}

/**
 * @brief Return the min vector of two vectors.
 *
 * If either lane value is NaN, @c b will be returned for that lane.
 */
ASTCENC_SIMD_INLINE vfloat4 min(vfloat4 a, vfloat4 b)
{
    // Do not reorder - second operand will return if either is NaN
    return vfloat4(_mm_min_ps(a.m, b.m));
}

/**
 * @brief Return the max vector of two vectors.
 *
 * If either lane value is NaN, @c b will be returned for that lane.
 */
ASTCENC_SIMD_INLINE vfloat4 max(vfloat4 a, vfloat4 b)
{
    // Do not reorder - second operand will return if either is NaN
    return vfloat4(_mm_max_ps(a.m, b.m));
}

/**
 * @brief Return the absolute value of the float vector.
 */
ASTCENC_SIMD_INLINE vfloat4 abs(vfloat4 a)
{
    return vfloat4(_mm_max_ps(_mm_sub_ps(_mm_setzero_ps(), a.m), a.m));
}

/**
 * @brief Return a float rounded to the nearest integer value.
 */
ASTCENC_SIMD_INLINE vfloat4 round(vfloat4 a)
{
#if ASTCENC_SSE >= 41
    constexpr int flags = _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC;
    return vfloat4(_mm_round_ps(a.m, flags));
#else
    __m128 v = a.m;
    __m128 neg_zero = _mm_castsi128_ps(_mm_set1_epi32(static_cast<int>(0x80000000)));
    __m128 no_fraction = _mm_set1_ps(8388608.0f);
    __m128 abs_mask = _mm_castsi128_ps(_mm_set1_epi32(0x7FFFFFFF));
    __m128 sign = _mm_and_ps(v, neg_zero);
    __m128 s_magic = _mm_or_ps(no_fraction, sign);
    __m128 r1 = _mm_add_ps(v, s_magic);
    r1 = _mm_sub_ps(r1, s_magic);
    __m128 r2 = _mm_and_ps(v, abs_mask);
    __m128 mask = _mm_cmple_ps(r2, no_fraction);
    r2 = _mm_andnot_ps(mask, v);
    r1 = _mm_and_ps(r1, mask);
    return vfloat4(_mm_xor_ps(r1, r2));
#endif
}
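
// Illustrative note (not part of the library): both paths above implement
// round-to-nearest-even. The SSE2 fallback adds and subtracts a sign-adjusted
// 2^23 (8388608.0f), which forces the fraction bits out of the float format
// and leaves the rounded value; inputs with magnitude >= 2^23 are already
// integral and are passed through unchanged.
//
//     round(vfloat4(0.5f, 1.5f, 2.4f, -2.5f));   // (0.0f, 2.0f, 2.0f, -2.0f)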

/**
 * @brief Return the horizontal minimum of a vector.
 */
ASTCENC_SIMD_INLINE vfloat4 hmin(vfloat4 a)
{
    a = min(a, vfloat4(_mm_shuffle_ps(a.m, a.m, _MM_SHUFFLE(0, 0, 3, 2))));
    a = min(a, vfloat4(_mm_shuffle_ps(a.m, a.m, _MM_SHUFFLE(0, 0, 0, 1))));
    return vfloat4(_mm_shuffle_ps(a.m, a.m, _MM_SHUFFLE(0, 0, 0, 0)));
}

/**
 * @brief Return the horizontal maximum of a vector.
 */
ASTCENC_SIMD_INLINE vfloat4 hmax(vfloat4 a)
{
    a = max(a, vfloat4(_mm_shuffle_ps(a.m, a.m, _MM_SHUFFLE(0, 0, 3, 2))));
    a = max(a, vfloat4(_mm_shuffle_ps(a.m, a.m, _MM_SHUFFLE(0, 0, 0, 1))));
    return vfloat4(_mm_shuffle_ps(a.m, a.m, _MM_SHUFFLE(0, 0, 0, 0)));
}

/**
 * @brief Return the horizontal sum of a vector as a scalar.
 */
ASTCENC_SIMD_INLINE float hadd_s(vfloat4 a)
{
    // Add top and bottom halves, lane 1/0
    __m128 t = _mm_add_ps(a.m, _mm_movehl_ps(a.m, a.m));

    // Add top and bottom halves, lane 0 (_mm_hadd_ps exists but slow)
    t = _mm_add_ss(t, _mm_shuffle_ps(t, t, 0x55));

    return _mm_cvtss_f32(t);
}

/**
 * @brief Return the sqrt of the lanes in the vector.
 */
ASTCENC_SIMD_INLINE vfloat4 sqrt(vfloat4 a)
{
    return vfloat4(_mm_sqrt_ps(a.m));
}

/**
 * @brief Return lanes from @c b if @c cond is set, else @c a.
 */
ASTCENC_SIMD_INLINE vfloat4 select(vfloat4 a, vfloat4 b, vmask4 cond)
{
#if ASTCENC_SSE >= 41
    return vfloat4(_mm_blendv_ps(a.m, b.m, cond.m));
#else
    return vfloat4(_mm_or_ps(_mm_and_ps(cond.m, b.m), _mm_andnot_ps(cond.m, a.m)));
#endif
}

/**
 * @brief Return lanes from @c b if MSB of @c cond is set, else @c a.
 */
ASTCENC_SIMD_INLINE vfloat4 select_msb(vfloat4 a, vfloat4 b, vmask4 cond)
{
#if ASTCENC_SSE >= 41
    return vfloat4(_mm_blendv_ps(a.m, b.m, cond.m));
#else
    __m128 d = _mm_castsi128_ps(_mm_srai_epi32(_mm_castps_si128(cond.m), 31));
    return vfloat4(_mm_or_ps(_mm_and_ps(d, b.m), _mm_andnot_ps(d, a.m)));
#endif
}

/**
 * @brief Load a vector of gathered results from an array.
 */
ASTCENC_SIMD_INLINE vfloat4 gatherf(const float* base, vint4 indices)
{
#if ASTCENC_AVX >= 2
    return vfloat4(_mm_i32gather_ps(base, indices.m, 4));
#else
    alignas(16) int idx[4];
    storea(indices, idx);
    return vfloat4(base[idx[0]], base[idx[1]], base[idx[2]], base[idx[3]]);
#endif
}
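
// Illustrative usage sketch (not part of the library): gatherf() reads four
// arbitrary elements of an array selected by an index vector. The table
// contents shown are assumed inputs.
//
//     const float table[8] { 0.0f, 0.1f, 0.2f, 0.3f, 0.4f, 0.5f, 0.6f, 0.7f };
//     vint4 idx(6, 4, 2, 0);
//     vfloat4 v = gatherf(table, idx);       // (0.6f, 0.4f, 0.2f, 0.0f)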

/**
 * @brief Store a vector to an unaligned memory address.
 */
ASTCENC_SIMD_INLINE void store(vfloat4 a, float* p)
{
    _mm_storeu_ps(p, a.m);
}

/**
 * @brief Store a vector to a 16B aligned memory address.
 */
ASTCENC_SIMD_INLINE void storea(vfloat4 a, float* p)
{
    _mm_store_ps(p, a.m);
}

/**
 * @brief Return an integer value for a float vector, using truncation.
 */
ASTCENC_SIMD_INLINE vint4 float_to_int(vfloat4 a)
{
    return vint4(_mm_cvttps_epi32(a.m));
}

/**
 * @brief Return an integer value for a float vector, using round-to-nearest.
 */
ASTCENC_SIMD_INLINE vint4 float_to_int_rtn(vfloat4 a)
{
    a = round(a);
    return vint4(_mm_cvttps_epi32(a.m));
}

/**
 * @brief Return a float value for an integer vector.
 */
ASTCENC_SIMD_INLINE vfloat4 int_to_float(vint4 a)
{
    return vfloat4(_mm_cvtepi32_ps(a.m));
}

/**
 * @brief Return a float16 value for a float vector, using round-to-nearest.
 */
ASTCENC_SIMD_INLINE vint4 float_to_float16(vfloat4 a)
{
#if ASTCENC_F16C >= 1
    __m128i packedf16 = _mm_cvtps_ph(a.m, 0);
    __m128i f16 = _mm_cvtepu16_epi32(packedf16);
    return vint4(f16);
#else
    return vint4(
        float_to_sf16(a.lane<0>()),
        float_to_sf16(a.lane<1>()),
        float_to_sf16(a.lane<2>()),
        float_to_sf16(a.lane<3>()));
#endif
}

/**
 * @brief Return a float16 value for a float scalar, using round-to-nearest.
 */
static inline uint16_t float_to_float16(float a)
{
#if ASTCENC_F16C >= 1
    __m128i f16 = _mm_cvtps_ph(_mm_set1_ps(a), 0);
    return static_cast<uint16_t>(_mm_cvtsi128_si32(f16));
#else
    return float_to_sf16(a);
#endif
}

/**
 * @brief Return a float value for a float16 vector.
 */
ASTCENC_SIMD_INLINE vfloat4 float16_to_float(vint4 a)
{
#if ASTCENC_F16C >= 1
    __m128i packed = _mm_packs_epi32(a.m, a.m);
    __m128 f32 = _mm_cvtph_ps(packed);
    return vfloat4(f32);
#else
    return vfloat4(
        sf16_to_float(static_cast<uint16_t>(a.lane<0>())),
        sf16_to_float(static_cast<uint16_t>(a.lane<1>())),
        sf16_to_float(static_cast<uint16_t>(a.lane<2>())),
        sf16_to_float(static_cast<uint16_t>(a.lane<3>())));
#endif
}

/**
 * @brief Return a float value for a float16 scalar.
 */
ASTCENC_SIMD_INLINE float float16_to_float(uint16_t a)
{
#if ASTCENC_F16C >= 1
    __m128i packed = _mm_set1_epi16(static_cast<short>(a));
    __m128 f32 = _mm_cvtph_ps(packed);
    return _mm_cvtss_f32(f32);
#else
    return sf16_to_float(a);
#endif
}
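
// Illustrative usage sketch (not part of the library): float16 values travel
// in the low 16 bits of each vint4 lane, so a conversion round trip looks like
// this. 0x3C00 is the FP16 bit pattern for 1.0.
//
//     vint4 h = float_to_float16(vfloat4(1.0f));   // each lane == 0x3C00
//     vfloat4 f = float16_to_float(h);             // (1.0f, 1.0f, 1.0f, 1.0f)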

/**
 * @brief Return a float value as an integer bit pattern (i.e. no conversion).
 *
 * It is a common trick to convert floats into integer bit patterns, perform
 * some bit hackery based on knowledge they are IEEE 754 layout, and then
 * convert them back again. This is the first half of that flip.
 */
ASTCENC_SIMD_INLINE vint4 float_as_int(vfloat4 a)
{
    return vint4(_mm_castps_si128(a.m));
}

/**
 * @brief Return an integer value as a float bit pattern (i.e. no conversion).
 *
 * It is a common trick to convert floats into integer bit patterns, perform
 * some bit hackery based on knowledge they are IEEE 754 layout, and then
 * convert them back again. This is the second half of that flip.
 */
ASTCENC_SIMD_INLINE vfloat4 int_as_float(vint4 v)
{
    return vfloat4(_mm_castsi128_ps(v.m));
}
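
// Illustrative usage sketch (not part of the library): a classic use of the
// bit-pattern flip is extracting the IEEE 754 exponent field without leaving
// the SIMD registers.
//
//     vfloat4 v(4.0f);                             // biased exponent = 129
//     vint4 bits = float_as_int(v);                // raw IEEE 754 bit patterns
//     vint4 expo = lsr<23>(bits) & vint4(0xFF);    // (129, 129, 129, 129)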

/**
 * @brief Prepare a vtable lookup table for use with the native SIMD size.
 */
ASTCENC_SIMD_INLINE void vtable_prepare(vint4 t0, vint4& t0p)
{
    t0p = t0;
}

/**
 * @brief Prepare a vtable lookup table for use with the native SIMD size.
 */
ASTCENC_SIMD_INLINE void vtable_prepare(vint4 t0, vint4 t1, vint4& t0p, vint4& t1p)
{
#if ASTCENC_SSE >= 30
    t0p = t0;
    t1p = t0 ^ t1;
#else
    t0p = t0;
    t1p = t1;
#endif
}

/**
 * @brief Prepare a vtable lookup table for use with the native SIMD size.
 */
ASTCENC_SIMD_INLINE void vtable_prepare(
    vint4 t0, vint4 t1, vint4 t2, vint4 t3,
    vint4& t0p, vint4& t1p, vint4& t2p, vint4& t3p)
{
#if ASTCENC_SSE >= 30
    t0p = t0;
    t1p = t0 ^ t1;
    t2p = t1 ^ t2;
    t3p = t2 ^ t3;
#else
    t0p = t0;
    t1p = t1;
    t2p = t2;
    t3p = t3;
#endif
}

/**
 * @brief Perform an 8-bit 16-entry table lookup, with 32-bit indexes.
 */
ASTCENC_SIMD_INLINE vint4 vtable_8bt_32bi(vint4 t0, vint4 idx)
{
#if ASTCENC_SSE >= 30
    // Set index byte MSB to 1 for unused bytes so shuffle returns zero
    __m128i idxx = _mm_or_si128(idx.m, _mm_set1_epi32(static_cast<int>(0xFFFFFF00)));

    __m128i result = _mm_shuffle_epi8(t0.m, idxx);
    return vint4(result);
#else
    alignas(ASTCENC_VECALIGN) uint8_t table[16];
    storea(t0, reinterpret_cast<int*>(table + 0));

    return vint4(table[idx.lane<0>()],
                 table[idx.lane<1>()],
                 table[idx.lane<2>()],
                 table[idx.lane<3>()]);
#endif
}

/**
 * @brief Perform an 8-bit 32-entry table lookup, with 32-bit indexes.
 */
ASTCENC_SIMD_INLINE vint4 vtable_8bt_32bi(vint4 t0, vint4 t1, vint4 idx)
{
#if ASTCENC_SSE >= 30
    // Set index byte MSB to 1 for unused bytes so shuffle returns zero
    __m128i idxx = _mm_or_si128(idx.m, _mm_set1_epi32(static_cast<int>(0xFFFFFF00)));

    __m128i result = _mm_shuffle_epi8(t0.m, idxx);
    idxx = _mm_sub_epi8(idxx, _mm_set1_epi8(16));

    __m128i result2 = _mm_shuffle_epi8(t1.m, idxx);
    result = _mm_xor_si128(result, result2);

    return vint4(result);
#else
    alignas(ASTCENC_VECALIGN) uint8_t table[32];
    storea(t0, reinterpret_cast<int*>(table + 0));
    storea(t1, reinterpret_cast<int*>(table + 16));

    return vint4(table[idx.lane<0>()],
                 table[idx.lane<1>()],
                 table[idx.lane<2>()],
                 table[idx.lane<3>()]);
#endif
}

/**
 * @brief Perform an 8-bit 64-entry table lookup, with 32-bit indexes.
 */
ASTCENC_SIMD_INLINE vint4 vtable_8bt_32bi(vint4 t0, vint4 t1, vint4 t2, vint4 t3, vint4 idx)
{
#if ASTCENC_SSE >= 30
    // Set index byte MSB to 1 for unused bytes so shuffle returns zero
    __m128i idxx = _mm_or_si128(idx.m, _mm_set1_epi32(static_cast<int>(0xFFFFFF00)));

    __m128i result = _mm_shuffle_epi8(t0.m, idxx);
    idxx = _mm_sub_epi8(idxx, _mm_set1_epi8(16));

    __m128i result2 = _mm_shuffle_epi8(t1.m, idxx);
    result = _mm_xor_si128(result, result2);
    idxx = _mm_sub_epi8(idxx, _mm_set1_epi8(16));

    result2 = _mm_shuffle_epi8(t2.m, idxx);
    result = _mm_xor_si128(result, result2);
    idxx = _mm_sub_epi8(idxx, _mm_set1_epi8(16));

    result2 = _mm_shuffle_epi8(t3.m, idxx);
    result = _mm_xor_si128(result, result2);

    return vint4(result);
#else
    alignas(ASTCENC_VECALIGN) uint8_t table[64];
    storea(t0, reinterpret_cast<int*>(table + 0));
    storea(t1, reinterpret_cast<int*>(table + 16));
    storea(t2, reinterpret_cast<int*>(table + 32));
    storea(t3, reinterpret_cast<int*>(table + 48));

    return vint4(table[idx.lane<0>()],
                 table[idx.lane<1>()],
                 table[idx.lane<2>()],
                 table[idx.lane<3>()]);
#endif
}
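
// Illustrative note (not part of the library): the XOR in vtable_prepare() is
// what makes the chained PSHUFB passes above work. For an index in [16, 31]
// the first pass returns t0[idx & 15] and the second returns
// t0[idx - 16] ^ t1[idx - 16], so XORing the passes cancels the t0 term and
// leaves the wanted t1 byte. Usage follows a prepare-then-lookup pattern,
// where t0, t1 and idx below are assumed caller-provided vectors:
//
//     vint4 t0p, t1p;
//     vtable_prepare(t0, t1, t0p, t1p);          // t0/t1 hold 32 table bytes
//     vint4 r = vtable_8bt_32bi(t0p, t1p, idx);  // idx lanes in [0, 31]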

/**
 * @brief Return a vector of interleaved RGBA data.
 *
 * Input vectors have the value stored in the bottom 8 bits of each lane,
 * with high bits set to zero.
 *
 * Output vector stores a single RGBA texel packed in each lane.
 */
ASTCENC_SIMD_INLINE vint4 interleave_rgba8(vint4 r, vint4 g, vint4 b, vint4 a)
{
    // Workaround an XCode compiler internal fault; note this is slower than
    // slli_epi32 so we should revert this when we get the opportunity
#if defined(__APPLE__)
    __m128i value = r.m;
    value = _mm_add_epi32(value, _mm_bslli_si128(g.m, 1));
    value = _mm_add_epi32(value, _mm_bslli_si128(b.m, 2));
    value = _mm_add_epi32(value, _mm_bslli_si128(a.m, 3));
    return vint4(value);
#else
    __m128i value = r.m;
    value = _mm_add_epi32(value, _mm_slli_epi32(g.m, 8));
    value = _mm_add_epi32(value, _mm_slli_epi32(b.m, 16));
    value = _mm_add_epi32(value, _mm_slli_epi32(a.m, 24));
    return vint4(value);
#endif
}
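
// Illustrative usage sketch (not part of the library): packing four texels
// worth of 8-bit channels into 32-bit RGBA lanes ready for store() or
// store_nbytes(). Channel values shown are assumed inputs in [0, 255].
//
//     vint4 px = interleave_rgba8(vint4(0x11), vint4(0x22), vint4(0x33), vint4(0x44));
//     // each lane == 0x44332211 (A in the top byte, R in the bottom byte)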

/**
 * @brief Store a vector, skipping masked lanes.
 *
 * All masked lanes must be at the end of vector, after all non-masked lanes.
 */
ASTCENC_SIMD_INLINE void store_lanes_masked(int* base, vint4 data, vmask4 mask)
{
#if ASTCENC_AVX >= 2
    _mm_maskstore_epi32(base, _mm_castps_si128(mask.m), data.m);
#else
    // Note - we cannot use _mm_maskmoveu_si128 as the underlying hardware doesn't guarantee
    // fault suppression on masked lanes so we can get page faults at the end of an image.
    if (mask.lane<3>() != 0.0f)
    {
        store(data, base);
    }
    else if (mask.lane<2>() != 0.0f)
    {
        base[0] = data.lane<0>();
        base[1] = data.lane<1>();
        base[2] = data.lane<2>();
    }
    else if (mask.lane<1>() != 0.0f)
    {
        base[0] = data.lane<0>();
        base[1] = data.lane<1>();
    }
    else if (mask.lane<0>() != 0.0f)
    {
        base[0] = data.lane<0>();
    }
#endif
}
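
// Illustrative usage sketch (not part of the library): writing a partial
// vector at the right-hand edge of an image row. The mask must be "dense",
// with all enabled lanes before all disabled lanes, as documented above.
//
//     int row[3];                                    // only 3 texels remain
//     vmask4 tail(true, true, true, false);          // enable lanes 0..2
//     store_lanes_masked(row, vint4(7, 8, 9, 0), tail);
//     // row == { 7, 8, 9 }; no write past the end of the buffer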

#if defined(ASTCENC_NO_INVARIANCE) && (ASTCENC_SSE >= 41)

#define ASTCENC_USE_NATIVE_DOT_PRODUCT 1

/**
 * @brief Return the dot product for the full 4 lanes, returning scalar.
 */
ASTCENC_SIMD_INLINE float dot_s(vfloat4 a, vfloat4 b)
{
    return _mm_cvtss_f32(_mm_dp_ps(a.m, b.m, 0xFF));
}

/**
 * @brief Return the dot product for the full 4 lanes, returning vector.
 */
ASTCENC_SIMD_INLINE vfloat4 dot(vfloat4 a, vfloat4 b)
{
    return vfloat4(_mm_dp_ps(a.m, b.m, 0xFF));
}

/**
 * @brief Return the dot product for the bottom 3 lanes, returning scalar.
 */
ASTCENC_SIMD_INLINE float dot3_s(vfloat4 a, vfloat4 b)
{
    return _mm_cvtss_f32(_mm_dp_ps(a.m, b.m, 0x77));
}

/**
 * @brief Return the dot product for the bottom 3 lanes, returning vector.
 */
ASTCENC_SIMD_INLINE vfloat4 dot3(vfloat4 a, vfloat4 b)
{
    return vfloat4(_mm_dp_ps(a.m, b.m, 0x77));
}

#endif // #if defined(ASTCENC_NO_INVARIANCE) && (ASTCENC_SSE >= 41)

#if ASTCENC_POPCNT >= 1

#define ASTCENC_USE_NATIVE_POPCOUNT 1

/**
 * @brief Population bit count.
 *
 * @param v The value to population count.
 *
 * @return The number of 1 bits.
 */
ASTCENC_SIMD_INLINE int popcount(uint64_t v)
{
    return static_cast<int>(_mm_popcnt_u64(v));
}

#endif // ASTCENC_POPCNT >= 1

#endif // #ifndef ASTC_VECMATHLIB_SSE_4_H_INCLUDED