// SPDX-License-Identifier: Apache-2.0
// ----------------------------------------------------------------------------
// Copyright 2019-2024 Arm Limited
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
// use this file except in compliance with the License. You may obtain a copy
// of the License at:
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
// License for the specific language governing permissions and limitations
// under the License.
// ----------------------------------------------------------------------------

/**
 * @brief 4x32-bit vectors, implemented using Armv8-A NEON.
 *
 * This module implements 4-wide 32-bit float, int, and mask vectors for
 * Armv8-A NEON.
 *
 * There is a baseline level of functionality provided by all vector widths and
 * implementations. This is implemented using identical function signatures,
 * modulo data type, so we can use them as substitutable implementations in VLA
 * code.
 *
 * The 4-wide vectors are also used as a fixed-width type, and significantly
 * extend the functionality above that available to VLA code.
 */

#ifndef ASTC_VECMATHLIB_NEON_4_H_INCLUDED
#define ASTC_VECMATHLIB_NEON_4_H_INCLUDED

#ifndef ASTCENC_SIMD_INLINE
	#error "Include astcenc_vecmathlib.h, do not include directly"
#endif

#include <cstdio>
#include <cstring>

// ============================================================================
// vfloat4 data type
// ============================================================================

/**
 * @brief Data type for 4-wide floats.
 */
struct vfloat4
{
	/**
	 * @brief Construct from zero-initialized value.
	 */
	ASTCENC_SIMD_INLINE vfloat4() = default;

	/**
	 * @brief Construct from 4 values loaded from an unaligned address.
	 *
	 * Consider using loada() instead if the data is aligned to the vector
	 * length.
	 */
	ASTCENC_SIMD_INLINE explicit vfloat4(const float* p)
	{
		m = vld1q_f32(p);
	}

	/**
	 * @brief Construct from 1 scalar value replicated across all lanes.
	 *
	 * Consider using zero() for constexpr zeros.
	 */
	ASTCENC_SIMD_INLINE explicit vfloat4(float a)
	{
		m = vdupq_n_f32(a);
	}

	/**
	 * @brief Construct from 4 scalar values.
	 *
	 * The value of @c a is stored to lane 0 (LSB) in the SIMD register.
	 */
	ASTCENC_SIMD_INLINE explicit vfloat4(float a, float b, float c, float d)
	{
		float v[4] { a, b, c, d };
		m = vld1q_f32(v);
	}

	/**
	 * @brief Construct from an existing SIMD register.
	 */
	ASTCENC_SIMD_INLINE explicit vfloat4(float32x4_t a)
	{
		m = a;
	}

	/**
	 * @brief Get the scalar value of a single lane.
	 */
	template <int l> ASTCENC_SIMD_INLINE float lane() const
	{
		return vgetq_lane_f32(m, l);
	}

	/**
	 * @brief Set the scalar value of a single lane.
	 */
	template <int l> ASTCENC_SIMD_INLINE void set_lane(float a)
	{
		m = vsetq_lane_f32(a, m, l);
	}

	/**
	 * @brief Factory that returns a vector of zeros.
	 */
	static ASTCENC_SIMD_INLINE vfloat4 zero()
	{
		return vfloat4(0.0f);
	}

	/**
	 * @brief Factory that returns a replicated scalar loaded from memory.
	 */
	static ASTCENC_SIMD_INLINE vfloat4 load1(const float* p)
	{
		return vfloat4(vld1q_dup_f32(p));
	}

	/**
	 * @brief Factory that returns a vector loaded from 16B aligned memory.
	 */
	static ASTCENC_SIMD_INLINE vfloat4 loada(const float* p)
	{
		return vfloat4(vld1q_f32(p));
	}

	/**
	 * @brief Return a swizzled float 2.
	 */
	template <int l0, int l1> ASTCENC_SIMD_INLINE vfloat4 swz() const
	{
		return vfloat4(lane<l0>(), lane<l1>(), 0.0f, 0.0f);
	}

	/**
	 * @brief Return a swizzled float 3.
	 */
	template <int l0, int l1, int l2> ASTCENC_SIMD_INLINE vfloat4 swz() const
	{
		return vfloat4(lane<l0>(), lane<l1>(), lane<l2>(), 0.0f);
	}

	/**
	 * @brief Return a swizzled float 4.
	 */
	template <int l0, int l1, int l2, int l3> ASTCENC_SIMD_INLINE vfloat4 swz() const
	{
		return vfloat4(lane<l0>(), lane<l1>(), lane<l2>(), lane<l3>());
	}

	/**
	 * @brief The underlying SIMD register.
	 */
	float32x4_t m;
};
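
// Illustrative usage sketch (not part of the library): the swizzle templates
// take lane indices as template arguments; the 2- and 3-wide forms zero-fill
// the unused upper lanes.
//
//     vfloat4 v(1.0f, 2.0f, 3.0f, 4.0f);
//     vfloat4 r = v.swz<2, 1, 0, 3>();   // r  == { 3.0f, 2.0f, 1.0f, 4.0f }
//     vfloat4 rg = v.swz<0, 1>();        // rg == { 1.0f, 2.0f, 0.0f, 0.0f }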

// ============================================================================
// vint4 data type
// ============================================================================

/**
 * @brief Data type for 4-wide ints.
 */
struct vint4
{
	/**
	 * @brief Construct from zero-initialized value.
	 */
	ASTCENC_SIMD_INLINE vint4() = default;

	/**
	 * @brief Construct from 4 values loaded from an unaligned address.
	 *
	 * Consider using loada() instead if the data is aligned to the vector
	 * length.
	 */
	ASTCENC_SIMD_INLINE explicit vint4(const int* p)
	{
		m = vld1q_s32(p);
	}

	/**
	 * @brief Construct from 4 uint8_t loaded from an unaligned address.
	 */
	ASTCENC_SIMD_INLINE explicit vint4(const uint8_t* p)
	{
#if ASTCENC_SVE == 0
		// Cast is safe - NEON loads are allowed to be unaligned
		uint32x2_t t8 = vld1_dup_u32(reinterpret_cast<const uint32_t*>(p));
		uint16x4_t t16 = vget_low_u16(vmovl_u8(vreinterpret_u8_u32(t8)));
		m = vreinterpretq_s32_u32(vmovl_u16(t16));
#else
		svint32_t data = svld1ub_s32(svptrue_pat_b32(SV_VL4), p);
		m = svget_neonq(data);
#endif
	}

	/**
	 * @brief Construct from 1 scalar value replicated across all lanes.
	 *
	 * Consider using zero() for constexpr zeros.
	 */
	ASTCENC_SIMD_INLINE explicit vint4(int a)
	{
		m = vdupq_n_s32(a);
	}

	/**
	 * @brief Construct from 4 scalar values.
	 *
	 * The value of @c a is stored to lane 0 (LSB) in the SIMD register.
	 */
	ASTCENC_SIMD_INLINE explicit vint4(int a, int b, int c, int d)
	{
		int v[4] { a, b, c, d };
		m = vld1q_s32(v);
	}

	/**
	 * @brief Construct from an existing SIMD register.
	 */
	ASTCENC_SIMD_INLINE explicit vint4(int32x4_t a)
	{
		m = a;
	}

	/**
	 * @brief Get the scalar from a single lane.
	 */
	template <int l> ASTCENC_SIMD_INLINE int lane() const
	{
		return vgetq_lane_s32(m, l);
	}

	/**
	 * @brief Set the scalar value of a single lane.
	 */
	template <int l> ASTCENC_SIMD_INLINE void set_lane(int a)
	{
		m = vsetq_lane_s32(a, m, l);
	}

	/**
	 * @brief Factory that returns a vector of zeros.
	 */
	static ASTCENC_SIMD_INLINE vint4 zero()
	{
		return vint4(0);
	}

	/**
	 * @brief Factory that returns a replicated scalar loaded from memory.
	 */
	static ASTCENC_SIMD_INLINE vint4 load1(const int* p)
	{
		return vint4(*p);
	}

	/**
	 * @brief Factory that returns a vector loaded from unaligned memory.
	 */
	static ASTCENC_SIMD_INLINE vint4 load(const uint8_t* p)
	{
		vint4 data;
		std::memcpy(&data.m, p, 4 * sizeof(int));
		return data;
	}

	/**
	 * @brief Factory that returns a vector loaded from 16B aligned memory.
	 */
	static ASTCENC_SIMD_INLINE vint4 loada(const int* p)
	{
		return vint4(p);
	}

	/**
	 * @brief Factory that returns a vector containing the lane IDs.
	 */
	static ASTCENC_SIMD_INLINE vint4 lane_id()
	{
		alignas(16) static const int data[4] { 0, 1, 2, 3 };
		return vint4(vld1q_s32(data));
	}

	/**
	 * @brief The underlying SIMD register.
	 */
	int32x4_t m;
};

// ============================================================================
// vmask4 data type
// ============================================================================

/**
 * @brief Data type for 4-wide control plane masks.
 */
struct vmask4
{
	/**
	 * @brief Construct from an existing SIMD register.
	 */
	ASTCENC_SIMD_INLINE explicit vmask4(uint32x4_t a)
	{
		m = a;
	}

#if !defined(_MSC_VER)
	/**
	 * @brief Construct from an existing SIMD register.
	 */
	ASTCENC_SIMD_INLINE explicit vmask4(int32x4_t a)
	{
		m = vreinterpretq_u32_s32(a);
	}
#endif

	/**
	 * @brief Construct from 1 scalar value.
	 */
	ASTCENC_SIMD_INLINE explicit vmask4(bool a)
	{
		m = vreinterpretq_u32_s32(vdupq_n_s32(a == true ? -1 : 0));
	}

	/**
	 * @brief Construct from 4 scalar values.
	 *
	 * The value of @c a is stored to lane 0 (LSB) in the SIMD register.
	 */
	ASTCENC_SIMD_INLINE explicit vmask4(bool a, bool b, bool c, bool d)
	{
		int v[4] {
			a == true ? -1 : 0,
			b == true ? -1 : 0,
			c == true ? -1 : 0,
			d == true ? -1 : 0
		};

		int32x4_t ms = vld1q_s32(v);
		m = vreinterpretq_u32_s32(ms);
	}

	/**
	 * @brief Get the scalar from a single lane.
	 */
	template <int32_t l> ASTCENC_SIMD_INLINE bool lane() const
	{
		return vgetq_lane_u32(m, l) != 0;
	}

	/**
	 * @brief The underlying SIMD register.
	 */
	uint32x4_t m;
};

// ============================================================================
// vmask4 operators and functions
// ============================================================================

/**
 * @brief Overload: mask union (or).
 */
ASTCENC_SIMD_INLINE vmask4 operator|(vmask4 a, vmask4 b)
{
	return vmask4(vorrq_u32(a.m, b.m));
}

/**
 * @brief Overload: mask intersect (and).
 */
ASTCENC_SIMD_INLINE vmask4 operator&(vmask4 a, vmask4 b)
{
	return vmask4(vandq_u32(a.m, b.m));
}

/**
 * @brief Overload: mask difference (xor).
 */
ASTCENC_SIMD_INLINE vmask4 operator^(vmask4 a, vmask4 b)
{
	return vmask4(veorq_u32(a.m, b.m));
}

/**
 * @brief Overload: mask invert (not).
 */
ASTCENC_SIMD_INLINE vmask4 operator~(vmask4 a)
{
	return vmask4(vmvnq_u32(a.m));
}

/**
 * @brief Return a 4-bit mask code indicating mask status.
 *
 * bit0 = lane 0
 */
ASTCENC_SIMD_INLINE unsigned int mask(vmask4 a)
{
	static const int shifta[4] { 0, 1, 2, 3 };
	static const int32x4_t shift = vld1q_s32(shifta);

	uint32x4_t tmp = vshrq_n_u32(a.m, 31);
	return vaddvq_u32(vshlq_u32(tmp, shift));
}

/**
 * @brief True if any lanes are enabled, false otherwise.
 */
ASTCENC_SIMD_INLINE bool any(vmask4 a)
{
	return vmaxvq_u32(a.m) != 0;
}

/**
 * @brief True if all lanes are enabled, false otherwise.
 */
ASTCENC_SIMD_INLINE bool all(vmask4 a)
{
	return vminvq_u32(a.m) != 0;
}
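
// Illustrative usage sketch (not part of the library): mask() packs the sign
// bit of each lane into a 4-bit scalar code, with lane 0 in bit 0, which is
// convenient for scalar fallback loops or switch-based dispatch.
//
//     vmask4 m = vint4(5, 0, 7, 0) > vint4::zero();
//     unsigned int bits = mask(m);   // 0b0101: lanes 0 and 2 are enabled
//     bool some = any(m);            // true
//     bool every = all(m);           // false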

// ============================================================================
// vint4 operators and functions
// ============================================================================

/**
 * @brief Overload: vector by vector addition.
 */
ASTCENC_SIMD_INLINE vint4 operator+(vint4 a, vint4 b)
{
	return vint4(vaddq_s32(a.m, b.m));
}

/**
 * @brief Overload: vector by vector subtraction.
 */
ASTCENC_SIMD_INLINE vint4 operator-(vint4 a, vint4 b)
{
	return vint4(vsubq_s32(a.m, b.m));
}

/**
 * @brief Overload: vector by vector multiplication.
 */
ASTCENC_SIMD_INLINE vint4 operator*(vint4 a, vint4 b)
{
	return vint4(vmulq_s32(a.m, b.m));
}

/**
 * @brief Overload: vector bit invert.
 */
ASTCENC_SIMD_INLINE vint4 operator~(vint4 a)
{
	return vint4(vmvnq_s32(a.m));
}

/**
 * @brief Overload: vector by vector bitwise or.
 */
ASTCENC_SIMD_INLINE vint4 operator|(vint4 a, vint4 b)
{
	return vint4(vorrq_s32(a.m, b.m));
}

/**
 * @brief Overload: vector by vector bitwise and.
 */
ASTCENC_SIMD_INLINE vint4 operator&(vint4 a, vint4 b)
{
	return vint4(vandq_s32(a.m, b.m));
}

/**
 * @brief Overload: vector by vector bitwise xor.
 */
ASTCENC_SIMD_INLINE vint4 operator^(vint4 a, vint4 b)
{
	return vint4(veorq_s32(a.m, b.m));
}

/**
 * @brief Overload: vector by vector equality.
 */
ASTCENC_SIMD_INLINE vmask4 operator==(vint4 a, vint4 b)
{
	return vmask4(vceqq_s32(a.m, b.m));
}

/**
 * @brief Overload: vector by vector inequality.
 */
ASTCENC_SIMD_INLINE vmask4 operator!=(vint4 a, vint4 b)
{
	return ~vmask4(vceqq_s32(a.m, b.m));
}

/**
 * @brief Overload: vector by vector less than.
 */
ASTCENC_SIMD_INLINE vmask4 operator<(vint4 a, vint4 b)
{
	return vmask4(vcltq_s32(a.m, b.m));
}

/**
 * @brief Overload: vector by vector greater than.
 */
ASTCENC_SIMD_INLINE vmask4 operator>(vint4 a, vint4 b)
{
	return vmask4(vcgtq_s32(a.m, b.m));
}

/**
 * @brief Logical shift left.
 */
template <int s> ASTCENC_SIMD_INLINE vint4 lsl(vint4 a)
{
	return vint4(vshlq_s32(a.m, vdupq_n_s32(s)));
}

/**
 * @brief Logical shift right.
 */
template <int s> ASTCENC_SIMD_INLINE vint4 lsr(vint4 a)
{
	uint32x4_t ua = vreinterpretq_u32_s32(a.m);
	ua = vshlq_u32(ua, vdupq_n_s32(-s));
	return vint4(vreinterpretq_s32_u32(ua));
}

/**
 * @brief Arithmetic shift right.
 */
template <int s> ASTCENC_SIMD_INLINE vint4 asr(vint4 a)
{
	return vint4(vshlq_s32(a.m, vdupq_n_s32(-s)));
}
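
// Illustrative usage sketch (not part of the library): lsr() shifts in zero
// bits while asr() replicates the sign bit, which only differs for negative
// lane values.
//
//     vint4 v(-16, 16, -16, 16);
//     vint4 l = lsr<2>(v);   // lane 0 == 0x3FFFFFFC (zero-filled)
//     vint4 s = asr<2>(v);   // lane 0 == -4 (sign-extended)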

/**
 * @brief Return the min vector of two vectors.
 */
ASTCENC_SIMD_INLINE vint4 min(vint4 a, vint4 b)
{
	return vint4(vminq_s32(a.m, b.m));
}

/**
 * @brief Return the max vector of two vectors.
 */
ASTCENC_SIMD_INLINE vint4 max(vint4 a, vint4 b)
{
	return vint4(vmaxq_s32(a.m, b.m));
}

/**
 * @brief Return the horizontal minimum of a vector.
 */
ASTCENC_SIMD_INLINE vint4 hmin(vint4 a)
{
	return vint4(vminvq_s32(a.m));
}

/**
 * @brief Return the horizontal maximum of a vector.
 */
ASTCENC_SIMD_INLINE vint4 hmax(vint4 a)
{
	return vint4(vmaxvq_s32(a.m));
}

/**
 * @brief Store a vector to a 16B aligned memory address.
 */
ASTCENC_SIMD_INLINE void storea(vint4 a, int* p)
{
	vst1q_s32(p, a.m);
}

/**
 * @brief Store a vector to an unaligned memory address.
 */
ASTCENC_SIMD_INLINE void store(vint4 a, int* p)
{
	vst1q_s32(p, a.m);
}

/**
 * @brief Store a vector to an unaligned memory address.
 */
ASTCENC_SIMD_INLINE void store(vint4 a, uint8_t* p)
{
	std::memcpy(p, &a.m, sizeof(int) * 4);
}

/**
 * @brief Store lowest N (vector width) bytes into an unaligned address.
 */
ASTCENC_SIMD_INLINE void store_nbytes(vint4 a, uint8_t* p)
{
	vst1q_lane_s32(reinterpret_cast<int32_t*>(p), a.m, 0);
}

/**
 * @brief Pack and store low 8 bits of each vector lane.
 */
ASTCENC_SIMD_INLINE void pack_and_store_low_bytes(vint4 a, uint8_t* data)
{
	alignas(16) uint8_t shuf[16] {
		0, 4, 8, 12, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
	};

	uint8x16_t idx = vld1q_u8(shuf);
	int8x16_t av = vreinterpretq_s8_s32(a.m);
	a = vint4(vreinterpretq_s32_s8(vqtbl1q_s8(av, idx)));
	store_nbytes(a, data);
}
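
// Illustrative usage sketch (not part of the library): pack_and_store_low_bytes()
// takes four lanes holding values in the 0-255 range and emits them as four
// consecutive bytes, e.g. when writing out 8-bit image data.
//
//     vint4 texel(0x11, 0x22, 0x33, 0x44);
//     uint8_t out[4];
//     pack_and_store_low_bytes(texel, out);   // out == { 0x11, 0x22, 0x33, 0x44 }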

/**
 * @brief Return lanes from @c b if @c cond is set, else @c a.
 */
ASTCENC_SIMD_INLINE vint4 select(vint4 a, vint4 b, vmask4 cond)
{
	return vint4(vbslq_s32(cond.m, b.m, a.m));
}

// ============================================================================
// vfloat4 operators and functions
// ============================================================================

/**
 * @brief Overload: vector by vector addition.
 */
ASTCENC_SIMD_INLINE vfloat4 operator+(vfloat4 a, vfloat4 b)
{
	return vfloat4(vaddq_f32(a.m, b.m));
}

/**
 * @brief Overload: vector by vector subtraction.
 */
ASTCENC_SIMD_INLINE vfloat4 operator-(vfloat4 a, vfloat4 b)
{
	return vfloat4(vsubq_f32(a.m, b.m));
}

/**
 * @brief Overload: vector by vector multiplication.
 */
ASTCENC_SIMD_INLINE vfloat4 operator*(vfloat4 a, vfloat4 b)
{
	return vfloat4(vmulq_f32(a.m, b.m));
}

/**
 * @brief Overload: vector by vector division.
 */
ASTCENC_SIMD_INLINE vfloat4 operator/(vfloat4 a, vfloat4 b)
{
	return vfloat4(vdivq_f32(a.m, b.m));
}

/**
 * @brief Overload: vector by vector equality.
 */
ASTCENC_SIMD_INLINE vmask4 operator==(vfloat4 a, vfloat4 b)
{
	return vmask4(vceqq_f32(a.m, b.m));
}

/**
 * @brief Overload: vector by vector inequality.
 */
ASTCENC_SIMD_INLINE vmask4 operator!=(vfloat4 a, vfloat4 b)
{
	return vmask4(vmvnq_u32(vceqq_f32(a.m, b.m)));
}

/**
 * @brief Overload: vector by vector less than.
 */
ASTCENC_SIMD_INLINE vmask4 operator<(vfloat4 a, vfloat4 b)
{
	return vmask4(vcltq_f32(a.m, b.m));
}

/**
 * @brief Overload: vector by vector greater than.
 */
ASTCENC_SIMD_INLINE vmask4 operator>(vfloat4 a, vfloat4 b)
{
	return vmask4(vcgtq_f32(a.m, b.m));
}

/**
 * @brief Overload: vector by vector less than or equal.
 */
ASTCENC_SIMD_INLINE vmask4 operator<=(vfloat4 a, vfloat4 b)
{
	return vmask4(vcleq_f32(a.m, b.m));
}

/**
 * @brief Overload: vector by vector greater than or equal.
 */
ASTCENC_SIMD_INLINE vmask4 operator>=(vfloat4 a, vfloat4 b)
{
	return vmask4(vcgeq_f32(a.m, b.m));
}

/**
 * @brief Return the min vector of two vectors.
 *
 * If either lane value is NaN, @c b will be returned for that lane.
 */
ASTCENC_SIMD_INLINE vfloat4 min(vfloat4 a, vfloat4 b)
{
	// Do not reorder - the second operand is returned if either input is NaN
	return vfloat4(vminnmq_f32(a.m, b.m));
}

/**
 * @brief Return the max vector of two vectors.
 *
 * If either lane value is NaN, @c b will be returned for that lane.
 */
ASTCENC_SIMD_INLINE vfloat4 max(vfloat4 a, vfloat4 b)
{
	// Do not reorder - the second operand is returned if either input is NaN
	return vfloat4(vmaxnmq_f32(a.m, b.m));
}

/**
 * @brief Return the absolute value of the float vector.
 */
ASTCENC_SIMD_INLINE vfloat4 abs(vfloat4 a)
{
	float32x4_t zero = vdupq_n_f32(0.0f);
	float32x4_t inv = vsubq_f32(zero, a.m);
	return vfloat4(vmaxq_f32(a.m, inv));
}

/**
 * @brief Return a float rounded to the nearest integer value.
 */
ASTCENC_SIMD_INLINE vfloat4 round(vfloat4 a)
{
	return vfloat4(vrndnq_f32(a.m));
}

/**
 * @brief Return the horizontal minimum of a vector.
 */
ASTCENC_SIMD_INLINE vfloat4 hmin(vfloat4 a)
{
	return vfloat4(vminvq_f32(a.m));
}

/**
 * @brief Return the horizontal maximum of a vector.
 */
ASTCENC_SIMD_INLINE vfloat4 hmax(vfloat4 a)
{
	return vfloat4(vmaxvq_f32(a.m));
}

/**
 * @brief Return the horizontal sum of a vector.
 */
ASTCENC_SIMD_INLINE float hadd_s(vfloat4 a)
{
	// Perform the halving add to ensure invariance; we cannot use vaddvq_f32
	// as this does (0 + 1 + 2 + 3), which is not invariant with the x86
	// implementation's (0 + 2) + (1 + 3) ordering.
	float32x2_t t = vadd_f32(vget_high_f32(a.m), vget_low_f32(a.m));
	return vget_lane_f32(vpadd_f32(t, t), 0);
}
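
// Illustrative note (not part of the library): the pairwise reduction above
// evaluates (a0 + a2) + (a1 + a3), so the NEON and x86 builds accumulate in
// the same order and produce bit-identical float sums.
//
//     vfloat4 v(1.0f, 2.0f, 3.0f, 4.0f);
//     float s = hadd_s(v);   // (1 + 3) + (2 + 4) == 10.0f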

/**
 * @brief Return the sqrt of the lanes in the vector.
 */
ASTCENC_SIMD_INLINE vfloat4 sqrt(vfloat4 a)
{
	return vfloat4(vsqrtq_f32(a.m));
}

/**
 * @brief Return lanes from @c b if @c cond is set, else @c a.
 */
ASTCENC_SIMD_INLINE vfloat4 select(vfloat4 a, vfloat4 b, vmask4 cond)
{
	return vfloat4(vbslq_f32(cond.m, b.m, a.m));
}

/**
 * @brief Load a vector of gathered results from an array.
 */
ASTCENC_SIMD_INLINE vfloat4 gatherf(const float* base, vint4 indices)
{
#if ASTCENC_SVE == 0
	alignas(16) int idx[4];
	storea(indices, idx);

	alignas(16) float vals[4];
	vals[0] = base[idx[0]];
	vals[1] = base[idx[1]];
	vals[2] = base[idx[2]];
	vals[3] = base[idx[3]];
	return vfloat4(vals);
#else
	svint32_t offsets = svset_neonq_s32(svundef_s32(), indices.m);
	svfloat32_t data = svld1_gather_s32index_f32(svptrue_pat_b32(SV_VL4), base, offsets);
	return vfloat4(svget_neonq_f32(data));
#endif
}

/**
 * @brief Load a vector of gathered results from an array using byte indices from memory.
 */
template<>
ASTCENC_SIMD_INLINE vfloat4 gatherf_byte_inds<vfloat4>(const float* base, const uint8_t* indices)
{
#if ASTCENC_SVE == 0
	alignas(16) float vals[4];
	vals[0] = base[indices[0]];
	vals[1] = base[indices[1]];
	vals[2] = base[indices[2]];
	vals[3] = base[indices[3]];
	return vfloat4(vals);
#else
	svint32_t offsets = svld1ub_s32(svptrue_pat_b32(SV_VL4), indices);
	svfloat32_t data = svld1_gather_s32index_f32(svptrue_pat_b32(SV_VL4), base, offsets);
	return vfloat4(svget_neonq_f32(data));
#endif
}

/**
 * @brief Store a vector to an unaligned memory address.
 */
ASTCENC_SIMD_INLINE void store(vfloat4 a, float* p)
{
	vst1q_f32(p, a.m);
}

/**
 * @brief Store a vector to a 16B aligned memory address.
 */
ASTCENC_SIMD_INLINE void storea(vfloat4 a, float* p)
{
	vst1q_f32(p, a.m);
}

/**
 * @brief Return an integer value for a float vector, using truncation.
 */
ASTCENC_SIMD_INLINE vint4 float_to_int(vfloat4 a)
{
	return vint4(vcvtq_s32_f32(a.m));
}

/**
 * @brief Return an integer value for a float vector, using round-to-nearest.
 */
ASTCENC_SIMD_INLINE vint4 float_to_int_rtn(vfloat4 a)
{
	a = a + vfloat4(0.5f);
	return vint4(vcvtq_s32_f32(a.m));
}

/**
 * @brief Return a float value for an integer vector.
 */
ASTCENC_SIMD_INLINE vfloat4 int_to_float(vint4 a)
{
	return vfloat4(vcvtq_f32_s32(a.m));
}

/**
 * @brief Return a float16 value for a float vector, using round-to-nearest.
 */
ASTCENC_SIMD_INLINE vint4 float_to_float16(vfloat4 a)
{
	// Generate float16 value
	float16x4_t f16 = vcvt_f16_f32(a.m);

	// Convert each 16-bit float pattern to a 32-bit pattern
	uint16x4_t u16 = vreinterpret_u16_f16(f16);
	uint32x4_t u32 = vmovl_u16(u16);
	return vint4(vreinterpretq_s32_u32(u32));
}

/**
 * @brief Return a float16 value for a float scalar, using round-to-nearest.
 */
static inline uint16_t float_to_float16(float a)
{
	vfloat4 av(a);
	return static_cast<uint16_t>(float_to_float16(av).lane<0>());
}

/**
 * @brief Return a float value for a float16 vector.
 */
ASTCENC_SIMD_INLINE vfloat4 float16_to_float(vint4 a)
{
	// Convert each 32-bit pattern to a 16-bit pattern
	uint32x4_t u32 = vreinterpretq_u32_s32(a.m);
	uint16x4_t u16 = vmovn_u32(u32);
	float16x4_t f16 = vreinterpret_f16_u16(u16);

	// Generate float32 value
	return vfloat4(vcvt_f32_f16(f16));
}

/**
 * @brief Return a float value for a float16 scalar.
 */
ASTCENC_SIMD_INLINE float float16_to_float(uint16_t a)
{
	vint4 av(a);
	return float16_to_float(av).lane<0>();
}
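
// Illustrative usage sketch (not part of the library): the scalar helpers wrap
// the vector conversions, accepting and returning raw 16-bit patterns.
//
//     uint16_t h = float_to_float16(1.0f);   // 0x3C00: FP16 encoding of 1.0
//     float f = float16_to_float(h);          // 1.0f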

/**
 * @brief Return a float value as an integer bit pattern (i.e. no conversion).
 *
 * It is a common trick to convert floats into integer bit patterns, perform
 * some bit hackery based on knowledge they are IEEE 754 layout, and then
 * convert them back again. This is the first half of that flip.
 */
ASTCENC_SIMD_INLINE vint4 float_as_int(vfloat4 a)
{
	return vint4(vreinterpretq_s32_f32(a.m));
}

/**
 * @brief Return an integer value as a float bit pattern (i.e. no conversion).
 *
 * It is a common trick to convert floats into integer bit patterns, perform
 * some bit hackery based on knowledge they are IEEE 754 layout, and then
 * convert them back again. This is the second half of that flip.
 */
ASTCENC_SIMD_INLINE vfloat4 int_as_float(vint4 v)
{
	return vfloat4(vreinterpretq_f32_s32(v.m));
}
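
// Illustrative usage sketch (not part of the library): a typical use of the
// bit-pattern pair is clearing the IEEE 754 sign bit in the integer domain.
//
//     vfloat4 v(-2.0f);
//     vint4 bits = float_as_int(v) & vint4(0x7FFFFFFF);
//     vfloat4 absv = int_as_float(bits);   // all lanes == 2.0f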

/*
 * Table structure for a 16x 8-bit entry table.
 */
struct vtable4_16x8 {
	uint8x16_t t0;
};

/*
 * Table structure for a 32x 8-bit entry table.
 */
struct vtable4_32x8 {
	uint8x16x2_t t01;
};

/*
 * Table structure for a 64x 8-bit entry table.
 */
struct vtable4_64x8 {
	uint8x16x4_t t0123;
};

/**
 * @brief Prepare a vtable lookup table for a 16x 8-bit entry table.
 */
ASTCENC_SIMD_INLINE void vtable_prepare(
	vtable4_16x8& table,
	const uint8_t* data
) {
	table.t0 = vld1q_u8(data);
}

/**
 * @brief Prepare a vtable lookup table for a 32x 8-bit entry table.
 */
ASTCENC_SIMD_INLINE void vtable_prepare(
	vtable4_32x8& table,
	const uint8_t* data
) {
	table.t01 = uint8x16x2_t {
		vld1q_u8(data),
		vld1q_u8(data + 16)
	};
}

/**
 * @brief Prepare a vtable lookup table for a 64x 8-bit entry table.
 */
ASTCENC_SIMD_INLINE void vtable_prepare(
	vtable4_64x8& table,
	const uint8_t* data
) {
	table.t0123 = uint8x16x4_t {
		vld1q_u8(data),
		vld1q_u8(data + 16),
		vld1q_u8(data + 32),
		vld1q_u8(data + 48)
	};
}

/**
 * @brief Perform a vtable lookup in a 16x 8-bit table with 32-bit indices.
 */
ASTCENC_SIMD_INLINE vint4 vtable_lookup_32bit(
	const vtable4_16x8& tbl,
	vint4 idx
) {
	// Set index byte above max index for unused bytes so table lookup returns zero
	int32x4_t idx_masked = vorrq_s32(idx.m, vdupq_n_s32(0xFFFFFF00));
	uint8x16_t idx_bytes = vreinterpretq_u8_s32(idx_masked);

	return vint4(vreinterpretq_s32_u8(vqtbl1q_u8(tbl.t0, idx_bytes)));
}

/**
 * @brief Perform a vtable lookup in a 32x 8-bit table with 32-bit indices.
 */
ASTCENC_SIMD_INLINE vint4 vtable_lookup_32bit(
	const vtable4_32x8& tbl,
	vint4 idx
) {
	// Set index byte above max index for unused bytes so table lookup returns zero
	int32x4_t idx_masked = vorrq_s32(idx.m, vdupq_n_s32(0xFFFFFF00));
	uint8x16_t idx_bytes = vreinterpretq_u8_s32(idx_masked);

	return vint4(vreinterpretq_s32_u8(vqtbl2q_u8(tbl.t01, idx_bytes)));
}

/**
 * @brief Perform a vtable lookup in a 64x 8-bit table with 32-bit indices.
 */
ASTCENC_SIMD_INLINE vint4 vtable_lookup_32bit(
	const vtable4_64x8& tbl,
	vint4 idx
) {
	// Set index byte above max index for unused bytes so table lookup returns zero
	int32x4_t idx_masked = vorrq_s32(idx.m, vdupq_n_s32(0xFFFFFF00));
	uint8x16_t idx_bytes = vreinterpretq_u8_s32(idx_masked);

	return vint4(vreinterpretq_s32_u8(vqtbl4q_u8(tbl.t0123, idx_bytes)));
}
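
// Illustrative note (not part of the library): OR-ing each 32-bit index with
// 0xFFFFFF00 leaves only the low byte as a valid table index, so the TBL
// instruction writes the looked-up byte into the low byte of each lane and
// zeroes the upper three bytes.
//
//     alignas(16) const uint8_t lut[16] { 0, 10, 20, 30, 40, 50, 60, 70,
//                                         80, 90, 100, 110, 120, 130, 140, 150 };
//     vtable4_16x8 table;
//     vtable_prepare(table, lut);
//     vint4 v = vtable_lookup_32bit(table, vint4(0, 3, 5, 15));   // { 0, 30, 50, 150 }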

/**
 * @brief Return a vector of interleaved RGBA data.
 *
 * Input vectors have the value stored in the bottom 8 bits of each lane,
 * with high bits set to zero.
 *
 * Output vector stores a single RGBA texel packed in each lane.
 */
ASTCENC_SIMD_INLINE vint4 interleave_rgba8(vint4 r, vint4 g, vint4 b, vint4 a)
{
	return r + lsl<8>(g) + lsl<16>(b) + lsl<24>(a);
}

/**
 * @brief Store a single vector lane to an unaligned address.
 */
ASTCENC_SIMD_INLINE void store_lane(uint8_t* base, int data)
{
	std::memcpy(base, &data, sizeof(int));
}

/**
 * @brief Store a vector, skipping masked lanes.
 *
 * All masked lanes must be at the end of vector, after all non-masked lanes.
 */
ASTCENC_SIMD_INLINE void store_lanes_masked(uint8_t* base, vint4 data, vmask4 mask)
{
	if (mask.lane<3>())
	{
		store(data, base);
	}
	else if (mask.lane<2>())
	{
		store_lane(base + 0, data.lane<0>());
		store_lane(base + 4, data.lane<1>());
		store_lane(base + 8, data.lane<2>());
	}
	else if (mask.lane<1>())
	{
		store_lane(base + 0, data.lane<0>());
		store_lane(base + 4, data.lane<1>());
	}
	else if (mask.lane<0>())
	{
		store_lane(base + 0, data.lane<0>());
	}
}
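
// Illustrative usage sketch (not part of the library): the mask must be a
// prefix mask (enabled lanes first), e.g. built by comparing lane_id() against
// a remaining-element count when handling a partial tail. Here out_ptr is a
// hypothetical pointer to a writable uint8_t buffer of at least 16 bytes.
//
//     vint4 texels = interleave_rgba8(vint4(1), vint4(2), vint4(3), vint4(4));
//     vmask4 keep = vint4::lane_id() < vint4(3);   // lanes 0, 1, 2 enabled
//     store_lanes_masked(out_ptr, texels, keep);    // writes 12 of 16 bytes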

#define ASTCENC_USE_NATIVE_POPCOUNT 1

/**
 * @brief Population bit count.
 *
 * @param v The value to population count.
 *
 * @return The number of 1 bits.
 */
ASTCENC_SIMD_INLINE int popcount(uint64_t v)
{
	return static_cast<int>(vaddlv_u8(vcnt_u8(vcreate_u8(v))));
}

#endif // #ifndef ASTC_VECMATHLIB_NEON_4_H_INCLUDED