astcenc_vecmathlib_none_4.h 29 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586878889909192939495969798991001011021031041051061071081091101111121131141151161171181191201211221231241251261271281291301311321331341351361371381391401411421431441451461471481491501511521531541551561571581591601611621631641651661671681691701711721731741751761771781791801811821831841851861871881891901911921931941951961971981992002012022032042052062072082092102112122132142152162172182192202212222232242252262272282292302312322332342352362372382392402412422432442452462472482492502512522532542552562572582592602612622632642652662672682692702712722732742752762772782792802812822832842852862872882892902912922932942952962972982993003013023033043053063073083093103113123133143153163173183193203213223233243253263273283293303313323333343353363373383393403413423433443453463473483493503513523533543553563573583593603613623633643653663673683693703713723733743753763773783793803813823833843853863873883893903913923933943953963973983994004014024034044054064074084094104114124134144154164174184194204214224234244254264274284294304314324334344354364374384394404414424434444454464474484494504514524534544554564574584594604614624634644654664674684694704714724734744754764774784794804814824834844854864874884894904914924934944954964974984995005015025035045055065075085095105115125135145155165175185195205215225235245255265275285295305315325335345355365375385395405415425435445455465475485495505515525535545555565575585595605615625635645655665675685695705715725735745755765775785795805815825835845855865875885895905915925935945955965975985996006016026036046056066076086096106116126136146156166176186196206216226236246256266276286296306316326336346356366376386396406416426436446456466476486496506516526536546556566576586596606616626636646656666676686696706716726736746756766776786796806816826836846856866876886896906916926936946956966976986997007017027037047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213
  1. // SPDX-License-Identifier: Apache-2.0
  2. // ----------------------------------------------------------------------------
  3. // Copyright 2019-2024 Arm Limited
  4. //
  5. // Licensed under the Apache License, Version 2.0 (the "License"); you may not
  6. // use this file except in compliance with the License. You may obtain a copy
  7. // of the License at:
  8. //
  9. // http://www.apache.org/licenses/LICENSE-2.0
  10. //
  11. // Unless required by applicable law or agreed to in writing, software
  12. // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
  13. // WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
  14. // License for the specific language governing permissions and limitations
  15. // under the License.
  16. // ----------------------------------------------------------------------------
  17. /**
  18. * @brief 4x32-bit vectors, implemented using plain C++.
  19. *
  20. * This module implements 4-wide 32-bit float, int, and mask vectors. This
  21. * module provides a scalar fallback for VLA code, primarily useful for
  22. * debugging VLA algorithms without the complexity of handling SIMD. Only the
  23. * baseline level of functionality needed to support VLA is provided.
  24. *
  25. * Note that the vector conditional operators implemented by this module are
  26. * designed to behave like SIMD conditional operators that generate lane masks.
  27. * Rather than returning 0/1 booleans like normal C++ code they will return
  28. * 0/-1 to give a full lane-width bitmask.
  29. *
  30. * Note that the documentation for this module still talks about "vectors" to
  31. * help developers think about the implied VLA behavior when writing optimized
  32. * paths.
  33. */
  34. #ifndef ASTC_VECMATHLIB_NONE_4_H_INCLUDED
  35. #define ASTC_VECMATHLIB_NONE_4_H_INCLUDED
  36. #ifndef ASTCENC_SIMD_INLINE
  37. #error "Include astcenc_vecmathlib.h, do not include directly"
  38. #endif
  39. #include <algorithm>
  40. #include <cstdio>
  41. #include <cstring>
  42. #include <cfenv>
  43. // ============================================================================
  44. // vfloat4 data type
  45. // ============================================================================
  46. /**
  47. * @brief Data type for 4-wide floats.
  48. */
  49. struct vfloat4
  50. {
  51. /**
  52. * @brief Construct from zero-initialized value.
  53. */
  54. ASTCENC_SIMD_INLINE vfloat4() = default;
  55. /**
  56. * @brief Construct from 4 values loaded from an unaligned address.
  57. *
  58. * Consider using loada() which is better with wider VLA vectors if data is
  59. * aligned to vector length.
  60. */
  61. ASTCENC_SIMD_INLINE explicit vfloat4(const float* p)
  62. {
  63. m[0] = p[0];
  64. m[1] = p[1];
  65. m[2] = p[2];
  66. m[3] = p[3];
  67. }
  68. /**
  69. * @brief Construct from 4 scalar values replicated across all lanes.
  70. *
  71. * Consider using zero() for constexpr zeros.
  72. */
  73. ASTCENC_SIMD_INLINE explicit vfloat4(float a)
  74. {
  75. m[0] = a;
  76. m[1] = a;
  77. m[2] = a;
  78. m[3] = a;
  79. }
  80. /**
  81. * @brief Construct from 4 scalar values.
  82. *
  83. * The value of @c a is stored to lane 0 (LSB) in the SIMD register.
  84. */
  85. ASTCENC_SIMD_INLINE explicit vfloat4(float a, float b, float c, float d)
  86. {
  87. m[0] = a;
  88. m[1] = b;
  89. m[2] = c;
  90. m[3] = d;
  91. }
  92. /**
  93. * @brief Get the scalar value of a single lane.
  94. */
  95. template <int l> ASTCENC_SIMD_INLINE float lane() const
  96. {
  97. return m[l];
  98. }
  99. /**
  100. * @brief Set the scalar value of a single lane.
  101. */
  102. template <int l> ASTCENC_SIMD_INLINE void set_lane(float a)
  103. {
  104. m[l] = a;
  105. }
  106. /**
  107. * @brief Factory that returns a vector of zeros.
  108. */
  109. static ASTCENC_SIMD_INLINE vfloat4 zero()
  110. {
  111. return vfloat4(0.0f);
  112. }
  113. /**
  114. * @brief Factory that returns a replicated scalar loaded from memory.
  115. */
  116. static ASTCENC_SIMD_INLINE vfloat4 load1(const float* p)
  117. {
  118. return vfloat4(*p);
  119. }
  120. /**
  121. * @brief Factory that returns a vector loaded from aligned memory.
  122. */
  123. static ASTCENC_SIMD_INLINE vfloat4 loada(const float* p)
  124. {
  125. return vfloat4(p);
  126. }
  127. /**
  128. * @brief Factory that returns a vector containing the lane IDs.
  129. */
  130. static ASTCENC_SIMD_INLINE vfloat4 lane_id()
  131. {
  132. return vfloat4(0.0f, 1.0f, 2.0f, 3.0f);
  133. }
  134. /**
  135. * @brief Return a swizzled float 2.
  136. */
  137. template <int l0, int l1> ASTCENC_SIMD_INLINE vfloat4 swz() const
  138. {
  139. return vfloat4(lane<l0>(), lane<l1>(), 0.0f, 0.0f);
  140. }
  141. /**
  142. * @brief Return a swizzled float 3.
  143. */
  144. template <int l0, int l1, int l2> ASTCENC_SIMD_INLINE vfloat4 swz() const
  145. {
  146. return vfloat4(lane<l0>(), lane<l1>(), lane<l2>(), 0.0f);
  147. }
  148. /**
  149. * @brief Return a swizzled float 4.
  150. */
  151. template <int l0, int l1, int l2, int l3> ASTCENC_SIMD_INLINE vfloat4 swz() const
  152. {
  153. return vfloat4(lane<l0>(), lane<l1>(), lane<l2>(), lane<l3>());
  154. }
  155. /**
  156. * @brief The vector ...
  157. */
  158. float m[4];
  159. };
  160. // ============================================================================
  161. // vint4 data type
  162. // ============================================================================
  163. /**
  164. * @brief Data type for 4-wide ints.
  165. */
  166. struct vint4
  167. {
  168. /**
  169. * @brief Construct from zero-initialized value.
  170. */
  171. ASTCENC_SIMD_INLINE vint4() = default;
  172. /**
  173. * @brief Construct from 4 values loaded from an unaligned address.
  174. *
  175. * Consider using vint4::loada() which is better with wider VLA vectors
  176. * if data is aligned.
  177. */
  178. ASTCENC_SIMD_INLINE explicit vint4(const int* p)
  179. {
  180. m[0] = p[0];
  181. m[1] = p[1];
  182. m[2] = p[2];
  183. m[3] = p[3];
  184. }
  185. /**
  186. * @brief Construct from 4 uint8_t loaded from an unaligned address.
  187. */
  188. ASTCENC_SIMD_INLINE explicit vint4(const uint8_t *p)
  189. {
  190. m[0] = p[0];
  191. m[1] = p[1];
  192. m[2] = p[2];
  193. m[3] = p[3];
  194. }
  195. /**
  196. * @brief Construct from 4 scalar values.
  197. *
  198. * The value of @c a is stored to lane 0 (LSB) in the SIMD register.
  199. */
  200. ASTCENC_SIMD_INLINE explicit vint4(int a, int b, int c, int d)
  201. {
  202. m[0] = a;
  203. m[1] = b;
  204. m[2] = c;
  205. m[3] = d;
  206. }
  207. /**
  208. * @brief Construct from 4 scalar values replicated across all lanes.
  209. *
  210. * Consider using vint4::zero() for constexpr zeros.
  211. */
  212. ASTCENC_SIMD_INLINE explicit vint4(int a)
  213. {
  214. m[0] = a;
  215. m[1] = a;
  216. m[2] = a;
  217. m[3] = a;
  218. }
  219. /**
  220. * @brief Get the scalar value of a single lane.
  221. */
  222. template <int l> ASTCENC_SIMD_INLINE int lane() const
  223. {
  224. return m[l];
  225. }
  226. /**
  227. * @brief Set the scalar value of a single lane.
  228. */
  229. template <int l> ASTCENC_SIMD_INLINE void set_lane(int a)
  230. {
  231. m[l] = a;
  232. }
  233. /**
  234. * @brief Factory that returns a vector of zeros.
  235. */
  236. static ASTCENC_SIMD_INLINE vint4 zero()
  237. {
  238. return vint4(0);
  239. }
  240. /**
  241. * @brief Factory that returns a replicated scalar loaded from memory.
  242. */
  243. static ASTCENC_SIMD_INLINE vint4 load1(const int* p)
  244. {
  245. return vint4(*p);
  246. }
  247. /**
  248. * @brief Factory that returns a vector loaded from unaligned memory.
  249. */
  250. static ASTCENC_SIMD_INLINE vint4 load(const uint8_t* p)
  251. {
  252. vint4 data;
  253. std::memcpy(&data.m, p, 4 * sizeof(int));
  254. return data;
  255. }
  256. /**
  257. * @brief Factory that returns a vector loaded from 16B aligned memory.
  258. */
  259. static ASTCENC_SIMD_INLINE vint4 loada(const int* p)
  260. {
  261. return vint4(p);
  262. }
  263. /**
  264. * @brief Factory that returns a vector containing the lane IDs.
  265. */
  266. static ASTCENC_SIMD_INLINE vint4 lane_id()
  267. {
  268. return vint4(0, 1, 2, 3);
  269. }
  270. /**
  271. * @brief The vector ...
  272. */
  273. int m[4];
  274. };
  275. // ============================================================================
  276. // vmask4 data type
  277. // ============================================================================
  278. /**
  279. * @brief Data type for 4-wide control plane masks.
  280. */
  281. struct vmask4
  282. {
  283. /**
  284. * @brief Construct from an existing mask value.
  285. */
  286. ASTCENC_SIMD_INLINE explicit vmask4(int* p)
  287. {
  288. m[0] = p[0];
  289. m[1] = p[1];
  290. m[2] = p[2];
  291. m[3] = p[3];
  292. }
  293. /**
  294. * @brief Construct from 1 scalar value.
  295. */
  296. ASTCENC_SIMD_INLINE explicit vmask4(bool a)
  297. {
  298. m[0] = a == false ? 0 : -1;
  299. m[1] = a == false ? 0 : -1;
  300. m[2] = a == false ? 0 : -1;
  301. m[3] = a == false ? 0 : -1;
  302. }
  303. /**
  304. * @brief Construct from 4 scalar values.
  305. *
  306. * The value of @c a is stored to lane 0 (LSB) in the SIMD register.
  307. */
  308. ASTCENC_SIMD_INLINE explicit vmask4(bool a, bool b, bool c, bool d)
  309. {
  310. m[0] = a == false ? 0 : -1;
  311. m[1] = b == false ? 0 : -1;
  312. m[2] = c == false ? 0 : -1;
  313. m[3] = d == false ? 0 : -1;
  314. }
  315. /**
  316. * @brief Get the scalar value of a single lane.
  317. */
  318. template <int l> ASTCENC_SIMD_INLINE float lane() const
  319. {
  320. return m[l] != 0;
  321. }
  322. /**
  323. * @brief The vector ...
  324. */
  325. int m[4];
  326. };
  327. // ============================================================================
  328. // vmask4 operators and functions
  329. // ============================================================================
  330. /**
  331. * @brief Overload: mask union (or).
  332. */
  333. ASTCENC_SIMD_INLINE vmask4 operator|(vmask4 a, vmask4 b)
  334. {
  335. return vmask4(a.m[0] | b.m[0],
  336. a.m[1] | b.m[1],
  337. a.m[2] | b.m[2],
  338. a.m[3] | b.m[3]);
  339. }
  340. /**
  341. * @brief Overload: mask intersect (and).
  342. */
  343. ASTCENC_SIMD_INLINE vmask4 operator&(vmask4 a, vmask4 b)
  344. {
  345. return vmask4(a.m[0] & b.m[0],
  346. a.m[1] & b.m[1],
  347. a.m[2] & b.m[2],
  348. a.m[3] & b.m[3]);
  349. }
  350. /**
  351. * @brief Overload: mask difference (xor).
  352. */
  353. ASTCENC_SIMD_INLINE vmask4 operator^(vmask4 a, vmask4 b)
  354. {
  355. return vmask4(a.m[0] ^ b.m[0],
  356. a.m[1] ^ b.m[1],
  357. a.m[2] ^ b.m[2],
  358. a.m[3] ^ b.m[3]);
  359. }
  360. /**
  361. * @brief Overload: mask invert (not).
  362. */
  363. ASTCENC_SIMD_INLINE vmask4 operator~(vmask4 a)
  364. {
  365. return vmask4(~a.m[0],
  366. ~a.m[1],
  367. ~a.m[2],
  368. ~a.m[3]);
  369. }
  370. /**
  371. * @brief Return a 1-bit mask code indicating mask status.
  372. *
  373. * bit0 = lane 0
  374. */
  375. ASTCENC_SIMD_INLINE unsigned int mask(vmask4 a)
  376. {
  377. return ((a.m[0] >> 31) & 0x1) |
  378. ((a.m[1] >> 30) & 0x2) |
  379. ((a.m[2] >> 29) & 0x4) |
  380. ((a.m[3] >> 28) & 0x8);
  381. }
  382. // ============================================================================
  383. // vint4 operators and functions
  384. // ============================================================================
  385. /**
  386. * @brief Overload: vector by vector addition.
  387. */
  388. ASTCENC_SIMD_INLINE vint4 operator+(vint4 a, vint4 b)
  389. {
  390. return vint4(a.m[0] + b.m[0],
  391. a.m[1] + b.m[1],
  392. a.m[2] + b.m[2],
  393. a.m[3] + b.m[3]);
  394. }
  395. /**
  396. * @brief Overload: vector by vector subtraction.
  397. */
  398. ASTCENC_SIMD_INLINE vint4 operator-(vint4 a, vint4 b)
  399. {
  400. return vint4(a.m[0] - b.m[0],
  401. a.m[1] - b.m[1],
  402. a.m[2] - b.m[2],
  403. a.m[3] - b.m[3]);
  404. }
  405. /**
  406. * @brief Overload: vector by vector multiplication.
  407. */
  408. ASTCENC_SIMD_INLINE vint4 operator*(vint4 a, vint4 b)
  409. {
  410. return vint4(a.m[0] * b.m[0],
  411. a.m[1] * b.m[1],
  412. a.m[2] * b.m[2],
  413. a.m[3] * b.m[3]);
  414. }
  415. /**
  416. * @brief Overload: vector bit invert.
  417. */
  418. ASTCENC_SIMD_INLINE vint4 operator~(vint4 a)
  419. {
  420. return vint4(~a.m[0],
  421. ~a.m[1],
  422. ~a.m[2],
  423. ~a.m[3]);
  424. }
  425. /**
  426. * @brief Overload: vector by vector bitwise or.
  427. */
  428. ASTCENC_SIMD_INLINE vint4 operator|(vint4 a, vint4 b)
  429. {
  430. return vint4(a.m[0] | b.m[0],
  431. a.m[1] | b.m[1],
  432. a.m[2] | b.m[2],
  433. a.m[3] | b.m[3]);
  434. }
  435. /**
  436. * @brief Overload: vector by vector bitwise and.
  437. */
  438. ASTCENC_SIMD_INLINE vint4 operator&(vint4 a, vint4 b)
  439. {
  440. return vint4(a.m[0] & b.m[0],
  441. a.m[1] & b.m[1],
  442. a.m[2] & b.m[2],
  443. a.m[3] & b.m[3]);
  444. }
  445. /**
  446. * @brief Overload: vector by vector bitwise xor.
  447. */
  448. ASTCENC_SIMD_INLINE vint4 operator^(vint4 a, vint4 b)
  449. {
  450. return vint4(a.m[0] ^ b.m[0],
  451. a.m[1] ^ b.m[1],
  452. a.m[2] ^ b.m[2],
  453. a.m[3] ^ b.m[3]);
  454. }
  455. /**
  456. * @brief Overload: vector by vector equality.
  457. */
  458. ASTCENC_SIMD_INLINE vmask4 operator==(vint4 a, vint4 b)
  459. {
  460. return vmask4(a.m[0] == b.m[0],
  461. a.m[1] == b.m[1],
  462. a.m[2] == b.m[2],
  463. a.m[3] == b.m[3]);
  464. }
  465. /**
  466. * @brief Overload: vector by vector inequality.
  467. */
  468. ASTCENC_SIMD_INLINE vmask4 operator!=(vint4 a, vint4 b)
  469. {
  470. return vmask4(a.m[0] != b.m[0],
  471. a.m[1] != b.m[1],
  472. a.m[2] != b.m[2],
  473. a.m[3] != b.m[3]);
  474. }
  475. /**
  476. * @brief Overload: vector by vector less than.
  477. */
  478. ASTCENC_SIMD_INLINE vmask4 operator<(vint4 a, vint4 b)
  479. {
  480. return vmask4(a.m[0] < b.m[0],
  481. a.m[1] < b.m[1],
  482. a.m[2] < b.m[2],
  483. a.m[3] < b.m[3]);
  484. }
  485. /**
  486. * @brief Overload: vector by vector greater than.
  487. */
  488. ASTCENC_SIMD_INLINE vmask4 operator>(vint4 a, vint4 b)
  489. {
  490. return vmask4(a.m[0] > b.m[0],
  491. a.m[1] > b.m[1],
  492. a.m[2] > b.m[2],
  493. a.m[3] > b.m[3]);
  494. }
  495. /**
  496. * @brief Logical shift left.
  497. */
  498. template <int s> ASTCENC_SIMD_INLINE vint4 lsl(vint4 a)
  499. {
  500. // Cast to unsigned to avoid shift in/out of sign bit undefined behavior
  501. unsigned int as0 = static_cast<unsigned int>(a.m[0]) << s;
  502. unsigned int as1 = static_cast<unsigned int>(a.m[1]) << s;
  503. unsigned int as2 = static_cast<unsigned int>(a.m[2]) << s;
  504. unsigned int as3 = static_cast<unsigned int>(a.m[3]) << s;
  505. return vint4(static_cast<int>(as0),
  506. static_cast<int>(as1),
  507. static_cast<int>(as2),
  508. static_cast<int>(as3));
  509. }
  510. /**
  511. * @brief Logical shift right.
  512. */
  513. template <int s> ASTCENC_SIMD_INLINE vint4 lsr(vint4 a)
  514. {
  515. // Cast to unsigned to avoid shift in/out of sign bit undefined behavior
  516. unsigned int as0 = static_cast<unsigned int>(a.m[0]) >> s;
  517. unsigned int as1 = static_cast<unsigned int>(a.m[1]) >> s;
  518. unsigned int as2 = static_cast<unsigned int>(a.m[2]) >> s;
  519. unsigned int as3 = static_cast<unsigned int>(a.m[3]) >> s;
  520. return vint4(static_cast<int>(as0),
  521. static_cast<int>(as1),
  522. static_cast<int>(as2),
  523. static_cast<int>(as3));
  524. }
  525. /**
  526. * @brief Arithmetic shift right.
  527. */
  528. template <int s> ASTCENC_SIMD_INLINE vint4 asr(vint4 a)
  529. {
  530. return vint4(a.m[0] >> s,
  531. a.m[1] >> s,
  532. a.m[2] >> s,
  533. a.m[3] >> s);
  534. }
  535. /**
  536. * @brief Return the min vector of two vectors.
  537. */
  538. ASTCENC_SIMD_INLINE vint4 min(vint4 a, vint4 b)
  539. {
  540. return vint4(a.m[0] < b.m[0] ? a.m[0] : b.m[0],
  541. a.m[1] < b.m[1] ? a.m[1] : b.m[1],
  542. a.m[2] < b.m[2] ? a.m[2] : b.m[2],
  543. a.m[3] < b.m[3] ? a.m[3] : b.m[3]);
  544. }
  545. /**
  546. * @brief Return the min vector of two vectors.
  547. */
  548. ASTCENC_SIMD_INLINE vint4 max(vint4 a, vint4 b)
  549. {
  550. return vint4(a.m[0] > b.m[0] ? a.m[0] : b.m[0],
  551. a.m[1] > b.m[1] ? a.m[1] : b.m[1],
  552. a.m[2] > b.m[2] ? a.m[2] : b.m[2],
  553. a.m[3] > b.m[3] ? a.m[3] : b.m[3]);
  554. }
  555. /**
  556. * @brief Return the horizontal minimum of a single vector.
  557. */
  558. ASTCENC_SIMD_INLINE vint4 hmin(vint4 a)
  559. {
  560. int b = std::min(a.m[0], a.m[1]);
  561. int c = std::min(a.m[2], a.m[3]);
  562. return vint4(std::min(b, c));
  563. }
  564. /**
  565. * @brief Return the horizontal maximum of a single vector.
  566. */
  567. ASTCENC_SIMD_INLINE vint4 hmax(vint4 a)
  568. {
  569. int b = std::max(a.m[0], a.m[1]);
  570. int c = std::max(a.m[2], a.m[3]);
  571. return vint4(std::max(b, c));
  572. }
  573. /**
  574. * @brief Return the horizontal sum of vector lanes as a scalar.
  575. */
  576. ASTCENC_SIMD_INLINE int hadd_s(vint4 a)
  577. {
  578. return a.m[0] + a.m[1] + a.m[2] + a.m[3];
  579. }
  580. /**
  581. * @brief Store a vector to an aligned memory address.
  582. */
  583. ASTCENC_SIMD_INLINE void storea(vint4 a, int* p)
  584. {
  585. p[0] = a.m[0];
  586. p[1] = a.m[1];
  587. p[2] = a.m[2];
  588. p[3] = a.m[3];
  589. }
  590. /**
  591. * @brief Store a vector to an unaligned memory address.
  592. */
  593. ASTCENC_SIMD_INLINE void store(vint4 a, int* p)
  594. {
  595. p[0] = a.m[0];
  596. p[1] = a.m[1];
  597. p[2] = a.m[2];
  598. p[3] = a.m[3];
  599. }
  600. /**
  601. * @brief Store a vector to an unaligned memory address.
  602. */
  603. ASTCENC_SIMD_INLINE void store(vint4 a, uint8_t* p)
  604. {
  605. std::memcpy(p, a.m, sizeof(int) * 4);
  606. }
  607. /**
  608. * @brief Store lowest N (vector width) bytes into an unaligned address.
  609. */
  610. ASTCENC_SIMD_INLINE void store_nbytes(vint4 a, uint8_t* p)
  611. {
  612. std::memcpy(p, a.m, sizeof(uint8_t) * 4);
  613. }
  614. /**
  615. * @brief Gather N (vector width) indices from the array.
  616. */
  617. ASTCENC_SIMD_INLINE vint4 gatheri(const int* base, vint4 indices)
  618. {
  619. return vint4(base[indices.m[0]],
  620. base[indices.m[1]],
  621. base[indices.m[2]],
  622. base[indices.m[3]]);
  623. }
  624. /**
  625. * @brief Pack low 8 bits of N (vector width) lanes into bottom of vector.
  626. */
  627. ASTCENC_SIMD_INLINE vint4 pack_low_bytes(vint4 a)
  628. {
  629. int b0 = a.m[0] & 0xFF;
  630. int b1 = a.m[1] & 0xFF;
  631. int b2 = a.m[2] & 0xFF;
  632. int b3 = a.m[3] & 0xFF;
  633. int b = b0 | (b1 << 8) | (b2 << 16) | (b3 << 24);
  634. return vint4(b, 0, 0, 0);
  635. }
  636. /**
  637. * @brief Return lanes from @c b if MSB of @c cond is set, else @c a.
  638. */
  639. ASTCENC_SIMD_INLINE vint4 select(vint4 a, vint4 b, vmask4 cond)
  640. {
  641. return vint4((cond.m[0] & static_cast<int>(0x80000000)) ? b.m[0] : a.m[0],
  642. (cond.m[1] & static_cast<int>(0x80000000)) ? b.m[1] : a.m[1],
  643. (cond.m[2] & static_cast<int>(0x80000000)) ? b.m[2] : a.m[2],
  644. (cond.m[3] & static_cast<int>(0x80000000)) ? b.m[3] : a.m[3]);
  645. }
  646. // ============================================================================
  647. // vfloat4 operators and functions
  648. // ============================================================================
  649. /**
  650. * @brief Overload: vector by vector addition.
  651. */
  652. ASTCENC_SIMD_INLINE vfloat4 operator+(vfloat4 a, vfloat4 b)
  653. {
  654. return vfloat4(a.m[0] + b.m[0],
  655. a.m[1] + b.m[1],
  656. a.m[2] + b.m[2],
  657. a.m[3] + b.m[3]);
  658. }
  659. /**
  660. * @brief Overload: vector by vector subtraction.
  661. */
  662. ASTCENC_SIMD_INLINE vfloat4 operator-(vfloat4 a, vfloat4 b)
  663. {
  664. return vfloat4(a.m[0] - b.m[0],
  665. a.m[1] - b.m[1],
  666. a.m[2] - b.m[2],
  667. a.m[3] - b.m[3]);
  668. }
  669. /**
  670. * @brief Overload: vector by vector multiplication.
  671. */
  672. ASTCENC_SIMD_INLINE vfloat4 operator*(vfloat4 a, vfloat4 b)
  673. {
  674. return vfloat4(a.m[0] * b.m[0],
  675. a.m[1] * b.m[1],
  676. a.m[2] * b.m[2],
  677. a.m[3] * b.m[3]);
  678. }
  679. /**
  680. * @brief Overload: vector by vector division.
  681. */
  682. ASTCENC_SIMD_INLINE vfloat4 operator/(vfloat4 a, vfloat4 b)
  683. {
  684. return vfloat4(a.m[0] / b.m[0],
  685. a.m[1] / b.m[1],
  686. a.m[2] / b.m[2],
  687. a.m[3] / b.m[3]);
  688. }
  689. /**
  690. * @brief Overload: vector by vector equality.
  691. */
  692. ASTCENC_SIMD_INLINE vmask4 operator==(vfloat4 a, vfloat4 b)
  693. {
  694. return vmask4(a.m[0] == b.m[0],
  695. a.m[1] == b.m[1],
  696. a.m[2] == b.m[2],
  697. a.m[3] == b.m[3]);
  698. }
  699. /**
  700. * @brief Overload: vector by vector inequality.
  701. */
  702. ASTCENC_SIMD_INLINE vmask4 operator!=(vfloat4 a, vfloat4 b)
  703. {
  704. return vmask4(a.m[0] != b.m[0],
  705. a.m[1] != b.m[1],
  706. a.m[2] != b.m[2],
  707. a.m[3] != b.m[3]);
  708. }
  709. /**
  710. * @brief Overload: vector by vector less than.
  711. */
  712. ASTCENC_SIMD_INLINE vmask4 operator<(vfloat4 a, vfloat4 b)
  713. {
  714. return vmask4(a.m[0] < b.m[0],
  715. a.m[1] < b.m[1],
  716. a.m[2] < b.m[2],
  717. a.m[3] < b.m[3]);
  718. }
  719. /**
  720. * @brief Overload: vector by vector greater than.
  721. */
  722. ASTCENC_SIMD_INLINE vmask4 operator>(vfloat4 a, vfloat4 b)
  723. {
  724. return vmask4(a.m[0] > b.m[0],
  725. a.m[1] > b.m[1],
  726. a.m[2] > b.m[2],
  727. a.m[3] > b.m[3]);
  728. }
  729. /**
  730. * @brief Overload: vector by vector less than or equal.
  731. */
  732. ASTCENC_SIMD_INLINE vmask4 operator<=(vfloat4 a, vfloat4 b)
  733. {
  734. return vmask4(a.m[0] <= b.m[0],
  735. a.m[1] <= b.m[1],
  736. a.m[2] <= b.m[2],
  737. a.m[3] <= b.m[3]);
  738. }
  739. /**
  740. * @brief Overload: vector by vector greater than or equal.
  741. */
  742. ASTCENC_SIMD_INLINE vmask4 operator>=(vfloat4 a, vfloat4 b)
  743. {
  744. return vmask4(a.m[0] >= b.m[0],
  745. a.m[1] >= b.m[1],
  746. a.m[2] >= b.m[2],
  747. a.m[3] >= b.m[3]);
  748. }
  749. /**
  750. * @brief Return the min vector of two vectors.
  751. *
  752. * If either lane value is NaN, @c b will be returned for that lane.
  753. */
  754. ASTCENC_SIMD_INLINE vfloat4 min(vfloat4 a, vfloat4 b)
  755. {
  756. return vfloat4(a.m[0] < b.m[0] ? a.m[0] : b.m[0],
  757. a.m[1] < b.m[1] ? a.m[1] : b.m[1],
  758. a.m[2] < b.m[2] ? a.m[2] : b.m[2],
  759. a.m[3] < b.m[3] ? a.m[3] : b.m[3]);
  760. }
  761. /**
  762. * @brief Return the max vector of two vectors.
  763. *
  764. * If either lane value is NaN, @c b will be returned for that lane.
  765. */
  766. ASTCENC_SIMD_INLINE vfloat4 max(vfloat4 a, vfloat4 b)
  767. {
  768. return vfloat4(a.m[0] > b.m[0] ? a.m[0] : b.m[0],
  769. a.m[1] > b.m[1] ? a.m[1] : b.m[1],
  770. a.m[2] > b.m[2] ? a.m[2] : b.m[2],
  771. a.m[3] > b.m[3] ? a.m[3] : b.m[3]);
  772. }
  773. /**
  774. * @brief Return the absolute value of the float vector.
  775. */
  776. ASTCENC_SIMD_INLINE vfloat4 abs(vfloat4 a)
  777. {
  778. return vfloat4(std::abs(a.m[0]),
  779. std::abs(a.m[1]),
  780. std::abs(a.m[2]),
  781. std::abs(a.m[3]));
  782. }
  783. /**
  784. * @brief Return a float rounded to the nearest integer value.
  785. */
  786. ASTCENC_SIMD_INLINE vfloat4 round(vfloat4 a)
  787. {
  788. assert(std::fegetround() == FE_TONEAREST);
  789. return vfloat4(std::nearbyint(a.m[0]),
  790. std::nearbyint(a.m[1]),
  791. std::nearbyint(a.m[2]),
  792. std::nearbyint(a.m[3]));
  793. }
  794. /**
  795. * @brief Return the horizontal minimum of a vector.
  796. */
  797. ASTCENC_SIMD_INLINE vfloat4 hmin(vfloat4 a)
  798. {
  799. float tmp1 = std::min(a.m[0], a.m[1]);
  800. float tmp2 = std::min(a.m[2], a.m[3]);
  801. return vfloat4(std::min(tmp1, tmp2));
  802. }
  803. /**
  804. * @brief Return the horizontal maximum of a vector.
  805. */
  806. ASTCENC_SIMD_INLINE vfloat4 hmax(vfloat4 a)
  807. {
  808. float tmp1 = std::max(a.m[0], a.m[1]);
  809. float tmp2 = std::max(a.m[2], a.m[3]);
  810. return vfloat4(std::max(tmp1, tmp2));
  811. }
  812. /**
  813. * @brief Return the horizontal sum of a vector.
  814. */
  815. ASTCENC_SIMD_INLINE float hadd_s(vfloat4 a)
  816. {
  817. // Use halving add, gives invariance with SIMD versions
  818. return (a.m[0] + a.m[2]) + (a.m[1] + a.m[3]);
  819. }
  820. /**
  821. * @brief Return the sqrt of the lanes in the vector.
  822. */
  823. ASTCENC_SIMD_INLINE vfloat4 sqrt(vfloat4 a)
  824. {
  825. return vfloat4(std::sqrt(a.m[0]),
  826. std::sqrt(a.m[1]),
  827. std::sqrt(a.m[2]),
  828. std::sqrt(a.m[3]));
  829. }
  830. /**
  831. * @brief Return lanes from @c b if @c cond is set, else @c a.
  832. */
  833. ASTCENC_SIMD_INLINE vfloat4 select(vfloat4 a, vfloat4 b, vmask4 cond)
  834. {
  835. return vfloat4((cond.m[0] & static_cast<int>(0x80000000)) ? b.m[0] : a.m[0],
  836. (cond.m[1] & static_cast<int>(0x80000000)) ? b.m[1] : a.m[1],
  837. (cond.m[2] & static_cast<int>(0x80000000)) ? b.m[2] : a.m[2],
  838. (cond.m[3] & static_cast<int>(0x80000000)) ? b.m[3] : a.m[3]);
  839. }
  840. /**
  841. * @brief Return lanes from @c b if MSB of @c cond is set, else @c a.
  842. */
  843. ASTCENC_SIMD_INLINE vfloat4 select_msb(vfloat4 a, vfloat4 b, vmask4 cond)
  844. {
  845. return vfloat4((cond.m[0] & static_cast<int>(0x80000000)) ? b.m[0] : a.m[0],
  846. (cond.m[1] & static_cast<int>(0x80000000)) ? b.m[1] : a.m[1],
  847. (cond.m[2] & static_cast<int>(0x80000000)) ? b.m[2] : a.m[2],
  848. (cond.m[3] & static_cast<int>(0x80000000)) ? b.m[3] : a.m[3]);
  849. }
  850. /**
  851. * @brief Load a vector of gathered results from an array;
  852. */
  853. ASTCENC_SIMD_INLINE vfloat4 gatherf(const float* base, vint4 indices)
  854. {
  855. return vfloat4(base[indices.m[0]],
  856. base[indices.m[1]],
  857. base[indices.m[2]],
  858. base[indices.m[3]]);
  859. }
  860. /**
  861. * @brief Store a vector to an unaligned memory address.
  862. */
  863. ASTCENC_SIMD_INLINE void store(vfloat4 a, float* ptr)
  864. {
  865. ptr[0] = a.m[0];
  866. ptr[1] = a.m[1];
  867. ptr[2] = a.m[2];
  868. ptr[3] = a.m[3];
  869. }
  870. /**
  871. * @brief Store a vector to an aligned memory address.
  872. */
  873. ASTCENC_SIMD_INLINE void storea(vfloat4 a, float* ptr)
  874. {
  875. ptr[0] = a.m[0];
  876. ptr[1] = a.m[1];
  877. ptr[2] = a.m[2];
  878. ptr[3] = a.m[3];
  879. }
  880. /**
  881. * @brief Return a integer value for a float vector, using truncation.
  882. */
  883. ASTCENC_SIMD_INLINE vint4 float_to_int(vfloat4 a)
  884. {
  885. return vint4(static_cast<int>(a.m[0]),
  886. static_cast<int>(a.m[1]),
  887. static_cast<int>(a.m[2]),
  888. static_cast<int>(a.m[3]));
  889. }
  890. /**f
  891. * @brief Return a integer value for a float vector, using round-to-nearest.
  892. */
  893. ASTCENC_SIMD_INLINE vint4 float_to_int_rtn(vfloat4 a)
  894. {
  895. a = a + vfloat4(0.5f);
  896. return vint4(static_cast<int>(a.m[0]),
  897. static_cast<int>(a.m[1]),
  898. static_cast<int>(a.m[2]),
  899. static_cast<int>(a.m[3]));
  900. }
  901. /**
  902. * @brief Return a float value for a integer vector.
  903. */
  904. ASTCENC_SIMD_INLINE vfloat4 int_to_float(vint4 a)
  905. {
  906. return vfloat4(static_cast<float>(a.m[0]),
  907. static_cast<float>(a.m[1]),
  908. static_cast<float>(a.m[2]),
  909. static_cast<float>(a.m[3]));
  910. }
  911. /**
  912. * @brief Return a float16 value for a float vector, using round-to-nearest.
  913. */
  914. ASTCENC_SIMD_INLINE vint4 float_to_float16(vfloat4 a)
  915. {
  916. return vint4(
  917. float_to_sf16(a.lane<0>()),
  918. float_to_sf16(a.lane<1>()),
  919. float_to_sf16(a.lane<2>()),
  920. float_to_sf16(a.lane<3>()));
  921. }
  922. /**
  923. * @brief Return a float16 value for a float scalar, using round-to-nearest.
  924. */
  925. static inline uint16_t float_to_float16(float a)
  926. {
  927. return float_to_sf16(a);
  928. }
  929. /**
  930. * @brief Return a float value for a float16 vector.
  931. */
  932. ASTCENC_SIMD_INLINE vfloat4 float16_to_float(vint4 a)
  933. {
  934. return vfloat4(
  935. sf16_to_float(static_cast<uint16_t>(a.lane<0>())),
  936. sf16_to_float(static_cast<uint16_t>(a.lane<1>())),
  937. sf16_to_float(static_cast<uint16_t>(a.lane<2>())),
  938. sf16_to_float(static_cast<uint16_t>(a.lane<3>())));
  939. }
  940. /**
  941. * @brief Return a float value for a float16 scalar.
  942. */
  943. ASTCENC_SIMD_INLINE float float16_to_float(uint16_t a)
  944. {
  945. return sf16_to_float(a);
  946. }
  947. /**
  948. * @brief Return a float value as an integer bit pattern (i.e. no conversion).
  949. *
  950. * It is a common trick to convert floats into integer bit patterns, perform
  951. * some bit hackery based on knowledge they are IEEE 754 layout, and then
  952. * convert them back again. This is the first half of that flip.
  953. */
  954. ASTCENC_SIMD_INLINE vint4 float_as_int(vfloat4 a)
  955. {
  956. vint4 r;
  957. std::memcpy(r.m, a.m, 4 * 4);
  958. return r;
  959. }
  960. /**
  961. * @brief Return a integer value as a float bit pattern (i.e. no conversion).
  962. *
  963. * It is a common trick to convert floats into integer bit patterns, perform
  964. * some bit hackery based on knowledge they are IEEE 754 layout, and then
  965. * convert them back again. This is the second half of that flip.
  966. */
  967. ASTCENC_SIMD_INLINE vfloat4 int_as_float(vint4 a)
  968. {
  969. vfloat4 r;
  970. std::memcpy(r.m, a.m, 4 * 4);
  971. return r;
  972. }
  973. /**
  974. * @brief Prepare a vtable lookup table for use with the native SIMD size.
  975. */
  976. ASTCENC_SIMD_INLINE void vtable_prepare(vint4 t0, vint4& t0p)
  977. {
  978. t0p = t0;
  979. }
  980. /**
  981. * @brief Prepare a vtable lookup table for use with the native SIMD size.
  982. */
  983. ASTCENC_SIMD_INLINE void vtable_prepare(vint4 t0, vint4 t1, vint4& t0p, vint4& t1p)
  984. {
  985. t0p = t0;
  986. t1p = t1;
  987. }
  988. /**
  989. * @brief Prepare a vtable lookup table for use with the native SIMD size.
  990. */
  991. ASTCENC_SIMD_INLINE void vtable_prepare(
  992. vint4 t0, vint4 t1, vint4 t2, vint4 t3,
  993. vint4& t0p, vint4& t1p, vint4& t2p, vint4& t3p)
  994. {
  995. t0p = t0;
  996. t1p = t1;
  997. t2p = t2;
  998. t3p = t3;
  999. }
  1000. /**
  1001. * @brief Perform an 8-bit 16-entry table lookup, with 32-bit indexes.
  1002. */
  1003. ASTCENC_SIMD_INLINE vint4 vtable_8bt_32bi(vint4 t0, vint4 idx)
  1004. {
  1005. uint8_t table[16];
  1006. std::memcpy(table + 0, t0.m, 4 * sizeof(int));
  1007. return vint4(table[idx.lane<0>()],
  1008. table[idx.lane<1>()],
  1009. table[idx.lane<2>()],
  1010. table[idx.lane<3>()]);
  1011. }
  1012. /**
  1013. * @brief Perform an 8-bit 32-entry table lookup, with 32-bit indexes.
  1014. */
  1015. ASTCENC_SIMD_INLINE vint4 vtable_8bt_32bi(vint4 t0, vint4 t1, vint4 idx)
  1016. {
  1017. uint8_t table[32];
  1018. std::memcpy(table + 0, t0.m, 4 * sizeof(int));
  1019. std::memcpy(table + 16, t1.m, 4 * sizeof(int));
  1020. return vint4(table[idx.lane<0>()],
  1021. table[idx.lane<1>()],
  1022. table[idx.lane<2>()],
  1023. table[idx.lane<3>()]);
  1024. }
  1025. /**
  1026. * @brief Perform an 8-bit 64-entry table lookup, with 32-bit indexes.
  1027. */
  1028. ASTCENC_SIMD_INLINE vint4 vtable_8bt_32bi(vint4 t0, vint4 t1, vint4 t2, vint4 t3, vint4 idx)
  1029. {
  1030. uint8_t table[64];
  1031. std::memcpy(table + 0, t0.m, 4 * sizeof(int));
  1032. std::memcpy(table + 16, t1.m, 4 * sizeof(int));
  1033. std::memcpy(table + 32, t2.m, 4 * sizeof(int));
  1034. std::memcpy(table + 48, t3.m, 4 * sizeof(int));
  1035. return vint4(table[idx.lane<0>()],
  1036. table[idx.lane<1>()],
  1037. table[idx.lane<2>()],
  1038. table[idx.lane<3>()]);
  1039. }
  1040. /**
  1041. * @brief Return a vector of interleaved RGBA data.
  1042. *
  1043. * Input vectors have the value stored in the bottom 8 bits of each lane,
  1044. * with high bits set to zero.
  1045. *
  1046. * Output vector stores a single RGBA texel packed in each lane.
  1047. */
  1048. ASTCENC_SIMD_INLINE vint4 interleave_rgba8(vint4 r, vint4 g, vint4 b, vint4 a)
  1049. {
  1050. return r + lsl<8>(g) + lsl<16>(b) + lsl<24>(a);
  1051. }
  1052. /**
  1053. * @brief Store a single vector lane to an unaligned address.
  1054. */
  1055. ASTCENC_SIMD_INLINE void store_lane(uint8_t* base, int data)
  1056. {
  1057. std::memcpy(base, &data, sizeof(int));
  1058. }
  1059. /**
  1060. * @brief Store a vector, skipping masked lanes.
  1061. *
  1062. * All masked lanes must be at the end of vector, after all non-masked lanes.
  1063. * Input is a byte array of at least 4 bytes per unmasked entry.
  1064. */
  1065. ASTCENC_SIMD_INLINE void store_lanes_masked(uint8_t* base, vint4 data, vmask4 mask)
  1066. {
  1067. if (mask.m[3])
  1068. {
  1069. store(data, base);
  1070. }
  1071. else if (mask.m[2])
  1072. {
  1073. store_lane(base + 0, data.lane<0>());
  1074. store_lane(base + 4, data.lane<1>());
  1075. store_lane(base + 8, data.lane<2>());
  1076. }
  1077. else if (mask.m[1])
  1078. {
  1079. store_lane(base + 0, data.lane<0>());
  1080. store_lane(base + 4, data.lane<1>());
  1081. }
  1082. else if (mask.m[0])
  1083. {
  1084. store_lane(base + 0, data.lane<0>());
  1085. }
  1086. }
  1087. #endif // #ifndef ASTC_VECMATHLIB_NONE_4_H_INCLUDED