// SPDX-License-Identifier: Apache-2.0
// ----------------------------------------------------------------------------
// Copyright 2019-2023 Arm Limited
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
// use this file except in compliance with the License. You may obtain a copy
// of the License at:
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
// License for the specific language governing permissions and limitations
// under the License.
// ----------------------------------------------------------------------------

/**
 * @brief 4x32-bit vectors, implemented using plain C++.
 *
 * This module implements 4-wide 32-bit float, int, and mask vectors. This
 * module provides a scalar fallback for VLA code, primarily useful for
 * debugging VLA algorithms without the complexity of handling SIMD. Only the
 * baseline level of functionality needed to support VLA is provided.
 *
 * Note that the vector conditional operators implemented by this module are
 * designed to behave like SIMD conditional operators that generate lane masks.
 * Rather than returning 0/1 booleans like normal C++ code they will return
 * 0/-1 to give a full lane-width bitmask.
 *
 * Note that the documentation for this module still talks about "vectors" to
 * help developers think about the implied VLA behavior when writing optimized
 * paths.
 */
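
// Illustrative usage sketch (not part of the library itself): because the
// comparison operators below return 0/-1 lane masks rather than 0/1 booleans,
// their results can feed straight into select() to blend lanes. Only types
// and functions defined later in this header are used here.
//
//     vfloat4 a(1.0f, 2.0f, 3.0f, 4.0f);
//     vfloat4 b(4.0f, 3.0f, 2.0f, 1.0f);
//     vmask4 lt = a < b;               // lanes: -1, -1, 0, 0
//     vfloat4 lo = select(b, a, lt);   // picks a where lt is set: 1, 2, 2, 1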

#ifndef ASTC_VECMATHLIB_NONE_4_H_INCLUDED
#define ASTC_VECMATHLIB_NONE_4_H_INCLUDED

#ifndef ASTCENC_SIMD_INLINE
    #error "Include astcenc_vecmathlib.h, do not include directly"
#endif

#include <algorithm>
#include <cstdio>
#include <cstring>
#include <cfenv>

// ============================================================================
// vfloat4 data type
// ============================================================================

/**
 * @brief Data type for 4-wide floats.
 */
struct vfloat4
{
    /**
     * @brief Construct from zero-initialized value.
     */
    ASTCENC_SIMD_INLINE vfloat4() = default;

    /**
     * @brief Construct from 4 values loaded from an unaligned address.
     *
     * Consider using loada() which is better with wider VLA vectors if data is
     * aligned to vector length.
     */
    ASTCENC_SIMD_INLINE explicit vfloat4(const float* p)
    {
        m[0] = p[0];
        m[1] = p[1];
        m[2] = p[2];
        m[3] = p[3];
    }

    /**
     * @brief Construct from 1 scalar value replicated across all lanes.
     *
     * Consider using zero() for constexpr zeros.
     */
    ASTCENC_SIMD_INLINE explicit vfloat4(float a)
    {
        m[0] = a;
        m[1] = a;
        m[2] = a;
        m[3] = a;
    }

    /**
     * @brief Construct from 4 scalar values.
     *
     * The value of @c a is stored to lane 0 (LSB) in the SIMD register.
     */
    ASTCENC_SIMD_INLINE explicit vfloat4(float a, float b, float c, float d)
    {
        m[0] = a;
        m[1] = b;
        m[2] = c;
        m[3] = d;
    }

    /**
     * @brief Get the scalar value of a single lane.
     */
    template <int l> ASTCENC_SIMD_INLINE float lane() const
    {
        return m[l];
    }

    /**
     * @brief Set the scalar value of a single lane.
     */
    template <int l> ASTCENC_SIMD_INLINE void set_lane(float a)
    {
        m[l] = a;
    }

    /**
     * @brief Factory that returns a vector of zeros.
     */
    static ASTCENC_SIMD_INLINE vfloat4 zero()
    {
        return vfloat4(0.0f);
    }

    /**
     * @brief Factory that returns a replicated scalar loaded from memory.
     */
    static ASTCENC_SIMD_INLINE vfloat4 load1(const float* p)
    {
        return vfloat4(*p);
    }

    /**
     * @brief Factory that returns a vector loaded from aligned memory.
     */
    static ASTCENC_SIMD_INLINE vfloat4 loada(const float* p)
    {
        return vfloat4(p);
    }

    /**
     * @brief Factory that returns a vector containing the lane IDs.
     */
    static ASTCENC_SIMD_INLINE vfloat4 lane_id()
    {
        return vfloat4(0.0f, 1.0f, 2.0f, 3.0f);
    }

    /**
     * @brief Return a swizzled float 2.
     */
    template <int l0, int l1> ASTCENC_SIMD_INLINE vfloat4 swz() const
    {
        return vfloat4(lane<l0>(), lane<l1>(), 0.0f, 0.0f);
    }

    /**
     * @brief Return a swizzled float 3.
     */
    template <int l0, int l1, int l2> ASTCENC_SIMD_INLINE vfloat4 swz() const
    {
        return vfloat4(lane<l0>(), lane<l1>(), lane<l2>(), 0.0f);
    }

    /**
     * @brief Return a swizzled float 4.
     */
    template <int l0, int l1, int l2, int l3> ASTCENC_SIMD_INLINE vfloat4 swz() const
    {
        return vfloat4(lane<l0>(), lane<l1>(), lane<l2>(), lane<l3>());
    }

    /**
     * @brief The vector ...
     */
    float m[4];
};
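
// Illustrative sketch only (not part of the library): lane access and the
// swz() templates take lane indices as compile-time template arguments.
//
//     vfloat4 v(1.0f, 2.0f, 3.0f, 4.0f);
//     float z = v.lane<2>();              // 3.0f
//     vfloat4 r = v.swz<3, 2, 1, 0>();    // (4, 3, 2, 1)
//     vfloat4 xy = v.swz<0, 1>();         // (1, 2, 0, 0); unused lanes zeroed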

// ============================================================================
// vint4 data type
// ============================================================================

/**
 * @brief Data type for 4-wide ints.
 */
struct vint4
{
    /**
     * @brief Construct from zero-initialized value.
     */
    ASTCENC_SIMD_INLINE vint4() = default;

    /**
     * @brief Construct from 4 values loaded from an unaligned address.
     *
     * Consider using vint4::loada() which is better with wider VLA vectors
     * if data is aligned.
     */
    ASTCENC_SIMD_INLINE explicit vint4(const int* p)
    {
        m[0] = p[0];
        m[1] = p[1];
        m[2] = p[2];
        m[3] = p[3];
    }

    /**
     * @brief Construct from 4 uint8_t loaded from an unaligned address.
     */
    ASTCENC_SIMD_INLINE explicit vint4(const uint8_t* p)
    {
        m[0] = p[0];
        m[1] = p[1];
        m[2] = p[2];
        m[3] = p[3];
    }

    /**
     * @brief Construct from 4 scalar values.
     *
     * The value of @c a is stored to lane 0 (LSB) in the SIMD register.
     */
    ASTCENC_SIMD_INLINE explicit vint4(int a, int b, int c, int d)
    {
        m[0] = a;
        m[1] = b;
        m[2] = c;
        m[3] = d;
    }

    /**
     * @brief Construct from 1 scalar value replicated across all lanes.
     *
     * Consider using vint4::zero() for constexpr zeros.
     */
    ASTCENC_SIMD_INLINE explicit vint4(int a)
    {
        m[0] = a;
        m[1] = a;
        m[2] = a;
        m[3] = a;
    }

    /**
     * @brief Get the scalar value of a single lane.
     */
    template <int l> ASTCENC_SIMD_INLINE int lane() const
    {
        return m[l];
    }

    /**
     * @brief Set the scalar value of a single lane.
     */
    template <int l> ASTCENC_SIMD_INLINE void set_lane(int a)
    {
        m[l] = a;
    }

    /**
     * @brief Factory that returns a vector of zeros.
     */
    static ASTCENC_SIMD_INLINE vint4 zero()
    {
        return vint4(0);
    }

    /**
     * @brief Factory that returns a replicated scalar loaded from memory.
     */
    static ASTCENC_SIMD_INLINE vint4 load1(const int* p)
    {
        return vint4(*p);
    }

    /**
     * @brief Factory that returns a vector loaded from unaligned memory.
     */
    static ASTCENC_SIMD_INLINE vint4 load(const uint8_t* p)
    {
        vint4 data;
        std::memcpy(&data.m, p, 4 * sizeof(int));
        return data;
    }

    /**
     * @brief Factory that returns a vector loaded from 16B aligned memory.
     */
    static ASTCENC_SIMD_INLINE vint4 loada(const int* p)
    {
        return vint4(p);
    }

    /**
     * @brief Factory that returns a vector containing the lane IDs.
     */
    static ASTCENC_SIMD_INLINE vint4 lane_id()
    {
        return vint4(0, 1, 2, 3);
    }

    /**
     * @brief The vector ...
     */
    int m[4];
};

// ============================================================================
// vmask4 data type
// ============================================================================

/**
 * @brief Data type for 4-wide control plane masks.
 */
struct vmask4
{
    /**
     * @brief Construct from an existing mask value.
     */
    ASTCENC_SIMD_INLINE explicit vmask4(int* p)
    {
        m[0] = p[0];
        m[1] = p[1];
        m[2] = p[2];
        m[3] = p[3];
    }

    /**
     * @brief Construct from 1 scalar value.
     */
    ASTCENC_SIMD_INLINE explicit vmask4(bool a)
    {
        m[0] = a == false ? 0 : -1;
        m[1] = a == false ? 0 : -1;
        m[2] = a == false ? 0 : -1;
        m[3] = a == false ? 0 : -1;
    }

    /**
     * @brief Construct from 4 scalar values.
     *
     * The value of @c a is stored to lane 0 (LSB) in the SIMD register.
     */
    ASTCENC_SIMD_INLINE explicit vmask4(bool a, bool b, bool c, bool d)
    {
        m[0] = a == false ? 0 : -1;
        m[1] = b == false ? 0 : -1;
        m[2] = c == false ? 0 : -1;
        m[3] = d == false ? 0 : -1;
    }

    /**
     * @brief Get the scalar value of a single lane.
     */
    template <int l> ASTCENC_SIMD_INLINE float lane() const
    {
        return m[l] != 0;
    }

    /**
     * @brief The vector ...
     */
    int m[4];
};

// ============================================================================
// vmask4 operators and functions
// ============================================================================

/**
 * @brief Overload: mask union (or).
 */
ASTCENC_SIMD_INLINE vmask4 operator|(vmask4 a, vmask4 b)
{
    return vmask4(a.m[0] | b.m[0],
                  a.m[1] | b.m[1],
                  a.m[2] | b.m[2],
                  a.m[3] | b.m[3]);
}

/**
 * @brief Overload: mask intersect (and).
 */
ASTCENC_SIMD_INLINE vmask4 operator&(vmask4 a, vmask4 b)
{
    return vmask4(a.m[0] & b.m[0],
                  a.m[1] & b.m[1],
                  a.m[2] & b.m[2],
                  a.m[3] & b.m[3]);
}

/**
 * @brief Overload: mask difference (xor).
 */
ASTCENC_SIMD_INLINE vmask4 operator^(vmask4 a, vmask4 b)
{
    return vmask4(a.m[0] ^ b.m[0],
                  a.m[1] ^ b.m[1],
                  a.m[2] ^ b.m[2],
                  a.m[3] ^ b.m[3]);
}

/**
 * @brief Overload: mask invert (not).
 */
ASTCENC_SIMD_INLINE vmask4 operator~(vmask4 a)
{
    return vmask4(~a.m[0],
                  ~a.m[1],
                  ~a.m[2],
                  ~a.m[3]);
}

/**
 * @brief Return a 4-bit mask code indicating mask status.
 *
 * bit0 = lane 0
 */
ASTCENC_SIMD_INLINE unsigned int mask(vmask4 a)
{
    return ((a.m[0] >> 31) & 0x1) |
           ((a.m[1] >> 30) & 0x2) |
           ((a.m[2] >> 29) & 0x4) |
           ((a.m[3] >> 28) & 0x8);
}
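
// Illustrative sketch only (not part of the library): mask() packs one bit
// per lane, so an all-true mask reads back as 0xF and an all-false mask as 0.
//
//     vmask4 m = vint4(1, 2, 3, 4) > vint4(2, 2, 2, 2);   // lanes: 0, 0, -1, -1
//     unsigned int bits = mask(m);                        // 0b1100 == 0xC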

// ============================================================================
// vint4 operators and functions
// ============================================================================

/**
 * @brief Overload: vector by vector addition.
 */
ASTCENC_SIMD_INLINE vint4 operator+(vint4 a, vint4 b)
{
    return vint4(a.m[0] + b.m[0],
                 a.m[1] + b.m[1],
                 a.m[2] + b.m[2],
                 a.m[3] + b.m[3]);
}

/**
 * @brief Overload: vector by vector subtraction.
 */
ASTCENC_SIMD_INLINE vint4 operator-(vint4 a, vint4 b)
{
    return vint4(a.m[0] - b.m[0],
                 a.m[1] - b.m[1],
                 a.m[2] - b.m[2],
                 a.m[3] - b.m[3]);
}

/**
 * @brief Overload: vector by vector multiplication.
 */
ASTCENC_SIMD_INLINE vint4 operator*(vint4 a, vint4 b)
{
    return vint4(a.m[0] * b.m[0],
                 a.m[1] * b.m[1],
                 a.m[2] * b.m[2],
                 a.m[3] * b.m[3]);
}

/**
 * @brief Overload: vector bit invert.
 */
ASTCENC_SIMD_INLINE vint4 operator~(vint4 a)
{
    return vint4(~a.m[0],
                 ~a.m[1],
                 ~a.m[2],
                 ~a.m[3]);
}

/**
 * @brief Overload: vector by vector bitwise or.
 */
ASTCENC_SIMD_INLINE vint4 operator|(vint4 a, vint4 b)
{
    return vint4(a.m[0] | b.m[0],
                 a.m[1] | b.m[1],
                 a.m[2] | b.m[2],
                 a.m[3] | b.m[3]);
}

/**
 * @brief Overload: vector by vector bitwise and.
 */
ASTCENC_SIMD_INLINE vint4 operator&(vint4 a, vint4 b)
{
    return vint4(a.m[0] & b.m[0],
                 a.m[1] & b.m[1],
                 a.m[2] & b.m[2],
                 a.m[3] & b.m[3]);
}

/**
 * @brief Overload: vector by vector bitwise xor.
 */
ASTCENC_SIMD_INLINE vint4 operator^(vint4 a, vint4 b)
{
    return vint4(a.m[0] ^ b.m[0],
                 a.m[1] ^ b.m[1],
                 a.m[2] ^ b.m[2],
                 a.m[3] ^ b.m[3]);
}

/**
 * @brief Overload: vector by vector equality.
 */
ASTCENC_SIMD_INLINE vmask4 operator==(vint4 a, vint4 b)
{
    return vmask4(a.m[0] == b.m[0],
                  a.m[1] == b.m[1],
                  a.m[2] == b.m[2],
                  a.m[3] == b.m[3]);
}

/**
 * @brief Overload: vector by vector inequality.
 */
ASTCENC_SIMD_INLINE vmask4 operator!=(vint4 a, vint4 b)
{
    return vmask4(a.m[0] != b.m[0],
                  a.m[1] != b.m[1],
                  a.m[2] != b.m[2],
                  a.m[3] != b.m[3]);
}

/**
 * @brief Overload: vector by vector less than.
 */
ASTCENC_SIMD_INLINE vmask4 operator<(vint4 a, vint4 b)
{
    return vmask4(a.m[0] < b.m[0],
                  a.m[1] < b.m[1],
                  a.m[2] < b.m[2],
                  a.m[3] < b.m[3]);
}

/**
 * @brief Overload: vector by vector greater than.
 */
ASTCENC_SIMD_INLINE vmask4 operator>(vint4 a, vint4 b)
{
    return vmask4(a.m[0] > b.m[0],
                  a.m[1] > b.m[1],
                  a.m[2] > b.m[2],
                  a.m[3] > b.m[3]);
}

/**
 * @brief Logical shift left.
 */
template <int s> ASTCENC_SIMD_INLINE vint4 lsl(vint4 a)
{
    return vint4(a.m[0] << s,
                 a.m[1] << s,
                 a.m[2] << s,
                 a.m[3] << s);
}

/**
 * @brief Logical shift right.
 */
template <int s> ASTCENC_SIMD_INLINE vint4 lsr(vint4 a)
{
    unsigned int as0 = static_cast<unsigned int>(a.m[0]) >> s;
    unsigned int as1 = static_cast<unsigned int>(a.m[1]) >> s;
    unsigned int as2 = static_cast<unsigned int>(a.m[2]) >> s;
    unsigned int as3 = static_cast<unsigned int>(a.m[3]) >> s;
    return vint4(static_cast<int>(as0),
                 static_cast<int>(as1),
                 static_cast<int>(as2),
                 static_cast<int>(as3));
}

/**
 * @brief Arithmetic shift right.
 */
template <int s> ASTCENC_SIMD_INLINE vint4 asr(vint4 a)
{
    return vint4(a.m[0] >> s,
                 a.m[1] >> s,
                 a.m[2] >> s,
                 a.m[3] >> s);
}

/**
 * @brief Return the min vector of two vectors.
 */
ASTCENC_SIMD_INLINE vint4 min(vint4 a, vint4 b)
{
    return vint4(a.m[0] < b.m[0] ? a.m[0] : b.m[0],
                 a.m[1] < b.m[1] ? a.m[1] : b.m[1],
                 a.m[2] < b.m[2] ? a.m[2] : b.m[2],
                 a.m[3] < b.m[3] ? a.m[3] : b.m[3]);
}

/**
 * @brief Return the max vector of two vectors.
 */
ASTCENC_SIMD_INLINE vint4 max(vint4 a, vint4 b)
{
    return vint4(a.m[0] > b.m[0] ? a.m[0] : b.m[0],
                 a.m[1] > b.m[1] ? a.m[1] : b.m[1],
                 a.m[2] > b.m[2] ? a.m[2] : b.m[2],
                 a.m[3] > b.m[3] ? a.m[3] : b.m[3]);
}

/**
 * @brief Return the horizontal minimum of a single vector.
 */
ASTCENC_SIMD_INLINE vint4 hmin(vint4 a)
{
    int b = std::min(a.m[0], a.m[1]);
    int c = std::min(a.m[2], a.m[3]);
    return vint4(std::min(b, c));
}

/**
 * @brief Return the horizontal maximum of a single vector.
 */
ASTCENC_SIMD_INLINE vint4 hmax(vint4 a)
{
    int b = std::max(a.m[0], a.m[1]);
    int c = std::max(a.m[2], a.m[3]);
    return vint4(std::max(b, c));
}

/**
 * @brief Return the horizontal sum of vector lanes as a scalar.
 */
ASTCENC_SIMD_INLINE int hadd_s(vint4 a)
{
    return a.m[0] + a.m[1] + a.m[2] + a.m[3];
}

/**
 * @brief Store a vector to an aligned memory address.
 */
ASTCENC_SIMD_INLINE void storea(vint4 a, int* p)
{
    p[0] = a.m[0];
    p[1] = a.m[1];
    p[2] = a.m[2];
    p[3] = a.m[3];
}

/**
 * @brief Store a vector to an unaligned memory address.
 */
ASTCENC_SIMD_INLINE void store(vint4 a, int* p)
{
    p[0] = a.m[0];
    p[1] = a.m[1];
    p[2] = a.m[2];
    p[3] = a.m[3];
}

/**
 * @brief Store a vector to an unaligned memory address.
 */
ASTCENC_SIMD_INLINE void store(vint4 a, uint8_t* p)
{
    std::memcpy(p, a.m, sizeof(int) * 4);
}

/**
 * @brief Store lowest N (vector width) bytes into an unaligned address.
 */
ASTCENC_SIMD_INLINE void store_nbytes(vint4 a, uint8_t* p)
{
    std::memcpy(p, a.m, sizeof(uint8_t) * 4);
}

/**
 * @brief Gather N (vector width) indices from the array.
 */
ASTCENC_SIMD_INLINE vint4 gatheri(const int* base, vint4 indices)
{
    return vint4(base[indices.m[0]],
                 base[indices.m[1]],
                 base[indices.m[2]],
                 base[indices.m[3]]);
}

/**
 * @brief Pack low 8 bits of N (vector width) lanes into bottom of vector.
 */
ASTCENC_SIMD_INLINE vint4 pack_low_bytes(vint4 a)
{
    int b0 = a.m[0] & 0xFF;
    int b1 = a.m[1] & 0xFF;
    int b2 = a.m[2] & 0xFF;
    int b3 = a.m[3] & 0xFF;
    int b = b0 | (b1 << 8) | (b2 << 16) | (b3 << 24);
    return vint4(b, 0, 0, 0);
}
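
// Illustrative sketch only (not part of the library): pack_low_bytes() pairs
// with store_nbytes() to write one byte per lane. This assumes the usual
// little-endian target, where the low byte of lane 0 is stored first.
//
//     vint4 v(0x11, 0x22, 0x33, 0x44);
//     vint4 packed = pack_low_bytes(v);   // lane 0 == 0x44332211
//     uint8_t out[4];
//     store_nbytes(packed, out);          // out == { 0x11, 0x22, 0x33, 0x44 }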

/**
 * @brief Return lanes from @c b if MSB of @c cond is set, else @c a.
 */
ASTCENC_SIMD_INLINE vint4 select(vint4 a, vint4 b, vmask4 cond)
{
    return vint4((cond.m[0] & static_cast<int>(0x80000000)) ? b.m[0] : a.m[0],
                 (cond.m[1] & static_cast<int>(0x80000000)) ? b.m[1] : a.m[1],
                 (cond.m[2] & static_cast<int>(0x80000000)) ? b.m[2] : a.m[2],
                 (cond.m[3] & static_cast<int>(0x80000000)) ? b.m[3] : a.m[3]);
}

// ============================================================================
// vfloat4 operators and functions
// ============================================================================

/**
 * @brief Overload: vector by vector addition.
 */
ASTCENC_SIMD_INLINE vfloat4 operator+(vfloat4 a, vfloat4 b)
{
    return vfloat4(a.m[0] + b.m[0],
                   a.m[1] + b.m[1],
                   a.m[2] + b.m[2],
                   a.m[3] + b.m[3]);
}

/**
 * @brief Overload: vector by vector subtraction.
 */
ASTCENC_SIMD_INLINE vfloat4 operator-(vfloat4 a, vfloat4 b)
{
    return vfloat4(a.m[0] - b.m[0],
                   a.m[1] - b.m[1],
                   a.m[2] - b.m[2],
                   a.m[3] - b.m[3]);
}

/**
 * @brief Overload: vector by vector multiplication.
 */
ASTCENC_SIMD_INLINE vfloat4 operator*(vfloat4 a, vfloat4 b)
{
    return vfloat4(a.m[0] * b.m[0],
                   a.m[1] * b.m[1],
                   a.m[2] * b.m[2],
                   a.m[3] * b.m[3]);
}

/**
 * @brief Overload: vector by vector division.
 */
ASTCENC_SIMD_INLINE vfloat4 operator/(vfloat4 a, vfloat4 b)
{
    return vfloat4(a.m[0] / b.m[0],
                   a.m[1] / b.m[1],
                   a.m[2] / b.m[2],
                   a.m[3] / b.m[3]);
}

/**
 * @brief Overload: vector by vector equality.
 */
ASTCENC_SIMD_INLINE vmask4 operator==(vfloat4 a, vfloat4 b)
{
    return vmask4(a.m[0] == b.m[0],
                  a.m[1] == b.m[1],
                  a.m[2] == b.m[2],
                  a.m[3] == b.m[3]);
}

/**
 * @brief Overload: vector by vector inequality.
 */
ASTCENC_SIMD_INLINE vmask4 operator!=(vfloat4 a, vfloat4 b)
{
    return vmask4(a.m[0] != b.m[0],
                  a.m[1] != b.m[1],
                  a.m[2] != b.m[2],
                  a.m[3] != b.m[3]);
}

/**
 * @brief Overload: vector by vector less than.
 */
ASTCENC_SIMD_INLINE vmask4 operator<(vfloat4 a, vfloat4 b)
{
    return vmask4(a.m[0] < b.m[0],
                  a.m[1] < b.m[1],
                  a.m[2] < b.m[2],
                  a.m[3] < b.m[3]);
}

/**
 * @brief Overload: vector by vector greater than.
 */
ASTCENC_SIMD_INLINE vmask4 operator>(vfloat4 a, vfloat4 b)
{
    return vmask4(a.m[0] > b.m[0],
                  a.m[1] > b.m[1],
                  a.m[2] > b.m[2],
                  a.m[3] > b.m[3]);
}

/**
 * @brief Overload: vector by vector less than or equal.
 */
ASTCENC_SIMD_INLINE vmask4 operator<=(vfloat4 a, vfloat4 b)
{
    return vmask4(a.m[0] <= b.m[0],
                  a.m[1] <= b.m[1],
                  a.m[2] <= b.m[2],
                  a.m[3] <= b.m[3]);
}

/**
 * @brief Overload: vector by vector greater than or equal.
 */
ASTCENC_SIMD_INLINE vmask4 operator>=(vfloat4 a, vfloat4 b)
{
    return vmask4(a.m[0] >= b.m[0],
                  a.m[1] >= b.m[1],
                  a.m[2] >= b.m[2],
                  a.m[3] >= b.m[3]);
}

/**
 * @brief Return the min vector of two vectors.
 *
 * If either lane value is NaN, @c b will be returned for that lane.
 */
ASTCENC_SIMD_INLINE vfloat4 min(vfloat4 a, vfloat4 b)
{
    return vfloat4(a.m[0] < b.m[0] ? a.m[0] : b.m[0],
                   a.m[1] < b.m[1] ? a.m[1] : b.m[1],
                   a.m[2] < b.m[2] ? a.m[2] : b.m[2],
                   a.m[3] < b.m[3] ? a.m[3] : b.m[3]);
}

/**
 * @brief Return the max vector of two vectors.
 *
 * If either lane value is NaN, @c b will be returned for that lane.
 */
ASTCENC_SIMD_INLINE vfloat4 max(vfloat4 a, vfloat4 b)
{
    return vfloat4(a.m[0] > b.m[0] ? a.m[0] : b.m[0],
                   a.m[1] > b.m[1] ? a.m[1] : b.m[1],
                   a.m[2] > b.m[2] ? a.m[2] : b.m[2],
                   a.m[3] > b.m[3] ? a.m[3] : b.m[3]);
}

/**
 * @brief Return the absolute value of the float vector.
 */
ASTCENC_SIMD_INLINE vfloat4 abs(vfloat4 a)
{
    return vfloat4(std::abs(a.m[0]),
                   std::abs(a.m[1]),
                   std::abs(a.m[2]),
                   std::abs(a.m[3]));
}

/**
 * @brief Return a float rounded to the nearest integer value.
 */
ASTCENC_SIMD_INLINE vfloat4 round(vfloat4 a)
{
    assert(std::fegetround() == FE_TONEAREST);
    return vfloat4(std::nearbyint(a.m[0]),
                   std::nearbyint(a.m[1]),
                   std::nearbyint(a.m[2]),
                   std::nearbyint(a.m[3]));
}

/**
 * @brief Return the horizontal minimum of a vector.
 */
ASTCENC_SIMD_INLINE vfloat4 hmin(vfloat4 a)
{
    float tmp1 = std::min(a.m[0], a.m[1]);
    float tmp2 = std::min(a.m[2], a.m[3]);
    return vfloat4(std::min(tmp1, tmp2));
}

/**
 * @brief Return the horizontal maximum of a vector.
 */
ASTCENC_SIMD_INLINE vfloat4 hmax(vfloat4 a)
{
    float tmp1 = std::max(a.m[0], a.m[1]);
    float tmp2 = std::max(a.m[2], a.m[3]);
    return vfloat4(std::max(tmp1, tmp2));
}

/**
 * @brief Return the horizontal sum of a vector.
 */
ASTCENC_SIMD_INLINE float hadd_s(vfloat4 a)
{
    // Use halving add, gives invariance with SIMD versions
    return (a.m[0] + a.m[2]) + (a.m[1] + a.m[3]);
}

/**
 * @brief Return the sqrt of the lanes in the vector.
 */
ASTCENC_SIMD_INLINE vfloat4 sqrt(vfloat4 a)
{
    return vfloat4(std::sqrt(a.m[0]),
                   std::sqrt(a.m[1]),
                   std::sqrt(a.m[2]),
                   std::sqrt(a.m[3]));
}

/**
 * @brief Return lanes from @c b if @c cond is set, else @c a.
 */
ASTCENC_SIMD_INLINE vfloat4 select(vfloat4 a, vfloat4 b, vmask4 cond)
{
    return vfloat4((cond.m[0] & static_cast<int>(0x80000000)) ? b.m[0] : a.m[0],
                   (cond.m[1] & static_cast<int>(0x80000000)) ? b.m[1] : a.m[1],
                   (cond.m[2] & static_cast<int>(0x80000000)) ? b.m[2] : a.m[2],
                   (cond.m[3] & static_cast<int>(0x80000000)) ? b.m[3] : a.m[3]);
}

/**
 * @brief Return lanes from @c b if MSB of @c cond is set, else @c a.
 */
ASTCENC_SIMD_INLINE vfloat4 select_msb(vfloat4 a, vfloat4 b, vmask4 cond)
{
    return vfloat4((cond.m[0] & static_cast<int>(0x80000000)) ? b.m[0] : a.m[0],
                   (cond.m[1] & static_cast<int>(0x80000000)) ? b.m[1] : a.m[1],
                   (cond.m[2] & static_cast<int>(0x80000000)) ? b.m[2] : a.m[2],
                   (cond.m[3] & static_cast<int>(0x80000000)) ? b.m[3] : a.m[3]);
}
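
// Illustrative sketch only (not part of the library): select() keys off the
// mask MSB, which the comparison operators set for true lanes, so a
// branch-free clamp of negative lanes to zero can be written as:
//
//     vfloat4 v(-1.0f, 2.0f, -3.0f, 4.0f);
//     vfloat4 r = select(v, vfloat4::zero(), v < vfloat4::zero());
//     // r == (0, 2, 0, 4)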

/**
 * @brief Load a vector of gathered results from an array.
 */
ASTCENC_SIMD_INLINE vfloat4 gatherf(const float* base, vint4 indices)
{
    return vfloat4(base[indices.m[0]],
                   base[indices.m[1]],
                   base[indices.m[2]],
                   base[indices.m[3]]);
}

/**
 * @brief Store a vector to an unaligned memory address.
 */
ASTCENC_SIMD_INLINE void store(vfloat4 a, float* ptr)
{
    ptr[0] = a.m[0];
    ptr[1] = a.m[1];
    ptr[2] = a.m[2];
    ptr[3] = a.m[3];
}

/**
 * @brief Store a vector to an aligned memory address.
 */
ASTCENC_SIMD_INLINE void storea(vfloat4 a, float* ptr)
{
    ptr[0] = a.m[0];
    ptr[1] = a.m[1];
    ptr[2] = a.m[2];
    ptr[3] = a.m[3];
}

/**
 * @brief Return an integer value for a float vector, using truncation.
 */
ASTCENC_SIMD_INLINE vint4 float_to_int(vfloat4 a)
{
    return vint4(static_cast<int>(a.m[0]),
                 static_cast<int>(a.m[1]),
                 static_cast<int>(a.m[2]),
                 static_cast<int>(a.m[3]));
}

/**
 * @brief Return an integer value for a float vector, using round-to-nearest.
 */
ASTCENC_SIMD_INLINE vint4 float_to_int_rtn(vfloat4 a)
{
    a = a + vfloat4(0.5f);
    return vint4(static_cast<int>(a.m[0]),
                 static_cast<int>(a.m[1]),
                 static_cast<int>(a.m[2]),
                 static_cast<int>(a.m[3]));
}

/**
 * @brief Return a float value for an integer vector.
 */
ASTCENC_SIMD_INLINE vfloat4 int_to_float(vint4 a)
{
    return vfloat4(static_cast<float>(a.m[0]),
                   static_cast<float>(a.m[1]),
                   static_cast<float>(a.m[2]),
                   static_cast<float>(a.m[3]));
}

/**
 * @brief Return a float16 value for a float vector, using round-to-nearest.
 */
ASTCENC_SIMD_INLINE vint4 float_to_float16(vfloat4 a)
{
    return vint4(
        float_to_sf16(a.lane<0>()),
        float_to_sf16(a.lane<1>()),
        float_to_sf16(a.lane<2>()),
        float_to_sf16(a.lane<3>()));
}

/**
 * @brief Return a float16 value for a float scalar, using round-to-nearest.
 */
static inline uint16_t float_to_float16(float a)
{
    return float_to_sf16(a);
}

/**
 * @brief Return a float value for a float16 vector.
 */
ASTCENC_SIMD_INLINE vfloat4 float16_to_float(vint4 a)
{
    return vfloat4(
        sf16_to_float(static_cast<uint16_t>(a.lane<0>())),
        sf16_to_float(static_cast<uint16_t>(a.lane<1>())),
        sf16_to_float(static_cast<uint16_t>(a.lane<2>())),
        sf16_to_float(static_cast<uint16_t>(a.lane<3>())));
}

/**
 * @brief Return a float value for a float16 scalar.
 */
ASTCENC_SIMD_INLINE float float16_to_float(uint16_t a)
{
    return sf16_to_float(a);
}
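
// Illustrative sketch only (not part of the library): the float16 helpers keep
// the half-precision bit patterns in the low 16 bits of each integer lane, so
// a round trip through them loses only the precision inherent to FP16.
//
//     vfloat4 v(0.5f, 1.0f, 2.0f, 4.0f);
//     vint4 h = float_to_float16(v);       // lanes hold FP16 bit patterns
//     vfloat4 back = float16_to_float(h);  // exact here; these values fit FP16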

/**
 * @brief Return a float value as an integer bit pattern (i.e. no conversion).
 *
 * It is a common trick to convert floats into integer bit patterns, perform
 * some bit hackery based on knowledge they are IEEE 754 layout, and then
 * convert them back again. This is the first half of that flip.
 */
ASTCENC_SIMD_INLINE vint4 float_as_int(vfloat4 a)
{
    vint4 r;
    std::memcpy(r.m, a.m, 4 * 4);
    return r;
}

/**
 * @brief Return an integer value as a float bit pattern (i.e. no conversion).
 *
 * It is a common trick to convert floats into integer bit patterns, perform
 * some bit hackery based on knowledge they are IEEE 754 layout, and then
 * convert them back again. This is the second half of that flip.
 */
ASTCENC_SIMD_INLINE vfloat4 int_as_float(vint4 a)
{
    vfloat4 r;
    std::memcpy(r.m, a.m, 4 * 4);
    return r;
}
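
// Illustrative sketch only (not part of the library): one classic use of the
// bit-pattern flip is computing an absolute value by masking off the IEEE 754
// sign bit rather than calling std::abs per lane.
//
//     vfloat4 v(-1.0f, 2.0f, -3.0f, 4.0f);
//     vint4 bits = float_as_int(v) & vint4(0x7FFFFFFF);
//     vfloat4 va = int_as_float(bits);     // (1, 2, 3, 4)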

/**
 * @brief Prepare a vtable lookup table for use with the native SIMD size.
 */
ASTCENC_SIMD_INLINE void vtable_prepare(vint4 t0, vint4& t0p)
{
    t0p = t0;
}

/**
 * @brief Prepare a vtable lookup table for use with the native SIMD size.
 */
ASTCENC_SIMD_INLINE void vtable_prepare(vint4 t0, vint4 t1, vint4& t0p, vint4& t1p)
{
    t0p = t0;
    t1p = t1;
}

/**
 * @brief Prepare a vtable lookup table for use with the native SIMD size.
 */
ASTCENC_SIMD_INLINE void vtable_prepare(
    vint4 t0, vint4 t1, vint4 t2, vint4 t3,
    vint4& t0p, vint4& t1p, vint4& t2p, vint4& t3p)
{
    t0p = t0;
    t1p = t1;
    t2p = t2;
    t3p = t3;
}

/**
 * @brief Perform an 8-bit 16-entry table lookup, with 32-bit indexes.
 */
ASTCENC_SIMD_INLINE vint4 vtable_8bt_32bi(vint4 t0, vint4 idx)
{
    uint8_t table[16];
    std::memcpy(table + 0, t0.m, 4 * sizeof(int));

    return vint4(table[idx.lane<0>()],
                 table[idx.lane<1>()],
                 table[idx.lane<2>()],
                 table[idx.lane<3>()]);
}

/**
 * @brief Perform an 8-bit 32-entry table lookup, with 32-bit indexes.
 */
ASTCENC_SIMD_INLINE vint4 vtable_8bt_32bi(vint4 t0, vint4 t1, vint4 idx)
{
    uint8_t table[32];
    std::memcpy(table + 0, t0.m, 4 * sizeof(int));
    std::memcpy(table + 16, t1.m, 4 * sizeof(int));

    return vint4(table[idx.lane<0>()],
                 table[idx.lane<1>()],
                 table[idx.lane<2>()],
                 table[idx.lane<3>()]);
}

/**
 * @brief Perform an 8-bit 64-entry table lookup, with 32-bit indexes.
 */
ASTCENC_SIMD_INLINE vint4 vtable_8bt_32bi(vint4 t0, vint4 t1, vint4 t2, vint4 t3, vint4 idx)
{
    uint8_t table[64];
    std::memcpy(table + 0, t0.m, 4 * sizeof(int));
    std::memcpy(table + 16, t1.m, 4 * sizeof(int));
    std::memcpy(table + 32, t2.m, 4 * sizeof(int));
    std::memcpy(table + 48, t3.m, 4 * sizeof(int));

    return vint4(table[idx.lane<0>()],
                 table[idx.lane<1>()],
                 table[idx.lane<2>()],
                 table[idx.lane<3>()]);
}
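
// Illustrative sketch only (not part of the library): callers pass the raw
// table through vtable_prepare() once, then index it per vector. Here t0, t1,
// and idx are assumed to be set up by the caller, and every idx lane must stay
// within the table size (0..31 for the two-register variant).
//
//     vint4 t0p, t1p;
//     vtable_prepare(t0, t1, t0p, t1p);          // t0/t1 hold 32 table bytes
//     vint4 v = vtable_8bt_32bi(t0p, t1p, idx);  // one byte looked up per lane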

/**
 * @brief Return a vector of interleaved RGBA data.
 *
 * Input vectors have the value stored in the bottom 8 bits of each lane,
 * with high bits set to zero.
 *
 * Output vector stores a single RGBA texel packed in each lane.
 */
ASTCENC_SIMD_INLINE vint4 interleave_rgba8(vint4 r, vint4 g, vint4 b, vint4 a)
{
    return r + lsl<8>(g) + lsl<16>(b) + lsl<24>(a);
}
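
// Illustrative sketch only (not part of the library): each output lane packs
// one texel with R in bits 0-7 through A in bits 24-31, so storing the vector
// on a little-endian target writes bytes in R, G, B, A order. The r, g, b, a
// inputs and the dst8 pointer here are assumed to be provided by the caller.
//
//     vint4 texels = interleave_rgba8(r, g, b, a);
//     store(texels, dst8);   // dst8 is a uint8_t* with room for 16 bytes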

/**
 * @brief Store a single vector lane to an unaligned address.
 */
ASTCENC_SIMD_INLINE void store_lane(uint8_t* base, int data)
{
    std::memcpy(base, &data, sizeof(int));
}

/**
 * @brief Store a vector, skipping masked lanes.
 *
 * All masked lanes must be at the end of vector, after all non-masked lanes.
 * Input is a byte array of at least 4 bytes per unmasked entry.
 */
ASTCENC_SIMD_INLINE void store_lanes_masked(uint8_t* base, vint4 data, vmask4 mask)
{
    if (mask.m[3])
    {
        store(data, base);
    }
    else if (mask.m[2])
    {
        store_lane(base + 0, data.lane<0>());
        store_lane(base + 4, data.lane<1>());
        store_lane(base + 8, data.lane<2>());
    }
    else if (mask.m[1])
    {
        store_lane(base + 0, data.lane<0>());
        store_lane(base + 4, data.lane<1>());
    }
    else if (mask.m[0])
    {
        store_lane(base + 0, data.lane<0>());
    }
}
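
// Illustrative sketch only (not part of the library): the mask must be "dense"
// from lane 0 upwards, so a three-lane store looks like this and writes 12
// bytes starting at dst (dst and data assumed to be provided by the caller).
//
//     vmask4 m(true, true, true, false);
//     store_lanes_masked(dst, data, m);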

#endif // #ifndef ASTC_VECMATHLIB_NONE_4_H_INCLUDED