#pragma once
#include <iron_global.h>
#include <string.h>

/*! \file iron_simd.h
    \brief Provides 128-bit, four-element floating-point SIMD operations that map to equivalent SSE or NEON operations.
*/
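/*
   A minimal usage sketch (illustrative only; example_scale_add is not part of this API):
   load four lanes from memory, combine them, and store the result.

       static inline void example_scale_add(float *dst, const float *a, const float *b, float scale) {
           iron_float32x4_t va = iron_float32x4_intrin_load_unaligned(a);
           iron_float32x4_t vb = iron_float32x4_intrin_load_unaligned(b);
           iron_float32x4_t vs = iron_float32x4_load_all(scale);
           iron_float32x4_t r = iron_float32x4_add(va, iron_float32x4_mul(vb, vs));
           iron_float32x4_store_unaligned(dst, r);
       }
*/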
// Any level of AVX capability (could be AVX, AVX2, AVX-512, etc.).
// (Currently) only used to infer the existence of earlier SSE instruction sets.
#if defined(__AVX__)
// Unfortunate situation here: MSVC does not provide compile-time macros for the
// following instruction sets, but their existence is implied by AVX and higher.
#define IRON_SSE4_2
#define IRON_SSE4_1
#define IRON_SSSE3
#define IRON_SSE3
#endif

// SSE2 capability check
// Note for Windows:
// _M_IX86_FP reports SSE2 and SSE for 32-bit Windows programs only, and is unset otherwise.
// SSE2 and earlier are guaranteed to be available for any 64-bit Windows program.
#if defined(__SSE2__) || (_M_IX86_FP == 2) || (defined(IRON_WINDOWS) && defined(IRON_64))
#define IRON_SSE2
#endif

// SSE capability check
#if defined(__SSE__) || _M_IX86_FP == 2 || _M_IX86_FP == 1 || (defined(IRON_WINDOWS) && !defined(__aarch64__)) || \
    (defined(IRON_WINDOWSAPP) && !defined(__aarch64__)) || (defined(IRON_MACOS) && __x86_64)
#define IRON_SSE
#endif

// NEON capability check
#if (defined(IRON_IOS) || defined(__aarch64__)) && !defined(IRON_NOSIMD)
#define IRON_NEON
#endif

// No SIMD capabilities
#if !defined(IRON_SSE4_2) && !defined(IRON_SSE4_1) && !defined(IRON_SSSE3) && !defined(IRON_SSE3) && !defined(IRON_SSE2) && !defined(IRON_SSE) && \
    !defined(IRON_NEON) && !defined(IRON_NOSIMD)
#define IRON_NOSIMD
#endif

#define IRON_SHUFFLE_TABLE(LANE_A1, LANE_A2, LANE_B1, LANE_B2) \
	((((LANE_B2)&0x3) << 6) | (((LANE_B1)&0x3) << 4) | (((LANE_A2)&0x3) << 2) | (((LANE_A1)&0x3) << 0))
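// For example, IRON_SHUFFLE_TABLE(0, 1, 2, 3) evaluates to 0xE4, the identity
// pattern for _mm_shuffle_ps (take lanes 0 and 1 from the first operand and
// lanes 2 and 3 from the second).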
#if defined(IRON_SSE2)
// The SSE_## family of headers includes earlier revisions, i.e.
// the SSE2 header also provides all of SSE.
#include <emmintrin.h>
typedef __m128 iron_float32x4_t;
typedef __m128 iron_float32x4_mask_t;
#elif defined(IRON_SSE)
#include <xmmintrin.h>
typedef __m128 iron_float32x4_t;
typedef __m128 iron_float32x4_mask_t;
#elif defined(IRON_NEON)
#include <arm_neon.h>
typedef float32x4_t iron_float32x4_t;
typedef uint32x4_t iron_float32x4_mask_t;
#elif defined(IRON_NOSIMD)
#include <iron_math.h>
typedef struct iron_float32x4 {
	float values[4];
} iron_float32x4_t;
typedef iron_float32x4_t iron_float32x4_mask_t;
#endif
#if defined(IRON_SSE)

static inline iron_float32x4_t iron_float32x4_intrin_load(const float *values) {
	return _mm_load_ps(values);
}

static inline iron_float32x4_t iron_float32x4_intrin_load_unaligned(const float *values) {
	return _mm_loadu_ps(values);
}

static inline iron_float32x4_t iron_float32x4_load(float a, float b, float c, float d) {
	return _mm_set_ps(d, c, b, a);
}

static inline iron_float32x4_t iron_float32x4_load_all(float t) {
	return _mm_set_ps1(t);
}

static inline void iron_float32x4_store(float *destination, iron_float32x4_t value) {
	_mm_store_ps(destination, value);
}

static inline void iron_float32x4_store_unaligned(float *destination, iron_float32x4_t value) {
	_mm_storeu_ps(destination, value);
}

static inline float iron_float32x4_get(iron_float32x4_t t, int index) {
	union {
		__m128 value;
		float elements[4];
	} converter;
	converter.value = t;
	return converter.elements[index];
}
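// Note: -0.f below is a lane pattern with only the sign bit set; _mm_andnot_ps
// with that mask clears the sign bit of every lane, which is how abs is done here.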
static inline iron_float32x4_t iron_float32x4_abs(iron_float32x4_t t) {
	__m128 mask = _mm_set_ps1(-0.f);
	return _mm_andnot_ps(mask, t);
}

static inline iron_float32x4_t iron_float32x4_add(iron_float32x4_t a, iron_float32x4_t b) {
	return _mm_add_ps(a, b);
}

static inline iron_float32x4_t iron_float32x4_div(iron_float32x4_t a, iron_float32x4_t b) {
	return _mm_div_ps(a, b);
}

static inline iron_float32x4_t iron_float32x4_mul(iron_float32x4_t a, iron_float32x4_t b) {
	return _mm_mul_ps(a, b);
}

static inline iron_float32x4_t iron_float32x4_neg(iron_float32x4_t t) {
	__m128 negative = _mm_set_ps1(-1.0f);
	return _mm_mul_ps(t, negative);
}

static inline iron_float32x4_t iron_float32x4_reciprocal_approximation(iron_float32x4_t t) {
	return _mm_rcp_ps(t);
}

static inline iron_float32x4_t iron_float32x4_reciprocal_sqrt_approximation(iron_float32x4_t t) {
	return _mm_rsqrt_ps(t);
}

static inline iron_float32x4_t iron_float32x4_sub(iron_float32x4_t a, iron_float32x4_t b) {
	return _mm_sub_ps(a, b);
}

static inline iron_float32x4_t iron_float32x4_sqrt(iron_float32x4_t t) {
	return _mm_sqrt_ps(t);
}

static inline iron_float32x4_t iron_float32x4_max(iron_float32x4_t a, iron_float32x4_t b) {
	return _mm_max_ps(a, b);
}

static inline iron_float32x4_t iron_float32x4_min(iron_float32x4_t a, iron_float32x4_t b) {
	return _mm_min_ps(a, b);
}

static inline iron_float32x4_mask_t iron_float32x4_cmpeq(iron_float32x4_t a, iron_float32x4_t b) {
	return _mm_cmpeq_ps(a, b);
}

static inline iron_float32x4_mask_t iron_float32x4_cmpge(iron_float32x4_t a, iron_float32x4_t b) {
	return _mm_cmpge_ps(a, b);
}

static inline iron_float32x4_mask_t iron_float32x4_cmpgt(iron_float32x4_t a, iron_float32x4_t b) {
	return _mm_cmpgt_ps(a, b);
}

static inline iron_float32x4_mask_t iron_float32x4_cmple(iron_float32x4_t a, iron_float32x4_t b) {
	return _mm_cmple_ps(a, b);
}

static inline iron_float32x4_mask_t iron_float32x4_cmplt(iron_float32x4_t a, iron_float32x4_t b) {
	return _mm_cmplt_ps(a, b);
}

static inline iron_float32x4_mask_t iron_float32x4_cmpneq(iron_float32x4_t a, iron_float32x4_t b) {
	return _mm_cmpneq_ps(a, b);
}
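// Branch-free select: b ^ (mask & (a ^ b)) keeps a's bits where mask bits are set
// and b's bits where they are clear, so no SSE4.1 blendv is required.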
static inline iron_float32x4_t iron_float32x4_sel(iron_float32x4_t a, iron_float32x4_t b, iron_float32x4_mask_t mask) {
	return _mm_xor_ps(b, _mm_and_ps(mask, _mm_xor_ps(a, b)));
}

static inline iron_float32x4_t iron_float32x4_or(iron_float32x4_t a, iron_float32x4_t b) {
	return _mm_or_ps(a, b);
}

static inline iron_float32x4_t iron_float32x4_and(iron_float32x4_t a, iron_float32x4_t b) {
	return _mm_and_ps(a, b);
}

static inline iron_float32x4_t iron_float32x4_xor(iron_float32x4_t a, iron_float32x4_t b) {
	return _mm_xor_ps(a, b);
}

static inline iron_float32x4_t iron_float32x4_not(iron_float32x4_t t) {
	__m128 zeroes = _mm_setzero_ps();
	return _mm_xor_ps(t, _mm_cmpeq_ps(zeroes, zeroes));
}

#define iron_float32x4_shuffle_custom(abcd, efgh, left_1, left_2, right_1, right_2) \
	_mm_shuffle_ps((abcd), (efgh), IRON_SHUFFLE_TABLE((left_1), (left_2), (right_1), (right_2)))

static inline iron_float32x4_t iron_float32x4_shuffle_aebf(iron_float32x4_t abcd, iron_float32x4_t efgh) {
	// aka unpacklo, aka zip1, aka interleave low
	return _mm_unpacklo_ps(abcd, efgh);
}

static inline iron_float32x4_t iron_float32x4_shuffle_cgdh(iron_float32x4_t abcd, iron_float32x4_t efgh) {
	// aka unpackhi, aka zip2, aka interleave high
	return _mm_unpackhi_ps(abcd, efgh);
}

static inline iron_float32x4_t iron_float32x4_shuffle_abef(iron_float32x4_t abcd, iron_float32x4_t efgh) {
	// aka movelh
	return _mm_movelh_ps(abcd, efgh);
}

static inline iron_float32x4_t iron_float32x4_shuffle_ghcd(iron_float32x4_t abcd, iron_float32x4_t efgh) {
	// aka movehl
	return _mm_movehl_ps(abcd, efgh);
}
#elif defined(IRON_NEON)

static inline iron_float32x4_t iron_float32x4_intrin_load(const float *values) {
	return vld1q_f32(values);
}

static inline iron_float32x4_t iron_float32x4_intrin_load_unaligned(const float *values) {
	return iron_float32x4_intrin_load(values);
}

static inline iron_float32x4_t iron_float32x4_load(float a, float b, float c, float d) {
	return (iron_float32x4_t){a, b, c, d};
}

static inline iron_float32x4_t iron_float32x4_load_all(float t) {
	return (iron_float32x4_t){t, t, t, t};
}

static inline void iron_float32x4_store(float *destination, iron_float32x4_t value) {
	vst1q_f32(destination, value);
}

static inline void iron_float32x4_store_unaligned(float *destination, iron_float32x4_t value) {
	iron_float32x4_store(destination, value);
}

static inline float iron_float32x4_get(iron_float32x4_t t, int index) {
	return t[index];
}

static inline iron_float32x4_t iron_float32x4_abs(iron_float32x4_t t) {
	return vabsq_f32(t);
}

static inline iron_float32x4_t iron_float32x4_add(iron_float32x4_t a, iron_float32x4_t b) {
	return vaddq_f32(a, b);
}
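// 32-bit ARM has no vdivq_f32; the fallback below refines a reciprocal estimate
// (vrecpeq_f32) with one Newton-Raphson step (vrecpsq_f32), so it is an
// approximation rather than an exact IEEE divide.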
static inline iron_float32x4_t iron_float32x4_div(iron_float32x4_t a, iron_float32x4_t b) {
#if defined(__aarch64__)
	return vdivq_f32(a, b);
#else
	float32x4_t inv = vrecpeq_f32(b);
	float32x4_t restep = vrecpsq_f32(b, inv);
	inv = vmulq_f32(restep, inv);
	return vmulq_f32(a, inv);
#endif
}

static inline iron_float32x4_t iron_float32x4_mul(iron_float32x4_t a, iron_float32x4_t b) {
	return vmulq_f32(a, b);
}

static inline iron_float32x4_t iron_float32x4_neg(iron_float32x4_t t) {
	return vnegq_f32(t);
}

static inline iron_float32x4_t iron_float32x4_reciprocal_approximation(iron_float32x4_t t) {
	return vrecpeq_f32(t);
}

static inline iron_float32x4_t iron_float32x4_reciprocal_sqrt_approximation(iron_float32x4_t t) {
	return vrsqrteq_f32(t);
}

static inline iron_float32x4_t iron_float32x4_sub(iron_float32x4_t a, iron_float32x4_t b) {
	return vsubq_f32(a, b);
}
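// Without AArch64's vsqrtq_f32 the fallback approximates sqrt(x) as x * rsqrte(x);
// this is estimate-precision only and yields NaN for x == 0.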
static inline iron_float32x4_t iron_float32x4_sqrt(iron_float32x4_t t) {
#if defined(__aarch64__)
	return vsqrtq_f32(t);
#else
	return vmulq_f32(t, vrsqrteq_f32(t));
#endif
}

static inline iron_float32x4_t iron_float32x4_max(iron_float32x4_t a, iron_float32x4_t b) {
	return vmaxq_f32(a, b);
}

static inline iron_float32x4_t iron_float32x4_min(iron_float32x4_t a, iron_float32x4_t b) {
	return vminq_f32(a, b);
}

static inline iron_float32x4_mask_t iron_float32x4_cmpeq(iron_float32x4_t a, iron_float32x4_t b) {
	return vceqq_f32(a, b);
}

static inline iron_float32x4_mask_t iron_float32x4_cmpge(iron_float32x4_t a, iron_float32x4_t b) {
	return vcgeq_f32(a, b);
}

static inline iron_float32x4_mask_t iron_float32x4_cmpgt(iron_float32x4_t a, iron_float32x4_t b) {
	return vcgtq_f32(a, b);
}

static inline iron_float32x4_mask_t iron_float32x4_cmple(iron_float32x4_t a, iron_float32x4_t b) {
	return vcleq_f32(a, b);
}

static inline iron_float32x4_mask_t iron_float32x4_cmplt(iron_float32x4_t a, iron_float32x4_t b) {
	return vcltq_f32(a, b);
}

static inline iron_float32x4_mask_t iron_float32x4_cmpneq(iron_float32x4_t a, iron_float32x4_t b) {
	return vmvnq_u32(vceqq_f32(a, b));
}

static inline iron_float32x4_t iron_float32x4_sel(iron_float32x4_t a, iron_float32x4_t b, iron_float32x4_mask_t mask) {
	return vbslq_f32(mask, a, b);
}

static inline iron_float32x4_t iron_float32x4_or(iron_float32x4_t a, iron_float32x4_t b) {
	uint32x4_t acvt = vreinterpretq_u32_f32(a);
	uint32x4_t bcvt = vreinterpretq_u32_f32(b);
	return vreinterpretq_f32_u32(vorrq_u32(acvt, bcvt));
}

static inline iron_float32x4_t iron_float32x4_and(iron_float32x4_t a, iron_float32x4_t b) {
	uint32x4_t acvt = vreinterpretq_u32_f32(a);
	uint32x4_t bcvt = vreinterpretq_u32_f32(b);
	return vreinterpretq_f32_u32(vandq_u32(acvt, bcvt));
}

static inline iron_float32x4_t iron_float32x4_xor(iron_float32x4_t a, iron_float32x4_t b) {
	uint32x4_t acvt = vreinterpretq_u32_f32(a);
	uint32x4_t bcvt = vreinterpretq_u32_f32(b);
	return vreinterpretq_f32_u32(veorq_u32(acvt, bcvt));
}

static inline iron_float32x4_t iron_float32x4_not(iron_float32x4_t t) {
	uint32x4_t tcvt = vreinterpretq_u32_f32(t);
	return vreinterpretq_f32_u32(vmvnq_u32(tcvt));
}
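// NEON offers no direct counterpart to _mm_shuffle_ps, so the custom shuffle below
// is assembled from per-lane extracts; indices are masked to 0-3 like the SSE version.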
#define iron_float32x4_shuffle_custom(abcd, efgh, left_1, left_2, right_1, right_2) \
	(iron_float32x4_t) { \
		vgetq_lane_f32((abcd), ((left_1)&0x3)), vgetq_lane_f32((abcd), ((left_2)&0x3)), vgetq_lane_f32((efgh), ((right_1)&0x3)), \
		    vgetq_lane_f32((efgh), ((right_2)&0x3)) \
	}

static inline iron_float32x4_t iron_float32x4_shuffle_aebf(iron_float32x4_t abcd, iron_float32x4_t efgh) {
#if defined(__aarch64__)
	return vzip1q_f32(abcd, efgh);
#else
	float a = vgetq_lane_f32(abcd, 0);
	float b = vgetq_lane_f32(abcd, 1);
	float e = vgetq_lane_f32(efgh, 0);
	float f = vgetq_lane_f32(efgh, 1);
	return (iron_float32x4_t){a, e, b, f};
#endif
}

static inline iron_float32x4_t iron_float32x4_shuffle_cgdh(iron_float32x4_t abcd, iron_float32x4_t efgh) {
#if defined(__aarch64__)
	return vzip2q_f32(abcd, efgh);
#else
	float c = vgetq_lane_f32(abcd, 2);
	float d = vgetq_lane_f32(abcd, 3);
	float g = vgetq_lane_f32(efgh, 2);
	float h = vgetq_lane_f32(efgh, 3);
	return (iron_float32x4_t){c, g, d, h};
#endif
}

static inline iron_float32x4_t iron_float32x4_shuffle_abef(iron_float32x4_t abcd, iron_float32x4_t efgh) {
	float32x2_t ab = vget_low_f32(abcd);
	float32x2_t ef = vget_low_f32(efgh);
	return vcombine_f32(ab, ef);
}

static inline iron_float32x4_t iron_float32x4_shuffle_ghcd(iron_float32x4_t abcd, iron_float32x4_t efgh) {
	float32x2_t cd = vget_high_f32(abcd);
	float32x2_t gh = vget_high_f32(efgh);
	return vcombine_f32(gh, cd);
}
#else

#include <math.h>

static inline iron_float32x4_t iron_float32x4_intrin_load(const float *values) {
	iron_float32x4_t value;
	value.values[0] = values[0];
	value.values[1] = values[1];
	value.values[2] = values[2];
	value.values[3] = values[3];
	return value;
}

static inline iron_float32x4_t iron_float32x4_intrin_load_unaligned(const float *values) {
	return iron_float32x4_intrin_load(values);
}

static inline iron_float32x4_t iron_float32x4_load(float a, float b, float c, float d) {
	iron_float32x4_t value;
	value.values[0] = a;
	value.values[1] = b;
	value.values[2] = c;
	value.values[3] = d;
	return value;
}

static inline iron_float32x4_t iron_float32x4_load_all(float t) {
	iron_float32x4_t value;
	value.values[0] = t;
	value.values[1] = t;
	value.values[2] = t;
	value.values[3] = t;
	return value;
}

static inline void iron_float32x4_store(float *destination, iron_float32x4_t value) {
	destination[0] = value.values[0];
	destination[1] = value.values[1];
	destination[2] = value.values[2];
	destination[3] = value.values[3];
}

static inline void iron_float32x4_store_unaligned(float *destination, iron_float32x4_t value) {
	iron_float32x4_store(destination, value);
}

static inline float iron_float32x4_get(iron_float32x4_t t, int index) {
	return t.values[index];
}

static inline iron_float32x4_t iron_float32x4_abs(iron_float32x4_t t) {
	iron_float32x4_t value;
	value.values[0] = iron_abs(t.values[0]);
	value.values[1] = iron_abs(t.values[1]);
	value.values[2] = iron_abs(t.values[2]);
	value.values[3] = iron_abs(t.values[3]);
	return value;
}

static inline iron_float32x4_t iron_float32x4_add(iron_float32x4_t a, iron_float32x4_t b) {
	iron_float32x4_t value;
	value.values[0] = a.values[0] + b.values[0];
	value.values[1] = a.values[1] + b.values[1];
	value.values[2] = a.values[2] + b.values[2];
	value.values[3] = a.values[3] + b.values[3];
	return value;
}

static inline iron_float32x4_t iron_float32x4_div(iron_float32x4_t a, iron_float32x4_t b) {
	iron_float32x4_t value;
	value.values[0] = a.values[0] / b.values[0];
	value.values[1] = a.values[1] / b.values[1];
	value.values[2] = a.values[2] / b.values[2];
	value.values[3] = a.values[3] / b.values[3];
	return value;
}

static inline iron_float32x4_t iron_float32x4_mul(iron_float32x4_t a, iron_float32x4_t b) {
	iron_float32x4_t value;
	value.values[0] = a.values[0] * b.values[0];
	value.values[1] = a.values[1] * b.values[1];
	value.values[2] = a.values[2] * b.values[2];
	value.values[3] = a.values[3] * b.values[3];
	return value;
}

static inline iron_float32x4_t iron_float32x4_neg(iron_float32x4_t t) {
	iron_float32x4_t value;
	value.values[0] = -t.values[0];
	value.values[1] = -t.values[1];
	value.values[2] = -t.values[2];
	value.values[3] = -t.values[3];
	return value;
}

static inline iron_float32x4_t iron_float32x4_reciprocal_approximation(iron_float32x4_t t) {
	iron_float32x4_t value;
	value.values[0] = 1.0f / t.values[0];
	value.values[1] = 1.0f / t.values[1];
	value.values[2] = 1.0f / t.values[2];
	value.values[3] = 1.0f / t.values[3];
	return value;
}

static inline iron_float32x4_t iron_float32x4_reciprocal_sqrt_approximation(iron_float32x4_t t) {
	iron_float32x4_t value;
	value.values[0] = 1.0f / sqrtf(t.values[0]);
	value.values[1] = 1.0f / sqrtf(t.values[1]);
	value.values[2] = 1.0f / sqrtf(t.values[2]);
	value.values[3] = 1.0f / sqrtf(t.values[3]);
	return value;
}

static inline iron_float32x4_t iron_float32x4_sub(iron_float32x4_t a, iron_float32x4_t b) {
	iron_float32x4_t value;
	value.values[0] = a.values[0] - b.values[0];
	value.values[1] = a.values[1] - b.values[1];
	value.values[2] = a.values[2] - b.values[2];
	value.values[3] = a.values[3] - b.values[3];
	return value;
}

static inline iron_float32x4_t iron_float32x4_sqrt(iron_float32x4_t t) {
	iron_float32x4_t value;
	value.values[0] = sqrtf(t.values[0]);
	value.values[1] = sqrtf(t.values[1]);
	value.values[2] = sqrtf(t.values[2]);
	value.values[3] = sqrtf(t.values[3]);
	return value;
}

static inline iron_float32x4_t iron_float32x4_max(iron_float32x4_t a, iron_float32x4_t b) {
	iron_float32x4_t value;
	value.values[0] = iron_max(a.values[0], b.values[0]);
	value.values[1] = iron_max(a.values[1], b.values[1]);
	value.values[2] = iron_max(a.values[2], b.values[2]);
	value.values[3] = iron_max(a.values[3], b.values[3]);
	return value;
}

static inline iron_float32x4_t iron_float32x4_min(iron_float32x4_t a, iron_float32x4_t b) {
	iron_float32x4_t value;
	value.values[0] = iron_min(a.values[0], b.values[0]);
	value.values[1] = iron_min(a.values[1], b.values[1]);
	value.values[2] = iron_min(a.values[2], b.values[2]);
	value.values[3] = iron_min(a.values[3], b.values[3]);
	return value;
}

static inline iron_float32x4_mask_t iron_float32x4_cmpeq(iron_float32x4_t a, iron_float32x4_t b) {
	uint32_t mask_cvt[4];
	mask_cvt[0] = a.values[0] == b.values[0] ? 0xffffffff : 0;
	mask_cvt[1] = a.values[1] == b.values[1] ? 0xffffffff : 0;
	mask_cvt[2] = a.values[2] == b.values[2] ? 0xffffffff : 0;
	mask_cvt[3] = a.values[3] == b.values[3] ? 0xffffffff : 0;
	iron_float32x4_mask_t mask;
	memcpy(&mask.values[0], &mask_cvt[0], sizeof(mask_cvt));
	return mask;
}

static inline iron_float32x4_mask_t iron_float32x4_cmpge(iron_float32x4_t a, iron_float32x4_t b) {
	uint32_t mask_cvt[4];
	mask_cvt[0] = a.values[0] >= b.values[0] ? 0xffffffff : 0;
	mask_cvt[1] = a.values[1] >= b.values[1] ? 0xffffffff : 0;
	mask_cvt[2] = a.values[2] >= b.values[2] ? 0xffffffff : 0;
	mask_cvt[3] = a.values[3] >= b.values[3] ? 0xffffffff : 0;
	iron_float32x4_mask_t mask;
	memcpy(&mask.values[0], &mask_cvt[0], sizeof(mask_cvt));
	return mask;
}

static inline iron_float32x4_mask_t iron_float32x4_cmpgt(iron_float32x4_t a, iron_float32x4_t b) {
	uint32_t mask_cvt[4];
	mask_cvt[0] = a.values[0] > b.values[0] ? 0xffffffff : 0;
	mask_cvt[1] = a.values[1] > b.values[1] ? 0xffffffff : 0;
	mask_cvt[2] = a.values[2] > b.values[2] ? 0xffffffff : 0;
	mask_cvt[3] = a.values[3] > b.values[3] ? 0xffffffff : 0;
	iron_float32x4_mask_t mask;
	memcpy(&mask.values[0], &mask_cvt[0], sizeof(mask_cvt));
	return mask;
}

static inline iron_float32x4_mask_t iron_float32x4_cmple(iron_float32x4_t a, iron_float32x4_t b) {
	uint32_t mask_cvt[4];
	mask_cvt[0] = a.values[0] <= b.values[0] ? 0xffffffff : 0;
	mask_cvt[1] = a.values[1] <= b.values[1] ? 0xffffffff : 0;
	mask_cvt[2] = a.values[2] <= b.values[2] ? 0xffffffff : 0;
	mask_cvt[3] = a.values[3] <= b.values[3] ? 0xffffffff : 0;
	iron_float32x4_mask_t mask;
	memcpy(&mask.values[0], &mask_cvt[0], sizeof(mask_cvt));
	return mask;
}

static inline iron_float32x4_mask_t iron_float32x4_cmplt(iron_float32x4_t a, iron_float32x4_t b) {
	uint32_t mask_cvt[4];
	mask_cvt[0] = a.values[0] < b.values[0] ? 0xffffffff : 0;
	mask_cvt[1] = a.values[1] < b.values[1] ? 0xffffffff : 0;
	mask_cvt[2] = a.values[2] < b.values[2] ? 0xffffffff : 0;
	mask_cvt[3] = a.values[3] < b.values[3] ? 0xffffffff : 0;
	iron_float32x4_mask_t mask;
	memcpy(&mask.values[0], &mask_cvt[0], sizeof(mask_cvt));
	return mask;
}

static inline iron_float32x4_mask_t iron_float32x4_cmpneq(iron_float32x4_t a, iron_float32x4_t b) {
	uint32_t mask_cvt[4];
	mask_cvt[0] = a.values[0] != b.values[0] ? 0xffffffff : 0;
	mask_cvt[1] = a.values[1] != b.values[1] ? 0xffffffff : 0;
	mask_cvt[2] = a.values[2] != b.values[2] ? 0xffffffff : 0;
	mask_cvt[3] = a.values[3] != b.values[3] ? 0xffffffff : 0;
	iron_float32x4_mask_t mask;
	memcpy(&mask.values[0], &mask_cvt[0], sizeof(mask_cvt));
	return mask;
}
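// The comparison masks produced above hold either all-ones (which reads back as NaN
// when reinterpreted as float) or all-zero bits, so the float compare against 0.0f
// below is sufficient to distinguish selected from unselected lanes.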
static inline iron_float32x4_t iron_float32x4_sel(iron_float32x4_t a, iron_float32x4_t b, iron_float32x4_mask_t mask) {
	iron_float32x4_t value;
	value.values[0] = mask.values[0] != 0.0f ? a.values[0] : b.values[0];
	value.values[1] = mask.values[1] != 0.0f ? a.values[1] : b.values[1];
	value.values[2] = mask.values[2] != 0.0f ? a.values[2] : b.values[2];
	value.values[3] = mask.values[3] != 0.0f ? a.values[3] : b.values[3];
	return value;
}

static inline iron_float32x4_t iron_float32x4_or(iron_float32x4_t a, iron_float32x4_t b) {
	uint32_t acvt[4];
	uint32_t bcvt[4];
	memcpy(&acvt[0], &a.values[0], sizeof(a));
	memcpy(&bcvt[0], &b.values[0], sizeof(b));
	acvt[0] |= bcvt[0];
	acvt[1] |= bcvt[1];
	acvt[2] |= bcvt[2];
	acvt[3] |= bcvt[3];
	iron_float32x4_t value;
	memcpy(&value.values[0], &acvt[0], sizeof(acvt));
	return value;
}

static inline iron_float32x4_t iron_float32x4_and(iron_float32x4_t a, iron_float32x4_t b) {
	uint32_t acvt[4];
	uint32_t bcvt[4];
	memcpy(&acvt[0], &a.values[0], sizeof(a));
	memcpy(&bcvt[0], &b.values[0], sizeof(b));
	acvt[0] &= bcvt[0];
	acvt[1] &= bcvt[1];
	acvt[2] &= bcvt[2];
	acvt[3] &= bcvt[3];
	iron_float32x4_t value;
	memcpy(&value.values[0], &acvt[0], sizeof(acvt));
	return value;
}

static inline iron_float32x4_t iron_float32x4_xor(iron_float32x4_t a, iron_float32x4_t b) {
	uint32_t acvt[4];
	uint32_t bcvt[4];
	memcpy(&acvt[0], &a.values[0], sizeof(a));
	memcpy(&bcvt[0], &b.values[0], sizeof(b));
	acvt[0] ^= bcvt[0];
	acvt[1] ^= bcvt[1];
	acvt[2] ^= bcvt[2];
	acvt[3] ^= bcvt[3];
	iron_float32x4_t value;
	memcpy(&value.values[0], &acvt[0], sizeof(acvt));
	return value;
}

static inline iron_float32x4_t iron_float32x4_not(iron_float32x4_t t) {
	uint32_t tcvt[4];
	memcpy(&tcvt[0], &t.values[0], sizeof(t));
	tcvt[0] = ~tcvt[0];
	tcvt[1] = ~tcvt[1];
	tcvt[2] = ~tcvt[2];
	tcvt[3] = ~tcvt[3];
	iron_float32x4_t value;
	memcpy(&value.values[0], &tcvt[0], sizeof(tcvt));
	return value;
}

static inline iron_float32x4_t iron_float32x4_shuffle_custom(iron_float32x4_t abcd, iron_float32x4_t efgh, const uint32_t left_1, const uint32_t left_2,
                                                             const uint32_t right_1, const uint32_t right_2) {
	iron_float32x4_t value;
	value.values[0] = abcd.values[left_1 & 0x3];
	value.values[1] = abcd.values[left_2 & 0x3];
	value.values[2] = efgh.values[right_1 & 0x3];
	value.values[3] = efgh.values[right_2 & 0x3];
	return value;
}

static inline iron_float32x4_t iron_float32x4_shuffle_aebf(iron_float32x4_t abcd, iron_float32x4_t efgh) {
	iron_float32x4_t value;
	value.values[0] = abcd.values[0];
	value.values[1] = efgh.values[0];
	value.values[2] = abcd.values[1];
	value.values[3] = efgh.values[1];
	return value;
}

static inline iron_float32x4_t iron_float32x4_shuffle_cgdh(iron_float32x4_t abcd, iron_float32x4_t efgh) {
	iron_float32x4_t value;
	value.values[0] = abcd.values[2];
	value.values[1] = efgh.values[2];
	value.values[2] = abcd.values[3];
	value.values[3] = efgh.values[3];
	return value;
}

static inline iron_float32x4_t iron_float32x4_shuffle_abef(iron_float32x4_t abcd, iron_float32x4_t efgh) {
	iron_float32x4_t value;
	value.values[0] = abcd.values[0];
	value.values[1] = abcd.values[1];
	value.values[2] = efgh.values[0];
	value.values[3] = efgh.values[1];
	return value;
}

static inline iron_float32x4_t iron_float32x4_shuffle_ghcd(iron_float32x4_t abcd, iron_float32x4_t efgh) {
	iron_float32x4_t value;
	value.values[0] = efgh.values[2];
	value.values[1] = efgh.values[3];
	value.values[2] = abcd.values[2];
	value.values[3] = abcd.values[3];
	return value;
}

#endif