float4_ni.h 14 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509
  1. /*
  2. * Copyright 2010-2016 Branimir Karadzic. All rights reserved.
  3. * License: https://github.com/bkaradzic/bx#license-bsd-2-clause
  4. */
  5. #ifndef BX_FLOAT4_NI_H_HEADER_GUARD
  6. #define BX_FLOAT4_NI_H_HEADER_GUARD
  7. namespace bx
  8. {
  9. BX_FLOAT4_INLINE float4_t float4_rcp_ni(float4_t _a);
  10. BX_FLOAT4_INLINE float4_t float4_shuf_xAzC_ni(float4_t _a, float4_t _b)
  11. {
  12. const float4_t xAyB = float4_shuf_xAyB(_a, _b);
  13. const float4_t zCwD = float4_shuf_zCwD(_a, _b);
  14. const float4_t result = float4_shuf_xyAB(xAyB, zCwD);
  15. return result;
  16. }
  17. BX_FLOAT4_INLINE float4_t float4_shuf_yBwD_ni(float4_t _a, float4_t _b)
  18. {
  19. const float4_t xAyB = float4_shuf_xAyB(_a, _b);
  20. const float4_t zCwD = float4_shuf_zCwD(_a, _b);
  21. const float4_t result = float4_shuf_zwCD(xAyB, zCwD);
  22. return result;
  23. }
  24. BX_FLOAT4_INLINE float4_t float4_madd_ni(float4_t _a, float4_t _b, float4_t _c)
  25. {
  26. const float4_t mul = float4_mul(_a, _b);
  27. const float4_t result = float4_add(mul, _c);
  28. return result;
  29. }
  30. BX_FLOAT4_INLINE float4_t float4_nmsub_ni(float4_t _a, float4_t _b, float4_t _c)
  31. {
  32. const float4_t mul = float4_mul(_a, _b);
  33. const float4_t result = float4_sub(_c, mul);
  34. return result;
  35. }
  36. BX_FLOAT4_INLINE float4_t float4_div_nr_ni(float4_t _a, float4_t _b)
  37. {
  38. const float4_t oneish = float4_isplat(0x3f800001);
  39. const float4_t est = float4_rcp_est(_b);
  40. const float4_t iter0 = float4_mul(_a, est);
  41. const float4_t tmp1 = float4_nmsub(_b, est, oneish);
  42. const float4_t result = float4_madd(tmp1, iter0, iter0);
  43. return result;
  44. }
  45. BX_FLOAT4_INLINE float4_t float4_rcp_ni(float4_t _a)
  46. {
  47. const float4_t one = float4_splat(1.0f);
  48. const float4_t result = float4_div(one, _a);
  49. return result;
  50. }
  51. BX_FLOAT4_INLINE float4_t float4_orx_ni(float4_t _a)
  52. {
  53. const float4_t zwxy = float4_swiz_zwxy(_a);
  54. const float4_t tmp0 = float4_or(_a, zwxy);
  55. const float4_t tmp1 = float4_swiz_yyyy(_a);
  56. const float4_t tmp2 = float4_or(tmp0, tmp1);
  57. const float4_t mf000 = float4_ild(UINT32_MAX, 0, 0, 0);
  58. const float4_t result = float4_and(tmp2, mf000);
  59. return result;
  60. }
  61. BX_FLOAT4_INLINE float4_t float4_orc_ni(float4_t _a, float4_t _b)
  62. {
  63. const float4_t aorb = float4_or(_a, _b);
  64. const float4_t mffff = float4_isplat(UINT32_MAX);
  65. const float4_t result = float4_xor(aorb, mffff);
  66. return result;
  67. }
  68. BX_FLOAT4_INLINE float4_t float4_neg_ni(float4_t _a)
  69. {
  70. const float4_t zero = float4_zero();
  71. const float4_t result = float4_sub(zero, _a);
  72. return result;
  73. }
  74. BX_FLOAT4_INLINE float4_t float4_selb_ni(float4_t _mask, float4_t _a, float4_t _b)
  75. {
  76. const float4_t sel_a = float4_and(_a, _mask);
  77. const float4_t sel_b = float4_andc(_b, _mask);
  78. const float4_t result = float4_or(sel_a, sel_b);
  79. return result;
  80. }
  81. BX_FLOAT4_INLINE float4_t float4_sels_ni(float4_t _test, float4_t _a, float4_t _b)
  82. {
  83. const float4_t mask = float4_sra(_test, 31);
  84. const float4_t result = float4_selb(mask, _a, _b);
  85. return result;
  86. }
  87. BX_FLOAT4_INLINE float4_t float4_not_ni(float4_t _a)
  88. {
  89. const float4_t mffff = float4_isplat(UINT32_MAX);
  90. const float4_t result = float4_xor(_a, mffff);
  91. return result;
  92. }
  93. BX_FLOAT4_INLINE float4_t float4_min_ni(float4_t _a, float4_t _b)
  94. {
  95. const float4_t mask = float4_cmplt(_a, _b);
  96. const float4_t result = float4_selb(mask, _a, _b);
  97. return result;
  98. }
  99. BX_FLOAT4_INLINE float4_t float4_max_ni(float4_t _a, float4_t _b)
  100. {
  101. const float4_t mask = float4_cmpgt(_a, _b);
  102. const float4_t result = float4_selb(mask, _a, _b);
  103. return result;
  104. }
  105. BX_FLOAT4_INLINE float4_t float4_abs_ni(float4_t _a)
  106. {
  107. const float4_t a_neg = float4_neg(_a);
  108. const float4_t result = float4_max(a_neg, _a);
  109. return result;
  110. }
  111. BX_FLOAT4_INLINE float4_t float4_imin_ni(float4_t _a, float4_t _b)
  112. {
  113. const float4_t mask = float4_icmplt(_a, _b);
  114. const float4_t result = float4_selb(mask, _a, _b);
  115. return result;
  116. }
  117. BX_FLOAT4_INLINE float4_t float4_imax_ni(float4_t _a, float4_t _b)
  118. {
  119. const float4_t mask = float4_icmpgt(_a, _b);
  120. const float4_t result = float4_selb(mask, _a, _b);
  121. return result;
  122. }
  123. BX_FLOAT4_INLINE float4_t float4_clamp_ni(float4_t _a, float4_t _min, float4_t _max)
  124. {
  125. const float4_t tmp = float4_min(_a, _max);
  126. const float4_t result = float4_max(tmp, _min);
  127. return result;
  128. }
  129. BX_FLOAT4_INLINE float4_t float4_lerp_ni(float4_t _a, float4_t _b, float4_t _s)
  130. {
  131. const float4_t ba = float4_sub(_b, _a);
  132. const float4_t result = float4_madd(_s, ba, _a);
  133. return result;
  134. }
  135. BX_FLOAT4_INLINE float4_t float4_sqrt_nr_ni(float4_t _a)
  136. {
  137. const float4_t half = float4_splat(0.5f);
  138. const float4_t one = float4_splat(1.0f);
  139. const float4_t tmp0 = float4_rsqrt_est(_a);
  140. const float4_t tmp1 = float4_mul(tmp0, _a);
  141. const float4_t tmp2 = float4_mul(tmp1, half);
  142. const float4_t tmp3 = float4_nmsub(tmp0, tmp1, one);
  143. const float4_t result = float4_madd(tmp3, tmp2, tmp1);
  144. return result;
  145. }
  146. BX_FLOAT4_INLINE float4_t float4_sqrt_nr1_ni(float4_t _a)
  147. {
  148. const float4_t half = float4_splat(0.5f);
  149. float4_t result = _a;
  150. for (uint32_t ii = 0; ii < 11; ++ii)
  151. {
  152. const float4_t tmp1 = float4_div(_a, result);
  153. const float4_t tmp2 = float4_add(tmp1, result);
  154. result = float4_mul(tmp2, half);
  155. }
  156. return result;
  157. }
  158. BX_FLOAT4_INLINE float4_t float4_rsqrt_ni(float4_t _a)
  159. {
  160. const float4_t one = float4_splat(1.0f);
  161. const float4_t sqrt = float4_sqrt(_a);
  162. const float4_t result = float4_div(one, sqrt);
  163. return result;
  164. }
  165. BX_FLOAT4_INLINE float4_t float4_rsqrt_nr_ni(float4_t _a)
  166. {
  167. const float4_t rsqrt = float4_rsqrt_est(_a);
  168. const float4_t iter0 = float4_mul(_a, rsqrt);
  169. const float4_t iter1 = float4_mul(iter0, rsqrt);
  170. const float4_t half = float4_splat(0.5f);
  171. const float4_t half_rsqrt = float4_mul(half, rsqrt);
  172. const float4_t three = float4_splat(3.0f);
  173. const float4_t three_sub_iter1 = float4_sub(three, iter1);
  174. const float4_t result = float4_mul(half_rsqrt, three_sub_iter1);
  175. return result;
  176. }
  177. BX_FLOAT4_INLINE float4_t float4_rsqrt_carmack_ni(float4_t _a)
  178. {
  179. const float4_t half = float4_splat(0.5f);
  180. const float4_t ah = float4_mul(half, _a);
  181. const float4_t ashift = float4_sra(_a, 1);
  182. const float4_t magic = float4_isplat(0x5f3759df);
  183. const float4_t msuba = float4_isub(magic, ashift);
  184. const float4_t msubasq = float4_mul(msuba, msuba);
  185. const float4_t tmp0 = float4_splat(1.5f);
  186. const float4_t tmp1 = float4_mul(ah, msubasq);
  187. const float4_t tmp2 = float4_sub(tmp0, tmp1);
  188. const float4_t result = float4_mul(msuba, tmp2);
  189. return result;
  190. }
  191. namespace float4_logexp_detail
  192. {
  193. BX_FLOAT4_INLINE float4_t float4_poly1(float4_t _a, float _b, float _c)
  194. {
  195. const float4_t bbbb = float4_splat(_b);
  196. const float4_t cccc = float4_splat(_c);
  197. const float4_t result = float4_madd(cccc, _a, bbbb);
  198. return result;
  199. }
  200. BX_FLOAT4_INLINE float4_t float4_poly2(float4_t _a, float _b, float _c, float _d)
  201. {
  202. const float4_t bbbb = float4_splat(_b);
  203. const float4_t poly = float4_poly1(_a, _c, _d);
  204. const float4_t result = float4_madd(poly, _a, bbbb);
  205. return result;
  206. }
  207. BX_FLOAT4_INLINE float4_t float4_poly3(float4_t _a, float _b, float _c, float _d, float _e)
  208. {
  209. const float4_t bbbb = float4_splat(_b);
  210. const float4_t poly = float4_poly2(_a, _c, _d, _e);
  211. const float4_t result = float4_madd(poly, _a, bbbb);
  212. return result;
  213. }
  214. BX_FLOAT4_INLINE float4_t float4_poly4(float4_t _a, float _b, float _c, float _d, float _e, float _f)
  215. {
  216. const float4_t bbbb = float4_splat(_b);
  217. const float4_t poly = float4_poly3(_a, _c, _d, _e, _f);
  218. const float4_t result = float4_madd(poly, _a, bbbb);
  219. return result;
  220. }
  221. BX_FLOAT4_INLINE float4_t float4_poly5(float4_t _a, float _b, float _c, float _d, float _e, float _f, float _g)
  222. {
  223. const float4_t bbbb = float4_splat(_b);
  224. const float4_t poly = float4_poly4(_a, _c, _d, _e, _f, _g);
  225. const float4_t result = float4_madd(poly, _a, bbbb);
  226. return result;
  227. }
  228. BX_FLOAT4_INLINE float4_t float4_logpoly(float4_t _a)
  229. {
  230. #if 1
  231. const float4_t result = float4_poly5(_a
  232. , 3.11578814719469302614f, -3.32419399085241980044f
  233. , 2.59883907202499966007f, -1.23152682416275988241f
  234. , 0.318212422185251071475f, -0.0344359067839062357313f
  235. );
  236. #elif 0
  237. const float4_t result = float4_poly4(_a
  238. , 2.8882704548164776201f, -2.52074962577807006663f
  239. , 1.48116647521213171641f, -0.465725644288844778798f
  240. , 0.0596515482674574969533f
  241. );
  242. #elif 0
  243. const float4_t result = float4_poly3(_a
  244. , 2.61761038894603480148f, -1.75647175389045657003f
  245. , 0.688243882994381274313f, -0.107254423828329604454f
  246. );
  247. #else
  248. const float4_t result = float4_poly2(_a
  249. , 2.28330284476918490682f, -1.04913055217340124191f
  250. , 0.204446009836232697516f
  251. );
  252. #endif
  253. return result;
  254. }
  255. BX_FLOAT4_INLINE float4_t float4_exppoly(float4_t _a)
  256. {
  257. #if 1
  258. const float4_t result = float4_poly5(_a
  259. , 9.9999994e-1f, 6.9315308e-1f
  260. , 2.4015361e-1f, 5.5826318e-2f
  261. , 8.9893397e-3f, 1.8775767e-3f
  262. );
  263. #elif 0
  264. const float4_t result = float4_poly4(_a
  265. , 1.0000026f, 6.9300383e-1f
  266. , 2.4144275e-1f, 5.2011464e-2f
  267. , 1.3534167e-2f
  268. );
  269. #elif 0
  270. const float4_t result = float4_poly3(_a
  271. , 9.9992520e-1f, 6.9583356e-1f
  272. , 2.2606716e-1f, 7.8024521e-2f
  273. );
  274. #else
  275. const float4_t result = float4_poly2(_a
  276. , 1.0017247f, 6.5763628e-1f
  277. , 3.3718944e-1f
  278. );
  279. #endif // 0
  280. return result;
  281. }
  282. } // namespace float4_logexp_detail
  283. BX_FLOAT4_INLINE float4_t float4_log2_ni(float4_t _a)
  284. {
  285. const float4_t expmask = float4_isplat(0x7f800000);
  286. const float4_t mantmask = float4_isplat(0x007fffff);
  287. const float4_t one = float4_splat(1.0f);
  288. const float4_t c127 = float4_isplat(127);
  289. const float4_t aexp = float4_and(_a, expmask);
  290. const float4_t aexpsr = float4_srl(aexp, 23);
  291. const float4_t tmp0 = float4_isub(aexpsr, c127);
  292. const float4_t exp = float4_itof(tmp0);
  293. const float4_t amask = float4_and(_a, mantmask);
  294. const float4_t mant = float4_or(amask, one);
  295. const float4_t poly = float4_logexp_detail::float4_logpoly(mant);
  296. const float4_t mandiff = float4_sub(mant, one);
  297. const float4_t result = float4_madd(poly, mandiff, exp);
  298. return result;
  299. }
  300. BX_FLOAT4_INLINE float4_t float4_exp2_ni(float4_t _a)
  301. {
  302. const float4_t min = float4_splat( 129.0f);
  303. const float4_t max = float4_splat(-126.99999f);
  304. const float4_t tmp0 = float4_min(_a, min);
  305. const float4_t aaaa = float4_max(tmp0, max);
  306. const float4_t half = float4_splat(0.5f);
  307. const float4_t tmp2 = float4_sub(aaaa, half);
  308. const float4_t ipart = float4_ftoi(tmp2);
  309. const float4_t iround = float4_itof(ipart);
  310. const float4_t fpart = float4_sub(aaaa, iround);
  311. const float4_t c127 = float4_isplat(127);
  312. const float4_t tmp5 = float4_iadd(ipart, c127);
  313. const float4_t expipart = float4_sll(tmp5, 23);
  314. const float4_t expfpart = float4_logexp_detail::float4_exppoly(fpart);
  315. const float4_t result = float4_mul(expipart, expfpart);
  316. return result;
  317. }
  318. BX_FLOAT4_INLINE float4_t float4_pow_ni(float4_t _a, float4_t _b)
  319. {
  320. const float4_t alog2 = float4_log2(_a);
  321. const float4_t alog2b = float4_mul(alog2, _b);
  322. const float4_t result = float4_exp2(alog2b);
  323. return result;
  324. }
  325. BX_FLOAT4_INLINE float4_t float4_dot3_ni(float4_t _a, float4_t _b)
  326. {
  327. const float4_t xyzw = float4_mul(_a, _b);
  328. const float4_t xxxx = float4_swiz_xxxx(xyzw);
  329. const float4_t yyyy = float4_swiz_yyyy(xyzw);
  330. const float4_t zzzz = float4_swiz_zzzz(xyzw);
  331. const float4_t tmp1 = float4_add(xxxx, yyyy);
  332. const float4_t result = float4_add(zzzz, tmp1);
  333. return result;
  334. }
  335. BX_FLOAT4_INLINE float4_t float4_cross3_ni(float4_t _a, float4_t _b)
  336. {
  337. // a.yzx * b.zxy - a.zxy * b.yzx == (a * b.yzx - a.yzx * b).yzx
  338. #if 0
  339. const float4_t a_yzxw = float4_swiz_yzxw(_a);
  340. const float4_t a_zxyw = float4_swiz_zxyw(_a);
  341. const float4_t b_zxyw = float4_swiz_zxyw(_b);
  342. const float4_t b_yzxw = float4_swiz_yzxw(_b);
  343. const float4_t tmp = float4_mul(a_yzxw, b_zxyw);
  344. const float4_t result = float4_nmsub(a_zxyw, b_yzxw, tmp);
  345. #else
  346. const float4_t a_yzxw = float4_swiz_yzxw(_a);
  347. const float4_t b_yzxw = float4_swiz_yzxw(_b);
  348. const float4_t tmp0 = float4_mul(_a, b_yzxw);
  349. const float4_t tmp1 = float4_nmsub(a_yzxw, _b, tmp0);
  350. const float4_t result = float4_swiz_yzxw(tmp1);
  351. #endif
  352. return result;
  353. }
  354. BX_FLOAT4_INLINE float4_t float4_normalize3_ni(float4_t _a)
  355. {
  356. const float4_t dot3 = float4_dot3(_a, _a);
  357. const float4_t invSqrt = float4_rsqrt(dot3);
  358. const float4_t result = float4_mul(_a, invSqrt);
  359. return result;
  360. }
  361. BX_FLOAT4_INLINE float4_t float4_dot_ni(float4_t _a, float4_t _b)
  362. {
  363. const float4_t xyzw = float4_mul(_a, _b);
  364. const float4_t yzwx = float4_swiz_yzwx(xyzw);
  365. const float4_t tmp0 = float4_add(xyzw, yzwx);
  366. const float4_t zwxy = float4_swiz_zwxy(tmp0);
  367. const float4_t result = float4_add(tmp0, zwxy);
  368. return result;
  369. }
  370. BX_FLOAT4_INLINE float4_t float4_ceil_ni(float4_t _a)
  371. {
  372. const float4_t tmp0 = float4_ftoi(_a);
  373. const float4_t tmp1 = float4_itof(tmp0);
  374. const float4_t mask = float4_cmplt(tmp1, _a);
  375. const float4_t one = float4_splat(1.0f);
  376. const float4_t tmp2 = float4_and(one, mask);
  377. const float4_t result = float4_add(tmp1, tmp2);
  378. return result;
  379. }
  380. BX_FLOAT4_INLINE float4_t float4_floor_ni(float4_t _a)
  381. {
  382. const float4_t tmp0 = float4_ftoi(_a);
  383. const float4_t tmp1 = float4_itof(tmp0);
  384. const float4_t mask = float4_cmpgt(tmp1, _a);
  385. const float4_t one = float4_splat(1.0f);
  386. const float4_t tmp2 = float4_and(one, mask);
  387. const float4_t result = float4_sub(tmp1, tmp2);
  388. return result;
  389. }
  390. BX_FLOAT4_INLINE bool float4_test_any_ni(float4_t _a)
  391. {
  392. const float4_t mask = float4_sra(_a, 31);
  393. const float4_t zwxy = float4_swiz_zwxy(mask);
  394. const float4_t tmp0 = float4_or(mask, zwxy);
  395. const float4_t tmp1 = float4_swiz_yyyy(tmp0);
  396. const float4_t tmp2 = float4_or(tmp0, tmp1);
  397. int res;
  398. float4_stx(&res, tmp2);
  399. return 0 != res;
  400. }
  401. BX_FLOAT4_INLINE bool float4_test_all_ni(float4_t _a)
  402. {
  403. const float4_t bits = float4_sra(_a, 31);
  404. const float4_t m1248 = float4_ild(1, 2, 4, 8);
  405. const float4_t mask = float4_and(bits, m1248);
  406. const float4_t zwxy = float4_swiz_zwxy(mask);
  407. const float4_t tmp0 = float4_or(mask, zwxy);
  408. const float4_t tmp1 = float4_swiz_yyyy(tmp0);
  409. const float4_t tmp2 = float4_or(tmp0, tmp1);
  410. int res;
  411. float4_stx(&res, tmp2);
  412. return 0xf == res;
  413. }
  414. } // namespace bx
  415. #endif // BX_FLOAT4_NI_H_HEADER_GUARD