FunctionList.h 22 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856
  1. #pragma once
  2. #include <cinttypes>
  3. #include <type_traits>
  4. #include <memory>
  5. #include "FastSIMD/FastSIMD.h"
  // Compiler-specific helper macros used by the function templates in this header:
  //   FS_VECTORCALL - calling-convention specifier (may expand to nothing)
  //   FS_INLINE     - inlining specifier placed in front of every function template
  // MSVC branch: __vectorcall is only used when _M_IX86_FP is undefined or >= 2;
  // for 32-bit x86 builds reporting _M_IX86_FP < 2 the macro is left empty.
  6. #ifdef _MSC_VER
  7. #if defined( _M_IX86_FP ) && _M_IX86_FP < 2
  8. #define FS_VECTORCALL
  9. #else
  10. #define FS_VECTORCALL __vectorcall
  11. #endif
  12. #define FS_INLINE __forceinline
  13. #else
  // Non-MSVC branch: no special calling convention; inlining is forced through the always_inline attribute
  14. #define FS_VECTORCALL
  15. #define FS_INLINE __attribute__((always_inline)) inline
  16. #endif
  // When NDEBUG is not defined, forced inlining is replaced by a plain `inline`
  17. #ifndef NDEBUG
  18. #undef FS_INLINE
  19. #define FS_INLINE inline
  20. #endif
  21. /// <summary>
  22. /// Number of 32 bit wide elements that will fit into a vector
  23. /// </summary>
  24. /// <remarks>
  25. /// Compile time constant
  26. /// </remarks>
  27. /// <code>
  28. /// size_t FS_Size_32()
  29. /// </code>
  30. #define FS_Size_32() FS::template VectorSize<sizeof( int32_t )>
  31. // Vector builders
  32. /// <summary>
  33. /// Vector with values incrementing from 0 based on element index {0, 1, 2, 3...}
  34. /// </summary>
  35. /// <code>
  36. /// example: int32v::FS_Incremented()
  37. /// </code>
  38. #define FS_Incremented() Incremented()
  39. // Load
  40. /// <summary>
  41. /// Copies sizeof(float32v) bytes from given memory location into float32v
  42. /// </summary>
  43. /// <remarks>
  44. /// Memory does not need to be aligned
  45. /// </remarks>
  46. /// <code>
  47. /// float32v FS_Load_f32( void const* ptr )
  48. /// </code>
  49. #define FS_Load_f32( ... ) FS::Load_f32( __VA_ARGS__ )
  50. /// <summary>
  51. /// Copies sizeof(int32v) bytes from given memory location into int32v
  52. /// </summary>
  53. /// <remarks>
  54. /// Memory does not need to be aligned
  55. /// </remarks>
  56. /// <code>
  57. /// int32v FS_Load_i32( void const* ptr )
  58. /// </code>
  59. #define FS_Load_i32( ... ) FS::Load_i32( __VA_ARGS__ )
  60. // Store
  61. /// <summary>
  62. /// Copies all elements of float32v to given memory location
  63. /// </summary>
  64. /// <code>
  65. /// void FS_Store_f32( void* ptr, float32v f )
  66. /// </code>
  67. #define FS_Store_f32( ... ) FS::Store_f32( __VA_ARGS__ )
  68. /// <summary>
  69. /// Copies all elements of int32v to given memory location
  70. /// </summary>
  71. /// <code>
  72. /// void FS_Store_i32( void* ptr, int32v i )
  73. /// </code>
  74. #define FS_Store_i32( ... ) FS::Store_i32( __VA_ARGS__ )
  75. // Extract
  76. /// <summary>
  77. /// Retrieve element 0 from vector
  78. /// </summary>
  79. /// <code>
  80. /// float FS_Extract0_f32( float32v f )
  81. /// </code>
  82. #define FS_Extract0_f32( ... ) FS::Extract0_f32( __VA_ARGS__ )
  83. /// <summary>
  84. /// Retrieve element 0 from vector
  85. /// </summary>
  86. /// <code>
  87. /// int32_t FS_Extract0_i32( int32v i )
  88. /// </code>
  89. #define FS_Extract0_i32( ... ) FS::Extract0_i32( __VA_ARGS__ )
  90. /// <summary>
  91. /// Retrieve element from vector at position
  92. /// </summary>
  93. /// <code>
  94. /// float FS_Extract_f32( float32v f, size_t idx )
  95. /// </code>
  96. #define FS_Extract_f32( ... ) FS::Extract_f32( __VA_ARGS__ )
  97. /// <summary>
  98. /// Retrieve element from vector at position
  99. /// </summary>
  100. /// <code>
  101. /// int32_t FS_Extract_i32( int32v i, size_t idx )
  102. /// </code>
  103. #define FS_Extract_i32( ... ) FS::Extract_i32( __VA_ARGS__ )
  104. // Cast
  105. /// <summary>
  106. /// Bitwise cast int to float
  107. /// </summary>
  108. /// <code>
  109. /// float32v FS_Casti32_f32( int32v i )
  110. /// </code>
  111. #define FS_Casti32_f32( ... ) FS::Casti32_f32( __VA_ARGS__ )
  112. /// <summary>
  113. /// Bitwise cast float to int
  114. /// </summary>
  115. /// <code>
  116. /// int32v FS_Castf32_i32( float32v f )
  117. /// </code>
  118. #define FS_Castf32_i32( ... ) FS::Castf32_i32( __VA_ARGS__ )
  119. // Convert
  120. /// <summary>
  121. /// Convert int to float
  122. /// </summary>
  123. /// <remarks>
  124. /// Rounding: truncate
  125. /// </remarks>
  126. /// <code>
  127. /// float32v FS_Converti32_f32( int32v i )
  128. /// </code>
  129. #define FS_Converti32_f32( ... ) FS::Converti32_f32( __VA_ARGS__ )
  130. /// <summary>
  131. /// Convert float to int
  132. /// </summary>
  133. /// <code>
  134. /// int32v FS_Convertf32_i32( float32v f )
  135. /// </code>
  136. #define FS_Convertf32_i32( ... ) FS::Convertf32_i32( __VA_ARGS__ )
  137. // Select
  138. /// <summary>
  139. /// return ( m ? a : b )
  140. /// </summary>
  141. /// <code>
  142. /// float32v FS_Select_f32( mask32v m, float32v a, float32v b )
  143. /// </code>
  144. #define FS_Select_f32( ... ) FS::Select_f32( __VA_ARGS__ )
  145. /// <summary>
  146. /// return ( m ? a : b )
  147. /// </summary>
  148. /// <code>
  149. /// int32v FS_Select_i32( mask32v m, int32v a, int32v b )
  150. /// </code>
  151. #define FS_Select_i32( ... ) FS::Select_i32( __VA_ARGS__ )
  152. // Min, Max
  153. /// <summary>
  154. /// return ( a < b ? a : b )
  155. /// </summary>
  156. /// <code>
  157. /// float32v FS_Min_f32( float32v a, float32v b )
  158. /// </code>
  159. #define FS_Min_f32( ... ) FS::Min_f32( __VA_ARGS__ )
  160. /// <summary>
  161. /// return ( a > b ? a : b )
  162. /// </summary>
  163. /// <code>
  164. /// float32v FS_Max_f32( float32v a, float32v b )
  165. /// </code>
  166. #define FS_Max_f32( ... ) FS::Max_f32( __VA_ARGS__ )
  167. /// <summary>
  168. /// return ( a < b ? a : b )
  169. /// </summary>
  170. /// <code>
  171. /// int32v FS_Min_i32( int32v a, int32v b )
  172. /// </code>
  173. #define FS_Min_i32( ... ) FS::Min_i32( __VA_ARGS__ )
  174. /// <summary>
  175. /// return ( a > b ? a : b )
  176. /// </summary>
  177. /// <code>
  178. /// int32v FS_Max_i32( int32v a, int32v b )
  179. /// </code>
  180. #define FS_Max_i32( ... ) FS::Max_i32( __VA_ARGS__ )
  181. // Bitwise
  182. /// <summary>
  183. /// return ( a & ~b )
  184. /// </summary>
  185. /// <code>
  186. /// float32v FS_BitwiseAndNot_f32( float32v a, float32v b )
  187. /// </code>
  188. #define FS_BitwiseAndNot_f32( ... ) FS::BitwiseAndNot_f32( __VA_ARGS__ )
  189. /// <summary>
  190. /// return ( a & ~b )
  191. /// </summary>
  192. /// <code>
  193. /// int32v FS_BitwiseAndNot_i32( int32v a, int32v b )
  194. /// </code>
  195. #define FS_BitwiseAndNot_i32( ... ) FS::BitwiseAndNot_i32( __VA_ARGS__ )
  196. /// <summary>
  197. /// return ( a & ~b )
  198. /// </summary>
  199. /// <code>
  200. /// mask32v FS_BitwiseAndNot_m32( mask32v a, mask32v b )
  201. /// </code>
  202. #define FS_BitwiseAndNot_m32( ... ) FastSIMD::BitwiseAndNot_m32<FS>( __VA_ARGS__ )
  203. /// <summary>
  204. /// return ZeroExtend( a >> b )
  205. /// </summary>
  206. /// <code>
  207. /// float32v FS_BitwiseShiftRightZX_f32( float32v a, int32_t b )
  208. /// </code>
  209. #define FS_BitwiseShiftRightZX_f32( ... ) FS::BitwiseShiftRightZX_f32( __VA_ARGS__ )
  210. /// <summary>
  211. /// return ZeroExtend( a >> b )
  212. /// </summary>
  213. /// <code>
  214. /// int32v FS_BitwiseShiftRightZX_i32( int32v a, int32_t b )
  215. /// </code>
  216. #define FS_BitwiseShiftRightZX_i32( ... ) FS::BitwiseShiftRightZX_i32( __VA_ARGS__ )
  217. // Abs
  218. /// <summary>
  219. /// return ( a < 0 ? -a : a )
  220. /// </summary>
  221. /// <code>
  222. /// float32v FS_Abs_f32( float32v a )
  223. /// </code>
  224. #define FS_Abs_f32( ... ) FS::Abs_f32( __VA_ARGS__ )
  225. /// <summary>
  226. /// return ( a < 0 ? -a : a )
  227. /// </summary>
  228. /// <code>
  229. /// int32v FS_Abs_i32( int32v a )
  230. /// </code>
  231. #define FS_Abs_i32( ... ) FS::Abs_i32( __VA_ARGS__ )
  232. // Float math
  233. /// <summary>
  234. /// return sqrt( a )
  235. /// </summary>
  236. /// <code>
  237. /// float32v FS_Sqrt_f32( float32v a )
  238. /// </code>
  239. #define FS_Sqrt_f32( ... ) FS::Sqrt_f32( __VA_ARGS__ )
  240. /// <summary>
  241. /// return APPROXIMATE( 1.0 / sqrt( a ) )
  242. /// </summary>
  243. /// <code>
  244. /// float32v FS_InvSqrt_f32( float32v a )
  245. /// </code>
  246. #define FS_InvSqrt_f32( ... ) FS::InvSqrt_f32( __VA_ARGS__ )
  247. /// <summary>
  248. /// return APPROXIMATE( 1.0 / a )
  249. /// </summary>
  250. /// <code>
  251. /// float32v FS_Reciprocal_f32( float32v a )
  252. /// </code>
  253. #define FS_Reciprocal_f32( ... ) FS::Reciprocal_f32( __VA_ARGS__ )
  254. // Floor, Ceil, Round
  255. /// <summary>
  256. /// return floor( a )
  257. /// </summary>
  258. /// <remarks>
  259. /// Rounding: Towards negative infinity
  260. /// </remarks>
  261. /// <code>
  262. /// float32v FS_Floor_f32( float32v a )
  263. /// </code>
  264. #define FS_Floor_f32( ... ) FS::Floor_f32( __VA_ARGS__ )
  265. /// <summary>
  266. /// return ceil( a )
  267. /// </summary>
  268. /// <remarks>
  269. /// Rounding: Towards positive infinity
  270. /// </remarks>
  271. /// <code>
  272. /// float32v FS_Ceil_f32( float32v a )
  273. /// </code>
  274. #define FS_Ceil_f32( ... ) FS::Ceil_f32( __VA_ARGS__ )
  275. /// <summary>
  276. /// return round( a )
  277. /// </summary>
  278. /// <remarks>
  279. /// Rounding: Banker's rounding
  280. /// </remarks>
  281. /// <code>
  282. /// float32v FS_Round_f32( float32v a )
  283. /// </code>
  284. #define FS_Round_f32( ... ) FS::Round_f32( __VA_ARGS__ )
  285. // Trig
  286. /// <summary>
  287. /// return APPROXIMATE( cos( a ) )
  288. /// </summary>
  289. /// <code>
  290. /// float32v FS_Cos_f32( float32v a )
  291. /// </code>
  292. #define FS_Cos_f32( ... ) FastSIMD::Cos_f32<FS>( __VA_ARGS__ )
  293. /// <summary>
  294. /// return APPROXIMATE( sin( a ) )
  295. /// </summary>
  296. /// <code>
  297. /// float32v FS_Sin_f32( float32v a )
  298. /// </code>
  299. #define FS_Sin_f32( ... ) FastSIMD::Sin_f32<FS>( __VA_ARGS__ )
  300. // Math
  301. /// <summary>
  302. /// return pow( v, pow )
  303. /// </summary>
  304. /// <code>
  305. /// float32v FS_Pow_f32( float32v v, float32v pow )
  306. /// </code>
  307. #define FS_Pow_f32( ... ) FastSIMD::Pow_f32<FS>( __VA_ARGS__ )
  308. /// <summary>
  309. /// return log( a )
  310. /// </summary>
  311. /// <remarks>
  312. /// a <= 0 returns 0
  313. /// </remarks>
  314. /// <code>
  315. /// float32v FS_Log_f32( float32v a )
  316. /// </code>
  317. #define FS_Log_f32( ... ) FastSIMD::Log_f32<FS>( __VA_ARGS__ )
  318. /// <summary>
  319. /// return exp( a )
  320. /// </summary>
  321. /// <remarks>
  322. /// a will be clamped to -88.376, 88.376
  323. /// </remarks>
  324. /// <code>
  325. /// float32v FS_Exp_f32( float32v a )
  326. /// </code>
  327. #define FS_Exp_f32( ... ) FastSIMD::Exp_f32<FS>( __VA_ARGS__ )
  328. // Mask
  329. /// <summary>
  330. /// return ( m ? a : 0 )
  331. /// </summary>
  332. /// <code>
  333. /// int32v FS_Mask_i32( int32v a, mask32v m )
  334. /// </code>
  335. #define FS_Mask_i32( ... ) FS::Mask_i32( __VA_ARGS__ )
  336. /// <summary>
  337. /// return ( m ? a : 0 )
  338. /// </summary>
  339. /// <code>
  340. /// float32v FS_Mask_f32( float32v a, mask32v m )
  341. /// </code>
  342. #define FS_Mask_f32( ... ) FS::Mask_f32( __VA_ARGS__ )
  343. /// <summary>
  344. /// return ( m ? 0 : a )
  345. /// </summary>
  346. /// <code>
  347. /// int32v FS_NMask_i32( int32v a, mask32v m )
  348. /// </code>
  349. #define FS_NMask_i32( ... ) FS::NMask_i32( __VA_ARGS__ )
  350. /// <summary>
  351. /// return ( m ? 0 : a )
  352. /// </summary>
  353. /// <code>
  354. /// float32v FS_NMask_f32( float32v a, mask32v m )
  355. /// </code>
  356. #define FS_NMask_f32( ... ) FS::NMask_f32( __VA_ARGS__ )
  357. /// <summary>
  358. /// return m.contains( true )
  359. /// </summary>
  360. /// <code>
  361. /// bool FS_AnyMask_bool( mask32v m )
  362. /// </code>
  363. #define FS_AnyMask_bool( ... ) FS::AnyMask_bool( __VA_ARGS__ )
  364. // FMA
  365. /// <summary>
  366. /// return ( (a * b) + c )
  367. /// </summary>
  368. /// <code>
  369. /// float32v FS_FMulAdd_f32( float32v a, float32v b, float32v c )
  370. /// </code>
  371. #define FS_FMulAdd_f32( ... ) FastSIMD::FMulAdd_f32<FS>( __VA_ARGS__ )
  372. /// <summary>
  373. /// return ( -(a * b) + c )
  374. /// </summary>
  375. /// <code>
  376. /// float32v FS_FNMulAdd_f32( float32v a, float32v b, float32v c )
  377. /// </code>
  378. #define FS_FNMulAdd_f32( ... ) FastSIMD::FNMulAdd_f32<FS>( __VA_ARGS__ )
  379. // Masked float
  380. /// <summary>
  381. /// return ( m ? (a + b) : a )
  382. /// </summary>
  383. /// <code>
  384. /// float32v FS_MaskedAdd_f32( float32v a, float32v b, mask32v m )
  385. /// </code>
  386. #define FS_MaskedAdd_f32( ... ) FastSIMD::MaskedAdd_f32<FS>( __VA_ARGS__ )
  387. /// <summary>
  388. /// return ( m ? (a - b) : a )
  389. /// </summary>
  390. /// <code>
  391. /// float32v FS_MaskedSub_f32( float32v a, float32v b, mask32v m )
  392. /// </code>
  393. #define FS_MaskedSub_f32( ... ) FastSIMD::MaskedSub_f32<FS>( __VA_ARGS__ )
  394. /// <summary>
  395. /// return ( m ? (a * b) : a )
  396. /// </summary>
  397. /// <code>
  398. /// float32v FS_MaskedMul_f32( float32v a, float32v b, mask32v m )
  399. /// </code>
  400. #define FS_MaskedMul_f32( ... ) FastSIMD::MaskedMul_f32<FS>( __VA_ARGS__ )
  401. // Masked int32
  402. /// <summary>
  403. /// return ( m ? (a + b) : a )
  404. /// </summary>
  405. /// <code>
  406. /// int32v FS_MaskedAdd_i32( int32v a, int32v b, mask32v m )
  407. /// </code>
  408. #define FS_MaskedAdd_i32( ... ) FastSIMD::MaskedAdd_i32<FS>( __VA_ARGS__ )
  409. /// <summary>
  410. /// return ( m ? (a - b) : a )
  411. /// </summary>
  412. /// <code>
  413. /// int32v FS_MaskedSub_i32( int32v a, int32v b, mask32v m )
  414. /// </code>
  415. #define FS_MaskedSub_i32( ... ) FastSIMD::MaskedSub_i32<FS>( __VA_ARGS__ )
  416. /// <summary>
  417. /// return ( m ? (a * b) : a )
  418. /// </summary>
  419. /// <code>
  420. /// int32v FS_MaskedMul_i32( int32v a, int32v b, mask32v m )
  421. /// </code>
  422. #define FS_MaskedMul_i32( ... ) FastSIMD::MaskedMul_i32<FS>( __VA_ARGS__ )
  423. /// <summary>
  424. /// return ( m ? (a + 1) : a )
  425. /// </summary>
  426. /// <code>
  427. /// int32v FS_MaskedIncrement_i32( int32v a, mask32v m )
  428. /// </code>
  429. #define FS_MaskedIncrement_i32( ... ) FastSIMD::MaskedIncrement_i32<FS>( __VA_ARGS__ )
  430. /// <summary>
  431. /// return ( m ? (a - 1) : a )
  432. /// </summary>
  433. /// <code>
  434. /// int32v FS_MaskedDecrement_i32( int32v a, mask32v m )
  435. /// </code>
  436. #define FS_MaskedDecrement_i32( ... ) FastSIMD::MaskedDecrement_i32<FS>( __VA_ARGS__ )
  437. // NMasked float
  438. /// <summary>
  439. /// return ( m ? a : (a + b) )
  440. /// </summary>
  441. /// <code>
  442. /// float32v FS_NMaskedAdd_f32( float32v a, float32v b, mask32v m )
  443. /// </code>
  444. #define FS_NMaskedAdd_f32( ... ) FastSIMD::NMaskedAdd_f32<FS>( __VA_ARGS__ )
  445. /// <summary>
  446. /// return ( m ? a : (a - b) )
  447. /// </summary>
  448. /// <code>
  449. /// float32v FS_NMaskedSub_f32( float32v a, float32v b, mask32v m )
  450. /// </code>
  451. #define FS_NMaskedSub_f32( ... ) FastSIMD::NMaskedSub_f32<FS>( __VA_ARGS__ )
  452. /// <summary>
  453. /// return ( m ? a : (a * b) )
  454. /// </summary>
  455. /// <code>
  456. /// float32v FS_NMaskedMul_f32( float32v a, float32v b, mask32v m )
  457. /// </code>
  458. #define FS_NMaskedMul_f32( ... ) FastSIMD::NMaskedMul_f32<FS>( __VA_ARGS__ )
  459. // NMasked int32
  460. /// <summary>
  461. /// return ( m ? a : (a + b) )
  462. /// </summary>
  463. /// <code>
  464. /// int32v FS_NMaskedAdd_i32( int32v a, int32v b, mask32v m )
  465. /// </code>
  466. #define FS_NMaskedAdd_i32( ... ) FastSIMD::NMaskedAdd_i32<FS>( __VA_ARGS__ )
  467. /// <summary>
  468. /// return ( m ? a : (a - b) )
  469. /// </summary>
  470. /// <code>
  471. /// int32v FS_NMaskedSub_i32( int32v a, int32v b, mask32v m )
  472. /// </code>
  473. #define FS_NMaskedSub_i32( ... ) FastSIMD::NMaskedSub_i32<FS>( __VA_ARGS__ )
  474. /// <summary>
  475. /// return ( m ? a : (a * b) )
  476. /// </summary>
  477. /// <code>
  478. /// int32v FS_NMaskedMul_i32( int32v a, int32v b, mask32v m )
  479. /// </code>
  480. #define FS_NMaskedMul_i32( ... ) FastSIMD::NMaskedMul_i32<FS>( __VA_ARGS__ )
  481. namespace FastSIMD
  482. {
  483. //FMA
  484. template<typename FS>
  485. FS_INLINE typename FS::float32v FMulAdd_f32( typename FS::float32v a, typename FS::float32v b, typename FS::float32v c )
  486. {
  487. return (a * b) + c;
  488. }
  489. template<typename FS>
  490. FS_INLINE typename FS::float32v FNMulAdd_f32( typename FS::float32v a, typename FS::float32v b, typename FS::float32v c )
  491. {
  492. return -(a * b) + c;
  493. }
  494. // Masked float
  495. template<typename FS>
  496. FS_INLINE typename FS::float32v MaskedAdd_f32( typename FS::float32v a, typename FS::float32v b, typename FS::mask32v m )
  497. {
  498. return a + FS::Mask_f32( b, m );
  499. }
  500. template<typename FS>
  501. FS_INLINE typename FS::float32v MaskedSub_f32( typename FS::float32v a, typename FS::float32v b, typename FS::mask32v m )
  502. {
  503. return a - FS::Mask_f32( b, m );
  504. }
  505. template<typename FS>
  506. FS_INLINE typename FS::float32v MaskedMul_f32( typename FS::float32v a, typename FS::float32v b, typename FS::mask32v m )
  507. {
  508. return a * FS::Mask_f32( b, m );
  509. }
  510. // Masked int32
  511. template<typename FS>
  512. FS_INLINE typename FS::int32v MaskedAdd_i32( typename FS::int32v a, typename FS::int32v b, typename FS::mask32v m )
  513. {
  514. return a + FS::Mask_i32( b, m );
  515. }
  516. template<typename FS>
  517. FS_INLINE typename FS::int32v MaskedSub_i32( typename FS::int32v a, typename FS::int32v b, typename FS::mask32v m )
  518. {
  519. return a - FS::Mask_i32( b, m );
  520. }
  521. template<typename FS>
  522. FS_INLINE typename FS::int32v MaskedMul_i32( typename FS::int32v a, typename FS::int32v b, typename FS::mask32v m )
  523. {
  524. return a * FS::Mask_i32( b, m );
  525. }
  526. // NMasked float
  527. template<typename FS>
  528. FS_INLINE typename FS::float32v NMaskedAdd_f32( typename FS::float32v a, typename FS::float32v b, typename FS::mask32v m )
  529. {
  530. return a + FS::NMask_f32( b, m );
  531. }
  532. template<typename FS>
  533. FS_INLINE typename FS::float32v NMaskedSub_f32( typename FS::float32v a, typename FS::float32v b, typename FS::mask32v m )
  534. {
  535. return a - FS::NMask_f32( b, m );
  536. }
  537. template<typename FS>
  538. FS_INLINE typename FS::float32v NMaskedMul_f32( typename FS::float32v a, typename FS::float32v b, typename FS::mask32v m )
  539. {
  540. return a * FS::NMask_f32( b, m );
  541. }
  542. // NMasked int32
  543. template<typename FS>
  544. FS_INLINE typename FS::int32v NMaskedAdd_i32( typename FS::int32v a, typename FS::int32v b, typename FS::mask32v m )
  545. {
  546. return a + FS::NMask_i32( b, m );
  547. }
  548. template<typename FS>
  549. FS_INLINE typename FS::int32v NMaskedSub_i32( typename FS::int32v a, typename FS::int32v b, typename FS::mask32v m )
  550. {
  551. return a - FS::NMask_i32( b, m );
  552. }
  553. template<typename FS>
  554. FS_INLINE typename FS::int32v NMaskedMul_i32( typename FS::int32v a, typename FS::int32v b, typename FS::mask32v m )
  555. {
  556. return a * FS::NMask_i32( b, m );
  557. }
  558. template<typename FS, std::enable_if_t<std::is_same_v<typename FS::int32v, typename FS::mask32v>>* = nullptr>
  559. FS_INLINE typename FS::int32v MaskedIncrement_i32( typename FS::int32v a, typename FS::mask32v m )
  560. {
  561. return a - m;
  562. }
  563. template<typename FS, std::enable_if_t<!std::is_same_v<typename FS::int32v, typename FS::mask32v>>* = nullptr>
  564. FS_INLINE typename FS::int32v MaskedIncrement_i32( typename FS::int32v a, typename FS::mask32v m )
  565. {
  566. return MaskedSub_i32<FS>( a, typename FS::int32v( -1 ), m );
  567. }
  568. template<typename FS, std::enable_if_t<std::is_same_v<typename FS::int32v, typename FS::mask32v>>* = nullptr>
  569. FS_INLINE typename FS::int32v MaskedDecrement_i32( typename FS::int32v a, typename FS::mask32v m )
  570. {
  571. return a + m;
  572. }
  573. template<typename FS, std::enable_if_t<!std::is_same_v<typename FS::int32v, typename FS::mask32v>>* = nullptr>
  574. FS_INLINE typename FS::int32v MaskedDecrement_i32( typename FS::int32v a, typename FS::mask32v m )
  575. {
  576. return MaskedAdd_i32<FS>( a, typename FS::int32v( -1 ), m );
  577. }
  578. // Bitwise
  579. template<typename FS, std::enable_if_t<std::is_same_v<typename FS::int32v, typename FS::mask32v>>* = nullptr>
  580. FS_INLINE typename FS::mask32v BitwiseAndNot_m32( typename FS::mask32v a, typename FS::mask32v b )
  581. {
  582. return FS::BitwiseAndNot_i32( a, b );
  583. }
  584. template<typename FS, std::enable_if_t<!std::is_same_v<typename FS::int32v, typename FS::mask32v>>* = nullptr>
  585. FS_INLINE typename FS::mask32v BitwiseAndNot_m32( typename FS::mask32v a, typename FS::mask32v b )
  586. {
  587. return a & (~b);
  588. }
  589. // Trig
  590. template<typename FS>
  591. FS_INLINE typename FS::float32v Cos_f32( typename FS::float32v value )
  592. {
  593. typedef typename FS::int32v int32v;
  594. typedef typename FS::float32v float32v;
  595. typedef typename FS::mask32v mask32v;
  596. value = FS_Abs_f32( value );
  597. value -= FS_Floor_f32( value * float32v( 0.1591549f ) ) * float32v( 6.283185f );
  598. mask32v geHalfPi = value >= float32v( 1.570796f );
  599. mask32v geHalfPi2 = value >= float32v( 3.141593f );
  600. mask32v geHalfPi3 = value >= float32v( 4.7123889f );
  601. float32v cosAngle = value ^ FS_Mask_f32( ( value ^ float32v( 3.141593f ) - value ), geHalfPi );
  602. cosAngle = cosAngle ^ FS_Mask_f32( FS_Casti32_f32( int32v( 0x80000000 ) ), geHalfPi2 );
  603. cosAngle = cosAngle ^ FS_Mask_f32( cosAngle ^ ( float32v( 6.283185f ) - value ), geHalfPi3 );
  604. cosAngle *= cosAngle;
  605. cosAngle = FS_FMulAdd_f32( cosAngle, FS_FMulAdd_f32( cosAngle, float32v( 0.03679168f ), float32v( -0.49558072f ) ), float32v( 0.99940307f ) );
  606. return cosAngle ^ FS_Mask_f32( FS_Casti32_f32( int32v( 0x80000000 ) ), FS_BitwiseAndNot_m32( geHalfPi, geHalfPi3 ) );
  607. }
  608. template<typename FS>
  609. FS_INLINE typename FS::float32v Sin_f32( typename FS::float32v value )
  610. {
  611. return Cos_f32<FS>( typename FS::float32v( 1.570796f ) - value );
  612. }
  613. template<typename FS>
  614. FS_INLINE typename FS::float32v Exp_f32( typename FS::float32v x )
  615. {
  616. typedef typename FS::int32v int32v;
  617. typedef typename FS::float32v float32v;
  618. x = FS_Min_f32( x, float32v( 88.3762626647949f ) );
  619. x = FS_Max_f32( x, float32v( -88.3762626647949f ) );
  620. /* express exp(x) as exp(g + n*log(2)) */
  621. float32v fx = x * float32v( 1.44269504088896341f );
  622. fx += float32v( 0.5f );
  623. float32v flr = FS_Floor_f32( fx );
  624. fx = FS_MaskedSub_f32( flr, float32v( 1 ), flr > fx );
  625. x -= fx * float32v( 0.693359375f );
  626. x -= fx * float32v( -2.12194440e-4f );
  627. float32v y( 1.9875691500E-4f );
  628. y *= x;
  629. y += float32v( 1.3981999507E-3f );
  630. y *= x;
  631. y += float32v( 8.3334519073E-3f );
  632. y *= x;
  633. y += float32v( 4.1665795894E-2f );
  634. y *= x;
  635. y += float32v( 1.6666665459E-1f );
  636. y *= x;
  637. y += float32v( 5.0000001201E-1f );
  638. y *= x * x;
  639. y += x + float32v( 1 );
  640. /* build 2^n */
  641. int32v i = FS_Convertf32_i32( fx );
  642. // another two AVX2 instructions
  643. i += int32v( 0x7f );
  644. i <<= 23;
  645. float32v pow2n = FS_Casti32_f32( i );
  646. return y * pow2n;
  647. }
  648. template<typename FS>
  649. FS_INLINE typename FS::float32v Log_f32( typename FS::float32v x )
  650. {
  651. typedef typename FS::int32v int32v;
  652. typedef typename FS::float32v float32v;
  653. typedef typename FS::mask32v mask32v;
  654. mask32v validMask = x > float32v( 0 );
  655. x = FS_Max_f32( x, FS_Casti32_f32( int32v( 0x00800000 ) ) ); /* cut off denormalized stuff */
  656. // can be done with AVX2
  657. int32v i = FS_BitwiseShiftRightZX_i32( FS_Castf32_i32( x ), 23 );
  658. /* keep only the fractional part */
  659. x &= FS_Casti32_f32( int32v( ~0x7f800000 ) );
  660. x |= float32v( 0.5f );
  661. // this is again another AVX2 instruction
  662. i -= int32v( 0x7f );
  663. float32v e = FS_Converti32_f32( i );
  664. e += float32v( 1 );
  665. mask32v mask = x < float32v( 0.707106781186547524f );
  666. x = FS_MaskedAdd_f32( x, x, mask );
  667. x -= float32v( 1 );
  668. e = FS_MaskedSub_f32( e, float32v( 1 ), mask );
  669. float32v y = float32v( 7.0376836292E-2f );
  670. y *= x;
  671. y += float32v( -1.1514610310E-1f );
  672. y *= x;
  673. y += float32v( 1.1676998740E-1f );
  674. y *= x;
  675. y += float32v( -1.2420140846E-1f );
  676. y *= x;
  677. y += float32v( 1.4249322787E-1f );
  678. y *= x;
  679. y += float32v( -1.6668057665E-1f );
  680. y *= x;
  681. y += float32v( 2.0000714765E-1f );
  682. y *= x;
  683. y += float32v( -2.4999993993E-1f );
  684. y *= x;
  685. y += float32v( 3.3333331174E-1f );
  686. y *= x;
  687. float32v xx = x * x;
  688. y *= xx;
  689. y *= e * float32v( -2.12194440e-4f );
  690. y -= xx * float32v( 0.5f );
  691. x += y;
  692. x += e * float32v( 0.693359375f );
  693. return FS_Mask_f32( x, validMask );
  694. }
  695. template<typename FS>
  696. FS_INLINE typename FS::float32v Pow_f32( typename FS::float32v value, typename FS::float32v pow )
  697. {
  698. return Exp_f32<FS>( pow * Log_f32<FS>( value ) );
  699. }
  700. }