| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702
703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856 |
- #pragma once
- #include <cinttypes>
- #include <type_traits>
- #include <memory>
- #include "FastSIMD/FastSIMD.h"
- #ifdef _MSC_VER
- #if defined( _M_IX86_FP ) && _M_IX86_FP < 2
- #define FS_VECTORCALL
- #else
- #define FS_VECTORCALL __vectorcall
- #endif
- #define FS_INLINE __forceinline
- #else
- #define FS_VECTORCALL
- #define FS_INLINE __attribute__((always_inline)) inline
- #endif
- #ifndef NDEBUG
- #undef FS_INLINE
- #define FS_INLINE inline
- #endif
- /// <summary>
- /// Number of 32-bit wide elements that will fit into a vector
- /// </summary>
- /// <remarks>
- /// Compile time constant
- /// </remarks>
- /// <code>
- /// size_t FS_Size_32()
- /// </code>
- #define FS_Size_32() FS::template VectorSize<sizeof( int32_t )>
- // Vector builders
- /// <summary>
- /// Vector with values incrementing from 0 based on element index {0, 1, 2, 3...}
- /// </summary>
- /// <code>
- /// example: int32v::FS_Incremented()
- /// </code>
- #define FS_Incremented() Incremented()
- // Load
- /// <summary>
- /// Copies sizeof(float32v) bytes from given memory location into float32v
- /// </summary>
- /// <remarks>
- /// Memory does not need to be aligned
- /// </remarks>
- /// <code>
- /// float32v FS_Load_f32( void const* ptr )
- /// </code>
- #define FS_Load_f32( ... ) FS::Load_f32( __VA_ARGS__ )
- /// <summary>
- /// Copies sizeof(int32v) bytes from given memory location into int32v
- /// </summary>
- /// <remarks>
- /// Memory does not need to be aligned
- /// </remarks>
- /// <code>
- /// int32v FS_Load_i32( void const* ptr )
- /// </code>
- #define FS_Load_i32( ... ) FS::Load_i32( __VA_ARGS__ )
- // Store
- /// <summary>
- /// Copies all elements of float32v to given memory location
- /// </summary>
- /// <code>
- /// void FS_Store_f32( void* ptr, float32v f )
- /// </code>
- #define FS_Store_f32( ... ) FS::Store_f32( __VA_ARGS__ )
- /// <summary>
- /// Copies all elements of int32v to given memory location
- /// </summary>
- /// <code>
- /// void FS_Store_i32( void* ptr, int32v i )
- /// </code>
- #define FS_Store_i32( ... ) FS::Store_i32( __VA_ARGS__ )
- // Extract
- /// <summary>
- /// Retrieve element 0 from vector
- /// </summary>
- /// <code>
- /// float FS_Extract0_f32( float32v f )
- /// </code>
- #define FS_Extract0_f32( ... ) FS::Extract0_f32( __VA_ARGS__ )
- /// <summary>
- /// Retrieve element 0 from vector
- /// </summary>
- /// <code>
- /// int32_t FS_Extract0_i32( int32v i )
- /// </code>
- #define FS_Extract0_i32( ... ) FS::Extract0_i32( __VA_ARGS__ )
- /// <summary>
- /// Retrieve element from vector at position
- /// </summary>
- /// <code>
- /// float FS_Extract_f32( float32v f, size_t idx )
- /// </code>
- #define FS_Extract_f32( ... ) FS::Extract_f32( __VA_ARGS__ )
- /// <summary>
- /// Retrieve element from vector at position
- /// </summary>
- /// <code>
- /// int32_t FS_Extract_i32( int32v i, size_t idx )
- /// </code>
- #define FS_Extract_i32( ... ) FS::Extract_i32( __VA_ARGS__ )
- // Cast
- /// <summary>
- /// Bitwise cast int to float
- /// </summary>
- /// <code>
- /// float32v FS_Casti32_f32( int32v i )
- /// </code>
- #define FS_Casti32_f32( ... ) FS::Casti32_f32( __VA_ARGS__ )
- /// <summary>
- /// Bitwise cast float to int
- /// </summary>
- /// <code>
- /// int32v FS_Castf32_i32( float32v f )
- /// </code>
- #define FS_Castf32_i32( ... ) FS::Castf32_i32( __VA_ARGS__ )
- // Convert
- /// <summary>
- /// Convert int to float
- /// </summary>
- /// <remarks>
- /// Rounding: truncate
- /// </remarks>
- /// <code>
- /// float32v FS_Converti32_f32( int32v i )
- /// </code>
- #define FS_Converti32_f32( ... ) FS::Converti32_f32( __VA_ARGS__ )
- /// <summary>
- /// Convert float to int
- /// </summary>
- /// <code>
- /// int32v FS_Convertf32_i32( float32v f )
- /// </code>
- #define FS_Convertf32_i32( ... ) FS::Convertf32_i32( __VA_ARGS__ )
- // Select
- /// <summary>
- /// return ( m ? a : b )
- /// </summary>
- /// <code>
- /// float32v FS_Select_f32( mask32v m, float32v a, float32v b )
- /// </code>
- #define FS_Select_f32( ... ) FS::Select_f32( __VA_ARGS__ )
- /// <summary>
- /// return ( m ? a : b )
- /// </summary>
- /// <code>
- /// int32v FS_Select_i32( mask32v m, int32v a, int32v b )
- /// </code>
- #define FS_Select_i32( ... ) FS::Select_i32( __VA_ARGS__ )
- // Min, Max
- /// <summary>
- /// return ( a < b ? a : b )
- /// </summary>
- /// <code>
- /// float32v FS_Min_f32( float32v a, float32v b )
- /// </code>
- #define FS_Min_f32( ... ) FS::Min_f32( __VA_ARGS__ )
- /// <summary>
- /// return ( a > b ? a : b )
- /// </summary>
- /// <code>
- /// float32v FS_Max_f32( float32v a, float32v b )
- /// </code>
- #define FS_Max_f32( ... ) FS::Max_f32( __VA_ARGS__ )
- /// <summary>
- /// return ( a < b ? a : b )
- /// </summary>
- /// <code>
- /// int32v FS_Min_i32( int32v a, int32v b )
- /// </code>
- #define FS_Min_i32( ... ) FS::Min_i32( __VA_ARGS__ )
- /// <summary>
- /// return ( a > b ? a : b )
- /// </summary>
- /// <code>
- /// int32v FS_Max_i32( int32v a, int32v b )
- /// </code>
- #define FS_Max_i32( ... ) FS::Max_i32( __VA_ARGS__ )
- // Bitwise
- /// <summary>
- /// return ( a & ~b )
- /// </summary>
- /// <code>
- /// float32v FS_BitwiseAndNot_f32( float32v a, float32v b )
- /// </code>
- #define FS_BitwiseAndNot_f32( ... ) FS::BitwiseAndNot_f32( __VA_ARGS__ )
- /// <summary>
- /// return ( a & ~b )
- /// </summary>
- /// <code>
- /// int32v FS_BitwiseAndNot_i32( int32v a, int32v b )
- /// </code>
- #define FS_BitwiseAndNot_i32( ... ) FS::BitwiseAndNot_i32( __VA_ARGS__ )
- /// <summary>
- /// return ( a & ~b )
- /// </summary>
- /// <code>
- /// mask32v FS_BitwiseAndNot_m32( mask32v a, mask32v b )
- /// </code>
- #define FS_BitwiseAndNot_m32( ... ) FastSIMD::BitwiseAndNot_m32<FS>( __VA_ARGS__ )
- /// <summary>
- /// return ZeroExtend( a >> b )
- /// </summary>
- /// <code>
- /// float32v FS_BitwiseShiftRightZX_f32( float32v a, int32_t b )
- /// </code>
- #define FS_BitwiseShiftRightZX_f32( ... ) FS::BitwiseShiftRightZX_f32( __VA_ARGS__ )
- /// <summary>
- /// return ZeroExtend( a >> b )
- /// </summary>
- /// <code>
- /// int32v FS_BitwiseShiftRightZX_i32( int32v a, int32_t b )
- /// </code>
- #define FS_BitwiseShiftRightZX_i32( ... ) FS::BitwiseShiftRightZX_i32( __VA_ARGS__ )
- // Abs
- /// <summary>
- /// return ( a < 0 ? -a : a )
- /// </summary>
- /// <code>
- /// float32v FS_Abs_f32( float32v a )
- /// </code>
- #define FS_Abs_f32( ... ) FS::Abs_f32( __VA_ARGS__ )
- /// <summary>
- /// return ( a < 0 ? -a : a )
- /// </summary>
- /// <code>
- /// int32v FS_Abs_i32( int32v a )
- /// </code>
- #define FS_Abs_i32( ... ) FS::Abs_i32( __VA_ARGS__ )
- // Float math
- /// <summary>
- /// return sqrt( a )
- /// </summary>
- /// <code>
- /// float32v FS_Sqrt_f32( float32v a )
- /// </code>
- #define FS_Sqrt_f32( ... ) FS::Sqrt_f32( __VA_ARGS__ )
- /// <summary>
- /// return APPROXIMATE( 1.0 / sqrt( a ) )
- /// </summary>
- /// <code>
- /// float32v FS_InvSqrt_f32( float32v a )
- /// </code>
- #define FS_InvSqrt_f32( ... ) FS::InvSqrt_f32( __VA_ARGS__ )
- /// <summary>
- /// return APPROXIMATE( 1.0 / a )
- /// </summary>
- /// <code>
- /// float32v FS_Reciprocal_f32( float32v a )
- /// </code>
- #define FS_Reciprocal_f32( ... ) FS::Reciprocal_f32( __VA_ARGS__ )
- // Floor, Ceil, Round
- /// <summary>
- /// return floor( a )
- /// </summary>
- /// <remarks>
- /// Rounding: Towards negative infinity
- /// </remarks>
- /// <code>
- /// float32v FS_Floor_f32( float32v a )
- /// </code>
- #define FS_Floor_f32( ... ) FS::Floor_f32( __VA_ARGS__ )
- /// <summary>
- /// return ceil( a )
- /// </summary>
- /// <remarks>
- /// Rounding: Towards positive infinity
- /// </remarks>
- /// <code>
- /// float32v FS_Ceil_f32( float32v a )
- /// </code>
- #define FS_Ceil_f32( ... ) FS::Ceil_f32( __VA_ARGS__ )
- /// <summary>
- /// return round( a )
- /// </summary>
- /// <remarks>
- /// Rounding: Banker's rounding
- /// </remarks>
- /// <code>
- /// float32v FS_Round_f32( float32v a )
- /// </code>
- #define FS_Round_f32( ... ) FS::Round_f32( __VA_ARGS__ )
- // Trig
- /// <summary>
- /// return APPROXIMATE( cos( a ) )
- /// </summary>
- /// <code>
- /// float32v FS_Cos_f32( float32v a )
- /// </code>
- #define FS_Cos_f32( ... ) FastSIMD::Cos_f32<FS>( __VA_ARGS__ )
- /// <summary>
- /// return APPROXIMATE( sin( a ) )
- /// </summary>
- /// <code>
- /// float32v FS_Sin_f32( float32v a )
- /// </code>
- #define FS_Sin_f32( ... ) FastSIMD::Sin_f32<FS>( __VA_ARGS__ )
- // Math
- /// <summary>
- /// return pow( v, pow )
- /// </summary>
- /// <code>
- /// float32v FS_Pow_f32( float32v v, float32v pow )
- /// </code>
- #define FS_Pow_f32( ... ) FastSIMD::Pow_f32<FS>( __VA_ARGS__ )
- /// <summary>
- /// return log( a )
- /// </summary>
- /// <remarks>
- /// a <= 0 returns 0
- /// </remarks>
- /// <code>
- /// float32v FS_Log_f32( float32v a )
- /// </code>
- #define FS_Log_f32( ... ) FastSIMD::Log_f32<FS>( __VA_ARGS__ )
- /// <summary>
- /// return exp( a )
- /// </summary>
- /// <remarks>
- /// a will be clamped to -88.376, 88.376
- /// </remarks>
- /// <code>
- /// float32v FS_Exp_f32( float32v a )
- /// </code>
- #define FS_Exp_f32( ... ) FastSIMD::Exp_f32<FS>( __VA_ARGS__ )
- // Mask
- /// <summary>
- /// return ( m ? a : 0 )
- /// </summary>
- /// <code>
- /// int32v FS_Mask_i32( int32v a, mask32v m )
- /// </code>
- #define FS_Mask_i32( ... ) FS::Mask_i32( __VA_ARGS__ )
- /// <summary>
- /// return ( m ? a : 0 )
- /// </summary>
- /// <code>
- /// float32v FS_Mask_f32( float32v a, mask32v m )
- /// </code>
- #define FS_Mask_f32( ... ) FS::Mask_f32( __VA_ARGS__ )
- /// <summary>
- /// return ( m ? 0 : a )
- /// </summary>
- /// <code>
- /// int32v FS_NMask_i32( int32v a, mask32v m )
- /// </code>
- #define FS_NMask_i32( ... ) FS::NMask_i32( __VA_ARGS__ )
- /// <summary>
- /// return ( m ? 0 : a )
- /// </summary>
- /// <code>
- /// float32v FS_NMask_f32( float32v a, mask32v m )
- /// </code>
- #define FS_NMask_f32( ... ) FS::NMask_f32( __VA_ARGS__ )
- /// <summary>
- /// return m.contains( true )
- /// </summary>
- /// <code>
- /// bool FS_AnyMask_bool( mask32v m )
- /// </code>
- #define FS_AnyMask_bool( ... ) FS::AnyMask_bool( __VA_ARGS__ )
- // FMA
- /// <summary>
- /// return ( (a * b) + c )
- /// </summary>
- /// <code>
- /// float32v FS_FMulAdd_f32( float32v a, float32v b, float32v c )
- /// </code>
- #define FS_FMulAdd_f32( ... ) FastSIMD::FMulAdd_f32<FS>( __VA_ARGS__ )
- /// <summary>
- /// return ( -(a * b) + c )
- /// </summary>
- /// <code>
- /// float32v FS_FNMulAdd_f32( float32v a, float32v b, float32v c )
- /// </code>
- #define FS_FNMulAdd_f32( ... ) FastSIMD::FNMulAdd_f32<FS>( __VA_ARGS__ )
- // Masked float
- /// <summary>
- /// return ( m ? (a + b) : a )
- /// </summary>
- /// <code>
- /// float32v FS_MaskedAdd_f32( float32v a, float32v b, mask32v m )
- /// </code>
- #define FS_MaskedAdd_f32( ... ) FastSIMD::MaskedAdd_f32<FS>( __VA_ARGS__ )
- /// <summary>
- /// return ( m ? (a - b) : a )
- /// </summary>
- /// <code>
- /// float32v FS_MaskedSub_f32( float32v a, float32v b, mask32v m )
- /// </code>
- #define FS_MaskedSub_f32( ... ) FastSIMD::MaskedSub_f32<FS>( __VA_ARGS__ )
- /// <summary>
- /// return ( m ? (a * b) : a )
- /// </summary>
- /// <code>
- /// float32v FS_MaskedMul_f32( float32v a, float32v b, mask32v m )
- /// </code>
- #define FS_MaskedMul_f32( ... ) FastSIMD::MaskedMul_f32<FS>( __VA_ARGS__ )
- // Masked int32
- /// <summary>
- /// return ( m ? (a + b) : a )
- /// </summary>
- /// <code>
- /// int32v FS_MaskedAdd_i32( int32v a, int32v b, mask32v m )
- /// </code>
- #define FS_MaskedAdd_i32( ... ) FastSIMD::MaskedAdd_i32<FS>( __VA_ARGS__ )
- /// <summary>
- /// return ( m ? (a - b) : a )
- /// </summary>
- /// <code>
- /// int32v FS_MaskedSub_i32( int32v a, int32v b, mask32v m )
- /// </code>
- #define FS_MaskedSub_i32( ... ) FastSIMD::MaskedSub_i32<FS>( __VA_ARGS__ )
- /// <summary>
- /// return ( m ? (a * b) : a )
- /// </summary>
- /// <code>
- /// int32v FS_MaskedMul_i32( int32v a, int32v b, mask32v m )
- /// </code>
- #define FS_MaskedMul_i32( ... ) FastSIMD::MaskedMul_i32<FS>( __VA_ARGS__ )
- /// <summary>
- /// return ( m ? (a + 1) : a )
- /// </summary>
- /// <code>
- /// int32v FS_MaskedIncrement_i32( int32v a, mask32v m )
- /// </code>
- #define FS_MaskedIncrement_i32( ... ) FastSIMD::MaskedIncrement_i32<FS>( __VA_ARGS__ )
- /// <summary>
- /// return ( m ? (a - 1) : a )
- /// </summary>
- /// <code>
- /// int32v FS_MaskedDecrement_i32( int32v a, mask32v m )
- /// </code>
- #define FS_MaskedDecrement_i32( ... ) FastSIMD::MaskedDecrement_i32<FS>( __VA_ARGS__ )
- // NMasked float
- /// <summary>
- /// return ( m ? a : (a + b) )
- /// </summary>
- /// <code>
- /// float32v FS_NMaskedAdd_f32( float32v a, float32v b, mask32v m )
- /// </code>
- #define FS_NMaskedAdd_f32( ... ) FastSIMD::NMaskedAdd_f32<FS>( __VA_ARGS__ )
- /// <summary>
- /// return ( m ? a : (a - b) )
- /// </summary>
- /// <code>
- /// float32v FS_NMaskedSub_f32( float32v a, float32v b, mask32v m )
- /// </code>
- #define FS_NMaskedSub_f32( ... ) FastSIMD::NMaskedSub_f32<FS>( __VA_ARGS__ )
- /// <summary>
- /// return ( m ? a : (a * b) )
- /// </summary>
- /// <code>
- /// float32v FS_NMaskedMul_f32( float32v a, float32v b, mask32v m )
- /// </code>
- #define FS_NMaskedMul_f32( ... ) FastSIMD::NMaskedMul_f32<FS>( __VA_ARGS__ )
- // NMasked int32
- /// <summary>
- /// return ( m ? a : (a + b) )
- /// </summary>
- /// <code>
- /// int32v FS_NMaskedAdd_i32( int32v a, int32v b, mask32v m )
- /// </code>
- #define FS_NMaskedAdd_i32( ... ) FastSIMD::NMaskedAdd_i32<FS>( __VA_ARGS__ )
- /// <summary>
- /// return ( m ? a : (a - b) )
- /// </summary>
- /// <code>
- /// int32v FS_NMaskedSub_i32( int32v a, int32v b, mask32v m )
- /// </code>
- #define FS_NMaskedSub_i32( ... ) FastSIMD::NMaskedSub_i32<FS>( __VA_ARGS__ )
- /// <summary>
- /// return ( m ? a : (a * b) )
- /// </summary>
- /// <code>
- /// int32v FS_NMaskedMul_i32( int32v a, int32v b, mask32v m )
- /// </code>
- #define FS_NMaskedMul_i32( ... ) FastSIMD::NMaskedMul_i32<FS>( __VA_ARGS__ )
- namespace FastSIMD
- {
- //FMA
- // Returns ( (a * b) + c ), written with a separate multiply and add.
- template<typename FS>
- FS_INLINE typename FS::float32v FMulAdd_f32( typename FS::float32v a, typename FS::float32v b, typename FS::float32v c )
- {
-     const typename FS::float32v product = a * b;
-     return product + c;
- }
- // Returns ( -(a * b) + c ): the product is negated before c is added.
- template<typename FS>
- FS_INLINE typename FS::float32v FNMulAdd_f32( typename FS::float32v a, typename FS::float32v b, typename FS::float32v c )
- {
-     const typename FS::float32v negatedProduct = -( a * b );
-     return negatedProduct + c;
- }
- // Masked float
- // Returns ( m ? (a + b) : a ).
- template<typename FS>
- FS_INLINE typename FS::float32v MaskedAdd_f32( typename FS::float32v a, typename FS::float32v b, typename FS::mask32v m )
- {
-     // Mask_f32 yields 0 where m is false, so adding it leaves a untouched there
-     const typename FS::float32v addend = FS::Mask_f32( b, m );
-     return a + addend;
- }
- // Returns ( m ? (a - b) : a ).
- template<typename FS>
- FS_INLINE typename FS::float32v MaskedSub_f32( typename FS::float32v a, typename FS::float32v b, typename FS::mask32v m )
- {
-     // Mask_f32 yields 0 where m is false, so subtracting it leaves a untouched there
-     const typename FS::float32v subtrahend = FS::Mask_f32( b, m );
-     return a - subtrahend;
- }
- // Returns ( m ? (a * b) : a ).
- template<typename FS>
- FS_INLINE typename FS::float32v MaskedMul_f32( typename FS::float32v a, typename FS::float32v b, typename FS::mask32v m )
- {
-     // Masking b to 0 (as done for add/sub) would turn unselected lanes into a * 0 = 0,
-     // so choose between the product and the untouched a instead.
-     return FS::Select_f32( m, a * b, a );
- }
- // Masked int32
- // Returns ( m ? (a + b) : a ).
- template<typename FS>
- FS_INLINE typename FS::int32v MaskedAdd_i32( typename FS::int32v a, typename FS::int32v b, typename FS::mask32v m )
- {
-     // Mask_i32 yields 0 where m is false, so adding it leaves a untouched there
-     const typename FS::int32v addend = FS::Mask_i32( b, m );
-     return a + addend;
- }
- // Returns ( m ? (a - b) : a ).
- template<typename FS>
- FS_INLINE typename FS::int32v MaskedSub_i32( typename FS::int32v a, typename FS::int32v b, typename FS::mask32v m )
- {
-     // Mask_i32 yields 0 where m is false, so subtracting it leaves a untouched there
-     const typename FS::int32v subtrahend = FS::Mask_i32( b, m );
-     return a - subtrahend;
- }
- // Returns ( m ? (a * b) : a ).
- template<typename FS>
- FS_INLINE typename FS::int32v MaskedMul_i32( typename FS::int32v a, typename FS::int32v b, typename FS::mask32v m )
- {
-     // Masking b to 0 (as done for add/sub) would turn unselected lanes into a * 0 = 0,
-     // so choose between the product and the untouched a instead.
-     return FS::Select_i32( m, a * b, a );
- }
- // NMasked float
- // Returns ( m ? a : (a + b) ).
- template<typename FS>
- FS_INLINE typename FS::float32v NMaskedAdd_f32( typename FS::float32v a, typename FS::float32v b, typename FS::mask32v m )
- {
-     // NMask_f32 yields 0 where m is true, so adding it leaves a untouched there
-     const typename FS::float32v addend = FS::NMask_f32( b, m );
-     return a + addend;
- }
- // Returns ( m ? a : (a - b) ).
- template<typename FS>
- FS_INLINE typename FS::float32v NMaskedSub_f32( typename FS::float32v a, typename FS::float32v b, typename FS::mask32v m )
- {
-     // NMask_f32 yields 0 where m is true, so subtracting it leaves a untouched there
-     const typename FS::float32v subtrahend = FS::NMask_f32( b, m );
-     return a - subtrahend;
- }
- // Returns ( m ? a : (a * b) ).
- template<typename FS>
- FS_INLINE typename FS::float32v NMaskedMul_f32( typename FS::float32v a, typename FS::float32v b, typename FS::mask32v m )
- {
-     // Zeroing b on the lanes where m is true would make those lanes a * 0 = 0,
-     // so choose between the untouched a and the product instead.
-     return FS::Select_f32( m, a, a * b );
- }
- // NMasked int32
- // Returns ( m ? a : (a + b) ).
- template<typename FS>
- FS_INLINE typename FS::int32v NMaskedAdd_i32( typename FS::int32v a, typename FS::int32v b, typename FS::mask32v m )
- {
-     // NMask_i32 yields 0 where m is true, so adding it leaves a untouched there
-     const typename FS::int32v addend = FS::NMask_i32( b, m );
-     return a + addend;
- }
- // Returns ( m ? a : (a - b) ).
- template<typename FS>
- FS_INLINE typename FS::int32v NMaskedSub_i32( typename FS::int32v a, typename FS::int32v b, typename FS::mask32v m )
- {
-     // NMask_i32 yields 0 where m is true, so subtracting it leaves a untouched there
-     const typename FS::int32v subtrahend = FS::NMask_i32( b, m );
-     return a - subtrahend;
- }
- // Returns ( m ? a : (a * b) ).
- template<typename FS>
- FS_INLINE typename FS::int32v NMaskedMul_i32( typename FS::int32v a, typename FS::int32v b, typename FS::mask32v m )
- {
-     // Zeroing b on the lanes where m is true would make those lanes a * 0 = 0,
-     // so choose between the untouched a and the product instead.
-     return FS::Select_i32( m, a, a * b );
- }
- // Returns ( m ? (a + 1) : a ).
- // Overload used when int32v and mask32v are the same type: the mask is subtracted directly.
- // NOTE(review): this relies on true lanes of m holding -1 and false lanes 0 - confirm per SIMD level.
- template<typename FS, std::enable_if_t<std::is_same_v<typename FS::int32v, typename FS::mask32v>>* = nullptr>
- FS_INLINE typename FS::int32v MaskedIncrement_i32( typename FS::int32v a, typename FS::mask32v m )
- {
-     const typename FS::int32v incremented = a - m;
-     return incremented;
- }
- // Overload used when mask32v is a distinct type: subtract -1 on the lanes selected by m.
- template<typename FS, std::enable_if_t<!std::is_same_v<typename FS::int32v, typename FS::mask32v>>* = nullptr>
- FS_INLINE typename FS::int32v MaskedIncrement_i32( typename FS::int32v a, typename FS::mask32v m )
- {
-     const typename FS::int32v minusOne( -1 );
-     return MaskedSub_i32<FS>( a, minusOne, m );
- }
- // Returns ( m ? (a - 1) : a ).
- // Overload used when int32v and mask32v are the same type: the mask is added directly.
- // NOTE(review): this relies on true lanes of m holding -1 and false lanes 0 - confirm per SIMD level.
- template<typename FS, std::enable_if_t<std::is_same_v<typename FS::int32v, typename FS::mask32v>>* = nullptr>
- FS_INLINE typename FS::int32v MaskedDecrement_i32( typename FS::int32v a, typename FS::mask32v m )
- {
-     const typename FS::int32v decremented = a + m;
-     return decremented;
- }
- // Overload used when mask32v is a distinct type: add -1 on the lanes selected by m.
- template<typename FS, std::enable_if_t<!std::is_same_v<typename FS::int32v, typename FS::mask32v>>* = nullptr>
- FS_INLINE typename FS::int32v MaskedDecrement_i32( typename FS::int32v a, typename FS::mask32v m )
- {
-     const typename FS::int32v minusOne( -1 );
-     return MaskedAdd_i32<FS>( a, minusOne, m );
- }
- // Bitwise
- // Returns ( a & ~b ) for masks.
- // Overload used when mask32v is the same type as int32v: forwards to the integer and-not.
- template<typename FS, std::enable_if_t<std::is_same_v<typename FS::int32v, typename FS::mask32v>>* = nullptr>
- FS_INLINE typename FS::mask32v BitwiseAndNot_m32( typename FS::mask32v a, typename FS::mask32v b )
- {
-     const typename FS::mask32v combined = FS::BitwiseAndNot_i32( a, b );
-     return combined;
- }
- // Overload used when mask32v is a distinct type: built from the mask's own ~ and & operators.
- template<typename FS, std::enable_if_t<!std::is_same_v<typename FS::int32v, typename FS::mask32v>>* = nullptr>
- FS_INLINE typename FS::mask32v BitwiseAndNot_m32( typename FS::mask32v a, typename FS::mask32v b )
- {
-     const typename FS::mask32v invertedB = ~b;
-     return a & invertedB;
- }
- // Trig
- // Returns APPROXIMATE( cos( value ) ).
- // The input is reduced to [0, 2*pi), folded into a quarter period by symmetry,
- // evaluated with an even polynomial and finally given the correct sign.
- template<typename FS>
- FS_INLINE typename FS::float32v Cos_f32( typename FS::float32v value )
- {
-     using int32v = typename FS::int32v;
-     using float32v = typename FS::float32v;
-     using mask32v = typename FS::mask32v;
-     // cos is even, so work on |value|; 0.1591549 ~ 1 / (2*pi), 6.283185 ~ 2*pi
-     value = FS_Abs_f32( value );
-     value -= FS_Floor_f32( value * float32v( 0.1591549f ) ) * float32v( 6.283185f );
-     // Range tests against pi/2, pi and 3*pi/2
-     mask32v pastQuarter = value >= float32v( 1.570796f );
-     mask32v pastHalf = value >= float32v( 3.141593f );
-     mask32v pastThreeQuarters = value >= float32v( 4.7123889f );
-     const float32v signBit = FS_Casti32_f32( int32v( 0x80000000 ) );
-     // x ^ Mask( x ^ y, m ) replaces x with y on the lanes selected by m
-     float32v folded = value ^ FS_Mask_f32( value ^ ( float32v( 3.141593f ) - value ), pastQuarter );
-     folded = folded ^ FS_Mask_f32( signBit, pastHalf );
-     folded = folded ^ FS_Mask_f32( folded ^ ( float32v( 6.283185f ) - value ), pastThreeQuarters );
-     // Polynomial in the squared folded angle
-     folded *= folded;
-     float32v inner = FS_FMulAdd_f32( folded, float32v( 0.03679168f ), float32v( -0.49558072f ) );
-     float32v result = FS_FMulAdd_f32( folded, inner, float32v( 0.99940307f ) );
-     // Flip the sign for angles in [pi/2, 3*pi/2)
-     return result ^ FS_Mask_f32( signBit, FS_BitwiseAndNot_m32( pastQuarter, pastThreeQuarters ) );
- }
- // Returns APPROXIMATE( sin( value ) ) through the identity sin( x ) = cos( pi/2 - x ).
- template<typename FS>
- FS_INLINE typename FS::float32v Sin_f32( typename FS::float32v value )
- {
-     const typename FS::float32v shifted = typename FS::float32v( 1.570796f ) - value;
-     return Cos_f32<FS>( shifted );
- }
- // Returns an approximation of exp( x ); x is clamped to [-88.3762626647949, 88.3762626647949] first.
- template<typename FS>
- FS_INLINE typename FS::float32v Exp_f32( typename FS::float32v x )
- {
-     using int32v = typename FS::int32v;
-     using float32v = typename FS::float32v;
-     // Clamp: upper bound first, then lower bound
-     x = FS_Max_f32( FS_Min_f32( x, float32v( 88.3762626647949f ) ), float32v( -88.3762626647949f ) );
-     /* express exp(x) as exp(g + n*log(2)) */
-     float32v n = x * float32v( 1.44269504088896341f );
-     n += float32v( 0.5f );
-     float32v floored = FS_Floor_f32( n );
-     n = FS_MaskedSub_f32( floored, float32v( 1 ), floored > n );
-     // g = x - n*log(2); the two constants below sum to log(2)
-     x -= n * float32v( 0.693359375f );
-     x -= n * float32v( -2.12194440e-4f );
-     // Horner evaluation of the polynomial in g; coefficients after the leading one
-     constexpr float polyTail[] = { 1.3981999507E-3f, 8.3334519073E-3f, 4.1665795894E-2f, 1.6666665459E-1f, 5.0000001201E-1f };
-     float32v y( 1.9875691500E-4f );
-     for( float coeff : polyTail )
-     {
-         y *= x;
-         y += float32v( coeff );
-     }
-     y *= x * x;
-     y += x + float32v( 1 );
-     /* build 2^n */
-     int32v exponentBits = FS_Convertf32_i32( n );
-     // add the exponent bias 0x7f and shift into the exponent field
-     exponentBits += int32v( 0x7f );
-     exponentBits <<= 23;
-     return y * FS_Casti32_f32( exponentBits );
- }
- // Returns an approximation of the natural logarithm of x; lanes with x <= 0 return 0.
- // x is split into an exponent e and a mantissa, a polynomial is evaluated on the
- // mantissa and e * log(2) is added back, with log(2) split into
- // 0.693359375 and -2.12194440e-4 (presumably following the Cephes logf scheme).
- template<typename FS>
- FS_INLINE typename FS::float32v Log_f32( typename FS::float32v x )
- {
-     typedef typename FS::int32v int32v;
-     typedef typename FS::float32v float32v;
-     typedef typename FS::mask32v mask32v;
-     // Remember the valid lanes before x is modified
-     mask32v validMask = x > float32v( 0 );
-     x = FS_Max_f32( x, FS_Casti32_f32( int32v( 0x00800000 ) ) ); /* cut off denormalized stuff */
-     // Shift the float bits right by 23 to obtain the biased exponent
-     int32v i = FS_BitwiseShiftRightZX_i32( FS_Castf32_i32( x ), 23 );
-     /* keep only the fractional part */
-     x &= FS_Casti32_f32( int32v( ~0x7f800000 ) );
-     x |= float32v( 0.5f );
-     // Remove the exponent bias
-     i -= int32v( 0x7f );
-     float32v e = FS_Converti32_f32( i );
-     e += float32v( 1 );
-     // Mantissa below sqrt(0.5): double it and lower the exponent by one
-     mask32v mask = x < float32v( 0.707106781186547524f );
-     x = FS_MaskedAdd_f32( x, x, mask );
-     x -= float32v( 1 );
-     e = FS_MaskedSub_f32( e, float32v( 1 ), mask );
-     // Horner evaluation of the polynomial in x
-     float32v y = float32v( 7.0376836292E-2f );
-     y *= x;
-     y += float32v( -1.1514610310E-1f );
-     y *= x;
-     y += float32v( 1.1676998740E-1f );
-     y *= x;
-     y += float32v( -1.2420140846E-1f );
-     y *= x;
-     y += float32v( 1.4249322787E-1f );
-     y *= x;
-     y += float32v( -1.6668057665E-1f );
-     y *= x;
-     y += float32v( 2.0000714765E-1f );
-     y *= x;
-     y += float32v( -2.4999993993E-1f );
-     y *= x;
-     y += float32v( 3.3333331174E-1f );
-     y *= x;
-     float32v xx = x * x;
-     y *= xx;
-     // Add the low part of e * log(2). This must be an addition: multiplying would
-     // wipe out the polynomial term whenever e is 0 and drop this correction entirely.
-     y += e * float32v( -2.12194440e-4f );
-     y -= xx * float32v( 0.5f );
-     x += y;
-     // Add the high part of e * log(2)
-     x += e * float32v( 0.693359375f );
-     return FS_Mask_f32( x, validMask );
- }
- // Returns pow( value, pow ) computed as exp( pow * log( value ) ).
- template<typename FS>
- FS_INLINE typename FS::float32v Pow_f32( typename FS::float32v value, typename FS::float32v pow )
- {
-     const typename FS::float32v logValue = Log_f32<FS>( value );
-     return Exp_f32<FS>( pow * logValue );
- }
- }
|