2
0

vec2fa.h 13 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301
  1. // Copyright 2009-2021 Intel Corporation
  2. // SPDX-License-Identifier: Apache-2.0
  3. #pragma once
  4. #include "../sys/alloc.h"
  5. #include "math.h"
  6. #include "../simd/sse.h"
  7. namespace embree
  8. {
  9. ////////////////////////////////////////////////////////////////////////////////
  10. /// SSE Vec2fa Type
  11. ////////////////////////////////////////////////////////////////////////////////
  12. struct __aligned(16) Vec2fa
  13. {
  14. ALIGNED_STRUCT_(16);
  15. typedef float Scalar;
  16. enum { N = 2 };
  17. union {
  18. __m128 m128;
  19. struct { float x,y,az,aw; };
  20. };
  21. ////////////////////////////////////////////////////////////////////////////////
  22. /// Constructors, Assignment & Cast Operators
  23. ////////////////////////////////////////////////////////////////////////////////
  24. __forceinline Vec2fa( ) {}
  25. __forceinline Vec2fa( const __m128 a ) : m128(a) {}
  26. __forceinline Vec2fa ( const Vec2<float>& other ) { x = other.x; y = other.y; }
  27. __forceinline Vec2fa& operator =( const Vec2<float>& other ) { x = other.x; y = other.y; return *this; }
  28. __forceinline Vec2fa ( const Vec2fa& other ) { m128 = other.m128; }
  29. __forceinline Vec2fa& operator =( const Vec2fa& other ) { m128 = other.m128; return *this; }
  30. __forceinline explicit Vec2fa( const float a ) : m128(_mm_set1_ps(a)) {}
  31. __forceinline Vec2fa( const float x, const float y) : m128(_mm_set_ps(y, y, y, x)) {}
  32. __forceinline explicit Vec2fa( const __m128i a ) : m128(_mm_cvtepi32_ps(a)) {}
  33. __forceinline operator const __m128&() const { return m128; }
  34. __forceinline operator __m128&() { return m128; }
  35. ////////////////////////////////////////////////////////////////////////////////
  36. /// Loads and Stores
  37. ////////////////////////////////////////////////////////////////////////////////
  38. static __forceinline Vec2fa load( const void* const a ) {
  39. return Vec2fa(_mm_and_ps(_mm_load_ps((float*)a),_mm_castsi128_ps(_mm_set_epi32(0, 0, -1, -1))));
  40. }
  41. static __forceinline Vec2fa loadu( const void* const a ) {
  42. return Vec2fa(_mm_and_ps(_mm_loadu_ps((float*)a),_mm_castsi128_ps(_mm_set_epi32(0, 0, -1, -1))));
  43. }
  44. static __forceinline void storeu ( void* ptr, const Vec2fa& v ) {
  45. _mm_storeu_ps((float*)ptr,v);
  46. }
  47. ////////////////////////////////////////////////////////////////////////////////
  48. /// Constants
  49. ////////////////////////////////////////////////////////////////////////////////
  50. __forceinline Vec2fa( ZeroTy ) : m128(_mm_setzero_ps()) {}
  51. __forceinline Vec2fa( OneTy ) : m128(_mm_set1_ps(1.0f)) {}
  52. __forceinline Vec2fa( PosInfTy ) : m128(_mm_set1_ps(pos_inf)) {}
  53. __forceinline Vec2fa( NegInfTy ) : m128(_mm_set1_ps(neg_inf)) {}
  54. ////////////////////////////////////////////////////////////////////////////////
  55. /// Array Access
  56. ////////////////////////////////////////////////////////////////////////////////
  57. __forceinline const float& operator []( const size_t index ) const { assert(index < 2); return (&x)[index]; }
  58. __forceinline float& operator []( const size_t index ) { assert(index < 2); return (&x)[index]; }
  59. };
  60. ////////////////////////////////////////////////////////////////////////////////
  61. /// Unary Operators
  62. ////////////////////////////////////////////////////////////////////////////////
  63. __forceinline Vec2fa operator +( const Vec2fa& a ) { return a; }
  64. __forceinline Vec2fa operator -( const Vec2fa& a ) {
  65. const __m128 mask = _mm_castsi128_ps(_mm_set1_epi32(0x80000000));
  66. return _mm_xor_ps(a.m128, mask);
  67. }
  68. __forceinline Vec2fa abs ( const Vec2fa& a ) {
  69. const __m128 mask = _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff));
  70. return _mm_and_ps(a.m128, mask);
  71. }
  72. __forceinline Vec2fa sign ( const Vec2fa& a ) {
  73. return blendv_ps(Vec2fa(one), -Vec2fa(one), _mm_cmplt_ps (a,Vec2fa(zero)));
  74. }
  75. __forceinline Vec2fa rcp ( const Vec2fa& a )
  76. {
  77. #if defined(__AVX512VL__)
  78. const Vec2fa r = _mm_rcp14_ps(a.m128);
  79. #else
  80. const Vec2fa r = _mm_rcp_ps(a.m128);
  81. #endif
  82. #if defined(__AVX2__)
  83. const Vec2fa res = _mm_mul_ps(r,_mm_fnmadd_ps(r, a, vfloat4(2.0f)));
  84. #else
  85. const Vec2fa res = _mm_mul_ps(r,_mm_sub_ps(vfloat4(2.0f), _mm_mul_ps(r, a)));
  86. //return _mm_sub_ps(_mm_add_ps(r, r), _mm_mul_ps(_mm_mul_ps(r, r), a));
  87. #endif
  88. return res;
  89. }
  90. __forceinline Vec2fa sqrt ( const Vec2fa& a ) { return _mm_sqrt_ps(a.m128); }
  91. __forceinline Vec2fa sqr ( const Vec2fa& a ) { return _mm_mul_ps(a,a); }
  92. __forceinline Vec2fa rsqrt( const Vec2fa& a )
  93. {
  94. #if defined(__AVX512VL__)
  95. __m128 r = _mm_rsqrt14_ps(a.m128);
  96. #else
  97. __m128 r = _mm_rsqrt_ps(a.m128);
  98. #endif
  99. return _mm_add_ps(_mm_mul_ps(_mm_set1_ps(1.5f),r), _mm_mul_ps(_mm_mul_ps(_mm_mul_ps(a, _mm_set1_ps(-0.5f)), r), _mm_mul_ps(r, r)));
  100. }
  101. __forceinline Vec2fa zero_fix(const Vec2fa& a) {
  102. return blendv_ps(a, _mm_set1_ps(min_rcp_input), _mm_cmplt_ps (abs(a).m128, _mm_set1_ps(min_rcp_input)));
  103. }
  104. __forceinline Vec2fa rcp_safe(const Vec2fa& a) {
  105. return rcp(zero_fix(a));
  106. }
  107. __forceinline Vec2fa log ( const Vec2fa& a ) {
  108. return Vec2fa(logf(a.x),logf(a.y));
  109. }
  110. __forceinline Vec2fa exp ( const Vec2fa& a ) {
  111. return Vec2fa(expf(a.x),expf(a.y));
  112. }
  113. ////////////////////////////////////////////////////////////////////////////////
  114. /// Binary Operators
  115. ////////////////////////////////////////////////////////////////////////////////
  116. __forceinline Vec2fa operator +( const Vec2fa& a, const Vec2fa& b ) { return _mm_add_ps(a.m128, b.m128); }
  117. __forceinline Vec2fa operator -( const Vec2fa& a, const Vec2fa& b ) { return _mm_sub_ps(a.m128, b.m128); }
  118. __forceinline Vec2fa operator *( const Vec2fa& a, const Vec2fa& b ) { return _mm_mul_ps(a.m128, b.m128); }
  119. __forceinline Vec2fa operator *( const Vec2fa& a, const float b ) { return a * Vec2fa(b); }
  120. __forceinline Vec2fa operator *( const float a, const Vec2fa& b ) { return Vec2fa(a) * b; }
  121. __forceinline Vec2fa operator /( const Vec2fa& a, const Vec2fa& b ) { return _mm_div_ps(a.m128,b.m128); }
  122. __forceinline Vec2fa operator /( const Vec2fa& a, const float b ) { return _mm_div_ps(a.m128,_mm_set1_ps(b)); }
  123. __forceinline Vec2fa operator /( const float a, const Vec2fa& b ) { return _mm_div_ps(_mm_set1_ps(a),b.m128); }
  124. __forceinline Vec2fa min( const Vec2fa& a, const Vec2fa& b ) { return _mm_min_ps(a.m128,b.m128); }
  125. __forceinline Vec2fa max( const Vec2fa& a, const Vec2fa& b ) { return _mm_max_ps(a.m128,b.m128); }
  #if defined(__SSE4_1__)
  /*! Lane-wise minimum computed on the raw bit patterns, compared as signed
   *  32-bit integers (_mm_min_epi32). Only available with SSE4.1. */
  __forceinline Vec2fa mini(const Vec2fa& a, const Vec2fa& b) {
    const vint4 lhs_bits = _mm_castps_si128(a.m128);
    const vint4 rhs_bits = _mm_castps_si128(b.m128);
    const vint4 smaller  = _mm_min_epi32(lhs_bits, rhs_bits);
    return Vec2fa(_mm_castsi128_ps(smaller));
  }

  /*! Lane-wise maximum computed on the raw bit patterns, compared as signed
   *  32-bit integers (_mm_max_epi32). Only available with SSE4.1. */
  __forceinline Vec2fa maxi(const Vec2fa& a, const Vec2fa& b) {
    const vint4 lhs_bits = _mm_castps_si128(a.m128);
    const vint4 rhs_bits = _mm_castps_si128(b.m128);
    const vint4 larger   = _mm_max_epi32(lhs_bits, rhs_bits);
    return Vec2fa(_mm_castsi128_ps(larger));
  }
  #endif
  142. __forceinline Vec2fa pow ( const Vec2fa& a, const float& b ) {
  143. return Vec2fa(powf(a.x,b),powf(a.y,b));
  144. }
  145. ////////////////////////////////////////////////////////////////////////////////
  146. /// Ternary Operators
  147. ////////////////////////////////////////////////////////////////////////////////
  148. #if defined(__AVX2__)
  149. __forceinline Vec2fa madd ( const Vec2fa& a, const Vec2fa& b, const Vec2fa& c) { return _mm_fmadd_ps(a,b,c); }
  150. __forceinline Vec2fa msub ( const Vec2fa& a, const Vec2fa& b, const Vec2fa& c) { return _mm_fmsub_ps(a,b,c); }
  151. __forceinline Vec2fa nmadd ( const Vec2fa& a, const Vec2fa& b, const Vec2fa& c) { return _mm_fnmadd_ps(a,b,c); }
  152. __forceinline Vec2fa nmsub ( const Vec2fa& a, const Vec2fa& b, const Vec2fa& c) { return _mm_fnmsub_ps(a,b,c); }
  153. #else
  154. __forceinline Vec2fa madd ( const Vec2fa& a, const Vec2fa& b, const Vec2fa& c) { return a*b+c; }
  155. __forceinline Vec2fa msub ( const Vec2fa& a, const Vec2fa& b, const Vec2fa& c) { return a*b-c; }
  156. __forceinline Vec2fa nmadd ( const Vec2fa& a, const Vec2fa& b, const Vec2fa& c) { return -a*b+c;}
  157. __forceinline Vec2fa nmsub ( const Vec2fa& a, const Vec2fa& b, const Vec2fa& c) { return -a*b-c; }
  158. #endif
  159. __forceinline Vec2fa madd ( const float a, const Vec2fa& b, const Vec2fa& c) { return madd(Vec2fa(a),b,c); }
  160. __forceinline Vec2fa msub ( const float a, const Vec2fa& b, const Vec2fa& c) { return msub(Vec2fa(a),b,c); }
  161. __forceinline Vec2fa nmadd ( const float a, const Vec2fa& b, const Vec2fa& c) { return nmadd(Vec2fa(a),b,c); }
  162. __forceinline Vec2fa nmsub ( const float a, const Vec2fa& b, const Vec2fa& c) { return nmsub(Vec2fa(a),b,c); }
  163. ////////////////////////////////////////////////////////////////////////////////
  164. /// Assignment Operators
  165. ////////////////////////////////////////////////////////////////////////////////
  166. __forceinline Vec2fa& operator +=( Vec2fa& a, const Vec2fa& b ) { return a = a + b; }
  167. __forceinline Vec2fa& operator -=( Vec2fa& a, const Vec2fa& b ) { return a = a - b; }
  168. __forceinline Vec2fa& operator *=( Vec2fa& a, const Vec2fa& b ) { return a = a * b; }
  169. __forceinline Vec2fa& operator *=( Vec2fa& a, const float b ) { return a = a * b; }
  170. __forceinline Vec2fa& operator /=( Vec2fa& a, const Vec2fa& b ) { return a = a / b; }
  171. __forceinline Vec2fa& operator /=( Vec2fa& a, const float b ) { return a = a / b; }
  172. ////////////////////////////////////////////////////////////////////////////////
  173. /// Reductions
  174. ////////////////////////////////////////////////////////////////////////////////
  175. __forceinline float reduce_add(const Vec2fa& v) { return v.x+v.y; }
  176. __forceinline float reduce_mul(const Vec2fa& v) { return v.x*v.y; }
  177. __forceinline float reduce_min(const Vec2fa& v) { return min(v.x,v.y); }
  178. __forceinline float reduce_max(const Vec2fa& v) { return max(v.x,v.y); }
  179. ////////////////////////////////////////////////////////////////////////////////
  180. /// Comparison Operators
  181. ////////////////////////////////////////////////////////////////////////////////
  182. __forceinline bool operator ==( const Vec2fa& a, const Vec2fa& b ) { return (_mm_movemask_ps(_mm_cmpeq_ps (a.m128, b.m128)) & 3) == 3; }
  183. __forceinline bool operator !=( const Vec2fa& a, const Vec2fa& b ) { return (_mm_movemask_ps(_mm_cmpneq_ps(a.m128, b.m128)) & 3) != 0; }
  184. ////////////////////////////////////////////////////////////////////////////////
  185. /// Euclidean Space Operators
  186. ////////////////////////////////////////////////////////////////////////////////
  187. #if defined(__SSE4_1__)
  188. __forceinline float dot ( const Vec2fa& a, const Vec2fa& b ) {
  189. return _mm_cvtss_f32(_mm_dp_ps(a,b,0x3F));
  190. }
  191. #else
  192. __forceinline float dot ( const Vec2fa& a, const Vec2fa& b ) {
  193. return reduce_add(a*b);
  194. }
  195. #endif
  196. __forceinline Vec2fa cross ( const Vec2fa& a ) {
  197. return Vec2fa(-a.y,a.x);
  198. }
  199. __forceinline float sqr_length ( const Vec2fa& a ) { return dot(a,a); }
  200. __forceinline float rcp_length ( const Vec2fa& a ) { return rsqrt(dot(a,a)); }
  201. __forceinline float rcp_length2( const Vec2fa& a ) { return rcp(dot(a,a)); }
  202. __forceinline float length ( const Vec2fa& a ) { return sqrt(dot(a,a)); }
  203. __forceinline Vec2fa normalize( const Vec2fa& a ) { return a*rsqrt(dot(a,a)); }
  204. __forceinline float distance ( const Vec2fa& a, const Vec2fa& b ) { return length(a-b); }
  205. ////////////////////////////////////////////////////////////////////////////////
  206. /// Select
  207. ////////////////////////////////////////////////////////////////////////////////
  208. __forceinline Vec2fa select( bool s, const Vec2fa& t, const Vec2fa& f ) {
  209. __m128 mask = s ? _mm_castsi128_ps(_mm_cmpeq_epi32(_mm_setzero_si128(), _mm_setzero_si128())) : _mm_setzero_ps();
  210. return blendv_ps(f, t, mask);
  211. }
  212. __forceinline Vec2fa lerp(const Vec2fa& v0, const Vec2fa& v1, const float t) {
  213. return madd(1.0f-t,v0,t*v1);
  214. }
  215. __forceinline int maxDim ( const Vec2fa& a )
  216. {
  217. const Vec2fa b = abs(a);
  218. if (b.x > b.y) return 0;
  219. else return 1;
  220. }
  221. ////////////////////////////////////////////////////////////////////////////////
  222. /// Rounding Functions
  223. ////////////////////////////////////////////////////////////////////////////////
  224. #if defined(__aarch64__)
  225. //__forceinline Vec2fa trunc(const Vec2fa& a) { return vrndq_f32(a); }
  226. __forceinline Vec2fa floor(const Vec2fa& a) { return vrndmq_f32(a); }
  227. __forceinline Vec2fa ceil (const Vec2fa& a) { return vrndpq_f32(a); }
  228. #elif defined (__SSE4_1__)
  229. //__forceinline Vec2fa trunc( const Vec2fa& a ) { return _mm_round_ps(a, _MM_FROUND_TO_NEAREST_INT); }
  230. __forceinline Vec2fa floor( const Vec2fa& a ) { return _mm_round_ps(a, _MM_FROUND_TO_NEG_INF ); }
  231. __forceinline Vec2fa ceil ( const Vec2fa& a ) { return _mm_round_ps(a, _MM_FROUND_TO_POS_INF ); }
  232. #else
  233. //__forceinline Vec2fa trunc( const Vec2fa& a ) { return Vec2fa(truncf(a.x),truncf(a.y),truncf(a.z)); }
  234. __forceinline Vec2fa floor( const Vec2fa& a ) { return Vec2fa(floorf(a.x),floorf(a.y)); }
  235. __forceinline Vec2fa ceil ( const Vec2fa& a ) { return Vec2fa(ceilf (a.x),ceilf (a.y)); }
  236. #endif
  237. ////////////////////////////////////////////////////////////////////////////////
  238. /// Output Operators
  239. ////////////////////////////////////////////////////////////////////////////////
  240. __forceinline embree_ostream operator<<(embree_ostream cout, const Vec2fa& a) {
  241. return cout << "(" << a.x << ", " << a.y << ")";
  242. }
  243. typedef Vec2fa Vec2fa_t;
  244. }