vec3fa.h 17 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361
  1. // ======================================================================== //
  2. // Copyright 2009-2017 Intel Corporation //
  3. // //
  4. // Licensed under the Apache License, Version 2.0 (the "License"); //
  5. // you may not use this file except in compliance with the License. //
  6. // You may obtain a copy of the License at //
  7. // //
  8. // http://www.apache.org/licenses/LICENSE-2.0 //
  9. // //
  10. // Unless required by applicable law or agreed to in writing, software //
  11. // distributed under the License is distributed on an "AS IS" BASIS, //
  12. // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. //
  13. // See the License for the specific language governing permissions and //
  14. // limitations under the License. //
  15. // ======================================================================== //
  16. #pragma once
  17. #include "../sys/alloc.h"
  18. #include "math.h"
  19. #include "../simd/sse.h"
  20. namespace embree
  21. {
  22. ////////////////////////////////////////////////////////////////////////////////
  23. /// SSE Vec3fa Type
  24. ////////////////////////////////////////////////////////////////////////////////
  25. struct __aligned(16) Vec3fa
  26. {
  27. ALIGNED_STRUCT;
  28. typedef float Scalar;
  29. enum { N = 3 };
  30. union {
  31. __m128 m128;
  32. struct { float x,y,z; union { int a; unsigned u; float w; }; };
  33. };
  34. ////////////////////////////////////////////////////////////////////////////////
  35. /// Constructors, Assignment & Cast Operators
  36. ////////////////////////////////////////////////////////////////////////////////
  37. __forceinline Vec3fa( ) {}
  38. __forceinline Vec3fa( const __m128 a ) : m128(a) {}
  39. __forceinline Vec3fa ( const Vec3<float>& other ) { x = other.x; y = other.y; z = other.z; }
  40. __forceinline Vec3fa& operator =( const Vec3<float>& other ) { x = other.x; y = other.y; z = other.z; return *this; }
  41. __forceinline Vec3fa ( const Vec3fa& other ) { m128 = other.m128; }
  42. __forceinline Vec3fa& operator =( const Vec3fa& other ) { m128 = other.m128; return *this; }
  43. __forceinline explicit Vec3fa( const float a ) : m128(_mm_set1_ps(a)) {}
  44. __forceinline Vec3fa( const float x, const float y, const float z) : m128(_mm_set_ps(z, z, y, x)) {}
  45. __forceinline Vec3fa( const Vec3fa& other, const int a1) { m128 = other.m128; a = a1; }
  46. __forceinline Vec3fa( const Vec3fa& other, const unsigned a1) { m128 = other.m128; u = a1; }
  47. __forceinline Vec3fa( const Vec3fa& other, const float w1) { m128 = other.m128; w = w1; }
  48. //__forceinline Vec3fa( const float x, const float y, const float z, const int a) : x(x), y(y), z(z), a(a) {} // not working properly!
  49. //__forceinline Vec3fa( const float x, const float y, const float z, const unsigned a) : x(x), y(y), z(z), u(a) {} // not working properly!
  50. __forceinline Vec3fa( const float x, const float y, const float z, const float w) : m128(_mm_set_ps(w, z, y, x)) {}
  51. __forceinline explicit Vec3fa( const __m128i a ) : m128(_mm_cvtepi32_ps(a)) {}
  52. __forceinline operator const __m128&( void ) const { return m128; }
  53. __forceinline operator __m128&( void ) { return m128; }
  54. #if defined (__SSE4_1__)
  55. friend __forceinline const Vec3fa copy_a( const Vec3fa& a, const Vec3fa& b ) { return _mm_insert_ps(a, b, (3 << 4) | (3 << 6)); }
  56. #else
  57. friend __forceinline const Vec3fa copy_a( const Vec3fa& a, const Vec3fa& b ) { Vec3fa c = a; c.a = b.a; return c; }
  58. #endif
  59. ////////////////////////////////////////////////////////////////////////////////
  60. /// Loads and Stores
  61. ////////////////////////////////////////////////////////////////////////////////
  62. static __forceinline Vec3fa load( const void* const a ) {
  63. return Vec3fa(_mm_and_ps(_mm_load_ps((float*)a),_mm_castsi128_ps(_mm_set_epi32(0, -1, -1, -1))));
  64. }
  65. static __forceinline Vec3fa loadu( const void* const a ) {
  66. return Vec3fa(_mm_and_ps(_mm_loadu_ps((float*)a),_mm_castsi128_ps(_mm_set_epi32(0, -1, -1, -1))));
  67. }
  68. static __forceinline void storeu ( void* ptr, const Vec3fa& v ) {
  69. _mm_storeu_ps((float*)ptr,v);
  70. }
  71. ////////////////////////////////////////////////////////////////////////////////
  72. /// Constants
  73. ////////////////////////////////////////////////////////////////////////////////
  74. __forceinline Vec3fa( ZeroTy ) : m128(_mm_setzero_ps()) {}
  75. __forceinline Vec3fa( OneTy ) : m128(_mm_set1_ps(1.0f)) {}
  76. __forceinline Vec3fa( PosInfTy ) : m128(_mm_set1_ps(pos_inf)) {}
  77. __forceinline Vec3fa( NegInfTy ) : m128(_mm_set1_ps(neg_inf)) {}
  78. ////////////////////////////////////////////////////////////////////////////////
  79. /// Array Access
  80. ////////////////////////////////////////////////////////////////////////////////
  81. __forceinline const float& operator []( const size_t index ) const { assert(index < 3); return (&x)[index]; }
  82. __forceinline float& operator []( const size_t index ) { assert(index < 3); return (&x)[index]; }
  83. };
  84. ////////////////////////////////////////////////////////////////////////////////
  85. /// Unary Operators
  86. ////////////////////////////////////////////////////////////////////////////////
  87. __forceinline const Vec3fa operator +( const Vec3fa& a ) { return a; }
  88. __forceinline const Vec3fa operator -( const Vec3fa& a ) {
  89. const __m128 mask = _mm_castsi128_ps(_mm_set1_epi32(0x80000000));
  90. return _mm_xor_ps(a.m128, mask);
  91. }
  92. __forceinline const Vec3fa abs ( const Vec3fa& a ) {
  93. const __m128 mask = _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff));
  94. return _mm_and_ps(a.m128, mask);
  95. }
  96. __forceinline const Vec3fa sign ( const Vec3fa& a ) {
  97. return blendv_ps(Vec3fa(one), -Vec3fa(one), _mm_cmplt_ps (a,Vec3fa(zero)));
  98. }
  99. __forceinline const Vec3fa rcp ( const Vec3fa& a )
  100. {
  101. #if defined(__AVX512VL__)
  102. const Vec3fa r = _mm_rcp14_ps(a.m128);
  103. #else
  104. const Vec3fa r = _mm_rcp_ps(a.m128);
  105. #endif
  106. #if defined(__AVX2__)
  107. const Vec3fa res = _mm_mul_ps(r,_mm_fnmadd_ps(r, a, vfloat4(2.0f)));
  108. #else
  109. const Vec3fa res = _mm_mul_ps(r,_mm_sub_ps(vfloat4(2.0f), _mm_mul_ps(r, a)));
  110. //return _mm_sub_ps(_mm_add_ps(r, r), _mm_mul_ps(_mm_mul_ps(r, r), a));
  111. #endif
  112. return res;
  113. }
  114. __forceinline const Vec3fa sqrt ( const Vec3fa& a ) { return _mm_sqrt_ps(a.m128); }
  115. __forceinline const Vec3fa sqr ( const Vec3fa& a ) { return _mm_mul_ps(a,a); }
  116. __forceinline const Vec3fa rsqrt( const Vec3fa& a )
  117. {
  118. #if defined(__AVX512VL__)
  119. __m128 r = _mm_rsqrt14_ps(a.m128);
  120. #else
  121. __m128 r = _mm_rsqrt_ps(a.m128);
  122. #endif
  123. return _mm_add_ps(_mm_mul_ps(_mm_set1_ps(1.5f),r), _mm_mul_ps(_mm_mul_ps(_mm_mul_ps(a, _mm_set1_ps(-0.5f)), r), _mm_mul_ps(r, r)));
  124. }
  125. __forceinline const Vec3fa zero_fix(const Vec3fa& a) {
  126. return blendv_ps(a, _mm_set1_ps(min_rcp_input), _mm_cmplt_ps (abs(a).m128, _mm_set1_ps(min_rcp_input)));
  127. }
  128. __forceinline const Vec3fa rcp_safe(const Vec3fa& a) {
  129. return rcp(zero_fix(a));
  130. }
  131. __forceinline Vec3fa log ( const Vec3fa& a ) {
  132. return Vec3fa(logf(a.x),logf(a.y),logf(a.z));
  133. }
  134. __forceinline Vec3fa exp ( const Vec3fa& a ) {
  135. return Vec3fa(expf(a.x),expf(a.y),expf(a.z));
  136. }
  137. ////////////////////////////////////////////////////////////////////////////////
  138. /// Binary Operators
  139. ////////////////////////////////////////////////////////////////////////////////
  140. __forceinline const Vec3fa operator +( const Vec3fa& a, const Vec3fa& b ) { return _mm_add_ps(a.m128, b.m128); }
  141. __forceinline const Vec3fa operator -( const Vec3fa& a, const Vec3fa& b ) { return _mm_sub_ps(a.m128, b.m128); }
  142. __forceinline const Vec3fa operator *( const Vec3fa& a, const Vec3fa& b ) { return _mm_mul_ps(a.m128, b.m128); }
  143. __forceinline const Vec3fa operator *( const Vec3fa& a, const float b ) { return a * Vec3fa(b); }
  144. __forceinline const Vec3fa operator *( const float a, const Vec3fa& b ) { return Vec3fa(a) * b; }
  145. __forceinline const Vec3fa operator /( const Vec3fa& a, const Vec3fa& b ) { return _mm_div_ps(a.m128,b.m128); }
  146. __forceinline const Vec3fa operator /( const Vec3fa& a, const float b ) { return _mm_div_ps(a.m128,_mm_set1_ps(b)); }
  147. __forceinline const Vec3fa operator /( const float a, const Vec3fa& b ) { return _mm_div_ps(_mm_set1_ps(a),b.m128); }
  148. __forceinline const Vec3fa min( const Vec3fa& a, const Vec3fa& b ) { return _mm_min_ps(a.m128,b.m128); }
  149. __forceinline const Vec3fa max( const Vec3fa& a, const Vec3fa& b ) { return _mm_max_ps(a.m128,b.m128); }
  #if defined(__SSE4_1__)
  /*! lane-wise minimum computed on the float bit patterns reinterpreted as signed 32-bit integers */
  __forceinline Vec3fa mini(const Vec3fa& a, const Vec3fa& b)
  {
    const vint4 lhs_bits = _mm_castps_si128(a.m128);
    const vint4 rhs_bits = _mm_castps_si128(b.m128);
    const vint4 smaller  = _mm_min_epi32(lhs_bits, rhs_bits);
    return _mm_castsi128_ps(smaller);
  }
  #endif

  #if defined(__SSE4_1__)
  /*! lane-wise maximum computed on the float bit patterns reinterpreted as signed 32-bit integers */
  __forceinline Vec3fa maxi(const Vec3fa& a, const Vec3fa& b)
  {
    const vint4 lhs_bits = _mm_castps_si128(a.m128);
    const vint4 rhs_bits = _mm_castps_si128(b.m128);
    const vint4 larger   = _mm_max_epi32(lhs_bits, rhs_bits);
    return _mm_castsi128_ps(larger);
  }
  #endif
  166. __forceinline Vec3fa pow ( const Vec3fa& a, const float& b ) {
  167. return Vec3fa(powf(a.x,b),powf(a.y,b),powf(a.z,b));
  168. }
  169. ////////////////////////////////////////////////////////////////////////////////
  170. /// Ternary Operators
  171. ////////////////////////////////////////////////////////////////////////////////
  172. #if defined(__AVX2__)
  173. __forceinline Vec3fa madd ( const Vec3fa& a, const Vec3fa& b, const Vec3fa& c) { return _mm_fmadd_ps(a,b,c); }
  174. __forceinline Vec3fa msub ( const Vec3fa& a, const Vec3fa& b, const Vec3fa& c) { return _mm_fmsub_ps(a,b,c); }
  175. __forceinline Vec3fa nmadd ( const Vec3fa& a, const Vec3fa& b, const Vec3fa& c) { return _mm_fnmadd_ps(a,b,c); }
  176. __forceinline Vec3fa nmsub ( const Vec3fa& a, const Vec3fa& b, const Vec3fa& c) { return _mm_fnmsub_ps(a,b,c); }
  177. #else
  178. __forceinline Vec3fa madd ( const Vec3fa& a, const Vec3fa& b, const Vec3fa& c) { return a*b+c; }
  179. __forceinline Vec3fa msub ( const Vec3fa& a, const Vec3fa& b, const Vec3fa& c) { return a*b-c; }
  180. __forceinline Vec3fa nmadd ( const Vec3fa& a, const Vec3fa& b, const Vec3fa& c) { return -a*b+c;}
  181. __forceinline Vec3fa nmsub ( const Vec3fa& a, const Vec3fa& b, const Vec3fa& c) { return -a*b-c; }
  182. #endif
  183. __forceinline Vec3fa madd ( const float a, const Vec3fa& b, const Vec3fa& c) { return madd(Vec3fa(a),b,c); }
  184. __forceinline Vec3fa msub ( const float a, const Vec3fa& b, const Vec3fa& c) { return msub(Vec3fa(a),b,c); }
  185. __forceinline Vec3fa nmadd ( const float a, const Vec3fa& b, const Vec3fa& c) { return nmadd(Vec3fa(a),b,c); }
  186. __forceinline Vec3fa nmsub ( const float a, const Vec3fa& b, const Vec3fa& c) { return nmsub(Vec3fa(a),b,c); }
  187. ////////////////////////////////////////////////////////////////////////////////
  188. /// Assignment Operators
  189. ////////////////////////////////////////////////////////////////////////////////
  190. __forceinline Vec3fa& operator +=( Vec3fa& a, const Vec3fa& b ) { return a = a + b; }
  191. __forceinline Vec3fa& operator -=( Vec3fa& a, const Vec3fa& b ) { return a = a - b; }
  192. __forceinline Vec3fa& operator *=( Vec3fa& a, const Vec3fa& b ) { return a = a * b; }
  193. __forceinline Vec3fa& operator *=( Vec3fa& a, const float b ) { return a = a * b; }
  194. __forceinline Vec3fa& operator /=( Vec3fa& a, const Vec3fa& b ) { return a = a / b; }
  195. __forceinline Vec3fa& operator /=( Vec3fa& a, const float b ) { return a = a / b; }
  196. ////////////////////////////////////////////////////////////////////////////////
  197. /// Reductions
  198. ////////////////////////////////////////////////////////////////////////////////
  199. __forceinline float reduce_add(const Vec3fa& v) { return v.x+v.y+v.z; }
  200. __forceinline float reduce_mul(const Vec3fa& v) { return v.x*v.y*v.z; }
  201. __forceinline float reduce_min(const Vec3fa& v) { return min(v.x,v.y,v.z); }
  202. __forceinline float reduce_max(const Vec3fa& v) { return max(v.x,v.y,v.z); }
  203. ////////////////////////////////////////////////////////////////////////////////
  204. /// Comparison Operators
  205. ////////////////////////////////////////////////////////////////////////////////
  206. __forceinline bool operator ==( const Vec3fa& a, const Vec3fa& b ) { return (_mm_movemask_ps(_mm_cmpeq_ps (a.m128, b.m128)) & 7) == 7; }
  207. __forceinline bool operator !=( const Vec3fa& a, const Vec3fa& b ) { return (_mm_movemask_ps(_mm_cmpneq_ps(a.m128, b.m128)) & 7) != 0; }
  208. __forceinline Vec3ba eq_mask( const Vec3fa& a, const Vec3fa& b ) { return _mm_cmpeq_ps (a.m128, b.m128); }
  209. __forceinline Vec3ba neq_mask(const Vec3fa& a, const Vec3fa& b ) { return _mm_cmpneq_ps(a.m128, b.m128); }
  210. __forceinline Vec3ba lt_mask( const Vec3fa& a, const Vec3fa& b ) { return _mm_cmplt_ps (a.m128, b.m128); }
  211. __forceinline Vec3ba le_mask( const Vec3fa& a, const Vec3fa& b ) { return _mm_cmple_ps (a.m128, b.m128); }
  212. __forceinline Vec3ba gt_mask( const Vec3fa& a, const Vec3fa& b ) { return _mm_cmpnle_ps(a.m128, b.m128); }
  213. __forceinline Vec3ba ge_mask( const Vec3fa& a, const Vec3fa& b ) { return _mm_cmpnlt_ps(a.m128, b.m128); }
  214. __forceinline bool isvalid ( const Vec3fa& v ) {
  215. return all(gt_mask(v,Vec3fa(-FLT_LARGE)) & lt_mask(v,Vec3fa(+FLT_LARGE)));
  216. }
  217. __forceinline bool is_finite ( const Vec3fa& a ) {
  218. return all(ge_mask(a,Vec3fa(-FLT_MAX)) & le_mask(a,Vec3fa(+FLT_MAX)));
  219. }
  220. ////////////////////////////////////////////////////////////////////////////////
  /// Euclidean Space Operators
  222. ////////////////////////////////////////////////////////////////////////////////
  223. #if defined(__SSE4_1__)
  224. __forceinline float dot ( const Vec3fa& a, const Vec3fa& b ) {
  225. return _mm_cvtss_f32(_mm_dp_ps(a,b,0x7F));
  226. }
  227. #else
  228. __forceinline float dot ( const Vec3fa& a, const Vec3fa& b ) {
  229. return reduce_add(a*b);
  230. }
  231. #endif
  232. __forceinline Vec3fa cross ( const Vec3fa& a, const Vec3fa& b )
  233. {
  234. vfloat4 a0 = vfloat4(a);
  235. vfloat4 b0 = shuffle<1,2,0,3>(vfloat4(b));
  236. vfloat4 a1 = shuffle<1,2,0,3>(vfloat4(a));
  237. vfloat4 b1 = vfloat4(b);
  238. return Vec3fa(shuffle<1,2,0,3>(msub(a0,b0,a1*b1)));
  239. }
  240. __forceinline float sqr_length ( const Vec3fa& a ) { return dot(a,a); }
  241. __forceinline float rcp_length ( const Vec3fa& a ) { return rsqrt(dot(a,a)); }
  242. __forceinline float rcp_length2( const Vec3fa& a ) { return rcp(dot(a,a)); }
  243. __forceinline float length ( const Vec3fa& a ) { return sqrt(dot(a,a)); }
  244. __forceinline Vec3fa normalize( const Vec3fa& a ) { return a*rsqrt(dot(a,a)); }
  245. __forceinline float distance ( const Vec3fa& a, const Vec3fa& b ) { return length(a-b); }
  246. __forceinline float halfArea ( const Vec3fa& d ) { return madd(d.x,(d.y+d.z),d.y*d.z); }
  247. __forceinline float area ( const Vec3fa& d ) { return 2.0f*halfArea(d); }
  248. __forceinline Vec3fa normalize_safe( const Vec3fa& a ) {
  249. const float d = dot(a,a); if (unlikely(d == 0.0f)) return a; else return a*rsqrt(d);
  250. }
  251. /*! differentiated normalization */
  252. __forceinline Vec3fa dnormalize(const Vec3fa& p, const Vec3fa& dp)
  253. {
  254. const float pp = dot(p,p);
  255. const float pdp = dot(p,dp);
  256. return (pp*dp-pdp*p)*rcp(pp)*rsqrt(pp);
  257. }
  258. ////////////////////////////////////////////////////////////////////////////////
  259. /// Select
  260. ////////////////////////////////////////////////////////////////////////////////
  261. __forceinline const Vec3fa select( bool s, const Vec3fa& t, const Vec3fa& f ) {
  262. __m128 mask = s ? _mm_castsi128_ps(_mm_cmpeq_epi32(_mm_setzero_si128(), _mm_setzero_si128())) : _mm_setzero_ps();
  263. return blendv_ps(f, t, mask);
  264. }
  265. __forceinline const Vec3fa select( const Vec3ba& s, const Vec3fa& t, const Vec3fa& f ) {
  266. return blendv_ps(f, t, s);
  267. }
  268. __forceinline int maxDim ( const Vec3fa& a )
  269. {
  270. const Vec3fa b = abs(a);
  271. if (b.x > b.y) {
  272. if (b.x > b.z) return 0; else return 2;
  273. } else {
  274. if (b.y > b.z) return 1; else return 2;
  275. }
  276. }
  277. ////////////////////////////////////////////////////////////////////////////////
  278. /// Rounding Functions
  279. ////////////////////////////////////////////////////////////////////////////////
  280. #if defined (__SSE4_1__)
  281. //__forceinline const Vec3fa trunc( const Vec3fa& a ) { return _mm_round_ps(a, _MM_FROUND_TO_NEAREST_INT); }
  282. __forceinline const Vec3fa floor( const Vec3fa& a ) { return _mm_round_ps(a, _MM_FROUND_TO_NEG_INF ); }
  283. __forceinline const Vec3fa ceil ( const Vec3fa& a ) { return _mm_round_ps(a, _MM_FROUND_TO_POS_INF ); }
  284. #else
  285. //__forceinline const Vec3fa trunc( const Vec3fa& a ) { return Vec3fa(truncf(a.x),truncf(a.y),truncf(a.z)); }
  286. __forceinline const Vec3fa floor( const Vec3fa& a ) { return Vec3fa(floorf(a.x),floorf(a.y),floorf(a.z)); }
  287. __forceinline const Vec3fa ceil ( const Vec3fa& a ) { return Vec3fa(ceilf (a.x),ceilf (a.y),ceilf (a.z)); }
  288. #endif
  289. ////////////////////////////////////////////////////////////////////////////////
  290. /// Output Operators
  291. ////////////////////////////////////////////////////////////////////////////////
  292. inline std::ostream& operator<<(std::ostream& cout, const Vec3fa& a) {
  293. return cout << "(" << a.x << ", " << a.y << ", " << a.z << ")";
  294. }
  295. typedef Vec3fa Vec3fa_t;
  296. }