vec3fa.h 35 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791
  1. // Copyright 2009-2021 Intel Corporation
  2. // SPDX-License-Identifier: Apache-2.0
  3. #pragma once
  4. #include "../sys/alloc.h"
  5. #include "emath.h"
  6. #if defined(EMBREE_SYCL_SUPPORT) && defined(__SYCL_DEVICE_ONLY__)
  7. # include "vec3fa_sycl.h"
  8. #else
  9. #include "../simd/sse.h"
  10. namespace embree
  11. {
  12. ////////////////////////////////////////////////////////////////////////////////
  13. /// SSE Vec3fa Type
  14. ////////////////////////////////////////////////////////////////////////////////
  15. struct __aligned(16) Vec3fa
  16. {
  17. ALIGNED_STRUCT_(16);
  18. typedef float Scalar;
  19. enum { N = 3 };
  20. union {
  21. __m128 m128;
  22. struct { float x,y,z; };
  23. };
  24. ////////////////////////////////////////////////////////////////////////////////
  25. /// Constructors, Assignment & Cast Operators
  26. ////////////////////////////////////////////////////////////////////////////////
  27. __forceinline Vec3fa( ) {}
  28. __forceinline Vec3fa( const __m128 a ) : m128(a) {}
  29. __forceinline Vec3fa ( const Vec3<float>& other ) { m128 = _mm_set_ps(0, other.z, other.y, other.x); }
  30. //__forceinline Vec3fa& operator =( const Vec3<float>& other ) { m128 = _mm_set_ps(0, other.z, other.y, other.x); return *this; }
  31. __forceinline Vec3fa ( const Vec3fa& other ) { m128 = other.m128; }
  32. __forceinline Vec3fa& operator =( const Vec3fa& other ) { m128 = other.m128; return *this; }
  33. __forceinline explicit Vec3fa( const float a ) : m128(_mm_set1_ps(a)) {}
  34. __forceinline Vec3fa( const float x, const float y, const float z) : m128(_mm_set_ps(0, z, y, x)) {}
  35. __forceinline explicit Vec3fa( const __m128i a ) : m128(_mm_cvtepi32_ps(a)) {}
  36. __forceinline explicit operator const vfloat4() const { return vfloat4(m128); }
  37. __forceinline explicit operator const vint4() const { return vint4(_mm_cvtps_epi32(m128)); }
  38. __forceinline explicit operator const Vec2fa() const { return Vec2fa(m128); }
  39. __forceinline explicit operator const Vec3ia() const { return Vec3ia(_mm_cvtps_epi32(m128)); }
  40. //__forceinline operator const __m128&() const { return m128; }
  41. //__forceinline operator __m128&() { return m128; }
  42. ////////////////////////////////////////////////////////////////////////////////
  43. /// Loads and Stores
  44. ////////////////////////////////////////////////////////////////////////////////
  45. static __forceinline Vec3fa load( const void* const a ) {
  46. #if defined(__aarch64__)
  47. __m128 t = _mm_load_ps((float*)a);
  48. t[3] = 0.0f;
  49. return Vec3fa(t);
  50. #else
  51. return Vec3fa(_mm_and_ps(_mm_load_ps((float*)a),_mm_castsi128_ps(_mm_set_epi32(0, -1, -1, -1))));
  52. #endif
  53. }
  54. static __forceinline Vec3fa loadu( const void* const a ) {
  55. return Vec3fa(_mm_loadu_ps((float*)a));
  56. }
  57. static __forceinline void storeu ( void* ptr, const Vec3fa& v ) {
  58. _mm_storeu_ps((float*)ptr,v.m128);
  59. }
  60. ////////////////////////////////////////////////////////////////////////////////
  61. /// Constants
  62. ////////////////////////////////////////////////////////////////////////////////
  63. __forceinline Vec3fa( ZeroTy ) : m128(_mm_setzero_ps()) {}
  64. __forceinline Vec3fa( OneTy ) : m128(_mm_set1_ps(1.0f)) {}
  65. __forceinline Vec3fa( PosInfTy ) : m128(_mm_set1_ps(pos_inf)) {}
  66. __forceinline Vec3fa( NegInfTy ) : m128(_mm_set1_ps(neg_inf)) {}
  67. ////////////////////////////////////////////////////////////////////////////////
  68. /// Array Access
  69. ////////////////////////////////////////////////////////////////////////////////
  70. __forceinline const float& operator []( const size_t index ) const { assert(index < 3); return (&x)[index]; }
  71. __forceinline float& operator []( const size_t index ) { assert(index < 3); return (&x)[index]; }
  72. };
  73. ////////////////////////////////////////////////////////////////////////////////
  74. /// Unary Operators
  75. ////////////////////////////////////////////////////////////////////////////////
  76. __forceinline Vec3fa operator +( const Vec3fa& a ) { return a; }
  77. __forceinline Vec3fa operator -( const Vec3fa& a ) {
  78. #if defined(__aarch64__)
  79. return vnegq_f32(a.m128);
  80. #else
  81. const __m128 mask = _mm_castsi128_ps(_mm_set1_epi32(0x80000000));
  82. return _mm_xor_ps(a.m128, mask);
  83. #endif
  84. }
  85. __forceinline Vec3fa abs ( const Vec3fa& a ) {
  86. #if defined(__aarch64__)
  87. return _mm_abs_ps(a.m128);
  88. #else
  89. const __m128 mask = _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff));
  90. return _mm_and_ps(a.m128, mask);
  91. #endif
  92. }
  93. __forceinline Vec3fa sign ( const Vec3fa& a ) {
  94. return blendv_ps(Vec3fa(one).m128, (-Vec3fa(one)).m128, _mm_cmplt_ps (a.m128,Vec3fa(zero).m128));
  95. }
  96. __forceinline Vec3fa rcp ( const Vec3fa& a )
  97. {
  98. #if defined(__aarch64__)
  99. return vdivq_f32(vdupq_n_f32(1.0f),a.m128);
  100. #else
  101. #if defined(__AVX512VL__)
  102. const Vec3fa r = _mm_rcp14_ps(a.m128);
  103. #else
  104. const Vec3fa r = _mm_rcp_ps(a.m128);
  105. #endif
  106. #if defined(__AVX2__)
  107. const Vec3fa h_n = _mm_fnmadd_ps(a.m128, r.m128, vfloat4(1.0)); // First, compute 1 - a * r (which will be very close to 0)
  108. const Vec3fa res = _mm_fmadd_ps(r.m128, h_n.m128, r.m128); // Then compute r + r * h_n
  109. #else
  110. const Vec3fa h_n = _mm_sub_ps(vfloat4(1.0f), _mm_mul_ps(a.m128, r.m128)); // First, compute 1 - a * r (which will be very close to 0)
  111. const Vec3fa res = _mm_add_ps(r.m128,_mm_mul_ps(r.m128, h_n.m128)); // Then compute r + r * h_n
  112. #endif
  113. return res;
  114. #endif //defined(__aarch64__)
  115. }
  116. __forceinline Vec3fa sqrt ( const Vec3fa& a ) { return _mm_sqrt_ps(a.m128); }
  117. __forceinline Vec3fa sqr ( const Vec3fa& a ) { return _mm_mul_ps(a.m128,a.m128); }
  118. __forceinline Vec3fa rsqrt( const Vec3fa& a )
  119. {
  120. #if defined(__aarch64__)
  121. __m128 r = _mm_rsqrt_ps(a.m128);
  122. r = vmulq_f32(r, vrsqrtsq_f32(vmulq_f32(a.m128, r), r));
  123. r = vmulq_f32(r, vrsqrtsq_f32(vmulq_f32(a.m128, r), r));
  124. return r;
  125. #else
  126. #if defined(__AVX512VL__)
  127. __m128 r = _mm_rsqrt14_ps(a.m128);
  128. #else
  129. __m128 r = _mm_rsqrt_ps(a.m128);
  130. #endif
  131. return _mm_add_ps(_mm_mul_ps(_mm_set1_ps(1.5f),r), _mm_mul_ps(_mm_mul_ps(_mm_mul_ps(a.m128, _mm_set1_ps(-0.5f)), r), _mm_mul_ps(r, r)));
  132. #endif
  133. }
  134. __forceinline Vec3fa zero_fix(const Vec3fa& a) {
  135. return blendv_ps(a.m128, _mm_set1_ps(min_rcp_input), _mm_cmplt_ps (abs(a).m128, _mm_set1_ps(min_rcp_input)));
  136. }
  137. __forceinline Vec3fa rcp_safe(const Vec3fa& a) {
  138. return rcp(zero_fix(a));
  139. }
  140. __forceinline Vec3fa log ( const Vec3fa& a ) {
  141. return Vec3fa(logf(a.x),logf(a.y),logf(a.z));
  142. }
  143. __forceinline Vec3fa exp ( const Vec3fa& a ) {
  144. return Vec3fa(expf(a.x),expf(a.y),expf(a.z));
  145. }
  146. ////////////////////////////////////////////////////////////////////////////////
  147. /// Binary Operators
  148. ////////////////////////////////////////////////////////////////////////////////
  149. __forceinline Vec3fa operator +( const Vec3fa& a, const Vec3fa& b ) { return _mm_add_ps(a.m128, b.m128); }
  150. __forceinline Vec3fa operator -( const Vec3fa& a, const Vec3fa& b ) { return _mm_sub_ps(a.m128, b.m128); }
  151. __forceinline Vec3fa operator *( const Vec3fa& a, const Vec3fa& b ) { return _mm_mul_ps(a.m128, b.m128); }
  152. __forceinline Vec3fa operator *( const Vec3fa& a, const float b ) { return a * Vec3fa(b); }
  153. __forceinline Vec3fa operator *( const float a, const Vec3fa& b ) { return Vec3fa(a) * b; }
  154. __forceinline Vec3fa operator /( const Vec3fa& a, const Vec3fa& b ) { return _mm_div_ps(a.m128,b.m128); }
  155. __forceinline Vec3fa operator /( const Vec3fa& a, const float b ) { return _mm_div_ps(a.m128,_mm_set1_ps(b)); }
  156. __forceinline Vec3fa operator /( const float a, const Vec3fa& b ) { return _mm_div_ps(_mm_set1_ps(a),b.m128); }
  157. __forceinline Vec3fa min( const Vec3fa& a, const Vec3fa& b ) { return _mm_min_ps(a.m128,b.m128); }
  158. __forceinline Vec3fa max( const Vec3fa& a, const Vec3fa& b ) { return _mm_max_ps(a.m128,b.m128); }
  /*! mini/maxi: minimum/maximum taken on the float bit patterns reinterpreted
   *  as signed 32-bit integers; only available on aarch64 or with SSE4.1 */
#if defined(__aarch64__) || defined(__SSE4_1__)
  __forceinline Vec3fa mini(const Vec3fa& a, const Vec3fa& b)
  {
    const vint4 a_bits = _mm_castps_si128(a.m128);
    const vint4 b_bits = _mm_castps_si128(b.m128);
    const vint4 lowest = _mm_min_epi32(a_bits,b_bits);
    return _mm_castsi128_ps(lowest);
  }
#endif

#if defined(__aarch64__) || defined(__SSE4_1__)
  __forceinline Vec3fa maxi(const Vec3fa& a, const Vec3fa& b)
  {
    const vint4 a_bits = _mm_castps_si128(a.m128);
    const vint4 b_bits = _mm_castps_si128(b.m128);
    const vint4 highest = _mm_max_epi32(a_bits,b_bits);
    return _mm_castsi128_ps(highest);
  }
#endif
  175. __forceinline Vec3fa pow ( const Vec3fa& a, const float& b ) {
  176. return Vec3fa(powf(a.x,b),powf(a.y,b),powf(a.z,b));
  177. }
  178. ////////////////////////////////////////////////////////////////////////////////
  179. /// Ternary Operators
  180. ////////////////////////////////////////////////////////////////////////////////
  181. #if defined(__AVX2__) || defined(__ARM_NEON)
  182. __forceinline Vec3fa madd ( const Vec3fa& a, const Vec3fa& b, const Vec3fa& c) { return _mm_fmadd_ps(a.m128,b.m128,c.m128); }
  183. __forceinline Vec3fa msub ( const Vec3fa& a, const Vec3fa& b, const Vec3fa& c) { return _mm_fmsub_ps(a.m128,b.m128,c.m128); }
  184. __forceinline Vec3fa nmadd ( const Vec3fa& a, const Vec3fa& b, const Vec3fa& c) { return _mm_fnmadd_ps(a.m128,b.m128,c.m128); }
  185. __forceinline Vec3fa nmsub ( const Vec3fa& a, const Vec3fa& b, const Vec3fa& c) { return _mm_fnmsub_ps(a.m128,b.m128,c.m128); }
  186. #else
  187. __forceinline Vec3fa madd ( const Vec3fa& a, const Vec3fa& b, const Vec3fa& c) { return a*b+c; }
  188. __forceinline Vec3fa nmadd ( const Vec3fa& a, const Vec3fa& b, const Vec3fa& c) { return -a*b+c;}
  189. __forceinline Vec3fa nmsub ( const Vec3fa& a, const Vec3fa& b, const Vec3fa& c) { return -a*b-c; }
  190. __forceinline Vec3fa msub ( const Vec3fa& a, const Vec3fa& b, const Vec3fa& c) { return a*b-c; }
  191. #endif
  192. __forceinline Vec3fa madd ( const float a, const Vec3fa& b, const Vec3fa& c) { return madd(Vec3fa(a),b,c); }
  193. __forceinline Vec3fa msub ( const float a, const Vec3fa& b, const Vec3fa& c) { return msub(Vec3fa(a),b,c); }
  194. __forceinline Vec3fa nmadd ( const float a, const Vec3fa& b, const Vec3fa& c) { return nmadd(Vec3fa(a),b,c); }
  195. __forceinline Vec3fa nmsub ( const float a, const Vec3fa& b, const Vec3fa& c) { return nmsub(Vec3fa(a),b,c); }
  196. ////////////////////////////////////////////////////////////////////////////////
  197. /// Assignment Operators
  198. ////////////////////////////////////////////////////////////////////////////////
  199. __forceinline Vec3fa& operator +=( Vec3fa& a, const Vec3fa& b ) { return a = a + b; }
  200. __forceinline Vec3fa& operator -=( Vec3fa& a, const Vec3fa& b ) { return a = a - b; }
  201. __forceinline Vec3fa& operator *=( Vec3fa& a, const Vec3fa& b ) { return a = a * b; }
  202. __forceinline Vec3fa& operator *=( Vec3fa& a, const float b ) { return a = a * b; }
  203. __forceinline Vec3fa& operator /=( Vec3fa& a, const Vec3fa& b ) { return a = a / b; }
  204. __forceinline Vec3fa& operator /=( Vec3fa& a, const float b ) { return a = a / b; }
  205. ////////////////////////////////////////////////////////////////////////////////
  206. /// Reductions
  207. ////////////////////////////////////////////////////////////////////////////////
  208. #if defined(__aarch64__)
  209. __forceinline float reduce_add(const Vec3fa& v) {
  210. float32x4_t t = v.m128;
  211. t[3] = 0.0f;
  212. return vaddvq_f32(t);
  213. }
  214. __forceinline float reduce_mul(const Vec3fa& v) { return v.x*v.y*v.z; }
  215. __forceinline float reduce_min(const Vec3fa& v) {
  216. float32x4_t t = v.m128;
  217. t[3] = t[2];
  218. return vminvq_f32(t);
  219. }
  220. __forceinline float reduce_max(const Vec3fa& v) {
  221. float32x4_t t = v.m128;
  222. t[3] = t[2];
  223. return vmaxvq_f32(t);
  224. }
  225. #else
  226. __forceinline float reduce_add(const Vec3fa& v) {
  227. const vfloat4 a(v.m128);
  228. const vfloat4 b = shuffle<1>(a);
  229. const vfloat4 c = shuffle<2>(a);
  230. return _mm_cvtss_f32(a+b+c);
  231. }
  232. __forceinline float reduce_mul(const Vec3fa& v) { return v.x*v.y*v.z; }
  233. __forceinline float reduce_min(const Vec3fa& v) { return min(v.x,v.y,v.z); }
  234. __forceinline float reduce_max(const Vec3fa& v) { return max(v.x,v.y,v.z); }
  235. #endif
  236. ////////////////////////////////////////////////////////////////////////////////
  237. /// Comparison Operators
  238. ////////////////////////////////////////////////////////////////////////////////
  239. __forceinline bool operator ==( const Vec3fa& a, const Vec3fa& b ) { return (_mm_movemask_ps(_mm_cmpeq_ps (a.m128, b.m128)) & 7) == 7; }
  240. __forceinline bool operator !=( const Vec3fa& a, const Vec3fa& b ) { return (_mm_movemask_ps(_mm_cmpneq_ps(a.m128, b.m128)) & 7) != 0; }
  241. __forceinline Vec3ba eq_mask( const Vec3fa& a, const Vec3fa& b ) { return _mm_cmpeq_ps (a.m128, b.m128); }
  242. __forceinline Vec3ba neq_mask(const Vec3fa& a, const Vec3fa& b ) { return _mm_cmpneq_ps(a.m128, b.m128); }
  243. __forceinline Vec3ba lt_mask( const Vec3fa& a, const Vec3fa& b ) { return _mm_cmplt_ps (a.m128, b.m128); }
  244. __forceinline Vec3ba le_mask( const Vec3fa& a, const Vec3fa& b ) { return _mm_cmple_ps (a.m128, b.m128); }
  245. #if defined(__aarch64__)
  246. __forceinline Vec3ba gt_mask( const Vec3fa& a, const Vec3fa& b ) { return _mm_cmpgt_ps (a.m128, b.m128); }
  247. __forceinline Vec3ba ge_mask( const Vec3fa& a, const Vec3fa& b ) { return _mm_cmpge_ps (a.m128, b.m128); }
  248. #else
  249. __forceinline Vec3ba gt_mask(const Vec3fa& a, const Vec3fa& b) { return _mm_cmpnle_ps(a.m128, b.m128); }
  250. __forceinline Vec3ba ge_mask(const Vec3fa& a, const Vec3fa& b) { return _mm_cmpnlt_ps(a.m128, b.m128); }
  251. #endif
  252. __forceinline bool isvalid ( const Vec3fa& v ) {
  253. return all(gt_mask(v,Vec3fa(-FLT_LARGE)) & lt_mask(v,Vec3fa(+FLT_LARGE)));
  254. }
  255. __forceinline bool is_finite ( const Vec3fa& a ) {
  256. return all(ge_mask(a,Vec3fa(-FLT_MAX)) & le_mask(a,Vec3fa(+FLT_MAX)));
  257. }
  258. __forceinline bool isvalid4 ( const Vec3fa& v ) {
  259. return all((vfloat4(v.m128) > vfloat4(-FLT_LARGE)) & (vfloat4(v.m128) < vfloat4(+FLT_LARGE)));
  260. }
  261. __forceinline bool is_finite4 ( const Vec3fa& a ) {
  262. return all((vfloat4(a.m128) >= vfloat4(-FLT_MAX)) & (vfloat4(a.m128) <= vfloat4(+FLT_MAX)));
  263. }
  264. ////////////////////////////////////////////////////////////////////////////////
  265. /// Euclidean Space Operators
  266. ////////////////////////////////////////////////////////////////////////////////
  267. #if defined(__SSE4_1__)
  268. __forceinline float dot ( const Vec3fa& a, const Vec3fa& b ) {
  269. return _mm_cvtss_f32(_mm_dp_ps(a.m128,b.m128,0x7F));
  270. }
  271. #else
  272. __forceinline float dot ( const Vec3fa& a, const Vec3fa& b ) {
  273. return reduce_add(a*b);
  274. }
  275. #endif
  276. __forceinline Vec3fa cross ( const Vec3fa& a, const Vec3fa& b )
  277. {
  278. vfloat4 a0 = vfloat4(a.m128);
  279. vfloat4 b0 = shuffle<1,2,0,3>(vfloat4(b.m128));
  280. vfloat4 a1 = shuffle<1,2,0,3>(vfloat4(a.m128));
  281. vfloat4 b1 = vfloat4(b.m128);
  282. return Vec3fa(shuffle<1,2,0,3>(msub(a0,b0,a1*b1)));
  283. }
  284. __forceinline float sqr_length ( const Vec3fa& a ) { return dot(a,a); }
  285. __forceinline float rcp_length ( const Vec3fa& a ) { return rsqrt(dot(a,a)); }
  286. __forceinline float rcp_length2( const Vec3fa& a ) { return rcp(dot(a,a)); }
  287. __forceinline float length ( const Vec3fa& a ) { return sqrt(dot(a,a)); }
  288. __forceinline Vec3fa normalize( const Vec3fa& a ) { return a*rsqrt(dot(a,a)); }
  289. __forceinline float distance ( const Vec3fa& a, const Vec3fa& b ) { return length(a-b); }
  290. __forceinline float halfArea ( const Vec3fa& d ) { return madd(d.x,(d.y+d.z),d.y*d.z); }
  291. __forceinline float area ( const Vec3fa& d ) { return 2.0f*halfArea(d); }
  292. __forceinline Vec3fa normalize_safe( const Vec3fa& a ) {
  293. const float d = dot(a,a); if (unlikely(d == 0.0f)) return a; else return a*rsqrt(d);
  294. }
  295. /*! differentiated normalization */
  296. __forceinline Vec3fa dnormalize(const Vec3fa& p, const Vec3fa& dp)
  297. {
  298. const float pp = dot(p,p);
  299. const float pdp = dot(p,dp);
  300. return (pp*dp-pdp*p)*rcp(pp)*rsqrt(pp);
  301. }
  302. ////////////////////////////////////////////////////////////////////////////////
  303. /// Select
  304. ////////////////////////////////////////////////////////////////////////////////
  305. __forceinline Vec3fa select( bool s, const Vec3fa& t, const Vec3fa& f ) {
  306. __m128 mask = s ? _mm_castsi128_ps(_mm_cmpeq_epi32(_mm_setzero_si128(), _mm_setzero_si128())) : _mm_setzero_ps();
  307. return blendv_ps(f.m128, t.m128, mask);
  308. }
  309. __forceinline Vec3fa select( const Vec3ba& s, const Vec3fa& t, const Vec3fa& f ) {
  310. return blendv_ps(f.m128, t.m128, s);
  311. }
  312. __forceinline Vec3fa lerp(const Vec3fa& v0, const Vec3fa& v1, const float t) {
  313. return madd(1.0f-t,v0,t*v1);
  314. }
  315. __forceinline int maxDim ( const Vec3fa& a )
  316. {
  317. const Vec3fa b = abs(a);
  318. if (b.x > b.y) {
  319. if (b.x > b.z) return 0; else return 2;
  320. } else {
  321. if (b.y > b.z) return 1; else return 2;
  322. }
  323. }
  324. ////////////////////////////////////////////////////////////////////////////////
  325. /// Rounding Functions
  326. ////////////////////////////////////////////////////////////////////////////////
  327. #if defined(__aarch64__)
  328. __forceinline Vec3fa floor(const Vec3fa& a) { return vrndmq_f32(a.m128); }
  329. __forceinline Vec3fa ceil (const Vec3fa& a) { return vrndpq_f32(a.m128); }
  330. __forceinline Vec3fa trunc(const Vec3fa& a) { return vrndq_f32(a.m128); }
  331. #elif defined (__SSE4_1__)
  332. __forceinline Vec3fa trunc( const Vec3fa& a ) { return _mm_round_ps(a.m128, _MM_FROUND_TO_NEAREST_INT); }
  333. __forceinline Vec3fa floor( const Vec3fa& a ) { return _mm_round_ps(a.m128, _MM_FROUND_TO_NEG_INF ); }
  334. __forceinline Vec3fa ceil ( const Vec3fa& a ) { return _mm_round_ps(a.m128, _MM_FROUND_TO_POS_INF ); }
  335. #else
  336. __forceinline Vec3fa trunc( const Vec3fa& a ) { return Vec3fa(truncf(a.x),truncf(a.y),truncf(a.z)); }
  337. __forceinline Vec3fa floor( const Vec3fa& a ) { return Vec3fa(floorf(a.x),floorf(a.y),floorf(a.z)); }
  338. __forceinline Vec3fa ceil ( const Vec3fa& a ) { return Vec3fa(ceilf (a.x),ceilf (a.y),ceilf (a.z)); }
  339. #endif
  340. ////////////////////////////////////////////////////////////////////////////////
  341. /// Output Operators
  342. ////////////////////////////////////////////////////////////////////////////////
  343. __forceinline embree_ostream operator<<(embree_ostream cout, const Vec3fa& a) {
  344. return cout << "(" << a.x << ", " << a.y << ", " << a.z << ")";
  345. }
  346. typedef Vec3fa Vec3fa_t;
  347. ////////////////////////////////////////////////////////////////////////////////
  348. /// SSE Vec3fx Type
  349. ////////////////////////////////////////////////////////////////////////////////
  350. struct __aligned(16) Vec3fx
  351. {
  352. ALIGNED_STRUCT_(16);
  353. typedef float Scalar;
  354. enum { N = 3 };
  355. union {
  356. __m128 m128;
  357. struct { float x,y,z; union { int a; unsigned u; float w; }; };
  358. };
  359. ////////////////////////////////////////////////////////////////////////////////
  360. /// Constructors, Assignment & Cast Operators
  361. ////////////////////////////////////////////////////////////////////////////////
  362. __forceinline Vec3fx( ) {}
  363. __forceinline Vec3fx( const __m128 a ) : m128(a) {}
  364. __forceinline explicit Vec3fx(const Vec3fa& v) : m128(v.m128) {}
  365. __forceinline operator Vec3fa () const { return Vec3fa(m128); }
  366. __forceinline explicit Vec3fx ( const Vec3<float>& other ) { m128 = _mm_set_ps(0, other.z, other.y, other.x); }
  367. //__forceinline Vec3fx& operator =( const Vec3<float>& other ) { m128 = _mm_set_ps(0, other.z, other.y, other.x); return *this; }
  368. __forceinline Vec3fx ( const Vec3fx& other ) { m128 = other.m128; }
  369. __forceinline Vec3fx& operator =( const Vec3fx& other ) { m128 = other.m128; return *this; }
  370. __forceinline explicit Vec3fx( const float a ) : m128(_mm_set1_ps(a)) {}
  371. __forceinline Vec3fx( const float x, const float y, const float z) : m128(_mm_set_ps(0, z, y, x)) {}
  372. __forceinline Vec3fx( const Vec3fa& other, const int a1) { m128 = other.m128; a = a1; }
  373. __forceinline Vec3fx( const Vec3fa& other, const unsigned a1) { m128 = other.m128; u = a1; }
  374. __forceinline Vec3fx( const Vec3fa& other, const float w1) {
  375. #if defined (__aarch64__)
  376. m128 = other.m128; m128[3] = w1;
  377. #elif defined (__SSE4_1__)
  378. m128 = _mm_insert_ps(other.m128, _mm_set_ss(w1),3 << 4);
  379. #else
  380. const vint4 mask(-1,-1,-1,0);
  381. m128 = select(vboolf4(_mm_castsi128_ps(mask)),vfloat4(other.m128),vfloat4(w1));
  382. #endif
  383. }
  384. //__forceinline Vec3fx( const float x, const float y, const float z, const int a) : x(x), y(y), z(z), a(a) {} // not working properly!
  385. //__forceinline Vec3fx( const float x, const float y, const float z, const unsigned a) : x(x), y(y), z(z), u(a) {} // not working properly!
  386. __forceinline Vec3fx( const float x, const float y, const float z, const float w) : m128(_mm_set_ps(w, z, y, x)) {}
  387. //__forceinline explicit Vec3fx( const __m128i a ) : m128(_mm_cvtepi32_ps(a)) {}
  388. __forceinline explicit operator const vfloat4() const { return vfloat4(m128); }
  389. __forceinline explicit operator const vint4() const { return vint4(_mm_cvtps_epi32(m128)); }
  390. __forceinline explicit operator const Vec2fa() const { return Vec2fa(m128); }
  391. __forceinline explicit operator const Vec3ia() const { return Vec3ia(_mm_cvtps_epi32(m128)); }
  392. //__forceinline operator const __m128&() const { return m128; }
  393. //__forceinline operator __m128&() { return m128; }
  394. ////////////////////////////////////////////////////////////////////////////////
  395. /// Loads and Stores
  396. ////////////////////////////////////////////////////////////////////////////////
  397. static __forceinline Vec3fx load( const void* const a ) {
  398. return Vec3fx(_mm_and_ps(_mm_load_ps((float*)a),_mm_castsi128_ps(_mm_set_epi32(0, -1, -1, -1))));
  399. }
  400. static __forceinline Vec3fx loadu( const void* const a ) {
  401. return Vec3fx(_mm_loadu_ps((float*)a));
  402. }
  403. static __forceinline void storeu ( void* ptr, const Vec3fx& v ) {
  404. _mm_storeu_ps((float*)ptr,v.m128);
  405. }
  406. ////////////////////////////////////////////////////////////////////////////////
  407. /// Constants
  408. ////////////////////////////////////////////////////////////////////////////////
  409. __forceinline Vec3fx( ZeroTy ) : m128(_mm_setzero_ps()) {}
  410. __forceinline Vec3fx( OneTy ) : m128(_mm_set1_ps(1.0f)) {}
  411. __forceinline Vec3fx( PosInfTy ) : m128(_mm_set1_ps(pos_inf)) {}
  412. __forceinline Vec3fx( NegInfTy ) : m128(_mm_set1_ps(neg_inf)) {}
  413. ////////////////////////////////////////////////////////////////////////////////
  414. /// Array Access
  415. ////////////////////////////////////////////////////////////////////////////////
  416. __forceinline const float& operator []( const size_t index ) const { assert(index < 3); return (&x)[index]; }
  417. __forceinline float& operator []( const size_t index ) { assert(index < 3); return (&x)[index]; }
  418. };
  419. ////////////////////////////////////////////////////////////////////////////////
  420. /// Unary Operators
  421. ////////////////////////////////////////////////////////////////////////////////
  422. __forceinline Vec3fx operator +( const Vec3fx& a ) { return a; }
  423. __forceinline Vec3fx operator -( const Vec3fx& a ) {
  424. const __m128 mask = _mm_castsi128_ps(_mm_set1_epi32(0x80000000));
  425. return _mm_xor_ps(a.m128, mask);
  426. }
  427. __forceinline Vec3fx abs ( const Vec3fx& a ) {
  428. const __m128 mask = _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff));
  429. return _mm_and_ps(a.m128, mask);
  430. }
  431. __forceinline Vec3fx sign ( const Vec3fx& a ) {
  432. return blendv_ps(Vec3fx(one).m128, (-Vec3fx(one)).m128, _mm_cmplt_ps (a.m128,Vec3fx(zero).m128));
  433. }
  434. __forceinline Vec3fx rcp ( const Vec3fx& a )
  435. {
  436. #if defined(__AVX512VL__)
  437. const Vec3fx r = _mm_rcp14_ps(a.m128);
  438. #else
  439. const Vec3fx r = _mm_rcp_ps(a.m128);
  440. #endif
  441. #if defined(__AVX2__)
  442. const Vec3fx res = _mm_mul_ps(r.m128,_mm_fnmadd_ps(r.m128, a.m128, vfloat4(2.0f)));
  443. #else
  444. const Vec3fx res = _mm_mul_ps(r.m128,_mm_sub_ps(vfloat4(2.0f), _mm_mul_ps(r.m128, a.m128)));
  445. //return _mm_sub_ps(_mm_add_ps(r, r), _mm_mul_ps(_mm_mul_ps(r, r), a));
  446. #endif
  447. return res;
  448. }
  449. __forceinline Vec3fx sqrt ( const Vec3fx& a ) { return _mm_sqrt_ps(a.m128); }
  450. __forceinline Vec3fx sqr ( const Vec3fx& a ) { return _mm_mul_ps(a.m128,a.m128); }
  451. __forceinline Vec3fx rsqrt( const Vec3fx& a )
  452. {
  453. #if defined(__AVX512VL__)
  454. __m128 r = _mm_rsqrt14_ps(a.m128);
  455. #else
  456. __m128 r = _mm_rsqrt_ps(a.m128);
  457. #endif
  458. return _mm_add_ps(_mm_mul_ps(_mm_set1_ps(1.5f),r), _mm_mul_ps(_mm_mul_ps(_mm_mul_ps(a.m128, _mm_set1_ps(-0.5f)), r), _mm_mul_ps(r, r)));
  459. }
  460. __forceinline Vec3fx zero_fix(const Vec3fx& a) {
  461. return blendv_ps(a.m128, _mm_set1_ps(min_rcp_input), _mm_cmplt_ps (abs(a).m128, _mm_set1_ps(min_rcp_input)));
  462. }
  463. __forceinline Vec3fx rcp_safe(const Vec3fx& a) {
  464. return rcp(zero_fix(a));
  465. }
  466. __forceinline Vec3fx log ( const Vec3fx& a ) {
  467. return Vec3fx(logf(a.x),logf(a.y),logf(a.z));
  468. }
  469. __forceinline Vec3fx exp ( const Vec3fx& a ) {
  470. return Vec3fx(expf(a.x),expf(a.y),expf(a.z));
  471. }
  472. ////////////////////////////////////////////////////////////////////////////////
  473. /// Binary Operators
  474. ////////////////////////////////////////////////////////////////////////////////
  475. __forceinline Vec3fx operator +( const Vec3fx& a, const Vec3fx& b ) { return _mm_add_ps(a.m128, b.m128); }
  476. __forceinline Vec3fx operator -( const Vec3fx& a, const Vec3fx& b ) { return _mm_sub_ps(a.m128, b.m128); }
  477. __forceinline Vec3fx operator *( const Vec3fx& a, const Vec3fx& b ) { return _mm_mul_ps(a.m128, b.m128); }
  478. __forceinline Vec3fx operator *( const Vec3fx& a, const float b ) { return a * Vec3fx(b); }
  479. __forceinline Vec3fx operator *( const float a, const Vec3fx& b ) { return Vec3fx(a) * b; }
  480. __forceinline Vec3fx operator /( const Vec3fx& a, const Vec3fx& b ) { return _mm_div_ps(a.m128,b.m128); }
  481. __forceinline Vec3fx operator /( const Vec3fx& a, const float b ) { return _mm_div_ps(a.m128,_mm_set1_ps(b)); }
  482. __forceinline Vec3fx operator /( const float a, const Vec3fx& b ) { return _mm_div_ps(_mm_set1_ps(a),b.m128); }
  483. __forceinline Vec3fx min( const Vec3fx& a, const Vec3fx& b ) { return _mm_min_ps(a.m128,b.m128); }
  484. __forceinline Vec3fx max( const Vec3fx& a, const Vec3fx& b ) { return _mm_max_ps(a.m128,b.m128); }
  /*! mini/maxi: minimum/maximum taken on the float bit patterns reinterpreted
   *  as signed 32-bit integers; only available with SSE4.1 or on aarch64 */
#if defined(__SSE4_1__) || defined(__aarch64__)
  __forceinline Vec3fx mini(const Vec3fx& a, const Vec3fx& b)
  {
    const vint4 a_bits = _mm_castps_si128(a.m128);
    const vint4 b_bits = _mm_castps_si128(b.m128);
    const vint4 lowest = _mm_min_epi32(a_bits,b_bits);
    return _mm_castsi128_ps(lowest);
  }
#endif

#if defined(__SSE4_1__) || defined(__aarch64__)
  __forceinline Vec3fx maxi(const Vec3fx& a, const Vec3fx& b)
  {
    const vint4 a_bits = _mm_castps_si128(a.m128);
    const vint4 b_bits = _mm_castps_si128(b.m128);
    const vint4 highest = _mm_max_epi32(a_bits,b_bits);
    return _mm_castsi128_ps(highest);
  }
#endif
  501. __forceinline Vec3fx pow ( const Vec3fx& a, const float& b ) {
  502. return Vec3fx(powf(a.x,b),powf(a.y,b),powf(a.z,b));
  503. }
  504. ////////////////////////////////////////////////////////////////////////////////
  505. /// Ternary Operators
  506. ////////////////////////////////////////////////////////////////////////////////
  507. #if defined(__AVX2__)
  508. __forceinline Vec3fx madd ( const Vec3fx& a, const Vec3fx& b, const Vec3fx& c) { return _mm_fmadd_ps(a.m128,b.m128,c.m128); }
  509. __forceinline Vec3fx msub ( const Vec3fx& a, const Vec3fx& b, const Vec3fx& c) { return _mm_fmsub_ps(a.m128,b.m128,c.m128); }
  510. __forceinline Vec3fx nmadd ( const Vec3fx& a, const Vec3fx& b, const Vec3fx& c) { return _mm_fnmadd_ps(a.m128,b.m128,c.m128); }
  511. __forceinline Vec3fx nmsub ( const Vec3fx& a, const Vec3fx& b, const Vec3fx& c) { return _mm_fnmsub_ps(a.m128,b.m128,c.m128); }
  512. #else
  513. __forceinline Vec3fx madd ( const Vec3fx& a, const Vec3fx& b, const Vec3fx& c) { return a*b+c; }
  514. __forceinline Vec3fx msub ( const Vec3fx& a, const Vec3fx& b, const Vec3fx& c) { return a*b-c; }
  515. __forceinline Vec3fx nmadd ( const Vec3fx& a, const Vec3fx& b, const Vec3fx& c) { return -a*b+c;}
  516. __forceinline Vec3fx nmsub ( const Vec3fx& a, const Vec3fx& b, const Vec3fx& c) { return -a*b-c; }
  517. #endif
  518. __forceinline Vec3fx madd ( const float a, const Vec3fx& b, const Vec3fx& c) { return madd(Vec3fx(a),b,c); }
  519. __forceinline Vec3fx msub ( const float a, const Vec3fx& b, const Vec3fx& c) { return msub(Vec3fx(a),b,c); }
  520. __forceinline Vec3fx nmadd ( const float a, const Vec3fx& b, const Vec3fx& c) { return nmadd(Vec3fx(a),b,c); }
  521. __forceinline Vec3fx nmsub ( const float a, const Vec3fx& b, const Vec3fx& c) { return nmsub(Vec3fx(a),b,c); }
  522. ////////////////////////////////////////////////////////////////////////////////
  523. /// Assignment Operators
  524. ////////////////////////////////////////////////////////////////////////////////
  525. __forceinline Vec3fx& operator +=( Vec3fx& a, const Vec3fx& b ) { return a = a + b; }
  526. __forceinline Vec3fx& operator -=( Vec3fx& a, const Vec3fx& b ) { return a = a - b; }
  527. __forceinline Vec3fx& operator *=( Vec3fx& a, const Vec3fx& b ) { return a = a * b; }
  528. __forceinline Vec3fx& operator *=( Vec3fx& a, const float b ) { return a = a * b; }
  529. __forceinline Vec3fx& operator /=( Vec3fx& a, const Vec3fx& b ) { return a = a / b; }
  530. __forceinline Vec3fx& operator /=( Vec3fx& a, const float b ) { return a = a / b; }
  531. ////////////////////////////////////////////////////////////////////////////////
  532. /// Reductions
  533. ////////////////////////////////////////////////////////////////////////////////
  534. __forceinline float reduce_add(const Vec3fx& v) {
  535. const vfloat4 a(v.m128);
  536. const vfloat4 b = shuffle<1>(a);
  537. const vfloat4 c = shuffle<2>(a);
  538. return _mm_cvtss_f32(a+b+c);
  539. }
  540. __forceinline float reduce_mul(const Vec3fx& v) { return v.x*v.y*v.z; }
  541. __forceinline float reduce_min(const Vec3fx& v) { return min(v.x,v.y,v.z); }
  542. __forceinline float reduce_max(const Vec3fx& v) { return max(v.x,v.y,v.z); }
  543. ////////////////////////////////////////////////////////////////////////////////
  544. /// Comparison Operators
  545. ////////////////////////////////////////////////////////////////////////////////
  546. __forceinline bool operator ==( const Vec3fx& a, const Vec3fx& b ) { return (_mm_movemask_ps(_mm_cmpeq_ps (a.m128, b.m128)) & 7) == 7; }
  547. __forceinline bool operator !=( const Vec3fx& a, const Vec3fx& b ) { return (_mm_movemask_ps(_mm_cmpneq_ps(a.m128, b.m128)) & 7) != 0; }
  548. __forceinline Vec3ba eq_mask( const Vec3fx& a, const Vec3fx& b ) { return _mm_cmpeq_ps (a.m128, b.m128); }
  549. __forceinline Vec3ba neq_mask(const Vec3fx& a, const Vec3fx& b ) { return _mm_cmpneq_ps(a.m128, b.m128); }
  550. __forceinline Vec3ba lt_mask( const Vec3fx& a, const Vec3fx& b ) { return _mm_cmplt_ps (a.m128, b.m128); }
  551. __forceinline Vec3ba le_mask( const Vec3fx& a, const Vec3fx& b ) { return _mm_cmple_ps (a.m128, b.m128); }
  552. __forceinline Vec3ba gt_mask( const Vec3fx& a, const Vec3fx& b ) { return _mm_cmpnle_ps(a.m128, b.m128); }
  553. __forceinline Vec3ba ge_mask( const Vec3fx& a, const Vec3fx& b ) { return _mm_cmpnlt_ps(a.m128, b.m128); }
  554. __forceinline bool isvalid ( const Vec3fx& v ) {
  555. return all(gt_mask(v,Vec3fx(-FLT_LARGE)) & lt_mask(v,Vec3fx(+FLT_LARGE)));
  556. }
  557. __forceinline bool is_finite ( const Vec3fx& a ) {
  558. return all(ge_mask(a,Vec3fx(-FLT_MAX)) & le_mask(a,Vec3fx(+FLT_MAX)));
  559. }
  560. __forceinline bool isvalid4 ( const Vec3fx& v ) {
  561. return all((vfloat4(v.m128) > vfloat4(-FLT_LARGE)) & (vfloat4(v.m128) < vfloat4(+FLT_LARGE)));
  562. }
  563. __forceinline bool is_finite4 ( const Vec3fx& a ) {
  564. return all((vfloat4(a.m128) >= vfloat4(-FLT_MAX)) & (vfloat4(a.m128) <= vfloat4(+FLT_MAX)));
  565. }
  566. ////////////////////////////////////////////////////////////////////////////////
  567. /// Euclidean Space Operators
  568. ////////////////////////////////////////////////////////////////////////////////
  569. #if defined(__SSE4_1__)
  570. __forceinline float dot ( const Vec3fx& a, const Vec3fx& b ) {
  571. return _mm_cvtss_f32(_mm_dp_ps(a.m128,b.m128,0x7F));
  572. }
  573. #else
  574. __forceinline float dot ( const Vec3fx& a, const Vec3fx& b ) {
  575. return reduce_add(a*b);
  576. }
  577. #endif
  578. __forceinline Vec3fx cross ( const Vec3fx& a, const Vec3fx& b )
  579. {
  580. vfloat4 a0 = vfloat4(a.m128);
  581. vfloat4 b0 = shuffle<1,2,0,3>(vfloat4(b.m128));
  582. vfloat4 a1 = shuffle<1,2,0,3>(vfloat4(a.m128));
  583. vfloat4 b1 = vfloat4(b.m128);
  584. return Vec3fx(shuffle<1,2,0,3>(msub(a0,b0,a1*b1)));
  585. }
  586. __forceinline float sqr_length ( const Vec3fx& a ) { return dot(a,a); }
  587. __forceinline float rcp_length ( const Vec3fx& a ) { return rsqrt(dot(a,a)); }
  588. __forceinline float rcp_length2( const Vec3fx& a ) { return rcp(dot(a,a)); }
  589. __forceinline float length ( const Vec3fx& a ) { return sqrt(dot(a,a)); }
  590. __forceinline Vec3fx normalize( const Vec3fx& a ) { return a*rsqrt(dot(a,a)); }
  591. __forceinline float distance ( const Vec3fx& a, const Vec3fx& b ) { return length(a-b); }
  592. __forceinline float halfArea ( const Vec3fx& d ) { return madd(d.x,(d.y+d.z),d.y*d.z); }
  593. __forceinline float area ( const Vec3fx& d ) { return 2.0f*halfArea(d); }
  594. __forceinline Vec3fx normalize_safe( const Vec3fx& a ) {
  595. const float d = dot(a,a); if (unlikely(d == 0.0f)) return a; else return a*rsqrt(d);
  596. }
  597. /*! differentiated normalization */
  598. __forceinline Vec3fx dnormalize(const Vec3fx& p, const Vec3fx& dp)
  599. {
  600. const float pp = dot(p,p);
  601. const float pdp = dot(p,dp);
  602. return (pp*dp-pdp*p)*rcp(pp)*rsqrt(pp);
  603. }
  604. ////////////////////////////////////////////////////////////////////////////////
  605. /// Select
  606. ////////////////////////////////////////////////////////////////////////////////
  607. __forceinline Vec3fx select( bool s, const Vec3fx& t, const Vec3fx& f ) {
  608. __m128 mask = s ? _mm_castsi128_ps(_mm_cmpeq_epi32(_mm_setzero_si128(), _mm_setzero_si128())) : _mm_setzero_ps();
  609. return blendv_ps(f.m128, t.m128, mask);
  610. }
  611. __forceinline Vec3fx select( const Vec3ba& s, const Vec3fx& t, const Vec3fx& f ) {
  612. return blendv_ps(f.m128, t.m128, s);
  613. }
  614. __forceinline Vec3fx lerp(const Vec3fx& v0, const Vec3fx& v1, const float t) {
  615. return madd(1.0f-t,v0,t*v1);
  616. }
  617. __forceinline int maxDim ( const Vec3fx& a )
  618. {
  619. const Vec3fx b = abs(a);
  620. if (b.x > b.y) {
  621. if (b.x > b.z) return 0; else return 2;
  622. } else {
  623. if (b.y > b.z) return 1; else return 2;
  624. }
  625. }
  626. ////////////////////////////////////////////////////////////////////////////////
  627. /// Rounding Functions
  628. ////////////////////////////////////////////////////////////////////////////////
  629. #if defined(__aarch64__)
  630. __forceinline Vec3fx trunc(const Vec3fx& a) { return vrndq_f32(a.m128); }
  631. __forceinline Vec3fx floor(const Vec3fx& a) { return vrndmq_f32(a.m128); }
  632. __forceinline Vec3fx ceil (const Vec3fx& a) { return vrndpq_f32(a.m128); }
  633. #elif defined (__SSE4_1__)
  634. __forceinline Vec3fx trunc( const Vec3fx& a ) { return _mm_round_ps(a.m128, _MM_FROUND_TO_NEAREST_INT); }
  635. __forceinline Vec3fx floor( const Vec3fx& a ) { return _mm_round_ps(a.m128, _MM_FROUND_TO_NEG_INF ); }
  636. __forceinline Vec3fx ceil ( const Vec3fx& a ) { return _mm_round_ps(a.m128, _MM_FROUND_TO_POS_INF ); }
  637. #else
  638. __forceinline Vec3fx trunc( const Vec3fx& a ) { return Vec3fx(truncf(a.x),truncf(a.y),truncf(a.z)); }
  639. __forceinline Vec3fx floor( const Vec3fx& a ) { return Vec3fx(floorf(a.x),floorf(a.y),floorf(a.z)); }
  640. __forceinline Vec3fx ceil ( const Vec3fx& a ) { return Vec3fx(ceilf (a.x),ceilf (a.y),ceilf (a.z)); }
  641. #endif
  642. ////////////////////////////////////////////////////////////////////////////////
  643. /// Output Operators
  644. ////////////////////////////////////////////////////////////////////////////////
  645. __forceinline embree_ostream operator<<(embree_ostream cout, const Vec3fx& a) {
  646. return cout << "(" << a.x << ", " << a.y << ", " << a.z << ")";
  647. }
  648. typedef Vec3fx Vec3ff;
  649. }
  650. #endif