vboolf4_sse2.h 9.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181
  1. // ======================================================================== //
  2. // Copyright 2009-2017 Intel Corporation //
  3. // //
  4. // Licensed under the Apache License, Version 2.0 (the "License"); //
  5. // you may not use this file except in compliance with the License. //
  6. // You may obtain a copy of the License at //
  7. // //
  8. // http://www.apache.org/licenses/LICENSE-2.0 //
  9. // //
  10. // Unless required by applicable law or agreed to in writing, software //
  11. // distributed under the License is distributed on an "AS IS" BASIS, //
  12. // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. //
  13. // See the License for the specific language governing permissions and //
  14. // limitations under the License. //
  15. // ======================================================================== //
  16. #pragma once
  17. namespace embree
  18. {
  19. /* 4-wide SSE bool type */
  20. template<>
  21. struct vboolf<4>
  22. {
  23. typedef vboolf4 Bool;
  24. typedef vint4 Int;
  25. typedef vfloat4 Float;
  26. enum { size = 4 }; // number of SIMD elements
  27. union { __m128 v; int i[4]; }; // data
  28. ////////////////////////////////////////////////////////////////////////////////
  29. /// Constructors, Assignment & Cast Operators
  30. ////////////////////////////////////////////////////////////////////////////////
  31. __forceinline vboolf ( ) {}
  32. __forceinline vboolf ( const vboolf4& other ) { v = other.v; }
  33. __forceinline vboolf4& operator=( const vboolf4& other ) { v = other.v; return *this; }
  34. __forceinline vboolf( const __m128 input ) : v(input) {}
  35. __forceinline operator const __m128&( void ) const { return v; }
  36. __forceinline operator const __m128i( void ) const { return _mm_castps_si128(v); }
  37. __forceinline operator const __m128d( void ) const { return _mm_castps_pd(v); }
  38. __forceinline vboolf( bool a )
  39. : v(_mm_lookupmask_ps[(size_t(a) << 3) | (size_t(a) << 2) | (size_t(a) << 1) | size_t(a)]) {}
  40. __forceinline vboolf( bool a, bool b )
  41. : v(_mm_lookupmask_ps[(size_t(b) << 3) | (size_t(a) << 2) | (size_t(b) << 1) | size_t(a)]) {}
  42. __forceinline vboolf( bool a, bool b, bool c, bool d )
  43. : v(_mm_lookupmask_ps[(size_t(d) << 3) | (size_t(c) << 2) | (size_t(b) << 1) | size_t(a)]) {}
  44. __forceinline vboolf( int mask ) {
  45. assert(mask >= 0 && mask < 16);
  46. v = _mm_lookupmask_ps[mask];
  47. }
  48. /* return int32 mask */
  49. __forceinline __m128i mask32() const {
  50. return _mm_castps_si128(v);
  51. }
  52. ////////////////////////////////////////////////////////////////////////////////
  53. /// Constants
  54. ////////////////////////////////////////////////////////////////////////////////
  55. __forceinline vboolf( FalseTy ) : v(_mm_setzero_ps()) {}
  56. __forceinline vboolf( TrueTy ) : v(_mm_castsi128_ps(_mm_cmpeq_epi32(_mm_setzero_si128(), _mm_setzero_si128()))) {}
  57. ////////////////////////////////////////////////////////////////////////////////
  58. /// Array Access
  59. ////////////////////////////////////////////////////////////////////////////////
  60. __forceinline bool operator []( const size_t index ) const { assert(index < 4); return (_mm_movemask_ps(v) >> index) & 1; }
  61. __forceinline int& operator []( const size_t index ) { assert(index < 4); return i[index]; }
  62. };
  63. ////////////////////////////////////////////////////////////////////////////////
  64. /// Unary Operators
  65. ////////////////////////////////////////////////////////////////////////////////
  66. __forceinline const vboolf4 operator !( const vboolf4& a ) { return _mm_xor_ps(a, vboolf4(embree::True)); }
  67. ////////////////////////////////////////////////////////////////////////////////
  68. /// Binary Operators
  69. ////////////////////////////////////////////////////////////////////////////////
  70. __forceinline const vboolf4 operator &( const vboolf4& a, const vboolf4& b ) { return _mm_and_ps(a, b); }
  71. __forceinline const vboolf4 operator |( const vboolf4& a, const vboolf4& b ) { return _mm_or_ps (a, b); }
  72. __forceinline const vboolf4 operator ^( const vboolf4& a, const vboolf4& b ) { return _mm_xor_ps(a, b); }
  73. ////////////////////////////////////////////////////////////////////////////////
  74. /// Assignment Operators
  75. ////////////////////////////////////////////////////////////////////////////////
  76. __forceinline const vboolf4 operator &=( vboolf4& a, const vboolf4& b ) { return a = a & b; }
  77. __forceinline const vboolf4 operator |=( vboolf4& a, const vboolf4& b ) { return a = a | b; }
  78. __forceinline const vboolf4 operator ^=( vboolf4& a, const vboolf4& b ) { return a = a ^ b; }
  79. ////////////////////////////////////////////////////////////////////////////////
  80. /// Comparison Operators + Select
  81. ////////////////////////////////////////////////////////////////////////////////
  82. __forceinline const vboolf4 operator !=( const vboolf4& a, const vboolf4& b ) { return _mm_xor_ps(a, b); }
  83. __forceinline const vboolf4 operator ==( const vboolf4& a, const vboolf4& b ) { return _mm_castsi128_ps(_mm_cmpeq_epi32(a, b)); }
  84. __forceinline const vboolf4 select( const vboolf4& m, const vboolf4& t, const vboolf4& f ) {
  85. #if defined(__SSE4_1__)
  86. return _mm_blendv_ps(f, t, m);
  87. #else
  88. return _mm_or_ps(_mm_and_ps(m, t), _mm_andnot_ps(m, f));
  89. #endif
  90. }
  91. ////////////////////////////////////////////////////////////////////////////////
  92. /// Movement/Shifting/Shuffling Functions
  93. ////////////////////////////////////////////////////////////////////////////////
  94. __forceinline const vboolf4 unpacklo( const vboolf4& a, const vboolf4& b ) { return _mm_unpacklo_ps(a, b); }
  95. __forceinline const vboolf4 unpackhi( const vboolf4& a, const vboolf4& b ) { return _mm_unpackhi_ps(a, b); }
  96. template<size_t i0, size_t i1, size_t i2, size_t i3> __forceinline const vboolf4 shuffle( const vboolf4& a ) {
  97. return _mm_shuffle_epi32(a, _MM_SHUFFLE(i3, i2, i1, i0));
  98. }
  99. template<size_t i0, size_t i1, size_t i2, size_t i3> __forceinline const vboolf4 shuffle( const vboolf4& a, const vboolf4& b ) {
  100. return _mm_shuffle_ps(a, b, _MM_SHUFFLE(i3, i2, i1, i0));
  101. }
  102. template<size_t i0> __forceinline const vboolf4 shuffle( const vboolf4& b ) {
  103. return shuffle<i0,i0,i0,i0>(b);
  104. }
  105. #if defined(__SSE3__)
  106. template<> __forceinline const vboolf4 shuffle<0, 0, 2, 2>( const vboolf4& a ) { return _mm_moveldup_ps(a); }
  107. template<> __forceinline const vboolf4 shuffle<1, 1, 3, 3>( const vboolf4& a ) { return _mm_movehdup_ps(a); }
  108. template<> __forceinline const vboolf4 shuffle<0, 1, 0, 1>( const vboolf4& a ) { return _mm_castpd_ps(_mm_movedup_pd (a)); }
  109. #endif
  110. #if defined(__SSE4_1__)
  111. template<size_t dst, size_t src, size_t clr> __forceinline const vboolf4 insert( const vboolf4& a, const vboolf4& b ) { return _mm_insert_ps(a, b, (dst << 4) | (src << 6) | clr); }
  112. template<size_t dst, size_t src> __forceinline const vboolf4 insert( const vboolf4& a, const vboolf4& b ) { return insert<dst, src, 0>(a, b); }
  113. template<size_t dst> __forceinline const vboolf4 insert( const vboolf4& a, const bool b ) { return insert<dst,0>(a, vboolf4(b)); }
  114. #endif
  115. ////////////////////////////////////////////////////////////////////////////////
  116. /// Reduction Operations
  117. ////////////////////////////////////////////////////////////////////////////////
  118. __forceinline bool reduce_and( const vboolf4& a ) { return _mm_movemask_ps(a) == 0xf; }
  119. __forceinline bool reduce_or ( const vboolf4& a ) { return _mm_movemask_ps(a) != 0x0; }
  120. __forceinline bool all ( const vboolf4& b ) { return _mm_movemask_ps(b) == 0xf; }
  121. __forceinline bool any ( const vboolf4& b ) { return _mm_movemask_ps(b) != 0x0; }
  122. __forceinline bool none ( const vboolf4& b ) { return _mm_movemask_ps(b) == 0x0; }
  123. __forceinline bool all ( const vboolf4& valid, const vboolf4& b ) { return all((!valid) | b); }
  124. __forceinline bool any ( const vboolf4& valid, const vboolf4& b ) { return any( valid & b); }
  125. __forceinline bool none ( const vboolf4& valid, const vboolf4& b ) { return none(valid & b); }
  126. __forceinline size_t movemask( const vboolf4& a ) { return _mm_movemask_ps(a); }
  127. #if defined(__SSE4_2__)
  128. __forceinline size_t popcnt( const vboolf4& a ) { return __popcnt((size_t)_mm_movemask_ps(a)); }
  129. #else
  130. __forceinline size_t popcnt( const vboolf4& a ) { return bool(a[0])+bool(a[1])+bool(a[2])+bool(a[3]); }
  131. #endif
  132. ////////////////////////////////////////////////////////////////////////////////
  133. /// Get/Set Functions
  134. ////////////////////////////////////////////////////////////////////////////////
  135. __forceinline bool get(const vboolf4& a, size_t index) { return a[index]; }
  136. __forceinline void set(vboolf4& a, size_t index) { a[index] = -1; }
  137. __forceinline void clear(vboolf4& a, size_t index) { a[index] = 0; }
  138. ////////////////////////////////////////////////////////////////////////////////
  139. /// Output Operators
  140. ////////////////////////////////////////////////////////////////////////////////
  141. inline std::ostream& operator<<(std::ostream& cout, const vboolf4& a) {
  142. return cout << "<" << a[0] << ", " << a[1] << ", " << a[2] << ", " << a[3] << ">";
  143. }
  144. }