// vllong8_avx512.h
// ======================================================================== //
// Copyright 2009-2017 Intel Corporation                                    //
//                                                                          //
// Licensed under the Apache License, Version 2.0 (the "License");          //
// you may not use this file except in compliance with the License.         //
// You may obtain a copy of the License at                                  //
//                                                                          //
//     http://www.apache.org/licenses/LICENSE-2.0                           //
//                                                                          //
// Unless required by applicable law or agreed to in writing, software      //
// distributed under the License is distributed on an "AS IS" BASIS,        //
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. //
// See the License for the specific language governing permissions and      //
// limitations under the License.                                           //
// ======================================================================== //
#pragma once

namespace embree
{
  /* 8-wide AVX-512 64-bit long long type */
  template<>
  struct vllong<8>
  {
    typedef vboold8 Bool;

    enum { size = 8 }; // number of SIMD elements
    union {            // data
      __m512i v;
      long long i[8];
    };

    ////////////////////////////////////////////////////////////////////////////////
    /// Constructors, Assignment & Cast Operators
    ////////////////////////////////////////////////////////////////////////////////

    __forceinline vllong() {}
    __forceinline vllong(const vllong8& t) { v = t.v; }
    __forceinline vllong8& operator=(const vllong8& f) { v = f.v; return *this; }

    __forceinline vllong(const __m512i& t) { v = t; }
    __forceinline operator __m512i () const { return v; }
    __forceinline operator __m256i () const { return _mm512_castsi512_si256(v); } // lower four elements

    __forceinline vllong(const long long i) {
      v = _mm512_set1_epi64(i);
    }

    __forceinline vllong(const long long a, const long long b, const long long c, const long long d) {
      v = _mm512_set4_epi64(d,c,b,a);
    }

    __forceinline vllong(const long long a0, const long long a1, const long long a2, const long long a3,
                         const long long a4, const long long a5, const long long a6, const long long a7)
    {
      v = _mm512_set_epi64(a7,a6,a5,a4,a3,a2,a1,a0);
    }
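
    // Note on lane order: _mm512_set_epi64 and _mm512_set4_epi64 take their
    // arguments from the highest lane down, so the constructors above reverse
    // them. A small sketch of the resulting layout (illustration only, not
    // part of the original source):
    //
    //   vllong8 a(10,11,12,13,14,15,16,17); // a[0] == 10, ..., a[7] == 17
    //   vllong8 b(1,2,3,4);                 // lanes repeat: 1,2,3,4,1,2,3,4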

    ////////////////////////////////////////////////////////////////////////////////
    /// Constants
    ////////////////////////////////////////////////////////////////////////////////

    __forceinline vllong( ZeroTy        ) : v(_mm512_setzero_epi32()) {}
    __forceinline vllong( OneTy         ) : v(_mm512_set1_epi64(1)) {}
    __forceinline vllong( StepTy        ) : v(_mm512_set_epi64(7,6,5,4,3,2,1,0)) {}
    __forceinline vllong( ReverseStepTy ) : v(_mm512_setr_epi64(7,6,5,4,3,2,1,0)) {}

    __forceinline static vllong8 zero()    { return _mm512_setzero_epi32(); }
    __forceinline static vllong8 one ()    { return _mm512_set1_epi64(1); }
    __forceinline static vllong8 neg_one() { return _mm512_set1_epi64(-1); }

    ////////////////////////////////////////////////////////////////////////////////
    /// Loads and Stores
    ////////////////////////////////////////////////////////////////////////////////

    static __forceinline void store_nt(void* __restrict__ ptr, const vllong8& a) {
      _mm512_stream_si512((__m512i*)ptr,a);
    }

    static __forceinline vllong8 loadu(const void* addr) {
      return _mm512_loadu_si512(addr);
    }

    static __forceinline vllong8 load(const vllong8* addr) {
      return _mm512_load_si512(addr);
    }

    static __forceinline vllong8 load(const long long* addr) {
      return _mm512_load_si512(addr);
    }

    static __forceinline void store(void* ptr, const vllong8& v) {
      _mm512_store_si512(ptr,v);
    }

    static __forceinline void storeu(void* ptr, const vllong8& v) {
      _mm512_storeu_si512(ptr,v);
    }

    static __forceinline void storeu(const vboold8& mask, long long* ptr, const vllong8& f) {
      _mm512_mask_storeu_epi64(ptr,mask,f);
    }

    static __forceinline void store(const vboold8& mask, void* addr, const vllong8& v2) {
      _mm512_mask_store_epi64(addr,mask,v2);
    }

    /* pass by value to avoid compiler generating inefficient code */
    static __forceinline void storeu_compact(const vboold8 mask, void* addr, const vllong8& reg) {
      _mm512_mask_compressstoreu_epi64(addr,mask,reg);
    }

    static __forceinline vllong8 compact64bit(const vboold8& mask, vllong8& v) {
      return _mm512_mask_compress_epi64(v,mask,v);
    }

    static __forceinline vllong8 compact64bit(const vboold8& mask, vllong8& dest, const vllong8& source) {
      return _mm512_mask_compress_epi64(dest,mask,source);
    }

    static __forceinline vllong8 compact(const vboold8& mask, vllong8& v) {
      return _mm512_mask_compress_epi64(v,mask,v);
    }

    static __forceinline vllong8 compact(const vboold8& mask, const vllong8& a, vllong8& b) {
      return _mm512_mask_compress_epi64(a,mask,b);
    }

    static __forceinline vllong8 broadcast64bit(size_t v) {
      return _mm512_set1_epi64(v);
    }

    static __forceinline size_t extract64bit(const vllong8& v) {
      return _mm_cvtsi128_si64(_mm512_castsi512_si128(v));
    }
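
    // Sketch of the compaction semantics (illustration only): compress keeps
    // the lanes selected by the mask and packs them toward lane 0, and
    // storeu_compact writes exactly popcount(mask) values to memory.
    //
    //   long long dst[8];
    //   vllong8 x(step);                       // 0,1,2,3,4,5,6,7
    //   vllong8::storeu_compact(0x0a, dst, x); // assuming vboold8 converts from
    //                                          // the literal mask 0x0a (lanes 1,3);
    //                                          // dst[0] == 1, dst[1] == 3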

    ////////////////////////////////////////////////////////////////////////////////
    /// Array Access
    ////////////////////////////////////////////////////////////////////////////////

    __forceinline       long long& operator[](const size_t index)       { assert(index < 8); return i[index]; }
    __forceinline const long long& operator[](const size_t index) const { assert(index < 8); return i[index]; }
  };
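
  // A minimal usage sketch for the type above (illustration only, not part of
  // the original source):
  //
  //   vllong8 a(step);              // 0,1,2,3,4,5,6,7
  //   vllong8 b = a + vllong8(one); // 1,2,3,4,5,6,7,8 (operator+ defined below)
  //   long long out[8];
  //   vllong8::storeu(out, b);      // unaligned store of all 8 lanes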

  ////////////////////////////////////////////////////////////////////////////////
  /// Unary Operators
  ////////////////////////////////////////////////////////////////////////////////

  __forceinline const vllong8 asLong    ( const __m512& a ) { return _mm512_castps_si512(a); }
  __forceinline const vllong8 operator +( const vllong8& a ) { return a; }
  __forceinline const vllong8 operator -( const vllong8& a ) { return _mm512_sub_epi64(_mm512_setzero_epi32(), a); }

  ////////////////////////////////////////////////////////////////////////////////
  /// Binary Operators
  ////////////////////////////////////////////////////////////////////////////////

  __forceinline const vllong8 operator +( const vllong8& a, const vllong8&  b ) { return _mm512_add_epi64(a, b); }
  __forceinline const vllong8 operator +( const vllong8& a, const long long b ) { return a + vllong8(b); }
  __forceinline const vllong8 operator +( const long long a, const vllong8& b ) { return vllong8(a) + b; }

  __forceinline const vllong8 operator -( const vllong8& a, const vllong8&  b ) { return _mm512_sub_epi64(a, b); }
  __forceinline const vllong8 operator -( const vllong8& a, const long long b ) { return a - vllong8(b); }
  __forceinline const vllong8 operator -( const long long a, const vllong8& b ) { return vllong8(a) - b; }

  /* native 64-bit multiply requires AVX-512DQ */
  __forceinline const vllong8 operator *( const vllong8& a, const vllong8&  b ) { return _mm512_mullo_epi64(a, b); }
  __forceinline const vllong8 operator *( const vllong8& a, const long long b ) { return a * vllong8(b); }
  __forceinline const vllong8 operator *( const long long a, const vllong8& b ) { return vllong8(a) * b; }

  __forceinline const vllong8 operator &( const vllong8& a, const vllong8&  b ) { return _mm512_and_epi64(a, b); }
  __forceinline const vllong8 operator &( const vllong8& a, const long long b ) { return a & vllong8(b); }
  __forceinline const vllong8 operator &( const long long a, const vllong8& b ) { return vllong8(a) & b; }

  __forceinline const vllong8 operator |( const vllong8& a, const vllong8&  b ) { return _mm512_or_epi64(a, b); }
  __forceinline const vllong8 operator |( const vllong8& a, const long long b ) { return a | vllong8(b); }
  __forceinline const vllong8 operator |( const long long a, const vllong8& b ) { return vllong8(a) | b; }

  __forceinline const vllong8 operator ^( const vllong8& a, const vllong8&  b ) { return _mm512_xor_epi64(a, b); }
  __forceinline const vllong8 operator ^( const vllong8& a, const long long b ) { return a ^ vllong8(b); }
  __forceinline const vllong8 operator ^( const long long a, const vllong8& b ) { return vllong8(a) ^ b; }

  __forceinline const vllong8 operator <<( const vllong8& a, const long long n ) { return _mm512_slli_epi64(a, n); }
  __forceinline const vllong8 operator >>( const vllong8& a, const long long n ) { return _mm512_srai_epi64(a, n); } // arithmetic shift (signed)

  __forceinline const vllong8 operator <<( const vllong8& a, const vllong8& n ) { return _mm512_sllv_epi64(a, n); }
  __forceinline const vllong8 operator >>( const vllong8& a, const vllong8& n ) { return _mm512_srav_epi64(a, n); }

  __forceinline const vllong8 sll ( const vllong8& a, const long long b ) { return _mm512_slli_epi64(a, b); }
  __forceinline const vllong8 sra ( const vllong8& a, const long long b ) { return _mm512_srai_epi64(a, b); }
  __forceinline const vllong8 srl ( const vllong8& a, const long long b ) { return _mm512_srli_epi64(a, b); }

  __forceinline const vllong8 min( const vllong8& a, const vllong8&  b ) { return _mm512_min_epi64(a, b); }
  __forceinline const vllong8 min( const vllong8& a, const long long b ) { return min(a,vllong8(b)); }
  __forceinline const vllong8 min( const long long a, const vllong8& b ) { return min(vllong8(a),b); }

  __forceinline const vllong8 max( const vllong8& a, const vllong8&  b ) { return _mm512_max_epi64(a, b); }
  __forceinline const vllong8 max( const vllong8& a, const long long b ) { return max(a,vllong8(b)); }
  __forceinline const vllong8 max( const long long a, const vllong8& b ) { return max(vllong8(a),b); }

  __forceinline const vllong8 mask_add(const vboold8& m, const vllong8& c, const vllong8& a, const vllong8& b) { return _mm512_mask_add_epi64(c,m,a,b); }
  __forceinline const vllong8 mask_sub(const vboold8& m, const vllong8& c, const vllong8& a, const vllong8& b) { return _mm512_mask_sub_epi64(c,m,a,b); }
  __forceinline const vllong8 mask_and(const vboold8& m, const vllong8& c, const vllong8& a, const vllong8& b) { return _mm512_mask_and_epi64(c,m,a,b); }
  __forceinline const vllong8 mask_or (const vboold8& m, const vllong8& c, const vllong8& a, const vllong8& b) { return _mm512_mask_or_epi64 (c,m,a,b); }
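
  // Sketch of the masked-arithmetic semantics (illustration only): per lane k
  // the masked forms blend the operation result with the fallback c,
  //
  //   vllong8 r = mask_add(m, c, a, b);   // r[k] = m[k] ? a[k] + b[k] : c[k]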

  ////////////////////////////////////////////////////////////////////////////////
  /// Assignment Operators
  ////////////////////////////////////////////////////////////////////////////////

  __forceinline vllong8& operator +=( vllong8& a, const vllong8&  b ) { return a = a + b; }
  __forceinline vllong8& operator +=( vllong8& a, const long long b ) { return a = a + b; }

  __forceinline vllong8& operator -=( vllong8& a, const vllong8&  b ) { return a = a - b; }
  __forceinline vllong8& operator -=( vllong8& a, const long long b ) { return a = a - b; }

  __forceinline vllong8& operator *=( vllong8& a, const vllong8&  b ) { return a = a * b; }
  __forceinline vllong8& operator *=( vllong8& a, const long long b ) { return a = a * b; }

  __forceinline vllong8& operator &=( vllong8& a, const vllong8&  b ) { return a = a & b; }
  __forceinline vllong8& operator &=( vllong8& a, const long long b ) { return a = a & b; }

  __forceinline vllong8& operator |=( vllong8& a, const vllong8&  b ) { return a = a | b; }
  __forceinline vllong8& operator |=( vllong8& a, const long long b ) { return a = a | b; }

  __forceinline vllong8& operator <<=( vllong8& a, const long long b ) { return a = a << b; }
  __forceinline vllong8& operator >>=( vllong8& a, const long long b ) { return a = a >> b; }

  ////////////////////////////////////////////////////////////////////////////////
  /// Comparison Operators + Select
  ////////////////////////////////////////////////////////////////////////////////

  __forceinline const vboold8 operator ==( const vllong8& a, const vllong8&  b ) { return _mm512_cmp_epi64_mask(a,b,_MM_CMPINT_EQ); }
  __forceinline const vboold8 operator ==( const vllong8& a, const long long b ) { return a == vllong8(b); }
  __forceinline const vboold8 operator ==( const long long a, const vllong8& b ) { return vllong8(a) == b; }

  __forceinline const vboold8 operator !=( const vllong8& a, const vllong8&  b ) { return _mm512_cmp_epi64_mask(a,b,_MM_CMPINT_NE); }
  __forceinline const vboold8 operator !=( const vllong8& a, const long long b ) { return a != vllong8(b); }
  __forceinline const vboold8 operator !=( const long long a, const vllong8& b ) { return vllong8(a) != b; }

  __forceinline const vboold8 operator < ( const vllong8& a, const vllong8&  b ) { return _mm512_cmp_epi64_mask(a,b,_MM_CMPINT_LT); }
  __forceinline const vboold8 operator < ( const vllong8& a, const long long b ) { return a <  vllong8(b); }
  __forceinline const vboold8 operator < ( const long long a, const vllong8& b ) { return vllong8(a) <  b; }

  __forceinline const vboold8 operator >=( const vllong8& a, const vllong8&  b ) { return _mm512_cmp_epi64_mask(a,b,_MM_CMPINT_GE); }
  __forceinline const vboold8 operator >=( const vllong8& a, const long long b ) { return a >= vllong8(b); }
  __forceinline const vboold8 operator >=( const long long a, const vllong8& b ) { return vllong8(a) >= b; }

  __forceinline const vboold8 operator > ( const vllong8& a, const vllong8&  b ) { return _mm512_cmp_epi64_mask(a,b,_MM_CMPINT_GT); }
  __forceinline const vboold8 operator > ( const vllong8& a, const long long b ) { return a >  vllong8(b); }
  __forceinline const vboold8 operator > ( const long long a, const vllong8& b ) { return vllong8(a) >  b; }

  __forceinline const vboold8 operator <=( const vllong8& a, const vllong8&  b ) { return _mm512_cmp_epi64_mask(a,b,_MM_CMPINT_LE); }
  __forceinline const vboold8 operator <=( const vllong8& a, const long long b ) { return a <= vllong8(b); }
  __forceinline const vboold8 operator <=( const long long a, const vllong8& b ) { return vllong8(a) <= b; }

  __forceinline vboold8 eq(const vllong8& a, const vllong8& b) { return _mm512_cmp_epi64_mask(a,b,_MM_CMPINT_EQ); }
  __forceinline vboold8 ne(const vllong8& a, const vllong8& b) { return _mm512_cmp_epi64_mask(a,b,_MM_CMPINT_NE); }
  __forceinline vboold8 lt(const vllong8& a, const vllong8& b) { return _mm512_cmp_epi64_mask(a,b,_MM_CMPINT_LT); }
  __forceinline vboold8 ge(const vllong8& a, const vllong8& b) { return _mm512_cmp_epi64_mask(a,b,_MM_CMPINT_GE); }
  __forceinline vboold8 gt(const vllong8& a, const vllong8& b) { return _mm512_cmp_epi64_mask(a,b,_MM_CMPINT_GT); }
  __forceinline vboold8 le(const vllong8& a, const vllong8& b) { return _mm512_cmp_epi64_mask(a,b,_MM_CMPINT_LE); }

  /* masked comparisons: only lanes enabled in 'mask' can be set in the result */
  __forceinline vboold8 eq(const vboold8 mask, const vllong8& a, const vllong8& b) { return _mm512_mask_cmp_epi64_mask(mask,a,b,_MM_CMPINT_EQ); }
  __forceinline vboold8 ne(const vboold8 mask, const vllong8& a, const vllong8& b) { return _mm512_mask_cmp_epi64_mask(mask,a,b,_MM_CMPINT_NE); }
  __forceinline vboold8 lt(const vboold8 mask, const vllong8& a, const vllong8& b) { return _mm512_mask_cmp_epi64_mask(mask,a,b,_MM_CMPINT_LT); }
  __forceinline vboold8 ge(const vboold8 mask, const vllong8& a, const vllong8& b) { return _mm512_mask_cmp_epi64_mask(mask,a,b,_MM_CMPINT_GE); }
  __forceinline vboold8 gt(const vboold8 mask, const vllong8& a, const vllong8& b) { return _mm512_mask_cmp_epi64_mask(mask,a,b,_MM_CMPINT_GT); }
  __forceinline vboold8 le(const vboold8 mask, const vllong8& a, const vllong8& b) { return _mm512_mask_cmp_epi64_mask(mask,a,b,_MM_CMPINT_LE); }

  /* per-lane blend: lanes where m is set come from t, the rest from f
     (t|t is merged into f under the mask) */
  __forceinline const vllong8 select( const vboold8& m, const vllong8& t, const vllong8& f ) {
    return _mm512_mask_or_epi64(f,m,t,t);
  }

  __forceinline void xchg(const vboold8& m, vllong8& a, vllong8& b) {
    const vllong8 c = a; a = select(m,b,a); b = select(m,c,b);
  }

  __forceinline vboold8 test(const vboold8& m, const vllong8& a, const vllong8& b) {
    return _mm512_mask_test_epi64_mask(m,a,b);
  }

  __forceinline vboold8 test(const vllong8& a, const vllong8& b) {
    return _mm512_test_epi64_mask(a,b);
  }
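
  // Sketch of select/xchg/test (illustration only):
  //
  //   vllong8 r = select(m, t, f);   // r[k] = m[k] ? t[k] : f[k]
  //   xchg(m, a, b);                 // swaps a[k] and b[k] where m[k] is set
  //   vboold8 nz = test(a, b);       // nz[k] = ((a[k] & b[k]) != 0)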

  ////////////////////////////////////////////////////////////////////////////////
  /// Movement/Shifting/Shuffling Functions
  ////////////////////////////////////////////////////////////////////////////////

  /* shuffle of the four 64-bit elements within each 256-bit half (VPERMPD;
     note _mm512_permute_pd would only shuffle within 128-bit pairs and could
     not realize the patterns the reductions below rely on) */
  template<size_t i>
  __forceinline const vllong8 shuffle( const vllong8& a ) {
    return _mm512_castpd_si512(_mm512_permutex_pd(_mm512_castsi512_pd(a), _MM_SHUFFLE(i, i, i, i)));
  }

  template<int A, int B, int C, int D>
  __forceinline vllong8 shuffle(const vllong8& v) {
    return _mm512_castpd_si512(_mm512_permutex_pd(_mm512_castsi512_pd(v), _MM_SHUFFLE(D,C,B,A)));
  }

  /* shuffle of the two 4-element (256-bit) halves; the 2-bit 128-bit-lane
     selectors of _mm512_shuffle_f64x2 are derived from the half indices */
  template<int i>
  __forceinline vllong8 shuffle4(const vllong8& x) {
    return _mm512_castpd_si512(_mm512_shuffle_f64x2(_mm512_castsi512_pd(x), _mm512_castsi512_pd(x), _MM_SHUFFLE(2*i+1, 2*i, 2*i+1, 2*i)));
  }

  template<int A, int B>
  __forceinline vllong8 shuffle4(const vllong8& x) {
    return _mm512_castpd_si512(_mm512_shuffle_f64x2(_mm512_castsi512_pd(x), _mm512_castsi512_pd(x), _MM_SHUFFLE(2*B+1, 2*B, 2*A+1, 2*A)));
  }

  template<int i>
  __forceinline vllong8 align_shift_right(const vllong8& a, const vllong8& b) {
    return _mm512_alignr_epi64(a,b,i);
  }

  __forceinline long long toScalar(const vllong8& a) {
    return _mm_cvtsi128_si64(_mm512_castsi512_si128(a));
  }
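
  // Sketch of the shuffle granularity (illustration only): shuffle<> permutes
  // the four 64-bit elements within each 256-bit half, shuffle4<> permutes
  // the two 4-element halves themselves, e.g.
  //
  //   shuffle<1,0,3,2>(x);  // x1,x0,x3,x2, x5,x4,x7,x6  (swap within pairs)
  //   shuffle4<1,0>(x);     // x4..x7, x0..x3            (swap the halves)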

  ////////////////////////////////////////////////////////////////////////////////
  /// Reductions
  ////////////////////////////////////////////////////////////////////////////////

  __forceinline long long reduce_add(const vllong8& a) { return _mm512_reduce_add_epi64(a); }
  __forceinline long long reduce_min(const vllong8& a) { return _mm512_reduce_min_epi64(a); }
  __forceinline long long reduce_max(const vllong8& a) { return _mm512_reduce_max_epi64(a); }
  __forceinline long long reduce_and(const vllong8& a) { return _mm512_reduce_and_epi64(a); }
  __forceinline long long reduce_or (const vllong8& a) { return _mm512_reduce_or_epi64(a); }

  __forceinline vllong8 vreduce_add2(vllong8 x) {                      return x + shuffle<1,0,3,2>(x); }
  __forceinline vllong8 vreduce_add4(vllong8 x) { x = vreduce_add2(x); return x + shuffle<2,3,0,1>(x); }
  __forceinline vllong8 vreduce_add (vllong8 x) { x = vreduce_add4(x); return x + shuffle4<1,0>(x); }

  __forceinline vllong8 vreduce_min2(vllong8 x) {                      return min(x, shuffle<1,0,3,2>(x)); }
  __forceinline vllong8 vreduce_min4(vllong8 x) { x = vreduce_min2(x); return min(x, shuffle<2,3,0,1>(x)); }
  __forceinline vllong8 vreduce_min (vllong8 x) { x = vreduce_min4(x); return min(x, shuffle4<1,0>(x)); }

  __forceinline vllong8 vreduce_max2(vllong8 x) {                      return max(x, shuffle<1,0,3,2>(x)); }
  __forceinline vllong8 vreduce_max4(vllong8 x) { x = vreduce_max2(x); return max(x, shuffle<2,3,0,1>(x)); }
  __forceinline vllong8 vreduce_max (vllong8 x) { x = vreduce_max4(x); return max(x, shuffle4<1,0>(x)); }

  __forceinline vllong8 vreduce_and2(vllong8 x) {                      return x & shuffle<1,0,3,2>(x); }
  __forceinline vllong8 vreduce_and4(vllong8 x) { x = vreduce_and2(x); return x & shuffle<2,3,0,1>(x); }
  __forceinline vllong8 vreduce_and (vllong8 x) { x = vreduce_and4(x); return x & shuffle4<1,0>(x); }

  __forceinline vllong8 vreduce_or2 (vllong8 x) {                      return x | shuffle<1,0,3,2>(x); }
  __forceinline vllong8 vreduce_or4 (vllong8 x) { x = vreduce_or2(x);  return x | shuffle<2,3,0,1>(x); }
  __forceinline vllong8 vreduce_or  (vllong8 x) { x = vreduce_or4(x);  return x | shuffle4<1,0>(x); }
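
  // Sketch of the two reduction flavors (illustration only): reduce_* returns
  // a scalar, while vreduce_* keeps the result broadcast across all lanes via
  // log2(8) = 3 shuffle/combine steps (pairs, then quads, then halves):
  //
  //   long long s = reduce_add(x);             // s == x0 + x1 + ... + x7
  //   long long t = toScalar(vreduce_add(x));  // same total, via the vector form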

  ////////////////////////////////////////////////////////////////////////////////
  /// Permutation Operations
  ////////////////////////////////////////////////////////////////////////////////

  __forceinline vllong8 permute(const vllong8& v, const vllong8& index) {
    return _mm512_permutexvar_epi64(index,v);
  }

  __forceinline vllong8 reverse(const vllong8& a) {
    return permute(a,vllong8(reverse_step));
  }
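
  // Sketch of the gather-style indexing (illustration only): permute(v,index)
  // yields r[k] = v[index[k] & 7], so reverse() indexes with 7,6,...,0:
  //
  //   vllong8 r = reverse(vllong8(step));   // 7,6,5,4,3,2,1,0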

  ////////////////////////////////////////////////////////////////////////////////
  /// Output Operators
  ////////////////////////////////////////////////////////////////////////////////

  __forceinline std::ostream& operator<<(std::ostream& cout, const vllong8& v)
  {
    cout << "<" << v[0];
    for (size_t i=1; i<8; i++) cout << ", " << v[i];
    cout << ">";
    return cout;
  }
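
  // Printing a vector with the operator above (illustration only):
  //
  //   std::cout << vllong8(step) << std::endl;  // prints <0, 1, 2, 3, 4, 5, 6, 7>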
}