intrinsics.h
// ======================================================================== //
// Copyright 2009-2017 Intel Corporation                                    //
//                                                                          //
// Licensed under the Apache License, Version 2.0 (the "License");          //
// you may not use this file except in compliance with the License.         //
// You may obtain a copy of the License at                                  //
//                                                                          //
//     http://www.apache.org/licenses/LICENSE-2.0                           //
//                                                                          //
// Unless required by applicable law or agreed to in writing, software      //
// distributed under the License is distributed on an "AS IS" BASIS,        //
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. //
// See the License for the specific language governing permissions and      //
// limitations under the License.                                           //
// ======================================================================== //

#pragma once

#include "platform.h"

#if defined(__WIN32__)
#include <intrin.h>
#endif

#include <immintrin.h>

#if defined(__BMI__) && defined(__GNUC__)
  #if !defined(_tzcnt_u32)
    #define _tzcnt_u32 __tzcnt_u32
  #endif
  #if !defined(_tzcnt_u64)
    #define _tzcnt_u64 __tzcnt_u64
  #endif
#endif

#if defined(__LZCNT__)
  #if !defined(_lzcnt_u32)
    #define _lzcnt_u32 __lzcnt32
  #endif
  #if !defined(_lzcnt_u64)
    #define _lzcnt_u64 __lzcnt64
  #endif
#endif

#if defined(__WIN32__)
# define NOMINMAX
# include <windows.h>
#endif

/* normally defined in pmmintrin.h, but we always need this */
#if !defined(_MM_SET_DENORMALS_ZERO_MODE)
#define _MM_DENORMALS_ZERO_ON   (0x0040)
#define _MM_DENORMALS_ZERO_OFF  (0x0000)
#define _MM_DENORMALS_ZERO_MASK (0x0040)
#define _MM_SET_DENORMALS_ZERO_MODE(x) (_mm_setcsr((_mm_getcsr() & ~_MM_DENORMALS_ZERO_MASK) | (x)))
#endif
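
/* Illustrative usage (not part of the original header): the denormals-are-zero
   mode is typically enabled per thread together with flush-to-zero, e.g.

     _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON);
     _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON);

   Both macros modify the MXCSR control register and therefore only affect the
   calling thread. */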
namespace embree
{
  ////////////////////////////////////////////////////////////////////////////////
  /// Windows Platform
  ////////////////////////////////////////////////////////////////////////////////

#if defined(__WIN32__)

  __forceinline size_t read_tsc()
  {
    LARGE_INTEGER li;
    QueryPerformanceCounter(&li);
    return (size_t)li.QuadPart;
  }

  __forceinline int __bsf(int v) {
#if defined(__AVX2__)
    return _tzcnt_u32(v);
#else
    unsigned long r = 0; _BitScanForward(&r,v); return r;
#endif
  }

  __forceinline unsigned __bsf(unsigned v) {
#if defined(__AVX2__)
    return _tzcnt_u32(v);
#else
    unsigned long r = 0; _BitScanForward(&r,v); return r;
#endif
  }

#if defined(__X86_64__)
  __forceinline size_t __bsf(size_t v) {
#if defined(__AVX2__)
    return _tzcnt_u64(v);
#else
    unsigned long r = 0; _BitScanForward64(&r,v); return r;
#endif
  }
#endif

  __forceinline int __bscf(int& v)
  {
    int i = __bsf(v);
    v &= v-1;
    return i;
  }

  __forceinline unsigned __bscf(unsigned& v)
  {
    unsigned i = __bsf(v);
    v &= v-1;
    return i;
  }

#if defined(__X86_64__)
  __forceinline size_t __bscf(size_t& v)
  {
    size_t i = __bsf(v);
    v &= v-1;
    return i;
  }
#endif
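
  /* Illustrative usage (not part of the original header): __bscf ("bit scan
     forward and clear") returns the index of the lowest set bit and clears it
     in place, which makes it convenient for iterating over the set bits of a
     mask:

       for (unsigned mask = m; mask != 0; ) {
         const unsigned i = __bscf(mask); // index of the next set bit
         visit(i);                        // 'visit' is a hypothetical callback
       }
  */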
  __forceinline int __bsr(int v) {
#if defined(__AVX2__)
    return 31 - _lzcnt_u32(v);
#else
    unsigned long r = 0; _BitScanReverse(&r,v); return r;
#endif
  }

  __forceinline unsigned __bsr(unsigned v) {
#if defined(__AVX2__)
    return 31 - _lzcnt_u32(v);
#else
    unsigned long r = 0; _BitScanReverse(&r,v); return r;
#endif
  }

#if defined(__X86_64__)
  __forceinline size_t __bsr(size_t v) {
#if defined(__AVX2__)
    return 63 - _lzcnt_u64(v);
#else
    unsigned long r = 0; _BitScanReverse64(&r,v); return r;
#endif
  }
#endif

  __forceinline int lzcnt(const int x)
  {
#if defined(__AVX2__)
    return _lzcnt_u32(x);
#else
    if (unlikely(x == 0)) return 32;
    return 31 - __bsr(x);
#endif
  }
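
  /* Illustrative note (not part of the original header): for x > 0, __bsr(x)
     is the index of the highest set bit, i.e. floor(log2(x)), and lzcnt(x)
     counts the leading zero bits of a 32-bit value; e.g. __bsr(256) == 8 and
     lzcnt(256) == 23. Only lzcnt defines the x == 0 case (returning 32). */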
  __forceinline int __btc(int v, int i) {
    long r = v; _bittestandcomplement(&r,i); return r;
  }

  __forceinline int __bts(int v, int i) {
    long r = v; _bittestandset(&r,i); return r;
  }

  __forceinline int __btr(int v, int i) {
    long r = v; _bittestandreset(&r,i); return r;
  }

#if defined(__X86_64__)
  __forceinline size_t __btc(size_t v, size_t i) {
    size_t r = v; _bittestandcomplement64((__int64*)&r,i); return r;
  }

  __forceinline size_t __bts(size_t v, size_t i) {
    __int64 r = v; _bittestandset64(&r,i); return r;
  }

  __forceinline size_t __btr(size_t v, size_t i) {
    __int64 r = v; _bittestandreset64(&r,i); return r;
  }
#endif

  __forceinline int32_t atomic_cmpxchg(volatile int32_t* p, const int32_t c, const int32_t v) {
    return _InterlockedCompareExchange((volatile long*)p,v,c);
  }

  ////////////////////////////////////////////////////////////////////////////////
  /// Unix Platform
  ////////////////////////////////////////////////////////////////////////////////

#else

#if defined(__i386__) && defined(__PIC__)

  __forceinline void __cpuid(int out[4], int op)
  {
    asm volatile ("xchg{l}\t{%%}ebx, %1\n\t"
                  "cpuid\n\t"
                  "xchg{l}\t{%%}ebx, %1\n\t"
                  : "=a"(out[0]), "=r"(out[1]), "=c"(out[2]), "=d"(out[3])
                  : "0"(op));
  }

  __forceinline void __cpuid_count(int out[4], int op1, int op2)
  {
    asm volatile ("xchg{l}\t{%%}ebx, %1\n\t"
                  "cpuid\n\t"
                  "xchg{l}\t{%%}ebx, %1\n\t"
                  : "=a" (out[0]), "=r" (out[1]), "=c" (out[2]), "=d" (out[3])
                  : "0" (op1), "2" (op2));
  }

#else

  __forceinline void __cpuid(int out[4], int op) {
    asm volatile ("cpuid" : "=a"(out[0]), "=b"(out[1]), "=c"(out[2]), "=d"(out[3]) : "a"(op));
  }

  __forceinline void __cpuid_count(int out[4], int op1, int op2) {
    asm volatile ("cpuid" : "=a"(out[0]), "=b"(out[1]), "=c"(out[2]), "=d"(out[3]) : "a"(op1), "c"(op2));
  }

#endif
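
  /* Illustrative usage (not part of the original header): out[] holds
     EAX/EBX/ECX/EDX in that order, so leaf 0 yields the highest supported
     standard leaf in out[0] and the CPU vendor string in EBX, EDX, ECX
     (memcpy from <cstring>):

       int regs[4]; __cpuid(regs, 0);
       char vendor[13];
       memcpy(vendor+0, &regs[1], 4); // EBX
       memcpy(vendor+4, &regs[3], 4); // EDX
       memcpy(vendor+8, &regs[2], 4); // ECX
       vendor[12] = '\0';             // e.g. "GenuineIntel"
  */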
  __forceinline uint64_t read_tsc() {
    uint32_t high,low;
    asm volatile ("rdtsc" : "=d"(high), "=a"(low));
    return (((uint64_t)high) << 32) + (uint64_t)low;
  }

  __forceinline int __bsf(int v) {
#if defined(__AVX2__)
    return _tzcnt_u32(v);
#else
    int r = 0; asm ("bsf %1,%0" : "=r"(r) : "r"(v)); return r;
#endif
  }

#if defined(__X86_64__)
  __forceinline unsigned __bsf(unsigned v)
  {
#if defined(__AVX2__)
    return _tzcnt_u32(v);
#else
    unsigned r = 0; asm ("bsf %1,%0" : "=r"(r) : "r"(v)); return r;
#endif
  }
#endif

  __forceinline size_t __bsf(size_t v) {
#if defined(__AVX2__)
#if defined(__X86_64__)
    return _tzcnt_u64(v);
#else
    return _tzcnt_u32(v);
#endif
#else
    size_t r = 0; asm ("bsf %1,%0" : "=r"(r) : "r"(v)); return r;
#endif
  }

  __forceinline int __bscf(int& v)
  {
    int i = __bsf(v);
    v &= v-1;
    return i;
  }

#if defined(__X86_64__)
  __forceinline unsigned int __bscf(unsigned int& v)
  {
    unsigned int i = __bsf(v);
    v &= v-1;
    return i;
  }
#endif

  __forceinline size_t __bscf(size_t& v)
  {
    size_t i = __bsf(v);
    v &= v-1;
    return i;
  }
  __forceinline int __bsr(int v) {
#if defined(__AVX2__)
    return 31 - _lzcnt_u32(v);
#else
    int r = 0; asm ("bsr %1,%0" : "=r"(r) : "r"(v)); return r;
#endif
  }

#if defined(__X86_64__)
  __forceinline unsigned __bsr(unsigned v) {
#if defined(__AVX2__)
    return 31 - _lzcnt_u32(v);
#else
    unsigned r = 0; asm ("bsr %1,%0" : "=r"(r) : "r"(v)); return r;
#endif
  }
#endif

  __forceinline size_t __bsr(size_t v) {
#if defined(__AVX2__)
#if defined(__X86_64__)
    return 63 - _lzcnt_u64(v);
#else
    return 31 - _lzcnt_u32(v);
#endif
#else
    size_t r = 0; asm ("bsr %1,%0" : "=r"(r) : "r"(v)); return r;
#endif
  }

  __forceinline int lzcnt(const int x)
  {
#if defined(__AVX2__)
    return _lzcnt_u32(x);
#else
    if (unlikely(x == 0)) return 32;
    return 31 - __bsr(x);
#endif
  }

  __forceinline size_t __blsr(size_t v) {
#if defined(__AVX2__)
#if defined(__INTEL_COMPILER)
    return _blsr_u64(v);
#else
#if defined(__X86_64__)
    return __blsr_u64(v);
#else
    return __blsr_u32(v);
#endif
#endif
#else
    return v & (v-1);
#endif
  }

  __forceinline int __btc(int v, int i) {
    int r = 0; asm ("btc %1,%0" : "=r"(r) : "r"(i), "0"(v) : "flags"); return r;
  }

  __forceinline int __bts(int v, int i) {
    int r = 0; asm ("bts %1,%0" : "=r"(r) : "r"(i), "0"(v) : "flags"); return r;
  }

  __forceinline int __btr(int v, int i) {
    int r = 0; asm ("btr %1,%0" : "=r"(r) : "r"(i), "0"(v) : "flags"); return r;
  }

  __forceinline size_t __btc(size_t v, size_t i) {
    size_t r = 0; asm ("btc %1,%0" : "=r"(r) : "r"(i), "0"(v) : "flags"); return r;
  }

  __forceinline size_t __bts(size_t v, size_t i) {
    size_t r = 0; asm ("bts %1,%0" : "=r"(r) : "r"(i), "0"(v) : "flags"); return r;
  }

  __forceinline size_t __btr(size_t v, size_t i) {
    size_t r = 0; asm ("btr %1,%0" : "=r"(r) : "r"(i), "0"(v) : "flags"); return r;
  }

  __forceinline int32_t atomic_cmpxchg( int32_t volatile* value, int32_t comparand, const int32_t input ) {
    return __sync_val_compare_and_swap(value, comparand, input);
  }
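
  /* Illustrative usage (not part of the original header): atomic_cmpxchg
     returns the previous value of *value on both platforms, so a typical
     compare-and-swap retry loop looks like this ('counter' is a hypothetical
     shared volatile int32_t):

       int32_t old, desired;
       do {
         old = counter;
         desired = old + 1;
       } while (atomic_cmpxchg(&counter, old, desired) != old);
  */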
#endif

  ////////////////////////////////////////////////////////////////////////////////
  /// All Platforms
  ////////////////////////////////////////////////////////////////////////////////

#if defined(__SSE4_2__)
  __forceinline int __popcnt(int in) {
    return _mm_popcnt_u32(in);
  }

  __forceinline unsigned __popcnt(unsigned in) {
    return _mm_popcnt_u32(in);
  }

#if defined(__X86_64__)
  __forceinline size_t __popcnt(size_t in) {
    return _mm_popcnt_u64(in);
  }
#endif
#endif

  __forceinline uint64_t rdtsc()
  {
    int dummy[4];
    __cpuid(dummy,0);
    uint64_t clock = read_tsc();
    __cpuid(dummy,0);
    return clock;
  }
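
  /* Note (not part of the original header): the surrounding __cpuid calls act
     as serializing instructions, preventing the CPU from reordering work
     across the timestamp read and making rdtsc() more suitable for timing
     short code sections. */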
  __forceinline void __pause_cpu (const size_t N = 8)
  {
    for (size_t i=0; i<N; i++)
      _mm_pause();
  }

  /* prefetches */
  __forceinline void prefetchL1 (const void* ptr) { _mm_prefetch((const char*)ptr,_MM_HINT_T0); }
  __forceinline void prefetchL2 (const void* ptr) { _mm_prefetch((const char*)ptr,_MM_HINT_T1); }
  __forceinline void prefetchL3 (const void* ptr) { _mm_prefetch((const char*)ptr,_MM_HINT_T2); }
  __forceinline void prefetchNTA(const void* ptr) { _mm_prefetch((const char*)ptr,_MM_HINT_NTA); }

  __forceinline void prefetchEX (const void* ptr) {
#if defined(__INTEL_COMPILER)
    _mm_prefetch((const char*)ptr,_MM_HINT_ET0);
#else
    _mm_prefetch((const char*)ptr,_MM_HINT_T0);
#endif
  }

  __forceinline void prefetchL1EX(const void* ptr) {
    prefetchEX(ptr);
  }

  __forceinline void prefetchL2EX(const void* ptr) {
    prefetchEX(ptr);
  }

#if defined(__AVX2__)
  __forceinline unsigned int pext(const unsigned int a, const unsigned int b) { return _pext_u32(a,b); }
  __forceinline unsigned int pdep(const unsigned int a, const unsigned int b) { return _pdep_u32(a,b); }
#if defined(__X86_64__)
  __forceinline size_t pext(const size_t a, const size_t b) { return _pext_u64(a,b); }
  __forceinline size_t pdep(const size_t a, const size_t b) { return _pdep_u64(a,b); }
#endif
#endif
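
  /* Illustrative example (not part of the original header): pext(a,b) gathers
     the bits of 'a' selected by mask 'b' into the low bits of the result, and
     pdep is its inverse, scattering the low bits of 'a' into the positions set
     in 'b'; e.g. pext(0b101100, 0b111000) == 0b101 and
     pdep(0b101, 0b111000) == 0b101000. */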
#if defined (__AVX512F__)
  __forceinline float mm512_cvtss_f32 (__m512 v) { // FIXME: _mm512_cvtss_f32 not yet supported by clang v4.0.0
    return _mm256_cvtss_f32(_mm512_castps512_ps256(v));
  }
#endif
}