intrinsics.h

// Copyright 2009-2021 Intel Corporation
// SPDX-License-Identifier: Apache-2.0

#pragma once

#include "platform.h"

#if defined(__WIN32__)
#include <intrin.h>
#endif

#if defined(__ARM_NEON)
#include "../simd/arm/emulation.h"
#else
#include <immintrin.h>
#if defined(__EMSCRIPTEN__)
#include "../simd/wasm/emulation.h"
#endif
#endif

#if defined(__BMI__) && defined(__GNUC__) && !defined(__INTEL_COMPILER)
  #if !defined(_tzcnt_u32)
    #define _tzcnt_u32 __tzcnt_u32
  #endif
  #if !defined(_tzcnt_u64)
    #define _tzcnt_u64 __tzcnt_u64
  #endif
#endif

#if defined(__aarch64__)
  #if !defined(_lzcnt_u32)
    #define _lzcnt_u32 __builtin_clz
  #endif
#else
  #if defined(__LZCNT__)
    #if !defined(_lzcnt_u32)
      #define _lzcnt_u32 __lzcnt32
    #endif
    #if !defined(_lzcnt_u64)
      #define _lzcnt_u64 __lzcnt64
    #endif
  #endif
#endif

#if defined(__WIN32__)
#  if !defined(NOMINMAX)
#    define NOMINMAX
#  endif
#  include <windows.h>
#endif

/* normally defined in pmmintrin.h, but we always need this */
#if !defined(_MM_SET_DENORMALS_ZERO_MODE)
  #define _MM_DENORMALS_ZERO_ON   (0x0040)
  #define _MM_DENORMALS_ZERO_OFF  (0x0000)
  #define _MM_DENORMALS_ZERO_MASK (0x0040)
  #define _MM_SET_DENORMALS_ZERO_MODE(x) (_mm_setcsr((_mm_getcsr() & ~_MM_DENORMALS_ZERO_MASK) | (x)))
#endif
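
/* Note: 0x0040 is the DAZ (denormals-are-zero) bit of the x86 MXCSR register;
   enabling this mode treats denormal inputs as zero in SSE/AVX arithmetic. */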
namespace embree
{
  ////////////////////////////////////////////////////////////////////////////////
  /// Windows Platform
  ////////////////////////////////////////////////////////////////////////////////

#if defined(__WIN32__) && !defined(__INTEL_LLVM_COMPILER)

  __forceinline size_t read_tsc()
  {
    LARGE_INTEGER li;
    QueryPerformanceCounter(&li);
    return (size_t)li.QuadPart;
  }
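
  /* bsf: index of the lowest set bit of v, e.g. bsf(0b01100) == 2;
     only meaningful for v != 0 (the TZCNT path returns 32 for zero input) */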
  __forceinline int bsf(int v) {
#if defined(__AVX2__) && !defined(__aarch64__)
    return _tzcnt_u32(v);
#else
    unsigned long r = 0; _BitScanForward(&r,v); return r;
#endif
  }

  __forceinline unsigned bsf(unsigned v) {
#if defined(__AVX2__) && !defined(__aarch64__)
    return _tzcnt_u32(v);
#else
    unsigned long r = 0; _BitScanForward(&r,v); return r;
#endif
  }

#if defined(__X86_64__) || defined(__aarch64__)
  __forceinline size_t bsf(size_t v) {
#if defined(__AVX2__)
    return _tzcnt_u64(v);
#else
    unsigned long r = 0; _BitScanForward64(&r,v); return r;
#endif
  }
#endif
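
  /* bscf: returns the index of the lowest set bit of v and clears that bit in v */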
  __forceinline int bscf(int& v)
  {
    int i = bsf(v);
    v &= v-1;
    return i;
  }

  __forceinline unsigned bscf(unsigned& v)
  {
    unsigned i = bsf(v);
    v &= v-1;
    return i;
  }

#if defined(__X86_64__) || defined(__aarch64__)
  __forceinline size_t bscf(size_t& v)
  {
    size_t i = bsf(v);
    v &= v-1;
    return i;
  }
#endif
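
  /* bsr: index of the highest set bit of v, e.g. bsr(0b01100) == 3; only meaningful for v != 0 */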
  __forceinline int bsr(int v) {
#if defined(__AVX2__) && !defined(__aarch64__)
    return 31 - _lzcnt_u32(v);
#else
    unsigned long r = 0; _BitScanReverse(&r,v); return r;
#endif
  }

  __forceinline unsigned bsr(unsigned v) {
#if defined(__AVX2__) && !defined(__aarch64__)
    return 31 - _lzcnt_u32(v);
#else
    unsigned long r = 0; _BitScanReverse(&r,v); return r;
#endif
  }

#if defined(__X86_64__) || defined(__aarch64__)
  __forceinline size_t bsr(size_t v) {
#if defined(__AVX2__)
    return 63 - _lzcnt_u64(v);
#else
    unsigned long r = 0; _BitScanReverse64(&r,v); return r;
#endif
  }
#endif
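
  /* lzcnt: number of leading zero bits of x; returns 32 for x == 0 */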
  __forceinline int lzcnt(const int x)
  {
#if defined(__AVX2__) && !defined(__aarch64__)
    return _lzcnt_u32(x);
#else
    if (unlikely(x == 0)) return 32;
    return 31 - bsr(x);
#endif
  }
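
  /* btc/bts/btr: return a copy of v with bit i toggled, set, or cleared,
     e.g. btc(0b0101,1) == 0b0111, bts(0b0100,0) == 0b0101, btr(0b0101,0) == 0b0100 */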
  __forceinline int btc(int v, int i) {
    long r = v; _bittestandcomplement(&r,i); return r;
  }

  __forceinline int bts(int v, int i) {
    long r = v; _bittestandset(&r,i); return r;
  }

  __forceinline int btr(int v, int i) {
    long r = v; _bittestandreset(&r,i); return r;
  }

#if defined(__X86_64__)
  __forceinline size_t btc(size_t v, size_t i) {
    size_t r = v; _bittestandcomplement64((__int64*)&r,i); return r;
  }

  __forceinline size_t bts(size_t v, size_t i) {
    __int64 r = v; _bittestandset64(&r,i); return r;
  }

  __forceinline size_t btr(size_t v, size_t i) {
    __int64 r = v; _bittestandreset64(&r,i); return r;
  }
#endif
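
  /* atomic_cmpxchg: atomically replaces *p with v if *p equals c; returns the previous value of *p */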
  __forceinline int32_t atomic_cmpxchg(volatile int32_t* p, const int32_t c, const int32_t v) {
    return _InterlockedCompareExchange((volatile long*)p,v,c);
  }

  ////////////////////////////////////////////////////////////////////////////////
  /// Unix Platform
  ////////////////////////////////////////////////////////////////////////////////

#else

  __forceinline uint64_t read_tsc() {
#if defined(__X86_ASM__)
    uint32_t high,low;
    asm volatile ("rdtsc" : "=d"(high), "=a"(low));
    return (((uint64_t)high) << 32) + (uint64_t)low;
#else
    /* Not supported yet, meaning measuring traversal cost per pixel does not work. */
    return 0;
#endif
  }
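
  /* bsf: index of the lowest set bit of v; only meaningful for v != 0
     (the TZCNT path returns 32, the BSF and __builtin_ctz paths are undefined for zero input) */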
  __forceinline int bsf(int v) {
#if defined(__ARM_NEON)
    return __builtin_ctz(v);
#else
#if defined(__AVX2__)
    return _tzcnt_u32(v);
#elif defined(__X86_ASM__)
    int r = 0; asm ("bsf %1,%0" : "=r"(r) : "r"(v)); return r;
#else
    return __builtin_ctz(v);
#endif
#endif
  }

#if defined(EMBREE_SYCL_SUPPORT) && defined(__SYCL_DEVICE_ONLY__)
  __forceinline unsigned int bsf(unsigned v) {
    return sycl::ctz(v);
  }
#else
#if defined(__64BIT__)
  __forceinline unsigned bsf(unsigned v)
  {
#if defined(__ARM_NEON)
    return __builtin_ctz(v);
#else
#if defined(__AVX2__)
    return _tzcnt_u32(v);
#elif defined(__X86_ASM__)
    unsigned r = 0; asm ("bsf %1,%0" : "=r"(r) : "r"(v)); return r;
#else
    return __builtin_ctz(v);
#endif
#endif
  }
#endif
#endif

#if defined(EMBREE_SYCL_SUPPORT) && defined(__SYCL_DEVICE_ONLY__)
  __forceinline size_t bsf(size_t v) {
    return sycl::ctz(v);
  }
#else
  __forceinline size_t bsf(size_t v) {
#if defined(__AVX2__) && !defined(__aarch64__)
#if defined(__X86_64__)
    return _tzcnt_u64(v);
#else
    return _tzcnt_u32(v);
#endif
#elif defined(__X86_ASM__)
    size_t r = 0; asm ("bsf %1,%0" : "=r"(r) : "r"(v)); return r;
#else
    return __builtin_ctzl(v);
#endif
  }
#endif
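
  /* bscf: returns the index of the lowest set bit of v and clears that bit in v */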
  __forceinline int bscf(int& v)
  {
    int i = bsf(v);
    v &= v-1;
    return i;
  }

#if defined(__64BIT__)
  __forceinline unsigned int bscf(unsigned int& v)
  {
    unsigned int i = bsf(v);
    v &= v-1;
    return i;
  }
#endif

  __forceinline size_t bscf(size_t& v)
  {
    size_t i = bsf(v);
    v &= v-1;
    return i;
  }
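
  /* bsr: index of the highest set bit of v; only meaningful for v != 0 */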
  __forceinline int bsr(int v) {
#if defined(__AVX2__) && !defined(__aarch64__)
    return 31 - _lzcnt_u32(v);
#elif defined(__X86_ASM__)
    int r = 0; asm ("bsr %1,%0" : "=r"(r) : "r"(v)); return r;
#else
    return __builtin_clz(v) ^ 31;
#endif
  }

#if defined(__64BIT__) || defined(__EMSCRIPTEN__)
  __forceinline unsigned bsr(unsigned v) {
#if defined(__AVX2__)
    return 31 - _lzcnt_u32(v);
#elif defined(__X86_ASM__)
    unsigned r = 0; asm ("bsr %1,%0" : "=r"(r) : "r"(v)); return r;
#else
    return __builtin_clz(v) ^ 31;
#endif
  }
#endif

  __forceinline size_t bsr(size_t v) {
#if defined(__AVX2__) && !defined(__aarch64__)
#if defined(__X86_64__)
    return 63 - _lzcnt_u64(v);
#else
    return 31 - _lzcnt_u32(v);
#endif
#elif defined(__X86_ASM__)
    size_t r = 0; asm ("bsr %1,%0" : "=r"(r) : "r"(v)); return r;
#else
    return (sizeof(v) * 8 - 1) - __builtin_clzl(v);
#endif
  }

  __forceinline int lzcnt(const int x)
  {
#if defined(__AVX2__) && !defined(__aarch64__)
    return _lzcnt_u32(x);
#else
    if (unlikely(x == 0)) return 32;
    return 31 - bsr(x);
#endif
  }
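
  /* blsr: clears the lowest set bit of v, e.g. blsr(0b01100) == 0b01000 */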
  __forceinline size_t blsr(size_t v) {
#if defined(__AVX2__) && !defined(__aarch64__)
#if defined(__INTEL_COMPILER)
    return _blsr_u64(v);
#else
#if defined(__X86_64__)
    return __blsr_u64(v);
#else
    return __blsr_u32(v);
#endif
#endif
#else
    return v & (v-1);
#endif
  }
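
  /* btc/bts/btr: return a copy of v with bit i toggled, set, or cleared */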
  __forceinline int btc(int v, int i) {
#if defined(__X86_ASM__)
    int r = 0; asm ("btc %1,%0" : "=r"(r) : "r"(i), "0"(v) : "flags"); return r;
#else
    return (v ^ (1 << i));
#endif
  }

  __forceinline int bts(int v, int i) {
#if defined(__X86_ASM__)
    int r = 0; asm ("bts %1,%0" : "=r"(r) : "r"(i), "0"(v) : "flags"); return r;
#else
    return (v | (1 << i));
#endif
  }

  __forceinline int btr(int v, int i) {
#if defined(__X86_ASM__)
    int r = 0; asm ("btr %1,%0" : "=r"(r) : "r"(i), "0"(v) : "flags"); return r;
#else
    return (v & ~(1 << i));
#endif
  }
  __forceinline size_t btc(size_t v, size_t i) {
#if defined(__X86_ASM__)
    size_t r = 0; asm ("btc %1,%0" : "=r"(r) : "r"(i), "0"(v) : "flags"); return r;
#else
    return (v ^ ((size_t)1 << i));  /* shift a size_t so that bit indices >= 32 work correctly */
#endif
  }

  __forceinline size_t bts(size_t v, size_t i) {
#if defined(__X86_ASM__)
    size_t r = 0; asm ("bts %1,%0" : "=r"(r) : "r"(i), "0"(v) : "flags"); return r;
#else
    return (v | ((size_t)1 << i));
#endif
  }

  __forceinline size_t btr(size_t v, size_t i) {
#if defined(__X86_ASM__)
    size_t r = 0; asm ("btr %1,%0" : "=r"(r) : "r"(i), "0"(v) : "flags"); return r;
#else
    return (v & ~((size_t)1 << i));
#endif
  }
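
  /* atomic_cmpxchg: atomically replaces *value with input if *value equals comparand; returns the previous value */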
  __forceinline int32_t atomic_cmpxchg(int32_t volatile* value, int32_t comparand, const int32_t input) {
    return __sync_val_compare_and_swap(value, comparand, input);
  }

#endif

#if !defined(__WIN32__)
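
  /* __cpuid / __cpuid_count emulation for non-Windows compilers; the i386 PIC variant
     preserves ebx, which is reserved as the GOT pointer in position-independent code */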
#if defined(__i386__) && defined(__PIC__)

  __forceinline void __cpuid(int out[4], int op)
  {
    asm volatile ("xchg{l}\t{%%}ebx, %1\n\t"
                  "cpuid\n\t"
                  "xchg{l}\t{%%}ebx, %1\n\t"
                  : "=a"(out[0]), "=r"(out[1]), "=c"(out[2]), "=d"(out[3])
                  : "0"(op));
  }

  __forceinline void __cpuid_count(int out[4], int op1, int op2)
  {
    asm volatile ("xchg{l}\t{%%}ebx, %1\n\t"
                  "cpuid\n\t"
                  "xchg{l}\t{%%}ebx, %1\n\t"
                  : "=a"(out[0]), "=r"(out[1]), "=c"(out[2]), "=d"(out[3])
                  : "0"(op1), "2"(op2));
  }

#elif defined(__X86_ASM__)

  __forceinline void __cpuid(int out[4], int op) {
    asm volatile ("cpuid" : "=a"(out[0]), "=b"(out[1]), "=c"(out[2]), "=d"(out[3]) : "a"(op));
  }

  __forceinline void __cpuid_count(int out[4], int op1, int op2) {
    asm volatile ("cpuid" : "=a"(out[0]), "=b"(out[1]), "=c"(out[2]), "=d"(out[3]) : "a"(op1), "c"(op2));
  }

#endif
#endif

  ////////////////////////////////////////////////////////////////////////////////
  /// All Platforms
  ////////////////////////////////////////////////////////////////////////////////
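
  /* fallbacks for the _mm*_undefined_* intrinsics, which older GCC/clang versions do not provide;
     returning zeroed registers is a safe substitute for "undefined" contents */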
#if defined(__clang__) || defined(__GNUC__)
  #if !defined(_mm_undefined_ps)
  __forceinline __m128 _mm_undefined_ps() { return _mm_setzero_ps(); }
  #endif
  #if !defined(_mm_undefined_si128)
  __forceinline __m128i _mm_undefined_si128() { return _mm_setzero_si128(); }
  #endif
  #if !defined(_mm256_undefined_ps) && defined(__AVX__)
  __forceinline __m256 _mm256_undefined_ps() { return _mm256_setzero_ps(); }
  #endif
  #if !defined(_mm256_undefined_si256) && defined(__AVX__)
  __forceinline __m256i _mm256_undefined_si256() { return _mm256_setzero_si256(); }
  #endif
  #if !defined(_mm512_undefined_ps) && defined(__AVX512F__)
  __forceinline __m512 _mm512_undefined_ps() { return _mm512_setzero_ps(); }
  #endif
  #if !defined(_mm512_undefined_epi32) && defined(__AVX512F__)
  __forceinline __m512i _mm512_undefined_epi32() { return _mm512_setzero_si512(); }
  #endif
#endif
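
  /* popcnt: number of set bits, e.g. popcnt(0xF0u) == 4 */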
#if defined(EMBREE_SYCL_SUPPORT) && defined(__SYCL_DEVICE_ONLY__)

  __forceinline unsigned int popcnt(unsigned int in) {
    return sycl::popcount(in);
  }

#else

#if defined(__SSE4_2__) || defined(__ARM_NEON)

  __forceinline int popcnt(int in) {
    return _mm_popcnt_u32(in);
  }

  __forceinline unsigned popcnt(unsigned in) {
    return _mm_popcnt_u32(in);
  }

#if defined(__64BIT__)
  __forceinline size_t popcnt(size_t in) {
    return _mm_popcnt_u64(in);
  }
#endif

#endif
#endif
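
  /* rdtsc: reads the time stamp counter, with cpuid acting as a serializing barrier
     before and after so that out-of-order execution does not skew the measurement */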
#if defined(__X86_ASM__)
  __forceinline uint64_t rdtsc()
  {
    int dummy[4];
    __cpuid(dummy,0);
    uint64_t clock = read_tsc();
    __cpuid(dummy,0);
    return clock;
  }
#endif
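
  /* pause_cpu: issues N PAUSE instructions as a spin-wait hint to reduce power
     and contention while waiting on another thread */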
  __forceinline void pause_cpu(const size_t N = 8)
  {
    for (size_t i=0; i<N; i++)
      _mm_pause();
  }

  /* prefetches */
  __forceinline void prefetchL1 (const void* ptr) { _mm_prefetch((const char*)ptr,_MM_HINT_T0); }
  __forceinline void prefetchL2 (const void* ptr) { _mm_prefetch((const char*)ptr,_MM_HINT_T1); }
  __forceinline void prefetchL3 (const void* ptr) { _mm_prefetch((const char*)ptr,_MM_HINT_T2); }
  __forceinline void prefetchNTA(const void* ptr) { _mm_prefetch((const char*)ptr,_MM_HINT_NTA); }

  __forceinline void prefetchEX (const void* ptr) {
#if defined(__INTEL_COMPILER)
    _mm_prefetch((const char*)ptr,_MM_HINT_ET0);
#else
    _mm_prefetch((const char*)ptr,_MM_HINT_T0);
#endif
  }

  __forceinline void prefetchL1EX(const void* ptr) {
    prefetchEX(ptr);
  }

  __forceinline void prefetchL2EX(const void* ptr) {
    prefetchEX(ptr);
  }
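
  /* pext/pdep: BMI2 parallel bit extract/deposit under mask b,
     e.g. pext(0b101101u,0b001100u) == 0b11 and pdep(0b11u,0b001100u) == 0b001100 */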
#if defined(__AVX2__) && !defined(__aarch64__)
  __forceinline unsigned int pext(unsigned int a, unsigned int b) { return _pext_u32(a, b); }
  __forceinline unsigned int pdep(unsigned int a, unsigned int b) { return _pdep_u32(a, b); }
#if defined(__X86_64__)
  __forceinline size_t pext(size_t a, size_t b) { return _pext_u64(a, b); }
  __forceinline size_t pdep(size_t a, size_t b) { return _pdep_u64(a, b); }
#endif
#endif
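
  /* AVX-512 helpers: thin wrappers so callers can use these operations uniformly
     across compilers (see the FIXME notes below for the non-Intel fallbacks) */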
#if defined(__AVX512F__)
#if defined(__INTEL_COMPILER)
  __forceinline float mm512_cvtss_f32(__m512 v) {
    return _mm512_cvtss_f32(v);
  }
  __forceinline int mm512_mask2int(__mmask16 k1) {
    return _mm512_mask2int(k1);
  }
  __forceinline __mmask16 mm512_int2mask(int mask) {
    return _mm512_int2mask(mask);
  }
#else
  __forceinline float mm512_cvtss_f32(__m512 v) { // FIXME: _mm512_cvtss_f32 neither supported by clang v4.0.0 nor GCC 6.3
    return _mm_cvtss_f32(_mm512_castps512_ps128(v));
  }
  __forceinline int mm512_mask2int(__mmask16 k1) { // FIXME: _mm512_mask2int not yet supported by GCC 6.3
    return (int)k1;
  }
  __forceinline __mmask16 mm512_int2mask(int mask) { // FIXME: _mm512_int2mask not yet supported by GCC 6.3
    return (__mmask16)mask;
  }
#endif
#endif
}