intrinsics.h

// Copyright 2009-2021 Intel Corporation
// SPDX-License-Identifier: Apache-2.0

#pragma once

#include "platform.h"

#if defined(__WIN32__)
#include <intrin.h>
#endif

#if defined(__ARM_NEON)
#include "../simd/arm/emulation.h"
#else
#include <immintrin.h>
#if defined(__EMSCRIPTEN__)
#include "../simd/wasm/emulation.h"
#endif
#endif

#if defined(__BMI__) && defined(__GNUC__) && !defined(__INTEL_COMPILER)
  #if !defined(_tzcnt_u32)
    #define _tzcnt_u32 __tzcnt_u32
  #endif
  #if !defined(_tzcnt_u64)
    #define _tzcnt_u64 __tzcnt_u64
  #endif
#endif

#if defined(__aarch64__)
  #if !defined(_lzcnt_u32)
    #define _lzcnt_u32 __builtin_clz
  #endif
#else
  #if defined(__LZCNT__)
    #if !defined(_lzcnt_u32)
      #define _lzcnt_u32 __lzcnt32
    #endif
    #if !defined(_lzcnt_u64)
      #define _lzcnt_u64 __lzcnt64
    #endif
  #endif
#endif

#if defined(__WIN32__)
#  if !defined(NOMINMAX)
#    define NOMINMAX
#  endif
#  include <windows.h>
#endif

/* normally defined in pmmintrin.h, but we always need this */
#if !defined(_MM_SET_DENORMALS_ZERO_MODE)
  #define _MM_DENORMALS_ZERO_ON   (0x0040)
  #define _MM_DENORMALS_ZERO_OFF  (0x0000)
  #define _MM_DENORMALS_ZERO_MASK (0x0040)
  #define _MM_SET_DENORMALS_ZERO_MODE(x) (_mm_setcsr((_mm_getcsr() & ~_MM_DENORMALS_ZERO_MASK) | (x)))
#endif
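/* _MM_SET_DENORMALS_ZERO_MODE above toggles the DAZ bit (bit 6, mask 0x0040) of the
   MXCSR register, which makes SSE arithmetic treat denormal inputs as zero and avoids
   the performance penalty of handling them. */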
namespace embree
{

////////////////////////////////////////////////////////////////////////////////
/// Windows Platform
////////////////////////////////////////////////////////////////////////////////

#if defined(__WIN32__)

  __forceinline size_t read_tsc()
  {
    LARGE_INTEGER li;
    QueryPerformanceCounter(&li);
    return (size_t)li.QuadPart;
  }

  __forceinline int bsf(int v) {
#if defined(__AVX2__) && !defined(__aarch64__)
    return _tzcnt_u32(v);
#else
    unsigned long r = 0; _BitScanForward(&r,v); return r;
#endif
  }

  __forceinline unsigned bsf(unsigned v) {
#if defined(__AVX2__) && !defined(__aarch64__)
    return _tzcnt_u32(v);
#else
    unsigned long r = 0; _BitScanForward(&r,v); return r;
#endif
  }

#if defined(__X86_64__)
  __forceinline size_t bsf(size_t v) {
#if defined(__AVX2__)
    return _tzcnt_u64(v);
#else
    unsigned long r = 0; _BitScanForward64(&r,v); return r;
#endif
  }
#endif

  __forceinline int bscf(int& v)
  {
    int i = bsf(v);
    v &= v-1;
    return i;
  }

  __forceinline unsigned bscf(unsigned& v)
  {
    unsigned i = bsf(v);
    v &= v-1;
    return i;
  }

#if defined(__X86_64__)
  __forceinline size_t bscf(size_t& v)
  {
    size_t i = bsf(v);
    v &= v-1;
    return i;
  }
#endif
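  /* bscf ("bit scan and clear forward") returns the index of the lowest set bit and
     clears that bit in its argument. Illustrative use, iterating the set bits of a mask:

       for (unsigned mask = m; mask != 0; ) {
         const unsigned bit = bscf(mask); // index of lowest set bit, removed from mask
         // ... process bit ...
       }
  */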
  __forceinline int bsr(int v) {
#if defined(__AVX2__) && !defined(__aarch64__)
    return 31 - _lzcnt_u32(v);
#else
    unsigned long r = 0; _BitScanReverse(&r,v); return r;
#endif
  }

  __forceinline unsigned bsr(unsigned v) {
#if defined(__AVX2__) && !defined(__aarch64__)
    return 31 - _lzcnt_u32(v);
#else
    unsigned long r = 0; _BitScanReverse(&r,v); return r;
#endif
  }

#if defined(__X86_64__)
  __forceinline size_t bsr(size_t v) {
#if defined(__AVX2__)
    return 63 - _lzcnt_u64(v);
#else
    unsigned long r = 0; _BitScanReverse64(&r, v); return r;
#endif
  }
#endif

  __forceinline int lzcnt(const int x)
  {
#if defined(__AVX2__) && !defined(__aarch64__)
    return _lzcnt_u32(x);
#else
    if (unlikely(x == 0)) return 32;
    return 31 - bsr(x);
#endif
  }

  __forceinline int btc(int v, int i) {
    long r = v; _bittestandcomplement(&r,i); return r;
  }

  __forceinline int bts(int v, int i) {
    long r = v; _bittestandset(&r,i); return r;
  }

  __forceinline int btr(int v, int i) {
    long r = v; _bittestandreset(&r,i); return r;
  }

#if defined(__X86_64__)
  __forceinline size_t btc(size_t v, size_t i) {
    size_t r = v; _bittestandcomplement64((__int64*)&r,i); return r;
  }

  __forceinline size_t bts(size_t v, size_t i) {
    __int64 r = v; _bittestandset64(&r,i); return r;
  }

  __forceinline size_t btr(size_t v, size_t i) {
    __int64 r = v; _bittestandreset64(&r,i); return r;
  }
#endif

  __forceinline int32_t atomic_cmpxchg(volatile int32_t* p, const int32_t c, const int32_t v) {
    return _InterlockedCompareExchange((volatile long*)p,v,c);
  }
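  /* atomic_cmpxchg writes v to *p only if the current value equals c and returns the
     value that was previously stored; the Unix path below provides the same semantics
     via __sync_val_compare_and_swap. */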
////////////////////////////////////////////////////////////////////////////////
/// Unix Platform
////////////////////////////////////////////////////////////////////////////////

#else

#if defined(__i386__) && defined(__PIC__)

  __forceinline void __cpuid(int out[4], int op)
  {
    asm volatile ("xchg{l}\t{%%}ebx, %1\n\t"
                  "cpuid\n\t"
                  "xchg{l}\t{%%}ebx, %1\n\t"
                  : "=a"(out[0]), "=r"(out[1]), "=c"(out[2]), "=d"(out[3])
                  : "0"(op));
  }

  __forceinline void __cpuid_count(int out[4], int op1, int op2)
  {
    asm volatile ("xchg{l}\t{%%}ebx, %1\n\t"
                  "cpuid\n\t"
                  "xchg{l}\t{%%}ebx, %1\n\t"
                  : "=a" (out[0]), "=r" (out[1]), "=c" (out[2]), "=d" (out[3])
                  : "0" (op1), "2" (op2));
  }

#elif defined(__X86_ASM__)

  __forceinline void __cpuid(int out[4], int op) {
#if defined(__ARM_NEON)
    if (op == 0) { // Get CPU name
      out[0] = 0x41524d20;
      out[1] = 0x41524d20;
      out[2] = 0x41524d20;
      out[3] = 0x41524d20;
    }
#else
    asm volatile ("cpuid" : "=a"(out[0]), "=b"(out[1]), "=c"(out[2]), "=d"(out[3]) : "a"(op));
#endif
  }

#if !defined(__ARM_NEON)
  __forceinline void __cpuid_count(int out[4], int op1, int op2) {
    asm volatile ("cpuid" : "=a"(out[0]), "=b"(out[1]), "=c"(out[2]), "=d"(out[3]) : "a"(op1), "c"(op2));
  }
#endif

#endif

  __forceinline uint64_t read_tsc() {
#if defined(__X86_ASM__)
    uint32_t high,low;
    asm volatile ("rdtsc" : "=d"(high), "=a"(low));
    return (((uint64_t)high) << 32) + (uint64_t)low;
#else
    /* Not supported yet, meaning measuring traversal cost per pixel does not work. */
    return 0;
#endif
  }

  __forceinline int bsf(int v) {
#if defined(__ARM_NEON)
    return __builtin_ctz(v);
#else
#if defined(__AVX2__)
    return _tzcnt_u32(v);
#elif defined(__X86_ASM__)
    int r = 0; asm ("bsf %1,%0" : "=r"(r) : "r"(v)); return r;
#else
    return __builtin_ctz(v);
#endif
#endif
  }

#if defined(__64BIT__)
  __forceinline unsigned bsf(unsigned v)
  {
#if defined(__ARM_NEON)
    return __builtin_ctz(v);
#else
#if defined(__AVX2__)
    return _tzcnt_u32(v);
#elif defined(__X86_ASM__)
    unsigned r = 0; asm ("bsf %1,%0" : "=r"(r) : "r"(v)); return r;
#else
    return __builtin_ctz(v);
#endif
#endif
  }
#endif

  __forceinline size_t bsf(size_t v) {
#if defined(__AVX2__) && !defined(__aarch64__)
#if defined(__X86_64__)
    return _tzcnt_u64(v);
#else
    return _tzcnt_u32(v);
#endif
#elif defined(__X86_ASM__)
    size_t r = 0; asm ("bsf %1,%0" : "=r"(r) : "r"(v)); return r;
#else
    return __builtin_ctzl(v);
#endif
  }

  __forceinline int bscf(int& v)
  {
    int i = bsf(v);
    v &= v-1;
    return i;
  }

#if defined(__64BIT__)
  __forceinline unsigned int bscf(unsigned int& v)
  {
    unsigned int i = bsf(v);
    v &= v-1;
    return i;
  }
#endif

  __forceinline size_t bscf(size_t& v)
  {
    size_t i = bsf(v);
    v &= v-1;
    return i;
  }
  __forceinline int bsr(int v) {
#if defined(__AVX2__) && !defined(__aarch64__)
    return 31 - _lzcnt_u32(v);
#elif defined(__X86_ASM__)
    int r = 0; asm ("bsr %1,%0" : "=r"(r) : "r"(v)); return r;
#else
    return __builtin_clz(v) ^ 31;
#endif
  }

#if defined(__64BIT__) || defined(__EMSCRIPTEN__)
  __forceinline unsigned bsr(unsigned v) {
#if defined(__AVX2__)
    return 31 - _lzcnt_u32(v);
#elif defined(__X86_ASM__)
    unsigned r = 0; asm ("bsr %1,%0" : "=r"(r) : "r"(v)); return r;
#else
    return __builtin_clz(v) ^ 31;
#endif
  }
#endif

  __forceinline size_t bsr(size_t v) {
#if defined(__AVX2__) && !defined(__aarch64__)
#if defined(__X86_64__)
    return 63 - _lzcnt_u64(v);
#else
    return 31 - _lzcnt_u32(v);
#endif
#elif defined(__X86_ASM__)
    size_t r = 0; asm ("bsr %1,%0" : "=r"(r) : "r"(v)); return r;
#else
    return (sizeof(v) * 8 - 1) - __builtin_clzl(v);
#endif
  }

  __forceinline int lzcnt(const int x)
  {
#if defined(__AVX2__) && !defined(__aarch64__)
    return _lzcnt_u32(x);
#else
    if (unlikely(x == 0)) return 32;
    return 31 - bsr(x);
#endif
  }

  __forceinline size_t blsr(size_t v) {
#if defined(__AVX2__) && !defined(__aarch64__)
#if defined(__INTEL_COMPILER)
    return _blsr_u64(v);
#else
#if defined(__X86_64__)
    return __blsr_u64(v);
#else
    return __blsr_u32(v);
#endif
#endif
#else
    return v & (v-1);
#endif
  }
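  /* blsr clears the lowest set bit: the BMI1 BLSR intrinsic when AVX2 is enabled,
     otherwise the equivalent expression v & (v-1). */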
  __forceinline int btc(int v, int i) {
#if defined(__X86_ASM__)
    int r = 0; asm ("btc %1,%0" : "=r"(r) : "r"(i), "0"(v) : "flags" ); return r;
#else
    return (v ^ (1 << i));
#endif
  }

  __forceinline int bts(int v, int i) {
#if defined(__X86_ASM__)
    int r = 0; asm ("bts %1,%0" : "=r"(r) : "r"(i), "0"(v) : "flags"); return r;
#else
    return (v | (1 << i));
#endif
  }

  __forceinline int btr(int v, int i) {
#if defined(__X86_ASM__)
    int r = 0; asm ("btr %1,%0" : "=r"(r) : "r"(i), "0"(v) : "flags"); return r;
#else
    return (v & ~(1 << i));
#endif
  }

  __forceinline size_t btc(size_t v, size_t i) {
#if defined(__X86_ASM__)
    size_t r = 0; asm ("btc %1,%0" : "=r"(r) : "r"(i), "0"(v) : "flags" ); return r;
#else
    return (v ^ ((size_t)1 << i)); // shift a size_t-wide one so bit indices >= 32 work
#endif
  }

  __forceinline size_t bts(size_t v, size_t i) {
#if defined(__X86_ASM__)
    size_t r = 0; asm ("bts %1,%0" : "=r"(r) : "r"(i), "0"(v) : "flags"); return r;
#else
    return (v | ((size_t)1 << i));
#endif
  }

  __forceinline size_t btr(size_t v, size_t i) {
#if defined(__X86_ASM__)
    size_t r = 0; asm ("btr %1,%0" : "=r"(r) : "r"(i), "0"(v) : "flags"); return r;
#else
    return (v & ~((size_t)1 << i));
#endif
  }
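  /* Note that btc/bts/btr return the whole updated word, not the original value of the
     tested bit. */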
  __forceinline int32_t atomic_cmpxchg(int32_t volatile* value, int32_t comparand, const int32_t input) {
    return __sync_val_compare_and_swap(value, comparand, input);
  }

#endif

////////////////////////////////////////////////////////////////////////////////
/// All Platforms
////////////////////////////////////////////////////////////////////////////////

#if defined(__clang__) || defined(__GNUC__)
  #if !defined(_mm_undefined_ps)
  __forceinline __m128 _mm_undefined_ps() { return _mm_setzero_ps(); }
  #endif
  #if !defined(_mm_undefined_si128)
  __forceinline __m128i _mm_undefined_si128() { return _mm_setzero_si128(); }
  #endif
  #if !defined(_mm256_undefined_ps) && defined(__AVX__)
  __forceinline __m256 _mm256_undefined_ps() { return _mm256_setzero_ps(); }
  #endif
  #if !defined(_mm256_undefined_si256) && defined(__AVX__)
  __forceinline __m256i _mm256_undefined_si256() { return _mm256_setzero_si256(); }
  #endif
  #if !defined(_mm512_undefined_ps) && defined(__AVX512F__)
  __forceinline __m512 _mm512_undefined_ps() { return _mm512_setzero_ps(); }
  #endif
  #if !defined(_mm512_undefined_epi32) && defined(__AVX512F__)
  __forceinline __m512i _mm512_undefined_epi32() { return _mm512_setzero_si512(); }
  #endif
#endif
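  /* Older GCC/Clang releases lack the *_undefined_* intrinsics; the wrappers above fall
     back to zero-initialized registers, which is a safe substitute. */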
#if defined(__SSE4_2__) || defined(__ARM_NEON)

  __forceinline int popcnt(int in) {
    return _mm_popcnt_u32(in);
  }

  __forceinline unsigned popcnt(unsigned in) {
    return _mm_popcnt_u32(in);
  }

#if defined(__64BIT__)
  __forceinline size_t popcnt(size_t in) {
    return _mm_popcnt_u64(in);
  }
#endif

#endif

#if defined(__X86_ASM__)
  __forceinline uint64_t rdtsc()
  {
    int dummy[4];
    __cpuid(dummy,0);
    uint64_t clock = read_tsc();
    __cpuid(dummy,0);
    return clock;
  }
#endif
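  /* The __cpuid calls around read_tsc() serialize execution so that out-of-order
     execution cannot move surrounding instructions across the timestamp read. */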
  __forceinline void pause_cpu(const size_t N = 8)
  {
    for (size_t i=0; i<N; i++)
      _mm_pause();
  }
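  /* pause_cpu issues PAUSE hints for spin-wait loops, which reduces power consumption
     and avoids pipeline flushes when the wait ends. */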
  /* prefetches */
  __forceinline void prefetchL1 (const void* ptr) { _mm_prefetch((const char*)ptr,_MM_HINT_T0); }
  __forceinline void prefetchL2 (const void* ptr) { _mm_prefetch((const char*)ptr,_MM_HINT_T1); }
  __forceinline void prefetchL3 (const void* ptr) { _mm_prefetch((const char*)ptr,_MM_HINT_T2); }
  __forceinline void prefetchNTA(const void* ptr) { _mm_prefetch((const char*)ptr,_MM_HINT_NTA); }

  __forceinline void prefetchEX (const void* ptr) {
#if defined(__INTEL_COMPILER)
    _mm_prefetch((const char*)ptr,_MM_HINT_ET0);
#else
    _mm_prefetch((const char*)ptr,_MM_HINT_T0);
#endif
  }

  __forceinline void prefetchL1EX(const void* ptr) {
    prefetchEX(ptr);
  }

  __forceinline void prefetchL2EX(const void* ptr) {
    prefetchEX(ptr);
  }
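  /* prefetchEX requests a prefetch with intent to write where the compiler exposes
     _MM_HINT_ET0; other compilers fall back to an ordinary L1 prefetch. */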
#if defined(__AVX2__) && !defined(__aarch64__)
  __forceinline unsigned int pext(unsigned int a, unsigned int b) { return _pext_u32(a, b); }
  __forceinline unsigned int pdep(unsigned int a, unsigned int b) { return _pdep_u32(a, b); }
#if defined(__X86_64__)
  __forceinline size_t pext(size_t a, size_t b) { return _pext_u64(a, b); }
  __forceinline size_t pdep(size_t a, size_t b) { return _pdep_u64(a, b); }
#endif
#endif
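  /* pext(a,b) gathers the bits of a at the positions set in b into the low bits of the
     result; pdep performs the inverse scatter. */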
#if defined(__AVX512F__)
#if defined(__INTEL_COMPILER)
  __forceinline float mm512_cvtss_f32(__m512 v) {
    return _mm512_cvtss_f32(v);
  }
  __forceinline int mm512_mask2int(__mmask16 k1) {
    return _mm512_mask2int(k1);
  }
  __forceinline __mmask16 mm512_int2mask(int mask) {
    return _mm512_int2mask(mask);
  }
#else
  __forceinline float mm512_cvtss_f32(__m512 v) { // FIXME: _mm512_cvtss_f32 neither supported by clang v4.0.0 nor GCC 6.3
    return _mm_cvtss_f32(_mm512_castps512_ps128(v));
  }
  __forceinline int mm512_mask2int(__mmask16 k1) { // FIXME: _mm512_mask2int not yet supported by GCC 6.3
    return (int)k1;
  }
  __forceinline __mmask16 mm512_int2mask(int mask) { // FIXME: _mm512_int2mask not yet supported by GCC 6.3
    return (__mmask16)mask;
  }
#endif
#endif
}