// Copyright 2009-2020 Intel Corporation
// SPDX-License-Identifier: Apache-2.0

#pragma once

#include "platform.h"

#if defined(__WIN32__)
#include <intrin.h>
#endif

#if defined(__ARM_NEON)
#include "../math/SSE2NEON.h"
#if defined(NEON_AVX2_EMULATION)
#include "../math/AVX2NEON.h"
#endif
#else
#include <immintrin.h>
#endif
#if defined(__BMI__) && defined(__GNUC__) && !defined(__INTEL_COMPILER)
  #if !defined(_tzcnt_u32)
    #define _tzcnt_u32 __tzcnt_u32
  #endif
  #if !defined(_tzcnt_u64)
    #define _tzcnt_u64 __tzcnt_u64
  #endif
#endif
#if defined(__aarch64__)
  #if !defined(_lzcnt_u32)
    #define _lzcnt_u32 __builtin_clz
  #endif
  #if !defined(_lzcnt_u64)
    #define _lzcnt_u64 __builtin_clzll
  #endif
#else
  #if defined(__LZCNT__)
    #if !defined(_lzcnt_u32)
      #define _lzcnt_u32 __lzcnt32
    #endif
    #if !defined(_lzcnt_u64)
      #define _lzcnt_u64 __lzcnt64
    #endif
  #endif
#endif
#if defined(__WIN32__)
#  ifndef NOMINMAX
#    define NOMINMAX
#  endif
#  include <windows.h>
#endif
/* normally defined in pmmintrin.h, but we always need this */
#if !defined(_MM_SET_DENORMALS_ZERO_MODE)
#define _MM_DENORMALS_ZERO_ON   (0x0040)
#define _MM_DENORMALS_ZERO_OFF  (0x0000)
#define _MM_DENORMALS_ZERO_MASK (0x0040)
#define _MM_SET_DENORMALS_ZERO_MODE(x) (_mm_setcsr((_mm_getcsr() & ~_MM_DENORMALS_ZERO_MASK) | (x)))
#endif
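/* Typical use at thread start-up (illustrative only): enable flush-to-zero and
   denormals-are-zero together to avoid denormal slow paths:
     _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON);
     _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON); */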
namespace embree
{
  ////////////////////////////////////////////////////////////////////////////////
  /// Windows Platform
  ////////////////////////////////////////////////////////////////////////////////

#if defined(__WIN32__)
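  // read_tsc (Windows): returns QueryPerformanceCounter ticks rather than raw
  // TSC cycles, so values are only comparable with other read_tsc() results.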
  __forceinline size_t read_tsc()
  {
    LARGE_INTEGER li;
    QueryPerformanceCounter(&li);
    return (size_t)li.QuadPart;
  }
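  // bsf: index of the lowest set bit. The TZCNT and BitScanForward paths treat
  // v == 0 differently, so callers should only pass non-zero values.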
  __forceinline int bsf(int v) {
#if defined(__AVX2__) && !defined(__aarch64__)
    return _tzcnt_u32(v);
#else
    unsigned long r = 0; _BitScanForward(&r,v); return r;
#endif
  }

  __forceinline unsigned bsf(unsigned v) {
#if defined(__AVX2__) && !defined(__aarch64__)
    return _tzcnt_u32(v);
#else
    unsigned long r = 0; _BitScanForward(&r,v); return r;
#endif
  }

#if defined(__X86_64__)
  __forceinline size_t bsf(size_t v) {
#if defined(__AVX2__)
    return _tzcnt_u64(v);
#else
    unsigned long r = 0; _BitScanForward64(&r,v); return r;
#endif
  }
#endif
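  // bscf: returns the index of the lowest set bit and clears that bit in v,
  // which makes it convenient for iterating over the set bits of a mask.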
  __forceinline int bscf(int& v)
  {
    int i = bsf(v);
    v &= v-1;
    return i;
  }

  __forceinline unsigned bscf(unsigned& v)
  {
    unsigned i = bsf(v);
    v &= v-1;
    return i;
  }

#if defined(__X86_64__)
  __forceinline size_t bscf(size_t& v)
  {
    size_t i = bsf(v);
    v &= v-1;
    return i;
  }
#endif
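  // bsr: index of the highest set bit; only meaningful for non-zero input.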
  __forceinline int bsr(int v) {
#if defined(__AVX2__) && !defined(__aarch64__)
    return 31 - _lzcnt_u32(v);
#else
    unsigned long r = 0; _BitScanReverse(&r,v); return r;
#endif
  }

  __forceinline unsigned bsr(unsigned v) {
#if defined(__AVX2__) && !defined(__aarch64__)
    return 31 - _lzcnt_u32(v);
#else
    unsigned long r = 0; _BitScanReverse(&r,v); return r;
#endif
  }

#if defined(__X86_64__)
  __forceinline size_t bsr(size_t v) {
#if defined(__AVX2__)
    return 63 - _lzcnt_u64(v);
#else
    unsigned long r = 0; _BitScanReverse64(&r, v); return r;
#endif
  }
#endif
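  // lzcnt: number of leading zero bits, defined to return 32 for x == 0.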
  __forceinline int lzcnt(const int x)
  {
#if defined(__AVX2__) && !defined(__aarch64__)
    return _lzcnt_u32(x);
#else
    if (unlikely(x == 0)) return 32;
    return 31 - bsr(x);
#endif
  }
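  // btc/bts/btr: return a copy of v with bit i complemented, set, or cleared.
  // Unlike the raw bit-test intrinsics, the value of the tested bit is discarded.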
  __forceinline int btc(int v, int i) {
    long r = v; _bittestandcomplement(&r,i); return r;
  }

  __forceinline int bts(int v, int i) {
    long r = v; _bittestandset(&r,i); return r;
  }

  __forceinline int btr(int v, int i) {
    long r = v; _bittestandreset(&r,i); return r;
  }

#if defined(__X86_64__)
  __forceinline size_t btc(size_t v, size_t i) {
    size_t r = v; _bittestandcomplement64((__int64*)&r,i); return r;
  }

  __forceinline size_t bts(size_t v, size_t i) {
    __int64 r = v; _bittestandset64(&r,i); return r;
  }

  __forceinline size_t btr(size_t v, size_t i) {
    __int64 r = v; _bittestandreset64(&r,i); return r;
  }
#endif
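  // atomic_cmpxchg: if *p equals c, store v into *p; returns the previous value
  // of *p. Illustrative spin-wait use (the flag variable is hypothetical):
  //   while (atomic_cmpxchg(&flag, 0, 1) != 0) { pause_cpu(); }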
  __forceinline int32_t atomic_cmpxchg(volatile int32_t* p, const int32_t c, const int32_t v) {
    return _InterlockedCompareExchange((volatile long*)p,v,c);
  }

  ////////////////////////////////////////////////////////////////////////////////
  /// Unix Platform
  ////////////////////////////////////////////////////////////////////////////////

#else

#if defined(__i386__) && defined(__PIC__)
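  // On 32-bit PIC builds EBX holds the GOT pointer and cannot be named as an
  // output register, so CPUID's EBX result is moved out via XCHG instead.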
  __forceinline void __cpuid(int out[4], int op)
  {
    asm volatile ("xchg{l}\t{%%}ebx, %1\n\t"
                  "cpuid\n\t"
                  "xchg{l}\t{%%}ebx, %1\n\t"
                  : "=a"(out[0]), "=r"(out[1]), "=c"(out[2]), "=d"(out[3])
                  : "0"(op));
  }

  __forceinline void __cpuid_count(int out[4], int op1, int op2)
  {
    asm volatile ("xchg{l}\t{%%}ebx, %1\n\t"
                  "cpuid\n\t"
                  "xchg{l}\t{%%}ebx, %1\n\t"
                  : "=a" (out[0]), "=r" (out[1]), "=c" (out[2]), "=d" (out[3])
                  : "0" (op1), "2" (op2));
  }
#else

  __forceinline void __cpuid(int out[4], int op) {
#if defined(__ARM_NEON)
    if (op == 0) { // Get CPU name
      out[0] = 0x41524d20;
      out[1] = 0x41524d20;
      out[2] = 0x41524d20;
      out[3] = 0x41524d20;
    }
#else
    asm volatile ("cpuid" : "=a"(out[0]), "=b"(out[1]), "=c"(out[2]), "=d"(out[3]) : "a"(op));
#endif
  }

#if !defined(__ARM_NEON)
  __forceinline void __cpuid_count(int out[4], int op1, int op2) {
    asm volatile ("cpuid" : "=a"(out[0]), "=b"(out[1]), "=c"(out[2]), "=d"(out[3]) : "a"(op1), "c"(op2));
  }
#endif

#endif
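  // read_tsc: raw RDTSC on x86. There is no equivalent wired up for ARM yet and
  // 0 is returned there (see the FIXME below), so do not use it for timing on ARM.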
  __forceinline uint64_t read_tsc() {
#if defined(__ARM_NEON)
    return 0; // FIXME(LTE): mimic rdtsc
#else
    uint32_t high,low;
    asm volatile ("rdtsc" : "=d"(high), "=a"(low));
    return (((uint64_t)high) << 32) + (uint64_t)low;
#endif
  }
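  // The bit-scan helpers below mirror the Windows versions above: compiler
  // builtins on ARM, TZCNT/LZCNT when AVX2 is enabled, and inline asm otherwise.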
  __forceinline int bsf(int v) {
#if defined(__ARM_NEON)
    return __builtin_ctz(v);
#else
#if defined(__AVX2__)
    return _tzcnt_u32(v);
#else
    int r = 0; asm ("bsf %1,%0" : "=r"(r) : "r"(v)); return r;
#endif
#endif
  }

#if defined(__X86_64__) || defined(__aarch64__)
  __forceinline unsigned bsf(unsigned v)
  {
#if defined(__ARM_NEON)
    return __builtin_ctz(v);
#else
#if defined(__AVX2__)
    return _tzcnt_u32(v);
#else
    unsigned r = 0; asm ("bsf %1,%0" : "=r"(r) : "r"(v)); return r;
#endif
#endif
  }
#endif

  __forceinline size_t bsf(size_t v) {
#if defined(__AVX2__) && !defined(__aarch64__)
#if defined(__X86_64__)
    return _tzcnt_u64(v);
#else
    return _tzcnt_u32(v);
#endif
#elif defined(__ARM_NEON)
    return __builtin_ctzl(v);
#else
    size_t r = 0; asm ("bsf %1,%0" : "=r"(r) : "r"(v)); return r;
#endif
  }
  __forceinline int bscf(int& v)
  {
    int i = bsf(v);
    v &= v-1;
    return i;
  }

#if defined(__X86_64__) || defined(__aarch64__)
  __forceinline unsigned int bscf(unsigned int& v)
  {
    unsigned int i = bsf(v);
    v &= v-1;
    return i;
  }
#endif

  __forceinline size_t bscf(size_t& v)
  {
    size_t i = bsf(v);
    v &= v-1;
    return i;
  }
  __forceinline int bsr(int v) {
#if defined(__AVX2__) && !defined(__aarch64__)
    return 31 - _lzcnt_u32(v);
#elif defined(__ARM_NEON)
    return __builtin_clz(v)^31;
#else
    int r = 0; asm ("bsr %1,%0" : "=r"(r) : "r"(v)); return r;
#endif
  }

#if defined(__X86_64__) || defined(__aarch64__)
  __forceinline unsigned bsr(unsigned v) {
#if defined(__AVX2__)
    return 31 - _lzcnt_u32(v);
#elif defined(__ARM_NEON)
    return __builtin_clz(v)^31;
#else
    unsigned r = 0; asm ("bsr %1,%0" : "=r"(r) : "r"(v)); return r;
#endif
  }
#endif

  __forceinline size_t bsr(size_t v) {
#if defined(__AVX2__) && !defined(__aarch64__)
#if defined(__X86_64__)
    return 63 - _lzcnt_u64(v);
#else
    return 31 - _lzcnt_u32(v);
#endif
#elif defined(__aarch64__)
    return (sizeof(v) * 8 - 1) - __builtin_clzl(v);
#else
    size_t r = 0; asm ("bsr %1,%0" : "=r"(r) : "r"(v)); return r;
#endif
  }
  __forceinline int lzcnt(const int x)
  {
#if defined(__AVX2__) && !defined(__aarch64__)
    return _lzcnt_u32(x);
#else
    if (unlikely(x == 0)) return 32;
    return 31 - bsr(x);
#endif
  }
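  // blsr: clears the lowest set bit of v, i.e. v & (v-1), using BMI1 BLSR when available.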
  __forceinline size_t blsr(size_t v) {
#if defined(__AVX2__) && !defined(__aarch64__)
#if defined(__INTEL_COMPILER)
    return _blsr_u64(v);
#else
#if defined(__X86_64__)
    return __blsr_u64(v);
#else
    return __blsr_u32(v);
#endif
#endif
#else
    return v & (v-1);
#endif
  }
  __forceinline int btc(int v, int i) {
#if defined(__aarch64__)
    // _bittestandcomplement(long *a, long b) {
    //   unsigned char x = (*a >> b) & 1;
    //   *a = *a ^ (1 << b);
    //   return x;
    // }
    // We only need `*a`.
    return (v ^ (1 << i));
#else
    int r = 0; asm ("btc %1,%0" : "=r"(r) : "r"(i), "0"(v) : "flags" ); return r;
#endif
  }

  __forceinline int bts(int v, int i) {
#if defined(__aarch64__)
    // _bittestandset(long *a, long b) {
    //   unsigned char x = (*a >> b) & 1;
    //   *a = *a | (1 << b);
    //   return x;
    // }
    // We only need `*a`.
    return (v | (1 << i));
#else
    int r = 0; asm ("bts %1,%0" : "=r"(r) : "r"(i), "0"(v) : "flags"); return r;
#endif
  }

  __forceinline int btr(int v, int i) {
#if defined(__aarch64__)
    // _bittestandreset(long *a, long b) {
    //   unsigned char x = (*a >> b) & 1;
    //   *a = *a & ~(1 << b);
    //   return x;
    // }
    // We only need `*a`.
    return (v & ~(1 << i));
#else
    int r = 0; asm ("btr %1,%0" : "=r"(r) : "r"(i), "0"(v) : "flags"); return r;
#endif
  }
  __forceinline size_t btc(size_t v, size_t i) {
#if defined(__aarch64__)
    return (v ^ ((size_t)1 << i));
#else
    size_t r = 0; asm ("btc %1,%0" : "=r"(r) : "r"(i), "0"(v) : "flags" ); return r;
#endif
  }

  __forceinline size_t bts(size_t v, size_t i) {
#if defined(__aarch64__)
    return (v | ((size_t)1 << i));
#else
    size_t r = 0; asm ("bts %1,%0" : "=r"(r) : "r"(i), "0"(v) : "flags"); return r;
#endif
  }

  __forceinline size_t btr(size_t v, size_t i) {
#if defined(__ARM_NEON)
    return (v & ~((size_t)1 << i));
#else
    size_t r = 0; asm ("btr %1,%0" : "=r"(r) : "r"(i), "0"(v) : "flags"); return r;
#endif
  }
  __forceinline int32_t atomic_cmpxchg(int32_t volatile* value, int32_t comparand, const int32_t input) {
    return __sync_val_compare_and_swap(value, comparand, input);
  }

#endif

  ////////////////////////////////////////////////////////////////////////////////
  /// All Platforms
  ////////////////////////////////////////////////////////////////////////////////
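  // Fallbacks for the _mm*_undefined_* intrinsics on compilers that do not
  // provide them; returning zeroed registers is slightly more work but always
  // well defined.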
#if defined(__clang__) || defined(__GNUC__)
  #if !defined(_mm_undefined_ps)
  __forceinline __m128 _mm_undefined_ps() { return _mm_setzero_ps(); }
  #endif
  #if !defined(_mm_undefined_si128)
  __forceinline __m128i _mm_undefined_si128() { return _mm_setzero_si128(); }
  #endif
  #if !defined(_mm256_undefined_ps) && defined(__AVX__)
  __forceinline __m256 _mm256_undefined_ps() { return _mm256_setzero_ps(); }
  #endif
  #if !defined(_mm256_undefined_si256) && defined(__AVX__)
  __forceinline __m256i _mm256_undefined_si256() { return _mm256_setzero_si256(); }
  #endif
  #if !defined(_mm512_undefined_ps) && defined(__AVX512F__)
  __forceinline __m512 _mm512_undefined_ps() { return _mm512_setzero_ps(); }
  #endif
  #if !defined(_mm512_undefined_epi32) && defined(__AVX512F__)
  __forceinline __m512i _mm512_undefined_epi32() { return _mm512_setzero_si512(); }
  #endif
#endif
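  // popcnt: number of set bits. Relies on the SSE4.2 POPCNT intrinsics or, on
  // ARM, on the emulation provided by the bundled SSE2NEON header.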
#if defined(__SSE4_2__) || defined(__ARM_NEON)

  __forceinline int popcnt(int in) {
    return _mm_popcnt_u32(in);
  }

  __forceinline unsigned popcnt(unsigned in) {
    return _mm_popcnt_u32(in);
  }

#if defined(__X86_64__) || defined(__ARM_NEON)
  __forceinline size_t popcnt(size_t in) {
    return _mm_popcnt_u64(in);
  }
#endif

#endif
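  // rdtsc: read_tsc() bracketed by CPUID calls, which serialize execution so
  // the timestamp is not reordered with the surrounding instructions.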
  __forceinline uint64_t rdtsc()
  {
    int dummy[4];
    __cpuid(dummy,0);
    uint64_t clock = read_tsc();
    __cpuid(dummy,0);
    return clock;
  }
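  // pause_cpu: busy-wait hint; issues N PAUSE instructions to reduce power use
  // and pipeline flushes inside spin loops.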
  __forceinline void pause_cpu(const size_t N = 8)
  {
    for (size_t i=0; i<N; i++)
      _mm_pause();
  }

  /* prefetches */
  __forceinline void prefetchL1 (const void* ptr) { _mm_prefetch((const char*)ptr,_MM_HINT_T0); }
  __forceinline void prefetchL2 (const void* ptr) { _mm_prefetch((const char*)ptr,_MM_HINT_T1); }
  __forceinline void prefetchL3 (const void* ptr) { _mm_prefetch((const char*)ptr,_MM_HINT_T2); }
  __forceinline void prefetchNTA(const void* ptr) { _mm_prefetch((const char*)ptr,_MM_HINT_NTA); }
  __forceinline void prefetchEX (const void* ptr) {
#if defined(__INTEL_COMPILER)
    _mm_prefetch((const char*)ptr,_MM_HINT_ET0);
#else
    _mm_prefetch((const char*)ptr,_MM_HINT_T0);
#endif
  }

  __forceinline void prefetchL1EX(const void* ptr) {
    prefetchEX(ptr);
  }

  __forceinline void prefetchL2EX(const void* ptr) {
    prefetchEX(ptr);
  }
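  // pext/pdep: BMI2 parallel bit extract/deposit; only compiled when targeting
  // AVX2-class x86 hardware.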
#if defined(__AVX2__) && !defined(__aarch64__)
  __forceinline unsigned int pext(unsigned int a, unsigned int b) { return _pext_u32(a, b); }
  __forceinline unsigned int pdep(unsigned int a, unsigned int b) { return _pdep_u32(a, b); }
#if defined(__X86_64__)
  __forceinline size_t pext(size_t a, size_t b) { return _pext_u64(a, b); }
  __forceinline size_t pdep(size_t a, size_t b) { return _pdep_u64(a, b); }
#endif
#endif
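  // Thin wrappers around a few AVX-512 helpers whose names differ or are missing
  // on some compilers; see the FIXME notes in the non-ICC path below.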
#if defined(__AVX512F__)
#if defined(__INTEL_COMPILER)
  __forceinline float mm512_cvtss_f32(__m512 v) {
    return _mm512_cvtss_f32(v);
  }
  __forceinline int mm512_mask2int(__mmask16 k1) {
    return _mm512_mask2int(k1);
  }
  __forceinline __mmask16 mm512_int2mask(int mask) {
    return _mm512_int2mask(mask);
  }
#else
  __forceinline float mm512_cvtss_f32(__m512 v) { // FIXME: _mm512_cvtss_f32 neither supported by clang v4.0.0 nor GCC 6.3
    return _mm_cvtss_f32(_mm512_castps512_ps128(v));
  }
  __forceinline int mm512_mask2int(__mmask16 k1) { // FIXME: _mm512_mask2int not yet supported by GCC 6.3
    return (int)k1;
  }
  __forceinline __mmask16 mm512_int2mask(int mask) { // FIXME: _mm512_int2mask not yet supported by GCC 6.3
    return (__mmask16)mask;
  }
#endif
#endif
}