// Copyright 2009-2021 Intel Corporation
// SPDX-License-Identifier: Apache-2.0

#pragma once

#include "platform.h"

#if defined(__WIN32__)
#include <intrin.h>
#endif

#if defined(__ARM_NEON)
#include "../simd/arm/emulation.h"
#else
#include <immintrin.h>
#endif

#if defined(__BMI__) && defined(__GNUC__) && !defined(__INTEL_COMPILER)
#if !defined(_tzcnt_u32)
#define _tzcnt_u32 __tzcnt_u32
#endif
#if !defined(_tzcnt_u64)
#define _tzcnt_u64 __tzcnt_u64
#endif
#endif

#if defined(__LZCNT__)
#if !defined(_lzcnt_u32)
#define _lzcnt_u32 __lzcnt32
#endif
#if !defined(_lzcnt_u64)
#define _lzcnt_u64 __lzcnt64
#endif
#endif

#if defined(__WIN32__)
// -- GODOT start --
#if !defined(NOMINMAX)
// -- GODOT end --
#define NOMINMAX
// -- GODOT start --
#endif
#include "windows.h"
// -- GODOT end --
#endif

/* normally defined in pmmintrin.h, but we always need this */
#if !defined(_MM_SET_DENORMALS_ZERO_MODE)
#define _MM_DENORMALS_ZERO_ON   (0x0040)
#define _MM_DENORMALS_ZERO_OFF  (0x0000)
#define _MM_DENORMALS_ZERO_MASK (0x0040)
#define _MM_SET_DENORMALS_ZERO_MODE(x) (_mm_setcsr((_mm_getcsr() & ~_MM_DENORMALS_ZERO_MASK) | (x)))
#endif
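
/* Illustrative use of the macro above (not part of the original header): enable
   flush-to-zero treatment of denormal inputs in the calling thread's MXCSR, e.g.
     _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON); */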

namespace embree
{

////////////////////////////////////////////////////////////////////////////////
/// Windows Platform
////////////////////////////////////////////////////////////////////////////////

#if defined(__WIN32__)

  __forceinline size_t read_tsc()
  {
    LARGE_INTEGER li;
    QueryPerformanceCounter(&li);
    return (size_t)li.QuadPart;
  }

  __forceinline int bsf(int v) {
#if defined(__AVX2__)
    return _tzcnt_u32(v);
#else
    unsigned long r = 0; _BitScanForward(&r,v); return r;
#endif
  }

  __forceinline unsigned bsf(unsigned v) {
#if defined(__AVX2__)
    return _tzcnt_u32(v);
#else
    unsigned long r = 0; _BitScanForward(&r,v); return r;
#endif
  }

#if defined(__X86_64__)
  __forceinline size_t bsf(size_t v) {
#if defined(__AVX2__)
    return _tzcnt_u64(v);
#else
    unsigned long r = 0; _BitScanForward64(&r,v); return r;
#endif
  }
#endif

  __forceinline int bscf(int& v)
  {
    int i = bsf(v);
    v &= v-1;
    return i;
  }

  __forceinline unsigned bscf(unsigned& v)
  {
    unsigned i = bsf(v);
    v &= v-1;
    return i;
  }

#if defined(__X86_64__)
  __forceinline size_t bscf(size_t& v)
  {
    size_t i = bsf(v);
    v &= v-1;
    return i;
  }
#endif
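
  /* bsf returns the index of the lowest set bit; bscf additionally clears that bit
     in its argument, which allows walking the set bits of a mask. Illustrative
     sketch (not part of the original header), assuming caller-provided bits and handle(i):
       for (unsigned mask = bits; mask != 0; ) { unsigned i = bscf(mask); handle(i); } */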

  __forceinline int bsr(int v) {
#if defined(__AVX2__)
    return 31 - _lzcnt_u32(v);
#else
    unsigned long r = 0; _BitScanReverse(&r,v); return r;
#endif
  }

  __forceinline unsigned bsr(unsigned v) {
#if defined(__AVX2__)
    return 31 - _lzcnt_u32(v);
#else
    unsigned long r = 0; _BitScanReverse(&r,v); return r;
#endif
  }

#if defined(__X86_64__)
  __forceinline size_t bsr(size_t v) {
#if defined(__AVX2__)
    return 63 - _lzcnt_u64(v);
#else
    unsigned long r = 0; _BitScanReverse64(&r,v); return r;
#endif
  }
#endif

  __forceinline int lzcnt(const int x)
  {
#if defined(__AVX2__)
    return _lzcnt_u32(x);
#else
    if (unlikely(x == 0)) return 32;
    return 31 - bsr(x);
#endif
  }

  __forceinline int btc(int v, int i) {
    long r = v; _bittestandcomplement(&r,i); return r;
  }

  __forceinline int bts(int v, int i) {
    long r = v; _bittestandset(&r,i); return r;
  }

  __forceinline int btr(int v, int i) {
    long r = v; _bittestandreset(&r,i); return r;
  }

#if defined(__X86_64__)
  __forceinline size_t btc(size_t v, size_t i) {
    size_t r = v; _bittestandcomplement64((__int64*)&r,i); return r;
  }

  __forceinline size_t bts(size_t v, size_t i) {
    __int64 r = v; _bittestandset64(&r,i); return r;
  }

  __forceinline size_t btr(size_t v, size_t i) {
    __int64 r = v; _bittestandreset64(&r,i); return r;
  }
#endif

  __forceinline int32_t atomic_cmpxchg(volatile int32_t* p, const int32_t c, const int32_t v) {
    return _InterlockedCompareExchange((volatile long*)p,v,c);
  }
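
  /* atomic_cmpxchg(p,c,v) stores v into *p only if *p currently equals c, and in all
     cases returns the value *p held before the operation (compare-and-swap). */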

////////////////////////////////////////////////////////////////////////////////
/// Unix Platform
////////////////////////////////////////////////////////////////////////////////

#else

#if defined(__i386__) && defined(__PIC__)

  __forceinline void __cpuid(int out[4], int op)
  {
    asm volatile ("xchg{l}\t{%%}ebx, %1\n\t"
                  "cpuid\n\t"
                  "xchg{l}\t{%%}ebx, %1\n\t"
                  : "=a"(out[0]), "=r"(out[1]), "=c"(out[2]), "=d"(out[3])
                  : "0"(op));
  }

  __forceinline void __cpuid_count(int out[4], int op1, int op2)
  {
    asm volatile ("xchg{l}\t{%%}ebx, %1\n\t"
                  "cpuid\n\t"
                  "xchg{l}\t{%%}ebx, %1\n\t"
                  : "=a"(out[0]), "=r"(out[1]), "=c"(out[2]), "=d"(out[3])
                  : "0"(op1), "2"(op2));
  }

#elif defined(__X86_ASM__)

  __forceinline void __cpuid(int out[4], int op) {
    asm volatile ("cpuid" : "=a"(out[0]), "=b"(out[1]), "=c"(out[2]), "=d"(out[3]) : "a"(op));
  }

  __forceinline void __cpuid_count(int out[4], int op1, int op2) {
    asm volatile ("cpuid" : "=a"(out[0]), "=b"(out[1]), "=c"(out[2]), "=d"(out[3]) : "a"(op1), "c"(op2));
  }

#endif
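
  /* The __i386__/__PIC__ variants above save and restore %ebx around cpuid because
     32-bit position-independent code reserves %ebx as the GOT base register. */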

  __forceinline uint64_t read_tsc() {
#if defined(__X86_ASM__)
    uint32_t high,low;
    asm volatile ("rdtsc" : "=d"(high), "=a"(low));
    return (((uint64_t)high) << 32) + (uint64_t)low;
#else
    /* Not supported yet, meaning measuring traversal cost per pixel does not work. */
    return 0;
#endif
  }

  __forceinline int bsf(int v) {
#if defined(__AVX2__)
    return _tzcnt_u32(v);
#elif defined(__X86_ASM__)
    int r = 0; asm ("bsf %1,%0" : "=r"(r) : "r"(v)); return r;
#else
    return __builtin_ctz(v);
#endif
  }

#if defined(__64BIT__)
  __forceinline unsigned bsf(unsigned v)
  {
#if defined(__AVX2__)
    return _tzcnt_u32(v);
#elif defined(__X86_ASM__)
    unsigned r = 0; asm ("bsf %1,%0" : "=r"(r) : "r"(v)); return r;
#else
    return __builtin_ctz(v);
#endif
  }
#endif

  __forceinline size_t bsf(size_t v) {
#if defined(__AVX2__)
#if defined(__X86_64__)
    return _tzcnt_u64(v);
#else
    return _tzcnt_u32(v);
#endif
#elif defined(__X86_ASM__)
    size_t r = 0; asm ("bsf %1,%0" : "=r"(r) : "r"(v)); return r;
#else
    return __builtin_ctzl(v);
#endif
  }

  __forceinline int bscf(int& v)
  {
    int i = bsf(v);
    v &= v-1;
    return i;
  }

#if defined(__64BIT__)
  __forceinline unsigned int bscf(unsigned int& v)
  {
    unsigned int i = bsf(v);
    v &= v-1;
    return i;
  }
#endif

  __forceinline size_t bscf(size_t& v)
  {
    size_t i = bsf(v);
    v &= v-1;
    return i;
  }

  __forceinline int bsr(int v) {
#if defined(__AVX2__)
    return 31 - _lzcnt_u32(v);
#elif defined(__X86_ASM__)
    int r = 0; asm ("bsr %1,%0" : "=r"(r) : "r"(v)); return r;
#else
    return __builtin_clz(v) ^ 31;
#endif
  }

#if defined(__64BIT__)
  __forceinline unsigned bsr(unsigned v) {
#if defined(__AVX2__)
    return 31 - _lzcnt_u32(v);
#elif defined(__X86_ASM__)
    unsigned r = 0; asm ("bsr %1,%0" : "=r"(r) : "r"(v)); return r;
#else
    return __builtin_clz(v) ^ 31;
#endif
  }
#endif

  __forceinline size_t bsr(size_t v) {
#if defined(__AVX2__)
#if defined(__X86_64__)
    return 63 - _lzcnt_u64(v);
#else
    return 31 - _lzcnt_u32(v);
#endif
#elif defined(__X86_ASM__)
    size_t r = 0; asm ("bsr %1,%0" : "=r"(r) : "r"(v)); return r;
#else
    return (sizeof(v) * 8 - 1) - __builtin_clzl(v);
#endif
  }

  __forceinline int lzcnt(const int x)
  {
#if defined(__AVX2__)
    return _lzcnt_u32(x);
#else
    if (unlikely(x == 0)) return 32;
    return 31 - bsr(x);
#endif
  }

  __forceinline size_t blsr(size_t v) {
#if defined(__AVX2__)
#if defined(__INTEL_COMPILER)
    return _blsr_u64(v);
#else
#if defined(__X86_64__)
    return __blsr_u64(v);
#else
    return __blsr_u32(v);
#endif
#endif
#else
    return v & (v-1);
#endif
  }
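
  /* blsr(v) clears the lowest set bit of v, using the BMI1 BLSR instruction when
     available and the portable v & (v-1) expression otherwise. */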

  __forceinline int btc(int v, int i) {
#if defined(__X86_ASM__)
    int r = 0; asm ("btc %1,%0" : "=r"(r) : "r"(i), "0"(v) : "flags" ); return r;
#else
    return (v ^ (1 << i));
#endif
  }

  __forceinline int bts(int v, int i) {
#if defined(__X86_ASM__)
    int r = 0; asm ("bts %1,%0" : "=r"(r) : "r"(i), "0"(v) : "flags"); return r;
#else
    return (v | (1 << i));
#endif
  }

  __forceinline int btr(int v, int i) {
#if defined(__X86_ASM__)
    int r = 0; asm ("btr %1,%0" : "=r"(r) : "r"(i), "0"(v) : "flags"); return r;
#else
    return (v & ~(1 << i));
#endif
  }

  __forceinline size_t btc(size_t v, size_t i) {
#if defined(__X86_ASM__)
    size_t r = 0; asm ("btc %1,%0" : "=r"(r) : "r"(i), "0"(v) : "flags" ); return r;
#else
    return (v ^ ((size_t)1 << i));
#endif
  }

  __forceinline size_t bts(size_t v, size_t i) {
#if defined(__X86_ASM__)
    size_t r = 0; asm ("bts %1,%0" : "=r"(r) : "r"(i), "0"(v) : "flags"); return r;
#else
    return (v | ((size_t)1 << i));
#endif
  }

  __forceinline size_t btr(size_t v, size_t i) {
#if defined(__X86_ASM__)
    size_t r = 0; asm ("btr %1,%0" : "=r"(r) : "r"(i), "0"(v) : "flags"); return r;
#else
    return (v & ~((size_t)1 << i));
#endif
  }

  __forceinline int32_t atomic_cmpxchg(int32_t volatile* value, int32_t comparand, const int32_t input) {
    return __sync_val_compare_and_swap(value, comparand, input);
  }

#endif

////////////////////////////////////////////////////////////////////////////////
/// All Platforms
////////////////////////////////////////////////////////////////////////////////
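
/* Zero-initialized stand-ins for the _mm*_undefined_* intrinsics on compilers
   that do not provide them; returning zeros is a safe substitute for an
   undefined value. */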

#if defined(__clang__) || defined(__GNUC__)
#if !defined(_mm_undefined_ps)
  __forceinline __m128 _mm_undefined_ps() { return _mm_setzero_ps(); }
#endif
#if !defined(_mm_undefined_si128)
  __forceinline __m128i _mm_undefined_si128() { return _mm_setzero_si128(); }
#endif
#if !defined(_mm256_undefined_ps) && defined(__AVX__)
  __forceinline __m256 _mm256_undefined_ps() { return _mm256_setzero_ps(); }
#endif
#if !defined(_mm256_undefined_si256) && defined(__AVX__)
  __forceinline __m256i _mm256_undefined_si256() { return _mm256_setzero_si256(); }
#endif
#if !defined(_mm512_undefined_ps) && defined(__AVX512F__)
  __forceinline __m512 _mm512_undefined_ps() { return _mm512_setzero_ps(); }
#endif
#if !defined(_mm512_undefined_epi32) && defined(__AVX512F__)
  __forceinline __m512i _mm512_undefined_epi32() { return _mm512_setzero_si512(); }
#endif
#endif

#if defined(__SSE4_2__)

  __forceinline int popcnt(int in) {
    return _mm_popcnt_u32(in);
  }

  __forceinline unsigned popcnt(unsigned in) {
    return _mm_popcnt_u32(in);
  }

#if defined(__64BIT__)
  __forceinline size_t popcnt(size_t in) {
    return _mm_popcnt_u64(in);
  }
#endif

#endif

#if defined(__X86_ASM__)
  __forceinline uint64_t rdtsc()
  {
    int dummy[4];
    __cpuid(dummy,0);
    uint64_t clock = read_tsc();
    __cpuid(dummy,0);
    return clock;
  }
#endif
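
  /* Note: rdtsc above brackets the timestamp read with cpuid, which serializes the
     instruction stream so out-of-order execution does not skew the measurement. */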

  __forceinline void pause_cpu(const size_t N = 8)
  {
    for (size_t i=0; i<N; i++)
      _mm_pause();
  }
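
  /* pause_cpu issues a burst of PAUSE hints, intended for spin-wait loops to reduce
     power use and memory-order contention while waiting. */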

  /* prefetches */
  __forceinline void prefetchL1 (const void* ptr) { _mm_prefetch((const char*)ptr,_MM_HINT_T0);  }
  __forceinline void prefetchL2 (const void* ptr) { _mm_prefetch((const char*)ptr,_MM_HINT_T1);  }
  __forceinline void prefetchL3 (const void* ptr) { _mm_prefetch((const char*)ptr,_MM_HINT_T2);  }
  __forceinline void prefetchNTA(const void* ptr) { _mm_prefetch((const char*)ptr,_MM_HINT_NTA); }

  __forceinline void prefetchEX (const void* ptr) {
#if defined(__INTEL_COMPILER)
    _mm_prefetch((const char*)ptr,_MM_HINT_ET0);
#else
    _mm_prefetch((const char*)ptr,_MM_HINT_T0);
#endif
  }

  __forceinline void prefetchL1EX(const void* ptr) {
    prefetchEX(ptr);
  }

  __forceinline void prefetchL2EX(const void* ptr) {
    prefetchEX(ptr);
  }
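
  /* prefetchEX above requests an exclusive (write-intent) prefetch where the Intel
     compiler exposes _MM_HINT_ET0, and falls back to a regular T0 prefetch elsewhere. */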

#if defined(__AVX2__)
  __forceinline unsigned int pext(unsigned int a, unsigned int b) { return _pext_u32(a, b); }
  __forceinline unsigned int pdep(unsigned int a, unsigned int b) { return _pdep_u32(a, b); }
#if defined(__X86_64__)
  __forceinline size_t pext(size_t a, size_t b) { return _pext_u64(a, b); }
  __forceinline size_t pdep(size_t a, size_t b) { return _pdep_u64(a, b); }
#endif
#endif
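
  /* pext(a,mask) gathers the bits of a selected by mask into the low bits of the
     result; pdep scatters low bits back into the mask positions. Illustrative value
     (not from the original header): pext(0b101100u, 0b111000u) == 0b101u. */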

#if defined(__AVX512F__)
#if defined(__INTEL_COMPILER)
  __forceinline float mm512_cvtss_f32(__m512 v) {
    return _mm512_cvtss_f32(v);
  }
  __forceinline int mm512_mask2int(__mmask16 k1) {
    return _mm512_mask2int(k1);
  }
  __forceinline __mmask16 mm512_int2mask(int mask) {
    return _mm512_int2mask(mask);
  }
#else
  __forceinline float mm512_cvtss_f32(__m512 v) { // FIXME: _mm512_cvtss_f32 neither supported by clang v4.0.0 nor GCC 6.3
    return _mm_cvtss_f32(_mm512_castps512_ps128(v));
  }
  __forceinline int mm512_mask2int(__mmask16 k1) { // FIXME: _mm512_mask2int not yet supported by GCC 6.3
    return (int)k1;
  }
  __forceinline __mmask16 mm512_int2mask(int mask) { // FIXME: _mm512_int2mask not yet supported by GCC 6.3
    return (__mmask16)mask;
  }
#endif
#endif
}