sycl.h 15 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307
  1. // Copyright 2009-2021 Intel Corporation
  2. // SPDX-License-Identifier: Apache-2.0
  3. #pragma once
  4. #include "platform.h"
  5. using sycl::float16;
  6. using sycl::float8;
  7. using sycl::float4;
  8. using sycl::float3;
  9. using sycl::float2;
  10. using sycl::int16;
  11. using sycl::int8;
  12. using sycl::int4;
  13. using sycl::int3;
  14. using sycl::int2;
  15. using sycl::uint16;
  16. using sycl::uint8;
  17. using sycl::uint4;
  18. using sycl::uint3;
  19. using sycl::uint2;
  20. using sycl::uchar16;
  21. using sycl::uchar8;
  22. using sycl::uchar4;
  23. using sycl::uchar3;
  24. using sycl::uchar2;
  25. using sycl::ushort16;
  26. using sycl::ushort8;
  27. using sycl::ushort4;
  28. using sycl::ushort3;
  29. using sycl::ushort2;
  30. #ifdef __SYCL_DEVICE_ONLY__
  31. #define GLOBAL __attribute__((opencl_global))
  32. #define LOCAL __attribute__((opencl_local))
  33. SYCL_EXTERNAL extern int work_group_reduce_add(int x);
  34. SYCL_EXTERNAL extern float work_group_reduce_min(float x);
  35. SYCL_EXTERNAL extern float work_group_reduce_max(float x);
  36. SYCL_EXTERNAL extern float atomic_min(volatile GLOBAL float *p, float val);
  37. SYCL_EXTERNAL extern float atomic_min(volatile LOCAL float *p, float val);
  38. SYCL_EXTERNAL extern float atomic_max(volatile GLOBAL float *p, float val);
  39. SYCL_EXTERNAL extern float atomic_max(volatile LOCAL float *p, float val);
  40. SYCL_EXTERNAL extern "C" unsigned int intel_sub_group_ballot(bool valid);
  41. SYCL_EXTERNAL extern "C" void __builtin_IB_assume_uniform(void *p);
  42. // Load message caching control
  43. enum LSC_LDCC {
  44. LSC_LDCC_DEFAULT,
  45. LSC_LDCC_L1UC_L3UC, // Override to L1 uncached and L3 uncached
  46. LSC_LDCC_L1UC_L3C, // Override to L1 uncached and L3 cached
  47. LSC_LDCC_L1C_L3UC, // Override to L1 cached and L3 uncached
  48. LSC_LDCC_L1C_L3C, // Override to L1 cached and L3 cached
  49. LSC_LDCC_L1S_L3UC, // Override to L1 streaming load and L3 uncached
  50. LSC_LDCC_L1S_L3C, // Override to L1 streaming load and L3 cached
  51. LSC_LDCC_L1IAR_L3C, // Override to L1 invalidate-after-read, and L3 cached
  52. };
  53. // Store message caching control (also used for atomics)
  54. enum LSC_STCC {
  55. LSC_STCC_DEFAULT,
  56. LSC_STCC_L1UC_L3UC, // Override to L1 uncached and L3 uncached
  57. LSC_STCC_L1UC_L3WB, // Override to L1 uncached and L3 written back
  58. LSC_STCC_L1WT_L3UC, // Override to L1 written through and L3 uncached
  59. LSC_STCC_L1WT_L3WB, // Override to L1 written through and L3 written back
  60. LSC_STCC_L1S_L3UC, // Override to L1 streaming and L3 uncached
  61. LSC_STCC_L1S_L3WB, // Override to L1 streaming and L3 written back
  62. LSC_STCC_L1WB_L3WB, // Override to L1 written through and L3 written back
  63. };
  64. ///////////////////////////////////////////////////////////////////////
  65. // LSC Loads
  66. ///////////////////////////////////////////////////////////////////////
  67. SYCL_EXTERNAL /* extern "C" */ uint32_t __builtin_IB_lsc_load_global_uchar_to_uint (const GLOBAL uint8_t *base, int elemOff, enum LSC_LDCC cacheOpt); //D8U32
  68. SYCL_EXTERNAL /* extern "C" */ uint32_t __builtin_IB_lsc_load_global_ushort_to_uint(const GLOBAL uint16_t *base, int elemOff, enum LSC_LDCC cacheOpt); //D16U32
  69. SYCL_EXTERNAL /* extern "C" */ uint32_t __builtin_IB_lsc_load_global_uint (const GLOBAL uint32_t *base, int elemOff, enum LSC_LDCC cacheOpt); //D32V1
  70. SYCL_EXTERNAL /* extern "C" */ sycl::uint2 __builtin_IB_lsc_load_global_uint2 (const GLOBAL sycl::uint2 *base, int elemOff, enum LSC_LDCC cacheOpt); //D32V2
  71. SYCL_EXTERNAL /* extern "C" */ sycl::uint3 __builtin_IB_lsc_load_global_uint3 (const GLOBAL sycl::uint3 *base, int elemOff, enum LSC_LDCC cacheOpt); //D32V3
  72. SYCL_EXTERNAL /* extern "C" */ sycl::uint4 __builtin_IB_lsc_load_global_uint4 (const GLOBAL sycl::uint4 *base, int elemOff, enum LSC_LDCC cacheOpt); //D32V4
  73. SYCL_EXTERNAL /* extern "C" */ sycl::uint8 __builtin_IB_lsc_load_global_uint8 (const GLOBAL sycl::uint8 *base, int elemOff, enum LSC_LDCC cacheOpt); //D32V8
  74. SYCL_EXTERNAL /* extern "C" */ uint64_t __builtin_IB_lsc_load_global_ulong (const GLOBAL uint64_t *base, int elemOff, enum LSC_LDCC cacheOpt); //D64V1
  75. SYCL_EXTERNAL /* extern "C" */ sycl::ulong2 __builtin_IB_lsc_load_global_ulong2 (const GLOBAL sycl::ulong2 *base, int elemOff, enum LSC_LDCC cacheOpt); //D64V2
  76. SYCL_EXTERNAL /* extern "C" */ sycl::ulong3 __builtin_IB_lsc_load_global_ulong3 (const GLOBAL sycl::ulong3 *base, int elemOff, enum LSC_LDCC cacheOpt); //D64V3
  77. SYCL_EXTERNAL /* extern "C" */ sycl::ulong4 __builtin_IB_lsc_load_global_ulong4 (const GLOBAL sycl::ulong4 *base, int elemOff, enum LSC_LDCC cacheOpt); //D64V4
  78. SYCL_EXTERNAL /* extern "C" */ sycl::ulong8 __builtin_IB_lsc_load_global_ulong8 (const GLOBAL sycl::ulong8 *base, int elemOff, enum LSC_LDCC cacheOpt); //D64V8
  79. // global address space
  80. SYCL_EXTERNAL extern "C" void __builtin_IB_lsc_store_global_uchar_from_uint (GLOBAL uint8_t *base, int immElemOff, uint32_t val, enum LSC_STCC cacheOpt); //D8U32
  81. SYCL_EXTERNAL extern "C" void __builtin_IB_lsc_store_global_ushort_from_uint(GLOBAL uint16_t *base, int immElemOff, uint32_t val, enum LSC_STCC cacheOpt); //D16U32
  82. SYCL_EXTERNAL extern "C" void __builtin_IB_lsc_store_global_uint (GLOBAL uint32_t *base, int immElemOff, uint32_t val, enum LSC_STCC cacheOpt); //D32V1
  83. SYCL_EXTERNAL extern "C" void __builtin_IB_lsc_store_global_uint2 (GLOBAL sycl::uint2 *base, int immElemOff, sycl::uint2 val, enum LSC_STCC cacheOpt); //D32V2
  84. SYCL_EXTERNAL extern "C" void __builtin_IB_lsc_store_global_uint3 (GLOBAL sycl::uint3 *base, int immElemOff, sycl::uint3 val, enum LSC_STCC cacheOpt); //D32V3
  85. SYCL_EXTERNAL extern "C" void __builtin_IB_lsc_store_global_uint4 (GLOBAL sycl::uint4 *base, int immElemOff, sycl::uint4 val, enum LSC_STCC cacheOpt); //D32V4
  86. SYCL_EXTERNAL extern "C" void __builtin_IB_lsc_store_global_uint8 (GLOBAL sycl::uint8 *base, int immElemOff, sycl::uint8 val, enum LSC_STCC cacheOpt); //D32V8
  87. SYCL_EXTERNAL extern "C" void __builtin_IB_lsc_store_global_ulong (GLOBAL uint64_t *base, int immElemOff, uint64_t val, enum LSC_STCC cacheOpt); //D64V1
  88. SYCL_EXTERNAL extern "C" void __builtin_IB_lsc_store_global_ulong2 (GLOBAL sycl::ulong2 *base, int immElemOff, sycl::ulong2 val, enum LSC_STCC cacheOpt); //D64V2
  89. SYCL_EXTERNAL extern "C" void __builtin_IB_lsc_store_global_ulong3 (GLOBAL sycl::ulong3 *base, int immElemOff, sycl::ulong3 val, enum LSC_STCC cacheOpt); //D64V3
  90. SYCL_EXTERNAL extern "C" void __builtin_IB_lsc_store_global_ulong4 (GLOBAL sycl::ulong4 *base, int immElemOff, sycl::ulong4 val, enum LSC_STCC cacheOpt); //D64V4
  91. SYCL_EXTERNAL extern "C" void __builtin_IB_lsc_store_global_ulong8 (GLOBAL sycl::ulong8 *base, int immElemOff, sycl::ulong8 val, enum LSC_STCC cacheOpt); //D64V8
  92. ///////////////////////////////////////////////////////////////////////
  93. // prefetching
  94. ///////////////////////////////////////////////////////////////////////
  95. //
  96. // LSC Pre-Fetch Load functions with CacheControls
  97. // global address space
  98. SYCL_EXTERNAL extern "C" void __builtin_IB_lsc_prefetch_global_uchar (const GLOBAL uint8_t *base, int immElemOff, enum LSC_LDCC cacheOpt); //D8U32
  99. SYCL_EXTERNAL extern "C" void __builtin_IB_lsc_prefetch_global_ushort(const GLOBAL uint16_t *base, int immElemOff, enum LSC_LDCC cacheOpt); //D16U32
  100. SYCL_EXTERNAL extern "C" void __builtin_IB_lsc_prefetch_global_uint (const GLOBAL uint32_t *base, int immElemOff, enum LSC_LDCC cacheOpt); //D32V1
  101. SYCL_EXTERNAL extern "C" void __builtin_IB_lsc_prefetch_global_uint2 (const GLOBAL sycl::uint2 *base, int immElemOff, enum LSC_LDCC cacheOpt); //D32V2
  102. SYCL_EXTERNAL extern "C" void __builtin_IB_lsc_prefetch_global_uint3 (const GLOBAL sycl::uint3 *base, int immElemOff, enum LSC_LDCC cacheOpt); //D32V3
  103. SYCL_EXTERNAL extern "C" void __builtin_IB_lsc_prefetch_global_uint4 (const GLOBAL sycl::uint4 *base, int immElemOff, enum LSC_LDCC cacheOpt); //D32V4
  104. SYCL_EXTERNAL extern "C" void __builtin_IB_lsc_prefetch_global_uint8 (const GLOBAL sycl::uint8 *base, int immElemOff, enum LSC_LDCC cacheOpt); //D32V8
  105. SYCL_EXTERNAL extern "C" void __builtin_IB_lsc_prefetch_global_ulong (const GLOBAL uint64_t *base, int immElemOff, enum LSC_LDCC cacheOpt); //D64V1
  106. SYCL_EXTERNAL extern "C" void __builtin_IB_lsc_prefetch_global_ulong2(const GLOBAL sycl::ulong2 *base, int immElemOff, enum LSC_LDCC cacheOpt); //D64V2
  107. SYCL_EXTERNAL extern "C" void __builtin_IB_lsc_prefetch_global_ulong3(const GLOBAL sycl::ulong3 *base, int immElemOff, enum LSC_LDCC cacheOpt); //D64V3
  108. SYCL_EXTERNAL extern "C" void __builtin_IB_lsc_prefetch_global_ulong4(const GLOBAL sycl::ulong4 *base, int immElemOff, enum LSC_LDCC cacheOpt); //D64V4
  109. SYCL_EXTERNAL extern "C" void __builtin_IB_lsc_prefetch_global_ulong8(const GLOBAL sycl::ulong8 *base, int immElemOff, enum LSC_LDCC cacheOpt); //D64V8
  110. #else
  111. #define GLOBAL
  112. #define LOCAL
  113. /* dummy functions for host */
  114. inline int work_group_reduce_add(int x) { return x; }
  115. inline float work_group_reduce_min(float x) { return x; }
  116. inline float work_group_reduce_max(float x) { return x; }
  117. inline float atomic_min(volatile float *p, float val) { return val; };
  118. inline float atomic_max(volatile float *p, float val) { return val; };
  119. inline uint32_t intel_sub_group_ballot(bool valid) { return 0; }
  120. #endif
  121. /* creates a temporary that is enforced to be uniform */
  122. #define SYCL_UNIFORM_VAR(Ty,tmp,k) \
  123. Ty tmp##_data; \
  124. Ty* p##tmp##_data = (Ty*) sub_group_broadcast((uint64_t)&tmp##_data,k); \
  125. Ty& tmp = *p##tmp##_data;
  126. #if !defined(__forceinline)
  127. #define __forceinline inline __attribute__((always_inline))
  128. #endif
  129. #if __SYCL_COMPILER_VERSION < 20210801
  130. #define all_of_group all_of
  131. #define any_of_group any_of
  132. #define none_of_group none_of
  133. #define group_broadcast broadcast
  134. #define reduce_over_group reduce
  135. #define exclusive_scan_over_group exclusive_scan
  136. #define inclusive_scan_over_group inclusive_scan
  137. #endif
  138. namespace embree
  139. {
  140. template<typename T>
  141. __forceinline T cselect(const bool mask, const T &a, const T &b)
  142. {
  143. return sycl::select(b,a,(int)mask);
  144. }
  145. template<typename T, typename M>
  146. __forceinline T cselect(const M &mask, const T &a, const T &b)
  147. {
  148. return sycl::select(b,a,mask);
  149. }
  150. __forceinline const sycl::sub_group this_sub_group() {
  151. return sycl::ext::oneapi::experimental::this_sub_group();
  152. }
  153. __forceinline const uint32_t get_sub_group_local_id() {
  154. return this_sub_group().get_local_id()[0];
  155. }
  156. __forceinline const uint32_t get_sub_group_size() {
  157. return this_sub_group().get_max_local_range().size();
  158. }
  159. __forceinline const uint32_t get_sub_group_id() {
  160. return this_sub_group().get_group_id()[0];
  161. }
  162. __forceinline const uint32_t get_num_sub_groups() {
  163. return this_sub_group().get_group_range().size();
  164. }
  165. __forceinline uint32_t sub_group_ballot(bool pred) {
  166. return intel_sub_group_ballot(pred);
  167. }
  168. __forceinline bool sub_group_all_of(bool pred) {
  169. return sycl::all_of_group(this_sub_group(),pred);
  170. }
  171. __forceinline bool sub_group_any_of(bool pred) {
  172. return sycl::any_of_group(this_sub_group(),pred);
  173. }
  174. __forceinline bool sub_group_none_of(bool pred) {
  175. return sycl::none_of_group(this_sub_group(),pred);
  176. }
  177. template <typename T> __forceinline T sub_group_broadcast(T x, sycl::id<1> local_id) {
  178. return sycl::group_broadcast<sycl::sub_group>(this_sub_group(),x,local_id);
  179. }
  180. template <typename T> __forceinline T sub_group_make_uniform(T x) {
  181. return sub_group_broadcast(x,sycl::ctz(intel_sub_group_ballot(true)));
  182. }
  183. __forceinline void assume_uniform_array(void* ptr) {
  184. #ifdef __SYCL_DEVICE_ONLY__
  185. __builtin_IB_assume_uniform(ptr);
  186. #endif
  187. }
  188. template <typename T, class BinaryOperation> __forceinline T sub_group_reduce(T x, BinaryOperation binary_op) {
  189. return sycl::reduce_over_group<sycl::sub_group>(this_sub_group(),x,binary_op);
  190. }
  191. template <typename T, class BinaryOperation> __forceinline T sub_group_reduce(T x, T init, BinaryOperation binary_op) {
  192. return sycl::reduce_over_group<sycl::sub_group>(this_sub_group(),x,init,binary_op);
  193. }
  194. template <typename T> __forceinline T sub_group_reduce_min(T x, T init) {
  195. return sub_group_reduce(x, init, sycl::ext::oneapi::minimum<T>());
  196. }
  197. template <typename T> __forceinline T sub_group_reduce_min(T x) {
  198. return sub_group_reduce(x, sycl::ext::oneapi::minimum<T>());
  199. }
  200. template <typename T> __forceinline T sub_group_reduce_max(T x) {
  201. return sub_group_reduce(x, sycl::ext::oneapi::maximum<T>());
  202. }
  203. template <typename T> __forceinline T sub_group_reduce_add(T x) {
  204. return sub_group_reduce(x, sycl::ext::oneapi::plus<T>());
  205. }
  206. template <typename T, class BinaryOperation> __forceinline T sub_group_exclusive_scan(T x, BinaryOperation binary_op) {
  207. return sycl::exclusive_scan_over_group(this_sub_group(),x,binary_op);
  208. }
  209. template <typename T, class BinaryOperation> __forceinline T sub_group_exclusive_scan_min(T x) {
  210. return sub_group_exclusive_scan(x,sycl::ext::oneapi::minimum<T>());
  211. }
  212. template <typename T, class BinaryOperation> __forceinline T sub_group_exclusive_scan(T x, T init, BinaryOperation binary_op) {
  213. return sycl::exclusive_scan_over_group(this_sub_group(),x,init,binary_op);
  214. }
  215. template <typename T, class BinaryOperation> __forceinline T sub_group_inclusive_scan(T x, BinaryOperation binary_op) {
  216. return sycl::inclusive_scan_over_group(this_sub_group(),x,binary_op);
  217. }
  218. template <typename T, class BinaryOperation> __forceinline T sub_group_inclusive_scan(T x, BinaryOperation binary_op, T init) {
  219. return sycl::inclusive_scan_over_group(this_sub_group(),x,binary_op,init);
  220. }
  221. template <typename T> __forceinline T sub_group_shuffle(T x, sycl::id<1> local_id) {
  222. return this_sub_group().shuffle(x, local_id);
  223. }
  224. template <typename T> __forceinline T sub_group_shuffle_down(T x, uint32_t delta) {
  225. return this_sub_group().shuffle_down(x, delta);
  226. }
  227. template <typename T> __forceinline T sub_group_shuffle_up(T x, uint32_t delta) {
  228. return this_sub_group().shuffle_up(x, delta);
  229. }
  230. template <typename T> __forceinline T sub_group_load(const void* src) {
  231. return this_sub_group().load(sycl::multi_ptr<T,sycl::access::address_space::global_space>((T*)src));
  232. }
  233. template <typename T> __forceinline void sub_group_store(void* dst, const T& x) {
  234. this_sub_group().store(sycl::multi_ptr<T,sycl::access::address_space::global_space>((T*)dst),x);
  235. }
  236. }
  237. #if __SYCL_COMPILER_VERSION < 20210801
  238. #undef all_of_group
  239. #undef any_of_group
  240. #undef none_of_group
  241. #undef group_broadcast
  242. #undef reduce_over_group
  243. #undef exclusive_scan_over_group
  244. #undef inclusive_scan_over_group
  245. #endif