TracyOpenCL.hpp 13 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339
  1. #ifndef __TRACYOPENCL_HPP__
  2. #define __TRACYOPENCL_HPP__
  3. #if !defined TRACY_ENABLE
  4. #define TracyCLContext(c, x) nullptr
  5. #define TracyCLDestroy(c)
  6. #define TracyCLContextName(c, x, y)
  7. #define TracyCLNamedZone(c, x, y, z)
  8. #define TracyCLNamedZoneC(c, x, y, z, w)
  9. #define TracyCLZone(c, x)
  10. #define TracyCLZoneC(c, x, y)
  11. #define TracyCLNamedZoneS(c, x, y, z, w)
  12. #define TracyCLNamedZoneCS(c, x, y, z, w, v)
  13. #define TracyCLZoneS(c, x, y)
  14. #define TracyCLZoneCS(c, x, y, z)
  15. #define TracyCLNamedZoneSetEvent(x, e)
  16. #define TracyCLZoneSetEvent(e)
  17. #define TracyCLCollect(c)
  18. namespace tracy
  19. {
  20. class OpenCLCtxScope {};
  21. }
  22. using TracyCLCtx = void*;
  23. #else
  24. #include <CL/cl.h>
  25. #include <atomic>
  26. #include <cassert>
  27. #include "Tracy.hpp"
  28. #include "client/TracyCallstack.hpp"
  29. #include "client/TracyProfiler.hpp"
  30. #include "common/TracyAlloc.hpp"
  31. namespace tracy {
  32. enum class EventPhase : uint8_t
  33. {
  34. Begin,
  35. End
  36. };
  37. struct EventInfo
  38. {
  39. cl_event event;
  40. EventPhase phase;
  41. };
  42. class OpenCLCtx
  43. {
  44. public:
  45. enum { QueryCount = 64 * 1024 };
  46. OpenCLCtx(cl_context context, cl_device_id device)
  47. : m_contextId(GetGpuCtxCounter().fetch_add(1, std::memory_order_relaxed))
  48. , m_head(0)
  49. , m_tail(0)
  50. {
  51. int64_t tcpu, tgpu;
  52. assert(m_contextId != 255);
  53. cl_int err = CL_SUCCESS;
  54. cl_command_queue queue = clCreateCommandQueue(context, device, CL_QUEUE_PROFILING_ENABLE, &err);
  55. assert(err == CL_SUCCESS);
  56. uint32_t dummyValue = 42;
  57. cl_mem dummyBuffer = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(uint32_t), nullptr, &err);
  58. assert(err == CL_SUCCESS);
  59. cl_event writeBufferEvent;
  60. err = clEnqueueWriteBuffer(queue, dummyBuffer, CL_FALSE, 0, sizeof(uint32_t), &dummyValue, 0, nullptr, &writeBufferEvent);
  61. assert(err == CL_SUCCESS);
  62. err = clWaitForEvents(1, &writeBufferEvent);
  63. tcpu = Profiler::GetTime();
  64. assert(err == CL_SUCCESS);
  65. cl_int eventStatus;
  66. err = clGetEventInfo(writeBufferEvent, CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof(cl_int), &eventStatus, nullptr);
  67. assert(err == CL_SUCCESS);
  68. assert(eventStatus == CL_COMPLETE);
  69. err = clGetEventProfilingInfo(writeBufferEvent, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &tgpu, nullptr);
  70. assert(err == CL_SUCCESS);
  71. err = clReleaseEvent(writeBufferEvent);
  72. assert(err == CL_SUCCESS);
  73. err = clReleaseMemObject(dummyBuffer);
  74. assert(err == CL_SUCCESS);
  75. err = clReleaseCommandQueue(queue);
  76. assert(err == CL_SUCCESS);
  77. auto item = Profiler::QueueSerial();
  78. MemWrite(&item->hdr.type, QueueType::GpuNewContext);
  79. MemWrite(&item->gpuNewContext.cpuTime, tcpu);
  80. MemWrite(&item->gpuNewContext.gpuTime, tgpu);
  81. memset(&item->gpuNewContext.thread, 0, sizeof(item->gpuNewContext.thread));
  82. MemWrite(&item->gpuNewContext.period, 1.0f);
  83. MemWrite(&item->gpuNewContext.type, GpuContextType::OpenCL);
  84. MemWrite(&item->gpuNewContext.context, (uint8_t) m_contextId);
  85. MemWrite(&item->gpuNewContext.flags, (uint8_t)0);
  86. #ifdef TRACY_ON_DEMAND
  87. GetProfiler().DeferItem(*item);
  88. #endif
  89. Profiler::QueueSerialFinish();
  90. }
  91. void Name( const char* name, uint16_t len )
  92. {
  93. auto ptr = (char*)tracy_malloc( len );
  94. memcpy( ptr, name, len );
  95. auto item = Profiler::QueueSerial();
  96. MemWrite( &item->hdr.type, QueueType::GpuContextName );
  97. MemWrite( &item->gpuContextNameFat.context, (uint8_t)m_contextId );
  98. MemWrite( &item->gpuContextNameFat.ptr, (uint64_t)ptr );
  99. MemWrite( &item->gpuContextNameFat.size, len );
  100. #ifdef TRACY_ON_DEMAND
  101. GetProfiler().DeferItem( *item );
  102. #endif
  103. Profiler::QueueSerialFinish();
  104. }
  105. void Collect()
  106. {
  107. ZoneScopedC(Color::Red4);
  108. if (m_tail == m_head) return;
  109. #ifdef TRACY_ON_DEMAND
  110. if (!GetProfiler().IsConnected())
  111. {
  112. m_head = m_tail = 0;
  113. }
  114. #endif
  115. while (m_tail != m_head)
  116. {
  117. EventInfo eventInfo = m_query[m_tail];
  118. cl_event event = eventInfo.event;
  119. cl_int eventStatus;
  120. cl_int err = clGetEventInfo(event, CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof(cl_int), &eventStatus, nullptr);
  121. assert(err == CL_SUCCESS);
  122. if (eventStatus != CL_COMPLETE) return;
  123. cl_int eventInfoQuery = (eventInfo.phase == EventPhase::Begin)
  124. ? CL_PROFILING_COMMAND_START
  125. : CL_PROFILING_COMMAND_END;
  126. cl_ulong eventTimeStamp = 0;
  127. err = clGetEventProfilingInfo(event, eventInfoQuery, sizeof(cl_ulong), &eventTimeStamp, nullptr);
  128. assert(err == CL_SUCCESS);
  129. assert(eventTimeStamp != 0);
  130. auto item = Profiler::QueueSerial();
  131. MemWrite(&item->hdr.type, QueueType::GpuTime);
  132. MemWrite(&item->gpuTime.gpuTime, (int64_t)eventTimeStamp);
  133. MemWrite(&item->gpuTime.queryId, (uint16_t)m_tail);
  134. MemWrite(&item->gpuTime.context, m_contextId);
  135. Profiler::QueueSerialFinish();
  136. if (eventInfo.phase == EventPhase::End)
  137. {
  138. // Done with the event, so release it
  139. err = clReleaseEvent(event);
  140. assert(err == CL_SUCCESS);
  141. }
  142. m_tail = (m_tail + 1) % QueryCount;
  143. }
  144. }
  145. tracy_force_inline uint8_t GetId() const
  146. {
  147. return m_contextId;
  148. }
  149. tracy_force_inline unsigned int NextQueryId(EventInfo eventInfo)
  150. {
  151. const auto id = m_head;
  152. m_head = (m_head + 1) % QueryCount;
  153. assert(m_head != m_tail);
  154. m_query[id] = eventInfo;
  155. return id;
  156. }
  157. tracy_force_inline EventInfo& GetQuery(unsigned int id)
  158. {
  159. assert(id < QueryCount);
  160. return m_query[id];
  161. }
  162. private:
  163. unsigned int m_contextId;
  164. EventInfo m_query[QueryCount];
  165. unsigned int m_head;
  166. unsigned int m_tail;
  167. };
  168. class OpenCLCtxScope {
  169. public:
  170. tracy_force_inline OpenCLCtxScope(OpenCLCtx* ctx, const SourceLocationData* srcLoc, bool is_active)
  171. #ifdef TRACY_ON_DEMAND
  172. : m_active(is_active&& GetProfiler().IsConnected())
  173. #else
  174. : m_active(is_active)
  175. #endif
  176. , m_ctx(ctx)
  177. , m_event(nullptr)
  178. {
  179. if (!m_active) return;
  180. m_beginQueryId = ctx->NextQueryId(EventInfo{ nullptr, EventPhase::Begin });
  181. auto item = Profiler::QueueSerial();
  182. MemWrite(&item->hdr.type, QueueType::GpuZoneBeginSerial);
  183. MemWrite(&item->gpuZoneBegin.cpuTime, Profiler::GetTime());
  184. MemWrite(&item->gpuZoneBegin.srcloc, (uint64_t)srcLoc);
  185. MemWrite(&item->gpuZoneBegin.thread, GetThreadHandle());
  186. MemWrite(&item->gpuZoneBegin.queryId, (uint16_t)m_beginQueryId);
  187. MemWrite(&item->gpuZoneBegin.context, ctx->GetId());
  188. Profiler::QueueSerialFinish();
  189. }
  190. tracy_force_inline OpenCLCtxScope(OpenCLCtx* ctx, const SourceLocationData* srcLoc, int depth, bool is_active)
  191. #ifdef TRACY_ON_DEMAND
  192. : m_active(is_active&& GetProfiler().IsConnected())
  193. #else
  194. : m_active(is_active)
  195. #endif
  196. , m_ctx(ctx)
  197. , m_event(nullptr)
  198. {
  199. if (!m_active) return;
  200. m_beginQueryId = ctx->NextQueryId(EventInfo{ nullptr, EventPhase::Begin });
  201. GetProfiler().SendCallstack(depth);
  202. auto item = Profiler::QueueSerial();
  203. MemWrite(&item->hdr.type, QueueType::GpuZoneBeginCallstackSerial);
  204. MemWrite(&item->gpuZoneBegin.cpuTime, Profiler::GetTime());
  205. MemWrite(&item->gpuZoneBegin.srcloc, (uint64_t)srcLoc);
  206. MemWrite(&item->gpuZoneBegin.thread, GetThreadHandle());
  207. MemWrite(&item->gpuZoneBegin.queryId, (uint16_t)m_beginQueryId);
  208. MemWrite(&item->gpuZoneBegin.context, ctx->GetId());
  209. Profiler::QueueSerialFinish();
  210. }
  211. tracy_force_inline void SetEvent(cl_event event)
  212. {
  213. if (!m_active) return;
  214. m_event = event;
  215. cl_int err = clRetainEvent(m_event);
  216. assert(err == CL_SUCCESS);
  217. m_ctx->GetQuery(m_beginQueryId).event = m_event;
  218. }
  219. tracy_force_inline ~OpenCLCtxScope()
  220. {
  221. if (!m_active) return;
  222. const auto queryId = m_ctx->NextQueryId(EventInfo{ m_event, EventPhase::End });
  223. auto item = Profiler::QueueSerial();
  224. MemWrite(&item->hdr.type, QueueType::GpuZoneEndSerial);
  225. MemWrite(&item->gpuZoneEnd.cpuTime, Profiler::GetTime());
  226. MemWrite(&item->gpuZoneEnd.thread, GetThreadHandle());
  227. MemWrite(&item->gpuZoneEnd.queryId, (uint16_t)queryId);
  228. MemWrite(&item->gpuZoneEnd.context, m_ctx->GetId());
  229. Profiler::QueueSerialFinish();
  230. }
  231. const bool m_active;
  232. OpenCLCtx* m_ctx;
  233. cl_event m_event;
  234. unsigned int m_beginQueryId;
  235. };
  236. static inline OpenCLCtx* CreateCLContext(cl_context context, cl_device_id device)
  237. {
  238. InitRPMallocThread();
  239. auto ctx = (OpenCLCtx*)tracy_malloc(sizeof(OpenCLCtx));
  240. new (ctx) OpenCLCtx(context, device);
  241. return ctx;
  242. }
  243. static inline void DestroyCLContext(OpenCLCtx* ctx)
  244. {
  245. ctx->~OpenCLCtx();
  246. tracy_free(ctx);
  247. }
  248. } // namespace tracy
  249. using TracyCLCtx = tracy::OpenCLCtx*;
  250. #define TracyCLContext(context, device) tracy::CreateCLContext(context, device);
  251. #define TracyCLDestroy(ctx) tracy::DestroyCLContext(ctx);
  252. #define TracyCLContextName(context, name, size) ctx->Name(name, size);
  253. #if defined TRACY_HAS_CALLSTACK && defined TRACY_CALLSTACK
  254. # define TracyCLNamedZone(ctx, varname, name, active) static constexpr tracy::SourceLocationData TracyConcat(__tracy_gpu_source_location,__LINE__) { name, __FUNCTION__, __FILE__, (uint32_t)__LINE__, 0 }; tracy::OpenCLCtxScope varname(ctx, &TracyConcat(__tracy_gpu_source_location,__LINE__), TRACY_CALLSTACK, active );
  255. # define TracyCLNamedZoneC(ctx, varname, name, color, active) static constexpr tracy::SourceLocationData TracyConcat(__tracy_gpu_source_location,__LINE__) { name, __FUNCTION__, __FILE__, (uint32_t)__LINE__, color }; tracy::OpenCLCtxScope varname(ctx, &TracyConcat(__tracy_gpu_source_location,__LINE__), TRACY_CALLSTACK, active );
  256. # define TracyCLZone(ctx, name) TracyCLNamedZoneS(ctx, __tracy_gpu_zone, name, TRACY_CALLSTACK, true)
  257. # define TracyCLZoneC(ctx, name, color) TracyCLNamedZoneCS(ctx, __tracy_gpu_zone, name, color, TRACY_CALLSTACK, true)
  258. #else
  259. # define TracyCLNamedZone(ctx, varname, name, active) static constexpr tracy::SourceLocationData TracyConcat(__tracy_gpu_source_location,__LINE__){ name, __FUNCTION__, __FILE__, (uint32_t)__LINE__, 0 }; tracy::OpenCLCtxScope varname(ctx, &TracyConcat(__tracy_gpu_source_location,__LINE__), active);
  260. # define TracyCLNamedZoneC(ctx, varname, name, color, active) static constexpr tracy::SourceLocationData TracyConcat(__tracy_gpu_source_location,__LINE__){ name, __FUNCTION__, __FILE__, (uint32_t)__LINE__, color }; tracy::OpenCLCtxScope varname(ctx, &TracyConcat(__tracy_gpu_source_location,__LINE__), active);
  261. # define TracyCLZone(ctx, name) TracyCLNamedZone(ctx, __tracy_gpu_zone, name, true)
  262. # define TracyCLZoneC(ctx, name, color) TracyCLNamedZoneC(ctx, __tracy_gpu_zone, name, color, true )
  263. #endif
  264. #ifdef TRACY_HAS_CALLSTACK
  265. # define TracyCLNamedZoneS(ctx, varname, name, depth, active) static constexpr tracy::SourceLocationData TracyConcat(__tracy_gpu_source_location,__LINE__){ name, __FUNCTION__, __FILE__, (uint32_t)__LINE__, 0 }; tracy::OpenCLCtxScope varname(ctx, &TracyConcat(__tracy_gpu_source_location,__LINE__), depth, active);
  266. # define TracyCLNamedZoneCS(ctx, varname, name, color, depth, active) static constexpr tracy::SourceLocationData TracyConcat(__tracy_gpu_source_location,__LINE__){ name, __FUNCTION__, __FILE__, (uint32_t)__LINE__, color }; tracy::OpenCLCtxScope varname(ctx, &TracyConcat(__tracy_gpu_source_location,__LINE__), depth, active);
  267. # define TracyCLZoneS(ctx, name, depth) TracyCLNamedZoneS(ctx, __tracy_gpu_zone, name, depth, true)
  268. # define TracyCLZoneCS(ctx, name, color, depth) TracyCLNamedZoneCS(ctx, __tracy_gpu_zone, name, color, depth, true)
  269. #else
  270. # define TracyCLNamedZoneS(ctx, varname, name, depth, active) TracyCLNamedZone(ctx, varname, name, active)
  271. # define TracyCLNamedZoneCS(ctx, varname, name, color, depth, active) TracyCLNamedZoneC(ctx, varname, name, color, active)
  272. # define TracyCLZoneS(ctx, name, depth) TracyCLZone(ctx, name)
  273. # define TracyCLZoneCS(ctx, name, color, depth) TracyCLZoneC(ctx, name, color)
  274. #endif
  275. #define TracyCLNamedZoneSetEvent(varname, event) varname.SetEvent(event)
  276. #define TracyCLZoneSetEvent(event) __tracy_gpu_zone.SetEvent(event)
  277. #define TracyCLCollect(ctx) ctx->Collect()
  278. #endif
  279. #endif