BsProfilerCPU.h 14 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424
  1. //********************************** Banshee Engine (www.banshee3d.com) **************************************************//
  2. //**************** Copyright (c) 2016 Marko Pintera ([email protected]). All rights reserved. **********************//
  3. #pragma once
  4. #include "BsCorePrerequisites.h"
  5. #include "BsModule.h"
  6. #include "BsFrameAlloc.h"
  7. namespace BansheeEngine
  8. {
  9. /** @addtogroup Profiling
  10. * @{
  11. */
  12. class CPUProfilerReport;
  13. /**
  14. * Provides various performance measuring methods.
  15. *
  16. * @note Thread safe. Matching begin* \ end* calls must belong to the same thread though.
  17. */
  18. class BS_CORE_EXPORT ProfilerCPU : public Module<ProfilerCPU>
  19. {
  20. /** Timer class responsible for tracking elapsed time. */
  21. class Timer
  22. {
  23. public:
  24. Timer();
  25. /** Sets the start time for the timer. */
  26. void start();
  27. /** Stops the timer and calculates the elapsed time from start time to now. */
  28. void stop();
  29. /** Resets the elapsed time to zero. */
  30. void reset();
  31. double time;
  32. private:
  33. double startTime;
  34. std::chrono::high_resolution_clock mHRClock;
  35. /** Returns time elapsed since CPU was started in millseconds. */
  36. inline double getCurrentTime() const;
  37. };
  38. /** Timer class responsible for tracking number of elapsed CPU cycles. */
  39. class TimerPrecise
  40. {
  41. public:
  42. TimerPrecise();
  43. /** Starts the counter marking the current number of executed CPU cycles since CPU was started. */
  44. void start();
  45. /** Ends the counter and calculates the number of CPU cycles between now and the start time. */
  46. void stop();
  47. /** Resets the cycle count to zero. */
  48. void reset();
  49. UINT64 cycles;
  50. private:
  51. UINT64 startCycles;
  52. /** Queries the CPU for the current number of CPU cycles executed since the program was started. */
  53. static inline UINT64 getNumCycles();
  54. };
  55. /**
  56. * Contains data about a single profiler sample (counting time in milliseconds).
  57. *
  58. * @note
  59. * A sample is created whenever a named profile block is entered. For example if you have a function you are
  60. * profiling, and it gets called 10 times, there will be 10 samples.
  61. */
  62. struct ProfileSample
  63. {
  64. ProfileSample(double _time, UINT64 _numAllocs, UINT64 _numFrees)
  65. :time(_time), numAllocs(_numAllocs), numFrees(_numFrees)
  66. { }
  67. double time;
  68. UINT64 numAllocs;
  69. UINT64 numFrees;
  70. };
  71. /**
  72. * Contains data about a single precise profiler sample (counting CPU cycles).
  73. *
  74. * @note
  75. * A sample is created whenever a named profile block is entered. For example if you have a function you are
  76. * profiling, and it gets called 10 times, there will be 10 samples.
  77. */
  78. struct PreciseProfileSample
  79. {
  80. PreciseProfileSample(UINT64 _cycles, UINT64 _numAllocs, UINT64 _numFrees)
  81. :cycles(_cycles), numAllocs(_numAllocs), numFrees(_numFrees)
  82. { }
  83. UINT64 cycles;
  84. UINT64 numAllocs;
  85. UINT64 numFrees;
  86. };
  87. /** Contains basic (time based) profiling data contained in a profiling block. */
  88. struct ProfileData
  89. {
  90. ProfileData(FrameAlloc* alloc);
  91. /** Begins a new sample and records current sample state. Previous sample must not be active. */
  92. void beginSample();
  93. /**
  94. * Records current sample state and creates a new sample based on start and end state. Adds the sample to the
  95. * sample list.
  96. */
  97. void endSample();
  98. /**
  99. * Removes the last added sample from the sample list and makes it active again. You must call endSample()
  100. * when done as if you called beginSample().
  101. */
  102. void resumeLastSample();
  103. Vector<ProfileSample, StdFrameAlloc<ProfileSample>> samples;
  104. Timer timer;
  105. UINT64 memAllocs;
  106. UINT64 memFrees;
  107. };
  108. /** Contains precise (CPU cycle based) profiling data contained in a profiling block. */
  109. struct PreciseProfileData
  110. {
  111. PreciseProfileData(FrameAlloc* alloc);
  112. /** Begins a new sample and records current sample state. Previous sample must not be active. */
  113. void beginSample();
  114. /**
  115. * Records current sample state and creates a new sample based on start and end state. Adds the sample to the
  116. * sample list.
  117. */
  118. void endSample();
  119. /**
  120. * Removes the last added sample from the sample list and makes it active again. You must call endSample()
  121. * when done as if you called beginSample.
  122. */
  123. void resumeLastSample();
  124. Vector<PreciseProfileSample, StdFrameAlloc<ProfileSample>> samples;
  125. TimerPrecise timer;
  126. UINT64 memAllocs;
  127. UINT64 memFrees;
  128. };
  129. /**
  130. * Contains all sampling information about a single named profiling block. Each block has its own sampling
  131. * information and optionally child blocks.
  132. */
  133. struct ProfiledBlock
  134. {
  135. ProfiledBlock(FrameAlloc* alloc);
  136. ~ProfiledBlock();
  137. /** Attempts to find a child block with the specified name. Returns null if not found. */
  138. ProfiledBlock* findChild(const char* name) const;
  139. char* name;
  140. ProfileData basic;
  141. PreciseProfileData precise;
  142. Vector<ProfiledBlock*, StdFrameAlloc<ProfiledBlock*>> children;
  143. };
  144. /** CPU sampling type. */
  145. enum class ActiveSamplingType
  146. {
  147. Basic, /**< Sample using milliseconds. */
  148. Precise /**< Sample using CPU cycles. */
  149. };
  150. /** Contains data about the currently active profiling block. */
  151. struct ActiveBlock
  152. {
  153. ActiveBlock()
  154. :type(ActiveSamplingType::Basic), block(nullptr)
  155. { }
  156. ActiveBlock(ActiveSamplingType _type, ProfiledBlock* _block)
  157. :type(_type), block(_block)
  158. { }
  159. ActiveSamplingType type;
  160. ProfiledBlock* block;
  161. };
  162. /** Contains data about an active profiling thread. */
  163. struct ThreadInfo
  164. {
  165. ThreadInfo();
  166. /**
  167. * Starts profiling on the thread. New primary profiling block is created with the given name.
  168. */
  169. void begin(const char* _name);
  170. /**
  171. * Ends profiling on the thread. You should end all samples before calling this, but if you don't they will be
  172. * terminated automatically.
  173. */
  174. void end();
  175. /**
  176. * Deletes all internal profiling data and makes the object ready for another iteration. Should be called
  177. * after end in order to delete any existing data.
  178. */
  179. void reset();
  180. /** Gets the primary profiling block used by the thread. */
  181. ProfiledBlock* getBlock(const char* name);
  182. /** Deletes the provided block. */
  183. void releaseBlock(ProfiledBlock* block);
  184. static BS_THREADLOCAL ThreadInfo* activeThread;
  185. bool isActive;
  186. ProfiledBlock* rootBlock;
  187. FrameAlloc frameAlloc;
  188. ActiveBlock activeBlock;
  189. Stack<ActiveBlock, StdFrameAlloc<ActiveBlock>>* activeBlocks;
  190. };
  191. public:
  192. ProfilerCPU();
  193. ~ProfilerCPU();
  194. /**
  195. * Registers a new thread we will be doing sampling in. This needs to be called before any beginSample* \ endSample*
  196. * calls are made in that thread.
  197. *
  198. * @param[in] name Name that will allow you to more easily identify the thread.
  199. */
  200. void beginThread(const char* name);
  201. /** Ends sampling for the current thread. No beginSample* \ endSample* calls after this point. */
  202. void endThread();
  203. /**
  204. * Begins sample measurement. Must be followed by endSample().
  205. *
  206. * @param[in] name Unique name for the sample you can later use to find the sampling data.
  207. */
  208. void beginSample(const char* name);
  209. /**
  210. * Ends sample measurement.
  211. *
  212. * @param[in] name Unique name for the sample.
  213. *
  214. * @note
  215. * Unique name is primarily needed to more easily identify mismatched begin/end sample pairs. Otherwise the name in
  216. * beginSample() would be enough.
  217. */
  218. void endSample(const char* name);
  219. /**
  220. * Begins precise sample measurement. Must be followed by endSamplePrecise().
  221. *
  222. * @param[in] name Unique name for the sample you can later use to find the sampling data.
  223. *
  224. * @note
  225. * This method uses very precise CPU counters to determine variety of data not provided by standard beginSample().
  226. * However due to the way these counters work you should not use this method for larger parts of code. It does not
  227. * consider context switches so if the OS decides to switch context between measurements you will get invalid data.
  228. */
  229. void beginSamplePrecise(const char* name);
  230. /**
  231. * Ends precise sample measurement.
  232. *
  233. * @param[in] name Unique name for the sample.
  234. *
  235. * @note
  236. * Unique name is primarily needed to more easily identify mismatched begin/end sample pairs. Otherwise the name
  237. * in beginSamplePrecise() would be enough.
  238. */
  239. void endSamplePrecise(const char* name);
  240. /** Clears all sampling data, and ends any unfinished sampling blocks. */
  241. void reset();
  242. /**
  243. * Generates a report from all previously sampled data.
  244. *
  245. * @note Generating a report will stop all in-progress sampling. You should make sure
  246. * you call endSample* manually beforehand so this doesn't have to happen.
  247. */
  248. CPUProfilerReport generateReport();
  249. private:
  250. /**
  251. * Calculates overhead that the timing and sampling methods themselves introduce so we might get more accurate
  252. * measurements when creating reports.
  253. */
  254. void estimateTimerOverhead();
  255. private:
  256. double mBasicTimerOverhead;
  257. UINT64 mPreciseTimerOverhead;
  258. double mBasicSamplingOverheadMs;
  259. double mPreciseSamplingOverheadMs;
  260. UINT64 mBasicSamplingOverheadCycles;
  261. UINT64 mPreciseSamplingOverheadCycles;
  262. ProfilerVector<ThreadInfo*> mActiveThreads;
  263. Mutex mThreadSync;
  264. };
  265. /** Profiling entry containing information about a single CPU profiling block containing timing information. */
  266. struct BS_CORE_EXPORT CPUProfilerBasicSamplingEntry
  267. {
  268. struct BS_CORE_EXPORT Data
  269. {
  270. Data();
  271. String name; /**< Name of the profiling block. */
  272. UINT32 numCalls; /**< Number of times the block was entered. */
  273. UINT64 memAllocs; /**< Number of memory allocations that happened within the block. */
  274. UINT64 memFrees; /**< Number of memory deallocations that happened within the block. */
  275. double avgTimeMs; /**< Average time it took to execute the block, per call. In milliseconds. */
  276. double maxTimeMs; /**< Maximum time of a single call in the block. In milliseconds. */
  277. double totalTimeMs; /**< Total time the block took, across all calls. In milliseconds. */
  278. double avgSelfTimeMs; /**< Average time it took to execute the block, per call. Ignores time used by child blocks. In milliseconds. */
  279. double totalSelfTimeMs; /**< Total time the block took, across all calls. Ignores time used by child blocks. In milliseconds. */
  280. double estimatedSelfOverheadMs; /**< Estimated overhead of profiling methods, only for this exact block. In milliseconds. */
  281. double estimatedOverheadMs; /**< Estimated overhead of profiling methods for this block and all children. In milliseconds. */
  282. float pctOfParent; /**< Percent of parent block time this block took to execute. Ranging [0.0, 1.0]. */
  283. } data;
  284. ProfilerVector<CPUProfilerBasicSamplingEntry> childEntries;
  285. };
  286. /**
  287. * Profiling entry containing information about a single CPU profiling block containing CPU cycle count based
  288. * information.
  289. */
  290. struct BS_CORE_EXPORT CPUProfilerPreciseSamplingEntry
  291. {
  292. struct BS_CORE_EXPORT Data
  293. {
  294. Data();
  295. String name; /**< Name of the profiling block. */
  296. UINT32 numCalls; /**< Number of times the block was entered. */
  297. UINT64 memAllocs; /**< Number of memory allocations that happened within the block. */
  298. UINT64 memFrees; /**< Number of memory deallocations that happened within the block. */
  299. UINT64 avgCycles; /**< Average number of cycles it took to execute the block, per call. */
  300. UINT64 maxCycles; /**< Maximum number of cycles of a single call in the block. */
  301. UINT64 totalCycles; /**< Total number of cycles across all calls in the block. */
  302. UINT64 avgSelfCycles; /**< Average number of cycles it took to execute the block, per call. Ignores cycles used by child blocks. */
  303. UINT64 totalSelfCycles; /**< Total number of cycles across all calls in the block. Ignores time used by child blocks. */
  304. UINT64 estimatedSelfOverhead; /**< Estimated overhead of profiling methods, only for this exact block. In cycles. */
  305. UINT64 estimatedOverhead; /**< Estimated overhead of profiling methods for this block and all children. In cycles. */
  306. float pctOfParent; /**< Percent of parent block cycles used by this block. Ranging [0.0, 1.0]. */
  307. } data;
  308. ProfilerVector<CPUProfilerPreciseSamplingEntry> childEntries;
  309. };
  310. /** CPU profiling report containing all profiling information for a single profiling session. */
  311. class BS_CORE_EXPORT CPUProfilerReport
  312. {
  313. public:
  314. CPUProfilerReport();
  315. /**
  316. * Returns root entry for the basic (time based) sampling data. Root entry always contains the profiling block
  317. * associated with the entire thread.
  318. */
  319. const CPUProfilerBasicSamplingEntry& getBasicSamplingData() const { return mBasicSamplingRootEntry; }
  320. /**
  321. * Returns root entry for the precise (CPU cycle based) sampling data. Root entry always contains the profiling
  322. * block associated with the entire thread.
  323. */
  324. const CPUProfilerPreciseSamplingEntry& getPreciseSamplingData() const { return mPreciseSamplingRootEntry; }
  325. private:
  326. friend class ProfilerCPU;
  327. CPUProfilerBasicSamplingEntry mBasicSamplingRootEntry;
  328. CPUProfilerPreciseSamplingEntry mPreciseSamplingRootEntry;
  329. };
  330. /** Easier way to access ProfilerCPU. */
  331. BS_CORE_EXPORT ProfilerCPU& gProfilerCPU();
  332. /** Shortcut for profiling a single function call. */
  333. #define PROFILE_CALL(call, name) \
  334. BansheeEngine::gProfilerCPU().beginSample(name); \
  335. call; \
  336. BansheeEngine::gProfilerCPU().endSample(name);
  337. /** @} */
  338. }