  1. //********************************** Banshee Engine (www.banshee3d.com) **************************************************//
  2. //**************** Copyright (c) 2016 Marko Pintera ([email protected]). All rights reserved. **********************//
  3. #include "Profiling/BsProfilerCPU.h"
  4. #include "Debug/BsDebug.h"
  5. #include "Platform/BsPlatform.h"
  6. #include <chrono>
  7. #if BS_COMPILER == BS_COMPILER_MSVC
  8. #include <intrin.h>
  9. #endif
  10. #if BS_COMPILER == BS_COMPILER_GNUC || BS_COMPILER == BS_COMPILER_CLANG
  11. #include "cpuid.h"
  12. #endif
  13. #if BS_COMPILER == BS_COMPILER_CLANG
  14. #include "intrin.h"
  15. #endif
  16. using namespace std::chrono;
  17. namespace bs
  18. {
  19. ProfilerCPU::Timer::Timer()
  20. :startTime(0.0f)
  21. {
  22. time = 0.0f;
  23. }
// Records the current timestamp so a following stop() can accumulate the elapsed interval.
void ProfilerCPU::Timer::start()
{
	startTime = getCurrentTime();
}
// Adds the interval since the matching start() to the accumulated time. Start/stop pairs
// may be repeated; "time" keeps the running total until reset() is called.
void ProfilerCPU::Timer::stop()
{
	time += getCurrentTime() - startTime;
}
// Clears the accumulated time (does not affect an in-progress start timestamp).
void ProfilerCPU::Timer::reset()
{
	time = 0.0f;
}
  36. inline double ProfilerCPU::Timer::getCurrentTime() const
  37. {
  38. high_resolution_clock::time_point timeNow = mHRClock.now();
  39. nanoseconds timeNowNs = timeNow.time_since_epoch();
  40. return timeNowNs.count() * 0.000001;
  41. }
// Starts with a zeroed cycle accumulator.
ProfilerCPU::TimerPrecise::TimerPrecise()
{
	cycles = 0;
}
// Records the current CPU cycle count so a following stop() can accumulate the delta.
void ProfilerCPU::TimerPrecise::start()
{
	startCycles = getNumCycles();
}
// Adds the cycles elapsed since the matching start() to the running total.
void ProfilerCPU::TimerPrecise::stop()
{
	cycles += getNumCycles() - startCycles;
}
// Clears the accumulated cycle count.
void ProfilerCPU::TimerPrecise::reset()
{
	cycles = 0;
}
// Reads the CPU time-stamp counter. A CPUID instruction (leaf 0) is issued first; CPUID is
// a serializing instruction, which prevents earlier instructions from being reordered past
// the RDTSC read and skewing the measurement.
inline UINT64 ProfilerCPU::TimerPrecise::getNumCycles()
{
#if BS_COMPILER == BS_COMPILER_GNUC
	unsigned int a = 0;
	unsigned int b[4];
	__get_cpuid(a, &b[0], &b[1], &b[2], &b[3]);

#if BS_ARCH_TYPE == BS_ARCHITECTURE_x86_64
	// 64-bit: RDTSC returns the counter split across EDX:EAX; recombine into one 64-bit value.
	UINT32 __a, __d;
	__asm__ __volatile__ ("rdtsc" : "=a" (__a), "=d" (__d));
	return (UINT64(__a) | UINT64(__d) << 32);
#else
	// 32-bit: raw RDTSC opcode bytes; "=A" binds the EDX:EAX pair directly to a 64-bit value.
	UINT64 x;
	__asm__ volatile (".byte 0x0f, 0x31" : "=A" (x));
	return x;
#endif
#elif BS_COMPILER == BS_COMPILER_CLANG
	UINT32 a = 0;
	UINT32 b[4];
	__get_cpuid(a, &b[0], &b[1], &b[2], &b[3]);
	return __rdtsc();
#elif BS_COMPILER == BS_COMPILER_MSVC
	// MSVC: __cpuid serializes, then the __rdtsc intrinsic reads the counter.
	int a[4];
	int b = 0;
	__cpuid(a, b);
	return __rdtsc();
#else
	static_assert(false, "Unsupported compiler");
#endif
}
// Sample storage lives in the provided per-thread frame allocator.
ProfilerCPU::ProfileData::ProfileData(FrameAlloc* alloc)
	:samples(alloc)
{ }
  90. void ProfilerCPU::ProfileData::beginSample()
  91. {
  92. memAllocs = MemoryCounter::getNumAllocs();
  93. memFrees = MemoryCounter::getNumFrees();
  94. timer.reset();
  95. timer.start();
  96. }
  97. void ProfilerCPU::ProfileData::endSample()
  98. {
  99. timer.stop();
  100. UINT64 numAllocs = MemoryCounter::getNumAllocs() - memAllocs;
  101. UINT64 numFrees = MemoryCounter::getNumFrees() - memFrees;
  102. samples.push_back(ProfileSample(timer.time, numAllocs, numFrees));
  103. }
// Re-opens the most recently closed sample: the timer still holds that sample's accumulated
// time (beginSample was not called since), so restarting it continues the measurement, and
// the stored sample record is removed until the next endSample() re-adds it.
// NOTE(review): assumes at least one sample exists — erase(end()-1) on an empty vector is UB.
void ProfilerCPU::ProfileData::resumeLastSample()
{
	timer.start();
	samples.erase(samples.end() - 1);
}
// Sample storage lives in the provided per-thread frame allocator.
ProfilerCPU::PreciseProfileData::PreciseProfileData(FrameAlloc* alloc)
	:samples(alloc)
{ }
  112. void ProfilerCPU::PreciseProfileData::beginSample()
  113. {
  114. memAllocs = MemoryCounter::getNumAllocs();
  115. memFrees = MemoryCounter::getNumFrees();
  116. timer.reset();
  117. timer.start();
  118. }
  119. void ProfilerCPU::PreciseProfileData::endSample()
  120. {
  121. timer.stop();
  122. UINT64 numAllocs = MemoryCounter::getNumAllocs() - memAllocs;
  123. UINT64 numFrees = MemoryCounter::getNumFrees() - memFrees;
  124. samples.push_back(PreciseProfileSample(timer.cycles, numAllocs, numFrees));
  125. }
// Re-opens the most recently closed sample; the timer still holds its accumulated cycles,
// so restarting continues the measurement while the stored record is removed until the
// next endSample(). NOTE(review): assumes at least one sample exists.
void ProfilerCPU::PreciseProfileData::resumeLastSample()
{
	timer.start();
	samples.erase(samples.end() - 1);
}
// Per-thread pointer to the calling thread's profiling state; null until beginThread()
// first creates it for that thread.
BS_THREADLOCAL ProfilerCPU::ThreadInfo* ProfilerCPU::ThreadInfo::activeThread = nullptr;
// Inactive by default; the 512 KB frame allocator backs all per-thread profiler storage
// (blocks, names, sample vectors, the active-block stack).
ProfilerCPU::ThreadInfo::ThreadInfo()
	:isActive(false), rootBlock(nullptr), frameAlloc(1024 * 512), activeBlocks(nullptr)
{
}
// Starts sampling on this thread: (re)uses the root block, lazily builds the active-block
// stack inside the frame allocator, and opens a basic sample on the root. No-op (with a
// warning) if sampling is already active.
void ProfilerCPU::ThreadInfo::begin(const char* _name)
{
	if(isActive)
	{
		LOGWRN("Profiler::beginThread called on a thread that was already being sampled");
		return;
	}

	// Root block survives across begin/end pairs until reset() releases it.
	if(rootBlock == nullptr)
		rootBlock = getBlock(_name);

	activeBlock = ActiveBlock(ActiveSamplingType::Basic, rootBlock);
	// The stack itself is placed in the frame allocator and also uses it for its elements.
	if (activeBlocks == nullptr)
		activeBlocks = frameAlloc.alloc<Stack<ActiveBlock, StdFrameAlloc<ActiveBlock>>>
			(StdFrameAlloc<ActiveBlock>(&frameAlloc));

	activeBlocks->push(activeBlock);

	rootBlock->basic.beginSample();
	isActive = true;
}
  153. void ProfilerCPU::ThreadInfo::end()
  154. {
  155. if(activeBlock.type == ActiveSamplingType::Basic)
  156. activeBlock.block->basic.endSample();
  157. else
  158. activeBlock.block->precise.endSample();
  159. activeBlocks->pop();
  160. if(!isActive)
  161. LOGWRN("Profiler::endThread called on a thread that isn't being sampled.");
  162. if (activeBlocks->size() > 0)
  163. {
  164. LOGWRN("Profiler::endThread called but not all sample pairs were closed. Sampling data will not be valid.");
  165. while (activeBlocks->size() > 0)
  166. {
  167. ActiveBlock& curBlock = activeBlocks->top();
  168. if(curBlock.type == ActiveSamplingType::Basic)
  169. curBlock.block->basic.endSample();
  170. else
  171. curBlock.block->precise.endSample();
  172. activeBlocks->pop();
  173. }
  174. }
  175. isActive = false;
  176. activeBlock = ActiveBlock();
  177. frameAlloc.dealloc(activeBlocks);
  178. activeBlocks = nullptr;
  179. }
  180. void ProfilerCPU::ThreadInfo::reset()
  181. {
  182. if(isActive)
  183. end();
  184. if(rootBlock != nullptr)
  185. releaseBlock(rootBlock);
  186. rootBlock = nullptr;
  187. frameAlloc.clear(); // Note: This never actually frees memory
  188. }
  189. ProfilerCPU::ProfiledBlock* ProfilerCPU::ThreadInfo::getBlock(const char* name)
  190. {
  191. ProfiledBlock* block = frameAlloc.alloc<ProfiledBlock>(&frameAlloc);
  192. block->name = (char*)frameAlloc.alloc(((UINT32)strlen(name) + 1) * sizeof(char));
  193. strcpy(block->name, name);
  194. return block;
  195. }
// Returns a block and its name buffer to the frame allocator. The block's destructor
// (run by dealloc) recursively releases its children.
void ProfilerCPU::ThreadInfo::releaseBlock(ProfiledBlock* block)
{
	frameAlloc.dealloc((UINT8*)block->name);
	frameAlloc.dealloc(block);
}
// Both sample sets and the child list draw from the per-thread frame allocator.
ProfilerCPU::ProfiledBlock::ProfiledBlock(FrameAlloc* alloc)
	:basic(alloc), precise(alloc), children(alloc)
{ }
// Recursively releases child blocks through the owning thread's allocator.
// NOTE(review): assumes destruction happens on the thread that owns this block, with
// ThreadInfo::activeThread still set — confirm no cross-thread teardown path exists.
ProfilerCPU::ProfiledBlock::~ProfiledBlock()
{
	ThreadInfo* thread = ThreadInfo::activeThread;

	for(auto& child : children)
		thread->releaseBlock(child);

	children.clear();
}
  211. ProfilerCPU::ProfiledBlock* ProfilerCPU::ProfiledBlock::findChild(const char* name) const
  212. {
  213. for(auto& child : children)
  214. {
  215. if(strcmp(child->name, name) == 0)
  216. return child;
  217. }
  218. return nullptr;
  219. }
// Initializes all overhead estimates to zero, then measures the actual timer/sampling
// overhead once at startup so reports can subtract it.
ProfilerCPU::ProfilerCPU()
	: mBasicTimerOverhead(0.0), mPreciseTimerOverhead(0), mBasicSamplingOverheadMs(0.0), mPreciseSamplingOverheadMs(0.0)
	, mBasicSamplingOverheadCycles(0), mPreciseSamplingOverheadCycles(0)
{
	// TODO - We only estimate overhead on program start. It might be better to estimate it each time beginThread is called,
	// and keep separate values per thread.
	estimateTimerOverhead();
}
// Clears the calling thread's data, then deletes every registered per-thread info object.
// The lock guards mActiveThreads against concurrent beginThread() registration.
ProfilerCPU::~ProfilerCPU()
{
	reset();

	Lock lock(mThreadSync);

	for(auto& threadInfo : mActiveThreads)
		bs_delete<ThreadInfo, ProfilerAlloc>(threadInfo);
}
  235. void ProfilerCPU::beginThread(const char* name)
  236. {
  237. ThreadInfo* thread = ThreadInfo::activeThread;
  238. if(thread == nullptr)
  239. {
  240. ThreadInfo::activeThread = bs_new<ThreadInfo, ProfilerAlloc>();
  241. thread = ThreadInfo::activeThread;
  242. {
  243. Lock lock(mThreadSync);
  244. mActiveThreads.push_back(thread);
  245. }
  246. }
  247. thread->begin(name);
  248. }
// Ends sampling on the calling thread.
void ProfilerCPU::endThread()
{
	// I don't do a nullcheck here on purpose, so endSample can be called ASAP
	ThreadInfo::activeThread->end();
}
// Opens a basic (wall-clock) sample named "name" under the current active block.
// If the thread isn't being sampled yet, sampling is implicitly started as "Unknown".
void ProfilerCPU::beginSample(const char* name)
{
	ThreadInfo* thread = ThreadInfo::activeThread;
	if(thread == nullptr || !thread->isActive)
	{
		beginThread("Unknown");
		// Re-fetch: beginThread may have just created the per-thread state.
		thread = ThreadInfo::activeThread;
	}

	ProfiledBlock* parent = thread->activeBlock.block;
	ProfiledBlock* block = nullptr;

	// Reuse an existing child block with this name so repeated calls accumulate samples.
	if(parent != nullptr)
		block = parent->findChild(name);

	if(block == nullptr)
	{
		block = thread->getBlock(name);

		if(parent != nullptr)
			parent->children.push_back(block);
		else
			thread->rootBlock->children.push_back(block);
	}

	// This block becomes the active one; the stack remembers the previous active blocks.
	thread->activeBlock = ActiveBlock(ActiveSamplingType::Basic, block);
	thread->activeBlocks->push(thread->activeBlock);

	block->basic.beginSample();
}
// Closes the basic sample previously opened with beginSample(). In debug builds,
// mismatched begin/end pairs (wrong name or wrong sampling type) are detected and warned
// about instead of corrupting the data.
void ProfilerCPU::endSample(const char* name)
{
	ThreadInfo* thread = ThreadInfo::activeThread;
	ProfiledBlock* block = thread->activeBlock.block;

#if BS_DEBUG_MODE
	if(block == nullptr)
	{
		LOGWRN("Mismatched CPUProfiler::endSample. No beginSample was called.");
		return;
	}

	if(thread->activeBlock.type == ActiveSamplingType::Precise)
	{
		LOGWRN("Mismatched CPUProfiler::endSample. Was expecting Profiler::endSamplePrecise.");
		return;
	}

	if(strcmp(block->name, name) != 0)
	{
		LOGWRN("Mismatched CPUProfiler::endSample. Was expecting \"" + String(block->name) +
			"\" but got \"" + String(name) + "\". Sampling data will not be valid.");
		return;
	}
#endif

	block->basic.endSample();

	// Restore the parent block (or none, if this was the outermost sample) as active.
	thread->activeBlocks->pop();

	if (!thread->activeBlocks->empty())
		thread->activeBlock = thread->activeBlocks->top();
	else
		thread->activeBlock = ActiveBlock();
}
  307. void ProfilerCPU::beginSamplePrecise(const char* name)
  308. {
  309. // Note: There is a (small) possibility a context switch will happen during this measurement in which case result will be skewed.
  310. // Increasing thread priority might help. This is generally only a problem with code that executes a long time (10-15+ ms - depending on OS quant length)
  311. ThreadInfo* thread = ThreadInfo::activeThread;
  312. if(thread == nullptr || !thread->isActive)
  313. beginThread("Unknown");
  314. ProfiledBlock* parent = thread->activeBlock.block;
  315. ProfiledBlock* block = nullptr;
  316. if(parent != nullptr)
  317. block = parent->findChild(name);
  318. if(block == nullptr)
  319. {
  320. block = thread->getBlock(name);
  321. if(parent != nullptr)
  322. parent->children.push_back(block);
  323. else
  324. thread->rootBlock->children.push_back(block);
  325. }
  326. thread->activeBlock = ActiveBlock(ActiveSamplingType::Precise, block);
  327. thread->activeBlocks->push(thread->activeBlock);
  328. block->precise.beginSample();
  329. }
// Closes the precise sample previously opened with beginSamplePrecise(). In debug builds,
// mismatched begin/end pairs (wrong name or wrong sampling type) are detected and warned
// about instead of corrupting the data.
void ProfilerCPU::endSamplePrecise(const char* name)
{
	ThreadInfo* thread = ThreadInfo::activeThread;
	ProfiledBlock* block = thread->activeBlock.block;

#if BS_DEBUG_MODE
	if(block == nullptr)
	{
		LOGWRN("Mismatched Profiler::endSamplePrecise. No beginSamplePrecise was called.");
		return;
	}

	if(thread->activeBlock.type == ActiveSamplingType::Basic)
	{
		LOGWRN("Mismatched CPUProfiler::endSamplePrecise. Was expecting Profiler::endSample.");
		return;
	}

	if (strcmp(block->name, name) != 0)
	{
		LOGWRN("Mismatched Profiler::endSamplePrecise. Was expecting \"" + String(block->name) +
			"\" but got \"" + String(name) + "\". Sampling data will not be valid.");
		return;
	}
#endif

	block->precise.endSample();

	// Restore the parent block (or none, if this was the outermost sample) as active.
	thread->activeBlocks->pop();

	if (!thread->activeBlocks->empty())
		thread->activeBlock = thread->activeBlocks->top();
	else
		thread->activeBlock = ActiveBlock();
}
  359. void ProfilerCPU::reset()
  360. {
  361. ThreadInfo* thread = ThreadInfo::activeThread;
  362. if(thread != nullptr)
  363. thread->reset();
  364. }
  365. CPUProfilerReport ProfilerCPU::generateReport()
  366. {
  367. CPUProfilerReport report;
  368. ThreadInfo* thread = ThreadInfo::activeThread;
  369. if(thread == nullptr)
  370. return report;
  371. if(thread->isActive)
  372. thread->end();
  373. // We need to separate out basic and precise data and form two separate hierarchies
  374. if(thread->rootBlock == nullptr)
  375. return report;
  376. struct TempEntry
  377. {
  378. TempEntry(ProfiledBlock* _parentBlock, UINT32 _entryIdx)
  379. :parentBlock(_parentBlock), entryIdx(_entryIdx)
  380. { }
  381. ProfiledBlock* parentBlock;
  382. UINT32 entryIdx;
  383. ProfilerVector<UINT32> childIndexes;
  384. };
  385. ProfilerVector<CPUProfilerBasicSamplingEntry> basicEntries;
  386. ProfilerVector<CPUProfilerPreciseSamplingEntry> preciseEntries;
  387. // Fill up flatHierarchy array in a way so we always process children before parents
  388. ProfilerStack<UINT32> todo;
  389. ProfilerVector<TempEntry> flatHierarchy;
  390. UINT32 entryIdx = 0;
  391. todo.push(entryIdx);
  392. flatHierarchy.push_back(TempEntry(thread->rootBlock, entryIdx));
  393. entryIdx++;
  394. while(!todo.empty())
  395. {
  396. UINT32 curDataIdx = todo.top();
  397. ProfiledBlock* curBlock = flatHierarchy[curDataIdx].parentBlock;
  398. todo.pop();
  399. for(auto& child : curBlock->children)
  400. {
  401. flatHierarchy[curDataIdx].childIndexes.push_back(entryIdx);
  402. todo.push(entryIdx);
  403. flatHierarchy.push_back(TempEntry(child, entryIdx));
  404. entryIdx++;
  405. }
  406. }
  407. // Calculate sampling data for all entries
  408. basicEntries.resize(flatHierarchy.size());
  409. preciseEntries.resize(flatHierarchy.size());
  410. for(auto iter = flatHierarchy.rbegin(); iter != flatHierarchy.rend(); ++iter)
  411. {
  412. TempEntry& curData = *iter;
  413. ProfiledBlock* curBlock = curData.parentBlock;
  414. CPUProfilerBasicSamplingEntry* entryBasic = &basicEntries[curData.entryIdx];
  415. CPUProfilerPreciseSamplingEntry* entryPrecise = &preciseEntries[curData.entryIdx];
  416. // Calculate basic data
  417. entryBasic->data.name = String(curBlock->name);
  418. entryBasic->data.memAllocs = 0;
  419. entryBasic->data.memFrees = 0;
  420. entryBasic->data.totalTimeMs = 0.0;
  421. entryBasic->data.maxTimeMs = 0.0;
  422. for(auto& sample : curBlock->basic.samples)
  423. {
  424. entryBasic->data.totalTimeMs += sample.time;
  425. entryBasic->data.maxTimeMs = std::max(entryBasic->data.maxTimeMs, sample.time);
  426. entryBasic->data.memAllocs += sample.numAllocs;
  427. entryBasic->data.memFrees += sample.numFrees;
  428. }
  429. entryBasic->data.numCalls = (UINT32)curBlock->basic.samples.size();
  430. if(entryBasic->data.numCalls > 0)
  431. entryBasic->data.avgTimeMs = entryBasic->data.totalTimeMs / entryBasic->data.numCalls;
  432. double totalChildTime = 0.0;
  433. for(auto& childIdx : curData.childIndexes)
  434. {
  435. CPUProfilerBasicSamplingEntry* childEntry = &basicEntries[childIdx];
  436. totalChildTime += childEntry->data.totalTimeMs;
  437. childEntry->data.pctOfParent = (float)(childEntry->data.totalTimeMs / entryBasic->data.totalTimeMs);
  438. entryBasic->data.estimatedOverheadMs += childEntry->data.estimatedOverheadMs;
  439. }
  440. entryBasic->data.estimatedOverheadMs += curBlock->basic.samples.size() * mBasicSamplingOverheadMs;
  441. entryBasic->data.estimatedOverheadMs += curBlock->precise.samples.size() * mPreciseSamplingOverheadMs;
  442. entryBasic->data.totalSelfTimeMs = entryBasic->data.totalTimeMs - totalChildTime;
  443. if(entryBasic->data.numCalls > 0)
  444. entryBasic->data.avgSelfTimeMs = entryBasic->data.totalSelfTimeMs / entryBasic->data.numCalls;
  445. entryBasic->data.estimatedSelfOverheadMs = mBasicTimerOverhead;
  446. // Calculate precise data
  447. entryPrecise->data.name = String(curBlock->name);
  448. entryPrecise->data.memAllocs = 0;
  449. entryPrecise->data.memFrees = 0;
  450. entryPrecise->data.totalCycles = 0;
  451. entryPrecise->data.maxCycles = 0;
  452. for(auto& sample : curBlock->precise.samples)
  453. {
  454. entryPrecise->data.totalCycles += sample.cycles;
  455. entryPrecise->data.maxCycles = std::max(entryPrecise->data.maxCycles, sample.cycles);
  456. entryPrecise->data.memAllocs += sample.numAllocs;
  457. entryPrecise->data.memFrees += sample.numFrees;
  458. }
  459. entryPrecise->data.numCalls = (UINT32)curBlock->precise.samples.size();
  460. if(entryPrecise->data.numCalls > 0)
  461. entryPrecise->data.avgCycles = entryPrecise->data.totalCycles / entryPrecise->data.numCalls;
  462. UINT64 totalChildCycles = 0;
  463. for(auto& childIdx : curData.childIndexes)
  464. {
  465. CPUProfilerPreciseSamplingEntry* childEntry = &preciseEntries[childIdx];
  466. totalChildCycles += childEntry->data.totalCycles;
  467. childEntry->data.pctOfParent = childEntry->data.totalCycles / (float)entryPrecise->data.totalCycles;
  468. entryPrecise->data.estimatedOverhead += childEntry->data.estimatedOverhead;
  469. }
  470. entryPrecise->data.estimatedOverhead += curBlock->precise.samples.size() * mPreciseSamplingOverheadCycles;
  471. entryPrecise->data.estimatedOverhead += curBlock->basic.samples.size() * mBasicSamplingOverheadCycles;
  472. entryPrecise->data.totalSelfCycles = entryPrecise->data.totalCycles - totalChildCycles;
  473. if(entryPrecise->data.numCalls > 0)
  474. entryPrecise->data.avgSelfCycles = entryPrecise->data.totalSelfCycles / entryPrecise->data.numCalls;
  475. entryPrecise->data.estimatedSelfOverhead = mPreciseTimerOverhead;
  476. }
  477. // Prune empty basic entries
  478. ProfilerStack<UINT32> finalBasicHierarchyTodo;
  479. ProfilerStack<UINT32> parentBasicEntryIndexes;
  480. ProfilerVector<TempEntry> newBasicEntries;
  481. finalBasicHierarchyTodo.push(0);
  482. entryIdx = 0;
  483. parentBasicEntryIndexes.push(entryIdx);
  484. newBasicEntries.push_back(TempEntry(nullptr, entryIdx));
  485. entryIdx++;
  486. while(!finalBasicHierarchyTodo.empty())
  487. {
  488. UINT32 parentEntryIdx = parentBasicEntryIndexes.top();
  489. parentBasicEntryIndexes.pop();
  490. UINT32 curEntryIdx = finalBasicHierarchyTodo.top();
  491. TempEntry& curEntry = flatHierarchy[curEntryIdx];
  492. finalBasicHierarchyTodo.pop();
  493. for(auto& childIdx : curEntry.childIndexes)
  494. {
  495. finalBasicHierarchyTodo.push(childIdx);
  496. CPUProfilerBasicSamplingEntry& basicEntry = basicEntries[childIdx];
  497. if(basicEntry.data.numCalls > 0)
  498. {
  499. newBasicEntries.push_back(TempEntry(nullptr, childIdx));
  500. newBasicEntries[parentEntryIdx].childIndexes.push_back(entryIdx);
  501. parentBasicEntryIndexes.push(entryIdx);
  502. entryIdx++;
  503. }
  504. else
  505. parentBasicEntryIndexes.push(parentEntryIdx);
  506. }
  507. }
  508. if(newBasicEntries.size() > 0)
  509. {
  510. ProfilerVector<CPUProfilerBasicSamplingEntry*> finalBasicEntries;
  511. report.mBasicSamplingRootEntry = basicEntries[newBasicEntries[0].entryIdx];
  512. finalBasicEntries.push_back(&report.mBasicSamplingRootEntry);
  513. finalBasicHierarchyTodo.push(0);
  514. while(!finalBasicHierarchyTodo.empty())
  515. {
  516. UINT32 curEntryIdx = finalBasicHierarchyTodo.top();
  517. finalBasicHierarchyTodo.pop();
  518. TempEntry& curEntry = newBasicEntries[curEntryIdx];
  519. CPUProfilerBasicSamplingEntry* basicEntry = finalBasicEntries[curEntryIdx];
  520. basicEntry->childEntries.resize(curEntry.childIndexes.size());
  521. UINT32 idx = 0;
  522. for(auto& childIdx : curEntry.childIndexes)
  523. {
  524. TempEntry& childEntry = newBasicEntries[childIdx];
  525. basicEntry->childEntries[idx] = basicEntries[childEntry.entryIdx];
  526. finalBasicEntries.push_back(&(basicEntry->childEntries[idx]));
  527. finalBasicHierarchyTodo.push(childIdx);
  528. idx++;
  529. }
  530. }
  531. }
  532. // Prune empty precise entries
  533. ProfilerStack<UINT32> finalPreciseHierarchyTodo;
  534. ProfilerStack<UINT32> parentPreciseEntryIndexes;
  535. ProfilerVector<TempEntry> newPreciseEntries;
  536. finalPreciseHierarchyTodo.push(0);
  537. entryIdx = 0;
  538. parentPreciseEntryIndexes.push(entryIdx);
  539. newPreciseEntries.push_back(TempEntry(nullptr, entryIdx));
  540. entryIdx++;
  541. while(!finalPreciseHierarchyTodo.empty())
  542. {
  543. UINT32 parentEntryIdx = parentPreciseEntryIndexes.top();
  544. parentPreciseEntryIndexes.pop();
  545. UINT32 curEntryIdx = finalPreciseHierarchyTodo.top();
  546. TempEntry& curEntry = flatHierarchy[curEntryIdx];
  547. finalPreciseHierarchyTodo.pop();
  548. for(auto& childIdx : curEntry.childIndexes)
  549. {
  550. finalPreciseHierarchyTodo.push(childIdx);
  551. CPUProfilerPreciseSamplingEntry& preciseEntry = preciseEntries[childIdx];
  552. if(preciseEntry.data.numCalls > 0)
  553. {
  554. newPreciseEntries.push_back(TempEntry(nullptr, childIdx));
  555. newPreciseEntries[parentEntryIdx].childIndexes.push_back(entryIdx);
  556. parentPreciseEntryIndexes.push(entryIdx);
  557. entryIdx++;
  558. }
  559. else
  560. parentPreciseEntryIndexes.push(parentEntryIdx);
  561. }
  562. }
  563. if(newPreciseEntries.size() > 0)
  564. {
  565. ProfilerVector<CPUProfilerPreciseSamplingEntry*> finalPreciseEntries;
  566. report.mPreciseSamplingRootEntry = preciseEntries[newPreciseEntries[0].entryIdx];
  567. finalPreciseEntries.push_back(&report.mPreciseSamplingRootEntry);
  568. finalPreciseHierarchyTodo.push(0);
  569. while(!finalPreciseHierarchyTodo.empty())
  570. {
  571. UINT32 curEntryIdx = finalPreciseHierarchyTodo.top();
  572. finalPreciseHierarchyTodo.pop();
  573. TempEntry& curEntry = newPreciseEntries[curEntryIdx];
  574. CPUProfilerPreciseSamplingEntry* preciseEntry = finalPreciseEntries[curEntryIdx];
  575. preciseEntry->childEntries.resize(curEntry.childIndexes.size());
  576. UINT32 idx = 0;
  577. for(auto& childIdx : curEntry.childIndexes)
  578. {
  579. TempEntry& childEntry = newPreciseEntries[childIdx];
  580. preciseEntry->childEntries[idx] = preciseEntries[childEntry.entryIdx];
  581. finalPreciseEntries.push_back(&preciseEntry->childEntries.back());
  582. finalPreciseHierarchyTodo.push(childIdx);
  583. idx++;
  584. }
  585. }
  586. }
  587. return report;
  588. }
  589. void ProfilerCPU::estimateTimerOverhead()
  590. {
  591. // Get an idea of how long timer calls and RDTSC takes
  592. const UINT32 reps = 1000, sampleReps = 20;
  593. mBasicTimerOverhead = 1000000.0;
  594. mPreciseTimerOverhead = 1000000;
  595. for (UINT32 tries = 0; tries < 20; tries++)
  596. {
  597. Timer timer;
  598. for (UINT32 i = 0; i < reps; i++)
  599. {
  600. timer.start();
  601. timer.stop();
  602. }
  603. double avgTime = double(timer.time)/double(reps);
  604. if (avgTime < mBasicTimerOverhead)
  605. mBasicTimerOverhead = avgTime;
  606. TimerPrecise timerPrecise;
  607. for (UINT32 i = 0; i < reps; i++)
  608. {
  609. timerPrecise.start();
  610. timerPrecise.stop();
  611. }
  612. UINT64 avgCycles = timerPrecise.cycles/reps;
  613. if (avgCycles < mPreciseTimerOverhead)
  614. mPreciseTimerOverhead = avgCycles;
  615. }
  616. mBasicSamplingOverheadMs = 1000000.0;
  617. mPreciseSamplingOverheadMs = 1000000.0;
  618. mBasicSamplingOverheadCycles = 1000000;
  619. mPreciseSamplingOverheadCycles = 1000000;
  620. for (UINT32 tries = 0; tries < 3; tries++)
  621. {
  622. /************************************************************************/
  623. /* AVERAGE TIME IN MS FOR BASIC SAMPLING */
  624. /************************************************************************/
  625. Timer timerA;
  626. timerA.start();
  627. beginThread("Main");
  628. // Two different cases that can effect performance, one where
  629. // sample already exists and other where new one needs to be created
  630. for (UINT32 i = 0; i < sampleReps; i++)
  631. {
  632. beginSample("TestAvg1");
  633. endSample("TestAvg1");
  634. beginSample("TestAvg2");
  635. endSample("TestAvg2");
  636. beginSample("TestAvg3");
  637. endSample("TestAvg3");
  638. beginSample("TestAvg4");
  639. endSample("TestAvg4");
  640. beginSample("TestAvg5");
  641. endSample("TestAvg5");
  642. beginSample("TestAvg6");
  643. endSample("TestAvg6");
  644. beginSample("TestAvg7");
  645. endSample("TestAvg7");
  646. beginSample("TestAvg8");
  647. endSample("TestAvg8");
  648. beginSample("TestAvg9");
  649. endSample("TestAvg9");
  650. beginSample("TestAvg10");
  651. endSample("TestAvg10");
  652. }
  653. for (UINT32 i = 0; i < sampleReps * 5; i++)
  654. {
  655. beginSample(("TestAvg#" + toString(i)).c_str());
  656. endSample(("TestAvg#" + toString(i)).c_str());
  657. }
  658. endThread();
  659. timerA.stop();
  660. reset();
  661. double avgTimeBasic = double(timerA.time)/double(sampleReps * 10 + sampleReps * 5) - mBasicTimerOverhead;
  662. if (avgTimeBasic < mBasicSamplingOverheadMs)
  663. mBasicSamplingOverheadMs = avgTimeBasic;
  664. /************************************************************************/
  665. /* AVERAGE CYCLES FOR BASIC SAMPLING */
  666. /************************************************************************/
  667. TimerPrecise timerPreciseA;
  668. timerPreciseA.start();
  669. beginThread("Main");
  670. // Two different cases that can effect performance, one where
  671. // sample already exists and other where new one needs to be created
  672. for (UINT32 i = 0; i < sampleReps; i++)
  673. {
  674. beginSample("TestAvg1");
  675. endSample("TestAvg1");
  676. beginSample("TestAvg2");
  677. endSample("TestAvg2");
  678. beginSample("TestAvg3");
  679. endSample("TestAvg3");
  680. beginSample("TestAvg4");
  681. endSample("TestAvg4");
  682. beginSample("TestAvg5");
  683. endSample("TestAvg5");
  684. beginSample("TestAvg6");
  685. endSample("TestAvg6");
  686. beginSample("TestAvg7");
  687. endSample("TestAvg7");
  688. beginSample("TestAvg8");
  689. endSample("TestAvg8");
  690. beginSample("TestAvg9");
  691. endSample("TestAvg9");
  692. beginSample("TestAvg10");
  693. endSample("TestAvg10");
  694. }
  695. for (UINT32 i = 0; i < sampleReps * 5; i++)
  696. {
  697. beginSample(("TestAvg#" + toString(i)).c_str());
  698. endSample(("TestAvg#" + toString(i)).c_str());
  699. }
  700. endThread();
  701. timerPreciseA.stop();
  702. reset();
  703. UINT64 avgCyclesBasic = timerPreciseA.cycles/(sampleReps * 10 + sampleReps * 5) - mPreciseTimerOverhead;
  704. if (avgCyclesBasic < mBasicSamplingOverheadCycles)
  705. mBasicSamplingOverheadCycles = avgCyclesBasic;
  706. /************************************************************************/
  707. /* AVERAGE TIME IN MS FOR PRECISE SAMPLING */
  708. /************************************************************************/
  709. Timer timerB;
  710. timerB.start();
  711. beginThread("Main");
  712. // Two different cases that can effect performance, one where
  713. // sample already exists and other where new one needs to be created
  714. for (UINT32 i = 0; i < sampleReps; i++)
  715. {
  716. beginSamplePrecise("TestAvg1");
  717. endSamplePrecise("TestAvg1");
  718. beginSamplePrecise("TestAvg2");
  719. endSamplePrecise("TestAvg2");
  720. beginSamplePrecise("TestAvg3");
  721. endSamplePrecise("TestAvg3");
  722. beginSamplePrecise("TestAvg4");
  723. endSamplePrecise("TestAvg4");
  724. beginSamplePrecise("TestAvg5");
  725. endSamplePrecise("TestAvg5");
  726. beginSamplePrecise("TestAvg6");
  727. endSamplePrecise("TestAvg6");
  728. beginSamplePrecise("TestAvg7");
  729. endSamplePrecise("TestAvg7");
  730. beginSamplePrecise("TestAvg8");
  731. endSamplePrecise("TestAvg8");
  732. beginSamplePrecise("TestAvg9");
  733. endSamplePrecise("TestAvg9");
  734. beginSamplePrecise("TestAvg10");
  735. endSamplePrecise("TestAvg10");
  736. }
  737. for (UINT32 i = 0; i < sampleReps * 5; i++)
  738. {
  739. beginSamplePrecise(("TestAvg#" + toString(i)).c_str());
  740. endSamplePrecise(("TestAvg#" + toString(i)).c_str());
  741. }
  742. endThread();
  743. timerB.stop();
  744. reset();
  745. double avgTimesPrecise = timerB.time/(sampleReps * 10 + sampleReps * 5);
  746. if (avgTimesPrecise < mPreciseSamplingOverheadMs)
  747. mPreciseSamplingOverheadMs = avgTimesPrecise;
  748. /************************************************************************/
  749. /* AVERAGE CYCLES FOR PRECISE SAMPLING */
  750. /************************************************************************/
  751. TimerPrecise timerPreciseB;
  752. timerPreciseB.start();
  753. beginThread("Main");
  754. // Two different cases that can effect performance, one where
  755. // sample already exists and other where new one needs to be created
  756. for (UINT32 i = 0; i < sampleReps; i++)
  757. {
  758. beginSamplePrecise("TestAvg1");
  759. endSamplePrecise("TestAvg1");
  760. beginSamplePrecise("TestAvg2");
  761. endSamplePrecise("TestAvg2");
  762. beginSamplePrecise("TestAvg3");
  763. endSamplePrecise("TestAvg3");
  764. beginSamplePrecise("TestAvg4");
  765. endSamplePrecise("TestAvg4");
  766. beginSamplePrecise("TestAvg5");
  767. endSamplePrecise("TestAvg5");
  768. beginSamplePrecise("TestAvg6");
  769. endSamplePrecise("TestAvg6");
  770. beginSamplePrecise("TestAvg7");
  771. endSamplePrecise("TestAvg7");
  772. beginSamplePrecise("TestAvg8");
  773. endSamplePrecise("TestAvg8");
  774. beginSamplePrecise("TestAvg9");
  775. endSamplePrecise("TestAvg9");
  776. beginSamplePrecise("TestAvg10");
  777. endSamplePrecise("TestAvg10");
  778. }
  779. for (UINT32 i = 0; i < sampleReps * 5; i++)
  780. {
  781. beginSamplePrecise(("TestAvg#" + toString(i)).c_str());
  782. endSamplePrecise(("TestAvg#" + toString(i)).c_str());
  783. }
  784. endThread();
  785. timerPreciseB.stop();
  786. reset();
  787. UINT64 avgCyclesPrecise = timerPreciseB.cycles/(sampleReps * 10 + sampleReps * 5);
  788. if (avgCyclesPrecise < mPreciseSamplingOverheadCycles)
  789. mPreciseSamplingOverheadCycles = avgCyclesPrecise;
  790. }
  791. }
// All statistics start zeroed; pctOfParent defaults to 1 (an entry with no parent
// accounts for 100% of itself).
CPUProfilerBasicSamplingEntry::Data::Data()
	:numCalls(0), avgTimeMs(0.0), maxTimeMs(0.0), totalTimeMs(0.0),
	avgSelfTimeMs(0.0), totalSelfTimeMs(0.0), estimatedSelfOverheadMs(0.0),
	estimatedOverheadMs(0.0), pctOfParent(1.0f)
{ }
// All statistics start zeroed; pctOfParent defaults to 1 (an entry with no parent
// accounts for 100% of itself).
CPUProfilerPreciseSamplingEntry::Data::Data()
	:numCalls(0), avgCycles(0), maxCycles(0), totalCycles(0),
	avgSelfCycles(0), totalSelfCycles(0), estimatedSelfOverhead(0),
	estimatedOverhead(0), pctOfParent(1.0f)
{ }
  802. CPUProfilerReport::CPUProfilerReport()
  803. {
  804. }
// Convenience accessor for the global CPU profiler module instance.
ProfilerCPU& gProfilerCPU()
{
	return ProfilerCPU::instance();
}
  809. }