//********************************** Banshee Engine (www.banshee3d.com) **************************************************//
//**************** Copyright (c) 2016 Marko Pintera ([email protected]). All rights reserved. **********************//
#include "BsProfilerCPU.h"
#include "BsDebug.h"
#include "BsPlatform.h"

#if BS_COMPILER == BS_COMPILER_GNUC || BS_COMPILER == BS_COMPILER_CLANG
#include "cpuid.h"
#endif

#if BS_COMPILER == BS_COMPILER_CLANG
#include "intrin.h"
#endif
namespace BansheeEngine
{
	ProfilerCPU::Timer::Timer()
	{
		time = 0.0f;
	}

	void ProfilerCPU::Timer::start()
	{
		startTime = getCurrentTime();
	}

	void ProfilerCPU::Timer::stop()
	{
		time += getCurrentTime() - startTime;
	}

	void ProfilerCPU::Timer::reset()
	{
		time = 0.0f;
	}

	inline double ProfilerCPU::Timer::getCurrentTime()
	{
		return PlatformUtility::queryPerformanceTimerMs();
	}
	ProfilerCPU::TimerPrecise::TimerPrecise()
	{
		cycles = 0;
	}

	void ProfilerCPU::TimerPrecise::start()
	{
		startCycles = getNumCycles();
	}

	void ProfilerCPU::TimerPrecise::stop()
	{
		cycles += getNumCycles() - startCycles;
	}

	void ProfilerCPU::TimerPrecise::reset()
	{
		cycles = 0;
	}
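
	// Reads the CPU's time-stamp counter. CPUID is issued first as a serializing barrier so
	// that out-of-order execution cannot move RDTSC ahead of the code being measured. Note
	// that raw TSC values are only comparable on the same core, and may drift with frequency
	// scaling on older CPUs.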
	inline UINT64 ProfilerCPU::TimerPrecise::getNumCycles()
	{
#if BS_COMPILER == BS_COMPILER_GNUC
		UINT32 a = 0;
		UINT32 b[4];
		__get_cpuid(a, &b[0], &b[1], &b[2], &b[3]);

#if BS_ARCH_TYPE == BS_ARCHITECTURE_x86_64
		UINT32 __a, __d;
		__asm__ __volatile__ ("rdtsc" : "=a" (__a), "=d" (__d));
		return (UINT64(__a) | UINT64(__d) << 32);
#else
		UINT64 x;
		__asm__ volatile (".byte 0x0f, 0x31" : "=A" (x));
		return x;
#endif
#elif BS_COMPILER == BS_COMPILER_CLANG
		UINT32 a = 0;
		UINT32 b[4];
		__get_cpuid(a, &b[0], &b[1], &b[2], &b[3]);
		return __rdtsc();
#elif BS_COMPILER == BS_COMPILER_MSVC
		int a[4];
		int b = 0;
		__cpuid(a, b);
		return __rdtsc();
#else
		static_assert(false, "Unsupported compiler");
#endif
	}
	ProfilerCPU::ProfileData::ProfileData(FrameAlloc* alloc)
		:samples(alloc)
	{ }

	void ProfilerCPU::ProfileData::beginSample()
	{
		memAllocs = MemoryCounter::getNumAllocs();
		memFrees = MemoryCounter::getNumFrees();

		timer.reset();
		timer.start();
	}

	void ProfilerCPU::ProfileData::endSample()
	{
		timer.stop();

		UINT64 numAllocs = MemoryCounter::getNumAllocs() - memAllocs;
		UINT64 numFrees = MemoryCounter::getNumFrees() - memFrees;

		samples.push_back(ProfileSample(timer.time, numAllocs, numFrees));
	}
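
	// Resumes timing of the most recent sample: the timer keeps its accumulated value and the
	// previously recorded sample is erased, so the following endSample() records a single
	// merged entry instead of two partial ones.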
	void ProfilerCPU::ProfileData::resumeLastSample()
	{
		timer.start();
		samples.erase(samples.end() - 1);
	}

	ProfilerCPU::PreciseProfileData::PreciseProfileData(FrameAlloc* alloc)
		:samples(alloc)
	{ }

	void ProfilerCPU::PreciseProfileData::beginSample()
	{
		memAllocs = MemoryCounter::getNumAllocs();
		memFrees = MemoryCounter::getNumFrees();

		timer.reset();
		timer.start();
	}

	void ProfilerCPU::PreciseProfileData::endSample()
	{
		timer.stop();

		UINT64 numAllocs = MemoryCounter::getNumAllocs() - memAllocs;
		UINT64 numFrees = MemoryCounter::getNumFrees() - memFrees;

		samples.push_back(PreciseProfileSample(timer.cycles, numAllocs, numFrees));
	}

	void ProfilerCPU::PreciseProfileData::resumeLastSample()
	{
		timer.start();
		samples.erase(samples.end() - 1);
	}
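
	// Each thread that is being profiled owns its own ThreadInfo instance, tracked through
	// this thread-local pointer.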
	BS_THREADLOCAL ProfilerCPU::ThreadInfo* ProfilerCPU::ThreadInfo::activeThread = nullptr;

	ProfilerCPU::ThreadInfo::ThreadInfo()
		:isActive(false), rootBlock(nullptr), frameAlloc(1024 * 512), activeBlocks(nullptr)
	{
	}
	void ProfilerCPU::ThreadInfo::begin(const char* _name)
	{
		if(isActive)
		{
			LOGWRN("Profiler::beginThread called on a thread that was already being sampled");
			return;
		}

		if(rootBlock == nullptr)
			rootBlock = getBlock(_name);

		activeBlock = ActiveBlock(ActiveSamplingType::Basic, rootBlock);
		if (activeBlocks == nullptr)
			activeBlocks = frameAlloc.alloc<Stack<ActiveBlock, StdFrameAlloc<ActiveBlock>>>(&frameAlloc);

		activeBlocks->push(activeBlock);

		rootBlock->basic.beginSample();
		isActive = true;
	}
	void ProfilerCPU::ThreadInfo::end()
	{
		if(activeBlock.type == ActiveSamplingType::Basic)
			activeBlock.block->basic.endSample();
		else
			activeBlock.block->precise.endSample();

		activeBlocks->pop();

		if(!isActive)
			LOGWRN("Profiler::endThread called on a thread that isn't being sampled.");

		if (activeBlocks->size() > 0)
		{
			LOGWRN("Profiler::endThread called but not all sample pairs were closed. Sampling data will not be valid.");

			while (activeBlocks->size() > 0)
			{
				ActiveBlock& curBlock = activeBlocks->top();
				if(curBlock.type == ActiveSamplingType::Basic)
					curBlock.block->basic.endSample();
				else
					curBlock.block->precise.endSample();

				activeBlocks->pop();
			}
		}

		isActive = false;
		activeBlock = ActiveBlock();

		frameAlloc.dealloc(activeBlocks);
		activeBlocks = nullptr;
	}
	void ProfilerCPU::ThreadInfo::reset()
	{
		if(isActive)
			end();

		if(rootBlock != nullptr)
			releaseBlock(rootBlock);

		rootBlock = nullptr;
		frameAlloc.clear(); // Note: This never actually frees memory
	}
	ProfilerCPU::ProfiledBlock* ProfilerCPU::ThreadInfo::getBlock(const char* name)
	{
		ProfiledBlock* block = frameAlloc.alloc<ProfiledBlock>(&frameAlloc);
		block->name = (char*)frameAlloc.alloc(((UINT32)strlen(name) + 1) * sizeof(char));
		strcpy(block->name, name);

		return block;
	}

	void ProfilerCPU::ThreadInfo::releaseBlock(ProfiledBlock* block)
	{
		frameAlloc.dealloc((UINT8*)block->name);
		frameAlloc.dealloc(block);
	}
	ProfilerCPU::ProfiledBlock::ProfiledBlock(FrameAlloc* alloc)
		:basic(alloc), precise(alloc), children(alloc)
	{ }

	ProfilerCPU::ProfiledBlock::~ProfiledBlock()
	{
		ThreadInfo* thread = ThreadInfo::activeThread;

		for(auto& child : children)
			thread->releaseBlock(child);

		children.clear();
	}

	ProfilerCPU::ProfiledBlock* ProfilerCPU::ProfiledBlock::findChild(const char* name) const
	{
		for(auto& child : children)
		{
			if(strcmp(child->name, name) == 0)
				return child;
		}

		return nullptr;
	}
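
	// Note: findChild above is a linear scan, so the cost of beginSample grows with the
	// number of distinct child samples under a block.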
	ProfilerCPU::ProfilerCPU()
		: mBasicTimerOverhead(0.0), mPreciseTimerOverhead(0), mBasicSamplingOverheadMs(0.0), mPreciseSamplingOverheadMs(0.0)
		, mBasicSamplingOverheadCycles(0), mPreciseSamplingOverheadCycles(0)
	{
		// TODO - We only estimate overhead on program start. It might be better to estimate it each time beginThread is
		// called, and keep separate values per thread.
		estimateTimerOverhead();
	}

	ProfilerCPU::~ProfilerCPU()
	{
		reset();

		Lock lock(mThreadSync);

		for(auto& threadInfo : mActiveThreads)
			bs_delete<ThreadInfo, ProfilerAlloc>(threadInfo);
	}
	void ProfilerCPU::beginThread(const char* name)
	{
		ThreadInfo* thread = ThreadInfo::activeThread;
		if(thread == nullptr)
		{
			ThreadInfo::activeThread = bs_new<ThreadInfo, ProfilerAlloc>();
			thread = ThreadInfo::activeThread;

			{
				Lock lock(mThreadSync);

				mActiveThreads.push_back(thread);
			}
		}

		thread->begin(name);
	}
	void ProfilerCPU::endThread()
	{
		// The null check is purposely skipped here so that the sample is ended as soon as possible
		ThreadInfo::activeThread->end();
	}
	void ProfilerCPU::beginSample(const char* name)
	{
		ThreadInfo* thread = ThreadInfo::activeThread;
		if(thread == nullptr || !thread->isActive)
		{
			beginThread("Unknown");
			thread = ThreadInfo::activeThread;
		}

		ProfiledBlock* parent = thread->activeBlock.block;
		ProfiledBlock* block = nullptr;

		if(parent != nullptr)
			block = parent->findChild(name);

		if(block == nullptr)
		{
			block = thread->getBlock(name);

			if(parent != nullptr)
				parent->children.push_back(block);
			else
				thread->rootBlock->children.push_back(block);
		}

		thread->activeBlock = ActiveBlock(ActiveSamplingType::Basic, block);
		thread->activeBlocks->push(thread->activeBlock);

		block->basic.beginSample();
	}
	void ProfilerCPU::endSample(const char* name)
	{
		ThreadInfo* thread = ThreadInfo::activeThread;
		ProfiledBlock* block = thread->activeBlock.block;

#if BS_DEBUG_MODE
		if(block == nullptr)
		{
			LOGWRN("Mismatched CPUProfiler::endSample. No beginSample was called.");
			return;
		}

		if(thread->activeBlock.type == ActiveSamplingType::Precise)
		{
			LOGWRN("Mismatched CPUProfiler::endSample. Was expecting Profiler::endSamplePrecise.");
			return;
		}

		if(strcmp(block->name, name) != 0)
		{
			LOGWRN("Mismatched CPUProfiler::endSample. Was expecting \"" + String(block->name) +
				"\" but got \"" + String(name) + "\". Sampling data will not be valid.");
			return;
		}
#endif

		block->basic.endSample();

		thread->activeBlocks->pop();

		if (!thread->activeBlocks->empty())
			thread->activeBlock = thread->activeBlocks->top();
		else
			thread->activeBlock = ActiveBlock();
	}
	void ProfilerCPU::beginSamplePrecise(const char* name)
	{
		// Note: There is a (small) possibility that a context switch will happen during this measurement, in which case
		// the result will be skewed. Increasing thread priority might help. This is generally only a problem with code
		// that executes for a long time (10-15+ ms, depending on the OS quantum length).

		ThreadInfo* thread = ThreadInfo::activeThread;
		if(thread == nullptr || !thread->isActive)
		{
			beginThread("Unknown");
			thread = ThreadInfo::activeThread;
		}

		ProfiledBlock* parent = thread->activeBlock.block;
		ProfiledBlock* block = nullptr;

		if(parent != nullptr)
			block = parent->findChild(name);

		if(block == nullptr)
		{
			block = thread->getBlock(name);

			if(parent != nullptr)
				parent->children.push_back(block);
			else
				thread->rootBlock->children.push_back(block);
		}

		thread->activeBlock = ActiveBlock(ActiveSamplingType::Precise, block);
		thread->activeBlocks->push(thread->activeBlock);

		block->precise.beginSample();
	}
	void ProfilerCPU::endSamplePrecise(const char* name)
	{
		ThreadInfo* thread = ThreadInfo::activeThread;
		ProfiledBlock* block = thread->activeBlock.block;

#if BS_DEBUG_MODE
		if(block == nullptr)
		{
			LOGWRN("Mismatched Profiler::endSamplePrecise. No beginSamplePrecise was called.");
			return;
		}

		if(thread->activeBlock.type == ActiveSamplingType::Basic)
		{
			LOGWRN("Mismatched CPUProfiler::endSamplePrecise. Was expecting Profiler::endSample.");
			return;
		}

		if (strcmp(block->name, name) != 0)
		{
			LOGWRN("Mismatched Profiler::endSamplePrecise. Was expecting \"" + String(block->name) +
				"\" but got \"" + String(name) + "\". Sampling data will not be valid.");
			return;
		}
#endif

		block->precise.endSample();

		thread->activeBlocks->pop();

		if (!thread->activeBlocks->empty())
			thread->activeBlock = thread->activeBlocks->top();
		else
			thread->activeBlock = ActiveBlock();
	}
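
	// Note: reset() only clears sampling data for the calling thread; data collected on other
	// threads is left untouched.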
	void ProfilerCPU::reset()
	{
		ThreadInfo* thread = ThreadInfo::activeThread;

		if(thread != nullptr)
			thread->reset();
	}
	CPUProfilerReport ProfilerCPU::generateReport()
	{
		CPUProfilerReport report;

		ThreadInfo* thread = ThreadInfo::activeThread;
		if(thread == nullptr)
			return report;

		if(thread->isActive)
			thread->end();

		// We need to separate out basic and precise data and form two separate hierarchies
		if(thread->rootBlock == nullptr)
			return report;
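
		// The report is built in three passes: the block hierarchy is first flattened, per-entry
		// statistics are then accumulated children-first, and finally entries that recorded no
		// samples are pruned while parent/child links are preserved.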
		struct TempEntry
		{
			TempEntry(ProfiledBlock* _parentBlock, UINT32 _entryIdx)
				:parentBlock(_parentBlock), entryIdx(_entryIdx)
			{ }

			ProfiledBlock* parentBlock;
			UINT32 entryIdx;
			ProfilerVector<UINT32> childIndexes;
		};
		ProfilerVector<CPUProfilerBasicSamplingEntry> basicEntries;
		ProfilerVector<CPUProfilerPreciseSamplingEntry> preciseEntries;

		// Fill the flatHierarchy array in a way that lets us process children before parents
		ProfilerStack<UINT32> todo;
		ProfilerVector<TempEntry> flatHierarchy;

		UINT32 entryIdx = 0;
		todo.push(entryIdx);
		flatHierarchy.push_back(TempEntry(thread->rootBlock, entryIdx));
		entryIdx++;

		while(!todo.empty())
		{
			UINT32 curDataIdx = todo.top();
			ProfiledBlock* curBlock = flatHierarchy[curDataIdx].parentBlock;

			todo.pop();

			for(auto& child : curBlock->children)
			{
				flatHierarchy[curDataIdx].childIndexes.push_back(entryIdx);

				todo.push(entryIdx);
				flatHierarchy.push_back(TempEntry(child, entryIdx));
				entryIdx++;
			}
		}
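
		// Parents were appended before their children above, so iterating flatHierarchy in
		// reverse below guarantees every child is processed before its parent.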
		// Calculate sampling data for all entries
		basicEntries.resize(flatHierarchy.size());
		preciseEntries.resize(flatHierarchy.size());

		for(auto iter = flatHierarchy.rbegin(); iter != flatHierarchy.rend(); ++iter)
		{
			TempEntry& curData = *iter;
			ProfiledBlock* curBlock = curData.parentBlock;

			CPUProfilerBasicSamplingEntry* entryBasic = &basicEntries[curData.entryIdx];
			CPUProfilerPreciseSamplingEntry* entryPrecise = &preciseEntries[curData.entryIdx];

			// Calculate basic data
			entryBasic->data.name = String(curBlock->name);

			entryBasic->data.memAllocs = 0;
			entryBasic->data.memFrees = 0;
			entryBasic->data.totalTimeMs = 0.0;
			entryBasic->data.maxTimeMs = 0.0;
			for(auto& sample : curBlock->basic.samples)
			{
				entryBasic->data.totalTimeMs += sample.time;
				entryBasic->data.maxTimeMs = std::max(entryBasic->data.maxTimeMs, sample.time);
				entryBasic->data.memAllocs += sample.numAllocs;
				entryBasic->data.memFrees += sample.numFrees;
			}

			entryBasic->data.numCalls = (UINT32)curBlock->basic.samples.size();

			if(entryBasic->data.numCalls > 0)
				entryBasic->data.avgTimeMs = entryBasic->data.totalTimeMs / entryBasic->data.numCalls;

			double totalChildTime = 0.0;
			for(auto& childIdx : curData.childIndexes)
			{
				CPUProfilerBasicSamplingEntry* childEntry = &basicEntries[childIdx];

				totalChildTime += childEntry->data.totalTimeMs;
				childEntry->data.pctOfParent = (float)(childEntry->data.totalTimeMs / entryBasic->data.totalTimeMs);

				entryBasic->data.estimatedOverheadMs += childEntry->data.estimatedOverheadMs;
			}

			entryBasic->data.estimatedOverheadMs += curBlock->basic.samples.size() * mBasicSamplingOverheadMs;
			entryBasic->data.estimatedOverheadMs += curBlock->precise.samples.size() * mPreciseSamplingOverheadMs;

			entryBasic->data.totalSelfTimeMs = entryBasic->data.totalTimeMs - totalChildTime;

			if(entryBasic->data.numCalls > 0)
				entryBasic->data.avgSelfTimeMs = entryBasic->data.totalSelfTimeMs / entryBasic->data.numCalls;

			entryBasic->data.estimatedSelfOverheadMs = mBasicTimerOverhead;

			// Calculate precise data
			entryPrecise->data.name = String(curBlock->name);

			entryPrecise->data.memAllocs = 0;
			entryPrecise->data.memFrees = 0;
			entryPrecise->data.totalCycles = 0;
			entryPrecise->data.maxCycles = 0;
			for(auto& sample : curBlock->precise.samples)
			{
				entryPrecise->data.totalCycles += sample.cycles;
				entryPrecise->data.maxCycles = std::max(entryPrecise->data.maxCycles, sample.cycles);
				entryPrecise->data.memAllocs += sample.numAllocs;
				entryPrecise->data.memFrees += sample.numFrees;
			}

			entryPrecise->data.numCalls = (UINT32)curBlock->precise.samples.size();

			if(entryPrecise->data.numCalls > 0)
				entryPrecise->data.avgCycles = entryPrecise->data.totalCycles / entryPrecise->data.numCalls;

			UINT64 totalChildCycles = 0;
			for(auto& childIdx : curData.childIndexes)
			{
				CPUProfilerPreciseSamplingEntry* childEntry = &preciseEntries[childIdx];

				totalChildCycles += childEntry->data.totalCycles;
				childEntry->data.pctOfParent = childEntry->data.totalCycles / (float)entryPrecise->data.totalCycles;

				entryPrecise->data.estimatedOverhead += childEntry->data.estimatedOverhead;
			}

			entryPrecise->data.estimatedOverhead += curBlock->precise.samples.size() * mPreciseSamplingOverheadCycles;
			entryPrecise->data.estimatedOverhead += curBlock->basic.samples.size() * mBasicSamplingOverheadCycles;

			entryPrecise->data.totalSelfCycles = entryPrecise->data.totalCycles - totalChildCycles;

			if(entryPrecise->data.numCalls > 0)
				entryPrecise->data.avgSelfCycles = entryPrecise->data.totalSelfCycles / entryPrecise->data.numCalls;

			entryPrecise->data.estimatedSelfOverhead = mPreciseTimerOverhead;
		}
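
		// The pruning below drops entries that recorded no samples. When an entry is dropped,
		// its children are re-parented to the nearest surviving ancestor, which is what the
		// parent index stack tracks.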
		// Prune empty basic entries
		ProfilerStack<UINT32> finalBasicHierarchyTodo;
		ProfilerStack<UINT32> parentBasicEntryIndexes;
		ProfilerVector<TempEntry> newBasicEntries;

		finalBasicHierarchyTodo.push(0);

		entryIdx = 0;
		parentBasicEntryIndexes.push(entryIdx);
		newBasicEntries.push_back(TempEntry(nullptr, entryIdx));
		entryIdx++;

		while(!finalBasicHierarchyTodo.empty())
		{
			UINT32 parentEntryIdx = parentBasicEntryIndexes.top();
			parentBasicEntryIndexes.pop();

			UINT32 curEntryIdx = finalBasicHierarchyTodo.top();
			TempEntry& curEntry = flatHierarchy[curEntryIdx];
			finalBasicHierarchyTodo.pop();

			for(auto& childIdx : curEntry.childIndexes)
			{
				finalBasicHierarchyTodo.push(childIdx);

				CPUProfilerBasicSamplingEntry& basicEntry = basicEntries[childIdx];
				if(basicEntry.data.numCalls > 0)
				{
					newBasicEntries.push_back(TempEntry(nullptr, childIdx));
					newBasicEntries[parentEntryIdx].childIndexes.push_back(entryIdx);

					parentBasicEntryIndexes.push(entryIdx);

					entryIdx++;
				}
				else
					parentBasicEntryIndexes.push(parentEntryIdx);
			}
		}

		if(newBasicEntries.size() > 0)
		{
			ProfilerVector<CPUProfilerBasicSamplingEntry*> finalBasicEntries;

			report.mBasicSamplingRootEntry = basicEntries[newBasicEntries[0].entryIdx];
			finalBasicEntries.push_back(&report.mBasicSamplingRootEntry);

			finalBasicHierarchyTodo.push(0);

			while(!finalBasicHierarchyTodo.empty())
			{
				UINT32 curEntryIdx = finalBasicHierarchyTodo.top();
				finalBasicHierarchyTodo.pop();

				TempEntry& curEntry = newBasicEntries[curEntryIdx];

				CPUProfilerBasicSamplingEntry* basicEntry = finalBasicEntries[curEntryIdx];

				basicEntry->childEntries.resize(curEntry.childIndexes.size());
				UINT32 idx = 0;

				for(auto& childIdx : curEntry.childIndexes)
				{
					TempEntry& childEntry = newBasicEntries[childIdx];
					basicEntry->childEntries[idx] = basicEntries[childEntry.entryIdx];

					finalBasicEntries.push_back(&(basicEntry->childEntries[idx]));
					finalBasicHierarchyTodo.push(childIdx);
					idx++;
				}
			}
		}
		// Prune empty precise entries
		ProfilerStack<UINT32> finalPreciseHierarchyTodo;
		ProfilerStack<UINT32> parentPreciseEntryIndexes;
		ProfilerVector<TempEntry> newPreciseEntries;

		finalPreciseHierarchyTodo.push(0);

		entryIdx = 0;
		parentPreciseEntryIndexes.push(entryIdx);
		newPreciseEntries.push_back(TempEntry(nullptr, entryIdx));
		entryIdx++;

		while(!finalPreciseHierarchyTodo.empty())
		{
			UINT32 parentEntryIdx = parentPreciseEntryIndexes.top();
			parentPreciseEntryIndexes.pop();

			UINT32 curEntryIdx = finalPreciseHierarchyTodo.top();
			TempEntry& curEntry = flatHierarchy[curEntryIdx];
			finalPreciseHierarchyTodo.pop();

			for(auto& childIdx : curEntry.childIndexes)
			{
				finalPreciseHierarchyTodo.push(childIdx);

				CPUProfilerPreciseSamplingEntry& preciseEntry = preciseEntries[childIdx];
				if(preciseEntry.data.numCalls > 0)
				{
					newPreciseEntries.push_back(TempEntry(nullptr, childIdx));
					newPreciseEntries[parentEntryIdx].childIndexes.push_back(entryIdx);

					parentPreciseEntryIndexes.push(entryIdx);

					entryIdx++;
				}
				else
					parentPreciseEntryIndexes.push(parentEntryIdx);
			}
		}

		if(newPreciseEntries.size() > 0)
		{
			ProfilerVector<CPUProfilerPreciseSamplingEntry*> finalPreciseEntries;

			report.mPreciseSamplingRootEntry = preciseEntries[newPreciseEntries[0].entryIdx];
			finalPreciseEntries.push_back(&report.mPreciseSamplingRootEntry);

			finalPreciseHierarchyTodo.push(0);

			while(!finalPreciseHierarchyTodo.empty())
			{
				UINT32 curEntryIdx = finalPreciseHierarchyTodo.top();
				finalPreciseHierarchyTodo.pop();

				TempEntry& curEntry = newPreciseEntries[curEntryIdx];

				CPUProfilerPreciseSamplingEntry* preciseEntry = finalPreciseEntries[curEntryIdx];

				preciseEntry->childEntries.resize(curEntry.childIndexes.size());
				UINT32 idx = 0;

				for(auto& childIdx : curEntry.childIndexes)
				{
					TempEntry& childEntry = newPreciseEntries[childIdx];
					preciseEntry->childEntries[idx] = preciseEntries[childEntry.entryIdx];

					// Take the address of the element at idx (not back()), mirroring the basic pass above
					finalPreciseEntries.push_back(&(preciseEntry->childEntries[idx]));
					finalPreciseHierarchyTodo.push(childIdx);
					idx++;
				}
			}
		}

		return report;
	}
	void ProfilerCPU::estimateTimerOverhead()
	{
		// Get an idea of how long timer calls and RDTSC take
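		// The minimum measurement across several tries is kept, to reduce the influence of
		// context switches; the large sentinel values below effectively act as +infinity.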
		const UINT32 reps = 1000, sampleReps = 20;

		mBasicTimerOverhead = 1000000.0;
		mPreciseTimerOverhead = 1000000;
		for (UINT32 tries = 0; tries < 20; tries++)
		{
			Timer timer;
			for (UINT32 i = 0; i < reps; i++)
			{
				timer.start();
				timer.stop();
			}

			double avgTime = double(timer.time)/double(reps);
			if (avgTime < mBasicTimerOverhead)
				mBasicTimerOverhead = avgTime;

			TimerPrecise timerPrecise;
			for (UINT32 i = 0; i < reps; i++)
			{
				timerPrecise.start();
				timerPrecise.stop();
			}

			UINT64 avgCycles = timerPrecise.cycles/reps;
			if (avgCycles < mPreciseTimerOverhead)
				mPreciseTimerOverhead = avgCycles;
		}

		mBasicSamplingOverheadMs = 1000000.0;
		mPreciseSamplingOverheadMs = 1000000.0;
		mBasicSamplingOverheadCycles = 1000000;
		mPreciseSamplingOverheadCycles = 1000000;
		for (UINT32 tries = 0; tries < 3; tries++)
		{
			/************************************************************************/
			/*              AVERAGE TIME IN MS FOR BASIC SAMPLING                   */
			/************************************************************************/

			Timer timerA;
			timerA.start();

			beginThread("Main");

			// Two different cases that can affect performance: one where the sample
			// already exists and one where a new one needs to be created
			for (UINT32 i = 0; i < sampleReps; i++)
			{
				beginSample("TestAvg1");
				endSample("TestAvg1");
				beginSample("TestAvg2");
				endSample("TestAvg2");
				beginSample("TestAvg3");
				endSample("TestAvg3");
				beginSample("TestAvg4");
				endSample("TestAvg4");
				beginSample("TestAvg5");
				endSample("TestAvg5");
				beginSample("TestAvg6");
				endSample("TestAvg6");
				beginSample("TestAvg7");
				endSample("TestAvg7");
				beginSample("TestAvg8");
				endSample("TestAvg8");
				beginSample("TestAvg9");
				endSample("TestAvg9");
				beginSample("TestAvg10");
				endSample("TestAvg10");
			}

			for (UINT32 i = 0; i < sampleReps * 5; i++)
			{
				beginSample(("TestAvg#" + toString(i)).c_str());
				endSample(("TestAvg#" + toString(i)).c_str());
			}

			endThread();

			timerA.stop();

			reset();

			double avgTimeBasic = double(timerA.time)/double(sampleReps * 10 + sampleReps * 5) - mBasicTimerOverhead;
			if (avgTimeBasic < mBasicSamplingOverheadMs)
				mBasicSamplingOverheadMs = avgTimeBasic;
			/************************************************************************/
			/*              AVERAGE CYCLES FOR BASIC SAMPLING                       */
			/************************************************************************/

			TimerPrecise timerPreciseA;
			timerPreciseA.start();

			beginThread("Main");

			// Two different cases that can affect performance: one where the sample
			// already exists and one where a new one needs to be created
			for (UINT32 i = 0; i < sampleReps; i++)
			{
				beginSample("TestAvg1");
				endSample("TestAvg1");
				beginSample("TestAvg2");
				endSample("TestAvg2");
				beginSample("TestAvg3");
				endSample("TestAvg3");
				beginSample("TestAvg4");
				endSample("TestAvg4");
				beginSample("TestAvg5");
				endSample("TestAvg5");
				beginSample("TestAvg6");
				endSample("TestAvg6");
				beginSample("TestAvg7");
				endSample("TestAvg7");
				beginSample("TestAvg8");
				endSample("TestAvg8");
				beginSample("TestAvg9");
				endSample("TestAvg9");
				beginSample("TestAvg10");
				endSample("TestAvg10");
			}

			for (UINT32 i = 0; i < sampleReps * 5; i++)
			{
				beginSample(("TestAvg#" + toString(i)).c_str());
				endSample(("TestAvg#" + toString(i)).c_str());
			}

			endThread();

			timerPreciseA.stop();

			reset();

			UINT64 avgCyclesBasic = timerPreciseA.cycles/(sampleReps * 10 + sampleReps * 5) - mPreciseTimerOverhead;
			if (avgCyclesBasic < mBasicSamplingOverheadCycles)
				mBasicSamplingOverheadCycles = avgCyclesBasic;
			/************************************************************************/
			/*              AVERAGE TIME IN MS FOR PRECISE SAMPLING                 */
			/************************************************************************/

			Timer timerB;
			timerB.start();

			beginThread("Main");

			// Two different cases that can affect performance: one where the sample
			// already exists and one where a new one needs to be created
			for (UINT32 i = 0; i < sampleReps; i++)
			{
				beginSamplePrecise("TestAvg1");
				endSamplePrecise("TestAvg1");
				beginSamplePrecise("TestAvg2");
				endSamplePrecise("TestAvg2");
				beginSamplePrecise("TestAvg3");
				endSamplePrecise("TestAvg3");
				beginSamplePrecise("TestAvg4");
				endSamplePrecise("TestAvg4");
				beginSamplePrecise("TestAvg5");
				endSamplePrecise("TestAvg5");
				beginSamplePrecise("TestAvg6");
				endSamplePrecise("TestAvg6");
				beginSamplePrecise("TestAvg7");
				endSamplePrecise("TestAvg7");
				beginSamplePrecise("TestAvg8");
				endSamplePrecise("TestAvg8");
				beginSamplePrecise("TestAvg9");
				endSamplePrecise("TestAvg9");
				beginSamplePrecise("TestAvg10");
				endSamplePrecise("TestAvg10");
			}

			for (UINT32 i = 0; i < sampleReps * 5; i++)
			{
				beginSamplePrecise(("TestAvg#" + toString(i)).c_str());
				endSamplePrecise(("TestAvg#" + toString(i)).c_str());
			}

			endThread();

			timerB.stop();

			reset();

			double avgTimePrecise = timerB.time/(sampleReps * 10 + sampleReps * 5);
			if (avgTimePrecise < mPreciseSamplingOverheadMs)
				mPreciseSamplingOverheadMs = avgTimePrecise;
			/************************************************************************/
			/*              AVERAGE CYCLES FOR PRECISE SAMPLING                     */
			/************************************************************************/

			TimerPrecise timerPreciseB;
			timerPreciseB.start();

			beginThread("Main");

			// Two different cases that can affect performance: one where the sample
			// already exists and one where a new one needs to be created
			for (UINT32 i = 0; i < sampleReps; i++)
			{
				beginSamplePrecise("TestAvg1");
				endSamplePrecise("TestAvg1");
				beginSamplePrecise("TestAvg2");
				endSamplePrecise("TestAvg2");
				beginSamplePrecise("TestAvg3");
				endSamplePrecise("TestAvg3");
				beginSamplePrecise("TestAvg4");
				endSamplePrecise("TestAvg4");
				beginSamplePrecise("TestAvg5");
				endSamplePrecise("TestAvg5");
				beginSamplePrecise("TestAvg6");
				endSamplePrecise("TestAvg6");
				beginSamplePrecise("TestAvg7");
				endSamplePrecise("TestAvg7");
				beginSamplePrecise("TestAvg8");
				endSamplePrecise("TestAvg8");
				beginSamplePrecise("TestAvg9");
				endSamplePrecise("TestAvg9");
				beginSamplePrecise("TestAvg10");
				endSamplePrecise("TestAvg10");
			}

			for (UINT32 i = 0; i < sampleReps * 5; i++)
			{
				beginSamplePrecise(("TestAvg#" + toString(i)).c_str());
				endSamplePrecise(("TestAvg#" + toString(i)).c_str());
			}

			endThread();

			timerPreciseB.stop();

			reset();

			UINT64 avgCyclesPrecise = timerPreciseB.cycles/(sampleReps * 10 + sampleReps * 5);
			if (avgCyclesPrecise < mPreciseSamplingOverheadCycles)
				mPreciseSamplingOverheadCycles = avgCyclesPrecise;
		}
	}
	CPUProfilerBasicSamplingEntry::Data::Data()
		:numCalls(0), avgTimeMs(0.0), maxTimeMs(0.0), totalTimeMs(0.0),
		avgSelfTimeMs(0.0), totalSelfTimeMs(0.0), estimatedSelfOverheadMs(0.0),
		estimatedOverheadMs(0.0), pctOfParent(1.0f)
	{ }

	CPUProfilerPreciseSamplingEntry::Data::Data()
		:numCalls(0), avgCycles(0), maxCycles(0), totalCycles(0),
		avgSelfCycles(0), totalSelfCycles(0), estimatedSelfOverhead(0),
		estimatedOverhead(0), pctOfParent(1.0f)
	{ }

	CPUProfilerReport::CPUProfilerReport()
	{
	}

	ProfilerCPU& gProfilerCPU()
	{
		return ProfilerCPU::instance();
	}
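
	/* Illustrative usage (a sketch, not part of the engine source): how client code drives
	   the profiler defined above. The function and sample names here are made up for the
	   example; report accessors are declared in BsProfilerCPU.h.

		void simulationStep()
		{
			gProfilerCPU().beginThread("Sim");

			gProfilerCPU().beginSample("Physics");
			// ... longer-running work, measured with the basic (ms) timer ...
			gProfilerCPU().endSample("Physics");

			gProfilerCPU().beginSamplePrecise("AI");
			// ... short, cycle-sensitive work, measured via RDTSC ...
			gProfilerCPU().endSamplePrecise("AI");

			gProfilerCPU().endThread();

			// Snapshot the collected hierarchies, then clear this thread's data
			CPUProfilerReport report = gProfilerCPU().generateReport();
			gProfilerCPU().reset();
		}
	*/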
}