BsProfilerCPU.cpp

//********************************** Banshee Engine (www.banshee3d.com) **************************************************//
//**************** Copyright (c) 2016 Marko Pintera ([email protected]). All rights reserved. **********************//
#include "BsProfilerCPU.h"
#include "BsDebug.h"
#include "BsPlatform.h"
#include <chrono>

#if BS_COMPILER == BS_COMPILER_GNUC || BS_COMPILER == BS_COMPILER_CLANG
#include "cpuid.h"
#endif

#if BS_COMPILER == BS_COMPILER_CLANG
#include "intrin.h"
#endif

using namespace std::chrono;

namespace bs
{
	ProfilerCPU::Timer::Timer()
		:startTime(0.0f)
	{
		time = 0.0f;
	}

	void ProfilerCPU::Timer::start()
	{
		startTime = getCurrentTime();
	}

	void ProfilerCPU::Timer::stop()
	{
		time += getCurrentTime() - startTime;
	}

	void ProfilerCPU::Timer::reset()
	{
		time = 0.0f;
	}

	inline double ProfilerCPU::Timer::getCurrentTime() const
	{
		steady_clock::time_point timeNow = mHRClock.now();
		nanoseconds timeNowNs = timeNow.time_since_epoch();

		// The timer reports milliseconds, so convert from the nanosecond tick count
		return timeNowNs.count() * 0.000001;
	}

	ProfilerCPU::TimerPrecise::TimerPrecise()
	{
		cycles = 0;
	}

	void ProfilerCPU::TimerPrecise::start()
	{
		startCycles = getNumCycles();
	}

	void ProfilerCPU::TimerPrecise::stop()
	{
		cycles += getNumCycles() - startCycles;
	}

	void ProfilerCPU::TimerPrecise::reset()
	{
		cycles = 0;
	}

	inline UINT64 ProfilerCPU::TimerPrecise::getNumCycles()
	{
#if BS_COMPILER == BS_COMPILER_GNUC
		// CPUID is a serializing instruction: it keeps out-of-order execution from moving
		// instructions across the RDTSC read and skewing the measurement
		unsigned int a = 0;
		unsigned int b[4];
		__get_cpuid(a, &b[0], &b[1], &b[2], &b[3]);

#if BS_ARCH_TYPE == BS_ARCHITECTURE_x86_64
		UINT32 __a, __d;
		__asm__ __volatile__ ("rdtsc" : "=a" (__a), "=d" (__d));
		return (UINT64(__a) | UINT64(__d) << 32);
#else
		UINT64 x;
		__asm__ volatile (".byte 0x0f, 0x31" : "=A" (x)); // Raw RDTSC opcode bytes
		return x;
#endif
#elif BS_COMPILER == BS_COMPILER_CLANG
		UINT32 a = 0;
		UINT32 b[4];
		__get_cpuid(a, &b[0], &b[1], &b[2], &b[3]);
		return __rdtsc();
#elif BS_COMPILER == BS_COMPILER_MSVC
		int a[4];
		int b = 0;
		__cpuid(a, b); // Serialize before reading the time-stamp counter
		return __rdtsc();
#else
		static_assert(false, "Unsupported compiler");
#endif
	}

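	// RDTSC returns a raw cycle count with no fixed time unit. A minimal sketch of converting
	// cycles to milliseconds by calibrating the counter against the steady clock (illustrative
	// only; it assumes getNumCycles() were accessible here, and the helper name is hypothetical):
#if 0
	inline double cyclesToMs(UINT64 numCycles)
	{
		static const double cyclesPerMs = []()
		{
			steady_clock::time_point start = steady_clock::now();
			UINT64 startCycles = ProfilerCPU::TimerPrecise::getNumCycles();

			// Spin for ~10ms to sample the counter's rate
			while (duration_cast<milliseconds>(steady_clock::now() - start).count() < 10)
			{ }

			UINT64 elapsedCycles = ProfilerCPU::TimerPrecise::getNumCycles() - startCycles;
			double elapsedMs = duration_cast<nanoseconds>(steady_clock::now() - start).count() * 0.000001;

			return elapsedCycles / elapsedMs;
		}();

		return numCycles / cyclesPerMs;
	}
#endif
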
	ProfilerCPU::ProfileData::ProfileData(FrameAlloc* alloc)
		:samples(alloc)
	{ }

	void ProfilerCPU::ProfileData::beginSample()
	{
		memAllocs = MemoryCounter::getNumAllocs();
		memFrees = MemoryCounter::getNumFrees();

		timer.reset();
		timer.start();
	}

	void ProfilerCPU::ProfileData::endSample()
	{
		timer.stop();

		UINT64 numAllocs = MemoryCounter::getNumAllocs() - memAllocs;
		UINT64 numFrees = MemoryCounter::getNumFrees() - memFrees;

		samples.push_back(ProfileSample(timer.time, numAllocs, numFrees));
	}

	void ProfilerCPU::ProfileData::resumeLastSample()
	{
		timer.start();
		samples.erase(samples.end() - 1);
	}

	ProfilerCPU::PreciseProfileData::PreciseProfileData(FrameAlloc* alloc)
		:samples(alloc)
	{ }

	void ProfilerCPU::PreciseProfileData::beginSample()
	{
		memAllocs = MemoryCounter::getNumAllocs();
		memFrees = MemoryCounter::getNumFrees();

		timer.reset();
		timer.start();
	}

	void ProfilerCPU::PreciseProfileData::endSample()
	{
		timer.stop();

		UINT64 numAllocs = MemoryCounter::getNumAllocs() - memAllocs;
		UINT64 numFrees = MemoryCounter::getNumFrees() - memFrees;

		samples.push_back(PreciseProfileSample(timer.cycles, numAllocs, numFrees));
	}

	void ProfilerCPU::PreciseProfileData::resumeLastSample()
	{
		timer.start();
		samples.erase(samples.end() - 1);
	}

	BS_THREADLOCAL ProfilerCPU::ThreadInfo* ProfilerCPU::ThreadInfo::activeThread = nullptr;

	ProfilerCPU::ThreadInfo::ThreadInfo()
		:isActive(false), rootBlock(nullptr), frameAlloc(1024 * 512), activeBlocks(nullptr)
	{
	}

	void ProfilerCPU::ThreadInfo::begin(const char* _name)
	{
		if(isActive)
		{
			LOGWRN("Profiler::beginThread called on a thread that was already being sampled");
			return;
		}

		if(rootBlock == nullptr)
			rootBlock = getBlock(_name);

		activeBlock = ActiveBlock(ActiveSamplingType::Basic, rootBlock);
		if (activeBlocks == nullptr)
			activeBlocks = frameAlloc.alloc<Stack<ActiveBlock, StdFrameAlloc<ActiveBlock>>>(&frameAlloc);

		activeBlocks->push(activeBlock);

		rootBlock->basic.beginSample();
		isActive = true;
	}

	void ProfilerCPU::ThreadInfo::end()
	{
		if(activeBlock.type == ActiveSamplingType::Basic)
			activeBlock.block->basic.endSample();
		else
			activeBlock.block->precise.endSample();

		activeBlocks->pop();

		if(!isActive)
			LOGWRN("Profiler::endThread called on a thread that isn't being sampled.");

		if (activeBlocks->size() > 0)
		{
			LOGWRN("Profiler::endThread called but not all sample pairs were closed. Sampling data will not be valid.");

			// Close any samples that were left open so the stack is left in a consistent state
			while (activeBlocks->size() > 0)
			{
				ActiveBlock& curBlock = activeBlocks->top();

				if(curBlock.type == ActiveSamplingType::Basic)
					curBlock.block->basic.endSample();
				else
					curBlock.block->precise.endSample();

				activeBlocks->pop();
			}
		}

		isActive = false;
		activeBlock = ActiveBlock();

		frameAlloc.dealloc(activeBlocks);
		activeBlocks = nullptr;
	}

	void ProfilerCPU::ThreadInfo::reset()
	{
		if(isActive)
			end();

		if(rootBlock != nullptr)
			releaseBlock(rootBlock);

		rootBlock = nullptr;

		frameAlloc.clear(); // Note: This never actually frees memory
	}

	ProfilerCPU::ProfiledBlock* ProfilerCPU::ThreadInfo::getBlock(const char* name)
	{
		ProfiledBlock* block = frameAlloc.alloc<ProfiledBlock>(&frameAlloc);
		block->name = (char*)frameAlloc.alloc(((UINT32)strlen(name) + 1) * sizeof(char));
		strcpy(block->name, name);

		return block;
	}

	void ProfilerCPU::ThreadInfo::releaseBlock(ProfiledBlock* block)
	{
		frameAlloc.dealloc((UINT8*)block->name);
		frameAlloc.dealloc(block);
	}

	ProfilerCPU::ProfiledBlock::ProfiledBlock(FrameAlloc* alloc)
		:basic(alloc), precise(alloc), children(alloc)
	{ }

	ProfilerCPU::ProfiledBlock::~ProfiledBlock()
	{
		ThreadInfo* thread = ThreadInfo::activeThread;

		for(auto& child : children)
			thread->releaseBlock(child);

		children.clear();
	}

	ProfilerCPU::ProfiledBlock* ProfilerCPU::ProfiledBlock::findChild(const char* name) const
	{
		for(auto& child : children)
		{
			if(strcmp(child->name, name) == 0)
				return child;
		}

		return nullptr;
	}

	ProfilerCPU::ProfilerCPU()
		: mBasicTimerOverhead(0.0), mPreciseTimerOverhead(0), mBasicSamplingOverheadMs(0.0), mPreciseSamplingOverheadMs(0.0)
		, mBasicSamplingOverheadCycles(0), mPreciseSamplingOverheadCycles(0)
	{
		// TODO - We only estimate overhead on program start. It might be better to estimate it each time beginThread is
		// called, and keep separate values per thread.
		estimateTimerOverhead();
	}

	ProfilerCPU::~ProfilerCPU()
	{
		reset();

		Lock lock(mThreadSync);

		for(auto& threadInfo : mActiveThreads)
			bs_delete<ThreadInfo, ProfilerAlloc>(threadInfo);
	}

	void ProfilerCPU::beginThread(const char* name)
	{
		ThreadInfo* thread = ThreadInfo::activeThread;
		if(thread == nullptr)
		{
			ThreadInfo::activeThread = bs_new<ThreadInfo, ProfilerAlloc>();
			thread = ThreadInfo::activeThread;

			{
				Lock lock(mThreadSync);
				mActiveThreads.push_back(thread);
			}
		}

		thread->begin(name);
	}

	void ProfilerCPU::endThread()
	{
		// Intentionally no null check here, so the active sample can be ended as soon as possible
		ThreadInfo::activeThread->end();
	}

	void ProfilerCPU::beginSample(const char* name)
	{
		ThreadInfo* thread = ThreadInfo::activeThread;
		if(thread == nullptr || !thread->isActive)
		{
			beginThread("Unknown");
			thread = ThreadInfo::activeThread;
		}

		ProfiledBlock* parent = thread->activeBlock.block;
		ProfiledBlock* block = nullptr;

		if(parent != nullptr)
			block = parent->findChild(name);

		if(block == nullptr)
		{
			block = thread->getBlock(name);

			if(parent != nullptr)
				parent->children.push_back(block);
			else
				thread->rootBlock->children.push_back(block);
		}

		thread->activeBlock = ActiveBlock(ActiveSamplingType::Basic, block);
		thread->activeBlocks->push(thread->activeBlock);

		block->basic.beginSample();
	}

	void ProfilerCPU::endSample(const char* name)
	{
		ThreadInfo* thread = ThreadInfo::activeThread;
		ProfiledBlock* block = thread->activeBlock.block;

#if BS_DEBUG_MODE
		if(block == nullptr)
		{
			LOGWRN("Mismatched CPUProfiler::endSample. No beginSample was called.");
			return;
		}

		if(thread->activeBlock.type == ActiveSamplingType::Precise)
		{
			LOGWRN("Mismatched CPUProfiler::endSample. Was expecting Profiler::endSamplePrecise.");
			return;
		}

		if(strcmp(block->name, name) != 0)
		{
			LOGWRN("Mismatched CPUProfiler::endSample. Was expecting \"" + String(block->name) +
				"\" but got \"" + String(name) + "\". Sampling data will not be valid.");
			return;
		}
#endif

		block->basic.endSample();

		thread->activeBlocks->pop();

		if (!thread->activeBlocks->empty())
			thread->activeBlock = thread->activeBlocks->top();
		else
			thread->activeBlock = ActiveBlock();
	}

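	// A minimal sketch of how the begin/end pairs above are meant to nest (illustrative only;
	// the sample names and the work being measured are hypothetical):
#if 0
	gProfilerCPU().beginSample("Frame");
	gProfilerCPU().beginSample("Physics"); // Recorded as a child block of "Frame"
	// ... physics work ...
	gProfilerCPU().endSample("Physics");
	// ... remaining frame work ...
	gProfilerCPU().endSample("Frame"); // Pairs must close in LIFO order, with matching names
#endif
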
	void ProfilerCPU::beginSamplePrecise(const char* name)
	{
		// Note: There is a (small) possibility that a context switch happens during this measurement, in which case the
		// result will be skewed. Increasing thread priority might help. This is generally only a problem with code that
		// executes for a long time (10-15+ ms, depending on the length of the OS scheduler's quantum).

		ThreadInfo* thread = ThreadInfo::activeThread;
		if(thread == nullptr || !thread->isActive)
		{
			beginThread("Unknown");
			thread = ThreadInfo::activeThread; // Re-fetch, as beginThread may have just created the thread info
		}

		ProfiledBlock* parent = thread->activeBlock.block;
		ProfiledBlock* block = nullptr;

		if(parent != nullptr)
			block = parent->findChild(name);

		if(block == nullptr)
		{
			block = thread->getBlock(name);

			if(parent != nullptr)
				parent->children.push_back(block);
			else
				thread->rootBlock->children.push_back(block);
		}

		thread->activeBlock = ActiveBlock(ActiveSamplingType::Precise, block);
		thread->activeBlocks->push(thread->activeBlock);

		block->precise.beginSample();
	}

	void ProfilerCPU::endSamplePrecise(const char* name)
	{
		ThreadInfo* thread = ThreadInfo::activeThread;
		ProfiledBlock* block = thread->activeBlock.block;

#if BS_DEBUG_MODE
		if(block == nullptr)
		{
			LOGWRN("Mismatched Profiler::endSamplePrecise. No beginSamplePrecise was called.");
			return;
		}

		if(thread->activeBlock.type == ActiveSamplingType::Basic)
		{
			LOGWRN("Mismatched CPUProfiler::endSamplePrecise. Was expecting Profiler::endSample.");
			return;
		}

		if (strcmp(block->name, name) != 0)
		{
			LOGWRN("Mismatched Profiler::endSamplePrecise. Was expecting \"" + String(block->name) +
				"\" but got \"" + String(name) + "\". Sampling data will not be valid.");
			return;
		}
#endif

		block->precise.endSample();

		thread->activeBlocks->pop();

		if (!thread->activeBlocks->empty())
			thread->activeBlock = thread->activeBlocks->top();
		else
			thread->activeBlock = ActiveBlock();
	}

	void ProfilerCPU::reset()
	{
		ThreadInfo* thread = ThreadInfo::activeThread;

		if(thread != nullptr)
			thread->reset();
	}

	CPUProfilerReport ProfilerCPU::generateReport()
	{
		CPUProfilerReport report;

		ThreadInfo* thread = ThreadInfo::activeThread;
		if(thread == nullptr)
			return report;

		if(thread->isActive)
			thread->end();

		// We need to separate out basic and precise data and form two separate hierarchies
		if(thread->rootBlock == nullptr)
			return report;

		struct TempEntry
		{
			TempEntry(ProfiledBlock* _parentBlock, UINT32 _entryIdx)
				:parentBlock(_parentBlock), entryIdx(_entryIdx)
			{ }

			ProfiledBlock* parentBlock;
			UINT32 entryIdx;
			ProfilerVector<UINT32> childIndexes;
		};

		ProfilerVector<CPUProfilerBasicSamplingEntry> basicEntries;
		ProfilerVector<CPUProfilerPreciseSamplingEntry> preciseEntries;

		// Fill the flatHierarchy array so that children are always processed before their parents
		ProfilerStack<UINT32> todo;
		ProfilerVector<TempEntry> flatHierarchy;

		UINT32 entryIdx = 0;
		todo.push(entryIdx);
		flatHierarchy.push_back(TempEntry(thread->rootBlock, entryIdx));
		entryIdx++;

		while(!todo.empty())
		{
			UINT32 curDataIdx = todo.top();
			ProfiledBlock* curBlock = flatHierarchy[curDataIdx].parentBlock;

			todo.pop();

			for(auto& child : curBlock->children)
			{
				flatHierarchy[curDataIdx].childIndexes.push_back(entryIdx);

				todo.push(entryIdx);
				flatHierarchy.push_back(TempEntry(child, entryIdx));
				entryIdx++;
			}
		}
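		// For example, a tree Root { A { C }, B } yields flatHierarchy = [Root, A, B, C]; the reverse
		// iteration below therefore visits C, B, A, Root, so each child's totals are already final by
		// the time its parent aggregates them.
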
		// Calculate sampling data for all entries
		basicEntries.resize(flatHierarchy.size());
		preciseEntries.resize(flatHierarchy.size());

		for(auto iter = flatHierarchy.rbegin(); iter != flatHierarchy.rend(); ++iter)
		{
			TempEntry& curData = *iter;
			ProfiledBlock* curBlock = curData.parentBlock;

			CPUProfilerBasicSamplingEntry* entryBasic = &basicEntries[curData.entryIdx];
			CPUProfilerPreciseSamplingEntry* entryPrecise = &preciseEntries[curData.entryIdx];

			// Calculate basic data
			entryBasic->data.name = String(curBlock->name);

			entryBasic->data.memAllocs = 0;
			entryBasic->data.memFrees = 0;
			entryBasic->data.totalTimeMs = 0.0;
			entryBasic->data.maxTimeMs = 0.0;
			for(auto& sample : curBlock->basic.samples)
			{
				entryBasic->data.totalTimeMs += sample.time;
				entryBasic->data.maxTimeMs = std::max(entryBasic->data.maxTimeMs, sample.time);
				entryBasic->data.memAllocs += sample.numAllocs;
				entryBasic->data.memFrees += sample.numFrees;
			}

			entryBasic->data.numCalls = (UINT32)curBlock->basic.samples.size();

			if(entryBasic->data.numCalls > 0)
				entryBasic->data.avgTimeMs = entryBasic->data.totalTimeMs / entryBasic->data.numCalls;

			double totalChildTime = 0.0;
			for(auto& childIdx : curData.childIndexes)
			{
				CPUProfilerBasicSamplingEntry* childEntry = &basicEntries[childIdx];
				totalChildTime += childEntry->data.totalTimeMs;
				childEntry->data.pctOfParent = (float)(childEntry->data.totalTimeMs / entryBasic->data.totalTimeMs);

				entryBasic->data.estimatedOverheadMs += childEntry->data.estimatedOverheadMs;
			}

			entryBasic->data.estimatedOverheadMs += curBlock->basic.samples.size() * mBasicSamplingOverheadMs;
			entryBasic->data.estimatedOverheadMs += curBlock->precise.samples.size() * mPreciseSamplingOverheadMs;

			entryBasic->data.totalSelfTimeMs = entryBasic->data.totalTimeMs - totalChildTime;

			if(entryBasic->data.numCalls > 0)
				entryBasic->data.avgSelfTimeMs = entryBasic->data.totalSelfTimeMs / entryBasic->data.numCalls;

			entryBasic->data.estimatedSelfOverheadMs = mBasicTimerOverhead;

			// Calculate precise data
			entryPrecise->data.name = String(curBlock->name);

			entryPrecise->data.memAllocs = 0;
			entryPrecise->data.memFrees = 0;
			entryPrecise->data.totalCycles = 0;
			entryPrecise->data.maxCycles = 0;
			for(auto& sample : curBlock->precise.samples)
			{
				entryPrecise->data.totalCycles += sample.cycles;
				entryPrecise->data.maxCycles = std::max(entryPrecise->data.maxCycles, sample.cycles);
				entryPrecise->data.memAllocs += sample.numAllocs;
				entryPrecise->data.memFrees += sample.numFrees;
			}

			entryPrecise->data.numCalls = (UINT32)curBlock->precise.samples.size();

			if(entryPrecise->data.numCalls > 0)
				entryPrecise->data.avgCycles = entryPrecise->data.totalCycles / entryPrecise->data.numCalls;

			UINT64 totalChildCycles = 0;
			for(auto& childIdx : curData.childIndexes)
			{
				CPUProfilerPreciseSamplingEntry* childEntry = &preciseEntries[childIdx];
				totalChildCycles += childEntry->data.totalCycles;
				childEntry->data.pctOfParent = childEntry->data.totalCycles / (float)entryPrecise->data.totalCycles;

				entryPrecise->data.estimatedOverhead += childEntry->data.estimatedOverhead;
			}

			entryPrecise->data.estimatedOverhead += curBlock->precise.samples.size() * mPreciseSamplingOverheadCycles;
			entryPrecise->data.estimatedOverhead += curBlock->basic.samples.size() * mBasicSamplingOverheadCycles;

			entryPrecise->data.totalSelfCycles = entryPrecise->data.totalCycles - totalChildCycles;

			if(entryPrecise->data.numCalls > 0)
				entryPrecise->data.avgSelfCycles = entryPrecise->data.totalSelfCycles / entryPrecise->data.numCalls;

			entryPrecise->data.estimatedSelfOverhead = mPreciseTimerOverhead;
		}

		// Prune empty basic entries
		ProfilerStack<UINT32> finalBasicHierarchyTodo;
		ProfilerStack<UINT32> parentBasicEntryIndexes;
		ProfilerVector<TempEntry> newBasicEntries;

		finalBasicHierarchyTodo.push(0);

		entryIdx = 0;
		parentBasicEntryIndexes.push(entryIdx);
		newBasicEntries.push_back(TempEntry(nullptr, entryIdx));

		entryIdx++;

		while(!finalBasicHierarchyTodo.empty())
		{
			UINT32 parentEntryIdx = parentBasicEntryIndexes.top();
			parentBasicEntryIndexes.pop();

			UINT32 curEntryIdx = finalBasicHierarchyTodo.top();
			TempEntry& curEntry = flatHierarchy[curEntryIdx];
			finalBasicHierarchyTodo.pop();

			for(auto& childIdx : curEntry.childIndexes)
			{
				finalBasicHierarchyTodo.push(childIdx);

				CPUProfilerBasicSamplingEntry& basicEntry = basicEntries[childIdx];
				if(basicEntry.data.numCalls > 0)
				{
					newBasicEntries.push_back(TempEntry(nullptr, childIdx));
					newBasicEntries[parentEntryIdx].childIndexes.push_back(entryIdx);

					parentBasicEntryIndexes.push(entryIdx);

					entryIdx++;
				}
				else
					parentBasicEntryIndexes.push(parentEntryIdx);
			}
		}

		if(newBasicEntries.size() > 0)
		{
			ProfilerVector<CPUProfilerBasicSamplingEntry*> finalBasicEntries;

			report.mBasicSamplingRootEntry = basicEntries[newBasicEntries[0].entryIdx];
			finalBasicEntries.push_back(&report.mBasicSamplingRootEntry);

			finalBasicHierarchyTodo.push(0);

			while(!finalBasicHierarchyTodo.empty())
			{
				UINT32 curEntryIdx = finalBasicHierarchyTodo.top();
				finalBasicHierarchyTodo.pop();

				TempEntry& curEntry = newBasicEntries[curEntryIdx];
				CPUProfilerBasicSamplingEntry* basicEntry = finalBasicEntries[curEntryIdx];

				basicEntry->childEntries.resize(curEntry.childIndexes.size());
				UINT32 idx = 0;

				for(auto& childIdx : curEntry.childIndexes)
				{
					TempEntry& childEntry = newBasicEntries[childIdx];
					basicEntry->childEntries[idx] = basicEntries[childEntry.entryIdx];

					finalBasicEntries.push_back(&(basicEntry->childEntries[idx]));
					finalBasicHierarchyTodo.push(childIdx);

					idx++;
				}
			}
		}

		// Prune empty precise entries
		ProfilerStack<UINT32> finalPreciseHierarchyTodo;
		ProfilerStack<UINT32> parentPreciseEntryIndexes;
		ProfilerVector<TempEntry> newPreciseEntries;

		finalPreciseHierarchyTodo.push(0);

		entryIdx = 0;
		parentPreciseEntryIndexes.push(entryIdx);
		newPreciseEntries.push_back(TempEntry(nullptr, entryIdx));

		entryIdx++;

		while(!finalPreciseHierarchyTodo.empty())
		{
			UINT32 parentEntryIdx = parentPreciseEntryIndexes.top();
			parentPreciseEntryIndexes.pop();

			UINT32 curEntryIdx = finalPreciseHierarchyTodo.top();
			TempEntry& curEntry = flatHierarchy[curEntryIdx];
			finalPreciseHierarchyTodo.pop();

			for(auto& childIdx : curEntry.childIndexes)
			{
				finalPreciseHierarchyTodo.push(childIdx);

				CPUProfilerPreciseSamplingEntry& preciseEntry = preciseEntries[childIdx];
				if(preciseEntry.data.numCalls > 0)
				{
					newPreciseEntries.push_back(TempEntry(nullptr, childIdx));
					newPreciseEntries[parentEntryIdx].childIndexes.push_back(entryIdx);

					parentPreciseEntryIndexes.push(entryIdx);

					entryIdx++;
				}
				else
					parentPreciseEntryIndexes.push(parentEntryIdx);
			}
		}

		if(newPreciseEntries.size() > 0)
		{
			ProfilerVector<CPUProfilerPreciseSamplingEntry*> finalPreciseEntries;

			report.mPreciseSamplingRootEntry = preciseEntries[newPreciseEntries[0].entryIdx];
			finalPreciseEntries.push_back(&report.mPreciseSamplingRootEntry);

			finalPreciseHierarchyTodo.push(0);

			while(!finalPreciseHierarchyTodo.empty())
			{
				UINT32 curEntryIdx = finalPreciseHierarchyTodo.top();
				finalPreciseHierarchyTodo.pop();

				TempEntry& curEntry = newPreciseEntries[curEntryIdx];
				CPUProfilerPreciseSamplingEntry* preciseEntry = finalPreciseEntries[curEntryIdx];

				preciseEntry->childEntries.resize(curEntry.childIndexes.size());
				UINT32 idx = 0;

				for(auto& childIdx : curEntry.childIndexes)
				{
					TempEntry& childEntry = newPreciseEntries[childIdx];
					preciseEntry->childEntries[idx] = preciseEntries[childEntry.entryIdx];

					finalPreciseEntries.push_back(&(preciseEntry->childEntries[idx]));
					finalPreciseHierarchyTodo.push(childIdx);

					idx++;
				}
			}
		}

		return report;
	}

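	// A minimal sketch of consuming the report produced above by walking the basic sampling
	// hierarchy (illustrative only; it assumes access to the report's root entry and a LOGDBG
	// macro analogous to the LOGWRN used elsewhere in this file; actual names may differ):
#if 0
	void printEntry(const CPUProfilerBasicSamplingEntry& entry, UINT32 depth = 0)
	{
		LOGDBG(String(depth * 2, ' ') + entry.data.name + ": " + toString(entry.data.avgTimeMs) +
			" ms avg over " + toString(entry.data.numCalls) + " calls");

		for (auto& child : entry.childEntries)
			printEntry(child, depth + 1);
	}
#endif
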
	void ProfilerCPU::estimateTimerOverhead()
	{
		// Estimate how long timer calls and RDTSC take, so their overhead can later be subtracted from reports
		const UINT32 reps = 1000, sampleReps = 20;

		// Start from a large sentinel and keep the minimum across tries; larger measurements are
		// assumed to include scheduler noise
		mBasicTimerOverhead = 1000000.0;
		mPreciseTimerOverhead = 1000000;
		for (UINT32 tries = 0; tries < 20; tries++)
		{
			Timer timer;
			for (UINT32 i = 0; i < reps; i++)
			{
				timer.start();
				timer.stop();
			}

			double avgTime = double(timer.time)/double(reps);
			if (avgTime < mBasicTimerOverhead)
				mBasicTimerOverhead = avgTime;

			TimerPrecise timerPrecise;
			for (UINT32 i = 0; i < reps; i++)
			{
				timerPrecise.start();
				timerPrecise.stop();
			}

			UINT64 avgCycles = timerPrecise.cycles/reps;
			if (avgCycles < mPreciseTimerOverhead)
				mPreciseTimerOverhead = avgCycles;
		}

		mBasicSamplingOverheadMs = 1000000.0;
		mPreciseSamplingOverheadMs = 1000000.0;
		mBasicSamplingOverheadCycles = 1000000;
		mPreciseSamplingOverheadCycles = 1000000;
		for (UINT32 tries = 0; tries < 3; tries++)
		{
			/************************************************************************/
			/*                AVERAGE TIME IN MS FOR BASIC SAMPLING                 */
			/************************************************************************/

			Timer timerA;
			timerA.start();

			beginThread("Main");

			// Two different cases that can affect performance: one where the sample already exists
			// and one where a new one needs to be created
			for (UINT32 i = 0; i < sampleReps; i++)
			{
				beginSample("TestAvg1");
				endSample("TestAvg1");
				beginSample("TestAvg2");
				endSample("TestAvg2");
				beginSample("TestAvg3");
				endSample("TestAvg3");
				beginSample("TestAvg4");
				endSample("TestAvg4");
				beginSample("TestAvg5");
				endSample("TestAvg5");
				beginSample("TestAvg6");
				endSample("TestAvg6");
				beginSample("TestAvg7");
				endSample("TestAvg7");
				beginSample("TestAvg8");
				endSample("TestAvg8");
				beginSample("TestAvg9");
				endSample("TestAvg9");
				beginSample("TestAvg10");
				endSample("TestAvg10");
			}

			for (UINT32 i = 0; i < sampleReps * 5; i++)
			{
				beginSample(("TestAvg#" + toString(i)).c_str());
				endSample(("TestAvg#" + toString(i)).c_str());
			}

			endThread();

			timerA.stop();

			reset();

			double avgTimeBasic = double(timerA.time)/double(sampleReps * 10 + sampleReps * 5) - mBasicTimerOverhead;
			if (avgTimeBasic < mBasicSamplingOverheadMs)
				mBasicSamplingOverheadMs = avgTimeBasic;

			/************************************************************************/
			/*                 AVERAGE CYCLES FOR BASIC SAMPLING                    */
			/************************************************************************/

			TimerPrecise timerPreciseA;
			timerPreciseA.start();

			beginThread("Main");

			// Two different cases that can affect performance: one where the sample already exists
			// and one where a new one needs to be created
			for (UINT32 i = 0; i < sampleReps; i++)
			{
				beginSample("TestAvg1");
				endSample("TestAvg1");
				beginSample("TestAvg2");
				endSample("TestAvg2");
				beginSample("TestAvg3");
				endSample("TestAvg3");
				beginSample("TestAvg4");
				endSample("TestAvg4");
				beginSample("TestAvg5");
				endSample("TestAvg5");
				beginSample("TestAvg6");
				endSample("TestAvg6");
				beginSample("TestAvg7");
				endSample("TestAvg7");
				beginSample("TestAvg8");
				endSample("TestAvg8");
				beginSample("TestAvg9");
				endSample("TestAvg9");
				beginSample("TestAvg10");
				endSample("TestAvg10");
			}

			for (UINT32 i = 0; i < sampleReps * 5; i++)
			{
				beginSample(("TestAvg#" + toString(i)).c_str());
				endSample(("TestAvg#" + toString(i)).c_str());
			}

			endThread();

			timerPreciseA.stop();

			reset();

			UINT64 avgCyclesBasic = timerPreciseA.cycles/(sampleReps * 10 + sampleReps * 5) - mPreciseTimerOverhead;
			if (avgCyclesBasic < mBasicSamplingOverheadCycles)
				mBasicSamplingOverheadCycles = avgCyclesBasic;

			/************************************************************************/
			/*               AVERAGE TIME IN MS FOR PRECISE SAMPLING                */
			/************************************************************************/

			Timer timerB;
			timerB.start();

			beginThread("Main");

			// Two different cases that can affect performance: one where the sample already exists
			// and one where a new one needs to be created
			for (UINT32 i = 0; i < sampleReps; i++)
			{
				beginSamplePrecise("TestAvg1");
				endSamplePrecise("TestAvg1");
				beginSamplePrecise("TestAvg2");
				endSamplePrecise("TestAvg2");
				beginSamplePrecise("TestAvg3");
				endSamplePrecise("TestAvg3");
				beginSamplePrecise("TestAvg4");
				endSamplePrecise("TestAvg4");
				beginSamplePrecise("TestAvg5");
				endSamplePrecise("TestAvg5");
				beginSamplePrecise("TestAvg6");
				endSamplePrecise("TestAvg6");
				beginSamplePrecise("TestAvg7");
				endSamplePrecise("TestAvg7");
				beginSamplePrecise("TestAvg8");
				endSamplePrecise("TestAvg8");
				beginSamplePrecise("TestAvg9");
				endSamplePrecise("TestAvg9");
				beginSamplePrecise("TestAvg10");
				endSamplePrecise("TestAvg10");
			}

			for (UINT32 i = 0; i < sampleReps * 5; i++)
			{
				beginSamplePrecise(("TestAvg#" + toString(i)).c_str());
				endSamplePrecise(("TestAvg#" + toString(i)).c_str());
			}

			endThread();

			timerB.stop();

			reset();

			double avgTimesPrecise = timerB.time/(sampleReps * 10 + sampleReps * 5);
			if (avgTimesPrecise < mPreciseSamplingOverheadMs)
				mPreciseSamplingOverheadMs = avgTimesPrecise;

			/************************************************************************/
			/*                AVERAGE CYCLES FOR PRECISE SAMPLING                   */
			/************************************************************************/

			TimerPrecise timerPreciseB;
			timerPreciseB.start();

			beginThread("Main");

			// Two different cases that can affect performance: one where the sample already exists
			// and one where a new one needs to be created
			for (UINT32 i = 0; i < sampleReps; i++)
			{
				beginSamplePrecise("TestAvg1");
				endSamplePrecise("TestAvg1");
				beginSamplePrecise("TestAvg2");
				endSamplePrecise("TestAvg2");
				beginSamplePrecise("TestAvg3");
				endSamplePrecise("TestAvg3");
				beginSamplePrecise("TestAvg4");
				endSamplePrecise("TestAvg4");
				beginSamplePrecise("TestAvg5");
				endSamplePrecise("TestAvg5");
				beginSamplePrecise("TestAvg6");
				endSamplePrecise("TestAvg6");
				beginSamplePrecise("TestAvg7");
				endSamplePrecise("TestAvg7");
				beginSamplePrecise("TestAvg8");
				endSamplePrecise("TestAvg8");
				beginSamplePrecise("TestAvg9");
				endSamplePrecise("TestAvg9");
				beginSamplePrecise("TestAvg10");
				endSamplePrecise("TestAvg10");
			}

			for (UINT32 i = 0; i < sampleReps * 5; i++)
			{
				beginSamplePrecise(("TestAvg#" + toString(i)).c_str());
				endSamplePrecise(("TestAvg#" + toString(i)).c_str());
			}

			endThread();

			timerPreciseB.stop();

			reset();

			UINT64 avgCyclesPrecise = timerPreciseB.cycles/(sampleReps * 10 + sampleReps * 5);
			if (avgCyclesPrecise < mPreciseSamplingOverheadCycles)
				mPreciseSamplingOverheadCycles = avgCyclesPrecise;
		}
	}

	CPUProfilerBasicSamplingEntry::Data::Data()
		:numCalls(0), avgTimeMs(0.0), maxTimeMs(0.0), totalTimeMs(0.0),
		avgSelfTimeMs(0.0), totalSelfTimeMs(0.0), estimatedSelfOverheadMs(0.0),
		estimatedOverheadMs(0.0), pctOfParent(1.0f)
	{ }

	CPUProfilerPreciseSamplingEntry::Data::Data()
		:numCalls(0), avgCycles(0), maxCycles(0), totalCycles(0),
		avgSelfCycles(0), totalSelfCycles(0), estimatedSelfOverhead(0),
		estimatedOverhead(0), pctOfParent(1.0f)
	{ }

	CPUProfilerReport::CPUProfilerReport()
	{
	}

	ProfilerCPU& gProfilerCPU()
	{
		return ProfilerCPU::instance();
	}
}
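
// A minimal usage sketch of the profiler defined above (illustrative only; the thread and
// sample names are hypothetical):
#if 0
void runProfiledWork()
{
	bs::gProfilerCPU().beginThread("Worker");

	bs::gProfilerCPU().beginSample("Update");
	// ... work measured with the chrono-based timer ...
	bs::gProfilerCPU().endSample("Update");

	bs::gProfilerCPU().beginSamplePrecise("TightLoop");
	// ... short, cycle-sensitive work measured via RDTSC ...
	bs::gProfilerCPU().endSamplePrecise("TightLoop");

	bs::gProfilerCPU().endThread();

	// Harvest the results; generateReport() ends the thread itself if it is still active
	bs::CPUProfilerReport report = bs::gProfilerCPU().generateReport();
	bs::gProfilerCPU().reset();
}
#endif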