//__________________________ Banshee Project - A modern game development toolkit _________________________________//
//_____________________________________ www.banshee-project.com __________________________________________________//
//________________________ Copyright (c) 2014 Marko Pintera. All rights reserved. ________________________________//
#include "BsProfilerCPU.h"
#include "BsDebug.h"
#include "BsPlatform.h"

namespace BansheeEngine
{
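    // Basic timer: accumulates elapsed time in milliseconds across start()/stop()
    // pairs, using the platform's high-resolution performance counter.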
    ProfilerCPU::Timer::Timer()
    {
        time = 0.0f;
    }

    void ProfilerCPU::Timer::start()
    {
        startTime = getCurrentTime();
    }

    void ProfilerCPU::Timer::stop()
    {
        time += getCurrentTime() - startTime;
    }

    void ProfilerCPU::Timer::reset()
    {
        time = 0.0f;
    }

    inline double ProfilerCPU::Timer::getCurrentTime()
    {
        return Platform::queryPerformanceTimerMs();
    }
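    // Precise timer: accumulates elapsed CPU cycles across start()/stop() pairs by
    // reading the timestamp counter (RDTSC). Raw cycle counts are generally only
    // meaningful when compared on the same core at a stable clock frequency.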
    ProfilerCPU::TimerPrecise::TimerPrecise()
    {
        cycles = 0;
    }

    void ProfilerCPU::TimerPrecise::start()
    {
        startCycles = getNumCycles();
    }

    void ProfilerCPU::TimerPrecise::stop()
    {
        cycles += getNumCycles() - startCycles;
    }

    void ProfilerCPU::TimerPrecise::reset()
    {
        cycles = 0;
    }

    inline UINT64 ProfilerCPU::TimerPrecise::getNumCycles()
    {
#if BS_COMPILER == BS_COMPILER_GNUC
        // Serialize the instruction stream with CPUID before reading the timestamp counter
        asm volatile("cpuid" : : : "%eax", "%ebx", "%ecx", "%edx");
        UINT32 __a, __d;
        asm volatile("rdtsc" : "=a" (__a), "=d" (__d));
        return (UINT64(__a) | UINT64(__d) << 32);
#else
        int a[4];
        int b = 0;
        __cpuid(a, b);
        return __rdtsc();
#endif
    }
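    // Per-block sample bookkeeping: each begin/end pair records the elapsed time
    // (or cycle count, for the precise variant) together with the number of memory
    // allocations and frees that happened while the sample was active.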
    void ProfilerCPU::ProfileData::beginSample()
    {
        memAllocs = MemoryCounter::getNumAllocs();
        memFrees = MemoryCounter::getNumFrees();

        timer.reset();
        timer.start();
    }

    void ProfilerCPU::ProfileData::endSample()
    {
        timer.stop();

        UINT64 numAllocs = MemoryCounter::getNumAllocs() - memAllocs;
        UINT64 numFrees = MemoryCounter::getNumFrees() - memFrees;

        samples.push_back(ProfileSample(timer.time, numAllocs, numFrees));
    }

    void ProfilerCPU::ProfileData::resumeLastSample()
    {
        timer.start();
        samples.erase(samples.end() - 1);
    }

    void ProfilerCPU::PreciseProfileData::beginSample()
    {
        memAllocs = MemoryCounter::getNumAllocs();
        memFrees = MemoryCounter::getNumFrees();

        timer.reset();
        timer.start();
    }

    void ProfilerCPU::PreciseProfileData::endSample()
    {
        timer.stop();

        UINT64 numAllocs = MemoryCounter::getNumAllocs() - memAllocs;
        UINT64 numFrees = MemoryCounter::getNumFrees() - memFrees;

        samples.push_back(PreciseProfileSample(timer.cycles, numAllocs, numFrees));
    }

    void ProfilerCPU::PreciseProfileData::resumeLastSample()
    {
        timer.start();
        samples.erase(samples.end() - 1);
    }
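    // Per-thread sampling state. Every thread that gets profiled receives its own
    // thread-local ThreadInfo, holding the root block and a stack of currently
    // active (begun but not yet ended) blocks.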
    BS_THREADLOCAL ProfilerCPU::ThreadInfo* ProfilerCPU::ThreadInfo::activeThread = nullptr;

    ProfilerCPU::ThreadInfo::ThreadInfo()
        :isActive(false), rootBlock(nullptr)
    {
    }

    void ProfilerCPU::ThreadInfo::begin(const ProfilerString& _name)
    {
        if(isActive)
        {
            LOGWRN("Profiler::beginThread called on a thread that was already being sampled");
            return;
        }

        if(rootBlock == nullptr)
            rootBlock = getBlock();

        activeBlock = ActiveBlock(ActiveSamplingType::Basic, rootBlock);
        activeBlocks.push(activeBlock);

        rootBlock->name = _name;
        rootBlock->basic.beginSample();
        isActive = true;
    }
    void ProfilerCPU::ThreadInfo::end()
    {
        if(activeBlock.type == ActiveSamplingType::Basic)
            activeBlock.block->basic.endSample();
        else
            activeBlock.block->precise.endSample();

        activeBlocks.pop();

        if(!isActive)
            LOGWRN("Profiler::endThread called on a thread that isn't being sampled.");

        if(activeBlocks.size() > 0)
        {
            LOGWRN("Profiler::endThread called but not all sample pairs were closed. Sampling data will not be valid.");

            while(activeBlocks.size() > 0)
            {
                ActiveBlock& curBlock = activeBlocks.top();
                if(curBlock.type == ActiveSamplingType::Basic)
                    curBlock.block->basic.endSample();
                else
                    curBlock.block->precise.endSample();

                activeBlocks.pop();
            }
        }

        isActive = false;
        activeBlocks = ProfilerStack<ActiveBlock>();
        activeBlock = ActiveBlock();
    }
    void ProfilerCPU::ThreadInfo::reset()
    {
        if(isActive)
            end();

        if(rootBlock != nullptr)
            releaseBlock(rootBlock);

        rootBlock = nullptr;
    }

    ProfilerCPU::ProfiledBlock* ProfilerCPU::ThreadInfo::getBlock()
    {
        // TODO - Pool this, if possible using the memory allocator stuff
        // TODO - Also consider moving all samples into ThreadInfo and pooling them as well
        // (otherwise ProfiledBlock can't be pooled, since it is variable size)
        return bs_new<ProfiledBlock, ProfilerAlloc>();
    }

    void ProfilerCPU::ThreadInfo::releaseBlock(ProfilerCPU::ProfiledBlock* block)
    {
        bs_delete<ProfilerAlloc>(block);
    }
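    // A node in the sampled block hierarchy. A block owns its children and
    // releases them recursively on destruction.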
    ProfilerCPU::ProfiledBlock::ProfiledBlock()
    { }

    ProfilerCPU::ProfiledBlock::~ProfiledBlock()
    {
        ThreadInfo* thread = ThreadInfo::activeThread;

        for(auto& child : children)
            thread->releaseBlock(child);

        children.clear();
    }

    ProfilerCPU::ProfiledBlock* ProfilerCPU::ProfiledBlock::findChild(const ProfilerString& name) const
    {
        for(auto& child : children)
        {
            if(child->name == name)
                return child;
        }

        return nullptr;
    }
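    // Timer and sampling overhead is estimated once at construction and later used
    // to fill out the estimated-overhead fields of generated reports.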
    ProfilerCPU::ProfilerCPU()
        :mBasicTimerOverhead(0.0), mPreciseTimerOverhead(0), mBasicSamplingOverheadMs(0.0), mPreciseSamplingOverheadCycles(0),
        mBasicSamplingOverheadCycles(0), mPreciseSamplingOverheadMs(0.0)
    {
        // TODO - We only estimate overhead on program start. It might be better to estimate it each time beginThread is called,
        // and keep separate values per thread.
        estimateTimerOverhead();
    }

    ProfilerCPU::~ProfilerCPU()
    {
        reset();

        BS_LOCK_MUTEX(mThreadSync);

        for(auto& threadInfo : mActiveThreads)
            bs_delete<ProfilerAlloc>(threadInfo);
    }
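    // Lazily creates and registers the calling thread's ThreadInfo on first use.
    // Registration is mutex-protected because mActiveThreads is shared by all threads.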
    void ProfilerCPU::beginThread(const ProfilerString& name)
    {
        ThreadInfo* thread = ThreadInfo::activeThread;
        if(thread == nullptr)
        {
            ThreadInfo::activeThread = bs_new<ThreadInfo, ProfilerAlloc>();
            thread = ThreadInfo::activeThread;

            {
                BS_LOCK_MUTEX(mThreadSync);

                mActiveThreads.push_back(thread);
            }
        }

        thread->begin(name);
    }
    void ProfilerCPU::endThread()
    {
        // We skip the null check here on purpose, so the sample ends as soon as possible
        ThreadInfo::activeThread->end();
    }
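    // Finds (or creates) a child block with the given name under the currently
    // active block, then starts a new basic sample on it.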
    void ProfilerCPU::beginSample(const ProfilerString& name)
    {
        ThreadInfo* thread = ThreadInfo::activeThread;
        if(thread == nullptr || !thread->isActive)
        {
            beginThread("Unknown");
            thread = ThreadInfo::activeThread;
        }

        ProfiledBlock* parent = thread->activeBlock.block;
        ProfiledBlock* block = nullptr;

        if(parent != nullptr)
            block = parent->findChild(name);

        if(block == nullptr)
        {
            block = thread->getBlock();
            block->name = name;

            if(parent != nullptr)
                parent->children.push_back(block);
            else
                thread->rootBlock->children.push_back(block);
        }

        thread->activeBlock = ActiveBlock(ActiveSamplingType::Basic, block);
        thread->activeBlocks.push(thread->activeBlock);

        block->basic.beginSample();
    }
    void ProfilerCPU::endSample(const ProfilerString& name)
    {
        ThreadInfo* thread = ThreadInfo::activeThread;
        ProfiledBlock* block = thread->activeBlock.block;

#if BS_DEBUG_MODE
        if(block == nullptr)
        {
            LOGWRN("Mismatched CPUProfiler::endSample. No beginSample was called.");
            return;
        }

        if(thread->activeBlock.type == ActiveSamplingType::Precise)
        {
            LOGWRN("Mismatched CPUProfiler::endSample. Was expecting Profiler::endSamplePrecise.");
            return;
        }

        if(block->name != name)
        {
            LOGWRN("Mismatched CPUProfiler::endSample. Was expecting \"" + String(block->name.c_str()) +
                "\" but got \"" + String(name.c_str()) + "\". Sampling data will not be valid.");
            return;
        }
#endif

        block->basic.endSample();

        thread->activeBlocks.pop();
        if(!thread->activeBlocks.empty())
            thread->activeBlock = thread->activeBlocks.top();
        else
            thread->activeBlock = ActiveBlock();
    }
    void ProfilerCPU::beginSamplePrecise(const ProfilerString& name)
    {
        // Note: There is a (small) possibility a context switch will happen during this measurement, in which case the result will be skewed.
        // Increasing thread priority might help. This is generally only a problem with code that executes for a long time
        // (10-15+ ms, depending on the OS quantum length).
        ThreadInfo* thread = ThreadInfo::activeThread;
        if(thread == nullptr || !thread->isActive)
        {
            beginThread("Unknown");
            thread = ThreadInfo::activeThread; // Re-acquire the thread-local pointer; beginThread may have just created it
        }

        ProfiledBlock* parent = thread->activeBlock.block;
        ProfiledBlock* block = nullptr;

        if(parent != nullptr)
            block = parent->findChild(name);

        if(block == nullptr)
        {
            block = thread->getBlock();
            block->name = name;

            if(parent != nullptr)
                parent->children.push_back(block);
            else
                thread->rootBlock->children.push_back(block);
        }

        thread->activeBlock = ActiveBlock(ActiveSamplingType::Precise, block);
        thread->activeBlocks.push(thread->activeBlock);

        block->precise.beginSample();
    }
    void ProfilerCPU::endSamplePrecise(const ProfilerString& name)
    {
        ThreadInfo* thread = ThreadInfo::activeThread;
        ProfiledBlock* block = thread->activeBlock.block;

#if BS_DEBUG_MODE
        if(block == nullptr)
        {
            LOGWRN("Mismatched Profiler::endSamplePrecise. No beginSamplePrecise was called.");
            return;
        }

        if(thread->activeBlock.type == ActiveSamplingType::Basic)
        {
            LOGWRN("Mismatched CPUProfiler::endSamplePrecise. Was expecting Profiler::endSample.");
            return;
        }

        if(block->name != name)
        {
            LOGWRN("Mismatched Profiler::endSamplePrecise. Was expecting \"" + String(block->name.c_str()) +
                "\" but got \"" + String(name.c_str()) + "\". Sampling data will not be valid.");
            return;
        }
#endif

        block->precise.endSample();

        thread->activeBlocks.pop();
        if(!thread->activeBlocks.empty())
            thread->activeBlock = thread->activeBlocks.top();
        else
            thread->activeBlock = ActiveBlock();
    }
    void ProfilerCPU::reset()
    {
        ThreadInfo* thread = ThreadInfo::activeThread;

        if(thread != nullptr)
            thread->reset();
    }
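    // Builds the final report in three steps: flatten the block hierarchy so that
    // children can be processed before their parents, calculate per-entry timing
    // and overhead data, then prune entries with no recorded calls from the basic
    // and precise hierarchies separately.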
    CPUProfilerReport ProfilerCPU::generateReport()
    {
        CPUProfilerReport report;

        ThreadInfo* thread = ThreadInfo::activeThread;
        if(thread == nullptr)
            return report;

        if(thread->isActive)
            thread->end();

        // We need to separate out basic and precise data and form two separate hierarchies
        if(thread->rootBlock == nullptr)
            return report;

        struct TempEntry
        {
            TempEntry(ProfiledBlock* _parentBlock, UINT32 _entryIdx)
                :parentBlock(_parentBlock), entryIdx(_entryIdx)
            { }

            ProfiledBlock* parentBlock;
            UINT32 entryIdx;
            ProfilerVector<UINT32> childIndexes;
        };

        ProfilerVector<CPUProfilerBasicSamplingEntry> basicEntries;
        ProfilerVector<CPUProfilerPreciseSamplingEntry> preciseEntries;

        // Fill up the flatHierarchy array; entries are ordered so that, iterated in
        // reverse, children are always processed before their parents
        ProfilerStack<UINT32> todo;
        ProfilerVector<TempEntry> flatHierarchy;

        UINT32 entryIdx = 0;
        todo.push(entryIdx);
        flatHierarchy.push_back(TempEntry(thread->rootBlock, entryIdx));
        entryIdx++;

        while(!todo.empty())
        {
            UINT32 curDataIdx = todo.top();
            ProfiledBlock* curBlock = flatHierarchy[curDataIdx].parentBlock;
            todo.pop();

            for(auto& child : curBlock->children)
            {
                flatHierarchy[curDataIdx].childIndexes.push_back(entryIdx);
                todo.push(entryIdx);
                flatHierarchy.push_back(TempEntry(child, entryIdx));
                entryIdx++;
            }
        }
        // Calculate sampling data for all entries
        basicEntries.resize(flatHierarchy.size());
        preciseEntries.resize(flatHierarchy.size());

        for(auto iter = flatHierarchy.rbegin(); iter != flatHierarchy.rend(); ++iter)
        {
            TempEntry& curData = *iter;
            ProfiledBlock* curBlock = curData.parentBlock;

            CPUProfilerBasicSamplingEntry* entryBasic = &basicEntries[curData.entryIdx];
            CPUProfilerPreciseSamplingEntry* entryPrecise = &preciseEntries[curData.entryIdx];

            // Calculate basic data
            entryBasic->data.name = String(curBlock->name.c_str());
            entryBasic->data.memAllocs = 0;
            entryBasic->data.memFrees = 0;
            entryBasic->data.totalTimeMs = 0.0;
            entryBasic->data.maxTimeMs = 0.0;

            for(auto& sample : curBlock->basic.samples)
            {
                entryBasic->data.totalTimeMs += sample.time;
                entryBasic->data.maxTimeMs = std::max(entryBasic->data.maxTimeMs, sample.time);
                entryBasic->data.memAllocs += sample.numAllocs;
                entryBasic->data.memFrees += sample.numFrees;
            }

            entryBasic->data.numCalls = (UINT32)curBlock->basic.samples.size();

            if(entryBasic->data.numCalls > 0)
                entryBasic->data.avgTimeMs = entryBasic->data.totalTimeMs / entryBasic->data.numCalls;

            double totalChildTime = 0.0;
            for(auto& childIdx : curData.childIndexes)
            {
                CPUProfilerBasicSamplingEntry* childEntry = &basicEntries[childIdx];
                totalChildTime += childEntry->data.totalTimeMs;
                childEntry->data.pctOfParent = (float)(childEntry->data.totalTimeMs / entryBasic->data.totalTimeMs);

                entryBasic->data.estimatedOverheadMs += childEntry->data.estimatedOverheadMs;
            }

            entryBasic->data.estimatedOverheadMs += curBlock->basic.samples.size() * mBasicSamplingOverheadMs;
            entryBasic->data.estimatedOverheadMs += curBlock->precise.samples.size() * mPreciseSamplingOverheadMs;

            entryBasic->data.totalSelfTimeMs = entryBasic->data.totalTimeMs - totalChildTime;

            if(entryBasic->data.numCalls > 0)
                entryBasic->data.avgSelfTimeMs = entryBasic->data.totalSelfTimeMs / entryBasic->data.numCalls;

            entryBasic->data.estimatedSelfOverheadMs = mBasicTimerOverhead;

            // Calculate precise data
            entryPrecise->data.name = String(curBlock->name.c_str());
            entryPrecise->data.memAllocs = 0;
            entryPrecise->data.memFrees = 0;
            entryPrecise->data.totalCycles = 0;
            entryPrecise->data.maxCycles = 0;

            for(auto& sample : curBlock->precise.samples)
            {
                entryPrecise->data.totalCycles += sample.cycles;
                entryPrecise->data.maxCycles = std::max(entryPrecise->data.maxCycles, sample.cycles);
                entryPrecise->data.memAllocs += sample.numAllocs;
                entryPrecise->data.memFrees += sample.numFrees;
            }

            entryPrecise->data.numCalls = (UINT32)curBlock->precise.samples.size();

            if(entryPrecise->data.numCalls > 0)
                entryPrecise->data.avgCycles = entryPrecise->data.totalCycles / entryPrecise->data.numCalls;

            UINT64 totalChildCycles = 0;
            for(auto& childIdx : curData.childIndexes)
            {
                CPUProfilerPreciseSamplingEntry* childEntry = &preciseEntries[childIdx];
                totalChildCycles += childEntry->data.totalCycles;
                childEntry->data.pctOfParent = childEntry->data.totalCycles / (float)entryPrecise->data.totalCycles;

                entryPrecise->data.estimatedOverhead += childEntry->data.estimatedOverhead;
            }

            entryPrecise->data.estimatedOverhead += curBlock->precise.samples.size() * mPreciseSamplingOverheadCycles;
            entryPrecise->data.estimatedOverhead += curBlock->basic.samples.size() * mBasicSamplingOverheadCycles;

            entryPrecise->data.totalSelfCycles = entryPrecise->data.totalCycles - totalChildCycles;

            if(entryPrecise->data.numCalls > 0)
                entryPrecise->data.avgSelfCycles = entryPrecise->data.totalSelfCycles / entryPrecise->data.numCalls;

            entryPrecise->data.estimatedSelfOverhead = mPreciseTimerOverhead;
        }
        // Prune empty basic entries
        ProfilerStack<UINT32> finalBasicHierarchyTodo;
        ProfilerStack<UINT32> parentBasicEntryIndexes;
        ProfilerVector<TempEntry> newBasicEntries;

        finalBasicHierarchyTodo.push(0);

        entryIdx = 0;
        parentBasicEntryIndexes.push(entryIdx);
        newBasicEntries.push_back(TempEntry(nullptr, entryIdx));
        entryIdx++;

        while(!finalBasicHierarchyTodo.empty())
        {
            UINT32 parentEntryIdx = parentBasicEntryIndexes.top();
            parentBasicEntryIndexes.pop();

            UINT32 curEntryIdx = finalBasicHierarchyTodo.top();
            TempEntry& curEntry = flatHierarchy[curEntryIdx];
            finalBasicHierarchyTodo.pop();

            for(auto& childIdx : curEntry.childIndexes)
            {
                finalBasicHierarchyTodo.push(childIdx);

                CPUProfilerBasicSamplingEntry& basicEntry = basicEntries[childIdx];

                if(basicEntry.data.numCalls > 0)
                {
                    newBasicEntries.push_back(TempEntry(nullptr, childIdx));
                    newBasicEntries[parentEntryIdx].childIndexes.push_back(entryIdx);
                    parentBasicEntryIndexes.push(entryIdx);

                    entryIdx++;
                }
                else
                    parentBasicEntryIndexes.push(parentEntryIdx);
            }
        }

        if(newBasicEntries.size() > 0)
        {
            ProfilerVector<CPUProfilerBasicSamplingEntry*> finalBasicEntries;

            report.mBasicSamplingRootEntry = basicEntries[newBasicEntries[0].entryIdx];
            finalBasicEntries.push_back(&report.mBasicSamplingRootEntry);

            finalBasicHierarchyTodo.push(0);

            while(!finalBasicHierarchyTodo.empty())
            {
                UINT32 curEntryIdx = finalBasicHierarchyTodo.top();
                finalBasicHierarchyTodo.pop();

                TempEntry& curEntry = newBasicEntries[curEntryIdx];
                CPUProfilerBasicSamplingEntry* basicEntry = finalBasicEntries[curEntryIdx];

                basicEntry->childEntries.resize(curEntry.childIndexes.size());

                UINT32 idx = 0;
                for(auto& childIdx : curEntry.childIndexes)
                {
                    TempEntry& childEntry = newBasicEntries[childIdx];
                    basicEntry->childEntries[idx] = basicEntries[childEntry.entryIdx];

                    finalBasicEntries.push_back(&(basicEntry->childEntries[idx]));
                    finalBasicHierarchyTodo.push(childIdx);
                    idx++;
                }
            }
        }
        // Prune empty precise entries
        ProfilerStack<UINT32> finalPreciseHierarchyTodo;
        ProfilerStack<UINT32> parentPreciseEntryIndexes;
        ProfilerVector<TempEntry> newPreciseEntries;

        finalPreciseHierarchyTodo.push(0);

        entryIdx = 0;
        parentPreciseEntryIndexes.push(entryIdx);
        newPreciseEntries.push_back(TempEntry(nullptr, entryIdx));
        entryIdx++;

        while(!finalPreciseHierarchyTodo.empty())
        {
            UINT32 parentEntryIdx = parentPreciseEntryIndexes.top();
            parentPreciseEntryIndexes.pop();

            UINT32 curEntryIdx = finalPreciseHierarchyTodo.top();
            TempEntry& curEntry = flatHierarchy[curEntryIdx];
            finalPreciseHierarchyTodo.pop();

            for(auto& childIdx : curEntry.childIndexes)
            {
                finalPreciseHierarchyTodo.push(childIdx);

                CPUProfilerPreciseSamplingEntry& preciseEntry = preciseEntries[childIdx];

                if(preciseEntry.data.numCalls > 0)
                {
                    newPreciseEntries.push_back(TempEntry(nullptr, childIdx));
                    newPreciseEntries[parentEntryIdx].childIndexes.push_back(entryIdx);
                    parentPreciseEntryIndexes.push(entryIdx);

                    entryIdx++;
                }
                else
                    parentPreciseEntryIndexes.push(parentEntryIdx);
            }
        }

        if(newPreciseEntries.size() > 0)
        {
            ProfilerVector<CPUProfilerPreciseSamplingEntry*> finalPreciseEntries;

            report.mPreciseSamplingRootEntry = preciseEntries[newPreciseEntries[0].entryIdx];
            finalPreciseEntries.push_back(&report.mPreciseSamplingRootEntry);

            finalPreciseHierarchyTodo.push(0);

            while(!finalPreciseHierarchyTodo.empty())
            {
                UINT32 curEntryIdx = finalPreciseHierarchyTodo.top();
                finalPreciseHierarchyTodo.pop();

                TempEntry& curEntry = newPreciseEntries[curEntryIdx];
                CPUProfilerPreciseSamplingEntry* preciseEntry = finalPreciseEntries[curEntryIdx];

                preciseEntry->childEntries.resize(curEntry.childIndexes.size());

                UINT32 idx = 0;
                for(auto& childIdx : curEntry.childIndexes)
                {
                    TempEntry& childEntry = newPreciseEntries[childIdx];
                    preciseEntry->childEntries[idx] = preciseEntries[childEntry.entryIdx];

                    // Index with idx to mirror the basic pass; childEntries was resized
                    // up-front, so .back() would always alias the last element
                    finalPreciseEntries.push_back(&preciseEntry->childEntries[idx]);
                    finalPreciseHierarchyTodo.push(childIdx);
                    idx++;
                }
            }
        }

        return report;
    }
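    // Estimates the cost of the timers and of the sampling machinery itself by
    // running many empty begin/end pairs and keeping the smallest average over
    // several tries (the minimum is less sensitive to scheduler noise than the mean).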
    void ProfilerCPU::estimateTimerOverhead()
    {
        // Get an idea of how long timer calls and RDTSC take
        const UINT32 reps = 1000, sampleReps = 100;

        mBasicTimerOverhead = 1000000.0;
        mPreciseTimerOverhead = 1000000;
        for (UINT32 tries = 0; tries < 20; tries++)
        {
            Timer timer;
            for (UINT32 i = 0; i < reps; i++)
            {
                timer.start();
                timer.stop();
            }

            double avgTime = double(timer.time)/double(reps);
            if (avgTime < mBasicTimerOverhead)
                mBasicTimerOverhead = avgTime;

            TimerPrecise timerPrecise;
            for (UINT32 i = 0; i < reps; i++)
            {
                timerPrecise.start();
                timerPrecise.stop();
            }

            UINT64 avgCycles = timerPrecise.cycles/reps;
            if (avgCycles < mPreciseTimerOverhead)
                mPreciseTimerOverhead = avgCycles;
        }

        mBasicSamplingOverheadMs = 1000000.0;
        mPreciseSamplingOverheadMs = 1000000.0;
        mBasicSamplingOverheadCycles = 1000000;
        mPreciseSamplingOverheadCycles = 1000000;
        for (UINT32 tries = 0; tries < 20; tries++)
        {
            /************************************************************************/
            /*                AVERAGE TIME IN MS FOR BASIC SAMPLING                 */
            /************************************************************************/

            Timer timerA;
            timerA.start();
            beginThread("Main");

            // Two different cases that can affect performance: one where the sample
            // already exists and one where a new one needs to be created
            for (UINT32 i = 0; i < sampleReps; i++)
            {
                beginSample("TestAvg1");
                endSample("TestAvg1");
                beginSample("TestAvg2");
                endSample("TestAvg2");
                beginSample("TestAvg3");
                endSample("TestAvg3");
                beginSample("TestAvg4");
                endSample("TestAvg4");
                beginSample("TestAvg5");
                endSample("TestAvg5");
                beginSample("TestAvg6");
                endSample("TestAvg6");
                beginSample("TestAvg7");
                endSample("TestAvg7");
                beginSample("TestAvg8");
                endSample("TestAvg8");
                beginSample("TestAvg9");
                endSample("TestAvg9");
                beginSample("TestAvg10");
                endSample("TestAvg10");
            }

            for (UINT32 i = 0; i < sampleReps * 5; i++)
            {
                beginSample("TestAvg#" + ProfilerString(toString(i).c_str()));
                endSample("TestAvg#" + ProfilerString(toString(i).c_str()));
            }

            endThread();
            timerA.stop();

            reset();

            double avgTimeBasic = double(timerA.time)/double(sampleReps * 10 + sampleReps * 5) - mBasicTimerOverhead;
            if (avgTimeBasic < mBasicSamplingOverheadMs)
                mBasicSamplingOverheadMs = avgTimeBasic;
            /************************************************************************/
            /*                 AVERAGE CYCLES FOR BASIC SAMPLING                    */
            /************************************************************************/

            TimerPrecise timerPreciseA;
            timerPreciseA.start();
            beginThread("Main");

            // Two different cases that can affect performance: one where the sample
            // already exists and one where a new one needs to be created
            for (UINT32 i = 0; i < sampleReps; i++)
            {
                beginSample("TestAvg1");
                endSample("TestAvg1");
                beginSample("TestAvg2");
                endSample("TestAvg2");
                beginSample("TestAvg3");
                endSample("TestAvg3");
                beginSample("TestAvg4");
                endSample("TestAvg4");
                beginSample("TestAvg5");
                endSample("TestAvg5");
                beginSample("TestAvg6");
                endSample("TestAvg6");
                beginSample("TestAvg7");
                endSample("TestAvg7");
                beginSample("TestAvg8");
                endSample("TestAvg8");
                beginSample("TestAvg9");
                endSample("TestAvg9");
                beginSample("TestAvg10");
                endSample("TestAvg10");
            }

            for (UINT32 i = 0; i < sampleReps * 5; i++)
            {
                beginSample("TestAvg#" + ProfilerString(toString(i).c_str()));
                endSample("TestAvg#" + ProfilerString(toString(i).c_str()));
            }

            endThread();
            timerPreciseA.stop();

            reset();

            UINT64 avgCyclesBasic = timerPreciseA.cycles/(sampleReps * 10 + sampleReps * 5) - mPreciseTimerOverhead;
            if (avgCyclesBasic < mBasicSamplingOverheadCycles)
                mBasicSamplingOverheadCycles = avgCyclesBasic;
            /************************************************************************/
            /*               AVERAGE TIME IN MS FOR PRECISE SAMPLING                */
            /************************************************************************/

            Timer timerB;
            timerB.start();
            beginThread("Main");

            // Two different cases that can affect performance: one where the sample
            // already exists and one where a new one needs to be created
            for (UINT32 i = 0; i < sampleReps; i++)
            {
                beginSamplePrecise("TestAvg1");
                endSamplePrecise("TestAvg1");
                beginSamplePrecise("TestAvg2");
                endSamplePrecise("TestAvg2");
                beginSamplePrecise("TestAvg3");
                endSamplePrecise("TestAvg3");
                beginSamplePrecise("TestAvg4");
                endSamplePrecise("TestAvg4");
                beginSamplePrecise("TestAvg5");
                endSamplePrecise("TestAvg5");
                beginSamplePrecise("TestAvg6");
                endSamplePrecise("TestAvg6");
                beginSamplePrecise("TestAvg7");
                endSamplePrecise("TestAvg7");
                beginSamplePrecise("TestAvg8");
                endSamplePrecise("TestAvg8");
                beginSamplePrecise("TestAvg9");
                endSamplePrecise("TestAvg9");
                beginSamplePrecise("TestAvg10");
                endSamplePrecise("TestAvg10");
            }

            for (UINT32 i = 0; i < sampleReps * 5; i++)
            {
                beginSamplePrecise("TestAvg#" + ProfilerString(toString(i).c_str()));
                endSamplePrecise("TestAvg#" + ProfilerString(toString(i).c_str()));
            }

            endThread();
            timerB.stop();

            reset();

            double avgTimesPrecise = timerB.time/(sampleReps * 10 + sampleReps * 5);
            if (avgTimesPrecise < mPreciseSamplingOverheadMs)
                mPreciseSamplingOverheadMs = avgTimesPrecise;
            /************************************************************************/
            /*                AVERAGE CYCLES FOR PRECISE SAMPLING                   */
            /************************************************************************/

            TimerPrecise timerPreciseB;
            timerPreciseB.start();
            beginThread("Main");

            // Two different cases that can affect performance: one where the sample
            // already exists and one where a new one needs to be created
            for (UINT32 i = 0; i < sampleReps; i++)
            {
                beginSamplePrecise("TestAvg1");
                endSamplePrecise("TestAvg1");
                beginSamplePrecise("TestAvg2");
                endSamplePrecise("TestAvg2");
                beginSamplePrecise("TestAvg3");
                endSamplePrecise("TestAvg3");
                beginSamplePrecise("TestAvg4");
                endSamplePrecise("TestAvg4");
                beginSamplePrecise("TestAvg5");
                endSamplePrecise("TestAvg5");
                beginSamplePrecise("TestAvg6");
                endSamplePrecise("TestAvg6");
                beginSamplePrecise("TestAvg7");
                endSamplePrecise("TestAvg7");
                beginSamplePrecise("TestAvg8");
                endSamplePrecise("TestAvg8");
                beginSamplePrecise("TestAvg9");
                endSamplePrecise("TestAvg9");
                beginSamplePrecise("TestAvg10");
                endSamplePrecise("TestAvg10");
            }

            for (UINT32 i = 0; i < sampleReps * 5; i++)
            {
                beginSamplePrecise("TestAvg#" + ProfilerString(toString(i).c_str()));
                endSamplePrecise("TestAvg#" + ProfilerString(toString(i).c_str()));
            }

            endThread();
            timerPreciseB.stop();

            reset();

            UINT64 avgCyclesPrecise = timerPreciseB.cycles/(sampleReps * 10 + sampleReps * 5);
            if (avgCyclesPrecise < mPreciseSamplingOverheadCycles)
                mPreciseSamplingOverheadCycles = avgCyclesPrecise;
        }
    }
    CPUProfilerBasicSamplingEntry::Data::Data()
        :numCalls(0), avgTimeMs(0.0), maxTimeMs(0.0), totalTimeMs(0.0),
        avgSelfTimeMs(0.0), totalSelfTimeMs(0.0), estimatedSelfOverheadMs(0.0),
        estimatedOverheadMs(0.0), pctOfParent(1.0f)
    { }

    CPUProfilerPreciseSamplingEntry::Data::Data()
        :numCalls(0), avgCycles(0), maxCycles(0), totalCycles(0),
        avgSelfCycles(0), totalSelfCycles(0), estimatedSelfOverhead(0),
        estimatedOverhead(0), pctOfParent(1.0f)
    { }

    CPUProfilerReport::CPUProfilerReport()
    {
    }
    ProfilerCPU& gProfilerCPU()
    {
        return ProfilerCPU::instance();
    }
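    // Example usage (a minimal sketch of the API defined above):
    //
    //   gProfilerCPU().beginThread("Sim");
    //
    //   gProfilerCPU().beginSample("Physics");
    //   // ... code being measured ...
    //   gProfilerCPU().endSample("Physics");
    //
    //   gProfilerCPU().endThread();
    //   CPUProfilerReport report = gProfilerCPU().generateReport();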
}