// BsProfilerCPU.cpp — CPU profiler implementation (BansheeEngine).
  1. #include "BsProfilerCPU.h"
  2. #include "BsDebug.h"
  3. #include "BsPlatform.h"
  4. namespace BansheeEngine
  5. {
  6. ProfilerCPU::Timer::Timer()
  7. {
  8. time = 0.0f;
  9. }
/** Records the current platform time as the start of a measured interval. */
void ProfilerCPU::Timer::start()
{
	startTime = getCurrentTime();
}
/** Ends the current interval and adds its duration to the accumulated time. */
void ProfilerCPU::Timer::stop()
{
	time += getCurrentTime() - startTime;
}
/** Clears the accumulated time. Does not affect an in-progress start(). */
void ProfilerCPU::Timer::reset()
{
	time = 0.0f;
}
/** Returns the current time in milliseconds from the platform's high-resolution timer. */
inline double ProfilerCPU::Timer::getCurrentTime()
{
	return Platform::queryPerformanceTimerMs();
}
  26. ProfilerCPU::TimerPrecise::TimerPrecise()
  27. {
  28. cycles = 0;
  29. }
/** Records the current CPU cycle count as the start of a measured interval. */
void ProfilerCPU::TimerPrecise::start()
{
	startCycles = getNumCycles();
}
/** Ends the current interval and adds the elapsed cycles to the accumulated count. */
void ProfilerCPU::TimerPrecise::stop()
{
	cycles += getNumCycles() - startCycles;
}
/** Clears the accumulated cycle count. Does not affect an in-progress start(). */
void ProfilerCPU::TimerPrecise::reset()
{
	cycles = 0;
}
/**
 * Reads the CPU time-stamp counter (RDTSC), returning the raw 64-bit cycle count.
 * CPUID is executed first as a serializing instruction so that earlier
 * instructions cannot be reordered past the counter read.
 */
inline UINT64 ProfilerCPU::TimerPrecise::getNumCycles()
{
#if BS_COMPILER == BS_COMPILER_GNUC
	// Serialize, then read TSC; EDX:EAX holds the 64-bit counter.
	asm volatile("cpuid" : : : "%eax", "%ebx", "%ecx", "%edx" );
	UINT32 __a,__d;
	asm volatile("rdtsc" : "=a" (__a), "=d" (__d));
	return ( UINT64(__a) | UINT64(__d) << 32 );
#else
	// MSVC path: same serialize-then-read sequence using compiler intrinsics.
	int a[4];
	int b = 0;
	__cpuid(a, b);
	return __rdtsc();
#endif
}
/** Constructs basic profiling data whose sample storage uses the provided frame allocator. */
ProfilerCPU::ProfileData::ProfileData(FrameAlloc* alloc)
	:samples(alloc)
{ }
/**
 * Starts a new basic sample: snapshots the global allocation/free counters
 * and restarts the millisecond timer from zero.
 */
void ProfilerCPU::ProfileData::beginSample()
{
	memAllocs = MemoryCounter::getNumAllocs();
	memFrees = MemoryCounter::getNumFrees();

	timer.reset();
	timer.start();
}
/**
 * Ends the current basic sample and records elapsed time plus the number of
 * allocations/frees that happened since the matching beginSample().
 */
void ProfilerCPU::ProfileData::endSample()
{
	timer.stop();

	UINT64 numAllocs = MemoryCounter::getNumAllocs() - memAllocs;
	UINT64 numFrees = MemoryCounter::getNumFrees() - memFrees;

	samples.push_back(ProfileSample(timer.time, numAllocs, numFrees));
}
  73. void ProfilerCPU::ProfileData::resumeLastSample()
  74. {
  75. timer.start();
  76. samples.erase(samples.end() - 1);
  77. }
/** Constructs precise profiling data whose sample storage uses the provided frame allocator. */
ProfilerCPU::PreciseProfileData::PreciseProfileData(FrameAlloc* alloc)
	:samples(alloc)
{ }
/**
 * Starts a new precise sample: snapshots the global allocation/free counters
 * and restarts the cycle counter from zero.
 */
void ProfilerCPU::PreciseProfileData::beginSample()
{
	memAllocs = MemoryCounter::getNumAllocs();
	memFrees = MemoryCounter::getNumFrees();

	timer.reset();
	timer.start();
}
/**
 * Ends the current precise sample and records elapsed cycles plus the number
 * of allocations/frees that happened since the matching beginSample().
 */
void ProfilerCPU::PreciseProfileData::endSample()
{
	timer.stop();

	UINT64 numAllocs = MemoryCounter::getNumAllocs() - memAllocs;
	UINT64 numFrees = MemoryCounter::getNumFrees() - memFrees;

	samples.push_back(PreciseProfileSample(timer.cycles, numAllocs, numFrees));
}
  95. void ProfilerCPU::PreciseProfileData::resumeLastSample()
  96. {
  97. timer.start();
  98. samples.erase(samples.end() - 1);
  99. }
// Per-thread singleton; each profiled thread lazily creates its own ThreadInfo (see beginThread()).
BS_THREADLOCAL ProfilerCPU::ThreadInfo* ProfilerCPU::ThreadInfo::activeThread = nullptr;
/** Constructs inactive per-thread profiling state backed by a 512 KB frame allocator. */
ProfilerCPU::ThreadInfo::ThreadInfo()
	:isActive(false), rootBlock(nullptr), frameAlloc(1024 * 512), activeBlocks(nullptr)
{
}
/**
 * Starts sampling on this thread. Creates the root block on first use and
 * opens its basic sample. No-op (with a warning) if already sampling.
 */
void ProfilerCPU::ThreadInfo::begin(const char* _name)
{
	if(isActive)
	{
		LOGWRN("Profiler::beginThread called on a thread that was already being sampled");
		return;
	}

	// NOTE(review): if rootBlock survived a previous begin/end cycle, _name is
	// ignored and the old name is kept — presumably intended; confirm.
	if(rootBlock == nullptr)
		rootBlock = getBlock(_name);

	activeBlock = ActiveBlock(ActiveSamplingType::Basic, rootBlock);

	// The active-block stack itself lives in the frame allocator and is created lazily.
	if (activeBlocks == nullptr)
		activeBlocks = frameAlloc.alloc<Stack<ActiveBlock, StdFrameAlloc<ActiveBlock>>>(&frameAlloc);

	activeBlocks->push(activeBlock);

	rootBlock->basic.beginSample();
	isActive = true;
}
  121. void ProfilerCPU::ThreadInfo::end()
  122. {
  123. if(activeBlock.type == ActiveSamplingType::Basic)
  124. activeBlock.block->basic.endSample();
  125. else
  126. activeBlock.block->precise.endSample();
  127. activeBlocks->pop();
  128. if(!isActive)
  129. LOGWRN("Profiler::endThread called on a thread that isn't being sampled.");
  130. if (activeBlocks->size() > 0)
  131. {
  132. LOGWRN("Profiler::endThread called but not all sample pairs were closed. Sampling data will not be valid.");
  133. while (activeBlocks->size() > 0)
  134. {
  135. ActiveBlock& curBlock = activeBlocks->top();
  136. if(curBlock.type == ActiveSamplingType::Basic)
  137. curBlock.block->basic.endSample();
  138. else
  139. curBlock.block->precise.endSample();
  140. activeBlocks->pop();
  141. }
  142. }
  143. isActive = false;
  144. activeBlock = ActiveBlock();
  145. frameAlloc.dealloc(activeBlocks);
  146. activeBlocks = nullptr;
  147. }
/**
 * Ends any in-progress sampling, releases the whole block hierarchy and
 * recycles this thread's frame-allocator memory.
 */
void ProfilerCPU::ThreadInfo::reset()
{
	if(isActive)
		end();

	if(rootBlock != nullptr)
		releaseBlock(rootBlock);

	rootBlock = nullptr;
	frameAlloc.clear(); // Note: This never actually frees memory
}
/**
 * Allocates a new profiling block, plus a private copy of its name, from this
 * thread's frame allocator. Caller owns linking it into the hierarchy.
 */
ProfilerCPU::ProfiledBlock* ProfilerCPU::ThreadInfo::getBlock(const char* name)
{
	ProfiledBlock* block = frameAlloc.alloc<ProfiledBlock>(&frameAlloc);
	block->name = (char*)frameAlloc.alloc(((UINT32)strlen(name) + 1) * sizeof(char));
	strcpy(block->name, name);

	return block;
}
/**
 * Returns a block's name buffer and the block itself to the frame allocator.
 * Child blocks are released via ~ProfiledBlock when the block is destroyed.
 */
void ProfilerCPU::ThreadInfo::releaseBlock(ProfiledBlock* block)
{
	frameAlloc.dealloc((UINT8*)block->name);
	frameAlloc.dealloc(block);
}
/** Constructs a block whose children and sample storage all use the provided frame allocator. */
ProfilerCPU::ProfiledBlock::ProfiledBlock(FrameAlloc* alloc)
	:children(alloc), basic(alloc), precise(alloc)
{ }
/** Recursively releases all child blocks back to the owning thread's allocator. */
ProfilerCPU::ProfiledBlock::~ProfiledBlock()
{
	// NOTE(review): children are released through the *current* thread's
	// ThreadInfo — assumes blocks are always destroyed on the thread that
	// created them; confirm for the shutdown path.
	ThreadInfo* thread = ThreadInfo::activeThread;

	for(auto& child : children)
		thread->releaseBlock(child);

	children.clear();
}
  179. ProfilerCPU::ProfiledBlock* ProfilerCPU::ProfiledBlock::findChild(const char* name) const
  180. {
  181. for(auto& child : children)
  182. {
  183. if(strcmp(child->name, name) == 0)
  184. return child;
  185. }
  186. return nullptr;
  187. }
/** Constructs the profiler and calibrates timer/sampling overhead estimates once at startup. */
ProfilerCPU::ProfilerCPU()
	:mBasicTimerOverhead(0.0), mPreciseTimerOverhead(0), mBasicSamplingOverheadMs(0.0), mPreciseSamplingOverheadCycles(0),
	mBasicSamplingOverheadCycles(0), mPreciseSamplingOverheadMs(0.0)
{
	// TODO - We only estimate overhead on program start. It might be better to estimate it each time beginThread is called,
	// and keep separate values per thread.
	estimateTimerOverhead();
}
/** Resets the calling thread's sampling data, then frees every registered thread's state. */
ProfilerCPU::~ProfilerCPU()
{
	reset();

	BS_LOCK_MUTEX(mThreadSync);

	// NOTE(review): frees ThreadInfo objects belonging to other threads too —
	// assumes no thread is still sampling at shutdown; confirm.
	for(auto& threadInfo : mActiveThreads)
		bs_delete<ThreadInfo, ProfilerAlloc>(threadInfo);
}
  203. void ProfilerCPU::beginThread(const char* name)
  204. {
  205. ThreadInfo* thread = ThreadInfo::activeThread;
  206. if(thread == nullptr)
  207. {
  208. ThreadInfo::activeThread = bs_new<ThreadInfo, ProfilerAlloc>();
  209. thread = ThreadInfo::activeThread;
  210. {
  211. BS_LOCK_MUTEX(mThreadSync);
  212. mActiveThreads.push_back(thread);
  213. }
  214. }
  215. thread->begin(name);
  216. }
/** Ends sampling on the calling thread. Must follow a matching beginThread(). */
void ProfilerCPU::endThread()
{
	// I don't do a nullcheck here on purpose, so endSample can be called ASAP
	ThreadInfo::activeThread->end();
}
/**
 * Opens a basic (millisecond-resolution) sample named @p name under the
 * currently active block, implicitly starting thread sampling if needed.
 */
void ProfilerCPU::beginSample(const char* name)
{
	ThreadInfo* thread = ThreadInfo::activeThread;
	if(thread == nullptr || !thread->isActive)
	{
		beginThread("Unknown");
		thread = ThreadInfo::activeThread;
	}

	ProfiledBlock* parent = thread->activeBlock.block;
	ProfiledBlock* block = nullptr;

	// Reuse an existing child with the same name so repeated calls aggregate samples.
	if(parent != nullptr)
		block = parent->findChild(name);

	if(block == nullptr)
	{
		block = thread->getBlock(name);

		if(parent != nullptr)
			parent->children.push_back(block);
		else
			thread->rootBlock->children.push_back(block);
	}

	thread->activeBlock = ActiveBlock(ActiveSamplingType::Basic, block);
	thread->activeBlocks->push(thread->activeBlock);

	block->basic.beginSample();
}
/**
 * Closes the basic sample opened by the matching beginSample() call and
 * restores the parent block as active. @p name is validated in debug builds only.
 * Assumes beginSample was called on this thread (activeThread non-null).
 */
void ProfilerCPU::endSample(const char* name)
{
	ThreadInfo* thread = ThreadInfo::activeThread;
	ProfiledBlock* block = thread->activeBlock.block;

#if BS_DEBUG_MODE
	// Debug-only validation of begin/end pairing; mismatches are logged and the call ignored.
	if(block == nullptr)
	{
		LOGWRN("Mismatched CPUProfiler::endSample. No beginSample was called.");
		return;
	}

	if(thread->activeBlock.type == ActiveSamplingType::Precise)
	{
		LOGWRN("Mismatched CPUProfiler::endSample. Was expecting Profiler::endSamplePrecise.");
		return;
	}

	if(strcmp(block->name, name) != 0)
	{
		LOGWRN("Mismatched CPUProfiler::endSample. Was expecting \"" + String(block->name) +
			"\" but got \"" + String(name) + "\". Sampling data will not be valid.");
		return;
	}
#endif

	block->basic.endSample();
	thread->activeBlocks->pop();

	// Restore the parent block as the active one (or none if the stack emptied).
	if (!thread->activeBlocks->empty())
		thread->activeBlock = thread->activeBlocks->top();
	else
		thread->activeBlock = ActiveBlock();
}
  275. void ProfilerCPU::beginSamplePrecise(const char* name)
  276. {
  277. // Note: There is a (small) possibility a context switch will happen during this measurement in which case result will be skewed.
  278. // Increasing thread priority might help. This is generally only a problem with code that executes a long time (10-15+ ms - depending on OS quant length)
  279. ThreadInfo* thread = ThreadInfo::activeThread;
  280. if(thread == nullptr || !thread->isActive)
  281. beginThread("Unknown");
  282. ProfiledBlock* parent = thread->activeBlock.block;
  283. ProfiledBlock* block = nullptr;
  284. if(parent != nullptr)
  285. block = parent->findChild(name);
  286. if(block == nullptr)
  287. {
  288. block = thread->getBlock(name);
  289. if(parent != nullptr)
  290. parent->children.push_back(block);
  291. else
  292. thread->rootBlock->children.push_back(block);
  293. }
  294. thread->activeBlock = ActiveBlock(ActiveSamplingType::Precise, block);
  295. thread->activeBlocks->push(thread->activeBlock);
  296. block->precise.beginSample();
  297. }
/**
 * Closes the precise sample opened by the matching beginSamplePrecise() call
 * and restores the parent block as active. @p name is validated in debug builds only.
 * Assumes beginSamplePrecise was called on this thread (activeThread non-null).
 */
void ProfilerCPU::endSamplePrecise(const char* name)
{
	ThreadInfo* thread = ThreadInfo::activeThread;
	ProfiledBlock* block = thread->activeBlock.block;

#if BS_DEBUG_MODE
	// Debug-only validation of begin/end pairing; mismatches are logged and the call ignored.
	if(block == nullptr)
	{
		LOGWRN("Mismatched Profiler::endSamplePrecise. No beginSamplePrecise was called.");
		return;
	}

	if(thread->activeBlock.type == ActiveSamplingType::Basic)
	{
		LOGWRN("Mismatched CPUProfiler::endSamplePrecise. Was expecting Profiler::endSample.");
		return;
	}

	if (strcmp(block->name, name) != 0)
	{
		LOGWRN("Mismatched Profiler::endSamplePrecise. Was expecting \"" + String(block->name) +
			"\" but got \"" + String(name) + "\". Sampling data will not be valid.");
		return;
	}
#endif

	block->precise.endSample();
	thread->activeBlocks->pop();

	// Restore the parent block as the active one (or none if the stack emptied).
	if (!thread->activeBlocks->empty())
		thread->activeBlock = thread->activeBlocks->top();
	else
		thread->activeBlock = ActiveBlock();
}
  327. void ProfilerCPU::reset()
  328. {
  329. ThreadInfo* thread = ThreadInfo::activeThread;
  330. if(thread != nullptr)
  331. thread->reset();
  332. }
  333. CPUProfilerReport ProfilerCPU::generateReport()
  334. {
  335. CPUProfilerReport report;
  336. ThreadInfo* thread = ThreadInfo::activeThread;
  337. if(thread == nullptr)
  338. return report;
  339. if(thread->isActive)
  340. thread->end();
  341. // We need to separate out basic and precise data and form two separate hierarchies
  342. if(thread->rootBlock == nullptr)
  343. return report;
  344. struct TempEntry
  345. {
  346. TempEntry(ProfiledBlock* _parentBlock, UINT32 _entryIdx)
  347. :parentBlock(_parentBlock), entryIdx(_entryIdx)
  348. { }
  349. ProfiledBlock* parentBlock;
  350. UINT32 entryIdx;
  351. ProfilerVector<UINT32> childIndexes;
  352. };
  353. ProfilerVector<CPUProfilerBasicSamplingEntry> basicEntries;
  354. ProfilerVector<CPUProfilerPreciseSamplingEntry> preciseEntries;
  355. // Fill up flatHierarchy array in a way so we always process children before parents
  356. ProfilerStack<UINT32> todo;
  357. ProfilerVector<TempEntry> flatHierarchy;
  358. UINT32 entryIdx = 0;
  359. todo.push(entryIdx);
  360. flatHierarchy.push_back(TempEntry(thread->rootBlock, entryIdx));
  361. entryIdx++;
  362. while(!todo.empty())
  363. {
  364. UINT32 curDataIdx = todo.top();
  365. ProfiledBlock* curBlock = flatHierarchy[curDataIdx].parentBlock;
  366. todo.pop();
  367. for(auto& child : curBlock->children)
  368. {
  369. flatHierarchy[curDataIdx].childIndexes.push_back(entryIdx);
  370. todo.push(entryIdx);
  371. flatHierarchy.push_back(TempEntry(child, entryIdx));
  372. entryIdx++;
  373. }
  374. }
  375. // Calculate sampling data for all entries
  376. basicEntries.resize(flatHierarchy.size());
  377. preciseEntries.resize(flatHierarchy.size());
  378. for(auto iter = flatHierarchy.rbegin(); iter != flatHierarchy.rend(); ++iter)
  379. {
  380. TempEntry& curData = *iter;
  381. ProfiledBlock* curBlock = curData.parentBlock;
  382. CPUProfilerBasicSamplingEntry* entryBasic = &basicEntries[curData.entryIdx];
  383. CPUProfilerPreciseSamplingEntry* entryPrecise = &preciseEntries[curData.entryIdx];
  384. // Calculate basic data
  385. entryBasic->data.name = String(curBlock->name);
  386. entryBasic->data.memAllocs = 0;
  387. entryBasic->data.memFrees = 0;
  388. entryBasic->data.totalTimeMs = 0.0;
  389. entryBasic->data.maxTimeMs = 0.0;
  390. for(auto& sample : curBlock->basic.samples)
  391. {
  392. entryBasic->data.totalTimeMs += sample.time;
  393. entryBasic->data.maxTimeMs = std::max(entryBasic->data.maxTimeMs, sample.time);
  394. entryBasic->data.memAllocs += sample.numAllocs;
  395. entryBasic->data.memFrees += sample.numFrees;
  396. }
  397. entryBasic->data.numCalls = (UINT32)curBlock->basic.samples.size();
  398. if(entryBasic->data.numCalls > 0)
  399. entryBasic->data.avgTimeMs = entryBasic->data.totalTimeMs / entryBasic->data.numCalls;
  400. double totalChildTime = 0.0;
  401. for(auto& childIdx : curData.childIndexes)
  402. {
  403. CPUProfilerBasicSamplingEntry* childEntry = &basicEntries[childIdx];
  404. totalChildTime += childEntry->data.totalTimeMs;
  405. childEntry->data.pctOfParent = (float)(childEntry->data.totalTimeMs / entryBasic->data.totalTimeMs);
  406. entryBasic->data.estimatedOverheadMs += childEntry->data.estimatedOverheadMs;
  407. }
  408. entryBasic->data.estimatedOverheadMs += curBlock->basic.samples.size() * mBasicSamplingOverheadMs;
  409. entryBasic->data.estimatedOverheadMs += curBlock->precise.samples.size() * mPreciseSamplingOverheadMs;
  410. entryBasic->data.totalSelfTimeMs = entryBasic->data.totalTimeMs - totalChildTime;
  411. if(entryBasic->data.numCalls > 0)
  412. entryBasic->data.avgSelfTimeMs = entryBasic->data.totalSelfTimeMs / entryBasic->data.numCalls;
  413. entryBasic->data.estimatedSelfOverheadMs = mBasicTimerOverhead;
  414. // Calculate precise data
  415. entryPrecise->data.name = String(curBlock->name);
  416. entryPrecise->data.memAllocs = 0;
  417. entryPrecise->data.memFrees = 0;
  418. entryPrecise->data.totalCycles = 0;
  419. entryPrecise->data.maxCycles = 0;
  420. for(auto& sample : curBlock->precise.samples)
  421. {
  422. entryPrecise->data.totalCycles += sample.cycles;
  423. entryPrecise->data.maxCycles = std::max(entryPrecise->data.maxCycles, sample.cycles);
  424. entryPrecise->data.memAllocs += sample.numAllocs;
  425. entryPrecise->data.memFrees += sample.numFrees;
  426. }
  427. entryPrecise->data.numCalls = (UINT32)curBlock->precise.samples.size();
  428. if(entryPrecise->data.numCalls > 0)
  429. entryPrecise->data.avgCycles = entryPrecise->data.totalCycles / entryPrecise->data.numCalls;
  430. UINT64 totalChildCycles = 0;
  431. for(auto& childIdx : curData.childIndexes)
  432. {
  433. CPUProfilerPreciseSamplingEntry* childEntry = &preciseEntries[childIdx];
  434. totalChildCycles += childEntry->data.totalCycles;
  435. childEntry->data.pctOfParent = childEntry->data.totalCycles / (float)entryPrecise->data.totalCycles;
  436. entryPrecise->data.estimatedOverhead += childEntry->data.estimatedOverhead;
  437. }
  438. entryPrecise->data.estimatedOverhead += curBlock->precise.samples.size() * mPreciseSamplingOverheadCycles;
  439. entryPrecise->data.estimatedOverhead += curBlock->basic.samples.size() * mBasicSamplingOverheadCycles;
  440. entryPrecise->data.totalSelfCycles = entryPrecise->data.totalCycles - totalChildCycles;
  441. if(entryPrecise->data.numCalls > 0)
  442. entryPrecise->data.avgSelfCycles = entryPrecise->data.totalSelfCycles / entryPrecise->data.numCalls;
  443. entryPrecise->data.estimatedSelfOverhead = mPreciseTimerOverhead;
  444. }
  445. // Prune empty basic entries
  446. ProfilerStack<UINT32> finalBasicHierarchyTodo;
  447. ProfilerStack<UINT32> parentBasicEntryIndexes;
  448. ProfilerVector<TempEntry> newBasicEntries;
  449. finalBasicHierarchyTodo.push(0);
  450. entryIdx = 0;
  451. parentBasicEntryIndexes.push(entryIdx);
  452. newBasicEntries.push_back(TempEntry(nullptr, entryIdx));
  453. entryIdx++;
  454. while(!finalBasicHierarchyTodo.empty())
  455. {
  456. UINT32 parentEntryIdx = parentBasicEntryIndexes.top();
  457. parentBasicEntryIndexes.pop();
  458. UINT32 curEntryIdx = finalBasicHierarchyTodo.top();
  459. TempEntry& curEntry = flatHierarchy[curEntryIdx];
  460. finalBasicHierarchyTodo.pop();
  461. for(auto& childIdx : curEntry.childIndexes)
  462. {
  463. finalBasicHierarchyTodo.push(childIdx);
  464. CPUProfilerBasicSamplingEntry& basicEntry = basicEntries[childIdx];
  465. if(basicEntry.data.numCalls > 0)
  466. {
  467. newBasicEntries.push_back(TempEntry(nullptr, childIdx));
  468. newBasicEntries[parentEntryIdx].childIndexes.push_back(entryIdx);
  469. parentBasicEntryIndexes.push(entryIdx);
  470. entryIdx++;
  471. }
  472. else
  473. parentBasicEntryIndexes.push(parentEntryIdx);
  474. }
  475. }
  476. if(newBasicEntries.size() > 0)
  477. {
  478. ProfilerVector<CPUProfilerBasicSamplingEntry*> finalBasicEntries;
  479. report.mBasicSamplingRootEntry = basicEntries[newBasicEntries[0].entryIdx];
  480. finalBasicEntries.push_back(&report.mBasicSamplingRootEntry);
  481. finalBasicHierarchyTodo.push(0);
  482. while(!finalBasicHierarchyTodo.empty())
  483. {
  484. UINT32 curEntryIdx = finalBasicHierarchyTodo.top();
  485. finalBasicHierarchyTodo.pop();
  486. TempEntry& curEntry = newBasicEntries[curEntryIdx];
  487. CPUProfilerBasicSamplingEntry* basicEntry = finalBasicEntries[curEntryIdx];
  488. basicEntry->childEntries.resize(curEntry.childIndexes.size());
  489. UINT32 idx = 0;
  490. for(auto& childIdx : curEntry.childIndexes)
  491. {
  492. TempEntry& childEntry = newBasicEntries[childIdx];
  493. basicEntry->childEntries[idx] = basicEntries[childEntry.entryIdx];
  494. finalBasicEntries.push_back(&(basicEntry->childEntries[idx]));
  495. finalBasicHierarchyTodo.push(childIdx);
  496. idx++;
  497. }
  498. }
  499. }
  500. // Prune empty precise entries
  501. ProfilerStack<UINT32> finalPreciseHierarchyTodo;
  502. ProfilerStack<UINT32> parentPreciseEntryIndexes;
  503. ProfilerVector<TempEntry> newPreciseEntries;
  504. finalPreciseHierarchyTodo.push(0);
  505. entryIdx = 0;
  506. parentPreciseEntryIndexes.push(entryIdx);
  507. newPreciseEntries.push_back(TempEntry(nullptr, entryIdx));
  508. entryIdx++;
  509. while(!finalPreciseHierarchyTodo.empty())
  510. {
  511. UINT32 parentEntryIdx = parentPreciseEntryIndexes.top();
  512. parentPreciseEntryIndexes.pop();
  513. UINT32 curEntryIdx = finalPreciseHierarchyTodo.top();
  514. TempEntry& curEntry = flatHierarchy[curEntryIdx];
  515. finalPreciseHierarchyTodo.pop();
  516. for(auto& childIdx : curEntry.childIndexes)
  517. {
  518. finalPreciseHierarchyTodo.push(childIdx);
  519. CPUProfilerPreciseSamplingEntry& preciseEntry = preciseEntries[childIdx];
  520. if(preciseEntry.data.numCalls > 0)
  521. {
  522. newPreciseEntries.push_back(TempEntry(nullptr, childIdx));
  523. newPreciseEntries[parentEntryIdx].childIndexes.push_back(entryIdx);
  524. parentPreciseEntryIndexes.push(entryIdx);
  525. entryIdx++;
  526. }
  527. else
  528. parentPreciseEntryIndexes.push(parentEntryIdx);
  529. }
  530. }
  531. if(newPreciseEntries.size() > 0)
  532. {
  533. ProfilerVector<CPUProfilerPreciseSamplingEntry*> finalPreciseEntries;
  534. report.mPreciseSamplingRootEntry = preciseEntries[newPreciseEntries[0].entryIdx];
  535. finalPreciseEntries.push_back(&report.mPreciseSamplingRootEntry);
  536. finalPreciseHierarchyTodo.push(0);
  537. while(!finalPreciseHierarchyTodo.empty())
  538. {
  539. UINT32 curEntryIdx = finalPreciseHierarchyTodo.top();
  540. finalPreciseHierarchyTodo.pop();
  541. TempEntry& curEntry = newPreciseEntries[curEntryIdx];
  542. CPUProfilerPreciseSamplingEntry* preciseEntry = finalPreciseEntries[curEntryIdx];
  543. preciseEntry->childEntries.resize(curEntry.childIndexes.size());
  544. UINT32 idx = 0;
  545. for(auto& childIdx : curEntry.childIndexes)
  546. {
  547. TempEntry& childEntry = newPreciseEntries[childIdx];
  548. preciseEntry->childEntries[idx] = preciseEntries[childEntry.entryIdx];
  549. finalPreciseEntries.push_back(&preciseEntry->childEntries.back());
  550. finalPreciseHierarchyTodo.push(childIdx);
  551. idx++;
  552. }
  553. }
  554. }
  555. return report;
  556. }
/**
 * Calibrates the overhead estimates used by generateReport(): first the cost
 * of a bare timer start/stop pair (ms and cycles), then the cost of a full
 * begin/endSample pair (both basic and precise, measured in both ms and
 * cycles). The minimum over several tries is kept to reject outliers caused
 * by context switches.
 */
void ProfilerCPU::estimateTimerOverhead()
{
	// Get an idea of how long timer calls and RDTSC takes
	const UINT32 reps = 1000, sampleReps = 20;

	// Start from a sentinel larger than any realistic measurement; keep the minimum.
	mBasicTimerOverhead = 1000000.0;
	mPreciseTimerOverhead = 1000000;
	for (UINT32 tries = 0; tries < 20; tries++)
	{
		Timer timer;
		for (UINT32 i = 0; i < reps; i++)
		{
			timer.start();
			timer.stop();
		}

		double avgTime = double(timer.time)/double(reps);
		if (avgTime < mBasicTimerOverhead)
			mBasicTimerOverhead = avgTime;

		TimerPrecise timerPrecise;
		for (UINT32 i = 0; i < reps; i++)
		{
			timerPrecise.start();
			timerPrecise.stop();
		}

		UINT64 avgCycles = timerPrecise.cycles/reps;
		if (avgCycles < mPreciseTimerOverhead)
			mPreciseTimerOverhead = avgCycles;
	}

	mBasicSamplingOverheadMs = 1000000.0;
	mPreciseSamplingOverheadMs = 1000000.0;
	mBasicSamplingOverheadCycles = 1000000;
	mPreciseSamplingOverheadCycles = 1000000;
	for (UINT32 tries = 0; tries < 3; tries++)
	{
		/************************************************************************/
		/* 			AVERAGE TIME IN MS FOR BASIC SAMPLING                       */
		/************************************************************************/

		Timer timerA;
		timerA.start();

		beginThread("Main");

		// Two different cases that can effect performance, one where
		// sample already exists and other where new one needs to be created
		for (UINT32 i = 0; i < sampleReps; i++)
		{
			beginSample("TestAvg1");
			endSample("TestAvg1");
			beginSample("TestAvg2");
			endSample("TestAvg2");
			beginSample("TestAvg3");
			endSample("TestAvg3");
			beginSample("TestAvg4");
			endSample("TestAvg4");
			beginSample("TestAvg5");
			endSample("TestAvg5");
			beginSample("TestAvg6");
			endSample("TestAvg6");
			beginSample("TestAvg7");
			endSample("TestAvg7");
			beginSample("TestAvg8");
			endSample("TestAvg8");
			beginSample("TestAvg9");
			endSample("TestAvg9");
			beginSample("TestAvg10");
			endSample("TestAvg10");
		}

		for (UINT32 i = 0; i < sampleReps * 5; i++)
		{
			beginSample(("TestAvg#" + toString(i)).c_str());
			endSample(("TestAvg#" + toString(i)).c_str());
		}

		endThread();

		timerA.stop();

		reset();

		double avgTimeBasic = double(timerA.time)/double(sampleReps * 10 + sampleReps * 5) - mBasicTimerOverhead;
		if (avgTimeBasic < mBasicSamplingOverheadMs)
			mBasicSamplingOverheadMs = avgTimeBasic;

		/************************************************************************/
		/* 			AVERAGE CYCLES FOR BASIC SAMPLING                           */
		/************************************************************************/

		TimerPrecise timerPreciseA;
		timerPreciseA.start();

		beginThread("Main");

		// Two different cases that can effect performance, one where
		// sample already exists and other where new one needs to be created
		for (UINT32 i = 0; i < sampleReps; i++)
		{
			beginSample("TestAvg1");
			endSample("TestAvg1");
			beginSample("TestAvg2");
			endSample("TestAvg2");
			beginSample("TestAvg3");
			endSample("TestAvg3");
			beginSample("TestAvg4");
			endSample("TestAvg4");
			beginSample("TestAvg5");
			endSample("TestAvg5");
			beginSample("TestAvg6");
			endSample("TestAvg6");
			beginSample("TestAvg7");
			endSample("TestAvg7");
			beginSample("TestAvg8");
			endSample("TestAvg8");
			beginSample("TestAvg9");
			endSample("TestAvg9");
			beginSample("TestAvg10");
			endSample("TestAvg10");
		}

		for (UINT32 i = 0; i < sampleReps * 5; i++)
		{
			beginSample(("TestAvg#" + toString(i)).c_str());
			endSample(("TestAvg#" + toString(i)).c_str());
		}

		endThread();

		timerPreciseA.stop();

		reset();

		UINT64 avgCyclesBasic = timerPreciseA.cycles/(sampleReps * 10 + sampleReps * 5) - mPreciseTimerOverhead;
		if (avgCyclesBasic < mBasicSamplingOverheadCycles)
			mBasicSamplingOverheadCycles = avgCyclesBasic;

		/************************************************************************/
		/* 			AVERAGE TIME IN MS FOR PRECISE SAMPLING                     */
		/************************************************************************/

		Timer timerB;
		timerB.start();

		beginThread("Main");

		// Two different cases that can effect performance, one where
		// sample already exists and other where new one needs to be created
		for (UINT32 i = 0; i < sampleReps; i++)
		{
			beginSamplePrecise("TestAvg1");
			endSamplePrecise("TestAvg1");
			beginSamplePrecise("TestAvg2");
			endSamplePrecise("TestAvg2");
			beginSamplePrecise("TestAvg3");
			endSamplePrecise("TestAvg3");
			beginSamplePrecise("TestAvg4");
			endSamplePrecise("TestAvg4");
			beginSamplePrecise("TestAvg5");
			endSamplePrecise("TestAvg5");
			beginSamplePrecise("TestAvg6");
			endSamplePrecise("TestAvg6");
			beginSamplePrecise("TestAvg7");
			endSamplePrecise("TestAvg7");
			beginSamplePrecise("TestAvg8");
			endSamplePrecise("TestAvg8");
			beginSamplePrecise("TestAvg9");
			endSamplePrecise("TestAvg9");
			beginSamplePrecise("TestAvg10");
			endSamplePrecise("TestAvg10");
		}

		for (UINT32 i = 0; i < sampleReps * 5; i++)
		{
			beginSamplePrecise(("TestAvg#" + toString(i)).c_str());
			endSamplePrecise(("TestAvg#" + toString(i)).c_str());
		}

		endThread();

		timerB.stop();

		reset();

		// NOTE(review): unlike the basic-sampling averages above, the two
		// precise-sampling averages below do not subtract the bare timer
		// overhead — possibly an oversight; confirm before changing.
		double avgTimesPrecise = timerB.time/(sampleReps * 10 + sampleReps * 5);
		if (avgTimesPrecise < mPreciseSamplingOverheadMs)
			mPreciseSamplingOverheadMs = avgTimesPrecise;

		/************************************************************************/
		/* 			AVERAGE CYCLES FOR PRECISE SAMPLING                         */
		/************************************************************************/

		TimerPrecise timerPreciseB;
		timerPreciseB.start();

		beginThread("Main");

		// Two different cases that can effect performance, one where
		// sample already exists and other where new one needs to be created
		for (UINT32 i = 0; i < sampleReps; i++)
		{
			beginSamplePrecise("TestAvg1");
			endSamplePrecise("TestAvg1");
			beginSamplePrecise("TestAvg2");
			endSamplePrecise("TestAvg2");
			beginSamplePrecise("TestAvg3");
			endSamplePrecise("TestAvg3");
			beginSamplePrecise("TestAvg4");
			endSamplePrecise("TestAvg4");
			beginSamplePrecise("TestAvg5");
			endSamplePrecise("TestAvg5");
			beginSamplePrecise("TestAvg6");
			endSamplePrecise("TestAvg6");
			beginSamplePrecise("TestAvg7");
			endSamplePrecise("TestAvg7");
			beginSamplePrecise("TestAvg8");
			endSamplePrecise("TestAvg8");
			beginSamplePrecise("TestAvg9");
			endSamplePrecise("TestAvg9");
			beginSamplePrecise("TestAvg10");
			endSamplePrecise("TestAvg10");
		}

		for (UINT32 i = 0; i < sampleReps * 5; i++)
		{
			beginSamplePrecise(("TestAvg#" + toString(i)).c_str());
			endSamplePrecise(("TestAvg#" + toString(i)).c_str());
		}

		endThread();
		timerPreciseB.stop();

		reset();

		UINT64 avgCyclesPrecise = timerPreciseB.cycles/(sampleReps * 10 + sampleReps * 5);
		if (avgCyclesPrecise < mPreciseSamplingOverheadCycles)
			mPreciseSamplingOverheadCycles = avgCyclesPrecise;
	}
}
/** Zero-initializes all basic sampling statistics; pctOfParent defaults to 100%. */
CPUProfilerBasicSamplingEntry::Data::Data()
	:numCalls(0), avgTimeMs(0.0), maxTimeMs(0.0), totalTimeMs(0.0),
	avgSelfTimeMs(0.0), totalSelfTimeMs(0.0), estimatedSelfOverheadMs(0.0),
	estimatedOverheadMs(0.0), pctOfParent(1.0f)
{ }
/** Zero-initializes all precise sampling statistics; pctOfParent defaults to 100%. */
CPUProfilerPreciseSamplingEntry::Data::Data()
	:numCalls(0), avgCycles(0), maxCycles(0), totalCycles(0),
	avgSelfCycles(0), totalSelfCycles(0), estimatedSelfOverhead(0),
	estimatedOverhead(0), pctOfParent(1.0f)
{ }
/** Constructs an empty report; root entries are filled in by ProfilerCPU::generateReport(). */
CPUProfilerReport::CPUProfilerReport()
{
}
/** Convenience accessor for the global ProfilerCPU module instance. */
ProfilerCPU& gProfilerCPU()
{
	return ProfilerCPU::instance();
}
  777. }