/*
Copyright (c) 2003-2014 Erwin Coumans http://bullet.googlecode.com

This software is provided 'as-is', without any express or implied warranty.
In no event will the authors be held liable for any damages arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it freely,
subject to the following restrictions:

1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/

#include "btThreads.h"
#include "btQuickprof.h"
#include <algorithm>  // for min and max

#if BT_USE_OPENMP && BT_THREADSAFE

#include <omp.h>

#endif  // #if BT_USE_OPENMP && BT_THREADSAFE

#if BT_USE_PPL && BT_THREADSAFE

// use Microsoft Parallel Patterns Library (installed with Visual Studio 2010 and later)
#include <ppl.h>  // if you get a compile error here, check whether your version of Visual Studio includes PPL
// Visual Studio 2010 and later should come with it
#include <concrtrm.h>  // for GetProcessorCount()

#endif  // #if BT_USE_PPL && BT_THREADSAFE

#if BT_USE_TBB && BT_THREADSAFE

// use Intel Threading Building Blocks for thread management
#define __TBB_NO_IMPLICIT_LINKAGE 1
#include <tbb/tbb.h>
#include <tbb/task_scheduler_init.h>
#include <tbb/parallel_for.h>
#include <tbb/blocked_range.h>

#endif  // #if BT_USE_TBB && BT_THREADSAFE

#if BT_THREADSAFE

//
// Lightweight spin-mutex based on atomics.
// Using ordinary system-provided mutexes like Windows critical sections was noticeably slower,
// presumably because when a lock attempt fails at first, the thread is put to sleep,
// triggering costly context switching.
//
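// A minimal usage sketch (hypothetical caller code, not part of this file),
// assuming the btSpinMutex declaration from btThreads.h:
//
//   btSpinMutex mutex;
//   int sharedCounter = 0;
//
//   void incrementFromAnyThread()
//   {
//       mutex.lock();  // spins (does not sleep) until the lock is acquired
//       ++sharedCounter;
//       mutex.unlock();
//   }
//
// Because a blocked thread spins, this is only suitable for very short critical sections.
//
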
#if __cplusplus >= 201103L

// for anything claiming full C++11 compliance, use C++11 atomics
// on GCC or Clang you need to compile with -std=c++11
#define USE_CPP11_ATOMICS 1

#elif defined(_MSC_VER)

// on MSVC, use intrinsics instead
#define USE_MSVC_INTRINSICS 1

#elif defined(__GNUC__) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 7))

// available since GCC 4.7 and some versions of clang
// todo: check for clang
#define USE_GCC_BUILTIN_ATOMICS 1

#elif defined(__GNUC__) && (__GNUC__ == 4 && __GNUC_MINOR__ >= 1)

// available since GCC 4.1
#define USE_GCC_BUILTIN_ATOMICS_OLD 1

#endif

#if USE_CPP11_ATOMICS

#include <atomic>
#include <thread>

#define THREAD_LOCAL_STATIC thread_local static

bool btSpinMutex::tryLock()
{
    std::atomic<int>* aDest = reinterpret_cast<std::atomic<int>*>(&mLock);
    int expected = 0;
    return std::atomic_compare_exchange_weak_explicit(aDest, &expected, int(1), std::memory_order_acq_rel, std::memory_order_acquire);
}

void btSpinMutex::lock()
{
    // note: this lock does not sleep the thread.
    while (!tryLock())
    {
        // spin
    }
}

void btSpinMutex::unlock()
{
    std::atomic<int>* aDest = reinterpret_cast<std::atomic<int>*>(&mLock);
    std::atomic_store_explicit(aDest, int(0), std::memory_order_release);
}

#elif USE_MSVC_INTRINSICS

#define WIN32_LEAN_AND_MEAN
#include <windows.h>
#include <intrin.h>

#define THREAD_LOCAL_STATIC __declspec(thread) static

bool btSpinMutex::tryLock()
{
    volatile long* aDest = reinterpret_cast<long*>(&mLock);
    return (0 == _InterlockedCompareExchange(aDest, 1, 0));
}

void btSpinMutex::lock()
{
    // note: this lock does not sleep the thread
    while (!tryLock())
    {
        // spin
    }
}

void btSpinMutex::unlock()
{
    volatile long* aDest = reinterpret_cast<long*>(&mLock);
    _InterlockedExchange(aDest, 0);
}

#elif USE_GCC_BUILTIN_ATOMICS

#define THREAD_LOCAL_STATIC static __thread

bool btSpinMutex::tryLock()
{
    int expected = 0;
    bool weak = false;
    const int memOrderSuccess = __ATOMIC_ACQ_REL;
    const int memOrderFail = __ATOMIC_ACQUIRE;
    return __atomic_compare_exchange_n(&mLock, &expected, int(1), weak, memOrderSuccess, memOrderFail);
}

void btSpinMutex::lock()
{
    // note: this lock does not sleep the thread
    while (!tryLock())
    {
        // spin
    }
}

void btSpinMutex::unlock()
{
    __atomic_store_n(&mLock, int(0), __ATOMIC_RELEASE);
}

#elif USE_GCC_BUILTIN_ATOMICS_OLD

#define THREAD_LOCAL_STATIC static __thread

bool btSpinMutex::tryLock()
{
    return __sync_bool_compare_and_swap(&mLock, int(0), int(1));
}

void btSpinMutex::lock()
{
    // note: this lock does not sleep the thread
    while (!tryLock())
    {
        // spin
    }
}

void btSpinMutex::unlock()
{
    // write 0 atomically (the old __sync builtins have no plain store, so AND with 0)
    __sync_fetch_and_and(&mLock, int(0));
}

#else

#error "no threading primitives defined -- unknown platform"

#endif  // atomics implementation selection

#else  // #if BT_THREADSAFE

// These stubs should never be called
void btSpinMutex::lock()
{
    btAssert(!"unimplemented btSpinMutex::lock() called");
}

void btSpinMutex::unlock()
{
    btAssert(!"unimplemented btSpinMutex::unlock() called");
}

bool btSpinMutex::tryLock()
{
    btAssert(!"unimplemented btSpinMutex::tryLock() called");
    return true;
}

#define THREAD_LOCAL_STATIC static

#endif  // #else //#if BT_THREADSAFE

struct ThreadsafeCounter
{
    unsigned int mCounter;
    btSpinMutex mMutex;

    ThreadsafeCounter()
    {
        mCounter = 0;
        --mCounter;  // first count should come back 0
    }

    unsigned int getNext()
    {
        // no need to optimize this with atomics, it is only called ONCE per thread!
        mMutex.lock();
        mCounter++;
        if (mCounter >= BT_MAX_THREAD_COUNT)
        {
            btAssert(!"thread counter exceeded");
            // wrap back to the first worker index
            mCounter = 1;
        }
        unsigned int val = mCounter;
        mMutex.unlock();
        return val;
    }
};

static btITaskScheduler* gBtTaskScheduler = 0;
static int gThreadsRunningCounter = 0;  // useful for detecting if we are trying to do nested parallel-for calls
static btSpinMutex gThreadsRunningCounterMutex;
static ThreadsafeCounter gThreadCounter;

//
// BT_DETECT_BAD_THREAD_INDEX tries to detect when there are multiple threads assigned the same thread index.
//
// BT_DETECT_BAD_THREAD_INDEX is a developer option to test whether
// certain assumptions about how the task scheduler manages its threads
// hold true.
// The main assumption is:
//   - when the threadpool is resized, the task scheduler either
//       1. destroys all worker threads and creates all new ones in the correct number, OR
//       2. never destroys a worker thread
//
// We make that assumption because we can't easily enumerate the worker threads of a task scheduler
// to assign nice sequential thread-indexes. We also do not get notified if a worker thread is destroyed,
// so we can't tell when a thread-index is no longer being used.
// We allocate thread-indexes as needed with a sequential global thread counter.
//
// Our simple thread-counting scheme falls apart if the task scheduler destroys some threads but
// continues to re-use other threads and the application repeatedly resizes the thread pool of the
// task scheduler.
// In order to prevent the thread counter from exceeding the global max (BT_MAX_THREAD_COUNT), we
// wrap the thread counter back to 1. This should only happen if the worker threads have all been
// destroyed and re-created.
//
// BT_DETECT_BAD_THREAD_INDEX only works for Win32 right now,
// but could be adapted to work with pthreads
#define BT_DETECT_BAD_THREAD_INDEX 0

#if BT_DETECT_BAD_THREAD_INDEX

typedef DWORD ThreadId_t;
const static ThreadId_t kInvalidThreadId = 0;
ThreadId_t gDebugThreadIds[BT_MAX_THREAD_COUNT];

static ThreadId_t getDebugThreadId()
{
    return GetCurrentThreadId();
}

#endif  // #if BT_DETECT_BAD_THREAD_INDEX

// return a unique index per thread, main thread is 0, worker threads are in [1, BT_MAX_THREAD_COUNT)
unsigned int btGetCurrentThreadIndex()
{
    const unsigned int kNullIndex = ~0U;
    THREAD_LOCAL_STATIC unsigned int sThreadIndex = kNullIndex;
    if (sThreadIndex == kNullIndex)
    {
        sThreadIndex = gThreadCounter.getNext();
        btAssert(sThreadIndex < BT_MAX_THREAD_COUNT);
    }
#if BT_DETECT_BAD_THREAD_INDEX
    if (gBtTaskScheduler && sThreadIndex > 0)
    {
        ThreadId_t tid = getDebugThreadId();
        // if not set
        if (gDebugThreadIds[sThreadIndex] == kInvalidThreadId)
        {
            // set it
            gDebugThreadIds[sThreadIndex] = tid;
        }
        else
        {
            if (gDebugThreadIds[sThreadIndex] != tid)
            {
                // this could indicate the task scheduler is breaking our assumptions about
                // how threads are managed when threadpool is resized
                btAssert(!"there are 2 or more threads with the same thread-index!");
                __debugbreak();
            }
        }
    }
#endif  // #if BT_DETECT_BAD_THREAD_INDEX
    return sThreadIndex;
}

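// A common use of btGetCurrentThreadIndex() is indexing into per-thread scratch
// storage so workers never contend on shared data. A minimal sketch (hypothetical
// caller code; the array name and the parallel-for body are illustrative only):
//
//   static int sPerThreadHitCounts[BT_MAX_THREAD_COUNT];  // one slot per possible thread
//
//   // inside a parallel-for body, each worker touches only its own slot:
//   //   sPerThreadHitCounts[btGetCurrentThreadIndex()]++;  // no lock needed
//
//   // after the parallel-for returns, the main thread can safely sum all slots.
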
bool btIsMainThread()
{
    return btGetCurrentThreadIndex() == 0;
}

void btResetThreadIndexCounter()
{
    // for when all current worker threads are destroyed
    btAssert(btIsMainThread());
    gThreadCounter.mCounter = 0;
}

btITaskScheduler::btITaskScheduler(const char* name)
{
    m_name = name;
    m_savedThreadCounter = 0;
    m_isActive = false;
}

void btITaskScheduler::activate()
{
    // gThreadCounter is used to assign a thread-index to each worker thread in a task scheduler.
    // The main thread is always thread-index 0, and worker threads are numbered from 1 to 63 (BT_MAX_THREAD_COUNT-1)
    // The thread-indexes need to be unique amongst the threads that can be running simultaneously.
    // Since only one task scheduler can be used at a time, it is OK for a pair of threads that belong to different
    // task schedulers to share the same thread index because they can't be running at the same time.
    // So each task scheduler needs to keep its own thread counter value
    if (!m_isActive)
    {
        gThreadCounter.mCounter = m_savedThreadCounter;  // restore saved thread counter
        m_isActive = true;
    }
}

void btITaskScheduler::deactivate()
{
    if (m_isActive)
    {
        m_savedThreadCounter = gThreadCounter.mCounter;  // save thread counter
        m_isActive = false;
    }
}

void btPushThreadsAreRunning()
{
    gThreadsRunningCounterMutex.lock();
    gThreadsRunningCounter++;
    gThreadsRunningCounterMutex.unlock();
}

void btPopThreadsAreRunning()
{
    gThreadsRunningCounterMutex.lock();
    gThreadsRunningCounter--;
    gThreadsRunningCounterMutex.unlock();
}

bool btThreadsAreRunning()
{
    return gThreadsRunningCounter != 0;
}

void btSetTaskScheduler(btITaskScheduler* ts)
{
    int threadId = btGetCurrentThreadIndex();  // make sure we call this on main thread at least once before any workers run
    if (threadId != 0)
    {
        btAssert(!"btSetTaskScheduler must be called from the main thread!");
        return;
    }
    if (gBtTaskScheduler)
    {
        // deactivate old task scheduler
        gBtTaskScheduler->deactivate();
    }
    gBtTaskScheduler = ts;
    if (ts)
    {
        // activate new task scheduler
        ts->activate();
    }
}

btITaskScheduler* btGetTaskScheduler()
{
    return gBtTaskScheduler;
}

void btParallelFor(int iBegin, int iEnd, int grainSize, const btIParallelForBody& body)
{
#if BT_THREADSAFE

#if BT_DETECT_BAD_THREAD_INDEX
    if (!btThreadsAreRunning())
    {
        // clear out thread ids
        for (int i = 0; i < BT_MAX_THREAD_COUNT; ++i)
        {
            gDebugThreadIds[i] = kInvalidThreadId;
        }
    }
#endif  // #if BT_DETECT_BAD_THREAD_INDEX

    btAssert(gBtTaskScheduler != NULL);  // call btSetTaskScheduler() with a valid task scheduler first!
    gBtTaskScheduler->parallelFor(iBegin, iEnd, grainSize, body);

#else  // #if BT_THREADSAFE

    // non-parallel version of btParallelFor
    btAssert(!"called btParallelFor in non-threadsafe build. enable BT_THREADSAFE");
    body.forLoop(iBegin, iEnd);

#endif  // #if BT_THREADSAFE
}

btScalar btParallelSum(int iBegin, int iEnd, int grainSize, const btIParallelSumBody& body)
{
#if BT_THREADSAFE

#if BT_DETECT_BAD_THREAD_INDEX
    if (!btThreadsAreRunning())
    {
        // clear out thread ids
        for (int i = 0; i < BT_MAX_THREAD_COUNT; ++i)
        {
            gDebugThreadIds[i] = kInvalidThreadId;
        }
    }
#endif  // #if BT_DETECT_BAD_THREAD_INDEX

    btAssert(gBtTaskScheduler != NULL);  // call btSetTaskScheduler() with a valid task scheduler first!
    return gBtTaskScheduler->parallelSum(iBegin, iEnd, grainSize, body);

#else  // #if BT_THREADSAFE

    // non-parallel version of btParallelSum
    btAssert(!"called btParallelSum in non-threadsafe build. enable BT_THREADSAFE");
    return body.sumLoop(iBegin, iEnd);

#endif  // #if BT_THREADSAFE
}

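// A minimal end-to-end usage sketch (hypothetical caller code, not part of this
// file; MyForBody/MySumBody and the data array are illustrative only), assuming
// the btIParallelForBody / btIParallelSumBody interfaces from btThreads.h:
//
//   struct MyForBody : public btIParallelForBody
//   {
//       btScalar* mData;
//       void forLoop(int iBegin, int iEnd) const BT_OVERRIDE
//       {
//           for (int i = iBegin; i < iEnd; ++i)
//           {
//               mData[i] *= btScalar(2);  // each sub-range is processed by one worker
//           }
//       }
//   };
//
//   struct MySumBody : public btIParallelSumBody
//   {
//       const btScalar* mData;
//       btScalar sumLoop(int iBegin, int iEnd) const BT_OVERRIDE
//       {
//           btScalar sum = btScalar(0);
//           for (int i = iBegin; i < iEnd; ++i)
//           {
//               sum += mData[i];
//           }
//           return sum;
//       }
//   };
//
//   // on the main thread, before any parallel calls:
//   btSetTaskScheduler(btGetSequentialTaskScheduler());  // or OpenMP/TBB/PPL, if built in
//
//   MyForBody forBody;
//   forBody.mData = myArray;
//   btParallelFor(0, arraySize, 64, forBody);  // grainSize 64: smallest sub-range given to a worker
//
//   MySumBody sumBody;
//   sumBody.mData = myArray;
//   btScalar total = btParallelSum(0, arraySize, 64, sumBody);
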
///
/// btTaskSchedulerSequential -- non-threaded implementation of task scheduler
/// (really just useful for testing performance of single threaded vs multi)
///
class btTaskSchedulerSequential : public btITaskScheduler
{
public:
    btTaskSchedulerSequential() : btITaskScheduler("Sequential") {}
    virtual int getMaxNumThreads() const BT_OVERRIDE { return 1; }
    virtual int getNumThreads() const BT_OVERRIDE { return 1; }
    virtual void setNumThreads(int numThreads) BT_OVERRIDE {}
    virtual void parallelFor(int iBegin, int iEnd, int grainSize, const btIParallelForBody& body) BT_OVERRIDE
    {
        BT_PROFILE("parallelFor_sequential");
        body.forLoop(iBegin, iEnd);
    }
    virtual btScalar parallelSum(int iBegin, int iEnd, int grainSize, const btIParallelSumBody& body) BT_OVERRIDE
    {
        BT_PROFILE("parallelSum_sequential");
        return body.sumLoop(iBegin, iEnd);
    }
};

#if BT_USE_OPENMP && BT_THREADSAFE
///
/// btTaskSchedulerOpenMP -- wrapper around OpenMP task scheduler
///
class btTaskSchedulerOpenMP : public btITaskScheduler
{
    int m_numThreads;

public:
    btTaskSchedulerOpenMP() : btITaskScheduler("OpenMP")
    {
        m_numThreads = 0;
    }
    virtual int getMaxNumThreads() const BT_OVERRIDE
    {
        return omp_get_max_threads();
    }
    virtual int getNumThreads() const BT_OVERRIDE
    {
        return m_numThreads;
    }
    virtual void setNumThreads(int numThreads) BT_OVERRIDE
    {
        // With OpenMP, because it is a standard with various implementations, we can't
        // know for sure if every implementation has the same behavior of destroying all
        // previous threads when resizing the threadpool
        m_numThreads = (std::max)(1, (std::min)(int(BT_MAX_THREAD_COUNT), numThreads));
        omp_set_num_threads(1);  // hopefully, all previous threads get destroyed here
        omp_set_num_threads(m_numThreads);
        m_savedThreadCounter = 0;
        if (m_isActive)
        {
            btResetThreadIndexCounter();
        }
    }
    virtual void parallelFor(int iBegin, int iEnd, int grainSize, const btIParallelForBody& body) BT_OVERRIDE
    {
        BT_PROFILE("parallelFor_OpenMP");
        btPushThreadsAreRunning();
#pragma omp parallel for schedule(static, 1)
        for (int i = iBegin; i < iEnd; i += grainSize)
        {
            BT_PROFILE("OpenMP_forJob");
            body.forLoop(i, (std::min)(i + grainSize, iEnd));
        }
        btPopThreadsAreRunning();
    }
    virtual btScalar parallelSum(int iBegin, int iEnd, int grainSize, const btIParallelSumBody& body) BT_OVERRIDE
    {
        BT_PROFILE("parallelSum_OpenMP");
        btPushThreadsAreRunning();
        btScalar sum = btScalar(0);
#pragma omp parallel for schedule(static, 1) reduction(+ : sum)
        for (int i = iBegin; i < iEnd; i += grainSize)
        {
            BT_PROFILE("OpenMP_sumJob");
            sum += body.sumLoop(i, (std::min)(i + grainSize, iEnd));
        }
        btPopThreadsAreRunning();
        return sum;
    }
};
#endif  // #if BT_USE_OPENMP && BT_THREADSAFE

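// How the grainSize chunking above maps an index range onto forLoop()/sumLoop()
// calls, shown on a concrete (illustrative) example: with iBegin = 0, iEnd = 10,
// grainSize = 4, the loop issues three jobs, each handed to whichever OpenMP
// thread schedule(static, 1) assigns it:
//
//   body.forLoop(0, 4);
//   body.forLoop(4, 8);
//   body.forLoop(8, 10);  // the last chunk is clamped to iEnd by (std::min)(i + grainSize, iEnd)
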
#if BT_USE_TBB && BT_THREADSAFE
///
/// btTaskSchedulerTBB -- wrapper around Intel Threading Building Blocks task scheduler
///
class btTaskSchedulerTBB : public btITaskScheduler
{
    int m_numThreads;
    tbb::task_scheduler_init* m_tbbSchedulerInit;

public:
    btTaskSchedulerTBB() : btITaskScheduler("IntelTBB")
    {
        m_numThreads = 0;
        m_tbbSchedulerInit = NULL;
    }
    ~btTaskSchedulerTBB()
    {
        if (m_tbbSchedulerInit)
        {
            delete m_tbbSchedulerInit;
            m_tbbSchedulerInit = NULL;
        }
    }
    virtual int getMaxNumThreads() const BT_OVERRIDE
    {
        return tbb::task_scheduler_init::default_num_threads();
    }
    virtual int getNumThreads() const BT_OVERRIDE
    {
        return m_numThreads;
    }
    virtual void setNumThreads(int numThreads) BT_OVERRIDE
    {
        m_numThreads = (std::max)(1, (std::min)(int(BT_MAX_THREAD_COUNT), numThreads));
        if (m_tbbSchedulerInit)
        {
            // destroys all previous threads
            delete m_tbbSchedulerInit;
            m_tbbSchedulerInit = NULL;
        }
        m_tbbSchedulerInit = new tbb::task_scheduler_init(m_numThreads);
        m_savedThreadCounter = 0;
        if (m_isActive)
        {
            btResetThreadIndexCounter();
        }
    }
    struct ForBodyAdapter
    {
        const btIParallelForBody* mBody;

        ForBodyAdapter(const btIParallelForBody* body) : mBody(body) {}
        void operator()(const tbb::blocked_range<int>& range) const
        {
            BT_PROFILE("TBB_forJob");
            mBody->forLoop(range.begin(), range.end());
        }
    };
    virtual void parallelFor(int iBegin, int iEnd, int grainSize, const btIParallelForBody& body) BT_OVERRIDE
    {
        BT_PROFILE("parallelFor_TBB");
        ForBodyAdapter tbbBody(&body);
        btPushThreadsAreRunning();
        tbb::parallel_for(tbb::blocked_range<int>(iBegin, iEnd, grainSize),
                          tbbBody,
                          tbb::simple_partitioner());
        btPopThreadsAreRunning();
    }
    struct SumBodyAdapter
    {
        const btIParallelSumBody* mBody;
        btScalar mSum;

        SumBodyAdapter(const btIParallelSumBody* body) : mBody(body), mSum(btScalar(0)) {}
        SumBodyAdapter(const SumBodyAdapter& src, tbb::split) : mBody(src.mBody), mSum(btScalar(0)) {}
        void join(const SumBodyAdapter& src) { mSum += src.mSum; }
        void operator()(const tbb::blocked_range<int>& range)
        {
            BT_PROFILE("TBB_sumJob");
            mSum += mBody->sumLoop(range.begin(), range.end());
        }
    };
    virtual btScalar parallelSum(int iBegin, int iEnd, int grainSize, const btIParallelSumBody& body) BT_OVERRIDE
    {
        BT_PROFILE("parallelSum_TBB");
        SumBodyAdapter tbbBody(&body);
        btPushThreadsAreRunning();
        tbb::parallel_deterministic_reduce(tbb::blocked_range<int>(iBegin, iEnd, grainSize), tbbBody);
        btPopThreadsAreRunning();
        return tbbBody.mSum;
    }
};
#endif  // #if BT_USE_TBB && BT_THREADSAFE

#if BT_USE_PPL && BT_THREADSAFE
///
/// btTaskSchedulerPPL -- wrapper around Microsoft Parallel Patterns Lib task scheduler
///
class btTaskSchedulerPPL : public btITaskScheduler
{
    int m_numThreads;
    concurrency::combinable<btScalar> m_sum;  // for parallelSum

public:
    btTaskSchedulerPPL() : btITaskScheduler("PPL")
    {
        m_numThreads = 0;
    }
    virtual int getMaxNumThreads() const BT_OVERRIDE
    {
        return concurrency::GetProcessorCount();
    }
    virtual int getNumThreads() const BT_OVERRIDE
    {
        return m_numThreads;
    }
    virtual void setNumThreads(int numThreads) BT_OVERRIDE
    {
        // capping the thread count for PPL due to a thread-index issue
        const int maxThreadCount = (std::min)(int(BT_MAX_THREAD_COUNT), 31);
        m_numThreads = (std::max)(1, (std::min)(maxThreadCount, numThreads));
        using namespace concurrency;
        if (CurrentScheduler::Id() != -1)
        {
            CurrentScheduler::Detach();
        }
        SchedulerPolicy policy;
        {
            // PPL seems to destroy threads when threadpool is shrunk, but keeps reusing old threads
            // force it to destroy old threads
            policy.SetConcurrencyLimits(1, 1);
            CurrentScheduler::Create(policy);
            CurrentScheduler::Detach();
        }
        policy.SetConcurrencyLimits(m_numThreads, m_numThreads);
        CurrentScheduler::Create(policy);
        m_savedThreadCounter = 0;
        if (m_isActive)
        {
            btResetThreadIndexCounter();
        }
    }
    struct ForBodyAdapter
    {
        const btIParallelForBody* mBody;
        int mGrainSize;
        int mIndexEnd;

        ForBodyAdapter(const btIParallelForBody* body, int grainSize, int end) : mBody(body), mGrainSize(grainSize), mIndexEnd(end) {}
        void operator()(int i) const
        {
            BT_PROFILE("PPL_forJob");
            mBody->forLoop(i, (std::min)(i + mGrainSize, mIndexEnd));
        }
    };
    virtual void parallelFor(int iBegin, int iEnd, int grainSize, const btIParallelForBody& body) BT_OVERRIDE
    {
        BT_PROFILE("parallelFor_PPL");
        // PPL dispatch
        ForBodyAdapter pplBody(&body, grainSize, iEnd);
        btPushThreadsAreRunning();
        // note: MSVC 2010 doesn't support partitioner args, so avoid them
        concurrency::parallel_for(iBegin, iEnd, grainSize, pplBody);
        btPopThreadsAreRunning();
    }
    struct SumBodyAdapter
    {
        const btIParallelSumBody* mBody;
        concurrency::combinable<btScalar>* mSum;
        int mGrainSize;
        int mIndexEnd;

        SumBodyAdapter(const btIParallelSumBody* body, concurrency::combinable<btScalar>* sum, int grainSize, int end) : mBody(body), mSum(sum), mGrainSize(grainSize), mIndexEnd(end) {}
        void operator()(int i) const
        {
            BT_PROFILE("PPL_sumJob");
            mSum->local() += mBody->sumLoop(i, (std::min)(i + mGrainSize, mIndexEnd));
        }
    };
    static btScalar sumFunc(btScalar a, btScalar b) { return a + b; }
    virtual btScalar parallelSum(int iBegin, int iEnd, int grainSize, const btIParallelSumBody& body) BT_OVERRIDE
    {
        BT_PROFILE("parallelSum_PPL");
        m_sum.clear();
        SumBodyAdapter pplBody(&body, &m_sum, grainSize, iEnd);
        btPushThreadsAreRunning();
        // note: MSVC 2010 doesn't support partitioner args, so avoid them
        concurrency::parallel_for(iBegin, iEnd, grainSize, pplBody);
        btPopThreadsAreRunning();
        return m_sum.combine(sumFunc);
    }
};
#endif  // #if BT_USE_PPL && BT_THREADSAFE

// create a non-threaded task scheduler (always available)
btITaskScheduler* btGetSequentialTaskScheduler()
{
    static btTaskSchedulerSequential sTaskScheduler;
    return &sTaskScheduler;
}

// create an OpenMP task scheduler (if available, otherwise returns null)
btITaskScheduler* btGetOpenMPTaskScheduler()
{
#if BT_USE_OPENMP && BT_THREADSAFE
    static btTaskSchedulerOpenMP sTaskScheduler;
    return &sTaskScheduler;
#else
    return NULL;
#endif
}

// create an Intel TBB task scheduler (if available, otherwise returns null)
btITaskScheduler* btGetTBBTaskScheduler()
{
#if BT_USE_TBB && BT_THREADSAFE
    static btTaskSchedulerTBB sTaskScheduler;
    return &sTaskScheduler;
#else
    return NULL;
#endif
}

// create a PPL task scheduler (if available, otherwise returns null)
btITaskScheduler* btGetPPLTaskScheduler()
{
#if BT_USE_PPL && BT_THREADSAFE
    static btTaskSchedulerPPL sTaskScheduler;
    return &sTaskScheduler;
#else
    return NULL;
#endif
}
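
// A typical way for an application to pick the best available scheduler, falling
// back to the sequential one (a sketch of hypothetical caller code; the factory
// functions above return NULL when the corresponding backend is not compiled in):
//
//   btITaskScheduler* ts = btGetOpenMPTaskScheduler();
//   if (!ts) ts = btGetTBBTaskScheduler();
//   if (!ts) ts = btGetPPLTaskScheduler();
//   if (!ts) ts = btGetSequentialTaskScheduler();  // always available
//   ts->setNumThreads(ts->getMaxNumThreads());
//   btSetTaskScheduler(ts);  // must be called from the main thread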