parallel_prefix_sum.h

// ======================================================================== //
// Copyright 2009-2017 Intel Corporation                                    //
//                                                                          //
// Licensed under the Apache License, Version 2.0 (the "License");          //
// you may not use this file except in compliance with the License.         //
// You may obtain a copy of the License at                                  //
//                                                                          //
//     http://www.apache.org/licenses/LICENSE-2.0                           //
//                                                                          //
// Unless required by applicable law or agreed to in writing, software      //
// distributed under the License is distributed on an "AS IS" BASIS,        //
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. //
// See the License for the specific language governing permissions and      //
// limitations under the License.                                           //
// ======================================================================== //

#pragma once

#include "parallel_for.h"

namespace embree
{
  /*! stores per-block partial results (counts) and their exclusive prefix sums (sums) */
  template<typename Value>
  struct ParallelPrefixSumState
  {
    enum { MAX_TASKS = 512 };
    Value counts[MAX_TASKS];
    Value sums  [MAX_TASKS];
  };

  /*! generic block-wise prefix sum: evaluates func once per block (passing that block's
      entry of state.sums), then scans the per-block results into exclusive prefix sums
      stored in state.sums and returns the total */
  template<typename Index, typename Value, typename Func, typename Reduction>
  __forceinline Value parallel_prefix_sum( ParallelPrefixSumState<Value>& state, Index first, Index last, Index minStepSize, const Value& identity, const Func& func, const Reduction& reduction)
  {
    /* calculate number of tasks to use */
    const size_t numThreads = TaskScheduler::threadCount();
    const size_t numBlocks  = (last-first+minStepSize-1)/minStepSize;
    const size_t taskCount  = min(numThreads,numBlocks,size_t(ParallelPrefixSumState<Value>::MAX_TASKS));

    /* perform parallel prefix sum */
    parallel_for(taskCount, [&](const size_t taskIndex)
    {
      const size_t i0 = first+(taskIndex+0)*(last-first)/taskCount;
      const size_t i1 = first+(taskIndex+1)*(last-first)/taskCount;
      state.counts[taskIndex] = func(range<size_t>(i0,i1),state.sums[taskIndex]);
    });

    /* calculate prefix sum */
    Value sum=identity;
    for (size_t i=0; i<taskCount; i++)
    {
      const Value c = state.counts[i];
      state.sums[i] = sum;
      sum=reduction(sum,c);
    }

    return sum;
  }
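
  /* Illustrative sketch (not part of the original header): one way the generic overload
     above might be used directly for a two-pass stream compaction. The container names,
     the predicate, and the use of std::plus (from <functional>) are assumptions made for
     this example. The first pass only fills state.counts/state.sums; the second pass then
     reads state.sums[taskIndex] (passed here as `base`) as the exclusive prefix of its block.

       ParallelPrefixSumState<size_t> state;
       std::vector<int> data = { 3, 4, 7, 8, 10, 11 };
       std::vector<int> even;

       // first pass: count even elements per block, building exclusive block prefixes
       const size_t total = parallel_prefix_sum(state, size_t(0), data.size(), size_t(1024), size_t(0),
         [&](const range<size_t>& r, const size_t&) -> size_t {
           size_t n = 0;
           for (size_t i=r.begin(); i<r.end(); i++) if (data[i] % 2 == 0) n++;
           return n;
         }, std::plus<size_t>());

       even.resize(total);

       // second pass: each block writes its elements starting at its exclusive prefix
       parallel_prefix_sum(state, size_t(0), data.size(), size_t(1024), size_t(0),
         [&](const range<size_t>& r, const size_t& base) -> size_t {
           size_t k = base;
           for (size_t i=r.begin(); i<r.end(); i++) if (data[i] % 2 == 0) even[k++] = data[i];
           return k - base;
         }, std::plus<size_t>());
  */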

  /*! parallel calculation of prefix sums */
  template<typename SrcArray, typename DstArray, typename Value, typename Add>
  __forceinline Value parallel_prefix_sum(const SrcArray& src, DstArray& dst, size_t N, const Value& identity, const Add& add, const size_t SINGLE_THREAD_THRESHOLD = 4096)
  {
    /* perform single threaded prefix operation for small N */
    if (N < SINGLE_THREAD_THRESHOLD)
    {
      /* dst[i] receives the exclusive prefix before sum is advanced by src[i] */
      Value sum=identity;
      for (size_t i=0; i<N; sum=add(sum,src[i++])) dst[i] = sum;
      return sum;
    }

    /* perform parallel prefix operation for large N */
    else
    {
      ParallelPrefixSumState<Value> state;

      /* initial run just sets up start values for subtasks */
      parallel_prefix_sum( state, size_t(0), size_t(N), size_t(1024), identity, [&](const range<size_t>& r, const Value& sum) -> Value {

          Value s = identity;
          for (size_t i=r.begin(); i<r.end(); i++) s = add(s,src[i]);
          return s;

        }, add);

      /* final run calculates prefix sum */
      return parallel_prefix_sum( state, size_t(0), size_t(N), size_t(1024), identity, [&](const range<size_t>& r, const Value& sum) -> Value {

          Value s = identity;
          for (size_t i=r.begin(); i<r.end(); i++) {
            dst[i] = add(sum,s);
            s = add(s,src[i]);
          }
          return s;

        }, add);
    }
  }
}
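
/* Illustrative sketch (not part of the original header): calling the array-based overload
   above. The container and element types are assumptions made for this example. dst[i]
   receives the exclusive prefix sum of src[0..i-1] and the total sum is returned; with
   N >= SINGLE_THREAD_THRESHOLD the parallel two-pass path is taken.

     std::vector<uint32_t> src(100000, 1);
     std::vector<uint32_t> dst(src.size());
     const uint32_t total = embree::parallel_prefix_sum(src, dst, src.size(), uint32_t(0),
       [](uint32_t a, uint32_t b) { return a + b; });
     // total == 100000, dst[0] == 0, dst[i] == i
*/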