// parallel_partition.h
  1. // Copyright 2009-2021 Intel Corporation
  2. // SPDX-License-Identifier: Apache-2.0
  3. #pragma once
  4. #include "parallel_for.h"
  5. #include "../math/range.h"
  6. namespace embree
  7. {
  8. /* serial partitioning */
  9. template<typename T, typename V, typename IsLeft, typename Reduction_T>
  10. __forceinline size_t serial_partitioning(T* array,
  11. const size_t begin,
  12. const size_t end,
  13. V& leftReduction,
  14. V& rightReduction,
  15. const IsLeft& is_left,
  16. const Reduction_T& reduction_t)
  17. {
  18. T* l = array + begin;
  19. T* r = array + end - 1;
  20. while(1)
  21. {
  22. /* *l < pivot */
  23. while (likely(l <= r && is_left(*l) ))
  24. {
  25. //prefetchw(l+4); // FIXME: enable?
  26. reduction_t(leftReduction,*l);
  27. ++l;
  28. }
  29. /* *r >= pivot) */
  30. while (likely(l <= r && !is_left(*r)))
  31. {
  32. //prefetchw(r-4); FIXME: enable?
  33. reduction_t(rightReduction,*r);
  34. --r;
  35. }
  36. if (r<l) break;
  37. reduction_t(leftReduction ,*r);
  38. reduction_t(rightReduction,*l);
  39. xchg(*l,*r);
  40. l++; r--;
  41. }
  42. return l - array;
  43. }
/* Parallel partitioning task: splits [0,N) into one chunk per task,
   partitions every chunk serially, and afterwards swaps the items that
   ended up on the wrong side of the global split point. User-supplied
   reductions over the left and right halves are computed along the way. */
template<typename T, typename V, typename Vi, typename IsLeft, typename Reduction_T, typename Reduction_V>
class __aligned(64) parallel_partition_task
{
  ALIGNED_CLASS_(64);
private:

  static const size_t MAX_TASKS = 64; // maximal number of worker tasks (sizes the per-task arrays below)

  T* array;                       // array to partition (externally owned)
  size_t N;                       // number of elements in 'array'
  const IsLeft& is_left;          // predicate selecting elements that belong to the left side
  const Reduction_T& reduction_t; // folds a single element into a reduction value V
  const Reduction_V& reduction_v; // merges two reduction values V
  const Vi& identity;             // neutral element used to seed reductions
  size_t numTasks;                // number of tasks actually used (<= MAX_TASKS)

  /* per-task state, cache-line aligned to avoid false sharing between tasks */
  __aligned(64) size_t counter_start[MAX_TASKS+1];              // chunk start index per task; entry [numTasks] is N
  __aligned(64) size_t counter_left[MAX_TASKS+1];               // number of left-side elements found per chunk
  __aligned(64) range<ssize_t> leftMisplacedRanges[MAX_TASKS];  // ranges in the global left half holding right-side items
  __aligned(64) range<ssize_t> rightMisplacedRanges[MAX_TASKS]; // ranges in the global right half holding left-side items
  __aligned(64) V leftReductions[MAX_TASKS];                    // per-task reduction over left-side elements
  __aligned(64) V rightReductions[MAX_TASKS];                   // per-task reduction over right-side elements

public:

  /* Sets up a partition job over 'array' of size N. The task count is
     chosen so each task gets roughly BLOCK_SIZE items, bounded by the
     scheduler's thread count and MAX_TASKS. */
  __forceinline parallel_partition_task(T* array,
                                        const size_t N,
                                        const Vi& identity,
                                        const IsLeft& is_left,
                                        const Reduction_T& reduction_t,
                                        const Reduction_V& reduction_v,
                                        const size_t BLOCK_SIZE)
    : array(array), N(N), is_left(is_left), reduction_t(reduction_t), reduction_v(reduction_v), identity(identity),
      numTasks(min((N+BLOCK_SIZE-1)/BLOCK_SIZE,min(TaskScheduler::threadCount(),MAX_TASKS))) {}

  /* Given a global item offset 'index' into the concatenation of the
     ranges r[0..numRanges), returns the range containing that item and
     rewrites 'index' to the offset local to that range. */
  __forceinline const range<ssize_t>* findStartRange(size_t& index, const range<ssize_t>* const r, const size_t numRanges)
  {
    size_t i = 0;
    while(index >= (size_t)r[i].size())
    {
      assert(i < numRanges);
      index -= (size_t)r[i].size();
      i++;
    }
    return &r[i];
  }

  /* Swaps the misplaced items with global offsets [startID,endID)
     between the left-misplaced and right-misplaced range lists. The
     interval may span several ranges on either side, so the loop walks
     both lists simultaneously and exchanges items in maximal runs. */
  __forceinline void swapItemsInMisplacedRanges(const size_t numLeftMisplacedRanges,
                                                const size_t numRightMisplacedRanges,
                                                const size_t startID,
                                                const size_t endID)
  {
    size_t leftLocalIndex = startID;
    size_t rightLocalIndex = startID;

    /* locate the ranges and local offsets where this task starts */
    const range<ssize_t>* l_range = findStartRange(leftLocalIndex,leftMisplacedRanges,numLeftMisplacedRanges);
    const range<ssize_t>* r_range = findStartRange(rightLocalIndex,rightMisplacedRanges,numRightMisplacedRanges);

    size_t l_left = l_range->size() - leftLocalIndex;  // items remaining in the current left range
    size_t r_left = r_range->size() - rightLocalIndex; // items remaining in the current right range

    T *__restrict__ l = &array[l_range->begin() + leftLocalIndex];
    T *__restrict__ r = &array[r_range->begin() + rightLocalIndex];

    size_t size = endID - startID;               // total number of items still to swap
    size_t items = min(size,min(l_left,r_left)); // length of the next contiguous run

    while (size)
    {
      /* current left range exhausted: advance to the next one */
      if (unlikely(l_left == 0))
      {
        l_range++;
        l_left = l_range->size();
        l = &array[l_range->begin()];
        items = min(size,min(l_left,r_left));
      }

      /* current right range exhausted: advance to the next one */
      if (unlikely(r_left == 0))
      {
        r_range++;
        r_left = r_range->size();
        r = &array[r_range->begin()];
        items = min(size,min(l_left,r_left));
      }

      size -= items;
      l_left -= items;
      r_left -= items;

      /* exchange one maximal run of misplaced pairs */
      while(items) {
        items--;
        xchg(*l++,*r++);
      }
    }
  }

  /* Runs the partition and returns the index of the split point.
     'leftReduction'/'rightReduction' receive the merged reductions over
     the final left/right halves. */
  __forceinline size_t partition(V& leftReduction, V& rightReduction)
  {
    /* partition the individual ranges for each task */
    parallel_for(numTasks,[&] (const size_t taskID) {
      const size_t startID = (taskID+0)*N/numTasks;
      const size_t endID   = (taskID+1)*N/numTasks;
      V local_left(identity);
      V local_right(identity);
      const size_t mid = serial_partitioning(array,startID,endID,local_left,local_right,is_left,reduction_t);
      counter_start[taskID] = startID;
      counter_left [taskID] = mid-startID;
      leftReductions[taskID] = local_left;
      rightReductions[taskID] = local_right;
    });
    counter_start[numTasks] = N;
    counter_left[numTasks] = 0;

    /* finalize the reductions */
    for (size_t i=0; i<numTasks; i++) {
      reduction_v(leftReduction,leftReductions[i]);
      reduction_v(rightReduction,rightReductions[i]);
    }

    /* calculate mid point for partitioning: total count of left-side elements */
    size_t mid = counter_left[0];
    for (size_t i=1; i<numTasks; i++)
      mid += counter_left[i];

    const range<ssize_t> globalLeft (0,mid);
    const range<ssize_t> globalRight(mid,N);

    /* calculate all left and right ranges that are on the wrong global side */
    size_t numMisplacedRangesLeft  = 0;
    size_t numMisplacedRangesRight = 0;
    size_t numMisplacedItemsLeft  MAYBE_UNUSED = 0;
    size_t numMisplacedItemsRight MAYBE_UNUSED = 0;

    for (size_t i=0; i<numTasks; i++)
    {
      /* each task's chunk splits into its left- and right-classified part */
      const range<ssize_t> left_range (counter_start[i], counter_start[i] + counter_left[i]);
      const range<ssize_t> right_range(counter_start[i] + counter_left[i], counter_start[i+1]);

      /* right-classified items inside the global left half are misplaced (and vice versa) */
      const range<ssize_t> left_misplaced  = globalLeft. intersect(right_range);
      const range<ssize_t> right_misplaced = globalRight.intersect(left_range);

      if (!left_misplaced.empty())
      {
        numMisplacedItemsLeft += left_misplaced.size();
        leftMisplacedRanges[numMisplacedRangesLeft++] = left_misplaced;
      }

      if (!right_misplaced.empty())
      {
        numMisplacedItemsRight += right_misplaced.size();
        rightMisplacedRanges[numMisplacedRangesRight++] = right_misplaced;
      }
    }
    /* misplaced items pair up one-to-one across the split point */
    assert( numMisplacedItemsLeft == numMisplacedItemsRight );

    /* if no items are misplaced we are done */
    if (numMisplacedItemsLeft == 0)
      return mid;

    /* otherwise we copy the items to the right place in parallel */
    parallel_for(numTasks,[&] (const size_t taskID) {
      const size_t startID = (taskID+0)*numMisplacedItemsLeft/numTasks;
      const size_t endID   = (taskID+1)*numMisplacedItemsLeft/numTasks;
      swapItemsInMisplacedRanges(numMisplacedRangesLeft,numMisplacedRangesRight,startID,endID);
    });

    return mid;
  }
};
  186. template<typename T, typename V, typename Vi, typename IsLeft, typename Reduction_T, typename Reduction_V>
  187. __noinline size_t parallel_partitioning(T* array,
  188. const size_t begin,
  189. const size_t end,
  190. const Vi &identity,
  191. V &leftReduction,
  192. V &rightReduction,
  193. const IsLeft& is_left,
  194. const Reduction_T& reduction_t,
  195. const Reduction_V& reduction_v,
  196. size_t BLOCK_SIZE = 128)
  197. {
  198. /* fall back to single threaded partitioning for small N */
  199. if (unlikely(end-begin < BLOCK_SIZE))
  200. return serial_partitioning(array,begin,end,leftReduction,rightReduction,is_left,reduction_t);
  201. /* otherwise use parallel code */
  202. else {
  203. typedef parallel_partition_task<T,V,Vi,IsLeft,Reduction_T,Reduction_V> partition_task;
  204. std::unique_ptr<partition_task> p(new partition_task(&array[begin],end-begin,identity,is_left,reduction_t,reduction_v,BLOCK_SIZE));
  205. return begin+p->partition(leftReduction,rightReduction);
  206. }
  207. }
  208. template<typename T, typename V, typename Vi, typename IsLeft, typename Reduction_T, typename Reduction_V>
  209. __noinline size_t parallel_partitioning(T* array,
  210. const size_t begin,
  211. const size_t end,
  212. const Vi &identity,
  213. V &leftReduction,
  214. V &rightReduction,
  215. const IsLeft& is_left,
  216. const Reduction_T& reduction_t,
  217. const Reduction_V& reduction_v,
  218. size_t BLOCK_SIZE,
  219. size_t PARALLEL_THRESHOLD)
  220. {
  221. /* fall back to single threaded partitioning for small N */
  222. if (unlikely(end-begin < PARALLEL_THRESHOLD))
  223. return serial_partitioning(array,begin,end,leftReduction,rightReduction,is_left,reduction_t);
  224. /* otherwise use parallel code */
  225. else {
  226. typedef parallel_partition_task<T,V,Vi,IsLeft,Reduction_T,Reduction_V> partition_task;
  227. std::unique_ptr<partition_task> p(new partition_task(&array[begin],end-begin,identity,is_left,reduction_t,reduction_v,BLOCK_SIZE));
  228. return begin+p->partition(leftReduction,rightReduction);
  229. }
  230. }
  231. template<typename T, typename IsLeft>
  232. inline size_t parallel_partitioning(T* array,
  233. const size_t begin,
  234. const size_t end,
  235. const IsLeft& is_left,
  236. size_t BLOCK_SIZE = 128)
  237. {
  238. size_t leftReduction = 0;
  239. size_t rightReduction = 0;
  240. return parallel_partitioning(
  241. array,begin,end,0,leftReduction,rightReduction,is_left,
  242. [] (size_t& t,const T& ref) { },
  243. [] (size_t& t0,size_t& t1) { },
  244. BLOCK_SIZE);
  245. }
  246. }