parallel_partition.h 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278
  1. // ======================================================================== //
  2. // Copyright 2009-2017 Intel Corporation //
  3. // //
  4. // Licensed under the Apache License, Version 2.0 (the "License"); //
  5. // you may not use this file except in compliance with the License. //
  6. // You may obtain a copy of the License at //
  7. // //
  8. // http://www.apache.org/licenses/LICENSE-2.0 //
  9. // //
  10. // Unless required by applicable law or agreed to in writing, software //
  11. // distributed under the License is distributed on an "AS IS" BASIS, //
  12. // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. //
  13. // See the License for the specific language governing permissions and //
  14. // limitations under the License. //
  15. // ======================================================================== //
  16. #pragma once
  17. #include "parallel_for.h"
  18. #include "../common/math/range.h"
  19. namespace embree
  20. {
  21. /* serial partitioning */
  22. template<typename T, typename V, typename IsLeft, typename Reduction_T>
  23. __forceinline size_t serial_partitioning(T* array,
  24. const size_t begin,
  25. const size_t end,
  26. V& leftReduction,
  27. V& rightReduction,
  28. const IsLeft& is_left,
  29. const Reduction_T& reduction_t)
  30. {
  31. T* l = array + begin;
  32. T* r = array + end - 1;
  33. while(1)
  34. {
  35. /* *l < pivot */
  36. while (likely(l <= r && is_left(*l) ))
  37. {
  38. //prefetchw(l+4); // FIXME: enable?
  39. reduction_t(leftReduction,*l);
  40. ++l;
  41. }
  42. /* *r >= pivot) */
  43. while (likely(l <= r && !is_left(*r)))
  44. {
  45. //prefetchw(r-4); FIXME: enable?
  46. reduction_t(rightReduction,*r);
  47. --r;
  48. }
  49. if (r<l) break;
  50. reduction_t(leftReduction ,*r);
  51. reduction_t(rightReduction,*l);
  52. xchg(*l,*r);
  53. l++; r--;
  54. }
  55. return l - array;
  56. }
  57. template<typename T, typename V, typename Vi, typename IsLeft, typename Reduction_T, typename Reduction_V>
  58. class __aligned(64) parallel_partition_task
  59. {
  60. ALIGNED_CLASS;
  61. private:
  62. static const size_t MAX_TASKS = 512;
  63. T* array;
  64. size_t N;
  65. const IsLeft& is_left;
  66. const Reduction_T& reduction_t;
  67. const Reduction_V& reduction_v;
  68. const Vi& identity;
  69. size_t numTasks;
  70. __aligned(64) size_t counter_start[MAX_TASKS+1];
  71. __aligned(64) size_t counter_left[MAX_TASKS+1];
  72. __aligned(64) range<ssize_t> leftMisplacedRanges[MAX_TASKS];
  73. __aligned(64) range<ssize_t> rightMisplacedRanges[MAX_TASKS];
  74. __aligned(64) V leftReductions[MAX_TASKS];
  75. __aligned(64) V rightReductions[MAX_TASKS];
  76. public:
  77. __forceinline parallel_partition_task(T* array,
  78. const size_t N,
  79. const Vi& identity,
  80. const IsLeft& is_left,
  81. const Reduction_T& reduction_t,
  82. const Reduction_V& reduction_v,
  83. const size_t BLOCK_SIZE)
  84. : array(array), N(N), is_left(is_left), reduction_t(reduction_t), reduction_v(reduction_v), identity(identity),
  85. numTasks(min((N+BLOCK_SIZE-1)/BLOCK_SIZE,min(TaskScheduler::threadCount(),MAX_TASKS))) {}
  86. __forceinline const range<ssize_t>* findStartRange(size_t& index, const range<ssize_t>* const r, const size_t numRanges)
  87. {
  88. size_t i = 0;
  89. while(index >= (size_t)r[i].size())
  90. {
  91. assert(i < numRanges);
  92. index -= (size_t)r[i].size();
  93. i++;
  94. }
  95. return &r[i];
  96. }
  97. __forceinline void swapItemsInMisplacedRanges(const size_t numLeftMisplacedRanges,
  98. const size_t numRightMisplacedRanges,
  99. const size_t startID,
  100. const size_t endID)
  101. {
  102. size_t leftLocalIndex = startID;
  103. size_t rightLocalIndex = startID;
  104. const range<ssize_t>* l_range = findStartRange(leftLocalIndex,leftMisplacedRanges,numLeftMisplacedRanges);
  105. const range<ssize_t>* r_range = findStartRange(rightLocalIndex,rightMisplacedRanges,numRightMisplacedRanges);
  106. size_t l_left = l_range->size() - leftLocalIndex;
  107. size_t r_left = r_range->size() - rightLocalIndex;
  108. T *__restrict__ l = &array[l_range->begin() + leftLocalIndex];
  109. T *__restrict__ r = &array[r_range->begin() + rightLocalIndex];
  110. size_t size = endID - startID;
  111. size_t items = min(size,min(l_left,r_left));
  112. while (size)
  113. {
  114. if (unlikely(l_left == 0))
  115. {
  116. l_range++;
  117. l_left = l_range->size();
  118. l = &array[l_range->begin()];
  119. items = min(size,min(l_left,r_left));
  120. }
  121. if (unlikely(r_left == 0))
  122. {
  123. r_range++;
  124. r_left = r_range->size();
  125. r = &array[r_range->begin()];
  126. items = min(size,min(l_left,r_left));
  127. }
  128. size -= items;
  129. l_left -= items;
  130. r_left -= items;
  131. while(items) {
  132. items--;
  133. xchg(*l++,*r++);
  134. }
  135. }
  136. }
  137. __forceinline size_t partition(V& leftReduction, V& rightReduction)
  138. {
  139. /* partition the individual ranges for each task */
  140. parallel_for(numTasks,[&] (const size_t taskID) {
  141. const size_t startID = (taskID+0)*N/numTasks;
  142. const size_t endID = (taskID+1)*N/numTasks;
  143. V local_left(identity);
  144. V local_right(identity);
  145. const size_t mid = serial_partitioning(array,startID,endID,local_left,local_right,is_left,reduction_t);
  146. counter_start[taskID] = startID;
  147. counter_left [taskID] = mid-startID;
  148. leftReductions[taskID] = local_left;
  149. rightReductions[taskID] = local_right;
  150. });
  151. counter_start[numTasks] = N;
  152. counter_left[numTasks] = 0;
  153. /* finalize the reductions */
  154. for (size_t i=0; i<numTasks; i++) {
  155. reduction_v(leftReduction,leftReductions[i]);
  156. reduction_v(rightReduction,rightReductions[i]);
  157. }
  158. /* calculate mid point for partitioning */
  159. size_t mid = counter_left[0];
  160. for (size_t i=1; i<numTasks; i++)
  161. mid += counter_left[i];
  162. const range<ssize_t> globalLeft (0,mid);
  163. const range<ssize_t> globalRight(mid,N);
  164. /* calculate all left and right ranges that are on the wrong global side */
  165. size_t numMisplacedRangesLeft = 0;
  166. size_t numMisplacedRangesRight = 0;
  167. size_t numMisplacedItemsLeft = 0;
  168. size_t numMisplacedItemsRight = 0;
  169. for (size_t i=0; i<numTasks; i++)
  170. {
  171. const range<ssize_t> left_range (counter_start[i], counter_start[i] + counter_left[i]);
  172. const range<ssize_t> right_range(counter_start[i] + counter_left[i], counter_start[i+1]);
  173. const range<ssize_t> left_misplaced = globalLeft. intersect(right_range);
  174. const range<ssize_t> right_misplaced = globalRight.intersect(left_range);
  175. if (!left_misplaced.empty())
  176. {
  177. numMisplacedItemsLeft += left_misplaced.size();
  178. leftMisplacedRanges[numMisplacedRangesLeft++] = left_misplaced;
  179. }
  180. if (!right_misplaced.empty())
  181. {
  182. numMisplacedItemsRight += right_misplaced.size();
  183. rightMisplacedRanges[numMisplacedRangesRight++] = right_misplaced;
  184. }
  185. }
  186. assert( numMisplacedItemsLeft == numMisplacedItemsRight );
  187. /* if no items are misplaced we are done */
  188. if (numMisplacedItemsLeft == 0)
  189. return mid;
  190. /* otherwise we copy the items to the right place in parallel */
  191. parallel_for(numTasks,[&] (const size_t taskID) {
  192. const size_t startID = (taskID+0)*numMisplacedItemsLeft/numTasks;
  193. const size_t endID = (taskID+1)*numMisplacedItemsLeft/numTasks;
  194. swapItemsInMisplacedRanges(numMisplacedRangesLeft,numMisplacedRangesRight,startID,endID);
  195. });
  196. return mid;
  197. }
  198. };
  199. template<typename T, typename V, typename Vi, typename IsLeft, typename Reduction_T, typename Reduction_V>
  200. __noinline size_t parallel_partitioning(T* array,
  201. const size_t begin,
  202. const size_t end,
  203. const Vi &identity,
  204. V &leftReduction,
  205. V &rightReduction,
  206. const IsLeft& is_left,
  207. const Reduction_T& reduction_t,
  208. const Reduction_V& reduction_v,
  209. size_t BLOCK_SIZE = 128)
  210. {
  211. /* fall back to single threaded partitioning for small N */
  212. if (unlikely(end-begin < BLOCK_SIZE))
  213. return serial_partitioning(array,begin,end,leftReduction,rightReduction,is_left,reduction_t);
  214. /* otherwise use parallel code */
  215. else {
  216. typedef parallel_partition_task<T,V,Vi,IsLeft,Reduction_T,Reduction_V> partition_task;
  217. std::unique_ptr<partition_task> p(new partition_task(&array[begin],end-begin,identity,is_left,reduction_t,reduction_v,BLOCK_SIZE));
  218. return begin+p->partition(leftReduction,rightReduction);
  219. }
  220. }
  221. template<typename T, typename V, typename Vi, typename IsLeft, typename Reduction_T, typename Reduction_V>
  222. __noinline size_t parallel_partitioning(T* array,
  223. const size_t begin,
  224. const size_t end,
  225. const Vi &identity,
  226. V &leftReduction,
  227. V &rightReduction,
  228. const IsLeft& is_left,
  229. const Reduction_T& reduction_t,
  230. const Reduction_V& reduction_v,
  231. size_t BLOCK_SIZE,
  232. size_t PARALLEL_THRESHOLD)
  233. {
  234. /* fall back to single threaded partitioning for small N */
  235. if (unlikely(end-begin < PARALLEL_THRESHOLD))
  236. return serial_partitioning(array,begin,end,leftReduction,rightReduction,is_left,reduction_t);
  237. /* otherwise use parallel code */
  238. else {
  239. typedef parallel_partition_task<T,V,Vi,IsLeft,Reduction_T,Reduction_V> partition_task;
  240. std::unique_ptr<partition_task> p(new partition_task(&array[begin],end-begin,identity,is_left,reduction_t,reduction_v,BLOCK_SIZE));
  241. return begin+p->partition(leftReduction,rightReduction);
  242. }
  243. }
  244. }