parallel_sort.h

// Copyright 2009-2020 Intel Corporation
// SPDX-License-Identifier: Apache-2.0

#pragma once

#include "../simd/simd.h"
#include "parallel_for.h"

#if defined(TASKING_GCD) && defined(BUILD_IOS)
#include "../sys/alloc.h"
#endif

#include <algorithm>

namespace embree
{
  /* in-place insertion sort, ascending order */
  template<class T>
  __forceinline void insertionsort_ascending(T *__restrict__ array, const size_t length)
  {
    for (size_t i = 1; i < length; ++i)
    {
      T v = array[i];
      size_t j = i;
      while (j > 0 && v < array[j-1])
      {
        array[j] = array[j-1];
        --j;
      }
      array[j] = v;
    }
  }
  /* in-place insertion sort, descending order */
  template<class T>
  __forceinline void insertionsort_decending(T *__restrict__ array, const size_t length)
  {
    for (size_t i = 1; i < length; ++i)
    {
      T v = array[i];
      size_t j = i;
      while (j > 0 && v > array[j-1])
      {
        array[j] = array[j-1];
        --j;
      }
      array[j] = v;
    }
  }
  /* recursive quicksort, ascending order; Hoare-style partitioning with the first element as pivot */
  template<class T>
  void quicksort_ascending(T *__restrict__ t,
                           const ssize_t begin,
                           const ssize_t end)
  {
    if (likely(begin < end))
    {
      const T pivotvalue = t[begin];
      ssize_t left  = begin - 1;
      ssize_t right = end   + 1;

      while (1)
      {
        while (t[--right] > pivotvalue);
        while (t[++left]  < pivotvalue);

        if (left >= right) break;

        const T temp = t[right];
        t[right] = t[left];
        t[left]  = temp;
      }

      const ssize_t pivot = right;
      quicksort_ascending(t, begin, pivot);
      quicksort_ascending(t, pivot + 1, end);
    }
  }
  /* recursive quicksort, descending order */
  template<class T>
  void quicksort_decending(T *__restrict__ t,
                           const ssize_t begin,
                           const ssize_t end)
  {
    if (likely(begin < end))
    {
      const T pivotvalue = t[begin];
      ssize_t left  = begin - 1;
      ssize_t right = end   + 1;

      while (1)
      {
        while (t[--right] < pivotvalue);
        while (t[++left]  > pivotvalue);

        if (left >= right) break;

        const T temp = t[right];
        t[right] = t[left];
        t[left]  = temp;
      }

      const ssize_t pivot = right;
      quicksort_decending(t, begin, pivot);
      quicksort_decending(t, pivot + 1, end);
    }
  }
  /* hybrid quicksort, ascending order: ranges of at most THRESHOLD elements fall back to insertion sort */
  template<class T, ssize_t THRESHOLD>
  void quicksort_insertionsort_ascending(T *__restrict__ t,
                                         const ssize_t begin,
                                         const ssize_t end)
  {
    if (likely(begin < end))
    {
      const ssize_t size = end-begin+1;
      if (likely(size <= THRESHOLD))
      {
        insertionsort_ascending<T>(&t[begin], size);
      }
      else
      {
        const T pivotvalue = t[begin];
        ssize_t left  = begin - 1;
        ssize_t right = end   + 1;

        while (1)
        {
          while (t[--right] > pivotvalue);
          while (t[++left]  < pivotvalue);

          if (left >= right) break;

          const T temp = t[right];
          t[right] = t[left];
          t[left]  = temp;
        }

        const ssize_t pivot = right;
        quicksort_insertionsort_ascending<T,THRESHOLD>(t, begin, pivot);
        quicksort_insertionsort_ascending<T,THRESHOLD>(t, pivot + 1, end);
      }
    }
  }
  /* hybrid quicksort, descending order: ranges of at most THRESHOLD elements fall back to insertion sort */
  template<class T, ssize_t THRESHOLD>
  void quicksort_insertionsort_decending(T *__restrict__ t,
                                         const ssize_t begin,
                                         const ssize_t end)
  {
    if (likely(begin < end))
    {
      const ssize_t size = end-begin+1;
      if (likely(size <= THRESHOLD))
      {
        insertionsort_decending<T>(&t[begin], size);
      }
      else
      {
        const T pivotvalue = t[begin];
        ssize_t left  = begin - 1;
        ssize_t right = end   + 1;

        while (1)
        {
          while (t[--right] < pivotvalue);
          while (t[++left]  > pivotvalue);

          if (left >= right) break;

          const T temp = t[right];
          t[right] = t[left];
          t[left]  = temp;
        }

        const ssize_t pivot = right;
        quicksort_insertionsort_decending<T,THRESHOLD>(t, begin, pivot);
        quicksort_insertionsort_decending<T,THRESHOLD>(t, pivot + 1, end);
      }
    }
  }
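  /* Example usage (illustrative sketch): sorting a plain array in ascending order
     with the hybrid quicksort. The threshold of 16 elements for the insertion-sort
     fallback is an assumed choice, not a value prescribed by this header.

       int data[5] = { 4, 1, 5, 2, 3 };
       quicksort_insertionsort_ascending<int,16>(data, 0, 4); // begin and end are inclusive indices
       // data is now { 1, 2, 3, 4, 5 }
  */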
  /* single-threaded MSD radix sort for 32-bit keys, 8 bits per pass,
     recursing into each bucket and falling back to insertion sort for small buckets */
  template<typename T>
  static void radixsort32(T* const morton, const size_t num, const unsigned int shift = 3*8)
  {
    static const unsigned int BITS = 8;
    static const unsigned int BUCKETS = (1 << BITS);
    static const unsigned int CMP_SORT_THRESHOLD = 16;

    __aligned(64) unsigned int count[BUCKETS];

    /* clear buckets */
    for (size_t i=0; i<BUCKETS; i++) count[i] = 0;

    /* count buckets */
#if defined(__INTEL_COMPILER)
#pragma nounroll
#endif
    for (size_t i=0; i<num; i++)
      count[(unsigned(morton[i]) >> shift) & (BUCKETS-1)]++;

    /* prefix sums */
    __aligned(64) unsigned int head[BUCKETS];
    __aligned(64) unsigned int tail[BUCKETS];

    head[0] = 0;
    for (size_t i=1; i<BUCKETS; i++)
      head[i] = head[i-1] + count[i-1];

    for (size_t i=0; i<BUCKETS-1; i++)
      tail[i] = head[i+1];

    tail[BUCKETS-1] = head[BUCKETS-1] + count[BUCKETS-1];

    assert(tail[BUCKETS-1] == head[BUCKETS-1] + count[BUCKETS-1]);
    assert(tail[BUCKETS-1] == num);

    /* in-place swap */
    for (size_t i=0; i<BUCKETS; i++)
    {
      /* process bucket */
      while (head[i] < tail[i])
      {
        T v = morton[head[i]];
        while (1)
        {
          const size_t b = (unsigned(v) >> shift) & (BUCKETS-1);
          if (b == i) break;
          std::swap(v, morton[head[b]++]);
        }
        assert(((unsigned(v) >> shift) & (BUCKETS-1)) == i);
        morton[head[i]++] = v;
      }
    }

    if (shift == 0) return;

    /* recurse into each non-empty bucket for the next 8 bits */
    size_t offset = 0;
    for (size_t i=0; i<BUCKETS; i++)
      if (count[i])
      {
        for (size_t j=offset; j<offset+count[i]-1; j++)
          assert(((unsigned(morton[j]) >> shift) & (BUCKETS-1)) == i);

        if (unlikely(count[i] < CMP_SORT_THRESHOLD))
          insertionsort_ascending(morton + offset, count[i]);
        else
          radixsort32(morton + offset, count[i], shift-BITS);

        for (size_t j=offset; j<offset+count[i]-1; j++)
          assert(morton[j] <= morton[j+1]);

        offset += count[i];
      }
  }
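  /* Example usage (illustrative sketch): radixsort32 sorts 32-bit keys ascending in place,
     processing 8 bits per recursion level starting at the default shift of 24. The element
     type must be convertible to unsigned int (for bucketing) and comparable with operator<
     (for the insertion-sort fallback).

       std::vector<uint32_t> codes = { 7u, 3u, 0xFF000001u, 42u };
       radixsort32(codes.data(), codes.size());
       // codes is now { 3, 7, 42, 0xFF000001 }
  */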
  /* parallel LSD radix sort: sorts N items of type Ty by a key of type Key obtained via cast;
     small arrays fall back to a single-threaded std::sort */
  template<typename Ty, typename Key>
  class ParallelRadixSort
  {
    static const size_t MAX_TASKS = 64;
    static const size_t BITS = 8;
    static const size_t BUCKETS = (1 << BITS);
    typedef unsigned int TyRadixCount[BUCKETS];

    template<typename T>
    static bool compare(const T& v0, const T& v1) {
      return (Key)v0 < (Key)v1;
    }

  private:
    ParallelRadixSort (const ParallelRadixSort& other) DELETED; // do not implement
    ParallelRadixSort& operator= (const ParallelRadixSort& other) DELETED; // do not implement

  public:
    ParallelRadixSort (Ty* const src, Ty* const tmp, const size_t N)
      : radixCount(nullptr), src(src), tmp(tmp), N(N) {}

    void sort(const size_t blockSize)
    {
      assert(blockSize > 0);

      /* perform single threaded sort for small N */
      if (N<=blockSize) // handles also special case of 0!
      {
        /* do inplace sort inside destination array */
        std::sort(src,src+N,compare<Ty>);
      }

      /* perform parallel sort for large N */
      else
      {
        const size_t numThreads = min((N+blockSize-1)/blockSize,TaskScheduler::threadCount(),size_t(MAX_TASKS));
        tbbRadixSort(numThreads);
      }
    }
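    /* Worked example (illustrative): with N = 1,000,000 and the default blockSize of 8192,
       (N+blockSize-1)/blockSize = 123, so up to 123 tasks are requested, further capped by
       TaskScheduler::threadCount() and MAX_TASKS = 64. */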
    ~ParallelRadixSort()
    {
      alignedFree(radixCount);
      radixCount = nullptr;
    }
  private:

    /* first pass of one radix iteration: each thread counts how many of its items
       fall into each of the 256 buckets for the current digit */
    void tbbRadixIteration0(const Key shift,
                            const Ty* __restrict const src,
                            Ty* __restrict const dst,
                            const size_t threadIndex, const size_t threadCount)
    {
      const size_t startID = (threadIndex+0)*N/threadCount;
      const size_t endID   = (threadIndex+1)*N/threadCount;

      /* mask to extract some number of bits */
      const Key mask = BUCKETS-1;

      /* count how many items go into the buckets */
      for (size_t i=0; i<BUCKETS; i++)
        radixCount[threadIndex][i] = 0;

      /* iterate over src array and count buckets */
      unsigned int * __restrict const count = radixCount[threadIndex];
#if defined(__INTEL_COMPILER)
#pragma nounroll
#endif
      for (size_t i=startID; i<endID; i++) {
#if defined(__X86_64__) || defined(__aarch64__)
        const size_t index = ((size_t)(Key)src[i] >> (size_t)shift) & (size_t)mask;
#else
        const Key index = ((Key)src[i] >> shift) & mask;
#endif
        count[index]++;
      }
    }
    /* second pass of one radix iteration: compute this thread's bucket offsets via
       prefix sums over all threads' counts and scatter its items into dst */
    void tbbRadixIteration1(const Key shift,
                            const Ty* __restrict const src,
                            Ty* __restrict const dst,
                            const size_t threadIndex, const size_t threadCount)
    {
      const size_t startID = (threadIndex+0)*N/threadCount;
      const size_t endID   = (threadIndex+1)*N/threadCount;

      /* mask to extract some number of bits */
      const Key mask = BUCKETS-1;

      /* calculate total number of items for each bucket */
      __aligned(64) unsigned int total[BUCKETS];
      /*
      for (size_t i=0; i<BUCKETS; i++)
        total[i] = 0;
      */
      for (size_t i=0; i<BUCKETS; i+=VSIZEX)
        vintx::store(&total[i], zero);

      for (size_t i=0; i<threadCount; i++)
      {
        /*
        for (size_t j=0; j<BUCKETS; j++)
          total[j] += radixCount[i][j];
        */
        for (size_t j=0; j<BUCKETS; j+=VSIZEX)
          vintx::store(&total[j], vintx::load(&total[j]) + vintx::load(&radixCount[i][j]));
      }

      /* calculate start offset of each bucket */
      __aligned(64) unsigned int offset[BUCKETS];
      offset[0] = 0;
      for (size_t i=1; i<BUCKETS; i++)
        offset[i] = offset[i-1] + total[i-1];

      /* calculate start offset of each bucket for this thread */
      for (size_t i=0; i<threadIndex; i++)
      {
        /*
        for (size_t j=0; j<BUCKETS; j++)
          offset[j] += radixCount[i][j];
        */
        for (size_t j=0; j<BUCKETS; j+=VSIZEX)
          vintx::store(&offset[j], vintx::load(&offset[j]) + vintx::load(&radixCount[i][j]));
      }

      /* copy items into their buckets */
#if defined(__INTEL_COMPILER)
#pragma nounroll
#endif
      for (size_t i=startID; i<endID; i++) {
        const Ty elt = src[i];
#if defined(__X86_64__) || defined(__aarch64__)
        const size_t index = ((size_t)(Key)src[i] >> (size_t)shift) & (size_t)mask;
#else
        const size_t index = ((Key)src[i] >> shift) & mask;
#endif
        dst[offset[index]++] = elt;
      }
    }
    /* one full radix iteration: counting pass followed by scatter pass,
       both distributed over numTasks tasks with the same affinity partitioner */
    void tbbRadixIteration(const Key shift, const bool last,
                           const Ty* __restrict src, Ty* __restrict dst,
                           const size_t numTasks)
    {
      affinity_partitioner ap;
      parallel_for_affinity(numTasks,[&] (size_t taskIndex) { tbbRadixIteration0(shift,src,dst,taskIndex,numTasks); },ap);
      parallel_for_affinity(numTasks,[&] (size_t taskIndex) { tbbRadixIteration1(shift,src,dst,taskIndex,numTasks); },ap);
    }
    /* full radix sort: 4 iterations for 32-bit keys, 8 iterations for 64-bit keys,
       ping-ponging between src and tmp so the final result ends up in src */
    void tbbRadixSort(const size_t numTasks)
    {
      radixCount = (TyRadixCount*) alignedMalloc(MAX_TASKS*sizeof(TyRadixCount),64);

      if (sizeof(Key) == sizeof(uint32_t)) {
        tbbRadixIteration(0*BITS,0,src,tmp,numTasks);
        tbbRadixIteration(1*BITS,0,tmp,src,numTasks);
        tbbRadixIteration(2*BITS,0,src,tmp,numTasks);
        tbbRadixIteration(3*BITS,1,tmp,src,numTasks);
      }
      else if (sizeof(Key) == sizeof(uint64_t))
      {
        tbbRadixIteration(0*BITS,0,src,tmp,numTasks);
        tbbRadixIteration(1*BITS,0,tmp,src,numTasks);
        tbbRadixIteration(2*BITS,0,src,tmp,numTasks);
        tbbRadixIteration(3*BITS,0,tmp,src,numTasks);
        tbbRadixIteration(4*BITS,0,src,tmp,numTasks);
        tbbRadixIteration(5*BITS,0,tmp,src,numTasks);
        tbbRadixIteration(6*BITS,0,src,tmp,numTasks);
        tbbRadixIteration(7*BITS,1,tmp,src,numTasks);
      }
    }

  private:
    TyRadixCount* radixCount;
    Ty* const src;
    Ty* const tmp;
    const size_t N;
  };
  /* convenience wrappers: sort N items of src in ascending key order, using tmp as
     scratch space of the same size; the sorted result is stored back in src. The first
     overload uses the element value itself as the key. */
  template<typename Ty>
  void radix_sort(Ty* const src, Ty* const tmp, const size_t N, const size_t blockSize = 8192)
  {
    ParallelRadixSort<Ty,Ty>(src,tmp,N).sort(blockSize);
  }

  template<typename Ty, typename Key>
  void radix_sort(Ty* const src, Ty* const tmp, const size_t N, const size_t blockSize = 8192)
  {
    ParallelRadixSort<Ty,Key>(src,tmp,N).sort(blockSize);
  }

  template<typename Ty>
  void radix_sort_u32(Ty* const src, Ty* const tmp, const size_t N, const size_t blockSize = 8192) {
    radix_sort<Ty,uint32_t>(src,tmp,N,blockSize);
  }

  template<typename Ty>
  void radix_sort_u64(Ty* const src, Ty* const tmp, const size_t N, const size_t blockSize = 8192) {
    radix_sort<Ty,uint64_t>(src,tmp,N,blockSize);
  }
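  /* Example usage (illustrative sketch): sorting items by a 32-bit key with radix_sort_u32.
     The item type must be castable to the key type, e.g. via a conversion operator;
     'PrimRef32' below is a hypothetical stand-in for such a type.

       struct PrimRef32 {
         uint32_t code; uint32_t primID;
         operator uint32_t() const { return code; } // key used for bucketing and comparison
       };

       std::vector<PrimRef32> prims(N), scratch(N); // scratch must hold N items as well
       radix_sort_u32(prims.data(), scratch.data(), N);
       // prims is now sorted by 'code' in ascending order; the final result lives in prims
  */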
}