heuristic_binning.h 39 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951
  1. // ======================================================================== //
  2. // Copyright 2009-2017 Intel Corporation //
  3. // //
  4. // Licensed under the Apache License, Version 2.0 (the "License"); //
  5. // you may not use this file except in compliance with the License. //
  6. // You may obtain a copy of the License at //
  7. // //
  8. // http://www.apache.org/licenses/LICENSE-2.0 //
  9. // //
  10. // Unless required by applicable law or agreed to in writing, software //
  11. // distributed under the License is distributed on an "AS IS" BASIS, //
  12. // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. //
  13. // See the License for the specific language governing permissions and //
  14. // limitations under the License. //
  15. // ======================================================================== //
  16. #pragma once
  17. #include "priminfo.h"
  18. #include "../geometry/bezier1v.h"
  19. #include "../../common/algorithms/parallel_reduce.h"
  20. #include "../../common/algorithms/parallel_partition.h"
  21. namespace embree
  22. {
  23. namespace isa
  24. {
  25. /*! mapping into bins */
  26. template<size_t BINS>
  27. struct BinMapping
  28. {
  29. public:
  30. __forceinline BinMapping() {}
  31. /*! calculates the mapping */
  32. __forceinline BinMapping(const BBox3fa& centBounds, size_t N)
  33. {
  34. num = min(BINS,size_t(4.0f + 0.05f*N));
  35. const vfloat4 diag = (vfloat4) centBounds.size();
  36. scale = select(diag > vfloat4(1E-34f),vfloat4(0.99f*num)/diag,vfloat4(0.0f));
  37. ofs = (vfloat4) centBounds.lower;
  38. }
  39. /*! calculates the mapping */
  40. template<typename PrimInfo>
  41. __forceinline BinMapping(const PrimInfo& pinfo)
  42. {
  43. num = min(BINS,size_t(4.0f + 0.05f*pinfo.size()));
  44. const vfloat4 diag = (vfloat4) pinfo.centBounds.size();
  45. scale = select(diag > vfloat4(1E-34f),vfloat4(0.99f*num)/diag,vfloat4(0.0f));
  46. ofs = (vfloat4) pinfo.centBounds.lower;
  47. }
  48. /*! returns number of bins */
  49. __forceinline size_t size() const { return num; }
  50. /*! slower but safe binning */
  51. __forceinline Vec3ia bin(const Vec3fa& p) const
  52. {
  53. const vint4 i = floori((vfloat4(p)-ofs)*scale);
  54. #if 1
  55. assert(i[0] >= 0 && (size_t)i[0] < num);
  56. assert(i[1] >= 0 && (size_t)i[1] < num);
  57. assert(i[2] >= 0 && (size_t)i[2] < num);
  58. return Vec3ia(i);
  59. #else
  60. return Vec3ia(clamp(i,vint4(0),vint4(num-1)));
  61. #endif
  62. }
  63. /*! faster but unsafe binning */
  64. __forceinline Vec3ia bin_unsafe(const Vec3fa& p) const {
  65. return Vec3ia(floori((vfloat4(p)-ofs)*scale));
  66. }
  67. /*! faster but unsafe binning */
  68. template<typename PrimRef>
  69. __forceinline Vec3ia bin_unsafe(const PrimRef& p) const {
  70. return bin_unsafe(p.binCenter());
  71. }
  72. /*! faster but unsafe binning */
  73. template<typename PrimRef>
  74. __forceinline Vec3ia bin_unsafe(const PrimRef& p, const LinearSpace3fa& space, void* user = nullptr) const {
  75. return bin_unsafe(p.binCenter(space,user));
  76. }
  77. template<typename PrimRef>
  78. __forceinline bool bin_unsafe(const PrimRef& ref,
  79. const vint4& vSplitPos,
  80. const vbool4& splitDimMask) const
  81. {
  82. return any(((vint4)bin_unsafe(center2(ref.bounds())) < vSplitPos) & splitDimMask);
  83. }
  84. /*! returns true if the mapping is invalid in some dimension */
  85. __forceinline bool invalid(const size_t dim) const {
  86. return scale[dim] == 0.0f;
  87. }
  88. /*! stream output */
  89. friend std::ostream& operator<<(std::ostream& cout, const BinMapping& mapping) {
  90. return cout << "BinMapping { num = " << mapping.num << ", ofs = " << mapping.ofs << ", scale = " << mapping.scale << "}";
  91. }
  92. public:
  93. size_t num;
  94. vfloat4 ofs,scale; //!< linear function that maps to bin ID
  95. };
  96. /*! stores all information to perform some split */
  97. template<size_t BINS>
  98. struct BinSplit
  99. {
  100. /*! construct an invalid split by default */
  101. __forceinline BinSplit()
  102. : sah(inf), dim(-1), pos(0), data(0) {}
  103. __forceinline BinSplit(float sah, unsigned data, int dim = 0, float fpos = 0)
  104. : sah(sah), dim(dim), fpos(fpos), data(data) {}
  105. /*! constructs specified split */
  106. __forceinline BinSplit(float sah, int dim, int pos, const BinMapping<BINS>& mapping)
  107. : sah(sah), dim(dim), pos(pos), data(0), mapping(mapping) {}
  108. /*! tests if this split is valid */
  109. __forceinline bool valid() const { return dim != -1; }
  110. /*! calculates surface area heuristic for performing the split */
  111. __forceinline float splitSAH() const { return sah; }
  112. /*! stream output */
  113. friend std::ostream& operator<<(std::ostream& cout, const BinSplit& split) {
  114. return cout << "BinSplit { sah = " << split.sah << ", dim = " << split.dim << ", pos = " << split.pos << "}";
  115. }
  116. public:
  117. float sah; //!< SAH cost of the split
  118. int dim; //!< split dimension
  119. union { int pos; float fpos; }; //!< bin index for splitting
  120. unsigned int data; //!< extra optional split data
  121. BinMapping<BINS> mapping; //!< mapping into bins
  122. };
  123. /*! stores extended information about the split */
  124. template<typename BBox>
  125. struct SplitInfoT
  126. {
  127. __forceinline SplitInfoT () {}
  128. __forceinline SplitInfoT (size_t leftCount, const BBox& leftBounds, size_t rightCount, const BBox& rightBounds)
  129. : leftCount(leftCount), rightCount(rightCount), leftBounds(leftBounds), rightBounds(rightBounds) {}
  130. public:
  131. size_t leftCount,rightCount;
  132. BBox leftBounds,rightBounds;
  133. };
  134. typedef SplitInfoT<BBox3fa> SplitInfo;
  135. typedef SplitInfoT<LBBox3fa> SplitInfo2;
  136. /*! stores all binning information */
  137. template<size_t BINS, typename PrimRef, typename BBox>
  138. struct __aligned(64) BinInfoT
  139. {
  140. typedef BinSplit<BINS> Split;
  141. __forceinline BinInfoT() {
  142. }
  143. __forceinline BinInfoT(EmptyTy) {
  144. clear();
  145. }
  146. /*! bin access function */
  147. __forceinline BBox &bounds(const size_t binID, const size_t dimID) { return _bounds[binID][dimID]; }
  148. __forceinline const BBox &bounds(const size_t binID, const size_t dimID) const { return _bounds[binID][dimID]; }
  149. __forceinline int &counts(const size_t binID, const size_t dimID) { return _counts[binID][dimID]; }
  150. __forceinline const int &counts(const size_t binID, const size_t dimID) const { return _counts[binID][dimID]; }
  151. __forceinline vint4 &counts(const size_t binID) { return _counts[binID]; }
  152. __forceinline const vint4 &counts(const size_t binID) const { return _counts[binID]; }
  153. /*! clears the bin info */
  154. __forceinline void clear()
  155. {
  156. for (size_t i=0; i<BINS; i++) {
  157. bounds(i,0) = bounds(i,1) = bounds(i,2) = empty;
  158. counts(i) = vint4(zero);
  159. }
  160. }
  161. /*! bins an array of primitives */
  162. __forceinline void bin (const PrimRef* prims, size_t N, const BinMapping<BINS>& mapping)
  163. {
  164. if (unlikely(N == 0)) return;
  165. size_t i;
  166. for (i=0; i<N-1; i+=2)
  167. {
  168. /*! map even and odd primitive to bin */
  169. BBox prim0; Vec3fa center0;
  170. prims[i+0].binBoundsAndCenter(prim0,center0);
  171. const vint4 bin0 = (vint4)mapping.bin(center0);
  172. BBox prim1; Vec3fa center1;
  173. prims[i+1].binBoundsAndCenter(prim1,center1);
  174. const vint4 bin1 = (vint4)mapping.bin(center1);
  175. /*! increase bounds for bins for even primitive */
  176. const unsigned int b00 = extract<0>(bin0); bounds(b00,0).extend(prim0);
  177. const unsigned int b01 = extract<1>(bin0); bounds(b01,1).extend(prim0);
  178. const unsigned int b02 = extract<2>(bin0); bounds(b02,2).extend(prim0);
  179. const unsigned int s0 = prims[i+0].size();
  180. counts(b00,0)+=s0;
  181. counts(b01,1)+=s0;
  182. counts(b02,2)+=s0;
  183. /*! increase bounds of bins for odd primitive */
  184. const unsigned int b10 = extract<0>(bin1); bounds(b10,0).extend(prim1);
  185. const unsigned int b11 = extract<1>(bin1); bounds(b11,1).extend(prim1);
  186. const unsigned int b12 = extract<2>(bin1); bounds(b12,2).extend(prim1);
  187. const unsigned int s1 = prims[i+1].size();
  188. counts(b10,0)+=s1;
  189. counts(b11,1)+=s1;
  190. counts(b12,2)+=s1;
  191. }
  192. /*! for uneven number of primitives */
  193. if (i < N)
  194. {
  195. /*! map primitive to bin */
  196. BBox prim0; Vec3fa center0;
  197. prims[i].binBoundsAndCenter(prim0,center0);
  198. const vint4 bin0 = (vint4)mapping.bin(center0);
  199. /*! increase bounds of bins */
  200. const unsigned int s0 = prims[i].size();
  201. const int b00 = extract<0>(bin0); counts(b00,0)+=s0; bounds(b00,0).extend(prim0);
  202. const int b01 = extract<1>(bin0); counts(b01,1)+=s0; bounds(b01,1).extend(prim0);
  203. const int b02 = extract<2>(bin0); counts(b02,2)+=s0; bounds(b02,2).extend(prim0);
  204. }
  205. }
  206. /*! bins an array of primitives */
  207. __forceinline void bin (const PrimRef* prims, size_t N, const BinMapping<BINS>& mapping, const AffineSpace3fa& space, void* user = nullptr)
  208. {
  209. if (N == 0) return;
  210. size_t i;
  211. for (i=0; i<N-1; i+=2)
  212. {
  213. /*! map even and odd primitive to bin */
  214. BBox prim0; Vec3fa center0; prims[i+0].binBoundsAndCenter(prim0,center0,space,user);
  215. const vint4 bin0 = (vint4)mapping.bin(center0);
  216. BBox prim1; Vec3fa center1; prims[i+1].binBoundsAndCenter(prim1,center1,space,user);
  217. const vint4 bin1 = (vint4)mapping.bin(center1);
  218. /*! increase bounds for bins for even primitive */
  219. const unsigned int s0 = prims[i+0].size();
  220. const int b00 = extract<0>(bin0); counts(b00,0)+=s0; bounds(b00,0).extend(prim0);
  221. const int b01 = extract<1>(bin0); counts(b01,1)+=s0; bounds(b01,1).extend(prim0);
  222. const int b02 = extract<2>(bin0); counts(b02,2)+=s0; bounds(b02,2).extend(prim0);
  223. /*! increase bounds of bins for odd primitive */
  224. const unsigned int s1 = prims[i+1].size();
  225. const int b10 = extract<0>(bin1); counts(b10,0)+=s1; bounds(b10,0).extend(prim1);
  226. const int b11 = extract<1>(bin1); counts(b11,1)+=s1; bounds(b11,1).extend(prim1);
  227. const int b12 = extract<2>(bin1); counts(b12,2)+=s1; bounds(b12,2).extend(prim1);
  228. }
  229. /*! for uneven number of primitives */
  230. if (i < N)
  231. {
  232. /*! map primitive to bin */
  233. BBox prim0; Vec3fa center0; prims[i+0].binBoundsAndCenter(prim0,center0,space,user);
  234. const vint4 bin0 = (vint4)mapping.bin(center0);
  235. /*! increase bounds of bins */
  236. const unsigned int s0 = prims[i+0].size();
  237. const int b00 = extract<0>(bin0); counts(b00,0)+=s0; bounds(b00,0).extend(prim0);
  238. const int b01 = extract<1>(bin0); counts(b01,1)+=s0; bounds(b01,1).extend(prim0);
  239. const int b02 = extract<2>(bin0); counts(b02,2)+=s0; bounds(b02,2).extend(prim0);
  240. }
  241. }
  242. __forceinline void bin(const PrimRef* prims, size_t begin, size_t end, const BinMapping<BINS>& mapping) {
  243. bin(prims+begin,end-begin,mapping);
  244. }
  245. __forceinline void bin(const PrimRef* prims, size_t begin, size_t end, const BinMapping<BINS>& mapping, const AffineSpace3fa& space, void* user = nullptr) {
  246. bin(prims+begin,end-begin,mapping,space,user);
  247. }
  248. /*! merges in other binning information */
  249. __forceinline void merge (const BinInfoT& other, size_t numBins)
  250. {
  251. for (size_t i=0; i<numBins; i++)
  252. {
  253. counts(i) += other.counts(i);
  254. bounds(i,0).extend(other.bounds(i,0));
  255. bounds(i,1).extend(other.bounds(i,1));
  256. bounds(i,2).extend(other.bounds(i,2));
  257. }
  258. }
  259. /*! reduces binning information */
  260. static __forceinline const BinInfoT reduce (const BinInfoT& a, const BinInfoT& b, const size_t numBins = BINS)
  261. {
  262. BinInfoT c;
  263. for (size_t i=0; i<numBins; i++)
  264. {
  265. c.counts(i) = a.counts(i)+b.counts(i);
  266. c.bounds(i,0) = embree::merge(a.bounds(i,0),b.bounds(i,0));
  267. c.bounds(i,1) = embree::merge(a.bounds(i,1),b.bounds(i,1));
  268. c.bounds(i,2) = embree::merge(a.bounds(i,2),b.bounds(i,2));
  269. }
  270. return c;
  271. }
  272. /*! finds the best split by scanning binning information */
  273. __forceinline Split best(const BinMapping<BINS>& mapping, const size_t blocks_shift) const
  274. {
  275. /* sweep from right to left and compute parallel prefix of merged bounds */
  276. vfloat4 rAreas[BINS];
  277. vint4 rCounts[BINS];
  278. vint4 count = 0; BBox bx = empty; BBox by = empty; BBox bz = empty;
  279. for (size_t i=mapping.size()-1; i>0; i--)
  280. {
  281. count += counts(i);
  282. rCounts[i] = count;
  283. bx.extend(bounds(i,0)); rAreas[i][0] = expectedApproxHalfArea(bx);
  284. by.extend(bounds(i,1)); rAreas[i][1] = expectedApproxHalfArea(by);
  285. bz.extend(bounds(i,2)); rAreas[i][2] = expectedApproxHalfArea(bz);
  286. rAreas[i][3] = 0.0f;
  287. }
  288. /* sweep from left to right and compute SAH */
  289. vint4 blocks_add = (1 << blocks_shift)-1;
  290. vint4 ii = 1; vfloat4 vbestSAH = pos_inf; vint4 vbestPos = 0;
  291. count = 0; bx = empty; by = empty; bz = empty;
  292. for (size_t i=1; i<mapping.size(); i++, ii+=1)
  293. {
  294. count += counts(i-1);
  295. bx.extend(bounds(i-1,0)); float Ax = expectedApproxHalfArea(bx);
  296. by.extend(bounds(i-1,1)); float Ay = expectedApproxHalfArea(by);
  297. bz.extend(bounds(i-1,2)); float Az = expectedApproxHalfArea(bz);
  298. const vfloat4 lArea = vfloat4(Ax,Ay,Az,Az);
  299. const vfloat4 rArea = rAreas[i];
  300. const vint4 lCount = (count +blocks_add) >> int(blocks_shift);
  301. const vint4 rCount = (rCounts[i]+blocks_add) >> int(blocks_shift);
  302. const vfloat4 sah = madd(lArea,vfloat4(lCount),rArea*vfloat4(rCount));
  303. vbestPos = select(sah < vbestSAH,ii ,vbestPos);
  304. vbestSAH = select(sah < vbestSAH,sah,vbestSAH);
  305. }
  306. /* find best dimension */
  307. float bestSAH = inf;
  308. int bestDim = -1;
  309. int bestPos = 0;
  310. for (int dim=0; dim<3; dim++)
  311. {
  312. /* ignore zero sized dimensions */
  313. if (unlikely(mapping.invalid(dim)))
  314. continue;
  315. /* test if this is a better dimension */
  316. if (vbestSAH[dim] < bestSAH && vbestPos[dim] != 0) {
  317. bestDim = dim;
  318. bestPos = vbestPos[dim];
  319. bestSAH = vbestSAH[dim];
  320. }
  321. }
  322. return Split(bestSAH,bestDim,bestPos,mapping);
  323. }
  324. /*! calculates extended split information */
  325. __forceinline void getSplitInfo(const BinMapping<BINS>& mapping, const Split& split, SplitInfoT<BBox>& info) const
  326. {
  327. if (split.dim == -1) {
  328. new (&info) SplitInfoT<BBox>(0,empty,0,empty);
  329. return;
  330. }
  331. size_t leftCount = 0;
  332. BBox leftBounds = empty;
  333. for (size_t i=0; i<(size_t)split.pos; i++) {
  334. leftCount += counts(i,split.dim);
  335. leftBounds.extend(bounds(i,split.dim));
  336. }
  337. size_t rightCount = 0;
  338. BBox rightBounds = empty;
  339. for (size_t i=split.pos; i<mapping.size(); i++) {
  340. rightCount += counts(i,split.dim);
  341. rightBounds.extend(bounds(i,split.dim));
  342. }
  343. new (&info) SplitInfoT<BBox>(leftCount,leftBounds,rightCount,rightBounds);
  344. }
  345. /*! gets the number of primitives left of the split */
  346. __forceinline size_t getLeftCount(const BinMapping<BINS>& mapping, const Split& split) const
  347. {
  348. if (unlikely(split.dim == -1)) return -1;
  349. size_t leftCount = 0;
  350. for (size_t i = 0; i < (size_t)split.pos; i++) {
  351. leftCount += counts(i, split.dim);
  352. }
  353. return leftCount;
  354. }
  355. /*! gets the number of primitives right of the split */
  356. __forceinline size_t getRightCount(const BinMapping<BINS>& mapping, const Split& split) const
  357. {
  358. if (unlikely(split.dim == -1)) return -1;
  359. size_t rightCount = 0;
  360. for (size_t i = (size_t)split.pos; i<mapping.size(); i++) {
  361. rightCount += counts(i, split.dim);
  362. }
  363. return rightCount;
  364. }
  365. private:
  366. BBox _bounds[BINS][3]; //!< geometry bounds for each bin in each dimension
  367. vint4 _counts[BINS]; //!< counts number of primitives that map into the bins
  368. };
  369. #if defined(__AVX512F__)
  370. /*! mapping into bins */
  371. template<>
  372. struct BinMapping<16>
  373. {
  374. public:
  375. __forceinline BinMapping() {}
  376. /*! calculates the mapping */
  377. template<typename PrimInfo>
  378. __forceinline BinMapping(const PrimInfo& pinfo)
  379. {
  380. num = 16;
  381. const vfloat4 diag = (vfloat4) pinfo.centBounds.size();
  382. scale = select(diag > vfloat4(1E-34f),vfloat4(0.99f*num)/diag,vfloat4(0.0f));
  383. ofs = (vfloat4) pinfo.centBounds.lower;
  384. scale16 = scale;
  385. ofs16 = ofs;
  386. }
  387. /*! returns number of bins */
  388. __forceinline size_t size() const { return num; }
  389. __forceinline vint16 bin16(const Vec3fa& p) const {
  390. return vint16(vint4(floori((vfloat4(p)-ofs)*scale)));
  391. }
  392. __forceinline vint16 bin16(const vfloat16& p) const {
  393. return floori((p-ofs16)*scale16);
  394. }
  395. __forceinline int bin_unsafe(const PrimRef& ref,
  396. const vint16& vSplitPos,
  397. const vbool16& splitDimMask) const
  398. {
  399. const vfloat16 lower(*(vfloat4*)&ref.lower);
  400. const vfloat16 upper(*(vfloat4*)&ref.upper);
  401. const vfloat16 p = lower + upper;
  402. const vint16 i = floori((p-ofs16)*scale16);
  403. return lt(splitDimMask,i,vSplitPos);
  404. }
  405. /*! returns true if the mapping is invalid in some dimension */
  406. __forceinline bool invalid(const size_t dim) const {
  407. return scale[dim] == 0.0f;
  408. }
  409. public:
  410. size_t num;
  411. vfloat4 ofs,scale; //!< linear function that maps to bin ID
  412. vfloat16 ofs16,scale16; //!< linear function that maps to bin ID
  413. };
  414. /* 16 bins in-register binner */
  415. template<typename PrimRef>
  416. struct __aligned(64) BinInfoT<16,PrimRef,BBox3fa>
  417. {
  418. typedef BinSplit<16> Split;
  419. __forceinline BinInfoT() {
  420. }
  421. __forceinline BinInfoT(EmptyTy) {
  422. clear();
  423. }
  424. /*! clears the bin info */
  425. __forceinline void clear()
  426. {
  427. lower[0] = lower[1] = lower[2] = pos_inf;
  428. upper[0] = upper[1] = upper[2] = neg_inf;
  429. count[0] = count[1] = count[2] = 0;
  430. }
  431. static __forceinline vfloat16 prefix_area_rl(const vfloat16 min_x,
  432. const vfloat16 min_y,
  433. const vfloat16 min_z,
  434. const vfloat16 max_x,
  435. const vfloat16 max_y,
  436. const vfloat16 max_z)
  437. {
  438. const vfloat16 r_min_x = reverse_prefix_min(min_x);
  439. const vfloat16 r_min_y = reverse_prefix_min(min_y);
  440. const vfloat16 r_min_z = reverse_prefix_min(min_z);
  441. const vfloat16 r_max_x = reverse_prefix_max(max_x);
  442. const vfloat16 r_max_y = reverse_prefix_max(max_y);
  443. const vfloat16 r_max_z = reverse_prefix_max(max_z);
  444. const vfloat16 dx = r_max_x - r_min_x;
  445. const vfloat16 dy = r_max_y - r_min_y;
  446. const vfloat16 dz = r_max_z - r_min_z;
  447. const vfloat16 area_rl = madd(dx,dy,madd(dx,dz,dy*dz));
  448. return area_rl;
  449. }
  450. static __forceinline vfloat16 prefix_area_lr(const vfloat16 min_x,
  451. const vfloat16 min_y,
  452. const vfloat16 min_z,
  453. const vfloat16 max_x,
  454. const vfloat16 max_y,
  455. const vfloat16 max_z)
  456. {
  457. const vfloat16 r_min_x = prefix_min(min_x);
  458. const vfloat16 r_min_y = prefix_min(min_y);
  459. const vfloat16 r_min_z = prefix_min(min_z);
  460. const vfloat16 r_max_x = prefix_max(max_x);
  461. const vfloat16 r_max_y = prefix_max(max_y);
  462. const vfloat16 r_max_z = prefix_max(max_z);
  463. const vfloat16 dx = r_max_x - r_min_x;
  464. const vfloat16 dy = r_max_y - r_min_y;
  465. const vfloat16 dz = r_max_z - r_min_z;
  466. const vfloat16 area_lr = madd(dx,dy,madd(dx,dz,dy*dz));
  467. return area_lr;
  468. }
  469. /*! bins an array of primitives */
  470. __forceinline void bin (const PrimRef* prims, size_t N, const BinMapping<16>& mapping)
  471. {
  472. const vfloat16 init_min(pos_inf);
  473. const vfloat16 init_max(neg_inf);
  474. vfloat16 min_x0,min_x1,min_x2;
  475. vfloat16 min_y0,min_y1,min_y2;
  476. vfloat16 min_z0,min_z1,min_z2;
  477. vfloat16 max_x0,max_x1,max_x2;
  478. vfloat16 max_y0,max_y1,max_y2;
  479. vfloat16 max_z0,max_z1,max_z2;
  480. vint16 count0,count1,count2;
  481. min_x0 = init_min;
  482. min_x1 = init_min;
  483. min_x2 = init_min;
  484. min_y0 = init_min;
  485. min_y1 = init_min;
  486. min_y2 = init_min;
  487. min_z0 = init_min;
  488. min_z1 = init_min;
  489. min_z2 = init_min;
  490. max_x0 = init_max;
  491. max_x1 = init_max;
  492. max_x2 = init_max;
  493. max_y0 = init_max;
  494. max_y1 = init_max;
  495. max_y2 = init_max;
  496. max_z0 = init_max;
  497. max_z1 = init_max;
  498. max_z2 = init_max;
  499. count0 = vint16::zero();
  500. count1 = vint16::zero();
  501. count2 = vint16::zero();
  502. const vint16 step16(step);
  503. size_t i;
  504. for (i=0; i<N-1; i+=2)
  505. {
  506. /*! map even and odd primitive to bin */
  507. const BBox3fa primA = prims[i+0].bounds();
  508. const vfloat16 centerA = vfloat16((vfloat4)primA.lower) + vfloat16((vfloat4)primA.upper);
  509. const vint16 binA = mapping.bin16(centerA);
  510. const BBox3fa primB = prims[i+1].bounds();
  511. const vfloat16 centerB = vfloat16((vfloat4)primB.lower) + vfloat16((vfloat4)primB.upper);
  512. const vint16 binB = mapping.bin16(centerB);
  513. /* A */
  514. {
  515. const vfloat16 b_min_x = prims[i+0].lower.x;
  516. const vfloat16 b_min_y = prims[i+0].lower.y;
  517. const vfloat16 b_min_z = prims[i+0].lower.z;
  518. const vfloat16 b_max_x = prims[i+0].upper.x;
  519. const vfloat16 b_max_y = prims[i+0].upper.y;
  520. const vfloat16 b_max_z = prims[i+0].upper.z;
  521. const vint16 bin0 = shuffle<0>(binA);
  522. const vint16 bin1 = shuffle<1>(binA);
  523. const vint16 bin2 = shuffle<2>(binA);
  524. const vbool16 m_update_x = step16 == bin0;
  525. const vbool16 m_update_y = step16 == bin1;
  526. const vbool16 m_update_z = step16 == bin2;
  527. assert(__popcnt((size_t)m_update_x) == 1);
  528. assert(__popcnt((size_t)m_update_y) == 1);
  529. assert(__popcnt((size_t)m_update_z) == 1);
  530. min_x0 = mask_min(m_update_x,min_x0,min_x0,b_min_x);
  531. min_y0 = mask_min(m_update_x,min_y0,min_y0,b_min_y);
  532. min_z0 = mask_min(m_update_x,min_z0,min_z0,b_min_z);
  533. // ------------------------------------------------------------------------
  534. max_x0 = mask_max(m_update_x,max_x0,max_x0,b_max_x);
  535. max_y0 = mask_max(m_update_x,max_y0,max_y0,b_max_y);
  536. max_z0 = mask_max(m_update_x,max_z0,max_z0,b_max_z);
  537. // ------------------------------------------------------------------------
  538. min_x1 = mask_min(m_update_y,min_x1,min_x1,b_min_x);
  539. min_y1 = mask_min(m_update_y,min_y1,min_y1,b_min_y);
  540. min_z1 = mask_min(m_update_y,min_z1,min_z1,b_min_z);
  541. // ------------------------------------------------------------------------
  542. max_x1 = mask_max(m_update_y,max_x1,max_x1,b_max_x);
  543. max_y1 = mask_max(m_update_y,max_y1,max_y1,b_max_y);
  544. max_z1 = mask_max(m_update_y,max_z1,max_z1,b_max_z);
  545. // ------------------------------------------------------------------------
  546. min_x2 = mask_min(m_update_z,min_x2,min_x2,b_min_x);
  547. min_y2 = mask_min(m_update_z,min_y2,min_y2,b_min_y);
  548. min_z2 = mask_min(m_update_z,min_z2,min_z2,b_min_z);
  549. // ------------------------------------------------------------------------
  550. max_x2 = mask_max(m_update_z,max_x2,max_x2,b_max_x);
  551. max_y2 = mask_max(m_update_z,max_y2,max_y2,b_max_y);
  552. max_z2 = mask_max(m_update_z,max_z2,max_z2,b_max_z);
  553. // ------------------------------------------------------------------------
  554. count0 = mask_add(m_update_x,count0,count0,vint16(1));
  555. count1 = mask_add(m_update_y,count1,count1,vint16(1));
  556. count2 = mask_add(m_update_z,count2,count2,vint16(1));
  557. }
  558. /* B */
  559. {
  560. const vfloat16 b_min_x = prims[i+1].lower.x;
  561. const vfloat16 b_min_y = prims[i+1].lower.y;
  562. const vfloat16 b_min_z = prims[i+1].lower.z;
  563. const vfloat16 b_max_x = prims[i+1].upper.x;
  564. const vfloat16 b_max_y = prims[i+1].upper.y;
  565. const vfloat16 b_max_z = prims[i+1].upper.z;
  566. const vint16 bin0 = shuffle<0>(binB);
  567. const vint16 bin1 = shuffle<1>(binB);
  568. const vint16 bin2 = shuffle<2>(binB);
  569. const vbool16 m_update_x = step16 == bin0;
  570. const vbool16 m_update_y = step16 == bin1;
  571. const vbool16 m_update_z = step16 == bin2;
  572. assert(__popcnt((size_t)m_update_x) == 1);
  573. assert(__popcnt((size_t)m_update_y) == 1);
  574. assert(__popcnt((size_t)m_update_z) == 1);
  575. min_x0 = mask_min(m_update_x,min_x0,min_x0,b_min_x);
  576. min_y0 = mask_min(m_update_x,min_y0,min_y0,b_min_y);
  577. min_z0 = mask_min(m_update_x,min_z0,min_z0,b_min_z);
  578. // ------------------------------------------------------------------------
  579. max_x0 = mask_max(m_update_x,max_x0,max_x0,b_max_x);
  580. max_y0 = mask_max(m_update_x,max_y0,max_y0,b_max_y);
  581. max_z0 = mask_max(m_update_x,max_z0,max_z0,b_max_z);
  582. // ------------------------------------------------------------------------
  583. min_x1 = mask_min(m_update_y,min_x1,min_x1,b_min_x);
  584. min_y1 = mask_min(m_update_y,min_y1,min_y1,b_min_y);
  585. min_z1 = mask_min(m_update_y,min_z1,min_z1,b_min_z);
  586. // ------------------------------------------------------------------------
  587. max_x1 = mask_max(m_update_y,max_x1,max_x1,b_max_x);
  588. max_y1 = mask_max(m_update_y,max_y1,max_y1,b_max_y);
  589. max_z1 = mask_max(m_update_y,max_z1,max_z1,b_max_z);
  590. // ------------------------------------------------------------------------
  591. min_x2 = mask_min(m_update_z,min_x2,min_x2,b_min_x);
  592. min_y2 = mask_min(m_update_z,min_y2,min_y2,b_min_y);
  593. min_z2 = mask_min(m_update_z,min_z2,min_z2,b_min_z);
  594. // ------------------------------------------------------------------------
  595. max_x2 = mask_max(m_update_z,max_x2,max_x2,b_max_x);
  596. max_y2 = mask_max(m_update_z,max_y2,max_y2,b_max_y);
  597. max_z2 = mask_max(m_update_z,max_z2,max_z2,b_max_z);
  598. // ------------------------------------------------------------------------
  599. count0 = mask_add(m_update_x,count0,count0,vint16(1));
  600. count1 = mask_add(m_update_y,count1,count1,vint16(1));
  601. count2 = mask_add(m_update_z,count2,count2,vint16(1));
  602. }
  603. }
  604. if (i < N)
  605. {
  606. const BBox3fa prim0 = prims[i].bounds();
  607. const vfloat16 center0 = vfloat16((vfloat4)prim0.lower) + vfloat16((vfloat4)prim0.upper);
  608. const vint16 bin = mapping.bin16(center0);
  609. const vfloat16 b_min_x = prims[i].lower.x;
  610. const vfloat16 b_min_y = prims[i].lower.y;
  611. const vfloat16 b_min_z = prims[i].lower.z;
  612. const vfloat16 b_max_x = prims[i].upper.x;
  613. const vfloat16 b_max_y = prims[i].upper.y;
  614. const vfloat16 b_max_z = prims[i].upper.z;
  615. const vint16 bin0 = shuffle<0>(bin);
  616. const vint16 bin1 = shuffle<1>(bin);
  617. const vint16 bin2 = shuffle<2>(bin);
  618. const vbool16 m_update_x = step16 == bin0;
  619. const vbool16 m_update_y = step16 == bin1;
  620. const vbool16 m_update_z = step16 == bin2;
  621. assert(__popcnt((size_t)m_update_x) == 1);
  622. assert(__popcnt((size_t)m_update_y) == 1);
  623. assert(__popcnt((size_t)m_update_z) == 1);
  624. min_x0 = mask_min(m_update_x,min_x0,min_x0,b_min_x);
  625. min_y0 = mask_min(m_update_x,min_y0,min_y0,b_min_y);
  626. min_z0 = mask_min(m_update_x,min_z0,min_z0,b_min_z);
  627. // ------------------------------------------------------------------------
  628. max_x0 = mask_max(m_update_x,max_x0,max_x0,b_max_x);
  629. max_y0 = mask_max(m_update_x,max_y0,max_y0,b_max_y);
  630. max_z0 = mask_max(m_update_x,max_z0,max_z0,b_max_z);
  631. // ------------------------------------------------------------------------
  632. min_x1 = mask_min(m_update_y,min_x1,min_x1,b_min_x);
  633. min_y1 = mask_min(m_update_y,min_y1,min_y1,b_min_y);
  634. min_z1 = mask_min(m_update_y,min_z1,min_z1,b_min_z);
  635. // ------------------------------------------------------------------------
  636. max_x1 = mask_max(m_update_y,max_x1,max_x1,b_max_x);
  637. max_y1 = mask_max(m_update_y,max_y1,max_y1,b_max_y);
  638. max_z1 = mask_max(m_update_y,max_z1,max_z1,b_max_z);
  639. // ------------------------------------------------------------------------
  640. min_x2 = mask_min(m_update_z,min_x2,min_x2,b_min_x);
  641. min_y2 = mask_min(m_update_z,min_y2,min_y2,b_min_y);
  642. min_z2 = mask_min(m_update_z,min_z2,min_z2,b_min_z);
  643. // ------------------------------------------------------------------------
  644. max_x2 = mask_max(m_update_z,max_x2,max_x2,b_max_x);
  645. max_y2 = mask_max(m_update_z,max_y2,max_y2,b_max_y);
  646. max_z2 = mask_max(m_update_z,max_z2,max_z2,b_max_z);
  647. // ------------------------------------------------------------------------
  648. count0 = mask_add(m_update_x,count0,count0,vint16(1));
  649. count1 = mask_add(m_update_y,count1,count1,vint16(1));
  650. count2 = mask_add(m_update_z,count2,count2,vint16(1));
  651. }
  652. lower[0] = Vec3vf16( min_x0, min_y0, min_z0 );
  653. lower[1] = Vec3vf16( min_x1, min_y1, min_z1 );
  654. lower[2] = Vec3vf16( min_x2, min_y2, min_z2 );
  655. upper[0] = Vec3vf16( max_x0, max_y0, max_z0 );
  656. upper[1] = Vec3vf16( max_x1, max_y1, max_z1 );
  657. upper[2] = Vec3vf16( max_x2, max_y2, max_z2 );
  658. count[0] = count0;
  659. count[1] = count1;
  660. count[2] = count2;
  661. }
  662. __forceinline void bin(const PrimRef* prims, size_t begin, size_t end, const BinMapping<16>& mapping) {
  663. bin(prims+begin,end-begin,mapping);
  664. }
  665. /*! merges in other binning information */
  666. __forceinline void merge (const BinInfoT& other, size_t numBins)
  667. {
  668. for (size_t i=0; i<3; i++)
  669. {
  670. lower[i] = min(lower[i],other.lower[i]);
  671. upper[i] = max(upper[i],other.upper[i]);
  672. count[i] += other.count[i];
  673. }
  674. }
  675. /*! reducesr binning information */
  676. static __forceinline const BinInfoT reduce (const BinInfoT& a, const BinInfoT& b)
  677. {
  678. BinInfoT c;
  679. for (size_t i=0; i<3; i++)
  680. {
  681. c.counts[i] = a.counts[i] + b.counts[i];
  682. c.lower[i] = min(a.lower[i],b.lower[i]);
  683. c.upper[i] = max(a.upper[i],b.upper[i]);
  684. }
  685. return c;
  686. }
  687. /*! finds the best split by scanning binning information */
  688. __forceinline Split best(const BinMapping<16>& mapping, const size_t blocks_shift) const
  689. {
  690. /* find best dimension */
  691. float bestSAH = inf;
  692. int bestDim = -1;
  693. int bestPos = 0;
  694. const vint16 blocks_add = (1 << blocks_shift)-1;
  695. const vfloat16 inf(pos_inf);
  696. for (size_t dim=0; dim<3; dim++)
  697. {
  698. /* ignore zero sized dimensions */
  699. if (unlikely(mapping.invalid(dim)))
  700. continue;
  701. const vfloat16 rArea16 = prefix_area_rl(lower[dim].x,lower[dim].y,lower[dim].z, upper[dim].x,upper[dim].y,upper[dim].z);
  702. const vfloat16 lArea16 = prefix_area_lr(lower[dim].x,lower[dim].y,lower[dim].z, upper[dim].x,upper[dim].y,upper[dim].z);
  703. const vint16 lCount16 = prefix_sum(count[dim]);
  704. const vint16 rCount16 = reverse_prefix_sum(count[dim]);
  705. /* compute best split in this dimension */
  706. const vfloat16 leftArea = lArea16;
  707. const vfloat16 rightArea = align_shift_right<1>(zero,rArea16);
  708. const vint16 lC = lCount16;
  709. const vint16 rC = align_shift_right<1>(zero,rCount16);
  710. const vint16 leftCount = ( lC + blocks_add) >> blocks_shift;
  711. const vint16 rightCount = ( rC + blocks_add) >> blocks_shift;
  712. const vbool16 valid = (leftArea < inf) & (rightArea < inf) & vbool16(0x7fff); // handles inf entries
  713. const vfloat16 sah = select(valid,madd(leftArea,vfloat16(leftCount),rightArea*vfloat16(rightCount)),vfloat16(pos_inf));
  714. /* test if this is a better dimension */
  715. if (any(sah < vfloat16(bestSAH)))
  716. {
  717. const size_t index = select_min(sah);
  718. assert(index < 15);
  719. assert(sah[index] < bestSAH);
  720. bestDim = dim;
  721. bestPos = index+1;
  722. bestSAH = sah[index];
  723. }
  724. }
  725. return Split(bestSAH,bestDim,bestPos,mapping);
  726. }
  727. /*! calculates extended split information */
  728. __forceinline void getSplitInfo(const BinMapping<16>& mapping, const Split& split, SplitInfo& info) const
  729. {
  730. if (split.dim == -1) {
  731. new (&info) SplitInfo(0,empty,0,empty);
  732. return;
  733. }
  734. // FIXME: horizontal reduction!
  735. size_t leftCount = 0;
  736. BBox3fa leftBounds = empty;
  737. for (size_t i=0; i<(size_t)split.pos; i++) {
  738. leftCount += count[split.dim][i];
  739. Vec3fa bounds_lower(lower[split.dim].x[i],lower[split.dim].y[i],lower[split.dim].z[i]);
  740. Vec3fa bounds_upper(upper[split.dim].x[i],upper[split.dim].y[i],upper[split.dim].z[i]);
  741. leftBounds.extend(BBox3fa(bounds_lower,bounds_upper));
  742. }
  743. size_t rightCount = 0;
  744. BBox3fa rightBounds = empty;
  745. for (size_t i=split.pos; i<mapping.size(); i++) {
  746. rightCount += count[split.dim][i];
  747. Vec3fa bounds_lower(lower[split.dim].x[i],lower[split.dim].y[i],lower[split.dim].z[i]);
  748. Vec3fa bounds_upper(upper[split.dim].x[i],upper[split.dim].y[i],upper[split.dim].z[i]);
  749. rightBounds.extend(BBox3fa(bounds_lower,bounds_upper));
  750. }
  751. new (&info) SplitInfo(leftCount,leftBounds,rightCount,rightBounds);
  752. }
  753. /*! gets the number of primitives left of the split */
  754. __forceinline size_t getLeftCount(const BinMapping<16>& mapping, const Split& split) const
  755. {
  756. if (unlikely(split.dim == -1)) return -1;
  757. size_t leftCount = 0;
  758. for (size_t i = 0; i < (size_t)split.pos; i++) {
  759. leftCount += count[split.dim][i];
  760. }
  761. return leftCount;
  762. }
  763. /*! gets the number of primitives right of the split */
  764. __forceinline size_t getRightCount(const BinMapping<16>& mapping, const Split& split) const
  765. {
  766. if (unlikely(split.dim == -1)) return -1;
  767. size_t rightCount = 0;
  768. for (size_t i = (size_t)split.pos; i<mapping.size(); i++) {
  769. rightCount += count[split.dim][i];
  770. }
  771. return rightCount;
  772. }
  773. private:
  774. Vec3vf16 lower[3];
  775. Vec3vf16 upper[3];
  776. vint16 count[3];
  777. };
  778. #endif
  779. }
  780. template<typename BinInfoT, typename BinMapping, typename PrimRef>
  781. __forceinline void bin_parallel(BinInfoT& binner, const PrimRef* prims, size_t begin, size_t end, size_t blockSize, size_t parallelThreshold, const BinMapping& mapping)
  782. {
  783. if (likely(end-begin < parallelThreshold)) {
  784. binner.bin(prims,begin,end,mapping);
  785. } else {
  786. binner = parallel_reduce(begin,end,blockSize,binner,
  787. [&](const range<size_t>& r) -> BinInfoT { BinInfoT binner(empty); binner.bin(prims + r.begin(), r.size(), mapping); return binner; },
  788. [&](const BinInfoT& b0, const BinInfoT& b1) -> BinInfoT { BinInfoT r = b0; r.merge(b1, mapping.size()); return r; });
  789. }
  790. }
  791. template<typename BinInfoT, typename BinMapping, typename PrimRef>
  792. __forceinline void bin_parallel(BinInfoT& binner, const PrimRef* prims, size_t begin, size_t end, size_t blockSize, size_t parallelThreshold, const BinMapping& mapping, const AffineSpace3fa& space, void* user = nullptr)
  793. {
  794. if (likely(end-begin < parallelThreshold)) {
  795. binner.bin(prims,begin,end,mapping,space,user);
  796. } else {
  797. binner = parallel_reduce(begin,end,blockSize,binner,
  798. [&](const range<size_t>& r) -> BinInfoT { BinInfoT binner(empty); binner.bin(prims + r.begin(), r.size(), mapping, space, user); return binner; },
  799. [&](const BinInfoT& b0, const BinInfoT& b1) -> BinInfoT { BinInfoT r = b0; r.merge(b1, mapping.size()); return r; });
  800. }
  801. }
  802. template<bool parallel, typename BinInfoT, typename BinMapping, typename PrimRef>
  803. __forceinline void bin_serial_or_parallel(BinInfoT& binner, const PrimRef* prims, size_t begin, size_t end, size_t blockSize, const BinMapping& mapping)
  804. {
  805. if (!parallel) {
  806. binner.bin(prims,begin,end,mapping);
  807. } else {
  808. binner = parallel_reduce(begin,end,blockSize,binner,
  809. [&](const range<size_t>& r) -> BinInfoT { BinInfoT binner(empty); binner.bin(prims + r.begin(), r.size(), mapping); return binner; },
  810. [&](const BinInfoT& b0, const BinInfoT& b1) -> BinInfoT { BinInfoT r = b0; r.merge(b1, mapping.size()); return r; });
  811. }
  812. }
  813. template<bool parallel, typename BinInfoT, typename BinMapping, typename PrimRef>
  814. __forceinline void bin_serial_or_parallel(BinInfoT& binner, const PrimRef* prims, size_t begin, size_t end, size_t blockSize, const BinMapping& mapping, const AffineSpace3fa& space, void* user = nullptr)
  815. {
  816. if (!parallel) {
  817. binner.bin(prims,begin,end,mapping,space,user);
  818. } else {
  819. binner = parallel_reduce(begin,end,blockSize,binner,
  820. [&](const range<size_t>& r) -> BinInfoT { BinInfoT binner(empty); binner.bin(prims + r.begin(), r.size(), mapping, space, user); return binner; },
  821. [&](const BinInfoT& b0, const BinInfoT& b1) -> BinInfoT { BinInfoT r = b0; r.merge(b1, mapping.size()); return r; });
  822. }
  823. }
  824. }