bvh_intersector_stream.cpp 27 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619
  1. // ======================================================================== //
  2. // Copyright 2009-2017 Intel Corporation //
  3. // //
  4. // Licensed under the Apache License, Version 2.0 (the "License"); //
  5. // you may not use this file except in compliance with the License. //
  6. // You may obtain a copy of the License at //
  7. // //
  8. // http://www.apache.org/licenses/LICENSE-2.0 //
  9. // //
  10. // Unless required by applicable law or agreed to in writing, software //
  11. // distributed under the License is distributed on an "AS IS" BASIS, //
  12. // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. //
  13. // See the License for the specific language governing permissions and //
  14. // limitations under the License. //
  15. // ======================================================================== //
  16. #include "bvh_intersector_stream.h"
  17. #include "bvh_intersector_single.h"
  18. #include "bvh_intersector_node.h"
  19. #include "../geometry/intersector_iterators.h"
  20. #include "../geometry/triangle_intersector.h"
  21. #include "../geometry/trianglev_intersector.h"
  22. #include "../geometry/trianglev_mb_intersector.h"
  23. #include "../geometry/trianglei_intersector.h"
  24. #include "../geometry/trianglei_mb_intersector.h"
  25. #include "../geometry/quadv_intersector.h"
  26. #include "../geometry/quadi_intersector.h"
  27. #include "../geometry/quadi_mb_intersector.h"
  28. #include "../geometry/bezier1v_intersector.h"
  29. #include "../geometry/bezier1i_intersector.h"
  30. #include "../geometry/linei_intersector.h"
  31. #include "../geometry/subdivpatch1eager_intersector.h"
  32. #include "../geometry/subdivpatch1cached_intersector.h"
  33. #include "../geometry/object_intersector.h"
  34. #include "../common/scene.h"
  35. #include <bitset>
  36. // todo: parent ptr also for single stream, should improve culling.
  37. #define MAX_RAYS 64
  38. namespace embree
  39. {
  40. namespace isa
  41. {
  42. /* enable traversal of either two small streams or one large stream */
  43. #if !defined(__AVX512F__)
  44. static const size_t MAX_RAYS_PER_OCTANT = 8*sizeof(unsigned int);
  45. #else
  46. static const size_t MAX_RAYS_PER_OCTANT = 8*sizeof(size_t);
  47. #endif
  48. static_assert(MAX_RAYS_PER_OCTANT <= MAX_INTERNAL_STREAM_SIZE, "maximal internal stream size exceeded");
  49. // =====================================================================================================
  50. // =====================================================================================================
  51. // =====================================================================================================
  52. template<int K>
  53. __forceinline size_t AOStoSOA(RayK<K>* rayK, Ray** inputRays, const size_t numTotalRays)
  54. {
  55. const size_t numPackets = (numTotalRays+K-1)/K; //todo : OPTIMIZE
  56. for (size_t i = 0; i < numPackets; i++)
  57. new (&rayK[i]) RayK<K>(zero,zero,zero,neg_inf);
  58. Vec3fa min_dir = pos_inf;
  59. Vec3fa max_dir = neg_inf;
  60. for (size_t i = 0; i < numTotalRays; i++) {
  61. const Vec3fa& org = inputRays[i]->org;
  62. const Vec3fa& dir = inputRays[i]->dir;
  63. min_dir = min(min_dir, dir);
  64. max_dir = max(max_dir, dir);
  65. const float tnear = max(0.0f, inputRays[i]->tnear);
  66. const float tfar = inputRays[i]->tfar;
  67. const size_t packetID = i / K;
  68. const size_t slotID = i % K;
  69. rayK[packetID].dir.x[slotID] = dir.x;
  70. rayK[packetID].dir.y[slotID] = dir.y;
  71. rayK[packetID].dir.z[slotID] = dir.z;
  72. rayK[packetID].org.x[slotID] = org.x;
  73. rayK[packetID].org.y[slotID] = org.y;
  74. rayK[packetID].org.z[slotID] = org.z;
  75. rayK[packetID].tnear[slotID] = tnear;
  76. rayK[packetID].tfar[slotID] = tfar;
  77. rayK[packetID].mask[slotID] = inputRays[i]->mask;
  78. rayK[packetID].instID[slotID] = inputRays[i]->instID;
  79. }
  80. const size_t sign_min_dir = movemask(vfloat4(min_dir) < 0.0f);
  81. const size_t sign_max_dir = movemask(vfloat4(max_dir) < 0.0f);
  82. return ((sign_min_dir^sign_max_dir) & 0x7);
  83. }
  84. template<int K, bool occlusion>
  85. __forceinline void SOAtoAOS(Ray** inputRays, RayK<K>* rayK, const size_t numTotalRays)
  86. {
  87. for (size_t i = 0; i < numTotalRays; i++)
  88. {
  89. const size_t packetID = i / K;
  90. const size_t slotID = i % K;
  91. const RayK<K>& ray = rayK[packetID];
  92. if (likely((unsigned)ray.geomID[slotID] != RTC_INVALID_GEOMETRY_ID))
  93. {
  94. if (occlusion)
  95. inputRays[i]->geomID = ray.geomID[slotID];
  96. else
  97. {
  98. inputRays[i]->tfar = ray.tfar[slotID];
  99. inputRays[i]->Ng.x = ray.Ng.x[slotID];
  100. inputRays[i]->Ng.y = ray.Ng.y[slotID];
  101. inputRays[i]->Ng.z = ray.Ng.z[slotID];
  102. inputRays[i]->u = ray.u[slotID];
  103. inputRays[i]->v = ray.v[slotID];
  104. inputRays[i]->geomID = ray.geomID[slotID];
  105. inputRays[i]->primID = ray.primID[slotID];
  106. inputRays[i]->instID = ray.instID[slotID];
  107. }
  108. }
  109. }
  110. }
  111. // =====================================================================================================
  112. // =====================================================================================================
  113. // =====================================================================================================
  114. template<int N, int Nx, int K, int types, bool robust, typename PrimitiveIntersector>
  115. void BVHNIntersectorStream<N, Nx, K, types, robust, PrimitiveIntersector>::intersectCoherentSOA(BVH* __restrict__ bvh, RayK<K>** inputRays, size_t numOctantRays, IntersectContext* context)
  116. {
  117. __aligned(64) StackItemMaskCoherent stack[stackSizeSingle]; //!< stack of nodes
  118. RayK<K>** __restrict__ inputPackets = (RayK<K>**)inputRays;
  119. assert(numOctantRays <= MAX_RAYS);
  120. __aligned(64) Packet packet[MAX_RAYS/K];
  121. __aligned(64) Frusta frusta;
  122. const size_t m_active = initPacketsAndFrusta(inputPackets, numOctantRays, packet, frusta);
  123. if (unlikely(m_active == 0)) return;
  124. stack[0].mask = m_active;
  125. stack[0].parent = 0;
  126. stack[0].child = bvh->root;
  127. stack[0].childID = (unsigned int)-1;
  128. stack[0].dist = (unsigned int)-1;
  129. ///////////////////////////////////////////////////////////////////////////////////
  130. ///////////////////////////////////////////////////////////////////////////////////
  131. ///////////////////////////////////////////////////////////////////////////////////
  132. const NearFarPreCompute pc(frusta.min_rdir);
  133. StackItemMaskCoherent* stackPtr = stack + 1;
  134. while (1) pop:
  135. {
  136. if (unlikely(stackPtr == stack)) break;
  137. STAT3(normal.trav_stack_pop,1,1,1);
  138. stackPtr--;
  139. /*! pop next node */
  140. NodeRef cur = NodeRef(stackPtr->child);
  141. size_t m_trav_active = stackPtr->mask;
  142. assert(m_trav_active);
  143. /* non-root and leaf => full culling test for all rays */
  144. if (unlikely(stackPtr->parent != 0 && cur.isLeaf()))
  145. {
  146. NodeRef parent = NodeRef(stackPtr->parent);
  147. const AlignedNode* __restrict__ const node = parent.alignedNode();
  148. const size_t b = stackPtr->childID;
  149. char *ptr = (char*)&node->lower_x + b*sizeof(float);
  150. assert(cur == node->child(b));
  151. const vfloat<K> minX = vfloat<K>(*(const float*)((const char*)ptr + pc.nearX));
  152. const vfloat<K> minY = vfloat<K>(*(const float*)((const char*)ptr + pc.nearY));
  153. const vfloat<K> minZ = vfloat<K>(*(const float*)((const char*)ptr + pc.nearZ));
  154. const vfloat<K> maxX = vfloat<K>(*(const float*)((const char*)ptr + pc.farX));
  155. const vfloat<K> maxY = vfloat<K>(*(const float*)((const char*)ptr + pc.farY));
  156. const vfloat<K> maxZ = vfloat<K>(*(const float*)((const char*)ptr + pc.farZ));
  157. m_trav_active = intersectAlignedNodePacket(packet, minX, minY, minZ, maxX, maxY, maxZ, m_trav_active);
  158. if (m_trav_active == 0) goto pop;
  159. }
  160. while (1)
  161. {
  162. if (unlikely(cur.isLeaf())) break;
  163. const AlignedNode* __restrict__ const node = cur.alignedNode();
  164. __aligned(64) size_t maskK[N];
  165. for (size_t i = 0; i < N; i++) maskK[i] = m_trav_active;
  166. vfloat<Nx> dist;
  167. const size_t m_node_hit = traverseCoherentStream(m_trav_active, packet, node, pc, frusta, maskK, dist);
  168. if (unlikely(m_node_hit == 0)) goto pop;
  169. BVHNNodeTraverserStreamHitCoherent<N, Nx, types>::traverseClosestHit(cur, m_trav_active, vbool<Nx>((int)m_node_hit), dist, (size_t*)maskK, stackPtr);
  170. assert(m_trav_active);
  171. }
  172. /*! this is a leaf node */
  173. assert(cur != BVH::emptyNode);
  174. STAT3(normal.trav_leaves, 1, 1, 1);
  175. size_t num; Primitive* prim = (Primitive*)cur.leaf(num);
  176. size_t bits = m_trav_active;
  177. /*! intersect stream of rays with all primitives */
  178. size_t lazy_node = 0;
  179. STAT_USER(1,(__popcnt(bits)+K-1)/K*4);
  180. do
  181. {
  182. size_t i = __bsf(bits) / K;
  183. const size_t m_isec = ((((size_t)1 << K)-1) << (i*K));
  184. assert(m_isec & bits);
  185. bits &= ~m_isec;
  186. vbool<K> m_valid = (inputPackets[i]->tnear <= inputPackets[i]->tfar);
  187. PrimitiveIntersector::intersectK(m_valid, *inputPackets[i], context, prim, num, lazy_node);
  188. Packet &p = packet[i];
  189. p.max_dist = min(p.max_dist, inputPackets[i]->tfar);
  190. } while(bits);
  191. } // traversal + intersection
  192. }
  193. template<int N, int Nx, int K, int types, bool robust, typename PrimitiveIntersector>
  194. void BVHNIntersectorStream<N, Nx, K, types, robust, PrimitiveIntersector>::occludedCoherentSOA(BVH* __restrict__ bvh, RayK<K>** inputRays, size_t numOctantRays, IntersectContext* context)
  195. {
  196. __aligned(64) StackItemMaskCoherent stack[stackSizeSingle]; //!< stack of nodes
  197. RayK<K>** __restrict__ inputPackets = (RayK<K>**)inputRays;
  198. assert(numOctantRays <= MAX_RAYS);
  199. /* inactive rays should have been filtered out before */
  200. __aligned(64) Packet packet[MAX_RAYS/K];
  201. __aligned(64) Frusta frusta;
  202. size_t m_active = initPacketsAndFrusta(inputPackets, numOctantRays, packet, frusta);
  203. /* valid rays */
  204. if (unlikely(m_active == 0)) return;
  205. stack[0].mask = m_active;
  206. stack[0].parent = 0;
  207. stack[0].child = bvh->root;
  208. stack[0].childID = (unsigned int)-1;
  209. stack[0].dist = (unsigned int)-1;
  210. ///////////////////////////////////////////////////////////////////////////////////
  211. ///////////////////////////////////////////////////////////////////////////////////
  212. ///////////////////////////////////////////////////////////////////////////////////
  213. const NearFarPreCompute pc(frusta.min_rdir);
  214. StackItemMaskCoherent* stackPtr = stack + 1;
  215. while (1) pop:
  216. {
  217. if (unlikely(stackPtr == stack)) break;
  218. STAT3(normal.trav_stack_pop,1,1,1);
  219. stackPtr--;
  220. /*! pop next node */
  221. NodeRef cur = NodeRef(stackPtr->child);
  222. size_t m_trav_active = stackPtr->mask & m_active;
  223. if (unlikely(!m_trav_active)) continue;
  224. assert(m_trav_active);
  225. /* non-root and leaf => full culling test for all rays */
  226. if (unlikely(stackPtr->parent != 0 && cur.isLeaf()))
  227. {
  228. NodeRef parent = NodeRef(stackPtr->parent);
  229. const AlignedNode* __restrict__ const node = parent.alignedNode();
  230. const size_t b = stackPtr->childID;
  231. char *ptr = (char*)&node->lower_x + b*sizeof(float);
  232. assert(cur == node->child(b));
  233. const vfloat<K> minX = vfloat<K>(*(const float*)((const char*)ptr + pc.nearX));
  234. const vfloat<K> minY = vfloat<K>(*(const float*)((const char*)ptr + pc.nearY));
  235. const vfloat<K> minZ = vfloat<K>(*(const float*)((const char*)ptr + pc.nearZ));
  236. const vfloat<K> maxX = vfloat<K>(*(const float*)((const char*)ptr + pc.farX));
  237. const vfloat<K> maxY = vfloat<K>(*(const float*)((const char*)ptr + pc.farY));
  238. const vfloat<K> maxZ = vfloat<K>(*(const float*)((const char*)ptr + pc.farZ));
  239. m_trav_active = intersectAlignedNodePacket(packet, minX, minY, minZ, maxX, maxY, maxZ, m_trav_active);
  240. if (m_trav_active == 0) goto pop;
  241. }
  242. while (1)
  243. {
  244. if (unlikely(cur.isLeaf())) break;
  245. const AlignedNode* __restrict__ const node = cur.alignedNode();
  246. __aligned(64) size_t maskK[N];
  247. for (size_t i = 0; i < N; i++) maskK[i] = m_trav_active;
  248. vfloat<Nx> dist;
  249. const size_t m_node_hit = traverseCoherentStream(m_trav_active, packet, node, pc, frusta, maskK, dist);
  250. if (unlikely(m_node_hit == 0)) goto pop;
  251. BVHNNodeTraverserStreamHitCoherent<N, Nx, types>::traverseAnyHit(cur, m_trav_active, vbool<Nx>((int)m_node_hit), (size_t*)maskK, stackPtr);
  252. assert(m_trav_active);
  253. }
  254. /*! this is a leaf node */
  255. assert(cur != BVH::emptyNode);
  256. STAT3(normal.trav_leaves, 1, 1, 1);
  257. size_t num; Primitive* prim = (Primitive*)cur.leaf(num);
  258. size_t bits = m_trav_active & m_active;
  259. /*! intersect stream of rays with all primitives */
  260. size_t lazy_node = 0;
  261. STAT_USER(1,(__popcnt(bits)+K-1)/K*4);
  262. while(bits)
  263. {
  264. size_t i = __bsf(bits) / K;
  265. const size_t m_isec = ((((size_t)1 << K)-1) << (i*K));
  266. assert(m_isec & bits);
  267. bits &= ~m_isec;
  268. vbool<K> m_valid = (inputPackets[i]->tnear <= inputPackets[i]->tfar);
  269. vbool<K> m_hit = PrimitiveIntersector::occludedK(m_valid, *inputPackets[i], context, prim, num, lazy_node);
  270. inputPackets[i]->geomID = select(m_hit, vint<K>(zero), inputPackets[i]->geomID);
  271. m_active &= ~((size_t)movemask(m_hit) << (i*K));
  272. }
  273. } // traversal + intersection
  274. }
  275. // =====================================================================================================
  276. // =====================================================================================================
  277. // =====================================================================================================
  278. template<int N, int Nx, int K, int types, bool robust, typename PrimitiveIntersector>
  279. void BVHNIntersectorStream<N, Nx, K, types, robust, PrimitiveIntersector>::intersect(BVH* __restrict__ bvh, Ray** inputRays, size_t numTotalRays, IntersectContext* context)
  280. {
  281. __aligned(64) RayCtx ray_ctx[MAX_RAYS_PER_OCTANT];
  282. __aligned(64) Precalculations pre[MAX_RAYS_PER_OCTANT];
  283. __aligned(64) StackItemMask stack[stackSizeSingle]; //!< stack of nodes
  284. #if ENABLE_COHERENT_STREAM_PATH == 1
  285. if (unlikely(PrimitiveIntersector::validIntersectorK && !robust && isCoherent(context->user->flags)))
  286. {
  287. if (likely(context->flags == IntersectContext::INPUT_RAY_DATA_AOS))
  288. {
  289. /* AOS to SOA conversion */
  290. RayK<K> rayK[MAX_RAYS / K];
  291. RayK<K>* rayK_ptr[MAX_RAYS / K];
  292. for (size_t i = 0; i < MAX_RAYS / K; i++) rayK_ptr[i] = &rayK[i];
  293. AOStoSOA(rayK, inputRays, numTotalRays);
  294. /* stream tracer as fast path */
  295. BVHNIntersectorStream<N, Nx, K, types, robust, PrimitiveIntersector>::intersectCoherentSOA(bvh, (RayK<K>**)rayK_ptr, numTotalRays, context);
  296. /* SOA to AOS conversion */
  297. SOAtoAOS<K, false>(inputRays, rayK, numTotalRays);
  298. }
  299. else
  300. {
  301. assert(context->getInputSIMDWidth() == K);
  302. /* stream tracer as fast path */
  303. BVHNIntersectorStream<N, Nx, K, types, robust, PrimitiveIntersector>::intersectCoherentSOA(bvh, (RayK<K>**)inputRays, numTotalRays, context);
  304. }
  305. return;
  306. }
  307. #endif
  308. assert(context->flags == IntersectContext::INPUT_RAY_DATA_AOS);
  309. for (size_t r = 0; r < numTotalRays; r += MAX_RAYS_PER_OCTANT)
  310. {
  311. Ray** __restrict__ rays = inputRays + r;
  312. const size_t numOctantRays = (r + MAX_RAYS_PER_OCTANT >= numTotalRays) ? numTotalRays-r : MAX_RAYS_PER_OCTANT;
  313. /* inactive rays should have been filtered out before */
  314. size_t m_active = numOctantRays == 8*sizeof(size_t) ? (size_t)-1 : (((size_t)1 << numOctantRays))-1;
  315. if (m_active == 0) return;
  316. /* do per ray precalculations */
  317. for (size_t i = 0; i < numOctantRays; i++) {
  318. new (&ray_ctx[i]) RayCtx(rays[i]);
  319. new (&pre[i]) Precalculations(*rays[i], bvh, bvh->numTimeSteps);
  320. }
  321. stack[0].ptr = BVH::invalidNode;
  322. stack[0].mask = (size_t)-1;
  323. stack[1].ptr = bvh->root;
  324. stack[1].mask = m_active;
  325. ///////////////////////////////////////////////////////////////////////////////////
  326. ///////////////////////////////////////////////////////////////////////////////////
  327. ///////////////////////////////////////////////////////////////////////////////////
  328. const NearFarPreCompute pc(ray_ctx[0].rdir);
  329. StackItemMask* stackPtr = stack + 2;
  330. while (1) pop:
  331. {
  332. /*! pop next node */
  333. STAT3(normal.trav_stack_pop,1,1,1);
  334. stackPtr--;
  335. NodeRef cur = NodeRef(stackPtr->ptr);
  336. size_t m_trav_active = stackPtr->mask;
  337. assert(m_trav_active);
  338. const vfloat<Nx> inf(pos_inf);
  339. while (1)
  340. {
  341. if (unlikely(cur.isLeaf())) break;
  342. const AlignedNode* __restrict__ const node = cur.alignedNode();
  343. assert(m_trav_active);
  344. #if defined(__AVX512F__)
  345. /* AVX512 path for up to 64 rays */
  346. vllong<Nxd> maskK(zero);
  347. vfloat<Nx> dist(inf);
  348. const vbool<Nx> vmask = traversalLoop<true>(m_trav_active,node,pc,ray_ctx,dist,maskK);
  349. if (unlikely(none(vmask))) goto pop;
  350. BVHNNodeTraverserStreamHit<N, Nx, types>::traverseClosestHit(cur, m_trav_active, vmask, dist, (size_t*)&maskK, stackPtr);
  351. #else
  352. /* AVX path for up to 32 rays */
  353. vint<Nx> maskK(zero);
  354. vfloat<Nx> dist(inf);
  355. const vbool<Nx> vmask = traversalLoop<true>(m_trav_active,node,pc,ray_ctx,dist,maskK);
  356. if (unlikely(none(vmask))) goto pop;
  357. BVHNNodeTraverserStreamHit<N, Nx, types>::traverseClosestHit(cur, m_trav_active, vmask, dist, (unsigned int*)&maskK, stackPtr);
  358. assert(m_trav_active);
  359. #endif
  360. }
  361. /* current ray stream is done? */
  362. if (unlikely(cur == BVH::invalidNode))
  363. break;
  364. /*! this is a leaf node */
  365. assert(cur != BVH::emptyNode);
  366. STAT3(normal.trav_leaves, 1, 1, 1);
  367. size_t num; Primitive* prim = (Primitive*)cur.leaf(num);
  368. size_t bits = m_trav_active;
  369. /*! intersect stream of rays with all primitives */
  370. size_t lazy_node = 0;
  371. size_t valid_isec MAYBE_UNUSED = PrimitiveIntersector::intersect(pre, bits, rays, context, 0, prim, num, lazy_node);
  372. /* update tfar in ray context on successful hit */
  373. size_t isec_bits = valid_isec;
  374. while(isec_bits)
  375. {
  376. const size_t i = __bscf(isec_bits);
  377. ray_ctx[i].update(rays[i]);
  378. }
  379. } // traversal + intersection
  380. }
  381. }
  382. template<int N, int Nx, int K, int types, bool robust, typename PrimitiveIntersector>
  383. void BVHNIntersectorStream<N, Nx, K, types, robust, PrimitiveIntersector>::occluded(BVH* __restrict__ bvh, Ray **inputRays, size_t numTotalRays, IntersectContext* context)
  384. {
  385. __aligned(64) RayCtx ray_ctx[MAX_RAYS_PER_OCTANT];
  386. __aligned(64) Precalculations pre[MAX_RAYS_PER_OCTANT];
  387. __aligned(64) StackItemMask stack[stackSizeSingle]; //!< stack of nodes
  388. #if ENABLE_COHERENT_STREAM_PATH == 1
  389. if (unlikely(PrimitiveIntersector::validIntersectorK && !robust && isCoherent(context->user->flags)))
  390. {
  391. if (likely(context->flags == IntersectContext::INPUT_RAY_DATA_AOS))
  392. {
  393. /* AOS to SOA conversion */
  394. RayK<K> rayK[MAX_RAYS / K];
  395. RayK<K>* rayK_ptr[MAX_RAYS / K];
  396. for (size_t i = 0; i < MAX_RAYS / K; i++) rayK_ptr[i] = &rayK[i];
  397. AOStoSOA(rayK, inputRays, numTotalRays);
  398. /* stream tracer as fast path */
  399. BVHNIntersectorStream<N, Nx, K, types, robust, PrimitiveIntersector>::occludedCoherentSOA(bvh, (RayK<K>**)rayK_ptr, numTotalRays, context);
  400. /* SOA to AOS conversion */
  401. SOAtoAOS<K, true>(inputRays, rayK, numTotalRays);
  402. }
  403. else
  404. {
  405. assert(context->getInputSIMDWidth() == K);
  406. BVHNIntersectorStream<N, Nx, K, types, robust, PrimitiveIntersector>::occludedCoherentSOA(bvh, (RayK<K>**)inputRays, numTotalRays, context);
  407. }
  408. return;
  409. }
  410. #endif
  411. assert(context->flags == IntersectContext::INPUT_RAY_DATA_AOS);
  412. for (size_t r = 0; r < numTotalRays; r += MAX_RAYS_PER_OCTANT)
  413. {
  414. Ray** rays = inputRays + r;
  415. const size_t numOctantRays = (r + MAX_RAYS_PER_OCTANT >= numTotalRays) ? numTotalRays-r : MAX_RAYS_PER_OCTANT;
  416. size_t m_active = numOctantRays == 8*sizeof(size_t) ? (size_t)-1 : (((size_t)1 << numOctantRays))-1;
  417. if (unlikely(m_active == 0)) continue;
  418. /* do per ray precalculations */
  419. for (size_t i = 0; i < numOctantRays; i++) {
  420. new (&ray_ctx[i]) RayCtx(rays[i]);
  421. new (&pre[i]) Precalculations(*rays[i], bvh, bvh->numTimeSteps);
  422. }
  423. stack[0].ptr = BVH::invalidNode;
  424. stack[0].mask = (size_t)-1;
  425. stack[1].ptr = bvh->root;
  426. stack[1].mask = m_active;
  427. StackItemMask* stackPtr = stack + 2;
  428. const NearFarPreCompute pc(ray_ctx[0].rdir);
  429. while (1) pop:
  430. {
  431. /*! pop next node */
  432. STAT3(shadow.trav_stack_pop,1,1,1);
  433. stackPtr--;
  434. NodeRef cur = NodeRef(stackPtr->ptr);
  435. assert(stackPtr->mask);
  436. size_t m_trav_active = stackPtr->mask & m_active;
  437. if (unlikely(m_trav_active == 0 && cur != BVH::invalidNode)) continue;
  438. const vfloat<Nx> inf(pos_inf);
  439. while (1)
  440. {
  441. if (likely(cur.isLeaf())) break;
  442. assert(m_trav_active);
  443. const AlignedNode* __restrict__ const node = cur.alignedNode();
  444. #if defined(__AVX512F__)
  445. /* AVX512 path for up to 64 rays */
  446. vllong<Nxd> maskK(zero);
  447. vfloat<Nx> dist(inf);
  448. const vbool<Nx> vmask = traversalLoop<false>(m_trav_active,node,pc,ray_ctx,dist,maskK);
  449. if (unlikely(none(vmask))) goto pop;
  450. BVHNNodeTraverserStreamHit<N, Nx, types>::traverseAnyHit(cur, m_trav_active, vmask, (size_t*)&maskK, stackPtr);
  451. #else
  452. /* AVX path for up to 32 rays */
  453. vint<Nx> maskK(zero);
  454. vfloat<Nx> dist(inf);
  455. const vbool<Nx> vmask = traversalLoop<false>(m_trav_active,node,pc,ray_ctx,dist,maskK);
  456. if (unlikely(none(vmask))) goto pop;
  457. BVHNNodeTraverserStreamHit<N, Nx, types>::traverseAnyHit(cur, m_trav_active, vmask, (unsigned int*)&maskK, stackPtr);
  458. #endif
  459. }
  460. /* current ray stream is done? */
  461. if (unlikely(cur == BVH::invalidNode))
  462. break;
  463. /*! this is a leaf node */
  464. assert(cur != BVH::emptyNode);
  465. STAT3(shadow.trav_leaves, 1, 1, 1);
  466. size_t num; Primitive* prim = (Primitive*)cur.leaf(num);
  467. size_t lazy_node = 0;
  468. size_t bits = m_trav_active & m_active;
  469. assert(bits);
  470. m_active = m_active & ~PrimitiveIntersector::occluded(pre, bits, rays, context, 0, prim, num, lazy_node);
  471. if (unlikely(m_active == 0)) break;
  472. } // traversal + intersection
  473. }
  474. }
  475. ////////////////////////////////////////////////////////////////////////////////
  476. /// ArrayIntersectorKStream Definitions
  477. ////////////////////////////////////////////////////////////////////////////////
  478. typedef ArrayIntersectorKStream<VSIZEX,
  479. TriangleMIntersector1Moeller<SIMD_MODE(4) COMMA true >,
  480. TriangleMIntersectorKMoeller<4 COMMA VSIZEX COMMA VSIZEX COMMA true > > Triangle4IntersectorStreamMoeller;
  481. typedef ArrayIntersectorKStream<VSIZEX,
  482. TriangleMIntersector1Moeller<SIMD_MODE(4) COMMA false >,
  483. TriangleMIntersectorKMoeller<4 COMMA VSIZEX COMMA VSIZEX COMMA false > > Triangle4IntersectorStreamMoellerNoFilter;
  484. typedef ArrayIntersectorKStream<VSIZEX,
  485. TriangleMvIntersector1Pluecker<SIMD_MODE(4) COMMA true >,
  486. TriangleMvIntersectorKPluecker<4 COMMA VSIZEX COMMA VSIZEX COMMA true > > Triangle4vIntersectorStreamPluecker;
  487. typedef ArrayIntersectorKStream<VSIZEX,
  488. TriangleMiIntersector1Moeller<SIMD_MODE(4) COMMA true >,
  489. TriangleMiIntersectorKMoeller<4 COMMA VSIZEX COMMA VSIZEX COMMA true > > Triangle4iIntersectorStreamMoeller;
  490. typedef ArrayIntersectorKStream<VSIZEX,
  491. TriangleMiIntersector1Pluecker<SIMD_MODE(4) COMMA true >,
  492. TriangleMiIntersectorKPluecker<4 COMMA VSIZEX COMMA VSIZEX COMMA true > > Triangle4iIntersectorStreamPluecker;
  493. typedef ArrayIntersectorKStream<VSIZEX,
  494. QuadMvIntersector1Moeller<4 COMMA true >,
  495. QuadMvIntersectorKMoeller<4 COMMA VSIZEX COMMA true > > Quad4vIntersectorStreamMoeller;
  496. typedef ArrayIntersectorKStream<VSIZEX,
  497. QuadMvIntersector1Moeller<4 COMMA false >,
  498. QuadMvIntersectorKMoeller<4 COMMA VSIZEX COMMA false > > Quad4vIntersectorStreamMoellerNoFilter;
  499. typedef ArrayIntersectorKStream<VSIZEX,
  500. QuadMiIntersector1Moeller<4 COMMA true >,
  501. QuadMiIntersectorKMoeller<4 COMMA VSIZEX COMMA true > > Quad4iIntersectorStreamMoeller;
  502. typedef ArrayIntersectorKStream<VSIZEX,
  503. QuadMvIntersector1Pluecker<4 COMMA true >,
  504. QuadMvIntersectorKPluecker<4 COMMA VSIZEX COMMA true > > Quad4vIntersectorStreamPluecker;
  505. typedef ArrayIntersectorKStream<VSIZEX,
  506. QuadMiIntersector1Pluecker<4 COMMA true >,
  507. QuadMiIntersectorKPluecker<4 COMMA VSIZEX COMMA true > > Quad4iIntersectorStreamPluecker;
  508. typedef ArrayIntersectorKStream<VSIZEX,
  509. ObjectIntersector1<false>,
  510. ObjectIntersectorK<VSIZEX COMMA false > > ObjectIntersectorStream;
  511. }
  512. }