// bvh_intersector_stream.h (14 KB)
// Copyright 2009-2020 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
  3. #pragma once
  4. #include "node_intersector_packet_stream.h"
  5. #include "node_intersector_frustum.h"
  6. #include "bvh_traverser_stream.h"
  7. namespace embree
  8. {
  9. namespace isa
  10. {
  11. /*! BVH ray stream intersector. */
  12. template<int N, int Nx, int types, bool robust, typename PrimitiveIntersector>
  13. class BVHNIntersectorStream
  14. {
  15. static const int Nxd = (Nx == N) ? N : Nx/2;
  16. /* shortcuts for frequently used types */
  17. template<int K> using PrimitiveIntersectorK = typename PrimitiveIntersector::template Type<K>;
  18. template<int K> using PrimitiveK = typename PrimitiveIntersectorK<K>::PrimitiveK;
  19. typedef BVHN<N> BVH;
  20. typedef typename BVH::NodeRef NodeRef;
  21. typedef typename BVH::BaseNode BaseNode;
  22. typedef typename BVH::AABBNode AABBNode;
  23. typedef typename BVH::AABBNodeMB AABBNodeMB;
  24. template<int K>
  25. __forceinline static size_t initPacketsAndFrustum(RayK<K>** inputPackets, size_t numOctantRays,
  26. TravRayKStream<K, robust>* packets, Frustum<robust>& frustum, bool& commonOctant)
  27. {
  28. const size_t numPackets = (numOctantRays+K-1)/K;
  29. Vec3vf<K> tmp_min_rdir(pos_inf);
  30. Vec3vf<K> tmp_max_rdir(neg_inf);
  31. Vec3vf<K> tmp_min_org(pos_inf);
  32. Vec3vf<K> tmp_max_org(neg_inf);
  33. vfloat<K> tmp_min_dist(pos_inf);
  34. vfloat<K> tmp_max_dist(neg_inf);
  35. size_t m_active = 0;
  36. for (size_t i = 0; i < numPackets; i++)
  37. {
  38. const vfloat<K> tnear = inputPackets[i]->tnear();
  39. const vfloat<K> tfar = inputPackets[i]->tfar;
  40. vbool<K> m_valid = (tnear <= tfar) & (tnear >= 0.0f);
  41. #if defined(EMBREE_IGNORE_INVALID_RAYS)
  42. m_valid &= inputPackets[i]->valid();
  43. #endif
  44. m_active |= (size_t)movemask(m_valid) << (i*K);
  45. vfloat<K> packet_min_dist = max(tnear, 0.0f);
  46. vfloat<K> packet_max_dist = select(m_valid, tfar, neg_inf);
  47. tmp_min_dist = min(tmp_min_dist, packet_min_dist);
  48. tmp_max_dist = max(tmp_max_dist, packet_max_dist);
  49. const Vec3vf<K>& org = inputPackets[i]->org;
  50. const Vec3vf<K>& dir = inputPackets[i]->dir;
  51. new (&packets[i]) TravRayKStream<K, robust>(org, dir, packet_min_dist, packet_max_dist);
  52. tmp_min_rdir = min(tmp_min_rdir, select(m_valid, packets[i].rdir, Vec3vf<K>(pos_inf)));
  53. tmp_max_rdir = max(tmp_max_rdir, select(m_valid, packets[i].rdir, Vec3vf<K>(neg_inf)));
  54. tmp_min_org = min(tmp_min_org , select(m_valid,org , Vec3vf<K>(pos_inf)));
  55. tmp_max_org = max(tmp_max_org , select(m_valid,org , Vec3vf<K>(neg_inf)));
  56. }
  57. m_active &= (numOctantRays == (8 * sizeof(size_t))) ? (size_t)-1 : (((size_t)1 << numOctantRays)-1);
  58. const Vec3fa reduced_min_rdir(reduce_min(tmp_min_rdir.x),
  59. reduce_min(tmp_min_rdir.y),
  60. reduce_min(tmp_min_rdir.z));
  61. const Vec3fa reduced_max_rdir(reduce_max(tmp_max_rdir.x),
  62. reduce_max(tmp_max_rdir.y),
  63. reduce_max(tmp_max_rdir.z));
  64. const Vec3fa reduced_min_origin(reduce_min(tmp_min_org.x),
  65. reduce_min(tmp_min_org.y),
  66. reduce_min(tmp_min_org.z));
  67. const Vec3fa reduced_max_origin(reduce_max(tmp_max_org.x),
  68. reduce_max(tmp_max_org.y),
  69. reduce_max(tmp_max_org.z));
  70. commonOctant =
  71. (reduced_max_rdir.x < 0.0f || reduced_min_rdir.x >= 0.0f) &&
  72. (reduced_max_rdir.y < 0.0f || reduced_min_rdir.y >= 0.0f) &&
  73. (reduced_max_rdir.z < 0.0f || reduced_min_rdir.z >= 0.0f);
  74. const float frustum_min_dist = reduce_min(tmp_min_dist);
  75. const float frustum_max_dist = reduce_max(tmp_max_dist);
  76. frustum.init(reduced_min_origin, reduced_max_origin,
  77. reduced_min_rdir, reduced_max_rdir,
  78. frustum_min_dist, frustum_max_dist,
  79. N);
  80. return m_active;
  81. }
  82. template<int K>
  83. __forceinline static size_t intersectAABBNodePacket(size_t m_active,
  84. const TravRayKStream<K,robust>* packets,
  85. const AABBNode* __restrict__ node,
  86. size_t boxID,
  87. const NearFarPrecalculations& nf)
  88. {
  89. assert(m_active);
  90. const size_t startPacketID = bsf(m_active) / K;
  91. const size_t endPacketID = bsr(m_active) / K;
  92. size_t m_trav_active = 0;
  93. for (size_t i = startPacketID; i <= endPacketID; i++)
  94. {
  95. const size_t m_hit = intersectNodeK<N>(node, boxID, packets[i], nf);
  96. m_trav_active |= m_hit << (i*K);
  97. }
  98. return m_trav_active;
  99. }
  100. template<int K>
  101. __forceinline static size_t traverseCoherentStream(size_t m_active,
  102. TravRayKStream<K, robust>* packets,
  103. const AABBNode* __restrict__ node,
  104. const Frustum<robust>& frustum,
  105. size_t* maskK,
  106. vfloat<Nx>& dist)
  107. {
  108. size_t m_node_hit = intersectNodeFrustum<N,Nx>(node, frustum, dist);
  109. const size_t first_index = bsf(m_active);
  110. const size_t first_packetID = first_index / K;
  111. const size_t first_rayID = first_index % K;
  112. size_t m_first_hit = intersectNode1<N,Nx>(node, packets[first_packetID], first_rayID, frustum.nf);
  113. /* this make traversal independent of the ordering of rays */
  114. size_t m_node = m_node_hit ^ m_first_hit;
  115. while (unlikely(m_node))
  116. {
  117. const size_t boxID = bscf(m_node);
  118. const size_t m_current = m_active & intersectAABBNodePacket(m_active, packets, node, boxID, frustum.nf);
  119. m_node_hit ^= m_current ? (size_t)0 : ((size_t)1 << boxID);
  120. maskK[boxID] = m_current;
  121. }
  122. return m_node_hit;
  123. }
  124. // TODO: explicit 16-wide path for KNL
  125. template<int K>
  126. __forceinline static vint<Nx> traverseIncoherentStream(size_t m_active,
  127. TravRayKStreamFast<K>* __restrict__ packets,
  128. const AABBNode* __restrict__ node,
  129. const NearFarPrecalculations& nf,
  130. const int shiftTable[32])
  131. {
  132. const vfloat<Nx> bminX = vfloat<Nx>(*(const vfloat<N>*)((const char*)&node->lower_x + nf.nearX));
  133. const vfloat<Nx> bminY = vfloat<Nx>(*(const vfloat<N>*)((const char*)&node->lower_x + nf.nearY));
  134. const vfloat<Nx> bminZ = vfloat<Nx>(*(const vfloat<N>*)((const char*)&node->lower_x + nf.nearZ));
  135. const vfloat<Nx> bmaxX = vfloat<Nx>(*(const vfloat<N>*)((const char*)&node->lower_x + nf.farX));
  136. const vfloat<Nx> bmaxY = vfloat<Nx>(*(const vfloat<N>*)((const char*)&node->lower_x + nf.farY));
  137. const vfloat<Nx> bmaxZ = vfloat<Nx>(*(const vfloat<N>*)((const char*)&node->lower_x + nf.farZ));
  138. assert(m_active);
  139. vint<Nx> vmask(zero);
  140. do
  141. {
  142. STAT3(shadow.trav_nodes,1,1,1);
  143. const size_t rayID = bscf(m_active);
  144. assert(rayID < MAX_INTERNAL_STREAM_SIZE);
  145. TravRayKStream<K,robust> &p = packets[rayID / K];
  146. const size_t i = rayID % K;
  147. const vint<Nx> bitmask(shiftTable[rayID]);
  148. #if defined (__aarch64__)
  149. const vfloat<Nx> tNearX = madd(bminX, p.rdir.x[i], p.neg_org_rdir.x[i]);
  150. const vfloat<Nx> tNearY = madd(bminY, p.rdir.y[i], p.neg_org_rdir.y[i]);
  151. const vfloat<Nx> tNearZ = madd(bminZ, p.rdir.z[i], p.neg_org_rdir.z[i]);
  152. const vfloat<Nx> tFarX = madd(bmaxX, p.rdir.x[i], p.neg_org_rdir.x[i]);
  153. const vfloat<Nx> tFarY = madd(bmaxY, p.rdir.y[i], p.neg_org_rdir.y[i]);
  154. const vfloat<Nx> tFarZ = madd(bmaxZ, p.rdir.z[i], p.neg_org_rdir.z[i]);
  155. #else
  156. const vfloat<Nx> tNearX = msub(bminX, p.rdir.x[i], p.org_rdir.x[i]);
  157. const vfloat<Nx> tNearY = msub(bminY, p.rdir.y[i], p.org_rdir.y[i]);
  158. const vfloat<Nx> tNearZ = msub(bminZ, p.rdir.z[i], p.org_rdir.z[i]);
  159. const vfloat<Nx> tFarX = msub(bmaxX, p.rdir.x[i], p.org_rdir.x[i]);
  160. const vfloat<Nx> tFarY = msub(bmaxY, p.rdir.y[i], p.org_rdir.y[i]);
  161. const vfloat<Nx> tFarZ = msub(bmaxZ, p.rdir.z[i], p.org_rdir.z[i]);
  162. #endif
  163. const vfloat<Nx> tNear = maxi(tNearX, tNearY, tNearZ, vfloat<Nx>(p.tnear[i]));
  164. const vfloat<Nx> tFar = mini(tFarX , tFarY , tFarZ, vfloat<Nx>(p.tfar[i]));
  165. #if defined(__AVX512ER__)
  166. const vboolx m_node((1 << N)-1);
  167. const vbool<Nx> hit_mask = le(m_node, tNear, tFar);
  168. vmask = mask_or(hit_mask, vmask, vmask, bitmask);
  169. #else
  170. const vbool<Nx> hit_mask = tNear <= tFar;
  171. #if defined(__AVX2__)
  172. vmask = vmask | (bitmask & vint<Nx>(hit_mask));
  173. #else
  174. vmask = select(hit_mask, vmask | bitmask, vmask);
  175. #endif
  176. #endif
  177. } while(m_active);
  178. return vmask;
  179. }
  180. template<int K>
  181. __forceinline static vint<Nx> traverseIncoherentStream(size_t m_active,
  182. TravRayKStreamRobust<K>* __restrict__ packets,
  183. const AABBNode* __restrict__ node,
  184. const NearFarPrecalculations& nf,
  185. const int shiftTable[32])
  186. {
  187. const vfloat<Nx> bminX = vfloat<Nx>(*(const vfloat<N>*)((const char*)&node->lower_x + nf.nearX));
  188. const vfloat<Nx> bminY = vfloat<Nx>(*(const vfloat<N>*)((const char*)&node->lower_x + nf.nearY));
  189. const vfloat<Nx> bminZ = vfloat<Nx>(*(const vfloat<N>*)((const char*)&node->lower_x + nf.nearZ));
  190. const vfloat<Nx> bmaxX = vfloat<Nx>(*(const vfloat<N>*)((const char*)&node->lower_x + nf.farX));
  191. const vfloat<Nx> bmaxY = vfloat<Nx>(*(const vfloat<N>*)((const char*)&node->lower_x + nf.farY));
  192. const vfloat<Nx> bmaxZ = vfloat<Nx>(*(const vfloat<N>*)((const char*)&node->lower_x + nf.farZ));
  193. assert(m_active);
  194. vint<Nx> vmask(zero);
  195. do
  196. {
  197. STAT3(shadow.trav_nodes,1,1,1);
  198. const size_t rayID = bscf(m_active);
  199. assert(rayID < MAX_INTERNAL_STREAM_SIZE);
  200. TravRayKStream<K,robust> &p = packets[rayID / K];
  201. const size_t i = rayID % K;
  202. const vint<Nx> bitmask(shiftTable[rayID]);
  203. const vfloat<Nx> tNearX = (bminX - p.org.x[i]) * p.rdir.x[i];
  204. const vfloat<Nx> tNearY = (bminY - p.org.y[i]) * p.rdir.y[i];
  205. const vfloat<Nx> tNearZ = (bminZ - p.org.z[i]) * p.rdir.z[i];
  206. const vfloat<Nx> tFarX = (bmaxX - p.org.x[i]) * p.rdir.x[i];
  207. const vfloat<Nx> tFarY = (bmaxY - p.org.y[i]) * p.rdir.y[i];
  208. const vfloat<Nx> tFarZ = (bmaxZ - p.org.z[i]) * p.rdir.z[i];
  209. const vfloat<Nx> tNear = maxi(tNearX, tNearY, tNearZ, vfloat<Nx>(p.tnear[i]));
  210. const vfloat<Nx> tFar = mini(tFarX , tFarY , tFarZ, vfloat<Nx>(p.tfar[i]));
  211. const float round_down = 1.0f-2.0f*float(ulp);
  212. const float round_up = 1.0f+2.0f*float(ulp);
  213. #if defined(__AVX512ER__)
  214. const vboolx m_node((1 << N)-1);
  215. const vbool<Nx> hit_mask = le(m_node, round_down*tNear, round_up*tFar);
  216. vmask = mask_or(hit_mask, vmask, vmask, bitmask);
  217. #else
  218. const vbool<Nx> hit_mask = round_down*tNear <= round_up*tFar;
  219. #if defined(__AVX2__)
  220. vmask = vmask | (bitmask & vint<Nx>(hit_mask));
  221. #else
  222. vmask = select(hit_mask, vmask | bitmask, vmask);
  223. #endif
  224. #endif
  225. } while(m_active);
  226. return vmask;
  227. }
  228. static const size_t stackSizeSingle = 1+(N-1)*BVH::maxDepth;
  229. public:
  230. static void intersect(Accel::Intersectors* This, RayHitN** inputRays, size_t numRays, IntersectContext* context);
  231. static void occluded (Accel::Intersectors* This, RayN** inputRays, size_t numRays, IntersectContext* context);
  232. private:
  233. template<int K>
  234. static void intersectCoherent(Accel::Intersectors* This, RayHitK<K>** inputRays, size_t numRays, IntersectContext* context);
  235. template<int K>
  236. static void occludedCoherent(Accel::Intersectors* This, RayK<K>** inputRays, size_t numRays, IntersectContext* context);
  237. template<int K>
  238. static void occludedIncoherent(Accel::Intersectors* This, RayK<K>** inputRays, size_t numRays, IntersectContext* context);
  239. };
  240. /*! BVH ray stream intersector with direct fallback to packets. */
  241. template<int N, int Nx>
  242. class BVHNIntersectorStreamPacketFallback
  243. {
  244. public:
  245. static void intersect(Accel::Intersectors* This, RayHitN** inputRays, size_t numRays, IntersectContext* context);
  246. static void occluded (Accel::Intersectors* This, RayN** inputRays, size_t numRays, IntersectContext* context);
  247. private:
  248. template<int K>
  249. static void intersectK(Accel::Intersectors* This, RayHitK<K>** inputRays, size_t numRays, IntersectContext* context);
  250. template<int K>
  251. static void occludedK(Accel::Intersectors* This, RayK<K>** inputRays, size_t numRays, IntersectContext* context);
  252. };
  253. }
  254. }