// MaskedOcclusionCullingAVX2.cpp (~13 KB)
  1. ////////////////////////////////////////////////////////////////////////////////
  2. // Copyright 2017 Intel Corporation
  3. //
  4. // Licensed under the Apache License, Version 2.0 (the "License"); you may not
  5. // use this file except in compliance with the License. You may obtain a copy
  6. // of the License at
  7. //
  8. // http://www.apache.org/licenses/LICENSE-2.0
  9. //
  10. // Unless required by applicable law or agreed to in writing, software
  11. // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
  12. // WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
  13. // License for the specific language governing permissions and limitations
  14. // under the License.
  15. ////////////////////////////////////////////////////////////////////////////////
#include <string.h>
#include <assert.h>
#include <float.h>
#include <new>   // placement new, used by CreateMaskedOcclusionCulling below
#include "MaskedOcclusionCulling.h"
#include "CompilerSpecific.inl"
#if MOC_RECORDER_ENABLE
#include "FrameRecorder.h"
#endif
#if defined(__MICROSOFT_COMPILER) && _MSC_VER < 1900
// If you remove/comment this error, the code will compile & use the SSE41 version instead.
#error Older versions than visual studio 2015 not supported due to compiler bug(s)
#endif
#if !defined(__MICROSOFT_COMPILER) || _MSC_VER >= 1900
// For performance reasons, the MaskedOcclusionCullingAVX2.cpp file should be compiled with VEX encoding for SSE instructions (to avoid
// AVX-SSE transition penalties, see https://software.intel.com/en-us/articles/avoiding-avx-sse-transition-penalties). However, the SSE
// version in MaskedOcclusionCulling.cpp _must_ be compiled without VEX encoding to allow backwards compatibility. Best practice is to
// use lowest supported target platform (e.g. /arch:SSE2) as project default, and elevate only the MaskedOcclusionCullingAVX2/512.cpp files.
#ifndef __AVX2__
#error For best performance, MaskedOcclusionCullingAVX2.cpp should be compiled with /arch:AVX2
#endif
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// AVX specific defines and constants
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
#define SIMD_LANES 8             // AVX2 registers hold 8 32-bit lanes
#define TILE_HEIGHT_SHIFT 3      // log2 of the tile height used by this backend
// Per-lane index constant: lane i contains the value i (0..7).
#define SIMD_LANE_IDX _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7)
// Column/row pixel offsets of the 8 sub-tiles inside a tile: 4 columns x 2 rows.
#define SIMD_SUB_TILE_COL_OFFSET _mm256_setr_epi32(0, SUB_TILE_WIDTH, SUB_TILE_WIDTH * 2, SUB_TILE_WIDTH * 3, 0, SUB_TILE_WIDTH, SUB_TILE_WIDTH * 2, SUB_TILE_WIDTH * 3)
#define SIMD_SUB_TILE_ROW_OFFSET _mm256_setr_epi32(0, 0, 0, 0, SUB_TILE_HEIGHT, SUB_TILE_HEIGHT, SUB_TILE_HEIGHT, SUB_TILE_HEIGHT)
// Same sub-tile offsets as above, as floats.
#define SIMD_SUB_TILE_COL_OFFSET_F _mm256_setr_ps(0, SUB_TILE_WIDTH, SUB_TILE_WIDTH * 2, SUB_TILE_WIDTH * 3, 0, SUB_TILE_WIDTH, SUB_TILE_WIDTH * 2, SUB_TILE_WIDTH * 3)
#define SIMD_SUB_TILE_ROW_OFFSET_F _mm256_setr_ps(0, 0, 0, 0, SUB_TILE_HEIGHT, SUB_TILE_HEIGHT, SUB_TILE_HEIGHT, SUB_TILE_HEIGHT)
// Byte-shuffle control: within each 128-bit half, collects byte k of every 32-bit
// word together (0,4,8,C / 1,5,9,D / ...), i.e. a 4x4 byte transpose per half.
#define SIMD_SHUFFLE_SCANLINE_TO_SUBTILES _mm256_setr_epi8(0x0, 0x4, 0x8, 0xC, 0x1, 0x5, 0x9, 0xD, 0x2, 0x6, 0xA, 0xE, 0x3, 0x7, 0xB, 0xF, 0x0, 0x4, 0x8, 0xC, 0x1, 0x5, 0x9, 0xD, 0x2, 0x6, 0xA, 0xE, 0x3, 0x7, 0xB, 0xF)
// Per-lane y coordinates: lane i = (2*i + 1) * 128, presumably pixel centers in a
// fixed-point format — confirm against the rasterizer in the common .inl.
#define SIMD_LANE_YCOORD_I _mm256_setr_epi32(128, 384, 640, 896, 1152, 1408, 1664, 1920)
#define SIMD_LANE_YCOORD_F _mm256_setr_ps(128.0f, 384.0f, 640.0f, 896.0f, 1152.0f, 1408.0f, 1664.0f, 1920.0f)
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// AVX specific typedefs and functions
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Width-agnostic SIMD types: the shared implementation (MaskedOcclusionCullingCommon.inl,
// included further down) is written against __mw/__mwi and the _mmw_* macros; this
// backend maps them onto 256-bit AVX2 registers and intrinsics.
typedef __m256 __mw;
typedef __m256i __mwi;
// --- float ops ------------------------------------------------------------------
#define _mmw_set1_ps _mm256_set1_ps
#define _mmw_setzero_ps _mm256_setzero_ps
#define _mmw_and_ps _mm256_and_ps
#define _mmw_or_ps _mm256_or_ps
#define _mmw_xor_ps _mm256_xor_ps
#define _mmw_not_ps(a) _mm256_xor_ps((a), _mm256_castsi256_ps(_mm256_set1_epi32(~0)))   // bitwise NOT via xor with all-ones
#define _mmw_andnot_ps _mm256_andnot_ps
#define _mmw_neg_ps(a) _mm256_xor_ps((a), _mm256_set1_ps(-0.0f))                        // flip sign bit
#define _mmw_abs_ps(a) _mm256_and_ps((a), _mm256_castsi256_ps(_mm256_set1_epi32(0x7FFFFFFF)))  // clear sign bit
#define _mmw_add_ps _mm256_add_ps
#define _mmw_sub_ps _mm256_sub_ps
#define _mmw_mul_ps _mm256_mul_ps
#define _mmw_div_ps _mm256_div_ps
#define _mmw_min_ps _mm256_min_ps
#define _mmw_max_ps _mm256_max_ps
#define _mmw_fmadd_ps _mm256_fmadd_ps
#define _mmw_fmsub_ps _mm256_fmsub_ps
#define _mmw_movemask_ps _mm256_movemask_ps
#define _mmw_blendv_ps _mm256_blendv_ps
// Ordered, non-signaling float comparisons (NaN compares false, no FP exceptions).
#define _mmw_cmpge_ps(a,b) _mm256_cmp_ps(a, b, _CMP_GE_OQ)
#define _mmw_cmpgt_ps(a,b) _mm256_cmp_ps(a, b, _CMP_GT_OQ)
#define _mmw_cmpeq_ps(a,b) _mm256_cmp_ps(a, b, _CMP_EQ_OQ)
#define _mmw_floor_ps(x) _mm256_round_ps(x, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC)
#define _mmw_ceil_ps(x) _mm256_round_ps(x, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC)
#define _mmw_shuffle_ps _mm256_shuffle_ps
#define _mmw_insertf32x4_ps _mm256_insertf128_ps
#define _mmw_cvtepi32_ps _mm256_cvtepi32_ps
// Integer blend emulated through the float blend (selects on the mask's sign bits).
#define _mmw_blendv_epi32(a,b,c) simd_cast<__mwi>(_mmw_blendv_ps(simd_cast<__mw>(a), simd_cast<__mw>(b), simd_cast<__mw>(c)))
// --- integer ops ----------------------------------------------------------------
#define _mmw_set1_epi32 _mm256_set1_epi32
#define _mmw_setzero_epi32 _mm256_setzero_si256
#define _mmw_and_epi32 _mm256_and_si256
#define _mmw_or_epi32 _mm256_or_si256
#define _mmw_xor_epi32 _mm256_xor_si256
#define _mmw_not_epi32(a) _mm256_xor_si256((a), _mm256_set1_epi32(~0))      // bitwise NOT
#define _mmw_andnot_epi32 _mm256_andnot_si256
#define _mmw_neg_epi32(a) _mm256_sub_epi32(_mm256_set1_epi32(0), (a))       // 0 - a
#define _mmw_add_epi32 _mm256_add_epi32
#define _mmw_sub_epi32 _mm256_sub_epi32
#define _mmw_min_epi32 _mm256_min_epi32
#define _mmw_max_epi32 _mm256_max_epi32
#define _mmw_subs_epu16 _mm256_subs_epu16
#define _mmw_mullo_epi32 _mm256_mullo_epi32
#define _mmw_cmpeq_epi32 _mm256_cmpeq_epi32
#define _mmw_testz_epi32 _mm256_testz_si256
#define _mmw_cmpgt_epi32 _mm256_cmpgt_epi32
#define _mmw_srai_epi32 _mm256_srai_epi32
#define _mmw_srli_epi32 _mm256_srli_epi32
#define _mmw_slli_epi32 _mm256_slli_epi32
// Per-lane variable shift of an all-ones constant (SIMD_BITS_ONE is defined elsewhere).
#define _mmw_sllv_ones(x) _mm256_sllv_epi32(SIMD_BITS_ONE, x)
#define _mmw_transpose_epi8(x) _mm256_shuffle_epi8(x, SIMD_SHUFFLE_SCANLINE_TO_SUBTILES)
#define _mmw_abs_epi32 _mm256_abs_epi32
#define _mmw_cvtps_epi32 _mm256_cvtps_epi32
#define _mmw_cvttps_epi32 _mm256_cvttps_epi32
// --- 128-bit helpers ------------------------------------------------------------
#define _mmx_dp4_ps(a, b) _mm_dp_ps(a, b, 0xFF)   // 4-component dot product, broadcast to all lanes
#define _mmx_fmadd_ps _mm_fmadd_ps
#define _mmx_max_epi32 _mm_max_epi32
#define _mmx_min_epi32 _mm_min_epi32
  111. /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
  112. // SIMD casting functions
  113. /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
  114. template<typename T, typename Y> FORCE_INLINE T simd_cast(Y A);
  115. template<> FORCE_INLINE __m128 simd_cast<__m128>(float A) { return _mm_set1_ps(A); }
  116. template<> FORCE_INLINE __m128 simd_cast<__m128>(__m128i A) { return _mm_castsi128_ps(A); }
  117. template<> FORCE_INLINE __m128 simd_cast<__m128>(__m128 A) { return A; }
  118. template<> FORCE_INLINE __m128i simd_cast<__m128i>(int A) { return _mm_set1_epi32(A); }
  119. template<> FORCE_INLINE __m128i simd_cast<__m128i>(__m128 A) { return _mm_castps_si128(A); }
  120. template<> FORCE_INLINE __m128i simd_cast<__m128i>(__m128i A) { return A; }
  121. template<> FORCE_INLINE __m256 simd_cast<__m256>(float A) { return _mm256_set1_ps(A); }
  122. template<> FORCE_INLINE __m256 simd_cast<__m256>(__m256i A) { return _mm256_castsi256_ps(A); }
  123. template<> FORCE_INLINE __m256 simd_cast<__m256>(__m256 A) { return A; }
  124. template<> FORCE_INLINE __m256i simd_cast<__m256i>(int A) { return _mm256_set1_epi32(A); }
  125. template<> FORCE_INLINE __m256i simd_cast<__m256i>(__m256 A) { return _mm256_castps_si256(A); }
  126. template<> FORCE_INLINE __m256i simd_cast<__m256i>(__m256i A) { return A; }
// MAKE_ACCESSOR generates a lane-accessor function: name(reg) returns a pointer to
// the register's storage viewed as a plain array, so e.g. simd_f32(v)[2] reads lane
// 2 of a float register. The is_const argument is either empty or `const`, yielding
// mutable and read-only overloads.
// NOTE(review): the union + reinterpret_cast punning is not strictly conforming ISO
// C++, but is the established idiom on the MSVC/GCC/Clang targets this library
// supports — confirm before porting to another compiler.
// (No comments inside the macro body: `//` would swallow the line-continuation `\`.)
#define MAKE_ACCESSOR(name, simd_type, base_type, is_const, elements) \
FORCE_INLINE is_const base_type * name(is_const simd_type &a) { \
union accessor { simd_type m_native; base_type m_array[elements]; }; \
is_const accessor *acs = reinterpret_cast<is_const accessor*>(&a); \
return acs->m_array; \
}
// Mutable and const accessors for 4-wide (SSE) and 8-wide (AVX) registers.
MAKE_ACCESSOR(simd_f32, __m128, float, , 4)
MAKE_ACCESSOR(simd_f32, __m128, float, const, 4)
MAKE_ACCESSOR(simd_i32, __m128i, int, , 4)
MAKE_ACCESSOR(simd_i32, __m128i, int, const, 4)
MAKE_ACCESSOR(simd_f32, __m256, float, , 8)
MAKE_ACCESSOR(simd_f32, __m256, float, const, 8)
MAKE_ACCESSOR(simd_i32, __m256i, int, , 8)
MAKE_ACCESSOR(simd_i32, __m256i, int, const, 8)
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Specialized AVX input assembly function for general vertex gather
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
typedef MaskedOcclusionCulling::VertexLayout VertexLayout;

// Gathers the X, Y and W components of up to 8 triangles' corners into SoA form:
// vtxX/vtxY/vtxW[corner] each hold one component for 8 triangles (one per lane).
// - inVtx:     vertex buffer; X is read at each vertex's base address, Y and W at
//              vtxLayout.mOffsetY / mOffsetW byte offsets
// - inTrisPtr: index buffer, 3 consecutive indices per triangle
// - numLanes:  number of valid triangles (1..8); lanes >= numLanes re-fetch
//              triangle 0's data and must be masked out by the caller
FORCE_INLINE void GatherVertices(__m256 *vtxX, __m256 *vtxY, __m256 *vtxW, const float *inVtx, const unsigned int *inTrisPtr, int numLanes, const VertexLayout &vtxLayout)
{
	assert(numLanes >= 1);
	// Lane i starts 3*i entries into the index list (one triangle = 3 indices).
	const __m256i SIMD_TRI_IDX_OFFSET = _mm256_setr_epi32(0, 3, 6, 9, 12, 15, 18, 21);
	// SIMD_LANE_MASK[n] has the first n lanes set to all-ones, the rest zero.
	static const __m256i SIMD_LANE_MASK[9] = {
		_mm256_setr_epi32( 0, 0, 0, 0, 0, 0, 0, 0),
		_mm256_setr_epi32(~0, 0, 0, 0, 0, 0, 0, 0),
		_mm256_setr_epi32(~0, ~0, 0, 0, 0, 0, 0, 0),
		_mm256_setr_epi32(~0, ~0, ~0, 0, 0, 0, 0, 0),
		_mm256_setr_epi32(~0, ~0, ~0, ~0, 0, 0, 0, 0),
		_mm256_setr_epi32(~0, ~0, ~0, ~0, ~0, 0, 0, 0),
		_mm256_setr_epi32(~0, ~0, ~0, ~0, ~0, ~0, 0, 0),
		_mm256_setr_epi32(~0, ~0, ~0, ~0, ~0, ~0, ~0, 0),
		_mm256_setr_epi32(~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0)
	};

	// Compute per-lane index list offset that guards against out of bounds memory
	// accesses: inactive lanes (>= numLanes) get offset 0 and so re-read triangle 0,
	// which is known-valid memory.
	__m256i safeTriIdxOffset = _mm256_and_si256(SIMD_TRI_IDX_OFFSET, SIMD_LANE_MASK[numLanes]);

	// Fetch triangle indices. Multiplying by mStride turns each vertex index into a
	// byte offset — the vertex gathers below use scale 1 with a char* base pointer.
	__m256i vtxIdx[3];
	vtxIdx[0] = _mmw_mullo_epi32(_mm256_i32gather_epi32((const int*)inTrisPtr + 0, safeTriIdxOffset, 4), _mmw_set1_epi32(vtxLayout.mStride));
	vtxIdx[1] = _mmw_mullo_epi32(_mm256_i32gather_epi32((const int*)inTrisPtr + 1, safeTriIdxOffset, 4), _mmw_set1_epi32(vtxLayout.mStride));
	vtxIdx[2] = _mmw_mullo_epi32(_mm256_i32gather_epi32((const int*)inTrisPtr + 2, safeTriIdxOffset, 4), _mmw_set1_epi32(vtxLayout.mStride));

	char *vPtr = (char *)inVtx;

	// Fetch triangle vertices: one gather per component and corner.
	for (int i = 0; i < 3; i++)
	{
		vtxX[i] = _mm256_i32gather_ps((float *)vPtr, vtxIdx[i], 1);
		vtxY[i] = _mm256_i32gather_ps((float *)(vPtr + vtxLayout.mOffsetY), vtxIdx[i], 1);
		vtxW[i] = _mm256_i32gather_ps((float *)(vPtr + vtxLayout.mOffsetW), vtxIdx[i], 1);
	}
}
  176. namespace MaskedOcclusionCullingAVX2
  177. {
  178. static MaskedOcclusionCulling::Implementation gInstructionSet = MaskedOcclusionCulling::AVX2;
  179. /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
  180. // Include common algorithm implementation (general, SIMD independent code)
  181. /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
  182. #include "MaskedOcclusionCullingCommon.inl"
  183. /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
  184. // Utility function to create a new object using the allocator callbacks
  185. /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
  186. typedef MaskedOcclusionCulling::pfnAlignedAlloc pfnAlignedAlloc;
  187. typedef MaskedOcclusionCulling::pfnAlignedFree pfnAlignedFree;
  188. MaskedOcclusionCulling *CreateMaskedOcclusionCulling(pfnAlignedAlloc alignedAlloc, pfnAlignedFree alignedFree)
  189. {
  190. MaskedOcclusionCullingPrivate *object = (MaskedOcclusionCullingPrivate *)alignedAlloc(64, sizeof(MaskedOcclusionCullingPrivate));
  191. new (object) MaskedOcclusionCullingPrivate(alignedAlloc, alignedFree);
  192. return object;
  193. }
  194. };
#else
// AVX2 was not available at compile time (old MSVC, see the #error above): provide a
// stub so callers still link against the same symbol.
namespace MaskedOcclusionCullingAVX2
{
typedef MaskedOcclusionCulling::pfnAlignedAlloc pfnAlignedAlloc;
typedef MaskedOcclusionCulling::pfnAlignedFree pfnAlignedFree;

// Fallback factory: always returns nullptr to signal that the AVX2 implementation is
// unavailable in this build (presumably the dispatcher then selects another backend —
// confirm in MaskedOcclusionCulling.cpp).
MaskedOcclusionCulling *CreateMaskedOcclusionCulling(pfnAlignedAlloc alignedAlloc, pfnAlignedFree alignedFree)
{
return nullptr;
}
};
#endif