// RUN: %dxc -E main -T cs_6_0 %s | FileCheck %s
// CHECK: groupId
// CHECK: flattenedThreadIdInGroup
// CHECK: threadIdInGroup
// CHECK: bufferLoad
// CHECK: textureLoad
// CHECK: UMin
// CHECK: Countbits
// CHECK: FirstbitHi
// CHECK: barrier
// CHECK: bufferStore
// CHECK: IMax
// CHECK: IMin
// CHECK: bufferStore
// CHECK: AtomicAdd
//
// Copyright (c) Microsoft. All rights reserved.
// This code is licensed under the MIT License (MIT).
// THIS CODE IS PROVIDED *AS IS* WITHOUT WARRANTY OF
// ANY KIND, EITHER EXPRESS OR IMPLIED, INCLUDING ANY
// IMPLIED WARRANTIES OF FITNESS FOR A PARTICULAR
// PURPOSE, MERCHANTABILITY, OR NON-INFRINGEMENT.
//
// Developed by Minigraph
//
// Author(s): James Stanard
// Julia Careaga
//
  29. #include "ParticleUtility.hlsli"
  30. StructuredBuffer<uint> g_BinParticles : register(t0);
  31. StructuredBuffer<uint> g_BinCounters : register(t1);
  32. Texture2D<uint> g_DepthBounds : register(t2);
  33. StructuredBuffer<ParticleScreenData> g_VisibleParticles : register(t3);
  34. RWStructuredBuffer<uint> g_SortedParticles : register(u0);
  35. RWByteAddressBuffer g_TileHitMasks : register(u1);
  36. RWStructuredBuffer<uint> g_DrawPackets : register(u2);
  37. RWStructuredBuffer<uint> g_FastDrawPackets : register(u3);
  38. RWByteAddressBuffer g_DrawPacketCount : register(u4);
  39. #if TILES_PER_BIN < 64
  40. #define GROUP_THREAD_COUNT 64
  41. #else
  42. #define GROUP_THREAD_COUNT TILES_PER_BIN
  43. #endif
  44. #define GROUP_SIZE_X TILES_PER_BIN_X
  45. #define GROUP_SIZE_Y (GROUP_THREAD_COUNT / GROUP_SIZE_X)
  46. #define MASK_WORDS_PER_ITER (GROUP_THREAD_COUNT / 32)
  47. groupshared uint gs_SortKeys[MAX_PARTICLES_PER_BIN];
  48. groupshared uint gs_IntersectionMasks[TILES_PER_BIN * MASK_WORDS_PER_ITER];
  49. groupshared uint gs_TileParticleCounts[TILES_PER_BIN];
  50. groupshared uint gs_SlowTileParticleCounts[TILES_PER_BIN];
  51. groupshared uint gs_MinMaxDepth[TILES_PER_BIN];
  52. void BitonicSort(uint GI, uint NumElements, uint NextPow2, uint NumThreads)
  53. {
  54. for (uint k = 2; k <= NextPow2; k *= 2)
  55. {
  56. // Align NumElements to the next multiple of k
  57. NumElements = (NumElements + k - 1) & ~(k - 1);
  58. for (uint j = k / 2; j > 0; j /= 2)
  59. {
  60. // Loop over all N/2 unique element pairs
  61. for (uint i = GI; i < NumElements / 2; i += NumThreads)
  62. {
  63. uint Index1 = InsertZeroBit(i, j);
  64. uint Index2 = Index1 | j;
  65. uint A = gs_SortKeys[Index1];
  66. uint B = gs_SortKeys[Index2];
  67. if ((A < B) != ((Index1 & k) == 0))
  68. {
  69. gs_SortKeys[Index1] = B;
  70. gs_SortKeys[Index2] = A;
  71. }
  72. }
  73. GroupMemoryBarrierWithGroupSync();
  74. }
  75. }
  76. }
  77. uint ComputeMaskOffset( uint2 Gid, uint2 GTid )
  78. {
  79. // Sometimes we have more threads than tiles per bin.
  80. uint2 OutTileCoord = Gid.xy * uint2(TILES_PER_BIN_X, TILES_PER_BIN_Y) + uint2(GTid.x, GTid.y % TILES_PER_BIN_Y);
  81. uint OutTileIdx = OutTileCoord.x + OutTileCoord.y * gTileRowPitch;
  82. return OutTileIdx * MAX_PARTICLES_PER_BIN / 8 + GTid.y / TILES_PER_BIN_Y * 4;
  83. }
  84. [RootSignature(Particle_RootSig)]
  85. [numthreads(GROUP_SIZE_X, GROUP_SIZE_Y, 1)]
  86. void main( uint3 Gid : SV_GroupID, uint GI : SV_GroupIndex, uint3 GTid : SV_GroupThreadID )
  87. {
  88. // Each group is assigned a bin
  89. uint BinIndex = Gid.y * gBinsPerRow + Gid.x;
  90. uint ParticleCountInBin = g_BinCounters[BinIndex];
  91. if (ParticleCountInBin == 0)
  92. return;
  93. // Get the start location for particles in this bin
  94. uint BinStart = BinIndex * MAX_PARTICLES_PER_BIN;
  95. // Each thread is assigned a tile
  96. uint2 TileCoord = Gid.xy * uint2(TILES_PER_BIN_X, TILES_PER_BIN_Y) + GTid.xy;
  97. if (GI < TILES_PER_BIN)
  98. {
  99. gs_TileParticleCounts[GI] = 0;
  100. gs_SlowTileParticleCounts[GI] = 0;
  101. gs_MinMaxDepth[GI] = g_DepthBounds[TileCoord] << 2;
  102. }
  103. // Sometimes the counter value exceeds the actual storage size
  104. ParticleCountInBin = min(MAX_PARTICLES_PER_BIN, ParticleCountInBin);
  105. // Compute the next power of two for the bitonic sort
  106. uint NextPow2 = countbits(ParticleCountInBin) <= 1 ? ParticleCountInBin : (2 << firstbithigh(ParticleCountInBin));
  107. // Fill in the sort key array. Each sort key has passenger data (in the least signficant
  108. // bits, so that as the sort keys are moved around, they retain a pointer to the particle
  109. // they refer to.
  110. for (uint k = GI; k < NextPow2; k += GROUP_THREAD_COUNT)
  111. gs_SortKeys[k] = k < ParticleCountInBin ? g_BinParticles[BinStart + k] : 0xffffffff;
  112. GroupMemoryBarrierWithGroupSync();
  113. // Sort the particles from front to back.
  114. BitonicSort(GI, ParticleCountInBin, NextPow2, GROUP_THREAD_COUNT);
  115. // Upper-left tile coord and lower-right coord, clamped to the screen
  116. const int2 StartTile = Gid.xy * uint2(TILES_PER_BIN_X, TILES_PER_BIN_Y);
  117. // Each thread writes the hit mask for one tile
  118. uint OutOffsetInBytes = ComputeMaskOffset(Gid.xy, GTid.xy);
  119. // Loop over all sorted particles, group-size count at a time
  120. for (uint Iter = 0; Iter < ParticleCountInBin; Iter += GROUP_THREAD_COUNT)
  121. {
  122. // Reset temporary particle intersection masks. There are two words (64-bits) per thread.
  123. // [unroll] // Change to allow new unroll behavior.
  124. for (uint C = GI; C < TILES_PER_BIN * MASK_WORDS_PER_ITER; C += GROUP_THREAD_COUNT)
  125. gs_IntersectionMasks[C] = 0;
  126. GroupMemoryBarrierWithGroupSync();
  127. // The array index of the particle this thread will test
  128. uint SortIdx = Iter + GI;
  129. // Compute word and bit to set (from thread index)
  130. uint WordOffset = GI >> 5;
  131. uint BitOffset = GI & 31;
  132. // Only do the loads and stores if this is a valid index (see constant number of iterations comment above)
  133. if (SortIdx < ParticleCountInBin)
  134. {
  135. uint SortKey = gs_SortKeys[SortIdx];
  136. uint GlobalIdx = SortKey & 0x3FFFF;
  137. // After this phase, all we care about is its global index
  138. g_SortedParticles[BinStart + SortIdx] = SortKey;
  139. uint Bounds = g_VisibleParticles[GlobalIdx].Bounds;
  140. int2 MinTile = uint2(Bounds >> 0, Bounds >> 8) & 0xFF;
  141. int2 MaxTile = uint2(Bounds >> 16, Bounds >> 24) & 0xFF;
  142. MinTile = max(MinTile - StartTile, 0);
  143. MaxTile = min(MaxTile - StartTile, int2(TILES_PER_BIN_X, TILES_PER_BIN_Y) - 1);
  144. for (int y = MinTile.y; y <= MaxTile.y; y++)
  145. {
  146. for (int x = MinTile.x; x <= MaxTile.x; x++)
  147. {
  148. uint TileIndex = y * TILES_PER_BIN_X + x;
  149. uint TileMaxZ = gs_MinMaxDepth[TileIndex];
  150. uint Inside = SortKey < TileMaxZ ? 1 : 0;
  151. uint SlowPath = SortKey > (TileMaxZ << 16) ? Inside : 0;
  152. InterlockedAdd(gs_SlowTileParticleCounts[TileIndex], SlowPath);
  153. InterlockedOr(gs_IntersectionMasks[TileIndex * MASK_WORDS_PER_ITER + WordOffset], Inside << BitOffset);
  154. }
  155. }
  156. }
  157. GroupMemoryBarrierWithGroupSync();
  158. #if TILES_PER_BIN < GROUP_THREAD_COUNT
  159. // Copy the hit masks from LDS to the output buffer. Here, each thread copies a single word
  160. if (GI < TILES_PER_BIN * MASK_WORDS_PER_ITER)
  161. {
  162. uint TileIndex = GI % TILES_PER_BIN;
  163. uint Offset = TileIndex * MASK_WORDS_PER_ITER + (GI / TILES_PER_BIN);
  164. uint Mask = gs_IntersectionMasks[Offset];
  165. InterlockedAdd(gs_TileParticleCounts[TileIndex], countbits(Mask));
  166. g_TileHitMasks.Store(OutOffsetInBytes, Mask);
  167. OutOffsetInBytes += 8;
  168. }
  169. #else
  170. // Copy the hit masks from LDS to the output buffer. Here, each thread is assigned a tile.
  171. uint Offset = GI * MASK_WORDS_PER_ITER;
  172. [unroll]
  173. for (uint O = 0; O < MASK_WORDS_PER_ITER; O += 2)
  174. {
  175. uint Mask0 = gs_IntersectionMasks[Offset+O];
  176. uint Mask1 = gs_IntersectionMasks[Offset+O+1];
  177. InterlockedAdd(gs_TileParticleCounts[GI], countbits(Mask0) + countbits(Mask1));
  178. g_TileHitMasks.Store2( OutOffsetInBytes, uint2(Mask0, Mask1) );
  179. OutOffsetInBytes += 8;
  180. }
  181. #endif
  182. GroupMemoryBarrierWithGroupSync();
  183. }
  184. if (GI >= TILES_PER_BIN)
  185. return;
  186. uint ParticleCountInThisThreadsTile = gs_TileParticleCounts[GI];
  187. if (ParticleCountInThisThreadsTile > 0)
  188. {
  189. uint SlowParticlesInThisThreadsTile = gs_SlowTileParticleCounts[GI];
  190. uint Packet = TileCoord.x << 16 | TileCoord.y << 24 | ParticleCountInThisThreadsTile;
  191. uint NewPacketIndex;
  192. if (SlowParticlesInThisThreadsTile > 0)
  193. {
  194. g_DrawPacketCount.InterlockedAdd(0, 1, NewPacketIndex);
  195. g_DrawPackets[NewPacketIndex] = Packet;
  196. }
  197. else
  198. {
  199. g_DrawPacketCount.InterlockedAdd(12, 1, NewPacketIndex);
  200. g_FastDrawPackets[NewPacketIndex] = Packet;
  201. }
  202. }
  203. }