// ParticleTileCullingCS_fail_unroll.hlsl
// RUN: %dxc -E main -T cs_6_0 %s | FileCheck %s
// CHECK: Could not unroll loop.
// Copied from the original ParticleBinCullingCS.hlsl
// The loop on line 141 cannot be unrolled because
// the starting index is not known at compile time.
//
// Copyright (c) Microsoft. All rights reserved.
// This code is licensed under the MIT License (MIT).
// THIS CODE IS PROVIDED *AS IS* WITHOUT WARRANTY OF
// ANY KIND, EITHER EXPRESS OR IMPLIED, INCLUDING ANY
// IMPLIED WARRANTIES OF FITNESS FOR A PARTICULAR
// PURPOSE, MERCHANTABILITY, OR NON-INFRINGEMENT.
//
// Developed by Minigraph
//
// Author(s): James Stanard
// Julia Careaga
//
  19. #include "ParticleUtility.hlsli"
  20. StructuredBuffer<uint> g_BinParticles : register(t0);
  21. StructuredBuffer<uint> g_BinCounters : register(t1);
  22. Texture2D<uint> g_DepthBounds : register(t2);
  23. StructuredBuffer<ParticleScreenData> g_VisibleParticles : register(t3);
  24. RWStructuredBuffer<uint> g_SortedParticles : register(u0);
  25. RWByteAddressBuffer g_TileHitMasks : register(u1);
  26. RWStructuredBuffer<uint> g_DrawPackets : register(u2);
  27. RWStructuredBuffer<uint> g_FastDrawPackets : register(u3);
  28. RWByteAddressBuffer g_DrawPacketCount : register(u4);
  29. #if TILES_PER_BIN < 64
  30. #define GROUP_THREAD_COUNT 64
  31. #else
  32. #define GROUP_THREAD_COUNT TILES_PER_BIN
  33. #endif
  34. #define GROUP_SIZE_X TILES_PER_BIN_X
  35. #define GROUP_SIZE_Y (GROUP_THREAD_COUNT / GROUP_SIZE_X)
  36. #define MASK_WORDS_PER_ITER (GROUP_THREAD_COUNT / 32)
  37. groupshared uint gs_SortKeys[MAX_PARTICLES_PER_BIN];
  38. groupshared uint gs_IntersectionMasks[TILES_PER_BIN * MASK_WORDS_PER_ITER];
  39. groupshared uint gs_TileParticleCounts[TILES_PER_BIN];
  40. groupshared uint gs_SlowTileParticleCounts[TILES_PER_BIN];
  41. groupshared uint gs_MinMaxDepth[TILES_PER_BIN];
  42. void BitonicSort(uint GI, uint NumElements, uint NextPow2, uint NumThreads)
  43. {
  44. for (uint k = 2; k <= NextPow2; k *= 2)
  45. {
  46. // Align NumElements to the next multiple of k
  47. NumElements = (NumElements + k - 1) & ~(k - 1);
  48. for (uint j = k / 2; j > 0; j /= 2)
  49. {
  50. // Loop over all N/2 unique element pairs
  51. for (uint i = GI; i < NumElements / 2; i += NumThreads)
  52. {
  53. uint Index1 = InsertZeroBit(i, j);
  54. uint Index2 = Index1 | j;
  55. uint A = gs_SortKeys[Index1];
  56. uint B = gs_SortKeys[Index2];
  57. if ((A < B) != ((Index1 & k) == 0))
  58. {
  59. gs_SortKeys[Index1] = B;
  60. gs_SortKeys[Index2] = A;
  61. }
  62. }
  63. GroupMemoryBarrierWithGroupSync();
  64. }
  65. }
  66. }
  67. uint ComputeMaskOffset( uint2 Gid, uint2 GTid )
  68. {
  69. // Sometimes we have more threads than tiles per bin.
  70. uint2 OutTileCoord = Gid.xy * uint2(TILES_PER_BIN_X, TILES_PER_BIN_Y) + uint2(GTid.x, GTid.y % TILES_PER_BIN_Y);
  71. uint OutTileIdx = OutTileCoord.x + OutTileCoord.y * gTileRowPitch;
  72. return OutTileIdx * MAX_PARTICLES_PER_BIN / 8 + GTid.y / TILES_PER_BIN_Y * 4;
  73. }
  74. [RootSignature(Particle_RootSig)]
  75. [numthreads(GROUP_SIZE_X, GROUP_SIZE_Y, 1)]
  76. void main( uint3 Gid : SV_GroupID, uint GI : SV_GroupIndex, uint3 GTid : SV_GroupThreadID )
  77. {
  78. // Each group is assigned a bin
  79. uint BinIndex = Gid.y * gBinsPerRow + Gid.x;
  80. uint ParticleCountInBin = g_BinCounters[BinIndex];
  81. if (ParticleCountInBin == 0)
  82. return;
  83. // Get the start location for particles in this bin
  84. uint BinStart = BinIndex * MAX_PARTICLES_PER_BIN;
  85. // Each thread is assigned a tile
  86. uint2 TileCoord = Gid.xy * uint2(TILES_PER_BIN_X, TILES_PER_BIN_Y) + GTid.xy;
  87. if (GI < TILES_PER_BIN)
  88. {
  89. gs_TileParticleCounts[GI] = 0;
  90. gs_SlowTileParticleCounts[GI] = 0;
  91. gs_MinMaxDepth[GI] = g_DepthBounds[TileCoord] << 2;
  92. }
  93. // Sometimes the counter value exceeds the actual storage size
  94. ParticleCountInBin = min(MAX_PARTICLES_PER_BIN, ParticleCountInBin);
  95. // Compute the next power of two for the bitonic sort
  96. uint NextPow2 = countbits(ParticleCountInBin) <= 1 ? ParticleCountInBin : (2 << firstbithigh(ParticleCountInBin));
  97. // Fill in the sort key array. Each sort key has passenger data (in the least signficant
  98. // bits, so that as the sort keys are moved around, they retain a pointer to the particle
  99. // they refer to.
  100. for (uint k = GI; k < NextPow2; k += GROUP_THREAD_COUNT)
  101. gs_SortKeys[k] = k < ParticleCountInBin ? g_BinParticles[BinStart + k] : 0xffffffff;
  102. GroupMemoryBarrierWithGroupSync();
  103. // Sort the particles from front to back.
  104. BitonicSort(GI, ParticleCountInBin, NextPow2, GROUP_THREAD_COUNT);
  105. // Upper-left tile coord and lower-right coord, clamped to the screen
  106. const int2 StartTile = Gid.xy * uint2(TILES_PER_BIN_X, TILES_PER_BIN_Y);
  107. // Each thread writes the hit mask for one tile
  108. uint OutOffsetInBytes = ComputeMaskOffset(Gid.xy, GTid.xy);
  109. // Loop over all sorted particles, group-size count at a time
  110. for (uint Iter = 0; Iter < ParticleCountInBin; Iter += GROUP_THREAD_COUNT)
  111. {
  112. // Reset temporary particle intersection masks. There are two words (64-bits) per thread.
  113. [unroll]
  114. for (uint C = GI; C < TILES_PER_BIN * MASK_WORDS_PER_ITER; C += GROUP_THREAD_COUNT)
  115. gs_IntersectionMasks[C] = 0;
  116. GroupMemoryBarrierWithGroupSync();
  117. // The array index of the particle this thread will test
  118. uint SortIdx = Iter + GI;
  119. // Compute word and bit to set (from thread index)
  120. uint WordOffset = GI >> 5;
  121. uint BitOffset = GI & 31;
  122. // Only do the loads and stores if this is a valid index (see constant number of iterations comment above)
  123. if (SortIdx < ParticleCountInBin)
  124. {
  125. uint SortKey = gs_SortKeys[SortIdx];
  126. uint GlobalIdx = SortKey & 0x3FFFF;
  127. // After this phase, all we care about is its global index
  128. g_SortedParticles[BinStart + SortIdx] = SortKey;
  129. uint Bounds = g_VisibleParticles[GlobalIdx].Bounds;
  130. int2 MinTile = uint2(Bounds >> 0, Bounds >> 8) & 0xFF;
  131. int2 MaxTile = uint2(Bounds >> 16, Bounds >> 24) & 0xFF;
  132. MinTile = max(MinTile - StartTile, 0);
  133. MaxTile = min(MaxTile - StartTile, int2(TILES_PER_BIN_X, TILES_PER_BIN_Y) - 1);
  134. for (int y = MinTile.y; y <= MaxTile.y; y++)
  135. {
  136. for (int x = MinTile.x; x <= MaxTile.x; x++)
  137. {
  138. uint TileIndex = y * TILES_PER_BIN_X + x;
  139. uint TileMaxZ = gs_MinMaxDepth[TileIndex];
  140. uint Inside = SortKey < TileMaxZ ? 1 : 0;
  141. uint SlowPath = SortKey > (TileMaxZ << 16) ? Inside : 0;
  142. InterlockedAdd(gs_SlowTileParticleCounts[TileIndex], SlowPath);
  143. InterlockedOr(gs_IntersectionMasks[TileIndex * MASK_WORDS_PER_ITER + WordOffset], Inside << BitOffset);
  144. }
  145. }
  146. }
  147. GroupMemoryBarrierWithGroupSync();
  148. #if TILES_PER_BIN < GROUP_THREAD_COUNT
  149. // Copy the hit masks from LDS to the output buffer. Here, each thread copies a single word
  150. if (GI < TILES_PER_BIN * MASK_WORDS_PER_ITER)
  151. {
  152. uint TileIndex = GI % TILES_PER_BIN;
  153. uint Offset = TileIndex * MASK_WORDS_PER_ITER + (GI / TILES_PER_BIN);
  154. uint Mask = gs_IntersectionMasks[Offset];
  155. InterlockedAdd(gs_TileParticleCounts[TileIndex], countbits(Mask));
  156. g_TileHitMasks.Store(OutOffsetInBytes, Mask);
  157. OutOffsetInBytes += 8;
  158. }
  159. #else
  160. // Copy the hit masks from LDS to the output buffer. Here, each thread is assigned a tile.
  161. uint Offset = GI * MASK_WORDS_PER_ITER;
  162. [unroll]
  163. for (uint O = 0; O < MASK_WORDS_PER_ITER; O += 2)
  164. {
  165. uint Mask0 = gs_IntersectionMasks[Offset+O];
  166. uint Mask1 = gs_IntersectionMasks[Offset+O+1];
  167. InterlockedAdd(gs_TileParticleCounts[GI], countbits(Mask0) + countbits(Mask1));
  168. g_TileHitMasks.Store2( OutOffsetInBytes, uint2(Mask0, Mask1) );
  169. OutOffsetInBytes += 8;
  170. }
  171. #endif
  172. GroupMemoryBarrierWithGroupSync();
  173. }
  174. if (GI >= TILES_PER_BIN)
  175. return;
  176. uint ParticleCountInThisThreadsTile = gs_TileParticleCounts[GI];
  177. if (ParticleCountInThisThreadsTile > 0)
  178. {
  179. uint SlowParticlesInThisThreadsTile = gs_SlowTileParticleCounts[GI];
  180. uint Packet = TileCoord.x << 16 | TileCoord.y << 24 | ParticleCountInThisThreadsTile;
  181. uint NewPacketIndex;
  182. if (SlowParticlesInThisThreadsTile > 0)
  183. {
  184. g_DrawPacketCount.InterlockedAdd(0, 1, NewPacketIndex);
  185. g_DrawPackets[NewPacketIndex] = Packet;
  186. }
  187. else
  188. {
  189. g_DrawPacketCount.InterlockedAdd(12, 1, NewPacketIndex);
  190. g_FastDrawPackets[NewPacketIndex] = Packet;
  191. }
  192. }
  193. }