| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242 |
- // RUN: %dxc -E main -T cs_6_0 %s | FileCheck %s
- // CHECK: groupId
- // CHECK: flattenedThreadIdInGroup
- // CHECK: threadIdInGroup
- // CHECK: bufferLoad
- // CHECK: textureLoad
- // CHECK: UMin
- // CHECK: Countbits
- // CHECK: FirstbitHi
- // CHECK: barrier
- // CHECK: bufferStore
- // CHECK: IMax
- // CHECK: IMin
- // CHECK: bufferStore
- // CHECK: AtomicAdd
- //
- // Copyright (c) Microsoft. All rights reserved.
- // This code is licensed under the MIT License (MIT).
- // THIS CODE IS PROVIDED *AS IS* WITHOUT WARRANTY OF
- // ANY KIND, EITHER EXPRESS OR IMPLIED, INCLUDING ANY
- // IMPLIED WARRANTIES OF FITNESS FOR A PARTICULAR
- // PURPOSE, MERCHANTABILITY, OR NON-INFRINGEMENT.
- //
- // Developed by Minigraph
- //
- // Author(s): James Stanard
- // Julia Careaga
- //
- #include "ParticleUtility.hlsli"
- // ---- Read-only inputs ----
- // Per-bin particle sort keys; main() reads MAX_PARTICLES_PER_BIN consecutive slots per bin.
- StructuredBuffer<uint> g_BinParticles : register(t0);
- // Number of particles recorded for each bin (may exceed MAX_PARTICLES_PER_BIN; main() clamps it).
- StructuredBuffer<uint> g_BinCounters : register(t1);
- // Per-tile depth bounds, indexed by 2D tile coordinate.
- Texture2D<uint> g_DepthBounds : register(t2);
- // Per-particle screen data; main() reads only the packed tile bounds (.Bounds).
- StructuredBuffer<ParticleScreenData> g_VisibleParticles : register(t3);
- // ---- Outputs ----
- // Sorted keys, written back at the same per-bin slot range as g_BinParticles.
- RWStructuredBuffer<uint> g_SortedParticles : register(u0);
- // Per-tile hit masks; byte offsets come from ComputeMaskOffset().
- RWByteAddressBuffer g_TileHitMasks : register(u1);
- // Draw packets for tiles that have at least one "slow path" particle.
- RWStructuredBuffer<uint> g_DrawPackets : register(u2);
- // Draw packets for tiles whose particles are all on the fast path.
- RWStructuredBuffer<uint> g_FastDrawPackets : register(u3);
- // Atomic packet counters: byte offset 0 counts g_DrawPackets, byte offset 12 counts g_FastDrawPackets.
- RWByteAddressBuffer g_DrawPacketCount : register(u4);
- // The group always has at least 64 threads, even when a bin has fewer tiles than that.
- #if TILES_PER_BIN < 64
- #define GROUP_THREAD_COUNT 64
- #else
- #define GROUP_THREAD_COUNT TILES_PER_BIN
- #endif
- // Thread-group shape: one column of threads per tile column of the bin.
- #define GROUP_SIZE_X TILES_PER_BIN_X
- #define GROUP_SIZE_Y (GROUP_THREAD_COUNT / GROUP_SIZE_X)
- // Number of 32-bit words needed to hold one bit per thread in the group.
- #define MASK_WORDS_PER_ITER (GROUP_THREAD_COUNT / 32)
- // Sort keys of the bin currently being processed (sorted in place by BitonicSort).
- groupshared uint gs_SortKeys[MAX_PARTICLES_PER_BIN];
- // Scratch hit masks: MASK_WORDS_PER_ITER words per tile, rebuilt on every iteration of main()'s particle loop.
- groupshared uint gs_IntersectionMasks[TILES_PER_BIN * MASK_WORDS_PER_ITER];
- // Running count of particles that hit each tile.
- groupshared uint gs_TileParticleCounts[TILES_PER_BIN];
- // Running count of hits per tile that were flagged as slow path.
- groupshared uint gs_SlowTileParticleCounts[TILES_PER_BIN];
- // Depth bounds of each tile, loaded from g_DepthBounds and shifted left by 2.
- groupshared uint gs_MinMaxDepth[TILES_PER_BIN];
- // Sorts gs_SortKeys in place with a bitonic sorting network, cooperatively across the group.
- //   GI          - flattened index of the calling thread within the group
- //   NumElements - number of keys to sort; rounded up internally to a multiple of the current sequence length
- //   NextPow2    - power-of-two size the network is built for (largest sequence length processed)
- //   NumThreads  - number of threads in the group; used as the stride when distributing pairs
- // Every thread of the group must call this, because each pass ends in a group-wide barrier.
- // InsertZeroBit is defined outside this file section (presumably ParticleUtility.hlsli).
- void BitonicSort(uint GI, uint NumElements, uint NextPow2, uint NumThreads)
- {
-     for (uint SeqLen = 2; SeqLen <= NextPow2; SeqLen <<= 1)
-     {
-         // Round the element count up to a whole number of sequences of this length
-         const uint AlignMask = SeqLen - 1;
-         NumElements = (NumElements + AlignMask) & ~AlignMask;
-         const uint PairCount = NumElements >> 1;
-
-         for (uint Stride = SeqLen >> 1; Stride != 0; Stride >>= 1)
-         {
-             // Threads share the compare-exchange pairs of this pass between them
-             for (uint Pair = GI; Pair < PairCount; Pair += NumThreads)
-             {
-                 const uint LoIdx = InsertZeroBit(Pair, Stride);
-                 const uint HiIdx = LoIdx | Stride;
-                 const uint LoKey = gs_SortKeys[LoIdx];
-                 const uint HiKey = gs_SortKeys[HiIdx];
-
-                 // Sequences alternate direction depending on the SeqLen bit of the index
-                 const bool Ascending = (LoIdx & SeqLen) == 0;
-                 if ((LoKey < HiKey) != Ascending)
-                 {
-                     gs_SortKeys[LoIdx] = HiKey;
-                     gs_SortKeys[HiIdx] = LoKey;
-                 }
-             }
-
-             // All exchanges of this pass must be visible before the next pass reads them
-             GroupMemoryBarrierWithGroupSync();
-         }
-     }
- }
- // Returns the byte offset in g_TileHitMasks at which the calling thread starts writing.
- //   Gid  - 2D thread-group ID (selects the bin)
- //   GTid - 2D thread ID within the group
- // Each tile owns MAX_PARTICLES_PER_BIN / 8 bytes. When the group has more rows of
- // threads than the bin has rows of tiles, the extra rows wrap onto the same tiles
- // and are offset by 4 bytes per wrap.
- uint ComputeMaskOffset( uint2 Gid, uint2 GTid )
- {
-     const uint2 BinOriginTile = Gid * uint2(TILES_PER_BIN_X, TILES_PER_BIN_Y);
-     const uint2 TileInBin = uint2(GTid.x, GTid.y % TILES_PER_BIN_Y);
-     const uint2 GlobalTile = BinOriginTile + TileInBin;
-
-     // Linearise the tile coordinate using the row pitch of the tile grid
-     const uint LinearTile = GlobalTile.y * gTileRowPitch + GlobalTile.x;
-
-     const uint TileBaseBytes = LinearTile * MAX_PARTICLES_PER_BIN / 8;
-     const uint WrapBytes = GTid.y / TILES_PER_BIN_Y * 4;
-     return TileBaseBytes + WrapBytes;
- }
- // Compute-shader entry point; one thread group processes one bin.
- //   Gid  - thread-group ID; Gid.xy selects the bin
- //   GI   - flattened thread index within the group
- //   GTid - thread ID within the group; GTid.xy maps the thread to a tile of the bin
- // Steps: load and sort the bin's particle keys, write them to g_SortedParticles,
- // build per-tile hit masks (one bit per sorted particle) into g_TileHitMasks, then
- // emit one draw packet for every tile that was hit by at least one particle.
- [RootSignature(Particle_RootSig)]
- [numthreads(GROUP_SIZE_X, GROUP_SIZE_Y, 1)]
- void main( uint3 Gid : SV_GroupID, uint GI : SV_GroupIndex, uint3 GTid : SV_GroupThreadID )
- {
- // Each group is assigned a bin
- uint BinIndex = Gid.y * gBinsPerRow + Gid.x;
- uint ParticleCountInBin = g_BinCounters[BinIndex];
- // BinIndex depends only on Gid, so every thread in the group reads the same
- // counter and the group exits as a whole, before any barrier is reached.
- if (ParticleCountInBin == 0)
- return;
- // Get the start location for particles in this bin
- uint BinStart = BinIndex * MAX_PARTICLES_PER_BIN;
- // Each thread is assigned a tile
- uint2 TileCoord = Gid.xy * uint2(TILES_PER_BIN_X, TILES_PER_BIN_Y) + GTid.xy;
- // Only the first TILES_PER_BIN threads own a tile; they reset its counters and load its depth bounds.
- // NOTE(review): the << 2 presumably aligns the depth bounds with the sort-key bit layout - confirm.
- if (GI < TILES_PER_BIN)
- {
- gs_TileParticleCounts[GI] = 0;
- gs_SlowTileParticleCounts[GI] = 0;
- gs_MinMaxDepth[GI] = g_DepthBounds[TileCoord] << 2;
- }
- // Sometimes the counter value exceeds the actual storage size
- ParticleCountInBin = min(MAX_PARTICLES_PER_BIN, ParticleCountInBin);
- // Compute the next power of two for the bitonic sort. A count with a single bit set is
- // already a power of two; otherwise round up to twice the value of its highest set bit.
- uint NextPow2 = countbits(ParticleCountInBin) <= 1 ? ParticleCountInBin : (2 << firstbithigh(ParticleCountInBin));
- // Fill in the sort key array. Each sort key has passenger data (in the least significant
- // bits), so that as the sort keys are moved around, they retain a pointer to the particle
- // they refer to. Slots between the particle count and NextPow2 are padded with 0xffffffff.
- for (uint k = GI; k < NextPow2; k += GROUP_THREAD_COUNT)
- gs_SortKeys[k] = k < ParticleCountInBin ? g_BinParticles[BinStart + k] : 0xffffffff;
- GroupMemoryBarrierWithGroupSync();
- // Sort the particles from front to back.
- BitonicSort(GI, ParticleCountInBin, NextPow2, GROUP_THREAD_COUNT);
- // Upper-left tile coordinate of this bin
- const int2 StartTile = Gid.xy * uint2(TILES_PER_BIN_X, TILES_PER_BIN_Y);
- // Each thread writes the hit mask for one tile
- uint OutOffsetInBytes = ComputeMaskOffset(Gid.xy, GTid.xy);
- // Loop over all sorted particles, group-size count at a time
- for (uint Iter = 0; Iter < ParticleCountInBin; Iter += GROUP_THREAD_COUNT)
- {
- // Reset temporary particle intersection masks. There are MASK_WORDS_PER_ITER words per
- // tile, i.e. one bit for each particle tested in this iteration.
- // [unroll] // Change to allow new unroll behavior.
- for (uint C = GI; C < TILES_PER_BIN * MASK_WORDS_PER_ITER; C += GROUP_THREAD_COUNT)
- gs_IntersectionMasks[C] = 0;
- GroupMemoryBarrierWithGroupSync();
- // The array index of the particle this thread will test
- uint SortIdx = Iter + GI;
- // Compute word and bit to set (from thread index)
- uint WordOffset = GI >> 5;
- uint BitOffset = GI & 31;
- // Only do the loads and stores if this is a valid index. Threads past the end of the
- // particle list skip this block but still take part in the barriers below.
- if (SortIdx < ParticleCountInBin)
- {
- uint SortKey = gs_SortKeys[SortIdx];
- // The low 18 bits of the key index into g_VisibleParticles
- uint GlobalIdx = SortKey & 0x3FFFF;
- // After this phase, all we care about is its global index
- g_SortedParticles[BinStart + SortIdx] = SortKey;
- // Bounds packs four bytes: min tile x, min tile y, max tile x, max tile y (low to high)
- uint Bounds = g_VisibleParticles[GlobalIdx].Bounds;
- int2 MinTile = uint2(Bounds >> 0, Bounds >> 8) & 0xFF;
- int2 MaxTile = uint2(Bounds >> 16, Bounds >> 24) & 0xFF;
- // Make the tile range relative to this bin and clamp it to the bin's extent
- MinTile = max(MinTile - StartTile, 0);
- MaxTile = min(MaxTile - StartTile, int2(TILES_PER_BIN_X, TILES_PER_BIN_Y) - 1);
- for (int y = MinTile.y; y <= MaxTile.y; y++)
- {
- for (int x = MinTile.x; x <= MaxTile.x; x++)
- {
- uint TileIndex = y * TILES_PER_BIN_X + x;
- uint TileMaxZ = gs_MinMaxDepth[TileIndex];
- // NOTE(review): both comparisons presumably test the depth stored in the key's high
- // bits against the tile's packed depth bounds - confirm against the key packing.
- uint Inside = SortKey < TileMaxZ ? 1 : 0;
- uint SlowPath = SortKey > (TileMaxZ << 16) ? Inside : 0;
- InterlockedAdd(gs_SlowTileParticleCounts[TileIndex], SlowPath);
- // Set this thread's bit in the tile's mask when the particle is inside
- InterlockedOr(gs_IntersectionMasks[TileIndex * MASK_WORDS_PER_ITER + WordOffset], Inside << BitOffset);
- }
- }
- }
- GroupMemoryBarrierWithGroupSync();
- // This path is only taken when TILES_PER_BIN < 64 (GROUP_THREAD_COUNT is then fixed at 64).
- #if TILES_PER_BIN < GROUP_THREAD_COUNT
- // Copy the hit masks from LDS to the output buffer. Here, each thread copies a single word
- if (GI < TILES_PER_BIN * MASK_WORDS_PER_ITER)
- {
- // GI % TILES_PER_BIN selects the tile, GI / TILES_PER_BIN selects the word within its mask
- uint TileIndex = GI % TILES_PER_BIN;
- uint Offset = TileIndex * MASK_WORDS_PER_ITER + (GI / TILES_PER_BIN);
- uint Mask = gs_IntersectionMasks[Offset];
- InterlockedAdd(gs_TileParticleCounts[TileIndex], countbits(Mask));
- g_TileHitMasks.Store(OutOffsetInBytes, Mask);
- // Advance past the 8 bytes of mask written per tile in this iteration
- OutOffsetInBytes += 8;
- }
- #else
- // Copy the hit masks from LDS to the output buffer. Here, each thread is assigned a tile.
- uint Offset = GI * MASK_WORDS_PER_ITER;
- // Two mask words are counted and stored per step
- [unroll]
- for (uint O = 0; O < MASK_WORDS_PER_ITER; O += 2)
- {
- uint Mask0 = gs_IntersectionMasks[Offset+O];
- uint Mask1 = gs_IntersectionMasks[Offset+O+1];
- InterlockedAdd(gs_TileParticleCounts[GI], countbits(Mask0) + countbits(Mask1));
- g_TileHitMasks.Store2( OutOffsetInBytes, uint2(Mask0, Mask1) );
- OutOffsetInBytes += 8;
- }
- #endif
- // Masks must be fully copied out before the next iteration clears them
- GroupMemoryBarrierWithGroupSync();
- }
- // Threads that do not own a tile have nothing left to do
- if (GI >= TILES_PER_BIN)
- return;
- uint ParticleCountInThisThreadsTile = gs_TileParticleCounts[GI];
- if (ParticleCountInThisThreadsTile > 0)
- {
- uint SlowParticlesInThisThreadsTile = gs_SlowTileParticleCounts[GI];
- // Packet layout: tile x shifted to bit 16, tile y shifted to bit 24, particle count in the low bits
- uint Packet = TileCoord.x << 16 | TileCoord.y << 24 | ParticleCountInThisThreadsTile;
- uint NewPacketIndex;
- // Tiles with any slow-path particle go to g_DrawPackets (counter at byte offset 0);
- // all other tiles go to g_FastDrawPackets (counter at byte offset 12).
- if (SlowParticlesInThisThreadsTile > 0)
- {
- g_DrawPacketCount.InterlockedAdd(0, 1, NewPacketIndex);
- g_DrawPackets[NewPacketIndex] = Packet;
- }
- else
- {
- g_DrawPacketCount.InterlockedAdd(12, 1, NewPacketIndex);
- g_FastDrawPackets[NewPacketIndex] = Packet;
- }
- }
- }
|