ParticleDepthBoundsCS.hlsl 3.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119
  1. // RUN: %dxc -E main -T cs_6_0 %s | FileCheck %s
  2. // CHECK: groupId
  3. // CHECK: flattenedThreadIdInGroup
  4. // CHECK: threadId
  5. // CHECK: textureGather
  6. // CHECK: textureGather
  7. // CHECK: textureGather
  8. // CHECK: textureGather
  9. // CHECK: FMax
  10. // CHECK: FMin
  11. // CHECK: barrier
  12. // CHECK: UMax
  13. // CHECK: atomicrmw umax
  14. // CHECK: Saturate
  15. // CHECK: textureStore
  16. //
  17. // Copyright (c) Microsoft. All rights reserved.
  18. // This code is licensed under the MIT License (MIT).
  19. // THIS CODE IS PROVIDED *AS IS* WITHOUT WARRANTY OF
  20. // ANY KIND, EITHER EXPRESS OR IMPLIED, INCLUDING ANY
  21. // IMPLIED WARRANTIES OF FITNESS FOR A PARTICULAR
  22. // PURPOSE, MERCHANTABILITY, OR NON-INFRINGEMENT.
  23. //
  24. // Developed by Minigraph
  25. //
  26. // Author(s): James Stanard
  27. // Alex Nankervis
  28. //
  29. #include "ParticleUtility.hlsli"
  // Source depth texture (one float channel), bound at t0. main reads it with Gather.
  30. Texture2D<float> g_Input : register(t0);
  // Packed min/max results written by main, bound at u0..u2.
  // g_Output8 receives a 4x4 grid of entries per thread group (index Gid.xy * 4 + SubTile).
  31. RWTexture2D<uint> g_Output8 : register(u0);
  // g_Output16 receives a 2x2 grid of entries per thread group (index Gid.xy * 2 + SubTile).
  32. RWTexture2D<uint> g_Output16 : register(u1);
  // g_Output32 receives a single entry per thread group (index Gid.xy).
  33. RWTexture2D<uint> g_Output32 : register(u2);
  // Group-shared reduction scratch. main stores each thread's max depth bits in
  // entries [0, 64) and the bitwise complement of its min depth bits in [64, 128).
  34. groupshared uint gs_Buffer[128];
  // One 4:1 step of the group-shared max reduction.
  //
  // Replaces gs_Buffer[This] with the unsigned maximum of itself and the three
  // entries at offsets Dx, 8 * Dx and 9 * Dx. main lays the buffer out as rows
  // of 8 entries, so these are the right, lower and lower-right neighbours at
  // a spacing of Dx.
  //
  // This contains group-wide barriers, so every thread in the group must call it.
  //
  // NOTE(review): nothing here bounds the neighbour indices. With This = 126 and
  // Dx = 4 the last read is at index 162 of a 128-entry array. main never
  // consumes the entries produced by those lanes, but confirm that out-of-range
  // group-shared reads are acceptable on the target.
  void Max4( uint This, uint Dx )
  {
      // Read all neighbours first; nobody may overwrite an entry until every
      // thread in the group has finished reading.
      const uint RightBits    = gs_Buffer[This + Dx];
      const uint BelowBits    = gs_Buffer[This + 8 * Dx];
      const uint DiagonalBits = gs_Buffer[This + 9 * Dx];
      GroupMemoryBarrierWithGroupSync();

      const uint NeighbourMax = max(RightBits, max(BelowBits, DiagonalBits));
      InterlockedMax(gs_Buffer[This], NeighbourMax);

      // Make the merged values visible before the caller's next step.
      GroupMemoryBarrierWithGroupSync();
  }
  // Packs the reduced depth bounds for entry This into one 32-bit value.
  //
  // The max depth is read from gs_Buffer[This]. The min depth is read from
  // gs_Buffer[This + 64], where it is stored bit-complemented, so it is
  // complemented again before use.
  //
  // Returns: half-precision max in the upper 16 bits, and in the lower 16 bits
  // the half-precision min after subtracting 0.001 and clamping to [0, 1].
  uint PackMinMax( uint This )
  {
      const float DepthMax = asfloat(gs_Buffer[This]);
      const float DepthMin = asfloat(~gs_Buffer[This + 64]);

      const uint UpperHalf = f32tof16(DepthMax) << 16;
      const uint LowerHalf = f32tof16(saturate(DepthMin - 0.001));
      return UpperHalf | LowerHalf;
  }
  // Computes min/max depth bounds at three tile sizes for one 8x8 thread group.
  //
  // Each thread reduces a 4x4 block of depth texels. The group then reduces
  // those per-thread results 4:1 three times, writing packed bounds to
  // g_Output8, g_Output16 and g_Output32 after the respective steps.
  [RootSignature(Particle_RootSig)]
  [numthreads( 8, 8, 1 )]
  void main( uint3 Gid : SV_GroupID, uint GI : SV_GroupIndex, uint3 DTid : SV_DispatchThreadID )
  {
      // Four gathers, offset by two texels in x and/or y, fetch this thread's
      // 4x4 block of depth values.
      // (gRcpBufferDim and gSampPointClamp are not declared in this file;
      // presumably they come from ParticleUtility.hlsli.)
      const float2 GatherUV00 = (DTid.xy * 4 + 1) * gRcpBufferDim;
      const float2 GatherUV10 = GatherUV00 + float2(2, 0) * gRcpBufferDim;
      const float2 GatherUV01 = GatherUV00 + float2(0, 2) * gRcpBufferDim;
      const float2 GatherUV11 = GatherUV00 + float2(2, 2) * gRcpBufferDim;

      const float4 Depth00 = g_Input.Gather(gSampPointClamp, GatherUV00);
      const float4 Depth10 = g_Input.Gather(gSampPointClamp, GatherUV10);
      const float4 Depth01 = g_Input.Gather(gSampPointClamp, GatherUV01);
      const float4 Depth11 = g_Input.Gather(gSampPointClamp, GatherUV11);

      // Component-wise across the four gathers, then across the components.
      const float4 QuadMax = max(max(Depth00, Depth10), max(Depth01, Depth11));
      const float4 QuadMin = min(min(Depth00, Depth10), min(Depth01, Depth11));
      const float ThreadMaxZ = max(max(QuadMax.x, QuadMax.y), max(QuadMax.z, QuadMax.w));
      const float ThreadMinZ = min(min(QuadMin.x, QuadMin.y), min(QuadMin.z, QuadMin.w));

      // The reduction shrinks the data 4:1 per step, which keeps LDS traffic low
      // but would leave most threads idle if each survivor computed both a min
      // and a max. Instead every thread runs the same Max4() step: the first 64
      // entries hold max depth bits, the last 64 hold the bitwise complement of
      // the min depth bits. An unsigned max over complemented values selects the
      // smallest original, i.e. min(a, b) = ~max(~a, ~b).
      // (Comparing float bit patterns as uints presumably relies on depth being
      // non-negative; confirm against the depth format.)
      gs_Buffer[GI] = asuint(ThreadMaxZ);
      gs_Buffer[GI + 64] = ~asuint(ThreadMinZ);
      GroupMemoryBarrierWithGroupSync();

      // Each thread owns an even-numbered entry; odd entries are only ever read
      // as neighbours.
      const uint Slot = GI * 2;

      // Step 1: 2x2 thread results -> bounds for g_Output8.
      Max4(Slot, 1);

      // Zero under this mask means: X % 2 == 0, Y % 2 == 0, and Slot < 64
      // (an entry in the max half of the buffer).
      const uint Tile8Mask = 0x49;
      if ((Slot & Tile8Mask) == 0)
      {
          const uint2 Tile8 = uint2(Slot >> 1, Slot >> 4) & 3;
          g_Output8[Gid.xy * 4 + Tile8] = PackMinMax(Slot);
      }

      // Step 2: 4x4 thread results -> bounds for g_Output16.
      Max4(Slot, 2);

      // Zero under this mask means: X % 4 == 0, Y % 4 == 0, and Slot < 64.
      const uint Tile16Mask = 0x5B;
      if ((Slot & Tile16Mask) == 0)
      {
          const uint2 Tile16 = uint2(Slot >> 2, Slot >> 5) & 1;
          g_Output16[Gid.xy * 2 + Tile16] = PackMinMax(Slot);
      }

      // Step 3: the whole group -> a single entry in g_Output32.
      Max4(Slot, 4);

      if (Slot == 0)
      {
          g_Output32[Gid.xy] = PackMinMax(Slot);
      }
  }