2
0

ReduceTo1DCS.hlsl 2.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687
  1. // RUN: %dxc -E main -T cs_6_0 %s | FileCheck %s
  2. // CHECK: groupId
  3. // CHECK: threadId
  4. // CHECK: flattenedThreadIdInGroup
  5. // CHECK: textureLoad
  6. // CHECK: dot4
  7. // CHECK: addrspace(3)
  8. // CHECK: barrier
  9. // CHECK: addrspace(3)
  10. // CHECK: barrier
  11. // CHECK: addrspace(3)
  12. // CHECK: barrier
  13. // CHECK: addrspace(3)
  14. // CHECK: barrier
  15. // CHECK: bufferStore
  16. //-----------------------------------------------------------------------------
  17. // File: ReduceTo1DCS.hlsl
  18. //
  19. // Desc: Reduce an input Texture2D to a buffer
  20. //
  21. // Copyright (c) Microsoft Corporation. All rights reserved.
  22. //-----------------------------------------------------------------------------
  // Source image that main() samples with Input.Load.
  Texture2D<float4> Input : register(t0);

  // Output buffer: main() stores one float per thread group at
  // index (Gid.y * g_param.x + Gid.x).
  RWStructuredBuffer<float> Result : register(u0);

  // Dimensions supplied by the host application.
  cbuffer cbCS : register(b0)
  {
      // .xy = x and y dimensions of the Dispatch call
      // .zw = size of the Input Texture2D above
      uint4 g_param;
  };
  // Optional mode switch. When defined, each thread in main() adds four texels
  // together instead of loading a single one.
  // Defining this or not must be the same as in HDRToneMappingCS11.cpp.
  // (The spelling is intentional: it has to match the #ifdef in main().)
  //#define CS_FULL_PIXEL_REDUCITON

  // Thread-group shape: blocksize x blocksizeY threads per group.
  #define blocksize    8
  #define blocksizeY   8
  #define groupthreads (blocksize * blocksizeY)

  // Per-group scratch array with one slot per thread, indexed by SV_GroupIndex.
  groupshared float accum[groupthreads];

  // Weights for the dot product that collapses a loaded float4 texel into a
  // single scalar; the w component is weighted by zero and so is ignored.
  static const float4 LUM_VECTOR = float4(0.299, 0.587, 0.114, 0.0);
  //-----------------------------------------------------------------------------
  // main: reduces the Input texture to one scalar per thread group.
  //
  // Every thread loads texel data from Input, collapses it to a scalar with
  // dot(s, LUM_VECTOR) and stores it in accum[GI].  The group then repeatedly
  // folds the upper half of accum onto the lower half until accum[0] holds the
  // group total, which thread 0 writes to Result[Gid.y * g_param.x + Gid.x].
  //
  // Gid  : SV_GroupID          - selects this group's slot in Result.
  // DTid : SV_DispatchThreadID - selects the texel(s) this thread reads.
  // GTid : SV_GroupThreadID    - unused; kept so the entry-point signature is unchanged.
  // GI   : SV_GroupIndex       - this thread's slot in accum.
  //
  // The halving reduction is only correct when groupthreads is a power of two.
  //-----------------------------------------------------------------------------
  [numthreads(blocksize,blocksizeY,1)]
  void main( uint3 Gid : SV_GroupID, uint3 DTid : SV_DispatchThreadID, uint3 GTid : SV_GroupThreadID, uint GI : SV_GroupIndex )
  {
      float4 s =
  #ifdef CS_FULL_PIXEL_REDUCITON
          // Sum four texels spaced one full dispatch extent apart in x and/or y
          // (blocksize * g_param.x threads wide, blocksizeY * g_param.y threads high).
          Input.Load( uint3(DTid.xy, 0) ) +
          Input.Load( uint3(DTid.xy + uint2(blocksize*g_param.x, 0), 0) ) +
          Input.Load( uint3(DTid.xy + uint2(0, blocksizeY*g_param.y), 0) ) +
          Input.Load( uint3(DTid.xy + uint2(blocksize*g_param.x, blocksizeY*g_param.y), 0) );
  #else
          // Load a single texel: the thread coordinate divided by 81 is scaled by
          // the texture size in g_param.zw.
          // NOTE(review): 81.0f is presumably the thread-grid extent chosen by the
          // host code - confirm against the caller before changing the dispatch size.
          Input.Load( uint3((float)DTid.x/81.0f*g_param.z, (float)DTid.y/81.0f*g_param.w, 0) );
  #endif

      accum[GI] = dot( s, LUM_VECTOR );

      // Parallel reduction.  The strides are derived from groupthreads rather than
      // hard-coded so that the loop stays in step with the size of accum; for the
      // 8x8 group this unrolls to the strides 32, 16, 8, 4, 2, 1.
      [unroll]
      for ( uint stride = groupthreads / 2; stride > 0; stride >>= 1 )
      {
          // Writes to accum from the initial store or the previous step must be
          // complete before any thread reads its partner's slot.
          GroupMemoryBarrierWithGroupSync();

          if ( GI < stride )
          {
              accum[GI] += accum[GI + stride];
          }
      }

      // Thread 0 performed the final addition itself, so no further barrier is
      // needed before it publishes the group total.
      if ( GI == 0 )
      {
          Result[Gid.y*g_param.x+Gid.x] = accum[0];
      }
  }