// GenerateMipsCS.hlsli
  1. //
  2. // Copyright (c) Microsoft. All rights reserved.
  3. // This code is licensed under the MIT License (MIT).
  4. // THIS CODE IS PROVIDED *AS IS* WITHOUT WARRANTY OF
  5. // ANY KIND, EITHER EXPRESS OR IMPLIED, INCLUDING ANY
  6. // IMPLIED WARRANTIES OF FITNESS FOR A PARTICULAR
  7. // PURPOSE, MERCHANTABILITY, OR NON-INFRINGEMENT.
  8. //
  9. // Developed by Minigraph
  10. //
  11. // Author: James Stanard
  12. //
  // Root signature used by main():
  //   - 4 x 32-bit root constants at b0 (the CB constant buffer)
  //   - one descriptor table holding a single SRV at t0 (the source mip)
  //   - one descriptor table holding four UAVs starting at u0 (the output mips)
  //   - a static sampler at s0 with clamp addressing and linear min/mag/mip filtering
  #define RootSig \
      "RootFlags(0), " \
      "RootConstants(b0, num32BitConstants = 4), " \
      "DescriptorTable(SRV(t0, numDescriptors = 1))," \
      "DescriptorTable(UAV(u0, numDescriptors = 4))," \
      "StaticSampler(s0," \
          "addressU = TEXTURE_ADDRESS_CLAMP," \
          "addressV = TEXTURE_ADDRESS_CLAMP," \
          "addressW = TEXTURE_ADDRESS_CLAMP," \
          "filter = FILTER_MIN_MAG_MIP_LINEAR)"
  // Selects the sampling pattern main() uses to produce the first output mip:
  //   0 = one bilinear tap
  //   1 = two taps spread along X
  //   2 = two taps spread along Y
  //   3 = four taps spread along X and Y
  // Falls back to 0 when the including file does not define it.
  #if !defined(NON_POWER_OF_TWO)
  #define NON_POWER_OF_TWO 0
  #endif
  // Input: the texture being downsampled and the sampler used to read it.
  Texture2D<float4>   SrcMip        : register(t0);
  SamplerState        BilinearClamp : register(s0);

  // Outputs: up to four successively smaller mip levels, written by main().
  RWTexture2D<float4> OutMip1       : register(u0);
  RWTexture2D<float4> OutMip2       : register(u1);
  RWTexture2D<float4> OutMip3       : register(u2);
  RWTexture2D<float4> OutMip4       : register(u3);
  // Per-dispatch parameters, bound at b0.
  cbuffer CB : register(b0)
  {
      uint   SrcMipLevel;   // Mip level of SrcMip that is sampled
      uint   NumMipLevels;  // How many OutMips to write: [1, 4]
      float2 TexelSize;     // 1.0 / OutMip1.Dimensions
  }
  // Group-shared scratch space, one slot per thread of the 8x8 group.
  // Each color channel gets its own array rather than sharing a single
  // float4 array: the larger stride of a float4 layout would make more
  // threads land on the same memory bank, and splitting the channels
  // reduces those bank conflicts in the local data memory controller.
  groupshared float gs_R[64];
  groupshared float gs_G[64];
  groupshared float gs_B[64];
  groupshared float gs_A[64];
  // Writes one color into the group-shared channel arrays at slot Index.
  void StoreColor( uint Index, float4 Color )
  {
      gs_R[Index] = Color.x;
      gs_G[Index] = Color.y;
      gs_B[Index] = Color.z;
      gs_A[Index] = Color.w;
  }
  // Reassembles the color stored at slot Index of the group-shared channel arrays.
  float4 LoadColor( uint Index )
  {
      float4 Color;
      Color.x = gs_R[Index];
      Color.y = gs_G[Index];
      Color.z = gs_B[Index];
      Color.w = gs_A[Index];
      return Color;
  }
  // Applies an inexpensive approximation of the sRGB transfer curve to each
  // channel of x and returns the encoded color.
  float3 LinearToSRGB(float3 x)
  {
      // This is exactly the sRGB curve
      //return x < 0.0031308 ? 12.92 * x : 1.055 * pow(abs(x), 1.0 / 2.4) - 0.055;

      // This is cheaper but nearly equivalent
      const float3 LinearPart = 12.92 * x;
      const float3 CurvePart = 1.13005 * sqrt(abs(x - 0.00228)) - 0.13448 * x + 0.005719;

      // Choose per channel with scalar conditions. A ternary with a vector
      // condition is not accepted by every HLSL language version, while the
      // scalar form is; the selected values are the same either way.
      return float3(
          x.r < 0.0031308 ? LinearPart.r : CurvePart.r,
          x.g < 0.0031308 ? LinearPart.g : CurvePart.g,
          x.b < 0.0031308 ? LinearPart.b : CurvePart.b);
  }
  // Prepares a linear color for writing to an output mip. When
  // CONVERT_TO_SRGB is defined the RGB channels go through LinearToSRGB();
  // alpha is always passed through unchanged.
  float4 PackColor(float4 Linear)
  {
      float4 Packed = Linear;
  #if defined(CONVERT_TO_SRGB)
      Packed.rgb = LinearToSRGB(Linear.rgb);
  #endif
      return Packed;
  }
  // Entry point: each 8x8 thread group writes an 8x8 tile of OutMip1 and,
  // depending on NumMipLevels, reduces that tile through group-shared memory
  // into a 4x4 tile of OutMip2, a 2x2 tile of OutMip3 and one texel of OutMip4.
  //   GI   - flattened index of the thread inside its group (0..63)
  //   DTid - global thread id; DTid.xy is the OutMip1 texel this thread writes
  [RootSignature(RootSig)]
  [numthreads( 8, 8, 1 )]
  void main( uint GI : SV_GroupIndex, uint3 DTid : SV_DispatchThreadID )
  {
      const uint2 Pixel = DTid.xy;

      // A single bilinear tap is not enough once the downscale ratio exceeds
      // 2x, which happens whenever a source dimension is odd: the source is
      // then slightly undersampled. That is why generating mips only for
      // power-of-two textures is strongly preferred. The extra variants below
      // cover the undersampling cases at the cost of more texture samples and
      // a slower, more complicated shader.
  #if NON_POWER_OF_TWO == 0
      float2 Center = TexelSize * (Pixel + 0.5);
      float4 Mip = SrcMip.SampleLevel(BilinearClamp, Center, SrcMipLevel);
  #elif NON_POWER_OF_TWO == 1
      // > 2:1 in X dimension
      // Two bilinear taps side by side so that nothing is skipped when the
      // horizontal reduction is larger than 2x.
      float2 FirstTap = TexelSize * (Pixel + float2(0.25, 0.5));
      float2 TapStep = TexelSize * float2(0.5, 0.0);
      float4 Mip = 0.5 * (SrcMip.SampleLevel(BilinearClamp, FirstTap, SrcMipLevel) +
                          SrcMip.SampleLevel(BilinearClamp, FirstTap + TapStep, SrcMipLevel));
  #elif NON_POWER_OF_TWO == 2
      // > 2:1 in Y dimension
      // Two bilinear taps stacked so that nothing is skipped when the
      // vertical reduction is larger than 2x.
      float2 FirstTap = TexelSize * (Pixel + float2(0.5, 0.25));
      float2 TapStep = TexelSize * float2(0.0, 0.5);
      float4 Mip = 0.5 * (SrcMip.SampleLevel(BilinearClamp, FirstTap, SrcMipLevel) +
                          SrcMip.SampleLevel(BilinearClamp, FirstTap + TapStep, SrcMipLevel));
  #elif NON_POWER_OF_TWO == 3
      // > 2:1 in both dimensions
      // Four bilinear taps so that nothing is skipped when the reduction is
      // larger than 2x in both directions.
      float2 FirstTap = TexelSize * (Pixel + float2(0.25, 0.25));
      float2 HalfTexel = TexelSize * 0.5;
      float4 Mip = SrcMip.SampleLevel(BilinearClamp, FirstTap, SrcMipLevel);
      Mip += SrcMip.SampleLevel(BilinearClamp, FirstTap + float2(HalfTexel.x, 0.0), SrcMipLevel);
      Mip += SrcMip.SampleLevel(BilinearClamp, FirstTap + float2(0.0, HalfTexel.y), SrcMipLevel);
      Mip += SrcMip.SampleLevel(BilinearClamp, FirstTap + float2(HalfTexel.x, HalfTexel.y), SrcMipLevel);
      Mip *= 0.25;
  #endif

      OutMip1[Pixel] = PackColor(Mip);

      // NumMipLevels comes from the constant buffer, so this branch is scalar
      // and every thread leaves together.
      if (NumMipLevels == 1)
          return;

      // Lacking lane swizzle operations, LDS is the only way to hand data to
      // the other threads of the group.
      StoreColor(GI, Mip);

      // After this barrier every LDS write issued so far is complete, and
      // every thread has run all the instructions above (and has therefore
      // issued its own LDS write).
      GroupMemoryBarrierWithGroupSync();

      // GI holds X in its low three bits and Y in its high three bits, so
      // the mask 001001b picks the threads whose X and Y are both even.
      if ((GI & 0x9) == 0)
      {
          float4 NextX = LoadColor(GI + 0x01);
          float4 NextY = LoadColor(GI + 0x08);
          float4 NextXY = LoadColor(GI + 0x09);
          Mip = 0.25 * (Mip + NextX + NextY + NextXY);

          OutMip2[Pixel / 2] = PackColor(Mip);
          StoreColor(GI, Mip);
      }

      if (NumMipLevels == 2)
          return;

      GroupMemoryBarrierWithGroupSync();

      // The mask 011011b picks the threads whose X and Y are both multiples
      // of four.
      if ((GI & 0x1B) == 0)
      {
          float4 NextX = LoadColor(GI + 0x02);
          float4 NextY = LoadColor(GI + 0x10);
          float4 NextXY = LoadColor(GI + 0x12);
          Mip = 0.25 * (Mip + NextX + NextY + NextXY);

          OutMip3[Pixel / 4] = PackColor(Mip);
          StoreColor(GI, Mip);
      }

      if (NumMipLevels == 3)
          return;

      GroupMemoryBarrierWithGroupSync();

      // The matching mask here would be 111111b (X and Y multiples of
      // eight), and thread 0 is the only one that satisfies it.
      if (GI == 0)
      {
          float4 NextX = LoadColor(GI + 0x04);
          float4 NextY = LoadColor(GI + 0x20);
          float4 NextXY = LoadColor(GI + 0x24);
          Mip = 0.25 * (Mip + NextX + NextY + NextXY);

          OutMip4[Pixel / 8] = PackColor(Mip);
      }
  }