FXAAPass1CS.hlsli 7.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191
  1. //
  2. // Copyright (c) Microsoft. All rights reserved.
  3. // This code is licensed under the MIT License (MIT).
  4. // THIS CODE IS PROVIDED *AS IS* WITHOUT WARRANTY OF
  5. // ANY KIND, EITHER EXPRESS OR IMPLIED, INCLUDING ANY
  6. // IMPLIED WARRANTIES OF FITNESS FOR A PARTICULAR
  7. // PURPOSE, MERCHANTABILITY, OR NON-INFRINGEMENT.
  8. //
  9. // Developed by Minigraph
  10. //
  11. // Author: James Stanard
  12. //
  13. // Description: A Compute-optimized implementation of FXAA 3.11 (PC Quality). The
  14. // improvements take advantage of work queues (RWStructuredBuffer with atomic counters)
  15. // for these benefits:
  16. //
  17. // 1) Split horizontal and vertical edge searches into separate dispatches to reduce
  18. // shader complexity and incoherent branching.
  19. // 2) Delay writing new pixel colors until after the source buffer has been fully
  20. // analyzed. This avoids the write-after-scattered-read hazard.
  21. // 3) Modify source buffer in-place rather than ping-ponging buffers, which reduces
  22. // bandwidth and memory demands.
  23. //
  24. // In addition to the above-mentioned benefits of using UAVs, the first pass also
  25. // takes advantage of groupshared memory for storing luma values, further reducing
  26. // fetches and bandwidth.
  27. //
  28. // Another optimization is in the generation of perceived brightness (luma) of pixels.
  29. // The original implementation used sRGB as a good approximation of log-luminance. A
  30. // more precise representation of log-luminance allows the algorithm to operate with a
  31. // higher threshold value while still finding perceivable edges across the full range
  32. // of brightness. The approximation used here is log2(1 + 15 * L) / 4, where L =
  33. // dot( LinearRGB, float3(0.212671, 0.715160, 0.072169) ). A threshold of 0.2 is
  34. // recommended with log-luminance computed this way.
  35. //
  36. // Original Boilerplate:
  37. //
  38. /*============================================================================
  39. NVIDIA FXAA 3.11 by TIMOTHY LOTTES
  40. ------------------------------------------------------------------------------
  41. COPYRIGHT (C) 2010, 2011 NVIDIA CORPORATION. ALL RIGHTS RESERVED.
  42. ------------------------------------------------------------------------------
  43. TO THE MAXIMUM EXTENT PERMITTED BY APPLICABLE LAW, THIS SOFTWARE IS PROVIDED
  44. *AS IS* AND NVIDIA AND ITS SUPPLIERS DISCLAIM ALL WARRANTIES, EITHER EXPRESS
  45. OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, IMPLIED WARRANTIES OF
  46. MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. IN NO EVENT SHALL NVIDIA
  47. OR ITS SUPPLIERS BE LIABLE FOR ANY SPECIAL, INCIDENTAL, INDIRECT, OR
  48. CONSEQUENTIAL DAMAGES WHATSOEVER (INCLUDING, WITHOUT LIMITATION, DAMAGES FOR
  49. LOSS OF BUSINESS PROFITS, BUSINESS INTERRUPTION, LOSS OF BUSINESS INFORMATION,
  50. OR ANY OTHER PECUNIARY LOSS) ARISING OUT OF THE USE OF OR INABILITY TO USE
  51. THIS SOFTWARE, EVEN IF NVIDIA HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH
  52. DAMAGES.
  53. */
  54. #include "FXAARootSignature.hlsli"
  55. cbuffer ConstantBuffer_x : register( b0 )
  56. {
  57. float2 RcpTextureSize;
  58. float ContrastThreshold; // default = 0.2, lower is more expensive
  59. float SubpixelRemoval; // default = 0.75, lower blurs less
  60. };
  61. RWStructuredBuffer<uint> HWork : register(u0);
  62. RWStructuredBuffer<uint> VWork : register(u2);
  63. RWBuffer<float3> HColor : register(u1);
  64. RWBuffer<float3> VColor : register(u3);
  65. Texture2D<float3> Color : register(t0);
  66. SamplerState LinearSampler : register(s0);
  67. #define BOUNDARY_SIZE 1
  68. #define ROW_WIDTH (8 + BOUNDARY_SIZE * 2)
  69. groupshared float gs_LumaCache[ROW_WIDTH * ROW_WIDTH];
  70. // If pre-computed, source luminance as a texture, otherwise write it out for Pass2
  71. #ifdef USE_LUMA_INPUT_BUFFER
  72. Texture2D<float> Luma : register(t1);
  73. #else
  74. RWTexture2D<float> Luma : register(u4);
  75. #endif
  76. //
  77. // Helper functions
  78. //
  79. float RGBToLogLuminance( float3 LinearRGB )
  80. {
  81. float Luma = dot( LinearRGB, float3(0.212671, 0.715160, 0.072169) );
  82. return log2(1 + Luma * 15) / 4;
  83. }
  84. [RootSignature(FXAA_RootSig)]
  85. [numthreads( 8, 8, 1 )]
  86. void main( uint3 Gid : SV_GroupID, uint GI : SV_GroupIndex, uint3 GTid : SV_GroupThreadID, uint3 DTid : SV_DispatchThreadID )
  87. {
  88. #ifdef USE_LUMA_INPUT_BUFFER
  89. // Load 4 lumas per thread into LDS (but only those needed to fill our pixel cache)
  90. if (max(GTid.x, GTid.y) < ROW_WIDTH / 2)
  91. {
  92. int2 ThreadUL = DTid.xy + GTid.xy - (BOUNDARY_SIZE - 1);
  93. float4 Luma4 = Luma.Gather(LinearSampler, ThreadUL * RcpTextureSize);
  94. uint LoadIndex = (GTid.x + GTid.y * ROW_WIDTH) * 2;
  95. gs_LumaCache[LoadIndex ] = Luma4.w;
  96. gs_LumaCache[LoadIndex + 1 ] = Luma4.z;
  97. gs_LumaCache[LoadIndex + ROW_WIDTH ] = Luma4.x;
  98. gs_LumaCache[LoadIndex + ROW_WIDTH + 1] = Luma4.y;
  99. }
  100. #else
  101. // Because we can't use Gather() on RGB, we make each thread read two pixels (but only those needed).
  102. if (GI < ROW_WIDTH * ROW_WIDTH / 2)
  103. {
  104. uint LdsCoord = GI;
  105. int2 UavCoord = uint2(GI % ROW_WIDTH, GI / ROW_WIDTH) + Gid.xy * 8 - BOUNDARY_SIZE;
  106. float Luma1 = RGBToLogLuminance( Color[UavCoord] );
  107. Luma[UavCoord] = Luma1;
  108. gs_LumaCache[LdsCoord] = Luma1;
  109. LdsCoord += ROW_WIDTH * ROW_WIDTH / 2;
  110. UavCoord += int2(0, ROW_WIDTH / 2);
  111. float Luma2 = RGBToLogLuminance( Color[UavCoord] );
  112. Luma[UavCoord] = Luma2;
  113. gs_LumaCache[LdsCoord] = Luma2;
  114. }
  115. #endif
  116. GroupMemoryBarrierWithGroupSync();
  117. uint CenterIdx = (GTid.x + BOUNDARY_SIZE) + (GTid.y + BOUNDARY_SIZE) * ROW_WIDTH;
  118. // Load the ordinal and center luminances
  119. float lumaN = gs_LumaCache[CenterIdx - ROW_WIDTH];
  120. float lumaW = gs_LumaCache[CenterIdx - 1];
  121. float lumaM = gs_LumaCache[CenterIdx];
  122. float lumaE = gs_LumaCache[CenterIdx + 1];
  123. float lumaS = gs_LumaCache[CenterIdx + ROW_WIDTH];
  124. // Contrast threshold test
  125. float rangeMax = max(max(lumaN, lumaW), max(lumaE, max(lumaS, lumaM)));
  126. float rangeMin = min(min(lumaN, lumaW), min(lumaE, min(lumaS, lumaM)));
  127. float range = rangeMax - rangeMin;
  128. if (range < ContrastThreshold)
  129. return;
  130. // Load the corner luminances
  131. float lumaNW = gs_LumaCache[CenterIdx - ROW_WIDTH - 1];
  132. float lumaNE = gs_LumaCache[CenterIdx - ROW_WIDTH + 1];
  133. float lumaSW = gs_LumaCache[CenterIdx + ROW_WIDTH - 1];
  134. float lumaSE = gs_LumaCache[CenterIdx + ROW_WIDTH + 1];
  135. // Pre-sum a few terms so the results can be reused
  136. float lumaNS = lumaN + lumaS;
  137. float lumaWE = lumaW + lumaE;
  138. float lumaNWSW = lumaNW + lumaSW;
  139. float lumaNESE = lumaNE + lumaSE;
  140. float lumaSWSE = lumaSW + lumaSE;
  141. float lumaNWNE = lumaNW + lumaNE;
  142. // Compute horizontal and vertical contrast; see which is bigger
  143. float edgeHorz = abs(lumaNWSW - 2.0 * lumaW) + abs(lumaNS - 2.0 * lumaM) * 2.0 + abs(lumaNESE - 2.0 * lumaE);
  144. float edgeVert = abs(lumaSWSE - 2.0 * lumaS) + abs(lumaWE - 2.0 * lumaM) * 2.0 + abs(lumaNWNE - 2.0 * lumaN);
  145. // Also compute local contrast in the 3x3 region. This can identify standalone pixels that alias.
  146. float avgNeighborLuma = ((lumaNS + lumaWE) * 2.0 + lumaNWSW + lumaNESE) / 12.0;
  147. float subpixelShift = saturate(pow(smoothstep(0, 1, abs(avgNeighborLuma - lumaM) / range), 2) * SubpixelRemoval * 2);
  148. float NegGrad = (edgeHorz >= edgeVert ? lumaN : lumaW) - lumaM;
  149. float PosGrad = (edgeHorz >= edgeVert ? lumaS : lumaE) - lumaM;
  150. uint GradientDir = abs(PosGrad) >= abs(NegGrad) ? 1 : 0;
  151. uint Subpix = uint(subpixelShift * 254.0) & 0xFE;
  152. uint PixelCoord = DTid.y << 20 | DTid.x << 8;
  153. // Packet header: [ 12 bits Y | 12 bits X | 7 bit Subpix | 1 bit dir(Grad) ]
  154. uint WorkHeader = PixelCoord | Subpix | GradientDir;
  155. if (edgeHorz >= edgeVert)
  156. {
  157. uint WorkIdx = HWork.IncrementCounter();
  158. HWork[WorkIdx] = WorkHeader;
  159. HColor[WorkIdx] = Color[DTid.xy + uint2(0, 2 * GradientDir - 1)];
  160. }
  161. else
  162. {
  163. uint WorkIdx = VWork.IncrementCounter();
  164. VWork[WorkIdx] = WorkHeader;
  165. VColor[WorkIdx] = Color[DTid.xy + uint2(2 * GradientDir - 1, 0)];
  166. }
  167. }