VrsSriGenerationCompute.ankiprog 5.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192
  1. // Copyright (C) 2009-present, Panagiotis Christopoulos Charitos and contributors.
  2. // All rights reserved.
  3. // Code licensed under the BSD License.
  4. // http://www.anki3d.org/LICENSE
  5. #pragma anki mutator SRI_TEXEL_DIMENSION 8 16
  6. #pragma anki mutator SHARED_MEMORY 0 1
  7. #pragma anki mutator LIMIT_RATE_TO_2X2 0 1
  8. #pragma anki technique comp
  9. #include <AnKi/Shaders/Functions.hlsl>
  10. #include <AnKi/Shaders/TonemappingFunctions.hlsl>
  11. // Find the maximum luma derivative in x and y, relative to the average luma of the block.
  12. // Each thread handles a 2x2 region when using 8x8 VRS tiles and a 2x4 region when using 16x16 VRS tiles.
  13. Texture2D<Vec4> g_inputTex : register(t0);
  14. SamplerState g_nearestClampSampler : register(s0);
  15. #if SRI_TEXEL_DIMENSION == 8
  16. # define REGION_SIZE_X 2
  17. # define REGION_SIZE_Y 2
  18. #else
  19. # define REGION_SIZE_X 2
  20. # define REGION_SIZE_Y 4
  21. #endif
  22. #define THREADGROUP_SIZE_X (SRI_TEXEL_DIMENSION / REGION_SIZE_X)
  23. #define THREADGROUP_SIZE_Y (SRI_TEXEL_DIMENSION / REGION_SIZE_Y)
  24. RWTexture2D<U32> g_sriStorageTex : register(u0);
  25. struct Constants
  26. {
  27. Vec2 m_oneOverViewportSize;
  28. F32 m_threshold;
  29. F32 m_padding0;
  30. };
  31. ANKI_FAST_CONSTANTS(Constants, g_consts)
  32. #if SHARED_MEMORY
  33. // Ideally, we'd be able to calculate the min/max/average using subgroup operations, but there's no guarantee
  34. // subgroupSize is large enough so we need shared memory as a fallback. We need gl_NumSubgroups entries, but it is not a
  35. // constant, so estimate it assuming a subgroupSize of at least 8.
  36. constexpr U32 kSharedMemoryEntries = THREADGROUP_SIZE_X * THREADGROUP_SIZE_Y / 8u;
  37. groupshared F32 s_averageLuma[kSharedMemoryEntries];
  38. groupshared Vec2 s_maxDerivative[kSharedMemoryEntries];
  39. groupshared U32 s_waveIndexInsideThreadGroup;
  40. #endif
  41. F32 computeLuma(Vec3 color)
  42. {
  43. const F32 l = computeLuminance(color);
  44. return l / (1.0f + l);
  45. }
  46. #define sampleLuma(offsetX, offsetY) computeLuma(g_inputTex.SampleLevel(g_nearestClampSampler, uv, 0.0, IVec2(offsetX, offsetY)).xyz)
  47. [numthreads(THREADGROUP_SIZE_X, THREADGROUP_SIZE_Y, 1)] void main(UVec3 svDispatchThreadId : SV_DISPATCHTHREADID, U32 svGroupIndex : SV_GROUPINDEX,
  48. UVec3 svGroupID : SV_GROUPID)
  49. {
  50. #if SHARED_MEMORY
  51. U32 wavesPerThreadGroup;
  52. U32 waveIndexInsideThreadGroup;
  53. ANKI_COMPUTE_WAVE_INDEX_INSIDE_THREADGROUP(svGroupIndex, s_waveIndexInsideThreadGroup, waveIndexInsideThreadGroup, wavesPerThreadGroup);
  54. #endif
  55. const Vec2 uv = (Vec2(svDispatchThreadId.xy) * Vec2(REGION_SIZE_X, REGION_SIZE_Y) + 0.5) * g_consts.m_oneOverViewportSize;
  56. #if SRI_TEXEL_DIMENSION == 8
  57. // Get luminance.
  58. // l1.y
  59. // l0.z l0.w l1.x
  60. // l0.x l0.y
  61. Vec4 l0;
  62. l0.x = sampleLuma(0, 0);
  63. l0.y = sampleLuma(1, 0);
  64. l0.z = sampleLuma(0, 1);
  65. l0.w = sampleLuma(1, 1);
  66. Vec2 l1;
  67. l1.x = sampleLuma(2, 1);
  68. l1.y = sampleLuma(1, 2);
  69. // Calculate derivatives.
  70. Vec2 a = Vec2(l0.y, l1.x);
  71. Vec2 b = Vec2(l0.x, l0.w);
  72. const Vec2 dx = abs(a - b);
  73. a = Vec2(l0.z, l1.y);
  74. b = Vec2(l0.x, l0.w);
  75. const Vec2 dy = abs(a - b);
  76. F32 maxDerivativeX = max(dx.x, dx.y);
  77. F32 maxDerivativeY = max(dy.x, dy.y);
  78. // Calculate average luma.
  79. F32 averageLuma = (l0.x + l0.y + l0.z + l0.w) / 4.0;
  80. #else
  81. // Get luminance.
  82. // l2.z
  83. // l2.y l1.z l1.w
  84. // l1.x l1.y
  85. // l0.z l0.w l2.x
  86. // l0.x l0.y
  87. Vec4 l0;
  88. l0.x = sampleLuma(0, 0);
  89. l0.y = sampleLuma(1, 0);
  90. l0.z = sampleLuma(0, 1);
  91. l0.w = sampleLuma(1, 1);
  92. Vec4 l1;
  93. l1.x = sampleLuma(0, 2);
  94. l1.y = sampleLuma(1, 2);
  95. l1.z = sampleLuma(0, 3);
  96. l1.w = sampleLuma(1, 3);
  97. Vec3 l2;
  98. l2.x = sampleLuma(2, 1);
  99. l2.y = sampleLuma(-1, 3);
  100. l2.z = sampleLuma(1, 4);
  101. // Calculate derivatives.
  102. Vec4 a = Vec4(l0.y, l2.x, l1.y, l2.y);
  103. Vec4 b = Vec4(l0.x, l0.w, l1.x, l1.z);
  104. const Vec4 dx = abs(a - b);
  105. a = Vec4(l0.z, l0.w, l1.z, l2.z);
  106. b = Vec4(l0.x, l0.y, l1.x, l1.w);
  107. const Vec4 dy = abs(a - b);
  108. F32 maxDerivativeX = max(max(dx.x, dx.y), max(dx.z, dx.w));
  109. F32 maxDerivativeY = max(max(dy.x, dy.y), max(dy.z, dy.w));
  110. // Calculate average luma.
  111. const Vec4 sumL0L1 = l0 + l1;
  112. F32 averageLuma = (sumL0L1.x + sumL0L1.y + sumL0L1.z + sumL0L1.w) / 8.0;
  113. #endif
  114. // Share values in subgroup.
  115. maxDerivativeX = WaveActiveMax(maxDerivativeX);
  116. maxDerivativeY = WaveActiveMax(maxDerivativeY);
  117. averageLuma = WaveActiveSum(averageLuma);
  118. #if SHARED_MEMORY
  119. // Store results in shared memory.
  120. [branch] if(WaveIsFirstLane())
  121. {
  122. s_averageLuma[waveIndexInsideThreadGroup] = averageLuma;
  123. s_maxDerivative[waveIndexInsideThreadGroup] = Vec2(maxDerivativeX, maxDerivativeY);
  124. }
  125. GroupMemoryBarrierWithGroupSync();
  126. #endif
  127. // Write the result
  128. [branch] if(svGroupIndex == 0u)
  129. {
  130. // Get max across all subgroups.
  131. #if SHARED_MEMORY
  132. averageLuma = s_averageLuma[0];
  133. Vec2 maxDerivative = s_maxDerivative[0];
  134. for(U32 i = 1u; i < wavesPerThreadGroup; ++i)
  135. {
  136. averageLuma += s_averageLuma[i];
  137. maxDerivative = max(maxDerivative, s_maxDerivative[i]);
  138. }
  139. #else
  140. const Vec2 maxDerivative = Vec2(maxDerivativeX, maxDerivativeY);
  141. #endif
  142. // Determine shading rate.
  143. const F32 avgLuma = averageLuma / F32(THREADGROUP_SIZE_X * THREADGROUP_SIZE_Y);
  144. const Vec2 lumaDiff = maxDerivative / avgLuma;
  145. const F32 threshold1 = g_consts.m_threshold;
  146. const F32 threshold2 = threshold1 * 0.4;
  147. UVec2 rate;
  148. rate.x = (lumaDiff.x > threshold1) ? 1u : ((lumaDiff.x > threshold2) ? 2u : 4u);
  149. rate.y = (lumaDiff.y > threshold1) ? 1u : ((lumaDiff.y > threshold2) ? 2u : 4u);
  150. #if LIMIT_RATE_TO_2X2
  151. rate = min(rate, UVec2(2, 2));
  152. #endif
  153. const UVec2 outTexelCoord = svGroupID.xy;
  154. g_sriStorageTex[outTexelCoord] = encodeVrsRate(rate);
  155. }
  156. }