| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192 |
- // Copyright (C) 2009-present, Panagiotis Christopoulos Charitos and contributors.
- // All rights reserved.
- // Code licensed under the BSD License.
- // http://www.anki3d.org/LICENSE
- #pragma anki mutator SRI_TEXEL_DIMENSION 8 16
- #pragma anki mutator SHARED_MEMORY 0 1
- #pragma anki mutator LIMIT_RATE_TO_2X2 0 1
- #pragma anki technique comp
- #include <AnKi/Shaders/Functions.hlsl>
- #include <AnKi/Shaders/TonemappingFunctions.hlsl>
- // Find the maximum luma derivative in x and y, relative to the average luma of the block.
- // Each thread handles a 2x2 region when using 8x8 VRS tiles and a 2x4 region when using 16x16 VRS tiles.
- // Input color texture. Its luma is read through the sampleLuma() macro.
- Texture2D<Vec4> g_inputTex : register(t0);
- // Sampler used for every read of g_inputTex.
- SamplerState g_nearestClampSampler : register(s0);
- // Size, in input texels, of the region a single thread is responsible for.
- #if SRI_TEXEL_DIMENSION == 8
- # define REGION_SIZE_X 2
- # define REGION_SIZE_Y 2
- #else
- # define REGION_SIZE_X 2
- # define REGION_SIZE_Y 4
- #endif
- // Thread group dimensions, chosen so that one thread group covers SRI_TEXEL_DIMENSION x SRI_TEXEL_DIMENSION input texels.
- #define THREADGROUP_SIZE_X (SRI_TEXEL_DIMENSION / REGION_SIZE_X)
- #define THREADGROUP_SIZE_Y (SRI_TEXEL_DIMENSION / REGION_SIZE_Y)
- // Output shading rate image. main() writes one encoded rate per thread group, at the texel matching the group ID.
- RWTexture2D<U32> g_sriStorageTex : register(u0);
- // Shader constants, exposed to the shader as g_consts through ANKI_FAST_CONSTANTS.
- struct Constants
- {
- // Multiplied with texel coordinates in main() to produce the UV used when sampling g_inputTex.
- Vec2 m_oneOverViewportSize;
- // Relative luma difference above which a tile gets rate 1. main() uses 0.4 of this value as the cut-off for rate 2.
- F32 m_threshold;
- // Not read by the code in this file; presumably padding.
- F32 m_padding0;
- };
- ANKI_FAST_CONSTANTS(Constants, g_consts)
- #if SHARED_MEMORY
- // Ideally, we'd be able to calculate the max/average using wave operations alone, but there's no guarantee that a wave
- // is large enough to cover the whole thread group, so we need shared memory as a fallback. We need one entry per wave
- // of the thread group, but that count is not a compile-time constant, so estimate it assuming a wave size of at least 8.
- // NOTE(review): with a wave size below 8 there would be more waves than entries - confirm the minimum wave size.
- constexpr U32 kSharedMemoryEntries = THREADGROUP_SIZE_X * THREADGROUP_SIZE_Y / 8u;
- // Per-wave sum of the per-thread average luma, indexed by the wave's index inside the thread group.
- groupshared F32 s_averageLuma[kSharedMemoryEntries];
- // Per-wave maximum of the luma derivatives (x and y), indexed by the wave's index inside the thread group.
- groupshared Vec2 s_maxDerivative[kSharedMemoryEntries];
- // Scratch variable handed to ANKI_COMPUTE_WAVE_INDEX_INSIDE_THREADGROUP in main().
- groupshared U32 s_waveIndexInsideThreadGroup;
- #endif
- // Maps a color to a compressed luma value: the luminance L of the color is turned into L / (1 + L).
- F32 computeLuma(Vec3 color)
- {
-     const F32 luminance = computeLuminance(color);
-     const F32 denominator = 1.0f + luminance;
-     return luminance / denominator;
- }
- // Luma of mip 0 of g_inputTex, read at `uv` with an integer texel offset of (offsetX, offsetY).
- // Expects a variable named `uv` to be visible at the point of use.
- #define sampleLuma(offsetX, offsetY) computeLuma(g_inputTex.SampleLevel(g_nearestClampSampler, uv, 0.0, IVec2(offsetX, offsetY)).xyz)
- // Entry point. Every thread group produces one texel of the shading rate image:
- //  1. Each thread samples the luma of its REGION_SIZE_X x REGION_SIZE_Y region (plus a few neighbours) and computes
- //     the maximum absolute luma derivative in x and in y as well as the average luma of the region.
- //  2. The values are reduced across the wave and, when SHARED_MEMORY is enabled, across all the waves of the group.
- //  3. The thread with svGroupIndex 0 compares the derivatives, relative to the average luma, against the thresholds
- //     and writes the encoded rate to g_sriStorageTex at svGroupID.xy.
- [numthreads(THREADGROUP_SIZE_X, THREADGROUP_SIZE_Y, 1)] void main(UVec3 svDispatchThreadId : SV_DISPATCHTHREADID, U32 svGroupIndex : SV_GROUPINDEX,
-     UVec3 svGroupID : SV_GROUPID)
- {
- #if SHARED_MEMORY
-     U32 wavesPerThreadGroup;
-     U32 waveIndexInsideThreadGroup;
-     ANKI_COMPUTE_WAVE_INDEX_INSIDE_THREADGROUP(svGroupIndex, s_waveIndexInsideThreadGroup, waveIndexInsideThreadGroup, wavesPerThreadGroup);
- #endif
-     // UV of the first texel of this thread's region. The +0.5 targets the texel center.
-     const Vec2 uv = (Vec2(svDispatchThreadId.xy) * Vec2(REGION_SIZE_X, REGION_SIZE_Y) + 0.5) * g_consts.m_oneOverViewportSize;
- #if SRI_TEXEL_DIMENSION == 8
-     // Get luminance. Layout of the samples (x offset grows to the right, y offset grows upwards):
-     //      l1.y
-     // l0.z l0.w l1.x
-     // l0.x l0.y
-     Vec4 l0;
-     l0.x = sampleLuma(0, 0);
-     l0.y = sampleLuma(1, 0);
-     l0.z = sampleLuma(0, 1);
-     l0.w = sampleLuma(1, 1);
-     Vec2 l1;
-     l1.x = sampleLuma(2, 1);
-     l1.y = sampleLuma(1, 2);
-     // Calculate derivatives.
-     Vec2 a = Vec2(l0.y, l1.x);
-     Vec2 b = Vec2(l0.x, l0.w);
-     const Vec2 dx = abs(a - b);
-     a = Vec2(l0.z, l1.y);
-     b = Vec2(l0.x, l0.w);
-     const Vec2 dy = abs(a - b);
-     F32 maxDerivativeX = max(dx.x, dx.y);
-     F32 maxDerivativeY = max(dy.x, dy.y);
-     // Calculate average luma of the 2x2 region.
-     F32 averageLuma = (l0.x + l0.y + l0.z + l0.w) / 4.0;
- #else
-     // Get luminance. Layout of the samples (x offset grows to the right, y offset grows upwards):
-     //           l2.z
-     // l2.y l1.z l1.w
-     //      l1.x l1.y
-     //      l0.z l0.w l2.x
-     //      l0.x l0.y
-     Vec4 l0;
-     l0.x = sampleLuma(0, 0);
-     l0.y = sampleLuma(1, 0);
-     l0.z = sampleLuma(0, 1);
-     l0.w = sampleLuma(1, 1);
-     Vec4 l1;
-     l1.x = sampleLuma(0, 2);
-     l1.y = sampleLuma(1, 2);
-     l1.z = sampleLuma(0, 3);
-     l1.w = sampleLuma(1, 3);
-     Vec3 l2;
-     l2.x = sampleLuma(2, 1);
-     l2.y = sampleLuma(-1, 3);
-     l2.z = sampleLuma(1, 4);
-     // Calculate derivatives.
-     Vec4 a = Vec4(l0.y, l2.x, l1.y, l2.y);
-     Vec4 b = Vec4(l0.x, l0.w, l1.x, l1.z);
-     const Vec4 dx = abs(a - b);
-     a = Vec4(l0.z, l0.w, l1.z, l2.z);
-     b = Vec4(l0.x, l0.y, l1.x, l1.w);
-     const Vec4 dy = abs(a - b);
-     F32 maxDerivativeX = max(max(dx.x, dx.y), max(dx.z, dx.w));
-     F32 maxDerivativeY = max(max(dy.x, dy.y), max(dy.z, dy.w));
-     // Calculate average luma of the 2x4 region.
-     const Vec4 sumL0L1 = l0 + l1;
-     F32 averageLuma = (sumL0L1.x + sumL0L1.y + sumL0L1.z + sumL0L1.w) / 8.0;
- #endif
-     // Share values in the wave. averageLuma becomes a sum here; it is divided by the thread count further down.
-     maxDerivativeX = WaveActiveMax(maxDerivativeX);
-     maxDerivativeY = WaveActiveMax(maxDerivativeY);
-     averageLuma = WaveActiveSum(averageLuma);
- #if SHARED_MEMORY
-     // Store the per-wave results in shared memory, one slot per wave.
-     [branch] if(WaveIsFirstLane())
-     {
-         s_averageLuma[waveIndexInsideThreadGroup] = averageLuma;
-         s_maxDerivative[waveIndexInsideThreadGroup] = Vec2(maxDerivativeX, maxDerivativeY);
-     }
-     // Make the slots of every wave visible before the first thread reads them.
-     GroupMemoryBarrierWithGroupSync();
- #endif
-     // Write the result. Only one thread per group does that.
-     [branch] if(svGroupIndex == 0u)
-     {
-         // Get max and sum across all waves.
- #if SHARED_MEMORY
-         averageLuma = s_averageLuma[0];
-         Vec2 maxDerivative = s_maxDerivative[0];
-         for(U32 i = 1u; i < wavesPerThreadGroup; ++i)
-         {
-             averageLuma += s_averageLuma[i];
-             maxDerivative = max(maxDerivative, s_maxDerivative[i]);
-         }
- #else
-         // Without shared memory the wave-wide values are used as they are, which is only complete when a single wave
-         // covers the whole thread group.
-         const Vec2 maxDerivative = Vec2(maxDerivativeX, maxDerivativeY);
- #endif
-         // Determine shading rate.
-         const F32 avgLuma = averageLuma / F32(THREADGROUP_SIZE_X * THREADGROUP_SIZE_Y);
-         // A completely black tile has a zero average. Clamp the denominator so the relative difference never turns into
-         // NaN/Inf: a flat black tile then gives 0 (coarsest rate) and a non-zero derivative still gives a large value.
-         const F32 minAvgLuma = 0.000001f;
-         const Vec2 lumaDiff = maxDerivative / max(avgLuma, minAvgLuma);
-         const F32 threshold1 = g_consts.m_threshold;
-         const F32 threshold2 = threshold1 * 0.4;
-         // Larger relative differences get a finer rate (1), smaller ones a coarser rate (2 or 4), per axis.
-         UVec2 rate;
-         rate.x = (lumaDiff.x > threshold1) ? 1u : ((lumaDiff.x > threshold2) ? 2u : 4u);
-         rate.y = (lumaDiff.y > threshold1) ? 1u : ((lumaDiff.y > threshold2) ? 2u : 4u);
- #if LIMIT_RATE_TO_2X2
-         rate = min(rate, UVec2(2, 2));
- #endif
-         // One output texel per thread group.
-         const UVec2 outTexelCoord = svGroupID.xy;
-         g_sriStorageTex[outTexelCoord] = encodeVrsRate(rate);
-     }
- }
|