// Copyright (C) 2009-present, Panagiotis Christopoulos Charitos and contributors.
// All rights reserved.
// Code licensed under the BSD License.
// http://www.anki3d.org/LICENSE

#pragma anki mutator SRI_TEXEL_DIMENSION 8 16
#pragma anki mutator SHARED_MEMORY 0 1
#pragma anki mutator LIMIT_RATE_TO_2X2 0 1

#pragma anki technique comp

// NOTE(review): both include directives below have no target; the header paths appear to have been lost. Restore them.
#include
#include

// Computes, for every shading-rate-image (SRI) tile, the largest luma derivative along x and along y and relates it
// to the tile's average luma.
// A thread covers a 2x2 texel region for 8x8 tiles and a 2x4 texel region for 16x16 tiles.

// Per-thread region size in texels. The width is 2 for both tile sizes, only the height changes.
#define REGION_SIZE_X 2
#if SRI_TEXEL_DIMENSION == 8
#	define REGION_SIZE_Y 2
#else
#	define REGION_SIZE_Y 4
#endif

// One threadgroup covers exactly one SRI tile.
#define THREADGROUP_SIZE_X (SRI_TEXEL_DIMENSION / REGION_SIZE_X)
#define THREADGROUP_SIZE_Y (SRI_TEXEL_DIMENSION / REGION_SIZE_Y)

// Resources.
Texture2D g_inputTex : register(t0);
SamplerState g_nearestClampSampler : register(s0);
// NOTE(review): RWTexture2D normally needs an explicit element type; it looks like it was lost together with the
// include paths. Confirm against what encodeVrsRate() returns.
RWTexture2D g_sriStorageTex : register(u0);

struct Constants
{
	Vec2 m_oneOverViewportSize; // Multiplied with texel coordinates to form the sampling UV.
	F32 m_threshold; // Relative luma difference above which the 1x rate is picked.
	F32 m_padding0;
};
ANKI_FAST_CONSTANTS(Constants, g_consts)

#if SHARED_MEMORY
// Wave operations alone would be enough to reduce min/max/average if a single wave was guaranteed to cover the whole
// threadgroup. It is not, so shared memory is used as a fallback to combine the per-wave results. One entry per wave
// is required but the wave count is not a compile-time constant, so it is estimated by assuming that a wave has at
// least 8 lanes.
// One slot per wave. The divisor is the smallest wave size this shader assumes.
constexpr U32 kSharedMemoryEntries = THREADGROUP_SIZE_X * THREADGROUP_SIZE_Y / 8u;
groupshared F32 s_averageLuma[kSharedMemoryEntries];
groupshared Vec2 s_maxDerivative[kSharedMemoryEntries];
groupshared U32 s_waveIndexInsideThreadGroup;
#endif

// Returns the luminance of "color" remapped with l / (1 + l).
F32 computeLuma(Vec3 color)
{
	const F32 luminance = computeLuminance(color);
	return luminance / (1.0f + luminance);
}

// Samples mip 0 of the input texture at "uv" (a variable that must be in scope at the call site) displaced by the
// given texel offset and returns the luma of the result.
#define sampleLuma(offsetX, offsetY) computeLuma(g_inputTex.SampleLevel(g_nearestClampSampler, uv, 0.0, IVec2(offsetX, offsetY)).xyz)

// Entry point. Each threadgroup reduces its threads' luma statistics and thread 0 writes one encoded shading rate at
// the threadgroup's coordinate in the SRI.
[numthreads(THREADGROUP_SIZE_X, THREADGROUP_SIZE_Y, 1)] void main(UVec3 svDispatchThreadId : SV_DISPATCHTHREADID, U32 svGroupIndex : SV_GROUPINDEX,
																   UVec3 svGroupID : SV_GROUPID)
{
#if SHARED_MEMORY
	U32 wavesPerThreadGroup;
	U32 waveIndexInsideThreadGroup;
	ANKI_COMPUTE_WAVE_INDEX_INSIDE_THREADGROUP(svGroupIndex, s_waveIndexInsideThreadGroup, waveIndexInsideThreadGroup, wavesPerThreadGroup);
#endif

	// UV of the first texel of this thread's region.
	const Vec2 uv = (Vec2(svDispatchThreadId.xy) * Vec2(REGION_SIZE_X, REGION_SIZE_Y) + 0.5) * g_consts.m_oneOverViewportSize;

	// Luma samples are named lXY after their texel offset (X, Y) from the region's first texel.
	// These six are needed by both tile sizes:
	//      l12
	// l01  l11  l21
	// l00  l10
	const F32 l00 = sampleLuma(0, 0);
	const F32 l10 = sampleLuma(1, 0);
	const F32 l01 = sampleLuma(0, 1);
	const F32 l11 = sampleLuma(1, 1);
	const F32 l21 = sampleLuma(2, 1);
	const F32 l12 = sampleLuma(1, 2);

#if SRI_TEXEL_DIMENSION == 8
	// 2x2 region: two horizontal and two vertical differences.
	const F32 threadMaxDx = max(abs(l10 - l00), abs(l21 - l11));
	const F32 threadMaxDy = max(abs(l01 - l00), abs(l12 - l11));

	// Mean luma of the 2x2 region.
	const F32 threadAverageLuma = (l00 + l10 + l01 + l11) / 4.0;
#else
	// 2x4 region. Extra samples (lm13 is the one at offset (-1, 3)):
	//       l14
	// lm13  l03  l13
	//       l02  l12
	const F32 l02 = sampleLuma(0, 2);
	const F32 l03 = sampleLuma(0, 3);
	const F32 l13 = sampleLuma(1, 3);
	const F32 lm13 = sampleLuma(-1, 3);
	const F32 l14 = sampleLuma(1, 4);

	// Four horizontal and four vertical differences.
	const F32 threadMaxDx = max(max(abs(l10 - l00), abs(l21 - l11)), max(abs(l12 - l02), abs(lm13 - l03)));
	const F32 threadMaxDy = max(max(abs(l01 - l00), abs(l11 - l10)), max(abs(l03 - l02), abs(l14 - l13)));

	// Mean luma of the 2x4 region.
	const F32 threadAverageLuma = ((l00 + l02) + (l10 + l12) + (l01 + l03) + (l11 + l13)) / 8.0;
#endif

	// Reduce across the wave: largest derivatives and the sum of the per-thread means.
	const Vec2 waveMaxDerivative = Vec2(WaveActiveMax(threadMaxDx), WaveActiveMax(threadMaxDy));
	const F32 waveLumaSum = WaveActiveSum(threadAverageLuma);

#if SHARED_MEMORY
	// A single lane of every wave publishes the wave's result.
	[branch] if(WaveIsFirstLane())
	{
		s_averageLuma[waveIndexInsideThreadGroup] = waveLumaSum;
		s_maxDerivative[waveIndexInsideThreadGroup] = waveMaxDerivative;
	}

	GroupMemoryBarrierWithGroupSync();
#endif

	// Only the first thread of the threadgroup writes the result.
	[branch] if(svGroupIndex != 0u)
	{
		return;
	}

#if SHARED_MEMORY
	// Combine the per-wave results.
	F32 groupLumaSum = s_averageLuma[0];
	Vec2 groupMaxDerivative = s_maxDerivative[0];
	for(U32 wave = 1u; wave < wavesPerThreadGroup; ++wave)
	{
		groupLumaSum += s_averageLuma[wave];
		groupMaxDerivative = max(groupMaxDerivative, s_maxDerivative[wave]);
	}
#else
	// The wave result is used directly as the threadgroup result.
	const F32 groupLumaSum = waveLumaSum;
	const Vec2 groupMaxDerivative = waveMaxDerivative;
#endif

	// Every thread contributed the mean of its region, so dividing the sum by the thread count gives the tile mean.
	const F32 groupAverageLuma = groupLumaSum / F32(THREADGROUP_SIZE_X * THREADGROUP_SIZE_Y);

	// NOTE(review): for an all-black tile this is 0/0. Every comparison below is then presumably false, which selects
	// the 4x rate. Confirm that this is the intended result.
	const Vec2 relativeDerivative = groupMaxDerivative / groupAverageLuma;

	const F32 fullRateThreshold = g_consts.m_threshold;
	const F32 halfRateThreshold = fullRateThreshold * 0.4;

	// Choose the rate of each axis independently: 1 above the full threshold, 2 above the reduced one, 4 otherwise.
	UVec2 rate = UVec2(4u, 4u);

	if(relativeDerivative.x > fullRateThreshold)
	{
		rate.x = 1u;
	}
	else if(relativeDerivative.x > halfRateThreshold)
	{
		rate.x = 2u;
	}

	if(relativeDerivative.y > fullRateThreshold)
	{
		rate.y = 1u;
	}
	else if(relativeDerivative.y > halfRateThreshold)
	{
		rate.y = 2u;
	}

#if LIMIT_RATE_TO_2X2
	rate = min(rate, UVec2(2, 2));
#endif

	// One SRI texel per threadgroup.
	g_sriStorageTex[svGroupID.xy] = encodeVrsRate(rate);
}