|
@@ -8,57 +8,128 @@
|
|
|
#include <AnKi/Shaders/Functions.glsl>
|
|
#include <AnKi/Shaders/Functions.glsl>
|
|
|
#include <AnKi/Shaders/TonemappingFunctions.glsl>
|
|
#include <AnKi/Shaders/TonemappingFunctions.glsl>
|
|
|
|
|
|
|
|
|
|
+// Find the maximum luma derivative in x and y, relative to the average luma of the block.
|
|
|
|
|
+// Each thread handles a 2x4 region.
|
|
|
|
|
+
|
|
|
layout(set = 0, binding = 0) uniform ANKI_RP texture2D u_inputTex; // Input image; its texels are read with texelFetch and converted to luminance.
|
|
layout(set = 0, binding = 0) uniform ANKI_RP texture2D u_inputTex;
|
|
|
|
|
|
|
|
-#if defined(ANKI_COMPUTE_SHADER)
|
|
|
|
|
-const UVec2 WORKGROUP_SIZE = UVec2(SRI_TEXEL_DIMENSION, SRI_TEXEL_DIMENSION);
|
|
|
|
|
|
|
+const UVec2 WORKGROUP_SIZE = UVec2(SRI_TEXEL_DIMENSION / 2, SRI_TEXEL_DIMENSION / 4); // Each invocation covers 2x4 texels, so one workgroup spans SRI_TEXEL_DIMENSION x SRI_TEXEL_DIMENSION input texels.
|
|
|
layout(local_size_x = WORKGROUP_SIZE.x, local_size_y = WORKGROUP_SIZE.y, local_size_z = 1) in;
|
|
layout(local_size_x = WORKGROUP_SIZE.x, local_size_y = WORKGROUP_SIZE.y, local_size_z = 1) in;
|
|
|
|
|
|
|
|
layout(set = 0, binding = 1) uniform writeonly uimage2D u_sriImg; // Output image; main() stores one encoded shading rate at gl_WorkGroupID.xy.
|
|
layout(set = 0, binding = 1) uniform writeonly uimage2D u_sriImg;
|
|
|
-#else
|
|
|
|
|
-layout(location = 0) out U32 out_shadingRate;
|
|
|
|
|
-#endif
|
|
|
|
|
|
|
|
|
|
-shared F32 s_lumaMin[WORKGROUP_SIZE.y * WORKGROUP_SIZE.x];
|
|
|
|
|
-shared F32 s_lumaMax[WORKGROUP_SIZE.y * WORKGROUP_SIZE.x];
|
|
|
|
|
|
|
+layout(push_constant, std430) uniform b_pc // Push constants for this pass.
|
|
|
|
|
+{
|
|
|
|
|
+ F32 u_threshold; // Relative luma-derivative threshold: above it the 1x rate is chosen, above 0.4 of it the 2x rate, otherwise 4x.
|
|
|
|
|
+ F32 u_padding0; // Not referenced in the visible code; presumably pads the block to 16 bytes - TODO confirm.
|
|
|
|
|
+ F32 u_padding1;
|
|
|
|
|
+ F32 u_padding2;
|
|
|
|
|
+};
|
|
|
|
|
+
|
|
|
|
|
+// Ideally, we'd be able to calculate the min/max/average using subgroup operations,
|
|
|
|
|
+// but there's no guarantee subgroupSize is large enough so we need shared memory as a fallback.
|
|
|
|
|
+// We need gl_NumSubgroups entries, but it is not a constant, so estimate it assuming a subgroupSize of at least 8.
|
|
|
|
|
+const U32 SHARED_MEMORY_ENTRIES = WORKGROUP_SIZE.x * WORKGROUP_SIZE.y / 8u; // NOTE(review): the arrays below are indexed by gl_SubgroupID; a subgroupSize below 8 (or fewer than 8 invocations per group) would make this too small - confirm.
|
|
|
|
|
+shared F32 s_averageLuma[SHARED_MEMORY_ENTRIES]; // Per subgroup: sum of the invocations' average luma.
|
|
|
|
|
+shared Vec2 s_maxDerivative[SHARED_MEMORY_ENTRIES]; // Per subgroup: max absolute luma derivative in x (.x) and y (.y).
|
|
|
|
|
+
|
|
|
|
|
+F32 sampleLuma(IVec2 location, IVec2 maxLocation) // Returns the luminance of the u_inputTex texel at location, clamped to at most maxLocation.
|
|
|
|
|
+{
|
|
|
|
|
+ const Vec3 color = texelFetch(u_inputTex, min(location, maxLocation), 0).xyz; // min() keeps fetches beyond maxLocation on the last row/column; mip level 0.
|
|
|
|
|
+ return computeLuminance(color);
|
|
|
|
|
+}
|
|
|
|
|
|
|
|
void main() // Picks one shading rate per workgroup from the block's luma derivatives relative to its average luma.
|
|
void main()
|
|
|
{
|
|
{
|
|
|
- // Get luminance
|
|
|
|
|
- const Vec3 color = texelFetch(u_inputTex, IVec2(gl_GlobalInvocationID.xy), 0).xyz;
|
|
|
|
|
- const F32 luma = computeLuminance(color);
|
|
|
|
|
|
|
+ const IVec2 blockLocation = IVec2(gl_GlobalInvocationID.xy) * IVec2(2, 4); // Top-left texel of this invocation's 2x4 region.
|
|
|
|
|
+ const IVec2 maxLocation = textureSize(u_inputTex, 0) - IVec2(1, 1); // Last valid texel; sampleLuma() clamps to it.
|
|
|
|
|
+
|
|
|
|
|
+ // Get luminance.
|
|
|
|
|
+ // l0.x l0.y
|
|
|
|
|
+ // l0.z l0.w l2.x
|
|
|
|
|
+ // l1.x l1.y
|
|
|
|
|
+ // l1.z l1.w l2.y
|
|
|
|
|
+ // l2.z
|
|
|
|
|
+
|
|
|
|
|
+ Vec4 l0;
|
|
|
|
|
+ l0.x = sampleLuma(blockLocation + IVec2(0, 0), maxLocation);
|
|
|
|
|
+ l0.y = sampleLuma(blockLocation + IVec2(1, 0), maxLocation);
|
|
|
|
|
+ l0.z = sampleLuma(blockLocation + IVec2(0, 1), maxLocation);
|
|
|
|
|
+ l0.w = sampleLuma(blockLocation + IVec2(1, 1), maxLocation);
|
|
|
|
|
+
|
|
|
|
|
+ Vec4 l1;
|
|
|
|
|
+ l1.x = sampleLuma(blockLocation + IVec2(0, 2), maxLocation);
|
|
|
|
|
+ l1.y = sampleLuma(blockLocation + IVec2(1, 2), maxLocation);
|
|
|
|
|
+ l1.z = sampleLuma(blockLocation + IVec2(0, 3), maxLocation);
|
|
|
|
|
+ l1.w = sampleLuma(blockLocation + IVec2(1, 3), maxLocation);
|
|
|
|
|
+
|
|
|
|
|
+ Vec3 l2; // Three texels just outside the 2x4 region, used only for the derivatives.
|
|
|
|
|
+ l2.x = sampleLuma(blockLocation + IVec2(2, 1), maxLocation);
|
|
|
|
|
+ l2.y = sampleLuma(blockLocation + IVec2(2, 3), maxLocation);
|
|
|
|
|
+ l2.z = sampleLuma(blockLocation + IVec2(1, 4), maxLocation);
|
|
|
|
|
+
|
|
|
|
|
+ // Calculate derivatives.
|
|
|
|
|
+ Vec4 a = Vec4(l0.y, l2.x, l1.y, l2.y);
|
|
|
|
|
+ Vec4 b = Vec4(l0.x, l0.w, l1.x, l1.w);
|
|
|
|
|
+ const Vec4 dx = abs(a - b); // One horizontal neighbour difference per row of the region.
|
|
|
|
|
+
|
|
|
|
|
+ a = Vec4(l0.z, l0.w, l1.z, l2.z);
|
|
|
|
|
+ b = Vec4(l0.x, l0.y, l1.x, l1.w);
|
|
|
|
|
+ const Vec4 dy = abs(a - b); // Four vertical neighbour differences.
|
|
|
|
|
+
|
|
|
|
|
+ F32 maxDerivativeX = max(max(dx.x, dx.y), max(dx.z, dx.w));
|
|
|
|
|
+ F32 maxDerivativeY = max(max(dy.x, dy.y), max(dy.z, dy.w));
|
|
|
|
|
+ maxDerivativeX = subgroupMax(maxDerivativeX);
|
|
|
|
|
+ maxDerivativeY = subgroupMax(maxDerivativeY);
|
|
|
|
|
+
|
|
|
|
|
+ // Calculate average luma in block.
|
|
|
|
|
+ const Vec4 sumL0L1 = l0 + l1;
|
|
|
|
|
+ F32 averageLuma = (sumL0L1.x + sumL0L1.y + sumL0L1.z + sumL0L1.w) / 8.0; // Mean of this invocation's 8 texels.
|
|
|
|
|
+ averageLuma = subgroupAdd(averageLuma); // Now the sum of the per-invocation means across the subgroup.
|
|
|
|
|
+
|
|
|
|
|
+ // Store results in shared memory.
|
|
|
|
|
+ ANKI_BRANCH if(subgroupElect()) // One invocation per subgroup writes that subgroup's entry.
|
|
|
|
|
+ {
|
|
|
|
|
+ s_averageLuma[gl_SubgroupID] = averageLuma;
|
|
|
|
|
+ s_maxDerivative[gl_SubgroupID] = Vec2(maxDerivativeX, maxDerivativeY);
|
|
|
|
|
+ }
|
|
|
|
|
|
|
|
- // Store luminance
|
|
|
|
|
- s_lumaMin[gl_LocalInvocationIndex] = luma;
|
|
|
|
|
- s_lumaMax[gl_LocalInvocationIndex] = luma;
|
|
|
|
|
memoryBarrierShared();
|
|
memoryBarrierShared();
|
|
|
barrier();
|
|
barrier();
|
|
|
|
|
|
|
|
- // Gather the results into one
|
|
|
|
|
- ANKI_LOOP for(U32 s = (WORKGROUP_SIZE.x * WORKGROUP_SIZE.y) / 2u; s > 0u; s >>= 1u)
|
|
|
|
|
|
|
+ // Write the result
|
|
|
|
|
+ ANKI_BRANCH if(gl_LocalInvocationIndex == 0u)
|
|
|
{
|
|
{
|
|
|
- if(gl_LocalInvocationIndex < s)
|
|
|
|
|
|
|
+ // Combine the per-subgroup results: sum the luma, take the max of the derivatives.
|
|
|
|
|
+ averageLuma = s_averageLuma[0];
|
|
|
|
|
+ Vec2 maxDerivative = s_maxDerivative[0];
|
|
|
|
|
+ for(U32 i = 1u; i < gl_NumSubgroups; ++i)
|
|
|
{
|
|
{
|
|
|
- s_lumaMin[gl_LocalInvocationIndex] =
|
|
|
|
|
- min(s_lumaMin[gl_LocalInvocationIndex], s_lumaMin[gl_LocalInvocationIndex + s]);
|
|
|
|
|
- s_lumaMax[gl_LocalInvocationIndex] =
|
|
|
|
|
- max(s_lumaMax[gl_LocalInvocationIndex], s_lumaMax[gl_LocalInvocationIndex + s]);
|
|
|
|
|
|
|
+ averageLuma += s_averageLuma[i];
|
|
|
|
|
+ maxDerivative = max(maxDerivative, s_maxDerivative[i]);
|
|
|
}
|
|
}
|
|
|
|
|
|
|
|
- memoryBarrierShared();
|
|
|
|
|
- barrier();
|
|
|
|
|
- }
|
|
|
|
|
|
|
+ // Determine shading rate.
|
|
|
|
|
+ const F32 avgLuma = averageLuma / F32(WORKGROUP_SIZE.x * WORKGROUP_SIZE.y); // Summed per-invocation means divided by the invocation count.
|
|
|
|
|
+ const Vec2 lumaDiff = maxDerivative / avgLuma; // NOTE(review): avgLuma is 0 if every sampled luma is 0 - confirm the division result is acceptable.
|
|
|
|
|
+ const F32 threshold1 = u_threshold;
|
|
|
|
|
+ const F32 threshold2 = threshold1 * 0.4; // Lower threshold separating the 2x and 4x rates.
|
|
|
|
|
|
|
|
- // Write the result
|
|
|
|
|
- ANKI_BRANCH if(gl_LocalInvocationIndex == 0u)
|
|
|
|
|
- {
|
|
|
|
|
- const F32 diff = s_lumaMax[0] - s_lumaMin[0];
|
|
|
|
|
- const F32 maxLumaDiff = 1.0 / 32.0;
|
|
|
|
|
|
|
+ UVec2 rate; // Per-axis rate: 1, 2 or 4.
|
|
|
|
|
+ rate.x = lumaDiff.x > threshold1 ? 1u : (lumaDiff.x > threshold2 ? 2u : 4u);
|
|
|
|
|
+ rate.y = lumaDiff.y > threshold1 ? 1u : (lumaDiff.y > threshold2 ? 2u : 4u);
|
|
|
|
|
|
|
|
- const F32 factor = min(1.0, diff / maxLumaDiff);
|
|
|
|
|
- const U32 rate = 1u << (2u - U32(factor * 2.0));
|
|
|
|
|
|
|
+ // 1x4 and 4x1 shading rates don't exist.
|
|
|
|
|
+ if(rate == UVec2(1u, 4u))
|
|
|
|
|
+ {
|
|
|
|
|
+ rate = UVec2(1u, 2u);
|
|
|
|
|
+ }
|
|
|
|
|
+ else if(rate == UVec2(4u, 1u))
|
|
|
|
|
+ {
|
|
|
|
|
+ rate = UVec2(2u, 1u);
|
|
|
|
|
+ }
|
|
|
|
|
|
|
|
const UVec2 inputTexelCoord = gl_WorkGroupID.xy;
|
|
const UVec2 inputTexelCoord = gl_WorkGroupID.xy;
|
|
|
- imageStore(u_sriImg, IVec2(inputTexelCoord), UVec4(encodeVrsRate(UVec2(rate))));
|
|
|
|
|
|
|
+ imageStore(u_sriImg, IVec2(inputTexelCoord), UVec4(encodeVrsRate(rate))); // One output texel per workgroup.
|
|
|
}
|
|
}
|
|
|
}
|
|
}
|