3 years ago · 81cc04d1be
--- a/AnKi/Renderer/ConfigVars.defs.h
+++ b/AnKi/Renderer/ConfigVars.defs.h
@@ -10,6 +10,7 @@ ANKI_CONFIG_VAR_U32(RTileSize, 64, 8, 256, "Tile lighting tile size")
 
															 ANKI_CONFIG_VAR_U32(RZSplitCount, 64, 8, 1024, "Clusterer number of Z splits")
														
 
															 ANKI_CONFIG_VAR_BOOL(RPreferCompute, !ANKI_PLATFORM_MOBILE, "Prefer compute shaders")
														
 
															 ANKI_CONFIG_VAR_BOOL(RVrs, true, "Enable VRS in multiple passes")
														
 
															+ANKI_CONFIG_VAR_F32(RVrsThreshold, 0.05f, 0.0f, 1.0f, "Threshold under which a lower shading rate will be applied")
														
 
															 ANKI_CONFIG_VAR_BOOL(RHighQualityHdr, !ANKI_PLATFORM_MOBILE,
														
 
															 					 "If true use R16G16B16 for HDR images. Alternatively use B10G11R11")
														
--- a/AnKi/Renderer/VrsSriGeneration.cpp
+++ b/AnKi/Renderer/VrsSriGeneration.cpp
@@ -117,6 +117,8 @@ void VrsSriGeneration::populateRenderGraph(RenderingContext& ctx)
 
															 		rgraphCtx.bindColorTexture(0, 0, m_r->getTemporalAA().getTonemappedRt());
														
 
															 		rgraphCtx.bindImage(0, 1, m_runCtx.m_rt);
														
 
															+		Vec4 pc(getConfig().getRVrsThreshold());
														
 
															+		cmdb->setPushConstants(&pc, sizeof(pc));
														
 
															 		const U32 workgroupSize = m_sriTexelDimension;
														
 
															 		dispatchPPCompute(cmdb, workgroupSize, workgroupSize, m_r->getInternalResolution().x(),
														
--- a/AnKi/Shaders/VrsSriGeneration.glsl
+++ b/AnKi/Shaders/VrsSriGeneration.glsl
@@ -8,57 +8,128 @@
 
															 #include <AnKi/Shaders/Functions.glsl>
														
 
															 #include <AnKi/Shaders/TonemappingFunctions.glsl>
														
 
															+// Find the maximum luma derivative in x and y, relative to the average luma of the block.
														
 
															+// Each thread handles a 2x4 region.
														
 
															+
														
 
															 layout(set = 0, binding = 0) uniform ANKI_RP texture2D u_inputTex;
														
 
															-#if defined(ANKI_COMPUTE_SHADER)
														
 
															-const UVec2 WORKGROUP_SIZE = UVec2(SRI_TEXEL_DIMENSION, SRI_TEXEL_DIMENSION);
														
 
															+// Every invocation covers a 2x4 pixel region (see blockLocation in main()), so a workgroup of
														
 
															+// (SRI_TEXEL_DIMENSION / 2) x (SRI_TEXEL_DIMENSION / 4) invocations covers one
														
 
															+// SRI_TEXEL_DIMENSION x SRI_TEXEL_DIMENSION pixel tile. main() writes one SRI texel per workgroup.
														
 
															+const UVec2 WORKGROUP_SIZE = UVec2(SRI_TEXEL_DIMENSION / 2, SRI_TEXEL_DIMENSION / 4);
														
 
															 layout(local_size_x = WORKGROUP_SIZE.x, local_size_y = WORKGROUP_SIZE.y, local_size_z = 1) in;
														
 
															 layout(set = 0, binding = 1) uniform writeonly uimage2D u_sriImg;
														
 
															-#else
														
 
															-layout(location = 0) out U32 out_shadingRate;
														
 
															-#endif
														
 
															-shared F32 s_lumaMin[WORKGROUP_SIZE.y * WORKGROUP_SIZE.x];
														
 
															-shared F32 s_lumaMax[WORKGROUP_SIZE.y * WORKGROUP_SIZE.x];
														
 
															+// Push constants. VrsSriGeneration.cpp uploads a single Vec4 built from the RVrsThreshold config var
														
 
															+// (sizeof(Vec4) bytes), so this block is kept at four floats; only the first one is read by the shader.
														
 
															+layout(push_constant, std430) uniform b_pc
														
 
															+{
														
 
															+	// Relative luma derivative (max derivative / average tile luma) above which main() keeps the 1x rate.
														
 
															+	F32 u_threshold;
														
 
															+	// Unused. Present only to match the size of the Vec4 sent by the CPU side.
														
 
															+	F32 u_padding0;
														
 
															+	F32 u_padding1;
														
 
															+	F32 u_padding2;
														
 
															+};
														
 
															+
														
 
															+// Ideally, we'd be able to calculate the min/max/average using subgroup operations,
														
 
															+// but there's no guarantee subgroupSize is large enough so we need shared memory as a fallback.
														
 
															+// We need gl_NumSubgroups entries, but it is not a constant, so estimate it assuming a subgroupSize of at least 8.
														
 
															+// NOTE(review): main() indexes both arrays with gl_SubgroupID, so a device whose subgroupSize is below 8
														
 
															+// would have more subgroups than entries. The workgroup must also have at least 8 invocations for the
														
 
															+// entry count to be non-zero. Confirm both against the supported devices and SRI_TEXEL_DIMENSION values.
														
 
															+const U32 SHARED_MEMORY_ENTRIES = WORKGROUP_SIZE.x * WORKGROUP_SIZE.y / 8u;
														
 
															+// Per-subgroup sum of the per-invocation average luma.
														
 
															+shared F32 s_averageLuma[SHARED_MEMORY_ENTRIES];
														
 
															+// Per-subgroup maximum absolute luma derivative: x holds the horizontal one, y the vertical one.
														
 
															+shared Vec2 s_maxDerivative[SHARED_MEMORY_ENTRIES];
														
 
															+
														
 
															+// Returns the luminance of the u_inputTex texel (mip 0) at `location`. Each coordinate is limited to
														
 
															+// `maxLocation`, so fetches beyond the right/bottom edge reuse the edge texel. Only the upper bound is clamped.
														
 
															+F32 sampleLuma(IVec2 location, IVec2 maxLocation)
														
 
															+{
														
 
															+	const IVec2 clampedLocation = min(location, maxLocation);
														
 
															+	return computeLuminance(texelFetch(u_inputTex, clampedLocation, 0).xyz);
														
 
															+}
														
 
															 void main()
														
 
															 {
														
 
															-	// Get luminance
														
 
															-	const Vec3 color = texelFetch(u_inputTex, IVec2(gl_GlobalInvocationID.xy), 0).xyz;
														
 
															-	const F32 luma = computeLuminance(color);
														
 
															+	const IVec2 blockLocation = IVec2(gl_GlobalInvocationID.xy) * IVec2(2, 4);
														
 
															+	const IVec2 maxLocation = textureSize(u_inputTex, 0) - IVec2(1, 1);
														
 
															+
														
 
															+	// Get luminance.
														
 
															+	// l0.x  l0.y
														
 
															+	// l0.z  l0.w  l2.x
														
 
															+	// l1.x  l1.y
														
 
															+	// l1.z  l1.w  l2.y
														
 
															+	//       l2.z
														
 
															+
														
 
															+	Vec4 l0;
														
 
															+	l0.x = sampleLuma(blockLocation + IVec2(0, 0), maxLocation);
														
 
															+	l0.y = sampleLuma(blockLocation + IVec2(1, 0), maxLocation);
														
 
															+	l0.z = sampleLuma(blockLocation + IVec2(0, 1), maxLocation);
														
 
															+	l0.w = sampleLuma(blockLocation + IVec2(1, 1), maxLocation);
														
 
															+
														
 
															+	Vec4 l1;
														
 
															+	l1.x = sampleLuma(blockLocation + IVec2(0, 2), maxLocation);
														
 
															+	l1.y = sampleLuma(blockLocation + IVec2(1, 2), maxLocation);
														
 
															+	l1.z = sampleLuma(blockLocation + IVec2(0, 3), maxLocation);
														
 
															+	l1.w = sampleLuma(blockLocation + IVec2(1, 3), maxLocation);
														
 
															+
														
 
															+	Vec3 l2;
														
 
															+	l2.x = sampleLuma(blockLocation + IVec2(2, 1), maxLocation);
														
 
															+	l2.y = sampleLuma(blockLocation + IVec2(2, 3), maxLocation);
														
 
															+	l2.z = sampleLuma(blockLocation + IVec2(1, 4), maxLocation);
														
 
															+
														
 
															+	// Calculate derivatives.
														
 
															+	Vec4 a = Vec4(l0.y, l2.x, l1.y, l2.y);
														
 
															+	Vec4 b = Vec4(l0.x, l0.w, l1.x, l1.w);
														
 
															+	const Vec4 dx = abs(a - b);
														
 
															+
														
 
															+	a = Vec4(l0.z, l0.w, l1.z, l2.z);
														
 
															+	b = Vec4(l0.x, l0.y, l1.x, l1.w);
														
 
															+	const Vec4 dy = abs(a - b);
														
 
															+
														
 
															+	F32 maxDerivativeX = max(max(dx.x, dx.y), max(dx.z, dx.w));
														
 
															+	F32 maxDerivativeY = max(max(dy.x, dy.y), max(dy.z, dy.w));
														
 
															+	maxDerivativeX = subgroupMax(maxDerivativeX);
														
 
															+	maxDerivativeY = subgroupMax(maxDerivativeY);
														
 
															+
														
 
															+	// Calculate average luma in block.
														
 
															+	const Vec4 sumL0L1 = l0 + l1;
														
 
															+	F32 averageLuma = (sumL0L1.x + sumL0L1.y + sumL0L1.z + sumL0L1.w) / 8.0;
														
 
															+	averageLuma = subgroupAdd(averageLuma);
														
 
															+
														
 
															+	// Store results in shared memory.
														
 
															+	ANKI_BRANCH if(subgroupElect())
														
 
															+	{
														
 
															+		s_averageLuma[gl_SubgroupID] = averageLuma;
														
 
															+		s_maxDerivative[gl_SubgroupID] = Vec2(maxDerivativeX, maxDerivativeY);
														
 
															+	}
														
 
															-	// Store luminance
														
 
															-	s_lumaMin[gl_LocalInvocationIndex] = luma;
														
 
															-	s_lumaMax[gl_LocalInvocationIndex] = luma;
														
 
															 	memoryBarrierShared();
														
 
															 	barrier();
														
 
															-	// Gather the results into one
														
 
															-	ANKI_LOOP for(U32 s = (WORKGROUP_SIZE.x * WORKGROUP_SIZE.y) / 2u; s > 0u; s >>= 1u)
														
 
															+	// Write the result
														
 
															+	ANKI_BRANCH if(gl_LocalInvocationIndex == 0u)
														
 
															 	{
														
 
															-		if(gl_LocalInvocationIndex < s)
														
 
															+		// Get max across all subgroups.
														
 
															+		averageLuma = s_averageLuma[0];
														
 
															+		Vec2 maxDerivative = s_maxDerivative[0];
														
 
															+		for(U32 i = 1u; i < gl_NumSubgroups; ++i)
														
 
															 		{
														
 
															-			s_lumaMin[gl_LocalInvocationIndex] =
														
 
															-				min(s_lumaMin[gl_LocalInvocationIndex], s_lumaMin[gl_LocalInvocationIndex + s]);
														
 
															-			s_lumaMax[gl_LocalInvocationIndex] =
														
 
															-				max(s_lumaMax[gl_LocalInvocationIndex], s_lumaMax[gl_LocalInvocationIndex + s]);
														
 
															+			averageLuma += s_averageLuma[i];
														
 
															+			maxDerivative = max(maxDerivative, s_maxDerivative[i]);
														
 
															 		}
														
 
															-		memoryBarrierShared();
														
 
															-		barrier();
														
 
															-	}
														
 
															+		// Determine shading rate.
														
 
															+		const F32 avgLuma = averageLuma / F32(WORKGROUP_SIZE.x * WORKGROUP_SIZE.y);
														
 
															+		const Vec2 lumaDiff = maxDerivative / avgLuma;
														
 
															+		const F32 threshold1 = u_threshold;
														
 
															+		const F32 threshold2 = threshold1 * 0.4;
														
 
															-	// Write the result
														
 
															-	ANKI_BRANCH if(gl_LocalInvocationIndex == 0u)
														
 
															-	{
														
 
															-		const F32 diff = s_lumaMax[0] - s_lumaMin[0];
														
 
															-		const F32 maxLumaDiff = 1.0 / 32.0;
														
 
															+		UVec2 rate;
														
 
															+		rate.x = lumaDiff.x > threshold1 ? 1u : (lumaDiff.x > threshold2 ? 2u : 4u);
														
 
															+		rate.y = lumaDiff.y > threshold1 ? 1u : (lumaDiff.y > threshold2 ? 2u : 4u);
														
 
															-		const F32 factor = min(1.0, diff / maxLumaDiff);
														
 
															-		const U32 rate = 1u << (2u - U32(factor * 2.0));
														
 
															+		// 1x4 and 4x1 shading rates don't exist.
														
 
															+		if(rate == UVec2(1u, 4u))
														
 
															+		{
														
 
															+			rate = UVec2(1u, 2u);
														
 
															+		}
														
 
															+		else if(rate == UVec2(4u, 1u))
														
 
															+		{
														
 
															+			rate = UVec2(2u, 1u);
														
 
															+		}
														
 
															 		const UVec2 inputTexelCoord = gl_WorkGroupID.xy;
														
 
															-		imageStore(u_sriImg, IVec2(inputTexelCoord), UVec4(encodeVrsRate(UVec2(rate))));
														
 
															+		imageStore(u_sriImg, IVec2(inputTexelCoord), UVec4(encodeVrsRate(rate)));
														
 
															 	}
														
 
															 }
														
--- a/AnKi/Shaders/VrsSriVisualizeRenderTarget.ankiprog
+++ b/AnKi/Shaders/VrsSriVisualizeRenderTarget.ankiprog
@@ -23,15 +23,23 @@ void main()
 
															 	if(rate == UVec2(1u))
														
 
															 	{
														
 
															-		out_color = Vec3(0.0, 0.0, 1.0);
														
 
															+		out_color = Vec3(1.0, 0.0, 0.0);
														
 
															+	}
														
 
															+	else if(rate == UVec2(2u, 1u) || rate == UVec2(1u, 2u))
														
 
															+	{
														
 
															+		out_color = Vec3(1.0, 0.5, 0.0);
														
 
															 	}
														
 
															 	else if(rate == UVec2(2u))
														
 
															 	{
														
 
															-		out_color = Vec3(0.0, 1.0, 0.0);
														
 
															+		out_color = Vec3(1.0, 1.0, 0.0);
														
 
															+	}
														
 
															+	else if(rate == UVec2(4u, 2u) || rate == UVec2(2u, 4u))
														
 
															+	{
														
 
															+		out_color = Vec3(0.5, 1.0, 0.0);
														
 
															 	}
														
 
															 	else if(rate == UVec2(4u))
														
 
															 	{
														
 
															-		out_color = Vec3(1.0, 0.0, 0.0);
														
 
															+		out_color = Vec3(0.0, 1.0, 0.0);
														
 
															 	}
														
 
															 	else
														
 
															 	{