3 years ago · f940a340a0
--- a/AnKi/Gr/Common.h
+++ b/AnKi/Gr/Common.h
@@ -144,6 +144,12 @@ public:
 
				 	/// The size of a shader group handle that will be placed inside an SBT record.
			
 
				 	U32 m_shaderGroupHandleSize = 0;
			
 
				 
			
 
				+	/// Min subgroup size of the GPU.
			
 
				+	U32 m_minSubgroupSize = 0;
			
 
				+
			
 
				+	/// Max subgroup size of the GPU.
			
 
				+	U32 m_maxSubgroupSize = 0;
			
 
				+
			
 
				 	/// GPU vendor.
			
 
				 	GpuVendor m_gpuVendor = GpuVendor::UNKNOWN;
			
 
				 
			
@@ -173,7 +179,7 @@ public:
 
				 };
			
 
				 ANKI_END_PACKED_STRUCT
			
 
				 static_assert(sizeof(GpuDeviceCapabilities)
			
 
				-				  == sizeof(PtrSize) * 4 + sizeof(U32) * 5 + sizeof(U8) * 3 + sizeof(Bool) * 6,
			
 
				+				  == sizeof(PtrSize) * 4 + sizeof(U32) * 7 + sizeof(U8) * 3 + sizeof(Bool) * 6,
			
 
				 			  "Should be packed");
			
 
				 
			
 
				 /// The type of the allocator for heap allocations
			
--- a/AnKi/Gr/Vulkan/GrManagerImpl.cpp
+++ b/AnKi/Gr/Vulkan/GrManagerImpl.cpp
@@ -480,22 +480,35 @@ Error GrManagerImpl::initInstance(const GrManagerInitInfo& init)
 
				 	{
			
 
				 	case 0x13B5:
			
 
				 		m_capabilities.m_gpuVendor = GpuVendor::ARM;
			
 
				+		m_capabilities.m_minSubgroupSize = 16;
			
 
				+		m_capabilities.m_maxSubgroupSize = 16;
			
 
				 		break;
			
 
				 	case 0x10DE:
			
 
				 		m_capabilities.m_gpuVendor = GpuVendor::NVIDIA;
			
 
				+		m_capabilities.m_minSubgroupSize = 32;
			
 
				+		m_capabilities.m_maxSubgroupSize = 32;
			
 
				 		break;
			
 
				 	case 0x1002:
			
 
				 	case 0x1022:
			
 
				 		m_capabilities.m_gpuVendor = GpuVendor::AMD;
			
 
				+		m_capabilities.m_minSubgroupSize = 32;
			
 
				+		m_capabilities.m_maxSubgroupSize = 64;
			
 
				 		break;
			
 
				 	case 0x8086:
			
 
				 		m_capabilities.m_gpuVendor = GpuVendor::INTEL;
			
 
				+		m_capabilities.m_minSubgroupSize = 8;
			
 
				+		m_capabilities.m_maxSubgroupSize = 32;
			
 
				 		break;
			
 
				 	case 0x5143:
			
 
				 		m_capabilities.m_gpuVendor = GpuVendor::QUALCOMM;
			
 
				+		m_capabilities.m_minSubgroupSize = 64;
			
 
				+		m_capabilities.m_maxSubgroupSize = 128;
			
 
				 		break;
			
 
				 	default:
			
 
				 		m_capabilities.m_gpuVendor = GpuVendor::UNKNOWN;
			
 
				+		// Choose something really low
			
 
				+		m_capabilities.m_minSubgroupSize = 8;
			
 
				+		m_capabilities.m_maxSubgroupSize = 8;
			
 
				 	}
			
 
				 	ANKI_VK_LOGI("GPU is %s. Vendor identified as %s", m_devProps.properties.deviceName,
			
 
				 				 &GPU_VENDOR_STR[m_capabilities.m_gpuVendor][0]);
			
--- a/AnKi/Renderer/ConfigVars.defs.h
+++ b/AnKi/Renderer/ConfigVars.defs.h
@@ -10,6 +10,7 @@ ANKI_CONFIG_VAR_U32(RTileSize, 64, 8, 256, "Tile lighting tile size")
 
				 ANKI_CONFIG_VAR_U32(RZSplitCount, 64, 8, 1024, "Clusterer number of Z splits")
			
 
				 ANKI_CONFIG_VAR_BOOL(RPreferCompute, !ANKI_PLATFORM_MOBILE, "Prefer compute shaders")
			
 
				 ANKI_CONFIG_VAR_BOOL(RVrs, true, "Enable VRS in multiple passes")
			
 
				+ANKI_CONFIG_VAR_F32(RVrsThreshold, 0.05f, 0.0f, 1.0f, "Threshold under which a lower shading rate will be applied")
			
 
				 ANKI_CONFIG_VAR_BOOL(RHighQualityHdr, !ANKI_PLATFORM_MOBILE,
			
 
				 					 "If true use R16G16B16 for HDR images. Alternatively use B10G11R11")
			
 
				 
			
--- a/AnKi/Renderer/VrsSriGeneration.cpp
+++ b/AnKi/Renderer/VrsSriGeneration.cpp
@@ -57,6 +57,18 @@ Error VrsSriGeneration::initInternal()
 
				 	ANKI_CHECK(getResourceManager().loadResource("AnKi/Shaders/VrsSriGenerationCompute.ankiprog", m_prog));
			
 
				 	ShaderProgramResourceVariantInitInfo variantInit(m_prog);
			
 
				 	variantInit.addMutation("SRI_TEXEL_DIMENSION", m_sriTexelDimension);
			
 
				+
			
 
				+	if(m_sriTexelDimension == 16 && getGrManager().getDeviceCapabilities().m_minSubgroupSize >= 32)
			
 
				+	{
			
 
				+		// The workgroup has 32 threads and the GPU's subgroup size is at least 32 -> each workgroup has 1 subgroup -> No need
			
 
				+		// for shared mem
			
 
				+		variantInit.addMutation("SHARED_MEMORY", 0);
			
 
				+	}
			
 
				+	else
			
 
				+	{
			
 
				+		variantInit.addMutation("SHARED_MEMORY", 1);
			
 
				+	}
			
 
				+
			
 
				 	const ShaderProgramResourceVariant* variant;
			
 
				 	m_prog->getOrCreateVariant(variantInit, variant);
			
 
				 	m_grProg = variant->getProgram();
			
@@ -116,10 +128,13 @@ void VrsSriGeneration::populateRenderGraph(RenderingContext& ctx)
 
				 		cmdb->bindShaderProgram(m_grProg);
			
 
				 
			
 
				 		rgraphCtx.bindColorTexture(0, 0, m_r->getTemporalAA().getTonemappedRt());
			
 
				-		rgraphCtx.bindImage(0, 1, m_runCtx.m_rt);
			
 
				+		cmdb->bindSampler(0, 1, m_r->getSamplers().m_nearestNearestClamp);
			
 
				+		rgraphCtx.bindImage(0, 2, m_runCtx.m_rt);
			
 
				+		const Vec4 pc(1.0f / Vec2(m_r->getInternalResolution()), getConfig().getRVrsThreshold(), 0.0f);
			
 
				+		cmdb->setPushConstants(&pc, sizeof(pc));
			
 
				 
			
 
				-		const U32 workgroupSize = m_sriTexelDimension;
			
 
				-		dispatchPPCompute(cmdb, workgroupSize, workgroupSize, m_r->getInternalResolution().x(),
			
 
				+		const U32 fakeWorkgroupSizeXorY = m_sriTexelDimension;
			
 
				+		dispatchPPCompute(cmdb, fakeWorkgroupSizeXorY, fakeWorkgroupSizeXorY, m_r->getInternalResolution().x(),
			
 
				 						  m_r->getInternalResolution().y());
			
 
				 	});
			
 
				 }
			
--- a/AnKi/Shaders/VrsSriGeneration.glsl
+++ b/AnKi/Shaders/VrsSriGeneration.glsl
@@ -3,62 +3,141 @@
 
				 // Code licensed under the BSD License.
			
 
				 // http://www.anki3d.org/LICENSE
			
 
				 
			
 
				-#pragma anki mutator SRI_TEXEL_DIMENSION 8 16 32
			
 
				+#pragma anki mutator SRI_TEXEL_DIMENSION 8 16
			
 
				+#pragma anki mutator SHARED_MEMORY 0 1
			
 
				 
			
 
				 #include <AnKi/Shaders/Functions.glsl>
			
 
				 #include <AnKi/Shaders/TonemappingFunctions.glsl>
			
 
				 
			
 
				+// Find the maximum luma derivative in x and y, relative to the average luma of the block.
			
 
				+// Each thread handles a 2x4 region.
			
 
				+
			
 
				 layout(set = 0, binding = 0) uniform ANKI_RP texture2D u_inputTex;
			
 
				+layout(set = 0, binding = 1) uniform sampler u_nearestClampSampler;
			
 
				+
			
 
				+const UVec2 REGION_SIZE = UVec2(2u, 4u);
			
 
				 
			
 
				-#if defined(ANKI_COMPUTE_SHADER)
			
 
				-const UVec2 WORKGROUP_SIZE = UVec2(SRI_TEXEL_DIMENSION, SRI_TEXEL_DIMENSION);
			
 
				+const UVec2 WORKGROUP_SIZE = UVec2(SRI_TEXEL_DIMENSION) / REGION_SIZE;
			
 
				 layout(local_size_x = WORKGROUP_SIZE.x, local_size_y = WORKGROUP_SIZE.y, local_size_z = 1) in;
			
 
				 
			
 
				-layout(set = 0, binding = 1) uniform writeonly uimage2D u_sriImg;
			
 
				-#else
			
 
				-layout(location = 0) out U32 out_shadingRate;
			
 
				+layout(set = 0, binding = 2) uniform writeonly uimage2D u_sriImg;
			
 
				+
			
 
				+layout(push_constant, std430) uniform b_pc
			
 
				+{
			
 
				+	Vec2 u_oneOverViewportSize;
			
 
				+	F32 u_threshold;
			
 
				+	F32 u_padding0;
			
 
				+};
			
 
				+
			
 
				+#if SHARED_MEMORY
			
 
				+// Ideally, we'd be able to calculate the max/average using subgroup operations, but there's no guarantee
			
 
				+// subgroupSize is large enough so we need shared memory as a fallback. We need gl_NumSubgroups entries, but it is not a
			
 
				+// constant, so estimate it assuming a subgroupSize of at least 8.
			
 
				+const U32 SHARED_MEMORY_ENTRIES = WORKGROUP_SIZE.x * WORKGROUP_SIZE.y / 8u;
			
 
				+shared F32 s_averageLuma[SHARED_MEMORY_ENTRIES];
			
 
				+shared Vec2 s_maxDerivative[SHARED_MEMORY_ENTRIES];
			
 
				 #endif
			
 
				 
			
 
				-shared F32 s_lumaMin[WORKGROUP_SIZE.y * WORKGROUP_SIZE.x];
			
 
				-shared F32 s_lumaMax[WORKGROUP_SIZE.y * WORKGROUP_SIZE.x];
			
 
				+#define sampleLuma(offsetX, offsetY) \
			
 
				+	computeLuminance( \
			
 
				+		textureLodOffset(sampler2D(u_inputTex, u_nearestClampSampler), uv, 0.0, IVec2(offsetX, offsetY)).xyz)
			
 
				 
			
 
				 void main()
			
 
				 {
			
 
				-	// Get luminance
			
 
				-	const Vec3 color = texelFetch(u_inputTex, IVec2(gl_GlobalInvocationID.xy), 0).xyz;
			
 
				-	const F32 luma = computeLuminance(color);
			
 
				+	const Vec2 uv = Vec2(gl_GlobalInvocationID.xy) * Vec2(REGION_SIZE) * u_oneOverViewportSize;
			
 
				+
			
 
				+	// Get luminance.
			
 
				+	//       l2.z
			
 
				+	// l1.z  l1.w  l2.y
			
 
				+	// l1.x  l1.y
			
 
				+	// l0.z  l0.w  l2.x
			
 
				+	// l0.x  l0.y
			
 
				+	Vec4 l0;
			
 
				+	l0.x = sampleLuma(0, 0);
			
 
				+	l0.y = sampleLuma(1, 0);
			
 
				+	l0.z = sampleLuma(0, 1);
			
 
				+	l0.w = sampleLuma(1, 1);
			
 
				+
			
 
				+	Vec4 l1;
			
 
				+	l1.x = sampleLuma(0, 2);
			
 
				+	l1.y = sampleLuma(1, 2);
			
 
				+	l1.z = sampleLuma(0, 3);
			
 
				+	l1.w = sampleLuma(1, 3);
			
 
				+
			
 
				+	Vec3 l2;
			
 
				+	l2.x = sampleLuma(2, 1);
			
 
				+	l2.y = sampleLuma(2, 3);
			
 
				+	l2.z = sampleLuma(1, 4);
			
 
				+
			
 
				+	// Calculate derivatives.
			
 
				+	Vec4 a = Vec4(l0.y, l2.x, l1.y, l2.y);
			
 
				+	Vec4 b = Vec4(l0.x, l0.w, l1.x, l1.w);
			
 
				+	const Vec4 dx = abs(a - b);
			
 
				+
			
 
				+	a = Vec4(l0.z, l0.w, l1.z, l2.z);
			
 
				+	b = Vec4(l0.x, l0.y, l1.x, l1.w);
			
 
				+	const Vec4 dy = abs(a - b);
			
 
				+
			
 
				+	F32 maxDerivativeX = max(max(dx.x, dx.y), max(dx.z, dx.w));
			
 
				+	F32 maxDerivativeY = max(max(dy.x, dy.y), max(dy.z, dy.w));
			
 
				+	maxDerivativeX = subgroupMax(maxDerivativeX);
			
 
				+	maxDerivativeY = subgroupMax(maxDerivativeY);
			
 
				+
			
 
				+	// Average the luma of this thread's 2x4 region and sum it over the subgroup. Normalized by thread count later.
			
 
				+	const Vec4 sumL0L1 = l0 + l1;
			
 
				+	F32 averageLuma = (sumL0L1.x + sumL0L1.y + sumL0L1.z + sumL0L1.w) / 8.0;
			
 
				+	averageLuma = subgroupAdd(averageLuma);
			
 
				+
			
 
				+#if SHARED_MEMORY
			
 
				+	// Store results in shared memory.
			
 
				+	ANKI_BRANCH if(subgroupElect())
			
 
				+	{
			
 
				+		s_averageLuma[gl_SubgroupID] = averageLuma;
			
 
				+		s_maxDerivative[gl_SubgroupID] = Vec2(maxDerivativeX, maxDerivativeY);
			
 
				+	}
			
 
				 
			
 
				-	// Store luminance
			
 
				-	s_lumaMin[gl_LocalInvocationIndex] = luma;
			
 
				-	s_lumaMax[gl_LocalInvocationIndex] = luma;
			
 
				 	memoryBarrierShared();
			
 
				 	barrier();
			
 
				+#endif
			
 
				 
			
 
				-	// Gather the results into one
			
 
				-	ANKI_LOOP for(U32 s = (WORKGROUP_SIZE.x * WORKGROUP_SIZE.y) / 2u; s > 0u; s >>= 1u)
			
 
				+	// Write the result
			
 
				+	ANKI_BRANCH if(gl_LocalInvocationIndex == 0u)
			
 
				 	{
			
 
				-		if(gl_LocalInvocationIndex < s)
			
 
				+		// Combine the results of all subgroups: sum of the luma and max of the derivatives.
			
 
				+#if SHARED_MEMORY
			
 
				+		averageLuma = s_averageLuma[0];
			
 
				+		Vec2 maxDerivative = s_maxDerivative[0];
			
 
				+
			
 
				+		for(U32 i = 1u; i < gl_NumSubgroups; ++i)
			
 
				 		{
			
 
				-			s_lumaMin[gl_LocalInvocationIndex] =
			
 
				-				min(s_lumaMin[gl_LocalInvocationIndex], s_lumaMin[gl_LocalInvocationIndex + s]);
			
 
				-			s_lumaMax[gl_LocalInvocationIndex] =
			
 
				-				max(s_lumaMax[gl_LocalInvocationIndex], s_lumaMax[gl_LocalInvocationIndex + s]);
			
 
				+			averageLuma += s_averageLuma[i];
			
 
				+			maxDerivative = max(maxDerivative, s_maxDerivative[i]);
			
 
				 		}
			
 
				+#else
			
 
				+		const Vec2 maxDerivative = Vec2(maxDerivativeX, maxDerivativeY);
			
 
				+#endif
			
 
				 
			
 
				-		memoryBarrierShared();
			
 
				-		barrier();
			
 
				-	}
			
 
				+		// Determine shading rate.
			
 
				+		const F32 avgLuma = averageLuma / F32(WORKGROUP_SIZE.x * WORKGROUP_SIZE.y);
			
 
				+		const Vec2 lumaDiff = maxDerivative / avgLuma;
			
 
				+		const F32 threshold1 = u_threshold;
			
 
				+		const F32 threshold2 = threshold1 * 0.4;
			
 
				 
			
 
				-	// Write the result
			
 
				-	ANKI_BRANCH if(gl_LocalInvocationIndex == 0u)
			
 
				-	{
			
 
				-		const F32 diff = s_lumaMax[0] - s_lumaMin[0];
			
 
				-		const F32 maxLumaDiff = 1.0 / 32.0;
			
 
				+		UVec2 rate;
			
 
				+		rate.x = (lumaDiff.x > threshold1) ? 1u : ((lumaDiff.x > threshold2) ? 2u : 4u);
			
 
				+		rate.y = (lumaDiff.y > threshold1) ? 1u : ((lumaDiff.y > threshold2) ? 2u : 4u);
			
 
				 
			
 
				-		const F32 factor = min(1.0, diff / maxLumaDiff);
			
 
				-		const U32 rate = 1u << (2u - U32(factor * 2.0));
			
 
				+		// 1x4 and 4x1 shading rates don't exist.
			
 
				+		if(rate == UVec2(1u, 4u))
			
 
				+		{
			
 
				+			rate = UVec2(1u, 2u);
			
 
				+		}
			
 
				+		else if(rate == UVec2(4u, 1u))
			
 
				+		{
			
 
				+			rate = UVec2(2u, 1u);
			
 
				+		}
			
 
				 
			
 
				-		const UVec2 inputTexelCoord = gl_WorkGroupID.xy;
			
 
				-		imageStore(u_sriImg, IVec2(inputTexelCoord), UVec4(encodeVrsRate(UVec2(rate))));
			
 
				+		const UVec2 outTexelCoord = gl_WorkGroupID.xy;
			
 
				+		imageStore(u_sriImg, IVec2(outTexelCoord), UVec4(encodeVrsRate(rate)));
			
 
				 	}
			
 
				 }
			
--- a/AnKi/Shaders/VrsSriVisualizeRenderTarget.ankiprog
+++ b/AnKi/Shaders/VrsSriVisualizeRenderTarget.ankiprog
@@ -23,15 +23,23 @@ void main()
 
				 
			
 
				 	if(rate == UVec2(1u))
			
 
				 	{
			
 
				-		out_color = Vec3(0.0, 0.0, 1.0);
			
 
				+		out_color = Vec3(1.0, 0.0, 0.0);
			
 
				+	}
			
 
				+	else if(rate == UVec2(2u, 1u) || rate == UVec2(1u, 2u))
			
 
				+	{
			
 
				+		out_color = Vec3(1.0, 0.5, 0.0);
			
 
				 	}
			
 
				 	else if(rate == UVec2(2u))
			
 
				 	{
			
 
				-		out_color = Vec3(0.0, 1.0, 0.0);
			
 
				+		out_color = Vec3(1.0, 1.0, 0.0);
			
 
				+	}
			
 
				+	else if(rate == UVec2(4u, 2u) || rate == UVec2(2u, 4u))
			
 
				+	{
			
 
				+		out_color = Vec3(0.5, 1.0, 0.0);
			
 
				 	}
			
 
				 	else if(rate == UVec2(4u))
			
 
				 	{
			
 
				-		out_color = Vec3(1.0, 0.0, 0.0);
			
 
				+		out_color = Vec3(0.0, 1.0, 0.0);
			
 
				 	}
			
 
				 	else
			
 
				 	{