Browse Source

Minor optimizations and cleanup on VRS SRI generation

Panagiotis Christopoulos Charitos 3 years ago
parent
commit
edcffe1d52

+ 7 - 1
AnKi/Gr/Common.h

@@ -144,6 +144,12 @@ public:
 	/// The size of a shader group handle that will be placed inside an SBT record.
 	/// The size of a shader group handle that will be placed inside an SBT record.
 	U32 m_shaderGroupHandleSize = 0;
 	U32 m_shaderGroupHandleSize = 0;
 
 
+	/// Min subgroup size of the GPU.
+	U32 m_minSubgroupSize = 0;
+
+	/// Max subgroup size of the GPU.
+	U32 m_maxSubgroupSize = 0;
+
 	/// GPU vendor.
 	/// GPU vendor.
 	GpuVendor m_gpuVendor = GpuVendor::UNKNOWN;
 	GpuVendor m_gpuVendor = GpuVendor::UNKNOWN;
 
 
@@ -173,7 +179,7 @@ public:
 };
 };
 ANKI_END_PACKED_STRUCT
 ANKI_END_PACKED_STRUCT
 static_assert(sizeof(GpuDeviceCapabilities)
 static_assert(sizeof(GpuDeviceCapabilities)
-				  == sizeof(PtrSize) * 4 + sizeof(U32) * 5 + sizeof(U8) * 3 + sizeof(Bool) * 6,
+				  == sizeof(PtrSize) * 4 + sizeof(U32) * 7 + sizeof(U8) * 3 + sizeof(Bool) * 6,
 			  "Should be packed");
 			  "Should be packed");
 
 
 /// The type of the allocator for heap allocations
 /// The type of the allocator for heap allocations

+ 13 - 0
AnKi/Gr/Vulkan/GrManagerImpl.cpp

@@ -480,22 +480,35 @@ Error GrManagerImpl::initInstance(const GrManagerInitInfo& init)
 	{
 	{
 	case 0x13B5:
 	case 0x13B5:
 		m_capabilities.m_gpuVendor = GpuVendor::ARM;
 		m_capabilities.m_gpuVendor = GpuVendor::ARM;
+		m_capabilities.m_minSubgroupSize = 16;
+		m_capabilities.m_maxSubgroupSize = 16;
 		break;
 		break;
 	case 0x10DE:
 	case 0x10DE:
 		m_capabilities.m_gpuVendor = GpuVendor::NVIDIA;
 		m_capabilities.m_gpuVendor = GpuVendor::NVIDIA;
+		m_capabilities.m_minSubgroupSize = 32;
+		m_capabilities.m_maxSubgroupSize = 32;
 		break;
 		break;
 	case 0x1002:
 	case 0x1002:
 	case 0x1022:
 	case 0x1022:
 		m_capabilities.m_gpuVendor = GpuVendor::AMD;
 		m_capabilities.m_gpuVendor = GpuVendor::AMD;
+		m_capabilities.m_minSubgroupSize = 32;
+		m_capabilities.m_maxSubgroupSize = 64;
 		break;
 		break;
 	case 0x8086:
 	case 0x8086:
 		m_capabilities.m_gpuVendor = GpuVendor::INTEL;
 		m_capabilities.m_gpuVendor = GpuVendor::INTEL;
+		m_capabilities.m_minSubgroupSize = 8;
+		m_capabilities.m_maxSubgroupSize = 32;
 		break;
 		break;
 	case 0x5143:
 	case 0x5143:
 		m_capabilities.m_gpuVendor = GpuVendor::QUALCOMM;
 		m_capabilities.m_gpuVendor = GpuVendor::QUALCOMM;
+		m_capabilities.m_minSubgroupSize = 64;
+		m_capabilities.m_maxSubgroupSize = 128;
 		break;
 		break;
 	default:
 	default:
 		m_capabilities.m_gpuVendor = GpuVendor::UNKNOWN;
 		m_capabilities.m_gpuVendor = GpuVendor::UNKNOWN;
+		// Unknown vendor: assume a conservatively small subgroup size
+		m_capabilities.m_minSubgroupSize = 8;
+		m_capabilities.m_maxSubgroupSize = 8;
 	}
 	}
 	ANKI_VK_LOGI("GPU is %s. Vendor identified as %s", m_devProps.properties.deviceName,
 	ANKI_VK_LOGI("GPU is %s. Vendor identified as %s", m_devProps.properties.deviceName,
 				 &GPU_VENDOR_STR[m_capabilities.m_gpuVendor][0]);
 				 &GPU_VENDOR_STR[m_capabilities.m_gpuVendor][0]);

+ 17 - 4
AnKi/Renderer/VrsSriGeneration.cpp

@@ -57,6 +57,18 @@ Error VrsSriGeneration::initInternal()
 	ANKI_CHECK(getResourceManager().loadResource("AnKi/Shaders/VrsSriGenerationCompute.ankiprog", m_prog));
 	ANKI_CHECK(getResourceManager().loadResource("AnKi/Shaders/VrsSriGenerationCompute.ankiprog", m_prog));
 	ShaderProgramResourceVariantInitInfo variantInit(m_prog);
 	ShaderProgramResourceVariantInitInfo variantInit(m_prog);
 	variantInit.addMutation("SRI_TEXEL_DIMENSION", m_sriTexelDimension);
 	variantInit.addMutation("SRI_TEXEL_DIMENSION", m_sriTexelDimension);
+
+	if(m_sriTexelDimension == 16 && getGrManager().getDeviceCapabilities().m_minSubgroupSize >= 32)
+	{
+		// A 16x16 SRI texel gives an 8x4 = 32 thread workgroup. If the GPU's min subgroup size is >= 32, each
+		// workgroup fits in 1 subgroup -> No need for shared mem
+		variantInit.addMutation("SHARED_MEMORY", 0);
+	}
+	else
+	{
+		variantInit.addMutation("SHARED_MEMORY", 1);
+	}
+
 	const ShaderProgramResourceVariant* variant;
 	const ShaderProgramResourceVariant* variant;
 	m_prog->getOrCreateVariant(variantInit, variant);
 	m_prog->getOrCreateVariant(variantInit, variant);
 	m_grProg = variant->getProgram();
 	m_grProg = variant->getProgram();
@@ -116,12 +128,13 @@ void VrsSriGeneration::populateRenderGraph(RenderingContext& ctx)
 		cmdb->bindShaderProgram(m_grProg);
 		cmdb->bindShaderProgram(m_grProg);
 
 
 		rgraphCtx.bindColorTexture(0, 0, m_r->getTemporalAA().getTonemappedRt());
 		rgraphCtx.bindColorTexture(0, 0, m_r->getTemporalAA().getTonemappedRt());
-		rgraphCtx.bindImage(0, 1, m_runCtx.m_rt);
-		Vec4 pc(getConfig().getRVrsThreshold());
+		cmdb->bindSampler(0, 1, m_r->getSamplers().m_nearestNearestClamp);
+		rgraphCtx.bindImage(0, 2, m_runCtx.m_rt);
+		const Vec4 pc(1.0f / Vec2(m_r->getInternalResolution()), getConfig().getRVrsThreshold(), 0.0f);
 		cmdb->setPushConstants(&pc, sizeof(pc));
 		cmdb->setPushConstants(&pc, sizeof(pc));
 
 
-		const U32 workgroupSize = m_sriTexelDimension;
-		dispatchPPCompute(cmdb, workgroupSize, workgroupSize, m_r->getInternalResolution().x(),
+		const U32 fakeWorkgroupSizeXorY = m_sriTexelDimension;
+		dispatchPPCompute(cmdb, fakeWorkgroupSizeXorY, fakeWorkgroupSizeXorY, m_r->getInternalResolution().x(),
 						  m_r->getInternalResolution().y());
 						  m_r->getInternalResolution().y());
 	});
 	});
 }
 }

+ 43 - 35
AnKi/Shaders/VrsSriGeneration.glsl

@@ -3,7 +3,8 @@
 // Code licensed under the BSD License.
 // Code licensed under the BSD License.
 // http://www.anki3d.org/LICENSE
 // http://www.anki3d.org/LICENSE
 
 
-#pragma anki mutator SRI_TEXEL_DIMENSION 8 16 32
+#pragma anki mutator SRI_TEXEL_DIMENSION 8 16
+#pragma anki mutator SHARED_MEMORY 0 1
 
 
 #include <AnKi/Shaders/Functions.glsl>
 #include <AnKi/Shaders/Functions.glsl>
 #include <AnKi/Shaders/TonemappingFunctions.glsl>
 #include <AnKi/Shaders/TonemappingFunctions.glsl>
@@ -12,61 +13,61 @@
 // Each thread handles a 2x4 region.
 // Each thread handles a 2x4 region.
 
 
 layout(set = 0, binding = 0) uniform ANKI_RP texture2D u_inputTex;
 layout(set = 0, binding = 0) uniform ANKI_RP texture2D u_inputTex;
+layout(set = 0, binding = 1) uniform sampler u_nearestClampSampler;
 
 
-const UVec2 WORKGROUP_SIZE = UVec2(SRI_TEXEL_DIMENSION / 2, SRI_TEXEL_DIMENSION / 4);
+const UVec2 REGION_SIZE = UVec2(2u, 4u);
+
+const UVec2 WORKGROUP_SIZE = UVec2(SRI_TEXEL_DIMENSION) / REGION_SIZE;
 layout(local_size_x = WORKGROUP_SIZE.x, local_size_y = WORKGROUP_SIZE.y, local_size_z = 1) in;
 layout(local_size_x = WORKGROUP_SIZE.x, local_size_y = WORKGROUP_SIZE.y, local_size_z = 1) in;
 
 
-layout(set = 0, binding = 1) uniform writeonly uimage2D u_sriImg;
+layout(set = 0, binding = 2) uniform writeonly uimage2D u_sriImg;
 
 
 layout(push_constant, std430) uniform b_pc
 layout(push_constant, std430) uniform b_pc
 {
 {
+	Vec2 u_oneOverViewportSize;
 	F32 u_threshold;
 	F32 u_threshold;
 	F32 u_padding0;
 	F32 u_padding0;
-	F32 u_padding1;
-	F32 u_padding2;
 };
 };
 
 
-// Ideally, we'd be able to calculate the min/max/average using subgroup operations,
-// but there's no guarantee subgroupSize is large enough so we need shared memory as a fallback.
-// We need gl_NumSubgroups entries, but it is not a constant, so estimate it assuming a subgroupSize of at least 8.
+#if SHARED_MEMORY
+// Ideally, we'd be able to calculate the min/max/average using subgroup operations, but there's no guarantee
+// subgroupSize is large enough so we need shared memory as a fallback. We need gl_NumSubgroups entries, but it is not a
+// constant, so estimate it assuming a subgroupSize of at least 8.
 const U32 SHARED_MEMORY_ENTRIES = WORKGROUP_SIZE.x * WORKGROUP_SIZE.y / 8u;
 const U32 SHARED_MEMORY_ENTRIES = WORKGROUP_SIZE.x * WORKGROUP_SIZE.y / 8u;
 shared F32 s_averageLuma[SHARED_MEMORY_ENTRIES];
 shared F32 s_averageLuma[SHARED_MEMORY_ENTRIES];
 shared Vec2 s_maxDerivative[SHARED_MEMORY_ENTRIES];
 shared Vec2 s_maxDerivative[SHARED_MEMORY_ENTRIES];
+#endif
 
 
-F32 sampleLuma(IVec2 location, IVec2 maxLocation)
-{
-	const Vec3 color = texelFetch(u_inputTex, min(location, maxLocation), 0).xyz;
-	return computeLuminance(color);
-}
+#define sampleLuma(offsetX, offsetY) \
+	computeLuminance( \
+		textureLodOffset(sampler2D(u_inputTex, u_nearestClampSampler), uv, 0.0, IVec2(offsetX, offsetY)).xyz)
 
 
 void main()
 void main()
 {
 {
-	const IVec2 blockLocation = IVec2(gl_GlobalInvocationID.xy) * IVec2(2, 4);
-	const IVec2 maxLocation = textureSize(u_inputTex, 0) - IVec2(1, 1);
+	const Vec2 uv = Vec2(gl_GlobalInvocationID.xy) * Vec2(REGION_SIZE) * u_oneOverViewportSize;
 
 
 	// Get luminance.
 	// Get luminance.
-	// l0.x  l0.y
-	// l0.z  l0.w  l2.x
-	// l1.x  l1.y
-	// l1.z  l1.w  l2.y
 	//       l2.z
 	//       l2.z
-
+	// l1.z  l1.w  l2.y
+	// l1.x  l1.y
+	// l0.z  l0.w  l2.x
+	// l0.x  l0.y
 	Vec4 l0;
 	Vec4 l0;
-	l0.x = sampleLuma(blockLocation + IVec2(0, 0), maxLocation);
-	l0.y = sampleLuma(blockLocation + IVec2(1, 0), maxLocation);
-	l0.z = sampleLuma(blockLocation + IVec2(0, 1), maxLocation);
-	l0.w = sampleLuma(blockLocation + IVec2(1, 1), maxLocation);
+	l0.x = sampleLuma(0, 0);
+	l0.y = sampleLuma(1, 0);
+	l0.z = sampleLuma(0, 1);
+	l0.w = sampleLuma(1, 1);
 
 
 	Vec4 l1;
 	Vec4 l1;
-	l1.x = sampleLuma(blockLocation + IVec2(0, 2), maxLocation);
-	l1.y = sampleLuma(blockLocation + IVec2(1, 2), maxLocation);
-	l1.z = sampleLuma(blockLocation + IVec2(0, 3), maxLocation);
-	l1.w = sampleLuma(blockLocation + IVec2(1, 3), maxLocation);
+	l1.x = sampleLuma(0, 2);
+	l1.y = sampleLuma(1, 2);
+	l1.z = sampleLuma(0, 3);
+	l1.w = sampleLuma(1, 3);
 
 
 	Vec3 l2;
 	Vec3 l2;
-	l2.x = sampleLuma(blockLocation + IVec2(2, 1), maxLocation);
-	l2.y = sampleLuma(blockLocation + IVec2(2, 3), maxLocation);
-	l2.z = sampleLuma(blockLocation + IVec2(1, 4), maxLocation);
+	l2.x = sampleLuma(2, 1);
+	l2.y = sampleLuma(2, 3);
+	l2.z = sampleLuma(1, 4);
 
 
 	// Calculate derivatives.
 	// Calculate derivatives.
 	Vec4 a = Vec4(l0.y, l2.x, l1.y, l2.y);
 	Vec4 a = Vec4(l0.y, l2.x, l1.y, l2.y);
@@ -87,6 +88,7 @@ void main()
 	F32 averageLuma = (sumL0L1.x + sumL0L1.y + sumL0L1.z + sumL0L1.w) / 8.0;
 	F32 averageLuma = (sumL0L1.x + sumL0L1.y + sumL0L1.z + sumL0L1.w) / 8.0;
 	averageLuma = subgroupAdd(averageLuma);
 	averageLuma = subgroupAdd(averageLuma);
 
 
+#if SHARED_MEMORY
 	// Store results in shared memory.
 	// Store results in shared memory.
 	ANKI_BRANCH if(subgroupElect())
 	ANKI_BRANCH if(subgroupElect())
 	{
 	{
@@ -96,18 +98,24 @@ void main()
 
 
 	memoryBarrierShared();
 	memoryBarrierShared();
 	barrier();
 	barrier();
+#endif
 
 
 	// Write the result
 	// Write the result
 	ANKI_BRANCH if(gl_LocalInvocationIndex == 0u)
 	ANKI_BRANCH if(gl_LocalInvocationIndex == 0u)
 	{
 	{
 		// Get max across all subgroups.
 		// Get max across all subgroups.
+#if SHARED_MEMORY
 		averageLuma = s_averageLuma[0];
 		averageLuma = s_averageLuma[0];
 		Vec2 maxDerivative = s_maxDerivative[0];
 		Vec2 maxDerivative = s_maxDerivative[0];
+
 		for(U32 i = 1u; i < gl_NumSubgroups; ++i)
 		for(U32 i = 1u; i < gl_NumSubgroups; ++i)
 		{
 		{
 			averageLuma += s_averageLuma[i];
 			averageLuma += s_averageLuma[i];
 			maxDerivative = max(maxDerivative, s_maxDerivative[i]);
 			maxDerivative = max(maxDerivative, s_maxDerivative[i]);
 		}
 		}
+#else
+		const Vec2 maxDerivative = Vec2(maxDerivativeX, maxDerivativeY);
+#endif
 
 
 		// Determine shading rate.
 		// Determine shading rate.
 		const F32 avgLuma = averageLuma / F32(WORKGROUP_SIZE.x * WORKGROUP_SIZE.y);
 		const F32 avgLuma = averageLuma / F32(WORKGROUP_SIZE.x * WORKGROUP_SIZE.y);
@@ -116,8 +124,8 @@ void main()
 		const F32 threshold2 = threshold1 * 0.4;
 		const F32 threshold2 = threshold1 * 0.4;
 
 
 		UVec2 rate;
 		UVec2 rate;
-		rate.x = lumaDiff.x > threshold1 ? 1u : (lumaDiff.x > threshold2 ? 2u : 4u);
-		rate.y = lumaDiff.y > threshold1 ? 1u : (lumaDiff.y > threshold2 ? 2u : 4u);
+		rate.x = (lumaDiff.x > threshold1) ? 1u : ((lumaDiff.x > threshold2) ? 2u : 4u);
+		rate.y = (lumaDiff.y > threshold1) ? 1u : ((lumaDiff.y > threshold2) ? 2u : 4u);
 
 
 		// 1x4 and 4x1 shading rates don't exist.
 		// 1x4 and 4x1 shading rates don't exist.
 		if(rate == UVec2(1u, 4u))
 		if(rate == UVec2(1u, 4u))
@@ -129,7 +137,7 @@ void main()
 			rate = UVec2(2u, 1u);
 			rate = UVec2(2u, 1u);
 		}
 		}
 
 
-		const UVec2 inputTexelCoord = gl_WorkGroupID.xy;
-		imageStore(u_sriImg, IVec2(inputTexelCoord), UVec4(encodeVrsRate(rate)));
+		const UVec2 outTexelCoord = gl_WorkGroupID.xy;
+		imageStore(u_sriImg, IVec2(outTexelCoord), UVec4(encodeVrsRate(rate)));
 	}
 	}
 }
 }