Browse Source

Merge branch 'master' into material2

Panagiotis Christopoulos Charitos 3 years ago
parent
commit
14c62fd272

+ 7 - 1
AnKi/Gr/Common.h

@@ -147,6 +147,12 @@ public:
 	/// The size of a shader group handle that will be placed inside an SBT record.
 	/// The size of a shader group handle that will be placed inside an SBT record.
 	U32 m_shaderGroupHandleSize = 0;
 	U32 m_shaderGroupHandleSize = 0;
 
 
+	/// Min subgroup size of the GPU.
+	U32 m_minSubgroupSize = 0;
+
+	/// Max subgroup size of the GPU.
+	U32 m_maxSubgroupSize = 0;
+
 	/// GPU vendor.
 	/// GPU vendor.
 	GpuVendor m_gpuVendor = GpuVendor::UNKNOWN;
 	GpuVendor m_gpuVendor = GpuVendor::UNKNOWN;
 
 
@@ -176,7 +182,7 @@ public:
 };
 };
 ANKI_END_PACKED_STRUCT
 ANKI_END_PACKED_STRUCT
 static_assert(sizeof(GpuDeviceCapabilities)
 static_assert(sizeof(GpuDeviceCapabilities)
-				  == sizeof(PtrSize) * 5 + sizeof(U32) * 5 + sizeof(U8) * 3 + sizeof(Bool) * 6,
+				  == sizeof(PtrSize) * 5 + sizeof(U32) * 7 + sizeof(U8) * 3 + sizeof(Bool) * 6,
 			  "Should be packed");
 			  "Should be packed");
 
 
 /// The type of the allocator for heap allocations
 /// The type of the allocator for heap allocations

+ 13 - 0
AnKi/Gr/Vulkan/GrManagerImpl.cpp

@@ -486,22 +486,35 @@ Error GrManagerImpl::initInstance(const GrManagerInitInfo& init)
 	{
 	{
 	case 0x13B5:
 	case 0x13B5:
 		m_capabilities.m_gpuVendor = GpuVendor::ARM;
 		m_capabilities.m_gpuVendor = GpuVendor::ARM;
+		m_capabilities.m_minSubgroupSize = 16;
+		m_capabilities.m_maxSubgroupSize = 16;
 		break;
 		break;
 	case 0x10DE:
 	case 0x10DE:
 		m_capabilities.m_gpuVendor = GpuVendor::NVIDIA;
 		m_capabilities.m_gpuVendor = GpuVendor::NVIDIA;
+		m_capabilities.m_minSubgroupSize = 32;
+		m_capabilities.m_maxSubgroupSize = 32;
 		break;
 		break;
 	case 0x1002:
 	case 0x1002:
 	case 0x1022:
 	case 0x1022:
 		m_capabilities.m_gpuVendor = GpuVendor::AMD;
 		m_capabilities.m_gpuVendor = GpuVendor::AMD;
+		m_capabilities.m_minSubgroupSize = 32;
+		m_capabilities.m_maxSubgroupSize = 64;
 		break;
 		break;
 	case 0x8086:
 	case 0x8086:
 		m_capabilities.m_gpuVendor = GpuVendor::INTEL;
 		m_capabilities.m_gpuVendor = GpuVendor::INTEL;
+		m_capabilities.m_minSubgroupSize = 8;
+		m_capabilities.m_maxSubgroupSize = 32;
 		break;
 		break;
 	case 0x5143:
 	case 0x5143:
 		m_capabilities.m_gpuVendor = GpuVendor::QUALCOMM;
 		m_capabilities.m_gpuVendor = GpuVendor::QUALCOMM;
+		m_capabilities.m_minSubgroupSize = 64;
+		m_capabilities.m_maxSubgroupSize = 128;
 		break;
 		break;
 	default:
 	default:
 		m_capabilities.m_gpuVendor = GpuVendor::UNKNOWN;
 		m_capabilities.m_gpuVendor = GpuVendor::UNKNOWN;
+		// Choose something really low
+		m_capabilities.m_minSubgroupSize = 8;
+		m_capabilities.m_maxSubgroupSize = 8;
 	}
 	}
 	ANKI_VK_LOGI("GPU is %s. Vendor identified as %s", m_devProps.properties.deviceName,
 	ANKI_VK_LOGI("GPU is %s. Vendor identified as %s", m_devProps.properties.deviceName,
 				 &GPU_VENDOR_STR[m_capabilities.m_gpuVendor][0]);
 				 &GPU_VENDOR_STR[m_capabilities.m_gpuVendor][0]);

+ 1 - 0
AnKi/Renderer/ConfigVars.defs.h

@@ -10,6 +10,7 @@ ANKI_CONFIG_VAR_U32(RTileSize, 64, 8, 256, "Tile lighting tile size")
 ANKI_CONFIG_VAR_U32(RZSplitCount, 64, 8, 1024, "Clusterer number of Z splits")
 ANKI_CONFIG_VAR_U32(RZSplitCount, 64, 8, 1024, "Clusterer number of Z splits")
 ANKI_CONFIG_VAR_BOOL(RPreferCompute, !ANKI_PLATFORM_MOBILE, "Prefer compute shaders")
 ANKI_CONFIG_VAR_BOOL(RPreferCompute, !ANKI_PLATFORM_MOBILE, "Prefer compute shaders")
 ANKI_CONFIG_VAR_BOOL(RVrs, true, "Enable VRS in multiple passes")
 ANKI_CONFIG_VAR_BOOL(RVrs, true, "Enable VRS in multiple passes")
+ANKI_CONFIG_VAR_F32(RVrsThreshold, 0.05f, 0.0f, 1.0f, "Threshold under which a lower shading rate will be applied")
 ANKI_CONFIG_VAR_BOOL(RHighQualityHdr, !ANKI_PLATFORM_MOBILE,
 ANKI_CONFIG_VAR_BOOL(RHighQualityHdr, !ANKI_PLATFORM_MOBILE,
 					 "If true use R16G16B16 for HDR images. Alternatively use B10G11R11")
 					 "If true use R16G16B16 for HDR images. Alternatively use B10G11R11")
 
 

+ 18 - 3
AnKi/Renderer/VrsSriGeneration.cpp

@@ -56,6 +56,18 @@ Error VrsSriGeneration::initInternal()
 	ANKI_CHECK(getResourceManager().loadResource("ShaderBinaries/VrsSriGenerationCompute.ankiprogbin", m_prog));
 	ANKI_CHECK(getResourceManager().loadResource("ShaderBinaries/VrsSriGenerationCompute.ankiprogbin", m_prog));
 	ShaderProgramResourceVariantInitInfo variantInit(m_prog);
 	ShaderProgramResourceVariantInitInfo variantInit(m_prog);
 	variantInit.addMutation("SRI_TEXEL_DIMENSION", m_sriTexelDimension);
 	variantInit.addMutation("SRI_TEXEL_DIMENSION", m_sriTexelDimension);
+
+	if(m_sriTexelDimension == 16 && getGrManager().getDeviceCapabilities().m_minSubgroupSize >= 32)
+	{
+		// The algorithm's workgroup size is 32 and the GPU's subgroup size is at least 32 -> each workgroup fits in 1
+		// subgroup -> no need for shared mem
+		variantInit.addMutation("SHARED_MEMORY", 0);
+	}
+	else
+	{
+		variantInit.addMutation("SHARED_MEMORY", 1);
+	}
+
 	const ShaderProgramResourceVariant* variant;
 	const ShaderProgramResourceVariant* variant;
 	m_prog->getOrCreateVariant(variantInit, variant);
 	m_prog->getOrCreateVariant(variantInit, variant);
 	m_grProg = variant->getProgram();
 	m_grProg = variant->getProgram();
@@ -116,10 +128,13 @@ void VrsSriGeneration::populateRenderGraph(RenderingContext& ctx)
 		cmdb->bindShaderProgram(m_grProg);
 		cmdb->bindShaderProgram(m_grProg);
 
 
 		rgraphCtx.bindColorTexture(0, 0, m_r->getTemporalAA().getTonemappedRt());
 		rgraphCtx.bindColorTexture(0, 0, m_r->getTemporalAA().getTonemappedRt());
-		rgraphCtx.bindImage(0, 1, m_runCtx.m_rt);
+		cmdb->bindSampler(0, 1, m_r->getSamplers().m_nearestNearestClamp);
+		rgraphCtx.bindImage(0, 2, m_runCtx.m_rt);
+		const Vec4 pc(1.0f / Vec2(m_r->getInternalResolution()), getConfig().getRVrsThreshold(), 0.0f);
+		cmdb->setPushConstants(&pc, sizeof(pc));
 
 
-		const U32 workgroupSize = m_sriTexelDimension;
-		dispatchPPCompute(cmdb, workgroupSize, workgroupSize, m_r->getInternalResolution().x(),
+		const U32 fakeWorkgroupSizeXorY = m_sriTexelDimension;
+		dispatchPPCompute(cmdb, fakeWorkgroupSizeXorY, fakeWorkgroupSizeXorY, m_r->getInternalResolution().x(),
 						  m_r->getInternalResolution().y());
 						  m_r->getInternalResolution().y());
 	});
 	});
 }
 }

+ 1 - 1
AnKi/Shaders/Fsr.glsl

@@ -14,7 +14,7 @@ layout(set = 0, binding = 1) uniform ANKI_RP texture2D u_tex;
 layout(set = 0, binding = 2) writeonly uniform ANKI_RP image2D u_outImg;
 layout(set = 0, binding = 2) writeonly uniform ANKI_RP image2D u_outImg;
 layout(local_size_x = 8, local_size_y = 8) in;
 layout(local_size_x = 8, local_size_y = 8) in;
 #else
 #else
-layout(location = 0) out Vec3 out_color;
+layout(location = 0) out ANKI_RP Vec3 out_color;
 #endif
 #endif
 
 
 layout(push_constant, std430) uniform b_pc
 layout(push_constant, std430) uniform b_pc

+ 112 - 33
AnKi/Shaders/VrsSriGeneration.glsl

@@ -3,62 +3,141 @@
 // Code licensed under the BSD License.
 // Code licensed under the BSD License.
 // http://www.anki3d.org/LICENSE
 // http://www.anki3d.org/LICENSE
 
 
-#pragma anki mutator SRI_TEXEL_DIMENSION 8 16 32
+#pragma anki mutator SRI_TEXEL_DIMENSION 8 16
+#pragma anki mutator SHARED_MEMORY 0 1
 
 
 #include <AnKi/Shaders/Functions.glsl>
 #include <AnKi/Shaders/Functions.glsl>
 #include <AnKi/Shaders/TonemappingFunctions.glsl>
 #include <AnKi/Shaders/TonemappingFunctions.glsl>
 
 
+// Find the maximum luma derivative in x and y, relative to the average luma of the block.
+// Each thread handles a 2x4 region.
+
 layout(set = 0, binding = 0) uniform ANKI_RP texture2D u_inputTex;
 layout(set = 0, binding = 0) uniform ANKI_RP texture2D u_inputTex;
+layout(set = 0, binding = 1) uniform sampler u_nearestClampSampler;
+
+const UVec2 REGION_SIZE = UVec2(2u, 4u);
 
 
-#if defined(ANKI_COMPUTE_SHADER)
-const UVec2 WORKGROUP_SIZE = UVec2(SRI_TEXEL_DIMENSION, SRI_TEXEL_DIMENSION);
+const UVec2 WORKGROUP_SIZE = UVec2(SRI_TEXEL_DIMENSION) / REGION_SIZE;
 layout(local_size_x = WORKGROUP_SIZE.x, local_size_y = WORKGROUP_SIZE.y, local_size_z = 1) in;
 layout(local_size_x = WORKGROUP_SIZE.x, local_size_y = WORKGROUP_SIZE.y, local_size_z = 1) in;
 
 
-layout(set = 0, binding = 1) uniform writeonly uimage2D u_sriImg;
-#else
-layout(location = 0) out U32 out_shadingRate;
+layout(set = 0, binding = 2) uniform writeonly uimage2D u_sriImg;
+
+layout(push_constant, std430) uniform b_pc
+{
+	Vec2 u_oneOverViewportSize;
+	F32 u_threshold;
+	F32 u_padding0;
+};
+
+#if SHARED_MEMORY
+// Ideally, we'd be able to calculate the max/average using subgroup operations, but there's no guarantee
+// subgroupSize is large enough so we need shared memory as a fallback. We need gl_NumSubgroups entries, but it is not a
+// constant, so estimate it assuming a subgroupSize of at least 8.
+const U32 SHARED_MEMORY_ENTRIES = WORKGROUP_SIZE.x * WORKGROUP_SIZE.y / 8u;
+shared F32 s_averageLuma[SHARED_MEMORY_ENTRIES];
+shared Vec2 s_maxDerivative[SHARED_MEMORY_ENTRIES];
 #endif
 #endif
 
 
-shared F32 s_lumaMin[WORKGROUP_SIZE.y * WORKGROUP_SIZE.x];
-shared F32 s_lumaMax[WORKGROUP_SIZE.y * WORKGROUP_SIZE.x];
+#define sampleLuma(offsetX, offsetY) \
+	computeLuminance( \
+		textureLodOffset(sampler2D(u_inputTex, u_nearestClampSampler), uv, 0.0, IVec2(offsetX, offsetY)).xyz)
 
 
 void main()
 void main()
 {
 {
-	// Get luminance
-	const Vec3 color = texelFetch(u_inputTex, IVec2(gl_GlobalInvocationID.xy), 0).xyz;
-	const F32 luma = computeLuminance(color);
+	const Vec2 uv = Vec2(gl_GlobalInvocationID.xy) * Vec2(REGION_SIZE) * u_oneOverViewportSize;
+
+	// Get luminance.
+	//       l2.z
+	// l1.z  l1.w  l2.y
+	// l1.x  l1.y
+	// l0.z  l0.w  l2.x
+	// l0.x  l0.y
+	Vec4 l0;
+	l0.x = sampleLuma(0, 0);
+	l0.y = sampleLuma(1, 0);
+	l0.z = sampleLuma(0, 1);
+	l0.w = sampleLuma(1, 1);
+
+	Vec4 l1;
+	l1.x = sampleLuma(0, 2);
+	l1.y = sampleLuma(1, 2);
+	l1.z = sampleLuma(0, 3);
+	l1.w = sampleLuma(1, 3);
+
+	Vec3 l2;
+	l2.x = sampleLuma(2, 1);
+	l2.y = sampleLuma(2, 3);
+	l2.z = sampleLuma(1, 4);
+
+	// Calculate derivatives.
+	Vec4 a = Vec4(l0.y, l2.x, l1.y, l2.y);
+	Vec4 b = Vec4(l0.x, l0.w, l1.x, l1.w);
+	const Vec4 dx = abs(a - b);
+
+	a = Vec4(l0.z, l0.w, l1.z, l2.z);
+	b = Vec4(l0.x, l0.y, l1.x, l1.w);
+	const Vec4 dy = abs(a - b);
+
+	F32 maxDerivativeX = max(max(dx.x, dx.y), max(dx.z, dx.w));
+	F32 maxDerivativeY = max(max(dy.x, dy.y), max(dy.z, dy.w));
+	maxDerivativeX = subgroupMax(maxDerivativeX);
+	maxDerivativeY = subgroupMax(maxDerivativeY);
+
+	// Calculate average luma in block.
+	const Vec4 sumL0L1 = l0 + l1;
+	F32 averageLuma = (sumL0L1.x + sumL0L1.y + sumL0L1.z + sumL0L1.w) / 8.0;
+	averageLuma = subgroupAdd(averageLuma);
+
+#if SHARED_MEMORY
+	// Store results in shared memory.
+	ANKI_BRANCH if(subgroupElect())
+	{
+		s_averageLuma[gl_SubgroupID] = averageLuma;
+		s_maxDerivative[gl_SubgroupID] = Vec2(maxDerivativeX, maxDerivativeY);
+	}
 
 
-	// Store luminance
-	s_lumaMin[gl_LocalInvocationIndex] = luma;
-	s_lumaMax[gl_LocalInvocationIndex] = luma;
 	memoryBarrierShared();
 	memoryBarrierShared();
 	barrier();
 	barrier();
+#endif
 
 
-	// Gather the results into one
-	ANKI_LOOP for(U32 s = (WORKGROUP_SIZE.x * WORKGROUP_SIZE.y) / 2u; s > 0u; s >>= 1u)
+	// Write the result
+	ANKI_BRANCH if(gl_LocalInvocationIndex == 0u)
 	{
 	{
-		if(gl_LocalInvocationIndex < s)
+		// Combine the per-subgroup results: sum the average luma and take the max derivative.
+#if SHARED_MEMORY
+		averageLuma = s_averageLuma[0];
+		Vec2 maxDerivative = s_maxDerivative[0];
+
+		for(U32 i = 1u; i < gl_NumSubgroups; ++i)
 		{
 		{
-			s_lumaMin[gl_LocalInvocationIndex] =
-				min(s_lumaMin[gl_LocalInvocationIndex], s_lumaMin[gl_LocalInvocationIndex + s]);
-			s_lumaMax[gl_LocalInvocationIndex] =
-				max(s_lumaMax[gl_LocalInvocationIndex], s_lumaMax[gl_LocalInvocationIndex + s]);
+			averageLuma += s_averageLuma[i];
+			maxDerivative = max(maxDerivative, s_maxDerivative[i]);
 		}
 		}
+#else
+		const Vec2 maxDerivative = Vec2(maxDerivativeX, maxDerivativeY);
+#endif
 
 
-		memoryBarrierShared();
-		barrier();
-	}
+		// Determine shading rate.
+		const F32 avgLuma = averageLuma / F32(WORKGROUP_SIZE.x * WORKGROUP_SIZE.y);
+		const Vec2 lumaDiff = maxDerivative / avgLuma;
+		const F32 threshold1 = u_threshold;
+		const F32 threshold2 = threshold1 * 0.4;
 
 
-	// Write the result
-	ANKI_BRANCH if(gl_LocalInvocationIndex == 0u)
-	{
-		const F32 diff = s_lumaMax[0] - s_lumaMin[0];
-		const F32 maxLumaDiff = 1.0 / 32.0;
+		UVec2 rate;
+		rate.x = (lumaDiff.x > threshold1) ? 1u : ((lumaDiff.x > threshold2) ? 2u : 4u);
+		rate.y = (lumaDiff.y > threshold1) ? 1u : ((lumaDiff.y > threshold2) ? 2u : 4u);
 
 
-		const F32 factor = min(1.0, diff / maxLumaDiff);
-		const U32 rate = 1u << (2u - U32(factor * 2.0));
+		// 1x4 and 4x1 shading rates don't exist.
+		if(rate == UVec2(1u, 4u))
+		{
+			rate = UVec2(1u, 2u);
+		}
+		else if(rate == UVec2(4u, 1u))
+		{
+			rate = UVec2(2u, 1u);
+		}
 
 
-		const UVec2 inputTexelCoord = gl_WorkGroupID.xy;
-		imageStore(u_sriImg, IVec2(inputTexelCoord), UVec4(encodeVrsRate(UVec2(rate))));
+		const UVec2 outTexelCoord = gl_WorkGroupID.xy;
+		imageStore(u_sriImg, IVec2(outTexelCoord), UVec4(encodeVrsRate(rate)));
 	}
 	}
 }
 }

+ 11 - 3
AnKi/Shaders/VrsSriVisualizeRenderTarget.ankiprog

@@ -23,15 +23,23 @@ void main()
 
 
 	if(rate == UVec2(1u))
 	if(rate == UVec2(1u))
 	{
 	{
-		out_color = Vec3(0.0, 0.0, 1.0);
+		out_color = Vec3(1.0, 0.0, 0.0);
+	}
+	else if(rate == UVec2(2u, 1u) || rate == UVec2(1u, 2u))
+	{
+		out_color = Vec3(1.0, 0.5, 0.0);
 	}
 	}
 	else if(rate == UVec2(2u))
 	else if(rate == UVec2(2u))
 	{
 	{
-		out_color = Vec3(0.0, 1.0, 0.0);
+		out_color = Vec3(1.0, 1.0, 0.0);
+	}
+	else if(rate == UVec2(4u, 2u) || rate == UVec2(2u, 4u))
+	{
+		out_color = Vec3(0.5, 1.0, 0.0);
 	}
 	}
 	else if(rate == UVec2(4u))
 	else if(rate == UVec2(4u))
 	{
 	{
-		out_color = Vec3(1.0, 0.0, 0.0);
+		out_color = Vec3(0.0, 1.0, 0.0);
 	}
 	}
 	else
 	else
 	{
 	{

+ 5 - 3
CMakeLists.txt

@@ -78,10 +78,12 @@ endif()
 set(X86 FALSE)
 set(X86 FALSE)
 set(ARM FALSE)
 set(ARM FALSE)
 if(GCC OR CLANG)
 if(GCC OR CLANG)
-	if(${CMAKE_SYSTEM_PROCESSOR} MATCHES "x86")
-		set(X86 TRUE)
-	elseif(${CMAKE_SYSTEM_PROCESSOR} MATCHES "aarch")
+	execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpmachine OUTPUT_VARIABLE target_arch)
+
+	if(${CMAKE_SYSTEM_PROCESSOR} MATCHES "aarch" OR ${target_arch} MATCHES "aarch")
 		set(ARM TRUE)
 		set(ARM TRUE)
+	elseif(${CMAKE_SYSTEM_PROCESSOR} MATCHES "x86")
+		set(X86 TRUE)
 	else()
 	else()
 		message(FATAL_ERROR "Couldn't find the target architecture from: ${target_arch}")
 		message(FATAL_ERROR "Couldn't find the target architecture from: ${target_arch}")
 	endif()
 	endif()

+ 1 - 1
LICENSE

@@ -1,5 +1,5 @@
 AnKi 3D Engine
 AnKi 3D Engine
-Copyright (c) 2009-2022 Panagiotis Christopoulos Charitos.
+Copyright (c) 2009-2022 Panagiotis Christopoulos Charitos and contributors.
 All rights reserved.
 All rights reserved.
 
 
 Redistribution and use in source and binary forms, with or without
 Redistribution and use in source and binary forms, with or without