Browse Source

Merge pull request #102 from godlikepanos/8x8vrs

8x8 VRS
Panagiotis Christopoulos Charitos 3 years ago
parent
commit
080a07e5f4

+ 3 - 3
AnKi/Gr/Common.h

@@ -153,6 +153,9 @@ public:
 	/// Max subgroup size of the GPU.
 	U32 m_maxSubgroupSize = 0;
 
+	/// Min supported size (in framebuffer pixels) of the region that a single texel of the shading rate image covers.
+	U32 m_minShadingRateImageTexelSize = 0;
+
 	/// GPU vendor.
 	GpuVendor m_gpuVendor = GpuVendor::UNKNOWN;
 
@@ -181,9 +184,6 @@ public:
 	Bool m_unalignedBbpTextureFormats = false;
 };
 ANKI_END_PACKED_STRUCT
-static_assert(sizeof(GpuDeviceCapabilities)
-				  == sizeof(PtrSize) * 5 + sizeof(U32) * 7 + sizeof(U8) * 3 + sizeof(Bool) * 6,
-			  "Should be packed");
 
 /// The type of the allocator for heap allocations
 template<typename T>

+ 41 - 8
AnKi/Gr/Vulkan/GrManagerImpl.cpp

@@ -159,7 +159,7 @@ Error GrManagerImpl::initInternal(const GrManagerInitInfo& init)
 	m_occlusionQueryFactory.init(getAllocator(), m_device, VK_QUERY_TYPE_OCCLUSION);
 	m_timestampQueryFactory.init(getAllocator(), m_device, VK_QUERY_TYPE_TIMESTAMP);
 
-	// See if analigned formats are supported
+	// See if unaligned formats are supported
 	{
 		m_capabilities.m_unalignedBbpTextureFormats = true;
 
@@ -1028,16 +1028,49 @@ Error GrManagerImpl::initDevice(const GrManagerInitInfo& init)
 		if(!m_fragmentShadingRateFeatures.attachmentFragmentShadingRate
 		   || !m_fragmentShadingRateFeatures.pipelineFragmentShadingRate)
 		{
-			ANKI_VK_LOGE(VK_KHR_FRAGMENT_SHADING_RATE_EXTENSION_NAME
-						 " doesn't support attachment and/or pipeline rates");
-			return Error::FUNCTION_FAILED;
+			ANKI_VK_LOGW(VK_KHR_FRAGMENT_SHADING_RATE_EXTENSION_NAME
+						 " doesn't support attachment and/or pipeline rates. Will disable VRS");
+			m_capabilities.m_vrs = false;
+		}
+		else
+		{
+			// Only the pipeline and attachment rates are required, don't enable the per-primitive shading rate
+			m_fragmentShadingRateFeatures.primitiveFragmentShadingRate = false;
 		}
 
-		// Disable some things
-		m_fragmentShadingRateFeatures.primitiveFragmentShadingRate = false;
+		if(m_capabilities.m_vrs)
+		{
+			VkPhysicalDeviceFragmentShadingRatePropertiesKHR fragmentShadingRateProperties = {};
+			fragmentShadingRateProperties.sType =
+				VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_FRAGMENT_SHADING_RATE_PROPERTIES_KHR;
+
+			VkPhysicalDeviceProperties2 properties = {};
+			properties.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PROPERTIES_2;
+			properties.pNext = &fragmentShadingRateProperties;
+			vkGetPhysicalDeviceProperties2(m_physicalDevice, &properties);
+
+			if(fragmentShadingRateProperties.minFragmentShadingRateAttachmentTexelSize.width > 16
+			   || fragmentShadingRateProperties.minFragmentShadingRateAttachmentTexelSize.height > 16
+			   || fragmentShadingRateProperties.maxFragmentShadingRateAttachmentTexelSize.width < 8
+			   || fragmentShadingRateProperties.maxFragmentShadingRateAttachmentTexelSize.height < 8)
+			{
+				ANKI_VK_LOGW(VK_KHR_FRAGMENT_SHADING_RATE_EXTENSION_NAME
+							 " doesn't support 8x8 or 16x16 shading rate attachment texel size. Will disable VRS");
+				m_capabilities.m_vrs = false;
+			}
+			else
+			{
+				m_capabilities.m_minShadingRateImageTexelSize =
+					max(fragmentShadingRateProperties.minFragmentShadingRateAttachmentTexelSize.width,
+						fragmentShadingRateProperties.minFragmentShadingRateAttachmentTexelSize.height);
+			}
+		}
 
-		m_fragmentShadingRateFeatures.pNext = const_cast<void*>(ci.pNext);
-		ci.pNext = &m_fragmentShadingRateFeatures;
+		if(m_capabilities.m_vrs)
+		{
+			m_fragmentShadingRateFeatures.pNext = const_cast<void*>(ci.pNext);
+			ci.pNext = &m_fragmentShadingRateFeatures;
+		}
 	}
 
 	ANKI_VK_CHECK(vkCreateDevice(m_physicalDevice, &ci, nullptr, &m_device));

+ 11 - 0
AnKi/Renderer/IndirectDiffuse.cpp

@@ -48,10 +48,15 @@ Error IndirectDiffuse::initInternal()
 	m_rts[1] = m_r->createAndClearRenderTarget(texInit, TextureUsageBit::ALL_SAMPLED);
 
 	// Init VRS SRI generation
+	const Bool enableVrs = getGrManager().getDeviceCapabilities().m_vrs && getConfig().getRVrs() && !preferCompute;
+	if(enableVrs)
 	{
 		m_main.m_fbDescr.m_colorAttachmentCount = 1;
 		m_main.m_fbDescr.bake();
 
+		m_vrs.m_sriTexelDimension = getGrManager().getDeviceCapabilities().m_minShadingRateImageTexelSize;
+		ANKI_ASSERT(m_vrs.m_sriTexelDimension == 8 || m_vrs.m_sriTexelDimension == 16);
+
 		const UVec2 rez = (size + m_vrs.m_sriTexelDimension - 1) / m_vrs.m_sriTexelDimension;
 		m_vrs.m_rtHandle =
 			m_r->create2DRenderTargetDescription(rez.x(), rez.y(), Format::R8_UINT, "IndirectDiffuse VRS SRI");
@@ -69,6 +74,12 @@ Error IndirectDiffuse::initInternal()
 			// need for shared mem
 			variantInit.addMutation("SHARED_MEMORY", 0);
 		}
+		else if(m_vrs.m_sriTexelDimension == 8 && getGrManager().getDeviceCapabilities().m_minSubgroupSize >= 16)
+		{
+			// Algorithm's workgroup size is 16, GPU's subgroup size is min 16 -> each workgroup has 1 subgroup -> No
+			// need for shared mem
+			variantInit.addMutation("SHARED_MEMORY", 0);
+		}
 		else
 		{
 			variantInit.addMutation("SHARED_MEMORY", 1);

+ 8 - 0
AnKi/Renderer/VrsSriGeneration.cpp

@@ -37,6 +37,8 @@ Error VrsSriGeneration::initInternal()
 		return Error::NONE;
 	}
 
+	m_sriTexelDimension = getGrManager().getDeviceCapabilities().m_minShadingRateImageTexelSize;
+	ANKI_ASSERT(m_sriTexelDimension == 8 || m_sriTexelDimension == 16);
 	const UVec2 rez = (m_r->getInternalResolution() + m_sriTexelDimension - 1) / m_sriTexelDimension;
 
 	ANKI_R_LOGV("Intializing VRS SRI generation. SRI resolution %ux%u", rez.x(), rez.y());
@@ -63,6 +65,12 @@ Error VrsSriGeneration::initInternal()
 		// for shared mem
 		variantInit.addMutation("SHARED_MEMORY", 0);
 	}
+	else if(m_sriTexelDimension == 8 && getGrManager().getDeviceCapabilities().m_minSubgroupSize >= 16)
+	{
+		// Algorithm's workgroup size is 16, GPU's subgroup size is min 16 -> each workgroup has 1 subgroup -> No need
+		// for shared mem
+		variantInit.addMutation("SHARED_MEMORY", 0);
+	}
 	else
 	{
 		variantInit.addMutation("SHARED_MEMORY", 1);

+ 1 - 1
AnKi/Renderer/VrsSriGeneration.h

@@ -47,7 +47,7 @@ public:
 	Bool m_sriTexImportedOnce = false;
 	FramebufferDescription m_fbDescr;
 
-	static constexpr U32 m_sriTexelDimension = 16;
+	U32 m_sriTexelDimension = 16;
 
 	class
 	{

+ 28 - 0
AnKi/Shaders/IndirectDiffuseVrsSriGeneration.ankiprog

@@ -13,7 +13,11 @@
 layout(set = 0, binding = 0) uniform texture2D u_inputTex;
 layout(set = 0, binding = 1) uniform sampler u_nearestClampSampler;
 
+#if SRI_TEXEL_DIMENSION == 8
+const UVec2 REGION_SIZE = UVec2(2u, 2u);
+#else
 const UVec2 REGION_SIZE = UVec2(2u, 4u);
+#endif
 
 const UVec2 WORKGROUP_SIZE = UVec2(SRI_TEXEL_DIMENSION) / REGION_SIZE;
 layout(local_size_x = WORKGROUP_SIZE.x, local_size_y = WORKGROUP_SIZE.y, local_size_z = 1) in;
@@ -51,6 +55,28 @@ void main()
 	const Vec2 uv = (Vec2(gl_GlobalInvocationID.xy) * Vec2(REGION_SIZE) + 0.5) * u_oneOverViewportSize;
 	const Vec2 ndc = UV_TO_NDC(uv);
 
+#if SRI_TEXEL_DIMENSION == 8
+	// Get positions
+	// l0.z  l0.w
+	// l0.x  l0.y
+	Vec4 l0;
+	l0.x = sampleViewPositionZ(uv, 0, 0);
+	l0.y = sampleViewPositionZ(uv, 1, 0);
+	l0.z = sampleViewPositionZ(uv, 0, 1);
+	l0.w = sampleViewPositionZ(uv, 1, 1);
+
+	// Calculate derivatives.
+	Vec2 a = Vec2(l0.y, l0.z);
+	Vec2 b = Vec2(l0.x, l0.w);
+	const Vec2 dx = abs(a - b);
+
+	a = Vec2(l0.z, l0.w);
+	b = Vec2(l0.x, l0.y);
+	const Vec2 dy = abs(a - b);
+
+	F32 maxDerivativeX = max(dx.x, dx.y);
+	F32 maxDerivativeY = max(dy.x, dy.y);
+#else
 	// Get positions
 	// l1.z  l1.w
 	// l1.x  l1.y
@@ -79,6 +105,8 @@ void main()
 
 	F32 maxDerivativeX = max(max(dx.x, dx.y), max(dx.z, dx.w));
 	F32 maxDerivativeY = max(max(dy.x, dy.y), max(dy.z, dy.w));
+#endif
+
 	maxDerivativeX = subgroupMax(maxDerivativeX);
 	maxDerivativeY = subgroupMax(maxDerivativeY);
 

+ 42 - 5
AnKi/Shaders/VrsSriGeneration.glsl

@@ -10,12 +10,16 @@
 #include <AnKi/Shaders/TonemappingFunctions.glsl>
 
 // Find the maximum luma derivative in x and y, relative to the average luma of the block.
-// Each thread handles a 2x4 region.
+// Each thread handles a 2x2 region when using 8x8 VRS tiles and a 2x4 region when using 16x16 VRS tiles.
 
 layout(set = 0, binding = 0) uniform ANKI_RP texture2D u_inputTex;
 layout(set = 0, binding = 1) uniform sampler u_nearestClampSampler;
 
+#if SRI_TEXEL_DIMENSION == 8
+const UVec2 REGION_SIZE = UVec2(2u, 2u);
+#else
 const UVec2 REGION_SIZE = UVec2(2u, 4u);
+#endif
 
 const UVec2 WORKGROUP_SIZE = UVec2(SRI_TEXEL_DIMENSION) / REGION_SIZE;
 layout(local_size_x = WORKGROUP_SIZE.x, local_size_y = WORKGROUP_SIZE.y, local_size_z = 1) in;
@@ -44,8 +48,38 @@ shared Vec2 s_maxDerivative[SHARED_MEMORY_ENTRIES];
 
 void main()
 {
-	const Vec2 uv = Vec2(gl_GlobalInvocationID.xy) * Vec2(REGION_SIZE) * u_oneOverViewportSize;
+	const Vec2 uv = (Vec2(gl_GlobalInvocationID.xy) * Vec2(REGION_SIZE) + 0.5) * u_oneOverViewportSize;
 
+#if SRI_TEXEL_DIMENSION == 8
+	// Get luminance.
+	//       l1.y
+	// l0.z  l0.w  l1.x
+	// l0.x  l0.y
+	Vec4 l0;
+	l0.x = sampleLuma(0, 0);
+	l0.y = sampleLuma(1, 0);
+	l0.z = sampleLuma(0, 1);
+	l0.w = sampleLuma(1, 1);
+
+	Vec2 l1;
+	l1.x = sampleLuma(2, 1);
+	l1.y = sampleLuma(1, 2);
+
+	// Calculate derivatives.
+	Vec2 a = Vec2(l0.y, l1.x);
+	Vec2 b = Vec2(l0.x, l0.w);
+	const Vec2 dx = abs(a - b);
+
+	a = Vec2(l0.z, l1.y);
+	b = Vec2(l0.x, l0.w);
+	const Vec2 dy = abs(a - b);
+
+	F32 maxDerivativeX = max(dx.x, dx.y);
+	F32 maxDerivativeY = max(dy.x, dy.y);
+
+	// Calculate average luma.
+	F32 averageLuma = (l0.x + l0.y + l0.z + l0.w) / 4.0;
+#else
 	// Get luminance.
 	//       l2.z
 	// l1.z  l1.w  l2.y
@@ -80,12 +114,15 @@ void main()
 
 	F32 maxDerivativeX = max(max(dx.x, dx.y), max(dx.z, dx.w));
 	F32 maxDerivativeY = max(max(dy.x, dy.y), max(dy.z, dy.w));
-	maxDerivativeX = subgroupMax(maxDerivativeX);
-	maxDerivativeY = subgroupMax(maxDerivativeY);
 
-	// Calculate average luma in block.
+	// Calculate average luma.
 	const Vec4 sumL0L1 = l0 + l1;
 	F32 averageLuma = (sumL0L1.x + sumL0L1.y + sumL0L1.z + sumL0L1.w) / 8.0;
+#endif
+
+	// Share values in subgroup.
+	maxDerivativeX = subgroupMax(maxDerivativeX);
+	maxDerivativeY = subgroupMax(maxDerivativeY);
 	averageLuma = subgroupAdd(averageLuma);
 
 #if SHARED_MEMORY