Browse Source

Subgroup optimizations on IrradianceDice

Panagiotis Christopoulos Charitos 3 years ago
parent
commit
f9fb59b2c4

+ 4 - 1
AnKi/Gr/Common.h

@@ -138,6 +138,9 @@ public:
 	/// Max push constant size.
 	PtrSize m_pushConstantsSize = 128;
 
+	/// The max combined size of shared variables (with paddings) in compute shaders.
+	PtrSize m_computeSharedMemorySize = 16_KB;
+
 	/// Each SBT record should be a multiple of this.
 	U32 m_sbtRecordAlignment = MAX_U32;
 
@@ -173,7 +176,7 @@ public:
 };
 ANKI_END_PACKED_STRUCT
 static_assert(sizeof(GpuDeviceCapabilities)
-				  == sizeof(PtrSize) * 4 + sizeof(U32) * 5 + sizeof(U8) * 3 + sizeof(Bool) * 6,
+				  == sizeof(PtrSize) * 5 + sizeof(U32) * 5 + sizeof(U8) * 3 + sizeof(Bool) * 6,
 			  "Should be packed");
 
 /// The type of the allocator for heap allocations

+ 1 - 0
AnKi/Gr/Vulkan/GrManagerImpl.cpp

@@ -514,6 +514,7 @@ Error GrManagerImpl::initInstance(const GrManagerInitInfo& init)
 	m_capabilities.m_textureBufferBindOffsetAlignment =
 		max<U32>(ANKI_SAFE_ALIGNMENT, U32(m_devProps.properties.limits.minTexelBufferOffsetAlignment));
 	m_capabilities.m_textureBufferMaxRange = MAX_U32;
+	m_capabilities.m_computeSharedMemorySize = m_devProps.properties.limits.maxComputeSharedMemorySize;
 
 	m_capabilities.m_majorApiVersion = vulkanMajor;
 	m_capabilities.m_minorApiVersion = vulkanMinor;

+ 10 - 5
AnKi/Renderer/ConfigVars.defs.h

@@ -31,12 +31,20 @@ ANKI_CONFIG_VAR_U32(RSsrDepthLod, 2, 0, 1000, "Texture LOD of the depth texture
 ANKI_CONFIG_VAR_U32(RSsrMaxSteps, 64, 1, 256, "Max SSR raymarching steps")
 ANKI_CONFIG_VAR_BOOL(RSsrStochastic, false, "Stochastic reflections")
 
+// GI probes
+ANKI_CONFIG_VAR_U32(RIndirectDiffuseProbeTileResolution, (ANKI_PLATFORM_MOBILE) ? 16 : 32, 8, 32, "GI tile resolution")
+ANKI_CONFIG_VAR_U32(RIndirectDiffuseProbeShadowMapResolution, 128, 4, 2048, "GI shadowmap resolution")
+ANKI_CONFIG_VAR_U32(RIndirectDiffuseProbeMaxCachedProbes, 16, 4, 2048, "Max cached probes")
+ANKI_CONFIG_VAR_U32(RIndirectDiffuseProbeMaxVisibleProbes, 8, 1, 256, "Max visible GI probes")
+
+// GI
 ANKI_CONFIG_VAR_U32(RIndirectDiffuseSsgiSampleCount, 8, 1, 1024, "SSGI sample count")
 ANKI_CONFIG_VAR_F32(RIndirectDiffuseSsgiRadius, 2.0f, 0.1f, 100.0f, "SSGI radius in meters")
 ANKI_CONFIG_VAR_U32(RIndirectDiffuseDenoiseSampleCount, 4, 1, 128, "Indirect diffuse denoise sample count")
 ANKI_CONFIG_VAR_F32(RIndirectDiffuseSsaoStrength, 2.5f, 0.1f, 10.0f, "SSAO strength")
 ANKI_CONFIG_VAR_F32(RIndirectDiffuseSsaoBias, -0.1f, -10.0f, 10.0f, "SSAO bias")
 
+// Shadows
 ANKI_CONFIG_VAR_U32(RShadowMappingTileResolution, (ANKI_PLATFORM_MOBILE) ? 128 : 512, 16, 2048,
 					"Shadowmapping tile resolution")
 ANKI_CONFIG_VAR_U32(RShadowMappingTileCountPerRowOrColumn, 16, 1, 256,
@@ -45,19 +53,16 @@ ANKI_CONFIG_VAR_U32(RShadowMappingScratchTileCountX, 4 * (MAX_SHADOW_CASCADES2 +
 					"Number of tiles of the scratch buffer in X")
 ANKI_CONFIG_VAR_U32(RShadowMappingScratchTileCountY, 4, 1, 256, "Number of tiles of the scratch buffer in Y")
 
+// Probe reflections
 ANKI_CONFIG_VAR_U32(RProbeReflectionResolution, 128, 4, 2048, "Reflection probe face resolution")
 ANKI_CONFIG_VAR_U32(RProbeReflectionIrradianceResolution, 16, 4, 2048, "Reflection probe irradiance resolution")
 ANKI_CONFIG_VAR_U32(RProbeRefectionMaxCachedProbes, 32, 4, 256, "Max cached number of reflection probes")
 ANKI_CONFIG_VAR_U32(RProbeReflectionShadowMapResolution, 64, 4, 2048, "Reflection probe shadow resolution")
 
+// Lens flare
 ANKI_CONFIG_VAR_U8(RLensFlareMaxSpritesPerFlare, 8, 4, 255, "Max sprites per lens flare")
 ANKI_CONFIG_VAR_U8(RLensFlareMaxFlares, 16, 8, 255, "Max flare count")
 
-ANKI_CONFIG_VAR_U32(RGiTileResolution, (ANKI_PLATFORM_MOBILE) ? 16 : 32, 4, 2048, "GI tile resolution")
-ANKI_CONFIG_VAR_U32(RGiShadowMapResolution, 128, 4, 2048, "GI shadowmap resolution")
-ANKI_CONFIG_VAR_U32(RGiMaxCachedProbes, 16, 4, 2048, "Max cached probes")
-ANKI_CONFIG_VAR_U32(RGiMaxVisibleProbes, 8, 1, 256, "Max visible GI probes")
-
 ANKI_CONFIG_VAR_U32(RMotionBlurSamples, 32, 1, 2048, "Max motion blur samples")
 
 ANKI_CONFIG_VAR_BOOL(RDbgEnabled, false, "Enable or not debugging")

+ 7 - 9
AnKi/Renderer/IndirectDiffuseProbes.cpp

@@ -108,9 +108,9 @@ Error IndirectDiffuseProbes::init()
 
 Error IndirectDiffuseProbes::initInternal()
 {
-	m_tileSize = getConfig().getRGiTileResolution();
-	m_cacheEntries.create(getAllocator(), getConfig().getRGiMaxCachedProbes());
-	m_maxVisibleProbes = getConfig().getRGiMaxVisibleProbes();
+	m_tileSize = getConfig().getRIndirectDiffuseProbeTileResolution();
+	m_cacheEntries.create(getAllocator(), getConfig().getRIndirectDiffuseProbeMaxCachedProbes());
+	m_maxVisibleProbes = getConfig().getRIndirectDiffuseProbeMaxVisibleProbes();
 	ANKI_ASSERT(m_maxVisibleProbes <= MAX_VISIBLE_GLOBAL_ILLUMINATION_PROBES);
 	ANKI_ASSERT(m_cacheEntries.getSize() >= m_maxVisibleProbes);
 
@@ -166,7 +166,7 @@ Error IndirectDiffuseProbes::initGBuffer()
 
 Error IndirectDiffuseProbes::initShadowMapping()
 {
-	const U32 resolution = getConfig().getRGiShadowMapResolution();
+	const U32 resolution = getConfig().getRIndirectDiffuseProbeShadowMapResolution();
 	ANKI_ASSERT(resolution > 8);
 
 	// RT descr
@@ -700,13 +700,11 @@ void IndirectDiffuseProbes::runIrradiance(RenderPassWorkContext& rgraphCtx, Inte
 		rgraphCtx.bindColorTexture(0, 2, giCtx.m_gbufferColorRts[i], i);
 	}
 
-	// Bind temporary memory
-	allocateAndBindStorage<void*>(sizeof(Vec4) * 6 * m_tileSize * m_tileSize, cmdb, 0, 3);
+	rgraphCtx.bindImage(0, 3, giCtx.m_irradianceProbeRts[probeIdx], TextureSubresourceInfo());
 
-	rgraphCtx.bindImage(0, 4, giCtx.m_irradianceProbeRts[probeIdx], TextureSubresourceInfo());
-
-	struct
+	class
 	{
+	public:
 		IVec3 m_volumeTexel;
 		I32 m_nextTexelOffsetInU;
 	} unis;

+ 1 - 4
AnKi/Renderer/ProbeReflections.cpp

@@ -439,10 +439,7 @@ void ProbeReflections::runIrradiance(RenderPassWorkContext& rgraphCtx)
 	subresource.m_firstLayer = cacheEntryIdx;
 	rgraphCtx.bindTexture(0, 1, m_ctx.m_lightShadingRt, subresource);
 
-	allocateAndBindStorage<void*>(sizeof(Vec4) * 6 * m_irradiance.m_workgroupSize * m_irradiance.m_workgroupSize, cmdb,
-								  0, 3);
-
-	cmdb->bindStorageBuffer(0, 4, m_irradiance.m_diceValuesBuff, 0, m_irradiance.m_diceValuesBuff->getSize());
+	cmdb->bindStorageBuffer(0, 3, m_irradiance.m_diceValuesBuff, 0, m_irradiance.m_diceValuesBuff->getSize());
 
 	// Draw
 	cmdb->dispatchCompute(1, 1, 1);

+ 54 - 48
AnKi/Shaders/IrradianceDice.ankiprog

@@ -33,14 +33,8 @@ layout(set = 0, binding = 1) uniform ANKI_RP textureCube u_lightShadingTexCube;
 layout(set = 0, binding = 2) uniform texture2D u_gbufferTex[3u];
 #endif
 
-// This is a temporary buffer used instead of shared memory because we can't fit it into shared memory
-layout(set = 0, binding = 3) buffer b_ssbo
-{
-	Vec4 u_integrationResults[6u * WORKGROUP_SIZE];
-};
-
 #if STORE_LOCATION == 0
-layout(set = 0, binding = 4) uniform writeonly image3D u_irradianceVolume;
+layout(set = 0, binding = 3) uniform writeonly image3D u_irradianceVolume;
 
 layout(push_constant, std140) uniform b_pc
 {
@@ -48,13 +42,13 @@ layout(push_constant, std140) uniform b_pc
 	I32 u_nextTexelOffsetInU;
 };
 #else
-layout(set = 0, binding = 4) writeonly buffer b_ssbo1
+layout(set = 0, binding = 3) writeonly buffer b_ssbo1
 {
 	ANKI_RP Vec4 u_irradianceDisceResults[6u];
 };
 #endif
 
-shared ANKI_RP Vec3 s_diceIrradiance[6u];
+shared Vec3 s_integrationResults[6u][WORKGROUP_SIZE / 8u];
 
 ANKI_RP Vec3 sampleLightShadingTexture(const U32 face)
 {
@@ -81,8 +75,9 @@ void main()
 	const Vec2 faceUv = (Vec2(gl_LocalInvocationID.xy) + 0.5) / WORKGROUP_SIZE_XY_F;
 	const Vec2 ndc = UV_TO_NDC(faceUv);
 
-	// Initialize
-	ANKI_UNROLL for(U32 f = 0u; f < 6u; ++f)
+	// Compute result for a pixel
+	Vec3 resultFaces[6u];
+	for(U32 f = 0u; f < 6u; ++f)
 	{
 		// Get the direction of the dice face
 		const Vec3 diceDir = getCubemapDirection(Vec2(0.0), f);
@@ -95,33 +90,36 @@ void main()
 		const ANKI_RP Vec3 irradiance = lightShading * lambert * cubeCoordSolidAngle(ndc, WORKGROUP_SIZE_XY_F);
 
 		// Store
-		u_integrationResults[f * WORKGROUP_SIZE + gl_LocalInvocationID.y * U32(WORKGROUP_SIZE_XY)
-							 + gl_LocalInvocationID.x] = irradiance.xyzx;
+		resultFaces[f] = irradiance;
+	}
+
+	// Subgroup reduce
+	ANKI_UNROLL for(U32 f = 0u; f < 6u; ++f)
+	{
+		resultFaces[f] = subgroupAdd(resultFaces[f]);
+	}
+
+	ANKI_BRANCH if(subgroupElect())
+	{
+		ANKI_UNROLL for(U32 f = 0u; f < 6u; ++f)
+		{
+			s_integrationResults[f][gl_SubgroupID] = resultFaces[f];
+		}
 	}
 
 	memoryBarrierBuffer();
 	barrier();
 
-	// Reduce using prefix sum
-	ANKI_LOOP for(U32 s = WORKGROUP_SIZE / 2u; s > 0u; s >>= 1u)
+	// Workgroup reduce
+	ANKI_BRANCH if(gl_LocalInvocationIndex == 0u)
 	{
-		if(gl_LocalInvocationIndex < s)
+		ANKI_UNROLL for(U32 f = 0u; f < 6u; ++f)
 		{
-			ANKI_UNROLL for(U32 f = 0u; f < 6u; ++f)
+			for(U32 i = 1u; i < gl_NumSubgroups; ++i)
 			{
-				u_integrationResults[f * WORKGROUP_SIZE + gl_LocalInvocationIndex] +=
-					u_integrationResults[f * WORKGROUP_SIZE + gl_LocalInvocationIndex + s];
+				s_integrationResults[f][0] += s_integrationResults[f][i];
 			}
 		}
-
-		memoryBarrierBuffer();
-		barrier();
-	}
-
-	if(gl_LocalInvocationIndex < 6u)
-	{
-		s_diceIrradiance[gl_LocalInvocationIndex] =
-			u_integrationResults[gl_LocalInvocationIndex * WORKGROUP_SIZE + 0u].xyz;
 	}
 
 	memoryBarrierShared();
@@ -129,7 +127,7 @@ void main()
 
 #if SECOND_BOUNCE == 1
 	// Initialize again for the 2nd bounce
-	ANKI_UNROLL for(U32 f = 0u; f < 6u; ++f)
+	for(U32 f = 0u; f < 6u; ++f)
 	{
 		// Get the direction of the dice face
 		const Vec3 diceDir = getCubemapDirection(Vec2(0.0), f);
@@ -149,9 +147,9 @@ void main()
 								textureLod(u_gbufferTex[2u], u_nearestAnyClampSampler, gbufferUv, 0.0), gbuffer);
 
 		// Sample irradiance
-		ANKI_RP Vec3 firstBounceIrradiance =
-			sampleAmbientDice(s_diceIrradiance[0], s_diceIrradiance[1], s_diceIrradiance[2], s_diceIrradiance[3],
-							  s_diceIrradiance[4], s_diceIrradiance[5], gbuffer.m_normal);
+		ANKI_RP Vec3 firstBounceIrradiance = sampleAmbientDice(
+			s_integrationResults[0][0], s_integrationResults[1][0], s_integrationResults[2][0],
+			s_integrationResults[3][0], s_integrationResults[4][0], s_integrationResults[5][0], gbuffer.m_normal);
 		firstBounceIrradiance = gbuffer.m_diffuse * firstBounceIrradiance;
 
 		// Compute 2nd bounce
@@ -160,28 +158,40 @@ void main()
 			(firstBounceIrradiance + lightShading * lambert) * cubeCoordSolidAngle(ndc, WORKGROUP_SIZE_XY_F);
 
 		// Store
-		u_integrationResults[f * WORKGROUP_SIZE + gl_LocalInvocationID.y * U32(WORKGROUP_SIZE_XY)
-							 + gl_LocalInvocationID.x] = irradiance.xyzx;
+		resultFaces[f] = irradiance;
+	}
+
+	// Subgroup reduce
+	ANKI_UNROLL for(U32 f = 0u; f < 6u; ++f)
+	{
+		resultFaces[f] = subgroupAdd(resultFaces[f]);
+	}
+
+	ANKI_BRANCH if(subgroupElect())
+	{
+		ANKI_UNROLL for(U32 f = 0u; f < 6u; ++f)
+		{
+			s_integrationResults[f][gl_SubgroupID] = resultFaces[f];
+		}
 	}
 
 	memoryBarrierBuffer();
 	barrier();
 
-	// Reduce using prefix sum again
-	ANKI_LOOP for(U32 s = WORKGROUP_SIZE / 2u; s > 0u; s >>= 1u)
+	// Workgroup reduce
+	ANKI_BRANCH if(gl_LocalInvocationIndex == 0u)
 	{
-		if(gl_LocalInvocationIndex < s)
+		ANKI_UNROLL for(U32 f = 0u; f < 6u; ++f)
 		{
-			ANKI_UNROLL for(U32 f = 0u; f < 6u; ++f)
+			for(U32 i = 1u; i < gl_NumSubgroups; ++i)
 			{
-				u_integrationResults[f * WORKGROUP_SIZE + gl_LocalInvocationIndex] +=
-					u_integrationResults[f * WORKGROUP_SIZE + gl_LocalInvocationIndex + s];
+				s_integrationResults[f][0] += s_integrationResults[f][i];
 			}
 		}
-
-		memoryBarrierBuffer();
-		barrier();
 	}
+
+	memoryBarrierShared();
+	barrier();
 #endif
 
 	// Store the results
@@ -190,11 +200,7 @@ void main()
 		const U32 f = gl_LocalInvocationIndex;
 
 #if DEBUG_MODE == 0
-#	if SECOND_BOUNCE == 1
-		ANKI_RP Vec3 irradiance = u_integrationResults[f * WORKGROUP_SIZE + 0u].xyz;
-#	else
-		ANKI_RP Vec3 irradiance = s_diceIrradiance[f];
-#	endif
+		ANKI_RP Vec3 irradiance = s_integrationResults[f][0];
 		const ANKI_RP Vec3 toStoreValue = irradiance;
 #elif DEBUG_MODE == 1
 		const ANKI_RP Vec3 toStoreValue = colorPerCubeFace(f);