Browse Source

Some optimizations

Panagiotis Christopoulos Charitos 7 months ago
parent
commit
baab6874cc

+ 6 - 2
AnKi/Renderer/IndirectDiffuseClipmaps.cpp

@@ -96,7 +96,8 @@ Error IndirectDiffuseClipmaps::init()
 		m_distanceMomentsVolumes[clipmap] = getRenderer().createAndClearRenderTarget(volumeInit, TextureUsageBit::kSrvCompute);
 		m_distanceMomentsVolumes[clipmap] = getRenderer().createAndClearRenderTarget(volumeInit, TextureUsageBit::kSrvCompute);
 	}
 	}
 
 
-	Array<SubMutation, 1> mutation = {"RAYS_PER_PROBE_PER_FRAME", kRaysPerProbePerFrame};
+	const Array<SubMutation, 2> mutation = {
+		{{"RAYS_PER_PROBE_PER_FRAME", kRaysPerProbePerFrame}, {"GPU_WAVE_SIZE", GrManager::getSingleton().getDeviceCapabilities().m_maxWaveSize}}};
 
 
 	ANKI_CHECK(loadShaderProgram("ShaderBinaries/IndirectDiffuseClipmaps.ankiprogbin", mutation, m_prog, m_tmpVisGrProg, "Test"));
 	ANKI_CHECK(loadShaderProgram("ShaderBinaries/IndirectDiffuseClipmaps.ankiprogbin", mutation, m_prog, m_tmpVisGrProg, "Test"));
 	ANKI_CHECK(loadShaderProgram("ShaderBinaries/IndirectDiffuseClipmaps.ankiprogbin", mutation, m_prog, m_visProbesGrProg, "VisualizeProbes"));
 	ANKI_CHECK(loadShaderProgram("ShaderBinaries/IndirectDiffuseClipmaps.ankiprogbin", mutation, m_prog, m_visProbesGrProg, "VisualizeProbes"));
@@ -112,7 +113,10 @@ Error IndirectDiffuseClipmaps::init()
 
 
 		ShaderProgramResourceVariantInitInfo variantInitInfo(m_prog);
 		ShaderProgramResourceVariantInitInfo variantInitInfo(m_prog);
 		variantInitInfo.requestTechniqueAndTypes(ShaderTypeBit::kRayGen, "RtMaterialFetch");
 		variantInitInfo.requestTechniqueAndTypes(ShaderTypeBit::kRayGen, "RtMaterialFetch");
-		variantInitInfo.addMutation("RAYS_PER_PROBE_PER_FRAME", kRaysPerProbePerFrame);
+		for(const SubMutation& s : mutation)
+		{
+			variantInitInfo.addMutation(s.m_mutatorName, s.m_value);
+		}
 		const ShaderProgramResourceVariant* variant;
 		const ShaderProgramResourceVariant* variant;
 		m_prog->getOrCreateVariant(variantInitInfo, variant);
 		m_prog->getOrCreateVariant(variantInitInfo, variant);
 		m_libraryGrProg.reset(&variant->getProgram());
 		m_libraryGrProg.reset(&variant->getProgram());

+ 1 - 1
AnKi/Renderer/IndirectDiffuseClipmaps.h

@@ -34,7 +34,7 @@ inline NumericCVar<U32> g_indirectDiffuseClipmapRadianceCacheProbeSize("R", "Ind
 																	   "Size of the octahedral for the light cache");
 																	   "Size of the octahedral for the light cache");
 inline NumericCVar<U32> g_indirectDiffuseClipmapDistancesProbeSize("R", "IndirectDiffuseClipmapDistanceSize", 10, 5, 22,
 inline NumericCVar<U32> g_indirectDiffuseClipmapDistancesProbeSize("R", "IndirectDiffuseClipmapDistanceSize", 10, 5, 22,
 																   "Size of the octahedral for the probe distances");
 																   "Size of the octahedral for the probe distances");
-inline NumericCVar<U32> g_indirectDiffuseClipmapIrradianceProbeSize("R", "IndirectDiffuseClipmapIrradianceSize", 6, 4, 22,
+inline NumericCVar<U32> g_indirectDiffuseClipmapIrradianceProbeSize("R", "IndirectDiffuseClipmapIrradianceSize", 5, 4, 20,
 																	"Size of the octahedral for the irradiance");
 																	"Size of the octahedral for the irradiance");
 
 
 /// Indirect diffuse based on clipmaps of probes.
 /// Indirect diffuse based on clipmaps of probes.

+ 21 - 44
AnKi/Shaders/IndirectDiffuseClipmaps.ankiprog

@@ -6,10 +6,11 @@
 #pragma anki 16bit
 #pragma anki 16bit
 
 
 #pragma anki mutator RAYS_PER_PROBE_PER_FRAME 32 64
 #pragma anki mutator RAYS_PER_PROBE_PER_FRAME 32 64
+#pragma anki mutator GPU_WAVE_SIZE 16 32 64
 
 
 #pragma anki technique RtMaterialFetch rgen mutators
 #pragma anki technique RtMaterialFetch rgen mutators
 #pragma anki technique PopulateCaches comp mutators RAYS_PER_PROBE_PER_FRAME
 #pragma anki technique PopulateCaches comp mutators RAYS_PER_PROBE_PER_FRAME
-#pragma anki technique ComputeIrradiance comp mutators
+#pragma anki technique ComputeIrradiance comp mutators GPU_WAVE_SIZE
 #pragma anki technique Test comp mutators
 #pragma anki technique Test comp mutators
 #pragma anki technique VisualizeProbes vert pixel mutators
 #pragma anki technique VisualizeProbes vert pixel mutators
 
 
@@ -619,7 +620,7 @@ struct Consts
 };
 };
 ANKI_FAST_CONSTANTS(Consts, g_consts)
 ANKI_FAST_CONSTANTS(Consts, g_consts)
 
 
-constexpr U32 kThreadCount = 64;
+constexpr U32 kThreadCount = GPU_WAVE_SIZE;
 
 
 Texture3D<Vec4> g_radianceVolume : register(t0);
 Texture3D<Vec4> g_radianceVolume : register(t0);
 
 
@@ -628,33 +629,13 @@ RWTexture3D<Vec4> g_irradianceVolume : register(u0);
 ConstantBuffer<GlobalRendererConstants> g_globalRendererConstants : register(b0);
 ConstantBuffer<GlobalRendererConstants> g_globalRendererConstants : register(b0);
 
 
 groupshared Vec3 g_irradianceResults[kThreadCount];
 groupshared Vec3 g_irradianceResults[kThreadCount];
-groupshared U32 g_resultCount;
-groupshared U32 g_sampleCount;
 
 
 [NumThreads(kThreadCount, 1, 1)] void main(COMPUTE_ARGS)
 [NumThreads(kThreadCount, 1, 1)] void main(COMPUTE_ARGS)
 {
 {
-	if(svGroupIndex == 0)
-	{
-		g_resultCount = 0;
-		g_sampleCount = 0;
-	}
-
-	GroupMemoryBarrierWithGroupSync();
-
 	const Clipmap clipmap = g_globalRendererConstants.m_indirectDiffuseClipmaps[g_consts.m_clipmapIdx];
 	const Clipmap clipmap = g_globalRendererConstants.m_indirectDiffuseClipmaps[g_consts.m_clipmapIdx];
 	const U32 probeIdx = svGroupId.x;
 	const U32 probeIdx = svGroupId.x;
 	const UVec2 irradianceTexel = svGroupId.yz;
 	const UVec2 irradianceTexel = svGroupId.yz;
 
 
-	// Compute the texel coord to write the output
-	UVec3 irradianceTexelCoord;
-	unflatten3dArrayIndex(clipmap.m_probeCounts.z, clipmap.m_probeCounts.y, clipmap.m_probeCounts.x, probeIdx, irradianceTexelCoord.z,
-						  irradianceTexelCoord.y, irradianceTexelCoord.x);
-	irradianceTexelCoord = irradianceTexelCoord.xzy;
-	irradianceTexelCoord.xy *= g_consts.m_irradianceProbeSize + 2;
-	irradianceTexelCoord.xy += 1;
-	irradianceTexelCoord.x += irradianceTexel.x;
-	irradianceTexelCoord.y += irradianceTexel.y;
-
 	// Compute input radiance coordinates
 	// Compute input radiance coordinates
 	UVec3 radianceTexelCoordStart;
 	UVec3 radianceTexelCoordStart;
 	unflatten3dArrayIndex(clipmap.m_probeCounts.z, clipmap.m_probeCounts.y, clipmap.m_probeCounts.x, probeIdx, radianceTexelCoordStart.z,
 	unflatten3dArrayIndex(clipmap.m_probeCounts.z, clipmap.m_probeCounts.y, clipmap.m_probeCounts.x, probeIdx, radianceTexelCoordStart.z,
@@ -673,7 +654,6 @@ groupshared U32 g_sampleCount;
 	const U32 radiancePixelsPerThread = (radianceTexelCount + kThreadCount - 1) / kThreadCount;
 	const U32 radiancePixelsPerThread = (radianceTexelCount + kThreadCount - 1) / kThreadCount;
 
 
 	Vec3 irradiance = 0.0;
 	Vec3 irradiance = 0.0;
-	U32 sampleCount = 0;
 	for(U32 pixel = svGroupIndex * radiancePixelsPerThread; pixel < min(radianceTexelCount, (svGroupIndex + 1) * radiancePixelsPerThread); ++pixel)
 	for(U32 pixel = svGroupIndex * radiancePixelsPerThread; pixel < min(radianceTexelCount, (svGroupIndex + 1) * radiancePixelsPerThread); ++pixel)
 	{
 	{
 		Vec2 octUv = Vec2(pixel % g_consts.m_radianceProbeSize, pixel / g_consts.m_radianceProbeSize);
 		Vec2 octUv = Vec2(pixel % g_consts.m_radianceProbeSize, pixel / g_consts.m_radianceProbeSize);
@@ -683,7 +663,7 @@ groupshared U32 g_sampleCount;
 		const Vec3 sampleDir = octahedronDecode(octUv);
 		const Vec3 sampleDir = octahedronDecode(octUv);
 
 
 		const F32 lambert = dot(dir, sampleDir);
 		const F32 lambert = dot(dir, sampleDir);
-		if(lambert <= 0.0)
+		if(lambert <= kEpsilonF32)
 		{
 		{
 			continue;
 			continue;
 		}
 		}
@@ -694,24 +674,17 @@ groupshared U32 g_sampleCount;
 
 
 		const Vec3 radiance = TEX(g_radianceVolume, coord).xyz;
 		const Vec3 radiance = TEX(g_radianceVolume, coord).xyz;
 
 
-		irradiance += radiance * lambert;
-		++sampleCount;
+		const F32 sampleCount = square(F32(g_consts.m_radianceProbeSize)) / 2.0;
+		irradiance += radiance * lambert / sampleCount;
 	}
 	}
 
 
-	if(sampleCount)
-	{
-		InterlockedAdd(g_sampleCount, sampleCount);
-
-		U32 offset;
-		InterlockedAdd(g_resultCount, 1, offset);
-		g_irradianceResults[offset] = irradiance;
-	}
+	g_irradianceResults[svGroupIndex] = irradiance;
 
 
 	GroupMemoryBarrierWithGroupSync();
 	GroupMemoryBarrierWithGroupSync();
 
 
 	[loop] for(U32 s = kThreadCount / 2u; s > 0u; s >>= 1u)
 	[loop] for(U32 s = kThreadCount / 2u; s > 0u; s >>= 1u)
 	{
 	{
-		if(svGroupIndex < s && svGroupIndex + s < g_resultCount)
+		if(svGroupIndex < s)
 		{
 		{
 			g_irradianceResults[svGroupIndex] += g_irradianceResults[svGroupIndex + s];
 			g_irradianceResults[svGroupIndex] += g_irradianceResults[svGroupIndex + s];
 		}
 		}
@@ -721,20 +694,24 @@ groupshared U32 g_sampleCount;
 		{
 		{
 			GroupMemoryBarrierWithGroupSync();
 			GroupMemoryBarrierWithGroupSync();
 		}
 		}
+#	else
+		GroupMemoryBarrierWithGroupSync();
 #	endif
 #	endif
 	}
 	}
 
 
 	if(svGroupIndex == 0)
 	if(svGroupIndex == 0)
 	{
 	{
-		Vec3 irradiance;
-		if(g_sampleCount)
-		{
-			irradiance = g_irradianceResults[0] / g_sampleCount * k2Pi;
-		}
-		else
-		{
-			irradiance = 0.0;
-		}
+		irradiance = g_irradianceResults[0] * k2Pi;
+
+		// Compute the texel coord to write the output
+		UVec3 irradianceTexelCoord;
+		unflatten3dArrayIndex(clipmap.m_probeCounts.z, clipmap.m_probeCounts.y, clipmap.m_probeCounts.x, probeIdx, irradianceTexelCoord.z,
+							  irradianceTexelCoord.y, irradianceTexelCoord.x);
+		irradianceTexelCoord = irradianceTexelCoord.xzy;
+		irradianceTexelCoord.xy *= g_consts.m_irradianceProbeSize + 2;
+		irradianceTexelCoord.xy += 1;
+		irradianceTexelCoord.x += irradianceTexel.x;
+		irradianceTexelCoord.y += irradianceTexel.y;
 
 
 		TEX(g_irradianceVolume, irradianceTexelCoord).xyz = irradiance;
 		TEX(g_irradianceVolume, irradianceTexelCoord).xyz = irradiance;
 
 

+ 2 - 0
AnKi/Shaders/TonemappingAverageLuminance.ankiprog

@@ -68,6 +68,8 @@ groupshared F32 s_avgLum[THREAD_COUNT_X * THREAD_COUNT_Y];
 		{
 		{
 			GroupMemoryBarrierWithGroupSync();
 			GroupMemoryBarrierWithGroupSync();
 		}
 		}
+#else
+		GroupMemoryBarrierWithGroupSync();
 #endif
 #endif
 	}
 	}