Ver Fonte

Optimize the irradiance calculations

Panagiotis Christopoulos Charitos há 6 meses atrás
pai
commit
832cc7d6c5

+ 7 - 8
AnKi/Renderer/IndirectDiffuseClipmaps.cpp

@@ -399,17 +399,16 @@ void IndirectDiffuseClipmaps::populateRenderGraph(RenderingContext& ctx)
 
 			cmdb.bindConstantBuffer(0, 0, ctx.m_globalRenderingConstantsBuffer);
 
+			cmdb.bindSampler(0, 0, getRenderer().getSamplers().m_trilinearRepeat.get());
+
 			for(U32 clipmap = 0; clipmap < kIndirectDiffuseClipmapCount; ++clipmap)
 			{
-				rgraphCtx.bindSrv(0, 0, radianceVolumes[clipmap]);
-				rgraphCtx.bindUav(0, 0, irradianceVolumes[clipmap]);
-
-				const UVec4 consts(clipmap);
-				cmdb.setFastConstants(&consts, sizeof(consts));
-
-				cmdb.dispatchCompute(m_clipmapInfo[clipmap].m_probeCountTotal, g_indirectDiffuseClipmapIrradianceOctMapSize,
-									 g_indirectDiffuseClipmapIrradianceOctMapSize);
+				rgraphCtx.bindSrv(clipmap, 0, radianceVolumes[clipmap]);
+				rgraphCtx.bindUav(clipmap, 0, irradianceVolumes[clipmap]);
 			}
+
+			cmdb.dispatchCompute(m_clipmapInfo[0].m_probeCounts[0] * kIndirectDiffuseClipmapCount, m_clipmapInfo[0].m_probeCounts[1],
+								 m_clipmapInfo[0].m_probeCounts[2]);
 		});
 	}
 

+ 0 - 4
AnKi/Shaders/Functions.hlsl

@@ -577,7 +577,6 @@ F32 computeMipLevel(Vec2 normalizedUvs)
 }
 #endif
 
-#if ANKI_SUPPORTS_64BIT_TYPES
 /// The regular firstbitlow in DXC has some issues since it invokes a builtin that is only supposed to be used with
 /// 32bit input. This is an alternative implementation but it expects that the input is not zero.
 I32 firstbitlow2(U64 v)
@@ -586,7 +585,6 @@ I32 firstbitlow2(U64 v)
 	const I32 lsb2 = firstbitlow((U32)(v >> 32ul));
 	return (lsb1 >= 0) ? lsb1 : lsb2 + 32;
 }
-#endif
 
 /// Define an alternative firstbitlow to go in pair with the 64bit version.
 I32 firstbitlow2(U32 v)
@@ -594,14 +592,12 @@ I32 firstbitlow2(U32 v)
 	return firstbitlow(v);
 }
 
-#if ANKI_SUPPORTS_64BIT_TYPES
 /// 64bit alternative to countbits that sums the set bits of the low and the high 32bit halves. Presumably needed for
 /// the same reason as firstbitlow2 (DXC invoking a builtin that is only meant for 32bit input) - TODO confirm.
 U32 countbits2(U64 v)
 {
 	return countbits(U32(v)) + countbits(U32(v >> 32ul));
 }
-#endif
 
 /// Encode the shading rate to be stored in an SRI. The rates should be power of two, can't be zero and can't exceed 4.
 /// So the possible values are 1,2,4

+ 0 - 4
AnKi/Shaders/Include/Common.h

@@ -5,8 +5,6 @@
 
 #pragma once
 
-#define ANKI_SUPPORTS_64BIT_TYPES !ANKI_PLATFORM_MOBILE
-
 //! == C++ =============================================================================================================
 #if defined(__cplusplus)
 
@@ -180,7 +178,6 @@ constexpr uint kSizeof_IVec3 = 12u;
 typedef int32_t4 IVec4;
 constexpr uint kSizeof_IVec4 = 16u;
 
-#	if ANKI_SUPPORTS_64BIT_TYPES
 typedef uint64_t U64;
 constexpr uint kSizeof_U64 = 8u;
 typedef uint64_t2 U64Vec2;
@@ -198,7 +195,6 @@ typedef int64_t3 I64Vec3;
 constexpr uint kSizeof_I64Vec3 = 24u;
 typedef int64_t4 I64Vec4;
 constexpr uint kSizeof_I64Vec4 = 32u;
-#	endif
 
 typedef bool Bool;
 

+ 96 - 85
AnKi/Shaders/IndirectDiffuseClipmaps.ankiprog

@@ -527,127 +527,138 @@ ANKI_FAST_CONSTANTS(Consts, g_consts)
 // ===========================================================================
 #if NOT_ZERO(ANKI_TECHNIQUE_ComputeIrradiance)
 
-struct Consts
-{
-	U32 m_clipmapIdx;
-	U32 m_padding1;
-	U32 m_padding2;
-	U32 m_padding3;
-};
-ANKI_FAST_CONSTANTS(Consts, g_consts)
-
-constexpr U32 kThreadCount = GPU_WAVE_SIZE;
+constexpr U32 kThreadCount = min(32, GPU_WAVE_SIZE); // Keep it at or below 32 to avoid threads doing no work
 
-Texture3D<Vec4> g_radianceVolume : register(t0);
+Texture3D<Vec4> g_radianceVolumes[kIndirectDiffuseClipmapCount] : register(t0);
 
-RWTexture3D<Vec4> g_irradianceVolume : register(u0);
+RWTexture3D<Vec4> g_irradianceVolumes[kIndirectDiffuseClipmapCount] : register(u0);
 
 ConstantBuffer<GlobalRendererConstants> g_globalRendererConstants : register(b0);
 
-groupshared Vec3 g_irradianceResults[kThreadCount];
+SamplerState g_linearAnyRepeatSampler : register(s0);
 
-[NumThreads(kThreadCount, 1, 1)] void main(COMPUTE_ARGS)
+groupshared U64Vec3 g_irradianceResults[IRRADIANCE_OCTAHEDRON_MAP_SIZE][IRRADIANCE_OCTAHEDRON_MAP_SIZE];
+
+void InterlockedAddColor(U32 x, U32 y, Vec3 color)
 {
-	const Clipmap clipmap = g_globalRendererConstants.m_indirectDiffuseClipmaps[g_consts.m_clipmapIdx];
-	const U32 probeIdx = svGroupId.x;
-	const UVec2 irradianceTexel = svGroupId.yz;
-
-	// Compute input radiance coordinates
-	UVec3 radianceTexelCoordStart;
-	unflatten3dArrayIndex(clipmap.m_probeCounts.z, clipmap.m_probeCounts.y, clipmap.m_probeCounts.x, probeIdx, radianceTexelCoordStart.z,
-						  radianceTexelCoordStart.y, radianceTexelCoordStart.x);
-	radianceTexelCoordStart = radianceTexelCoordStart.xzy;
-	radianceTexelCoordStart.xy *= RADIANCE_OCTAHEDRON_MAP_SIZE + 2;
-	radianceTexelCoordStart.xy += 1;
+	[unroll] for(U32 i = 0; i < 3; ++i)
+	{
+		const F32 fracPart = frac(color[i]);
+		const F32 intPart = color[i] - fracPart;
 
-	// Compute irradiance
-	Vec2 octUv = Vec2(irradianceTexel);
-	octUv += 0.5;
-	octUv /= IRRADIANCE_OCTAHEDRON_MAP_SIZE;
-	const Vec3 dir = octahedronDecode(octUv);
+		U64 val = U64(intPart) << U64(32);
+		val |= U64(fracPart * 10000.0);
+		InterlockedAdd(g_irradianceResults[y][x][i], val);
+	}
+}
 
-	const U32 radianceTexelCount = RADIANCE_OCTAHEDRON_MAP_SIZE * RADIANCE_OCTAHEDRON_MAP_SIZE;
-	const U32 radiancePixelsPerThread = (radianceTexelCount + kThreadCount - 1) / kThreadCount;
+Vec3 decodeAtomicColor(U32 x, U32 y)
+{
+	Vec3 output;
+	[unroll] for(U32 i = 0; i < 3; ++i)
+	{
+		const U64 val = g_irradianceResults[y][x][i];
 
-	Vec3 irradiance = 0.0;
-	for(U32 pixel = svGroupIndex * radiancePixelsPerThread; pixel < min(radianceTexelCount, (svGroupIndex + 1) * radiancePixelsPerThread); ++pixel)
+		output[i] = F32(val >> U64(32));
+		output[i] += F32(val & U64(kMaxU32)) / 10000.0;
+	}
+	return output;
+}
+
+// The group services a single probe. Every thread reads a radiance value and bins it to the appropriate irradiance pixel
+[NumThreads(kThreadCount, 1, 1)] void main(COMPUTE_ARGS)
+{
+	const U32 clipmapIdx = svGroupId.x / g_globalRendererConstants.m_indirectDiffuseClipmaps[0].m_probeCounts[0];
+	const UVec3 probeId = UVec3(svGroupId.x % g_globalRendererConstants.m_indirectDiffuseClipmaps[0].m_probeCounts[0], svGroupId.y, svGroupId.z);
+	const Clipmap clipmap = g_globalRendererConstants.m_indirectDiffuseClipmaps[clipmapIdx];
+
+	// Zero shared memory
+	const U32 irradianceTexelCount = IRRADIANCE_OCTAHEDRON_MAP_SIZE * IRRADIANCE_OCTAHEDRON_MAP_SIZE;
+	const U32 irradiancePixelsPerThread = (irradianceTexelCount + kThreadCount - 1) / kThreadCount;
+	for(U32 pixel = svGroupIndex * irradiancePixelsPerThread; pixel < min(irradianceTexelCount, (svGroupIndex + 1) * irradiancePixelsPerThread);
+		++pixel)
 	{
-		Vec2 octUv = Vec2(pixel % RADIANCE_OCTAHEDRON_MAP_SIZE, pixel / RADIANCE_OCTAHEDRON_MAP_SIZE);
-		octUv += 0.5;
-		octUv /= RADIANCE_OCTAHEDRON_MAP_SIZE;
+		const U32 x = pixel % IRRADIANCE_OCTAHEDRON_MAP_SIZE;
+		const U32 y = pixel / IRRADIANCE_OCTAHEDRON_MAP_SIZE;
 
-		const Vec3 sampleDir = octahedronDecode(octUv);
+		g_irradianceResults[y][x] = 0;
+	}
 
-		const F32 lambert = dot(dir, sampleDir);
-		if(lambert <= kEpsilonF32)
-		{
-			continue;
-		}
+	GroupMemoryBarrierWithGroupSync();
 
-		UVec3 coord = radianceTexelCoordStart;
-		coord.x += pixel % RADIANCE_OCTAHEDRON_MAP_SIZE + 1;
-		coord.y += pixel / RADIANCE_OCTAHEDRON_MAP_SIZE + 1;
+	// Iterate the radiance pixels of this thread and bin them to the irradiance texel. Use bilinear filtering to reduce the sample count
+	UVec3 radianceTexelCoordStart = probeId.xzy;
+	radianceTexelCoordStart.xy *= RADIANCE_OCTAHEDRON_MAP_SIZE + 2;
+	radianceTexelCoordStart.xy += 1;
+	const Vec3 radianceVolumeSize = clipmap.m_probeCounts.xzy * Vec3(RADIANCE_OCTAHEDRON_MAP_SIZE + 2, RADIANCE_OCTAHEDRON_MAP_SIZE + 2, 1.0);
+	const Vec3 radianceTexelStartUv = (Vec3(radianceTexelCoordStart) + Vec3(0.0, 0.0, 0.5)) / radianceVolumeSize;
 
-		const Vec3 radiance = TEX(g_radianceVolume, coord).xyz;
+	const U32 halfRadianceOctMapSize = RADIANCE_OCTAHEDRON_MAP_SIZE / 2;
+	const U32 radianceTexelCount = square(halfRadianceOctMapSize);
+	const U32 radiancePixelsPerThread = (radianceTexelCount + kThreadCount - 1) / kThreadCount;
+	for(U32 pixel = svGroupIndex * radiancePixelsPerThread; pixel < min(radianceTexelCount, (svGroupIndex + 1) * radiancePixelsPerThread); ++pixel)
+	{
+		Vec2 radianceOctUv = Vec2(pixel % halfRadianceOctMapSize, pixel / halfRadianceOctMapSize);
+		radianceOctUv += 0.5;
+		radianceOctUv /= halfRadianceOctMapSize;
 
-		const F32 sampleCount = square(F32(RADIANCE_OCTAHEDRON_MAP_SIZE)) / 2.0;
-		irradiance += radiance * lambert / sampleCount;
-	}
+		const Vec3 sampleDir = octahedronDecode(radianceOctUv);
 
-	g_irradianceResults[svGroupIndex] = irradiance;
+		Vec3 uv = radianceTexelStartUv;
+		uv.xy += radianceOctUv * RADIANCE_OCTAHEDRON_MAP_SIZE / radianceVolumeSize.xy;
 
-	GroupMemoryBarrierWithGroupSync();
+		const Vec3 radiance = g_radianceVolumes[clipmapIdx].SampleLevel(g_linearAnyRepeatSampler, uv, 0.0).xyz;
 
-	[loop] for(U32 s = kThreadCount / 2u; s > 0u; s >>= 1u)
-	{
-		if(svGroupIndex < s)
+		for(U32 irradiancePixelX = 0; irradiancePixelX < IRRADIANCE_OCTAHEDRON_MAP_SIZE; ++irradiancePixelX)
 		{
-			g_irradianceResults[svGroupIndex] += g_irradianceResults[svGroupIndex + s];
-		}
+			for(U32 irradiancePixelY = 0; irradiancePixelY < IRRADIANCE_OCTAHEDRON_MAP_SIZE; ++irradiancePixelY)
+			{
+				Vec2 irradianceOctUv = Vec2(irradiancePixelX, irradiancePixelY);
+				irradianceOctUv += 0.5;
+				irradianceOctUv /= IRRADIANCE_OCTAHEDRON_MAP_SIZE;
+				const Vec3 dir = octahedronDecode(irradianceOctUv);
 
-#	if ANKI_PLATFORM_MOBILE
-		if(s > WaveGetLaneCount())
-		{
-			GroupMemoryBarrierWithGroupSync();
+				const F32 lambert = dot(dir, sampleDir);
+				if(lambert <= kEpsilonF32)
+				{
+					continue;
+				}
+
+				const F32 sampleCount = radianceTexelCount / 2.0;
+				InterlockedAddColor(irradiancePixelX, irradiancePixelY, radiance * lambert / sampleCount);
+			}
 		}
-#	else
-		GroupMemoryBarrierWithGroupSync();
-#	endif
 	}
 
-	if(svGroupIndex == 0)
+	GroupMemoryBarrierWithGroupSync();
+
+	// Write the irradiance
+	for(U32 pixel = svGroupIndex * irradiancePixelsPerThread; pixel < min(irradianceTexelCount, (svGroupIndex + 1) * irradiancePixelsPerThread);
+		++pixel)
 	{
-		irradiance = g_irradianceResults[0] * k2Pi;
+		const U32 x = pixel % IRRADIANCE_OCTAHEDRON_MAP_SIZE;
+		const U32 y = pixel / IRRADIANCE_OCTAHEDRON_MAP_SIZE;
 
-		// Compute the texel coord to write the output
-		UVec3 irradianceTexelCoord;
-		unflatten3dArrayIndex(clipmap.m_probeCounts.z, clipmap.m_probeCounts.y, clipmap.m_probeCounts.x, probeIdx, irradianceTexelCoord.z,
-							  irradianceTexelCoord.y, irradianceTexelCoord.x);
-		irradianceTexelCoord = irradianceTexelCoord.xzy;
+		UVec3 irradianceTexelCoord = probeId.xzy;
 		irradianceTexelCoord.xy *= IRRADIANCE_OCTAHEDRON_MAP_SIZE + 2;
 		irradianceTexelCoord.xy += 1;
-		irradianceTexelCoord.x += irradianceTexel.x;
-		irradianceTexelCoord.y += irradianceTexel.y;
+		const IVec3 irradianceTexelCoordStart = irradianceTexelCoord;
+		irradianceTexelCoord.x += x;
+		irradianceTexelCoord.y += y;
 
-		TEX(g_irradianceVolume, irradianceTexelCoord).xyz = irradiance;
+		const Vec3 irradiance = decodeAtomicColor(x, y) * k2Pi;
+		TEX(g_irradianceVolumes[clipmapIdx], irradianceTexelCoord).xyz = irradiance;
 
 		// Write the borders
-		UVec3 volumeTexCoord;
-		unflatten3dArrayIndex(clipmap.m_probeCounts.z, clipmap.m_probeCounts.y, clipmap.m_probeCounts.x, probeIdx, volumeTexCoord.z, volumeTexCoord.y,
-							  volumeTexCoord.x);
-		volumeTexCoord = volumeTexCoord.xzy;
-
 		IVec2 borders[3];
-		const IVec2 octCoord = IVec2(irradianceTexel);
+		const IVec2 octCoord = IVec2(x, y);
 		const U32 borderCount = octahedronBorder(IRRADIANCE_OCTAHEDRON_MAP_SIZE, octCoord, borders);
 		for(U32 i = 0; i < borderCount; ++i)
 		{
-			IVec3 actualVolumeTexCoord;
-			actualVolumeTexCoord.xy = octCoord + volumeTexCoord * (IRRADIANCE_OCTAHEDRON_MAP_SIZE + 2) + 1;
-			actualVolumeTexCoord.xy += borders[i];
-			actualVolumeTexCoord.z = volumeTexCoord.z;
+			IVec3 actualVolumeTexCoord = irradianceTexelCoordStart;
+			actualVolumeTexCoord.xy += octCoord + borders[i];
 
-			TEX(g_irradianceVolume, actualVolumeTexCoord).xyz = irradiance;
+			TEX(g_irradianceVolumes[clipmapIdx], actualVolumeTexCoord).xyz = irradiance;
 		}
 	}
 }