4 months ago · e418d15da7
--- a/AnKi/Renderer/IndirectDiffuseClipmaps.cpp
+++ b/AnKi/Renderer/IndirectDiffuseClipmaps.cpp
@@ -372,8 +372,6 @@ void IndirectDiffuseClipmaps::populateRenderGraph(RenderingContext& ctx)
 
				 
			
 
				 			cmdb.bindConstantBuffer(0, 0, ctx.m_globalRenderingConstantsBuffer);
			
 
				 
			
 
				-			cmdb.bindSampler(0, 0, getRenderer().getSamplers().m_trilinearRepeat.get());
			
 
				-
			
 
				 			U32 uav = 0;
			
 
				 			for(U32 clipmap = 0; clipmap < kIndirectDiffuseClipmapCount; ++clipmap)
			
 
				 			{
			
@@ -573,7 +571,7 @@ void IndirectDiffuseClipmaps::drawDebugProbes(const RenderingContext& ctx, Rende
 
				 
			
 
				 	cmdb.bindConstantBuffer(0, 0, ctx.m_globalRenderingConstantsBuffer);
			
 
				 
			
 
				-	const RenderTargetHandle visVolume = m_runCtx.m_handles.m_radianceVolumes[clipmap];
			
 
				+	const RenderTargetHandle visVolume = m_runCtx.m_handles.m_avgIrradianceVolumes[clipmap];
			
 
				 	rgraphCtx.bindSrv(0, 0, visVolume);
			
 
				 	rgraphCtx.bindSrv(1, 0, m_runCtx.m_handles.m_probeValidityVolumes[clipmap]);
			
 
				 	cmdb.bindSampler(0, 0, getRenderer().getSamplers().m_trilinearRepeat.get());
			
--- a/AnKi/Shaders/ImportanceSampling.hlsl
+++ b/AnKi/Shaders/ImportanceSampling.hlsl
@@ -147,3 +147,9 @@ vector<T, 3> generateUniformPointOnSphere(TInt sampleIndex, TInt sampleCount, U3
 
				 
			
 
				 	return normalize(vector<T, 3>(x, y, z));
			
 
				 }
			
 
				+
			
 
				+/// Taken from SHforHLSL
			
 
				+F32 sampleDirectionSpherePdf()
			
 
				+{
			
 
				+	return 1.0 / (kPi * 4.0);
			
 
				+}
			
--- a/AnKi/Shaders/IndirectDiffuseClipmaps.ankiprog
+++ b/AnKi/Shaders/IndirectDiffuseClipmaps.ankiprog
@@ -29,6 +29,7 @@
 
				 #include <AnKi/Shaders/IndirectDiffuseClipmaps.hlsl>
			
 
				 #include <AnKi/Shaders/BilateralFilter.hlsl>
			
 
				 #include <AnKi/Shaders/TemporalAA.hlsl>
			
 
				+#include <ThirdParty/SHforHLSL/SH.hlsli>
			
 
				 
			
 
				 #if defined(RT_MATERIAL_FETCH_CLIPMAP) && RT_MATERIAL_FETCH_CLIPMAP
			
 
				 #	define CLIPMAP_VOLUME
			
@@ -334,7 +335,8 @@ ANKI_FAST_CONSTANTS(Consts, g_consts)
 
				 // ===========================================================================
			
 
				 #if NOT_ZERO(ANKI_TECHNIQUE_ComputeIrradiance)
			
 
				 
			
 
				-constexpr U32 kThreadCount = min(32, GPU_WAVE_SIZE); // Keep it bellow 32 to avoid threads doing no work
			
 
				+// Each thread is touching a radiance texel
			
 
				+constexpr U32 kThreadCount = (RADIANCE_OCTAHEDRON_MAP_SIZE * RADIANCE_OCTAHEDRON_MAP_SIZE + 32 - 1) / 32 * 32;
			
 
				 
			
 
				 Texture3D<Vec4> g_radianceVolumes[kIndirectDiffuseClipmapCount] : register(t0);
			
 
				 
			
@@ -343,37 +345,7 @@ RWTexture3D<Vec4> g_avgIrradianceVolumes[kIndirectDiffuseClipmapCount] : registe
 
				 
			
 
				 ConstantBuffer<GlobalRendererConstants> g_globalRendererConstants : register(b0);
			
 
				 
			
 
				-SamplerState g_linearAnyRepeatSampler : register(s0);
			
 
				-
			
 
				-groupshared U64Vec3 g_irradianceResults[IRRADIANCE_OCTAHEDRON_MAP_SIZE][IRRADIANCE_OCTAHEDRON_MAP_SIZE];
			
 
				-
			
 
				-groupshared Vec3 g_avgIrradiance[kThreadCount];
			
 
				-
			
 
				-void InterlockedAddColor(U32 x, U32 y, Vec3 color)
			
 
				-{
			
 
				-	const Vec3 fracPart = frac(color);
			
 
				-	const Vec3 intPart = color - fracPart;
			
 
				-
			
 
				-	U64Vec3 val = U64Vec3(intPart) << U64(32);
			
 
				-	val |= U64Vec3(fracPart * 10000.0);
			
 
				-
			
 
				-	InterlockedAdd(g_irradianceResults[y][x][0], val[0]);
			
 
				-	InterlockedAdd(g_irradianceResults[y][x][1], val[1]);
			
 
				-	InterlockedAdd(g_irradianceResults[y][x][2], val[2]);
			
 
				-}
			
 
				-
			
 
				-Vec3 decodeAtomicColor(U32 x, U32 y)
			
 
				-{
			
 
				-	Vec3 output;
			
 
				-	[unroll] for(U32 i = 0; i < 3; ++i)
			
 
				-	{
			
 
				-		const U64 val = g_irradianceResults[y][x][i];
			
 
				-
			
 
				-		output[i] = F32(val >> U64(32));
			
 
				-		output[i] += F32(val & U64(kMaxU32)) / 10000.0;
			
 
				-	}
			
 
				-	return output;
			
 
				-}
			
 
				+groupshared SH::L2_F16_RGB g_sh[kThreadCount];
			
 
				 
			
 
				 struct StoreBorderFunc
			
 
				 {
			
@@ -388,126 +360,102 @@ struct StoreBorderFunc
 
				 	}
			
 
				 };
			
 
				 
			
 
				-// The group services a single probe. Every thread reads a radiance value and bins it to the appropreate irradiance pixel
			
 
				+// The group services a single probe.
			
 
				+// - Every thread reads a radiance value, converts it to SH and stores is in groupshared
			
 
				+// - Then we do a reduction of all SH
			
 
				+// - Then we use the SH to populate the irradiance
			
 
				 [NumThreads(kThreadCount, 1, 1)] void main(COMPUTE_ARGS)
			
 
				 {
			
 
				 	const IndirectDiffuseClipmapConstants idConsts = g_globalRendererConstants.m_indirectDiffuseClipmaps;
			
 
				 	const U32 clipmapIdx = svGroupId.x / idConsts.m_probeCounts.x;
			
 
				 	const UVec3 probeId = UVec3(svGroupId.x % idConsts.m_probeCounts.x, svGroupId.y, svGroupId.z);
			
 
				 
			
 
				-	// Zero shared memory
			
 
				-	const U32 irradianceTexelCount = IRRADIANCE_OCTAHEDRON_MAP_SIZE * IRRADIANCE_OCTAHEDRON_MAP_SIZE;
			
 
				-	const U32 irradiancePixelsPerThread = (irradianceTexelCount + kThreadCount - 1) / kThreadCount;
			
 
				-	for(U32 pixel = svGroupIndex * irradiancePixelsPerThread; pixel < min(irradianceTexelCount, (svGroupIndex + 1) * irradiancePixelsPerThread);
			
 
				-		++pixel)
			
 
				+	// Read the radiance and put it in SH
			
 
				+	if(svGroupIndex < square(RADIANCE_OCTAHEDRON_MAP_SIZE))
			
 
				 	{
			
 
				-		const U32 x = pixel % IRRADIANCE_OCTAHEDRON_MAP_SIZE;
			
 
				-		const U32 y = pixel / IRRADIANCE_OCTAHEDRON_MAP_SIZE;
			
 
				+		UVec2 radianceOctCoordLocal;
			
 
				+		radianceOctCoordLocal.y = svGroupIndex / RADIANCE_OCTAHEDRON_MAP_SIZE;
			
 
				+		radianceOctCoordLocal.x = svGroupIndex % RADIANCE_OCTAHEDRON_MAP_SIZE;
			
 
				 
			
 
				-		g_irradianceResults[y][x] = 0;
			
 
				-	}
			
 
				+		UVec3 radianceTexelCoordStart = probeId.xzy;
			
 
				+		radianceTexelCoordStart.xy *= RADIANCE_OCTAHEDRON_MAP_SIZE + 2;
			
 
				+		radianceTexelCoordStart.xy += 1;
			
 
				 
			
 
				-	GroupMemoryBarrierWithGroupSync();
			
 
				+		const Vec2 uv = (radianceOctCoordLocal + 0.5) / RADIANCE_OCTAHEDRON_MAP_SIZE;
			
 
				+		const Vec3 sampleDir = octahedronDecode(uv);
			
 
				 
			
 
				-	// Iterate the radiance pixels of this thread and bin them to the irradiance texel. Use bilinear filtering to reduce the sample count
			
 
				-	UVec3 radianceTexelCoordStart = probeId.xzy;
			
 
				-	radianceTexelCoordStart.xy *= RADIANCE_OCTAHEDRON_MAP_SIZE + 2;
			
 
				-	radianceTexelCoordStart.xy += 1;
			
 
				-	const Vec3 radianceVolumeSize = idConsts.m_probeCounts.xzy * Vec3(RADIANCE_OCTAHEDRON_MAP_SIZE + 2, RADIANCE_OCTAHEDRON_MAP_SIZE + 2, 1.0);
			
 
				-	const Vec3 radianceTexelStartUv = (Vec3(radianceTexelCoordStart) + Vec3(0.0, 0.0, 0.5)) / radianceVolumeSize;
			
 
				+		const UVec3 coord = radianceTexelCoordStart + UVec3(radianceOctCoordLocal, 0);
			
 
				+		const Vec3 radiance = TEX(g_radianceVolumes[clipmapIdx], coord);
			
 
				 
			
 
				-	const U32 halfRadianceOctMapSize = RADIANCE_OCTAHEDRON_MAP_SIZE / 2;
			
 
				-	const U32 radianceTexelCount = square(halfRadianceOctMapSize);
			
 
				-	const U32 radiancePixelsPerThread = (radianceTexelCount + kThreadCount - 1) / kThreadCount;
			
 
				-	for(U32 pixel = svGroupIndex * radiancePixelsPerThread; pixel < min(radianceTexelCount, (svGroupIndex + 1) * radiancePixelsPerThread); ++pixel)
			
 
				-	{
			
 
				-		Vec2 radianceOctUv = Vec2(pixel % halfRadianceOctMapSize, pixel / halfRadianceOctMapSize);
			
 
				-		radianceOctUv += 0.5;
			
 
				-		radianceOctUv /= halfRadianceOctMapSize;
			
 
				+		const F16 sampleCountf = square(RADIANCE_OCTAHEDRON_MAP_SIZE);
			
 
				+		const F16 normalization = 1.0 / (sampleCountf * sampleDirectionSpherePdf());
			
 
				 
			
 
				-		const Vec3 sampleDir = octahedronDecode(radianceOctUv);
			
 
				+		g_sh[svGroupIndex] = SH::ProjectOntoL2(HVec3(sampleDir), HVec3(radiance)) * normalization;
			
 
				+	}
			
 
				+	else
			
 
				+	{
			
 
				+		g_sh[svGroupIndex] = SH::L2_F16_RGB::Zero();
			
 
				+	}
			
 
				 
			
 
				-		Vec3 uv = radianceTexelStartUv;
			
 
				-		uv.xy += radianceOctUv * RADIANCE_OCTAHEDRON_MAP_SIZE / radianceVolumeSize.xy;
			
 
				+	// Integrate, like parallel prefix sum
			
 
				+	GroupMemoryBarrierWithGroupSync();
			
 
				 
			
 
				-		const Vec3 radiance = g_radianceVolumes[clipmapIdx].SampleLevel(g_linearAnyRepeatSampler, uv, 0.0).xyz;
			
 
				+	[loop] for(U32 s = kThreadCount / 2u; s > 0u; s >>= 1u)
			
 
				+	{
			
 
				+		if(svGroupIndex < s)
			
 
				+		{
			
 
				+			g_sh[svGroupIndex] = g_sh[svGroupIndex] + g_sh[svGroupIndex + s];
			
 
				+		}
			
 
				 
			
 
				-		for(U32 irradiancePixelX = 0; irradiancePixelX < IRRADIANCE_OCTAHEDRON_MAP_SIZE; ++irradiancePixelX)
			
 
				+#	if ANKI_PLATFORM_MOBILE
			
 
				+		if(s > WaveGetLaneCount())
			
 
				 		{
			
 
				-			for(U32 irradiancePixelY = 0; irradiancePixelY < IRRADIANCE_OCTAHEDRON_MAP_SIZE; ++irradiancePixelY)
			
 
				-			{
			
 
				-				Vec2 irradianceOctUv = Vec2(irradiancePixelX, irradiancePixelY);
			
 
				-				irradianceOctUv += 0.5;
			
 
				-				irradianceOctUv /= IRRADIANCE_OCTAHEDRON_MAP_SIZE;
			
 
				-				const Vec3 dir = octahedronDecode(irradianceOctUv);
			
 
				-
			
 
				-				const F32 lambert = dot(dir, sampleDir);
			
 
				-				if(lambert <= kEpsilonF32)
			
 
				-				{
			
 
				-					continue;
			
 
				-				}
			
 
				-
			
 
				-				const F32 sampleCount = radianceTexelCount / 2.0;
			
 
				-				InterlockedAddColor(irradiancePixelX, irradiancePixelY, radiance * lambert / sampleCount);
			
 
				-			}
			
 
				+			GroupMemoryBarrierWithGroupSync();
			
 
				 		}
			
 
				+#	else
			
 
				+		GroupMemoryBarrierWithGroupSync();
			
 
				+#	endif
			
 
				 	}
			
 
				 
			
 
				-	GroupMemoryBarrierWithGroupSync();
			
 
				+	const SH::L2_F16_RGB sh = g_sh[0];
			
 
				 
			
 
				-	// Write the irradiance
			
 
				-	Vec3 threadAvgIrradiance = 0.0;
			
 
				-	for(U32 pixel = svGroupIndex * irradiancePixelsPerThread; pixel < min(irradianceTexelCount, (svGroupIndex + 1) * irradiancePixelsPerThread);
			
 
				-		++pixel)
			
 
				+	// Store the irradiance
			
 
				+	if(svGroupIndex < square(IRRADIANCE_OCTAHEDRON_MAP_SIZE))
			
 
				 	{
			
 
				-		const U32 x = pixel % IRRADIANCE_OCTAHEDRON_MAP_SIZE;
			
 
				-		const U32 y = pixel / IRRADIANCE_OCTAHEDRON_MAP_SIZE;
			
 
				+		UVec2 octCoordLocal;
			
 
				+		octCoordLocal.y = svGroupIndex / IRRADIANCE_OCTAHEDRON_MAP_SIZE;
			
 
				+		octCoordLocal.x = svGroupIndex % IRRADIANCE_OCTAHEDRON_MAP_SIZE;
			
 
				 
			
 
				-		UVec3 irradianceTexelCoord = probeId.xzy;
			
 
				-		irradianceTexelCoord.xy *= IRRADIANCE_OCTAHEDRON_MAP_SIZE + 2;
			
 
				-		irradianceTexelCoord.xy += 1;
			
 
				-		const IVec3 irradianceTexelCoordStart = irradianceTexelCoord;
			
 
				-		irradianceTexelCoord.x += x;
			
 
				-		irradianceTexelCoord.y += y;
			
 
				+		UVec3 texelCoordStart = probeId.xzy;
			
 
				+		texelCoordStart.xy *= IRRADIANCE_OCTAHEDRON_MAP_SIZE + 2;
			
 
				+		texelCoordStart.xy += 1;
			
 
				 
			
 
				-		const Vec3 irradiance = decodeAtomicColor(x, y) * k2Pi;
			
 
				+		const Vec2 uv = (octCoordLocal + 0.5) / IRRADIANCE_OCTAHEDRON_MAP_SIZE;
			
 
				+		const Vec3 sampleDir = octahedronDecode(uv);
			
 
				 
			
 
				-		threadAvgIrradiance += irradiance / irradianceTexelCount;
			
 
				+		const Vec3 irradiance = SH::CalculateIrradiance<F16>(sh, sampleDir);
			
 
				 
			
 
				-		TEX(g_irradianceVolumes[clipmapIdx], irradianceTexelCoord) = Vec4(irradiance, 0.0);
			
 
				+		const UVec3 coord = texelCoordStart + UVec3(octCoordLocal, 0);
			
 
				+		TEX(g_irradianceVolumes[clipmapIdx], coord) = Vec4(irradiance, 0.0);
			
 
				 
			
 
				 		// Write the borders
			
 
				 		StoreBorderFunc func;
			
 
				 		func.m_clipmapIdx = clipmapIdx;
			
 
				-		func.m_startOfOctCoord = irradianceTexelCoordStart;
			
 
				+		func.m_startOfOctCoord = texelCoordStart;
			
 
				 		func.m_value = irradiance;
			
 
				-		const IVec2 octCoord = IVec2(x, y);
			
 
				-		storeOctahedronBorder(IRRADIANCE_OCTAHEDRON_MAP_SIZE, octCoord, func);
			
 
				+		storeOctahedronBorder(IRRADIANCE_OCTAHEDRON_MAP_SIZE, octCoordLocal, func);
			
 
				 	}
			
 
				 
			
 
				-	g_avgIrradiance[svGroupIndex] = threadAvgIrradiance;
			
 
				-
			
 
				-	GroupMemoryBarrierWithGroupSync();
			
 
				-
			
 
				-	// Compute the avg radiance
			
 
				-	[loop] for(U32 s = kThreadCount / 2u; s > 0u; s >>= 1u)
			
 
				+	// Store the average irradiance
			
 
				+	HVec3 dir;
			
 
				+	HVec3 color;
			
 
				+	SH::ApproximateDirectionalLight(SH::L2toL1(sh), dir, color);
			
 
				+	if(isInfOrNan(Vec3(color)))
			
 
				 	{
			
 
				-		if(svGroupIndex < s)
			
 
				-		{
			
 
				-			g_avgIrradiance[svGroupIndex] += g_avgIrradiance[svGroupIndex + s];
			
 
				-		}
			
 
				-
			
 
				-#	if ANKI_PLATFORM_MOBILE
			
 
				-		if(s > WaveGetLaneCount())
			
 
				-		{
			
 
				-			GroupMemoryBarrierWithGroupSync();
			
 
				-		}
			
 
				-#	else
			
 
				-		GroupMemoryBarrierWithGroupSync();
			
 
				-#	endif
			
 
				+		color = 0.0;
			
 
				 	}
			
 
				 
			
 
				-	g_avgIrradianceVolumes[clipmapIdx][probeId.xzy].xyz = g_avgIrradiance[0];
			
 
				+	TEX(g_avgIrradianceVolumes[clipmapIdx], probeId.xzy) = Vec4(color, 0.0);
			
 
				 }
			
 
				 #endif
			
 
				 
			
--- a/ThirdParty/SHforHLSL/SH.hlsli
+++ b/ThirdParty/SHforHLSL/SH.hlsli
@@ -0,0 +1,555 @@
 
				+//=================================================================================================
			
 
				+//
			
 
				+//  SHforHLSL - Spherical harmonics suppport library for HLSL 2021, by MJP
			
 
				+//  https://github.com/TheRealMJP/SHforHLSL
			
 
				+//  https://therealmjp.github.io/
			
 
				+//
			
 
				+//  All code licensed under the MIT license
			
 
				+//
			
 
				+//=================================================================================================
			
 
				+
			
 
				+//=================================================================================================
			
 
				+//
			
 
				+// This header is intended to be included directly from HLSL 2021+ code, or similar. It
			
 
				+// implements types and utility functions for working with low-order spherical harmonics,
			
 
				+// focused on use cases for graphics.
			
 
				+//
			
 
				+// Currently this library has support for L1 (2 bands, 4 coefficients) and
			
 
				+// L2 (3 bands, 9 coefficients) SH. Depending on the author and material you're reading, you may
			
 
				+// see L1 referred to as both first-order or second-order, and L2 referred to as second-order
			
 
				+// or third-order. Ravi Ramamoorthi tends to refer to three bands as second-order, and
			
 
				+// Peter-Pike Sloan tends to refer to three bands as third-order. This library always uses L1 and
			
 
				+// L2 for clarity.
			
 
				+//
			
 
				+// The core SH type as well as the L1 and L2 aliases are templated on the primitive scalar
			
 
				+// type (T) as well as the number of vector components (N). They are intended to be used with
			
 
				+// 1 or 3 components paired with the float32_t and float16_t primitive types (with
			
 
				+// -enable-16bit-types passed during compilation). When fp16 types are used, the helper functions
			
 
				+// take fp16 arguments to try to avoid implicit conversions where possible.
			
 
				+//
			
 
				+// The core SH type supports basic operator overloading for summing/subtracting two sets of SH
			
 
				+// coefficients as well as multiplying/dividing the set of SH coefficients by a value.
			
 
				+//
			
 
				+// Example #1: integrating and projecting radiance onto L2 SH
			
 
				+//
			
 
				+// SH::L2_RGB radianceSH = SH::L2_RGB::Zero();
			
 
				+// for(int32_t sampleIndex = 0; sampleIndex < NumSamples; ++sampleIndex)
			
 
				+// {
			
 
				+//     float2 u1u2 = RandomFloat2(sampleIndex, NumSamples);
			
 
				+//     float3 sampleDirection = SampleDirectionSphere(u1u2);
			
 
				+//     float3 sampleRadiance = CalculateIncomingRadiance(sampleDirection);
			
 
				+//     radianceSH += SH::ProjectOntoL2(sampleDirection, sampleRadiance);
			
 
				+// }
			
 
				+// radianceSH *= 1.0f / (NumSamples * SampleDirectionSphere_PDF());
			
 
				+//
			
 
				+// Example #2: calculating diffuse lighting for a surface from radiance projected onto L2 SH
			
 
				+//
			
 
				+// SH::L2_RGB radianceSH = FetchRadianceSH(surfacePosition);
			
 
				+// float3 diffuseLighting = SH::CalculateIrradiance(radianceSH, surfaceNormal) * (diffuseAlbedo / Pi);
			
 
				+//
			
 
				+//=================================================================================================
			
 
				+
			
 
				+#ifndef SH_HLSLI_
			
 
				+#define SH_HLSLI_
			
 
				+
			
 
				+namespace SH
			
 
				+{
			
 
				+
			
 
				+// Constants
			
 
				+static const float32_t Pi = 3.141592654f;
			
 
				+static const float32_t SqrtPi = sqrt(Pi);
			
 
				+
			
 
				+static const float32_t CosineA0 = Pi;
			
 
				+static const float32_t CosineA1 = (2.0f * Pi) / 3.0f;
			
 
				+static const float32_t CosineA2 = (0.25f * Pi);
			
 
				+
			
 
				+static const float32_t BasisL0 = 1 / (2 * SqrtPi);
			
 
				+static const float32_t BasisL1 = sqrt(3) / (2 * SqrtPi);
			
 
				+static const float32_t BasisL2_MN2 = sqrt(15) / (2 * SqrtPi);
			
 
				+static const float32_t BasisL2_MN1 = sqrt(15) / (2 * SqrtPi);
			
 
				+static const float32_t BasisL2_M0 = sqrt(5) / (4 * SqrtPi);
			
 
				+static const float32_t BasisL2_M1 = sqrt(15) / (2 * SqrtPi);
			
 
				+static const float32_t BasisL2_M2 = sqrt(15) / (4 * SqrtPi);
			
 
				+
			
 
				+// Base templated type for SH coefficients
			
 
				+template<typename T, int32_t N, int32_t L> struct SH
			
 
				+{
			
 
				+    static const int32_t NumCoefficients = (L + 1) * (L + 1);
			
 
				+
			
 
				+    vector<T, N> C[NumCoefficients];
			
 
				+
			
 
				+    static SH<T, N, L> Zero()
			
 
				+    {
			
 
				+        return (SH<T, N, L>)0;
			
 
				+    }
			
 
				+
			
 
				+    SH<T, N, L> operator+(SH<T, N, L> other)
			
 
				+    {
			
 
				+        SH<T, N, L> result;
			
 
				+        [unroll]
			
 
				+        for(int32_t i = 0; i < NumCoefficients; ++i)
			
 
				+            result.C[i] = C[i] + other.C[i];
			
 
				+        return result;
			
 
				+    }
			
 
				+
			
 
				+    SH<T, N, L> operator-(SH<T, N, L> other)
			
 
				+    {
			
 
				+        SH<T, N, L> result;
			
 
				+        [unroll]
			
 
				+        for(int32_t i = 0; i < NumCoefficients; ++i)
			
 
				+            result.C[i] = C[i] - other.C[i];
			
 
				+        return result;
			
 
				+    }
			
 
				+
			
 
				+    SH<T, N, L> operator*(vector<T, N> value)
			
 
				+    {
			
 
				+        SH<T, N, L> result;
			
 
				+        [unroll]
			
 
				+        for(int32_t i = 0; i < NumCoefficients; ++i)
			
 
				+            result.C[i] = C[i] * value;
			
 
				+        return result;
			
 
				+    }
			
 
				+
			
 
				+    SH<T, N, L> operator/(vector<T, N> value)
			
 
				+    {
			
 
				+        SH<T, N, L> result;
			
 
				+        [unroll]
			
 
				+        for(int32_t i = 0; i < NumCoefficients; ++i)
			
 
				+            result.C[i] = C[i] / value;
			
 
				+        return result;
			
 
				+    }
			
 
				+};
			
 
				+
			
 
				+template<typename T, int32_t N = 1> using L1_Generic = SH<T, N, 1>;
			
 
				+using L1 = L1_Generic<float32_t, 1>;
			
 
				+using L1_F16 = L1_Generic<float16_t, 1>;
			
 
				+using L1_RGB = L1_Generic<float32_t, 3>;
			
 
				+using L1_F16_RGB = L1_Generic<float16_t, 3>;
			
 
				+
			
 
				+template<typename T, int32_t N = 1> using L2_Generic = SH<T, N, 2>;
			
 
				+using L2 = L2_Generic<float32_t, 1>;
			
 
				+using L2_F16 = L2_Generic<float16_t, 1>;
			
 
				+using L2_RGB = L2_Generic<float32_t, 3>;
			
 
				+using L2_F16_RGB = L2_Generic<float16_t, 3>;
			
 
				+
			
 
				+// Converts from scalar to RGB SH coefficients
			
 
				+template<typename T> L1_Generic<T, 3> ToRGB(L1_Generic<T, 1> sh)
			
 
				+{
			
 
				+    L1_Generic<T, 3> result;
			
 
				+    for(uint i = 0; i < L1_Generic<T, 1>::NumCoefficients; ++i)
			
 
				+        result.C[i] = sh.C[i];
			
 
				+    return result;
			
 
				+}
			
 
				+
			
 
				+template<typename T> L2_Generic<T, 3> ToRGB(L2_Generic<T, 1> sh)
			
 
				+{
			
 
				+    L2_Generic<T, 3> result;
			
 
				+    for(uint i = 0; i < L2_Generic<T, 1>::NumCoefficients; ++i)
			
 
				+        result.C[i] = sh.C[i];
			
 
				+    return result;
			
 
				+}
			
 
				+
			
 
				+// Truncates a set of L2 coefficients to produce a set of L1 coefficients
			
 
				+template<typename T, int32_t N> L1_Generic<T, N> L2toL1(L2_Generic<T, N> sh)
			
 
				+{
			
 
				+    L1_Generic<T, N> result;
			
 
				+    for(int32_t i = 0; i < L1_Generic<T, N>::NumCoefficients; ++i)
			
 
				+        result.C[i] = sh.C[i];
			
 
				+    return result;
			
 
				+}
			
 
				+
			
 
				+template<typename T, int32_t N> L1_Generic<T, N> Lerp(L1_Generic<T, N> x, L1_Generic<T, N> y, T s)
			
 
				+{
			
 
				+    return x * (T(1.0) - s) + y * s;
			
 
				+}
			
 
				+
			
 
				+template<typename T, int32_t N> L2_Generic<T, N> Lerp(L2_Generic<T, N> x, L2_Generic<T, N> y, T s)
			
 
				+{
			
 
				+    return x * (T(1.0) - s) + y * s;
			
 
				+}
			
 
				+
			
 
				+// Projects a value in a single direction onto a set of L1 SH coefficients
			
 
				+template<typename T, int32_t N> L1_Generic<T, N> ProjectOntoL1(vector<T, 3> direction, vector<T, N> value)
			
 
				+{
			
 
				+    L1_Generic<T, N> sh;
			
 
				+
			
 
				+    // L0
			
 
				+    sh.C[0] = T(BasisL0) * value;
			
 
				+
			
 
				+    // L1
			
 
				+    sh.C[1] = T(BasisL1) * direction.y * value;
			
 
				+    sh.C[2] = T(BasisL1) * direction.z * value;
			
 
				+    sh.C[3] = T(BasisL1) * direction.x * value;
			
 
				+
			
 
				+    return sh;
			
 
				+}
			
 
				+
			
 
				+template<typename T> L1_Generic<T, 1> ProjectOntoL1(vector<T, 3> direction, T value)
			
 
				+{
			
 
				+    return ProjectOntoL1<T, 1>(direction, value);
			
 
				+}
			
 
				+
			
 
				+// Projects a value in a single direction onto a set of L2 SH coefficients
			
 
				+template<typename T, int32_t N> L2_Generic<T, N> ProjectOntoL2(vector<T, 3> direction, vector<T, N> value)
			
 
				+{
			
 
				+    L2_Generic<T, N> sh;
			
 
				+
			
 
				+    // L0
			
 
				+    sh.C[0] = T(BasisL0) * value;
			
 
				+
			
 
				+    // L1
			
 
				+    sh.C[1] = T(BasisL1) * direction.y * value;
			
 
				+    sh.C[2] = T(BasisL1) * direction.z * value;
			
 
				+    sh.C[3] = T(BasisL1) * direction.x * value;
			
 
				+
			
 
				+    // L2
			
 
				+    sh.C[4] = T(BasisL2_MN2) * direction.x * direction.y * value;
			
 
				+    sh.C[5] = T(BasisL2_MN1) * direction.y * direction.z * value;
			
 
				+    sh.C[6] = T(BasisL2_M0) * (T(3.0) * direction.z * direction.z - T(1.0)) * value;
			
 
				+    sh.C[7] = T(BasisL2_M1) * direction.x * direction.z * value;
			
 
				+    sh.C[8] = T(BasisL2_M2) * (direction.x * direction.x - direction.y * direction.y) * value;
			
 
				+
			
 
				+    return sh;
			
 
				+}
			
 
				+
			
 
				+template<typename T> L2_Generic<T, 1> ProjectOntoL2(vector<T, 3> direction, T value)
			
 
				+{
			
 
				+    return ProjectOntoL2<T, 1>(direction, value);
			
 
				+}
			
 
				+
			
 
				+// Calculates the dot product of two sets of L1 SH coefficients
			
 
				+template<typename T, int32_t N> vector<T, N> DotProduct(L1_Generic<T, N> a, L1_Generic<T, N> b)
			
 
				+{
			
 
				+    vector<T, N> result = T(0.0);
			
 
				+    for(int32_t i = 0; i < L1_Generic<T, N>::NumCoefficients; ++i)
			
 
				+        result += a.C[i] * b.C[i];
			
 
				+
			
 
				+    return result;
			
 
				+}
			
 
				+
			
 
				+// Calculates the dot product of two sets of L2 SH coefficients
			
 
				+template<typename T, int32_t N> vector<T, N> DotProduct(L2_Generic<T, N> a, L2_Generic<T, N> b)
			
 
				+{
			
 
				+    vector<T, N> result = T(0.0);
			
 
				+    for(int32_t i = 0; i < L2_Generic<T, N>::NumCoefficients; ++i)
			
 
				+        result += a.C[i] * b.C[i];
			
 
				+
			
 
				+    return result;
			
 
				+}
			
 
				+
			
 
				+// Projects a delta in a direction onto SH and calculates the dot product with a set of L1 SH coefficients.
			
 
				+// Can be used to "look up" a value from SH coefficients in a particular direction.
			
 
				+template<typename T, int32_t N> vector<T, N> Evaluate(L1_Generic<T, N> sh, vector<T, 3> direction)
			
 
				+{
			
 
				+    L1_Generic<T, N> projectedDelta = ProjectOntoL1(direction, (vector<T, N>)(1.0));
			
 
				+    return DotProduct(projectedDelta, sh);
			
 
				+}
			
 
				+
			
 
				+// Projects a delta in a direction onto SH and calculates the dot product with a set of L2 SH coefficients.
			
 
				+// Can be used to "look up" a value from SH coefficients in a particular direction.
			
 
				+template<typename T, int32_t N> vector<T, N> Evaluate(L2_Generic<T, N> sh, vector<T, 3> direction)
			
 
				+{
			
 
				+    L2_Generic<T, N> projectedDelta = ProjectOntoL2(direction, (vector<T, N>)(1.0));
			
 
				+    return DotProduct(projectedDelta, sh);
			
 
				+}
			
 
				+
			
 
				+// Convolves a set of L1 SH coefficients with a set of L1 zonal harmonics
			
 
				+template<typename T, int32_t N> L1_Generic<T, N> ConvolveWithZH(L1_Generic<T, N> sh, vector<T, 2> zh)
			
 
				+{
			
 
				+    // L0
			
 
				+    sh.C[0] *= zh.x;
			
 
				+
			
 
				+    // L1
			
 
				+    sh.C[1] *= zh.y;
			
 
				+    sh.C[2] *= zh.y;
			
 
				+    sh.C[3] *= zh.y;
			
 
				+
			
 
				+    return sh;
			
 
				+}
			
 
				+
			
 
				+// Convolves a set of L2 SH coefficients with a set of L2 zonal harmonics
			
 
				+template<typename T, int32_t N> L2_Generic<T, N> ConvolveWithZH(L2_Generic<T, N> sh, vector<T, 3> zh)
			
 
				+{
			
 
				+    // L0
			
 
				+    sh.C[0] *= zh.x;
			
 
				+
			
 
				+    // L1
			
 
				+    sh.C[1] *= zh.y;
			
 
				+    sh.C[2] *= zh.y;
			
 
				+    sh.C[3] *= zh.y;
			
 
				+
			
 
				+    // L2
			
 
				+    sh.C[4] *= zh.z;
			
 
				+    sh.C[5] *= zh.z;
			
 
				+    sh.C[6] *= zh.z;
			
 
				+    sh.C[7] *= zh.z;
			
 
				+    sh.C[8] *= zh.z;
			
 
				+
			
 
				+    return sh;
			
 
				+}
			
 
				+
			
 
				+// Convolves a set of L1 SH coefficients with a cosine lobe. See [2]
			
 
				+template<typename T, int32_t N> L1_Generic<T, N> ConvolveWithCosineLobe(L1_Generic<T, N> sh)
			
 
				+{
			
 
				+    return ConvolveWithZH(sh, vector<T, 2>(CosineA0, CosineA1));
			
 
				+}
			
 
				+
			
 
				+// Convolves a set of L2 SH coefficients with a cosine lobe. See [2]
			
 
				+template<typename T, int32_t N> L2_Generic<T, N> ConvolveWithCosineLobe(L2_Generic<T, N> sh)
			
 
				+{
			
 
				+    return ConvolveWithZH(sh, vector<T, 3>(CosineA0, CosineA1, CosineA2));
			
 
				+}
			
 
				+
			
 
				+// Computes the "optimal linear direction" for a set of SH coefficients, AKA the "dominant" direction. See [0].
			
 
				+template<typename T, int32_t N> vector<T, 3> OptimalLinearDirection(L1_Generic<T, N> sh)
			
 
				+{
			
 
				+    vector<T, 3> direction = T(0.0);
			
 
				+    for(int32_t i = 0; i < N; ++i)
			
 
				+    {
			
 
				+        direction.x += sh.C[3][i];
			
 
				+        direction.y += sh.C[1][i];
			
 
				+        direction.z += sh.C[2][i];
			
 
				+    }
			
 
				+    return normalize(direction);
			
 
				+}
			
 
				+
			
 
				+// Computes the direction and color of a directional light that approximates a set of L1 SH coefficients. See [0].
			
 
				+template<typename T, int32_t N> void ApproximateDirectionalLight(L1_Generic<T, N> sh, out vector<T, 3> direction, out vector<T, N> color)
			
 
				+{
			
 
				+    direction = OptimalLinearDirection(sh);
			
 
				+    L1_Generic<T, N> dirSH = ProjectOntoL1(direction, (vector<T, N>)(1.0));
			
 
				+    dirSH.C[0] = T(0.0);
			
 
				+    color = DotProduct(dirSH, sh) * T(867.0 / (316.0 * Pi));
			
 
				+}
			
 
				+
			
 
				+// Calculates the irradiance from a set of SH coefficients containing projected radiance.
			
 
				+// Convolves the radiance with a cosine lobe, and then evaluates the result in the given normal direction.
			
 
				+// Note that this does not scale the irradiance by 1 / Pi: if using this result for Lambertian diffuse,
			
 
				+// you will want to include the divide-by-pi that's part of the Lambertian BRDF.
			
 
				+// For example: float3 diffuse = CalculateIrradiance(sh, normal) * diffuseAlbedo / Pi;
			
 
				+template<typename T, int32_t N> vector<T, N> CalculateIrradiance(L1_Generic<T, N> sh, vector<T, 3> normal)
			
 
				+{
			
 
				+    L1_Generic<T, N> convolved = ConvolveWithCosineLobe(sh);
			
 
				+    return Evaluate(convolved, normal);
			
 
				+}
			
 
				+
			
 
				+// Calculates the irradiance from a set of SH coefficients containing projected radiance.
			
 
				+// Convolves the radiance with a cosine lobe, and then evaluates the result in the given normal direction.
			
 
				+// Note that this does not scale the irradiance by 1 / Pi: if using this result for Lambertian diffuse,
			
 
				+// you will want to include the divide-by-pi that's part of the Lambertian BRDF.
			
 
				+// For example: float3 diffuse = CalculateIrradiance(sh, normal) * diffuseAlbedo / Pi;
			
 
				+template<typename T, int32_t N> vector<T, N> CalculateIrradiance(L2_Generic<T, N> sh, vector<T, 3> normal)
			
 
				+{
			
 
				+    L2_Generic<T, N> convolved = ConvolveWithCosineLobe(sh);
			
 
				+    return Evaluate(convolved, normal);
			
 
				+}
			
 
				+
			
 
				+// Calculates the irradiance from a set of L1 SH coeffecients using the non-linear fit from [1]
			
 
				+// Note that this does not scale the irradiance by 1 / Pi: if using this result for Lambertian diffuse,
			
 
				+// you will want to include the divide-by-pi that's part of the Lambertian BRDF.
			
 
				+// For example: float3 diffuse = CalculateIrradianceGeomerics(sh, normal) * diffuseAlbedo / Pi;
			
 
				+template<typename T, int32_t N> vector<T, N> CalculateIrradianceGeomerics(L1_Generic<T, N> sh, vector<T, 3> normal)
			
 
				+{
			
 
				+    vector<T, N> result = T(0.0);
			
 
				+
			
 
				+    for(int32_t i = 0; i < N; ++i)
			
 
				+    {
			
 
				+        T R0 = max(sh.C[0][i], T(0.00001));
			
 
				+
			
 
				+        vector<T, 3> R1 = vector<T, 1>(0.5) * vector<T, 3>(sh.C[3][i], sh.C[1][i], sh.C[2][i]);
			
 
				+        T lenR1 = max(length(R1), T(0.00001));
			
 
				+
			
 
				+        T q = T(0.5) * (T(1.0) + dot(R1 / lenR1, normal));
			
 
				+
			
 
				+        T p = T(1.0) + T(2.0) * lenR1 / R0;
			
 
				+        T a = (T(1.0) - lenR1 / R0) / (T(1.0) + lenR1 / R0);
			
 
				+
			
 
				+        result[i] = R0 * (a + (T(1.0) - a) * (p + T(1.0)) * pow(abs(q), p));
			
 
				+    }
			
 
				+
			
 
				+    return result;
			
 
				+}
			
 
				+
			
 
				+// Calculates the irradiance from a set of L1 SH coefficientions by 'hallucinating" L3 zonal harmonics. See [4].
			
 
				+template<typename T, int32_t N> vector<T, N> CalculateIrradianceL1ZH3Hallucinate(L1_Generic<T, N> sh, vector<T, 3> normal)
			
 
				+{
			
 
				+    const vector<T, 3> lumCoefficients = vector<T, 3>(0.2126, 0.7152, 0.0722);
			
 
				+    const vector<T, 3> zonalAxis = normalize(vector<T, 3>(dot((vector<T, 3>)sh.C[3], lumCoefficients), dot((vector<T, 3>)sh.C[1], lumCoefficients), dot((vector<T, 3>)sh.C[2], lumCoefficients)));
			
 
				+
			
 
				+    vector<T, N> ratio;
			
 
				+    for(int32_t i = 0; i < N; ++i)
			
 
				+        ratio[i] = abs(dot(vector<T, 3>(sh.C[3][i], sh.C[1][i], sh.C[2][i]), zonalAxis)) / sh.C[0][i];
			
 
				+
			
 
				+    const vector<T, N> zonalL2Coeff = sh.C[0] * (T(0.08) * ratio + T(0.6) * ratio * ratio);
			
 
				+
			
 
				+    const T fZ = dot(zonalAxis, normal);
			
 
				+    const T zhDir = sqrt(T(5.0) / (T(16.0) * T(Pi))) * (T(3.0) * fZ * fZ - T(1.0));
			
 
				+
			
 
				+    const vector<T, N> baseIrradiance = CalculateIrradiance(sh, normal);
			
 
				+
			
 
				+    return baseIrradiance + (T(Pi * 0.25) * zonalL2Coeff * zhDir);
			
 
				+}
			
 
				+
			
 
				+// Approximates a GGX lobe with a given roughness/alpha as L1 zonal harmonics, using a fitted curve
			
 
				+template<typename T> vector<T, 2> ApproximateGGXAsL1ZH(T ggxAlpha)
			
 
				+{
			
 
				+    const T l1Scale = T(1.66711256633276) / (T(1.65715038133932) + ggxAlpha);
			
 
				+    return vector<T, 2>(1.0, l1Scale);
			
 
				+}
			
 
				+
			
 
				+// Approximates a GGX lobe with a given roughness/alpha as L2 zonal harmonics, using a fitted curve
			
 
				+template<typename T> vector<T, 3> ApproximateGGXAsL2ZH(T ggxAlpha)
			
 
				+{
			
 
				+    const T l1Scale = T(1.66711256633276) / (T(1.65715038133932) + ggxAlpha);
			
 
				+    const T l2Scale = T(1.56127990596116) / (T(0.96989757593282) + ggxAlpha) - T(0.599972342361123);
			
 
				+    return vector<T, 3>(1.0, l1Scale, l2Scale);
			
 
				+}
			
 
				+
			
 
				+// Convolves a set of L1 SH coefficients with a GGX lobe for a given roughness/alpha
			
 
				+template<typename T, int32_t N> L1_Generic<T, N> ConvolveWithGGX(L1_Generic<T, N> sh, T ggxAlpha)
			
 
				+{
			
 
				+    return ConvolveWithZH(sh, ApproximateGGXAsL1ZH(ggxAlpha));
			
 
				+}
			
 
				+
			
 
				+// Convolves a set of L2 SH coefficients with a GGX lobe for a given roughness/alpha
			
 
				+template<typename T, int32_t N> L2_Generic<T, N> ConvolveWithGGX(L2_Generic<T, N> sh, T ggxAlpha)
			
 
				+{
			
 
				+    return ConvolveWithZH(sh, ApproximateGGXAsL2ZH(ggxAlpha));
			
 
				+}
			
 
				+
			
 
				+// Given a set of L1 SH coefficients represnting incoming radiance, determines a directional light
			
 
				+// direction, color, and modified roughness value that can be used to compute an approximate specular term. See [5]
			
 
				+template<typename T, int32_t N> void ExtractSpecularDirLight(L1_Generic<T, N> shRadiance, T sqrtRoughness, out vector<T, 3> lightDir, out vector<T, N> lightColor, out T modifiedSqrtRoughness)
			
 
				+{
			
 
				+    vector<T, 3> avgL1 = vector<T, 3>(dot(shRadiance.C[3] / shRadiance.C[0], 0.333f), dot(shRadiance.C[1] / shRadiance.C[0], 0.333f), dot(shRadiance.C[2] / shRadiance.C[0], 0.333f));
			
 
				+    avgL1 *= T(0.5);
			
 
				+    T avgL1len = length(avgL1);
			
 
				+
			
 
				+    lightDir = avgL1 / avgL1len;
			
 
				+    lightColor = Evaluate(shRadiance, lightDir) * T(Pi);
			
 
				+    modifiedSqrtRoughness = saturate(sqrtRoughness / sqrt(avgL1len));
			
 
				+}
			
 
				+
			
 
				+// Rotates a set of L1 coefficients by a rotation matrix. Adapted from DirectX::XMSHRotate [3]
			
 
				+template<typename T, int32_t N> L1_Generic<T, N> Rotate(L1_Generic<T, N> sh, float3x3 rotation)
			
 
				+{
			
 
				+    L1_Generic<T, N> result;
			
 
				+
			
 
				+    // L0
			
 
				+    result.C[0] = sh.C[0];
			
 
				+
			
 
				+    // L1
			
 
				+    [unroll]
			
 
				+    for(uint i = 0; i < N; ++i)
			
 
				+    {
			
 
				+        vector<T, 3> dir = vector<T, 3>(sh.C[3][i], sh.C[1][i], sh.C[2][i]);
			
 
				+        dir = vector<T, 3>(mul(dir, rotation));
			
 
				+        result.C[3][i] = dir.x;
			
 
				+        result.C[1][i] = dir.y;
			
 
				+        result.C[2][i] = dir.z;
			
 
				+    }
			
 
				+
			
 
				+    return result;
			
 
				+}
			
 
				+
			
 
				+// Rotates a set of L2 coefficients by a rotation matrix. Adapted from DirectX::XMSHRotate [3]
			
 
				+template<typename T, int32_t N> L2_Generic<T, N> Rotate(L2_Generic<T, N> sh, float3x3 rotation)
			
 
				+{
			
 
				+    // The basis vectors used in DXSH are slightly different than ours,
			
 
				+    // the X and Z are flipped relative to what's used above in ProjectOntoL1/L2.
			
 
				+    // Hence there are several negations here to adapt the code work for us.
			
 
				+    const float32_t r00 = rotation._m00;
			
 
				+    const float32_t r10 = rotation._m01;
			
 
				+    const float32_t r20 = -rotation._m02;
			
 
				+
			
 
				+    const float32_t r01 = rotation._m10;
			
 
				+    const float32_t r11 = rotation._m11;
			
 
				+    const float32_t r21 = -rotation._m12;
			
 
				+
			
 
				+    const float32_t r02 = -rotation._m20;
			
 
				+    const float32_t r12 = -rotation._m21;
			
 
				+    const float32_t r22 = rotation._m22;
			
 
				+
			
 
				+    L2_Generic<T, N> result;
			
 
				+
			
 
				+    // L0
			
 
				+    result.C[0] = sh.C[0];
			
 
				+
			
 
				+    // L1
			
 
				+    result.C[1] = vector<T, N>(r11 * sh.C[1] - r12 * sh.C[2] + r10 * sh.C[3]);
			
 
				+    result.C[2] = vector<T, N>(-r21 * sh.C[1] + r22 * sh.C[2] - r20 * sh.C[3]);
			
 
				+    result.C[3] = vector<T, N>(r01 * sh.C[1] - r02 * sh.C[2] + r00 * sh.C[3]);
			
 
				+
			
 
				+    // L2
			
 
				+    const float32_t t41 = r01 * r00;
			
 
				+    const float32_t t43 = r11 * r10;
			
 
				+    const float32_t t48 = r11 * r12;
			
 
				+    const float32_t t50 = r01 * r02;
			
 
				+    const float32_t t55 = r02 * r02;
			
 
				+    const float32_t t57 = r22 * r22;
			
 
				+    const float32_t t58 = r12 * r12;
			
 
				+    const float32_t t61 = r00 * r02;
			
 
				+    const float32_t t63 = r10 * r12;
			
 
				+    const float32_t t68 = r10 * r10;
			
 
				+    const float32_t t70 = r01 * r01;
			
 
				+    const float32_t t72 = r11 * r11;
			
 
				+    const float32_t t74 = r00 * r00;
			
 
				+    const float32_t t76 = r21 * r21;
			
 
				+    const float32_t t78 = r20 * r20;
			
 
				+
			
 
				+    const float32_t v173 = 0.1732050808e1f;
			
 
				+    const float32_t v577 = 0.5773502693e0f;
			
 
				+    const float32_t v115 = 0.1154700539e1f;
			
 
				+    const float32_t v288 = 0.2886751347e0f;
			
 
				+    const float32_t v866 = 0.8660254040e0f;
			
 
				+
			
 
				+    float32_t r[25];
			
 
				+    r[0] = r11 * r00 + r01 * r10;
			
 
				+    r[1] = -r01 * r12 - r11 * r02;
			
 
				+    r[2] =  v173 * r02 * r12;
			
 
				+    r[3] = -r10 * r02 - r00 * r12;
			
 
				+    r[4] = r00 * r10 - r01 * r11;
			
 
				+    r[5] = - r11 * r20 - r21 * r10;
			
 
				+    r[6] = r11 * r22 + r21 * r12;
			
 
				+    r[7] = -v173 * r22 * r12;
			
 
				+    r[8] = r20 * r12 + r10 * r22;
			
 
				+    r[9] = -r10 * r20 + r11 * r21;
			
 
				+    r[10] = -v577 * (t41 + t43) + v115 * r21 * r20;
			
 
				+    r[11] = v577 * (t48 + t50) - v115 * r21 * r22;
			
 
				+    r[12] = -0.5f * (t55 + t58) + t57;
			
 
				+    r[13] = v577 * (t61 + t63) - v115 * r20 * r22;
			
 
				+    r[14] =  v288 * (t70 - t68 + t72 - t74) - v577 * (t76 - t78);
			
 
				+    r[15] = -r01 * r20 -  r21 * r00;
			
 
				+    r[16] = r01 * r22 + r21 * r02;
			
 
				+    r[17] = -v173 * r22 * r02;
			
 
				+    r[18] = r00 * r22 + r20 * r02;
			
 
				+    r[19] = -r00 * r20 + r01 * r21;
			
 
				+    r[20] = t41 - t43;
			
 
				+    r[21] = -t50 + t48;
			
 
				+    r[22] =  v866 * (t55 - t58);
			
 
				+    r[23] = t63 - t61;
			
 
				+    r[24] = 0.5f * (t74 - t68 - t70 +  t72);
			
 
				+
			
 
				+    for(int32_t i = 0; i < 5; ++i)
			
 
				+    {
			
 
				+        const int32_t base = i * 5;
			
 
				+        result.C[4 + i] = vector<T, N>(r[base + 0] * sh.C[4] + r[base + 1] * sh.C[5] +
			
 
				+                                       r[base + 2] * sh.C[6] + r[base + 3] * sh.C[7] +
			
 
				+                                       r[base + 4] * sh.C[8]);
			
 
				+    }
			
 
				+
			
 
				+    return result;
			
 
				+}
			
 
				+
			
 
				+} // namespace SH
			
 
				+
			
 
				+// References:
			
 
				+//
			
 
				+// [0] Stupid SH Tricks by Peter-Pike Sloan - https://www.ppsloan.org/publications/StupidSH36.pdf
			
 
				+// [1] Converting SH Radiance to Irradiance by Graham Hazel - https://grahamhazel.com/blog/2017/12/22/converting-sh-radiance-to-irradiance/
			
 
				+// [2] An Efficient Representation for Irradiance Environment Maps by Ravi Ramamoorthi and Pat Hanrahan - https://cseweb.ucsd.edu/~ravir/6998/papers/envmap.pdf
			
 
				+// [3] SHMath by Chuck Walbourn (originally written by Peter-Pike Sloan) - https://walbourn.github.io/spherical-harmonics-math/
			
 
				+// [4] ZH3: Quadratic Zonal Harmonics by Thomas Roughton, Peter-Pike Sloan, Ari Silvennoinen, Michal Iwanicki, and Peter Shirley - https://torust.me/ZH3.pdf
			
 
				+// [5] Precomputed Global Illumination in Frostbite by Yuriy O'Donnell - https://www.ea.com/frostbite/news/precomputed-global-illumination-in-frostbite
			
 
				+
			
 
				+#endif // SH_HLSLI_