Browse Source

Shader refactoring

Panagiotis Christopoulos Charitos 1 year ago
parent
commit
c127b78603

+ 1 - 0
AnKi/ShaderCompiler/Dxc.cpp

@@ -90,6 +90,7 @@ Error compileHlslToSpirv(CString src, ShaderType shaderType, Bool compileWith16b
 	dxcArgs.emplaceBack("-Wundef");
 	dxcArgs.emplaceBack("-Wno-unused-const-variable");
 	dxcArgs.emplaceBack("-Wno-unused-parameter");
+	dxcArgs.emplaceBack("-Wno-unneeded-internal-declaration");
 	dxcArgs.emplaceBack("-HV");
 	dxcArgs.emplaceBack("2021");
 	dxcArgs.emplaceBack("-E");

+ 1 - 1
AnKi/Shaders/Bloom.ankiprog

@@ -14,7 +14,7 @@ constexpr U32 kTonemappingBinding = 2u;
 
 struct Constants
 {
-	F32 m_threshold;
+	RF32 m_threshold;
 	F32 m_scale;
 	F32 m_padding0;
 	F32 m_padding1;

+ 1 - 1
AnKi/Shaders/FinalComposite.ankiprog

@@ -72,7 +72,7 @@ RVec3 main([[vk::location(0)]] Vec2 uv : TEXCOORD) : SV_TARGET0
 
 #if FILM_GRAIN
 	const F32 dt = 1.0;
-	outColor = filmGrain(outColor, uv, g_pc.m_filmGrainStrength, F32(g_pc.m_frameCount % 0xFFFFu) * dt);
+	outColor = filmGrain<F32>(outColor, uv, g_pc.m_filmGrainStrength, F32(g_pc.m_frameCount % 0xFFFFu) * dt);
 #endif
 
 #if DBG_ENABLED

+ 47 - 44
AnKi/Shaders/Functions.hlsl

@@ -127,7 +127,7 @@ F32 rand(Vec2 n)
 	return 0.5 + 0.5 * frac(sin(dot(n, Vec2(12.9898, 78.233))) * 43758.5453);
 }
 
-Vec4 nearestDepthUpscale(Vec2 uv, Texture2D depthFull, Texture2D depthHalf, Texture2D colorTex, SamplerState linearAnyClampSampler,
+Vec4 nearestDepthUpscale(Vec2 uv, Texture2D<Vec4> depthFull, Texture2D<Vec4> depthHalf, Texture2D<Vec4> colorTex, SamplerState linearAnyClampSampler,
 						 Vec2 linearDepthCf, F32 depthThreshold)
 {
 	F32 fullDepth = depthFull.SampleLevel(linearAnyClampSampler, uv, 0.0).r; // Sampler not important.
@@ -293,36 +293,40 @@ Vec2 convertCubeUvsu(const Vec3 v, out U32 faceIndex)
 	return 0.5 / mag * uv + 0.5;
 }
 
-RVec3 grayScale(const RVec3 col)
+template<typename T>
+vector<T, 3> grayScale(const vector<T, 3> col)
 {
-	const F32 grey = (col.r + col.g + col.b) * (1.0 / 3.0);
-	return RVec3(grey, grey, grey);
+	const T grey = (col.r + col.g + col.b) * T(1.0 / 3.0);
+	return vector<T, 3>(grey, grey, grey);
 }
 
-Vec3 saturateColor(const Vec3 col, const F32 factor)
+template<typename T>
+vector<T, 3> saturateColor(const vector<T, 3> col, const T factor)
 {
-	const Vec3 lumCoeff = Vec3(0.2125, 0.7154, 0.0721);
-	const F32 d = dot(col, lumCoeff);
-	const Vec3 intensity = Vec3(d, d, d);
+	const vector<T, 3> lumCoeff = vector<T, 3>(0.2125, 0.7154, 0.0721);
+	const T d = dot(col, lumCoeff);
+	const vector<T, 3> intensity = vector<T, 3>(d, d, d);
 	return lerp(intensity, col, factor);
 }
 
-Vec3 gammaCorrection(Vec3 gamma, Vec3 col)
+template<typename T>
+vector<T, 3> gammaCorrection(vector<T, 3> gamma, vector<T, 3> col)
 {
-	return pow(col, 1.0 / gamma);
+	return pow(col, T(1.0) / gamma);
 }
 
 // Can use 0.15 for sharpenFactor
-Vec3 readSharpen(Texture2D tex, SamplerState sampl, Vec2 uv, F32 sharpenFactor, Bool detailed)
+template<typename T>
+vector<T, 3> readSharpen(Texture2D<vector<T, 4>> tex, SamplerState sampl, Vec2 uv, T sharpenFactor, Bool detailed)
 {
-	Vec3 col = tex.SampleLevel(sampl, uv, 0.0).rgb;
+	vector<T, 3> col = tex.SampleLevel(sampl, uv, 0.0).rgb;
 
-	Vec3 col2 = tex.SampleLevel(sampl, uv, 0.0, IVec2(1, 1)).rgb;
+	vector<T, 3> col2 = tex.SampleLevel(sampl, uv, 0.0, IVec2(1, 1)).rgb;
 	col2 += tex.SampleLevel(sampl, uv, 0.0, IVec2(-1, -1)).rgb;
 	col2 += tex.SampleLevel(sampl, uv, 0.0, IVec2(1, -1)).rgb;
 	col2 += tex.SampleLevel(sampl, uv, 0.0, IVec2(-1, 1)).rgb;
 
-	F32 f = 4.0;
+	T f = 4.0;
 	if(detailed)
 	{
 		col2 += tex.SampleLevel(sampl, uv, 0.0, IVec2(0, 1)).rgb;
@@ -333,19 +337,20 @@ Vec3 readSharpen(Texture2D tex, SamplerState sampl, Vec2 uv, F32 sharpenFactor,
 		f = 8.0;
 	}
 
-	col = col * (f * sharpenFactor + 1.0) - sharpenFactor * col2;
-	return max(Vec3(0.0, 0.0, 0.0), col);
+	col = col * (f * sharpenFactor + T(1.0)) - sharpenFactor * col2;
+	return max(vector<T, 3>(0.0, 0.0, 0.0), col);
 }
 
-Vec3 readErosion(Texture2D tex, SamplerState sampl, const Vec2 uv)
+template<typename T>
+vector<T, 3> readErosion(Texture2D<vector<T, 4>> tex, SamplerState sampl, const Vec2 uv)
 {
-	Vec3 minValue = tex.SampleLevel(sampl, uv, 0.0).rgb;
+	vector<T, 3> minValue = tex.SampleLevel(sampl, uv, 0.0).rgb;
 
 #define ANKI_EROSION(x, y) \
 	col2 = tex.SampleLevel(sampl, uv, 0.0, IVec2(x, y)).rgb; \
 	minValue = min(col2, minValue);
 
-	Vec3 col2;
+	vector<T, 3> col2;
 	ANKI_EROSION(1, 1);
 	ANKI_EROSION(-1, -1);
 	ANKI_EROSION(1, -1);
@@ -546,18 +551,20 @@ UVec2 getOptimalGlobalInvocationId8x8Nvidia()
 #endif
 
 // Gaussian distribution function
-F32 gaussianWeight(F32 s, F32 x)
+template<typename T>
+T gaussianWeight(T s, T x)
 {
-	F32 p = 1.0 / (s * sqrt(2.0 * kPi));
-	p *= exp((x * x) / (-2.0 * s * s));
+	T p = T(1.0) / (s * sqrt(T(2.0) * kPi));
+	p *= exp((x * x) / (T(-2.0) * s * s));
 	return p;
 }
 
 // https://www.shadertoy.com/view/WsfBDf
-Vec3 animateBlueNoise(Vec3 inputBlueNoise, U32 frameIdx)
+template<typename T>
+vector<T, 3> animateBlueNoise(vector<T, 3> inputBlueNoise, U32 frameIdx)
 {
-	const F32 goldenRatioConjugate = 0.61803398875;
-	return frac(inputBlueNoise + F32(frameIdx % 64u) * goldenRatioConjugate);
+	const T goldenRatioConjugate = 0.61803398875;
+	return frac(inputBlueNoise + T(frameIdx % 64u) * goldenRatioConjugate);
 }
 
 #if ANKI_FRAGMENT_SHADER
@@ -642,33 +649,29 @@ Vec2 equirectangularMapping(Vec3 v)
 	return uv;
 }
 
-Vec3 linearToSRgb(Vec3 linearRgb)
+template<typename T>
+vector<T, 3> linearToSRgb(vector<T, 3> linearRgb)
 {
-	const F32 a = 6.10352e-5;
-	const F32 b = 1.0 / 2.4;
-	linearRgb = max(Vec3(a, a, a), linearRgb);
-	return min(linearRgb * 12.92, pow(max(linearRgb, 0.00313067), Vec3(b, b, b)) * 1.055 - 0.055);
+	constexpr T a = 6.10352e-5;
+	constexpr T b = 1.0 / 2.4;
+	linearRgb = max(vector<T, 3>(a, a, a), linearRgb);
+	return min(linearRgb * T(12.92), pow(max(linearRgb, T(0.00313067)), Vec3(b, b, b)) * T(1.055) - T(0.055));
 }
 
-Vec3 sRgbToLinear(Vec3 sRgb)
+template<typename T>
+vector<T, 3> sRgbToLinear(vector<T, 3> sRgb)
 {
-#if ANKI_GLSL
-	const bvec3 cutoff = lessThan(sRgb, Vec3(0.04045));
-	const Vec3 higher = pow((sRgb + 0.055) / 1.055, Vec3(2.4));
-	const Vec3 lower = sRgb / 12.92;
-	return mix(higher, lower, cutoff);
-#else
-	const bool3 cutoff = sRgb < Vec3(0.04045, 0.04045, 0.04045);
-	const Vec3 higher = pow((sRgb + 0.055) / 1.055, Vec3(2.4, 2.4, 2.4));
-	const Vec3 lower = sRgb / 12.92;
+	const bool3 cutoff = sRgb < vector<T, 3>(0.04045, 0.04045, 0.04045);
+	const vector<T, 3> higher = pow((sRgb + T(0.055)) / T(1.055), vector<T, 3>(2.4, 2.4, 2.4));
+	const vector<T, 3> lower = sRgb / T(12.92);
 	return lerp(higher, lower, cutoff);
-#endif
 }
 
-RVec3 filmGrain(RVec3 color, Vec2 uv, F32 strength, F32 time)
+template<typename T>
+vector<T, 3> filmGrain(vector<T, 3> color, Vec2 uv, T strength, F32 time)
 {
-	const F32 x = (uv.x + 4.0) * (uv.y + 4.0) * time;
-	const F32 grain = 1.0 - (fmod((fmod(x, 13.0) + 1.0) * (fmod(x, 123.0) + 1.0), 0.01) - 0.005) * strength;
+	const T x = (uv.x + 4.0) * (uv.y + 4.0) * time;
+	const T grain = T(1.0) - (fmod((fmod(x, T(13.0)) + T(1.0)) * (fmod(x, T(123.0)) + T(1.0)), T(0.01)) - T(0.005)) * strength;
 	return color * grain;
 }
 

+ 146 - 185
AnKi/Shaders/Include/Common.h

@@ -203,7 +203,7 @@ constexpr uint kSizeof_I64Vec4 = 32u;
 
 typedef bool Bool;
 
-#	define _ANKI_DEFINE_OPERATOR_F32_ROWS3(mat, fl, op) \
+#	define _ANKI_DEFINE_OPERATOR_SCALAR_ROWS3(mat, fl, op) \
 		mat operator op(fl f) \
 		{ \
 			mat o; \
@@ -213,7 +213,7 @@ typedef bool Bool;
 			return o; \
 		}
 
-#	define _ANKI_DEFINE_OPERATOR_F32_ROWS4(mat, fl, op) \
+#	define _ANKI_DEFINE_OPERATOR_SCALAR_ROWS4(mat, fl, op) \
 		mat operator op(fl f) \
 		{ \
 			mat o; \
@@ -246,210 +246,171 @@ typedef bool Bool;
 		}
 
 #	define _ANKI_DEFINE_ALL_OPERATORS_ROWS3(mat, fl) \
-		_ANKI_DEFINE_OPERATOR_F32_ROWS3(mat, fl, +) \
-		_ANKI_DEFINE_OPERATOR_F32_ROWS3(mat, fl, -) \
-		_ANKI_DEFINE_OPERATOR_F32_ROWS3(mat, fl, *) \
-		_ANKI_DEFINE_OPERATOR_F32_ROWS3(mat, fl, /) \
+		_ANKI_DEFINE_OPERATOR_SCALAR_ROWS3(mat, fl, +) \
+		_ANKI_DEFINE_OPERATOR_SCALAR_ROWS3(mat, fl, -) \
+		_ANKI_DEFINE_OPERATOR_SCALAR_ROWS3(mat, fl, *) \
+		_ANKI_DEFINE_OPERATOR_SCALAR_ROWS3(mat, fl, /) \
 		_ANKI_DEFINE_OPERATOR_SELF_ROWS3(mat, +) \
 		_ANKI_DEFINE_OPERATOR_SELF_ROWS3(mat, -)
 
 #	define _ANKI_DEFINE_ALL_OPERATORS_ROWS4(mat, fl) \
-		_ANKI_DEFINE_OPERATOR_F32_ROWS4(mat, fl, +) \
-		_ANKI_DEFINE_OPERATOR_F32_ROWS4(mat, fl, -) \
-		_ANKI_DEFINE_OPERATOR_F32_ROWS4(mat, fl, *) \
-		_ANKI_DEFINE_OPERATOR_F32_ROWS4(mat, fl, /) \
+		_ANKI_DEFINE_OPERATOR_SCALAR_ROWS4(mat, fl, +) \
+		_ANKI_DEFINE_OPERATOR_SCALAR_ROWS4(mat, fl, -) \
+		_ANKI_DEFINE_OPERATOR_SCALAR_ROWS4(mat, fl, *) \
+		_ANKI_DEFINE_OPERATOR_SCALAR_ROWS4(mat, fl, /) \
 		_ANKI_DEFINE_OPERATOR_SELF_ROWS4(mat, +) \
 		_ANKI_DEFINE_OPERATOR_SELF_ROWS4(mat, -)
 
-struct Mat3
-{
-	Vec3 m_row0;
-	Vec3 m_row1;
-	Vec3 m_row2;
-
-	_ANKI_DEFINE_ALL_OPERATORS_ROWS3(Mat3, F32)
-
-	void setColumns(Vec3 c0, Vec3 c1, Vec3 c2)
-	{
-		m_row0 = Vec3(c0.x, c1.x, c2.x);
-		m_row1 = Vec3(c0.y, c1.y, c2.y);
-		m_row2 = Vec3(c0.z, c1.z, c2.z);
-	}
-};
-
-struct Mat4
-{
-	Vec4 m_row0;
-	Vec4 m_row1;
-	Vec4 m_row2;
-	Vec4 m_row3;
-
-	_ANKI_DEFINE_ALL_OPERATORS_ROWS4(Mat4, F32)
-
-	Vec4 getTranslationPart()
-	{
-		return Vec4(m_row0.w, m_row1.w, m_row2.w, m_row3.w);
-	}
-
-	void setColumns(Vec4 c0, Vec4 c1, Vec4 c2, Vec4 c3)
-	{
-		m_row0 = Vec4(c0.x, c1.x, c2.x, c3.x);
-		m_row1 = Vec4(c0.y, c1.y, c2.y, c3.y);
-		m_row2 = Vec4(c0.z, c1.z, c2.z, c3.z);
-		m_row3 = Vec4(c0.w, c1.w, c2.w, c3.w);
-	}
-};
+// Mat3 "template". Not an actual template because of bugs
+#	define _ANKI_MAT3(mat, vec, scalar) \
+		struct mat \
+		{ \
+			vec m_row0; \
+			vec m_row1; \
+			vec m_row2; \
+			_ANKI_DEFINE_ALL_OPERATORS_ROWS3(mat, scalar) \
+			void setColumns(vec c0, vec c1, vec c2) \
+			{ \
+				m_row0 = vec(c0.x, c1.x, c2.x); \
+				m_row1 = vec(c0.y, c1.y, c2.y); \
+				m_row2 = vec(c0.z, c1.z, c2.z); \
+			} \
+		}; \
+		vec mul(mat m, vec v) \
+		{ \
+			const scalar a = dot(m.m_row0, v); \
+			const scalar b = dot(m.m_row1, v); \
+			const scalar c = dot(m.m_row2, v); \
+			return vec(a, b, c); \
+		} \
+		mat transpose(mat m) \
+		{ \
+			mat o; \
+			o.setColumns(m.m_row0, m.m_row1, m.m_row2); \
+			return o; \
+		}
 
-struct Mat3x4
-{
-	Vec4 m_row0;
-	Vec4 m_row1;
-	Vec4 m_row2;
+// Mat4 "template". Not an actual template because of bugs
+#	define _ANKI_MAT4(mat, vec, scalar) \
+		struct mat \
+		{ \
+			vec m_row0; \
+			vec m_row1; \
+			vec m_row2; \
+			vec m_row3; \
+			_ANKI_DEFINE_ALL_OPERATORS_ROWS4(mat, scalar) \
+			vec getTranslationPart() \
+			{ \
+				return vec(m_row0.w, m_row1.w, m_row2.w, m_row3.w); \
+			} \
+			void setColumns(vec c0, vec c1, vec c2, vec c3) \
+			{ \
+				m_row0 = vec(c0.x, c1.x, c2.x, c3.x); \
+				m_row1 = vec(c0.y, c1.y, c2.y, c3.y); \
+				m_row2 = vec(c0.z, c1.z, c2.z, c3.z); \
+				m_row3 = vec(c0.w, c1.w, c2.w, c3.w); \
+			} \
+		}; \
+		vec mul(mat m, vec v) \
+		{ \
+			const scalar a = dot(m.m_row0, v); \
+			const scalar b = dot(m.m_row1, v); \
+			const scalar c = dot(m.m_row2, v); \
+			const scalar d = dot(m.m_row3, v); \
+			return vec(a, b, c, d); \
+		} \
+		mat mul(mat a_, mat b_) \
+		{ \
+			const vec a[4] = {a_.m_row0, a_.m_row1, a_.m_row2, a_.m_row3}; \
+			const vec b[4] = {b_.m_row0, b_.m_row1, b_.m_row2, b_.m_row3}; \
+			vec c[4]; \
+			[unroll] for(U32 i = 0; i < 4; i++) \
+			{ \
+				vec t1, t2; \
+				t1 = a[i][0]; \
+				t2 = b[0] * t1; \
+				t1 = a[i][1]; \
+				t2 += b[1] * t1; \
+				t1 = a[i][2]; \
+				t2 += b[2] * t1; \
+				t1 = a[i][3]; \
+				t2 += b[3] * t1; \
+				c[i] = t2; \
+			} \
+			mat o; \
+			o.m_row0 = c[0]; \
+			o.m_row1 = c[1]; \
+			o.m_row2 = c[2]; \
+			o.m_row3 = c[3]; \
+			return o; \
+		}
 
-	_ANKI_DEFINE_ALL_OPERATORS_ROWS3(Mat3x4, F32)
+// Mat3x4 "template". Not an actual template because of bugs
+#	define _ANKI_MAT3x4(mat, row, column, scalar) \
+		struct mat \
+		{ \
+			row m_row0; \
+			row m_row1; \
+			row m_row2; \
+			_ANKI_DEFINE_ALL_OPERATORS_ROWS3(mat, scalar) \
+			column getTranslationPart() \
+			{ \
+				return column(m_row0.w, m_row1.w, m_row2.w); \
+			} \
+			void setColumns(column c0, column c1, column c2, column c3) \
+			{ \
+				m_row0 = row(c0.x, c1.x, c2.x, c3.x); \
+				m_row1 = row(c0.y, c1.y, c2.y, c3.y); \
+				m_row2 = row(c0.z, c1.z, c2.z, c3.z); \
+			} \
+		}; \
+		column mul(mat m, row v) \
+		{ \
+			const scalar a = dot(m.m_row0, v); \
+			const scalar b = dot(m.m_row1, v); \
+			const scalar c = dot(m.m_row2, v); \
+			return column(a, b, c); \
+		} \
+		mat combineTransformations(mat a_, mat b_) \
+		{ \
+			const row a[3] = {a_.m_row0, a_.m_row1, a_.m_row2}; \
+			const row b[3] = {b_.m_row0, b_.m_row1, b_.m_row2}; \
+			row c[3]; \
+			[unroll] for(U32 i = 0; i < 3; i++) \
+			{ \
+				row t2; \
+				t2 = b[0] * a[i][0]; \
+				t2 += b[1] * a[i][1]; \
+				t2 += b[2] * a[i][2]; \
+				const row v4 = row(0.0f, 0.0f, 0.0f, a[i][3]); \
+				t2 += v4; \
+				c[i] = t2; \
+			} \
+			mat o; \
+			o.m_row0 = c[0]; \
+			o.m_row1 = c[1]; \
+			o.m_row2 = c[2]; \
+			return o; \
+		}
 
-	Vec3 getTranslationPart()
-	{
-		return Vec3(m_row0.w, m_row1.w, m_row2.w);
-	}
-
-	void setColumns(Vec3 c0, Vec3 c1, Vec3 c2, Vec3 c3)
-	{
-		m_row0 = Vec4(c0.x, c1.x, c2.x, c3.x);
-		m_row1 = Vec4(c0.y, c1.y, c2.y, c3.y);
-		m_row2 = Vec4(c0.z, c1.z, c2.z, c3.z);
-	}
-};
+_ANKI_MAT3(Mat3, Vec3, F32)
+_ANKI_MAT4(Mat4, Vec4, F32)
+_ANKI_MAT3x4(Mat3x4, Vec4, Vec3, F32)
 
 #	if ANKI_FORCE_FULL_FP_PRECISION
-typedef float RF32;
+	typedef float RF32;
 typedef float2 RVec2;
 typedef float3 RVec3;
 typedef float4 RVec4;
-typedef Mat3 RMat3;
+_ANKI_MAT3(RMat3, Vec3, F32)
 #	else
-typedef min16float RF32;
+	typedef min16float RF32;
 typedef min16float2 RVec2;
 typedef min16float3 RVec3;
 typedef min16float4 RVec4;
-
-struct RMat3
-{
-	RVec3 m_row0;
-	RVec3 m_row1;
-	RVec3 m_row2;
-
-	_ANKI_DEFINE_ALL_OPERATORS_ROWS3(RMat3, RF32)
-
-	void setColumns(RVec3 c0, RVec3 c1, RVec3 c2)
-	{
-		m_row0 = RVec3(c0.x, c1.x, c2.x);
-		m_row1 = RVec3(c0.y, c1.y, c2.y);
-		m_row2 = RVec3(c0.z, c1.z, c2.z);
-	}
-};
-#	endif
-
-// Matrix functions
-Vec3 mul(Mat3 m, Vec3 v)
-{
-	const F32 a = dot(m.m_row0, v);
-	const F32 b = dot(m.m_row1, v);
-	const F32 c = dot(m.m_row2, v);
-	return Vec3(a, b, c);
-}
-
-#	if !ANKI_FORCE_FULL_FP_PRECISION
-RVec3 mul(RMat3 m, RVec3 v)
-{
-	const RF32 a = dot(m.m_row0, v);
-	const RF32 b = dot(m.m_row1, v);
-	const RF32 c = dot(m.m_row2, v);
-	return RVec3(a, b, c);
-}
+_ANKI_MAT3(RMat3, RVec3, RF32)
 #	endif
 
-Vec4 mul(Mat4 m, Vec4 v)
-{
-	const F32 a = dot(m.m_row0, v);
-	const F32 b = dot(m.m_row1, v);
-	const F32 c = dot(m.m_row2, v);
-	const F32 d = dot(m.m_row3, v);
-	return Vec4(a, b, c, d);
-}
-
-Mat4 mul(Mat4 a_, Mat4 b_)
-{
-	const Vec4 a[4] = {a_.m_row0, a_.m_row1, a_.m_row2, a_.m_row3};
-	const Vec4 b[4] = {b_.m_row0, b_.m_row1, b_.m_row2, b_.m_row3};
-	Vec4 c[4];
-
-	[unroll] for(U32 i = 0; i < 4; i++)
-	{
-		Vec4 t1, t2;
-
-		t1 = a[i][0];
-		t2 = b[0] * t1;
-		t1 = a[i][1];
-		t2 += b[1] * t1;
-		t1 = a[i][2];
-		t2 += b[2] * t1;
-		t1 = a[i][3];
-		t2 += b[3] * t1;
-
-		c[i] = t2;
-	}
-
-	Mat4 o;
-	o.m_row0 = c[0];
-	o.m_row1 = c[1];
-	o.m_row2 = c[2];
-	o.m_row3 = c[3];
-	return o;
-}
-
-Vec3 mul(Mat3x4 m, Vec4 v)
-{
-	const F32 a = dot(m.m_row0, v);
-	const F32 b = dot(m.m_row1, v);
-	const F32 c = dot(m.m_row2, v);
-	return Vec3(a, b, c);
-}
-
-Mat3 transpose(Mat3 m)
-{
-	Mat3 o;
-	o.setColumns(m.m_row0, m.m_row1, m.m_row2);
-	return o;
-}
-
-Mat3x4 combineTransformations(Mat3x4 a_, Mat3x4 b_)
-{
-	const Vec4 a[3] = {a_.m_row0, a_.m_row1, a_.m_row2};
-	const Vec4 b[3] = {b_.m_row0, b_.m_row1, b_.m_row2};
-	Vec4 c[3];
-
-	[unroll] for(U32 i = 0; i < 3; i++)
-	{
-		Vec4 t2;
-
-		t2 = b[0] * a[i][0];
-		t2 += b[1] * a[i][1];
-		t2 += b[2] * a[i][2];
-
-		const Vec4 v4 = Vec4(0.0f, 0.0f, 0.0f, a[i][3]);
-		t2 += v4;
-
-		c[i] = t2;
-	}
-
-	Mat3x4 o;
-	o.m_row0 = c[0];
-	o.m_row1 = c[1];
-	o.m_row2 = c[2];
-	return o;
-}
-
 // Common constants
 constexpr F32 kEpsilonF32 = 0.000001f;
 #	if ANKI_SUPPORTS_16BIT_TYPES

+ 5 - 0
AnKi/Shaders/Intellisense.hlsl

@@ -46,6 +46,11 @@ using U32 = unsigned int;
 using F32 = float;
 using Bool = bool;
 
+template<typename T, unsigned kC>
+struct vector
+{
+};
+
 struct UVec2
 {
 	U32 x;

+ 1 - 1
AnKi/Shaders/RtShadowsDenoise.ankiprog

@@ -124,7 +124,7 @@ F32 computeVarianceCenter(Vec2 uv)
 		// F32 localWeight = calculateBilateralWeighPlane(depthCenter, depthTap, 1.0);
 		F32 localWeight = calculateBilateralWeightPlane(positionCenter, normalCenter, positionTap, normalTap, 1.0);
 
-		localWeight *= gaussianWeight(0.4, abs(F32(i)) / F32(sampleCount + 1u));
+		localWeight *= gaussianWeight(0.4f, abs(F32(i)) / F32(sampleCount + 1u));
 
 		shadowFactor += localShadowFactor * localWeight;
 

+ 5 - 3
AnKi/Shaders/Sky.ankiprog

@@ -25,6 +25,7 @@ constexpr F32 kScatteringSteps = 32.0f;
 constexpr Vec3 kGroundAlbedo = 0.3f;
 
 // From https://gamedev.stackexchange.com/questions/96459/fast-ray-sphere-collision-code.
+// The sphere is centered at the origin (0,0,0), i.e. the earth's core
 F32 rayIntersectSphere(Vec3 ro, Vec3 rd, F32 rad)
 {
 	const F32 b = dot(ro, rd);
@@ -357,9 +358,10 @@ Vec3 raymarchScattering(Vec3 pos, Vec3 rayDir, Vec3 dirToSun, F32 tMax, F32 numS
 		const Vec3 sunTransmittance = getValFromTLut(g_tLutTex, g_linearAnyClampSampler, newPos, dirToSun);
 		const Vec3 psiMS = getValFromMultiScattLut(g_mLutTex, g_linearAnyClampSampler, newPos, dirToSun);
 
-		const Vec3 rayleighInScattering = rayleighScattering * (rayleighPhaseValue * sunTransmittance + psiMS);
-		const Vec3 mieInScattering = mieScattering * (miePhaseValue * sunTransmittance + psiMS);
-		const Vec3 inScattering = (rayleighInScattering + mieInScattering);
+		const Vec3 phaseTimesScattering = rayleighScattering * rayleighPhaseValue + mieScattering * miePhaseValue;
+
+		// inScattering is the 'S' in the sebh code sample
+		const Vec3 inScattering = phaseTimesScattering * sunTransmittance + (rayleighScattering + mieScattering) * psiMS;
 
 		// Integrated scattering within path segment.
 		const Vec3 scatteringIntegral = (inScattering - inScattering * sampleTransmittance) / extinction;

+ 1 - 1
AnKi/Shaders/TemporalAA.ankiprog

@@ -104,7 +104,7 @@ FragOut main(Vec2 uv : TEXCOORD)
 #	if YCBCR
 	outColor = yCbCrToRgb(outColor);
 #	endif
-	const Vec3 tonemapped = linearToSRgb(tonemap(outColor, readExposureAndAverageLuminance().x));
+	const Vec3 tonemapped = linearToSRgb(tonemap<F32>(outColor, readExposureAndAverageLuminance().x));
 #	if ANKI_COMPUTE_SHADER
 	g_uavTex[svDispatchThreadId.xy] = RVec4(outColor, 0.0);
 	g_tonemappedUavTex[svDispatchThreadId.xy] = RVec4(tonemapped, 0.0);

+ 1 - 1
AnKi/Shaders/Tonemap.ankiprog

@@ -44,7 +44,7 @@ RVec3 main(Vec2 uv : TEXCOORD) : SV_TARGET0
 #	endif
 
 	const RVec3 hdr = g_inputRt.SampleLevel(g_nearestAnyClampSampler, uv, 0.0f).rgb;
-	const Vec3 tonemapped = linearToSRgb(tonemap(hdr, readExposureAndAverageLuminance().x));
+	const Vec3 tonemapped = linearToSRgb(tonemap<RF32>(hdr, readExposureAndAverageLuminance().x));
 
 #	if ANKI_COMPUTE_SHADER
 	g_outUav[svDispatchThreadId.xy] = RVec4(tonemapped, 0.0);

+ 1 - 1
AnKi/Shaders/TonemappingAverageLuminance.ankiprog

@@ -90,7 +90,7 @@ groupshared F32 s_avgLum[THREAD_COUNT_X * THREAD_COUNT_Y];
 		// This is a workaround because sometimes the avg lum becomes nan
 		finalAvgLum = clamp(finalAvgLum, kEpsilonF32, kMaxF32);
 
-		writeExposureAndAverageLuminance(computeExposure(finalAvgLum, 0.0), finalAvgLum);
+		writeExposureAndAverageLuminance(computeExposure(finalAvgLum, 0.0f), finalAvgLum);
 	}
 }
 #pragma anki technique_end comp

+ 52 - 37
AnKi/Shaders/TonemappingFunctions.hlsl

@@ -14,91 +14,106 @@ T log10(T x)
 	return log(x) / log((T)10.0);
 }
 
-RVec3 computeLuminance(RVec3 color)
+template<typename T>
+vector<T, 3> computeLuminance(vector<T, 3> color)
 {
-	return max(dot(RVec3(0.30, 0.59, 0.11), color), kEpsilonRF32);
+	return max(dot(vector<T, 3>(0.30, 0.59, 0.11), color), T(kEpsilonRF32));
 }
 
-RF32 computeExposure(RF32 avgLum, RF32 threshold)
+template<typename T>
+T computeExposure(T avgLum, T threshold)
 {
-	const RF32 keyValue = 1.03 - (2.0 / (2.0 + log10(avgLum + 1.0)));
-	const RF32 linearExposure = (keyValue / avgLum);
-	RF32 exposure = log2(linearExposure);
+	const T keyValue = T(1.03) - (T(2.0) / (T(2.0) + log10(avgLum + T(1.0))));
+	const T linearExposure = (keyValue / avgLum);
+	T exposure = log2(linearExposure);
 
 	exposure -= threshold;
 	return exp2(exposure);
 }
 
-RVec3 computeExposedColor(RVec3 color, RF32 avgLum, RF32 threshold)
+template<typename T>
+vector<T, 3> computeExposedColor(vector<T, 3> color, vector<T, 3> avgLum, vector<T, 3> threshold)
 {
 	return computeExposure(avgLum, threshold) * color;
 }
 
 // Uncharted 2 operator
-RF32 tonemapUncharted2(RF32 color)
+template<typename T>
+vector<T, 3> tonemapUncharted2(vector<T, 3> color)
 {
-	const RF32 A = 0.15;
-	const RF32 B = 0.50;
-	const RF32 C = 0.10;
-	const RF32 D = 0.20;
-	const RF32 E = 0.02;
-	const RF32 F = 0.30;
+	const T A = 0.15;
+	const T B = 0.50;
+	const T C = 0.10;
+	const T D = 0.20;
+	const T E = 0.02;
+	const T F = 0.30;
 
 	return ((color * (A * color + C * B) + D * E) / (color * (A * color + B) + D * F)) - E / F;
 }
 
-constexpr RF32 kAcesA = 2.51;
-constexpr RF32 kAcesB = 0.03;
-constexpr RF32 kAcesC = 2.43;
-constexpr RF32 kAcesD = 0.59;
-constexpr RF32 kAcesE = 0.14;
-
 // See ACES in action and its inverse at https://www.desmos.com/calculator/n1lkpc6hwq
-RVec3 tonemapACESFilm(RVec3 x)
+template<typename T>
+vector<T, 3> tonemapACESFilm(vector<T, 3> x)
 {
+	constexpr T kAcesA = 2.51;
+	constexpr T kAcesB = 0.03;
+	constexpr T kAcesC = 2.43;
+	constexpr T kAcesD = 0.59;
+	constexpr T kAcesE = 0.14;
+
 	return saturate((x * (kAcesA * x + kAcesB)) / (x * (kAcesC * x + kAcesD) + kAcesE));
 }
 
 // https://www.desmos.com/calculator/n1lkpc6hwq
-RVec3 invertTonemapACESFilm(RVec3 x)
+template<typename T>
+vector<T, 3> invertTonemapACESFilm(vector<T, 3> x)
 {
-	RVec3 res = kAcesD * x - kAcesB;
-	res += sqrt(x * x * (kAcesD * kAcesD - 4.0 * kAcesE * kAcesC) + x * (4.0 * kAcesE * kAcesA - 2.0 * kAcesB * kAcesD) + kAcesB * kAcesB);
-	res /= 2.0 * kAcesA - 2.0 * kAcesC * x;
+	constexpr T kAcesA = 2.51;
+	constexpr T kAcesB = 0.03;
+	constexpr T kAcesC = 2.43;
+	constexpr T kAcesD = 0.59;
+	constexpr T kAcesE = 0.14;
+
+	vector<T, 3> res = kAcesD * x - kAcesB;
+	res += sqrt(x * x * (kAcesD * kAcesD - T(4.0) * kAcesE * kAcesC) + x * (T(4.0) * kAcesE * kAcesA - T(2.0) * kAcesB * kAcesD) + kAcesB * kAcesB);
+	res /= T(2.0) * kAcesA - T(2.0) * kAcesC * x;
 
 	return res;
 }
 
-RVec3 tonemap(RVec3 color, RF32 exposure)
+template<typename T>
+vector<T, 3> tonemap(vector<T, 3> color, vector<T, 3> exposure)
 {
 	color *= exposure;
 	return tonemapACESFilm(color);
 }
 
-RVec3 invertTonemap(RVec3 color, RF32 exposure)
+template<typename T>
+vector<T, 3> invertTonemap(vector<T, 3> color, T exposure)
 {
 	color = invertTonemapACESFilm(color);
-	color /= max(kEpsilonRF32, exposure);
+	color /= max(T(kEpsilonRF32), exposure);
 	return color;
 }
 
-RVec3 tonemap(RVec3 color, RF32 avgLum, RF32 threshold)
+template<typename T>
+vector<T, 3> tonemap(vector<T, 3> color, T avgLum, T threshold)
 {
-	const RF32 exposure = computeExposure(avgLum, threshold);
-	return tonemap(color, exposure);
+	const T exposure = computeExposure(avgLum, threshold);
+	return tonemap<T>(color, exposure);
 }
 
 // https://graphicrants.blogspot.com/2013/12/tone-mapping.html
-template<typename TVec3>
-TVec3 reinhardTonemap(TVec3 colour)
+template<typename T>
+vector<T, 3> reinhardTonemap(vector<T, 3> colour)
 {
 	// rgb / (1 + max(rgb))
-	return colour / (1.0 + max(max(colour.r, colour.g), colour.b));
+	return colour / (T(1.0) + max(max(colour.r, colour.g), colour.b));
 }
 
-template<typename TVec3>
-TVec3 invertReinhardTonemap(TVec3 colour)
+template<typename T>
+vector<T, 3> invertReinhardTonemap(vector<T, 3> colour)
 {
 	// rgb / (1 - max(rgb))
-	return colour / max(1.0 / 32768.0, 1.0 - max(max(colour.r, colour.g), colour.b));
+	return colour / max(T(1.0 / 32768.0), T(1.0) - max(max(colour.r, colour.g), colour.b));
 }