Bläddra i källkod

Convert some VRS shaders to HLSL

Panagiotis Christopoulos Charitos 2 år sedan
förälder
incheckning
35e29d3b5e

+ 71 - 64
AnKi/Shaders/VrsSriGenerationCompute.ankiprog

@@ -3,90 +3,98 @@
 // Code licensed under the BSD License.
 // http://www.anki3d.org/LICENSE
 
+#pragma anki hlsl
+
 #pragma anki mutator SRI_TEXEL_DIMENSION 8 16
 #pragma anki mutator SHARED_MEMORY 0 1
 #pragma anki mutator LIMIT_RATE_TO_2X2 0 1
 
 #pragma anki start comp
 
-#include <AnKi/Shaders/Functions.glsl>
-#include <AnKi/Shaders/TonemappingFunctions.glsl>
+#include <AnKi/Shaders/Functions.hlsl>
+#include <AnKi/Shaders/TonemappingFunctions.hlsl>
 
 // Find the maximum luma derivative in x and y, relative to the average luma of the block.
 // Each thread handles a 2x2 region when using 8x8 VRS tiles and a 2x4 region when using 16x16 VRS tiles.
 
-layout(set = 0, binding = 0) uniform ANKI_RP texture2D u_inputTex;
-layout(set = 0, binding = 1) uniform sampler u_nearestClampSampler;
+// Texture read by sampleLuma() and the sampler it is read with.
+[[vk::binding(0)]] Texture2D<RVec4> g_inputTex;
+[[vk::binding(1)]] SamplerState g_nearestClampSampler;
 
 #if SRI_TEXEL_DIMENSION == 8
-const UVec2 kRegionSize = UVec2(2u, 2u);
+// Texels handled per thread: 2x2 when SRI_TEXEL_DIMENSION is 8.
+#	define REGION_SIZE_X 2
+#	define REGION_SIZE_Y 2
 #else
-const UVec2 kRegionSize = UVec2(2u, 4u);
+// Texels handled per thread: 2x4 otherwise (the 16 mutation of SRI_TEXEL_DIMENSION).
+#	define REGION_SIZE_X 2
+#	define REGION_SIZE_Y 4
 #endif
 
-const UVec2 kWorkgroupSize = UVec2(SRI_TEXEL_DIMENSION) / kRegionSize;
-layout(local_size_x = kWorkgroupSize.x, local_size_y = kWorkgroupSize.y, local_size_z = 1) in;
+// Threads per group, chosen so that one group spans SRI_TEXEL_DIMENSION texels in each direction.
+#define THREADGROUP_SIZE_X (SRI_TEXEL_DIMENSION / REGION_SIZE_X)
+#define THREADGROUP_SIZE_Y (SRI_TEXEL_DIMENSION / REGION_SIZE_Y)
 
-layout(set = 0, binding = 2) uniform writeonly uimage2D u_sriImg;
+// Output image. main() stores one encoded rate per threadgroup at svGroupID.xy.
+[[vk::binding(2)]] RWTexture2D<U32> g_sriUav;
 
-layout(push_constant, std430) uniform b_pc
+// Push constant block of the compute stage, accessed through g_unis.
+struct Uniforms
 {
-	Vec2 u_oneOverViewportSize;
-	F32 u_threshold;
-	F32 u_padding0;
+	Vec2 m_oneOverViewportSize; // Scales (texel coordinate + 0.5) to a UV in main().
+	F32 m_threshold; // Used as threshold1 in main(); threshold2 is derived from it.
+	F32 m_padding0; // Not read by the visible shader code.
 };
 
+[[vk::push_constant]] ConstantBuffer<Uniforms> g_unis;
+
 #if SHARED_MEMORY
 // Ideally, we'd be able to calculate the min/max/average using subgroup operations, but there's no guarantee
 // subgroupSize is large enough so we need shared memory as a fallback. We need gl_NumSubgroups entries, but it is not a
 // constant, so estimate it assuming a subgroupSize of at least 8.
-const U32 kSharedMemoryEntries = kWorkgroupSize.x * kWorkgroupSize.y / 8u;
-shared F32 s_averageLuma[kSharedMemoryEntries];
-shared Vec2 s_maxDerivative[kSharedMemoryEntries];
+// HLSL terms for the comment above: a subgroup is a wave and subgroupSize is WaveGetLaneCount(). HLSL has no direct
+// counterpart of gl_NumSubgroups, so the number of waves per threadgroup has to be derived.
+// The arrays hold one slot per wave: threads per group divided by the assumed minimum of 8 lanes.
+constexpr U32 kSharedMemoryEntries = THREADGROUP_SIZE_X * THREADGROUP_SIZE_Y / 8u;
+groupshared RF32 s_averageLuma[kSharedMemoryEntries];
+groupshared RVec2 s_maxDerivative[kSharedMemoryEntries];
 #endif
 
-F32 computeLuma(Vec3 color)
+// Returns the luminance of color, as given by computeLuminance(), remapped with l / (1 + l).
+RF32 computeLuma(RVec3 color)
 {
-	const F32 l = computeLuminance(color);
+	const RF32 l = computeLuminance(color);
 	return l / (1.0f + l);
 }
 
+// Samples g_inputTex at mip 0 with an integer texel offset and returns computeLuma() of the xyz channels.
+// Expects a variable named uv to be in scope at the point of use.
 #define sampleLuma(offsetX, offsetY) \
-	computeLuma(textureLodOffset(sampler2D(u_inputTex, u_nearestClampSampler), uv, 0.0, IVec2(offsetX, offsetY)).xyz)
+	computeLuma(g_inputTex.SampleLevel(g_nearestClampSampler, uv, 0.0, IVec2(offsetX, offsetY)).xyz)
 
-void main()
+// Compute entry point. Every thread samples luma in and around its REGION_SIZE_X x REGION_SIZE_Y texels, the
+// threadgroup reduces the maximum luma derivatives and the luma sum, and the thread with svGroupIndex == 0 stores one
+// encoded shading rate at g_sriUav[svGroupID.xy].
+[numthreads(THREADGROUP_SIZE_X, THREADGROUP_SIZE_Y, 1)] void
+main(UVec3 svDispatchThreadId : SV_DISPATCHTHREADID, U32 svGroupIndex : SV_GROUPINDEX, UVec3 svGroupID : SV_GROUPID)
 {
-	const Vec2 uv = (Vec2(gl_GlobalInvocationID.xy) * Vec2(kRegionSize) + 0.5) * u_oneOverViewportSize;
+	const Vec2 uv =
+		(Vec2(svDispatchThreadId.xy) * Vec2(REGION_SIZE_X, REGION_SIZE_Y) + 0.5) * g_unis.m_oneOverViewportSize;
 
 #if SRI_TEXEL_DIMENSION == 8
 	// Get luminance.
 	//       l1.y
 	// l0.z  l0.w  l1.x
 	// l0.x  l0.y
-	Vec4 l0;
+	RVec4 l0;
 	l0.x = sampleLuma(0, 0);
 	l0.y = sampleLuma(1, 0);
 	l0.z = sampleLuma(0, 1);
 	l0.w = sampleLuma(1, 1);
 
-	Vec2 l1;
+	RVec2 l1;
 	l1.x = sampleLuma(2, 1);
 	l1.y = sampleLuma(1, 2);
 
 	// Calculate derivatives.
-	Vec2 a = Vec2(l0.y, l1.x);
-	Vec2 b = Vec2(l0.x, l0.w);
-	const Vec2 dx = abs(a - b);
+	RVec2 a = RVec2(l0.y, l1.x);
+	RVec2 b = RVec2(l0.x, l0.w);
+	const RVec2 dx = abs(a - b);
 
-	a = Vec2(l0.z, l1.y);
-	b = Vec2(l0.x, l0.w);
-	const Vec2 dy = abs(a - b);
+	a = RVec2(l0.z, l1.y);
+	b = RVec2(l0.x, l0.w);
+	const RVec2 dy = abs(a - b);
 
-	F32 maxDerivativeX = max(dx.x, dx.y);
-	F32 maxDerivativeY = max(dy.x, dy.y);
+	RF32 maxDerivativeX = max(dx.x, dx.y);
+	RF32 maxDerivativeY = max(dy.x, dy.y);
 
 	// Calculate average luma.
-	F32 averageLuma = (l0.x + l0.y + l0.z + l0.w) / 4.0;
+	RF32 averageLuma = (l0.x + l0.y + l0.z + l0.w) / 4.0;
 #else
 	// Get luminance.
 	//             l2.z
@@ -94,90 +102,89 @@ void main()
 	//       l1.x  l1.y
 	//       l0.z  l0.w  l2.x
 	//       l0.x  l0.y
-	Vec4 l0;
+	RVec4 l0;
 	l0.x = sampleLuma(0, 0);
 	l0.y = sampleLuma(1, 0);
 	l0.z = sampleLuma(0, 1);
 	l0.w = sampleLuma(1, 1);
 
-	Vec4 l1;
+	RVec4 l1;
 	l1.x = sampleLuma(0, 2);
 	l1.y = sampleLuma(1, 2);
 	l1.z = sampleLuma(0, 3);
 	l1.w = sampleLuma(1, 3);
 
-	Vec3 l2;
+	RVec3 l2;
 	l2.x = sampleLuma(2, 1);
 	l2.y = sampleLuma(-1, 3);
 	l2.z = sampleLuma(1, 4);
 
 	// Calculate derivatives.
-	Vec4 a = Vec4(l0.y, l2.x, l1.y, l2.y);
-	Vec4 b = Vec4(l0.x, l0.w, l1.x, l1.z);
-	const Vec4 dx = abs(a - b);
+	RVec4 a = RVec4(l0.y, l2.x, l1.y, l2.y);
+	RVec4 b = RVec4(l0.x, l0.w, l1.x, l1.z);
+	const RVec4 dx = abs(a - b);
 
-	a = Vec4(l0.z, l0.w, l1.z, l2.z);
-	b = Vec4(l0.x, l0.y, l1.x, l1.w);
-	const Vec4 dy = abs(a - b);
+	a = RVec4(l0.z, l0.w, l1.z, l2.z);
+	b = RVec4(l0.x, l0.y, l1.x, l1.w);
+	const RVec4 dy = abs(a - b);
 
-	F32 maxDerivativeX = max(max(dx.x, dx.y), max(dx.z, dx.w));
-	F32 maxDerivativeY = max(max(dy.x, dy.y), max(dy.z, dy.w));
+	RF32 maxDerivativeX = max(max(dx.x, dx.y), max(dx.z, dx.w));
+	RF32 maxDerivativeY = max(max(dy.x, dy.y), max(dy.z, dy.w));
 
 	// Calculate average luma.
-	const Vec4 sumL0L1 = l0 + l1;
-	F32 averageLuma = (sumL0L1.x + sumL0L1.y + sumL0L1.z + sumL0L1.w) / 8.0;
+	const RVec4 sumL0L1 = l0 + l1;
+	RF32 averageLuma = (sumL0L1.x + sumL0L1.y + sumL0L1.z + sumL0L1.w) / 8.0;
 #endif
 
 	// Share values in subgroup.
-	maxDerivativeX = subgroupMax(maxDerivativeX);
-	maxDerivativeY = subgroupMax(maxDerivativeY);
-	averageLuma = subgroupAdd(averageLuma);
+	maxDerivativeX = WaveActiveMax(maxDerivativeX);
+	maxDerivativeY = WaveActiveMax(maxDerivativeY);
+	averageLuma = WaveActiveSum(averageLuma);
 
 #if SHARED_MEMORY
 	// Store results in shared memory.
-	[branch] if(subgroupElect())
+	[branch] if(WaveIsFirstLane())
 	{
-		s_averageLuma[gl_SubgroupID] = averageLuma;
-		s_maxDerivative[gl_SubgroupID] = Vec2(maxDerivativeX, maxDerivativeY);
+		// Each wave needs its own slot (what gl_SubgroupID gave in GLSL). WaveGetLaneIndex() is the lane index inside
+		// the wave, not the index of the wave, so it must not be used here.
+		// NOTE(review): assumes lanes are packed into waves in svGroupIndex order - confirm for the target drivers.
+		const U32 waveIdx = svGroupIndex / WaveGetLaneCount();
+		s_averageLuma[waveIdx] = averageLuma;
+		s_maxDerivative[waveIdx] = RVec2(maxDerivativeX, maxDerivativeY);
 	}
 
-	memoryBarrierShared();
-	barrier();
+	GroupMemoryBarrierWithGroupSync();
 #endif
 
 	// Write the result
-	[branch] if(gl_LocalInvocationIndex == 0u)
+	[branch] if(svGroupIndex == 0u)
 	{
 		// Get max across all subgroups.
 #if SHARED_MEMORY
 		averageLuma = s_averageLuma[0];
-		Vec2 maxDerivative = s_maxDerivative[0];
+		RVec2 maxDerivative = s_maxDerivative[0];
 
-		for(U32 i = 1u; i < gl_NumSubgroups; ++i)
+		// Number of waves in the threadgroup (gl_NumSubgroups in GLSL), rounded up. WaveGetLaneCount() alone is the
+		// number of lanes per wave and would index past kSharedMemoryEntries.
+		const U32 waveCount =
+			(U32(THREADGROUP_SIZE_X * THREADGROUP_SIZE_Y) + WaveGetLaneCount() - 1u) / WaveGetLaneCount();
+		for(U32 i = 1u; i < waveCount; ++i)
 		{
 			averageLuma += s_averageLuma[i];
 			maxDerivative = max(maxDerivative, s_maxDerivative[i]);
 		}
 #else
-		const Vec2 maxDerivative = Vec2(maxDerivativeX, maxDerivativeY);
+		const RVec2 maxDerivative = RVec2(maxDerivativeX, maxDerivativeY);
 #endif
 
 		// Determine shading rate.
-		const F32 avgLuma = averageLuma / F32(kWorkgroupSize.x * kWorkgroupSize.y);
-		const Vec2 lumaDiff = maxDerivative / avgLuma;
-		const F32 threshold1 = u_threshold;
-		const F32 threshold2 = threshold1 * 0.4;
+		const RF32 avgLuma = averageLuma / RF32(THREADGROUP_SIZE_X * THREADGROUP_SIZE_Y);
+		const RVec2 lumaDiff = maxDerivative / avgLuma;
+		const RF32 threshold1 = g_unis.m_threshold;
+		const RF32 threshold2 = threshold1 * 0.4;
 
 		UVec2 rate;
 		rate.x = (lumaDiff.x > threshold1) ? 1u : ((lumaDiff.x > threshold2) ? 2u : 4u);
 		rate.y = (lumaDiff.y > threshold1) ? 1u : ((lumaDiff.y > threshold2) ? 2u : 4u);
 
 #if LIMIT_RATE_TO_2X2
-		rate = min(rate, UVec2(2u));
+		rate = min(rate, UVec2(2, 2));
 #endif
 
-		const UVec2 outTexelCoord = gl_WorkGroupID.xy;
-		imageStore(u_sriImg, IVec2(outTexelCoord), UVec4(encodeVrsRate(rate)));
+		const UVec2 outTexelCoord = svGroupID.xy;
+		g_sriUav[outTexelCoord] = encodeVrsRate(rate);
 	}
 }
 

+ 9 - 10
AnKi/Shaders/VrsSriVisualizeRenderTarget.ankiprog

@@ -3,23 +3,22 @@
 // Code licensed under the BSD License.
 // http://www.anki3d.org/LICENSE
 
+#pragma anki hlsl
+
 #pragma anki start vert
-#include <AnKi/Shaders/QuadVert.glsl>
+#include <AnKi/Shaders/QuadVert.hlsl>
 #pragma anki end
 
 #pragma anki start frag
-#include <AnKi/Shaders/Functions.glsl>
-
-layout(set = 0, binding = 0) uniform sampler u_nearestAnyClampSampler;
-layout(set = 0, binding = 1) uniform utexture2D u_inTex;
+#include <AnKi/Shaders/Functions.hlsl>
 
-layout(location = 0) in Vec2 in_uv;
-layout(location = 0) out Vec3 out_color;
+// Sampler and U32 texture read by main().
+[[vk::binding(0)]] SamplerState g_nearestAnyClampSampler;
+[[vk::binding(1)]] Texture2D<U32> g_inTex;
 
-void main()
+// Fragment entry point: reads the texel at uv from mip 0, decodes it with decodeVrsRate() and returns the color
+// produced by visualizeVrsRate().
+Vec3 main(Vec2 uv : TEXCOORD) : SV_TARGET0
 {
-	const U32 texel = textureLod(u_inTex, u_nearestAnyClampSampler, in_uv, 0.0).x;
+	const U32 texel = g_inTex.SampleLevel(g_nearestAnyClampSampler, uv, 0.0);
 	const UVec2 rate = decodeVrsRate(texel);
-	out_color = visualizeVrsRate(rate);
+	return visualizeVrsRate(rate);
 }
 #pragma anki end