Browse Source

Some mobile optimizations

Panagiotis Christopoulos Charitos 2 months ago
parent
commit
2a3e0b9dff

+ 3 - 2
AnKi/Renderer/IndirectDiffuseClipmaps.cpp

@@ -262,11 +262,12 @@ Error IndirectDiffuseClipmaps::init()
 		m_avgIrradianceVolumes[clipmap] = getRenderer().createAndClearRenderTarget(volumeInit, TextureUsageBit::kSrvCompute);
 	}
 
-	const Array<SubMutation, 5> mutation = {{{"GPU_WAVE_SIZE", MutatorValue(GrManager::getSingleton().getDeviceCapabilities().m_maxWaveSize)},
+	const Array<SubMutation, 6> mutation = {{{"GPU_WAVE_SIZE", MutatorValue(GrManager::getSingleton().getDeviceCapabilities().m_maxWaveSize)},
 											 {"RADIANCE_OCTAHEDRON_MAP_SIZE", MutatorValue(g_cvarRenderIdcRadianceOctMapSize)},
 											 {"IRRADIANCE_OCTAHEDRON_MAP_SIZE", MutatorValue(g_cvarRenderIdcIrradianceOctMapSize)},
 											 {"RT_MATERIAL_FETCH_CLIPMAP", 0},
-											 {"SPATIAL_RECONSTRUCT_TYPE", !g_cvarRenderIdcApplyHighQuality}}};
+											 {"SPATIAL_RECONSTRUCT_TYPE", !g_cvarRenderIdcApplyHighQuality},
+											 {"IRRADIANCE_USE_SH_L2", g_cvarRenderIdcUseSHL2}}};
 
 	constexpr CString kProgFname = "ShaderBinaries/IndirectDiffuseClipmaps.ankiprogbin";
 	ANKI_CHECK(loadShaderProgram(kProgFname, mutation, m_prog, m_applyGiGrProg, "Apply"));

+ 1 - 0
AnKi/Renderer/IndirectDiffuseClipmaps.h

@@ -16,6 +16,7 @@ namespace anki {
 
 ANKI_CVAR(BoolCVar, Render, Idc, false, "Enable ray traced indirect diffuse clipmaps")
 ANKI_CVAR2(BoolCVar, Render, Idc, InlineRt, false, "Use a cheap and less accurate path with inline RT");
+ANKI_CVAR2(BoolCVar, Render, Idc, UseSHL2, !ANKI_PLATFORM_MOBILE, "Use L2 SH for calculations. Else use L1");
 
 constexpr U32 kDefaultClipmapProbeCountXZ = 32;
 constexpr U32 kDefaultClipmapProbeCountY = 12;

+ 4 - 4
AnKi/Shaders/Common.hlsl

@@ -323,9 +323,9 @@ T square(T x)
 
 #define COMPUTE_ARGS \
 	U32 svGroupIndex : \
-		SV_GroupIndex, \
+		SV_GROUPINDEX, \
 		UVec3 svGroupId : \
-		SV_GroupID, \
+		SV_GROUPID, \
 		UVec3 svDispatchThreadId : \
-		SV_DispatchThreadID, \
-		UVec3 svGroupThreadId : SV_GroupThreadID
+		SV_DISPATCHTHREADID, \
+		UVec3 svGroupThreadId : SV_GROUPTHREADID

+ 2 - 2
AnKi/Shaders/Dbg.ankiprog

@@ -248,7 +248,7 @@ struct VertIn
 
 struct VertOut
 {
-	Vec4 m_svPosition : SV_Position;
+	Vec4 m_svPosition : SV_POSITION;
 	Vec4 m_color : COLOR;
 };
 
@@ -269,7 +269,7 @@ VertOut main(VertIn input)
 #	endif // ANKI_VERTEX_SHADER
 
 #	if ANKI_PIXEL_SHADER
-Vec4 main(VertOut input) : SV_Target0
+Vec4 main(VertOut input) : SV_TARGET0
 {
 	return input.m_color;
 }

+ 1 - 1
AnKi/Shaders/FillBuffer.ankiprog

@@ -18,7 +18,7 @@ struct Consts
 };
 ANKI_FAST_CONSTANTS(Consts, g_consts)
 
-[NumThreads(64, 1, 1)] void main(U32 svDispatchThreadId : SV_DispatchThreadID)
+[numthreads(64, 1, 1)] void main(U32 svDispatchThreadId : SV_DISPATCHTHREADID)
 {
 	if(svDispatchThreadId < g_consts.m_elementCount)
 	{

+ 1 - 1
AnKi/Shaders/GBufferPost.ankiprog

@@ -24,7 +24,7 @@ ConstantBuffer<GlobalRendererConstants> g_globalConstants : register(b0);
 
 SamplerState g_linearAnyClampSampler : register(s0);
 
-[NumThreads(8, 8, 1)] void main(UVec2 svDispatchThreadId : SV_DispatchThreadID)
+[numthreads(8, 8, 1)] void main(UVec2 svDispatchThreadId : SV_DISPATCHTHREADID)
 {
 	UVec2 viewportSize;
 	g_depthTex.GetDimensions(viewportSize.x, viewportSize.y);

+ 3 - 3
AnKi/Shaders/GBufferVisualizeProbe.ankiprog

@@ -35,13 +35,13 @@ ConstantBuffer<Consts> g_consts : register(b0);
 
 struct VertIn
 {
-	U32 m_svInstanceId : SV_InstanceID;
-	U32 m_svVertexId : SV_VertexID;
+	U32 m_svInstanceId : SV_INSTANCEID;
+	U32 m_svVertexId : SV_VERTEXID;
 };
 
 struct VertOut
 {
-	Vec4 m_svPosition : SV_Position;
+	Vec4 m_svPosition : SV_POSITION;
 	Vec3 m_sphereCenter : SpherePosition;
 };
 

+ 1 - 1
AnKi/Shaders/HistoryLength.ankiprog

@@ -81,7 +81,7 @@ F32 computeLength(Vec2 coord)
 }
 
 #if ANKI_COMPUTE_SHADER
-[NumThreads(64, 1, 1)] void main(COMPUTE_ARGS)
+[numthreads(64, 1, 1)] void main(COMPUTE_ARGS)
 {
 	const Vec2 coord = getOptimalDispatchThreadId8x8Amd(svGroupIndex, svGroupId.xy);
 

+ 25 - 14
AnKi/Shaders/IndirectDiffuseClipmaps.ankiprog

@@ -10,11 +10,12 @@
 #pragma anki mutator IRRADIANCE_OCTAHEDRON_MAP_SIZE 4 5 6
 #pragma anki mutator RT_MATERIAL_FETCH_CLIPMAP 0 1
 #pragma anki mutator SPATIAL_RECONSTRUCT_TYPE 0 1
+#pragma anki mutator IRRADIANCE_USE_SH_L2 0 1
 
 #pragma anki technique RtMaterialFetch rgen mutators RT_MATERIAL_FETCH_CLIPMAP SPATIAL_RECONSTRUCT_TYPE
 #pragma anki technique RtMaterialFetchInlineRt comp mutators
 #pragma anki technique PopulateCaches comp mutators RADIANCE_OCTAHEDRON_MAP_SIZE
-#pragma anki technique ComputeIrradiance comp mutators GPU_WAVE_SIZE RADIANCE_OCTAHEDRON_MAP_SIZE IRRADIANCE_OCTAHEDRON_MAP_SIZE
+#pragma anki technique ComputeIrradiance comp mutators GPU_WAVE_SIZE RADIANCE_OCTAHEDRON_MAP_SIZE IRRADIANCE_OCTAHEDRON_MAP_SIZE IRRADIANCE_USE_SH_L2
 #pragma anki technique Apply comp mutators SPATIAL_RECONSTRUCT_TYPE
 #pragma anki technique SpatialReconstruct comp mutators SPATIAL_RECONSTRUCT_TYPE
 #pragma anki technique TemporalDenoise comp mutators
@@ -179,7 +180,7 @@ ConstantBuffer<GlobalRendererConstants> g_globalRendererConstants : register(b0)
 
 ANKI_FAST_CONSTANTS(ProbeUpdateConsts, g_consts)
 
-[NumThreads(64, 1, 1)] void main(COMPUTE_ARGS)
+[numthreads(64, 1, 1)] void main(COMPUTE_ARGS)
 {
 	const IndirectDiffuseClipmapConstants idConsts = g_globalRendererConstants.m_indirectDiffuseClipmaps;
 	const U32 clipmapIdx = g_consts.m_clipmapIdx;
@@ -341,7 +342,17 @@ RWTexture3D<Vec4> g_avgIrradianceVolumes[kIndirectDiffuseClipmapCount] : registe
 
 ConstantBuffer<GlobalRendererConstants> g_globalRendererConstants : register(b0);
 
-groupshared SH::L2_F16_RGB g_sh[kThreadCount];
+#	if IRRADIANCE_USE_SH_L2
+#		define SH_TYPE SH::L2_F16_RGB
+#		define SH_PROJECT_ONTO_LX SH::ProjectOntoL2
+#		define SH_TO_L1(x) SH::L2toL1(x)
+#	else
+#		define SH_TYPE SH::L1_F16_RGB
+#		define SH_PROJECT_ONTO_LX SH::ProjectOntoL1
+#		define SH_TO_L1(x) (x)
+#	endif
+
+groupshared SH_TYPE g_sh[kThreadCount];
 
 struct StoreBorderFunc
 {
@@ -360,7 +371,7 @@ struct StoreBorderFunc
 // - Every thread reads a radiance value, converts it to SH and stores it in groupshared
 // - Then we do a reduction of all SH
 // - Then we use the SH to populate the irradiance
-[NumThreads(kThreadCount, 1, 1)] void main(COMPUTE_ARGS)
+[numthreads(kThreadCount, 1, 1)] void main(COMPUTE_ARGS)
 {
 	const IndirectDiffuseClipmapConstants idConsts = g_globalRendererConstants.m_indirectDiffuseClipmaps;
 	const U32 clipmapIdx = svGroupId.x / idConsts.m_probeCounts.x;
@@ -386,11 +397,11 @@ struct StoreBorderFunc
 		const F16 sampleCountf = square(RADIANCE_OCTAHEDRON_MAP_SIZE);
 		const F16 normalization = 1.0 / (sampleCountf * sampleDirectionSpherePdf());
 
-		g_sh[svGroupIndex] = SH::ProjectOntoL2(HVec3(sampleDir), HVec3(radiance)) * normalization;
+		g_sh[svGroupIndex] = SH_PROJECT_ONTO_LX(HVec3(sampleDir), HVec3(radiance)) * normalization;
 	}
 	else
 	{
-		g_sh[svGroupIndex] = SH::L2_F16_RGB::Zero();
+		g_sh[svGroupIndex] = SH_TYPE::Zero();
 	}
 
 	// Integrate, like parallel prefix sum
@@ -413,7 +424,7 @@ struct StoreBorderFunc
 #	endif
 	}
 
-	const SH::L2_F16_RGB sh = g_sh[0];
+	const SH_TYPE sh = g_sh[0];
 
 	// Store the irradiance
 	if(svGroupIndex < square(IRRADIANCE_OCTAHEDRON_MAP_SIZE))
@@ -445,7 +456,7 @@ struct StoreBorderFunc
 	// Store the average irradiance
 	HVec3 dir;
 	HVec3 color;
-	SH::ApproximateDirectionalLight(SH::L2toL1(sh), dir, color);
+	SH::ApproximateDirectionalLight(SH_TO_L1(sh), dir, color);
 	if(isInfOrNan(Vec3(color)))
 	{
 		color = 0.0;
@@ -470,7 +481,7 @@ ConstantBuffer<GlobalRendererConstants> g_globalRendererConstants : register(b0)
 
 SamplerState g_linearAnyRepeatSampler : register(s0);
 
-[NumThreads(64, 1, 1)] void main(COMPUTE_ARGS)
+[numthreads(64, 1, 1)] void main(COMPUTE_ARGS)
 {
 	Vec2 lowTextureSize;
 	g_outTex.GetDimensions(lowTextureSize.x, lowTextureSize.y);
@@ -909,20 +920,20 @@ RWTexture2D<Vec4> g_outTex : register(u0);
 
 struct VertIn
 {
-	U32 m_svVertexId : SV_VertexID;
-	U32 m_svInstanceId : SV_InstanceID;
+	U32 m_svVertexId : SV_VERTEXID;
+	U32 m_svInstanceId : SV_INSTANCEID;
 };
 
 struct VertOut
 {
-	Vec4 m_svPosition : SV_Position;
+	Vec4 m_svPosition : SV_POSITION;
 
 	Vec3 m_probeCenter : PROBE_CENTER;
 };
 
 struct FragOut
 {
-	Vec4 m_color : SV_Target0;
+	Vec4 m_color : SV_TARGET0;
 	F32 m_svDepth : SV_Depth;
 };
 
@@ -945,7 +956,7 @@ SamplerState g_linearAnyRepeatSampler : register(s0);
 constexpr F32 kSphereRadius = 0.05;
 
 #	if ANKI_VERTEX_SHADER
-// Cube vertex positions indexed via SV_VertexID
+// Cube vertex positions indexed via SV_VERTEXID
 constexpr Vec3 cubeVertices[8] = {Vec3(-1, -1, -1), Vec3(1, -1, -1), Vec3(1, 1, -1), Vec3(-1, 1, -1),
 								  Vec3(-1, -1, 1),  Vec3(1, -1, 1),  Vec3(1, 1, 1),  Vec3(-1, 1, 1)};
 

+ 9 - 9
AnKi/Shaders/Intellisense.hlsl

@@ -8,15 +8,15 @@
 #define groupshared
 #define globallycoherent
 #define nointerpolation
-#define SV_DispatchThreadID // gl_GlobalInvocationID
-#define SV_GroupIndex // gl_LocalInvocationIndex
-#define SV_GroupID // gl_WorkGroupID
-#define SV_GroupThreadID // gl_LocalInvocationID
-#define SV_CullPrimitive
-#define SV_VertexID
-#define SV_Position
-#define SV_InstanceID
-#define NumThreads(x, y, z) [nodiscard]
+#define SV_DISPATCHTHREADID // gl_GlobalInvocationID
+#define SV_GROUPINDEX // gl_LocalInvocationIndex
+#define SV_GROUPID // gl_WorkGroupID
+#define SV_GROUPTHREADID // gl_LocalInvocationID
+#define SV_CULLPRIMITIVE
+#define SV_VERTEXID
+#define SV_POSITION
+#define SV_INSTANCEID
+#define numthreads(x, y, z) [nodiscard]
 #define outputtopology(x) [nodiscard]
 #define unroll [nodiscard]
 #define loop [nodiscard]

+ 1 - 1
AnKi/Shaders/LightShading.ankiprog

@@ -48,7 +48,7 @@ Texture2D<Vec4> g_integrationLut : register(t12);
 		const F16 att = computeAttenuationFactor<F16>(light.m_radius, frag2Light); \
 		F16 lambert = max(F16(0.0), dot(gbuffer.m_normal, l));
 
-Vec4 main(VertOut input) : SV_Target0
+Vec4 main(VertOut input) : SV_TARGET0
 {
 	const Vec2 uv = input.m_uv;
 	const Vec2 ndc = uvToNdc(uv);

+ 4 - 4
AnKi/Shaders/MotionBlur.ankiprog

@@ -41,7 +41,7 @@ struct Consts
 
 ANKI_FAST_CONSTANTS(Consts, g_consts)
 
-[numthreads(8, 8, 1)] void main(UVec2 svDispatchThreadId : SV_DispatchThreadID, U32 svGroupIndex : SV_GroupIndex, UVec2 svGroupId : SV_GroupID)
+[numthreads(8, 8, 1)] void main(UVec2 svDispatchThreadId : SV_DISPATCHTHREADID, U32 svGroupIndex : SV_GROUPINDEX, UVec2 svGroupId : SV_GROUPID)
 {
 	// Gather the thread result
 	const F32 pixelsPerThread = TILE_SIZE / 8;
@@ -87,7 +87,7 @@ void sample(IVec2 svDispatchThreadId, IVec2 offset, inout Vec2 maxVel)
 	maxVel = computeMaxVelocity(maxVel, v);
 }
 
-[numthreads(8, 8, 1)] void main(UVec2 svDispatchThreadId : SV_DispatchThreadID)
+[numthreads(8, 8, 1)] void main(UVec2 svDispatchThreadId : SV_DISPATCHTHREADID)
 {
 	Vec2 maxv = 0.0;
 	for(I32 i = -1; i <= 1; ++i)
@@ -151,14 +151,14 @@ F32 readDepth(Vec2 uv)
 }
 
 #	if ANKI_COMPUTE_SHADER
-[numthreads(8, 8, 1)] void main(UVec2 svDispatchThreadId : SV_DispatchThreadID)
+[numthreads(8, 8, 1)] void main(UVec2 svDispatchThreadId : SV_DISPATCHTHREADID)
 {
 	Vec2 colorTexSize;
 	g_colorTex.GetDimensions(colorTexSize.x, colorTexSize.y);
 	const Vec2 uv = (Vec2(svDispatchThreadId) + 0.5) / colorTexSize;
 	const Vec2 x = Vec2(svDispatchThreadId) + 0.5;
 #	else
-Vec3 main(Vec2 uv : TEXCOORDS, Vec4 svPosition : SV_Position) : SV_Target0
+Vec3 main(Vec2 uv : TEXCOORDS, Vec4 svPosition : SV_POSITION) : SV_TARGET0
 {
 	Vec2 colorTexSize;
 	g_colorTex.GetDimensions(colorTexSize.x, colorTexSize.y);

+ 4 - 4
AnKi/Shaders/PackFunctions.hlsl

@@ -162,10 +162,10 @@ struct GbufferInfo
 
 struct GBufferPixelOut
 {
-	ANKI_RELAXED_PRECISION Vec4 m_rt0 : SV_Target0;
-	ANKI_RELAXED_PRECISION Vec4 m_rt1 : SV_Target1;
-	ANKI_RELAXED_PRECISION Vec4 m_rt2 : SV_Target2;
-	Vec2 m_rt3 : SV_Target3;
+	ANKI_RELAXED_PRECISION Vec4 m_rt0 : SV_TARGET0;
+	ANKI_RELAXED_PRECISION Vec4 m_rt1 : SV_TARGET1;
+	ANKI_RELAXED_PRECISION Vec4 m_rt2 : SV_TARGET2;
+	Vec2 m_rt3 : SV_TARGET3;
 };
 
 // Populate the G buffer

+ 8 - 8
AnKi/Shaders/Reflections.ankiprog

@@ -109,8 +109,8 @@ groupshared U32 g_minRoughness;
 groupshared U32 g_maxRoughness;
 groupshared U32 g_allSky;
 
-[NumThreads(TILE_SIZE / 2, TILE_SIZE, 1)] void main(U32 svGroupIndex : SV_GroupIndex, UVec2 svDispatchThreadId : SV_DispatchThreadID,
-													UVec2 svGroupId : SV_GroupID)
+[numthreads(TILE_SIZE / 2, TILE_SIZE, 1)] void main(U32 svGroupIndex : SV_GROUPINDEX, UVec2 svDispatchThreadId : SV_DISPATCHTHREADID,
+													UVec2 svGroupId : SV_GROUPID)
 {
 	if(svDispatchThreadId.x == 0 && svDispatchThreadId.y == 0)
 	{
@@ -384,8 +384,8 @@ void bestCandidateToHallucinate(IVec2 svGroupThreadId, IVec2 offset, F32 depth,
 }
 
 // All calculations in view space
-[NumThreads(NUM_THREADS_SQRT, NUM_THREADS_SQRT, 1)] void main(UVec2 svDispatchThreadId : SV_DispatchThreadID,
-															  UVec2 svGroupThreadId : SV_GroupThreadID, U32 svGroupIndex : SV_GroupIndex)
+[numthreads(NUM_THREADS_SQRT, NUM_THREADS_SQRT, 1)] void main(UVec2 svDispatchThreadId : SV_DISPATCHTHREADID,
+															  UVec2 svGroupThreadId : SV_GROUPTHREADID, U32 svGroupIndex : SV_GROUPINDEX)
 {
 	UVec2 halfViewportSize;
 	g_hitPosAndDepthTex.GetDimensions(halfViewportSize.x, halfViewportSize.y);
@@ -608,7 +608,7 @@ SamplerState g_trilinearClampSampler : register(s0);
 RWTexture2D<Vec4> g_colorAndPdfTex : register(u0);
 RWTexture2D<Vec4> g_hitPosAndDepthTex : register(u1);
 
-[NumThreads(64, 1, 1)] void main(UVec2 svDispatchThreadId : SV_DispatchThreadID)
+[numthreads(64, 1, 1)] void main(UVec2 svDispatchThreadId : SV_DISPATCHTHREADID)
 {
 	if(svDispatchThreadId.x >= g_pixelsFailedSsrCount[0])
 	{
@@ -826,7 +826,7 @@ void reconstructCheckerboardBlack(IVec2 svGroupThreadId, F32 refDepth, inout Vec
 	sumWeight += weight;
 }
 
-[NumThreads(8, 8, 1)] void main(UVec2 svDispatchThreadId : SV_DispatchThreadID, UVec2 svGroupThreadId : SV_GROUPTHREADID,
+[numthreads(8, 8, 1)] void main(UVec2 svDispatchThreadId : SV_DISPATCHTHREADID, UVec2 svGroupThreadId : SV_GROUPTHREADID,
 								U32 svGroupIndex : SV_GROUPINDEX)
 {
 	UVec2 viewportSize;
@@ -1104,7 +1104,7 @@ F16 computeVarianceCenter(IVec2 coord, UVec2 textureSize)
 #	endif
 }
 
-[NumThreads(8, 8, 1)] void main(UVec2 svDispatchThreadId : SV_DispatchThreadID)
+[numthreads(8, 8, 1)] void main(UVec2 svDispatchThreadId : SV_DISPATCHTHREADID)
 {
 	UVec2 outSize;
 	g_outTex.GetDimensions(outSize.x, outSize.y);
@@ -1177,7 +1177,7 @@ Texture2D<UVec4> g_classTileMap : register(t1);
 
 RWTexture2D<Vec4> g_outTex : register(u0);
 
-[NumThreads(8, 8, 1)] void main(UVec2 svDispatchThreadId : SV_DispatchThreadID)
+[numthreads(8, 8, 1)] void main(UVec2 svDispatchThreadId : SV_DISPATCHTHREADID)
 {
 	UVec2 outSize;
 	g_outTex.GetDimensions(outSize.x, outSize.y);

+ 1 - 1
AnKi/Shaders/RtSbtBuild.ankiprog

@@ -89,7 +89,7 @@ RWStructuredBuffer<U32> g_sbtBuffer : register(u0);
 ANKI_FAST_CONSTANTS(RtShadowsSbtBuildConstants, g_consts)
 
 // Patches only raygen and miss handles to the SBT
-[NumThreads(32, 1, 1)] void main(COMPUTE_ARGS)
+[numthreads(32, 1, 1)] void main(COMPUTE_ARGS)
 {
 	const U32 dword = svGroupIndex;
 	if(dword < g_consts.m_shaderHandleDwordSize)

+ 1 - 1
AnKi/Shaders/Sky.ankiprog

@@ -471,7 +471,7 @@ RWTexture2D<Vec4> g_envMap : register(u0);
 
 ConstantBuffer<GlobalRendererConstants> g_consts : register(b0);
 
-[numthreads(8, 8, 1)] void main(UVec2 svDispatchThreadId : SV_DispatchThreadID)
+[numthreads(8, 8, 1)] void main(UVec2 svDispatchThreadId : SV_DISPATCHTHREADID)
 {
 	const Vec2 svDispatchThreadIdf = Vec2(svDispatchThreadId);
 

+ 1 - 1
AnKi/Shaders/TemporalAA.ankiprog

@@ -53,7 +53,7 @@ Vec3 computeTaa(Vec2 coord)
 	TEX(g_outTex, coord) = Vec4(computeTaa(coord), 0.0);
 }
 #elif ANKI_PIXEL_SHADER
-Vec3 main(VertOut input) : SV_Target0
+Vec3 main(VertOut input) : SV_TARGET0
 {
 	return computeTaa(floor(input.m_svPosition));
 }

+ 1 - 1
Tests/Gr/FindPrimeNumbers.hlsl

@@ -41,7 +41,7 @@ groupshared uint g_isPrime;
 [numthreads(NUMTHREADS, 1, 1)]
 #endif
 	void
-	main(uint svGroupIndex : SV_GroupIndex, uint svGroupId : SV_GroupID
+	main(uint svGroupIndex : SV_GROUPINDEX, uint svGroupId : SV_GROUPID
 #if WORKGRAPHS
 		 ,
 		 DispatchNodeInputRecord<FirstNodeInput> input

+ 4 - 4
Tests/Gr/GrAsyncCompute.cpp

@@ -88,7 +88,7 @@ ANKI_TEST(Gr, AsyncComputeBench)
 RWTexture2D<float4> g_inTex : register(u0);
 RWTexture2D<float4> g_outTex : register(u1);
 
-[NumThreads(8, 8, 1)] void main(uint2 svDispatchThreadId : SV_DispatchThreadID)
+[numthreads(8, 8, 1)] void main(uint2 svDispatchThreadId : SV_DISPATCHTHREADID)
 {
 	uint2 texSize;
 	g_inTex.GetDimensions(texSize.x, texSize.y);
@@ -125,13 +125,13 @@ struct Consts
 ConstantBuffer<Consts> g_consts : register(b0, space3000);
 #endif
 
-float4 main(float3 svPosition : POSITION) : SV_Position
+float4 main(float3 svPosition : POSITION) : SV_POSITION
 {
 	return mul(g_consts.m_viewProjMat, float4(svPosition * g_consts.m_scale + g_consts.m_worldPosition, 1.0));
 })";
 
 		const CString pixelShaderSrc = R"(
-float4 main() : SV_Target0
+float4 main() : SV_TARGET0
 {
 	return float4(1.0, 0.0, 0.5, 0.0);
 })";
@@ -164,7 +164,7 @@ struct VertOut
 Texture2D g_inTex : register(t0);
 SamplerState g_sampler : register(s0);
 
-float4 main(VertOut input) : SV_Target0
+float4 main(VertOut input) : SV_TARGET0
 {
 	return g_inTex.Sample(g_sampler, input.m_uv);
 })";

+ 11 - 11
Tests/Gr/GrWorkGraphs.cpp

@@ -142,8 +142,8 @@ struct ThirdNodeRecord
 
 RWStructuredBuffer<uint> g_buff : register(u0);
 
-[Shader("node")] [NodeLaunch("broadcasting")] [NodeIsProgramEntry] [NodeMaxDispatchGrid(1, 1, 1)] [NumThreads(16, 1, 1)]
-void main(DispatchNodeInputRecord<FirstNodeRecord> inp, [MaxRecords(2)] NodeOutput<SecondNodeRecord> secondNode, uint svGroupIndex : SV_GroupIndex)
+[Shader("node")] [NodeLaunch("broadcasting")] [NodeIsProgramEntry] [NodeMaxDispatchGrid(1, 1, 1)] [numthreads(16, 1, 1)]
+void main(DispatchNodeInputRecord<FirstNodeRecord> inp, [MaxRecords(2)] NodeOutput<SecondNodeRecord> secondNode, uint svGroupIndex : SV_GROUPINDEX)
 {
 	GroupNodeOutputRecords<SecondNodeRecord> rec = secondNode.GetGroupNodeOutputRecords(2);
 
@@ -156,7 +156,7 @@ void main(DispatchNodeInputRecord<FirstNodeRecord> inp, [MaxRecords(2)] NodeOutp
 	rec.OutputComplete();
 }
 
-[Shader("node")] [NodeLaunch("broadcasting")] [NumThreads(16, 1, 1)] [NodeMaxDispatchGrid(16, 1, 1)]
+[Shader("node")] [NodeLaunch("broadcasting")] [numthreads(16, 1, 1)] [NodeMaxDispatchGrid(16, 1, 1)]
 void secondNode(DispatchNodeInputRecord<SecondNodeRecord> inp, [MaxRecords(32)] NodeOutput<ThirdNodeRecord> thirdNode,
 				uint svGroupIndex : SV_GROUPINDEX)
 {
@@ -168,8 +168,8 @@ void secondNode(DispatchNodeInputRecord<SecondNodeRecord> inp, [MaxRecords(32)]
 	recs.OutputComplete();
 }
 
-[Shader("node")] [NodeLaunch("coalescing")] [NumThreads(16, 1, 1)]
-void thirdNode([MaxRecords(32)] GroupNodeInputRecords<ThirdNodeRecord> inp, uint svGroupIndex : SV_GroupIndex)
+[Shader("node")] [NodeLaunch("coalescing")] [numthreads(16, 1, 1)]
+void thirdNode([MaxRecords(32)] GroupNodeInputRecords<ThirdNodeRecord> inp, uint svGroupIndex : SV_GROUPINDEX)
 {
 	if (svGroupIndex * 2 < inp.Count())
 		InterlockedAdd(g_buff[0], inp[svGroupIndex * 2].m_value);
@@ -261,9 +261,9 @@ StructuredBuffer<uint> g_positions : register(t1);
 #define THREAD_COUNT 64u
 
 // Operates per object
-[Shader("node")] [NodeLaunch("broadcasting")] [NodeIsProgramEntry] [NodeMaxDispatchGrid(1, 1, 1)] [NumThreads(THREAD_COUNT, 1, 1)]
+[Shader("node")] [NodeLaunch("broadcasting")] [NodeIsProgramEntry] [NodeMaxDispatchGrid(1, 1, 1)] [numthreads(THREAD_COUNT, 1, 1)]
 void main(DispatchNodeInputRecord<FirstNodeRecord> inp, [MaxRecords(THREAD_COUNT)] NodeOutput<SecondNodeRecord> computeAabb,
-		  uint svGroupIndex : SV_GroupIndex, uint svDispatchThreadId : SV_DispatchThreadId)
+		  uint svGroupIndex : SV_GROUPINDEX, uint svDispatchThreadId : SV_DispatchThreadId)
 {
 	GroupNodeOutputRecords<SecondNodeRecord> recs = computeAabb.GetGroupNodeOutputRecords(THREAD_COUNT);
 
@@ -278,8 +278,8 @@ void main(DispatchNodeInputRecord<FirstNodeRecord> inp, [MaxRecords(THREAD_COUNT
 groupshared Aabb g_aabb;
 
 // Operates per position
-[Shader("node")] [NodeLaunch("broadcasting")] [NodeMaxDispatchGrid(1, 1, 1)] [NumThreads(THREAD_COUNT, 1, 1)]
-void computeAabb(DispatchNodeInputRecord<SecondNodeRecord> inp, uint svDispatchThreadId : SV_DispatchThreadId, uint svGroupIndex : SV_GroupIndex)
+[Shader("node")] [NodeLaunch("broadcasting")] [NodeMaxDispatchGrid(1, 1, 1)] [numthreads(THREAD_COUNT, 1, 1)]
+void computeAabb(DispatchNodeInputRecord<SecondNodeRecord> inp, uint svDispatchThreadId : SV_DispatchThreadId, uint svGroupIndex : SV_GROUPINDEX)
 {
 	const Object obj = g_objects[inp.Get().m_objectIndex];
 
@@ -341,8 +341,8 @@ ConstantBuffer<PushConsts> g_consts : register(b0, space3000);
 
 groupshared Aabb g_aabb;
 
-[NumThreads(THREAD_COUNT, 1, 1)]
-void main(uint svDispatchThreadId : SV_DispatchThreadId, uint svGroupIndex : SV_GroupIndex)
+[numthreads(THREAD_COUNT, 1, 1)]
+void main(uint svDispatchThreadId : SV_DispatchThreadId, uint svGroupIndex : SV_GROUPINDEX)
 {
 	const Object obj = g_objects[g_consts.m_objectIndex];
 

+ 1 - 1
Tests/Gr/JobManagerCompute.hlsl

@@ -55,7 +55,7 @@ groupshared uint g_outWorkItemCount;
 
 static const int kMashPushTries = 1000;
 
-[numthreads(NUMTHREADS, 1, 1)] void main(uint svGroupIndex : SV_GroupIndex)
+[numthreads(NUMTHREADS, 1, 1)] void main(uint svGroupIndex : SV_GROUPINDEX)
 {
 	if(svGroupIndex == 0)
 	{

+ 4 - 4
Tests/Gr/JobManagerWg.hlsl

@@ -34,9 +34,9 @@ struct SecondNodeInput
 
 groupshared uint g_newWorkItemCount;
 
-[Shader("node")][NodeLaunch("broadcasting")][NodeIsProgramEntry][NodeMaxDispatchGrid(1, 1, 1)][NumThreads(NUMTHREADS, 1, 1)] void
+[Shader("node")][NodeLaunch("broadcasting")][NodeIsProgramEntry][NodeMaxDispatchGrid(1, 1, 1)][numthreads(NUMTHREADS, 1, 1)] void
 main(DispatchNodeInputRecord<FirstNodeInput> input, uint svDispatchThreadId
-	 : SV_DispatchThreadId, uint svGroupIndex : SV_GroupIndex, [MaxRecords(MAX_CHILDREN)] NodeOutput<SecondNodeInput> secondNode)
+	 : SV_DispatchThreadId, uint svGroupIndex : SV_GROUPINDEX, [MaxRecords(MAX_CHILDREN)] NodeOutput<SecondNodeInput> secondNode)
 {
 	if(svGroupIndex == 0)
 	{
@@ -105,9 +105,9 @@ main(DispatchNodeInputRecord<FirstNodeInput> input, uint svDispatchThreadId
 
 static const int x = 0; // For formatting
 
-[Shader("node")][NodeLaunch("broadcasting")][NumThreads(NUMTHREADS, 1, 1)][NodeDispatchGrid(1, 1, 1)][NodeMaxRecursionDepth(16)] void
+[Shader("node")][NodeLaunch("broadcasting")][numthreads(NUMTHREADS, 1, 1)][NodeDispatchGrid(1, 1, 1)][NodeMaxRecursionDepth(16)] void
 secondNode(DispatchNodeInputRecord<SecondNodeInput> input, [MaxRecords(MAX_CHILDREN)] NodeOutput<SecondNodeInput> secondNode,
-		   uint svGroupIndex : SV_GroupIndex)
+		   uint svGroupIndex : SV_GROUPINDEX)
 {
 	if(svGroupIndex == 0)
 	{

+ 3 - 3
Tests/Gr/WorkDrainCompute.hlsl

@@ -19,8 +19,8 @@ RWStructuredBuffer<float4> g_result : register(u1);
 
 groupshared float4 g_tileMax[TILE_SIZE_X * TILE_SIZE_Y];
 
-[numthreads(TILE_SIZE_X, TILE_SIZE_Y, 1)] void main(uint2 svDispatchThreadId : SV_DispatchThreadID, uint svGroupIndex : SV_GroupIndex,
-													uint2 svGroupId : SV_GroupID)
+[numthreads(TILE_SIZE_X, TILE_SIZE_Y, 1)] void main(uint2 svDispatchThreadId : SV_DISPATCHTHREADID, uint svGroupIndex : SV_GROUPINDEX,
+													uint2 svGroupId : SV_GROUPID)
 {
 	g_tileMax[svGroupIndex] = g_inputTex[svDispatchThreadId];
 
@@ -44,7 +44,7 @@ groupshared float4 g_tileMax[TILE_SIZE_X * TILE_SIZE_Y];
 
 groupshared float4 g_maxColor[64];
 
-[numthreads(64, 1, 1)] void main(uint svGroupIndex : SV_GroupIndex)
+[numthreads(64, 1, 1)] void main(uint svGroupIndex : SV_GROUPINDEX)
 {
 	const uint tilesPerThread = TILE_COUNT / 64;
 

+ 2 - 2
Tests/Gr/WorkDrainWg.hlsl

@@ -30,7 +30,7 @@ groupshared float4 g_tileMax[TILE_SIZE_X * TILE_SIZE_Y];
 
 [Shader("node")][NodeLaunch("broadcasting")][NodeIsProgramEntry][NodeMaxDispatchGrid(1, 1, 1)][numthreads(TILE_SIZE_X, TILE_SIZE_Y, 1)] void
 main(DispatchNodeInputRecord<FirstNodeInput> input, [MaxRecords(1)] NodeOutput<SecondNodeInput> secondNode,
-	 uint2 svDispatchThreadId : SV_DispatchThreadID, uint svGroupIndex : SV_GroupIndex, uint2 svGroupId : SV_GROUPID)
+	 uint2 svDispatchThreadId : SV_DISPATCHTHREADID, uint svGroupIndex : SV_GROUPINDEX, uint2 svGroupId : SV_GROUPID)
 {
 	g_tileMax[svGroupIndex] = g_inputTex[svDispatchThreadId];
 
@@ -79,7 +79,7 @@ main(DispatchNodeInputRecord<FirstNodeInput> input, [MaxRecords(1)] NodeOutput<S
 groupshared float4 g_maxColor[64];
 
 [Shader("node")][NodeLaunch("broadcasting")][NodeMaxDispatchGrid(1, 1, 1)][numthreads(64, 1, 1)] void
-secondNode(DispatchNodeInputRecord<SecondNodeInput> inp, uint svGroupIndex : SV_GroupIndex)
+secondNode(DispatchNodeInputRecord<SecondNodeInput> inp, uint svGroupIndex : SV_GROUPINDEX)
 {
 	const uint tilesPerThread = TILE_COUNT / 64;
 

+ 2 - 4
Tools/FormatSource.py

@@ -15,10 +15,8 @@ import platform
 
 file_extensions = ["h", "hpp", "c", "cpp", "glsl", "hlsl", "ankiprog"]
 directories = ["AnKi", "Tests", "Sandbox", "Tools", "Samples"]
-hlsl_semantics = ["TEXCOORD", "SV_POSITION", "SV_Position", "SV_TARGET0", "SV_TARGET1", "SV_TARGET2", "SV_TARGET3", "SV_TARGET4",
-                  "SV_TARGET5", "SV_TARGET6", "SV_TARGET7", "SV_Target0", "SV_Target1", "SV_Target2", "SV_Target3", "SV_Target4",
-                  "SV_Target5", "SV_Target6", "SV_Target7", "SV_DISPATCHTHREADID", "SV_DispatchThreadID", "SV_GROUPINDEX", "SV_GroupIndex",
-                  "SV_GROUPID", "SV_GroupID", "SV_GROUPTHREADID", "SV_GroupThreadID"]
+hlsl_semantics = ["TEXCOORD", "SV_POSITION", "SV_TARGET0", "SV_TARGET1", "SV_TARGET2", "SV_TARGET3", "SV_TARGET4", "SV_TARGET5",
+                  "SV_TARGET6", "SV_TARGET7", "SV_DISPATCHTHREADID", "SV_GROUPINDEX", "SV_GROUPID", "SV_GROUPTHREADID"]
 hlsl_attribs = ["[shader(\"closesthit\")]", "[shader(\"anyhit\")]", "[shader(\"raygeneration\")]", "[shader(\"miss\")]",
                 "[raypayload]", "[outputtopology(\"triangle\")]"]
 hlsl_attribs_fake = ["______shaderclosesthit", "______shaderanyhit", "______shaderraygeneration", "______shadermiss",