Add a WIP GPU scene shader

Panagiotis Christopoulos Charitos 3 years ago
parent commit eeb254c716

+ 8 - 6
AnKi/Scene/Components/ModelComponent.cpp

@@ -60,7 +60,7 @@ Error ModelComponent::loadModelResource(CString filename)
 	m_gpuSceneUniformsOffsetPerPatch.resize(m_node->getMemoryPool(), modelPatchCount);
 	for(U32 i = 0; i < modelPatchCount; ++i)
 	{
-		m_gpuSceneUniformsOffsetPerPatch[i] = uniformsSize / 4;
+		m_gpuSceneUniformsOffsetPerPatch[i] = uniformsSize;
 
 		const U32 size = U32(m_model->getModelPatches()[i].getMaterial()->getPrefilledLocalUniforms().getSizeInBytes());
 		ANKI_ASSERT((size % 4) == 0);
@@ -72,7 +72,7 @@ Error ModelComponent::loadModelResource(CString filename)
 
 	for(U32 i = 0; i < modelPatchCount; ++i)
 	{
-		m_gpuSceneUniformsOffsetPerPatch[i] += DwordOffset(m_gpuSceneUniforms.m_offset / 4);
+		m_gpuSceneUniformsOffsetPerPatch[i] += U32(m_gpuSceneUniforms.m_offset);
 	}
 
 	return Error::kNone;
@@ -110,16 +110,18 @@ Error ModelComponent::update(SceneComponentUpdateInfo& info, Bool& updated)
 					U32 vertCount;
 					mesh.getVertexStreamInfo(l, stream, offset, vertCount);
 
-					ANKI_ASSERT((offset % 4) == 0);
-					view.m_vertexOffsets[l][U32(stream)] = U32(offset / 4);
+					const PtrSize elementSize = getFormatInfo(kMeshRelatedVertexStreamFormats[stream]).m_texelSize;
+
+					ANKI_ASSERT((offset % elementSize) == 0);
+					view.m_lods[l].m_vertexOffsets[U32(stream)] = U32(offset / elementSize);
 				}
 
 				PtrSize offset;
 				U32 indexCount;
 				IndexType indexType;
 				mesh.getIndexBufferInfo(l, offset, indexCount, indexType);
-				view.m_indexOffsets[l] = U32(offset);
-				view.m_indexCounts[l] = indexCount;
+				view.m_lods[l].m_indexOffset = U32(offset);
+				view.m_lods[l].m_indexCount = indexCount;
 			}
 		}
 

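Note on the hunk above: uniforms offsets in the GPU scene become plain byte offsets, while each vertex stream offset is divided by its format's texel size so that it becomes an element index, which the shader can use to subscript the typed unified-geometry buffer views directly. A standalone sketch of that conversion, with standard types standing in for AnKi's; in the engine the element size comes from getFormatInfo(kMeshRelatedVertexStreamFormats[stream]).m_texelSize:

    #include <cassert>
    #include <cstdint>

    // Byte offset -> element index, so the shader can do g_unifiedGeom_...[index].
    uint32_t byteOffsetToElementIndex(uint64_t byteOffset, uint64_t elementSize)
    {
        assert(elementSize != 0 && byteOffset % elementSize == 0 && "Stream allocation must be element-aligned");
        return uint32_t(byteOffset / elementSize);
    }
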
+ 4 - 4
AnKi/Scene/Components/ModelComponent.h

@@ -41,13 +41,13 @@ public:
 		return m_model.isCreated();
 	}
 
-	DwordOffset getMeshViewsGpuSceneOffset() const
+	U32 getMeshViewsGpuSceneOffset() const
 	{
 		ANKI_ASSERT((m_gpuSceneMeshGpuViews.m_offset % 4) == 0);
-		return DwordOffset(m_gpuSceneMeshGpuViews.m_offset / 4);
+		return U32(m_gpuSceneMeshGpuViews.m_offset);
 	}
 
-	DwordOffset getUniformsGpuSceneOffset(U32 meshPatchIdx) const
+	U32 getUniformsGpuSceneOffset(U32 meshPatchIdx) const
 	{
 		return m_gpuSceneUniformsOffsetPerPatch[meshPatchIdx];
 	}
@@ -61,7 +61,7 @@ private:
 
 	SegregatedListsGpuMemoryPoolToken m_gpuSceneMeshGpuViews;
 	SegregatedListsGpuMemoryPoolToken m_gpuSceneUniforms;
-	DynamicArray<DwordOffset> m_gpuSceneUniformsOffsetPerPatch;
+	DynamicArray<U32> m_gpuSceneUniformsOffsetPerPatch;
 
 	Error update(SceneComponentUpdateInfo& info, Bool& updated);
 };

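Note: all of these getters now return raw byte offsets into the GPU scene buffer instead of DWORD offsets; the % 4 assertion survives only as a sanity check, since the shader-side ByteAddressBuffer loads still want 4-byte-aligned addresses. A minimal sketch of the convention, assuming standard types in place of AnKi's:

    #include <cassert>
    #include <cstdint>

    // Old convention: return uint32_t(offset / 4), a DWORD offset.
    // New convention: return the byte offset untouched; it feeds g_gpuScene.Load<T>(offset).
    uint32_t toGpuSceneByteOffset(uint64_t allocationOffset)
    {
        assert(allocationOffset % 4 == 0 && "Keep the loads 4-byte aligned");
        return uint32_t(allocationOffset);
    }
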
+ 2 - 2
AnKi/Scene/Components/MoveComponent.h

@@ -139,10 +139,10 @@ public:
 	}
 	/// @}
 
-	DwordOffset getTransformsGpuSceneOffset() const
+	U32 getTransformsGpuSceneOffset() const
 	{
 		ANKI_ASSERT((m_gpuSceneTransforms.m_offset % 4) == 0);
-		return DwordOffset(m_gpuSceneTransforms.m_offset / 4);
+		return U32(m_gpuSceneTransforms.m_offset);
 	}
 
 private:

+ 2 - 2
AnKi/Scene/Components/RenderComponent.h

@@ -104,10 +104,10 @@ public:
 										 RebarStagingGpuMemoryPool& alloc,
 										 const Vec4& positionScaleAndTranslation = Vec4(1.0f, 0.0f, 0.0f, 0.0f));
 
-	DwordOffset getGpuSceneViewOffset() const
+	U32 getGpuSceneViewOffset() const
 	{
 		ANKI_ASSERT((m_gpuSceneRenderableGpuView.m_offset % 4) == 0);
-		return DwordOffset(m_gpuSceneRenderableGpuView.m_offset / 4);
+		return U32(m_gpuSceneRenderableGpuView.m_offset);
 	}
 
 private:

+ 2 - 2
AnKi/Scene/Components/SpatialComponent.h

@@ -110,10 +110,10 @@ public:
 		return m_alwaysVisible;
 	}
 
-	DwordOffset getAabbGpuSceneOffset() const
+	U32 getAabbGpuSceneOffset() const
 	{
 		ANKI_ASSERT((m_gpuSceneAabb.m_offset % 4) == 0);
-		return DwordOffset(m_gpuSceneAabb.m_offset / 4);
+		return U32(m_gpuSceneAabb.m_offset);
 	}
 
 private:

+ 2 - 2
AnKi/Scene/ModelNode.cpp

@@ -198,8 +198,8 @@ void ModelNode::initRenderComponents()
 		view.m_aabbOffset = getFirstComponentOfType<SpatialComponent>().getAabbGpuSceneOffset();
 		view.m_uniformsOffset = getFirstComponentOfType<ModelComponent>().getUniformsGpuSceneOffset(patchIdx);
 		view.m_meshOffset =
-			getFirstComponentOfType<ModelComponent>().getMeshViewsGpuSceneOffset() + sizeof(MeshGpuView) / 4 * patchIdx;
-		getExternalSubsystems().m_gpuSceneMicroPatcher->newCopy(getFrameMemoryPool(), rc.getGpuSceneViewOffset() * 4,
+			getFirstComponentOfType<ModelComponent>().getMeshViewsGpuSceneOffset() + sizeof(MeshGpuView) * patchIdx;
+		getExternalSubsystems().m_gpuSceneMicroPatcher->newCopy(getFrameMemoryPool(), rc.getGpuSceneViewOffset(),
 																sizeof(view), &view);
 
 		// Init the proxy

+ 2 - 5
AnKi/ShaderCompiler/MaliOfflineCompiler.cpp

@@ -115,16 +115,13 @@ static Error runMaliOfflineCompilerInternal(CString maliocExecutable, CString sp
 
 	// Execute
 	Process proc;
-	ANKI_CHECK(proc.start(maliocExecutable, args, {}));
+	ANKI_CHECK(proc.start(maliocExecutable, args, {}, ProcessOptions::kOpenStdout));
 	ProcessStatus status;
 	I32 exitCode;
 	ANKI_CHECK(proc.wait(-1.0, &status, &exitCode));
 	if(exitCode != 0)
 	{
-		StringRaii stderre(&tmpPool);
-		const Error err = proc.readFromStderr(stderre);
-		ANKI_SHADER_COMPILER_LOGE("Mali offline compiler failed with exit code %d. Stderr: %s", exitCode,
-								  (err || stderre.isEmpty()) ? "<no text>" : stderre.cstr());
+		ANKI_SHADER_COMPILER_LOGE("Mali offline compiler failed with exit code %d", exitCode);
 		return Error::kFunctionFailed;
 	}
 

+ 2 - 5
AnKi/ShaderCompiler/RadeonGpuAnalyzer.cpp

@@ -64,7 +64,7 @@ Error runRadeonGpuAnalyzer(CString rgaExecutable, ConstWeakArray<U8> spirv, Shad
 
 	{
 		Process proc;
-		ANKI_CHECK(proc.start(rgaExecutable, args, DynamicArrayRaii<StringRaii>(&tmpPool)));
+		ANKI_CHECK(proc.start(rgaExecutable, args, DynamicArrayRaii<StringRaii>(&tmpPool), ProcessOptions::kNone));
 
 		ProcessStatus status;
 		I32 exitCode;
@@ -72,10 +72,7 @@ Error runRadeonGpuAnalyzer(CString rgaExecutable, ConstWeakArray<U8> spirv, Shad
 
 		if(exitCode != 0)
 		{
-			StringRaii stderre(&tmpPool);
-			const Error err = proc.readFromStderr(stderre);
-			ANKI_SHADER_COMPILER_LOGE("RGA failed with exit code %d. Stderr: %s", exitCode,
-									  (err || stderre.isEmpty()) ? "<no text>" : stderre.cstr());
+			ANKI_SHADER_COMPILER_LOGE("RGA failed with exit code %d", exitCode);
 			return Error::kFunctionFailed;
 		}
 	}

+ 398 - 0
AnKi/Shaders/GBufferGenericGpuScene.ankiprog

@@ -0,0 +1,398 @@
+// Copyright (C) 2009-2022, Panagiotis Christopoulos Charitos and contributors.
+// All rights reserved.
+// Code licensed under the BSD License.
+// http://www.anki3d.org/LICENSE
+
+#pragma anki hlsl
+
+#pragma anki mutator ANKI_LOD 0 1 2
+#pragma anki mutator ANKI_VELOCITY 0 1
+#pragma anki mutator ANKI_TECHNIQUE 0 1 2
+#pragma anki mutator ANKI_BONES 0 1
+#pragma anki mutator DIFFUSE_TEX 0 1
+#pragma anki mutator SPECULAR_TEX 0 1
+#pragma anki mutator ROUGHNESS_TEX 0 1
+#pragma anki mutator METAL_TEX 0 1
+#pragma anki mutator NORMAL_TEX 0 1
+#pragma anki mutator PARALLAX 0 1
+#pragma anki mutator EMISSIVE_TEX 0 1
+#pragma anki mutator ALPHA_TEST 0 1
+
+#pragma anki skip_mutation ALPHA_TEST 1 DIFFUSE_TEX 0
+#pragma anki skip_mutation ANKI_VELOCITY 1 ANKI_TECHNIQUE 1
+#pragma anki skip_mutation ANKI_VELOCITY 1 ANKI_TECHNIQUE 2
+#pragma anki skip_mutation ANKI_LOD 1 ANKI_TECHNIQUE 1
+#pragma anki skip_mutation ANKI_LOD 2 ANKI_TECHNIQUE 1
+#pragma anki skip_mutation ANKI_LOD 1 ANKI_TECHNIQUE 2
+#pragma anki skip_mutation ANKI_LOD 2 ANKI_TECHNIQUE 2
+
+// Some defines to clear things up
+#define REALLY_ALPHA_TEST (ALPHA_TEST && DIFFUSE_TEX)
+#define UVS (ANKI_TECHNIQUE == ANKI_RENDERING_TECHNIQUE_GBUFFER || REALLY_ALPHA_TEST)
+#define REALLY_VELOCITY ((ANKI_VELOCITY || ANKI_BONES) && ANKI_TECHNIQUE == ANKI_RENDERING_TECHNIQUE_GBUFFER)
+#define REALLY_USING_PARALLAX \
+	(PARALLAX == 1 && ANKI_TECHNIQUE == ANKI_RENDERING_TECHNIQUE_GBUFFER && ANKI_LOD == 0 && ALPHA_TEST == 0)
+
+#include <AnKi/Shaders/Include/MaterialTypes.h>
+#include <AnKi/Shaders/Include/GpuSceneTypes.h>
+#include <AnKi/Shaders/PackFunctions.hlsl>
+#include <AnKi/Shaders/Functions.hlsl>
+
+ANKI_BINDLESS_SET(MaterialSet::kBindless)
+
+[[vk::binding(MaterialBinding::kTrilinearRepeatSampler, MaterialSet::kGlobal)]] SamplerState g_globalSampler;
+[[vk::binding(MaterialBinding::kGlobalUniforms, MaterialSet::kGlobal)]] ConstantBuffer<MaterialGlobalUniforms>
+	g_globalUniforms;
+[[vk::binding(MaterialBinding::kGpuScene, MaterialSet::kGlobal)]] ByteAddressBuffer g_gpuScene;
+
+[[vk::binding(MaterialBinding::kUnifiedGeometry_R16G16B16_Unorm, MaterialSet::kGlobal)]] Buffer<Vec4>
+	g_unifiedGeom_R16G16B16_Unorm;
+[[vk::binding(MaterialBinding::kUnifiedGeometry_R8G8B8A8_Snorm, MaterialSet::kGlobal)]] Buffer<Vec4>
+	g_unifiedGeom_R8G8B8A8_Snorm;
+[[vk::binding(MaterialBinding::kUnifiedGeometry_R32G32_Sfloat, MaterialSet::kGlobal)]] Buffer<Vec2>
+	g_unifiedGeom_R32G32_Sfloat;
+[[vk::binding(MaterialBinding::kUnifiedGeometry_R8G8B8A8_Uint, MaterialSet::kGlobal)]] Buffer<UVec4>
+	g_unifiedGeom_R8G8B8A8_Uint;
+
+#pragma anki reflect AnKiLocalUniforms
+#pragma anki struct AnKiLocalUniforms
+#pragma anki member U32 m_normalTex if NORMAL_TEX is 1
+
+#pragma anki member RVec3 m_diffColor if DIFFUSE_TEX is 0
+#pragma anki member U32 m_diffTex if DIFFUSE_TEX is 1
+
+#pragma anki member RF32 m_roughness if ROUGHNESS_TEX is 0
+#pragma anki member U32 m_roughnessTex if ROUGHNESS_TEX is 1
+
+#pragma anki member RVec3 m_specColor if SPECULAR_TEX is 0
+#pragma anki member U32 m_specTex if SPECULAR_TEX is 1
+
+#pragma anki member RF32 m_metallic if METAL_TEX is 0
+#pragma anki member U32 m_metallicTex if METAL_TEX is 1
+
+#pragma anki member RVec3 m_emission if EMISSIVE_TEX is 0
+#pragma anki member U32 m_emissiveTex if EMISSIVE_TEX is 1
+
+#pragma anki member RF32 m_heightmapScale if PARALLAX is 1
+#pragma anki member U32 m_heightTex if PARALLAX is 1
+
+#pragma anki member RF32 m_subsurface
+#pragma anki struct end
+
+struct VertIn
+{
+	U32 m_instanceId : SV_INSTANCEID;
+	[[vk::location(0)]] PackedRenderableGpuViewInstance m_renderableGpuViewInstance : INSTANCE;
+};
+
+struct VertOut
+{
+	Vec4 m_position : SV_POSITION;
+
+#if UVS
+	Vec2 m_uv : TEXCOORD;
+#endif
+
+#if REALLY_VELOCITY
+	Vec3 m_prevClipXyw : PREV_CLIP;
+	Vec3 m_crntClipXyw : CRNT_CLIP;
+#endif
+
+#if ANKI_TECHNIQUE == ANKI_RENDERING_TECHNIQUE_GBUFFER
+	RVec3 m_normal : NORMAL;
+	RVec3 m_tangent : TANGENT;
+	RVec3 m_bitangent : BITANGENT;
+#endif
+
+	nointerpolation U32 m_uniformsOffset : UNIS_OFFSET;
+};
+
+#if ANKI_TECHNIQUE == ANKI_RENDERING_TECHNIQUE_GBUFFER || ANKI_TECHNIQUE == ANKI_RENDERING_TECHNIQUE_GBUFFER_EZ
+struct FragOut
+{
+	Vec4 m_color0 : SV_TARGET0;
+	Vec4 m_color1 : SV_TARGET1;
+	Vec4 m_color2 : SV_TARGET2;
+	Vec2 m_color3 : SV_TARGET3;
+};
+#endif
+
+#pragma anki start vert
+
+UnpackedMeshVertex loadVertex(MeshGpuView mesh, U32 lod)
+{
+	MeshGpuViewLod mlod = mesh.m_lods[lod];
+
+	UnpackedMeshVertex v;
+	v.m_position = g_unifiedGeom_R16G16B16_Unorm[mlod.m_vertexOffsets[(U32)VertexStreamId::kPosition]];
+#if ANKI_BONES
+	v.m_position = v.m_position * mesh.m_positionScale + mesh.m_positionTranslation;
+#endif
+
+	v.m_normal = g_unifiedGeom_R8G8B8A8_Snorm[mlod.m_vertexOffsets[(U32)VertexStreamId::kNormal]].xyz;
+	v.m_tangent = g_unifiedGeom_R8G8B8A8_Snorm[mlod.m_vertexOffsets[(U32)VertexStreamId::kTangent]];
+	v.m_uv = g_unifiedGeom_R32G32_Sfloat[mlod.m_vertexOffsets[(U32)VertexStreamId::kUv]];
+
+#if ANKI_BONES
+	v.m_boneIndices = g_unifiedGeom_R8G8B8A8_Uint[mlod.m_vertexOffsets[(U32)VertexStreamId::kBoneIds]];
+	v.m_boneWeights = g_unifiedGeom_R8G8B8A8_Snorm[mlod.m_vertexOffsets[(U32)VertexStreamId::kBoneWeights]];
+#endif
+
+	return v;
+}
+
+Mat3x4 loadMatrix(U32 byteOffset)
+{
+	Mat3x4 m;
+	m.m_row0 = g_gpuScene.Load<Vec4>(byteOffset);
+	m.m_row1 = g_gpuScene.Load<Vec4>(byteOffset + sizeof(Vec4));
+	m.m_row2 = g_gpuScene.Load<Vec4>(byteOffset + sizeof(Vec4) * 2);
+	return m;
+}
+
+Mat3x4 loadBoneTransform(UnpackedMeshVertex vert, RenderableGpuView2 renderable, U32 index)
+{
+	const U32 boneIdx = vert.m_boneIndices[index];
+	U32 byteOffset = renderable.m_boneTransformsOffset;
+	byteOffset += boneIdx * sizeof(Mat3x4);
+	return loadMatrix(byteOffset);
+}
+
+Mat3x4 loadPreviousBoneTransform(UnpackedMeshVertex vert, RenderableGpuView2 renderable, U32 index)
+{
+	const U32 boneIdx = vert.m_boneIndices[index];
+	U32 byteOffset = renderable.m_previousBoneTransformsOffset;
+	byteOffset += boneIdx * sizeof(Mat3x4);
+	return loadMatrix(byteOffset);
+}
+
+UnpackedRenderableGpuViewInstance loadRenderableGpuViewInstance(VertIn input)
+{
+	UnpackedRenderableGpuViewInstance o;
+	o.m_lod = input.m_renderableGpuViewInstance & 3u;
+	o.m_renderableGpuViewOffset = input.m_renderableGpuViewInstance >> 2u;
+	return o;
+}
+
+#if ANKI_BONES
+void skinning(UnpackedMeshVertex vert, RenderableGpuView2 renderable, inout Vec3 pos, inout Vec3 prevPos,
+			  inout RVec3 normal, inout RVec4 tangent)
+{
+	Mat3x4 skinMat = loadBoneTransform(vert, renderable, 0) * vert.m_boneWeights[0];
+	Mat3x4 prevSkinMat = loadPreviousBoneTransform(vert, renderable, 0) * vert.m_boneWeights[0];
+	[unroll] for(U32 i = 1u; i < 4u; ++i)
+	{
+		skinMat = skinMat + loadBoneTransform(vert, renderable, i) * vert.m_boneWeights[i];
+		prevSkinMat = prevSkinMat + loadPreviousBoneTransform(vert, renderable, i) * vert.m_boneWeights[i];
+	}
+
+#	if ANKI_TECHNIQUE == ANKI_RENDERING_TECHNIQUE_GBUFFER
+	prevPos = mul(prevSkinMat, Vec4(pos, 1.0)).xyz;
+	tangent.xyz = mul(skinMat, Vec4(tangent.xyz, 0.0)).xyz;
+	normal = mul(skinMat, Vec4(normal, 0.0)).xyz;
+#	endif
+	ANKI_MAYBE_UNUSED(prevPos);
+	ANKI_MAYBE_UNUSED(tangent);
+	ANKI_MAYBE_UNUSED(normal);
+
+	pos = mul(skinMat, Vec4(pos, 1.0)).xyz;
+}
+#endif
+
+#if(ANKI_VELOCITY || ANKI_BONES) && ANKI_TECHNIQUE == ANKI_RENDERING_TECHNIQUE_GBUFFER
+void velocity(Mat3x4 worldTransform, Mat3x4 prevWorldTransform, Vec3 prevLocalPos, inout VertOut output)
+{
+	ANKI_MAYBE_UNUSED(prevWorldTransform);
+	ANKI_MAYBE_UNUSED(worldTransform);
+
+#	if ANKI_VELOCITY
+	// Object is also moving
+	const Mat3x4 trf = prevWorldTransform;
+#	else
+	// Object is a skin that is not moving
+	const Mat3x4 trf = worldTransform;
+#	endif
+
+	Vec4 v4 = Vec4(mul(trf, Vec4(prevLocalPos, 1.0)), 1.0);
+	v4 = mul(g_globalUniforms.m_previousViewProjectionMatrix, v4);
+
+	output.m_prevClipXyw = v4.xyw;
+	output.m_crntClipXyw = output.m_position.xyw;
+}
+#endif
+
+VertOut main(VertIn input)
+{
+	VertOut output;
+
+	const UnpackedRenderableGpuViewInstance instance = loadRenderableGpuViewInstance(input);
+	const RenderableGpuView2 renderable = g_gpuScene.Load<RenderableGpuView2>(instance.m_renderableGpuViewOffset);
+	const MeshGpuView mesh = g_gpuScene.Load<MeshGpuView>(renderable.m_meshOffset);
+	UnpackedMeshVertex vert = loadVertex(mesh, instance.m_lod);
+
+	const Mat3x4 worldTransform = loadMatrix(renderable.m_worldTransformsOffset);
+	const Mat3x4 prevWorldTransform = loadMatrix(renderable.m_worldTransformsOffset + sizeof(Mat3x4));
+	ANKI_MAYBE_UNUSED(prevWorldTransform);
+
+#if UVS
+	output.m_uv = vert.m_uv;
+#endif
+	Vec3 prevPos = vert.m_position;
+	ANKI_MAYBE_UNUSED(prevPos);
+	output.m_uniformsOffset = renderable.m_uniformsOffset;
+
+	// Do stuff
+#if ANKI_BONES
+	skinning(vert, renderable, vert.m_position, prevPos, vert.m_normal, vert.m_tangent);
+#endif
+
+	output.m_position = Vec4(mul(worldTransform, Vec4(vert.m_position, 1.0)), 1.0);
+	output.m_position = mul(g_globalUniforms.m_viewProjectionMatrix, output.m_position);
+
+#if ANKI_TECHNIQUE == ANKI_RENDERING_TECHNIQUE_GBUFFER
+	output.m_normal = mul(worldTransform, Vec4(vert.m_normal, 0.0));
+	output.m_tangent = mul(worldTransform, Vec4(vert.m_tangent.xyz, 0.0));
+	output.m_bitangent = cross(output.m_normal, output.m_tangent) * vert.m_tangent.w;
+#endif
+
+#if REALLY_VELOCITY
+	velocity(worldTransform, prevWorldTransform, prevPos, output);
+#endif
+
+	return output;
+}
+
+#pragma anki end
+
+#pragma anki start frag
+
+void doAlphaTest(RF32 alpha)
+{
+	if(alpha == 0.0)
+	{
+		discard;
+	}
+}
+
+#if ANKI_TECHNIQUE == ANKI_RENDERING_TECHNIQUE_SHADOWS
+void main(VertOut input)
+{
+	ANKI_MAYBE_UNUSED(input);
+#	if REALLY_ALPHA_TEST
+	const AnKiLocalUniforms localUniforms =
+		loadAnKiLocalUniforms(g_gpuScene, WaveReadLaneFirst(input.m_uniformsOffset));
+	const RVec4 diffColorA = g_bindlessTextures2dF32[localUniforms.m_diffTex].Sample(g_globalSampler, input.m_uv);
+	doAlphaTest(diffColorA.a);
+#	endif
+}
+#elif ANKI_TECHNIQUE == ANKI_RENDERING_TECHNIQUE_GBUFFER_EZ
+FragOut main(VertOut input)
+{
+	ANKI_MAYBE_UNUSED(input);
+#	if REALLY_ALPHA_TEST
+	const AnKiLocalUniforms localUniforms =
+		loadAnKiLocalUniforms(g_gpuScene, WaveReadLaneFirst(input.m_uniformsOffset));
+	const RVec4 diffColorA = g_bindlessTextures2dF32[localUniforms.m_diffTex].Sample(g_globalSampler, input.m_uv);
+	doAlphaTest(diffColorA.a);
+#	endif
+	return (FragOut)0;
+}
+#elif ANKI_TECHNIQUE == ANKI_RENDERING_TECHNIQUE_GBUFFER
+// Do normal mapping
+RVec3 readNormalFromTexture(VertOut input, Texture2D<RVec4> map, SamplerState sampl, Vec2 texCoords)
+{
+	// First read the texture
+	const RVec3 nAtTangentspace = normalize((map.Sample(sampl, texCoords).rgb - 0.5) * 2.0);
+
+	const RVec3 n = normalize(input.m_normal);
+	const RVec3 t = normalize(input.m_tangent);
+	const RVec3 b = normalize(input.m_bitangent);
+
+	const RMat3 tbnMat = constructMatrixColumns(t, b, n);
+
+	return mul(tbnMat, nAtTangentspace);
+}
+
+FragOut main(VertOut input)
+{
+	const AnKiLocalUniforms localUniforms =
+		loadAnKiLocalUniforms(g_gpuScene, WaveReadLaneFirst(input.m_uniformsOffset));
+
+#	if REALLY_USING_PARALLAX
+	// TODO
+	const Vec2 uv = input.m_uv;
+#	else
+	const Vec2 uv = input.m_uv;
+#	endif
+	ANKI_MAYBE_UNUSED(uv);
+
+#	if DIFFUSE_TEX
+#		if REALLY_ALPHA_TEST
+	const RVec4 diffColorA = g_bindlessTextures2dF32[localUniforms.m_diffTex].Sample(g_globalSampler, uv);
+	doAlphaTest(diffColorA.a);
+	const RVec3 diffColor = diffColorA.rgb;
+#		else
+	const RVec3 diffColor = g_bindlessTextures2dF32[localUniforms.m_diffTex].Sample(g_globalSampler, uv).rgb;
+#		endif
+#	else
+	const RVec3 diffColor = localUniforms.m_diffColor;
+#	endif
+
+#	if SPECULAR_TEX
+	const RVec3 specColor = g_bindlessTextures2dF32[localUniforms.m_specTex].Sample(g_globalSampler, uv).rgb;
+#	else
+	const RVec3 specColor = localUniforms.m_specColor;
+#	endif
+
+#	if ROUGHNESS_TEX
+	const RF32 roughness = g_bindlessTextures2dF32[localUniforms.m_roughnessTex].Sample(g_globalSampler, uv).g;
+#	else
+	const RF32 roughness = localUniforms.m_roughness;
+#	endif
+
+#	if METAL_TEX
+	const RF32 metallic = g_bindlessTextures2dF32[localUniforms.m_metallicTex].Sample(g_globalSampler, uv).b;
+#	else
+	const RF32 metallic = localUniforms.m_metallic;
+#	endif
+
+#	if NORMAL_TEX
+	const RVec3 normal =
+		readNormalFromTexture(input, g_bindlessTextures2dF32[localUniforms.m_normalTex], g_globalSampler, uv);
+#	else
+	const RVec3 normal = normalize(input.m_normal);
+#	endif
+
+#	if EMISSIVE_TEX
+	const RVec3 emission = g_bindlessTextures2dF32[localUniforms.m_emissiveTex].Sample(g_globalSampler, uv).rgb;
+#	else
+	const RVec3 emission = localUniforms.m_emission;
+#	endif
+
+#	if ANKI_VELOCITY || ANKI_BONES
+	const Vec2 prevNdc = input.m_prevClipXyw.xy / input.m_prevClipXyw.z;
+	const Vec2 crntNdc = input.m_crntClipXyw.xy / input.m_crntClipXyw.z;
+
+	// It's NDC_TO_UV(prevNdc) - NDC_TO_UV(crntNdc) or:
+	const Vec2 velocity = (prevNdc - crntNdc) * 0.5;
+#	else
+	const Vec2 velocity = Vec2(1.0, 1.0);
+#	endif
+
+	GbufferInfo g;
+	g.m_diffuse = diffColor;
+	g.m_normal = normal;
+	g.m_f0 = specColor;
+	g.m_roughness = roughness;
+	g.m_subsurface = localUniforms.m_subsurface;
+	g.m_emission = emission;
+	g.m_metallic = metallic;
+	g.m_velocity = velocity;
+
+	FragOut output;
+	packGBuffer(g, output.m_color0, output.m_color1, output.m_color2, output.m_color3);
+	return output;
+}
+#endif
+
+#pragma anki end

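Note: the vertex shader above unpacks its per-instance U32 with "& 3u" and ">> 2u", so the CPU side (not part of this commit) must pack the LOD into the two low bits and the RenderableGpuView2 byte offset into the remaining 30 bits. A hedged sketch of that inverse with standard types; the helper name is hypothetical:

    #include <cassert>
    #include <cstdint>

    // Inverse of the shader's loadRenderableGpuViewInstance().
    uint32_t packRenderableGpuViewInstance(uint32_t renderableByteOffset, uint32_t lod)
    {
        assert(lod < 4u && "Only 2 bits are reserved for the LOD");
        assert(renderableByteOffset < (1u << 30) && "The offset must survive the << 2");
        return (renderableByteOffset << 2u) | lod;
    }
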
+ 2 - 0
AnKi/Shaders/Include/Common.h

@@ -64,6 +64,8 @@ void maybeUnused(T a)
 #	define _ANKI_CONCATENATE(a, b) a##b
 #	define ANKI_CONCATENATE(a, b) _ANKI_CONCATENATE(a, b)
 
+#	define static_assert(x)
+
 #	define ANKI_BINDLESS_SET(s) \
 		[[vk::binding(0, s)]] Texture2D<uint4> g_bindlessTextures2dU32[kMaxBindlessTextures]; \
 		[[vk::binding(0, s)]] Texture2D<int4> g_bindlessTextures2dI32[kMaxBindlessTextures]; \

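Note: the new no-op static_assert exists because headers such as GpuSceneTypes.h compile both as C++ and as HLSL, and this HLSL dialect has no static_assert, so the HLSL path swallows the checks while C++ keeps enforcing them. A simplified sketch of the pattern; the guard macro is illustrative (the real one is the HLSL branch of Common.h) and the stub only covers the single-argument form:

    // In a header shared by C++ and HLSL:
    #ifdef COMPILING_AS_HLSL // hypothetical guard; AnKi keys this off its own macros
    #	define static_assert(x) // swallowed under HLSL
    #endif

    struct ExampleLod { unsigned m_words[8]; }; // illustrative type, 4-byte unsigned assumed
    static_assert(sizeof(ExampleLod) == 32); // checked by C++, ignored by HLSL
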
+ 24 - 13
AnKi/Shaders/Include/GpuSceneTypes.h

@@ -9,30 +9,41 @@
 
 ANKI_BEGIN_NAMESPACE
 
-/// Offset in DWORDs
-typedef U32 DwordOffset;
-
-/// @note All offsets in DWORD
+/// @note All offsets in bytes
 struct RenderableGpuView2
 {
-	DwordOffset m_worldTransformsOffset; ///< First is the crnt transform and the 2nd the previous
-	DwordOffset m_aabbOffset;
-	DwordOffset m_uniformsOffset;
-	DwordOffset m_meshOffset;
-	DwordOffset m_boneTransformsOffset;
-	DwordOffset m_previousBoneTransformsOffset;
+	U32 m_worldTransformsOffset; ///< First is the crnt transform and the 2nd the previous
+	U32 m_aabbOffset;
+	U32 m_uniformsOffset;
+	U32 m_meshOffset;
+	U32 m_boneTransformsOffset;
+	U32 m_previousBoneTransformsOffset;
+};
+
+struct MeshGpuViewLod
+{
+	U32 m_vertexOffsets[(U32)VertexStreamId::kMeshRelatedCount];
+	U32 m_indexCount;
+	U32 m_indexOffset; // TODO Decide on its type
 };
+static_assert(sizeof(MeshGpuViewLod) == sizeof(Vec4) * 2);
 
 struct MeshGpuView
 {
+	MeshGpuViewLod m_lods[kMaxLodCount];
+
 	Vec3 m_positionTranslation;
 	F32 m_positionScale;
+};
 
-	U32 m_vertexOffsets[kMaxLodCount][(U32)VertexStreamId::kMeshRelatedCount];
-	U32 m_indexCounts[kMaxLodCount];
-	U32 m_indexOffsets[kMaxLodCount];
+struct UnpackedRenderableGpuViewInstance
+{
+	U32 m_renderableGpuViewOffset;
+	U32 m_lod;
 };
 
+typedef U32 PackedRenderableGpuViewInstance;
+
 struct RenderableGpuView
 {
 	Mat3x4 m_worldTransform;

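Note: the static_assert above pins MeshGpuViewLod to exactly two Vec4s. Assuming the six mesh-related vertex streams that the shader's loadVertex() touches (position, normal, tangent, uv, bone ids, bone weights), the struct is (6 + 2) U32s = 32 bytes, which keeps the m_lods array tightly packed for ByteAddressBuffer loads. The same arithmetic as a standalone check:

    #include <cstdint>

    constexpr uint32_t kMeshRelatedCount = 6; // assumption, matching MeshTypes.h's format list
    constexpr uint32_t kLodSizeBytes = (kMeshRelatedCount + 2) * uint32_t(sizeof(uint32_t)); // + indexCount + indexOffset
    static_assert(kLodSizeBytes == 2 * 16, "Two Vec4s, as the header's static_assert demands");
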
+ 33 - 0
AnKi/Shaders/Include/MaterialTypes.h

@@ -46,6 +46,39 @@ constexpr U32 kMaterialBindingPreviousBoneTransforms = 3u;
 constexpr U32 kMaterialBindingFirstNonStandardLocal = 4u;
 // End local bindings
 
+/// Descriptor sets of the material shaders.
+enum class MaterialSet : U32
+{
+	kBindless,
+	kGlobal
+};
+
+/// Bindings in the MaterialSet::kGlobal descriptor set.
+enum class MaterialBinding : U32
+{
+	kTrilinearRepeatSampler,
+	kGlobalUniforms,
+	kGpuScene,
+
+	// Texture buffer bindings pointing to the unified geometry buffer:
+	kUnifiedGeometry_R16G16B16_Unorm,
+	kUnifiedGeometry_R8G8B8A8_Snorm,
+	kUnifiedGeometry_R32G32_Sfloat,
+	kUnifiedGeometry_R8G8B8A8_Uint,
+
+	// For FW shading:
+	kLinearClampSampler,
+	kDepthRt,
+	kLightVolume,
+	kClusterShadingUniforms,
+	kClusterShadingLights,
+	kClusters,
+	kShadowSampler,
+
+	kCount,
+	kFirst = 0
+};
+
 // Techniques
 #define ANKI_RENDERING_TECHNIQUE_GBUFFER 0
 #define ANKI_RENDERING_TECHNIQUE_GBUFFER_EZ 1

+ 10 - 0
AnKi/Shaders/Include/MeshTypes.h

@@ -69,4 +69,14 @@ inline constexpr Array<Format, U32(VertexStreamId::kMeshRelatedCount)> kMeshRela
 	Format::kR32G32_Sfloat,   Format::kR8G8B8A8_Uint,  Format::kR8G8B8A8_Snorm};
 #endif
 
+struct UnpackedMeshVertex
+{
+	Vec3 m_position;
+	RVec3 m_normal;
+	RVec4 m_tangent;
+	Vec2 m_uv;
+	UVec4 m_boneIndices;
+	RVec4 m_boneWeights;
+};
+
 ANKI_END_NAMESPACE

+ 1 - 1
AnKi/Util/System.h

@@ -16,7 +16,7 @@ namespace anki {
 /// @{
 
 /// Get the number of CPU cores
-U32 getCpuCoresCount();
+ANKI_PURE U32 getCpuCoresCount();
 
 /// @internal
 void backtraceInternal(const Function<void(CString)>& lambda);

+ 134 - 83
Tools/Shader/ShaderProgramBinaryDumpMain.cpp

@@ -6,6 +6,8 @@
 #include <AnKi/ShaderCompiler/ShaderProgramCompiler.h>
 #include <AnKi/ShaderCompiler/MaliOfflineCompiler.h>
 #include <AnKi/ShaderCompiler/RadeonGpuAnalyzer.h>
+#include <AnKi/Util/ThreadHive.h>
+#include <AnKi/Util/System.h>
 
 using namespace anki;
 
@@ -41,7 +43,7 @@ Error dumpStats(const ShaderProgramBinary& bin)
 {
 	HeapMemoryPool pool(allocAligned, nullptr);
 
-	printf("\nMali offline compiler stats:\n");
+	printf("\nOffline compiler stats:\n");
 	fflush(stdout);
 
 	class Stats
@@ -87,113 +89,162 @@ Error dumpStats(const ShaderProgramBinary& bin)
 		U32 m_count = 0;
 	};
 
-	Array<StageStats, U32(ShaderType::kCount)> allStats;
-
-	for(const ShaderProgramBinaryVariant& variant : bin.m_variants)
+	class Ctx
 	{
-		for(ShaderType shaderType : EnumIterable<ShaderType>())
+	public:
+		Array<StageStats, U32(ShaderType::kCount)> m_allStats;
+		Mutex m_allStatsMtx;
+		Atomic<U32> m_variantCount = {0};
+		HeapMemoryPool* m_pool = nullptr;
+		const ShaderProgramBinary* m_bin = nullptr;
+		Atomic<I32> m_error = {0};
+	};
+
+	Ctx ctx;
+	ctx.m_pool = &pool;
+	ctx.m_bin = &bin;
+
+	ThreadHive hive(8, &pool);
+
+	ThreadHiveTaskCallback callback = [](void* userData, [[maybe_unused]] U32 threadId,
+										 [[maybe_unused]] ThreadHive& hive,
+										 [[maybe_unused]] ThreadHiveSemaphore* signalSemaphore) {
+		Ctx& ctx = *static_cast<Ctx*>(userData);
+		U32 variantIdx;
+
+		while((variantIdx = ctx.m_variantCount.fetchAdd(1)) < ctx.m_bin->m_variants.getSize()
+			  && ctx.m_error.load() == 0)
 		{
-			if(variant.m_codeBlockIndices[shaderType] == kMaxU32)
+			const ShaderProgramBinaryVariant& variant = ctx.m_bin->m_variants[variantIdx];
+
+			for(ShaderType shaderType : EnumIterable<ShaderType>())
 			{
-				continue;
-			}
+				if(variant.m_codeBlockIndices[shaderType] == kMaxU32)
+				{
+					continue;
+				}
 
-			const ShaderProgramBinaryCodeBlock& codeBlock = bin.m_codeBlocks[variant.m_codeBlockIndices[shaderType]];
+				const ShaderProgramBinaryCodeBlock& codeBlock =
+					ctx.m_bin->m_codeBlocks[variant.m_codeBlockIndices[shaderType]];
 
-			// Arm stats
-			MaliOfflineCompilerOut maliocOut;
-			Error err = runMaliOfflineCompiler(
+				// Arm stats
+				MaliOfflineCompilerOut maliocOut;
+				Error err = runMaliOfflineCompiler(
 #if ANKI_OS_LINUX
-				ANKI_SOURCE_DIRECTORY "/ThirdParty/Bin/Linux64/MaliOfflineCompiler/malioc",
+					ANKI_SOURCE_DIRECTORY "/ThirdParty/Bin/Linux64/MaliOfflineCompiler/malioc",
 #elif ANKI_OS_WINDOWS
-				ANKI_SOURCE_DIRECTORY "/ThirdParty/Bin/Windows64/MaliOfflineCompiler/malioc.exe",
+					ANKI_SOURCE_DIRECTORY "/ThirdParty/Bin/Windows64/MaliOfflineCompiler/malioc.exe",
 #else
 #	error "Not supported"
 #endif
-				codeBlock.m_binary, shaderType, pool, maliocOut);
-
-			if(err)
-			{
-				ANKI_LOGE("Mali offline compiler failed");
-				return Error::kFunctionFailed;
-			}
-
-			// Appends stats
-			StageStats& stage = allStats[shaderType];
-
-			if(maliocOut.m_spilling)
-			{
-				++stage.m_spillingCount;
-			}
-
-			++stage.m_count;
-
-			stage.m_avgStats.m_arm.m_fma += maliocOut.m_fma;
-			stage.m_avgStats.m_arm.m_cvt += maliocOut.m_cvt;
-			stage.m_avgStats.m_arm.m_sfu += maliocOut.m_sfu;
-			stage.m_avgStats.m_arm.m_loadStore += maliocOut.m_loadStore;
-			stage.m_avgStats.m_arm.m_varying += maliocOut.m_varying;
-			stage.m_avgStats.m_arm.m_texture += maliocOut.m_texture;
-			stage.m_avgStats.m_arm.m_workRegisters += maliocOut.m_workRegisters;
-			stage.m_avgStats.m_arm.m_fp16ArithmeticPercentage += maliocOut.m_fp16ArithmeticPercentage;
-
-			stage.m_maxStats.m_arm.m_fma = max<F64>(stage.m_maxStats.m_arm.m_fma, maliocOut.m_fma);
-			stage.m_maxStats.m_arm.m_cvt = max<F64>(stage.m_maxStats.m_arm.m_cvt, maliocOut.m_cvt);
-			stage.m_maxStats.m_arm.m_sfu = max<F64>(stage.m_maxStats.m_arm.m_sfu, maliocOut.m_sfu);
-			stage.m_maxStats.m_arm.m_loadStore = max<F64>(stage.m_maxStats.m_arm.m_loadStore, maliocOut.m_loadStore);
-			stage.m_maxStats.m_arm.m_varying = max<F64>(stage.m_maxStats.m_arm.m_varying, maliocOut.m_varying);
-			stage.m_maxStats.m_arm.m_texture = max<F64>(stage.m_maxStats.m_arm.m_texture, maliocOut.m_texture);
-			stage.m_maxStats.m_arm.m_workRegisters =
-				max<F64>(stage.m_maxStats.m_arm.m_workRegisters, maliocOut.m_workRegisters);
-			stage.m_maxStats.m_arm.m_fp16ArithmeticPercentage =
-				max<F64>(stage.m_maxStats.m_arm.m_fp16ArithmeticPercentage, maliocOut.m_fp16ArithmeticPercentage);
-
-			stage.m_minStats.m_arm.m_fma = min<F64>(stage.m_minStats.m_arm.m_fma, maliocOut.m_fma);
-			stage.m_minStats.m_arm.m_cvt = min<F64>(stage.m_minStats.m_arm.m_cvt, maliocOut.m_cvt);
-			stage.m_minStats.m_arm.m_sfu = min<F64>(stage.m_minStats.m_arm.m_sfu, maliocOut.m_sfu);
-			stage.m_minStats.m_arm.m_loadStore = min<F64>(stage.m_minStats.m_arm.m_loadStore, maliocOut.m_loadStore);
-			stage.m_minStats.m_arm.m_varying = min<F64>(stage.m_minStats.m_arm.m_varying, maliocOut.m_varying);
-			stage.m_minStats.m_arm.m_texture = min<F64>(stage.m_minStats.m_arm.m_texture, maliocOut.m_texture);
-			stage.m_minStats.m_arm.m_workRegisters =
-				min<F64>(stage.m_minStats.m_arm.m_workRegisters, maliocOut.m_workRegisters);
-			stage.m_minStats.m_arm.m_fp16ArithmeticPercentage =
-				min<F64>(stage.m_minStats.m_arm.m_fp16ArithmeticPercentage, maliocOut.m_fp16ArithmeticPercentage);
-
-			// AMD
-			RgaOutput rgaOut;
-			err = runRadeonGpuAnalyzer(
+					codeBlock.m_binary, shaderType, *ctx.m_pool, maliocOut);
+
+				if(err)
+				{
+					ANKI_LOGE("Mali offline compiler failed");
+					ctx.m_error.store(1);
+					break;
+				}
+
+				// AMD
+				RgaOutput rgaOut;
+				err = runRadeonGpuAnalyzer(
 #if ANKI_OS_LINUX
-				ANKI_SOURCE_DIRECTORY "/ThirdParty/Bin/Linux64/RadeonGpuAnalyzer/rga",
+					ANKI_SOURCE_DIRECTORY "/ThirdParty/Bin/Linux64/RadeonGpuAnalyzer/rga",
 #elif ANKI_OS_WINDOWS
-				ANKI_SOURCE_DIRECTORY "/ThirdParty/Bin/Windows64/RadeonGpuAnalyzer/rga.exe",
+					ANKI_SOURCE_DIRECTORY "/ThirdParty/Bin/Windows64/RadeonGpuAnalyzer/rga.exe",
 #else
 #	error "Not supported"
 #endif
-				codeBlock.m_binary, shaderType, pool, rgaOut);
+					codeBlock.m_binary, shaderType, *ctx.m_pool, rgaOut);
+
+				if(err)
+				{
+					ANKI_LOGE("Radeon GPU Analyzer compiler failed");
+					ctx.m_error.store(1);
+					break;
+				}
+
+				// Appends stats
+				LockGuard lock(ctx.m_allStatsMtx);
+
+				StageStats& stage = ctx.m_allStats[shaderType];
+
+				if(maliocOut.m_spilling)
+				{
+					++stage.m_spillingCount;
+				}
+
+				++stage.m_count;
+
+				stage.m_avgStats.m_arm.m_fma += maliocOut.m_fma;
+				stage.m_avgStats.m_arm.m_cvt += maliocOut.m_cvt;
+				stage.m_avgStats.m_arm.m_sfu += maliocOut.m_sfu;
+				stage.m_avgStats.m_arm.m_loadStore += maliocOut.m_loadStore;
+				stage.m_avgStats.m_arm.m_varying += maliocOut.m_varying;
+				stage.m_avgStats.m_arm.m_texture += maliocOut.m_texture;
+				stage.m_avgStats.m_arm.m_workRegisters += maliocOut.m_workRegisters;
+				stage.m_avgStats.m_arm.m_fp16ArithmeticPercentage += maliocOut.m_fp16ArithmeticPercentage;
+
+				stage.m_maxStats.m_arm.m_fma = max<F64>(stage.m_maxStats.m_arm.m_fma, maliocOut.m_fma);
+				stage.m_maxStats.m_arm.m_cvt = max<F64>(stage.m_maxStats.m_arm.m_cvt, maliocOut.m_cvt);
+				stage.m_maxStats.m_arm.m_sfu = max<F64>(stage.m_maxStats.m_arm.m_sfu, maliocOut.m_sfu);
+				stage.m_maxStats.m_arm.m_loadStore =
+					max<F64>(stage.m_maxStats.m_arm.m_loadStore, maliocOut.m_loadStore);
+				stage.m_maxStats.m_arm.m_varying = max<F64>(stage.m_maxStats.m_arm.m_varying, maliocOut.m_varying);
+				stage.m_maxStats.m_arm.m_texture = max<F64>(stage.m_maxStats.m_arm.m_texture, maliocOut.m_texture);
+				stage.m_maxStats.m_arm.m_workRegisters =
+					max<F64>(stage.m_maxStats.m_arm.m_workRegisters, maliocOut.m_workRegisters);
+				stage.m_maxStats.m_arm.m_fp16ArithmeticPercentage =
+					max<F64>(stage.m_maxStats.m_arm.m_fp16ArithmeticPercentage, maliocOut.m_fp16ArithmeticPercentage);
+
+				stage.m_minStats.m_arm.m_fma = min<F64>(stage.m_minStats.m_arm.m_fma, maliocOut.m_fma);
+				stage.m_minStats.m_arm.m_cvt = min<F64>(stage.m_minStats.m_arm.m_cvt, maliocOut.m_cvt);
+				stage.m_minStats.m_arm.m_sfu = min<F64>(stage.m_minStats.m_arm.m_sfu, maliocOut.m_sfu);
+				stage.m_minStats.m_arm.m_loadStore =
+					min<F64>(stage.m_minStats.m_arm.m_loadStore, maliocOut.m_loadStore);
+				stage.m_minStats.m_arm.m_varying = min<F64>(stage.m_minStats.m_arm.m_varying, maliocOut.m_varying);
+				stage.m_minStats.m_arm.m_texture = min<F64>(stage.m_minStats.m_arm.m_texture, maliocOut.m_texture);
+				stage.m_minStats.m_arm.m_workRegisters =
+					min<F64>(stage.m_minStats.m_arm.m_workRegisters, maliocOut.m_workRegisters);
+				stage.m_minStats.m_arm.m_fp16ArithmeticPercentage =
+					min<F64>(stage.m_minStats.m_arm.m_fp16ArithmeticPercentage, maliocOut.m_fp16ArithmeticPercentage);
+
+				stage.m_avgStats.m_amd.m_vgprCount += F64(rgaOut.m_vgprCount);
+				stage.m_avgStats.m_amd.m_sgprCount += F64(rgaOut.m_sgprCount);
+				stage.m_avgStats.m_amd.m_isaSize += F64(rgaOut.m_isaSize);
+
+				stage.m_minStats.m_amd.m_vgprCount = min(stage.m_minStats.m_amd.m_vgprCount, F64(rgaOut.m_vgprCount));
+				stage.m_minStats.m_amd.m_sgprCount = min(stage.m_minStats.m_amd.m_sgprCount, F64(rgaOut.m_sgprCount));
+				stage.m_minStats.m_amd.m_isaSize = min(stage.m_minStats.m_amd.m_isaSize, F64(rgaOut.m_isaSize));
+
+				stage.m_maxStats.m_amd.m_vgprCount = max(stage.m_maxStats.m_amd.m_vgprCount, F64(rgaOut.m_vgprCount));
+				stage.m_maxStats.m_amd.m_sgprCount = max(stage.m_maxStats.m_amd.m_sgprCount, F64(rgaOut.m_sgprCount));
+				stage.m_maxStats.m_amd.m_isaSize = max(stage.m_maxStats.m_amd.m_isaSize, F64(rgaOut.m_isaSize));
+			}
 
-			if(err)
+			if(variantIdx > 0 && ((variantIdx + 1) % 32) == 0)
 			{
-				ANKI_LOGE("Radeon GPU Analyzer compiler failed");
-				return Error::kFunctionFailed;
+				printf("Processed %u out of %u variants\n", variantIdx + 1, ctx.m_bin->m_variants.getSize());
 			}
+		} // while
+	};
 
-			stage.m_avgStats.m_amd.m_vgprCount += F64(rgaOut.m_vgprCount);
-			stage.m_avgStats.m_amd.m_sgprCount += F64(rgaOut.m_sgprCount);
-			stage.m_avgStats.m_amd.m_isaSize += F64(rgaOut.m_isaSize);
+	for(U32 i = 0; i < hive.getThreadCount(); ++i)
+	{
+		hive.submitTask(callback, &ctx);
+	}
 
-			stage.m_minStats.m_amd.m_vgprCount = min(stage.m_minStats.m_amd.m_vgprCount, F64(rgaOut.m_vgprCount));
-			stage.m_minStats.m_amd.m_sgprCount = min(stage.m_minStats.m_amd.m_sgprCount, F64(rgaOut.m_sgprCount));
-			stage.m_minStats.m_amd.m_isaSize = min(stage.m_minStats.m_amd.m_isaSize, F64(rgaOut.m_isaSize));
+	hive.waitAllTasks();
 
-			stage.m_maxStats.m_amd.m_vgprCount = max(stage.m_maxStats.m_amd.m_vgprCount, F64(rgaOut.m_vgprCount));
-			stage.m_maxStats.m_amd.m_sgprCount = max(stage.m_maxStats.m_amd.m_sgprCount, F64(rgaOut.m_sgprCount));
-			stage.m_maxStats.m_amd.m_isaSize = max(stage.m_maxStats.m_amd.m_isaSize, F64(rgaOut.m_isaSize));
-		}
+	if(ctx.m_error.load() != 0)
+	{
+		return Error::kFunctionFailed;
 	}
 
 	for(ShaderType shaderType : EnumIterable<ShaderType>())
 	{
-		const StageStats& stage = allStats[shaderType];
+		const StageStats& stage = ctx.m_allStats[shaderType];
 		if(stage.m_count == 0)
 		{
 			continue;
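
Note: the rewritten dumpStats() distributes work with an atomic cursor: every ThreadHive task keeps fetching the next variant index, runs both offline compilers on it, and merges the stats under a mutex, while a shared atomic error flag lets all workers drain out early on failure. A standalone sketch of the same pattern with standard threads in place of ThreadHive; processVariant() stands in for the malioc and RGA invocations:

    #include <algorithm>
    #include <atomic>
    #include <cstdint>
    #include <mutex>
    #include <thread>
    #include <vector>

    struct Ctx
    {
        std::atomic<uint32_t> m_cursor{0}; // Next variant to hand out
        std::atomic<bool> m_error{false}; // Cooperative early-exit flag
        std::mutex m_statsMtx; // Guards the merged stats
        double m_mergedStat = 0.0; // Stands in for the min/max/avg tables
    };

    static bool processVariant(uint32_t idx, double& outStat) // Stands in for malioc + RGA
    {
        outStat = double(idx);
        return true;
    }

    int main()
    {
        const uint32_t variantCount = 1000;
        Ctx ctx;

        auto worker = [&ctx, variantCount] {
            uint32_t idx;
            while((idx = ctx.m_cursor.fetch_add(1)) < variantCount && !ctx.m_error.load())
            {
                double stat;
                if(!processVariant(idx, stat))
                {
                    ctx.m_error.store(true); // Everyone else stops at the next loop check
                    break;
                }

                std::lock_guard<std::mutex> lock(ctx.m_statsMtx); // Merging is cheap next to the compiles
                ctx.m_mergedStat += stat;
            }
        };

        std::vector<std::thread> threads;
        const uint32_t threadCount = std::max(1u, std::thread::hardware_concurrency());
        for(uint32_t i = 0; i < threadCount; ++i)
        {
            threads.emplace_back(worker);
        }
        for(std::thread& t : threads)
        {
            t.join();
        }

        return ctx.m_error.load() ? 1 : 0;
    }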