浏览代码

MeshBatch : Refacto to allow multiple materials on mesh batch using sub meshes

TothBenoit 3 天之前
父节点
当前提交
894002219e
共有 4 个文件被更改,包括 548 次插入418 次删除
  1. 11 8
      h3d/GPUCounter.hx
  2. 146 150
      h3d/scene/GPUMeshBatch.hx
  3. 268 164
      h3d/scene/MeshBatch.hx
  4. 123 96
      h3d/shader/InstanceIndirect.hx

+ 11 - 8
h3d/GPUCounter.hx

@@ -3,11 +3,13 @@ package h3d;
 class GPUCounter {
 	public var buffer(default, null) : h3d.Buffer;
 	var accessor : haxe.io.Bytes;
+	var size : Int;
 
-	public function new() {
+	public function new( size : Int = 1 ) {
+		this.size = size;
 		var alloc = hxd.impl.Allocator.get();
-		buffer = alloc.allocBuffer(1,hxd.BufferFormat.INDEX32, UniformReadWrite);
-		accessor = haxe.io.Bytes.alloc(4);
+		buffer = alloc.allocBuffer(size, hxd.BufferFormat.INDEX32, UniformReadWrite);
+		accessor = haxe.io.Bytes.alloc(size << 2);
 	}
 
 	public function dispose(){
@@ -15,13 +17,14 @@ class GPUCounter {
 		alloc.disposeBuffer(buffer);
 	}
 
-	public function get() : Int {
-		buffer.readBytes(accessor, 0, 1);
+	public function get( index : Int = 0 ) : Int {
+		buffer.readBytes(accessor, 0, 1, index);
 		return accessor.getInt32(0);
 	}
 
-	public function reset(){
-		accessor.setInt32(0, 0);
-		buffer.uploadBytes(accessor, 0,1);
+	public function reset() {
+		for ( i in 0...size )
+			accessor.setInt32(i << 2, 0);
+		buffer.uploadBytes(accessor, 0, size);
 	}
 }

+ 146 - 150
h3d/scene/GPUMeshBatch.hx

@@ -1,21 +1,23 @@
 package h3d.scene;
 
 import h3d.scene.MeshBatch.BatchData;
-import h3d.scene.MeshBatch.MeshBatchPart;
 
 class GPUMeshBatch extends MeshBatch {
 
 	static var INDIRECT_DRAW_ARGUMENTS_FMT = hxd.BufferFormat.make([{ name : "", type : DVec4 }, { name : "", type : DFloat }]);
-	static var INSTANCE_OFFSETS_FMT = hxd.BufferFormat.make([{ name : "", type : DFloat }]);
-
-	var matInfos : h3d.Buffer;
-	var emittedSubParts : Array<MeshBatch.MeshBatchPart>;
-	var currentSubParts : Int;
-	var currentMaterialOffset : Int;
-	var instanceOffsetsCpu : haxe.io.Bytes;
-	var instanceOffsetsGpu : h3d.Buffer;
+	static var INSTANCES_INFOS_FMT = hxd.BufferFormat.make([{ name : "", type : DFloat }]);
+	inline static var INSTANCES_INFOS_ELEMENT_COUNT = 1;
+	inline static var SUB_MESHES_INFOS_ELEMENT_COUNT = 4;
+	inline static var SUB_PARTS_INFOS_ELEMENT_COUNT = 4;
+
+	var cpuInstancesInfos : haxe.io.Bytes;
+	var gpuInstancesInfos : h3d.Buffer;
+
+	var subPartsEmitted : Int = 0;
+	var materialsEmitted : Array<Float>;
+
+	var subMeshesInfos : h3d.Buffer;
 	var subPartsInfos : h3d.Buffer;
-	var materialCount : Int;
 
 	public var computePass : h3d.mat.Pass;
 	public var commandBuffer : h3d.Buffer;
@@ -44,7 +46,7 @@ class GPUMeshBatch extends MeshBatch {
 	 * Has effects only if a lod is available in the primitive.
 	 */
 	public function enableGpuLod() {
-		gpuLodEnabled = primitiveSubParts != null || getPrimitive().lodCount() > 1;
+		gpuLodEnabled = primitiveSubMeshes != null || getPrimitive().lodCount() > 1;
 		return gpuLodEnabled;
 	}
 
@@ -58,140 +60,135 @@ class GPUMeshBatch extends MeshBatch {
 	function getLodCount() return gpuLodEnabled ? getPrimitive().lodCount() : 1;
 	override function updateHasPrimitiveOffset() meshBatchFlags.set(HasPrimitiveOffset);
 
-	override function begin( emitCountTip = -1) {
+	override function begin( emitCountTip = -1 ) {
 		if ( !gpuLodEnabled && !gpuCullingEnabled )
 			throw "No need to create a GPUMeshBatch without gpu lod nor gpu culling, create a regular MeshBatch instead";
+		subPartsEmitted = 0;
+		materialsEmitted = [for ( _ in 0...materials.length) 0.0];
+		return super.begin(emitCountTip);
+	}
 
-		emitCountTip = super.begin(emitCountTip);
+	override function initSubMeshResources( emitCountTip ) {
+		if ( cpuInstancesInfos == null ) {
+			var instanceInfosByteSize = INSTANCES_INFOS_ELEMENT_COUNT << 2;
+			cpuInstancesInfos = haxe.io.Bytes.alloc( emitCountTip * instanceInfosByteSize );
+		}
+	}
 
-		if ( primitiveSubParts != null && ( gpuCullingEnabled || gpuLodEnabled ) && instanceOffsetsCpu == null ) {
-			var size = emitCountTip * 2 * 4;
-			instanceOffsetsCpu = haxe.io.Bytes.alloc(size);
+	/**
+		Records which sub mesh the instance currently being emitted uses, and
+		updates the emitted sub part / per material counters.
+		Throws "Invalid subMeshIndex" when no sub mesh exists at `subMeshIndex`.
+	**/
+	override function emitSubMesh(subMeshIndex : Int) {
+		var subMesh = getSubMesh(subMeshIndex);
+		if ( subMesh == null )
+			throw "Invalid subMeshIndex";
+
+		var instanceInfosByteSize = INSTANCES_INFOS_ELEMENT_COUNT << 2;
+		var minInstanceInfosSize = ( instanceCount + 1 ) * instanceInfosByteSize;
+		if ( cpuInstancesInfos.length < minInstanceInfosSize ) {
+			// Growing by x1.5 alone can stay below the required size for tiny buffers
+			// (0 -> 0 bytes, 4 -> 6 bytes), so never allocate less than what is needed.
+			var next = haxe.io.Bytes.alloc(hxd.Math.imax(minInstanceInfosSize, Std.int(cpuInstancesInfos.length * 3 / 2)));
+			next.blit(0, cpuInstancesInfos, 0, cpuInstancesInfos.length);
+			cpuInstancesInfos = next;
 		}

-		return emitCountTip;
+		subPartsEmitted += subMesh.subParts.length;
+		for ( subPart in subMesh.subParts )
+			materialsEmitted[subPart.matIndex] += 1.0;
+
+		cpuInstancesInfos.setInt32(instanceCount << 2, subMeshIndex);
 	}
 
-	override function emitPrimitiveSubParts() {
-		if ( primitiveSubParts.length > 1 )
-			throw "Multi material with gpu instancing is not supported";
-		var primitiveSubPart = primitiveSubParts[0];
-		if (emittedSubParts == null) {
-			currentSubParts = 0;
-			currentMaterialOffset = 0;
-			emittedSubParts = [ primitiveSubPart.clone() ];
-		} else {
-			var currentIndexStart = emittedSubParts[currentSubParts].indexStart;
-			if ( currentIndexStart != primitiveSubPart.indexStart  ) {
-				currentSubParts = -1;
-				currentIndexStart = primitiveSubPart.indexStart;
-				currentMaterialOffset = 0;
-				for ( i => part in emittedSubParts ) {
-					if ( part.indexStart == currentIndexStart ) {
-						currentSubParts = i;
-						break;
+	/**
+		Uploads the per instance sub mesh indices to the GPU and, the first time it is
+		called (or after the buffers were released), builds the two lookup buffers:
+		- subMeshesInfos : one vec4 per sub mesh (radius, lod count, first sub part entry, sub part count)
+		- subPartsInfos : one vec4 per sub part and per lod (index count, index start, lod config value, material index)
+	**/
+	override function flushSubMeshResources() {
+		var alloc = hxd.impl.Allocator.get();
+		var upload = needUpload;
+
+		var instancesInfosElementCount = instanceCount * INSTANCES_INFOS_ELEMENT_COUNT ;
+		if ( gpuInstancesInfos == null || gpuInstancesInfos.isDisposed() || instancesInfosElementCount > gpuInstancesInfos.vertices ) {
+			if ( gpuInstancesInfos != null)
+				alloc.disposeBuffer( gpuInstancesInfos );
+			gpuInstancesInfos = alloc.allocBuffer( instancesInfosElementCount, INSTANCES_INFOS_FMT, UniformReadWrite );
+			upload = true;
+		}
+
+		if ( upload )
+			gpuInstancesInfos.uploadBytes( cpuInstancesInfos, 0, instancesInfosElementCount );
+
+		if ( subMeshesInfos == null ) {
+			var tmpSubMeshesInfos = alloc.allocFloats( SUB_MESHES_INFOS_ELEMENT_COUNT * primitiveSubMeshes.length );
+
+			var pos = 0;
+			var subPartsCount = 0;
+			var subPartsStart = 0;
+			for ( subMesh in primitiveSubMeshes ) {
+				tmpSubMeshesInfos[pos++] = subMesh.bounds.dimension() * 0.5;
+				tmpSubMeshesInfos[pos++] = subMesh.lodCount;
+				tmpSubMeshesInfos[pos++] = subPartsStart;
+				tmpSubMeshesInfos[pos++] = subMesh.subParts.length;
+				// One entry is written below per sub part AND per lod, so the lods must be counted too.
+				subPartsCount += subMesh.subParts.length * subMesh.lodCount;
+				subPartsStart += subMesh.subParts.length * subMesh.lodCount;
+			}
+			subMeshesInfos = alloc.ofFloats( tmpSubMeshesInfos, hxd.BufferFormat.VEC4_DATA, Uniform );
+			alloc.disposeFloats(tmpSubMeshesInfos);
+
+			pos = 0;
+			var tmpSubPartsInfos = alloc.allocFloats( SUB_PARTS_INFOS_ELEMENT_COUNT * subPartsCount );
+			for ( subMesh in primitiveSubMeshes ) {
+				var lodCount = subMesh.lodCount;
+				var lodConfig = subMesh.lodConfig;
+				var lodConfigHasCulling = lodConfig.length > lodCount - 1;
+				var minScreenRatioCulling = lodConfigHasCulling ? lodConfig[lodConfig.length - 1] : 0.0;
+				for ( subPart in subMesh.subParts ) {
+					tmpSubPartsInfos[pos++] = subPart.indexCount;
+					tmpSubPartsInfos[pos++] = subPart.indexStart;
+					tmpSubPartsInfos[pos++] = 0 < lodConfig.length ? lodConfig[0] : 0.0;
+					tmpSubPartsInfos[pos++] = subPart.matIndex;
+					for ( i in 1...lodCount ) {
+						tmpSubPartsInfos[pos++] = subPart.lodIndexCount[i - 1];
+						tmpSubPartsInfos[pos++] = subPart.lodIndexStart[i - 1];
+						tmpSubPartsInfos[pos++] = i < lodConfig.length ? lodConfig[i] : 0.0;
+						tmpSubPartsInfos[pos++] = subPart.matIndex;
 					}
-					currentMaterialOffset += part.lodIndexCount.length + 1;
-				}
-				if ( currentSubParts < 0 ) {
-					currentSubParts = emittedSubParts.length;
-					emittedSubParts.push( primitiveSubPart.clone() );
+					// The last lod entry of the sub part carries the culling ratio instead of its lod config value.
+					tmpSubPartsInfos[pos - 2] = minScreenRatioCulling;
 				}
 			}
+
+			subPartsInfos = alloc.ofFloats( tmpSubPartsInfos, hxd.BufferFormat.VEC4_DATA, Uniform );
+			alloc.disposeFloats(tmpSubPartsInfos);
 		}
-		var maxInstanceID = ( instanceCount + 1 ) * 2;
-		if ( instanceOffsetsCpu.length < maxInstanceID * 4 ) {
-			var next = haxe.io.Bytes.alloc(Std.int(instanceOffsetsCpu.length*3/2));
-			next.blit(0, instanceOffsetsCpu, 0, instanceOffsetsCpu.length);
-			instanceOffsetsCpu = next;
-		}
-		instanceOffsetsCpu.setInt32((instanceCount * 2 + 0) * 4, currentMaterialOffset);
-		instanceOffsetsCpu.setInt32((instanceCount * 2 + 1) * 4, currentSubParts);
 	}
 
 	override function flush() {
 		var alloc = hxd.impl.Allocator.get();
-		var lodCount = getLodCount();
-		materialCount = materials.length;
-		var prim = getPrimitive();
-		var hmd = Std.downcast(prim, h3d.prim.HMDModel);
-
-		if ( emittedSubParts != null ) {
-			var upload = needUpload;
-			var vertex = instanceCount * 2;
-			if ( instanceOffsetsGpu == null || instanceOffsetsGpu.isDisposed() || vertex > instanceOffsetsGpu.vertices ) {
-				if ( instanceOffsetsGpu != null)
-					alloc.disposeBuffer( instanceOffsetsGpu );
-				instanceOffsetsGpu = alloc.allocBuffer( vertex, INSTANCE_OFFSETS_FMT, UniformReadWrite );
-				upload = true;
-			}
-			if ( upload )
-				instanceOffsetsGpu.uploadBytes( instanceOffsetsCpu, 0, vertex );
-			if ( matInfos == null ) {
-				materialCount = 0;
-				var tmpSubPartInfos = alloc.allocFloats( 2 * emittedSubParts.length );
-				var pos = 0;
-				for ( subPart in emittedSubParts ) {
-					var lodCount = subPart.lodIndexCount.length + 1;
-					tmpSubPartInfos[pos++] = lodCount;
-					tmpSubPartInfos[pos++] = subPart.bounds.dimension() * 0.5;
-					materialCount += lodCount;
-				}
-				subPartsInfos = alloc.ofFloats( tmpSubPartInfos, hxd.BufferFormat.VEC4_DATA, Uniform );
-				alloc.disposeFloats(tmpSubPartInfos);
-
-				var tmpMatInfos = alloc.allocFloats( 4 * ( materialCount + emittedSubParts.length ) );
-				pos = 0;
-				for ( subPart in emittedSubParts ) {
-					var maxLod = subPart.lodIndexCount.length;
-					var lodConfig = subPart.lodConfig;
-					tmpMatInfos[pos++] = subPart.indexCount;
-					tmpMatInfos[pos++] = subPart.indexStart;
-					tmpMatInfos[pos++] = ( 0 < lodConfig.length ) ? lodConfig[0] : 0.0;
-					tmpMatInfos[pos++] = ( maxLod < lodConfig.length && maxLod > 0 ) ? lodConfig[lodConfig.length - 1] : 0.0;
-					for ( i in 0...maxLod ) {
-						tmpMatInfos[pos++] = subPart.lodIndexCount[i];
-						tmpMatInfos[pos++] = subPart.lodIndexStart[i];
-						tmpMatInfos[pos++] = ( i + 1 < lodConfig.length ) ? lodConfig[i + 1] : 0.0;
-						pos++;
-					}
-				}
+		var materialCount = materials.length;
 
-				matInfos = alloc.ofFloats( tmpMatInfos, hxd.BufferFormat.VEC4_DATA, Uniform );
-				alloc.disposeFloats(tmpMatInfos);
-			}
-		} else if ( matInfos == null ) {
+		if ( !hasSubMeshes() ) {
+			var prim = getPrimitive();
 			if ( gpuLodEnabled ) {
-				var tmpMatInfos = alloc.allocFloats( 4 * materialCount * lodCount );
-				matInfos = alloc.allocBuffer( materialCount * lodCount, hxd.BufferFormat.VEC4_DATA, Uniform );
+				var lodCount = getLodCount();
+				var tmpSubPartsInfos = alloc.allocFloats( SUB_PARTS_INFOS_ELEMENT_COUNT * materialCount * lodCount );
+				var hmd = Std.downcast(prim, h3d.prim.HMDModel);
 				var lodConfig = hmd.getLodConfig();
-				var startIndex : Int = 0;
 				var lodConfigHasCulling = lodConfig.length > lodCount - 1;
-				var minScreenRatioCulling = lodConfigHasCulling ? lodConfig[lodConfig.length-1] : 0.0;
-				for ( i => lod in @:privateAccess hmd.lods ) {
-					for ( j in 0...materialCount ) {
-						var indexCount = lod.indexCounts[j];
-						var matIndex = i + j * lodCount;
-						tmpMatInfos[matIndex * 4 + 0] = indexCount;
-						tmpMatInfos[matIndex * 4 + 1] = startIndex;
-						tmpMatInfos[matIndex * 4 + 2] = ( i < lodConfig.length ) ? lodConfig[i] : 0.0;
-						tmpMatInfos[matIndex * 4 + 3] = minScreenRatioCulling;
-						startIndex += indexCount;
+				var minScreenRatioCulling = lodConfigHasCulling ? lodConfig[lodConfig.length - 1] : 0.0;
+				var pos = 0;
+				for ( matIndex in 0...materialCount ) {
+					for ( lodIndex in 0...lodCount ) {
+						tmpSubPartsInfos[pos++] = hmd.getMaterialIndexCount(matIndex, lodIndex);
+						tmpSubPartsInfos[pos++] = hmd.getMaterialIndexStart(matIndex, lodIndex);
+						tmpSubPartsInfos[pos++] = lodIndex < lodConfig.length ? lodConfig[lodIndex] : 0.0;
+						tmpSubPartsInfos[pos++] = matIndex;
 					}
+					tmpSubPartsInfos[pos - 2] = minScreenRatioCulling;
 				}
-				matInfos.uploadFloats( tmpMatInfos, 0, materialCount * lodCount );
-				alloc.disposeFloats( tmpMatInfos );
+
+				subPartsInfos = alloc.ofFloats( tmpSubPartsInfos, hxd.BufferFormat.VEC4_DATA, Uniform );
+				alloc.disposeFloats( tmpSubPartsInfos );
 			} else {
-				var tmpMatInfos = alloc.allocFloats( 4 * materialCount );
-				matInfos = alloc.allocBuffer( materialCount, hxd.BufferFormat.VEC4_DATA, Uniform );
+				var tmpSubPartsInfos = alloc.allocFloats( SUB_PARTS_INFOS_ELEMENT_COUNT * materialCount );
 				var pos : Int = 0;
 				for ( i in 0...materials.length ) {
-					tmpMatInfos[pos++] = prim.getMaterialIndexCount(i);
-					tmpMatInfos[pos++] = prim.getMaterialIndexStart(i);
-					pos += 2;
+					tmpSubPartsInfos[pos++] = prim.getMaterialIndexCount(i);
+					tmpSubPartsInfos[pos++] = prim.getMaterialIndexStart(i);
+					tmpSubPartsInfos[pos++] = 0.0;
+					tmpSubPartsInfos[pos++] = i;
 				}
-				matInfos.uploadFloats( tmpMatInfos, 0, materialCount );
-				alloc.disposeFloats( tmpMatInfos );
+				subPartsInfos = alloc.ofFloats( tmpSubPartsInfos, hxd.BufferFormat.VEC4_DATA, Uniform );
+				alloc.disposeFloats( tmpSubPartsInfos );
 			}
 		}
 
@@ -199,7 +196,7 @@ class GPUMeshBatch extends MeshBatch {
 
 		var computeShader : h3d.shader.InstanceIndirect.InstanceIndirectBase;
 		if( computePass == null ) {
-			computeShader = emittedSubParts != null ? new h3d.shader.InstanceIndirect.SubPartInstanceIndirect() : new h3d.shader.InstanceIndirect();
+			computeShader = hasSubMeshes() ? new h3d.shader.InstanceIndirect.SubPartInstanceIndirect() : new h3d.shader.InstanceIndirect();
 			computePass = new h3d.mat.Pass("batchUpdate");
 			computePass.addShader(computeShader);
 			addComputeShaders(computePass);
@@ -211,42 +208,39 @@ class GPUMeshBatch extends MeshBatch {
 		computeShader.ENABLE_CULLING = gpuCullingEnabled;
 		computeShader.ENABLE_DISTANCE_CLIPPING = maxDistance >= 0;
 		computeShader.maxDistance = maxDistance;
-		computeShader.MAX_MATERIAL_COUNT = 16;
-		while ( materialCount * lodCount > computeShader.MAX_MATERIAL_COUNT )
-			computeShader.MAX_MATERIAL_COUNT = computeShader.MAX_MATERIAL_COUNT + 16;
-		computeShader.matInfos = matInfos;
+
+		computeShader.subPartsInfos = subPartsInfos;
 		computeShader.instanceCount = instanceCount;
 
 		var commandCountNeeded : Int;
-		if ( emittedSubParts != null ) {
-			commandCountNeeded = instanceCount;
+		if ( hasSubMeshes() ) {
+			commandCountNeeded = subPartsEmitted;
 			var computeShader : h3d.shader.InstanceIndirect.SubPartInstanceIndirect = cast computeShader;
-			computeShader.subPartCount = emittedSubParts.length;
-			computeShader.subPartInfos = subPartsInfos;
-			computeShader.instanceOffsets = instanceOffsetsGpu;
-			computeShader.MAX_SUB_PART_BUFFER_ELEMENT_COUNT = 16;
-			var maxSubPartsElement = hxd.Math.ceil( emittedSubParts.length / 2 );
-			while ( maxSubPartsElement > computeShader.MAX_SUB_PART_BUFFER_ELEMENT_COUNT )
-				computeShader.MAX_SUB_PART_BUFFER_ELEMENT_COUNT = computeShader.MAX_SUB_PART_BUFFER_ELEMENT_COUNT + 16;
+			computeShader.MATERIAL_COUNT = materialCount;
+			var materialCommandStart = [new h3d.Vector4()];
+			for ( i in 1...materialCount )
+				materialCommandStart.push(new h3d.Vector4(materialsEmitted[i-1]));
+			computeShader.materialCommandStart = materialCommandStart;
+			computeShader.subMeshesInfos = subMeshesInfos;
+			computeShader.instancesInfos = gpuInstancesInfos;
 		} else {
-			commandCountNeeded = instanceCount * materialCount;
+			commandCountNeeded = materialCount * instanceCount;
 			var computeShader : h3d.shader.InstanceIndirect = cast computeShader;
+			var prim = getPrimitive();
 			computeShader.radius = prim.getBounds().dimension() * 0.5;
-			computeShader.lodCount = lodCount;
-			computeShader.materialCount = materialCount;
+			computeShader.lodCount = getLodCount();
+			computeShader.subPartsCount = materialCount;
 		}
 
 		var alloc = hxd.impl.Allocator.get();
 		var commandCountAllocated = hxd.Math.nextPOT( commandCountNeeded );
 		if ( commandBuffer == null ) {
 			commandBuffer = alloc.allocBuffer( commandCountAllocated, INDIRECT_DRAW_ARGUMENTS_FMT, UniformReadWrite );
-			gpuCounter = new h3d.GPUCounter();
+			gpuCounter = new h3d.GPUCounter( materialCount );
 		} else if ( commandBuffer.vertices < commandCountAllocated ) {
 			alloc.disposeBuffer( commandBuffer );
 			commandBuffer = alloc.allocBuffer( commandCountAllocated, INDIRECT_DRAW_ARGUMENTS_FMT, UniformReadWrite );
 		}
-
-		materialCount = 0;
 	}
 
 	function addComputeShaders( pass : h3d.mat.Pass ) {}
@@ -279,7 +273,7 @@ class GPUMeshBatch extends MeshBatch {
 			@:privateAccess instanced.commands.data = commandBuffer.vbuf;
 			@:privateAccess instanced.commands.countBuffer = gpuCounter.buffer.vbuf;
 			@:privateAccess instanced.commands.offset = p.matIndex * instanceCount;
-			@:privateAccess instanced.commands.countOffset = 0;
+			@:privateAccess instanced.commands.countOffset = p.matIndex;
 		}
 	}
 
@@ -298,23 +292,25 @@ class GPUMeshBatch extends MeshBatch {
 		super.cleanPasses();
 
 		var alloc = hxd.impl.Allocator.get();
-		if ( matInfos != null ) {
-			alloc.disposeBuffer(matInfos);
-			matInfos = null;
+		if ( subPartsInfos != null ) {
+			alloc.disposeBuffer(subPartsInfos);
+			subPartsInfos = null;
 		}
 
-		if ( subPartsInfos != null )
-			alloc.disposeBuffer(subPartsInfos);
+		if ( subMeshesInfos != null ) {
+			alloc.disposeBuffer(subMeshesInfos);
+			subMeshesInfos = null;
+		}
 
-		if ( instanceOffsetsGpu != null )
-			alloc.disposeBuffer(instanceOffsetsGpu);
-		instanceOffsetsCpu = null;
+		if ( gpuInstancesInfos != null ) {
+			alloc.disposeBuffer(gpuInstancesInfos);
+			gpuInstancesInfos = null;
+		}
+		cpuInstancesInfos = null;
 
 		if ( commandBuffer != null )
 			alloc.disposeBuffer(commandBuffer);
 		if( gpuCounter != null )
 			gpuCounter.dispose();
-
-		emittedSubParts = null;
 	}
 }

+ 268 - 164
h3d/scene/MeshBatch.hx

@@ -7,8 +7,11 @@ enum MeshBatchFlag {
 	HasPrimitiveOffset;
 	EnableCpuLod;
 	ForceGpuUpdate;
+	EnableSubMesh;
 }
 
+typedef CpuIndirectCallBuffer = { bytes : haxe.io.Bytes, count : Int };
+
 /**
 	h3d.scene.MeshBatch allows drawing multiple meshes in a single draw call.
 	See samples/MeshBatch.hx for an example.
@@ -18,9 +21,10 @@ class MeshBatch extends MultiMaterial {
 	static var modelViewID = hxsl.Globals.allocID("global.modelView");
 	static var modelViewInverseID = hxsl.Globals.allocID("global.modelViewInverse");
 	static var previousModelViewID = hxsl.Globals.allocID("global.previousModelView");
-	static var MAX_BUFFER_ELEMENTS = 4096;
-	static var MAX_STORAGE_BUFFER_ELEMENTS = 128 * 1024 * 1024 >> 2;
 	static var BATCH_START_FMT = hxd.BufferFormat.make([{ name : "Batch_Start", type : DFloat }]);
+	inline static var MAX_BUFFER_ELEMENTS = 4096;
+	inline static var MAX_STORAGE_BUFFER_ELEMENTS = 128 * 1024 * 1024 >> 2;
+	inline static var DEFAULT_EMIT_COUNT_TIP = 128;
 
 	var instanced : h3d.prim.Instanced;
 	var dataPasses : BatchData;
@@ -42,14 +46,19 @@ class MeshBatch extends MultiMaterial {
 	 * 	If set, use this position in emitInstance() instead MeshBatch absolute position
 	**/
 	public var worldPosition : Matrix;
-	var invWorldPosition : Matrix;
 
 	/**
 		Tells the mesh batch to draw only a subpart of the primitive.
-		One primitiveSubPart per material.
 	**/
-	public var primitiveSubParts : Array<MeshBatchPart>;
-	var primitiveSubBytes : Array<haxe.io.Bytes>;
+	public var primitiveSubMeshes : Array<SubMesh>;
+	public var curSubMesh : Int = -1;
+
+	/**
+		Use one indirect call buffer per material.
+		Instances can not be culled for a specific pass yet.
+	**/
+	var cpuIndirectCallBuffers : Array<CpuIndirectCallBuffer>;
+	var gpuIndirectCallBuffers : Array<h3d.impl.InstanceBuffer>;
 
 	/**
 		If set, exact bounds will be recalculated during emitInstance (default true)
@@ -58,8 +67,8 @@ class MeshBatch extends MultiMaterial {
 
 	/**
 	 	With EnableCpuLod, set the lod of the next emitInstance.
-		Without EnableCpuLod and not using primitiveSubParts, set the lod of the whole batch.
-	 */
+		Without EnableCpuLod and not using primitiveSubMeshes, set the lod of the whole batch.
+	**/
 	public var curLod : Int = -1;
 
 	public function new( primitive, ?material, ?parent ) {
@@ -98,13 +107,25 @@ class MeshBatch extends MultiMaterial {
 		meshBatchFlags.set(ForceGpuUpdate);
 	}
 
+	/**
+	 * Emit instances using sub meshes.
+	 * To keep the implementation simple, multiple materials are only supported with a storage buffer, so it is enabled automatically when there is more than one material.
+	**/
+	public function enableSubMesh() {
+		meshBatchFlags.set(EnableSubMesh);
+		if ( materials.length > 1 )
+			meshBatchFlags.set(EnableStorageBuffer);
+	}
+
 	public function enableCpuLod() {
 		var prim = getPrimitive();
 		var lodCount = prim.lodCount();
 		if ( lodCount <= 1 )
 			return;
-		if ( partsFromPrimitive(prim) )
+		if ( partsFromPrimitive(prim) ) {
 			meshBatchFlags.set(EnableCpuLod);
+			meshBatchFlags.set(EnableStorageBuffer);
+		}
 	}
 
 	function getPrimitive() return @:privateAccess instanced.primitive;
@@ -113,6 +134,7 @@ class MeshBatch extends MultiMaterial {
 	function gpuUpdateForced() return meshBatchFlags.has(ForceGpuUpdate);
 	function getMaxElements() return storageBufferEnabled() ? MAX_STORAGE_BUFFER_ELEMENTS : MAX_BUFFER_ELEMENTS;
 	function hasPrimitiveOffset() return meshBatchFlags.has(HasPrimitiveOffset);
+	function hasSubMeshes() return meshBatchFlags.has(EnableSubMesh);
 	function cpuLodEnabled() return meshBatchFlags.has(EnableCpuLod);
 
 	inline function shouldResizeDown( currentSize : Int, minSize : Int ) : Bool {
@@ -121,14 +143,19 @@ class MeshBatch extends MultiMaterial {
 
 	public function begin( emitCountTip = -1 ) : Int {
 		instanceCount = 0;
+
+		if ( emitCountTip < 0 )
+			emitCountTip = DEFAULT_EMIT_COUNT_TIP;
+
+		if ( primitiveSubMeshes != null )
+			enableSubMesh();
+
 		instanced.initBounds();
 		if( shadersChanged ) {
 			initShadersMapping();
 			shadersChanged = false;
 		}
 
-		if( emitCountTip < 0 )
-			emitCountTip = 128;
 		var p = dataPasses;
 		var alloc = hxd.impl.Allocator.get();
 		while( p != null ) {
@@ -140,9 +167,22 @@ class MeshBatch extends MultiMaterial {
 			p = p.next;
 		}
 
+		if ( hasSubMeshes() )
+			initSubMeshResources( emitCountTip );
+
 		return emitCountTip;
 	}
 
+	/**
+		Prepares the CPU side indirect call buffers (one per material) for a new batch.
+		They are allocated on first use, sized from `emitCountTip`; on later calls only
+		the command counters are reset so the byte storage is reused.
+	**/
+	function initSubMeshResources( emitCountTip ) {
+		if ( cpuIndirectCallBuffers == null ) {
+			// Always keep room for one command: flushSubMeshResources() writes an empty
+			// command at offset 0 for materials that received no instance.
+			var instanceSize = hxd.Math.imax(emitCountTip, 1) * h3d.impl.InstanceBuffer.ELEMENT_SIZE;
+			cpuIndirectCallBuffers = [for ( _ in 0...materials.length ) { bytes : haxe.io.Bytes.alloc(instanceSize), count : 0 }];
+		} else {
+			for ( cpuIndirectCallBuffer in cpuIndirectCallBuffers )
+				cpuIndirectCallBuffer.count = 0;
+		}
+	}
+
 	function initShadersMapping() {
 		var scene = getScene();
 		if( scene == null ) return;
@@ -206,7 +246,7 @@ class MeshBatch extends MultiMaterial {
 		}
 	}
 
-	function updateHasPrimitiveOffset() meshBatchFlags.setTo(HasPrimitiveOffset, primitiveSubParts != null);
+	function updateHasPrimitiveOffset() meshBatchFlags.setTo(HasPrimitiveOffset, hasSubMeshes());
 
 	function createBatchData() {
 		return new BatchData();
@@ -258,7 +298,7 @@ class MeshBatch extends MultiMaterial {
 			for( i in prev...fmt.length )
 				curPos += fmt[i].getBytesSize() >> 2;
 		}
-		if ( curPos & 3 != 0) {
+		if ( curPos & 3 != 0 ) {
 			var paddingSize = 4 - (curPos & 3);
 			var paddingType : hxsl.Ast.Type = switch ( paddingSize ) {
 			case 1:
@@ -275,63 +315,82 @@ class MeshBatch extends MultiMaterial {
 	}
 
 	public function emitInstance() {
-		if( primitiveSubParts != null )
-			emitPrimitiveSubParts();
+		// When using sub meshes we need to fill the indirect call buffers for multi draw
+		if( hasSubMeshes() )
+			emitSubMesh(curSubMesh);

-		if(!gpuUpdateForced()){
-			if( worldPosition == null ) syncPos();
+		// Instance data can be filled from the GPU
+		if( !gpuUpdateForced() ) {
+			// Use the mesh batch abs pos if no world position has been set.
+			// It must be synced BEFORE the bounds are computed, otherwise they are built from a stale absPos.
+			if ( worldPosition == null )
+				syncPos();

-			if (primitiveSubParts == null && calcBounds)
+			if ( !hasSubMeshes() && calcBounds)
 				instanced.addInstanceBounds(worldPosition == null ? absPos : worldPosition);

-			var p = dataPasses;
-			while( p != null ) {
-				syncData(p);
-				p = p.next;
-			}
+			syncData();
 		}

 		instanceCount++;
 	}
 
-	function emitPrimitiveSubParts() {
+	function getSubMesh( subMeshIndex : Int ) : SubMesh {
+		return primitiveSubMeshes[subMeshIndex];
+	}
+
+	/**
+		Appends one indirect draw command per sub part of the given sub mesh to the
+		CPU indirect call buffer of the sub part material, and adds the sub mesh
+		bounds to the batch bounds when `calcBounds` is set.
+		Throws if the buffers were not initialized by begin() or if `subMeshIndex` is out of range.
+	**/
+	function emitSubMesh(subMeshIndex : Int) {
+		if ( cpuIndirectCallBuffers == null )
+			throw "Something went wrong during the initialization";
+		if ( subMeshIndex < 0 || subMeshIndex >= primitiveSubMeshes.length )
+			throw "Invalid subMeshIndex";
+
+		var subMesh = getSubMesh(subMeshIndex);
+		var subParts = subMesh.subParts;
 		if(calcBounds) @:privateAccess {
-			for ( primitiveSubPart in primitiveSubParts ) {
-				instanced.tmpBounds.load(primitiveSubPart.bounds);
-				instanced.tmpBounds.transform(worldPosition == null ? absPos : worldPosition);
-				instanced.bounds.add(instanced.tmpBounds);
-			}
+			instanced.tmpBounds.load(subMesh.bounds);
+			instanced.tmpBounds.transform(worldPosition == null ? absPos : worldPosition);
+			instanced.bounds.add(instanced.tmpBounds);
 		}

-		if( primitiveSubBytes == null ) {
-			if ( primitiveSubParts.length != materials.length )
-				throw "Instancing using primitive sub parts must match material count";
-			primitiveSubBytes = [for ( i in 0...primitiveSubParts.length ) haxe.io.Bytes.alloc(128)];
-			instanced.commands = null;
-		}
 		var instanceSize = h3d.impl.InstanceBuffer.ELEMENT_SIZE;
-		for ( i in 0...primitiveSubBytes.length ) {
-			if( primitiveSubBytes[i].length < (instanceCount+1) * instanceSize ) {
-				var next = haxe.io.Bytes.alloc(Std.int(primitiveSubBytes[i].length*3/2));
-				next.blit(0, primitiveSubBytes[i], 0, instanceCount * instanceSize);
-				primitiveSubBytes[i] = next;
-			}
-		}
-		var p = instanceCount * instanceSize;
-		for ( mid => psBytes in primitiveSubBytes ) {
-			var primitiveSubPart = primitiveSubParts[mid];
-			var indexCount = primitiveSubPart.indexCount;
-			var indexStart = primitiveSubPart.indexStart;
+		for ( subPart in subParts ) {
+			var indexCount = subPart.indexCount;
+			var indexStart = subPart.indexStart;
 			if ( curLod >= 0 && cpuLodEnabled() ) {
-				indexStart = primitiveSubPart.lodIndexStart[curLod];
-				indexCount = primitiveSubPart.lodIndexCount[curLod];
+				indexStart = subPart.lodIndexStart[curLod];
+				indexCount = subPart.lodIndexCount[curLod];
+			}
+
+			if ( indexCount == 0 && storageBufferEnabled() )
+				continue;
+
+			var matIndex = subPart.matIndex;
+			var indirectCallBuffer = cpuIndirectCallBuffers[matIndex];
+
+			// Resize
+			var count = indirectCallBuffer.count++;
+			var pos = count * instanceSize;
+			var minIndirectCallBufferSize = pos + instanceSize;
+			if ( indirectCallBuffer.bytes.length < minIndirectCallBufferSize ) {
+				// Growing by x1.5 alone can stay below the required size for tiny buffers
+				// (0 -> 0 bytes, 20 -> 30 bytes), so never allocate less than what is needed.
+				var next = haxe.io.Bytes.alloc(hxd.Math.imax(minIndirectCallBufferSize, Std.int((indirectCallBuffer.bytes.length * 3 / 2))));
+				next.blit(0, indirectCallBuffer.bytes, 0, pos);
+				indirectCallBuffer.bytes = next;
 			}
-			psBytes.setInt32(p, indexCount);
-			psBytes.setInt32(p + 4, 1);
-			psBytes.setInt32(p + 8, indexStart);
-			psBytes.setInt32(p + 12, primitiveSubPart.baseVertex);
-			psBytes.setInt32(p + 16, instanceCount);
+
+			// Emit
+			var bytes = indirectCallBuffer.bytes;
+			bytes.setInt32(pos, indexCount);
+			bytes.setInt32(pos + 4, 1);
+			bytes.setInt32(pos + 8, indexStart);
+			bytes.setInt32(pos + 12, 0);
+			bytes.setInt32(pos + 16, instanceCount);
+
+			cpuIndirectCallBuffers[matIndex] = indirectCallBuffer;
 		}
+
+		// TODO: clean this up, the instanced commands are currently reset on every emit.
+		instanced.commands = null;
 	}
 
 	override function sync(ctx:RenderContext) {
@@ -340,6 +399,39 @@ class MeshBatch extends MultiMaterial {
 		flush();
 	}
 
+	function flushSubMeshResources() {
+		if ( !storageBufferEnabled() )
+			throw "Storage buffer must be set to use per material indirect call buffers";
+
+		if ( gpuIndirectCallBuffers == null )
+			gpuIndirectCallBuffers = [for ( i in 0...materials.length ) new h3d.impl.InstanceBuffer()];
+
+		for ( matIndex in 0...materials.length ) {
+			var cpuIndirectCallBuffer = cpuIndirectCallBuffers[matIndex];
+			var gpuIndirectCallBuffer = gpuIndirectCallBuffers[matIndex];
+
+			// Upload indirect call buffer
+			var count = cpuIndirectCallBuffer.count;
+			if ( needUpload || gpuIndirectCallBuffer.commandCount != count ) {
+				var bytes = cpuIndirectCallBuffer.bytes;
+				if ( count == 0 ) {
+					count = 1;
+					bytes.setInt32(0,  0);
+					bytes.setInt32(4,  0);
+					bytes.setInt32(8,  0);
+					bytes.setInt32(12, 0);
+					bytes.setInt32(16, 0);
+				}
+
+				var gpuIndirectCallMaxCount = gpuIndirectCallBuffer.maxCommandCount;
+				if ( shouldResizeDown(gpuIndirectCallMaxCount, count) || count > gpuIndirectCallMaxCount )
+					gpuIndirectCallBuffer.allocFromBytes(count, bytes);
+				else
+					gpuIndirectCallBuffer.uploadBytes(count, bytes);
+			}
+		}
+	}
+
 	public function flush() {
 		var p = dataPasses;
 		var alloc = hxd.impl.Allocator.get();
@@ -347,53 +439,60 @@ class MeshBatch extends MultiMaterial {
 		var prim = getPrimitive();
 		var instanceSize = h3d.impl.InstanceBuffer.ELEMENT_SIZE;
 
+		if ( hasSubMeshes() && storageBufferEnabled() )
+			flushSubMeshResources();
+
+		// Allocate and upload GPU buffers for each data passes
 		while( p != null ) {
 			var index = 0;
 			var start = 0;
 			while( start < instanceCount ) {
 				var upload = needUpload;
 				var buf = p.buffers[index];
-				var count = instanceCount - start;
-				if( count > p.maxInstance )
-					count = p.maxInstance;
+				if( instanceCount > p.maxInstance && storageBufferEnabled() )
+					throw "Maximum instance count reached";
 
+				var count = hxd.Math.imin(instanceCount - start, p.maxInstance);
 				var maxVertexCount = gpuUpdateEnabled() ? p.maxInstance : getMaxElements();
 				var vertexCount = Std.int( count * (( 4 * p.paramsCount ) / p.bufferFormat.stride) );
 				var vertexCountAllocated = #if js Std.int( MAX_BUFFER_ELEMENTS * 4 / p.bufferFormat.stride ) #else hxd.Math.imin( hxd.Math.nextPOT( vertexCount ), maxVertexCount ) #end;
 
+				// Lazy instance data buffer allocation
 				if( buf == null || buf.isDisposed() || buf.vertices < vertexCountAllocated ) {
 					var bufferFlags : hxd.impl.Allocator.BufferFlags = storageBufferEnabled() ? UniformReadWrite : UniformDynamic;
-
 					if ( buf != null )
 						alloc.disposeBuffer(buf);
-					buf = alloc.allocBuffer( vertexCountAllocated, p.bufferFormat,bufferFlags );
+					buf = alloc.allocBuffer( vertexCountAllocated, p.bufferFormat, bufferFlags );
 					p.buffers[index] = buf;
 					upload = true;
 				}
+
+				// Upload instance data buffer
 				if( upload && !gpuUpdateForced())
 					buf.uploadFloats(p.data, start * p.paramsCount * 4, vertexCount);
-				if( primitiveSubBytes != null ) {
-					if( p.instanceBuffers == null )
-						p.instanceBuffers = [];
-					var ibuf = p.instanceBuffers[index];
-					if ( ibuf == null )
-						ibuf = new h3d.impl.InstanceBuffer();
-					var ibufUpload = needUpload || ibuf.commandCount != count;
-					if ( ibufUpload ) {
-						var psBytes = primitiveSubBytes[p.matIndex];
+
+				if( hasSubMeshes() && !storageBufferEnabled() ) {
+					if( p.indirectCallBuffers == null )
+						p.indirectCallBuffers = [];
+					var indirectCallBuffer = p.indirectCallBuffers[index];
+					if ( indirectCallBuffer == null )
+						indirectCallBuffer = new h3d.impl.InstanceBuffer();
+					var upload = needUpload || indirectCallBuffer.commandCount != count;
+					if ( upload ) {
+						var bytes = cpuIndirectCallBuffers[p.matIndex].bytes;
 						if ( start > 0 && count < instanceCount ) {
-							psBytes = psBytes.sub(start*instanceSize,count*instanceSize);
+							bytes = bytes.sub(start*instanceSize,count*instanceSize);
 							for( i in 0...count )
-								psBytes.setInt32(i*instanceSize+16, i);
+								bytes.setInt32(i*instanceSize+16, i);
 						}
 
-						var ibufMaxCommandCount = ibuf.maxCommandCount;
-						if ( shouldResizeDown(ibufMaxCommandCount, count) || count > ibufMaxCommandCount) {
-							ibuf.allocFromBytes(count, psBytes);
+						var maxCommandCount = indirectCallBuffer.maxCommandCount;
+						if ( shouldResizeDown(maxCommandCount, count) || count > maxCommandCount) {
+							indirectCallBuffer.allocFromBytes(count, bytes);
 						} else {
-							ibuf.uploadBytes(count, psBytes);
+							indirectCallBuffer.uploadBytes(count, bytes);
 						}
-						p.instanceBuffers[index] = ibuf;
+						p.indirectCallBuffers[index] = indirectCallBuffer;
 					}
 				}
 
@@ -409,18 +508,18 @@ class MeshBatch extends MultiMaterial {
 				alloc.disposeBuffer( p.buffers.pop() );
 			p = p.next;
 		}
-		if( hasPrimitiveOffset() ) {
+		if ( hasPrimitiveOffset() ) {
 			var offsets = prim.resolveBuffer("Batch_Start");
-			if( offsets == null || offsets.vertices < instanceCount || offsets.isDisposed() ) {
-				if( offsets != null ) {
+			if ( offsets == null || offsets.vertices < instanceCount || offsets.isDisposed() ) {
+				if ( offsets != null ) {
 					offsets.dispose();
 					prim.removeBuffer(offsets);
 				}
 				var tmp = haxe.io.Bytes.alloc(4 * instanceCount);
-				for( i in 0...instanceCount )
+				for ( i in 0...instanceCount )
 					tmp.setFloat(i<<2, i);
 				offsets = new h3d.Buffer(instanceCount, BATCH_START_FMT);
-				offsets.uploadBytes(tmp,0,instanceCount);
+				offsets.uploadBytes(tmp, 0, instanceCount);
 				prim.addBuffer(offsets);
 			}
 		}
@@ -431,63 +530,61 @@ class MeshBatch extends MultiMaterial {
 
 	function onFlushPass(p : BatchData) {}
 
-	function syncData( batch : BatchData ) {
-		var startPos = batch.paramsCount * instanceCount << 2;
-		// in case we are bigger than emitCountTip
-		if( startPos + (batch.paramsCount<<2) > batch.data.length )
-			batch.data.grow(batch.data.length << 1);
+	function syncData() {
+		var batch = dataPasses;
+		var invWorldPosition = null;
+		var worldPosition = worldPosition ?? absPos;
+		while( batch != null ) {
+			var startPos = batch.paramsCount * instanceCount << 2;
+			// in case we are bigger than emitCountTip
+			if( startPos + (batch.paramsCount << 2) > batch.data.length )
+				batch.data.grow(batch.data.length << 1);
 
-		var p = batch.params;
-		var buf = batch.data;
-		var shaders = batch.shaders;
+			var p = batch.params;
+			var buf = batch.data;
+			var shaders = batch.shaders;
 
-		var calcInv = false;
-		while( p != null ) {
-			var bufLoader = new hxd.FloatBufferLoader(buf, startPos + p.pos);
-			if( p.perObjectGlobal != null ) {
-				if ( p.perObjectGlobal.gid == modelViewID ) {
-					bufLoader.loadMatrix(worldPosition != null ? worldPosition : absPos);
-				} else if ( p.perObjectGlobal.gid == modelViewInverseID ) {
-					if( worldPosition == null )
-						bufLoader.loadMatrix(getInvPos());
-					else {
-						if( !calcInv ) {
-							calcInv = true;
-							if( invWorldPosition == null ) invWorldPosition = new h3d.Matrix();
-							invWorldPosition.initInverse(worldPosition);
-						}
+			while( p != null ) {
+				var bufLoader = new hxd.FloatBufferLoader(buf, startPos + p.pos);
+				if( p.perObjectGlobal != null ) {
+					if ( p.perObjectGlobal.gid == modelViewID ) {
+						bufLoader.loadMatrix(worldPosition);
+					} else if ( p.perObjectGlobal.gid == modelViewInverseID ) {
+						if ( invWorldPosition == null )
+							invWorldPosition = worldPosition == null ? getInvPos() : worldPosition.getInverse();
 						bufLoader.loadMatrix(invWorldPosition);
+					} else if ( p.perObjectGlobal.gid == previousModelViewID )
+						bufLoader.loadMatrix(worldPosition);
+					else
+						throw "Unsupported global param " + p.perObjectGlobal.path;
+					p = p.next;
+					continue;
+				}
+				var curShader = shaders[p.instance];
+				switch( p.type ) {
+				case TVec(size, _):
+					switch( size ) {
+					case 2:
+						var v : h3d.Vector = curShader.getParamValue(p.index);
+						bufLoader.loadVec2(v);
+					case 3:
+						var v : h3d.Vector = curShader.getParamValue(p.index);
+						bufLoader.loadVec3(v);
+					case 4:
+						var v : h3d.Vector4 = curShader.getParamValue(p.index);
+						bufLoader.loadVec4(v);
 					}
-				} else if ( p.perObjectGlobal.gid == previousModelViewID )
-					bufLoader.loadMatrix(worldPosition != null ? worldPosition : absPos );
-				else
-					throw "Unsupported global param "+p.perObjectGlobal.path;
-				p = p.next;
-				continue;
-			}
-			var curShader = shaders[p.instance];
-			switch( p.type ) {
-			case TVec(size, _):
-				switch( size ) {
-				case 2:
-					var v : h3d.Vector = curShader.getParamValue(p.index);
-					bufLoader.loadVec2(v);
-				case 3:
-					var v : h3d.Vector = curShader.getParamValue(p.index);
-					bufLoader.loadVec3(v);
-				case 4:
-					var v : h3d.Vector4 = curShader.getParamValue(p.index);
-					bufLoader.loadVec4(v);
+				case TFloat:
+					bufLoader.loadFloat(curShader.getParamFloatValue(p.index));
+				case TMat4:
+					var m : h3d.Matrix = curShader.getParamValue(p.index);
+					bufLoader.loadMatrix(m);
+				default:
+					throw "Unsupported batch type "+p.type;
 				}
-			case TFloat:
-				bufLoader.loadFloat(curShader.getParamFloatValue(p.index));
-			case TMat4:
-				var m : h3d.Matrix = curShader.getParamValue(p.index);
-				bufLoader.loadMatrix(m);
-			default:
-				throw "Unsupported batch type "+p.type;
+				p = p.next;
 			}
-			p = p.next;
+			batch = batch.next;
 		}
 		needUpload = true;
 	}
@@ -507,7 +604,7 @@ class MeshBatch extends MultiMaterial {
 	}
 
 	function emitPass(ctx : RenderContext, p : BatchData) {
-		for( i => buf in p.buffers )
+		for( i in 0...p.buffers.length )
 			ctx.emitPass(p.pass, this).index = i | (p.matIndex << 16);
 	}
 
@@ -522,10 +619,10 @@ class MeshBatch extends MultiMaterial {
 				else
 					p.shader.Batch_Buffer = p.buffers[bufferIndex];
 
-				if( p.instanceBuffers == null )
+				if( cpuIndirectCallBuffers == null )
 					setPassCommand(p, bufferIndex);
 				else
-					instanced.commands = p.instanceBuffers[bufferIndex];
+					instanced.commands = storageBufferEnabled() ? gpuIndirectCallBuffers[p.matIndex] : p.indirectCallBuffers[bufferIndex];
 
 				break;
 			}
@@ -547,20 +644,25 @@ class MeshBatch extends MultiMaterial {
 		var hmd = Std.downcast(prim, h3d.prim.HMDModel);
 		if ( hmd == null )
 			return false;
-		if ( primitiveSubParts == null ) {
-			primitiveSubParts = [];
+		if ( primitiveSubMeshes == null ) {
+			var subMesh = new SubMesh();
+			var lodCount = hmd.lodCount();
+			subMesh.bounds = hmd.getBounds();
+			subMesh.lodCount = lodCount;
+			subMesh.lodConfig = hmd.getLodConfig();
+			var subParts = [];
 			for ( m in 0...materials.length ) {
-				var primitiveSubPart = new MeshBatchPart();
+				var primitiveSubPart = new SubPart();
 				primitiveSubPart.indexStart = hmd.getMaterialIndexStart(m, 0);
 				primitiveSubPart.indexCount = hmd.getMaterialIndexCount(m, 0);
-				primitiveSubPart.lodIndexCount = [for (i in 0...hmd.lodCount() ) hmd.getMaterialIndexCount(m, i)];
-				primitiveSubPart.lodIndexStart = [for (i in 0...hmd.lodCount() ) hmd.getMaterialIndexStart(m, i) ];
-				primitiveSubPart.lodConfig = hmd.getLodConfig();
-				primitiveSubPart.baseVertex = 0;
-				primitiveSubPart.bounds = hmd.getBounds();
-
-				primitiveSubParts.push(primitiveSubPart);
+				primitiveSubPart.lodIndexStart = [for (i in 0...lodCount) hmd.getMaterialIndexStart(m, i)];
+				primitiveSubPart.lodIndexCount = [for (i in 0...lodCount) hmd.getMaterialIndexCount(m, i)];
+				primitiveSubPart.matIndex = m;
+				subParts.push(primitiveSubPart);
 			}
+			subMesh.subParts = subParts;
+			primitiveSubMeshes = [subMesh];
+			curSubMesh = 0;
 		}
 		return true;
 	}
@@ -606,7 +708,13 @@ class MeshBatch extends MultiMaterial {
 		if( instanced.commands != null )
 			instanced.commands.dispose();
 
-		primitiveSubBytes = null;
+		cpuIndirectCallBuffers = null;
+		if ( gpuIndirectCallBuffers != null ) {
+			for ( gpuIndirectCallBuffer in gpuIndirectCallBuffers )
+				gpuIndirectCallBuffer.dispose();
+			gpuIndirectCallBuffers = null;
+		}
+
 		shadersChanged = true;
 	}
 }
@@ -618,7 +726,7 @@ class BatchData {
 	public var matIndex : Int;
 	public var indexCount : Int;
 	public var indexStart : Int;
-	public var instanceBuffers : Array<h3d.impl.InstanceBuffer>;
+	public var indirectCallBuffers : Array<h3d.impl.InstanceBuffer>;
 	public var buffers : Array<h3d.Buffer> = [];
 	public var bufferFormat : hxd.BufferFormat;
 	public var data : hxd.FloatBuffer;
@@ -637,35 +745,31 @@ class BatchData {
 		pass.removeShader(shader);
 		for( b in buffers )
 			alloc.disposeBuffer(b);
+		buffers.resize(0);
 
-		if( instanceBuffers != null ) {
-			for( b in instanceBuffers )
+		if( indirectCallBuffers != null ) {
+			for( b in indirectCallBuffers )
 				b.dispose();
 		}
 		alloc.disposeFloats(data);
 	}
 }
 
-class MeshBatchPart {
+class SubMesh {
+	public var subParts : Array<SubPart>;
+	public var bounds : h3d.col.Bounds;
+	public var lodCount : Int;
+	public var lodConfig : Array<Float>;
+	public function new() {
+	}
+}
+
+class SubPart {
 	public var indexStart : Int;
 	public var indexCount : Int;
 	public var lodIndexStart : Array<Int>;
 	public var lodIndexCount : Array<Int>;
-	public var lodConfig : Array<Float>;
-	public var baseVertex : Int;
-	public var bounds : h3d.col.Bounds;
+	public var matIndex : Int = 0;
 	public function new() {
 	}
-
-	public function clone() {
-		var cl = new MeshBatchPart();
-		cl.indexStart = indexStart;
-		cl.indexCount = indexCount;
-		cl.lodIndexStart = lodIndexStart;
-		cl.lodIndexCount = lodIndexCount;
-		cl.lodConfig = lodConfig;
-		cl.baseVertex = baseVertex;
-		cl.bounds = bounds;
-		return cl;
-	}
 }

+ 123 - 96
h3d/shader/InstanceIndirect.hx

@@ -2,7 +2,6 @@ package h3d.shader;
 
 class InstanceIndirectBase extends hxsl.Shader {
 	static var SRC = {
-
 		@global var camera : {
 			var position : Vec3;
 		}
@@ -13,10 +12,8 @@ class InstanceIndirectBase extends hxsl.Shader {
 		@param var instanceData : StoragePartialBuffer<{ modelView : Mat4 }>;
 		@param var instanceCount : Int;
 
-		// 16 by default because 16 * 4 floats = 256 bytes and cbuffer are aligned to 256 bytes
-		@const var MAX_MATERIAL_COUNT : Int = 16;
-		// x : indexCount, y : startIndex, z : minScreenRatio, w : in first lod => minScreenRatioCulling
-		@param var matInfos : Buffer<Vec4, MAX_MATERIAL_COUNT>;
+		// x : indexCount, y : indexStart, z : minScreenRatio, w : materialIndex
+		@param var subPartsInfos : StorageBuffer<Vec4>;
 
 		@const var ENABLE_CULLING : Bool;
 		@param var frustum : Buffer<Vec4, 6>;
@@ -26,8 +23,6 @@ class InstanceIndirectBase extends hxsl.Shader {
 		@const var ENABLE_DISTANCE_CLIPPING : Bool;
 		@param var maxDistance : Float = -1;
 
-		var matID : Int = 0;
-
 		var modelView : Mat4;
 		var invocID : Int;
 		function __init__() {
@@ -38,6 +33,24 @@ class InstanceIndirectBase extends hxsl.Shader {
 			modelView = instanceData[invocID].modelView;
 		}
 
+		function init() {}
+
+		function getRadius() : Float {
+			return 0.0;
+		}
+
+		function getLodCount() : Int {
+			return 1;
+		}
+
+		function getSubPartsStart() : Int {
+			return 0;
+		}
+
+		function getSubPartsCount() : Int {
+			return 1;
+		}
+
 		function emitInstance(instanceID : Int, indexCount : Int, instanceCount : Int, startIndex : Int, startVertex : Int, baseInstance : Int ) {
 			var instancePos = instanceID * 5;
 			commandBuffer[instancePos + 0] = indexCount;
@@ -67,16 +80,21 @@ class InstanceIndirectBase extends hxsl.Shader {
 			return screenRatio < minScreenRatioCulling;
 		}
 
-		function getLodCount() : Int {
-			return 0;
+		function getSubPartInfos( subPartIndex : Int, lod : Int ) : Vec4 {
+			var pos = getSubPartsStart() + subPartIndex * getLodCount() + lod;
+			return subPartsInfos[pos];
+		}
+
+		function getMaterialCommandStart( materialIndex : Int ) : Int {
+			return materialIndex * instanceCount;
 		}
 
-		function getLodScreenRatio( lod : Int ) : Float {
-			return matInfos[lod + matID].z;
+		function getLodScreenRatio(lod : Int) : Float {
+			return getSubPartInfos(0, lod).z;
 		}
 
 		function getMinScreenRatio() : Float {
-			return ENABLE_LOD ? matInfos[matID].w : 0.0;
+			return ENABLE_LOD ? getSubPartInfos(0, getLodCount() - 1).z : 0.0;
 		}
 
 		function computeScreenRatio( distToCam : Float, radius : Float ) : Float {
@@ -84,9 +102,10 @@ class InstanceIndirectBase extends hxsl.Shader {
 			return screenRatio * screenRatio;
 		}
 
-		function selectLod( screenRatio : Float, lodCount : Int ) : Int {
+		function selectLod( screenRatio : Float ) : Int {
 			var lod : Int = 0;
 			if ( ENABLE_LOD ) {
+				var lodCount = getLodCount();
 				for ( i in 0...lodCount ) {
 					var minScreenRatio = getLodScreenRatio(i);
 					if ( screenRatio > minScreenRatio )
@@ -97,63 +116,48 @@ class InstanceIndirectBase extends hxsl.Shader {
 			}
 			return lod;
 		}
-	}
-}
-
-class SubPartInstanceIndirect extends InstanceIndirectBase {
-
-	static var SRC = {
-		// n : material offset, n + 1 : subPart ID
-		@param var instanceOffsets: StorageBuffer<Int>;
-		@const var MAX_SUB_PART_BUFFER_ELEMENT_COUNT : Int = 16;
-		@param var subPartCount : Int;
-		// x : lodCount, y : radius,
-		@param var subPartInfos : Buffer<Vec4, MAX_SUB_PART_BUFFER_ELEMENT_COUNT>;
-
-		var lodCount = 0;
-		function getLodCount() : Int {
-			return lodCount;
-		}
 
 		function main() {
 			if ( invocID < instanceCount ) {
+				init();
+
 				var pos = vec3(modelView[0].w, modelView[1].w, modelView[2].w);
 				var vScale = abs(vec3(1) * modelView.mat3x4() - pos);
 				var scaledRadius = max(max(vScale.x, vScale.y), vScale.z);
 				var toCam = camera.position - pos.xyz;
 				var distToCam = length(toCam);
 
-				var id = invocID * 2;
-				matID = instanceOffsets[id];
-				var subPartID = instanceOffsets[id + 1];
-				var subPartInfo = subPartInfos[subPartID / 2];
-
-				var packedID = (subPartID & 1) << 1;
-				lodCount = int(subPartInfo[packedID]);
-				var radius = subPartInfo[packedID + 1];
-
-				scaledRadius *= radius;
+				scaledRadius *= getRadius();
 				var culled = dot(scaledRadius, scaledRadius) < 1e-6;
 
 				culled = culled || frustumCulling(pos, scaledRadius);
 				culled = culled || distanceClipping(distToCam, scaledRadius);
-				var computeScreenRatio = computeScreenRatio(distToCam, scaledRadius);
-				culled = culled || screenRatioCulling(computeScreenRatio);
+				var screenRatio = computeScreenRatio(distToCam, scaledRadius);
+				culled = culled || screenRatioCulling(screenRatio);
 
+				var subPartsCount = getSubPartsCount();
 				if ( ENABLE_COUNT_BUFFER ) {
 					if ( !culled ) {
-						var id = atomicAdd( countBuffer, 0, 1);
-						var lod = selectLod(computeScreenRatio, lodCount);
-						var matInfo = ivec4(matInfos[lod + matID]);
-						emitInstance( id, matInfo.x, 1, matInfo.y, 0, invocID );
+						var lod = selectLod(screenRatio);
+						for ( subPartIndex in 0...subPartsCount ) {
+							var subPartInfo = getSubPartInfos(subPartIndex, lod);
+							var materialIndex = int(subPartInfo.w);
+							var id = atomicAdd( countBuffer, materialIndex, 1 );
+							var materialCommandStart = getMaterialCommandStart(materialIndex);
+							emitInstance( materialCommandStart + id, int(subPartInfo.x), 1, int(subPartInfo.y), 0, invocID );
+						}
 					}
 				} else {
-					if ( !culled ) {
-						var lod = selectLod(computeScreenRatio, lodCount);
-						var matInfo = ivec4(matInfos[lod + matID]);
-						emitInstance( invocID, matInfo.x, 1, matInfo.y, 0, invocID );
-					} else {
-						emitInstance( invocID, 0, 0, 0, 0, 0 );
+					var lod = selectLod(screenRatio);
+					for ( subPartIndex in 0...subPartsCount ) {
+						var subPartInfo = getSubPartInfos(subPartIndex, lod);
+						var materialIndex = int(subPartInfo.w);
+						var id = atomicAdd( countBuffer, materialIndex, 1 );
+						var materialCommandStart = getMaterialCommandStart(materialIndex);
+						if ( !culled )
+							emitInstance( materialCommandStart + id, int(subPartInfo.x), 1, int(subPartInfo.y), 0, invocID );
+						else
+							emitInstance( materialCommandStart + id, 0, 0, 0, 0, 0 );
 					}
 				}
 			}
@@ -161,60 +165,83 @@ class SubPartInstanceIndirect extends InstanceIndirectBase {
 	}
 }
 
+class SubPartInstanceIndirect extends InstanceIndirectBase {
+	static var SRC = {
+		// n : subMesh index
+		@param var instancesInfos: StorageBuffer<Int>;
+		// x : radius, y : lodCount, z : subPartsStart, w : subPartsCount
+		@param var subMeshesInfos : StorageBuffer<Vec4>;
+
+		@const(32) var MATERIAL_COUNT : Int = 1;
+		@param var materialCommandStart : Array<Vec4, MATERIAL_COUNT>;
+
+		var radius : Float;
+		var lodCount : Int;
+		var subPartsStart : Int;
+		var subPartsCount : Int;
+
+		function getRadius() : Float {
+			return radius;
+		}
+
+		function getLodCount() : Int {
+			return lodCount;
+		}
+
+		function getSubPartsStart()  : Int{
+			return subPartsStart;
+		}
+
+		function getSubPartsCount() : Int {
+			return subPartsCount;
+		}
+
+		function getMaterialCommandStart( materialIndex : Int ) : Int {
+			return int(materialCommandStart[materialIndex].x);
+		}
+
+		function init() {
+			var instanceID = invocID;
+			var subMeshIndex = instancesInfos[instanceID];
+			var subMeshInfos = subMeshesInfos[subMeshIndex];
+			radius = subMeshInfos.x;
+			lodCount = int(subMeshInfos.y);
+			subPartsStart = int(subMeshInfos.z);
+			subPartsCount = int(subMeshInfos.w);
+		}
+	}
+}
+
 class InstanceIndirect extends InstanceIndirectBase {
 	static var SRC = {
 		@param var radius : Float;
-		@param var materialCount : Int;
 		@param var lodCount : Int = 1;
+		@param var subPartsCount : Int;
 
-		function getLodCount() : Int {
-			return lodCount;
+		var fetchedRadius : Float;
+		var fetchedLodCount : Int;
+		var fetchedSubPartsCount : Int;
+
+		function init() {
+			fetchedRadius = radius;
+			fetchedLodCount = lodCount;
+			fetchedSubPartsCount = subPartsCount;
 		}
 
-		function main() {
-			if ( invocID < instanceCount ) {
-				var pos = vec3(modelView[0].w, modelView[1].w, modelView[2].w);
-				var vScale = abs(vec3(1) * modelView.mat3x4() - pos);
-				var scaledRadius = max(max(vScale.x, vScale.y), vScale.z);
-				var toCam = camera.position - pos.xyz;
-				var distToCam = length(toCam);
+		function getRadius() : Float {
+			return fetchedRadius;
+		}
 
-				scaledRadius *= radius;
-				var culled = dot(scaledRadius, scaledRadius) < 1e-6;
+		function getLodCount() : Int {
+			return fetchedLodCount;
+		}
 
-				culled = culled || frustumCulling(pos, scaledRadius);
-				culled = culled || distanceClipping(distToCam, scaledRadius);
-				var computeScreenRatio = computeScreenRatio(distToCam, scaledRadius);
-				culled = culled || screenRatioCulling(computeScreenRatio);
+		function getSubPartsStart() : Int {
+			return 0;
+		}
 
-				if ( ENABLE_COUNT_BUFFER ) {
-					if ( !culled ) {
-						var id = atomicAdd( countBuffer, 0, 1);
-						for ( i in 0...materialCount ) {
-							matID = i * lodCount;
-							var lod = selectLod(computeScreenRatio, lodCount);
-							var matInfo = ivec4(matInfos[lod + matID]);
-							var instanceID = id + i * instanceCount;
-							emitInstance( instanceID, matInfo.x, 1, matInfo.y, 0, invocID );
-						}
-					}
-				} else {
-					if ( !culled ) {
-						for ( i in 0...materialCount ) {
-							matID = i * lodCount;
-							var lod = selectLod(computeScreenRatio, lodCount);
-							var matInfo = ivec4(matInfos[lod + matID]);
-							var instanceID = invocID + i * instanceCount;
-							emitInstance( instanceID, matInfo.x, 1, matInfo.y, 0, invocID );
-						}
-					} else {
-						for ( i in 0...materialCount ) {
-							var instanceID = invocID + i * instanceCount;
-							emitInstance( instanceID, 0, 0, 0, 0, 0 );
-						}
-					}
-				}
-			}
+		function getSubPartsCount() : Int {
+			return fetchedSubPartsCount;
 		}
 	}
 }