3 天之前 · 894002219e
--- a/h3d/GPUCounter.hx
+++ b/h3d/GPUCounter.hx
@@ -3,11 +3,13 @@ package h3d;
 
				 class GPUCounter {
			
 
				 	public var buffer(default, null) : h3d.Buffer;
			
 
				 	var accessor : haxe.io.Bytes;
			
 
				+	var size : Int;
			
 
				 
			
 
				-	public function new() {
			
 
				+	public function new( size : Int = 1 ) {
			
 
				+		this.size = size;
			
 
				 		var alloc = hxd.impl.Allocator.get();
			
 
				-		buffer = alloc.allocBuffer(1,hxd.BufferFormat.INDEX32, UniformReadWrite);
			
 
				-		accessor = haxe.io.Bytes.alloc(4);
			
 
				+		buffer = alloc.allocBuffer(size, hxd.BufferFormat.INDEX32, UniformReadWrite);
			
 
				+		accessor = haxe.io.Bytes.alloc(size << 2);
			
 
				 	}
			
 
				 
			
 
				 	public function dispose(){
			
@@ -15,13 +17,14 @@ class GPUCounter {
 
				 		alloc.disposeBuffer(buffer);
			
 
				 	}
			
 
				 
			
 
				-	public function get() : Int {
			
 
				-		buffer.readBytes(accessor, 0, 1);
			
 
				+	public function get( index : Int = 0 ) : Int {
			
 
				+		buffer.readBytes(accessor, 0, 1, index);
			
 
				 		return accessor.getInt32(0);
			
 
				 	}
			
 
				 
			
 
				-	public function reset(){
			
 
				-		accessor.setInt32(0, 0);
			
 
				-		buffer.uploadBytes(accessor, 0,1);
			
 
				+	public function reset() {
			
 
				+		for ( i in 0...size )
			
 
				+			accessor.setInt32(i << 2, 0);
			
 
				+		buffer.uploadBytes(accessor, 0, size);
			
 
				 	}
			
 
				 }
			
--- a/h3d/scene/GPUMeshBatch.hx
+++ b/h3d/scene/GPUMeshBatch.hx
@@ -1,21 +1,23 @@
 
				 package h3d.scene;
			
 
				 
			
 
				 import h3d.scene.MeshBatch.BatchData;
			
 
				-import h3d.scene.MeshBatch.MeshBatchPart;
			
 
				 
			
 
				 class GPUMeshBatch extends MeshBatch {
			
 
				 
			
 
				 	static var INDIRECT_DRAW_ARGUMENTS_FMT = hxd.BufferFormat.make([{ name : "", type : DVec4 }, { name : "", type : DFloat }]);
			
 
				-	static var INSTANCE_OFFSETS_FMT = hxd.BufferFormat.make([{ name : "", type : DFloat }]);
			
 
				-
			
 
				-	var matInfos : h3d.Buffer;
			
 
				-	var emittedSubParts : Array<MeshBatch.MeshBatchPart>;
			
 
				-	var currentSubParts : Int;
			
 
				-	var currentMaterialOffset : Int;
			
 
				-	var instanceOffsetsCpu : haxe.io.Bytes;
			
 
				-	var instanceOffsetsGpu : h3d.Buffer;
			
 
				+	static var INSTANCES_INFOS_FMT = hxd.BufferFormat.make([{ name : "", type : DFloat }]);
			
 
				+	inline static var INSTANCES_INFOS_ELEMENT_COUNT = 1;
			
 
				+	inline static var SUB_MESHES_INFOS_ELEMENT_COUNT = 4;
			
 
				+	inline static var SUB_PARTS_INFOS_ELEMENT_COUNT = 4;
			
 
				+
			
 
				+	var cpuInstancesInfos : haxe.io.Bytes;
			
 
				+	var gpuInstancesInfos : h3d.Buffer;
			
 
				+
			
 
				+	var subPartsEmitted : Int = 0;
			
 
				+	var materialsEmitted : Array<Float>;
			
 
				+
			
 
				+	var subMeshesInfos : h3d.Buffer;
			
 
				 	var subPartsInfos : h3d.Buffer;
			
 
				-	var materialCount : Int;
			
 
				 
			
 
				 	public var computePass : h3d.mat.Pass;
			
 
				 	public var commandBuffer : h3d.Buffer;
			
@@ -44,7 +46,7 @@ class GPUMeshBatch extends MeshBatch {
 
				 	 * Has effects only if a lod is available in the primitive.
			
 
				 	 */
			
 
				 	public function enableGpuLod() {
			
 
				-		gpuLodEnabled = primitiveSubParts != null || getPrimitive().lodCount() > 1;
			
 
				+		gpuLodEnabled = primitiveSubMeshes != null || getPrimitive().lodCount() > 1;
			
 
				 		return gpuLodEnabled;
			
 
				 	}
			
 
				 
			
@@ -58,140 +60,135 @@ class GPUMeshBatch extends MeshBatch {
 
				 	function getLodCount() return gpuLodEnabled ? getPrimitive().lodCount() : 1;
			
 
				 	override function updateHasPrimitiveOffset() meshBatchFlags.set(HasPrimitiveOffset);
			
 
				 
			
 
				-	override function begin( emitCountTip = -1) {
			
 
				+	override function begin( emitCountTip = -1 ) {
			
 
				 		if ( !gpuLodEnabled && !gpuCullingEnabled )
			
 
				 			throw "No need to create a GPUMeshBatch without gpu lod nor gpu culling, create a regular MeshBatch instead";
			
 
				+		subPartsEmitted = 0;
			
 
				+		materialsEmitted = [for ( _ in 0...materials.length) 0.0];
			
 
				+		return super.begin(emitCountTip);
			
 
				+	}
			
 
				 
			
 
				-		emitCountTip = super.begin(emitCountTip);
			
 
				+	override function initSubMeshResources( emitCountTip ) {
			
 
				+		if ( cpuInstancesInfos == null ) {
			
 
				+			var instanceInfosByteSize = INSTANCES_INFOS_ELEMENT_COUNT << 2;
			
 
				+			cpuInstancesInfos = haxe.io.Bytes.alloc( emitCountTip * instanceInfosByteSize );
			
 
				+		}
			
 
				+	}
			
 
				 
			
 
				-		if ( primitiveSubParts != null && ( gpuCullingEnabled || gpuLodEnabled ) && instanceOffsetsCpu == null ) {
			
 
				-			var size = emitCountTip * 2 * 4;
			
 
				-			instanceOffsetsCpu = haxe.io.Bytes.alloc(size);
			
 
				+	override function emitSubMesh(subMeshIndex : Int) {
			
 
				+		var subMesh = getSubMesh(subMeshIndex);
			
 
				+
			
 
				+		var instanceInfosByteSize = INSTANCES_INFOS_ELEMENT_COUNT << 2;
			
 
				+		var minInstanceInfosSize = ( instanceCount + 1 ) * instanceInfosByteSize;
			
 
				+		if ( cpuInstancesInfos.length < minInstanceInfosSize ) {
			
 
				+			var next = haxe.io.Bytes.alloc(Std.int(cpuInstancesInfos.length * 3 / 2));
			
 
				+			next.blit(0, cpuInstancesInfos, 0, cpuInstancesInfos.length);
			
 
				+			cpuInstancesInfos = next;
			
 
				 		}
			
 
				 
			
 
				-		return emitCountTip;
			
 
				+		subPartsEmitted += subMesh.subParts.length;
			
 
				+		for ( subPart in subMesh.subParts )
			
 
				+			materialsEmitted[subPart.matIndex] += 1.0;
			
 
				+
			
 
				+		cpuInstancesInfos.setInt32(instanceCount << 2, subMeshIndex);
			
 
				 	}
			
 
				 
			
 
				-	override function emitPrimitiveSubParts() {
			
 
				-		if ( primitiveSubParts.length > 1 )
			
 
				-			throw "Multi material with gpu instancing is not supported";
			
 
				-		var primitiveSubPart = primitiveSubParts[0];
			
 
				-		if (emittedSubParts == null) {
			
 
				-			currentSubParts = 0;
			
 
				-			currentMaterialOffset = 0;
			
 
				-			emittedSubParts = [ primitiveSubPart.clone() ];
			
 
				-		} else {
			
 
				-			var currentIndexStart = emittedSubParts[currentSubParts].indexStart;
			
 
				-			if ( currentIndexStart != primitiveSubPart.indexStart  ) {
			
 
				-				currentSubParts = -1;
			
 
				-				currentIndexStart = primitiveSubPart.indexStart;
			
 
				-				currentMaterialOffset = 0;
			
 
				-				for ( i => part in emittedSubParts ) {
			
 
				-					if ( part.indexStart == currentIndexStart ) {
			
 
				-						currentSubParts = i;
			
 
				-						break;
			
 
				+	override function flushSubMeshResources() {
			
 
				+		var alloc = hxd.impl.Allocator.get();
			
 
				+		var upload = needUpload;
			
 
				+
			
 
				+		var instancesInfosElementCount = instanceCount * INSTANCES_INFOS_ELEMENT_COUNT ;
			
 
				+		if ( gpuInstancesInfos == null || gpuInstancesInfos.isDisposed() || instancesInfosElementCount > gpuInstancesInfos.vertices ) {
			
 
				+			if ( gpuInstancesInfos != null)
			
 
				+				alloc.disposeBuffer( gpuInstancesInfos );
			
 
				+			gpuInstancesInfos = alloc.allocBuffer( instancesInfosElementCount, INSTANCES_INFOS_FMT, UniformReadWrite );
			
 
				+			upload = true;
			
 
				+		}
			
 
				+
			
 
				+		if ( upload )
			
 
				+			gpuInstancesInfos.uploadBytes( cpuInstancesInfos, 0, instancesInfosElementCount );
			
 
				+
			
 
				+		if ( subMeshesInfos == null ) {
			
 
				+			var tmpSubMeshesInfos = alloc.allocFloats( SUB_MESHES_INFOS_ELEMENT_COUNT * primitiveSubMeshes.length );
			
 
				+
			
 
				+			var pos = 0;
			
 
				+			var subPartsCount = 0;
			
 
				+			var subPartsStart = 0;
			
 
				+			for ( subMesh in primitiveSubMeshes ) {
			
 
				+				tmpSubMeshesInfos[pos++] = subMesh.bounds.dimension() * 0.5;
			
 
				+				tmpSubMeshesInfos[pos++] = subMesh.lodCount;
			
 
				+				tmpSubMeshesInfos[pos++] = subPartsStart;
			
 
				+				tmpSubMeshesInfos[pos++] = subMesh.subParts.length;
			
 
				+				subPartsCount += subMesh.subParts.length;
			
 
				+				subPartsStart += subMesh.subParts.length * subMesh.lodCount;
			
 
				+			}
			
 
				+			subMeshesInfos = alloc.ofFloats( tmpSubMeshesInfos, hxd.BufferFormat.VEC4_DATA, Uniform );
			
 
				+			alloc.disposeFloats(tmpSubMeshesInfos);
			
 
				+
			
 
				+			pos = 0;
			
 
				+			var tmpSubPartsInfos = alloc.allocFloats( SUB_PARTS_INFOS_ELEMENT_COUNT * subPartsCount );
			
 
				+			for ( subMesh in primitiveSubMeshes ) {
			
 
				+				var lodCount = subMesh.lodCount;
			
 
				+				var lodConfig = subMesh.lodConfig;
			
 
				+				var lodConfigHasCulling = lodConfig.length > lodCount - 1;
			
 
				+				var minScreenRatioCulling = lodConfigHasCulling ? lodConfig[lodConfig.length - 1] : 0.0;
			
 
				+				for ( subPart in subMesh.subParts ) {
			
 
				+					tmpSubPartsInfos[pos++] = subPart.indexCount;
			
 
				+					tmpSubPartsInfos[pos++] = subPart.indexStart;
			
 
				+					tmpSubPartsInfos[pos++] = 0 < lodConfig.length ? lodConfig[0] : 0.0;
			
 
				+					tmpSubPartsInfos[pos++] = subPart.matIndex;
			
 
				+					for ( i in 1...lodCount ) {
			
 
				+						tmpSubPartsInfos[pos++] = subPart.lodIndexCount[i - 1];
			
 
				+						tmpSubPartsInfos[pos++] = subPart.lodIndexStart[i - 1];
			
 
				+						tmpSubPartsInfos[pos++] = i < lodConfig.length ? lodConfig[i] : 0.0;
			
 
				+						tmpSubPartsInfos[pos++] = subPart.matIndex;
			
 
				 					}
			
 
				-					currentMaterialOffset += part.lodIndexCount.length + 1;
			
 
				-				}
			
 
				-				if ( currentSubParts < 0 ) {
			
 
				-					currentSubParts = emittedSubParts.length;
			
 
				-					emittedSubParts.push( primitiveSubPart.clone() );
			
 
				+					tmpSubPartsInfos[pos - 2] = minScreenRatioCulling;
			
 
				 				}
			
 
				 			}
			
 
				+
			
 
				+			subPartsInfos = alloc.ofFloats( tmpSubPartsInfos, hxd.BufferFormat.VEC4_DATA, Uniform );
			
 
				+			alloc.disposeFloats(tmpSubPartsInfos);
			
 
				 		}
			
 
				-		var maxInstanceID = ( instanceCount + 1 ) * 2;
			
 
				-		if ( instanceOffsetsCpu.length < maxInstanceID * 4 ) {
			
 
				-			var next = haxe.io.Bytes.alloc(Std.int(instanceOffsetsCpu.length*3/2));
			
 
				-			next.blit(0, instanceOffsetsCpu, 0, instanceOffsetsCpu.length);
			
 
				-			instanceOffsetsCpu = next;
			
 
				-		}
			
 
				-		instanceOffsetsCpu.setInt32((instanceCount * 2 + 0) * 4, currentMaterialOffset);
			
 
				-		instanceOffsetsCpu.setInt32((instanceCount * 2 + 1) * 4, currentSubParts);
			
 
				 	}
			
 
				 
			
 
				 	override function flush() {
			
 
				 		var alloc = hxd.impl.Allocator.get();
			
 
				-		var lodCount = getLodCount();
			
 
				-		materialCount = materials.length;
			
 
				-		var prim = getPrimitive();
			
 
				-		var hmd = Std.downcast(prim, h3d.prim.HMDModel);
			
 
				-
			
 
				-		if ( emittedSubParts != null ) {
			
 
				-			var upload = needUpload;
			
 
				-			var vertex = instanceCount * 2;
			
 
				-			if ( instanceOffsetsGpu == null || instanceOffsetsGpu.isDisposed() || vertex > instanceOffsetsGpu.vertices ) {
			
 
				-				if ( instanceOffsetsGpu != null)
			
 
				-					alloc.disposeBuffer( instanceOffsetsGpu );
			
 
				-				instanceOffsetsGpu = alloc.allocBuffer( vertex, INSTANCE_OFFSETS_FMT, UniformReadWrite );
			
 
				-				upload = true;
			
 
				-			}
			
 
				-			if ( upload )
			
 
				-				instanceOffsetsGpu.uploadBytes( instanceOffsetsCpu, 0, vertex );
			
 
				-			if ( matInfos == null ) {
			
 
				-				materialCount = 0;
			
 
				-				var tmpSubPartInfos = alloc.allocFloats( 2 * emittedSubParts.length );
			
 
				-				var pos = 0;
			
 
				-				for ( subPart in emittedSubParts ) {
			
 
				-					var lodCount = subPart.lodIndexCount.length + 1;
			
 
				-					tmpSubPartInfos[pos++] = lodCount;
			
 
				-					tmpSubPartInfos[pos++] = subPart.bounds.dimension() * 0.5;
			
 
				-					materialCount += lodCount;
			
 
				-				}
			
 
				-				subPartsInfos = alloc.ofFloats( tmpSubPartInfos, hxd.BufferFormat.VEC4_DATA, Uniform );
			
 
				-				alloc.disposeFloats(tmpSubPartInfos);
			
 
				-
			
 
				-				var tmpMatInfos = alloc.allocFloats( 4 * ( materialCount + emittedSubParts.length ) );
			
 
				-				pos = 0;
			
 
				-				for ( subPart in emittedSubParts ) {
			
 
				-					var maxLod = subPart.lodIndexCount.length;
			
 
				-					var lodConfig = subPart.lodConfig;
			
 
				-					tmpMatInfos[pos++] = subPart.indexCount;
			
 
				-					tmpMatInfos[pos++] = subPart.indexStart;
			
 
				-					tmpMatInfos[pos++] = ( 0 < lodConfig.length ) ? lodConfig[0] : 0.0;
			
 
				-					tmpMatInfos[pos++] = ( maxLod < lodConfig.length && maxLod > 0 ) ? lodConfig[lodConfig.length - 1] : 0.0;
			
 
				-					for ( i in 0...maxLod ) {
			
 
				-						tmpMatInfos[pos++] = subPart.lodIndexCount[i];
			
 
				-						tmpMatInfos[pos++] = subPart.lodIndexStart[i];
			
 
				-						tmpMatInfos[pos++] = ( i + 1 < lodConfig.length ) ? lodConfig[i + 1] : 0.0;
			
 
				-						pos++;
			
 
				-					}
			
 
				-				}
			
 
				+		var materialCount = materials.length;
			
 
				 
			
 
				-				matInfos = alloc.ofFloats( tmpMatInfos, hxd.BufferFormat.VEC4_DATA, Uniform );
			
 
				-				alloc.disposeFloats(tmpMatInfos);
			
 
				-			}
			
 
				-		} else if ( matInfos == null ) {
			
 
				+		if ( !hasSubMeshes() ) {
			
 
				+			var prim = getPrimitive();
			
 
				 			if ( gpuLodEnabled ) {
			
 
				-				var tmpMatInfos = alloc.allocFloats( 4 * materialCount * lodCount );
			
 
				-				matInfos = alloc.allocBuffer( materialCount * lodCount, hxd.BufferFormat.VEC4_DATA, Uniform );
			
 
				+				var lodCount = getLodCount();
			
 
				+				var tmpSubPartsInfos = alloc.allocFloats( SUB_PARTS_INFOS_ELEMENT_COUNT * materialCount * lodCount );
			
 
				+				var hmd = Std.downcast(prim, h3d.prim.HMDModel);
			
 
				 				var lodConfig = hmd.getLodConfig();
			
 
				-				var startIndex : Int = 0;
			
 
				 				var lodConfigHasCulling = lodConfig.length > lodCount - 1;
			
 
				-				var minScreenRatioCulling = lodConfigHasCulling ? lodConfig[lodConfig.length-1] : 0.0;
			
 
				-				for ( i => lod in @:privateAccess hmd.lods ) {
			
 
				-					for ( j in 0...materialCount ) {
			
 
				-						var indexCount = lod.indexCounts[j];
			
 
				-						var matIndex = i + j * lodCount;
			
 
				-						tmpMatInfos[matIndex * 4 + 0] = indexCount;
			
 
				-						tmpMatInfos[matIndex * 4 + 1] = startIndex;
			
 
				-						tmpMatInfos[matIndex * 4 + 2] = ( i < lodConfig.length ) ? lodConfig[i] : 0.0;
			
 
				-						tmpMatInfos[matIndex * 4 + 3] = minScreenRatioCulling;
			
 
				-						startIndex += indexCount;
			
 
				+				var minScreenRatioCulling = lodConfigHasCulling ? lodConfig[lodConfig.length - 1] : 0.0;
			
 
				+				var pos = 0;
			
 
				+				for ( matIndex in 0...materialCount ) {
			
 
				+					for ( lodIndex in 0...lodCount ) {
			
 
				+						tmpSubPartsInfos[pos++] = hmd.getMaterialIndexCount(matIndex, lodIndex);
			
 
				+						tmpSubPartsInfos[pos++] = hmd.getMaterialIndexStart(matIndex, lodIndex);
			
 
				+						tmpSubPartsInfos[pos++] = lodIndex < lodConfig.length ? lodConfig[lodIndex] : 0.0;
			
 
				+						tmpSubPartsInfos[pos++] = matIndex;
			
 
				 					}
			
 
				+					tmpSubPartsInfos[pos - 2] = minScreenRatioCulling;
			
 
				 				}
			
 
				-				matInfos.uploadFloats( tmpMatInfos, 0, materialCount * lodCount );
			
 
				-				alloc.disposeFloats( tmpMatInfos );
			
 
				+
			
 
				+				subPartsInfos = alloc.ofFloats( tmpSubPartsInfos, hxd.BufferFormat.VEC4_DATA, Uniform );
			
 
				+				alloc.disposeFloats( tmpSubPartsInfos );
			
 
				 			} else {
			
 
				-				var tmpMatInfos = alloc.allocFloats( 4 * materialCount );
			
 
				-				matInfos = alloc.allocBuffer( materialCount, hxd.BufferFormat.VEC4_DATA, Uniform );
			
 
				+				var tmpSubPartsInfos = alloc.allocFloats( SUB_PARTS_INFOS_ELEMENT_COUNT * materialCount );
			
 
				 				var pos : Int = 0;
			
 
				 				for ( i in 0...materials.length ) {
			
 
				-					tmpMatInfos[pos++] = prim.getMaterialIndexCount(i);
			
 
				-					tmpMatInfos[pos++] = prim.getMaterialIndexStart(i);
			
 
				-					pos += 2;
			
 
				+					tmpSubPartsInfos[pos++] = prim.getMaterialIndexCount(i);
			
 
				+					tmpSubPartsInfos[pos++] = prim.getMaterialIndexStart(i);
			
 
				+					tmpSubPartsInfos[pos++] = 0.0;
			
 
				+					tmpSubPartsInfos[pos++] = i;
			
 
				 				}
			
 
				-				matInfos.uploadFloats( tmpMatInfos, 0, materialCount );
			
 
				-				alloc.disposeFloats( tmpMatInfos );
			
 
				+				subPartsInfos = alloc.ofFloats( tmpSubPartsInfos, hxd.BufferFormat.VEC4_DATA, Uniform );
			
 
				+				alloc.disposeFloats( tmpSubPartsInfos );
			
 
				 			}
			
 
				 		}
			
 
				 
			
@@ -199,7 +196,7 @@ class GPUMeshBatch extends MeshBatch {
 
				 
			
 
				 		var computeShader : h3d.shader.InstanceIndirect.InstanceIndirectBase;
			
 
				 		if( computePass == null ) {
			
 
				-			computeShader = emittedSubParts != null ? new h3d.shader.InstanceIndirect.SubPartInstanceIndirect() : new h3d.shader.InstanceIndirect();
			
 
				+			computeShader = hasSubMeshes() ? new h3d.shader.InstanceIndirect.SubPartInstanceIndirect() : new h3d.shader.InstanceIndirect();
			
 
				 			computePass = new h3d.mat.Pass("batchUpdate");
			
 
				 			computePass.addShader(computeShader);
			
 
				 			addComputeShaders(computePass);
			
@@ -211,42 +208,39 @@ class GPUMeshBatch extends MeshBatch {
 
				 		computeShader.ENABLE_CULLING = gpuCullingEnabled;
			
 
				 		computeShader.ENABLE_DISTANCE_CLIPPING = maxDistance >= 0;
			
 
				 		computeShader.maxDistance = maxDistance;
			
 
				-		computeShader.MAX_MATERIAL_COUNT = 16;
			
 
				-		while ( materialCount * lodCount > computeShader.MAX_MATERIAL_COUNT )
			
 
				-			computeShader.MAX_MATERIAL_COUNT = computeShader.MAX_MATERIAL_COUNT + 16;
			
 
				-		computeShader.matInfos = matInfos;
			
 
				+
			
 
				+		computeShader.subPartsInfos = subPartsInfos;
			
 
				 		computeShader.instanceCount = instanceCount;
			
 
				 
			
 
				 		var commandCountNeeded : Int;
			
 
				-		if ( emittedSubParts != null ) {
			
 
				-			commandCountNeeded = instanceCount;
			
 
				+		if ( hasSubMeshes() ) {
			
 
				+			commandCountNeeded = subPartsEmitted;
			
 
				 			var computeShader : h3d.shader.InstanceIndirect.SubPartInstanceIndirect = cast computeShader;
			
 
				-			computeShader.subPartCount = emittedSubParts.length;
			
 
				-			computeShader.subPartInfos = subPartsInfos;
			
 
				-			computeShader.instanceOffsets = instanceOffsetsGpu;
			
 
				-			computeShader.MAX_SUB_PART_BUFFER_ELEMENT_COUNT = 16;
			
 
				-			var maxSubPartsElement = hxd.Math.ceil( emittedSubParts.length / 2 );
			
 
				-			while ( maxSubPartsElement > computeShader.MAX_SUB_PART_BUFFER_ELEMENT_COUNT )
			
 
				-				computeShader.MAX_SUB_PART_BUFFER_ELEMENT_COUNT = computeShader.MAX_SUB_PART_BUFFER_ELEMENT_COUNT + 16;
			
 
				+			computeShader.MATERIAL_COUNT = materialCount;
			
 
				+			var materialCommandStart = [new h3d.Vector4()];
			
 
				+			for ( i in 1...materialCount )
			
 
				+				materialCommandStart.push(new h3d.Vector4(materialsEmitted[i-1]));
			
 
				+			computeShader.materialCommandStart = materialCommandStart;
			
 
				+			computeShader.subMeshesInfos = subMeshesInfos;
			
 
				+			computeShader.instancesInfos = gpuInstancesInfos;
			
 
				 		} else {
			
 
				-			commandCountNeeded = instanceCount * materialCount;
			
 
				+			commandCountNeeded = materialCount * instanceCount;
			
 
				 			var computeShader : h3d.shader.InstanceIndirect = cast computeShader;
			
 
				+			var prim = getPrimitive();
			
 
				 			computeShader.radius = prim.getBounds().dimension() * 0.5;
			
 
				-			computeShader.lodCount = lodCount;
			
 
				-			computeShader.materialCount = materialCount;
			
 
				+			computeShader.lodCount = getLodCount();
			
 
				+			computeShader.subPartsCount = materialCount;
			
 
				 		}
			
 
				 
			
 
				 		var alloc = hxd.impl.Allocator.get();
			
 
				 		var commandCountAllocated = hxd.Math.nextPOT( commandCountNeeded );
			
 
				 		if ( commandBuffer == null ) {
			
 
				 			commandBuffer = alloc.allocBuffer( commandCountAllocated, INDIRECT_DRAW_ARGUMENTS_FMT, UniformReadWrite );
			
 
				-			gpuCounter = new h3d.GPUCounter();
			
 
				+			gpuCounter = new h3d.GPUCounter( materialCount );
			
 
				 		} else if ( commandBuffer.vertices < commandCountAllocated ) {
			
 
				 			alloc.disposeBuffer( commandBuffer );
			
 
				 			commandBuffer = alloc.allocBuffer( commandCountAllocated, INDIRECT_DRAW_ARGUMENTS_FMT, UniformReadWrite );
			
 
				 		}
			
 
				-
			
 
				-		materialCount = 0;
			
 
				 	}
			
 
				 
			
 
				 	function addComputeShaders( pass : h3d.mat.Pass ) {}
			
@@ -279,7 +273,7 @@ class GPUMeshBatch extends MeshBatch {
 
				 			@:privateAccess instanced.commands.data = commandBuffer.vbuf;
			
 
				 			@:privateAccess instanced.commands.countBuffer = gpuCounter.buffer.vbuf;
			
 
				 			@:privateAccess instanced.commands.offset = p.matIndex * instanceCount;
			
 
				-			@:privateAccess instanced.commands.countOffset = 0;
			
 
				+			@:privateAccess instanced.commands.countOffset = p.matIndex;
			
 
				 		}
			
 
				 	}
			
 
				 
			
@@ -298,23 +292,25 @@ class GPUMeshBatch extends MeshBatch {
 
				 		super.cleanPasses();
			
 
				 
			
 
				 		var alloc = hxd.impl.Allocator.get();
			
 
				-		if ( matInfos != null ) {
			
 
				-			alloc.disposeBuffer(matInfos);
			
 
				-			matInfos = null;
			
 
				+		if ( subPartsInfos != null ) {
			
 
				+			alloc.disposeBuffer(subPartsInfos);
			
 
				+			subPartsInfos = null;
			
 
				 		}
			
 
				 
			
 
				-		if ( subPartsInfos != null )
			
 
				-			alloc.disposeBuffer(subPartsInfos);
			
 
				+		if ( subMeshesInfos != null ) {
			
 
				+			alloc.disposeBuffer(subMeshesInfos);
			
 
				+			subMeshesInfos = null;
			
 
				+		}
			
 
				 
			
 
				-		if ( instanceOffsetsGpu != null )
			
 
				-			alloc.disposeBuffer(instanceOffsetsGpu);
			
 
				-		instanceOffsetsCpu = null;
			
 
				+		if ( gpuInstancesInfos != null ) {
			
 
				+			alloc.disposeBuffer(gpuInstancesInfos);
			
 
				+			gpuInstancesInfos = null;
			
 
				+		}
			
 
				+		cpuInstancesInfos = null;
			
 
				 
			
 
				 		if ( commandBuffer != null )
			
 
				 			alloc.disposeBuffer(commandBuffer);
			
 
				 		if( gpuCounter != null )
			
 
				 			gpuCounter.dispose();
			
 
				-
			
 
				-		emittedSubParts = null;
			
 
				 	}
			
 
				 }
			
--- a/h3d/scene/MeshBatch.hx
+++ b/h3d/scene/MeshBatch.hx
@@ -7,8 +7,11 @@ enum MeshBatchFlag {
 
				 	HasPrimitiveOffset;

			
 
				 	EnableCpuLod;

			
 
				 	ForceGpuUpdate;

			
 
				+	EnableSubMesh;

			
 
				 }

			
 
				 

			
 
				+typedef CpuIndirectCallBuffer = { bytes : haxe.io.Bytes, count : Int };

			
 
				+

			
 
				 /**

			
 
				 	h3d.scene.MeshBatch allows to draw multiple meshed in a single draw call.

			
 
				 	See samples/MeshBatch.hx for an example.

			
@@ -18,9 +21,10 @@ class MeshBatch extends MultiMaterial {
 
				 	static var modelViewID = hxsl.Globals.allocID("global.modelView");

			
 
				 	static var modelViewInverseID = hxsl.Globals.allocID("global.modelViewInverse");

			
 
				 	static var previousModelViewID = hxsl.Globals.allocID("global.previousModelView");

			
 
				-	static var MAX_BUFFER_ELEMENTS = 4096;

			
 
				-	static var MAX_STORAGE_BUFFER_ELEMENTS = 128 * 1024 * 1024 >> 2;

			
 
				 	static var BATCH_START_FMT = hxd.BufferFormat.make([{ name : "Batch_Start", type : DFloat }]);

			
 
				+	inline static var MAX_BUFFER_ELEMENTS = 4096;

			
 
				+	inline static var MAX_STORAGE_BUFFER_ELEMENTS = 128 * 1024 * 1024 >> 2;

			
 
				+	inline static var DEFAULT_EMIT_COUNT_TIP = 128;

			
 
				 

			
 
				 	var instanced : h3d.prim.Instanced;

			
 
				 	var dataPasses : BatchData;

			
@@ -42,14 +46,19 @@ class MeshBatch extends MultiMaterial {
 
				 	 * 	If set, use this position in emitInstance() instead MeshBatch absolute position

			
 
				 	**/

			
 
				 	public var worldPosition : Matrix;

			
 
				-	var invWorldPosition : Matrix;

			
 
				 

			
 
				 	/**

			
 
				 		Tells the mesh batch to draw only a subpart of the primitive.

			
 
				-		One primitiveSubPart per material.

			
 
				 	**/

			
 
				-	public var primitiveSubParts : Array<MeshBatchPart>;

			
 
				-	var primitiveSubBytes : Array<haxe.io.Bytes>;

			
 
				+	public var primitiveSubMeshes : Array<SubMesh>;

			
 
				+	public var curSubMesh : Int = -1;

			
 
				+

			
 
				+	/**

			
 
				+		Use one indirect call buffer per material.

			
 
				+		Instances can not be culled for a specific pass yet.

			
 
				+	**/

			
 
				+	var cpuIndirectCallBuffers : Array<CpuIndirectCallBuffer>;

			
 
				+	var gpuIndirectCallBuffers : Array<h3d.impl.InstanceBuffer>;

			
 
				 

			
 
				 	/**

			
 
				 		If set, exact bounds will be recalculated during emitInstance (default true)

			
@@ -58,8 +67,8 @@ class MeshBatch extends MultiMaterial {
 
				 

			
 
				 	/**

			
 
				 	 	With EnableCpuLod, set the lod of the next emitInstance.

			
 
				-		Without EnableCpuLod and not using primitiveSubParts, set the lod of the whole batch.

			
 
				-	 */

			
 
				+		Without EnableCpuLod and not using primitiveSubMeshes, set the lod of the whole batch.

			
 
				+	**/

			
 
				 	public var curLod : Int = -1;

			
 
				 

			
 
				 	public function new( primitive, ?material, ?parent ) {

			
@@ -98,13 +107,25 @@ class MeshBatch extends MultiMaterial {
 
				 		meshBatchFlags.set(ForceGpuUpdate);

			
 
				 	}

			
 
				 

			
 
				+	/**

			
 
				+	 * Use sub mesh to emit instance.

			
 
				+	 * Don't support multiple materials without Storage Buffer to simplify implementation.

			
 
				+	**/

			
 
				+	public function enableSubMesh() {

			
 
				+		meshBatchFlags.set(EnableSubMesh);

			
 
				+		if ( materials.length > 1 )

			
 
				+			meshBatchFlags.set(EnableStorageBuffer);

			
 
				+	}

			
 
				+

			
 
				 	public function enableCpuLod() {

			
 
				 		var prim = getPrimitive();

			
 
				 		var lodCount = prim.lodCount();

			
 
				 		if ( lodCount <= 1 )

			
 
				 			return;

			
 
				-		if ( partsFromPrimitive(prim) )

			
 
				+		if ( partsFromPrimitive(prim) ) {

			
 
				 			meshBatchFlags.set(EnableCpuLod);

			
 
				+			meshBatchFlags.set(EnableStorageBuffer);

			
 
				+		}

			
 
				 	}

			
 
				 

			
 
				 	function getPrimitive() return @:privateAccess instanced.primitive;

			
@@ -113,6 +134,7 @@ class MeshBatch extends MultiMaterial {
 
				 	function gpuUpdateForced() return meshBatchFlags.has(ForceGpuUpdate);

			
 
				 	function getMaxElements() return storageBufferEnabled() ? MAX_STORAGE_BUFFER_ELEMENTS : MAX_BUFFER_ELEMENTS;

			
 
				 	function hasPrimitiveOffset() return meshBatchFlags.has(HasPrimitiveOffset);

			
 
				+	function hasSubMeshes() return meshBatchFlags.has(EnableSubMesh);

			
 
				 	function cpuLodEnabled() return meshBatchFlags.has(EnableCpuLod);

			
 
				 

			
 
				 	inline function shouldResizeDown( currentSize : Int, minSize : Int ) : Bool {

			
@@ -121,14 +143,19 @@ class MeshBatch extends MultiMaterial {
 
				 

			
 
				 	public function begin( emitCountTip = -1 ) : Int {

			
 
				 		instanceCount = 0;

			
 
				+

			
 
				+		if ( emitCountTip < 0 )

			
 
				+			emitCountTip = DEFAULT_EMIT_COUNT_TIP;

			
 
				+

			
 
				+		if ( primitiveSubMeshes != null )

			
 
				+			enableSubMesh();

			
 
				+

			
 
				 		instanced.initBounds();

			
 
				 		if( shadersChanged ) {

			
 
				 			initShadersMapping();

			
 
				 			shadersChanged = false;

			
 
				 		}

			
 
				 

			
 
				-		if( emitCountTip < 0 )

			
 
				-			emitCountTip = 128;

			
 
				 		var p = dataPasses;

			
 
				 		var alloc = hxd.impl.Allocator.get();

			
 
				 		while( p != null ) {

			
@@ -140,9 +167,22 @@ class MeshBatch extends MultiMaterial {
 
				 			p = p.next;

			
 
				 		}

			
 
				 

			
 
				+		if ( hasSubMeshes() )

			
 
				+			initSubMeshResources( emitCountTip );

			
 
				+

			
 
				 		return emitCountTip;

			
 
				 	}

			
 
				 

			
 
				+	function initSubMeshResources( emitCountTip ) {

			
 
				+		if ( cpuIndirectCallBuffers == null ) {

			
 
				+			var instanceSize = emitCountTip * h3d.impl.InstanceBuffer.ELEMENT_SIZE;

			
 
				+			cpuIndirectCallBuffers = [for ( _ in 0...materials.length ) { bytes : haxe.io.Bytes.alloc(instanceSize), count : 0 }];

			
 
				+		} else {

			
 
				+			for ( cpuIndirectCallBuffer in cpuIndirectCallBuffers )

			
 
				+				cpuIndirectCallBuffer.count = 0;

			
 
				+		}

			
 
				+	}

			
 
				+

			
 
				 	function initShadersMapping() {

			
 
				 		var scene = getScene();

			
 
				 		if( scene == null ) return;

			
@@ -206,7 +246,7 @@ class MeshBatch extends MultiMaterial {
 
				 		}

			
 
				 	}

			
 
				 

			
 
				-	function updateHasPrimitiveOffset() meshBatchFlags.setTo(HasPrimitiveOffset, primitiveSubParts != null);

			
 
				+	function updateHasPrimitiveOffset() meshBatchFlags.setTo(HasPrimitiveOffset, hasSubMeshes());

			
 
				 

			
 
				 	function createBatchData() {

			
 
				 		return new BatchData();

			
@@ -258,7 +298,7 @@ class MeshBatch extends MultiMaterial {
 
				 			for( i in prev...fmt.length )

			
 
				 				curPos += fmt[i].getBytesSize() >> 2;

			
 
				 		}

			
 
				-		if ( curPos & 3 != 0) {

			
 
				+		if ( curPos & 3 != 0 ) {

			
 
				 			var paddingSize = 4 - (curPos & 3);

			
 
				 			var paddingType : hxsl.Ast.Type = switch ( paddingSize ) {

			
 
				 			case 1:

			
@@ -275,63 +315,82 @@ class MeshBatch extends MultiMaterial {
 
				 	}

			
 
				 

			
 
				 	public function emitInstance() {

			
 
				-		if( primitiveSubParts != null )

			
 
				-			emitPrimitiveSubParts();

			
 
				+		// When using sub meshes we need to fill the indirect call buffers for multi draw

			
 
				+		if( hasSubMeshes() )

			
 
				+			emitSubMesh(curSubMesh);

			
 
				 

			
 
				-		if(!gpuUpdateForced()){

			
 
				-			if( worldPosition == null ) syncPos();

			
 
				+		// Instance data can be filled from the GPU

			
 
				+		if( !gpuUpdateForced() ) {

			
 
				 

			
 
				-			if (primitiveSubParts == null && calcBounds)

			
 
				+			if ( !hasSubMeshes() && calcBounds)

			
 
				 				instanced.addInstanceBounds(worldPosition == null ? absPos : worldPosition);

			
 
				 

			
 
				-			var p = dataPasses;

			
 
				-			while( p != null ) {

			
 
				-				syncData(p);

			
 
				-				p = p.next;

			
 
				-			}

			
 
				+			// Use the mesh batch abs pos if no world position has been set.

			
 
				+			if ( worldPosition == null )

			
 
				+				syncPos();

			
 
				+

			
 
				+			syncData();

			
 
				 		}

			
 
				 

			
 
				 		instanceCount++;

			
 
				 	}

			
 
				 

			
 
				-	function emitPrimitiveSubParts() {

			
 
				+	function getSubMesh( subMeshIndex : Int ) : SubMesh {

			
 
				+		return primitiveSubMeshes[subMeshIndex];

			
 
				+	}

			
 
				+

			
 
				+	function emitSubMesh(subMeshIndex : Int) {

			
 
				+		if ( cpuIndirectCallBuffers == null )

			
 
				+			throw "Something went wrong during the initialization";

			
 
				+		if ( subMeshIndex < 0 || subMeshIndex >= primitiveSubMeshes.length )

			
 
				+			throw "Invalid subMeshIndex";

			
 
				+

			
 
				+		var subMesh = getSubMesh(subMeshIndex);

			
 
				+		var subParts = subMesh.subParts;

			
 
				 		if(calcBounds) @:privateAccess {

			
 
				-			for ( primitiveSubPart in primitiveSubParts ) {

			
 
				-				instanced.tmpBounds.load(primitiveSubPart.bounds);

			
 
				-				instanced.tmpBounds.transform(worldPosition == null ? absPos : worldPosition);

			
 
				-				instanced.bounds.add(instanced.tmpBounds);

			
 
				-			}

			
 
				+			instanced.tmpBounds.load(subMesh.bounds);

			
 
				+			instanced.tmpBounds.transform(worldPosition == null ? absPos : worldPosition);

			
 
				+			instanced.bounds.add(instanced.tmpBounds);

			
 
				 		}

			
 
				 

			
 
				-		if( primitiveSubBytes == null ) {

			
 
				-			if ( primitiveSubParts.length != materials.length )

			
 
				-				throw "Instancing using primitive sub parts must match material count";

			
 
				-			primitiveSubBytes = [for ( i in 0...primitiveSubParts.length ) haxe.io.Bytes.alloc(128)];

			
 
				-			instanced.commands = null;

			
 
				-		}

			
 
				 		var instanceSize = h3d.impl.InstanceBuffer.ELEMENT_SIZE;

			
 
				-		for ( i in 0...primitiveSubBytes.length ) {

			
 
				-			if( primitiveSubBytes[i].length < (instanceCount+1) * instanceSize ) {

			
 
				-				var next = haxe.io.Bytes.alloc(Std.int(primitiveSubBytes[i].length*3/2));

			
 
				-				next.blit(0, primitiveSubBytes[i], 0, instanceCount * instanceSize);

			
 
				-				primitiveSubBytes[i] = next;

			
 
				-			}

			
 
				-		}

			
 
				-		var p = instanceCount * instanceSize;

			
 
				-		for ( mid => psBytes in primitiveSubBytes ) {

			
 
				-			var primitiveSubPart = primitiveSubParts[mid];

			
 
				-			var indexCount = primitiveSubPart.indexCount;

			
 
				-			var indexStart = primitiveSubPart.indexStart;

			
 
				+		for ( subPart in subParts ) {

			
 
				+			var indexCount = subPart.indexCount;

			
 
				+			var indexStart = subPart.indexStart;

			
 
				 			if ( curLod >= 0 && cpuLodEnabled() ) {

			
 
				-				indexStart = primitiveSubPart.lodIndexStart[curLod];

			
 
				-				indexCount = primitiveSubPart.lodIndexCount[curLod];

			
 
				+				indexStart = subPart.lodIndexStart[curLod];

			
 
				+				indexCount = subPart.lodIndexCount[curLod];

			
 
				+			}

			
 
				+

			
 
				+			if ( indexCount == 0 && storageBufferEnabled() )

			
 
				+				continue;

			
 
				+

			
 
				+			var matIndex = subPart.matIndex;

			
 
				+			var indirectCallBuffer = cpuIndirectCallBuffers[matIndex];

			
 
				+

			
 
				+			// Resize

			
 
				+			var count = indirectCallBuffer.count++;

			
 
				+			var pos = count * instanceSize;

			
 
				+			var minIndirectCallBufferSize = pos + instanceSize;

			
 
				+			if ( indirectCallBuffer.bytes.length < minIndirectCallBufferSize ) {

			
 
				+				var next = haxe.io.Bytes.alloc(Std.int((indirectCallBuffer.bytes.length * 3 / 2)));

			
 
				+				next.blit(0, indirectCallBuffer.bytes, 0, pos);

			
 
				+				indirectCallBuffer.bytes = next;

			
 
				 			}

			
 
				-			psBytes.setInt32(p, indexCount);

			
 
				-			psBytes.setInt32(p + 4, 1);

			
 
				-			psBytes.setInt32(p + 8, indexStart);

			
 
				-			psBytes.setInt32(p + 12, primitiveSubPart.baseVertex);

			
 
				-			psBytes.setInt32(p + 16, instanceCount);

			
 
				+

			
 
				+			// Emit

			
 
				+			var bytes = indirectCallBuffer.bytes;

			
 
				+			bytes.setInt32(pos, indexCount);

			
 
				+			bytes.setInt32(pos + 4, 1);

			
 
				+			bytes.setInt32(pos + 8, indexStart);

			
 
				+			bytes.setInt32(pos + 12, 0);

			
 
				+			bytes.setInt32(pos + 16, instanceCount);

			
 
				+

			
 
				+			cpuIndirectCallBuffers[matIndex] = indirectCallBuffer;

			
 
				 		}

			
 
				+

			
 
				+		// To clean

			
 
				+		instanced.commands = null;

			
 
				 	}

			
 
				 

			
 
				 	override function sync(ctx:RenderContext) {

			
@@ -340,6 +399,39 @@ class MeshBatch extends MultiMaterial {
 
				 		flush();

			
 
				 	}

			
 
				 

			
 
				+	function flushSubMeshResources() {

			
 
				+		if ( !storageBufferEnabled() )

			
 
				+			throw "Storage buffer must be set to use per material indirect call buffers";

			
 
				+

			
 
				+		if ( gpuIndirectCallBuffers == null )

			
 
				+			gpuIndirectCallBuffers = [for ( i in 0...materials.length ) new h3d.impl.InstanceBuffer()];

			
 
				+

			
 
				+		for ( matIndex in 0...materials.length ) {

			
 
				+			var cpuIndirectCallBuffer = cpuIndirectCallBuffers[matIndex];

			
 
				+			var gpuIndirectCallBuffer = gpuIndirectCallBuffers[matIndex];

			
 
				+

			
 
				+			// Upload indirect call buffer

			
 
				+			var count = cpuIndirectCallBuffer.count;

			
 
				+			if ( needUpload || gpuIndirectCallBuffer.commandCount != count ) {

			
 
				+				var bytes = cpuIndirectCallBuffer.bytes;

			
 
				+				if ( count == 0 ) {

			
 
				+					count = 1;

			
 
				+					bytes.setInt32(0,  0);

			
 
				+					bytes.setInt32(4,  0);

			
 
				+					bytes.setInt32(8,  0);

			
 
				+					bytes.setInt32(12, 0);

			
 
				+					bytes.setInt32(16, 0);

			
 
				+				}

			
 
				+

			
 
				+				var gpuIndirectCallMaxCount = gpuIndirectCallBuffer.maxCommandCount;

			
 
				+				if ( shouldResizeDown(gpuIndirectCallMaxCount, count) || count > gpuIndirectCallMaxCount )

			
 
				+					gpuIndirectCallBuffer.allocFromBytes(count, bytes);

			
 
				+				else

			
 
				+					gpuIndirectCallBuffer.uploadBytes(count, bytes);

			
 
				+			}

			
 
				+		}

			
 
				+	}

			
 
				+

			
 
				 	public function flush() {

			
 
				 		var p = dataPasses;

			
 
				 		var alloc = hxd.impl.Allocator.get();

			
@@ -347,53 +439,60 @@ class MeshBatch extends MultiMaterial {
 
				 		var prim = getPrimitive();

			
 
				 		var instanceSize = h3d.impl.InstanceBuffer.ELEMENT_SIZE;

			
 
				 

			
 
				+		if ( hasSubMeshes() && storageBufferEnabled() )

			
 
				+			flushSubMeshResources();

			
 
				+

			
 
				+		// Allocate and upload GPU buffers for each data passes

			
 
				 		while( p != null ) {

			
 
				 			var index = 0;

			
 
				 			var start = 0;

			
 
				 			while( start < instanceCount ) {

			
 
				 				var upload = needUpload;

			
 
				 				var buf = p.buffers[index];

			
 
				-				var count = instanceCount - start;

			
 
				-				if( count > p.maxInstance )

			
 
				-					count = p.maxInstance;

			
 
				+				if( instanceCount > p.maxInstance && storageBufferEnabled() )

			
 
				+					throw "Maximum instance count reached";

			
 
				 

			
 
				+				var count = hxd.Math.imin(instanceCount - start, p.maxInstance);

			
 
				 				var maxVertexCount = gpuUpdateEnabled() ? p.maxInstance : getMaxElements();

			
 
				 				var vertexCount = Std.int( count * (( 4 * p.paramsCount ) / p.bufferFormat.stride) );

			
 
				 				var vertexCountAllocated = #if js Std.int( MAX_BUFFER_ELEMENTS * 4 / p.bufferFormat.stride ) #else hxd.Math.imin( hxd.Math.nextPOT( vertexCount ), maxVertexCount ) #end;

			
 
				 

			
 
				+				// Lazy instance data buffer allocation

			
 
				 				if( buf == null || buf.isDisposed() || buf.vertices < vertexCountAllocated ) {

			
 
				 					var bufferFlags : hxd.impl.Allocator.BufferFlags = storageBufferEnabled() ? UniformReadWrite : UniformDynamic;

			
 
				-

			
 
				 					if ( buf != null )

			
 
				 						alloc.disposeBuffer(buf);

			
 
				-					buf = alloc.allocBuffer( vertexCountAllocated, p.bufferFormat,bufferFlags );

			
 
				+					buf = alloc.allocBuffer( vertexCountAllocated, p.bufferFormat, bufferFlags );

			
 
				 					p.buffers[index] = buf;

			
 
				 					upload = true;

			
 
				 				}

			
 
				+

			
 
				+				// Upload instance data buffer

			
 
				 				if( upload && !gpuUpdateForced())

			
 
				 					buf.uploadFloats(p.data, start * p.paramsCount * 4, vertexCount);

			
 
				-				if( primitiveSubBytes != null ) {

			
 
				-					if( p.instanceBuffers == null )

			
 
				-						p.instanceBuffers = [];

			
 
				-					var ibuf = p.instanceBuffers[index];

			
 
				-					if ( ibuf == null )

			
 
				-						ibuf = new h3d.impl.InstanceBuffer();

			
 
				-					var ibufUpload = needUpload || ibuf.commandCount != count;

			
 
				-					if ( ibufUpload ) {

			
 
				-						var psBytes = primitiveSubBytes[p.matIndex];

			
 
				+

			
 
				+				if( hasSubMeshes() && !storageBufferEnabled() ) {

			
 
				+					if( p.indirectCallBuffers == null )

			
 
				+						p.indirectCallBuffers = [];

			
 
				+					var indirectCallBuffer = p.indirectCallBuffers[index];

			
 
				+					if ( indirectCallBuffer == null )

			
 
				+						indirectCallBuffer = new h3d.impl.InstanceBuffer();

			
 
				+					var upload = needUpload || indirectCallBuffer.commandCount != count;

			
 
				+					if ( upload ) {

			
 
				+						var bytes = cpuIndirectCallBuffers[p.matIndex].bytes;

			
 
				 						if ( start > 0 && count < instanceCount ) {

			
 
				-							psBytes = psBytes.sub(start*instanceSize,count*instanceSize);

			
 
				+							bytes = bytes.sub(start*instanceSize,count*instanceSize);

			
 
				 							for( i in 0...count )

			
 
				-								psBytes.setInt32(i*instanceSize+16, i);

			
 
				+								bytes.setInt32(i*instanceSize+16, i);

			
 
				 						}

			
 
				 

			
 
				-						var ibufMaxCommandCount = ibuf.maxCommandCount;

			
 
				-						if ( shouldResizeDown(ibufMaxCommandCount, count) || count > ibufMaxCommandCount) {

			
 
				-							ibuf.allocFromBytes(count, psBytes);

			
 
				+						var maxCommandCount = indirectCallBuffer.maxCommandCount;

			
 
				+						if ( shouldResizeDown(maxCommandCount, count) || count > maxCommandCount) {

			
 
				+							indirectCallBuffer.allocFromBytes(count, bytes);

			
 
				 						} else {

			
 
				-							ibuf.uploadBytes(count, psBytes);

			
 
				+							indirectCallBuffer.uploadBytes(count, bytes);

			
 
				 						}

			
 
				-						p.instanceBuffers[index] = ibuf;

			
 
				+						p.indirectCallBuffers[index] = indirectCallBuffer;

			
 
				 					}

			
 
				 				}

			
 
				 

			
@@ -409,18 +508,18 @@ class MeshBatch extends MultiMaterial {
 
				 				alloc.disposeBuffer( p.buffers.pop() );

			
 
				 			p = p.next;

			
 
				 		}

			
 
				-		if( hasPrimitiveOffset() ) {

			
 
				+		if ( hasPrimitiveOffset() ) {

			
 
				 			var offsets = prim.resolveBuffer("Batch_Start");

			
 
				-			if( offsets == null || offsets.vertices < instanceCount || offsets.isDisposed() ) {

			
 
				-				if( offsets != null ) {

			
 
				+			if ( offsets == null || offsets.vertices < instanceCount || offsets.isDisposed() ) {

			
 
				+				if ( offsets != null ) {

			
 
				 					offsets.dispose();

			
 
				 					prim.removeBuffer(offsets);

			
 
				 				}

			
 
				 				var tmp = haxe.io.Bytes.alloc(4 * instanceCount);

			
 
				-				for( i in 0...instanceCount )

			
 
				+				for ( i in 0...instanceCount )

			
 
				 					tmp.setFloat(i<<2, i);

			
 
				 				offsets = new h3d.Buffer(instanceCount, BATCH_START_FMT);

			
 
				-				offsets.uploadBytes(tmp,0,instanceCount);

			
 
				+				offsets.uploadBytes(tmp, 0, instanceCount);

			
 
				 				prim.addBuffer(offsets);

			
 
				 			}

			
 
				 		}

			
@@ -431,63 +530,61 @@ class MeshBatch extends MultiMaterial {
 
				 

			
 
				 	function onFlushPass(p : BatchData) {}

			
 
				 

			
 
				-	function syncData( batch : BatchData ) {

			
 
				-		var startPos = batch.paramsCount * instanceCount << 2;

			
 
				-		// in case we are bigger than emitCountTip

			
 
				-		if( startPos + (batch.paramsCount<<2) > batch.data.length )

			
 
				-			batch.data.grow(batch.data.length << 1);

			
 
				+	function syncData() {

			
 
				+		var batch = dataPasses;

			
 
				+		var invWorldPosition = null;

			
 
				+		var worldPosition = worldPosition ?? absPos;

			
 
				+		while( batch != null ) {

			
 
				+			var startPos = batch.paramsCount * instanceCount << 2;

			
 
				+			// in case we are bigger than emitCountTip

			
 
				+			if( startPos + (batch.paramsCount << 2) > batch.data.length )

			
 
				+				batch.data.grow(batch.data.length << 1);

			
 
				 

			
 
				-		var p = batch.params;

			
 
				-		var buf = batch.data;

			
 
				-		var shaders = batch.shaders;

			
 
				+			var p = batch.params;

			
 
				+			var buf = batch.data;

			
 
				+			var shaders = batch.shaders;

			
 
				 

			
 
				-		var calcInv = false;

			
 
				-		while( p != null ) {

			
 
				-			var bufLoader = new hxd.FloatBufferLoader(buf, startPos + p.pos);

			
 
				-			if( p.perObjectGlobal != null ) {

			
 
				-				if ( p.perObjectGlobal.gid == modelViewID ) {

			
 
				-					bufLoader.loadMatrix(worldPosition != null ? worldPosition : absPos);

			
 
				-				} else if ( p.perObjectGlobal.gid == modelViewInverseID ) {

			
 
				-					if( worldPosition == null )

			
 
				-						bufLoader.loadMatrix(getInvPos());

			
 
				-					else {

			
 
				-						if( !calcInv ) {

			
 
				-							calcInv = true;

			
 
				-							if( invWorldPosition == null ) invWorldPosition = new h3d.Matrix();

			
 
				-							invWorldPosition.initInverse(worldPosition);

			
 
				-						}

			
 
				+			while( p != null ) {

			
 
				+				var bufLoader = new hxd.FloatBufferLoader(buf, startPos + p.pos);

			
 
				+				if( p.perObjectGlobal != null ) {

			
 
				+					if ( p.perObjectGlobal.gid == modelViewID ) {

			
 
				+						bufLoader.loadMatrix(worldPosition);

			
 
				+					} else if ( p.perObjectGlobal.gid == modelViewInverseID ) {

			
 
				+						if ( invWorldPosition == null )

			
 
				+							invWorldPosition = worldPosition == null ? getInvPos() : worldPosition.getInverse();

			
 
				 						bufLoader.loadMatrix(invWorldPosition);

			
 
				+					} else if ( p.perObjectGlobal.gid == previousModelViewID )

			
 
				+						bufLoader.loadMatrix(worldPosition);

			
 
				+					else

			
 
				+						throw "Unsupported global param " + p.perObjectGlobal.path;

			
 
				+					p = p.next;

			
 
				+					continue;

			
 
				+				}

			
 
				+				var curShader = shaders[p.instance];

			
 
				+				switch( p.type ) {

			
 
				+				case TVec(size, _):

			
 
				+					switch( size ) {

			
 
				+					case 2:

			
 
				+						var v : h3d.Vector = curShader.getParamValue(p.index);

			
 
				+						bufLoader.loadVec2(v);

			
 
				+					case 3:

			
 
				+						var v : h3d.Vector = curShader.getParamValue(p.index);

			
 
				+						bufLoader.loadVec3(v);

			
 
				+					case 4:

			
 
				+						var v : h3d.Vector4 = curShader.getParamValue(p.index);

			
 
				+						bufLoader.loadVec4(v);

			
 
				 					}

			
 
				-				} else if ( p.perObjectGlobal.gid == previousModelViewID )

			
 
				-					bufLoader.loadMatrix(worldPosition != null ? worldPosition : absPos );

			
 
				-				else

			
 
				-					throw "Unsupported global param "+p.perObjectGlobal.path;

			
 
				-				p = p.next;

			
 
				-				continue;

			
 
				-			}

			
 
				-			var curShader = shaders[p.instance];

			
 
				-			switch( p.type ) {

			
 
				-			case TVec(size, _):

			
 
				-				switch( size ) {

			
 
				-				case 2:

			
 
				-					var v : h3d.Vector = curShader.getParamValue(p.index);

			
 
				-					bufLoader.loadVec2(v);

			
 
				-				case 3:

			
 
				-					var v : h3d.Vector = curShader.getParamValue(p.index);

			
 
				-					bufLoader.loadVec3(v);

			
 
				-				case 4:

			
 
				-					var v : h3d.Vector4 = curShader.getParamValue(p.index);

			
 
				-					bufLoader.loadVec4(v);

			
 
				+				case TFloat:

			
 
				+					bufLoader.loadFloat(curShader.getParamFloatValue(p.index));

			
 
				+				case TMat4:

			
 
				+					var m : h3d.Matrix = curShader.getParamValue(p.index);

			
 
				+					bufLoader.loadMatrix(m);

			
 
				+				default:

			
 
				+					throw "Unsupported batch type "+p.type;

			
 
				 				}

			
 
				-			case TFloat:

			
 
				-				bufLoader.loadFloat(curShader.getParamFloatValue(p.index));

			
 
				-			case TMat4:

			
 
				-				var m : h3d.Matrix = curShader.getParamValue(p.index);

			
 
				-				bufLoader.loadMatrix(m);

			
 
				-			default:

			
 
				-				throw "Unsupported batch type "+p.type;

			
 
				+				p = p.next;

			
 
				 			}

			
 
				-			p = p.next;

			
 
				+			batch = batch.next;

			
 
				 		}

			
 
				 		needUpload = true;

			
 
				 	}

			
@@ -507,7 +604,7 @@ class MeshBatch extends MultiMaterial {
 
				 	}

			
 
				 

			
 
				 	function emitPass(ctx : RenderContext, p : BatchData) {

			
 
				-		for( i => buf in p.buffers )

			
 
				+		for( i in 0...p.buffers.length )

			
 
				 			ctx.emitPass(p.pass, this).index = i | (p.matIndex << 16);

			
 
				 	}

			
 
				 

			
@@ -522,10 +619,10 @@ class MeshBatch extends MultiMaterial {
 
				 				else

			
 
				 					p.shader.Batch_Buffer = p.buffers[bufferIndex];

			
 
				 

			
 
				-				if( p.instanceBuffers == null )

			
 
				+				if( cpuIndirectCallBuffers == null )

			
 
				 					setPassCommand(p, bufferIndex);

			
 
				 				else

			
 
				-					instanced.commands = p.instanceBuffers[bufferIndex];

			
 
				+					instanced.commands = storageBufferEnabled() ? gpuIndirectCallBuffers[p.matIndex] : p.indirectCallBuffers[bufferIndex];

			
 
				 

			
 
				 				break;

			
 
				 			}

			
@@ -547,20 +644,25 @@ class MeshBatch extends MultiMaterial {
 
				 		var hmd = Std.downcast(prim, h3d.prim.HMDModel);

			
 
				 		if ( hmd == null )

			
 
				 			return false;

			
 
				-		if ( primitiveSubParts == null ) {

			
 
				-			primitiveSubParts = [];

			
 
				+		if ( primitiveSubMeshes == null ) {

			
 
				+			var subMesh = new SubMesh();

			
 
				+			var lodCount = hmd.lodCount();

			
 
				+			subMesh.bounds = hmd.getBounds();

			
 
				+			subMesh.lodCount = lodCount;

			
 
				+			subMesh.lodConfig = hmd.getLodConfig();

			
 
				+			var subParts = [];

			
 
				 			for ( m in 0...materials.length ) {

			
 
				-				var primitiveSubPart = new MeshBatchPart();

			
 
				+				var primitiveSubPart = new SubPart();

			
 
				 				primitiveSubPart.indexStart = hmd.getMaterialIndexStart(m, 0);

			
 
				 				primitiveSubPart.indexCount = hmd.getMaterialIndexCount(m, 0);

			
 
				-				primitiveSubPart.lodIndexCount = [for (i in 0...hmd.lodCount() ) hmd.getMaterialIndexCount(m, i)];

			
 
				-				primitiveSubPart.lodIndexStart = [for (i in 0...hmd.lodCount() ) hmd.getMaterialIndexStart(m, i) ];

			
 
				-				primitiveSubPart.lodConfig = hmd.getLodConfig();

			
 
				-				primitiveSubPart.baseVertex = 0;

			
 
				-				primitiveSubPart.bounds = hmd.getBounds();

			
 
				-

			
 
				-				primitiveSubParts.push(primitiveSubPart);

			
 
				+				primitiveSubPart.lodIndexStart = [for (i in 0...lodCount) hmd.getMaterialIndexStart(m, i)];

			
 
				+				primitiveSubPart.lodIndexCount = [for (i in 0...lodCount) hmd.getMaterialIndexCount(m, i)];

			
 
				+				primitiveSubPart.matIndex = m;

			
 
				+				subParts.push(primitiveSubPart);

			
 
				 			}

			
 
				+			subMesh.subParts = subParts;

			
 
				+			primitiveSubMeshes = [subMesh];

			
 
				+			curSubMesh = 0;

			
 
				 		}

			
 
				 		return true;

			
 
				 	}

			
@@ -606,7 +708,13 @@ class MeshBatch extends MultiMaterial {
 
				 		if( instanced.commands != null )

			
 
				 			instanced.commands.dispose();

			
 
				 

			
 
				-		primitiveSubBytes = null;

			
 
				+		cpuIndirectCallBuffers = null;

			
 
				+		if ( gpuIndirectCallBuffers != null ) {

			
 
				+			for ( gpuIndirectCallBuffer in gpuIndirectCallBuffers )

			
 
				+				gpuIndirectCallBuffer.dispose();

			
 
				+			gpuIndirectCallBuffers = null;

			
 
				+		}

			
 
				+

			
 
				 		shadersChanged = true;

			
 
				 	}

			
 
				 }

			
@@ -618,7 +726,7 @@ class BatchData {
 
				 	public var matIndex : Int;

			
 
				 	public var indexCount : Int;

			
 
				 	public var indexStart : Int;

			
 
				-	public var instanceBuffers : Array<h3d.impl.InstanceBuffer>;

			
 
				+	public var indirectCallBuffers : Array<h3d.impl.InstanceBuffer>;

			
 
				 	public var buffers : Array<h3d.Buffer> = [];

			
 
				 	public var bufferFormat : hxd.BufferFormat;

			
 
				 	public var data : hxd.FloatBuffer;

			
@@ -637,35 +745,31 @@ class BatchData {
 
				 		pass.removeShader(shader);

			
 
				 		for( b in buffers )

			
 
				 			alloc.disposeBuffer(b);

			
 
				+		buffers.resize(0);

			
 
				 

			
 
				-		if( instanceBuffers != null ) {

			
 
				-			for( b in instanceBuffers )

			
 
				+		if( indirectCallBuffers != null ) {

			
 
				+			for( b in indirectCallBuffers )

			
 
				 				b.dispose();

			
 
				 		}

			
 
				 		alloc.disposeFloats(data);

			
 
				 	}

			
 
				 }

			
 
				 

			
 
				-class MeshBatchPart {

			
 
				+class SubMesh {

			
 
				+	public var subParts : Array<SubPart>;

			
 
				+	public var bounds : h3d.col.Bounds;

			
 
				+	public var lodCount : Int;

			
 
				+	public var lodConfig : Array<Float>;

			
 
				+	public function new() {

			
 
				+	}

			
 
				+}

			
 
				+

			
 
				+class SubPart {

			
 
				 	public var indexStart : Int;

			
 
				 	public var indexCount : Int;

			
 
				 	public var lodIndexStart : Array<Int>;

			
 
				 	public var lodIndexCount : Array<Int>;

			
 
				-	public var lodConfig : Array<Float>;

			
 
				-	public var baseVertex : Int;

			
 
				-	public var bounds : h3d.col.Bounds;

			
 
				+	public var matIndex : Int = 0;

			
 
				 	public function new() {

			
 
				 	}

			
 
				-

			
 
				-	public function clone() {

			
 
				-		var cl = new MeshBatchPart();

			
 
				-		cl.indexStart = indexStart;

			
 
				-		cl.indexCount = indexCount;

			
 
				-		cl.lodIndexStart = lodIndexStart;

			
 
				-		cl.lodIndexCount = lodIndexCount;

			
 
				-		cl.lodConfig = lodConfig;

			
 
				-		cl.baseVertex = baseVertex;

			
 
				-		cl.bounds = bounds;

			
 
				-		return cl;

			
 
				-	}

			
 
				 }
			
--- a/h3d/shader/InstanceIndirect.hx
+++ b/h3d/shader/InstanceIndirect.hx
@@ -2,7 +2,6 @@ package h3d.shader;
 
				 
			
 
				 class InstanceIndirectBase extends hxsl.Shader {
			
 
				 	static var SRC = {
			
 
				-
			
 
				 		@global var camera : {
			
 
				 			var position : Vec3;
			
 
				 		}
			
@@ -13,10 +12,8 @@ class InstanceIndirectBase extends hxsl.Shader {
 
				 		@param var instanceData : StoragePartialBuffer<{ modelView : Mat4 }>;
			
 
				 		@param var instanceCount : Int;
			
 
				 
			
 
				-		// 16 by default because 16 * 4 floats = 256 bytes and cbuffer are aligned to 256 bytes
			
 
				-		@const var MAX_MATERIAL_COUNT : Int = 16;
			
 
				-		// x : indexCount, y : startIndex, z : minScreenRatio, w : in first lod => minScreenRatioCulling
			
 
				-		@param var matInfos : Buffer<Vec4, MAX_MATERIAL_COUNT>;
			
 
				+		// x : indexCount, y : indexStart, z : minScreenRatio, w : materialIndex
			
 
				+		@param var subPartsInfos : StorageBuffer<Vec4>;
			
 
				 
			
 
				 		@const var ENABLE_CULLING : Bool;
			
 
				 		@param var frustum : Buffer<Vec4, 6>;
			
@@ -26,8 +23,6 @@ class InstanceIndirectBase extends hxsl.Shader {
 
				 		@const var ENABLE_DISTANCE_CLIPPING : Bool;
			
 
				 		@param var maxDistance : Float = -1;
			
 
				 
			
 
				-		var matID : Int = 0;
			
 
				-
			
 
				 		var modelView : Mat4;
			
 
				 		var invocID : Int;
			
 
				 		function __init__() {
			
@@ -38,6 +33,24 @@ class InstanceIndirectBase extends hxsl.Shader {
 
				 			modelView = instanceData[invocID].modelView;
			
 
				 		}
			
 
				 
			
 
				+		function init() {}
			
 
				+
			
 
				+		function getRadius() : Float {
			
 
				+			return 0.0;
			
 
				+		}
			
 
				+
			
 
				+		function getLodCount() : Int {
			
 
				+			return 1;
			
 
				+		}
			
 
				+
			
 
				+		function getSubPartsStart() : Int {
			
 
				+			return 0;
			
 
				+		}
			
 
				+
			
 
				+		function getSubPartsCount() : Int {
			
 
				+			return 1;
			
 
				+		}
			
 
				+
			
 
				 		function emitInstance(instanceID : Int, indexCount : Int, instanceCount : Int, startIndex : Int, startVertex : Int, baseInstance : Int ) {
			
 
				 			var instancePos = instanceID * 5;
			
 
				 			commandBuffer[instancePos + 0] = indexCount;
			
@@ -67,16 +80,21 @@ class InstanceIndirectBase extends hxsl.Shader {
 
				 			return screenRatio < minScreenRatioCulling;
			
 
				 		}
			
 
				 
			
 
				-		function getLodCount() : Int {
			
 
				-			return 0;
			
 
				+		function getSubPartInfos( subPartIndex : Int, lod : Int ) : Vec4 {
			
 
				+			var pos = getSubPartsStart() + subPartIndex * getLodCount() + lod;
			
 
				+			return subPartsInfos[pos];
			
 
				+		}
			
 
				+
			
 
				+		function getMaterialCommandStart( materialIndex : Int ) : Int {
			
 
				+			return materialIndex * instanceCount;
			
 
				 		}
			
 
				 
			
 
				-		function getLodScreenRatio( lod : Int ) : Float {
			
 
				-			return matInfos[lod + matID].z;
			
 
				+		function getLodScreenRatio(lod : Int) : Float {
			
 
				+			return getSubPartInfos(0, lod).z;
			
 
				 		}
			
 
				 
			
 
				 		function getMinScreenRatio() : Float {
			
 
				-			return ENABLE_LOD ? matInfos[matID].w : 0.0;
			
 
				+			return ENABLE_LOD ? getSubPartInfos(0, getLodCount() - 1).z : 0.0;
			
 
				 		}
			
 
				 
			
 
				 		function computeScreenRatio( distToCam : Float, radius : Float ) : Float {
			
@@ -84,9 +102,10 @@ class InstanceIndirectBase extends hxsl.Shader {
 
				 			return screenRatio * screenRatio;
			
 
				 		}
			
 
				 
			
 
				-		function selectLod( screenRatio : Float, lodCount : Int ) : Int {
			
 
				+		function selectLod( screenRatio : Float ) : Int {
			
 
				 			var lod : Int = 0;
			
 
				 			if ( ENABLE_LOD ) {
			
 
				+				var lodCount = getLodCount();
			
 
				 				for ( i in 0...lodCount ) {
			
 
				 					var minScreenRatio = getLodScreenRatio(i);
			
 
				 					if ( screenRatio > minScreenRatio )
			
@@ -97,63 +116,48 @@ class InstanceIndirectBase extends hxsl.Shader {
 
				 			}
			
 
				 			return lod;
			
 
				 		}
			
 
				-	}
			
 
				-}
			
 
				-
			
 
				-class SubPartInstanceIndirect extends InstanceIndirectBase {
			
 
				-
			
 
				-	static var SRC = {
			
 
				-		// n : material offset, n + 1 : subPart ID
			
 
				-		@param var instanceOffsets: StorageBuffer<Int>;
			
 
				-		@const var MAX_SUB_PART_BUFFER_ELEMENT_COUNT : Int = 16;
			
 
				-		@param var subPartCount : Int;
			
 
				-		// x : lodCount, y : radius,
			
 
				-		@param var subPartInfos : Buffer<Vec4, MAX_SUB_PART_BUFFER_ELEMENT_COUNT>;
			
 
				-
			
 
				-		var lodCount = 0;
			
 
				-		function getLodCount() : Int {
			
 
				-			return lodCount;
			
 
				-		}
			
 
				 
			
 
				 		function main() {
			
 
				 			if ( invocID < instanceCount ) {
			
 
				+				init();
			
 
				+
			
 
				 				var pos = vec3(modelView[0].w, modelView[1].w, modelView[2].w);
			
 
				 				var vScale = abs(vec3(1) * modelView.mat3x4() - pos);
			
 
				 				var scaledRadius = max(max(vScale.x, vScale.y), vScale.z);
			
 
				 				var toCam = camera.position - pos.xyz;
			
 
				 				var distToCam = length(toCam);
			
 
				 
			
 
				-				var id = invocID * 2;
			
 
				-				matID = instanceOffsets[id];
			
 
				-				var subPartID = instanceOffsets[id + 1];
			
 
				-				var subPartInfo = subPartInfos[subPartID / 2];
			
 
				-
			
 
				-				var packedID = (subPartID & 1) << 1;
			
 
				-				lodCount = int(subPartInfo[packedID]);
			
 
				-				var radius = subPartInfo[packedID + 1];
			
 
				-
			
 
				-				scaledRadius *= radius;
			
 
				+				scaledRadius *= getRadius();
			
 
				 				var culled = dot(scaledRadius, scaledRadius) < 1e-6;
			
 
				 
			
 
				 				culled = culled || frustumCulling(pos, scaledRadius);
			
 
				 				culled = culled || distanceClipping(distToCam, scaledRadius);
			
 
				-				var computeScreenRatio = computeScreenRatio(distToCam, scaledRadius);
			
 
				-				culled = culled || screenRatioCulling(computeScreenRatio);
			
 
				+				var screenRatio = computeScreenRatio(distToCam, scaledRadius);
			
 
				+				culled = culled || screenRatioCulling(screenRatio);
			
 
				 
			
 
				+				var subPartsCount = getSubPartsCount();
			
 
				 				if ( ENABLE_COUNT_BUFFER ) {
			
 
				 					if ( !culled ) {
			
 
				-						var id = atomicAdd( countBuffer, 0, 1);
			
 
				-						var lod = selectLod(computeScreenRatio, lodCount);
			
 
				-						var matInfo = ivec4(matInfos[lod + matID]);
			
 
				-						emitInstance( id, matInfo.x, 1, matInfo.y, 0, invocID );
			
 
				+						var lod = selectLod(screenRatio);
			
 
				+						for ( subPartIndex in 0...subPartsCount ) {
			
 
				+							var subPartInfo = getSubPartInfos(subPartIndex, lod);
			
 
				+							var materialIndex = int(subPartInfo.w);
			
 
				+							var id = atomicAdd( countBuffer, materialIndex, 1 );
			
 
				+							var materialCommandStart = getMaterialCommandStart(materialIndex);
			
 
				+							emitInstance( materialCommandStart + id, int(subPartInfo.x), 1, int(subPartInfo.y), 0, invocID );
			
 
				+						}
			
 
				 					}
			
 
				 				} else {
			
 
				-					if ( !culled ) {
			
 
				-						var lod = selectLod(computeScreenRatio, lodCount);
			
 
				-						var matInfo = ivec4(matInfos[lod + matID]);
			
 
				-						emitInstance( invocID, matInfo.x, 1, matInfo.y, 0, invocID );
			
 
				-					} else {
			
 
				-						emitInstance( invocID, 0, 0, 0, 0, 0 );
			
 
				+					var lod = selectLod(screenRatio);
			
 
				+					for ( subPartIndex in 0...subPartsCount ) {
			
 
				+						var subPartInfo = getSubPartInfos(subPartIndex, lod);
			
 
				+						var materialIndex = int(subPartInfo.w);
			
 
				+						var id = atomicAdd( countBuffer, materialIndex, 1 );
			
 
				+						var materialCommandStart = getMaterialCommandStart(materialIndex);
			
 
				+						if ( !culled )
			
 
				+							emitInstance( materialCommandStart + id, int(subPartInfo.x), 1, int(subPartInfo.y), 0, invocID );
			
 
				+						else
			
 
				+							emitInstance( materialCommandStart + id, 0, 0, 0, 0, 0 );
			
 
				 					}
			
 
				 				}
			
 
				 			}
			
@@ -161,60 +165,83 @@ class SubPartInstanceIndirect extends InstanceIndirectBase {
 
				 	}
			
 
				 }
			
 
				 
			
 
				+class SubPartInstanceIndirect extends InstanceIndirectBase {
			
 
				+	static var SRC = {
			
 
				+		// n : subMesh index
			
 
				+		@param var instancesInfos: StorageBuffer<Int>;
			
 
				+		// x : radius, y : lodCount, z : subPartsStart, w : subPartsCount
			
 
				+		@param var subMeshesInfos : StorageBuffer<Vec4>;
			
 
				+
			
 
				+		@const(32) var MATERIAL_COUNT : Int = 1;
			
 
				+		@param var materialCommandStart : Array<Vec4, MATERIAL_COUNT>;
			
 
				+
			
 
				+		var radius : Float;
			
 
				+		var lodCount : Int;
			
 
				+		var subPartsStart : Int;
			
 
				+		var subPartsCount : Int;
			
 
				+
			
 
				+		function getRadius() : Float {
			
 
				+			return radius;
			
 
				+		}
			
 
				+
			
 
				+		function getLodCount() : Int {
			
 
				+			return lodCount;
			
 
				+		}
			
 
				+
			
 
				+		function getSubPartsStart()  : Int{
			
 
				+			return subPartsStart;
			
 
				+		}
			
 
				+
			
 
				+		function getSubPartsCount() : Int {
			
 
				+			return subPartsCount;
			
 
				+		}
			
 
				+
			
 
				+		function getMaterialCommandStart( materialIndex : Int ) : Int {
			
 
				+			return int(materialCommandStart[materialIndex].x);
			
 
				+		}
			
 
				+
			
 
				+		function init() {
			
 
				+			var instanceID = invocID;
			
 
				+			var subMeshIndex = instancesInfos[instanceID];
			
 
				+			var subMeshInfos = subMeshesInfos[subMeshIndex];
			
 
				+			radius = subMeshInfos.x;
			
 
				+			lodCount = int(subMeshInfos.y);
			
 
				+			subPartsStart = int(subMeshInfos.z);
			
 
				+			subPartsCount = int(subMeshInfos.w);
			
 
				+		}
			
 
				+	}
			
 
				+}
			
 
				+
			
 
				 class InstanceIndirect extends InstanceIndirectBase {
			
 
				 	static var SRC = {
			
 
				 		@param var radius : Float;
			
 
				-		@param var materialCount : Int;
			
 
				 		@param var lodCount : Int = 1;
			
 
				+		@param var subPartsCount : Int;
			
 
				 
			
 
				-		function getLodCount() : Int {
			
 
				-			return lodCount;
			
 
				+		var fetchedRadius : Float;
			
 
				+		var fetchedLodCount : Int;
			
 
				+		var fetchedSubPartsCount : Int;
			
 
				+
			
 
				+		function init() {
			
 
				+			fetchedRadius = radius;
			
 
				+			fetchedLodCount = lodCount;
			
 
				+			fetchedSubPartsCount = subPartsCount;
			
 
				 		}
			
 
				 
			
 
				-		function main() {
			
 
				-			if ( invocID < instanceCount ) {
			
 
				-				var pos = vec3(modelView[0].w, modelView[1].w, modelView[2].w);
			
 
				-				var vScale = abs(vec3(1) * modelView.mat3x4() - pos);
			
 
				-				var scaledRadius = max(max(vScale.x, vScale.y), vScale.z);
			
 
				-				var toCam = camera.position - pos.xyz;
			
 
				-				var distToCam = length(toCam);
			
 
				+		function getRadius() : Float {
			
 
				+			return fetchedRadius;
			
 
				+		}
			
 
				 
			
 
				-				scaledRadius *= radius;
			
 
				-				var culled = dot(scaledRadius, scaledRadius) < 1e-6;
			
 
				+		function getLodCount() : Int {
			
 
				+			return fetchedLodCount;
			
 
				+		}
			
 
				 
			
 
				-				culled = culled || frustumCulling(pos, scaledRadius);
			
 
				-				culled = culled || distanceClipping(distToCam, scaledRadius);
			
 
				-				var computeScreenRatio = computeScreenRatio(distToCam, scaledRadius);
			
 
				-				culled = culled || screenRatioCulling(computeScreenRatio);
			
 
				+		function getSubPartsStart() : Int {
			
 
				+			return 0;
			
 
				+		}
			
 
				 
			
 
				-				if ( ENABLE_COUNT_BUFFER ) {
			
 
				-					if ( !culled ) {
			
 
				-						var id = atomicAdd( countBuffer, 0, 1);
			
 
				-						for ( i in 0...materialCount ) {
			
 
				-							matID = i * lodCount;
			
 
				-							var lod = selectLod(computeScreenRatio, lodCount);
			
 
				-							var matInfo = ivec4(matInfos[lod + matID]);
			
 
				-							var instanceID = id + i * instanceCount;
			
 
				-							emitInstance( instanceID, matInfo.x, 1, matInfo.y, 0, invocID );
			
 
				-						}
			
 
				-					}
			
 
				-				} else {
			
 
				-					if ( !culled ) {
			
 
				-						for ( i in 0...materialCount ) {
			
 
				-							matID = i * lodCount;
			
 
				-							var lod = selectLod(computeScreenRatio, lodCount);
			
 
				-							var matInfo = ivec4(matInfos[lod + matID]);
			
 
				-							var instanceID = invocID + i * instanceCount;
			
 
				-							emitInstance( instanceID, matInfo.x, 1, matInfo.y, 0, invocID );
			
 
				-						}
			
 
				-					} else {
			
 
				-						for ( i in 0...materialCount ) {
			
 
				-							var instanceID = invocID + i * instanceCount;
			
 
				-							emitInstance( instanceID, 0, 0, 0, 0, 0 );
			
 
				-						}
			
 
				-					}
			
 
				-				}
			
 
				-			}
			
 
				+		function getSubPartsCount() : Int {
			
 
				+			return fetchedSubPartsCount;
			
 
				 		}
			
 
				 	}
			
 
				 }