Przeglądaj źródła

GPUMeshBatch : Do one dispatch for all data passes

TothBenoit 4 miesięcy temu
rodzic
commit
92f1a6069b
2 zmienionych plików z 213 dodań i 166 usunięć
  1. 61 96
      h3d/scene/GPUMeshBatch.hx
  2. 152 70
      h3d/shader/InstanceIndirect.hx

+ 61 - 96
h3d/scene/GPUMeshBatch.hx

@@ -18,6 +18,10 @@ class GPUMeshBatch extends MeshBatch {
 	var countBytes : haxe.io.Bytes;
 	var materialCount : Int;
 
+	public var computePass : h3d.mat.Pass;
+	public var commandBuffer : h3d.Buffer;
+	public var countBuffer : h3d.Buffer;
+
 	var gpuLodEnabled : Bool;
 	var gpuCullingEnabled : Bool;
 
@@ -69,10 +73,6 @@ class GPUMeshBatch extends MeshBatch {
 		return emitCountTip;
 	}
 
-	override function createBatchData() {
-		return new GPUBatchData();
-	}
-
 	override function emitPrimitiveSubParts() {
 		if ( primitiveSubParts.length > 1 )
 			throw "Multi material with gpu instancing is not supported";
@@ -199,63 +199,27 @@ class GPUMeshBatch extends MeshBatch {
 
 		super.flush();
 
-		materialCount = 0;
-	}
-
-	override function onFlushBuffer(p : BatchData, index : Int, count : Int) {
-		var p = cast(p, GPUBatchData);
-		var alloc = hxd.impl.Allocator.get();
-
-		var commandCountAllocated = hxd.Math.imin( hxd.Math.nextPOT( count ), p.maxInstance );
-		if ( p.commandBuffers == null) {
-			p.commandBuffers = [];
-			p.countBuffers = [];
-		}
-		var buf = p.commandBuffers[index];
-		var cbuf = p.countBuffers[index];
-		if ( buf == null ) {
-			buf = alloc.allocBuffer( commandCountAllocated, INDIRECT_DRAW_ARGUMENTS_FMT, UniformReadWrite );
-			cbuf = alloc.allocBuffer( 1, hxd.BufferFormat.VEC4_DATA, UniformReadWrite );
-			p.commandBuffers[index] = buf;
-			p.countBuffers[index] = cbuf;
-		}
-		else if ( buf.vertices < commandCountAllocated ) {
-			alloc.disposeBuffer( buf );
-			buf = alloc.allocBuffer( commandCountAllocated, INDIRECT_DRAW_ARGUMENTS_FMT, UniformReadWrite );
-			p.commandBuffers[index] = buf;
-		}
-	}
-
-	override function onFlushPass(p : BatchData) {
-		var p = cast(p, GPUBatchData);
-		var prim = getPrimitive();
-		var lodCount = getLodCount();
-
-		var computeShader;
-		if( p.computePass == null ) {
-			computeShader = new h3d.shader.InstanceIndirect();
-			var computePass = new h3d.mat.Pass("batchUpdate");
+		var computeShader : h3d.shader.InstanceIndirect.InstanceIndirectBase;
+		if( computePass == null ) {
+			computeShader = emittedSubParts != null ? new h3d.shader.InstanceIndirect.SubPartInstanceIndirect() : new h3d.shader.InstanceIndirect();
+			computePass = new h3d.mat.Pass("batchUpdate");
 			computePass.addShader(computeShader);
 			addComputeShaders(computePass);
-			p.computePass = computePass;
 		} else {
-			computeShader = p.computePass.getShader(h3d.shader.InstanceIndirect);
+			computeShader = computePass.getShader(h3d.shader.InstanceIndirect.InstanceIndirectBase);
 		}
 
 		computeShader.ENABLE_LOD = gpuLodEnabled;
 		computeShader.ENABLE_CULLING = gpuCullingEnabled;
 		computeShader.ENABLE_DISTANCE_CLIPPING = maxDistance >= 0;
-		computeShader.radius = prim.getBounds().dimension() * 0.5;
 		computeShader.maxDistance = maxDistance;
-		computeShader.matInfos = matInfos;
-		computeShader.lodCount = lodCount;
-		computeShader.materialCount = materialCount;
 		computeShader.MAX_MATERIAL_COUNT = 16;
 		while ( materialCount * lodCount > computeShader.MAX_MATERIAL_COUNT )
 			computeShader.MAX_MATERIAL_COUNT = computeShader.MAX_MATERIAL_COUNT + 16;
+		computeShader.matInfos = matInfos;
 
 		if ( emittedSubParts != null ) {
-			computeShader.USING_SUB_PART = true;
+			var computeShader : h3d.shader.InstanceIndirect.SubPartInstanceIndirect = cast computeShader;
 			computeShader.subPartCount = emittedSubParts.length;
 			computeShader.subPartInfos = subPartsInfos;
 			computeShader.instanceOffsets = instanceOffsetsGpu;
@@ -263,45 +227,61 @@ class GPUMeshBatch extends MeshBatch {
 			var maxSubPartsElement = hxd.Math.ceil( emittedSubParts.length / 2 );
 			while ( maxSubPartsElement > computeShader.MAX_SUB_PART_BUFFER_ELEMENT_COUNT )
 				computeShader.MAX_SUB_PART_BUFFER_ELEMENT_COUNT = computeShader.MAX_SUB_PART_BUFFER_ELEMENT_COUNT + 16;
+		} else {
+			var computeShader : h3d.shader.InstanceIndirect = cast computeShader;
+			computeShader.instanceCount = instanceCount;
+			computeShader.radius = prim.getBounds().dimension() * 0.5;
+			computeShader.lodCount = lodCount;
+			computeShader.materialCount = materialCount;
+			computeShader.instanceCount = instanceCount;
 		}
+
+		var alloc = hxd.impl.Allocator.get();
+		var commandCountAllocated = hxd.Math.nextPOT( instanceCount * materialCount );
+		if ( commandBuffer == null ) {
+			commandBuffer = alloc.allocBuffer( commandCountAllocated, INDIRECT_DRAW_ARGUMENTS_FMT, UniformReadWrite );
+			countBuffer = alloc.allocBuffer( 1, hxd.BufferFormat.VEC4_DATA, UniformReadWrite );
+			if ( countBytes == null ) {
+				countBytes = haxe.io.Bytes.alloc(4*4);
+				countBytes.setInt32(0, 0);
+			}
+		} else if ( commandBuffer.vertices < commandCountAllocated ) {
+			alloc.disposeBuffer( commandBuffer );
+			commandBuffer = alloc.allocBuffer( commandCountAllocated, INDIRECT_DRAW_ARGUMENTS_FMT, UniformReadWrite );
+		}
+
+		materialCount = 0;
 	}
 
 	function addComputeShaders( pass : h3d.mat.Pass ) {}
 
-	override function emitPass(ctx : RenderContext, p : BatchData) {
-		var p = cast(p, GPUBatchData);
-		var emittedCount = 0;
-		for( i => buf in p.buffers ) {
-			ctx.emitPass(p.pass, this).index = i | (p.matIndex << 16);
-			if ( p.commandBuffers != null && p.commandBuffers.length > 0 ) {
-				var count = hxd.Math.imin( instanceCount - p.maxInstance * i, p.maxInstance);
-				var computeShader = p.computePass.getShader(h3d.shader.InstanceIndirect);
-				if ( gpuCullingEnabled )
-					computeShader.frustum = ctx.getCameraFrustumBuffer();
-				computeShader.instanceData = buf;
-				computeShader.matIndex = p.matIndex;
-				computeShader.commandBuffer = p.commandBuffers[i];
-				if ( countBytes == null ) {
-					countBytes = haxe.io.Bytes.alloc(4*4);
-					countBytes.setInt32(0, 0);
-				}
-				p.countBuffers[i].uploadBytes(countBytes, 0, 1);
-				computeShader.countBuffer = p.countBuffers[i];
-				computeShader.startInstanceOffset = emittedCount;
-				computeShader.ENABLE_COUNT_BUFFER = isCountBufferAllowed();
-				ctx.computeList(@:privateAccess p.computePass.shaders);
-				ctx.computeDispatch(count);
-				emittedCount += count;
-			}
+	override function emit(ctx:RenderContext) {
+		super.emit(ctx);
+		if ( commandBuffer != null && instanceCount > 0) {
+			var computeShader = computePass.getShader(h3d.shader.InstanceIndirect.InstanceIndirectBase);
+			if ( gpuCullingEnabled )
+				computeShader.frustum = ctx.getCameraFrustumBuffer();
+			computeShader.instanceData = dataPasses.buffers[0];
+			computeShader.commandBuffer = commandBuffer;
+			countBuffer.uploadBytes(countBytes, 0, 1);
+			computeShader.countBuffer = countBuffer;
+			computeShader.ENABLE_COUNT_BUFFER = isCountBufferAllowed();
+			ctx.computeList(@:privateAccess computePass.shaders);
+			ctx.computeDispatch(instanceCount);
 		}
 	}
 
+	override function emitPass(ctx : RenderContext, p : BatchData) {
+		ctx.emitPass(p.pass, this).index = p.matIndex << 16;
+	}
+
 	override function setPassCommand(p : BatchData, bufferIndex : Int) {
 		super.setPassCommand(p, bufferIndex);
-		var p = cast(p, GPUBatchData);
-		if ( p.commandBuffers != null && p.commandBuffers.length > 0 ) {
-			@:privateAccess instanced.commands.data = p.commandBuffers[bufferIndex].vbuf;
-			@:privateAccess instanced.commands.countBuffer = p.countBuffers[bufferIndex].vbuf;
+		if ( commandBuffer != null ) {
+			@:privateAccess instanced.commands.data = commandBuffer.vbuf;
+			@:privateAccess instanced.commands.countBuffer = countBuffer.vbuf;
+			@:privateAccess instanced.commands.offset = p.matIndex * instanceCount;
+			@:privateAccess instanced.commands.countOffset = 0;
 		}
 	}
 
@@ -332,27 +312,12 @@ class GPUMeshBatch extends MeshBatch {
 			alloc.disposeBuffer(instanceOffsetsGpu);
 		instanceOffsetsCpu = null;
 
+		if ( commandBuffer != null )
+			alloc.disposeBuffer(commandBuffer);
+		if ( countBuffer != null )
+			alloc.disposeBuffer(countBuffer);
+
 		emittedSubParts = null;
 		countBytes = null;
 	}
-}
-
-class GPUBatchData extends BatchData {
-	public var computePass : h3d.mat.Pass;
-	public var commandBuffers : Array<h3d.Buffer>;
-	public var countBuffers : Array<h3d.Buffer>;
-
-	override function clean() {
-		super.clean();
-
-		var alloc = hxd.impl.Allocator.get();
-		if ( commandBuffers != null && commandBuffers.length > 0 ) {
-			for ( buf in commandBuffers )
-				alloc.disposeBuffer(buf);
-			commandBuffers.resize(0);
-			for ( buf in countBuffers )
-				alloc.disposeBuffer(buf);
-			countBuffers.resize(0);
-		}
-	}
 }

+ 152 - 70
h3d/shader/InstanceIndirect.hx

@@ -1,31 +1,19 @@
 package h3d.shader;
 
-class InstanceIndirect extends hxsl.Shader {
+class InstanceIndirectBase extends hxsl.Shader {
 	static var SRC = {
 
 		@global var camera : {
 			var position : Vec3;
 		}
 
-		// n : material offset, n + 1 : subPart ID
 		@const var ENABLE_COUNT_BUFFER : Bool;
 		@param var countBuffer : RWBuffer<Int>;
-		@param var instanceOffsets: StorageBuffer<Int>;
 		@param var commandBuffer : RWBuffer<Int>;
 		@param var instanceData : StoragePartialBuffer<{ modelView : Mat4 }>;
-		@param var radius : Float;
-
-		@const var USING_SUB_PART : Bool = false;
-		@const var MAX_SUB_PART_BUFFER_ELEMENT_COUNT : Int = 16;
-		@param var subPartCount : Int;
-		@param var startInstanceOffset : Int;
-		// x : lodCount, y : radius,
-		@param var subPartInfos : Buffer<Vec4, MAX_SUB_PART_BUFFER_ELEMENT_COUNT>;
 
 		// 16 by default because 16 * 4 floats = 256 bytes and cbuffer are aligned to 256 bytes
 		@const var MAX_MATERIAL_COUNT : Int = 16;
-		@param var materialCount : Int;
-		@param var matIndex : Int;
 		// x : indexCount, y : startIndex, z : minScreenRatio, w : in first lod => minScreenRatioCulling
 		@param var matInfos : Buffer<Vec4, MAX_MATERIAL_COUNT>;
 
@@ -33,97 +21,191 @@ class InstanceIndirect extends hxsl.Shader {
 		@param var frustum : Buffer<Vec4, 6>;
 
 		@const var ENABLE_LOD : Bool;
-		@param var lodCount : Int = 1;
 
 		@const var ENABLE_DISTANCE_CLIPPING : Bool;
 		@param var maxDistance : Float = -1;
 
+		var matID : Int = 0;
+
 		var modelView : Mat4;
 		function __init__() {
 			modelView = instanceData[computeVar.globalInvocation.x].modelView;
 		}
 
+		function emitInstance(instanceID : Int, indexCount : Int, instanceCount : Int, startIndex : Int, startVertex : Int, baseInstance : Int ) {
+			var instancePos = instanceID * 5;
+			commandBuffer[instancePos + 0] = indexCount;
+			commandBuffer[instancePos + 1] = instanceCount;
+			commandBuffer[instancePos + 2] = startIndex;
+			commandBuffer[instancePos + 3] = startVertex;
+			commandBuffer[instancePos + 4] = baseInstance;
+		}
+
+		function frustumCulling( pos : Vec3, radius : Float ) : Bool {
+			var culled = false;
+			if ( ENABLE_CULLING ) {
+				@unroll for ( i  in 0...6 ) {
+					var plane = frustum[i];
+					culled = culled || plane.x * pos.x + plane.y * pos.y + plane.z * pos.z - plane.w < -radius;
+				}
+			}
+			return culled;
+		}
+
+		function distanceClipping( distToCam : Float, radius : Float ) : Bool {
+			return ( ENABLE_DISTANCE_CLIPPING ) ? distToCam > maxDistance + radius : false;
+		}
+
+		function screenRatioCulling( screenRatio : Float ) : Bool {
+			var minScreenRatioCulling = getMinScreenRatio();
+			return screenRatio < minScreenRatioCulling;
+		}
+
+		function getLodCount() : Int {
+			return 0;
+		}
+
+		function getLodScreenRatio( lod : Int ) : Float {
+			return matInfos[lod + matID].z;
+		}
+
+		function getMinScreenRatio() : Float {
+			return ENABLE_LOD ? matInfos[0].w : 0.0;
+		}
+
+		function computeScreenRatio( distToCam : Float, radius : Float ) : Float {
+			var screenRatio = radius / distToCam;
+			return screenRatio * screenRatio;
+		}
+
+		function selectLod( screenRatio : Float, lodCount : Int ) : Int {
+			var lod : Int = 0;
+			if ( ENABLE_LOD ) {
+				for ( i in 0...lodCount ) {
+					var minScreenRatio = getLodScreenRatio(i);
+					if ( screenRatio > minScreenRatio )
+						break;
+					lod++;
+				}
+				lod = clamp(lod, 0, int(lodCount) - 1);
+			}
+			return lod;
+		}
+	}
+}
+
+class SubPartInstanceIndirect extends InstanceIndirectBase {
+
+	static var SRC = {
+		// n : material offset, n + 1 : subPart ID
+		@param var instanceOffsets: StorageBuffer<Int>;
+		@const var MAX_SUB_PART_BUFFER_ELEMENT_COUNT : Int = 16;
+		@param var subPartCount : Int;
+		// x : lodCount, y : radius,
+		@param var subPartInfos : Buffer<Vec4, MAX_SUB_PART_BUFFER_ELEMENT_COUNT>;
+
+		var lodCount = 0;
+		function getLodCount() : Int {
+			return lodCount;
+		}
+
 		function main() {
 			var invocID = computeVar.globalInvocation.x;
-			var lod : Int = 0;
 			var pos = vec3(0) * modelView.mat3x4();
 			var vScale = abs(vec3(1) * modelView.mat3x4() - pos);
 			var scaledRadius = max(max(vScale.x, vScale.y), vScale.z);
 			var toCam = camera.position - pos.xyz;
 			var distToCam = length(toCam);
 
-			var radius = radius;
-			var matOffset = matIndex * lodCount;
-			var lodCount = lodCount;
-
-			if ( USING_SUB_PART ) {
-				var id = (invocID + startInstanceOffset) * 2;
-				matOffset = instanceOffsets[id];
-				var subPartID = instanceOffsets[id + 1];
-				var subPartInfo = subPartInfos[subPartID / 2];
+			var id = invocID * 2;
+			matID = instanceOffsets[id];
+			var subPartID = instanceOffsets[id + 1];
+			var subPartInfo = subPartInfos[subPartID / 2];
 
-				var packedID = (subPartID & 1) << 1;
-				lodCount = int(subPartInfo[packedID]);
-				radius = subPartInfo[packedID + 1];
-			}
+			var packedID = (subPartID & 1) << 1;
+			lodCount = int(subPartInfo[packedID]);
+			var radius = subPartInfo[packedID + 1];
 
 			scaledRadius *= radius;
 			var culled = dot(scaledRadius, scaledRadius) < 1e-6;
 
-			if ( ENABLE_CULLING ) {
-				@unroll for ( i  in 0...6 ) {
-					var plane = frustum[i];
-					culled = culled || plane.x * pos.x + plane.y * pos.y + plane.z * pos.z - plane.w < -scaledRadius;
+			culled = culled || frustumCulling(pos, scaledRadius);
+			culled = culled || distanceClipping(distToCam, scaledRadius);
+			var computeScreenRatio = computeScreenRatio(distToCam, scaledRadius);
+			culled = culled || screenRatioCulling(computeScreenRatio);
+
+			if ( ENABLE_COUNT_BUFFER ) {
+				if ( !culled ) {
+					var id = atomicAdd( countBuffer, 0, 1);
+					var lod = selectLod(computeScreenRatio, lodCount);
+					var matInfo = ivec4(matInfos[lod + matID]);
+					emitInstance( id, matInfo.x, 1, matInfo.y, 0, invocID );
+				}
+			} else {
+				if ( !culled ) {
+					var lod = selectLod(computeScreenRatio, lodCount);
+					var matInfo = ivec4(matInfos[lod + matID]);
+					emitInstance( invocID, matInfo.x, 1, matInfo.y, 0, invocID );
+				} else {
+					emitInstance( invocID, 0, 0, 0, 0, 0 );
 				}
 			}
+		}
+	}
+}
 
-			if ( ENABLE_DISTANCE_CLIPPING ) {
-				culled = culled || distToCam > maxDistance + scaledRadius;
-			}
+class InstanceIndirect extends InstanceIndirectBase {
+	static var SRC = {
+		@param var radius : Float;
+		@param var instanceCount : Int;
+		@param var materialCount : Int;
+		@param var lodCount : Int = 1;
 
-			if ( ENABLE_LOD ) {
-				var screenRatio = scaledRadius / distToCam;
-				screenRatio = screenRatio * screenRatio;
-				var minScreenRatioCulling = matInfos[matOffset].w;
-				var culledByScreenRatio = screenRatio < minScreenRatioCulling;
-				culled = culled || culledByScreenRatio;
-				var lodStart = culledByScreenRatio ? lodCount : 0;
-				for ( i in lodStart...lodCount ) {
-					var minScreenRatio = matInfos[i + matOffset].z;
-					if ( screenRatio > minScreenRatio )
-						break;
-					lod++;
-				}
-				lod = clamp(lod, 0, int(lodCount) - 1);
-			}
+		function getLodCount() : Int {
+			return lodCount;
+		}
+
+		function main() {
+			var invocID = computeVar.globalInvocation.x;
+			var pos = vec3(0) * modelView.mat3x4();
+			var vScale = abs(vec3(1) * modelView.mat3x4() - pos);
+			var scaledRadius = max(max(vScale.x, vScale.y), vScale.z);
+			var toCam = camera.position - pos.xyz;
+			var distToCam = length(toCam);
+
+			scaledRadius *= radius;
+			var culled = dot(scaledRadius, scaledRadius) < 1e-6;
+
+			culled = culled || frustumCulling(pos, scaledRadius);
+			culled = culled || distanceClipping(distToCam, scaledRadius);
+			var computeScreenRatio = computeScreenRatio(distToCam, scaledRadius);
+			culled = culled || screenRatioCulling(computeScreenRatio);
 
-			var matInfo = ivec4(0.0);
-			if ( !culled ) {
-				matInfo = ivec4(matInfos[lod + matOffset]);
-				culled = culled || matInfo.x <= 0;
-			}
 			if ( ENABLE_COUNT_BUFFER ) {
 				if ( !culled ) {
 					var id = atomicAdd( countBuffer, 0, 1);
-					commandBuffer[ id * 5 ] = matInfo.x;
-					commandBuffer[ id * 5 + 1] = 1;
-					commandBuffer[ id * 5 + 2] = matInfo.y;
-					commandBuffer[ id * 5 + 3] = 0;
-					commandBuffer[ id * 5 + 4] = invocID;
+					for ( i in 0...materialCount ) {
+						matID = i * lodCount;
+						var lod = selectLod(computeScreenRatio, lodCount);
+						var matInfo = ivec4(matInfos[lod + matID]);
+						var instanceID = id + i * instanceCount;
+						emitInstance( instanceID, matInfo.x, 1, matInfo.y, 0, invocID );
+					}
 				}
 			} else {
 				if ( !culled ) {
-					commandBuffer[ invocID * 5 ] = matInfo.x;
-					commandBuffer[ invocID * 5 + 1] = 1;
-					commandBuffer[ invocID * 5 + 2] = matInfo.y;
-					commandBuffer[ invocID * 5 + 3] = 0;
-					commandBuffer[ invocID * 5 + 4] = invocID;
+					for ( i in 0...materialCount ) {
+						matID = i * lodCount;
+						var lod = selectLod(computeScreenRatio, lodCount);
+						var matInfo = ivec4(matInfos[lod + matID]);
+						var instanceID = invocID + i * instanceCount;
+						emitInstance( instanceID, matInfo.x, 1, matInfo.y, 0, invocID );
+					}
 				} else {
-					commandBuffer[ invocID * 5 ] = 0;
-					commandBuffer[ invocID * 5 + 1] = 0;
-					commandBuffer[ invocID * 5 + 2] = 0;
-					commandBuffer[ invocID * 5 + 3] = 0;
-					commandBuffer[ invocID * 5 + 4] = 0;
+					for ( i in 0...materialCount ) {
+						var instanceID = invocID + i * instanceCount;
+						emitInstance( instanceID, 0, 0, 0, 0, 0 );
+					}
 				}
 			}
 		}