7 years ago · 90aadf835b
--- a/examples/37-gpudrivenrendering/cs_downscaleHiZ.sc
+++ b/examples/37-gpudrivenrendering/cs_downscaleHiZ.sc
@@ -14,31 +14,35 @@ uniform vec4 u_inputRTSize;
 
															 NUM_THREADS(16, 16, 1)
														
 
															 void main()
														
 
															 {
														
 
															-	//this shader can be used to both copy a mip over to the output and downscale it. 
														
 
															-	
														
 
															-	ivec2 coord = gl_GlobalInvocationID.xy;
														
 
															-		
														
 
															-	if (all(coord.xy < u_inputRTSize.xy))
														
 
															-	{	
														
 
															+	// this shader can be used to both copy a mip over to the output and downscale it.
														
 
															+
														
 
															+	ivec2 coord = ivec2(gl_GlobalInvocationID.xy);
														
 
															+
														
 
															+	if (all(lessThan(coord.xy, u_inputRTSize.xy) ) )
														
 
															+	{
														
 
															 		float maxDepth = 1.0;
														
 
															-		
														
 
															-		if ( u_inputRTSize.z > 1)
														
 
															+
														
 
															+		if (u_inputRTSize.z > 1)
														
 
															 		{
														
 
															-			vec4 depths = vec4( imageLoad(s_texOcclusionDepthIn, u_inputRTSize.zw * coord.xy ).r,
														
 
															-								imageLoad(s_texOcclusionDepthIn, u_inputRTSize.zw * coord.xy + ivec2(1,0) ).r,
														
 
															-								imageLoad(s_texOcclusionDepthIn, u_inputRTSize.zw * coord.xy + ivec2(0,1)).r,
														
 
															-								imageLoad(s_texOcclusionDepthIn, u_inputRTSize.zw * coord.xy + ivec2(1,1)).r
														
 
															-								);
														
 
															-
														
 
															-			//find and return max depth
														
 
															-			maxDepth = max(max(depths.x, depths.y), max(depths.z, depths.w));
														
 
															+			vec4 depths = vec4(
														
 
															+				  imageLoad(s_texOcclusionDepthIn, ivec2(u_inputRTSize.zw * coord.xy                   ) ).x
														
 
															+				, imageLoad(s_texOcclusionDepthIn, ivec2(u_inputRTSize.zw * coord.xy + ivec2(1.0, 0.0) ) ).x
														
 
															+				, imageLoad(s_texOcclusionDepthIn, ivec2(u_inputRTSize.zw * coord.xy + ivec2(0.0, 1.0) ) ).x
														
 
															+				, imageLoad(s_texOcclusionDepthIn, ivec2(u_inputRTSize.zw * coord.xy + ivec2(1.0, 1.0) ) ).x
														
 
															+				);
														
 
															+
														
 
															+			// find and return max depth
														
 
															+			maxDepth = max(
														
 
															+				  max(depths.x, depths.y)
														
 
															+				, max(depths.z, depths.w)
														
 
															+				);
														
 
															 		}
														
 
															 		else
														
 
															 		{
														
 
															-			//do not downscale, just copy the value over to the output rendertarget
														
 
															-			maxDepth = imageLoad(s_texOcclusionDepthIn, coord.xy ).r;
														
 
															+			// do not downscale, just copy the value over to the output rendertarget
														
 
															+			maxDepth = imageLoad(s_texOcclusionDepthIn, coord.xy).x;
														
 
															 		}
														
 
															-			
														
 
															+
														
 
															 		imageStore(s_texOcclusionDepthOut, coord, vec4(maxDepth,0,0,1) );
														
 
															 	}
														
 
															-}
														
 
															+}
														
--- a/examples/37-gpudrivenrendering/cs_occludeProps.sc
+++ b/examples/37-gpudrivenrendering/cs_occludeProps.sc
@@ -14,38 +14,39 @@ BUFFER_WR(instancePredicates, bool, 3);
 
															 uniform vec4 u_inputRTSize;
														
 
															 uniform vec4 u_cullingConfig;
														
 
															- 
														
 
															+
														
 
															 NUM_THREADS(64, 1, 1)
														
 
															 void main()
														
 
															 {
														
 
															 	bool predicate = false;
														
 
															-	
														
 
															+
														
 
															 	//make sure that we not processing more instances than available
														
 
															-	if (gl_GlobalInvocationID.x < (int)u_cullingConfig.x)
														
 
															+	if (gl_GlobalInvocationID.x < uint(u_cullingConfig.x) )
														
 
															 	{
														
 
															 		//get the bounding box for this instance
														
 
															 		vec4 bboxMin = instanceDataIn[2 * gl_GlobalInvocationID.x] ;
														
 
															 		vec3 bboxMax = instanceDataIn[2 * gl_GlobalInvocationID.x + 1].xyz;
														
 
															-		
														
 
															-		int drawcallID = bboxMin.w;
														
 
															-	
														
 
															+
														
 
															+		int drawcallID = int(bboxMin.w);
														
 
															+
														
 
															 		//Adapted from http://blog.selfshadow.com/publications/practical-visibility/
														
 
															 		vec3 bboxSize = bboxMax.xyz - bboxMin.xyz;
														
 
															-		vec3 boxCorners[] = { 	bboxMin.xyz,
														
 
															-								bboxMin.xyz + vec3(bboxSize.x,0,0),
														
 
															-								bboxMin.xyz + vec3(0, bboxSize.y,0),
														
 
															-								bboxMin.xyz + vec3(0, 0, bboxSize.z),
														
 
															-								bboxMin.xyz + vec3(bboxSize.xy,0),
														
 
															-								bboxMin.xyz + vec3(0, bboxSize.yz),
														
 
															-								bboxMin.xyz + vec3(bboxSize.x, 0, bboxSize.z),
														
 
															-								bboxMin.xyz + bboxSize.xyz
														
 
															-							 };
														
 
															-		float minZ = 1;
														
 
															-		vec2 minXY = vec2(1,1);
														
 
															-		vec2 maxXY = vec2(0,0);
														
 
															-
														
 
															-		[unroll]
														
 
															+		vec3 boxCorners[] = {
														
 
															+			bboxMin.xyz,
														
 
															+			bboxMin.xyz + vec3(bboxSize.x,0,0),
														
 
															+			bboxMin.xyz + vec3(0, bboxSize.y,0),
														
 
															+			bboxMin.xyz + vec3(0, 0, bboxSize.z),
														
 
															+			bboxMin.xyz + vec3(bboxSize.xy,0),
														
 
															+			bboxMin.xyz + vec3(0, bboxSize.yz),
														
 
															+			bboxMin.xyz + vec3(bboxSize.x, 0, bboxSize.z),
														
 
															+			bboxMin.xyz + bboxSize.xyz
														
 
															+		};
														
 
															+		float minZ = 1.0;
														
 
															+		vec2 minXY = vec2(1.0, 1.0);
														
 
															+		vec2 maxXY = vec2(0.0, 0.0);
														
 
															+
														
 
															+		UNROLL
														
 
															 		for (int i = 0; i < 8; i++)
														
 
															 		{
														
 
															 			//transform World space aaBox to NDC
														
@@ -61,20 +62,20 @@ void main()
 
															 			minXY = min(clipPos.xy, minXY);
														
 
															 			maxXY = max(clipPos.xy, maxXY);
														
 
															-			minZ = saturate(min(minZ, clipPos.z));		
														
 
															+			minZ = saturate(min(minZ, clipPos.z));
														
 
															 		}
														
 
															 		vec4 boxUVs = vec4(minXY, maxXY);
														
 
															 		// Calculate hi-Z buffer mip
														
 
															-		ivec2 size = (maxXY - minXY) * u_inputRTSize.xy;
														
 
															+		ivec2 size = ivec2( (maxXY - minXY) * u_inputRTSize.xy);
														
 
															 		float mip = ceil(log2(max(size.x, size.y)));
														
 
															 		mip = clamp(mip, 0, u_cullingConfig.z);
														
 
															 		// Texel footprint for the lower (finer-grained) level
														
 
															-		float  level_lower = max(mip - 1, 0);
														
 
															-		vec2 scale = exp2(-level_lower);
														
 
															+		float level_lower = max(mip - 1, 0);
														
 
															+		vec2 scale = vec2_splat(exp2(-level_lower) );
														
 
															 		vec2 a = floor(boxUVs.xy*scale);
														
 
															 		vec2 b = ceil(boxUVs.zw*scale);
														
 
															 		vec2 dims = b - a;
														
@@ -88,8 +89,8 @@ void main()
 
															 						texture2DLod(s_texOcclusionDepth, boxUVs.zy, mip).x,
														
 
															 						texture2DLod(s_texOcclusionDepth, boxUVs.xw, mip).x,
														
 
															 						texture2DLod(s_texOcclusionDepth, boxUVs.zw, mip).x,
														
 
															-					};		
														
 
															-		
														
 
															+					};
														
 
															+
														
 
															 		//find the max depth
														
 
															 		float maxDepth = max( max(depth.x, depth.y), max(depth.z, depth.w) );
														
@@ -98,7 +99,7 @@ void main()
 
															 			predicate = true;
														
 
															 			//increase instance count for this particular prop type
														
 
															-			InterlockedAdd( drawcallInstanceCount[ drawcallID ], 1);			
														
 
															+			atomicAdd(drawcallInstanceCount[ drawcallID ], 1);
														
 
															 		}
														
 
															 	}
														
--- a/examples/37-gpudrivenrendering/cs_streamCompaction.sc
+++ b/examples/37-gpudrivenrendering/cs_streamCompaction.sc
@@ -21,102 +21,105 @@ BUFFER_RW(drawcallData, uvec4, 4);
 
															 BUFFER_WR(instanceDataOut, vec4, 5);
														
 
															 uniform vec4 u_cullingConfig;
														
 
															- 
														
 
															+
														
 
															 // Based on Parallel Prefix Sum (Scan) with CUDA by Mark Harris
														
 
															-groupshared uint temp[2048];
														
 
															+SHARED uint temp[2048];
														
 
															 NUM_THREADS(1024, 1, 1)
														
 
															 void main()
														
 
															 {
														
 
															-	int tID = gl_GlobalInvocationID.x;
														
 
															-	int NoofInstancesPowOf2 = u_cullingConfig.y;
														
 
															-	int NoofDrawcalls = u_cullingConfig.w;
														
 
															+	uint tID = gl_GlobalInvocationID.x;
														
 
															+	int NoofInstancesPowOf2 = int(u_cullingConfig.y);
														
 
															+	int NoofDrawcalls = int(u_cullingConfig.w);
														
 
															 	int offset = 1;
														
 
															-	temp[2 * tID] = instancePredicates[2 * tID]; // load input into shared memory
														
 
															-	temp[2 * tID + 1] = instancePredicates[2 * tID + 1];
														
 
															+	temp[2 * tID    ] = uint(instancePredicates[2 * tID    ]); // load input into shared memory
														
 
															+	temp[2 * tID + 1] = uint(instancePredicates[2 * tID + 1]);
														
 
															 	int d;
														
 
															-		
														
 
															+
														
 
															 	//perform reduction
														
 
															-	for (d = NoofInstancesPowOf2 >> 1; d > 0; d >>= 1) 
														
 
															+	for (d = NoofInstancesPowOf2 >> 1; d > 0; d >>= 1)
														
 
															 	{
														
 
															-		GroupMemoryBarrierWithGroupSync();
														
 
															+		barrier();
														
 
															 		if (tID < d)
														
 
															 		{
														
 
															-			int ai = offset * (2 * tID + 1) - 1;
														
 
															-			int bi = offset * (2 * tID + 2) - 1;
														
 
															+			int ai = int(offset * (2 * tID + 1) - 1);
														
 
															+			int bi = int(offset * (2 * tID + 2) - 1);
														
 
															 			temp[bi] += temp[ai];
														
 
															 		}
														
 
															+
														
 
															 		offset *= 2;
														
 
															 	}
														
 
															 	// clear the last element
														
 
															 	if (tID == 0)
														
 
															+	{
														
 
															 		temp[NoofInstancesPowOf2 - 1] = 0;
														
 
															+	}
														
 
															-	//perform downsweep and build scan
														
 
															+	// perform downsweep and build scan
														
 
															 	for ( d = 1; d < NoofInstancesPowOf2; d *= 2)
														
 
															 	{
														
 
															 		offset >>= 1;
														
 
															-		GroupMemoryBarrierWithGroupSync();
														
 
															+		barrier();
														
 
															 		if (tID < d)
														
 
															 		{
														
 
															-			int ai = offset * (2 * tID + 1) - 1;
														
 
															-			int bi = offset * (2 * tID + 2) - 1;
														
 
															-			int t = temp[ai];
														
 
															+			int ai = int(offset * (2 * tID + 1) - 1);
														
 
															+			int bi = int(offset * (2 * tID + 2) - 1);
														
 
															+			int t  = int(temp[ai]);
														
 
															 			temp[ai] = temp[bi];
														
 
															 			temp[bi] += t;
														
 
															 		}
														
 
															 	}
														
 
															-	GroupMemoryBarrierWithGroupSync();
														
 
															+	barrier();
														
 
															-	int index = 2 * tID;
														
 
															+	int index = int(2 * tID);
														
 
															-	//scatter results
														
 
															-	if (instancePredicates[index] != 0)
														
 
															-	{	
														
 
															-		instanceDataOut[ 4 * temp[index] ] = instanceDataIn[ 4 * index ];
														
 
															-		instanceDataOut[ 4 * temp[index] + 1 ] = instanceDataIn[ 4 * index + 1 ];
														
 
															-		instanceDataOut[ 4 * temp[index] + 2 ] = instanceDataIn[ 4 * index + 2 ];
														
 
															-		instanceDataOut[ 4 * temp[index] + 3 ] = instanceDataIn[ 4 * index + 3 ];
														
 
															+	// scatter results
														
 
															+	if (instancePredicates[index])
														
 
															+	{
														
 
															+		instanceDataOut[4 * temp[index]    ] = instanceDataIn[4 * index    ];
														
 
															+		instanceDataOut[4 * temp[index] + 1] = instanceDataIn[4 * index + 1];
														
 
															+		instanceDataOut[4 * temp[index] + 2] = instanceDataIn[4 * index + 2];
														
 
															+		instanceDataOut[4 * temp[index] + 3] = instanceDataIn[4 * index + 3];
														
 
															 	}
														
 
															-	index = 2 * tID + 1;
														
 
															+	index = int(2 * tID + 1);
														
 
															-	if (instancePredicates[index] != 0)
														
 
															+	if (instancePredicates[index])
														
 
															 	{
														
 
															-		instanceDataOut[ 4 * temp[index] ] = instanceDataIn[ 4 * index ];			
														
 
															-		instanceDataOut[ 4 * temp[index] + 1 ] = instanceDataIn[ 4 * index + 1 ];
														
 
															-		instanceDataOut[ 4 * temp[index] + 2 ] = instanceDataIn[ 4 * index + 2 ];
														
 
															-		instanceDataOut[ 4 * temp[index] + 3 ] = instanceDataIn[ 4 * index + 3 ];	
														
 
															+		instanceDataOut[4 * temp[index]    ] = instanceDataIn[4 * index    ];
														
 
															+		instanceDataOut[4 * temp[index] + 1] = instanceDataIn[4 * index + 1];
														
 
															+		instanceDataOut[4 * temp[index] + 2] = instanceDataIn[4 * index + 2];
														
 
															+		instanceDataOut[4 * temp[index] + 3] = instanceDataIn[4 * index + 3];
														
 
															 	}
														
 
															-  
														
 
															+
														
 
															 	if (tID == 0)
														
 
															 	{
														
 
															 		uint startInstance = 0;
														
 
															-		
														
 
															+
														
 
															 		//copy data to indirect buffer, could possible be done in a different compute shader
														
 
															 		for (int k = 0; k < NoofDrawcalls; k++)
														
 
															-		{				
														
 
															+		{
														
 
															 			drawIndexedIndirect(
														
 
															-				drawcallData, 
														
 
															-				k, 
														
 
															+				drawcallData,
														
 
															+				k,
														
 
															 				drawcallConstData[ k * 3 ], 			//number of indices
														
 
															 				drawcallInstanceCount[k], 				//number of instances
														
 
															-				drawcallConstData[ k * 3 + 1 ],			//offset into the index buffer 
														
 
															-				drawcallConstData[ k * 3 + 2 ],			//offset into the vertex buffer 
														
 
															+				drawcallConstData[ k * 3 + 1 ],			//offset into the index buffer
														
 
															+				drawcallConstData[ k * 3 + 2 ],			//offset into the vertex buffer
														
 
															 				startInstance							//offset into the instance buffer
														
 
															 				);
														
 
															 			startInstance += drawcallInstanceCount[k];
														
 
															-										
														
 
															+
														
 
															 			drawcallInstanceCount[k] = 0;
														
 
															 		}
														
 
															 	}
														
 
															- 
														
 
															-}
														
 
															+
														
 
															+}
														
--- a/examples/37-gpudrivenrendering/fs_instancedIndirectRendering.sc
+++ b/examples/37-gpudrivenrendering/fs_instancedIndirectRendering.sc
@@ -11,14 +11,14 @@ uniform vec4 u_colour[50];
 
															 void main()
														
 
															 {
														
 
															-	vec4 colour = u_colour[v_materialID.x];
														
 
															-	
														
 
															+	vec4 colour = u_colour[uint(v_materialID)];
														
 
															+
														
 
															 	if ( colour.w < 1.0f )
														
 
															 	{
														
 
															 		//render dithered alpha
														
 
															-		if ( (gl_FragCoord.x % 2) == (gl_FragCoord.y % 2) )
														
 
															+		if ( (int(gl_FragCoord.x) % 2) == (int(gl_FragCoord.y) % 2) )
														
 
															 			discard;
														
 
															 	}
														
 
															-	
														
 
															+
														
 
															 	gl_FragColor = vec4( colour.xyz,1 );
														
 
															 }
														
--- a/examples/37-gpudrivenrendering/gpudrivenrendering.cpp
+++ b/examples/37-gpudrivenrendering/gpudrivenrendering.cpp
@@ -418,6 +418,11 @@ public:
 
															 		// Enable debug text.
														
 
															 		bgfx::setDebug(m_debug);
														
 
															+		//create uniforms
														
 
															+		u_inputRTSize   = bgfx::createUniform("u_inputRTSize", bgfx::UniformType::Vec4);
														
 
															+		u_cullingConfig = bgfx::createUniform("u_cullingConfig", bgfx::UniformType::Vec4);
														
 
															+		u_colour        = bgfx::createUniform("u_colour", bgfx::UniformType::Vec4);
														
 
															+
														
 
															 		//create props
														
 
															 		{
														
 
															 			m_totalInstancesCount = 0;
														
@@ -769,11 +774,6 @@ public:
 
															 		//create samplers
														
 
															 		s_texOcclusionDepthIn = bgfx::createUniform("s_texOcclusionDepthIn", bgfx::UniformType::Int1);
														
 
															-		//create uniforms
														
 
															-		u_inputRTSize = bgfx::createUniform("u_inputRTSize", bgfx::UniformType::Vec4);
														
 
															-		u_cullingConfig = bgfx::createUniform("u_cullingConfig", bgfx::UniformType::Vec4);
														
 
															-		u_colour = bgfx::createUniform("u_colour", bgfx::UniformType::Vec4);
														
 
															-
														
 
															 		m_timeOffset = bx::getHPCounter();
														
 
															 		m_useIndirect = true;
														
--- a/examples/37-gpudrivenrendering/varying.def.sc
+++ b/examples/37-gpudrivenrendering/varying.def.sc
@@ -1,7 +1,9 @@
 
															-uint v_materialID : TEXCOORD0;
														
 
															-
														
 
															 vec3 a_position  : POSITION;
														
 
															+vec2 a_texcoord0 : TEXCOORD0;
														
 
															 vec4 i_data0     : TEXCOORD7;
														
 
															 vec4 i_data1     : TEXCOORD6;
														
 
															 vec4 i_data2     : TEXCOORD5;
														
 
															 vec4 i_data3     : TEXCOORD4;
														
 
															+
														
 
															+vec2  v_texcoord0 : TEXCOORD0;
														
 
															+float v_materialID : TEXCOORD0;
														
--- a/examples/37-gpudrivenrendering/varying_pos_tex0.def.sc
+++ b/examples/37-gpudrivenrendering/varying_pos_tex0.def.sc
@@ -1,4 +0,0 @@
 
															-vec2 v_texcoord0 : TEXCOORD0;
														
 
															-
														
 
															-vec3 a_position  : POSITION;
														
 
															-vec2 a_texcoord0 : TEXCOORD0;
														
--- a/scripts/genie.lua
+++ b/scripts/genie.lua
@@ -464,6 +464,7 @@ or _OPTIONS["with-combined-examples"] then
 
															 		, "34-mvs"
														
 
															 		, "35-dynamic"
														
 
															 		, "36-sky"
														
 
															+		, "37-gpudrivenrendering"
														
 
															 		)
														
 
															 	-- C99 source doesn't compile under WinRT settings
														
--- a/src/bgfx_compute.sh
+++ b/src/bgfx_compute.sh
@@ -251,39 +251,16 @@ __IMAGE_IMPL_A(r32ui,       x,    uvec4, xxxx)
 
															 __IMAGE_IMPL_A(rg32ui,      xy,   uvec4, xyyy)
														
 
															 __IMAGE_IMPL_A(rgba32ui,    xyzw, uvec4, xyzw)
														
 
															-#define __ATOMIC_IMPL_TYPE(_genType, _glFunc, _dxFunc)            \
														
 
															-			_genType _glFunc(inout _genType _mem, _genType _data) \
														
 
															-			{                                                     \
														
 
															-				_genType result;                                  \
														
 
															-				_dxFunc(_mem, _data, result);                     \
														
 
															-				return result;                                    \
														
 
															-			}
														
 
															-
														
 
															-#define __ATOMIC_IMPL(_glFunc, _dxFunc)                \
														
 
															-			__ATOMIC_IMPL_TYPE(int,  _glFunc, _dxFunc) \
														
 
															-			__ATOMIC_IMPL_TYPE(uint, _glFunc, _dxFunc)
														
 
															-
														
 
															-__ATOMIC_IMPL(atomicAdd,      InterlockedAdd);
														
 
															-__ATOMIC_IMPL(atomicAnd,      InterlockedAnd);
														
 
															-__ATOMIC_IMPL(atomicExchange, InterlockedExchange);
														
 
															-__ATOMIC_IMPL(atomicMax,      InterlockedMax);
														
 
															-__ATOMIC_IMPL(atomicMin,      InterlockedMin);
														
 
															-__ATOMIC_IMPL(atomicOr,       InterlockedOr);
														
 
															-__ATOMIC_IMPL(atomicXor,      InterlockedXor);
														
 
															-
														
 
															-int atomicCompSwap(inout int _mem, int _compare, int _data)
														
 
															-{
														
 
															-	int result;
														
 
															-	InterlockedCompareExchange(_mem, _compare, _data, result);
														
 
															-	return result;
														
 
															-}
														
 
															-
														
 
															-uint atomicCompSwap(inout uint _mem, uint _compare, uint _data)
														
 
															-{
														
 
															-	uint result;
														
 
															-	InterlockedCompareExchange(_mem, _compare, _data, result);
														
 
															-	return result;
														
 
															-}
														
 
															+#define atomicAdd(_mem, _data)      InterlockedAdd(_mem, _data)
														
 
															+#define atomicAnd(_mem, _data)      InterlockedAnd(_mem, _data)
														
 
															+#define atomicExchange(_mem, _data) InterlockedExchange(_mem, _data)
														
 
															+#define atomicMax(_mem, _data)      InterlockedMax(_mem, _data)
														
 
															+#define atomicMin(_mem, _data)      InterlockedMin(_mem, _data)
														
 
															+#define atomicOr(_mem, _data)       InterlockedOr(_mem, _data)
														
 
															+#define atomicXor(_mem, _data)      InterlockedXor(_mem, _data)
														
 
															+
														
 
															+#define atomicCompSwap(_mem, _compare, _data) \
														
 
															+	InterlockedCompareExchange(_mem,_compare, _data)
														
 
															 // InterlockedCompareStore