Просмотр исходного кода

metal backend:
- iOS9 fixes
- removing cpu/gpu syncs (wip)

attilaz 9 лет назад
Родитель
Коммит
c35935da81
2 измененных файлов с 124 добавлено и 52 удалено
  1. 29 8
      src/renderer_mtl.h
  2. 95 44
      src/renderer_mtl.mm

+ 29 - 8
src/renderer_mtl.h

@@ -23,6 +23,8 @@ namespace bgfx { namespace mtl
 	// objects created by functions whose names start with 'new' have a refcount of 1 after creation and must be destroyed with release.
 	// commandBuffer, commandEncoders are autoreleased objects. Needs AutoreleasePool!
 
+#define MTL_MAX_FRAMES_IN_FLIGHT (3)
+	
 #define MTL_CLASS(name) \
 	class name \
 	{ \
@@ -35,6 +37,13 @@ namespace bgfx { namespace mtl
 
 		typedef void (*mtlCallback)(void* userData);
 
+	MTL_CLASS(BlitCommandEncoder)
+		void endEncoding()
+		{
+			[m_obj endEncoding];
+		}
+	MTL_CLASS_END
+	
 	MTL_CLASS(Buffer)
 		void* contents()
 		{
@@ -148,8 +157,14 @@ namespace bgfx { namespace mtl
 
 		id<MTLLibrary> newLibraryWithSource(const char* _source)
 		{
+			MTLCompileOptions* options = [MTLCompileOptions new];
+			//NOTE: turned off because 'When using the fast variants, math functions execute more quickly,
+			//      but operate over a **LIMITED RANGE** and their behavior when handling NaN values is not defined.'
+			if (BX_ENABLED(BX_PLATFORM_IOS))
+				options.fastMathEnabled = NO;
+
 			NSError* error;
-			id<MTLLibrary> lib = [m_obj newLibraryWithSource:@(_source) options:nil error:&error];
+			id<MTLLibrary> lib = [m_obj newLibraryWithSource:@(_source) options:options error:&error];
 			BX_WARN(NULL == error
 				, "Shader compilation failed: %s"
 				, [error.localizedDescription cStringUsingEncoding:NSASCIIStringEncoding]
@@ -583,10 +598,12 @@ namespace bgfx { namespace mtl
 	struct BufferMtl
 	{
 		BufferMtl()
-			: m_buffer(NULL)
-			, m_flags(BGFX_BUFFER_NONE)
+			: m_flags(BGFX_BUFFER_NONE)
 			, m_dynamic(false)
+			, m_bufferIndex(0)
 		{
+			for (uint32_t ii = 0; ii < MTL_MAX_FRAMES_IN_FLIGHT; ++ii)
+				m_buffers[ii] = NULL;
 		}
 
 		void create(uint32_t _size, void* _data, uint16_t _flags, uint16_t _stride = 0, bool _vertex = false);
@@ -594,18 +611,22 @@ namespace bgfx { namespace mtl
 
 		void destroy()
 		{
-			if (NULL != m_buffer)
+			for (uint32_t ii = 0; ii < MTL_MAX_FRAMES_IN_FLIGHT; ++ii)
 			{
-				[m_buffer release];
-				m_buffer = NULL;
-				m_dynamic = false;
+				MTL_RELEASE(m_buffers[ii]);
 			}
+			m_dynamic = false;
 		}
+		
+		Buffer getBuffer() const { return m_buffers[m_bufferIndex]; }
 
-		Buffer   m_buffer;
 		uint32_t m_size;
 		uint16_t m_flags;
+		
 		bool m_dynamic;
+	private:
+		uint8_t  m_bufferIndex;
+		Buffer   m_buffers[MTL_MAX_FRAMES_IN_FLIGHT];
 	};
 
 	typedef BufferMtl IndexBufferMtl;

+ 95 - 44
src/renderer_mtl.mm

@@ -18,34 +18,35 @@
 #import <Foundation/Foundation.h>
 
 #define UNIFORM_BUFFER_SIZE (8*1024*1024)
-#define UNIFORM_BUFFER_COUNT (3)
 
 /*
  // known metal shader generation issues:
-   03-raymarch: OSX nothing is visible  ( depth/color order should be swapped in fragment output struct)
    15-shadowmaps-simple: shader compilation error
    16-shadowmaps:  //problem with essl -> metal: SAMPLER2D(u_shadowMap0, 4);  sampler index is lost. Shadowmap is set to slot 4, but
       metal shader uses sampler/texture slot 0. this could require changes outside of renderer_mtl?
 	  packFloatToRGBA needs highp. currently it uses half.
    24-nbody: no generated compute shaders for metal
    27-terrain: shaderc generates invalid metal shader for vs_terrain_height_texture. vertex output: half4 gl_Position [[position]], should be float4
-
+ 
 Known issues(driver problems??):
   OSX mac mini(late 2014), OSX10.11.3 : nanovg-rendering: color writemask off causes problem...
-  iPad mini 2,  iOS 8.1.1:  21-deferred: scissor not working properly
-							26-occlusion: doesn't work with two rendercommandencoders, merge should fix this
+TODO: check if swap really solves this?	03-raymarch: OSX nothing is visible  ( depth/color order should be swapped in fragment output struct)
 
+  iPad mini 2,  iOS 8.1.1:  21-deferred: scissor not working properly
+							26-occlusion: query doesn't work with two rendercommandencoders, merge should fix this
+			Only on this device ( no problem on iPad Air 2 with iOS9.3.1)
+ 
 TODOs:
   07-callback, saveScreenshot should be implemented with one frame latency (using saveScreenshotBegin and End)
   - iOS device orientation change is not handled properly
-
+ 
  22-windows: todo support multiple windows
-
- - optimization: remove heavy sync, merge views with same fb and no clear.
+ 
+ - optimization: remove sync points, merge views with same fb and no clear.
       13-stencil and 16-shadowmaps are very inefficient. every view stores/loads backbuffer data
-
+ 
   - 15-shadowmaps-simple (example needs modification mtxCrop znew = z * 0.5 + 0.5 is not needed ) could be hacked in shader too
-
+ 
  BGFX_RESET_FLIP_AFTER_RENDER on low level renderers should be true? (crashes even with BGFX_RESET_FLIP_AFTER_RENDER because there is
 one rendering frame before reset). Do I absolutely need to send the result to the View at flip, or can I do it in submit?
  */
@@ -334,7 +335,7 @@ namespace bgfx { namespace mtl
 			: m_metalLayer(NULL)
 			, m_backBufferPixelFormatHash(0)
 			, m_maxAnisotropy(1)
-			, m_uniformBufferIndex(0)
+			, m_bufferIndex(0)
 			, m_numWindows(1)
 			, m_rtMsaa(false)
 			, m_drawable(NULL)
@@ -405,7 +406,8 @@ namespace bgfx { namespace mtl
 			m_textureDescriptor = newTextureDescriptor();
 			m_samplerDescriptor = newSamplerDescriptor();
 
-			for (uint8_t i=0; i < UNIFORM_BUFFER_COUNT; ++i)
+			m_framesSemaphore.post(MTL_MAX_FRAMES_IN_FLIGHT);
+			for (uint8_t i=0; i < MTL_MAX_FRAMES_IN_FLIGHT; ++i)
 			{
 				m_uniformBuffers[i] = m_device.newBufferWithLength(UNIFORM_BUFFER_SIZE, 0);
 			}
@@ -585,7 +587,7 @@ namespace bgfx { namespace mtl
 				MTL_RELEASE(m_backBufferStencil);
 			}
 
-			for (uint8_t i=0; i < UNIFORM_BUFFER_COUNT; ++i)
+			for (uint8_t i=0; i < MTL_MAX_FRAMES_IN_FLIGHT; ++i)
 			{
 				MTL_RELEASE(m_uniformBuffers[i]);
 			}
@@ -806,7 +808,7 @@ namespace bgfx { namespace mtl
 				return;
 			}
 
-			//TODO: we should wait for completion of pending commandBuffers
+			sync();
 			//TODO: implement this with saveScreenshotBegin/End
 
 			Texture backBuffer = m_drawable.texture;
@@ -908,7 +910,7 @@ namespace bgfx { namespace mtl
 			}
 
 			VertexBufferMtl& vb = m_vertexBuffers[_blitter.m_vb->handle.idx];
-			rce.setVertexBuffer(vb.m_buffer, 0, 1);
+			rce.setVertexBuffer(vb.getBuffer(), 0, 1);
 
 			float proj[16];
 			bx::mtxOrtho(proj, 0.0f, (float)width, (float)height, 0.0f, 0.0f, 1000.0f);
@@ -925,13 +927,20 @@ namespace bgfx { namespace mtl
 			const uint32_t numVertices = _numIndices*4/6;
 			if (0 < numVertices)
 			{
-				m_indexBuffers [_blitter.m_ib->handle.idx].update(0, _numIndices*2, _blitter.m_ib->data);
+				m_indexBuffers [_blitter.m_ib->handle.idx].update(0, _numIndices*2, _blitter.m_ib->data, true);
 				m_vertexBuffers[_blitter.m_vb->handle.idx].update(0, numVertices*_blitter.m_decl.m_stride, _blitter.m_vb->data, true);
 
-				m_renderCommandEncoder.drawIndexedPrimitives(MTLPrimitiveTypeTriangle, _numIndices, MTLIndexTypeUInt16, m_indexBuffers[_blitter.m_ib->handle.idx].m_buffer, 0, 1);
+				m_renderCommandEncoder.drawIndexedPrimitives(MTLPrimitiveTypeTriangle, _numIndices, MTLIndexTypeUInt16, m_indexBuffers[_blitter.m_ib->handle.idx].getBuffer(), 0, 1);
 			}
 		}
 
+		static void commandBufferFinishedCallback(void* _data)
+		{
+			RendererContextMtl* renderer = (RendererContextMtl*)_data;
+			if ( renderer )
+				renderer->m_framesSemaphore.post();
+		}
+
 		void flip(HMD& /*_hmd*/) BX_OVERRIDE
 		{
 			if (NULL == m_drawable
@@ -944,11 +953,13 @@ namespace bgfx { namespace mtl
 			m_commandBuffer.presentDrawable(m_drawable);
 			MTL_RELEASE(m_drawable);
 
+			m_commandBuffer.addCompletedHandler(commandBufferFinishedCallback, this);
+
 			m_commandBuffer.commit();
 
-			//  using heavy syncing now
-			//  TODO: refactor it with double/triple buffering frame data
-			m_commandBuffer.waitUntilCompleted();
+			MTL_RELEASE(m_prevCommandBuffer);
+			m_prevCommandBuffer = m_commandBuffer;
+			retain(m_commandBuffer);
 
 			MTL_RELEASE(m_commandBuffer);
 
@@ -1306,6 +1317,29 @@ namespace bgfx { namespace mtl
 			return m_backBufferDepth.height();
 		}
 
+		void sync()
+		{
+			if ( m_prevCommandBuffer )
+				m_prevCommandBuffer.waitUntilCompleted();
+		}
+
+		BlitCommandEncoder getBlitCommandEncoder()
+		{
+			if ( m_blitCommandEncoder == NULL)
+			{
+				if ( m_commandBuffer == NULL )
+				{
+					m_commandBuffer = m_commandQueue.commandBuffer();
+					retain(m_commandBuffer);
+				}
+				
+				m_blitCommandEncoder = m_commandBuffer.blitCommandEncoder();
+			}
+			
+			return m_blitCommandEncoder;
+		}
+
+
 		Device        m_device;
 		CommandQueue  m_commandQueue;
 		CAMetalLayer* m_metalLayer;
@@ -1320,11 +1354,14 @@ namespace bgfx { namespace mtl
 
 		OcclusionQueryMTL m_occlusionQuery;
 
+		bx::Semaphore m_framesSemaphore;
+
 		Buffer   m_uniformBuffer;
-		Buffer   m_uniformBuffers[UNIFORM_BUFFER_COUNT];
+		Buffer   m_uniformBuffers[MTL_MAX_FRAMES_IN_FLIGHT];
 		uint32_t m_uniformBufferVertexOffset;
 		uint32_t m_uniformBufferFragmentOffset;
-		uint8_t  m_uniformBufferIndex;
+
+		uint8_t  m_bufferIndex;
 
 		uint16_t          m_numWindows;
 		FrameBufferHandle m_windows[BGFX_CONFIG_MAX_FRAME_BUFFERS];
@@ -1361,6 +1398,8 @@ namespace bgfx { namespace mtl
 		// currently active objects data
 		id <CAMetalDrawable> m_drawable;
 		CommandBuffer m_commandBuffer;
+		CommandBuffer m_prevCommandBuffer;
+		BlitCommandEncoder m_blitCommandEncoder;
 		RenderCommandEncoder m_renderCommandEncoder;
 	};
 
@@ -1454,14 +1493,6 @@ namespace bgfx { namespace mtl
 		char* temp = (char*)alloca(tempLen);
 		bx::StaticMemoryBlockWriter writer(temp, tempLen);
 
-		//TODO: remove this hack. some shaders have problem with half<->float conversion
-		writeString(&writer
-					, "#define half float\n"
-					 "#define half2 float2\n"
-					 "#define half3 float3\n"
-					 "#define half4 float4\n"
-					);
-
 		bx::write(&writer, code, codeLen);
 		bx::write(&writer, '\0');
 		code = temp;
@@ -1892,14 +1923,16 @@ namespace bgfx { namespace mtl
 
 		m_size = _size;
 		m_flags = _flags;
+		m_dynamic = false; //NULL == _data;
 
 		if (NULL == _data)
 		{
-			m_buffer = s_renderMtl->m_device.newBufferWithLength(_size, 0);
+			for (uint32_t ii = 0; ii < MTL_MAX_FRAMES_IN_FLIGHT; ++ii)
+				m_buffers[ii] = s_renderMtl->m_device.newBufferWithLength(_size, 0);
 		}
 		else
 		{
-			m_buffer = s_renderMtl->m_device.newBufferWithBytes(_data, _size, 0);
+			m_buffers[0] = s_renderMtl->m_device.newBufferWithBytes(_data, _size, 0);
 		}
 	}
 
@@ -1907,7 +1940,12 @@ namespace bgfx { namespace mtl
 	{
 		BX_UNUSED(_discard);
 
-		memcpy( (uint8_t*)m_buffer.contents() + _offset, _data, _size);
+			//TODO: cannot call this more than once per frame
+		if ( m_dynamic && _discard )
+			m_bufferIndex = (m_bufferIndex + 1) % MTL_MAX_FRAMES_IN_FLIGHT;
+		else
+			s_renderMtl->sync();
+		memcpy( (uint8_t*)getBuffer().contents() + _offset, _data, _size);
 	}
 
 	void VertexBufferMtl::create(uint32_t _size, void* _data, VertexDeclHandle _declHandle, uint16_t _flags)
@@ -2007,7 +2045,7 @@ namespace bgfx { namespace mtl
 
 				desc.storageMode = (MTLStorageMode)(writeOnly||isDepth(TextureFormat::Enum(m_textureFormat))
 													? 2 /*MTLStorageModePrivate*/
-													: 1 /*MTLStorageModeManaged*/
+													: ((BX_ENABLED(BX_PLATFORM_IOS)) ? 0 /* MTLStorageModeShared */ :  1 /*MTLStorageModeManaged*/)
 													);
 
 				desc.usage = MTLTextureUsageShaderRead;
@@ -2109,6 +2147,8 @@ namespace bgfx { namespace mtl
 
 	void TextureMtl::update(uint8_t _side, uint8_t _mip, const Rect& _rect, uint16_t _z, uint16_t _depth, uint16_t _pitch, const Memory* _mem)
 	{
+		s_renderMtl->sync();
+		
 		MTLRegion region =
 		{
 			{ _rect.m_x,     _rect.m_y,      _z     },
@@ -2268,8 +2308,19 @@ namespace bgfx { namespace mtl
 
 	void RendererContextMtl::submit(Frame* _render, ClearQuad& _clearQuad, TextVideoMemBlitter& _textVideoMemBlitter) BX_OVERRIDE
 	{
-		m_commandBuffer = m_commandQueue.commandBuffer();
-		retain(m_commandBuffer); // keep alive to be useable at 'flip'
+		m_framesSemaphore.wait();
+
+		if ( m_commandBuffer == NULL )
+		{
+			m_commandBuffer = m_commandQueue.commandBuffer();
+			retain(m_commandBuffer); // keep alive to be useable at 'flip'
+		}
+		
+		if ( m_blitCommandEncoder )
+		{
+			m_blitCommandEncoder.endEncoding();
+			m_blitCommandEncoder = 0;
+		}
 
 		//TODO: multithreading with multiple commandbuffer
 		// is there a FAST way to tell which view is active?
@@ -2280,8 +2331,8 @@ namespace bgfx { namespace mtl
 		retain(m_drawable); // keep alive to be useable at 'flip'
 #endif
 
-		m_uniformBuffer = m_uniformBuffers[m_uniformBufferIndex];
-		m_uniformBufferIndex = (m_uniformBufferIndex + 1) % UNIFORM_BUFFER_COUNT;
+		m_uniformBuffer = m_uniformBuffers[m_bufferIndex];
+		m_bufferIndex = (m_bufferIndex + 1) % MTL_MAX_FRAMES_IN_FLIGHT;
 		m_uniformBufferVertexOffset = 0;
 		m_uniformBufferFragmentOffset = 0;
 
@@ -2299,13 +2350,13 @@ namespace bgfx { namespace mtl
 		if (0 < _render->m_iboffset)
 		{
 			TransientIndexBuffer* ib = _render->m_transientIb;
-			m_indexBuffers[ib->handle.idx].update(0, _render->m_iboffset, ib->data);
+			m_indexBuffers[ib->handle.idx].update(0, _render->m_iboffset, ib->data, true);
 		}
 
 		if (0 < _render->m_vboffset)
 		{
 			TransientVertexBuffer* vb = _render->m_transientVb;
-			m_vertexBuffers[vb->handle.idx].update(0, _render->m_vboffset, vb->data);
+			m_vertexBuffers[vb->handle.idx].update(0, _render->m_vboffset, vb->data, true);
 		}
 
 		_render->sort();
@@ -2818,12 +2869,12 @@ namespace bgfx { namespace mtl
 						const VertexDecl& vertexDecl = m_vertexDecls[decl];
 						uint32_t offset = draw.m_startVertex  * vertexDecl.getStride();
 
-						rce.setVertexBuffer(vb.m_buffer, offset, 1);
+						rce.setVertexBuffer(vb.getBuffer(), offset, 1);
 
 						if (isValid(draw.m_instanceDataBuffer) )
 						{
 							const VertexBufferMtl& inst = m_vertexBuffers[draw.m_instanceDataBuffer.idx];
-							rce.setVertexBuffer(inst.m_buffer, draw.m_instanceDataOffset, 2);
+							rce.setVertexBuffer(inst.getBuffer(), draw.m_instanceDataOffset, 2);
 						}
 					}
 				}
@@ -2868,7 +2919,7 @@ namespace bgfx { namespace mtl
 								numInstances      = draw.m_numInstances;
 								numPrimsRendered  = numPrimsSubmitted*draw.m_numInstances;
 
-								rce.drawIndexedPrimitives(prim.m_type, numIndices, indexType, ib.m_buffer, 0, draw.m_numInstances);
+								rce.drawIndexedPrimitives(prim.m_type, numIndices, indexType, ib.getBuffer(), 0, draw.m_numInstances);
 							}
 							else if (prim.m_min <= draw.m_numIndices)
 							{
@@ -2878,7 +2929,7 @@ namespace bgfx { namespace mtl
 								numInstances      = draw.m_numInstances;
 								numPrimsRendered  = numPrimsSubmitted*draw.m_numInstances;
 
-								rce.drawIndexedPrimitives(prim.m_type, numIndices, indexType, ib.m_buffer, draw.m_startIndex * indexSize,numInstances);
+								rce.drawIndexedPrimitives(prim.m_type, numIndices, indexType, ib.getBuffer(), draw.m_startIndex * indexSize,numInstances);
 							}
 						}
 						else