12 years ago · 752fa73cec
--- a/src/image.cpp
+++ b/src/image.cpp
@@ -3,60 +3,55 @@
 
				  * License: http://www.opensource.org/licenses/BSD-2-Clause
			
 
				  */
			
 
				 
			
 
				+#include "bgfx_p.h"
			
 
				 #include <bx/float4_t.h>
			
 
				 
			
 
				 namespace bgfx
			
 
				 {
			
 
				 	static void imageSwizzleBGRA8Ref(uint8_t* _rgbaData, uint32_t _width, uint32_t _height) 
			
 
				 	{
			
 
				-		uint32_t dstpitch = _width*4;
			
 
				-		for (uint32_t yy = 0; yy < _height; ++yy)
			
 
				+		const uint32_t dstpitch = _width*4;
			
 
				+		uint8_t* ptr = _rgbaData;
			
 
				+
			
 
				+		for (uint32_t xx = 0, num = _width*_height; xx < num; ++xx)
			
 
				 		{
			
 
				-			uint8_t* dst = &_rgbaData[yy*dstpitch];
			
 
				-
			
 
				-			for (uint32_t xx = 0; xx < _width; ++xx)
			
 
				-			{
			
 
				-				uint8_t tmp = dst[0];
			
 
				-				dst[0] = dst[2];
			
 
				-				dst[2] = tmp;
			
 
				-				dst += 4;
			
 
				-			}
			
 
				+			uint8_t tmp = ptr[0];
			
 
				+			ptr[0] = ptr[2];
			
 
				+			ptr[2] = tmp;
			
 
				+			ptr += 4;
			
 
				 		}
			
 
				 	}
			
 
				 
			
 
				 	void imageSwizzleBGRA8(uint8_t* _rgbaData, uint32_t _width, uint32_t _height)
			
 
				 	{
			
 
				-		if (0 != (_width&0xf)
			
 
				-		||  _width < 16)
			
 
				+		// Check whether we can process four 4-byte pixels at a time.
			
 
				+		if (0 != (_width&0x3)
			
 
				+		||  _width < 4)
			
 
				 		{
			
 
				+			BX_WARN(_width < 4, "Image swizzle is taking slow path. Image width must be multiple of 4 (width %d).", _width);
			
 
				 			imageSwizzleBGRA8Ref(_rgbaData, _width, _height);
			
 
				 			return;
			
 
				 		}
			
 
				 
			
 
				-		uint32_t dstpitch = _width*4;
			
 
				-		uint32_t num = dstpitch/16;
			
 
				+		const uint32_t dstpitch = _width*4;
			
 
				 
			
 
				 		using namespace bx;
			
 
				 
			
 
				 		const float4_t mf0f0 = float4_isplat(0xff00ff00);
			
 
				 		const float4_t m0f0f = float4_isplat(0x00ff00ff);
			
 
				+		uint8_t* ptr = _rgbaData;
			
 
				 
			
 
				-		for (uint32_t yy = 0; yy < _height; ++yy)
			
 
				+		for (uint32_t xx = 0, num = dstpitch/16*_height; xx < num; ++xx)
			
 
				 		{
			
 
				-			uint8_t* ptr = &_rgbaData[yy*dstpitch];
			
 
				-
			
 
				-			for (uint32_t xx = 0; xx < num; ++xx)
			
 
				-			{
			
 
				-				const float4_t tabgr = float4_ld(ptr);
			
 
				-				const float4_t t00ab = float4_srl(tabgr, 16);
			
 
				-				const float4_t tgr00 = float4_sll(tabgr, 16);
			
 
				-				const float4_t tgrab = float4_or(t00ab, tgr00);
			
 
				-				const float4_t ta0g0 = float4_and(tabgr, mf0f0);
			
 
				-				const float4_t t0g0b = float4_and(tgrab, m0f0f);
			
 
				-				const float4_t targb = float4_or(ta0g0, t0g0b);
			
 
				-				float4_st(ptr, targb);
			
 
				-				ptr += 16;
			
 
				-			}
			
 
				+			const float4_t tabgr = float4_ld(ptr);
			
 
				+			const float4_t t00ab = float4_srl(tabgr, 16);
			
 
				+			const float4_t tgr00 = float4_sll(tabgr, 16);
			
 
				+			const float4_t tgrab = float4_or(t00ab, tgr00);
			
 
				+			const float4_t ta0g0 = float4_and(tabgr, mf0f0);
			
 
				+			const float4_t t0r0b = float4_and(tgrab, m0f0f);
			
 
				+			const float4_t targb = float4_or(ta0g0, t0r0b);
			
 
				+			float4_st(ptr, targb);
			
 
				+			ptr += 16;
			
 
				 		}
			
 
				 	}