Jelajahi Sumber

Force SSE2 for all builds. Fixed SIMD load/store reference implementation.

bkaradzic 12 tahun lalu
induk
melakukan
7d02e14aba
4 file diubah dengan 32 tambahan dan 13 penghapusan
  1. +2 -7
      include/bx/float4_ni.h
  2. +26 -6
      include/bx/float4_ref.h
  3. +1 -0
      include/bx/float4_t.h
  4. +3 -0
      premake/toolchain.lua

+ 2 - 7
include/bx/float4_ni.h

@@ -194,16 +194,11 @@ namespace bx
 
 	namespace float4_logexp_detail
 	{
-		BX_FLOAT4_INLINE float4_t float4_poly0(float4_t _a, float _b)
-		{
-			return float4_splat(_b);
-		}
-
 		BX_FLOAT4_INLINE float4_t float4_poly1(float4_t _a, float _b, float _c)
 		{
 			const float4_t bbbb   = float4_splat(_b);
-			const float4_t poly0  = float4_poly0(_a, _c);
-			const float4_t result = float4_madd(poly0, _a, bbbb);
+			const float4_t cccc   = float4_splat(_c);
+			const float4_t result = float4_madd(cccc, _a, bbbb);
 
 			return result;
 		}

+ 26 - 6
include/bx/float4_ref.h

@@ -182,22 +182,37 @@ IMPLEMENT_TEST(xyzw , 0xf);
 
 	BX_FLOAT4_INLINE float4_t float4_ld(const void* _ptr)
 	{
-		return *reinterpret_cast<const float4_t*>(_ptr);
+		const uint32_t* input = reinterpret_cast<const uint32_t*>(_ptr);
+		float4_t result;
+		result.uxyzw[0] = input[0];
+		result.uxyzw[1] = input[1];
+		result.uxyzw[2] = input[2];
+		result.uxyzw[3] = input[3];
+		return result;
 	}
 
 	BX_FLOAT4_INLINE void float4_st(void* _ptr, float4_t _a)
 	{
-		*reinterpret_cast<float4_t*>(_ptr) = _a;
+		uint32_t* result = reinterpret_cast<uint32_t*>(_ptr);
+		result[0] = _a.uxyzw[0];
+		result[1] = _a.uxyzw[1];
+		result[2] = _a.uxyzw[2];
+		result[3] = _a.uxyzw[3];
 	}
 
 	BX_FLOAT4_INLINE void float4_stx(void* _ptr, float4_t _a)
 	{
-		*reinterpret_cast<uint32_t*>(_ptr) = _a.uxyzw[0];
+		uint32_t* result = reinterpret_cast<uint32_t*>(_ptr);
+		result[0] = _a.uxyzw[0];
 	}
 
 	BX_FLOAT4_INLINE void float4_stream(void* _ptr, float4_t _a)
 	{
-		*reinterpret_cast<float4_t*>(_ptr) = _a;
+		uint32_t* result = reinterpret_cast<uint32_t*>(_ptr);
+		result[0] = _a.uxyzw[0];
+		result[1] = _a.uxyzw[1];
+		result[2] = _a.uxyzw[2];
+		result[3] = _a.uxyzw[3];
 	}
 
 	BX_FLOAT4_INLINE float4_t float4_ld(float _x, float _y, float _z, float _w)
@@ -222,8 +237,13 @@ IMPLEMENT_TEST(xyzw , 0xf);
 
 	BX_FLOAT4_INLINE float4_t float4_splat(const void* _ptr)
 	{
-		float val = *reinterpret_cast<const float*>(_ptr);
-		return float4_ld(val, val, val, val);
+		const uint32_t val = *reinterpret_cast<const uint32_t*>(_ptr);
+		float4_t result;
+		result.uxyzw[0] = val;
+		result.uxyzw[1] = val;
+		result.uxyzw[2] = val;
+		result.uxyzw[3] = val;
+		return result;
 	}
 
 	BX_FLOAT4_INLINE float4_t float4_splat(float _a)

+ 1 - 0
include/bx/float4_t.h

@@ -15,6 +15,7 @@
 #elif 0 // __ARM_NEON__
 #	include "float4_neon.h"
 #else
+#	pragma message("************************************\nUsing SIMD reference implementation!\n************************************")
 #	include "float4_ref.h"
 #endif //
 

+ 3 - 0
premake/toolchain.lua

@@ -183,6 +183,9 @@ function toolchain(_buildDir, _libDir)
 		targetsuffix "Release"
 
 	configuration { "vs*" }
+		flags {
+			"EnableSSE2",
+		}
 		includedirs { bxDir .. "include/compat/msvc" }
 		defines {
 			"WIN32",