Branimir Karadžić 9 years ago
parent
commit
096a70b7ff
2 changed files with 40 additions and 7 deletions
  1. 17 3
      include/bx/float4_ni.h
  2. 23 4
      tests/float4_t.cpp

+ 17 - 3
include/bx/float4_ni.h

@@ -177,16 +177,30 @@ namespace bx
 	{
 		const float4_t half   = float4_splat(0.5f);
 		const float4_t one    = float4_splat(1.0f);
-		const float4_t zero   = float4_zero();
 		const float4_t tmp0   = float4_rsqrt_est(_a);
-		const float4_t tmp1   = float4_madd(tmp0, _a, zero);
-		const float4_t tmp2   = float4_madd(tmp1, half, zero);
+		const float4_t tmp1   = float4_mul(tmp0, _a);
+		const float4_t tmp2   = float4_mul(tmp1, half);
 		const float4_t tmp3   = float4_nmsub(tmp0, tmp1, one);
 		const float4_t result = float4_madd(tmp3, tmp2, tmp1);
 
 		return result;
 	}
 
+	BX_FLOAT4_INLINE float4_t float4_sqrt_nr1_ni(float4_t _a) // Square-root approximation of _a by repeatedly averaging an estimate with _a / estimate.
+	{
+		const float4_t half = float4_splat(0.5f);
+
+		float4_t result = _a; // The input itself is used as the starting estimate.
+		for (uint32_t ii = 0; ii < 11; ++ii) // Fixed 11 refinement steps; there is no convergence test. NOTE(review): starting from _a, large inputs may not be fully converged after 11 steps, and _a == 0 looks like it divides 0 by 0 on the first step -- confirm the intended input range.
+		{
+			const float4_t tmp1 = float4_div(_a, result); // _a / result (presumably per lane -- float4_div is defined elsewhere)
+			const float4_t tmp2 = float4_add(tmp1, result); // _a / result + result
+			result              = float4_mul(tmp2, half); // next estimate: half of the sum above
+		}
+
+		return result; // Estimate after the last refinement step.
+	}
+
 	BX_FLOAT4_INLINE float4_t float4_rsqrt_ni(float4_t _a)
 	{
 		const float4_t one    = float4_splat(1.0f);

+ 23 - 4
tests/float4_t.cpp

@@ -5,6 +5,7 @@
 
 #include "test.h"
 #include <bx/float4_t.h>
+#include <bx/fpumath.h>
 #include <string.h>
 
 using namespace bx;
@@ -70,10 +71,10 @@ void float4_check_float(const char* _str, bx::float4_t _a, float _0, float _1, f
 		, _0, _1, _2, _3
 		);
 
-	CHECK_EQUAL(c.f[0], _0);
-	CHECK_EQUAL(c.f[1], _1);
-	CHECK_EQUAL(c.f[2], _2);
-	CHECK_EQUAL(c.f[3], _3);
+	CHECK(bx::fequal(c.f[0], _0, 0.0001f) );
+	CHECK(bx::fequal(c.f[1], _1, 0.0001f) );
+	CHECK(bx::fequal(c.f[2], _2, 0.0001f) );
+	CHECK(bx::fequal(c.f[3], _3, 0.0001f) );
 }
 
 void float4_check_string(const char* _str, bx::float4_t _a)
@@ -235,6 +236,24 @@ TEST(float4_arithmetic)
 		);
 }
 
+TEST(float4_sqrt) // Runs three square-root variants on the same four inputs and checks them against the same expected values.
+{
+	float4_check_float("float4_sqrt" // label passed to the checker for this variant
+		, float4_sqrt(float4_ld(1.0f, 16.0f, 65536.0f, 123456.0f) )
+		, 1.0f, 4.0f, 256.0f, 351.363060096f // expected square roots of the four inputs above
+		);
+
+	float4_check_float("float4_sqrt_nr_ni" // same inputs and expectations, using float4_sqrt_nr_ni
+		, float4_sqrt_nr_ni(float4_ld(1.0f, 16.0f, 65536.0f, 123456.0f) )
+		, 1.0f, 4.0f, 256.0f, 351.363060096f
+		);
+
+	float4_check_float("float4_sqrt_nr1_ni" // same inputs and expectations, using float4_sqrt_nr1_ni
+		, float4_sqrt_nr1_ni(float4_ld(1.0f, 16.0f, 65536.0f, 123456.0f) )
+		, 1.0f, 4.0f, 256.0f, 351.363060096f
+		);
+}
+
 TEST(float4)
 {
 	const float4_t isplat = float4_isplat(0x80000001);