Преглед изворни кода

Added NEON float4_t. Added float4_t unit test.

bkaradzic пре 12 година
родитељ
комит
7d537a65c9

+ 16 - 5
3rdparty/UnitTest++/src/TestReporterStdout.cpp

@@ -8,6 +8,13 @@
 	namespace std {}
 #endif
 
+#if defined(__ANDROID__)
+#	include <android/log.h>
+#	define outf(format, ...) __android_log_print(ANDROID_LOG_DEBUG, "", format, ##__VA_ARGS__)
+#else
+#	define outf(format, ...) printf(format, ##__VA_ARGS__)
+#endif // defined(__ANDROID__)
+
 namespace UnitTest {
 
 void TestReporterStdout::ReportFailure(TestDetails const& details, char const* failure)
@@ -18,8 +25,8 @@ void TestReporterStdout::ReportFailure(TestDetails const& details, char const* f
     char const* const errorFormat = "%s(%d): error: Failure in %s: %s\n";
 #endif
 
-	using namespace std;
-    printf(errorFormat, details.filename, details.lineNumber, details.testName, failure);
+    using namespace std;
+    outf(errorFormat, details.filename, details.lineNumber, details.testName, failure);
 }
 
 void TestReporterStdout::ReportTestStart(TestDetails const& /*test*/)
@@ -36,11 +43,15 @@ void TestReporterStdout::ReportSummary(int const totalTestCount, int const faile
 	using namespace std;
 
     if (failureCount > 0)
-        printf("FAILURE: %d out of %d tests failed (%d failures).\n", failedTestCount, totalTestCount, failureCount);
+	{
+        outf("FAILURE: %d out of %d tests failed (%d failures).\n", failedTestCount, totalTestCount, failureCount);
+	}
     else
-        printf("Success: %d tests passed.\n", totalTestCount);
+	{
+        outf("Success: %d tests passed.\n", totalTestCount);
+	}
 
-    printf("Test time: %.2f seconds.\n", secondsElapsed);
+    outf("Test time: %.2f seconds.\n", secondsElapsed);
 }
 
 }

+ 298 - 82
include/bx/float4_neon.h

@@ -6,20 +6,13 @@
 #ifndef BX_FLOAT4_NEON_H_HEADER_GUARD
 #define BX_FLOAT4_NEON_H_HEADER_GUARD
 
-#include <arm_neon.h>
-
 namespace bx
 {
+	typedef __builtin_neon_sf  float4_t __attribute__( (__vector_size__(16) ) );
 
-// Reference:
-// http://gcc.gnu.org/onlinedocs/gcc/ARM-NEON-Intrinsics.html
-// http://blogs.arm.com/software-enablement/161-coding-for-neon-part-1-load-and-stores/
-// http://blogs.arm.com/software-enablement/196-coding-for-neon-part-2-dealing-with-leftovers/
-// http://blogs.arm.com/software-enablement/241-coding-for-neon-part-3-matrix-multiplication/
-// http://blogs.arm.com/software-enablement/277-coding-for-neon-part-4-shifting-left-and-right/
-// http://blogs.arm.com/software-enablement/684-coding-for-neon-part-5-rearranging-vectors/
-
-	typedef __builtin_neon_sf float4_t __attribute__( (__vector_size__(16) ) );
+	typedef __builtin_neon_sf  _f32x2_t	__attribute__( (__vector_size__( 8) ) );
+	typedef __builtin_neon_si  _i32x4_t __attribute__( (__vector_size__(16) ) );
+	typedef __builtin_neon_usi _u32x4_t __attribute__( (__vector_size__(16) ) );
 
 #define ELEMx 0
 #define ELEMy 1
@@ -28,12 +21,7 @@ namespace bx
 #define IMPLEMENT_SWIZZLE(_x, _y, _z, _w) \
 			BX_FLOAT4_INLINE float4_t float4_swiz_##_x##_y##_z##_w(float4_t _a) \
 			{ \
-				float4_t result; \
-				result.ixyzw[0] = _a.ixyzw[ELEM##_x]; \
-				result.ixyzw[1] = _a.ixyzw[ELEM##_y]; \
-				result.ixyzw[2] = _a.ixyzw[ELEM##_z]; \
-				result.ixyzw[3] = _a.ixyzw[ELEM##_w]; \
-				return result; \
+				return __builtin_shuffle(_a, (_u32x4_t){ ELEM##_x, ELEM##_y, ELEM##_z, ELEM##_w }); \
 			}
 
 #include "float4_swizzle.inl"
@@ -46,89 +34,106 @@ namespace bx
 
 	BX_FLOAT4_INLINE float4_t float4_shuf_xyAB(float4_t _a, float4_t _b)
 	{
-		return _a; //_mm_movelh_ps(_a, _b);
+		return __builtin_shuffle(_a, _b, (_u32x4_t){ 0, 1, 4, 5 });
 	}
 
 	BX_FLOAT4_INLINE float4_t float4_shuf_ABxy(float4_t _a, float4_t _b)
 	{
-		return _a; //_mm_movelh_ps(_b, _a);
+		return __builtin_shuffle(_a, _b, (_u32x4_t){ 4, 5, 0, 1 });
 	}
 
 	BX_FLOAT4_INLINE float4_t float4_shuf_CDzw(float4_t _a, float4_t _b)
 	{
-		return _a; //_mm_movehl_ps(_a, _b);
+		return __builtin_shuffle(_a, _b, (_u32x4_t){ 6, 7, 2, 3 });
 	}
 
 	BX_FLOAT4_INLINE float4_t float4_shuf_zwCD(float4_t _a, float4_t _b)
 	{
-		return _a; //_mm_movehl_ps(_b, _a);
+		return __builtin_shuffle(_a, _b, (_u32x4_t){ 2, 3, 6, 7 });
 	}
 
 	BX_FLOAT4_INLINE float4_t float4_shuf_xAyB(float4_t _a, float4_t _b)
 	{
-		return _a; //_mm_unpacklo_ps(_a, _b);
+		return __builtin_shuffle(_a, _b, (_u32x4_t){ 0, 4, 1, 5 });
 	}
 
 	BX_FLOAT4_INLINE float4_t float4_shuf_yBxA(float4_t _a, float4_t _b)
 	{
-		return _a; //_mm_unpacklo_ps(_b, _a);
+		return __builtin_shuffle(_a, _b, (_u32x4_t){ 1, 5, 0, 4 });
 	}
 
 	BX_FLOAT4_INLINE float4_t float4_shuf_zCwD(float4_t _a, float4_t _b)
 	{
-		return _a; //_mm_unpackhi_ps(_a, _b);
+		return __builtin_shuffle(_a, _b, (_u32x4_t){ 2, 6, 3, 7 });
 	}
 
 	BX_FLOAT4_INLINE float4_t float4_shuf_CzDw(float4_t _a, float4_t _b)
 	{
-		return _a; //_mm_unpackhi_ps(_b, _a);
+		return __builtin_shuffle(_a, _b, (_u32x4_t){ 6, 2, 7, 3 });
 	}
 
 	BX_FLOAT4_INLINE float float4_x(float4_t _a)
 	{
-		return _a.fxyzw[0];
+		return __builtin_neon_vget_lanev4sf(_a, 0, 3);
 	}
 
 	BX_FLOAT4_INLINE float float4_y(float4_t _a)
 	{
-		return _a.fxyzw[1];
+		return __builtin_neon_vget_lanev4sf(_a, 1, 3);
 	}
 
 	BX_FLOAT4_INLINE float float4_z(float4_t _a)
 	{
-		return _a.fxyzw[2];
+		return __builtin_neon_vget_lanev4sf(_a, 2, 3);
 	}
 
 	BX_FLOAT4_INLINE float float4_w(float4_t _a)
 	{
-		return _a.fxyzw[3];
+		return __builtin_neon_vget_lanev4sf(_a, 3, 3);
 	}
 
-//	BX_FLOAT4_INLINE float4_t float4_ld(const void* _ptr)
-//	{
-//		return _mm_load_ps(reinterpret_cast<const float*>(_ptr) );
-//	}
+	BX_FLOAT4_INLINE float4_t float4_ld(const void* _ptr)
+	{
+		return __builtin_neon_vld1v4sf( (const __builtin_neon_sf*)_ptr);
+	}
 
-//	BX_FLOAT4_INLINE void float4_st(void* _ptr, float4_t _a)
-//	{
-//		_mm_store_ps(reinterpret_cast<float*>(_ptr), _a);
-//	}
+	BX_FLOAT4_INLINE void float4_st(void* _ptr, float4_t _a)
+	{
+		__builtin_neon_vst1v4sf( (__builtin_neon_sf*)_ptr, _a);
+	}
 
-//	BX_FLOAT4_INLINE void float4_stream(void* _ptr, float4_t _a)
-//	{
-//		_mm_stream_ps(reinterpret_cast<float*>(_ptr), _a);
-//	}
+	BX_FLOAT4_INLINE void float4_stx(void* _ptr, float4_t _a)
+	{
+		__builtin_neon_vst1_lanev4sf( (__builtin_neon_sf*)_ptr, _a, 0); 
+	}
+
+	BX_FLOAT4_INLINE void float4_stream(void* _ptr, float4_t _a)
+	{
+		__builtin_neon_vst1v4sf( (__builtin_neon_sf*)_ptr, _a);
+	}
 
 	BX_FLOAT4_INLINE float4_t float4_ld(float _x, float _y, float _z, float _w)
 	{
-		const float32_t val[4] = {_x, _y, _z, _w};
-		return __builtin_neon_vld1v4sf(val);
+		const float val[4] = {_x, _y, _z, _w}; // four scalars (16 bytes), not an array of four float4_t vectors
+		return float4_ld(val); // array decays to const void*, selecting the pointer overload of float4_ld
 	}
 
 	BX_FLOAT4_INLINE float4_t float4_ild(uint32_t _x, uint32_t _y, uint32_t _z, uint32_t _w)
 	{
 		const uint32_t val[4] = {_x, _y, _z, _w};
-		return (float4_t)__builtin_neon_vld1v4si( (const __builtin_neon_si*)val);
+		const _i32x4_t tmp    = __builtin_neon_vld1v4si( (const __builtin_neon_si*)val);
+		const float4_t result = __builtin_neon_vreinterpretv4sfv4si(tmp);
+
+		return result;
+	}
+
+	BX_FLOAT4_INLINE float4_t float4_splat(const void* _ptr)
+	{
+		const float4_t tmp0   = __builtin_neon_vld1v4sf( (const __builtin_neon_sf *)_ptr);
+		const _f32x2_t tmp1   = __builtin_neon_vget_lowv4sf(tmp0);
+		const float4_t result = __builtin_neon_vdup_lanev4sf(tmp1, 0);
+
+		return result;
 	}
 
 	BX_FLOAT4_INLINE float4_t float4_splat(float _a)
@@ -138,107 +143,318 @@ namespace bx
 
 	BX_FLOAT4_INLINE float4_t float4_isplat(uint32_t _a)
 	{
-		return (float4_t)__builtin_neon_vdup_nv4si( (__builtin_neon_si)_a);
+		const _i32x4_t tmp    = __builtin_neon_vdup_nv4si( (__builtin_neon_si)_a);
+		const float4_t result = __builtin_neon_vreinterpretv4sfv4si(tmp);
+
+		return result;
 	}
 
 	BX_FLOAT4_INLINE float4_t float4_zero()
 	{
-		return vdupq_n_f32(0.0f);
+		return float4_isplat(0);
+	}
+
+	BX_FLOAT4_INLINE float4_t float4_itof(float4_t _a)
+	{
+		const _i32x4_t itof   = __builtin_neon_vreinterpretv4siv4sf(_a);
+		const float4_t result = __builtin_neon_vcvtv4si(itof, 1);
+
+		return result;
+	}
+
+	BX_FLOAT4_INLINE float4_t float4_ftoi(float4_t _a)
+	{
+		const _i32x4_t ftoi   = __builtin_neon_vcvtv4sf(_a, 1);
+		const float4_t result = __builtin_neon_vreinterpretv4sfv4si(ftoi);
+
+		return result;
 	}
 
 	BX_FLOAT4_INLINE float4_t float4_add(float4_t _a, float4_t _b)
 	{
-		return vaddq_f32(_a, _b);
+		return __builtin_neon_vaddv4sf(_a, _b, 3);
 	}
 
 	BX_FLOAT4_INLINE float4_t float4_sub(float4_t _a, float4_t _b)
 	{
-		return vsubq_f32(_a, _b);
+		return __builtin_neon_vsubv4sf(_a, _b, 3);
 	}
 
 	BX_FLOAT4_INLINE float4_t float4_mul(float4_t _a, float4_t _b)
 	{
-		return vmulq_f32(_a, _b);
+		return __builtin_neon_vmulv4sf(_a, _b, 3);
 	}
 
 	BX_FLOAT4_INLINE float4_t float4_rcp_est(float4_t _a)
 	{
-		return vrecpeq_f32(_a);
+		return __builtin_neon_vrecpev4sf(_a, 3);
 	}
 
 	BX_FLOAT4_INLINE float4_t float4_rsqrt_est(float4_t _a)
 	{
-		return vrsqrteq_f32(_a);
+		return __builtin_neon_vrsqrtev4sf(_a, 3);
 	}
 
-	BX_FLOAT4_INLINE float4_t float4_and(float4_t _a, float4_t _b)
+	BX_FLOAT4_INLINE float4_t float4_cmpeq(float4_t _a, float4_t _b)
 	{
-		return (float4_t)__builtin_neon_vandv4si( (int32x4_t)_a, (int32x4_t)_b, 0);
+		const _i32x4_t tmp    = __builtin_neon_vceqv4sf(_a, _b, 3);
+		const float4_t result = __builtin_neon_vreinterpretv4sfv4si(tmp);
+
+		return result;
 	}
 
-	//BX_FLOAT4_INLINE float4_t float4_andc(float4_t _a, float4_t _b)
-	//{
-	//	return _mm_andnot_ps(_b, _a);
-	//}
+	BX_FLOAT4_INLINE float4_t float4_cmplt(float4_t _a, float4_t _b)
+	{
+		const _i32x4_t tmp    = __builtin_neon_vcgtv4sf(_b, _a, 3);
+		const float4_t result = __builtin_neon_vreinterpretv4sfv4si(tmp);
+
+		return result;
+	}
 
-	BX_FLOAT4_INLINE float4_t float4_or(float4_t _a, float4_t _b)
+	BX_FLOAT4_INLINE float4_t float4_cmple(float4_t _a, float4_t _b)
 	{
-		return (float4_t)__builtin_neon_vorrv4si( (int32x4_t)_a, (int32x4_t)_b, 0);
+		const _i32x4_t tmp    = __builtin_neon_vcgev4sf(_b, _a, 3);
+		const float4_t result = __builtin_neon_vreinterpretv4sfv4si(tmp);
+
+		return result;
 	}
 
-	BX_FLOAT4_INLINE float4_t float4_iadd(float4_t _a, float4_t _b)
+	BX_FLOAT4_INLINE float4_t float4_cmpgt(float4_t _a, float4_t _b)
 	{
-		const uint32x4_t tmp0 = vreinterpretq_u32_f32(_a);
-		const uint32x4_t tmp1 = vreinterpretq_u32_f32(_b);
-		const uint32x4_t add  = vaddq_u32(tmp0, tmp1);
-		const float4_t result = vreinterpretq_f32_u32(add);
+		const _i32x4_t tmp    = __builtin_neon_vcgtv4sf(_a, _b, 3);
+		const float4_t result = __builtin_neon_vreinterpretv4sfv4si(tmp);
 
 		return result;
 	}
 
-	BX_FLOAT4_INLINE float4_t float4_isub(float4_t _a, float4_t _b)
+	BX_FLOAT4_INLINE float4_t float4_cmpge(float4_t _a, float4_t _b)
+	{
+		const _i32x4_t tmp    = __builtin_neon_vcgev4sf(_a, _b, 3);
+		const float4_t result = __builtin_neon_vreinterpretv4sfv4si(tmp);
+
+		return result;
+	}
+
+	BX_FLOAT4_INLINE float4_t float4_min(float4_t _a, float4_t _b)
+	{
+		return __builtin_neon_vminv4sf(_a, _b, 3);
+	}
+
+	BX_FLOAT4_INLINE float4_t float4_max(float4_t _a, float4_t _b)
+	{
+		return __builtin_neon_vmaxv4sf(_a, _b, 3);
+	}
+
+	BX_FLOAT4_INLINE float4_t float4_and(float4_t _a, float4_t _b)
+	{
+		const _i32x4_t tmp0   = __builtin_neon_vreinterpretv4siv4sf(_a);
+		const _i32x4_t tmp1   = __builtin_neon_vreinterpretv4siv4sf(_b);
+		const _i32x4_t tmp2   = __builtin_neon_vandv4si(tmp0, tmp1, 0);
+		const float4_t result = __builtin_neon_vreinterpretv4sfv4si(tmp2);
+
+		return result;
+	}
+
+	BX_FLOAT4_INLINE float4_t float4_andc(float4_t _a, float4_t _b)
+	{
+		const _i32x4_t tmp0   = __builtin_neon_vreinterpretv4siv4sf(_a);
+		const _i32x4_t tmp1   = __builtin_neon_vreinterpretv4siv4sf(_b);
+		const _i32x4_t tmp2   = __builtin_neon_vbicv4si(tmp0, tmp1, 0);
+		const float4_t result = __builtin_neon_vreinterpretv4sfv4si(tmp2);
+
+		return result;
+	}
+
+	BX_FLOAT4_INLINE float4_t float4_or(float4_t _a, float4_t _b)
 	{
-		const uint32x4_t tmp0 = vreinterpretq_u32_f32(_a);
-		const uint32x4_t tmp1 = vreinterpretq_u32_f32(_b);
-		const uint32x4_t sub  = vsubq_u32(tmp0, tmp1);
-		const float4_t result = vreinterpretq_f32_u32(sub);
+		const _i32x4_t tmp0   = __builtin_neon_vreinterpretv4siv4sf(_a);
+		const _i32x4_t tmp1   = __builtin_neon_vreinterpretv4siv4sf(_b);
+		const _i32x4_t tmp2   = __builtin_neon_vorrv4si(tmp0, tmp1, 0);
+		const float4_t result = __builtin_neon_vreinterpretv4sfv4si(tmp2);
+
+		return result;
+	}
+
+	BX_FLOAT4_INLINE float4_t float4_xor(float4_t _a, float4_t _b)
+	{
+		const _i32x4_t tmp0   = __builtin_neon_vreinterpretv4siv4sf(_a);
+		const _i32x4_t tmp1   = __builtin_neon_vreinterpretv4siv4sf(_b);
+		const _i32x4_t tmp2   = __builtin_neon_veorv4si(tmp0, tmp1, 0);
+		const float4_t result = __builtin_neon_vreinterpretv4sfv4si(tmp2);
 
 		return result;
 	}
 
 	BX_FLOAT4_INLINE float4_t float4_sll(float4_t _a, int _count)
 	{
-		const uint32x4_t tmp   = vreinterpretq_u32_f32(_a);
-		const uint32x4_t shift = vshlq_n_u32(tmp, _count);
-		const float4_t result  = vreinterpretq_f32_u32(shift);
+		if (__builtin_constant_p(_count) )
+		{
+			const _i32x4_t tmp0   = __builtin_neon_vreinterpretv4siv4sf(_a);
+			const _i32x4_t tmp1   = __builtin_neon_vshl_nv4si(tmp0, _count, 0);
+			const float4_t result = __builtin_neon_vreinterpretv4sfv4si(tmp1);
+
+			return result;
+		}
+
+		const _i32x4_t tmp0   = __builtin_neon_vreinterpretv4siv4sf(_a);
+		const _i32x4_t shift  = __builtin_neon_vdup_nv4si( (__builtin_neon_si)_count);
+		const _i32x4_t tmp1   = __builtin_neon_vshlv4si(tmp0, shift, 1);
+		const float4_t result = __builtin_neon_vreinterpretv4sfv4si(tmp1);
 
 		return result;
 	}
 
 	BX_FLOAT4_INLINE float4_t float4_srl(float4_t _a, int _count)
 	{
-		const uint32x4_t tmp   = vreinterpretq_i32_f32(_a);
-		const uint32x4_t shift = (uint32x4_t)__builtin_neon_vshr_nv4si( (int32x4_t)tmp, _count, 0);
-		const float4_t result  = vreinterpretq_f32_u32(shift);
+		if (__builtin_constant_p(_count) )
+		{
+			const _i32x4_t tmp0   = __builtin_neon_vreinterpretv4siv4sf(_a);
+			const _i32x4_t tmp1   = __builtin_neon_vshr_nv4si(tmp0, _count, 0); // trailing 0: unsigned variant, i.e. logical shift
+			const float4_t result = __builtin_neon_vreinterpretv4sfv4si(tmp1);
+
+			return result;
+		}
+
+		const _i32x4_t tmp0   = __builtin_neon_vreinterpretv4siv4sf(_a);
+		const _i32x4_t shift  = __builtin_neon_vdup_nv4si( (__builtin_neon_si)-_count); // negated count: vshl by a negative amount shifts right
+		const _i32x4_t tmp1   = __builtin_neon_vshlv4si(tmp0, shift, 0); // trailing 0 (unsigned) to match the constant path; 1 would sign-extend like float4_sra
+		const float4_t result = __builtin_neon_vreinterpretv4sfv4si(tmp1);
 
 		return result;
 	}
 
 	BX_FLOAT4_INLINE float4_t float4_sra(float4_t _a, int _count)
 	{
-		const int32x4_t a     = vreinterpretq_s32_f32(_a);
-		const int32x4_t shift = __builtin_neon_vshr_nv4si(a, _count, 1);
-		const float4_t result = vreinterpretq_f32_s32(shift);
+		if (__builtin_constant_p(_count) )
+		{
+			const _i32x4_t tmp0   = __builtin_neon_vreinterpretv4siv4sf(_a);
+			const _i32x4_t tmp1   = __builtin_neon_vshr_nv4si(tmp0, _count, 1);
+			const float4_t result = __builtin_neon_vreinterpretv4sfv4si(tmp1);
+
+			return result;
+		}
+
+		const _i32x4_t tmp0   = __builtin_neon_vreinterpretv4siv4sf(_a);
+		const _i32x4_t shift  = __builtin_neon_vdup_nv4si( (__builtin_neon_si)-_count);
+		const _i32x4_t tmp1   = __builtin_neon_vshlv4si(tmp0, shift, 1);
+		const float4_t result = __builtin_neon_vreinterpretv4sfv4si(tmp1);
+
+		return result;
+	}
+
+	BX_FLOAT4_INLINE float4_t float4_madd(float4_t _a, float4_t _b, float4_t _c)
+	{
+		return __builtin_neon_vmlav4sf(_c, _a, _b, 3);
+	}
+
+	BX_FLOAT4_INLINE float4_t float4_nmsub(float4_t _a, float4_t _b, float4_t _c)
+	{
+		return __builtin_neon_vmlsv4sf(_c, _a, _b, 3); // multiply-subtract: _c - _a*_b (vmla here merely duplicated float4_madd)
+	}
+
+	BX_FLOAT4_INLINE float4_t float4_icmpeq(float4_t _a, float4_t _b)
+	{
+		const _i32x4_t tmp0   = __builtin_neon_vreinterpretv4siv4sf(_a);
+		const _i32x4_t tmp1   = __builtin_neon_vreinterpretv4siv4sf(_b);
+		const _i32x4_t tmp2   = __builtin_neon_vceqv4si(tmp0, tmp1, 1);
+		const float4_t result = __builtin_neon_vreinterpretv4sfv4si(tmp2);
+
+		return result;
+	}
+
+	BX_FLOAT4_INLINE float4_t float4_icmplt(float4_t _a, float4_t _b)
+	{
+		const _i32x4_t tmp0   = __builtin_neon_vreinterpretv4siv4sf(_a);
+		const _i32x4_t tmp1   = __builtin_neon_vreinterpretv4siv4sf(_b);
+		const _i32x4_t tmp2   = __builtin_neon_vcgtv4si(tmp1, tmp0, 1);
+		const float4_t result = __builtin_neon_vreinterpretv4sfv4si(tmp2);
+
+		return result;
+	}
+
+	BX_FLOAT4_INLINE float4_t float4_icmpgt(float4_t _a, float4_t _b)
+	{
+		const _i32x4_t tmp0   = __builtin_neon_vreinterpretv4siv4sf(_a);
+		const _i32x4_t tmp1   = __builtin_neon_vreinterpretv4siv4sf(_b);
+		const _i32x4_t tmp2   = __builtin_neon_vcgtv4si(tmp0, tmp1, 1);
+		const float4_t result = __builtin_neon_vreinterpretv4sfv4si(tmp2);
+
+		return result;
+	}
+
+	BX_FLOAT4_INLINE float4_t float4_imin(float4_t _a, float4_t _b)
+	{
+		const _i32x4_t tmp0   = __builtin_neon_vreinterpretv4siv4sf(_a);
+		const _i32x4_t tmp1   = __builtin_neon_vreinterpretv4siv4sf(_b);
+		const _i32x4_t tmp2   = __builtin_neon_vminv4si(tmp0, tmp1, 1);
+		const float4_t result = __builtin_neon_vreinterpretv4sfv4si(tmp2);
+
+		return result;
+	}
+
+	BX_FLOAT4_INLINE float4_t float4_imax(float4_t _a, float4_t _b)
+	{
+		const _i32x4_t tmp0   = __builtin_neon_vreinterpretv4siv4sf(_a);
+		const _i32x4_t tmp1   = __builtin_neon_vreinterpretv4siv4sf(_b);
+		const _i32x4_t tmp2   = __builtin_neon_vmaxv4si(tmp0, tmp1, 1);
+		const float4_t result = __builtin_neon_vreinterpretv4sfv4si(tmp2);
+
+		return result;
+	}
+
+	BX_FLOAT4_INLINE float4_t float4_iadd(float4_t _a, float4_t _b)
+	{
+		const _i32x4_t tmp0   = __builtin_neon_vreinterpretv4siv4sf(_a);
+		const _i32x4_t tmp1   = __builtin_neon_vreinterpretv4siv4sf(_b);
+		const _i32x4_t tmp2   = __builtin_neon_vaddv4si(tmp0, tmp1, 1);
+		const float4_t result = __builtin_neon_vreinterpretv4sfv4si(tmp2);
+
+		return result;
+	}
+
+	BX_FLOAT4_INLINE float4_t float4_isub(float4_t _a, float4_t _b)
+	{
+		const _i32x4_t tmp0   = __builtin_neon_vreinterpretv4siv4sf(_a);
+		const _i32x4_t tmp1   = __builtin_neon_vreinterpretv4siv4sf(_b);
+		const _i32x4_t tmp2   = __builtin_neon_vsubv4si(tmp0, tmp1, 1);
+		const float4_t result = __builtin_neon_vreinterpretv4sfv4si(tmp2);
 
 		return result;
 	}
 
 } // namespace bx
 
-#define float4_div_nr float4_div_nr_ni
-#define float4_div float4_div_nr_ni
-#define float4_ceil float4_ceil_ni
-#define float4_floor float4_floor_ni
+#define float4_shuf_xAzC     float4_shuf_xAzC_ni
+#define float4_shuf_yBwD     float4_shuf_yBwD_ni
+#define float4_rcp           float4_rcp_ni
+#define float4_orx           float4_orx_ni
+#define float4_orc           float4_orc_ni
+#define float4_neg           float4_neg_ni
+#define float4_madd          float4_madd_ni
+#define float4_nmsub         float4_nmsub_ni
+#define float4_div_nr        float4_div_nr_ni
+#define float4_div           float4_div_nr_ni
+#define float4_selb          float4_selb_ni
+#define float4_sels          float4_sels_ni
+#define float4_not           float4_not_ni
+#define float4_abs           float4_abs_ni
+#define float4_clamp         float4_clamp_ni
+#define float4_lerp          float4_lerp_ni
+#define float4_rsqrt         float4_rsqrt_ni
+#define float4_rsqrt_nr      float4_rsqrt_nr_ni
+#define float4_rsqrt_carmack float4_rsqrt_carmack_ni
+#define float4_sqrt_nr       float4_sqrt_nr_ni
+#define float4_sqrt          float4_sqrt_nr_ni
+#define float4_log2          float4_log2_ni
+#define float4_exp2          float4_exp2_ni
+#define float4_pow           float4_pow_ni
+#define float4_cross3        float4_cross3_ni
+#define float4_normalize3    float4_normalize3_ni
+#define float4_dot3          float4_dot3_ni
+#define float4_dot           float4_dot_ni
+#define float4_ceil          float4_ceil_ni
+#define float4_floor         float4_floor_ni
+
 #include "float4_ni.h"
 
 #endif // BX_FLOAT4_NEON_H_HEADER_GUARD

+ 16 - 0
include/bx/float4_ni.h

@@ -123,6 +123,22 @@ namespace bx
 		return result;
 	}
 
+	BX_FLOAT4_INLINE float4_t float4_imin_ni(float4_t _a, float4_t _b)
+	{
+		const float4_t mask   = float4_icmplt(_a, _b);
+		const float4_t result = float4_selb(mask, _a, _b);
+
+		return result;
+	}
+
+	BX_FLOAT4_INLINE float4_t float4_imax_ni(float4_t _a, float4_t _b)
+	{
+		const float4_t mask   = float4_icmpgt(_a, _b);
+		const float4_t result = float4_selb(mask, _a, _b);
+
+		return result;
+	}
+
 	BX_FLOAT4_INLINE float4_t float4_clamp_ni(float4_t _a, float4_t _min, float4_t _max)
 	{
 		const float4_t tmp    = float4_min(_a, _max);

+ 1 - 1
include/bx/float4_ref.h

@@ -419,7 +419,7 @@ IMPLEMENT_TEST(xyzw , 0xf);
 		return result;
 	}
 
-	BX_FLOAT4_INLINE float4_t float4_max(float4_t _a, float4_t _b)
+	BX_NO_INLINE float4_t float4_max(float4_t _a, float4_t _b)
 	{
 		float4_t result;
 		result.fxyzw[0] = _a.fxyzw[0] > _b.fxyzw[0] ? _a.fxyzw[0] : _b.fxyzw[0];

+ 83 - 28
include/bx/float4_sse.h

@@ -14,7 +14,6 @@
 
 namespace bx
 {
-
 	typedef __m128 float4_t;
 
 #define ELEMx 0
@@ -349,6 +348,58 @@ IMPLEMENT_TEST(xyzw , 0xf);
 		return result;
 	}
 
+	BX_FLOAT4_INLINE float4_t float4_icmpeq(float4_t _a, float4_t _b)
+	{
+		const __m128i tmp0    = _mm_castps_si128(_a);
+		const __m128i tmp1    = _mm_castps_si128(_b);
+		const __m128i tmp2    = _mm_cmpeq_epi32(tmp0, tmp1);
+		const float4_t result = _mm_castsi128_ps(tmp2);
+
+		return result;
+	}
+
+	BX_FLOAT4_INLINE float4_t float4_icmplt(float4_t _a, float4_t _b)
+	{
+		const __m128i tmp0    = _mm_castps_si128(_a);
+		const __m128i tmp1    = _mm_castps_si128(_b);
+		const __m128i tmp2    = _mm_cmplt_epi32(tmp0, tmp1);
+		const float4_t result = _mm_castsi128_ps(tmp2);
+
+		return result;
+	}
+
+	BX_FLOAT4_INLINE float4_t float4_icmpgt(float4_t _a, float4_t _b)
+	{
+		const __m128i tmp0    = _mm_castps_si128(_a);
+		const __m128i tmp1    = _mm_castps_si128(_b);
+		const __m128i tmp2    = _mm_cmpgt_epi32(tmp0, tmp1);
+		const float4_t result = _mm_castsi128_ps(tmp2);
+
+		return result;
+	}
+
+#if defined(__SSE4_1__)
+	BX_FLOAT4_INLINE float4_t float4_imin(float4_t _a, float4_t _b)
+	{
+		const __m128i tmp0    = _mm_castps_si128(_a);
+		const __m128i tmp1    = _mm_castps_si128(_b);
+		const __m128i tmp2    = _mm_min_epi32(tmp0, tmp1);
+		const float4_t result = _mm_castsi128_ps(tmp2);
+
+		return result;
+	}
+
+	BX_FLOAT4_INLINE float4_t float4_imax(float4_t _a, float4_t _b)
+	{
+		const __m128i tmp0    = _mm_castps_si128(_a);
+		const __m128i tmp1    = _mm_castps_si128(_b);
+		const __m128i tmp2    = _mm_max_epi32(tmp0, tmp1);
+		const float4_t result = _mm_castsi128_ps(tmp2);
+
+		return result;
+	}
+#endif // defined(__SSE4_1__)
+
 	BX_FLOAT4_INLINE float4_t float4_iadd(float4_t _a, float4_t _b)
 	{
 		const __m128i a       = _mm_castps_si128(_a);
@@ -371,36 +422,40 @@ IMPLEMENT_TEST(xyzw , 0xf);
 
 } // namespace bx
 
-#define float4_shuf_xAzC float4_shuf_xAzC_ni
-#define float4_shuf_yBwD float4_shuf_yBwD_ni
-#define float4_rcp float4_rcp_ni
-#define float4_orx float4_orx_ni
-#define float4_orc float4_orc_ni
-#define float4_neg float4_neg_ni
-#define float4_madd float4_madd_ni
-#define float4_nmsub float4_nmsub_ni
-#define float4_div_nr float4_div_nr_ni
-#define float4_selb float4_selb_ni
-#define float4_sels float4_sels_ni
-#define float4_not float4_not_ni
-#define float4_abs float4_abs_ni
-#define float4_clamp float4_clamp_ni
-#define float4_lerp float4_lerp_ni
-#define float4_rsqrt float4_rsqrt_ni
-#define float4_rsqrt_nr float4_rsqrt_nr_ni
+#define float4_shuf_xAzC     float4_shuf_xAzC_ni
+#define float4_shuf_yBwD     float4_shuf_yBwD_ni
+#define float4_rcp           float4_rcp_ni
+#define float4_orx           float4_orx_ni
+#define float4_orc           float4_orc_ni
+#define float4_neg           float4_neg_ni
+#define float4_madd          float4_madd_ni
+#define float4_nmsub         float4_nmsub_ni
+#define float4_div_nr        float4_div_nr_ni
+#define float4_selb          float4_selb_ni
+#define float4_sels          float4_sels_ni
+#define float4_not           float4_not_ni
+#define float4_abs           float4_abs_ni
+#define float4_clamp         float4_clamp_ni
+#define float4_lerp          float4_lerp_ni
+#define float4_rsqrt         float4_rsqrt_ni
+#define float4_rsqrt_nr      float4_rsqrt_nr_ni
 #define float4_rsqrt_carmack float4_rsqrt_carmack_ni
-#define float4_sqrt_nr float4_sqrt_nr_ni
-#define float4_log2 float4_log2_ni
-#define float4_exp2 float4_exp2_ni
-#define float4_pow float4_pow_ni
-#define float4_cross3 float4_cross3_ni
-#define float4_normalize3 float4_normalize3_ni
+#define float4_sqrt_nr       float4_sqrt_nr_ni
+#define float4_log2          float4_log2_ni
+#define float4_exp2          float4_exp2_ni
+#define float4_pow           float4_pow_ni
+#define float4_cross3        float4_cross3_ni
+#define float4_normalize3    float4_normalize3_ni
+#define float4_ceil          float4_ceil_ni
+#define float4_floor         float4_floor_ni
+
 #if !defined(__SSE4_1__)
-#define float4_dot3 float4_dot3_ni
-#define float4_dot float4_dot_ni
+#	define float4_dot3       float4_dot3_ni
+#	define float4_dot        float4_dot_ni
+#	define float4_imin       float4_imin_ni
+#	define float4_imax       float4_imax_ni
 #endif // defined(__SSE4_1__)
-#define float4_ceil float4_ceil_ni
-#define float4_floor float4_floor_ni
+
 #include "float4_ni.h"
 
 #endif // BX_FLOAT4_SSE_H_HEADER_GUARD

+ 1 - 1
include/bx/float4_swizzle.inl

@@ -1,5 +1,5 @@
 /*
- * Copyright 2010-2012 Branimir Karadzic. All rights reserved.
+ * Copyright 2010-2013 Branimir Karadzic. All rights reserved.
  * License: http://www.opensource.org/licenses/BSD-2-Clause
  */
 

+ 1 - 1
include/bx/float4_t.h

@@ -12,7 +12,7 @@
 
 #if defined(__SSE2__) || (BX_COMPILER_MSVC && (BX_ARCH_64BIT || _M_IX86_FP >= 2) )
 #	include "float4_sse.h"
-#elif 0 // __ARM_NEON__
+#elif __ARM_NEON__
 #	include "float4_neon.h"
 #else
 #	pragma message("************************************\nUsing SIMD reference implementation!\n************************************")

+ 158 - 57
makefile

@@ -1,57 +1,158 @@
-#
-# Copyright 2011-2013 Branimir Karadzic. All rights reserved.
-# License: http://www.opensource.org/licenses/BSD-2-Clause
-#
-
-all:
-	premake4 --file=premake/premake4.lua vs2008
-	premake4 --file=premake/premake4.lua vs2010
-	premake4 --file=premake/premake4.lua --gcc=mingw gmake
-	premake4 --file=premake/premake4.lua --gcc=linux-gcc gmake
-	premake4 --file=premake/premake4.lua --gcc=osx gmake
-	premake4 --file=premake/premake4.lua xcode4
-
-linux-debug32:
-	make -R -C .build/projects/gmake-linux config=debug32
-linux-release32:
-	make -R -C .build/projects/gmake-linux config=release32
-linux-debug64:
-	make -R -C .build/projects/gmake-linux config=debug64
-linux-release64:
-	make -R -C .build/projects/gmake-linux config=release64
-linux: linux-debug32 linux-release32 linux-debug64 linux-release64
-
-mingw-debug32:
-	make -R -C .build/projects/gmake-mingw config=debug32
-mingw-release32:
-	make -R -C .build/projects/gmake-mingw config=release32
-mingw-debug64:
-	make -R -C .build/projects/gmake-mingw config=debug64
-mingw-release64:
-	make -R -C .build/projects/gmake-mingw config=release64
-mingw: mingw-debug32 mingw-release32 mingw-debug64 mingw-release64
-
-vs2008-debug32:
-	devenv .build/projects/vs2008/bgfx.sln /Build "Debug|Win32"
-vs2008-release32:
-	devenv .build/projects/vs2008/bgfx.sln /Build "Release|Win32"
-vs2008-debug64:
-	devenv .build/projects/vs2008/bgfx.sln /Build "Debug|x64"
-vs2008-release64:
-	devenv .build/projects/vs2008/bgfx.sln /Build "Release|x64"
-vs2008: vs2008-debug32 vs2008-release32 vs2008-debug64 vs2008-release64
-
-osx-debug32:
-	make -C .build/projects/gmake-osx config=debug32
-osx-release32:
-	make -C .build/projects/gmake-osx config=release32
-osx-debug64:
-	make -C .build/projects/gmake-osx config=debug64
-osx-release64:
-	make -C .build/projects/gmake-osx config=release64
-osx: osx-debug32 osx-release32 osx-debug64 osx-release64
-
-clean:
-	@echo Cleaning...
-	-rm -r .build
-	-rm -r .debug
+#
+# Copyright 2011-2013 Branimir Karadzic. All rights reserved.
+# License: http://www.opensource.org/licenses/BSD-2-Clause
+#
+
+all:
+	premake4 --file=premake/premake4.lua vs2008
+	premake4 --file=premake/premake4.lua vs2010
+	premake4 --file=premake/premake4.lua vs2012
+	premake4 --file=premake/premake4.lua --gcc=android-arm gmake
+	premake4 --file=premake/premake4.lua --gcc=android-mips gmake
+	premake4 --file=premake/premake4.lua --gcc=android-x86 gmake
+	premake4 --file=premake/premake4.lua --gcc=nacl gmake
+	premake4 --file=premake/premake4.lua --gcc=nacl-arm gmake
+	premake4 --file=premake/premake4.lua --gcc=pnacl gmake
+	premake4 --file=premake/premake4.lua --gcc=mingw gmake
+	premake4 --file=premake/premake4.lua --gcc=linux-gcc gmake
+	premake4 --file=premake/premake4.lua --gcc=osx gmake
+	premake4 --file=premake/premake4.lua --gcc=ios-arm gmake
+	premake4 --file=premake/premake4.lua --gcc=ios-simulator gmake
+	premake4 --file=premake/premake4.lua xcode4
+
+.build/projects/gmake-android-arm:
+	premake4 --file=premake/premake4.lua --gcc=android-arm gmake
+android-arm-debug: .build/projects/gmake-android-arm
+	make -R -C .build/projects/gmake-android-arm config=debug
+android-arm-release: .build/projects/gmake-android-arm
+	make -R -C .build/projects/gmake-android-arm config=release
+android-arm: android-arm-debug android-arm-release
+
+.build/projects/gmake-android-mips:
+	premake4 --file=premake/premake4.lua --gcc=android-mips gmake
+android-mips-debug: .build/projects/gmake-android-mips
+	make -R -C .build/projects/gmake-android-mips config=debug
+android-mips-release: .build/projects/gmake-android-mips
+	make -R -C .build/projects/gmake-android-mips config=release
+android-mips: android-mips-debug android-mips-release
+
+.build/projects/gmake-android-x86:
+	premake4 --file=premake/premake4.lua --gcc=android-x86 gmake
+android-x86-debug: .build/projects/gmake-android-x86
+	make -R -C .build/projects/gmake-android-x86 config=debug
+android-x86-release: .build/projects/gmake-android-x86
+	make -R -C .build/projects/gmake-android-x86 config=release
+android-x86: android-x86-debug android-x86-release
+
+.build/projects/gmake-linux:
+	premake4 --file=premake/premake4.lua --gcc=linux-gcc gmake
+linux-debug32: .build/projects/gmake-linux
+	make -R -C .build/projects/gmake-linux config=debug32
+linux-release32: .build/projects/gmake-linux
+	make -R -C .build/projects/gmake-linux config=release32
+linux-debug64: .build/projects/gmake-linux
+	make -R -C .build/projects/gmake-linux config=debug64
+linux-release64: .build/projects/gmake-linux
+	make -R -C .build/projects/gmake-linux config=release64
+linux: linux-debug32 linux-release32 linux-debug64 linux-release64
+
+.build/projects/gmake-mingw:
+	premake4 --file=premake/premake4.lua --gcc=mingw gmake
+mingw-debug32: .build/projects/gmake-mingw
+	make -R -C .build/projects/gmake-mingw config=debug32
+mingw-release32: .build/projects/gmake-mingw
+	make -R -C .build/projects/gmake-mingw config=release32
+mingw-debug64: .build/projects/gmake-mingw
+	make -R -C .build/projects/gmake-mingw config=debug64
+mingw-release64: .build/projects/gmake-mingw
+	make -R -C .build/projects/gmake-mingw config=release64
+mingw: mingw-debug32 mingw-release32 mingw-debug64 mingw-release64
+
+.build/projects/vs2008:
+	premake4 --file=premake/premake4.lua vs2008
+vs2008-debug32: .build/projects/vs2008
+	devenv .build/projects/vs2008/bgfx.sln /Build "Debug|Win32"
+vs2008-release32: .build/projects/vs2008
+	devenv .build/projects/vs2008/bgfx.sln /Build "Release|Win32"
+vs2008-debug64: .build/projects/vs2008
+	devenv .build/projects/vs2008/bgfx.sln /Build "Debug|x64"
+vs2008-release64: .build/projects/vs2008
+	devenv .build/projects/vs2008/bgfx.sln /Build "Release|x64"
+vs2008: vs2008-debug32 vs2008-release32 vs2008-debug64 vs2008-release64
+
+.build/projects/vs2010:
+	premake4 --file=premake/premake4.lua vs2010
+
+.build/projects/vs2012:
+	premake4 --file=premake/premake4.lua vs2012
+
+.build/projects/gmake-nacl:
+	premake4 --file=premake/premake4.lua --gcc=nacl gmake
+nacl-debug32: .build/projects/gmake-nacl
+	make -R -C .build/projects/gmake-nacl config=debug32
+nacl-release32: .build/projects/gmake-nacl
+	make -R -C .build/projects/gmake-nacl config=release32
+nacl-debug64: .build/projects/gmake-nacl
+	make -R -C .build/projects/gmake-nacl config=debug64
+nacl-release64: .build/projects/gmake-nacl
+	make -R -C .build/projects/gmake-nacl config=release64
+nacl: nacl-debug32 nacl-release32 nacl-debug64 nacl-release64
+
+.build/projects/gmake-nacl-arm:
+	premake4 --file=premake/premake4.lua --gcc=nacl-arm gmake
+nacl-arm-debug: .build/projects/gmake-nacl-arm
+	make -R -C .build/projects/gmake-nacl-arm config=debug
+nacl-arm-release: .build/projects/gmake-nacl-arm
+	make -R -C .build/projects/gmake-nacl-arm config=release
+nacl-arm: nacl-arm-debug nacl-arm-release
+
+.build/projects/gmake-pnacl:
+	premake4 --file=premake/premake4.lua --gcc=pnacl gmake
+pnacl-debug: .build/projects/gmake-pnacl
+	make -R -C .build/projects/gmake-pnacl config=debug
+pnacl-release: .build/projects/gmake-pnacl
+	make -R -C .build/projects/gmake-pnacl config=release
+pnacl: pnacl-debug pnacl-release
+
+.build/projects/gmake-osx:
+	premake4 --file=premake/premake4.lua --gcc=osx gmake
+osx-debug32: .build/projects/gmake-osx
+	make -C .build/projects/gmake-osx config=debug32
+osx-release32: .build/projects/gmake-osx
+	make -C .build/projects/gmake-osx config=release32
+osx-debug64: .build/projects/gmake-osx
+	make -C .build/projects/gmake-osx config=debug64
+osx-release64: .build/projects/gmake-osx
+	make -C .build/projects/gmake-osx config=release64
+osx: osx-debug32 osx-release32 osx-debug64 osx-release64
+
+.build/projects/gmake-ios-arm:
+	premake4 --file=premake/premake4.lua --gcc=ios-arm gmake
+ios-arm-debug: .build/projects/gmake-ios-arm
+	make -R -C .build/projects/gmake-ios-arm config=debug
+ios-arm-release: .build/projects/gmake-ios-arm
+	make -R -C .build/projects/gmake-ios-arm config=release
+ios-arm: ios-arm-debug ios-arm-release
+
+.build/projects/gmake-ios-simulator:
+	premake4 --file=premake/premake4.lua --gcc=ios-simulator gmake
+ios-simulator-debug: .build/projects/gmake-ios-simulator
+	make -R -C .build/projects/gmake-ios-simulator config=debug
+ios-simulator-release: .build/projects/gmake-ios-simulator
+	make -R -C .build/projects/gmake-ios-simulator config=release
+ios-simulator: ios-simulator-debug ios-simulator-release
+
+rebuild-shaders:
+	make -R -C examples rebuild
+
+analyze:
+	cppcheck src/
+	cppcheck examples/
+
+docs:
+	doxygen premake/bgfx.doxygen
+	markdown README.md > .build/docs/readme.html
+
+clean:
+	@echo Cleaning...
+	-@rm -rf .build

+ 33 - 0
premake/premake4.lua

@@ -12,6 +12,7 @@ solution "bx"
 	platforms {
 		"x32",
 		"x64",
+		"Native", -- for targets where bitness is not specified
 	}
 
 	language "C++"
@@ -52,3 +53,35 @@ project "bx.test"
 		BX_DIR .. "tests/**.cpp",
 		BX_DIR .. "tests/**.H",
 	}
+
+	-- Visual Studio actions: no extra settings are listed under this filter.
+	configuration { "vs*" }
+
+	-- Android: the project kind stays "ConsoleApp", but the output gets a ".so"
+	-- extension and is linked with -shared.
+	-- NOTE(review): presumably so the tests can be loaded as a native activity
+	-- library - confirm.
+	configuration { "android*" }
+		kind "ConsoleApp"
+		targetextension ".so"
+		linkoptions {
+			"-shared",
+		}
+
+	-- NaCl (x86 and ARM): ".nexe" output, linked against ppapi and pthread.
+	configuration { "nacl or nacl-arm" }
+		kind "ConsoleApp"
+		targetextension ".nexe"
+		links {
+			"ppapi",
+			"pthread",
+		}
+
+	-- PNaCl: same libraries as NaCl, but the output extension is ".pexe".
+	configuration { "pnacl" }
+		kind "ConsoleApp"
+		targetextension ".pexe"
+		links {
+			"ppapi",
+			"pthread",
+		}
+
+	-- Linux toolchains: link against pthread.
+	configuration { "linux-*" }
+		links {
+			"pthread",
+		}
+
+	-- Empty filter: settings that follow apply to all configurations again.
+	configuration {}

+ 1 - 1
premake/toolchain.lua

@@ -419,7 +419,7 @@ function toolchain(_buildDir, _libDir)
 			"-mthumb",
 			"-march=armv7-a",
 			"-mfloat-abi=softfp",
-			"-mfpu=vfpv3-d16",
+			"-mfpu=neon",
 		}
 
 	configuration { "android-mips" }

+ 2 - 2
premake/unittest++.lua

@@ -29,13 +29,13 @@ project "UnitTest++"
 		"../3rdparty/UnitTest++/src/*.h",
 	}
 
-	configuration { "linux or osx" }
+	configuration { "linux or osx or android-*" }
 		files {
 			"../3rdparty/UnitTest++/src/Posix/**.cpp",
 			"../3rdparty/UnitTest++/src/Posix/**.h",
 		}
 
-	configuration { "windows" }
+	configuration { "mingw or vs*" }
 		files {
 			"../3rdparty/UnitTest++/src/Win32/**.cpp",
 			"../3rdparty/UnitTest++/src/Win32/**.h",

+ 87 - 0
tests/dbg.cpp

@@ -0,0 +1,87 @@
+/*
+ * Copyright 2011-2013 Branimir Karadzic. All rights reserved.
+ * License: http://www.opensource.org/licenses/BSD-2-Clause
+ */
+
+#include <stdio.h>
+#include <stdint.h>
+#include <inttypes.h>
+#include <ctype.h> // isprint
+
+#include "dbg.h"
+#include <bx/string.h>
+#include <bx/debug.h>
+
+// Formats _format with _argList and passes the resulting string to
+// bx::debugOutput.
+//
+// The text is formatted into an 8 KiB stack buffer first. When it does not fit,
+// a buffer of the required size is taken from the stack with alloca and the
+// text is formatted a second time.
+// NOTE(review): like the original retry logic, this assumes bx::vsnprintf
+// returns the untruncated length (C99 vsnprintf semantics) - confirm.
+void dbgPrintfVargs(const char* _format, va_list _argList)
+{
+	// A va_list is indeterminate after vsnprintf has walked it, so the retry
+	// below needs its own copy.
+	va_list argListCopy;
+	va_copy(argListCopy, _argList);
+
+	char temp[8192];
+	char* out = temp;
+	int32_t len = bx::vsnprintf(out, sizeof(temp), _format, _argList);
+	if ( (int32_t)sizeof(temp) <= len)
+	{
+		// len excludes the terminator: len == sizeof(temp) is already truncated
+		// and out[len] would be one past the end of temp.
+		out = (char*)alloca(len+1);
+		len = bx::vsnprintf(out, len+1, _format, argListCopy);
+	}
+	va_end(argListCopy);
+
+	if (len < 0)
+	{
+		// Formatting failed; output an empty string rather than index with a
+		// negative length.
+		len = 0;
+	}
+
+	out[len] = '\0';
+	bx::debugOutput(out);
+}
+
+// printf-style front end: gathers the variadic arguments into a va_list and
+// forwards them, together with _format, to dbgPrintfVargs.
+void dbgPrintf(const char* _format, ...)
+{
+	va_list args;
+	va_start(args, _format);
+	dbgPrintfVargs(_format, args);
+	va_end(args);
+}
+
+#define DBG_ADDRESS "%" PRIxPTR
+
+// Prints a printf-style message, then the address and size of _data, then a
+// hex/ASCII dump of _size bytes starting at _data (16 bytes per row).
+// Nothing is dumped when _data is NULL.
+void dbgPrintfData(const void* _data, uint32_t _size, const char* _format, ...)
+{
+#define HEX_DUMP_WIDTH 16
+#define HEX_DUMP_SPACE_WIDTH 48
+#define HEX_DUMP_FORMAT "%-" DBG_STRINGIZE(HEX_DUMP_SPACE_WIDTH) "." DBG_STRINGIZE(HEX_DUMP_SPACE_WIDTH) "s"
+
+	va_list argList;
+	va_start(argList, _format);
+	dbgPrintfVargs(_format, argList);
+	va_end(argList);
+
+	// DBG_ADDRESS expands to "%" PRIxPTR, which takes a uintptr_t rather than a
+	// pointer, and _size is a uint32_t, so pass/print them with matching types.
+	dbgPrintf("\ndata: " DBG_ADDRESS ", size: %" PRIu32 "\n", (uintptr_t)_data, _size);
+
+	if (NULL != _data)
+	{
+		// data always points at the first byte of the current row; asciiPos is
+		// the offset inside that row.
+		const uint8_t* data = reinterpret_cast<const uint8_t*>(_data);
+		char hex[HEX_DUMP_WIDTH*3+1];
+		char ascii[HEX_DUMP_WIDTH+1];
+		uint32_t hexPos = 0;
+		uint32_t asciiPos = 0;
+		for (uint32_t ii = 0; ii < _size; ++ii)
+		{
+			// Each byte takes three characters ("xx ") in the hex column.
+			bx::snprintf(&hex[hexPos], sizeof(hex)-hexPos, "%02x ", data[asciiPos]);
+			hexPos += 3;
+
+			// Non-printable bytes are shown as '.' in the ASCII column.
+			ascii[asciiPos] = isprint(data[asciiPos]) ? data[asciiPos] : '.';
+			asciiPos++;
+
+			if (HEX_DUMP_WIDTH == asciiPos)
+			{
+				// Row complete: print it and start the next row.
+				ascii[asciiPos] = '\0';
+				dbgPrintf("\t" DBG_ADDRESS "\t" HEX_DUMP_FORMAT "\t%s\n", (uintptr_t)data, hex, ascii);
+				data += asciiPos;
+				hexPos = 0;
+				asciiPos = 0;
+			}
+		}
+
+		if (0 != asciiPos)
+		{
+			// Print the final, partially filled row.
+			ascii[asciiPos] = '\0';
+			dbgPrintf("\t" DBG_ADDRESS "\t" HEX_DUMP_FORMAT "\t%s\n", (uintptr_t)data, hex, ascii);
+		}
+	}
+
+#undef HEX_DUMP_WIDTH
+#undef HEX_DUMP_SPACE_WIDTH
+#undef HEX_DUMP_FORMAT
+}

+ 21 - 0
tests/dbg.h

@@ -0,0 +1,21 @@
+/*
+ * Copyright 2011-2013 Branimir Karadzic. All rights reserved.
+ * License: http://www.opensource.org/licenses/BSD-2-Clause
+ */
+
+#ifndef DBG_H_HEADER_GUARD
+#define DBG_H_HEADER_GUARD
+
+#include <stdarg.h> // va_list
+#include <stdint.h>
+
+// Two-level stringification: DBG_STRINGIZE lets its argument be macro-expanded
+// first, DBG_STRINGIZE_ then turns the result into a string literal.
+#define DBG_STRINGIZE(_x) DBG_STRINGIZE_(_x)
+#define DBG_STRINGIZE_(_x) #_x
+// String literal of the form "<file>(<line>): " for the place of expansion.
+#define DBG_FILE_LINE_LITERAL "" __FILE__ "(" DBG_STRINGIZE(__LINE__) "): "
+// Prints through dbgPrintf with the file/line prefix and a trailing newline.
+// _format must be a string literal because it is concatenated at compile time.
+#define DBG(_format, ...) dbgPrintf(DBG_FILE_LINE_LITERAL "" _format "\n", ##__VA_ARGS__)
+
+// Formats _format with an already started va_list and outputs the text.
+extern void dbgPrintfVargs(const char* _format, va_list _argList);
+// printf-style variant of dbgPrintfVargs.
+extern void dbgPrintf(const char* _format, ...);
+// Prints the formatted message followed by the address, the size and a
+// hex/ASCII dump of _size bytes at _data.
+extern void dbgPrintfData(const void* _data, uint32_t _size, const char* _format, ...);
+
+#endif // DBG_H_HEADER_GUARD

+ 238 - 0
tests/float4_t.cpp

@@ -0,0 +1,238 @@
+/*
+ * Copyright 2010-2013 Branimir Karadzic. All rights reserved.
+ * License: http://www.opensource.org/licenses/BSD-2-Clause
+ */
+
+#include "test.h"
+#include <bx/float4_t.h>
+#include <string.h>
+
+using namespace bx;
+
+// Overlays a float4_t with float, integer and byte arrays so the checks below
+// can read back individual lanes and bytes of a vector.
+union float4_cast
+{
+	// Vector value under test.
+	bx::float4_t f4;
+	// The same 16 bytes viewed as four floats.
+	float f[4];
+	// ... as four unsigned 32-bit integers.
+	uint32_t ui[4];
+	// ... as four signed 32-bit integers.
+	int32_t i[4];
+	// ... as sixteen individual bytes.
+	char c[16];
+};
+
+// Logs the four signed 32-bit lanes of _a next to the expected values
+// (_0.._3), prefixed with _str, and checks every lane for equality.
+void float4_check_int32(const char* _str, bx::float4_t _a, int32_t _0, int32_t _1, int32_t _2, int32_t _3)
+{
+	float4_cast c; c.f4 = _a;
+	DBG("%s (%d, %d, %d, %d) == (%d, %d, %d, %d)"
+		, _str
+		, c.i[0], c.i[1], c.i[2], c.i[3]
+		, _0, _1, _2, _3
+		);
+
+	// CHECK_EQUAL takes (expected, actual); keeping that order makes failure
+	// messages report the right value as the expected one.
+	CHECK_EQUAL(_0, c.i[0]);
+	CHECK_EQUAL(_1, c.i[1]);
+	CHECK_EQUAL(_2, c.i[2]);
+	CHECK_EQUAL(_3, c.i[3]);
+}
+
+// Logs the four unsigned 32-bit lanes of _a (as hex) next to the expected
+// values (_0.._3), prefixed with _str, and checks every lane for equality.
+void float4_check_uint32(const char* _str, bx::float4_t _a, uint32_t _0, uint32_t _1, uint32_t _2, uint32_t _3)
+{
+	float4_cast c; c.f4 = _a;
+
+	DBG("%s (0x%08x, 0x%08x, 0x%08x, 0x%08x) == (0x%08x, 0x%08x, 0x%08x, 0x%08x)"
+		, _str
+		, c.ui[0], c.ui[1], c.ui[2], c.ui[3]
+		, _0, _1, _2, _3
+		);
+
+	// CHECK_EQUAL takes (expected, actual); keeping that order makes failure
+	// messages report the right value as the expected one.
+	CHECK_EQUAL(_0, c.ui[0]);
+	CHECK_EQUAL(_1, c.ui[1]);
+	CHECK_EQUAL(_2, c.ui[2]);
+	CHECK_EQUAL(_3, c.ui[3]);
+}
+
+// Logs the four float lanes of _a next to the expected values (_0.._3),
+// prefixed with _str, and checks every lane for exact equality.
+void float4_check_float(const char* _str, bx::float4_t _a, float _0, float _1, float _2, float _3)
+{
+	float4_cast c; c.f4 = _a;
+
+	DBG("%s (%f, %f, %f, %f) == (%f, %f, %f, %f)"
+		, _str
+		, c.f[0], c.f[1], c.f[2], c.f[3]
+		, _0, _1, _2, _3
+		);
+
+	// CHECK_EQUAL takes (expected, actual); keeping that order makes failure
+	// messages report the right value as the expected one.
+	CHECK_EQUAL(_0, c.f[0]);
+	CHECK_EQUAL(_1, c.f[1]);
+	CHECK_EQUAL(_2, c.f[2]);
+	CHECK_EQUAL(_3, c.f[3]);
+}
+
+// Builds a four-character string from the bytes at offsets 0, 4, 8 and 12 of
+// _a (one byte per 32-bit lane), logs it next to _str and checks both match.
+void float4_check_string(const char* _str, bx::float4_t _a)
+{
+	float4_cast bits;
+	bits.f4 = _a;
+
+	char test[5];
+	for (uint32_t lane = 0; lane < 4; ++lane)
+	{
+		test[lane] = bits.c[lane*4];
+	}
+	test[4] = '\0';
+
+	DBG("%s %s", _str, test);
+
+	CHECK(0 == strcmp(_str, test) );
+}
+
+// Checks the float4_swiz_* functions: each lane of the input is filled with the
+// ASCII code of its own name (0x78 'x', 0x79 'y', 0x7a 'z', 0x77 'w'), so a
+// swizzle result can be read back as a string and compared with its name.
+TEST(float4_swizzle)
+{
+	const float4_t xyzw = float4_ild(0x78787878, 0x79797979, 0x7a7a7a7a, 0x77777777);
+
+	// NOTE(review): float4_swizzle.inl is presumably an X-macro list that
+	// invokes IMPLEMENT_SWIZZLE once per swizzle and uses the ELEM* indices -
+	// the file is not visible here, confirm.
+#define ELEMx 0
+#define ELEMy 1
+#define ELEMz 2
+#define ELEMw 3
+#define IMPLEMENT_SWIZZLE(_x, _y, _z, _w) \
+			float4_check_string("" #_x #_y #_z #_w "", float4_swiz_##_x##_y##_z##_w(xyzw) ); \
+
+#include <bx/float4_swizzle.inl>
+
+#undef IMPLEMENT_SWIZZLE
+#undef ELEMw
+#undef ELEMz
+#undef ELEMy
+#undef ELEMx
+}
+
+// Checks the float4_shuf_* functions. Lanes of the first operand hold the ASCII
+// codes of 'x','y','z','w', lanes of the second those of 'A','B','C','D', so
+// each result read back as a string must equal the suffix of the function name.
+TEST(float4_shuffle)
+{
+	const float4_t xyzw = float4_ild(0x78787878, 0x79797979, 0x7a7a7a7a, 0x77777777);
+	const float4_t ABCD = float4_ild(0x41414141, 0x42424242, 0x43434343, 0x44444444);
+	float4_check_string("xyAB", float4_shuf_xyAB(xyzw, ABCD) );
+	float4_check_string("ABxy", float4_shuf_ABxy(xyzw, ABCD) );
+	float4_check_string("zwCD", float4_shuf_zwCD(xyzw, ABCD) );
+	float4_check_string("CDzw", float4_shuf_CDzw(xyzw, ABCD) );
+	float4_check_string("xAyB", float4_shuf_xAyB(xyzw, ABCD) );
+	float4_check_string("zCwD", float4_shuf_zCwD(xyzw, ABCD) );
+	float4_check_string("xAzC", float4_shuf_xAzC(xyzw, ABCD) );
+	float4_check_string("yBwD", float4_shuf_yBwD(xyzw, ABCD) );
+	float4_check_string("CzDw", float4_shuf_CzDw(xyzw, ABCD) );
+}
+
+// Checks the float and integer comparison functions. The expected values are
+// per-lane masks: -1 (converted to 0xffffffff by the uint32_t parameters) where
+// the comparison is expected to hold, 0 where it is not.
+TEST(float4_compare)
+{
+	// Float comparisons, all on (1, 2, 3, 4) versus (0, 2, 0, 3).
+	float4_check_uint32("cmpeq"
+		, float4_cmpeq(float4_ld(1.0f, 2.0f, 3.0f, 4.0f), float4_ld(0.0f, 2.0f, 0.0f, 3.0f) )
+		, 0, -1, 0, 0
+		);
+
+	float4_check_uint32("cmplt"
+		, float4_cmplt(float4_ld(1.0f, 2.0f, 3.0f, 4.0f), float4_ld(0.0f, 2.0f, 0.0f, 3.0f) )
+		, 0, 0, 0, 0
+		);
+
+	float4_check_uint32("cmple"
+		, float4_cmple(float4_ld(1.0f, 2.0f, 3.0f, 4.0f), float4_ld(0.0f, 2.0f, 0.0f, 3.0f) )
+		, 0, -1, 0, 0
+		);
+
+	float4_check_uint32("cmpgt"
+		, float4_cmpgt(float4_ld(1.0f, 2.0f, 3.0f, 4.0f), float4_ld(0.0f, 2.0f, 0.0f, 3.0f) )
+		, -1, 0, -1, -1
+		);
+
+	float4_check_uint32("cmpge"
+		, float4_cmpge(float4_ld(1.0f, 2.0f, 3.0f, 4.0f), float4_ld(0.0f, 2.0f, 0.0f, 3.0f) )
+		, -1, -1, -1, -1
+		);
+
+	// Integer comparisons, all on (0, 1, 2, 3) versus (0, -2, 1, 3).
+	float4_check_uint32("icmpeq"
+		, float4_icmpeq(float4_ild(0, 1, 2, 3), float4_ild(0, -2, 1, 3) )
+		, -1, 0, 0, -1
+		);
+
+	float4_check_uint32("icmplt"
+		, float4_icmplt(float4_ild(0, 1, 2, 3), float4_ild(0, -2, 1, 3) )
+		, 0, 0, 0, 0
+		);
+
+	float4_check_uint32("icmpgt"
+		, float4_icmpgt(float4_ild(0, 1, 2, 3), float4_ild(0, -2, 1, 3) )
+		, 0, -1, -1, 0
+		);
+}
+
+// Checks the load/splat/zero constructors by reading the lanes back and
+// comparing them with the values that were passed in.
+TEST(float4_load)
+{
+	float4_check_float("ld"
+		, float4_ld(0.0f, 1.0f, 2.0f, 3.0f)
+		, 0.0f, 1.0f, 2.0f, 3.0f
+		);
+
+	float4_check_int32("ild"
+		, float4_ild(-1, 0, 1, 2)
+		, -1, 0, 1, 2
+		);
+
+	float4_check_int32("ild"
+		, float4_ild(-1, -2, -3, -4)
+		, -1, -2, -3, -4
+		);
+
+	float4_check_uint32("zero", float4_zero()
+		, 0, 0, 0, 0
+		);
+
+	float4_check_uint32("isplat", float4_isplat(0x80000001)
+		, 0x80000001, 0x80000001, 0x80000001, 0x80000001
+		);
+
+	// Labelled "splat" so the log line names the function actually under test.
+	float4_check_float("splat", float4_splat(1.0f)
+		, 1.0f, 1.0f, 1.0f, 1.0f
+		);
+}
+
+// Checks shifts, bitwise logic and min/max (integer and float variants) by
+// comparing every lane of the result with a hand-computed value.
+TEST(float4)
+{
+	// 0x80000001 has the top and bottom bit set, which tells logical and
+	// arithmetic shifts apart.
+	const float4_t isplat = float4_isplat(0x80000001);
+	float4_check_uint32("sll"
+		, float4_sll(isplat, 1)
+		, 0x00000002, 0x00000002, 0x00000002, 0x00000002
+		);
+
+	float4_check_uint32("srl"
+		, float4_srl(isplat, 1)
+		, 0x40000000, 0x40000000, 0x40000000, 0x40000000
+		);
+
+	float4_check_uint32("sra"
+		, float4_sra(isplat, 1)
+		, 0xc0000000, 0xc0000000, 0xc0000000, 0xc0000000
+		);
+
+	float4_check_uint32("and"
+		, float4_and(float4_isplat(0x55555555), float4_isplat(0xaaaaaaaa) )
+		, 0, 0, 0, 0
+		);
+
+	float4_check_uint32("or "
+		, float4_or(float4_isplat(0x55555555), float4_isplat(0xaaaaaaaa) )
+		, -1, -1, -1, -1
+		);
+
+	// Exercise xor itself rather than or; for these complementary bit patterns
+	// the expected result is all bits set as well.
+	// NOTE(review): assumes bx/float4_t.h provides float4_xor - confirm.
+	float4_check_uint32("xor"
+		, float4_xor(float4_isplat(0x55555555), float4_isplat(0xaaaaaaaa) )
+		, -1, -1, -1, -1
+		);
+
+	float4_check_int32("imin"
+		, float4_imin(float4_ild(0, 1, 2, 3), float4_ild(-1, 2, -2, 1) )
+		, -1, 1, -2, 1
+		);
+
+	float4_check_float("min"
+		, float4_min(float4_ld(0.0f, 1.0f, 2.0f, 3.0f), float4_ld(-1.0f, 2.0f, -2.0f, 1.0f) )
+		, -1.0f, 1.0f, -2.0f, 1.0f
+		);
+
+	float4_check_int32("imax"
+		, float4_imax(float4_ild(0, 1, 2, 3), float4_ild(-1, 2, -2, 1) )
+		, 0, 2, 2, 3
+		);
+
+	float4_check_float("max"
+		, float4_max(float4_ld(0.0f, 1.0f, 2.0f, 3.0f), float4_ld(-1.0f, 2.0f, -2.0f, 1.0f) )
+		, 0.0f, 2.0f, 2.0f, 3.0f
+		);
+}

+ 17 - 2
tests/main.cpp

@@ -1,4 +1,9 @@
-/*-
+/*
+ * Copyright 2010-2013 Branimir Karadzic. All rights reserved.
+ * License: http://www.opensource.org/licenses/BSD-2-Clause
+ */
+
+/*
  * Copyright 2012 Matthew Endsley
  * All rights reserved
  *
@@ -26,6 +31,16 @@
 
 #include "test.h"
 
-int main() {
+#if BX_PLATFORM_ANDROID
+#include <android/native_activity.h>
+
+// Android build: runs all tests when the native activity is created and exits
+// the process with the value returned by RunAllTests().
+void ANativeActivity_onCreate(ANativeActivity*, void*, size_t)
+{
+	exit(UnitTest::RunAllTests() );
+}
+#else
+// All other platforms: the value returned by RunAllTests() becomes the process
+// exit code.
+int main()
+{
 	return UnitTest::RunAllTests();
 }
+#endif // BX_PLATFORM_ANDROID

+ 1 - 0
tests/test.h

@@ -8,6 +8,7 @@
 
 #include <bx/bx.h>
 #include <UnitTest++.h>
+#include "dbg.h"
 
 #if !BX_COMPILER_MSVC
 #	define _strdup strdup