Prechádzať zdrojové kódy

Renaming float4_t to simd128_t.

Branimir Karadžić 9 rokov pred
rodič
commit
224e990ee4

+ 0 - 562
include/bx/float4_neon.h

@@ -1,562 +0,0 @@
-/*
- * Copyright 2010-2016 Branimir Karadzic. All rights reserved.
- * License: https://github.com/bkaradzic/bx#license-bsd-2-clause
- */
-
-#ifndef BX_FLOAT4_NEON_H_HEADER_GUARD
-#define BX_FLOAT4_NEON_H_HEADER_GUARD
-
-#define float4_rcp           float4_rcp_ni
-#define float4_orx           float4_orx_ni
-#define float4_orc           float4_orc_ni
-#define float4_neg           float4_neg_ni
-#define float4_madd          float4_madd_ni
-#define float4_nmsub         float4_nmsub_ni
-#define float4_div_nr        float4_div_nr_ni
-#define float4_div           float4_div_nr_ni
-#define float4_selb          float4_selb_ni
-#define float4_sels          float4_sels_ni
-#define float4_not           float4_not_ni
-#define float4_abs           float4_abs_ni
-#define float4_clamp         float4_clamp_ni
-#define float4_lerp          float4_lerp_ni
-#define float4_rsqrt         float4_rsqrt_ni
-#define float4_rsqrt_nr      float4_rsqrt_nr_ni
-#define float4_rsqrt_carmack float4_rsqrt_carmack_ni
-#define float4_sqrt_nr       float4_sqrt_nr_ni
-#define float4_sqrt          float4_sqrt_nr_ni
-#define float4_log2          float4_log2_ni
-#define float4_exp2          float4_exp2_ni
-#define float4_pow           float4_pow_ni
-#define float4_cross3        float4_cross3_ni
-#define float4_normalize3    float4_normalize3_ni
-#define float4_dot3          float4_dot3_ni
-#define float4_dot           float4_dot_ni
-#define float4_ceil          float4_ceil_ni
-#define float4_floor         float4_floor_ni
-
-#include "float4_ni.h"
-
-namespace bx
-{
-#define ELEMx 0
-#define ELEMy 1
-#define ELEMz 2
-#define ELEMw 3
-#define IMPLEMENT_SWIZZLE(_x, _y, _z, _w) \
-			template<> \
-			BX_FLOAT4_FORCE_INLINE float4_neon_t float4_swiz_##_x##_y##_z##_w(float4_neon_t _a) \
-			{ \
-				return __builtin_shuffle(_a, (uint32x4_t){ ELEM##_x, ELEM##_y, ELEM##_z, ELEM##_w }); \
-			}
-
-#include "float4_swizzle.inl"
-
-#undef IMPLEMENT_SWIZZLE
-#undef ELEMw
-#undef ELEMz
-#undef ELEMy
-#undef ELEMx
-
-#define IMPLEMENT_TEST(_xyzw, _swizzle) \
-			template<> \
-			BX_FLOAT4_FORCE_INLINE bool float4_test_any_##_xyzw(float4_neon_t _test) \
-			{ \
-				const float4_neon_t tmp0 = float4_swiz_##_swizzle(_test); \
-				return float4_test_any_ni(tmp0); \
-			} \
-			\
-			template<> \
-			BX_FLOAT4_FORCE_INLINE bool float4_test_all_##_xyzw(float4_neon_t _test) \
-			{ \
-				const float4_neon_t tmp0 = float4_swiz_##_swizzle(_test); \
-				return float4_test_all_ni(tmp0); \
-			}
-
-IMPLEMENT_TEST(x,   xxxx);
-IMPLEMENT_TEST(y,   yyyy);
-IMPLEMENT_TEST(xy,  xyyy);
-IMPLEMENT_TEST(z,   zzzz);
-IMPLEMENT_TEST(xz,  xzzz);
-IMPLEMENT_TEST(yz,  yzzz);
-IMPLEMENT_TEST(xyz, xyzz);
-IMPLEMENT_TEST(w,   wwww);
-IMPLEMENT_TEST(xw,  xwww);
-IMPLEMENT_TEST(yw,  ywww);
-IMPLEMENT_TEST(xyw, xyww);
-IMPLEMENT_TEST(zw,  zwww);
-IMPLEMENT_TEST(xzw, xzww);
-IMPLEMENT_TEST(yzw, yzww);
-#undef IMPLEMENT_TEST
-
-	template<>
-	BX_FLOAT4_FORCE_INLINE bool float4_test_any_xyzw(float4_neon_t _test)
-	{
-		return float4_test_any_ni(_test);
-	}
-
-	template<>
-	BX_FLOAT4_FORCE_INLINE bool float4_test_all_xyzw(float4_neon_t _test)
-	{
-		return float4_test_all_ni(_test);
-	}
-
-	template<>
-	BX_FLOAT4_FORCE_INLINE float4_neon_t float4_shuf_xyAB(float4_neon_t _a, float4_neon_t _b)
-	{
-		return __builtin_shuffle(_a, _b, (uint32x4_t){ 0, 1, 4, 5 });
-	}
-
-	template<>
-	BX_FLOAT4_FORCE_INLINE float4_neon_t float4_shuf_ABxy(float4_neon_t _a, float4_neon_t _b)
-	{
-		return __builtin_shuffle(_a, _b, (uint32x4_t){ 4, 5, 0, 1 });
-	}
-
-	template<>
-	BX_FLOAT4_FORCE_INLINE float4_neon_t float4_shuf_CDzw(float4_neon_t _a, float4_neon_t _b)
-	{
-		return __builtin_shuffle(_a, _b, (uint32x4_t){ 6, 7, 2, 3 });
-	}
-
-	template<>
-	BX_FLOAT4_FORCE_INLINE float4_neon_t float4_shuf_zwCD(float4_neon_t _a, float4_neon_t _b)
-	{
-		return __builtin_shuffle(_a, _b, (uint32x4_t){ 2, 3, 6, 7 });
-	}
-
-	template<>
-	BX_FLOAT4_FORCE_INLINE float4_neon_t float4_shuf_xAyB(float4_neon_t _a, float4_neon_t _b)
-	{
-		return __builtin_shuffle(_a, _b, (uint32x4_t){ 0, 4, 1, 5 });
-	}
-
-	template<>
-	BX_FLOAT4_FORCE_INLINE float4_neon_t float4_shuf_yBxA(float4_neon_t _a, float4_neon_t _b)
-	{
-		return __builtin_shuffle(_a, _b, (uint32x4_t){ 1, 5, 0, 4 });
-	}
-
-	template<>
-	BX_FLOAT4_FORCE_INLINE float4_neon_t float4_shuf_zCwD(float4_neon_t _a, float4_neon_t _b)
-	{
-		return __builtin_shuffle(_a, _b, (uint32x4_t){ 2, 6, 3, 7 });
-	}
-
-	template<>
-	BX_FLOAT4_FORCE_INLINE float4_neon_t float4_shuf_CzDw(float4_neon_t _a, float4_neon_t _b)
-	{
-		return __builtin_shuffle(_a, _b, (uint32x4_t){ 6, 2, 7, 3 });
-	}
-
-	template<>
-	BX_FLOAT4_FORCE_INLINE float float4_x(float4_neon_t _a)
-	{
-		return vgetq_lane_f32(_a, 0);
-	}
-
-	template<>
-	BX_FLOAT4_FORCE_INLINE float float4_y(float4_neon_t _a)
-	{
-		return vgetq_lane_f32(_a, 1);
-	}
-
-	template<>
-	BX_FLOAT4_FORCE_INLINE float float4_z(float4_neon_t _a)
-	{
-		return vgetq_lane_f32(_a, 2);
-	}
-
-	template<>
-	BX_FLOAT4_FORCE_INLINE float float4_w(float4_neon_t _a)
-	{
-		return vgetq_lane_f32(_a, 3);
-	}
-
-	template<>
-	BX_FLOAT4_FORCE_INLINE float4_neon_t float4_ld(const void* _ptr)
-	{
-		return vld1q_f32( (const float32_t*)_ptr);
-	}
-
-	template<>
-	BX_FLOAT4_FORCE_INLINE void float4_st(void* _ptr, float4_neon_t _a)
-	{
-		vst1q_f32( (float32_t*)_ptr, _a);
-	}
-
-	template<>
-	BX_FLOAT4_FORCE_INLINE void float4_stx(void* _ptr, float4_neon_t _a)
-	{
-		vst1q_lane_f32( (float32_t*)_ptr, _a, 0);
-	}
-
-	template<>
-	BX_FLOAT4_FORCE_INLINE void float4_stream(void* _ptr, float4_neon_t _a)
-	{
-		vst1q_f32( (float32_t*)_ptr, _a);
-	}
-
-	template<>
-	BX_FLOAT4_FORCE_INLINE float4_neon_t float4_ld(float _x, float _y, float _z, float _w)
-	{
-		const float32_t val[4] = {_x, _y, _z, _w};
-		return float4_ld<float4_neon_t>(val);
-	}
-
-	template<>
-	BX_FLOAT4_FORCE_INLINE float4_neon_t float4_ild(uint32_t _x, uint32_t _y, uint32_t _z, uint32_t _w)
-	{
-		const uint32_t   val[4]    = {_x, _y, _z, _w};
-		const uint32x4_t tmp       = vld1q_u32(val);
-		const float4_neon_t result = vreinterpretq_f32_u32(tmp);
-
-		return result;
-	}
-
-	template<>
-	BX_FLOAT4_FORCE_INLINE float4_neon_t float4_splat(const void* _ptr)
-	{
-		const float4_neon_t tmp0   = vld1q_f32( (const float32_t*)_ptr);
-		const float32x2_t   tmp1   = vget_low_f32(tmp0);
-		const float4_neon_t result = vdupq_lane_f32(tmp1, 0);
-
-		return result;
-	}
-
-	template<>
-	BX_FLOAT4_FORCE_INLINE float4_neon_t float4_splat(float _a)
-	{
-		return vdupq_n_f32(_a);
-	}
-
-	template<>
-	BX_FLOAT4_FORCE_INLINE float4_neon_t float4_isplat(uint32_t _a)
-	{
-		const int32x4_t tmp    = vdupq_n_s32(_a);
-		const float4_neon_t  result = vreinterpretq_f32_s32(tmp);
-
-		return result;
-	}
-
-	template<>
-	BX_FLOAT4_FORCE_INLINE float4_neon_t float4_zero()
-	{
-		return float4_isplat<float4_neon_t>(0);
-	}
-
-	template<>
-	BX_FLOAT4_FORCE_INLINE float4_neon_t float4_itof(float4_neon_t _a)
-	{
-		const int32x4_t itof   = vreinterpretq_s32_f32(_a);
-		const float4_neon_t  result = vcvtq_f32_s32(itof);
-
-		return result;
-	}
-
-	template<>
-	BX_FLOAT4_FORCE_INLINE float4_neon_t float4_ftoi(float4_neon_t _a)
-	{
-		const int32x4_t ftoi  = vcvtq_s32_f32(_a);
-		const float4_neon_t result = vreinterpretq_f32_s32(ftoi);
-
-		return result;
-	}
-
-	template<>
-	BX_FLOAT4_FORCE_INLINE float4_neon_t float4_add(float4_neon_t _a, float4_neon_t _b)
-	{
-		return vaddq_f32(_a, _b);
-	}
-
-	template<>
-	BX_FLOAT4_FORCE_INLINE float4_neon_t float4_sub(float4_neon_t _a, float4_neon_t _b)
-	{
-		return vsubq_f32(_a, _b);
-	}
-
-	template<>
-	BX_FLOAT4_FORCE_INLINE float4_neon_t float4_mul(float4_neon_t _a, float4_neon_t _b)
-	{
-		return vmulq_f32(_a, _b);
-	}
-
-	template<>
-	BX_FLOAT4_FORCE_INLINE float4_neon_t float4_rcp_est(float4_neon_t _a)
-	{
-		return vrecpeq_f32(_a);
-	}
-
-	template<>
-	BX_FLOAT4_FORCE_INLINE float4_neon_t float4_rsqrt_est(float4_neon_t _a)
-	{
-		return vrsqrteq_f32(_a);
-	}
-
-	template<>
-	BX_FLOAT4_FORCE_INLINE float4_neon_t float4_cmpeq(float4_neon_t _a, float4_neon_t _b)
-	{
-		const uint32x4_t tmp    = vceqq_f32(_a, _b);
-		const float4_neon_t   result = vreinterpretq_f32_u32(tmp);
-
-		return result;
-	}
-
-	template<>
-	BX_FLOAT4_FORCE_INLINE float4_neon_t float4_cmplt(float4_neon_t _a, float4_neon_t _b)
-	{
-		const uint32x4_t tmp    = vcltq_f32(_a, _b);
-		const float4_neon_t   result = vreinterpretq_f32_u32(tmp);
-
-		return result;
-	}
-
-	template<>
-	BX_FLOAT4_FORCE_INLINE float4_neon_t float4_cmple(float4_neon_t _a, float4_neon_t _b)
-	{
-		const uint32x4_t tmp    = vcleq_f32(_a, _b);
-		const float4_neon_t   result = vreinterpretq_f32_u32(tmp);
-
-		return result;
-	}
-
-	template<>
-	BX_FLOAT4_FORCE_INLINE float4_neon_t float4_cmpgt(float4_neon_t _a, float4_neon_t _b)
-	{
-		const uint32x4_t tmp    = vcgtq_f32(_a, _b);
-		const float4_neon_t   result = vreinterpretq_f32_u32(tmp);
-
-		return result;
-	}
-
-	template<>
-	BX_FLOAT4_FORCE_INLINE float4_neon_t float4_cmpge(float4_neon_t _a, float4_neon_t _b)
-	{
-		const uint32x4_t tmp    = vcgeq_f32(_a, _b);
-		const float4_neon_t   result = vreinterpretq_f32_u32(tmp);
-
-		return result;
-	}
-
-	template<>
-	BX_FLOAT4_FORCE_INLINE float4_neon_t float4_min(float4_neon_t _a, float4_neon_t _b)
-	{
-		return vminq_f32(_a, _b);
-	}
-
-	template<>
-	BX_FLOAT4_FORCE_INLINE float4_neon_t float4_max(float4_neon_t _a, float4_neon_t _b)
-	{
-		return vmaxq_f32(_a, _b);
-	}
-
-	template<>
-	BX_FLOAT4_FORCE_INLINE float4_neon_t float4_and(float4_neon_t _a, float4_neon_t _b)
-	{
-		const int32x4_t tmp0   = vreinterpretq_s32_f32(_a);
-		const int32x4_t tmp1   = vreinterpretq_s32_f32(_b);
-		const int32x4_t tmp2   = vandq_s32(tmp0, tmp1);
-		const float4_neon_t  result = vreinterpretq_f32_s32(tmp2);
-
-		return result;
-	}
-
-	template<>
-	BX_FLOAT4_FORCE_INLINE float4_neon_t float4_andc(float4_neon_t _a, float4_neon_t _b)
-	{
-		const int32x4_t tmp0   = vreinterpretq_s32_f32(_a);
-		const int32x4_t tmp1   = vreinterpretq_s32_f32(_b);
-		const int32x4_t tmp2   = vbicq_s32(tmp0, tmp1);
-		const float4_neon_t  result = vreinterpretq_f32_s32(tmp2);
-
-		return result;
-	}
-
-	template<>
-	BX_FLOAT4_FORCE_INLINE float4_neon_t float4_or(float4_neon_t _a, float4_neon_t _b)
-	{
-		const int32x4_t tmp0   = vreinterpretq_s32_f32(_a);
-		const int32x4_t tmp1   = vreinterpretq_s32_f32(_b);
-		const int32x4_t tmp2   = vorrq_s32(tmp0, tmp1);
-		const float4_neon_t  result = vreinterpretq_f32_s32(tmp2);
-
-		return result;
-	}
-
-	template<>
-	BX_FLOAT4_FORCE_INLINE float4_neon_t float4_xor(float4_neon_t _a, float4_neon_t _b)
-	{
-		const int32x4_t tmp0   = vreinterpretq_s32_f32(_a);
-		const int32x4_t tmp1   = vreinterpretq_s32_f32(_b);
-		const int32x4_t tmp2   = veorq_s32(tmp0, tmp1);
-		const float4_neon_t  result = vreinterpretq_f32_s32(tmp2);
-
-		return result;
-	}
-
-	template<>
-	BX_FLOAT4_FORCE_INLINE float4_neon_t float4_sll(float4_neon_t _a, int _count)
-	{
-		if (__builtin_constant_p(_count) )
-		{
-			const uint32x4_t tmp0   = vreinterpretq_u32_f32(_a);
-			const uint32x4_t tmp1   = vshlq_n_u32(tmp0, _count);
-			const float4_neon_t   result = vreinterpretq_f32_u32(tmp1);
-
-			return result;
-		}
-
-		const uint32x4_t tmp0   = vreinterpretq_u32_f32(_a);
-		const int32x4_t  shift  = vdupq_n_s32(_count);
-		const uint32x4_t tmp1   = vshlq_u32(tmp0, shift);
-		const float4_neon_t   result = vreinterpretq_f32_u32(tmp1);
-
-		return result;
-	}
-
-	template<>
-	BX_FLOAT4_FORCE_INLINE float4_neon_t float4_srl(float4_neon_t _a, int _count)
-	{
-		if (__builtin_constant_p(_count) )
-		{
-			const uint32x4_t tmp0   = vreinterpretq_u32_f32(_a);
-			const uint32x4_t tmp1   = vshrq_n_u32(tmp0, _count);
-			const float4_neon_t   result = vreinterpretq_f32_u32(tmp1);
-
-			return result;
-		}
-
-		const uint32x4_t tmp0   = vreinterpretq_u32_f32(_a);
-		const int32x4_t  shift  = vdupq_n_s32(-_count);
-		const uint32x4_t tmp1   = vshlq_u32(tmp0, shift);
-		const float4_neon_t   result = vreinterpretq_f32_u32(tmp1);
-
-		return result;
-	}
-
-	template<>
-	BX_FLOAT4_FORCE_INLINE float4_neon_t float4_sra(float4_neon_t _a, int _count)
-	{
-		if (__builtin_constant_p(_count) )
-		{
-			const int32x4_t tmp0   = vreinterpretq_s32_f32(_a);
-			const int32x4_t tmp1   = vshrq_n_s32(tmp0, _count);
-			const float4_neon_t  result = vreinterpretq_f32_s32(tmp1);
-
-			return result;
-		}
-
-		const int32x4_t tmp0   = vreinterpretq_s32_f32(_a);
-		const int32x4_t shift  = vdupq_n_s32(-_count);
-		const int32x4_t tmp1   = vshlq_s32(tmp0, shift);
-		const float4_neon_t  result = vreinterpretq_f32_s32(tmp1);
-
-		return result;
-	}
-
-	template<>
-	BX_FLOAT4_FORCE_INLINE float4_neon_t float4_madd(float4_neon_t _a, float4_neon_t _b, float4_neon_t _c)
-	{
-		return vmlaq_f32(_c, _a, _b);
-	}
-
-	template<>
-	BX_FLOAT4_FORCE_INLINE float4_neon_t float4_nmsub(float4_neon_t _a, float4_neon_t _b, float4_neon_t _c)
-	{
-		return vmlsq_f32(_c, _a, _b);
-	}
-
-	template<>
-	BX_FLOAT4_FORCE_INLINE float4_neon_t float4_icmpeq(float4_neon_t _a, float4_neon_t _b)
-	{
-		const int32x4_t  tmp0   = vreinterpretq_s32_f32(_a);
-		const int32x4_t  tmp1   = vreinterpretq_s32_f32(_b);
-		const uint32x4_t tmp2   = vceqq_s32(tmp0, tmp1);
-		const float4_neon_t   result = vreinterpretq_f32_u32(tmp2);
-
-		return result;
-	}
-
-	template<>
-	BX_FLOAT4_FORCE_INLINE float4_neon_t float4_icmplt(float4_neon_t _a, float4_neon_t _b)
-	{
-		const int32x4_t  tmp0   = vreinterpretq_s32_f32(_a);
-		const int32x4_t  tmp1   = vreinterpretq_s32_f32(_b);
-		const uint32x4_t tmp2   = vcltq_s32(tmp0, tmp1);
-		const float4_neon_t   result = vreinterpretq_f32_u32(tmp2);
-
-		return result;
-	}
-
-	template<>
-	BX_FLOAT4_FORCE_INLINE float4_neon_t float4_icmpgt(float4_neon_t _a, float4_neon_t _b)
-	{
-		const int32x4_t  tmp0   = vreinterpretq_s32_f32(_a);
-		const int32x4_t  tmp1   = vreinterpretq_s32_f32(_b);
-		const uint32x4_t tmp2   = vcgtq_s32(tmp0, tmp1);
-		const float4_neon_t   result = vreinterpretq_f32_u32(tmp2);
-
-		return result;
-	}
-
-	template<>
-	BX_FLOAT4_FORCE_INLINE float4_neon_t float4_imin(float4_neon_t _a, float4_neon_t _b)
-	{
-		const int32x4_t tmp0   = vreinterpretq_s32_f32(_a);
-		const int32x4_t tmp1   = vreinterpretq_s32_f32(_b);
-		const int32x4_t tmp2   = vminq_s32(tmp0, tmp1);
-		const float4_neon_t  result = vreinterpretq_f32_s32(tmp2);
-
-		return result;
-	}
-
-	template<>
-	BX_FLOAT4_FORCE_INLINE float4_neon_t float4_imax(float4_neon_t _a, float4_neon_t _b)
-	{
-		const int32x4_t tmp0   = vreinterpretq_s32_f32(_a);
-		const int32x4_t tmp1   = vreinterpretq_s32_f32(_b);
-		const int32x4_t tmp2   = vmaxq_s32(tmp0, tmp1);
-		const float4_neon_t  result = vreinterpretq_f32_s32(tmp2);
-
-		return result;
-	}
-
-	template<>
-	BX_FLOAT4_FORCE_INLINE float4_neon_t float4_iadd(float4_neon_t _a, float4_neon_t _b)
-	{
-		const int32x4_t tmp0   = vreinterpretq_s32_f32(_a);
-		const int32x4_t tmp1   = vreinterpretq_s32_f32(_b);
-		const int32x4_t tmp2   = vaddq_s32(tmp0, tmp1);
-		const float4_neon_t  result = vreinterpretq_f32_s32(tmp2);
-
-		return result;
-	}
-
-	template<>
-	BX_FLOAT4_FORCE_INLINE float4_neon_t float4_isub(float4_neon_t _a, float4_neon_t _b)
-	{
-		const int32x4_t tmp0   = vreinterpretq_s32_f32(_a);
-		const int32x4_t tmp1   = vreinterpretq_s32_f32(_b);
-		const int32x4_t tmp2   = vsubq_s32(tmp0, tmp1);
-		const float4_neon_t  result = vreinterpretq_f32_s32(tmp2);
-
-		return result;
-	}
-
-	template<>
-	BX_FLOAT4_INLINE float4_neon_t float4_shuf_xAzC(float4_neon_t _a, float4_neon_t _b)
-	{
-		return float4_shuf_xAzC_ni(_a, _b);
-	}
-
-	template<>
-	BX_FLOAT4_INLINE float4_neon_t float4_shuf_yBwD(float4_neon_t _a, float4_neon_t _b)
-	{
-		return float4_shuf_yBwD_ni(_a, _b);
-	}
-
-	typedef float4_neon_t float4_t;
-
-} // namespace bx
-
-#endif // BX_FLOAT4_NEON_H_HEADER_GUARD

+ 0 - 558
include/bx/float4_ni.h

@@ -1,558 +0,0 @@
-/*
- * Copyright 2010-2016 Branimir Karadzic. All rights reserved.
- * License: https://github.com/bkaradzic/bx#license-bsd-2-clause
- */
-
-#ifndef BX_FLOAT4_NI_H_HEADER_GUARD
-#define BX_FLOAT4_NI_H_HEADER_GUARD
-
-namespace bx
-{
-	template<typename Ty>
-	BX_FLOAT4_INLINE Ty float4_shuf_xAzC_ni(Ty _a, Ty _b)
-	{
-		const Ty xAyB   = float4_shuf_xAyB(_a, _b);
-		const Ty zCwD   = float4_shuf_zCwD(_a, _b);
-		const Ty result = float4_shuf_xyAB(xAyB, zCwD);
-
-		return result;
-	}
-
-	template<typename Ty>
-	BX_FLOAT4_INLINE Ty float4_shuf_yBwD_ni(Ty _a, Ty _b)
-	{
-		const Ty xAyB   = float4_shuf_xAyB(_a, _b);
-		const Ty zCwD   = float4_shuf_zCwD(_a, _b);
-		const Ty result = float4_shuf_zwCD(xAyB, zCwD);
-
-		return result;
-	}
-
-	template<typename Ty>
-	BX_FLOAT4_INLINE Ty float4_madd_ni(Ty _a, Ty _b, Ty _c)
-	{
-		const Ty mul    = float4_mul(_a, _b);
-		const Ty result = float4_add(mul, _c);
-
-		return result;
-	}
-
-	template<typename Ty>
-	BX_FLOAT4_INLINE Ty float4_nmsub_ni(Ty _a, Ty _b, Ty _c)
-	{
-		const Ty mul    = float4_mul(_a, _b);
-		const Ty result = float4_sub(_c, mul);
-
-		return result;
-	}
-
-	template<typename Ty>
-	BX_FLOAT4_INLINE Ty float4_div_nr_ni(Ty _a, Ty _b)
-	{
-		const Ty oneish  = float4_isplat<Ty>(0x3f800001);
-		const Ty est     = float4_rcp_est(_b);
-		const Ty iter0   = float4_mul(_a, est);
-		const Ty tmp1    = float4_nmsub(_b, est, oneish);
-		const Ty result  = float4_madd(tmp1, iter0, iter0);
-
-		return result;
-	}
-
-	template<typename Ty>
-	BX_FLOAT4_INLINE Ty float4_rcp_ni(Ty _a)
-	{
-		const Ty one    = float4_splat<Ty>(1.0f);
-		const Ty result = float4_div(one, _a);
-
-		return result;
-	}
-
-	template<typename Ty>
-	BX_FLOAT4_INLINE Ty float4_orx_ni(Ty _a)
-	{
-		const Ty zwxy   = float4_swiz_zwxy(_a);
-		const Ty tmp0   = float4_or(_a, zwxy);
-		const Ty tmp1   = float4_swiz_yyyy(_a);
-		const Ty tmp2   = float4_or(tmp0, tmp1);
-		const Ty mf000  = float4_ild<Ty>(UINT32_MAX, 0, 0, 0);
-		const Ty result = float4_and(tmp2, mf000);
-
-		return result;
-	}
-
-	template<typename Ty>
-	BX_FLOAT4_INLINE Ty float4_orc_ni(Ty _a, Ty _b)
-	{
-		const Ty aorb   = float4_or(_a, _b);
-		const Ty mffff  = float4_isplat<Ty>(UINT32_MAX);
-		const Ty result = float4_xor(aorb, mffff);
-
-		return result;
-	}
-
-	template<typename Ty>
-	BX_FLOAT4_INLINE Ty float4_neg_ni(Ty _a)
-	{
-		const Ty zero   = float4_zero<Ty>();
-		const Ty result = float4_sub(zero, _a);
-
-		return result;
-	}
-
-	template<typename Ty>
-	BX_FLOAT4_INLINE Ty float4_selb_ni(Ty _mask, Ty _a, Ty _b)
-	{
-		const Ty sel_a  = float4_and(_a, _mask);
-		const Ty sel_b  = float4_andc(_b, _mask);
-		const Ty result = float4_or(sel_a, sel_b);
-
-		return result;
-	}
-
-	template<typename Ty>
-	BX_FLOAT4_INLINE Ty float4_sels_ni(Ty _test, Ty _a, Ty _b)
-	{
-		const Ty mask   = float4_sra(_test, 31);
-		const Ty result = float4_selb(mask, _a, _b);
-
-		return result;
-	}
-
-	template<typename Ty>
-	BX_FLOAT4_INLINE Ty float4_not_ni(Ty _a)
-	{
-		const Ty mffff  = float4_isplat<Ty>(UINT32_MAX);
-		const Ty result = float4_xor(_a, mffff);
-
-		return result;
-	}
-
-	template<typename Ty>
-	BX_FLOAT4_INLINE Ty float4_min_ni(Ty _a, Ty _b)
-	{
-		const Ty mask   = float4_cmplt(_a, _b);
-		const Ty result = float4_selb(mask, _a, _b);
-
-		return result;
-	}
-
-	template<typename Ty>
-	BX_FLOAT4_INLINE Ty float4_max_ni(Ty _a, Ty _b)
-	{
-		const Ty mask   = float4_cmpgt(_a, _b);
-		const Ty result = float4_selb(mask, _a, _b);
-
-		return result;
-	}
-
-	template<typename Ty>
-	BX_FLOAT4_INLINE Ty float4_abs_ni(Ty _a)
-	{
-		const Ty a_neg  = float4_neg(_a);
-		const Ty result = float4_max(a_neg, _a);
-
-		return result;
-	}
-
-	template<typename Ty>
-	BX_FLOAT4_INLINE Ty float4_imin_ni(Ty _a, Ty _b)
-	{
-		const Ty mask   = float4_icmplt(_a, _b);
-		const Ty result = float4_selb(mask, _a, _b);
-
-		return result;
-	}
-
-	template<typename Ty>
-	BX_FLOAT4_INLINE Ty float4_imax_ni(Ty _a, Ty _b)
-	{
-		const Ty mask   = float4_icmpgt(_a, _b);
-		const Ty result = float4_selb(mask, _a, _b);
-
-		return result;
-	}
-
-	template<typename Ty>
-	BX_FLOAT4_INLINE Ty float4_clamp_ni(Ty _a, Ty _min, Ty _max)
-	{
-		const Ty tmp    = float4_min(_a, _max);
-		const Ty result = float4_max(tmp, _min);
-
-		return result;
-	}
-
-	template<typename Ty>
-	BX_FLOAT4_INLINE Ty float4_lerp_ni(Ty _a, Ty _b, Ty _s)
-	{
-		const Ty ba     = float4_sub(_b, _a);
-		const Ty result = float4_madd(_s, ba, _a);
-
-		return result;
-	}
-
-	template<typename Ty>
-	BX_FLOAT4_INLINE Ty float4_sqrt_nr_ni(Ty _a)
-	{
-		const Ty half   = float4_splat<Ty>(0.5f);
-		const Ty one    = float4_splat<Ty>(1.0f);
-		const Ty tmp0   = float4_rsqrt_est(_a);
-		const Ty tmp1   = float4_mul(tmp0, _a);
-		const Ty tmp2   = float4_mul(tmp1, half);
-		const Ty tmp3   = float4_nmsub(tmp0, tmp1, one);
-		const Ty result = float4_madd(tmp3, tmp2, tmp1);
-
-		return result;
-	}
-
-	template<typename Ty>
-	BX_FLOAT4_INLINE Ty float4_sqrt_nr1_ni(Ty _a)
-	{
-		const Ty half = float4_splat<Ty>(0.5f);
-
-		Ty result = _a;
-		for (uint32_t ii = 0; ii < 11; ++ii)
-		{
-			const Ty tmp1 = float4_div(_a, result);
-			const Ty tmp2 = float4_add(tmp1, result);
-			result              = float4_mul(tmp2, half);
-		}
-
-		return result;
-	}
-
-	template<typename Ty>
-	BX_FLOAT4_INLINE Ty float4_rsqrt_ni(Ty _a)
-	{
-		const Ty one    = float4_splat<Ty>(1.0f);
-		const Ty sqrt   = float4_sqrt(_a);
-		const Ty result = float4_div(one, sqrt);
-
-		return result;
-	}
-
-	template<typename Ty>
-	BX_FLOAT4_INLINE Ty float4_rsqrt_nr_ni(Ty _a)
-	{
-		const Ty rsqrt           = float4_rsqrt_est(_a);
-		const Ty iter0           = float4_mul(_a, rsqrt);
-		const Ty iter1           = float4_mul(iter0, rsqrt);
-		const Ty half            = float4_splat<Ty>(0.5f);
-		const Ty half_rsqrt      = float4_mul(half, rsqrt);
-		const Ty three           = float4_splat<Ty>(3.0f);
-		const Ty three_sub_iter1 = float4_sub(three, iter1);
-		const Ty result          = float4_mul(half_rsqrt, three_sub_iter1);
-
-		return result;
-	}
-
-	template<typename Ty>
-	BX_FLOAT4_INLINE Ty float4_rsqrt_carmack_ni(Ty _a)
-	{
-		const Ty half    = float4_splat<Ty>(0.5f);
-		const Ty ah      = float4_mul(half, _a);
-		const Ty ashift  = float4_sra(_a, 1);
-		const Ty magic   = float4_isplat<Ty>(0x5f3759df);
-		const Ty msuba   = float4_isub(magic, ashift);
-		const Ty msubasq = float4_mul(msuba, msuba);
-		const Ty tmp0    = float4_splat<Ty>(1.5f);
-		const Ty tmp1    = float4_mul(ah, msubasq);
-		const Ty tmp2    = float4_sub(tmp0, tmp1);
-		const Ty result  = float4_mul(msuba, tmp2);
-
-		return result;
-	}
-
-	namespace float4_logexp_detail
-	{
-		template<typename Ty>
-		BX_FLOAT4_INLINE Ty float4_poly1(Ty _a, float _b, float _c)
-		{
-			const Ty bbbb   = float4_splat<Ty>(_b);
-			const Ty cccc   = float4_splat<Ty>(_c);
-			const Ty result = float4_madd(cccc, _a, bbbb);
-
-			return result;
-		}
-
-		template<typename Ty>
-		BX_FLOAT4_INLINE Ty float4_poly2(Ty _a, float _b, float _c, float _d)
-		{
-			const Ty bbbb   = float4_splat<Ty>(_b);
-			const Ty poly   = float4_poly1(_a, _c, _d);
-			const Ty result = float4_madd(poly, _a, bbbb);
-
-			return result;
-		}
-
-		template<typename Ty>
-		BX_FLOAT4_INLINE Ty float4_poly3(Ty _a, float _b, float _c, float _d, float _e)
-		{
-			const Ty bbbb   = float4_splat<Ty>(_b);
-			const Ty poly   = float4_poly2(_a, _c, _d, _e);
-			const Ty result = float4_madd(poly, _a, bbbb);
-
-			return result;
-		}
-
-		template<typename Ty>
-		BX_FLOAT4_INLINE Ty float4_poly4(Ty _a, float _b, float _c, float _d, float _e, float _f)
-		{
-			const Ty bbbb   = float4_splat<Ty>(_b);
-			const Ty poly   = float4_poly3(_a, _c, _d, _e, _f);
-			const Ty result = float4_madd(poly, _a, bbbb);
-
-			return result;
-		}
-
-		template<typename Ty>
-		BX_FLOAT4_INLINE Ty float4_poly5(Ty _a, float _b, float _c, float _d, float _e, float _f, float _g)
-		{
-			const Ty bbbb   = float4_splat<Ty>(_b);
-			const Ty poly   = float4_poly4(_a, _c, _d, _e, _f, _g);
-			const Ty result = float4_madd(poly, _a, bbbb);
-
-			return result;
-		}
-
-		template<typename Ty>
-		BX_FLOAT4_INLINE Ty float4_logpoly(Ty _a)
-		{
-#if 1
-			const Ty result = float4_poly5(_a
-				, 3.11578814719469302614f, -3.32419399085241980044f
-				, 2.59883907202499966007f, -1.23152682416275988241f
-				, 0.318212422185251071475f, -0.0344359067839062357313f
-				);
-#elif 0
-			const Ty result = float4_poly4(_a
-				, 2.8882704548164776201f, -2.52074962577807006663f
-				, 1.48116647521213171641f, -0.465725644288844778798f
-				, 0.0596515482674574969533f
-				);
-#elif 0
-			const Ty result = float4_poly3(_a
-				, 2.61761038894603480148f, -1.75647175389045657003f
-				, 0.688243882994381274313f, -0.107254423828329604454f
-				);
-#else
-			const Ty result = float4_poly2(_a
-				, 2.28330284476918490682f, -1.04913055217340124191f
-				, 0.204446009836232697516f
-				);
-#endif
-
-			return result;
-		}
-
-		template<typename Ty>
-		BX_FLOAT4_INLINE Ty float4_exppoly(Ty _a)
-		{
-#if 1
-			const Ty result = float4_poly5(_a
-				, 9.9999994e-1f, 6.9315308e-1f
-				, 2.4015361e-1f, 5.5826318e-2f
-				, 8.9893397e-3f, 1.8775767e-3f
-				);
-#elif 0
-			const Ty result = float4_poly4(_a
-				, 1.0000026f, 6.9300383e-1f
-				, 2.4144275e-1f, 5.2011464e-2f
-				, 1.3534167e-2f
-				);
-#elif 0
-			const Ty result = float4_poly3(_a
-				, 9.9992520e-1f, 6.9583356e-1f
-				, 2.2606716e-1f, 7.8024521e-2f
-				);
-#else
-			const Ty result = float4_poly2(_a
-				, 1.0017247f, 6.5763628e-1f
-				, 3.3718944e-1f
-				);
-#endif // 0
-
-			return result;
-		}
-	} // namespace float4_internal
-
-	template<typename Ty>
-	BX_FLOAT4_INLINE Ty float4_log2_ni(Ty _a)
-	{
-		const Ty expmask  = float4_isplat<Ty>(0x7f800000);
-		const Ty mantmask = float4_isplat<Ty>(0x007fffff);
-		const Ty one      = float4_splat<Ty>(1.0f);
-
-		const Ty c127     = float4_isplat<Ty>(127);
-		const Ty aexp     = float4_and(_a, expmask);
-		const Ty aexpsr   = float4_srl(aexp, 23);
-		const Ty tmp0     = float4_isub(aexpsr, c127);
-		const Ty exp      = float4_itof(tmp0);
-
-		const Ty amask    = float4_and(_a, mantmask);
-		const Ty mant     = float4_or(amask, one);
-
-		const Ty poly     = float4_logexp_detail::float4_logpoly(mant);
-
-		const Ty mandiff  = float4_sub(mant, one);
-		const Ty result   = float4_madd(poly, mandiff, exp);
-
-		return result;
-	}
-
-	template<typename Ty>
-	BX_FLOAT4_INLINE Ty float4_exp2_ni(Ty _a)
-	{
-		const Ty min      = float4_splat<Ty>( 129.0f);
-		const Ty max      = float4_splat<Ty>(-126.99999f);
-		const Ty tmp0     = float4_min(_a, min);
-		const Ty aaaa     = float4_max(tmp0, max);
-
-		const Ty half     = float4_splat<Ty>(0.5f);
-		const Ty tmp2     = float4_sub(aaaa, half);
-		const Ty ipart    = float4_ftoi(tmp2);
-		const Ty iround   = float4_itof(ipart);
-		const Ty fpart    = float4_sub(aaaa, iround);
-
-		const Ty c127     = float4_isplat<Ty>(127);
-		const Ty tmp5     = float4_iadd(ipart, c127);
-		const Ty expipart = float4_sll(tmp5, 23);
-
-		const Ty expfpart = float4_logexp_detail::float4_exppoly(fpart);
-
-		const Ty result   = float4_mul(expipart, expfpart);
-
-		return result;
-	}
-
-	template<typename Ty>
-	BX_FLOAT4_INLINE Ty float4_pow_ni(Ty _a, Ty _b)
-	{
-		const Ty alog2  = float4_log2(_a);
-		const Ty alog2b = float4_mul(alog2, _b);
-		const Ty result = float4_exp2(alog2b);
-
-		return result;
-	}
-
-	template<typename Ty>
-	BX_FLOAT4_INLINE Ty float4_dot3_ni(Ty _a, Ty _b)
-	{
-		const Ty xyzw   = float4_mul(_a, _b);
-		const Ty xxxx   = float4_swiz_xxxx(xyzw);
-		const Ty yyyy   = float4_swiz_yyyy(xyzw);
-		const Ty zzzz   = float4_swiz_zzzz(xyzw);
-		const Ty tmp1   = float4_add(xxxx, yyyy);
-		const Ty result = float4_add(zzzz, tmp1);
-		return result;
-	}
-
-	template<typename Ty>
-	BX_FLOAT4_INLINE Ty float4_cross3_ni(Ty _a, Ty _b)
-	{
-		// a.yzx * b.zxy - a.zxy * b.yzx == (a * b.yzx - a.yzx * b).yzx
-#if 0
-		const Ty a_yzxw = float4_swiz_yzxw(_a);
-		const Ty a_zxyw = float4_swiz_zxyw(_a);
-		const Ty b_zxyw = float4_swiz_zxyw(_b);
-		const Ty b_yzxw = float4_swiz_yzxw(_b);
-		const Ty tmp    = float4_mul(a_yzxw, b_zxyw);
-		const Ty result = float4_nmsub(a_zxyw, b_yzxw, tmp);
-#else
-		const Ty a_yzxw = float4_swiz_yzxw(_a);
-		const Ty b_yzxw = float4_swiz_yzxw(_b);
-		const Ty tmp0   = float4_mul(_a, b_yzxw);
-		const Ty tmp1   = float4_nmsub(a_yzxw, _b, tmp0);
-		const Ty result = float4_swiz_yzxw(tmp1);
-#endif
-
-		return result;
-	}
-
-	template<typename Ty>
-	BX_FLOAT4_INLINE Ty float4_normalize3_ni(Ty _a)
-	{
-		const Ty dot3    = float4_dot3(_a, _a);
-		const Ty invSqrt = float4_rsqrt(dot3);
-		const Ty result  = float4_mul(_a, invSqrt);
-
-		return result;
-	}
-
-	template<typename Ty>
-	BX_FLOAT4_INLINE Ty float4_dot_ni(Ty _a, Ty _b)
-	{
-		const Ty xyzw   = float4_mul(_a, _b);
-		const Ty yzwx   = float4_swiz_yzwx(xyzw);
-		const Ty tmp0   = float4_add(xyzw, yzwx);
-		const Ty zwxy   = float4_swiz_zwxy(tmp0);
-		const Ty result = float4_add(tmp0, zwxy);
-
-		return result;
-	}
-
-	template<typename Ty>
-	BX_FLOAT4_INLINE Ty float4_ceil_ni(Ty _a)
-	{
-		const Ty tmp0   = float4_ftoi(_a);
-		const Ty tmp1   = float4_itof(tmp0);
-		const Ty mask   = float4_cmplt(tmp1, _a);
-		const Ty one    = float4_splat<Ty>(1.0f);
-		const Ty tmp2   = float4_and(one, mask);
-		const Ty result = float4_add(tmp1, tmp2);
-
-		return result;
-	}
-
-	template<typename Ty>
-	BX_FLOAT4_INLINE Ty float4_floor_ni(Ty _a)
-	{
-		const Ty tmp0   = float4_ftoi(_a);
-		const Ty tmp1   = float4_itof(tmp0);
-		const Ty mask   = float4_cmpgt(tmp1, _a);
-		const Ty one    = float4_splat<Ty>(1.0f);
-		const Ty tmp2   = float4_and(one, mask);
-		const Ty result = float4_sub(tmp1, tmp2);
-
-		return result;
-	}
-
-	template<typename Ty>
-	BX_FLOAT4_FORCE_INLINE Ty float4_round_ni(Ty _a)
-	{
-		const Ty tmp    = float4_ftoi(_a);
-		const Ty result = float4_itof(tmp);
-
-		return result;
-	}
-
-	template<typename Ty>
-	BX_FLOAT4_INLINE bool float4_test_any_ni(Ty _a)
-	{
-		const Ty mask   = float4_sra(_a, 31);
-		const Ty zwxy   = float4_swiz_zwxy(mask);
-		const Ty tmp0   = float4_or(mask, zwxy);
-		const Ty tmp1   = float4_swiz_yyyy(tmp0);
-		const Ty tmp2   = float4_or(tmp0, tmp1);
-		int res;
-		float4_stx(&res, tmp2);
-		return 0 != res;
-	}
-
-	template<typename Ty>
-	BX_FLOAT4_INLINE bool float4_test_all_ni(Ty _a)
-	{
-		const Ty bits   = float4_sra(_a, 31);
-		const Ty m1248  = float4_ild<Ty>(1, 2, 4, 8);
-		const Ty mask   = float4_and(bits, m1248);
-		const Ty zwxy   = float4_swiz_zwxy(mask);
-		const Ty tmp0   = float4_or(mask, zwxy);
-		const Ty tmp1   = float4_swiz_yyyy(tmp0);
-		const Ty tmp2   = float4_or(tmp0, tmp1);
-		int res;
-		float4_stx(&res, tmp2);
-		return 0xf == res;
-	}
-
-} // namespace bx
-
-#endif // BX_FLOAT4_NI_H_HEADER_GUARD

+ 0 - 647
include/bx/float4_sse.h

@@ -1,647 +0,0 @@
-/*
- * Copyright 2010-2016 Branimir Karadzic. All rights reserved.
- * License: https://github.com/bkaradzic/bx#license-bsd-2-clause
- */
-
-#ifndef BX_FLOAT4_SSE_H_HEADER_GUARD
-#define BX_FLOAT4_SSE_H_HEADER_GUARD
-
-#include "float4_ni.h"
-
-namespace bx
-{
-#define ELEMx 0
-#define ELEMy 1
-#define ELEMz 2
-#define ELEMw 3
-#define IMPLEMENT_SWIZZLE(_x, _y, _z, _w) \
-			template<> \
-			BX_FLOAT4_FORCE_INLINE float4_sse_t float4_swiz_##_x##_y##_z##_w(float4_sse_t _a) \
-			{ \
-				return _mm_shuffle_ps( _a, _a, _MM_SHUFFLE(ELEM##_w, ELEM##_z, ELEM##_y, ELEM##_x ) ); \
-			}
-
-#include "float4_swizzle.inl"
-
-#undef IMPLEMENT_SWIZZLE
-#undef ELEMw
-#undef ELEMz
-#undef ELEMy
-#undef ELEMx
-
-#define IMPLEMENT_TEST(_xyzw, _mask) \
-			template<> \
-			BX_FLOAT4_FORCE_INLINE bool float4_test_any_##_xyzw(float4_sse_t _test) \
-			{ \
-				return 0x0 != (_mm_movemask_ps(_test)&(_mask) ); \
-			} \
-			\
-			template<> \
-			BX_FLOAT4_FORCE_INLINE bool float4_test_all_##_xyzw(float4_sse_t _test) \
-			{ \
-				return (_mask) == (_mm_movemask_ps(_test)&(_mask) ); \
-			}
-
-IMPLEMENT_TEST(x    , 0x1);
-IMPLEMENT_TEST(y    , 0x2);
-IMPLEMENT_TEST(xy   , 0x3);
-IMPLEMENT_TEST(z    , 0x4);
-IMPLEMENT_TEST(xz   , 0x5);
-IMPLEMENT_TEST(yz   , 0x6);
-IMPLEMENT_TEST(xyz  , 0x7);
-IMPLEMENT_TEST(w    , 0x8);
-IMPLEMENT_TEST(xw   , 0x9);
-IMPLEMENT_TEST(yw   , 0xa);
-IMPLEMENT_TEST(xyw  , 0xb);
-IMPLEMENT_TEST(zw   , 0xc);
-IMPLEMENT_TEST(xzw  , 0xd);
-IMPLEMENT_TEST(yzw  , 0xe);
-IMPLEMENT_TEST(xyzw , 0xf);
-
-#undef IMPLEMENT_TEST
-
-	template<>
-	BX_FLOAT4_FORCE_INLINE float4_sse_t float4_shuf_xyAB(float4_sse_t _a, float4_sse_t _b)
-	{
-		return _mm_movelh_ps(_a, _b);
-	}
-
-	template<>
-	BX_FLOAT4_FORCE_INLINE float4_sse_t float4_shuf_ABxy(float4_sse_t _a, float4_sse_t _b)
-	{
-		return _mm_movelh_ps(_b, _a);
-	}
-
-	template<>
-	BX_FLOAT4_FORCE_INLINE float4_sse_t float4_shuf_CDzw(float4_sse_t _a, float4_sse_t _b)
-	{
-		return _mm_movehl_ps(_a, _b);
-	}
-
-	template<>
-	BX_FLOAT4_FORCE_INLINE float4_sse_t float4_shuf_zwCD(float4_sse_t _a, float4_sse_t _b)
-	{
-		return _mm_movehl_ps(_b, _a);
-	}
-
-	template<>
-	BX_FLOAT4_FORCE_INLINE float4_sse_t float4_shuf_xAyB(float4_sse_t _a, float4_sse_t _b)
-	{
-		return _mm_unpacklo_ps(_a, _b);
-	}
-
-	template<>
-	BX_FLOAT4_FORCE_INLINE float4_sse_t float4_shuf_yBxA(float4_sse_t _a, float4_sse_t _b)
-	{
-		return _mm_unpacklo_ps(_b, _a);
-	}
-
-	template<>
-	BX_FLOAT4_FORCE_INLINE float4_sse_t float4_shuf_zCwD(float4_sse_t _a, float4_sse_t _b)
-	{
-		return _mm_unpackhi_ps(_a, _b);
-	}
-
-	template<>
-	BX_FLOAT4_FORCE_INLINE float4_sse_t float4_shuf_CzDw(float4_sse_t _a, float4_sse_t _b)
-	{
-		return _mm_unpackhi_ps(_b, _a);
-	}
-
-	template<>
-	BX_FLOAT4_FORCE_INLINE float float4_x(float4_sse_t _a)
-	{
-		return _mm_cvtss_f32(_a);
-	}
-
-	template<>
-	BX_FLOAT4_FORCE_INLINE float float4_y(float4_sse_t _a)
-	{
-		const float4_sse_t yyyy = float4_swiz_yyyy(_a);
-		const float result  = _mm_cvtss_f32(yyyy);
-
-		return result;
-	}
-
-	template<>
-	BX_FLOAT4_FORCE_INLINE float float4_z(float4_sse_t _a)
-	{
-		const float4_sse_t zzzz = float4_swiz_zzzz(_a);
-		const float result  = _mm_cvtss_f32(zzzz);
-
-		return result;
-	}
-
-	template<>
-	BX_FLOAT4_FORCE_INLINE float float4_w(float4_sse_t _a)
-	{
-		const float4_sse_t wwww = float4_swiz_wwww(_a);
-		const float result  = _mm_cvtss_f32(wwww);
-
-		return result;
-	}
-
-	template<>
-	BX_FLOAT4_FORCE_INLINE float4_sse_t float4_ld(const void* _ptr)
-	{
-		return _mm_load_ps(reinterpret_cast<const float*>(_ptr) );
-	}
-
-	template<>
-	BX_FLOAT4_FORCE_INLINE void float4_st(void* _ptr, float4_sse_t _a)
-	{
-		_mm_store_ps(reinterpret_cast<float*>(_ptr), _a);
-	}
-
-	template<>
-	BX_FLOAT4_FORCE_INLINE void float4_stx(void* _ptr, float4_sse_t _a)
-	{
-		_mm_store_ss(reinterpret_cast<float*>(_ptr), _a);
-	}
-
-	template<>
-	BX_FLOAT4_FORCE_INLINE void float4_stream(void* _ptr, float4_sse_t _a)
-	{
-		_mm_stream_ps(reinterpret_cast<float*>(_ptr), _a);
-	}
-
-	template<>
-	BX_FLOAT4_FORCE_INLINE float4_sse_t float4_ld(float _x, float _y, float _z, float _w)
-	{
-		return _mm_set_ps(_w, _z, _y, _x);
-	}
-
-	template<>
-	BX_FLOAT4_FORCE_INLINE float4_sse_t float4_ild(uint32_t _x, uint32_t _y, uint32_t _z, uint32_t _w)
-	{
-		const __m128i set     = _mm_set_epi32(_w, _z, _y, _x);
-		const float4_sse_t result = _mm_castsi128_ps(set);
-
-		return result;
-	}
-
-	template<>
-	BX_FLOAT4_FORCE_INLINE float4_sse_t float4_splat(const void* _ptr)
-	{
-		const float4_sse_t x___   = _mm_load_ss(reinterpret_cast<const float*>(_ptr) );
-		const float4_sse_t result = float4_swiz_xxxx(x___);
-
-		return result;
-	}
-
-	template<>
-	BX_FLOAT4_FORCE_INLINE float4_sse_t float4_splat(float _a)
-	{
-		return _mm_set1_ps(_a);
-	}
-
-	template<>
-	BX_FLOAT4_FORCE_INLINE float4_sse_t float4_isplat(uint32_t _a)
-	{
-		const __m128i splat   = _mm_set1_epi32(_a);
-		const float4_sse_t result = _mm_castsi128_ps(splat);
-
-		return result;
-	}
-
-	template<>
-	BX_FLOAT4_FORCE_INLINE float4_sse_t float4_zero()
-	{
-		return _mm_setzero_ps();
-	}
-
-	template<>
-	BX_FLOAT4_FORCE_INLINE float4_sse_t float4_itof(float4_sse_t _a)
-	{
-		const __m128i  itof   = _mm_castps_si128(_a);
-		const float4_sse_t result = _mm_cvtepi32_ps(itof);
-
-		return result;
-	}
-
-	template<>
-	BX_FLOAT4_FORCE_INLINE float4_sse_t float4_ftoi(float4_sse_t _a)
-	{
-		const __m128i ftoi    = _mm_cvtps_epi32(_a);
-		const float4_sse_t result = _mm_castsi128_ps(ftoi);
-
-		return result;
-	}
-
-	template<>
-	BX_FLOAT4_FORCE_INLINE float4_sse_t float4_round(float4_sse_t _a)
-	{
-#if defined(__SSE4_1__)
-		return _mm_round_ps(_a, _MM_FROUND_NINT);
-#else
-		const __m128i round   = _mm_cvtps_epi32(_a);
-		const float4_sse_t result = _mm_cvtepi32_ps(round);
-
-		return result;
-#endif // defined(__SSE4_1__)
-	}
-
-	template<>
-	BX_FLOAT4_FORCE_INLINE float4_sse_t float4_add(float4_sse_t _a, float4_sse_t _b)
-	{
-		return _mm_add_ps(_a, _b);
-	}
-
-	template<>
-	BX_FLOAT4_FORCE_INLINE float4_sse_t float4_sub(float4_sse_t _a, float4_sse_t _b)
-	{
-		return _mm_sub_ps(_a, _b);
-	}
-
-	template<>
-	BX_FLOAT4_FORCE_INLINE float4_sse_t float4_mul(float4_sse_t _a, float4_sse_t _b)
-	{
-		return _mm_mul_ps(_a, _b);
-	}
-
-	template<>
-	BX_FLOAT4_FORCE_INLINE float4_sse_t float4_div(float4_sse_t _a, float4_sse_t _b)
-	{
-		return _mm_div_ps(_a, _b);
-	}
-
-	template<>
-	BX_FLOAT4_FORCE_INLINE float4_sse_t float4_rcp_est(float4_sse_t _a)
-	{
-		return _mm_rcp_ps(_a);
-	}
-
-	template<>
-	BX_FLOAT4_FORCE_INLINE float4_sse_t float4_sqrt(float4_sse_t _a)
-	{
-		return _mm_sqrt_ps(_a);
-	}
-
-	template<>
-	BX_FLOAT4_FORCE_INLINE float4_sse_t float4_rsqrt_est(float4_sse_t _a)
-	{
-		return _mm_rsqrt_ps(_a);
-	}
-
-	template<>
-	BX_FLOAT4_FORCE_INLINE float4_sse_t float4_dot3(float4_sse_t _a, float4_sse_t _b)
-	{
-#if defined(__SSE4_1__)
-		return _mm_dp_ps(_a, _b, 0x77);
-#else
-		return float4_dot3_ni(_a, _b);
-#endif // defined(__SSE4__)
-	}
-
-	template<>
-	BX_FLOAT4_FORCE_INLINE float4_sse_t float4_dot(float4_sse_t _a, float4_sse_t _b)
-	{
-#if defined(__SSE4_1__)
-		return _mm_dp_ps(_a, _b, 0xFF);
-#else
-		return float4_dot_ni(_a, _b);
-#endif // defined(__SSE4__)
-	}
-
-	template<>
-	BX_FLOAT4_FORCE_INLINE float4_sse_t float4_cmpeq(float4_sse_t _a, float4_sse_t _b)
-	{
-		return _mm_cmpeq_ps(_a, _b);
-	}
-
-	template<>
-	BX_FLOAT4_FORCE_INLINE float4_sse_t float4_cmplt(float4_sse_t _a, float4_sse_t _b)
-	{
-		return _mm_cmplt_ps(_a, _b);
-	}
-
-	template<>
-	BX_FLOAT4_FORCE_INLINE float4_sse_t float4_cmple(float4_sse_t _a, float4_sse_t _b)
-	{
-		return _mm_cmple_ps(_a, _b);
-	}
-
-	template<>
-	BX_FLOAT4_FORCE_INLINE float4_sse_t float4_cmpgt(float4_sse_t _a, float4_sse_t _b)
-	{
-		return _mm_cmpgt_ps(_a, _b);
-	}
-
-	template<>
-	BX_FLOAT4_FORCE_INLINE float4_sse_t float4_cmpge(float4_sse_t _a, float4_sse_t _b)
-	{
-		return _mm_cmpge_ps(_a, _b);
-	}
-
-	template<>
-	BX_FLOAT4_FORCE_INLINE float4_sse_t float4_min(float4_sse_t _a, float4_sse_t _b)
-	{
-		return _mm_min_ps(_a, _b);
-	}
-
-	template<>
-	BX_FLOAT4_FORCE_INLINE float4_sse_t float4_max(float4_sse_t _a, float4_sse_t _b)
-	{
-		return _mm_max_ps(_a, _b);
-	}
-
-	template<>
-	BX_FLOAT4_FORCE_INLINE float4_sse_t float4_and(float4_sse_t _a, float4_sse_t _b)
-	{
-		return _mm_and_ps(_a, _b);
-	}
-
-	template<>
-	BX_FLOAT4_FORCE_INLINE float4_sse_t float4_andc(float4_sse_t _a, float4_sse_t _b)
-	{
-		return _mm_andnot_ps(_b, _a);
-	}
-
-	template<>
-	BX_FLOAT4_FORCE_INLINE float4_sse_t float4_or(float4_sse_t _a, float4_sse_t _b)
-	{
-		return _mm_or_ps(_a, _b);
-	}
-
-	template<>
-	BX_FLOAT4_FORCE_INLINE float4_sse_t float4_xor(float4_sse_t _a, float4_sse_t _b)
-	{
-		return _mm_xor_ps(_a, _b);
-	}
-
-	template<>
-	BX_FLOAT4_FORCE_INLINE float4_sse_t float4_sll(float4_sse_t _a, int _count)
-	{
-		const __m128i a       = _mm_castps_si128(_a);
-		const __m128i shift   = _mm_slli_epi32(a, _count);
-		const float4_sse_t result = _mm_castsi128_ps(shift);
-
-		return result;
-	}
-
-	template<>
-	BX_FLOAT4_FORCE_INLINE float4_sse_t float4_srl(float4_sse_t _a, int _count)
-	{
-		const __m128i a       = _mm_castps_si128(_a);
-		const __m128i shift   = _mm_srli_epi32(a, _count);
-		const float4_sse_t result = _mm_castsi128_ps(shift);
-
-		return result;
-	}
-
-	template<>
-	BX_FLOAT4_FORCE_INLINE float4_sse_t float4_sra(float4_sse_t _a, int _count)
-	{
-		const __m128i a       = _mm_castps_si128(_a);
-		const __m128i shift   = _mm_srai_epi32(a, _count);
-		const float4_sse_t result = _mm_castsi128_ps(shift);
-
-		return result;
-	}
-
-	template<>
-	BX_FLOAT4_FORCE_INLINE float4_sse_t float4_icmpeq(float4_sse_t _a, float4_sse_t _b)
-	{
-		const __m128i tmp0    = _mm_castps_si128(_a);
-		const __m128i tmp1    = _mm_castps_si128(_b);
-		const __m128i tmp2    = _mm_cmpeq_epi32(tmp0, tmp1);
-		const float4_sse_t result = _mm_castsi128_ps(tmp2);
-
-		return result;
-	}
-
-	template<>
-	BX_FLOAT4_FORCE_INLINE float4_sse_t float4_icmplt(float4_sse_t _a, float4_sse_t _b)
-	{
-		const __m128i tmp0    = _mm_castps_si128(_a);
-		const __m128i tmp1    = _mm_castps_si128(_b);
-		const __m128i tmp2    = _mm_cmplt_epi32(tmp0, tmp1);
-		const float4_sse_t result = _mm_castsi128_ps(tmp2);
-
-		return result;
-	}
-
-	template<>
-	BX_FLOAT4_FORCE_INLINE float4_sse_t float4_icmpgt(float4_sse_t _a, float4_sse_t _b)
-	{
-		const __m128i tmp0    = _mm_castps_si128(_a);
-		const __m128i tmp1    = _mm_castps_si128(_b);
-		const __m128i tmp2    = _mm_cmpgt_epi32(tmp0, tmp1);
-		const float4_sse_t result = _mm_castsi128_ps(tmp2);
-
-		return result;
-	}
-
-	template<>
-	BX_FLOAT4_FORCE_INLINE float4_sse_t float4_imin(float4_sse_t _a, float4_sse_t _b)
-	{
-#if defined(__SSE4_1__)
-		const __m128i tmp0    = _mm_castps_si128(_a);
-		const __m128i tmp1    = _mm_castps_si128(_b);
-		const __m128i tmp2    = _mm_min_epi32(tmp0, tmp1);
-		const float4_sse_t result = _mm_castsi128_ps(tmp2);
-
-		return result;
-#else
-		return float4_imin_ni(_a, _b);
-#endif // defined(__SSE4_1__)
-	}
-
-	template<>
-	BX_FLOAT4_FORCE_INLINE float4_sse_t float4_imax(float4_sse_t _a, float4_sse_t _b)
-	{
-#if defined(__SSE4_1__)
-		const __m128i tmp0    = _mm_castps_si128(_a);
-		const __m128i tmp1    = _mm_castps_si128(_b);
-		const __m128i tmp2    = _mm_max_epi32(tmp0, tmp1);
-		const float4_sse_t result = _mm_castsi128_ps(tmp2);
-
-		return result;
-#else
-		return float4_imax_ni(_a, _b);
-#endif // defined(__SSE4_1__)
-	}
-
-	template<>
-	BX_FLOAT4_FORCE_INLINE float4_sse_t float4_iadd(float4_sse_t _a, float4_sse_t _b)
-	{
-		const __m128i a       = _mm_castps_si128(_a);
-		const __m128i b       = _mm_castps_si128(_b);
-		const __m128i add     = _mm_add_epi32(a, b);
-		const float4_sse_t result = _mm_castsi128_ps(add);
-
-		return result;
-	}
-
-	template<>
-	BX_FLOAT4_FORCE_INLINE float4_sse_t float4_isub(float4_sse_t _a, float4_sse_t _b)
-	{
-		const __m128i a       = _mm_castps_si128(_a);
-		const __m128i b       = _mm_castps_si128(_b);
-		const __m128i sub     = _mm_sub_epi32(a, b);
-		const float4_sse_t result = _mm_castsi128_ps(sub);
-
-		return result;
-	}
-
-	template<>
-	BX_FLOAT4_INLINE float4_sse_t float4_shuf_xAzC(float4_sse_t _a, float4_sse_t _b)
-	{
-		return float4_shuf_xAzC_ni(_a, _b);
-	}
-
-	template<>
-	BX_FLOAT4_INLINE float4_sse_t float4_shuf_yBwD(float4_sse_t _a, float4_sse_t _b)
-	{
-		return float4_shuf_yBwD_ni(_a, _b);
-	}
-
-	template<>
-	BX_FLOAT4_INLINE float4_sse_t float4_rcp(float4_sse_t _a)
-	{
-		return float4_rcp_ni(_a);
-	}
-
-	template<>
-	BX_FLOAT4_INLINE float4_sse_t float4_orx(float4_sse_t _a)
-	{
-		return float4_orx_ni(_a);
-	}
-
-	template<>
-	BX_FLOAT4_INLINE float4_sse_t float4_orc(float4_sse_t _a, float4_sse_t _b)
-	{
-		return float4_orc_ni(_a, _b);
-	}
-
-	template<>
-	BX_FLOAT4_INLINE float4_sse_t float4_neg(float4_sse_t _a)
-	{
-		return float4_neg_ni(_a);
-	}
-
-	template<>
-	BX_FLOAT4_INLINE float4_sse_t float4_madd(float4_sse_t _a, float4_sse_t _b, float4_sse_t _c)
-	{
-		return float4_madd_ni(_a, _b, _c);
-	}
-
-	template<>
-	BX_FLOAT4_INLINE float4_sse_t float4_nmsub(float4_sse_t _a, float4_sse_t _b, float4_sse_t _c)
-	{
-		return float4_nmsub_ni(_a, _b, _c);
-	}
-
-	template<>
-	BX_FLOAT4_INLINE float4_sse_t float4_div_nr(float4_sse_t _a, float4_sse_t _b)
-	{
-		return float4_div_nr_ni(_a, _b);
-	}
-
-	template<>
-	BX_FLOAT4_INLINE float4_sse_t float4_selb(float4_sse_t _mask, float4_sse_t _a, float4_sse_t _b)
-	{
-		return float4_selb_ni(_mask, _a, _b);
-	}
-
-	template<>
-	BX_FLOAT4_INLINE float4_sse_t float4_sels(float4_sse_t _test, float4_sse_t _a, float4_sse_t _b)
-	{
-		return float4_sels_ni(_test, _a, _b);
-	}
-
-	template<>
-	BX_FLOAT4_INLINE float4_sse_t float4_not(float4_sse_t _a)
-	{
-		return float4_not_ni(_a);
-	}
-
-	template<>
-	BX_FLOAT4_INLINE float4_sse_t float4_abs(float4_sse_t _a)
-	{
-		return float4_abs_ni(_a);
-	}
-
-	template<>
-	BX_FLOAT4_INLINE float4_sse_t float4_clamp(float4_sse_t _a, float4_sse_t _min, float4_sse_t _max)
-	{
-		return float4_clamp_ni(_a, _min, _max);
-	}
-
-	template<>
-	BX_FLOAT4_INLINE float4_sse_t float4_lerp(float4_sse_t _a, float4_sse_t _b, float4_sse_t _s)
-	{
-		return float4_lerp_ni(_a, _b, _s);
-	}
-
-	template<>
-	BX_FLOAT4_INLINE float4_sse_t float4_rsqrt(float4_sse_t _a)
-	{
-		return float4_rsqrt_ni(_a);
-	}
-
-	template<>
-	BX_FLOAT4_INLINE float4_sse_t float4_rsqrt_nr(float4_sse_t _a)
-	{
-		return float4_rsqrt_nr_ni(_a);
-	}
-
-	template<>
-	BX_FLOAT4_INLINE float4_sse_t float4_rsqrt_carmack(float4_sse_t _a)
-	{
-		return float4_rsqrt_carmack_ni(_a);
-	}
-
-	template<>
-	BX_FLOAT4_INLINE float4_sse_t float4_sqrt_nr(float4_sse_t _a)
-	{
-		return float4_sqrt_nr_ni(_a);
-	}
-
-	template<>
-	BX_FLOAT4_INLINE float4_sse_t float4_log2(float4_sse_t _a)
-	{
-		return float4_log2_ni(_a);
-	}
-
-	template<>
-	BX_FLOAT4_INLINE float4_sse_t float4_exp2(float4_sse_t _a)
-	{
-		return float4_exp2_ni(_a);
-	}
-
-	template<>
-	BX_FLOAT4_INLINE float4_sse_t float4_pow(float4_sse_t _a, float4_sse_t _b)
-	{
-		return float4_pow_ni(_a, _b);
-	}
-
-	template<>
-	BX_FLOAT4_INLINE float4_sse_t float4_cross3(float4_sse_t _a, float4_sse_t _b)
-	{
-		return float4_cross3_ni(_a, _b);
-	}
-
-	template<>
-	BX_FLOAT4_INLINE float4_sse_t float4_normalize3(float4_sse_t _a)
-	{
-		return float4_normalize3_ni(_a);
-	}
-
-	template<>
-	BX_FLOAT4_INLINE float4_sse_t float4_ceil(float4_sse_t _a)
-	{
-		return float4_ceil_ni(_a);
-	}
-
-	template<>
-	BX_FLOAT4_INLINE float4_sse_t float4_floor(float4_sse_t _a)
-	{
-		return float4_floor_ni(_a);
-	}
-
-	typedef float4_sse_t float4_t;
-
-} // namespace bx
-
-#endif // BX_FLOAT4_SSE_H_HEADER_GUARD

+ 0 - 436
include/bx/float4_t.h

@@ -1,436 +0,0 @@
-/*
- * Copyright 2010-2016 Branimir Karadzic. All rights reserved.
- * License: https://github.com/bkaradzic/bx#license-bsd-2-clause
- */
-
-#ifndef BX_FLOAT4_T_H_HEADER_GUARD
-#define BX_FLOAT4_T_H_HEADER_GUARD
-
-#include "bx.h"
-
-#define BX_FLOAT4_FORCE_INLINE BX_FORCE_INLINE
-#define BX_FLOAT4_INLINE inline
-
-#define BX_FLOAT4_SSE     0
-#define BX_FLOAT4_AVX     0
-#define BX_FLOAT4_NEON    0
-#define BX_FLOAT4_LANGEXT 0
-
-#if defined(__SSE2__) || (BX_COMPILER_MSVC && (BX_ARCH_64BIT || _M_IX86_FP >= 2) )
-#	include <emmintrin.h> // __m128i
-#	if defined(__SSE4_1__)
-#		include <smmintrin.h>
-#	endif // defined(__SSE4_1__)
-#	include <xmmintrin.h> // __m128
-#	undef  BX_FLOAT4_SSE
-#	define BX_FLOAT4_SSE 1
-
-namespace bx
-{
-	typedef __m128 float4_sse_t;
-
-} // namespace bx
-
-#elif defined(__ARM_NEON__) && !BX_COMPILER_CLANG
-#	include <arm_neon.h>
-#	undef  BX_FLOAT4_NEON
-#	define BX_FLOAT4_NEON 1
-
-namespace bx
-{
-	typedef float32x4_t float4_neon_t;
-
-} // namespace bx
-
-#elif BX_COMPILER_CLANG \
-		&& !BX_PLATFORM_EMSCRIPTEN \
-		&& !BX_PLATFORM_IOS \
-		&& BX_CLANG_HAS_EXTENSION(attribute_ext_vector_type)
-#	include <math.h>
-#	undef  BX_FLOAT4_LANGEXT
-#	define BX_FLOAT4_LANGEXT 1
-
-namespace bx
-{
-	union float4_langext_t
-	{
-		float    __attribute__((vector_size(16))) vf;
-		int32_t  __attribute__((vector_size(16))) vi;
-		uint32_t __attribute__((vector_size(16))) vu;
-		float    fxyzw[4];
-		int32_t  ixyzw[4];
-		uint32_t uxyzw[4];
-
-	};
-} // namespace bx
-#endif //
-
-namespace bx
-{
-	union float4_ref_t
-	{
-		float    fxyzw[4];
-		int32_t  ixyzw[4];
-		uint32_t uxyzw[4];
-
-	};
-} // namespace bx
-
-namespace bx
-{
-#define ELEMx 0
-#define ELEMy 1
-#define ELEMz 2
-#define ELEMw 3
-#define IMPLEMENT_SWIZZLE(_x, _y, _z, _w) \
-			template<typename Ty> \
-			BX_FLOAT4_FORCE_INLINE Ty float4_swiz_##_x##_y##_z##_w(Ty _a);
-#include "float4_swizzle.inl"
-
-#undef IMPLEMENT_SWIZZLE
-#undef ELEMw
-#undef ELEMz
-#undef ELEMy
-#undef ELEMx
-
-#define IMPLEMENT_TEST(_xyzw) \
-			template<typename Ty> \
-			BX_FLOAT4_FORCE_INLINE bool float4_test_any_##_xyzw(Ty _test); \
-			\
-			template<typename Ty> \
-			BX_FLOAT4_FORCE_INLINE bool float4_test_all_##_xyzw(Ty _test)
-
-IMPLEMENT_TEST(x   );
-IMPLEMENT_TEST(y   );
-IMPLEMENT_TEST(xy  );
-IMPLEMENT_TEST(z   );
-IMPLEMENT_TEST(xz  );
-IMPLEMENT_TEST(yz  );
-IMPLEMENT_TEST(xyz );
-IMPLEMENT_TEST(w   );
-IMPLEMENT_TEST(xw  );
-IMPLEMENT_TEST(yw  );
-IMPLEMENT_TEST(xyw );
-IMPLEMENT_TEST(zw  );
-IMPLEMENT_TEST(xzw );
-IMPLEMENT_TEST(yzw );
-IMPLEMENT_TEST(xyzw);
-#undef IMPLEMENT_TEST
-
-	template<typename Ty>
-	BX_FLOAT4_FORCE_INLINE Ty float4_shuf_xyAB(Ty _a, Ty _b);
-
-	template<typename Ty>
-	BX_FLOAT4_FORCE_INLINE Ty float4_shuf_ABxy(Ty _a, Ty _b);
-
-	template<typename Ty>
-	BX_FLOAT4_FORCE_INLINE Ty float4_shuf_CDzw(Ty _a, Ty _b);
-
-	template<typename Ty>
-	BX_FLOAT4_FORCE_INLINE Ty float4_shuf_zwCD(Ty _a, Ty _b);
-
-	template<typename Ty>
-	BX_FLOAT4_FORCE_INLINE Ty float4_shuf_xAyB(Ty _a, Ty _b);
-
-	template<typename Ty>
-	BX_FLOAT4_FORCE_INLINE Ty float4_shuf_yBxA(Ty _a, Ty _b);
-
-	template<typename Ty>
-	BX_FLOAT4_FORCE_INLINE Ty float4_shuf_zCwD(Ty _a, Ty _b);
-
-	template<typename Ty>
-	BX_FLOAT4_FORCE_INLINE Ty float4_shuf_CzDw(Ty _a, Ty _b);
-
-	template<typename Ty>
-	BX_FLOAT4_FORCE_INLINE float float4_x(Ty _a);
-
-	template<typename Ty>
-	BX_FLOAT4_FORCE_INLINE float float4_y(Ty _a);
-
-	template<typename Ty>
-	BX_FLOAT4_FORCE_INLINE float float4_z(Ty _a);
-
-	template<typename Ty>
-	BX_FLOAT4_FORCE_INLINE float float4_w(Ty _a);
-
-	template<typename Ty>
-	BX_FLOAT4_FORCE_INLINE Ty float4_ld(const void* _ptr);
-
-	template<typename Ty>
-	BX_FLOAT4_FORCE_INLINE void float4_st(void* _ptr, Ty _a);
-
-	template<typename Ty>
-	BX_FLOAT4_FORCE_INLINE void float4_stx(void* _ptr, Ty _a);
-
-	template<typename Ty>
-	BX_FLOAT4_FORCE_INLINE void float4_stream(void* _ptr, Ty _a);
-
-	template<typename Ty>
-	BX_FLOAT4_FORCE_INLINE Ty float4_ld(float _x, float _y, float _z, float _w);
-
-	template<typename Ty>
-	BX_FLOAT4_FORCE_INLINE Ty float4_ild(uint32_t _x, uint32_t _y, uint32_t _z, uint32_t _w);
-
-	template<typename Ty>
-	BX_FLOAT4_FORCE_INLINE Ty float4_splat(const void* _ptr);
-
-	template<typename Ty>
-	BX_FLOAT4_FORCE_INLINE Ty float4_splat(float _a);
-
-	template<typename Ty>
-	BX_FLOAT4_FORCE_INLINE Ty float4_isplat(uint32_t _a);
-
-	template<typename Ty>
-	BX_FLOAT4_FORCE_INLINE Ty float4_zero();
-
-	template<typename Ty>
-	BX_FLOAT4_FORCE_INLINE Ty float4_itof(Ty _a);
-
-	template<typename Ty>
-	BX_FLOAT4_FORCE_INLINE Ty float4_ftoi(Ty _a);
-
-	template<typename Ty>
-	BX_FLOAT4_FORCE_INLINE Ty float4_round(Ty _a);
-
-	template<typename Ty>
-	BX_FLOAT4_FORCE_INLINE Ty float4_add(Ty _a, Ty _b);
-
-	template<typename Ty>
-	BX_FLOAT4_FORCE_INLINE Ty float4_sub(Ty _a, Ty _b);
-
-	template<typename Ty>
-	BX_FLOAT4_FORCE_INLINE Ty float4_mul(Ty _a, Ty _b);
-
-	template<typename Ty>
-	BX_FLOAT4_FORCE_INLINE Ty float4_div(Ty _a, Ty _b);
-
-	template<typename Ty>
-	BX_FLOAT4_FORCE_INLINE Ty float4_rcp_est(Ty _a);
-
-	template<typename Ty>
-	BX_FLOAT4_FORCE_INLINE Ty float4_sqrt(Ty _a);
-
-	template<typename Ty>
-	BX_FLOAT4_FORCE_INLINE Ty float4_rsqrt_est(Ty _a);
-
-	template<typename Ty>
-	BX_FLOAT4_FORCE_INLINE Ty float4_dot3(Ty _a, Ty _b);
-
-	template<typename Ty>
-	BX_FLOAT4_FORCE_INLINE Ty float4_dot(Ty _a, Ty _b);
-
-	template<typename Ty>
-	BX_FLOAT4_FORCE_INLINE Ty float4_cmpeq(Ty _a, Ty _b);
-
-	template<typename Ty>
-	BX_FLOAT4_FORCE_INLINE Ty float4_cmplt(Ty _a, Ty _b);
-
-	template<typename Ty>
-	BX_FLOAT4_FORCE_INLINE Ty float4_cmple(Ty _a, Ty _b);
-
-	template<typename Ty>
-	BX_FLOAT4_FORCE_INLINE Ty float4_cmpgt(Ty _a, Ty _b);
-
-	template<typename Ty>
-	BX_FLOAT4_FORCE_INLINE Ty float4_cmpge(Ty _a, Ty _b);
-
-	template<typename Ty>
-	BX_FLOAT4_FORCE_INLINE Ty float4_min(Ty _a, Ty _b);
-
-	template<typename Ty>
-	BX_FLOAT4_FORCE_INLINE Ty float4_max(Ty _a, Ty _b);
-
-	template<typename Ty>
-	BX_FLOAT4_FORCE_INLINE Ty float4_and(Ty _a, Ty _b);
-
-	template<typename Ty>
-	BX_FLOAT4_FORCE_INLINE Ty float4_andc(Ty _a, Ty _b);
-
-	template<typename Ty>
-	BX_FLOAT4_FORCE_INLINE Ty float4_or(Ty _a, Ty _b);
-
-	template<typename Ty>
-	BX_FLOAT4_FORCE_INLINE Ty float4_xor(Ty _a, Ty _b);
-
-	template<typename Ty>
-	BX_FLOAT4_FORCE_INLINE Ty float4_sll(Ty _a, int _count);
-
-	template<typename Ty>
-	BX_FLOAT4_FORCE_INLINE Ty float4_srl(Ty _a, int _count);
-
-	template<typename Ty>
-	BX_FLOAT4_FORCE_INLINE Ty float4_sra(Ty _a, int _count);
-
-	template<typename Ty>
-	BX_FLOAT4_FORCE_INLINE Ty float4_icmpeq(Ty _a, Ty _b);
-
-	template<typename Ty>
-	BX_FLOAT4_FORCE_INLINE Ty float4_icmplt(Ty _a, Ty _b);
-
-	template<typename Ty>
-	BX_FLOAT4_FORCE_INLINE Ty float4_icmpgt(Ty _a, Ty _b);
-
-	template<typename Ty>
-	BX_FLOAT4_FORCE_INLINE Ty float4_imin(Ty _a, Ty _b);
-
-	template<typename Ty>
-	BX_FLOAT4_FORCE_INLINE Ty float4_imax(Ty _a, Ty _b);
-
-	template<typename Ty>
-	BX_FLOAT4_FORCE_INLINE Ty float4_iadd(Ty _a, Ty _b);
-
-	template<typename Ty>
-	BX_FLOAT4_FORCE_INLINE Ty float4_isub(Ty _a, Ty _b);
-
-	template<typename Ty>
-	BX_FLOAT4_INLINE Ty float4_shuf_xAzC(Ty _a, Ty _b);
-
-	template<typename Ty>
-	BX_FLOAT4_INLINE Ty float4_shuf_yBwD(Ty _a, Ty _b);
-
-	template<typename Ty>
-	BX_FLOAT4_INLINE Ty float4_rcp(Ty _a);
-
-	template<typename Ty>
-	BX_FLOAT4_INLINE Ty float4_orx(Ty _a);
-
-	template<typename Ty>
-	BX_FLOAT4_INLINE Ty float4_orc(Ty _a, Ty _b);
-
-	template<typename Ty>
-	BX_FLOAT4_INLINE Ty float4_neg(Ty _a);
-
-	template<typename Ty>
-	BX_FLOAT4_INLINE Ty float4_madd(Ty _a, Ty _b, Ty _c);
-
-	template<typename Ty>
-	BX_FLOAT4_INLINE Ty float4_nmsub(Ty _a, Ty _b, Ty _c);
-
-	template<typename Ty>
-	BX_FLOAT4_INLINE Ty float4_div_nr(Ty _a, Ty _b);
-
-	template<typename Ty>
-	BX_FLOAT4_INLINE Ty float4_selb(Ty _mask, Ty _a, Ty _b);
-
-	template<typename Ty>
-	BX_FLOAT4_INLINE Ty float4_sels(Ty _test, Ty _a, Ty _b);
-
-	template<typename Ty>
-	BX_FLOAT4_INLINE Ty float4_not(Ty _a);
-
-	template<typename Ty>
-	BX_FLOAT4_INLINE Ty float4_abs(Ty _a);
-
-	template<typename Ty>
-	BX_FLOAT4_INLINE Ty float4_clamp(Ty _a, Ty _min, Ty _max);
-
-	template<typename Ty>
-	BX_FLOAT4_INLINE Ty float4_lerp(Ty _a, Ty _b, Ty _s);
-
-	template<typename Ty>
-	BX_FLOAT4_INLINE Ty float4_rsqrt(Ty _a);
-
-	template<typename Ty>
-	BX_FLOAT4_INLINE Ty float4_rsqrt_nr(Ty _a);
-
-	template<typename Ty>
-	BX_FLOAT4_INLINE Ty float4_rsqrt_carmack(Ty _a);
-
-	template<typename Ty>
-	BX_FLOAT4_INLINE Ty float4_sqrt_nr(Ty _a);
-
-	template<typename Ty>
-	BX_FLOAT4_INLINE Ty float4_log2(Ty _a);
-
-	template<typename Ty>
-	BX_FLOAT4_INLINE Ty float4_exp2(Ty _a);
-
-	template<typename Ty>
-	BX_FLOAT4_INLINE Ty float4_pow(Ty _a, Ty _b);
-
-	template<typename Ty>
-	BX_FLOAT4_INLINE Ty float4_cross3(Ty _a, Ty _b);
-
-	template<typename Ty>
-	BX_FLOAT4_INLINE Ty float4_normalize3(Ty _a);
-
-	template<typename Ty>
-	BX_FLOAT4_INLINE Ty float4_ceil(Ty _a);
-
-	template<typename Ty>
-	BX_FLOAT4_INLINE Ty float4_floor(Ty _a);
-
-} // namespace bx
-
-#if BX_FLOAT4_SSE
-#	include "float4_sse.h"
-#endif // BX_FLOAT4_SSE
-
-#if BX_FLOAT4_NEON
-#	include "float4_neon.h"
-#endif // BX_FLOAT4_NEON
-
-#if BX_FLOAT4_LANGEXT
-#	include "float4_langext.h"
-#endif // BX_FLOAT4_LANGEXT
-
-#if !( BX_FLOAT4_SSE \
-	|| BX_FLOAT4_AVX \
-	|| BX_FLOAT4_NEON \
-	|| BX_FLOAT4_LANGEXT \
-	 )
-#	ifndef BX_FLOAT4_WARN_REFERENCE_IMPL
-#		define BX_FLOAT4_WARN_REFERENCE_IMPL 0
-#	endif // BX_FLOAT4_WARN_REFERENCE_IMPL
-
-#	if BX_FLOAT4_WARN_REFERENCE_IMPL
-#		pragma message("************************************\nUsing SIMD reference implementation!\n************************************")
-#	endif // BX_FLOAT4_WARN_REFERENCE_IMPL
-
-namespace bx
-{
-	typedef float4_ref_t float4_t;
-}
-#endif //
-
-#include "float4_ref.h"
-
-namespace bx
-{
-	BX_FLOAT4_FORCE_INLINE float4_t float4_zero()
-	{
-		return float4_zero<float4_t>();
-	}
-
-	BX_FLOAT4_FORCE_INLINE float4_t float4_ld(const void* _ptr)
-	{
-		return float4_ld<float4_t>(_ptr);
-	}
-
-	BX_FLOAT4_FORCE_INLINE float4_t float4_ld(float _x, float _y, float _z, float _w)
-	{
-		return float4_ld<float4_t>(_x, _y, _z, _w);
-	}
-
-	BX_FLOAT4_FORCE_INLINE float4_t float4_ild(uint32_t _x, uint32_t _y, uint32_t _z, uint32_t _w)
-	{
-		return float4_ild<float4_t>(_x, _y, _z, _w);
-	}
-
-	BX_FLOAT4_FORCE_INLINE float4_t float4_splat(const void* _ptr)
-	{
-		return float4_splat<float4_t>(_ptr);
-	}
-
-	BX_FLOAT4_FORCE_INLINE float4_t float4_splat(float _a)
-	{
-		return float4_splat<float4_t>(_a);
-	}
-
-	BX_FLOAT4_FORCE_INLINE float4_t float4_isplat(uint32_t _a)
-	{
-		return float4_isplat<float4_t>(_a);
-	}
-}
-
-#endif // BX_FLOAT4_T_H_HEADER_GUARD

+ 124 - 124
include/bx/float4x4_t.h

@@ -3,156 +3,156 @@
  * License: https://github.com/bkaradzic/bx#license-bsd-2-clause
  */
 
-#ifndef BX_FLOAT4X4_H_HEADER_GUARD
-#define BX_FLOAT4X4_H_HEADER_GUARD
+#ifndef BX_SIMDX4_H_HEADER_GUARD
+#define BX_SIMDX4_H_HEADER_GUARD
 
-#include "float4_t.h"
+#include "simd_t.h"
 
 namespace bx
 {
 	BX_ALIGN_DECL_16(struct) float4x4_t
 	{
-		float4_t col[4];
+		simd128_t col[4];
 	};
 
-	BX_FLOAT4_FORCE_INLINE float4_t float4_mul_xyz1(float4_t _a, const float4x4_t* _b)
+	BX_SIMD_FORCE_INLINE simd128_t simd_mul_xyz1(simd128_t _a, const float4x4_t* _b)
 	{
-		const float4_t xxxx   = float4_swiz_xxxx(_a);
-		const float4_t yyyy   = float4_swiz_yyyy(_a);
-		const float4_t zzzz   = float4_swiz_zzzz(_a);
-		const float4_t col0   = float4_mul(_b->col[0], xxxx);
-		const float4_t col1   = float4_mul(_b->col[1], yyyy);
-		const float4_t col2   = float4_madd(_b->col[2], zzzz, col0);
-		const float4_t col3   = float4_add(_b->col[3], col1);
-		const float4_t result = float4_add(col2, col3);
+		const simd128_t xxxx   = simd_swiz_xxxx(_a);
+		const simd128_t yyyy   = simd_swiz_yyyy(_a);
+		const simd128_t zzzz   = simd_swiz_zzzz(_a);
+		const simd128_t col0   = simd_mul(_b->col[0], xxxx);
+		const simd128_t col1   = simd_mul(_b->col[1], yyyy);
+		const simd128_t col2   = simd_madd(_b->col[2], zzzz, col0);
+		const simd128_t col3   = simd_add(_b->col[3], col1);
+		const simd128_t result = simd_add(col2, col3);
 
 		return result;
 	}
 
-	BX_FLOAT4_FORCE_INLINE float4_t float4_mul(float4_t _a, const float4x4_t* _b)
+	BX_SIMD_FORCE_INLINE simd128_t simd_mul(simd128_t _a, const float4x4_t* _b)
 	{
-		const float4_t xxxx   = float4_swiz_xxxx(_a);
-		const float4_t yyyy   = float4_swiz_yyyy(_a);
-		const float4_t zzzz   = float4_swiz_zzzz(_a);
-		const float4_t wwww   = float4_swiz_wwww(_a);
-		const float4_t col0   = float4_mul(_b->col[0], xxxx);
-		const float4_t col1   = float4_mul(_b->col[1], yyyy);
-		const float4_t col2   = float4_madd(_b->col[2], zzzz, col0);
-		const float4_t col3   = float4_madd(_b->col[3], wwww, col1);
-		const float4_t result = float4_add(col2, col3);
+		const simd128_t xxxx   = simd_swiz_xxxx(_a);
+		const simd128_t yyyy   = simd_swiz_yyyy(_a);
+		const simd128_t zzzz   = simd_swiz_zzzz(_a);
+		const simd128_t wwww   = simd_swiz_wwww(_a);
+		const simd128_t col0   = simd_mul(_b->col[0], xxxx);
+		const simd128_t col1   = simd_mul(_b->col[1], yyyy);
+		const simd128_t col2   = simd_madd(_b->col[2], zzzz, col0);
+		const simd128_t col3   = simd_madd(_b->col[3], wwww, col1);
+		const simd128_t result = simd_add(col2, col3);
 
 		return result;
 	}
 
-	BX_FLOAT4_INLINE void float4x4_mul(float4x4_t* __restrict _result, const float4x4_t* __restrict _a, const float4x4_t* __restrict _b)
+	BX_SIMD_INLINE void float4x4_mul(float4x4_t* __restrict _result, const float4x4_t* __restrict _a, const float4x4_t* __restrict _b)
 	{
-		_result->col[0] = float4_mul(_a->col[0], _b);
-		_result->col[1] = float4_mul(_a->col[1], _b);
-		_result->col[2] = float4_mul(_a->col[2], _b);
-		_result->col[3] = float4_mul(_a->col[3], _b);
+		_result->col[0] = simd_mul(_a->col[0], _b);
+		_result->col[1] = simd_mul(_a->col[1], _b);
+		_result->col[2] = simd_mul(_a->col[2], _b);
+		_result->col[3] = simd_mul(_a->col[3], _b);
 	}
 
-	BX_FLOAT4_FORCE_INLINE void float4x4_transpose(float4x4_t* __restrict _result, const float4x4_t* __restrict _mtx)
+	BX_SIMD_FORCE_INLINE void float4x4_transpose(float4x4_t* __restrict _result, const float4x4_t* __restrict _mtx)
 	{
-		const float4_t aibj = float4_shuf_xAyB(_mtx->col[0], _mtx->col[2]); // aibj
-		const float4_t emfn = float4_shuf_xAyB(_mtx->col[1], _mtx->col[3]); // emfn
-		const float4_t ckdl = float4_shuf_zCwD(_mtx->col[0], _mtx->col[2]); // ckdl
-		const float4_t gohp = float4_shuf_zCwD(_mtx->col[1], _mtx->col[3]); // gohp
-		_result->col[0] = float4_shuf_xAyB(aibj, emfn); // aeim
-		_result->col[1] = float4_shuf_zCwD(aibj, emfn); // bfjn
-		_result->col[2] = float4_shuf_xAyB(ckdl, gohp); // cgko
-		_result->col[3] = float4_shuf_zCwD(ckdl, gohp); // dhlp
+		const simd128_t aibj = simd_shuf_xAyB(_mtx->col[0], _mtx->col[2]); // aibj
+		const simd128_t emfn = simd_shuf_xAyB(_mtx->col[1], _mtx->col[3]); // emfn
+		const simd128_t ckdl = simd_shuf_zCwD(_mtx->col[0], _mtx->col[2]); // ckdl
+		const simd128_t gohp = simd_shuf_zCwD(_mtx->col[1], _mtx->col[3]); // gohp
+		_result->col[0] = simd_shuf_xAyB(aibj, emfn); // aeim
+		_result->col[1] = simd_shuf_zCwD(aibj, emfn); // bfjn
+		_result->col[2] = simd_shuf_xAyB(ckdl, gohp); // cgko
+		_result->col[3] = simd_shuf_zCwD(ckdl, gohp); // dhlp
 	}
 
-	BX_FLOAT4_INLINE void float4x4_inverse(float4x4_t* __restrict _result, const float4x4_t* __restrict _a)
+	BX_SIMD_INLINE void float4x4_inverse(float4x4_t* __restrict _result, const float4x4_t* __restrict _a)
 	{
-		const float4_t tmp0 = float4_shuf_xAzC(_a->col[0], _a->col[1]);
-		const float4_t tmp1 = float4_shuf_xAzC(_a->col[2], _a->col[3]);
-		const float4_t tmp2 = float4_shuf_yBwD(_a->col[0], _a->col[1]);
-		const float4_t tmp3 = float4_shuf_yBwD(_a->col[2], _a->col[3]);
-		const float4_t t0   = float4_shuf_xyAB(tmp0, tmp1);
-		const float4_t t1   = float4_shuf_xyAB(tmp3, tmp2);
-		const float4_t t2   = float4_shuf_zwCD(tmp0, tmp1);
-		const float4_t t3   = float4_shuf_zwCD(tmp3, tmp2);
-
-		const float4_t t23 = float4_mul(t2, t3);
-		const float4_t t23_yxwz = float4_swiz_yxwz(t23);
-		const float4_t t23_wzyx = float4_swiz_wzyx(t23);
-
-		float4_t cof0, cof1, cof2, cof3;
-
-		const float4_t zero = float4_zero();
-		cof0 = float4_nmsub(t1, t23_yxwz, zero);
-		cof0 = float4_madd(t1, t23_wzyx, cof0);
-
-		cof1 = float4_nmsub(t0, t23_yxwz, zero);
-		cof1 = float4_madd(t0, t23_wzyx, cof1);
-		cof1 = float4_swiz_zwxy(cof1);
-		
-		const float4_t t12 = float4_mul(t1, t2);
-		const float4_t t12_yxwz = float4_swiz_yxwz(t12);
-		const float4_t t12_wzyx = float4_swiz_wzyx(t12);
-		
-		cof0 = float4_madd(t3, t12_yxwz, cof0);
-		cof0 = float4_nmsub(t3, t12_wzyx, cof0);
-
-		cof3 = float4_mul(t0, t12_yxwz);
-		cof3 = float4_nmsub(t0, t12_wzyx, cof3);
-		cof3 = float4_swiz_zwxy(cof3);
-
-		const float4_t t1_zwxy = float4_swiz_zwxy(t1);
-		const float4_t t2_zwxy = float4_swiz_zwxy(t2);
-
-		const float4_t t13 = float4_mul(t1_zwxy, t3);
-		const float4_t t13_yxwz = float4_swiz_yxwz(t13);
-		const float4_t t13_wzyx = float4_swiz_wzyx(t13);
-
-		cof0 = float4_madd(t2_zwxy, t13_yxwz, cof0);
-		cof0 = float4_nmsub(t2_zwxy, t13_wzyx, cof0);
-
-		cof2 = float4_mul(t0, t13_yxwz);
-		cof2 = float4_nmsub(t0, t13_wzyx, cof2);
-		cof2 = float4_swiz_zwxy(cof2);
-
-		const float4_t t01 = float4_mul(t0, t1);
-		const float4_t t01_yxwz = float4_swiz_yxwz(t01);
-		const float4_t t01_wzyx = float4_swiz_wzyx(t01);
-
-		cof2 = float4_nmsub(t3, t01_yxwz, cof2);
-		cof2 = float4_madd(t3, t01_wzyx, cof2);
-
-		cof3 = float4_madd(t2_zwxy, t01_yxwz, cof3);
-		cof3 = float4_nmsub(t2_zwxy, t01_wzyx, cof3);
-
-		const float4_t t03 = float4_mul(t0, t3);
-		const float4_t t03_yxwz = float4_swiz_yxwz(t03);
-		const float4_t t03_wzyx = float4_swiz_wzyx(t03);
-
-		cof1 = float4_nmsub(t2_zwxy, t03_yxwz, cof1);
-		cof1 = float4_madd(t2_zwxy, t03_wzyx, cof1);
-
-		cof2 = float4_madd(t1, t03_yxwz, cof2);
-		cof2 = float4_nmsub(t1, t03_wzyx, cof2);
-
-		const float4_t t02 = float4_mul(t0, t2_zwxy);
-		const float4_t t02_yxwz = float4_swiz_yxwz(t02);
-		const float4_t t02_wzyx = float4_swiz_wzyx(t02);
-
-		cof1 = float4_madd(t3, t02_yxwz, cof1);
-		cof1 = float4_nmsub(t3, t02_wzyx, cof1);
-
-		cof3 = float4_nmsub(t1, t02_yxwz, cof3);
-		cof3 = float4_madd(t1, t02_wzyx, cof3);
-
-		const float4_t det    = float4_dot(t0, cof0);
-		const float4_t invdet = float4_rcp(det);
-
-		_result->col[0] = float4_mul(cof0, invdet);
-		_result->col[1] = float4_mul(cof1, invdet);
-		_result->col[2] = float4_mul(cof2, invdet);
-		_result->col[3] = float4_mul(cof3, invdet);
+		const simd128_t tmp0 = simd_shuf_xAzC(_a->col[0], _a->col[1]);
+		const simd128_t tmp1 = simd_shuf_xAzC(_a->col[2], _a->col[3]);
+		const simd128_t tmp2 = simd_shuf_yBwD(_a->col[0], _a->col[1]);
+		const simd128_t tmp3 = simd_shuf_yBwD(_a->col[2], _a->col[3]);
+		const simd128_t t0   = simd_shuf_xyAB(tmp0, tmp1);
+		const simd128_t t1   = simd_shuf_xyAB(tmp3, tmp2);
+		const simd128_t t2   = simd_shuf_zwCD(tmp0, tmp1);
+		const simd128_t t3   = simd_shuf_zwCD(tmp3, tmp2);
+
+		const simd128_t t23 = simd_mul(t2, t3);
+		const simd128_t t23_yxwz = simd_swiz_yxwz(t23);
+		const simd128_t t23_wzyx = simd_swiz_wzyx(t23);
+
+		simd128_t cof0, cof1, cof2, cof3;
+
+		const simd128_t zero = simd_zero();
+		cof0 = simd_nmsub(t1, t23_yxwz, zero);
+		cof0 = simd_madd(t1, t23_wzyx, cof0);
+
+		cof1 = simd_nmsub(t0, t23_yxwz, zero);
+		cof1 = simd_madd(t0, t23_wzyx, cof1);
+		cof1 = simd_swiz_zwxy(cof1);
+
+		const simd128_t t12 = simd_mul(t1, t2);
+		const simd128_t t12_yxwz = simd_swiz_yxwz(t12);
+		const simd128_t t12_wzyx = simd_swiz_wzyx(t12);
+
+		cof0 = simd_madd(t3, t12_yxwz, cof0);
+		cof0 = simd_nmsub(t3, t12_wzyx, cof0);
+
+		cof3 = simd_mul(t0, t12_yxwz);
+		cof3 = simd_nmsub(t0, t12_wzyx, cof3);
+		cof3 = simd_swiz_zwxy(cof3);
+
+		const simd128_t t1_zwxy = simd_swiz_zwxy(t1);
+		const simd128_t t2_zwxy = simd_swiz_zwxy(t2);
+
+		const simd128_t t13 = simd_mul(t1_zwxy, t3);
+		const simd128_t t13_yxwz = simd_swiz_yxwz(t13);
+		const simd128_t t13_wzyx = simd_swiz_wzyx(t13);
+
+		cof0 = simd_madd(t2_zwxy, t13_yxwz, cof0);
+		cof0 = simd_nmsub(t2_zwxy, t13_wzyx, cof0);
+
+		cof2 = simd_mul(t0, t13_yxwz);
+		cof2 = simd_nmsub(t0, t13_wzyx, cof2);
+		cof2 = simd_swiz_zwxy(cof2);
+
+		const simd128_t t01 = simd_mul(t0, t1);
+		const simd128_t t01_yxwz = simd_swiz_yxwz(t01);
+		const simd128_t t01_wzyx = simd_swiz_wzyx(t01);
+
+		cof2 = simd_nmsub(t3, t01_yxwz, cof2);
+		cof2 = simd_madd(t3, t01_wzyx, cof2);
+
+		cof3 = simd_madd(t2_zwxy, t01_yxwz, cof3);
+		cof3 = simd_nmsub(t2_zwxy, t01_wzyx, cof3);
+
+		const simd128_t t03 = simd_mul(t0, t3);
+		const simd128_t t03_yxwz = simd_swiz_yxwz(t03);
+		const simd128_t t03_wzyx = simd_swiz_wzyx(t03);
+
+		cof1 = simd_nmsub(t2_zwxy, t03_yxwz, cof1);
+		cof1 = simd_madd(t2_zwxy, t03_wzyx, cof1);
+
+		cof2 = simd_madd(t1, t03_yxwz, cof2);
+		cof2 = simd_nmsub(t1, t03_wzyx, cof2);
+
+		const simd128_t t02 = simd_mul(t0, t2_zwxy);
+		const simd128_t t02_yxwz = simd_swiz_yxwz(t02);
+		const simd128_t t02_wzyx = simd_swiz_wzyx(t02);
+
+		cof1 = simd_madd(t3, t02_yxwz, cof1);
+		cof1 = simd_nmsub(t3, t02_wzyx, cof1);
+
+		cof3 = simd_nmsub(t1, t02_yxwz, cof3);
+		cof3 = simd_madd(t1, t02_wzyx, cof3);
+
+		const simd128_t det    = simd_dot(t0, cof0);
+		const simd128_t invdet = simd_rcp(det);
+
+		_result->col[0] = simd_mul(cof0, invdet);
+		_result->col[1] = simd_mul(cof1, invdet);
+		_result->col[2] = simd_mul(cof2, invdet);
+		_result->col[3] = simd_mul(cof3, invdet);
 	}
 
 } // namespace bx
 
-#endif // BX_FLOAT4X4_H_HEADER_GUARD
+#endif // BX_SIMDX4_H_HEADER_GUARD

+ 107 - 106
include/bx/float4_langext.h → include/bx/simd128_langext.inl

@@ -3,8 +3,8 @@
  * License: https://github.com/bkaradzic/bx#license-bsd-2-clause
  */
 
-#ifndef BX_FLOAT4_LANGEXT_H_HEADER_GUARD
-#define BX_FLOAT4_LANGEXT_H_HEADER_GUARD
+#ifndef BX_SIMD_LANGEXT_H_HEADER_GUARD
+#define BX_SIMD_LANGEXT_H_HEADER_GUARD
 
 #define float4_rcp           float4_rcp_ni
 #define float4_orx           float4_orx_ni
@@ -37,7 +37,8 @@
 #define float4_max           float4_max_ni
 #define float4_imin          float4_imin_ni
 #define float4_imax          float4_imax_ni
-#include "float4_ni.h"
+
+#include "simd_ni.inl"
 
 namespace bx
 {
@@ -47,9 +48,9 @@ namespace bx
 #define ELEMw 3
 #define IMPLEMENT_SWIZZLE(_x, _y, _z, _w) \
 			template<> \
-			BX_FLOAT4_FORCE_INLINE float4_langext_t float4_swiz_##_x##_y##_z##_w(float4_langext_t _a) \
+			BX_SIMD_FORCE_INLINE simd_langext_t float4_swiz_##_x##_y##_z##_w(simd_langext_t _a) \
 			{ \
-				float4_langext_t result; \
+				simd_langext_t result; \
 				result.vf = __builtin_shufflevector(_a.vf, _a.vf, ELEM##_x, ELEM##_y, ELEM##_z, ELEM##_w); \
 				return result; \
 			}
@@ -64,7 +65,7 @@ namespace bx
 
 #define IMPLEMENT_TEST(_xyzw, _mask) \
 			template<> \
-			BX_FLOAT4_FORCE_INLINE bool float4_test_any_##_xyzw(float4_langext_t _test) \
+			BX_SIMD_FORCE_INLINE bool simd_test_any_##_xyzw(simd_langext_t _test) \
 			{ \
 				uint32_t tmp = ( (_test.uxyzw[3]>>31)<<3) \
 				             | ( (_test.uxyzw[2]>>31)<<2) \
@@ -75,7 +76,7 @@ namespace bx
 			} \
 			\
 			template<> \
-			BX_FLOAT4_FORCE_INLINE bool float4_test_all_##_xyzw(float4_langext_t _test) \
+			BX_SIMD_FORCE_INLINE bool simd_test_all_##_xyzw(simd_langext_t _test) \
 			{ \
 				uint32_t tmp = ( (_test.uxyzw[3]>>31)<<3) \
 				             | ( (_test.uxyzw[2]>>31)<<2) \
@@ -104,114 +105,114 @@ IMPLEMENT_TEST(xyzw , 0xf);
 #undef IMPLEMENT_TEST
 
 	template<>
-	BX_FLOAT4_FORCE_INLINE float4_langext_t float4_shuf_xyAB(float4_langext_t _a, float4_langext_t _b)
+	BX_SIMD_FORCE_INLINE simd_langext_t float4_shuf_xyAB(simd_langext_t _a, simd_langext_t _b)
 	{
-		float4_langext_t result;
+		simd_langext_t result;
 		result.vf = __builtin_shufflevector(_a.vf, _b.vf, 0, 1, 4, 5);
 		return result;
 	}
 
 	template<>
-	BX_FLOAT4_FORCE_INLINE float4_langext_t float4_shuf_ABxy(float4_langext_t _a, float4_langext_t _b)
+	BX_SIMD_FORCE_INLINE simd_langext_t float4_shuf_ABxy(simd_langext_t _a, simd_langext_t _b)
 	{
-		float4_langext_t result;
+		simd_langext_t result;
 		result.vf = __builtin_shufflevector(_a.vf, _b.vf, 4, 5, 0, 1);
 		return result;
 	}
 
 	template<>
-	BX_FLOAT4_FORCE_INLINE float4_langext_t float4_shuf_CDzw(float4_langext_t _a, float4_langext_t _b)
+	BX_SIMD_FORCE_INLINE simd_langext_t float4_shuf_CDzw(simd_langext_t _a, simd_langext_t _b)
 	{
-		float4_langext_t result;
+		simd_langext_t result;
 		result.vf = __builtin_shufflevector(_a.vf, _b.vf, 6, 7, 2, 3);
 		return result;
 	}
 
 	template<>
-	BX_FLOAT4_FORCE_INLINE float4_langext_t float4_shuf_zwCD(float4_langext_t _a, float4_langext_t _b)
+	BX_SIMD_FORCE_INLINE simd_langext_t float4_shuf_zwCD(simd_langext_t _a, simd_langext_t _b)
 	{
-		float4_langext_t result;
+		simd_langext_t result;
 		result.vf = __builtin_shufflevector(_a.vf, _b.vf, 2, 3, 6, 7);
 		return result;
 	}
 
 	template<>
-	BX_FLOAT4_FORCE_INLINE float4_langext_t float4_shuf_xAyB(float4_langext_t _a, float4_langext_t _b)
+	BX_SIMD_FORCE_INLINE simd_langext_t float4_shuf_xAyB(simd_langext_t _a, simd_langext_t _b)
 	{
-		float4_langext_t result;
+		simd_langext_t result;
 		result.vf = __builtin_shufflevector(_a.vf, _b.vf, 0, 4, 1, 5);
 		return result;
 	}
 
 	template<>
-	BX_FLOAT4_FORCE_INLINE float4_langext_t float4_shuf_yBxA(float4_langext_t _a, float4_langext_t _b)
+	BX_SIMD_FORCE_INLINE simd_langext_t float4_shuf_yBxA(simd_langext_t _a, simd_langext_t _b)
 	{
-		float4_langext_t result;
+		simd_langext_t result;
 		result.vf = __builtin_shufflevector(_a.vf, _b.vf, 1, 5, 0, 4);
 		return result;
 	}
 
 	template<>
-	BX_FLOAT4_FORCE_INLINE float4_langext_t float4_shuf_zCwD(float4_langext_t _a, float4_langext_t _b)
+	BX_SIMD_FORCE_INLINE simd_langext_t float4_shuf_zCwD(simd_langext_t _a, simd_langext_t _b)
 	{
-		float4_langext_t result;
+		simd_langext_t result;
 		result.vf = __builtin_shufflevector(_a.vf, _b.vf, 2, 6, 3, 7);
 		return result;
 	}
 
 	template<>
-	BX_FLOAT4_FORCE_INLINE float4_langext_t float4_shuf_CzDw(float4_langext_t _a, float4_langext_t _b)
+	BX_SIMD_FORCE_INLINE simd_langext_t float4_shuf_CzDw(simd_langext_t _a, simd_langext_t _b)
 	{
-		float4_langext_t result;
+		simd_langext_t result;
 		result.vf = __builtin_shufflevector(_a.vf, _b.vf, 6, 2, 7, 3);
 		return result;
 	}
 
 	template<>
-	BX_FLOAT4_FORCE_INLINE float4_langext_t float4_shuf_xAzC(float4_langext_t _a, float4_langext_t _b)
+	BX_SIMD_FORCE_INLINE simd_langext_t float4_shuf_xAzC(simd_langext_t _a, simd_langext_t _b)
 	{
-		float4_langext_t result;
+		simd_langext_t result;
 		result.vf = __builtin_shufflevector(_a.vf, _b.vf, 0, 4, 2, 6);
 		return result;
 	}
 
 	template<>
-	BX_FLOAT4_FORCE_INLINE float4_langext_t float4_shuf_yBwD(float4_langext_t _a, float4_langext_t _b)
+	BX_SIMD_FORCE_INLINE simd_langext_t float4_shuf_yBwD(simd_langext_t _a, simd_langext_t _b)
 	{
-		float4_langext_t result;
+		simd_langext_t result;
 		result.vf = __builtin_shufflevector(_a.vf, _b.vf, 1, 5, 3, 7);
 		return result;
 	}
 
 	template<>
-	BX_FLOAT4_FORCE_INLINE float float4_x(float4_langext_t _a)
+	BX_SIMD_FORCE_INLINE float float4_x(simd_langext_t _a)
 	{
 		return _a.fxyzw[0];
 	}
 
 	template<>
-	BX_FLOAT4_FORCE_INLINE float float4_y(float4_langext_t _a)
+	BX_SIMD_FORCE_INLINE float float4_y(simd_langext_t _a)
 	{
 		return _a.fxyzw[1];
 	}
 
 	template<>
-	BX_FLOAT4_FORCE_INLINE float float4_z(float4_langext_t _a)
+	BX_SIMD_FORCE_INLINE float float4_z(simd_langext_t _a)
 	{
 		return _a.fxyzw[2];
 	}
 
 	template<>
-	BX_FLOAT4_FORCE_INLINE float float4_w(float4_langext_t _a)
+	BX_SIMD_FORCE_INLINE float float4_w(simd_langext_t _a)
 	{
 		return _a.fxyzw[3];
 	}
 
 	template<>
-	BX_FLOAT4_FORCE_INLINE float4_langext_t float4_ld(const void* _ptr)
+	BX_SIMD_FORCE_INLINE simd_langext_t float4_ld(const void* _ptr)
 	{
 		const uint32_t* input = reinterpret_cast<const uint32_t*>(_ptr);
-		float4_langext_t result;
+		simd_langext_t result;
 		result.uxyzw[0] = input[0];
 		result.uxyzw[1] = input[1];
 		result.uxyzw[2] = input[2];
@@ -220,7 +221,7 @@ IMPLEMENT_TEST(xyzw , 0xf);
 	}
 
 	template<>
-	BX_FLOAT4_FORCE_INLINE void float4_st(void* _ptr, float4_langext_t _a)
+	BX_SIMD_FORCE_INLINE void float4_st(void* _ptr, simd_langext_t _a)
 	{
 		uint32_t* result = reinterpret_cast<uint32_t*>(_ptr);
 		result[0] = _a.uxyzw[0];
@@ -230,14 +231,14 @@ IMPLEMENT_TEST(xyzw , 0xf);
 	}
 
 	template<>
-	BX_FLOAT4_FORCE_INLINE void float4_stx(void* _ptr, float4_langext_t _a)
+	BX_SIMD_FORCE_INLINE void float4_stx(void* _ptr, simd_langext_t _a)
 	{
 		uint32_t* result = reinterpret_cast<uint32_t*>(_ptr);
 		result[0] = _a.uxyzw[0];
 	}
 
 	template<>
-	BX_FLOAT4_FORCE_INLINE void float4_stream(void* _ptr, float4_langext_t _a)
+	BX_SIMD_FORCE_INLINE void float4_stream(void* _ptr, simd_langext_t _a)
 	{
 		uint32_t* result = reinterpret_cast<uint32_t*>(_ptr);
 		result[0] = _a.uxyzw[0];
@@ -247,109 +248,109 @@ IMPLEMENT_TEST(xyzw , 0xf);
 	}
 
 	template<>
-	BX_FLOAT4_FORCE_INLINE float4_langext_t float4_ld(float _x, float _y, float _z, float _w)
+	BX_SIMD_FORCE_INLINE simd_langext_t float4_ld(float _x, float _y, float _z, float _w)
 	{
-		float4_langext_t result;
+		simd_langext_t result;
 		result.vf = (float __attribute__((vector_size(16)))){ _x, _y, _z, _w };
 		return result;
 	}
 
 	template<>
-	BX_FLOAT4_FORCE_INLINE float4_langext_t float4_ild(uint32_t _x, uint32_t _y, uint32_t _z, uint32_t _w)
+	BX_SIMD_FORCE_INLINE simd_langext_t float4_ild(uint32_t _x, uint32_t _y, uint32_t _z, uint32_t _w)
 	{
-		float4_langext_t result;
+		simd_langext_t result;
 		result.vu = (uint32_t __attribute__((vector_size(16)))){ _x, _y, _z, _w };
 		return result;
 	}
 
 	template<>
-	BX_FLOAT4_FORCE_INLINE float4_langext_t float4_splat(const void* _ptr)
+	BX_SIMD_FORCE_INLINE simd_langext_t float4_splat(const void* _ptr)
 	{
 		const uint32_t val = *reinterpret_cast<const uint32_t*>(_ptr);
-		float4_langext_t result;
+		simd_langext_t result;
 		result.vu = (uint32_t __attribute__((vector_size(16)))){ val, val, val, val };
 		return result;
 	}
 
 	template<>
-	BX_FLOAT4_FORCE_INLINE float4_langext_t float4_splat(float _a)
+	BX_SIMD_FORCE_INLINE simd_langext_t float4_splat(float _a)
 	{
-		return float4_ld<float4_langext_t>(_a, _a, _a, _a);
+		return float4_ld<simd_langext_t>(_a, _a, _a, _a);
 	}
 
 	template<>
-	BX_FLOAT4_FORCE_INLINE float4_langext_t float4_isplat(uint32_t _a)
+	BX_SIMD_FORCE_INLINE simd_langext_t float4_isplat(uint32_t _a)
 	{
-		return float4_ild<float4_langext_t>(_a, _a, _a, _a);
+		return float4_ild<simd_langext_t>(_a, _a, _a, _a);
 	}
 
 	template<>
-	BX_FLOAT4_FORCE_INLINE float4_langext_t float4_zero()
+	BX_SIMD_FORCE_INLINE simd_langext_t float4_zero()
 	{
-		return float4_ild<float4_langext_t>(0, 0, 0, 0);
+		return float4_ild<simd_langext_t>(0, 0, 0, 0);
 	}
 
 	template<>
-	BX_FLOAT4_FORCE_INLINE float4_langext_t float4_itof(float4_langext_t _a)
+	BX_SIMD_FORCE_INLINE simd_langext_t float4_itof(simd_langext_t _a)
 	{
-		float4_langext_t result;
+		simd_langext_t result;
 		result.vf = __builtin_convertvector(_a.vi, float __attribute__((vector_size(16))) );
 		return result;
 	}
 
 	template<>
-	BX_FLOAT4_FORCE_INLINE float4_langext_t float4_ftoi(float4_langext_t _a)
+	BX_SIMD_FORCE_INLINE simd_langext_t float4_ftoi(simd_langext_t _a)
 	{
-		float4_langext_t result;
+		simd_langext_t result;
 		result.vi = __builtin_convertvector(_a.vf, int32_t __attribute__((vector_size(16))) );
 		return result;
 	}
 
 	template<>
-	BX_FLOAT4_FORCE_INLINE float4_langext_t float4_round(float4_langext_t _a)
+	BX_SIMD_FORCE_INLINE simd_langext_t float4_round(simd_langext_t _a)
 	{
-		const float4_langext_t tmp    = float4_ftoi(_a);
-		const float4_langext_t result = float4_itof(tmp);
+		const simd_langext_t tmp    = float4_ftoi(_a);
+		const simd_langext_t result = float4_itof(tmp);
 
 		return result;
 	}
 
 	template<>
-	BX_FLOAT4_FORCE_INLINE float4_langext_t float4_add(float4_langext_t _a, float4_langext_t _b)
+	BX_SIMD_FORCE_INLINE simd_langext_t float4_add(simd_langext_t _a, simd_langext_t _b)
 	{
-		float4_langext_t result;
+		simd_langext_t result;
 		result.vf = _a.vf + _b.vf;
 		return result;
 	}
 
 	template<>
-	BX_FLOAT4_FORCE_INLINE float4_langext_t float4_sub(float4_langext_t _a, float4_langext_t _b)
+	BX_SIMD_FORCE_INLINE simd_langext_t float4_sub(simd_langext_t _a, simd_langext_t _b)
 	{
-		float4_langext_t result;
+		simd_langext_t result;
 		result.vf = _a.vf - _b.vf;
 		return result;
 	}
 
 	template<>
-	BX_FLOAT4_FORCE_INLINE float4_langext_t float4_mul(float4_langext_t _a, float4_langext_t _b)
+	BX_SIMD_FORCE_INLINE simd_langext_t float4_mul(simd_langext_t _a, simd_langext_t _b)
 	{
-		float4_langext_t result;
+		simd_langext_t result;
 		result.vf = _a.vf * _b.vf;
 		return result;
 	}
 
 	template<>
-	BX_FLOAT4_FORCE_INLINE float4_langext_t float4_div(float4_langext_t _a, float4_langext_t _b)
+	BX_SIMD_FORCE_INLINE simd_langext_t float4_div(simd_langext_t _a, simd_langext_t _b)
 	{
-		float4_langext_t result;
+		simd_langext_t result;
 		result.vf = _a.vf / _b.vf;
 		return result;
 	}
 
 	template<>
-	BX_FLOAT4_FORCE_INLINE float4_langext_t float4_sqrt(float4_langext_t _a)
+	BX_SIMD_FORCE_INLINE simd_langext_t float4_sqrt(simd_langext_t _a)
 	{
-		float4_langext_t result;
+		simd_langext_t result;
 		result.vf[0] = sqrtf(_a.vf[0]);
 		result.vf[1] = sqrtf(_a.vf[1]);
 		result.vf[2] = sqrtf(_a.vf[2]);
@@ -358,9 +359,9 @@ IMPLEMENT_TEST(xyzw , 0xf);
 	}
 
 	template<>
-	BX_FLOAT4_FORCE_INLINE float4_langext_t float4_rsqrt_est(float4_langext_t _a)
+	BX_SIMD_FORCE_INLINE simd_langext_t float4_rsqrt_est(simd_langext_t _a)
 	{
-		float4_langext_t result;
+		simd_langext_t result;
 		result.vf[0] = 1.0f / sqrtf(_a.vf[0]);
 		result.vf[1] = 1.0f / sqrtf(_a.vf[1]);
 		result.vf[2] = 1.0f / sqrtf(_a.vf[2]);
@@ -369,146 +370,146 @@ IMPLEMENT_TEST(xyzw , 0xf);
 	}
 
 	template<>
-	BX_FLOAT4_FORCE_INLINE float4_langext_t float4_cmpeq(float4_langext_t _a, float4_langext_t _b)
+	BX_SIMD_FORCE_INLINE simd_langext_t float4_cmpeq(simd_langext_t _a, simd_langext_t _b)
 	{
-		float4_langext_t result;
+		simd_langext_t result;
 		result.vi = _a.vf == _b.vf;
 		return result;
 	}
 
 	template<>
-	BX_FLOAT4_FORCE_INLINE float4_langext_t float4_cmplt(float4_langext_t _a, float4_langext_t _b)
+	BX_SIMD_FORCE_INLINE simd_langext_t float4_cmplt(simd_langext_t _a, simd_langext_t _b)
 	{
-		float4_langext_t result;
+		simd_langext_t result;
 		result.vi = _a.vf < _b.vf;
 		return result;
 	}
 
 	template<>
-	BX_FLOAT4_FORCE_INLINE float4_langext_t float4_cmple(float4_langext_t _a, float4_langext_t _b)
+	BX_SIMD_FORCE_INLINE simd_langext_t float4_cmple(simd_langext_t _a, simd_langext_t _b)
 	{
-		float4_langext_t result;
+		simd_langext_t result;
 		result.vi = _a.vf <= _b.vf;
 		return result;
 	}
 
 	template<>
-	BX_FLOAT4_FORCE_INLINE float4_langext_t float4_cmpgt(float4_langext_t _a, float4_langext_t _b)
+	BX_SIMD_FORCE_INLINE simd_langext_t float4_cmpgt(simd_langext_t _a, simd_langext_t _b)
 	{
-		float4_langext_t result;
+		simd_langext_t result;
 		result.vi = _a.vf > _b.vf;
 		return result;
 	}
 
 	template<>
-	BX_FLOAT4_FORCE_INLINE float4_langext_t float4_cmpge(float4_langext_t _a, float4_langext_t _b)
+	BX_SIMD_FORCE_INLINE simd_langext_t float4_cmpge(simd_langext_t _a, simd_langext_t _b)
 	{
-		float4_langext_t result;
+		simd_langext_t result;
 		result.vi = _a.vf >= _b.vf;
 		return result;
 	}
 
 	template<>
-	BX_FLOAT4_FORCE_INLINE float4_langext_t float4_and(float4_langext_t _a, float4_langext_t _b)
+	BX_SIMD_FORCE_INLINE simd_langext_t float4_and(simd_langext_t _a, simd_langext_t _b)
 	{
-		float4_langext_t result;
+		simd_langext_t result;
 		result.vu = _a.vu & _b.vu;
 		return result;
 	}
 
 	template<>
-	BX_FLOAT4_FORCE_INLINE float4_langext_t float4_andc(float4_langext_t _a, float4_langext_t _b)
+	BX_SIMD_FORCE_INLINE simd_langext_t float4_andc(simd_langext_t _a, simd_langext_t _b)
 	{
-		float4_langext_t result;
+		simd_langext_t result;
 		result.vu = _a.vu & ~_b.vu;
 		return result;
 	}
 
 	template<>
-	BX_FLOAT4_FORCE_INLINE float4_langext_t float4_or(float4_langext_t _a, float4_langext_t _b)
+	BX_SIMD_FORCE_INLINE simd_langext_t float4_or(simd_langext_t _a, simd_langext_t _b)
 	{
-		float4_langext_t result;
+		simd_langext_t result;
 		result.vu = _a.vu | _b.vu;
 		return result;
 	}
 
 	template<>
-	BX_FLOAT4_FORCE_INLINE float4_langext_t float4_xor(float4_langext_t _a, float4_langext_t _b)
+	BX_SIMD_FORCE_INLINE simd_langext_t float4_xor(simd_langext_t _a, simd_langext_t _b)
 	{
-		float4_langext_t result;
+		simd_langext_t result;
 		result.vu = _a.vu ^ _b.vu;
 		return result;
 	}
 
 	template<>
-	BX_FLOAT4_FORCE_INLINE float4_langext_t float4_sll(float4_langext_t _a, int _count)
+	BX_SIMD_FORCE_INLINE simd_langext_t float4_sll(simd_langext_t _a, int _count)
 	{
-		float4_langext_t result;
-		const float4_langext_t count = float4_isplat<float4_langext_t>(_count);
+		simd_langext_t result;
+		const simd_langext_t count = float4_isplat<simd_langext_t>(_count);
 		result.vu = _a.vu << count.vi;
 		return result;
 	}
 
 	template<>
-	BX_FLOAT4_FORCE_INLINE float4_langext_t float4_srl(float4_langext_t _a, int _count)
+	BX_SIMD_FORCE_INLINE simd_langext_t float4_srl(simd_langext_t _a, int _count)
 	{
-		float4_langext_t result;
-		const float4_langext_t count = float4_isplat<float4_langext_t>(_count);
+		simd_langext_t result;
+		const simd_langext_t count = float4_isplat<simd_langext_t>(_count);
 		result.vu = _a.vu >> count.vi;
 		return result;
 	}
 
 	template<>
-	BX_FLOAT4_FORCE_INLINE float4_langext_t float4_sra(float4_langext_t _a, int _count)
+	BX_SIMD_FORCE_INLINE simd_langext_t float4_sra(simd_langext_t _a, int _count)
 	{
-		float4_langext_t result;
-		const float4_langext_t count = float4_isplat<float4_langext_t>(_count);
+		simd_langext_t result;
+		const simd_langext_t count = float4_isplat<simd_langext_t>(_count);
 		result.vi = _a.vi >> count.vi;
 		return result;
 	}
 
 	template<>
-	BX_FLOAT4_FORCE_INLINE float4_langext_t float4_icmpeq(float4_langext_t _a, float4_langext_t _b)
+	BX_SIMD_FORCE_INLINE simd_langext_t float4_icmpeq(simd_langext_t _a, simd_langext_t _b)
 	{
-		float4_langext_t result;
+		simd_langext_t result;
 		result.vi = _a.vi == _b.vi;
 		return result;
 	}
 
 	template<>
-	BX_FLOAT4_FORCE_INLINE float4_langext_t float4_icmplt(float4_langext_t _a, float4_langext_t _b)
+	BX_SIMD_FORCE_INLINE simd_langext_t float4_icmplt(simd_langext_t _a, simd_langext_t _b)
 	{
-		float4_langext_t result;
+		simd_langext_t result;
 		result.vi = _a.vi < _b.vi;
 		return result;
 	}
 
 	template<>
-	BX_FLOAT4_FORCE_INLINE float4_langext_t float4_icmpgt(float4_langext_t _a, float4_langext_t _b)
+	BX_SIMD_FORCE_INLINE simd_langext_t float4_icmpgt(simd_langext_t _a, simd_langext_t _b)
 	{
-		float4_langext_t result;
+		simd_langext_t result;
 		result.vi = _a.vi > _b.vi;
 		return result;
 	}
 
 	template<>
-	BX_FLOAT4_FORCE_INLINE float4_langext_t float4_iadd(float4_langext_t _a, float4_langext_t _b)
+	BX_SIMD_FORCE_INLINE simd_langext_t float4_iadd(simd_langext_t _a, simd_langext_t _b)
 	{
-		float4_langext_t result;
+		simd_langext_t result;
 		result.vi = _a.vi + _b.vi;
 		return result;
 	}
 
 	template<>
-	BX_FLOAT4_FORCE_INLINE float4_langext_t float4_isub(float4_langext_t _a, float4_langext_t _b)
+	BX_SIMD_FORCE_INLINE simd_langext_t float4_isub(simd_langext_t _a, simd_langext_t _b)
 	{
-		float4_langext_t result;
+		simd_langext_t result;
 		result.vi = _a.vi - _b.vi;
 		return result;
 	}
 
-	typedef float4_langext_t float4_t;
+	typedef simd_langext_t simd128_t;
 
 } // namespace bx
 
-#endif // BX_FLOAT4_LANGEXT_H_HEADER_GUARD
+#endif // BX_SIMD_LANGEXT_H_HEADER_GUARD

+ 562 - 0
include/bx/simd128_neon.inl

@@ -0,0 +1,562 @@
+/*
+ * Copyright 2010-2016 Branimir Karadzic. All rights reserved.
+ * License: https://github.com/bkaradzic/bx#license-bsd-2-clause
+ */
+
+#ifndef BX_SIMD_NEON_H_HEADER_GUARD
+#define BX_SIMD_NEON_H_HEADER_GUARD
+
+#define simd_rcp           simd_rcp_ni
+#define simd_orx           simd_orx_ni
+#define simd_orc           simd_orc_ni
+#define simd_neg           simd_neg_ni
+#define simd_madd          simd_madd_ni
+#define simd_nmsub         simd_nmsub_ni
+#define simd_div_nr        simd_div_nr_ni
+#define simd_div           simd_div_nr_ni
+#define simd_selb          simd_selb_ni
+#define simd_sels          simd_sels_ni
+#define simd_not           simd_not_ni
+#define simd_abs           simd_abs_ni
+#define simd_clamp         simd_clamp_ni
+#define simd_lerp          simd_lerp_ni
+#define simd_rsqrt         simd_rsqrt_ni
+#define simd_rsqrt_nr      simd_rsqrt_nr_ni
+#define simd_rsqrt_carmack simd_rsqrt_carmack_ni
+#define simd_sqrt_nr       simd_sqrt_nr_ni
+#define simd_sqrt          simd_sqrt_nr_ni
+#define simd_log2          simd_log2_ni
+#define simd_exp2          simd_exp2_ni
+#define simd_pow           simd_pow_ni
+#define simd_cross3        simd_cross3_ni
+#define simd_normalize3    simd_normalize3_ni
+#define simd_dot3          simd_dot3_ni
+#define simd_dot           simd_dot_ni
+#define simd_ceil          simd_ceil_ni
+#define simd_floor         simd_floor_ni
+
+#include "simd_ni.inl"
+
+namespace bx
+{
+#define ELEMx 0
+#define ELEMy 1
+#define ELEMz 2
+#define ELEMw 3
+#define IMPLEMENT_SWIZZLE(_x, _y, _z, _w) \
+			template<> \
+			BX_SIMD_FORCE_INLINE simd128_neon_t simd_swiz_##_x##_y##_z##_w(simd128_neon_t _a) \
+			{ \
+				return __builtin_shuffle(_a, (uint32x4_t){ ELEM##_x, ELEM##_y, ELEM##_z, ELEM##_w }); \
+			}
+
+#include "simd_swizzle.inl"
+
+#undef IMPLEMENT_SWIZZLE
+#undef ELEMw
+#undef ELEMz
+#undef ELEMy
+#undef ELEMx
+
+#define IMPLEMENT_TEST(_xyzw, _swizzle) \
+			template<> \
+			BX_SIMD_FORCE_INLINE bool simd_test_any_##_xyzw(simd128_neon_t _test) \
+			{ \
+				const simd128_neon_t tmp0 = simd_swiz_##_swizzle(_test); \
+				return simd_test_any_ni(tmp0); \
+			} \
+			\
+			template<> \
+			BX_SIMD_FORCE_INLINE bool simd_test_all_##_xyzw(simd128_neon_t _test) \
+			{ \
+				const simd128_neon_t tmp0 = simd_swiz_##_swizzle(_test); \
+				return simd_test_all_ni(tmp0); \
+			}
+
+IMPLEMENT_TEST(x,   xxxx);
+IMPLEMENT_TEST(y,   yyyy);
+IMPLEMENT_TEST(xy,  xyyy);
+IMPLEMENT_TEST(z,   zzzz);
+IMPLEMENT_TEST(xz,  xzzz);
+IMPLEMENT_TEST(yz,  yzzz);
+IMPLEMENT_TEST(xyz, xyzz);
+IMPLEMENT_TEST(w,   wwww);
+IMPLEMENT_TEST(xw,  xwww);
+IMPLEMENT_TEST(yw,  ywww);
+IMPLEMENT_TEST(xyw, xyww);
+IMPLEMENT_TEST(zw,  zwww);
+IMPLEMENT_TEST(xzw, xzww);
+IMPLEMENT_TEST(yzw, yzww);
+#undef IMPLEMENT_TEST
+
+	template<>
+	BX_SIMD_FORCE_INLINE bool simd_test_any_xyzw(simd128_neon_t _test)
+	{
+		return simd_test_any_ni(_test);
+	}
+
+	template<>
+	BX_SIMD_FORCE_INLINE bool simd_test_all_xyzw(simd128_neon_t _test)
+	{
+		return simd_test_all_ni(_test);
+	}
+
+	template<>
+	BX_SIMD_FORCE_INLINE simd128_neon_t simd_shuf_xyAB(simd128_neon_t _a, simd128_neon_t _b)
+	{
+		return __builtin_shuffle(_a, _b, (uint32x4_t){ 0, 1, 4, 5 });
+	}
+
+	template<>
+	BX_SIMD_FORCE_INLINE simd128_neon_t simd_shuf_ABxy(simd128_neon_t _a, simd128_neon_t _b)
+	{
+		return __builtin_shuffle(_a, _b, (uint32x4_t){ 4, 5, 0, 1 });
+	}
+
+	template<>
+	BX_SIMD_FORCE_INLINE simd128_neon_t simd_shuf_CDzw(simd128_neon_t _a, simd128_neon_t _b)
+	{
+		return __builtin_shuffle(_a, _b, (uint32x4_t){ 6, 7, 2, 3 });
+	}
+
+	template<>
+	BX_SIMD_FORCE_INLINE simd128_neon_t simd_shuf_zwCD(simd128_neon_t _a, simd128_neon_t _b)
+	{
+		return __builtin_shuffle(_a, _b, (uint32x4_t){ 2, 3, 6, 7 });
+	}
+
+	template<>
+	BX_SIMD_FORCE_INLINE simd128_neon_t simd_shuf_xAyB(simd128_neon_t _a, simd128_neon_t _b)
+	{
+		return __builtin_shuffle(_a, _b, (uint32x4_t){ 0, 4, 1, 5 });
+	}
+
+	template<>
+	BX_SIMD_FORCE_INLINE simd128_neon_t simd_shuf_yBxA(simd128_neon_t _a, simd128_neon_t _b)
+	{
+		return __builtin_shuffle(_a, _b, (uint32x4_t){ 1, 5, 0, 4 });
+	}
+
+	template<>
+	BX_SIMD_FORCE_INLINE simd128_neon_t simd_shuf_zCwD(simd128_neon_t _a, simd128_neon_t _b)
+	{
+		return __builtin_shuffle(_a, _b, (uint32x4_t){ 2, 6, 3, 7 });
+	}
+
+	template<>
+	BX_SIMD_FORCE_INLINE simd128_neon_t simd_shuf_CzDw(simd128_neon_t _a, simd128_neon_t _b)
+	{
+		return __builtin_shuffle(_a, _b, (uint32x4_t){ 6, 2, 7, 3 });
+	}
+
+	template<>
+	BX_SIMD_FORCE_INLINE float simd_x(simd128_neon_t _a)
+	{
+		return vgetq_lane_f32(_a, 0);
+	}
+
+	template<>
+	BX_SIMD_FORCE_INLINE float simd_y(simd128_neon_t _a)
+	{
+		return vgetq_lane_f32(_a, 1);
+	}
+
+	template<>
+	BX_SIMD_FORCE_INLINE float simd_z(simd128_neon_t _a)
+	{
+		return vgetq_lane_f32(_a, 2);
+	}
+
+	template<>
+	BX_SIMD_FORCE_INLINE float simd_w(simd128_neon_t _a)
+	{
+		return vgetq_lane_f32(_a, 3);
+	}
+
+	template<>
+	BX_SIMD_FORCE_INLINE simd128_neon_t simd_ld(const void* _ptr)
+	{
+		return vld1q_f32( (const float32_t*)_ptr);
+	}
+
+	template<>
+	BX_SIMD_FORCE_INLINE void simd_st(void* _ptr, simd128_neon_t _a)
+	{
+		vst1q_f32( (float32_t*)_ptr, _a);
+	}
+
+	template<>
+	BX_SIMD_FORCE_INLINE void simd_stx(void* _ptr, simd128_neon_t _a)
+	{
+		vst1q_lane_f32( (float32_t*)_ptr, _a, 0);
+	}
+
+	template<>
+	BX_SIMD_FORCE_INLINE void simd_stream(void* _ptr, simd128_neon_t _a)
+	{
+		vst1q_f32( (float32_t*)_ptr, _a);
+	}
+
+	template<>
+	BX_SIMD_FORCE_INLINE simd128_neon_t simd_ld(float _x, float _y, float _z, float _w)
+	{
+		const float32_t val[4] = {_x, _y, _z, _w};
+		return simd_ld<simd128_neon_t>(val);
+	}
+
+	template<>
+	BX_SIMD_FORCE_INLINE simd128_neon_t simd_ild(uint32_t _x, uint32_t _y, uint32_t _z, uint32_t _w)
+	{
+		const uint32_t   val[4]    = {_x, _y, _z, _w};
+		const uint32x4_t tmp       = vld1q_u32(val);
+		const simd128_neon_t result = vreinterpretq_f32_u32(tmp);
+
+		return result;
+	}
+
+	template<>
+	BX_SIMD_FORCE_INLINE simd128_neon_t simd_splat(const void* _ptr)
+	{
+		const simd128_neon_t tmp0   = vld1q_f32( (const float32_t*)_ptr);
+		const float32x2_t   tmp1   = vget_low_f32(tmp0);
+		const simd128_neon_t result = vdupq_lane_f32(tmp1, 0);
+
+		return result;
+	}
+
+	template<>
+	BX_SIMD_FORCE_INLINE simd128_neon_t simd_splat(float _a)
+	{
+		return vdupq_n_f32(_a);
+	}
+
+	template<>
+	BX_SIMD_FORCE_INLINE simd128_neon_t simd_isplat(uint32_t _a)
+	{
+		const int32x4_t tmp    = vdupq_n_s32(_a);
+		const simd128_neon_t  result = vreinterpretq_f32_s32(tmp);
+
+		return result;
+	}
+
+	template<>
+	BX_SIMD_FORCE_INLINE simd128_neon_t simd_zero()
+	{
+		return simd_isplat<simd128_neon_t>(0);
+	}
+
+	template<>
+	BX_SIMD_FORCE_INLINE simd128_neon_t simd_itof(simd128_neon_t _a)
+	{
+		const int32x4_t itof   = vreinterpretq_s32_f32(_a);
+		const simd128_neon_t  result = vcvtq_f32_s32(itof);
+
+		return result;
+	}
+
+	template<>
+	BX_SIMD_FORCE_INLINE simd128_neon_t simd_ftoi(simd128_neon_t _a)
+	{
+		const int32x4_t ftoi  = vcvtq_s32_f32(_a);
+		const simd128_neon_t result = vreinterpretq_f32_s32(ftoi);
+
+		return result;
+	}
+
+	template<>
+	BX_SIMD_FORCE_INLINE simd128_neon_t simd_add(simd128_neon_t _a, simd128_neon_t _b)
+	{
+		return vaddq_f32(_a, _b);
+	}
+
+	template<>
+	BX_SIMD_FORCE_INLINE simd128_neon_t simd_sub(simd128_neon_t _a, simd128_neon_t _b)
+	{
+		return vsubq_f32(_a, _b);
+	}
+
+	template<>
+	BX_SIMD_FORCE_INLINE simd128_neon_t simd_mul(simd128_neon_t _a, simd128_neon_t _b)
+	{
+		return vmulq_f32(_a, _b);
+	}
+
+	template<>
+	BX_SIMD_FORCE_INLINE simd128_neon_t simd_rcp_est(simd128_neon_t _a)
+	{
+		return vrecpeq_f32(_a);
+	}
+
+	template<>
+	BX_SIMD_FORCE_INLINE simd128_neon_t simd_rsqrt_est(simd128_neon_t _a)
+	{
+		return vrsqrteq_f32(_a);
+	}
+
+	template<>
+	BX_SIMD_FORCE_INLINE simd128_neon_t simd_cmpeq(simd128_neon_t _a, simd128_neon_t _b)
+	{
+		const uint32x4_t tmp    = vceqq_f32(_a, _b);
+		const simd128_neon_t   result = vreinterpretq_f32_u32(tmp);
+
+		return result;
+	}
+
+	template<>
+	BX_SIMD_FORCE_INLINE simd128_neon_t simd_cmplt(simd128_neon_t _a, simd128_neon_t _b)
+	{
+		const uint32x4_t tmp    = vcltq_f32(_a, _b);
+		const simd128_neon_t   result = vreinterpretq_f32_u32(tmp);
+
+		return result;
+	}
+
+	template<>
+	BX_SIMD_FORCE_INLINE simd128_neon_t simd_cmple(simd128_neon_t _a, simd128_neon_t _b)
+	{
+		const uint32x4_t tmp    = vcleq_f32(_a, _b);
+		const simd128_neon_t   result = vreinterpretq_f32_u32(tmp);
+
+		return result;
+	}
+
+	template<>
+	BX_SIMD_FORCE_INLINE simd128_neon_t simd_cmpgt(simd128_neon_t _a, simd128_neon_t _b)
+	{
+		const uint32x4_t tmp    = vcgtq_f32(_a, _b);
+		const simd128_neon_t   result = vreinterpretq_f32_u32(tmp);
+
+		return result;
+	}
+
+	template<>
+	BX_SIMD_FORCE_INLINE simd128_neon_t simd_cmpge(simd128_neon_t _a, simd128_neon_t _b)
+	{
+		const uint32x4_t tmp    = vcgeq_f32(_a, _b);
+		const simd128_neon_t   result = vreinterpretq_f32_u32(tmp);
+
+		return result;
+	}
+
+	template<>
+	BX_SIMD_FORCE_INLINE simd128_neon_t simd_min(simd128_neon_t _a, simd128_neon_t _b)
+	{
+		return vminq_f32(_a, _b);
+	}
+
+	template<>
+	BX_SIMD_FORCE_INLINE simd128_neon_t simd_max(simd128_neon_t _a, simd128_neon_t _b)
+	{
+		return vmaxq_f32(_a, _b);
+	}
+
+	template<>
+	BX_SIMD_FORCE_INLINE simd128_neon_t simd_and(simd128_neon_t _a, simd128_neon_t _b)
+	{
+		const int32x4_t tmp0   = vreinterpretq_s32_f32(_a);
+		const int32x4_t tmp1   = vreinterpretq_s32_f32(_b);
+		const int32x4_t tmp2   = vandq_s32(tmp0, tmp1);
+		const simd128_neon_t  result = vreinterpretq_f32_s32(tmp2);
+
+		return result;
+	}
+
+	template<>
+	BX_SIMD_FORCE_INLINE simd128_neon_t simd_andc(simd128_neon_t _a, simd128_neon_t _b)
+	{
+		const int32x4_t tmp0   = vreinterpretq_s32_f32(_a);
+		const int32x4_t tmp1   = vreinterpretq_s32_f32(_b);
+		const int32x4_t tmp2   = vbicq_s32(tmp0, tmp1);
+		const simd128_neon_t  result = vreinterpretq_f32_s32(tmp2);
+
+		return result;
+	}
+
+	template<>
+	BX_SIMD_FORCE_INLINE simd128_neon_t simd_or(simd128_neon_t _a, simd128_neon_t _b)
+	{
+		const int32x4_t tmp0   = vreinterpretq_s32_f32(_a);
+		const int32x4_t tmp1   = vreinterpretq_s32_f32(_b);
+		const int32x4_t tmp2   = vorrq_s32(tmp0, tmp1);
+		const simd128_neon_t  result = vreinterpretq_f32_s32(tmp2);
+
+		return result;
+	}
+
+	template<>
+	BX_SIMD_FORCE_INLINE simd128_neon_t simd_xor(simd128_neon_t _a, simd128_neon_t _b)
+	{
+		const int32x4_t tmp0   = vreinterpretq_s32_f32(_a);
+		const int32x4_t tmp1   = vreinterpretq_s32_f32(_b);
+		const int32x4_t tmp2   = veorq_s32(tmp0, tmp1);
+		const simd128_neon_t  result = vreinterpretq_f32_s32(tmp2);
+
+		return result;
+	}
+
+	template<>
+	BX_SIMD_FORCE_INLINE simd128_neon_t simd_sll(simd128_neon_t _a, int _count)
+	{
+		if (__builtin_constant_p(_count) )
+		{
+			const uint32x4_t tmp0   = vreinterpretq_u32_f32(_a);
+			const uint32x4_t tmp1   = vshlq_n_u32(tmp0, _count);
+			const simd128_neon_t   result = vreinterpretq_f32_u32(tmp1);
+
+			return result;
+		}
+
+		const uint32x4_t tmp0   = vreinterpretq_u32_f32(_a);
+		const int32x4_t  shift  = vdupq_n_s32(_count);
+		const uint32x4_t tmp1   = vshlq_u32(tmp0, shift);
+		const simd128_neon_t   result = vreinterpretq_f32_u32(tmp1);
+
+		return result;
+	}
+
+	template<>
+	BX_SIMD_FORCE_INLINE simd128_neon_t simd_srl(simd128_neon_t _a, int _count)
+	{
+		if (__builtin_constant_p(_count) )
+		{
+			const uint32x4_t tmp0   = vreinterpretq_u32_f32(_a);
+			const uint32x4_t tmp1   = vshrq_n_u32(tmp0, _count);
+			const simd128_neon_t   result = vreinterpretq_f32_u32(tmp1);
+
+			return result;
+		}
+
+		const uint32x4_t tmp0   = vreinterpretq_u32_f32(_a);
+		const int32x4_t  shift  = vdupq_n_s32(-_count);
+		const uint32x4_t tmp1   = vshlq_u32(tmp0, shift);
+		const simd128_neon_t   result = vreinterpretq_f32_u32(tmp1);
+
+		return result;
+	}
+
+	template<>
+	BX_SIMD_FORCE_INLINE simd128_neon_t simd_sra(simd128_neon_t _a, int _count)
+	{
+		if (__builtin_constant_p(_count) )
+		{
+			const int32x4_t tmp0   = vreinterpretq_s32_f32(_a);
+			const int32x4_t tmp1   = vshrq_n_s32(tmp0, _count);
+			const simd128_neon_t  result = vreinterpretq_f32_s32(tmp1);
+
+			return result;
+		}
+
+		const int32x4_t tmp0   = vreinterpretq_s32_f32(_a);
+		const int32x4_t shift  = vdupq_n_s32(-_count);
+		const int32x4_t tmp1   = vshlq_s32(tmp0, shift);
+		const simd128_neon_t  result = vreinterpretq_f32_s32(tmp1);
+
+		return result;
+	}
+
+	template<>
+	BX_SIMD_FORCE_INLINE simd128_neon_t simd_madd(simd128_neon_t _a, simd128_neon_t _b, simd128_neon_t _c)
+	{
+		return vmlaq_f32(_c, _a, _b);
+	}
+
+	template<>
+	BX_SIMD_FORCE_INLINE simd128_neon_t simd_nmsub(simd128_neon_t _a, simd128_neon_t _b, simd128_neon_t _c)
+	{
+		return vmlsq_f32(_c, _a, _b);
+	}
+
+	template<>
+	BX_SIMD_FORCE_INLINE simd128_neon_t simd_icmpeq(simd128_neon_t _a, simd128_neon_t _b)
+	{
+		const int32x4_t  tmp0   = vreinterpretq_s32_f32(_a);
+		const int32x4_t  tmp1   = vreinterpretq_s32_f32(_b);
+		const uint32x4_t tmp2   = vceqq_s32(tmp0, tmp1);
+		const simd128_neon_t   result = vreinterpretq_f32_u32(tmp2);
+
+		return result;
+	}
+
+	template<>
+	BX_SIMD_FORCE_INLINE simd128_neon_t simd_icmplt(simd128_neon_t _a, simd128_neon_t _b)
+	{
+		const int32x4_t  tmp0   = vreinterpretq_s32_f32(_a);
+		const int32x4_t  tmp1   = vreinterpretq_s32_f32(_b);
+		const uint32x4_t tmp2   = vcltq_s32(tmp0, tmp1);
+		const simd128_neon_t   result = vreinterpretq_f32_u32(tmp2);
+
+		return result;
+	}
+
+	template<>
+	BX_SIMD_FORCE_INLINE simd128_neon_t simd_icmpgt(simd128_neon_t _a, simd128_neon_t _b)
+	{
+		const int32x4_t  tmp0   = vreinterpretq_s32_f32(_a);
+		const int32x4_t  tmp1   = vreinterpretq_s32_f32(_b);
+		const uint32x4_t tmp2   = vcgtq_s32(tmp0, tmp1);
+		const simd128_neon_t   result = vreinterpretq_f32_u32(tmp2);
+
+		return result;
+	}
+
+	template<>
+	BX_SIMD_FORCE_INLINE simd128_neon_t simd_imin(simd128_neon_t _a, simd128_neon_t _b)
+	{
+		const int32x4_t tmp0   = vreinterpretq_s32_f32(_a);
+		const int32x4_t tmp1   = vreinterpretq_s32_f32(_b);
+		const int32x4_t tmp2   = vminq_s32(tmp0, tmp1);
+		const simd128_neon_t  result = vreinterpretq_f32_s32(tmp2);
+
+		return result;
+	}
+
+	template<>
+	BX_SIMD_FORCE_INLINE simd128_neon_t simd_imax(simd128_neon_t _a, simd128_neon_t _b)
+	{
+		const int32x4_t tmp0   = vreinterpretq_s32_f32(_a);
+		const int32x4_t tmp1   = vreinterpretq_s32_f32(_b);
+		const int32x4_t tmp2   = vmaxq_s32(tmp0, tmp1);
+		const simd128_neon_t  result = vreinterpretq_f32_s32(tmp2);
+
+		return result;
+	}
+
+	template<>
+	BX_SIMD_FORCE_INLINE simd128_neon_t simd_iadd(simd128_neon_t _a, simd128_neon_t _b)
+	{
+		const int32x4_t tmp0   = vreinterpretq_s32_f32(_a);
+		const int32x4_t tmp1   = vreinterpretq_s32_f32(_b);
+		const int32x4_t tmp2   = vaddq_s32(tmp0, tmp1);
+		const simd128_neon_t  result = vreinterpretq_f32_s32(tmp2);
+
+		return result;
+	}
+
+	template<>
+	BX_SIMD_FORCE_INLINE simd128_neon_t simd_isub(simd128_neon_t _a, simd128_neon_t _b)
+	{
+		const int32x4_t tmp0   = vreinterpretq_s32_f32(_a);
+		const int32x4_t tmp1   = vreinterpretq_s32_f32(_b);
+		const int32x4_t tmp2   = vsubq_s32(tmp0, tmp1);
+		const simd128_neon_t  result = vreinterpretq_f32_s32(tmp2);
+
+		return result;
+	}
+
+	template<>
+	BX_SIMD_INLINE simd128_neon_t simd_shuf_xAzC(simd128_neon_t _a, simd128_neon_t _b)
+	{
+		return simd_shuf_xAzC_ni(_a, _b);
+	}
+
+	template<>
+	BX_SIMD_INLINE simd128_neon_t simd_shuf_yBwD(simd128_neon_t _a, simd128_neon_t _b)
+	{
+		return simd_shuf_yBwD_ni(_a, _b);
+	}
+
+	typedef simd128_neon_t simd128_t;
+
+} // namespace bx
+
+#endif // BX_SIMD_NEON_H_HEADER_GUARD

+ 137 - 137
include/bx/float4_ref.h → include/bx/simd128_ref.inl

@@ -3,41 +3,41 @@
  * License: https://github.com/bkaradzic/bx#license-bsd-2-clause
  */
 
-#ifndef BX_FLOAT4_REF_H_HEADER_GUARD
-#define BX_FLOAT4_REF_H_HEADER_GUARD
+#ifndef BX_SIMD_REF_H_HEADER_GUARD
+#define BX_SIMD_REF_H_HEADER_GUARD
 
 #include <math.h> // sqrtf
 
-#define float4_shuf_xAzC float4_shuf_xAzC_ni
-#define float4_shuf_yBwD float4_shuf_yBwD_ni
-#define float4_rcp float4_rcp_ni
-#define float4_orx float4_orx_ni
-#define float4_orc float4_orc_ni
-#define float4_neg float4_neg_ni
-#define float4_madd float4_madd_ni
-#define float4_nmsub float4_nmsub_ni
-#define float4_div_nr float4_div_nr_ni
-#define float4_selb float4_selb_ni
-#define float4_sels float4_sels_ni
-#define float4_not float4_not_ni
-#define float4_abs float4_abs_ni
-#define float4_clamp float4_clamp_ni
-#define float4_lerp float4_lerp_ni
-#define float4_rsqrt float4_rsqrt_ni
-#define float4_rsqrt_nr float4_rsqrt_nr_ni
-#define float4_rsqrt_carmack float4_rsqrt_carmack_ni
-#define float4_sqrt_nr float4_sqrt_nr_ni
-#define float4_log2 float4_log2_ni
-#define float4_exp2 float4_exp2_ni
-#define float4_pow float4_pow_ni
-#define float4_cross3 float4_cross3_ni
-#define float4_normalize3 float4_normalize3_ni
-#define float4_dot3 float4_dot3_ni
-#define float4_dot float4_dot_ni
-#define float4_ceil float4_ceil_ni
-#define float4_floor float4_floor_ni
-
-#include "float4_ni.h"
+#define simd_shuf_xAzC simd_shuf_xAzC_ni
+#define simd_shuf_yBwD simd_shuf_yBwD_ni
+#define simd_rcp simd_rcp_ni
+#define simd_orx simd_orx_ni
+#define simd_orc simd_orc_ni
+#define simd_neg simd_neg_ni
+#define simd_madd simd_madd_ni
+#define simd_nmsub simd_nmsub_ni
+#define simd_div_nr simd_div_nr_ni
+#define simd_selb simd_selb_ni
+#define simd_sels simd_sels_ni
+#define simd_not simd_not_ni
+#define simd_abs simd_abs_ni
+#define simd_clamp simd_clamp_ni
+#define simd_lerp simd_lerp_ni
+#define simd_rsqrt simd_rsqrt_ni
+#define simd_rsqrt_nr simd_rsqrt_nr_ni
+#define simd_rsqrt_carmack simd_rsqrt_carmack_ni
+#define simd_sqrt_nr simd_sqrt_nr_ni
+#define simd_log2 simd_log2_ni
+#define simd_exp2 simd_exp2_ni
+#define simd_pow simd_pow_ni
+#define simd_cross3 simd_cross3_ni
+#define simd_normalize3 simd_normalize3_ni
+#define simd_dot3 simd_dot3_ni
+#define simd_dot simd_dot_ni
+#define simd_ceil simd_ceil_ni
+#define simd_floor simd_floor_ni
+
+#include "simd_ni.inl"
 
 namespace bx
 {
@@ -47,9 +47,9 @@ namespace bx
 #define ELEMw 3
 #define IMPLEMENT_SWIZZLE(_x, _y, _z, _w) \
 			template<> \
-			BX_FLOAT4_FORCE_INLINE float4_ref_t float4_swiz_##_x##_y##_z##_w(float4_ref_t _a) \
+			BX_SIMD_FORCE_INLINE simd128_ref_t simd_swiz_##_x##_y##_z##_w(simd128_ref_t _a) \
 			{ \
-				float4_ref_t result; \
+				simd128_ref_t result; \
 				result.ixyzw[0] = _a.ixyzw[ELEM##_x]; \
 				result.ixyzw[1] = _a.ixyzw[ELEM##_y]; \
 				result.ixyzw[2] = _a.ixyzw[ELEM##_z]; \
@@ -57,7 +57,7 @@ namespace bx
 				return result; \
 			}
 
-#include "float4_swizzle.inl"
+#include "simd_swizzle.inl"
 
 #undef IMPLEMENT_SWIZZLE
 #undef ELEMw
@@ -67,7 +67,7 @@ namespace bx
 
 #define IMPLEMENT_TEST(_xyzw, _mask) \
 			template<> \
-			BX_FLOAT4_FORCE_INLINE bool float4_test_any_##_xyzw(float4_ref_t _test) \
+			BX_SIMD_FORCE_INLINE bool simd_test_any_##_xyzw(simd128_ref_t _test) \
 			{ \
 				uint32_t tmp = ( (_test.uxyzw[3]>>31)<<3) \
 				             | ( (_test.uxyzw[2]>>31)<<2) \
@@ -78,7 +78,7 @@ namespace bx
 			} \
 			\
 			template<> \
-			BX_FLOAT4_FORCE_INLINE bool float4_test_all_##_xyzw(float4_ref_t _test) \
+			BX_SIMD_FORCE_INLINE bool simd_test_all_##_xyzw(simd128_ref_t _test) \
 			{ \
 				uint32_t tmp = ( (_test.uxyzw[3]>>31)<<3) \
 				             | ( (_test.uxyzw[2]>>31)<<2) \
@@ -107,9 +107,9 @@ IMPLEMENT_TEST(xyzw , 0xf);
 #undef IMPLEMENT_TEST
 
 	template<>
-	BX_FLOAT4_FORCE_INLINE float4_ref_t float4_shuf_xyAB(float4_ref_t _a, float4_ref_t _b)
+	BX_SIMD_FORCE_INLINE simd128_ref_t simd_shuf_xyAB(simd128_ref_t _a, simd128_ref_t _b)
 	{
-		float4_ref_t result;
+		simd128_ref_t result;
 		result.uxyzw[0] = _a.uxyzw[0];
 		result.uxyzw[1] = _a.uxyzw[1];
 		result.uxyzw[2] = _b.uxyzw[0];
@@ -118,9 +118,9 @@ IMPLEMENT_TEST(xyzw , 0xf);
 	}
 
 	template<>
-	BX_FLOAT4_FORCE_INLINE float4_ref_t float4_shuf_ABxy(float4_ref_t _a, float4_ref_t _b)
+	BX_SIMD_FORCE_INLINE simd128_ref_t simd_shuf_ABxy(simd128_ref_t _a, simd128_ref_t _b)
 	{
-		float4_ref_t result;
+		simd128_ref_t result;
 		result.uxyzw[0] = _b.uxyzw[0];
 		result.uxyzw[1] = _b.uxyzw[1];
 		result.uxyzw[2] = _a.uxyzw[0];
@@ -129,9 +129,9 @@ IMPLEMENT_TEST(xyzw , 0xf);
 	}
 
 	template<>
-	BX_FLOAT4_FORCE_INLINE float4_ref_t float4_shuf_CDzw(float4_ref_t _a, float4_ref_t _b)
+	BX_SIMD_FORCE_INLINE simd128_ref_t simd_shuf_CDzw(simd128_ref_t _a, simd128_ref_t _b)
 	{
-		float4_ref_t result;
+		simd128_ref_t result;
 		result.uxyzw[0] = _b.uxyzw[2];
 		result.uxyzw[1] = _b.uxyzw[3];
 		result.uxyzw[2] = _a.uxyzw[2];
@@ -140,9 +140,9 @@ IMPLEMENT_TEST(xyzw , 0xf);
 	}
 
 	template<>
-	BX_FLOAT4_FORCE_INLINE float4_ref_t float4_shuf_zwCD(float4_ref_t _a, float4_ref_t _b)
+	BX_SIMD_FORCE_INLINE simd128_ref_t simd_shuf_zwCD(simd128_ref_t _a, simd128_ref_t _b)
 	{
-		float4_ref_t result;
+		simd128_ref_t result;
 		result.uxyzw[0] = _a.uxyzw[2];
 		result.uxyzw[1] = _a.uxyzw[3];
 		result.uxyzw[2] = _b.uxyzw[2];
@@ -151,9 +151,9 @@ IMPLEMENT_TEST(xyzw , 0xf);
 	}
 
 	template<>
-	BX_FLOAT4_FORCE_INLINE float4_ref_t float4_shuf_xAyB(float4_ref_t _a, float4_ref_t _b)
+	BX_SIMD_FORCE_INLINE simd128_ref_t simd_shuf_xAyB(simd128_ref_t _a, simd128_ref_t _b)
 	{
-		float4_ref_t result;
+		simd128_ref_t result;
 		result.uxyzw[0] = _a.uxyzw[0];
 		result.uxyzw[1] = _b.uxyzw[0];
 		result.uxyzw[2] = _a.uxyzw[1];
@@ -162,9 +162,9 @@ IMPLEMENT_TEST(xyzw , 0xf);
 	}
 
 	template<>
-	BX_FLOAT4_FORCE_INLINE float4_ref_t float4_shuf_yBxA(float4_ref_t _a, float4_ref_t _b)
+	BX_SIMD_FORCE_INLINE simd128_ref_t simd_shuf_yBxA(simd128_ref_t _a, simd128_ref_t _b)
 	{
-		float4_ref_t result;
+		simd128_ref_t result;
 		result.uxyzw[0] = _a.uxyzw[1];
 		result.uxyzw[1] = _b.uxyzw[1];
 		result.uxyzw[2] = _a.uxyzw[0];
@@ -173,9 +173,9 @@ IMPLEMENT_TEST(xyzw , 0xf);
 	}
 
 	template<>
-	BX_FLOAT4_FORCE_INLINE float4_ref_t float4_shuf_zCwD(float4_ref_t _a, float4_ref_t _b)
+	BX_SIMD_FORCE_INLINE simd128_ref_t simd_shuf_zCwD(simd128_ref_t _a, simd128_ref_t _b)
 	{
-		float4_ref_t result;
+		simd128_ref_t result;
 		result.uxyzw[0] = _a.uxyzw[2];
 		result.uxyzw[1] = _b.uxyzw[2];
 		result.uxyzw[2] = _a.uxyzw[3];
@@ -184,9 +184,9 @@ IMPLEMENT_TEST(xyzw , 0xf);
 	}
 
 	template<>
-	BX_FLOAT4_FORCE_INLINE float4_ref_t float4_shuf_CzDw(float4_ref_t _a, float4_ref_t _b)
+	BX_SIMD_FORCE_INLINE simd128_ref_t simd_shuf_CzDw(simd128_ref_t _a, simd128_ref_t _b)
 	{
-		float4_ref_t result;
+		simd128_ref_t result;
 		result.uxyzw[0] = _b.uxyzw[2];
 		result.uxyzw[1] = _a.uxyzw[2];
 		result.uxyzw[2] = _b.uxyzw[3];
@@ -195,34 +195,34 @@ IMPLEMENT_TEST(xyzw , 0xf);
 	}
 
 	template<>
-	BX_FLOAT4_FORCE_INLINE float float4_x(float4_ref_t _a)
+	BX_SIMD_FORCE_INLINE float simd_x(simd128_ref_t _a)
 	{
 		return _a.fxyzw[0];
 	}
 
 	template<>
-	BX_FLOAT4_FORCE_INLINE float float4_y(float4_ref_t _a)
+	BX_SIMD_FORCE_INLINE float simd_y(simd128_ref_t _a)
 	{
 		return _a.fxyzw[1];
 	}
 
 	template<>
-	BX_FLOAT4_FORCE_INLINE float float4_z(float4_ref_t _a)
+	BX_SIMD_FORCE_INLINE float simd_z(simd128_ref_t _a)
 	{
 		return _a.fxyzw[2];
 	}
 
 	template<>
-	BX_FLOAT4_FORCE_INLINE float float4_w(float4_ref_t _a)
+	BX_SIMD_FORCE_INLINE float simd_w(simd128_ref_t _a)
 	{
 		return _a.fxyzw[3];
 	}
 
 	template<>
-	BX_FLOAT4_FORCE_INLINE float4_ref_t float4_ld(const void* _ptr)
+	BX_SIMD_FORCE_INLINE simd128_ref_t simd_ld(const void* _ptr)
 	{
 		const uint32_t* input = reinterpret_cast<const uint32_t*>(_ptr);
-		float4_ref_t result;
+		simd128_ref_t result;
 		result.uxyzw[0] = input[0];
 		result.uxyzw[1] = input[1];
 		result.uxyzw[2] = input[2];
@@ -231,7 +231,7 @@ IMPLEMENT_TEST(xyzw , 0xf);
 	}
 
 	template<>
-	BX_FLOAT4_FORCE_INLINE void float4_st(void* _ptr, float4_ref_t _a)
+	BX_SIMD_FORCE_INLINE void simd_st(void* _ptr, simd128_ref_t _a)
 	{
 		uint32_t* result = reinterpret_cast<uint32_t*>(_ptr);
 		result[0] = _a.uxyzw[0];
@@ -241,14 +241,14 @@ IMPLEMENT_TEST(xyzw , 0xf);
 	}
 
 	template<>
-	BX_FLOAT4_FORCE_INLINE void float4_stx(void* _ptr, float4_ref_t _a)
+	BX_SIMD_FORCE_INLINE void simd_stx(void* _ptr, simd128_ref_t _a)
 	{
 		uint32_t* result = reinterpret_cast<uint32_t*>(_ptr);
 		result[0] = _a.uxyzw[0];
 	}
 
 	template<>
-	BX_FLOAT4_FORCE_INLINE void float4_stream(void* _ptr, float4_ref_t _a)
+	BX_SIMD_FORCE_INLINE void simd_stream(void* _ptr, simd128_ref_t _a)
 	{
 		uint32_t* result = reinterpret_cast<uint32_t*>(_ptr);
 		result[0] = _a.uxyzw[0];
@@ -258,9 +258,9 @@ IMPLEMENT_TEST(xyzw , 0xf);
 	}
 
 	template<>
-	BX_FLOAT4_FORCE_INLINE float4_ref_t float4_ld(float _x, float _y, float _z, float _w)
+	BX_SIMD_FORCE_INLINE simd128_ref_t simd_ld(float _x, float _y, float _z, float _w)
 	{
-		float4_ref_t result;
+		simd128_ref_t result;
 		result.fxyzw[0] = _x;
 		result.fxyzw[1] = _y;
 		result.fxyzw[2] = _z;
@@ -269,9 +269,9 @@ IMPLEMENT_TEST(xyzw , 0xf);
 	}
 
 	template<>
-	BX_FLOAT4_FORCE_INLINE float4_ref_t float4_ild(uint32_t _x, uint32_t _y, uint32_t _z, uint32_t _w)
+	BX_SIMD_FORCE_INLINE simd128_ref_t simd_ild(uint32_t _x, uint32_t _y, uint32_t _z, uint32_t _w)
 	{
-		float4_ref_t result;
+		simd128_ref_t result;
 		result.uxyzw[0] = _x;
 		result.uxyzw[1] = _y;
 		result.uxyzw[2] = _z;
@@ -280,10 +280,10 @@ IMPLEMENT_TEST(xyzw , 0xf);
 	}
 
 	template<>
-	BX_FLOAT4_FORCE_INLINE float4_ref_t float4_splat(const void* _ptr)
+	BX_SIMD_FORCE_INLINE simd128_ref_t simd_splat(const void* _ptr)
 	{
 		const uint32_t val = *reinterpret_cast<const uint32_t*>(_ptr);
-		float4_ref_t result;
+		simd128_ref_t result;
 		result.uxyzw[0] = val;
 		result.uxyzw[1] = val;
 		result.uxyzw[2] = val;
@@ -292,27 +292,27 @@ IMPLEMENT_TEST(xyzw , 0xf);
 	}
 
 	template<>
-	BX_FLOAT4_FORCE_INLINE float4_ref_t float4_splat(float _a)
+	BX_SIMD_FORCE_INLINE simd128_ref_t simd_splat(float _a)
 	{
-		return float4_ld<float4_ref_t>(_a, _a, _a, _a);
+		return simd_ld<simd128_ref_t>(_a, _a, _a, _a);
 	}
 
 	template<>
-	BX_FLOAT4_FORCE_INLINE float4_ref_t float4_isplat(uint32_t _a)
+	BX_SIMD_FORCE_INLINE simd128_ref_t simd_isplat(uint32_t _a)
 	{
-		return float4_ild<float4_ref_t>(_a, _a, _a, _a);
+		return simd_ild<simd128_ref_t>(_a, _a, _a, _a);
 	}
 
 	template<>
-	BX_FLOAT4_FORCE_INLINE float4_ref_t float4_zero()
+	BX_SIMD_FORCE_INLINE simd128_ref_t simd_zero()
 	{
-		return float4_ild<float4_ref_t>(0, 0, 0, 0);
+		return simd_ild<simd128_ref_t>(0, 0, 0, 0);
 	}
 
 	template<>
-	BX_FLOAT4_FORCE_INLINE float4_ref_t float4_itof(float4_ref_t _a)
+	BX_SIMD_FORCE_INLINE simd128_ref_t simd_itof(simd128_ref_t _a)
 	{
-		float4_ref_t result;
+		simd128_ref_t result;
 		result.fxyzw[0] = (float)_a.ixyzw[0];
 		result.fxyzw[1] = (float)_a.ixyzw[1];
 		result.fxyzw[2] = (float)_a.ixyzw[2];
@@ -321,9 +321,9 @@ IMPLEMENT_TEST(xyzw , 0xf);
 	}
 
 	template<>
-	BX_FLOAT4_FORCE_INLINE float4_ref_t float4_ftoi(float4_ref_t _a)
+	BX_SIMD_FORCE_INLINE simd128_ref_t simd_ftoi(simd128_ref_t _a)
 	{
-		float4_ref_t result;
+		simd128_ref_t result;
 		result.ixyzw[0] = (int)_a.fxyzw[0];
 		result.ixyzw[1] = (int)_a.fxyzw[1];
 		result.ixyzw[2] = (int)_a.fxyzw[2];
@@ -332,15 +332,15 @@ IMPLEMENT_TEST(xyzw , 0xf);
 	}
 
 	template<>
-	BX_FLOAT4_FORCE_INLINE float4_ref_t float4_round(float4_ref_t _a)
+	BX_SIMD_FORCE_INLINE simd128_ref_t simd_round(simd128_ref_t _a)
 	{
-		return float4_round_ni(_a);
+		return simd_round_ni(_a);
 	}
 
 	template<>
-	BX_FLOAT4_FORCE_INLINE float4_ref_t float4_add(float4_ref_t _a, float4_ref_t _b)
+	BX_SIMD_FORCE_INLINE simd128_ref_t simd_add(simd128_ref_t _a, simd128_ref_t _b)
 	{
-		float4_ref_t result;
+		simd128_ref_t result;
 		result.fxyzw[0] = _a.fxyzw[0] + _b.fxyzw[0];
 		result.fxyzw[1] = _a.fxyzw[1] + _b.fxyzw[1];
 		result.fxyzw[2] = _a.fxyzw[2] + _b.fxyzw[2];
@@ -349,9 +349,9 @@ IMPLEMENT_TEST(xyzw , 0xf);
 	}
 
 	template<>
-	BX_FLOAT4_FORCE_INLINE float4_ref_t float4_sub(float4_ref_t _a, float4_ref_t _b)
+	BX_SIMD_FORCE_INLINE simd128_ref_t simd_sub(simd128_ref_t _a, simd128_ref_t _b)
 	{
-		float4_ref_t result;
+		simd128_ref_t result;
 		result.fxyzw[0] = _a.fxyzw[0] - _b.fxyzw[0];
 		result.fxyzw[1] = _a.fxyzw[1] - _b.fxyzw[1];
 		result.fxyzw[2] = _a.fxyzw[2] - _b.fxyzw[2];
@@ -360,9 +360,9 @@ IMPLEMENT_TEST(xyzw , 0xf);
 	}
 
 	template<>
-	BX_FLOAT4_FORCE_INLINE float4_ref_t float4_mul(float4_ref_t _a, float4_ref_t _b)
+	BX_SIMD_FORCE_INLINE simd128_ref_t simd_mul(simd128_ref_t _a, simd128_ref_t _b)
 	{
-		float4_ref_t result;
+		simd128_ref_t result;
 		result.fxyzw[0] = _a.fxyzw[0] * _b.fxyzw[0];
 		result.fxyzw[1] = _a.fxyzw[1] * _b.fxyzw[1];
 		result.fxyzw[2] = _a.fxyzw[2] * _b.fxyzw[2];
@@ -371,9 +371,9 @@ IMPLEMENT_TEST(xyzw , 0xf);
 	}
 
 	template<>
-	BX_FLOAT4_FORCE_INLINE float4_ref_t float4_div(float4_ref_t _a, float4_ref_t _b)
+	BX_SIMD_FORCE_INLINE simd128_ref_t simd_div(simd128_ref_t _a, simd128_ref_t _b)
 	{
-		float4_ref_t result;
+		simd128_ref_t result;
 		result.fxyzw[0] = _a.fxyzw[0] / _b.fxyzw[0];
 		result.fxyzw[1] = _a.fxyzw[1] / _b.fxyzw[1];
 		result.fxyzw[2] = _a.fxyzw[2] / _b.fxyzw[2];
@@ -382,9 +382,9 @@ IMPLEMENT_TEST(xyzw , 0xf);
 	}
 
 	template<>
-	BX_FLOAT4_FORCE_INLINE float4_ref_t float4_rcp_est(float4_ref_t _a)
+	BX_SIMD_FORCE_INLINE simd128_ref_t simd_rcp_est(simd128_ref_t _a)
 	{
-		float4_ref_t result;
+		simd128_ref_t result;
 		result.fxyzw[0] = 1.0f / _a.fxyzw[0];
 		result.fxyzw[1] = 1.0f / _a.fxyzw[1];
 		result.fxyzw[2] = 1.0f / _a.fxyzw[2];
@@ -393,9 +393,9 @@ IMPLEMENT_TEST(xyzw , 0xf);
 	}
 
 	template<>
-	BX_FLOAT4_FORCE_INLINE float4_ref_t float4_sqrt(float4_ref_t _a)
+	BX_SIMD_FORCE_INLINE simd128_ref_t simd_sqrt(simd128_ref_t _a)
 	{
-		float4_ref_t result;
+		simd128_ref_t result;
 		result.fxyzw[0] = sqrtf(_a.fxyzw[0]);
 		result.fxyzw[1] = sqrtf(_a.fxyzw[1]);
 		result.fxyzw[2] = sqrtf(_a.fxyzw[2]);
@@ -404,9 +404,9 @@ IMPLEMENT_TEST(xyzw , 0xf);
 	}
 
 	template<>
-	BX_FLOAT4_FORCE_INLINE float4_ref_t float4_rsqrt_est(float4_ref_t _a)
+	BX_SIMD_FORCE_INLINE simd128_ref_t simd_rsqrt_est(simd128_ref_t _a)
 	{
-		float4_ref_t result;
+		simd128_ref_t result;
 		result.fxyzw[0] = 1.0f / sqrtf(_a.fxyzw[0]);
 		result.fxyzw[1] = 1.0f / sqrtf(_a.fxyzw[1]);
 		result.fxyzw[2] = 1.0f / sqrtf(_a.fxyzw[2]);
@@ -415,9 +415,9 @@ IMPLEMENT_TEST(xyzw , 0xf);
 	}
 
 	template<>
-	BX_FLOAT4_FORCE_INLINE float4_ref_t float4_cmpeq(float4_ref_t _a, float4_ref_t _b)
+	BX_SIMD_FORCE_INLINE simd128_ref_t simd_cmpeq(simd128_ref_t _a, simd128_ref_t _b)
 	{
-		float4_ref_t result;
+		simd128_ref_t result;
 		result.ixyzw[0] = _a.fxyzw[0] == _b.fxyzw[0] ? 0xffffffff : 0x0;
 		result.ixyzw[1] = _a.fxyzw[1] == _b.fxyzw[1] ? 0xffffffff : 0x0;
 		result.ixyzw[2] = _a.fxyzw[2] == _b.fxyzw[2] ? 0xffffffff : 0x0;
@@ -426,9 +426,9 @@ IMPLEMENT_TEST(xyzw , 0xf);
 	}
 
 	template<>
-	BX_FLOAT4_FORCE_INLINE float4_ref_t float4_cmplt(float4_ref_t _a, float4_ref_t _b)
+	BX_SIMD_FORCE_INLINE simd128_ref_t simd_cmplt(simd128_ref_t _a, simd128_ref_t _b)
 	{
-		float4_ref_t result;
+		simd128_ref_t result;
 		result.ixyzw[0] = _a.fxyzw[0] < _b.fxyzw[0] ? 0xffffffff : 0x0;
 		result.ixyzw[1] = _a.fxyzw[1] < _b.fxyzw[1] ? 0xffffffff : 0x0;
 		result.ixyzw[2] = _a.fxyzw[2] < _b.fxyzw[2] ? 0xffffffff : 0x0;
@@ -437,9 +437,9 @@ IMPLEMENT_TEST(xyzw , 0xf);
 	}
 
 	template<>
-	BX_FLOAT4_FORCE_INLINE float4_ref_t float4_cmple(float4_ref_t _a, float4_ref_t _b)
+	BX_SIMD_FORCE_INLINE simd128_ref_t simd_cmple(simd128_ref_t _a, simd128_ref_t _b)
 	{
-		float4_ref_t result;
+		simd128_ref_t result;
 		result.ixyzw[0] = _a.fxyzw[0] <= _b.fxyzw[0] ? 0xffffffff : 0x0;
 		result.ixyzw[1] = _a.fxyzw[1] <= _b.fxyzw[1] ? 0xffffffff : 0x0;
 		result.ixyzw[2] = _a.fxyzw[2] <= _b.fxyzw[2] ? 0xffffffff : 0x0;
@@ -448,9 +448,9 @@ IMPLEMENT_TEST(xyzw , 0xf);
 	}
 
 	template<>
-	BX_FLOAT4_FORCE_INLINE float4_ref_t float4_cmpgt(float4_ref_t _a, float4_ref_t _b)
+	BX_SIMD_FORCE_INLINE simd128_ref_t simd_cmpgt(simd128_ref_t _a, simd128_ref_t _b)
 	{
-		float4_ref_t result;
+		simd128_ref_t result;
 		result.ixyzw[0] = _a.fxyzw[0] > _b.fxyzw[0] ? 0xffffffff : 0x0;
 		result.ixyzw[1] = _a.fxyzw[1] > _b.fxyzw[1] ? 0xffffffff : 0x0;
 		result.ixyzw[2] = _a.fxyzw[2] > _b.fxyzw[2] ? 0xffffffff : 0x0;
@@ -459,9 +459,9 @@ IMPLEMENT_TEST(xyzw , 0xf);
 	}
 
 	template<>
-	BX_FLOAT4_FORCE_INLINE float4_ref_t float4_cmpge(float4_ref_t _a, float4_ref_t _b)
+	BX_SIMD_FORCE_INLINE simd128_ref_t simd_cmpge(simd128_ref_t _a, simd128_ref_t _b)
 	{
-		float4_ref_t result;
+		simd128_ref_t result;
 		result.ixyzw[0] = _a.fxyzw[0] >= _b.fxyzw[0] ? 0xffffffff : 0x0;
 		result.ixyzw[1] = _a.fxyzw[1] >= _b.fxyzw[1] ? 0xffffffff : 0x0;
 		result.ixyzw[2] = _a.fxyzw[2] >= _b.fxyzw[2] ? 0xffffffff : 0x0;
@@ -470,9 +470,9 @@ IMPLEMENT_TEST(xyzw , 0xf);
 	}
 
 	template<>
-	BX_FLOAT4_FORCE_INLINE float4_ref_t float4_min(float4_ref_t _a, float4_ref_t _b)
+	BX_SIMD_FORCE_INLINE simd128_ref_t simd_min(simd128_ref_t _a, simd128_ref_t _b)
 	{
-		float4_ref_t result;
+		simd128_ref_t result;
 		result.fxyzw[0] = _a.fxyzw[0] < _b.fxyzw[0] ? _a.fxyzw[0] : _b.fxyzw[0];
 		result.fxyzw[1] = _a.fxyzw[1] < _b.fxyzw[1] ? _a.fxyzw[1] : _b.fxyzw[1];
 		result.fxyzw[2] = _a.fxyzw[2] < _b.fxyzw[2] ? _a.fxyzw[2] : _b.fxyzw[2];
@@ -481,9 +481,9 @@ IMPLEMENT_TEST(xyzw , 0xf);
 	}
 
 	template<>
-	BX_FLOAT4_FORCE_INLINE float4_ref_t float4_max(float4_ref_t _a, float4_ref_t _b)
+	BX_SIMD_FORCE_INLINE simd128_ref_t simd_max(simd128_ref_t _a, simd128_ref_t _b)
 	{
-		float4_ref_t result;
+		simd128_ref_t result;
 		result.fxyzw[0] = _a.fxyzw[0] > _b.fxyzw[0] ? _a.fxyzw[0] : _b.fxyzw[0];
 		result.fxyzw[1] = _a.fxyzw[1] > _b.fxyzw[1] ? _a.fxyzw[1] : _b.fxyzw[1];
 		result.fxyzw[2] = _a.fxyzw[2] > _b.fxyzw[2] ? _a.fxyzw[2] : _b.fxyzw[2];
@@ -492,9 +492,9 @@ IMPLEMENT_TEST(xyzw , 0xf);
 	}
 
 	template<>
-	BX_FLOAT4_FORCE_INLINE float4_ref_t float4_and(float4_ref_t _a, float4_ref_t _b)
+	BX_SIMD_FORCE_INLINE simd128_ref_t simd_and(simd128_ref_t _a, simd128_ref_t _b)
 	{
-		float4_ref_t result;
+		simd128_ref_t result;
 		result.uxyzw[0] = _a.uxyzw[0] & _b.uxyzw[0];
 		result.uxyzw[1] = _a.uxyzw[1] & _b.uxyzw[1];
 		result.uxyzw[2] = _a.uxyzw[2] & _b.uxyzw[2];
@@ -503,9 +503,9 @@ IMPLEMENT_TEST(xyzw , 0xf);
 	}
 
 	template<>
-	BX_FLOAT4_FORCE_INLINE float4_ref_t float4_andc(float4_ref_t _a, float4_ref_t _b)
+	BX_SIMD_FORCE_INLINE simd128_ref_t simd_andc(simd128_ref_t _a, simd128_ref_t _b)
 	{
-		float4_ref_t result;
+		simd128_ref_t result;
 		result.uxyzw[0] = _a.uxyzw[0] & ~_b.uxyzw[0];
 		result.uxyzw[1] = _a.uxyzw[1] & ~_b.uxyzw[1];
 		result.uxyzw[2] = _a.uxyzw[2] & ~_b.uxyzw[2];
@@ -514,9 +514,9 @@ IMPLEMENT_TEST(xyzw , 0xf);
 	}
 
 	template<>
-	BX_FLOAT4_FORCE_INLINE float4_ref_t float4_or(float4_ref_t _a, float4_ref_t _b)
+	BX_SIMD_FORCE_INLINE simd128_ref_t simd_or(simd128_ref_t _a, simd128_ref_t _b)
 	{
-		float4_ref_t result;
+		simd128_ref_t result;
 		result.uxyzw[0] = _a.uxyzw[0] | _b.uxyzw[0];
 		result.uxyzw[1] = _a.uxyzw[1] | _b.uxyzw[1];
 		result.uxyzw[2] = _a.uxyzw[2] | _b.uxyzw[2];
@@ -525,9 +525,9 @@ IMPLEMENT_TEST(xyzw , 0xf);
 	}
 
 	template<>
-	BX_FLOAT4_FORCE_INLINE float4_ref_t float4_xor(float4_ref_t _a, float4_ref_t _b)
+	BX_SIMD_FORCE_INLINE simd128_ref_t simd_xor(simd128_ref_t _a, simd128_ref_t _b)
 	{
-		float4_ref_t result;
+		simd128_ref_t result;
 		result.uxyzw[0] = _a.uxyzw[0] ^ _b.uxyzw[0];
 		result.uxyzw[1] = _a.uxyzw[1] ^ _b.uxyzw[1];
 		result.uxyzw[2] = _a.uxyzw[2] ^ _b.uxyzw[2];
@@ -536,9 +536,9 @@ IMPLEMENT_TEST(xyzw , 0xf);
 	}
 
 	template<>
-	BX_FLOAT4_FORCE_INLINE float4_ref_t float4_sll(float4_ref_t _a, int _count)
+	BX_SIMD_FORCE_INLINE simd128_ref_t simd_sll(simd128_ref_t _a, int _count)
 	{
-		float4_ref_t result;
+		simd128_ref_t result;
 		result.uxyzw[0] = _a.uxyzw[0] << _count;
 		result.uxyzw[1] = _a.uxyzw[1] << _count;
 		result.uxyzw[2] = _a.uxyzw[2] << _count;
@@ -547,9 +547,9 @@ IMPLEMENT_TEST(xyzw , 0xf);
 	}
 
 	template<>
-	BX_FLOAT4_FORCE_INLINE float4_ref_t float4_srl(float4_ref_t _a, int _count)
+	BX_SIMD_FORCE_INLINE simd128_ref_t simd_srl(simd128_ref_t _a, int _count)
 	{
-		float4_ref_t result;
+		simd128_ref_t result;
 		result.uxyzw[0] = _a.uxyzw[0] >> _count;
 		result.uxyzw[1] = _a.uxyzw[1] >> _count;
 		result.uxyzw[2] = _a.uxyzw[2] >> _count;
@@ -558,9 +558,9 @@ IMPLEMENT_TEST(xyzw , 0xf);
 	}
 
 	template<>
-	BX_FLOAT4_FORCE_INLINE float4_ref_t float4_sra(float4_ref_t _a, int _count)
+	BX_SIMD_FORCE_INLINE simd128_ref_t simd_sra(simd128_ref_t _a, int _count)
 	{
-		float4_ref_t result;
+		simd128_ref_t result;
 		result.ixyzw[0] = _a.ixyzw[0] >> _count;
 		result.ixyzw[1] = _a.ixyzw[1] >> _count;
 		result.ixyzw[2] = _a.ixyzw[2] >> _count;
@@ -569,9 +569,9 @@ IMPLEMENT_TEST(xyzw , 0xf);
 	}
 
 	template<>
-	BX_FLOAT4_FORCE_INLINE float4_ref_t float4_icmpeq(float4_ref_t _a, float4_ref_t _b)
+	BX_SIMD_FORCE_INLINE simd128_ref_t simd_icmpeq(simd128_ref_t _a, simd128_ref_t _b)
 	{
-		float4_ref_t result;
+		simd128_ref_t result;
 		result.ixyzw[0] = _a.ixyzw[0] == _b.ixyzw[0] ? 0xffffffff : 0x0;
 		result.ixyzw[1] = _a.ixyzw[1] == _b.ixyzw[1] ? 0xffffffff : 0x0;
 		result.ixyzw[2] = _a.ixyzw[2] == _b.ixyzw[2] ? 0xffffffff : 0x0;
@@ -580,9 +580,9 @@ IMPLEMENT_TEST(xyzw , 0xf);
 	}
 
 	template<>
-	BX_FLOAT4_FORCE_INLINE float4_ref_t float4_icmplt(float4_ref_t _a, float4_ref_t _b)
+	BX_SIMD_FORCE_INLINE simd128_ref_t simd_icmplt(simd128_ref_t _a, simd128_ref_t _b)
 	{
-		float4_ref_t result;
+		simd128_ref_t result;
 		result.ixyzw[0] = _a.ixyzw[0] < _b.ixyzw[0] ? 0xffffffff : 0x0;
 		result.ixyzw[1] = _a.ixyzw[1] < _b.ixyzw[1] ? 0xffffffff : 0x0;
 		result.ixyzw[2] = _a.ixyzw[2] < _b.ixyzw[2] ? 0xffffffff : 0x0;
@@ -591,9 +591,9 @@ IMPLEMENT_TEST(xyzw , 0xf);
 	}
 
 	template<>
-	BX_FLOAT4_FORCE_INLINE float4_ref_t float4_icmpgt(float4_ref_t _a, float4_ref_t _b)
+	BX_SIMD_FORCE_INLINE simd128_ref_t simd_icmpgt(simd128_ref_t _a, simd128_ref_t _b)
 	{
-		float4_ref_t result;
+		simd128_ref_t result;
 		result.ixyzw[0] = _a.ixyzw[0] > _b.ixyzw[0] ? 0xffffffff : 0x0;
 		result.ixyzw[1] = _a.ixyzw[1] > _b.ixyzw[1] ? 0xffffffff : 0x0;
 		result.ixyzw[2] = _a.ixyzw[2] > _b.ixyzw[2] ? 0xffffffff : 0x0;
@@ -602,9 +602,9 @@ IMPLEMENT_TEST(xyzw , 0xf);
 	}
 
 	template<>
-	BX_FLOAT4_FORCE_INLINE float4_ref_t float4_imin(float4_ref_t _a, float4_ref_t _b)
+	BX_SIMD_FORCE_INLINE simd128_ref_t simd_imin(simd128_ref_t _a, simd128_ref_t _b)
 	{
-		float4_ref_t result;
+		simd128_ref_t result;
 		result.ixyzw[0] = _a.ixyzw[0] < _b.ixyzw[0] ? _a.ixyzw[0] : _b.ixyzw[0];
 		result.ixyzw[1] = _a.ixyzw[1] < _b.ixyzw[1] ? _a.ixyzw[1] : _b.ixyzw[1];
 		result.ixyzw[2] = _a.ixyzw[2] < _b.ixyzw[2] ? _a.ixyzw[2] : _b.ixyzw[2];
@@ -613,9 +613,9 @@ IMPLEMENT_TEST(xyzw , 0xf);
 	}
 
 	template<>
-	BX_FLOAT4_FORCE_INLINE float4_ref_t float4_imax(float4_ref_t _a, float4_ref_t _b)
+	BX_SIMD_FORCE_INLINE simd128_ref_t simd_imax(simd128_ref_t _a, simd128_ref_t _b)
 	{
-		float4_ref_t result;
+		simd128_ref_t result;
 		result.ixyzw[0] = _a.ixyzw[0] > _b.ixyzw[0] ? _a.ixyzw[0] : _b.ixyzw[0];
 		result.ixyzw[1] = _a.ixyzw[1] > _b.ixyzw[1] ? _a.ixyzw[1] : _b.ixyzw[1];
 		result.ixyzw[2] = _a.ixyzw[2] > _b.ixyzw[2] ? _a.ixyzw[2] : _b.ixyzw[2];
@@ -624,9 +624,9 @@ IMPLEMENT_TEST(xyzw , 0xf);
 	}
 
 	template<>
-	BX_FLOAT4_FORCE_INLINE float4_ref_t float4_iadd(float4_ref_t _a, float4_ref_t _b)
+	BX_SIMD_FORCE_INLINE simd128_ref_t simd_iadd(simd128_ref_t _a, simd128_ref_t _b)
 	{
-		float4_ref_t result;
+		simd128_ref_t result;
 		result.ixyzw[0] = _a.ixyzw[0] + _b.ixyzw[0];
 		result.ixyzw[1] = _a.ixyzw[1] + _b.ixyzw[1];
 		result.ixyzw[2] = _a.ixyzw[2] + _b.ixyzw[2];
@@ -635,9 +635,9 @@ IMPLEMENT_TEST(xyzw , 0xf);
 	}
 
 	template<>
-	BX_FLOAT4_FORCE_INLINE float4_ref_t float4_isub(float4_ref_t _a, float4_ref_t _b)
+	BX_SIMD_FORCE_INLINE simd128_ref_t simd_isub(simd128_ref_t _a, simd128_ref_t _b)
 	{
-		float4_ref_t result;
+		simd128_ref_t result;
 		result.ixyzw[0] = _a.ixyzw[0] - _b.ixyzw[0];
 		result.ixyzw[1] = _a.ixyzw[1] - _b.ixyzw[1];
 		result.ixyzw[2] = _a.ixyzw[2] - _b.ixyzw[2];
@@ -647,4 +647,4 @@ IMPLEMENT_TEST(xyzw , 0xf);
 
 } // namespace bx
 
-#endif // BX_FLOAT4_REF_H_HEADER_GUARD
+#endif // BX_SIMD_REF_H_HEADER_GUARD

+ 647 - 0
include/bx/simd128_sse.inl

@@ -0,0 +1,647 @@
+/*
+ * Copyright 2010-2016 Branimir Karadzic. All rights reserved.
+ * License: https://github.com/bkaradzic/bx#license-bsd-2-clause
+ */
+
+#ifndef BX_SIMD_SSE_H_HEADER_GUARD
+#define BX_SIMD_SSE_H_HEADER_GUARD
+
+#include "simd_ni.inl"
+
+namespace bx
+{
+#define ELEMx 0
+#define ELEMy 1
+#define ELEMz 2
+#define ELEMw 3
+#define IMPLEMENT_SWIZZLE(_x, _y, _z, _w) \
+			template<> \
+			BX_SIMD_FORCE_INLINE simd128_sse_t simd_swiz_##_x##_y##_z##_w(simd128_sse_t _a) \
+			{ \
+				return _mm_shuffle_ps( _a, _a, _MM_SHUFFLE(ELEM##_w, ELEM##_z, ELEM##_y, ELEM##_x ) ); \
+			}
+
+#include "simd_swizzle.inl"
+
+#undef IMPLEMENT_SWIZZLE
+#undef ELEMw
+#undef ELEMz
+#undef ELEMy
+#undef ELEMx
+
+#define IMPLEMENT_TEST(_xyzw, _mask) \
+			template<> \
+			BX_SIMD_FORCE_INLINE bool simd_test_any_##_xyzw(simd128_sse_t _test) \
+			{ \
+				return 0x0 != (_mm_movemask_ps(_test)&(_mask) ); \
+			} \
+			\
+			template<> \
+			BX_SIMD_FORCE_INLINE bool simd_test_all_##_xyzw(simd128_sse_t _test) \
+			{ \
+				return (_mask) == (_mm_movemask_ps(_test)&(_mask) ); \
+			}
+
+IMPLEMENT_TEST(x    , 0x1);
+IMPLEMENT_TEST(y    , 0x2);
+IMPLEMENT_TEST(xy   , 0x3);
+IMPLEMENT_TEST(z    , 0x4);
+IMPLEMENT_TEST(xz   , 0x5);
+IMPLEMENT_TEST(yz   , 0x6);
+IMPLEMENT_TEST(xyz  , 0x7);
+IMPLEMENT_TEST(w    , 0x8);
+IMPLEMENT_TEST(xw   , 0x9);
+IMPLEMENT_TEST(yw   , 0xa);
+IMPLEMENT_TEST(xyw  , 0xb);
+IMPLEMENT_TEST(zw   , 0xc);
+IMPLEMENT_TEST(xzw  , 0xd);
+IMPLEMENT_TEST(yzw  , 0xe);
+IMPLEMENT_TEST(xyzw , 0xf);
+
+#undef IMPLEMENT_TEST
+
+	template<>
+	BX_SIMD_FORCE_INLINE simd128_sse_t simd_shuf_xyAB(simd128_sse_t _a, simd128_sse_t _b)
+	{
+		return _mm_movelh_ps(_a, _b);
+	}
+
+	template<>
+	BX_SIMD_FORCE_INLINE simd128_sse_t simd_shuf_ABxy(simd128_sse_t _a, simd128_sse_t _b)
+	{
+		return _mm_movelh_ps(_b, _a);
+	}
+
+	template<>
+	BX_SIMD_FORCE_INLINE simd128_sse_t simd_shuf_CDzw(simd128_sse_t _a, simd128_sse_t _b)
+	{
+		return _mm_movehl_ps(_a, _b);
+	}
+
+	template<>
+	BX_SIMD_FORCE_INLINE simd128_sse_t simd_shuf_zwCD(simd128_sse_t _a, simd128_sse_t _b)
+	{
+		return _mm_movehl_ps(_b, _a);
+	}
+
+	template<>
+	BX_SIMD_FORCE_INLINE simd128_sse_t simd_shuf_xAyB(simd128_sse_t _a, simd128_sse_t _b)
+	{
+		return _mm_unpacklo_ps(_a, _b);
+	}
+
+	template<>
+	BX_SIMD_FORCE_INLINE simd128_sse_t simd_shuf_yBxA(simd128_sse_t _a, simd128_sse_t _b)
+	{
+		return _mm_unpacklo_ps(_b, _a);
+	}
+
+	template<>
+	BX_SIMD_FORCE_INLINE simd128_sse_t simd_shuf_zCwD(simd128_sse_t _a, simd128_sse_t _b)
+	{
+		return _mm_unpackhi_ps(_a, _b);
+	}
+
+	template<>
+	BX_SIMD_FORCE_INLINE simd128_sse_t simd_shuf_CzDw(simd128_sse_t _a, simd128_sse_t _b)
+	{
+		return _mm_unpackhi_ps(_b, _a);
+	}
+
+	template<>
+	BX_SIMD_FORCE_INLINE float simd_x(simd128_sse_t _a)
+	{
+		return _mm_cvtss_f32(_a);
+	}
+
+	template<>
+	BX_SIMD_FORCE_INLINE float simd_y(simd128_sse_t _a)
+	{
+		const simd128_sse_t yyyy = simd_swiz_yyyy(_a);
+		const float result  = _mm_cvtss_f32(yyyy);
+
+		return result;
+	}
+
+	template<>
+	BX_SIMD_FORCE_INLINE float simd_z(simd128_sse_t _a)
+	{
+		const simd128_sse_t zzzz = simd_swiz_zzzz(_a);
+		const float result  = _mm_cvtss_f32(zzzz);
+
+		return result;
+	}
+
+	template<>
+	BX_SIMD_FORCE_INLINE float simd_w(simd128_sse_t _a)
+	{
+		const simd128_sse_t wwww = simd_swiz_wwww(_a);
+		const float result  = _mm_cvtss_f32(wwww);
+
+		return result;
+	}
+
+	template<>
+	BX_SIMD_FORCE_INLINE simd128_sse_t simd_ld(const void* _ptr)
+	{
+		return _mm_load_ps(reinterpret_cast<const float*>(_ptr) );
+	}
+
+	template<>
+	BX_SIMD_FORCE_INLINE void simd_st(void* _ptr, simd128_sse_t _a)
+	{
+		_mm_store_ps(reinterpret_cast<float*>(_ptr), _a);
+	}
+
+	template<>
+	BX_SIMD_FORCE_INLINE void simd_stx(void* _ptr, simd128_sse_t _a)
+	{
+		_mm_store_ss(reinterpret_cast<float*>(_ptr), _a);
+	}
+
+	template<>
+	BX_SIMD_FORCE_INLINE void simd_stream(void* _ptr, simd128_sse_t _a)
+	{
+		_mm_stream_ps(reinterpret_cast<float*>(_ptr), _a);
+	}
+
+	template<>
+	BX_SIMD_FORCE_INLINE simd128_sse_t simd_ld(float _x, float _y, float _z, float _w)
+	{
+		return _mm_set_ps(_w, _z, _y, _x);
+	}
+
+	template<>
+	BX_SIMD_FORCE_INLINE simd128_sse_t simd_ild(uint32_t _x, uint32_t _y, uint32_t _z, uint32_t _w)
+	{
+		const __m128i set     = _mm_set_epi32(_w, _z, _y, _x);
+		const simd128_sse_t result = _mm_castsi128_ps(set);
+
+		return result;
+	}
+
+	template<>
+	BX_SIMD_FORCE_INLINE simd128_sse_t simd_splat(const void* _ptr)
+	{
+		const simd128_sse_t x___   = _mm_load_ss(reinterpret_cast<const float*>(_ptr) );
+		const simd128_sse_t result = simd_swiz_xxxx(x___);
+
+		return result;
+	}
+
+	template<>
+	BX_SIMD_FORCE_INLINE simd128_sse_t simd_splat(float _a)
+	{
+		return _mm_set1_ps(_a);
+	}
+
+	template<>
+	BX_SIMD_FORCE_INLINE simd128_sse_t simd_isplat(uint32_t _a)
+	{
+		const __m128i splat   = _mm_set1_epi32(_a);
+		const simd128_sse_t result = _mm_castsi128_ps(splat);
+
+		return result;
+	}
+
+	template<>
+	BX_SIMD_FORCE_INLINE simd128_sse_t simd_zero()
+	{
+		return _mm_setzero_ps();
+	}
+
+	template<>
+	BX_SIMD_FORCE_INLINE simd128_sse_t simd_itof(simd128_sse_t _a)
+	{
+		const __m128i  itof   = _mm_castps_si128(_a);
+		const simd128_sse_t result = _mm_cvtepi32_ps(itof);
+
+		return result;
+	}
+
+	template<>
+	BX_SIMD_FORCE_INLINE simd128_sse_t simd_ftoi(simd128_sse_t _a)
+	{
+		const __m128i ftoi    = _mm_cvtps_epi32(_a);
+		const simd128_sse_t result = _mm_castsi128_ps(ftoi);
+
+		return result;
+	}
+
+	template<>
+	BX_SIMD_FORCE_INLINE simd128_sse_t simd_round(simd128_sse_t _a)
+	{
+#if defined(__SSE4_1__)
+		return _mm_round_ps(_a, _MM_FROUND_NINT);
+#else
+		const __m128i round   = _mm_cvtps_epi32(_a);
+		const simd128_sse_t result = _mm_cvtepi32_ps(round);
+
+		return result;
+#endif // defined(__SSE4_1__)
+	}
+
+	template<>
+	BX_SIMD_FORCE_INLINE simd128_sse_t simd_add(simd128_sse_t _a, simd128_sse_t _b)
+	{
+		return _mm_add_ps(_a, _b);
+	}
+
+	template<>
+	BX_SIMD_FORCE_INLINE simd128_sse_t simd_sub(simd128_sse_t _a, simd128_sse_t _b)
+	{
+		return _mm_sub_ps(_a, _b);
+	}
+
+	template<>
+	BX_SIMD_FORCE_INLINE simd128_sse_t simd_mul(simd128_sse_t _a, simd128_sse_t _b)
+	{
+		return _mm_mul_ps(_a, _b);
+	}
+
+	template<>
+	BX_SIMD_FORCE_INLINE simd128_sse_t simd_div(simd128_sse_t _a, simd128_sse_t _b)
+	{
+		return _mm_div_ps(_a, _b);
+	}
+
+	template<>
+	BX_SIMD_FORCE_INLINE simd128_sse_t simd_rcp_est(simd128_sse_t _a)
+	{
+		return _mm_rcp_ps(_a);
+	}
+
+	template<>
+	BX_SIMD_FORCE_INLINE simd128_sse_t simd_sqrt(simd128_sse_t _a)
+	{
+		return _mm_sqrt_ps(_a);
+	}
+
+	template<>
+	BX_SIMD_FORCE_INLINE simd128_sse_t simd_rsqrt_est(simd128_sse_t _a)
+	{
+		return _mm_rsqrt_ps(_a);
+	}
+
+	template<>
+	BX_SIMD_FORCE_INLINE simd128_sse_t simd_dot3(simd128_sse_t _a, simd128_sse_t _b)
+	{
+#if defined(__SSE4_1__)
+		return _mm_dp_ps(_a, _b, 0x77);
+#else
+		return simd_dot3_ni(_a, _b);
+#endif // defined(__SSE4_1__)
+	}
+
+	template<>
+	BX_SIMD_FORCE_INLINE simd128_sse_t simd_dot(simd128_sse_t _a, simd128_sse_t _b)
+	{
+#if defined(__SSE4_1__)
+		return _mm_dp_ps(_a, _b, 0xFF);
+#else
+		return simd_dot_ni(_a, _b);
+#endif // defined(__SSE4_1__)
+	}
+
+	template<>
+	BX_SIMD_FORCE_INLINE simd128_sse_t simd_cmpeq(simd128_sse_t _a, simd128_sse_t _b)
+	{
+		return _mm_cmpeq_ps(_a, _b);
+	}
+
+	template<>
+	BX_SIMD_FORCE_INLINE simd128_sse_t simd_cmplt(simd128_sse_t _a, simd128_sse_t _b)
+	{
+		return _mm_cmplt_ps(_a, _b);
+	}
+
+	template<>
+	BX_SIMD_FORCE_INLINE simd128_sse_t simd_cmple(simd128_sse_t _a, simd128_sse_t _b)
+	{
+		return _mm_cmple_ps(_a, _b);
+	}
+
+	template<>
+	BX_SIMD_FORCE_INLINE simd128_sse_t simd_cmpgt(simd128_sse_t _a, simd128_sse_t _b)
+	{
+		return _mm_cmpgt_ps(_a, _b);
+	}
+
+	template<>
+	BX_SIMD_FORCE_INLINE simd128_sse_t simd_cmpge(simd128_sse_t _a, simd128_sse_t _b)
+	{
+		return _mm_cmpge_ps(_a, _b);
+	}
+
+	template<>
+	BX_SIMD_FORCE_INLINE simd128_sse_t simd_min(simd128_sse_t _a, simd128_sse_t _b)
+	{
+		return _mm_min_ps(_a, _b);
+	}
+
+	template<>
+	BX_SIMD_FORCE_INLINE simd128_sse_t simd_max(simd128_sse_t _a, simd128_sse_t _b)
+	{
+		return _mm_max_ps(_a, _b);
+	}
+
+	template<>
+	BX_SIMD_FORCE_INLINE simd128_sse_t simd_and(simd128_sse_t _a, simd128_sse_t _b)
+	{
+		return _mm_and_ps(_a, _b);
+	}
+
+	template<>
+	BX_SIMD_FORCE_INLINE simd128_sse_t simd_andc(simd128_sse_t _a, simd128_sse_t _b)
+	{
+		return _mm_andnot_ps(_b, _a);
+	}
+
+	template<>
+	BX_SIMD_FORCE_INLINE simd128_sse_t simd_or(simd128_sse_t _a, simd128_sse_t _b)
+	{
+		return _mm_or_ps(_a, _b);
+	}
+
+	template<>
+	BX_SIMD_FORCE_INLINE simd128_sse_t simd_xor(simd128_sse_t _a, simd128_sse_t _b)
+	{
+		return _mm_xor_ps(_a, _b);
+	}
+
+	template<>
+	BX_SIMD_FORCE_INLINE simd128_sse_t simd_sll(simd128_sse_t _a, int _count)
+	{
+		const __m128i a       = _mm_castps_si128(_a);
+		const __m128i shift   = _mm_slli_epi32(a, _count);
+		const simd128_sse_t result = _mm_castsi128_ps(shift);
+
+		return result;
+	}
+
+	template<>
+	BX_SIMD_FORCE_INLINE simd128_sse_t simd_srl(simd128_sse_t _a, int _count)
+	{
+		const __m128i a       = _mm_castps_si128(_a);
+		const __m128i shift   = _mm_srli_epi32(a, _count);
+		const simd128_sse_t result = _mm_castsi128_ps(shift);
+
+		return result;
+	}
+
+	template<>
+	BX_SIMD_FORCE_INLINE simd128_sse_t simd_sra(simd128_sse_t _a, int _count)
+	{
+		const __m128i a       = _mm_castps_si128(_a);
+		const __m128i shift   = _mm_srai_epi32(a, _count);
+		const simd128_sse_t result = _mm_castsi128_ps(shift);
+
+		return result;
+	}
+
+	template<>
+	BX_SIMD_FORCE_INLINE simd128_sse_t simd_icmpeq(simd128_sse_t _a, simd128_sse_t _b)
+	{
+		const __m128i tmp0    = _mm_castps_si128(_a);
+		const __m128i tmp1    = _mm_castps_si128(_b);
+		const __m128i tmp2    = _mm_cmpeq_epi32(tmp0, tmp1);
+		const simd128_sse_t result = _mm_castsi128_ps(tmp2);
+
+		return result;
+	}
+
+	template<>
+	BX_SIMD_FORCE_INLINE simd128_sse_t simd_icmplt(simd128_sse_t _a, simd128_sse_t _b)
+	{
+		const __m128i tmp0    = _mm_castps_si128(_a);
+		const __m128i tmp1    = _mm_castps_si128(_b);
+		const __m128i tmp2    = _mm_cmplt_epi32(tmp0, tmp1);
+		const simd128_sse_t result = _mm_castsi128_ps(tmp2);
+
+		return result;
+	}
+
+	template<>
+	BX_SIMD_FORCE_INLINE simd128_sse_t simd_icmpgt(simd128_sse_t _a, simd128_sse_t _b)
+	{
+		const __m128i tmp0    = _mm_castps_si128(_a);
+		const __m128i tmp1    = _mm_castps_si128(_b);
+		const __m128i tmp2    = _mm_cmpgt_epi32(tmp0, tmp1);
+		const simd128_sse_t result = _mm_castsi128_ps(tmp2);
+
+		return result;
+	}
+
+	template<>
+	BX_SIMD_FORCE_INLINE simd128_sse_t simd_imin(simd128_sse_t _a, simd128_sse_t _b)
+	{
+#if defined(__SSE4_1__)
+		const __m128i tmp0    = _mm_castps_si128(_a);
+		const __m128i tmp1    = _mm_castps_si128(_b);
+		const __m128i tmp2    = _mm_min_epi32(tmp0, tmp1);
+		const simd128_sse_t result = _mm_castsi128_ps(tmp2);
+
+		return result;
+#else
+		return simd_imin_ni(_a, _b);
+#endif // defined(__SSE4_1__)
+	}
+
+	template<>
+	BX_SIMD_FORCE_INLINE simd128_sse_t simd_imax(simd128_sse_t _a, simd128_sse_t _b)
+	{
+#if defined(__SSE4_1__)
+		const __m128i tmp0    = _mm_castps_si128(_a);
+		const __m128i tmp1    = _mm_castps_si128(_b);
+		const __m128i tmp2    = _mm_max_epi32(tmp0, tmp1);
+		const simd128_sse_t result = _mm_castsi128_ps(tmp2);
+
+		return result;
+#else
+		return simd_imax_ni(_a, _b);
+#endif // defined(__SSE4_1__)
+	}
+
+	template<>
+	BX_SIMD_FORCE_INLINE simd128_sse_t simd_iadd(simd128_sse_t _a, simd128_sse_t _b)
+	{
+		const __m128i a       = _mm_castps_si128(_a);
+		const __m128i b       = _mm_castps_si128(_b);
+		const __m128i add     = _mm_add_epi32(a, b);
+		const simd128_sse_t result = _mm_castsi128_ps(add);
+
+		return result;
+	}
+
+	template<>
+	BX_SIMD_FORCE_INLINE simd128_sse_t simd_isub(simd128_sse_t _a, simd128_sse_t _b)
+	{
+		const __m128i a       = _mm_castps_si128(_a);
+		const __m128i b       = _mm_castps_si128(_b);
+		const __m128i sub     = _mm_sub_epi32(a, b);
+		const simd128_sse_t result = _mm_castsi128_ps(sub);
+
+		return result;
+	}
+
+	template<>
+	BX_SIMD_INLINE simd128_sse_t simd_shuf_xAzC(simd128_sse_t _a, simd128_sse_t _b)
+	{
+		return simd_shuf_xAzC_ni(_a, _b);
+	}
+
+	template<>
+	BX_SIMD_INLINE simd128_sse_t simd_shuf_yBwD(simd128_sse_t _a, simd128_sse_t _b)
+	{
+		return simd_shuf_yBwD_ni(_a, _b);
+	}
+
+	template<>
+	BX_SIMD_INLINE simd128_sse_t simd_rcp(simd128_sse_t _a)
+	{
+		return simd_rcp_ni(_a);
+	}
+
+	template<>
+	BX_SIMD_INLINE simd128_sse_t simd_orx(simd128_sse_t _a)
+	{
+		return simd_orx_ni(_a);
+	}
+
+	template<>
+	BX_SIMD_INLINE simd128_sse_t simd_orc(simd128_sse_t _a, simd128_sse_t _b)
+	{
+		return simd_orc_ni(_a, _b);
+	}
+
+	template<>
+	BX_SIMD_INLINE simd128_sse_t simd_neg(simd128_sse_t _a)
+	{
+		return simd_neg_ni(_a);
+	}
+
+	template<>
+	BX_SIMD_INLINE simd128_sse_t simd_madd(simd128_sse_t _a, simd128_sse_t _b, simd128_sse_t _c)
+	{
+		return simd_madd_ni(_a, _b, _c);
+	}
+
+	template<>
+	BX_SIMD_INLINE simd128_sse_t simd_nmsub(simd128_sse_t _a, simd128_sse_t _b, simd128_sse_t _c)
+	{
+		return simd_nmsub_ni(_a, _b, _c);
+	}
+
+	template<>
+	BX_SIMD_INLINE simd128_sse_t simd_div_nr(simd128_sse_t _a, simd128_sse_t _b)
+	{
+		return simd_div_nr_ni(_a, _b);
+	}
+
+	template<>
+	BX_SIMD_INLINE simd128_sse_t simd_selb(simd128_sse_t _mask, simd128_sse_t _a, simd128_sse_t _b)
+	{
+		return simd_selb_ni(_mask, _a, _b);
+	}
+
+	template<>
+	BX_SIMD_INLINE simd128_sse_t simd_sels(simd128_sse_t _test, simd128_sse_t _a, simd128_sse_t _b)
+	{
+		return simd_sels_ni(_test, _a, _b);
+	}
+
+	template<>
+	BX_SIMD_INLINE simd128_sse_t simd_not(simd128_sse_t _a)
+	{
+		return simd_not_ni(_a);
+	}
+
+	template<>
+	BX_SIMD_INLINE simd128_sse_t simd_abs(simd128_sse_t _a)
+	{
+		return simd_abs_ni(_a);
+	}
+
+	template<>
+	BX_SIMD_INLINE simd128_sse_t simd_clamp(simd128_sse_t _a, simd128_sse_t _min, simd128_sse_t _max)
+	{
+		return simd_clamp_ni(_a, _min, _max);
+	}
+
+	template<>
+	BX_SIMD_INLINE simd128_sse_t simd_lerp(simd128_sse_t _a, simd128_sse_t _b, simd128_sse_t _s)
+	{
+		return simd_lerp_ni(_a, _b, _s);
+	}
+
+	template<>
+	BX_SIMD_INLINE simd128_sse_t simd_rsqrt(simd128_sse_t _a)
+	{
+		return simd_rsqrt_ni(_a);
+	}
+
+	template<>
+	BX_SIMD_INLINE simd128_sse_t simd_rsqrt_nr(simd128_sse_t _a)
+	{
+		return simd_rsqrt_nr_ni(_a);
+	}
+
+	template<>
+	BX_SIMD_INLINE simd128_sse_t simd_rsqrt_carmack(simd128_sse_t _a)
+	{
+		return simd_rsqrt_carmack_ni(_a);
+	}
+
+	template<>
+	BX_SIMD_INLINE simd128_sse_t simd_sqrt_nr(simd128_sse_t _a)
+	{
+		return simd_sqrt_nr_ni(_a);
+	}
+
+	template<>
+	BX_SIMD_INLINE simd128_sse_t simd_log2(simd128_sse_t _a)
+	{
+		return simd_log2_ni(_a);
+	}
+
+	template<>
+	BX_SIMD_INLINE simd128_sse_t simd_exp2(simd128_sse_t _a)
+	{
+		return simd_exp2_ni(_a);
+	}
+
+	template<>
+	BX_SIMD_INLINE simd128_sse_t simd_pow(simd128_sse_t _a, simd128_sse_t _b)
+	{
+		return simd_pow_ni(_a, _b);
+	}
+
+	template<>
+	BX_SIMD_INLINE simd128_sse_t simd_cross3(simd128_sse_t _a, simd128_sse_t _b)
+	{
+		return simd_cross3_ni(_a, _b);
+	}
+
+	template<>
+	BX_SIMD_INLINE simd128_sse_t simd_normalize3(simd128_sse_t _a)
+	{
+		return simd_normalize3_ni(_a);
+	}
+
+	template<>
+	BX_SIMD_INLINE simd128_sse_t simd_ceil(simd128_sse_t _a)
+	{
+		return simd_ceil_ni(_a);
+	}
+
+	template<>
+	BX_SIMD_INLINE simd128_sse_t simd_floor(simd128_sse_t _a)
+	{
+		return simd_floor_ni(_a);
+	}
+
+	typedef simd128_sse_t simd128_t;
+
+} // namespace bx
+
+#endif // BX_SIMD_SSE_H_HEADER_GUARD

+ 558 - 0
include/bx/simd_ni.inl

@@ -0,0 +1,558 @@
+/*
+ * Copyright 2010-2016 Branimir Karadzic. All rights reserved.
+ * License: https://github.com/bkaradzic/bx#license-bsd-2-clause
+ */
+
+#ifndef BX_SIMD_NI_H_HEADER_GUARD
+#define BX_SIMD_NI_H_HEADER_GUARD
+
+namespace bx
+{
+	template<typename Ty>
+	BX_SIMD_INLINE Ty simd_shuf_xAzC_ni(Ty _a, Ty _b)
+	{
+		const Ty xAyB   = simd_shuf_xAyB(_a, _b);
+		const Ty zCwD   = simd_shuf_zCwD(_a, _b);
+		const Ty result = simd_shuf_xyAB(xAyB, zCwD);
+
+		return result;
+	}
+
+	template<typename Ty>
+	BX_SIMD_INLINE Ty simd_shuf_yBwD_ni(Ty _a, Ty _b)
+	{
+		const Ty xAyB   = simd_shuf_xAyB(_a, _b);
+		const Ty zCwD   = simd_shuf_zCwD(_a, _b);
+		const Ty result = simd_shuf_zwCD(xAyB, zCwD);
+
+		return result;
+	}
+
+	template<typename Ty>
+	BX_SIMD_INLINE Ty simd_madd_ni(Ty _a, Ty _b, Ty _c)
+	{
+		const Ty mul    = simd_mul(_a, _b);
+		const Ty result = simd_add(mul, _c);
+
+		return result;
+	}
+
+	template<typename Ty>
+	BX_SIMD_INLINE Ty simd_nmsub_ni(Ty _a, Ty _b, Ty _c)
+	{
+		const Ty mul    = simd_mul(_a, _b);
+		const Ty result = simd_sub(_c, mul);
+
+		return result;
+	}
+
+	template<typename Ty>
+	BX_SIMD_INLINE Ty simd_div_nr_ni(Ty _a, Ty _b)
+	{
+		const Ty oneish  = simd_isplat<Ty>(0x3f800001);
+		const Ty est     = simd_rcp_est(_b);
+		const Ty iter0   = simd_mul(_a, est);
+		const Ty tmp1    = simd_nmsub(_b, est, oneish);
+		const Ty result  = simd_madd(tmp1, iter0, iter0);
+
+		return result;
+	}
+
+	template<typename Ty>
+	BX_SIMD_INLINE Ty simd_rcp_ni(Ty _a)
+	{
+		const Ty one    = simd_splat<Ty>(1.0f);
+		const Ty result = simd_div(one, _a);
+
+		return result;
+	}
+
+	template<typename Ty>
+	BX_SIMD_INLINE Ty simd_orx_ni(Ty _a)
+	{
+		const Ty zwxy   = simd_swiz_zwxy(_a);
+		const Ty tmp0   = simd_or(_a, zwxy);
+		const Ty tmp1   = simd_swiz_yyyy(_a);
+		const Ty tmp2   = simd_or(tmp0, tmp1);
+		const Ty mf000  = simd_ild<Ty>(UINT32_MAX, 0, 0, 0);
+		const Ty result = simd_and(tmp2, mf000);
+
+		return result;
+	}
+
+	template<typename Ty>
+	BX_SIMD_INLINE Ty simd_orc_ni(Ty _a, Ty _b)
+	{
+		const Ty aorb   = simd_or(_a, _b);
+		const Ty mffff  = simd_isplat<Ty>(UINT32_MAX);
+		const Ty result = simd_xor(aorb, mffff);
+
+		return result;
+	}
+
+	template<typename Ty>
+	BX_SIMD_INLINE Ty simd_neg_ni(Ty _a)
+	{
+		const Ty zero   = simd_zero<Ty>();
+		const Ty result = simd_sub(zero, _a);
+
+		return result;
+	}
+
+	template<typename Ty>
+	BX_SIMD_INLINE Ty simd_selb_ni(Ty _mask, Ty _a, Ty _b)
+	{
+		const Ty sel_a  = simd_and(_a, _mask);
+		const Ty sel_b  = simd_andc(_b, _mask);
+		const Ty result = simd_or(sel_a, sel_b);
+
+		return result;
+	}
+
+	template<typename Ty>
+	BX_SIMD_INLINE Ty simd_sels_ni(Ty _test, Ty _a, Ty _b)
+	{
+		const Ty mask   = simd_sra(_test, 31);
+		const Ty result = simd_selb(mask, _a, _b);
+
+		return result;
+	}
+
+	template<typename Ty>
+	BX_SIMD_INLINE Ty simd_not_ni(Ty _a)
+	{
+		const Ty mffff  = simd_isplat<Ty>(UINT32_MAX);
+		const Ty result = simd_xor(_a, mffff);
+
+		return result;
+	}
+
+	template<typename Ty>
+	BX_SIMD_INLINE Ty simd_min_ni(Ty _a, Ty _b)
+	{
+		const Ty mask   = simd_cmplt(_a, _b);
+		const Ty result = simd_selb(mask, _a, _b);
+
+		return result;
+	}
+
+	template<typename Ty>
+	BX_SIMD_INLINE Ty simd_max_ni(Ty _a, Ty _b)
+	{
+		const Ty mask   = simd_cmpgt(_a, _b);
+		const Ty result = simd_selb(mask, _a, _b);
+
+		return result;
+	}
+
+	template<typename Ty>
+	BX_SIMD_INLINE Ty simd_abs_ni(Ty _a)
+	{
+		const Ty a_neg  = simd_neg(_a);
+		const Ty result = simd_max(a_neg, _a);
+
+		return result;
+	}
+
+	template<typename Ty>
+	BX_SIMD_INLINE Ty simd_imin_ni(Ty _a, Ty _b)
+	{
+		const Ty mask   = simd_icmplt(_a, _b);
+		const Ty result = simd_selb(mask, _a, _b);
+
+		return result;
+	}
+
+	template<typename Ty>
+	BX_SIMD_INLINE Ty simd_imax_ni(Ty _a, Ty _b)
+	{
+		const Ty mask   = simd_icmpgt(_a, _b);
+		const Ty result = simd_selb(mask, _a, _b);
+
+		return result;
+	}
+
+	template<typename Ty>
+	BX_SIMD_INLINE Ty simd_clamp_ni(Ty _a, Ty _min, Ty _max)
+	{
+		const Ty tmp    = simd_min(_a, _max);
+		const Ty result = simd_max(tmp, _min);
+
+		return result;
+	}
+
+	template<typename Ty>
+	BX_SIMD_INLINE Ty simd_lerp_ni(Ty _a, Ty _b, Ty _s)
+	{
+		const Ty ba     = simd_sub(_b, _a);
+		const Ty result = simd_madd(_s, ba, _a);
+
+		return result;
+	}
+
+	template<typename Ty>
+	BX_SIMD_INLINE Ty simd_sqrt_nr_ni(Ty _a)
+	{
+		const Ty half   = simd_splat<Ty>(0.5f);
+		const Ty one    = simd_splat<Ty>(1.0f);
+		const Ty tmp0   = simd_rsqrt_est(_a);
+		const Ty tmp1   = simd_mul(tmp0, _a);
+		const Ty tmp2   = simd_mul(tmp1, half);
+		const Ty tmp3   = simd_nmsub(tmp0, tmp1, one);
+		const Ty result = simd_madd(tmp3, tmp2, tmp1);
+
+		return result;
+	}
+
+	template<typename Ty>
+	BX_SIMD_INLINE Ty simd_sqrt_nr1_ni(Ty _a)
+	{
+		const Ty half = simd_splat<Ty>(0.5f);
+
+		Ty result = _a;
+		for (uint32_t ii = 0; ii < 11; ++ii)
+		{
+			const Ty tmp1 = simd_div(_a, result);
+			const Ty tmp2 = simd_add(tmp1, result);
+			result              = simd_mul(tmp2, half);
+		}
+
+		return result;
+	}
+
+	template<typename Ty>
+	BX_SIMD_INLINE Ty simd_rsqrt_ni(Ty _a)
+	{
+		const Ty one    = simd_splat<Ty>(1.0f);
+		const Ty sqrt   = simd_sqrt(_a);
+		const Ty result = simd_div(one, sqrt);
+
+		return result;
+	}
+
+	template<typename Ty>
+	BX_SIMD_INLINE Ty simd_rsqrt_nr_ni(Ty _a)
+	{
+		const Ty rsqrt           = simd_rsqrt_est(_a);
+		const Ty iter0           = simd_mul(_a, rsqrt);
+		const Ty iter1           = simd_mul(iter0, rsqrt);
+		const Ty half            = simd_splat<Ty>(0.5f);
+		const Ty half_rsqrt      = simd_mul(half, rsqrt);
+		const Ty three           = simd_splat<Ty>(3.0f);
+		const Ty three_sub_iter1 = simd_sub(three, iter1);
+		const Ty result          = simd_mul(half_rsqrt, three_sub_iter1);
+
+		return result;
+	}
+
+	template<typename Ty>
+	BX_SIMD_INLINE Ty simd_rsqrt_carmack_ni(Ty _a)
+	{
+		const Ty half    = simd_splat<Ty>(0.5f);
+		const Ty ah      = simd_mul(half, _a);
+		const Ty ashift  = simd_sra(_a, 1);
+		const Ty magic   = simd_isplat<Ty>(0x5f3759df);
+		const Ty msuba   = simd_isub(magic, ashift);
+		const Ty msubasq = simd_mul(msuba, msuba);
+		const Ty tmp0    = simd_splat<Ty>(1.5f);
+		const Ty tmp1    = simd_mul(ah, msubasq);
+		const Ty tmp2    = simd_sub(tmp0, tmp1);
+		const Ty result  = simd_mul(msuba, tmp2);
+
+		return result;
+	}
+
+	namespace simd_logexp_detail
+	{
+		template<typename Ty>
+		BX_SIMD_INLINE Ty simd_poly1(Ty _a, float _b, float _c)
+		{
+			const Ty bbbb   = simd_splat<Ty>(_b);
+			const Ty cccc   = simd_splat<Ty>(_c);
+			const Ty result = simd_madd(cccc, _a, bbbb);
+
+			return result;
+		}
+
+		template<typename Ty>
+		BX_SIMD_INLINE Ty simd_poly2(Ty _a, float _b, float _c, float _d)
+		{
+			const Ty bbbb   = simd_splat<Ty>(_b);
+			const Ty poly   = simd_poly1(_a, _c, _d);
+			const Ty result = simd_madd(poly, _a, bbbb);
+
+			return result;
+		}
+
+		template<typename Ty>
+		BX_SIMD_INLINE Ty simd_poly3(Ty _a, float _b, float _c, float _d, float _e)
+		{
+			const Ty bbbb   = simd_splat<Ty>(_b);
+			const Ty poly   = simd_poly2(_a, _c, _d, _e);
+			const Ty result = simd_madd(poly, _a, bbbb);
+
+			return result;
+		}
+
+		template<typename Ty>
+		BX_SIMD_INLINE Ty simd_poly4(Ty _a, float _b, float _c, float _d, float _e, float _f)
+		{
+			const Ty bbbb   = simd_splat<Ty>(_b);
+			const Ty poly   = simd_poly3(_a, _c, _d, _e, _f);
+			const Ty result = simd_madd(poly, _a, bbbb);
+
+			return result;
+		}
+
+		template<typename Ty>
+		BX_SIMD_INLINE Ty simd_poly5(Ty _a, float _b, float _c, float _d, float _e, float _f, float _g)
+		{
+			const Ty bbbb   = simd_splat<Ty>(_b);
+			const Ty poly   = simd_poly4(_a, _c, _d, _e, _f, _g);
+			const Ty result = simd_madd(poly, _a, bbbb);
+
+			return result;
+		}
+
+		// Polynomial approximation of log2 over the mantissa range [1, 2).
+		// The preprocessor picks the degree (5 down to 2): higher degree
+		// trades extra madd operations for accuracy. Only the degree-5
+		// variant is currently enabled.
+		template<typename Ty>
+		BX_SIMD_INLINE Ty simd_logpoly(Ty _a)
+		{
+#if 1
+			const Ty result = simd_poly5(_a
+				, 3.11578814719469302614f, -3.32419399085241980044f
+				, 2.59883907202499966007f, -1.23152682416275988241f
+				, 0.318212422185251071475f, -0.0344359067839062357313f
+				);
+#elif 0
+			const Ty result = simd_poly4(_a
+				, 2.8882704548164776201f, -2.52074962577807006663f
+				, 1.48116647521213171641f, -0.465725644288844778798f
+				, 0.0596515482674574969533f
+				);
+#elif 0
+			const Ty result = simd_poly3(_a
+				, 2.61761038894603480148f, -1.75647175389045657003f
+				, 0.688243882994381274313f, -0.107254423828329604454f
+				);
+#else
+			const Ty result = simd_poly2(_a
+				, 2.28330284476918490682f, -1.04913055217340124191f
+				, 0.204446009836232697516f
+				);
+#endif
+
+			return result;
+		}
+
+		// Polynomial approximation of exp2 over the fractional range.
+		// As with simd_logpoly, the preprocessor selects the degree;
+		// only the degree-5 variant is currently enabled.
+		template<typename Ty>
+		BX_SIMD_INLINE Ty simd_exppoly(Ty _a)
+		{
+#if 1
+			const Ty result = simd_poly5(_a
+				, 9.9999994e-1f, 6.9315308e-1f
+				, 2.4015361e-1f, 5.5826318e-2f
+				, 8.9893397e-3f, 1.8775767e-3f
+				);
+#elif 0
+			const Ty result = simd_poly4(_a
+				, 1.0000026f, 6.9300383e-1f
+				, 2.4144275e-1f, 5.2011464e-2f
+				, 1.3534167e-2f
+				);
+#elif 0
+			const Ty result = simd_poly3(_a
+				, 9.9992520e-1f, 6.9583356e-1f
+				, 2.2606716e-1f, 7.8024521e-2f
+				);
+#else
+			const Ty result = simd_poly2(_a
+				, 1.0017247f, 6.5763628e-1f
+				, 3.3718944e-1f
+				);
+#endif // 0
+
+			return result;
+		}
+	} // namespace simd_internal
+
+	// Approximate base-2 logarithm, valid for positive finite inputs.
+	// Decomposes _a = 2^exp * mant with mant in [1, 2), evaluates a
+	// polynomial approximation of log2(mant), and adds the exponent back.
+	template<typename Ty>
+	BX_SIMD_INLINE Ty simd_log2_ni(Ty _a)
+	{
+		const Ty expmask  = simd_isplat<Ty>(0x7f800000);
+		const Ty mantmask = simd_isplat<Ty>(0x007fffff);
+		const Ty one      = simd_splat<Ty>(1.0f);
+
+		// Extract the unbiased IEEE-754 exponent as a float.
+		const Ty c127     = simd_isplat<Ty>(127);
+		const Ty aexp     = simd_and(_a, expmask);
+		const Ty aexpsr   = simd_srl(aexp, 23);
+		const Ty tmp0     = simd_isub(aexpsr, c127);
+		const Ty exp      = simd_itof(tmp0);
+
+		// Rebuild the mantissa as a float in [1, 2).
+		const Ty amask    = simd_and(_a, mantmask);
+		const Ty mant     = simd_or(amask, one);
+
+		// Qualify with the namespace the helpers are actually defined in
+		// (closed above as `simd_internal`); the previous
+		// `simd_logexp_detail` qualifier does not match it.
+		const Ty poly     = simd_internal::simd_logpoly(mant);
+
+		// The polynomial approximates log2 in terms of mant - 1 in [0, 1).
+		const Ty mandiff  = simd_sub(mant, one);
+		const Ty result   = simd_madd(poly, mandiff, exp);
+
+		return result;
+	}
+
+	// Approximate base-2 exponential. The input is clamped to the
+	// representable exponent range, split into integer and fractional
+	// parts, and recombined as 2^ipart * poly(fpart).
+	template<typename Ty>
+	BX_SIMD_INLINE Ty simd_exp2_ni(Ty _a)
+	{
+		// Clamp _a to [-126.99999, 129] so the exponent-bias arithmetic
+		// below cannot overflow. (The names follow the min/max operation
+		// each constant feeds, not the constants' numeric order.)
+		const Ty min      = simd_splat<Ty>( 129.0f);
+		const Ty max      = simd_splat<Ty>(-126.99999f);
+		const Ty tmp0     = simd_min(_a, min);
+		const Ty aaaa     = simd_max(tmp0, max);
+
+		// Split into an integer part and a fractional remainder.
+		const Ty half     = simd_splat<Ty>(0.5f);
+		const Ty tmp2     = simd_sub(aaaa, half);
+		const Ty ipart    = simd_ftoi(tmp2);
+		const Ty iround   = simd_itof(ipart);
+		const Ty fpart    = simd_sub(aaaa, iround);
+
+		// Build 2^ipart directly in the IEEE-754 exponent field.
+		const Ty c127     = simd_isplat<Ty>(127);
+		const Ty tmp5     = simd_iadd(ipart, c127);
+		const Ty expipart = simd_sll(tmp5, 23);
+
+		// Qualify with the namespace the helpers are actually defined in
+		// (closed above as `simd_internal`); the previous
+		// `simd_logexp_detail` qualifier does not match it.
+		const Ty expfpart = simd_internal::simd_exppoly(fpart);
+
+		const Ty result   = simd_mul(expipart, expfpart);
+
+		return result;
+	}
+
+	// _a^_b computed as exp2(_b * log2(_a)). Inherits the accuracy limits
+	// of the log2/exp2 approximations; meaningful only for _a > 0.
+	template<typename Ty>
+	BX_SIMD_INLINE Ty simd_pow_ni(Ty _a, Ty _b)
+	{
+		const Ty alog2  = simd_log2(_a);
+		const Ty alog2b = simd_mul(alog2, _b);
+		const Ty result = simd_exp2(alog2b);
+
+		return result;
+	}
+
+	// 3-element dot product; the scalar result is replicated into all lanes.
+	template<typename Ty>
+	BX_SIMD_INLINE Ty simd_dot3_ni(Ty _a, Ty _b)
+	{
+		const Ty prod  = simd_mul(_a, _b);
+		const Ty sumxy = simd_add(simd_swiz_xxxx(prod), simd_swiz_yyyy(prod) );
+		return simd_add(simd_swiz_zzzz(prod), sumxy);
+	}
+
+	// 3D cross product of the xyz lanes. Uses the two-swizzle form in the
+	// #else branch, which needs 3 swizzles instead of 4. The w lane of the
+	// result is not meaningful (it evaluates to a.w*b.w - a.w*b.w).
+	template<typename Ty>
+	BX_SIMD_INLINE Ty simd_cross3_ni(Ty _a, Ty _b)
+	{
+		// a.yzx * b.zxy - a.zxy * b.yzx == (a * b.yzx - a.yzx * b).yzx
+#if 0
+		const Ty a_yzxw = simd_swiz_yzxw(_a);
+		const Ty a_zxyw = simd_swiz_zxyw(_a);
+		const Ty b_zxyw = simd_swiz_zxyw(_b);
+		const Ty b_yzxw = simd_swiz_yzxw(_b);
+		const Ty tmp    = simd_mul(a_yzxw, b_zxyw);
+		const Ty result = simd_nmsub(a_zxyw, b_yzxw, tmp);
+#else
+		const Ty a_yzxw = simd_swiz_yzxw(_a);
+		const Ty b_yzxw = simd_swiz_yzxw(_b);
+		const Ty tmp0   = simd_mul(_a, b_yzxw);
+		const Ty tmp1   = simd_nmsub(a_yzxw, _b, tmp0);
+		const Ty result = simd_swiz_yzxw(tmp1);
+#endif
+
+		return result;
+	}
+
+	// Normalizes the xyz part of _a via the reciprocal square root of its
+	// squared length; accuracy follows that of simd_rsqrt.
+	template<typename Ty>
+	BX_SIMD_INLINE Ty simd_normalize3_ni(Ty _a)
+	{
+		const Ty lenSq = simd_dot3(_a, _a);
+		return simd_mul(_a, simd_rsqrt(lenSq) );
+	}
+
+	// 4-element dot product via two rotate-and-add reduction steps; the
+	// scalar result is replicated into all lanes.
+	template<typename Ty>
+	BX_SIMD_INLINE Ty simd_dot_ni(Ty _a, Ty _b)
+	{
+		const Ty prod = simd_mul(_a, _b);
+		const Ty part = simd_add(prod, simd_swiz_yzwx(prod) );
+		return simd_add(part, simd_swiz_zwxy(part) );
+	}
+
+	// Ceiling: round-trip through the integer pipeline, then add 1.0 to
+	// any lane where the rounded value fell below the input.
+	template<typename Ty>
+	BX_SIMD_INLINE Ty simd_ceil_ni(Ty _a)
+	{
+		const Ty tmp0   = simd_ftoi(_a);
+		const Ty tmp1   = simd_itof(tmp0);
+		const Ty mask   = simd_cmplt(tmp1, _a); // lanes rounded downward
+		const Ty one    = simd_splat<Ty>(1.0f);
+		const Ty tmp2   = simd_and(one, mask);  // 1.0 where correction needed
+		const Ty result = simd_add(tmp1, tmp2);
+
+		return result;
+	}
+
+	// Floor: round-trip through the integer pipeline, then subtract 1.0
+	// from any lane where the rounded value exceeds the input.
+	template<typename Ty>
+	BX_SIMD_INLINE Ty simd_floor_ni(Ty _a)
+	{
+		const Ty tmp0   = simd_ftoi(_a);
+		const Ty tmp1   = simd_itof(tmp0);
+		const Ty mask   = simd_cmpgt(tmp1, _a); // lanes rounded upward
+		const Ty one    = simd_splat<Ty>(1.0f);
+		const Ty tmp2   = simd_and(one, mask);  // 1.0 where correction needed
+		const Ty result = simd_sub(tmp1, tmp2);
+
+		return result;
+	}
+
+	// Rounds by a float->int->float round trip; the rounding mode is
+	// whatever the backend's simd_ftoi uses.
+	template<typename Ty>
+	BX_SIMD_FORCE_INLINE Ty simd_round_ni(Ty _a)
+	{
+		return simd_itof(simd_ftoi(_a) );
+	}
+
+	// True if any lane of _a has its sign bit set. simd_sra broadcasts
+	// each sign bit across its lane, then two swizzle/or steps reduce all
+	// four lanes into lane x, which is read back through memory.
+	template<typename Ty>
+	BX_SIMD_INLINE bool simd_test_any_ni(Ty _a)
+	{
+		const Ty mask   = simd_sra(_a, 31);
+		const Ty zwxy   = simd_swiz_zwxy(mask);
+		const Ty tmp0   = simd_or(mask, zwxy);
+		const Ty tmp1   = simd_swiz_yyyy(tmp0);
+		const Ty tmp2   = simd_or(tmp0, tmp1);
+		int res;
+		simd_stx(&res, tmp2); // store lane x only
+		return 0 != res;
+	}
+
+	// True if every lane of _a has its sign bit set. Each lane contributes
+	// a distinct bit (1, 2, 4, 8) when its sign is set; the or-reduction
+	// into lane x yields 0xf exactly when all four were set.
+	template<typename Ty>
+	BX_SIMD_INLINE bool simd_test_all_ni(Ty _a)
+	{
+		const Ty bits   = simd_sra(_a, 31);
+		const Ty m1248  = simd_ild<Ty>(1, 2, 4, 8);
+		const Ty mask   = simd_and(bits, m1248);
+		const Ty zwxy   = simd_swiz_zwxy(mask);
+		const Ty tmp0   = simd_or(mask, zwxy);
+		const Ty tmp1   = simd_swiz_yyyy(tmp0);
+		const Ty tmp2   = simd_or(tmp0, tmp1);
+		int res;
+		simd_stx(&res, tmp2); // store lane x only
+		return 0xf == res;
+	}
+
+} // namespace bx
+
+#endif // BX_SIMD_NI_H_HEADER_GUARD

+ 266 - 266
include/bx/float4_swizzle.inl → include/bx/simd_swizzle.inl

@@ -1,266 +1,266 @@
-/*
- * Copyright 2010-2015 Branimir Karadzic. All rights reserved.
- * License: http://www.opensource.org/licenses/BSD-2-Clause
- */
-
-#ifndef BX_FLOAT4_T_H_HEADER_GUARD
-#	error "xmacro file, must be included from float4_*.h"
-#endif // BX_FLOAT4_T_H_HEADER_GUARD
-
-// included from float4_t.h
-IMPLEMENT_SWIZZLE(x, x, x, x)
-IMPLEMENT_SWIZZLE(x, x, x, y)
-IMPLEMENT_SWIZZLE(x, x, x, z)
-IMPLEMENT_SWIZZLE(x, x, x, w)
-IMPLEMENT_SWIZZLE(x, x, y, x)
-IMPLEMENT_SWIZZLE(x, x, y, y)
-IMPLEMENT_SWIZZLE(x, x, y, z)
-IMPLEMENT_SWIZZLE(x, x, y, w)
-IMPLEMENT_SWIZZLE(x, x, z, x)
-IMPLEMENT_SWIZZLE(x, x, z, y)
-IMPLEMENT_SWIZZLE(x, x, z, z)
-IMPLEMENT_SWIZZLE(x, x, z, w)
-IMPLEMENT_SWIZZLE(x, x, w, x)
-IMPLEMENT_SWIZZLE(x, x, w, y)
-IMPLEMENT_SWIZZLE(x, x, w, z)
-IMPLEMENT_SWIZZLE(x, x, w, w)
-IMPLEMENT_SWIZZLE(x, y, x, x)
-IMPLEMENT_SWIZZLE(x, y, x, y)
-IMPLEMENT_SWIZZLE(x, y, x, z)
-IMPLEMENT_SWIZZLE(x, y, x, w)
-IMPLEMENT_SWIZZLE(x, y, y, x)
-IMPLEMENT_SWIZZLE(x, y, y, y)
-IMPLEMENT_SWIZZLE(x, y, y, z)
-IMPLEMENT_SWIZZLE(x, y, y, w)
-IMPLEMENT_SWIZZLE(x, y, z, x)
-IMPLEMENT_SWIZZLE(x, y, z, y)
-IMPLEMENT_SWIZZLE(x, y, z, z)
-// IMPLEMENT_SWIZZLE(x, y, z, w)
-IMPLEMENT_SWIZZLE(x, y, w, x)
-IMPLEMENT_SWIZZLE(x, y, w, y)
-IMPLEMENT_SWIZZLE(x, y, w, z)
-IMPLEMENT_SWIZZLE(x, y, w, w)
-IMPLEMENT_SWIZZLE(x, z, x, x)
-IMPLEMENT_SWIZZLE(x, z, x, y)
-IMPLEMENT_SWIZZLE(x, z, x, z)
-IMPLEMENT_SWIZZLE(x, z, x, w)
-IMPLEMENT_SWIZZLE(x, z, y, x)
-IMPLEMENT_SWIZZLE(x, z, y, y)
-IMPLEMENT_SWIZZLE(x, z, y, z)
-IMPLEMENT_SWIZZLE(x, z, y, w)
-IMPLEMENT_SWIZZLE(x, z, z, x)
-IMPLEMENT_SWIZZLE(x, z, z, y)
-IMPLEMENT_SWIZZLE(x, z, z, z)
-IMPLEMENT_SWIZZLE(x, z, z, w)
-IMPLEMENT_SWIZZLE(x, z, w, x)
-IMPLEMENT_SWIZZLE(x, z, w, y)
-IMPLEMENT_SWIZZLE(x, z, w, z)
-IMPLEMENT_SWIZZLE(x, z, w, w)
-IMPLEMENT_SWIZZLE(x, w, x, x)
-IMPLEMENT_SWIZZLE(x, w, x, y)
-IMPLEMENT_SWIZZLE(x, w, x, z)
-IMPLEMENT_SWIZZLE(x, w, x, w)
-IMPLEMENT_SWIZZLE(x, w, y, x)
-IMPLEMENT_SWIZZLE(x, w, y, y)
-IMPLEMENT_SWIZZLE(x, w, y, z)
-IMPLEMENT_SWIZZLE(x, w, y, w)
-IMPLEMENT_SWIZZLE(x, w, z, x)
-IMPLEMENT_SWIZZLE(x, w, z, y)
-IMPLEMENT_SWIZZLE(x, w, z, z)
-IMPLEMENT_SWIZZLE(x, w, z, w)
-IMPLEMENT_SWIZZLE(x, w, w, x)
-IMPLEMENT_SWIZZLE(x, w, w, y)
-IMPLEMENT_SWIZZLE(x, w, w, z)
-IMPLEMENT_SWIZZLE(x, w, w, w)
-IMPLEMENT_SWIZZLE(y, x, x, x)
-IMPLEMENT_SWIZZLE(y, x, x, y)
-IMPLEMENT_SWIZZLE(y, x, x, z)
-IMPLEMENT_SWIZZLE(y, x, x, w)
-IMPLEMENT_SWIZZLE(y, x, y, x)
-IMPLEMENT_SWIZZLE(y, x, y, y)
-IMPLEMENT_SWIZZLE(y, x, y, z)
-IMPLEMENT_SWIZZLE(y, x, y, w)
-IMPLEMENT_SWIZZLE(y, x, z, x)
-IMPLEMENT_SWIZZLE(y, x, z, y)
-IMPLEMENT_SWIZZLE(y, x, z, z)
-IMPLEMENT_SWIZZLE(y, x, z, w)
-IMPLEMENT_SWIZZLE(y, x, w, x)
-IMPLEMENT_SWIZZLE(y, x, w, y)
-IMPLEMENT_SWIZZLE(y, x, w, z)
-IMPLEMENT_SWIZZLE(y, x, w, w)
-IMPLEMENT_SWIZZLE(y, y, x, x)
-IMPLEMENT_SWIZZLE(y, y, x, y)
-IMPLEMENT_SWIZZLE(y, y, x, z)
-IMPLEMENT_SWIZZLE(y, y, x, w)
-IMPLEMENT_SWIZZLE(y, y, y, x)
-IMPLEMENT_SWIZZLE(y, y, y, y)
-IMPLEMENT_SWIZZLE(y, y, y, z)
-IMPLEMENT_SWIZZLE(y, y, y, w)
-IMPLEMENT_SWIZZLE(y, y, z, x)
-IMPLEMENT_SWIZZLE(y, y, z, y)
-IMPLEMENT_SWIZZLE(y, y, z, z)
-IMPLEMENT_SWIZZLE(y, y, z, w)
-IMPLEMENT_SWIZZLE(y, y, w, x)
-IMPLEMENT_SWIZZLE(y, y, w, y)
-IMPLEMENT_SWIZZLE(y, y, w, z)
-IMPLEMENT_SWIZZLE(y, y, w, w)
-IMPLEMENT_SWIZZLE(y, z, x, x)
-IMPLEMENT_SWIZZLE(y, z, x, y)
-IMPLEMENT_SWIZZLE(y, z, x, z)
-IMPLEMENT_SWIZZLE(y, z, x, w)
-IMPLEMENT_SWIZZLE(y, z, y, x)
-IMPLEMENT_SWIZZLE(y, z, y, y)
-IMPLEMENT_SWIZZLE(y, z, y, z)
-IMPLEMENT_SWIZZLE(y, z, y, w)
-IMPLEMENT_SWIZZLE(y, z, z, x)
-IMPLEMENT_SWIZZLE(y, z, z, y)
-IMPLEMENT_SWIZZLE(y, z, z, z)
-IMPLEMENT_SWIZZLE(y, z, z, w)
-IMPLEMENT_SWIZZLE(y, z, w, x)
-IMPLEMENT_SWIZZLE(y, z, w, y)
-IMPLEMENT_SWIZZLE(y, z, w, z)
-IMPLEMENT_SWIZZLE(y, z, w, w)
-IMPLEMENT_SWIZZLE(y, w, x, x)
-IMPLEMENT_SWIZZLE(y, w, x, y)
-IMPLEMENT_SWIZZLE(y, w, x, z)
-IMPLEMENT_SWIZZLE(y, w, x, w)
-IMPLEMENT_SWIZZLE(y, w, y, x)
-IMPLEMENT_SWIZZLE(y, w, y, y)
-IMPLEMENT_SWIZZLE(y, w, y, z)
-IMPLEMENT_SWIZZLE(y, w, y, w)
-IMPLEMENT_SWIZZLE(y, w, z, x)
-IMPLEMENT_SWIZZLE(y, w, z, y)
-IMPLEMENT_SWIZZLE(y, w, z, z)
-IMPLEMENT_SWIZZLE(y, w, z, w)
-IMPLEMENT_SWIZZLE(y, w, w, x)
-IMPLEMENT_SWIZZLE(y, w, w, y)
-IMPLEMENT_SWIZZLE(y, w, w, z)
-IMPLEMENT_SWIZZLE(y, w, w, w)
-IMPLEMENT_SWIZZLE(z, x, x, x)
-IMPLEMENT_SWIZZLE(z, x, x, y)
-IMPLEMENT_SWIZZLE(z, x, x, z)
-IMPLEMENT_SWIZZLE(z, x, x, w)
-IMPLEMENT_SWIZZLE(z, x, y, x)
-IMPLEMENT_SWIZZLE(z, x, y, y)
-IMPLEMENT_SWIZZLE(z, x, y, z)
-IMPLEMENT_SWIZZLE(z, x, y, w)
-IMPLEMENT_SWIZZLE(z, x, z, x)
-IMPLEMENT_SWIZZLE(z, x, z, y)
-IMPLEMENT_SWIZZLE(z, x, z, z)
-IMPLEMENT_SWIZZLE(z, x, z, w)
-IMPLEMENT_SWIZZLE(z, x, w, x)
-IMPLEMENT_SWIZZLE(z, x, w, y)
-IMPLEMENT_SWIZZLE(z, x, w, z)
-IMPLEMENT_SWIZZLE(z, x, w, w)
-IMPLEMENT_SWIZZLE(z, y, x, x)
-IMPLEMENT_SWIZZLE(z, y, x, y)
-IMPLEMENT_SWIZZLE(z, y, x, z)
-IMPLEMENT_SWIZZLE(z, y, x, w)
-IMPLEMENT_SWIZZLE(z, y, y, x)
-IMPLEMENT_SWIZZLE(z, y, y, y)
-IMPLEMENT_SWIZZLE(z, y, y, z)
-IMPLEMENT_SWIZZLE(z, y, y, w)
-IMPLEMENT_SWIZZLE(z, y, z, x)
-IMPLEMENT_SWIZZLE(z, y, z, y)
-IMPLEMENT_SWIZZLE(z, y, z, z)
-IMPLEMENT_SWIZZLE(z, y, z, w)
-IMPLEMENT_SWIZZLE(z, y, w, x)
-IMPLEMENT_SWIZZLE(z, y, w, y)
-IMPLEMENT_SWIZZLE(z, y, w, z)
-IMPLEMENT_SWIZZLE(z, y, w, w)
-IMPLEMENT_SWIZZLE(z, z, x, x)
-IMPLEMENT_SWIZZLE(z, z, x, y)
-IMPLEMENT_SWIZZLE(z, z, x, z)
-IMPLEMENT_SWIZZLE(z, z, x, w)
-IMPLEMENT_SWIZZLE(z, z, y, x)
-IMPLEMENT_SWIZZLE(z, z, y, y)
-IMPLEMENT_SWIZZLE(z, z, y, z)
-IMPLEMENT_SWIZZLE(z, z, y, w)
-IMPLEMENT_SWIZZLE(z, z, z, x)
-IMPLEMENT_SWIZZLE(z, z, z, y)
-IMPLEMENT_SWIZZLE(z, z, z, z)
-IMPLEMENT_SWIZZLE(z, z, z, w)
-IMPLEMENT_SWIZZLE(z, z, w, x)
-IMPLEMENT_SWIZZLE(z, z, w, y)
-IMPLEMENT_SWIZZLE(z, z, w, z)
-IMPLEMENT_SWIZZLE(z, z, w, w)
-IMPLEMENT_SWIZZLE(z, w, x, x)
-IMPLEMENT_SWIZZLE(z, w, x, y)
-IMPLEMENT_SWIZZLE(z, w, x, z)
-IMPLEMENT_SWIZZLE(z, w, x, w)
-IMPLEMENT_SWIZZLE(z, w, y, x)
-IMPLEMENT_SWIZZLE(z, w, y, y)
-IMPLEMENT_SWIZZLE(z, w, y, z)
-IMPLEMENT_SWIZZLE(z, w, y, w)
-IMPLEMENT_SWIZZLE(z, w, z, x)
-IMPLEMENT_SWIZZLE(z, w, z, y)
-IMPLEMENT_SWIZZLE(z, w, z, z)
-IMPLEMENT_SWIZZLE(z, w, z, w)
-IMPLEMENT_SWIZZLE(z, w, w, x)
-IMPLEMENT_SWIZZLE(z, w, w, y)
-IMPLEMENT_SWIZZLE(z, w, w, z)
-IMPLEMENT_SWIZZLE(z, w, w, w)
-IMPLEMENT_SWIZZLE(w, x, x, x)
-IMPLEMENT_SWIZZLE(w, x, x, y)
-IMPLEMENT_SWIZZLE(w, x, x, z)
-IMPLEMENT_SWIZZLE(w, x, x, w)
-IMPLEMENT_SWIZZLE(w, x, y, x)
-IMPLEMENT_SWIZZLE(w, x, y, y)
-IMPLEMENT_SWIZZLE(w, x, y, z)
-IMPLEMENT_SWIZZLE(w, x, y, w)
-IMPLEMENT_SWIZZLE(w, x, z, x)
-IMPLEMENT_SWIZZLE(w, x, z, y)
-IMPLEMENT_SWIZZLE(w, x, z, z)
-IMPLEMENT_SWIZZLE(w, x, z, w)
-IMPLEMENT_SWIZZLE(w, x, w, x)
-IMPLEMENT_SWIZZLE(w, x, w, y)
-IMPLEMENT_SWIZZLE(w, x, w, z)
-IMPLEMENT_SWIZZLE(w, x, w, w)
-IMPLEMENT_SWIZZLE(w, y, x, x)
-IMPLEMENT_SWIZZLE(w, y, x, y)
-IMPLEMENT_SWIZZLE(w, y, x, z)
-IMPLEMENT_SWIZZLE(w, y, x, w)
-IMPLEMENT_SWIZZLE(w, y, y, x)
-IMPLEMENT_SWIZZLE(w, y, y, y)
-IMPLEMENT_SWIZZLE(w, y, y, z)
-IMPLEMENT_SWIZZLE(w, y, y, w)
-IMPLEMENT_SWIZZLE(w, y, z, x)
-IMPLEMENT_SWIZZLE(w, y, z, y)
-IMPLEMENT_SWIZZLE(w, y, z, z)
-IMPLEMENT_SWIZZLE(w, y, z, w)
-IMPLEMENT_SWIZZLE(w, y, w, x)
-IMPLEMENT_SWIZZLE(w, y, w, y)
-IMPLEMENT_SWIZZLE(w, y, w, z)
-IMPLEMENT_SWIZZLE(w, y, w, w)
-IMPLEMENT_SWIZZLE(w, z, x, x)
-IMPLEMENT_SWIZZLE(w, z, x, y)
-IMPLEMENT_SWIZZLE(w, z, x, z)
-IMPLEMENT_SWIZZLE(w, z, x, w)
-IMPLEMENT_SWIZZLE(w, z, y, x)
-IMPLEMENT_SWIZZLE(w, z, y, y)
-IMPLEMENT_SWIZZLE(w, z, y, z)
-IMPLEMENT_SWIZZLE(w, z, y, w)
-IMPLEMENT_SWIZZLE(w, z, z, x)
-IMPLEMENT_SWIZZLE(w, z, z, y)
-IMPLEMENT_SWIZZLE(w, z, z, z)
-IMPLEMENT_SWIZZLE(w, z, z, w)
-IMPLEMENT_SWIZZLE(w, z, w, x)
-IMPLEMENT_SWIZZLE(w, z, w, y)
-IMPLEMENT_SWIZZLE(w, z, w, z)
-IMPLEMENT_SWIZZLE(w, z, w, w)
-IMPLEMENT_SWIZZLE(w, w, x, x)
-IMPLEMENT_SWIZZLE(w, w, x, y)
-IMPLEMENT_SWIZZLE(w, w, x, z)
-IMPLEMENT_SWIZZLE(w, w, x, w)
-IMPLEMENT_SWIZZLE(w, w, y, x)
-IMPLEMENT_SWIZZLE(w, w, y, y)
-IMPLEMENT_SWIZZLE(w, w, y, z)
-IMPLEMENT_SWIZZLE(w, w, y, w)
-IMPLEMENT_SWIZZLE(w, w, z, x)
-IMPLEMENT_SWIZZLE(w, w, z, y)
-IMPLEMENT_SWIZZLE(w, w, z, z)
-IMPLEMENT_SWIZZLE(w, w, z, w)
-IMPLEMENT_SWIZZLE(w, w, w, x)
-IMPLEMENT_SWIZZLE(w, w, w, y)
-IMPLEMENT_SWIZZLE(w, w, w, z)
-IMPLEMENT_SWIZZLE(w, w, w, w)
+/*
+ * Copyright 2010-2015 Branimir Karadzic. All rights reserved.
+ * License: http://www.opensource.org/licenses/BSD-2-Clause
+ */
+
+#ifndef BX_SIMD_T_H_HEADER_GUARD
+#	error "xmacro file, must be included from simd_*.h"
+#endif // BX_SIMD_T_H_HEADER_GUARD
+
+// included from simd_t.h
+IMPLEMENT_SWIZZLE(x, x, x, x)
+IMPLEMENT_SWIZZLE(x, x, x, y)
+IMPLEMENT_SWIZZLE(x, x, x, z)
+IMPLEMENT_SWIZZLE(x, x, x, w)
+IMPLEMENT_SWIZZLE(x, x, y, x)
+IMPLEMENT_SWIZZLE(x, x, y, y)
+IMPLEMENT_SWIZZLE(x, x, y, z)
+IMPLEMENT_SWIZZLE(x, x, y, w)
+IMPLEMENT_SWIZZLE(x, x, z, x)
+IMPLEMENT_SWIZZLE(x, x, z, y)
+IMPLEMENT_SWIZZLE(x, x, z, z)
+IMPLEMENT_SWIZZLE(x, x, z, w)
+IMPLEMENT_SWIZZLE(x, x, w, x)
+IMPLEMENT_SWIZZLE(x, x, w, y)
+IMPLEMENT_SWIZZLE(x, x, w, z)
+IMPLEMENT_SWIZZLE(x, x, w, w)
+IMPLEMENT_SWIZZLE(x, y, x, x)
+IMPLEMENT_SWIZZLE(x, y, x, y)
+IMPLEMENT_SWIZZLE(x, y, x, z)
+IMPLEMENT_SWIZZLE(x, y, x, w)
+IMPLEMENT_SWIZZLE(x, y, y, x)
+IMPLEMENT_SWIZZLE(x, y, y, y)
+IMPLEMENT_SWIZZLE(x, y, y, z)
+IMPLEMENT_SWIZZLE(x, y, y, w)
+IMPLEMENT_SWIZZLE(x, y, z, x)
+IMPLEMENT_SWIZZLE(x, y, z, y)
+IMPLEMENT_SWIZZLE(x, y, z, z)
+// IMPLEMENT_SWIZZLE(x, y, z, w)
+IMPLEMENT_SWIZZLE(x, y, w, x)
+IMPLEMENT_SWIZZLE(x, y, w, y)
+IMPLEMENT_SWIZZLE(x, y, w, z)
+IMPLEMENT_SWIZZLE(x, y, w, w)
+IMPLEMENT_SWIZZLE(x, z, x, x)
+IMPLEMENT_SWIZZLE(x, z, x, y)
+IMPLEMENT_SWIZZLE(x, z, x, z)
+IMPLEMENT_SWIZZLE(x, z, x, w)
+IMPLEMENT_SWIZZLE(x, z, y, x)
+IMPLEMENT_SWIZZLE(x, z, y, y)
+IMPLEMENT_SWIZZLE(x, z, y, z)
+IMPLEMENT_SWIZZLE(x, z, y, w)
+IMPLEMENT_SWIZZLE(x, z, z, x)
+IMPLEMENT_SWIZZLE(x, z, z, y)
+IMPLEMENT_SWIZZLE(x, z, z, z)
+IMPLEMENT_SWIZZLE(x, z, z, w)
+IMPLEMENT_SWIZZLE(x, z, w, x)
+IMPLEMENT_SWIZZLE(x, z, w, y)
+IMPLEMENT_SWIZZLE(x, z, w, z)
+IMPLEMENT_SWIZZLE(x, z, w, w)
+IMPLEMENT_SWIZZLE(x, w, x, x)
+IMPLEMENT_SWIZZLE(x, w, x, y)
+IMPLEMENT_SWIZZLE(x, w, x, z)
+IMPLEMENT_SWIZZLE(x, w, x, w)
+IMPLEMENT_SWIZZLE(x, w, y, x)
+IMPLEMENT_SWIZZLE(x, w, y, y)
+IMPLEMENT_SWIZZLE(x, w, y, z)
+IMPLEMENT_SWIZZLE(x, w, y, w)
+IMPLEMENT_SWIZZLE(x, w, z, x)
+IMPLEMENT_SWIZZLE(x, w, z, y)
+IMPLEMENT_SWIZZLE(x, w, z, z)
+IMPLEMENT_SWIZZLE(x, w, z, w)
+IMPLEMENT_SWIZZLE(x, w, w, x)
+IMPLEMENT_SWIZZLE(x, w, w, y)
+IMPLEMENT_SWIZZLE(x, w, w, z)
+IMPLEMENT_SWIZZLE(x, w, w, w)
+IMPLEMENT_SWIZZLE(y, x, x, x)
+IMPLEMENT_SWIZZLE(y, x, x, y)
+IMPLEMENT_SWIZZLE(y, x, x, z)
+IMPLEMENT_SWIZZLE(y, x, x, w)
+IMPLEMENT_SWIZZLE(y, x, y, x)
+IMPLEMENT_SWIZZLE(y, x, y, y)
+IMPLEMENT_SWIZZLE(y, x, y, z)
+IMPLEMENT_SWIZZLE(y, x, y, w)
+IMPLEMENT_SWIZZLE(y, x, z, x)
+IMPLEMENT_SWIZZLE(y, x, z, y)
+IMPLEMENT_SWIZZLE(y, x, z, z)
+IMPLEMENT_SWIZZLE(y, x, z, w)
+IMPLEMENT_SWIZZLE(y, x, w, x)
+IMPLEMENT_SWIZZLE(y, x, w, y)
+IMPLEMENT_SWIZZLE(y, x, w, z)
+IMPLEMENT_SWIZZLE(y, x, w, w)
+IMPLEMENT_SWIZZLE(y, y, x, x)
+IMPLEMENT_SWIZZLE(y, y, x, y)
+IMPLEMENT_SWIZZLE(y, y, x, z)
+IMPLEMENT_SWIZZLE(y, y, x, w)
+IMPLEMENT_SWIZZLE(y, y, y, x)
+IMPLEMENT_SWIZZLE(y, y, y, y)
+IMPLEMENT_SWIZZLE(y, y, y, z)
+IMPLEMENT_SWIZZLE(y, y, y, w)
+IMPLEMENT_SWIZZLE(y, y, z, x)
+IMPLEMENT_SWIZZLE(y, y, z, y)
+IMPLEMENT_SWIZZLE(y, y, z, z)
+IMPLEMENT_SWIZZLE(y, y, z, w)
+IMPLEMENT_SWIZZLE(y, y, w, x)
+IMPLEMENT_SWIZZLE(y, y, w, y)
+IMPLEMENT_SWIZZLE(y, y, w, z)
+IMPLEMENT_SWIZZLE(y, y, w, w)
+IMPLEMENT_SWIZZLE(y, z, x, x)
+IMPLEMENT_SWIZZLE(y, z, x, y)
+IMPLEMENT_SWIZZLE(y, z, x, z)
+IMPLEMENT_SWIZZLE(y, z, x, w)
+IMPLEMENT_SWIZZLE(y, z, y, x)
+IMPLEMENT_SWIZZLE(y, z, y, y)
+IMPLEMENT_SWIZZLE(y, z, y, z)
+IMPLEMENT_SWIZZLE(y, z, y, w)
+IMPLEMENT_SWIZZLE(y, z, z, x)
+IMPLEMENT_SWIZZLE(y, z, z, y)
+IMPLEMENT_SWIZZLE(y, z, z, z)
+IMPLEMENT_SWIZZLE(y, z, z, w)
+IMPLEMENT_SWIZZLE(y, z, w, x)
+IMPLEMENT_SWIZZLE(y, z, w, y)
+IMPLEMENT_SWIZZLE(y, z, w, z)
+IMPLEMENT_SWIZZLE(y, z, w, w)
+IMPLEMENT_SWIZZLE(y, w, x, x)
+IMPLEMENT_SWIZZLE(y, w, x, y)
+IMPLEMENT_SWIZZLE(y, w, x, z)
+IMPLEMENT_SWIZZLE(y, w, x, w)
+IMPLEMENT_SWIZZLE(y, w, y, x)
+IMPLEMENT_SWIZZLE(y, w, y, y)
+IMPLEMENT_SWIZZLE(y, w, y, z)
+IMPLEMENT_SWIZZLE(y, w, y, w)
+IMPLEMENT_SWIZZLE(y, w, z, x)
+IMPLEMENT_SWIZZLE(y, w, z, y)
+IMPLEMENT_SWIZZLE(y, w, z, z)
+IMPLEMENT_SWIZZLE(y, w, z, w)
+IMPLEMENT_SWIZZLE(y, w, w, x)
+IMPLEMENT_SWIZZLE(y, w, w, y)
+IMPLEMENT_SWIZZLE(y, w, w, z)
+IMPLEMENT_SWIZZLE(y, w, w, w)
+IMPLEMENT_SWIZZLE(z, x, x, x)
+IMPLEMENT_SWIZZLE(z, x, x, y)
+IMPLEMENT_SWIZZLE(z, x, x, z)
+IMPLEMENT_SWIZZLE(z, x, x, w)
+IMPLEMENT_SWIZZLE(z, x, y, x)
+IMPLEMENT_SWIZZLE(z, x, y, y)
+IMPLEMENT_SWIZZLE(z, x, y, z)
+IMPLEMENT_SWIZZLE(z, x, y, w)
+IMPLEMENT_SWIZZLE(z, x, z, x)
+IMPLEMENT_SWIZZLE(z, x, z, y)
+IMPLEMENT_SWIZZLE(z, x, z, z)
+IMPLEMENT_SWIZZLE(z, x, z, w)
+IMPLEMENT_SWIZZLE(z, x, w, x)
+IMPLEMENT_SWIZZLE(z, x, w, y)
+IMPLEMENT_SWIZZLE(z, x, w, z)
+IMPLEMENT_SWIZZLE(z, x, w, w)
+IMPLEMENT_SWIZZLE(z, y, x, x)
+IMPLEMENT_SWIZZLE(z, y, x, y)
+IMPLEMENT_SWIZZLE(z, y, x, z)
+IMPLEMENT_SWIZZLE(z, y, x, w)
+IMPLEMENT_SWIZZLE(z, y, y, x)
+IMPLEMENT_SWIZZLE(z, y, y, y)
+IMPLEMENT_SWIZZLE(z, y, y, z)
+IMPLEMENT_SWIZZLE(z, y, y, w)
+IMPLEMENT_SWIZZLE(z, y, z, x)
+IMPLEMENT_SWIZZLE(z, y, z, y)
+IMPLEMENT_SWIZZLE(z, y, z, z)
+IMPLEMENT_SWIZZLE(z, y, z, w)
+IMPLEMENT_SWIZZLE(z, y, w, x)
+IMPLEMENT_SWIZZLE(z, y, w, y)
+IMPLEMENT_SWIZZLE(z, y, w, z)
+IMPLEMENT_SWIZZLE(z, y, w, w)
+IMPLEMENT_SWIZZLE(z, z, x, x)
+IMPLEMENT_SWIZZLE(z, z, x, y)
+IMPLEMENT_SWIZZLE(z, z, x, z)
+IMPLEMENT_SWIZZLE(z, z, x, w)
+IMPLEMENT_SWIZZLE(z, z, y, x)
+IMPLEMENT_SWIZZLE(z, z, y, y)
+IMPLEMENT_SWIZZLE(z, z, y, z)
+IMPLEMENT_SWIZZLE(z, z, y, w)
+IMPLEMENT_SWIZZLE(z, z, z, x)
+IMPLEMENT_SWIZZLE(z, z, z, y)
+IMPLEMENT_SWIZZLE(z, z, z, z)
+IMPLEMENT_SWIZZLE(z, z, z, w)
+IMPLEMENT_SWIZZLE(z, z, w, x)
+IMPLEMENT_SWIZZLE(z, z, w, y)
+IMPLEMENT_SWIZZLE(z, z, w, z)
+IMPLEMENT_SWIZZLE(z, z, w, w)
+IMPLEMENT_SWIZZLE(z, w, x, x)
+IMPLEMENT_SWIZZLE(z, w, x, y)
+IMPLEMENT_SWIZZLE(z, w, x, z)
+IMPLEMENT_SWIZZLE(z, w, x, w)
+IMPLEMENT_SWIZZLE(z, w, y, x)
+IMPLEMENT_SWIZZLE(z, w, y, y)
+IMPLEMENT_SWIZZLE(z, w, y, z)
+IMPLEMENT_SWIZZLE(z, w, y, w)
+IMPLEMENT_SWIZZLE(z, w, z, x)
+IMPLEMENT_SWIZZLE(z, w, z, y)
+IMPLEMENT_SWIZZLE(z, w, z, z)
+IMPLEMENT_SWIZZLE(z, w, z, w)
+IMPLEMENT_SWIZZLE(z, w, w, x)
+IMPLEMENT_SWIZZLE(z, w, w, y)
+IMPLEMENT_SWIZZLE(z, w, w, z)
+IMPLEMENT_SWIZZLE(z, w, w, w)
+IMPLEMENT_SWIZZLE(w, x, x, x)
+IMPLEMENT_SWIZZLE(w, x, x, y)
+IMPLEMENT_SWIZZLE(w, x, x, z)
+IMPLEMENT_SWIZZLE(w, x, x, w)
+IMPLEMENT_SWIZZLE(w, x, y, x)
+IMPLEMENT_SWIZZLE(w, x, y, y)
+IMPLEMENT_SWIZZLE(w, x, y, z)
+IMPLEMENT_SWIZZLE(w, x, y, w)
+IMPLEMENT_SWIZZLE(w, x, z, x)
+IMPLEMENT_SWIZZLE(w, x, z, y)
+IMPLEMENT_SWIZZLE(w, x, z, z)
+IMPLEMENT_SWIZZLE(w, x, z, w)
+IMPLEMENT_SWIZZLE(w, x, w, x)
+IMPLEMENT_SWIZZLE(w, x, w, y)
+IMPLEMENT_SWIZZLE(w, x, w, z)
+IMPLEMENT_SWIZZLE(w, x, w, w)
+IMPLEMENT_SWIZZLE(w, y, x, x)
+IMPLEMENT_SWIZZLE(w, y, x, y)
+IMPLEMENT_SWIZZLE(w, y, x, z)
+IMPLEMENT_SWIZZLE(w, y, x, w)
+IMPLEMENT_SWIZZLE(w, y, y, x)
+IMPLEMENT_SWIZZLE(w, y, y, y)
+IMPLEMENT_SWIZZLE(w, y, y, z)
+IMPLEMENT_SWIZZLE(w, y, y, w)
+IMPLEMENT_SWIZZLE(w, y, z, x)
+IMPLEMENT_SWIZZLE(w, y, z, y)
+IMPLEMENT_SWIZZLE(w, y, z, z)
+IMPLEMENT_SWIZZLE(w, y, z, w)
+IMPLEMENT_SWIZZLE(w, y, w, x)
+IMPLEMENT_SWIZZLE(w, y, w, y)
+IMPLEMENT_SWIZZLE(w, y, w, z)
+IMPLEMENT_SWIZZLE(w, y, w, w)
+IMPLEMENT_SWIZZLE(w, z, x, x)
+IMPLEMENT_SWIZZLE(w, z, x, y)
+IMPLEMENT_SWIZZLE(w, z, x, z)
+IMPLEMENT_SWIZZLE(w, z, x, w)
+IMPLEMENT_SWIZZLE(w, z, y, x)
+IMPLEMENT_SWIZZLE(w, z, y, y)
+IMPLEMENT_SWIZZLE(w, z, y, z)
+IMPLEMENT_SWIZZLE(w, z, y, w)
+IMPLEMENT_SWIZZLE(w, z, z, x)
+IMPLEMENT_SWIZZLE(w, z, z, y)
+IMPLEMENT_SWIZZLE(w, z, z, z)
+IMPLEMENT_SWIZZLE(w, z, z, w)
+IMPLEMENT_SWIZZLE(w, z, w, x)
+IMPLEMENT_SWIZZLE(w, z, w, y)
+IMPLEMENT_SWIZZLE(w, z, w, z)
+IMPLEMENT_SWIZZLE(w, z, w, w)
+IMPLEMENT_SWIZZLE(w, w, x, x)
+IMPLEMENT_SWIZZLE(w, w, x, y)
+IMPLEMENT_SWIZZLE(w, w, x, z)
+IMPLEMENT_SWIZZLE(w, w, x, w)
+IMPLEMENT_SWIZZLE(w, w, y, x)
+IMPLEMENT_SWIZZLE(w, w, y, y)
+IMPLEMENT_SWIZZLE(w, w, y, z)
+IMPLEMENT_SWIZZLE(w, w, y, w)
+IMPLEMENT_SWIZZLE(w, w, z, x)
+IMPLEMENT_SWIZZLE(w, w, z, y)
+IMPLEMENT_SWIZZLE(w, w, z, z)
+IMPLEMENT_SWIZZLE(w, w, z, w)
+IMPLEMENT_SWIZZLE(w, w, w, x)
+IMPLEMENT_SWIZZLE(w, w, w, y)
+IMPLEMENT_SWIZZLE(w, w, w, z)
+IMPLEMENT_SWIZZLE(w, w, w, w)

+ 436 - 0
include/bx/simd_t.h

@@ -0,0 +1,436 @@
+/*
+ * Copyright 2010-2016 Branimir Karadzic. All rights reserved.
+ * License: https://github.com/bkaradzic/bx#license-bsd-2-clause
+ */
+
+#ifndef BX_SIMD_T_H_HEADER_GUARD
+#define BX_SIMD_T_H_HEADER_GUARD
+
+#include "bx.h"
+
+#define BX_SIMD_FORCE_INLINE BX_FORCE_INLINE
+#define BX_SIMD_INLINE inline
+
+#define BX_SIMD_SSE     0
+#define BX_SIMD_AVX     0
+#define BX_SIMD_NEON    0
+#define BX_SIMD_LANGEXT 0
+
+#if defined(__SSE2__) || (BX_COMPILER_MSVC && (BX_ARCH_64BIT || _M_IX86_FP >= 2) )
+#	include <emmintrin.h> // __m128i
+#	if defined(__SSE4_1__)
+#		include <smmintrin.h>
+#	endif // defined(__SSE4_1__)
+#	include <xmmintrin.h> // __m128
+#	undef  BX_SIMD_SSE
+#	define BX_SIMD_SSE 1
+
+namespace bx
+{
+	typedef __m128 simd128_sse_t;
+
+} // namespace bx
+
+#elif defined(__ARM_NEON__) && !BX_COMPILER_CLANG
+#	include <arm_neon.h>
+#	undef  BX_SIMD_NEON
+#	define BX_SIMD_NEON 1
+
+namespace bx
+{
+	typedef float32x4_t simd128_neon_t;
+
+} // namespace bx
+
+#elif   BX_COMPILER_CLANG \
+	&& !BX_PLATFORM_EMSCRIPTEN \
+	&& !BX_PLATFORM_IOS \
+	&&  BX_CLANG_HAS_EXTENSION(attribute_ext_vector_type)
+#	include <math.h>
+#	undef  BX_SIMD_LANGEXT
+#	define BX_SIMD_LANGEXT 1
+
+namespace bx
+{
+	// 128-bit SIMD type for compilers with vector language extensions
+	// (clang ext_vector_type / GCC vector_size). The scalar arrays alias
+	// the vector members for per-lane access.
+	union simd128_langext_t
+	{
+		float    __attribute__((vector_size(16))) vf;
+		int32_t  __attribute__((vector_size(16))) vi;
+		uint32_t __attribute__((vector_size(16))) vu;
+		float    fxyzw[4];
+		int32_t  ixyzw[4];
+		uint32_t uxyzw[4];
+
+	};
+} // namespace bx
+#endif //
+
+namespace bx
+{
+	// Plain-scalar reference implementation of the 128-bit SIMD type,
+	// used when no hardware SIMD backend is selected.
+	union simd128_ref_t
+	{
+		float    fxyzw[4];
+		int32_t  ixyzw[4];
+		uint32_t uxyzw[4];
+
+	};
+} // namespace bx
+
+namespace bx
+{
+#define ELEMx 0
+#define ELEMy 1
+#define ELEMz 2
+#define ELEMw 3
+#define IMPLEMENT_SWIZZLE(_x, _y, _z, _w) \
+			template<typename Ty> \
+			BX_SIMD_FORCE_INLINE Ty simd_swiz_##_x##_y##_z##_w(Ty _a);
+#include "simd_swizzle.inl"
+
+#undef IMPLEMENT_SWIZZLE
+#undef ELEMw
+#undef ELEMz
+#undef ELEMy
+#undef ELEMx
+
+#define IMPLEMENT_TEST(_xyzw) \
+			template<typename Ty> \
+			BX_SIMD_FORCE_INLINE bool simd_test_any_##_xyzw(Ty _test); \
+			\
+			template<typename Ty> \
+			BX_SIMD_FORCE_INLINE bool simd_test_all_##_xyzw(Ty _test)
+
+IMPLEMENT_TEST(x   );
+IMPLEMENT_TEST(y   );
+IMPLEMENT_TEST(xy  );
+IMPLEMENT_TEST(z   );
+IMPLEMENT_TEST(xz  );
+IMPLEMENT_TEST(yz  );
+IMPLEMENT_TEST(xyz );
+IMPLEMENT_TEST(w   );
+IMPLEMENT_TEST(xw  );
+IMPLEMENT_TEST(yw  );
+IMPLEMENT_TEST(xyw );
+IMPLEMENT_TEST(zw  );
+IMPLEMENT_TEST(xzw );
+IMPLEMENT_TEST(yzw );
+IMPLEMENT_TEST(xyzw);
+#undef IMPLEMENT_TEST
+
+	template<typename Ty>
+	BX_SIMD_FORCE_INLINE Ty simd_shuf_xyAB(Ty _a, Ty _b);
+
+	template<typename Ty>
+	BX_SIMD_FORCE_INLINE Ty simd_shuf_ABxy(Ty _a, Ty _b);
+
+	template<typename Ty>
+	BX_SIMD_FORCE_INLINE Ty simd_shuf_CDzw(Ty _a, Ty _b);
+
+	template<typename Ty>
+	BX_SIMD_FORCE_INLINE Ty simd_shuf_zwCD(Ty _a, Ty _b);
+
+	template<typename Ty>
+	BX_SIMD_FORCE_INLINE Ty simd_shuf_xAyB(Ty _a, Ty _b);
+
+	template<typename Ty>
+	BX_SIMD_FORCE_INLINE Ty simd_shuf_yBxA(Ty _a, Ty _b);
+
+	template<typename Ty>
+	BX_SIMD_FORCE_INLINE Ty simd_shuf_zCwD(Ty _a, Ty _b);
+
+	template<typename Ty>
+	BX_SIMD_FORCE_INLINE Ty simd_shuf_CzDw(Ty _a, Ty _b);
+
+	template<typename Ty>
+	BX_SIMD_FORCE_INLINE float simd_x(Ty _a);
+
+	template<typename Ty>
+	BX_SIMD_FORCE_INLINE float simd_y(Ty _a);
+
+	template<typename Ty>
+	BX_SIMD_FORCE_INLINE float simd_z(Ty _a);
+
+	template<typename Ty>
+	BX_SIMD_FORCE_INLINE float simd_w(Ty _a);
+
+	template<typename Ty>
+	BX_SIMD_FORCE_INLINE Ty simd_ld(const void* _ptr);
+
+	template<typename Ty>
+	BX_SIMD_FORCE_INLINE void simd_st(void* _ptr, Ty _a);
+
+	template<typename Ty>
+	BX_SIMD_FORCE_INLINE void simd_stx(void* _ptr, Ty _a);
+
+	template<typename Ty>
+	BX_SIMD_FORCE_INLINE void simd_stream(void* _ptr, Ty _a);
+
+	template<typename Ty>
+	BX_SIMD_FORCE_INLINE Ty simd_ld(float _x, float _y, float _z, float _w);
+
+	template<typename Ty>
+	BX_SIMD_FORCE_INLINE Ty simd_ild(uint32_t _x, uint32_t _y, uint32_t _z, uint32_t _w);
+
+	template<typename Ty>
+	BX_SIMD_FORCE_INLINE Ty simd_splat(const void* _ptr);
+
+	template<typename Ty>
+	BX_SIMD_FORCE_INLINE Ty simd_splat(float _a);
+
+	template<typename Ty>
+	BX_SIMD_FORCE_INLINE Ty simd_isplat(uint32_t _a);
+
+	template<typename Ty>
+	BX_SIMD_FORCE_INLINE Ty simd_zero();
+
+	template<typename Ty>
+	BX_SIMD_FORCE_INLINE Ty simd_itof(Ty _a);
+
+	template<typename Ty>
+	BX_SIMD_FORCE_INLINE Ty simd_ftoi(Ty _a);
+
+	template<typename Ty>
+	BX_SIMD_FORCE_INLINE Ty simd_round(Ty _a);
+
+	template<typename Ty>
+	BX_SIMD_FORCE_INLINE Ty simd_add(Ty _a, Ty _b);
+
+	template<typename Ty>
+	BX_SIMD_FORCE_INLINE Ty simd_sub(Ty _a, Ty _b);
+
+	template<typename Ty>
+	BX_SIMD_FORCE_INLINE Ty simd_mul(Ty _a, Ty _b);
+
+	template<typename Ty>
+	BX_SIMD_FORCE_INLINE Ty simd_div(Ty _a, Ty _b);
+
+	template<typename Ty>
+	BX_SIMD_FORCE_INLINE Ty simd_rcp_est(Ty _a);
+
+	template<typename Ty>
+	BX_SIMD_FORCE_INLINE Ty simd_sqrt(Ty _a);
+
+	template<typename Ty>
+	BX_SIMD_FORCE_INLINE Ty simd_rsqrt_est(Ty _a);
+
+	template<typename Ty>
+	BX_SIMD_FORCE_INLINE Ty simd_dot3(Ty _a, Ty _b);
+
+	template<typename Ty>
+	BX_SIMD_FORCE_INLINE Ty simd_dot(Ty _a, Ty _b);
+
+	template<typename Ty>
+	BX_SIMD_FORCE_INLINE Ty simd_cmpeq(Ty _a, Ty _b);
+
+	template<typename Ty>
+	BX_SIMD_FORCE_INLINE Ty simd_cmplt(Ty _a, Ty _b);
+
+	template<typename Ty>
+	BX_SIMD_FORCE_INLINE Ty simd_cmple(Ty _a, Ty _b);
+
+	template<typename Ty>
+	BX_SIMD_FORCE_INLINE Ty simd_cmpgt(Ty _a, Ty _b);
+
+	template<typename Ty>
+	BX_SIMD_FORCE_INLINE Ty simd_cmpge(Ty _a, Ty _b);
+
+	template<typename Ty>
+	BX_SIMD_FORCE_INLINE Ty simd_min(Ty _a, Ty _b);
+
+	template<typename Ty>
+	BX_SIMD_FORCE_INLINE Ty simd_max(Ty _a, Ty _b);
+
+	template<typename Ty>
+	BX_SIMD_FORCE_INLINE Ty simd_and(Ty _a, Ty _b);
+
+	template<typename Ty>
+	BX_SIMD_FORCE_INLINE Ty simd_andc(Ty _a, Ty _b);
+
+	template<typename Ty>
+	BX_SIMD_FORCE_INLINE Ty simd_or(Ty _a, Ty _b);
+
+	template<typename Ty>
+	BX_SIMD_FORCE_INLINE Ty simd_xor(Ty _a, Ty _b);
+
+	template<typename Ty>
+	BX_SIMD_FORCE_INLINE Ty simd_sll(Ty _a, int _count);
+
+	template<typename Ty>
+	BX_SIMD_FORCE_INLINE Ty simd_srl(Ty _a, int _count);
+
+	template<typename Ty>
+	BX_SIMD_FORCE_INLINE Ty simd_sra(Ty _a, int _count);
+
+	template<typename Ty>
+	BX_SIMD_FORCE_INLINE Ty simd_icmpeq(Ty _a, Ty _b);
+
+	template<typename Ty>
+	BX_SIMD_FORCE_INLINE Ty simd_icmplt(Ty _a, Ty _b);
+
+	template<typename Ty>
+	BX_SIMD_FORCE_INLINE Ty simd_icmpgt(Ty _a, Ty _b);
+
+	template<typename Ty>
+	BX_SIMD_FORCE_INLINE Ty simd_imin(Ty _a, Ty _b);
+
+	template<typename Ty>
+	BX_SIMD_FORCE_INLINE Ty simd_imax(Ty _a, Ty _b);
+
+	template<typename Ty>
+	BX_SIMD_FORCE_INLINE Ty simd_iadd(Ty _a, Ty _b);
+
+	template<typename Ty>
+	BX_SIMD_FORCE_INLINE Ty simd_isub(Ty _a, Ty _b);
+
+	template<typename Ty>
+	BX_SIMD_INLINE Ty simd_shuf_xAzC(Ty _a, Ty _b);
+
+	template<typename Ty>
+	BX_SIMD_INLINE Ty simd_shuf_yBwD(Ty _a, Ty _b);
+
+	template<typename Ty>
+	BX_SIMD_INLINE Ty simd_rcp(Ty _a);
+
+	template<typename Ty>
+	BX_SIMD_INLINE Ty simd_orx(Ty _a);
+
+	template<typename Ty>
+	BX_SIMD_INLINE Ty simd_orc(Ty _a, Ty _b);
+
+	template<typename Ty>
+	BX_SIMD_INLINE Ty simd_neg(Ty _a);
+
+	template<typename Ty>
+	BX_SIMD_INLINE Ty simd_madd(Ty _a, Ty _b, Ty _c);
+
+	template<typename Ty>
+	BX_SIMD_INLINE Ty simd_nmsub(Ty _a, Ty _b, Ty _c);
+
+	template<typename Ty>
+	BX_SIMD_INLINE Ty simd_div_nr(Ty _a, Ty _b);
+
+	template<typename Ty>
+	BX_SIMD_INLINE Ty simd_selb(Ty _mask, Ty _a, Ty _b);
+
+	template<typename Ty>
+	BX_SIMD_INLINE Ty simd_sels(Ty _test, Ty _a, Ty _b);
+
+	template<typename Ty>
+	BX_SIMD_INLINE Ty simd_not(Ty _a);
+
+	template<typename Ty>
+	BX_SIMD_INLINE Ty simd_abs(Ty _a);
+
+	template<typename Ty>
+	BX_SIMD_INLINE Ty simd_clamp(Ty _a, Ty _min, Ty _max);
+
+	template<typename Ty>
+	BX_SIMD_INLINE Ty simd_lerp(Ty _a, Ty _b, Ty _s);
+
+	template<typename Ty>
+	BX_SIMD_INLINE Ty simd_rsqrt(Ty _a);
+
+	template<typename Ty>
+	BX_SIMD_INLINE Ty simd_rsqrt_nr(Ty _a);
+
+	template<typename Ty>
+	BX_SIMD_INLINE Ty simd_rsqrt_carmack(Ty _a);
+
+	template<typename Ty>
+	BX_SIMD_INLINE Ty simd_sqrt_nr(Ty _a);
+
+	template<typename Ty>
+	BX_SIMD_INLINE Ty simd_log2(Ty _a);
+
+	template<typename Ty>
+	BX_SIMD_INLINE Ty simd_exp2(Ty _a);
+
+	template<typename Ty>
+	BX_SIMD_INLINE Ty simd_pow(Ty _a, Ty _b);
+
+	template<typename Ty>
+	BX_SIMD_INLINE Ty simd_cross3(Ty _a, Ty _b);
+
+	template<typename Ty>
+	BX_SIMD_INLINE Ty simd_normalize3(Ty _a);
+
+	template<typename Ty>
+	BX_SIMD_INLINE Ty simd_ceil(Ty _a);
+
+	template<typename Ty>
+	BX_SIMD_INLINE Ty simd_floor(Ty _a);
+
+} // namespace bx
+
+#if BX_SIMD_SSE
+#	include "simd128_sse.inl"
+#endif // BX_SIMD_SSE
+
+#if BX_SIMD_NEON
+#	include "simd128_neon.inl"
+#endif // BX_SIMD_NEON
+
+#if BX_SIMD_LANGEXT
+#	include "simd128_langext.inl"
+#endif // BX_SIMD_LANGEXT
+
+#if !( BX_SIMD_SSE \
+	|| BX_SIMD_AVX \
+	|| BX_SIMD_NEON \
+	|| BX_SIMD_LANGEXT \
+	 )
+#	ifndef BX_SIMD_WARN_REFERENCE_IMPL
+#		define BX_SIMD_WARN_REFERENCE_IMPL 0
+#	endif // BX_SIMD_WARN_REFERENCE_IMPL
+
+#	if BX_SIMD_WARN_REFERENCE_IMPL
+#		pragma message("************************************\nUsing SIMD reference implementation!\n************************************")
+#	endif // BX_SIMD_WARN_REFERENCE_IMPL
+
+namespace bx
+{
+	typedef simd128_ref_t simd128_t;
+}
+#endif //
+
+#include "simd128_ref.inl"
+
+namespace bx
+{
+	// Non-template convenience overloads that forward to the template
+	// versions with the default 128-bit simd128_t type selected above.
+	BX_SIMD_FORCE_INLINE simd128_t simd_zero()
+	{
+		return simd_zero<simd128_t>();
+	}
+
+	BX_SIMD_FORCE_INLINE simd128_t simd_ld(const void* _ptr)
+	{
+		return simd_ld<simd128_t>(_ptr);
+	}
+
+	BX_SIMD_FORCE_INLINE simd128_t simd_ld(float _x, float _y, float _z, float _w)
+	{
+		return simd_ld<simd128_t>(_x, _y, _z, _w);
+	}
+
+	BX_SIMD_FORCE_INLINE simd128_t simd_ild(uint32_t _x, uint32_t _y, uint32_t _z, uint32_t _w)
+	{
+		return simd_ild<simd128_t>(_x, _y, _z, _w);
+	}
+
+	BX_SIMD_FORCE_INLINE simd128_t simd_splat(const void* _ptr)
+	{
+		return simd_splat<simd128_t>(_ptr);
+	}
+
+	BX_SIMD_FORCE_INLINE simd128_t simd_splat(float _a)
+	{
+		return simd_splat<simd128_t>(_a);
+	}
+
+	BX_SIMD_FORCE_INLINE simd128_t simd_isplat(uint32_t _a)
+	{
+		return simd_isplat<simd128_t>(_a);
+	}
+}
+
+#endif // BX_SIMD_T_H_HEADER_GUARD

+ 0 - 309
tests/float4_t.cpp

@@ -1,309 +0,0 @@
-/*
- * Copyright 2010-2016 Branimir Karadzic. All rights reserved.
- * License: https://github.com/bkaradzic/bx#license-bsd-2-clause
- */
-
-#include "test.h"
-#include <bx/float4_t.h>
-#include <bx/fpumath.h>
-#include <string.h>
-
-using namespace bx;
-
-union float4_cast
-{
-	bx::float4_t f4;
-	float f[4];
-	uint32_t ui[4];
-	int32_t i[4];
-	char c[16];
-};
-
-void float4_check_bool(const char* _str, bool _a, bool _0)
-{
-	DBG("%s %d == %d"
-		, _str
-		, _a
-		, _0
-		);
-
-	CHECK_EQUAL(_a, _0);
-}
-
-void float4_check_int32(const char* _str, bx::float4_t _a, int32_t _0, int32_t _1, int32_t _2, int32_t _3)
-{
-	float4_cast c; c.f4 = _a;
-	DBG("%s (%d, %d, %d, %d) == (%d, %d, %d, %d)"
-		, _str
-		, c.i[0], c.i[1], c.i[2], c.i[3]
-		, _0, _1, _2, _3
-		);
-
-	CHECK_EQUAL(c.i[0], _0);
-	CHECK_EQUAL(c.i[1], _1);
-	CHECK_EQUAL(c.i[2], _2);
-	CHECK_EQUAL(c.i[3], _3);
-}
-
-void float4_check_uint32(const char* _str, bx::float4_t _a, uint32_t _0, uint32_t _1, uint32_t _2, uint32_t _3)
-{
-	float4_cast c; c.f4 = _a;
-
-	DBG("%s (0x%08x, 0x%08x, 0x%08x, 0x%08x) == (0x%08x, 0x%08x, 0x%08x, 0x%08x)"
-		, _str
-		, c.ui[0], c.ui[1], c.ui[2], c.ui[3]
-		, _0, _1, _2, _3
-		);
-
-	CHECK_EQUAL(c.ui[0], _0);
-	CHECK_EQUAL(c.ui[1], _1);
-	CHECK_EQUAL(c.ui[2], _2);
-	CHECK_EQUAL(c.ui[3], _3);
-}
-
-void float4_check_float(const char* _str, bx::float4_t _a, float _0, float _1, float _2, float _3)
-{
-	float4_cast c; c.f4 = _a;
-
-	DBG("%s (%f, %f, %f, %f) == (%f, %f, %f, %f)"
-		, _str
-		, c.f[0], c.f[1], c.f[2], c.f[3]
-		, _0, _1, _2, _3
-		);
-
-	CHECK(bx::fequal(c.f[0], _0, 0.0001f) );
-	CHECK(bx::fequal(c.f[1], _1, 0.0001f) );
-	CHECK(bx::fequal(c.f[2], _2, 0.0001f) );
-	CHECK(bx::fequal(c.f[3], _3, 0.0001f) );
-}
-
-void float4_check_string(const char* _str, bx::float4_t _a)
-{
-	float4_cast c; c.f4 = _a;
-	const char test[5] = { c.c[0], c.c[4], c.c[8], c.c[12], '\0' };
-
-	DBG("%s %s", _str, test);
-
-	CHECK(0 == strcmp(_str, test) );
-}
-
-TEST(float4_swizzle)
-{
-	const float4_t xyzw = float4_ild(0x78787878, 0x79797979, 0x7a7a7a7a, 0x77777777);
-
-#define ELEMx 0
-#define ELEMy 1
-#define ELEMz 2
-#define ELEMw 3
-#define IMPLEMENT_SWIZZLE(_x, _y, _z, _w) \
-			float4_check_string("" #_x #_y #_z #_w "", float4_swiz_##_x##_y##_z##_w(xyzw) ); \
-
-#include <bx/float4_swizzle.inl>
-
-#undef IMPLEMENT_SWIZZLE
-#undef ELEMw
-#undef ELEMz
-#undef ELEMy
-#undef ELEMx
-}
-
-TEST(float4_shuffle)
-{
-	const float4_t xyzw = float4_ild(0x78787878, 0x79797979, 0x7a7a7a7a, 0x77777777);
-	const float4_t ABCD = float4_ild(0x41414141, 0x42424242, 0x43434343, 0x44444444);
-	float4_check_string("xyAB", float4_shuf_xyAB(xyzw, ABCD) );
-	float4_check_string("ABxy", float4_shuf_ABxy(xyzw, ABCD) );
-	float4_check_string("zwCD", float4_shuf_zwCD(xyzw, ABCD) );
-	float4_check_string("CDzw", float4_shuf_CDzw(xyzw, ABCD) );
-	float4_check_string("xAyB", float4_shuf_xAyB(xyzw, ABCD) );
-	float4_check_string("zCwD", float4_shuf_zCwD(xyzw, ABCD) );
-	float4_check_string("xAzC", float4_shuf_xAzC(xyzw, ABCD) );
-	float4_check_string("yBwD", float4_shuf_yBwD(xyzw, ABCD) );
-	float4_check_string("CzDw", float4_shuf_CzDw(xyzw, ABCD) );
-}
-
-TEST(float4_compare)
-{
-	float4_check_uint32("cmpeq"
-		, float4_cmpeq(float4_ld(1.0f, 2.0f, 3.0f, 4.0f), float4_ld(0.0f, 2.0f, 0.0f, 3.0f) )
-		, 0, 0xffffffff, 0, 0
-		);
-
-	float4_check_uint32("cmplt"
-		, float4_cmplt(float4_ld(1.0f, 2.0f, 3.0f, 4.0f), float4_ld(0.0f, 2.0f, 0.0f, 3.0f) )
-		, 0, 0, 0, 0
-		);
-
-	float4_check_uint32("cmple"
-		, float4_cmple(float4_ld(1.0f, 2.0f, 3.0f, 4.0f), float4_ld(0.0f, 2.0f, 0.0f, 3.0f) )
-		, 0, 0xffffffff, 0, 0
-		);
-
-	float4_check_uint32("cmpgt"
-		, float4_cmpgt(float4_ld(1.0f, 2.0f, 3.0f, 4.0f), float4_ld(0.0f, 2.0f, 0.0f, 3.0f) )
-		, 0xffffffff, 0, 0xffffffff, 0xffffffff
-		);
-
-	float4_check_uint32("cmpge"
-		, float4_cmpge(float4_ld(1.0f, 2.0f, 3.0f, 4.0f), float4_ld(0.0f, 2.0f, 0.0f, 3.0f) )
-		, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff
-		);
-
-	float4_check_uint32("icmpeq"
-		, float4_icmpeq(float4_ild(0, 1, 2, 3), float4_ild(0, uint32_t(-2), 1, 3) )
-		, 0xffffffff, 0, 0, 0xffffffff
-		);
-
-	float4_check_uint32("icmplt"
-		, float4_icmplt(float4_ild(0, 1, 2, 3), float4_ild(0, uint32_t(-2), 1, 3) )
-		, 0, 0, 0, 0
-		);
-
-	float4_check_uint32("icmpgt"
-		, float4_icmpgt(float4_ild(0, 1, 2, 3), float4_ild(0, uint32_t(-2), 1, 3) )
-		, 0, 0xffffffff, 0xffffffff, 0
-		);
-}
-
-TEST(float4_test)
-{
-	float4_check_bool("test_any_xyzw"
-		, float4_test_any_xyzw(float4_ild(0xffffffff, 0, 0, 0) )
-		, true
-		);
-
-	float4_check_bool("test_all_xyzw"
-		, float4_test_all_xyzw(float4_ild(0xffffffff, 0, 0xffffffff, 0) )
-		, false
-		);
-
-	float4_check_bool("test_all_xyzw"
-		, float4_test_all_xyzw(float4_ild(0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff) )
-		, true
-		);
-
-	float4_check_bool("test_all_xw"
-		, float4_test_all_xw(float4_ild(0xffffffff, 0, 0, 0xffffffff) )
-		, true
-		);
-
-	float4_check_bool("test_all_xzw"
-		, float4_test_all_xzw(float4_ild(0xffffffff, 0, 0, 0xffffffff) )
-		, false
-		);
-}
-
-TEST(float4_load)
-{
-	float4_check_float("ld"
-		, float4_ld(0.0f, 1.0f, 2.0f, 3.0f)
-		, 0.0f, 1.0f, 2.0f, 3.0f
-		);
-
-	float4_check_int32("ild"
-		, float4_ild(uint32_t(-1), 0, 1, 2)
-		, uint32_t(-1), 0, 1, 2
-		);
-
-	float4_check_int32("ild"
-		, float4_ild(uint32_t(-1), uint32_t(-2), uint32_t(-3), uint32_t(-4) )
-		, uint32_t(-1), uint32_t(-2), uint32_t(-3), uint32_t(-4)
-		);
-
-	float4_check_uint32("zero", float4_zero()
-		, 0, 0, 0, 0
-		);
-
-	float4_check_uint32("isplat", float4_isplat(0x80000001)
-		, 0x80000001, 0x80000001, 0x80000001, 0x80000001
-		);
-
-	float4_check_float("isplat", float4_splat(1.0f)
-		, 1.0f, 1.0f, 1.0f, 1.0f
-		);
-}
-
-TEST(float4_arithmetic)
-{
-	float4_check_float("madd"
-		, float4_madd(float4_ld(0.0f, 1.0f, 2.0f, 3.0f), float4_ld(4.0f, 5.0f, 6.0f, 7.0f), float4_ld(8.0f, 9.0f, 10.0f, 11.0f) )
-		, 8.0f, 14.0f, 22.0f, 32.0f
-		);
-
-	float4_check_float("cross3"
-		, float4_cross3(float4_ld(1.0f, 0.0f, 0.0f, 0.0f), float4_ld(0.0f, 1.0f, 0.0f, 0.0f) )
-		, 0.0f, 0.0f, 1.0f, 0.0f
-		);
-}
-
-TEST(float4_sqrt)
-{
-	float4_check_float("float4_sqrt"
-		, float4_sqrt(float4_ld(1.0f, 16.0f, 65536.0f, 123456.0f) )
-		, 1.0f, 4.0f, 256.0f, 351.363060096f
-		);
-
-	float4_check_float("float4_sqrt_nr_ni"
-		, float4_sqrt_nr_ni(float4_ld(1.0f, 16.0f, 65536.0f, 123456.0f) )
-		, 1.0f, 4.0f, 256.0f, 351.363060096f
-		);
-
-	float4_check_float("float4_sqrt_nr1_ni"
-		, float4_sqrt_nr1_ni(float4_ld(1.0f, 16.0f, 65536.0f, 123456.0f) )
-		, 1.0f, 4.0f, 256.0f, 351.363060096f
-		);
-}
-
-TEST(float4)
-{
-	const float4_t isplat = float4_isplat(0x80000001);
-	float4_check_uint32("sll"
-		, float4_sll(isplat, 1)
-		, 0x00000002, 0x00000002, 0x00000002, 0x00000002
-		);
-
-	float4_check_uint32("srl"
-		, float4_srl(isplat, 1)
-		, 0x40000000, 0x40000000, 0x40000000, 0x40000000
-		);
-
-	float4_check_uint32("sra"
-		, float4_sra(isplat, 1)
-		, 0xc0000000, 0xc0000000, 0xc0000000, 0xc0000000
-		);
-
-	float4_check_uint32("and"
-		, float4_and(float4_isplat(0x55555555), float4_isplat(0xaaaaaaaa) )
-		, 0, 0, 0, 0
-		);
-
-	float4_check_uint32("or "
-		, float4_or(float4_isplat(0x55555555), float4_isplat(0xaaaaaaaa) )
-		, uint32_t(-1), uint32_t(-1), uint32_t(-1), uint32_t(-1)
-		);
-
-	float4_check_uint32("xor"
-		, float4_or(float4_isplat(0x55555555), float4_isplat(0xaaaaaaaa) )
-		, uint32_t(-1), uint32_t(-1), uint32_t(-1), uint32_t(-1)
-		);
-
-	float4_check_int32("imin"
-		, float4_imin(float4_ild(0, 1, 2, 3), float4_ild(uint32_t(-1), 2, uint32_t(-2), 1) )
-		, uint32_t(-1), 1, uint32_t(-2), 1
-		);
-
-	float4_check_float("min"
-		, float4_min(float4_ld(0.0f, 1.0f, 2.0f, 3.0f), float4_ld(-1.0f, 2.0f, -2.0f, 1.0f) )
-		, -1.0f, 1.0f, -2.0f, 1.0f
-		);
-
-	float4_check_int32("imax"
-		, float4_imax(float4_ild(0, 1, 2, 3), float4_ild(uint32_t(-1), 2, uint32_t(-2), 1) )
-		, 0, 2, 2, 3
-		);
-
-	float4_check_float("max"
-		, float4_max(float4_ld(0.0f, 1.0f, 2.0f, 3.0f), float4_ld(-1.0f, 2.0f, -2.0f, 1.0f) )
-		, 0.0f, 2.0f, 2.0f, 3.0f
-		);
-}

+ 309 - 0
tests/simd_t.cpp

@@ -0,0 +1,309 @@
+/*
+ * Copyright 2010-2016 Branimir Karadzic. All rights reserved.
+ * License: https://github.com/bkaradzic/bx#license-bsd-2-clause
+ */
+
+#include "test.h"
+#include <bx/simd_t.h>
+#include <bx/fpumath.h>
+#include <string.h>
+
+using namespace bx;
+
+union simd_cast
+{
+	bx::simd128_t f4;
+	float f[4];
+	uint32_t ui[4];
+	int32_t i[4];
+	char c[16];
+};
+
+void simd_check_bool(const char* _str, bool _a, bool _0)
+{
+	DBG("%s %d == %d"
+		, _str
+		, _a
+		, _0
+		);
+
+	CHECK_EQUAL(_a, _0);
+}
+
+void simd_check_int32(const char* _str, bx::simd128_t _a, int32_t _0, int32_t _1, int32_t _2, int32_t _3)
+{
+	simd_cast c; c.f4 = _a;
+	DBG("%s (%d, %d, %d, %d) == (%d, %d, %d, %d)"
+		, _str
+		, c.i[0], c.i[1], c.i[2], c.i[3]
+		, _0, _1, _2, _3
+		);
+
+	CHECK_EQUAL(c.i[0], _0);
+	CHECK_EQUAL(c.i[1], _1);
+	CHECK_EQUAL(c.i[2], _2);
+	CHECK_EQUAL(c.i[3], _3);
+}
+
+void simd_check_uint32(const char* _str, bx::simd128_t _a, uint32_t _0, uint32_t _1, uint32_t _2, uint32_t _3)
+{
+	simd_cast c; c.f4 = _a;
+
+	DBG("%s (0x%08x, 0x%08x, 0x%08x, 0x%08x) == (0x%08x, 0x%08x, 0x%08x, 0x%08x)"
+		, _str
+		, c.ui[0], c.ui[1], c.ui[2], c.ui[3]
+		, _0, _1, _2, _3
+		);
+
+	CHECK_EQUAL(c.ui[0], _0);
+	CHECK_EQUAL(c.ui[1], _1);
+	CHECK_EQUAL(c.ui[2], _2);
+	CHECK_EQUAL(c.ui[3], _3);
+}
+
+void simd_check_float(const char* _str, bx::simd128_t _a, float _0, float _1, float _2, float _3)
+{
+	simd_cast c; c.f4 = _a;
+
+	DBG("%s (%f, %f, %f, %f) == (%f, %f, %f, %f)"
+		, _str
+		, c.f[0], c.f[1], c.f[2], c.f[3]
+		, _0, _1, _2, _3
+		);
+
+	CHECK(bx::fequal(c.f[0], _0, 0.0001f) );
+	CHECK(bx::fequal(c.f[1], _1, 0.0001f) );
+	CHECK(bx::fequal(c.f[2], _2, 0.0001f) );
+	CHECK(bx::fequal(c.f[3], _3, 0.0001f) );
+}
+
+void simd_check_string(const char* _str, bx::simd128_t _a)
+{
+	simd_cast c; c.f4 = _a;
+	const char test[5] = { c.c[0], c.c[4], c.c[8], c.c[12], '\0' };
+
+	DBG("%s %s", _str, test);
+
+	CHECK(0 == strcmp(_str, test) );
+}
+
+TEST(simd_swizzle)
+{
+	const simd128_t xyzw = simd_ild(0x78787878, 0x79797979, 0x7a7a7a7a, 0x77777777);
+
+#define ELEMx 0
+#define ELEMy 1
+#define ELEMz 2
+#define ELEMw 3
+#define IMPLEMENT_SWIZZLE(_x, _y, _z, _w) \
+			simd_check_string("" #_x #_y #_z #_w "", simd_swiz_##_x##_y##_z##_w(xyzw) ); \
+
+#include <bx/simd_swizzle.inl>
+
+#undef IMPLEMENT_SWIZZLE
+#undef ELEMw
+#undef ELEMz
+#undef ELEMy
+#undef ELEMx
+}
+
+TEST(simd_shuffle)
+{
+	const simd128_t xyzw = simd_ild(0x78787878, 0x79797979, 0x7a7a7a7a, 0x77777777);
+	const simd128_t ABCD = simd_ild(0x41414141, 0x42424242, 0x43434343, 0x44444444);
+	simd_check_string("xyAB", simd_shuf_xyAB(xyzw, ABCD) );
+	simd_check_string("ABxy", simd_shuf_ABxy(xyzw, ABCD) );
+	simd_check_string("zwCD", simd_shuf_zwCD(xyzw, ABCD) );
+	simd_check_string("CDzw", simd_shuf_CDzw(xyzw, ABCD) );
+	simd_check_string("xAyB", simd_shuf_xAyB(xyzw, ABCD) );
+	simd_check_string("zCwD", simd_shuf_zCwD(xyzw, ABCD) );
+	simd_check_string("xAzC", simd_shuf_xAzC(xyzw, ABCD) );
+	simd_check_string("yBwD", simd_shuf_yBwD(xyzw, ABCD) );
+	simd_check_string("CzDw", simd_shuf_CzDw(xyzw, ABCD) );
+}
+
+TEST(simd_compare)
+{
+	simd_check_uint32("cmpeq"
+		, simd_cmpeq(simd_ld(1.0f, 2.0f, 3.0f, 4.0f), simd_ld(0.0f, 2.0f, 0.0f, 3.0f) )
+		, 0, 0xffffffff, 0, 0
+		);
+
+	simd_check_uint32("cmplt"
+		, simd_cmplt(simd_ld(1.0f, 2.0f, 3.0f, 4.0f), simd_ld(0.0f, 2.0f, 0.0f, 3.0f) )
+		, 0, 0, 0, 0
+		);
+
+	simd_check_uint32("cmple"
+		, simd_cmple(simd_ld(1.0f, 2.0f, 3.0f, 4.0f), simd_ld(0.0f, 2.0f, 0.0f, 3.0f) )
+		, 0, 0xffffffff, 0, 0
+		);
+
+	simd_check_uint32("cmpgt"
+		, simd_cmpgt(simd_ld(1.0f, 2.0f, 3.0f, 4.0f), simd_ld(0.0f, 2.0f, 0.0f, 3.0f) )
+		, 0xffffffff, 0, 0xffffffff, 0xffffffff
+		);
+
+	simd_check_uint32("cmpge"
+		, simd_cmpge(simd_ld(1.0f, 2.0f, 3.0f, 4.0f), simd_ld(0.0f, 2.0f, 0.0f, 3.0f) )
+		, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff
+		);
+
+	simd_check_uint32("icmpeq"
+		, simd_icmpeq(simd_ild(0, 1, 2, 3), simd_ild(0, uint32_t(-2), 1, 3) )
+		, 0xffffffff, 0, 0, 0xffffffff
+		);
+
+	simd_check_uint32("icmplt"
+		, simd_icmplt(simd_ild(0, 1, 2, 3), simd_ild(0, uint32_t(-2), 1, 3) )
+		, 0, 0, 0, 0
+		);
+
+	simd_check_uint32("icmpgt"
+		, simd_icmpgt(simd_ild(0, 1, 2, 3), simd_ild(0, uint32_t(-2), 1, 3) )
+		, 0, 0xffffffff, 0xffffffff, 0
+		);
+}
+
+TEST(simd_test)
+{
+	simd_check_bool("test_any_xyzw"
+		, simd_test_any_xyzw(simd_ild(0xffffffff, 0, 0, 0) )
+		, true
+		);
+
+	simd_check_bool("test_all_xyzw"
+		, simd_test_all_xyzw(simd_ild(0xffffffff, 0, 0xffffffff, 0) )
+		, false
+		);
+
+	simd_check_bool("test_all_xyzw"
+		, simd_test_all_xyzw(simd_ild(0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff) )
+		, true
+		);
+
+	simd_check_bool("test_all_xw"
+		, simd_test_all_xw(simd_ild(0xffffffff, 0, 0, 0xffffffff) )
+		, true
+		);
+
+	simd_check_bool("test_all_xzw"
+		, simd_test_all_xzw(simd_ild(0xffffffff, 0, 0, 0xffffffff) )
+		, false
+		);
+}
+
+TEST(simd_load)
+{
+	simd_check_float("ld"
+		, simd_ld(0.0f, 1.0f, 2.0f, 3.0f)
+		, 0.0f, 1.0f, 2.0f, 3.0f
+		);
+
+	simd_check_int32("ild"
+		, simd_ild(uint32_t(-1), 0, 1, 2)
+		, uint32_t(-1), 0, 1, 2
+		);
+
+	simd_check_int32("ild"
+		, simd_ild(uint32_t(-1), uint32_t(-2), uint32_t(-3), uint32_t(-4) )
+		, uint32_t(-1), uint32_t(-2), uint32_t(-3), uint32_t(-4)
+		);
+
+	simd_check_uint32("zero", simd_zero()
+		, 0, 0, 0, 0
+		);
+
+	simd_check_uint32("isplat", simd_isplat(0x80000001)
+		, 0x80000001, 0x80000001, 0x80000001, 0x80000001
+		);
+
+	simd_check_float("isplat", simd_splat(1.0f)
+		, 1.0f, 1.0f, 1.0f, 1.0f
+		);
+}
+
+TEST(simd_arithmetic)
+{
+	simd_check_float("madd"
+		, simd_madd(simd_ld(0.0f, 1.0f, 2.0f, 3.0f), simd_ld(4.0f, 5.0f, 6.0f, 7.0f), simd_ld(8.0f, 9.0f, 10.0f, 11.0f) )
+		, 8.0f, 14.0f, 22.0f, 32.0f
+		);
+
+	simd_check_float("cross3"
+		, simd_cross3(simd_ld(1.0f, 0.0f, 0.0f, 0.0f), simd_ld(0.0f, 1.0f, 0.0f, 0.0f) )
+		, 0.0f, 0.0f, 1.0f, 0.0f
+		);
+}
+
+TEST(simd_sqrt)
+{
+	simd_check_float("simd_sqrt"
+		, simd_sqrt(simd_ld(1.0f, 16.0f, 65536.0f, 123456.0f) )
+		, 1.0f, 4.0f, 256.0f, 351.363060096f
+		);
+
+	simd_check_float("simd_sqrt_nr_ni"
+		, simd_sqrt_nr_ni(simd_ld(1.0f, 16.0f, 65536.0f, 123456.0f) )
+		, 1.0f, 4.0f, 256.0f, 351.363060096f
+		);
+
+	simd_check_float("simd_sqrt_nr1_ni"
+		, simd_sqrt_nr1_ni(simd_ld(1.0f, 16.0f, 65536.0f, 123456.0f) )
+		, 1.0f, 4.0f, 256.0f, 351.363060096f
+		);
+}
+
+TEST(float4)
+{
+	const simd128_t isplat = simd_isplat(0x80000001);
+	simd_check_uint32("sll"
+		, simd_sll(isplat, 1)
+		, 0x00000002, 0x00000002, 0x00000002, 0x00000002
+		);
+
+	simd_check_uint32("srl"
+		, simd_srl(isplat, 1)
+		, 0x40000000, 0x40000000, 0x40000000, 0x40000000
+		);
+
+	simd_check_uint32("sra"
+		, simd_sra(isplat, 1)
+		, 0xc0000000, 0xc0000000, 0xc0000000, 0xc0000000
+		);
+
+	simd_check_uint32("and"
+		, simd_and(simd_isplat(0x55555555), simd_isplat(0xaaaaaaaa) )
+		, 0, 0, 0, 0
+		);
+
+	simd_check_uint32("or "
+		, simd_or(simd_isplat(0x55555555), simd_isplat(0xaaaaaaaa) )
+		, uint32_t(-1), uint32_t(-1), uint32_t(-1), uint32_t(-1)
+		);
+
+	simd_check_uint32("xor"
+		, simd_or(simd_isplat(0x55555555), simd_isplat(0xaaaaaaaa) )
+		, uint32_t(-1), uint32_t(-1), uint32_t(-1), uint32_t(-1)
+		);
+
+	simd_check_int32("imin"
+		, simd_imin(simd_ild(0, 1, 2, 3), simd_ild(uint32_t(-1), 2, uint32_t(-2), 1) )
+		, uint32_t(-1), 1, uint32_t(-2), 1
+		);
+
+	simd_check_float("min"
+		, simd_min(simd_ld(0.0f, 1.0f, 2.0f, 3.0f), simd_ld(-1.0f, 2.0f, -2.0f, 1.0f) )
+		, -1.0f, 1.0f, -2.0f, 1.0f
+		);
+
+	simd_check_int32("imax"
+		, simd_imax(simd_ild(0, 1, 2, 3), simd_ild(uint32_t(-1), 2, uint32_t(-2), 1) )
+		, 0, 2, 2, 3
+		);
+
+	simd_check_float("max"
+		, simd_max(simd_ld(0.0f, 1.0f, 2.0f, 3.0f), simd_ld(-1.0f, 2.0f, -2.0f, 1.0f) )
+		, 0.0f, 2.0f, 2.0f, 3.0f
+		);
+}