Branimir Karadžić 8 år sedan
förälder
incheckning
44253ed7f7

+ 3 - 6
include/bx/inline/simd128_langext.inl

@@ -3,8 +3,9 @@
  * License: https://github.com/bkaradzic/bx#license-bsd-2-clause
  * License: https://github.com/bkaradzic/bx#license-bsd-2-clause
  */
  */
 
 
-#ifndef BX_SIMD128_LANGEXT_H_HEADER_GUARD
-#define BX_SIMD128_LANGEXT_H_HEADER_GUARD
+#ifndef BX_SIMD_T_H_HEADER_GUARD
+#	error "Must be included from bx/simd_t.h!"
+#endif // BX_SIMD_T_H_HEADER_GUARD
 
 
 #define simd_rcp           simd_rcp_ni
 #define simd_rcp           simd_rcp_ni
 #define simd_orx           simd_orx_ni
 #define simd_orx           simd_orx_ni
@@ -38,8 +39,6 @@
 #define simd_imin          simd_imin_ni
 #define simd_imin          simd_imin_ni
 #define simd_imax          simd_imax_ni
 #define simd_imax          simd_imax_ni
 
 
-#include "simd_ni.inl"
-
 namespace bx
 namespace bx
 {
 {
 #define ELEMx 0
 #define ELEMx 0
@@ -511,5 +510,3 @@ BX_SIMD128_IMPLEMENT_TEST(xyzw , 0xf);
 	typedef simd128_langext_t simd128_t;
 	typedef simd128_langext_t simd128_t;
 
 
 } // namespace bx
 } // namespace bx
-
-#endif // BX_SIMD128_LANGEXT_H_HEADER_GUARD

+ 3 - 6
include/bx/inline/simd128_neon.inl

@@ -3,8 +3,9 @@
  * License: https://github.com/bkaradzic/bx#license-bsd-2-clause
  * License: https://github.com/bkaradzic/bx#license-bsd-2-clause
  */
  */
 
 
-#ifndef BX_SIMD128_NEON_H_HEADER_GUARD
-#define BX_SIMD128_NEON_H_HEADER_GUARD
+#ifndef BX_SIMD_T_H_HEADER_GUARD
+#	error "Must be included from bx/simd_t.h!"
+#endif // BX_SIMD_T_H_HEADER_GUARD
 
 
 #define simd_rcp           simd_rcp_ni
 #define simd_rcp           simd_rcp_ni
 #define simd_orx           simd_orx_ni
 #define simd_orx           simd_orx_ni
@@ -35,8 +36,6 @@
 #define simd_ceil          simd_ceil_ni
 #define simd_ceil          simd_ceil_ni
 #define simd_floor         simd_floor_ni
 #define simd_floor         simd_floor_ni
 
 
-#include "simd_ni.inl"
-
 namespace bx
 namespace bx
 {
 {
 #define ELEMx 0
 #define ELEMx 0
@@ -558,5 +557,3 @@ BX_SIMD128_IMPLEMENT_TEST(yzw, yzww);
 	typedef simd128_neon_t simd128_t;
 	typedef simd128_neon_t simd128_t;
 
 
 } // namespace bx
 } // namespace bx
-
-#endif // BX_SIMD128_NEON_H_HEADER_GUARD

+ 38 - 6
include/bx/inline/simd128_ref.inl

@@ -3,8 +3,9 @@
  * License: https://github.com/bkaradzic/bx#license-bsd-2-clause
  * License: https://github.com/bkaradzic/bx#license-bsd-2-clause
  */
  */
 
 
-#ifndef BX_SIMD128_REF_H_HEADER_GUARD
-#define BX_SIMD128_REF_H_HEADER_GUARD
+#ifndef BX_SIMD_T_H_HEADER_GUARD
+#	error "Must be included from bx/simd_t.h!"
+#endif // BX_SIMD_T_H_HEADER_GUARD
 
 
 #define simd_shuf_xAzC simd_shuf_xAzC_ni
 #define simd_shuf_xAzC simd_shuf_xAzC_ni
 #define simd_shuf_yBwD simd_shuf_yBwD_ni
 #define simd_shuf_yBwD simd_shuf_yBwD_ni
@@ -35,8 +36,6 @@
 #define simd_ceil simd_ceil_ni
 #define simd_ceil simd_ceil_ni
 #define simd_floor simd_floor_ni
 #define simd_floor simd_floor_ni
 
 
-#include "simd_ni.inl"
-
 namespace bx
 namespace bx
 {
 {
 #define ELEMx 0
 #define ELEMx 0
@@ -643,6 +642,39 @@ BX_SIMD128_IMPLEMENT_TEST(xyzw , 0xf);
 		return result;
 		return result;
 	}
 	}
 
 
-} // namespace bx
+	BX_SIMD_FORCE_INLINE simd128_t simd_zero() // Untemplated overload fixed to simd128_t, so callers need not spell the type.
+	{
+		return simd_zero<simd128_t>(); // Forwards to the templated simd_zero instantiated with simd128_t.
+	}
+
+	BX_SIMD_FORCE_INLINE simd128_t simd_ld(const void* _ptr) // Untemplated overload fixed to simd128_t.
+	{
+		return simd_ld<simd128_t>(_ptr); // _ptr is handed unchanged to the templated simd_ld.
+	}
+
+	BX_SIMD_FORCE_INLINE simd128_t simd_ld(float _x, float _y, float _z, float _w) // Untemplated four-float overload fixed to simd128_t.
+	{
+		return simd_ld<simd128_t>(_x, _y, _z, _w); // Arguments are forwarded in the same x, y, z, w order.
+	}
 
 
-#endif // BX_SIMD128_REF_H_HEADER_GUARD
+	BX_SIMD_FORCE_INLINE simd128_t simd_ild(uint32_t _x, uint32_t _y, uint32_t _z, uint32_t _w) // Untemplated four-uint32_t overload fixed to simd128_t.
+	{
+		return simd_ild<simd128_t>(_x, _y, _z, _w); // Arguments are forwarded in the same x, y, z, w order.
+	}
+
+	BX_SIMD_FORCE_INLINE simd128_t simd_splat(const void* _ptr) // Untemplated pointer overload fixed to simd128_t.
+	{
+		return simd_splat<simd128_t>(_ptr); // _ptr is handed unchanged to the templated simd_splat.
+	}
+
+	BX_SIMD_FORCE_INLINE simd128_t simd_splat(float _a) // Untemplated float overload fixed to simd128_t.
+	{
+		return simd_splat<simd128_t>(_a); // Forwards to the templated simd_splat instantiated with simd128_t.
+	}
+
+	BX_SIMD_FORCE_INLINE simd128_t simd_isplat(uint32_t _a) // Untemplated uint32_t overload fixed to simd128_t.
+	{
+		return simd_isplat<simd128_t>(_a); // Forwards to the templated simd_isplat instantiated with simd128_t.
+	}
+
+} // namespace bx

+ 3 - 6
include/bx/inline/simd128_sse.inl

@@ -3,10 +3,9 @@
  * License: https://github.com/bkaradzic/bx#license-bsd-2-clause
  * License: https://github.com/bkaradzic/bx#license-bsd-2-clause
  */
  */
 
 
-#ifndef BX_SIMD128_SSE_H_HEADER_GUARD
-#define BX_SIMD128_SSE_H_HEADER_GUARD
-
-#include "simd_ni.inl"
+#ifndef BX_SIMD_T_H_HEADER_GUARD
+#	error "Must be included from bx/simd_t.h!"
+#endif // BX_SIMD_T_H_HEADER_GUARD
 
 
 namespace bx
 namespace bx
 {
 {
@@ -643,5 +642,3 @@ BX_SIMD128_IMPLEMENT_TEST(xyzw , 0xf);
 	typedef simd128_sse_t simd128_t;
 	typedef simd128_sse_t simd128_t;
 
 
 } // namespace bx
 } // namespace bx
-
-#endif // BX_SIMD128_SSE_H_HEADER_GUARD

+ 3 - 6
include/bx/inline/simd256_avx.inl

@@ -3,10 +3,9 @@
  * License: https://github.com/bkaradzic/bx#license-bsd-2-clause
  * License: https://github.com/bkaradzic/bx#license-bsd-2-clause
  */
  */
 
 
-#ifndef BX_SIMD256_AVX_H_HEADER_GUARD
-#define BX_SIMD256_AVX_H_HEADER_GUARD
-
-#include "simd_ni.inl"
+#ifndef BX_SIMD_T_H_HEADER_GUARD
+#	error "Must be included from bx/simd_t.h!"
+#endif // BX_SIMD_T_H_HEADER_GUARD
 
 
 namespace bx
 namespace bx
 {
 {
@@ -73,5 +72,3 @@ namespace bx
 	typedef simd256_avx_t simd256_t;
 	typedef simd256_avx_t simd256_t;
 
 
 } // namespace bx
 } // namespace bx
-
-#endif // BX_SIMD256_AVX_H_HEADER_GUARD

+ 3 - 6
include/bx/inline/simd256_ref.inl

@@ -3,10 +3,9 @@
  * License: https://github.com/bkaradzic/bx#license-bsd-2-clause
  * License: https://github.com/bkaradzic/bx#license-bsd-2-clause
  */
  */
 
 
-#ifndef BX_SIMD256_REF_H_HEADER_GUARD
-#define BX_SIMD256_REF_H_HEADER_GUARD
-
-#include "simd_ni.inl"
+#ifndef BX_SIMD_T_H_HEADER_GUARD
+#	error "Must be included from bx/simd_t.h!"
+#endif // BX_SIMD_T_H_HEADER_GUARD
 
 
 namespace bx
 namespace bx
 {
 {
@@ -83,5 +82,3 @@ namespace bx
 	}
 	}
 
 
 } // namespace bx
 } // namespace bx
-
-#endif // BX_SIMD256_REF_H_HEADER_GUARD

+ 0 - 5
include/bx/inline/simd_ni.inl

@@ -3,9 +3,6 @@
  * License: https://github.com/bkaradzic/bx#license-bsd-2-clause
  * License: https://github.com/bkaradzic/bx#license-bsd-2-clause
  */
  */
 
 
-#ifndef BX_SIMD_NI_H_HEADER_GUARD
-#define BX_SIMD_NI_H_HEADER_GUARD
-
 namespace bx
 namespace bx
 {
 {
 	template<typename Ty>
 	template<typename Ty>
@@ -554,5 +551,3 @@ namespace bx
 	}
 	}
 
 
 } // namespace bx
 } // namespace bx
-
-#endif // BX_SIMD_NI_H_HEADER_GUARD

+ 206 - 120
include/bx/simd_t.h

@@ -52,7 +52,7 @@ namespace bx
 #define ELEMw 3
 #define ELEMw 3
 #define BX_SIMD128_IMPLEMENT_SWIZZLE(_x, _y, _z, _w) \
 #define BX_SIMD128_IMPLEMENT_SWIZZLE(_x, _y, _z, _w) \
 			template<typename Ty> \
 			template<typename Ty> \
-			BX_SIMD_FORCE_INLINE Ty simd_swiz_##_x##_y##_z##_w(Ty _a);
+			Ty simd_swiz_##_x##_y##_z##_w(Ty _a);
 #include "inline/simd128_swizzle.inl"
 #include "inline/simd128_swizzle.inl"
 
 
 #undef BX_SIMD128_IMPLEMENT_SWIZZLE
 #undef BX_SIMD128_IMPLEMENT_SWIZZLE
@@ -86,254 +86,362 @@ BX_SIMD128_IMPLEMENT_TEST(xyzw);
 #undef BX_SIMD128_IMPLEMENT_TEST
 #undef BX_SIMD128_IMPLEMENT_TEST
 
 
 	template<typename Ty>
 	template<typename Ty>
-	BX_SIMD_FORCE_INLINE Ty simd_shuf_xyAB(Ty _a, Ty _b);
+	Ty simd_shuf_xyAB(Ty _a, Ty _b);
 
 
 	template<typename Ty>
 	template<typename Ty>
-	BX_SIMD_FORCE_INLINE Ty simd_shuf_ABxy(Ty _a, Ty _b);
+	Ty simd_shuf_ABxy(Ty _a, Ty _b);
 
 
 	template<typename Ty>
 	template<typename Ty>
-	BX_SIMD_FORCE_INLINE Ty simd_shuf_CDzw(Ty _a, Ty _b);
+	Ty simd_shuf_CDzw(Ty _a, Ty _b);
 
 
 	template<typename Ty>
 	template<typename Ty>
-	BX_SIMD_FORCE_INLINE Ty simd_shuf_zwCD(Ty _a, Ty _b);
+	Ty simd_shuf_zwCD(Ty _a, Ty _b);
 
 
 	template<typename Ty>
 	template<typename Ty>
-	BX_SIMD_FORCE_INLINE Ty simd_shuf_xAyB(Ty _a, Ty _b);
+	Ty simd_shuf_xAyB(Ty _a, Ty _b);
 
 
 	template<typename Ty>
 	template<typename Ty>
-	BX_SIMD_FORCE_INLINE Ty simd_shuf_yBxA(Ty _a, Ty _b);
+	Ty simd_shuf_yBxA(Ty _a, Ty _b);
 
 
 	template<typename Ty>
 	template<typename Ty>
-	BX_SIMD_FORCE_INLINE Ty simd_shuf_zCwD(Ty _a, Ty _b);
+	Ty simd_shuf_zCwD(Ty _a, Ty _b);
 
 
 	template<typename Ty>
 	template<typename Ty>
-	BX_SIMD_FORCE_INLINE Ty simd_shuf_CzDw(Ty _a, Ty _b);
+	Ty simd_shuf_CzDw(Ty _a, Ty _b);
 
 
 	template<typename Ty>
 	template<typename Ty>
-	BX_SIMD_FORCE_INLINE float simd_x(Ty _a);
+	float simd_x(Ty _a);
 
 
 	template<typename Ty>
 	template<typename Ty>
-	BX_SIMD_FORCE_INLINE float simd_y(Ty _a);
+	float simd_y(Ty _a);
 
 
 	template<typename Ty>
 	template<typename Ty>
-	BX_SIMD_FORCE_INLINE float simd_z(Ty _a);
+	float simd_z(Ty _a);
 
 
 	template<typename Ty>
 	template<typename Ty>
-	BX_SIMD_FORCE_INLINE float simd_w(Ty _a);
+	float simd_w(Ty _a);
 
 
 	template<typename Ty>
 	template<typename Ty>
-	BX_SIMD_FORCE_INLINE Ty simd_ld(const void* _ptr);
+	Ty simd_ld(const void* _ptr);
 
 
 	template<typename Ty>
 	template<typename Ty>
-	BX_SIMD_FORCE_INLINE void simd_st(void* _ptr, Ty _a);
+	void simd_st(void* _ptr, Ty _a);
 
 
 	template<typename Ty>
 	template<typename Ty>
-	BX_SIMD_FORCE_INLINE void simd_stx(void* _ptr, Ty _a);
+	void simd_stx(void* _ptr, Ty _a);
 
 
 	template<typename Ty>
 	template<typename Ty>
-	BX_SIMD_FORCE_INLINE void simd_stream(void* _ptr, Ty _a);
+	void simd_stream(void* _ptr, Ty _a);
 
 
 	template<typename Ty>
 	template<typename Ty>
-	BX_SIMD_FORCE_INLINE Ty simd_ld(float _x, float _y, float _z, float _w);
+	Ty simd_ld(float _x, float _y, float _z, float _w);
 
 
 	template<typename Ty>
 	template<typename Ty>
-	BX_SIMD_FORCE_INLINE Ty simd_ld(float _x, float _y, float _z, float _w, float _a, float _b, float _c, float _d);
+	Ty simd_ld(float _x, float _y, float _z, float _w, float _a, float _b, float _c, float _d);
 
 
 	template<typename Ty>
 	template<typename Ty>
-	BX_SIMD_FORCE_INLINE Ty simd_ild(uint32_t _x, uint32_t _y, uint32_t _z, uint32_t _w);
+	Ty simd_ild(uint32_t _x, uint32_t _y, uint32_t _z, uint32_t _w);
 
 
 	template<typename Ty>
 	template<typename Ty>
-	BX_SIMD_FORCE_INLINE Ty simd_ild(uint32_t _x, uint32_t _y, uint32_t _z, uint32_t _w, uint32_t _a, uint32_t _b, uint32_t _c, uint32_t _d);
+	Ty simd_ild(uint32_t _x, uint32_t _y, uint32_t _z, uint32_t _w, uint32_t _a, uint32_t _b, uint32_t _c, uint32_t _d);
 
 
 	template<typename Ty>
 	template<typename Ty>
-	BX_SIMD_FORCE_INLINE Ty simd_splat(const void* _ptr);
+	Ty simd_splat(const void* _ptr);
 
 
 	template<typename Ty>
 	template<typename Ty>
-	BX_SIMD_FORCE_INLINE Ty simd_splat(float _a);
+	Ty simd_splat(float _a);
 
 
 	template<typename Ty>
 	template<typename Ty>
-	BX_SIMD_FORCE_INLINE Ty simd_isplat(uint32_t _a);
+	Ty simd_isplat(uint32_t _a);
 
 
 	template<typename Ty>
 	template<typename Ty>
-	BX_SIMD_FORCE_INLINE Ty simd_zero();
+	Ty simd_zero();
 
 
 	template<typename Ty>
 	template<typename Ty>
-	BX_SIMD_FORCE_INLINE Ty simd_itof(Ty _a);
+	Ty simd_itof(Ty _a);
 
 
 	template<typename Ty>
 	template<typename Ty>
-	BX_SIMD_FORCE_INLINE Ty simd_ftoi(Ty _a);
+	Ty simd_ftoi(Ty _a);
 
 
 	template<typename Ty>
 	template<typename Ty>
-	BX_SIMD_FORCE_INLINE Ty simd_round(Ty _a);
+	Ty simd_round(Ty _a);
 
 
 	template<typename Ty>
 	template<typename Ty>
-	BX_SIMD_FORCE_INLINE Ty simd_add(Ty _a, Ty _b);
+	Ty simd_add(Ty _a, Ty _b);
 
 
 	template<typename Ty>
 	template<typename Ty>
-	BX_SIMD_FORCE_INLINE Ty simd_sub(Ty _a, Ty _b);
+	Ty simd_sub(Ty _a, Ty _b);
 
 
 	template<typename Ty>
 	template<typename Ty>
-	BX_SIMD_FORCE_INLINE Ty simd_mul(Ty _a, Ty _b);
+	Ty simd_mul(Ty _a, Ty _b);
 
 
 	template<typename Ty>
 	template<typename Ty>
-	BX_SIMD_FORCE_INLINE Ty simd_div(Ty _a, Ty _b);
+	Ty simd_div(Ty _a, Ty _b);
 
 
 	template<typename Ty>
 	template<typename Ty>
-	BX_SIMD_FORCE_INLINE Ty simd_rcp_est(Ty _a);
+	Ty simd_rcp_est(Ty _a);
 
 
 	template<typename Ty>
 	template<typename Ty>
-	BX_SIMD_FORCE_INLINE Ty simd_sqrt(Ty _a);
+	Ty simd_sqrt(Ty _a);
 
 
 	template<typename Ty>
 	template<typename Ty>
-	BX_SIMD_FORCE_INLINE Ty simd_rsqrt_est(Ty _a);
+	Ty simd_rsqrt_est(Ty _a);
 
 
 	template<typename Ty>
 	template<typename Ty>
-	BX_SIMD_FORCE_INLINE Ty simd_dot3(Ty _a, Ty _b);
+	Ty simd_dot3(Ty _a, Ty _b);
 
 
 	template<typename Ty>
 	template<typename Ty>
-	BX_SIMD_FORCE_INLINE Ty simd_dot(Ty _a, Ty _b);
+	Ty simd_dot(Ty _a, Ty _b);
 
 
 	template<typename Ty>
 	template<typename Ty>
-	BX_SIMD_FORCE_INLINE Ty simd_cmpeq(Ty _a, Ty _b);
+	Ty simd_cmpeq(Ty _a, Ty _b);
 
 
 	template<typename Ty>
 	template<typename Ty>
-	BX_SIMD_FORCE_INLINE Ty simd_cmplt(Ty _a, Ty _b);
+	Ty simd_cmplt(Ty _a, Ty _b);
 
 
 	template<typename Ty>
 	template<typename Ty>
-	BX_SIMD_FORCE_INLINE Ty simd_cmple(Ty _a, Ty _b);
+	Ty simd_cmple(Ty _a, Ty _b);
 
 
 	template<typename Ty>
 	template<typename Ty>
-	BX_SIMD_FORCE_INLINE Ty simd_cmpgt(Ty _a, Ty _b);
+	Ty simd_cmpgt(Ty _a, Ty _b);
 
 
 	template<typename Ty>
 	template<typename Ty>
-	BX_SIMD_FORCE_INLINE Ty simd_cmpge(Ty _a, Ty _b);
+	Ty simd_cmpge(Ty _a, Ty _b);
 
 
 	template<typename Ty>
 	template<typename Ty>
-	BX_SIMD_FORCE_INLINE Ty simd_min(Ty _a, Ty _b);
+	Ty simd_min(Ty _a, Ty _b);
 
 
 	template<typename Ty>
 	template<typename Ty>
-	BX_SIMD_FORCE_INLINE Ty simd_max(Ty _a, Ty _b);
+	Ty simd_max(Ty _a, Ty _b);
 
 
 	template<typename Ty>
 	template<typename Ty>
-	BX_SIMD_FORCE_INLINE Ty simd_and(Ty _a, Ty _b);
+	Ty simd_and(Ty _a, Ty _b);
 
 
 	template<typename Ty>
 	template<typename Ty>
-	BX_SIMD_FORCE_INLINE Ty simd_andc(Ty _a, Ty _b);
+	Ty simd_andc(Ty _a, Ty _b);
 
 
 	template<typename Ty>
 	template<typename Ty>
-	BX_SIMD_FORCE_INLINE Ty simd_or(Ty _a, Ty _b);
+	Ty simd_or(Ty _a, Ty _b);
 
 
 	template<typename Ty>
 	template<typename Ty>
-	BX_SIMD_FORCE_INLINE Ty simd_xor(Ty _a, Ty _b);
+	Ty simd_xor(Ty _a, Ty _b);
 
 
 	template<typename Ty>
 	template<typename Ty>
-	BX_SIMD_FORCE_INLINE Ty simd_sll(Ty _a, int _count);
+	Ty simd_sll(Ty _a, int _count);
 
 
 	template<typename Ty>
 	template<typename Ty>
-	BX_SIMD_FORCE_INLINE Ty simd_srl(Ty _a, int _count);
+	Ty simd_srl(Ty _a, int _count);
 
 
 	template<typename Ty>
 	template<typename Ty>
-	BX_SIMD_FORCE_INLINE Ty simd_sra(Ty _a, int _count);
+	Ty simd_sra(Ty _a, int _count);
 
 
 	template<typename Ty>
 	template<typename Ty>
-	BX_SIMD_FORCE_INLINE Ty simd_icmpeq(Ty _a, Ty _b);
+	Ty simd_icmpeq(Ty _a, Ty _b);
 
 
 	template<typename Ty>
 	template<typename Ty>
-	BX_SIMD_FORCE_INLINE Ty simd_icmplt(Ty _a, Ty _b);
+	Ty simd_icmplt(Ty _a, Ty _b);
 
 
 	template<typename Ty>
 	template<typename Ty>
-	BX_SIMD_FORCE_INLINE Ty simd_icmpgt(Ty _a, Ty _b);
+	Ty simd_icmpgt(Ty _a, Ty _b);
 
 
 	template<typename Ty>
 	template<typename Ty>
-	BX_SIMD_FORCE_INLINE Ty simd_imin(Ty _a, Ty _b);
+	Ty simd_imin(Ty _a, Ty _b);
 
 
 	template<typename Ty>
 	template<typename Ty>
-	BX_SIMD_FORCE_INLINE Ty simd_imax(Ty _a, Ty _b);
+	Ty simd_imax(Ty _a, Ty _b);
 
 
 	template<typename Ty>
 	template<typename Ty>
-	BX_SIMD_FORCE_INLINE Ty simd_iadd(Ty _a, Ty _b);
+	Ty simd_iadd(Ty _a, Ty _b);
 
 
 	template<typename Ty>
 	template<typename Ty>
-	BX_SIMD_FORCE_INLINE Ty simd_isub(Ty _a, Ty _b);
+	Ty simd_isub(Ty _a, Ty _b);
 
 
 	template<typename Ty>
 	template<typename Ty>
-	BX_SIMD_INLINE Ty simd_shuf_xAzC(Ty _a, Ty _b);
+	Ty simd_shuf_xAzC(Ty _a, Ty _b);
 
 
 	template<typename Ty>
 	template<typename Ty>
-	BX_SIMD_INLINE Ty simd_shuf_yBwD(Ty _a, Ty _b);
+	Ty simd_shuf_yBwD(Ty _a, Ty _b);
 
 
 	template<typename Ty>
 	template<typename Ty>
-	BX_SIMD_INLINE Ty simd_rcp(Ty _a);
+	Ty simd_rcp(Ty _a);
 
 
 	template<typename Ty>
 	template<typename Ty>
-	BX_SIMD_INLINE Ty simd_orx(Ty _a);
+	Ty simd_orx(Ty _a);
 
 
 	template<typename Ty>
 	template<typename Ty>
-	BX_SIMD_INLINE Ty simd_orc(Ty _a, Ty _b);
+	Ty simd_orc(Ty _a, Ty _b);
 
 
 	template<typename Ty>
 	template<typename Ty>
-	BX_SIMD_INLINE Ty simd_neg(Ty _a);
+	Ty simd_neg(Ty _a);
 
 
 	template<typename Ty>
 	template<typename Ty>
-	BX_SIMD_INLINE Ty simd_madd(Ty _a, Ty _b, Ty _c);
+	Ty simd_madd(Ty _a, Ty _b, Ty _c);
 
 
 	template<typename Ty>
 	template<typename Ty>
-	BX_SIMD_INLINE Ty simd_nmsub(Ty _a, Ty _b, Ty _c);
+	Ty simd_nmsub(Ty _a, Ty _b, Ty _c);
 
 
 	template<typename Ty>
 	template<typename Ty>
-	BX_SIMD_INLINE Ty simd_div_nr(Ty _a, Ty _b);
+	Ty simd_div_nr(Ty _a, Ty _b);
 
 
 	template<typename Ty>
 	template<typename Ty>
-	BX_SIMD_INLINE Ty simd_selb(Ty _mask, Ty _a, Ty _b);
+	Ty simd_selb(Ty _mask, Ty _a, Ty _b);
 
 
 	template<typename Ty>
 	template<typename Ty>
-	BX_SIMD_INLINE Ty simd_sels(Ty _test, Ty _a, Ty _b);
+	Ty simd_sels(Ty _test, Ty _a, Ty _b);
 
 
 	template<typename Ty>
 	template<typename Ty>
-	BX_SIMD_INLINE Ty simd_not(Ty _a);
+	Ty simd_not(Ty _a);
 
 
 	template<typename Ty>
 	template<typename Ty>
-	BX_SIMD_INLINE Ty simd_abs(Ty _a);
+	Ty simd_abs(Ty _a);
 
 
 	template<typename Ty>
 	template<typename Ty>
-	BX_SIMD_INLINE Ty simd_clamp(Ty _a, Ty _min, Ty _max);
+	Ty simd_clamp(Ty _a, Ty _min, Ty _max);
 
 
 	template<typename Ty>
 	template<typename Ty>
-	BX_SIMD_INLINE Ty simd_lerp(Ty _a, Ty _b, Ty _s);
+	Ty simd_lerp(Ty _a, Ty _b, Ty _s);
 
 
 	template<typename Ty>
 	template<typename Ty>
-	BX_SIMD_INLINE Ty simd_rsqrt(Ty _a);
+	Ty simd_rsqrt(Ty _a);
 
 
 	template<typename Ty>
 	template<typename Ty>
-	BX_SIMD_INLINE Ty simd_rsqrt_nr(Ty _a);
+	Ty simd_rsqrt_nr(Ty _a);
 
 
 	template<typename Ty>
 	template<typename Ty>
-	BX_SIMD_INLINE Ty simd_rsqrt_carmack(Ty _a);
+	Ty simd_rsqrt_carmack(Ty _a);
 
 
 	template<typename Ty>
 	template<typename Ty>
-	BX_SIMD_INLINE Ty simd_sqrt_nr(Ty _a);
+	Ty simd_sqrt_nr(Ty _a);
 
 
 	template<typename Ty>
 	template<typename Ty>
-	BX_SIMD_INLINE Ty simd_log2(Ty _a);
+	Ty simd_log2(Ty _a);
 
 
 	template<typename Ty>
 	template<typename Ty>
-	BX_SIMD_INLINE Ty simd_exp2(Ty _a);
+	Ty simd_exp2(Ty _a);
 
 
 	template<typename Ty>
 	template<typename Ty>
-	BX_SIMD_INLINE Ty simd_pow(Ty _a, Ty _b);
+	Ty simd_pow(Ty _a, Ty _b);
 
 
 	template<typename Ty>
 	template<typename Ty>
-	BX_SIMD_INLINE Ty simd_cross3(Ty _a, Ty _b);
+	Ty simd_cross3(Ty _a, Ty _b);
 
 
 	template<typename Ty>
 	template<typename Ty>
-	BX_SIMD_INLINE Ty simd_normalize3(Ty _a);
+	Ty simd_normalize3(Ty _a);
 
 
 	template<typename Ty>
 	template<typename Ty>
-	BX_SIMD_INLINE Ty simd_ceil(Ty _a);
+	Ty simd_ceil(Ty _a);
 
 
 	template<typename Ty>
 	template<typename Ty>
-	BX_SIMD_INLINE Ty simd_floor(Ty _a);
+	Ty simd_floor(Ty _a);
 
 
+	template<typename Ty>
+	Ty simd_shuf_xAzC_ni(Ty _a, Ty _b);
+
+	template<typename Ty>
+	Ty simd_shuf_yBwD_ni(Ty _a, Ty _b);
+
+	template<typename Ty>
+	Ty simd_madd_ni(Ty _a, Ty _b, Ty _c);
+
+	template<typename Ty>
+	Ty simd_nmsub_ni(Ty _a, Ty _b, Ty _c);
+
+	template<typename Ty>
+	Ty simd_div_nr_ni(Ty _a, Ty _b);
+
+	template<typename Ty>
+	Ty simd_rcp_ni(Ty _a);
+
+	template<typename Ty>
+	Ty simd_orx_ni(Ty _a);
+
+	template<typename Ty>
+	Ty simd_orc_ni(Ty _a, Ty _b);
+
+	template<typename Ty>
+	Ty simd_neg_ni(Ty _a);
+
+	template<typename Ty>
+	Ty simd_selb_ni(Ty _mask, Ty _a, Ty _b);
+
+	template<typename Ty>
+	Ty simd_sels_ni(Ty _test, Ty _a, Ty _b);
+
+	template<typename Ty>
+	Ty simd_not_ni(Ty _a);
+
+	template<typename Ty>
+	Ty simd_min_ni(Ty _a, Ty _b);
+
+	template<typename Ty>
+	Ty simd_max_ni(Ty _a, Ty _b);
+
+	template<typename Ty>
+	Ty simd_abs_ni(Ty _a);
+
+	template<typename Ty>
+	Ty simd_imin_ni(Ty _a, Ty _b);
+
+	template<typename Ty>
+	Ty simd_imax_ni(Ty _a, Ty _b);
+
+	template<typename Ty>
+	Ty simd_clamp_ni(Ty _a, Ty _min, Ty _max);
+
+	template<typename Ty>
+	Ty simd_lerp_ni(Ty _a, Ty _b, Ty _s);
+
+	template<typename Ty>
+	Ty simd_sqrt_nr_ni(Ty _a);
+
+	template<typename Ty>
+	Ty simd_sqrt_nr1_ni(Ty _a);
+
+	template<typename Ty>
+	Ty simd_rsqrt_ni(Ty _a);
+
+	template<typename Ty>
+	Ty simd_rsqrt_nr_ni(Ty _a);
+
+	template<typename Ty>
+	Ty simd_rsqrt_carmack_ni(Ty _a);
+	
+	template<typename Ty>
+	Ty simd_log2_ni(Ty _a);
+
+	template<typename Ty>
+	Ty simd_exp2_ni(Ty _a);
+
+	template<typename Ty>
+	Ty simd_pow_ni(Ty _a, Ty _b);
+
+	template<typename Ty>
+	Ty simd_dot3_ni(Ty _a, Ty _b);
+
+	template<typename Ty>
+	Ty simd_cross3_ni(Ty _a, Ty _b);
+
+	template<typename Ty>
+	Ty simd_normalize3_ni(Ty _a);
+
+	template<typename Ty>
+	Ty simd_dot_ni(Ty _a, Ty _b);
+
+	template<typename Ty>
+	Ty simd_ceil_ni(Ty _a);
+
+	template<typename Ty>
+	Ty simd_floor_ni(Ty _a);
+
+	template<typename Ty>
+	Ty simd_round_ni(Ty _a);
+
+	template<typename Ty>
+	bool simd_test_any_ni(Ty _a);
+
+	template<typename Ty>
+	bool simd_test_all_ni(Ty _a);	
+	
 #if BX_SIMD_AVX
 #if BX_SIMD_AVX
 	typedef __m256 simd256_avx_t;
 	typedef __m256 simd256_avx_t;
 #endif // BX_SIMD_AVX
 #endif // BX_SIMD_AVX
@@ -361,6 +469,8 @@ BX_SIMD128_IMPLEMENT_TEST(xyzw);
 
 
 } // namespace bx
 } // namespace bx
 
 
+#include "inline/simd_ni.inl"
+
 #if BX_SIMD_AVX
 #if BX_SIMD_AVX
 #	include "inline/simd256_avx.inl"
 #	include "inline/simd256_avx.inl"
 #endif // BX_SIMD_AVX
 #endif // BX_SIMD_AVX
@@ -421,47 +531,23 @@ namespace bx
 	typedef simd256_ref_t simd256_t;
 	typedef simd256_ref_t simd256_t;
 #endif // !BX_SIMD_AVX
 #endif // !BX_SIMD_AVX
 
 
-} // namespace bx
-
-#include "inline/simd128_ref.inl"
-#include "inline/simd256_ref.inl"
-
-namespace bx
-{
-	BX_SIMD_FORCE_INLINE simd128_t simd_zero()
-	{
-		return simd_zero<simd128_t>();
-	}
-
-	BX_SIMD_FORCE_INLINE simd128_t simd_ld(const void* _ptr)
-	{
-		return simd_ld<simd128_t>(_ptr);
-	}
+	simd128_t simd_zero();
+	
+	simd128_t simd_ld(const void* _ptr);
 
 
-	BX_SIMD_FORCE_INLINE simd128_t simd_ld(float _x, float _y, float _z, float _w)
-	{
-		return simd_ld<simd128_t>(_x, _y, _z, _w);
-	}
+	simd128_t simd_ld(float _x, float _y, float _z, float _w);
 
 
-	BX_SIMD_FORCE_INLINE simd128_t simd_ild(uint32_t _x, uint32_t _y, uint32_t _z, uint32_t _w)
-	{
-		return simd_ild<simd128_t>(_x, _y, _z, _w);
-	}
+	simd128_t simd_ild(uint32_t _x, uint32_t _y, uint32_t _z, uint32_t _w);
 
 
-	BX_SIMD_FORCE_INLINE simd128_t simd_splat(const void* _ptr)
-	{
-		return simd_splat<simd128_t>(_ptr);
-	}
+	simd128_t simd_splat(const void* _ptr);
 
 
-	BX_SIMD_FORCE_INLINE simd128_t simd_splat(float _a)
-	{
-		return simd_splat<simd128_t>(_a);
-	}
+	simd128_t simd_splat(float _a);
 
 
-	BX_SIMD_FORCE_INLINE simd128_t simd_isplat(uint32_t _a)
-	{
-		return simd_isplat<simd128_t>(_a);
-	}
+	simd128_t simd_isplat(uint32_t _a);
+	
 } // namespace bx
 } // namespace bx
 
 
+#include "inline/simd128_ref.inl"
+#include "inline/simd256_ref.inl"
+
 #endif // BX_SIMD_T_H_HEADER_GUARD
 #endif // BX_SIMD_T_H_HEADER_GUARD