Jelajahi Sumber

SIMD: AVX WIP.

Branimir Karadžić 9 tahun lalu
induk
melakukan
76d42a9e03
5 mengubah file dengan 280 tambahan dan 48 penghapusan
  1. 45 9
      include/bx/simd256_avx.inl
  2. 51 9
      include/bx/simd256_ref.inl
  3. 39 18
      include/bx/simd_t.h
  4. 9 0
      scripts/toolchain.lua
  5. 136 12
      tests/simd_t.cpp

+ 45 - 9
include/bx/simd256_avx.inl

@@ -1,9 +1,45 @@
-/*
- * Copyright 2010-2016 Branimir Karadzic. All rights reserved.
- * License: https://github.com/bkaradzic/bx#license-bsd-2-clause
- */
-
-#ifndef BX_SIMD256_AVX_H_HEADER_GUARD
-#define BX_SIMD256_AVX_H_HEADER_GUARD
-
-#endif // BX_SIMD256_AVX_H_HEADER_GUARD
+/*
+ * Copyright 2010-2016 Branimir Karadzic. All rights reserved.
+ * License: https://github.com/bkaradzic/bx#license-bsd-2-clause
+ */
+
+#ifndef BX_SIMD256_AVX_H_HEADER_GUARD
+#define BX_SIMD256_AVX_H_HEADER_GUARD
+
+#include "simd_ni.inl"
+
+namespace bx
+{
+
+	template<>
+	BX_SIMD_FORCE_INLINE simd256_avx_t simd_ld(const void* _ptr)
+	{
+		return _mm256_load_ps(reinterpret_cast<const float*>(_ptr) );
+	}
+
+	template<>
+	BX_SIMD_FORCE_INLINE void simd_st(void* _ptr, simd256_avx_t _a)
+	{
+		_mm256_store_ps(reinterpret_cast<float*>(_ptr), _a);
+	}
+
+	template<>
+	BX_SIMD_FORCE_INLINE simd256_avx_t simd_ld(float _x, float _y, float _z, float _w, float _A, float _B, float _C, float _D)
+	{
+		return _mm256_set_ps(_D, _C, _B, _A, _w, _z, _y, _x);
+	}
+
+	template<>
+	BX_SIMD_FORCE_INLINE simd256_avx_t simd_ild(uint32_t _x, uint32_t _y, uint32_t _z, uint32_t _w, uint32_t _A, uint32_t _B, uint32_t _C, uint32_t _D)
+	{
+		const __m256i set          = _mm256_set_epi32(_D, _C, _B, _A, _w, _z, _y, _x);
+		const simd256_avx_t result = _mm256_castsi256_ps(set);
+
+		return result;
+	}
+
+	typedef simd256_avx_t simd256_t;
+
+} // namespace bx
+
+#endif // BX_SIMD256_AVX_H_HEADER_GUARD

+ 51 - 9
include/bx/simd256_ref.inl

@@ -1,9 +1,51 @@
-/*
- * Copyright 2010-2016 Branimir Karadzic. All rights reserved.
- * License: https://github.com/bkaradzic/bx#license-bsd-2-clause
- */
-
-#ifndef BX_SIMD256_REF_H_HEADER_GUARD
-#define BX_SIMD256_REF_H_HEADER_GUARD
-
-#endif // BX_SIMD256_REF_H_HEADER_GUARD
+/*
+ * Copyright 2010-2016 Branimir Karadzic. All rights reserved.
+ * License: https://github.com/bkaradzic/bx#license-bsd-2-clause
+ */
+
+#ifndef BX_SIMD256_REF_H_HEADER_GUARD
+#define BX_SIMD256_REF_H_HEADER_GUARD
+
+#include "simd_ni.inl"
+
+namespace bx
+{
+	template<>
+	BX_SIMD_FORCE_INLINE simd256_ref_t simd_ld(const void* _ptr)
+	{
+		const simd128_t* ptr = reinterpret_cast<const simd128_t*>(_ptr);
+		simd256_ref_t result;
+		result.simd128[0] = simd_ld<simd128_t>(&ptr[0]);
+		result.simd128[1] = simd_ld<simd128_t>(&ptr[1]);
+		return result;
+	}
+
+	template<>
+	BX_SIMD_FORCE_INLINE void simd_st(void* _ptr, simd256_ref_t _a)
+	{
+		simd128_t* result = reinterpret_cast<simd128_t*>(_ptr);
+		simd_st<simd128_t>(&result[0], _a.simd128[0]);
+		simd_st<simd128_t>(&result[1], _a.simd128[1]);
+	}
+
+	template<>
+	BX_SIMD_FORCE_INLINE simd256_ref_t simd_ld(float _x, float _y, float _z, float _w, float _A, float _B, float _C, float _D)
+	{
+		simd256_ref_t result;
+		result.simd128[0] = simd_ld<simd128_t>(_x, _y, _z, _w);
+		result.simd128[1] = simd_ld<simd128_t>(_A, _B, _C, _D);
+		return result;
+	}
+
+	template<>
+	BX_SIMD_FORCE_INLINE simd256_ref_t simd_ild(uint32_t _x, uint32_t _y, uint32_t _z, uint32_t _w, uint32_t _A, uint32_t _B, uint32_t _C, uint32_t _D)
+	{
+		simd256_ref_t result;
+		result.simd128[0] = simd_ild<simd128_t>(_x, _y, _z, _w);
+		result.simd128[1] = simd_ild<simd128_t>(_A, _B, _C, _D);
+		return result;
+	}
+
+} // namespace bx
+
+#endif // BX_SIMD256_REF_H_HEADER_GUARD

+ 39 - 18
include/bx/simd_t.h

@@ -135,9 +135,15 @@ BX_SIMD128_IMPLEMENT_TEST(xyzw);
 	template<typename Ty>
 	BX_SIMD_FORCE_INLINE Ty simd_ld(float _x, float _y, float _z, float _w);
 
+	template<typename Ty>
+	BX_SIMD_FORCE_INLINE Ty simd_ld(float _x, float _y, float _z, float _w, float _A, float _B, float _C, float _D);
+
 	template<typename Ty>
 	BX_SIMD_FORCE_INLINE Ty simd_ild(uint32_t _x, uint32_t _y, uint32_t _z, uint32_t _w);
 
+	template<typename Ty>
+	BX_SIMD_FORCE_INLINE Ty simd_ild(uint32_t _x, uint32_t _y, uint32_t _z, uint32_t _w, uint32_t _A, uint32_t _B, uint32_t _C, uint32_t _D);
+
 	template<typename Ty>
 	BX_SIMD_FORCE_INLINE Ty simd_splat(const void* _ptr);
 
@@ -352,14 +358,6 @@ BX_SIMD128_IMPLEMENT_TEST(xyzw);
 	typedef __m128 simd128_sse_t;
 #endif // BX_SIMD_SSE
 
-	union simd128_ref_t
-	{
-		float    fxyzw[4];
-		int32_t  ixyzw[4];
-		uint32_t uxyzw[4];
-
-	};
-
 } // namespace bx
 
 #if BX_SIMD_AVX
@@ -378,27 +376,50 @@ BX_SIMD128_IMPLEMENT_TEST(xyzw);
 #	include "simd128_sse.inl"
 #endif // BX_SIMD_SSE
 
-#include "simd128_ref.inl"
-#include "simd256_ref.inl"
-
 namespace bx
 {
-#if !( BX_SIMD_AVX \
-	|| BX_SIMD_LANGEXT \
+	union simd128_ref_t
+	{
+		float    fxyzw[4];
+		int32_t  ixyzw[4];
+		uint32_t uxyzw[4];
+	};
+
+#ifndef BX_SIMD_WARN_REFERENCE_IMPL
+#	define BX_SIMD_WARN_REFERENCE_IMPL 0
+#endif // BX_SIMD_WARN_REFERENCE_IMPL
+
+#if !( BX_SIMD_LANGEXT \
 	|| BX_SIMD_NEON \
 	|| BX_SIMD_SSE \
 	 )
-#	ifndef BX_SIMD_WARN_REFERENCE_IMPL
-#		define BX_SIMD_WARN_REFERENCE_IMPL 0
-#	endif // BX_SIMD_WARN_REFERENCE_IMPL
-
 #	if BX_SIMD_WARN_REFERENCE_IMPL
-#		pragma message("************************************\nUsing SIMD reference implementation!\n************************************")
+#		pragma message("*** Using SIMD128 reference implementation! ***")
 #	endif // BX_SIMD_WARN_REFERENCE_IMPL
 
 	typedef simd128_ref_t simd128_t;
 #endif //
 
+	union simd256_ref_t
+	{
+		simd128_t simd128[2];
+	};
+
+#if !BX_SIMD_AVX
+#	if BX_SIMD_WARN_REFERENCE_IMPL
+#		pragma message("*** Using SIMD256 reference implementation! ***")
+#	endif // BX_SIMD_WARN_REFERENCE_IMPL
+
+	typedef simd256_ref_t simd256_t;
+#endif // !BX_SIMD_AVX
+
+} // namespace bx
+
+#include "simd128_ref.inl"
+#include "simd256_ref.inl"
+
+namespace bx
+{
 	BX_SIMD_FORCE_INLINE simd128_t simd_zero()
 	{
 		return simd_zero<simd128_t>();

+ 9 - 0
scripts/toolchain.lua

@@ -100,6 +100,11 @@ function toolchain(_buildDir, _libDir)
 		description = "Use 32-bit compiler instead 64-bit.",
 	}
 
+	newoption {
+		trigger     = "with-avx",
+		description = "Use AVX extension.",
+	}
+
 	-- Avoid error when invoking genie --help.
 	if (_ACTION == nil) then return false end
 
@@ -460,6 +465,10 @@ function toolchain(_buildDir, _libDir)
 		flags { "StaticRuntime" }
 	end
 
+	if _OPTIONS["with-avx"] then
+		flags { "EnableAVX" }
+	end
+
 	flags {
 		"NoPCH",
 		"NativeWChar",

+ 136 - 12
tests/simd_t.cpp

@@ -12,11 +12,12 @@ using namespace bx;
 
 union simd_cast
 {
-	bx::simd128_t f4;
-	float f[4];
-	uint32_t ui[4];
-	int32_t i[4];
-	char c[16];
+	bx::simd256_t simd256;
+	bx::simd128_t simd128;
+	float    f[8];
+	uint32_t ui[8];
+	int32_t  i[8];
+	char     c[32];
 };
 
 void simd_check_bool(const char* _str, bool _a, bool _0)
@@ -30,9 +31,16 @@ void simd_check_bool(const char* _str, bool _a, bool _0)
 	CHECK_EQUAL(_a, _0);
 }
 
-void simd_check_int32(const char* _str, bx::simd128_t _a, int32_t _0, int32_t _1, int32_t _2, int32_t _3)
+void simd_check_int32(
+	  const char* _str
+	, bx::simd128_t _a
+	, int32_t _0
+	, int32_t _1
+	, int32_t _2
+	, int32_t _3
+	)
 {
-	simd_cast c; c.f4 = _a;
+	simd_cast c; c.simd128 = _a;
 	DBG("%s (%d, %d, %d, %d) == (%d, %d, %d, %d)"
 		, _str
 		, c.i[0], c.i[1], c.i[2], c.i[3]
@@ -45,9 +53,46 @@ void simd_check_int32(const char* _str, bx::simd128_t _a, int32_t _0, int32_t _1
 	CHECK_EQUAL(c.i[3], _3);
 }
 
-void simd_check_uint32(const char* _str, bx::simd128_t _a, uint32_t _0, uint32_t _1, uint32_t _2, uint32_t _3)
+void simd_check_int32(
+	  const char* _str
+	, bx::simd256_t _a
+	, int32_t _0
+	, int32_t _1
+	, int32_t _2
+	, int32_t _3
+	, int32_t _4
+	, int32_t _5
+	, int32_t _6
+	, int32_t _7
+	)
 {
-	simd_cast c; c.f4 = _a;
+	simd_cast c; c.simd256 = _a;
+	DBG("%s (%d, %d, %d, %d, %d, %d, %d, %d) == (%d, %d, %d, %d, %d, %d, %d, %d)"
+		, _str
+		, c.i[0], c.i[1], c.i[2], c.i[3], c.i[4], c.i[5], c.i[6], c.i[7]
+		, _0, _1, _2, _3, _4, _5, _6, _7
+		);
+
+	CHECK_EQUAL(c.i[0], _0);
+	CHECK_EQUAL(c.i[1], _1);
+	CHECK_EQUAL(c.i[2], _2);
+	CHECK_EQUAL(c.i[3], _3);
+	CHECK_EQUAL(c.i[4], _4);
+	CHECK_EQUAL(c.i[5], _5);
+	CHECK_EQUAL(c.i[6], _6);
+	CHECK_EQUAL(c.i[7], _7);
+}
+
+void simd_check_uint32(
+	  const char* _str
+	, bx::simd128_t _a
+	, uint32_t _0
+	, uint32_t _1
+	, uint32_t _2
+	, uint32_t _3
+	)
+{
+	simd_cast c; c.simd128 = _a;
 
 	DBG("%s (0x%08x, 0x%08x, 0x%08x, 0x%08x) == (0x%08x, 0x%08x, 0x%08x, 0x%08x)"
 		, _str
@@ -61,9 +106,47 @@ void simd_check_uint32(const char* _str, bx::simd128_t _a, uint32_t _0, uint32_t
 	CHECK_EQUAL(c.ui[3], _3);
 }
 
-void simd_check_float(const char* _str, bx::simd128_t _a, float _0, float _1, float _2, float _3)
+void simd_check_uint32(
+	  const char* _str
+	, bx::simd256_t _a
+	, uint32_t _0
+	, uint32_t _1
+	, uint32_t _2
+	, uint32_t _3
+	, uint32_t _4
+	, uint32_t _5
+	, uint32_t _6
+	, uint32_t _7
+	)
+{
+	simd_cast c; c.simd256 = _a;
+
+	DBG("%s (0x%08x, 0x%08x, 0x%08x, 0x%08x, 0x%08x, 0x%08x, 0x%08x, 0x%08x) == (0x%08x, 0x%08x, 0x%08x, 0x%08x, 0x%08x, 0x%08x, 0x%08x, 0x%08x)"
+		, _str
+		, c.ui[0], c.ui[1], c.ui[2], c.ui[3], c.ui[4], c.ui[5], c.ui[6], c.ui[7]
+		, _0, _1, _2, _3, _4, _5, _6, _7
+		);
+
+	CHECK_EQUAL(c.ui[0], _0);
+	CHECK_EQUAL(c.ui[1], _1);
+	CHECK_EQUAL(c.ui[2], _2);
+	CHECK_EQUAL(c.ui[3], _3);
+	CHECK_EQUAL(c.ui[4], _4);
+	CHECK_EQUAL(c.ui[5], _5);
+	CHECK_EQUAL(c.ui[6], _6);
+	CHECK_EQUAL(c.ui[7], _7);
+}
+
+void simd_check_float(
+	  const char* _str
+	, bx::simd128_t _a
+	, float _0
+	, float _1
+	, float _2
+	, float _3
+	)
 {
-	simd_cast c; c.f4 = _a;
+	simd_cast c; c.simd128 = _a;
 
 	DBG("%s (%f, %f, %f, %f) == (%f, %f, %f, %f)"
 		, _str
@@ -77,9 +160,40 @@ void simd_check_float(const char* _str, bx::simd128_t _a, float _0, float _1, fl
 	CHECK(bx::fequal(c.f[3], _3, 0.0001f) );
 }
 
+void simd_check_float(
+	  const char* _str
+	, bx::simd256_t _a
+	, float _0
+	, float _1
+	, float _2
+	, float _3
+	, float _4
+	, float _5
+	, float _6
+	, float _7
+	)
+{
+	simd_cast c; c.simd256 = _a;
+
+	DBG("%s (%f, %f, %f, %f, %f, %f, %f, %f) == (%f, %f, %f, %f, %f, %f, %f, %f)"
+		, _str
+		, c.f[0], c.f[1], c.f[2], c.f[3], c.f[4], c.f[5], c.f[6], c.f[7]
+		, _0, _1, _2, _3, _4, _5, _6, _7
+		);
+
+	CHECK(bx::fequal(c.f[0], _0, 0.0001f) );
+	CHECK(bx::fequal(c.f[1], _1, 0.0001f) );
+	CHECK(bx::fequal(c.f[2], _2, 0.0001f) );
+	CHECK(bx::fequal(c.f[3], _3, 0.0001f) );
+	CHECK(bx::fequal(c.f[4], _4, 0.0001f) );
+	CHECK(bx::fequal(c.f[5], _5, 0.0001f) );
+	CHECK(bx::fequal(c.f[6], _6, 0.0001f) );
+	CHECK(bx::fequal(c.f[7], _7, 0.0001f) );
+}
+
 void simd_check_string(const char* _str, bx::simd128_t _a)
 {
-	simd_cast c; c.f4 = _a;
+	simd_cast c; c.simd128 = _a;
 	const char test[5] = { c.c[0], c.c[4], c.c[8], c.c[12], '\0' };
 
 	DBG("%s %s", _str, test);
@@ -200,11 +314,21 @@ TEST(simd_load)
 		, 0.0f, 1.0f, 2.0f, 3.0f
 		);
 
+	simd_check_float("ld"
+		, simd_ld<simd256_t>(0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f)
+		, 0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f
+		);
+
 	simd_check_int32("ild"
 		, simd_ild(uint32_t(-1), 0, 1, 2)
 		, uint32_t(-1), 0, 1, 2
 		);
 
+	simd_check_int32("ild"
+		, simd_ild<simd256_t>(uint32_t(-1), 0, 1, 2, 3, 4, 5, 6)
+		, uint32_t(-1), 0, 1, 2, 3, 4, 5, 6
+		);
+
 	simd_check_int32("ild"
 		, simd_ild(uint32_t(-1), uint32_t(-2), uint32_t(-3), uint32_t(-4) )
 		, uint32_t(-1), uint32_t(-2), uint32_t(-3), uint32_t(-4)