
Vector extract using NEON/SSSE3 with SSE2/scalar fallbacks.

David Piuva, 4 years ago
commit be14efccde
3 changed files with 161 additions and 2 deletions
  1. Source/DFPSR/base/simd.h (91 additions, 1 deletion)
  2. Source/test.sh (3 additions, 1 deletion)
  3. Source/test/tests/SimdTest.cpp (67 additions)

Source/DFPSR/base/simd.h (91 additions, 1 deletion)

@@ -51,6 +51,11 @@
 		#define USE_DIRECT_SIMD_MEMORY_ACCESS
 		#include <emmintrin.h> // SSE2
 
+		#ifdef __SSSE3__
+			#include <tmmintrin.h> // SSSE3
+			// Comment out this line to test without SSSE3
+			#define USE_SSSE3
+		#endif
 		#ifdef __AVX2__
 			#include <immintrin.h> // AVX2
 			#define GATHER_U32_AVX2(SOURCE, FOUR_OFFSETS, SCALE) _mm_i32gather_epi32((const int32_t*)(SOURCE), FOUR_OFFSETS, SCALE)
@@ -1493,7 +1498,6 @@
 		#endif
 	}
 
-	// TODO: Use overloading to only name the target type
 	inline I32x4 truncateToI32(const F32x4& vector) {
 		#ifdef USE_BASIC_SIMD
 			return I32x4(F32_TO_I32_SIMD(vector.v));
@@ -1620,5 +1624,91 @@
 		#endif
 	}
 
+	// Helper macros for generating the vector extract functions.
+	//   Defining one function per type and offset ensures that the compiler receives an immediate integer within the valid range.
+	#ifdef USE_BASIC_SIMD
+		#ifdef USE_SSE2
+			#ifdef USE_SSSE3
+				// The native intrinsic is only available when compiling with "-mssse3".
+				#define _MM_ALIGNR_EPI8(A, B, OFFSET) _mm_alignr_epi8(A, B, OFFSET)
+			#else
+				// If SSSE3 is not available, emulate the intrinsic using SSE2 with stack memory and an unaligned read.
+				static inline SIMD_U8x16 _MM_ALIGNR_EPI8(SIMD_U8x16 a, SIMD_U8x16 b, int offset) {
+					ALIGN16 uint8_t vectorBuffer[32];
+					_mm_store_si128((SIMD_U8x16*)(vectorBuffer), b);
+					_mm_store_si128((SIMD_U8x16*)(vectorBuffer + 16), a);
+					return _mm_loadu_si128((SIMD_U8x16*)(vectorBuffer + offset));
+				}
+			#endif
+			#define VECTOR_EXTRACT_GENERATOR_U8(OFFSET, FALLBACK_RESULT) return U8x16(_MM_ALIGNR_EPI8(b.v, a.v, OFFSET));
+			#define VECTOR_EXTRACT_GENERATOR_U16(OFFSET, FALLBACK_RESULT) return U16x8(_MM_ALIGNR_EPI8(b.v, a.v, OFFSET * 2));
+			#define VECTOR_EXTRACT_GENERATOR_U32(OFFSET, FALLBACK_RESULT) return U32x4(_MM_ALIGNR_EPI8(b.v, a.v, OFFSET * 4));
+			#define VECTOR_EXTRACT_GENERATOR_I32(OFFSET, FALLBACK_RESULT) return I32x4(_MM_ALIGNR_EPI8(b.v, a.v, OFFSET * 4));
+			#define VECTOR_EXTRACT_GENERATOR_F32(OFFSET, FALLBACK_RESULT) return F32x4(SIMD_F32x4(_MM_ALIGNR_EPI8(SIMD_U32x4(b.v), SIMD_U32x4(a.v), OFFSET * 4)));
+		#elif defined(USE_NEON)
+			#define VECTOR_EXTRACT_GENERATOR_U8(OFFSET, FALLBACK_RESULT) return U8x16(vextq_u8(a.v, b.v, OFFSET));
+			#define VECTOR_EXTRACT_GENERATOR_U16(OFFSET, FALLBACK_RESULT) return U16x8(vextq_u16(a.v, b.v, OFFSET));
+			#define VECTOR_EXTRACT_GENERATOR_U32(OFFSET, FALLBACK_RESULT) return U32x4(vextq_u32(a.v, b.v, OFFSET));
+			#define VECTOR_EXTRACT_GENERATOR_I32(OFFSET, FALLBACK_RESULT) return I32x4(vextq_s32(a.v, b.v, OFFSET));
+			#define VECTOR_EXTRACT_GENERATOR_F32(OFFSET, FALLBACK_RESULT) return F32x4(vextq_f32(a.v, b.v, OFFSET));
+		#endif
+	#else
+		#define VECTOR_EXTRACT_GENERATOR_U8(OFFSET, FALLBACK_RESULT) return FALLBACK_RESULT;
+		#define VECTOR_EXTRACT_GENERATOR_U16(OFFSET, FALLBACK_RESULT) return FALLBACK_RESULT;
+		#define VECTOR_EXTRACT_GENERATOR_U32(OFFSET, FALLBACK_RESULT) return FALLBACK_RESULT;
+		#define VECTOR_EXTRACT_GENERATOR_I32(OFFSET, FALLBACK_RESULT) return FALLBACK_RESULT;
+		#define VECTOR_EXTRACT_GENERATOR_F32(OFFSET, FALLBACK_RESULT) return FALLBACK_RESULT;
+	#endif
+
+	// Vector extraction concatenates two input vectors and reads a vector between them at the given element offset.
+	//   The first and last offsets, which simply return one of the inputs, can be used for readability, because the compiler inlines and removes them.
+	//   To get each lane's right-side neighbor, combine the center vector with the right vector and shift one element to the left using vectorExtract_1 for the given type.
+	//   To get each lane's left-side neighbor, combine the left vector with the center vector and shift one element to the right using vectorExtract_15 for 16 lanes, vectorExtract_7 for 8 lanes, or vectorExtract_3 for 4 lanes.
+	U8x16 inline vectorExtract_0(const U8x16 &a, const U8x16 &b) { return a; }
+	U8x16 inline vectorExtract_1(const U8x16 &a, const U8x16 &b) { VECTOR_EXTRACT_GENERATOR_U8(1, U8x16(a.emulated[1], a.emulated[2], a.emulated[3], a.emulated[4], a.emulated[5], a.emulated[6], a.emulated[7], a.emulated[8], a.emulated[9], a.emulated[10], a.emulated[11], a.emulated[12], a.emulated[13], a.emulated[14], a.emulated[15], b.emulated[0])) }
+	U8x16 inline vectorExtract_2(const U8x16 &a, const U8x16 &b) { VECTOR_EXTRACT_GENERATOR_U8(2, U8x16(a.emulated[2], a.emulated[3], a.emulated[4], a.emulated[5], a.emulated[6], a.emulated[7], a.emulated[8], a.emulated[9], a.emulated[10], a.emulated[11], a.emulated[12], a.emulated[13], a.emulated[14], a.emulated[15], b.emulated[0], b.emulated[1])) }
+	U8x16 inline vectorExtract_3(const U8x16 &a, const U8x16 &b) { VECTOR_EXTRACT_GENERATOR_U8(3, U8x16(a.emulated[3], a.emulated[4], a.emulated[5], a.emulated[6], a.emulated[7], a.emulated[8], a.emulated[9], a.emulated[10], a.emulated[11], a.emulated[12], a.emulated[13], a.emulated[14], a.emulated[15], b.emulated[0], b.emulated[1], b.emulated[2])) }
+	U8x16 inline vectorExtract_4(const U8x16 &a, const U8x16 &b) { VECTOR_EXTRACT_GENERATOR_U8(4, U8x16(a.emulated[4], a.emulated[5], a.emulated[6], a.emulated[7], a.emulated[8], a.emulated[9], a.emulated[10], a.emulated[11], a.emulated[12], a.emulated[13], a.emulated[14], a.emulated[15], b.emulated[0], b.emulated[1], b.emulated[2], b.emulated[3])) }
+	U8x16 inline vectorExtract_5(const U8x16 &a, const U8x16 &b) { VECTOR_EXTRACT_GENERATOR_U8(5, U8x16(a.emulated[5], a.emulated[6], a.emulated[7], a.emulated[8], a.emulated[9], a.emulated[10], a.emulated[11], a.emulated[12], a.emulated[13], a.emulated[14], a.emulated[15], b.emulated[0], b.emulated[1], b.emulated[2], b.emulated[3], b.emulated[4])) }
+	U8x16 inline vectorExtract_6(const U8x16 &a, const U8x16 &b) { VECTOR_EXTRACT_GENERATOR_U8(6, U8x16(a.emulated[6], a.emulated[7], a.emulated[8], a.emulated[9], a.emulated[10], a.emulated[11], a.emulated[12], a.emulated[13], a.emulated[14], a.emulated[15], b.emulated[0], b.emulated[1], b.emulated[2], b.emulated[3], b.emulated[4], b.emulated[5])) }
+	U8x16 inline vectorExtract_7(const U8x16 &a, const U8x16 &b) { VECTOR_EXTRACT_GENERATOR_U8(7, U8x16(a.emulated[7], a.emulated[8], a.emulated[9], a.emulated[10], a.emulated[11], a.emulated[12], a.emulated[13], a.emulated[14], a.emulated[15], b.emulated[0], b.emulated[1], b.emulated[2], b.emulated[3], b.emulated[4], b.emulated[5], b.emulated[6])) }
+	U8x16 inline vectorExtract_8(const U8x16 &a, const U8x16 &b) { VECTOR_EXTRACT_GENERATOR_U8(8, U8x16(a.emulated[8], a.emulated[9], a.emulated[10], a.emulated[11], a.emulated[12], a.emulated[13], a.emulated[14], a.emulated[15], b.emulated[0], b.emulated[1], b.emulated[2], b.emulated[3], b.emulated[4], b.emulated[5], b.emulated[6], b.emulated[7])) }
+	U8x16 inline vectorExtract_9(const U8x16 &a, const U8x16 &b) { VECTOR_EXTRACT_GENERATOR_U8(9, U8x16(a.emulated[9], a.emulated[10], a.emulated[11], a.emulated[12], a.emulated[13], a.emulated[14], a.emulated[15], b.emulated[0], b.emulated[1], b.emulated[2], b.emulated[3], b.emulated[4], b.emulated[5], b.emulated[6], b.emulated[7], b.emulated[8])) }
+	U8x16 inline vectorExtract_10(const U8x16 &a, const U8x16 &b) { VECTOR_EXTRACT_GENERATOR_U8(10, U8x16(a.emulated[10], a.emulated[11], a.emulated[12], a.emulated[13], a.emulated[14], a.emulated[15], b.emulated[0], b.emulated[1], b.emulated[2], b.emulated[3], b.emulated[4], b.emulated[5], b.emulated[6], b.emulated[7], b.emulated[8], b.emulated[9])) }
+	U8x16 inline vectorExtract_11(const U8x16 &a, const U8x16 &b) { VECTOR_EXTRACT_GENERATOR_U8(11, U8x16(a.emulated[11], a.emulated[12], a.emulated[13], a.emulated[14], a.emulated[15], b.emulated[0], b.emulated[1], b.emulated[2], b.emulated[3], b.emulated[4], b.emulated[5], b.emulated[6], b.emulated[7], b.emulated[8], b.emulated[9], b.emulated[10])) }
+	U8x16 inline vectorExtract_12(const U8x16 &a, const U8x16 &b) { VECTOR_EXTRACT_GENERATOR_U8(12, U8x16(a.emulated[12], a.emulated[13], a.emulated[14], a.emulated[15], b.emulated[0], b.emulated[1], b.emulated[2], b.emulated[3], b.emulated[4], b.emulated[5], b.emulated[6], b.emulated[7], b.emulated[8], b.emulated[9], b.emulated[10], b.emulated[11])) }
+	U8x16 inline vectorExtract_13(const U8x16 &a, const U8x16 &b) { VECTOR_EXTRACT_GENERATOR_U8(13, U8x16(a.emulated[13], a.emulated[14], a.emulated[15], b.emulated[0], b.emulated[1], b.emulated[2], b.emulated[3], b.emulated[4], b.emulated[5], b.emulated[6], b.emulated[7], b.emulated[8], b.emulated[9], b.emulated[10], b.emulated[11], b.emulated[12])) }
+	U8x16 inline vectorExtract_14(const U8x16 &a, const U8x16 &b) { VECTOR_EXTRACT_GENERATOR_U8(14, U8x16(a.emulated[14], a.emulated[15], b.emulated[0], b.emulated[1], b.emulated[2], b.emulated[3], b.emulated[4], b.emulated[5], b.emulated[6], b.emulated[7], b.emulated[8], b.emulated[9], b.emulated[10], b.emulated[11], b.emulated[12], b.emulated[13])) }
+	U8x16 inline vectorExtract_15(const U8x16 &a, const U8x16 &b) { VECTOR_EXTRACT_GENERATOR_U8(15, U8x16(a.emulated[15], b.emulated[0], b.emulated[1], b.emulated[2], b.emulated[3], b.emulated[4], b.emulated[5], b.emulated[6], b.emulated[7], b.emulated[8], b.emulated[9], b.emulated[10], b.emulated[11], b.emulated[12], b.emulated[13], b.emulated[14])) }
+	U8x16 inline vectorExtract_16(const U8x16 &a, const U8x16 &b) { return b; }
+
+	U16x8 inline vectorExtract_0(const U16x8 &a, const U16x8 &b) { return a; }
+	U16x8 inline vectorExtract_1(const U16x8 &a, const U16x8 &b) { VECTOR_EXTRACT_GENERATOR_U16(1, U16x8(a.emulated[1], a.emulated[2], a.emulated[3], a.emulated[4], a.emulated[5], a.emulated[6], a.emulated[7], b.emulated[0])) }
+	U16x8 inline vectorExtract_2(const U16x8 &a, const U16x8 &b) { VECTOR_EXTRACT_GENERATOR_U16(2, U16x8(a.emulated[2], a.emulated[3], a.emulated[4], a.emulated[5], a.emulated[6], a.emulated[7], b.emulated[0], b.emulated[1])) }
+	U16x8 inline vectorExtract_3(const U16x8 &a, const U16x8 &b) { VECTOR_EXTRACT_GENERATOR_U16(3, U16x8(a.emulated[3], a.emulated[4], a.emulated[5], a.emulated[6], a.emulated[7], b.emulated[0], b.emulated[1], b.emulated[2])) }
+	U16x8 inline vectorExtract_4(const U16x8 &a, const U16x8 &b) { VECTOR_EXTRACT_GENERATOR_U16(4, U16x8(a.emulated[4], a.emulated[5], a.emulated[6], a.emulated[7], b.emulated[0], b.emulated[1], b.emulated[2], b.emulated[3])) }
+	U16x8 inline vectorExtract_5(const U16x8 &a, const U16x8 &b) { VECTOR_EXTRACT_GENERATOR_U16(5, U16x8(a.emulated[5], a.emulated[6], a.emulated[7], b.emulated[0], b.emulated[1], b.emulated[2], b.emulated[3], b.emulated[4])) }
+	U16x8 inline vectorExtract_6(const U16x8 &a, const U16x8 &b) { VECTOR_EXTRACT_GENERATOR_U16(6, U16x8(a.emulated[6], a.emulated[7], b.emulated[0], b.emulated[1], b.emulated[2], b.emulated[3], b.emulated[4], b.emulated[5])) }
+	U16x8 inline vectorExtract_7(const U16x8 &a, const U16x8 &b) { VECTOR_EXTRACT_GENERATOR_U16(7, U16x8(a.emulated[7], b.emulated[0], b.emulated[1], b.emulated[2], b.emulated[3], b.emulated[4], b.emulated[5], b.emulated[6])) }
+	U16x8 inline vectorExtract_8(const U16x8 &a, const U16x8 &b) { return b; }
+
+	U32x4 inline vectorExtract_0(const U32x4 &a, const U32x4 &b) { return a; }
+	U32x4 inline vectorExtract_1(const U32x4 &a, const U32x4 &b) { VECTOR_EXTRACT_GENERATOR_U32(1, U32x4(a.emulated[1], a.emulated[2], a.emulated[3], b.emulated[0])) }
+	U32x4 inline vectorExtract_2(const U32x4 &a, const U32x4 &b) { VECTOR_EXTRACT_GENERATOR_U32(2, U32x4(a.emulated[2], a.emulated[3], b.emulated[0], b.emulated[1])) }
+	U32x4 inline vectorExtract_3(const U32x4 &a, const U32x4 &b) { VECTOR_EXTRACT_GENERATOR_U32(3, U32x4(a.emulated[3], b.emulated[0], b.emulated[1], b.emulated[2])) }
+	U32x4 inline vectorExtract_4(const U32x4 &a, const U32x4 &b) { return b; }
+
+	I32x4 inline vectorExtract_0(const I32x4 &a, const I32x4 &b) { return a; }
+	I32x4 inline vectorExtract_1(const I32x4 &a, const I32x4 &b) { VECTOR_EXTRACT_GENERATOR_I32(1, I32x4(a.emulated[1], a.emulated[2], a.emulated[3], b.emulated[0])) }
+	I32x4 inline vectorExtract_2(const I32x4 &a, const I32x4 &b) { VECTOR_EXTRACT_GENERATOR_I32(2, I32x4(a.emulated[2], a.emulated[3], b.emulated[0], b.emulated[1])) }
+	I32x4 inline vectorExtract_3(const I32x4 &a, const I32x4 &b) { VECTOR_EXTRACT_GENERATOR_I32(3, I32x4(a.emulated[3], b.emulated[0], b.emulated[1], b.emulated[2])) }
+	I32x4 inline vectorExtract_4(const I32x4 &a, const I32x4 &b) { return b; }
+
+	F32x4 inline vectorExtract_0(const F32x4 &a, const F32x4 &b) { return a; }
+	F32x4 inline vectorExtract_1(const F32x4 &a, const F32x4 &b) { VECTOR_EXTRACT_GENERATOR_F32(1, F32x4(a.emulated[1], a.emulated[2], a.emulated[3], b.emulated[0])) }
+	F32x4 inline vectorExtract_2(const F32x4 &a, const F32x4 &b) { VECTOR_EXTRACT_GENERATOR_F32(2, F32x4(a.emulated[2], a.emulated[3], b.emulated[0], b.emulated[1])) }
+	F32x4 inline vectorExtract_3(const F32x4 &a, const F32x4 &b) { VECTOR_EXTRACT_GENERATOR_F32(3, F32x4(a.emulated[3], b.emulated[0], b.emulated[1], b.emulated[2])) }
+	F32x4 inline vectorExtract_4(const F32x4 &a, const F32x4 &b) { return b; }
+
 #endif
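
The new vectorExtract family is the building block for neighbor access described in the comment block above. A minimal usage sketch, assuming the library header is on the include path (the include path and the helper name getNeighbors are illustrative, not part of this commit):

	#include "DFPSR/base/simd.h"

	// For a row processed in U32x4 chunks, fetch each lane's left and right
	// neighbor from the adjacent chunks, following the comment in simd.h:
	// shift one element to the right via vectorExtract_3 (4 lanes) for the
	// left side, and one element to the left via vectorExtract_1 for the right side.
	static void getNeighbors(const U32x4 &left, const U32x4 &center, const U32x4 &right,
	                         U32x4 &leftNeighbor, U32x4 &rightNeighbor) {
		leftNeighbor = vectorExtract_3(left, center);   // left[3], center[0..2]
		rightNeighbor = vectorExtract_1(center, right); // center[1..3], right[0]
	}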
 

Source/test.sh (3 additions, 1 deletion)

@@ -5,6 +5,8 @@ TEMP_ROOT=${ROOT_PATH}/../../temporary
 CPP_VERSION=-std=c++14
 MODE=-DDEBUG
 O_LEVEL=-O2
+SIMD_FLAGS=""
+#SIMD_FLAGS="-msse2 -mssse3 -mavx2"
 
 chmod +x ${ROOT_PATH}/tools/build.sh;
 ${ROOT_PATH}/tools/build.sh "NONE" "NONE" "${ROOT_PATH}" "${TEMP_ROOT}" "NONE" "${MODE} ${CPP_VERSION} ${O_LEVEL}";
@@ -30,7 +32,7 @@ for file in ./test/tests/*.cpp; do
 	rm -f ${TEMP_DIR}/application;
 	# Compile test case that defines main
 	echo "Compiling ${name}";
-	g++ ${CPP_VERSION} ${MODE} -c ${file} -o ${TEMP_DIR}/${base}_test.o;
+	g++ ${CPP_VERSION} ${MODE} ${SIMD_FLAGS} -c ${file} -o ${TEMP_DIR}/${base}_test.o;
 	# Linking with frameworks
 	echo "Linking ${name}";
 	g++ ${TEMP_DIR}/*.o ${TEMP_DIR}/*.a -lm -pthread -o ${TEMP_DIR}/application;
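
The new SIMD_FLAGS variable controls which instruction sets the compiler may emit for the test binaries. With the empty default on x86-64, where SSE2 is baseline, the non-SSSE3 emulation of _MM_ALIGNR_EPI8 gets exercised; uncommenting the second assignment enables the SSSE3 and AVX2 paths. A minimal standalone probe (the file name check_simd_flags.cpp is hypothetical, not part of this commit) prints which predefined compiler macros the flags turn on, the same macros simd.h checks (e.g. __SSSE3__ before defining USE_SSSE3):

	// check_simd_flags.cpp - hypothetical probe, not part of this commit.
	// Build with e.g. "g++ -std=c++14 -mssse3 check_simd_flags.cpp -o probe".
	#include <cstdio>
	int main() {
		#ifdef __SSE2__
			std::printf("SSE2 enabled\n");
		#endif
		#ifdef __SSSE3__
			std::printf("SSSE3 enabled\n");
		#endif
		#ifdef __AVX2__
			std::printf("AVX2 enabled\n");
		#endif
		#ifdef __ARM_NEON
			std::printf("NEON enabled\n");
		#endif
		return 0;
	}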

Source/test/tests/SimdTest.cpp (67 additions)

@@ -3,6 +3,20 @@
 #include "../../DFPSR/base/simdExtra.h"
 
 START_TEST(Simd)
+	printText("\nSIMD test is compiled using:\n");
+	#ifdef USE_SSE2
+		printText("	* SSE2\n");
+	#endif
+	#ifdef USE_SSSE3
+		printText("	* SSSE3\n");
+	#endif
+	#ifdef USE_AVX2
+		printText("	* AVX2\n");
+	#endif
+	#ifdef USE_NEON
+		printText("	* NEON\n");
+	#endif
+
 	// F32x4 Comparisons
 	ASSERT_EQUAL(F32x4(1.5f), F32x4(1.5f, 1.5f, 1.5f, 1.5f));
 	ASSERT_EQUAL(F32x4(-1.5f), F32x4(-1.5f, -1.5f, -1.5f, -1.5f));
@@ -255,6 +269,59 @@ START_TEST(Simd)
 	ASSERT_EQUAL(U32x4(0x0AB12CD0, 0xFFFFFFFF, 0x12345678, 0xF0000000) << 4, U32x4(0xAB12CD00, 0xFFFFFFF0, 0x23456780, 0x00000000));
 	ASSERT_EQUAL(U32x4(0x0AB12CD0, 0xFFFFFFFF, 0x12345678, 0x0000000F) >> 4, U32x4(0x00AB12CD, 0x0FFFFFFF, 0x01234567, 0x00000000));
 
+	// Element shift with insert
+	ASSERT_EQUAL(vectorExtract_0(U32x4(1, 2, 3, 4), U32x4(5, 6, 7, 8)), U32x4(1, 2, 3, 4));
+	ASSERT_EQUAL(vectorExtract_1(U32x4(1, 2, 3, 4), U32x4(5, 6, 7, 8)), U32x4(2, 3, 4, 5));
+	ASSERT_EQUAL(vectorExtract_2(U32x4(1, 2, 3, 4), U32x4(5, 6, 7, 8)), U32x4(3, 4, 5, 6));
+	ASSERT_EQUAL(vectorExtract_3(U32x4(1, 2, 3, 4), U32x4(5, 6, 7, 8)), U32x4(4, 5, 6, 7));
+	ASSERT_EQUAL(vectorExtract_4(U32x4(1, 2, 3, 4), U32x4(5, 6, 7, 8)), U32x4(5, 6, 7, 8));
+	ASSERT_EQUAL(vectorExtract_0(U32x4(123, 4294967295, 712, 45), U32x4(850514, 27, 0, 174)), U32x4(123, 4294967295, 712, 45));
+	ASSERT_EQUAL(vectorExtract_1(U32x4(123, 4294967295, 712, 45), U32x4(850514, 27, 0, 174)), U32x4(4294967295, 712, 45, 850514));
+	ASSERT_EQUAL(vectorExtract_2(U32x4(123, 4294967295, 712, 45), U32x4(850514, 27, 0, 174)), U32x4(712, 45, 850514, 27));
+	ASSERT_EQUAL(vectorExtract_3(U32x4(123, 4294967295, 712, 45), U32x4(850514, 27, 0, 174)), U32x4(45, 850514, 27, 0));
+	ASSERT_EQUAL(vectorExtract_4(U32x4(123, 4294967295, 712, 45), U32x4(850514, 27, 0, 174)), U32x4(850514, 27, 0, 174));
+	ASSERT_EQUAL(vectorExtract_0(I32x4(1, 2, 3, 4), I32x4(5, 6, 7, 8)), I32x4(1, 2, 3, 4));
+	ASSERT_EQUAL(vectorExtract_1(I32x4(1, 2, 3, 4), I32x4(5, 6, 7, 8)), I32x4(2, 3, 4, 5));
+	ASSERT_EQUAL(vectorExtract_2(I32x4(1, 2, 3, 4), I32x4(5, 6, 7, 8)), I32x4(3, 4, 5, 6));
+	ASSERT_EQUAL(vectorExtract_3(I32x4(1, 2, 3, 4), I32x4(5, 6, 7, 8)), I32x4(4, 5, 6, 7));
+	ASSERT_EQUAL(vectorExtract_4(I32x4(1, 2, 3, 4), I32x4(5, 6, 7, 8)), I32x4(5, 6, 7, 8));
+	ASSERT_EQUAL(vectorExtract_0(I32x4(123, 8462784, -712, 45), I32x4(-37562, 27, 0, 174)), I32x4(123, 8462784, -712, 45));
+	ASSERT_EQUAL(vectorExtract_1(I32x4(123, 8462784, -712, 45), I32x4(-37562, 27, 0, 174)), I32x4(8462784, -712, 45, -37562));
+	ASSERT_EQUAL(vectorExtract_2(I32x4(123, 8462784, -712, 45), I32x4(-37562, 27, 0, 174)), I32x4(-712, 45, -37562, 27));
+	ASSERT_EQUAL(vectorExtract_3(I32x4(123, 8462784, -712, 45), I32x4(-37562, 27, 0, 174)), I32x4(45, -37562, 27, 0));
+	ASSERT_EQUAL(vectorExtract_4(I32x4(123, 8462784, -712, 45), I32x4(-37562, 27, 0, 174)), I32x4(-37562, 27, 0, 174));
+	ASSERT_EQUAL(vectorExtract_0(F32x4(1.0f, -2.0f, 3.0f, -4.0f), F32x4(5.0f, 6.0f, -7.0f, 8.0f)), F32x4(1.0f, -2.0f, 3.0f, -4.0f));
+	ASSERT_EQUAL(vectorExtract_1(F32x4(1.0f, -2.0f, 3.0f, -4.0f), F32x4(5.0f, 6.0f, -7.0f, 8.0f)), F32x4(-2.0f, 3.0f, -4.0f, 5.0f));
+	ASSERT_EQUAL(vectorExtract_2(F32x4(1.0f, -2.0f, 3.0f, -4.0f), F32x4(5.0f, 6.0f, -7.0f, 8.0f)), F32x4(3.0f, -4.0f, 5.0f, 6.0f));
+	ASSERT_EQUAL(vectorExtract_3(F32x4(1.0f, -2.0f, 3.0f, -4.0f), F32x4(5.0f, 6.0f, -7.0f, 8.0f)), F32x4(-4.0f, 5.0f, 6.0f, -7.0f));
+	ASSERT_EQUAL(vectorExtract_4(F32x4(1.0f, -2.0f, 3.0f, -4.0f), F32x4(5.0f, 6.0f, -7.0f, 8.0f)), F32x4(5.0f, 6.0f, -7.0f, 8.0f));
+	ASSERT_EQUAL(vectorExtract_0(U16x8(1, 2, 3, 4, 5, 6, 7, 8), U16x8(9, 10, 11, 12, 13, 14, 15, 16)), U16x8(1, 2, 3, 4, 5, 6, 7, 8));
+	ASSERT_EQUAL(vectorExtract_1(U16x8(1, 2, 3, 4, 5, 6, 7, 8), U16x8(9, 10, 11, 12, 13, 14, 15, 16)), U16x8(2, 3, 4, 5, 6, 7, 8, 9));
+	ASSERT_EQUAL(vectorExtract_2(U16x8(1, 2, 3, 4, 5, 6, 7, 8), U16x8(9, 10, 11, 12, 13, 14, 15, 16)), U16x8(3, 4, 5, 6, 7, 8, 9, 10));
+	ASSERT_EQUAL(vectorExtract_3(U16x8(1, 2, 3, 4, 5, 6, 7, 8), U16x8(9, 10, 11, 12, 13, 14, 15, 16)), U16x8(4, 5, 6, 7, 8, 9, 10, 11));
+	ASSERT_EQUAL(vectorExtract_4(U16x8(1, 2, 3, 4, 5, 6, 7, 8), U16x8(9, 10, 11, 12, 13, 14, 15, 16)), U16x8(5, 6, 7, 8, 9, 10, 11, 12));
+	ASSERT_EQUAL(vectorExtract_5(U16x8(1, 2, 3, 4, 5, 6, 7, 8), U16x8(9, 10, 11, 12, 13, 14, 15, 16)), U16x8(6, 7, 8, 9, 10, 11, 12, 13));
+	ASSERT_EQUAL(vectorExtract_6(U16x8(1, 2, 3, 4, 5, 6, 7, 8), U16x8(9, 10, 11, 12, 13, 14, 15, 16)), U16x8(7, 8, 9, 10, 11, 12, 13, 14));
+	ASSERT_EQUAL(vectorExtract_7(U16x8(1, 2, 3, 4, 5, 6, 7, 8), U16x8(9, 10, 11, 12, 13, 14, 15, 16)), U16x8(8, 9, 10, 11, 12, 13, 14, 15));
+	ASSERT_EQUAL(vectorExtract_8(U16x8(1, 2, 3, 4, 5, 6, 7, 8), U16x8(9, 10, 11, 12, 13, 14, 15, 16)), U16x8(9, 10, 11, 12, 13, 14, 15, 16));
+	ASSERT_EQUAL(vectorExtract_0(U8x16(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16), U8x16(17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32)), U8x16(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16));
+	ASSERT_EQUAL(vectorExtract_1(U8x16(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16), U8x16(17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32)), U8x16(2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17));
+	ASSERT_EQUAL(vectorExtract_2(U8x16(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16), U8x16(17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32)), U8x16(3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18));
+	ASSERT_EQUAL(vectorExtract_3(U8x16(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16), U8x16(17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32)), U8x16(4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19));
+	ASSERT_EQUAL(vectorExtract_4(U8x16(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16), U8x16(17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32)), U8x16(5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20));
+	ASSERT_EQUAL(vectorExtract_5(U8x16(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16), U8x16(17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32)), U8x16(6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21));
+	ASSERT_EQUAL(vectorExtract_6(U8x16(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16), U8x16(17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32)), U8x16(7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22));
+	ASSERT_EQUAL(vectorExtract_7(U8x16(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16), U8x16(17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32)), U8x16(8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23));
+	ASSERT_EQUAL(vectorExtract_8(U8x16(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16), U8x16(17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32)), U8x16(9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24));
+	ASSERT_EQUAL(vectorExtract_9(U8x16(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16), U8x16(17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32)), U8x16(10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25));
+	ASSERT_EQUAL(vectorExtract_10(U8x16(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16), U8x16(17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32)), U8x16(11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26));
+	ASSERT_EQUAL(vectorExtract_11(U8x16(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16), U8x16(17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32)), U8x16(12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27));
+	ASSERT_EQUAL(vectorExtract_12(U8x16(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16), U8x16(17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32)), U8x16(13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28));
+	ASSERT_EQUAL(vectorExtract_13(U8x16(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16), U8x16(17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32)), U8x16(14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29));
+	ASSERT_EQUAL(vectorExtract_14(U8x16(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16), U8x16(17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32)), U8x16(15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30));
+	ASSERT_EQUAL(vectorExtract_15(U8x16(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16), U8x16(17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32)), U8x16(16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31));
+	ASSERT_EQUAL(vectorExtract_16(U8x16(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16), U8x16(17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32)), U8x16(17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32));
+
 	#ifdef USE_SIMD_EXTRA
 		SIMD_U32x4 a = U32x4(1, 2, 3, 4).v;
 		SIMD_U32x4 b = U32x4(5, 6, 7, 8).v;
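
Every expected value in the vectorExtract assertions above follows one scalar rule: concatenate a and b, then read consecutive lanes starting at the offset. This is also exactly what the non-SSSE3 fallback of _MM_ALIGNR_EPI8 in simd.h does with two aligned stores and one unaligned load. A plain scalar sketch of that rule for the 4-lane case (illustrative reference code, not part of the commit):

	// Scalar reference for 4-lane vector extraction (illustrative only):
	// combined = [a0 a1 a2 a3 b0 b1 b2 b3], result = combined[offset .. offset+3].
	#include <cstdint>
	#include <cstring>

	static void referenceExtract4(const uint32_t a[4], const uint32_t b[4],
	                              int offset, uint32_t result[4]) {
		uint32_t combined[8];
		std::memcpy(combined, a, sizeof(uint32_t) * 4);     // low half:  a
		std::memcpy(combined + 4, b, sizeof(uint32_t) * 4); // high half: b
		for (int i = 0; i < 4; i++) {
			result[i] = combined[offset + i]; // offset must be in 0..4
		}
	}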