Răsfoiți Sursa

Added saturated subtraction to simd.h

David Piuva 3 ani în urmă
părinte
comite
cb3806314d
2 a modificat fișierele cu 66 adăugiri și 36 ștergeri
  1. 60 36
      Source/DFPSR/base/simd.h
  2. 6 0
      Source/test/tests/SimdTest.cpp

+ 60 - 36
Source/DFPSR/base/simd.h

@@ -137,6 +137,7 @@
 		#define ADD_U8_SIMD(A, B) _mm_add_epi8(A, B)
 		#define ADD_U8_SIMD(A, B) _mm_add_epi8(A, B)
 		#define ADD_SAT_U8_SIMD(A, B) _mm_adds_epu8(A, B) // Saturated addition
 		#define ADD_SAT_U8_SIMD(A, B) _mm_adds_epu8(A, B) // Saturated addition
 		#define SUB_U8_SIMD(A, B) _mm_sub_epi8(A, B)
 		#define SUB_U8_SIMD(A, B) _mm_sub_epi8(A, B)
+		#define SUB_SAT_U8_SIMD(A, B) _mm_subs_epu8(A, B) // Saturated subtraction
 		// No 8-bit multiplications
 		// No 8-bit multiplications
 
 
 		// Statistics
 		// Statistics
@@ -247,6 +248,7 @@
 		#define ADD_U8_SIMD(A, B) vaddq_u8(A, B)
 		#define ADD_U8_SIMD(A, B) vaddq_u8(A, B)
 		#define ADD_SAT_U8_SIMD(A, B) vqaddq_u8(A, B) // Saturated addition
 		#define ADD_SAT_U8_SIMD(A, B) vqaddq_u8(A, B) // Saturated addition
 		#define SUB_U8_SIMD(A, B) vsubq_u8(A, B)
 		#define SUB_U8_SIMD(A, B) vsubq_u8(A, B)
+		#define SUB_SAT_U8_SIMD(A, B) vqsubq_u8(A, B) // Saturated subtraction (A - B, clamped at zero)
 		// No 8-bit multiplications
 		// No 8-bit multiplications
 
 
 		// Statistics
 		// Statistics
@@ -1482,31 +1484,53 @@
 			);
 			);
 		#endif
 		#endif
 	}
 	}
-	inline uint8_t saturateToU8(uint32_t x) {
-		// No need to check lower bound for unsigned input
-		return x > 255 ? 255 : x;
-	}
+	inline uint8_t impl_limit0(int32_t x) { return x < 0 ? 0 : x; }
+	inline uint8_t impl_limit255(uint32_t x) { return x > 255 ? 255 : x; }
 	inline U8x16 saturatedAddition(const U8x16& left, const U8x16& right) {
 	inline U8x16 saturatedAddition(const U8x16& left, const U8x16& right) {
 		#ifdef USE_BASIC_SIMD
 		#ifdef USE_BASIC_SIMD
 			return U8x16(ADD_SAT_U8_SIMD(left.v, right.v));
 			return U8x16(ADD_SAT_U8_SIMD(left.v, right.v));
 		#else
 		#else
 			return U8x16(
 			return U8x16(
-			  saturateToU8((uint32_t)left.emulated[0] + (uint32_t)right.emulated[0]),
-			  saturateToU8((uint32_t)left.emulated[1] + (uint32_t)right.emulated[1]),
-			  saturateToU8((uint32_t)left.emulated[2] + (uint32_t)right.emulated[2]),
-			  saturateToU8((uint32_t)left.emulated[3] + (uint32_t)right.emulated[3]),
-			  saturateToU8((uint32_t)left.emulated[4] + (uint32_t)right.emulated[4]),
-			  saturateToU8((uint32_t)left.emulated[5] + (uint32_t)right.emulated[5]),
-			  saturateToU8((uint32_t)left.emulated[6] + (uint32_t)right.emulated[6]),
-			  saturateToU8((uint32_t)left.emulated[7] + (uint32_t)right.emulated[7]),
-			  saturateToU8((uint32_t)left.emulated[8] + (uint32_t)right.emulated[8]),
-			  saturateToU8((uint32_t)left.emulated[9] + (uint32_t)right.emulated[9]),
-			  saturateToU8((uint32_t)left.emulated[10] + (uint32_t)right.emulated[10]),
-			  saturateToU8((uint32_t)left.emulated[11] + (uint32_t)right.emulated[11]),
-			  saturateToU8((uint32_t)left.emulated[12] + (uint32_t)right.emulated[12]),
-			  saturateToU8((uint32_t)left.emulated[13] + (uint32_t)right.emulated[13]),
-			  saturateToU8((uint32_t)left.emulated[14] + (uint32_t)right.emulated[14]),
-			  saturateToU8((uint32_t)left.emulated[15] + (uint32_t)right.emulated[15])
+			  impl_limit255((uint32_t)left.emulated[0] + (uint32_t)right.emulated[0]),
+			  impl_limit255((uint32_t)left.emulated[1] + (uint32_t)right.emulated[1]),
+			  impl_limit255((uint32_t)left.emulated[2] + (uint32_t)right.emulated[2]),
+			  impl_limit255((uint32_t)left.emulated[3] + (uint32_t)right.emulated[3]),
+			  impl_limit255((uint32_t)left.emulated[4] + (uint32_t)right.emulated[4]),
+			  impl_limit255((uint32_t)left.emulated[5] + (uint32_t)right.emulated[5]),
+			  impl_limit255((uint32_t)left.emulated[6] + (uint32_t)right.emulated[6]),
+			  impl_limit255((uint32_t)left.emulated[7] + (uint32_t)right.emulated[7]),
+			  impl_limit255((uint32_t)left.emulated[8] + (uint32_t)right.emulated[8]),
+			  impl_limit255((uint32_t)left.emulated[9] + (uint32_t)right.emulated[9]),
+			  impl_limit255((uint32_t)left.emulated[10] + (uint32_t)right.emulated[10]),
+			  impl_limit255((uint32_t)left.emulated[11] + (uint32_t)right.emulated[11]),
+			  impl_limit255((uint32_t)left.emulated[12] + (uint32_t)right.emulated[12]),
+			  impl_limit255((uint32_t)left.emulated[13] + (uint32_t)right.emulated[13]),
+			  impl_limit255((uint32_t)left.emulated[14] + (uint32_t)right.emulated[14]),
+			  impl_limit255((uint32_t)left.emulated[15] + (uint32_t)right.emulated[15])
+			);
+		#endif
+	}
+	inline U8x16 saturatedSubtraction(const U8x16& left, const U8x16& right) {
+		#ifdef USE_BASIC_SIMD
+			return U8x16(SUB_SAT_U8_SIMD(left.v, right.v));
+		#else
+			return U8x16(
+			  impl_limit0((int32_t)left.emulated[0] - (int32_t)right.emulated[0]),
+			  impl_limit0((int32_t)left.emulated[1] - (int32_t)right.emulated[1]),
+			  impl_limit0((int32_t)left.emulated[2] - (int32_t)right.emulated[2]),
+			  impl_limit0((int32_t)left.emulated[3] - (int32_t)right.emulated[3]),
+			  impl_limit0((int32_t)left.emulated[4] - (int32_t)right.emulated[4]),
+			  impl_limit0((int32_t)left.emulated[5] - (int32_t)right.emulated[5]),
+			  impl_limit0((int32_t)left.emulated[6] - (int32_t)right.emulated[6]),
+			  impl_limit0((int32_t)left.emulated[7] - (int32_t)right.emulated[7]),
+			  impl_limit0((int32_t)left.emulated[8] - (int32_t)right.emulated[8]),
+			  impl_limit0((int32_t)left.emulated[9] - (int32_t)right.emulated[9]),
+			  impl_limit0((int32_t)left.emulated[10] - (int32_t)right.emulated[10]),
+			  impl_limit0((int32_t)left.emulated[11] - (int32_t)right.emulated[11]),
+			  impl_limit0((int32_t)left.emulated[12] - (int32_t)right.emulated[12]),
+			  impl_limit0((int32_t)left.emulated[13] - (int32_t)right.emulated[13]),
+			  impl_limit0((int32_t)left.emulated[14] - (int32_t)right.emulated[14]),
+			  impl_limit0((int32_t)left.emulated[15] - (int32_t)right.emulated[15])
 			);
 			);
 		#endif
 		#endif
 	}
 	}
@@ -1617,22 +1641,22 @@
 			return U8x16(PACK_SAT_U16_TO_U8(lower.v, upper.v));
 			return U8x16(PACK_SAT_U16_TO_U8(lower.v, upper.v));
 		#else
 		#else
 			return U8x16(
 			return U8x16(
-			  saturateToU8(lower.emulated[0]),
-			  saturateToU8(lower.emulated[1]),
-			  saturateToU8(lower.emulated[2]),
-			  saturateToU8(lower.emulated[3]),
-			  saturateToU8(lower.emulated[4]),
-			  saturateToU8(lower.emulated[5]),
-			  saturateToU8(lower.emulated[6]),
-			  saturateToU8(lower.emulated[7]),
-			  saturateToU8(upper.emulated[0]),
-			  saturateToU8(upper.emulated[1]),
-			  saturateToU8(upper.emulated[2]),
-			  saturateToU8(upper.emulated[3]),
-			  saturateToU8(upper.emulated[4]),
-			  saturateToU8(upper.emulated[5]),
-			  saturateToU8(upper.emulated[6]),
-			  saturateToU8(upper.emulated[7])
+			  impl_limit255(lower.emulated[0]),
+			  impl_limit255(lower.emulated[1]),
+			  impl_limit255(lower.emulated[2]),
+			  impl_limit255(lower.emulated[3]),
+			  impl_limit255(lower.emulated[4]),
+			  impl_limit255(lower.emulated[5]),
+			  impl_limit255(lower.emulated[6]),
+			  impl_limit255(lower.emulated[7]),
+			  impl_limit255(upper.emulated[0]),
+			  impl_limit255(upper.emulated[1]),
+			  impl_limit255(upper.emulated[2]),
+			  impl_limit255(upper.emulated[3]),
+			  impl_limit255(upper.emulated[4]),
+			  impl_limit255(upper.emulated[5]),
+			  impl_limit255(upper.emulated[6]),
+			  impl_limit255(upper.emulated[7])
 			);
 			);
 		#endif
 		#endif
 	}
 	}

+ 6 - 0
Source/test/tests/SimdTest.cpp

@@ -203,6 +203,12 @@ START_TEST(Simd)
 	  saturatedAddition(U8x16(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 255), U8x16((uint8_t)250)),
 	  saturatedAddition(U8x16(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 255), U8x16((uint8_t)250)),
 	  U8x16(251, 252, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255)
 	  U8x16(251, 252, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255)
 	);
 	);
+	ASSERT_EQUAL(
+	  saturatedSubtraction(
+	  U8x16(128, 128, 128, 0, 255, 255,   0, 200, 123, 80, 46, 46, 46, 255, 255, 255),
+	  U8x16(  0, 128, 255, 0, 255,   0, 255, 100,  23, 81, 45, 46, 47, 128, 127, 200)),
+	  U8x16(128,   0,   0, 0,   0, 255,   0, 100, 100,  0,  1,  0,  0, 127, 128,  55)
+	);
 
 
 	// Saturated unsigned integer packing
 	// Saturated unsigned integer packing
 	ASSERT_EQUAL(saturateToU8(U16x8(1, 2, 3, 4, 65535, 6, 7, 8), U16x8(9, 10, 11, 12, 1000, 14, 15, 16)), U8x16(1, 2, 3, 4, 255, 6, 7, 8, 9, 10, 11, 12, 255, 14, 15, 16));
 	ASSERT_EQUAL(saturateToU8(U16x8(1, 2, 3, 4, 65535, 6, 7, 8), U16x8(9, 10, 11, 12, 1000, 14, 15, 16)), U8x16(1, 2, 3, 4, 255, 6, 7, 8, 9, 10, 11, 12, 255, 14, 15, 16));