Browse Source

core/simd/x86: Fix some intrinsics

- _mm_slli_si128 produced totally incorrect output
- _mm_storeu_si128 referred to an LLVM intrinsic that is missing
Yawning Angel 1 year ago
parent
commit
390cd3c30d
1 changed file with 19 additions and 14 deletions
  1. 19 14
      core/simd/x86/sse2.odin

+ 19 - 14
core/simd/x86/sse2.odin

@@ -144,19 +144,26 @@ _mm_subs_epu16 :: #force_inline proc "c" (a, b: __m128i) -> __m128i {
 _mm_slli_si128_impl :: #force_inline proc "c" (a: __m128i, $IMM8: u32) -> __m128i {
 	shift :: IMM8 & 0xff
 
+	// This needs to emit behavior identical to PSLLDQ which is as follows:
+	//
+	// TEMP := COUNT
+	// IF (TEMP > 15) THEN TEMP := 16; FI
+	// DEST := DEST << (TEMP * 8)
+	// DEST[MAXVL-1:128] (Unmodified)
+
 	return transmute(__m128i)simd.shuffle(
-		transmute(i8x16)a,
 		i8x16(0),
-		0  when shift > 15 else (16 - shift + 0),
-		1  when shift > 15 else (16 - shift + 1),
-		2  when shift > 15 else (16 - shift + 2),
-		3  when shift > 15 else (16 - shift + 3),
-		4  when shift > 15 else (16 - shift + 4),
-		5  when shift > 15 else (16 - shift + 5),
-		6  when shift > 15 else (16 - shift + 6),
-		7  when shift > 15 else (16 - shift + 7),
-		8  when shift > 15 else (16 - shift + 8),
-		9  when shift > 15 else (16 - shift + 9),
+		transmute(i8x16)a,
+		0 when shift > 15 else (16 - shift + 0),
+		1 when shift > 15 else (16 - shift + 1),
+		2 when shift > 15 else (16 - shift + 2),
+		3 when shift > 15 else (16 - shift + 3),
+		4 when shift > 15 else (16 - shift + 4),
+		5 when shift > 15 else (16 - shift + 5),
+		6 when shift > 15 else (16 - shift + 6),
+		7 when shift > 15 else (16 - shift + 7),
+		8 when shift > 15 else (16 - shift + 8),
+		9 when shift > 15 else (16 - shift + 9),
 		10 when shift > 15 else (16 - shift + 10),
 		11 when shift > 15 else (16 - shift + 11),
 		12 when shift > 15 else (16 - shift + 12),
@@ -435,7 +442,7 @@ _mm_store_si128 :: #force_inline proc "c" (mem_addr: ^__m128i, a: __m128i) {
 }
 @(enable_target_feature="sse2")
 _mm_storeu_si128 :: #force_inline proc "c" (mem_addr: ^__m128i, a: __m128i) {
-	storeudq(mem_addr, a)
+	intrinsics.unaligned_store(mem_addr, a)
 }
 @(enable_target_feature="sse2")
 _mm_storel_epi64 :: #force_inline proc "c" (mem_addr: ^__m128i, a: __m128i) {
@@ -1178,8 +1185,6 @@ foreign _ {
 	cvttsd2si  :: proc(a: __m128d) -> i32 ---
 	@(link_name="llvm.x86.sse2.cvttps2dq")
 	cvttps2dq  :: proc(a: __m128) -> i32x4 ---
-	@(link_name="llvm.x86.sse2.storeu.dq")
-	storeudq   :: proc(mem_addr: rawptr, a: __m128i) ---
 	@(link_name="llvm.x86.sse2.storeu.pd")
 	storeupd   :: proc(mem_addr: rawptr, a: __m128d) ---