
Simplify and make `simd_util` cross-platform

This new algorithm uses a Scalar->Vector->Scalar iteration loop, which
requires no masking of incomplete data chunks.

The scan width was also reduced from 64 bytes to 32, as I found this to
be about as fast as the previous 64-byte x86 version.
Feoramund committed 1 year ago (commit 12dd0cb72a)
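
As a readability aid, here is the Scalar->Vector->Scalar shape of the new forward scan collected into one self-contained sketch. This is simplified from the diff below, not the committed code: the package and procedure names are made up, and the lane-index extraction via `simd_select`/`simd_reduce_min` is replaced by a plain re-scan of the matching chunk.

package scan_sketch

import "base:intrinsics"

SCAN_WIDTH :: 32

// Simplified mirror of simd_util.index_byte: a scalar head until the pointer
// is SCAN_WIDTH-aligned, aligned vector loads over the middle, and a scalar
// tail for whatever remains, so no chunk ever needs padding or masking.
index_byte_sketch :: proc(data: []u8, c: byte) -> int {
	length := len(data)
	i := 0

	// Small inputs never reach the vector loop.
	if length < SCAN_WIDTH {
		for /**/; i < length; i += 1 {
			if data[i] == c {
				return i
			}
		}
		return -1
	}

	ptr := cast(int)cast(uintptr)raw_data(data)
	alignment_start := (SCAN_WIDTH - ptr % SCAN_WIDTH) % SCAN_WIDTH

	// Scalar head: walk forward to the first SCAN_WIDTH-aligned address.
	for /**/; i < alignment_start; i += 1 {
		if data[i] == c {
			return i
		}
	}

	// Vector body: every load here is aligned and exactly SCAN_WIDTH bytes.
	scanner: #simd[SCAN_WIDTH]u8 = c
	tail := length - (length - alignment_start) % SCAN_WIDTH

	for /**/; i < tail; i += SCAN_WIDTH {
		load := (cast(^#simd[SCAN_WIDTH]u8)(&data[i]))^
		comparison := intrinsics.simd_lanes_eq(load, scanner)
		if intrinsics.simd_reduce_or(comparison) > 0 {
			// The committed code extracts the matching lane with simd_select
			// and simd_reduce_min; a scalar re-scan of this single chunk is
			// the simplest equivalent for illustration.
			for j in 0..<SCAN_WIDTH {
				if data[i + j] == c {
					return i + j
				}
			}
		}
	}

	// Scalar tail: the unaligned remainder after the last full chunk.
	for /**/; i < length; i += 1 {
		if data[i] == c {
			return i
		}
	}

	return -1
}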

core/bytes/bytes.odin (+4 -14)

@@ -309,14 +309,8 @@ index_byte :: proc(s: []byte, c: byte) -> int {
 	// NOTE(Feoramund): On my Alder Lake CPU, I have only witnessed a
 	// significant speedup when compiling in either Size or Speed mode.
 	// The SIMD version is usually 2-3x slower without optimizations on.
-	when ODIN_OPTIMIZATION_MODE > .Minimal && intrinsics.has_target_feature("sse2") {
-		// SIMD's benefits are noticeable only past a certain threshold of data.
-		// For small data, use the plain old algorithm.
-		if len(s) >= simd_util.RECOMMENDED_SCAN_SIZE {
-			return simd_util.index_byte(s, c)
-		} else {
-			return _index_byte(s, c)
-		}
+	when ODIN_OPTIMIZATION_MODE > .Minimal {
+		return #force_inline simd_util.index_byte(s, c)
 	} else {
 		return _index_byte(s, c)
 	}
@@ -333,12 +327,8 @@ last_index_byte :: proc(s: []byte, c: byte) -> int {
 		return -1
 	}
 
-	when ODIN_OPTIMIZATION_MODE > .Minimal && intrinsics.has_target_feature("sse2") {
-		if len(s) >= simd_util.RECOMMENDED_SCAN_SIZE {
-			return simd_util.last_index_byte(s, c)
-		} else {
-			return _last_index_byte(s, c)
-		}
+	when ODIN_OPTIMIZATION_MODE > .Minimal {
+		return #force_inline simd_util.last_index_byte(s, c)
 	} else {
 		return _last_index_byte(s, c)
 	}

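The call-site change above does not affect behavior, only dispatch: with optimizations above Minimal, every input now goes through the cross-platform `simd_util` path instead of only SSE2-capable targets past a size threshold. A minimal usage sketch (the `example` package and the expected outputs in the comments are mine, not part of the commit):

package example

import "core:bytes"
import "core:fmt"

main :: proc() {
	s := "hello, world"
	// Compiled with -o:size or -o:speed (ODIN_OPTIMIZATION_MODE > .Minimal),
	// these calls take the simd_util path on any target; in other modes they
	// fall back to the scalar _index_byte, exactly as before.
	fmt.println(bytes.index_byte(transmute([]u8)s, 'w')) // 7
	fmt.println(bytes.index_byte(transmute([]u8)s, 'z')) // -1
}
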
core/simd/util/util.odin (+94 -122)

@@ -8,26 +8,24 @@
 
 // package simd_util implements compositions of SIMD operations for optimizing
 // the core library where available.
-
-//+build i386, amd64
 package simd_util
 
 import "base:intrinsics"
-import "core:simd/x86"
 
-@private SCAN_REGISTER_SIZE :: 16
-@private SCAN_REGISTERS     :: 4
-@private SCAN_WIDTH         :: SCAN_REGISTERS * SCAN_REGISTER_SIZE
+@private SCAN_WIDTH :: 32
 
-// How long should a string be before using any of the `index_*` procedures in
-// this package.
-RECOMMENDED_SCAN_SIZE :: SCAN_REGISTER_SIZE
+@(private, rodata)
+simd_scanner_indices := #simd[SCAN_WIDTH]u8 {
+	 0,  1,  2,  3,  4,  5,  6,  7,
+	 8,  9, 10, 11, 12, 13, 14, 15,
+	16, 17, 18, 19, 20, 21, 22, 23,
+	24, 25, 26, 27, 28, 29, 30, 31,
+}
 
 /*
 Scan a slice of bytes for a specific byte.
 
-This procedure safely handles padding out slices of any length, including empty
-slices.
+This procedure safely handles slices of any length, including empty slices.
 
 Inputs:
 - data: A slice of bytes.
@@ -36,83 +34,54 @@ Inputs:
 Returns:
 - index: The index of the byte `c`, or -1 if it was not found.
 */
-@(enable_target_feature="sse2")
 index_byte :: proc(data: []u8, c: byte) -> (index: int) #no_bounds_check {
-	scanner_data: [SCAN_REGISTER_SIZE]u8 = c
-	scanner := intrinsics.unaligned_load(cast(^x86.__m128i)&scanner_data[0])
-
-	i: int
 	length := len(data)
-	full_chunks_length := length - length % SCAN_WIDTH
-
-	for /**/; i < full_chunks_length; i += SCAN_WIDTH {
-		simd_load := intrinsics.unaligned_load(cast(^[SCAN_REGISTERS]x86.__m128i)&data[i])
-
-		#unroll for j in 0..<SCAN_REGISTERS {
-			cmp := x86._mm_cmpeq_epi8(simd_load[j], scanner)
-			mask := x86._mm_movemask_epi8(cmp)
-
-			// NOTE(Feoramund): I experimented with ORing all the masks onto a
-			// 128-bit integer before performing the `mask != 0` check to see
-			// if that might be faster. However, the cost to avoid 3
-			// compares resulted in a marginally slower runtime on my machine.
-			//
-			// Simpler won out here.
-			if mask != 0 {
-				ctz := intrinsics.count_trailing_zeros(mask)
-				return i + j * SCAN_REGISTER_SIZE + cast(int)ctz
+	i := 0
+
+	// Guard against small strings.
+	if length < SCAN_WIDTH {
+		for /**/; i < length; i += 1 {
+			if data[i] == c {
+				return i
 			}
 		}
+		return -1
 	}
 
-	if i < length {
-		// The data is not exactly divisible by SCAN_WIDTH, and we haven't found
-		// what we're looking for yet, so we must pad out the end, then run our
-		// algorithm on it.
-		padded_data_end: [SCAN_WIDTH]u8 = ---
-		remnant_length := length % SCAN_WIDTH
-		intrinsics.mem_copy_non_overlapping(
-			&padded_data_end[0],
-			&raw_data(data)[full_chunks_length],
-			remnant_length,
-		)
-
-		simd_load := intrinsics.unaligned_load(cast(^[SCAN_REGISTERS]x86.__m128i)&padded_data_end[0])
-
-		#unroll for j in 0..<SCAN_REGISTERS {
-			cmp := x86._mm_cmpeq_epi8(simd_load[j], scanner)
-			mask := x86._mm_movemask_epi8(cmp)
-
-			// Because this data is padded out, it's possible that we could
-			// match on uninitialized memory, so we must guard against that.
-
-			// Create a relevancy mask: (Example)
-			//
-			//    max(u64)        = 0xFFFF_FFFF_FFFF_FFFF
-			//
-			//  Convert an integer into a stream of on-bits by using the
-			//  shifted negation of the maximum. The subtraction selects which
-			//  section of the overall mask we should apply.
-			//
-			//                   << 17 - (1 * SCAN_REGISTER_SIZE)
-			//                    = 0xFFFF_FFFF_FFFF_FFFE
-			//
-			submask := max(u64) << u64(remnant_length - (j * SCAN_REGISTER_SIZE))
-			//
-			//    ~submask        = 0x0000_0000_0000_0001
-			//    (submask >> 63) = 0x0000_0000_0000_0001
-			//
-			//  The multiplication is a guard against zero.
-			//
-			submask = ~submask * (submask >> 63)
-			//
-			//  Finally, mask out any irrelevant bits with the submask.
-			mask &= i32(submask)
-
-			if mask != 0 {
-				ctz := int(intrinsics.count_trailing_zeros(mask))
-				return i + j * SCAN_REGISTER_SIZE + ctz
-			}
+	ptr := cast(int)cast(uintptr)raw_data(data)
+
+	alignment_start := (SCAN_WIDTH - ptr % SCAN_WIDTH) % SCAN_WIDTH
+
+	// Iterate as a scalar until the data is aligned on a `SCAN_WIDTH` boundary.
+	//
+	// This way, every load in the vector loop will be aligned, which should be
+	// the fastest possible scenario.
+	for /**/; i < alignment_start; i += 1 {
+		if data[i] == c {
+			return i
+		}
+	}
+
+	// Iterate as a vector over every aligned chunk, evaluating each byte simultaneously at the CPU level.
+	scanner: #simd[SCAN_WIDTH]u8 = c
+	tail := length - (length - alignment_start) % SCAN_WIDTH
+
+	for /**/; i < tail; i += SCAN_WIDTH {
+		load := (cast(^#simd[SCAN_WIDTH]u8)(&data[i]))^
+		comparison := intrinsics.simd_lanes_eq(load, scanner)
+		match := intrinsics.simd_reduce_or(comparison)
+		if match > 0 {
+			sentinel: #simd[SCAN_WIDTH]u8 = u8(0xFF)
+			index_select := intrinsics.simd_select(comparison, simd_scanner_indices, sentinel)
+			index_reduce := intrinsics.simd_reduce_min(index_select)
+			return i + cast(int)index_reduce
+		}
+	}
+	
+	// Iterate as a scalar over the remaining unaligned portion.
+	for /**/; i < length; i += 1 {
+		if data[i] == c {
+			return i
 		}
 	}
 
@@ -123,8 +92,7 @@ index_byte :: proc(data: []u8, c: byte) -> (index: int) #no_bounds_check {
 Scan a slice of bytes for a specific byte, starting from the end and working
 backwards to the start.
 
-This procedure safely handles padding out slices of any length, including empty
-slices.
+This procedure safely handles slices of any length, including empty slices.
 
 Inputs:
 - data: A slice of bytes.
@@ -133,54 +101,58 @@ Inputs:
 Returns:
 - index: The index of the byte `c`, or -1 if it was not found.
 */
-@(enable_target_feature="sse2")
 last_index_byte :: proc(data: []u8, c: byte) -> int #no_bounds_check {
-	scanner_data: [SCAN_REGISTER_SIZE]u8 = c
-	scanner := intrinsics.unaligned_load(cast(^x86.__m128i)&scanner_data[0])
-
-	i := len(data) - SCAN_WIDTH
-
-	for /**/; i >= 0; i -= SCAN_WIDTH {
-		simd_load := intrinsics.unaligned_load(cast(^[SCAN_REGISTERS]x86.__m128i)&data[i])
-
-		// There is no #reverse #unroll at the time of this writing, so we use
-		// `j` to count down by subtraction.
-		#unroll for j in 1..=SCAN_REGISTERS {
-			cmp := x86._mm_cmpeq_epi8(simd_load[SCAN_REGISTERS-j], scanner)
-			mask := x86._mm_movemask_epi8(cmp)
+	length := len(data)
+	i := length - 1
 
-			if mask != 0 {
-				// CLZ is used instead to get the on-bit from the other end.
-				clz := (8 * size_of(mask) - 1) - int(intrinsics.count_leading_zeros(mask))
-				return i + SCAN_WIDTH - j * SCAN_REGISTER_SIZE + clz
+	// Guard against small strings.
+	if length < SCAN_WIDTH {
+		for /**/; i >= 0; i -= 1 {
+			if data[i] == c {
+				return i
 			}
 		}
+		return -1
 	}
 
-	if i < 0 {
-		padded_data_end: [SCAN_WIDTH]u8 = ---
-		remnant_length := len(data) % SCAN_WIDTH
-		intrinsics.mem_copy_non_overlapping(
-			&padded_data_end[0],
-			&raw_data(data)[0],
-			remnant_length,
-		)
-
-		simd_load := intrinsics.unaligned_load(cast(^[SCAN_REGISTERS]x86.__m128i)&padded_data_end[0])
+	ptr := cast(int)cast(uintptr)raw_data(data)
 
-		#unroll for j in 1..=SCAN_REGISTERS {
-			cmp := x86._mm_cmpeq_epi8(simd_load[SCAN_REGISTERS-j], scanner)
-			mask := x86._mm_movemask_epi8(cmp)
+	tail := length - (ptr + length) % SCAN_WIDTH
 
-			submask := max(u64) << u64(remnant_length - (SCAN_REGISTERS-j) * SCAN_REGISTER_SIZE)
-			submask = ~submask * (submask >> 63)
+	// Iterate as a scalar until the data is aligned on a `SCAN_WIDTH` boundary.
+	//
+	// This way, every load in the vector loop will be aligned, which should be
+	// the fastest possible scenario.
+	for /**/; i >= tail; i -= 1 {
+		if data[i] == c {
+			return i
+		}
+	}
 
-			mask &= i32(submask)
+	// Iterate as a vector over every aligned chunk, evaluating each byte simultaneously at the CPU level.
+	scanner: #simd[SCAN_WIDTH]u8 = c
+	alignment_start := (SCAN_WIDTH - ptr % SCAN_WIDTH) % SCAN_WIDTH
+
+	i -= SCAN_WIDTH - 1
+
+	for /**/; i >= alignment_start; i -= SCAN_WIDTH {
+		load := (cast(^#simd[SCAN_WIDTH]u8)(&data[i]))^
+		comparison := intrinsics.simd_lanes_eq(load, scanner)
+		match := intrinsics.simd_reduce_or(comparison)
+		if match > 0 {
+			sentinel: #simd[SCAN_WIDTH]u8
+			index_select := intrinsics.simd_select(comparison, simd_scanner_indices, sentinel)
+			index_reduce := intrinsics.simd_reduce_max(index_select)
+			return i + cast(int)index_reduce
+		}
+	}
 
-			if mask != 0 {
-				clz := (8 * size_of(mask) - 1) - int(intrinsics.count_leading_zeros(mask))
-				return SCAN_WIDTH - j * SCAN_REGISTER_SIZE + clz
-			}
+	// Iterate as a scalar over the remaining unaligned portion.
+	i += SCAN_WIDTH - 1
+	
+	for /**/; i >= 0; i -= 1 {
+		if data[i] == c {
+			return i
 		}
 	}
 

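To make the head/body/tail split concrete, here is a worked example of the arithmetic in the forward scan, using made-up numbers (a 100-byte slice whose base address happens to end in 0x05):

	ptr % SCAN_WIDTH = 0x1005 % 32           = 5
	alignment_start  = (32 - 5) % 32         = 27
	tail             = 100 - (100 - 27) % 32 = 100 - 9 = 91

	scalar head : indices [0, 27)    -> 27 unaligned bytes
	vector body : indices [27, 91)   -> two aligned 32-byte loads
	scalar tail : indices [91, 100)  -> 9 leftover bytes

The three regions cover the slice exactly once, which is why no partial chunk ever needs to be padded out or masked.
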
core/strings/strings.odin (+4 -14)

@@ -1438,14 +1438,8 @@ index_byte :: proc(s: string, c: byte) -> (res: int) {
 	// NOTE(Feoramund): On my Alder Lake CPU, I have only witnessed a
 	// significant speedup when compiling in either Size or Speed mode.
 	// The SIMD version is usually 2-3x slower without optimizations on.
-	when ODIN_OPTIMIZATION_MODE > .Minimal && intrinsics.has_target_feature("sse2") {
-		// SIMD's benefits are noticeable only past a certain threshold of data.
-		// For small data, use the plain old algorithm.
-		if len(s) >= simd_util.RECOMMENDED_SCAN_SIZE {
-			return simd_util.index_byte(transmute([]u8)s, c)
-		} else {
-			return _index_byte(s, c)
-		}
+	when ODIN_OPTIMIZATION_MODE > .Minimal {
+		return #force_inline simd_util.index_byte(transmute([]u8)s, c)
 	} else {
 		return _index_byte(s, c)
 	}
@@ -1492,12 +1486,8 @@ last_index_byte :: proc(s: string, c: byte) -> (res: int) {
 		return -1
 	}
 
-	when ODIN_OPTIMIZATION_MODE > .Minimal && intrinsics.has_target_feature("sse2") {
-		if len(s) >= simd_util.RECOMMENDED_SCAN_SIZE {
-			return simd_util.last_index_byte(transmute([]u8)s, c)
-		} else {
-			return _last_index_byte(s, c)
-		}
+	when ODIN_OPTIMIZATION_MODE > .Minimal {
+		return #force_inline simd_util.last_index_byte(transmute([]u8)s, c)
 	} else {
 		return _last_index_byte(s, c)
 	}

tests/benchmark/simd/util/benchmark_simd_util.odin (+0 -1)

@@ -1,4 +1,3 @@
-//+build i386, amd64
 package benchmark_simd_util
 
 import "core:fmt"

tests/core/simd/util/test_core_simd_util.odin (+0 -1)

@@ -1,4 +1,3 @@
-//+build i386, amd64
 package test_core_simd_util
 
 import simd_util "core:simd/util"