
Merge pull request #4101 from Yawning/feature/index-byte-tweaks

core/bytes: Tweak `index_byte`/`last_index_byte`
Jeroen van Rijn committed 1 year ago
parent commit 3fc1f6f1cd
3 changed files with 233 additions and 89 deletions
  1. core/bytes/bytes.odin  +223 -89
  2. core/simd/simd.odin  +7 -0
  3. tests/benchmark/bytes/benchmark_bytes.odin  +3 -0
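
For context, a minimal usage sketch of the two procedures this change touches; the input string and the expected indices in the comments are illustrative, not taken from the commit:

package example

import "core:bytes"
import "core:fmt"

main :: proc() {
	s := "hello, world"
	data := transmute([]byte)s

	// First and last occurrence of a byte; both return -1 when absent.
	fmt.println(bytes.index_byte(data, 'o'))      // 4
	fmt.println(bytes.last_index_byte(data, 'o')) // 8
	fmt.println(bytes.index_byte(data, 'z'))      // -1
}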

+ 223 - 89
core/bytes/bytes.odin

@@ -2,36 +2,36 @@ package bytes
 
 import "base:intrinsics"
 import "core:mem"
+import "core:simd"
 import "core:unicode"
 import "core:unicode/utf8"
 
-
-@private SIMD_SCAN_WIDTH :: 8 * size_of(uintptr)
-
-when SIMD_SCAN_WIDTH == 32 {
-	@(private, rodata)
-	simd_scanner_indices := #simd[SIMD_SCAN_WIDTH]u8 {
-		 0,  1,  2,  3,  4,  5,  6,  7,
-		 8,  9, 10, 11, 12, 13, 14, 15,
+when ODIN_ARCH == .amd64 && intrinsics.has_target_feature("avx2") {
+	@(private)
+	SCANNER_INDICES_256 : simd.u8x32 : {
+		0,  1,  2,  3,  4,  5,  6,  7,
+		8,  9, 10, 11, 12, 13, 14, 15,
 		16, 17, 18, 19, 20, 21, 22, 23,
 		24, 25, 26, 27, 28, 29, 30, 31,
 	}
-} else when SIMD_SCAN_WIDTH == 64 {
-	@(private, rodata)
-	simd_scanner_indices := #simd[SIMD_SCAN_WIDTH]u8 {
-		 0,  1,  2,  3,  4,  5,  6,  7,
-		 8,  9, 10, 11, 12, 13, 14, 15,
-		16, 17, 18, 19, 20, 21, 22, 23,
-		24, 25, 26, 27, 28, 29, 30, 31,
-		32, 33, 34, 35, 36, 37, 38, 39,
-		40, 41, 42, 43, 44, 45, 46, 47,
-		48, 49, 50, 51, 52, 53, 54, 55,
-		56, 57, 58, 59, 60, 61, 62, 63,
-	}
-} else {
-	#panic("Invalid SIMD_SCAN_WIDTH. Must be 32 or 64.")
-}
-
+	@(private)
+	SCANNER_SENTINEL_MAX_256: simd.u8x32 : u8(0x00)
+	@(private)
+	SCANNER_SENTINEL_MIN_256: simd.u8x32 : u8(0xff)
+	@(private)
+	SIMD_REG_SIZE_256 :: 32
+}
+@(private)
+SCANNER_INDICES_128 : simd.u8x16 : {
+	0,  1,  2,  3,  4,  5,  6,  7,
+	8,  9, 10, 11, 12, 13, 14, 15,
+}
+@(private)
+SCANNER_SENTINEL_MAX_128: simd.u8x16 : u8(0x00)
+@(private)
+SCANNER_SENTINEL_MIN_128: simd.u8x16 : u8(0xff)
+@(private)
+SIMD_REG_SIZE_128 :: 16
 
 clone :: proc(s: []byte, allocator := context.allocator, loc := #caller_location) -> []byte {
 	c := make([]byte, len(s), allocator, loc)
@@ -335,12 +335,13 @@ Returns:
 - index: The index of the byte `c`, or -1 if it was not found.
 */
 index_byte :: proc(s: []byte, c: byte) -> (index: int) #no_bounds_check {
-	length := len(s)
-	i := 0
+	i, l := 0, len(s)
 
-	// Guard against small strings.
-	if length < SIMD_SCAN_WIDTH {
-		for /**/; i < length; i += 1 {
+	// Guard against small strings.  On modern systems, it is ALWAYS
+	// worth vectorizing assuming there is a hardware vector unit, and
+	// the data size is large enough.
+	if l < SIMD_REG_SIZE_128 {
+		for /**/; i < l; i += 1 {
 			if s[i] == c {
 				return i
 			}
@@ -348,38 +349,105 @@ index_byte :: proc(s: []byte, c: byte) -> (index: int) #no_bounds_check {
 		return -1
 	}
 
-	ptr := int(uintptr(raw_data(s)))
+	c_vec: simd.u8x16 = c
+	when !simd.IS_EMULATED {
+		// Note: While this is something that could also logically take
+		// advantage of AVX512, the various downclocking and power
+		// consumption related woes make it premature to have a dedicated
+		// code path.
+		when ODIN_ARCH == .amd64 && intrinsics.has_target_feature("avx2") {
+			c_vec_256: simd.u8x32 = c
+
+			s_vecs: [4]simd.u8x32 = ---
+			c_vecs: [4]simd.u8x32 = ---
+			m_vec: [4]u8 = ---
+
+			// Scan 128-byte chunks, using 256-bit SIMD.
+			for nr_blocks := l / (4 * SIMD_REG_SIZE_256); nr_blocks > 0; nr_blocks -= 1 {
+				#unroll for j in 0..<4 {
+					s_vecs[j] = intrinsics.unaligned_load(cast(^simd.u8x32)raw_data(s[i+j*SIMD_REG_SIZE_256:]))
+					c_vecs[j] = simd.lanes_eq(s_vecs[j], c_vec_256)
+					m_vec[j] = simd.reduce_or(c_vecs[j])
+				}
+				if m_vec[0] | m_vec[1] | m_vec[2] | m_vec[3] > 0 {
+					#unroll for j in 0..<4 {
+						if m_vec[j] > 0 {
+							sel := simd.select(c_vecs[j], SCANNER_INDICES_256, SCANNER_SENTINEL_MIN_256)
+							off := simd.reduce_min(sel)
+							return i + j * SIMD_REG_SIZE_256 + int(off)
+						}
+					}
+				}
 
-	alignment_start := (SIMD_SCAN_WIDTH - ptr % SIMD_SCAN_WIDTH) % SIMD_SCAN_WIDTH
+				i += 4 * SIMD_REG_SIZE_256
+			}
 
-	// Iterate as a scalar until the data is aligned on a `SIMD_SCAN_WIDTH` boundary.
-	//
-	// This way, every load in the vector loop will be aligned, which should be
-	// the fastest possible scenario.
-	for /**/; i < alignment_start; i += 1 {
-		if s[i] == c {
-			return i
+			// Scan 64-byte chunks, using 256-bit SIMD.
+			for nr_blocks := (l - i) / (2 * SIMD_REG_SIZE_256); nr_blocks > 0; nr_blocks -= 1 {
+				#unroll for j in 0..<2 {
+					s_vecs[j] = intrinsics.unaligned_load(cast(^simd.u8x32)raw_data(s[i+j*SIMD_REG_SIZE_256:]))
+					c_vecs[j] = simd.lanes_eq(s_vecs[j], c_vec_256)
+					m_vec[j] = simd.reduce_or(c_vecs[j])
+				}
+				if m_vec[0] | m_vec[1] > 0 {
+					#unroll for j in 0..<2 {
+						if m_vec[j] > 0 {
+							sel := simd.select(c_vecs[j], SCANNER_INDICES_256, SCANNER_SENTINEL_MIN_256)
+							off := simd.reduce_min(sel)
+							return i + j * SIMD_REG_SIZE_256 + int(off)
+						}
+					}
+				}
+
+				i += 2 * SIMD_REG_SIZE_256
+			}
+		} else {
+			s_vecs: [4]simd.u8x16 = ---
+			c_vecs: [4]simd.u8x16 = ---
+			m_vecs: [4]u8 = ---
+
+			// Scan 64-byte chunks, using 128-bit SIMD.
+			for nr_blocks := l / (4 * SIMD_REG_SIZE_128); nr_blocks > 0; nr_blocks -= 1 {
+				#unroll for j in 0..<4 {
+					s_vecs[j] = intrinsics.unaligned_load(cast(^simd.u8x16)raw_data(s[i+j*SIMD_REG_SIZE_128:]))
+					c_vecs[j] = simd.lanes_eq(s_vecs[j], c_vec)
+					m_vecs[j] = simd.reduce_or(c_vecs[j])
+				}
+				if m_vecs[0] | m_vecs[1] | m_vecs[2] | m_vecs[3] > 0 {
+					#unroll for j in 0..<4 {
+						if m_vecs[j] > 0 {
+							sel := simd.select(c_vecs[j], SCANNER_INDICES_128, SCANNER_SENTINEL_MIN_128)
+							off := simd.reduce_min(sel)
+							return i + j * SIMD_REG_SIZE_128 + int(off)
+						}
+					}
+				}
+
+				i += 4 * SIMD_REG_SIZE_128
+			}
 		}
 	}
 
-	// Iterate as a vector over every aligned chunk, evaluating each byte simultaneously at the CPU level.
-	scanner: #simd[SIMD_SCAN_WIDTH]u8 = c
-	tail := length - (length - alignment_start) % SIMD_SCAN_WIDTH
-
-	for /**/; i < tail; i += SIMD_SCAN_WIDTH {
-		load := (^#simd[SIMD_SCAN_WIDTH]u8)(&s[i])^
-		comparison := intrinsics.simd_lanes_eq(load, scanner)
-		match := intrinsics.simd_reduce_or(comparison)
-		if match > 0 {
-			sentinel: #simd[SIMD_SCAN_WIDTH]u8 = u8(0xFF)
-			index_select := intrinsics.simd_select(comparison, simd_scanner_indices, sentinel)
-			index_reduce := intrinsics.simd_reduce_min(index_select)
-			return i + int(index_reduce)
+	// Scan the remaining SIMD register sized chunks.
+	//
+	// Apparently LLVM does ok with 128-bit SWAR, so this path is also taken
+	// on potato targets.  Scanning more at a time when LLVM is emulating SIMD
+	// likely does not buy much, as all that does is increase GP register
+	// pressure.
+	for nr_blocks := (l - i) / SIMD_REG_SIZE_128; nr_blocks > 0; nr_blocks -= 1 {
+		s0 := intrinsics.unaligned_load(cast(^simd.u8x16)raw_data(s[i:]))
+		c0 := simd.lanes_eq(s0, c_vec)
+		if simd.reduce_or(c0) > 0 {
+			sel := simd.select(c0, SCANNER_INDICES_128, SCANNER_SENTINEL_MIN_128)
+			off := simd.reduce_min(sel)
+			return i + int(off)
 		}
+
+		i += SIMD_REG_SIZE_128
 	}
 
-	// Iterate as a scalar over the remaining unaligned portion.
-	for /**/; i < length; i += 1 {
+	// Scan serially for the remainder.
+	for /**/; i < l; i += 1 {
 		if s[i] == c {
 			return i
 		}
@@ -402,55 +470,122 @@ Returns:
 - index: The index of the byte `c`, or -1 if it was not found.
 */
 last_index_byte :: proc(s: []byte, c: byte) -> int #no_bounds_check {
-	length := len(s)
-	i := length - 1
-
-	// Guard against small strings.
-	if length < SIMD_SCAN_WIDTH {
-		for /**/; i >= 0; i -= 1 {
-			if s[i] == c {
-				return i
+	i := len(s)
+
+	// Guard against small strings.  On modern systems, it is ALWAYS
+	// worth vectorizing assuming there is a hardware vector unit, and
+	// the data size is large enough.
+	if i < SIMD_REG_SIZE_128 {
+		if i > 0 { // Handle s == nil.
+			for i -= 1; i >= 0; i -= 1 {
+				if s[i] == c {
+					return i
+				}
 			}
 		}
 		return -1
 	}
 
-	ptr := int(uintptr(raw_data(s)))
+	c_vec: simd.u8x16 = c
+	when !simd.IS_EMULATED {
+		// Note: While this is something that could also logically take
+		// advantage of AVX512, the various downclocking and power
+		// consumption related woes make it premature to have a dedicated
+		// code path.
+		when ODIN_ARCH == .amd64 && intrinsics.has_target_feature("avx2") {
+			c_vec_256: simd.u8x32 = c
 
-	tail := length - (ptr + length) % SIMD_SCAN_WIDTH
+			s_vecs: [4]simd.u8x32 = ---
+			c_vecs: [4]simd.u8x32 = ---
+			m_vec: [4]u8 = ---
 
-	// Iterate as a scalar until the data is aligned on a `SIMD_SCAN_WIDTH` boundary.
-	//
-	// This way, every load in the vector loop will be aligned, which should be
-	// the fastest possible scenario.
-	for /**/; i >= tail; i -= 1 {
-		if s[i] == c {
-			return i
-		}
-	}
+			// Scan 128-byte chunks, using 256-bit SIMD.
+			for i >= 4 * SIMD_REG_SIZE_256 {
+				i -= 4 * SIMD_REG_SIZE_256
 
-	// Iterate as a vector over every aligned chunk, evaluating each byte simultaneously at the CPU level.
-	scanner: #simd[SIMD_SCAN_WIDTH]u8 = c
-	alignment_start := (SIMD_SCAN_WIDTH - ptr % SIMD_SCAN_WIDTH) % SIMD_SCAN_WIDTH
+				#unroll for j in 0..<4 {
+					s_vecs[j] = intrinsics.unaligned_load(cast(^simd.u8x32)raw_data(s[i+j*SIMD_REG_SIZE_256:]))
+					c_vecs[j] = simd.lanes_eq(s_vecs[j], c_vec_256)
+					m_vec[j] = simd.reduce_or(c_vecs[j])
+				}
+				if m_vec[0] | m_vec[1] | m_vec[2] | m_vec[3] > 0 {
+					#unroll for j in 0..<4 {
+						if m_vec[3-j] > 0 {
+							sel := simd.select(c_vecs[3-j], SCANNER_INDICES_256, SCANNER_SENTINEL_MAX_256)
+							off := simd.reduce_max(sel)
+							return i + (3-j) * SIMD_REG_SIZE_256 + int(off)
+						}
+					}
+				}
+			}
 
-	i -= SIMD_SCAN_WIDTH - 1
+			// Scan 64-byte chunks, using 256-bit SIMD.
+			for i >= 2 * SIMD_REG_SIZE_256 {
+				i -= 2 * SIMD_REG_SIZE_256
 
-	for /**/; i >= alignment_start; i -= SIMD_SCAN_WIDTH {
-		load := (^#simd[SIMD_SCAN_WIDTH]u8)(&s[i])^
-		comparison := intrinsics.simd_lanes_eq(load, scanner)
-		match := intrinsics.simd_reduce_or(comparison)
-		if match > 0 {
-			sentinel: #simd[SIMD_SCAN_WIDTH]u8
-			index_select := intrinsics.simd_select(comparison, simd_scanner_indices, sentinel)
-			index_reduce := intrinsics.simd_reduce_max(index_select)
-			return i + int(index_reduce)
+				#unroll for j in 0..<2 {
+					s_vecs[j] = intrinsics.unaligned_load(cast(^simd.u8x32)raw_data(s[i+j*SIMD_REG_SIZE_256:]))
+					c_vecs[j] = simd.lanes_eq(s_vecs[j], c_vec_256)
+					m_vec[j] = simd.reduce_or(c_vecs[j])
+				}
+				if m_vec[0] | m_vec[1] > 0 {
+					#unroll for j in 0..<2 {
+						if m_vec[1-j] > 0 {
+							sel := simd.select(c_vecs[1-j], SCANNER_INDICES_256, SCANNER_SENTINEL_MAX_256)
+							off := simd.reduce_max(sel)
+							return i + (1-j) * SIMD_REG_SIZE_256 + int(off)
+						}
+					}
+				}
+			}
+		} else {
+			s_vecs: [4]simd.u8x16 = ---
+			c_vecs: [4]simd.u8x16 = ---
+			m_vecs: [4]u8 = ---
+
+			// Scan 64-byte chunks, using 128-bit SIMD.
+			for i >= 4 * SIMD_REG_SIZE_128 {
+				i -= 4 * SIMD_REG_SIZE_128
+
+				#unroll for j in 0..<4 {
+					s_vecs[j] = intrinsics.unaligned_load(cast(^simd.u8x16)raw_data(s[i+j*SIMD_REG_SIZE_128:]))
+					c_vecs[j] = simd.lanes_eq(s_vecs[j], c_vec)
+					m_vecs[j] = simd.reduce_or(c_vecs[j])
+				}
+				if m_vecs[0] | m_vecs[1] | m_vecs[2] | m_vecs[3] > 0 {
+					#unroll for j in 0..<4 {
+						if m_vecs[3-j] > 0 {
+							sel := simd.select(c_vecs[3-j], SCANNER_INDICES_128, SCANNER_SENTINEL_MAX_128)
+							off := simd.reduce_max(sel)
+							return i + (3-j) * SIMD_REG_SIZE_128 + int(off)
+						}
+					}
+				}
+			}
 		}
 	}
 
-	// Iterate as a scalar over the remaining unaligned portion.
-	i += SIMD_SCAN_WIDTH - 1
+	// Scan the remaining SIMD register sized chunks.
+	//
+	// Apparently LLVM does ok with 128-bit SWAR, so this path is also taken
+	// on potato targets.  Scanning more at a time when LLVM is emulating SIMD
+	// likely does not buy much, as all that does is increase GP register
+	// pressure.
+	for i >= SIMD_REG_SIZE_128 {
+		i -= SIMD_REG_SIZE_128
+
+		s0 := intrinsics.unaligned_load(cast(^simd.u8x16)raw_data(s[i:]))
+		c0 := simd.lanes_eq(s0, c_vec)
+		if simd.reduce_or(c0) > 0 {
+			sel := simd.select(c0, SCANNER_INDICES_128, SCANNER_SENTINEL_MAX_128)
+			off := simd.reduce_max(sel)
+			return i + int(off)
+		}
+	}
 
-	for /**/; i >= 0; i -= 1 {
+	// Scan serially for the remainder.
+	for i > 0 {
+		i -= 1
 		if s[i] == c {
 			return i
 		}
@@ -460,7 +595,6 @@ last_index_byte :: proc(s: []byte, c: byte) -> int #no_bounds_check {
 }
 
 
-
 @private PRIME_RABIN_KARP :: 16777619
 
 index :: proc(s, substr: []byte) -> int {
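
The first-match trick used throughout the new scanner can be summarized in isolation. Below is a rough 128-bit sketch (the proc name is hypothetical; the constants mirror SCANNER_INDICES_128 and SCANNER_SENTINEL_MIN_128 above): lanes equal to the needle are replaced by their lane index, all other lanes by 0xff, and the minimum across lanes is then the offset of the first hit. last_index_byte does the mirror image with the 0x00 sentinel and reduce_max.

package example

import "core:simd"

INDICES_128 : simd.u8x16 : {
	0,  1,  2,  3,  4,  5,  6,  7,
	8,  9, 10, 11, 12, 13, 14, 15,
}
SENTINEL_MIN_128 : simd.u8x16 : u8(0xff)

// first_match_in_chunk returns the offset of the first lane in `chunk`
// equal to `c`, or -1 if no lane matches.
first_match_in_chunk :: proc(chunk: simd.u8x16, c: byte) -> int {
	c_vec: simd.u8x16 = c
	eq := simd.lanes_eq(chunk, c_vec)
	if simd.reduce_or(eq) == 0 {
		return -1
	}
	// Matching lanes keep their lane index, every other lane becomes 0xff,
	// so the minimum across lanes is the offset of the first match.
	sel := simd.select(eq, INDICES_128, SENTINEL_MIN_128)
	return int(simd.reduce_min(sel))
}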

+ 7 - 0
core/simd/simd.odin

@@ -3,6 +3,13 @@ package simd
 import "base:builtin"
 import "base:intrinsics"
 
+// IS_EMULATED is true iff the compile-time target lacks hardware support
+// for at least 128-bit SIMD.
+IS_EMULATED :: true when (ODIN_ARCH == .amd64 || ODIN_ARCH == .i386) && !intrinsics.has_target_feature("sse2") else
+	true when (ODIN_ARCH == .arm64 || ODIN_ARCH == .arm32) && !intrinsics.has_target_feature("neon") else
+	true when (ODIN_ARCH == .wasm64p32 || ODIN_ARCH == .wasm32) && !intrinsics.has_target_feature("simd128") else
+	false
+
 // 128-bit vector aliases
 u8x16 :: #simd[16]u8
 i8x16 :: #simd[16]i8
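
A small, hypothetical example of how a caller might consume the new constant; everything here other than simd.IS_EMULATED is made up for illustration:

package example

import "core:fmt"
import "core:simd"

// Hypothetical block size for a streaming scanner: bigger blocks only pay
// off when the target has real vector hardware behind core:simd.
SCAN_BLOCK :: 4096 when !simd.IS_EMULATED else 256

main :: proc() {
	// Both values are resolved entirely at compile time.
	fmt.println("emulated:", simd.IS_EMULATED, "block size:", SCAN_BLOCK)
}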

+ 3 - 0
tests/benchmark/bytes/benchmark_bytes.odin

@@ -13,9 +13,12 @@ RUNS_PER_SIZE :: 2500
 sizes := [?]int {
 	15, 16, 17,
 	31, 32, 33,
+	63, 64, 65,
+	128,
 	256,
 	512,
 	1024,
+	4096,
 	1024 * 1024,
 	// 1024 * 1024 * 1024,
 }
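
For anyone wanting an informal reproduction, a minimal self-contained timing loop over the same sizes (a sketch only, not the harness in this file; it assumes core:time's tick API and puts the needle at the end of the buffer as a worst case):

package example

import "core:bytes"
import "core:fmt"
import "core:time"

RUNS_PER_SIZE :: 2500

sizes := [?]int{15, 16, 17, 31, 32, 33, 63, 64, 65, 128, 256, 512, 1024, 4096}

main :: proc() {
	for size in sizes {
		data := make([]byte, size)
		defer delete(data)
		data[size - 1] = 'x' // Worst case: the needle is at the very end.

		start := time.tick_now()
		for _ in 0..<RUNS_PER_SIZE {
			_ = bytes.index_byte(data, 'x')
		}
		fmt.printf("size %d: %v\n", size, time.tick_since(start))
	}
}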