пре 1 година · c69fa87d53
--- a/core/bytes/bytes.odin
+++ b/core/bytes/bytes.odin
@@ -2,10 +2,21 @@ package bytes
 
															 import "base:intrinsics"
														
 
															 import "core:mem"
														
 
															-@require import simd_util "core:simd/util"
														
 
															 import "core:unicode"
														
 
															 import "core:unicode/utf8"
														
 
															+
														
 
															+@private SIMD_SCAN_WIDTH :: 32
														
 
															+
														
 
															+@(private, rodata)
														
 
															+simd_scanner_indices := #simd[SIMD_SCAN_WIDTH]u8 {
														
 
															+	 0,  1,  2,  3,  4,  5,  6,  7,
														
 
															+	 8,  9, 10, 11, 12, 13, 14, 15,
														
 
															+	16, 17, 18, 19, 20, 21, 22, 23,
														
 
															+	24, 25, 26, 27, 28, 29, 30, 31,
														
 
															+}
														
 
															+
														
 
															+
														
 
															 clone :: proc(s: []byte, allocator := context.allocator, loc := #caller_location) -> []byte {
														
 
															 	c := make([]byte, len(s), allocator, loc)
														
 
															 	copy(c, s)
														
@@ -295,43 +306,141 @@ split_after_iterator :: proc(s: ^[]byte, sep: []byte) -> ([]byte, bool) {
 
															 	return _split_iterator(s, sep, len(sep))
														
 
															 }
														
 
															+/*
														
 
															+Scan a slice of bytes for a specific byte.
														
 
															+
														
 
															+This procedure safely handles slices of any length, including empty slices.
														
 
															-index_byte :: proc(s: []byte, c: byte) -> int {
														
 
															-	_index_byte :: #force_inline proc "contextless" (s: []byte, c: byte) -> int {
														
 
															-		for ch, i in s {
														
 
															-			if ch == c {
														
 
															+Inputs:
														
 
															+- s: A slice of bytes.
														
 
															+- c: The byte to search for.
														
 
															+
														
 
															+Returns:
														
 
															+- index: The index of the first occurrence of the byte `c`, or -1 if it was not found.
														
 
															+*/
														
 
															+index_byte :: proc(s: []byte, c: byte) -> (index: int) #no_bounds_check {
														
 
															+	length := len(s)
														
 
															+	i := 0
														
 
															+
														
 
															+	// Guard against small strings.
														
 
															+	if length < SIMD_SCAN_WIDTH {
														
 
															+		for /**/; i < length; i += 1 {
														
 
															+			if s[i] == c {
														
 
															 				return i
														
 
															 			}
														
 
															 		}
														
 
															 		return -1
														
 
															 	}
														
 
															-	// NOTE(Feoramund): On my Alder Lake CPU, I have only witnessed a
														
 
															-	// significant speedup when compiling in either Size or Speed mode.
														
 
															-	// The SIMD version is usually 2-3x slower without optimizations on.
														
 
															-	when ODIN_OPTIMIZATION_MODE > .Minimal {
														
 
															-		return #force_inline simd_util.index_byte(s, c)
														
 
															-	} else {
														
 
															-		return _index_byte(s, c)
														
 
															+	ptr := cast(int)cast(uintptr)raw_data(s)
														
 
															+
														
 
															+	alignment_start := (SIMD_SCAN_WIDTH - ptr % SIMD_SCAN_WIDTH) % SIMD_SCAN_WIDTH
														
 
															+
														
 
															+	// Iterate as a scalar until the data is aligned on a `SIMD_SCAN_WIDTH` boundary.
														
 
															+	//
														
 
															+	// This way, every load in the vector loop will be aligned, which should be
														
 
															+	// the fastest possible scenario.
														
 
															+	for /**/; i < alignment_start; i += 1 {
														
 
															+		if s[i] == c {
														
 
															+			return i
														
 
															+		}
														
 
															+	}
														
 
															+
														
 
															+	// Iterate as a vector over every aligned chunk, evaluating each byte simultaneously at the CPU level.
														
 
															+	scanner: #simd[SIMD_SCAN_WIDTH]u8 = c
														
 
															+	tail := length - (length - alignment_start) % SIMD_SCAN_WIDTH
														
 
															+
														
 
															+	for /**/; i < tail; i += SIMD_SCAN_WIDTH {
														
 
															+		load := (cast(^#simd[SIMD_SCAN_WIDTH]u8)(&s[i]))^
														
 
															+		comparison := intrinsics.simd_lanes_eq(load, scanner)
														
 
															+		match := intrinsics.simd_reduce_or(comparison)
														
 
															+		if match > 0 {
														
 
															+			sentinel: #simd[SIMD_SCAN_WIDTH]u8 = u8(0xFF)
														
 
															+			index_select := intrinsics.simd_select(comparison, simd_scanner_indices, sentinel)
														
 
															+			index_reduce := intrinsics.simd_reduce_min(index_select)
														
 
															+			return i + cast(int)index_reduce
														
 
															+		}
														
 
															+	}
														
 
															+
														
 
															+	// Iterate as a scalar over the remaining unaligned portion.
														
 
															+	for /**/; i < length; i += 1 {
														
 
															+		if s[i] == c {
														
 
															+			return i
														
 
															+		}
														
 
															 	}
														
 
															+
														
 
															+	return -1
														
 
															 }
														
 
															-// Returns -1 if c is not present
														
 
															-last_index_byte :: proc(s: []byte, c: byte) -> int {
														
 
															-	_last_index_byte :: #force_inline proc "contextless" (s: []byte, c: byte) -> int {
														
 
															-		#reverse for ch, i in s {
														
 
															-			if ch == c {
														
 
															+/*
														
 
															+Scan a slice of bytes for a specific byte, starting from the end and working
														
 
															+backwards to the start.
														
 
															+
														
 
															+This procedure safely handles slices of any length, including empty slices.
														
 
															+
														
 
															+Inputs:
														
 
															+- s: A slice of bytes.
														
 
															+- c: The byte to search for.
														
 
															+
														
 
															+Returns:
														
 
															+- index: The index of the last occurrence of the byte `c`, or -1 if it was not found.
														
 
															+*/
														
 
															+last_index_byte :: proc(s: []byte, c: byte) -> int #no_bounds_check {
														
 
															+	length := len(s)
														
 
															+	i := length - 1
														
 
															+
														
 
															+	// Guard against small strings.
														
 
															+	if length < SIMD_SCAN_WIDTH {
														
 
															+		for /**/; i >= 0; i -= 1 {
														
 
															+			if s[i] == c {
														
 
															 				return i
														
 
															 			}
														
 
															 		}
														
 
															 		return -1
														
 
															 	}
														
 
															-	when ODIN_OPTIMIZATION_MODE > .Minimal {
														
 
															-		return #force_inline simd_util.last_index_byte(s, c)
														
 
															-	} else {
														
 
															-		return _last_index_byte(s, c)
														
 
															+	ptr := cast(int)cast(uintptr)raw_data(s)
														
 
															+
														
 
															+	tail := length - (ptr + length) % SIMD_SCAN_WIDTH
														
 
															+
														
 
															+	// Iterate as a scalar until the data is aligned on a `SIMD_SCAN_WIDTH` boundary.
														
 
															+	//
														
 
															+	// This way, every load in the vector loop will be aligned, which should be
														
 
															+	// the fastest possible scenario.
														
 
															+	for /**/; i >= tail; i -= 1 {
														
 
															+		if s[i] == c {
														
 
															+			return i
														
 
															+		}
														
 
															 	}
														
 
															+
														
 
															+	// Iterate as a vector over every aligned chunk, evaluating each byte simultaneously at the CPU level.
														
 
															+	scanner: #simd[SIMD_SCAN_WIDTH]u8 = c
														
 
															+	alignment_start := (SIMD_SCAN_WIDTH - ptr % SIMD_SCAN_WIDTH) % SIMD_SCAN_WIDTH
														
 
															+
														
 
															+	i -= SIMD_SCAN_WIDTH - 1
														
 
															+
														
 
															+	for /**/; i >= alignment_start; i -= SIMD_SCAN_WIDTH {
														
 
															+		load := (cast(^#simd[SIMD_SCAN_WIDTH]u8)(&s[i]))^
														
 
															+		comparison := intrinsics.simd_lanes_eq(load, scanner)
														
 
															+		match := intrinsics.simd_reduce_or(comparison)
														
 
															+		if match > 0 {
														
 
															+			sentinel: #simd[SIMD_SCAN_WIDTH]u8
														
 
															+			index_select := intrinsics.simd_select(comparison, simd_scanner_indices, sentinel)
														
 
															+			index_reduce := intrinsics.simd_reduce_max(index_select)
														
 
															+			return i + cast(int)index_reduce
														
 
															+		}
														
 
															+	}
														
 
															+
														
 
															+	// Iterate as a scalar over the remaining unaligned portion.
														
 
															+	i += SIMD_SCAN_WIDTH - 1
														
 
															+
														
 
															+	for /**/; i >= 0; i -= 1 {
														
 
															+		if s[i] == c {
														
 
															+			return i
														
 
															+		}
														
 
															+	}
														
 
															+
														
 
															+	return -1
														
 
															 }
														
--- a/core/simd/util/util.odin
+++ b/core/simd/util/util.odin
@@ -1,160 +0,0 @@
 
															-/*
														
 
															-	(c) Copyright 2024 Feoramund <[email protected]>.
														
 
															-	Made available under Odin's BSD-3 license.
														
 
															-
														
 
															-	List of contributors:
														
 
															-		Feoramund: `index_byte` procedures.
														
 
															-*/
														
 
															-
														
 
															-// package simd_util implements compositions of SIMD operations for optimizing
														
 
															-// the core library where available.
														
 
															-package simd_util
														
 
															-
														
 
															-import "base:intrinsics"
														
 
															-
														
 
															-@private SCAN_WIDTH :: 32
														
 
															-
														
 
															-@(private, rodata)
														
 
															-simd_scanner_indices := #simd[SCAN_WIDTH]u8 {
														
 
															-	 0,  1,  2,  3,  4,  5,  6,  7,
														
 
															-	 8,  9, 10, 11, 12, 13, 14, 15,
														
 
															-	16, 17, 18, 19, 20, 21, 22, 23,
														
 
															-	24, 25, 26, 27, 28, 29, 30, 31,
														
 
															-}
														
 
															-
														
 
															-/*
														
 
															-Scan a slice of bytes for a specific byte.
														
 
															-
														
 
															-This procedure safely handles slices of any length, including empty slices.
														
 
															-
														
 
															-Inputs:
														
 
															-- data: A slice of bytes.
														
 
															-- c: The byte to search for.
														
 
															-
														
 
															-Returns:
														
 
															-- index: The index of the byte `c`, or -1 if it was not found.
														
 
															-*/
														
 
															-index_byte :: proc "contextless" (data: []u8, c: byte) -> (index: int) #no_bounds_check {
														
 
															-	length := len(data)
														
 
															-	i := 0
														
 
															-
														
 
															-	// Guard against small strings.
														
 
															-	if length < SCAN_WIDTH {
														
 
															-		for /**/; i < length; i += 1 {
														
 
															-			if data[i] == c {
														
 
															-				return i
														
 
															-			}
														
 
															-		}
														
 
															-		return -1
														
 
															-	}
														
 
															-
														
 
															-	ptr := cast(int)cast(uintptr)raw_data(data)
														
 
															-
														
 
															-	alignment_start := (SCAN_WIDTH - ptr % SCAN_WIDTH) % SCAN_WIDTH
														
 
															-
														
 
															-	// Iterate as a scalar until the data is aligned on a `SCAN_WIDTH` boundary.
														
 
															-	//
														
 
															-	// This way, every load in the vector loop will be aligned, which should be
														
 
															-	// the fastest possible scenario.
														
 
															-	for /**/; i < alignment_start; i += 1 {
														
 
															-		if data[i] == c {
														
 
															-			return i
														
 
															-		}
														
 
															-	}
														
 
															-
														
 
															-	// Iterate as a vector over every aligned chunk, evaluating each byte simultaneously at the CPU level.
														
 
															-	scanner: #simd[SCAN_WIDTH]u8 = c
														
 
															-	tail := length - (length - alignment_start) % SCAN_WIDTH
														
 
															-
														
 
															-	for /**/; i < tail; i += SCAN_WIDTH {
														
 
															-		load := (cast(^#simd[SCAN_WIDTH]u8)(&data[i]))^
														
 
															-		comparison := intrinsics.simd_lanes_eq(load, scanner)
														
 
															-		match := intrinsics.simd_reduce_or(comparison)
														
 
															-		if match > 0 {
														
 
															-			sentinel: #simd[SCAN_WIDTH]u8 = u8(0xFF)
														
 
															-			index_select := intrinsics.simd_select(comparison, simd_scanner_indices, sentinel)
														
 
															-			index_reduce := intrinsics.simd_reduce_min(index_select)
														
 
															-			return i + cast(int)index_reduce
														
 
															-		}
														
 
															-	}
														
 
															-	
														
 
															-	// Iterate as a scalar over the remaining unaligned portion.
														
 
															-	for /**/; i < length; i += 1 {
														
 
															-		if data[i] == c {
														
 
															-			return i
														
 
															-		}
														
 
															-	}
														
 
															-
														
 
															-	return -1
														
 
															-}
														
 
															-
														
 
															-/*
														
 
															-Scan a slice of bytes for a specific byte, starting from the end and working
														
 
															-backwards to the start.
														
 
															-
														
 
															-This procedure safely handles slices of any length, including empty slices.
														
 
															-
														
 
															-Inputs:
														
 
															-- data: A slice of bytes.
														
 
															-- c: The byte to search for.
														
 
															-
														
 
															-Returns:
														
 
															-- index: The index of the byte `c`, or -1 if it was not found.
														
 
															-*/
														
 
															-last_index_byte :: proc "contextless" (data: []u8, c: byte) -> int #no_bounds_check {
														
 
															-	length := len(data)
														
 
															-	i := length - 1
														
 
															-
														
 
															-	// Guard against small strings.
														
 
															-	if length < SCAN_WIDTH {
														
 
															-		for /**/; i >= 0; i -= 1 {
														
 
															-			if data[i] == c {
														
 
															-				return i
														
 
															-			}
														
 
															-		}
														
 
															-		return -1
														
 
															-	}
														
 
															-
														
 
															-	ptr := cast(int)cast(uintptr)raw_data(data)
														
 
															-
														
 
															-	tail := length - (ptr + length) % SCAN_WIDTH
														
 
															-
														
 
															-	// Iterate as a scalar until the data is aligned on a `SCAN_WIDTH` boundary.
														
 
															-	//
														
 
															-	// This way, every load in the vector loop will be aligned, which should be
														
 
															-	// the fastest possible scenario.
														
 
															-	for /**/; i >= tail; i -= 1 {
														
 
															-		if data[i] == c {
														
 
															-			return i
														
 
															-		}
														
 
															-	}
														
 
															-
														
 
															-	// Iterate as a vector over every aligned chunk, evaluating each byte simultaneously at the CPU level.
														
 
															-	scanner: #simd[SCAN_WIDTH]u8 = c
														
 
															-	alignment_start := (SCAN_WIDTH - ptr % SCAN_WIDTH) % SCAN_WIDTH
														
 
															-
														
 
															-	i -= SCAN_WIDTH - 1
														
 
															-
														
 
															-	for /**/; i >= alignment_start; i -= SCAN_WIDTH {
														
 
															-		load := (cast(^#simd[SCAN_WIDTH]u8)(&data[i]))^
														
 
															-		comparison := intrinsics.simd_lanes_eq(load, scanner)
														
 
															-		match := intrinsics.simd_reduce_or(comparison)
														
 
															-		if match > 0 {
														
 
															-			sentinel: #simd[SCAN_WIDTH]u8
														
 
															-			index_select := intrinsics.simd_select(comparison, simd_scanner_indices, sentinel)
														
 
															-			index_reduce := intrinsics.simd_reduce_max(index_select)
														
 
															-			return i + cast(int)index_reduce
														
 
															-		}
														
 
															-	}
														
 
															-
														
 
															-	// Iterate as a scalar over the remaining unaligned portion.
														
 
															-	i += SCAN_WIDTH - 1
														
 
															-	
														
 
															-	for /**/; i >= 0; i -= 1 {
														
 
															-		if data[i] == c {
														
 
															-			return i
														
 
															-		}
														
 
															-	}
														
 
															-
														
 
															-	return -1
														
 
															-}
														
--- a/core/strings/strings.odin
+++ b/core/strings/strings.odin
@@ -2,8 +2,8 @@
 
															 package strings
														
 
															 import "base:intrinsics"
														
 
															+import "core:bytes"
														
 
															 import "core:io"
														
 
															-@require import simd_util "core:simd/util"
														
 
															 import "core:mem"
														
 
															 import "core:unicode"
														
 
															 import "core:unicode/utf8"
														
@@ -1426,23 +1426,7 @@ Output:
 
															 */
														
 
															 index_byte :: proc(s: string, c: byte) -> (res: int) {
														
 
															-	_index_byte :: #force_inline proc "contextless" (s: string, c: byte) -> int {
														
 
															-		for i := 0; i < len(s); i += 1 {
														
 
															-			if s[i] == c {
														
 
															-				return i
														
 
															-			}
														
 
															-		}
														
 
															-		return -1
														
 
															-	}
														
 
															-
														
 
															-	// NOTE(Feoramund): On my Alder Lake CPU, I have only witnessed a
														
 
															-	// significant speedup when compiling in either Size or Speed mode.
														
 
															-	// The SIMD version is usually 2-3x slower without optimizations on.
														
 
															-	when ODIN_OPTIMIZATION_MODE > .Minimal {
														
 
															-		return #force_inline simd_util.index_byte(transmute([]u8)s, c)
														
 
															-	} else {
														
 
															-		return _index_byte(s, c)
														
 
															-	}
														
 
															+	return #force_inline bytes.index_byte(transmute([]u8)s, c)
														
 
															 }
														
 
															 /*
														
 
															 Returns the byte offset of the last byte `c` in the string `s`, -1 when not found.
														
@@ -1477,20 +1461,7 @@ Output:
 
															 */
														
 
															 last_index_byte :: proc(s: string, c: byte) -> (res: int) {
														
 
															-	_last_index_byte :: #force_inline proc "contextless" (s: string, c: byte) -> int {
														
 
															-		for i := len(s)-1; i >= 0; i -= 1 {
														
 
															-			if s[i] == c {
														
 
															-				return i
														
 
															-			}
														
 
															-		}
														
 
															-		return -1
														
 
															-	}
														
 
															-
														
 
															-	when ODIN_OPTIMIZATION_MODE > .Minimal {
														
 
															-		return #force_inline simd_util.last_index_byte(transmute([]u8)s, c)
														
 
															-	} else {
														
 
															-		return _last_index_byte(s, c)
														
 
															-	}
														
 
															+	return #force_inline bytes.last_index_byte(transmute([]u8)s, c)
														
 
															 }
														
 
															 /*
														
 
															 Returns the byte offset of the first rune `r` in the string `s` it finds, -1 when not found.
														
--- a/examples/all/all_main.odin
+++ b/examples/all/all_main.odin
@@ -115,7 +115,6 @@ import relative         "core:relative"
 
															 import reflect          "core:reflect"
														
 
															 import runtime          "base:runtime"
														
 
															 import simd             "core:simd"
														
 
															-import simd_util        "core:simd/util"
														
 
															 import x86              "core:simd/x86"
														
 
															 import slice            "core:slice"
														
 
															 import slice_heap       "core:slice/heap"
														
@@ -238,7 +237,6 @@ _ :: relative
 
															 _ :: reflect
														
 
															 _ :: runtime
														
 
															 _ :: simd
														
 
															-_ :: simd_util
														
 
															 _ :: x86
														
 
															 _ :: slice
														
 
															 _ :: slice_heap
														
--- a/tests/benchmark/all.odin
+++ b/tests/benchmark/all.odin
@@ -1,5 +1,5 @@
 
															 package benchmarks
														
 
															+@(require) import "bytes"
														
 
															 @(require) import "crypto"
														
 
															 @(require) import "hash"
														
 
															-@(require) import "simd/util"
														
--- a/tests/benchmark/simd/util/benchmark_simd_util.odin
+++ b/tests/benchmark/simd/util/benchmark_simd_util.odin
@@ -1,15 +1,15 @@
 
															-package benchmark_simd_util
														
 
															+package benchmark_bytes
														
 
															+import "core:bytes"
														
 
															 import "core:fmt"
														
 
															 import "core:log"
														
 
															-import simd_util "core:simd/util"
														
 
															 import "core:testing"
														
 
															 import "core:time"
														
 
															 // These are the normal, unoptimized algorithms.
														
 
															-plain_index_byte :: proc "contextless" (s: []u8, c: byte) -> (res: int) #no_bounds_check {
														
 
															+plain_index_byte :: proc(s: []u8, c: byte) -> (res: int) #no_bounds_check {
														
 
															 	for i := 0; i < len(s); i += 1 {
														
 
															 		if s[i] == c {
														
 
															 			return i
														
@@ -18,7 +18,7 @@ plain_index_byte :: proc "contextless" (s: []u8, c: byte) -> (res: int) #no_boun
 
															 	return -1
														
 
															 }
														
 
															-plain_last_index_byte :: proc "contextless" (s: []u8, c: byte) -> (res: int) #no_bounds_check {
														
 
															+plain_last_index_byte :: proc(s: []u8, c: byte) -> (res: int) #no_bounds_check {
														
 
															 	for i := len(s)-1; i >= 0; i -= 1 {
														
 
															 		if s[i] == c {
														
 
															 			return i
														
@@ -37,7 +37,7 @@ sizes := [?]int {
 
															 	1024 * 1024 * 1024,
														
 
															 }
														
 
															-run_trial_size :: proc(p: proc "contextless" ([]u8, byte) -> int, size: int, idx: int, warmup: int, runs: int) -> (timing: time.Duration) {
														
 
															+run_trial_size :: proc(p: proc([]u8, byte) -> int, size: int, idx: int, warmup: int, runs: int) -> (timing: time.Duration) {
														
 
															 	data := make([]u8, size)
														
 
															 	defer delete(data)
														
@@ -95,9 +95,9 @@ benchmark_plain_index_hot :: proc(t: ^testing.T) {
 
															 benchmark_simd_index_cold :: proc(t: ^testing.T) {
														
 
															 	report: string
														
 
															 	for size in sizes {
														
 
															-		timing := run_trial_size(simd_util.index_byte, size, size - 1, 0, 1)
														
 
															+		timing := run_trial_size(bytes.index_byte, size, size - 1, 0, 1)
														
 
															 		report = fmt.tprintf("%s\n        +++ % 8M | %v", report, size, timing)
														
 
															-		timing = run_trial_size(simd_util.last_index_byte, size, 0, 0, 1)
														
 
															+		timing = run_trial_size(bytes.last_index_byte, size, 0, 0, 1)
														
 
															 		report = fmt.tprintf("%s\n (last) +++ % 8M | %v", report, size, timing)
														
 
															 	}
														
 
															 	log.info(report)
														
@@ -107,9 +107,9 @@ benchmark_simd_index_cold :: proc(t: ^testing.T) {
 
															 benchmark_simd_index_hot :: proc(t: ^testing.T) {
														
 
															 	report: string
														
 
															 	for size in sizes {
														
 
															-		timing := run_trial_size(simd_util.index_byte, size, size - 1, HOT, HOT)
														
 
															+		timing := run_trial_size(bytes.index_byte, size, size - 1, HOT, HOT)
														
 
															 		report = fmt.tprintf("%s\n        +++ % 8M | %v", report, size, timing)
														
 
															-		timing = run_trial_size(simd_util.last_index_byte, size, 0, HOT, HOT)
														
 
															+		timing = run_trial_size(bytes.last_index_byte, size, 0, HOT, HOT)
														
 
															 		report = fmt.tprintf("%s\n (last) +++ % 8M | %v", report, size, timing)
														
 
															 	}
														
 
															 	log.info(report)
														
--- a/tests/core/simd/util/test_core_simd_util.odin
+++ b/tests/core/simd/util/test_core_simd_util.odin
@@ -1,6 +1,6 @@
 
															-package test_core_simd_util
														
 
															+package test_core_bytes
														
 
															-import simd_util "core:simd/util"
														
 
															+import "core:bytes"
														
 
															 import "core:testing"
														
 
															 @test
														
@@ -15,30 +15,30 @@ test_index_byte_sanity :: proc(t: ^testing.T) {
 
															 		// Find it at the end.
														
 
															 		data[n-1] = 'o'
														
 
															-		if !testing.expect_value(t, simd_util.index_byte(data, 'o'), n-1) {
														
 
															+		if !testing.expect_value(t, bytes.index_byte(data, 'o'), n-1) {
														
 
															 			return
														
 
															 		}
														
 
															-		if !testing.expect_value(t, simd_util.last_index_byte(data, 'o'), n-1) {
														
 
															+		if !testing.expect_value(t, bytes.last_index_byte(data, 'o'), n-1) {
														
 
															 			return
														
 
															 		}
														
 
															 		data[n-1] = '-'
														
 
															 		// Find it in the middle.
														
 
															 		data[n/2] = 'o'
														
 
															-		if !testing.expect_value(t, simd_util.index_byte(data, 'o'), n/2) {
														
 
															+		if !testing.expect_value(t, bytes.index_byte(data, 'o'), n/2) {
														
 
															 			return
														
 
															 		}
														
 
															-		if !testing.expect_value(t, simd_util.last_index_byte(data, 'o'), n/2) {
														
 
															+		if !testing.expect_value(t, bytes.last_index_byte(data, 'o'), n/2) {
														
 
															 			return
														
 
															 		}
														
 
															 		data[n/2] = '-'
														
 
															 		// Find it at the start.
														
 
															 		data[0] = 'o'
														
 
															-		if !testing.expect_value(t, simd_util.index_byte(data, 'o'), 0) {
														
 
															+		if !testing.expect_value(t, bytes.index_byte(data, 'o'), 0) {
														
 
															 			return
														
 
															 		}
														
 
															-		if !testing.expect_value(t, simd_util.last_index_byte(data, 'o'), 0) {
														
 
															+		if !testing.expect_value(t, bytes.last_index_byte(data, 'o'), 0) {
														
 
															 			return
														
 
															 		}
														
 
															 	}
														
@@ -47,8 +47,8 @@ test_index_byte_sanity :: proc(t: ^testing.T) {
 
															 @test
														
 
															 test_index_byte_empty :: proc(t: ^testing.T) {
														
 
															 	a: [1]u8
														
 
															-	testing.expect_value(t, simd_util.index_byte(a[0:0], 'o'), -1)
														
 
															-	testing.expect_value(t, simd_util.last_index_byte(a[0:0], 'o'), -1)
														
 
															+	testing.expect_value(t, bytes.index_byte(a[0:0], 'o'), -1)
														
 
															+	testing.expect_value(t, bytes.last_index_byte(a[0:0], 'o'), -1)
														
 
															 }
														
 
															 @test
														
@@ -65,12 +65,12 @@ test_index_byte_multiple_hits :: proc(t: ^testing.T) {
 
															 		data[n-5] = 'o'
														
 
															 		// Find the first one.
														
 
															-		if !testing.expect_value(t, simd_util.index_byte(data, 'o'), n-5) {
														
 
															+		if !testing.expect_value(t, bytes.index_byte(data, 'o'), n-5) {
														
 
															 			return
														
 
															 		}
														
 
															 		// Find the last one.
														
 
															-		if !testing.expect_value(t, simd_util.last_index_byte(data, 'o'), n-1) {
														
 
															+		if !testing.expect_value(t, bytes.last_index_byte(data, 'o'), n-1) {
														
 
															 			return
														
 
															 		}
														
 
															 	}
														
@@ -88,19 +88,19 @@ test_index_byte_zero :: proc(t: ^testing.T) {
 
															 		// Positive hit.
														
 
															 		data[n-1] = 0
														
 
															-		if !testing.expect_value(t, simd_util.index_byte(data[:n], 0), n-1) {
														
 
															+		if !testing.expect_value(t, bytes.index_byte(data[:n], 0), n-1) {
														
 
															 			return
														
 
															 		}
														
 
															-		if !testing.expect_value(t, simd_util.last_index_byte(data[:n], 0), n-1) {
														
 
															+		if !testing.expect_value(t, bytes.last_index_byte(data[:n], 0), n-1) {
														
 
															 			return
														
 
															 		}
														
 
															 		// Test for false positives.
														
 
															 		data[n-1] = '-'
														
 
															-		if !testing.expect_value(t, simd_util.index_byte(data[:n], 0), -1) {
														
 
															+		if !testing.expect_value(t, bytes.index_byte(data[:n], 0), -1) {
														
 
															 			return
														
 
															 		}
														
 
															-		if !testing.expect_value(t, simd_util.last_index_byte(data[:n], 0), -1) {
														
 
															+		if !testing.expect_value(t, bytes.last_index_byte(data[:n], 0), -1) {
														
 
															 			return
														
 
															 		}
														
 
															 	}
														
@@ -117,22 +117,22 @@ test_misaligned_data :: proc(t: ^testing.T) {
 
															 		for m in 1..<n {
														
 
															 			data[n-1] = 'o'
														
 
															-			if !testing.expect_value(t, simd_util.index_byte(data[m:n], 'o'), n-1-m) {
														
 
															+			if !testing.expect_value(t, bytes.index_byte(data[m:n], 'o'), n-1-m) {
														
 
															 				return
														
 
															 			}
														
 
															 			data[n-1] = '-'
														
 
															 			data[m+(n-m)/2] = 'o'
														
 
															-			if !testing.expect_value(t, simd_util.index_byte(data[m:n], 'o'), (n-m)/2) {
														
 
															+			if !testing.expect_value(t, bytes.index_byte(data[m:n], 'o'), (n-m)/2) {
														
 
															 				return
														
 
															 			}
														
 
															-			if !testing.expect_value(t, simd_util.last_index_byte(data[m:n], 'o'), (n-m)/2) {
														
 
															+			if !testing.expect_value(t, bytes.last_index_byte(data[m:n], 'o'), (n-m)/2) {
														
 
															 				return
														
 
															 			}
														
 
															 			data[m+(n-m)/2] = '-'
														
 
															 			data[m]   = 'o'
														
 
															-			if !testing.expect_value(t, simd_util.last_index_byte(data[m:n], 'o'), 0) {
														
 
															+			if !testing.expect_value(t, bytes.last_index_byte(data[m:n], 'o'), 0) {
														
 
															 				return
														
 
															 			}
														
 
															 			data[m]   = '-'
														
--- a/tests/core/normal.odin
+++ b/tests/core/normal.odin
@@ -9,6 +9,7 @@ download_assets :: proc() {
 
															 	}
														
 
															 }
														
 
															+@(require) import "bytes"
														
 
															 @(require) import "c/libc"
														
 
															 @(require) import "compress"
														
 
															 @(require) import "container"
														
@@ -34,7 +35,6 @@ download_assets :: proc() {
 
															 @(require) import "path/filepath"
														
 
															 @(require) import "reflect"
														
 
															 @(require) import "runtime"
														
 
															-@(require) import "simd/util"
														
 
															 @(require) import "slice"
														
 
															 @(require) import "strconv"
														
 
															 @(require) import "strings"