
Merge pull request #4063 from Feoramund/simd-memory

Vectorize `base:runtime.memory_*`
Jeroen van Rijn 3 months ago
parent commit 3142aaf497

+ 141 - 112
base/runtime/internal.odin

@@ -16,6 +16,12 @@ RUNTIME_REQUIRE :: false // !ODIN_TILDE
 @(private)
 __float16 :: f16 when __ODIN_LLVM_F16_SUPPORTED else u16
 
+HAS_HARDWARE_SIMD :: false when (ODIN_ARCH == .amd64 || ODIN_ARCH == .i386) && !intrinsics.has_target_feature("sse2") else
+	false when (ODIN_ARCH == .arm64 || ODIN_ARCH == .arm32) && !intrinsics.has_target_feature("neon") else
+	false when (ODIN_ARCH == .wasm64p32 || ODIN_ARCH == .wasm32) && !intrinsics.has_target_feature("simd128") else
+	false when (ODIN_ARCH == .riscv64) && !intrinsics.has_target_feature("v") else
+	true
+
 
 @(private)
 byte_slice :: #force_inline proc "contextless" (data: rawptr, len: int) -> []byte #no_bounds_check {
@@ -229,150 +235,173 @@ memory_equal :: proc "contextless" (x, y: rawptr, n: int) -> bool {
 	case n == 0: return true
 	case x == y: return true
 	}
-	a, b := ([^]byte)(x), ([^]byte)(y)
-	length := uint(n)
-
-	for i := uint(0); i < length; i += 1 {
-		if a[i] != b[i] {
-			return false
-		}
-	}
-	return true
-	
-/*
-
-	when size_of(uint) == 8 {
-		if word_length := length >> 3; word_length != 0 {
-			for _ in 0..<word_length {
-				if intrinsics.unaligned_load((^u64)(a)) != intrinsics.unaligned_load((^u64)(b)) {
-					return false
+	a, b := cast([^]byte)x, cast([^]byte)y
+
+	n := uint(n)
+	i := uint(0)
+	m := uint(0)
+
+	if n >= 8 {
+		when HAS_HARDWARE_SIMD {
+			// Avoid using 256-bit SIMD on platforms where its emulation is
+			// likely to be less than ideal.
+			when ODIN_ARCH == .amd64 && intrinsics.has_target_feature("avx2") {
+				m = n / 32 * 32
+				for /**/; i < m; i += 32 {
+					load_a := intrinsics.unaligned_load(cast(^#simd[32]u8)&a[i])
+					load_b := intrinsics.unaligned_load(cast(^#simd[32]u8)&b[i])
+					ne := intrinsics.simd_lanes_ne(load_a, load_b)
+					if intrinsics.simd_reduce_or(ne) != 0 {
+						return false
+					}
 				}
-				a = a[size_of(u64):]
-				b = b[size_of(u64):]
 			}
 		}
-		
-		if length & 4 != 0 {
-			if intrinsics.unaligned_load((^u32)(a)) != intrinsics.unaligned_load((^u32)(b)) {
+
+		m = (n-i) / 16 * 16
+		for /**/; i < m; i += 16 {
+			load_a := intrinsics.unaligned_load(cast(^#simd[16]u8)&a[i])
+			load_b := intrinsics.unaligned_load(cast(^#simd[16]u8)&b[i])
+			ne := intrinsics.simd_lanes_ne(load_a, load_b)
+			if intrinsics.simd_reduce_or(ne) != 0 {
 				return false
 			}
-			a = a[size_of(u32):]
-			b = b[size_of(u32):]
 		}
-		
-		if length & 2 != 0 {
-			if intrinsics.unaligned_load((^u16)(a)) != intrinsics.unaligned_load((^u16)(b)) {
+
+		m = (n-i) / 8 * 8
+		for /**/; i < m; i += 8 {
+			if intrinsics.unaligned_load(cast(^uintptr)&a[i]) != intrinsics.unaligned_load(cast(^uintptr)&b[i]) {
 				return false
 			}
-			a = a[size_of(u16):]
-			b = b[size_of(u16):]
 		}
-		
-		if length & 1 != 0 && a[0] != b[0] {
-			return false	
-		}
-		return true
-	} else {
-		if word_length := length >> 2; word_length != 0 {
-			for _ in 0..<word_length {
-				if intrinsics.unaligned_load((^u32)(a)) != intrinsics.unaligned_load((^u32)(b)) {
-					return false
-				}
-				a = a[size_of(u32):]
-				b = b[size_of(u32):]
-			}
-		}
-		
-		length &= 3
-		
-		if length != 0 {
-			for i in 0..<length {
-				if a[i] != b[i] {
-					return false
-				}
-			}
-		}
-
-		return true
 	}
-*/
 
+	for /**/; i < n; i += 1 {
+		if a[i] != b[i] {
+			return false
+		}
+	}
+	return true
 }
-memory_compare :: proc "contextless" (a, b: rawptr, n: int) -> int #no_bounds_check {
+
+memory_compare :: proc "contextless" (x, y: rawptr, n: int) -> int #no_bounds_check {
 	switch {
-	case a == b:   return 0
-	case a == nil: return -1
-	case b == nil: return +1
-	}
-
-	x := uintptr(a)
-	y := uintptr(b)
-	n := uintptr(n)
-
-	SU :: size_of(uintptr)
-	fast := n/SU + 1
-	offset := (fast-1)*SU
-	curr_block := uintptr(0)
-	if n < SU {
-		fast = 0
-	}
-
-	for /**/; curr_block < fast; curr_block += 1 {
-		va := (^uintptr)(x + curr_block * size_of(uintptr))^
-		vb := (^uintptr)(y + curr_block * size_of(uintptr))^
-		if va ~ vb != 0 {
-			for pos := curr_block*SU; pos < n; pos += 1 {
-				a := (^byte)(x+pos)^
-				b := (^byte)(y+pos)^
-				if a ~ b != 0 {
-					return -1 if (int(a) - int(b)) < 0 else +1
+	case x == y:   return 0
+	case x == nil: return -1
+	case y == nil: return +1
+	}
+	a, b := cast([^]byte)x, cast([^]byte)y
+	
+	n := uint(n)
+	i := uint(0)
+	m := uint(0)
+
+	when HAS_HARDWARE_SIMD {
+		when ODIN_ARCH == .amd64 && intrinsics.has_target_feature("avx2") {
+			m = n / 32 * 32
+			for /**/; i < m; i += 32 {
+				load_a := intrinsics.unaligned_load(cast(^#simd[32]u8)&a[i])
+				load_b := intrinsics.unaligned_load(cast(^#simd[32]u8)&b[i])
+				comparison := intrinsics.simd_lanes_ne(load_a, load_b)
+				if intrinsics.simd_reduce_or(comparison) != 0 {
+					sentinel: #simd[32]u8 = u8(0xFF)
+					indices := intrinsics.simd_indices(#simd[32]u8)
+					index_select := intrinsics.simd_select(comparison, indices, sentinel)
+					index_reduce := cast(uint)intrinsics.simd_reduce_min(index_select)
+					return -1 if a[i+index_reduce] < b[i+index_reduce] else +1
 				}
 			}
 		}
 	}
 
-	for /**/; offset < n; offset += 1 {
-		a := (^byte)(x+offset)^
-		b := (^byte)(y+offset)^
-		if a ~ b != 0 {
-			return -1 if (int(a) - int(b)) < 0 else +1
+	m = (n-i) / 16 * 16
+	for /**/; i < m; i += 16 {
+		load_a := intrinsics.unaligned_load(cast(^#simd[16]u8)&a[i])
+		load_b := intrinsics.unaligned_load(cast(^#simd[16]u8)&b[i])
+		comparison := intrinsics.simd_lanes_ne(load_a, load_b)
+		if intrinsics.simd_reduce_or(comparison) != 0 {
+			sentinel: #simd[16]u8 = u8(0xFF)
+			indices := intrinsics.simd_indices(#simd[16]u8)
+			index_select := intrinsics.simd_select(comparison, indices, sentinel)
+			index_reduce := cast(uint)intrinsics.simd_reduce_min(index_select)
+			return -1 if a[i+index_reduce] < b[i+index_reduce] else +1
 		}
 	}
 
+	// 64-bit SIMD is faster than using a `uintptr` to detect a difference then
+	// re-iterating with the byte-by-byte loop, at least on AMD64.
+	m = (n-i) / 8 * 8
+	for /**/; i < m; i += 8 {
+		load_a := intrinsics.unaligned_load(cast(^#simd[8]u8)&a[i])
+		load_b := intrinsics.unaligned_load(cast(^#simd[8]u8)&b[i])
+		comparison := intrinsics.simd_lanes_ne(load_a, load_b)
+		if intrinsics.simd_reduce_or(comparison) != 0 {
+			sentinel: #simd[8]u8 = u8(0xFF)
+			indices := intrinsics.simd_indices(#simd[8]u8)
+			index_select := intrinsics.simd_select(comparison, indices, sentinel)
+			index_reduce := cast(uint)intrinsics.simd_reduce_min(index_select)
+			return -1 if a[i+index_reduce] < b[i+index_reduce] else +1
+		}
+	}
+
+	for /**/; i < n; i += 1 {
+		if a[i] ~ b[i] != 0 {
+			return -1 if int(a[i]) - int(b[i]) < 0 else +1
+		}
+	}
 	return 0
 }
 
 memory_compare_zero :: proc "contextless" (a: rawptr, n: int) -> int #no_bounds_check {
-	x := uintptr(a)
-	n := uintptr(n)
-
-	SU :: size_of(uintptr)
-	fast := n/SU + 1
-	offset := (fast-1)*SU
-	curr_block := uintptr(0)
-	if n < SU {
-		fast = 0
-	}
-
-	for /**/; curr_block < fast; curr_block += 1 {
-		va := (^uintptr)(x + curr_block * size_of(uintptr))^
-		if va ~ 0 != 0 {
-			for pos := curr_block*SU; pos < n; pos += 1 {
-				a := (^byte)(x+pos)^
-				if a ~ 0 != 0 {
-					return -1 if int(a) < 0 else +1
+	n := uint(n)
+	i := uint(0)
+	m := uint(0)
+
+	// Because we're comparing against zero, we never return -1, as that would
+	// indicate the compared value is less than zero.
+	//
+	// Note that a zero return value here means equality.
+
+	bytes := ([^]u8)(a)
+
+	if n >= 8 {
+		when HAS_HARDWARE_SIMD {
+			when ODIN_ARCH == .amd64 && intrinsics.has_target_feature("avx2") {
+				scanner32: #simd[32]u8
+				m = n / 32 * 32
+				for /**/; i < m; i += 32 {
+					load := intrinsics.unaligned_load(cast(^#simd[32]u8)&bytes[i])
+					ne := intrinsics.simd_lanes_ne(scanner32, load)
+					if intrinsics.simd_reduce_or(ne) > 0 {
+						return 1
+					}
 				}
 			}
 		}
-	}
 
-	for /**/; offset < n; offset += 1 {
-		a := (^byte)(x+offset)^
-		if a ~ 0 != 0 {
-			return -1 if int(a) < 0 else +1
+		scanner16: #simd[16]u8
+		m = (n-i) / 16 * 16
+		for /**/; i < m; i += 16 {
+			load := intrinsics.unaligned_load(cast(^#simd[16]u8)&bytes[i])
+			ne := intrinsics.simd_lanes_ne(scanner16, load)
+			if intrinsics.simd_reduce_or(ne) != 0 {
+				return 1
+			}
+		}
+
+		m = (n-i) / 8 * 8
+		for /**/; i < m; i += 8 {
+			if intrinsics.unaligned_load(cast(^uintptr)&bytes[i]) != 0 {
+				return 1
+			}
 		}
 	}
 
+	for /**/; i < n; i += 1 {
+		if bytes[i] != 0 {
+			return 1
+		}
+	}
 	return 0
 }
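
The heart of the new `memory_compare` paths is how a chunk that fails the equality check yields the index of its first mismatched byte without a second scalar pass: lanes that differ keep their own lane index, equal lanes are replaced by a 0xFF sentinel, and the horizontal minimum is the answer. A minimal standalone sketch of that step, reusing the intrinsics from the patch (the procedure name and the fixed 16-byte width are only illustrative):

package mismatch_example

import "base:intrinsics"

// Sketch: locate the first differing byte within one 16-byte chunk.
// `simd_lanes_ne` yields 0xFF in lanes that differ and 0x00 in lanes that
// are equal; selecting each differing lane's index (and a 0xFF sentinel
// otherwise) makes the horizontal minimum the position of the first mismatch.
first_mismatch_in_chunk :: proc "contextless" (a, b: rawptr) -> (index: uint, found: bool) {
	load_a := intrinsics.unaligned_load(cast(^#simd[16]u8)a)
	load_b := intrinsics.unaligned_load(cast(^#simd[16]u8)b)
	ne := intrinsics.simd_lanes_ne(load_a, load_b)
	if intrinsics.simd_reduce_or(ne) == 0 {
		return 0, false // the chunks are identical
	}
	sentinel: #simd[16]u8 = u8(0xFF)
	indices := intrinsics.simd_indices(#simd[16]u8)
	selected := intrinsics.simd_select(ne, indices, sentinel)
	return cast(uint)intrinsics.simd_reduce_min(selected), true
}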
 

+ 2 - 2
core/bytes/bytes.odin

@@ -350,7 +350,7 @@ index_byte :: proc "contextless" (s: []byte, c: byte) -> (index: int) #no_bounds
 	}
 
 	c_vec: simd.u8x16 = c
-	when !simd.IS_EMULATED {
+	when simd.HAS_HARDWARE_SIMD {
 		// Note: While this is something that could also logically take
 		// advantage of AVX512, the various downclocking and power
 		// consumption related woes make it premature to have a dedicated
@@ -485,7 +485,7 @@ last_index_byte :: proc "contextless" (s: []byte, c: byte) -> int #no_bounds_che
 	}
 
 	c_vec: simd.u8x16 = c
-	when !simd.IS_EMULATED {
+	when simd.HAS_HARDWARE_SIMD {
 		// Note: While this is something that could also logically take
 		// advantage of AVX512, the various downclocking and power
 		// consumption related woes make it premature to have a dedicated

+ 1 - 1
core/crypto/_chacha20/simd128/chacha20_simd128.odin

@@ -39,7 +39,7 @@ when ODIN_ARCH == .arm64 || ODIN_ARCH == .arm32 {
 
 // Some targets lack runtime feature detection, and will flat out refuse
 // to load binaries that have unknown instructions.  This is distinct from
-// `simd.IS_EMULATED` as actually good designs support runtime feature
+// `simd.HAS_HARDWARE_SIMD` as actually good designs support runtime feature
 // detection and that constant establishes a baseline.
 //
 // See:

+ 5 - 8
core/simd/simd.odin

@@ -21,20 +21,17 @@ package simd
 
 import "base:builtin"
 import "base:intrinsics"
+import "base:runtime"
 
 /*
 Check if SIMD is software-emulated on a target platform.
 
-This value is `false`, when the compile-time target has the hardware support for
-at 128-bit (or wider) SIMD. If the compile-time target lacks the hardware support
-for 128-bit SIMD, this value is `true`, and all SIMD operations will likely be
+This value is `true`, when the compile-time target has the hardware support for
+at least 128-bit (or wider) SIMD. If the compile-time target lacks the hardware support
+for 128-bit SIMD, this value is `false`, and all SIMD operations will likely be
 emulated.
 */
-IS_EMULATED :: true when (ODIN_ARCH == .amd64 || ODIN_ARCH == .i386) && !intrinsics.has_target_feature("sse2") else
-	true when (ODIN_ARCH == .arm64 || ODIN_ARCH == .arm32) && !intrinsics.has_target_feature("neon") else
-	true when (ODIN_ARCH == .wasm64p32 || ODIN_ARCH == .wasm32) && !intrinsics.has_target_feature("simd128") else
-	true when (ODIN_ARCH == .riscv64) && !intrinsics.has_target_feature("v") else
-	false
+HAS_HARDWARE_SIMD :: runtime.HAS_HARDWARE_SIMD
 
 /*
 Vector of 16 `u8` lanes (128 bits).
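
With the rename, call sites gate their wide paths on the constant directly instead of negating `IS_EMULATED`, as the `core:bytes` hunks above show. A hypothetical caller, sketched only to illustrate the gating (the procedure itself is not part of the patch):

package gating_example

import "base:intrinsics"
import "core:simd"

// Hypothetical: take the 128-bit path only on targets with real SIMD
// hardware; every target still gets the correct scalar tail.
contains_nonzero :: proc "contextless" (s: []byte) -> bool #no_bounds_check {
	i := 0
	when simd.HAS_HARDWARE_SIMD {
		zero: #simd[16]u8
		for /**/; i + 16 <= len(s); i += 16 {
			load := intrinsics.unaligned_load(cast(^#simd[16]u8)&s[i])
			if intrinsics.simd_reduce_or(intrinsics.simd_lanes_ne(zero, load)) != 0 {
				return true
			}
		}
	}
	for /**/; i < len(s); i += 1 {
		if s[i] != 0 {
			return true
		}
	}
	return false
}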

+ 5 - 4
tests/benchmark/bytes/benchmark_bytes.odin

@@ -54,14 +54,15 @@ run_trial_size :: proc(p: proc "contextless" ([]u8, byte) -> int, size: int, idx
 
 	accumulator: int
 
+	watch: time.Stopwatch
+
+	time.stopwatch_start(&watch)
 	for _ in 0..<runs {
-		start := time.now()
 		accumulator += p(data, 'z')
-		done := time.since(start)
-		timing += done
 	}
+	time.stopwatch_stop(&watch)
 
-	timing /= time.Duration(runs)
+	timing = time.stopwatch_duration(watch)
 
 	log.debug(accumulator)
 	return
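
Timing the whole loop once and dividing afterwards avoids the per-iteration `time.now`/`time.since` overhead that the old version folded into its measurement. A sketch of the pattern in isolation (the `work` parameter is only illustrative, and `runs` is assumed to be positive):

package stopwatch_example

import "core:time"

// Sketch: one stopwatch around the entire run loop; the per-iteration
// average is derived afterwards instead of being sampled every iteration.
measure :: proc(runs: int, work: proc "contextless" ()) -> (total, per_run: time.Duration) {
	watch: time.Stopwatch
	time.stopwatch_start(&watch)
	for _ in 0..<runs {
		work()
	}
	time.stopwatch_stop(&watch)
	total = time.stopwatch_duration(watch)
	per_run = total / time.Duration(runs)
	return
}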

+ 227 - 0
tests/benchmark/runtime/benchmark_runtime.odin

@@ -0,0 +1,227 @@
+package benchmark_runtime
+
+import "base:runtime"
+import "core:fmt"
+import "core:log"
+import "core:testing"
+import "core:strings"
+import "core:text/table"
+import "core:time"
+
+RUNS_PER_SIZE :: 2500
+
+sizes := [?]int {
+	7, 8, 9,
+	15, 16, 17,
+	31, 32, 33,
+	63, 64, 65,
+	95, 96, 97,
+	128,
+	256,
+	512,
+	1024,
+	4096,
+	1024 * 1024,
+}
+
+// These are the normal, unoptimized algorithms.
+
+plain_memory_equal :: proc "contextless" (x, y: rawptr, n: int) -> bool {
+	switch {
+	case n == 0: return true
+	case x == y: return true
+	}
+	a, b := ([^]byte)(x), ([^]byte)(y)
+	length := uint(n)
+
+	for i := uint(0); i < length; i += 1 {
+		if a[i] != b[i] {
+			return false
+		}
+	}
+	return true
+}
+
+plain_memory_compare :: proc "contextless" (a, b: rawptr, n: int) -> int #no_bounds_check {
+	switch {
+	case a == b:   return 0
+	case a == nil: return -1
+	case b == nil: return +1
+	}
+
+	x := uintptr(a)
+	y := uintptr(b)
+	n := uintptr(n)
+
+	SU :: size_of(uintptr)
+	fast := n/SU + 1
+	offset := (fast-1)*SU
+	curr_block := uintptr(0)
+	if n < SU {
+		fast = 0
+	}
+
+	for /**/; curr_block < fast; curr_block += 1 {
+		va := (^uintptr)(x + curr_block * size_of(uintptr))^
+		vb := (^uintptr)(y + curr_block * size_of(uintptr))^
+		if va ~ vb != 0 {
+			for pos := curr_block*SU; pos < n; pos += 1 {
+				a := (^byte)(x+pos)^
+				b := (^byte)(y+pos)^
+				if a ~ b != 0 {
+					return -1 if (int(a) - int(b)) < 0 else +1
+				}
+			}
+		}
+	}
+
+	for /**/; offset < n; offset += 1 {
+		a := (^byte)(x+offset)^
+		b := (^byte)(y+offset)^
+		if a ~ b != 0 {
+			return -1 if (int(a) - int(b)) < 0 else +1
+		}
+	}
+
+	return 0
+}
+
+plain_memory_compare_zero :: proc "contextless" (a: rawptr, n: int) -> int #no_bounds_check {
+	x := uintptr(a)
+	n := uintptr(n)
+
+	SU :: size_of(uintptr)
+	fast := n/SU + 1
+	offset := (fast-1)*SU
+	curr_block := uintptr(0)
+	if n < SU {
+		fast = 0
+	}
+
+	for /**/; curr_block < fast; curr_block += 1 {
+		va := (^uintptr)(x + curr_block * size_of(uintptr))^
+		if va ~ 0 != 0 {
+			for pos := curr_block*SU; pos < n; pos += 1 {
+				a := (^byte)(x+pos)^
+				if a ~ 0 != 0 {
+					return -1 if int(a) < 0 else +1
+				}
+			}
+		}
+	}
+
+	for /**/; offset < n; offset += 1 {
+		a := (^byte)(x+offset)^
+		if a ~ 0 != 0 {
+			return -1 if int(a) < 0 else +1
+		}
+	}
+
+	return 0
+}
+
+run_trial_size_cmp :: proc(p: proc "contextless" (rawptr, rawptr, int) -> $R, size: int, idx: int, runs: int, loc := #caller_location) -> (timing: time.Duration) {
+	left  := make([]u8, size)
+	right := make([]u8, size)
+	defer {
+		delete(left)
+		delete(right)
+	}
+
+	right[idx] = 0x01
+
+	accumulator: int
+
+	watch: time.Stopwatch
+
+	time.stopwatch_start(&watch)
+	for _ in 0..<runs {
+		result := p(&left[0], &right[0], size)
+		when R == bool {
+			assert(result == false, loc = loc)
+			accumulator += 1
+		} else when R == int {
+			assert(result == -1, loc = loc)
+			accumulator += result
+		}
+	}
+	time.stopwatch_stop(&watch)
+	timing = time.stopwatch_duration(watch)
+
+	log.debug(accumulator)
+	return
+}
+
+run_trial_size_zero :: proc(p: proc "contextless" (rawptr, int) -> int, size: int, idx: int, runs: int, loc := #caller_location) -> (timing: time.Duration) {
+	data := make([]u8, size)
+	defer delete(data)
+
+	data[idx] = 0x01
+
+	accumulator: int
+
+	watch: time.Stopwatch
+
+	time.stopwatch_start(&watch)
+	for _ in 0..<runs {
+		result := p(&data[0], size)
+		assert(result == 1, loc = loc)
+		accumulator += result
+	}
+	time.stopwatch_stop(&watch)
+	timing = time.stopwatch_duration(watch)
+
+	log.debug(accumulator)
+	return
+}
+
+run_trial_size :: proc {
+	run_trial_size_cmp,
+	run_trial_size_zero,
+}
+
+
+bench_table :: proc(algo_name: string, plain, simd: $P) {
+	string_buffer := strings.builder_make()
+	defer strings.builder_destroy(&string_buffer)
+
+	tbl: table.Table
+	table.init(&tbl)
+	defer table.destroy(&tbl)
+
+	table.aligned_header_of_values(&tbl, .Right, "Algorithm", "Size", "Iterations", "Scalar", "SIMD", "SIMD Relative (%)", "SIMD Relative (x)")
+
+	for size in sizes {
+		// Place the non-zero byte somewhere in the middle.
+		needle_index := size / 2
+
+		plain_timing := run_trial_size(plain, size, needle_index, RUNS_PER_SIZE)
+		simd_timing  := run_trial_size(simd,  size, needle_index, RUNS_PER_SIZE)
+
+		_plain := fmt.tprintf("%8M",  plain_timing)
+		_simd  := fmt.tprintf("%8M",  simd_timing)
+		_relp  := fmt.tprintf("%.3f %%", f64(simd_timing) / f64(plain_timing) * 100.0)
+		_relx  := fmt.tprintf("%.3f x",  1 / (f64(simd_timing) / f64(plain_timing)))
+
+		table.aligned_row_of_values(
+			&tbl,
+			.Right,
+			algo_name,
+			size, RUNS_PER_SIZE, _plain, _simd, _relp, _relx)
+	}
+
+	builder_writer := strings.to_writer(&string_buffer)
+
+	fmt.sbprintln(&string_buffer)
+	table.write_plain_table(builder_writer, &tbl)
+
+	my_table_string := strings.to_string(string_buffer)
+	log.info(my_table_string)
+}
+
+@test
+benchmark_memory_procs :: proc(t: ^testing.T) {
+	bench_table("memory_equal", plain_memory_equal, runtime.memory_equal)
+	bench_table("memory_compare", plain_memory_compare, runtime.memory_compare)
+	bench_table("memory_compare_zero", plain_memory_compare_zero, runtime.memory_compare_zero)
+}

+ 76 - 0
tests/core/runtime/test_core_runtime.odin

@@ -4,6 +4,7 @@ package test_core_runtime
 import "base:intrinsics"
 import "core:mem"
 import "base:runtime"
+import "core:slice"
 import "core:testing"
 
 // Tests that having space for the allocation, but not for the allocation and alignment
@@ -177,3 +178,78 @@ test_map_get :: proc(t: ^testing.T) {
 		check(t, m)
 	}
 }
+
+@(test)
+test_memory_equal :: proc(t: ^testing.T) {
+	data: [256]u8
+	cmp: [256]u8
+
+	slice.fill(data[:], 0xAA)
+	slice.fill(cmp[:], 0xAA)
+
+	for offset in 0..<len(data) {
+		subdata := data[offset:]
+		subcmp := cmp[offset:]
+		for idx in 0..<len(subdata) {
+			if !testing.expect_value(t, runtime.memory_equal(&subdata[0], &subcmp[0], len(subdata)), true) {
+				return
+			}
+
+			subcmp[idx] = 0x55
+			if !testing.expect_value(t, runtime.memory_equal(&subdata[0], &subcmp[0], len(subdata)), false) {
+				return
+			}
+			subcmp[idx] = 0xAA
+		}
+	}
+}
+
+@(test)
+test_memory_compare :: proc(t: ^testing.T) {
+	data: [256]u8
+	cmp: [256]u8
+
+	for offset in 0..<len(data) {
+		subdata := data[offset:]
+		subcmp := cmp[offset:]
+		for idx in 0..<len(subdata) {
+			if !testing.expect_value(t, runtime.memory_compare(&subdata[0], &subcmp[0], len(subdata)), 0) {
+				return
+			}
+
+			subdata[idx] = 0x7F
+			subcmp[idx] = 0xFF
+			if !testing.expect_value(t, runtime.memory_compare(&subdata[0], &subcmp[0], len(subdata)), -1) {
+				return
+			}
+
+			subdata[idx] = 0xFF
+			subcmp[idx] = 0x7F
+			if !testing.expect_value(t, runtime.memory_compare(&subdata[0], &subcmp[0], len(subdata)), 1) {
+				return
+			}
+
+			subdata[idx] = 0
+			subcmp[idx] = 0
+		}
+	}
+}
+
+@(test)
+test_memory_compare_zero :: proc(t: ^testing.T) {
+	data: [256]u8
+
+	for offset in 0..<len(data) {
+		subdata := data[offset:]
+		for idx in 0..<len(subdata) {
+			if !testing.expect_value(t, runtime.memory_compare_zero(&subdata[0], len(subdata)), 0) {
+				return
+			}
+			subdata[idx] = 0xFF
+			if !testing.expect_value(t, runtime.memory_compare_zero(&subdata[0], len(subdata)), 1) {
+				return
+			}
+			subdata[idx] = 0
+		}
+	}
+}