123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160 |
- /*
- (c) Copyright 2024 Feoramund <[email protected]>.
- Made available under Odin's BSD-3 license.
- List of contributors:
- Feoramund: `index_byte` procedures.
- */
- // package simd_util implements compositions of SIMD operations for optimizing
- // the core library where available.
- package simd_util
- import "base:intrinsics"
- @private SCAN_WIDTH :: 32
- @(private, rodata)
- simd_scanner_indices := #simd[SCAN_WIDTH]u8 {
- 0, 1, 2, 3, 4, 5, 6, 7,
- 8, 9, 10, 11, 12, 13, 14, 15,
- 16, 17, 18, 19, 20, 21, 22, 23,
- 24, 25, 26, 27, 28, 29, 30, 31,
- }
- /*
- Scan a slice of bytes for a specific byte.
- This procedure safely handles slices of any length, including empty slices.
- Inputs:
- - data: A slice of bytes.
- - c: The byte to search for.
- Returns:
- - index: The index of the byte `c`, or -1 if it was not found.
- */
- index_byte :: proc "contextless" (data: []u8, c: byte) -> (index: int) #no_bounds_check {
- length := len(data)
- i := 0
- // Guard against small strings.
- if length < SCAN_WIDTH {
- for /**/; i < length; i += 1 {
- if data[i] == c {
- return i
- }
- }
- return -1
- }
- ptr := cast(int)cast(uintptr)raw_data(data)
- alignment_start := (SCAN_WIDTH - ptr % SCAN_WIDTH) % SCAN_WIDTH
- // Iterate as a scalar until the data is aligned on a `SCAN_WIDTH` boundary.
- //
- // This way, every load in the vector loop will be aligned, which should be
- // the fastest possible scenario.
- for /**/; i < alignment_start; i += 1 {
- if data[i] == c {
- return i
- }
- }
- // Iterate as a vector over every aligned chunk, evaluating each byte simultaneously at the CPU level.
- scanner: #simd[SCAN_WIDTH]u8 = c
- tail := length - (length - alignment_start) % SCAN_WIDTH
- for /**/; i < tail; i += SCAN_WIDTH {
- load := (cast(^#simd[SCAN_WIDTH]u8)(&data[i]))^
- comparison := intrinsics.simd_lanes_eq(load, scanner)
- match := intrinsics.simd_reduce_or(comparison)
- if match > 0 {
- sentinel: #simd[SCAN_WIDTH]u8 = u8(0xFF)
- index_select := intrinsics.simd_select(comparison, simd_scanner_indices, sentinel)
- index_reduce := intrinsics.simd_reduce_min(index_select)
- return i + cast(int)index_reduce
- }
- }
-
- // Iterate as a scalar over the remaining unaligned portion.
- for /**/; i < length; i += 1 {
- if data[i] == c {
- return i
- }
- }
- return -1
- }
- /*
- Scan a slice of bytes for a specific byte, starting from the end and working
- backwards to the start.
- This procedure safely handles slices of any length, including empty slices.
- Inputs:
- - data: A slice of bytes.
- - c: The byte to search for.
- Returns:
- - index: The index of the byte `c`, or -1 if it was not found.
- */
- last_index_byte :: proc "contextless" (data: []u8, c: byte) -> int #no_bounds_check {
- length := len(data)
- i := length - 1
- // Guard against small strings.
- if length < SCAN_WIDTH {
- for /**/; i >= 0; i -= 1 {
- if data[i] == c {
- return i
- }
- }
- return -1
- }
- ptr := cast(int)cast(uintptr)raw_data(data)
- tail := length - (ptr + length) % SCAN_WIDTH
- // Iterate as a scalar until the data is aligned on a `SCAN_WIDTH` boundary.
- //
- // This way, every load in the vector loop will be aligned, which should be
- // the fastest possible scenario.
- for /**/; i >= tail; i -= 1 {
- if data[i] == c {
- return i
- }
- }
- // Iterate as a vector over every aligned chunk, evaluating each byte simultaneously at the CPU level.
- scanner: #simd[SCAN_WIDTH]u8 = c
- alignment_start := (SCAN_WIDTH - ptr % SCAN_WIDTH) % SCAN_WIDTH
- i -= SCAN_WIDTH - 1
- for /**/; i >= alignment_start; i -= SCAN_WIDTH {
- load := (cast(^#simd[SCAN_WIDTH]u8)(&data[i]))^
- comparison := intrinsics.simd_lanes_eq(load, scanner)
- match := intrinsics.simd_reduce_or(comparison)
- if match > 0 {
- sentinel: #simd[SCAN_WIDTH]u8
- index_select := intrinsics.simd_select(comparison, simd_scanner_indices, sentinel)
- index_reduce := intrinsics.simd_reduce_max(index_select)
- return i + cast(int)index_reduce
- }
- }
- // Iterate as a scalar over the remaining unaligned portion.
- i += SCAN_WIDTH - 1
-
- for /**/; i >= 0; i -= 1 {
- if data[i] == c {
- return i
- }
- }
- return -1
- }
|