util.odin 4.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160
  1. /*
  2. (c) Copyright 2024 Feoramund <[email protected]>.
  3. Made available under Odin's BSD-3 license.
  4. List of contributors:
  5. Feoramund: `index_byte` procedures.
  6. */
  7. // package simd_util implements compositions of SIMD operations for optimizing
  8. // the core library where available.
  9. package simd_util
  10. import "base:intrinsics"
  11. @private SCAN_WIDTH :: 32
  12. @(private, rodata)
  13. simd_scanner_indices := #simd[SCAN_WIDTH]u8 {
  14. 0, 1, 2, 3, 4, 5, 6, 7,
  15. 8, 9, 10, 11, 12, 13, 14, 15,
  16. 16, 17, 18, 19, 20, 21, 22, 23,
  17. 24, 25, 26, 27, 28, 29, 30, 31,
  18. }
  19. /*
  20. Scan a slice of bytes for a specific byte.
  21. This procedure safely handles slices of any length, including empty slices.
  22. Inputs:
  23. - data: A slice of bytes.
  24. - c: The byte to search for.
  25. Returns:
  26. - index: The index of the byte `c`, or -1 if it was not found.
  27. */
  28. index_byte :: proc "contextless" (data: []u8, c: byte) -> (index: int) #no_bounds_check {
  29. length := len(data)
  30. i := 0
  31. // Guard against small strings.
  32. if length < SCAN_WIDTH {
  33. for /**/; i < length; i += 1 {
  34. if data[i] == c {
  35. return i
  36. }
  37. }
  38. return -1
  39. }
  40. ptr := cast(int)cast(uintptr)raw_data(data)
  41. alignment_start := (SCAN_WIDTH - ptr % SCAN_WIDTH) % SCAN_WIDTH
  42. // Iterate as a scalar until the data is aligned on a `SCAN_WIDTH` boundary.
  43. //
  44. // This way, every load in the vector loop will be aligned, which should be
  45. // the fastest possible scenario.
  46. for /**/; i < alignment_start; i += 1 {
  47. if data[i] == c {
  48. return i
  49. }
  50. }
  51. // Iterate as a vector over every aligned chunk, evaluating each byte simultaneously at the CPU level.
  52. scanner: #simd[SCAN_WIDTH]u8 = c
  53. tail := length - (length - alignment_start) % SCAN_WIDTH
  54. for /**/; i < tail; i += SCAN_WIDTH {
  55. load := (cast(^#simd[SCAN_WIDTH]u8)(&data[i]))^
  56. comparison := intrinsics.simd_lanes_eq(load, scanner)
  57. match := intrinsics.simd_reduce_or(comparison)
  58. if match > 0 {
  59. sentinel: #simd[SCAN_WIDTH]u8 = u8(0xFF)
  60. index_select := intrinsics.simd_select(comparison, simd_scanner_indices, sentinel)
  61. index_reduce := intrinsics.simd_reduce_min(index_select)
  62. return i + cast(int)index_reduce
  63. }
  64. }
  65. // Iterate as a scalar over the remaining unaligned portion.
  66. for /**/; i < length; i += 1 {
  67. if data[i] == c {
  68. return i
  69. }
  70. }
  71. return -1
  72. }
  73. /*
  74. Scan a slice of bytes for a specific byte, starting from the end and working
  75. backwards to the start.
  76. This procedure safely handles slices of any length, including empty slices.
  77. Inputs:
  78. - data: A slice of bytes.
  79. - c: The byte to search for.
  80. Returns:
  81. - index: The index of the byte `c`, or -1 if it was not found.
  82. */
  83. last_index_byte :: proc "contextless" (data: []u8, c: byte) -> int #no_bounds_check {
  84. length := len(data)
  85. i := length - 1
  86. // Guard against small strings.
  87. if length < SCAN_WIDTH {
  88. for /**/; i >= 0; i -= 1 {
  89. if data[i] == c {
  90. return i
  91. }
  92. }
  93. return -1
  94. }
  95. ptr := cast(int)cast(uintptr)raw_data(data)
  96. tail := length - (ptr + length) % SCAN_WIDTH
  97. // Iterate as a scalar until the data is aligned on a `SCAN_WIDTH` boundary.
  98. //
  99. // This way, every load in the vector loop will be aligned, which should be
  100. // the fastest possible scenario.
  101. for /**/; i >= tail; i -= 1 {
  102. if data[i] == c {
  103. return i
  104. }
  105. }
  106. // Iterate as a vector over every aligned chunk, evaluating each byte simultaneously at the CPU level.
  107. scanner: #simd[SCAN_WIDTH]u8 = c
  108. alignment_start := (SCAN_WIDTH - ptr % SCAN_WIDTH) % SCAN_WIDTH
  109. i -= SCAN_WIDTH - 1
  110. for /**/; i >= alignment_start; i -= SCAN_WIDTH {
  111. load := (cast(^#simd[SCAN_WIDTH]u8)(&data[i]))^
  112. comparison := intrinsics.simd_lanes_eq(load, scanner)
  113. match := intrinsics.simd_reduce_or(comparison)
  114. if match > 0 {
  115. sentinel: #simd[SCAN_WIDTH]u8
  116. index_select := intrinsics.simd_select(comparison, simd_scanner_indices, sentinel)
  117. index_reduce := intrinsics.simd_reduce_max(index_select)
  118. return i + cast(int)index_reduce
  119. }
  120. }
  121. // Iterate as a scalar over the remaining unaligned portion.
  122. i += SCAN_WIDTH - 1
  123. for /**/; i >= 0; i -= 1 {
  124. if data[i] == c {
  125. return i
  126. }
  127. }
  128. return -1
  129. }