Browse Source

Merge pull request #5525 from Barinzaya/xxh3-simd

`core:hash/xxhash`: Static SIMD Support for XXH3
gingerBill 2 months ago
parent
commit
3eb9c5fa65

+ 32 - 0
core/hash/xxhash/common.odin

@@ -101,3 +101,35 @@ XXH64_read64 :: #force_inline proc(buf: []u8, alignment := Alignment.Unaligned)
 		return u64(b)
 		return u64(b)
 	}
 	}
 }
 }
+
+XXH64_read64_simd :: #force_inline proc(buf: []$E, $W: uint, alignment := Alignment.Unaligned) -> (res: #simd[W]u64) {
+	if alignment == .Aligned {
+		res = (^#simd[W]u64)(raw_data(buf))^
+	} else {
+		res = intrinsics.unaligned_load((^#simd[W]u64)(raw_data(buf)))
+	}
+
+	when ODIN_ENDIAN == .Big {
+		bytes := transmute(#simd[W*8]u8)res
+		bytes = intrinsics.simd_lanes_reverse(bytes)
+		res = transmute(#simd[W]u64)bytes
+		res = intrinsics.simd_lanes_reverse(res)
+	}
+	return
+}
+
+XXH64_write64_simd :: #force_inline proc(buf: []$E, value: $V/#simd[$W]u64, alignment := Alignment.Unaligned) {
+	value := value
+	when ODIN_ENDIAN == .Big {
+		bytes := transmute(#simd[W*8]u8)value
+		bytes = intrinsics.simd_lanes_reverse(bytes)
+		value = transmute(#simd[W]u64)bytes
+		value = intrinsics.simd_lanes_reverse(value)
+	}
+
+	if alignment == .Aligned {
+		(^V)(raw_data(buf))^ = value
+	} else {
+		intrinsics.unaligned_store((^V)(raw_data(buf)), value)
+	}
+}

+ 100 - 33
core/hash/xxhash/xxhash_3.odin

@@ -52,6 +52,7 @@ XXH3_SECRET_SIZE_MIN    :: 136
 #assert(len(XXH3_kSecret) == 192 && len(XXH3_kSecret) > XXH3_SECRET_SIZE_MIN)
 #assert(len(XXH3_kSecret) == 192 && len(XXH3_kSecret) > XXH3_SECRET_SIZE_MIN)
 
 
 XXH_ACC_ALIGN           :: 8   /* scalar */
 XXH_ACC_ALIGN           :: 8   /* scalar */
+XXH_MAX_WIDTH           :: #config(XXH_MAX_WIDTH, 512) / 64
 
 
 /*
 /*
 	This is the optimal update size for incremental hashing.
 	This is the optimal update size for incremental hashing.
@@ -62,10 +63,11 @@ XXH3_INTERNAL_BUFFER_SIZE :: 256
 	Streaming state.
 	Streaming state.
 
 
 	IMPORTANT: This structure has a strict alignment requirement of 64 bytes!! **
 	IMPORTANT: This structure has a strict alignment requirement of 64 bytes!! **
-	Do not allocate this with `make()` or `new`, it will not be sufficiently aligned.
-	Use`XXH3_create_state` and `XXH3_destroy_state, or stack allocation.
+	Default allocators will align it correctly if created via `new`, as will
+	placing this struct on the stack, but if using a custom allocator make sure
+	that it handles the alignment correctly!
 */
 */
-XXH3_state :: struct {
+XXH3_state :: struct #align(64) {
 	acc:               [8]u64,
 	acc:               [8]u64,
 	custom_secret:     [XXH_SECRET_DEFAULT_SIZE]u8,
 	custom_secret:     [XXH_SECRET_DEFAULT_SIZE]u8,
 	buffer:            [XXH3_INTERNAL_BUFFER_SIZE]u8,
 	buffer:            [XXH3_INTERNAL_BUFFER_SIZE]u8,
@@ -380,7 +382,6 @@ XXH3_INIT_ACC :: [XXH_ACC_NB]xxh_u64{
 
 
 XXH_SECRET_MERGEACCS_START :: 11
 XXH_SECRET_MERGEACCS_START :: 11
 
 
-@(optimization_mode="favor_size")
 XXH3_hashLong_128b_internal :: #force_inline proc(
 XXH3_hashLong_128b_internal :: #force_inline proc(
 			input: []u8,
 			input: []u8,
 			secret: []u8,
 			secret: []u8,
@@ -408,7 +409,6 @@ XXH3_hashLong_128b_internal :: #force_inline proc(
 /*
 /*
  * It's important for performance that XXH3_hashLong is not inlined.
  * It's important for performance that XXH3_hashLong is not inlined.
  */
  */
-@(optimization_mode="favor_size")
 XXH3_hashLong_128b_default :: #force_no_inline proc(input: []u8, seed: xxh_u64, secret: []u8) -> (res: XXH3_128_hash) {
 XXH3_hashLong_128b_default :: #force_no_inline proc(input: []u8, seed: xxh_u64, secret: []u8) -> (res: XXH3_128_hash) {
 	return XXH3_hashLong_128b_internal(input, XXH3_kSecret[:], XXH3_accumulate_512, XXH3_scramble_accumulator)
 	return XXH3_hashLong_128b_internal(input, XXH3_kSecret[:], XXH3_accumulate_512, XXH3_scramble_accumulator)
 }
 }
@@ -416,12 +416,10 @@ XXH3_hashLong_128b_default :: #force_no_inline proc(input: []u8, seed: xxh_u64,
 /*
 /*
  * It's important for performance that XXH3_hashLong is not inlined.
  * It's important for performance that XXH3_hashLong is not inlined.
  */
  */
-@(optimization_mode="favor_size")
 XXH3_hashLong_128b_withSecret :: #force_no_inline proc(input: []u8, seed: xxh_u64, secret: []u8) -> (res: XXH3_128_hash) {
 XXH3_hashLong_128b_withSecret :: #force_no_inline proc(input: []u8, seed: xxh_u64, secret: []u8) -> (res: XXH3_128_hash) {
 	return XXH3_hashLong_128b_internal(input, secret, XXH3_accumulate_512, XXH3_scramble_accumulator)
 	return XXH3_hashLong_128b_internal(input, secret, XXH3_accumulate_512, XXH3_scramble_accumulator)
 }
 }
 
 
-@(optimization_mode="favor_size")
 XXH3_hashLong_128b_withSeed_internal :: #force_inline proc(
 XXH3_hashLong_128b_withSeed_internal :: #force_inline proc(
 								input: []u8, seed: xxh_u64, secret: []u8,
 								input: []u8, seed: xxh_u64, secret: []u8,
 								f_acc512: XXH3_accumulate_512_f,
 								f_acc512: XXH3_accumulate_512_f,
@@ -442,7 +440,6 @@ XXH3_hashLong_128b_withSeed_internal :: #force_inline proc(
 /*
 /*
  * It's important for performance that XXH3_hashLong is not inlined.
  * It's important for performance that XXH3_hashLong is not inlined.
  */
  */
- @(optimization_mode="favor_size")
 XXH3_hashLong_128b_withSeed :: #force_no_inline proc(input: []u8, seed: xxh_u64, secret: []u8) -> (res: XXH3_128_hash) {
 XXH3_hashLong_128b_withSeed :: #force_no_inline proc(input: []u8, seed: xxh_u64, secret: []u8) -> (res: XXH3_128_hash) {
 	return XXH3_hashLong_128b_withSeed_internal(input, seed, secret, XXH3_accumulate_512, XXH3_scramble_accumulator , XXH3_init_custom_secret)
 	return XXH3_hashLong_128b_withSeed_internal(input, seed, secret, XXH3_accumulate_512, XXH3_scramble_accumulator , XXH3_init_custom_secret)
 }
 }
@@ -477,7 +474,7 @@ XXH3_128bits_internal :: #force_inline proc(
 /* ===   Public XXH128 API   === */
 /* ===   Public XXH128 API   === */
 @(optimization_mode="favor_size")
 @(optimization_mode="favor_size")
 XXH3_128_default :: proc(input: []u8) -> (hash: XXH3_128_hash) {
 XXH3_128_default :: proc(input: []u8) -> (hash: XXH3_128_hash) {
-	return XXH3_128bits_internal(input, 0, XXH3_kSecret[:], XXH3_hashLong_128b_withSeed)
+	return XXH3_128bits_internal(input, 0, XXH3_kSecret[:], XXH3_hashLong_128b_default)
 }
 }
 
 
 @(optimization_mode="favor_size")
 @(optimization_mode="favor_size")
@@ -733,10 +730,6 @@ XXH3_accumulate_512_f       :: #type proc(acc: []xxh_u64, input:  []u8, secret:
 XXH3_scramble_accumulator_f :: #type proc(acc: []xxh_u64, secret: []u8)
 XXH3_scramble_accumulator_f :: #type proc(acc: []xxh_u64, secret: []u8)
 XXH3_init_custom_secret_f   :: #type proc(custom_secret: []u8, seed64: xxh_u64)
 XXH3_init_custom_secret_f   :: #type proc(custom_secret: []u8, seed64: xxh_u64)
 
 
-XXH3_accumulate_512       : XXH3_accumulate_512_f       = XXH3_accumulate_512_scalar
-XXH3_scramble_accumulator : XXH3_scramble_accumulator_f = XXH3_scramble_accumulator_scalar
-XXH3_init_custom_secret   : XXH3_init_custom_secret_f   = XXH3_init_custom_secret_scalar
-
 /* scalar variants - universal */
 /* scalar variants - universal */
 @(optimization_mode="favor_size")
 @(optimization_mode="favor_size")
 XXH3_accumulate_512_scalar :: #force_inline proc(acc: []xxh_u64, input: []u8, secret: []u8) {
 XXH3_accumulate_512_scalar :: #force_inline proc(acc: []xxh_u64, input: []u8, secret: []u8) {
@@ -751,7 +744,7 @@ XXH3_accumulate_512_scalar :: #force_inline proc(acc: []xxh_u64, input: []u8, se
 		sec := XXH64_read64(xsecret[8 * i:])
 		sec := XXH64_read64(xsecret[8 * i:])
 		data_key    := data_val ~ sec
 		data_key    := data_val ~ sec
 		xacc[i ~ 1] += data_val /* swap adjacent lanes */
 		xacc[i ~ 1] += data_val /* swap adjacent lanes */
-		xacc[i    ] += u64(u128(u32(data_key)) * u128(u64(data_key >> 32)))
+		xacc[i    ] += u64(u32(data_key)) * u64(data_key >> 32)
 	}
 	}
 }
 }
 
 
@@ -785,6 +778,87 @@ XXH3_init_custom_secret_scalar :: #force_inline proc(custom_secret: []u8, seed64
 	}
 	}
 }
 }
 
 
+/* generalized SIMD variants */
+XXH3_accumulate_512_simd_generic :: #force_inline proc(acc: []xxh_u64, input: []u8, secret: []u8, $W: uint) {
+	u32xW :: #simd[W]u32
+	u64xW :: #simd[W]u64
+
+	#no_bounds_check for i in uint(0)..<XXH_ACC_NB/W {
+		data_val := XXH64_read64_simd(input[8 * W * i:], W)
+		sec      := XXH64_read64_simd(secret[8 * W * i:], W)
+		data_key := data_val ~ sec
+
+		// Swap adjacent lanes
+		when W == 2 {
+			data_val = swizzle(data_val, 1, 0)
+		} else when W == 4 {
+			data_val = swizzle(data_val, 1, 0, 3, 2)
+		} else when W == 8 {
+			data_val = swizzle(data_val, 1, 0, 3, 2, 5, 4, 7, 6)
+		} else {
+			#panic("Unsupported vector size!")
+		}
+
+		a := XXH64_read64_simd(acc[W * i:], W)
+		a += data_val
+		a += u64xW(u32xW(data_key)) * intrinsics.simd_shr(data_key, 32)
+		XXH64_write64_simd(acc[W * i:], a)
+	}
+}
+
+XXH3_scramble_accumulator_simd_generic :: #force_inline proc(acc: []xxh_u64, secret: []u8, $W: uint) {
+	u64xW :: #simd[W]u64
+	#no_bounds_check for i in uint(0)..<XXH_ACC_NB/W {
+		key64 := XXH64_read64_simd(secret[8 * W * i:], W)
+		acc64 := XXH64_read64_simd(acc[W * i:], W)
+		acc64 ~= intrinsics.simd_shr(acc64, 47)
+		acc64 ~= key64
+		acc64 *= XXH_PRIME32_1
+		XXH64_write64_simd(acc[W * i:], acc64)
+	}
+}
+
+XXH3_init_custom_secret_simd_generic :: #force_inline proc(custom_secret: []u8, seed64: xxh_u64, $W: uint) {
+	u64xW :: #simd[W]u64
+
+	seedVec := u64xW(seed64)
+	for i in 0..<W/2 {
+		j := 2*i + 1
+		seedVec = intrinsics.simd_replace(seedVec, j, -intrinsics.simd_extract(seedVec, j))
+	}
+
+	nbRounds := XXH_SECRET_DEFAULT_SIZE / 8 / W
+	#no_bounds_check for i in uint(0)..<nbRounds {
+		block := XXH64_read64_simd(XXH3_kSecret[8 * W * i:], W)
+		block += seedVec
+		XXH64_write64_simd(custom_secret[8 * W * i:], block)
+	}
+}
+
+XXH3_accumulate_512 :: #force_inline proc(acc: []xxh_u64, input: []u8, secret: []u8) {
+	when XXH_NATIVE_WIDTH > 1 {
+		XXH3_accumulate_512_simd_generic(acc, input, secret, XXH_NATIVE_WIDTH)
+	} else {
+		XXH3_accumulate_512_scalar(acc, input, secret)
+	}
+}
+
+XXH3_scramble_accumulator :: #force_inline proc(acc: []xxh_u64, secret: []u8) {
+	when XXH_NATIVE_WIDTH > 1 {
+		XXH3_scramble_accumulator_simd_generic(acc, secret, XXH_NATIVE_WIDTH)
+	} else {
+		XXH3_scramble_accumulator_scalar(acc, secret)
+	}
+}
+
+XXH3_init_custom_secret :: #force_inline proc(custom_secret: []u8, seed64: xxh_u64) {
+	when XXH_NATIVE_WIDTH > 1 {
+		XXH3_init_custom_secret_simd_generic(custom_secret, seed64, XXH_NATIVE_WIDTH)
+	} else {
+		XXH3_init_custom_secret_scalar(custom_secret, seed64)
+	}
+}
+
 XXH_PREFETCH_DIST :: 320
 XXH_PREFETCH_DIST :: 320
 
 
 /*
 /*
@@ -796,7 +870,7 @@ XXH_PREFETCH_DIST :: 320
 XXH3_accumulate :: #force_inline proc(
 XXH3_accumulate :: #force_inline proc(
 	acc: []xxh_u64, input: []u8, secret: []u8, nbStripes: uint, f_acc512: XXH3_accumulate_512_f) {
 	acc: []xxh_u64, input: []u8, secret: []u8, nbStripes: uint, f_acc512: XXH3_accumulate_512_f) {
 
 
-	for n := uint(0); n < nbStripes; n += 1 {
+	#no_bounds_check for n := uint(0); n < nbStripes; n += 1 {
 		when !XXH_DISABLE_PREFETCH {
 		when !XXH_DISABLE_PREFETCH {
 			in_ptr := &input[n * XXH_STRIPE_LEN]
 			in_ptr := &input[n * XXH_STRIPE_LEN]
 			prefetch(in_ptr, XXH_PREFETCH_DIST)
 			prefetch(in_ptr, XXH_PREFETCH_DIST)
@@ -869,7 +943,6 @@ XXH3_hashLong_64b_internal :: #force_inline proc(input: []u8, secret: []u8,
 /*
 /*
 	It's important for performance that XXH3_hashLong is not inlined.
 	It's important for performance that XXH3_hashLong is not inlined.
 */
 */
-@(optimization_mode="favor_size")
 XXH3_hashLong_64b_withSecret :: #force_no_inline proc(input: []u8, seed64: xxh_u64, secret: []u8) -> (hash: xxh_u64) {
 XXH3_hashLong_64b_withSecret :: #force_no_inline proc(input: []u8, seed64: xxh_u64, secret: []u8) -> (hash: xxh_u64) {
 	return XXH3_hashLong_64b_internal(input, secret, XXH3_accumulate_512, XXH3_scramble_accumulator)
 	return XXH3_hashLong_64b_internal(input, secret, XXH3_accumulate_512, XXH3_scramble_accumulator)
 }
 }
@@ -881,24 +954,11 @@ XXH3_hashLong_64b_withSecret :: #force_no_inline proc(input: []u8, seed64: xxh_u
 	This variant enforces that the compiler can detect that,
 	This variant enforces that the compiler can detect that,
 	and uses this opportunity to streamline the generated code for better performance.
 	and uses this opportunity to streamline the generated code for better performance.
 */
 */
-@(optimization_mode="favor_size")
 XXH3_hashLong_64b_default :: #force_no_inline proc(input: []u8, seed64: xxh_u64, secret: []u8) -> (hash: xxh_u64) {
 XXH3_hashLong_64b_default :: #force_no_inline proc(input: []u8, seed64: xxh_u64, secret: []u8) -> (hash: xxh_u64) {
 	return XXH3_hashLong_64b_internal(input, XXH3_kSecret[:], XXH3_accumulate_512, XXH3_scramble_accumulator)
 	return XXH3_hashLong_64b_internal(input, XXH3_kSecret[:], XXH3_accumulate_512, XXH3_scramble_accumulator)
 }
 }
 
 
-/*
-	XXH3_hashLong_64b_withSeed():
-	Generate a custom key based on alteration of default XXH3_kSecret with the seed,
-	and then use this key for long mode hashing.
-
-	This operation is decently fast but nonetheless costs a little bit of time.
-	Try to avoid it whenever possible (typically when seed==0).
-
-	It's important for performance that XXH3_hashLong is not inlined. Not sure
-	why (uop cache maybe?), but the difference is large and easily measurable.
-*/
-@(optimization_mode="favor_size")
-XXH3_hashLong_64b_withSeed_internal :: #force_no_inline proc(
+XXH3_hashLong_64b_withSeed_internal :: #force_inline proc(
 	input:       []u8,
 	input:       []u8,
 	seed:        xxh_u64,
 	seed:        xxh_u64,
 	f_acc512:    XXH3_accumulate_512_f,
 	f_acc512:    XXH3_accumulate_512_f,
@@ -915,9 +975,16 @@ XXH3_hashLong_64b_withSeed_internal :: #force_no_inline proc(
 }
 }
 
 
 /*
 /*
-	It's important for performance that XXH3_hashLong is not inlined.
+	XXH3_hashLong_64b_withSeed():
+	Generate a custom key based on alteration of default XXH3_kSecret with the seed,
+	and then use this key for long mode hashing.
+
+	This operation is decently fast but nonetheless costs a little bit of time.
+	Try to avoid it whenever possible (typically when seed==0).
+
+	It's important for performance that XXH3_hashLong is not inlined. Not sure
+	why (uop cache maybe?), but the difference is large and easily measurable.
 */
 */
-@(optimization_mode="favor_size")
 XXH3_hashLong_64b_withSeed :: #force_no_inline proc(input: []u8, seed: xxh_u64, secret: []u8) -> (hash: xxh_u64) {
 XXH3_hashLong_64b_withSeed :: #force_no_inline proc(input: []u8, seed: xxh_u64, secret: []u8) -> (hash: xxh_u64) {
 	return XXH3_hashLong_64b_withSeed_internal(input, seed, XXH3_accumulate_512, XXH3_scramble_accumulator, XXH3_init_custom_secret)
 	return XXH3_hashLong_64b_withSeed_internal(input, seed, XXH3_accumulate_512, XXH3_scramble_accumulator, XXH3_init_custom_secret)
 }
 }
@@ -926,7 +993,7 @@ XXH3_hashLong_64b_withSeed :: #force_no_inline proc(input: []u8, seed: xxh_u64,
 XXH3_hashLong64_f :: #type proc(input: []u8, seed: xxh_u64, secret: []u8)  -> (res: xxh_u64)
 XXH3_hashLong64_f :: #type proc(input: []u8, seed: xxh_u64, secret: []u8)  -> (res: xxh_u64)
 
 
 @(optimization_mode="favor_size")
 @(optimization_mode="favor_size")
-XXH3_64bits_internal :: proc(input: []u8, seed: xxh_u64, secret: []u8, f_hashLong: XXH3_hashLong64_f) -> (hash: xxh_u64) {
+XXH3_64bits_internal :: #force_inline proc(input: []u8, seed: xxh_u64, secret: []u8, f_hashLong: XXH3_hashLong64_f) -> (hash: xxh_u64) {
 	assert(len(secret) >= XXH3_SECRET_SIZE_MIN)
 	assert(len(secret) >= XXH3_SECRET_SIZE_MIN)
 	/*
 	/*
 		If an action is to be taken if len(secret) condition is not respected, it should be done here.
 		If an action is to be taken if len(secret) condition is not respected, it should be done here.

+ 13 - 0
core/hash/xxhash/xxhash_3_intel.odin

@@ -0,0 +1,13 @@
+#+build amd64, i386
+package xxhash
+
+import "base:intrinsics"
+
+@(private="file") SSE2_FEATURES :: "sse2"
+@(private="file") AVX2_FEATURES :: "avx2"
+@(private="file") AVX512_FEATURES :: "avx512dq,evex512"
+
+XXH_NATIVE_WIDTH :: min(XXH_MAX_WIDTH,
+	8 when intrinsics.has_target_feature(AVX512_FEATURES) else
+	4 when intrinsics.has_target_feature(AVX2_FEATURES) else
+	2 when intrinsics.has_target_feature(SSE2_FEATURES) else 1)

+ 8 - 0
core/hash/xxhash/xxhash_3_other.odin

@@ -0,0 +1,8 @@
+#+build !amd64
+#+build !i386
+package xxhash
+
+import "base:runtime"
+
+XXH_NATIVE_WIDTH :: min(XXH_MAX_WIDTH,
+	2 when runtime.HAS_HARDWARE_SIMD else 1)