@@ -0,0 +1,260 @@
+#+build amd64
+package sha2
+
+// Based on the public-domain code by Jeffrey Walton, though
+// realistically there is only one sensible way to write this,
+// and Intel's whitepaper covers it.
+//
+// See: https://github.com/noloader/SHA-Intrinsics
+
+import "base:intrinsics"
+import "core:simd"
+import "core:simd/x86"
+import "core:sys/info"
+
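+// MASK is the _mm_shuffle_epi8 control used to byte-swap each 32-bit
+// word, converting the big-endian message words on load.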
+@(private = "file")
+MASK :: x86.__m128i{0x0405060700010203, 0x0c0d0e0f08090a0b}
+
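+// The SHA-256 round constants K[0..63], packed four 32-bit words per
+// vector so that each constant feeds one four-round group below.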
+@(private = "file")
+K_0 :: simd.u64x2{0x71374491428a2f98, 0xe9b5dba5b5c0fbcf}
+@(private = "file")
+K_1 :: simd.u64x2{0x59f111f13956c25b, 0xab1c5ed5923f82a4}
+@(private = "file")
+K_2 :: simd.u64x2{0x12835b01d807aa98, 0x550c7dc3243185be}
+@(private = "file")
+K_3 :: simd.u64x2{0x80deb1fe72be5d74, 0xc19bf1749bdc06a7}
+@(private = "file")
+K_4 :: simd.u64x2{0xefbe4786e49b69c1, 0x240ca1cc0fc19dc6}
+@(private = "file")
+K_5 :: simd.u64x2{0x4a7484aa2de92c6f, 0x76f988da5cb0a9dc}
+@(private = "file")
+K_6 :: simd.u64x2{0xa831c66d983e5152, 0xbf597fc7b00327c8}
+@(private = "file")
+K_7 :: simd.u64x2{0xd5a79147c6e00bf3, 0x1429296706ca6351}
+@(private = "file")
+K_8 :: simd.u64x2{0x2e1b213827b70a85, 0x53380d134d2c6dfc}
+@(private = "file")
+K_9 :: simd.u64x2{0x766a0abb650a7354, 0x92722c8581c2c92e}
+@(private = "file")
+K_10 :: simd.u64x2{0xa81a664ba2bfe8a1, 0xc76c51a3c24b8b70}
+@(private = "file")
+K_11 :: simd.u64x2{0xd6990624d192e819, 0x106aa070f40e3585}
+@(private = "file")
+K_12 :: simd.u64x2{0x1e376c0819a4c116, 0x34b0bcb52748774c}
+@(private = "file")
+K_13 :: simd.u64x2{0x4ed8aa4a391c0cb3, 0x682e6ff35b9cca4f}
+@(private = "file")
+K_14 :: simd.u64x2{0x78a5636f748f82ee, 0x8cc7020884c87814}
+@(private = "file")
+K_15 :: simd.u64x2{0xa4506ceb90befffa, 0xc67178f2bef9a3f7}
+
+
+// is_hardware_accelerated_256 returns true iff hardware-accelerated
+// SHA-224/SHA-256 is supported.
+is_hardware_accelerated_256 :: proc "contextless" () -> bool {
+	features, ok := info.cpu_features.?
+	if !ok {
+		return false
+	}
+
+	req_features :: info.CPU_Features{
+		.sse2,
+		.ssse3,
+		.sse41,
+		.sha,
+	}
+	return features >= req_features
+}
+
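+// A caller would typically use this check to pick a transform at runtime.
+// A minimal sketch (`sha256_transf_portable` is a hypothetical pure-Odin
+// fallback, not defined in this file):
+//
+//	sha256_transf :: proc "contextless" (ctx: ^Context_256, data: []byte) {
+//		if is_hardware_accelerated_256() {
+//			sha256_transf_hw(ctx, data)
+//		} else {
+//			sha256_transf_portable(ctx, data)
+//		}
+//	}
+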
+@(private, enable_target_feature="sse2,ssse3,sse4.1,sha")
+sha256_transf_hw :: proc "contextless" (ctx: ^Context_256, data: []byte) #no_bounds_check {
+	// Load the state, and shuffle it into the (ABEF, CDGH) lane order
+	// that the SHA instructions expect.
+	tmp := intrinsics.unaligned_load((^x86.__m128i)(&ctx.h[0]))
+	state_1 := intrinsics.unaligned_load((^x86.__m128i)(&ctx.h[4]))
+
+	tmp = x86._mm_shuffle_epi32(tmp, 0xb1) // CDAB
+	state_1 = x86._mm_shuffle_epi32(state_1, 0x1b) // EFGH
+	state_0 := x86._mm_alignr_epi8(tmp, state_1, 8) // ABEF
+	// state_1 = x86._mm_blend_epi16(state_1, tmp, 0xf0) // CDGH
+	state_1 = kludge_mm_blend_epi16_0xf0(state_1, tmp)
+
+	data := data
+	for len(data) >= BLOCK_SIZE_256 {
+		state_0_save, state_1_save := state_0, state_1
+
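+		// Each _mm_sha256rnds2_epu32 call performs two rounds, taking its
+		// two W+K values from the low dwords of `msg`; the 0x0e shuffle
+		// then moves the high dwords down for the following two rounds.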
+		// Rounds 0-3
+		msg := intrinsics.unaligned_load((^x86.__m128i)(raw_data(data)))
+		msg_0 := x86._mm_shuffle_epi8(msg, MASK)
+		msg = x86._mm_add_epi32(msg_0, x86.__m128i(K_0))
+		state_1 = x86._mm_sha256rnds2_epu32(state_1, state_0, msg)
+		msg = x86._mm_shuffle_epi32(msg, 0x0e)
+		state_0 = x86._mm_sha256rnds2_epu32(state_0, state_1, msg)
+
+		// Rounds 4-7
+		msg_1 := intrinsics.unaligned_load((^x86.__m128i)(raw_data(data[16:])))
+		msg_1 = x86._mm_shuffle_epi8(msg_1, MASK)
+		msg = x86._mm_add_epi32(msg_1, x86.__m128i(K_1))
+		state_1 = x86._mm_sha256rnds2_epu32(state_1, state_0, msg)
+		msg = x86._mm_shuffle_epi32(msg, 0x0e)
+		state_0 = x86._mm_sha256rnds2_epu32(state_0, state_1, msg)
+		msg_0 = x86._mm_sha256msg1_epu32(msg_0, msg_1)
+
+		// Rounds 8-11
+		msg_2 := intrinsics.unaligned_load((^x86.__m128i)(raw_data(data[32:])))
+		msg_2 = x86._mm_shuffle_epi8(msg_2, MASK)
+		msg = x86._mm_add_epi32(msg_2, x86.__m128i(K_2))
+		state_1 = x86._mm_sha256rnds2_epu32(state_1, state_0, msg)
+		msg = x86._mm_shuffle_epi32(msg, 0x0e)
+		state_0 = x86._mm_sha256rnds2_epu32(state_0, state_1, msg)
+		msg_1 = x86._mm_sha256msg1_epu32(msg_1, msg_2)
+
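+		// From this group on, the W schedule is expanded on the fly:
+		// sha256msg1, an alignr+add (the W[t-7] term), and sha256msg2
+		// together derive the next four W values from the prior sixteen.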
+		// Rounds 12-15
+		msg_3 := intrinsics.unaligned_load((^x86.__m128i)(raw_data(data[48:])))
+		msg_3 = x86._mm_shuffle_epi8(msg_3, MASK)
+		msg = x86._mm_add_epi32(msg_3, x86.__m128i(K_3))
+		state_1 = x86._mm_sha256rnds2_epu32(state_1, state_0, msg)
+		tmp = x86._mm_alignr_epi8(msg_3, msg_2, 4)
+		msg_0 = x86._mm_add_epi32(msg_0, tmp)
+		msg_0 = x86._mm_sha256msg2_epu32(msg_0, msg_3)
+		msg = x86._mm_shuffle_epi32(msg, 0x0e)
+		state_0 = x86._mm_sha256rnds2_epu32(state_0, state_1, msg)
+		msg_2 = x86._mm_sha256msg1_epu32(msg_2, msg_3)
+
+		// Rounds 16-19
+		msg = x86._mm_add_epi32(msg_0, x86.__m128i(K_4))
+		state_1 = x86._mm_sha256rnds2_epu32(state_1, state_0, msg)
+		tmp = x86._mm_alignr_epi8(msg_0, msg_3, 4)
+		msg_1 = x86._mm_add_epi32(msg_1, tmp)
+		msg_1 = x86._mm_sha256msg2_epu32(msg_1, msg_0)
+		msg = x86._mm_shuffle_epi32(msg, 0x0e)
+		state_0 = x86._mm_sha256rnds2_epu32(state_0, state_1, msg)
+		msg_3 = x86._mm_sha256msg1_epu32(msg_3, msg_0)
+
+		// Rounds 20-23
+		msg = x86._mm_add_epi32(msg_1, x86.__m128i(K_5))
+		state_1 = x86._mm_sha256rnds2_epu32(state_1, state_0, msg)
+		tmp = x86._mm_alignr_epi8(msg_1, msg_0, 4)
+		msg_2 = x86._mm_add_epi32(msg_2, tmp)
+		msg_2 = x86._mm_sha256msg2_epu32(msg_2, msg_1)
+		msg = x86._mm_shuffle_epi32(msg, 0x0e)
+		state_0 = x86._mm_sha256rnds2_epu32(state_0, state_1, msg)
+		msg_0 = x86._mm_sha256msg1_epu32(msg_0, msg_1)
+
+		// Rounds 24-27
+		msg = x86._mm_add_epi32(msg_2, x86.__m128i(K_6))
+		state_1 = x86._mm_sha256rnds2_epu32(state_1, state_0, msg)
+		tmp = x86._mm_alignr_epi8(msg_2, msg_1, 4)
+		msg_3 = x86._mm_add_epi32(msg_3, tmp)
+		msg_3 = x86._mm_sha256msg2_epu32(msg_3, msg_2)
+		msg = x86._mm_shuffle_epi32(msg, 0x0e)
+		state_0 = x86._mm_sha256rnds2_epu32(state_0, state_1, msg)
+		msg_1 = x86._mm_sha256msg1_epu32(msg_1, msg_2)
+
+		// Rounds 28-31
+		msg = x86._mm_add_epi32(msg_3, x86.__m128i(K_7))
+		state_1 = x86._mm_sha256rnds2_epu32(state_1, state_0, msg)
+		tmp = x86._mm_alignr_epi8(msg_3, msg_2, 4)
+		msg_0 = x86._mm_add_epi32(msg_0, tmp)
+		msg_0 = x86._mm_sha256msg2_epu32(msg_0, msg_3)
+		msg = x86._mm_shuffle_epi32(msg, 0x0e)
+		state_0 = x86._mm_sha256rnds2_epu32(state_0, state_1, msg)
+		msg_2 = x86._mm_sha256msg1_epu32(msg_2, msg_3)
+
+		// Rounds 32-35
+		msg = x86._mm_add_epi32(msg_0, x86.__m128i(K_8))
+		state_1 = x86._mm_sha256rnds2_epu32(state_1, state_0, msg)
+		tmp = x86._mm_alignr_epi8(msg_0, msg_3, 4)
+		msg_1 = x86._mm_add_epi32(msg_1, tmp)
+		msg_1 = x86._mm_sha256msg2_epu32(msg_1, msg_0)
+		msg = x86._mm_shuffle_epi32(msg, 0x0e)
+		state_0 = x86._mm_sha256rnds2_epu32(state_0, state_1, msg)
+		msg_3 = x86._mm_sha256msg1_epu32(msg_3, msg_0)
+
+		// Rounds 36-39
+		msg = x86._mm_add_epi32(msg_1, x86.__m128i(K_9))
+		state_1 = x86._mm_sha256rnds2_epu32(state_1, state_0, msg)
+		tmp = x86._mm_alignr_epi8(msg_1, msg_0, 4)
+		msg_2 = x86._mm_add_epi32(msg_2, tmp)
+		msg_2 = x86._mm_sha256msg2_epu32(msg_2, msg_1)
+		msg = x86._mm_shuffle_epi32(msg, 0x0e)
+		state_0 = x86._mm_sha256rnds2_epu32(state_0, state_1, msg)
+		msg_0 = x86._mm_sha256msg1_epu32(msg_0, msg_1)
+
+		// Rounds 40-43
+		msg = x86._mm_add_epi32(msg_2, x86.__m128i(K_10))
+		state_1 = x86._mm_sha256rnds2_epu32(state_1, state_0, msg)
+		tmp = x86._mm_alignr_epi8(msg_2, msg_1, 4)
+		msg_3 = x86._mm_add_epi32(msg_3, tmp)
+		msg_3 = x86._mm_sha256msg2_epu32(msg_3, msg_2)
+		msg = x86._mm_shuffle_epi32(msg, 0x0e)
+		state_0 = x86._mm_sha256rnds2_epu32(state_0, state_1, msg)
+		msg_1 = x86._mm_sha256msg1_epu32(msg_1, msg_2)
+
+		// Rounds 44-47
+		msg = x86._mm_add_epi32(msg_3, x86.__m128i(K_11))
+		state_1 = x86._mm_sha256rnds2_epu32(state_1, state_0, msg)
+		tmp = x86._mm_alignr_epi8(msg_3, msg_2, 4)
+		msg_0 = x86._mm_add_epi32(msg_0, tmp)
+		msg_0 = x86._mm_sha256msg2_epu32(msg_0, msg_3)
+		msg = x86._mm_shuffle_epi32(msg, 0x0e)
+		state_0 = x86._mm_sha256rnds2_epu32(state_0, state_1, msg)
+		msg_2 = x86._mm_sha256msg1_epu32(msg_2, msg_3)
+
+		// Rounds 48-51
+		msg = x86._mm_add_epi32(msg_0, x86.__m128i(K_12))
+		state_1 = x86._mm_sha256rnds2_epu32(state_1, state_0, msg)
+		tmp = x86._mm_alignr_epi8(msg_0, msg_3, 4)
+		msg_1 = x86._mm_add_epi32(msg_1, tmp)
+		msg_1 = x86._mm_sha256msg2_epu32(msg_1, msg_0)
+		msg = x86._mm_shuffle_epi32(msg, 0x0e)
+		state_0 = x86._mm_sha256rnds2_epu32(state_0, state_1, msg)
+		msg_3 = x86._mm_sha256msg1_epu32(msg_3, msg_0)
+
+		// Rounds 52-55 (the schedule winds down; no more sha256msg1)
+		msg = x86._mm_add_epi32(msg_1, x86.__m128i(K_13))
+		state_1 = x86._mm_sha256rnds2_epu32(state_1, state_0, msg)
+		tmp = x86._mm_alignr_epi8(msg_1, msg_0, 4)
+		msg_2 = x86._mm_add_epi32(msg_2, tmp)
+		msg_2 = x86._mm_sha256msg2_epu32(msg_2, msg_1)
+		msg = x86._mm_shuffle_epi32(msg, 0x0e)
+		state_0 = x86._mm_sha256rnds2_epu32(state_0, state_1, msg)
+
+		// Rounds 56-59
+		msg = x86._mm_add_epi32(msg_2, x86.__m128i(K_14))
+		state_1 = x86._mm_sha256rnds2_epu32(state_1, state_0, msg)
+		tmp = x86._mm_alignr_epi8(msg_2, msg_1, 4)
+		msg_3 = x86._mm_add_epi32(msg_3, tmp)
+		msg_3 = x86._mm_sha256msg2_epu32(msg_3, msg_2)
+		msg = x86._mm_shuffle_epi32(msg, 0x0e)
+		state_0 = x86._mm_sha256rnds2_epu32(state_0, state_1, msg)
+
+		// Rounds 60-63
+		msg = x86._mm_add_epi32(msg_3, x86.__m128i(K_15))
+		state_1 = x86._mm_sha256rnds2_epu32(state_1, state_0, msg)
+		msg = x86._mm_shuffle_epi32(msg, 0x0e)
+		state_0 = x86._mm_sha256rnds2_epu32(state_0, state_1, msg)
+
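+		// Merge the compressed block into the running state (the
+		// Davies-Meyer feed-forward).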
+		state_0 = x86._mm_add_epi32(state_0, state_0_save)
+		state_1 = x86._mm_add_epi32(state_1, state_1_save)
+
+		data = data[BLOCK_SIZE_256:]
+	}
+
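+	// Shuffle the working (ABEF, CDGH) pair back into the h[0..3] = ABCD,
+	// h[4..7] = EFGH memory order before storing.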
+	// Write back the updated state
+	tmp = x86._mm_shuffle_epi32(state_0, 0x1b) // FEBA
+	state_1 = x86._mm_shuffle_epi32(state_1, 0xb1) // DCHG
+	// state_0 = x86._mm_blend_epi16(tmp, state_1, 0xf0) // DCBA
+	state_0 = kludge_mm_blend_epi16_0xf0(tmp, state_1)
+	state_1 = x86._mm_alignr_epi8(state_1, tmp, 8) // HGFE
+
+	intrinsics.unaligned_store((^x86.__m128i)(&ctx.h[0]), state_0)
+	intrinsics.unaligned_store((^x86.__m128i)(&ctx.h[4]), state_1)
+}
+
+@(private = "file")
+kludge_mm_blend_epi16_0xf0 :: #force_inline proc "contextless" (a, b: x86.__m128i) -> x86.__m128i {
+	// HACK HACK HACK: LLVM got rid of `llvm.x86.sse41.pblendw`, so emulate
+	// `_mm_blend_epi16(a, b, 0xf0)` by taking the low 64 bits of `a` and
+	// the high 64 bits of `b`.
+	a_ := simd.to_array(a)
+	b_ := simd.to_array(b)
+	return x86.__m128i{a_[0], b_[1]}
+}