Răsfoiți Sursa

core:crypto/deoxysii: Initial import

Yawning Angel 10 luni în urmă
părinte
comite
b220df60b8

+ 17 - 0
core/crypto/aead/low_level.odin

@@ -4,6 +4,7 @@ import "core:crypto/aegis"
 import "core:crypto/aes"
 import "core:crypto/chacha20"
 import "core:crypto/chacha20poly1305"
+import "core:crypto/deoxysii"
 import "core:reflect"
 
 // Implementation is an AEAD implementation.  Most callers will not need
@@ -30,6 +31,7 @@ Algorithm :: enum {
 	AEGIS_128L_256, // AEGIS-128L (256-bit tag)
 	AEGIS_256,
 	AEGIS_256_256, // AEGIS-256 (256-bit tag)
+	DEOXYS_II_256,
 }
 
 // ALGORITHM_NAMES is the Algorithm to algorithm name string.
@@ -44,6 +46,7 @@ ALGORITHM_NAMES := [Algorithm]string {
 	.AEGIS_128L_256    = "AEGIS-128L-256",
 	.AEGIS_256         = "AEGIS-256",
 	.AEGIS_256_256     = "AEGIS-256-256",
+	.DEOXYS_II_256     = "Deoxys-II-256",
 }
 
 // TAG_SIZES is the Algorithm to tag size in bytes.
@@ -58,6 +61,7 @@ TAG_SIZES := [Algorithm]int {
 	.AEGIS_128L_256    = aegis.TAG_SIZE_256,
 	.AEGIS_256         = aegis.TAG_SIZE_128,
 	.AEGIS_256_256     = aegis.TAG_SIZE_256,
+	.DEOXYS_II_256     = deoxysii.TAG_SIZE,
 }
 
 // KEY_SIZES is the Algorithm to key size in bytes.
@@ -72,6 +76,7 @@ KEY_SIZES := [Algorithm]int {
 	.AEGIS_128L_256    = aegis.KEY_SIZE_128L,
 	.AEGIS_256         = aegis.KEY_SIZE_256,
 	.AEGIS_256_256     = aegis.KEY_SIZE_256,
+	.DEOXYS_II_256     = deoxysii.KEY_SIZE,
 }
 
 // IV_SIZES is the Algorithm to initialization vector size in bytes.
@@ -88,6 +93,7 @@ IV_SIZES := [Algorithm]int {
 	.AEGIS_128L_256    = aegis.IV_SIZE_128L,
 	.AEGIS_256         = aegis.IV_SIZE_256,
 	.AEGIS_256_256     = aegis.IV_SIZE_256,
+	.DEOXYS_II_256     = deoxysii.IV_SIZE,
 }
 
 // Context is a concrete instantiation of a specific AEAD algorithm.
@@ -97,6 +103,7 @@ Context :: struct {
 		aes.Context_GCM,
 		chacha20poly1305.Context,
 		aegis.Context,
+		deoxysii.Context,
 	},
 }
 
@@ -112,6 +119,7 @@ _IMPL_IDS := [Algorithm]typeid {
 	.AEGIS_128L_256    = typeid_of(aegis.Context),
 	.AEGIS_256         = typeid_of(aegis.Context),
 	.AEGIS_256_256     = typeid_of(aegis.Context),
+	.DEOXYS_II_256     = typeid_of(deoxysii.Context),
 }
 
 // init initializes a Context with a specific AEAD Algorithm.
@@ -142,6 +150,9 @@ init :: proc(ctx: ^Context, algorithm: Algorithm, key: []byte, impl: Implementat
 	case .AEGIS_128L, .AEGIS_128L_256, .AEGIS_256, .AEGIS_256_256:
 		impl_ := impl != nil ? impl.(aes.Implementation) : aes.DEFAULT_IMPLEMENTATION
 		aegis.init(&ctx._impl.(aegis.Context), key, impl_)
+	case .DEOXYS_II_256:
+		impl_ := impl != nil ? impl.(aes.Implementation) : aes.DEFAULT_IMPLEMENTATION
+		deoxysii.init(&ctx._impl.(deoxysii.Context), key, impl_)
 	case .Invalid:
 		panic("crypto/aead: uninitialized algorithm")
 	case:
@@ -167,6 +178,8 @@ seal_ctx :: proc(ctx: ^Context, dst, tag, iv, aad, plaintext: []byte) {
 		chacha20poly1305.seal(&impl, dst, tag, iv, aad, plaintext)
 	case aegis.Context:
 		aegis.seal(&impl, dst, tag, iv, aad, plaintext)
+	case deoxysii.Context:
+		deoxysii.seal(&impl, dst, tag, iv, aad, plaintext)
 	case:
 		panic("crypto/aead: uninitialized algorithm")
 	}
@@ -191,6 +204,8 @@ open_ctx :: proc(ctx: ^Context, dst, iv, aad, ciphertext, tag: []byte) -> bool {
 		return chacha20poly1305.open(&impl, dst, iv, aad, ciphertext, tag)
 	case aegis.Context:
 		return aegis.open(&impl, dst, iv, aad, ciphertext, tag)
+	case deoxysii.Context:
+		return deoxysii.open(&impl, dst, iv, aad, ciphertext, tag)
 	case:
 		panic("crypto/aead: uninitialized algorithm")
 	}
@@ -206,6 +221,8 @@ reset :: proc(ctx: ^Context) {
 		chacha20poly1305.reset(&impl)
 	case aegis.Context:
 		aegis.reset(&impl)
+	case deoxysii.Context:
+		deoxysii.reset(&impl)
 	case:
 		// Calling reset repeatedly is fine.
 	}

+ 295 - 0
core/crypto/deoxysii/deoxysii.odin

@@ -0,0 +1,295 @@
+/*
+package deoxysii implements the Deoxys-II-256 Authenticated Encryption
+with Additional Data algorithm.
+
+- [[ https://sites.google.com/view/deoxyscipher ]]
+- [[ https://thomaspeyrin.github.io/web/assets/docs/papers/Jean-etal-JoC2021.pdf ]]
+*/
+package deoxysii
+
+import "base:intrinsics"
+import "core:bytes"
+import "core:crypto/aes"
+import "core:mem"
+import "core:simd"
+
+// KEY_SIZE is the Deoxys-II-256 key size in bytes.
+KEY_SIZE :: 32
+// IV_SIZE is the Deoxys-II-256 IV size in bytes.
+IV_SIZE :: 15 // 120-bits
+// TAG_SIZE is the Deoxys-II-256 tag size in bytes.
+TAG_SIZE :: 16
+
+// Tweak domain-separation prefixes.  Each prefix is shifted left by
+// PREFIX_SHIFT when it is placed into the first byte of a tweak.
+
+// PREFIX_AD_BLOCK is the prefix used for full associated data blocks.
+@(private)
+PREFIX_AD_BLOCK :: 0b0010
+// PREFIX_AD_FINAL is the prefix used for the padded trailing
+// associated data block.
+@(private)
+PREFIX_AD_FINAL :: 0b0110
+// PREFIX_MSG_BLOCK is the prefix used for full message blocks.
+@(private)
+PREFIX_MSG_BLOCK :: 0b0000
+// PREFIX_MSG_FINAL is the prefix used for the padded trailing
+// message block.
+@(private)
+PREFIX_MSG_FINAL :: 0b0100
+// PREFIX_TAG is the prefix used for the final tag generation call.
+@(private)
+PREFIX_TAG :: 0b0001
+// PREFIX_SHIFT is the left shift applied to a prefix to position it
+// in the upper 4 bits of the first tweak byte.
+@(private)
+PREFIX_SHIFT :: 4
+
+// BC_ROUNDS is the number of block cipher rounds; BC_ROUNDS+1 subkeys
+// are stored per Context.
+@(private)
+BC_ROUNDS :: 16
+// BLOCK_SIZE is the block size in bytes (identical to the AES block size).
+@(private)
+BLOCK_SIZE :: aes.BLOCK_SIZE
+
+// _LFSR2_MASK selects bit 0 of every byte lane (the LFSR2 feedback bit).
+@(private = "file")
+_LFSR2_MASK :: simd.u8x16{
+	0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
+	0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
+}
+// _LFSR3_MASK selects bit 7 of every byte lane (the LFSR3 feedback bit).
+@(private = "file")
+_LFSR3_MASK :: simd.u8x16{
+	0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+	0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+}
+// _LFSR_SH1 is a per-lane shift count of 1 (same lane values as
+// _LFSR2_MASK, so the constant is reused).
+@(private = "file")
+_LFSR_SH1 :: _LFSR2_MASK
+// _LFSR_SH5 is a per-lane shift count of 5.
+@(private = "file")
+_LFSR_SH5 :: simd.u8x16{
+	0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05,
+	0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05,
+}
+// _LFSR_SH7 is a per-lane shift count of 7.
+@(private = "file")
+_LFSR_SH7 :: simd.u8x16{
+	0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07,
+	0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07,
+}
+// _RCONS holds one round constant byte per subkey (BC_ROUNDS+1 = 17
+// entries); it is indexed by round number in `rcon`.
+@(private = "file", rodata)
+_RCONS := []byte {
+	0x2f, 0x5e, 0xbc, 0x63, 0xc6, 0x97, 0x35, 0x6a,
+	0xd4, 0xb3, 0x7d, 0xfa, 0xef, 0xc5, 0x91, 0x39,
+	0x72,
+}
+
+// Context is a keyed Deoxys-II-256 instance.
+Context :: struct {
+	// Precomputed key-dependent component of each round's
+	// subtweakkey, filled in by `derive_ks` during `init`.
+	_subkeys:        [BC_ROUNDS+1][16]byte,
+	// Implementation selected at `init` (after hardware fallback).
+	_impl:           aes.Implementation,
+	// Set by `init`, cleared by `reset`; asserted by `seal`/`open`.
+	_is_initialized: bool,
+}
+
+// _validate_common_slice_sizes panics unless the tag and IV slices have
+// exactly the sizes mandated by Deoxys-II-256.  The tag is checked
+// before the IV.
+@(private)
+_validate_common_slice_sizes :: proc (ctx: ^Context, tag, iv, aad, text: []byte) {
+	switch {
+	case len(tag) != TAG_SIZE:
+		panic("crypto/deoxysii: invalid tag size")
+	case len(iv) != IV_SIZE:
+		panic("crypto/deoxysii: invalid IV size")
+	}
+
+	#assert(size_of(int) == 8 || size_of(int) <= 4)
+	// No explicit length check is made on the associated data or the
+	// message.  For the nonce-misuse resistant mode, the total size of
+	// the associated data and the total size of the message do not
+	// exceed `16 * 2^max_l * 2^max_m bytes`, thus 2^128 bytes for all
+	// variants of Deoxys-II. Moreover, the maximum number of messages
+	// that can be handled for a same key is 2^max_m, that is 2^64 for
+	// all variants of Deoxys.
+}
+
+// init initializes a Context with the provided key.
+//
+// The key MUST be exactly KEY_SIZE bytes, otherwise this panics.  If the
+// hardware implementation is requested but unavailable, the portable
+// implementation is used instead.
+init :: proc(ctx: ^Context, key: []byte, impl := aes.DEFAULT_IMPLEMENTATION) {
+	if len(key) != KEY_SIZE {
+		panic("crypto/deoxysii: invalid key size")
+	}
+
+	// Resolve the implementation that will actually be used.
+	selected := impl
+	if selected == .Hardware && !is_hardware_accelerated() {
+		selected = .Portable
+	}
+	ctx._impl = selected
+
+	derive_ks(ctx, key)
+
+	ctx._is_initialized = true
+}
+
+// seal encrypts the plaintext and authenticates the aad and ciphertext,
+// with the provided Context and iv, storing the ciphertext in dst and
+// the authentication tag in tag.
+//
+// dst and plaintext MUST alias exactly or not at all.
+seal :: proc(ctx: ^Context, dst, tag, iv, aad, plaintext: []byte) {
+	assert(ctx._is_initialized)
+
+	_validate_common_slice_sizes(ctx, tag, iv, aad, plaintext)
+	switch {
+	case len(dst) != len(plaintext):
+		panic("crypto/deoxysii: invalid destination ciphertext size")
+	case bytes.alias_inexactly(dst, plaintext):
+		panic("crypto/deoxysii: dst and plaintext alias inexactly")
+	}
+
+	// Dispatch to the implementation chosen at init time.
+	switch ctx._impl {
+	case .Portable:
+		e_ref(ctx, dst, tag, iv, aad, plaintext)
+	case .Hardware:
+		e_hw(ctx, dst, tag, iv, aad, plaintext)
+	}
+}
+
+// open authenticates the aad and ciphertext, and decrypts the ciphertext,
+// with the provided Context, iv, and tag, and stores the output in dst,
+// returning true iff the authentication was successful.  If authentication
+// fails, the destination buffer will be zeroed.
+//
+// dst and ciphertext MUST alias exactly or not at all.
+@(require_results)
+open :: proc(ctx: ^Context, dst, iv, aad, ciphertext, tag: []byte) -> bool {
+	assert(ctx._is_initialized)
+
+	_validate_common_slice_sizes(ctx, tag, iv, aad, ciphertext)
+	if len(dst) != len(ciphertext) {
+		panic("crypto/deoxysii: invalid destination plaintext size")
+	}
+	if bytes.alias_inexactly(dst, ciphertext) {
+		panic("crypto/deoxysii: dst and ciphertext alias inexactly")
+	}
+
+	// Dispatch to the implementation chosen at init time.
+	ok: bool
+	switch ctx._impl {
+	case .Hardware:
+		ok = d_hw(ctx, dst, iv, aad, ciphertext, tag)
+	case .Portable:
+		ok = d_ref(ctx, dst, iv, aad, ciphertext, tag)
+	}
+	if !ok {
+		// Do not release unauthenticated plaintext to the caller.
+		// len(dst) == len(ciphertext) was enforced above.
+		mem.zero_explicit(raw_data(dst), len(ciphertext))
+	}
+
+	return ok
+}
+
+// reset sanitizes the Context.  The Context must be
+// re-initialized to be used again.
+reset :: proc "contextless" (ctx: ^Context) {
+	// `_subkeys` is a `[BC_ROUNDS+1][16]byte`, so `len()` would only
+	// yield the number of subkeys (17).  `size_of()` is required to
+	// wipe every byte of the key-derived material.
+	mem.zero_explicit(&ctx._subkeys, size_of(ctx._subkeys))
+	ctx._is_initialized = false
+}
+
+@(private = "file")
+derive_ks :: proc "contextless" (ctx: ^Context, key: []byte) {
+	// Precompute the key-dependent component of each subtweakkey.
+	//
+	// The key schedule is as thus:
+	//
+	//   STK_i = TK1_i ^ TK2_i ^ TK3_i ^ RC_i
+	//
+	//   TK1_i = h(TK1_(i-1))
+	//   TK2_i = h(LFSR2(TK2_(i-1)))
+	//   TK3_i = h(LFSR3(TK3_(i-1)))
+	//
+	// where:
+	//
+	//   KT = K || T
+	//   W3 = KT[:16]
+	//   W2 = KT[16:32]
+	//   W1 = KT[32:]
+	//
+	//   TK1_0 = W1
+	//   TK2_0 = W2
+	//   TK3_0 = W3
+	//
+	// As `K` is fixed per Context, the XORs of `TK3_0 .. TK3_n`,
+	// `TK2_0 .. TK2_n` and RC_i can be precomputed in advance like
+	// thus:
+	//
+	//   subkey_i = TK3_i ^ TK2_i ^ RC_i
+	//
+	// When it is time to actually call Deoxys-BC-384, it is then
+	// a simple matter of deriving each round subtweakkey via:
+	//
+	//   TK1_0 = T (Tweak)
+	//   STK_0 = subkey_0 ^ TK1_0
+	//   STK_i = subkey_i (precomputed) ^ H(TK1_(i-1))
+	//
+	// We opt to use SIMD here and for the subtweakkey derivation
+	// as `H()` is typically a single vector instruction.
+
+	tk3 := intrinsics.unaligned_load((^simd.u8x16)(raw_data(key)))
+	tk2 := intrinsics.unaligned_load((^simd.u8x16)(raw_data(key[16:])))
+
+	// Precompute k_0 .. k_16.
+	for round in 0 ..= BC_ROUNDS {
+		// subkey_0 does not apply LFSR2/3 or H.
+		if round != 0 {
+			tk2 = h(lfsr2(tk2))
+			tk3 = h(lfsr3(tk3))
+		}
+
+		subkey := simd.bit_xor(tk2, simd.bit_xor(tk3, rcon(round)))
+		intrinsics.unaligned_store((^simd.u8x16)(&ctx._subkeys[round]), subkey)
+	}
+}
+
+@(private = "file")
+lfsr2 :: #force_inline proc "contextless" (tk: simd.u8x16) -> simd.u8x16 {
+	// LFSR2 applies the following LFSR to every byte of the input:
+	// (x7||x6||x5||x4||x3||x2||x1||x0) -> (x6||x5||x4||x3||x2||x1||x0||x7 ^ x5)
+	shifted := simd.shl(tk, _LFSR_SH1)
+
+	x7 := simd.shr(tk, _LFSR_SH7)
+	x5 := simd.shr(tk, _LFSR_SH5)
+	// Only bit 0 of (x7 ^ x5) is the feedback bit.
+	feedback := simd.bit_and(simd.bit_xor(x7, x5), _LFSR2_MASK)
+
+	return simd.bit_or(shifted, feedback)
+}
+
+@(private = "file")
+lfsr3 :: #force_inline proc "contextless"  (tk: simd.u8x16) -> simd.u8x16 {
+	// LFSR3 applies the following LFSR to every byte of the input:
+	// (x7||x6||x5||x4||x3||x2||x1||x0) -> (x0 ^ x6||x7||x6||x5||x4||x3||x2||x1)
+	shifted := simd.shr(tk, _LFSR_SH1)
+
+	x0 := simd.shl(tk, _LFSR_SH7)
+	x6 := simd.shl(tk, _LFSR_SH1)
+	// Only bit 7 of (x0 ^ x6) is the feedback bit.
+	feedback := simd.bit_and(simd.bit_xor(x0, x6), _LFSR3_MASK)
+
+	return simd.bit_or(shifted, feedback)
+}
+
+// h applies the key schedule's byte permutation (`h`/`H()` in the
+// `derive_ks` commentary) to a 16-byte tweakey word, implemented as a
+// single swizzle with a fixed index table.
+@(private)
+h :: #force_inline proc "contextless" (tk: simd.u8x16) -> simd.u8x16 {
+	return simd.swizzle(
+		tk,
+		0x01, 0x06, 0x0b, 0x0c, 0x05, 0x0a, 0x0f, 0x00,
+		0x09, 0x0e, 0x03, 0x04, 0x0d, 0x02, 0x07, 0x08,
+	)
+}
+
+// rcon returns the round constant vector for round `rd`: bytes 0..3
+// are the fixed values 1, 2, 4, 8, bytes 4..7 are `_RCONS[rd]`, and the
+// remaining bytes are zero.
+//
+// Bounds checking is disabled, so the caller must pass a round number
+// that is a valid index into `_RCONS`.
+@(private = "file")
+rcon :: #force_inline proc "contextless" (rd: int) -> simd.u8x16 #no_bounds_check {
+	rc := _RCONS[rd]
+	return simd.u8x16{
+		1, 2, 4, 8,
+		rc, rc, rc, rc,
+		0, 0, 0, 0,
+		0, 0, 0, 0,
+	}
+}

+ 399 - 0
core/crypto/deoxysii/deoxysii_impl_ct64.odin

@@ -0,0 +1,399 @@
+package deoxysii
+
+import "base:intrinsics"
+import "core:crypto"
+import aes "core:crypto/_aes/ct64"
+import "core:encoding/endian"
+import "core:mem"
+import "core:simd"
+
+// This uses the bitsliced 64-bit general purpose register SWAR AES
+// round function.  The encryption pass skips orthogonalizing the
+// AES round function input as it is always going to be the leading 0
+// padded IV, and doing a 64-byte copy is faster.
+
+// TWEAK_SIZE is the size of a tweak in bytes.
+@(private = "file")
+TWEAK_SIZE :: 16
+
+// State_SW is the working state shared by the portable helpers.
+@(private = "file")
+State_SW :: struct {
+	// The keyed instance supplying the precomputed subkeys.
+	ctx:        ^Context,
+	// q_stk holds the current round's subtweakkeys and q_b the block
+	// cipher state, both for up to 4 blocks in the representation
+	// used by the ct64 AES helpers.
+	q_stk, q_b: [8]u64,
+}
+
+// auth_tweak writes the authentication tweak for block `block_nr` to
+// dst: the shifted prefix in byte 0, zeroes in bytes 1..7, and the
+// big-endian block number in bytes 8..15.
+@(private = "file")
+auth_tweak :: #force_inline proc "contextless" (
+	dst: ^[TWEAK_SIZE]byte,
+	prefix: byte,
+	block_nr: int,
+) {
+	// The little-endian store places the shifted prefix in dst[0]
+	// and clears dst[1..7] in one go.
+	prefix_word := u64(prefix) << PREFIX_SHIFT
+	endian.unchecked_put_u64le(dst[0:], prefix_word)
+
+	counter := u64(block_nr)
+	endian.unchecked_put_u64be(dst[8:], counter)
+}
+
+// enc_tweak writes the encryption tweak for block `block_nr` to dst:
+// the tag with the top bit of byte 0 set, and the big-endian block
+// number XORed into bytes 8..15.
+@(private = "file")
+enc_tweak :: #force_inline proc "contextless" (
+	dst: ^[TWEAK_SIZE]byte,
+	tag: ^[TAG_SIZE]byte,
+	block_nr: int,
+) {
+	counter_be: [8]byte
+	endian.unchecked_put_u64be(counter_be[:], u64(block_nr))
+
+	copy(dst[:], tag[:])
+	dst[0] |= 0x80
+	for b, i in counter_be {
+		dst[8+i] ~= b
+	}
+}
+
+// enc_plaintext prepares the block cipher input used by the encryption
+// pass: a zero byte followed by the IV, replicated into all 4 block
+// slots of dst and then orthogonalized.
+@(private = "file")
+enc_plaintext :: #force_inline proc "contextless" (
+	dst: ^[8]u64,
+	iv:  []byte,
+) {
+	block: [BLOCK_SIZE]byte = ---
+	block[0] = 0
+	copy(block[1:], iv[:])
+
+	lo, hi := aes.load_interleaved(block[:])
+	for slot in 0 ..< 4 {
+		dst[slot] = lo
+		dst[slot+4] = hi
+	}
+	aes.orthogonalize(dst)
+}
+
+// bc_x4 runs the block cipher over up to 4 blocks at once.
+//
+// - dst receives the `n` output blocks; it is also used as scratch
+//   space while the round subtweakkeys are being loaded.
+// - tweaks holds one tweak per block (only the first `n` are read).
+// - q_stk is scratch storage for the per-round subtweakkeys.
+// - q_b is the already orthogonalized input state, updated in place.
+// - n is the number of blocks actually processed (at most 4).
+@(private = "file")
+bc_x4 :: proc "contextless" (
+	ctx:     ^Context,
+	dst:     []byte,
+	tweaks:  ^[4][TWEAK_SIZE]byte,
+	q_stk:   ^[8]u64,
+	q_b:     ^[8]u64, // Orthogonalized
+	n:       int,
+) {
+	tk1s: [4]simd.u8x16
+	for j in 0 ..< n {
+		tk1s[j] = intrinsics.unaligned_load((^simd.u8x16)(&tweaks[j]))
+	}
+
+	// Deoxys-BC-384
+	for i in 0 ..= BC_ROUNDS {
+		// Derive the round's subtweakkey
+		sk := intrinsics.unaligned_load((^simd.u8x16)(&ctx._subkeys[i]))
+		for j in 0 ..< n {
+			// Round 0 uses the tweak as-is, every later round
+			// permutes the previous round's value.
+			if i != 0 {
+				tk1s[j] = h(tk1s[j])
+			}
+			// Round-trip through dst to get the subtweakkey into
+			// the interleaved representation.
+			intrinsics.unaligned_store(
+				(^simd.u8x16)(raw_data(dst)),
+				simd.bit_xor(sk, tk1s[j]),
+			)
+			q_stk[j], q_stk[j+4] = aes.load_interleaved(dst[:])
+		}
+		aes.orthogonalize(q_stk)
+
+		// Round 0 is only the subtweakkey addition.
+		if i != 0 {
+			aes.sub_bytes(q_b)
+			aes.shift_rows(q_b)
+			aes.mix_columns(q_b)
+		}
+		aes.add_round_key(q_b, q_stk[:])
+	}
+
+	// Convert the state back and write out the processed blocks.
+	aes.orthogonalize(q_b)
+	for i in 0 ..< n {
+		aes.store_interleaved(dst[i*BLOCK_SIZE:], q_b[i], q_b[i+4])
+	}
+}
+
+// bc_absorb encrypts every full block of src under an authentication
+// tweak built from `tweak_prefix` and a running block number starting
+// at `stk_block_nr`, and XORs each result into the 16-byte accumulator
+// at dst.  Trailing bytes of src that do not fill a block are ignored.
+//
+// Returns the block number following the last block processed.
+@(private = "file", require_results)
+bc_absorb :: proc "contextless" (
+	st:           ^State_SW,
+	dst:          []byte,
+	src:          []byte,
+	tweak_prefix: byte,
+	stk_block_nr: int,
+) -> int {
+	tweaks: [4][TWEAK_SIZE]byte = ---
+	tmp: [BLOCK_SIZE*4]byte = ---
+
+	src, stk_block_nr := src, stk_block_nr
+	// Work on a local copy of the accumulator, written back at the end.
+	dst_ := intrinsics.unaligned_load((^simd.u8x16)(raw_data(dst)))
+
+	nr_blocks := len(src) / BLOCK_SIZE
+	for nr_blocks > 0 {
+		// Derive the tweak(s), orthogonalize the plaintext
+		n := min(nr_blocks, 4)
+		for i in 0 ..< n {
+			auth_tweak(&tweaks[i], tweak_prefix, stk_block_nr + i)
+			st.q_b[i], st.q_b[i + 4] = aes.load_interleaved(src)
+			src = src[BLOCK_SIZE:]
+		}
+		aes.orthogonalize(&st.q_b)
+
+		// Deoxys-BC-384
+		bc_x4(st.ctx, tmp[:], &tweaks, &st.q_stk, &st.q_b, n)
+
+		// XOR in the existing Auth/tag
+		for i in 0 ..< n {
+			dst_ = simd.bit_xor(
+				dst_,
+				intrinsics.unaligned_load((^simd.u8x16)(raw_data(tmp[i*BLOCK_SIZE:]))),
+			)
+		}
+
+		stk_block_nr += n
+		nr_blocks -= n
+	}
+
+	intrinsics.unaligned_store((^simd.u8x16)(raw_data(dst)), dst_)
+
+	// Sanitize the temporaries.
+	mem.zero_explicit(&tweaks, size_of(tweaks))
+	mem.zero_explicit(&tmp, size_of(tmp))
+
+	return stk_block_nr
+}
+
+// bc_final finalizes the tag in place: the 16 bytes at dst are
+// encrypted under the tweak `(PREFIX_TAG << PREFIX_SHIFT) || iv`
+// and the result is written back to dst.
+@(private = "file")
+bc_final :: proc "contextless" (
+	st:  ^State_SW,
+	dst: []byte,
+	iv:  []byte,
+) {
+	// Only tweaks[0] is initialized, as a single block is processed.
+	tweaks: [4][TWEAK_SIZE]byte = ---
+
+	tweaks[0][0] = PREFIX_TAG << PREFIX_SHIFT
+	copy(tweaks[0][1:], iv)
+
+	st.q_b[0], st.q_b[4] = aes.load_interleaved(dst)
+	aes.orthogonalize(&st.q_b)
+
+	bc_x4(st.ctx, dst, &tweaks, &st.q_stk, &st.q_b, 1)
+}
+
+// bc_encrypt XORs every full block of src with a keystream block and
+// writes the result to dst.  Each keystream block is the encryption of
+// the pre-orthogonalized input `q_n` under an encryption tweak derived
+// from `tweak_tag` and a running block number starting at
+// `stk_block_nr`.  Trailing bytes of src that do not fill a block are
+// ignored.
+//
+// Returns the block number following the last block processed.
+@(private = "file", require_results)
+bc_encrypt :: proc "contextless" (
+	st:           ^State_SW,
+	dst:          []byte,
+	src:          []byte,
+	q_n:          ^[8]u64, // Orthogonalized
+	tweak_tag:    ^[TAG_SIZE]byte,
+	stk_block_nr: int,
+) -> int {
+	tweaks: [4][TWEAK_SIZE]byte = ---
+	tmp: [BLOCK_SIZE*4]byte = ---
+
+	dst, src, stk_block_nr := dst, src, stk_block_nr
+
+	nr_blocks := len(src) / BLOCK_SIZE
+	for nr_blocks > 0 {
+		// Derive the tweak(s)
+		n := min(nr_blocks, 4)
+		for i in 0 ..< n {
+			enc_tweak(&tweaks[i], tweak_tag, stk_block_nr + i)
+		}
+		st.q_b = q_n^ // The plaintext is always `0^8 || N`
+
+		// Deoxys-BC-384
+		bc_x4(st.ctx, tmp[:], &tweaks, &st.q_stk, &st.q_b, n)
+
+		// XOR the ciphertext
+		for i in 0 ..< n {
+			intrinsics.unaligned_store(
+				(^simd.u8x16)(raw_data(dst[i*BLOCK_SIZE:])),
+				simd.bit_xor(
+					intrinsics.unaligned_load((^simd.u8x16)(raw_data(src[i*BLOCK_SIZE:]))),
+					intrinsics.unaligned_load((^simd.u8x16)(raw_data(tmp[i*BLOCK_SIZE:]))),
+				),
+			)
+		}
+
+		dst, src = dst[n*BLOCK_SIZE:], src[n*BLOCK_SIZE:]
+		stk_block_nr += n
+		nr_blocks -= n
+	}
+
+	// Sanitize the temporaries.
+	mem.zero_explicit(&tweaks, size_of(tweaks))
+	mem.zero_explicit(&tmp, size_of(tmp))
+
+	return stk_block_nr
+}
+
+// e_ref is the portable implementation of `seal`: it authenticates aad
+// and plaintext, writes the ciphertext to dst and the tag to tag.
+//
+// Slice sizes are expected to have been validated by the caller, as
+// bounds checking is disabled.
+@(private)
+e_ref :: proc "contextless" (ctx: ^Context, dst, tag, iv, aad, plaintext: []byte) #no_bounds_check {
+	st: State_SW = ---
+	st.ctx = ctx
+
+	// Algorithm 3
+	//
+	// Associated data
+	// A_1 || ... || A_la || A_∗ <- A where each |A_i| = n and |A_∗| < n
+	// Auth <- 0^n
+	// for i = 0 to la − 1 do
+	//   Auth <- Auth ^ EK(0010 || i, A_i+1)
+	// end
+	// if A_∗ != nil then
+	//   Auth <- Auth ^ EK(0110 || la, pad10∗(A_∗))
+	// end
+	auth: [TAG_SIZE]byte
+	aad := aad
+	n := bc_absorb(&st, auth[:], aad, PREFIX_AD_BLOCK, 0)
+	aad = aad[n*BLOCK_SIZE:]
+	if l := len(aad); l > 0 {
+		a_star: [BLOCK_SIZE]byte
+
+		copy(a_star[:], aad)
+		a_star[l] = 0x80
+
+		_ = bc_absorb(&st, auth[:], a_star[:], PREFIX_AD_FINAL, n)
+	}
+
+	// Message authentication and tag generation
+	// M_1 || ... || M_l || M_∗ <- M where each |M_j| = n and |M_∗| < n
+	// tag <- Auth
+	// for j = 0 to l − 1 do
+	//   tag <- tag ^ EK(0000 || j, M_j+1)
+	// end
+	// if M_∗ != nil then
+	//   tag <- tag ^ EK(0100 || l, pad10∗(M_∗))
+	// end
+	// tag <- EK(0001 || 0^4 || N, tag)
+	m := plaintext
+	n = bc_absorb(&st, auth[:], m, PREFIX_MSG_BLOCK, 0)
+	m = m[n*BLOCK_SIZE:]
+	if l := len(m); l > 0 {
+		m_star: [BLOCK_SIZE]byte
+
+		copy(m_star[:], m)
+		m_star[l] = 0x80
+
+		_ = bc_absorb(&st, auth[:], m_star[:], PREFIX_MSG_FINAL, n)
+
+		// m_star holds a copy of the trailing plaintext, wipe it
+		// just like the encryption pass below does.
+		mem.zero_explicit(&m_star, size_of(m_star))
+	}
+	bc_final(&st, auth[:], iv)
+
+	// Message encryption
+	// for j = 0 to l − 1 do
+	//   C_j <- M_j ^ EK(1 || tag ^ j, 0^8 || N)
+	// end
+	// if M_∗ != nil then
+	//   C_∗ <- M_* ^ EK(1 || tag ^ l, 0^8 || N)
+	// end
+	//
+	// return (C_1 || ... || C_l || C_∗, tag)
+	q_iv: [8]u64 = ---
+	enc_plaintext(&q_iv, iv)
+
+	m = plaintext
+	n = bc_encrypt(&st, dst, m, &q_iv, &auth, 0)
+	m = m[n*BLOCK_SIZE:]
+	if l := len(m); l > 0 {
+		m_star: [BLOCK_SIZE]byte
+
+		// Process the partial block via a full-size temporary, and
+		// only copy out as many bytes as dst has left.
+		copy(m_star[:], m)
+		_ = bc_encrypt(&st, m_star[:], m_star[:], &q_iv, &auth, n)
+
+		copy(dst[n*BLOCK_SIZE:], m_star[:])
+
+		mem.zero_explicit(&m_star, size_of(m_star))
+	}
+
+	copy(tag, auth[:])
+
+	mem.zero_explicit(&st.q_stk, size_of(st.q_stk))
+	mem.zero_explicit(&st.q_b, size_of(st.q_b))
+}
+
+// d_ref is the portable implementation of `open`: it decrypts
+// ciphertext into dst using the supplied tag, recomputes the tag over
+// aad and the decrypted data, and returns true iff the recomputed tag
+// matches.  dst is written regardless of the result; clearing it on
+// failure is left to the caller.
+@(private, require_results)
+d_ref :: proc "contextless" (ctx: ^Context, dst, iv, aad, ciphertext, tag: []byte) -> bool {
+	st: State_SW = ---
+	st.ctx = ctx
+
+	// Algorithm 4
+	//
+	// Message decryption
+	// C_1 || ... || C_l || C_∗ <- C where each |C_j| = n and |C_∗| < n
+	// for j = 0 to l − 1 do
+	//   M_j <- C_j ^ EK(1 || tag ^ j, 0^8 || N)
+	// end
+	// if C_∗ != nil then
+	//   M_∗ <- C_∗ ^ EK(1 || tag ^ l, 0^8 || N)
+	// end
+	q_iv: [8]u64 = ---
+	enc_plaintext(&q_iv, iv)
+
+	// The received tag is used to derive the decryption tweaks.
+	auth: [TAG_SIZE]byte
+	copy(auth[:], tag)
+
+	m := ciphertext
+	n := bc_encrypt(&st, dst, m, &q_iv, &auth, 0)
+	m = m[n*BLOCK_SIZE:]
+	if l := len(m); l > 0 {
+		m_star: [BLOCK_SIZE]byte
+
+		copy(m_star[:], m)
+		_ = bc_encrypt(&st, m_star[:], m_star[:], &q_iv, &auth, n)
+
+		copy(dst[n*BLOCK_SIZE:], m_star[:])
+
+		mem.zero_explicit(&m_star, size_of(m_star))
+	}
+
+	// Associated data
+	// A_1 || ... || Al_a || A_∗ <- A where each |Ai_| = n and |A_∗| < n
+	// Auth <- 0
+	// for i = 0 to la − 1 do
+	//   Auth <- Auth ^ EK(0010 || i, A_i+1)
+	// end
+	// if A∗ != nil then
+	//   Auth <- Auth ^ EK(0110| | l_a, pad10∗(A_∗))
+	// end
+	auth = 0
+	aad := aad
+	n = bc_absorb(&st, auth[:], aad, PREFIX_AD_BLOCK, 0)
+	aad = aad[n*BLOCK_SIZE:]
+	if l := len(aad); l > 0 {
+		a_star: [BLOCK_SIZE]byte
+
+		copy(a_star[:], aad)
+		a_star[l] = 0x80
+
+		_ = bc_absorb(&st, auth[:], a_star[:], PREFIX_AD_FINAL, n)
+	}
+
+	// Message authentication and tag generation
+	// M_1 || ... || M_l || M_∗ <- M where each |M_j| = n and |M_∗| < n
+	// tag0 <- Auth
+	// for j = 0 to l − 1 do
+	//   tag0 <- tag0 ^ EK(0000 || j, M_j+1)
+	// end
+	// if M_∗ != nil then
+	//   tag0 <- tag0 ^ EK(0100 || l, pad10∗(M_∗))
+	// end
+	// tag0 <- EK(0001 || 0^4 || N, tag0)
+	// The tag is recomputed over the freshly decrypted data in dst.
+	m = dst[:len(ciphertext)]
+	n = bc_absorb(&st, auth[:], m, PREFIX_MSG_BLOCK, 0)
+	m = m[n*BLOCK_SIZE:]
+	if l := len(m); l > 0 {
+		m_star: [BLOCK_SIZE]byte
+
+		copy(m_star[:], m)
+		m_star[l] = 0x80
+
+		_ = bc_absorb(&st, auth[:], m_star[:], PREFIX_MSG_FINAL, n)
+
+		mem.zero_explicit(&m_star, size_of(m_star))
+	}
+	bc_final(&st, auth[:], iv)
+
+	// Tag verification
+	// if tag0 = tag then return (M_1 || ... || M_l || M_∗)
+	// else return false
+	ok := crypto.compare_constant_time(auth[:], tag) == 1
+
+	// Sanitize the recomputed tag and the working state.
+	mem.zero_explicit(&auth, size_of(auth))
+	mem.zero_explicit(&st.q_stk, size_of(st.q_stk))
+	mem.zero_explicit(&st.q_b, size_of(st.q_b))
+
+	return ok
+}

+ 21 - 0
core/crypto/deoxysii/deoxysii_impl_hw_gen.odin

@@ -0,0 +1,21 @@
+#+build !amd64
+package deoxysii
+
+@(private = "file")
+ERR_HW_NOT_SUPPORTED :: "crypto/deoxysii: hardware implementation unsupported"
+
+// is_hardware_accelerated returns true iff hardware accelerated Deoxys-II
+// is supported.
+//
+// This file is built for every target other than amd64 (see the build
+// tag), where no hardware implementation is provided, so this always
+// returns false.
+is_hardware_accelerated :: proc "contextless" () -> bool {
+	return false
+}
+
+// e_hw is the stub for the hardware `seal` implementation on targets
+// without one; it unconditionally panics.
+@(private)
+e_hw :: proc "contextless" (ctx: ^Context, dst, tag, iv, aad, plaintext: []byte) #no_bounds_check {
+	panic_contextless(ERR_HW_NOT_SUPPORTED)
+}
+
+// d_hw is the stub for the hardware `open` implementation on targets
+// without one; it unconditionally panics and never returns a value.
+@(private, require_results)
+d_hw :: proc "contextless" (ctx: ^Context, dst, iv, aad, ciphertext, tag: []byte) -> bool {
+	panic_contextless(ERR_HW_NOT_SUPPORTED)
+}

+ 434 - 0
core/crypto/deoxysii/deoxysii_impl_hw_intel.odin

@@ -0,0 +1,434 @@
+#+build amd64
+package deoxysii
+
+import "base:intrinsics"
+import "core:crypto"
+import "core:crypto/aes"
+import "core:mem"
+import "core:simd"
+import "core:simd/x86"
+
+// This processes a maximum of 4 blocks at a time, as that is suitable
+// for most current hardware that doesn't say "Xeon".
+
+// Tweak constants as 128-bit vectors: the first 64-bit lane carries the
+// flag/prefix value, the second lane is zero.
+// NOTE(review): with x86's little-endian lane layout the low byte of
+// lane 0 should be byte 0 of the tweak, matching the portable code -
+// confirm.
+
+// _BIT_ENC is ORed into the tag to form an encryption tweak.
+@(private = "file")
+_BIT_ENC :: x86.__m128i{0x80, 0}
+// Shifted prefixes for the authentication tweaks.
+@(private = "file")
+_PREFIX_AD_BLOCK :: x86.__m128i{PREFIX_AD_BLOCK << PREFIX_SHIFT, 0}
+@(private = "file")
+_PREFIX_AD_FINAL :: x86.__m128i{PREFIX_AD_FINAL << PREFIX_SHIFT, 0}
+@(private = "file")
+_PREFIX_MSG_BLOCK :: x86.__m128i{PREFIX_MSG_BLOCK << PREFIX_SHIFT, 0}
+@(private = "file")
+_PREFIX_MSG_FINAL :: x86.__m128i{PREFIX_MSG_FINAL << PREFIX_SHIFT, 0}
+
+// is_hardware_accelerated returns true iff hardware accelerated Deoxys-II
+// is supported.
+//
+// On amd64 this defers to the AES package's hardware support check.
+is_hardware_accelerated :: proc "contextless" () -> bool {
+	return aes.is_hardware_accelerated()
+}
+
+// auth_tweak returns the authentication tweak for block `block_nr`:
+// `prefix` with its second 64-bit lane replaced by the byte-swapped
+// block number.
+@(private = "file", enable_target_feature = "sse4.1", require_results)
+auth_tweak :: #force_inline proc "contextless" (
+	prefix:   x86.__m128i,
+	block_nr: int,
+) -> x86.__m128i {
+	counter_be := i64(intrinsics.byte_swap(u64(block_nr)))
+	return x86._mm_insert_epi64(prefix, counter_be, 1)
+}
+
+// enc_tweak returns the encryption tweak for block `block_nr`: the tag
+// ORed with `_BIT_ENC`, with the byte-swapped block number XORed into
+// the second 64-bit lane.
+@(private = "file", enable_target_feature = "sse2", require_results)
+enc_tweak :: #force_inline proc "contextless" (
+	tag:      x86.__m128i,
+	block_nr: int,
+) -> x86.__m128i {
+	counter := x86.__m128i{0, i64(intrinsics.byte_swap(u64(block_nr)))}
+	flagged_tag := x86._mm_or_si128(tag, _BIT_ENC)
+	return x86._mm_xor_si128(flagged_tag, counter)
+}
+
+// h_ applies the shared `h` byte permutation to a `__m128i`, by
+// reinterpreting it as 16 byte lanes and back.
+@(private = "file", enable_target_feature = "ssse3", require_results)
+h_ :: #force_inline proc "contextless" (tk1: x86.__m128i) -> x86.__m128i {
+	lanes := transmute(simd.u8x16)tk1
+	permuted := h(lanes)
+	return transmute(x86.__m128i)permuted
+}
+
+// bc_x4 encrypts 4 blocks (s_0 .. s_3) in parallel, each under its own
+// tweak (tweak_0 .. tweak_3), and returns the 4 resulting blocks.
+@(private = "file", enable_target_feature = "sse2,ssse3,aes", require_results)
+bc_x4 :: #force_inline proc "contextless" (
+	ctx: ^Context,
+	s_0, s_1, s_2, s_3:                 x86.__m128i,
+	tweak_0, tweak_1, tweak_2, tweak_3: x86.__m128i,
+) -> (x86.__m128i, x86.__m128i, x86.__m128i, x86.__m128i) #no_bounds_check {
+	s_0, s_1, s_2, s_3 := s_0, s_1, s_2, s_3
+	tk1_0, tk1_1, tk1_2, tk1_3 := tweak_0, tweak_1, tweak_2, tweak_3
+
+	// Round 0: subtweakkey = precomputed subkey ^ unpermuted tweak,
+	// XORed directly into the state.
+	sk := intrinsics.unaligned_load((^x86.__m128i)(&ctx._subkeys[0]))
+	stk_0 := x86._mm_xor_si128(tk1_0, sk)
+	stk_1 := x86._mm_xor_si128(tk1_1, sk)
+	stk_2 := x86._mm_xor_si128(tk1_2, sk)
+	stk_3 := x86._mm_xor_si128(tk1_3, sk)
+
+	s_0 = x86._mm_xor_si128(s_0, stk_0)
+	s_1 = x86._mm_xor_si128(s_1, stk_1)
+	s_2 = x86._mm_xor_si128(s_2, stk_2)
+	s_3 = x86._mm_xor_si128(s_3, stk_3)
+
+	// Rounds 1 .. BC_ROUNDS: permute the tweak component, derive the
+	// subtweakkey, and apply one AES encryption round.
+	for i in 1 ..= BC_ROUNDS {
+		sk = intrinsics.unaligned_load((^x86.__m128i)(&ctx._subkeys[i]))
+
+		tk1_0 = h_(tk1_0)
+		tk1_1 = h_(tk1_1)
+		tk1_2 = h_(tk1_2)
+		tk1_3 = h_(tk1_3)
+
+		stk_0 = x86._mm_xor_si128(tk1_0, sk)
+		stk_1 = x86._mm_xor_si128(tk1_1, sk)
+		stk_2 = x86._mm_xor_si128(tk1_2, sk)
+		stk_3 = x86._mm_xor_si128(tk1_3, sk)
+
+		s_0 = x86._mm_aesenc_si128(s_0, stk_0)
+		s_1 = x86._mm_aesenc_si128(s_1, stk_1)
+		s_2 = x86._mm_aesenc_si128(s_2, stk_2)
+		s_3 = x86._mm_aesenc_si128(s_3, stk_3)
+	}
+
+	return s_0, s_1, s_2, s_3
+}
+
+// bc_x1 encrypts the single block `s` under `tweak` and returns the
+// result.  It is the one-block counterpart of `bc_x4`.
+@(private = "file", enable_target_feature = "sse2,ssse3,aes", require_results)
+bc_x1 :: #force_inline proc "contextless" (
+	ctx:   ^Context,
+	s:     x86.__m128i,
+	tweak: x86.__m128i,
+) -> x86.__m128i #no_bounds_check {
+	s, tk1 := s, tweak
+
+	// Round 0: subtweakkey = precomputed subkey ^ unpermuted tweak,
+	// XORed directly into the state.
+	sk := intrinsics.unaligned_load((^x86.__m128i)(&ctx._subkeys[0]))
+	stk := x86._mm_xor_si128(tk1, sk)
+
+	s = x86._mm_xor_si128(s, stk)
+
+	// Rounds 1 .. BC_ROUNDS: permute the tweak component, derive the
+	// subtweakkey, and apply one AES encryption round.
+	for i in 1 ..= BC_ROUNDS {
+		sk = intrinsics.unaligned_load((^x86.__m128i)(&ctx._subkeys[i]))
+
+		tk1 = h_(tk1)
+
+		stk = x86._mm_xor_si128(tk1, sk)
+
+		s = x86._mm_aesenc_si128(s, stk)
+	}
+
+	return s
+}
+
+// bc_absorb encrypts every full block of src under an authentication
+// tweak built from `tweak_prefix` and a running block number starting
+// at `stk_block_nr`, XORing each result into `tag`.  Trailing bytes of
+// src that do not fill a block are ignored.
+//
+// Returns the updated tag and the block number following the last
+// block processed.
+@(private = "file", enable_target_feature = "sse2,ssse3,sse4.1,aes", require_results)
+bc_absorb :: proc "contextless" (
+	ctx:          ^Context,
+	tag:          x86.__m128i,
+	src:          []byte,
+	tweak_prefix: x86.__m128i,
+	stk_block_nr: int,
+) -> (x86.__m128i, int) #no_bounds_check {
+	src, stk_block_nr, tag := src, stk_block_nr, tag
+
+	nr_blocks := len(src) / BLOCK_SIZE
+	// Bulk path: 4 blocks per iteration.
+	for nr_blocks >= 4 {
+		d_0, d_1, d_2, d_3 := bc_x4(
+			ctx,
+			intrinsics.unaligned_load((^x86.__m128i)(raw_data(src))),
+			intrinsics.unaligned_load((^x86.__m128i)(raw_data(src[BLOCK_SIZE:]))),
+			intrinsics.unaligned_load((^x86.__m128i)(raw_data(src[2*BLOCK_SIZE:]))),
+			intrinsics.unaligned_load((^x86.__m128i)(raw_data(src[3*BLOCK_SIZE:]))),
+			auth_tweak(tweak_prefix, stk_block_nr),
+			auth_tweak(tweak_prefix, stk_block_nr + 1),
+			auth_tweak(tweak_prefix, stk_block_nr + 2),
+			auth_tweak(tweak_prefix, stk_block_nr + 3),
+		)
+
+		tag = x86._mm_xor_si128(tag, d_0)
+		tag = x86._mm_xor_si128(tag, d_1)
+		tag = x86._mm_xor_si128(tag, d_2)
+		tag = x86._mm_xor_si128(tag, d_3)
+
+		src = src[4*BLOCK_SIZE:]
+		stk_block_nr += 4
+		nr_blocks -= 4
+	}
+
+	// Remaining (at most 3) full blocks, one at a time.
+	for nr_blocks > 0 {
+		d := bc_x1(
+			ctx,
+			intrinsics.unaligned_load((^x86.__m128i)(raw_data(src))),
+			auth_tweak(tweak_prefix, stk_block_nr),
+		)
+
+		tag = x86._mm_xor_si128(tag, d)
+
+		src = src[BLOCK_SIZE:]
+		stk_block_nr += 1
+		nr_blocks -= 1
+	}
+
+	return tag, stk_block_nr
+}
+
+// bc_final returns the finalized tag: `tag` encrypted under the tweak
+// `(PREFIX_TAG << PREFIX_SHIFT) || iv`.
+@(private = "file", enable_target_feature = "sse2,ssse3,aes", require_results)
+bc_final :: proc "contextless" (
+	ctx: ^Context,
+	tag: x86.__m128i,
+	iv:  []byte,
+) -> x86.__m128i {
+	// Assemble the tweak in a byte buffer, then load it as a vector.
+	tmp: [BLOCK_SIZE]byte
+
+	tmp[0] = PREFIX_TAG << PREFIX_SHIFT
+	copy(tmp[1:], iv)
+
+	tweak := intrinsics.unaligned_load((^x86.__m128i)(&tmp))
+
+	return bc_x1(ctx, tag, tweak)
+}
+
+// bc_encrypt XORs every full block of src with a keystream block and
+// writes the result to dst.  Each keystream block is the encryption of
+// `iv` under an encryption tweak derived from `tweak_tag` and a running
+// block number starting at `stk_block_nr`.  Trailing bytes of src that
+// do not fill a block are ignored.
+//
+// Returns the block number following the last block processed.
+@(private = "file", enable_target_feature = "sse2,ssse3,aes", require_results)
+bc_encrypt :: proc "contextless" (
+	ctx:          ^Context,
+	dst:          []byte,
+	src:          []byte,
+	iv:           x86.__m128i,
+	tweak_tag:    x86.__m128i,
+	stk_block_nr: int,
+) -> int {
+	dst, src, stk_block_nr := dst, src, stk_block_nr
+
+	nr_blocks := len(src) / BLOCK_SIZE
+	// Bulk path: 4 blocks per iteration.
+	for nr_blocks >= 4 {
+		d_0, d_1, d_2, d_3 := bc_x4(
+			ctx,
+			iv, iv, iv, iv,
+			enc_tweak(tweak_tag, stk_block_nr),
+			enc_tweak(tweak_tag, stk_block_nr + 1),
+			enc_tweak(tweak_tag, stk_block_nr + 2),
+			enc_tweak(tweak_tag, stk_block_nr + 3),
+		)
+
+		intrinsics.unaligned_store(
+			(^x86.__m128i)(raw_data(dst)),
+			x86._mm_xor_si128(
+				d_0,
+				intrinsics.unaligned_load((^x86.__m128i)(raw_data(src))),
+			),
+		)
+		intrinsics.unaligned_store(
+			(^x86.__m128i)(raw_data(dst[BLOCK_SIZE:])),
+			x86._mm_xor_si128(
+				d_1,
+				intrinsics.unaligned_load((^x86.__m128i)(raw_data(src[BLOCK_SIZE:]))),
+			),
+		)
+		intrinsics.unaligned_store(
+			(^x86.__m128i)(raw_data(dst[2*BLOCK_SIZE:])),
+			x86._mm_xor_si128(
+				d_2,
+				intrinsics.unaligned_load((^x86.__m128i)(raw_data(src[2*BLOCK_SIZE:]))),
+			),
+		)
+		intrinsics.unaligned_store(
+			(^x86.__m128i)(raw_data(dst[3*BLOCK_SIZE:])),
+			x86._mm_xor_si128(
+				d_3,
+				intrinsics.unaligned_load((^x86.__m128i)(raw_data(src[3*BLOCK_SIZE:]))),
+			),
+		)
+
+		src, dst = src[4*BLOCK_SIZE:], dst[4*BLOCK_SIZE:]
+		stk_block_nr += 4
+		nr_blocks -= 4
+	}
+
+	// Remaining (at most 3) full blocks, one at a time.
+	for nr_blocks > 0 {
+		d := bc_x1(
+			ctx,
+			iv,
+			enc_tweak(tweak_tag, stk_block_nr),
+		)
+
+		intrinsics.unaligned_store(
+			(^x86.__m128i)(raw_data(dst)),
+			x86._mm_xor_si128(
+				d,
+				intrinsics.unaligned_load((^x86.__m128i)(raw_data(src))),
+			),
+		)
+
+		src, dst = src[BLOCK_SIZE:], dst[BLOCK_SIZE:]
+		stk_block_nr += 1
+		nr_blocks -= 1
+	}
+
+	return stk_block_nr
+}
+
+// e_hw is the hardware accelerated implementation of `seal`: it
+// authenticates aad and plaintext, writes the ciphertext to dst and
+// the tag to tag.
+//
+// Slice sizes are expected to have been validated by the caller, as
+// bounds checking is disabled.
+@(private)
+e_hw :: proc "contextless" (ctx: ^Context, dst, tag, iv, aad, plaintext: []byte) #no_bounds_check {
+	// The block cipher input for the encryption pass: `0^8 || N`.
+	tmp: [BLOCK_SIZE]byte
+	copy(tmp[1:], iv)
+	iv_ := intrinsics.unaligned_load((^x86.__m128i)(raw_data(&tmp)))
+
+	// Algorithm 3
+	//
+	// Associated data
+	// A_1 || ... || A_la || A_∗ <- A where each |A_i| = n and |A_∗| < n
+	// Auth <- 0^n
+	// for i = 0 to la − 1 do
+	//   Auth <- Auth ^ EK(0010 || i, A_i+1)
+	// end
+	// if A_∗ != nil then
+	//   Auth <- Auth ^ EK(0110 || la, pad10∗(A_∗))
+	// end
+	auth: x86.__m128i
+	n: int
+
+	aad := aad
+	auth, n = bc_absorb(ctx, auth, aad, _PREFIX_AD_BLOCK, 0)
+	aad = aad[n*BLOCK_SIZE:]
+	if l := len(aad); l > 0 {
+		a_star: [BLOCK_SIZE]byte
+
+		copy(a_star[:], aad)
+		a_star[l] = 0x80
+
+		auth, _ = bc_absorb(ctx, auth, a_star[:], _PREFIX_AD_FINAL, n)
+	}
+
+	// Message authentication and tag generation
+	// M_1 || ... || M_l || M_∗ <- M where each |M_j| = n and |M_∗| < n
+	// tag <- Auth
+	// for j = 0 to l − 1 do
+	//   tag <- tag ^ EK(0000 || j, M_j+1)
+	// end
+	// if M_∗ != nil then
+	//   tag <- tag ^ EK(0100 || l, pad10∗(M_∗))
+	// end
+	// tag <- EK(0001 || 0^4 ||N, tag)
+	m := plaintext
+	auth, n = bc_absorb(ctx, auth, m, _PREFIX_MSG_BLOCK, 0)
+	m = m[n*BLOCK_SIZE:]
+	if l := len(m); l > 0 {
+		m_star: [BLOCK_SIZE]byte
+
+		copy(m_star[:], m)
+		m_star[l] = 0x80
+
+		auth, _ = bc_absorb(ctx, auth, m_star[:], _PREFIX_MSG_FINAL, n)
+
+		// m_star holds a copy of the trailing plaintext, wipe it
+		// as is done for the decryption temporaries.
+		mem.zero_explicit(&m_star, size_of(m_star))
+	}
+	auth = bc_final(ctx, auth, iv)
+
+	// Message encryption
+	// for j = 0 to l − 1 do
+	//   C_j <- M_j ^ EK(1 || tag ^ j, 0^8 || N)
+	// end
+	// if M_∗ != nil then
+	//   C_∗ <- M_* ^ EK(1 || tag ^ l, 0^8 || N)
+	// end
+	//
+	// return (C_1 || ... || C_l || C_∗, tag)
+	m = plaintext
+	n = bc_encrypt(ctx, dst, m, iv_, auth, 0)
+	m = m[n*BLOCK_SIZE:]
+	if l := len(m); l > 0 {
+		m_star: [BLOCK_SIZE]byte
+
+		// Process the partial block via a full-size temporary, and
+		// only copy out as many bytes as dst has left.
+		copy(m_star[:], m)
+		_ = bc_encrypt(ctx, m_star[:], m_star[:], iv_, auth, n)
+
+		copy(dst[n*BLOCK_SIZE:], m_star[:])
+
+		// The unused tail of m_star is raw keystream, wipe it.
+		mem.zero_explicit(&m_star, size_of(m_star))
+	}
+
+	intrinsics.unaligned_store((^x86.__m128i)(raw_data(tag)), auth)
+}
+
+// d_hw is the hardware accelerated implementation of `open`: it
+// decrypts ciphertext into dst using the supplied tag, recomputes the
+// tag over aad and the decrypted data, and returns true iff the
+// recomputed tag matches.  dst is written regardless of the result;
+// clearing it on failure is left to the caller.
+@(private, require_results)
+d_hw :: proc "contextless" (ctx: ^Context, dst, iv, aad, ciphertext, tag: []byte) -> bool {
+	// The block cipher input for the decryption pass: `0^8 || N`.
+	tmp: [BLOCK_SIZE]byte
+	copy(tmp[1:], iv)
+	iv_ := intrinsics.unaligned_load((^x86.__m128i)(raw_data(&tmp)))
+
+	// Algorithm 4
+	//
+	// Message decryption
+	// C_1 || ... || C_l || C_∗ <- C where each |C_j| = n and |C_∗| < n
+	// for j = 0 to l − 1 do
+	//   M_j <- C_j ^ EK(1 || tag ^ j, 0^8 || N)
+	// end
+	// if C_∗ != nil then
+	//   M_∗ <- C_∗ ^ EK(1 || tag ^ l, 0^8 || N)
+	// end
+	auth := intrinsics.unaligned_load((^x86.__m128i)(raw_data(tag)))
+
+	m := ciphertext
+	n := bc_encrypt(ctx, dst, m, iv_, auth, 0)
+	m = m[n*BLOCK_SIZE:]
+	if l := len(m); l > 0 {
+		m_star: [BLOCK_SIZE]byte
+
+		copy(m_star[:], m)
+		_ = bc_encrypt(ctx, m_star[:], m_star[:], iv_, auth, n)
+
+		copy(dst[n*BLOCK_SIZE:], m_star[:])
+
+		mem.zero_explicit(&m_star, size_of(m_star))
+	}
+
+	// Associated data
+	// A_1 || ... || Al_a || A_∗ <- A where each |Ai_| = n and |A_∗| < n
+	// Auth <- 0
+	// for i = 0 to la − 1 do
+	//   Auth <- Auth ^ EK(0010 || i, A_i+1)
+	// end
+	// if A∗ != nil then
+	//   Auth <- Auth ^ EK(0110| | l_a, pad10∗(A_∗))
+	// end
+	auth = x86.__m128i{0, 0}
+	aad := aad
+	auth, n = bc_absorb(ctx, auth, aad, _PREFIX_AD_BLOCK, 0)
+	aad = aad[BLOCK_SIZE*n:]
+	if l := len(aad); l > 0 {
+		a_star: [BLOCK_SIZE]byte
+
+		copy(a_star[:], aad)
+		a_star[l] = 0x80
+
+		auth, _ = bc_absorb(ctx, auth, a_star[:], _PREFIX_AD_FINAL, n)
+	}
+
+	// Message authentication and tag generation
+	// M_1 || ... || M_l || M_∗ <- M where each |M_j| = n and |M_∗| < n
+	// tag0 <- Auth
+	// for j = 0 to l − 1 do
+	//   tag0 <- tag0 ^ EK(0000 || j, M_j+1)
+	// end
+	// if M_∗ != nil then
+	//   tag0 <- tag0 ^ EK(0100 || l, pad10∗(M_∗))
+	// end
+	// tag0 <- EK(0001 || 0^4 || N, tag0)
+	// The tag is recomputed over the freshly decrypted data in dst.
+	m = dst[:len(ciphertext)]
+	auth, n = bc_absorb(ctx, auth, m, _PREFIX_MSG_BLOCK, 0)
+	m = m[n*BLOCK_SIZE:]
+	if l := len(m); l > 0 {
+		m_star: [BLOCK_SIZE]byte
+
+		copy(m_star[:], m)
+		m_star[l] = 0x80
+
+		auth, _ = bc_absorb(ctx, auth, m_star[:], _PREFIX_MSG_FINAL, n)
+
+		// m_star holds a copy of the trailing (as yet
+		// unauthenticated) plaintext, wipe it like the decryption
+		// pass above does.
+		mem.zero_explicit(&m_star, size_of(m_star))
+	}
+	auth = bc_final(ctx, auth, iv)
+
+	// Tag verification
+	// if tag0 = tag then return (M_1 || ... || M_l || M_∗)
+	// else return false
+	intrinsics.unaligned_store((^x86.__m128i)(raw_data(&tmp)), auth)
+	ok := crypto.compare_constant_time(tmp[:], tag) == 1
+
+	// tmp now holds the recomputed tag, wipe it.
+	mem.zero_explicit(&tmp, size_of(tmp))
+
+	return ok
+}

+ 2 - 0
examples/all/all_main.odin

@@ -33,6 +33,7 @@ import blake2s          "core:crypto/blake2s"
 import chacha20         "core:crypto/chacha20"
 import chacha20poly1305 "core:crypto/chacha20poly1305"
 import crypto_hash      "core:crypto/hash"
+import deoxysii         "core:crypto/deoxysii"
 import ed25519          "core:crypto/ed25519"
 import hkdf             "core:crypto/hkdf"
 import hmac             "core:crypto/hmac"
@@ -177,6 +178,7 @@ _ :: blake2b
 _ :: blake2s
 _ :: chacha20
 _ :: chacha20poly1305
+_ :: deoxysii
 _ :: ed25519
 _ :: hmac
 _ :: hkdf

+ 58 - 0
tests/benchmark/crypto/benchmark_crypto.odin

@@ -12,6 +12,7 @@ import "core:crypto/aegis"
 import "core:crypto/aes"
 import "core:crypto/chacha20"
 import "core:crypto/chacha20poly1305"
+import "core:crypto/deoxysii"
 import "core:crypto/ed25519"
 import "core:crypto/poly1305"
 import "core:crypto/x25519"
@@ -202,6 +203,43 @@ benchmark_crypto :: proc(t: ^testing.T) {
 		testing.expect(t, err == nil, name)
 		benchmark_print(&str, name, options)
 	}
+	{
+		// Deoxys-II-256 sealing benchmark, run at three message sizes.  A
+		// single keyed context and a single options struct are reused; only
+		// the name and options.bytes change between runs.
+		name := "Deoxys-II-256 64 bytes"
+		options := &time.Benchmark_Options {
+			rounds = 1_000,
+			bytes = 64,
+			setup = _setup_sized_buf,
+			bench = _benchmark_deoxysii_256,
+			teardown = _teardown_sized_buf,
+		}
+
+		// Size the key with Deoxys-II's own constant instead of borrowing
+		// the AEGIS-256 one, so this block follows the algorithm it actually
+		// benchmarks.
+		key := [deoxysii.KEY_SIZE]byte {
+			0xde, 0xad, 0xbe, 0xef, 0xde, 0xad, 0xbe, 0xef,
+			0xde, 0xad, 0xbe, 0xef, 0xde, 0xad, 0xbe, 0xef,
+			0xde, 0xad, 0xbe, 0xef, 0xde, 0xad, 0xbe, 0xef,
+			0xde, 0xad, 0xbe, 0xef, 0xde, 0xad, 0xbe, 0xef,
+		}
+		ctx: deoxysii.Context
+		deoxysii.init(&ctx, key[:])
+
+		// The bench procedure picks the context up from context.user_ptr.
+		context.user_ptr = &ctx
+
+		err := time.benchmark(options, context.allocator)
+		testing.expect(t, err == nil, name)
+		benchmark_print(&str, name, options)
+
+		name = "Deoxys-II-256 1024 bytes"
+		options.bytes = 1024
+		err = time.benchmark(options, context.allocator)
+		testing.expect(t, err == nil, name)
+		benchmark_print(&str, name, options)
+
+		name = "Deoxys-II-256 65536 bytes"
+		options.bytes = 65536
+		err = time.benchmark(options, context.allocator)
+		testing.expect(t, err == nil, name)
+		benchmark_print(&str, name, options)
+	}
 	{
 		iters :: 10000
 
@@ -481,6 +519,26 @@ _benchmark_aegis_256 :: proc(
 	return nil
 }
 
+// _benchmark_deoxysii_256 seals options.input in place options.rounds times
+// with the deoxysii.Context supplied through context.user_ptr, then records
+// the round count and the total number of bytes processed in options.
+//
+// The allocator parameter is not used by this procedure.  Always returns nil.
+_benchmark_deoxysii_256 :: proc(
+	options: ^time.Benchmark_Options,
+	allocator := context.allocator,
+) -> (
+	err: time.Benchmark_Error,
+) {
+	buf := options.input
+	iv: [deoxysii.IV_SIZE]byte
+	tag: [deoxysii.TAG_SIZE]byte = ---
+
+	ctx := (^deoxysii.Context)(context.user_ptr)
+
+	// Half-open range: exactly options.rounds seals, so the work performed
+	// matches the count/processed figures reported below (an inclusive
+	// range would run one extra, unaccounted round).
+	for _ in 0 ..< options.rounds {
+		deoxysii.seal(ctx, buf, tag[:], iv[:], nil, buf)
+	}
+	options.count = options.rounds
+	options.processed = options.rounds * options.bytes
+	return nil
+}
+
 @(private)
 benchmark_print :: proc(str: ^strings.Builder, name: string, options: ^time.Benchmark_Options, loc := #caller_location) {
 	fmt.sbprintfln(str, "[%v] %v rounds, %v bytes processed in %v ns\n\t\t%5.3f rounds/s, %5.3f MiB/s\n",

+ 89 - 0
tests/core/crypto/test_core_crypto_aead.odin

@@ -4,6 +4,7 @@ import "base:runtime"
 import "core:crypto/aes"
 import "core:crypto/aegis"
 import "core:crypto/aead"
+import "core:crypto/deoxysii"
 import "core:encoding/hex"
 import "core:testing"
 
@@ -23,6 +24,10 @@ test_aead :: proc(t: ^testing.T) {
 	for impl in supported_aegis_impls() {
 		append(&aegis_impls, impl)
 	}
+	deoxysii_impls := make([dynamic]aead.Implementation, context.temp_allocator)
+	for impl in supported_deoxysii_impls() {
+		append(&deoxysii_impls, impl)
+	}
 	impls := [aead.Algorithm][dynamic]aead.Implementation{
 		.Invalid           = nil,
 		.AES_GCM_128       = aes_impls,
@@ -34,6 +39,7 @@ test_aead :: proc(t: ^testing.T) {
 		.AEGIS_128L_256    = aegis_impls,
 		.AEGIS_256         = aegis_impls,
 		.AEGIS_256_256     = aegis_impls,
+		.DEOXYS_II_256     = deoxysii_impls,
 	}
 
 	test_vectors := []struct{
@@ -418,6 +424,79 @@ test_aead :: proc(t: ^testing.T) {
 			"57754a7d09963e7c787583a2e7b859bb24fa1e04d49fd550b2511a358e3bca252a9b1b8b30cc4a67",
 			"a3aca270c006094d71c20e6910b5161c0826df233d08919a566ec2c05990f734",
 		},
+		// Deoxys-II-256
+		{
+			.DEOXYS_II_256,
+			"101112131415161718191a1b1c1d1e1f202122232425262728292a2b2c2d2e2f",
+			"202122232425262728292a2b2c2d2e",
+			"",
+			"",
+			"",
+			"2b97bd77712f0cde975309959dfe1d7c",
+		},
+		{
+			.DEOXYS_II_256,
+			"101112131415161718191a1b1c1d1e1f202122232425262728292a2b2c2d2e2f",
+			"202122232425262728292a2b2c2d2e",
+			"000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f",
+			"",
+			"",
+			"54708ae5565a71f147bdb94d7ba3aed7",
+		},
+		{
+			.DEOXYS_II_256,
+			"101112131415161718191a1b1c1d1e1f202122232425262728292a2b2c2d2e2f",
+			"202122232425262728292a2b2c2d2e",
+			"f495c9c03d29989695d98ff5d430650125805c1e0576d06f26cbda42b1f82238b8",
+			"",
+			"",
+			"3277689dc4208cc1ff59d15434a1baf1",
+		},
+		{
+			.DEOXYS_II_256,
+			"101112131415161718191a1b1c1d1e1f202122232425262728292a2b2c2d2e2f",
+			"202122232425262728292a2b2c2d2e",
+			"",
+			"000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f",
+			"9da20db1c2781f6669257d87e2a4d9be1970f7581bef2c995e1149331e5e8cc1",
+			"92ce3aec3a4b72ff9eab71c2a93492fa",
+		},
+		{
+			.DEOXYS_II_256,
+			"101112131415161718191a1b1c1d1e1f202122232425262728292a2b2c2d2e2f",
+			"202122232425262728292a2b2c2d2e",
+			"",
+			"15cd77732f9d0c4c6e581ef400876ad9188c5b8850ebd38224da95d7cdc99f7acc",
+			"e5ffd2abc5b459a73667756eda6443ede86c0883fc51dd75d22bb14992c684618c",
+			"5fa78d57308f19d0252072ee39df5ecc",
+		},
+		{
+			.DEOXYS_II_256,
+			"101112131415161718191a1b1c1d1e1f202122232425262728292a2b2c2d2e2f",
+			"202122232425262728292a2b2c2d2e",
+			"000102030405060708090a0b0c0d0e0f",
+			"000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f",
+			"109f8a168b36dfade02628a9e129d5257f03cc7912aefa79729b67b186a2b08f",
+			"6549f9bf10acba0a451dbb2484a60d90",
+		},
+		{
+			.DEOXYS_II_256,
+			"101112131415161718191a1b1c1d1e1f202122232425262728292a2b2c2d2e2f",
+			"202122232425262728292a2b2c2d2e",
+			"000102030405060708090a0b0c0d0e0f10",
+			"422857fb165af0a35c03199fb895604dca9cea6d788954962c419e0d5c225c0327",
+			"7d772203fa38be296d8d20d805163130c69aba8cb16ed845c2296c61a8f34b394e",
+			"0b3f10e3933c78190b24b33008bf80e9",
+		},
+		{
+			.DEOXYS_II_256,
+			"101112131415161718191a1b1c1d1e1f202122232425262728292a2b2c2d2e2f",
+			"202122232425262728292a2b2c2d2e",
+			"3290bb8441279dc6083a43e9048c3dc08966ab30d7a6b35759e7a13339f124918f3b5ab1affa65e6c0e3680eb33a6ec82424ab1ce5a40b8654e13d845c29b13896a1466a75fc875acba4527ded37ed00c600a357c9a6e586c74cf3d85cd3258c813218f319d12b82480e5124ff19ec00bda1fbb8bd25eeb3de9fcbf3296deba250caf7e9f4ef0be1918e24221dd0be888c59c166ad761d7b58462a1b1d44b04265b45827172c133dd5b6c870b9af7b21368d12a88f4efa1751047543d584382d9ec22e7550d50ecddba27d1f65453f1f3398de54ee8c1f4ac8e16f5523d89641e99a632380af0f0b1e6b0e192ec29bf1d8714978ff9fbfb93604142393e9a82c3aaebbbe15e3b4e5cfd18bdfe309315c9f9f830deebe2edcdc24f8eca90fda49f6646e789c5041fb5be933fa843278e95f3a54f8eb41f14777ea949d5ea442b01249e64816151a325769e264ed4acd5c3f21700ca755d5bc0c2c5f9453419510bc74f2d71621dcecb9efc9c24791b4bb560fb70a8231521d6560af89d8d50144d9c080863f043781153bcd59030e60bd17a6d7aa083211b67b581fa4f74cce4d030d1e8f9429fd725c110040d41eb6989ffb1595c72cbe3c9b78a8ab80d71a6a5283da77b89cae295bb13c14fbe466b617f4da8ad60b085e2ea153f6713ae0046aa31e0ba44e43ef36a111bf05c073a4e3624cd35f63a546f9142b35aa81b8826d",
+			"83dab23b1379e090755c99079cfe918cb737e989f2d720ccaff493a744927644fec3653211fa75306a83486e5c34ecfe63870c97251a73e4b9033ae374809711b211ed5d293a592e466a81170f1d85750b5ca025ccd4579947edbae9ec132bfb1a7233ad79fae30006a6699f143893861b975226ed9d3cfb8a240be232fbf4e83755d59d20bc2faa2ea5e5b0428427485cca5e76a89fe32bdd59ab4177ad7cb1899c101e3c4f7535129591390ebdf30140846078b13867bbb2efd6cf434afe356eb18d716b21fd664c26c908496534bf2cde6d6b897799016594fb6d9f830ae5f44ccec26d42ff0d1a21b80cdbe8c8c170a5f766fad884abcc781b5b8ebc0f559bfeaa4557b04d977d51411a7f47bf437d0280cf9f92bc4f9cd6226337a492320851955adae2cafea22a89c3132dd252e4728328eda05555dff3241404341b8aa502d45c456113af42a8e91a85e4b4e9555028982ec3d144722af0eb04a6d3b8127c3040629de53f5fd187048198e8f8e8cc857afcbae45c693fec12fc2149d5e7587d0121b1717d0147f6979f75e8f085293f705c3399a6cc8df7057bf481e6c374edf0a0af7479f858045357b7fe21021c3fabdaf012652bf2e5db257bd9490ce637a81477bd3f9814a2198fdb9afa9344321f2393798670e588c47a1924d592cda3eb5a96754dfd92d87ee1ffa9d4ee586c85d7518c5d2db57d0451c33de0",
+			"88294fcef65a1bdfd7baaa472816c64ef5bef2622b88c1ec5a739396157ef4935f3aa76449e391c32da28ee2857f399ac3dd95aed30cfb26cc0063cd4cd8f7431108176fbf370123856662b000a8348e5925fbb97c9ec0c737758330a7983f06b51590c1d2f5e5faaf0eb58e34e19e5fc85cec03d3926dd46a79ba7026e83dec24e07484c9103dd0cdb0edb505500caca5e1d5dbc71348cf00648821488ebaab7f9d84bbbf91b3c521dbef30110e7bd94f8dad5ab8e0cc5411ca9682d210d5d80c0c4bdbba8181789a4273d6deb80899fdcd976ca6f3a9770b54305f586a04256cfbeb4c11254e88559f294db3b9a94b80ab9f9a02cb4c0748de0af7818685521691dba5738be546dba13a56016fb8635af9dff50f25d1b17ad21707db2640a76a741e65e559b2afaaec0f37e18436bf02008f84dbd7b2698687a22376b65dc7524fca8a28709eee3f3caee3b28ed1173d1e08ee849e2ca63d2c90d555755c8fbafd5d2f4b37f06a1dbd6852ee2ffcfe79d510152e98fc4f3094f740a4aede9ee378b606d34576776bf5f1269f5385a84b3928433bfca177550ccfcd22cd0331bbc595e38c2758b2662476fa66354c4e84c7b360405aa3f5b2a48621bdca1a90c69b21789c91b5b8c568e3c741d99e22f6d7e26f2abed045f1d578b782ab4a5cf2af636d842b3012e180e4b045d8d15b057b69c92398a517053daf9be7c2935e",
+			"a616f0c218e18b526cf2a3f8c115e262",
+		},
 	}
 	for v, _ in test_vectors {
 		algo_name := aead.ALGORITHM_NAMES[v.algo]
@@ -541,3 +620,13 @@ supported_aegis_impls :: proc() -> [dynamic]aes.Implementation {
 
 	return impls
 }
+
+// supported_deoxysii_impls lists the implementations to exercise for
+// Deoxys-II: Portable is always included, and Hardware is added only when
+// deoxysii.is_hardware_accelerated() reports true.  The returned dynamic
+// array is allocated from context.temp_allocator.
+supported_deoxysii_impls :: proc() -> [dynamic]aes.Implementation {
+	available := make([dynamic]aes.Implementation, 0, 2, context.temp_allocator)
+	append(&available, aes.Implementation.Portable)
+
+	// Without hardware support the portable implementation is the only entry.
+	if !deoxysii.is_hardware_accelerated() {
+		return available
+	}
+
+	append(&available, aes.Implementation.Hardware)
+	return available
+}