@@ -0,0 +1,481 @@
+package chacha20_simd128
+
+import "base:intrinsics"
+import "core:crypto/_chacha20"
+import "core:simd"
+import "core:sys/info"
+
+// Portable 128-bit `core:simd` implementation.
+//
+// This is loosely based on Ted Krovetz's public domain C intrinsic
+// implementation.
+//
+// This is written to perform adequately on any target that has "enough"
+// 128-bit vector registers; the current thought is that 4 blocks at a
+// time is reasonable for amd64, though Ted's code is more conservative.
+//
+// See:
+// supercop-20230530/crypto_stream/chacha20/krovetz/vec128
+
+// Ensure the compiler emits SIMD instructions. This is a minimum, and
+// setting the microarchitecture at compile time will allow for better
+// code gen when applicable (eg: AVX). This is somewhat redundant with
+// the default microarchitecture configurations.
+when ODIN_ARCH == .arm64 || ODIN_ARCH == .arm32 {
+	@(private = "file")
+	TARGET_SIMD_FEATURES :: "neon"
+} else when ODIN_ARCH == .amd64 || ODIN_ARCH == .i386 {
+	// Note: LLVM appears to be smart enough to use PSHUFB despite not
+	// explicitly using simd.u8x16 shuffles.
+	@(private = "file")
+	TARGET_SIMD_FEATURES :: "sse2,ssse3"
+} else {
+	@(private = "file")
+	TARGET_SIMD_FEATURES :: ""
+}
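+
+// Setting the microarchitecture at build time (as mentioned above) is done
+// with a compiler flag, e.g. something along the lines of
+// `odin build . -microarch:native`; the exact flag spelling is an
+// assumption here, so check the compiler's help output for the current
+// options.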
+
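+// The rotations in the quarter round are built from shift pairs, since a
+// lane-wise rotate-left is expressed as rotl(x, n) == (x << n) | (x >> (32 - n)).
+// The constant pairs below hold the matching left and right shift amounts;
+// e.g. a rotate by 12 becomes (the pattern used in `_dq_round_simd128`):
+//
+//	v = simd.bit_xor(simd.shl(v, _ROT_12L), simd.shr(v, _ROT_12R))
+//
+// XOR stands in for OR here; the two are equivalent because the shifted
+// halves have no overlapping set bits.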
+@(private = "file")
+_ROT_7L: simd.u32x4 : {7, 7, 7, 7}
+@(private = "file")
+_ROT_7R: simd.u32x4 : {25, 25, 25, 25}
+@(private = "file")
+_ROT_12L: simd.u32x4 : {12, 12, 12, 12}
+@(private = "file")
+_ROT_12R: simd.u32x4 : {20, 20, 20, 20}
+@(private = "file")
+_ROT_8L: simd.u32x4 : {8, 8, 8, 8}
+@(private = "file")
+_ROT_8R: simd.u32x4 : {24, 24, 24, 24}
+@(private = "file")
+_ROT_16: simd.u32x4 : {16, 16, 16, 16}
+
+when ODIN_ENDIAN == .Big {
+	@(private = "file")
+	_increment_counter :: #force_inline proc "contextless" (ctx: ^_chacha20.Context) -> simd.u32x4 {
+		// In the Big Endian case, the low and high portions in the vector
+		// are flipped, so the 64-bit addition can't be done with a simple
+		// vector add.
+		x := &ctx._s
+
+		new_ctr := ((u64(ctx._s[13]) << 32) | u64(ctx._s[12])) + 1
+		x[12] = u32(new_ctr)
+		x[13] = u32(new_ctr >> 32)
+
+		return intrinsics.unaligned_load(transmute(^simd.u32x4)&x[12])
+	}
+
+	// Convert the endianness of the components of a u32x4 vector, for
+	// the purposes of output.
+	@(private = "file")
+	_byteswap_u32x4 :: #force_inline proc "contextless" (v: simd.u32x4) -> simd.u32x4 {
+		return(
+			transmute(simd.u32x4)simd.shuffle(
+				transmute(simd.u8x16)v,
+				transmute(simd.u8x16)v,
+				3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12,
+			)
+		)
+	}
+} else {
+	@(private = "file")
+	_VEC_ONE: simd.u64x2 : {1, 0}
+}
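+
+// In the Little Endian case no helper is needed: the block counter sits in
+// the low lanes of the fourth state row, so reinterpreting that row as a
+// simd.u64x2 and adding _VEC_ONE (the pattern used throughout
+// `stream_blocks` below) is a plain 64-bit increment of the counter.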
+
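+// _dq_round_simd128 performs one ChaCha double round (a column round
+// followed by a diagonal round) on the whole state at once.  Each group of
+// vector ops below is the scalar quarter round
+//
+//	a += b; d ^= a; d = rotl(d, 16)
+//	c += d; b ^= c; b = rotl(b, 12)
+//	a += b; d ^= a; d = rotl(d, 8)
+//	c += d; b ^= c; b = rotl(b, 7)
+//
+// applied to four columns in parallel; the ROTV shuffles realign the rows
+// so that the same code then processes the diagonals.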
+@(private = "file")
+_dq_round_simd128 :: #force_inline proc "contextless" (
+	v0, v1, v2, v3: simd.u32x4,
+) -> (
+	simd.u32x4,
+	simd.u32x4,
+	simd.u32x4,
+	simd.u32x4,
+) {
+	v0, v1, v2, v3 := v0, v1, v2, v3
+
+	// a += b; d ^= a; d = ROTW16(d);
+	v0 = simd.add(v0, v1)
+	v3 = simd.bit_xor(v3, v0)
+	v3 = simd.bit_xor(simd.shl(v3, _ROT_16), simd.shr(v3, _ROT_16))
+
+	// c += d; b ^= c; b = ROTW12(b);
+	v2 = simd.add(v2, v3)
+	v1 = simd.bit_xor(v1, v2)
+	v1 = simd.bit_xor(simd.shl(v1, _ROT_12L), simd.shr(v1, _ROT_12R))
+
+	// a += b; d ^= a; d = ROTW8(d);
+	v0 = simd.add(v0, v1)
+	v3 = simd.bit_xor(v3, v0)
+	v3 = simd.bit_xor(simd.shl(v3, _ROT_8L), simd.shr(v3, _ROT_8R))
+
+	// c += d; b ^= c; b = ROTW7(b);
+	v2 = simd.add(v2, v3)
+	v1 = simd.bit_xor(v1, v2)
+	v1 = simd.bit_xor(simd.shl(v1, _ROT_7L), simd.shr(v1, _ROT_7R))
+
+	// b = ROTV1(b); c = ROTV2(c); d = ROTV3(d);
+	v1 = simd.shuffle(v1, v1, 1, 2, 3, 0)
+	v2 = simd.shuffle(v2, v2, 2, 3, 0, 1)
+	v3 = simd.shuffle(v3, v3, 3, 0, 1, 2)
+
+	// a += b; d ^= a; d = ROTW16(d);
+	v0 = simd.add(v0, v1)
+	v3 = simd.bit_xor(v3, v0)
+	v3 = simd.bit_xor(simd.shl(v3, _ROT_16), simd.shr(v3, _ROT_16))
+
+	// c += d; b ^= c; b = ROTW12(b);
+	v2 = simd.add(v2, v3)
+	v1 = simd.bit_xor(v1, v2)
+	v1 = simd.bit_xor(simd.shl(v1, _ROT_12L), simd.shr(v1, _ROT_12R))
+
+	// a += b; d ^= a; d = ROTW8(d);
+	v0 = simd.add(v0, v1)
+	v3 = simd.bit_xor(v3, v0)
+	v3 = simd.bit_xor(simd.shl(v3, _ROT_8L), simd.shr(v3, _ROT_8R))
+
+	// c += d; b ^= c; b = ROTW7(b);
+	v2 = simd.add(v2, v3)
+	v1 = simd.bit_xor(v1, v2)
+	v1 = simd.bit_xor(simd.shl(v1, _ROT_7L), simd.shr(v1, _ROT_7R))
+
+	// b = ROTV3(b); c = ROTV2(c); d = ROTV1(d);
+	v1 = simd.shuffle(v1, v1, 3, 0, 1, 2)
+	v2 = simd.shuffle(v2, v2, 2, 3, 0, 1)
+	v3 = simd.shuffle(v3, v3, 1, 2, 3, 0)
+
+	return v0, v1, v2, v3
+}
+
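+// _add_state_simd128 applies the final feed-forward: the saved input state
+// is added back into the permuted state.  ChaCha output is serialized as
+// little-endian 32-bit words, so on Big Endian hosts the lanes are
+// byteswapped here, just before the keystream is written out.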
+@(private = "file")
+_add_state_simd128 :: #force_inline proc "contextless" (
+	v0, v1, v2, v3, s0, s1, s2, s3: simd.u32x4,
+) -> (
+	simd.u32x4,
+	simd.u32x4,
+	simd.u32x4,
+	simd.u32x4,
+) {
+	v0, v1, v2, v3 := v0, v1, v2, v3
+
+	v0 = simd.add(v0, s0)
+	v1 = simd.add(v1, s1)
+	v2 = simd.add(v2, s2)
+	v3 = simd.add(v3, s3)
+
+	when ODIN_ENDIAN == .Big {
+		v0 = _byteswap_u32x4(v0)
+		v1 = _byteswap_u32x4(v1)
+		v2 = _byteswap_u32x4(v2)
+		v3 = _byteswap_u32x4(v3)
+	}
+
+	return v0, v1, v2, v3
+}
+
+@(private = "file")
+_xor_simd128 :: #force_inline proc "contextless" (
+	src: [^]simd.u32x4,
+	v0, v1, v2, v3: simd.u32x4,
+) -> (
+	simd.u32x4,
+	simd.u32x4,
+	simd.u32x4,
+	simd.u32x4,
+) {
+	v0, v1, v2, v3 := v0, v1, v2, v3
+
+	v0 = simd.bit_xor(v0, intrinsics.unaligned_load((^simd.u32x4)(src[0:])))
+	v1 = simd.bit_xor(v1, intrinsics.unaligned_load((^simd.u32x4)(src[1:])))
+	v2 = simd.bit_xor(v2, intrinsics.unaligned_load((^simd.u32x4)(src[2:])))
+	v3 = simd.bit_xor(v3, intrinsics.unaligned_load((^simd.u32x4)(src[3:])))
+
+	return v0, v1, v2, v3
+}
+
+@(private = "file")
+_store_simd128 :: #force_inline proc "contextless" (
+	dst: [^]simd.u32x4,
+	v0, v1, v2, v3: simd.u32x4,
+) {
+	intrinsics.unaligned_store((^simd.u32x4)(dst[0:]), v0)
+	intrinsics.unaligned_store((^simd.u32x4)(dst[1:]), v1)
+	intrinsics.unaligned_store((^simd.u32x4)(dst[2:]), v2)
+	intrinsics.unaligned_store((^simd.u32x4)(dst[3:]), v3)
+}
+
+// is_performant returns true iff the target and current host both support
+// "enough" 128-bit SIMD to make this implementation performant.
+is_performant :: proc "contextless" () -> bool {
+	when ODIN_ARCH == .arm64 || ODIN_ARCH == .arm32 || ODIN_ARCH == .amd64 || ODIN_ARCH == .i386 {
+		when ODIN_ARCH == .arm64 || ODIN_ARCH == .arm32 {
+			req_features :: info.CPU_Features{.asimd}
+		} else when ODIN_ARCH == .amd64 || ODIN_ARCH == .i386 {
+			req_features :: info.CPU_Features{.sse2, .ssse3}
+		}
+
+		features, ok := info.cpu_features.?
+		if !ok {
+			return false
+		}
+
+		return features >= req_features
+	} else when ODIN_ARCH == .wasm64p32 || ODIN_ARCH == .wasm32 {
+		return intrinsics.has_target_feature("simd128")
+	} else {
+		return false
+	}
+}
+
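+// stream_blocks generates nr_blocks 64-byte keystream blocks.  When src is
+// nil the raw keystream is written to dst; otherwise src is XORed with the
+// keystream into dst.  Callers are expected to consult is_performant() and
+// fall back to the generic implementation when it returns false, along the
+// lines of (hypothetical dispatch, package names are illustrative only):
+//
+//	if chacha20_simd128.is_performant() {
+//		chacha20_simd128.stream_blocks(ctx, dst, src, nr_blocks)
+//	} else {
+//		chacha20_ref.stream_blocks(ctx, dst, src, nr_blocks)
+//	}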
+@(enable_target_feature = TARGET_SIMD_FEATURES)
+stream_blocks :: proc(ctx: ^_chacha20.Context, dst, src: []byte, nr_blocks: int) {
+	// Enforce the maximum consumed keystream per IV.
+	_chacha20.check_counter_limit(ctx, nr_blocks)
+
+	dst_v := ([^]simd.u32x4)(raw_data(dst))
+	src_v := ([^]simd.u32x4)(raw_data(src))
+
+	x := &ctx._s
+	n := nr_blocks
+
+	// The state vector is an array of uint32s in native byte-order.
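+	// s0..s3 below are its four rows: the ChaCha constants, the first and
+	// second halves of the key, and the block counter followed by the IV.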
+	x_v := ([^]simd.u32x4)(raw_data(x))
+	s0 := intrinsics.unaligned_load((^simd.u32x4)(x_v[0:]))
+	s1 := intrinsics.unaligned_load((^simd.u32x4)(x_v[1:]))
+	s2 := intrinsics.unaligned_load((^simd.u32x4)(x_v[2:]))
+	s3 := intrinsics.unaligned_load((^simd.u32x4)(x_v[3:]))
+
+	// 8 blocks at a time.
+	//
+	// Note: This is only worth it on Aarch64.
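+	// (Aarch64 has 32 128-bit vector registers, versus 16 XMM registers
+	// for baseline amd64 SSE, so the 8-way interleave spills far less
+	// there.)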
+	when ODIN_ARCH == .arm64 {
+		for ; n >= 8; n = n - 8 {
+			v0, v1, v2, v3 := s0, s1, s2, s3
+
+			when ODIN_ENDIAN == .Little {
+				s7 := transmute(simd.u32x4)simd.add(transmute(simd.u64x2)s3, _VEC_ONE)
+			} else {
+				s7 := _increment_counter(ctx)
+			}
+			v4, v5, v6, v7 := s0, s1, s2, s7
+
+			when ODIN_ENDIAN == .Little {
+				s11 := transmute(simd.u32x4)simd.add(transmute(simd.u64x2)s7, _VEC_ONE)
+			} else {
+				s11 := _increment_counter(ctx)
+			}
+			v8, v9, v10, v11 := s0, s1, s2, s11
+
+			when ODIN_ENDIAN == .Little {
+				s15 := transmute(simd.u32x4)simd.add(transmute(simd.u64x2)s11, _VEC_ONE)
+			} else {
+				s15 := _increment_counter(ctx)
+			}
+			v12, v13, v14, v15 := s0, s1, s2, s15
+
+			when ODIN_ENDIAN == .Little {
+				s19 := transmute(simd.u32x4)simd.add(transmute(simd.u64x2)s15, _VEC_ONE)
+			} else {
+				s19 := _increment_counter(ctx)
+			}
+
+			v16, v17, v18, v19 := s0, s1, s2, s19
+			when ODIN_ENDIAN == .Little {
+				s23 := transmute(simd.u32x4)simd.add(transmute(simd.u64x2)s19, _VEC_ONE)
+			} else {
+				s23 := _increment_counter(ctx)
+			}
+
+			v20, v21, v22, v23 := s0, s1, s2, s23
+			when ODIN_ENDIAN == .Little {
+				s27 := transmute(simd.u32x4)simd.add(transmute(simd.u64x2)s23, _VEC_ONE)
+			} else {
+				s27 := _increment_counter(ctx)
+			}
+
+			v24, v25, v26, v27 := s0, s1, s2, s27
+			when ODIN_ENDIAN == .Little {
+				s31 := transmute(simd.u32x4)simd.add(transmute(simd.u64x2)s27, _VEC_ONE)
+			} else {
+				s31 := _increment_counter(ctx)
+			}
+			v28, v29, v30, v31 := s0, s1, s2, s31
+
+			for i := _chacha20.ROUNDS; i > 0; i = i - 2 {
+				v0, v1, v2, v3 = _dq_round_simd128(v0, v1, v2, v3)
+				v4, v5, v6, v7 = _dq_round_simd128(v4, v5, v6, v7)
+				v8, v9, v10, v11 = _dq_round_simd128(v8, v9, v10, v11)
+				v12, v13, v14, v15 = _dq_round_simd128(v12, v13, v14, v15)
+				v16, v17, v18, v19 = _dq_round_simd128(v16, v17, v18, v19)
+				v20, v21, v22, v23 = _dq_round_simd128(v20, v21, v22, v23)
+				v24, v25, v26, v27 = _dq_round_simd128(v24, v25, v26, v27)
+				v28, v29, v30, v31 = _dq_round_simd128(v28, v29, v30, v31)
+			}
+
+			v0, v1, v2, v3 = _add_state_simd128(v0, v1, v2, v3, s0, s1, s2, s3)
+			v4, v5, v6, v7 = _add_state_simd128(v4, v5, v6, v7, s0, s1, s2, s7)
+			v8, v9, v10, v11 = _add_state_simd128(v8, v9, v10, v11, s0, s1, s2, s11)
+			v12, v13, v14, v15 = _add_state_simd128(v12, v13, v14, v15, s0, s1, s2, s15)
+			v16, v17, v18, v19 = _add_state_simd128(v16, v17, v18, v19, s0, s1, s2, s19)
+			v20, v21, v22, v23 = _add_state_simd128(v20, v21, v22, v23, s0, s1, s2, s23)
+			v24, v25, v26, v27 = _add_state_simd128(v24, v25, v26, v27, s0, s1, s2, s27)
+			v28, v29, v30, v31 = _add_state_simd128(v28, v29, v30, v31, s0, s1, s2, s31)
+
+			#no_bounds_check {
+				if src != nil {
+					v0, v1, v2, v3 = _xor_simd128(src_v, v0, v1, v2, v3)
+					v4, v5, v6, v7 = _xor_simd128(src_v[4:], v4, v5, v6, v7)
+					v8, v9, v10, v11 = _xor_simd128(src_v[8:], v8, v9, v10, v11)
+					v12, v13, v14, v15 = _xor_simd128(src_v[12:], v12, v13, v14, v15)
+					v16, v17, v18, v19 = _xor_simd128(src_v[16:], v16, v17, v18, v19)
+					v20, v21, v22, v23 = _xor_simd128(src_v[20:], v20, v21, v22, v23)
+					v24, v25, v26, v27 = _xor_simd128(src_v[24:], v24, v25, v26, v27)
+					v28, v29, v30, v31 = _xor_simd128(src_v[28:], v28, v29, v30, v31)
+					src_v = src_v[32:]
+				}
+
+				_store_simd128(dst_v, v0, v1, v2, v3)
+				_store_simd128(dst_v[4:], v4, v5, v6, v7)
+				_store_simd128(dst_v[8:], v8, v9, v10, v11)
+				_store_simd128(dst_v[12:], v12, v13, v14, v15)
+				_store_simd128(dst_v[16:], v16, v17, v18, v19)
+				_store_simd128(dst_v[20:], v20, v21, v22, v23)
+				_store_simd128(dst_v[24:], v24, v25, v26, v27)
+				_store_simd128(dst_v[28:], v28, v29, v30, v31)
+				dst_v = dst_v[32:]
+			}
+
+			when ODIN_ENDIAN == .Little {
+				// s31 holds the most current counter, so `s3 = s31 + 1`.
+				s3 = transmute(simd.u32x4)simd.add(transmute(simd.u64x2)s31, _VEC_ONE)
+			} else {
+				s3 = _increment_counter(ctx)
+			}
+		}
+	}
+
+	// 4 blocks at a time.
+	//
+	// Note: The i386 target lacks the required number of registers
+	// for this to be performant, so it is skipped.
+	when ODIN_ARCH != .i386 {
+		for ; n >= 4; n = n - 4 {
+			v0, v1, v2, v3 := s0, s1, s2, s3
+
+			when ODIN_ENDIAN == .Little {
+				s7 := transmute(simd.u32x4)simd.add(transmute(simd.u64x2)s3, _VEC_ONE)
+			} else {
+				s7 := _increment_counter(ctx)
+			}
+			v4, v5, v6, v7 := s0, s1, s2, s7
+
+			when ODIN_ENDIAN == .Little {
+				s11 := transmute(simd.u32x4)simd.add(transmute(simd.u64x2)s7, _VEC_ONE)
+			} else {
+				s11 := _increment_counter(ctx)
+			}
+			v8, v9, v10, v11 := s0, s1, s2, s11
+
+			when ODIN_ENDIAN == .Little {
+				s15 := transmute(simd.u32x4)simd.add(transmute(simd.u64x2)s11, _VEC_ONE)
+			} else {
+				s15 := _increment_counter(ctx)
+			}
+			v12, v13, v14, v15 := s0, s1, s2, s15
+
+			for i := _chacha20.ROUNDS; i > 0; i = i - 2 {
+				v0, v1, v2, v3 = _dq_round_simd128(v0, v1, v2, v3)
+				v4, v5, v6, v7 = _dq_round_simd128(v4, v5, v6, v7)
+				v8, v9, v10, v11 = _dq_round_simd128(v8, v9, v10, v11)
+				v12, v13, v14, v15 = _dq_round_simd128(v12, v13, v14, v15)
+			}
+
+			v0, v1, v2, v3 = _add_state_simd128(v0, v1, v2, v3, s0, s1, s2, s3)
+			v4, v5, v6, v7 = _add_state_simd128(v4, v5, v6, v7, s0, s1, s2, s7)
+			v8, v9, v10, v11 = _add_state_simd128(v8, v9, v10, v11, s0, s1, s2, s11)
+			v12, v13, v14, v15 = _add_state_simd128(v12, v13, v14, v15, s0, s1, s2, s15)
+
+			#no_bounds_check {
+				if src != nil {
+					v0, v1, v2, v3 = _xor_simd128(src_v, v0, v1, v2, v3)
+					v4, v5, v6, v7 = _xor_simd128(src_v[4:], v4, v5, v6, v7)
+					v8, v9, v10, v11 = _xor_simd128(src_v[8:], v8, v9, v10, v11)
+					v12, v13, v14, v15 = _xor_simd128(src_v[12:], v12, v13, v14, v15)
+					src_v = src_v[16:]
+				}
+
+				_store_simd128(dst_v, v0, v1, v2, v3)
+				_store_simd128(dst_v[4:], v4, v5, v6, v7)
+				_store_simd128(dst_v[8:], v8, v9, v10, v11)
+				_store_simd128(dst_v[12:], v12, v13, v14, v15)
+				dst_v = dst_v[16:]
+			}
+
+			when ODIN_ENDIAN == .Little {
+				// s15 holds the most current counter, so `s3 = s15 + 1`.
+				s3 = transmute(simd.u32x4)simd.add(transmute(simd.u64x2)s15, _VEC_ONE)
+			} else {
+				s3 = _increment_counter(ctx)
+			}
+		}
+	}
+
+	// 1 block at a time.
+	for ; n > 0; n = n - 1 {
+		v0, v1, v2, v3 := s0, s1, s2, s3
+
+		for i := _chacha20.ROUNDS; i > 0; i = i - 2 {
+			v0, v1, v2, v3 = _dq_round_simd128(v0, v1, v2, v3)
+		}
+		v0, v1, v2, v3 = _add_state_simd128(v0, v1, v2, v3, s0, s1, s2, s3)
+
+		#no_bounds_check {
+			if src != nil {
+				v0, v1, v2, v3 = _xor_simd128(src_v, v0, v1, v2, v3)
+				src_v = src_v[4:]
+			}
+
+			_store_simd128(dst_v, v0, v1, v2, v3)
+			dst_v = dst_v[4:]
+		}
+
+		// Increment the counter. Overflow checking is done upon
+		// entry into the routine, so a 64-bit increment safely
+		// covers both the 32-bit and 64-bit counter cases.
+		when ODIN_ENDIAN == .Little {
+			s3 = transmute(simd.u32x4)simd.add(transmute(simd.u64x2)s3, _VEC_ONE)
+		} else {
+			s3 = _increment_counter(ctx)
+		}
+	}
+
+	when ODIN_ENDIAN == .Little {
+		// Write back the counter to the state.
+		intrinsics.unaligned_store((^simd.u32x4)(x_v[3:]), s3)
+	}
+}
+
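+// hchacha20 computes the HChaCha20 function used for XChaCha20 sub-key
+// derivation: the state is keyed with `key` and a 16-byte `iv`, run through
+// the usual rounds without the feed-forward addition, and the first and
+// last rows of the result (32 bytes) are written to dst.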
+@(enable_target_feature = TARGET_SIMD_FEATURES)
+hchacha20 :: proc "contextless" (dst, key, iv: []byte) {
+	v0 := simd.u32x4{_chacha20.SIGMA_0, _chacha20.SIGMA_1, _chacha20.SIGMA_2, _chacha20.SIGMA_3}
+	v1 := intrinsics.unaligned_load((^simd.u32x4)(&key[0]))
+	v2 := intrinsics.unaligned_load((^simd.u32x4)(&key[16]))
+	v3 := intrinsics.unaligned_load((^simd.u32x4)(&iv[0]))
+
+	when ODIN_ENDIAN == .Big {
+		v1 = _byteswap_u32x4(v1)
+		v2 = _byteswap_u32x4(v2)
+		v3 = _byteswap_u32x4(v3)
+	}
+
+	for i := _chacha20.ROUNDS; i > 0; i = i - 2 {
+		v0, v1, v2, v3 = _dq_round_simd128(v0, v1, v2, v3)
+	}
+
+	when ODIN_ENDIAN == .Big {
+		v0 = _byteswap_u32x4(v0)
+		v3 = _byteswap_u32x4(v3)
+	}
+
+	dst_v := ([^]simd.u32x4)(raw_data(dst))
+	intrinsics.unaligned_store((^simd.u32x4)(dst_v[0:]), v0)
+	intrinsics.unaligned_store((^simd.u32x4)(dst_v[1:]), v3)
+}