Browse Source

core/crypto: Add chacha20

This package implements the ChaCha20 stream cipher as specified in
RFC 8439, and the somewhat non-standard XChaCha20 variant that supports
a 192-bit nonce.

While an IETF draft for XChaCha20 standardization exists,
implementations that pre-date the draft use a 64-bit counter, instead of
the IETF-style 32-bit one.  This implementation opts for the former (the
64-bit counter) as compatibility with libsodium is more important than
compatibility with an expired IETF draft.
Yawning Angel 3 years ago
parent
commit
7bed317636

+ 581 - 0
core/crypto/chacha20/chacha20.odin

@@ -0,0 +1,581 @@
+package chacha20
+
+import "core:crypto/util"
+import "core:math/bits"
+import "core:mem"
+
+// Key, nonce, and extended-nonce sizes in bytes.
+KEY_SIZE :: 32
+NONCE_SIZE :: 12
+XNONCE_SIZE :: 24
+
+// Maximum value of the IETF (RFC 8439) 32-bit block counter.
+_MAX_CTR_IETF :: 0xffffffff
+
+_BLOCK_SIZE :: 64
+_STATE_SIZE_U32 :: 16
+_ROUNDS :: 20
+
+// The "expand 32-byte k" constant, as the four little-endian state
+// words 0..3 (RFC 8439).
+_SIGMA_0 : u32 : 0x61707865
+_SIGMA_1 : u32 : 0x3320646e
+_SIGMA_2 : u32 : 0x79622d32
+_SIGMA_3 : u32 : 0x6b206574
+
+// Context holds the cipher state along with buffered keystream used
+// to service partial-block requests.
+Context :: struct {
+	// The 16-word ChaCha state: constants, key, counter, nonce.
+	_s: [_STATE_SIZE_U32]u32,
+
+	// One block of generated keystream, and the consumed offset into
+	// it (_off == _BLOCK_SIZE means the buffer is exhausted).
+	_buffer: [_BLOCK_SIZE]byte,
+	_off: int,
+
+	// IETF flavor uses a 32-bit counter; XChaCha20 uses 64 bits.
+	_is_ietf_flavor: bool,
+	_is_initialized: bool,
+}
+
+// init initializes a Context with the provided key and nonce.
+//
+// The key must be KEY_SIZE (32) bytes.  A NONCE_SIZE (12) byte nonce
+// selects IETF ChaCha20 (RFC 8439); a XNONCE_SIZE (24) byte nonce
+// selects XChaCha20.  Any other size panics.
+init :: proc (ctx: ^Context, key, nonce: []byte) {
+	if len(key) != KEY_SIZE {
+		panic("crypto/chacha20: invalid ChaCha20 key size")
+	}
+	if n_len := len(nonce); n_len != NONCE_SIZE && n_len != XNONCE_SIZE {
+		panic("crypto/chacha20: invalid (X)ChaCha20 nonce size")
+	}
+
+	k, n := key, nonce
+
+	// Derive the XChaCha20 subkey and sub-nonce via HChaCha20.
+	// The subkey replaces the caller's key, and the remaining 8 nonce
+	// bytes become the ChaCha20 nonce.
+	is_xchacha := len(nonce) == XNONCE_SIZE
+	if is_xchacha {
+		sub_key := ctx._buffer[:KEY_SIZE]
+		_hchacha20(sub_key, k, n)
+		k = sub_key
+		n = n[16:24]
+	}
+
+	// State layout: words 0..3 are the SIGMA constants, 4..11 the key,
+	// 12 (and 13 for XChaCha20) the block counter, and the remainder
+	// the nonce, all loaded little-endian.
+	ctx._s[0] = _SIGMA_0
+	ctx._s[1] = _SIGMA_1
+	ctx._s[2] = _SIGMA_2
+	ctx._s[3] = _SIGMA_3
+	ctx._s[4] = util.U32_LE(k[0:4])
+	ctx._s[5] = util.U32_LE(k[4:8])
+	ctx._s[6] = util.U32_LE(k[8:12])
+	ctx._s[7] = util.U32_LE(k[12:16])
+	ctx._s[8] = util.U32_LE(k[16:20])
+	ctx._s[9] = util.U32_LE(k[20:24])
+	ctx._s[10] = util.U32_LE(k[24:28])
+	ctx._s[11] = util.U32_LE(k[28:32])
+	ctx._s[12] = 0
+	if !is_xchacha {
+		ctx._s[13] = util.U32_LE(n[0:4])
+		ctx._s[14] = util.U32_LE(n[4:8])
+		ctx._s[15] = util.U32_LE(n[8:12])
+	} else {
+		ctx._s[13] = 0
+		ctx._s[14] = util.U32_LE(n[0:4])
+		ctx._s[15] = util.U32_LE(n[4:8])
+
+		// The sub-key is stored in the keystream buffer.  While
+		// this will be overwritten in most circumstances, explicitly
+		// clear it out early.
+		mem.zero_explicit(&ctx._buffer, KEY_SIZE)
+	}
+
+	// Mark the keystream buffer exhausted so the first use generates
+	// fresh blocks.
+	ctx._off = _BLOCK_SIZE
+	ctx._is_ietf_flavor = !is_xchacha
+	ctx._is_initialized = true
+}
+
+// seek sets the keystream position to the given block number.
+//
+// The IETF flavor carries a 32-bit counter, so seeking beyond
+// _MAX_CTR_IETF panics; XChaCha20 supports the full 64-bit range.
+seek :: proc (ctx: ^Context, block_nr: u64) {
+	assert(ctx._is_initialized)
+
+	if !ctx._is_ietf_flavor {
+		// 64-bit counter: the high half lives in state word 13.
+		ctx._s[13] = u32(block_nr >> 32)
+	} else if block_nr > _MAX_CTR_IETF {
+		panic("crypto/chacha20: attempted to seek past maximum counter")
+	}
+	ctx._s[12] = u32(block_nr)
+
+	// Discard any keystream buffered at the previous position.
+	ctx._off = _BLOCK_SIZE
+}
+
+// xor_bytes XORs the keystream with src, writing the result to dst.
+// If dst is smaller than src, only len(dst) bytes are processed.
+// dst and src may alias exactly (in-place encryption).
+xor_bytes :: proc (ctx: ^Context, dst, src: []byte) {
+	assert(ctx._is_initialized)
+
+	// TODO: Enforcing that dst and src alias exactly or not at all
+	// is a good idea, though odd aliasing should be extremely uncommon.
+
+	src, dst := src, dst
+	if dst_len := len(dst); dst_len < len(src) {
+		src = src[:dst_len]
+	}
+
+	for remaining := len(src); remaining > 0; {
+		// Process multiple blocks at once
+		if ctx._off == _BLOCK_SIZE {
+			if nr_blocks := remaining / _BLOCK_SIZE; nr_blocks > 0 {
+				direct_bytes := nr_blocks * _BLOCK_SIZE
+				_do_blocks(ctx, dst, src, nr_blocks)
+				remaining -= direct_bytes
+				if remaining == 0 {
+					return
+				}
+				dst = dst[direct_bytes:]
+				src = src[direct_bytes:]
+			}
+
+			// If there is a partial block, generate and buffer 1 block
+			// worth of keystream.
+			_do_blocks(ctx, ctx._buffer[:], nil, 1)
+			ctx._off = 0
+		}
+
+		// Process partial blocks from the buffered keystream.
+		to_xor := min(_BLOCK_SIZE - ctx._off, remaining)
+		buffered_keystream := ctx._buffer[ctx._off:]
+		for i := 0; i < to_xor; i = i + 1 {
+			dst[i] = buffered_keystream[i] ~ src[i]
+		}
+		ctx._off += to_xor
+		dst = dst[to_xor:]
+		src = src[to_xor:]
+		remaining -= to_xor
+	}
+}
+
+// keystream_bytes fills dst with raw keystream, advancing the cipher
+// position by len(dst) bytes.
+keystream_bytes :: proc (ctx: ^Context, dst: []byte) {
+	assert(ctx._is_initialized)
+
+	dst := dst
+	for remaining := len(dst); remaining > 0; {
+		// Process multiple blocks at once
+		if ctx._off == _BLOCK_SIZE {
+			if nr_blocks := remaining / _BLOCK_SIZE; nr_blocks > 0 {
+				direct_bytes := nr_blocks * _BLOCK_SIZE
+				_do_blocks(ctx, dst, nil, nr_blocks)
+				remaining -= direct_bytes
+				if remaining == 0 {
+					return
+				}
+				dst = dst[direct_bytes:]
+			}
+
+			// If there is a partial block, generate and buffer 1 block
+			// worth of keystream.
+			_do_blocks(ctx, ctx._buffer[:], nil, 1)
+			ctx._off = 0
+		}
+
+		// Process partial blocks from the buffered keystream.
+		to_copy := min(_BLOCK_SIZE - ctx._off, remaining)
+		buffered_keystream := ctx._buffer[ctx._off:]
+		copy(dst[:to_copy], buffered_keystream[:to_copy])
+		ctx._off += to_copy
+		dst = dst[to_copy:]
+		remaining -= to_copy
+	}
+}
+
+// reset sanitizes the Context, wiping the key material and buffered
+// keystream.  The Context must be re-initialized before reuse.
+reset :: proc (ctx: ^Context) {
+	// zero_explicit ensures the wipes are not optimized away.
+	mem.zero_explicit(&ctx._buffer, size_of(ctx._buffer))
+	mem.zero_explicit(&ctx._s, size_of(ctx._s))
+
+	ctx._is_initialized = false
+}
+
+// _do_blocks generates nr_blocks blocks of keystream.  If src is nil
+// the raw keystream is written to dst, otherwise dst = src ~ keystream.
+// Callers must ensure dst (and src, when non-nil) hold at least
+// nr_blocks * _BLOCK_SIZE bytes.  Panics if the request would exhaust
+// the counter space for the current flavor.
+_do_blocks :: proc (ctx: ^Context, dst, src: []byte, nr_blocks: int) {
+	// Enforce the maximum consumed keystream per nonce.
+	//
+	// While all modern "standard" definitions of ChaCha20 use
+	// the IETF 32-bit counter, for XChaCha20 most common
+	// implementations allow for a 64-bit counter.
+	//
+	// Honestly, the answer here is "use a MRAE primitive", but
+	// go with common practice in the case of XChaCha20.
+	if ctx._is_ietf_flavor {
+		if u64(ctx._s[12]) + u64(nr_blocks) > 0xffffffff {
+			panic("crypto/chacha20: maximum ChaCha20 keystream per nonce reached")
+		}
+	} else {
+		ctr := (u64(ctx._s[13]) << 32) | u64(ctx._s[12])
+		if _, carry := bits.add_u64(ctr, u64(nr_blocks), 0); carry != 0 {
+			panic("crypto/chacha20: maximum XChaCha20 keystream per nonce reached")
+		}
+	}
+
+	dst, src := dst, src
+	x := &ctx._s
+	for n := 0; n < nr_blocks; n = n + 1 {
+		// Working copy of the state; words 0..3 are always SIGMA.
+		x0, x1, x2, x3 := _SIGMA_0, _SIGMA_1, _SIGMA_2, _SIGMA_3
+		x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15 := x[4], x[5], x[6], x[7], x[8], x[9], x[10], x[11], x[12], x[13], x[14], x[15]
+
+		// 20 rounds = 10 double-rounds (column round + diagonal round).
+		for i := _ROUNDS; i > 0; i = i - 2 {
+			// Even when forcing inlining manually inlining all of
+			// these is decently faster.
+
+			// quarterround(x, 0, 4, 8, 12)
+			x0 += x4
+			x12 ~= x0
+			x12 = util.ROTL32(x12, 16)
+			x8 += x12
+			x4 ~= x8
+			x4 = util.ROTL32(x4, 12)
+			x0 += x4
+			x12 ~= x0
+			x12 = util.ROTL32(x12, 8)
+			x8 += x12
+			x4 ~= x8
+			x4 = util.ROTL32(x4, 7)
+
+			// quarterround(x, 1, 5, 9, 13)
+			x1 += x5
+			x13 ~= x1
+			x13 = util.ROTL32(x13, 16)
+			x9 += x13
+			x5 ~= x9
+			x5 = util.ROTL32(x5, 12)
+			x1 += x5
+			x13 ~= x1
+			x13 = util.ROTL32(x13, 8)
+			x9 += x13
+			x5 ~= x9
+			x5 = util.ROTL32(x5, 7)
+
+			// quarterround(x, 2, 6, 10, 14)
+			x2 += x6
+			x14 ~= x2
+			x14 = util.ROTL32(x14, 16)
+			x10 += x14
+			x6 ~= x10
+			x6 = util.ROTL32(x6, 12)
+			x2 += x6
+			x14 ~= x2
+			x14 = util.ROTL32(x14, 8)
+			x10 += x14
+			x6 ~= x10
+			x6 = util.ROTL32(x6, 7)
+
+			// quarterround(x, 3, 7, 11, 15)
+			x3 += x7
+			x15 ~= x3
+			x15 = util.ROTL32(x15, 16)
+			x11 += x15
+			x7 ~= x11
+			x7 = util.ROTL32(x7, 12)
+			x3 += x7
+			x15 ~= x3
+			x15 = util.ROTL32(x15, 8)
+			x11 += x15
+			x7 ~= x11
+			x7 = util.ROTL32(x7, 7)
+
+			// quarterround(x, 0, 5, 10, 15)
+			x0 += x5
+			x15 ~= x0
+			x15 = util.ROTL32(x15, 16)
+			x10 += x15
+			x5 ~= x10
+			x5 = util.ROTL32(x5, 12)
+			x0 += x5
+			x15 ~= x0
+			x15 = util.ROTL32(x15, 8)
+			x10 += x15
+			x5 ~= x10
+			x5 = util.ROTL32(x5, 7)
+
+			// quarterround(x, 1, 6, 11, 12)
+			x1 += x6
+			x12 ~= x1
+			x12 = util.ROTL32(x12, 16)
+			x11 += x12
+			x6 ~= x11
+			x6 = util.ROTL32(x6, 12)
+			x1 += x6
+			x12 ~= x1
+			x12 = util.ROTL32(x12, 8)
+			x11 += x12
+			x6 ~= x11
+			x6 = util.ROTL32(x6, 7)
+
+			// quarterround(x, 2, 7, 8, 13)
+			x2 += x7
+			x13 ~= x2
+			x13 = util.ROTL32(x13, 16)
+			x8 += x13
+			x7 ~= x8
+			x7 = util.ROTL32(x7, 12)
+			x2 += x7
+			x13 ~= x2
+			x13 = util.ROTL32(x13, 8)
+			x8 += x13
+			x7 ~= x8
+			x7 = util.ROTL32(x7, 7)
+
+			// quarterround(x, 3, 4, 9, 14)
+			x3 += x4
+			x14 ~= x3
+			x14 = util.ROTL32(x14, 16)
+			x9 += x14
+			x4 ~= x9
+			x4 = util.ROTL32(x4, 12)
+			x3 += x4
+			x14 ~= x3
+			x14 = util.ROTL32(x14, 8)
+			x9 += x14
+			x4 ~= x9
+			x4 = util.ROTL32(x4, 7)
+		}
+
+		// Feed-forward: add the initial state to the round output.
+		x0 += _SIGMA_0
+		x1 += _SIGMA_1
+		x2 += _SIGMA_2
+		x3 += _SIGMA_3
+		x4 += x[4]
+		x5 += x[5]
+		x6 += x[6]
+		x7 += x[7]
+		x8 += x[8]
+		x9 += x[9]
+		x10 += x[10]
+		x11 += x[11]
+		x12 += x[12]
+		x13 += x[13]
+		x14 += x[14]
+		x15 += x[15]
+
+		// While the "correct" answer to getting more performance out of
+		// this is "use vector operations", support for that is currently
+		// a work in progress/to be designed.
+		//
+		// Until dedicated assembly can be written leverage the fact that
+		// the callers of this routine ensure that src/dst are valid.
+
+		when ODIN_ARCH == "386" || ODIN_ARCH == "amd64" {
+			// util.PUT_U32_LE/util.U32_LE are not required on little-endian
+			// systems that also happen to not be strict about aligned
+			// memory access.
+
+			dst_p := transmute(^[16]u32)(&dst[0])
+			if src != nil {
+				src_p := transmute(^[16]u32)(&src[0])
+				dst_p[0] = src_p[0] ~ x0
+				dst_p[1] = src_p[1] ~ x1
+				dst_p[2] = src_p[2] ~ x2
+				dst_p[3] = src_p[3] ~ x3
+				dst_p[4] = src_p[4] ~ x4
+				dst_p[5] = src_p[5] ~ x5
+				dst_p[6] = src_p[6] ~ x6
+				dst_p[7] = src_p[7] ~ x7
+				dst_p[8] = src_p[8] ~ x8
+				dst_p[9] = src_p[9] ~ x9
+				dst_p[10] = src_p[10] ~ x10
+				dst_p[11] = src_p[11] ~ x11
+				dst_p[12] = src_p[12] ~ x12
+				dst_p[13] = src_p[13] ~ x13
+				dst_p[14] = src_p[14] ~ x14
+				dst_p[15] = src_p[15] ~ x15
+				src = src[_BLOCK_SIZE:]
+			} else {
+				dst_p[0] = x0
+				dst_p[1] = x1
+				dst_p[2] = x2
+				dst_p[3] = x3
+				dst_p[4] = x4
+				dst_p[5] = x5
+				dst_p[6] = x6
+				dst_p[7] = x7
+				dst_p[8] = x8
+				dst_p[9] = x9
+				dst_p[10] = x10
+				dst_p[11] = x11
+				dst_p[12] = x12
+				dst_p[13] = x13
+				dst_p[14] = x14
+				dst_p[15] = x15
+			}
+			dst = dst[_BLOCK_SIZE:]
+		} else {
+			#no_bounds_check {
+				if src != nil {
+					util.PUT_U32_LE(dst[0:4], util.U32_LE(src[0:4]) ~ x0)
+					util.PUT_U32_LE(dst[4:8], util.U32_LE(src[4:8]) ~ x1)
+					util.PUT_U32_LE(dst[8:12], util.U32_LE(src[8:12]) ~ x2)
+					util.PUT_U32_LE(dst[12:16], util.U32_LE(src[12:16]) ~ x3)
+					util.PUT_U32_LE(dst[16:20], util.U32_LE(src[16:20]) ~ x4)
+					util.PUT_U32_LE(dst[20:24], util.U32_LE(src[20:24]) ~ x5)
+					util.PUT_U32_LE(dst[24:28], util.U32_LE(src[24:28]) ~ x6)
+					util.PUT_U32_LE(dst[28:32], util.U32_LE(src[28:32]) ~ x7)
+					util.PUT_U32_LE(dst[32:36], util.U32_LE(src[32:36]) ~ x8)
+					util.PUT_U32_LE(dst[36:40], util.U32_LE(src[36:40]) ~ x9)
+					util.PUT_U32_LE(dst[40:44], util.U32_LE(src[40:44]) ~ x10)
+					util.PUT_U32_LE(dst[44:48], util.U32_LE(src[44:48]) ~ x11)
+					util.PUT_U32_LE(dst[48:52], util.U32_LE(src[48:52]) ~ x12)
+					util.PUT_U32_LE(dst[52:56], util.U32_LE(src[52:56]) ~ x13)
+					util.PUT_U32_LE(dst[56:60], util.U32_LE(src[56:60]) ~ x14)
+					util.PUT_U32_LE(dst[60:64], util.U32_LE(src[60:64]) ~ x15)
+					src = src[_BLOCK_SIZE:]
+				} else {
+					util.PUT_U32_LE(dst[0:4], x0)
+					util.PUT_U32_LE(dst[4:8], x1)
+					util.PUT_U32_LE(dst[8:12], x2)
+					util.PUT_U32_LE(dst[12:16], x3)
+					util.PUT_U32_LE(dst[16:20], x4)
+					util.PUT_U32_LE(dst[20:24], x5)
+					util.PUT_U32_LE(dst[24:28], x6)
+					util.PUT_U32_LE(dst[28:32], x7)
+					util.PUT_U32_LE(dst[32:36], x8)
+					util.PUT_U32_LE(dst[36:40], x9)
+					util.PUT_U32_LE(dst[40:44], x10)
+					util.PUT_U32_LE(dst[44:48], x11)
+					util.PUT_U32_LE(dst[48:52], x12)
+					util.PUT_U32_LE(dst[52:56], x13)
+					util.PUT_U32_LE(dst[56:60], x14)
+					util.PUT_U32_LE(dst[60:64], x15)
+				}
+				dst = dst[_BLOCK_SIZE:]
+			}
+		}
+
+		// Increment the counter.  Overflow checking is done upon
+		// entry into the routine, so a 64-bit increment safely
+		// covers both cases.
+		new_ctr := ((u64(ctx._s[13]) << 32) | u64(ctx._s[12])) + 1
+		x[12] = u32(new_ctr)
+		x[13] = u32(new_ctr >> 32)
+	}
+}
+
+// _hchacha20 implements the HChaCha20 function, deriving a 32-byte
+// subkey into dst from the key and the first 16 bytes of nonce.
+// Unlike the block function there is no feed-forward addition; the
+// output is words 0..3 and 12..15 of the final state.
+_hchacha20 :: proc (dst, key, nonce: []byte) {
+	x0, x1, x2, x3 := _SIGMA_0, _SIGMA_1, _SIGMA_2, _SIGMA_3
+	x4 := util.U32_LE(key[0:4])
+	x5 := util.U32_LE(key[4:8])
+	x6 := util.U32_LE(key[8:12])
+	x7 := util.U32_LE(key[12:16])
+	x8 := util.U32_LE(key[16:20])
+	x9 := util.U32_LE(key[20:24])
+	x10 := util.U32_LE(key[24:28])
+	x11 := util.U32_LE(key[28:32])
+	x12 := util.U32_LE(nonce[0:4])
+	x13 := util.U32_LE(nonce[4:8])
+	x14 := util.U32_LE(nonce[8:12])
+	x15 := util.U32_LE(nonce[12:16])
+
+	// 20 rounds = 10 double-rounds, same schedule as the block function.
+	for i := _ROUNDS; i > 0; i = i - 2 {
+		// quarterround(x, 0, 4, 8, 12)
+		x0 += x4
+		x12 ~= x0
+		x12 = util.ROTL32(x12, 16)
+		x8 += x12
+		x4 ~= x8
+		x4 = util.ROTL32(x4, 12)
+		x0 += x4
+		x12 ~= x0
+		x12 = util.ROTL32(x12, 8)
+		x8 += x12
+		x4 ~= x8
+		x4 = util.ROTL32(x4, 7)
+
+		// quarterround(x, 1, 5, 9, 13)
+		x1 += x5
+		x13 ~= x1
+		x13 = util.ROTL32(x13, 16)
+		x9 += x13
+		x5 ~= x9
+		x5 = util.ROTL32(x5, 12)
+		x1 += x5
+		x13 ~= x1
+		x13 = util.ROTL32(x13, 8)
+		x9 += x13
+		x5 ~= x9
+		x5 = util.ROTL32(x5, 7)
+
+		// quarterround(x, 2, 6, 10, 14)
+		x2 += x6
+		x14 ~= x2
+		x14 = util.ROTL32(x14, 16)
+		x10 += x14
+		x6 ~= x10
+		x6 = util.ROTL32(x6, 12)
+		x2 += x6
+		x14 ~= x2
+		x14 = util.ROTL32(x14, 8)
+		x10 += x14
+		x6 ~= x10
+		x6 = util.ROTL32(x6, 7)
+
+		// quarterround(x, 3, 7, 11, 15)
+		x3 += x7
+		x15 ~= x3
+		x15 = util.ROTL32(x15, 16)
+		x11 += x15
+		x7 ~= x11
+		x7 = util.ROTL32(x7, 12)
+		x3 += x7
+		x15 ~= x3
+		x15 = util.ROTL32(x15, 8)
+		x11 += x15
+		x7 ~= x11
+		x7 = util.ROTL32(x7, 7)
+
+		// quarterround(x, 0, 5, 10, 15)
+		x0 += x5
+		x15 ~= x0
+		x15 = util.ROTL32(x15, 16)
+		x10 += x15
+		x5 ~= x10
+		x5 = util.ROTL32(x5, 12)
+		x0 += x5
+		x15 ~= x0
+		x15 = util.ROTL32(x15, 8)
+		x10 += x15
+		x5 ~= x10
+		x5 = util.ROTL32(x5, 7)
+
+		// quarterround(x, 1, 6, 11, 12)
+		x1 += x6
+		x12 ~= x1
+		x12 = util.ROTL32(x12, 16)
+		x11 += x12
+		x6 ~= x11
+		x6 = util.ROTL32(x6, 12)
+		x1 += x6
+		x12 ~= x1
+		x12 = util.ROTL32(x12, 8)
+		x11 += x12
+		x6 ~= x11
+		x6 = util.ROTL32(x6, 7)
+
+		// quarterround(x, 2, 7, 8, 13)
+		x2 += x7
+		x13 ~= x2
+		x13 = util.ROTL32(x13, 16)
+		x8 += x13
+		x7 ~= x8
+		x7 = util.ROTL32(x7, 12)
+		x2 += x7
+		x13 ~= x2
+		x13 = util.ROTL32(x13, 8)
+		x8 += x13
+		x7 ~= x8
+		x7 = util.ROTL32(x7, 7)
+
+		// quarterround(x, 3, 4, 9, 14)
+		x3 += x4
+		x14 ~= x3
+		x14 = util.ROTL32(x14, 16)
+		x9 += x14
+		x4 ~= x9
+		x4 = util.ROTL32(x4, 12)
+		x3 += x4
+		x14 ~= x3
+		x14 = util.ROTL32(x14, 8)
+		x9 += x14
+		x4 ~= x9
+		x4 = util.ROTL32(x4, 7)
+	}
+
+	// Serialize words 0..3 and 12..15 as the derived subkey.
+	util.PUT_U32_LE(dst[0:4], x0)
+	util.PUT_U32_LE(dst[4:8], x1)
+	util.PUT_U32_LE(dst[8:12], x2)
+	util.PUT_U32_LE(dst[12:16], x3)
+	util.PUT_U32_LE(dst[16:20], x12)
+	util.PUT_U32_LE(dst[20:24], x13)
+	util.PUT_U32_LE(dst[24:28], x14)
+	util.PUT_U32_LE(dst[28:32], x15)
+}

+ 1 - 0
tests/core/crypto/test_core_crypto.odin

@@ -116,6 +116,7 @@ main :: proc() {
     test_haval_256(&t)
     test_haval_256(&t)
 
 
     // "modern" crypto tests
     // "modern" crypto tests
+    test_chacha20(&t)
     test_poly1305(&t)
     test_poly1305(&t)
     test_x25519(&t)
     test_x25519(&t)
 
 

+ 146 - 4
tests/core/crypto/test_core_crypto_modern.odin

@@ -2,8 +2,10 @@ package test_core_crypto
 
 
 import "core:testing"
 import "core:testing"
 import "core:fmt"
 import "core:fmt"
+import "core:mem"
 import "core:time"
 import "core:time"
 
 
+import "core:crypto/chacha20"
 import "core:crypto/poly1305"
 import "core:crypto/poly1305"
 import "core:crypto/x25519"
 import "core:crypto/x25519"
 
 
@@ -28,6 +30,94 @@ _decode_hex32 :: proc(s: string) -> [32]byte{
 	return b
 	return b
 }
 }
 
 
+@(test)
+test_chacha20 :: proc(t: ^testing.T) {
+	log(t, "Testing (X)ChaCha20")
+
+	// Test cases taken from RFC 8439, and draft-irtf-cfrg-xchacha-03
+	plaintext_str := "Ladies and Gentlemen of the class of '99: If I could offer you only one tip for the future, sunscreen would be it."
+	plaintext := transmute([]byte)(plaintext_str)
+
+	// ChaCha20 vector: RFC 8439 Section 2.4.2 key/nonce.
+	key := [chacha20.KEY_SIZE]byte{
+		0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
+		0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
+		0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
+		0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f,
+	}
+
+	nonce := [chacha20.NONCE_SIZE]byte{
+		0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x4a,
+		0x00, 0x00, 0x00, 0x00,
+	}
+
+	ciphertext := [114]byte{
+		0x6e, 0x2e, 0x35, 0x9a, 0x25, 0x68, 0xf9, 0x80,
+		0x41, 0xba, 0x07, 0x28, 0xdd, 0x0d, 0x69, 0x81,
+		0xe9, 0x7e, 0x7a, 0xec, 0x1d, 0x43, 0x60, 0xc2,
+		0x0a, 0x27, 0xaf, 0xcc, 0xfd, 0x9f, 0xae, 0x0b,
+		0xf9, 0x1b, 0x65, 0xc5, 0x52, 0x47, 0x33, 0xab,
+		0x8f, 0x59, 0x3d, 0xab, 0xcd, 0x62, 0xb3, 0x57,
+		0x16, 0x39, 0xd6, 0x24, 0xe6, 0x51, 0x52, 0xab,
+		0x8f, 0x53, 0x0c, 0x35, 0x9f, 0x08, 0x61, 0xd8,
+		0x07, 0xca, 0x0d, 0xbf, 0x50, 0x0d, 0x6a, 0x61,
+		0x56, 0xa3, 0x8e, 0x08, 0x8a, 0x22, 0xb6, 0x5e,
+		0x52, 0xbc, 0x51, 0x4d, 0x16, 0xcc, 0xf8, 0x06,
+		0x81, 0x8c, 0xe9, 0x1a, 0xb7, 0x79, 0x37, 0x36,
+		0x5a, 0xf9, 0x0b, 0xbf, 0x74, 0xa3, 0x5b, 0xe6,
+		0xb4, 0x0b, 0x8e, 0xed, 0xf2, 0x78, 0x5e, 0x42,
+		0x87, 0x4d,
+	}
+	ciphertext_str := hex_string(ciphertext[:])
+
+	derived_ciphertext: [114]byte
+	ctx: chacha20.Context = ---
+	chacha20.init(&ctx, key[:], nonce[:])
+	chacha20.seek(&ctx, 1) // The test vectors start the counter at 1.
+	chacha20.xor_bytes(&ctx, derived_ciphertext[:], plaintext[:])
+
+	derived_ciphertext_str := hex_string(derived_ciphertext[:])
+	expect(t, derived_ciphertext_str == ciphertext_str, fmt.tprintf("Expected %s for xor_bytes(plaintext_str), but got %s instead", ciphertext_str, derived_ciphertext_str))
+
+	// XChaCha20 vector from draft-irtf-cfrg-xchacha-03.
+	xkey := [chacha20.KEY_SIZE]byte{
+		0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87,
+		0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f,
+		0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97,
+		0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f,
+	}
+
+	xnonce := [chacha20.XNONCE_SIZE]byte{
+		0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47,
+		0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f,
+		0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57,
+	}
+
+	xciphertext := [114]byte{
+		0xbd, 0x6d, 0x17, 0x9d, 0x3e, 0x83, 0xd4, 0x3b,
+		0x95, 0x76, 0x57, 0x94, 0x93, 0xc0, 0xe9, 0x39,
+		0x57, 0x2a, 0x17, 0x00, 0x25, 0x2b, 0xfa, 0xcc,
+		0xbe, 0xd2, 0x90, 0x2c, 0x21, 0x39, 0x6c, 0xbb,
+		0x73, 0x1c, 0x7f, 0x1b, 0x0b, 0x4a, 0xa6, 0x44,
+		0x0b, 0xf3, 0xa8, 0x2f, 0x4e, 0xda, 0x7e, 0x39,
+		0xae, 0x64, 0xc6, 0x70, 0x8c, 0x54, 0xc2, 0x16,
+		0xcb, 0x96, 0xb7, 0x2e, 0x12, 0x13, 0xb4, 0x52,
+		0x2f, 0x8c, 0x9b, 0xa4, 0x0d, 0xb5, 0xd9, 0x45,
+		0xb1, 0x1b, 0x69, 0xb9, 0x82, 0xc1, 0xbb, 0x9e,
+		0x3f, 0x3f, 0xac, 0x2b, 0xc3, 0x69, 0x48, 0x8f,
+		0x76, 0xb2, 0x38, 0x35, 0x65, 0xd3, 0xff, 0xf9,
+		0x21, 0xf9, 0x66, 0x4c, 0x97, 0x63, 0x7d, 0xa9,
+		0x76, 0x88, 0x12, 0xf6, 0x15, 0xc6, 0x8b, 0x13,
+		0xb5, 0x2e,
+	}
+	xciphertext_str := hex_string(xciphertext[:])
+
+	chacha20.init(&ctx, xkey[:], xnonce[:])
+	chacha20.seek(&ctx, 1)
+	chacha20.xor_bytes(&ctx, derived_ciphertext[:], plaintext[:])
+
+	derived_ciphertext_str = hex_string(derived_ciphertext[:])
+	expect(t, derived_ciphertext_str == xciphertext_str, fmt.tprintf("Expected %s for xor_bytes(plaintext_str), but got %s instead", xciphertext_str, derived_ciphertext_str))
+}
+
 @(test)
 @(test)
 test_poly1305 :: proc(t: ^testing.T) {
 test_poly1305 :: proc(t: ^testing.T) {
 	log(t, "Testing poly1305")
 	log(t, "Testing poly1305")
@@ -141,24 +231,49 @@ test_x25519 :: proc(t: ^testing.T) {
 bench_modern :: proc(t: ^testing.T) {
 bench_modern :: proc(t: ^testing.T) {
 	fmt.println("Starting benchmarks:")
 	fmt.println("Starting benchmarks:")
 
 
+	bench_chacha20(t)
 	bench_poly1305(t)
 	bench_poly1305(t)
 	bench_x25519(t)
 	bench_x25519(t)
 }
 }
 
 
-_setup_poly1305 :: proc(options: ^time.Benchmark_Options, allocator := context.allocator) -> (err: time.Benchmark_Error) {
+_setup_sized_buf :: proc(options: ^time.Benchmark_Options, allocator := context.allocator) -> (err: time.Benchmark_Error) {
 	assert(options != nil)
 	assert(options != nil)
 
 
 	options.input = make([]u8, options.bytes, allocator)
 	options.input = make([]u8, options.bytes, allocator)
 	return nil if len(options.input) == options.bytes else .Allocation_Error
 	return nil if len(options.input) == options.bytes else .Allocation_Error
 }
 }
 
 
-_teardown_poly1305 :: proc(options: ^time.Benchmark_Options, allocator := context.allocator) -> (err: time.Benchmark_Error) {
+_teardown_sized_buf :: proc(options: ^time.Benchmark_Options, allocator := context.allocator) -> (err: time.Benchmark_Error) {
 	assert(options != nil)
 	assert(options != nil)
 
 
 	delete(options.input)
 	delete(options.input)
 	return nil
 	return nil
 }
 }
 
 
+// _benchmark_chacha20 encrypts the options.bytes-sized buffer in place
+// once per round, using a fixed key and an all-zero nonce.
+_benchmark_chacha20 :: proc(options: ^time.Benchmark_Options, allocator := context.allocator) -> (err: time.Benchmark_Error) {
+	buf := options.input
+	key := [chacha20.KEY_SIZE]byte{
+		0xde, 0xad, 0xbe, 0xef, 0xde, 0xad, 0xbe, 0xef,
+		0xde, 0xad, 0xbe, 0xef, 0xde, 0xad, 0xbe, 0xef,
+		0xde, 0xad, 0xbe, 0xef, 0xde, 0xad, 0xbe, 0xef,
+		0xde, 0xad, 0xbe, 0xef, 0xde, 0xad, 0xbe, 0xef,
+	}
+	nonce := [chacha20.NONCE_SIZE]byte{
+		0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+		0x00, 0x00, 0x00, 0x00,
+	}
+
+	ctx: chacha20.Context = ---
+	chacha20.init(&ctx, key[:], nonce[:])
+
+	// NOTE(review): the inclusive range runs rounds+1 iterations while
+	// count/processed report rounds — confirm `0..=` is intended.
+	for _ in 0..=options.rounds {
+		chacha20.xor_bytes(&ctx, buf, buf)
+	}
+	options.count     = options.rounds
+	options.processed = options.rounds * options.bytes
+	return nil
+}
+
 _benchmark_poly1305 :: proc(options: ^time.Benchmark_Options, allocator := context.allocator) -> (err: time.Benchmark_Error) {
 _benchmark_poly1305 :: proc(options: ^time.Benchmark_Options, allocator := context.allocator) -> (err: time.Benchmark_Error) {
 	buf := options.input
 	buf := options.input
 	key := [poly1305.KEY_SIZE]byte{
 	key := [poly1305.KEY_SIZE]byte{
@@ -189,14 +304,41 @@ benchmark_print :: proc(name: string, options: ^time.Benchmark_Options) {
 	)
 	)
 }
 }
 
 
+// bench_chacha20 runs the ChaCha20 benchmark at 64, 1024, and 65536
+// byte buffer sizes, printing the results.
+bench_chacha20 :: proc(t: ^testing.T) {
+	name    := "ChaCha20 64 bytes"
+	options := &time.Benchmark_Options{
+		rounds   = 1_000,
+		bytes    = 64,
+		setup    = _setup_sized_buf,
+		bench    = _benchmark_chacha20,
+		teardown = _teardown_sized_buf,
+	}
+
+	err  := time.benchmark(options, context.allocator)
+	expect(t, err == nil, name)
+	benchmark_print(name, options)
+
+	// Re-run with larger buffers; the options struct is reused.
+	name = "ChaCha20 1024 bytes"
+	options.bytes = 1024
+	err = time.benchmark(options, context.allocator)
+	expect(t, err == nil, name)
+	benchmark_print(name, options)
+
+	name = "ChaCha20 65536 bytes"
+	options.bytes = 65536
+	err = time.benchmark(options, context.allocator)
+	expect(t, err == nil, name)
+	benchmark_print(name, options)
+}
+
 bench_poly1305 :: proc(t: ^testing.T) {
 bench_poly1305 :: proc(t: ^testing.T) {
 	name    := "Poly1305 64 zero bytes"
 	name    := "Poly1305 64 zero bytes"
 	options := &time.Benchmark_Options{
 	options := &time.Benchmark_Options{
 		rounds   = 1_000,
 		rounds   = 1_000,
 		bytes    = 64,
 		bytes    = 64,
-		setup    = _setup_poly1305,
+		setup    = _setup_sized_buf,
 		bench    = _benchmark_poly1305,
 		bench    = _benchmark_poly1305,
-		teardown = _teardown_poly1305,
+		teardown = _teardown_sized_buf,
 	}
 	}
 
 
 	err  := time.benchmark(options, context.allocator)
 	err  := time.benchmark(options, context.allocator)