
core/crypto/aes: Add Intel AES-NI support

This supports AES-NI + PCLMUL, and provides optimized key schedule, ECB,
CTR, and GCM.  Other modes are trivial to add later if required.
Yawning Angel 1 year ago
commit 69026852ce

+ 43 - 0
core/crypto/_aes/hw_intel/api.odin

@@ -0,0 +1,43 @@
+//+build amd64
+package aes_hw_intel
+
+import "core:sys/info"
+
+// is_supported returns true iff hardware accelerated AES
+// is supported.
+is_supported :: proc "contextless" () -> bool {
+	features, ok := info.cpu_features.?
+	if !ok {
+		return false
+	}
+
+	// Note: Everything with AES-NI and PCLMULQDQ has support for
+	// the required SSE extensions.
+	req_features :: info.CPU_Features{
+		.sse2,
+		.ssse3,
+		.sse41,
+		.aes,
+		.pclmulqdq,
+	}
+	return features >= req_features
+}
+
+// Context is a keyed AES (ECB) instance.
+Context :: struct {
+	// Note: The ideal thing to do is for the expanded round keys to be
+	// arrays of `__m128i`, however that implies alignment (or using AVX).
+	//
+	// All the people using e-waste processors that don't support an
+	// instruction set that has been around for over 10 years are why
+	// we can't have nice things.
+	_sk_exp_enc: [15][16]byte,
+	_sk_exp_dec: [15][16]byte,
+	_num_rounds: int,
+}
+
+// init initializes a context for AES with the provided key.
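+//
+// The key must be 16, 24, or 32 bytes long; other sizes will panic.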
+init :: proc(ctx: ^Context, key: []byte) {
+	keysched(ctx, key)
+}
+

+ 281 - 0
core/crypto/_aes/hw_intel/ghash.odin

@@ -0,0 +1,281 @@
+// Copyright (c) 2017 Thomas Pornin <[email protected]>
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions
+// are met:
+//
+//   1. Redistributions of source code must retain the above copyright
+//      notice, this list of conditions and the following disclaimer.
+//
+// THIS SOFTWARE IS PROVIDED BY THE AUTHORS “AS IS” AND ANY EXPRESS OR
+// IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+// WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY
+// DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+// DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
+// GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+// WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+// THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+//+build amd64
+package aes_hw_intel
+
+import "base:intrinsics"
+import "core:crypto/_aes"
+import "core:simd"
+import "core:simd/x86"
+
+@(private = "file")
+GHASH_STRIDE_HW :: 4
+@(private = "file")
+GHASH_STRIDE_BYTES_HW :: GHASH_STRIDE_HW * _aes.GHASH_BLOCK_SIZE
+
+// GHASH is defined over elements of GF(2^128) with "full little-endian"
+// representation: leftmost byte is least significant, and, within each
+// byte, leftmost _bit_ is least significant. The natural ordering in
+// x86 is "mixed little-endian": bytes are ordered from least to most
+// significant, but bits within a byte are in most-to-least significant
+// order. Going to full little-endian representation would require
+// reversing bits within each byte, which is doable but expensive.
+//
+// Instead, we go to full big-endian representation, by swapping bytes
+// around, which is done with a single _mm_shuffle_epi8() opcode (it
+// comes with SSSE3; all CPUs that offer pclmulqdq also have SSSE3). We
+// can use a full big-endian representation because in a carryless
+// multiplication, we have a nice bit reversal property:
+//
+// rev_128(x) * rev_128(y) = rev_255(x * y)
+//
+// So by using full big-endian, we still get the right result, except
+// that it is right-shifted by 1 bit. The left-shift is relatively
+// inexpensive, and it can be mutualised.
+//
+// Since SSE2 opcodes do not have facilities for shifting full 128-bit
+// values with bit precision, we have to break down values into 64-bit
+// chunks. We number chunks from 0 to 3 in left to right order.
+
+@(private = "file")
+byteswap_index := transmute(x86.__m128i)simd.i8x16{
+	// Note: simd.i8x16 is reverse order from x86._mm_set_epi8.
+	15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0,
+}
+
+@(private = "file", require_results, enable_target_feature = "sse2,ssse3")
+byteswap :: #force_inline proc "contextless" (x: x86.__m128i) -> x86.__m128i {
+	return x86._mm_shuffle_epi8(x, byteswap_index)
+}
+
+// From a 128-bit value kw, compute kx as the XOR of the two 64-bit
+// halves of kw (into the right half of kx; left half is unspecified),
+// and return kx.
+@(private = "file", require_results, enable_target_feature = "sse2")
+bk :: #force_inline proc "contextless" (kw: x86.__m128i) -> x86.__m128i {
+	return x86._mm_xor_si128(kw, x86._mm_shuffle_epi32(kw, 0x0e))
+}
+
+// Combine two 64-bit values (k0:k1) into a 128-bit (kw) value and
+// the XOR of the two values (kx), and return (kw, kx).
+@(private = "file", enable_target_feature = "sse2")
+pbk :: #force_inline proc "contextless" (k0, k1: x86.__m128i) -> (x86.__m128i, x86.__m128i) {
+	kw := x86._mm_unpacklo_epi64(k1, k0)
+	kx := x86._mm_xor_si128(k0, k1)
+	return kw, kx
+}
+
+// Left-shift by 1 bit a 256-bit value (in four 64-bit words).
+@(private = "file", require_results, enable_target_feature = "sse2")
+sl_256 :: #force_inline proc "contextless" (x0, x1, x2, x3: x86.__m128i) -> (x86.__m128i, x86.__m128i, x86.__m128i, x86.__m128i) {
+	x0, x1, x2, x3 := x0, x1, x2, x3
+
+	x0 = x86._mm_or_si128(x86._mm_slli_epi64(x0, 1), x86._mm_srli_epi64(x1, 63))
+	x1 = x86._mm_or_si128(x86._mm_slli_epi64(x1, 1), x86._mm_srli_epi64(x2, 63))
+	x2 = x86._mm_or_si128(x86._mm_slli_epi64(x2, 1), x86._mm_srli_epi64(x3, 63))
+	x3 = x86._mm_slli_epi64(x3, 1)
+
+	return x0, x1, x2, x3
+}
+
+// Perform reduction in GF(2^128).
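+//
+// The shift counts (1, 2, 7, and the complementary 63, 62, 57 across the
+// 64-bit word boundary) come from the GHASH reduction polynomial
+// x^128 + x^7 + x^2 + x + 1, applied in the reversed representation
+// described above.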
+@(private = "file", require_results, enable_target_feature = "sse2")
+reduce_f128 :: #force_inline proc "contextless" (x0, x1, x2, x3: x86.__m128i) -> (x86.__m128i, x86.__m128i) {
+	x0, x1, x2 := x0, x1, x2
+
+	x1 = x86._mm_xor_si128(
+		x1,
+		x86._mm_xor_si128(
+			x86._mm_xor_si128(
+				x3,
+				x86._mm_srli_epi64(x3, 1)),
+			x86._mm_xor_si128(
+				x86._mm_srli_epi64(x3, 2),
+				x86._mm_srli_epi64(x3, 7))))
+	x2 = x86._mm_xor_si128(
+		x86._mm_xor_si128(
+			x2,
+			x86._mm_slli_epi64(x3, 63)),
+		x86._mm_xor_si128(
+			x86._mm_slli_epi64(x3, 62),
+			x86._mm_slli_epi64(x3, 57)))
+	x0 = x86._mm_xor_si128(
+		x0,
+		x86._mm_xor_si128(
+			x86._mm_xor_si128(
+				x2,
+				x86._mm_srli_epi64(x2, 1)),
+			x86._mm_xor_si128(
+				x86._mm_srli_epi64(x2, 2),
+				x86._mm_srli_epi64(x2, 7))))
+	x1 = x86._mm_xor_si128(
+		x86._mm_xor_si128(
+			x1,
+			x86._mm_slli_epi64(x2, 63)),
+		x86._mm_xor_si128(
+			x86._mm_slli_epi64(x2, 62),
+			x86._mm_slli_epi64(x2, 57)))
+
+	return x0, x1
+}
+
+// Square value kw in GF(2^128) into (dw,dx).
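+//
+// Squaring in a binary field is linear, so the cross term vanishes and
+// only the two "self" carryless multiplies are needed.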
+@(private = "file", require_results, enable_target_feature = "sse2,pclmul")
+square_f128 :: #force_inline proc "contextless" (kw: x86.__m128i) -> (x86.__m128i, x86.__m128i) {
+	z1 := x86._mm_clmulepi64_si128(kw, kw, 0x11)
+	z3 := x86._mm_clmulepi64_si128(kw, kw, 0x00)
+	z0 := x86._mm_shuffle_epi32(z1, 0x0E)
+	z2 := x86._mm_shuffle_epi32(z3, 0x0E)
+	z0, z1, z2, z3 = sl_256(z0, z1, z2, z3)
+	z0, z1 = reduce_f128(z0, z1, z2, z3)
+	return pbk(z0, z1)
+}
+
+// ghash calculates the GHASH of `data`, with the key `key` and the running
+// state `dst`, and stores the resulting digest in `dst`.
+//
+// Note: `dst` is both an input and an output, to support easy implementation
+// of GCM.
+@(enable_target_feature = "sse2,ssse3,pclmul")
+ghash :: proc "contextless" (dst, key, data: []byte) #no_bounds_check {
+	if len(dst) != _aes.GHASH_BLOCK_SIZE || len(key) != _aes.GHASH_BLOCK_SIZE {
+		intrinsics.trap()
+	}
+
+	// Note: BearSSL opts to copy the remainder into a zero-filled
+	// 64-byte buffer.  We do something slightly simpler.
+
+	// Load key and dst (h and y).
+	yw := intrinsics.unaligned_load((^x86.__m128i)(raw_data(dst)))
+	h1w := intrinsics.unaligned_load((^x86.__m128i)(raw_data(key)))
+	yw = byteswap(yw)
+	h1w = byteswap(h1w)
+	h1x := bk(h1w)
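+
+	// Each 128x128 carryless multiply below is done Karatsuba-style: the
+	// "x" values hold the XOR of the two 64-bit halves, so the middle
+	// term costs one PCLMULQDQ instead of two.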
+
+	// Process 4 blocks at a time
+	buf := data
+	l := len(buf)
+	if l >= GHASH_STRIDE_BYTES_HW {
+		// Compute h2 = h^2
+		h2w, h2x := square_f128(h1w)
+
+		// Compute h3 = h^3 = h*(h^2)
+		t1 := x86._mm_clmulepi64_si128(h1w, h2w, 0x11)
+		t3 := x86._mm_clmulepi64_si128(h1w, h2w, 0x00)
+		t2 := x86._mm_xor_si128(
+			x86._mm_clmulepi64_si128(h1x, h2x, 0x00),
+			x86._mm_xor_si128(t1, t3))
+		t0 := x86._mm_shuffle_epi32(t1, 0x0E)
+		t1 = x86._mm_xor_si128(t1, x86._mm_shuffle_epi32(t2, 0x0E))
+		t2 = x86._mm_xor_si128(t2, x86._mm_shuffle_epi32(t3, 0x0E))
+		t0, t1, t2, t3 = sl_256(t0, t1, t2, t3)
+		t0, t1 = reduce_f128(t0, t1, t2, t3)
+		h3w, h3x := pbk(t0, t1)
+
+		// Compute h4 = h^4 = (h^2)^2
+		h4w, h4x := square_f128(h2w)
+
+		for l >= GHASH_STRIDE_BYTES_HW {
+			aw0 := intrinsics.unaligned_load((^x86.__m128i)(raw_data(buf)))
+			aw1 := intrinsics.unaligned_load((^x86.__m128i)(raw_data(buf[16:])))
+			aw2 := intrinsics.unaligned_load((^x86.__m128i)(raw_data(buf[32:])))
+			aw3 := intrinsics.unaligned_load((^x86.__m128i)(raw_data(buf[48:])))
+			aw0 = byteswap(aw0)
+			aw1 = byteswap(aw1)
+			aw2 = byteswap(aw2)
+			aw3 = byteswap(aw3)
+			buf, l = buf[GHASH_STRIDE_BYTES_HW:], l - GHASH_STRIDE_BYTES_HW
+
+			aw0 = x86._mm_xor_si128(aw0, yw)
+			ax1 := bk(aw1)
+			ax2 := bk(aw2)
+			ax3 := bk(aw3)
+			ax0 := bk(aw0)
+
+			t1 = x86._mm_xor_si128(
+				x86._mm_xor_si128(
+					x86._mm_clmulepi64_si128(aw0, h4w, 0x11),
+					x86._mm_clmulepi64_si128(aw1, h3w, 0x11)),
+				x86._mm_xor_si128(
+					x86._mm_clmulepi64_si128(aw2, h2w, 0x11),
+					x86._mm_clmulepi64_si128(aw3, h1w, 0x11)))
+			t3 = x86._mm_xor_si128(
+				x86._mm_xor_si128(
+					x86._mm_clmulepi64_si128(aw0, h4w, 0x00),
+					x86._mm_clmulepi64_si128(aw1, h3w, 0x00)),
+				x86._mm_xor_si128(
+					x86._mm_clmulepi64_si128(aw2, h2w, 0x00),
+					x86._mm_clmulepi64_si128(aw3, h1w, 0x00)))
+			t2 = x86._mm_xor_si128(
+				x86._mm_xor_si128(
+					x86._mm_clmulepi64_si128(ax0, h4x, 0x00),
+					x86._mm_clmulepi64_si128(ax1, h3x, 0x00)),
+				x86._mm_xor_si128(
+					x86._mm_clmulepi64_si128(ax2, h2x, 0x00),
+					x86._mm_clmulepi64_si128(ax3, h1x, 0x00)))
+			t2 = x86._mm_xor_si128(t2, x86._mm_xor_si128(t1, t3))
+			t0 = x86._mm_shuffle_epi32(t1, 0x0E)
+			t1 = x86._mm_xor_si128(t1, x86._mm_shuffle_epi32(t2, 0x0E))
+			t2 = x86._mm_xor_si128(t2, x86._mm_shuffle_epi32(t3, 0x0E))
+			t0, t1, t2, t3 = sl_256(t0, t1, t2, t3)
+			t0, t1 = reduce_f128(t0, t1, t2, t3)
+			yw = x86._mm_unpacklo_epi64(t1, t0)
+		}
+	}
+
+	// Process 1 block at a time
+	src: []byte
+	for l > 0 {
+		if l >= _aes.GHASH_BLOCK_SIZE {
+			src = buf
+			buf = buf[_aes.GHASH_BLOCK_SIZE:]
+			l -= _aes.GHASH_BLOCK_SIZE
+		} else {
+			tmp: [_aes.GHASH_BLOCK_SIZE]byte
+			copy(tmp[:], buf)
+			src = tmp[:]
+			l = 0
+		}
+
+		aw := intrinsics.unaligned_load((^x86.__m128i)(raw_data(src)))
+		aw = byteswap(aw)
+
+		aw = x86._mm_xor_si128(aw, yw)
+		ax := bk(aw)
+
+		t1 := x86._mm_clmulepi64_si128(aw, h1w, 0x11)
+		t3 := x86._mm_clmulepi64_si128(aw, h1w, 0x00)
+		t2 := x86._mm_clmulepi64_si128(ax, h1x, 0x00)
+		t2 = x86._mm_xor_si128(t2, x86._mm_xor_si128(t1, t3))
+		t0 := x86._mm_shuffle_epi32(t1, 0x0E)
+		t1 = x86._mm_xor_si128(t1, x86._mm_shuffle_epi32(t2, 0x0E))
+		t2 = x86._mm_xor_si128(t2, x86._mm_shuffle_epi32(t3, 0x0E))
+		t0, t1, t2, t3 = sl_256(t0, t1, t2, t3)
+		t0, t1 = reduce_f128(t0, t1, t2, t3)
+		yw = x86._mm_unpacklo_epi64(t1, t0)
+	}
+
+	// Write back the hash (dst, aka y)
+	yw = byteswap(yw)
+	intrinsics.unaligned_store((^x86.__m128i)(raw_data(dst)), yw)
+}

+ 178 - 0
core/crypto/_aes/hw_intel/hw_intel_keysched.odin

@@ -0,0 +1,178 @@
+// Copyright (c) 2017 Thomas Pornin <[email protected]>
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions
+// are met:
+//
+//   1. Redistributions of source code must retain the above copyright
+//      notice, this list of conditions and the following disclaimer.
+//
+// THIS SOFTWARE IS PROVIDED BY THE AUTHORS “AS IS” AND ANY EXPRESS OR
+// IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+// WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY
+// DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+// DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
+// GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+// WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+// THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+//+build amd64
+package aes_hw_intel
+
+import "base:intrinsics"
+import "core:crypto/_aes"
+import "core:mem"
+import "core:simd/x86"
+
+// Intel AES-NI based implementation.  Inspiration taken from BearSSL.
+//
+// Note: This assumes that the SROA optimization pass is enabled in order
+// to be anything resembling performant; otherwise, LLVM will not elide the
+// massive number of redundant loads/stores it generates for every
+// intrinsic call.
+
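+// One round of the AES-128 key expansion: aeskeygenassist supplies
+// SubWord(RotWord(w)) ^ Rcon (broadcast to all lanes by the 0xff shuffle),
+// and the slli/xor cascade folds in the previous round key's words.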
+@(private = "file", require_results, enable_target_feature = "sse2")
+expand_step128 :: #force_inline proc(k1, k2: x86.__m128i) -> x86.__m128i {
+	k1, k2 := k1, k2
+
+	k2 = x86._mm_shuffle_epi32(k2, 0xff)
+	k1 = x86._mm_xor_si128(k1, x86._mm_slli_si128(k1, 0x04))
+	k1 = x86._mm_xor_si128(k1, x86._mm_slli_si128(k1, 0x04))
+	k1 = x86._mm_xor_si128(k1, x86._mm_slli_si128(k1, 0x04))
+	return x86._mm_xor_si128(k1, k2)
+}
+
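+// The AES-192 key schedule does not produce whole 128-bit round keys per
+// expansion step: the "a" step below yields two round keys, the "b" step
+// yields one.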
+@(private = "file", require_results, enable_target_feature = "sse,sse2")
+expand_step192a :: #force_inline proc (k1_, k2_: ^x86.__m128i, k3: x86.__m128i) -> (x86.__m128i, x86.__m128i) {
+	k1, k2, k3 := k1_^, k2_^, k3
+
+	k3 = x86._mm_shuffle_epi32(k3, 0x55)
+	k1 = x86._mm_xor_si128(k1, x86._mm_slli_si128(k1, 0x04))
+	k1 = x86._mm_xor_si128(k1, x86._mm_slli_si128(k1, 0x04))
+	k1 = x86._mm_xor_si128(k1, x86._mm_slli_si128(k1, 0x04))
+	k1 = x86._mm_xor_si128(k1, k3)
+
+	tmp := k2
+	k2 = x86._mm_xor_si128(k2, x86._mm_slli_si128(k2, 0x04))
+	k2 = x86._mm_xor_si128(k2, x86._mm_shuffle_epi32(k1, 0xff))
+
+	k1_, k2_ := k1_, k2_
+	k1_^, k2_^ = k1, k2
+
+	r1 := transmute(x86.__m128i)(x86._mm_shuffle_ps(transmute(x86.__m128)(tmp), transmute(x86.__m128)(k1), 0x44))
+	r2 := transmute(x86.__m128i)(x86._mm_shuffle_ps(transmute(x86.__m128)(k1), transmute(x86.__m128)(k2), 0x4e))
+
+	return r1, r2
+}
+
+@(private = "file", require_results, enable_target_feature = "sse2")
+expand_step192b :: #force_inline proc (k1_, k2_: ^x86.__m128i, k3: x86.__m128i) -> x86.__m128i {
+	k1, k2, k3 := k1_^, k2_^, k3
+
+	k3 = x86._mm_shuffle_epi32(k3, 0x55)
+	k1 = x86._mm_xor_si128(k1, x86._mm_slli_si128(k1, 0x04))
+	k1 = x86._mm_xor_si128(k1, x86._mm_slli_si128(k1, 0x04))
+	k1 = x86._mm_xor_si128(k1, x86._mm_slli_si128(k1, 0x04))
+	k1 = x86._mm_xor_si128(k1, k3)
+
+	k2 = x86._mm_xor_si128(k2, x86._mm_slli_si128(k2, 0x04))
+	k2 = x86._mm_xor_si128(k2, x86._mm_shuffle_epi32(k1, 0xff))
+
+	k1_, k2_ := k1_, k2_
+	k1_^, k2_^ = k1, k2
+
+	return k1
+}
+
+@(private = "file", require_results, enable_target_feature = "sse2")
+expand_step256b :: #force_inline proc(k1, k2: x86.__m128i) -> x86.__m128i {
+	k1, k2 := k1, k2
+
+	k2 = x86._mm_shuffle_epi32(k2, 0xaa)
+	k1 = x86._mm_xor_si128(k1, x86._mm_slli_si128(k1, 0x04))
+	k1 = x86._mm_xor_si128(k1, x86._mm_slli_si128(k1, 0x04))
+	k1 = x86._mm_xor_si128(k1, x86._mm_slli_si128(k1, 0x04))
+	return x86._mm_xor_si128(k1, k2)
+}
+
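+// derive_dec_keys computes the round keys for the Equivalent Inverse
+// Cipher: the encryption round keys in reverse order, with AESIMC
+// (InvMixColumns) applied to every key except the first and last.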
+@(private = "file", enable_target_feature = "aes")
+derive_dec_keys :: proc(ctx: ^Context, sks: ^[15]x86.__m128i, num_rounds: int) {
+	intrinsics.unaligned_store((^x86.__m128i)(&ctx._sk_exp_dec[0]), sks[num_rounds])
+	for i in 1 ..< num_rounds {
+		tmp := x86._mm_aesimc_si128(sks[i])
+		intrinsics.unaligned_store((^x86.__m128i)(&ctx._sk_exp_dec[num_rounds - i]), tmp)
+	}
+	intrinsics.unaligned_store((^x86.__m128i)(&ctx._sk_exp_dec[num_rounds]), sks[0])
+}
+
+@(private, enable_target_feature = "sse,sse2,aes")
+keysched :: proc(ctx: ^Context, key: []byte) {
+	sks: [15]x86.__m128i = ---
+
+	// Compute the encryption keys.
+	num_rounds, key_len := 0, len(key)
+	switch key_len {
+	case _aes.KEY_SIZE_128:
+		sks[0] = intrinsics.unaligned_load((^x86.__m128i)(raw_data(key)))
+		sks[1] = expand_step128(sks[0], x86._mm_aeskeygenassist_si128(sks[0], 0x01))
+		sks[2] = expand_step128(sks[1], x86._mm_aeskeygenassist_si128(sks[1], 0x02))
+		sks[3] = expand_step128(sks[2], x86._mm_aeskeygenassist_si128(sks[2], 0x04))
+		sks[4] = expand_step128(sks[3], x86._mm_aeskeygenassist_si128(sks[3], 0x08))
+		sks[5] = expand_step128(sks[4], x86._mm_aeskeygenassist_si128(sks[4], 0x10))
+		sks[6] = expand_step128(sks[5], x86._mm_aeskeygenassist_si128(sks[5], 0x20))
+		sks[7] = expand_step128(sks[6], x86._mm_aeskeygenassist_si128(sks[6], 0x40))
+		sks[8] = expand_step128(sks[7], x86._mm_aeskeygenassist_si128(sks[7], 0x80))
+		sks[9] = expand_step128(sks[8], x86._mm_aeskeygenassist_si128(sks[8], 0x1b))
+		sks[10] = expand_step128(sks[9], x86._mm_aeskeygenassist_si128(sks[9], 0x36))
+		num_rounds = _aes.ROUNDS_128
+	case _aes.KEY_SIZE_192:
+		k0 := intrinsics.unaligned_load((^x86.__m128i)(raw_data(key)))
+		k1 := x86.__m128i{
+			intrinsics.unaligned_load((^i64)(raw_data(key[16:]))),
+			0,
+		}
+		sks[0] = k0
+		sks[1], sks[2] = expand_step192a(&k0, &k1, x86._mm_aeskeygenassist_si128(k1, 0x01))
+		sks[3] = expand_step192b(&k0, &k1, x86._mm_aeskeygenassist_si128(k1, 0x02))
+		sks[4], sks[5] = expand_step192a(&k0, &k1, x86._mm_aeskeygenassist_si128(k1, 0x04))
+		sks[6] = expand_step192b(&k0, &k1, x86._mm_aeskeygenassist_si128(k1, 0x08))
+		sks[7], sks[8] = expand_step192a(&k0, &k1, x86._mm_aeskeygenassist_si128(k1, 0x10))
+		sks[9] = expand_step192b(&k0, &k1, x86._mm_aeskeygenassist_si128(k1, 0x20))
+		sks[10], sks[11] = expand_step192a(&k0, &k1, x86._mm_aeskeygenassist_si128(k1, 0x40))
+		sks[12] = expand_step192b(&k0, &k1, x86._mm_aeskeygenassist_si128(k1, 0x80))
+		num_rounds = _aes.ROUNDS_192
+	case _aes.KEY_SIZE_256:
+		sks[0] = intrinsics.unaligned_load((^x86.__m128i)(raw_data(key)))
+		sks[1] = intrinsics.unaligned_load((^x86.__m128i)(raw_data(key[16:])))
+		sks[2] = expand_step128(sks[0], x86._mm_aeskeygenassist_si128(sks[1], 0x01))
+		sks[3] = expand_step256b(sks[1], x86._mm_aeskeygenassist_si128(sks[2], 0x01))
+		sks[4] = expand_step128(sks[2], x86._mm_aeskeygenassist_si128(sks[3], 0x02))
+		sks[5] = expand_step256b(sks[3], x86._mm_aeskeygenassist_si128(sks[4], 0x02))
+		sks[6] = expand_step128(sks[4], x86._mm_aeskeygenassist_si128(sks[5], 0x04))
+		sks[7] = expand_step256b(sks[5], x86._mm_aeskeygenassist_si128(sks[6], 0x04))
+		sks[8] = expand_step128(sks[6], x86._mm_aeskeygenassist_si128(sks[7], 0x08))
+		sks[9] = expand_step256b(sks[7], x86._mm_aeskeygenassist_si128(sks[8], 0x08))
+		sks[10] = expand_step128(sks[8], x86._mm_aeskeygenassist_si128(sks[9], 0x10))
+		sks[11] = expand_step256b(sks[9], x86._mm_aeskeygenassist_si128(sks[10], 0x10))
+		sks[12] = expand_step128(sks[10], x86._mm_aeskeygenassist_si128(sks[11], 0x20))
+		sks[13] = expand_step256b(sks[11], x86._mm_aeskeygenassist_si128(sks[12], 0x20))
+		sks[14] = expand_step128(sks[12], x86._mm_aeskeygenassist_si128(sks[13], 0x40))
+		num_rounds = _aes.ROUNDS_256
+	case:
+		panic("crypto/aes: invalid AES key size")
+	}
+	for i in 0 ..= num_rounds {
+		intrinsics.unaligned_store((^x86.__m128i)(&ctx._sk_exp_enc[i]), sks[i])
+	}
+
+	// Compute the decryption keys.  GCM and CTR do not need these;
+	// however, ECB, CBC, OCB3, etc. do.
+	derive_dec_keys(ctx, &sks, num_rounds)
+
+	ctx._num_rounds = num_rounds
+
+	mem.zero_explicit(&sks, size_of(sks))
+}

+ 12 - 12
core/crypto/aes/aes_ctr.odin

@@ -125,8 +125,8 @@ reset_ctr :: proc "contextless" (ctx: ^Context_CTR) {
 	ctx._is_initialized = false
 }
 
-@(private)
-ctr_blocks :: proc(ctx: ^Context_CTR, dst, src: []byte, nr_blocks: int) {
+@(private = "file")
+ctr_blocks :: proc(ctx: ^Context_CTR, dst, src: []byte, nr_blocks: int) #no_bounds_check {
 	// Use the optimized hardware implementation if available.
 	if _, is_hw := ctx._impl.(Context_Impl_Hardware); is_hw {
 		ctr_blocks_hw(ctx, dst, src, nr_blocks)
@@ -185,17 +185,17 @@ xor_blocks :: #force_inline proc "contextless" (dst, src: []byte, blocks: [][]by
 	// performance of this implementation matters to where that
 	// optimization would be worth it, use chacha20poly1305, or a
 	// CPU that isn't e-waste.
-	if src != nil {
-		#no_bounds_check {
-			for i in 0 ..< len(blocks) {
-				off := i * BLOCK_SIZE
-				for j in 0 ..< BLOCK_SIZE {
-					blocks[i][j] ~= src[off + j]
+	#no_bounds_check {
+		if src != nil {
+				for i in 0 ..< len(blocks) {
+					off := i * BLOCK_SIZE
+					for j in 0 ..< BLOCK_SIZE {
+						blocks[i][j] ~= src[off + j]
+					}
 				}
-			}
 		}
-	}
-	for i in 0 ..< len(blocks) {
-		copy(dst[i * BLOCK_SIZE:], blocks[i])
+		for i in 0 ..< len(blocks) {
+			copy(dst[i * BLOCK_SIZE:], blocks[i])
+		}
 	}
 }

+ 151 - 0
core/crypto/aes/aes_ctr_hw_intel.odin

@@ -0,0 +1,151 @@
+//+build amd64
+package aes
+
+import "base:intrinsics"
+import "core:crypto/_aes"
+import "core:math/bits"
+import "core:mem"
+import "core:simd/x86"
+
+@(private)
+CTR_STRIDE_HW :: 4
+@(private)
+CTR_STRIDE_BYTES_HW :: CTR_STRIDE_HW * BLOCK_SIZE
+
+@(private, enable_target_feature = "sse2,aes")
+ctr_blocks_hw :: proc(ctx: ^Context_CTR, dst, src: []byte, nr_blocks: int) #no_bounds_check {
+	hw_ctx := ctx._impl.(Context_Impl_Hardware)
+
+	sks: [15]x86.__m128i = ---
+	for i in 0 ..= hw_ctx._num_rounds {
+		sks[i] = intrinsics.unaligned_load((^x86.__m128i)(&hw_ctx._sk_exp_enc[i]))
+	}
+
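+	// The 128-bit big-endian counter is kept as two host-endian u64
+	// halves (hi is most significant) and incremented with carry;
+	// byte_swap restores big-endian byte order when forming the block.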
+	hw_inc_ctr := #force_inline proc "contextless" (hi, lo: u64) -> (x86.__m128i, u64, u64) {
+		ret := x86.__m128i{
+			i64(intrinsics.byte_swap(hi)),
+			i64(intrinsics.byte_swap(lo)),
+		}
+
+		hi, lo := hi, lo
+		carry: u64
+
+		lo, carry = bits.add_u64(lo, 1, 0)
+		hi, _ = bits.add_u64(hi, 0, carry)
+		return ret, hi, lo
+	}
+
+	// The latency of AESENC depends on the manufacturer and microarchitecture:
+	// - 7 -> up to Broadwell
+	// - 4 -> AMD and Skylake - Cascade Lake
+	// - 3 -> Ice Lake and newer
+	//
+	// This implementation does 4 blocks at once, since performance
+	// should be "adequate" across most CPUs.
+
+	src, dst := src, dst
+	nr_blocks := nr_blocks
+	ctr_hi, ctr_lo := ctx._ctr_hi, ctx._ctr_lo
+
+	blks: [CTR_STRIDE_HW]x86.__m128i = ---
+	for nr_blocks >= CTR_STRIDE_HW {
+		#unroll for i in 0 ..< CTR_STRIDE_HW {
+			blks[i], ctr_hi, ctr_lo = hw_inc_ctr(ctr_hi, ctr_lo)
+		}
+
+		#unroll for i in 0 ..< CTR_STRIDE_HW {
+			blks[i] = x86._mm_xor_si128(blks[i], sks[0])
+		}
+		#unroll for i in 1 ..= 9 {
+			#unroll for j in 0 ..< CTR_STRIDE_HW {
+				blks[j] = x86._mm_aesenc_si128(blks[j], sks[i])
+			}
+		}
+		switch hw_ctx._num_rounds {
+		case _aes.ROUNDS_128:
+			#unroll for i in 0 ..< CTR_STRIDE_HW {
+				blks[i] = x86._mm_aesenclast_si128(blks[i], sks[10])
+			}
+		case _aes.ROUNDS_192:
+			#unroll for i in 10 ..= 11 {
+				#unroll for j in 0 ..< CTR_STRIDE_HW {
+					blks[j] = x86._mm_aesenc_si128(blks[j], sks[i])
+				}
+			}
+			#unroll for i in 0 ..< CTR_STRIDE_HW {
+				blks[i] = x86._mm_aesenclast_si128(blks[i], sks[12])
+			}
+		case _aes.ROUNDS_256:
+			#unroll for i in 10 ..= 13 {
+				#unroll for j in 0 ..< CTR_STRIDE_HW {
+					blks[j] = x86._mm_aesenc_si128(blks[j], sks[i])
+				}
+			}
+			#unroll for i in 0 ..< CTR_STRIDE_HW {
+				blks[i] = x86._mm_aesenclast_si128(blks[i], sks[14])
+			}
+		}
+
+		xor_blocks_hw(dst, src, blks[:])
+
+		if src != nil {
+			src = src[CTR_STRIDE_BYTES_HW:]
+		}
+		dst = dst[CTR_STRIDE_BYTES_HW:]
+		nr_blocks -= CTR_STRIDE_HW
+	}
+
+	// Handle the remainder.
+	for nr_blocks > 0 {
+		blks[0], ctr_hi, ctr_lo = hw_inc_ctr(ctr_hi, ctr_lo)
+
+		blks[0] = x86._mm_xor_si128(blks[0], sks[0])
+		#unroll for i in 1 ..= 9 {
+			blks[0] = x86._mm_aesenc_si128(blks[0], sks[i])
+		}
+		switch hw_ctx._num_rounds {
+		case _aes.ROUNDS_128:
+			blks[0] = x86._mm_aesenclast_si128(blks[0], sks[10])
+		case _aes.ROUNDS_192:
+			#unroll for i in 10 ..= 11 {
+				blks[0] = x86._mm_aesenc_si128(blks[0], sks[i])
+			}
+			blks[0] = x86._mm_aesenclast_si128(blks[0], sks[12])
+		case _aes.ROUNDS_256:
+			#unroll for i in 10 ..= 13 {
+				blks[0] = x86._mm_aesenc_si128(blks[0], sks[i])
+			}
+			blks[0] = x86._mm_aesenclast_si128(blks[0], sks[14])
+		}
+
+		xor_blocks_hw(dst, src, blks[:1])
+
+		if src != nil {
+			src = src[BLOCK_SIZE:]
+		}
+		dst = dst[BLOCK_SIZE:]
+		nr_blocks -= 1
+	}
+
+	// Write back the counter.
+	ctx._ctr_hi, ctx._ctr_lo = ctr_hi, ctr_lo
+
+	mem.zero_explicit(&blks, size_of(blks))
+	mem.zero_explicit(&sks, size_of(sks))
+}
+
+@(private, enable_target_feature = "sse2")
+xor_blocks_hw :: proc(dst, src: []byte, blocks: []x86.__m128i) {
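+	// When src is nil, the blocks (i.e. the raw keystream) are written
+	// to dst unmodified.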
+	#no_bounds_check {
+		if src != nil {
+			for i in 0 ..< len(blocks) {
+				off := i * BLOCK_SIZE
+				tmp := intrinsics.unaligned_load((^x86.__m128i)(raw_data(src[off:])))
+				blocks[i] = x86._mm_xor_si128(blocks[i], tmp)
+			}
+		}
+		for i in 0 ..< len(blocks) {
+			intrinsics.unaligned_store((^x86.__m128i)(raw_data(dst[i * BLOCK_SIZE:])), blocks[i])
+		}
+	}
+}

+ 58 - 0
core/crypto/aes/aes_ecb_hw_intel.odin

@@ -0,0 +1,58 @@
+//+build amd64
+package aes
+
+import "base:intrinsics"
+import "core:crypto/_aes"
+import "core:simd/x86"
+
+@(private, enable_target_feature = "sse2,aes")
+encrypt_block_hw :: proc(ctx: ^Context_Impl_Hardware, dst, src: []byte) {
+	blk := intrinsics.unaligned_load((^x86.__m128i)(raw_data(src)))
+
+	blk = x86._mm_xor_si128(blk, intrinsics.unaligned_load((^x86.__m128i)(&ctx._sk_exp_enc[0])))
+	#unroll for i in 1 ..= 9 {
+		blk = x86._mm_aesenc_si128(blk, intrinsics.unaligned_load((^x86.__m128i)(&ctx._sk_exp_enc[i])))
+	}
+	switch ctx._num_rounds {
+	case _aes.ROUNDS_128:
+		blk = x86._mm_aesenclast_si128(blk, intrinsics.unaligned_load((^x86.__m128i)(&ctx._sk_exp_enc[10])))
+	case _aes.ROUNDS_192:
+		#unroll for i in 10 ..= 11 {
+			blk = x86._mm_aesenc_si128(blk, intrinsics.unaligned_load((^x86.__m128i)(&ctx._sk_exp_enc[i])))
+		}
+		blk = x86._mm_aesenclast_si128(blk, intrinsics.unaligned_load((^x86.__m128i)(&ctx._sk_exp_enc[12])))
+	case _aes.ROUNDS_256:
+		#unroll for i in 10 ..= 13 {
+			blk = x86._mm_aesenc_si128(blk, intrinsics.unaligned_load((^x86.__m128i)(&ctx._sk_exp_enc[i])))
+		}
+		blk = x86._mm_aesenclast_si128(blk, intrinsics.unaligned_load((^x86.__m128i)(&ctx._sk_exp_enc[14])))
+	}
+
+	intrinsics.unaligned_store((^x86.__m128i)(raw_data(dst)), blk)
+}
+
+@(private, enable_target_feature = "sse2,aes")
+decrypt_block_hw :: proc(ctx: ^Context_Impl_Hardware, dst, src: []byte) {
+	blk := intrinsics.unaligned_load((^x86.__m128i)(raw_data(src)))
+
+	blk = x86._mm_xor_si128(blk, intrinsics.unaligned_load((^x86.__m128i)(&ctx._sk_exp_dec[0])))
+	#unroll for i in 1 ..= 9 {
+		blk = x86._mm_aesdec_si128(blk, intrinsics.unaligned_load((^x86.__m128i)(&ctx._sk_exp_dec[i])))
+	}
+	switch ctx._num_rounds {
+	case _aes.ROUNDS_128:
+		blk = x86._mm_aesdeclast_si128(blk, intrinsics.unaligned_load((^x86.__m128i)(&ctx._sk_exp_dec[10])))
+	case _aes.ROUNDS_192:
+		#unroll for i in 10 ..= 11 {
+			blk = x86._mm_aesdec_si128(blk, intrinsics.unaligned_load((^x86.__m128i)(&ctx._sk_exp_dec[i])))
+		}
+		blk = x86._mm_aesdeclast_si128(blk, intrinsics.unaligned_load((^x86.__m128i)(&ctx._sk_exp_dec[12])))
+	case _aes.ROUNDS_256:
+		#unroll for i in 10 ..= 13 {
+			blk = x86._mm_aesdec_si128(blk, intrinsics.unaligned_load((^x86.__m128i)(&ctx._sk_exp_dec[i])))
+		}
+		blk = x86._mm_aesdeclast_si128(blk, intrinsics.unaligned_load((^x86.__m128i)(&ctx._sk_exp_dec[14])))
+	}
+
+	intrinsics.unaligned_store((^x86.__m128i)(raw_data(dst)), blk)
+}

+ 2 - 5
core/crypto/aes/aes_gcm.odin

@@ -113,7 +113,7 @@ reset_gcm :: proc "contextless" (ctx: ^Context_GCM) {
 	ctx._is_initialized = false
 }
 
-@(private)
+@(private = "file")
 gcm_validate_common_slice_sizes :: proc(tag, nonce, aad, text: []byte) {
 	if len(tag) != GCM_TAG_SIZE {
 		panic("crypto/aes: invalid GCM tag size")
@@ -184,7 +184,7 @@ gctr_ct64 :: proc(
 	h: ^[_aes.GHASH_KEY_SIZE]byte,
 	nonce: []byte,
 	is_seal: bool,
-) {
+) #no_bounds_check {
 	ct64_inc_ctr32 := #force_inline proc "contextless" (dst: []byte, ctr: u32) -> u32 {
 		endian.unchecked_put_u32be(dst[12:], ctr)
 		return ctr + 1
@@ -206,9 +206,6 @@ gctr_ct64 :: proc(
 		copy(ctrs[i], nonce)
 	}
 
-	// We stitch the GCTR and GHASH operations together, so that only
-	// one pass over the ciphertext is required.
-
 	impl := &ctx._impl.(ct64.Context)
 	src, dst := src, dst
 

+ 231 - 0
core/crypto/aes/aes_gcm_hw_intel.odin

@@ -0,0 +1,231 @@
+//+build amd64
+package aes
+
+import "base:intrinsics"
+import "core:crypto"
+import "core:crypto/_aes"
+import "core:crypto/_aes/hw_intel"
+import "core:encoding/endian"
+import "core:mem"
+import "core:simd/x86"
+
+@(private)
+gcm_seal_hw :: proc(ctx: ^Context_Impl_Hardware, dst, tag, nonce, aad, plaintext: []byte) {
+	h: [_aes.GHASH_KEY_SIZE]byte
+	j0: [_aes.GHASH_BLOCK_SIZE]byte
+	s: [_aes.GHASH_TAG_SIZE]byte
+	init_ghash_hw(ctx, &h, &j0, nonce)
+
+	// Note: Our GHASH implementation handles appending padding.
+	hw_intel.ghash(s[:], h[:], aad)
+	gctr_hw(ctx, dst, &s, plaintext, &h, nonce, true)
+	final_ghash_hw(&s, &h, &j0, len(aad), len(plaintext))
+	copy(tag, s[:])
+
+	mem.zero_explicit(&h, len(h))
+	mem.zero_explicit(&j0, len(j0))
+}
+
+@(private)
+gcm_open_hw :: proc(ctx: ^Context_Impl_Hardware, dst, nonce, aad, ciphertext, tag: []byte) -> bool {
+	h: [_aes.GHASH_KEY_SIZE]byte
+	j0: [_aes.GHASH_BLOCK_SIZE]byte
+	s: [_aes.GHASH_TAG_SIZE]byte
+	init_ghash_hw(ctx, &h, &j0, nonce)
+
+	hw_intel.ghash(s[:], h[:], aad)
+	gctr_hw(ctx, dst, &s, ciphertext, &h, nonce, false)
+	final_ghash_hw(&s, &h, &j0, len(aad), len(ciphertext))
+
+	ok := crypto.compare_constant_time(s[:], tag) == 1
+	if !ok {
+		mem.zero_explicit(raw_data(dst), len(dst))
+	}
+
+	mem.zero_explicit(&h, len(h))
+	mem.zero_explicit(&j0, len(j0))
+	mem.zero_explicit(&s, len(s))
+
+	return ok
+}
+
+@(private = "file")
+init_ghash_hw :: proc(
+	ctx: ^Context_Impl_Hardware,
+	h: ^[_aes.GHASH_KEY_SIZE]byte,
+	j0: ^[_aes.GHASH_BLOCK_SIZE]byte,
+	nonce: []byte,
+) {
+	// 1. Let H = CIPH(k, 0^128)
+	encrypt_block_hw(ctx, h[:], h[:])
+
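+	// 2. J_0 = IV || 0^31 || 1 (only 96-bit IVs are supported).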
+	// ECB encrypt j0, so that we can just XOR with the tag.
+	copy(j0[:], nonce)
+	j0[_aes.GHASH_BLOCK_SIZE - 1] = 1
+	encrypt_block_hw(ctx, j0[:], j0[:])
+}
+
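+// final_ghash_hw absorbs the length block (the bit lengths of the AAD and
+// the text) into the GHASH state, then XORs in the (already encrypted) J_0
+// to form the tag.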
+@(private = "file", enable_target_feature = "sse2")
+final_ghash_hw :: proc(
+	s: ^[_aes.GHASH_BLOCK_SIZE]byte,
+	h: ^[_aes.GHASH_KEY_SIZE]byte,
+	j0: ^[_aes.GHASH_BLOCK_SIZE]byte,
+	a_len: int,
+	t_len: int,
+) {
+	blk: [_aes.GHASH_BLOCK_SIZE]byte
+	endian.unchecked_put_u64be(blk[0:], u64(a_len) * 8)
+	endian.unchecked_put_u64be(blk[8:], u64(t_len) * 8)
+
+	hw_intel.ghash(s[:], h[:], blk[:])
+	j0_vec := intrinsics.unaligned_load((^x86.__m128i)(j0))
+	s_vec := intrinsics.unaligned_load((^x86.__m128i)(s))
+	s_vec = x86._mm_xor_si128(s_vec, j0_vec)
+	intrinsics.unaligned_store((^x86.__m128i)(s), s_vec)
+}
+
+@(private = "file", enable_target_feature = "sse2,sse4.1,aes")
+gctr_hw :: proc(
+	ctx: ^Context_Impl_Hardware,
+	dst: []byte,
+	s: ^[_aes.GHASH_BLOCK_SIZE]byte,
+	src: []byte,
+	h: ^[_aes.GHASH_KEY_SIZE]byte,
+	nonce: []byte,
+	is_seal: bool,
+) #no_bounds_check {
+	sks: [15]x86.__m128i = ---
+	for i in 0 ..= ctx._num_rounds {
+		sks[i] = intrinsics.unaligned_load((^x86.__m128i)(&ctx._sk_exp_enc[i]))
+	}
+
+	// 2. Define a block J_0 as follows:
+	//    if len(IV) = 96, then let J0 = IV || 0^31 || 1
+	//
+	// Note: We only support 96 bit IVs.
+	tmp: [BLOCK_SIZE]byte
+	ctr_blk: x86.__m128i
+	copy(tmp[:], nonce)
+	ctr_blk = intrinsics.unaligned_load((^x86.__m128i)(&tmp))
+	ctr: u32 = 2
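+	// Counter value 1 is J_0 (used for the tag mask), so the keystream
+	// for the text starts at counter value 2.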
+
+	src, dst := src, dst
+
+	// Note: Instead of doing GHASH and CTR separately, it is more
+	// performant to interleave (stitch) the two operations together.
+	// This results in an unreadable mess, so we opt for simplicity
+	// as performance is adequate.
+
+	blks: [CTR_STRIDE_HW]x86.__m128i = ---
+	nr_blocks := len(src) / BLOCK_SIZE
+	for nr_blocks >= CTR_STRIDE_HW {
+		if !is_seal {
+			hw_intel.ghash(s[:], h[:], src[:CTR_STRIDE_BYTES_HW])
+		}
+
+		#unroll for i in 0 ..< CTR_STRIDE_HW {
+			blks[i], ctr = hw_inc_ctr32(&ctr_blk, ctr)
+		}
+
+		#unroll for i in 0 ..< CTR_STRIDE_HW {
+			blks[i] = x86._mm_xor_si128(blks[i], sks[0])
+		}
+		#unroll for i in 1 ..= 9 {
+			#unroll for j in 0 ..< CTR_STRIDE_HW {
+				blks[j] = x86._mm_aesenc_si128(blks[j], sks[i])
+			}
+		}
+		switch ctx._num_rounds {
+		case _aes.ROUNDS_128:
+			#unroll for i in 0 ..< CTR_STRIDE_HW {
+				blks[i] = x86._mm_aesenclast_si128(blks[i], sks[10])
+			}
+		case _aes.ROUNDS_192:
+			#unroll for i in 10 ..= 11 {
+				#unroll for j in 0 ..< CTR_STRIDE_HW {
+					blks[j] = x86._mm_aesenc_si128(blks[j], sks[i])
+				}
+			}
+			#unroll for i in 0 ..< CTR_STRIDE_HW {
+				blks[i] = x86._mm_aesenclast_si128(blks[i], sks[12])
+			}
+		case _aes.ROUNDS_256:
+			#unroll for i in 10 ..= 13 {
+				#unroll for j in 0 ..< CTR_STRIDE_HW {
+					blks[j] = x86._mm_aesenc_si128(blks[j], sks[i])
+				}
+			}
+			#unroll for i in 0 ..< CTR_STRIDE_HW {
+				blks[i] = x86._mm_aesenclast_si128(blks[i], sks[14])
+			}
+		}
+
+		xor_blocks_hw(dst, src, blks[:])
+
+		if is_seal {
+			hw_intel.ghash(s[:], h[:], dst[:CTR_STRIDE_BYTES_HW])
+		}
+
+		src = src[CTR_STRIDE_BYTES_HW:]
+		dst = dst[CTR_STRIDE_BYTES_HW:]
+		nr_blocks -= CTR_STRIDE_HW
+	}
+
+	// Handle the remainder.
+	for n := len(src); n > 0; {
+		l := min(n, BLOCK_SIZE)
+		if !is_seal {
+			hw_intel.ghash(s[:], h[:], src[:l])
+		}
+
+		blks[0], ctr = hw_inc_ctr32(&ctr_blk, ctr)
+
+		blks[0] = x86._mm_xor_si128(blks[0], sks[0])
+		#unroll for i in 1 ..= 9 {
+			blks[0] = x86._mm_aesenc_si128(blks[0], sks[i])
+		}
+		switch ctx._num_rounds {
+		case _aes.ROUNDS_128:
+			blks[0] = x86._mm_aesenclast_si128(blks[0], sks[10])
+		case _aes.ROUNDS_192:
+			#unroll for i in 10 ..= 11 {
+				blks[0] = x86._mm_aesenc_si128(blks[0], sks[i])
+			}
+			blks[0] = x86._mm_aesenclast_si128(blks[0], sks[12])
+		case _aes.ROUNDS_256:
+			#unroll for i in 10 ..= 13 {
+				blks[0] = x86._mm_aesenc_si128(blks[0], sks[i])
+			}
+			blks[0] = x86._mm_aesenclast_si128(blks[0], sks[14])
+		}
+
+		if l == BLOCK_SIZE {
+			xor_blocks_hw(dst, src, blks[:1])
+		} else {
+			blk: [BLOCK_SIZE]byte
+			copy(blk[:], src)
+			xor_blocks_hw(blk[:], blk[:], blks[:1])
+			copy(dst, blk[:l])
+		}
+		if is_seal {
+			hw_intel.ghash(s[:], h[:], dst[:l])
+		}
+
+		dst = dst[l:]
+		src = src[l:]
+		n -= l
+	}
+
+	mem.zero_explicit(&blks, size_of(blks))
+	mem.zero_explicit(&sks, size_of(sks))
+}
+
+// BUG: Sticking this in gctr_hw (like the other implementations) crashes
+// the compiler.
+//
+// src/check_expr.cpp(7892): Assertion Failure: `c->curr_proc_decl->entity`
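+//
+// hw_inc_ctr32 writes the big-endian 32-bit block counter into the last
+// 4 bytes of the counter block, as GCM only increments the low 32 bits.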
+@(private = "file", enable_target_feature = "sse4.1")
+hw_inc_ctr32 :: #force_inline proc "contextless" (src: ^x86.__m128i, ctr: u32) -> (x86.__m128i, u32) {
+	ret := x86._mm_insert_epi32(src^, i32(intrinsics.byte_swap(ctr)), 3)
+	return ret, ctr + 1
+}

+ 1 - 0
core/crypto/aes/aes_impl_hw_gen.odin

@@ -1,3 +1,4 @@
+//+build !amd64
 package aes
 
 @(private = "file")

+ 18 - 0
core/crypto/aes/aes_impl_hw_intel.odin

@@ -0,0 +1,18 @@
+//+build amd64
+package aes
+
+import "core:crypto/_aes/hw_intel"
+
+// is_hardware_accelerated returns true iff hardware accelerated AES
+// is supported.
+is_hardware_accelerated :: proc "contextless" () -> bool {
+	return hw_intel.is_supported()
+}
+
+@(private)
+Context_Impl_Hardware :: hw_intel.Context
+
+@(private, enable_target_feature = "sse2,aes")
+init_impl_hw :: proc(ctx: ^Context_Impl_Hardware, key: []byte) {
+	hw_intel.init(ctx, key)
+}

+ 2 - 2
core/simd/x86/aes.odin

@@ -28,7 +28,7 @@ _mm_aesimc_si128 :: #force_inline proc "c" (a: __m128i) -> __m128i {
 
 @(require_results, enable_target_feature = "aes")
 _mm_aeskeygenassist_si128 :: #force_inline proc "c" (a: __m128i, $IMM8: u8) -> __m128i {
-	return aeskeygenassist(a, u8(IMM8))
+	return aeskeygenassist(a, IMM8)
 }
 
 
@@ -45,5 +45,5 @@ foreign _ {
 	@(link_name = "llvm.x86.aesni.aesimc")
 	aesimc :: proc(a: __m128i) -> __m128i ---
 	@(link_name = "llvm.x86.aesni.aeskeygenassist")
-	aeskeygenassist :: proc(a: __m128i, imm8: u8) -> __m128i ---
+	aeskeygenassist :: proc(a: __m128i, #const imm8: u8) -> __m128i ---
 }

+ 5 - 7
tests/core/crypto/test_core_crypto_aes.odin

@@ -12,8 +12,6 @@ import "core:crypto/sha2"
 test_aes :: proc(t: ^testing.T) {
 	runtime.DEFAULT_TEMP_ALLOCATOR_TEMP_GUARD()
 
-	log.info("Testing AES")
-
 	impls := make([dynamic]aes.Implementation, 0, 2)
 	defer delete(impls)
 	append(&impls, aes.Implementation.Portable)
@@ -29,7 +27,7 @@ test_aes :: proc(t: ^testing.T) {
 }
 
 test_aes_ecb :: proc(t: ^testing.T, impl: aes.Implementation) {
-	log.infof("Testing AES-ECB/%v", impl)
+	log.debugf("Testing AES-ECB/%v", impl)
 
 	test_vectors := []struct {
 		key: string,
@@ -136,7 +134,7 @@ test_aes_ecb :: proc(t: ^testing.T, impl: aes.Implementation) {
 }
 
 test_aes_ctr :: proc(t: ^testing.T, impl: aes.Implementation) {
-	log.infof("Testing AES-CTR/%v", impl)
+	log.debugf("Testing AES-CTR/%v", impl)
 
 	test_vectors := []struct {
 		key: string,
@@ -200,7 +198,7 @@ test_aes_ctr :: proc(t: ^testing.T, impl: aes.Implementation) {
 	ctx: aes.Context_CTR
 	key: [aes.KEY_SIZE_256]byte
 	nonce: [aes.CTR_IV_SIZE]byte
-	aes.init_ctr(&ctx, key[:], nonce[:])
+	aes.init_ctr(&ctx, key[:], nonce[:], impl)
 
 	h_ctx: sha2.Context_512
 	sha2.init_512_256(&h_ctx)
@@ -226,7 +224,7 @@ test_aes_ctr :: proc(t: ^testing.T, impl: aes.Implementation) {
 }
 
 test_aes_gcm :: proc(t: ^testing.T, impl: aes.Implementation) {
-	log.infof("Testing AES-GCM/%v", impl)
+	log.debugf("Testing AES-GCM/%v", impl)
 
 	// NIST did a reorg of their site, so the source of the test vectors
 	// is only available from an archive.  The commented out tests are
@@ -431,7 +429,7 @@ test_aes_gcm :: proc(t: ^testing.T, impl: aes.Implementation) {
 		testing.expectf(
 			t,
 			ok && dst_str == v.plaintext,
-			"AES-GCM/%v: Expected: (%s, true) for open(%s, %s, %s, %s, %s), but got (%s, %s) instead",
+			"AES-GCM/%v: Expected: (%s, true) for open(%s, %s, %s, %s, %s), but got (%s, %v) instead",
 			impl,
 			v.plaintext,
 			v.key,