
MOAR AES V-TEC

Adam Ierymenko committed 5 years ago
parent commit 15e88a8b7e

1 changed file with 249 additions and 131 deletions

node/AES.cpp (+249, -131)
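This commit makes two changes to the AES-NI CTR path in node/AES.cpp. First, in the partial-block tail at the top of crypt(), the carry from the low counter word (c1) into the high word (c0) is now applied immediately after the counter block is materialized, rather than after the AES rounds and store. Second, the bulk loop is split on a single up-front test, `(c1 + len) > c1` (a conservative check, since len counts bytes rather than blocks): when the low 64-bit counter word cannot wrap within this call, which is the overwhelmingly common case, the 4-block and 1-block loops skip the per-block carry check entirely; otherwise a slower variant that checks for carry after every increment is used. A simplified sketch of this counter-handling pattern follows the diff.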

@@ -508,6 +508,7 @@ void AES::CTR::crypt(const void *const input,unsigned int len) noexcept
 				out[totalLen++] = *(in++);
 				if (!(totalLen & 15U)) {
 					__m128i d0 = _mm_set_epi64x((long long)Utils::hton(c1),(long long)c0);
+					if (unlikely(++c1 == 0ULL)) c0 = Utils::hton(Utils::ntoh(c0) + 1ULL);
 					d0 = _mm_xor_si128(d0,k0);
 					d0 = _mm_aesenc_si128(d0,k1);
 					d0 = _mm_aesenc_si128(d0,k2);
@@ -526,7 +527,6 @@ void AES::CTR::crypt(const void *const input,unsigned int len) noexcept
 					d0 = _mm_aesenc_si128(d0,k13);
 					d0 = _mm_aesenclast_si128(d0,k14);
 					_mm_storeu_si128(outblk,_mm_xor_si128(p0,d0));
-					if (unlikely(++c1 == 0ULL)) c0 = Utils::hton(Utils::ntoh(c0) + 1ULL);
 					break;
 				}
 			}
@@ -535,147 +535,265 @@ void AES::CTR::crypt(const void *const input,unsigned int len) noexcept
 		out += totalLen;
 		_len = (totalLen + len);
 
-		while (len >= 64) {
-			__m128i d0,d1,d2,d3;
-			if (likely(c1 < 0xfffffffffffffffcULL)) {
-				d0 = _mm_set_epi64x((long long)Utils::hton(c1),(long long)c0);
-				d1 = _mm_set_epi64x((long long)Utils::hton(c1 + 1ULL),(long long)c0);
-				d2 = _mm_set_epi64x((long long)Utils::hton(c1 + 2ULL),(long long)c0);
-				d3 = _mm_set_epi64x((long long)Utils::hton(c1 + 3ULL),(long long)c0);
+		if (likely((c1 + len) > c1)) { // it's incredibly likely that we can ignore carry in counter increment
+			while (len >= 64) {
+				__m128i d0 = _mm_set_epi64x((long long)Utils::hton(c1),(long long)c0);
+				__m128i d1 = _mm_set_epi64x((long long)Utils::hton(c1 + 1ULL),(long long)c0);
+				__m128i d2 = _mm_set_epi64x((long long)Utils::hton(c1 + 2ULL),(long long)c0);
+				__m128i d3 = _mm_set_epi64x((long long)Utils::hton(c1 + 3ULL),(long long)c0);
 				c1 += 4;
-			} else {
-				d0 = _mm_set_epi64x((long long)Utils::hton(c1),(long long)c0);
-				if (unlikely(++c1 == 0ULL)) c0 = Utils::hton(Utils::ntoh(c0) + 1ULL);
-				d1 = _mm_set_epi64x((long long)Utils::hton(c1),(long long)c0);
-				if (unlikely(++c1 == 0ULL)) c0 = Utils::hton(Utils::ntoh(c0) + 1ULL);
-				d2 = _mm_set_epi64x((long long)Utils::hton(c1),(long long)c0);
-				if (unlikely(++c1 == 0ULL)) c0 = Utils::hton(Utils::ntoh(c0) + 1ULL);
-				d3 = _mm_set_epi64x((long long)Utils::hton(c1),(long long)c0);
-				if (unlikely(++c1 == 0ULL)) c0 = Utils::hton(Utils::ntoh(c0) + 1ULL);
-			}
 
-			d0 = _mm_xor_si128(d0,k0);
-			d1 = _mm_xor_si128(d1,k0);
-			d2 = _mm_xor_si128(d2,k0);
-			d3 = _mm_xor_si128(d3,k0);
-			d0 = _mm_aesenc_si128(d0,k1);
-			d1 = _mm_aesenc_si128(d1,k1);
-			d2 = _mm_aesenc_si128(d2,k1);
-			d3 = _mm_aesenc_si128(d3,k1);
-			__m128i ka = k[6];
-			d0 = _mm_aesenc_si128(d0,k2);
-			d1 = _mm_aesenc_si128(d1,k2);
-			d2 = _mm_aesenc_si128(d2,k2);
-			d3 = _mm_aesenc_si128(d3,k2);
-			__m128i kb = k[7];
-			d0 = _mm_aesenc_si128(d0,k3);
-			d1 = _mm_aesenc_si128(d1,k3);
-			d2 = _mm_aesenc_si128(d2,k3);
-			d3 = _mm_aesenc_si128(d3,k3);
-			__m128i kc = k[8];
-			d0 = _mm_aesenc_si128(d0,k4);
-			d1 = _mm_aesenc_si128(d1,k4);
-			d2 = _mm_aesenc_si128(d2,k4);
-			d3 = _mm_aesenc_si128(d3,k4);
-			__m128i kd = k[9];
-			d0 = _mm_aesenc_si128(d0,k5);
-			d1 = _mm_aesenc_si128(d1,k5);
-			d2 = _mm_aesenc_si128(d2,k5);
-			d3 = _mm_aesenc_si128(d3,k5);
-			__m128i ke = k[10];
-			d0 = _mm_aesenc_si128(d0,ka);
-			d1 = _mm_aesenc_si128(d1,ka);
-			d2 = _mm_aesenc_si128(d2,ka);
-			d3 = _mm_aesenc_si128(d3,ka);
-			__m128i kf = k[11];
-			d0 = _mm_aesenc_si128(d0,kb);
-			d1 = _mm_aesenc_si128(d1,kb);
-			d2 = _mm_aesenc_si128(d2,kb);
-			d3 = _mm_aesenc_si128(d3,kb);
-			ka = k[12];
-			d0 = _mm_aesenc_si128(d0,kc);
-			d1 = _mm_aesenc_si128(d1,kc);
-			d2 = _mm_aesenc_si128(d2,kc);
-			d3 = _mm_aesenc_si128(d3,kc);
-			kb = k[13];
-			d0 = _mm_aesenc_si128(d0,kd);
-			d1 = _mm_aesenc_si128(d1,kd);
-			d2 = _mm_aesenc_si128(d2,kd);
-			d3 = _mm_aesenc_si128(d3,kd);
-			kc = k[14];
-			d0 = _mm_aesenc_si128(d0,ke);
-			d1 = _mm_aesenc_si128(d1,ke);
-			d2 = _mm_aesenc_si128(d2,ke);
-			d3 = _mm_aesenc_si128(d3,ke);
-			kd = _mm_loadu_si128(reinterpret_cast<const __m128i *>(in));
-			d0 = _mm_aesenc_si128(d0,kf);
-			d1 = _mm_aesenc_si128(d1,kf);
-			d2 = _mm_aesenc_si128(d2,kf);
-			d3 = _mm_aesenc_si128(d3,kf);
-			ke = _mm_loadu_si128(reinterpret_cast<const __m128i *>(in + 16));
-			d0 = _mm_aesenc_si128(d0,ka);
-			d1 = _mm_aesenc_si128(d1,ka);
-			d2 = _mm_aesenc_si128(d2,ka);
-			d3 = _mm_aesenc_si128(d3,ka);
-			kf = _mm_loadu_si128(reinterpret_cast<const __m128i *>(in + 32));
-			d0 = _mm_aesenc_si128(d0,kb);
-			d1 = _mm_aesenc_si128(d1,kb);
-			d2 = _mm_aesenc_si128(d2,kb);
-			d3 = _mm_aesenc_si128(d3,kb);
-			ka = _mm_loadu_si128(reinterpret_cast<const __m128i *>(in + 48));
-			d0 = _mm_aesenclast_si128(d0,kc);
-			d1 = _mm_aesenclast_si128(d1,kc);
-			d2 = _mm_aesenclast_si128(d2,kc);
-			d3 = _mm_aesenclast_si128(d3,kc);
-			kd = _mm_xor_si128(d0,kd);
-			ke = _mm_xor_si128(d1,ke);
-			kf = _mm_xor_si128(d2,kf);
-			ka = _mm_xor_si128(d3,ka);
-			_mm_storeu_si128(reinterpret_cast<__m128i *>(out),kd);
-			_mm_storeu_si128(reinterpret_cast<__m128i *>(out + 16),ke);
-			_mm_storeu_si128(reinterpret_cast<__m128i *>(out + 32),kf);
-			_mm_storeu_si128(reinterpret_cast<__m128i *>(out + 48),ka);
+				d0 = _mm_xor_si128(d0,k0);
+				d1 = _mm_xor_si128(d1,k0);
+				d2 = _mm_xor_si128(d2,k0);
+				d3 = _mm_xor_si128(d3,k0);
+				d0 = _mm_aesenc_si128(d0,k1);
+				d1 = _mm_aesenc_si128(d1,k1);
+				d2 = _mm_aesenc_si128(d2,k1);
+				d3 = _mm_aesenc_si128(d3,k1);
+				__m128i ka = k[6];
+				d0 = _mm_aesenc_si128(d0,k2);
+				d1 = _mm_aesenc_si128(d1,k2);
+				d2 = _mm_aesenc_si128(d2,k2);
+				d3 = _mm_aesenc_si128(d3,k2);
+				__m128i kb = k[7];
+				d0 = _mm_aesenc_si128(d0,k3);
+				d1 = _mm_aesenc_si128(d1,k3);
+				d2 = _mm_aesenc_si128(d2,k3);
+				d3 = _mm_aesenc_si128(d3,k3);
+				__m128i kc = k[8];
+				d0 = _mm_aesenc_si128(d0,k4);
+				d1 = _mm_aesenc_si128(d1,k4);
+				d2 = _mm_aesenc_si128(d2,k4);
+				d3 = _mm_aesenc_si128(d3,k4);
+				__m128i kd = k[9];
+				d0 = _mm_aesenc_si128(d0,k5);
+				d1 = _mm_aesenc_si128(d1,k5);
+				d2 = _mm_aesenc_si128(d2,k5);
+				d3 = _mm_aesenc_si128(d3,k5);
+				__m128i ke = k[10];
+				d0 = _mm_aesenc_si128(d0,ka);
+				d1 = _mm_aesenc_si128(d1,ka);
+				d2 = _mm_aesenc_si128(d2,ka);
+				d3 = _mm_aesenc_si128(d3,ka);
+				__m128i kf = k[11];
+				d0 = _mm_aesenc_si128(d0,kb);
+				d1 = _mm_aesenc_si128(d1,kb);
+				d2 = _mm_aesenc_si128(d2,kb);
+				d3 = _mm_aesenc_si128(d3,kb);
+				ka = k[12];
+				d0 = _mm_aesenc_si128(d0,kc);
+				d1 = _mm_aesenc_si128(d1,kc);
+				d2 = _mm_aesenc_si128(d2,kc);
+				d3 = _mm_aesenc_si128(d3,kc);
+				kb = k[13];
+				d0 = _mm_aesenc_si128(d0,kd);
+				d1 = _mm_aesenc_si128(d1,kd);
+				d2 = _mm_aesenc_si128(d2,kd);
+				d3 = _mm_aesenc_si128(d3,kd);
+				kc = k[14];
+				d0 = _mm_aesenc_si128(d0,ke);
+				d1 = _mm_aesenc_si128(d1,ke);
+				d2 = _mm_aesenc_si128(d2,ke);
+				d3 = _mm_aesenc_si128(d3,ke);
+				kd = _mm_loadu_si128(reinterpret_cast<const __m128i *>(in));
+				d0 = _mm_aesenc_si128(d0,kf);
+				d1 = _mm_aesenc_si128(d1,kf);
+				d2 = _mm_aesenc_si128(d2,kf);
+				d3 = _mm_aesenc_si128(d3,kf);
+				ke = _mm_loadu_si128(reinterpret_cast<const __m128i *>(in + 16));
+				d0 = _mm_aesenc_si128(d0,ka);
+				d1 = _mm_aesenc_si128(d1,ka);
+				d2 = _mm_aesenc_si128(d2,ka);
+				d3 = _mm_aesenc_si128(d3,ka);
+				kf = _mm_loadu_si128(reinterpret_cast<const __m128i *>(in + 32));
+				d0 = _mm_aesenc_si128(d0,kb);
+				d1 = _mm_aesenc_si128(d1,kb);
+				d2 = _mm_aesenc_si128(d2,kb);
+				d3 = _mm_aesenc_si128(d3,kb);
+				ka = _mm_loadu_si128(reinterpret_cast<const __m128i *>(in + 48));
+				d0 = _mm_aesenclast_si128(d0,kc);
+				d1 = _mm_aesenclast_si128(d1,kc);
+				d2 = _mm_aesenclast_si128(d2,kc);
+				d3 = _mm_aesenclast_si128(d3,kc);
+				kd = _mm_xor_si128(d0,kd);
+				ke = _mm_xor_si128(d1,ke);
+				kf = _mm_xor_si128(d2,kf);
+				ka = _mm_xor_si128(d3,ka);
+				_mm_storeu_si128(reinterpret_cast<__m128i *>(out),kd);
+				_mm_storeu_si128(reinterpret_cast<__m128i *>(out + 16),ke);
+				_mm_storeu_si128(reinterpret_cast<__m128i *>(out + 32),kf);
+				_mm_storeu_si128(reinterpret_cast<__m128i *>(out + 48),ka);
+
+				in += 64;
+				len -= 64;
+				out += 64;
+			}
 
-			in += 64;
-			len -= 64;
-			out += 64;
-		}
+			if (len >= 16) {
+				const __m128i k7 = k[7];
+				const __m128i k8 = k[8];
+				const __m128i k9 = k[9];
+				const __m128i k10 = k[10];
+				const __m128i k11 = k[11];
+				const __m128i k12 = k[12];
+				const __m128i k13 = k[13];
+				const __m128i k14 = k[14];
+				do {
+					__m128i d0 = _mm_set_epi64x((long long)Utils::hton(c1++),(long long)c0);
+					d0 = _mm_xor_si128(d0,k0);
+					d0 = _mm_aesenc_si128(d0,k1);
+					d0 = _mm_aesenc_si128(d0,k2);
+					d0 = _mm_aesenc_si128(d0,k3);
+					d0 = _mm_aesenc_si128(d0,k4);
+					d0 = _mm_aesenc_si128(d0,k5);
+					d0 = _mm_aesenc_si128(d0,k[6]);
+					d0 = _mm_aesenc_si128(d0,k7);
+					d0 = _mm_aesenc_si128(d0,k8);
+					d0 = _mm_aesenc_si128(d0,k9);
+					d0 = _mm_aesenc_si128(d0,k10);
+					d0 = _mm_aesenc_si128(d0,k11);
+					d0 = _mm_aesenc_si128(d0,k12);
+					d0 = _mm_aesenc_si128(d0,k13);
+					d0 = _mm_aesenclast_si128(d0,k14);
+					_mm_storeu_si128(reinterpret_cast<__m128i *>(out),_mm_xor_si128(d0,_mm_loadu_si128(reinterpret_cast<const __m128i *>(in))));
 
-		{
-			const __m128i k7 = k[7];
-			const __m128i k8 = k[8];
-			const __m128i k9 = k[9];
-			const __m128i k10 = k[10];
-			const __m128i k11 = k[11];
-			const __m128i k12 = k[12];
-			const __m128i k13 = k[13];
-			const __m128i k14 = k[14];
-			while (len >= 16) {
+					in += 16;
+					len -= 16;
+					out += 16;
+				} while (len >= 16);
+			}
+		} else {
+			while (len >= 64) {
 				__m128i d0 = _mm_set_epi64x((long long)Utils::hton(c1++),(long long)c0);
-				if (unlikely(c1 == 0)) {
-					c0 = Utils::hton(Utils::ntoh(c0) + 1ULL);
-					d0 = _mm_set_epi64x((long long)Utils::hton(c1),(long long)c0);
-				}
+				if (unlikely(c1 == 0ULL)) c0 = Utils::hton(Utils::ntoh(c0) + 1ULL);
+				__m128i d1 = _mm_set_epi64x((long long)Utils::hton(c1++),(long long)c0);
+				if (unlikely(c1 == 0ULL)) c0 = Utils::hton(Utils::ntoh(c0) + 1ULL);
+				__m128i d2 = _mm_set_epi64x((long long)Utils::hton(c1++),(long long)c0);
+				if (unlikely(c1 == 0ULL)) c0 = Utils::hton(Utils::ntoh(c0) + 1ULL);
+				__m128i d3 = _mm_set_epi64x((long long)Utils::hton(c1++),(long long)c0);
+				if (unlikely(c1 == 0ULL)) c0 = Utils::hton(Utils::ntoh(c0) + 1ULL);
+
 				d0 = _mm_xor_si128(d0,k0);
+				d1 = _mm_xor_si128(d1,k0);
+				d2 = _mm_xor_si128(d2,k0);
+				d3 = _mm_xor_si128(d3,k0);
 				d0 = _mm_aesenc_si128(d0,k1);
+				d1 = _mm_aesenc_si128(d1,k1);
+				d2 = _mm_aesenc_si128(d2,k1);
+				d3 = _mm_aesenc_si128(d3,k1);
+				__m128i ka = k[6];
 				d0 = _mm_aesenc_si128(d0,k2);
+				d1 = _mm_aesenc_si128(d1,k2);
+				d2 = _mm_aesenc_si128(d2,k2);
+				d3 = _mm_aesenc_si128(d3,k2);
+				__m128i kb = k[7];
 				d0 = _mm_aesenc_si128(d0,k3);
+				d1 = _mm_aesenc_si128(d1,k3);
+				d2 = _mm_aesenc_si128(d2,k3);
+				d3 = _mm_aesenc_si128(d3,k3);
+				__m128i kc = k[8];
 				d0 = _mm_aesenc_si128(d0,k4);
+				d1 = _mm_aesenc_si128(d1,k4);
+				d2 = _mm_aesenc_si128(d2,k4);
+				d3 = _mm_aesenc_si128(d3,k4);
+				__m128i kd = k[9];
 				d0 = _mm_aesenc_si128(d0,k5);
-				d0 = _mm_aesenc_si128(d0,k[6]);
-				d0 = _mm_aesenc_si128(d0,k7);
-				d0 = _mm_aesenc_si128(d0,k8);
-				d0 = _mm_aesenc_si128(d0,k9);
-				d0 = _mm_aesenc_si128(d0,k10);
-				d0 = _mm_aesenc_si128(d0,k11);
-				d0 = _mm_aesenc_si128(d0,k12);
-				d0 = _mm_aesenc_si128(d0,k13);
-				d0 = _mm_aesenclast_si128(d0,k14);
-				_mm_storeu_si128(reinterpret_cast<__m128i *>(out),_mm_xor_si128(d0,_mm_loadu_si128(reinterpret_cast<const __m128i *>(in))));
-
-				in += 16;
-				len -= 16;
-				out += 16;
+				d1 = _mm_aesenc_si128(d1,k5);
+				d2 = _mm_aesenc_si128(d2,k5);
+				d3 = _mm_aesenc_si128(d3,k5);
+				__m128i ke = k[10];
+				d0 = _mm_aesenc_si128(d0,ka);
+				d1 = _mm_aesenc_si128(d1,ka);
+				d2 = _mm_aesenc_si128(d2,ka);
+				d3 = _mm_aesenc_si128(d3,ka);
+				__m128i kf = k[11];
+				d0 = _mm_aesenc_si128(d0,kb);
+				d1 = _mm_aesenc_si128(d1,kb);
+				d2 = _mm_aesenc_si128(d2,kb);
+				d3 = _mm_aesenc_si128(d3,kb);
+				ka = k[12];
+				d0 = _mm_aesenc_si128(d0,kc);
+				d1 = _mm_aesenc_si128(d1,kc);
+				d2 = _mm_aesenc_si128(d2,kc);
+				d3 = _mm_aesenc_si128(d3,kc);
+				kb = k[13];
+				d0 = _mm_aesenc_si128(d0,kd);
+				d1 = _mm_aesenc_si128(d1,kd);
+				d2 = _mm_aesenc_si128(d2,kd);
+				d3 = _mm_aesenc_si128(d3,kd);
+				kc = k[14];
+				d0 = _mm_aesenc_si128(d0,ke);
+				d1 = _mm_aesenc_si128(d1,ke);
+				d2 = _mm_aesenc_si128(d2,ke);
+				d3 = _mm_aesenc_si128(d3,ke);
+				kd = _mm_loadu_si128(reinterpret_cast<const __m128i *>(in));
+				d0 = _mm_aesenc_si128(d0,kf);
+				d1 = _mm_aesenc_si128(d1,kf);
+				d2 = _mm_aesenc_si128(d2,kf);
+				d3 = _mm_aesenc_si128(d3,kf);
+				ke = _mm_loadu_si128(reinterpret_cast<const __m128i *>(in + 16));
+				d0 = _mm_aesenc_si128(d0,ka);
+				d1 = _mm_aesenc_si128(d1,ka);
+				d2 = _mm_aesenc_si128(d2,ka);
+				d3 = _mm_aesenc_si128(d3,ka);
+				kf = _mm_loadu_si128(reinterpret_cast<const __m128i *>(in + 32));
+				d0 = _mm_aesenc_si128(d0,kb);
+				d1 = _mm_aesenc_si128(d1,kb);
+				d2 = _mm_aesenc_si128(d2,kb);
+				d3 = _mm_aesenc_si128(d3,kb);
+				ka = _mm_loadu_si128(reinterpret_cast<const __m128i *>(in + 48));
+				d0 = _mm_aesenclast_si128(d0,kc);
+				d1 = _mm_aesenclast_si128(d1,kc);
+				d2 = _mm_aesenclast_si128(d2,kc);
+				d3 = _mm_aesenclast_si128(d3,kc);
+				kd = _mm_xor_si128(d0,kd);
+				ke = _mm_xor_si128(d1,ke);
+				kf = _mm_xor_si128(d2,kf);
+				ka = _mm_xor_si128(d3,ka);
+				_mm_storeu_si128(reinterpret_cast<__m128i *>(out),kd);
+				_mm_storeu_si128(reinterpret_cast<__m128i *>(out + 16),ke);
+				_mm_storeu_si128(reinterpret_cast<__m128i *>(out + 32),kf);
+				_mm_storeu_si128(reinterpret_cast<__m128i *>(out + 48),ka);
+
+				in += 64;
+				len -= 64;
+				out += 64;
+			}
+
+			if (len >= 16) {
+				const __m128i k7 = k[7];
+				const __m128i k8 = k[8];
+				const __m128i k9 = k[9];
+				const __m128i k10 = k[10];
+				const __m128i k11 = k[11];
+				const __m128i k12 = k[12];
+				const __m128i k13 = k[13];
+				const __m128i k14 = k[14];
+				do {
+					__m128i d0 = _mm_set_epi64x((long long)Utils::hton(c1++),(long long)c0);
+					if (unlikely(c1 == 0ULL)) c0 = Utils::hton(Utils::ntoh(c0) + 1ULL);
+					d0 = _mm_xor_si128(d0,k0);
+					d0 = _mm_aesenc_si128(d0,k1);
+					d0 = _mm_aesenc_si128(d0,k2);
+					d0 = _mm_aesenc_si128(d0,k3);
+					d0 = _mm_aesenc_si128(d0,k4);
+					d0 = _mm_aesenc_si128(d0,k5);
+					d0 = _mm_aesenc_si128(d0,k[6]);
+					d0 = _mm_aesenc_si128(d0,k7);
+					d0 = _mm_aesenc_si128(d0,k8);
+					d0 = _mm_aesenc_si128(d0,k9);
+					d0 = _mm_aesenc_si128(d0,k10);
+					d0 = _mm_aesenc_si128(d0,k11);
+					d0 = _mm_aesenc_si128(d0,k12);
+					d0 = _mm_aesenc_si128(d0,k13);
+					d0 = _mm_aesenclast_si128(d0,k14);
+					_mm_storeu_si128(reinterpret_cast<__m128i *>(out),_mm_xor_si128(d0,_mm_loadu_si128(reinterpret_cast<const __m128i *>(in))));
+
+					in += 16;
+					len -= 16;
+					out += 16;
+				} while (len >= 16);
 			}
 		}
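Below is a minimal, self-contained sketch of the same counter-handling idea, independent of the ZeroTier code above: a carry-free fast path taken when the low 64-bit counter word cannot wrap during the call, and a fallback that propagates carry into the high word after every block. It uses AES-128 with a plain round loop instead of the hand-unrolled AES-256 pipeline in the diff, and the bswap64 helper stands in for Utils::hton()/ntoh() on a little-endian host; treat it as an illustration of the branch structure, not a drop-in replacement.

	// Simplified CTR keystream driver illustrating the fast/slow counter split.
	// Build with AES-NI enabled (e.g. -maes -msse2). AES-128 shown for brevity;
	// the real code in node/AES.cpp uses AES-256 with fully unrolled rounds.
	#include <cstdint>
	#include <cstddef>
	#include <emmintrin.h>  // SSE2
	#include <wmmintrin.h>  // AES-NI

	// Byte-swap helper; stands in for Utils::hton()/ntoh() on a little-endian host.
	static inline uint64_t bswap64(uint64_t x)
	{
		x = ((x & 0x00ff00ff00ff00ffULL) << 8)  | ((x >> 8)  & 0x00ff00ff00ff00ffULL);
		x = ((x & 0x0000ffff0000ffffULL) << 16) | ((x >> 16) & 0x0000ffff0000ffffULL);
		return (x << 32) | (x >> 32);
	}

	// One AES-128 encryption of the counter block d with round keys k[0..10].
	static inline __m128i aesBlock(__m128i d, const __m128i k[11])
	{
		d = _mm_xor_si128(d, k[0]);
		for (int r = 1; r < 10; ++r)
			d = _mm_aesenc_si128(d, k[r]);
		return _mm_aesenclast_si128(d, k[10]);
	}

	// XOR "blocks" full 16-byte blocks of keystream into out. c0 is the big-endian
	// high 64 bits of the counter, c1 the low 64 bits in host byte order.
	static void ctrBlocks(const uint8_t *in, uint8_t *out, uint64_t blocks,
	                      uint64_t &c0, uint64_t &c1, const __m128i k[11])
	{
		if ((c1 + blocks) > c1) {
			// Fast path: c1 cannot wrap within this call, so no per-block carry check.
			while (blocks--) {
				__m128i d = _mm_set_epi64x((long long)bswap64(c1++), (long long)c0);
				__m128i p = _mm_loadu_si128(reinterpret_cast<const __m128i *>(in));
				_mm_storeu_si128(reinterpret_cast<__m128i *>(out), _mm_xor_si128(p, aesBlock(d, k)));
				in += 16; out += 16;
			}
		} else {
			// Slow path: propagate carry into the high word after every increment.
			while (blocks--) {
				__m128i d = _mm_set_epi64x((long long)bswap64(c1++), (long long)c0);
				if (c1 == 0) c0 = bswap64(bswap64(c0) + 1ULL);
				__m128i p = _mm_loadu_si128(reinterpret_cast<const __m128i *>(in));
				_mm_storeu_si128(reinterpret_cast<__m128i *>(out), _mm_xor_si128(p, aesBlock(d, k)));
				in += 16; out += 16;
			}
		}
	}

The point of hoisting the overflow test out of the loop is that the common case then contains no data-dependent branch per block, which keeps the pipelined AES rounds in the 4-block loop free of carry bookkeeping.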