Adam Ierymenko committed 6 years ago
commit 81f0175251
1 changed file with 103 additions and 91 deletions: node/AES.hpp

@@ -611,91 +611,103 @@ private:
 	}
 	inline void _decrypt_gcm256_aesni(unsigned int len,const uint8_t *in,uint8_t *out,const uint8_t *iv,unsigned int alen,const uint8_t *assoc,uint8_t *icv,unsigned int icvsize)
 	{
-		__m128i d1,d2,d3,d4,t1,t2,t3,t4,k;
-		__m128i y,j,cb,*bi,*bo;
-		unsigned int blocks,pblocks,rem;
+		__m128i j = _create_j_aesni(iv);
+		__m128i cb = _increment_be_aesni(j);
+		__m128i y = _icv_header_aesni(assoc,alen);
+		unsigned int blocks = len / 16;
+		unsigned int pblocks = blocks - (blocks % 4);
+		unsigned int rem = len % 16;
+		__m128i *bi = (__m128i *)in;
+		__m128i *bo = (__m128i *)out;
 
-		j = _create_j_aesni(iv);
-		cb = _increment_be_aesni(j);
-		y = _icv_header_aesni(assoc,alen);
-		blocks = len / 16;
-		pblocks = blocks - (blocks % 4);
-		rem = len % 16;
-		bi = (__m128i *)in;
-		bo = (__m128i *)out;
+		__m128i k0 = _k.ni.k[0];
+		__m128i k1 = _k.ni.k[1];
+		__m128i k2 = _k.ni.k[2];
+		__m128i k3 = _k.ni.k[3];
+		__m128i k4 = _k.ni.k[4];
+		__m128i k5 = _k.ni.k[5];
+		__m128i k6 = _k.ni.k[6];
+		__m128i k7 = _k.ni.k[7];
+		__m128i k8 = _k.ni.k[8];
+		__m128i k9 = _k.ni.k[9];
+		__m128i k10 = _k.ni.k[10];
+		__m128i k11 = _k.ni.k[11];
+		__m128i k12 = _k.ni.k[12];
+		__m128i k13 = _k.ni.k[13];
+		__m128i k14 = _k.ni.k[14];
 
 		unsigned int i;
 		for (i=0;i<pblocks;i+=4) {
-			d1 = _mm_loadu_si128(bi + i + 0);
-			d2 = _mm_loadu_si128(bi + i + 1);
-			d3 = _mm_loadu_si128(bi + i + 2);
-			d4 = _mm_loadu_si128(bi + i + 3);
+			__m128i d1 = _mm_loadu_si128(bi + i + 0);
+			__m128i d2 = _mm_loadu_si128(bi + i + 1);
+			__m128i d3 = _mm_loadu_si128(bi + i + 2);
+			__m128i d4 = _mm_loadu_si128(bi + i + 3);
 			y = _mm_xor_si128(y,d1);
 			y = _mult4xor_aesni(_k.ni.hhhh,_k.ni.hhh,_k.ni.hh,_k.ni.h,y,d2,d3,d4);
-			t1 = _mm_xor_si128(cb,k = _k.ni.k[0]);
+			__m128i t1 = _mm_xor_si128(cb,k0);
 			cb = _increment_be_aesni(cb);
-			t2 = _mm_xor_si128(cb,k);
+			__m128i t2 = _mm_xor_si128(cb,k0);
 			cb = _increment_be_aesni(cb);
-			t3 = _mm_xor_si128(cb,k);
+			__m128i t3 = _mm_xor_si128(cb,k0);
 			cb = _increment_be_aesni(cb);
-			t4 = _mm_xor_si128(cb,k);
+			__m128i t4 = _mm_xor_si128(cb,k0);
 			cb = _increment_be_aesni(cb);
-			t1 = _mm_aesenc_si128(t1,k = _k.ni.k[1]);
-			t2 = _mm_aesenc_si128(t2,k);
-			t3 = _mm_aesenc_si128(t3,k);
-			t4 = _mm_aesenc_si128(t4,k);
-			t1 = _mm_aesenc_si128(t1,k = _k.ni.k[2]);
-			t2 = _mm_aesenc_si128(t2,k);
-			t3 = _mm_aesenc_si128(t3,k);
-			t4 = _mm_aesenc_si128(t4,k);
-			t1 = _mm_aesenc_si128(t1,k = _k.ni.k[3]);
-			t2 = _mm_aesenc_si128(t2,k);
-			t3 = _mm_aesenc_si128(t3,k);
-			t4 = _mm_aesenc_si128(t4,k);
-			t1 = _mm_aesenc_si128(t1,k = _k.ni.k[4]);
-			t2 = _mm_aesenc_si128(t2,k);
-			t3 = _mm_aesenc_si128(t3,k);
-			t4 = _mm_aesenc_si128(t4,k);
-			t1 = _mm_aesenc_si128(t1,k = _k.ni.k[5]);
-			t2 = _mm_aesenc_si128(t2,k);
-			t3 = _mm_aesenc_si128(t3,k);
-			t4 = _mm_aesenc_si128(t4,k);
-			t1 = _mm_aesenc_si128(t1,k = _k.ni.k[6]);
-			t2 = _mm_aesenc_si128(t2,k);
-			t3 = _mm_aesenc_si128(t3,k);
-			t4 = _mm_aesenc_si128(t4,k);
-			t1 = _mm_aesenc_si128(t1,k = _k.ni.k[7]);
-			t2 = _mm_aesenc_si128(t2,k);
-			t3 = _mm_aesenc_si128(t3,k);
-			t4 = _mm_aesenc_si128(t4,k);
-			t1 = _mm_aesenc_si128(t1,k = _k.ni.k[8]);
-			t2 = _mm_aesenc_si128(t2,k);
-			t3 = _mm_aesenc_si128(t3,k);
-			t4 = _mm_aesenc_si128(t4,k);
-			t1 = _mm_aesenc_si128(t1,k = _k.ni.k[9]);
-			t2 = _mm_aesenc_si128(t2,k);
-			t3 = _mm_aesenc_si128(t3,k);
-			t4 = _mm_aesenc_si128(t4,k);
-			t1 = _mm_aesenc_si128(t1,k = _k.ni.k[10]);
-			t2 = _mm_aesenc_si128(t2,k);
-			t3 = _mm_aesenc_si128(t3,k);
-			t4 = _mm_aesenc_si128(t4,k);
-			t1 = _mm_aesenc_si128(t1,k = _k.ni.k[11]);
-			t2 = _mm_aesenc_si128(t2,k);
-			t3 = _mm_aesenc_si128(t3,k);
-			t4 = _mm_aesenc_si128(t4,k);
-			t1 = _mm_aesenc_si128(t1,k = _k.ni.k[12]);
-			t2 = _mm_aesenc_si128(t2,k);
-			t3 = _mm_aesenc_si128(t3,k);
-			t4 = _mm_aesenc_si128(t4,k);
-			t1 = _mm_aesenc_si128(t1,k = _k.ni.k[13]);
-			t2 = _mm_aesenc_si128(t2,k);
-			t3 = _mm_aesenc_si128(t3,k);
-			t4 = _mm_aesenc_si128(t4,k);
-			t1 = _mm_aesenclast_si128(t1,k = _k.ni.k[14]);
-			t2 = _mm_aesenclast_si128(t2,k);
-			t3 = _mm_aesenclast_si128(t3,k);
-			t4 = _mm_aesenclast_si128(t4,k);
+			t1 = _mm_aesenc_si128(t1,k1);
+			t2 = _mm_aesenc_si128(t2,k1);
+			t3 = _mm_aesenc_si128(t3,k1);
+			t4 = _mm_aesenc_si128(t4,k1);
+			t1 = _mm_aesenc_si128(t1,k2);
+			t2 = _mm_aesenc_si128(t2,k2);
+			t3 = _mm_aesenc_si128(t3,k2);
+			t4 = _mm_aesenc_si128(t4,k2);
+			t1 = _mm_aesenc_si128(t1,k3);
+			t2 = _mm_aesenc_si128(t2,k3);
+			t3 = _mm_aesenc_si128(t3,k3);
+			t4 = _mm_aesenc_si128(t4,k3);
+			t1 = _mm_aesenc_si128(t1,k4);
+			t2 = _mm_aesenc_si128(t2,k4);
+			t3 = _mm_aesenc_si128(t3,k4);
+			t4 = _mm_aesenc_si128(t4,k4);
+			t1 = _mm_aesenc_si128(t1,k5);
+			t2 = _mm_aesenc_si128(t2,k5);
+			t3 = _mm_aesenc_si128(t3,k5);
+			t4 = _mm_aesenc_si128(t4,k5);
+			t1 = _mm_aesenc_si128(t1,k6);
+			t2 = _mm_aesenc_si128(t2,k6);
+			t3 = _mm_aesenc_si128(t3,k6);
+			t4 = _mm_aesenc_si128(t4,k6);
+			t1 = _mm_aesenc_si128(t1,k7);
+			t2 = _mm_aesenc_si128(t2,k7);
+			t3 = _mm_aesenc_si128(t3,k7);
+			t4 = _mm_aesenc_si128(t4,k7);
+			t1 = _mm_aesenc_si128(t1,k8);
+			t2 = _mm_aesenc_si128(t2,k8);
+			t3 = _mm_aesenc_si128(t3,k8);
+			t4 = _mm_aesenc_si128(t4,k8);
+			t1 = _mm_aesenc_si128(t1,k9);
+			t2 = _mm_aesenc_si128(t2,k9);
+			t3 = _mm_aesenc_si128(t3,k9);
+			t4 = _mm_aesenc_si128(t4,k9);
+			t1 = _mm_aesenc_si128(t1,k10);
+			t2 = _mm_aesenc_si128(t2,k10);
+			t3 = _mm_aesenc_si128(t3,k10);
+			t4 = _mm_aesenc_si128(t4,k10);
+			t1 = _mm_aesenc_si128(t1,k11);
+			t2 = _mm_aesenc_si128(t2,k11);
+			t3 = _mm_aesenc_si128(t3,k11);
+			t4 = _mm_aesenc_si128(t4,k11);
+			t1 = _mm_aesenc_si128(t1,k12);
+			t2 = _mm_aesenc_si128(t2,k12);
+			t3 = _mm_aesenc_si128(t3,k12);
+			t4 = _mm_aesenc_si128(t4,k12);
+			t1 = _mm_aesenc_si128(t1,k13);
+			t2 = _mm_aesenc_si128(t2,k13);
+			t3 = _mm_aesenc_si128(t3,k13);
+			t4 = _mm_aesenc_si128(t4,k13);
+			t1 = _mm_aesenclast_si128(t1,k14);
+			t2 = _mm_aesenclast_si128(t2,k14);
+			t3 = _mm_aesenclast_si128(t3,k14);
+			t4 = _mm_aesenclast_si128(t4,k14);
 			t1 = _mm_xor_si128(t1,d1);
 			t2 = _mm_xor_si128(t2,d2);
 			t3 = _mm_xor_si128(t3,d3);
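
Note on the hunk above: the change declares the block and temporary variables at first use and hoists the fifteen AES-256 round keys into locals (k0..k14), presumably so the compiler can keep them in xmm registers across the 4-way pipelined CTR loop instead of re-reading `_k.ni.k[...]` (and writing through the old `k = ...` side-effect assignments) on every round. Below is a minimal sketch of the same interleaving pattern, shown with a round loop rather than the fully unrolled form used in the commit. `ctr4_aes256_aesni`, its parameters, and the `k[15]` layout are illustrative assumptions, not members of AES.hpp; counter formatting and the big-endian increment are assumed to be handled by the caller, as in the diff.

	#include <immintrin.h> // AES-NI and SSE2 intrinsics (build with -maes or equivalent)

	// Hypothetical helper: produce four AES-256 CTR keystream blocks in one pass
	// over the round-key schedule. k[0..14] are the expanded round keys; ctr0..ctr3
	// are four already-formatted counter blocks.
	static inline void ctr4_aes256_aesni(const __m128i k[15],
	                                     __m128i ctr0,__m128i ctr1,__m128i ctr2,__m128i ctr3,
	                                     __m128i out[4])
	{
		// Round 0: whiten all four counters with the first round key.
		__m128i t0 = _mm_xor_si128(ctr0,k[0]);
		__m128i t1 = _mm_xor_si128(ctr1,k[0]);
		__m128i t2 = _mm_xor_si128(ctr2,k[0]);
		__m128i t3 = _mm_xor_si128(ctr3,k[0]);

		// Rounds 1..13: interleaving four independent streams keeps the AESENC
		// pipeline busy, since each block's own rounds form a dependency chain.
		for (int r = 1; r < 14; ++r) {
			const __m128i rk = k[r]; // round key read once, reused four times
			t0 = _mm_aesenc_si128(t0,rk);
			t1 = _mm_aesenc_si128(t1,rk);
			t2 = _mm_aesenc_si128(t2,rk);
			t3 = _mm_aesenc_si128(t3,rk);
		}

		// Final round uses AESENCLAST with the last round key.
		out[0] = _mm_aesenclast_si128(t0,k[14]);
		out[1] = _mm_aesenclast_si128(t1,k[14]);
		out[2] = _mm_aesenclast_si128(t2,k[14]);
		out[3] = _mm_aesenclast_si128(t3,k[14]);
	}

The commit keeps the rounds fully unrolled instead of looping, which trades code size for the removal of loop overhead and branch pressure in the hot path; the single-block tail loop in the next hunk applies the same hoisted keys one block at a time.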
@@ -707,23 +719,23 @@ private:
 		}
 
 		for (i=pblocks;i<blocks;i++) {
-			d1 = _mm_loadu_si128(bi + i);
+			__m128i d1 = _mm_loadu_si128(bi + i);
 			y = _ghash_aesni(_k.ni.h,y,d1);
-			t1 = _mm_xor_si128(cb,_k.ni.k[0]);
-			t1 = _mm_aesenc_si128(t1,_k.ni.k[1]);
-			t1 = _mm_aesenc_si128(t1,_k.ni.k[2]);
-			t1 = _mm_aesenc_si128(t1,_k.ni.k[3]);
-			t1 = _mm_aesenc_si128(t1,_k.ni.k[4]);
-			t1 = _mm_aesenc_si128(t1,_k.ni.k[5]);
-			t1 = _mm_aesenc_si128(t1,_k.ni.k[6]);
-			t1 = _mm_aesenc_si128(t1,_k.ni.k[7]);
-			t1 = _mm_aesenc_si128(t1,_k.ni.k[8]);
-			t1 = _mm_aesenc_si128(t1,_k.ni.k[9]);
-			t1 = _mm_aesenc_si128(t1,_k.ni.k[10]);
-			t1 = _mm_aesenc_si128(t1,_k.ni.k[11]);
-			t1 = _mm_aesenc_si128(t1,_k.ni.k[12]);
-			t1 = _mm_aesenc_si128(t1,_k.ni.k[13]);
-			t1 = _mm_aesenclast_si128(t1,_k.ni.k[14]);
+			__m128i t1 = _mm_xor_si128(cb,k0);
+			t1 = _mm_aesenc_si128(t1,k1);
+			t1 = _mm_aesenc_si128(t1,k2);
+			t1 = _mm_aesenc_si128(t1,k3);
+			t1 = _mm_aesenc_si128(t1,k4);
+			t1 = _mm_aesenc_si128(t1,k5);
+			t1 = _mm_aesenc_si128(t1,k6);
+			t1 = _mm_aesenc_si128(t1,k7);
+			t1 = _mm_aesenc_si128(t1,k8);
+			t1 = _mm_aesenc_si128(t1,k9);
+			t1 = _mm_aesenc_si128(t1,k10);
+			t1 = _mm_aesenc_si128(t1,k11);
+			t1 = _mm_aesenc_si128(t1,k12);
+			t1 = _mm_aesenc_si128(t1,k13);
+			t1 = _mm_aesenclast_si128(t1,k14);
 			t1 = _mm_xor_si128(t1,d1);
 			_mm_storeu_si128(bo + i,t1);
 			cb = _increment_be_aesni(cb);