More AES stuff

Adam Ierymenko committed 6 years ago (commit f7bc9f01c9)

1 changed file with 302 additions and 295 deletions:

node/AES.hpp (+302 −295)

@@ -97,6 +97,13 @@ public:
 
 	inline bool gcmDecrypt(const uint8_t iv[12],const void *in,unsigned int inlen,const void *assoc,unsigned int assoclen,void *out,const uint8_t *tag,unsigned int taglen)
 	{
+#ifdef ZT_AES_AESNI
+		if (HW_ACCEL) {
+			uint8_t tagbuf[16];
+			_decrypt_gcm256_aesni(inlen,(const uint8_t *)in,(uint8_t *)out,iv,assoclen,(const uint8_t *)assoc,tagbuf,taglen);
+			return Utils::secureEq(tagbuf,tag,taglen);
+		}
+#endif
 		abort(); // TODO: software
 		return false;
 	}
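
A minimal round-trip check of the new hardware gcmDecrypt() path might look like the sketch below. It assumes a ZeroTier::AES constructor taking a 256-bit key and a gcmEncrypt() counterpart with the mirrored parameter layout; neither appears in this hunk, so treat both as assumptions rather than the class's confirmed API.

	#include <cstdint>
	#include <cstring>
	#include <cstdio>
	#include "node/AES.hpp"

	int main()
	{
		// Hypothetical round-trip: encrypt, then verify the tag via gcmDecrypt().
		// The key size, constructor and gcmEncrypt() signature are assumed, not shown in this diff.
		uint8_t key[32] = {0},iv[12] = {0},tag[16];
		const uint8_t msg[16] = {1,2,3,4};
		uint8_t ct[16],pt[16];

		ZeroTier::AES aes(key);
		aes.gcmEncrypt(iv,msg,sizeof(msg),nullptr,0,ct,tag,16);               // assumed counterpart
		const bool ok = aes.gcmDecrypt(iv,ct,sizeof(ct),nullptr,0,pt,tag,16); // new AES-NI path above
		printf("auth %s, plaintext match %s\n",ok ? "ok" : "failed",
		       (memcmp(pt,msg,sizeof(msg)) == 0) ? "yes" : "no");
		return 0;
	}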
@@ -217,41 +224,41 @@ private:
 	{
 		__m128i t1,t2,t3,t4,t5,t6;
 		y = _swap128_aesni(y);
-		t1 = _mm_clmulepi64_si128(h, y, 0x00);
-		t2 = _mm_clmulepi64_si128(h, y, 0x01);
-		t3 = _mm_clmulepi64_si128(h, y, 0x10);
-		t4 = _mm_clmulepi64_si128(h, y, 0x11);
-		t2 = _mm_xor_si128(t2, t3);
-		t3 = _mm_slli_si128(t2, 8);
-		t2 = _mm_srli_si128(t2, 8);
-		t1 = _mm_xor_si128(t1, t3);
-		t4 = _mm_xor_si128(t4, t2);
-		t5 = _mm_srli_epi32(t1, 31);
-		t1 = _mm_slli_epi32(t1, 1);
-		t6 = _mm_srli_epi32(t4, 31);
-		t4 = _mm_slli_epi32(t4, 1);
-		t3 = _mm_srli_si128(t5, 12);
-		t6 = _mm_slli_si128(t6, 4);
-		t5 = _mm_slli_si128(t5, 4);
-		t1 = _mm_or_si128(t1, t5);
-		t4 = _mm_or_si128(t4, t6);
-		t4 = _mm_or_si128(t4, t3);
-		t5 = _mm_slli_epi32(t1, 31);
-		t6 = _mm_slli_epi32(t1, 30);
-		t3 = _mm_slli_epi32(t1, 25);
-		t5 = _mm_xor_si128(t5, t6);
-		t5 = _mm_xor_si128(t5, t3);
-		t6 = _mm_srli_si128(t5, 4);
-		t4 = _mm_xor_si128(t4, t6);
-		t5 = _mm_slli_si128(t5, 12);
-		t1 = _mm_xor_si128(t1, t5);
-		t4 = _mm_xor_si128(t4, t1);
-		t5 = _mm_srli_epi32(t1, 1);
-		t2 = _mm_srli_epi32(t1, 2);
-		t3 = _mm_srli_epi32(t1, 7);
-		t4 = _mm_xor_si128(t4, t2);
-		t4 = _mm_xor_si128(t4, t3);
-		t4 = _mm_xor_si128(t4, t5);
+		t1 = _mm_clmulepi64_si128(h,y,0x00);
+		t2 = _mm_clmulepi64_si128(h,y,0x01);
+		t3 = _mm_clmulepi64_si128(h,y,0x10);
+		t4 = _mm_clmulepi64_si128(h,y,0x11);
+		t2 = _mm_xor_si128(t2,t3);
+		t3 = _mm_slli_si128(t2,8);
+		t2 = _mm_srli_si128(t2,8);
+		t1 = _mm_xor_si128(t1,t3);
+		t4 = _mm_xor_si128(t4,t2);
+		t5 = _mm_srli_epi32(t1,31);
+		t1 = _mm_slli_epi32(t1,1);
+		t6 = _mm_srli_epi32(t4,31);
+		t4 = _mm_slli_epi32(t4,1);
+		t3 = _mm_srli_si128(t5,12);
+		t6 = _mm_slli_si128(t6,4);
+		t5 = _mm_slli_si128(t5,4);
+		t1 = _mm_or_si128(t1,t5);
+		t4 = _mm_or_si128(t4,t6);
+		t4 = _mm_or_si128(t4,t3);
+		t5 = _mm_slli_epi32(t1,31);
+		t6 = _mm_slli_epi32(t1,30);
+		t3 = _mm_slli_epi32(t1,25);
+		t5 = _mm_xor_si128(t5,t6);
+		t5 = _mm_xor_si128(t5,t3);
+		t6 = _mm_srli_si128(t5,4);
+		t4 = _mm_xor_si128(t4,t6);
+		t5 = _mm_slli_si128(t5,12);
+		t1 = _mm_xor_si128(t1,t5);
+		t4 = _mm_xor_si128(t4,t1);
+		t5 = _mm_srli_epi32(t1,1);
+		t2 = _mm_srli_epi32(t1,2);
+		t3 = _mm_srli_epi32(t1,7);
+		t4 = _mm_xor_si128(t4,t2);
+		t4 = _mm_xor_si128(t4,t3);
+		t4 = _mm_xor_si128(t4,t5);
 		return _swap128_aesni(t4);
 	}
 	static inline __m128i _mult4xor_aesni(__m128i h1,__m128i h2,__m128i h3,__m128i h4,__m128i d1,__m128i d2,__m128i d3,__m128i d4)
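
The _mm_clmulepi64_si128 / shift / XOR sequence in _mult_block_aesni is a carry-less multiplication of two 128-bit values followed by reduction modulo the GHASH polynomial x^128 + x^7 + x^2 + x + 1. As a reference only, a plain scalar equivalent (the right-shift formulation from NIST SP 800-38D, unoptimized and not audited for constant time) is sketched below; something along these lines could back the software path still marked TODO in gcmDecrypt(). Unlike the intrinsic version it works directly on the 16-byte blocks as GHASH defines them, so no byte swap is needed.

	#include <cstdint>
	#include <cstring>

	// Reference GHASH multiply: out = x * h in GF(2^128), MSB-first bit order,
	// reduced by x^128 + x^7 + x^2 + x + 1 (R = 0xe1 followed by 15 zero bytes).
	static inline void ghashMultiplyRef(const uint8_t x[16],const uint8_t h[16],uint8_t out[16])
	{
		uint8_t z[16] = {0};
		uint8_t v[16];
		memcpy(v,h,16);
		for (int i=0;i<128;++i) {
			if ((x[i >> 3] >> (7 - (i & 7))) & 1) {          // bit i of x, MSB first
				for (int j=0;j<16;++j) z[j] ^= v[j];
			}
			const uint8_t lsb = v[15] & 1;                   // lowest bit of v before shifting
			for (int j=15;j>0;--j)                           // v >>= 1 across the 16 bytes
				v[j] = (uint8_t)((v[j] >> 1) | (v[j-1] << 7));
			v[0] >>= 1;
			if (lsb) v[0] ^= 0xe1;                           // conditional reduction by R
		}
		memcpy(out,z,16);
	}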
@@ -261,75 +268,75 @@ private:
 		d2 = _swap128_aesni(d2);
 		d3 = _swap128_aesni(d3);
 		d4 = _swap128_aesni(d4);
-		t0 = _mm_clmulepi64_si128(h1, d1, 0x00);
-		t1 = _mm_clmulepi64_si128(h2, d2, 0x00);
-		t2 = _mm_clmulepi64_si128(h3, d3, 0x00);
-		t3 = _mm_clmulepi64_si128(h4, d4, 0x00);
-		t8 = _mm_xor_si128(t0, t1);
-		t8 = _mm_xor_si128(t8, t2);
-		t8 = _mm_xor_si128(t8, t3);
-		t4 = _mm_clmulepi64_si128(h1, d1, 0x11);
-		t5 = _mm_clmulepi64_si128(h2, d2, 0x11);
-		t6 = _mm_clmulepi64_si128(h3, d3, 0x11);
-		t7 = _mm_clmulepi64_si128(h4, d4, 0x11);
-		t9 = _mm_xor_si128(t4, t5);
-		t9 = _mm_xor_si128(t9, t6);
-		t9 = _mm_xor_si128(t9, t7);
-		t0 = _mm_shuffle_epi32(h1, 78);
-		t4 = _mm_shuffle_epi32(d1, 78);
-		t0 = _mm_xor_si128(t0, h1);
-		t4 = _mm_xor_si128(t4, d1);
-		t1 = _mm_shuffle_epi32(h2, 78);
-		t5 = _mm_shuffle_epi32(d2, 78);
-		t1 = _mm_xor_si128(t1, h2);
-		t5 = _mm_xor_si128(t5, d2);
-		t2 = _mm_shuffle_epi32(h3, 78);
-		t6 = _mm_shuffle_epi32(d3, 78);
-		t2 = _mm_xor_si128(t2, h3);
-		t6 = _mm_xor_si128(t6, d3);
-		t3 = _mm_shuffle_epi32(h4, 78);
-		t7 = _mm_shuffle_epi32(d4, 78);
-		t3 = _mm_xor_si128(t3, h4);
-		t7 = _mm_xor_si128(t7, d4);
-		t0 = _mm_clmulepi64_si128(t0, t4, 0x00);
-		t1 = _mm_clmulepi64_si128(t1, t5, 0x00);
-		t2 = _mm_clmulepi64_si128(t2, t6, 0x00);
-		t3 = _mm_clmulepi64_si128(t3, t7, 0x00);
-		t0 = _mm_xor_si128(t0, t8);
-		t0 = _mm_xor_si128(t0, t9);
-		t0 = _mm_xor_si128(t1, t0);
-		t0 = _mm_xor_si128(t2, t0);
-		t0 = _mm_xor_si128(t3, t0);
-		t4 = _mm_slli_si128(t0, 8);
-		t0 = _mm_srli_si128(t0, 8);
-		t3 = _mm_xor_si128(t4, t8);
-		t6 = _mm_xor_si128(t0, t9);
-		t7 = _mm_srli_epi32(t3, 31);
-		t8 = _mm_srli_epi32(t6, 31);
-		t3 = _mm_slli_epi32(t3, 1);
-		t6 = _mm_slli_epi32(t6, 1);
-		t9 = _mm_srli_si128(t7, 12);
-		t8 = _mm_slli_si128(t8, 4);
-		t7 = _mm_slli_si128(t7, 4);
-		t3 = _mm_or_si128(t3, t7);
-		t6 = _mm_or_si128(t6, t8);
-		t6 = _mm_or_si128(t6, t9);
-		t7 = _mm_slli_epi32(t3, 31);
-		t8 = _mm_slli_epi32(t3, 30);
-		t9 = _mm_slli_epi32(t3, 25);
-		t7 = _mm_xor_si128(t7, t8);
-		t7 = _mm_xor_si128(t7, t9);
-		t8 = _mm_srli_si128(t7, 4);
-		t7 = _mm_slli_si128(t7, 12);
-		t3 = _mm_xor_si128(t3, t7);
-		t2 = _mm_srli_epi32(t3, 1);
-		t4 = _mm_srli_epi32(t3, 2);
-		t5 = _mm_srli_epi32(t3, 7);
-		t2 = _mm_xor_si128(t2, t4);
-		t2 = _mm_xor_si128(t2, t5);
-		t2 = _mm_xor_si128(t2, t8);
-		t3 = _mm_xor_si128(t3, t2);
-		t6 = _mm_xor_si128(t6, t3);
+		t0 = _mm_clmulepi64_si128(h1,d1,0x00);
+		t1 = _mm_clmulepi64_si128(h2,d2,0x00);
+		t2 = _mm_clmulepi64_si128(h3,d3,0x00);
+		t3 = _mm_clmulepi64_si128(h4,d4,0x00);
+		t8 = _mm_xor_si128(t0,t1);
+		t8 = _mm_xor_si128(t8,t2);
+		t8 = _mm_xor_si128(t8,t3);
+		t4 = _mm_clmulepi64_si128(h1,d1,0x11);
+		t5 = _mm_clmulepi64_si128(h2,d2,0x11);
+		t6 = _mm_clmulepi64_si128(h3,d3,0x11);
+		t7 = _mm_clmulepi64_si128(h4,d4,0x11);
+		t9 = _mm_xor_si128(t4,t5);
+		t9 = _mm_xor_si128(t9,t6);
+		t9 = _mm_xor_si128(t9,t7);
+		t0 = _mm_shuffle_epi32(h1,78);
+		t4 = _mm_shuffle_epi32(d1,78);
+		t0 = _mm_xor_si128(t0,h1);
+		t4 = _mm_xor_si128(t4,d1);
+		t1 = _mm_shuffle_epi32(h2,78);
+		t5 = _mm_shuffle_epi32(d2,78);
+		t1 = _mm_xor_si128(t1,h2);
+		t5 = _mm_xor_si128(t5,d2);
+		t2 = _mm_shuffle_epi32(h3,78);
+		t6 = _mm_shuffle_epi32(d3,78);
+		t2 = _mm_xor_si128(t2,h3);
+		t6 = _mm_xor_si128(t6,d3);
+		t3 = _mm_shuffle_epi32(h4,78);
+		t7 = _mm_shuffle_epi32(d4,78);
+		t3 = _mm_xor_si128(t3,h4);
+		t7 = _mm_xor_si128(t7,d4);
+		t0 = _mm_clmulepi64_si128(t0,t4,0x00);
+		t1 = _mm_clmulepi64_si128(t1,t5,0x00);
+		t2 = _mm_clmulepi64_si128(t2,t6,0x00);
+		t3 = _mm_clmulepi64_si128(t3,t7,0x00);
+		t0 = _mm_xor_si128(t0,t8);
+		t0 = _mm_xor_si128(t0,t9);
+		t0 = _mm_xor_si128(t1,t0);
+		t0 = _mm_xor_si128(t2,t0);
+		t0 = _mm_xor_si128(t3,t0);
+		t4 = _mm_slli_si128(t0,8);
+		t0 = _mm_srli_si128(t0,8);
+		t3 = _mm_xor_si128(t4,t8);
+		t6 = _mm_xor_si128(t0,t9);
+		t7 = _mm_srli_epi32(t3,31);
+		t8 = _mm_srli_epi32(t6,31);
+		t3 = _mm_slli_epi32(t3,1);
+		t6 = _mm_slli_epi32(t6,1);
+		t9 = _mm_srli_si128(t7,12);
+		t8 = _mm_slli_si128(t8,4);
+		t7 = _mm_slli_si128(t7,4);
+		t3 = _mm_or_si128(t3,t7);
+		t6 = _mm_or_si128(t6,t8);
+		t6 = _mm_or_si128(t6,t9);
+		t7 = _mm_slli_epi32(t3,31);
+		t8 = _mm_slli_epi32(t3,30);
+		t9 = _mm_slli_epi32(t3,25);
+		t7 = _mm_xor_si128(t7,t8);
+		t7 = _mm_xor_si128(t7,t9);
+		t8 = _mm_srli_si128(t7,4);
+		t7 = _mm_slli_si128(t7,12);
+		t3 = _mm_xor_si128(t3,t7);
+		t2 = _mm_srli_epi32(t3,1);
+		t4 = _mm_srli_epi32(t3,2);
+		t5 = _mm_srli_epi32(t3,7);
+		t2 = _mm_xor_si128(t2,t4);
+		t2 = _mm_xor_si128(t2,t5);
+		t2 = _mm_xor_si128(t2,t8);
+		t3 = _mm_xor_si128(t3,t2);
+		t6 = _mm_xor_si128(t6,t3);
 		return _swap128_aesni(t6);
 	}
 	static inline __m128i _ghash_aesni(__m128i h,__m128i y,__m128i x) { return _mult_block_aesni(h,_mm_xor_si128(y,x)); }
@@ -375,13 +382,13 @@ private:
 			d3 = _mm_loadu_si128(ab + i + 2);
 			d4 = _mm_loadu_si128(ab + i + 3);
 			y = _mm_xor_si128(y, d1);
-			y = _mult4xor_aesni(h1, h2, h3, h4, y, d2, d3, d4);
+			y = _mult4xor_aesni(h1,h2,h3,h4,y,d2,d3,d4);
 		}
 		for (i = pblocks; i < blocks; i++)
 			y = _ghash_aesni(_k.ni.h,y,_mm_loadu_si128(ab + i));
 		if (rem) {
 			last = _mm_setzero_si128();
-			memcpy(&last, ab + blocks, rem);
+			memcpy(&last,ab + blocks,rem);
 			y = _ghash_aesni(_k.ni.h,y,last);
 		}
 		return y;
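
The loop structure above (four blocks at a time through _mult4xor_aesni with H^4..H, then single blocks, then a zero-padded remainder) is the standard way GHASH walks a buffer. An unoptimized scalar equivalent over an arbitrary byte buffer is sketched below, assuming the hypothetical ghashMultiplyRef() from the earlier sketch; it processes blocks sequentially, which yields the same result as the aggregated four-block form.

	#include <cstdint>
	#include <cstring>

	// Scalar GHASH over a byte buffer: full 16-byte blocks, then a zero-padded tail.
	// Relies on ghashMultiplyRef() from the previous sketch (an assumed helper).
	static inline void ghashBufRef(const uint8_t h[16],uint8_t y[16],const uint8_t *buf,unsigned int len)
	{
		uint8_t blk[16];
		while (len >= 16) {
			for (int i=0;i<16;++i) blk[i] = (uint8_t)(y[i] ^ buf[i]); // Y ^= block
			ghashMultiplyRef(blk,h,y);                                // Y = (Y ^ block) * H
			buf += 16;
			len -= 16;
		}
		if (len) {
			memset(blk,0,16);
			memcpy(blk,buf,len);                                      // zero-pad the remainder
			for (int i=0;i<16;++i) blk[i] ^= y[i];
			ghashMultiplyRef(blk,h,y);
		}
	}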
@@ -395,7 +402,7 @@ private:
 	}
 	inline void _icv_crypt_aesni(__m128i y,__m128i j,uint8_t *icv,unsigned int icvsize) const
 	{
-		__m128i *ks,t,b;
+		__m128i t,b;
 		t = _mm_xor_si128(j,_k.ni.k[0]);
 		t = _mm_aesenc_si128(t,_k.ni.k[1]);
 		t = _mm_aesenc_si128(t,_k.ni.k[2]);
@@ -418,7 +425,7 @@ private:
 
 	inline __m128i _encrypt_gcm_rem_aesni(unsigned int rem,const void *in,void *out,__m128i cb,__m128i y) const
 	{
-		__m128i *ks,t,b;
+		__m128i t,b;
 		memset(&b,0,sizeof(b));
 		memcpy(&b,in,rem);
 		t = _mm_xor_si128(cb,_k.ni.k[0]);
@@ -436,15 +443,15 @@ private:
 		t = _mm_aesenc_si128(t,_k.ni.k[12]);
 		t = _mm_aesenc_si128(t,_k.ni.k[13]);
 		t = _mm_aesenclast_si128(t,_k.ni.k[14]);
-		b = _mm_xor_si128(t, b);
+		b = _mm_xor_si128(t,b);
 		memcpy(out,&b,rem);
 		memset((u_char*)&b + rem,0,16 - rem);
 		return _ghash_aesni(_k.ni.h,y,b);
 	}
 	inline void _encrypt_gcm256_aesni(unsigned int len,const uint8_t *in,uint8_t *out,const uint8_t *iv,unsigned int alen,const uint8_t *assoc,uint8_t *icv,unsigned int icvsize) const
 	{
-		__m128i d1,d2,d3,d4,t1,t2,t3,t4;
-		__m128i *ks,y,j,cb,*bi,*bo;
+		__m128i d1,d2,d3,d4,t1,t2,t3,t4,k;
+		__m128i y,j,cb,*bi,*bo;
 
 		j = _create_j_aesni(iv);
 		cb = _increment_be_aesni(j);
@@ -461,102 +468,102 @@ private:
 			d2 = _mm_loadu_si128(bi + i + 1);
 			d3 = _mm_loadu_si128(bi + i + 2);
 			d4 = _mm_loadu_si128(bi + i + 3);
-			t1 = _mm_xor_si128(cb, _k.ni.k[0]);
+			t1 = _mm_xor_si128(cb,k = _k.ni.k[0]);
 			cb = _increment_be_aesni(cb);
-			t2 = _mm_xor_si128(cb, _k.ni.k[0]);
+			t2 = _mm_xor_si128(cb,k);
 			cb = _increment_be_aesni(cb);
-			t3 = _mm_xor_si128(cb, _k.ni.k[0]);
+			t3 = _mm_xor_si128(cb,k);
 			cb = _increment_be_aesni(cb);
-			t4 = _mm_xor_si128(cb, _k.ni.k[0]);
+			t4 = _mm_xor_si128(cb,k);
 			cb = _increment_be_aesni(cb);
-			t1 = _mm_aesenc_si128(t1, _k.ni.k[1]);
-			t2 = _mm_aesenc_si128(t2, _k.ni.k[1]);
-			t3 = _mm_aesenc_si128(t3, _k.ni.k[1]);
-			t4 = _mm_aesenc_si128(t4, _k.ni.k[1]);
-			t1 = _mm_aesenc_si128(t1, _k.ni.k[2]);
-			t2 = _mm_aesenc_si128(t2, _k.ni.k[2]);
-			t3 = _mm_aesenc_si128(t3, _k.ni.k[2]);
-			t4 = _mm_aesenc_si128(t4, _k.ni.k[2]);
-			t1 = _mm_aesenc_si128(t1, _k.ni.k[3]);
-			t2 = _mm_aesenc_si128(t2, _k.ni.k[3]);
-			t3 = _mm_aesenc_si128(t3, _k.ni.k[3]);
-			t4 = _mm_aesenc_si128(t4, _k.ni.k[3]);
-			t1 = _mm_aesenc_si128(t1, _k.ni.k[4]);
-			t2 = _mm_aesenc_si128(t2, _k.ni.k[4]);
-			t3 = _mm_aesenc_si128(t3, _k.ni.k[4]);
-			t4 = _mm_aesenc_si128(t4, _k.ni.k[4]);
-			t1 = _mm_aesenc_si128(t1, _k.ni.k[5]);
-			t2 = _mm_aesenc_si128(t2, _k.ni.k[5]);
-			t3 = _mm_aesenc_si128(t3, _k.ni.k[5]);
-			t4 = _mm_aesenc_si128(t4, _k.ni.k[5]);
-			t1 = _mm_aesenc_si128(t1, _k.ni.k[6]);
-			t2 = _mm_aesenc_si128(t2, _k.ni.k[6]);
-			t3 = _mm_aesenc_si128(t3, _k.ni.k[6]);
-			t4 = _mm_aesenc_si128(t4, _k.ni.k[6]);
-			t1 = _mm_aesenc_si128(t1, _k.ni.k[7]);
-			t2 = _mm_aesenc_si128(t2, _k.ni.k[7]);
-			t3 = _mm_aesenc_si128(t3, _k.ni.k[7]);
-			t4 = _mm_aesenc_si128(t4, _k.ni.k[7]);
-			t1 = _mm_aesenc_si128(t1, _k.ni.k[8]);
-			t2 = _mm_aesenc_si128(t2, _k.ni.k[8]);
-			t3 = _mm_aesenc_si128(t3, _k.ni.k[8]);
-			t4 = _mm_aesenc_si128(t4, _k.ni.k[8]);
-			t1 = _mm_aesenc_si128(t1, _k.ni.k[9]);
-			t2 = _mm_aesenc_si128(t2, _k.ni.k[9]);
-			t3 = _mm_aesenc_si128(t3, _k.ni.k[9]);
-			t4 = _mm_aesenc_si128(t4, _k.ni.k[9]);
-			t1 = _mm_aesenc_si128(t1, _k.ni.k[10]);
-			t2 = _mm_aesenc_si128(t2, _k.ni.k[10]);
-			t3 = _mm_aesenc_si128(t3, _k.ni.k[10]);
-			t4 = _mm_aesenc_si128(t4, _k.ni.k[10]);
-			t1 = _mm_aesenc_si128(t1, _k.ni.k[11]);
-			t2 = _mm_aesenc_si128(t2, _k.ni.k[11]);
-			t3 = _mm_aesenc_si128(t3, _k.ni.k[11]);
-			t4 = _mm_aesenc_si128(t4, _k.ni.k[11]);
-			t1 = _mm_aesenc_si128(t1, _k.ni.k[12]);
-			t2 = _mm_aesenc_si128(t2, _k.ni.k[12]);
-			t3 = _mm_aesenc_si128(t3, _k.ni.k[12]);
-			t4 = _mm_aesenc_si128(t4, _k.ni.k[12]);
-			t1 = _mm_aesenc_si128(t1, _k.ni.k[13]);
-			t2 = _mm_aesenc_si128(t2, _k.ni.k[13]);
-			t3 = _mm_aesenc_si128(t3, _k.ni.k[13]);
-			t4 = _mm_aesenc_si128(t4, _k.ni.k[13]);
-			t1 = _mm_aesenclast_si128(t1, _k.ni.k[14]);
-			t2 = _mm_aesenclast_si128(t2, _k.ni.k[14]);
-			t3 = _mm_aesenclast_si128(t3, _k.ni.k[14]);
-			t4 = _mm_aesenclast_si128(t4, _k.ni.k[14]);
-			t1 = _mm_xor_si128(t1, d1);
-			t2 = _mm_xor_si128(t2, d2);
-			t3 = _mm_xor_si128(t3, d3);
-			t4 = _mm_xor_si128(t4, d4);
-			y = _mm_xor_si128(y, t1);
+			t1 = _mm_aesenc_si128(t1,k = _k.ni.k[1]);
+			t2 = _mm_aesenc_si128(t2,k);
+			t3 = _mm_aesenc_si128(t3,k);
+			t4 = _mm_aesenc_si128(t4,k);
+			t1 = _mm_aesenc_si128(t1,k = _k.ni.k[2]);
+			t2 = _mm_aesenc_si128(t2,k);
+			t3 = _mm_aesenc_si128(t3,k);
+			t4 = _mm_aesenc_si128(t4,k);
+			t1 = _mm_aesenc_si128(t1,k = _k.ni.k[3]);
+			t2 = _mm_aesenc_si128(t2,k);
+			t3 = _mm_aesenc_si128(t3,k);
+			t4 = _mm_aesenc_si128(t4,k);
+			t1 = _mm_aesenc_si128(t1,k = _k.ni.k[4]);
+			t2 = _mm_aesenc_si128(t2,k);
+			t3 = _mm_aesenc_si128(t3,k);
+			t4 = _mm_aesenc_si128(t4,k);
+			t1 = _mm_aesenc_si128(t1,k = _k.ni.k[5]);
+			t2 = _mm_aesenc_si128(t2,k);
+			t3 = _mm_aesenc_si128(t3,k);
+			t4 = _mm_aesenc_si128(t4,k);
+			t1 = _mm_aesenc_si128(t1,k = _k.ni.k[6]);
+			t2 = _mm_aesenc_si128(t2,k);
+			t3 = _mm_aesenc_si128(t3,k);
+			t4 = _mm_aesenc_si128(t4,k);
+			t1 = _mm_aesenc_si128(t1,k = _k.ni.k[7]);
+			t2 = _mm_aesenc_si128(t2,k);
+			t3 = _mm_aesenc_si128(t3,k);
+			t4 = _mm_aesenc_si128(t4,k);
+			t1 = _mm_aesenc_si128(t1,k = _k.ni.k[8]);
+			t2 = _mm_aesenc_si128(t2,k);
+			t3 = _mm_aesenc_si128(t3,k);
+			t4 = _mm_aesenc_si128(t4,k);
+			t1 = _mm_aesenc_si128(t1,k = _k.ni.k[9]);
+			t2 = _mm_aesenc_si128(t2,k);
+			t3 = _mm_aesenc_si128(t3,k);
+			t4 = _mm_aesenc_si128(t4,k);
+			t1 = _mm_aesenc_si128(t1,k = _k.ni.k[10]);
+			t2 = _mm_aesenc_si128(t2,k);
+			t3 = _mm_aesenc_si128(t3,k);
+			t4 = _mm_aesenc_si128(t4,k);
+			t1 = _mm_aesenc_si128(t1,k = _k.ni.k[11]);
+			t2 = _mm_aesenc_si128(t2,k);
+			t3 = _mm_aesenc_si128(t3,k);
+			t4 = _mm_aesenc_si128(t4,k);
+			t1 = _mm_aesenc_si128(t1,k = _k.ni.k[12]);
+			t2 = _mm_aesenc_si128(t2,k);
+			t3 = _mm_aesenc_si128(t3,k);
+			t4 = _mm_aesenc_si128(t4,k);
+			t1 = _mm_aesenc_si128(t1,k = _k.ni.k[13]);
+			t2 = _mm_aesenc_si128(t2,k);
+			t3 = _mm_aesenc_si128(t3,k);
+			t4 = _mm_aesenc_si128(t4,k);
+			t1 = _mm_aesenclast_si128(t1,k = _k.ni.k[14]);
+			t2 = _mm_aesenclast_si128(t2,k);
+			t3 = _mm_aesenclast_si128(t3,k);
+			t4 = _mm_aesenclast_si128(t4,k);
+			t1 = _mm_xor_si128(t1,d1);
+			t2 = _mm_xor_si128(t2,d2);
+			t3 = _mm_xor_si128(t3,d3);
+			t4 = _mm_xor_si128(t4,d4);
+			y = _mm_xor_si128(y,t1);
 			y = _mult4xor_aesni(_k.ni.hhhh,_k.ni.hhh,_k.ni.hh,_k.ni.h,y,t2,t3,t4);
-			_mm_storeu_si128(bo + i + 0, t1);
-			_mm_storeu_si128(bo + i + 1, t2);
-			_mm_storeu_si128(bo + i + 2, t3);
-			_mm_storeu_si128(bo + i + 3, t4);
+			_mm_storeu_si128(bo + i + 0,t1);
+			_mm_storeu_si128(bo + i + 1,t2);
+			_mm_storeu_si128(bo + i + 2,t3);
+			_mm_storeu_si128(bo + i + 3,t4);
 		}
 
 		for (i=pblocks;i<blocks;++i) {
 			d1 = _mm_loadu_si128(bi + i);
-			t1 = _mm_xor_si128(cb, _k.ni.k[0]);
-			t1 = _mm_aesenc_si128(t1, _k.ni.k[1]);
-			t1 = _mm_aesenc_si128(t1, _k.ni.k[2]);
-			t1 = _mm_aesenc_si128(t1, _k.ni.k[3]);
-			t1 = _mm_aesenc_si128(t1, _k.ni.k[4]);
-			t1 = _mm_aesenc_si128(t1, _k.ni.k[5]);
-			t1 = _mm_aesenc_si128(t1, _k.ni.k[6]);
-			t1 = _mm_aesenc_si128(t1, _k.ni.k[7]);
-			t1 = _mm_aesenc_si128(t1, _k.ni.k[8]);
-			t1 = _mm_aesenc_si128(t1, _k.ni.k[9]);
-			t1 = _mm_aesenc_si128(t1, _k.ni.k[10]);
-			t1 = _mm_aesenc_si128(t1, _k.ni.k[11]);
-			t1 = _mm_aesenc_si128(t1, _k.ni.k[12]);
-			t1 = _mm_aesenc_si128(t1, _k.ni.k[13]);
-			t1 = _mm_aesenclast_si128(t1, _k.ni.k[14]);
-			t1 = _mm_xor_si128(t1, d1);
-			_mm_storeu_si128(bo + i, t1);
-			y = _ghash_aesni(_k.ni.h, y, t1);
+			t1 = _mm_xor_si128(cb,_k.ni.k[0]);
+			t1 = _mm_aesenc_si128(t1,_k.ni.k[1]);
+			t1 = _mm_aesenc_si128(t1,_k.ni.k[2]);
+			t1 = _mm_aesenc_si128(t1,_k.ni.k[3]);
+			t1 = _mm_aesenc_si128(t1,_k.ni.k[4]);
+			t1 = _mm_aesenc_si128(t1,_k.ni.k[5]);
+			t1 = _mm_aesenc_si128(t1,_k.ni.k[6]);
+			t1 = _mm_aesenc_si128(t1,_k.ni.k[7]);
+			t1 = _mm_aesenc_si128(t1,_k.ni.k[8]);
+			t1 = _mm_aesenc_si128(t1,_k.ni.k[9]);
+			t1 = _mm_aesenc_si128(t1,_k.ni.k[10]);
+			t1 = _mm_aesenc_si128(t1,_k.ni.k[11]);
+			t1 = _mm_aesenc_si128(t1,_k.ni.k[12]);
+			t1 = _mm_aesenc_si128(t1,_k.ni.k[13]);
+			t1 = _mm_aesenclast_si128(t1,_k.ni.k[14]);
+			t1 = _mm_xor_si128(t1,d1);
+			_mm_storeu_si128(bo + i,t1);
+			y = _ghash_aesni(_k.ni.h,y,t1);
 			cb = _increment_be_aesni(cb);
 		}
 
@@ -567,10 +574,10 @@ private:
 	}
 	inline __m128i _decrypt_gcm_rem_aesni(unsigned int rem,const void *in,void *out,__m128i cb,__m128i y)
 	{
-		__m128i *ks, t, b;
-		memset(&b, 0, sizeof(b));
-		memcpy(&b, in, rem);
-		y = _ghash_aesni(_k.ni.h, y, b);
+		__m128i t,b;
+		memset(&b,0,sizeof(b));
+		memcpy(&b,in,rem);
+		y = _ghash_aesni(_k.ni.h,y,b);
 		t = _mm_xor_si128(cb,_k.ni.k[0]);
 		t = _mm_aesenc_si128(t,_k.ni.k[1]);
 		t = _mm_aesenc_si128(t,_k.ni.k[2]);
@@ -585,16 +592,16 @@ private:
 		t = _mm_aesenc_si128(t,_k.ni.k[11]);
 		t = _mm_aesenc_si128(t,_k.ni.k[12]);
 		t = _mm_aesenc_si128(t,_k.ni.k[13]);
-		t = _mm_aesenclast_si128(t, _k.ni.k[14]);
-		b = _mm_xor_si128(t, b);
-		memcpy(out, &b, rem);
+		t = _mm_aesenclast_si128(t,_k.ni.k[14]);
+		b = _mm_xor_si128(t,b);
+		memcpy(out,&b,rem);
 		return y;
 	}
-	inline void decrypt_gcm256(unsigned int len,const uint8_t *in,uint8_t *out,const uint8_t *iv,unsigned int alen,const uint8_t *assoc,uint8_t *icv,unsigned int icvsize)
+	inline void _decrypt_gcm256_aesni(unsigned int len,const uint8_t *in,uint8_t *out,const uint8_t *iv,unsigned int alen,const uint8_t *assoc,uint8_t *icv,unsigned int icvsize)
 	{
-		__m128i d1, d2, d3, d4, t1, t2, t3, t4;
-		__m128i *ks, y, j, cb, *bi, *bo;
-		unsigned int blocks, pblocks, rem;
+		__m128i d1,d2,d3,d4,t1,t2,t3,t4,k;
+		__m128i y,j,cb,*bi,*bo;
+		unsigned int blocks,pblocks,rem;
 
 		j = _create_j_aesni(iv);
 		cb = _increment_be_aesni(j);
@@ -611,102 +618,102 @@ private:
 			d2 = _mm_loadu_si128(bi + i + 1);
 			d3 = _mm_loadu_si128(bi + i + 2);
 			d4 = _mm_loadu_si128(bi + i + 3);
-			y = _mm_xor_si128(y, d1);
+			y = _mm_xor_si128(y,d1);
 			y = _mult4xor_aesni(_k.ni.hhhh,_k.ni.hhh,_k.ni.hh,_k.ni.h,y,d2,d3,d4);
-			t1 = _mm_xor_si128(cb, _k.ni.k[0]);
+			t1 = _mm_xor_si128(cb,k = _k.ni.k[0]);
 			cb = _increment_be_aesni(cb);
-			t2 = _mm_xor_si128(cb, _k.ni.k[0]);
+			t2 = _mm_xor_si128(cb,k);
 			cb = _increment_be_aesni(cb);
-			t3 = _mm_xor_si128(cb, _k.ni.k[0]);
+			t3 = _mm_xor_si128(cb,k);
 			cb = _increment_be_aesni(cb);
-			t4 = _mm_xor_si128(cb, _k.ni.k[0]);
+			t4 = _mm_xor_si128(cb,k);
 			cb = _increment_be_aesni(cb);
-			t1 = _mm_aesenc_si128(t1, _k.ni.k[1]);
-			t2 = _mm_aesenc_si128(t2, _k.ni.k[1]);
-			t3 = _mm_aesenc_si128(t3, _k.ni.k[1]);
-			t4 = _mm_aesenc_si128(t4, _k.ni.k[1]);
-			t1 = _mm_aesenc_si128(t1, _k.ni.k[2]);
-			t2 = _mm_aesenc_si128(t2, _k.ni.k[2]);
-			t3 = _mm_aesenc_si128(t3, _k.ni.k[2]);
-			t4 = _mm_aesenc_si128(t4, _k.ni.k[2]);
-			t1 = _mm_aesenc_si128(t1, _k.ni.k[3]);
-			t2 = _mm_aesenc_si128(t2, _k.ni.k[3]);
-			t3 = _mm_aesenc_si128(t3, _k.ni.k[3]);
-			t4 = _mm_aesenc_si128(t4, _k.ni.k[3]);
-			t1 = _mm_aesenc_si128(t1, _k.ni.k[4]);
-			t2 = _mm_aesenc_si128(t2, _k.ni.k[4]);
-			t3 = _mm_aesenc_si128(t3, _k.ni.k[4]);
-			t4 = _mm_aesenc_si128(t4, _k.ni.k[4]);
-			t1 = _mm_aesenc_si128(t1, _k.ni.k[5]);
-			t2 = _mm_aesenc_si128(t2, _k.ni.k[5]);
-			t3 = _mm_aesenc_si128(t3, _k.ni.k[5]);
-			t4 = _mm_aesenc_si128(t4, _k.ni.k[5]);
-			t1 = _mm_aesenc_si128(t1, _k.ni.k[6]);
-			t2 = _mm_aesenc_si128(t2, _k.ni.k[6]);
-			t3 = _mm_aesenc_si128(t3, _k.ni.k[6]);
-			t4 = _mm_aesenc_si128(t4, _k.ni.k[6]);
-			t1 = _mm_aesenc_si128(t1, _k.ni.k[7]);
-			t2 = _mm_aesenc_si128(t2, _k.ni.k[7]);
-			t3 = _mm_aesenc_si128(t3, _k.ni.k[7]);
-			t4 = _mm_aesenc_si128(t4, _k.ni.k[7]);
-			t1 = _mm_aesenc_si128(t1, _k.ni.k[8]);
-			t2 = _mm_aesenc_si128(t2, _k.ni.k[8]);
-			t3 = _mm_aesenc_si128(t3, _k.ni.k[8]);
-			t4 = _mm_aesenc_si128(t4, _k.ni.k[8]);
-			t1 = _mm_aesenc_si128(t1, _k.ni.k[9]);
-			t2 = _mm_aesenc_si128(t2, _k.ni.k[9]);
-			t3 = _mm_aesenc_si128(t3, _k.ni.k[9]);
-			t4 = _mm_aesenc_si128(t4, _k.ni.k[9]);
-			t1 = _mm_aesenc_si128(t1, _k.ni.k[10]);
-			t2 = _mm_aesenc_si128(t2, _k.ni.k[10]);
-			t3 = _mm_aesenc_si128(t3, _k.ni.k[10]);
-			t4 = _mm_aesenc_si128(t4, _k.ni.k[10]);
-			t1 = _mm_aesenc_si128(t1, _k.ni.k[11]);
-			t2 = _mm_aesenc_si128(t2, _k.ni.k[11]);
-			t3 = _mm_aesenc_si128(t3, _k.ni.k[11]);
-			t4 = _mm_aesenc_si128(t4, _k.ni.k[11]);
-			t1 = _mm_aesenc_si128(t1, _k.ni.k[12]);
-			t2 = _mm_aesenc_si128(t2, _k.ni.k[12]);
-			t3 = _mm_aesenc_si128(t3, _k.ni.k[12]);
-			t4 = _mm_aesenc_si128(t4, _k.ni.k[12]);
-			t1 = _mm_aesenc_si128(t1, _k.ni.k[13]);
-			t2 = _mm_aesenc_si128(t2, _k.ni.k[13]);
-			t3 = _mm_aesenc_si128(t3, _k.ni.k[13]);
-			t4 = _mm_aesenc_si128(t4, _k.ni.k[13]);
-			t1 = _mm_aesenclast_si128(t1, _k.ni.k[14]);
-			t2 = _mm_aesenclast_si128(t2, _k.ni.k[14]);
-			t3 = _mm_aesenclast_si128(t3, _k.ni.k[14]);
-			t4 = _mm_aesenclast_si128(t4, _k.ni.k[14]);
-			t1 = _mm_xor_si128(t1, d1);
-			t2 = _mm_xor_si128(t2, d2);
-			t3 = _mm_xor_si128(t3, d3);
-			t4 = _mm_xor_si128(t4, d4);
-			_mm_storeu_si128(bo + i + 0, t1);
-			_mm_storeu_si128(bo + i + 1, t2);
-			_mm_storeu_si128(bo + i + 2, t3);
-			_mm_storeu_si128(bo + i + 3, t4);
+			t1 = _mm_aesenc_si128(t1,k = _k.ni.k[1]);
+			t2 = _mm_aesenc_si128(t2,k);
+			t3 = _mm_aesenc_si128(t3,k);
+			t4 = _mm_aesenc_si128(t4,k);
+			t1 = _mm_aesenc_si128(t1,k = _k.ni.k[2]);
+			t2 = _mm_aesenc_si128(t2,k);
+			t3 = _mm_aesenc_si128(t3,k);
+			t4 = _mm_aesenc_si128(t4,k);
+			t1 = _mm_aesenc_si128(t1,k = _k.ni.k[3]);
+			t2 = _mm_aesenc_si128(t2,k);
+			t3 = _mm_aesenc_si128(t3,k);
+			t4 = _mm_aesenc_si128(t4,k);
+			t1 = _mm_aesenc_si128(t1,k = _k.ni.k[4]);
+			t2 = _mm_aesenc_si128(t2,k);
+			t3 = _mm_aesenc_si128(t3,k);
+			t4 = _mm_aesenc_si128(t4,k);
+			t1 = _mm_aesenc_si128(t1,k = _k.ni.k[5]);
+			t2 = _mm_aesenc_si128(t2,k);
+			t3 = _mm_aesenc_si128(t3,k);
+			t4 = _mm_aesenc_si128(t4,k);
+			t1 = _mm_aesenc_si128(t1,k = _k.ni.k[6]);
+			t2 = _mm_aesenc_si128(t2,k);
+			t3 = _mm_aesenc_si128(t3,k);
+			t4 = _mm_aesenc_si128(t4,k);
+			t1 = _mm_aesenc_si128(t1,k = _k.ni.k[7]);
+			t2 = _mm_aesenc_si128(t2,k);
+			t3 = _mm_aesenc_si128(t3,k);
+			t4 = _mm_aesenc_si128(t4,k);
+			t1 = _mm_aesenc_si128(t1,k = _k.ni.k[8]);
+			t2 = _mm_aesenc_si128(t2,k);
+			t3 = _mm_aesenc_si128(t3,k);
+			t4 = _mm_aesenc_si128(t4,k);
+			t1 = _mm_aesenc_si128(t1,k = _k.ni.k[9]);
+			t2 = _mm_aesenc_si128(t2,k);
+			t3 = _mm_aesenc_si128(t3,k);
+			t4 = _mm_aesenc_si128(t4,k);
+			t1 = _mm_aesenc_si128(t1,k = _k.ni.k[10]);
+			t2 = _mm_aesenc_si128(t2,k);
+			t3 = _mm_aesenc_si128(t3,k);
+			t4 = _mm_aesenc_si128(t4,k);
+			t1 = _mm_aesenc_si128(t1,k = _k.ni.k[11]);
+			t2 = _mm_aesenc_si128(t2,k);
+			t3 = _mm_aesenc_si128(t3,k);
+			t4 = _mm_aesenc_si128(t4,k);
+			t1 = _mm_aesenc_si128(t1,k = _k.ni.k[12]);
+			t2 = _mm_aesenc_si128(t2,k);
+			t3 = _mm_aesenc_si128(t3,k);
+			t4 = _mm_aesenc_si128(t4,k);
+			t1 = _mm_aesenc_si128(t1,k = _k.ni.k[13]);
+			t2 = _mm_aesenc_si128(t2,k);
+			t3 = _mm_aesenc_si128(t3,k);
+			t4 = _mm_aesenc_si128(t4,k);
+			t1 = _mm_aesenclast_si128(t1,k = _k.ni.k[14]);
+			t2 = _mm_aesenclast_si128(t2,k);
+			t3 = _mm_aesenclast_si128(t3,k);
+			t4 = _mm_aesenclast_si128(t4,k);
+			t1 = _mm_xor_si128(t1,d1);
+			t2 = _mm_xor_si128(t2,d2);
+			t3 = _mm_xor_si128(t3,d3);
+			t4 = _mm_xor_si128(t4,d4);
+			_mm_storeu_si128(bo + i + 0,t1);
+			_mm_storeu_si128(bo + i + 1,t2);
+			_mm_storeu_si128(bo + i + 2,t3);
+			_mm_storeu_si128(bo + i + 3,t4);
 		}
 
 		for (i=pblocks;i<blocks;i++) {
 			d1 = _mm_loadu_si128(bi + i);
 			y = _ghash_aesni(_k.ni.h,y,d1);
-			t1 = _mm_xor_si128(cb, _k.ni.k[0]);
-			t1 = _mm_aesenc_si128(t1, _k.ni.k[1]);
-			t1 = _mm_aesenc_si128(t1, _k.ni.k[2]);
-			t1 = _mm_aesenc_si128(t1, _k.ni.k[3]);
-			t1 = _mm_aesenc_si128(t1, _k.ni.k[4]);
-			t1 = _mm_aesenc_si128(t1, _k.ni.k[5]);
-			t1 = _mm_aesenc_si128(t1, _k.ni.k[6]);
-			t1 = _mm_aesenc_si128(t1, _k.ni.k[7]);
-			t1 = _mm_aesenc_si128(t1, _k.ni.k[8]);
-			t1 = _mm_aesenc_si128(t1, _k.ni.k[9]);
-			t1 = _mm_aesenc_si128(t1, _k.ni.k[10]);
-			t1 = _mm_aesenc_si128(t1, _k.ni.k[11]);
-			t1 = _mm_aesenc_si128(t1, _k.ni.k[12]);
-			t1 = _mm_aesenc_si128(t1, _k.ni.k[13]);
-			t1 = _mm_aesenclast_si128(t1, _k.ni.k[14]);
-			t1 = _mm_xor_si128(t1, d1);
-			_mm_storeu_si128(bo + i, t1);
+			t1 = _mm_xor_si128(cb,_k.ni.k[0]);
+			t1 = _mm_aesenc_si128(t1,_k.ni.k[1]);
+			t1 = _mm_aesenc_si128(t1,_k.ni.k[2]);
+			t1 = _mm_aesenc_si128(t1,_k.ni.k[3]);
+			t1 = _mm_aesenc_si128(t1,_k.ni.k[4]);
+			t1 = _mm_aesenc_si128(t1,_k.ni.k[5]);
+			t1 = _mm_aesenc_si128(t1,_k.ni.k[6]);
+			t1 = _mm_aesenc_si128(t1,_k.ni.k[7]);
+			t1 = _mm_aesenc_si128(t1,_k.ni.k[8]);
+			t1 = _mm_aesenc_si128(t1,_k.ni.k[9]);
+			t1 = _mm_aesenc_si128(t1,_k.ni.k[10]);
+			t1 = _mm_aesenc_si128(t1,_k.ni.k[11]);
+			t1 = _mm_aesenc_si128(t1,_k.ni.k[12]);
+			t1 = _mm_aesenc_si128(t1,_k.ni.k[13]);
+			t1 = _mm_aesenclast_si128(t1,_k.ni.k[14]);
+			t1 = _mm_xor_si128(t1,d1);
+			_mm_storeu_si128(bo + i,t1);
 			cb = _increment_be_aesni(cb);
 		}