
Get rid of the complicated AES-GCM + scramble construct in favor of a simpler construct suggested by "Squeamish Ossifrage" on crypto.stackexchange.com

Adam Ierymenko committed 6 years ago
parent commit 9fd5ec673b
3 changed files with 289 additions and 969 deletions
  1. node/AES.cpp   (+0, -0)
  2. node/AES.hpp   (+172, -827)
  3. selftest.cpp   (+117, -142)
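For orientation before the diff: after this commit, node/AES.hpp exposes three primitives, single-block encrypt(), gmac() for authentication, and ctr() for bulk keystream, replacing the old gcmEncrypt()/gcmDecrypt()/scramble() trio. The sketch below only illustrates how those primitives can compose into a SIV-style scheme in the spirit of the commit message. The function name, the tag-as-CTR-IV layout, the include path, and the ZeroTier namespace are assumptions here, not the construction the protocol actually standardizes, and it presumes AES-NI is available since this commit leaves the software fallbacks for gmac()/ctr() unimplemented.

// Hypothetical sketch, not ZeroTier's wire format: authenticate the
// plaintext with GMAC, mask the tag with one ECB block, then reuse the
// masked tag as the 16-byte CTR IV (whose last 8 bytes act as the
// big-endian starting counter in this commit's ctr()).
#include <cstdint>
#include "node/AES.hpp" // assumed include path

static void sivEncryptSketch(const ZeroTier::AES &k,const uint8_t iv12[12],
	const void *in,unsigned int len,void *out,uint8_t tag[16])
{
	k.gmac(iv12,in,len,tag); // 16-byte GMAC over the plaintext
	k.encrypt(tag,tag);      // one ECB block masks the tag
	k.ctr(tag,in,len,out);   // masked tag doubles as the CTR IV
}

Decryption would run the same steps in reverse and compare the recomputed GMAC in constant time, much as the removed gcmDecrypt() did with Utils::secureEq.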

+ 0 - 0
node/AES.cpp
(file diff suppressed because it is too large)


+ 172 - 827
node/AES.hpp

@@ -80,155 +80,35 @@ public:
 		_encryptSW(in,out);
 	}
 
-	inline void decrypt(const uint8_t in[16],uint8_t out[16]) const
+	inline void gmac(const uint8_t iv[12],const void *in,const unsigned int len,uint8_t out[16]) const
 	{
 #ifdef ZT_AES_AESNI
 		if (likely(HW_ACCEL)) {
-			_decrypt_aesni(in,out);
-			return;
-		}
-#endif
-
-		_decryptSW(in,out);
-	}
-
-	inline void gcmEncrypt(const uint8_t iv[12],const void *in,unsigned int inlen,const void *assoc,unsigned int assoclen,void *out,uint8_t *tag,unsigned int taglen)
-	{
-#ifdef ZT_AES_AESNI
-		if (likely(HW_ACCEL)) {
-			_encrypt_gcm256_aesni(inlen,(const uint8_t *)in,(uint8_t *)out,iv,assoclen,(const uint8_t *)assoc,tag,taglen);
-			return;
-		}
-#endif
-		abort(); // TODO: software
-	}
-
-	inline bool gcmDecrypt(const uint8_t iv[12],const void *in,unsigned int inlen,const void *assoc,unsigned int assoclen,void *out,const uint8_t *tag,unsigned int taglen)
-	{
-#ifdef ZT_AES_AESNI
-		if (likely(HW_ACCEL)) {
-			uint8_t tagbuf[16];
-			_decrypt_gcm256_aesni(inlen,(const uint8_t *)in,(uint8_t *)out,iv,assoclen,(const uint8_t *)assoc,tagbuf,taglen);
-			return Utils::secureEq(tagbuf,tag,taglen);
-		}
-#endif
-		abort(); // TODO: software
-		return false;
-	}
-
-	static inline void scramble(const uint8_t key[16],const void *in,unsigned int inlen,void *out)
-	{
-		if (inlen < 16)
-			return;
-
-#ifdef ZT_AES_AESNI
-		if (likely(HW_ACCEL)) {
-			_scramble_aesni(key,(const uint8_t *)in,(uint8_t *)out,inlen);
+			_gmac_aesni(iv,(const uint8_t *)in,len,out);
 			return;
 		}
 #endif
 	}
 
-	static inline void unscramble(const uint8_t key[16],const void *in,unsigned int inlen,void *out)
+	inline void ctr(const uint8_t iv[16],const void *in,const unsigned int len,void *out) const
 	{
-		if (inlen < 16)
-			return;
-
 #ifdef ZT_AES_AESNI
 		if (likely(HW_ACCEL)) {
-			_unscramble_aesni(key,(const uint8_t *)in,(uint8_t *)out,inlen);
+			_crypt_ctr_aesni(iv,(const uint8_t *)in,len,(uint8_t *)out);
 			return;
 		}
 #endif
 	}
 
-	/**
-	 * Encrypt with AES256-GCM-DDS
-	 *
-	 * DDS stands for Data Dependent Scramble and refers to our scheme for nonce
-	 * duplication resistance.
-	 *
-	 * @param iv IV (usually random)
-	 * @param in Input plaintext
-	 * @param inlen Length of plaintext
-	 * @param assoc Associated data that won't be encrypted
-	 * @param assoclen Length of associated data
-	 * @param out Output ciphertext buffer (must be at least inlen in size)
-	 * @param combinedTag Buffer to receive 128-bit encrypted combined IV and MAC
-	 */
-	inline void gcmDdsEncrypt(const uint64_t iv,const void *in,unsigned int inlen,const void *assoc,unsigned int assoclen,void *out,uint64_t combinedTag[2])
-	{
-		// Make 12-byte GCM IV (use combinedTag as tmp buffer)
-		combinedTag[0] = iv;
-		((uint8_t *)combinedTag)[8] = (uint8_t)(inlen >> 16);
-		((uint8_t *)combinedTag)[9] = (uint8_t)(inlen >> 8);
-		((uint8_t *)combinedTag)[10] = (uint8_t)inlen;
-		((uint8_t *)combinedTag)[11] = (uint8_t)assoclen;
-
-		// Encrypt data and store 64-bit tag/MAC code in second 64 bits of combinedTag.
-		gcmEncrypt((const uint8_t *)combinedTag,in,inlen,assoc,assoclen,out,((uint8_t *)&(combinedTag[1])),8);
-
-		// Encrypt combinedTag once to get scramble key
-		encrypt((const uint8_t *)combinedTag,(uint8_t *)combinedTag);
-
-		// Scramble ciphertext
-		scramble((const uint8_t *)combinedTag,out,inlen,out);
-
-		// Encrypt combinedTag again to get masked tag to include with message
-		encrypt((const uint8_t *)combinedTag,(uint8_t *)combinedTag);
-	}
-
-	/**
-	 * Decrypt with AES256-GCM-DDS
-	 *
-	 * @param combinedTag Encrypted combined tag
-	 * @param in Input ciphertext
-	 * @param inlen Length of ciphertext
-	 * @param assoc Associated data that wasn't encrypted
-	 * @param assoclen Length of associated data
-	 * @param out Output plaintext buffer (must be at least inlen in size)
-	 * @return True if GCM authentication check succeeded (if false, discard packet)
-	 */
-	inline bool gcmDdsDecrypt(const uint64_t combinedTag[2],const void *in,unsigned int inlen,const void *assoc,unsigned int assoclen,void *out)
-	{
-		uint64_t tmp[2],gcmIv[2];
-
-		// Decrypt combinedTag to get scramble key
-		decrypt((const uint8_t *)combinedTag,(uint8_t *)tmp);
-
-		// Unscramble ciphertext
-		unscramble((const uint8_t *)tmp,in,inlen,out);
-
-		// Decrypt combinedTag again to get original IV and AES-GCM MAC
-		decrypt((const uint8_t *)tmp,(uint8_t *)tmp);
-
-		// Make 12-byte GCM IV
-		gcmIv[0] = tmp[0];
-		((uint8_t *)gcmIv)[8] = (uint8_t)(inlen >> 16);
-		((uint8_t *)gcmIv)[9] = (uint8_t)(inlen >> 8);
-		((uint8_t *)gcmIv)[10] = (uint8_t)inlen;
-		((uint8_t *)gcmIv)[11] = (uint8_t)assoclen;
-
-		// Perform GCM decryption and authentication
-		return gcmDecrypt((const uint8_t *)gcmIv,out,inlen,assoc,assoclen,out,(const uint8_t *)&(tmp[1]),8);
-	}
-
 private:
 	static const uint32_t Te0[256];
 	static const uint32_t Te1[256];
 	static const uint32_t Te2[256];
 	static const uint32_t Te3[256];
-	static const uint32_t Te4[256];
-	static const uint32_t Td0[256];
-	static const uint32_t Td1[256];
-	static const uint32_t Td2[256];
-	static const uint32_t Td3[256];
-	static const uint8_t Td4[256];
 	static const uint32_t rcon[10];
 
 	void _initSW(const uint8_t key[32]);
 	void _encryptSW(const uint8_t in[16],uint8_t out[16]) const;
-	void _decryptSW(const uint8_t in[16],uint8_t out[16]) const;
 
 	/**************************************************************************/
 	union {
@@ -239,13 +119,12 @@ private:
 #endif
 #ifdef ZT_AES_AESNI
 		struct {
-			__m128i k[28];
+			__m128i k[15];
 			__m128i h,hh,hhh,hhhh;
 		} ni;
 #endif
 		struct {
 			uint32_t ek[60];
-			uint32_t dk[60];
 		} sw;
 	} _k;
 	/**************************************************************************/
@@ -331,24 +210,6 @@ private:
 		*data = vaesmcq_u8(vaeseq_u8(*data, (uint8x16_t)_k.neon.k[13]));
 		*data = vaeseq_u8(*data, _k.neon.k[14]);
 	}
-	inline void _decrypt_armneon(uint8x16_t *data) const
-	{
-		*data = veorq_u8(*data, _k.neon.k[14]);
-		*data = vaesimcq_u8(vaesdq_u8(*data, (uint8x16_t)_k.neon.k[13]));
-		*data = vaesimcq_u8(vaesdq_u8(*data, (uint8x16_t)_k.neon.k[12]));
-		*data = vaesimcq_u8(vaesdq_u8(*data, (uint8x16_t)_k.neon.k[11]));
-		*data = vaesimcq_u8(vaesdq_u8(*data, (uint8x16_t)_k.neon.k[10]));
-		*data = vaesimcq_u8(vaesdq_u8(*data, (uint8x16_t)_k.neon.k[9]));
-		*data = vaesimcq_u8(vaesdq_u8(*data, (uint8x16_t)_k.neon.k[8]));
-		*data = vaesimcq_u8(vaesdq_u8(*data, (uint8x16_t)_k.neon.k[7]));
-		*data = vaesimcq_u8(vaesdq_u8(*data, (uint8x16_t)_k.neon.k[6]));
-		*data = vaesimcq_u8(vaesdq_u8(*data, (uint8x16_t)_k.neon.k[5]));
-		*data = vaesimcq_u8(vaesdq_u8(*data, (uint8x16_t)_k.neon.k[4]));
-		*data = vaesimcq_u8(vaesdq_u8(*data, (uint8x16_t)_k.neon.k[3]));
-		*data = vaesimcq_u8(vaesdq_u8(*data, (uint8x16_t)_k.neon.k[2]));
-		*data = vaesimcq_u8(vaesdq_u8(*data, (uint8x16_t)_k.neon.k[1]));
-		*data = vaesdq_u8(*data, (uint8x16_t)_k.neon.k[0]);
-	}
 #endif /*********************************************************************/
 
 #ifdef ZT_AES_AESNI /********************************************************/
@@ -397,19 +258,6 @@ private:
 		_k.ni.k[12] = t1 = _init256_1_aesni(t1,_mm_aeskeygenassist_si128(t2,0x20));
 		_k.ni.k[13] = t2 = _init256_2_aesni(t1,t2);
 		_k.ni.k[14] = _init256_1_aesni(t1,_mm_aeskeygenassist_si128(t2,0x40));
-		_k.ni.k[15] = _mm_aesimc_si128(_k.ni.k[13]);
-		_k.ni.k[16] = _mm_aesimc_si128(_k.ni.k[12]);
-		_k.ni.k[17] = _mm_aesimc_si128(_k.ni.k[11]);
-		_k.ni.k[18] = _mm_aesimc_si128(_k.ni.k[10]);
-		_k.ni.k[19] = _mm_aesimc_si128(_k.ni.k[9]);
-		_k.ni.k[20] = _mm_aesimc_si128(_k.ni.k[8]);
-		_k.ni.k[21] = _mm_aesimc_si128(_k.ni.k[7]);
-		_k.ni.k[22] = _mm_aesimc_si128(_k.ni.k[6]);
-		_k.ni.k[23] = _mm_aesimc_si128(_k.ni.k[5]);
-		_k.ni.k[24] = _mm_aesimc_si128(_k.ni.k[4]);
-		_k.ni.k[25] = _mm_aesimc_si128(_k.ni.k[3]);
-		_k.ni.k[26] = _mm_aesimc_si128(_k.ni.k[2]);
-		_k.ni.k[27] = _mm_aesimc_si128(_k.ni.k[1]);
 
 		__m128i h = _mm_xor_si128(_mm_setzero_si128(),_k.ni.k[0]);
 		h = _mm_aesenc_si128(h,_k.ni.k[1]);
@@ -436,268 +284,6 @@ private:
 		_k.ni.hhhh = _swap128_aesni(hhhh);
 	}
 
-	static ZT_ALWAYS_INLINE __m128i _assist128_aesni(__m128i a,__m128i b)
-	{
-		__m128i c;
-		b = _mm_shuffle_epi32(b ,0xff);
-		c = _mm_slli_si128(a, 0x04);
-		a = _mm_xor_si128(a, c);
-		c = _mm_slli_si128(c, 0x04);
-		a = _mm_xor_si128(a, c);
-		c = _mm_slli_si128(c, 0x04);
-		a = _mm_xor_si128(a, c);
-		a = _mm_xor_si128(a, b);
-		return a;
-	}
-	static ZT_ALWAYS_INLINE void _scramble_aesni(const uint8_t key[16],const uint8_t *in,uint8_t *out,unsigned int len)
-	{
-		__m128i t = _mm_loadu_si128((const __m128i *)key);
-		__m128i k0 = t;
-		__m128i k1 = t = _assist128_aesni(t, _mm_aeskeygenassist_si128(t, 0x01));
-		__m128i k2 = t = _assist128_aesni(t, _mm_aeskeygenassist_si128(t, 0x02));
-		__m128i k3 = t = _assist128_aesni(t, _mm_aeskeygenassist_si128(t, 0x04));
-		__m128i k4 = t = _assist128_aesni(t, _mm_aeskeygenassist_si128(t, 0x08));
-		__m128i k5 = t = _assist128_aesni(t, _mm_aeskeygenassist_si128(t, 0x10));
-		__m128i k6 = t = _assist128_aesni(t, _mm_aeskeygenassist_si128(t, 0x20));
-		__m128i k7 = t = _assist128_aesni(t, _mm_aeskeygenassist_si128(t, 0x40));
-		__m128i k8 = t = _assist128_aesni(t, _mm_aeskeygenassist_si128(t, 0x80));
-		__m128i k9 = t = _assist128_aesni(t, _mm_aeskeygenassist_si128(t, 0x1b));
-		__m128i k10 = t = _assist128_aesni(t, _mm_aeskeygenassist_si128(t, 0x36));
-		__m128i ctr = _mm_setzero_si128();
-		const __m128i one = _mm_set_epi32(0,0,0,1);
-
-		while (len >= 64) {
-			len -= 64;
-			__m128i d0 = _mm_xor_si128(_mm_loadu_si128((const __m128i *)in),ctr);
-			ctr = _mm_add_epi64(ctr,one);
-			in += 16;
-			__m128i d1 = _mm_xor_si128(_mm_loadu_si128((const __m128i *)in),ctr);
-			ctr = _mm_add_epi64(ctr,one);
-			in += 16;
-			__m128i d2 = _mm_xor_si128(_mm_loadu_si128((const __m128i *)in),ctr);
-			ctr = _mm_add_epi64(ctr,one);
-			in += 16;
-			__m128i d3 = _mm_xor_si128(_mm_loadu_si128((const __m128i *)in),ctr);
-			ctr = _mm_add_epi64(ctr,one);
-			in += 16;
-			d0 = _mm_xor_si128(d0,k0);
-			d1 = _mm_xor_si128(d1,k0);
-			d2 = _mm_xor_si128(d2,k0);
-			d3 = _mm_xor_si128(d3,k0);
-			d0 = _mm_aesenc_si128(d0,k1);
-			d1 = _mm_aesenc_si128(d1,k1);
-			d2 = _mm_aesenc_si128(d2,k1);
-			d3 = _mm_aesenc_si128(d3,k1);
-			d0 = _mm_aesenc_si128(d0,k2);
-			d1 = _mm_aesenc_si128(d1,k2);
-			d2 = _mm_aesenc_si128(d2,k2);
-			d3 = _mm_aesenc_si128(d3,k2);
-			d0 = _mm_aesenc_si128(d0,k3);
-			d1 = _mm_aesenc_si128(d1,k3);
-			d2 = _mm_aesenc_si128(d2,k3);
-			d3 = _mm_aesenc_si128(d3,k3);
-			d0 = _mm_aesenc_si128(d0,k4);
-			d1 = _mm_aesenc_si128(d1,k4);
-			d2 = _mm_aesenc_si128(d2,k4);
-			d3 = _mm_aesenc_si128(d3,k4);
-			d0 = _mm_aesenc_si128(d0,k5);
-			d1 = _mm_aesenc_si128(d1,k5);
-			d2 = _mm_aesenc_si128(d2,k5);
-			d3 = _mm_aesenc_si128(d3,k5);
-			d0 = _mm_aesenc_si128(d0,k6);
-			d1 = _mm_aesenc_si128(d1,k6);
-			d2 = _mm_aesenc_si128(d2,k6);
-			d3 = _mm_aesenc_si128(d3,k6);
-			d0 = _mm_aesenc_si128(d0,k7);
-			d1 = _mm_aesenc_si128(d1,k7);
-			d2 = _mm_aesenc_si128(d2,k7);
-			d3 = _mm_aesenc_si128(d3,k7);
-			d0 = _mm_aesenc_si128(d0,k8);
-			d1 = _mm_aesenc_si128(d1,k8);
-			d2 = _mm_aesenc_si128(d2,k8);
-			d3 = _mm_aesenc_si128(d3,k8);
-			d0 = _mm_aesenc_si128(d0,k9);
-			d1 = _mm_aesenc_si128(d1,k9);
-			d2 = _mm_aesenc_si128(d2,k9);
-			d3 = _mm_aesenc_si128(d3,k9);
-			_mm_storeu_si128((__m128i *)out,_mm_aesenclast_si128(d0,k10));
-			out += 16;
-			_mm_storeu_si128((__m128i *)out,_mm_aesenclast_si128(d1,k10));
-			out += 16;
-			_mm_storeu_si128((__m128i *)out,_mm_aesenclast_si128(d2,k10));
-			out += 16;
-			_mm_storeu_si128((__m128i *)out,_mm_aesenclast_si128(d3,k10));
-			out += 16;
-		}
-
-		while (len >= 16) {
-			len -= 16;
-			__m128i d0 = _mm_xor_si128(_mm_loadu_si128((const __m128i *)in),ctr);
-			ctr = _mm_add_epi64(ctr,one);
-			in += 16;
-			d0 = _mm_xor_si128(d0,k0);
-			d0 = _mm_aesenc_si128(d0,k1);
-			d0 = _mm_aesenc_si128(d0,k2);
-			d0 = _mm_aesenc_si128(d0,k3);
-			d0 = _mm_aesenc_si128(d0,k4);
-			d0 = _mm_aesenc_si128(d0,k5);
-			d0 = _mm_aesenc_si128(d0,k6);
-			d0 = _mm_aesenc_si128(d0,k7);
-			d0 = _mm_aesenc_si128(d0,k8);
-			d0 = _mm_aesenc_si128(d0,k9);
-			_mm_storeu_si128((__m128i *)out,_mm_aesenclast_si128(d0,k10));
-			out += 16;
-		}
-
-		if (len) {
-			__m128i last = ctr;
-			last = _mm_xor_si128(last,k0);
-			last = _mm_aesenc_si128(last,k1);
-			last = _mm_aesenc_si128(last,k2);
-			last = _mm_aesenc_si128(last,k3);
-			last = _mm_aesenc_si128(last,k4);
-			last = _mm_aesenc_si128(last,k5);
-			last = _mm_aesenc_si128(last,k6);
-			last = _mm_aesenc_si128(last,k7);
-			last = _mm_aesenc_si128(last,k8);
-			last = _mm_aesenc_si128(last,k9);
-			uint8_t lpad[16];
-			_mm_storeu_si128((__m128i *)lpad,_mm_aesenclast_si128(last,k10));
-			for(unsigned int i=0;i<len;++i) {
-				out[i] = in[i] ^ lpad[i];
-			}
-		}
-	}
-
-	static ZT_ALWAYS_INLINE void _unscramble_aesni(const uint8_t key[16],const uint8_t *in,uint8_t *out,unsigned int len)
-	{
-		__m128i t = _mm_loadu_si128((const __m128i *)key);
-		__m128i dk10 = t; // k0
-		__m128i k1 = t = _assist128_aesni(t, _mm_aeskeygenassist_si128(t, 0x01));
-		__m128i k2 = t = _assist128_aesni(t, _mm_aeskeygenassist_si128(t, 0x02));
-		__m128i k3 = t = _assist128_aesni(t, _mm_aeskeygenassist_si128(t, 0x04));
-		__m128i k4 = t = _assist128_aesni(t, _mm_aeskeygenassist_si128(t, 0x08));
-		__m128i k5 = t = _assist128_aesni(t, _mm_aeskeygenassist_si128(t, 0x10));
-		__m128i k6 = t = _assist128_aesni(t, _mm_aeskeygenassist_si128(t, 0x20));
-		__m128i k7 = t = _assist128_aesni(t, _mm_aeskeygenassist_si128(t, 0x40));
-		__m128i k8 = t = _assist128_aesni(t, _mm_aeskeygenassist_si128(t, 0x80));
-		__m128i k9 = t = _assist128_aesni(t, _mm_aeskeygenassist_si128(t, 0x1b));
-		__m128i dk0 = t = _assist128_aesni(t, _mm_aeskeygenassist_si128(t, 0x36)); // k10
-		__m128i dk1 = _mm_aesimc_si128(k9);
-		__m128i dk2 = _mm_aesimc_si128(k8);
-		__m128i dk3 = _mm_aesimc_si128(k7);
-		__m128i dk4 = _mm_aesimc_si128(k6);
-		__m128i dk5 = _mm_aesimc_si128(k5);
-		__m128i dk6 = _mm_aesimc_si128(k4);
-		__m128i dk7 = _mm_aesimc_si128(k3);
-		__m128i dk8 = _mm_aesimc_si128(k2);
-		__m128i dk9 = _mm_aesimc_si128(k1);
-		__m128i ctr = _mm_setzero_si128();
-		const __m128i one = _mm_set_epi32(0,0,0,1);
-
-		while (len >= 64) {
-			len -= 64;
-			__m128i d0 = _mm_loadu_si128((const __m128i *)in);
-			in += 16;
-			__m128i d1 = _mm_loadu_si128((const __m128i *)in);
-			in += 16;
-			__m128i d2 = _mm_loadu_si128((const __m128i *)in);
-			in += 16;
-			__m128i d3 = _mm_loadu_si128((const __m128i *)in);
-			in += 16;
-			d0 = _mm_xor_si128(d0,dk0);
-			d1 = _mm_xor_si128(d1,dk0);
-			d2 = _mm_xor_si128(d2,dk0);
-			d3 = _mm_xor_si128(d3,dk0);
-			d0 = _mm_aesdec_si128(d0,dk1);
-			d1 = _mm_aesdec_si128(d1,dk1);
-			d2 = _mm_aesdec_si128(d2,dk1);
-			d3 = _mm_aesdec_si128(d3,dk1);
-			d0 = _mm_aesdec_si128(d0,dk2);
-			d1 = _mm_aesdec_si128(d1,dk2);
-			d2 = _mm_aesdec_si128(d2,dk2);
-			d3 = _mm_aesdec_si128(d3,dk2);
-			d0 = _mm_aesdec_si128(d0,dk3);
-			d1 = _mm_aesdec_si128(d1,dk3);
-			d2 = _mm_aesdec_si128(d2,dk3);
-			d3 = _mm_aesdec_si128(d3,dk3);
-			d0 = _mm_aesdec_si128(d0,dk4);
-			d1 = _mm_aesdec_si128(d1,dk4);
-			d2 = _mm_aesdec_si128(d2,dk4);
-			d3 = _mm_aesdec_si128(d3,dk4);
-			d0 = _mm_aesdec_si128(d0,dk5);
-			d1 = _mm_aesdec_si128(d1,dk5);
-			d2 = _mm_aesdec_si128(d2,dk5);
-			d3 = _mm_aesdec_si128(d3,dk5);
-			d0 = _mm_aesdec_si128(d0,dk6);
-			d1 = _mm_aesdec_si128(d1,dk6);
-			d2 = _mm_aesdec_si128(d2,dk6);
-			d3 = _mm_aesdec_si128(d3,dk6);
-			d0 = _mm_aesdec_si128(d0,dk7);
-			d1 = _mm_aesdec_si128(d1,dk7);
-			d2 = _mm_aesdec_si128(d2,dk7);
-			d3 = _mm_aesdec_si128(d3,dk7);
-			d0 = _mm_aesdec_si128(d0,dk8);
-			d1 = _mm_aesdec_si128(d1,dk8);
-			d2 = _mm_aesdec_si128(d2,dk8);
-			d3 = _mm_aesdec_si128(d3,dk8);
-			d0 = _mm_aesdec_si128(d0,dk9);
-			d1 = _mm_aesdec_si128(d1,dk9);
-			d2 = _mm_aesdec_si128(d2,dk9);
-			d3 = _mm_aesdec_si128(d3,dk9);
-			_mm_storeu_si128((__m128i *)out,_mm_xor_si128(_mm_aesdeclast_si128(d0,dk10),ctr));
-			ctr = _mm_add_epi64(ctr,one);
-			out += 16;
-			_mm_storeu_si128((__m128i *)out,_mm_xor_si128(_mm_aesdeclast_si128(d1,dk10),ctr));
-			ctr = _mm_add_epi64(ctr,one);
-			out += 16;
-			_mm_storeu_si128((__m128i *)out,_mm_xor_si128(_mm_aesdeclast_si128(d2,dk10),ctr));
-			ctr = _mm_add_epi64(ctr,one);
-			out += 16;
-			_mm_storeu_si128((__m128i *)out,_mm_xor_si128(_mm_aesdeclast_si128(d3,dk10),ctr));
-			ctr = _mm_add_epi64(ctr,one);
-			out += 16;
-		}
-
-		while (len >= 16) {
-			len -= 16;
-			__m128i d0 = _mm_loadu_si128((const __m128i *)in);
-			in += 16;
-			d0 = _mm_xor_si128(d0,dk0);
-			d0 = _mm_aesdec_si128(d0,dk1);
-			d0 = _mm_aesdec_si128(d0,dk2);
-			d0 = _mm_aesdec_si128(d0,dk3);
-			d0 = _mm_aesdec_si128(d0,dk4);
-			d0 = _mm_aesdec_si128(d0,dk5);
-			d0 = _mm_aesdec_si128(d0,dk6);
-			d0 = _mm_aesdec_si128(d0,dk7);
-			d0 = _mm_aesdec_si128(d0,dk8);
-			d0 = _mm_aesdec_si128(d0,dk9);
-			_mm_storeu_si128((__m128i *)out,_mm_xor_si128(_mm_aesdeclast_si128(d0,dk10),ctr));
-			ctr = _mm_add_epi64(ctr,one);
-			out += 16;
-		}
-
-		if (len) {
-			__m128i last = ctr;
-			last = _mm_xor_si128(last,dk10); // k0
-			last = _mm_aesenc_si128(last,k1);
-			last = _mm_aesenc_si128(last,k2);
-			last = _mm_aesenc_si128(last,k3);
-			last = _mm_aesenc_si128(last,k4);
-			last = _mm_aesenc_si128(last,k5);
-			last = _mm_aesenc_si128(last,k6);
-			last = _mm_aesenc_si128(last,k7);
-			last = _mm_aesenc_si128(last,k8);
-			last = _mm_aesenc_si128(last,k9);
-			uint8_t lpad[16];
-			_mm_storeu_si128((__m128i *)lpad,_mm_aesenclast_si128(last,dk0)); // k10
-			for(unsigned int i=0;i<len;++i) {
-				out[i] = in[i] ^ lpad[i];
-			}
-		}
-	}
-
 	ZT_ALWAYS_INLINE void _encrypt_aesni(const void *in,void *out) const
 	{
 		__m128i tmp;
@@ -719,44 +305,152 @@ private:
 		_mm_storeu_si128((__m128i *)out,_mm_aesenclast_si128(tmp,_k.ni.k[14]));
 	}
 
-	ZT_ALWAYS_INLINE void _decrypt_aesni(const void *in,void *out) const
+	ZT_ALWAYS_INLINE void _crypt_ctr_aesni(const uint8_t iv[16],const uint8_t *in,unsigned int len,uint8_t *out) const
 	{
-		__m128i tmp;
-		tmp = _mm_loadu_si128((const __m128i *)in);
-		tmp = _mm_xor_si128(tmp,_k.ni.k[14]);
-		tmp = _mm_aesdec_si128(tmp,_k.ni.k[15]);
-		tmp = _mm_aesdec_si128(tmp,_k.ni.k[16]);
-		tmp = _mm_aesdec_si128(tmp,_k.ni.k[17]);
-		tmp = _mm_aesdec_si128(tmp,_k.ni.k[18]);
-		tmp = _mm_aesdec_si128(tmp,_k.ni.k[19]);
-		tmp = _mm_aesdec_si128(tmp,_k.ni.k[20]);
-		tmp = _mm_aesdec_si128(tmp,_k.ni.k[21]);
-		tmp = _mm_aesdec_si128(tmp,_k.ni.k[22]);
-		tmp = _mm_aesdec_si128(tmp,_k.ni.k[23]);
-		tmp = _mm_aesdec_si128(tmp,_k.ni.k[24]);
-		tmp = _mm_aesdec_si128(tmp,_k.ni.k[25]);
-		tmp = _mm_aesdec_si128(tmp,_k.ni.k[26]);
-		tmp = _mm_aesdec_si128(tmp,_k.ni.k[27]);
-		_mm_storeu_si128((__m128i *)out,_mm_aesdeclast_si128(tmp,_k.ni.k[0]));
+		const uint64_t iv0 = *((const uint64_t *)iv);
+		uint64_t ctr = Utils::ntoh(*((const uint64_t *)(iv+8)));
+
+		const __m128i k0 = _k.ni.k[0];
+		const __m128i k1 = _k.ni.k[1];
+		const __m128i k2 = _k.ni.k[2];
+		const __m128i k3 = _k.ni.k[3];
+		const __m128i k4 = _k.ni.k[4];
+		const __m128i k5 = _k.ni.k[5];
+		const __m128i k6 = _k.ni.k[6];
+		const __m128i k7 = _k.ni.k[7];
+		const __m128i k8 = _k.ni.k[8];
+		const __m128i k9 = _k.ni.k[9];
+		const __m128i k10 = _k.ni.k[10];
+		const __m128i k11 = _k.ni.k[11];
+		const __m128i k12 = _k.ni.k[12];
+		const __m128i k13 = _k.ni.k[13];
+		const __m128i k14 = _k.ni.k[14];
+
+		while (len >= 64) {
+			__m128i c0 = _mm_xor_si128(_mm_set_epi64((__m64)Utils::hton(ctr),(__m64)iv0),k0);
+			__m128i c1 = _mm_xor_si128(_mm_set_epi64((__m64)Utils::hton(ctr+1ULL),(__m64)iv0),k0);
+			__m128i c2 = _mm_xor_si128(_mm_set_epi64((__m64)Utils::hton(ctr+2ULL),(__m64)iv0),k0);
+			__m128i c3 = _mm_xor_si128(_mm_set_epi64((__m64)Utils::hton(ctr+3ULL),(__m64)iv0),k0);
+			ctr += 4;
+			c0 = _mm_aesenc_si128(c0,k1);
+			c1 = _mm_aesenc_si128(c1,k1);
+			c2 = _mm_aesenc_si128(c2,k1);
+			c3 = _mm_aesenc_si128(c3,k1);
+			c0 = _mm_aesenc_si128(c0,k2);
+			c1 = _mm_aesenc_si128(c1,k2);
+			c2 = _mm_aesenc_si128(c2,k2);
+			c3 = _mm_aesenc_si128(c3,k2);
+			c0 = _mm_aesenc_si128(c0,k3);
+			c1 = _mm_aesenc_si128(c1,k3);
+			c2 = _mm_aesenc_si128(c2,k3);
+			c3 = _mm_aesenc_si128(c3,k3);
+			c0 = _mm_aesenc_si128(c0,k4);
+			c1 = _mm_aesenc_si128(c1,k4);
+			c2 = _mm_aesenc_si128(c2,k4);
+			c3 = _mm_aesenc_si128(c3,k4);
+			c0 = _mm_aesenc_si128(c0,k5);
+			c1 = _mm_aesenc_si128(c1,k5);
+			c2 = _mm_aesenc_si128(c2,k5);
+			c3 = _mm_aesenc_si128(c3,k5);
+			c0 = _mm_aesenc_si128(c0,k6);
+			c1 = _mm_aesenc_si128(c1,k6);
+			c2 = _mm_aesenc_si128(c2,k6);
+			c3 = _mm_aesenc_si128(c3,k6);
+			c0 = _mm_aesenc_si128(c0,k7);
+			c1 = _mm_aesenc_si128(c1,k7);
+			c2 = _mm_aesenc_si128(c2,k7);
+			c3 = _mm_aesenc_si128(c3,k7);
+			c0 = _mm_aesenc_si128(c0,k8);
+			c1 = _mm_aesenc_si128(c1,k8);
+			c2 = _mm_aesenc_si128(c2,k8);
+			c3 = _mm_aesenc_si128(c3,k8);
+			c0 = _mm_aesenc_si128(c0,k9);
+			c1 = _mm_aesenc_si128(c1,k9);
+			c2 = _mm_aesenc_si128(c2,k9);
+			c3 = _mm_aesenc_si128(c3,k9);
+			c0 = _mm_aesenc_si128(c0,k10);
+			c1 = _mm_aesenc_si128(c1,k10);
+			c2 = _mm_aesenc_si128(c2,k10);
+			c3 = _mm_aesenc_si128(c3,k10);
+			c0 = _mm_aesenc_si128(c0,k11);
+			c1 = _mm_aesenc_si128(c1,k11);
+			c2 = _mm_aesenc_si128(c2,k11);
+			c3 = _mm_aesenc_si128(c3,k11);
+			c0 = _mm_aesenc_si128(c0,k12);
+			c1 = _mm_aesenc_si128(c1,k12);
+			c2 = _mm_aesenc_si128(c2,k12);
+			c3 = _mm_aesenc_si128(c3,k12);
+			c0 = _mm_aesenc_si128(c0,k13);
+			c1 = _mm_aesenc_si128(c1,k13);
+			c2 = _mm_aesenc_si128(c2,k13);
+			c3 = _mm_aesenc_si128(c3,k13);
+			_mm_storeu_si128((__m128i *)out,_mm_xor_si128(_mm_loadu_si128((const __m128i *)in),_mm_aesenclast_si128(c0,k14)));
+			_mm_storeu_si128((__m128i *)(out + 16),_mm_xor_si128(_mm_loadu_si128((const __m128i *)(in + 16)),_mm_aesenclast_si128(c1,k14)));
+			_mm_storeu_si128((__m128i *)(out + 32),_mm_xor_si128(_mm_loadu_si128((const __m128i *)(in + 32)),_mm_aesenclast_si128(c2,k14)));
+			_mm_storeu_si128((__m128i *)(out + 48),_mm_xor_si128(_mm_loadu_si128((const __m128i *)(in + 48)),_mm_aesenclast_si128(c3,k14)));
+			in += 64;
+			out += 64;
+			len -= 64;
+		}
+
+		while (len >= 16) {
+			__m128i c0 = _mm_xor_si128(_mm_set_epi64((__m64)Utils::hton(ctr++),(__m64)iv0),k0);
+			c0 = _mm_aesenc_si128(c0,k1);
+			c0 = _mm_aesenc_si128(c0,k2);
+			c0 = _mm_aesenc_si128(c0,k3);
+			c0 = _mm_aesenc_si128(c0,k4);
+			c0 = _mm_aesenc_si128(c0,k5);
+			c0 = _mm_aesenc_si128(c0,k6);
+			c0 = _mm_aesenc_si128(c0,k7);
+			c0 = _mm_aesenc_si128(c0,k8);
+			c0 = _mm_aesenc_si128(c0,k9);
+			c0 = _mm_aesenc_si128(c0,k10);
+			c0 = _mm_aesenc_si128(c0,k11);
+			c0 = _mm_aesenc_si128(c0,k12);
+			c0 = _mm_aesenc_si128(c0,k13);
+			_mm_storeu_si128((__m128i *)out,_mm_xor_si128(_mm_loadu_si128((const __m128i *)in),_mm_aesenclast_si128(c0,k14)));
+			in += 16;
+			out += 16;
+			len -= 16;
+		}
+
+		if (len) {
+			__m128i c0 = _mm_xor_si128(_mm_set_epi64((__m64)Utils::hton(ctr++),(__m64)iv0),k0);
+			c0 = _mm_aesenc_si128(c0,k1);
+			c0 = _mm_aesenc_si128(c0,k2);
+			c0 = _mm_aesenc_si128(c0,k3);
+			c0 = _mm_aesenc_si128(c0,k4);
+			c0 = _mm_aesenc_si128(c0,k5);
+			c0 = _mm_aesenc_si128(c0,k6);
+			c0 = _mm_aesenc_si128(c0,k7);
+			c0 = _mm_aesenc_si128(c0,k8);
+			c0 = _mm_aesenc_si128(c0,k9);
+			c0 = _mm_aesenc_si128(c0,k10);
+			c0 = _mm_aesenc_si128(c0,k11);
+			c0 = _mm_aesenc_si128(c0,k12);
+			c0 = _mm_aesenc_si128(c0,k13);
+			c0 = _mm_aesenclast_si128(c0,k14);
+			for(unsigned int i=0;i<len;++i)
+				out[i] = in[i] ^ ((const uint8_t *)&c0)[i];
+		}
 	}
 
 	static ZT_ALWAYS_INLINE __m128i _swap128_aesni(__m128i x) { return _mm_shuffle_epi8(x,_mm_set_epi8(0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15)); }
 	static ZT_ALWAYS_INLINE __m128i _mult_block_aesni(__m128i h,__m128i y)
 	{
-		__m128i t1,t2,t3,t4,t5,t6;
 		y = _swap128_aesni(y);
-		t1 = _mm_clmulepi64_si128(h,y,0x00);
-		t2 = _mm_clmulepi64_si128(h,y,0x01);
-		t3 = _mm_clmulepi64_si128(h,y,0x10);
-		t4 = _mm_clmulepi64_si128(h,y,0x11);
+		__m128i t1 = _mm_clmulepi64_si128(h,y,0x00);
+		__m128i t2 = _mm_clmulepi64_si128(h,y,0x01);
+		__m128i t3 = _mm_clmulepi64_si128(h,y,0x10);
+		__m128i t4 = _mm_clmulepi64_si128(h,y,0x11);
 		t2 = _mm_xor_si128(t2,t3);
 		t3 = _mm_slli_si128(t2,8);
 		t2 = _mm_srli_si128(t2,8);
 		t1 = _mm_xor_si128(t1,t3);
 		t4 = _mm_xor_si128(t4,t2);
-		t5 = _mm_srli_epi32(t1,31);
+		__m128i t5 = _mm_srli_epi32(t1,31);
 		t1 = _mm_slli_epi32(t1,1);
-		t6 = _mm_srli_epi32(t4,31);
+		__m128i t6 = _mm_srli_epi32(t4,31);
 		t4 = _mm_slli_epi32(t4,1);
 		t3 = _mm_srli_si128(t5,12);
 		t6 = _mm_slli_si128(t6,4);
@@ -784,23 +478,22 @@ private:
 	}
 	static ZT_ALWAYS_INLINE __m128i _mult4xor_aesni(__m128i h1,__m128i h2,__m128i h3,__m128i h4,__m128i d1,__m128i d2,__m128i d3,__m128i d4)
 	{
-		__m128i t0,t1,t2,t3,t4,t5,t6,t7,t8,t9;
 		d1 = _swap128_aesni(d1);
 		d2 = _swap128_aesni(d2);
 		d3 = _swap128_aesni(d3);
 		d4 = _swap128_aesni(d4);
-		t0 = _mm_clmulepi64_si128(h1,d1,0x00);
-		t1 = _mm_clmulepi64_si128(h2,d2,0x00);
-		t2 = _mm_clmulepi64_si128(h3,d3,0x00);
-		t3 = _mm_clmulepi64_si128(h4,d4,0x00);
-		t8 = _mm_xor_si128(t0,t1);
+		__m128i t0 = _mm_clmulepi64_si128(h1,d1,0x00);
+		__m128i t1 = _mm_clmulepi64_si128(h2,d2,0x00);
+		__m128i t2 = _mm_clmulepi64_si128(h3,d3,0x00);
+		__m128i t3 = _mm_clmulepi64_si128(h4,d4,0x00);
+		__m128i t8 = _mm_xor_si128(t0,t1);
 		t8 = _mm_xor_si128(t8,t2);
 		t8 = _mm_xor_si128(t8,t3);
-		t4 = _mm_clmulepi64_si128(h1,d1,0x11);
-		t5 = _mm_clmulepi64_si128(h2,d2,0x11);
-		t6 = _mm_clmulepi64_si128(h3,d3,0x11);
-		t7 = _mm_clmulepi64_si128(h4,d4,0x11);
-		t9 = _mm_xor_si128(t4,t5);
+		__m128i t4 = _mm_clmulepi64_si128(h1,d1,0x11);
+		__m128i t5 = _mm_clmulepi64_si128(h2,d2,0x11);
+		__m128i t6 = _mm_clmulepi64_si128(h3,d3,0x11);
+		__m128i t7 = _mm_clmulepi64_si128(h4,d4,0x11);
+		__m128i t9 = _mm_xor_si128(t4,t5);
 		t9 = _mm_xor_si128(t9,t6);
 		t9 = _mm_xor_si128(t9,t7);
 		t0 = _mm_shuffle_epi32(h1,78);
@@ -861,68 +554,37 @@ private:
 		return _swap128_aesni(t6);
 	}
 	static ZT_ALWAYS_INLINE __m128i _ghash_aesni(__m128i h,__m128i y,__m128i x) { return _mult_block_aesni(h,_mm_xor_si128(y,x)); }
-	static ZT_ALWAYS_INLINE __m128i _increment_be_aesni(__m128i x)
-	{
-		x = _swap128_aesni(x);
-		x = _mm_add_epi64(x,_mm_set_epi32(0,0,0,1));
-		x = _swap128_aesni(x);
-		return x;
-	}
-	static ZT_ALWAYS_INLINE void _htoun64_aesni(void *network,const uint64_t host) { *((uint64_t *)network) = Utils::hton(host); }
-	static ZT_ALWAYS_INLINE __m128i _create_j_aesni(const uint8_t *iv)
-	{
-		uint8_t j[16];
-		*((uint64_t *)j) = *((const uint64_t *)iv);
-		*((uint32_t *)(j+8)) = *((const uint32_t *)(iv+8));
-		j[12] = 0;
-		j[13] = 0;
-		j[14] = 0;
-		j[15] = 1;
-		return _mm_loadu_si128((__m128i *)j);
-	}
-	ZT_ALWAYS_INLINE __m128i _icv_header_aesni(const void *assoc,unsigned int alen) const
+
+	ZT_ALWAYS_INLINE void _gmac_aesni(const uint8_t iv[12],const uint8_t *in,const unsigned int len,uint8_t out[16]) const
 	{
-		unsigned int blocks,pblocks,rem,i;
-		__m128i h1,h2,h3,h4,d1,d2,d3,d4;
-		__m128i y,last;
-		const __m128i *ab;
-		h1 = _k.ni.hhhh;
-		h2 = _k.ni.hhh;
-		h3 = _k.ni.hh;
-		h4 = _k.ni.h;
-		y = _mm_setzero_si128();
-		ab = (const __m128i *)assoc;
-		blocks = alen / 16;
-		pblocks = blocks - (blocks % 4);
-		rem = alen % 16;
-		for (i=0;i<pblocks;i+=4) {
-			d1 = _mm_loadu_si128(ab + i + 0);
-			d2 = _mm_loadu_si128(ab + i + 1);
-			d3 = _mm_loadu_si128(ab + i + 2);
-			d4 = _mm_loadu_si128(ab + i + 3);
-			y = _mm_xor_si128(y, d1);
+		__m128i h1 = _k.ni.hhhh;
+		__m128i h2 = _k.ni.hhh;
+		__m128i h3 = _k.ni.hh;
+		__m128i h4 = _k.ni.h;
+		__m128i y = _mm_setzero_si128();
+		const __m128i *ab = (const __m128i *)in;
+		unsigned int blocks = len / 16;
+		unsigned int pblocks = blocks - (blocks % 4);
+		unsigned int rem = len % 16;
+		for (unsigned int i=0;i<pblocks;i+=4) {
+			__m128i d1 = _mm_loadu_si128(ab + i + 0);
+			__m128i d2 = _mm_loadu_si128(ab + i + 1);
+			__m128i d3 = _mm_loadu_si128(ab + i + 2);
+			__m128i d4 = _mm_loadu_si128(ab + i + 3);
+			y = _mm_xor_si128(y,d1);
 			y = _mult4xor_aesni(h1,h2,h3,h4,y,d2,d3,d4);
 		}
-		for (i = pblocks; i < blocks; i++)
+		for (unsigned int i=pblocks;i<blocks;++i)
 			y = _ghash_aesni(_k.ni.h,y,_mm_loadu_si128(ab + i));
 		if (rem) {
-			last = _mm_setzero_si128();
+			__m128i last = _mm_setzero_si128();
 			memcpy(&last,ab + blocks,rem);
 			y = _ghash_aesni(_k.ni.h,y,last);
 		}
-		return y;
-	}
-	ZT_ALWAYS_INLINE __m128i _icv_tailer_aesni(__m128i y,size_t alen,size_t dlen) const
-	{
-		__m128i b;
-		_htoun64_aesni(&b, alen * 8);
-		_htoun64_aesni((uint8_t *)&b + sizeof(uint64_t),dlen * 8);
-		return _ghash_aesni(_k.ni.h,y,b);
-	}
-	ZT_ALWAYS_INLINE void _icv_crypt_aesni(__m128i y,__m128i j,uint8_t *icv,unsigned int icvsize) const
-	{
-		__m128i t,b;
-		t = _mm_xor_si128(j,_k.ni.k[0]);
+
+		y = _ghash_aesni(_k.ni.h,y,_mm_set_epi64((__m64)0LL,(__m64)Utils::hton((uint64_t)len * (uint64_t)8)));
+
+		__m128i t = _mm_xor_si128(_mm_set_epi32(0x01000000,(int)*((const uint32_t *)(iv+8)),(int)*((const uint32_t *)(iv+4)),(int)*((const uint32_t *)(iv))),_k.ni.k[0]);
 		t = _mm_aesenc_si128(t,_k.ni.k[1]);
 		t = _mm_aesenc_si128(t,_k.ni.k[2]);
 		t = _mm_aesenc_si128(t,_k.ni.k[3]);
@@ -938,324 +600,7 @@ private:
 		t = _mm_aesenc_si128(t,_k.ni.k[13]);
 		t = _mm_aesenclast_si128(t,_k.ni.k[14]);
 		t = _mm_xor_si128(y,t);
-		_mm_storeu_si128(&b,t);
-		memcpy(icv,&b,icvsize);
-	}
-
-	ZT_ALWAYS_INLINE void _encrypt_gcm256_aesni(unsigned int len,const uint8_t *in,uint8_t *out,const uint8_t *iv,unsigned int alen,const uint8_t *assoc,uint8_t *icv,unsigned int icvsize) const
-	{
-		__m128i j = _create_j_aesni(iv);
-		__m128i cb = _increment_be_aesni(j);
-		__m128i y = _icv_header_aesni(assoc,alen);
-		unsigned int blocks = len / 16;
-		unsigned int pblocks = blocks - (blocks % 4);
-		unsigned int rem = len % 16;
-		__m128i *bi = (__m128i *)in;
-		__m128i *bo = (__m128i *)out;
-
-		const __m128i k0 = _k.ni.k[0];
-		const __m128i k1 = _k.ni.k[1];
-		const __m128i k2 = _k.ni.k[2];
-		const __m128i k3 = _k.ni.k[3];
-		const __m128i k4 = _k.ni.k[4];
-		const __m128i k5 = _k.ni.k[5];
-		const __m128i k6 = _k.ni.k[6];
-		const __m128i k7 = _k.ni.k[7];
-		const __m128i k8 = _k.ni.k[8];
-		const __m128i k9 = _k.ni.k[9];
-		const __m128i k10 = _k.ni.k[10];
-		const __m128i k11 = _k.ni.k[11];
-		const __m128i k12 = _k.ni.k[12];
-		const __m128i k13 = _k.ni.k[13];
-		const __m128i k14 = _k.ni.k[14];
-
-		unsigned int i;
-		for (i=0;i<pblocks;i+=4) {
-			__m128i t1 = _mm_xor_si128(cb,k0);
-			cb = _increment_be_aesni(cb);
-			__m128i t2 = _mm_xor_si128(cb,k0);
-			cb = _increment_be_aesni(cb);
-			__m128i t3 = _mm_xor_si128(cb,k0);
-			cb = _increment_be_aesni(cb);
-			__m128i t4 = _mm_xor_si128(cb,k0);
-			cb = _increment_be_aesni(cb);
-			t1 = _mm_aesenc_si128(t1,k1);
-			t2 = _mm_aesenc_si128(t2,k1);
-			t3 = _mm_aesenc_si128(t3,k1);
-			t4 = _mm_aesenc_si128(t4,k1);
-			t1 = _mm_aesenc_si128(t1,k2);
-			t2 = _mm_aesenc_si128(t2,k2);
-			t3 = _mm_aesenc_si128(t3,k2);
-			t4 = _mm_aesenc_si128(t4,k2);
-			t1 = _mm_aesenc_si128(t1,k3);
-			t2 = _mm_aesenc_si128(t2,k3);
-			t3 = _mm_aesenc_si128(t3,k3);
-			t4 = _mm_aesenc_si128(t4,k3);
-			t1 = _mm_aesenc_si128(t1,k4);
-			t2 = _mm_aesenc_si128(t2,k4);
-			t3 = _mm_aesenc_si128(t3,k4);
-			t4 = _mm_aesenc_si128(t4,k4);
-			t1 = _mm_aesenc_si128(t1,k5);
-			t2 = _mm_aesenc_si128(t2,k5);
-			t3 = _mm_aesenc_si128(t3,k5);
-			t4 = _mm_aesenc_si128(t4,k5);
-			t1 = _mm_aesenc_si128(t1,k6);
-			t2 = _mm_aesenc_si128(t2,k6);
-			t3 = _mm_aesenc_si128(t3,k6);
-			t4 = _mm_aesenc_si128(t4,k6);
-			t1 = _mm_aesenc_si128(t1,k7);
-			t2 = _mm_aesenc_si128(t2,k7);
-			t3 = _mm_aesenc_si128(t3,k7);
-			t4 = _mm_aesenc_si128(t4,k7);
-			t1 = _mm_aesenc_si128(t1,k8);
-			t2 = _mm_aesenc_si128(t2,k8);
-			t3 = _mm_aesenc_si128(t3,k8);
-			t4 = _mm_aesenc_si128(t4,k8);
-			t1 = _mm_aesenc_si128(t1,k9);
-			t2 = _mm_aesenc_si128(t2,k9);
-			t3 = _mm_aesenc_si128(t3,k9);
-			t4 = _mm_aesenc_si128(t4,k9);
-			t1 = _mm_aesenc_si128(t1,k10);
-			t2 = _mm_aesenc_si128(t2,k10);
-			t3 = _mm_aesenc_si128(t3,k10);
-			t4 = _mm_aesenc_si128(t4,k10);
-			t1 = _mm_aesenc_si128(t1,k11);
-			t2 = _mm_aesenc_si128(t2,k11);
-			t3 = _mm_aesenc_si128(t3,k11);
-			t4 = _mm_aesenc_si128(t4,k11);
-			t1 = _mm_aesenc_si128(t1,k12);
-			t2 = _mm_aesenc_si128(t2,k12);
-			t3 = _mm_aesenc_si128(t3,k12);
-			t4 = _mm_aesenc_si128(t4,k12);
-			t1 = _mm_aesenc_si128(t1,k13);
-			t2 = _mm_aesenc_si128(t2,k13);
-			t3 = _mm_aesenc_si128(t3,k13);
-			t4 = _mm_aesenc_si128(t4,k13);
-			t1 = _mm_aesenclast_si128(t1,k14);
-			t2 = _mm_aesenclast_si128(t2,k14);
-			t3 = _mm_aesenclast_si128(t3,k14);
-			t4 = _mm_aesenclast_si128(t4,k14);
-			t1 = _mm_xor_si128(t1,_mm_loadu_si128(bi + i + 0));
-			t2 = _mm_xor_si128(t2,_mm_loadu_si128(bi + i + 1));
-			t3 = _mm_xor_si128(t3,_mm_loadu_si128(bi + i + 2));
-			t4 = _mm_xor_si128(t4,_mm_loadu_si128(bi + i + 3));
-			y = _mm_xor_si128(y,t1);
-			y = _mult4xor_aesni(_k.ni.hhhh,_k.ni.hhh,_k.ni.hh,_k.ni.h,y,t2,t3,t4);
-			_mm_storeu_si128(bo + i + 0,t1);
-			_mm_storeu_si128(bo + i + 1,t2);
-			_mm_storeu_si128(bo + i + 2,t3);
-			_mm_storeu_si128(bo + i + 3,t4);
-		}
-
-		for (i=pblocks;i<blocks;++i) {
-			__m128i t1 = _mm_xor_si128(cb,k0);
-			t1 = _mm_aesenc_si128(t1,k1);
-			t1 = _mm_aesenc_si128(t1,k2);
-			t1 = _mm_aesenc_si128(t1,k3);
-			t1 = _mm_aesenc_si128(t1,k4);
-			t1 = _mm_aesenc_si128(t1,k5);
-			t1 = _mm_aesenc_si128(t1,k6);
-			t1 = _mm_aesenc_si128(t1,k7);
-			t1 = _mm_aesenc_si128(t1,k8);
-			t1 = _mm_aesenc_si128(t1,k9);
-			t1 = _mm_aesenc_si128(t1,k10);
-			t1 = _mm_aesenc_si128(t1,k11);
-			t1 = _mm_aesenc_si128(t1,k12);
-			t1 = _mm_aesenc_si128(t1,k13);
-			t1 = _mm_aesenclast_si128(t1,k14);
-			t1 = _mm_xor_si128(t1,_mm_loadu_si128(bi + i));
-			_mm_storeu_si128(bo + i,t1);
-			y = _ghash_aesni(_k.ni.h,y,t1);
-			cb = _increment_be_aesni(cb);
-		}
-
-		if (rem) {
-			__m128i t,b;
-			memset(&b,0,sizeof(b));
-			memcpy(&b,bi + blocks,rem);
-			t = _mm_xor_si128(cb,k0);
-			t = _mm_aesenc_si128(t,k1);
-			t = _mm_aesenc_si128(t,k2);
-			t = _mm_aesenc_si128(t,k3);
-			t = _mm_aesenc_si128(t,k4);
-			t = _mm_aesenc_si128(t,k5);
-			t = _mm_aesenc_si128(t,k6);
-			t = _mm_aesenc_si128(t,k7);
-			t = _mm_aesenc_si128(t,k8);
-			t = _mm_aesenc_si128(t,k9);
-			t = _mm_aesenc_si128(t,k10);
-			t = _mm_aesenc_si128(t,k11);
-			t = _mm_aesenc_si128(t,k12);
-			t = _mm_aesenc_si128(t,k13);
-			t = _mm_aesenclast_si128(t,k14);
-			b = _mm_xor_si128(t,b);
-			memcpy(bo + blocks,&b,rem);
-			memset((u_char*)&b + rem,0,16 - rem);
-			y = _ghash_aesni(_k.ni.h,y,b);
-		}
-
-		y = _icv_tailer_aesni(y,alen,len);
-		_icv_crypt_aesni(y,j,icv,icvsize);
-	}
-
-	ZT_ALWAYS_INLINE void _decrypt_gcm256_aesni(unsigned int len,const uint8_t *in,uint8_t *out,const uint8_t *iv,unsigned int alen,const uint8_t *assoc,uint8_t *icv,unsigned int icvsize)
-	{
-		__m128i j = _create_j_aesni(iv);
-		__m128i cb = _increment_be_aesni(j);
-		__m128i y = _icv_header_aesni(assoc,alen);
-		unsigned int blocks = len / 16;
-		unsigned int pblocks = blocks - (blocks % 4);
-		unsigned int rem = len % 16;
-		__m128i *bi = (__m128i *)in;
-		__m128i *bo = (__m128i *)out;
-
-		const __m128i k0 = _k.ni.k[0];
-		const __m128i k1 = _k.ni.k[1];
-		const __m128i k2 = _k.ni.k[2];
-		const __m128i k3 = _k.ni.k[3];
-		const __m128i k4 = _k.ni.k[4];
-		const __m128i k5 = _k.ni.k[5];
-		const __m128i k6 = _k.ni.k[6];
-		const __m128i k7 = _k.ni.k[7];
-		const __m128i k8 = _k.ni.k[8];
-		const __m128i k9 = _k.ni.k[9];
-		const __m128i k10 = _k.ni.k[10];
-		const __m128i k11 = _k.ni.k[11];
-		const __m128i k12 = _k.ni.k[12];
-		const __m128i k13 = _k.ni.k[13];
-		const __m128i k14 = _k.ni.k[14];
-
-		unsigned int i;
-		for (i=0;i<pblocks;i+=4) {
-			__m128i t1 = _mm_xor_si128(cb,k0);
-			cb = _increment_be_aesni(cb);
-			__m128i t2 = _mm_xor_si128(cb,k0);
-			cb = _increment_be_aesni(cb);
-			__m128i t3 = _mm_xor_si128(cb,k0);
-			cb = _increment_be_aesni(cb);
-			__m128i t4 = _mm_xor_si128(cb,k0);
-			cb = _increment_be_aesni(cb);
-			t1 = _mm_aesenc_si128(t1,k1);
-			t2 = _mm_aesenc_si128(t2,k1);
-			t3 = _mm_aesenc_si128(t3,k1);
-			t4 = _mm_aesenc_si128(t4,k1);
-			t1 = _mm_aesenc_si128(t1,k2);
-			t2 = _mm_aesenc_si128(t2,k2);
-			t3 = _mm_aesenc_si128(t3,k2);
-			t4 = _mm_aesenc_si128(t4,k2);
-			t1 = _mm_aesenc_si128(t1,k3);
-			t2 = _mm_aesenc_si128(t2,k3);
-			t3 = _mm_aesenc_si128(t3,k3);
-			t4 = _mm_aesenc_si128(t4,k3);
-			t1 = _mm_aesenc_si128(t1,k4);
-			t2 = _mm_aesenc_si128(t2,k4);
-			t3 = _mm_aesenc_si128(t3,k4);
-			t4 = _mm_aesenc_si128(t4,k4);
-			t1 = _mm_aesenc_si128(t1,k5);
-			t2 = _mm_aesenc_si128(t2,k5);
-			t3 = _mm_aesenc_si128(t3,k5);
-			t4 = _mm_aesenc_si128(t4,k5);
-			t1 = _mm_aesenc_si128(t1,k6);
-			t2 = _mm_aesenc_si128(t2,k6);
-			t3 = _mm_aesenc_si128(t3,k6);
-			t4 = _mm_aesenc_si128(t4,k6);
-			t1 = _mm_aesenc_si128(t1,k7);
-			t2 = _mm_aesenc_si128(t2,k7);
-			t3 = _mm_aesenc_si128(t3,k7);
-			t4 = _mm_aesenc_si128(t4,k7);
-			t1 = _mm_aesenc_si128(t1,k8);
-			t2 = _mm_aesenc_si128(t2,k8);
-			t3 = _mm_aesenc_si128(t3,k8);
-			t4 = _mm_aesenc_si128(t4,k8);
-			t1 = _mm_aesenc_si128(t1,k9);
-			t2 = _mm_aesenc_si128(t2,k9);
-			t3 = _mm_aesenc_si128(t3,k9);
-			t4 = _mm_aesenc_si128(t4,k9);
-			t1 = _mm_aesenc_si128(t1,k10);
-			t2 = _mm_aesenc_si128(t2,k10);
-			t3 = _mm_aesenc_si128(t3,k10);
-			t4 = _mm_aesenc_si128(t4,k10);
-			t1 = _mm_aesenc_si128(t1,k11);
-			t2 = _mm_aesenc_si128(t2,k11);
-			t3 = _mm_aesenc_si128(t3,k11);
-			t4 = _mm_aesenc_si128(t4,k11);
-			t1 = _mm_aesenc_si128(t1,k12);
-			t2 = _mm_aesenc_si128(t2,k12);
-			t3 = _mm_aesenc_si128(t3,k12);
-			t4 = _mm_aesenc_si128(t4,k12);
-			t1 = _mm_aesenc_si128(t1,k13);
-			t2 = _mm_aesenc_si128(t2,k13);
-			t3 = _mm_aesenc_si128(t3,k13);
-			t4 = _mm_aesenc_si128(t4,k13);
-			t1 = _mm_aesenclast_si128(t1,k14);
-			t2 = _mm_aesenclast_si128(t2,k14);
-			t3 = _mm_aesenclast_si128(t3,k14);
-			t4 = _mm_aesenclast_si128(t4,k14);
-			__m128i d1 = _mm_loadu_si128(bi + i + 0);
-			__m128i d2 = _mm_loadu_si128(bi + i + 1);
-			__m128i d3 = _mm_loadu_si128(bi + i + 2);
-			__m128i d4 = _mm_loadu_si128(bi + i + 3);
-			y = _mm_xor_si128(y,d1);
-			y = _mult4xor_aesni(_k.ni.hhhh,_k.ni.hhh,_k.ni.hh,_k.ni.h,y,d2,d3,d4);
-			t1 = _mm_xor_si128(t1,d1);
-			t2 = _mm_xor_si128(t2,d2);
-			t3 = _mm_xor_si128(t3,d3);
-			t4 = _mm_xor_si128(t4,d4);
-			_mm_storeu_si128(bo + i + 0,t1);
-			_mm_storeu_si128(bo + i + 1,t2);
-			_mm_storeu_si128(bo + i + 2,t3);
-			_mm_storeu_si128(bo + i + 3,t4);
-		}
-
-		for (i=pblocks;i<blocks;i++) {
-			__m128i t1 = _mm_xor_si128(cb,k0);
-			t1 = _mm_aesenc_si128(t1,k1);
-			t1 = _mm_aesenc_si128(t1,k2);
-			t1 = _mm_aesenc_si128(t1,k3);
-			t1 = _mm_aesenc_si128(t1,k4);
-			t1 = _mm_aesenc_si128(t1,k5);
-			t1 = _mm_aesenc_si128(t1,k6);
-			t1 = _mm_aesenc_si128(t1,k7);
-			t1 = _mm_aesenc_si128(t1,k8);
-			t1 = _mm_aesenc_si128(t1,k9);
-			t1 = _mm_aesenc_si128(t1,k10);
-			t1 = _mm_aesenc_si128(t1,k11);
-			t1 = _mm_aesenc_si128(t1,k12);
-			t1 = _mm_aesenc_si128(t1,k13);
-			t1 = _mm_aesenclast_si128(t1,k14);
-			__m128i d1 = _mm_loadu_si128(bi + i);
-			y = _ghash_aesni(_k.ni.h,y,d1);
-			t1 = _mm_xor_si128(t1,d1);
-			_mm_storeu_si128(bo + i,t1);
-			cb = _increment_be_aesni(cb);
-		}
-
-		if (rem) {
-			__m128i t,b;
-			memset(&b,0,sizeof(b));
-			memcpy(&b,bi + blocks,rem);
-			y = _ghash_aesni(_k.ni.h,y,b);
-			t = _mm_xor_si128(cb,k0);
-			t = _mm_aesenc_si128(t,k1);
-			t = _mm_aesenc_si128(t,k2);
-			t = _mm_aesenc_si128(t,k3);
-			t = _mm_aesenc_si128(t,k4);
-			t = _mm_aesenc_si128(t,k5);
-			t = _mm_aesenc_si128(t,k6);
-			t = _mm_aesenc_si128(t,k7);
-			t = _mm_aesenc_si128(t,k8);
-			t = _mm_aesenc_si128(t,k9);
-			t = _mm_aesenc_si128(t,k10);
-			t = _mm_aesenc_si128(t,k11);
-			t = _mm_aesenc_si128(t,k12);
-			t = _mm_aesenc_si128(t,k13);
-			t = _mm_aesenclast_si128(t,k14);
-			b = _mm_xor_si128(t,b);
-			memcpy(bo + blocks,&b,rem);
-		}
-
-		y = _icv_tailer_aesni(y,alen,len);
-		_icv_crypt_aesni(y,j,icv,icvsize);
+		_mm_storeu_si128((__m128i *)out,t);
 	}
 #endif /* ZT_AES_AESNI ******************************************************/
 };
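A note on the ctr() IV convention visible in _crypt_ctr_aesni above: bytes 0-7 of the 16-byte IV pass through verbatim into every counter block, while bytes 8-15 are read as a big-endian 64-bit counter (via Utils::ntoh) that increments once per block. A portable software fallback honoring the same layout, which this commit does not yet include, might look like the following sketch; the function name is hypothetical and it leans on the class's single-block encrypt() for the keystream.

// Hypothetical portable CTR fallback matching _crypt_ctr_aesni's layout:
// IV bytes 0-7 are fixed, bytes 8-15 are a big-endian counter that
// increments once per 16-byte block; the final partial block XORs only
// the bytes it needs. Not part of this commit.
#include <cstdint>
#include <cstring>
#include "node/AES.hpp" // assumed include path

static void ctrSoftSketch(const ZeroTier::AES &k,const uint8_t iv[16],
	const uint8_t *in,unsigned int len,uint8_t *out)
{
	uint8_t blk[16],ks[16];
	std::memcpy(blk,iv,8);                    // fixed 8-byte IV prefix
	uint64_t ctr = 0;
	for(int i=8;i<16;++i)
		ctr = (ctr << 8) | (uint64_t)iv[i];   // big-endian counter load
	while (len) {
		for(int i=0;i<8;++i)
			blk[8 + i] = (uint8_t)(ctr >> (56 - 8 * i)); // big-endian store
		k.encrypt(blk,ks);                    // one block of keystream
		const unsigned int n = (len < 16) ? len : 16;
		for(unsigned int i=0;i<n;++i)
			out[i] = in[i] ^ ks[i];
		in += n; out += n; len -= n; ++ctr;
	}
}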

+ 117 - 142
selftest.cpp

@@ -148,11 +148,10 @@ static const uint8_t AES_TEST_VECTOR_0_KEY[32] = { 0x60,0x3d,0xeb,0x10,0x15,0xca
 static const uint8_t AES_TEST_VECTOR_0_IN[16] = { 0x6b,0xc1,0xbe,0xe2,0x2e,0x40,0x9f,0x96,0xe9,0x3d,0x7e,0x11,0x73,0x93,0x17,0x2a };
 static const uint8_t AES_TEST_VECTOR_0_OUT[16] = { 0xf3,0xee,0xd1,0xbd,0xb5,0xd2,0xa0,0x3c,0x06,0x4b,0x5a,0x7e,0x3d,0xb1,0x81,0xf8 };
 
-static const uint8_t AES_GCM_TEST_VECTOR_0_KEY[32] = { 0xfe,0xff,0xe9,0x92,0x86,0x65,0x73,0x1c,0x6d,0x6a,0x8f,0x94,0x67,0x30,0x83,0x08,0xfe,0xff,0xe9,0x92,0x86,0x65,0x73,0x1c,0x6d,0x6a,0x8f,0x94,0x67,0x30,0x83,0x08 };
-static const uint8_t AES_GCM_TEST_VECTOR_0_IV[12] = { 0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad,0xde,0xca,0xf8,0x88 };
-static const uint8_t AES_GCM_TEST_VECTOR_0_IN[64] = { 0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39,0x1a,0xaf,0xd2,0x55 };
-static const uint8_t AES_GCM_TEST_VECTOR_0_OUT[64] = { 0x52,0x2d,0xc1,0xf0,0x99,0x56,0x7d,0x07,0xf4,0x7f,0x37,0xa3,0x2a,0x84,0x42,0x7d,0x64,0x3a,0x8c,0xdc,0xbf,0xe5,0xc0,0xc9,0x75,0x98,0xa2,0xbd,0x25,0x55,0xd1,0xaa,0x8c,0xb0,0x8e,0x48,0x59,0x0d,0xbb,0x3d,0xa7,0xb0,0x8b,0x10,0x56,0x82,0x88,0x38,0xc5,0xf6,0x1e,0x63,0x93,0xba,0x7a,0x0a,0xbc,0xc9,0xf6,0x62,0x89,0x80,0x15,0xad };
-static const uint8_t AES_GCM_TEST_VECTOR_0_TAG[16] = { 0xb0,0x94,0xda,0xc5,0xd9,0x34,0x71,0xbd,0xec,0x1a,0x50,0x22,0x70,0xe3,0xcc,0x6c };
+static const uint8_t AES_GMAC_VECTOR_0_KEY[32] = { 0xbb, 0x10, 0x10, 0x06, 0x4f, 0xb8, 0x35, 0x23, 0xea, 0x9d, 0xf3, 0x2b, 0xad, 0x9f, 0x1f, 0x2a, 0x4f, 0xce, 0xfc, 0x0f, 0x21, 0x07, 0xc0, 0xaa, 0xba, 0xd9, 0xb7, 0x56, 0xd8, 0x09, 0x21, 0x9d };
+static const uint8_t AES_GMAC_VECTOR_0_IV[12] = { 0x2f, 0x9a, 0xd0, 0x12, 0xad, 0xfc, 0x12, 0x73, 0x43, 0xfb, 0xe0, 0x56 };
+static const uint8_t AES_GMAC_VECTOR_0_IN[16] = { 0xdb, 0x98, 0xd9, 0x0d, 0x1b, 0x69, 0x5c, 0xdb, 0x74, 0x7a, 0x34, 0x3f, 0xbb, 0xc9, 0xf1, 0x41 };
+static const uint8_t AES_GMAC_VECTOR_0_OUT[16] = { 0xef, 0x06, 0xd5, 0x4d, 0xfd, 0x00, 0x02, 0x1d, 0x75, 0x27, 0xdf, 0xf2, 0x6f, 0xc9, 0xd4, 0x84 };
 
 //////////////////////////////////////////////////////////////////////////////
 
@@ -168,151 +167,127 @@ static int testCrypto()
 		std::cout << "[crypto] getSecureRandom: " << Utils::hex(buf1,64,hexbuf) << ZT_EOL_S;
 	}
 
-	std::cout << "[crypto] Testing and benchmarking AES-256 and GCM..." ZT_EOL_S << "  AES-256 (test vectors): "; std::cout.flush();
-	AES tv(AES_TEST_VECTOR_0_KEY);
-	tv.encrypt(AES_TEST_VECTOR_0_IN,(uint8_t *)buf1);
-	if (memcmp(buf1,AES_TEST_VECTOR_0_OUT,16) != 0) {
-		std::cout << "FAILED (test vector 0 encrypt)" ZT_EOL_S;
-		return -1;
-	}
-	tv.decrypt((const uint8_t *)buf1,(uint8_t *)buf2);
-	if (memcmp(AES_TEST_VECTOR_0_IN,buf2,16) != 0) {
-		std::cout << "FAILED (test vector 0 decrypt)" ZT_EOL_S;
-		return -1;
-	}
-	std::cout << "PASS" ZT_EOL_S << "  AES-256 GCM (test vectors, benchmark): "; std::cout.flush();
-	tv.gcmEncrypt((const uint8_t *)hexbuf,buf1,sizeof(buf1),nullptr,0,buf2,(uint8_t *)(hexbuf + 32),16);
-	if (!tv.gcmDecrypt((const uint8_t *)hexbuf,buf2,sizeof(buf2),nullptr,0,buf3,(const uint8_t *)(hexbuf + 32),16)) {
-		std::cout << "FAILED (encrypt/decrypt, auth tag mismatch)" ZT_EOL_S;
-		return -1;
-	}
-	if (memcmp(buf1,buf3,sizeof(buf1)) != 0) {
-		std::cout << "FAILED (encrypt/decrypt, data mismatch)" ZT_EOL_S;
-		return -1;
-	}
-	tv.init(AES_GCM_TEST_VECTOR_0_KEY);
-	tv.gcmEncrypt(AES_GCM_TEST_VECTOR_0_IV,AES_GCM_TEST_VECTOR_0_IN,sizeof(AES_GCM_TEST_VECTOR_0_IN),nullptr,0,(uint8_t *)buf1,(uint8_t *)buf2,16);
-	if (memcmp(buf2,AES_GCM_TEST_VECTOR_0_TAG,16) != 0) {
-		std::cout << "FAILED (test vector, tag mismatch) " << Utils::hex(buf2,16,hexbuf) << ZT_EOL_S;
-		return -1;
-	}
-	if (memcmp(buf1,AES_GCM_TEST_VECTOR_0_OUT,sizeof(AES_GCM_TEST_VECTOR_0_OUT)) != 0) {
-		std::cout << "FAILED (test vector, ciphertext mismatch) " << Utils::hex(buf2,16,hexbuf) << ZT_EOL_S;
-		return -1;
-	}
-	double gcmBytes = 0.0;
-	int64_t start = OSUtils::now();
-	for(unsigned long i=0;i<50000;++i) {
-		tv.gcmEncrypt((const uint8_t *)hexbuf,buf1,sizeof(buf1),nullptr,0,buf2,(uint8_t *)(hexbuf + 32),16);
-		tv.gcmEncrypt((const uint8_t *)hexbuf,buf2,sizeof(buf2),nullptr,0,buf1,(uint8_t *)(hexbuf + 32),16);
-		gcmBytes += (double)(sizeof(buf1) * 2);
-	}
-	int64_t end = OSUtils::now();
-	*dummy = buf1[0];
-	std::cout << ((gcmBytes / 1048576.0) / ((double)(end - start) / 1000.0)) << " MiB/second" ZT_EOL_S << "  AES scramble (benchmark): "; std::cout.flush();
-	double ecbBytes = 0.0;
-	AES::scramble((const uint8_t *)hexbuf,buf1,sizeof(buf1),buf2);
-	AES::unscramble((const uint8_t *)hexbuf,buf2,sizeof(buf2),buf3);
-	if (memcmp(buf1,buf3,sizeof(buf1)) != 0) {
-		std::cout << "FAILED (scramble/unscramble did not generate identical data)" ZT_EOL_S;
-		return -1;
-	}
-	start = OSUtils::now();
-	for(unsigned long i=0;i<200000;++i) {
-		AES::scramble((const uint8_t *)hexbuf,buf1,sizeof(buf1),buf2);
-		AES::scramble((const uint8_t *)hexbuf,buf2,sizeof(buf1),buf1);
-		ecbBytes += (double)(sizeof(buf1) * 2);
-	}
-	end = OSUtils::now();
-	*dummy = buf1[0];
-	std::cout << ((ecbBytes / 1048576.0) / ((double)(end - start) / 1000.0)) << " MiB/second" ZT_EOL_S << "  AES-256 GCM + scramble (benchmark): "; std::cout.flush();
-	ecbBytes = 0.0;
-	start = OSUtils::now();
-	for(unsigned long i=0;i<50000;++i) {
-		tv.gcmEncrypt((const uint8_t *)hexbuf,buf1,sizeof(buf1),nullptr,0,buf2,(uint8_t *)(hexbuf + 32),16);
-		AES::scramble((const uint8_t *)hexbuf,buf1,sizeof(buf1),buf2);
-		tv.gcmEncrypt((const uint8_t *)hexbuf,buf2,sizeof(buf2),nullptr,0,buf1,(uint8_t *)(hexbuf + 32),16);
-		AES::scramble((const uint8_t *)hexbuf,buf2,sizeof(buf1),buf1);
-		ecbBytes += (double)(sizeof(buf1) * 2);
-	}
-	end = OSUtils::now();
-	*dummy = buf1[0];
-	std::cout << ((ecbBytes / 1048576.0) / ((double)(end - start) / 1000.0)) << " MiB/second" ZT_EOL_S;
-
-	std::cout << "[crypto] Testing Salsa20... "; std::cout.flush();
-	for(unsigned int i=0;i<4;++i) {
-		for(unsigned int k=0;k<sizeof(buf1);++k)
-			buf1[k] = (unsigned char)rand();
-		memset(buf2,0,sizeof(buf2));
-		memset(buf3,0,sizeof(buf3));
-		Salsa20 s20;
-		s20.init("12345678123456781234567812345678","12345678");
-		s20.crypt20(buf1,buf2,sizeof(buf1));
-		s20.init("12345678123456781234567812345678","12345678");
-		s20.crypt20(buf2,buf3,sizeof(buf2));
-		if (memcmp(buf1,buf3,sizeof(buf1))) {
-			std::cout << "FAIL (encrypt/decrypt test)" ZT_EOL_S;
+	{
+		std::cout << "[crypto] Testing and benchmarking AES-256 and GCM..." ZT_EOL_S << "  AES-256 (test vectors): "; std::cout.flush();
+		AES tv(AES_TEST_VECTOR_0_KEY);
+		tv.encrypt(AES_TEST_VECTOR_0_IN,(uint8_t *)buf1);
+		if (memcmp(buf1,AES_TEST_VECTOR_0_OUT,16) != 0) {
+			std::cout << "FAILED (test vector 0 encrypt)" ZT_EOL_S;
 			return -1;
 		}
-	}
-	Salsa20 s20(s20TV0Key,s20TV0Iv);
-	memset(buf1,0,sizeof(buf1));
-	memset(buf2,0,sizeof(buf2));
-	s20.crypt20(buf1,buf2,64);
-	if (memcmp(buf2,s20TV0Ks,64)) {
-		std::cout << "FAIL (test vector 0)" ZT_EOL_S;
-		return -1;
-	}
-	s20.init(s2012TV0Key,s2012TV0Iv);
-	memset(buf1,0,sizeof(buf1));
-	memset(buf2,0,sizeof(buf2));
-	s20.crypt12(buf1,buf2,64);
-	if (memcmp(buf2,s2012TV0Ks,64)) {
-		std::cout << "FAIL (test vector 1)" ZT_EOL_S;
-		return -1;
-	}
-	std::cout << "PASS" ZT_EOL_S;
-
-#ifdef ZT_SALSA20_SSE
-	std::cout << "[crypto] Salsa20 SSE: ENABLED" ZT_EOL_S;
-#else
-	std::cout << "[crypto] Salsa20 SSE: DISABLED" ZT_EOL_S;
-#endif
-
-	std::cout << "[crypto] Benchmarking Salsa20/12... "; std::cout.flush();
-	{
-		unsigned char *bb = (unsigned char *)::malloc(1234567);
-		for(unsigned int i=0;i<1234567;++i)
-			bb[i] = (unsigned char)i;
-		Salsa20 s20(s20TV0Key,s20TV0Iv);
-		long double bytes = 0.0;
+		std::cout << "OK" ZT_EOL_S << "  GMAC-AES-256 (test vectors): "; std::cout.flush();
+		tv.init(AES_GMAC_VECTOR_0_KEY);
+		tv.gmac(AES_GMAC_VECTOR_0_IV,AES_GMAC_VECTOR_0_IN,sizeof(AES_GMAC_VECTOR_0_IN),(uint8_t *)hexbuf);
+		if (memcmp(hexbuf,AES_GMAC_VECTOR_0_OUT,16) != 0) {
+			std::cout << "FAILED (test vector 0)" ZT_EOL_S;
+			return -1;
+		}
+		std::cout << "OK" ZT_EOL_S << "  GMAC-AES-256 (benchmark): "; std::cout.flush();
+		int64_t start = OSUtils::now();
+		for(unsigned long i=0;i<200000;++i) {
+			tv.gmac(AES_GMAC_VECTOR_0_IV,buf1,sizeof(buf1),(uint8_t *)hexbuf);
+			buf1[0] = hexbuf[0];
+		}
+		int64_t end = OSUtils::now();
+		*dummy = hexbuf[0];
+		std::cout << (((double)(200000 * sizeof(buf1)) / 1048576.0) / ((double)(end - start) / 1000.0)) << " MiB/second" ZT_EOL_S;
+		std::cout << "  AES-256-CTR (benchmark): "; std::cout.flush();
 		start = OSUtils::now();
-		for(unsigned int i=0;i<200;++i) {
-			s20.crypt12(bb,bb,1234567);
-			bytes += 1234567.0;
+		for(unsigned long i=0;i<200000;++i) {
+			tv.ctr((const uint8_t *)hexbuf,buf1,sizeof(buf1),buf2);
+			hexbuf[0] = buf2[0];
 		}
 		end = OSUtils::now();
-		SHA512(buf1,bb,1234567);
-		std::cout << ((bytes / 1048576.0) / ((long double)(end - start) / 1000.0)) << " MiB/second (" << Utils::hex(buf1,16,hexbuf) << ')' << ZT_EOL_S;
-		::free((void *)bb);
+		*dummy = buf2[0];
+		std::cout << (((double)(200000 * sizeof(buf1)) / 1048576.0) / ((double)(end - start) / 1000.0)) << " MiB/second" ZT_EOL_S;
+		std::cout << "  GMAC-AES-256 and AES-256-CTR (benchmark): "; std::cout.flush();
+		start = OSUtils::now();
+		for(unsigned long i=0;i<200000;++i) {
+			tv.gmac(AES_GMAC_VECTOR_0_IV,buf1,sizeof(buf1),(uint8_t *)hexbuf);
+			tv.ctr((const uint8_t *)hexbuf,buf1,sizeof(buf1),buf2);
+			hexbuf[0] = buf2[0];
+		}
+		end = OSUtils::now();
+		*dummy = buf2[0];
+		std::cout << (((double)(200000 * sizeof(buf1)) / 1048576.0) / ((double)(end - start) / 1000.0)) << " MiB/second" ZT_EOL_S;
 	}
 
-	std::cout << "[crypto] Benchmarking Salsa20/20... "; std::cout.flush();
 	{
-		unsigned char *bb = (unsigned char *)::malloc(1234567);
-		for(unsigned int i=0;i<1234567;++i)
-			bb[i] = (unsigned char)i;
+		std::cout << "[crypto] Testing Salsa20... "; std::cout.flush();
+		for(unsigned int i=0;i<4;++i) {
+			for(unsigned int k=0;k<sizeof(buf1);++k)
+				buf1[k] = (unsigned char)rand();
+			memset(buf2,0,sizeof(buf2));
+			memset(buf3,0,sizeof(buf3));
+			Salsa20 s20;
+			s20.init("12345678123456781234567812345678","12345678");
+			s20.crypt20(buf1,buf2,sizeof(buf1));
+			s20.init("12345678123456781234567812345678","12345678");
+			s20.crypt20(buf2,buf3,sizeof(buf2));
+			if (memcmp(buf1,buf3,sizeof(buf1))) {
+				std::cout << "FAIL (encrypt/decrypt test)" ZT_EOL_S;
+				return -1;
+			}
+		}
 		Salsa20 s20(s20TV0Key,s20TV0Iv);
-		long double bytes = 0.0;
-		uint64_t start = OSUtils::now();
-		for(unsigned int i=0;i<200;++i) {
-			s20.crypt20(bb,bb,1234567);
-			bytes += 1234567.0;
+		memset(buf1,0,sizeof(buf1));
+		memset(buf2,0,sizeof(buf2));
+		s20.crypt20(buf1,buf2,64);
+		if (memcmp(buf2,s20TV0Ks,64)) {
+			std::cout << "FAIL (test vector 0)" ZT_EOL_S;
+			return -1;
+		}
+		s20.init(s2012TV0Key,s2012TV0Iv);
+		memset(buf1,0,sizeof(buf1));
+		memset(buf2,0,sizeof(buf2));
+		s20.crypt12(buf1,buf2,64);
+		if (memcmp(buf2,s2012TV0Ks,64)) {
+			std::cout << "FAIL (test vector 1)" ZT_EOL_S;
+			return -1;
+		}
+		std::cout << "PASS" ZT_EOL_S;
+
+	#ifdef ZT_SALSA20_SSE
+		std::cout << "[crypto] Salsa20 SSE: ENABLED" ZT_EOL_S;
+	#else
+		std::cout << "[crypto] Salsa20 SSE: DISABLED" ZT_EOL_S;
+	#endif
+
+		std::cout << "[crypto] Benchmarking Salsa20/12... "; std::cout.flush();
+		{
+			unsigned char *bb = (unsigned char *)::malloc(1234567);
+			for(unsigned int i=0;i<1234567;++i)
+				bb[i] = (unsigned char)i;
+			Salsa20 s20(s20TV0Key,s20TV0Iv);
+			long double bytes = 0.0;
+			int64_t start = OSUtils::now();
+			for(unsigned int i=0;i<200;++i) {
+				s20.crypt12(bb,bb,1234567);
+				bytes += 1234567.0;
+			}
+			int64_t end = OSUtils::now();
+			SHA512(buf1,bb,1234567);
+			std::cout << ((bytes / 1048576.0) / ((long double)(end - start) / 1000.0)) << " MiB/second (" << Utils::hex(buf1,16,hexbuf) << ')' << ZT_EOL_S;
+			::free((void *)bb);
+		}
+
+		std::cout << "[crypto] Benchmarking Salsa20/20... "; std::cout.flush();
+		{
+			unsigned char *bb = (unsigned char *)::malloc(1234567);
+			for(unsigned int i=0;i<1234567;++i)
+				bb[i] = (unsigned char)i;
+			Salsa20 s20(s20TV0Key,s20TV0Iv);
+			long double bytes = 0.0;
+			int64_t start = OSUtils::now();
+			for(unsigned int i=0;i<200;++i) {
+				s20.crypt20(bb,bb,1234567);
+				bytes += 1234567.0;
+			}
+			int64_t end = OSUtils::now();
+			SHA512(buf1,bb,1234567);
+			std::cout << ((bytes / 1048576.0) / ((long double)(end - start) / 1000.0)) << " MiB/second (" << Utils::hex(buf1,16,hexbuf) << ')' << ZT_EOL_S;
+			::free((void *)bb);
 		}
-		uint64_t end = OSUtils::now();
-		SHA512(buf1,bb,1234567);
-		std::cout << ((bytes / 1048576.0) / ((long double)(end - start) / 1000.0)) << " MiB/second (" << Utils::hex(buf1,16,hexbuf) << ')' << ZT_EOL_S;
-		::free((void *)bb);
 	}
 
 	std::cout << "[crypto] Testing SHA-512... "; std::cout.flush();
@@ -323,11 +298,11 @@ static int testCrypto()
 	}
 	std::cout << "PASS" ZT_EOL_S;
 	std::cout << "[crypto] Benchmarking SHA-512 (64 byte input)... "; std::cout.flush();
-	start = OSUtils::now();
+	int64_t start = OSUtils::now();
 	for(unsigned int i=0;i<2000000;++i) {
 		SHA512(buf1,buf1,64);
 	}
-	end = OSUtils::now();
+	int64_t end = OSUtils::now();
 	std::cout << (uint64_t)(2000000.0 / ((double)(end - start) / 1000.0)) << " hashes/second" ZT_EOL_S;
 
 	std::cout << "[crypto] Testing SHA-384... "; std::cout.flush();
