
Remove ASM Salsa20 since it will no longer be the default in 2.x... reduce build complexity.

Adam Ierymenko · 6 years ago · commit 51a25fdec9
5 changed files with 284 additions and 230 deletions
  1. Makefile (+5 -1)
  2. node/AES.hpp (+232 -24)
  3. node/Packet.cpp (+21 -90)
  4. node/Salsa20.hpp (+0 -76)
  5. selftest.cpp (+26 -39)

Makefile (+5 -1)

@@ -1,11 +1,15 @@
 # Common makefile -- loads make rules for each platform
 
 BUILDDIR := build
+CMAKE_OPTS := -DCMAKE_BUILD_TYPE=Release
 
 .PHONY: all
 
 all:
-	mkdir -p ${BUILDDIR} && cd ${BUILDDIR} && cmake .. && $(MAKE)
+	mkdir -p ${BUILDDIR} && cd ${BUILDDIR} && cmake .. ${CMAKE_OPTS} && $(MAKE)
 
 clean:
 	rm -rf ${BUILDDIR}
+
+distclean:
+	rm -rf ${BUILDDIR}

node/AES.hpp (+232 -24)

@@ -57,15 +57,12 @@ public:
 	inline AES() {}
 	inline AES(const uint8_t key[32]) { this->init(key); }
 
-	inline ~AES()
-	{
-		Utils::burn(&_k,sizeof(_k));
-	}
+	inline ~AES() { Utils::burn(&_k,sizeof(_k)); }
 
 	inline void init(const uint8_t key[32])
 	{
 #ifdef ZT_AES_AESNI
-		if (HW_ACCEL) {
+		if (likely(HW_ACCEL)) {
 			_init_aesni(key);
 			return;
 		}
@@ -76,7 +73,7 @@ public:
 	inline void encrypt(const uint8_t in[16],uint8_t out[16]) const
 	{
 #ifdef ZT_AES_AESNI
-		if (HW_ACCEL) {
+		if (likely(HW_ACCEL)) {
 			_encrypt_aesni(in,out);
 			return;
 		}
@@ -84,10 +81,53 @@ public:
 		_encryptSW(in,out);
 	}
 
+	inline void ecbEncrypt(const void *in,unsigned int inlen,void *out)
+	{
+		if (inlen < 16)
+			return;
+#ifdef ZT_AES_AESNI
+		if (likely(HW_ACCEL)) {
+			const uint8_t *i = (const uint8_t *)in;
+			uint8_t *o = (uint8_t *)out;
+			while (inlen >= 128) {
+				_encrypt_8xecb_aesni(i,o);
+				i += 128;
+				o += 128;
+				inlen -= 128;
+			}
+			while (inlen >= 16) {
+				_encrypt_aesni(i,o);
+				i += 16;
+				o += 16;
+				inlen -= 16;
+			}
+			if (inlen != 0) {
+				i -= (16 - inlen);
+				o -= (16 - inlen);
+				_encrypt_aesni(i,o);
+			}
+			return;
+		}
+#endif
+		const uint8_t *i = (const uint8_t *)in;
+		uint8_t *o = (uint8_t *)out;
+		while (inlen >= 16) {
+			_encryptSW(i,o);
+			i += 16;
+			o += 16;
+			inlen -= 16;
+		}
+		if (inlen != 0) {
+			i -= (16 - inlen);
+			o -= (16 - inlen);
+			_encryptSW(i,o);
+		}
+	}
+
 	inline void gcmEncrypt(const uint8_t iv[12],const void *in,unsigned int inlen,const void *assoc,unsigned int assoclen,void *out,uint8_t *tag,unsigned int taglen)
 	{
 #ifdef ZT_AES_AESNI
-		if (HW_ACCEL) {
+		if (likely(HW_ACCEL)) {
 			_encrypt_gcm256_aesni(inlen,(const uint8_t *)in,(uint8_t *)out,iv,assoclen,(const uint8_t *)assoc,tag,taglen);
 			return;
 		}
@@ -98,7 +138,7 @@ public:
 	inline bool gcmDecrypt(const uint8_t iv[12],const void *in,unsigned int inlen,const void *assoc,unsigned int assoclen,void *out,const uint8_t *tag,unsigned int taglen)
 	{
 #ifdef ZT_AES_AESNI
-		if (HW_ACCEL) {
+		if (likely(HW_ACCEL)) {
 			uint8_t tagbuf[16];
 			_decrypt_gcm256_aesni(inlen,(const uint8_t *)in,(uint8_t *)out,iv,assoclen,(const uint8_t *)assoc,tagbuf,taglen);
 			return Utils::secureEq(tagbuf,tag,taglen);
@@ -218,6 +258,160 @@ private:
 		tmp = _mm_aesenc_si128(tmp,_k.ni.k[13]);
 		_mm_storeu_si128((__m128i *)out,_mm_aesenclast_si128(tmp,_k.ni.k[14]));
 	}
+	inline void _encrypt_8xecb_aesni(const void *in,void *out) const
+	{
+		__m128i tmp0 = _mm_loadu_si128((const __m128i *)in);
+		__m128i tmp1 = _mm_loadu_si128((const __m128i *)((const uint8_t *)in + 16));
+		__m128i tmp2 = _mm_loadu_si128((const __m128i *)((const uint8_t *)in + 32));
+		__m128i tmp3 = _mm_loadu_si128((const __m128i *)((const uint8_t *)in + 48));
+		__m128i tmp4 = _mm_loadu_si128((const __m128i *)((const uint8_t *)in + 64));
+		__m128i tmp5 = _mm_loadu_si128((const __m128i *)((const uint8_t *)in + 80));
+		__m128i tmp6 = _mm_loadu_si128((const __m128i *)((const uint8_t *)in + 96));
+		__m128i tmp7 = _mm_loadu_si128((const __m128i *)((const uint8_t *)in + 112));
+		{
+			__m128i k0 = _k.ni.k[0];
+			__m128i k1 = _k.ni.k[1];
+			__m128i k2 = _k.ni.k[2];
+			__m128i k3 = _k.ni.k[3];
+			tmp0 = _mm_xor_si128(tmp0,k0);
+			tmp1 = _mm_xor_si128(tmp1,k0);
+			tmp2 = _mm_xor_si128(tmp2,k0);
+			tmp3 = _mm_xor_si128(tmp3,k0);
+			tmp4 = _mm_xor_si128(tmp4,k0);
+			tmp5 = _mm_xor_si128(tmp5,k0);
+			tmp6 = _mm_xor_si128(tmp6,k0);
+			tmp7 = _mm_xor_si128(tmp7,k0);
+			tmp0 = _mm_aesenc_si128(tmp0,k1);
+			tmp1 = _mm_aesenc_si128(tmp1,k1);
+			tmp2 = _mm_aesenc_si128(tmp2,k1);
+			tmp3 = _mm_aesenc_si128(tmp3,k1);
+			tmp4 = _mm_aesenc_si128(tmp4,k1);
+			tmp5 = _mm_aesenc_si128(tmp5,k1);
+			tmp6 = _mm_aesenc_si128(tmp6,k1);
+			tmp7 = _mm_aesenc_si128(tmp7,k1);
+			tmp0 = _mm_aesenc_si128(tmp0,k2);
+			tmp1 = _mm_aesenc_si128(tmp1,k2);
+			tmp2 = _mm_aesenc_si128(tmp2,k2);
+			tmp3 = _mm_aesenc_si128(tmp3,k2);
+			tmp4 = _mm_aesenc_si128(tmp4,k2);
+			tmp5 = _mm_aesenc_si128(tmp5,k2);
+			tmp6 = _mm_aesenc_si128(tmp6,k2);
+			tmp7 = _mm_aesenc_si128(tmp7,k2);
+			tmp0 = _mm_aesenc_si128(tmp0,k3);
+			tmp1 = _mm_aesenc_si128(tmp1,k3);
+			tmp2 = _mm_aesenc_si128(tmp2,k3);
+			tmp3 = _mm_aesenc_si128(tmp3,k3);
+			tmp4 = _mm_aesenc_si128(tmp4,k3);
+			tmp5 = _mm_aesenc_si128(tmp5,k3);
+			tmp6 = _mm_aesenc_si128(tmp6,k3);
+			tmp7 = _mm_aesenc_si128(tmp7,k3);
+		}
+		{
+			__m128i k4 = _k.ni.k[4];
+			__m128i k5 = _k.ni.k[5];
+			__m128i k6 = _k.ni.k[6];
+			__m128i k7 = _k.ni.k[7];
+			tmp0 = _mm_aesenc_si128(tmp0,k4);
+			tmp1 = _mm_aesenc_si128(tmp1,k4);
+			tmp2 = _mm_aesenc_si128(tmp2,k4);
+			tmp3 = _mm_aesenc_si128(tmp3,k4);
+			tmp4 = _mm_aesenc_si128(tmp4,k4);
+			tmp5 = _mm_aesenc_si128(tmp5,k4);
+			tmp6 = _mm_aesenc_si128(tmp6,k4);
+			tmp7 = _mm_aesenc_si128(tmp7,k4);
+			tmp0 = _mm_aesenc_si128(tmp0,k5);
+			tmp1 = _mm_aesenc_si128(tmp1,k5);
+			tmp2 = _mm_aesenc_si128(tmp2,k5);
+			tmp3 = _mm_aesenc_si128(tmp3,k5);
+			tmp4 = _mm_aesenc_si128(tmp4,k5);
+			tmp5 = _mm_aesenc_si128(tmp5,k5);
+			tmp6 = _mm_aesenc_si128(tmp6,k5);
+			tmp7 = _mm_aesenc_si128(tmp7,k5);
+			tmp0 = _mm_aesenc_si128(tmp0,k6);
+			tmp1 = _mm_aesenc_si128(tmp1,k6);
+			tmp2 = _mm_aesenc_si128(tmp2,k6);
+			tmp3 = _mm_aesenc_si128(tmp3,k6);
+			tmp4 = _mm_aesenc_si128(tmp4,k6);
+			tmp5 = _mm_aesenc_si128(tmp5,k6);
+			tmp6 = _mm_aesenc_si128(tmp6,k6);
+			tmp7 = _mm_aesenc_si128(tmp7,k6);
+			tmp0 = _mm_aesenc_si128(tmp0,k7);
+			tmp1 = _mm_aesenc_si128(tmp1,k7);
+			tmp2 = _mm_aesenc_si128(tmp2,k7);
+			tmp3 = _mm_aesenc_si128(tmp3,k7);
+			tmp4 = _mm_aesenc_si128(tmp4,k7);
+			tmp5 = _mm_aesenc_si128(tmp5,k7);
+			tmp6 = _mm_aesenc_si128(tmp6,k7);
+			tmp7 = _mm_aesenc_si128(tmp7,k7);
+		}
+		{
+			__m128i k8 = _k.ni.k[8];
+			__m128i k9 = _k.ni.k[9];
+			__m128i k10 = _k.ni.k[10];
+			__m128i k11 = _k.ni.k[11];
+			tmp0 = _mm_aesenc_si128(tmp0,k8);
+			tmp1 = _mm_aesenc_si128(tmp1,k8);
+			tmp2 = _mm_aesenc_si128(tmp2,k8);
+			tmp3 = _mm_aesenc_si128(tmp3,k8);
+			tmp4 = _mm_aesenc_si128(tmp4,k8);
+			tmp5 = _mm_aesenc_si128(tmp5,k8);
+			tmp6 = _mm_aesenc_si128(tmp6,k8);
+			tmp7 = _mm_aesenc_si128(tmp7,k8);
+			tmp0 = _mm_aesenc_si128(tmp0,k9);
+			tmp1 = _mm_aesenc_si128(tmp1,k9);
+			tmp2 = _mm_aesenc_si128(tmp2,k9);
+			tmp3 = _mm_aesenc_si128(tmp3,k9);
+			tmp4 = _mm_aesenc_si128(tmp4,k9);
+			tmp5 = _mm_aesenc_si128(tmp5,k9);
+			tmp6 = _mm_aesenc_si128(tmp6,k9);
+			tmp7 = _mm_aesenc_si128(tmp7,k9);
+			tmp0 = _mm_aesenc_si128(tmp0,k10);
+			tmp1 = _mm_aesenc_si128(tmp1,k10);
+			tmp2 = _mm_aesenc_si128(tmp2,k10);
+			tmp3 = _mm_aesenc_si128(tmp3,k10);
+			tmp4 = _mm_aesenc_si128(tmp4,k10);
+			tmp5 = _mm_aesenc_si128(tmp5,k10);
+			tmp6 = _mm_aesenc_si128(tmp6,k10);
+			tmp7 = _mm_aesenc_si128(tmp7,k10);
+			tmp0 = _mm_aesenc_si128(tmp0,k11);
+			tmp1 = _mm_aesenc_si128(tmp1,k11);
+			tmp2 = _mm_aesenc_si128(tmp2,k11);
+			tmp3 = _mm_aesenc_si128(tmp3,k11);
+			tmp4 = _mm_aesenc_si128(tmp4,k11);
+			tmp5 = _mm_aesenc_si128(tmp5,k11);
+			tmp6 = _mm_aesenc_si128(tmp6,k11);
+			tmp7 = _mm_aesenc_si128(tmp7,k11);
+		}
+		{
+			__m128i k12 = _k.ni.k[12];
+			__m128i k13 = _k.ni.k[13];
+			__m128i k14 = _k.ni.k[14];
+			tmp0 = _mm_aesenc_si128(tmp0,k12);
+			tmp1 = _mm_aesenc_si128(tmp1,k12);
+			tmp2 = _mm_aesenc_si128(tmp2,k12);
+			tmp3 = _mm_aesenc_si128(tmp3,k12);
+			tmp4 = _mm_aesenc_si128(tmp4,k12);
+			tmp5 = _mm_aesenc_si128(tmp5,k12);
+			tmp6 = _mm_aesenc_si128(tmp6,k12);
+			tmp7 = _mm_aesenc_si128(tmp7,k12);
+			tmp0 = _mm_aesenc_si128(tmp0,k13);
+			tmp1 = _mm_aesenc_si128(tmp1,k13);
+			tmp2 = _mm_aesenc_si128(tmp2,k13);
+			tmp3 = _mm_aesenc_si128(tmp3,k13);
+			tmp4 = _mm_aesenc_si128(tmp4,k13);
+			tmp5 = _mm_aesenc_si128(tmp5,k13);
+			tmp6 = _mm_aesenc_si128(tmp6,k13);
+			tmp7 = _mm_aesenc_si128(tmp7,k13);
+			_mm_storeu_si128((__m128i *)out,_mm_aesenclast_si128(tmp0,k14));
+			_mm_storeu_si128((__m128i *)((uint8_t *)out + 16),_mm_aesenclast_si128(tmp1,k14));
+			_mm_storeu_si128((__m128i *)((uint8_t *)out + 32),_mm_aesenclast_si128(tmp2,k14));
+			_mm_storeu_si128((__m128i *)((uint8_t *)out + 48),_mm_aesenclast_si128(tmp3,k14));
+			_mm_storeu_si128((__m128i *)((uint8_t *)out + 64),_mm_aesenclast_si128(tmp4,k14));
+			_mm_storeu_si128((__m128i *)((uint8_t *)out + 80),_mm_aesenclast_si128(tmp5,k14));
+			_mm_storeu_si128((__m128i *)((uint8_t *)out + 96),_mm_aesenclast_si128(tmp6,k14));
+			_mm_storeu_si128((__m128i *)((uint8_t *)out + 112),_mm_aesenclast_si128(tmp7,k14));
+		}
+	}
 
 	static inline __m128i _swap128_aesni(__m128i x) { return _mm_shuffle_epi8(x,_mm_set_epi8(0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15)); }
 	static inline __m128i _mult_block_aesni(__m128i h,__m128i y)
@@ -458,28 +652,16 @@ private:
 		__m128i *bi = (__m128i *)in;
 		__m128i *bo = (__m128i *)out;
 
-		__m128i k0 = _k.ni.k[0];
-		__m128i k1 = _k.ni.k[1];
-		__m128i k2 = _k.ni.k[2];
-		__m128i k3 = _k.ni.k[3];
-		__m128i k4 = _k.ni.k[4];
-		__m128i k5 = _k.ni.k[5];
-		__m128i k6 = _k.ni.k[6];
-		__m128i k7 = _k.ni.k[7];
-		__m128i k8 = _k.ni.k[8];
-		__m128i k9 = _k.ni.k[9];
-		__m128i k10 = _k.ni.k[10];
-		__m128i k11 = _k.ni.k[11];
-		__m128i k12 = _k.ni.k[12];
-		__m128i k13 = _k.ni.k[13];
-		__m128i k14 = _k.ni.k[14];
-
 		unsigned int i;
 		for (i=0;i<pblocks;i+=4) {
 			__m128i d1 = _mm_loadu_si128(bi + i + 0);
 			__m128i d2 = _mm_loadu_si128(bi + i + 1);
 			__m128i d3 = _mm_loadu_si128(bi + i + 2);
 			__m128i d4 = _mm_loadu_si128(bi + i + 3);
+			__m128i k0 = _k.ni.k[0];
+			__m128i k1 = _k.ni.k[1];
+			__m128i k2 = _k.ni.k[2];
+			__m128i k3 = _k.ni.k[3];
 			__m128i t1 = _mm_xor_si128(cb,k0);
 			cb = _increment_be_aesni(cb);
 			__m128i t2 = _mm_xor_si128(cb,k0);
@@ -500,6 +682,10 @@ private:
 			t2 = _mm_aesenc_si128(t2,k3);
 			t3 = _mm_aesenc_si128(t3,k3);
 			t4 = _mm_aesenc_si128(t4,k3);
+			__m128i k4 = _k.ni.k[4];
+			__m128i k5 = _k.ni.k[5];
+			__m128i k6 = _k.ni.k[6];
+			__m128i k7 = _k.ni.k[7];
 			t1 = _mm_aesenc_si128(t1,k4);
 			t2 = _mm_aesenc_si128(t2,k4);
 			t3 = _mm_aesenc_si128(t3,k4);
@@ -516,6 +702,10 @@ private:
 			t2 = _mm_aesenc_si128(t2,k7);
 			t3 = _mm_aesenc_si128(t3,k7);
 			t4 = _mm_aesenc_si128(t4,k7);
+			__m128i k8 = _k.ni.k[8];
+			__m128i k9 = _k.ni.k[9];
+			__m128i k10 = _k.ni.k[10];
+			__m128i k11 = _k.ni.k[11];
 			t1 = _mm_aesenc_si128(t1,k8);
 			t2 = _mm_aesenc_si128(t2,k8);
 			t3 = _mm_aesenc_si128(t3,k8);
@@ -532,6 +722,9 @@ private:
 			t2 = _mm_aesenc_si128(t2,k11);
 			t3 = _mm_aesenc_si128(t3,k11);
 			t4 = _mm_aesenc_si128(t4,k11);
+			__m128i k12 = _k.ni.k[12];
+			__m128i k13 = _k.ni.k[13];
+			__m128i k14 = _k.ni.k[14];
 			t1 = _mm_aesenc_si128(t1,k12);
 			t2 = _mm_aesenc_si128(t2,k12);
 			t3 = _mm_aesenc_si128(t3,k12);
@@ -558,18 +751,33 @@ private:
 
 		for (i=pblocks;i<blocks;++i) {
 			__m128i d1 = _mm_loadu_si128(bi + i);
+			__m128i k0 = _k.ni.k[0];
+			__m128i k1 = _k.ni.k[1];
+			__m128i k2 = _k.ni.k[2];
+			__m128i k3 = _k.ni.k[3];
 			__m128i t1 = _mm_xor_si128(cb,k0);
 			t1 = _mm_aesenc_si128(t1,k1);
 			t1 = _mm_aesenc_si128(t1,k2);
 			t1 = _mm_aesenc_si128(t1,k3);
+			__m128i k4 = _k.ni.k[4];
+			__m128i k5 = _k.ni.k[5];
+			__m128i k6 = _k.ni.k[6];
+			__m128i k7 = _k.ni.k[7];
 			t1 = _mm_aesenc_si128(t1,k4);
 			t1 = _mm_aesenc_si128(t1,k5);
 			t1 = _mm_aesenc_si128(t1,k6);
 			t1 = _mm_aesenc_si128(t1,k7);
+			__m128i k8 = _k.ni.k[8];
+			__m128i k9 = _k.ni.k[9];
+			__m128i k10 = _k.ni.k[10];
+			__m128i k11 = _k.ni.k[11];
 			t1 = _mm_aesenc_si128(t1,k8);
 			t1 = _mm_aesenc_si128(t1,k9);
 			t1 = _mm_aesenc_si128(t1,k10);
 			t1 = _mm_aesenc_si128(t1,k11);
+			__m128i k12 = _k.ni.k[12];
+			__m128i k13 = _k.ni.k[13];
+			__m128i k14 = _k.ni.k[14];
 			t1 = _mm_aesenc_si128(t1,k12);
 			t1 = _mm_aesenc_si128(t1,k13);
 			t1 = _mm_aesenclast_si128(t1,k14);
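Two notes on the AES changes above. _encrypt_8xecb_aesni() interleaves eight independent blocks so the _mm_aesenc_si128 instructions pipeline instead of serializing on each block's round latency, and both it and the reworked CTR loop now load the round keys k0..k14 in small groups to limit register pressure. The subtler detail is the tail handling in ecbEncrypt(): when inlen is not a multiple of 16, both pointers back up by (16 - remainder) so the final call encrypts one last full 16-byte block that ends exactly at the end of the buffer. A minimal standalone sketch of that tail trick, with a hypothetical toy block function standing in for _encryptSW():

#include <cstdint>
#include <cstdio>

// Hypothetical stand-in for AES::_encryptSW(); XORing a constant is not
// encryption, it just makes the 16-byte block coverage visible.
static void toyEncryptBlock(const uint8_t in[16],uint8_t out[16])
{
	for(int i=0;i<16;++i) out[i] = (uint8_t)(in[i] ^ 0x5a);
}

// Same control flow as the software path of ecbEncrypt() above.
static void toyEcbScramble(const void *in,unsigned int inlen,void *out)
{
	if (inlen < 16)
		return; // precondition shared with ecbEncrypt()
	const uint8_t *i = (const uint8_t *)in;
	uint8_t *o = (uint8_t *)out;
	while (inlen >= 16) {
		toyEncryptBlock(i,o);
		i += 16;
		o += 16;
		inlen -= 16;
	}
	if (inlen != 0) {
		// Partial tail: slide both pointers back so the final encryption is a
		// full block ending exactly at the end of the buffer; the overlapped
		// output bytes are simply overwritten.
		i -= (16 - inlen);
		o -= (16 - inlen);
		toyEncryptBlock(i,o);
	}
}

int main()
{
	uint8_t src[23],dst[23];
	for(unsigned int n=0;n<sizeof(src);++n) src[n] = (uint8_t)n;
	toyEcbScramble(src,sizeof(src),dst); // encrypts blocks [0,16) and [7,23)
	printf("%02x ... %02x\n",dst[0],dst[22]);
	return 0;
}

Because the overlapped output bytes are overwritten by the second encryption, this behaves as a one-pass scramble (as the selftest below labels it) rather than textbook ECB over independent blocks.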

node/Packet.cpp (+21 -90)

@@ -32,13 +32,6 @@
 
 #include "Packet.hpp"
 
-#ifdef ZT_USE_X64_ASM_SALSA2012
-#include "../ext/x64-salsa2012-asm/salsa2012.h"
-#endif
-#ifdef ZT_USE_ARM32_NEON_ASM_SALSA2012
-#include "../ext/arm32-neon-salsa2012-asm/salsa2012.h"
-#endif
-
 #ifdef _MSC_VER
 #define FORCE_INLINE static __forceinline
 #include <intrin.h>
@@ -50,37 +43,6 @@
 
 namespace ZeroTier {
 
-/**************************************************************************/
-
-/* Set up macros for fast single-pass ASM Salsa20/12 crypto, if we have it */
-
-// x64 SSE crypto
-#ifdef ZT_USE_X64_ASM_SALSA2012
-#define ZT_HAS_FAST_CRYPTO() (true)
-#define ZT_FAST_SINGLE_PASS_SALSA2012(b,l,n,k) zt_salsa2012_amd64_xmm6(reinterpret_cast<unsigned char *>(b),(l),reinterpret_cast<const unsigned char *>(n),reinterpret_cast<const unsigned char *>(k))
-#endif
-
-// ARM (32-bit) NEON crypto (must be detected)
-#ifdef ZT_USE_ARM32_NEON_ASM_SALSA2012
-class _FastCryptoChecker
-{
-public:
-	_FastCryptoChecker() : canHas(zt_arm_has_neon()) {}
-	bool canHas;
-};
-static const _FastCryptoChecker _ZT_FAST_CRYPTO_CHECK;
-#define ZT_HAS_FAST_CRYPTO() (_ZT_FAST_CRYPTO_CHECK.canHas)
-#define ZT_FAST_SINGLE_PASS_SALSA2012(b,l,n,k) zt_salsa2012_armneon3_xor(reinterpret_cast<unsigned char *>(b),(const unsigned char *)0,(l),reinterpret_cast<const unsigned char *>(n),reinterpret_cast<const unsigned char *>(k))
-#endif
-
-// No fast crypto available
-#ifndef ZT_HAS_FAST_CRYPTO
-#define ZT_HAS_FAST_CRYPTO() (false)
-#define ZT_FAST_SINGLE_PASS_SALSA2012(b,l,n,k) {}
-#endif
-
-/**************************************************************************/
-
 /* LZ4 is shipped encapsulated into Packet in an anonymous namespace.
  *
  * We're doing this as a deliberate workaround for various Linux distribution
@@ -899,30 +861,16 @@ void Packet::armor(const void *key,bool encryptPayload)
 
 	_salsa20MangleKey((const unsigned char *)key,mangledKey);
 
-	if (ZT_HAS_FAST_CRYPTO()) {
-		const unsigned int encryptLen = (encryptPayload) ? (size() - ZT_PACKET_IDX_VERB) : 0;
-		uint64_t keyStream[(ZT_PROTO_MAX_PACKET_LENGTH + 64 + 8) / 8];
-		ZT_FAST_SINGLE_PASS_SALSA2012(keyStream,encryptLen + 64,(data + ZT_PACKET_IDX_IV),mangledKey);
-		Salsa20::memxor(data + ZT_PACKET_IDX_VERB,reinterpret_cast<const uint8_t *>(keyStream + 8),encryptLen);
-		uint64_t mac[2];
-		poly1305(mac,data + ZT_PACKET_IDX_VERB,size() - ZT_PACKET_IDX_VERB,keyStream);
-#ifdef ZT_NO_TYPE_PUNNING
-		memcpy(data + ZT_PACKET_IDX_MAC,mac,8);
-#else
-		(*reinterpret_cast<uint64_t *>(data + ZT_PACKET_IDX_MAC)) = mac[0];
-#endif
-	} else {
-		Salsa20 s20(mangledKey,data + ZT_PACKET_IDX_IV);
-		uint64_t macKey[4];
-		s20.crypt12(ZERO_KEY,macKey,sizeof(macKey));
-		uint8_t *const payload = data + ZT_PACKET_IDX_VERB;
-		const unsigned int payloadLen = size() - ZT_PACKET_IDX_VERB;
-		if (encryptPayload)
-			s20.crypt12(payload,payload,payloadLen);
-		uint64_t mac[2];
-		poly1305(mac,payload,payloadLen,macKey);
-		memcpy(data + ZT_PACKET_IDX_MAC,mac,8);
-	}
+	Salsa20 s20(mangledKey,data + ZT_PACKET_IDX_IV);
+	uint64_t macKey[4];
+	s20.crypt12(ZERO_KEY,macKey,sizeof(macKey));
+	uint8_t *const payload = data + ZT_PACKET_IDX_VERB;
+	const unsigned int payloadLen = size() - ZT_PACKET_IDX_VERB;
+	if (encryptPayload)
+		s20.crypt12(payload,payload,payloadLen);
+	uint64_t mac[2];
+	poly1305(mac,payload,payloadLen,macKey);
+	memcpy(data + ZT_PACKET_IDX_MAC,mac,8);
 }
 
 bool Packet::dearmor(const void *key)
@@ -935,37 +883,20 @@ bool Packet::dearmor(const void *key)
 
 	if ((cs == ZT_PROTO_CIPHER_SUITE__POLY1305_NONE)||(cs == ZT_PROTO_CIPHER_SUITE__POLY1305_SALSA2012)) {
 		_salsa20MangleKey((const unsigned char *)key,mangledKey);
-		if (ZT_HAS_FAST_CRYPTO()) {
-			uint64_t keyStream[(ZT_PROTO_MAX_PACKET_LENGTH + 64 + 8) / 8];
-			ZT_FAST_SINGLE_PASS_SALSA2012(keyStream,((cs == ZT_PROTO_CIPHER_SUITE__POLY1305_SALSA2012) ? (payloadLen + 64) : 64),(data + ZT_PACKET_IDX_IV),mangledKey);
-			uint64_t mac[2];
-			poly1305(mac,payload,payloadLen,keyStream);
-#ifdef ZT_NO_TYPE_PUNNING
-			if (!Utils::secureEq(mac,data + ZT_PACKET_IDX_MAC,8))
-				return false;
-#else
-			if ((*reinterpret_cast<const uint64_t *>(data + ZT_PACKET_IDX_MAC)) != mac[0]) // also secure, constant time
-				return false;
-#endif
-			if (cs == ZT_PROTO_CIPHER_SUITE__POLY1305_SALSA2012)
-				Salsa20::memxor(data + ZT_PACKET_IDX_VERB,reinterpret_cast<const uint8_t *>(keyStream + 8),payloadLen);
-		} else {
-			Salsa20 s20(mangledKey,data + ZT_PACKET_IDX_IV);
-			uint64_t macKey[4];
-			s20.crypt12(ZERO_KEY,macKey,sizeof(macKey));
-			uint64_t mac[2];
-			poly1305(mac,payload,payloadLen,macKey);
+		Salsa20 s20(mangledKey,data + ZT_PACKET_IDX_IV);
+		uint64_t macKey[4];
+		s20.crypt12(ZERO_KEY,macKey,sizeof(macKey));
+		uint64_t mac[2];
+		poly1305(mac,payload,payloadLen,macKey);
 #ifdef ZT_NO_TYPE_PUNNING
-			if (!Utils::secureEq(mac,data + ZT_PACKET_IDX_MAC,8))
-				return false;
+		if (!Utils::secureEq(mac,data + ZT_PACKET_IDX_MAC,8))
+			return false;
 #else
-			if ((*reinterpret_cast<const uint64_t *>(data + ZT_PACKET_IDX_MAC)) != mac[0]) // also secure, constant time
-				return false;
+		if ((*reinterpret_cast<const uint64_t *>(data + ZT_PACKET_IDX_MAC)) != mac[0]) // also secure, constant time
+			return false;
 #endif
-			if (cs == ZT_PROTO_CIPHER_SUITE__POLY1305_SALSA2012)
-				s20.crypt12(payload,payload,payloadLen);
-		}
-
+		if (cs == ZT_PROTO_CIPHER_SUITE__POLY1305_SALSA2012)
+			s20.crypt12(payload,payload,payloadLen);
 		return true;
 	} else {
 		return false; // unrecognized cipher suite
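With the fast-crypto macros gone, armor() and dearmor() keep only the portable encrypt-then-MAC shape: running Salsa20/12 over ZERO_KEY (32 zero bytes) extracts the keystream prefix, which becomes a one-time Poly1305 key; the continuing keystream encrypts the payload; and only the first 8 bytes of the MAC are stored, then compared in constant time before anything is decrypted. A self-contained sketch of that shape, using hypothetical toy stubs in place of Salsa20::crypt12() and poly1305():

#include <cstdint>
#include <cstring>
#include <cstdio>

// Hypothetical stream cipher stub; the real code uses Salsa20::crypt12().
struct ToyStream
{
	uint8_t s;
	ToyStream() : s(1) {}
	void crypt(const uint8_t *in,uint8_t *out,unsigned int len)
	{
		for(unsigned int i=0;i<len;++i)
			out[i] = (uint8_t)(in[i] ^ (s = (uint8_t)(s * 5 + 17)));
	}
};

// Hypothetical MAC stub; the real code uses poly1305().
static void toyMac(uint64_t tag[2],const uint8_t *m,unsigned int len,const uint64_t key[4])
{
	tag[0] = key[0]; tag[1] = key[1];
	for(unsigned int i=0;i<len;++i)
		tag[i & 1] = (tag[i & 1] ^ (uint64_t)m[i]) * 0x100000001b3ULL;
}

// Constant-time comparison, same role as Utils::secureEq().
static bool secureEq(const void *a,const void *b,unsigned int len)
{
	const uint8_t *x = (const uint8_t *)a,*y = (const uint8_t *)b;
	uint8_t d = 0;
	for(unsigned int i=0;i<len;++i) d |= (uint8_t)(x[i] ^ y[i]);
	return (d == 0);
}

int main()
{
	static const uint8_t ZERO_KEY[32] = {0};
	uint8_t payload[100]; memset(payload,0xab,sizeof(payload));
	uint8_t storedMac[8];

	{ // armor(): keystream prefix -> one-time MAC key, rest encrypts, MAC covers ciphertext
		ToyStream s;
		uint64_t macKey[4];
		s.crypt(ZERO_KEY,(uint8_t *)macKey,sizeof(macKey));
		s.crypt(payload,payload,sizeof(payload));
		uint64_t mac[2];
		toyMac(mac,payload,sizeof(payload),macKey);
		memcpy(storedMac,mac,8); // only the first 8 MAC bytes travel with the packet
	}

	{ // dearmor(): re-derive the MAC key and verify BEFORE decrypting
		ToyStream s;
		uint64_t macKey[4];
		s.crypt(ZERO_KEY,(uint8_t *)macKey,sizeof(macKey));
		uint64_t mac[2];
		toyMac(mac,payload,sizeof(payload),macKey);
		if (!secureEq(mac,storedMac,8)) { printf("MAC mismatch\n"); return 1; }
		s.crypt(payload,payload,sizeof(payload));
	}
	printf("round trip OK\n");
	return 0;
}

Deriving the MAC key from the same cipher state means each packet's IV yields an independent one-time Poly1305 key, which is what Poly1305 requires.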

node/Salsa20.hpp (+0 -76)

@@ -34,82 +34,6 @@ public:
 	inline Salsa20() {}
 	inline ~Salsa20() { Utils::burn(&_state,sizeof(_state)); }
 
-	/**
-	 * XOR d with s
-	 *
-	 * This is done efficiently using e.g. SSE if available. It's used when
-	 * alternative Salsa20 implementations are used in Packet and is here
-	 * since this is where all the SSE stuff is already included.
-	 *
-	 * @param d Destination to XOR
-	 * @param s Source bytes to XOR with destination
-	 * @param len Length of s and d
-	 */
-	static inline void memxor(uint8_t *d,const uint8_t *s,unsigned int len)
-	{
-#ifdef ZT_SALSA20_SSE
-		while (len >= 128) {
-			__m128i s0 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(s));
-			__m128i s1 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(s + 16));
-			__m128i s2 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(s + 32));
-			__m128i s3 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(s + 48));
-			__m128i s4 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(s + 64));
-			__m128i s5 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(s + 80));
-			__m128i s6 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(s + 96));
-			__m128i s7 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(s + 112));
-			__m128i d0 = _mm_loadu_si128(reinterpret_cast<__m128i *>(d));
-			__m128i d1 = _mm_loadu_si128(reinterpret_cast<__m128i *>(d + 16));
-			__m128i d2 = _mm_loadu_si128(reinterpret_cast<__m128i *>(d + 32));
-			__m128i d3 = _mm_loadu_si128(reinterpret_cast<__m128i *>(d + 48));
-			__m128i d4 = _mm_loadu_si128(reinterpret_cast<__m128i *>(d + 64));
-			__m128i d5 = _mm_loadu_si128(reinterpret_cast<__m128i *>(d + 80));
-			__m128i d6 = _mm_loadu_si128(reinterpret_cast<__m128i *>(d + 96));
-			__m128i d7 = _mm_loadu_si128(reinterpret_cast<__m128i *>(d + 112));
-			d0 = _mm_xor_si128(d0,s0);
-			d1 = _mm_xor_si128(d1,s1);
-			d2 = _mm_xor_si128(d2,s2);
-			d3 = _mm_xor_si128(d3,s3);
-			d4 = _mm_xor_si128(d4,s4);
-			d5 = _mm_xor_si128(d5,s5);
-			d6 = _mm_xor_si128(d6,s6);
-			d7 = _mm_xor_si128(d7,s7);
-			_mm_storeu_si128(reinterpret_cast<__m128i *>(d),d0);
-			_mm_storeu_si128(reinterpret_cast<__m128i *>(d + 16),d1);
-			_mm_storeu_si128(reinterpret_cast<__m128i *>(d + 32),d2);
-			_mm_storeu_si128(reinterpret_cast<__m128i *>(d + 48),d3);
-			_mm_storeu_si128(reinterpret_cast<__m128i *>(d + 64),d4);
-			_mm_storeu_si128(reinterpret_cast<__m128i *>(d + 80),d5);
-			_mm_storeu_si128(reinterpret_cast<__m128i *>(d + 96),d6);
-			_mm_storeu_si128(reinterpret_cast<__m128i *>(d + 112),d7);
-			s += 128;
-			d += 128;
-			len -= 128;
-		}
-		while (len >= 16) {
-			_mm_storeu_si128(reinterpret_cast<__m128i *>(d),_mm_xor_si128(_mm_loadu_si128(reinterpret_cast<__m128i *>(d)),_mm_loadu_si128(reinterpret_cast<const __m128i *>(s))));
-			s += 16;
-			d += 16;
-			len -= 16;
-		}
-#else
-#ifndef ZT_NO_TYPE_PUNNING
-		while (len >= 16) {
-			(*reinterpret_cast<uint64_t *>(d)) ^= (*reinterpret_cast<const uint64_t *>(s));
-			s += 8;
-			d += 8;
-			(*reinterpret_cast<uint64_t *>(d)) ^= (*reinterpret_cast<const uint64_t *>(s));
-			s += 8;
-			d += 8;
-			len -= 16;
-		}
-#endif
-#endif
-		while (len) {
-			--len;
-			*(d++) ^= *(s++);
-		}
-	}
-
 	/**
 	 * @param key 256-bit (32 byte) key
 	 * @param iv 64-bit initialization vector

selftest.cpp (+26 -39)

@@ -209,13 +209,36 @@ static int testCrypto()
 	}
 	double gcmBytes = 0.0;
 	int64_t start = OSUtils::now();
-	for(unsigned long i=0;i<150000;++i) {
+	for(unsigned long i=0;i<100000;++i) {
 		tv.gcmEncrypt((const uint8_t *)hexbuf,buf1,sizeof(buf1),nullptr,0,buf2,(uint8_t *)(hexbuf + 32),16);
-		gcmBytes += (double)sizeof(buf1);
+		tv.gcmEncrypt((const uint8_t *)hexbuf,buf2,sizeof(buf2),nullptr,0,buf1,(uint8_t *)(hexbuf + 32),16);
+		gcmBytes += (double)(sizeof(buf1) * 2);
 	}
 	int64_t end = OSUtils::now();
 	*dummy = buf1[0];
-	std::cout << ((gcmBytes / 1048576.0) / ((long double)(end - start) / 1000.0)) << " MiB/second" << std::endl;
+	std::cout << ((gcmBytes / 1048576.0) / ((double)(end - start) / 1000.0)) << " MiB/second" << std::endl << "  AES-256 ECB scramble (benchmark): "; std::cout.flush();
+	double ecbBytes = 0.0;
+	start = OSUtils::now();
+	for(unsigned long i=0;i<100000;++i) {
+		tv.ecbEncrypt(buf1,sizeof(buf1),buf2);
+		tv.ecbEncrypt(buf2,sizeof(buf1),buf1);
+		ecbBytes += (double)(sizeof(buf1) * 2);
+	}
+	end = OSUtils::now();
+	*dummy = buf1[0];
+	std::cout << ((ecbBytes / 1048576.0) / ((double)(end - start) / 1000.0)) << " MiB/second" << std::endl << "  AES-256 GCM + ECB scramble (benchmark): "; std::cout.flush();
+	ecbBytes = 0.0;
+	start = OSUtils::now();
+	for(unsigned long i=0;i<100000;++i) {
+		tv.gcmEncrypt((const uint8_t *)hexbuf,buf1,sizeof(buf1),nullptr,0,buf2,(uint8_t *)(hexbuf + 32),16);
+		tv.ecbEncrypt(buf1,sizeof(buf1),buf2);
+		tv.gcmEncrypt((const uint8_t *)hexbuf,buf2,sizeof(buf2),nullptr,0,buf1,(uint8_t *)(hexbuf + 32),16);
+		tv.ecbEncrypt(buf2,sizeof(buf1),buf1);
+		ecbBytes += (double)(sizeof(buf1) * 2);
+	}
+	end = OSUtils::now();
+	*dummy = buf1[0];
+	std::cout << ((ecbBytes / 1048576.0) / ((double)(end - start) / 1000.0)) << " MiB/second" << std::endl;
 
 	std::cout << "[crypto] Testing Salsa20... "; std::cout.flush();
 	for(unsigned int i=0;i<4;++i) {
@@ -275,42 +298,6 @@ static int testCrypto()
 		::free((void *)bb);
 	}
 
-#ifdef ZT_USE_X64_ASM_SALSA2012
-	std::cout << "[crypto] Benchmarking Salsa20/12 fast x64 ASM... "; std::cout.flush();
-	{
-		unsigned char *bb = (unsigned char *)::malloc(1234567);
-		double bytes = 0.0;
-		uint64_t start = OSUtils::now();
-		for(unsigned int i=0;i<200;++i) {
-			zt_salsa2012_amd64_xmm6(bb,1234567,s20TV0Iv,s20TV0Key);
-			bytes += 1234567.0;
-		}
-		uint64_t end = OSUtils::now();
-		*dummy = bb[0];
-		std::cout << ((bytes / 1048576.0) / ((double)(end - start) / 1000.0)) << " MiB/second" << std::endl;
-		::free((void *)bb);
-	}
-#endif
-
-#ifdef ZT_USE_ARM32_NEON_ASM_SALSA2012
-	if (zt_arm_has_neon()) {
-		std::cout << "[crypto] Benchmarking Salsa20/12 fast arm32/neon ASM... "; std::cout.flush();
-		{
-			unsigned char *bb = (unsigned char *)::malloc(1234567);
-			double bytes = 0.0;
-			uint64_t start = OSUtils::now();
-			for(unsigned int i=0;i<200;++i) {
-				zt_salsa2012_armneon3_xor(bb,(const unsigned char *)0,1234567,s20TV0Iv,s20TV0Key);
-				bytes += 1234567.0;
-			}
-			uint64_t end = OSUtils::now();
-			*dummy = bb[0];
-			std::cout << ((bytes / 1048576.0) / ((double)(end - start) / 1000.0)) << " MiB/second" << std::endl;
-			::free((void *)bb);
-		}
-	}
-#endif
-
 	std::cout << "[crypto] Benchmarking Salsa20/20... "; std::cout.flush();
 	{
 		unsigned char *bb = (unsigned char *)::malloc(1234567);
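The selftest benchmarks above all follow the same harness pattern: ping-pong between two buffers so each pass depends on the previous one, fold one byte of the result into a volatile sink (the *dummy write) so the optimizer cannot discard the work, and report (bytes / 1048576) / (elapsed ms / 1000) MiB/second. A minimal standalone sketch of that pattern, assuming std::chrono in place of OSUtils::now():

#include <chrono>
#include <cstdint>
#include <cstdio>
#include <cstring>

static volatile uint8_t sink; // same role as *dummy: keeps results observable

int main()
{
	static uint8_t buf1[1024],buf2[1024];
	memset(buf1,0x11,sizeof(buf1));
	double bytes = 0.0;
	std::chrono::steady_clock::time_point start = std::chrono::steady_clock::now();
	for(unsigned long i=0;i<100000;++i) {
		// Ping-pong between the two buffers, as the AES benchmarks above do,
		// so every iteration depends on the previous one and cannot be hoisted.
		for(unsigned int j=0;j<sizeof(buf1);++j) buf2[j] = (uint8_t)(buf1[j] * 31 + 7);
		for(unsigned int j=0;j<sizeof(buf1);++j) buf1[j] = (uint8_t)(buf2[j] * 31 + 7);
		bytes += (double)(sizeof(buf1) * 2);
	}
	std::chrono::steady_clock::time_point end = std::chrono::steady_clock::now();
	sink = buf1[0]; // observe the result so the work is not dead code
	const double ms = std::chrono::duration<double,std::milli>(end - start).count();
	printf("%.1f MiB/second\n",(bytes / 1048576.0) / (ms / 1000.0));
	return 0;
}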