Browse Source

Speed up Salsa20 just a bit.

Adam Ierymenko 9 years ago
parent
commit
789046ca57
2 changed files with 75 additions and 4 deletions
  1. node/Salsa20.cpp (+74 -3)
  2. node/Salsa20.hpp (+1 -1)

+ 74 - 3
node/Salsa20.cpp

@@ -122,7 +122,7 @@ void Salsa20::init(const void *key,unsigned int kbits,const void *iv,unsigned in
 	_state.i[0] = U8TO32_LITTLE(constants + 0);
 #endif
 
-	_roundsDiv2 = rounds / 2;
+	_roundsDiv4 = rounds / 4;
 }
 
 void Salsa20::encrypt(const void *in,void *out,unsigned int bytes)
@@ -180,7 +180,7 @@ void Salsa20::encrypt(const void *in,void *out,unsigned int bytes)
 		__m128i X2s = X2;
 		__m128i X3s = X3;
 
-		for (i=0;i<_roundsDiv2;++i) {
+		for (i=0;i<_roundsDiv4;++i) {
 			__m128i T = _mm_add_epi32(X0, X3);
 			X1 = _mm_xor_si128(X1, _mm_slli_epi32(T, 7));
 			X1 = _mm_xor_si128(X1, _mm_srli_epi32(T, 25));
@@ -214,6 +214,42 @@ void Salsa20::encrypt(const void *in,void *out,unsigned int bytes)
 			X1 = _mm_shuffle_epi32(X1, 0x39);
 			X2 = _mm_shuffle_epi32(X2, 0x4E);
 			X3 = _mm_shuffle_epi32(X3, 0x93);
+
+			// --
+
+			T = _mm_add_epi32(X0, X3);
+			X1 = _mm_xor_si128(X1, _mm_slli_epi32(T, 7));
+			X1 = _mm_xor_si128(X1, _mm_srli_epi32(T, 25));
+			T = _mm_add_epi32(X1, X0);
+			X2 = _mm_xor_si128(X2, _mm_slli_epi32(T, 9));
+			X2 = _mm_xor_si128(X2, _mm_srli_epi32(T, 23));
+			T = _mm_add_epi32(X2, X1);
+			X3 = _mm_xor_si128(X3, _mm_slli_epi32(T, 13));
+			X3 = _mm_xor_si128(X3, _mm_srli_epi32(T, 19));
+			T = _mm_add_epi32(X3, X2);
+			X0 = _mm_xor_si128(X0, _mm_slli_epi32(T, 18));
+			X0 = _mm_xor_si128(X0, _mm_srli_epi32(T, 14));
+
+			X1 = _mm_shuffle_epi32(X1, 0x93);
+			X2 = _mm_shuffle_epi32(X2, 0x4E);
+			X3 = _mm_shuffle_epi32(X3, 0x39);
+
+			T = _mm_add_epi32(X0, X1);
+			X3 = _mm_xor_si128(X3, _mm_slli_epi32(T, 7));
+			X3 = _mm_xor_si128(X3, _mm_srli_epi32(T, 25));
+			T = _mm_add_epi32(X3, X0);
+			X2 = _mm_xor_si128(X2, _mm_slli_epi32(T, 9));
+			X2 = _mm_xor_si128(X2, _mm_srli_epi32(T, 23));
+			T = _mm_add_epi32(X2, X3);
+			X1 = _mm_xor_si128(X1, _mm_slli_epi32(T, 13));
+			X1 = _mm_xor_si128(X1, _mm_srli_epi32(T, 19));
+			T = _mm_add_epi32(X1, X2);
+			X0 = _mm_xor_si128(X0, _mm_slli_epi32(T, 18));
+			X0 = _mm_xor_si128(X0, _mm_srli_epi32(T, 14));
+
+			X1 = _mm_shuffle_epi32(X1, 0x39);
+			X2 = _mm_shuffle_epi32(X2, 0x4E);
+			X3 = _mm_shuffle_epi32(X3, 0x93);
 		}
 
 		X0 = _mm_add_epi32(X0s,X0);
@@ -260,7 +296,42 @@ void Salsa20::encrypt(const void *in,void *out,unsigned int bytes)
 		x14 = j14;
 		x15 = j15;
 
-		for(i=0;i<_roundsDiv2;++i) {
+		for(i=0;i<_roundsDiv4;++i) {
+			 x4 = XOR( x4,ROTATE(PLUS( x0,x12), 7));
+			 x8 = XOR( x8,ROTATE(PLUS( x4, x0), 9));
+			x12 = XOR(x12,ROTATE(PLUS( x8, x4),13));
+			 x0 = XOR( x0,ROTATE(PLUS(x12, x8),18));
+			 x9 = XOR( x9,ROTATE(PLUS( x5, x1), 7));
+			x13 = XOR(x13,ROTATE(PLUS( x9, x5), 9));
+			 x1 = XOR( x1,ROTATE(PLUS(x13, x9),13));
+			 x5 = XOR( x5,ROTATE(PLUS( x1,x13),18));
+			x14 = XOR(x14,ROTATE(PLUS(x10, x6), 7));
+			 x2 = XOR( x2,ROTATE(PLUS(x14,x10), 9));
+			 x6 = XOR( x6,ROTATE(PLUS( x2,x14),13));
+			x10 = XOR(x10,ROTATE(PLUS( x6, x2),18));
+			 x3 = XOR( x3,ROTATE(PLUS(x15,x11), 7));
+			 x7 = XOR( x7,ROTATE(PLUS( x3,x15), 9));
+			x11 = XOR(x11,ROTATE(PLUS( x7, x3),13));
+			x15 = XOR(x15,ROTATE(PLUS(x11, x7),18));
+			 x1 = XOR( x1,ROTATE(PLUS( x0, x3), 7));
+			 x2 = XOR( x2,ROTATE(PLUS( x1, x0), 9));
+			 x3 = XOR( x3,ROTATE(PLUS( x2, x1),13));
+			 x0 = XOR( x0,ROTATE(PLUS( x3, x2),18));
+			 x6 = XOR( x6,ROTATE(PLUS( x5, x4), 7));
+			 x7 = XOR( x7,ROTATE(PLUS( x6, x5), 9));
+			 x4 = XOR( x4,ROTATE(PLUS( x7, x6),13));
+			 x5 = XOR( x5,ROTATE(PLUS( x4, x7),18));
+			x11 = XOR(x11,ROTATE(PLUS(x10, x9), 7));
+			 x8 = XOR( x8,ROTATE(PLUS(x11,x10), 9));
+			 x9 = XOR( x9,ROTATE(PLUS( x8,x11),13));
+			x10 = XOR(x10,ROTATE(PLUS( x9, x8),18));
+			x12 = XOR(x12,ROTATE(PLUS(x15,x14), 7));
+			x13 = XOR(x13,ROTATE(PLUS(x12,x15), 9));
+			x14 = XOR(x14,ROTATE(PLUS(x13,x12),13));
+			x15 = XOR(x15,ROTATE(PLUS(x14,x13),18));
+
+			// --
+
 			 x4 = XOR( x4,ROTATE(PLUS( x0,x12), 7));
 			 x8 = XOR( x8,ROTATE(PLUS( x4, x0), 9));
 			x12 = XOR(x12,ROTATE(PLUS( x8, x4),13));

+ 1 - 1
node/Salsa20.hpp

@@ -84,7 +84,7 @@ private:
 #endif // ZT_SALSA20_SSE
 		uint32_t i[16];
 	} _state;
-	unsigned int _roundsDiv2;
+	unsigned int _roundsDiv4;
 };
 
 } // namespace ZeroTier