
Unroll Salsa20 fully for a little more speed (non-SSE now almost as fast as SSE)

Adam Ierymenko, 9 years ago
commit 0c498556d5
7 changed files with 1138 additions and 185 deletions

  1. node/Identity.cpp (+4 -5)
  2. node/IncomingPacket.cpp (+4 -4)
  3. node/Node.cpp (+3 -3)
  4. node/Packet.cpp (+6 -6)
  5. node/Salsa20.cpp (+1079 -127)
  6. node/Salsa20.hpp (+30 -10)
  7. selftest.cpp (+12 -30)
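
The change that ties these files together: Salsa20 no longer takes a round count in its constructor or init(), and the generic encrypt()/decrypt() calls are replaced by per-variant encrypt12()/decrypt12() and encrypt20() methods whose round loops are fully unrolled. A minimal before/after sketch of the call pattern, with placeholder names (key, iv, in, out, len, example) and only the method names taken from the hunks below:

	#include "node/Salsa20.hpp"

	// Hypothetical caller; key/iv/in/out/len are placeholders.
	void example(const void *key,const void *iv,const void *in,void *out,unsigned int len)
	{
		// Before this commit: round count fixed at construction, one generic encrypt()
		//   Salsa20 s20(key,256,iv,12);
		//   s20.encrypt(in,out,len);

		// After this commit: no rounds parameter; pick the fully unrolled variant per call
		ZeroTier::Salsa20 s20(key,256,iv);
		s20.encrypt12(in,out,len);   // Salsa20/12: packet armor, PRNG, proof of work
		// ...or s20.encrypt20(in,out,len) for the identity memory-hard hash
	}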

+ 4 - 5
node/Identity.cpp

@@ -41,7 +41,6 @@
 
 #define ZT_IDENTITY_GEN_HASHCASH_FIRST_BYTE_LESS_THAN 17
 #define ZT_IDENTITY_GEN_MEMORY 2097152
-#define ZT_IDENTITY_GEN_SALSA20_ROUNDS 20
 
 namespace ZeroTier {
 
@@ -55,8 +54,8 @@ static inline void _computeMemoryHardHash(const void *publicKey,unsigned int pub
 	// ordinary Salsa20 is randomly seekable. This is good for a cipher
 	// but is not what we want for sequential memory-hardness.
 	memset(genmem,0,ZT_IDENTITY_GEN_MEMORY);
-	Salsa20 s20(digest,256,(char *)digest + 32,ZT_IDENTITY_GEN_SALSA20_ROUNDS);
-	s20.encrypt((char *)genmem,(char *)genmem,64);
+	Salsa20 s20(digest,256,(char *)digest + 32);
+	s20.encrypt20((char *)genmem,(char *)genmem,64);
 	for(unsigned long i=64;i<ZT_IDENTITY_GEN_MEMORY;i+=64) {
 		unsigned long k = i - 64;
 		*((uint64_t *)((char *)genmem + i)) = *((uint64_t *)((char *)genmem + k));
@@ -67,7 +66,7 @@ static inline void _computeMemoryHardHash(const void *publicKey,unsigned int pub
 		*((uint64_t *)((char *)genmem + i + 40)) = *((uint64_t *)((char *)genmem + k + 40));
 		*((uint64_t *)((char *)genmem + i + 48)) = *((uint64_t *)((char *)genmem + k + 48));
 		*((uint64_t *)((char *)genmem + i + 56)) = *((uint64_t *)((char *)genmem + k + 56));
-		s20.encrypt((char *)genmem + i,(char *)genmem + i,64);
+		s20.encrypt20((char *)genmem + i,(char *)genmem + i,64);
 	}
 
 	// Render final digest using genmem as a lookup table
@@ -77,7 +76,7 @@ static inline void _computeMemoryHardHash(const void *publicKey,unsigned int pub
 		uint64_t tmp = ((uint64_t *)genmem)[idx2];
 		((uint64_t *)genmem)[idx2] = ((uint64_t *)digest)[idx1];
 		((uint64_t *)digest)[idx1] = tmp;
-		s20.encrypt(digest,digest,64);
+		s20.encrypt20(digest,digest,64);
 	}
 }
 

+ 4 - 4
node/IncomingPacket.cpp

@@ -1149,9 +1149,9 @@ try_salsa2012sha512_again:
 	++*(reinterpret_cast<volatile uint64_t *>(candidate));
 
 	SHA512::hash(shabuf,candidate,16 + challengeLength);
-	s20.init(shabuf,256,&s20iv,12);
+	s20.init(shabuf,256,&s20iv);
 	memset(salsabuf,0,sizeof(salsabuf));
-	s20.encrypt(salsabuf,salsabuf,sizeof(salsabuf));
+	s20.encrypt12(salsabuf,salsabuf,sizeof(salsabuf));
 	SHA512::hash(shabuf,salsabuf,sizeof(salsabuf));
 
 	d = difficulty;
@@ -1186,9 +1186,9 @@ bool IncomingPacket::testSalsa2012Sha512ProofOfWorkResult(unsigned int difficult
 	memcpy(candidate + 16,challenge,challengeLength);
 
 	SHA512::hash(shabuf,candidate,16 + challengeLength);
-	s20.init(shabuf,256,&s20iv,12);
+	s20.init(shabuf,256,&s20iv);
 	memset(salsabuf,0,sizeof(salsabuf));
-	s20.encrypt(salsabuf,salsabuf,sizeof(salsabuf));
+	s20.encrypt12(salsabuf,salsabuf,sizeof(salsabuf));
 	SHA512::hash(shabuf,salsabuf,sizeof(salsabuf));
 
 	d = difficulty;

+ 3 - 3
node/Node.cpp

@@ -88,9 +88,9 @@ Node::Node(
 	{
 		char foo[32];
 		Utils::getSecureRandom(foo,32);
-		_prng.init(foo,256,foo,8);
+		_prng.init(foo,256,foo);
 		memset(_prngStream,0,sizeof(_prngStream));
-		_prng.encrypt(_prngStream,_prngStream,sizeof(_prngStream));
+		_prng.encrypt12(_prngStream,_prngStream,sizeof(_prngStream));
 	}
 
 	std::string idtmp(dataStoreGet("identity.secret"));
@@ -574,7 +574,7 @@ uint64_t Node::prng()
 {
 	unsigned int p = (++_prngStreamPtr % (sizeof(_prngStream) / sizeof(uint64_t)));
 	if (!p)
-		_prng.encrypt(_prngStream,_prngStream,sizeof(_prngStream));
+		_prng.encrypt12(_prngStream,_prngStream,sizeof(_prngStream));
 	return _prngStream[p];
 }
 

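Taken together, the two Node.cpp hunks implement a small Salsa20/12-based PRNG: the constructor seeds a Salsa20 instance with 256 bits of secure random data (reused as both key and IV), fills _prngStream with keystream, and prng() then returns one 64-bit word per call, regenerating the whole buffer whenever the index wraps to zero. A sketch of that pattern, assuming ZeroTier's Salsa20 and Utils headers; the standalone names here (g_prng, g_stream, g_ptr, prngInit, randomU64) are illustrative, the real code uses the _prng/_prngStream/_prngStreamPtr members shown above:

	#include <stdint.h>
	#include <cstring>
	#include "node/Salsa20.hpp"
	#include "node/Utils.hpp"

	static ZeroTier::Salsa20 g_prng;
	static uint64_t g_stream[64];          // illustrative size; any small keystream buffer works
	static unsigned int g_ptr = 0;

	static void prngInit()
	{
		char seed[32];
		ZeroTier::Utils::getSecureRandom(seed,32);
		g_prng.init(seed,256,seed);        // same random bytes used as key and IV
		memset(g_stream,0,sizeof(g_stream));
		g_prng.encrypt12(g_stream,g_stream,sizeof(g_stream)); // buffer now holds keystream
	}

	static uint64_t randomU64()
	{
		unsigned int p = (++g_ptr % (sizeof(g_stream) / sizeof(uint64_t)));
		if (!p)                            // wrapped: regenerate the keystream buffer
			g_prng.encrypt12(g_stream,g_stream,sizeof(g_stream));
		return g_stream[p];
	}
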
+ 6 - 6
node/Packet.cpp

@@ -92,14 +92,14 @@ void Packet::armor(const void *key,bool encryptPayload)
 	setCipher(encryptPayload ? ZT_PROTO_CIPHER_SUITE__C25519_POLY1305_SALSA2012 : ZT_PROTO_CIPHER_SUITE__C25519_POLY1305_NONE);
 
 	_salsa20MangleKey((const unsigned char *)key,mangledKey);
-	Salsa20 s20(mangledKey,256,field(ZT_PACKET_IDX_IV,8),ZT_PROTO_SALSA20_ROUNDS);
+	Salsa20 s20(mangledKey,256,field(ZT_PACKET_IDX_IV,8)/*,ZT_PROTO_SALSA20_ROUNDS*/);
 
 	// MAC key is always the first 32 bytes of the Salsa20 key stream
 	// This is the same construction DJB's NaCl library uses
-	s20.encrypt(ZERO_KEY,macKey,sizeof(macKey));
+	s20.encrypt12(ZERO_KEY,macKey,sizeof(macKey));
 
 	if (encryptPayload)
-		s20.encrypt(payload,payload,payloadLen);
+		s20.encrypt12(payload,payload,payloadLen);
 
 	Poly1305::compute(mac,payload,payloadLen,macKey);
 	memcpy(field(ZT_PACKET_IDX_MAC,8),mac,8);
@@ -116,15 +116,15 @@ bool Packet::dearmor(const void *key)
 
 	if ((cs == ZT_PROTO_CIPHER_SUITE__C25519_POLY1305_NONE)||(cs == ZT_PROTO_CIPHER_SUITE__C25519_POLY1305_SALSA2012)) {
 		_salsa20MangleKey((const unsigned char *)key,mangledKey);
-		Salsa20 s20(mangledKey,256,field(ZT_PACKET_IDX_IV,8),ZT_PROTO_SALSA20_ROUNDS);
+		Salsa20 s20(mangledKey,256,field(ZT_PACKET_IDX_IV,8)/*,ZT_PROTO_SALSA20_ROUNDS*/);
 
-		s20.encrypt(ZERO_KEY,macKey,sizeof(macKey));
+		s20.encrypt12(ZERO_KEY,macKey,sizeof(macKey));
 		Poly1305::compute(mac,payload,payloadLen,macKey);
 		if (!Utils::secureEq(mac,field(ZT_PACKET_IDX_MAC,8),8))
 			return false;
 
 		if (cs == ZT_PROTO_CIPHER_SUITE__C25519_POLY1305_SALSA2012)
-			s20.decrypt(payload,payload,payloadLen);
+			s20.decrypt12(payload,payload,payloadLen);
 
 		return true;
 	} else return false; // unrecognized cipher suite

+ 1079 - 127
node/Salsa20.cpp

@@ -66,7 +66,7 @@ static const _s20sseconsts _S20SSECONSTANTS;
 
 namespace ZeroTier {
 
-void Salsa20::init(const void *key,unsigned int kbits,const void *iv,unsigned int rounds)
+void Salsa20::init(const void *key,unsigned int kbits,const void *iv)
 	throw()
 {
 #ifdef ZT_SALSA20_SSE
@@ -121,11 +121,9 @@ void Salsa20::init(const void *key,unsigned int kbits,const void *iv,unsigned in
 	_state.i[15] = U8TO32_LITTLE(constants + 12);
 	_state.i[0] = U8TO32_LITTLE(constants + 0);
 #endif
-
-	_roundsDiv4 = rounds / 4;
 }
 
-void Salsa20::encrypt(const void *in,void *out,unsigned int bytes)
+void Salsa20::encrypt12(const void *in,void *out,unsigned int bytes)
 	throw()
 {
 	uint8_t tmp[64];
@@ -181,61 +179,149 @@ void Salsa20::encrypt(const void *in,void *out,unsigned int bytes)
 		__m128i X2s = X2;
 		__m128i X3s = X3;
 
-		for (i=0;i<_roundsDiv4;++i) {
-			T = _mm_add_epi32(X0, X3);
-			X1 = _mm_xor_si128(_mm_xor_si128(X1, _mm_slli_epi32(T, 7)), _mm_srli_epi32(T, 25));
-			T = _mm_add_epi32(X1, X0);
-			X2 = _mm_xor_si128(_mm_xor_si128(X2, _mm_slli_epi32(T, 9)), _mm_srli_epi32(T, 23));
-			T = _mm_add_epi32(X2, X1);
-			X3 = _mm_xor_si128(_mm_xor_si128(X3, _mm_slli_epi32(T, 13)), _mm_srli_epi32(T, 19));
-			T = _mm_add_epi32(X3, X2);
-			X0 = _mm_xor_si128(_mm_xor_si128(X0, _mm_slli_epi32(T, 18)), _mm_srli_epi32(T, 14));
-
-			X1 = _mm_shuffle_epi32(X1, 0x93);
-			X2 = _mm_shuffle_epi32(X2, 0x4E);
-			X3 = _mm_shuffle_epi32(X3, 0x39);
-
-			T = _mm_add_epi32(X0, X1);
-			X3 = _mm_xor_si128(_mm_xor_si128(X3, _mm_slli_epi32(T, 7)), _mm_srli_epi32(T, 25));
-			T = _mm_add_epi32(X3, X0);
-			X2 = _mm_xor_si128(_mm_xor_si128(X2, _mm_slli_epi32(T, 9)), _mm_srli_epi32(T, 23));
-			T = _mm_add_epi32(X2, X3);
-			X1 = _mm_xor_si128(_mm_xor_si128(X1, _mm_slli_epi32(T, 13)), _mm_srli_epi32(T, 19));
-			T = _mm_add_epi32(X1, X2);
-			X0 = _mm_xor_si128(_mm_xor_si128(X0, _mm_slli_epi32(T, 18)), _mm_srli_epi32(T, 14));
-
-			X1 = _mm_shuffle_epi32(X1, 0x39);
-			X2 = _mm_shuffle_epi32(X2, 0x4E);
-			X3 = _mm_shuffle_epi32(X3, 0x93);
-
-			// --
-
-			T = _mm_add_epi32(X0, X3);
-			X1 = _mm_xor_si128(_mm_xor_si128(X1, _mm_slli_epi32(T, 7)), _mm_srli_epi32(T, 25));
-			T = _mm_add_epi32(X1, X0);
-			X2 = _mm_xor_si128(_mm_xor_si128(X2, _mm_slli_epi32(T, 9)), _mm_srli_epi32(T, 23));
-			T = _mm_add_epi32(X2, X1);
-			X3 = _mm_xor_si128(_mm_xor_si128(X3, _mm_slli_epi32(T, 13)), _mm_srli_epi32(T, 19));
-			T = _mm_add_epi32(X3, X2);
-			X0 = _mm_xor_si128(_mm_xor_si128(X0, _mm_slli_epi32(T, 18)), _mm_srli_epi32(T, 14));
-
-			X1 = _mm_shuffle_epi32(X1, 0x93);
-			X2 = _mm_shuffle_epi32(X2, 0x4E);
-			X3 = _mm_shuffle_epi32(X3, 0x39);
-
-			T = _mm_add_epi32(X0, X1);
-			X3 = _mm_xor_si128(_mm_xor_si128(X3, _mm_slli_epi32(T, 7)), _mm_srli_epi32(T, 25));
-			T = _mm_add_epi32(X3, X0);
-			X2 = _mm_xor_si128(_mm_xor_si128(X2, _mm_slli_epi32(T, 9)), _mm_srli_epi32(T, 23));
-			T = _mm_add_epi32(X2, X3);
-			X1 = _mm_xor_si128(_mm_xor_si128(X1, _mm_slli_epi32(T, 13)), _mm_srli_epi32(T, 19));
-			T = _mm_add_epi32(X1, X2);
-			X0 = _mm_xor_si128(_mm_xor_si128(X0, _mm_slli_epi32(T, 18)), _mm_srli_epi32(T, 14));
-
-			X1 = _mm_shuffle_epi32(X1, 0x39);
-			X2 = _mm_shuffle_epi32(X2, 0x4E);
-			X3 = _mm_shuffle_epi32(X3, 0x93);
-		}
+		// 2X round -------------------------------------------------------------
+		T = _mm_add_epi32(X0, X3);
+		X1 = _mm_xor_si128(_mm_xor_si128(X1, _mm_slli_epi32(T, 7)), _mm_srli_epi32(T, 25));
+		T = _mm_add_epi32(X1, X0);
+		X2 = _mm_xor_si128(_mm_xor_si128(X2, _mm_slli_epi32(T, 9)), _mm_srli_epi32(T, 23));
+		T = _mm_add_epi32(X2, X1);
+		X3 = _mm_xor_si128(_mm_xor_si128(X3, _mm_slli_epi32(T, 13)), _mm_srli_epi32(T, 19));
+		T = _mm_add_epi32(X3, X2);
+		X0 = _mm_xor_si128(_mm_xor_si128(X0, _mm_slli_epi32(T, 18)), _mm_srli_epi32(T, 14));
+		X1 = _mm_shuffle_epi32(X1, 0x93);
+		X2 = _mm_shuffle_epi32(X2, 0x4E);
+		X3 = _mm_shuffle_epi32(X3, 0x39);
+		T = _mm_add_epi32(X0, X1);
+		X3 = _mm_xor_si128(_mm_xor_si128(X3, _mm_slli_epi32(T, 7)), _mm_srli_epi32(T, 25));
+		T = _mm_add_epi32(X3, X0);
+		X2 = _mm_xor_si128(_mm_xor_si128(X2, _mm_slli_epi32(T, 9)), _mm_srli_epi32(T, 23));
+		T = _mm_add_epi32(X2, X3);
+		X1 = _mm_xor_si128(_mm_xor_si128(X1, _mm_slli_epi32(T, 13)), _mm_srli_epi32(T, 19));
+		T = _mm_add_epi32(X1, X2);
+		X0 = _mm_xor_si128(_mm_xor_si128(X0, _mm_slli_epi32(T, 18)), _mm_srli_epi32(T, 14));
+		X1 = _mm_shuffle_epi32(X1, 0x39);
+		X2 = _mm_shuffle_epi32(X2, 0x4E);
+		X3 = _mm_shuffle_epi32(X3, 0x93);
+
+		// 2X round -------------------------------------------------------------
+		T = _mm_add_epi32(X0, X3);
+		X1 = _mm_xor_si128(_mm_xor_si128(X1, _mm_slli_epi32(T, 7)), _mm_srli_epi32(T, 25));
+		T = _mm_add_epi32(X1, X0);
+		X2 = _mm_xor_si128(_mm_xor_si128(X2, _mm_slli_epi32(T, 9)), _mm_srli_epi32(T, 23));
+		T = _mm_add_epi32(X2, X1);
+		X3 = _mm_xor_si128(_mm_xor_si128(X3, _mm_slli_epi32(T, 13)), _mm_srli_epi32(T, 19));
+		T = _mm_add_epi32(X3, X2);
+		X0 = _mm_xor_si128(_mm_xor_si128(X0, _mm_slli_epi32(T, 18)), _mm_srli_epi32(T, 14));
+		X1 = _mm_shuffle_epi32(X1, 0x93);
+		X2 = _mm_shuffle_epi32(X2, 0x4E);
+		X3 = _mm_shuffle_epi32(X3, 0x39);
+		T = _mm_add_epi32(X0, X1);
+		X3 = _mm_xor_si128(_mm_xor_si128(X3, _mm_slli_epi32(T, 7)), _mm_srli_epi32(T, 25));
+		T = _mm_add_epi32(X3, X0);
+		X2 = _mm_xor_si128(_mm_xor_si128(X2, _mm_slli_epi32(T, 9)), _mm_srli_epi32(T, 23));
+		T = _mm_add_epi32(X2, X3);
+		X1 = _mm_xor_si128(_mm_xor_si128(X1, _mm_slli_epi32(T, 13)), _mm_srli_epi32(T, 19));
+		T = _mm_add_epi32(X1, X2);
+		X0 = _mm_xor_si128(_mm_xor_si128(X0, _mm_slli_epi32(T, 18)), _mm_srli_epi32(T, 14));
+		X1 = _mm_shuffle_epi32(X1, 0x39);
+		X2 = _mm_shuffle_epi32(X2, 0x4E);
+		X3 = _mm_shuffle_epi32(X3, 0x93);
+
+		// 2X round -------------------------------------------------------------
+		T = _mm_add_epi32(X0, X3);
+		X1 = _mm_xor_si128(_mm_xor_si128(X1, _mm_slli_epi32(T, 7)), _mm_srli_epi32(T, 25));
+		T = _mm_add_epi32(X1, X0);
+		X2 = _mm_xor_si128(_mm_xor_si128(X2, _mm_slli_epi32(T, 9)), _mm_srli_epi32(T, 23));
+		T = _mm_add_epi32(X2, X1);
+		X3 = _mm_xor_si128(_mm_xor_si128(X3, _mm_slli_epi32(T, 13)), _mm_srli_epi32(T, 19));
+		T = _mm_add_epi32(X3, X2);
+		X0 = _mm_xor_si128(_mm_xor_si128(X0, _mm_slli_epi32(T, 18)), _mm_srli_epi32(T, 14));
+		X1 = _mm_shuffle_epi32(X1, 0x93);
+		X2 = _mm_shuffle_epi32(X2, 0x4E);
+		X3 = _mm_shuffle_epi32(X3, 0x39);
+		T = _mm_add_epi32(X0, X1);
+		X3 = _mm_xor_si128(_mm_xor_si128(X3, _mm_slli_epi32(T, 7)), _mm_srli_epi32(T, 25));
+		T = _mm_add_epi32(X3, X0);
+		X2 = _mm_xor_si128(_mm_xor_si128(X2, _mm_slli_epi32(T, 9)), _mm_srli_epi32(T, 23));
+		T = _mm_add_epi32(X2, X3);
+		X1 = _mm_xor_si128(_mm_xor_si128(X1, _mm_slli_epi32(T, 13)), _mm_srli_epi32(T, 19));
+		T = _mm_add_epi32(X1, X2);
+		X0 = _mm_xor_si128(_mm_xor_si128(X0, _mm_slli_epi32(T, 18)), _mm_srli_epi32(T, 14));
+		X1 = _mm_shuffle_epi32(X1, 0x39);
+		X2 = _mm_shuffle_epi32(X2, 0x4E);
+		X3 = _mm_shuffle_epi32(X3, 0x93);
+
+		// 2X round -------------------------------------------------------------
+		T = _mm_add_epi32(X0, X3);
+		X1 = _mm_xor_si128(_mm_xor_si128(X1, _mm_slli_epi32(T, 7)), _mm_srli_epi32(T, 25));
+		T = _mm_add_epi32(X1, X0);
+		X2 = _mm_xor_si128(_mm_xor_si128(X2, _mm_slli_epi32(T, 9)), _mm_srli_epi32(T, 23));
+		T = _mm_add_epi32(X2, X1);
+		X3 = _mm_xor_si128(_mm_xor_si128(X3, _mm_slli_epi32(T, 13)), _mm_srli_epi32(T, 19));
+		T = _mm_add_epi32(X3, X2);
+		X0 = _mm_xor_si128(_mm_xor_si128(X0, _mm_slli_epi32(T, 18)), _mm_srli_epi32(T, 14));
+		X1 = _mm_shuffle_epi32(X1, 0x93);
+		X2 = _mm_shuffle_epi32(X2, 0x4E);
+		X3 = _mm_shuffle_epi32(X3, 0x39);
+		T = _mm_add_epi32(X0, X1);
+		X3 = _mm_xor_si128(_mm_xor_si128(X3, _mm_slli_epi32(T, 7)), _mm_srli_epi32(T, 25));
+		T = _mm_add_epi32(X3, X0);
+		X2 = _mm_xor_si128(_mm_xor_si128(X2, _mm_slli_epi32(T, 9)), _mm_srli_epi32(T, 23));
+		T = _mm_add_epi32(X2, X3);
+		X1 = _mm_xor_si128(_mm_xor_si128(X1, _mm_slli_epi32(T, 13)), _mm_srli_epi32(T, 19));
+		T = _mm_add_epi32(X1, X2);
+		X0 = _mm_xor_si128(_mm_xor_si128(X0, _mm_slli_epi32(T, 18)), _mm_srli_epi32(T, 14));
+		X1 = _mm_shuffle_epi32(X1, 0x39);
+		X2 = _mm_shuffle_epi32(X2, 0x4E);
+		X3 = _mm_shuffle_epi32(X3, 0x93);
+
+		// 2X round -------------------------------------------------------------
+		T = _mm_add_epi32(X0, X3);
+		X1 = _mm_xor_si128(_mm_xor_si128(X1, _mm_slli_epi32(T, 7)), _mm_srli_epi32(T, 25));
+		T = _mm_add_epi32(X1, X0);
+		X2 = _mm_xor_si128(_mm_xor_si128(X2, _mm_slli_epi32(T, 9)), _mm_srli_epi32(T, 23));
+		T = _mm_add_epi32(X2, X1);
+		X3 = _mm_xor_si128(_mm_xor_si128(X3, _mm_slli_epi32(T, 13)), _mm_srli_epi32(T, 19));
+		T = _mm_add_epi32(X3, X2);
+		X0 = _mm_xor_si128(_mm_xor_si128(X0, _mm_slli_epi32(T, 18)), _mm_srli_epi32(T, 14));
+		X1 = _mm_shuffle_epi32(X1, 0x93);
+		X2 = _mm_shuffle_epi32(X2, 0x4E);
+		X3 = _mm_shuffle_epi32(X3, 0x39);
+		T = _mm_add_epi32(X0, X1);
+		X3 = _mm_xor_si128(_mm_xor_si128(X3, _mm_slli_epi32(T, 7)), _mm_srli_epi32(T, 25));
+		T = _mm_add_epi32(X3, X0);
+		X2 = _mm_xor_si128(_mm_xor_si128(X2, _mm_slli_epi32(T, 9)), _mm_srli_epi32(T, 23));
+		T = _mm_add_epi32(X2, X3);
+		X1 = _mm_xor_si128(_mm_xor_si128(X1, _mm_slli_epi32(T, 13)), _mm_srli_epi32(T, 19));
+		T = _mm_add_epi32(X1, X2);
+		X0 = _mm_xor_si128(_mm_xor_si128(X0, _mm_slli_epi32(T, 18)), _mm_srli_epi32(T, 14));
+		X1 = _mm_shuffle_epi32(X1, 0x39);
+		X2 = _mm_shuffle_epi32(X2, 0x4E);
+		X3 = _mm_shuffle_epi32(X3, 0x93);
+
+		// 2X round -------------------------------------------------------------
+		T = _mm_add_epi32(X0, X3);
+		X1 = _mm_xor_si128(_mm_xor_si128(X1, _mm_slli_epi32(T, 7)), _mm_srli_epi32(T, 25));
+		T = _mm_add_epi32(X1, X0);
+		X2 = _mm_xor_si128(_mm_xor_si128(X2, _mm_slli_epi32(T, 9)), _mm_srli_epi32(T, 23));
+		T = _mm_add_epi32(X2, X1);
+		X3 = _mm_xor_si128(_mm_xor_si128(X3, _mm_slli_epi32(T, 13)), _mm_srli_epi32(T, 19));
+		T = _mm_add_epi32(X3, X2);
+		X0 = _mm_xor_si128(_mm_xor_si128(X0, _mm_slli_epi32(T, 18)), _mm_srli_epi32(T, 14));
+		X1 = _mm_shuffle_epi32(X1, 0x93);
+		X2 = _mm_shuffle_epi32(X2, 0x4E);
+		X3 = _mm_shuffle_epi32(X3, 0x39);
+		T = _mm_add_epi32(X0, X1);
+		X3 = _mm_xor_si128(_mm_xor_si128(X3, _mm_slli_epi32(T, 7)), _mm_srli_epi32(T, 25));
+		T = _mm_add_epi32(X3, X0);
+		X2 = _mm_xor_si128(_mm_xor_si128(X2, _mm_slli_epi32(T, 9)), _mm_srli_epi32(T, 23));
+		T = _mm_add_epi32(X2, X3);
+		X1 = _mm_xor_si128(_mm_xor_si128(X1, _mm_slli_epi32(T, 13)), _mm_srli_epi32(T, 19));
+		T = _mm_add_epi32(X1, X2);
+		X0 = _mm_xor_si128(_mm_xor_si128(X0, _mm_slli_epi32(T, 18)), _mm_srli_epi32(T, 14));
+		X1 = _mm_shuffle_epi32(X1, 0x39);
+		X2 = _mm_shuffle_epi32(X2, 0x4E);
+		X3 = _mm_shuffle_epi32(X3, 0x93);
 
 		X0 = _mm_add_epi32(X0s,X0);
 		X1 = _mm_add_epi32(X1s,X1);
@@ -273,76 +359,942 @@ void Salsa20::encrypt(const void *in,void *out,unsigned int bytes)
 		x14 = j14;
 		x15 = j15;
 
-		for(i=0;i<_roundsDiv4;++i) {
-			 x4 = XOR( x4,ROTATE(PLUS( x0,x12), 7));
-			 x8 = XOR( x8,ROTATE(PLUS( x4, x0), 9));
-			x12 = XOR(x12,ROTATE(PLUS( x8, x4),13));
-			 x0 = XOR( x0,ROTATE(PLUS(x12, x8),18));
-			 x9 = XOR( x9,ROTATE(PLUS( x5, x1), 7));
-			x13 = XOR(x13,ROTATE(PLUS( x9, x5), 9));
-			 x1 = XOR( x1,ROTATE(PLUS(x13, x9),13));
-			 x5 = XOR( x5,ROTATE(PLUS( x1,x13),18));
-			x14 = XOR(x14,ROTATE(PLUS(x10, x6), 7));
-			 x2 = XOR( x2,ROTATE(PLUS(x14,x10), 9));
-			 x6 = XOR( x6,ROTATE(PLUS( x2,x14),13));
-			x10 = XOR(x10,ROTATE(PLUS( x6, x2),18));
-			 x3 = XOR( x3,ROTATE(PLUS(x15,x11), 7));
-			 x7 = XOR( x7,ROTATE(PLUS( x3,x15), 9));
-			x11 = XOR(x11,ROTATE(PLUS( x7, x3),13));
-			x15 = XOR(x15,ROTATE(PLUS(x11, x7),18));
-			 x1 = XOR( x1,ROTATE(PLUS( x0, x3), 7));
-			 x2 = XOR( x2,ROTATE(PLUS( x1, x0), 9));
-			 x3 = XOR( x3,ROTATE(PLUS( x2, x1),13));
-			 x0 = XOR( x0,ROTATE(PLUS( x3, x2),18));
-			 x6 = XOR( x6,ROTATE(PLUS( x5, x4), 7));
-			 x7 = XOR( x7,ROTATE(PLUS( x6, x5), 9));
-			 x4 = XOR( x4,ROTATE(PLUS( x7, x6),13));
-			 x5 = XOR( x5,ROTATE(PLUS( x4, x7),18));
-			x11 = XOR(x11,ROTATE(PLUS(x10, x9), 7));
-			 x8 = XOR( x8,ROTATE(PLUS(x11,x10), 9));
-			 x9 = XOR( x9,ROTATE(PLUS( x8,x11),13));
-			x10 = XOR(x10,ROTATE(PLUS( x9, x8),18));
-			x12 = XOR(x12,ROTATE(PLUS(x15,x14), 7));
-			x13 = XOR(x13,ROTATE(PLUS(x12,x15), 9));
-			x14 = XOR(x14,ROTATE(PLUS(x13,x12),13));
-			x15 = XOR(x15,ROTATE(PLUS(x14,x13),18));
-
-			// --
-
-			 x4 = XOR( x4,ROTATE(PLUS( x0,x12), 7));
-			 x8 = XOR( x8,ROTATE(PLUS( x4, x0), 9));
-			x12 = XOR(x12,ROTATE(PLUS( x8, x4),13));
-			 x0 = XOR( x0,ROTATE(PLUS(x12, x8),18));
-			 x9 = XOR( x9,ROTATE(PLUS( x5, x1), 7));
-			x13 = XOR(x13,ROTATE(PLUS( x9, x5), 9));
-			 x1 = XOR( x1,ROTATE(PLUS(x13, x9),13));
-			 x5 = XOR( x5,ROTATE(PLUS( x1,x13),18));
-			x14 = XOR(x14,ROTATE(PLUS(x10, x6), 7));
-			 x2 = XOR( x2,ROTATE(PLUS(x14,x10), 9));
-			 x6 = XOR( x6,ROTATE(PLUS( x2,x14),13));
-			x10 = XOR(x10,ROTATE(PLUS( x6, x2),18));
-			 x3 = XOR( x3,ROTATE(PLUS(x15,x11), 7));
-			 x7 = XOR( x7,ROTATE(PLUS( x3,x15), 9));
-			x11 = XOR(x11,ROTATE(PLUS( x7, x3),13));
-			x15 = XOR(x15,ROTATE(PLUS(x11, x7),18));
-			 x1 = XOR( x1,ROTATE(PLUS( x0, x3), 7));
-			 x2 = XOR( x2,ROTATE(PLUS( x1, x0), 9));
-			 x3 = XOR( x3,ROTATE(PLUS( x2, x1),13));
-			 x0 = XOR( x0,ROTATE(PLUS( x3, x2),18));
-			 x6 = XOR( x6,ROTATE(PLUS( x5, x4), 7));
-			 x7 = XOR( x7,ROTATE(PLUS( x6, x5), 9));
-			 x4 = XOR( x4,ROTATE(PLUS( x7, x6),13));
-			 x5 = XOR( x5,ROTATE(PLUS( x4, x7),18));
-			x11 = XOR(x11,ROTATE(PLUS(x10, x9), 7));
-			 x8 = XOR( x8,ROTATE(PLUS(x11,x10), 9));
-			 x9 = XOR( x9,ROTATE(PLUS( x8,x11),13));
-			x10 = XOR(x10,ROTATE(PLUS( x9, x8),18));
-			x12 = XOR(x12,ROTATE(PLUS(x15,x14), 7));
-			x13 = XOR(x13,ROTATE(PLUS(x12,x15), 9));
-			x14 = XOR(x14,ROTATE(PLUS(x13,x12),13));
-			x15 = XOR(x15,ROTATE(PLUS(x14,x13),18));
+		// 2X round -------------------------------------------------------------
+		 x4 = XOR( x4,ROTATE(PLUS( x0,x12), 7));
+		 x8 = XOR( x8,ROTATE(PLUS( x4, x0), 9));
+		x12 = XOR(x12,ROTATE(PLUS( x8, x4),13));
+		 x0 = XOR( x0,ROTATE(PLUS(x12, x8),18));
+		 x9 = XOR( x9,ROTATE(PLUS( x5, x1), 7));
+		x13 = XOR(x13,ROTATE(PLUS( x9, x5), 9));
+		 x1 = XOR( x1,ROTATE(PLUS(x13, x9),13));
+		 x5 = XOR( x5,ROTATE(PLUS( x1,x13),18));
+		x14 = XOR(x14,ROTATE(PLUS(x10, x6), 7));
+		 x2 = XOR( x2,ROTATE(PLUS(x14,x10), 9));
+		 x6 = XOR( x6,ROTATE(PLUS( x2,x14),13));
+		x10 = XOR(x10,ROTATE(PLUS( x6, x2),18));
+		 x3 = XOR( x3,ROTATE(PLUS(x15,x11), 7));
+		 x7 = XOR( x7,ROTATE(PLUS( x3,x15), 9));
+		x11 = XOR(x11,ROTATE(PLUS( x7, x3),13));
+		x15 = XOR(x15,ROTATE(PLUS(x11, x7),18));
+		 x1 = XOR( x1,ROTATE(PLUS( x0, x3), 7));
+		 x2 = XOR( x2,ROTATE(PLUS( x1, x0), 9));
+		 x3 = XOR( x3,ROTATE(PLUS( x2, x1),13));
+		 x0 = XOR( x0,ROTATE(PLUS( x3, x2),18));
+		 x6 = XOR( x6,ROTATE(PLUS( x5, x4), 7));
+		 x7 = XOR( x7,ROTATE(PLUS( x6, x5), 9));
+		 x4 = XOR( x4,ROTATE(PLUS( x7, x6),13));
+		 x5 = XOR( x5,ROTATE(PLUS( x4, x7),18));
+		x11 = XOR(x11,ROTATE(PLUS(x10, x9), 7));
+		 x8 = XOR( x8,ROTATE(PLUS(x11,x10), 9));
+		 x9 = XOR( x9,ROTATE(PLUS( x8,x11),13));
+		x10 = XOR(x10,ROTATE(PLUS( x9, x8),18));
+		x12 = XOR(x12,ROTATE(PLUS(x15,x14), 7));
+		x13 = XOR(x13,ROTATE(PLUS(x12,x15), 9));
+		x14 = XOR(x14,ROTATE(PLUS(x13,x12),13));
+		x15 = XOR(x15,ROTATE(PLUS(x14,x13),18));
+
+		// 2X round -------------------------------------------------------------
+		 x4 = XOR( x4,ROTATE(PLUS( x0,x12), 7));
+		 x8 = XOR( x8,ROTATE(PLUS( x4, x0), 9));
+		x12 = XOR(x12,ROTATE(PLUS( x8, x4),13));
+		 x0 = XOR( x0,ROTATE(PLUS(x12, x8),18));
+		 x9 = XOR( x9,ROTATE(PLUS( x5, x1), 7));
+		x13 = XOR(x13,ROTATE(PLUS( x9, x5), 9));
+		 x1 = XOR( x1,ROTATE(PLUS(x13, x9),13));
+		 x5 = XOR( x5,ROTATE(PLUS( x1,x13),18));
+		x14 = XOR(x14,ROTATE(PLUS(x10, x6), 7));
+		 x2 = XOR( x2,ROTATE(PLUS(x14,x10), 9));
+		 x6 = XOR( x6,ROTATE(PLUS( x2,x14),13));
+		x10 = XOR(x10,ROTATE(PLUS( x6, x2),18));
+		 x3 = XOR( x3,ROTATE(PLUS(x15,x11), 7));
+		 x7 = XOR( x7,ROTATE(PLUS( x3,x15), 9));
+		x11 = XOR(x11,ROTATE(PLUS( x7, x3),13));
+		x15 = XOR(x15,ROTATE(PLUS(x11, x7),18));
+		 x1 = XOR( x1,ROTATE(PLUS( x0, x3), 7));
+		 x2 = XOR( x2,ROTATE(PLUS( x1, x0), 9));
+		 x3 = XOR( x3,ROTATE(PLUS( x2, x1),13));
+		 x0 = XOR( x0,ROTATE(PLUS( x3, x2),18));
+		 x6 = XOR( x6,ROTATE(PLUS( x5, x4), 7));
+		 x7 = XOR( x7,ROTATE(PLUS( x6, x5), 9));
+		 x4 = XOR( x4,ROTATE(PLUS( x7, x6),13));
+		 x5 = XOR( x5,ROTATE(PLUS( x4, x7),18));
+		x11 = XOR(x11,ROTATE(PLUS(x10, x9), 7));
+		 x8 = XOR( x8,ROTATE(PLUS(x11,x10), 9));
+		 x9 = XOR( x9,ROTATE(PLUS( x8,x11),13));
+		x10 = XOR(x10,ROTATE(PLUS( x9, x8),18));
+		x12 = XOR(x12,ROTATE(PLUS(x15,x14), 7));
+		x13 = XOR(x13,ROTATE(PLUS(x12,x15), 9));
+		x14 = XOR(x14,ROTATE(PLUS(x13,x12),13));
+		x15 = XOR(x15,ROTATE(PLUS(x14,x13),18));
+
+		// 2X round -------------------------------------------------------------
+		 x4 = XOR( x4,ROTATE(PLUS( x0,x12), 7));
+		 x8 = XOR( x8,ROTATE(PLUS( x4, x0), 9));
+		x12 = XOR(x12,ROTATE(PLUS( x8, x4),13));
+		 x0 = XOR( x0,ROTATE(PLUS(x12, x8),18));
+		 x9 = XOR( x9,ROTATE(PLUS( x5, x1), 7));
+		x13 = XOR(x13,ROTATE(PLUS( x9, x5), 9));
+		 x1 = XOR( x1,ROTATE(PLUS(x13, x9),13));
+		 x5 = XOR( x5,ROTATE(PLUS( x1,x13),18));
+		x14 = XOR(x14,ROTATE(PLUS(x10, x6), 7));
+		 x2 = XOR( x2,ROTATE(PLUS(x14,x10), 9));
+		 x6 = XOR( x6,ROTATE(PLUS( x2,x14),13));
+		x10 = XOR(x10,ROTATE(PLUS( x6, x2),18));
+		 x3 = XOR( x3,ROTATE(PLUS(x15,x11), 7));
+		 x7 = XOR( x7,ROTATE(PLUS( x3,x15), 9));
+		x11 = XOR(x11,ROTATE(PLUS( x7, x3),13));
+		x15 = XOR(x15,ROTATE(PLUS(x11, x7),18));
+		 x1 = XOR( x1,ROTATE(PLUS( x0, x3), 7));
+		 x2 = XOR( x2,ROTATE(PLUS( x1, x0), 9));
+		 x3 = XOR( x3,ROTATE(PLUS( x2, x1),13));
+		 x0 = XOR( x0,ROTATE(PLUS( x3, x2),18));
+		 x6 = XOR( x6,ROTATE(PLUS( x5, x4), 7));
+		 x7 = XOR( x7,ROTATE(PLUS( x6, x5), 9));
+		 x4 = XOR( x4,ROTATE(PLUS( x7, x6),13));
+		 x5 = XOR( x5,ROTATE(PLUS( x4, x7),18));
+		x11 = XOR(x11,ROTATE(PLUS(x10, x9), 7));
+		 x8 = XOR( x8,ROTATE(PLUS(x11,x10), 9));
+		 x9 = XOR( x9,ROTATE(PLUS( x8,x11),13));
+		x10 = XOR(x10,ROTATE(PLUS( x9, x8),18));
+		x12 = XOR(x12,ROTATE(PLUS(x15,x14), 7));
+		x13 = XOR(x13,ROTATE(PLUS(x12,x15), 9));
+		x14 = XOR(x14,ROTATE(PLUS(x13,x12),13));
+		x15 = XOR(x15,ROTATE(PLUS(x14,x13),18));
+
+		// 2X round -------------------------------------------------------------
+		 x4 = XOR( x4,ROTATE(PLUS( x0,x12), 7));
+		 x8 = XOR( x8,ROTATE(PLUS( x4, x0), 9));
+		x12 = XOR(x12,ROTATE(PLUS( x8, x4),13));
+		 x0 = XOR( x0,ROTATE(PLUS(x12, x8),18));
+		 x9 = XOR( x9,ROTATE(PLUS( x5, x1), 7));
+		x13 = XOR(x13,ROTATE(PLUS( x9, x5), 9));
+		 x1 = XOR( x1,ROTATE(PLUS(x13, x9),13));
+		 x5 = XOR( x5,ROTATE(PLUS( x1,x13),18));
+		x14 = XOR(x14,ROTATE(PLUS(x10, x6), 7));
+		 x2 = XOR( x2,ROTATE(PLUS(x14,x10), 9));
+		 x6 = XOR( x6,ROTATE(PLUS( x2,x14),13));
+		x10 = XOR(x10,ROTATE(PLUS( x6, x2),18));
+		 x3 = XOR( x3,ROTATE(PLUS(x15,x11), 7));
+		 x7 = XOR( x7,ROTATE(PLUS( x3,x15), 9));
+		x11 = XOR(x11,ROTATE(PLUS( x7, x3),13));
+		x15 = XOR(x15,ROTATE(PLUS(x11, x7),18));
+		 x1 = XOR( x1,ROTATE(PLUS( x0, x3), 7));
+		 x2 = XOR( x2,ROTATE(PLUS( x1, x0), 9));
+		 x3 = XOR( x3,ROTATE(PLUS( x2, x1),13));
+		 x0 = XOR( x0,ROTATE(PLUS( x3, x2),18));
+		 x6 = XOR( x6,ROTATE(PLUS( x5, x4), 7));
+		 x7 = XOR( x7,ROTATE(PLUS( x6, x5), 9));
+		 x4 = XOR( x4,ROTATE(PLUS( x7, x6),13));
+		 x5 = XOR( x5,ROTATE(PLUS( x4, x7),18));
+		x11 = XOR(x11,ROTATE(PLUS(x10, x9), 7));
+		 x8 = XOR( x8,ROTATE(PLUS(x11,x10), 9));
+		 x9 = XOR( x9,ROTATE(PLUS( x8,x11),13));
+		x10 = XOR(x10,ROTATE(PLUS( x9, x8),18));
+		x12 = XOR(x12,ROTATE(PLUS(x15,x14), 7));
+		x13 = XOR(x13,ROTATE(PLUS(x12,x15), 9));
+		x14 = XOR(x14,ROTATE(PLUS(x13,x12),13));
+		x15 = XOR(x15,ROTATE(PLUS(x14,x13),18));
+
+		// 2X round -------------------------------------------------------------
+		 x4 = XOR( x4,ROTATE(PLUS( x0,x12), 7));
+		 x8 = XOR( x8,ROTATE(PLUS( x4, x0), 9));
+		x12 = XOR(x12,ROTATE(PLUS( x8, x4),13));
+		 x0 = XOR( x0,ROTATE(PLUS(x12, x8),18));
+		 x9 = XOR( x9,ROTATE(PLUS( x5, x1), 7));
+		x13 = XOR(x13,ROTATE(PLUS( x9, x5), 9));
+		 x1 = XOR( x1,ROTATE(PLUS(x13, x9),13));
+		 x5 = XOR( x5,ROTATE(PLUS( x1,x13),18));
+		x14 = XOR(x14,ROTATE(PLUS(x10, x6), 7));
+		 x2 = XOR( x2,ROTATE(PLUS(x14,x10), 9));
+		 x6 = XOR( x6,ROTATE(PLUS( x2,x14),13));
+		x10 = XOR(x10,ROTATE(PLUS( x6, x2),18));
+		 x3 = XOR( x3,ROTATE(PLUS(x15,x11), 7));
+		 x7 = XOR( x7,ROTATE(PLUS( x3,x15), 9));
+		x11 = XOR(x11,ROTATE(PLUS( x7, x3),13));
+		x15 = XOR(x15,ROTATE(PLUS(x11, x7),18));
+		 x1 = XOR( x1,ROTATE(PLUS( x0, x3), 7));
+		 x2 = XOR( x2,ROTATE(PLUS( x1, x0), 9));
+		 x3 = XOR( x3,ROTATE(PLUS( x2, x1),13));
+		 x0 = XOR( x0,ROTATE(PLUS( x3, x2),18));
+		 x6 = XOR( x6,ROTATE(PLUS( x5, x4), 7));
+		 x7 = XOR( x7,ROTATE(PLUS( x6, x5), 9));
+		 x4 = XOR( x4,ROTATE(PLUS( x7, x6),13));
+		 x5 = XOR( x5,ROTATE(PLUS( x4, x7),18));
+		x11 = XOR(x11,ROTATE(PLUS(x10, x9), 7));
+		 x8 = XOR( x8,ROTATE(PLUS(x11,x10), 9));
+		 x9 = XOR( x9,ROTATE(PLUS( x8,x11),13));
+		x10 = XOR(x10,ROTATE(PLUS( x9, x8),18));
+		x12 = XOR(x12,ROTATE(PLUS(x15,x14), 7));
+		x13 = XOR(x13,ROTATE(PLUS(x12,x15), 9));
+		x14 = XOR(x14,ROTATE(PLUS(x13,x12),13));
+		x15 = XOR(x15,ROTATE(PLUS(x14,x13),18));
+
+		// 2X round -------------------------------------------------------------
+		 x4 = XOR( x4,ROTATE(PLUS( x0,x12), 7));
+		 x8 = XOR( x8,ROTATE(PLUS( x4, x0), 9));
+		x12 = XOR(x12,ROTATE(PLUS( x8, x4),13));
+		 x0 = XOR( x0,ROTATE(PLUS(x12, x8),18));
+		 x9 = XOR( x9,ROTATE(PLUS( x5, x1), 7));
+		x13 = XOR(x13,ROTATE(PLUS( x9, x5), 9));
+		 x1 = XOR( x1,ROTATE(PLUS(x13, x9),13));
+		 x5 = XOR( x5,ROTATE(PLUS( x1,x13),18));
+		x14 = XOR(x14,ROTATE(PLUS(x10, x6), 7));
+		 x2 = XOR( x2,ROTATE(PLUS(x14,x10), 9));
+		 x6 = XOR( x6,ROTATE(PLUS( x2,x14),13));
+		x10 = XOR(x10,ROTATE(PLUS( x6, x2),18));
+		 x3 = XOR( x3,ROTATE(PLUS(x15,x11), 7));
+		 x7 = XOR( x7,ROTATE(PLUS( x3,x15), 9));
+		x11 = XOR(x11,ROTATE(PLUS( x7, x3),13));
+		x15 = XOR(x15,ROTATE(PLUS(x11, x7),18));
+		 x1 = XOR( x1,ROTATE(PLUS( x0, x3), 7));
+		 x2 = XOR( x2,ROTATE(PLUS( x1, x0), 9));
+		 x3 = XOR( x3,ROTATE(PLUS( x2, x1),13));
+		 x0 = XOR( x0,ROTATE(PLUS( x3, x2),18));
+		 x6 = XOR( x6,ROTATE(PLUS( x5, x4), 7));
+		 x7 = XOR( x7,ROTATE(PLUS( x6, x5), 9));
+		 x4 = XOR( x4,ROTATE(PLUS( x7, x6),13));
+		 x5 = XOR( x5,ROTATE(PLUS( x4, x7),18));
+		x11 = XOR(x11,ROTATE(PLUS(x10, x9), 7));
+		 x8 = XOR( x8,ROTATE(PLUS(x11,x10), 9));
+		 x9 = XOR( x9,ROTATE(PLUS( x8,x11),13));
+		x10 = XOR(x10,ROTATE(PLUS( x9, x8),18));
+		x12 = XOR(x12,ROTATE(PLUS(x15,x14), 7));
+		x13 = XOR(x13,ROTATE(PLUS(x12,x15), 9));
+		x14 = XOR(x14,ROTATE(PLUS(x13,x12),13));
+		x15 = XOR(x15,ROTATE(PLUS(x14,x13),18));
+
+		x0 = PLUS(x0,j0);
+		x1 = PLUS(x1,j1);
+		x2 = PLUS(x2,j2);
+		x3 = PLUS(x3,j3);
+		x4 = PLUS(x4,j4);
+		x5 = PLUS(x5,j5);
+		x6 = PLUS(x6,j6);
+		x7 = PLUS(x7,j7);
+		x8 = PLUS(x8,j8);
+		x9 = PLUS(x9,j9);
+		x10 = PLUS(x10,j10);
+		x11 = PLUS(x11,j11);
+		x12 = PLUS(x12,j12);
+		x13 = PLUS(x13,j13);
+		x14 = PLUS(x14,j14);
+		x15 = PLUS(x15,j15);
+
+		U32TO8_LITTLE(c + 0,XOR(x0,U8TO32_LITTLE(m + 0)));
+		U32TO8_LITTLE(c + 4,XOR(x1,U8TO32_LITTLE(m + 4)));
+		U32TO8_LITTLE(c + 8,XOR(x2,U8TO32_LITTLE(m + 8)));
+		U32TO8_LITTLE(c + 12,XOR(x3,U8TO32_LITTLE(m + 12)));
+		U32TO8_LITTLE(c + 16,XOR(x4,U8TO32_LITTLE(m + 16)));
+		U32TO8_LITTLE(c + 20,XOR(x5,U8TO32_LITTLE(m + 20)));
+		U32TO8_LITTLE(c + 24,XOR(x6,U8TO32_LITTLE(m + 24)));
+		U32TO8_LITTLE(c + 28,XOR(x7,U8TO32_LITTLE(m + 28)));
+		U32TO8_LITTLE(c + 32,XOR(x8,U8TO32_LITTLE(m + 32)));
+		U32TO8_LITTLE(c + 36,XOR(x9,U8TO32_LITTLE(m + 36)));
+		U32TO8_LITTLE(c + 40,XOR(x10,U8TO32_LITTLE(m + 40)));
+		U32TO8_LITTLE(c + 44,XOR(x11,U8TO32_LITTLE(m + 44)));
+		U32TO8_LITTLE(c + 48,XOR(x12,U8TO32_LITTLE(m + 48)));
+		U32TO8_LITTLE(c + 52,XOR(x13,U8TO32_LITTLE(m + 52)));
+		U32TO8_LITTLE(c + 56,XOR(x14,U8TO32_LITTLE(m + 56)));
+		U32TO8_LITTLE(c + 60,XOR(x15,U8TO32_LITTLE(m + 60)));
+
+		if (!(++j8)) {
+			++j9;
+			/* stopping at 2^70 bytes per nonce is user's responsibility */
+		}
+#endif
+
+		if (bytes <= 64) {
+			if (bytes < 64) {
+				for (i = 0;i < bytes;++i)
+					ctarget[i] = c[i];
+			}
+
+#ifndef ZT_SALSA20_SSE
+			_state.i[8] = j8;
+			_state.i[9] = j9;
+#endif
+
+			return;
+		}
+
+		bytes -= 64;
+		c += 64;
+		m += 64;
+	}
+}
+
+void Salsa20::encrypt20(const void *in,void *out,unsigned int bytes)
+	throw()
+{
+	uint8_t tmp[64];
+	const uint8_t *m = (const uint8_t *)in;
+	uint8_t *c = (uint8_t *)out;
+	uint8_t *ctarget = c;
+	unsigned int i;
+
+#ifndef ZT_SALSA20_SSE
+	uint32_t x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15;
+	uint32_t j0, j1, j2, j3, j4, j5, j6, j7, j8, j9, j10, j11, j12, j13, j14, j15;
+#endif
+
+	if (!bytes)
+		return;
+
+#ifndef ZT_SALSA20_SSE
+	j0 = _state.i[0];
+	j1 = _state.i[1];
+	j2 = _state.i[2];
+	j3 = _state.i[3];
+	j4 = _state.i[4];
+	j5 = _state.i[5];
+	j6 = _state.i[6];
+	j7 = _state.i[7];
+	j8 = _state.i[8];
+	j9 = _state.i[9];
+	j10 = _state.i[10];
+	j11 = _state.i[11];
+	j12 = _state.i[12];
+	j13 = _state.i[13];
+	j14 = _state.i[14];
+	j15 = _state.i[15];
+#endif
+
+	for (;;) {
+		if (bytes < 64) {
+			for (i = 0;i < bytes;++i)
+				tmp[i] = m[i];
+			m = tmp;
+			ctarget = c;
+			c = tmp;
 		}
 
+#ifdef ZT_SALSA20_SSE
+		__m128i X0 = _mm_loadu_si128((const __m128i *)&(_state.v[0]));
+		__m128i X1 = _mm_loadu_si128((const __m128i *)&(_state.v[1]));
+		__m128i X2 = _mm_loadu_si128((const __m128i *)&(_state.v[2]));
+		__m128i X3 = _mm_loadu_si128((const __m128i *)&(_state.v[3]));
+		__m128i T;
+		__m128i X0s = X0;
+		__m128i X1s = X1;
+		__m128i X2s = X2;
+		__m128i X3s = X3;
+
+		// 2X round -------------------------------------------------------------
+		T = _mm_add_epi32(X0, X3);
+		X1 = _mm_xor_si128(_mm_xor_si128(X1, _mm_slli_epi32(T, 7)), _mm_srli_epi32(T, 25));
+		T = _mm_add_epi32(X1, X0);
+		X2 = _mm_xor_si128(_mm_xor_si128(X2, _mm_slli_epi32(T, 9)), _mm_srli_epi32(T, 23));
+		T = _mm_add_epi32(X2, X1);
+		X3 = _mm_xor_si128(_mm_xor_si128(X3, _mm_slli_epi32(T, 13)), _mm_srli_epi32(T, 19));
+		T = _mm_add_epi32(X3, X2);
+		X0 = _mm_xor_si128(_mm_xor_si128(X0, _mm_slli_epi32(T, 18)), _mm_srli_epi32(T, 14));
+		X1 = _mm_shuffle_epi32(X1, 0x93);
+		X2 = _mm_shuffle_epi32(X2, 0x4E);
+		X3 = _mm_shuffle_epi32(X3, 0x39);
+		T = _mm_add_epi32(X0, X1);
+		X3 = _mm_xor_si128(_mm_xor_si128(X3, _mm_slli_epi32(T, 7)), _mm_srli_epi32(T, 25));
+		T = _mm_add_epi32(X3, X0);
+		X2 = _mm_xor_si128(_mm_xor_si128(X2, _mm_slli_epi32(T, 9)), _mm_srli_epi32(T, 23));
+		T = _mm_add_epi32(X2, X3);
+		X1 = _mm_xor_si128(_mm_xor_si128(X1, _mm_slli_epi32(T, 13)), _mm_srli_epi32(T, 19));
+		T = _mm_add_epi32(X1, X2);
+		X0 = _mm_xor_si128(_mm_xor_si128(X0, _mm_slli_epi32(T, 18)), _mm_srli_epi32(T, 14));
+		X1 = _mm_shuffle_epi32(X1, 0x39);
+		X2 = _mm_shuffle_epi32(X2, 0x4E);
+		X3 = _mm_shuffle_epi32(X3, 0x93);
+
+		// 2X round -------------------------------------------------------------
+		T = _mm_add_epi32(X0, X3);
+		X1 = _mm_xor_si128(_mm_xor_si128(X1, _mm_slli_epi32(T, 7)), _mm_srli_epi32(T, 25));
+		T = _mm_add_epi32(X1, X0);
+		X2 = _mm_xor_si128(_mm_xor_si128(X2, _mm_slli_epi32(T, 9)), _mm_srli_epi32(T, 23));
+		T = _mm_add_epi32(X2, X1);
+		X3 = _mm_xor_si128(_mm_xor_si128(X3, _mm_slli_epi32(T, 13)), _mm_srli_epi32(T, 19));
+		T = _mm_add_epi32(X3, X2);
+		X0 = _mm_xor_si128(_mm_xor_si128(X0, _mm_slli_epi32(T, 18)), _mm_srli_epi32(T, 14));
+		X1 = _mm_shuffle_epi32(X1, 0x93);
+		X2 = _mm_shuffle_epi32(X2, 0x4E);
+		X3 = _mm_shuffle_epi32(X3, 0x39);
+		T = _mm_add_epi32(X0, X1);
+		X3 = _mm_xor_si128(_mm_xor_si128(X3, _mm_slli_epi32(T, 7)), _mm_srli_epi32(T, 25));
+		T = _mm_add_epi32(X3, X0);
+		X2 = _mm_xor_si128(_mm_xor_si128(X2, _mm_slli_epi32(T, 9)), _mm_srli_epi32(T, 23));
+		T = _mm_add_epi32(X2, X3);
+		X1 = _mm_xor_si128(_mm_xor_si128(X1, _mm_slli_epi32(T, 13)), _mm_srli_epi32(T, 19));
+		T = _mm_add_epi32(X1, X2);
+		X0 = _mm_xor_si128(_mm_xor_si128(X0, _mm_slli_epi32(T, 18)), _mm_srli_epi32(T, 14));
+		X1 = _mm_shuffle_epi32(X1, 0x39);
+		X2 = _mm_shuffle_epi32(X2, 0x4E);
+		X3 = _mm_shuffle_epi32(X3, 0x93);
+
+		// 2X round -------------------------------------------------------------
+		T = _mm_add_epi32(X0, X3);
+		X1 = _mm_xor_si128(_mm_xor_si128(X1, _mm_slli_epi32(T, 7)), _mm_srli_epi32(T, 25));
+		T = _mm_add_epi32(X1, X0);
+		X2 = _mm_xor_si128(_mm_xor_si128(X2, _mm_slli_epi32(T, 9)), _mm_srli_epi32(T, 23));
+		T = _mm_add_epi32(X2, X1);
+		X3 = _mm_xor_si128(_mm_xor_si128(X3, _mm_slli_epi32(T, 13)), _mm_srli_epi32(T, 19));
+		T = _mm_add_epi32(X3, X2);
+		X0 = _mm_xor_si128(_mm_xor_si128(X0, _mm_slli_epi32(T, 18)), _mm_srli_epi32(T, 14));
+		X1 = _mm_shuffle_epi32(X1, 0x93);
+		X2 = _mm_shuffle_epi32(X2, 0x4E);
+		X3 = _mm_shuffle_epi32(X3, 0x39);
+		T = _mm_add_epi32(X0, X1);
+		X3 = _mm_xor_si128(_mm_xor_si128(X3, _mm_slli_epi32(T, 7)), _mm_srli_epi32(T, 25));
+		T = _mm_add_epi32(X3, X0);
+		X2 = _mm_xor_si128(_mm_xor_si128(X2, _mm_slli_epi32(T, 9)), _mm_srli_epi32(T, 23));
+		T = _mm_add_epi32(X2, X3);
+		X1 = _mm_xor_si128(_mm_xor_si128(X1, _mm_slli_epi32(T, 13)), _mm_srli_epi32(T, 19));
+		T = _mm_add_epi32(X1, X2);
+		X0 = _mm_xor_si128(_mm_xor_si128(X0, _mm_slli_epi32(T, 18)), _mm_srli_epi32(T, 14));
+		X1 = _mm_shuffle_epi32(X1, 0x39);
+		X2 = _mm_shuffle_epi32(X2, 0x4E);
+		X3 = _mm_shuffle_epi32(X3, 0x93);
+
+		// 2X round -------------------------------------------------------------
+		T = _mm_add_epi32(X0, X3);
+		X1 = _mm_xor_si128(_mm_xor_si128(X1, _mm_slli_epi32(T, 7)), _mm_srli_epi32(T, 25));
+		T = _mm_add_epi32(X1, X0);
+		X2 = _mm_xor_si128(_mm_xor_si128(X2, _mm_slli_epi32(T, 9)), _mm_srli_epi32(T, 23));
+		T = _mm_add_epi32(X2, X1);
+		X3 = _mm_xor_si128(_mm_xor_si128(X3, _mm_slli_epi32(T, 13)), _mm_srli_epi32(T, 19));
+		T = _mm_add_epi32(X3, X2);
+		X0 = _mm_xor_si128(_mm_xor_si128(X0, _mm_slli_epi32(T, 18)), _mm_srli_epi32(T, 14));
+		X1 = _mm_shuffle_epi32(X1, 0x93);
+		X2 = _mm_shuffle_epi32(X2, 0x4E);
+		X3 = _mm_shuffle_epi32(X3, 0x39);
+		T = _mm_add_epi32(X0, X1);
+		X3 = _mm_xor_si128(_mm_xor_si128(X3, _mm_slli_epi32(T, 7)), _mm_srli_epi32(T, 25));
+		T = _mm_add_epi32(X3, X0);
+		X2 = _mm_xor_si128(_mm_xor_si128(X2, _mm_slli_epi32(T, 9)), _mm_srli_epi32(T, 23));
+		T = _mm_add_epi32(X2, X3);
+		X1 = _mm_xor_si128(_mm_xor_si128(X1, _mm_slli_epi32(T, 13)), _mm_srli_epi32(T, 19));
+		T = _mm_add_epi32(X1, X2);
+		X0 = _mm_xor_si128(_mm_xor_si128(X0, _mm_slli_epi32(T, 18)), _mm_srli_epi32(T, 14));
+		X1 = _mm_shuffle_epi32(X1, 0x39);
+		X2 = _mm_shuffle_epi32(X2, 0x4E);
+		X3 = _mm_shuffle_epi32(X3, 0x93);
+
+		// 2X round -------------------------------------------------------------
+		T = _mm_add_epi32(X0, X3);
+		X1 = _mm_xor_si128(_mm_xor_si128(X1, _mm_slli_epi32(T, 7)), _mm_srli_epi32(T, 25));
+		T = _mm_add_epi32(X1, X0);
+		X2 = _mm_xor_si128(_mm_xor_si128(X2, _mm_slli_epi32(T, 9)), _mm_srli_epi32(T, 23));
+		T = _mm_add_epi32(X2, X1);
+		X3 = _mm_xor_si128(_mm_xor_si128(X3, _mm_slli_epi32(T, 13)), _mm_srli_epi32(T, 19));
+		T = _mm_add_epi32(X3, X2);
+		X0 = _mm_xor_si128(_mm_xor_si128(X0, _mm_slli_epi32(T, 18)), _mm_srli_epi32(T, 14));
+		X1 = _mm_shuffle_epi32(X1, 0x93);
+		X2 = _mm_shuffle_epi32(X2, 0x4E);
+		X3 = _mm_shuffle_epi32(X3, 0x39);
+		T = _mm_add_epi32(X0, X1);
+		X3 = _mm_xor_si128(_mm_xor_si128(X3, _mm_slli_epi32(T, 7)), _mm_srli_epi32(T, 25));
+		T = _mm_add_epi32(X3, X0);
+		X2 = _mm_xor_si128(_mm_xor_si128(X2, _mm_slli_epi32(T, 9)), _mm_srli_epi32(T, 23));
+		T = _mm_add_epi32(X2, X3);
+		X1 = _mm_xor_si128(_mm_xor_si128(X1, _mm_slli_epi32(T, 13)), _mm_srli_epi32(T, 19));
+		T = _mm_add_epi32(X1, X2);
+		X0 = _mm_xor_si128(_mm_xor_si128(X0, _mm_slli_epi32(T, 18)), _mm_srli_epi32(T, 14));
+		X1 = _mm_shuffle_epi32(X1, 0x39);
+		X2 = _mm_shuffle_epi32(X2, 0x4E);
+		X3 = _mm_shuffle_epi32(X3, 0x93);
+
+		// 2X round -------------------------------------------------------------
+		T = _mm_add_epi32(X0, X3);
+		X1 = _mm_xor_si128(_mm_xor_si128(X1, _mm_slli_epi32(T, 7)), _mm_srli_epi32(T, 25));
+		T = _mm_add_epi32(X1, X0);
+		X2 = _mm_xor_si128(_mm_xor_si128(X2, _mm_slli_epi32(T, 9)), _mm_srli_epi32(T, 23));
+		T = _mm_add_epi32(X2, X1);
+		X3 = _mm_xor_si128(_mm_xor_si128(X3, _mm_slli_epi32(T, 13)), _mm_srli_epi32(T, 19));
+		T = _mm_add_epi32(X3, X2);
+		X0 = _mm_xor_si128(_mm_xor_si128(X0, _mm_slli_epi32(T, 18)), _mm_srli_epi32(T, 14));
+		X1 = _mm_shuffle_epi32(X1, 0x93);
+		X2 = _mm_shuffle_epi32(X2, 0x4E);
+		X3 = _mm_shuffle_epi32(X3, 0x39);
+		T = _mm_add_epi32(X0, X1);
+		X3 = _mm_xor_si128(_mm_xor_si128(X3, _mm_slli_epi32(T, 7)), _mm_srli_epi32(T, 25));
+		T = _mm_add_epi32(X3, X0);
+		X2 = _mm_xor_si128(_mm_xor_si128(X2, _mm_slli_epi32(T, 9)), _mm_srli_epi32(T, 23));
+		T = _mm_add_epi32(X2, X3);
+		X1 = _mm_xor_si128(_mm_xor_si128(X1, _mm_slli_epi32(T, 13)), _mm_srli_epi32(T, 19));
+		T = _mm_add_epi32(X1, X2);
+		X0 = _mm_xor_si128(_mm_xor_si128(X0, _mm_slli_epi32(T, 18)), _mm_srli_epi32(T, 14));
+		X1 = _mm_shuffle_epi32(X1, 0x39);
+		X2 = _mm_shuffle_epi32(X2, 0x4E);
+		X3 = _mm_shuffle_epi32(X3, 0x93);
+
+		// 2X round -------------------------------------------------------------
+		T = _mm_add_epi32(X0, X3);
+		X1 = _mm_xor_si128(_mm_xor_si128(X1, _mm_slli_epi32(T, 7)), _mm_srli_epi32(T, 25));
+		T = _mm_add_epi32(X1, X0);
+		X2 = _mm_xor_si128(_mm_xor_si128(X2, _mm_slli_epi32(T, 9)), _mm_srli_epi32(T, 23));
+		T = _mm_add_epi32(X2, X1);
+		X3 = _mm_xor_si128(_mm_xor_si128(X3, _mm_slli_epi32(T, 13)), _mm_srli_epi32(T, 19));
+		T = _mm_add_epi32(X3, X2);
+		X0 = _mm_xor_si128(_mm_xor_si128(X0, _mm_slli_epi32(T, 18)), _mm_srli_epi32(T, 14));
+		X1 = _mm_shuffle_epi32(X1, 0x93);
+		X2 = _mm_shuffle_epi32(X2, 0x4E);
+		X3 = _mm_shuffle_epi32(X3, 0x39);
+		T = _mm_add_epi32(X0, X1);
+		X3 = _mm_xor_si128(_mm_xor_si128(X3, _mm_slli_epi32(T, 7)), _mm_srli_epi32(T, 25));
+		T = _mm_add_epi32(X3, X0);
+		X2 = _mm_xor_si128(_mm_xor_si128(X2, _mm_slli_epi32(T, 9)), _mm_srli_epi32(T, 23));
+		T = _mm_add_epi32(X2, X3);
+		X1 = _mm_xor_si128(_mm_xor_si128(X1, _mm_slli_epi32(T, 13)), _mm_srli_epi32(T, 19));
+		T = _mm_add_epi32(X1, X2);
+		X0 = _mm_xor_si128(_mm_xor_si128(X0, _mm_slli_epi32(T, 18)), _mm_srli_epi32(T, 14));
+		X1 = _mm_shuffle_epi32(X1, 0x39);
+		X2 = _mm_shuffle_epi32(X2, 0x4E);
+		X3 = _mm_shuffle_epi32(X3, 0x93);
+
+		// 2X round -------------------------------------------------------------
+		T = _mm_add_epi32(X0, X3);
+		X1 = _mm_xor_si128(_mm_xor_si128(X1, _mm_slli_epi32(T, 7)), _mm_srli_epi32(T, 25));
+		T = _mm_add_epi32(X1, X0);
+		X2 = _mm_xor_si128(_mm_xor_si128(X2, _mm_slli_epi32(T, 9)), _mm_srli_epi32(T, 23));
+		T = _mm_add_epi32(X2, X1);
+		X3 = _mm_xor_si128(_mm_xor_si128(X3, _mm_slli_epi32(T, 13)), _mm_srli_epi32(T, 19));
+		T = _mm_add_epi32(X3, X2);
+		X0 = _mm_xor_si128(_mm_xor_si128(X0, _mm_slli_epi32(T, 18)), _mm_srli_epi32(T, 14));
+		X1 = _mm_shuffle_epi32(X1, 0x93);
+		X2 = _mm_shuffle_epi32(X2, 0x4E);
+		X3 = _mm_shuffle_epi32(X3, 0x39);
+		T = _mm_add_epi32(X0, X1);
+		X3 = _mm_xor_si128(_mm_xor_si128(X3, _mm_slli_epi32(T, 7)), _mm_srli_epi32(T, 25));
+		T = _mm_add_epi32(X3, X0);
+		X2 = _mm_xor_si128(_mm_xor_si128(X2, _mm_slli_epi32(T, 9)), _mm_srli_epi32(T, 23));
+		T = _mm_add_epi32(X2, X3);
+		X1 = _mm_xor_si128(_mm_xor_si128(X1, _mm_slli_epi32(T, 13)), _mm_srli_epi32(T, 19));
+		T = _mm_add_epi32(X1, X2);
+		X0 = _mm_xor_si128(_mm_xor_si128(X0, _mm_slli_epi32(T, 18)), _mm_srli_epi32(T, 14));
+		X1 = _mm_shuffle_epi32(X1, 0x39);
+		X2 = _mm_shuffle_epi32(X2, 0x4E);
+		X3 = _mm_shuffle_epi32(X3, 0x93);
+
+		// 2X round -------------------------------------------------------------
+		T = _mm_add_epi32(X0, X3);
+		X1 = _mm_xor_si128(_mm_xor_si128(X1, _mm_slli_epi32(T, 7)), _mm_srli_epi32(T, 25));
+		T = _mm_add_epi32(X1, X0);
+		X2 = _mm_xor_si128(_mm_xor_si128(X2, _mm_slli_epi32(T, 9)), _mm_srli_epi32(T, 23));
+		T = _mm_add_epi32(X2, X1);
+		X3 = _mm_xor_si128(_mm_xor_si128(X3, _mm_slli_epi32(T, 13)), _mm_srli_epi32(T, 19));
+		T = _mm_add_epi32(X3, X2);
+		X0 = _mm_xor_si128(_mm_xor_si128(X0, _mm_slli_epi32(T, 18)), _mm_srli_epi32(T, 14));
+		X1 = _mm_shuffle_epi32(X1, 0x93);
+		X2 = _mm_shuffle_epi32(X2, 0x4E);
+		X3 = _mm_shuffle_epi32(X3, 0x39);
+		T = _mm_add_epi32(X0, X1);
+		X3 = _mm_xor_si128(_mm_xor_si128(X3, _mm_slli_epi32(T, 7)), _mm_srli_epi32(T, 25));
+		T = _mm_add_epi32(X3, X0);
+		X2 = _mm_xor_si128(_mm_xor_si128(X2, _mm_slli_epi32(T, 9)), _mm_srli_epi32(T, 23));
+		T = _mm_add_epi32(X2, X3);
+		X1 = _mm_xor_si128(_mm_xor_si128(X1, _mm_slli_epi32(T, 13)), _mm_srli_epi32(T, 19));
+		T = _mm_add_epi32(X1, X2);
+		X0 = _mm_xor_si128(_mm_xor_si128(X0, _mm_slli_epi32(T, 18)), _mm_srli_epi32(T, 14));
+		X1 = _mm_shuffle_epi32(X1, 0x39);
+		X2 = _mm_shuffle_epi32(X2, 0x4E);
+		X3 = _mm_shuffle_epi32(X3, 0x93);
+
+		// 2X round -------------------------------------------------------------
+		T = _mm_add_epi32(X0, X3);
+		X1 = _mm_xor_si128(_mm_xor_si128(X1, _mm_slli_epi32(T, 7)), _mm_srli_epi32(T, 25));
+		T = _mm_add_epi32(X1, X0);
+		X2 = _mm_xor_si128(_mm_xor_si128(X2, _mm_slli_epi32(T, 9)), _mm_srli_epi32(T, 23));
+		T = _mm_add_epi32(X2, X1);
+		X3 = _mm_xor_si128(_mm_xor_si128(X3, _mm_slli_epi32(T, 13)), _mm_srli_epi32(T, 19));
+		T = _mm_add_epi32(X3, X2);
+		X0 = _mm_xor_si128(_mm_xor_si128(X0, _mm_slli_epi32(T, 18)), _mm_srli_epi32(T, 14));
+		X1 = _mm_shuffle_epi32(X1, 0x93);
+		X2 = _mm_shuffle_epi32(X2, 0x4E);
+		X3 = _mm_shuffle_epi32(X3, 0x39);
+		T = _mm_add_epi32(X0, X1);
+		X3 = _mm_xor_si128(_mm_xor_si128(X3, _mm_slli_epi32(T, 7)), _mm_srli_epi32(T, 25));
+		T = _mm_add_epi32(X3, X0);
+		X2 = _mm_xor_si128(_mm_xor_si128(X2, _mm_slli_epi32(T, 9)), _mm_srli_epi32(T, 23));
+		T = _mm_add_epi32(X2, X3);
+		X1 = _mm_xor_si128(_mm_xor_si128(X1, _mm_slli_epi32(T, 13)), _mm_srli_epi32(T, 19));
+		T = _mm_add_epi32(X1, X2);
+		X0 = _mm_xor_si128(_mm_xor_si128(X0, _mm_slli_epi32(T, 18)), _mm_srli_epi32(T, 14));
+		X1 = _mm_shuffle_epi32(X1, 0x39);
+		X2 = _mm_shuffle_epi32(X2, 0x4E);
+		X3 = _mm_shuffle_epi32(X3, 0x93);
+
+		X0 = _mm_add_epi32(X0s,X0);
+		X1 = _mm_add_epi32(X1s,X1);
+		X2 = _mm_add_epi32(X2s,X2);
+		X3 = _mm_add_epi32(X3s,X3);
+
+		__m128i k02 = _mm_shuffle_epi32(_mm_or_si128(_mm_slli_epi64(X0, 32), _mm_srli_epi64(X3, 32)), _MM_SHUFFLE(0, 1, 2, 3));
+		__m128i k13 = _mm_shuffle_epi32(_mm_or_si128(_mm_slli_epi64(X1, 32), _mm_srli_epi64(X0, 32)), _MM_SHUFFLE(0, 1, 2, 3));
+		__m128i k20 = _mm_or_si128(_mm_and_si128(X2, _S20SSECONSTANTS.maskLo32), _mm_and_si128(X1, _S20SSECONSTANTS.maskHi32));
+		__m128i k31 = _mm_or_si128(_mm_and_si128(X3, _S20SSECONSTANTS.maskLo32), _mm_and_si128(X2, _S20SSECONSTANTS.maskHi32));
+		_mm_storeu_ps(reinterpret_cast<float *>(c),_mm_castsi128_ps(_mm_xor_si128(_mm_unpackhi_epi64(k02,k20),_mm_castps_si128(_mm_loadu_ps(reinterpret_cast<const float *>(m))))));
+		_mm_storeu_ps(reinterpret_cast<float *>(c) + 4,_mm_castsi128_ps(_mm_xor_si128(_mm_unpackhi_epi64(k13,k31),_mm_castps_si128(_mm_loadu_ps(reinterpret_cast<const float *>(m) + 4)))));
+		_mm_storeu_ps(reinterpret_cast<float *>(c) + 8,_mm_castsi128_ps(_mm_xor_si128(_mm_unpacklo_epi64(k20,k02),_mm_castps_si128(_mm_loadu_ps(reinterpret_cast<const float *>(m) + 8)))));
+		_mm_storeu_ps(reinterpret_cast<float *>(c) + 12,_mm_castsi128_ps(_mm_xor_si128(_mm_unpacklo_epi64(k31,k13),_mm_castps_si128(_mm_loadu_ps(reinterpret_cast<const float *>(m) + 12)))));
+
+		if (!(++_state.i[8])) {
+			++_state.i[5]; // state reordered for SSE
+			/* stopping at 2^70 bytes per nonce is user's responsibility */
+		}
+#else
+		x0 = j0;
+		x1 = j1;
+		x2 = j2;
+		x3 = j3;
+		x4 = j4;
+		x5 = j5;
+		x6 = j6;
+		x7 = j7;
+		x8 = j8;
+		x9 = j9;
+		x10 = j10;
+		x11 = j11;
+		x12 = j12;
+		x13 = j13;
+		x14 = j14;
+		x15 = j15;
+
+		// 2X round -------------------------------------------------------------
+		 x4 = XOR( x4,ROTATE(PLUS( x0,x12), 7));
+		 x8 = XOR( x8,ROTATE(PLUS( x4, x0), 9));
+		x12 = XOR(x12,ROTATE(PLUS( x8, x4),13));
+		 x0 = XOR( x0,ROTATE(PLUS(x12, x8),18));
+		 x9 = XOR( x9,ROTATE(PLUS( x5, x1), 7));
+		x13 = XOR(x13,ROTATE(PLUS( x9, x5), 9));
+		 x1 = XOR( x1,ROTATE(PLUS(x13, x9),13));
+		 x5 = XOR( x5,ROTATE(PLUS( x1,x13),18));
+		x14 = XOR(x14,ROTATE(PLUS(x10, x6), 7));
+		 x2 = XOR( x2,ROTATE(PLUS(x14,x10), 9));
+		 x6 = XOR( x6,ROTATE(PLUS( x2,x14),13));
+		x10 = XOR(x10,ROTATE(PLUS( x6, x2),18));
+		 x3 = XOR( x3,ROTATE(PLUS(x15,x11), 7));
+		 x7 = XOR( x7,ROTATE(PLUS( x3,x15), 9));
+		x11 = XOR(x11,ROTATE(PLUS( x7, x3),13));
+		x15 = XOR(x15,ROTATE(PLUS(x11, x7),18));
+		 x1 = XOR( x1,ROTATE(PLUS( x0, x3), 7));
+		 x2 = XOR( x2,ROTATE(PLUS( x1, x0), 9));
+		 x3 = XOR( x3,ROTATE(PLUS( x2, x1),13));
+		 x0 = XOR( x0,ROTATE(PLUS( x3, x2),18));
+		 x6 = XOR( x6,ROTATE(PLUS( x5, x4), 7));
+		 x7 = XOR( x7,ROTATE(PLUS( x6, x5), 9));
+		 x4 = XOR( x4,ROTATE(PLUS( x7, x6),13));
+		 x5 = XOR( x5,ROTATE(PLUS( x4, x7),18));
+		x11 = XOR(x11,ROTATE(PLUS(x10, x9), 7));
+		 x8 = XOR( x8,ROTATE(PLUS(x11,x10), 9));
+		 x9 = XOR( x9,ROTATE(PLUS( x8,x11),13));
+		x10 = XOR(x10,ROTATE(PLUS( x9, x8),18));
+		x12 = XOR(x12,ROTATE(PLUS(x15,x14), 7));
+		x13 = XOR(x13,ROTATE(PLUS(x12,x15), 9));
+		x14 = XOR(x14,ROTATE(PLUS(x13,x12),13));
+		x15 = XOR(x15,ROTATE(PLUS(x14,x13),18));
+
+		// 2X round -------------------------------------------------------------
+		 x4 = XOR( x4,ROTATE(PLUS( x0,x12), 7));
+		 x8 = XOR( x8,ROTATE(PLUS( x4, x0), 9));
+		x12 = XOR(x12,ROTATE(PLUS( x8, x4),13));
+		 x0 = XOR( x0,ROTATE(PLUS(x12, x8),18));
+		 x9 = XOR( x9,ROTATE(PLUS( x5, x1), 7));
+		x13 = XOR(x13,ROTATE(PLUS( x9, x5), 9));
+		 x1 = XOR( x1,ROTATE(PLUS(x13, x9),13));
+		 x5 = XOR( x5,ROTATE(PLUS( x1,x13),18));
+		x14 = XOR(x14,ROTATE(PLUS(x10, x6), 7));
+		 x2 = XOR( x2,ROTATE(PLUS(x14,x10), 9));
+		 x6 = XOR( x6,ROTATE(PLUS( x2,x14),13));
+		x10 = XOR(x10,ROTATE(PLUS( x6, x2),18));
+		 x3 = XOR( x3,ROTATE(PLUS(x15,x11), 7));
+		 x7 = XOR( x7,ROTATE(PLUS( x3,x15), 9));
+		x11 = XOR(x11,ROTATE(PLUS( x7, x3),13));
+		x15 = XOR(x15,ROTATE(PLUS(x11, x7),18));
+		 x1 = XOR( x1,ROTATE(PLUS( x0, x3), 7));
+		 x2 = XOR( x2,ROTATE(PLUS( x1, x0), 9));
+		 x3 = XOR( x3,ROTATE(PLUS( x2, x1),13));
+		 x0 = XOR( x0,ROTATE(PLUS( x3, x2),18));
+		 x6 = XOR( x6,ROTATE(PLUS( x5, x4), 7));
+		 x7 = XOR( x7,ROTATE(PLUS( x6, x5), 9));
+		 x4 = XOR( x4,ROTATE(PLUS( x7, x6),13));
+		 x5 = XOR( x5,ROTATE(PLUS( x4, x7),18));
+		x11 = XOR(x11,ROTATE(PLUS(x10, x9), 7));
+		 x8 = XOR( x8,ROTATE(PLUS(x11,x10), 9));
+		 x9 = XOR( x9,ROTATE(PLUS( x8,x11),13));
+		x10 = XOR(x10,ROTATE(PLUS( x9, x8),18));
+		x12 = XOR(x12,ROTATE(PLUS(x15,x14), 7));
+		x13 = XOR(x13,ROTATE(PLUS(x12,x15), 9));
+		x14 = XOR(x14,ROTATE(PLUS(x13,x12),13));
+		x15 = XOR(x15,ROTATE(PLUS(x14,x13),18));
+
+		// 2X round -------------------------------------------------------------
+		 x4 = XOR( x4,ROTATE(PLUS( x0,x12), 7));
+		 x8 = XOR( x8,ROTATE(PLUS( x4, x0), 9));
+		x12 = XOR(x12,ROTATE(PLUS( x8, x4),13));
+		 x0 = XOR( x0,ROTATE(PLUS(x12, x8),18));
+		 x9 = XOR( x9,ROTATE(PLUS( x5, x1), 7));
+		x13 = XOR(x13,ROTATE(PLUS( x9, x5), 9));
+		 x1 = XOR( x1,ROTATE(PLUS(x13, x9),13));
+		 x5 = XOR( x5,ROTATE(PLUS( x1,x13),18));
+		x14 = XOR(x14,ROTATE(PLUS(x10, x6), 7));
+		 x2 = XOR( x2,ROTATE(PLUS(x14,x10), 9));
+		 x6 = XOR( x6,ROTATE(PLUS( x2,x14),13));
+		x10 = XOR(x10,ROTATE(PLUS( x6, x2),18));
+		 x3 = XOR( x3,ROTATE(PLUS(x15,x11), 7));
+		 x7 = XOR( x7,ROTATE(PLUS( x3,x15), 9));
+		x11 = XOR(x11,ROTATE(PLUS( x7, x3),13));
+		x15 = XOR(x15,ROTATE(PLUS(x11, x7),18));
+		 x1 = XOR( x1,ROTATE(PLUS( x0, x3), 7));
+		 x2 = XOR( x2,ROTATE(PLUS( x1, x0), 9));
+		 x3 = XOR( x3,ROTATE(PLUS( x2, x1),13));
+		 x0 = XOR( x0,ROTATE(PLUS( x3, x2),18));
+		 x6 = XOR( x6,ROTATE(PLUS( x5, x4), 7));
+		 x7 = XOR( x7,ROTATE(PLUS( x6, x5), 9));
+		 x4 = XOR( x4,ROTATE(PLUS( x7, x6),13));
+		 x5 = XOR( x5,ROTATE(PLUS( x4, x7),18));
+		x11 = XOR(x11,ROTATE(PLUS(x10, x9), 7));
+		 x8 = XOR( x8,ROTATE(PLUS(x11,x10), 9));
+		 x9 = XOR( x9,ROTATE(PLUS( x8,x11),13));
+		x10 = XOR(x10,ROTATE(PLUS( x9, x8),18));
+		x12 = XOR(x12,ROTATE(PLUS(x15,x14), 7));
+		x13 = XOR(x13,ROTATE(PLUS(x12,x15), 9));
+		x14 = XOR(x14,ROTATE(PLUS(x13,x12),13));
+		x15 = XOR(x15,ROTATE(PLUS(x14,x13),18));
+
+		// 2X round -------------------------------------------------------------
+		 x4 = XOR( x4,ROTATE(PLUS( x0,x12), 7));
+		 x8 = XOR( x8,ROTATE(PLUS( x4, x0), 9));
+		x12 = XOR(x12,ROTATE(PLUS( x8, x4),13));
+		 x0 = XOR( x0,ROTATE(PLUS(x12, x8),18));
+		 x9 = XOR( x9,ROTATE(PLUS( x5, x1), 7));
+		x13 = XOR(x13,ROTATE(PLUS( x9, x5), 9));
+		 x1 = XOR( x1,ROTATE(PLUS(x13, x9),13));
+		 x5 = XOR( x5,ROTATE(PLUS( x1,x13),18));
+		x14 = XOR(x14,ROTATE(PLUS(x10, x6), 7));
+		 x2 = XOR( x2,ROTATE(PLUS(x14,x10), 9));
+		 x6 = XOR( x6,ROTATE(PLUS( x2,x14),13));
+		x10 = XOR(x10,ROTATE(PLUS( x6, x2),18));
+		 x3 = XOR( x3,ROTATE(PLUS(x15,x11), 7));
+		 x7 = XOR( x7,ROTATE(PLUS( x3,x15), 9));
+		x11 = XOR(x11,ROTATE(PLUS( x7, x3),13));
+		x15 = XOR(x15,ROTATE(PLUS(x11, x7),18));
+		 x1 = XOR( x1,ROTATE(PLUS( x0, x3), 7));
+		 x2 = XOR( x2,ROTATE(PLUS( x1, x0), 9));
+		 x3 = XOR( x3,ROTATE(PLUS( x2, x1),13));
+		 x0 = XOR( x0,ROTATE(PLUS( x3, x2),18));
+		 x6 = XOR( x6,ROTATE(PLUS( x5, x4), 7));
+		 x7 = XOR( x7,ROTATE(PLUS( x6, x5), 9));
+		 x4 = XOR( x4,ROTATE(PLUS( x7, x6),13));
+		 x5 = XOR( x5,ROTATE(PLUS( x4, x7),18));
+		x11 = XOR(x11,ROTATE(PLUS(x10, x9), 7));
+		 x8 = XOR( x8,ROTATE(PLUS(x11,x10), 9));
+		 x9 = XOR( x9,ROTATE(PLUS( x8,x11),13));
+		x10 = XOR(x10,ROTATE(PLUS( x9, x8),18));
+		x12 = XOR(x12,ROTATE(PLUS(x15,x14), 7));
+		x13 = XOR(x13,ROTATE(PLUS(x12,x15), 9));
+		x14 = XOR(x14,ROTATE(PLUS(x13,x12),13));
+		x15 = XOR(x15,ROTATE(PLUS(x14,x13),18));
+
+		// 2X round -------------------------------------------------------------
+		 x4 = XOR( x4,ROTATE(PLUS( x0,x12), 7));
+		 x8 = XOR( x8,ROTATE(PLUS( x4, x0), 9));
+		x12 = XOR(x12,ROTATE(PLUS( x8, x4),13));
+		 x0 = XOR( x0,ROTATE(PLUS(x12, x8),18));
+		 x9 = XOR( x9,ROTATE(PLUS( x5, x1), 7));
+		x13 = XOR(x13,ROTATE(PLUS( x9, x5), 9));
+		 x1 = XOR( x1,ROTATE(PLUS(x13, x9),13));
+		 x5 = XOR( x5,ROTATE(PLUS( x1,x13),18));
+		x14 = XOR(x14,ROTATE(PLUS(x10, x6), 7));
+		 x2 = XOR( x2,ROTATE(PLUS(x14,x10), 9));
+		 x6 = XOR( x6,ROTATE(PLUS( x2,x14),13));
+		x10 = XOR(x10,ROTATE(PLUS( x6, x2),18));
+		 x3 = XOR( x3,ROTATE(PLUS(x15,x11), 7));
+		 x7 = XOR( x7,ROTATE(PLUS( x3,x15), 9));
+		x11 = XOR(x11,ROTATE(PLUS( x7, x3),13));
+		x15 = XOR(x15,ROTATE(PLUS(x11, x7),18));
+		 x1 = XOR( x1,ROTATE(PLUS( x0, x3), 7));
+		 x2 = XOR( x2,ROTATE(PLUS( x1, x0), 9));
+		 x3 = XOR( x3,ROTATE(PLUS( x2, x1),13));
+		 x0 = XOR( x0,ROTATE(PLUS( x3, x2),18));
+		 x6 = XOR( x6,ROTATE(PLUS( x5, x4), 7));
+		 x7 = XOR( x7,ROTATE(PLUS( x6, x5), 9));
+		 x4 = XOR( x4,ROTATE(PLUS( x7, x6),13));
+		 x5 = XOR( x5,ROTATE(PLUS( x4, x7),18));
+		x11 = XOR(x11,ROTATE(PLUS(x10, x9), 7));
+		 x8 = XOR( x8,ROTATE(PLUS(x11,x10), 9));
+		 x9 = XOR( x9,ROTATE(PLUS( x8,x11),13));
+		x10 = XOR(x10,ROTATE(PLUS( x9, x8),18));
+		x12 = XOR(x12,ROTATE(PLUS(x15,x14), 7));
+		x13 = XOR(x13,ROTATE(PLUS(x12,x15), 9));
+		x14 = XOR(x14,ROTATE(PLUS(x13,x12),13));
+		x15 = XOR(x15,ROTATE(PLUS(x14,x13),18));
+
+		// 2X round -------------------------------------------------------------
+		 x4 = XOR( x4,ROTATE(PLUS( x0,x12), 7));
+		 x8 = XOR( x8,ROTATE(PLUS( x4, x0), 9));
+		x12 = XOR(x12,ROTATE(PLUS( x8, x4),13));
+		 x0 = XOR( x0,ROTATE(PLUS(x12, x8),18));
+		 x9 = XOR( x9,ROTATE(PLUS( x5, x1), 7));
+		x13 = XOR(x13,ROTATE(PLUS( x9, x5), 9));
+		 x1 = XOR( x1,ROTATE(PLUS(x13, x9),13));
+		 x5 = XOR( x5,ROTATE(PLUS( x1,x13),18));
+		x14 = XOR(x14,ROTATE(PLUS(x10, x6), 7));
+		 x2 = XOR( x2,ROTATE(PLUS(x14,x10), 9));
+		 x6 = XOR( x6,ROTATE(PLUS( x2,x14),13));
+		x10 = XOR(x10,ROTATE(PLUS( x6, x2),18));
+		 x3 = XOR( x3,ROTATE(PLUS(x15,x11), 7));
+		 x7 = XOR( x7,ROTATE(PLUS( x3,x15), 9));
+		x11 = XOR(x11,ROTATE(PLUS( x7, x3),13));
+		x15 = XOR(x15,ROTATE(PLUS(x11, x7),18));
+		 x1 = XOR( x1,ROTATE(PLUS( x0, x3), 7));
+		 x2 = XOR( x2,ROTATE(PLUS( x1, x0), 9));
+		 x3 = XOR( x3,ROTATE(PLUS( x2, x1),13));
+		 x0 = XOR( x0,ROTATE(PLUS( x3, x2),18));
+		 x6 = XOR( x6,ROTATE(PLUS( x5, x4), 7));
+		 x7 = XOR( x7,ROTATE(PLUS( x6, x5), 9));
+		 x4 = XOR( x4,ROTATE(PLUS( x7, x6),13));
+		 x5 = XOR( x5,ROTATE(PLUS( x4, x7),18));
+		x11 = XOR(x11,ROTATE(PLUS(x10, x9), 7));
+		 x8 = XOR( x8,ROTATE(PLUS(x11,x10), 9));
+		 x9 = XOR( x9,ROTATE(PLUS( x8,x11),13));
+		x10 = XOR(x10,ROTATE(PLUS( x9, x8),18));
+		x12 = XOR(x12,ROTATE(PLUS(x15,x14), 7));
+		x13 = XOR(x13,ROTATE(PLUS(x12,x15), 9));
+		x14 = XOR(x14,ROTATE(PLUS(x13,x12),13));
+		x15 = XOR(x15,ROTATE(PLUS(x14,x13),18));
+
+		// 2X round -------------------------------------------------------------
+		 x4 = XOR( x4,ROTATE(PLUS( x0,x12), 7));
+		 x8 = XOR( x8,ROTATE(PLUS( x4, x0), 9));
+		x12 = XOR(x12,ROTATE(PLUS( x8, x4),13));
+		 x0 = XOR( x0,ROTATE(PLUS(x12, x8),18));
+		 x9 = XOR( x9,ROTATE(PLUS( x5, x1), 7));
+		x13 = XOR(x13,ROTATE(PLUS( x9, x5), 9));
+		 x1 = XOR( x1,ROTATE(PLUS(x13, x9),13));
+		 x5 = XOR( x5,ROTATE(PLUS( x1,x13),18));
+		x14 = XOR(x14,ROTATE(PLUS(x10, x6), 7));
+		 x2 = XOR( x2,ROTATE(PLUS(x14,x10), 9));
+		 x6 = XOR( x6,ROTATE(PLUS( x2,x14),13));
+		x10 = XOR(x10,ROTATE(PLUS( x6, x2),18));
+		 x3 = XOR( x3,ROTATE(PLUS(x15,x11), 7));
+		 x7 = XOR( x7,ROTATE(PLUS( x3,x15), 9));
+		x11 = XOR(x11,ROTATE(PLUS( x7, x3),13));
+		x15 = XOR(x15,ROTATE(PLUS(x11, x7),18));
+		 x1 = XOR( x1,ROTATE(PLUS( x0, x3), 7));
+		 x2 = XOR( x2,ROTATE(PLUS( x1, x0), 9));
+		 x3 = XOR( x3,ROTATE(PLUS( x2, x1),13));
+		 x0 = XOR( x0,ROTATE(PLUS( x3, x2),18));
+		 x6 = XOR( x6,ROTATE(PLUS( x5, x4), 7));
+		 x7 = XOR( x7,ROTATE(PLUS( x6, x5), 9));
+		 x4 = XOR( x4,ROTATE(PLUS( x7, x6),13));
+		 x5 = XOR( x5,ROTATE(PLUS( x4, x7),18));
+		x11 = XOR(x11,ROTATE(PLUS(x10, x9), 7));
+		 x8 = XOR( x8,ROTATE(PLUS(x11,x10), 9));
+		 x9 = XOR( x9,ROTATE(PLUS( x8,x11),13));
+		x10 = XOR(x10,ROTATE(PLUS( x9, x8),18));
+		x12 = XOR(x12,ROTATE(PLUS(x15,x14), 7));
+		x13 = XOR(x13,ROTATE(PLUS(x12,x15), 9));
+		x14 = XOR(x14,ROTATE(PLUS(x13,x12),13));
+		x15 = XOR(x15,ROTATE(PLUS(x14,x13),18));
+
+		// 2X round -------------------------------------------------------------
+		 x4 = XOR( x4,ROTATE(PLUS( x0,x12), 7));
+		 x8 = XOR( x8,ROTATE(PLUS( x4, x0), 9));
+		x12 = XOR(x12,ROTATE(PLUS( x8, x4),13));
+		 x0 = XOR( x0,ROTATE(PLUS(x12, x8),18));
+		 x9 = XOR( x9,ROTATE(PLUS( x5, x1), 7));
+		x13 = XOR(x13,ROTATE(PLUS( x9, x5), 9));
+		 x1 = XOR( x1,ROTATE(PLUS(x13, x9),13));
+		 x5 = XOR( x5,ROTATE(PLUS( x1,x13),18));
+		x14 = XOR(x14,ROTATE(PLUS(x10, x6), 7));
+		 x2 = XOR( x2,ROTATE(PLUS(x14,x10), 9));
+		 x6 = XOR( x6,ROTATE(PLUS( x2,x14),13));
+		x10 = XOR(x10,ROTATE(PLUS( x6, x2),18));
+		 x3 = XOR( x3,ROTATE(PLUS(x15,x11), 7));
+		 x7 = XOR( x7,ROTATE(PLUS( x3,x15), 9));
+		x11 = XOR(x11,ROTATE(PLUS( x7, x3),13));
+		x15 = XOR(x15,ROTATE(PLUS(x11, x7),18));
+		 x1 = XOR( x1,ROTATE(PLUS( x0, x3), 7));
+		 x2 = XOR( x2,ROTATE(PLUS( x1, x0), 9));
+		 x3 = XOR( x3,ROTATE(PLUS( x2, x1),13));
+		 x0 = XOR( x0,ROTATE(PLUS( x3, x2),18));
+		 x6 = XOR( x6,ROTATE(PLUS( x5, x4), 7));
+		 x7 = XOR( x7,ROTATE(PLUS( x6, x5), 9));
+		 x4 = XOR( x4,ROTATE(PLUS( x7, x6),13));
+		 x5 = XOR( x5,ROTATE(PLUS( x4, x7),18));
+		x11 = XOR(x11,ROTATE(PLUS(x10, x9), 7));
+		 x8 = XOR( x8,ROTATE(PLUS(x11,x10), 9));
+		 x9 = XOR( x9,ROTATE(PLUS( x8,x11),13));
+		x10 = XOR(x10,ROTATE(PLUS( x9, x8),18));
+		x12 = XOR(x12,ROTATE(PLUS(x15,x14), 7));
+		x13 = XOR(x13,ROTATE(PLUS(x12,x15), 9));
+		x14 = XOR(x14,ROTATE(PLUS(x13,x12),13));
+		x15 = XOR(x15,ROTATE(PLUS(x14,x13),18));
+
+		// 2X round -------------------------------------------------------------
+		 x4 = XOR( x4,ROTATE(PLUS( x0,x12), 7));
+		 x8 = XOR( x8,ROTATE(PLUS( x4, x0), 9));
+		x12 = XOR(x12,ROTATE(PLUS( x8, x4),13));
+		 x0 = XOR( x0,ROTATE(PLUS(x12, x8),18));
+		 x9 = XOR( x9,ROTATE(PLUS( x5, x1), 7));
+		x13 = XOR(x13,ROTATE(PLUS( x9, x5), 9));
+		 x1 = XOR( x1,ROTATE(PLUS(x13, x9),13));
+		 x5 = XOR( x5,ROTATE(PLUS( x1,x13),18));
+		x14 = XOR(x14,ROTATE(PLUS(x10, x6), 7));
+		 x2 = XOR( x2,ROTATE(PLUS(x14,x10), 9));
+		 x6 = XOR( x6,ROTATE(PLUS( x2,x14),13));
+		x10 = XOR(x10,ROTATE(PLUS( x6, x2),18));
+		 x3 = XOR( x3,ROTATE(PLUS(x15,x11), 7));
+		 x7 = XOR( x7,ROTATE(PLUS( x3,x15), 9));
+		x11 = XOR(x11,ROTATE(PLUS( x7, x3),13));
+		x15 = XOR(x15,ROTATE(PLUS(x11, x7),18));
+		 x1 = XOR( x1,ROTATE(PLUS( x0, x3), 7));
+		 x2 = XOR( x2,ROTATE(PLUS( x1, x0), 9));
+		 x3 = XOR( x3,ROTATE(PLUS( x2, x1),13));
+		 x0 = XOR( x0,ROTATE(PLUS( x3, x2),18));
+		 x6 = XOR( x6,ROTATE(PLUS( x5, x4), 7));
+		 x7 = XOR( x7,ROTATE(PLUS( x6, x5), 9));
+		 x4 = XOR( x4,ROTATE(PLUS( x7, x6),13));
+		 x5 = XOR( x5,ROTATE(PLUS( x4, x7),18));
+		x11 = XOR(x11,ROTATE(PLUS(x10, x9), 7));
+		 x8 = XOR( x8,ROTATE(PLUS(x11,x10), 9));
+		 x9 = XOR( x9,ROTATE(PLUS( x8,x11),13));
+		x10 = XOR(x10,ROTATE(PLUS( x9, x8),18));
+		x12 = XOR(x12,ROTATE(PLUS(x15,x14), 7));
+		x13 = XOR(x13,ROTATE(PLUS(x12,x15), 9));
+		x14 = XOR(x14,ROTATE(PLUS(x13,x12),13));
+		x15 = XOR(x15,ROTATE(PLUS(x14,x13),18));
+
+		// 2X round -------------------------------------------------------------
+		 x4 = XOR( x4,ROTATE(PLUS( x0,x12), 7));
+		 x8 = XOR( x8,ROTATE(PLUS( x4, x0), 9));
+		x12 = XOR(x12,ROTATE(PLUS( x8, x4),13));
+		 x0 = XOR( x0,ROTATE(PLUS(x12, x8),18));
+		 x9 = XOR( x9,ROTATE(PLUS( x5, x1), 7));
+		x13 = XOR(x13,ROTATE(PLUS( x9, x5), 9));
+		 x1 = XOR( x1,ROTATE(PLUS(x13, x9),13));
+		 x5 = XOR( x5,ROTATE(PLUS( x1,x13),18));
+		x14 = XOR(x14,ROTATE(PLUS(x10, x6), 7));
+		 x2 = XOR( x2,ROTATE(PLUS(x14,x10), 9));
+		 x6 = XOR( x6,ROTATE(PLUS( x2,x14),13));
+		x10 = XOR(x10,ROTATE(PLUS( x6, x2),18));
+		 x3 = XOR( x3,ROTATE(PLUS(x15,x11), 7));
+		 x7 = XOR( x7,ROTATE(PLUS( x3,x15), 9));
+		x11 = XOR(x11,ROTATE(PLUS( x7, x3),13));
+		x15 = XOR(x15,ROTATE(PLUS(x11, x7),18));
+		 x1 = XOR( x1,ROTATE(PLUS( x0, x3), 7));
+		 x2 = XOR( x2,ROTATE(PLUS( x1, x0), 9));
+		 x3 = XOR( x3,ROTATE(PLUS( x2, x1),13));
+		 x0 = XOR( x0,ROTATE(PLUS( x3, x2),18));
+		 x6 = XOR( x6,ROTATE(PLUS( x5, x4), 7));
+		 x7 = XOR( x7,ROTATE(PLUS( x6, x5), 9));
+		 x4 = XOR( x4,ROTATE(PLUS( x7, x6),13));
+		 x5 = XOR( x5,ROTATE(PLUS( x4, x7),18));
+		x11 = XOR(x11,ROTATE(PLUS(x10, x9), 7));
+		 x8 = XOR( x8,ROTATE(PLUS(x11,x10), 9));
+		 x9 = XOR( x9,ROTATE(PLUS( x8,x11),13));
+		x10 = XOR(x10,ROTATE(PLUS( x9, x8),18));
+		x12 = XOR(x12,ROTATE(PLUS(x15,x14), 7));
+		x13 = XOR(x13,ROTATE(PLUS(x12,x15), 9));
+		x14 = XOR(x14,ROTATE(PLUS(x13,x12),13));
+		x15 = XOR(x15,ROTATE(PLUS(x14,x13),18));
+
 		x0 = PLUS(x0,j0);
 		x1 = PLUS(x1,j1);
 		x2 = PLUS(x2,j2);
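Each "2X round" block above is one full Salsa20 double round: a column round followed by a row round, 32 add-rotate-XOR quarter-round steps in all. Salsa20/20 performs ten of these double rounds and Salsa20/12 performs six; the commit unrolls every one of them so no loop counter or rounds variable survives in the hot path. For reference, a compact non-unrolled equivalent of one double round, written against an array-held state, might look like the sketch below (rotl32 and salsa20_double_round are illustrative names, not part of this commit):

#include <stdint.h>

static inline uint32_t rotl32(uint32_t v,unsigned int c) { return (v << c) | (v >> (32 - c)); }

// One Salsa20 double round over the 16-word state; the unrolled "2X round"
// blocks above compute exactly this, with the state held in x0..x15 instead.
static void salsa20_double_round(uint32_t x[16])
{
	// column round
	x[ 4] ^= rotl32(x[ 0] + x[12], 7); x[ 8] ^= rotl32(x[ 4] + x[ 0], 9);
	x[12] ^= rotl32(x[ 8] + x[ 4],13); x[ 0] ^= rotl32(x[12] + x[ 8],18);
	x[ 9] ^= rotl32(x[ 5] + x[ 1], 7); x[13] ^= rotl32(x[ 9] + x[ 5], 9);
	x[ 1] ^= rotl32(x[13] + x[ 9],13); x[ 5] ^= rotl32(x[ 1] + x[13],18);
	x[14] ^= rotl32(x[10] + x[ 6], 7); x[ 2] ^= rotl32(x[14] + x[10], 9);
	x[ 6] ^= rotl32(x[ 2] + x[14],13); x[10] ^= rotl32(x[ 6] + x[ 2],18);
	x[ 3] ^= rotl32(x[15] + x[11], 7); x[ 7] ^= rotl32(x[ 3] + x[15], 9);
	x[11] ^= rotl32(x[ 7] + x[ 3],13); x[15] ^= rotl32(x[11] + x[ 7],18);
	// row round
	x[ 1] ^= rotl32(x[ 0] + x[ 3], 7); x[ 2] ^= rotl32(x[ 1] + x[ 0], 9);
	x[ 3] ^= rotl32(x[ 2] + x[ 1],13); x[ 0] ^= rotl32(x[ 3] + x[ 2],18);
	x[ 6] ^= rotl32(x[ 5] + x[ 4], 7); x[ 7] ^= rotl32(x[ 6] + x[ 5], 9);
	x[ 4] ^= rotl32(x[ 7] + x[ 6],13); x[ 5] ^= rotl32(x[ 4] + x[ 7],18);
	x[11] ^= rotl32(x[10] + x[ 9], 7); x[ 8] ^= rotl32(x[11] + x[10], 9);
	x[ 9] ^= rotl32(x[ 8] + x[11],13); x[10] ^= rotl32(x[ 9] + x[ 8],18);
	x[12] ^= rotl32(x[15] + x[14], 7); x[13] ^= rotl32(x[12] + x[15], 9);
	x[14] ^= rotl32(x[13] + x[12],13); x[15] ^= rotl32(x[14] + x[13],18);
}

Running this ten times, then adding the initial state back in (the PLUS(x0,j0) ... lines above), yields one 64-byte block of Salsa20/20 keystream; six double rounds yield Salsa20/12.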

+ 30 - 10
node/Salsa20.hpp

@@ -35,12 +35,11 @@ public:
 	 * @param key Key bits
 	 * @param kbits Number of key bits: 128 or 256 (recommended)
 	 * @param iv 64-bit initialization vector
-	 * @param rounds Number of rounds: 8, 12, or 20
 	 */
-	Salsa20(const void *key,unsigned int kbits,const void *iv,unsigned int rounds)
+	Salsa20(const void *key,unsigned int kbits,const void *iv)
 		throw()
 	{
-		init(key,kbits,iv,rounds);
+		init(key,kbits,iv);
 	}
 
 	/**
@@ -49,21 +48,43 @@ public:
 	 * @param key Key bits
 	 * @param kbits Number of key bits: 128 or 256 (recommended)
 	 * @param iv 64-bit initialization vector
-	 * @param rounds Number of rounds: 8, 12, or 20
 	 */
-	void init(const void *key,unsigned int kbits,const void *iv,unsigned int rounds)
+	void init(const void *key,unsigned int kbits,const void *iv)
 		throw();
 
 	/**
-	 * Encrypt data
+	 * Encrypt data using Salsa20/12
 	 *
 	 * @param in Input data
 	 * @param out Output buffer
 	 * @param bytes Length of data
 	 */
-	void encrypt(const void *in,void *out,unsigned int bytes)
+	void encrypt12(const void *in,void *out,unsigned int bytes)
 		throw();
 
+	/**
+	 * Encrypt data using Salsa20/20
+	 *
+	 * @param in Input data
+	 * @param out Output buffer
+	 * @param bytes Length of data
+	 */
+	void encrypt20(const void *in,void *out,unsigned int bytes)
+		throw();
+
+	/**
+	 * Decrypt data using Salsa20/12
+	 *
+	 * @param in Input data
+	 * @param out Output buffer
+	 * @param bytes Length of data
+	 */
+	inline void decrypt12(const void *in,void *out,unsigned int bytes)
+		throw()
+	{
+		encrypt12(in,out,bytes);
+	}
+
 	/**
 	 * Decrypt data
 	 *
@@ -71,10 +92,10 @@ public:
 	 * @param out Output buffer
 	 * @param bytes Length of data
 	 */
-	inline void decrypt(const void *in,void *out,unsigned int bytes)
+	inline void decrypt20(const void *in,void *out,unsigned int bytes)
 		throw()
 	{
-		encrypt(in,out,bytes);
+		encrypt20(in,out,bytes);
 	}
 
 private:
@@ -84,7 +105,6 @@ private:
 #endif // ZT_SALSA20_SSE
 		uint32_t i[16];
 	} _state;
-	unsigned int _roundsDiv4;
 };
 
 } // namespace ZeroTier
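With the rounds argument removed from the constructor and init(), the round count is now chosen per call: encrypt12()/decrypt12() run Salsa20/12 and encrypt20()/decrypt20() run Salsa20/20 over the same key/IV state. A minimal usage sketch of the new interface (key, IV, and buffer contents are placeholders):

#include <stdint.h>
#include <string.h>
#include "node/Salsa20.hpp"

using namespace ZeroTier;

void salsa20UsageExample()
{
	uint8_t key[32]; memset(key,0x01,sizeof(key)); // 256-bit key (placeholder)
	uint8_t iv[8];   memset(iv,0x02,sizeof(iv));   // 64-bit IV (placeholder)
	uint8_t buf[64]; memset(buf,0,sizeof(buf));

	Salsa20 s20(key,256,iv);             // no rounds parameter anymore
	s20.encrypt20(buf,buf,sizeof(buf));  // XOR Salsa20/20 keystream in place

	s20.init(key,256,iv);                // reset to the same key/IV to decrypt
	s20.decrypt20(buf,buf,sizeof(buf));  // decrypt == encrypt for a stream cipher
}

Because Salsa20 is a stream cipher, decryption is the same keystream XOR as encryption, which is why decrypt12()/decrypt20() simply forward to the corresponding encrypt call.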

+ 12 - 30
selftest.cpp

@@ -162,27 +162,27 @@ static int testCrypto()
 		memset(buf2,0,sizeof(buf2));
 		memset(buf3,0,sizeof(buf3));
 		Salsa20 s20;
-		s20.init("12345678123456781234567812345678",256,"12345678",20);
-		s20.encrypt(buf1,buf2,sizeof(buf1));
-		s20.init("12345678123456781234567812345678",256,"12345678",20);
-		s20.decrypt(buf2,buf3,sizeof(buf2));
+		s20.init("12345678123456781234567812345678",256,"12345678");
+		s20.encrypt20(buf1,buf2,sizeof(buf1));
+		s20.init("12345678123456781234567812345678",256,"12345678");
+		s20.decrypt20(buf2,buf3,sizeof(buf2));
 		if (memcmp(buf1,buf3,sizeof(buf1))) {
 			std::cout << "FAIL (encrypt/decrypt test)" << std::endl;
 			return -1;
 		}
 	}
-	Salsa20 s20(s20TV0Key,256,s20TV0Iv,20);
+	Salsa20 s20(s20TV0Key,256,s20TV0Iv);
 	memset(buf1,0,sizeof(buf1));
 	memset(buf2,0,sizeof(buf2));
-	s20.encrypt(buf1,buf2,64);
+	s20.encrypt20(buf1,buf2,64);
 	if (memcmp(buf2,s20TV0Ks,64)) {
 		std::cout << "FAIL (test vector 0)" << std::endl;
 		return -1;
 	}
-	s20.init(s2012TV0Key,256,s2012TV0Iv,12);
+	s20.init(s2012TV0Key,256,s2012TV0Iv);
 	memset(buf1,0,sizeof(buf1));
 	memset(buf2,0,sizeof(buf2));
-	s20.encrypt(buf1,buf2,64);
+	s20.encrypt12(buf1,buf2,64);
 	if (memcmp(buf2,s2012TV0Ks,64)) {
 		std::cout << "FAIL (test vector 1)" << std::endl;
 		return -1;
@@ -195,34 +195,16 @@ static int testCrypto()
 	std::cout << "[crypto] Salsa20 SSE: DISABLED" << std::endl;
 #endif
 
-	std::cout << "[crypto] Benchmarking Salsa20/8... "; std::cout.flush();
-	{
-		unsigned char *bb = (unsigned char *)::malloc(1234567);
-		for(unsigned int i=0;i<1234567;++i)
-			bb[i] = (unsigned char)i;
-		Salsa20 s20(s20TV0Key,256,s20TV0Iv,8);
-		double bytes = 0.0;
-		uint64_t start = OSUtils::now();
-		for(unsigned int i=0;i<200;++i) {
-			s20.encrypt(bb,bb,1234567);
-			bytes += 1234567.0;
-		}
-		uint64_t end = OSUtils::now();
-		SHA512::hash(buf1,bb,1234567);
-		std::cout << ((bytes / 1048576.0) / ((double)(end - start) / 1000.0)) << " MiB/second (" << Utils::hex(buf1,16) << ')' << std::endl;
-		::free((void *)bb);
-	}
-
 	std::cout << "[crypto] Benchmarking Salsa20/12... "; std::cout.flush();
 	{
 		unsigned char *bb = (unsigned char *)::malloc(1234567);
 		for(unsigned int i=0;i<1234567;++i)
 			bb[i] = (unsigned char)i;
-		Salsa20 s20(s20TV0Key,256,s20TV0Iv,12);
+		Salsa20 s20(s20TV0Key,256,s20TV0Iv);
 		double bytes = 0.0;
 		uint64_t start = OSUtils::now();
 		for(unsigned int i=0;i<200;++i) {
-			s20.encrypt(bb,bb,1234567);
+			s20.encrypt12(bb,bb,1234567);
 			bytes += 1234567.0;
 		}
 		uint64_t end = OSUtils::now();
@@ -236,11 +218,11 @@ static int testCrypto()
 		unsigned char *bb = (unsigned char *)::malloc(1234567);
 		for(unsigned int i=0;i<1234567;++i)
 			bb[i] = (unsigned char)i;
-		Salsa20 s20(s20TV0Key,256,s20TV0Iv,20);
+		Salsa20 s20(s20TV0Key,256,s20TV0Iv);
 		double bytes = 0.0;
 		uint64_t start = OSUtils::now();
 		for(unsigned int i=0;i<200;++i) {
-			s20.encrypt(bb,bb,1234567);
+			s20.encrypt20(bb,bb,1234567);
 			bytes += 1234567.0;
 		}
 		uint64_t end = OSUtils::now();
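As in the Salsa20/8 benchmark removed above, each remaining benchmark loop reports throughput as total bytes processed divided by elapsed wall time, with OSUtils::now() returning milliseconds:

	// MiB/s as printed by the selftest benchmarks: 200 passes over a
	// 1234567-byte buffer, converted from bytes per millisecond to MiB/s.
	double mibPerSecond = (bytes / 1048576.0) / ((double)(end - start) / 1000.0);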