Browse Source

more fastitude

Adam Ierymenko 6 years ago
parent
commit
43e6a9e9ee
2 changed files with 48 additions and 68 deletions
  1. 38 57
      node/AES.hpp
  2. 10 11
      selftest.cpp

+ 38 - 57
node/AES.hpp

@@ -505,73 +505,54 @@ private:
 		const __m128i k13 = _k.ni.k[13];
 		const __m128i k14 = _k.ni.k[14];
 
-		while (len >= 64) {
+#define ZT_AES_CTR_AESNI_ROUND(k) \
+	c0 = _mm_aesenc_si128(c0,k); \
+	c1 = _mm_aesenc_si128(c1,k); \
+	c2 = _mm_aesenc_si128(c2,k); \
+	c3 = _mm_aesenc_si128(c3,k); \
+	c4 = _mm_aesenc_si128(c4,k); \
+	c5 = _mm_aesenc_si128(c5,k); \
+	c6 = _mm_aesenc_si128(c6,k); \
+	c7 = _mm_aesenc_si128(c7,k)
+
+		while (len >= 128) {
 			__m128i c0 = _mm_xor_si128(_mm_set_epi64((__m64)Utils::hton(ctr),iv0),k0);
 			__m128i c1 = _mm_xor_si128(_mm_set_epi64((__m64)Utils::hton((uint64_t)(ctr+1ULL)),iv0),k0);
 			__m128i c2 = _mm_xor_si128(_mm_set_epi64((__m64)Utils::hton((uint64_t)(ctr+2ULL)),iv0),k0);
 			__m128i c3 = _mm_xor_si128(_mm_set_epi64((__m64)Utils::hton((uint64_t)(ctr+3ULL)),iv0),k0);
-			ctr += 4;
-			c0 = _mm_aesenc_si128(c0,k1);
-			c1 = _mm_aesenc_si128(c1,k1);
-			c2 = _mm_aesenc_si128(c2,k1);
-			c3 = _mm_aesenc_si128(c3,k1);
-			c0 = _mm_aesenc_si128(c0,k2);
-			c1 = _mm_aesenc_si128(c1,k2);
-			c2 = _mm_aesenc_si128(c2,k2);
-			c3 = _mm_aesenc_si128(c3,k2);
-			c0 = _mm_aesenc_si128(c0,k3);
-			c1 = _mm_aesenc_si128(c1,k3);
-			c2 = _mm_aesenc_si128(c2,k3);
-			c3 = _mm_aesenc_si128(c3,k3);
-			c0 = _mm_aesenc_si128(c0,k4);
-			c1 = _mm_aesenc_si128(c1,k4);
-			c2 = _mm_aesenc_si128(c2,k4);
-			c3 = _mm_aesenc_si128(c3,k4);
-			c0 = _mm_aesenc_si128(c0,k5);
-			c1 = _mm_aesenc_si128(c1,k5);
-			c2 = _mm_aesenc_si128(c2,k5);
-			c3 = _mm_aesenc_si128(c3,k5);
-			c0 = _mm_aesenc_si128(c0,k6);
-			c1 = _mm_aesenc_si128(c1,k6);
-			c2 = _mm_aesenc_si128(c2,k6);
-			c3 = _mm_aesenc_si128(c3,k6);
-			c0 = _mm_aesenc_si128(c0,k7);
-			c1 = _mm_aesenc_si128(c1,k7);
-			c2 = _mm_aesenc_si128(c2,k7);
-			c3 = _mm_aesenc_si128(c3,k7);
-			c0 = _mm_aesenc_si128(c0,k8);
-			c1 = _mm_aesenc_si128(c1,k8);
-			c2 = _mm_aesenc_si128(c2,k8);
-			c3 = _mm_aesenc_si128(c3,k8);
-			c0 = _mm_aesenc_si128(c0,k9);
-			c1 = _mm_aesenc_si128(c1,k9);
-			c2 = _mm_aesenc_si128(c2,k9);
-			c3 = _mm_aesenc_si128(c3,k9);
-			c0 = _mm_aesenc_si128(c0,k10);
-			c1 = _mm_aesenc_si128(c1,k10);
-			c2 = _mm_aesenc_si128(c2,k10);
-			c3 = _mm_aesenc_si128(c3,k10);
-			c0 = _mm_aesenc_si128(c0,k11);
-			c1 = _mm_aesenc_si128(c1,k11);
-			c2 = _mm_aesenc_si128(c2,k11);
-			c3 = _mm_aesenc_si128(c3,k11);
-			c0 = _mm_aesenc_si128(c0,k12);
-			c1 = _mm_aesenc_si128(c1,k12);
-			c2 = _mm_aesenc_si128(c2,k12);
-			c3 = _mm_aesenc_si128(c3,k12);
-			c0 = _mm_aesenc_si128(c0,k13);
-			c1 = _mm_aesenc_si128(c1,k13);
-			c2 = _mm_aesenc_si128(c2,k13);
-			c3 = _mm_aesenc_si128(c3,k13);
+			__m128i c4 = _mm_xor_si128(_mm_set_epi64((__m64)Utils::hton((uint64_t)(ctr+4ULL)),iv0),k0);
+			__m128i c5 = _mm_xor_si128(_mm_set_epi64((__m64)Utils::hton((uint64_t)(ctr+5ULL)),iv0),k0);
+			__m128i c6 = _mm_xor_si128(_mm_set_epi64((__m64)Utils::hton((uint64_t)(ctr+6ULL)),iv0),k0);
+			__m128i c7 = _mm_xor_si128(_mm_set_epi64((__m64)Utils::hton((uint64_t)(ctr+7ULL)),iv0),k0);
+			ctr += 8;
+			ZT_AES_CTR_AESNI_ROUND(k1);
+			ZT_AES_CTR_AESNI_ROUND(k2);
+			ZT_AES_CTR_AESNI_ROUND(k3);
+			ZT_AES_CTR_AESNI_ROUND(k4);
+			ZT_AES_CTR_AESNI_ROUND(k5);
+			ZT_AES_CTR_AESNI_ROUND(k6);
+			ZT_AES_CTR_AESNI_ROUND(k7);
+			ZT_AES_CTR_AESNI_ROUND(k8);
+			ZT_AES_CTR_AESNI_ROUND(k9);
+			ZT_AES_CTR_AESNI_ROUND(k10);
+			ZT_AES_CTR_AESNI_ROUND(k11);
+			ZT_AES_CTR_AESNI_ROUND(k12);
+			ZT_AES_CTR_AESNI_ROUND(k13);
 			_mm_storeu_si128((__m128i *)out,_mm_xor_si128(_mm_loadu_si128((const __m128i *)in),_mm_aesenclast_si128(c0,k14)));
 			_mm_storeu_si128((__m128i *)(out + 16),_mm_xor_si128(_mm_loadu_si128((const __m128i *)(in + 16)),_mm_aesenclast_si128(c1,k14)));
 			_mm_storeu_si128((__m128i *)(out + 32),_mm_xor_si128(_mm_loadu_si128((const __m128i *)(in + 32)),_mm_aesenclast_si128(c2,k14)));
 			_mm_storeu_si128((__m128i *)(out + 48),_mm_xor_si128(_mm_loadu_si128((const __m128i *)(in + 48)),_mm_aesenclast_si128(c3,k14)));
-			in += 64;
-			out += 64;
-			len -= 64;
+			_mm_storeu_si128((__m128i *)(out + 64),_mm_xor_si128(_mm_loadu_si128((const __m128i *)(in + 64)),_mm_aesenclast_si128(c4,k14)));
+			_mm_storeu_si128((__m128i *)(out + 80),_mm_xor_si128(_mm_loadu_si128((const __m128i *)(in + 80)),_mm_aesenclast_si128(c5,k14)));
+			_mm_storeu_si128((__m128i *)(out + 96),_mm_xor_si128(_mm_loadu_si128((const __m128i *)(in + 96)),_mm_aesenclast_si128(c6,k14)));
+			_mm_storeu_si128((__m128i *)(out + 112),_mm_xor_si128(_mm_loadu_si128((const __m128i *)(in + 112)),_mm_aesenclast_si128(c7,k14)));
+			in += 128;
+			out += 128;
+			len -= 128;
 		}
 
+#undef ZT_AES_CTR_AESNI_ROUND
+
 		while (len >= 16) {
 			__m128i c0 = _mm_xor_si128(_mm_set_epi64((__m64)Utils::hton(ctr++),(__m64)iv0),k0);
 			c0 = _mm_aesenc_si128(c0,k1);

+ 10 - 11
selftest.cpp

@@ -207,33 +207,32 @@ static int testCrypto()
 			}
 			std::cout << "OK" ZT_EOL_S << "  GMAC-AES-256 (benchmark): "; std::cout.flush();
 			int64_t start = OSUtils::now();
-			for(unsigned long i=0;i<200000;++i) {
-				tv.gmac((const uint8_t *)buf1,buf1,sizeof(buf1),(uint8_t *)buf1);
+			for(unsigned long i=0;i<500000;++i) {
+				tv.gmac((const uint8_t *)buf1,buf1,ZT_DEFAULT_MTU,(uint8_t *)buf1);
 			}
 			int64_t end = OSUtils::now();
 			*dummy = hexbuf[0];
-			std::cout << (((double)(200000 * sizeof(buf1)) / 1048576.0) / ((double)(end - start) / 1000.0)) << " MiB/second" ZT_EOL_S;
+			std::cout << (((double)(500000 * ZT_DEFAULT_MTU) / 1048576.0) / ((double)(end - start) / 1000.0)) << " MiB/second (dummy: " << (unsigned int)*dummy << ")" ZT_EOL_S;
 			std::cout << "  AES-256-CTR (benchmark): "; std::cout.flush();
 			start = OSUtils::now();
-			for(unsigned long i=0;i<200000;++i) {
-				tv.ctr((const uint8_t *)hexbuf,buf1,sizeof(buf1),buf2);
-				hexbuf[0] = buf2[0];
+			for(unsigned long i=0;i<500000;++i) {
+				tv.ctr((const uint8_t *)hexbuf,buf1,ZT_DEFAULT_MTU,buf1);
+				*dummy = buf1[0];
 			}
 			end = OSUtils::now();
-			*dummy = buf2[0];
-			std::cout << (((double)(200000 * sizeof(buf1)) / 1048576.0) / ((double)(end - start) / 1000.0)) << " MiB/second" ZT_EOL_S;
+			std::cout << (((double)(500000 * ZT_DEFAULT_MTU) / 1048576.0) / ((double)(end - start) / 1000.0)) << " MiB/second (dummy: " << (unsigned int)*dummy << ")" ZT_EOL_S;
 		}
 		{
 			std::cout << "  AES-256-GMAC-CTR (benchmark): "; std::cout.flush();
 			AES k1,k2,k3,k4;
 			AES::initGmacCtrKeys(AES_TEST_VECTOR_0_KEY,k1,k2,k3,k4);
 			int64_t start = OSUtils::now();
-			for(unsigned long i=0;i<200000;++i) {
-				AES::ztGmacCtrEncrypt(k1,k2,k3,k4,(const uint8_t *)hexbuf,buf1,sizeof(buf1),buf1,(uint8_t *)(hexbuf + 8));
+			for(unsigned long i=0;i<500000;++i) {
+				AES::ztGmacCtrEncrypt(k1,k2,k3,k4,(const uint8_t *)hexbuf,buf1,ZT_DEFAULT_MTU,buf1,(uint8_t *)(hexbuf + 8));
 				*dummy = buf1[0];
 			}
 			int64_t end = OSUtils::now();
-			std::cout << (((double)(200000 * sizeof(buf1)) / 1048576.0) / ((double)(end - start) / 1000.0)) << " MiB/second" ZT_EOL_S;
+			std::cout << (((double)(500000 * ZT_DEFAULT_MTU) / 1048576.0) / ((double)(end - start) / 1000.0)) << " MiB/second (dummy: " << (unsigned int)*dummy << ")" ZT_EOL_S;
 		}
 	}