@@ -14,8 +14,6 @@
#include "Constants.hpp"
#include "AES.hpp"

-#include <cstdio>
-
namespace ZeroTier {

// GMAC ---------------------------------------------------------------------------------------------------------------
@@ -191,90 +189,93 @@ void AES::GMAC::update(const void *const data,unsigned int len) noexcept
}
}

- while (len >= 64) {
- __m128i d1 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(in));
- __m128i d2 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(in + 16));
- __m128i d3 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(in + 32));
- __m128i d4 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(in + 48));
-
- in += 64;
- len -= 64;
-
- // This does 4X parallel mult_block via instruction level parallelism.
- d1 = _mm_shuffle_epi8(_mm_xor_si128(y,d1),shuf);
- d2 = _mm_shuffle_epi8(d2,shuf);
- d3 = _mm_shuffle_epi8(d3,shuf);
- d4 = _mm_shuffle_epi8(d4,shuf);
- __m128i t0 = _mm_clmulepi64_si128(_aes._k.ni.hhhh,d1,0x00);
- __m128i t1 = _mm_clmulepi64_si128(_aes._k.ni.hhh,d2,0x00);
- __m128i t2 = _mm_clmulepi64_si128(_aes._k.ni.hh,d3,0x00);
- __m128i t3 = _mm_clmulepi64_si128(_aes._k.ni.h,d4,0x00);
- __m128i t8 = _mm_xor_si128(t0,t1);
- t8 = _mm_xor_si128(t8,t2);
- t8 = _mm_xor_si128(t8,t3);
- __m128i t4 = _mm_clmulepi64_si128(_aes._k.ni.hhhh,d1,0x11);
- __m128i t5 = _mm_clmulepi64_si128(_aes._k.ni.hhh,d2,0x11);
- __m128i t6 = _mm_clmulepi64_si128(_aes._k.ni.hh,d3,0x11);
- __m128i t7 = _mm_clmulepi64_si128(_aes._k.ni.h,d4,0x11);
- __m128i t9 = _mm_xor_si128(t4,t5);
- t9 = _mm_xor_si128(t9,t6);
- t9 = _mm_xor_si128(t9,t7);
- t0 = _mm_shuffle_epi32(_aes._k.ni.hhhh,78);
- t4 = _mm_shuffle_epi32(d1,78);
- t0 = _mm_xor_si128(t0,_aes._k.ni.hhhh);
- t4 = _mm_xor_si128(t4,d1);
- t1 = _mm_shuffle_epi32(_aes._k.ni.hhh,78);
- t5 = _mm_shuffle_epi32(d2,78);
- t1 = _mm_xor_si128(t1,_aes._k.ni.hhh);
- t5 = _mm_xor_si128(t5,d2);
- t2 = _mm_shuffle_epi32(_aes._k.ni.hh,78);
- t6 = _mm_shuffle_epi32(d3,78);
- t2 = _mm_xor_si128(t2,_aes._k.ni.hh);
- t6 = _mm_xor_si128(t6,d3);
- t3 = _mm_shuffle_epi32(_aes._k.ni.h,78);
- t7 = _mm_shuffle_epi32(d4,78);
- t3 = _mm_xor_si128(t3,_aes._k.ni.h);
- t7 = _mm_xor_si128(t7,d4);
- t0 = _mm_clmulepi64_si128(t0,t4,0x00);
- t1 = _mm_clmulepi64_si128(t1,t5,0x00);
- t2 = _mm_clmulepi64_si128(t2,t6,0x00);
- t3 = _mm_clmulepi64_si128(t3,t7,0x00);
- t0 = _mm_xor_si128(t0,t8);
- t0 = _mm_xor_si128(t0,t9);
- t0 = _mm_xor_si128(t1,t0);
- t0 = _mm_xor_si128(t2,t0);
- t0 = _mm_xor_si128(t3,t0);
- t4 = _mm_slli_si128(t0,8);
- t0 = _mm_srli_si128(t0,8);
- t3 = _mm_xor_si128(t4,t8);
- t6 = _mm_xor_si128(t0,t9);
- t7 = _mm_srli_epi32(t3,31);
- t8 = _mm_srli_epi32(t6,31);
- t3 = _mm_slli_epi32(t3,1);
- t6 = _mm_slli_epi32(t6,1);
- t9 = _mm_srli_si128(t7,12);
- t8 = _mm_slli_si128(t8,4);
- t7 = _mm_slli_si128(t7,4);
- t3 = _mm_or_si128(t3,t7);
- t6 = _mm_or_si128(t6,t8);
- t6 = _mm_or_si128(t6,t9);
- t7 = _mm_slli_epi32(t3,31);
- t8 = _mm_slli_epi32(t3,30);
- t9 = _mm_slli_epi32(t3,25);
- t7 = _mm_xor_si128(t7,t8);
- t7 = _mm_xor_si128(t7,t9);
- t8 = _mm_srli_si128(t7,4);
- t7 = _mm_slli_si128(t7,12);
- t3 = _mm_xor_si128(t3,t7);
- t2 = _mm_srli_epi32(t3,1);
- t4 = _mm_srli_epi32(t3,2);
- t5 = _mm_srli_epi32(t3,7);
- t2 = _mm_xor_si128(t2,t4);
- t2 = _mm_xor_si128(t2,t5);
- t2 = _mm_xor_si128(t2,t8);
- t3 = _mm_xor_si128(t3,t2);
- t6 = _mm_xor_si128(t6,t3);
- y = _mm_shuffle_epi8(t6,shuf);
+ if (likely(len >= 64)) {
+ const __m128i h = _aes._k.ni.h;
+ const __m128i hh = _aes._k.ni.hh;
+ const __m128i hhh = _aes._k.ni.hhh;
+ const __m128i hhhh = _aes._k.ni.hhhh;
+ do {
+ __m128i d1 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(in));
+ __m128i d2 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(in + 16));
+ __m128i d3 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(in + 32));
+ __m128i d4 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(in + 48));
+ in += 64;
+ len -= 64;
+ d1 = _mm_shuffle_epi8(_mm_xor_si128(y,d1),shuf);
+ d2 = _mm_shuffle_epi8(d2,shuf);
+ d3 = _mm_shuffle_epi8(d3,shuf);
+ d4 = _mm_shuffle_epi8(d4,shuf);
+ __m128i t0 = _mm_clmulepi64_si128(hhhh,d1,0x00);
+ __m128i t1 = _mm_clmulepi64_si128(hhh,d2,0x00);
+ __m128i t2 = _mm_clmulepi64_si128(hh,d3,0x00);
+ __m128i t8 = _mm_xor_si128(t0,t1);
+ t8 = _mm_xor_si128(t8,t2);
+ __m128i t3 = _mm_clmulepi64_si128(h,d4,0x00);
+ __m128i t4 = _mm_clmulepi64_si128(hhhh,d1,0x11);
+ __m128i t5 = _mm_clmulepi64_si128(hhh,d2,0x11);
+ t8 = _mm_xor_si128(t8,t3);
+ __m128i t6 = _mm_clmulepi64_si128(hh,d3,0x11);
+ __m128i t7 = _mm_clmulepi64_si128(h,d4,0x11);
+ __m128i t9 = _mm_xor_si128(t4,t5);
+ t9 = _mm_xor_si128(t9,t6);
+ t9 = _mm_xor_si128(t9,t7);
+ t0 = _mm_shuffle_epi32(hhhh,78);
+ t4 = _mm_shuffle_epi32(d1,78);
+ t0 = _mm_xor_si128(t0,hhhh);
+ t4 = _mm_xor_si128(t4,d1);
+ t1 = _mm_shuffle_epi32(hhh,78);
+ t5 = _mm_shuffle_epi32(d2,78);
+ t1 = _mm_xor_si128(t1,hhh);
+ t5 = _mm_xor_si128(t5,d2);
+ t2 = _mm_shuffle_epi32(hh,78);
+ t6 = _mm_shuffle_epi32(d3,78);
+ t2 = _mm_xor_si128(t2,hh);
+ t6 = _mm_xor_si128(t6,d3);
+ t3 = _mm_shuffle_epi32(h,78);
+ t7 = _mm_shuffle_epi32(d4,78);
+ t3 = _mm_xor_si128(t3,h);
+ t7 = _mm_xor_si128(t7,d4);
+ t0 = _mm_clmulepi64_si128(t0,t4,0x00);
+ t1 = _mm_clmulepi64_si128(t1,t5,0x00);
+ t2 = _mm_clmulepi64_si128(t2,t6,0x00);
+ t3 = _mm_clmulepi64_si128(t3,t7,0x00);
+ t0 = _mm_xor_si128(t0,t8);
+ t0 = _mm_xor_si128(t0,t9);
+ t0 = _mm_xor_si128(t1,t0);
+ t0 = _mm_xor_si128(t2,t0);
+ t0 = _mm_xor_si128(t3,t0);
+ t4 = _mm_slli_si128(t0,8);
+ t0 = _mm_srli_si128(t0,8);
+ t3 = _mm_xor_si128(t4,t8);
+ t6 = _mm_xor_si128(t0,t9);
+ t7 = _mm_srli_epi32(t3,31);
+ t8 = _mm_srli_epi32(t6,31);
+ t3 = _mm_slli_epi32(t3,1);
+ t6 = _mm_slli_epi32(t6,1);
+ t9 = _mm_srli_si128(t7,12);
+ t8 = _mm_slli_si128(t8,4);
+ t7 = _mm_slli_si128(t7,4);
+ t3 = _mm_or_si128(t3,t7);
+ t6 = _mm_or_si128(t6,t8);
+ t6 = _mm_or_si128(t6,t9);
+ t7 = _mm_slli_epi32(t3,31);
+ t8 = _mm_slli_epi32(t3,30);
+ t9 = _mm_slli_epi32(t3,25);
+ t7 = _mm_xor_si128(t7,t8);
+ t7 = _mm_xor_si128(t7,t9);
+ t8 = _mm_srli_si128(t7,4);
+ t7 = _mm_slli_si128(t7,12);
+ t3 = _mm_xor_si128(t3,t7);
+ t2 = _mm_srli_epi32(t3,1);
+ t4 = _mm_srli_epi32(t3,2);
+ t5 = _mm_srli_epi32(t3,7);
+ t2 = _mm_xor_si128(t2,t4);
+ t2 = _mm_xor_si128(t2,t5);
+ t2 = _mm_xor_si128(t2,t8);
+ t3 = _mm_xor_si128(t3,t2);
+ t6 = _mm_xor_si128(t6,t3);
+ y = _mm_shuffle_epi8(t6,shuf);
+ } while (len >= 64);
}

while (len >= 16) {
@@ -476,29 +477,13 @@ void AES::CTR::crypt(const void *const input,unsigned int len) noexcept
if (likely(Utils::CPUID.aes)) {
uint64_t c0 = _ctr[0];
uint64_t c1 = Utils::ntoh(_ctr[1]);
-
- // This uses some spare XMM registers to hold some of the key.
const __m128i *const k = _aes._k.ni.k;
- const __m128i k0 = k[0];
- const __m128i k1 = k[1];
- const __m128i k2 = k[2];
- const __m128i k3 = k[3];
- const __m128i k4 = k[4];
- const __m128i k5 = k[5];

// Complete any unfinished blocks from previous calls to crypt().
unsigned int totalLen = _len;
if ((totalLen & 15U)) {
- const __m128i k7 = k[7];
- const __m128i k8 = k[8];
- const __m128i k9 = k[9];
- const __m128i k10 = k[10];
- const __m128i k11 = k[11];
- const __m128i k12 = k[12];
- const __m128i k13 = k[13];
- const __m128i k14 = k[14];
for (;;) {
- if (!len) {
+ if (unlikely(!len)) {
_ctr[0] = c0;
_ctr[1] = Utils::hton(c1);
_len = totalLen;
@@ -508,152 +493,260 @@ void AES::CTR::crypt(const void *const input,unsigned int len) noexcept
out[totalLen++] = *(in++);
if (!(totalLen & 15U)) {
__m128i d0 = _mm_set_epi64x((long long)Utils::hton(c1++),(long long)c0);
+ d0 = _mm_xor_si128(d0,k[0]);
+ d0 = _mm_aesenc_si128(d0,k[1]);
+ d0 = _mm_aesenc_si128(d0,k[2]);
+ d0 = _mm_aesenc_si128(d0,k[3]);
+ d0 = _mm_aesenc_si128(d0,k[4]);
+ d0 = _mm_aesenc_si128(d0,k[5]);
+ d0 = _mm_aesenc_si128(d0,k[6]);
+ d0 = _mm_aesenc_si128(d0,k[7]);
+ d0 = _mm_aesenc_si128(d0,k[8]);
+ d0 = _mm_aesenc_si128(d0,k[9]);
+ d0 = _mm_aesenc_si128(d0,k[10]);
+ __m128i *const outblk = reinterpret_cast<__m128i *>(out + (totalLen - 16));
+ d0 = _mm_aesenc_si128(d0,k[11]);
+ const __m128i p0 = _mm_loadu_si128(outblk);
+ d0 = _mm_aesenc_si128(d0,k[12]);
+ d0 = _mm_aesenc_si128(d0,k[13]);
+ d0 = _mm_aesenclast_si128(d0,k[14]);
+ _mm_storeu_si128(outblk,_mm_xor_si128(p0,d0));
+ break;
+ }
+ }
+ }
+
+ out += totalLen;
+ _len = totalLen + len;
+
+ if (likely(len >= 64)) {
+ if (Utils::CPUID.vaes) { // is only true if AVX is also present
+ if ((!Utils::CPUID.avx512f)||((len < 1024))) {
+ const __m256i kk0 = _mm256_broadcastsi128_si256(k[0]);
+ const __m256i kk1 = _mm256_broadcastsi128_si256(k[1]);
+ const __m256i kk2 = _mm256_broadcastsi128_si256(k[2]);
+ const __m256i kk3 = _mm256_broadcastsi128_si256(k[3]);
+ const __m256i kk4 = _mm256_broadcastsi128_si256(k[4]);
+ const __m256i kk5 = _mm256_broadcastsi128_si256(k[5]);
+ const __m256i kk6 = _mm256_broadcastsi128_si256(k[6]);
+ const __m256i kk7 = _mm256_broadcastsi128_si256(k[7]);
+ const __m256i kk8 = _mm256_broadcastsi128_si256(k[8]);
+ const __m256i kk9 = _mm256_broadcastsi128_si256(k[9]);
+ const __m256i kk10 = _mm256_broadcastsi128_si256(k[10]);
+ const __m256i kk11 = _mm256_broadcastsi128_si256(k[11]);
+ const __m256i kk12 = _mm256_broadcastsi128_si256(k[12]);
+ const __m256i kk13 = _mm256_broadcastsi128_si256(k[13]);
+ const __m256i kk14 = _mm256_broadcastsi128_si256(k[14]);
+ do {
+ __m256i d0 = _mm256_set_epi64x(
+ (long long)Utils::hton(c1 + 1ULL),(long long)c0,
+ (long long)Utils::hton(c1),(long long)c0);
+ __m256i d1 = _mm256_set_epi64x(
+ (long long)Utils::hton(c1 + 3ULL),(long long)c0,
+ (long long)Utils::hton(c1 + 2ULL),(long long)c0);
+ c1 += 4;
+ __m256i p0 = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(in));
+ __m256i p1 = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(in + 32));
+ in += 64;
+ d0 = _mm256_xor_si256(d0,kk0);
+ d1 = _mm256_xor_si256(d1,kk0);
+ d0 = _mm256_aesenc_epi128(d0,kk1);
+ d1 = _mm256_aesenc_epi128(d1,kk1);
+ d0 = _mm256_aesenc_epi128(d0,kk2);
+ d1 = _mm256_aesenc_epi128(d1,kk2);
+ d0 = _mm256_aesenc_epi128(d0,kk3);
+ d1 = _mm256_aesenc_epi128(d1,kk3);
+ d0 = _mm256_aesenc_epi128(d0,kk4);
+ d1 = _mm256_aesenc_epi128(d1,kk4);
+ d0 = _mm256_aesenc_epi128(d0,kk5);
+ d1 = _mm256_aesenc_epi128(d1,kk5);
+ d0 = _mm256_aesenc_epi128(d0,kk6);
+ d1 = _mm256_aesenc_epi128(d1,kk6);
+ d0 = _mm256_aesenc_epi128(d0,kk7);
+ d1 = _mm256_aesenc_epi128(d1,kk7);
+ d0 = _mm256_aesenc_epi128(d0,kk8);
+ d1 = _mm256_aesenc_epi128(d1,kk8);
+ d0 = _mm256_aesenc_epi128(d0,kk9);
+ d1 = _mm256_aesenc_epi128(d1,kk9);
+ d0 = _mm256_aesenc_epi128(d0,kk10);
+ d1 = _mm256_aesenc_epi128(d1,kk10);
+ d0 = _mm256_aesenc_epi128(d0,kk11);
+ d1 = _mm256_aesenc_epi128(d1,kk11);
+ d0 = _mm256_aesenc_epi128(d0,kk12);
+ d1 = _mm256_aesenc_epi128(d1,kk12);
+ d0 = _mm256_aesenc_epi128(d0,kk13);
+ d1 = _mm256_aesenc_epi128(d1,kk13);
+ d0 = _mm256_aesenclast_epi128(d0,kk14);
+ d1 = _mm256_aesenclast_epi128(d1,kk14);
+ _mm256_storeu_si256(reinterpret_cast<__m256i *>(out),_mm256_xor_si256(d0,p0));
+ _mm256_storeu_si256(reinterpret_cast<__m256i *>(out + 32),_mm256_xor_si256(d1,p1));
+ out += 64;
+ len -= 64;
+ } while (len >= 64);
+ } else {
+ const __m512i kk0 = _mm512_broadcast_i32x4(k[0]);
+ const __m512i kk1 = _mm512_broadcast_i32x4(k[1]);
+ const __m512i kk2 = _mm512_broadcast_i32x4(k[2]);
+ const __m512i kk3 = _mm512_broadcast_i32x4(k[3]);
+ const __m512i kk4 = _mm512_broadcast_i32x4(k[4]);
+ const __m512i kk5 = _mm512_broadcast_i32x4(k[5]);
+ const __m512i kk6 = _mm512_broadcast_i32x4(k[6]);
+ const __m512i kk7 = _mm512_broadcast_i32x4(k[7]);
+ const __m512i kk8 = _mm512_broadcast_i32x4(k[8]);
+ const __m512i kk9 = _mm512_broadcast_i32x4(k[9]);
+ const __m512i kk10 = _mm512_broadcast_i32x4(k[10]);
+ const __m512i kk11 = _mm512_broadcast_i32x4(k[11]);
+ const __m512i kk12 = _mm512_broadcast_i32x4(k[12]);
+ const __m512i kk13 = _mm512_broadcast_i32x4(k[13]);
+ const __m512i kk14 = _mm512_broadcast_i32x4(k[14]);
+ do {
+ __m512i d0 = _mm512_set_epi64(
+ (long long)Utils::hton(c1 + 3ULL),(long long)c0,
+ (long long)Utils::hton(c1 + 2ULL),(long long)c0,
+ (long long)Utils::hton(c1 + 1ULL),(long long)c0,
+ (long long)Utils::hton(c1),(long long)c0);
+ c1 += 4;
+ __m512i p0 = _mm512_loadu_si512(reinterpret_cast<const __m512i *>(in));
+ in += 64;
+ d0 = _mm512_xor_si512(d0,kk0);
+ d0 = _mm512_aesenc_epi128(d0,kk1);
+ d0 = _mm512_aesenc_epi128(d0,kk2);
+ d0 = _mm512_aesenc_epi128(d0,kk3);
+ d0 = _mm512_aesenc_epi128(d0,kk4);
+ d0 = _mm512_aesenc_epi128(d0,kk5);
+ d0 = _mm512_aesenc_epi128(d0,kk6);
+ d0 = _mm512_aesenc_epi128(d0,kk7);
+ d0 = _mm512_aesenc_epi128(d0,kk8);
+ d0 = _mm512_aesenc_epi128(d0,kk9);
+ d0 = _mm512_aesenc_epi128(d0,kk10);
+ d0 = _mm512_aesenc_epi128(d0,kk11);
+ d0 = _mm512_aesenc_epi128(d0,kk12);
+ d0 = _mm512_aesenc_epi128(d0,kk13);
+ d0 = _mm512_aesenclast_epi128(d0,kk14);
+ _mm512_storeu_si512(reinterpret_cast<__m512i *>(out),_mm512_xor_si512(p0,d0));
+ out += 64;
+ len -= 64;
+ } while (len >= 64);
+ }
+ } else {
+ const __m128i k0 = k[0];
+ const __m128i k1 = k[1];
+ const __m128i k2 = k[2];
+ const __m128i k3 = k[3];
+ const __m128i k4 = k[4];
+ const __m128i k5 = k[5];
+ const __m128i k6 = k[6];
+ const __m128i k7 = k[7];
+ const __m128i k8 = k[8];
+ const __m128i k9 = k[9];
+ const __m128i k10 = k[10];
+ const __m128i k11 = k[11];
+ const __m128i k12 = k[12];
+ const __m128i k13 = k[13];
+ const __m128i k14 = k[14];
+ do {
+ __m128i d0 = _mm_set_epi64x((long long)Utils::hton(c1),(long long)c0);
+ __m128i d1 = _mm_set_epi64x((long long)Utils::hton(c1 + 1ULL),(long long)c0);
+ __m128i d2 = _mm_set_epi64x((long long)Utils::hton(c1 + 2ULL),(long long)c0);
+ __m128i d3 = _mm_set_epi64x((long long)Utils::hton(c1 + 3ULL),(long long)c0);
+ c1 += 4;
d0 = _mm_xor_si128(d0,k0);
+ d1 = _mm_xor_si128(d1,k0);
+ d2 = _mm_xor_si128(d2,k0);
+ d3 = _mm_xor_si128(d3,k0);
d0 = _mm_aesenc_si128(d0,k1);
+ d1 = _mm_aesenc_si128(d1,k1);
+ d2 = _mm_aesenc_si128(d2,k1);
+ d3 = _mm_aesenc_si128(d3,k1);
d0 = _mm_aesenc_si128(d0,k2);
+ d1 = _mm_aesenc_si128(d1,k2);
+ d2 = _mm_aesenc_si128(d2,k2);
+ d3 = _mm_aesenc_si128(d3,k2);
d0 = _mm_aesenc_si128(d0,k3);
+ d1 = _mm_aesenc_si128(d1,k3);
+ d2 = _mm_aesenc_si128(d2,k3);
+ d3 = _mm_aesenc_si128(d3,k3);
d0 = _mm_aesenc_si128(d0,k4);
+ d1 = _mm_aesenc_si128(d1,k4);
+ d2 = _mm_aesenc_si128(d2,k4);
+ d3 = _mm_aesenc_si128(d3,k4);
d0 = _mm_aesenc_si128(d0,k5);
- d0 = _mm_aesenc_si128(d0,k[6]);
+ d1 = _mm_aesenc_si128(d1,k5);
+ d2 = _mm_aesenc_si128(d2,k5);
+ d3 = _mm_aesenc_si128(d3,k5);
+ d0 = _mm_aesenc_si128(d0,k6);
+ d1 = _mm_aesenc_si128(d1,k6);
+ d2 = _mm_aesenc_si128(d2,k6);
+ d3 = _mm_aesenc_si128(d3,k6);
d0 = _mm_aesenc_si128(d0,k7);
+ d1 = _mm_aesenc_si128(d1,k7);
+ d2 = _mm_aesenc_si128(d2,k7);
+ d3 = _mm_aesenc_si128(d3,k7);
d0 = _mm_aesenc_si128(d0,k8);
+ d1 = _mm_aesenc_si128(d1,k8);
+ d2 = _mm_aesenc_si128(d2,k8);
+ d3 = _mm_aesenc_si128(d3,k8);
d0 = _mm_aesenc_si128(d0,k9);
+ d1 = _mm_aesenc_si128(d1,k9);
+ d2 = _mm_aesenc_si128(d2,k9);
+ d3 = _mm_aesenc_si128(d3,k9);
d0 = _mm_aesenc_si128(d0,k10);
- __m128i *const outblk = reinterpret_cast<__m128i *>(out + (totalLen - 16));
+ d1 = _mm_aesenc_si128(d1,k10);
+ d2 = _mm_aesenc_si128(d2,k10);
+ d3 = _mm_aesenc_si128(d3,k10);
+ __m128i p0 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(in));
d0 = _mm_aesenc_si128(d0,k11);
- const __m128i p0 = _mm_loadu_si128(outblk);
+ d1 = _mm_aesenc_si128(d1,k11);
+ d2 = _mm_aesenc_si128(d2,k11);
+ d3 = _mm_aesenc_si128(d3,k11);
+ __m128i p1 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(in + 16));
d0 = _mm_aesenc_si128(d0,k12);
+ d1 = _mm_aesenc_si128(d1,k12);
+ d2 = _mm_aesenc_si128(d2,k12);
+ d3 = _mm_aesenc_si128(d3,k12);
+ __m128i p2 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(in + 32));
d0 = _mm_aesenc_si128(d0,k13);
+ d1 = _mm_aesenc_si128(d1,k13);
+ d2 = _mm_aesenc_si128(d2,k13);
+ d3 = _mm_aesenc_si128(d3,k13);
+ __m128i p3 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(in + 48));
+ in += 64;
d0 = _mm_aesenclast_si128(d0,k14);
- _mm_storeu_si128(outblk,_mm_xor_si128(p0,d0));
- break;
- }
+ d1 = _mm_aesenclast_si128(d1,k14);
+ d2 = _mm_aesenclast_si128(d2,k14);
+ d3 = _mm_aesenclast_si128(d3,k14);
+ p0 = _mm_xor_si128(d0,p0);
+ p1 = _mm_xor_si128(d1,p1);
+ p2 = _mm_xor_si128(d2,p2);
+ p3 = _mm_xor_si128(d3,p3);
+ _mm_storeu_si128(reinterpret_cast<__m128i *>(out),p0);
+ _mm_storeu_si128(reinterpret_cast<__m128i *>(out + 16),p1);
+ _mm_storeu_si128(reinterpret_cast<__m128i *>(out + 32),p2);
+ _mm_storeu_si128(reinterpret_cast<__m128i *>(out + 48),p3);
+ out += 64;
+ len -= 64;
+ } while (len >= 64);
}
}

- out += totalLen;
- _len = (totalLen + len);
-
- while (len >= 64) {
- __m128i d0 = _mm_set_epi64x((long long)Utils::hton(c1),(long long)c0);
- __m128i d1 = _mm_set_epi64x((long long)Utils::hton(c1 + 1ULL),(long long)c0);
- __m128i d2 = _mm_set_epi64x((long long)Utils::hton(c1 + 2ULL),(long long)c0);
- __m128i d3 = _mm_set_epi64x((long long)Utils::hton(c1 + 3ULL),(long long)c0);
- c1 += 4;
-
- d0 = _mm_xor_si128(d0,k0);
- d1 = _mm_xor_si128(d1,k0);
- d2 = _mm_xor_si128(d2,k0);
- d3 = _mm_xor_si128(d3,k0);
- d0 = _mm_aesenc_si128(d0,k1);
- d1 = _mm_aesenc_si128(d1,k1);
- d2 = _mm_aesenc_si128(d2,k1);
- d3 = _mm_aesenc_si128(d3,k1);
- __m128i ka = k[6];
- d0 = _mm_aesenc_si128(d0,k2);
- d1 = _mm_aesenc_si128(d1,k2);
- d2 = _mm_aesenc_si128(d2,k2);
- d3 = _mm_aesenc_si128(d3,k2);
- __m128i kb = k[7];
- d0 = _mm_aesenc_si128(d0,k3);
- d1 = _mm_aesenc_si128(d1,k3);
- d2 = _mm_aesenc_si128(d2,k3);
- d3 = _mm_aesenc_si128(d3,k3);
- __m128i kc = k[8];
- d0 = _mm_aesenc_si128(d0,k4);
- d1 = _mm_aesenc_si128(d1,k4);
- d2 = _mm_aesenc_si128(d2,k4);
- d3 = _mm_aesenc_si128(d3,k4);
- __m128i kd = k[9];
- d0 = _mm_aesenc_si128(d0,k5);
- d1 = _mm_aesenc_si128(d1,k5);
- d2 = _mm_aesenc_si128(d2,k5);
- d3 = _mm_aesenc_si128(d3,k5);
- __m128i ke = k[10];
- d0 = _mm_aesenc_si128(d0,ka);
- d1 = _mm_aesenc_si128(d1,ka);
- d2 = _mm_aesenc_si128(d2,ka);
- d3 = _mm_aesenc_si128(d3,ka);
- __m128i kf = k[11];
- d0 = _mm_aesenc_si128(d0,kb);
- d1 = _mm_aesenc_si128(d1,kb);
- d2 = _mm_aesenc_si128(d2,kb);
- d3 = _mm_aesenc_si128(d3,kb);
- ka = k[12];
- d0 = _mm_aesenc_si128(d0,kc);
- d1 = _mm_aesenc_si128(d1,kc);
- d2 = _mm_aesenc_si128(d2,kc);
- d3 = _mm_aesenc_si128(d3,kc);
- kb = k[13];
- d0 = _mm_aesenc_si128(d0,kd);
- d1 = _mm_aesenc_si128(d1,kd);
- d2 = _mm_aesenc_si128(d2,kd);
- d3 = _mm_aesenc_si128(d3,kd);
- kc = k[14];
- d0 = _mm_aesenc_si128(d0,ke);
- d1 = _mm_aesenc_si128(d1,ke);
- d2 = _mm_aesenc_si128(d2,ke);
- d3 = _mm_aesenc_si128(d3,ke);
- kd = _mm_loadu_si128(reinterpret_cast<const __m128i *>(in));
- d0 = _mm_aesenc_si128(d0,kf);
- d1 = _mm_aesenc_si128(d1,kf);
- d2 = _mm_aesenc_si128(d2,kf);
- d3 = _mm_aesenc_si128(d3,kf);
- ke = _mm_loadu_si128(reinterpret_cast<const __m128i *>(in + 16));
- d0 = _mm_aesenc_si128(d0,ka);
- d1 = _mm_aesenc_si128(d1,ka);
- d2 = _mm_aesenc_si128(d2,ka);
- d3 = _mm_aesenc_si128(d3,ka);
- kf = _mm_loadu_si128(reinterpret_cast<const __m128i *>(in + 32));
- d0 = _mm_aesenc_si128(d0,kb);
- d1 = _mm_aesenc_si128(d1,kb);
- d2 = _mm_aesenc_si128(d2,kb);
- d3 = _mm_aesenc_si128(d3,kb);
- ka = _mm_loadu_si128(reinterpret_cast<const __m128i *>(in + 48));
- d0 = _mm_aesenclast_si128(d0,kc);
- d1 = _mm_aesenclast_si128(d1,kc);
- d2 = _mm_aesenclast_si128(d2,kc);
- d3 = _mm_aesenclast_si128(d3,kc);
- kd = _mm_xor_si128(d0,kd);
- ke = _mm_xor_si128(d1,ke);
- kf = _mm_xor_si128(d2,kf);
- ka = _mm_xor_si128(d3,ka);
- _mm_storeu_si128(reinterpret_cast<__m128i *>(out),kd);
- _mm_storeu_si128(reinterpret_cast<__m128i *>(out + 16),ke);
- _mm_storeu_si128(reinterpret_cast<__m128i *>(out + 32),kf);
- _mm_storeu_si128(reinterpret_cast<__m128i *>(out + 48),ka);
-
- in += 64;
- len -= 64;
- out += 64;
- }
-
if (len >= 16) {
- const __m128i k7 = k[7];
- const __m128i k8 = k[8];
- const __m128i k9 = k[9];
- const __m128i k10 = k[10];
- const __m128i k11 = k[11];
- const __m128i k12 = k[12];
- const __m128i k13 = k[13];
- const __m128i k14 = k[14];
do {
__m128i d0 = _mm_set_epi64x((long long)Utils::hton(c1++),(long long)c0);
- d0 = _mm_xor_si128(d0,k0);
- d0 = _mm_aesenc_si128(d0,k1);
- d0 = _mm_aesenc_si128(d0,k2);
- d0 = _mm_aesenc_si128(d0,k3);
- d0 = _mm_aesenc_si128(d0,k4);
- d0 = _mm_aesenc_si128(d0,k5);
+ d0 = _mm_xor_si128(d0,k[0]);
+ d0 = _mm_aesenc_si128(d0,k[1]);
+ d0 = _mm_aesenc_si128(d0,k[2]);
+ d0 = _mm_aesenc_si128(d0,k[3]);
+ d0 = _mm_aesenc_si128(d0,k[4]);
+ d0 = _mm_aesenc_si128(d0,k[5]);
d0 = _mm_aesenc_si128(d0,k[6]);
- d0 = _mm_aesenc_si128(d0,k7);
- d0 = _mm_aesenc_si128(d0,k8);
- d0 = _mm_aesenc_si128(d0,k9);
- d0 = _mm_aesenc_si128(d0,k10);
- d0 = _mm_aesenc_si128(d0,k11);
- d0 = _mm_aesenc_si128(d0,k12);
- d0 = _mm_aesenc_si128(d0,k13);
- d0 = _mm_aesenclast_si128(d0,k14);
+ d0 = _mm_aesenc_si128(d0,k[7]);
+ d0 = _mm_aesenc_si128(d0,k[8]);
+ d0 = _mm_aesenc_si128(d0,k[9]);
+ d0 = _mm_aesenc_si128(d0,k[10]);
+ d0 = _mm_aesenc_si128(d0,k[11]);
+ d0 = _mm_aesenc_si128(d0,k[12]);
+ d0 = _mm_aesenc_si128(d0,k[13]);
+ d0 = _mm_aesenclast_si128(d0,k[14]);
_mm_storeu_si128(reinterpret_cast<__m128i *>(out),_mm_xor_si128(d0,_mm_loadu_si128(reinterpret_cast<const __m128i *>(in))));
in += 16;
len -= 16;