|
@@ -508,6 +508,7 @@ void AES::CTR::crypt(const void *const input,unsigned int len) noexcept
|
|
|
out[totalLen++] = *(in++);
|
|
|
if (!(totalLen & 15U)) {
|
|
|
__m128i d0 = _mm_set_epi64x((long long)Utils::hton(c1),(long long)c0);
|
|
|
+ if (unlikely(++c1 == 0ULL)) c0 = Utils::hton(Utils::ntoh(c0) + 1ULL);
|
|
|
d0 = _mm_xor_si128(d0,k0);
|
|
|
d0 = _mm_aesenc_si128(d0,k1);
|
|
|
d0 = _mm_aesenc_si128(d0,k2);
|
|
@@ -526,7 +527,6 @@ void AES::CTR::crypt(const void *const input,unsigned int len) noexcept
|
|
|
d0 = _mm_aesenc_si128(d0,k13);
|
|
|
d0 = _mm_aesenclast_si128(d0,k14);
|
|
|
_mm_storeu_si128(outblk,_mm_xor_si128(p0,d0));
|
|
|
- if (unlikely(++c1 == 0ULL)) c0 = Utils::hton(Utils::ntoh(c0) + 1ULL);
|
|
|
break;
|
|
|
}
|
|
|
}
|
|
@@ -535,147 +535,265 @@ void AES::CTR::crypt(const void *const input,unsigned int len) noexcept
|
|
|
out += totalLen;
|
|
|
_len = (totalLen + len);
|
|
|
|
|
|
- while (len >= 64) {
|
|
|
- __m128i d0,d1,d2,d3;
|
|
|
- if (likely(c1 < 0xfffffffffffffffcULL)) {
|
|
|
- d0 = _mm_set_epi64x((long long)Utils::hton(c1),(long long)c0);
|
|
|
- d1 = _mm_set_epi64x((long long)Utils::hton(c1 + 1ULL),(long long)c0);
|
|
|
- d2 = _mm_set_epi64x((long long)Utils::hton(c1 + 2ULL),(long long)c0);
|
|
|
- d3 = _mm_set_epi64x((long long)Utils::hton(c1 + 3ULL),(long long)c0);
|
|
|
+ if (likely((c1 + len) > c1)) { // it's incredibly likely that we can ignore carry in counter increment
|
|
|
+ while (len >= 64) {
|
|
|
+ __m128i d0 = _mm_set_epi64x((long long)Utils::hton(c1),(long long)c0);
|
|
|
+ __m128i d1 = _mm_set_epi64x((long long)Utils::hton(c1 + 1ULL),(long long)c0);
|
|
|
+ __m128i d2 = _mm_set_epi64x((long long)Utils::hton(c1 + 2ULL),(long long)c0);
|
|
|
+ __m128i d3 = _mm_set_epi64x((long long)Utils::hton(c1 + 3ULL),(long long)c0);
|
|
|
c1 += 4;
|
|
|
- } else {
|
|
|
- d0 = _mm_set_epi64x((long long)Utils::hton(c1),(long long)c0);
|
|
|
- if (unlikely(++c1 == 0ULL)) c0 = Utils::hton(Utils::ntoh(c0) + 1ULL);
|
|
|
- d1 = _mm_set_epi64x((long long)Utils::hton(c1),(long long)c0);
|
|
|
- if (unlikely(++c1 == 0ULL)) c0 = Utils::hton(Utils::ntoh(c0) + 1ULL);
|
|
|
- d2 = _mm_set_epi64x((long long)Utils::hton(c1),(long long)c0);
|
|
|
- if (unlikely(++c1 == 0ULL)) c0 = Utils::hton(Utils::ntoh(c0) + 1ULL);
|
|
|
- d3 = _mm_set_epi64x((long long)Utils::hton(c1),(long long)c0);
|
|
|
- if (unlikely(++c1 == 0ULL)) c0 = Utils::hton(Utils::ntoh(c0) + 1ULL);
|
|
|
- }
|
|
|
|
|
|
- d0 = _mm_xor_si128(d0,k0);
|
|
|
- d1 = _mm_xor_si128(d1,k0);
|
|
|
- d2 = _mm_xor_si128(d2,k0);
|
|
|
- d3 = _mm_xor_si128(d3,k0);
|
|
|
- d0 = _mm_aesenc_si128(d0,k1);
|
|
|
- d1 = _mm_aesenc_si128(d1,k1);
|
|
|
- d2 = _mm_aesenc_si128(d2,k1);
|
|
|
- d3 = _mm_aesenc_si128(d3,k1);
|
|
|
- __m128i ka = k[6];
|
|
|
- d0 = _mm_aesenc_si128(d0,k2);
|
|
|
- d1 = _mm_aesenc_si128(d1,k2);
|
|
|
- d2 = _mm_aesenc_si128(d2,k2);
|
|
|
- d3 = _mm_aesenc_si128(d3,k2);
|
|
|
- __m128i kb = k[7];
|
|
|
- d0 = _mm_aesenc_si128(d0,k3);
|
|
|
- d1 = _mm_aesenc_si128(d1,k3);
|
|
|
- d2 = _mm_aesenc_si128(d2,k3);
|
|
|
- d3 = _mm_aesenc_si128(d3,k3);
|
|
|
- __m128i kc = k[8];
|
|
|
- d0 = _mm_aesenc_si128(d0,k4);
|
|
|
- d1 = _mm_aesenc_si128(d1,k4);
|
|
|
- d2 = _mm_aesenc_si128(d2,k4);
|
|
|
- d3 = _mm_aesenc_si128(d3,k4);
|
|
|
- __m128i kd = k[9];
|
|
|
- d0 = _mm_aesenc_si128(d0,k5);
|
|
|
- d1 = _mm_aesenc_si128(d1,k5);
|
|
|
- d2 = _mm_aesenc_si128(d2,k5);
|
|
|
- d3 = _mm_aesenc_si128(d3,k5);
|
|
|
- __m128i ke = k[10];
|
|
|
- d0 = _mm_aesenc_si128(d0,ka);
|
|
|
- d1 = _mm_aesenc_si128(d1,ka);
|
|
|
- d2 = _mm_aesenc_si128(d2,ka);
|
|
|
- d3 = _mm_aesenc_si128(d3,ka);
|
|
|
- __m128i kf = k[11];
|
|
|
- d0 = _mm_aesenc_si128(d0,kb);
|
|
|
- d1 = _mm_aesenc_si128(d1,kb);
|
|
|
- d2 = _mm_aesenc_si128(d2,kb);
|
|
|
- d3 = _mm_aesenc_si128(d3,kb);
|
|
|
- ka = k[12];
|
|
|
- d0 = _mm_aesenc_si128(d0,kc);
|
|
|
- d1 = _mm_aesenc_si128(d1,kc);
|
|
|
- d2 = _mm_aesenc_si128(d2,kc);
|
|
|
- d3 = _mm_aesenc_si128(d3,kc);
|
|
|
- kb = k[13];
|
|
|
- d0 = _mm_aesenc_si128(d0,kd);
|
|
|
- d1 = _mm_aesenc_si128(d1,kd);
|
|
|
- d2 = _mm_aesenc_si128(d2,kd);
|
|
|
- d3 = _mm_aesenc_si128(d3,kd);
|
|
|
- kc = k[14];
|
|
|
- d0 = _mm_aesenc_si128(d0,ke);
|
|
|
- d1 = _mm_aesenc_si128(d1,ke);
|
|
|
- d2 = _mm_aesenc_si128(d2,ke);
|
|
|
- d3 = _mm_aesenc_si128(d3,ke);
|
|
|
- kd = _mm_loadu_si128(reinterpret_cast<const __m128i *>(in));
|
|
|
- d0 = _mm_aesenc_si128(d0,kf);
|
|
|
- d1 = _mm_aesenc_si128(d1,kf);
|
|
|
- d2 = _mm_aesenc_si128(d2,kf);
|
|
|
- d3 = _mm_aesenc_si128(d3,kf);
|
|
|
- ke = _mm_loadu_si128(reinterpret_cast<const __m128i *>(in + 16));
|
|
|
- d0 = _mm_aesenc_si128(d0,ka);
|
|
|
- d1 = _mm_aesenc_si128(d1,ka);
|
|
|
- d2 = _mm_aesenc_si128(d2,ka);
|
|
|
- d3 = _mm_aesenc_si128(d3,ka);
|
|
|
- kf = _mm_loadu_si128(reinterpret_cast<const __m128i *>(in + 32));
|
|
|
- d0 = _mm_aesenc_si128(d0,kb);
|
|
|
- d1 = _mm_aesenc_si128(d1,kb);
|
|
|
- d2 = _mm_aesenc_si128(d2,kb);
|
|
|
- d3 = _mm_aesenc_si128(d3,kb);
|
|
|
- ka = _mm_loadu_si128(reinterpret_cast<const __m128i *>(in + 48));
|
|
|
- d0 = _mm_aesenclast_si128(d0,kc);
|
|
|
- d1 = _mm_aesenclast_si128(d1,kc);
|
|
|
- d2 = _mm_aesenclast_si128(d2,kc);
|
|
|
- d3 = _mm_aesenclast_si128(d3,kc);
|
|
|
- kd = _mm_xor_si128(d0,kd);
|
|
|
- ke = _mm_xor_si128(d1,ke);
|
|
|
- kf = _mm_xor_si128(d2,kf);
|
|
|
- ka = _mm_xor_si128(d3,ka);
|
|
|
- _mm_storeu_si128(reinterpret_cast<__m128i *>(out),kd);
|
|
|
- _mm_storeu_si128(reinterpret_cast<__m128i *>(out + 16),ke);
|
|
|
- _mm_storeu_si128(reinterpret_cast<__m128i *>(out + 32),kf);
|
|
|
- _mm_storeu_si128(reinterpret_cast<__m128i *>(out + 48),ka);
|
|
|
+ d0 = _mm_xor_si128(d0,k0);
|
|
|
+ d1 = _mm_xor_si128(d1,k0);
|
|
|
+ d2 = _mm_xor_si128(d2,k0);
|
|
|
+ d3 = _mm_xor_si128(d3,k0);
|
|
|
+ d0 = _mm_aesenc_si128(d0,k1);
|
|
|
+ d1 = _mm_aesenc_si128(d1,k1);
|
|
|
+ d2 = _mm_aesenc_si128(d2,k1);
|
|
|
+ d3 = _mm_aesenc_si128(d3,k1);
|
|
|
+ __m128i ka = k[6];
|
|
|
+ d0 = _mm_aesenc_si128(d0,k2);
|
|
|
+ d1 = _mm_aesenc_si128(d1,k2);
|
|
|
+ d2 = _mm_aesenc_si128(d2,k2);
|
|
|
+ d3 = _mm_aesenc_si128(d3,k2);
|
|
|
+ __m128i kb = k[7];
|
|
|
+ d0 = _mm_aesenc_si128(d0,k3);
|
|
|
+ d1 = _mm_aesenc_si128(d1,k3);
|
|
|
+ d2 = _mm_aesenc_si128(d2,k3);
|
|
|
+ d3 = _mm_aesenc_si128(d3,k3);
|
|
|
+ __m128i kc = k[8];
|
|
|
+ d0 = _mm_aesenc_si128(d0,k4);
|
|
|
+ d1 = _mm_aesenc_si128(d1,k4);
|
|
|
+ d2 = _mm_aesenc_si128(d2,k4);
|
|
|
+ d3 = _mm_aesenc_si128(d3,k4);
|
|
|
+ __m128i kd = k[9];
|
|
|
+ d0 = _mm_aesenc_si128(d0,k5);
|
|
|
+ d1 = _mm_aesenc_si128(d1,k5);
|
|
|
+ d2 = _mm_aesenc_si128(d2,k5);
|
|
|
+ d3 = _mm_aesenc_si128(d3,k5);
|
|
|
+ __m128i ke = k[10];
|
|
|
+ d0 = _mm_aesenc_si128(d0,ka);
|
|
|
+ d1 = _mm_aesenc_si128(d1,ka);
|
|
|
+ d2 = _mm_aesenc_si128(d2,ka);
|
|
|
+ d3 = _mm_aesenc_si128(d3,ka);
|
|
|
+ __m128i kf = k[11];
|
|
|
+ d0 = _mm_aesenc_si128(d0,kb);
|
|
|
+ d1 = _mm_aesenc_si128(d1,kb);
|
|
|
+ d2 = _mm_aesenc_si128(d2,kb);
|
|
|
+ d3 = _mm_aesenc_si128(d3,kb);
|
|
|
+ ka = k[12];
|
|
|
+ d0 = _mm_aesenc_si128(d0,kc);
|
|
|
+ d1 = _mm_aesenc_si128(d1,kc);
|
|
|
+ d2 = _mm_aesenc_si128(d2,kc);
|
|
|
+ d3 = _mm_aesenc_si128(d3,kc);
|
|
|
+ kb = k[13];
|
|
|
+ d0 = _mm_aesenc_si128(d0,kd);
|
|
|
+ d1 = _mm_aesenc_si128(d1,kd);
|
|
|
+ d2 = _mm_aesenc_si128(d2,kd);
|
|
|
+ d3 = _mm_aesenc_si128(d3,kd);
|
|
|
+ kc = k[14];
|
|
|
+ d0 = _mm_aesenc_si128(d0,ke);
|
|
|
+ d1 = _mm_aesenc_si128(d1,ke);
|
|
|
+ d2 = _mm_aesenc_si128(d2,ke);
|
|
|
+ d3 = _mm_aesenc_si128(d3,ke);
|
|
|
+ kd = _mm_loadu_si128(reinterpret_cast<const __m128i *>(in));
|
|
|
+ d0 = _mm_aesenc_si128(d0,kf);
|
|
|
+ d1 = _mm_aesenc_si128(d1,kf);
|
|
|
+ d2 = _mm_aesenc_si128(d2,kf);
|
|
|
+ d3 = _mm_aesenc_si128(d3,kf);
|
|
|
+ ke = _mm_loadu_si128(reinterpret_cast<const __m128i *>(in + 16));
|
|
|
+ d0 = _mm_aesenc_si128(d0,ka);
|
|
|
+ d1 = _mm_aesenc_si128(d1,ka);
|
|
|
+ d2 = _mm_aesenc_si128(d2,ka);
|
|
|
+ d3 = _mm_aesenc_si128(d3,ka);
|
|
|
+ kf = _mm_loadu_si128(reinterpret_cast<const __m128i *>(in + 32));
|
|
|
+ d0 = _mm_aesenc_si128(d0,kb);
|
|
|
+ d1 = _mm_aesenc_si128(d1,kb);
|
|
|
+ d2 = _mm_aesenc_si128(d2,kb);
|
|
|
+ d3 = _mm_aesenc_si128(d3,kb);
|
|
|
+ ka = _mm_loadu_si128(reinterpret_cast<const __m128i *>(in + 48));
|
|
|
+ d0 = _mm_aesenclast_si128(d0,kc);
|
|
|
+ d1 = _mm_aesenclast_si128(d1,kc);
|
|
|
+ d2 = _mm_aesenclast_si128(d2,kc);
|
|
|
+ d3 = _mm_aesenclast_si128(d3,kc);
|
|
|
+ kd = _mm_xor_si128(d0,kd);
|
|
|
+ ke = _mm_xor_si128(d1,ke);
|
|
|
+ kf = _mm_xor_si128(d2,kf);
|
|
|
+ ka = _mm_xor_si128(d3,ka);
|
|
|
+ _mm_storeu_si128(reinterpret_cast<__m128i *>(out),kd);
|
|
|
+ _mm_storeu_si128(reinterpret_cast<__m128i *>(out + 16),ke);
|
|
|
+ _mm_storeu_si128(reinterpret_cast<__m128i *>(out + 32),kf);
|
|
|
+ _mm_storeu_si128(reinterpret_cast<__m128i *>(out + 48),ka);
|
|
|
+
|
|
|
+ in += 64;
|
|
|
+ len -= 64;
|
|
|
+ out += 64;
|
|
|
+ }
|
|
|
|
|
|
- in += 64;
|
|
|
- len -= 64;
|
|
|
- out += 64;
|
|
|
- }
|
|
|
+ if (len >= 16) {
|
|
|
+ const __m128i k7 = k[7];
|
|
|
+ const __m128i k8 = k[8];
|
|
|
+ const __m128i k9 = k[9];
|
|
|
+ const __m128i k10 = k[10];
|
|
|
+ const __m128i k11 = k[11];
|
|
|
+ const __m128i k12 = k[12];
|
|
|
+ const __m128i k13 = k[13];
|
|
|
+ const __m128i k14 = k[14];
|
|
|
+ do {
|
|
|
+ __m128i d0 = _mm_set_epi64x((long long)Utils::hton(c1++),(long long)c0);
|
|
|
+ d0 = _mm_xor_si128(d0,k0);
|
|
|
+ d0 = _mm_aesenc_si128(d0,k1);
|
|
|
+ d0 = _mm_aesenc_si128(d0,k2);
|
|
|
+ d0 = _mm_aesenc_si128(d0,k3);
|
|
|
+ d0 = _mm_aesenc_si128(d0,k4);
|
|
|
+ d0 = _mm_aesenc_si128(d0,k5);
|
|
|
+ d0 = _mm_aesenc_si128(d0,k[6]);
|
|
|
+ d0 = _mm_aesenc_si128(d0,k7);
|
|
|
+ d0 = _mm_aesenc_si128(d0,k8);
|
|
|
+ d0 = _mm_aesenc_si128(d0,k9);
|
|
|
+ d0 = _mm_aesenc_si128(d0,k10);
|
|
|
+ d0 = _mm_aesenc_si128(d0,k11);
|
|
|
+ d0 = _mm_aesenc_si128(d0,k12);
|
|
|
+ d0 = _mm_aesenc_si128(d0,k13);
|
|
|
+ d0 = _mm_aesenclast_si128(d0,k14);
|
|
|
+ _mm_storeu_si128(reinterpret_cast<__m128i *>(out),_mm_xor_si128(d0,_mm_loadu_si128(reinterpret_cast<const __m128i *>(in))));
|
|
|
|
|
|
- {
|
|
|
- const __m128i k7 = k[7];
|
|
|
- const __m128i k8 = k[8];
|
|
|
- const __m128i k9 = k[9];
|
|
|
- const __m128i k10 = k[10];
|
|
|
- const __m128i k11 = k[11];
|
|
|
- const __m128i k12 = k[12];
|
|
|
- const __m128i k13 = k[13];
|
|
|
- const __m128i k14 = k[14];
|
|
|
- while (len >= 16) {
|
|
|
+ in += 16;
|
|
|
+ len -= 16;
|
|
|
+ out += 16;
|
|
|
+ } while (len >= 16);
|
|
|
+ }
|
|
|
+ } else {
|
|
|
+ while (len >= 64) {
|
|
|
__m128i d0 = _mm_set_epi64x((long long)Utils::hton(c1++),(long long)c0);
|
|
|
- if (unlikely(c1 == 0)) {
|
|
|
- c0 = Utils::hton(Utils::ntoh(c0) + 1ULL);
|
|
|
- d0 = _mm_set_epi64x((long long)Utils::hton(c1),(long long)c0);
|
|
|
- }
|
|
|
+ if (unlikely(c1 == 0ULL)) c0 = Utils::hton(Utils::ntoh(c0) + 1ULL);
|
|
|
+ __m128i d1 = _mm_set_epi64x((long long)Utils::hton(c1++),(long long)c0);
|
|
|
+ if (unlikely(c1 == 0ULL)) c0 = Utils::hton(Utils::ntoh(c0) + 1ULL);
|
|
|
+ __m128i d2 = _mm_set_epi64x((long long)Utils::hton(c1++),(long long)c0);
|
|
|
+ if (unlikely(c1 == 0ULL)) c0 = Utils::hton(Utils::ntoh(c0) + 1ULL);
|
|
|
+ __m128i d3 = _mm_set_epi64x((long long)Utils::hton(c1++),(long long)c0);
|
|
|
+ if (unlikely(c1 == 0ULL)) c0 = Utils::hton(Utils::ntoh(c0) + 1ULL);
|
|
|
+
|
|
|
d0 = _mm_xor_si128(d0,k0);
|
|
|
+ d1 = _mm_xor_si128(d1,k0);
|
|
|
+ d2 = _mm_xor_si128(d2,k0);
|
|
|
+ d3 = _mm_xor_si128(d3,k0);
|
|
|
d0 = _mm_aesenc_si128(d0,k1);
|
|
|
+ d1 = _mm_aesenc_si128(d1,k1);
|
|
|
+ d2 = _mm_aesenc_si128(d2,k1);
|
|
|
+ d3 = _mm_aesenc_si128(d3,k1);
|
|
|
+ __m128i ka = k[6];
|
|
|
d0 = _mm_aesenc_si128(d0,k2);
|
|
|
+ d1 = _mm_aesenc_si128(d1,k2);
|
|
|
+ d2 = _mm_aesenc_si128(d2,k2);
|
|
|
+ d3 = _mm_aesenc_si128(d3,k2);
|
|
|
+ __m128i kb = k[7];
|
|
|
d0 = _mm_aesenc_si128(d0,k3);
|
|
|
+ d1 = _mm_aesenc_si128(d1,k3);
|
|
|
+ d2 = _mm_aesenc_si128(d2,k3);
|
|
|
+ d3 = _mm_aesenc_si128(d3,k3);
|
|
|
+ __m128i kc = k[8];
|
|
|
d0 = _mm_aesenc_si128(d0,k4);
|
|
|
+ d1 = _mm_aesenc_si128(d1,k4);
|
|
|
+ d2 = _mm_aesenc_si128(d2,k4);
|
|
|
+ d3 = _mm_aesenc_si128(d3,k4);
|
|
|
+ __m128i kd = k[9];
|
|
|
d0 = _mm_aesenc_si128(d0,k5);
|
|
|
- d0 = _mm_aesenc_si128(d0,k[6]);
|
|
|
- d0 = _mm_aesenc_si128(d0,k7);
|
|
|
- d0 = _mm_aesenc_si128(d0,k8);
|
|
|
- d0 = _mm_aesenc_si128(d0,k9);
|
|
|
- d0 = _mm_aesenc_si128(d0,k10);
|
|
|
- d0 = _mm_aesenc_si128(d0,k11);
|
|
|
- d0 = _mm_aesenc_si128(d0,k12);
|
|
|
- d0 = _mm_aesenc_si128(d0,k13);
|
|
|
- d0 = _mm_aesenclast_si128(d0,k14);
|
|
|
- _mm_storeu_si128(reinterpret_cast<__m128i *>(out),_mm_xor_si128(d0,_mm_loadu_si128(reinterpret_cast<const __m128i *>(in))));
|
|
|
-
|
|
|
- in += 16;
|
|
|
- len -= 16;
|
|
|
- out += 16;
|
|
|
+ d1 = _mm_aesenc_si128(d1,k5);
|
|
|
+ d2 = _mm_aesenc_si128(d2,k5);
|
|
|
+ d3 = _mm_aesenc_si128(d3,k5);
|
|
|
+ __m128i ke = k[10];
|
|
|
+ d0 = _mm_aesenc_si128(d0,ka);
|
|
|
+ d1 = _mm_aesenc_si128(d1,ka);
|
|
|
+ d2 = _mm_aesenc_si128(d2,ka);
|
|
|
+ d3 = _mm_aesenc_si128(d3,ka);
|
|
|
+ __m128i kf = k[11];
|
|
|
+ d0 = _mm_aesenc_si128(d0,kb);
|
|
|
+ d1 = _mm_aesenc_si128(d1,kb);
|
|
|
+ d2 = _mm_aesenc_si128(d2,kb);
|
|
|
+ d3 = _mm_aesenc_si128(d3,kb);
|
|
|
+ ka = k[12];
|
|
|
+ d0 = _mm_aesenc_si128(d0,kc);
|
|
|
+ d1 = _mm_aesenc_si128(d1,kc);
|
|
|
+ d2 = _mm_aesenc_si128(d2,kc);
|
|
|
+ d3 = _mm_aesenc_si128(d3,kc);
|
|
|
+ kb = k[13];
|
|
|
+ d0 = _mm_aesenc_si128(d0,kd);
|
|
|
+ d1 = _mm_aesenc_si128(d1,kd);
|
|
|
+ d2 = _mm_aesenc_si128(d2,kd);
|
|
|
+ d3 = _mm_aesenc_si128(d3,kd);
|
|
|
+ kc = k[14];
|
|
|
+ d0 = _mm_aesenc_si128(d0,ke);
|
|
|
+ d1 = _mm_aesenc_si128(d1,ke);
|
|
|
+ d2 = _mm_aesenc_si128(d2,ke);
|
|
|
+ d3 = _mm_aesenc_si128(d3,ke);
|
|
|
+ kd = _mm_loadu_si128(reinterpret_cast<const __m128i *>(in));
|
|
|
+ d0 = _mm_aesenc_si128(d0,kf);
|
|
|
+ d1 = _mm_aesenc_si128(d1,kf);
|
|
|
+ d2 = _mm_aesenc_si128(d2,kf);
|
|
|
+ d3 = _mm_aesenc_si128(d3,kf);
|
|
|
+ ke = _mm_loadu_si128(reinterpret_cast<const __m128i *>(in + 16));
|
|
|
+ d0 = _mm_aesenc_si128(d0,ka);
|
|
|
+ d1 = _mm_aesenc_si128(d1,ka);
|
|
|
+ d2 = _mm_aesenc_si128(d2,ka);
|
|
|
+ d3 = _mm_aesenc_si128(d3,ka);
|
|
|
+ kf = _mm_loadu_si128(reinterpret_cast<const __m128i *>(in + 32));
|
|
|
+ d0 = _mm_aesenc_si128(d0,kb);
|
|
|
+ d1 = _mm_aesenc_si128(d1,kb);
|
|
|
+ d2 = _mm_aesenc_si128(d2,kb);
|
|
|
+ d3 = _mm_aesenc_si128(d3,kb);
|
|
|
+ ka = _mm_loadu_si128(reinterpret_cast<const __m128i *>(in + 48));
|
|
|
+ d0 = _mm_aesenclast_si128(d0,kc);
|
|
|
+ d1 = _mm_aesenclast_si128(d1,kc);
|
|
|
+ d2 = _mm_aesenclast_si128(d2,kc);
|
|
|
+ d3 = _mm_aesenclast_si128(d3,kc);
|
|
|
+ kd = _mm_xor_si128(d0,kd);
|
|
|
+ ke = _mm_xor_si128(d1,ke);
|
|
|
+ kf = _mm_xor_si128(d2,kf);
|
|
|
+ ka = _mm_xor_si128(d3,ka);
|
|
|
+ _mm_storeu_si128(reinterpret_cast<__m128i *>(out),kd);
|
|
|
+ _mm_storeu_si128(reinterpret_cast<__m128i *>(out + 16),ke);
|
|
|
+ _mm_storeu_si128(reinterpret_cast<__m128i *>(out + 32),kf);
|
|
|
+ _mm_storeu_si128(reinterpret_cast<__m128i *>(out + 48),ka);
|
|
|
+
|
|
|
+ in += 64;
|
|
|
+ len -= 64;
|
|
|
+ out += 64;
|
|
|
+ }
|
|
|
+
|
|
|
+ if (len >= 16) {
|
|
|
+ const __m128i k7 = k[7];
|
|
|
+ const __m128i k8 = k[8];
|
|
|
+ const __m128i k9 = k[9];
|
|
|
+ const __m128i k10 = k[10];
|
|
|
+ const __m128i k11 = k[11];
|
|
|
+ const __m128i k12 = k[12];
|
|
|
+ const __m128i k13 = k[13];
|
|
|
+ const __m128i k14 = k[14];
|
|
|
+ do {
|
|
|
+ __m128i d0 = _mm_set_epi64x((long long)Utils::hton(c1++),(long long)c0);
|
|
|
+ if (unlikely(c1 == 0ULL)) c0 = Utils::hton(Utils::ntoh(c0) + 1ULL);
|
|
|
+ d0 = _mm_xor_si128(d0,k0);
|
|
|
+ d0 = _mm_aesenc_si128(d0,k1);
|
|
|
+ d0 = _mm_aesenc_si128(d0,k2);
|
|
|
+ d0 = _mm_aesenc_si128(d0,k3);
|
|
|
+ d0 = _mm_aesenc_si128(d0,k4);
|
|
|
+ d0 = _mm_aesenc_si128(d0,k5);
|
|
|
+ d0 = _mm_aesenc_si128(d0,k[6]);
|
|
|
+ d0 = _mm_aesenc_si128(d0,k7);
|
|
|
+ d0 = _mm_aesenc_si128(d0,k8);
|
|
|
+ d0 = _mm_aesenc_si128(d0,k9);
|
|
|
+ d0 = _mm_aesenc_si128(d0,k10);
|
|
|
+ d0 = _mm_aesenc_si128(d0,k11);
|
|
|
+ d0 = _mm_aesenc_si128(d0,k12);
|
|
|
+ d0 = _mm_aesenc_si128(d0,k13);
|
|
|
+ d0 = _mm_aesenclast_si128(d0,k14);
|
|
|
+ _mm_storeu_si128(reinterpret_cast<__m128i *>(out),_mm_xor_si128(d0,_mm_loadu_si128(reinterpret_cast<const __m128i *>(in))));
|
|
|
+
|
|
|
+ in += 16;
|
|
|
+ len -= 16;
|
|
|
+ out += 16;
|
|
|
+ } while (len >= 16);
|
|
|
}
|
|
|
}
|
|
|
|