@@ -1159,50 +1159,50 @@ static ZT_ALWAYS_INLINE __m128i _init256_2_aesni(__m128i a,__m128i b) noexcept
 void AES::_init_aesni(const uint8_t key[32]) noexcept
 {
-	__m128i t1,t2;
+	__m128i t1,t2,k1,k2,k3,k4,k5,k6,k7,k8,k9,k10,k11,k12,k13;
 	_k.ni.k[0] = t1 = _mm_loadu_si128((const __m128i *)key);
-	_k.ni.k[1] = t2 = _mm_loadu_si128((const __m128i *)(key+16));
-	_k.ni.k[2] = t1 = _init256_1_aesni(t1,_mm_aeskeygenassist_si128(t2,0x01));
-	_k.ni.k[3] = t2 = _init256_2_aesni(t1,t2);
-	_k.ni.k[4] = t1 = _init256_1_aesni(t1,_mm_aeskeygenassist_si128(t2,0x02));
-	_k.ni.k[5] = t2 = _init256_2_aesni(t1,t2);
-	_k.ni.k[6] = t1 = _init256_1_aesni(t1,_mm_aeskeygenassist_si128(t2,0x04));
-	_k.ni.k[7] = t2 = _init256_2_aesni(t1,t2);
-	_k.ni.k[8] = t1 = _init256_1_aesni(t1,_mm_aeskeygenassist_si128(t2,0x08));
-	_k.ni.k[9] = t2 = _init256_2_aesni(t1,t2);
-	_k.ni.k[10] = t1 = _init256_1_aesni(t1,_mm_aeskeygenassist_si128(t2,0x10));
-	_k.ni.k[11] = t2 = _init256_2_aesni(t1,t2);
-	_k.ni.k[12] = t1 = _init256_1_aesni(t1,_mm_aeskeygenassist_si128(t2,0x20));
-	_k.ni.k[13] = t2 = _init256_2_aesni(t1,t2);
+	_k.ni.k[1] = k1 = t2 = _mm_loadu_si128((const __m128i *)(key + 16));
+	_k.ni.k[2] = k2 = t1 = _init256_1_aesni(t1,_mm_aeskeygenassist_si128(t2,0x01));
+	_k.ni.k[3] = k3 = t2 = _init256_2_aesni(t1,t2);
+	_k.ni.k[4] = k4 = t1 = _init256_1_aesni(t1,_mm_aeskeygenassist_si128(t2,0x02));
+	_k.ni.k[5] = k5 = t2 = _init256_2_aesni(t1,t2);
+	_k.ni.k[6] = k6 = t1 = _init256_1_aesni(t1,_mm_aeskeygenassist_si128(t2,0x04));
+	_k.ni.k[7] = k7 = t2 = _init256_2_aesni(t1,t2);
+	_k.ni.k[8] = k8 = t1 = _init256_1_aesni(t1,_mm_aeskeygenassist_si128(t2,0x08));
+	_k.ni.k[9] = k9 = t2 = _init256_2_aesni(t1,t2);
+	_k.ni.k[10] = k10 = t1 = _init256_1_aesni(t1,_mm_aeskeygenassist_si128(t2,0x10));
+	_k.ni.k[11] = k11 = t2 = _init256_2_aesni(t1,t2);
+	_k.ni.k[12] = k12 = t1 = _init256_1_aesni(t1,_mm_aeskeygenassist_si128(t2,0x20));
+	_k.ni.k[13] = k13 = t2 = _init256_2_aesni(t1,t2);
 	_k.ni.k[14] = _init256_1_aesni(t1,_mm_aeskeygenassist_si128(t2,0x40));
-	_k.ni.k[15] = _mm_aesimc_si128(_k.ni.k[13]);
-	_k.ni.k[16] = _mm_aesimc_si128(_k.ni.k[12]);
-	_k.ni.k[17] = _mm_aesimc_si128(_k.ni.k[11]);
-	_k.ni.k[18] = _mm_aesimc_si128(_k.ni.k[10]);
-	_k.ni.k[19] = _mm_aesimc_si128(_k.ni.k[9]);
-	_k.ni.k[20] = _mm_aesimc_si128(_k.ni.k[8]);
-	_k.ni.k[21] = _mm_aesimc_si128(_k.ni.k[7]);
-	_k.ni.k[22] = _mm_aesimc_si128(_k.ni.k[6]);
-	_k.ni.k[23] = _mm_aesimc_si128(_k.ni.k[5]);
-	_k.ni.k[24] = _mm_aesimc_si128(_k.ni.k[4]);
-	_k.ni.k[25] = _mm_aesimc_si128(_k.ni.k[3]);
-	_k.ni.k[26] = _mm_aesimc_si128(_k.ni.k[2]);
-	_k.ni.k[27] = _mm_aesimc_si128(_k.ni.k[1]);
+	_k.ni.k[15] = _mm_aesimc_si128(k13);
+	_k.ni.k[16] = _mm_aesimc_si128(k12);
+	_k.ni.k[17] = _mm_aesimc_si128(k11);
+	_k.ni.k[18] = _mm_aesimc_si128(k10);
+	_k.ni.k[19] = _mm_aesimc_si128(k9);
+	_k.ni.k[20] = _mm_aesimc_si128(k8);
+	_k.ni.k[21] = _mm_aesimc_si128(k7);
+	_k.ni.k[22] = _mm_aesimc_si128(k6);
+	_k.ni.k[23] = _mm_aesimc_si128(k5);
+	_k.ni.k[24] = _mm_aesimc_si128(k4);
+	_k.ni.k[25] = _mm_aesimc_si128(k3);
+	_k.ni.k[26] = _mm_aesimc_si128(k2);
+	_k.ni.k[27] = _mm_aesimc_si128(k1);
 
 	__m128i h = _k.ni.k[0]; // _mm_xor_si128(_mm_setzero_si128(),_k.ni.k[0]);
-	h = _mm_aesenc_si128(h,_k.ni.k[1]);
-	h = _mm_aesenc_si128(h,_k.ni.k[2]);
-	h = _mm_aesenc_si128(h,_k.ni.k[3]);
-	h = _mm_aesenc_si128(h,_k.ni.k[4]);
-	h = _mm_aesenc_si128(h,_k.ni.k[5]);
-	h = _mm_aesenc_si128(h,_k.ni.k[6]);
-	h = _mm_aesenc_si128(h,_k.ni.k[7]);
-	h = _mm_aesenc_si128(h,_k.ni.k[8]);
-	h = _mm_aesenc_si128(h,_k.ni.k[9]);
-	h = _mm_aesenc_si128(h,_k.ni.k[10]);
-	h = _mm_aesenc_si128(h,_k.ni.k[11]);
-	h = _mm_aesenc_si128(h,_k.ni.k[12]);
-	h = _mm_aesenc_si128(h,_k.ni.k[13]);
+	h = _mm_aesenc_si128(h,k1);
+	h = _mm_aesenc_si128(h,k2);
+	h = _mm_aesenc_si128(h,k3);
+	h = _mm_aesenc_si128(h,k4);
+	h = _mm_aesenc_si128(h,k5);
+	h = _mm_aesenc_si128(h,k6);
+	h = _mm_aesenc_si128(h,k7);
+	h = _mm_aesenc_si128(h,k8);
+	h = _mm_aesenc_si128(h,k9);
+	h = _mm_aesenc_si128(h,k10);
+	h = _mm_aesenc_si128(h,k11);
+	h = _mm_aesenc_si128(h,k12);
+	h = _mm_aesenc_si128(h,k13);
 	h = _mm_aesenclast_si128(h,_k.ni.k[14]);
 	const __m128i shuf = s_shuf;
 	__m128i hswap = _mm_shuffle_epi8(h,shuf);
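What this hunk changes: the thirteen middle AES-256 round keys are captured in local __m128i variables (k1..k13) as the schedule is generated, so the two consumers that follow can read them from registers instead of re-loading each _k.ni.k[n] from memory. Those consumers are the equivalent-inverse-cipher schedule built with _mm_aesimc_si128 (stored in _k.ni.k[15..27] for later use with aesdec), and the _mm_aesenc_si128/_mm_aesenclast_si128 chain that encrypts the all-zero block to derive the GHASH subkey H for GCM (h starts as k[0] because 0 XOR k[0] = k[0], as the retained comment notes), which is then byte-reversed with _mm_shuffle_epi8.

The sketch below is a minimal standalone check of the same expansion structure, not ZeroTier code: the helper bodies follow Intel's published AES-256 key-expansion pattern, which is an assumption about what _init256_1_aesni/_init256_2_aesni do (only the latter's signature appears in this hunk), and the names expand1/expand2 are placeholders. It uses the same rcon sequence (0x01..0x40) and the same 13-round aesenc plus aesenclast shape as the GHASH-key derivation, verified against the FIPS-197 Appendix C.3 test vector. Build with AES-NI enabled, e.g. g++ -O2 -maes aes256_check.cpp.

// aes256_check.cpp -- standalone sketch, NOT ZeroTier code. expand1/expand2
// follow Intel's AES-256 key-expansion pattern and are only assumed to match
// what _init256_1_aesni/_init256_2_aesni do in AES.cpp.
#include <immintrin.h>
#include <cstdint>
#include <cstdio>
#include <cstring>

// Even round keys: b holds aeskeygenassist(prev_odd_key, rcon). Broadcast its
// RotWord(SubWord())^rcon word, then fold the previous even key so each 32-bit
// word accumulates the XOR of all lower words (x ^= x<<32 applied three times
// equals x ^ x<<32 ^ x<<64 ^ x<<96 over GF(2)).
static inline __m128i expand1(__m128i a, __m128i b) noexcept
{
	b = _mm_shuffle_epi32(b, 0xff);
	a = _mm_xor_si128(a, _mm_slli_si128(a, 4));
	a = _mm_xor_si128(a, _mm_slli_si128(a, 4));
	a = _mm_xor_si128(a, _mm_slli_si128(a, 4));
	return _mm_xor_si128(a, b);
}

// Odd round keys: SubWord only (no RotWord, no rcon), taken from word 2 of
// aeskeygenassist(newest_even_key, 0), then the same fold over the previous
// odd key.
static inline __m128i expand2(__m128i a, __m128i b) noexcept
{
	__m128i y = _mm_shuffle_epi32(_mm_aeskeygenassist_si128(a, 0x00), 0xaa);
	b = _mm_xor_si128(b, _mm_slli_si128(b, 4));
	b = _mm_xor_si128(b, _mm_slli_si128(b, 4));
	b = _mm_xor_si128(b, _mm_slli_si128(b, 4));
	return _mm_xor_si128(b, y);
}

int main()
{
	// FIPS-197 Appendix C.3: key = 00 01 .. 1f, plaintext = 00 11 22 .. ff.
	uint8_t key[32], pt[16];
	for (int i = 0; i < 32; ++i) key[i] = (uint8_t)i;
	for (int i = 0; i < 16; ++i) pt[i] = (uint8_t)((i << 4) | i);

	// Same expansion structure and rcon sequence as the hunk above.
	__m128i k[15];
	__m128i t1 = k[0] = _mm_loadu_si128((const __m128i *)key);
	__m128i t2 = k[1] = _mm_loadu_si128((const __m128i *)(key + 16));
	k[2]  = t1 = expand1(t1, _mm_aeskeygenassist_si128(t2, 0x01));
	k[3]  = t2 = expand2(t1, t2);
	k[4]  = t1 = expand1(t1, _mm_aeskeygenassist_si128(t2, 0x02));
	k[5]  = t2 = expand2(t1, t2);
	k[6]  = t1 = expand1(t1, _mm_aeskeygenassist_si128(t2, 0x04));
	k[7]  = t2 = expand2(t1, t2);
	k[8]  = t1 = expand1(t1, _mm_aeskeygenassist_si128(t2, 0x08));
	k[9]  = t2 = expand2(t1, t2);
	k[10] = t1 = expand1(t1, _mm_aeskeygenassist_si128(t2, 0x10));
	k[11] = t2 = expand2(t1, t2);
	k[12] = t1 = expand1(t1, _mm_aeskeygenassist_si128(t2, 0x20));
	k[13] = t2 = expand2(t1, t2);
	k[14] = expand1(t1, _mm_aeskeygenassist_si128(t2, 0x40));

	// One AES-256 block: 13 full rounds plus the final round -- exactly the
	// shape of the aesenc/aesenclast chain that derives the GHASH key H.
	__m128i x = _mm_xor_si128(_mm_loadu_si128((const __m128i *)pt), k[0]);
	for (int r = 1; r < 14; ++r)
		x = _mm_aesenc_si128(x, k[r]);
	x = _mm_aesenclast_si128(x, k[14]);

	static const uint8_t expect[16] = {
		0x8e,0xa2,0xb7,0xca,0x51,0x67,0x45,0xbf,
		0xea,0xfc,0x49,0x90,0x4b,0x49,0x60,0x89 };
	uint8_t ct[16];
	_mm_storeu_si128((__m128i *)ct, x);
	std::printf("%s\n", std::memcmp(ct, expect, 16) == 0 ? "OK" : "MISMATCH");
	return 0;
}

Caching the round keys in locals is a plausible register-allocation hint: on x86-64 there are sixteen xmm registers, so k1..k13 plus the temporaries can stay resident across the aesimc and aesenc chains, turning thirteen potential memory re-loads per chain into register reads.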