2
0

sse2.h 2.7 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697
  1. #include <tmmintrin.h>
  /* Left-rotation count, in bits, that gimli_core() applies to the y row
   * (via rotate(y, S)) at the start of every SP-box step. */
  #define S 9
  /*
   * Shift each of the four 32-bit lanes of x left by `bits`, filling the
   * vacated low bits with zeros. Lanes are independent: bits shifted out of
   * one lane are discarded, not carried into its neighbour.
   */
  static inline __m128i
  shift(__m128i x, int bits)
  {
      const __m128i shifted = _mm_slli_epi32(x, bits);

      return shifted;
  }
  /*
   * Rotate each of the four 32-bit lanes of x left by `bits`.
   *
   * The two halves of the rotation are combined with _mm_or_si128() rather
   * than the `|` operator: applying C operators directly to __m128i is a
   * GCC/Clang vector extension and does not compile where __m128i is an
   * ordinary aggregate type.
   *
   * The only caller in this file passes S (9). A count of 0 also works,
   * because a lane shift by 32 yields zero and the OR returns x unchanged.
   */
  static inline __m128i
  rotate(__m128i x, int bits)
  {
      const __m128i high = _mm_slli_epi32(x, bits);
      const __m128i low  = _mm_srli_epi32(x, 32 - bits);

      return _mm_or_si128(high, low);
  }
  /*
   * Rotate each of the four 32-bit lanes of x left by 24 bits (equivalently,
   * right by 8 bits).
   *
   * With SSSE3 this is a single byte shuffle. Without it, the rotation is
   * built from two SSE2 lane shifts and an OR, which keeps the value in a
   * register instead of spilling it to memory and moving bytes one by one.
   */
  #ifdef __SSSE3__
  static inline __m128i
  rotate24(__m128i x)
  {
      /* Within every 4-byte group, output byte i takes input byte (i + 1) mod 4. */
      return _mm_shuffle_epi8(x, _mm_set_epi8(12, 15, 14, 13, 8, 11, 10, 9, 4, 7, 6, 5, 0, 3, 2, 1));
  }
  #else
  static inline __m128i
  rotate24(__m128i x)
  {
      const __m128i high = _mm_slli_epi32(x, 24);
      const __m128i low  = _mm_srli_epi32(x, 8);

      return _mm_or_si128(high, low);
  }
  #endif
  32. static const uint32_t coeffs[24] _hydro_attr_aligned_(16) = {
  33. 0x9e377904, 0, 0, 0, 0x9e377908, 0, 0, 0, 0x9e37790c, 0, 0, 0,
  34. 0x9e377910, 0, 0, 0, 0x9e377914, 0, 0, 0, 0x9e377918, 0, 0, 0,
  35. };
  36. static void
  37. gimli_core(uint32_t state[gimli_BLOCKBYTES / 4])
  38. {
  39. __m128i x = _mm_loadu_si128((const __m128i *) (const void *) &state[0]);
  40. __m128i y = _mm_loadu_si128((const __m128i *) (const void *) &state[4]);
  41. __m128i z = _mm_loadu_si128((const __m128i *) (const void *) &state[8]);
  42. __m128i newy;
  43. __m128i newz;
  44. int round;
  45. for (round = 5; round >= 0; round--) {
  46. x = rotate24(x);
  47. y = rotate(y, S);
  48. newz = x ^ shift(z, 1) ^ shift(y & z, 2);
  49. newy = y ^ x ^ shift(x | z, 1);
  50. x = z ^ y ^ shift(x & y, 3);
  51. y = newy;
  52. z = newz;
  53. x = _mm_shuffle_epi32(x, _MM_SHUFFLE(2, 3, 0, 1));
  54. x ^= ((const __m128i *) (const void *) coeffs)[round];
  55. x = rotate24(x);
  56. y = rotate(y, S);
  57. newz = x ^ shift(z, 1) ^ shift(y & z, 2);
  58. newy = y ^ x ^ shift(x | z, 1);
  59. x = z ^ y ^ shift(x & y, 3);
  60. y = newy;
  61. z = newz;
  62. x = rotate24(x);
  63. y = rotate(y, S);
  64. newz = x ^ shift(z, 1) ^ shift(y & z, 2);
  65. newy = y ^ x ^ shift(x | z, 1);
  66. x = z ^ y ^ shift(x & y, 3);
  67. y = newy;
  68. z = newz;
  69. x = _mm_shuffle_epi32(x, _MM_SHUFFLE(1, 0, 3, 2));
  70. x = rotate24(x);
  71. y = rotate(y, S);
  72. newz = x ^ shift(z, 1) ^ shift(y & z, 2);
  73. newy = y ^ x ^ shift(x | z, 1);
  74. x = z ^ y ^ shift(x & y, 3);
  75. y = newy;
  76. z = newz;
  77. }
  78. _mm_storeu_si128((__m128i *) (void *) &state[0], x);
  79. _mm_storeu_si128((__m128i *) (void *) &state[4], y);
  80. _mm_storeu_si128((__m128i *) (void *) &state[8], z);
  81. }