poly1305_vec.c

/* Copyright (c) 2014, Google Inc.
 *
 * Permission to use, copy, modify, and/or distribute this software for any
 * purpose with or without fee is hereby granted, provided that the above
 * copyright notice and this permission notice appear in all copies.
 *
 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
 * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
 * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
 * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */

// This implementation of poly1305 is by Andrew Moon
// (https://github.com/floodyberry/poly1305-donna) and released as public
// domain. It implements SIMD vectorization based on the algorithm described in
// http://cr.yp.to/papers.html#neoncrypto. Unrolled to 2 powers, i.e. 64 byte
// block size
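//
// The bulk of the input is processed two 16-byte blocks at a time: the two
// 64-bit lanes of each SSE2 register carry two parallel accumulators in five
// 26-bit limbs, advanced by [r^2, r^2] (or [r^4, r^4] when a 64-byte chunk
// supplies four blocks at once) and recombined at the end with a multiply by
// [r^2, r]. Any remaining tail is handled by a scalar radix-2^44 path.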

#include <GFp/poly1305.h>

#include "internal.h"
#include "../internal.h"

#if defined(BORINGSSL_HAS_UINT128) && defined(OPENSSL_X86_64)

#pragma GCC diagnostic ignored "-Wcast-align"
#pragma GCC diagnostic ignored "-Wsign-conversion"

#include <emmintrin.h>

static uint32_t load_u32_le(const uint8_t in[4]) {
  uint32_t ret;
  GFp_memcpy(&ret, in, 4);
  return ret;
}

static uint64_t load_u64_le(const uint8_t in[8]) {
  uint64_t ret;
  GFp_memcpy(&ret, in, 8);
  return ret;
}

static void store_u64_le(uint8_t out[8], uint64_t v) {
  GFp_memcpy(out, &v, 8);
}

typedef __m128i xmmi;

static const alignas(16) uint32_t poly1305_x64_sse2_message_mask[4] = {
    (1 << 26) - 1, 0, (1 << 26) - 1, 0};
static const alignas(16) uint32_t poly1305_x64_sse2_5[4] = {5, 0, 5, 0};
static const alignas(16) uint32_t poly1305_x64_sse2_1shl128[4] = {
    (1 << 24), 0, (1 << 24), 0};
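// message_mask extracts a 26-bit limb from each 64-bit lane, 5 is the factor
// used to fold carries out of the top limb (2^130 = 5 mod p), and 1shl128
// supplies the per-block padding bit: bit 24 of the fifth 26-bit limb is
// 2^(4*26 + 24) = 2^128.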

static inline uint128_t add128(uint128_t a, uint128_t b) { return a + b; }

static inline uint128_t add128_64(uint128_t a, uint64_t b) { return a + b; }

static inline uint128_t mul64x64_128(uint64_t a, uint64_t b) {
  return (uint128_t)a * b;
}

static inline uint64_t lo128(uint128_t a) { return (uint64_t)a; }

static inline uint64_t shr128(uint128_t v, const int shift) {
  return (uint64_t)(v >> shift);
}

static inline uint64_t shr128_pair(uint64_t hi, uint64_t lo, const int shift) {
  return (uint64_t)((((uint128_t)hi << 64) | lo) >> shift);
}
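
// The uint128_t helpers above back the scalar radix-2^44 code paths (the
// power computation in poly1305_first_block and the tail processing in
// GFp_poly1305_finish), where each 44x44-bit limb product needs a full
// 128-bit intermediate.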

typedef struct poly1305_power_t {
  union {
    xmmi v;
    uint64_t u[2];
    uint32_t d[4];
  } R20, R21, R22, R23, R24, S21, S22, S23, S24;
} poly1305_power;

typedef struct poly1305_state_internal_t {
  poly1305_power P[2]; /* 288 bytes, top 32 bit halves unused = 144
                          bytes of free storage */
  union {
    xmmi H[5];  // 80 bytes
    uint64_t HH[10];
  };
  // uint64_t r0,r1,r2;  [24 bytes]
  // uint64_t pad0,pad1; [16 bytes]
  uint64_t started;    // 8 bytes
  uint64_t leftover;   // 8 bytes
  uint8_t buffer[64];  // 64 bytes
} poly1305_state_internal; /* 448 bytes total + 63 bytes for
                              alignment = 511 bytes raw */

OPENSSL_STATIC_ASSERT(
    sizeof(poly1305_state_internal) <= sizeof(poly1305_state),
    "poly1305_state isn't large enough to hold aligned poly1305_state_internal");
static inline poly1305_state_internal *poly1305_aligned_state(
    poly1305_state *state) {
  dev_assert_secret(((uintptr_t)state & 63) == 0);
  return (poly1305_state_internal *)(((uint64_t)state + 63) & ~63);
}

static inline size_t poly1305_min(size_t a, size_t b) {
  return (a < b) ? a : b;
}

void GFp_poly1305_init(poly1305_state *state, const uint8_t key[32]) {
  poly1305_state_internal *st = poly1305_aligned_state(state);
  poly1305_power *p;
  uint64_t r0, r1, r2;
  uint64_t t0, t1;

  // clamp key
  t0 = load_u64_le(key + 0);
  t1 = load_u64_le(key + 8);
  r0 = t0 & 0xffc0fffffff;
  t0 >>= 44;
  t0 |= t1 << 20;
  r1 = t0 & 0xfffffc0ffff;
  t1 >>= 24;
  r2 = t1 & 0x00ffffffc0f;
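  // r is now held as three limbs, r = r0 + r1*2^44 + r2*2^88 (44/44/42 bits);
  // the masks above both split the 16-byte value into limbs and clear the
  // bits the Poly1305 clamp requires to be zero.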

  // store r in un-used space of st->P[1]
  p = &st->P[1];
  p->R20.d[1] = (uint32_t)(r0);
  p->R20.d[3] = (uint32_t)(r0 >> 32);
  p->R21.d[1] = (uint32_t)(r1);
  p->R21.d[3] = (uint32_t)(r1 >> 32);
  p->R22.d[1] = (uint32_t)(r2);
  p->R22.d[3] = (uint32_t)(r2 >> 32);

  // store pad
  p->R23.d[1] = load_u32_le(key + 16);
  p->R23.d[3] = load_u32_le(key + 20);
  p->R24.d[1] = load_u32_le(key + 24);
  p->R24.d[3] = load_u32_le(key + 28);

  // H = 0
  st->H[0] = _mm_setzero_si128();
  st->H[1] = _mm_setzero_si128();
  st->H[2] = _mm_setzero_si128();
  st->H[3] = _mm_setzero_si128();
  st->H[4] = _mm_setzero_si128();

  st->started = 0;
  st->leftover = 0;
}

static void poly1305_first_block(poly1305_state_internal *st,
                                 const uint8_t *m) {
  const xmmi MMASK =
      _mm_load_si128((const xmmi *)poly1305_x64_sse2_message_mask);
  const xmmi FIVE = _mm_load_si128((const xmmi *)poly1305_x64_sse2_5);
  const xmmi HIBIT = _mm_load_si128((const xmmi *)poly1305_x64_sse2_1shl128);
  xmmi T5, T6;
  poly1305_power *p;
  uint128_t d[3];
  uint64_t r0, r1, r2;
  uint64_t r20, r21, r22, s22;
  uint64_t pad0, pad1;
  uint64_t c;
  uint64_t i;

  // pull out stored info
  p = &st->P[1];

  r0 = ((uint64_t)p->R20.d[3] << 32) | (uint64_t)p->R20.d[1];
  r1 = ((uint64_t)p->R21.d[3] << 32) | (uint64_t)p->R21.d[1];
  r2 = ((uint64_t)p->R22.d[3] << 32) | (uint64_t)p->R22.d[1];
  pad0 = ((uint64_t)p->R23.d[3] << 32) | (uint64_t)p->R23.d[1];
  pad1 = ((uint64_t)p->R24.d[3] << 32) | (uint64_t)p->R24.d[1];

  // compute powers r^2,r^4
  r20 = r0;
  r21 = r1;
  r22 = r2;
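
  // Each pass squares (r20,r21,r22): the first iteration stores r^2, split
  // into five 26-bit limbs and splatted into both 64-bit lanes, into st->P[1];
  // the second stores r^4 into st->P[0]. S2x = 5*R2x are precomputed for the
  // modular reduction.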
  for (i = 0; i < 2; i++) {
    s22 = r22 * (5 << 2);

    d[0] = add128(mul64x64_128(r20, r20), mul64x64_128(r21 * 2, s22));
    d[1] = add128(mul64x64_128(r22, s22), mul64x64_128(r20 * 2, r21));
    d[2] = add128(mul64x64_128(r21, r21), mul64x64_128(r22 * 2, r20));

    r20 = lo128(d[0]) & 0xfffffffffff;
    c = shr128(d[0], 44);
    d[1] = add128_64(d[1], c);
    r21 = lo128(d[1]) & 0xfffffffffff;
    c = shr128(d[1], 44);
    d[2] = add128_64(d[2], c);
    r22 = lo128(d[2]) & 0x3ffffffffff;
    c = shr128(d[2], 42);
    r20 += c * 5;
    c = (r20 >> 44);
    r20 = r20 & 0xfffffffffff;
    r21 += c;

    p->R20.v = _mm_shuffle_epi32(_mm_cvtsi32_si128((uint32_t)(r20)&0x3ffffff),
                                 _MM_SHUFFLE(1, 0, 1, 0));
    p->R21.v = _mm_shuffle_epi32(
        _mm_cvtsi32_si128((uint32_t)((r20 >> 26) | (r21 << 18)) & 0x3ffffff),
        _MM_SHUFFLE(1, 0, 1, 0));
    p->R22.v =
        _mm_shuffle_epi32(_mm_cvtsi32_si128((uint32_t)((r21 >> 8)) & 0x3ffffff),
                          _MM_SHUFFLE(1, 0, 1, 0));
    p->R23.v = _mm_shuffle_epi32(
        _mm_cvtsi32_si128((uint32_t)((r21 >> 34) | (r22 << 10)) & 0x3ffffff),
        _MM_SHUFFLE(1, 0, 1, 0));
    p->R24.v = _mm_shuffle_epi32(_mm_cvtsi32_si128((uint32_t)((r22 >> 16))),
                                 _MM_SHUFFLE(1, 0, 1, 0));

    p->S21.v = _mm_mul_epu32(p->R21.v, FIVE);
    p->S22.v = _mm_mul_epu32(p->R22.v, FIVE);
    p->S23.v = _mm_mul_epu32(p->R23.v, FIVE);
    p->S24.v = _mm_mul_epu32(p->R24.v, FIVE);

    p--;
  }

  // put saved info back
  p = &st->P[1];
  p->R20.d[1] = (uint32_t)(r0);
  p->R20.d[3] = (uint32_t)(r0 >> 32);
  p->R21.d[1] = (uint32_t)(r1);
  p->R21.d[3] = (uint32_t)(r1 >> 32);
  p->R22.d[1] = (uint32_t)(r2);
  p->R22.d[3] = (uint32_t)(r2 >> 32);
  p->R23.d[1] = (uint32_t)(pad0);
  p->R23.d[3] = (uint32_t)(pad0 >> 32);
  p->R24.d[1] = (uint32_t)(pad1);
  p->R24.d[3] = (uint32_t)(pad1 >> 32);

  // H = [Mx,My]
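  // (two message blocks, one per 64-bit lane, each split into five 26-bit
  // limbs; HIBIT supplies the 2^128 padding bit for both blocks)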
  T5 = _mm_unpacklo_epi64(_mm_loadl_epi64((const xmmi *)(m + 0)),
                          _mm_loadl_epi64((const xmmi *)(m + 16)));
  T6 = _mm_unpacklo_epi64(_mm_loadl_epi64((const xmmi *)(m + 8)),
                          _mm_loadl_epi64((const xmmi *)(m + 24)));
  st->H[0] = _mm_and_si128(MMASK, T5);
  st->H[1] = _mm_and_si128(MMASK, _mm_srli_epi64(T5, 26));
  T5 = _mm_or_si128(_mm_srli_epi64(T5, 52), _mm_slli_epi64(T6, 12));
  st->H[2] = _mm_and_si128(MMASK, T5);
  st->H[3] = _mm_and_si128(MMASK, _mm_srli_epi64(T5, 26));
  st->H[4] = _mm_or_si128(_mm_srli_epi64(T6, 40), HIBIT);
}

static void poly1305_blocks(poly1305_state_internal *st, const uint8_t *m,
                            size_t bytes) {
  const xmmi MMASK =
      _mm_load_si128((const xmmi *)poly1305_x64_sse2_message_mask);
  const xmmi FIVE = _mm_load_si128((const xmmi *)poly1305_x64_sse2_5);
  const xmmi HIBIT = _mm_load_si128((const xmmi *)poly1305_x64_sse2_1shl128);

  poly1305_power *p;
  xmmi H0, H1, H2, H3, H4;
  xmmi T0, T1, T2, T3, T4, T5, T6;
  xmmi M0, M1, M2, M3, M4;
  xmmi C1, C2;

  H0 = st->H[0];
  H1 = st->H[1];
  H2 = st->H[2];
  H3 = st->H[3];
  H4 = st->H[4];
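
  // Each iteration absorbs 64 bytes (four blocks) as two interleaved pairs:
  // the running pair of accumulators is advanced by r^4, the first new pair
  // of blocks is weighted by r^2, and the second new pair is simply added,
  // i.e. H = H*[r^4,r^4] + [Mx,My]*[r^2,r^2] + [Mx',My'].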
  while (bytes >= 64) {
    // H *= [r^4,r^4]
    p = &st->P[0];
    T0 = _mm_mul_epu32(H0, p->R20.v);
    T1 = _mm_mul_epu32(H0, p->R21.v);
    T2 = _mm_mul_epu32(H0, p->R22.v);
    T3 = _mm_mul_epu32(H0, p->R23.v);
    T4 = _mm_mul_epu32(H0, p->R24.v);
    T5 = _mm_mul_epu32(H1, p->S24.v);
    T6 = _mm_mul_epu32(H1, p->R20.v);
    T0 = _mm_add_epi64(T0, T5);
    T1 = _mm_add_epi64(T1, T6);
    T5 = _mm_mul_epu32(H2, p->S23.v);
    T6 = _mm_mul_epu32(H2, p->S24.v);
    T0 = _mm_add_epi64(T0, T5);
    T1 = _mm_add_epi64(T1, T6);
    T5 = _mm_mul_epu32(H3, p->S22.v);
    T6 = _mm_mul_epu32(H3, p->S23.v);
    T0 = _mm_add_epi64(T0, T5);
    T1 = _mm_add_epi64(T1, T6);
    T5 = _mm_mul_epu32(H4, p->S21.v);
    T6 = _mm_mul_epu32(H4, p->S22.v);
    T0 = _mm_add_epi64(T0, T5);
    T1 = _mm_add_epi64(T1, T6);
    T5 = _mm_mul_epu32(H1, p->R21.v);
    T6 = _mm_mul_epu32(H1, p->R22.v);
    T2 = _mm_add_epi64(T2, T5);
    T3 = _mm_add_epi64(T3, T6);
    T5 = _mm_mul_epu32(H2, p->R20.v);
    T6 = _mm_mul_epu32(H2, p->R21.v);
    T2 = _mm_add_epi64(T2, T5);
    T3 = _mm_add_epi64(T3, T6);
    T5 = _mm_mul_epu32(H3, p->S24.v);
    T6 = _mm_mul_epu32(H3, p->R20.v);
    T2 = _mm_add_epi64(T2, T5);
    T3 = _mm_add_epi64(T3, T6);
    T5 = _mm_mul_epu32(H4, p->S23.v);
    T6 = _mm_mul_epu32(H4, p->S24.v);
    T2 = _mm_add_epi64(T2, T5);
    T3 = _mm_add_epi64(T3, T6);
    T5 = _mm_mul_epu32(H1, p->R23.v);
    T4 = _mm_add_epi64(T4, T5);
    T5 = _mm_mul_epu32(H2, p->R22.v);
    T4 = _mm_add_epi64(T4, T5);
    T5 = _mm_mul_epu32(H3, p->R21.v);
    T4 = _mm_add_epi64(T4, T5);
    T5 = _mm_mul_epu32(H4, p->R20.v);
    T4 = _mm_add_epi64(T4, T5);

    // H += [Mx,My]*[r^2,r^2]
    T5 = _mm_unpacklo_epi64(_mm_loadl_epi64((const xmmi *)(m + 0)),
                            _mm_loadl_epi64((const xmmi *)(m + 16)));
    T6 = _mm_unpacklo_epi64(_mm_loadl_epi64((const xmmi *)(m + 8)),
                            _mm_loadl_epi64((const xmmi *)(m + 24)));
    M0 = _mm_and_si128(MMASK, T5);
    M1 = _mm_and_si128(MMASK, _mm_srli_epi64(T5, 26));
    T5 = _mm_or_si128(_mm_srli_epi64(T5, 52), _mm_slli_epi64(T6, 12));
    M2 = _mm_and_si128(MMASK, T5);
    M3 = _mm_and_si128(MMASK, _mm_srli_epi64(T5, 26));
    M4 = _mm_or_si128(_mm_srli_epi64(T6, 40), HIBIT);

    p = &st->P[1];
    T5 = _mm_mul_epu32(M0, p->R20.v);
    T6 = _mm_mul_epu32(M0, p->R21.v);
    T0 = _mm_add_epi64(T0, T5);
    T1 = _mm_add_epi64(T1, T6);
    T5 = _mm_mul_epu32(M1, p->S24.v);
    T6 = _mm_mul_epu32(M1, p->R20.v);
    T0 = _mm_add_epi64(T0, T5);
    T1 = _mm_add_epi64(T1, T6);
    T5 = _mm_mul_epu32(M2, p->S23.v);
    T6 = _mm_mul_epu32(M2, p->S24.v);
    T0 = _mm_add_epi64(T0, T5);
    T1 = _mm_add_epi64(T1, T6);
    T5 = _mm_mul_epu32(M3, p->S22.v);
    T6 = _mm_mul_epu32(M3, p->S23.v);
    T0 = _mm_add_epi64(T0, T5);
    T1 = _mm_add_epi64(T1, T6);
    T5 = _mm_mul_epu32(M4, p->S21.v);
    T6 = _mm_mul_epu32(M4, p->S22.v);
    T0 = _mm_add_epi64(T0, T5);
    T1 = _mm_add_epi64(T1, T6);
    T5 = _mm_mul_epu32(M0, p->R22.v);
    T6 = _mm_mul_epu32(M0, p->R23.v);
    T2 = _mm_add_epi64(T2, T5);
    T3 = _mm_add_epi64(T3, T6);
    T5 = _mm_mul_epu32(M1, p->R21.v);
    T6 = _mm_mul_epu32(M1, p->R22.v);
    T2 = _mm_add_epi64(T2, T5);
    T3 = _mm_add_epi64(T3, T6);
    T5 = _mm_mul_epu32(M2, p->R20.v);
    T6 = _mm_mul_epu32(M2, p->R21.v);
    T2 = _mm_add_epi64(T2, T5);
    T3 = _mm_add_epi64(T3, T6);
    T5 = _mm_mul_epu32(M3, p->S24.v);
    T6 = _mm_mul_epu32(M3, p->R20.v);
    T2 = _mm_add_epi64(T2, T5);
    T3 = _mm_add_epi64(T3, T6);
    T5 = _mm_mul_epu32(M4, p->S23.v);
    T6 = _mm_mul_epu32(M4, p->S24.v);
    T2 = _mm_add_epi64(T2, T5);
    T3 = _mm_add_epi64(T3, T6);
    T5 = _mm_mul_epu32(M0, p->R24.v);
    T4 = _mm_add_epi64(T4, T5);
    T5 = _mm_mul_epu32(M1, p->R23.v);
    T4 = _mm_add_epi64(T4, T5);
    T5 = _mm_mul_epu32(M2, p->R22.v);
    T4 = _mm_add_epi64(T4, T5);
    T5 = _mm_mul_epu32(M3, p->R21.v);
    T4 = _mm_add_epi64(T4, T5);
    T5 = _mm_mul_epu32(M4, p->R20.v);
    T4 = _mm_add_epi64(T4, T5);

    // H += [Mx,My]
    T5 = _mm_unpacklo_epi64(_mm_loadl_epi64((const xmmi *)(m + 32)),
                            _mm_loadl_epi64((const xmmi *)(m + 48)));
    T6 = _mm_unpacklo_epi64(_mm_loadl_epi64((const xmmi *)(m + 40)),
                            _mm_loadl_epi64((const xmmi *)(m + 56)));
    M0 = _mm_and_si128(MMASK, T5);
    M1 = _mm_and_si128(MMASK, _mm_srli_epi64(T5, 26));
    T5 = _mm_or_si128(_mm_srli_epi64(T5, 52), _mm_slli_epi64(T6, 12));
    M2 = _mm_and_si128(MMASK, T5);
    M3 = _mm_and_si128(MMASK, _mm_srli_epi64(T5, 26));
    M4 = _mm_or_si128(_mm_srli_epi64(T6, 40), HIBIT);

    T0 = _mm_add_epi64(T0, M0);
    T1 = _mm_add_epi64(T1, M1);
    T2 = _mm_add_epi64(T2, M2);
    T3 = _mm_add_epi64(T3, M3);
    T4 = _mm_add_epi64(T4, M4);

    // reduce
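    // (lazy carry chain on the five 26-bit limbs of each lane: 0->1 and 3->4,
    // then 1->2 and 4->0 (the 4->0 carry is multiplied by 5 since
    // 2^130 = 5 mod p), then 2->3 and 0->1, and finally 3->4)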
    C1 = _mm_srli_epi64(T0, 26);
    C2 = _mm_srli_epi64(T3, 26);
    T0 = _mm_and_si128(T0, MMASK);
    T3 = _mm_and_si128(T3, MMASK);
    T1 = _mm_add_epi64(T1, C1);
    T4 = _mm_add_epi64(T4, C2);
    C1 = _mm_srli_epi64(T1, 26);
    C2 = _mm_srli_epi64(T4, 26);
    T1 = _mm_and_si128(T1, MMASK);
    T4 = _mm_and_si128(T4, MMASK);
    T2 = _mm_add_epi64(T2, C1);
    T0 = _mm_add_epi64(T0, _mm_mul_epu32(C2, FIVE));
    C1 = _mm_srli_epi64(T2, 26);
    C2 = _mm_srli_epi64(T0, 26);
    T2 = _mm_and_si128(T2, MMASK);
    T0 = _mm_and_si128(T0, MMASK);
    T3 = _mm_add_epi64(T3, C1);
    T1 = _mm_add_epi64(T1, C2);
    C1 = _mm_srli_epi64(T3, 26);
    T3 = _mm_and_si128(T3, MMASK);
    T4 = _mm_add_epi64(T4, C1);

    // H = (H*[r^4,r^4] + [Mx,My]*[r^2,r^2] + [Mx,My])
    H0 = T0;
    H1 = T1;
    H2 = T2;
    H3 = T3;
    H4 = T4;

    m += 64;
    bytes -= 64;
  }

  st->H[0] = H0;
  st->H[1] = H1;
  st->H[2] = H2;
  st->H[3] = H3;
  st->H[4] = H4;
}

static size_t poly1305_combine(poly1305_state_internal *st, const uint8_t *m,
                               size_t bytes) {
  const xmmi MMASK =
      _mm_load_si128((const xmmi *)poly1305_x64_sse2_message_mask);
  const xmmi HIBIT = _mm_load_si128((const xmmi *)poly1305_x64_sse2_1shl128);
  const xmmi FIVE = _mm_load_si128((const xmmi *)poly1305_x64_sse2_5);

  poly1305_power *p;
  xmmi H0, H1, H2, H3, H4;
  xmmi M0, M1, M2, M3, M4;
  xmmi T0, T1, T2, T3, T4, T5, T6;
  xmmi C1, C2;

  uint64_t r0, r1, r2;
  uint64_t t0, t1, t2, t3, t4;
  uint64_t c;
  size_t consumed = 0;

  H0 = st->H[0];
  H1 = st->H[1];
  H2 = st->H[2];
  H3 = st->H[3];
  H4 = st->H[4];

  // p = [r^2,r^2]
  p = &st->P[1];

  if (bytes >= 32) {
    // H *= [r^2,r^2]
    T0 = _mm_mul_epu32(H0, p->R20.v);
    T1 = _mm_mul_epu32(H0, p->R21.v);
    T2 = _mm_mul_epu32(H0, p->R22.v);
    T3 = _mm_mul_epu32(H0, p->R23.v);
    T4 = _mm_mul_epu32(H0, p->R24.v);
    T5 = _mm_mul_epu32(H1, p->S24.v);
    T6 = _mm_mul_epu32(H1, p->R20.v);
    T0 = _mm_add_epi64(T0, T5);
    T1 = _mm_add_epi64(T1, T6);
    T5 = _mm_mul_epu32(H2, p->S23.v);
    T6 = _mm_mul_epu32(H2, p->S24.v);
    T0 = _mm_add_epi64(T0, T5);
    T1 = _mm_add_epi64(T1, T6);
    T5 = _mm_mul_epu32(H3, p->S22.v);
    T6 = _mm_mul_epu32(H3, p->S23.v);
    T0 = _mm_add_epi64(T0, T5);
    T1 = _mm_add_epi64(T1, T6);
    T5 = _mm_mul_epu32(H4, p->S21.v);
    T6 = _mm_mul_epu32(H4, p->S22.v);
    T0 = _mm_add_epi64(T0, T5);
    T1 = _mm_add_epi64(T1, T6);
    T5 = _mm_mul_epu32(H1, p->R21.v);
    T6 = _mm_mul_epu32(H1, p->R22.v);
    T2 = _mm_add_epi64(T2, T5);
    T3 = _mm_add_epi64(T3, T6);
    T5 = _mm_mul_epu32(H2, p->R20.v);
    T6 = _mm_mul_epu32(H2, p->R21.v);
    T2 = _mm_add_epi64(T2, T5);
    T3 = _mm_add_epi64(T3, T6);
    T5 = _mm_mul_epu32(H3, p->S24.v);
    T6 = _mm_mul_epu32(H3, p->R20.v);
    T2 = _mm_add_epi64(T2, T5);
    T3 = _mm_add_epi64(T3, T6);
    T5 = _mm_mul_epu32(H4, p->S23.v);
    T6 = _mm_mul_epu32(H4, p->S24.v);
    T2 = _mm_add_epi64(T2, T5);
    T3 = _mm_add_epi64(T3, T6);
    T5 = _mm_mul_epu32(H1, p->R23.v);
    T4 = _mm_add_epi64(T4, T5);
    T5 = _mm_mul_epu32(H2, p->R22.v);
    T4 = _mm_add_epi64(T4, T5);
    T5 = _mm_mul_epu32(H3, p->R21.v);
    T4 = _mm_add_epi64(T4, T5);
    T5 = _mm_mul_epu32(H4, p->R20.v);
    T4 = _mm_add_epi64(T4, T5);

    // H += [Mx,My]
    T5 = _mm_unpacklo_epi64(_mm_loadl_epi64((const xmmi *)(m + 0)),
                            _mm_loadl_epi64((const xmmi *)(m + 16)));
    T6 = _mm_unpacklo_epi64(_mm_loadl_epi64((const xmmi *)(m + 8)),
                            _mm_loadl_epi64((const xmmi *)(m + 24)));
    M0 = _mm_and_si128(MMASK, T5);
    M1 = _mm_and_si128(MMASK, _mm_srli_epi64(T5, 26));
    T5 = _mm_or_si128(_mm_srli_epi64(T5, 52), _mm_slli_epi64(T6, 12));
    M2 = _mm_and_si128(MMASK, T5);
    M3 = _mm_and_si128(MMASK, _mm_srli_epi64(T5, 26));
    M4 = _mm_or_si128(_mm_srli_epi64(T6, 40), HIBIT);

    T0 = _mm_add_epi64(T0, M0);
    T1 = _mm_add_epi64(T1, M1);
    T2 = _mm_add_epi64(T2, M2);
    T3 = _mm_add_epi64(T3, M3);
    T4 = _mm_add_epi64(T4, M4);

    // reduce
    C1 = _mm_srli_epi64(T0, 26);
    C2 = _mm_srli_epi64(T3, 26);
    T0 = _mm_and_si128(T0, MMASK);
    T3 = _mm_and_si128(T3, MMASK);
    T1 = _mm_add_epi64(T1, C1);
    T4 = _mm_add_epi64(T4, C2);
    C1 = _mm_srli_epi64(T1, 26);
    C2 = _mm_srli_epi64(T4, 26);
    T1 = _mm_and_si128(T1, MMASK);
    T4 = _mm_and_si128(T4, MMASK);
    T2 = _mm_add_epi64(T2, C1);
    T0 = _mm_add_epi64(T0, _mm_mul_epu32(C2, FIVE));
    C1 = _mm_srli_epi64(T2, 26);
    C2 = _mm_srli_epi64(T0, 26);
    T2 = _mm_and_si128(T2, MMASK);
    T0 = _mm_and_si128(T0, MMASK);
    T3 = _mm_add_epi64(T3, C1);
    T1 = _mm_add_epi64(T1, C2);
    C1 = _mm_srli_epi64(T3, 26);
    T3 = _mm_and_si128(T3, MMASK);
    T4 = _mm_add_epi64(T4, C1);

    // H = (H*[r^2,r^2] + [Mx,My])
    H0 = T0;
    H1 = T1;
    H2 = T2;
    H3 = T3;
    H4 = T4;

    consumed = 32;
  }
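
  // To collapse the two lanes into a single accumulator, the lane holding the
  // earlier block of each pair must be advanced by one extra power of r, so H
  // is multiplied by [r^2, r] and the lanes are then summed.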
  // finalize, H *= [r^2,r]
  r0 = ((uint64_t)p->R20.d[3] << 32) | (uint64_t)p->R20.d[1];
  r1 = ((uint64_t)p->R21.d[3] << 32) | (uint64_t)p->R21.d[1];
  r2 = ((uint64_t)p->R22.d[3] << 32) | (uint64_t)p->R22.d[1];

  p->R20.d[2] = (uint32_t)(r0)&0x3ffffff;
  p->R21.d[2] = (uint32_t)((r0 >> 26) | (r1 << 18)) & 0x3ffffff;
  p->R22.d[2] = (uint32_t)((r1 >> 8)) & 0x3ffffff;
  p->R23.d[2] = (uint32_t)((r1 >> 34) | (r2 << 10)) & 0x3ffffff;
  p->R24.d[2] = (uint32_t)((r2 >> 16));
  p->S21.d[2] = p->R21.d[2] * 5;
  p->S22.d[2] = p->R22.d[2] * 5;
  p->S23.d[2] = p->R23.d[2] * 5;
  p->S24.d[2] = p->R24.d[2] * 5;
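  // P[1] already holds r^2 splatted into both lanes; overwriting element d[2]
  // (the low half of the upper lane) with the 26-bit limbs of r itself turns
  // each register into the pair [r^2, r].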

  // H *= [r^2,r]
  T0 = _mm_mul_epu32(H0, p->R20.v);
  T1 = _mm_mul_epu32(H0, p->R21.v);
  T2 = _mm_mul_epu32(H0, p->R22.v);
  T3 = _mm_mul_epu32(H0, p->R23.v);
  T4 = _mm_mul_epu32(H0, p->R24.v);
  T5 = _mm_mul_epu32(H1, p->S24.v);
  T6 = _mm_mul_epu32(H1, p->R20.v);
  T0 = _mm_add_epi64(T0, T5);
  T1 = _mm_add_epi64(T1, T6);
  T5 = _mm_mul_epu32(H2, p->S23.v);
  T6 = _mm_mul_epu32(H2, p->S24.v);
  T0 = _mm_add_epi64(T0, T5);
  T1 = _mm_add_epi64(T1, T6);
  T5 = _mm_mul_epu32(H3, p->S22.v);
  T6 = _mm_mul_epu32(H3, p->S23.v);
  T0 = _mm_add_epi64(T0, T5);
  T1 = _mm_add_epi64(T1, T6);
  T5 = _mm_mul_epu32(H4, p->S21.v);
  T6 = _mm_mul_epu32(H4, p->S22.v);
  T0 = _mm_add_epi64(T0, T5);
  T1 = _mm_add_epi64(T1, T6);
  T5 = _mm_mul_epu32(H1, p->R21.v);
  T6 = _mm_mul_epu32(H1, p->R22.v);
  T2 = _mm_add_epi64(T2, T5);
  T3 = _mm_add_epi64(T3, T6);
  T5 = _mm_mul_epu32(H2, p->R20.v);
  T6 = _mm_mul_epu32(H2, p->R21.v);
  T2 = _mm_add_epi64(T2, T5);
  T3 = _mm_add_epi64(T3, T6);
  T5 = _mm_mul_epu32(H3, p->S24.v);
  T6 = _mm_mul_epu32(H3, p->R20.v);
  T2 = _mm_add_epi64(T2, T5);
  T3 = _mm_add_epi64(T3, T6);
  T5 = _mm_mul_epu32(H4, p->S23.v);
  T6 = _mm_mul_epu32(H4, p->S24.v);
  T2 = _mm_add_epi64(T2, T5);
  T3 = _mm_add_epi64(T3, T6);
  T5 = _mm_mul_epu32(H1, p->R23.v);
  T4 = _mm_add_epi64(T4, T5);
  T5 = _mm_mul_epu32(H2, p->R22.v);
  T4 = _mm_add_epi64(T4, T5);
  T5 = _mm_mul_epu32(H3, p->R21.v);
  T4 = _mm_add_epi64(T4, T5);
  T5 = _mm_mul_epu32(H4, p->R20.v);
  T4 = _mm_add_epi64(T4, T5);

  C1 = _mm_srli_epi64(T0, 26);
  C2 = _mm_srli_epi64(T3, 26);
  T0 = _mm_and_si128(T0, MMASK);
  T3 = _mm_and_si128(T3, MMASK);
  T1 = _mm_add_epi64(T1, C1);
  T4 = _mm_add_epi64(T4, C2);
  C1 = _mm_srli_epi64(T1, 26);
  C2 = _mm_srli_epi64(T4, 26);
  T1 = _mm_and_si128(T1, MMASK);
  T4 = _mm_and_si128(T4, MMASK);
  T2 = _mm_add_epi64(T2, C1);
  T0 = _mm_add_epi64(T0, _mm_mul_epu32(C2, FIVE));
  C1 = _mm_srli_epi64(T2, 26);
  C2 = _mm_srli_epi64(T0, 26);
  T2 = _mm_and_si128(T2, MMASK);
  T0 = _mm_and_si128(T0, MMASK);
  T3 = _mm_add_epi64(T3, C1);
  T1 = _mm_add_epi64(T1, C2);
  C1 = _mm_srli_epi64(T3, 26);
  T3 = _mm_and_si128(T3, MMASK);
  T4 = _mm_add_epi64(T4, C1);

  // H = H[0]+H[1]
  H0 = _mm_add_epi64(T0, _mm_srli_si128(T0, 8));
  H1 = _mm_add_epi64(T1, _mm_srli_si128(T1, 8));
  H2 = _mm_add_epi64(T2, _mm_srli_si128(T2, 8));
  H3 = _mm_add_epi64(T3, _mm_srli_si128(T3, 8));
  H4 = _mm_add_epi64(T4, _mm_srli_si128(T4, 8));
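
  // With the lanes summed, carry the five 26-bit limbs and repack them into
  // the three 44/44/42-bit limbs expected by the scalar finisher in st->HH.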
  t0 = _mm_cvtsi128_si32(H0);
  c = (t0 >> 26);
  t0 &= 0x3ffffff;
  t1 = _mm_cvtsi128_si32(H1) + c;
  c = (t1 >> 26);
  t1 &= 0x3ffffff;
  t2 = _mm_cvtsi128_si32(H2) + c;
  c = (t2 >> 26);
  t2 &= 0x3ffffff;
  t3 = _mm_cvtsi128_si32(H3) + c;
  c = (t3 >> 26);
  t3 &= 0x3ffffff;
  t4 = _mm_cvtsi128_si32(H4) + c;
  c = (t4 >> 26);
  t4 &= 0x3ffffff;
  t0 = t0 + (c * 5);
  c = (t0 >> 26);
  t0 &= 0x3ffffff;
  t1 = t1 + c;

  st->HH[0] = ((t0) | (t1 << 26)) & UINT64_C(0xfffffffffff);
  st->HH[1] = ((t1 >> 18) | (t2 << 8) | (t3 << 34)) & UINT64_C(0xfffffffffff);
  st->HH[2] = ((t3 >> 10) | (t4 << 16)) & UINT64_C(0x3ffffffffff);

  return consumed;
}

void GFp_poly1305_update(poly1305_state *state, const uint8_t *m,
                         size_t bytes) {
  poly1305_state_internal *st = poly1305_aligned_state(state);
  size_t want;

  // Work around a C language bug. See https://crbug.com/1019588.
  if (bytes == 0) {
    return;
  }

  // need at least 32 initial bytes to start the accelerated branch
  if (!st->started) {
    if ((st->leftover == 0) && (bytes > 32)) {
      poly1305_first_block(st, m);
      m += 32;
      bytes -= 32;
    } else {
      want = poly1305_min(32 - st->leftover, bytes);
      GFp_memcpy(st->buffer + st->leftover, m, want);
      bytes -= want;
      m += want;
      st->leftover += want;
      if ((st->leftover < 32) || (bytes == 0)) {
        return;
      }
      poly1305_first_block(st, st->buffer);
      st->leftover = 0;
    }
    st->started = 1;
  }
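
  // Once started, input is buffered in 64-byte units so poly1305_blocks only
  // ever sees whole 64-byte (four-block) chunks; anything shorter is saved in
  // st->buffer until more input arrives or GFp_poly1305_finish runs.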
  // handle leftover
  if (st->leftover) {
    want = poly1305_min(64 - st->leftover, bytes);
    GFp_memcpy(st->buffer + st->leftover, m, want);
    bytes -= want;
    m += want;
    st->leftover += want;
    if (st->leftover < 64) {
      return;
    }
    poly1305_blocks(st, st->buffer, 64);
    st->leftover = 0;
  }

  // process 64 byte blocks
  if (bytes >= 64) {
    want = (bytes & ~63);
    poly1305_blocks(st, m, want);
    m += want;
    bytes -= want;
  }

  if (bytes) {
    GFp_memcpy(st->buffer + st->leftover, m, bytes);
    st->leftover += bytes;
  }
}

void GFp_poly1305_finish(poly1305_state *state, uint8_t mac[16]) {
  poly1305_state_internal *st = poly1305_aligned_state(state);
  size_t leftover = st->leftover;
  uint8_t *m = st->buffer;
  uint128_t d[3];
  uint64_t h0, h1, h2;
  uint64_t t0, t1;
  uint64_t g0, g1, g2, c, nc;
  uint64_t r0, r1, r2, s1, s2;
  poly1305_power *p;

  if (st->started) {
    size_t consumed = poly1305_combine(st, m, leftover);
    leftover -= consumed;
    m += consumed;
  }

  // st->HH will either be 0 or have the combined result
  h0 = st->HH[0];
  h1 = st->HH[1];
  h2 = st->HH[2];

  p = &st->P[1];
  r0 = ((uint64_t)p->R20.d[3] << 32) | (uint64_t)p->R20.d[1];
  r1 = ((uint64_t)p->R21.d[3] << 32) | (uint64_t)p->R21.d[1];
  r2 = ((uint64_t)p->R22.d[3] << 32) | (uint64_t)p->R22.d[1];
  s1 = r1 * (5 << 2);
  s2 = r2 * (5 << 2);
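
  // The remaining buffered bytes are absorbed serially in radix 2^44. Cross
  // products whose weight reaches 2^132 are folded back using 2^130 = 5
  // (mod p), which with the 44/44/42-bit limb split gives the precomputed
  // factors s1 = r1 * (5 << 2) and s2 = r2 * (5 << 2).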
  if (leftover < 16) {
    goto poly1305_donna_atmost15bytes;
  }

poly1305_donna_atleast16bytes:
  t0 = load_u64_le(m + 0);
  t1 = load_u64_le(m + 8);

  h0 += t0 & 0xfffffffffff;
  t0 = shr128_pair(t1, t0, 44);
  h1 += t0 & 0xfffffffffff;
  h2 += (t1 >> 24) | ((uint64_t)1 << 40);

poly1305_donna_mul:
  d[0] = add128(add128(mul64x64_128(h0, r0), mul64x64_128(h1, s2)),
                mul64x64_128(h2, s1));
  d[1] = add128(add128(mul64x64_128(h0, r1), mul64x64_128(h1, r0)),
                mul64x64_128(h2, s2));
  d[2] = add128(add128(mul64x64_128(h0, r2), mul64x64_128(h1, r1)),
                mul64x64_128(h2, r0));
  h0 = lo128(d[0]) & 0xfffffffffff;
  c = shr128(d[0], 44);
  d[1] = add128_64(d[1], c);
  h1 = lo128(d[1]) & 0xfffffffffff;
  c = shr128(d[1], 44);
  d[2] = add128_64(d[2], c);
  h2 = lo128(d[2]) & 0x3ffffffffff;
  c = shr128(d[2], 42);
  h0 += c * 5;

  m += 16;
  leftover -= 16;
  if (leftover >= 16) {
    goto poly1305_donna_atleast16bytes;
  }

  // final bytes
poly1305_donna_atmost15bytes:
  if (!leftover) {
    goto poly1305_donna_finish;
  }

  m[leftover++] = 1;
  GFp_memset(m + leftover, 0, 16 - leftover);
  leftover = 16;

  t0 = load_u64_le(m + 0);
  t1 = load_u64_le(m + 8);

  h0 += t0 & 0xfffffffffff;
  t0 = shr128_pair(t1, t0, 44);
  h1 += t0 & 0xfffffffffff;
  h2 += (t1 >> 24);

  goto poly1305_donna_mul;

poly1305_donna_finish:
  c = (h0 >> 44);
  h0 &= 0xfffffffffff;
  h1 += c;
  c = (h1 >> 44);
  h1 &= 0xfffffffffff;
  h2 += c;
  c = (h2 >> 42);
  h2 &= 0x3ffffffffff;
  h0 += c * 5;
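
  // Constant-time final reduction: compute g = h - p (as h + 5 - 2^130) and
  // use the sign of its top limb to select h or g without branching.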
  g0 = h0 + 5;
  c = (g0 >> 44);
  g0 &= 0xfffffffffff;
  g1 = h1 + c;
  c = (g1 >> 44);
  g1 &= 0xfffffffffff;
  g2 = h2 + c - ((uint64_t)1 << 42);

  c = (g2 >> 63) - 1;
  nc = ~c;
  h0 = (h0 & nc) | (g0 & c);
  h1 = (h1 & nc) | (g1 & c);
  h2 = (h2 & nc) | (g2 & c);

  // pad
  t0 = ((uint64_t)p->R23.d[3] << 32) | (uint64_t)p->R23.d[1];
  t1 = ((uint64_t)p->R24.d[3] << 32) | (uint64_t)p->R24.d[1];
  h0 += (t0 & 0xfffffffffff);
  c = (h0 >> 44);
  h0 &= 0xfffffffffff;
  t0 = shr128_pair(t1, t0, 44);
  h1 += (t0 & 0xfffffffffff) + c;
  c = (h1 >> 44);
  h1 &= 0xfffffffffff;
  t1 = (t1 >> 24);
  h2 += (t1) + c;

  store_u64_le(mac + 0, ((h0) | (h1 << 44)));
  store_u64_le(mac + 8, ((h1 >> 20) | (h2 << 24)));
}

#endif  // BORINGSSL_HAS_UINT128 && OPENSSL_X86_64