/* bn_mul.h 44 KB */


/**
 * \file bn_mul.h
 *
 * \brief Multi-precision integer library
 */
/*
 *  Copyright The Mbed TLS Contributors
 *  SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
 */
/*
 *      Multiply source vector [s] with b, add result
 *       to destination vector [d] and set carry c.
 *
 *      Currently supports:
 *
 *         . IA-32 (386+)         . AMD64 / EM64T
 *         . IA-32 (SSE2)         . Motorola 68000
 *         . PowerPC, 32-bit      . MicroBlaze
 *         . PowerPC, 64-bit      . TriCore
 *         . SPARC v8             . ARM v3+
 *         . Alpha                . MIPS32
 *         . C, longlong          . C, generic
 */
#ifndef MBEDTLS_BN_MUL_H
#define MBEDTLS_BN_MUL_H

#include "mbedtls/build_info.h"

#include "mbedtls/bignum.h"
  28. /*
  29. * Conversion macros for embedded constants:
  30. * build lists of mbedtls_mpi_uint's from lists of unsigned char's grouped by 8, 4 or 2
  31. */
  32. #if defined(MBEDTLS_HAVE_INT32)
  33. #define MBEDTLS_BYTES_TO_T_UINT_4(a, b, c, d) \
  34. ((mbedtls_mpi_uint) (a) << 0) | \
  35. ((mbedtls_mpi_uint) (b) << 8) | \
  36. ((mbedtls_mpi_uint) (c) << 16) | \
  37. ((mbedtls_mpi_uint) (d) << 24)
  38. #define MBEDTLS_BYTES_TO_T_UINT_2(a, b) \
  39. MBEDTLS_BYTES_TO_T_UINT_4(a, b, 0, 0)
  40. #define MBEDTLS_BYTES_TO_T_UINT_8(a, b, c, d, e, f, g, h) \
  41. MBEDTLS_BYTES_TO_T_UINT_4(a, b, c, d), \
  42. MBEDTLS_BYTES_TO_T_UINT_4(e, f, g, h)
  43. #else /* 64-bits */
  44. #define MBEDTLS_BYTES_TO_T_UINT_8(a, b, c, d, e, f, g, h) \
  45. ((mbedtls_mpi_uint) (a) << 0) | \
  46. ((mbedtls_mpi_uint) (b) << 8) | \
  47. ((mbedtls_mpi_uint) (c) << 16) | \
  48. ((mbedtls_mpi_uint) (d) << 24) | \
  49. ((mbedtls_mpi_uint) (e) << 32) | \
  50. ((mbedtls_mpi_uint) (f) << 40) | \
  51. ((mbedtls_mpi_uint) (g) << 48) | \
  52. ((mbedtls_mpi_uint) (h) << 56)
  53. #define MBEDTLS_BYTES_TO_T_UINT_4(a, b, c, d) \
  54. MBEDTLS_BYTES_TO_T_UINT_8(a, b, c, d, 0, 0, 0, 0)
  55. #define MBEDTLS_BYTES_TO_T_UINT_2(a, b) \
  56. MBEDTLS_BYTES_TO_T_UINT_8(a, b, 0, 0, 0, 0, 0, 0)
  57. #endif /* bits in mbedtls_mpi_uint */
/* *INDENT-OFF* */
#if defined(MBEDTLS_HAVE_ASM)

/* armcc5 --gnu defines __GNUC__ but doesn't support GNU's extended asm */
#if defined(__GNUC__) && \
    ( !defined(__ARMCC_VERSION) || __ARMCC_VERSION >= 6000000 )
  63. /*
  64. * GCC < 5.0 treated the x86 ebx (which is used for the GOT) as a
  65. * fixed reserved register when building as PIC, leading to errors
  66. * like: bn_mul.h:46:13: error: PIC register clobbered by 'ebx' in 'asm'
  67. *
  68. * This is fixed by an improved register allocator in GCC 5+. From the
  69. * release notes:
  70. * Register allocation improvements: Reuse of the PIC hard register,
  71. * instead of using a fixed register, was implemented on x86/x86-64
  72. * targets. This improves generated PIC code performance as more hard
  73. * registers can be used.
  74. */
  75. #if defined(__GNUC__) && __GNUC__ < 5 && defined(__PIC__)
  76. #define MULADDC_CANNOT_USE_EBX
  77. #endif
  78. /*
  79. * Disable use of the i386 assembly code below if option -O0, to disable all
  80. * compiler optimisations, is passed, detected with __OPTIMIZE__
  81. * This is done as the number of registers used in the assembly code doesn't
  82. * work with the -O0 option.
  83. */
  84. #if defined(__i386__) && defined(__OPTIMIZE__) && !defined(MULADDC_CANNOT_USE_EBX)
  85. #define MULADDC_X1_INIT \
  86. { mbedtls_mpi_uint t; \
  87. asm( \
  88. "movl %%ebx, %0 \n\t" \
  89. "movl %5, %%esi \n\t" \
  90. "movl %6, %%edi \n\t" \
  91. "movl %7, %%ecx \n\t" \
  92. "movl %8, %%ebx \n\t"
  93. #define MULADDC_X1_CORE \
  94. "lodsl \n\t" \
  95. "mull %%ebx \n\t" \
  96. "addl %%ecx, %%eax \n\t" \
  97. "adcl $0, %%edx \n\t" \
  98. "addl (%%edi), %%eax \n\t" \
  99. "adcl $0, %%edx \n\t" \
  100. "movl %%edx, %%ecx \n\t" \
  101. "stosl \n\t"
  102. #define MULADDC_X1_STOP \
  103. "movl %4, %%ebx \n\t" \
  104. "movl %%ecx, %1 \n\t" \
  105. "movl %%edi, %2 \n\t" \
  106. "movl %%esi, %3 \n\t" \
  107. : "=m" (t), "=m" (c), "=m" (d), "=m" (s) \
  108. : "m" (t), "m" (s), "m" (d), "m" (c), "m" (b) \
  109. : "eax", "ebx", "ecx", "edx", "esi", "edi" \
  110. ); }
  111. #if defined(MBEDTLS_HAVE_SSE2)
  112. #define MULADDC_X8_INIT MULADDC_X1_INIT
  113. #define MULADDC_X8_CORE \
  114. "movd %%ecx, %%mm1 \n\t" \
  115. "movd %%ebx, %%mm0 \n\t" \
  116. "movd (%%edi), %%mm3 \n\t" \
  117. "paddq %%mm3, %%mm1 \n\t" \
  118. "movd (%%esi), %%mm2 \n\t" \
  119. "pmuludq %%mm0, %%mm2 \n\t" \
  120. "movd 4(%%esi), %%mm4 \n\t" \
  121. "pmuludq %%mm0, %%mm4 \n\t" \
  122. "movd 8(%%esi), %%mm6 \n\t" \
  123. "pmuludq %%mm0, %%mm6 \n\t" \
  124. "movd 12(%%esi), %%mm7 \n\t" \
  125. "pmuludq %%mm0, %%mm7 \n\t" \
  126. "paddq %%mm2, %%mm1 \n\t" \
  127. "movd 4(%%edi), %%mm3 \n\t" \
  128. "paddq %%mm4, %%mm3 \n\t" \
  129. "movd 8(%%edi), %%mm5 \n\t" \
  130. "paddq %%mm6, %%mm5 \n\t" \
  131. "movd 12(%%edi), %%mm4 \n\t" \
  132. "paddq %%mm4, %%mm7 \n\t" \
  133. "movd %%mm1, (%%edi) \n\t" \
  134. "movd 16(%%esi), %%mm2 \n\t" \
  135. "pmuludq %%mm0, %%mm2 \n\t" \
  136. "psrlq $32, %%mm1 \n\t" \
  137. "movd 20(%%esi), %%mm4 \n\t" \
  138. "pmuludq %%mm0, %%mm4 \n\t" \
  139. "paddq %%mm3, %%mm1 \n\t" \
  140. "movd 24(%%esi), %%mm6 \n\t" \
  141. "pmuludq %%mm0, %%mm6 \n\t" \
  142. "movd %%mm1, 4(%%edi) \n\t" \
  143. "psrlq $32, %%mm1 \n\t" \
  144. "movd 28(%%esi), %%mm3 \n\t" \
  145. "pmuludq %%mm0, %%mm3 \n\t" \
  146. "paddq %%mm5, %%mm1 \n\t" \
  147. "movd 16(%%edi), %%mm5 \n\t" \
  148. "paddq %%mm5, %%mm2 \n\t" \
  149. "movd %%mm1, 8(%%edi) \n\t" \
  150. "psrlq $32, %%mm1 \n\t" \
  151. "paddq %%mm7, %%mm1 \n\t" \
  152. "movd 20(%%edi), %%mm5 \n\t" \
  153. "paddq %%mm5, %%mm4 \n\t" \
  154. "movd %%mm1, 12(%%edi) \n\t" \
  155. "psrlq $32, %%mm1 \n\t" \
  156. "paddq %%mm2, %%mm1 \n\t" \
  157. "movd 24(%%edi), %%mm5 \n\t" \
  158. "paddq %%mm5, %%mm6 \n\t" \
  159. "movd %%mm1, 16(%%edi) \n\t" \
  160. "psrlq $32, %%mm1 \n\t" \
  161. "paddq %%mm4, %%mm1 \n\t" \
  162. "movd 28(%%edi), %%mm5 \n\t" \
  163. "paddq %%mm5, %%mm3 \n\t" \
  164. "movd %%mm1, 20(%%edi) \n\t" \
  165. "psrlq $32, %%mm1 \n\t" \
  166. "paddq %%mm6, %%mm1 \n\t" \
  167. "movd %%mm1, 24(%%edi) \n\t" \
  168. "psrlq $32, %%mm1 \n\t" \
  169. "paddq %%mm3, %%mm1 \n\t" \
  170. "movd %%mm1, 28(%%edi) \n\t" \
  171. "addl $32, %%edi \n\t" \
  172. "addl $32, %%esi \n\t" \
  173. "psrlq $32, %%mm1 \n\t" \
  174. "movd %%mm1, %%ecx \n\t"
  175. #define MULADDC_X8_STOP \
  176. "emms \n\t" \
  177. "movl %4, %%ebx \n\t" \
  178. "movl %%ecx, %1 \n\t" \
  179. "movl %%edi, %2 \n\t" \
  180. "movl %%esi, %3 \n\t" \
  181. : "=m" (t), "=m" (c), "=m" (d), "=m" (s) \
  182. : "m" (t), "m" (s), "m" (d), "m" (c), "m" (b) \
  183. : "eax", "ebx", "ecx", "edx", "esi", "edi" \
  184. ); } \
  185. #endif /* SSE2 */
  186. #endif /* i386 */
  187. #if defined(__amd64__) || defined (__x86_64__)
  188. #define MULADDC_X1_INIT \
  189. asm( \
  190. "xorq %%r8, %%r8\n"
  191. #define MULADDC_X1_CORE \
  192. "movq (%%rsi), %%rax\n" \
  193. "mulq %%rbx\n" \
  194. "addq $8, %%rsi\n" \
  195. "addq %%rcx, %%rax\n" \
  196. "movq %%r8, %%rcx\n" \
  197. "adcq $0, %%rdx\n" \
  198. "nop \n" \
  199. "addq %%rax, (%%rdi)\n" \
  200. "adcq %%rdx, %%rcx\n" \
  201. "addq $8, %%rdi\n"
  202. #define MULADDC_X1_STOP \
  203. : "+c" (c), "+D" (d), "+S" (s), "+m" (*(uint64_t (*)[16]) d) \
  204. : "b" (b), "m" (*(const uint64_t (*)[16]) s) \
  205. : "rax", "rdx", "r8" \
  206. );
  207. #endif /* AMD64 */
  208. // The following assembly code assumes that a pointer will fit in a 64-bit register
  209. // (including ILP32 __aarch64__ ABIs such as on watchOS, hence the 2^32 - 1)
  210. #if defined(__aarch64__) && (UINTPTR_MAX == 0xfffffffful || UINTPTR_MAX == 0xfffffffffffffffful)
  211. /*
  212. * There are some issues around different compilers requiring different constraint
  213. * syntax for updating pointers from assembly code (see notes for
  214. * MBEDTLS_ASM_AARCH64_PTR_CONSTRAINT in common.h), especially on aarch64_32 (aka ILP32).
  215. *
  216. * For this reason we cast the pointers to/from uintptr_t here.
  217. */
  218. #define MULADDC_X1_INIT \
  219. do { uintptr_t muladdc_d = (uintptr_t) d, muladdc_s = (uintptr_t) s; asm(
  220. #define MULADDC_X1_CORE \
  221. "ldr x4, [%x2], #8 \n\t" \
  222. "ldr x5, [%x1] \n\t" \
  223. "mul x6, x4, %4 \n\t" \
  224. "umulh x7, x4, %4 \n\t" \
  225. "adds x5, x5, x6 \n\t" \
  226. "adc x7, x7, xzr \n\t" \
  227. "adds x5, x5, %0 \n\t" \
  228. "adc %0, x7, xzr \n\t" \
  229. "str x5, [%x1], #8 \n\t"
  230. #define MULADDC_X1_STOP \
  231. : "+r" (c), \
  232. "+r" (muladdc_d), \
  233. "+r" (muladdc_s), \
  234. "+m" (*(uint64_t (*)[16]) d) \
  235. : "r" (b), "m" (*(const uint64_t (*)[16]) s) \
  236. : "x4", "x5", "x6", "x7", "cc" \
  237. ); d = (mbedtls_mpi_uint *)muladdc_d; s = (mbedtls_mpi_uint *)muladdc_s; } while (0);
  238. #endif /* Aarch64 */
  239. #if defined(__mc68020__) || defined(__mcpu32__)
  240. #define MULADDC_X1_INIT \
  241. asm( \
  242. "movl %3, %%a2 \n\t" \
  243. "movl %4, %%a3 \n\t" \
  244. "movl %5, %%d3 \n\t" \
  245. "movl %6, %%d2 \n\t" \
  246. "moveq #0, %%d0 \n\t"
  247. #define MULADDC_X1_CORE \
  248. "movel %%a2@+, %%d1 \n\t" \
  249. "mulul %%d2, %%d4:%%d1 \n\t" \
  250. "addl %%d3, %%d1 \n\t" \
  251. "addxl %%d0, %%d4 \n\t" \
  252. "moveq #0, %%d3 \n\t" \
  253. "addl %%d1, %%a3@+ \n\t" \
  254. "addxl %%d4, %%d3 \n\t"
  255. #define MULADDC_X1_STOP \
  256. "movl %%d3, %0 \n\t" \
  257. "movl %%a3, %1 \n\t" \
  258. "movl %%a2, %2 \n\t" \
  259. : "=m" (c), "=m" (d), "=m" (s) \
  260. : "m" (s), "m" (d), "m" (c), "m" (b) \
  261. : "d0", "d1", "d2", "d3", "d4", "a2", "a3" \
  262. );
  263. #define MULADDC_X8_INIT MULADDC_X1_INIT
  264. #define MULADDC_X8_CORE \
  265. "movel %%a2@+, %%d1 \n\t" \
  266. "mulul %%d2, %%d4:%%d1 \n\t" \
  267. "addxl %%d3, %%d1 \n\t" \
  268. "addxl %%d0, %%d4 \n\t" \
  269. "addl %%d1, %%a3@+ \n\t" \
  270. "movel %%a2@+, %%d1 \n\t" \
  271. "mulul %%d2, %%d3:%%d1 \n\t" \
  272. "addxl %%d4, %%d1 \n\t" \
  273. "addxl %%d0, %%d3 \n\t" \
  274. "addl %%d1, %%a3@+ \n\t" \
  275. "movel %%a2@+, %%d1 \n\t" \
  276. "mulul %%d2, %%d4:%%d1 \n\t" \
  277. "addxl %%d3, %%d1 \n\t" \
  278. "addxl %%d0, %%d4 \n\t" \
  279. "addl %%d1, %%a3@+ \n\t" \
  280. "movel %%a2@+, %%d1 \n\t" \
  281. "mulul %%d2, %%d3:%%d1 \n\t" \
  282. "addxl %%d4, %%d1 \n\t" \
  283. "addxl %%d0, %%d3 \n\t" \
  284. "addl %%d1, %%a3@+ \n\t" \
  285. "movel %%a2@+, %%d1 \n\t" \
  286. "mulul %%d2, %%d4:%%d1 \n\t" \
  287. "addxl %%d3, %%d1 \n\t" \
  288. "addxl %%d0, %%d4 \n\t" \
  289. "addl %%d1, %%a3@+ \n\t" \
  290. "movel %%a2@+, %%d1 \n\t" \
  291. "mulul %%d2, %%d3:%%d1 \n\t" \
  292. "addxl %%d4, %%d1 \n\t" \
  293. "addxl %%d0, %%d3 \n\t" \
  294. "addl %%d1, %%a3@+ \n\t" \
  295. "movel %%a2@+, %%d1 \n\t" \
  296. "mulul %%d2, %%d4:%%d1 \n\t" \
  297. "addxl %%d3, %%d1 \n\t" \
  298. "addxl %%d0, %%d4 \n\t" \
  299. "addl %%d1, %%a3@+ \n\t" \
  300. "movel %%a2@+, %%d1 \n\t" \
  301. "mulul %%d2, %%d3:%%d1 \n\t" \
  302. "addxl %%d4, %%d1 \n\t" \
  303. "addxl %%d0, %%d3 \n\t" \
  304. "addl %%d1, %%a3@+ \n\t" \
  305. "addxl %%d0, %%d3 \n\t"
  306. #define MULADDC_X8_STOP MULADDC_X1_STOP
  307. #endif /* MC68000 */
  308. #if defined(__powerpc64__) || defined(__ppc64__)
  309. #if defined(__MACH__) && defined(__APPLE__)
  310. #define MULADDC_X1_INIT \
  311. asm( \
  312. "ld r3, %3 \n\t" \
  313. "ld r4, %4 \n\t" \
  314. "ld r5, %5 \n\t" \
  315. "ld r6, %6 \n\t" \
  316. "addi r3, r3, -8 \n\t" \
  317. "addi r4, r4, -8 \n\t" \
  318. "addic r5, r5, 0 \n\t"
  319. #define MULADDC_X1_CORE \
  320. "ldu r7, 8(r3) \n\t" \
  321. "mulld r8, r7, r6 \n\t" \
  322. "mulhdu r9, r7, r6 \n\t" \
  323. "adde r8, r8, r5 \n\t" \
  324. "ld r7, 8(r4) \n\t" \
  325. "addze r5, r9 \n\t" \
  326. "addc r8, r8, r7 \n\t" \
  327. "stdu r8, 8(r4) \n\t"
  328. #define MULADDC_X1_STOP \
  329. "addze r5, r5 \n\t" \
  330. "addi r4, r4, 8 \n\t" \
  331. "addi r3, r3, 8 \n\t" \
  332. "std r5, %0 \n\t" \
  333. "std r4, %1 \n\t" \
  334. "std r3, %2 \n\t" \
  335. : "=m" (c), "=m" (d), "=m" (s) \
  336. : "m" (s), "m" (d), "m" (c), "m" (b) \
  337. : "r3", "r4", "r5", "r6", "r7", "r8", "r9" \
  338. );
  339. #else /* __MACH__ && __APPLE__ */
  340. #define MULADDC_X1_INIT \
  341. asm( \
  342. "ld %%r3, %3 \n\t" \
  343. "ld %%r4, %4 \n\t" \
  344. "ld %%r5, %5 \n\t" \
  345. "ld %%r6, %6 \n\t" \
  346. "addi %%r3, %%r3, -8 \n\t" \
  347. "addi %%r4, %%r4, -8 \n\t" \
  348. "addic %%r5, %%r5, 0 \n\t"
  349. #define MULADDC_X1_CORE \
  350. "ldu %%r7, 8(%%r3) \n\t" \
  351. "mulld %%r8, %%r7, %%r6 \n\t" \
  352. "mulhdu %%r9, %%r7, %%r6 \n\t" \
  353. "adde %%r8, %%r8, %%r5 \n\t" \
  354. "ld %%r7, 8(%%r4) \n\t" \
  355. "addze %%r5, %%r9 \n\t" \
  356. "addc %%r8, %%r8, %%r7 \n\t" \
  357. "stdu %%r8, 8(%%r4) \n\t"
  358. #define MULADDC_X1_STOP \
  359. "addze %%r5, %%r5 \n\t" \
  360. "addi %%r4, %%r4, 8 \n\t" \
  361. "addi %%r3, %%r3, 8 \n\t" \
  362. "std %%r5, %0 \n\t" \
  363. "std %%r4, %1 \n\t" \
  364. "std %%r3, %2 \n\t" \
  365. : "=m" (c), "=m" (d), "=m" (s) \
  366. : "m" (s), "m" (d), "m" (c), "m" (b) \
  367. : "r3", "r4", "r5", "r6", "r7", "r8", "r9" \
  368. );
  369. #endif /* __MACH__ && __APPLE__ */
  370. #elif defined(__powerpc__) || defined(__ppc__) /* end PPC64/begin PPC32 */
  371. #if defined(__MACH__) && defined(__APPLE__)
  372. #define MULADDC_X1_INIT \
  373. asm( \
  374. "lwz r3, %3 \n\t" \
  375. "lwz r4, %4 \n\t" \
  376. "lwz r5, %5 \n\t" \
  377. "lwz r6, %6 \n\t" \
  378. "addi r3, r3, -4 \n\t" \
  379. "addi r4, r4, -4 \n\t" \
  380. "addic r5, r5, 0 \n\t"
  381. #define MULADDC_X1_CORE \
  382. "lwzu r7, 4(r3) \n\t" \
  383. "mullw r8, r7, r6 \n\t" \
  384. "mulhwu r9, r7, r6 \n\t" \
  385. "adde r8, r8, r5 \n\t" \
  386. "lwz r7, 4(r4) \n\t" \
  387. "addze r5, r9 \n\t" \
  388. "addc r8, r8, r7 \n\t" \
  389. "stwu r8, 4(r4) \n\t"
  390. #define MULADDC_X1_STOP \
  391. "addze r5, r5 \n\t" \
  392. "addi r4, r4, 4 \n\t" \
  393. "addi r3, r3, 4 \n\t" \
  394. "stw r5, %0 \n\t" \
  395. "stw r4, %1 \n\t" \
  396. "stw r3, %2 \n\t" \
  397. : "=m" (c), "=m" (d), "=m" (s) \
  398. : "m" (s), "m" (d), "m" (c), "m" (b) \
  399. : "r3", "r4", "r5", "r6", "r7", "r8", "r9" \
  400. );
  401. #else /* __MACH__ && __APPLE__ */
  402. #define MULADDC_X1_INIT \
  403. asm( \
  404. "lwz %%r3, %3 \n\t" \
  405. "lwz %%r4, %4 \n\t" \
  406. "lwz %%r5, %5 \n\t" \
  407. "lwz %%r6, %6 \n\t" \
  408. "addi %%r3, %%r3, -4 \n\t" \
  409. "addi %%r4, %%r4, -4 \n\t" \
  410. "addic %%r5, %%r5, 0 \n\t"
  411. #define MULADDC_X1_CORE \
  412. "lwzu %%r7, 4(%%r3) \n\t" \
  413. "mullw %%r8, %%r7, %%r6 \n\t" \
  414. "mulhwu %%r9, %%r7, %%r6 \n\t" \
  415. "adde %%r8, %%r8, %%r5 \n\t" \
  416. "lwz %%r7, 4(%%r4) \n\t" \
  417. "addze %%r5, %%r9 \n\t" \
  418. "addc %%r8, %%r8, %%r7 \n\t" \
  419. "stwu %%r8, 4(%%r4) \n\t"
  420. #define MULADDC_X1_STOP \
  421. "addze %%r5, %%r5 \n\t" \
  422. "addi %%r4, %%r4, 4 \n\t" \
  423. "addi %%r3, %%r3, 4 \n\t" \
  424. "stw %%r5, %0 \n\t" \
  425. "stw %%r4, %1 \n\t" \
  426. "stw %%r3, %2 \n\t" \
  427. : "=m" (c), "=m" (d), "=m" (s) \
  428. : "m" (s), "m" (d), "m" (c), "m" (b) \
  429. : "r3", "r4", "r5", "r6", "r7", "r8", "r9" \
  430. );
  431. #endif /* __MACH__ && __APPLE__ */
  432. #endif /* PPC32 */
  433. /*
  434. * The Sparc(64) assembly is reported to be broken.
  435. * Disable it for now, until we're able to fix it.
  436. */
  437. #if 0 && defined(__sparc__)
  438. #if defined(__sparc64__)
  439. #define MULADDC_X1_INIT \
  440. asm( \
  441. "ldx %3, %%o0 \n\t" \
  442. "ldx %4, %%o1 \n\t" \
  443. "ld %5, %%o2 \n\t" \
  444. "ld %6, %%o3 \n\t"
  445. #define MULADDC_X1_CORE \
  446. "ld [%%o0], %%o4 \n\t" \
  447. "inc 4, %%o0 \n\t" \
  448. "ld [%%o1], %%o5 \n\t" \
  449. "umul %%o3, %%o4, %%o4 \n\t" \
  450. "addcc %%o4, %%o2, %%o4 \n\t" \
  451. "rd %%y, %%g1 \n\t" \
  452. "addx %%g1, 0, %%g1 \n\t" \
  453. "addcc %%o4, %%o5, %%o4 \n\t" \
  454. "st %%o4, [%%o1] \n\t" \
  455. "addx %%g1, 0, %%o2 \n\t" \
  456. "inc 4, %%o1 \n\t"
  457. #define MULADDC_X1_STOP \
  458. "st %%o2, %0 \n\t" \
  459. "stx %%o1, %1 \n\t" \
  460. "stx %%o0, %2 \n\t" \
  461. : "=m" (c), "=m" (d), "=m" (s) \
  462. : "m" (s), "m" (d), "m" (c), "m" (b) \
  463. : "g1", "o0", "o1", "o2", "o3", "o4", \
  464. "o5" \
  465. );
  466. #else /* __sparc64__ */
  467. #define MULADDC_X1_INIT \
  468. asm( \
  469. "ld %3, %%o0 \n\t" \
  470. "ld %4, %%o1 \n\t" \
  471. "ld %5, %%o2 \n\t" \
  472. "ld %6, %%o3 \n\t"
  473. #define MULADDC_X1_CORE \
  474. "ld [%%o0], %%o4 \n\t" \
  475. "inc 4, %%o0 \n\t" \
  476. "ld [%%o1], %%o5 \n\t" \
  477. "umul %%o3, %%o4, %%o4 \n\t" \
  478. "addcc %%o4, %%o2, %%o4 \n\t" \
  479. "rd %%y, %%g1 \n\t" \
  480. "addx %%g1, 0, %%g1 \n\t" \
  481. "addcc %%o4, %%o5, %%o4 \n\t" \
  482. "st %%o4, [%%o1] \n\t" \
  483. "addx %%g1, 0, %%o2 \n\t" \
  484. "inc 4, %%o1 \n\t"
  485. #define MULADDC_X1_STOP \
  486. "st %%o2, %0 \n\t" \
  487. "st %%o1, %1 \n\t" \
  488. "st %%o0, %2 \n\t" \
  489. : "=m" (c), "=m" (d), "=m" (s) \
  490. : "m" (s), "m" (d), "m" (c), "m" (b) \
  491. : "g1", "o0", "o1", "o2", "o3", "o4", \
  492. "o5" \
  493. );
  494. #endif /* __sparc64__ */
  495. #endif /* __sparc__ */
  496. #if defined(__microblaze__) || defined(microblaze)
  497. #define MULADDC_X1_INIT \
  498. asm( \
  499. "lwi r3, %3 \n\t" \
  500. "lwi r4, %4 \n\t" \
  501. "lwi r5, %5 \n\t" \
  502. "lwi r6, %6 \n\t" \
  503. "andi r7, r6, 0xffff \n\t" \
  504. "bsrli r6, r6, 16 \n\t"
  505. #if(__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)
  506. #define MULADDC_LHUI \
  507. "lhui r9, r3, 0 \n\t" \
  508. "addi r3, r3, 2 \n\t" \
  509. "lhui r8, r3, 0 \n\t"
  510. #else
  511. #define MULADDC_LHUI \
  512. "lhui r8, r3, 0 \n\t" \
  513. "addi r3, r3, 2 \n\t" \
  514. "lhui r9, r3, 0 \n\t"
  515. #endif
  516. #define MULADDC_X1_CORE \
  517. MULADDC_LHUI \
  518. "addi r3, r3, 2 \n\t" \
  519. "mul r10, r9, r6 \n\t" \
  520. "mul r11, r8, r7 \n\t" \
  521. "mul r12, r9, r7 \n\t" \
  522. "mul r13, r8, r6 \n\t" \
  523. "bsrli r8, r10, 16 \n\t" \
  524. "bsrli r9, r11, 16 \n\t" \
  525. "add r13, r13, r8 \n\t" \
  526. "add r13, r13, r9 \n\t" \
  527. "bslli r10, r10, 16 \n\t" \
  528. "bslli r11, r11, 16 \n\t" \
  529. "add r12, r12, r10 \n\t" \
  530. "addc r13, r13, r0 \n\t" \
  531. "add r12, r12, r11 \n\t" \
  532. "addc r13, r13, r0 \n\t" \
  533. "lwi r10, r4, 0 \n\t" \
  534. "add r12, r12, r10 \n\t" \
  535. "addc r13, r13, r0 \n\t" \
  536. "add r12, r12, r5 \n\t" \
  537. "addc r5, r13, r0 \n\t" \
  538. "swi r12, r4, 0 \n\t" \
  539. "addi r4, r4, 4 \n\t"
  540. #define MULADDC_X1_STOP \
  541. "swi r5, %0 \n\t" \
  542. "swi r4, %1 \n\t" \
  543. "swi r3, %2 \n\t" \
  544. : "=m" (c), "=m" (d), "=m" (s) \
  545. : "m" (s), "m" (d), "m" (c), "m" (b) \
  546. : "r3", "r4", "r5", "r6", "r7", "r8", \
  547. "r9", "r10", "r11", "r12", "r13" \
  548. );
  549. #endif /* MicroBlaze */
  550. #if defined(__tricore__)
  551. #define MULADDC_X1_INIT \
  552. asm( \
  553. "ld.a %%a2, %3 \n\t" \
  554. "ld.a %%a3, %4 \n\t" \
  555. "ld.w %%d4, %5 \n\t" \
  556. "ld.w %%d1, %6 \n\t" \
  557. "xor %%d5, %%d5 \n\t"
  558. #define MULADDC_X1_CORE \
  559. "ld.w %%d0, [%%a2+] \n\t" \
  560. "madd.u %%e2, %%e4, %%d0, %%d1 \n\t" \
  561. "ld.w %%d0, [%%a3] \n\t" \
  562. "addx %%d2, %%d2, %%d0 \n\t" \
  563. "addc %%d3, %%d3, 0 \n\t" \
  564. "mov %%d4, %%d3 \n\t" \
  565. "st.w [%%a3+], %%d2 \n\t"
  566. #define MULADDC_X1_STOP \
  567. "st.w %0, %%d4 \n\t" \
  568. "st.a %1, %%a3 \n\t" \
  569. "st.a %2, %%a2 \n\t" \
  570. : "=m" (c), "=m" (d), "=m" (s) \
  571. : "m" (s), "m" (d), "m" (c), "m" (b) \
  572. : "d0", "d1", "e2", "d4", "a2", "a3" \
  573. );
  574. #endif /* TriCore */
  575. #if defined(__arm__)
  576. #if defined(__thumb__) && !defined(__thumb2__)
  577. #if defined(MBEDTLS_COMPILER_IS_GCC)
  578. /*
  579. * Thumb 1 ISA. This code path has only been tested successfully on gcc;
  580. * it does not compile on clang or armclang.
  581. */
  582. #if !defined(__OPTIMIZE__) && defined(__GNUC__)
  583. /*
  584. * Note, gcc -O0 by default uses r7 for the frame pointer, so it complains about
  585. * our use of r7 below, unless -fomit-frame-pointer is passed.
  586. *
  587. * On the other hand, -fomit-frame-pointer is implied by any -Ox options with
  588. * x !=0, which we can detect using __OPTIMIZE__ (which is also defined by
  589. * clang and armcc5 under the same conditions).
  590. *
  591. * If gcc needs to use r7, we use r1 as a scratch register and have a few extra
  592. * instructions to preserve/restore it; otherwise, we can use r7 and avoid
  593. * the preserve/restore overhead.
  594. */
  595. #define MULADDC_SCRATCH "RS .req r1 \n\t"
  596. #define MULADDC_PRESERVE_SCRATCH "mov r10, r1 \n\t"
  597. #define MULADDC_RESTORE_SCRATCH "mov r1, r10 \n\t"
  598. #define MULADDC_SCRATCH_CLOBBER "r10"
  599. #else /* !defined(__OPTIMIZE__) && defined(__GNUC__) */
  600. #define MULADDC_SCRATCH "RS .req r7 \n\t"
  601. #define MULADDC_PRESERVE_SCRATCH ""
  602. #define MULADDC_RESTORE_SCRATCH ""
  603. #define MULADDC_SCRATCH_CLOBBER "r7"
  604. #endif /* !defined(__OPTIMIZE__) && defined(__GNUC__) */
  605. #define MULADDC_X1_INIT \
  606. asm( \
  607. MULADDC_SCRATCH \
  608. "ldr r0, %3 \n\t" \
  609. "ldr r1, %4 \n\t" \
  610. "ldr r2, %5 \n\t" \
  611. "ldr r3, %6 \n\t" \
  612. "lsr r4, r3, #16 \n\t" \
  613. "mov r9, r4 \n\t" \
  614. "lsl r4, r3, #16 \n\t" \
  615. "lsr r4, r4, #16 \n\t" \
  616. "mov r8, r4 \n\t" \
  617. #define MULADDC_X1_CORE \
  618. MULADDC_PRESERVE_SCRATCH \
  619. "ldmia r0!, {r6} \n\t" \
  620. "lsr RS, r6, #16 \n\t" \
  621. "lsl r6, r6, #16 \n\t" \
  622. "lsr r6, r6, #16 \n\t" \
  623. "mov r4, r8 \n\t" \
  624. "mul r4, r6 \n\t" \
  625. "mov r3, r9 \n\t" \
  626. "mul r6, r3 \n\t" \
  627. "mov r5, r9 \n\t" \
  628. "mul r5, RS \n\t" \
  629. "mov r3, r8 \n\t" \
  630. "mul RS, r3 \n\t" \
  631. "lsr r3, r6, #16 \n\t" \
  632. "add r5, r5, r3 \n\t" \
  633. "lsr r3, RS, #16 \n\t" \
  634. "add r5, r5, r3 \n\t" \
  635. "add r4, r4, r2 \n\t" \
  636. "mov r2, #0 \n\t" \
  637. "adc r5, r2 \n\t" \
  638. "lsl r3, r6, #16 \n\t" \
  639. "add r4, r4, r3 \n\t" \
  640. "adc r5, r2 \n\t" \
  641. "lsl r3, RS, #16 \n\t" \
  642. "add r4, r4, r3 \n\t" \
  643. "adc r5, r2 \n\t" \
  644. MULADDC_RESTORE_SCRATCH \
  645. "ldr r3, [r1] \n\t" \
  646. "add r4, r4, r3 \n\t" \
  647. "adc r2, r5 \n\t" \
  648. "stmia r1!, {r4} \n\t"
  649. #define MULADDC_X1_STOP \
  650. "str r2, %0 \n\t" \
  651. "str r1, %1 \n\t" \
  652. "str r0, %2 \n\t" \
  653. : "=m" (c), "=m" (d), "=m" (s) \
  654. : "m" (s), "m" (d), "m" (c), "m" (b) \
  655. : "r0", "r1", "r2", "r3", "r4", "r5", \
  656. "r6", MULADDC_SCRATCH_CLOBBER, "r8", "r9", "cc" \
  657. );
  658. #endif /* !defined(__ARMCC_VERSION) && !defined(__clang__) */
  659. #elif (__ARM_ARCH >= 6) && \
  660. defined (__ARM_FEATURE_DSP) && (__ARM_FEATURE_DSP == 1)
  661. /* Armv6-M (or later) with DSP Instruction Set Extensions.
  662. * Requires support for either Thumb 2 or Arm ISA.
  663. */
  664. #define MULADDC_X1_INIT \
  665. { \
  666. mbedtls_mpi_uint tmp_a, tmp_b; \
  667. asm volatile (
  668. #define MULADDC_X1_CORE \
  669. ".p2align 2 \n\t" \
  670. "ldr %[a], [%[in]], #4 \n\t" \
  671. "ldr %[b], [%[acc]] \n\t" \
  672. "umaal %[b], %[carry], %[scalar], %[a] \n\t" \
  673. "str %[b], [%[acc]], #4 \n\t"
  674. #define MULADDC_X1_STOP \
  675. : [a] "=&r" (tmp_a), \
  676. [b] "=&r" (tmp_b), \
  677. [in] "+r" (s), \
  678. [acc] "+r" (d), \
  679. [carry] "+l" (c) \
  680. : [scalar] "r" (b) \
  681. : "memory" \
  682. ); \
  683. }
  684. #define MULADDC_X2_INIT \
  685. { \
  686. mbedtls_mpi_uint tmp_a0, tmp_b0; \
  687. mbedtls_mpi_uint tmp_a1, tmp_b1; \
  688. asm volatile (
  689. /* - Make sure loop is 4-byte aligned to avoid stalls
  690. * upon repeated non-word aligned instructions in
  691. * some microarchitectures.
  692. * - Don't use ldm with post-increment or back-to-back
  693. * loads with post-increment and same address register
  694. * to avoid stalls on some microarchitectures.
  695. * - Bunch loads and stores to reduce latency on some
  696. * microarchitectures. E.g., on Cortex-M4, the first
  697. * in a series of load/store operations has latency
  698. * 2 cycles, while subsequent loads/stores are single-cycle. */
  699. #define MULADDC_X2_CORE \
  700. ".p2align 2 \n\t" \
  701. "ldr %[a0], [%[in]], #+8 \n\t" \
  702. "ldr %[b0], [%[acc]], #+8 \n\t" \
  703. "ldr %[a1], [%[in], #-4] \n\t" \
  704. "ldr %[b1], [%[acc], #-4] \n\t" \
  705. "umaal %[b0], %[carry], %[scalar], %[a0] \n\t" \
  706. "umaal %[b1], %[carry], %[scalar], %[a1] \n\t" \
  707. "str %[b0], [%[acc], #-8] \n\t" \
  708. "str %[b1], [%[acc], #-4] \n\t"
  709. #define MULADDC_X2_STOP \
  710. : [a0] "=&r" (tmp_a0), \
  711. [b0] "=&r" (tmp_b0), \
  712. [a1] "=&r" (tmp_a1), \
  713. [b1] "=&r" (tmp_b1), \
  714. [in] "+r" (s), \
  715. [acc] "+r" (d), \
  716. [carry] "+l" (c) \
  717. : [scalar] "r" (b) \
  718. : "memory" \
  719. ); \
  720. }
  721. #else /* Thumb 2 or Arm ISA, without DSP extensions */
  722. #define MULADDC_X1_INIT \
  723. asm( \
  724. "ldr r0, %3 \n\t" \
  725. "ldr r1, %4 \n\t" \
  726. "ldr r2, %5 \n\t" \
  727. "ldr r3, %6 \n\t"
  728. #define MULADDC_X1_CORE \
  729. "ldr r4, [r0], #4 \n\t" \
  730. "mov r5, #0 \n\t" \
  731. "ldr r6, [r1] \n\t" \
  732. "umlal r2, r5, r3, r4 \n\t" \
  733. "adds r4, r6, r2 \n\t" \
  734. "adc r2, r5, #0 \n\t" \
  735. "str r4, [r1], #4 \n\t"
  736. #define MULADDC_X1_STOP \
  737. "str r2, %0 \n\t" \
  738. "str r1, %1 \n\t" \
  739. "str r0, %2 \n\t" \
  740. : "=m" (c), "=m" (d), "=m" (s) \
  741. : "m" (s), "m" (d), "m" (c), "m" (b) \
  742. : "r0", "r1", "r2", "r3", "r4", "r5", \
  743. "r6", "cc" \
  744. );
  745. #endif /* ISA codepath selection */
  746. #endif /* defined(__arm__) */
  747. #if defined(__alpha__)
  748. #define MULADDC_X1_INIT \
  749. asm( \
  750. "ldq $1, %3 \n\t" \
  751. "ldq $2, %4 \n\t" \
  752. "ldq $3, %5 \n\t" \
  753. "ldq $4, %6 \n\t"
  754. #define MULADDC_X1_CORE \
  755. "ldq $6, 0($1) \n\t" \
  756. "addq $1, 8, $1 \n\t" \
  757. "mulq $6, $4, $7 \n\t" \
  758. "umulh $6, $4, $6 \n\t" \
  759. "addq $7, $3, $7 \n\t" \
  760. "cmpult $7, $3, $3 \n\t" \
  761. "ldq $5, 0($2) \n\t" \
  762. "addq $7, $5, $7 \n\t" \
  763. "cmpult $7, $5, $5 \n\t" \
  764. "stq $7, 0($2) \n\t" \
  765. "addq $2, 8, $2 \n\t" \
  766. "addq $6, $3, $3 \n\t" \
  767. "addq $5, $3, $3 \n\t"
  768. #define MULADDC_X1_STOP \
  769. "stq $3, %0 \n\t" \
  770. "stq $2, %1 \n\t" \
  771. "stq $1, %2 \n\t" \
  772. : "=m" (c), "=m" (d), "=m" (s) \
  773. : "m" (s), "m" (d), "m" (c), "m" (b) \
  774. : "$1", "$2", "$3", "$4", "$5", "$6", "$7" \
  775. );
  776. #endif /* Alpha */
  777. #if defined(__mips__) && !defined(__mips64)
  778. #define MULADDC_X1_INIT \
  779. asm( \
  780. "lw $10, %3 \n\t" \
  781. "lw $11, %4 \n\t" \
  782. "lw $12, %5 \n\t" \
  783. "lw $13, %6 \n\t"
  784. #define MULADDC_X1_CORE \
  785. "lw $14, 0($10) \n\t" \
  786. "multu $13, $14 \n\t" \
  787. "addi $10, $10, 4 \n\t" \
  788. "mflo $14 \n\t" \
  789. "mfhi $9 \n\t" \
  790. "addu $14, $12, $14 \n\t" \
  791. "lw $15, 0($11) \n\t" \
  792. "sltu $12, $14, $12 \n\t" \
  793. "addu $15, $14, $15 \n\t" \
  794. "sltu $14, $15, $14 \n\t" \
  795. "addu $12, $12, $9 \n\t" \
  796. "sw $15, 0($11) \n\t" \
  797. "addu $12, $12, $14 \n\t" \
  798. "addi $11, $11, 4 \n\t"
  799. #define MULADDC_X1_STOP \
  800. "sw $12, %0 \n\t" \
  801. "sw $11, %1 \n\t" \
  802. "sw $10, %2 \n\t" \
  803. : "=m" (c), "=m" (d), "=m" (s) \
  804. : "m" (s), "m" (d), "m" (c), "m" (b) \
  805. : "$9", "$10", "$11", "$12", "$13", "$14", "$15", "lo", "hi" \
  806. );
  807. #endif /* MIPS */
#endif /* GNUC */
  809. #if (defined(_MSC_VER) && defined(_M_IX86)) || defined(__WATCOMC__)
  810. #define MULADDC_X1_INIT \
  811. __asm mov esi, s \
  812. __asm mov edi, d \
  813. __asm mov ecx, c \
  814. __asm mov ebx, b
  815. #define MULADDC_X1_CORE \
  816. __asm lodsd \
  817. __asm mul ebx \
  818. __asm add eax, ecx \
  819. __asm adc edx, 0 \
  820. __asm add eax, [edi] \
  821. __asm adc edx, 0 \
  822. __asm mov ecx, edx \
  823. __asm stosd
  824. #define MULADDC_X1_STOP \
  825. __asm mov c, ecx \
  826. __asm mov d, edi \
  827. __asm mov s, esi
  828. #if defined(MBEDTLS_HAVE_SSE2)
  829. #define EMIT __asm _emit
  830. #define MULADDC_X8_INIT MULADDC_X1_INIT
  831. #define MULADDC_X8_CORE \
  832. EMIT 0x0F EMIT 0x6E EMIT 0xC9 \
  833. EMIT 0x0F EMIT 0x6E EMIT 0xC3 \
  834. EMIT 0x0F EMIT 0x6E EMIT 0x1F \
  835. EMIT 0x0F EMIT 0xD4 EMIT 0xCB \
  836. EMIT 0x0F EMIT 0x6E EMIT 0x16 \
  837. EMIT 0x0F EMIT 0xF4 EMIT 0xD0 \
  838. EMIT 0x0F EMIT 0x6E EMIT 0x66 EMIT 0x04 \
  839. EMIT 0x0F EMIT 0xF4 EMIT 0xE0 \
  840. EMIT 0x0F EMIT 0x6E EMIT 0x76 EMIT 0x08 \
  841. EMIT 0x0F EMIT 0xF4 EMIT 0xF0 \
  842. EMIT 0x0F EMIT 0x6E EMIT 0x7E EMIT 0x0C \
  843. EMIT 0x0F EMIT 0xF4 EMIT 0xF8 \
  844. EMIT 0x0F EMIT 0xD4 EMIT 0xCA \
  845. EMIT 0x0F EMIT 0x6E EMIT 0x5F EMIT 0x04 \
  846. EMIT 0x0F EMIT 0xD4 EMIT 0xDC \
  847. EMIT 0x0F EMIT 0x6E EMIT 0x6F EMIT 0x08 \
  848. EMIT 0x0F EMIT 0xD4 EMIT 0xEE \
  849. EMIT 0x0F EMIT 0x6E EMIT 0x67 EMIT 0x0C \
  850. EMIT 0x0F EMIT 0xD4 EMIT 0xFC \
  851. EMIT 0x0F EMIT 0x7E EMIT 0x0F \
  852. EMIT 0x0F EMIT 0x6E EMIT 0x56 EMIT 0x10 \
  853. EMIT 0x0F EMIT 0xF4 EMIT 0xD0 \
  854. EMIT 0x0F EMIT 0x73 EMIT 0xD1 EMIT 0x20 \
  855. EMIT 0x0F EMIT 0x6E EMIT 0x66 EMIT 0x14 \
  856. EMIT 0x0F EMIT 0xF4 EMIT 0xE0 \
  857. EMIT 0x0F EMIT 0xD4 EMIT 0xCB \
  858. EMIT 0x0F EMIT 0x6E EMIT 0x76 EMIT 0x18 \
  859. EMIT 0x0F EMIT 0xF4 EMIT 0xF0 \
  860. EMIT 0x0F EMIT 0x7E EMIT 0x4F EMIT 0x04 \
  861. EMIT 0x0F EMIT 0x73 EMIT 0xD1 EMIT 0x20 \
  862. EMIT 0x0F EMIT 0x6E EMIT 0x5E EMIT 0x1C \
  863. EMIT 0x0F EMIT 0xF4 EMIT 0xD8 \
  864. EMIT 0x0F EMIT 0xD4 EMIT 0xCD \
  865. EMIT 0x0F EMIT 0x6E EMIT 0x6F EMIT 0x10 \
  866. EMIT 0x0F EMIT 0xD4 EMIT 0xD5 \
  867. EMIT 0x0F EMIT 0x7E EMIT 0x4F EMIT 0x08 \
  868. EMIT 0x0F EMIT 0x73 EMIT 0xD1 EMIT 0x20 \
  869. EMIT 0x0F EMIT 0xD4 EMIT 0xCF \
  870. EMIT 0x0F EMIT 0x6E EMIT 0x6F EMIT 0x14 \
  871. EMIT 0x0F EMIT 0xD4 EMIT 0xE5 \
  872. EMIT 0x0F EMIT 0x7E EMIT 0x4F EMIT 0x0C \
  873. EMIT 0x0F EMIT 0x73 EMIT 0xD1 EMIT 0x20 \
  874. EMIT 0x0F EMIT 0xD4 EMIT 0xCA \
  875. EMIT 0x0F EMIT 0x6E EMIT 0x6F EMIT 0x18 \
  876. EMIT 0x0F EMIT 0xD4 EMIT 0xF5 \
  877. EMIT 0x0F EMIT 0x7E EMIT 0x4F EMIT 0x10 \
  878. EMIT 0x0F EMIT 0x73 EMIT 0xD1 EMIT 0x20 \
  879. EMIT 0x0F EMIT 0xD4 EMIT 0xCC \
  880. EMIT 0x0F EMIT 0x6E EMIT 0x6F EMIT 0x1C \
  881. EMIT 0x0F EMIT 0xD4 EMIT 0xDD \
  882. EMIT 0x0F EMIT 0x7E EMIT 0x4F EMIT 0x14 \
  883. EMIT 0x0F EMIT 0x73 EMIT 0xD1 EMIT 0x20 \
  884. EMIT 0x0F EMIT 0xD4 EMIT 0xCE \
  885. EMIT 0x0F EMIT 0x7E EMIT 0x4F EMIT 0x18 \
  886. EMIT 0x0F EMIT 0x73 EMIT 0xD1 EMIT 0x20 \
  887. EMIT 0x0F EMIT 0xD4 EMIT 0xCB \
  888. EMIT 0x0F EMIT 0x7E EMIT 0x4F EMIT 0x1C \
  889. EMIT 0x83 EMIT 0xC7 EMIT 0x20 \
  890. EMIT 0x83 EMIT 0xC6 EMIT 0x20 \
  891. EMIT 0x0F EMIT 0x73 EMIT 0xD1 EMIT 0x20 \
  892. EMIT 0x0F EMIT 0x7E EMIT 0xC9
  893. #define MULADDC_X8_STOP \
  894. EMIT 0x0F EMIT 0x77 \
  895. __asm mov c, ecx \
  896. __asm mov d, edi \
  897. __asm mov s, esi
  898. #endif /* SSE2 */
  899. #endif /* MSVC */
#endif /* MBEDTLS_HAVE_ASM */
  901. #if !defined(MULADDC_X1_CORE)
  902. #if defined(MBEDTLS_HAVE_UDBL)
  903. #define MULADDC_X1_INIT \
  904. { \
  905. mbedtls_t_udbl r; \
  906. mbedtls_mpi_uint r0, r1;
  907. #define MULADDC_X1_CORE \
  908. r = *(s++) * (mbedtls_t_udbl) b; \
  909. r0 = (mbedtls_mpi_uint) r; \
  910. r1 = (mbedtls_mpi_uint)( r >> biL ); \
  911. r0 += c; r1 += (r0 < c); \
  912. r0 += *d; r1 += (r0 < *d); \
  913. c = r1; *(d++) = r0;
  914. #define MULADDC_X1_STOP \
  915. }
  916. #else /* MBEDTLS_HAVE_UDBL */
  917. #define MULADDC_X1_INIT \
  918. { \
  919. mbedtls_mpi_uint s0, s1, b0, b1; \
  920. mbedtls_mpi_uint r0, r1, rx, ry; \
  921. b0 = ( b << biH ) >> biH; \
  922. b1 = ( b >> biH );
  923. #define MULADDC_X1_CORE \
  924. s0 = ( *s << biH ) >> biH; \
  925. s1 = ( *s >> biH ); s++; \
  926. rx = s0 * b1; r0 = s0 * b0; \
  927. ry = s1 * b0; r1 = s1 * b1; \
  928. r1 += ( rx >> biH ); \
  929. r1 += ( ry >> biH ); \
  930. rx <<= biH; ry <<= biH; \
  931. r0 += rx; r1 += (r0 < rx); \
  932. r0 += ry; r1 += (r0 < ry); \
  933. r0 += c; r1 += (r0 < c); \
  934. r0 += *d; r1 += (r0 < *d); \
  935. c = r1; *(d++) = r0;
  936. #define MULADDC_X1_STOP \
  937. }
  938. #endif /* C (longlong) */
  939. #endif /* C (generic) */
  940. #if !defined(MULADDC_X2_CORE)
  941. #define MULADDC_X2_INIT MULADDC_X1_INIT
  942. #define MULADDC_X2_STOP MULADDC_X1_STOP
  943. #define MULADDC_X2_CORE MULADDC_X1_CORE MULADDC_X1_CORE
  944. #endif /* MULADDC_X2_CORE */
  945. #if !defined(MULADDC_X4_CORE)
  946. #define MULADDC_X4_INIT MULADDC_X2_INIT
  947. #define MULADDC_X4_STOP MULADDC_X2_STOP
  948. #define MULADDC_X4_CORE MULADDC_X2_CORE MULADDC_X2_CORE
  949. #endif /* MULADDC_X4_CORE */
  950. #if !defined(MULADDC_X8_CORE)
  951. #define MULADDC_X8_INIT MULADDC_X4_INIT
  952. #define MULADDC_X8_STOP MULADDC_X4_STOP
  953. #define MULADDC_X8_CORE MULADDC_X4_CORE MULADDC_X4_CORE
  954. #endif /* MULADDC_X8_CORE */
/* *INDENT-ON* */
#endif /* bn_mul.h */