x86_64-mont5-nasm.asm 63 KB


  1. ; This file is generated from a similarly-named Perl script in the BoringSSL
  2. ; source tree. Do not edit by hand.
  3. default rel
  4. %define XMMWORD
  5. %define YMMWORD
  6. %define ZMMWORD
  7. section .text code align=64
  8. EXTERN GFp_ia32cap_P
  ; ---------------------------------------------------------------------
  ; GFp_bn_mul_mont_gather5 -- exported entry point (Win64 calling sequence).
  ;
  ; Incoming Win64 arguments rcx, rdx, r8, r9, [40+rsp], [48+rsp] are moved
  ; into rdi, rsi, rdx, rcx, r8, r9; a 32-bit value at [56+rsp] is loaded
  ; into xmm5.  Roles as used by the code below:
  ;   rdi  = output limb array (written by $L$sub and $L$copy)
  ;   rsi  = limb array "a" (read as QWORD[i*8+rsi])
  ;   rdx  = table base; one 64-bit word is extracted from each 256-byte
  ;          block of the table under control of the index in xmm5
  ;   rcx  = limb array "n" (multiplied by rbp and subtracted in $L$sub)
  ;          NOTE(review): presumably the modulus -- confirm against caller
  ;   r8   = pointer to one qword (dereferenced once into r8)
  ;          NOTE(review): presumably the Montgomery n0 constant -- confirm
  ;   r9   = limb count (32-bit, zero-extended below)
  ; Returns rax = 1.
  ;
  ; A count with any of its low three bits set is handled by the scalar
  ; code in this routine; otherwise control transfers to $L$mul4x_enter
  ; (inside bn_mul4x_mont_gather5) with r11d = DWORD[8+GFp_ia32cap_P].
  ;
  ; "tp" in the comments below means the scratch limb array at [rsp].
  ; ---------------------------------------------------------------------
  9. global GFp_bn_mul_mont_gather5
  10. ALIGN 64
  11. GFp_bn_mul_mont_gather5:
  ; Stash rdi/rsi (non-volatile on Win64) in the caller's shadow space.
  12. mov QWORD[8+rsp],rdi ;WIN64 prologue
  13. mov QWORD[16+rsp],rsi
  14. mov rax,rsp
  15. $L$SEH_begin_GFp_bn_mul_mont_gather5:
  ; Remap Win64 argument registers to the registers the body uses.
  16. mov rdi,rcx
  17. mov rsi,rdx
  18. mov rdx,r8
  19. mov rcx,r9
  20. mov r8,QWORD[40+rsp]
  21. mov r9,QWORD[48+rsp]
  ; Writing r9d zero-extends the limb count into the full r9.
  22. mov r9d,r9d
  ; rax keeps the entry rsp; it is stored in the frame once the frame exists.
  23. mov rax,rsp
  ; Limb count not a multiple of 8 -> scalar path; otherwise 4x path.
  24. test r9d,7
  25. jnz NEAR $L$mul_enter
  26. lea r11,[GFp_ia32cap_P]
  27. mov r11d,DWORD[8+r11]
  28. jmp NEAR $L$mul4x_enter
  29. ALIGN 16
  30. $L$mul_enter:
  ; xmm5 = 32-bit table index (7th argument), read before rsp moves.
  31. movd xmm5,DWORD[56+rsp]
  32. push rbx
  33. push rbp
  34. push r12
  35. push r13
  36. push r14
  37. push r15
  ; Frame: r10 = (rsp - 280 - 8*r9) & -1024.  The 280 bytes cover
  ; 24 bytes of bookkeeping plus the 16 x 16-byte mask table built below.
  38. neg r9
  39. mov r11,rsp
  40. lea r10,[((-280))+r9*8+rsp]
  41. neg r9
  42. and r10,-1024
  ; Move rsp to r10 plus a whole number of 4096-byte pages, then step down
  ; one page at a time, reading a qword from each page, until rsp == r10.
  43. sub r11,r10
  44. and r11,-4096
  45. lea rsp,[r11*1+r10]
  46. mov r11,QWORD[rsp]
  47. cmp rsp,r10
  48. ja NEAR $L$mul_page_walk
  49. jmp NEAR $L$mul_page_walk_done
  50. $L$mul_page_walk:
  51. lea rsp,[((-4096))+rsp]
  52. mov r11,QWORD[rsp]
  53. cmp rsp,r10
  54. ja NEAR $L$mul_page_walk
  55. $L$mul_page_walk_done:
  ; r10 = address of $L$inc (defined outside this view); the entry rsp is
  ; saved just above tp[r9] so the epilogue can find it.
  56. lea r10,[$L$inc]
  57. mov QWORD[8+r9*8+rsp],rax
  58. $L$mul_body:
  ; r12 = table base + 128 so that rows are addressed as -128..+112.
  59. lea r12,[128+rdx]
  60. movdqa xmm0,XMMWORD[r10]
  61. movdqa xmm1,XMMWORD[16+r10]
  ; r10 = 16-byte-aligned base such that the masks land at [112+r10..352+r10].
  62. lea r10,[((24-112))+r9*8+rsp]
  63. and r10,-16
  ; Broadcast the index to all four dword lanes of xmm5.
  64. pshufd xmm5,xmm5,0
  ; Build 16 masks: each step adds the increment in xmm4 to a running
  ; counter vector and compares it for equality with xmm5 (pcmpeqd), then
  ; stores the resulting mask to the stack.
  65. movdqa xmm4,xmm1
  66. movdqa xmm2,xmm1
  67. paddd xmm1,xmm0
  68. pcmpeqd xmm0,xmm5
  69. DB 0x67
  70. movdqa xmm3,xmm4
  71. paddd xmm2,xmm1
  72. pcmpeqd xmm1,xmm5
  73. movdqa XMMWORD[112+r10],xmm0
  74. movdqa xmm0,xmm4
  75. paddd xmm3,xmm2
  76. pcmpeqd xmm2,xmm5
  77. movdqa XMMWORD[128+r10],xmm1
  78. movdqa xmm1,xmm4
  79. paddd xmm0,xmm3
  80. pcmpeqd xmm3,xmm5
  81. movdqa XMMWORD[144+r10],xmm2
  82. movdqa xmm2,xmm4
  83. paddd xmm1,xmm0
  84. pcmpeqd xmm0,xmm5
  85. movdqa XMMWORD[160+r10],xmm3
  86. movdqa xmm3,xmm4
  87. paddd xmm2,xmm1
  88. pcmpeqd xmm1,xmm5
  89. movdqa XMMWORD[176+r10],xmm0
  90. movdqa xmm0,xmm4
  91. paddd xmm3,xmm2
  92. pcmpeqd xmm2,xmm5
  93. movdqa XMMWORD[192+r10],xmm1
  94. movdqa xmm1,xmm4
  95. paddd xmm0,xmm3
  96. pcmpeqd xmm3,xmm5
  97. movdqa XMMWORD[208+r10],xmm2
  98. movdqa xmm2,xmm4
  99. paddd xmm1,xmm0
  100. pcmpeqd xmm0,xmm5
  101. movdqa XMMWORD[224+r10],xmm3
  102. movdqa xmm3,xmm4
  103. paddd xmm2,xmm1
  104. pcmpeqd xmm1,xmm5
  105. movdqa XMMWORD[240+r10],xmm0
  106. movdqa xmm0,xmm4
  107. paddd xmm3,xmm2
  108. pcmpeqd xmm2,xmm5
  109. movdqa XMMWORD[256+r10],xmm1
  110. movdqa xmm1,xmm4
  111. paddd xmm0,xmm3
  112. pcmpeqd xmm3,xmm5
  113. movdqa XMMWORD[272+r10],xmm2
  114. movdqa xmm2,xmm4
  115. paddd xmm1,xmm0
  116. pcmpeqd xmm0,xmm5
  117. movdqa XMMWORD[288+r10],xmm3
  118. movdqa xmm3,xmm4
  119. paddd xmm2,xmm1
  120. pcmpeqd xmm1,xmm5
  121. movdqa XMMWORD[304+r10],xmm0
  122. paddd xmm3,xmm2
  123. DB 0x67
  124. pcmpeqd xmm2,xmm5
  125. movdqa XMMWORD[320+r10],xmm1
  126. pcmpeqd xmm3,xmm5
  127. movdqa XMMWORD[336+r10],xmm2
  ; Gather the first table word: the last four masks are still live in
  ; xmm0..xmm3 and are applied to rows +64..+112 directly; the other twelve
  ; rows are ANDed with the masks just stored.  All 16 rows are read
  ; regardless of the index value.
  128. pand xmm0,XMMWORD[64+r12]
  129. pand xmm1,XMMWORD[80+r12]
  130. pand xmm2,XMMWORD[96+r12]
  131. movdqa XMMWORD[352+r10],xmm3
  132. pand xmm3,XMMWORD[112+r12]
  133. por xmm0,xmm2
  134. por xmm1,xmm3
  135. movdqa xmm4,XMMWORD[((-128))+r12]
  136. movdqa xmm5,XMMWORD[((-112))+r12]
  137. movdqa xmm2,XMMWORD[((-96))+r12]
  138. pand xmm4,XMMWORD[112+r10]
  139. movdqa xmm3,XMMWORD[((-80))+r12]
  140. pand xmm5,XMMWORD[128+r10]
  141. por xmm0,xmm4
  142. pand xmm2,XMMWORD[144+r10]
  143. por xmm1,xmm5
  144. pand xmm3,XMMWORD[160+r10]
  145. por xmm0,xmm2
  146. por xmm1,xmm3
  147. movdqa xmm4,XMMWORD[((-64))+r12]
  148. movdqa xmm5,XMMWORD[((-48))+r12]
  149. movdqa xmm2,XMMWORD[((-32))+r12]
  150. pand xmm4,XMMWORD[176+r10]
  151. movdqa xmm3,XMMWORD[((-16))+r12]
  152. pand xmm5,XMMWORD[192+r10]
  153. por xmm0,xmm4
  154. pand xmm2,XMMWORD[208+r10]
  155. por xmm1,xmm5
  156. pand xmm3,XMMWORD[224+r10]
  157. por xmm0,xmm2
  158. por xmm1,xmm3
  159. movdqa xmm4,XMMWORD[r12]
  160. movdqa xmm5,XMMWORD[16+r12]
  161. movdqa xmm2,XMMWORD[32+r12]
  162. pand xmm4,XMMWORD[240+r10]
  163. movdqa xmm3,XMMWORD[48+r12]
  164. pand xmm5,XMMWORD[256+r10]
  165. por xmm0,xmm4
  166. pand xmm2,XMMWORD[272+r10]
  167. por xmm1,xmm5
  168. pand xmm3,XMMWORD[288+r10]
  169. por xmm0,xmm2
  170. por xmm1,xmm3
  ; Combine the two accumulators, then OR the high qword onto the low one
  ; (pshufd 0x4e swaps the two 64-bit halves).
  171. por xmm0,xmm1
  172. pshufd xmm1,xmm0,0x4e
  173. por xmm0,xmm1
  ; Advance to the next 256-byte table block.
  174. lea r12,[256+r12]
  ; Encoded bytes 66 48 0F 7E C3 = movq rbx,xmm0 (rbx = gathered word "b").
  175. DB 102,72,15,126,195
  ; r8 = the qword that the fifth argument points to.
  176. mov r8,QWORD[r8]
  177. mov rax,QWORD[rsi]
  ; r14 = outer index, r15 = inner index.
  178. xor r14,r14
  179. xor r15,r15
  ; First pass: rbp = r8 * low64(a[0]*b); then accumulate a[j]*b and
  ; n[j]*rbp limb by limb into tp.
  180. mov rbp,r8
  181. mul rbx
  182. mov r10,rax
  183. mov rax,QWORD[rcx]
  184. imul rbp,r10
  185. mov r11,rdx
  186. mul rbp
  187. add r10,rax
  188. mov rax,QWORD[8+rsi]
  189. adc rdx,0
  190. mov r13,rdx
  191. lea r15,[1+r15]
  192. jmp NEAR $L$1st_enter
  193. ALIGN 16
  194. $L$1st:
  ; Finish limb j-1: add the n[]*rbp product and the a[]*b carry, store to
  ; tp[j-2] (note the -16 displacement), keep the new carry in r13.
  195. add r13,rax
  196. mov rax,QWORD[r15*8+rsi]
  197. adc rdx,0
  198. add r13,r11
  199. mov r11,r10
  200. adc rdx,0
  201. mov QWORD[((-16))+r15*8+rsp],r13
  202. mov r13,rdx
  203. $L$1st_enter:
  ; rdx:rax = a[j]*b, then rdx:rax = n[j]*rbp; loop until j == r9.
  204. mul rbx
  205. add r11,rax
  206. mov rax,QWORD[r15*8+rcx]
  207. adc rdx,0
  208. lea r15,[1+r15]
  209. mov r10,rdx
  210. mul rbp
  211. cmp r15,r9
  212. jne NEAR $L$1st
  ; Tail of the first pass: store tp[r9-2], tp[r9-1] and the carry-out
  ; word tp[r9].
  213. add r13,rax
  214. adc rdx,0
  215. add r13,r11
  216. adc rdx,0
  217. mov QWORD[((-16))+r9*8+rsp],r13
  218. mov r13,rdx
  219. mov r11,r10
  220. xor rdx,rdx
  221. add r13,r11
  222. adc rdx,0
  223. mov QWORD[((-8))+r9*8+rsp],r13
  224. mov QWORD[r9*8+rsp],rdx
  225. lea r14,[1+r14]
  226. jmp NEAR $L$outer
  227. ALIGN 16
  228. $L$outer:
  ; rdx-128 is the same aligned address as 112+r10 used when the masks
  ; were stored, so [-128+rdx .. 112+rdx] re-reads the 16 masks.
  229. lea rdx,[((24+128))+r9*8+rsp]
  230. and rdx,-16
  ; Gather the next table word: AND each of the 16 rows with its mask and
  ; OR everything into xmm4/xmm5.
  231. pxor xmm4,xmm4
  232. pxor xmm5,xmm5
  233. movdqa xmm0,XMMWORD[((-128))+r12]
  234. movdqa xmm1,XMMWORD[((-112))+r12]
  235. movdqa xmm2,XMMWORD[((-96))+r12]
  236. movdqa xmm3,XMMWORD[((-80))+r12]
  237. pand xmm0,XMMWORD[((-128))+rdx]
  238. pand xmm1,XMMWORD[((-112))+rdx]
  239. por xmm4,xmm0
  240. pand xmm2,XMMWORD[((-96))+rdx]
  241. por xmm5,xmm1
  242. pand xmm3,XMMWORD[((-80))+rdx]
  243. por xmm4,xmm2
  244. por xmm5,xmm3
  245. movdqa xmm0,XMMWORD[((-64))+r12]
  246. movdqa xmm1,XMMWORD[((-48))+r12]
  247. movdqa xmm2,XMMWORD[((-32))+r12]
  248. movdqa xmm3,XMMWORD[((-16))+r12]
  249. pand xmm0,XMMWORD[((-64))+rdx]
  250. pand xmm1,XMMWORD[((-48))+rdx]
  251. por xmm4,xmm0
  252. pand xmm2,XMMWORD[((-32))+rdx]
  253. por xmm5,xmm1
  254. pand xmm3,XMMWORD[((-16))+rdx]
  255. por xmm4,xmm2
  256. por xmm5,xmm3
  257. movdqa xmm0,XMMWORD[r12]
  258. movdqa xmm1,XMMWORD[16+r12]
  259. movdqa xmm2,XMMWORD[32+r12]
  260. movdqa xmm3,XMMWORD[48+r12]
  261. pand xmm0,XMMWORD[rdx]
  262. pand xmm1,XMMWORD[16+rdx]
  263. por xmm4,xmm0
  264. pand xmm2,XMMWORD[32+rdx]
  265. por xmm5,xmm1
  266. pand xmm3,XMMWORD[48+rdx]
  267. por xmm4,xmm2
  268. por xmm5,xmm3
  269. movdqa xmm0,XMMWORD[64+r12]
  270. movdqa xmm1,XMMWORD[80+r12]
  271. movdqa xmm2,XMMWORD[96+r12]
  272. movdqa xmm3,XMMWORD[112+r12]
  273. pand xmm0,XMMWORD[64+rdx]
  274. pand xmm1,XMMWORD[80+rdx]
  275. por xmm4,xmm0
  276. pand xmm2,XMMWORD[96+rdx]
  277. por xmm5,xmm1
  278. pand xmm3,XMMWORD[112+rdx]
  279. por xmm4,xmm2
  280. por xmm5,xmm3
  281. por xmm4,xmm5
  282. pshufd xmm0,xmm4,0x4e
  283. por xmm0,xmm4
  284. lea r12,[256+r12]
  285. mov rax,QWORD[rsi]
  ; Encoded bytes 66 48 0F 7E C3 = movq rbx,xmm0 (next gathered word).
  286. DB 102,72,15,126,195
  ; Outer pass i: tp += a*b; rbp = r8 * low64(tp[0] + a[0]*b); tp += n*rbp,
  ; with the result stored one limb lower (the -16 displacements).
  287. xor r15,r15
  288. mov rbp,r8
  289. mov r10,QWORD[rsp]
  290. mul rbx
  291. add r10,rax
  292. mov rax,QWORD[rcx]
  293. adc rdx,0
  294. imul rbp,r10
  295. mov r11,rdx
  296. mul rbp
  297. add r10,rax
  298. mov rax,QWORD[8+rsi]
  299. adc rdx,0
  300. mov r10,QWORD[8+rsp]
  301. mov r13,rdx
  302. lea r15,[1+r15]
  303. jmp NEAR $L$inner_enter
  304. ALIGN 16
  305. $L$inner:
  306. add r13,rax
  307. mov rax,QWORD[r15*8+rsi]
  308. adc rdx,0
  309. add r13,r10
  310. mov r10,QWORD[r15*8+rsp]
  311. adc rdx,0
  312. mov QWORD[((-16))+r15*8+rsp],r13
  313. mov r13,rdx
  314. $L$inner_enter:
  315. mul rbx
  316. add r11,rax
  317. mov rax,QWORD[r15*8+rcx]
  318. adc rdx,0
  319. add r10,r11
  320. mov r11,rdx
  321. adc r11,0
  322. lea r15,[1+r15]
  323. mul rbp
  324. cmp r15,r9
  325. jne NEAR $L$inner
  ; Tail of the pass: fold in the previous carry word tp[r9] and write the
  ; new tp[r9-2], tp[r9-1] and carry word tp[r9].
  326. add r13,rax
  327. adc rdx,0
  328. add r13,r10
  329. mov r10,QWORD[r9*8+rsp]
  330. adc rdx,0
  331. mov QWORD[((-16))+r9*8+rsp],r13
  332. mov r13,rdx
  333. xor rdx,rdx
  334. add r13,r11
  335. adc rdx,0
  336. add r13,r10
  337. adc rdx,0
  338. mov QWORD[((-8))+r9*8+rsp],r13
  339. mov QWORD[r9*8+rsp],rdx
  ; Repeat for r9 gathered words in total (r14 counts 1..r9-1 here).
  340. lea r14,[1+r14]
  341. cmp r14,r9
  342. jb NEAR $L$outer
  ; Final subtraction: rdi[i] = tp[i] - n[i] with borrow.  The xor clears
  ; CF before the first sbb; mov, lea and dec leave CF untouched, so the
  ; borrow carries from one iteration to the next.
  343. xor r14,r14
  344. mov rax,QWORD[rsp]
  345. lea rsi,[rsp]
  346. mov r15,r9
  347. jmp NEAR $L$sub
  348. ALIGN 16
  349. $L$sub: sbb rax,QWORD[r14*8+rcx]
  350. mov QWORD[r14*8+rdi],rax
  351. mov rax,QWORD[8+r14*8+rsi]
  352. lea r14,[1+r14]
  353. dec r15
  354. jnz NEAR $L$sub
  ; rax = tp[r9] - borrow, rbx = ~rax.  These are used as complementary
  ; select masks below (NOTE(review): assumes rax is 0 or all-ones here).
  355. sbb rax,0
  356. mov rbx,-1
  357. xor rbx,rax
  358. xor r14,r14
  359. mov r15,r9
  360. $L$copy:
  ; rdi[i] = (rdi[i] & rbx) | (tp[i] & rax), with no data-dependent branch.
  ; tp[i] is overwritten with the loop index after it has been read.
  361. mov rcx,QWORD[r14*8+rdi]
  362. mov rdx,QWORD[r14*8+rsp]
  363. and rcx,rbx
  364. and rdx,rax
  365. mov QWORD[r14*8+rsp],r14
  366. or rdx,rcx
  367. mov QWORD[r14*8+rdi],rdx
  368. lea r14,[1+r14]
  369. sub r15,1
  370. jnz NEAR $L$copy
  ; rsi = entry rsp saved at $L$mul_page_walk_done; return value 1; reload
  ; the six pushed registers relative to it and restore rsp.
  371. mov rsi,QWORD[8+r9*8+rsp]
  372. mov rax,1
  373. mov r15,QWORD[((-48))+rsi]
  374. mov r14,QWORD[((-40))+rsi]
  375. mov r13,QWORD[((-32))+rsi]
  376. mov r12,QWORD[((-24))+rsi]
  377. mov rbp,QWORD[((-16))+rsi]
  378. mov rbx,QWORD[((-8))+rsi]
  379. lea rsp,[rsi]
  380. $L$mul_epilogue:
  ; Restore rdi/rsi from the shadow-space slots written in the prologue.
  381. mov rdi,QWORD[8+rsp] ;WIN64 epilogue
  382. mov rsi,QWORD[16+rsp]
  383. DB 0F3h,0C3h ;repret
  384. $L$SEH_end_GFp_bn_mul_mont_gather5:
  ; ---------------------------------------------------------------------
  ; bn_mul4x_mont_gather5 -- not declared global in the visible part of the
  ; file.  Sets up a stack frame and calls mul4x_internal, then returns
  ; rax = 1.
  ;
  ; $L$mul4x_enter is also reached by a jump from GFp_bn_mul_mont_gather5
  ; when the limb count is a multiple of 8.  At that label the code expects:
  ;   rdi, rsi, rdx, rcx, r8, r9 = remapped arguments (r9 = limb count)
  ;   rax  = rsp as it was at function entry
  ;   r11d = DWORD[8+GFp_ia32cap_P]
  ; NOTE(review): the prologue at the function label itself does not load
  ; r11d before falling into $L$mul4x_enter -- presumably this label is
  ; never called directly; confirm.
  ; ---------------------------------------------------------------------
  385. ALIGN 32
  386. bn_mul4x_mont_gather5:
  ; Stash rdi/rsi (non-volatile on Win64) in the caller's shadow space.
  387. mov QWORD[8+rsp],rdi ;WIN64 prologue
  388. mov QWORD[16+rsp],rsi
  389. mov rax,rsp
  390. $L$SEH_begin_bn_mul4x_mont_gather5:
  ; Remap Win64 argument registers to the registers the body uses.
  391. mov rdi,rcx
  392. mov rsi,rdx
  393. mov rdx,r8
  394. mov rcx,r9
  395. mov r8,QWORD[40+rsp]
  396. mov r9,QWORD[48+rsp]
  397. DB 0x67
  398. mov rax,rsp
  399. $L$mul4x_enter:
  ; If all bits of 0x80108 are set in r11d, use the code at $L$mulx4x_enter
  ; (defined outside this view).
  ; NOTE(review): presumably the BMI1/BMI2/ADX feature bits -- confirm
  ; against the GFp_ia32cap_P layout.
  400. and r11d,0x80108
  401. cmp r11d,0x80108
  402. je NEAR $L$mulx4x_enter
  403. push rbx
  404. push rbp
  405. push r12
  406. push r13
  407. push r14
  408. push r15
  409. $L$mul4x_prologue:
  ; r9 = limb count * 8 (byte length); r10 = 3 * byte length; r9 = -r9.
  410. DB 0x67
  411. shl r9d,3
  412. lea r10,[r9*2+r9]
  413. neg r9
  ; r11 = (rsp - 320 - 2*bytes - rdi) mod 4096: offset of the tentative
  ; frame from the output pointer within a 4096-byte page.  The frame
  ; position is adjusted by one of the two branches below depending on how
  ; that offset compares with 3*bytes.
  ; NOTE(review): presumably this controls how the frame aliases the rdi
  ; buffer modulo 4096 -- confirm against the generator script.
  414. lea r11,[((-320))+r9*2+rsp]
  415. mov rbp,rsp
  416. sub r11,rdi
  417. and r11,4095
  418. cmp r10,r11
  419. jb NEAR $L$mul4xsp_alt
  420. sub rbp,r11
  421. lea rbp,[((-320))+r9*2+rbp]
  422. jmp NEAR $L$mul4xsp_done
  423. ALIGN 32
  424. $L$mul4xsp_alt:
  425. lea r10,[((4096-320))+r9*2]
  426. lea rbp,[((-320))+r9*2+rbp]
  427. sub r11,r10
  ; mov (not xor) is used to zero r10 so that CF from the sub above is
  ; still intact for cmovc: r11 = max(r11 - r10, 0).
  428. mov r10,0
  429. cmovc r11,r10
  430. sub rbp,r11
  431. $L$mul4xsp_done:
  ; Align the new frame base to 64 bytes.
  432. and rbp,-64
  ; Move rsp to rbp plus a whole number of 4096-byte pages, then step down
  ; one page at a time, reading a qword from each page, until rsp == rbp.
  433. mov r11,rsp
  434. sub r11,rbp
  435. and r11,-4096
  436. lea rsp,[rbp*1+r11]
  437. mov r10,QWORD[rsp]
  438. cmp rsp,rbp
  439. ja NEAR $L$mul4x_page_walk
  440. jmp NEAR $L$mul4x_page_walk_done
  441. $L$mul4x_page_walk:
  442. lea rsp,[((-4096))+rsp]
  443. mov r10,QWORD[rsp]
  444. cmp rsp,rbp
  445. ja NEAR $L$mul4x_page_walk
  446. $L$mul4x_page_walk_done:
  ; r9 back to the positive byte length; save the entry rsp in the frame.
  447. neg r9
  448. mov QWORD[40+rsp],rax
  449. $L$mul4x_body:
  ; rax still holds the entry rsp; mul4x_internal reads the table index
  ; from [56+rax].
  450. call mul4x_internal
  ; rsi = saved entry rsp; return value 1; reload the six pushed registers
  ; relative to it and restore rsp.
  451. mov rsi,QWORD[40+rsp]
  452. mov rax,1
  453. mov r15,QWORD[((-48))+rsi]
  454. mov r14,QWORD[((-40))+rsi]
  455. mov r13,QWORD[((-32))+rsi]
  456. mov r12,QWORD[((-24))+rsi]
  457. mov rbp,QWORD[((-16))+rsi]
  458. mov rbx,QWORD[((-8))+rsi]
  459. lea rsp,[rsi]
  460. $L$mul4x_epilogue:
  ; Restore rdi/rsi from the shadow-space slots written in the prologue.
  461. mov rdi,QWORD[8+rsp] ;WIN64 epilogue
  462. mov rsi,QWORD[16+rsp]
  463. DB 0F3h,0C3h ;repret
  464. $L$SEH_end_bn_mul4x_mont_gather5:
  ; ---------------------------------------------------------------------
  ; mul4x_internal -- internal helper, entered by `call` from
  ; bn_mul4x_mont_gather5 and GFp_bn_power5.
  ;
  ; Register contract as used by the code below:
  ;   rax = address such that DWORD[56+rax] is the 32-bit table index
  ;         (both visible callers pass their saved entry rsp)
  ;   rdx = table base (256-byte blocks, one word gathered from each)
  ;   rsi = limb array "a"
  ;   rcx = limb array "n"
  ;   r8  = pointer to one qword (dereferenced once into r8)
  ;   r9  = length in bytes (limb count * 8)
  ;   rdi = output pointer; only saved to the frame here and reloaded just
  ;         before the tail jump
  ; Frame slots are written with an extra +8 (e.g. (16+8)+rsp), matching
  ; the return address that `call` pushed.
  ; Does not return directly: it ends with a jump to $L$sqr4x_sub_entry,
  ; which is defined outside this view.
  ;
  ; "tp" in the comments below means the scratch limb array that starts at
  ; [(64+8)+rsp].
  ; ---------------------------------------------------------------------
  465. ALIGN 32
  466. mul4x_internal:
  ; r13 = rdx + 128 + 32*r9: end value for the table cursor r12, checked at
  ; the bottom of the outer loop.  r9 is shifted back afterwards.
  467. shl r9,5
  468. movd xmm5,DWORD[56+rax]
  469. lea rax,[$L$inc]
  470. lea r13,[128+r9*1+rdx]
  471. shr r9,5
  472. movdqa xmm0,XMMWORD[rax]
  473. movdqa xmm1,XMMWORD[16+rax]
  ; Masks are stored at [112+r10 .. 352+r10], i.e. starting at rsp+r9+88.
  474. lea r10,[((88-112))+r9*1+rsp]
  ; r12 = table base + 128 so that rows are addressed as -128..+112.
  475. lea r12,[128+rdx]
  ; Broadcast the index to all four dword lanes, then build 16 equality
  ; masks from a running counter seeded by $L$inc (defined outside this
  ; view) and store them to the stack.
  476. pshufd xmm5,xmm5,0
  477. movdqa xmm4,xmm1
  478. DB 0x67,0x67
  479. movdqa xmm2,xmm1
  480. paddd xmm1,xmm0
  481. pcmpeqd xmm0,xmm5
  482. DB 0x67
  483. movdqa xmm3,xmm4
  484. paddd xmm2,xmm1
  485. pcmpeqd xmm1,xmm5
  486. movdqa XMMWORD[112+r10],xmm0
  487. movdqa xmm0,xmm4
  488. paddd xmm3,xmm2
  489. pcmpeqd xmm2,xmm5
  490. movdqa XMMWORD[128+r10],xmm1
  491. movdqa xmm1,xmm4
  492. paddd xmm0,xmm3
  493. pcmpeqd xmm3,xmm5
  494. movdqa XMMWORD[144+r10],xmm2
  495. movdqa xmm2,xmm4
  496. paddd xmm1,xmm0
  497. pcmpeqd xmm0,xmm5
  498. movdqa XMMWORD[160+r10],xmm3
  499. movdqa xmm3,xmm4
  500. paddd xmm2,xmm1
  501. pcmpeqd xmm1,xmm5
  502. movdqa XMMWORD[176+r10],xmm0
  503. movdqa xmm0,xmm4
  504. paddd xmm3,xmm2
  505. pcmpeqd xmm2,xmm5
  506. movdqa XMMWORD[192+r10],xmm1
  507. movdqa xmm1,xmm4
  508. paddd xmm0,xmm3
  509. pcmpeqd xmm3,xmm5
  510. movdqa XMMWORD[208+r10],xmm2
  511. movdqa xmm2,xmm4
  512. paddd xmm1,xmm0
  513. pcmpeqd xmm0,xmm5
  514. movdqa XMMWORD[224+r10],xmm3
  515. movdqa xmm3,xmm4
  516. paddd xmm2,xmm1
  517. pcmpeqd xmm1,xmm5
  518. movdqa XMMWORD[240+r10],xmm0
  519. movdqa xmm0,xmm4
  520. paddd xmm3,xmm2
  521. pcmpeqd xmm2,xmm5
  522. movdqa XMMWORD[256+r10],xmm1
  523. movdqa xmm1,xmm4
  524. paddd xmm0,xmm3
  525. pcmpeqd xmm3,xmm5
  526. movdqa XMMWORD[272+r10],xmm2
  527. movdqa xmm2,xmm4
  528. paddd xmm1,xmm0
  529. pcmpeqd xmm0,xmm5
  530. movdqa XMMWORD[288+r10],xmm3
  531. movdqa xmm3,xmm4
  532. paddd xmm2,xmm1
  533. pcmpeqd xmm1,xmm5
  534. movdqa XMMWORD[304+r10],xmm0
  535. paddd xmm3,xmm2
  536. DB 0x67
  537. pcmpeqd xmm2,xmm5
  538. movdqa XMMWORD[320+r10],xmm1
  539. pcmpeqd xmm3,xmm5
  540. movdqa XMMWORD[336+r10],xmm2
  ; Gather the first table word.  The last four masks are still live in
  ; xmm0..xmm3; the other twelve rows use the masks just stored.  All 16
  ; rows are read regardless of the index value.
  541. pand xmm0,XMMWORD[64+r12]
  542. pand xmm1,XMMWORD[80+r12]
  543. pand xmm2,XMMWORD[96+r12]
  544. movdqa XMMWORD[352+r10],xmm3
  545. pand xmm3,XMMWORD[112+r12]
  546. por xmm0,xmm2
  547. por xmm1,xmm3
  548. movdqa xmm4,XMMWORD[((-128))+r12]
  549. movdqa xmm5,XMMWORD[((-112))+r12]
  550. movdqa xmm2,XMMWORD[((-96))+r12]
  551. pand xmm4,XMMWORD[112+r10]
  552. movdqa xmm3,XMMWORD[((-80))+r12]
  553. pand xmm5,XMMWORD[128+r10]
  554. por xmm0,xmm4
  555. pand xmm2,XMMWORD[144+r10]
  556. por xmm1,xmm5
  557. pand xmm3,XMMWORD[160+r10]
  558. por xmm0,xmm2
  559. por xmm1,xmm3
  560. movdqa xmm4,XMMWORD[((-64))+r12]
  561. movdqa xmm5,XMMWORD[((-48))+r12]
  562. movdqa xmm2,XMMWORD[((-32))+r12]
  563. pand xmm4,XMMWORD[176+r10]
  564. movdqa xmm3,XMMWORD[((-16))+r12]
  565. pand xmm5,XMMWORD[192+r10]
  566. por xmm0,xmm4
  567. pand xmm2,XMMWORD[208+r10]
  568. por xmm1,xmm5
  569. pand xmm3,XMMWORD[224+r10]
  570. por xmm0,xmm2
  571. por xmm1,xmm3
  572. movdqa xmm4,XMMWORD[r12]
  573. movdqa xmm5,XMMWORD[16+r12]
  574. movdqa xmm2,XMMWORD[32+r12]
  575. pand xmm4,XMMWORD[240+r10]
  576. movdqa xmm3,XMMWORD[48+r12]
  577. pand xmm5,XMMWORD[256+r10]
  578. por xmm0,xmm4
  579. pand xmm2,XMMWORD[272+r10]
  580. por xmm1,xmm5
  581. pand xmm3,XMMWORD[288+r10]
  582. por xmm0,xmm2
  583. por xmm1,xmm3
  ; Combine the accumulators and OR the high qword onto the low one
  ; (pshufd 0x4e swaps the two 64-bit halves).
  584. por xmm0,xmm1
  585. pshufd xmm1,xmm0,0x4e
  586. por xmm0,xmm1
  587. lea r12,[256+r12]
  ; Encoded bytes 66 48 0F 7E C3 = movq rbx,xmm0 (rbx = gathered word "b").
  588. DB 102,72,15,126,195
  ; Save the table end marker and the output pointer in the frame.
  589. mov QWORD[((16+8))+rsp],r13
  590. mov QWORD[((56+8))+rsp],rdi
  ; r8 = the qword r8 pointed to.  rsi is advanced to the end of a[] and r9
  ; negated, so a[] is addressed with negative offsets from here on.
  591. mov r8,QWORD[r8]
  592. mov rax,QWORD[rsi]
  593. lea rsi,[r9*1+rsi]
  594. neg r9
  ; First pass: rbp = r8 * low64(a[0]*b); r14 = tp cursor.
  595. mov rbp,r8
  596. mul rbx
  597. mov r10,rax
  598. mov rax,QWORD[rcx]
  599. imul rbp,r10
  600. lea r14,[((64+8))+rsp]
  601. mov r11,rdx
  602. mul rbp
  603. add r10,rax
  604. mov rax,QWORD[8+r9*1+rsi]
  605. adc rdx,0
  606. mov rdi,rdx
  607. mul rbx
  608. add r11,rax
  609. mov rax,QWORD[8+rcx]
  610. adc rdx,0
  611. mov r10,rdx
  612. mul rbp
  613. add rdi,rax
  614. mov rax,QWORD[16+r9*1+rsi]
  615. adc rdx,0
  616. add rdi,r11
  ; r15 = 32 - bytes: byte offset from the end of a[], stepped by 32 (four
  ; limbs) per loop iteration until it reaches zero.
  617. lea r15,[32+r9]
  618. lea rcx,[32+rcx]
  619. adc rdx,0
  620. mov QWORD[r14],rdi
  621. mov r13,rdx
  622. jmp NEAR $L$1st4x
  623. ALIGN 32
  624. $L$1st4x:
  ; Four limbs per iteration: alternate a[j]*b (mul rbx) and n[j]*rbp
  ; (mul rbp), carrying through r10/r11 and r13/rdi, storing into tp.
  625. mul rbx
  626. add r10,rax
  627. mov rax,QWORD[((-16))+rcx]
  628. lea r14,[32+r14]
  629. adc rdx,0
  630. mov r11,rdx
  631. mul rbp
  632. add r13,rax
  633. mov rax,QWORD[((-8))+r15*1+rsi]
  634. adc rdx,0
  635. add r13,r10
  636. adc rdx,0
  637. mov QWORD[((-24))+r14],r13
  638. mov rdi,rdx
  639. mul rbx
  640. add r11,rax
  641. mov rax,QWORD[((-8))+rcx]
  642. adc rdx,0
  643. mov r10,rdx
  644. mul rbp
  645. add rdi,rax
  646. mov rax,QWORD[r15*1+rsi]
  647. adc rdx,0
  648. add rdi,r11
  649. adc rdx,0
  650. mov QWORD[((-16))+r14],rdi
  651. mov r13,rdx
  652. mul rbx
  653. add r10,rax
  654. mov rax,QWORD[rcx]
  655. adc rdx,0
  656. mov r11,rdx
  657. mul rbp
  658. add r13,rax
  659. mov rax,QWORD[8+r15*1+rsi]
  660. adc rdx,0
  661. add r13,r10
  662. adc rdx,0
  663. mov QWORD[((-8))+r14],r13
  664. mov rdi,rdx
  665. mul rbx
  666. add r11,rax
  667. mov rax,QWORD[8+rcx]
  668. adc rdx,0
  669. mov r10,rdx
  670. mul rbp
  671. add rdi,rax
  672. mov rax,QWORD[16+r15*1+rsi]
  673. adc rdx,0
  674. add rdi,r11
  675. lea rcx,[32+rcx]
  676. adc rdx,0
  677. mov QWORD[r14],rdi
  678. mov r13,rdx
  679. add r15,32
  680. jnz NEAR $L$1st4x
  ; Tail of the first pass: the last two limbs, then reload a[0] into rax
  ; for the next pass.
  681. mul rbx
  682. add r10,rax
  683. mov rax,QWORD[((-16))+rcx]
  684. lea r14,[32+r14]
  685. adc rdx,0
  686. mov r11,rdx
  687. mul rbp
  688. add r13,rax
  689. mov rax,QWORD[((-8))+rsi]
  690. adc rdx,0
  691. add r13,r10
  692. adc rdx,0
  693. mov QWORD[((-24))+r14],r13
  694. mov rdi,rdx
  695. mul rbx
  696. add r11,rax
  697. mov rax,QWORD[((-8))+rcx]
  698. adc rdx,0
  699. mov r10,rdx
  700. mul rbp
  701. add rdi,rax
  702. mov rax,QWORD[r9*1+rsi]
  703. adc rdx,0
  704. add rdi,r11
  705. adc rdx,0
  706. mov QWORD[((-16))+r14],rdi
  707. mov r13,rdx
  ; Rewind rcx to the start of n[] (r9 is negative here).
  708. lea rcx,[r9*1+rcx]
  ; rdi = carry out of the top limb.
  709. xor rdi,rdi
  710. add r13,r10
  711. adc rdi,0
  712. mov QWORD[((-8))+r14],r13
  713. jmp NEAR $L$outer4x
  714. ALIGN 32
  715. $L$outer4x:
  ; The masks are re-read from [-128+rdx .. 112+rdx], i.e. starting at
  ; r14+16.  Gather the next table word exactly as above.
  716. lea rdx,[((16+128))+r14]
  717. pxor xmm4,xmm4
  718. pxor xmm5,xmm5
  719. movdqa xmm0,XMMWORD[((-128))+r12]
  720. movdqa xmm1,XMMWORD[((-112))+r12]
  721. movdqa xmm2,XMMWORD[((-96))+r12]
  722. movdqa xmm3,XMMWORD[((-80))+r12]
  723. pand xmm0,XMMWORD[((-128))+rdx]
  724. pand xmm1,XMMWORD[((-112))+rdx]
  725. por xmm4,xmm0
  726. pand xmm2,XMMWORD[((-96))+rdx]
  727. por xmm5,xmm1
  728. pand xmm3,XMMWORD[((-80))+rdx]
  729. por xmm4,xmm2
  730. por xmm5,xmm3
  731. movdqa xmm0,XMMWORD[((-64))+r12]
  732. movdqa xmm1,XMMWORD[((-48))+r12]
  733. movdqa xmm2,XMMWORD[((-32))+r12]
  734. movdqa xmm3,XMMWORD[((-16))+r12]
  735. pand xmm0,XMMWORD[((-64))+rdx]
  736. pand xmm1,XMMWORD[((-48))+rdx]
  737. por xmm4,xmm0
  738. pand xmm2,XMMWORD[((-32))+rdx]
  739. por xmm5,xmm1
  740. pand xmm3,XMMWORD[((-16))+rdx]
  741. por xmm4,xmm2
  742. por xmm5,xmm3
  743. movdqa xmm0,XMMWORD[r12]
  744. movdqa xmm1,XMMWORD[16+r12]
  745. movdqa xmm2,XMMWORD[32+r12]
  746. movdqa xmm3,XMMWORD[48+r12]
  747. pand xmm0,XMMWORD[rdx]
  748. pand xmm1,XMMWORD[16+rdx]
  749. por xmm4,xmm0
  750. pand xmm2,XMMWORD[32+rdx]
  751. por xmm5,xmm1
  752. pand xmm3,XMMWORD[48+rdx]
  753. por xmm4,xmm2
  754. por xmm5,xmm3
  755. movdqa xmm0,XMMWORD[64+r12]
  756. movdqa xmm1,XMMWORD[80+r12]
  757. movdqa xmm2,XMMWORD[96+r12]
  758. movdqa xmm3,XMMWORD[112+r12]
  759. pand xmm0,XMMWORD[64+rdx]
  760. pand xmm1,XMMWORD[80+rdx]
  761. por xmm4,xmm0
  762. pand xmm2,XMMWORD[96+rdx]
  763. por xmm5,xmm1
  764. pand xmm3,XMMWORD[112+rdx]
  765. por xmm4,xmm2
  766. por xmm5,xmm3
  767. por xmm4,xmm5
  768. pshufd xmm0,xmm4,0x4e
  769. por xmm0,xmm4
  770. lea r12,[256+r12]
  ; Encoded bytes 66 48 0F 7E C3 = movq rbx,xmm0 (next gathered word).
  771. DB 102,72,15,126,195
  ; Outer pass: r10 = tp[0] + low64(a[0]*b); rbp = r8 * r10; the previous
  ; top carry (rdi) is stored at the old cursor, then r14 is rewound to the
  ; start of tp (r9 is negative).
  772. mov r10,QWORD[r9*1+r14]
  773. mov rbp,r8
  774. mul rbx
  775. add r10,rax
  776. mov rax,QWORD[rcx]
  777. adc rdx,0
  778. imul rbp,r10
  779. mov r11,rdx
  780. mov QWORD[r14],rdi
  781. lea r14,[r9*1+r14]
  782. mul rbp
  783. add r10,rax
  784. mov rax,QWORD[8+r9*1+rsi]
  785. adc rdx,0
  786. mov rdi,rdx
  787. mul rbx
  788. add r11,rax
  789. mov rax,QWORD[8+rcx]
  790. adc rdx,0
  791. add r11,QWORD[8+r14]
  792. adc rdx,0
  793. mov r10,rdx
  794. mul rbp
  795. add rdi,rax
  796. mov rax,QWORD[16+r9*1+rsi]
  797. adc rdx,0
  798. add rdi,r11
  799. lea r15,[32+r9]
  800. lea rcx,[32+rcx]
  801. adc rdx,0
  802. mov r13,rdx
  803. jmp NEAR $L$inner4x
  804. ALIGN 32
  805. $L$inner4x:
  ; Same four-limb step as $L$1st4x, with the existing tp limbs added in
  ; and each result stored one limb lower than where it was read.
  806. mul rbx
  807. add r10,rax
  808. mov rax,QWORD[((-16))+rcx]
  809. adc rdx,0
  810. add r10,QWORD[16+r14]
  811. lea r14,[32+r14]
  812. adc rdx,0
  813. mov r11,rdx
  814. mul rbp
  815. add r13,rax
  816. mov rax,QWORD[((-8))+r15*1+rsi]
  817. adc rdx,0
  818. add r13,r10
  819. adc rdx,0
  820. mov QWORD[((-32))+r14],rdi
  821. mov rdi,rdx
  822. mul rbx
  823. add r11,rax
  824. mov rax,QWORD[((-8))+rcx]
  825. adc rdx,0
  826. add r11,QWORD[((-8))+r14]
  827. adc rdx,0
  828. mov r10,rdx
  829. mul rbp
  830. add rdi,rax
  831. mov rax,QWORD[r15*1+rsi]
  832. adc rdx,0
  833. add rdi,r11
  834. adc rdx,0
  835. mov QWORD[((-24))+r14],r13
  836. mov r13,rdx
  837. mul rbx
  838. add r10,rax
  839. mov rax,QWORD[rcx]
  840. adc rdx,0
  841. add r10,QWORD[r14]
  842. adc rdx,0
  843. mov r11,rdx
  844. mul rbp
  845. add r13,rax
  846. mov rax,QWORD[8+r15*1+rsi]
  847. adc rdx,0
  848. add r13,r10
  849. adc rdx,0
  850. mov QWORD[((-16))+r14],rdi
  851. mov rdi,rdx
  852. mul rbx
  853. add r11,rax
  854. mov rax,QWORD[8+rcx]
  855. adc rdx,0
  856. add r11,QWORD[8+r14]
  857. adc rdx,0
  858. mov r10,rdx
  859. mul rbp
  860. add rdi,rax
  861. mov rax,QWORD[16+r15*1+rsi]
  862. adc rdx,0
  863. add rdi,r11
  864. lea rcx,[32+rcx]
  865. adc rdx,0
  866. mov QWORD[((-8))+r14],r13
  867. mov r13,rdx
  868. add r15,32
  869. jnz NEAR $L$inner4x
  ; Tail of the outer pass (last two limbs).
  870. mul rbx
  871. add r10,rax
  872. mov rax,QWORD[((-16))+rcx]
  873. adc rdx,0
  874. add r10,QWORD[16+r14]
  875. lea r14,[32+r14]
  876. adc rdx,0
  877. mov r11,rdx
  878. mul rbp
  879. add r13,rax
  880. mov rax,QWORD[((-8))+rsi]
  881. adc rdx,0
  882. add r13,r10
  883. adc rdx,0
  884. mov QWORD[((-32))+r14],rdi
  885. mov rdi,rdx
  886. mul rbx
  887. add r11,rax
  ; Operands swapped for the last product: rax = multiplier, and rbp is left
  ; holding the top limb of n[] for the comparison after the loop.
  888. mov rax,rbp
  889. mov rbp,QWORD[((-8))+rcx]
  890. adc rdx,0
  891. add r11,QWORD[((-8))+r14]
  892. adc rdx,0
  893. mov r10,rdx
  894. mul rbp
  895. add rdi,rax
  896. mov rax,QWORD[r9*1+rsi]
  897. adc rdx,0
  898. add rdi,r11
  899. adc rdx,0
  900. mov QWORD[((-24))+r14],r13
  901. mov r13,rdx
  902. mov QWORD[((-16))+r14],rdi
  ; Rewind rcx to the start of n[] (r9 is negative here).
  903. lea rcx,[r9*1+rcx]
  ; r13 = top limb of tp, rdi = carry out of it.
  904. xor rdi,rdi
  905. add r13,r10
  906. adc rdi,0
  907. add r13,QWORD[r14]
  908. adc rdi,0
  909. mov QWORD[((-8))+r14],r13
  ; Loop while the table cursor is below the end marker saved earlier.
  910. cmp r12,QWORD[((16+8))+rsp]
  911. jb NEAR $L$outer4x
  ; rax = -(rdi | CF), where CF is the borrow of (top n limb - top tp limb)
  ; and r15 is zero on loop exit, so adc r15,r15 yields just that borrow.
  912. xor rax,rax
  913. sub rbp,r13
  914. adc r15,r15
  915. or rdi,r15
  916. sub rax,rdi
  ; Set up registers for the code at $L$sqr4x_sub_entry (outside this
  ; view): rbx = start of tp, rbp = n, r12 = n[0]-1, r13..r15 = n[1..3],
  ; rcx = r9 >> 5 (arithmetic; r9 is negative), rdi = saved output pointer,
  ; r10 = 0.
  917. lea rbx,[r9*1+r14]
  918. mov r12,QWORD[rcx]
  919. lea rbp,[rcx]
  920. mov rcx,r9
  921. sar rcx,3+2
  922. mov rdi,QWORD[((56+8))+rsp]
  923. dec r12
  924. xor r10,r10
  925. mov r13,QWORD[8+rbp]
  926. mov r14,QWORD[16+rbp]
  927. mov r15,QWORD[24+rbp]
  928. jmp NEAR $L$sqr4x_sub_entry
  ; ---------------------------------------------------------------------
  ; GFp_bn_power5 -- exported entry point (Win64 calling sequence).
  ;
  ; Incoming Win64 arguments rcx, rdx, r8, r9, [40+rsp], [48+rsp] are moved
  ; into rdi, rsi, rdx, rcx, r8, r9.  The routine calls
  ; __bn_sqr8x_internal followed by __bn_post4x_internal five times, then
  ; calls mul4x_internal once, and returns rax = 1.
  ; NOTE(review): presumably computes a^(2^5) * table[index] in Montgomery
  ; form -- confirm against the C prototype; __bn_post4x_internal is not
  ; visible here.
  ;
  ; If all bits of 0x80108 are set in DWORD[8+GFp_ia32cap_P], control goes
  ; to $L$powerx5_enter (defined outside this view) instead.
  ; ---------------------------------------------------------------------
  929. global GFp_bn_power5
  930. ALIGN 32
  931. GFp_bn_power5:
  ; Stash rdi/rsi (non-volatile on Win64) in the caller's shadow space.
  932. mov QWORD[8+rsp],rdi ;WIN64 prologue
  933. mov QWORD[16+rsp],rsi
  934. mov rax,rsp
  935. $L$SEH_begin_GFp_bn_power5:
  ; Remap Win64 argument registers to the registers the body uses.
  936. mov rdi,rcx
  937. mov rsi,rdx
  938. mov rdx,r8
  939. mov rcx,r9
  940. mov r8,QWORD[40+rsp]
  941. mov r9,QWORD[48+rsp]
  ; rax keeps the entry rsp for the epilogue and for mul4x_internal.
  942. mov rax,rsp
  943. lea r11,[GFp_ia32cap_P]
  944. mov r11d,DWORD[8+r11]
  945. and r11d,0x80108
  946. cmp r11d,0x80108
  947. je NEAR $L$powerx5_enter
  948. push rbx
  949. push rbp
  950. push r12
  951. push r13
  952. push r14
  953. push r15
  954. $L$power5_prologue:
  ; r9 = limb count * 8 (byte length); r10 = 3 * byte length; r9 = -r9;
  ; r8 = the qword the fifth argument points to.
  955. shl r9d,3
  956. lea r10d,[r9*2+r9]
  957. neg r9
  958. mov r8,QWORD[r8]
  ; Frame placement: same arithmetic as in bn_mul4x_mont_gather5.  r11 is
  ; the offset (mod 4096) of the tentative frame from the output pointer.
  959. lea r11,[((-320))+r9*2+rsp]
  960. mov rbp,rsp
  961. sub r11,rdi
  962. and r11,4095
  963. cmp r10,r11
  964. jb NEAR $L$pwr_sp_alt
  965. sub rbp,r11
  966. lea rbp,[((-320))+r9*2+rbp]
  967. jmp NEAR $L$pwr_sp_done
  968. ALIGN 32
  969. $L$pwr_sp_alt:
  970. lea r10,[((4096-320))+r9*2]
  971. lea rbp,[((-320))+r9*2+rbp]
  972. sub r11,r10
  ; mov (not xor) zeroes r10 so CF from the sub above is still intact for
  ; cmovc: r11 = max(r11 - r10, 0).
  973. mov r10,0
  974. cmovc r11,r10
  975. sub rbp,r11
  976. $L$pwr_sp_done:
  ; Align the frame base to 64 bytes, then walk rsp down to it one
  ; 4096-byte page at a time, reading a qword from each page.
  977. and rbp,-64
  978. mov r11,rsp
  979. sub r11,rbp
  980. and r11,-4096
  981. lea rsp,[rbp*1+r11]
  982. mov r10,QWORD[rsp]
  983. cmp rsp,rbp
  984. ja NEAR $L$pwr_page_walk
  985. jmp NEAR $L$pwr_page_walk_done
  986. $L$pwr_page_walk:
  987. lea rsp,[((-4096))+rsp]
  988. mov r10,QWORD[rsp]
  989. cmp rsp,rbp
  990. ja NEAR $L$pwr_page_walk
  991. $L$pwr_page_walk_done:
  ; r10 = negative byte length, r9 = positive byte length.
  992. mov r10,r9
  993. neg r9
  ; Frame slots: [32+rsp] = the qword loaded through r8, [40+rsp] = entry rsp.
  994. mov QWORD[32+rsp],r8
  995. mov QWORD[40+rsp],rax
  996. $L$power5_body:
  ; Encoded movq instructions stashing GPRs in XMM registers:
  ;   66 48 0F 6E CF = movq xmm1,rdi
  ;   66 48 0F 6E D1 = movq xmm2,rcx
  ;   66 49 0F 6E DA = movq xmm3,r10
  ;   66 48 0F 6E E2 = movq xmm4,rdx
  997. DB 102,72,15,110,207
  998. DB 102,72,15,110,209
  999. DB 102,73,15,110,218
  1000. DB 102,72,15,110,226
  ; Five rounds of __bn_sqr8x_internal + __bn_post4x_internal.
  1001. call __bn_sqr8x_internal
  1002. call __bn_post4x_internal
  1003. call __bn_sqr8x_internal
  1004. call __bn_post4x_internal
  1005. call __bn_sqr8x_internal
  1006. call __bn_post4x_internal
  1007. call __bn_sqr8x_internal
  1008. call __bn_post4x_internal
  1009. call __bn_sqr8x_internal
  1010. call __bn_post4x_internal
  ; 66 48 0F 7E D1 = movq rcx,xmm2 ; 66 48 0F 7E E2 = movq rdx,xmm4
  ; (recover the rcx and rdx values stashed above).
  1011. DB 102,72,15,126,209
  1012. DB 102,72,15,126,226
  ; Arguments for mul4x_internal: rdi = rsi, rax = saved entry rsp (so it
  ; can read the index at [56+rax]), r8 = address of the frame slot that
  ; holds the qword stored at [32+rsp] above.
  1013. mov rdi,rsi
  1014. mov rax,QWORD[40+rsp]
  1015. lea r8,[32+rsp]
  1016. call mul4x_internal
  ; rsi = saved entry rsp; return value 1; reload the six pushed registers
  ; relative to it and restore rsp.
  1017. mov rsi,QWORD[40+rsp]
  1018. mov rax,1
  1019. mov r15,QWORD[((-48))+rsi]
  1020. mov r14,QWORD[((-40))+rsi]
  1021. mov r13,QWORD[((-32))+rsi]
  1022. mov r12,QWORD[((-24))+rsi]
  1023. mov rbp,QWORD[((-16))+rsi]
  1024. mov rbx,QWORD[((-8))+rsi]
  1025. lea rsp,[rsi]
  1026. $L$power5_epilogue:
  ; Restore rdi/rsi from the shadow-space slots written in the prologue.
  1027. mov rdi,QWORD[8+rsp] ;WIN64 epilogue
  1028. mov rsi,QWORD[16+rsp]
  1029. DB 0F3h,0C3h ;repret
  1030. $L$SEH_end_GFp_bn_power5:
  1031. global GFp_bn_sqr8x_internal
  1032. ALIGN 32
  1033. GFp_bn_sqr8x_internal:
  1034. __bn_sqr8x_internal:
  1035. lea rbp,[32+r10]
  1036. lea rsi,[r9*1+rsi]
  1037. mov rcx,r9
  1038. mov r14,QWORD[((-32))+rbp*1+rsi]
  1039. lea rdi,[((48+8))+r9*2+rsp]
  1040. mov rax,QWORD[((-24))+rbp*1+rsi]
  1041. lea rdi,[((-32))+rbp*1+rdi]
  1042. mov rbx,QWORD[((-16))+rbp*1+rsi]
  1043. mov r15,rax
  1044. mul r14
  1045. mov r10,rax
  1046. mov rax,rbx
  1047. mov r11,rdx
  1048. mov QWORD[((-24))+rbp*1+rdi],r10
  1049. mul r14
  1050. add r11,rax
  1051. mov rax,rbx
  1052. adc rdx,0
  1053. mov QWORD[((-16))+rbp*1+rdi],r11
  1054. mov r10,rdx
  1055. mov rbx,QWORD[((-8))+rbp*1+rsi]
  1056. mul r15
  1057. mov r12,rax
  1058. mov rax,rbx
  1059. mov r13,rdx
  1060. lea rcx,[rbp]
  1061. mul r14
  1062. add r10,rax
  1063. mov rax,rbx
  1064. mov r11,rdx
  1065. adc r11,0
  1066. add r10,r12
  1067. adc r11,0
  1068. mov QWORD[((-8))+rcx*1+rdi],r10
  1069. jmp NEAR $L$sqr4x_1st
  1070. ALIGN 32
  1071. $L$sqr4x_1st:
  1072. mov rbx,QWORD[rcx*1+rsi]
  1073. mul r15
  1074. add r13,rax
  1075. mov rax,rbx
  1076. mov r12,rdx
  1077. adc r12,0
  1078. mul r14
  1079. add r11,rax
  1080. mov rax,rbx
  1081. mov rbx,QWORD[8+rcx*1+rsi]
  1082. mov r10,rdx
  1083. adc r10,0
  1084. add r11,r13
  1085. adc r10,0
  1086. mul r15
  1087. add r12,rax
  1088. mov rax,rbx
  1089. mov QWORD[rcx*1+rdi],r11
  1090. mov r13,rdx
  1091. adc r13,0
  1092. mul r14
  1093. add r10,rax
  1094. mov rax,rbx
  1095. mov rbx,QWORD[16+rcx*1+rsi]
  1096. mov r11,rdx
  1097. adc r11,0
  1098. add r10,r12
  1099. adc r11,0
  1100. mul r15
  1101. add r13,rax
  1102. mov rax,rbx
  1103. mov QWORD[8+rcx*1+rdi],r10
  1104. mov r12,rdx
  1105. adc r12,0
  1106. mul r14
  1107. add r11,rax
  1108. mov rax,rbx
  1109. mov rbx,QWORD[24+rcx*1+rsi]
  1110. mov r10,rdx
  1111. adc r10,0
  1112. add r11,r13
  1113. adc r10,0
  1114. mul r15
  1115. add r12,rax
  1116. mov rax,rbx
  1117. mov QWORD[16+rcx*1+rdi],r11
  1118. mov r13,rdx
  1119. adc r13,0
  1120. lea rcx,[32+rcx]
  1121. mul r14
  1122. add r10,rax
  1123. mov rax,rbx
  1124. mov r11,rdx
  1125. adc r11,0
  1126. add r10,r12
  1127. adc r11,0
  1128. mov QWORD[((-8))+rcx*1+rdi],r10
  1129. cmp rcx,0
  1130. jne NEAR $L$sqr4x_1st
  1131. mul r15
  1132. add r13,rax
  1133. lea rbp,[16+rbp]
  1134. adc rdx,0
  1135. add r13,r11
  1136. adc rdx,0
  1137. mov QWORD[rdi],r13
  1138. mov r12,rdx
  1139. mov QWORD[8+rdi],rdx
  1140. jmp NEAR $L$sqr4x_outer
  1141. ALIGN 32
  1142. $L$sqr4x_outer:
  1143. mov r14,QWORD[((-32))+rbp*1+rsi]
  1144. lea rdi,[((48+8))+r9*2+rsp]
  1145. mov rax,QWORD[((-24))+rbp*1+rsi]
  1146. lea rdi,[((-32))+rbp*1+rdi]
  1147. mov rbx,QWORD[((-16))+rbp*1+rsi]
  1148. mov r15,rax
  1149. mul r14
  1150. mov r10,QWORD[((-24))+rbp*1+rdi]
  1151. add r10,rax
  1152. mov rax,rbx
  1153. adc rdx,0
  1154. mov QWORD[((-24))+rbp*1+rdi],r10
  1155. mov r11,rdx
  1156. mul r14
  1157. add r11,rax
  1158. mov rax,rbx
  1159. adc rdx,0
  1160. add r11,QWORD[((-16))+rbp*1+rdi]
  1161. mov r10,rdx
  1162. adc r10,0
  1163. mov QWORD[((-16))+rbp*1+rdi],r11
  1164. xor r12,r12
  1165. mov rbx,QWORD[((-8))+rbp*1+rsi]
  1166. mul r15
  1167. add r12,rax
  1168. mov rax,rbx
  1169. adc rdx,0
  1170. add r12,QWORD[((-8))+rbp*1+rdi]
  1171. mov r13,rdx
  1172. adc r13,0
  1173. mul r14
  1174. add r10,rax
  1175. mov rax,rbx
  1176. adc rdx,0
  1177. add r10,r12
  1178. mov r11,rdx
  1179. adc r11,0
  1180. mov QWORD[((-8))+rbp*1+rdi],r10
  1181. lea rcx,[rbp]
  1182. jmp NEAR $L$sqr4x_inner
  1183. ALIGN 32
  1184. $L$sqr4x_inner:
  1185. mov rbx,QWORD[rcx*1+rsi]
  1186. mul r15
  1187. add r13,rax
  1188. mov rax,rbx
  1189. mov r12,rdx
  1190. adc r12,0
  1191. add r13,QWORD[rcx*1+rdi]
  1192. adc r12,0
  1193. DB 0x67
  1194. mul r14
  1195. add r11,rax
  1196. mov rax,rbx
  1197. mov rbx,QWORD[8+rcx*1+rsi]
  1198. mov r10,rdx
  1199. adc r10,0
  1200. add r11,r13
  1201. adc r10,0
  1202. mul r15
  1203. add r12,rax
  1204. mov QWORD[rcx*1+rdi],r11
  1205. mov rax,rbx
  1206. mov r13,rdx
  1207. adc r13,0
  1208. add r12,QWORD[8+rcx*1+rdi]
  1209. lea rcx,[16+rcx]
  1210. adc r13,0
  1211. mul r14
  1212. add r10,rax
  1213. mov rax,rbx
  1214. adc rdx,0
  1215. add r10,r12
  1216. mov r11,rdx
  1217. adc r11,0
  1218. mov QWORD[((-8))+rcx*1+rdi],r10
  1219. cmp rcx,0
  1220. jne NEAR $L$sqr4x_inner
  1221. DB 0x67
  1222. mul r15
  1223. add r13,rax
  1224. adc rdx,0
  1225. add r13,r11
  1226. adc rdx,0
  1227. mov QWORD[rdi],r13
  1228. mov r12,rdx
  1229. mov QWORD[8+rdi],rdx
  1230. add rbp,16
  1231. jnz NEAR $L$sqr4x_outer
  1232. mov r14,QWORD[((-32))+rsi]
  1233. lea rdi,[((48+8))+r9*2+rsp]
  1234. mov rax,QWORD[((-24))+rsi]
  1235. lea rdi,[((-32))+rbp*1+rdi]
  1236. mov rbx,QWORD[((-16))+rsi]
  1237. mov r15,rax
  1238. mul r14
  1239. add r10,rax
  1240. mov rax,rbx
  1241. mov r11,rdx
  1242. adc r11,0
  1243. mul r14
  1244. add r11,rax
  1245. mov rax,rbx
  1246. mov QWORD[((-24))+rdi],r10
  1247. mov r10,rdx
  1248. adc r10,0
  1249. add r11,r13
  1250. mov rbx,QWORD[((-8))+rsi]
  1251. adc r10,0
  1252. mul r15
  1253. add r12,rax
  1254. mov rax,rbx
  1255. mov QWORD[((-16))+rdi],r11
  1256. mov r13,rdx
  1257. adc r13,0
  1258. mul r14
  1259. add r10,rax
  1260. mov rax,rbx
  1261. mov r11,rdx
  1262. adc r11,0
  1263. add r10,r12
  1264. adc r11,0
  1265. mov QWORD[((-8))+rdi],r10
  1266. mul r15
  1267. add r13,rax
  1268. mov rax,QWORD[((-16))+rsi]
  1269. adc rdx,0
  1270. add r13,r11
  1271. adc rdx,0
  1272. mov QWORD[rdi],r13
  1273. mov r12,rdx
  1274. mov QWORD[8+rdi],rdx
  1275. mul rbx
  1276. add rbp,16
  1277. xor r14,r14
  1278. sub rbp,r9
  1279. xor r15,r15
  1280. add rax,r12
  1281. adc rdx,0
  1282. mov QWORD[8+rdi],rax
  1283. mov QWORD[16+rdi],rdx
  1284. mov QWORD[24+rdi],r15
  1285. mov rax,QWORD[((-16))+rbp*1+rsi]
  1286. lea rdi,[((48+8))+rsp]
  1287. xor r10,r10
  1288. mov r11,QWORD[8+rdi]
  1289. lea r12,[r10*2+r14]
  1290. shr r10,63
  1291. lea r13,[r11*2+rcx]
  1292. shr r11,63
  1293. or r13,r10
  1294. mov r10,QWORD[16+rdi]
  1295. mov r14,r11
  1296. mul rax
  1297. neg r15
  1298. mov r11,QWORD[24+rdi]
  1299. adc r12,rax
  1300. mov rax,QWORD[((-8))+rbp*1+rsi]
  1301. mov QWORD[rdi],r12
  1302. adc r13,rdx
  1303. lea rbx,[r10*2+r14]
  1304. mov QWORD[8+rdi],r13
  1305. sbb r15,r15
  1306. shr r10,63
  1307. lea r8,[r11*2+rcx]
  1308. shr r11,63
  1309. or r8,r10
  1310. mov r10,QWORD[32+rdi]
  1311. mov r14,r11
  1312. mul rax
  1313. neg r15
  1314. mov r11,QWORD[40+rdi]
  1315. adc rbx,rax
  1316. mov rax,QWORD[rbp*1+rsi]
  1317. mov QWORD[16+rdi],rbx
  1318. adc r8,rdx
  1319. lea rbp,[16+rbp]
  1320. mov QWORD[24+rdi],r8
  1321. sbb r15,r15
  1322. lea rdi,[64+rdi]
  1323. jmp NEAR $L$sqr4x_shift_n_add
  1324. ALIGN 32
  1325. $L$sqr4x_shift_n_add:
  1326. lea r12,[r10*2+r14]
  1327. shr r10,63
  1328. lea r13,[r11*2+rcx]
  1329. shr r11,63
  1330. or r13,r10
  1331. mov r10,QWORD[((-16))+rdi]
  1332. mov r14,r11
  1333. mul rax
  1334. neg r15
  1335. mov r11,QWORD[((-8))+rdi]
  1336. adc r12,rax
  1337. mov rax,QWORD[((-8))+rbp*1+rsi]
  1338. mov QWORD[((-32))+rdi],r12
  1339. adc r13,rdx
  1340. lea rbx,[r10*2+r14]
  1341. mov QWORD[((-24))+rdi],r13
  1342. sbb r15,r15
  1343. shr r10,63
  1344. lea r8,[r11*2+rcx]
  1345. shr r11,63
  1346. or r8,r10
  1347. mov r10,QWORD[rdi]
  1348. mov r14,r11
  1349. mul rax
  1350. neg r15
  1351. mov r11,QWORD[8+rdi]
  1352. adc rbx,rax
  1353. mov rax,QWORD[rbp*1+rsi]
  1354. mov QWORD[((-16))+rdi],rbx
  1355. adc r8,rdx
  1356. lea r12,[r10*2+r14]
  1357. mov QWORD[((-8))+rdi],r8
  1358. sbb r15,r15
  1359. shr r10,63
  1360. lea r13,[r11*2+rcx]
  1361. shr r11,63
  1362. or r13,r10
  1363. mov r10,QWORD[16+rdi]
  1364. mov r14,r11
  1365. mul rax
  1366. neg r15
  1367. mov r11,QWORD[24+rdi]
  1368. adc r12,rax
  1369. mov rax,QWORD[8+rbp*1+rsi]
  1370. mov QWORD[rdi],r12
  1371. adc r13,rdx
  1372. lea rbx,[r10*2+r14]
  1373. mov QWORD[8+rdi],r13
  1374. sbb r15,r15
  1375. shr r10,63
  1376. lea r8,[r11*2+rcx]
  1377. shr r11,63
  1378. or r8,r10
  1379. mov r10,QWORD[32+rdi]
  1380. mov r14,r11
  1381. mul rax
  1382. neg r15
  1383. mov r11,QWORD[40+rdi]
  1384. adc rbx,rax
  1385. mov rax,QWORD[16+rbp*1+rsi]
  1386. mov QWORD[16+rdi],rbx
  1387. adc r8,rdx
  1388. mov QWORD[24+rdi],r8
  1389. sbb r15,r15
  1390. lea rdi,[64+rdi]
  1391. add rbp,32
  1392. jnz NEAR $L$sqr4x_shift_n_add
  1393. lea r12,[r10*2+r14]
  1394. DB 0x67
  1395. shr r10,63
  1396. lea r13,[r11*2+rcx]
  1397. shr r11,63
  1398. or r13,r10
  1399. mov r10,QWORD[((-16))+rdi]
  1400. mov r14,r11
  1401. mul rax
  1402. neg r15
  1403. mov r11,QWORD[((-8))+rdi]
  1404. adc r12,rax
  1405. mov rax,QWORD[((-8))+rsi]
  1406. mov QWORD[((-32))+rdi],r12
  1407. adc r13,rdx
  1408. lea rbx,[r10*2+r14]
  1409. mov QWORD[((-24))+rdi],r13
  1410. sbb r15,r15
  1411. shr r10,63
  1412. lea r8,[r11*2+rcx]
  1413. shr r11,63
  1414. or r8,r10
  1415. mul rax
  1416. neg r15
  1417. adc rbx,rax
  1418. adc r8,rdx
  1419. mov QWORD[((-16))+rdi],rbx
  1420. mov QWORD[((-8))+rdi],r8
  1421. DB 102,72,15,126,213
  ; __bn_sqr8x_reduction -- internal helper, entered by `call` (see
  ; $L$from_mont_nox) and by fall-through from the code directly above.
  ;
  ; Registers/stack as used by the code below:
  ;   rbp          base of the 64-bit words that get multiplied by rbx
  ;                (presumably the modulus n[] -- confirm against callers)
  ;   r9           byte count; negated here, reloaded from xmm3 on exit
  ;   [32+8+rsp]   64-bit factor used to derive each multiplier
  ;                (presumably n0; callers store it at their own [32+rsp])
  ;   [48+8+rsp]   start of the scratch vector that is updated in place
  ;   xmm2, xmm3   saved copies of rbp and r9, restored near the end
  ; On exit: rax = top-most carry, rcx = word at [-8+rbp] (read before rbp
  ; is restored), rsi = 0, rdi has reached the end pointer kept in rdx.
  1422. __bn_sqr8x_reduction:
  1423. xor rax,rax ; top-most carry starts at 0
  1424. lea rcx,[rbp*1+r9] ; rcx = rbp + byte count: end of the words at rbp
  1425. lea rdx,[((48+8))+r9*2+rsp] ; rdx = end of the 2*r9-byte scratch vector
  1426. mov QWORD[((0+8))+rsp],rcx ; keep both end pointers in the frame
  1427. lea rdi,[((48+8))+r9*1+rsp] ; rdi = scratch start + r9 bytes
  1428. mov QWORD[((8+8))+rsp],rdx
  1429. neg r9 ; r9 = -byte count, used to rewind rdi below
  1430. jmp NEAR $L$8x_reduction_loop
  1431. ALIGN 32
  ; Outer loop: one pass per 8-word (64-byte) window of the scratch vector.
  1432. $L$8x_reduction_loop:
  1433. lea rdi,[r9*1+rdi] ; step back r9 bytes to the start of this window
  1434. DB 0x66 ; stray prefix byte ahead of the next instruction (presumably padding)
  1435. mov rbx,QWORD[rdi] ; load the 8 window words into rbx, r9..r15
  1436. mov r9,QWORD[8+rdi]
  1437. mov r10,QWORD[16+rdi]
  1438. mov r11,QWORD[24+rdi]
  1439. mov r12,QWORD[32+rdi]
  1440. mov r13,QWORD[40+rdi]
  1441. mov r14,QWORD[48+rdi]
  1442. mov r15,QWORD[56+rdi]
  1443. mov QWORD[rdx],rax ; store the carry from the previous pass at the end pointer
  1444. lea rdi,[64+rdi]
  1445. DB 0x67 ; stray prefix byte (presumably padding)
  1446. mov r8,rbx ; r8 = lowest accumulator word
  1447. imul rbx,QWORD[((32+8))+rsp] ; rbx = multiplier = low word * stored factor
  1448. mov rax,QWORD[rbp]
  1449. mov ecx,8 ; 8 iterations of $L$8x_reduce
  1450. jmp NEAR $L$8x_reduce
  1451. ALIGN 32
  ; Each iteration adds rbx * (8 words at rbp) to r8..r15 and moves the
  ; accumulator down by one word; the multiplier is saved for $L$8x_tail.
  1452. $L$8x_reduce:
  1453. mul rbx
  1454. mov rax,QWORD[8+rbp]
  1455. neg r8 ; CF = (r8 != 0); the low product word itself is never added
  1456. mov r8,rdx
  1457. adc r8,0
  1458. mul rbx
  1459. add r9,rax
  1460. mov rax,QWORD[16+rbp]
  1461. adc rdx,0
  1462. add r8,r9
  1463. mov QWORD[((48-8+8))+rcx*8+rsp],rbx ; stash this multiplier (slot indexed by rcx)
  1464. mov r9,rdx
  1465. adc r9,0
  1466. mul rbx
  1467. add r10,rax
  1468. mov rax,QWORD[24+rbp]
  1469. adc rdx,0
  1470. add r9,r10
  1471. mov rsi,QWORD[((32+8))+rsp] ; reload the stored factor
  1472. mov r10,rdx
  1473. adc r10,0
  1474. mul rbx
  1475. add r11,rax
  1476. mov rax,QWORD[32+rbp]
  1477. adc rdx,0
  1478. imul rsi,r8 ; next multiplier = factor * new lowest word
  1479. add r10,r11
  1480. mov r11,rdx
  1481. adc r11,0
  1482. mul rbx
  1483. add r12,rax
  1484. mov rax,QWORD[40+rbp]
  1485. adc rdx,0
  1486. add r11,r12
  1487. mov r12,rdx
  1488. adc r12,0
  1489. mul rbx
  1490. add r13,rax
  1491. mov rax,QWORD[48+rbp]
  1492. adc rdx,0
  1493. add r12,r13
  1494. mov r13,rdx
  1495. adc r13,0
  1496. mul rbx
  1497. add r14,rax
  1498. mov rax,QWORD[56+rbp]
  1499. adc rdx,0
  1500. add r13,r14
  1501. mov r14,rdx
  1502. adc r14,0
  1503. mul rbx
  1504. mov rbx,rsi ; switch to the multiplier computed above
  1505. add r15,rax
  1506. mov rax,QWORD[rbp]
  1507. adc rdx,0
  1508. add r14,r15
  1509. mov r15,rdx
  1510. adc r15,0
  1511. dec ecx
  1512. jnz NEAR $L$8x_reduce
  1513. lea rbp,[64+rbp] ; advance to the next 8 words at rbp
  1514. xor rax,rax
  1515. mov rdx,QWORD[((8+8))+rsp] ; reload scratch end pointer
  1516. cmp rbp,QWORD[((0+8))+rsp] ; already at the saved end of the rbp words?
  1517. jae NEAR $L$8x_no_tail
  1518. DB 0x66 ; stray prefix byte (presumably padding)
  1519. add r8,QWORD[rdi] ; fold in the next 8 scratch words
  1520. adc r9,QWORD[8+rdi]
  1521. adc r10,QWORD[16+rdi]
  1522. adc r11,QWORD[24+rdi]
  1523. adc r12,QWORD[32+rdi]
  1524. adc r13,QWORD[40+rdi]
  1525. adc r14,QWORD[48+rdi]
  1526. adc r15,QWORD[56+rdi]
  1527. sbb rsi,rsi ; rsi = -CF: remember the carry out
  1528. mov rbx,QWORD[((48+56+8))+rsp] ; first multiplier saved by $L$8x_reduce
  1529. mov ecx,8
  1530. mov rax,QWORD[rbp]
  1531. jmp NEAR $L$8x_tail
  1532. ALIGN 32
  ; Tail: apply the 8 saved multipliers to the remaining words at rbp,
  ; writing one finished word to [rdi] per iteration.
  1533. $L$8x_tail:
  1534. mul rbx
  1535. add r8,rax
  1536. mov rax,QWORD[8+rbp]
  1537. mov QWORD[rdi],r8 ; store finished low word
  1538. mov r8,rdx
  1539. adc r8,0
  1540. mul rbx
  1541. add r9,rax
  1542. mov rax,QWORD[16+rbp]
  1543. adc rdx,0
  1544. add r8,r9
  1545. lea rdi,[8+rdi]
  1546. mov r9,rdx
  1547. adc r9,0
  1548. mul rbx
  1549. add r10,rax
  1550. mov rax,QWORD[24+rbp]
  1551. adc rdx,0
  1552. add r9,r10
  1553. mov r10,rdx
  1554. adc r10,0
  1555. mul rbx
  1556. add r11,rax
  1557. mov rax,QWORD[32+rbp]
  1558. adc rdx,0
  1559. add r10,r11
  1560. mov r11,rdx
  1561. adc r11,0
  1562. mul rbx
  1563. add r12,rax
  1564. mov rax,QWORD[40+rbp]
  1565. adc rdx,0
  1566. add r11,r12
  1567. mov r12,rdx
  1568. adc r12,0
  1569. mul rbx
  1570. add r13,rax
  1571. mov rax,QWORD[48+rbp]
  1572. adc rdx,0
  1573. add r12,r13
  1574. mov r13,rdx
  1575. adc r13,0
  1576. mul rbx
  1577. add r14,rax
  1578. mov rax,QWORD[56+rbp]
  1579. adc rdx,0
  1580. add r13,r14
  1581. mov r14,rdx
  1582. adc r14,0
  1583. mul rbx
  1584. mov rbx,QWORD[((48-16+8))+rcx*8+rsp] ; fetch the next saved multiplier
  1585. add r15,rax
  1586. adc rdx,0
  1587. add r14,r15
  1588. mov rax,QWORD[rbp]
  1589. mov r15,rdx
  1590. adc r15,0
  1591. dec ecx
  1592. jnz NEAR $L$8x_tail
  1593. lea rbp,[64+rbp]
  1594. mov rdx,QWORD[((8+8))+rsp]
  1595. cmp rbp,QWORD[((0+8))+rsp]
  1596. jae NEAR $L$8x_tail_done
  1597. mov rbx,QWORD[((48+56+8))+rsp] ; restart from the first saved multiplier
  1598. neg rsi ; CF = carry remembered in rsi
  1599. mov rax,QWORD[rbp]
  1600. adc r8,QWORD[rdi]
  1601. adc r9,QWORD[8+rdi]
  1602. adc r10,QWORD[16+rdi]
  1603. adc r11,QWORD[24+rdi]
  1604. adc r12,QWORD[32+rdi]
  1605. adc r13,QWORD[40+rdi]
  1606. adc r14,QWORD[48+rdi]
  1607. adc r15,QWORD[56+rdi]
  1608. sbb rsi,rsi ; remember the new carry
  1609. mov ecx,8
  1610. jmp NEAR $L$8x_tail
  1611. ALIGN 32
  1612. $L$8x_tail_done:
  1613. xor rax,rax
  1614. add r8,QWORD[rdx] ; add the carry word stored at the end pointer
  1615. adc r9,0
  1616. adc r10,0
  1617. adc r11,0
  1618. adc r12,0
  1619. adc r13,0
  1620. adc r14,0
  1621. adc r15,0
  1622. adc rax,0
  1623. neg rsi ; CF = carry remembered in rsi
  ; Reached with CF = 0 from the jae above (xor rax,rax cleared it, mov
  ; leaves flags alone, and jae is taken only when CF = 0), or with CF
  ; restored from rsi when falling through from $L$8x_tail_done.
  1624. $L$8x_no_tail:
  1625. adc r8,QWORD[rdi]
  1626. adc r9,QWORD[8+rdi]
  1627. adc r10,QWORD[16+rdi]
  1628. adc r11,QWORD[24+rdi]
  1629. adc r12,QWORD[32+rdi]
  1630. adc r13,QWORD[40+rdi]
  1631. adc r14,QWORD[48+rdi]
  1632. adc r15,QWORD[56+rdi]
  1633. adc rax,0 ; rax = top-most carry of this pass
  1634. mov rcx,QWORD[((-8))+rbp] ; last word before rbp, read before rbp is restored
  1635. xor rsi,rsi
  1636. DB 102,72,15,126,213 ; encodes movq rbp,xmm2 (restore rbp)
  1637. mov QWORD[rdi],r8 ; write the 8 accumulator words back
  1638. mov QWORD[8+rdi],r9
  1639. DB 102,73,15,126,217 ; encodes movq r9,xmm3 (r9 was just stored, safe to reload)
  1640. mov QWORD[16+rdi],r10
  1641. mov QWORD[24+rdi],r11
  1642. mov QWORD[32+rdi],r12
  1643. mov QWORD[40+rdi],r13
  1644. mov QWORD[48+rdi],r14
  1645. mov QWORD[56+rdi],r15
  1646. lea rdi,[64+rdi]
  1647. cmp rdi,rdx ; more windows before the end pointer?
  1648. jb NEAR $L$8x_reduction_loop
  1649. DB 0F3h,0C3h ;repret
  ; __bn_post4x_internal -- branch-free conditional subtraction, four
  ; 64-bit words per loop iteration.
  ;
  ; As used below:
  ;   rbp   = words to subtract (presumably the modulus n[] -- confirm)
  ;   rdi+r9 = source words (r9 must be negative: the loop counter derived
  ;           from it counts up to zero)
  ;   rax   = selector; after `neg rax` it is ANDed into every word as a mask
  ;           (callers are expected to pass 0 or 1 -- confirm)
  ;   xmm1  = destination pointer
  ; Writes dst[i] = src[i] + ((-n) & mask) with carry propagation, then sets
  ; r10 = r9 and negates r9 before returning.
  1650. ALIGN 32
  1651. __bn_post4x_internal:
  1652. mov r12,QWORD[rbp]
  1653. lea rbx,[r9*1+rdi] ; rbx = source pointer
  1654. mov rcx,r9
  1655. DB 102,72,15,126,207 ; encodes movq rdi,xmm1 (destination pointer)
  1656. neg rax ; rax = 0 - selector, used as AND mask
  1657. DB 102,72,15,126,206 ; encodes movq rsi,xmm1
  1658. sar rcx,3+2 ; rcx = r9 / 32 = (negative) number of 4-word groups
  1659. dec r12 ; ~(n[0]-1) == -n[0], so the first `not` yields the two's-complement low word
  1660. xor r10,r10 ; no carry in for the first group
  1661. mov r13,QWORD[8+rbp]
  1662. mov r14,QWORD[16+rbp]
  1663. mov r15,QWORD[24+rbp]
  1664. jmp NEAR $L$sqr4x_sub_entry
  1665. ALIGN 16
  1666. $L$sqr4x_sub:
  1667. mov r12,QWORD[rbp]
  1668. mov r13,QWORD[8+rbp]
  1669. mov r14,QWORD[16+rbp]
  1670. mov r15,QWORD[24+rbp]
  1671. $L$sqr4x_sub_entry:
  1672. lea rbp,[32+rbp]
  1673. not r12
  1674. not r13
  1675. not r14
  1676. not r15
  1677. and r12,rax ; keep the complemented words only when the mask is set
  1678. and r13,rax
  1679. and r14,rax
  1680. and r15,rax
  1681. neg r10 ; CF = carry saved from the previous group
  1682. adc r12,QWORD[rbx]
  1683. adc r13,QWORD[8+rbx]
  1684. adc r14,QWORD[16+rbx]
  1685. adc r15,QWORD[24+rbx]
  1686. mov QWORD[rdi],r12
  1687. lea rbx,[32+rbx]
  1688. mov QWORD[8+rdi],r13
  1689. sbb r10,r10 ; r10 = -CF: save carry (mov/lea do not touch flags)
  1690. mov QWORD[16+rdi],r14
  1691. mov QWORD[24+rdi],r15
  1692. lea rdi,[32+rdi]
  1693. inc rcx ; count up towards zero
  1694. jnz NEAR $L$sqr4x_sub
  1695. mov r10,r9
  1696. neg r9
  1697. DB 0F3h,0C3h ;repret
  ; GFp_bn_from_montgomery -- exported entry point. Tests the low three bits
  ; of the dword at [48+rsp] (the same stack slot bn_from_mont8x loads into
  ; r9). If any bit is set it returns with eax = 0; otherwise it tail-jumps
  ; to bn_from_mont8x with registers and stack untouched.
  1698. global GFp_bn_from_montgomery
  1699. ALIGN 32
  1700. GFp_bn_from_montgomery:
  1701. test DWORD[48+rsp],7 ; value a multiple of 8?
  1702. jz NEAR bn_from_mont8x
  1703. xor eax,eax ; not handled here: return 0
  1704. DB 0F3h,0C3h ;repret
  ; bn_from_mont8x -- reached only via the jump in GFp_bn_from_montgomery.
  ;
  ; Win64 entry: rcx, rdx, r8, r9 and the stack slots [40+rsp], [48+rsp] are
  ; moved into rdi, rsi, rdx, rcx, r8, r9. After that:
  ;   rdi = pointer later saved in xmm1 (destination used by the post step)
  ;   rsi = source words copied into the scratch frame
  ;   rcx = pointer saved in xmm2 / rbp for the reduction helpers
  ;   r8  = pointer whose first qword is stored at [32+rsp]
  ;   r9  = word count (converted to bytes with shl 3)
  ; Builds a 64-byte-aligned scratch frame below rsp, copies the input into
  ; it, runs a reduction + post step (MULX/ADX variant when the capability
  ; bits allow), wipes the scratch area and returns rax = 1.
  1705. ALIGN 32
  1706. bn_from_mont8x:
  1707. mov QWORD[8+rsp],rdi ;WIN64 prologue
  1708. mov QWORD[16+rsp],rsi
  1709. mov rax,rsp
  1710. $L$SEH_begin_bn_from_mont8x:
  1711. mov rdi,rcx
  1712. mov rsi,rdx
  1713. mov rdx,r8
  1714. mov rcx,r9
  1715. mov r8,QWORD[40+rsp]
  1716. mov r9,QWORD[48+rsp]
  1717. DB 0x67 ; stray prefix byte (presumably padding)
  1718. mov rax,rsp ; rax = rsp at entry, saved later at [40+rsp]
  1719. push rbx
  1720. push rbp
  1721. push r12
  1722. push r13
  1723. push r14
  1724. push r15
  1725. $L$from_prologue:
  1726. shl r9d,3 ; word count -> byte count (32-bit op zero-extends)
  1727. lea r10,[r9*2+r9] ; r10 = 3 * byte count
  1728. neg r9
  1729. mov r8,QWORD[r8] ; dereference the pointer argument
  1730. lea r11,[((-320))+r9*2+rsp] ; tentative frame base: rsp - 320 - 2*bytes
  1731. mov rbp,rsp
  1732. sub r11,rdi
  1733. and r11,4095 ; offset between frame base and rdi within a 4 KiB page
  1734. cmp r10,r11
  1735. jb NEAR $L$from_sp_alt
  1736. sub rbp,r11
  1737. lea rbp,[((-320))+r9*2+rbp]
  1738. jmp NEAR $L$from_sp_done
  1739. ALIGN 32
  1740. $L$from_sp_alt:
  1741. lea r10,[((4096-320))+r9*2]
  1742. lea rbp,[((-320))+r9*2+rbp]
  1743. sub r11,r10
  1744. mov r10,0 ; must be mov, not xor: cmovc below consumes CF from the sub
  1745. cmovc r11,r10 ; clamp the adjustment to 0 if the sub borrowed
  1746. sub rbp,r11
  1747. $L$from_sp_done:
  1748. and rbp,-64 ; 64-byte align the new frame
  1749. mov r11,rsp
  1750. sub r11,rbp
  1751. and r11,-4096 ; whole pages between old rsp and the new frame
  1752. lea rsp,[rbp*1+r11]
  1753. mov r10,QWORD[rsp] ; touch the page
  1754. cmp rsp,rbp
  1755. ja NEAR $L$from_page_walk
  1756. jmp NEAR $L$from_page_walk_done
  ; Lower rsp one 4 KiB page at a time, reading from each page on the way
  ; down to the final frame address in rbp.
  1757. $L$from_page_walk:
  1758. lea rsp,[((-4096))+rsp]
  1759. mov r10,QWORD[rsp]
  1760. cmp rsp,rbp
  1761. ja NEAR $L$from_page_walk
  1762. $L$from_page_walk_done:
  1763. mov r10,r9 ; r10 = -byte count (goes into xmm3 below)
  1764. neg r9 ; r9 = +byte count
  1765. mov QWORD[32+rsp],r8 ; helpers read this slot as [32+8+rsp]
  1766. mov QWORD[40+rsp],rax ; original rsp for the epilogue
  1767. $L$from_body:
  1768. mov r11,r9
  1769. lea rax,[48+rsp]
  1770. pxor xmm0,xmm0
  1771. jmp NEAR $L$mul_by_1
  1772. ALIGN 32
  ; Copy r9 bytes from rsi to [48+rsp], 64 bytes per iteration, while
  ; zeroing the r9 bytes that follow the copy ([r9+rax]).
  1773. $L$mul_by_1:
  1774. movdqu xmm1,XMMWORD[rsi]
  1775. movdqu xmm2,XMMWORD[16+rsi]
  1776. movdqu xmm3,XMMWORD[32+rsi]
  1777. movdqa XMMWORD[r9*1+rax],xmm0
  1778. movdqu xmm4,XMMWORD[48+rsi]
  1779. movdqa XMMWORD[16+r9*1+rax],xmm0
  1780. DB 0x48,0x8d,0xb6,0x40,0x00,0x00,0x00 ; encodes lea rsi,[64+rsi] with a 32-bit displacement
  1781. movdqa XMMWORD[rax],xmm1
  1782. movdqa XMMWORD[32+r9*1+rax],xmm0
  1783. movdqa XMMWORD[16+rax],xmm2
  1784. movdqa XMMWORD[48+r9*1+rax],xmm0
  1785. movdqa XMMWORD[32+rax],xmm3
  1786. movdqa XMMWORD[48+rax],xmm4
  1787. lea rax,[64+rax]
  1788. sub r11,64
  1789. jnz NEAR $L$mul_by_1
  1790. DB 102,72,15,110,207 ; encodes movq xmm1,rdi
  1791. DB 102,72,15,110,209 ; encodes movq xmm2,rcx
  1792. DB 0x67 ; stray prefix byte (presumably padding)
  1793. mov rbp,rcx
  1794. DB 102,73,15,110,218 ; encodes movq xmm3,r10 (-byte count)
  1795. lea r11,[GFp_ia32cap_P]
  1796. mov r11d,DWORD[8+r11] ; third dword of the capability vector
  1797. and r11d,0x80108
  1798. cmp r11d,0x80108 ; all three feature bits required for the MULX/ADX helpers
  1799. jne NEAR $L$from_mont_nox
  1800. lea rdi,[r9*1+rax]
  1801. call __bn_sqrx8x_reduction
  1802. call __bn_postx4x_internal
  1803. pxor xmm0,xmm0
  1804. lea rax,[48+rsp]
  1805. jmp NEAR $L$from_mont_zero
  1806. ALIGN 32
  1807. $L$from_mont_nox:
  1808. call __bn_sqr8x_reduction
  1809. call __bn_post4x_internal
  1810. pxor xmm0,xmm0
  1811. lea rax,[48+rsp]
  1812. jmp NEAR $L$from_mont_zero
  1813. ALIGN 32
  ; Wipe the scratch area: 64 bytes per iteration with r9 stepping by 32,
  ; so 2*r9 bytes starting at [48+rsp] are cleared.
  1814. $L$from_mont_zero:
  1815. mov rsi,QWORD[40+rsp] ; rsi = original rsp saved in the prologue
  1816. movdqa XMMWORD[rax],xmm0
  1817. movdqa XMMWORD[16+rax],xmm0
  1818. movdqa XMMWORD[32+rax],xmm0
  1819. movdqa XMMWORD[48+rax],xmm0
  1820. lea rax,[64+rax]
  1821. sub r9,32
  1822. jnz NEAR $L$from_mont_zero
  1823. mov rax,1 ; return value
  1824. mov r15,QWORD[((-48))+rsi] ; reload the six registers pushed in the prologue
  1825. mov r14,QWORD[((-40))+rsi]
  1826. mov r13,QWORD[((-32))+rsi]
  1827. mov r12,QWORD[((-24))+rsi]
  1828. mov rbp,QWORD[((-16))+rsi]
  1829. mov rbx,QWORD[((-8))+rsi]
  1830. lea rsp,[rsi]
  1831. $L$from_epilogue:
  1832. mov rdi,QWORD[8+rsp] ;WIN64 epilogue
  1833. mov rsi,QWORD[16+rsp]
  1834. DB 0F3h,0C3h ;repret
  1835. $L$SEH_end_bn_from_mont8x:
  ; bn_mulx4x_mont_gather5 -- wrapper around mulx4x_internal.
  ;
  ; Win64 entry: rcx, rdx, r8, r9 and the stack slots [40+rsp], [48+rsp] are
  ; moved into rdi, rsi, rdx, rcx, r8, r9. $L$mulx4x_enter is a second entry
  ; label placed after that shuffle (it is not referenced inside this
  ; block). The code then builds a 64-byte-aligned scratch frame, stores the
  ; dereferenced r8 at [32+rsp] and the original rsp at [40+rsp], calls
  ; mulx4x_internal and returns rax = 1.
  1836. ALIGN 32
  1837. bn_mulx4x_mont_gather5:
  1838. mov QWORD[8+rsp],rdi ;WIN64 prologue
  1839. mov QWORD[16+rsp],rsi
  1840. mov rax,rsp
  1841. $L$SEH_begin_bn_mulx4x_mont_gather5:
  1842. mov rdi,rcx
  1843. mov rsi,rdx
  1844. mov rdx,r8
  1845. mov rcx,r9
  1846. mov r8,QWORD[40+rsp]
  1847. mov r9,QWORD[48+rsp]
  1848. mov rax,rsp ; rax = rsp at entry; mulx4x_internal reads [56+rax]
  1849. $L$mulx4x_enter:
  1850. push rbx
  1851. push rbp
  1852. push r12
  1853. push r13
  1854. push r14
  1855. push r15
  1856. $L$mulx4x_prologue:
  1857. shl r9d,3 ; word count -> byte count
  1858. lea r10,[r9*2+r9] ; r10 = 3 * byte count
  1859. neg r9 ; r9 stays negative for mulx4x_internal
  1860. mov r8,QWORD[r8] ; dereference the pointer argument
  1861. lea r11,[((-320))+r9*2+rsp] ; tentative frame base: rsp - 320 - 2*bytes
  1862. mov rbp,rsp
  1863. sub r11,rdi
  1864. and r11,4095 ; offset between frame base and rdi within a 4 KiB page
  1865. cmp r10,r11
  1866. jb NEAR $L$mulx4xsp_alt
  1867. sub rbp,r11
  1868. lea rbp,[((-320))+r9*2+rbp]
  1869. jmp NEAR $L$mulx4xsp_done
  1870. $L$mulx4xsp_alt:
  1871. lea r10,[((4096-320))+r9*2]
  1872. lea rbp,[((-320))+r9*2+rbp]
  1873. sub r11,r10
  1874. mov r10,0 ; must be mov, not xor: cmovc below consumes CF from the sub
  1875. cmovc r11,r10
  1876. sub rbp,r11
  1877. $L$mulx4xsp_done:
  1878. and rbp,-64 ; 64-byte align the new frame
  1879. mov r11,rsp
  1880. sub r11,rbp
  1881. and r11,-4096
  1882. lea rsp,[rbp*1+r11]
  1883. mov r10,QWORD[rsp] ; touch the page
  1884. cmp rsp,rbp
  1885. ja NEAR $L$mulx4x_page_walk
  1886. jmp NEAR $L$mulx4x_page_walk_done
  ; Lower rsp one 4 KiB page at a time, reading from each page on the way.
  1887. $L$mulx4x_page_walk:
  1888. lea rsp,[((-4096))+rsp]
  1889. mov r10,QWORD[rsp]
  1890. cmp rsp,rbp
  1891. ja NEAR $L$mulx4x_page_walk
  1892. $L$mulx4x_page_walk_done:
  1893. mov QWORD[32+rsp],r8 ; read by the callee as [32+8+rsp]
  1894. mov QWORD[40+rsp],rax ; original rsp for the epilogue
  1895. $L$mulx4x_body:
  1896. call mulx4x_internal
  1897. mov rsi,QWORD[40+rsp] ; rsi = original rsp
  1898. mov rax,1 ; return value
  1899. mov r15,QWORD[((-48))+rsi] ; reload the six registers pushed above
  1900. mov r14,QWORD[((-40))+rsi]
  1901. mov r13,QWORD[((-32))+rsi]
  1902. mov r12,QWORD[((-24))+rsi]
  1903. mov rbp,QWORD[((-16))+rsi]
  1904. mov rbx,QWORD[((-8))+rsi]
  1905. lea rsp,[rsi]
  1906. $L$mulx4x_epilogue:
  1907. mov rdi,QWORD[8+rsp] ;WIN64 epilogue
  1908. mov rsi,QWORD[16+rsp]
  1909. DB 0F3h,0C3h ;repret
  1910. $L$SEH_end_bn_mulx4x_mont_gather5:
  ; mulx4x_internal -- shared body called from bn_mulx4x_mont_gather5 and
  ; bn_powerx5; built on MULX/ADCX/ADOX.
  ;
  ; As used below (the return address is on the stack, hence "+8" offsets):
  ;   rax = caller's original rsp; the dword at [56+rax] is the table index
  ;   rdx = base of the table the multiplier words are gathered from
  ;   rsi = first operand words; rcx = words used in the interleaved second
  ;         multiply (presumably the modulus n[] -- confirm)
  ;   r9  = negated byte length (saved at [8+rsp])
  ;   rdi = saved at [56+8+rsp], reloaded into rdx before the final jump
  ;   [32+8+rsp] = factor multiplied into the low word of every pass
  ; Does not end in `ret`: it finishes by jumping to $L$sqrx4x_sub_entry,
  ; which is defined outside this block.
  1911. ALIGN 32
  1912. mulx4x_internal:
  1913. mov QWORD[8+rsp],r9 ; save -bytes for rewinding pointers later
  1914. mov r10,r9
  1915. neg r9 ; r9 = +bytes
  1916. shl r9,5
  1917. neg r10 ; r10 = +bytes
  1918. lea r13,[128+r9*1+rdx] ; r13 = rdx + 128 + 32*bytes: loop bound for the table cursor
  1919. shr r9,5+5 ; r9 = bytes / 32
  1920. movd xmm5,DWORD[56+rax] ; table index from the caller's stack
  1921. sub r9,1 ; inner loop count (first 4 words are handled before the loop)
  1922. lea rax,[$L$inc]
  1923. mov QWORD[((16+8))+rsp],r13
  1924. mov QWORD[((24+8))+rsp],r9
  1925. mov QWORD[((56+8))+rsp],rdi
  1926. movdqa xmm0,XMMWORD[rax] ; counter seed and step come from $L$inc (defined elsewhere)
  1927. movdqa xmm1,XMMWORD[16+rax]
  1928. lea r10,[((88-112))+r10*1+rsp] ; r10 = base for the mask area ([112+r10] onwards)
  1929. lea rdi,[128+rdx] ; rdi = table cursor, biased by +128
  1930. pshufd xmm5,xmm5,0 ; broadcast the index to all four dwords
  1931. movdqa xmm4,xmm1
  1932. DB 0x67 ; stray prefix byte (presumably padding)
  1933. movdqa xmm2,xmm1
  1934. DB 0x67 ; stray prefix byte (presumably padding)
  ; Build sixteen 16-byte masks at [112+r10]..[352+r10]: each is pcmpeqd of
  ; a running counter vector against the broadcast index.
  1935. paddd xmm1,xmm0
  1936. pcmpeqd xmm0,xmm5
  1937. movdqa xmm3,xmm4
  1938. paddd xmm2,xmm1
  1939. pcmpeqd xmm1,xmm5
  1940. movdqa XMMWORD[112+r10],xmm0
  1941. movdqa xmm0,xmm4
  1942. paddd xmm3,xmm2
  1943. pcmpeqd xmm2,xmm5
  1944. movdqa XMMWORD[128+r10],xmm1
  1945. movdqa xmm1,xmm4
  1946. paddd xmm0,xmm3
  1947. pcmpeqd xmm3,xmm5
  1948. movdqa XMMWORD[144+r10],xmm2
  1949. movdqa xmm2,xmm4
  1950. paddd xmm1,xmm0
  1951. pcmpeqd xmm0,xmm5
  1952. movdqa XMMWORD[160+r10],xmm3
  1953. movdqa xmm3,xmm4
  1954. paddd xmm2,xmm1
  1955. pcmpeqd xmm1,xmm5
  1956. movdqa XMMWORD[176+r10],xmm0
  1957. movdqa xmm0,xmm4
  1958. paddd xmm3,xmm2
  1959. pcmpeqd xmm2,xmm5
  1960. movdqa XMMWORD[192+r10],xmm1
  1961. movdqa xmm1,xmm4
  1962. paddd xmm0,xmm3
  1963. pcmpeqd xmm3,xmm5
  1964. movdqa XMMWORD[208+r10],xmm2
  1965. movdqa xmm2,xmm4
  1966. paddd xmm1,xmm0
  1967. pcmpeqd xmm0,xmm5
  1968. movdqa XMMWORD[224+r10],xmm3
  1969. movdqa xmm3,xmm4
  1970. paddd xmm2,xmm1
  1971. pcmpeqd xmm1,xmm5
  1972. movdqa XMMWORD[240+r10],xmm0
  1973. movdqa xmm0,xmm4
  1974. paddd xmm3,xmm2
  1975. pcmpeqd xmm2,xmm5
  1976. movdqa XMMWORD[256+r10],xmm1
  1977. movdqa xmm1,xmm4
  1978. paddd xmm0,xmm3
  1979. pcmpeqd xmm3,xmm5
  1980. movdqa XMMWORD[272+r10],xmm2
  1981. movdqa xmm2,xmm4
  1982. paddd xmm1,xmm0
  1983. pcmpeqd xmm0,xmm5
  1984. movdqa XMMWORD[288+r10],xmm3
  1985. movdqa xmm3,xmm4
  1986. DB 0x67 ; stray prefix byte (presumably padding)
  1987. paddd xmm2,xmm1
  1988. pcmpeqd xmm1,xmm5
  1989. movdqa XMMWORD[304+r10],xmm0
  1990. paddd xmm3,xmm2
  1991. pcmpeqd xmm2,xmm5
  1992. movdqa XMMWORD[320+r10],xmm1
  1993. pcmpeqd xmm3,xmm5
  1994. movdqa XMMWORD[336+r10],xmm2
  ; Gather the first multiplier word: every 16-byte slot of the 256-byte
  ; table row is loaded and ANDed with its mask regardless of the index,
  ; then the results are combined. The last four masks are still in
  ; registers, so they are applied first.
  1995. pand xmm0,XMMWORD[64+rdi]
  1996. pand xmm1,XMMWORD[80+rdi]
  1997. pand xmm2,XMMWORD[96+rdi]
  1998. movdqa XMMWORD[352+r10],xmm3
  1999. pand xmm3,XMMWORD[112+rdi]
  2000. por xmm0,xmm2
  2001. por xmm1,xmm3
  2002. movdqa xmm4,XMMWORD[((-128))+rdi]
  2003. movdqa xmm5,XMMWORD[((-112))+rdi]
  2004. movdqa xmm2,XMMWORD[((-96))+rdi]
  2005. pand xmm4,XMMWORD[112+r10]
  2006. movdqa xmm3,XMMWORD[((-80))+rdi]
  2007. pand xmm5,XMMWORD[128+r10]
  2008. por xmm0,xmm4
  2009. pand xmm2,XMMWORD[144+r10]
  2010. por xmm1,xmm5
  2011. pand xmm3,XMMWORD[160+r10]
  2012. por xmm0,xmm2
  2013. por xmm1,xmm3
  2014. movdqa xmm4,XMMWORD[((-64))+rdi]
  2015. movdqa xmm5,XMMWORD[((-48))+rdi]
  2016. movdqa xmm2,XMMWORD[((-32))+rdi]
  2017. pand xmm4,XMMWORD[176+r10]
  2018. movdqa xmm3,XMMWORD[((-16))+rdi]
  2019. pand xmm5,XMMWORD[192+r10]
  2020. por xmm0,xmm4
  2021. pand xmm2,XMMWORD[208+r10]
  2022. por xmm1,xmm5
  2023. pand xmm3,XMMWORD[224+r10]
  2024. por xmm0,xmm2
  2025. por xmm1,xmm3
  2026. movdqa xmm4,XMMWORD[rdi]
  2027. movdqa xmm5,XMMWORD[16+rdi]
  2028. movdqa xmm2,XMMWORD[32+rdi]
  2029. pand xmm4,XMMWORD[240+r10]
  2030. movdqa xmm3,XMMWORD[48+rdi]
  2031. pand xmm5,XMMWORD[256+r10]
  2032. por xmm0,xmm4
  2033. pand xmm2,XMMWORD[272+r10]
  2034. por xmm1,xmm5
  2035. pand xmm3,XMMWORD[288+r10]
  2036. por xmm0,xmm2
  2037. por xmm1,xmm3
  2038. pxor xmm0,xmm1 ; merge the two accumulators (xor here, whereas $L$mulx4x_outer uses por)
  2039. pshufd xmm1,xmm0,0x4e ; swap the two qword halves
  2040. por xmm0,xmm1
  2041. lea rdi,[256+rdi] ; advance the table cursor by one 256-byte row
  2042. DB 102,72,15,126,194 ; encodes movq rdx,xmm0 (MULX takes its implicit operand from rdx)
  2043. lea rbx,[((64+32+8))+rsp] ; rbx = output cursor into the scratch vector
  ; First pass: multiply the gathered word by [rsi], then add the second
  ; product (r8 * [rcx]) on the ADCX/ADOX carry chains.
  2044. mov r9,rdx ; r9 keeps the gathered word while rdx is reused
  2045. mulx rax,r8,QWORD[rsi]
  2046. mulx r12,r11,QWORD[8+rsi]
  2047. add r11,rax
  2048. mulx r13,rax,QWORD[16+rsi]
  2049. adc r12,rax
  2050. adc r13,0
  2051. mulx r14,rax,QWORD[24+rsi]
  2052. mov r15,r8
  2053. imul r8,QWORD[((32+8))+rsp] ; r8 = low word * stored factor
  2054. xor rbp,rbp ; rbp = 0; also clears CF and OF for the two carry chains
  2055. mov rdx,r8
  2056. mov QWORD[((8+8))+rsp],rdi ; park the table cursor; rdi becomes the loop counter
  2057. lea rsi,[32+rsi]
  2058. adcx r13,rax
  2059. adcx r14,rbp
  2060. mulx r10,rax,QWORD[rcx]
  2061. adcx r15,rax ; low word: sum is not stored, only its carry is used
  2062. adox r10,r11
  2063. mulx r11,rax,QWORD[8+rcx]
  2064. adcx r10,rax
  2065. adox r11,r12
  2066. mulx r12,rax,QWORD[16+rcx]
  2067. mov rdi,QWORD[((24+8))+rsp] ; inner loop count
  2068. mov QWORD[((-32))+rbx],r10
  2069. adcx r11,rax
  2070. adox r12,r13
  2071. mulx r15,rax,QWORD[24+rcx]
  2072. mov rdx,r9 ; back to the gathered word
  2073. mov QWORD[((-24))+rbx],r11
  2074. adcx r12,rax
  2075. adox r15,rbp
  2076. lea rcx,[32+rcx]
  2077. mov QWORD[((-16))+rbx],r12
  2078. jmp NEAR $L$mulx4x_1st
  2079. ALIGN 32
  ; Remaining words of the first pass, four per iteration.
  2080. $L$mulx4x_1st:
  2081. adcx r15,rbp
  2082. mulx rax,r10,QWORD[rsi]
  2083. adcx r10,r14
  2084. mulx r14,r11,QWORD[8+rsi]
  2085. adcx r11,rax
  2086. mulx rax,r12,QWORD[16+rsi]
  2087. adcx r12,r14
  2088. mulx r14,r13,QWORD[24+rsi]
  2089. DB 0x67,0x67 ; stray prefix bytes (presumably padding)
  2090. mov rdx,r8 ; switch to the second multiplier
  2091. adcx r13,rax
  2092. adcx r14,rbp
  2093. lea rsi,[32+rsi]
  2094. lea rbx,[32+rbx]
  2095. adox r10,r15
  2096. mulx r15,rax,QWORD[rcx]
  2097. adcx r10,rax
  2098. adox r11,r15
  2099. mulx r15,rax,QWORD[8+rcx]
  2100. adcx r11,rax
  2101. adox r12,r15
  2102. mulx r15,rax,QWORD[16+rcx]
  2103. mov QWORD[((-40))+rbx],r10
  2104. adcx r12,rax
  2105. mov QWORD[((-32))+rbx],r11
  2106. adox r13,r15
  2107. mulx r15,rax,QWORD[24+rcx]
  2108. mov rdx,r9
  2109. mov QWORD[((-24))+rbx],r12
  2110. adcx r13,rax
  2111. adox r15,rbp
  2112. lea rcx,[32+rcx]
  2113. mov QWORD[((-16))+rbx],r13
  2114. dec rdi ; dec leaves CF alone, so the ADCX chain carries into the next iteration
  2115. jnz NEAR $L$mulx4x_1st
  2116. mov rax,QWORD[8+rsp] ; rax = -bytes
  2117. adc r15,rbp
  2118. lea rsi,[rax*1+rsi] ; rewind rsi to the first word
  2119. add r14,r15
  2120. mov rdi,QWORD[((8+8))+rsp] ; restore the table cursor
  2121. adc rbp,rbp ; rbp = top-most carry of this pass
  2122. mov QWORD[((-8))+rbx],r14
  2123. jmp NEAR $L$mulx4x_outer
  2124. ALIGN 32
  ; Outer loop: gather the next multiplier word with the stored masks,
  ; then run another multiply-accumulate pass over the scratch vector.
  2125. $L$mulx4x_outer:
  2126. lea r10,[((16-256))+rbx] ; r10 rebased so the masks sit at [256+r10]..[496+r10]
  2127. pxor xmm4,xmm4
  2128. DB 0x67,0x67 ; stray prefix bytes (presumably padding)
  2129. pxor xmm5,xmm5
  2130. movdqa xmm0,XMMWORD[((-128))+rdi]
  2131. movdqa xmm1,XMMWORD[((-112))+rdi]
  2132. movdqa xmm2,XMMWORD[((-96))+rdi]
  2133. pand xmm0,XMMWORD[256+r10]
  2134. movdqa xmm3,XMMWORD[((-80))+rdi]
  2135. pand xmm1,XMMWORD[272+r10]
  2136. por xmm4,xmm0
  2137. pand xmm2,XMMWORD[288+r10]
  2138. por xmm5,xmm1
  2139. pand xmm3,XMMWORD[304+r10]
  2140. por xmm4,xmm2
  2141. por xmm5,xmm3
  2142. movdqa xmm0,XMMWORD[((-64))+rdi]
  2143. movdqa xmm1,XMMWORD[((-48))+rdi]
  2144. movdqa xmm2,XMMWORD[((-32))+rdi]
  2145. pand xmm0,XMMWORD[320+r10]
  2146. movdqa xmm3,XMMWORD[((-16))+rdi]
  2147. pand xmm1,XMMWORD[336+r10]
  2148. por xmm4,xmm0
  2149. pand xmm2,XMMWORD[352+r10]
  2150. por xmm5,xmm1
  2151. pand xmm3,XMMWORD[368+r10]
  2152. por xmm4,xmm2
  2153. por xmm5,xmm3
  2154. movdqa xmm0,XMMWORD[rdi]
  2155. movdqa xmm1,XMMWORD[16+rdi]
  2156. movdqa xmm2,XMMWORD[32+rdi]
  2157. pand xmm0,XMMWORD[384+r10]
  2158. movdqa xmm3,XMMWORD[48+rdi]
  2159. pand xmm1,XMMWORD[400+r10]
  2160. por xmm4,xmm0
  2161. pand xmm2,XMMWORD[416+r10]
  2162. por xmm5,xmm1
  2163. pand xmm3,XMMWORD[432+r10]
  2164. por xmm4,xmm2
  2165. por xmm5,xmm3
  2166. movdqa xmm0,XMMWORD[64+rdi]
  2167. movdqa xmm1,XMMWORD[80+rdi]
  2168. movdqa xmm2,XMMWORD[96+rdi]
  2169. pand xmm0,XMMWORD[448+r10]
  2170. movdqa xmm3,XMMWORD[112+rdi]
  2171. pand xmm1,XMMWORD[464+r10]
  2172. por xmm4,xmm0
  2173. pand xmm2,XMMWORD[480+r10]
  2174. por xmm5,xmm1
  2175. pand xmm3,XMMWORD[496+r10]
  2176. por xmm4,xmm2
  2177. por xmm5,xmm3
  2178. por xmm4,xmm5
  2179. pshufd xmm0,xmm4,0x4e ; swap the two qword halves
  2180. por xmm0,xmm4
  2181. lea rdi,[256+rdi] ; next 256-byte table row
  2182. DB 102,72,15,126,194 ; encodes movq rdx,xmm0 (next multiplier word)
  2183. mov QWORD[rbx],rbp ; store the previous pass's top-most carry
  2184. lea rbx,[32+rax*1+rbx] ; rewind rbx (rax = -bytes) and step past the first 4 words
  2185. mulx r11,r8,QWORD[rsi]
  2186. xor rbp,rbp ; rbp = 0; clears CF and OF
  2187. mov r9,rdx
  2188. mulx r12,r14,QWORD[8+rsi]
  2189. adox r8,QWORD[((-32))+rbx] ; add the existing scratch words on the OF chain
  2190. adcx r11,r14
  2191. mulx r13,r15,QWORD[16+rsi]
  2192. adox r11,QWORD[((-24))+rbx]
  2193. adcx r12,r15
  2194. mulx r14,rdx,QWORD[24+rsi]
  2195. adox r12,QWORD[((-16))+rbx]
  2196. adcx r13,rdx
  2197. lea rcx,[rax*1+rcx] ; rewind rcx
  2198. lea rsi,[32+rsi]
  2199. adox r13,QWORD[((-8))+rbx]
  2200. adcx r14,rbp
  2201. adox r14,rbp
  2202. mov r15,r8
  2203. imul r8,QWORD[((32+8))+rsp] ; r8 = low word * stored factor
  2204. mov rdx,r8
  2205. xor rbp,rbp ; reset both carry chains
  2206. mov QWORD[((8+8))+rsp],rdi ; park the table cursor again
  2207. mulx r10,rax,QWORD[rcx]
  2208. adcx r15,rax ; low word: sum is not stored, only its carry is used
  2209. adox r10,r11
  2210. mulx r11,rax,QWORD[8+rcx]
  2211. adcx r10,rax
  2212. adox r11,r12
  2213. mulx r12,rax,QWORD[16+rcx]
  2214. adcx r11,rax
  2215. adox r12,r13
  2216. mulx r15,rax,QWORD[24+rcx]
  2217. mov rdx,r9
  2218. mov rdi,QWORD[((24+8))+rsp] ; inner loop count
  2219. mov QWORD[((-32))+rbx],r10
  2220. adcx r12,rax
  2221. mov QWORD[((-24))+rbx],r11
  2222. adox r15,rbp
  2223. mov QWORD[((-16))+rbx],r12
  2224. lea rcx,[32+rcx]
  2225. jmp NEAR $L$mulx4x_inner
  2226. ALIGN 32
  ; Inner loop: four words per iteration; both products are accumulated
  ; on top of the existing scratch words at [rbx].
  2227. $L$mulx4x_inner:
  2228. mulx rax,r10,QWORD[rsi]
  2229. adcx r15,rbp
  2230. adox r10,r14
  2231. mulx r14,r11,QWORD[8+rsi]
  2232. adcx r10,QWORD[rbx]
  2233. adox r11,rax
  2234. mulx rax,r12,QWORD[16+rsi]
  2235. adcx r11,QWORD[8+rbx]
  2236. adox r12,r14
  2237. mulx r14,r13,QWORD[24+rsi]
  2238. mov rdx,r8 ; switch to the second multiplier
  2239. adcx r12,QWORD[16+rbx]
  2240. adox r13,rax
  2241. adcx r13,QWORD[24+rbx]
  2242. adox r14,rbp
  2243. lea rsi,[32+rsi]
  2244. lea rbx,[32+rbx]
  2245. adcx r14,rbp
  2246. adox r10,r15
  2247. mulx r15,rax,QWORD[rcx]
  2248. adcx r10,rax
  2249. adox r11,r15
  2250. mulx r15,rax,QWORD[8+rcx]
  2251. adcx r11,rax
  2252. adox r12,r15
  2253. mulx r15,rax,QWORD[16+rcx]
  2254. mov QWORD[((-40))+rbx],r10
  2255. adcx r12,rax
  2256. adox r13,r15
  2257. mov QWORD[((-32))+rbx],r11
  2258. mulx r15,rax,QWORD[24+rcx]
  2259. mov rdx,r9
  2260. lea rcx,[32+rcx]
  2261. mov QWORD[((-24))+rbx],r12
  2262. adcx r13,rax
  2263. adox r15,rbp
  2264. mov QWORD[((-16))+rbx],r13
  2265. dec rdi ; dec leaves CF alone, so the ADCX chain carries into the next iteration
  2266. jnz NEAR $L$mulx4x_inner
  2267. mov rax,QWORD[((0+8))+rsp] ; rax = -bytes
  2268. adc r15,rbp
  2269. sub rdi,QWORD[rbx] ; rdi is 0 here: CF = (saved top-most carry != 0)
  2270. mov rdi,QWORD[((8+8))+rsp] ; restore the table cursor
  2271. mov r10,QWORD[((16+8))+rsp] ; loop bound for the table cursor
  2272. adc r14,r15
  2273. lea rsi,[rax*1+rsi] ; rewind rsi
  2274. adc rbp,rbp ; rbp = new top-most carry
  2275. mov QWORD[((-8))+rbx],r14
  2276. cmp rdi,r10
  2277. jb NEAR $L$mulx4x_outer
  ; Set up registers for the shared subtraction code at
  ; $L$sqrx4x_sub_entry (outside this block).
  2278. mov r10,QWORD[((-8))+rcx] ; last word of the rcx vector
  2279. mov r8,rbp ; top-most carry
  2280. mov r12,QWORD[rax*1+rcx] ; first word of the rcx vector
  2281. lea rbp,[rax*1+rcx] ; rbp = start of the rcx vector
  2282. mov rcx,rax
  2283. lea rdi,[rax*1+rbx] ; rdi = start of the scratch result
  2284. xor eax,eax
  2285. xor r15,r15
  2286. sub r10,r14 ; CF = (last rcx word < top result word)
  2287. adc r15,r15 ; r15 = that borrow
  2288. or r8,r15
  2289. sar rcx,3+2 ; rcx = -bytes / 32 (negative count of 4-word groups)
  2290. sub rax,r8 ; rax = 0 - (carry | borrow)
  2291. mov rdx,QWORD[((56+8))+rsp] ; reload the rdi value saved on entry
  2292. dec r12
  2293. mov r13,QWORD[8+rbp]
  2294. xor r8,r8
  2295. mov r14,QWORD[16+rbp]
  2296. mov r15,QWORD[24+rbp]
  2297. jmp NEAR $L$sqrx4x_sub_entry
  ; bn_powerx5 -- Win64 entry that runs five back-to-back
  ; __bn_sqrx8x_internal / __bn_postx4x_internal pairs followed by one
  ; mulx4x_internal call, then returns rax = 1.
  ;
  ; Win64 entry: rcx, rdx, r8, r9 and the stack slots [40+rsp], [48+rsp] are
  ; moved into rdi, rsi, rdx, rcx, r8, r9. $L$powerx5_enter is a second
  ; entry label placed after that shuffle (not referenced in this block).
  ; rdi, rcx, -bytes and rdx are parked in xmm1..xmm4 across the helper
  ; calls; the helpers themselves are defined outside this block.
  2298. ALIGN 32
  2299. bn_powerx5:
  2300. mov QWORD[8+rsp],rdi ;WIN64 prologue
  2301. mov QWORD[16+rsp],rsi
  2302. mov rax,rsp
  2303. $L$SEH_begin_bn_powerx5:
  2304. mov rdi,rcx
  2305. mov rsi,rdx
  2306. mov rdx,r8
  2307. mov rcx,r9
  2308. mov r8,QWORD[40+rsp]
  2309. mov r9,QWORD[48+rsp]
  2310. mov rax,rsp ; rax = rsp at entry, saved later at [40+rsp]
  2311. $L$powerx5_enter:
  2312. push rbx
  2313. push rbp
  2314. push r12
  2315. push r13
  2316. push r14
  2317. push r15
  2318. $L$powerx5_prologue:
  2319. shl r9d,3 ; word count -> byte count
  2320. lea r10,[r9*2+r9] ; r10 = 3 * byte count
  2321. neg r9
  2322. mov r8,QWORD[r8] ; dereference the pointer argument
  2323. lea r11,[((-320))+r9*2+rsp] ; tentative frame base: rsp - 320 - 2*bytes
  2324. mov rbp,rsp
  2325. sub r11,rdi
  2326. and r11,4095 ; offset between frame base and rdi within a 4 KiB page
  2327. cmp r10,r11
  2328. jb NEAR $L$pwrx_sp_alt
  2329. sub rbp,r11
  2330. lea rbp,[((-320))+r9*2+rbp]
  2331. jmp NEAR $L$pwrx_sp_done
  2332. ALIGN 32
  2333. $L$pwrx_sp_alt:
  2334. lea r10,[((4096-320))+r9*2]
  2335. lea rbp,[((-320))+r9*2+rbp]
  2336. sub r11,r10
  2337. mov r10,0 ; must be mov, not xor: cmovc below consumes CF from the sub
  2338. cmovc r11,r10
  2339. sub rbp,r11
  2340. $L$pwrx_sp_done:
  2341. and rbp,-64 ; 64-byte align the new frame
  2342. mov r11,rsp
  2343. sub r11,rbp
  2344. and r11,-4096
  2345. lea rsp,[rbp*1+r11]
  2346. mov r10,QWORD[rsp] ; touch the page
  2347. cmp rsp,rbp
  2348. ja NEAR $L$pwrx_page_walk
  2349. jmp NEAR $L$pwrx_page_walk_done
  ; Lower rsp one 4 KiB page at a time, reading from each page on the way.
  2350. $L$pwrx_page_walk:
  2351. lea rsp,[((-4096))+rsp]
  2352. mov r10,QWORD[rsp]
  2353. cmp rsp,rbp
  2354. ja NEAR $L$pwrx_page_walk
  2355. $L$pwrx_page_walk_done:
  2356. mov r10,r9 ; r10 = -bytes
  2357. neg r9 ; r9 = +bytes
  2358. pxor xmm0,xmm0
  2359. DB 102,72,15,110,207 ; encodes movq xmm1,rdi
  2360. DB 102,72,15,110,209 ; encodes movq xmm2,rcx
  2361. DB 102,73,15,110,218 ; encodes movq xmm3,r10
  2362. DB 102,72,15,110,226 ; encodes movq xmm4,rdx
  2363. mov QWORD[32+rsp],r8 ; read by the helpers as [32+8+rsp]
  2364. mov QWORD[40+rsp],rax ; original rsp for the epilogue
  2365. $L$powerx5_body:
  2366. call __bn_sqrx8x_internal
  2367. call __bn_postx4x_internal
  2368. call __bn_sqrx8x_internal
  2369. call __bn_postx4x_internal
  2370. call __bn_sqrx8x_internal
  2371. call __bn_postx4x_internal
  2372. call __bn_sqrx8x_internal
  2373. call __bn_postx4x_internal
  2374. call __bn_sqrx8x_internal
  2375. call __bn_postx4x_internal
  2376. mov r9,r10 ; presumably r10 = -bytes as left by the post helper (not visible here)
  2377. mov rdi,rsi
  2378. DB 102,72,15,126,209 ; encodes movq rcx,xmm2
  2379. DB 102,72,15,126,226 ; encodes movq rdx,xmm4
  2380. mov rax,QWORD[40+rsp] ; mulx4x_internal reads the table index at [56+rax]
  2381. call mulx4x_internal
  2382. mov rsi,QWORD[40+rsp] ; rsi = original rsp
  2383. mov rax,1 ; return value
  2384. mov r15,QWORD[((-48))+rsi] ; reload the six registers pushed above
  2385. mov r14,QWORD[((-40))+rsi]
  2386. mov r13,QWORD[((-32))+rsi]
  2387. mov r12,QWORD[((-24))+rsi]
  2388. mov rbp,QWORD[((-16))+rsi]
  2389. mov rbx,QWORD[((-8))+rsi]
  2390. lea rsp,[rsi]
  2391. $L$powerx5_epilogue:
  2392. mov rdi,QWORD[8+rsp] ;WIN64 epilogue
  2393. mov rsi,QWORD[16+rsp]
  2394. DB 0F3h,0C3h ;repret
  2395. $L$SEH_end_bn_powerx5:
  2396. global GFp_bn_sqrx8x_internal
  2397. ALIGN 32
  2398. GFp_bn_sqrx8x_internal:
  2399. __bn_sqrx8x_internal:
  2400. lea rdi,[((48+8))+rsp]
  2401. lea rbp,[r9*1+rsi]
  2402. mov QWORD[((0+8))+rsp],r9
  2403. mov QWORD[((8+8))+rsp],rbp
  2404. jmp NEAR $L$sqr8x_zero_start
  2405. ALIGN 32
  2406. DB 0x66,0x66,0x66,0x2e,0x0f,0x1f,0x84,0x00,0x00,0x00,0x00,0x00
  2407. $L$sqrx8x_zero:
  2408. DB 0x3e
  2409. movdqa XMMWORD[rdi],xmm0
  2410. movdqa XMMWORD[16+rdi],xmm0
  2411. movdqa XMMWORD[32+rdi],xmm0
  2412. movdqa XMMWORD[48+rdi],xmm0
  2413. $L$sqr8x_zero_start:
  2414. movdqa XMMWORD[64+rdi],xmm0
  2415. movdqa XMMWORD[80+rdi],xmm0
  2416. movdqa XMMWORD[96+rdi],xmm0
  2417. movdqa XMMWORD[112+rdi],xmm0
  2418. lea rdi,[128+rdi]
  2419. sub r9,64
  2420. jnz NEAR $L$sqrx8x_zero
  2421. mov rdx,QWORD[rsi]
  2422. xor r10,r10
  2423. xor r11,r11
  2424. xor r12,r12
  2425. xor r13,r13
  2426. xor r14,r14
  2427. xor r15,r15
  2428. lea rdi,[((48+8))+rsp]
  2429. xor rbp,rbp
  2430. jmp NEAR $L$sqrx8x_outer_loop
  2431. ALIGN 32
  2432. $L$sqrx8x_outer_loop:
  2433. mulx rax,r8,QWORD[8+rsi]
  2434. adcx r8,r9
  2435. adox r10,rax
  2436. mulx rax,r9,QWORD[16+rsi]
  2437. adcx r9,r10
  2438. adox r11,rax
  2439. DB 0xc4,0xe2,0xab,0xf6,0x86,0x18,0x00,0x00,0x00
  2440. adcx r10,r11
  2441. adox r12,rax
  2442. DB 0xc4,0xe2,0xa3,0xf6,0x86,0x20,0x00,0x00,0x00
  2443. adcx r11,r12
  2444. adox r13,rax
  2445. mulx rax,r12,QWORD[40+rsi]
  2446. adcx r12,r13
  2447. adox r14,rax
  2448. mulx rax,r13,QWORD[48+rsi]
  2449. adcx r13,r14
  2450. adox rax,r15
  2451. mulx r15,r14,QWORD[56+rsi]
  2452. mov rdx,QWORD[8+rsi]
  2453. adcx r14,rax
  2454. adox r15,rbp
  2455. adc r15,QWORD[64+rdi]
  2456. mov QWORD[8+rdi],r8
  2457. mov QWORD[16+rdi],r9
  2458. sbb rcx,rcx
  2459. xor rbp,rbp
  2460. mulx rbx,r8,QWORD[16+rsi]
  2461. mulx rax,r9,QWORD[24+rsi]
  2462. adcx r8,r10
  2463. adox r9,rbx
  2464. mulx rbx,r10,QWORD[32+rsi]
  2465. adcx r9,r11
  2466. adox r10,rax
  2467. DB 0xc4,0xe2,0xa3,0xf6,0x86,0x28,0x00,0x00,0x00
  2468. adcx r10,r12
  2469. adox r11,rbx
  2470. DB 0xc4,0xe2,0x9b,0xf6,0x9e,0x30,0x00,0x00,0x00
  2471. adcx r11,r13
  2472. adox r12,r14
  2473. DB 0xc4,0x62,0x93,0xf6,0xb6,0x38,0x00,0x00,0x00
  2474. mov rdx,QWORD[16+rsi]
  2475. adcx r12,rax
  2476. adox r13,rbx
  2477. adcx r13,r15
  2478. adox r14,rbp
  2479. adcx r14,rbp
  2480. mov QWORD[24+rdi],r8
  2481. mov QWORD[32+rdi],r9
  2482. mulx rbx,r8,QWORD[24+rsi]
  2483. mulx rax,r9,QWORD[32+rsi]
  2484. adcx r8,r10
  2485. adox r9,rbx
  2486. mulx rbx,r10,QWORD[40+rsi]
  2487. adcx r9,r11
  2488. adox r10,rax
  2489. DB 0xc4,0xe2,0xa3,0xf6,0x86,0x30,0x00,0x00,0x00
  2490. adcx r10,r12
  2491. adox r11,r13
  2492. DB 0xc4,0x62,0x9b,0xf6,0xae,0x38,0x00,0x00,0x00
  2493. DB 0x3e
  2494. mov rdx,QWORD[24+rsi]
  2495. adcx r11,rbx
  2496. adox r12,rax
  2497. adcx r12,r14
  2498. mov QWORD[40+rdi],r8
  2499. mov QWORD[48+rdi],r9
  2500. mulx rax,r8,QWORD[32+rsi]
  2501. adox r13,rbp
  2502. adcx r13,rbp
  2503. mulx rbx,r9,QWORD[40+rsi]
  2504. adcx r8,r10
  2505. adox r9,rax
  2506. mulx rax,r10,QWORD[48+rsi]
  2507. adcx r9,r11
  2508. adox r10,r12
  2509. mulx r12,r11,QWORD[56+rsi]
  2510. mov rdx,QWORD[32+rsi]
  2511. mov r14,QWORD[40+rsi]
  2512. adcx r10,rbx
  2513. adox r11,rax
  2514. mov r15,QWORD[48+rsi]
  2515. adcx r11,r13
  2516. adox r12,rbp
  2517. adcx r12,rbp
  2518. mov QWORD[56+rdi],r8
  2519. mov QWORD[64+rdi],r9
  2520. mulx rax,r9,r14
  2521. mov r8,QWORD[56+rsi]
  2522. adcx r9,r10
  2523. mulx rbx,r10,r15
  2524. adox r10,rax
  2525. adcx r10,r11
  2526. mulx rax,r11,r8
  2527. mov rdx,r14
  2528. adox r11,rbx
  2529. adcx r11,r12
  2530. adcx rax,rbp
  2531. mulx rbx,r14,r15
  2532. mulx r13,r12,r8
  2533. mov rdx,r15
  2534. lea rsi,[64+rsi]
  2535. adcx r11,r14
  2536. adox r12,rbx
  2537. adcx r12,rax
  2538. adox r13,rbp
  2539. DB 0x67,0x67
  2540. mulx r14,r8,r8
  2541. adcx r13,r8
  2542. adcx r14,rbp
  2543. cmp rsi,QWORD[((8+8))+rsp]
  2544. je NEAR $L$sqrx8x_outer_break
  2545. neg rcx
  2546. mov rcx,-8
  2547. mov r15,rbp
  2548. mov r8,QWORD[64+rdi]
  2549. adcx r9,QWORD[72+rdi]
  2550. adcx r10,QWORD[80+rdi]
  2551. adcx r11,QWORD[88+rdi]
  2552. adc r12,QWORD[96+rdi]
  2553. adc r13,QWORD[104+rdi]
  2554. adc r14,QWORD[112+rdi]
  2555. adc r15,QWORD[120+rdi]
  2556. lea rbp,[rsi]
  2557. lea rdi,[128+rdi]
  2558. sbb rax,rax
  2559. mov rdx,QWORD[((-64))+rsi]
  2560. mov QWORD[((16+8))+rsp],rax
  2561. mov QWORD[((24+8))+rsp],rdi
  2562. xor eax,eax
  2563. jmp NEAR $L$sqrx8x_loop
  2564. ALIGN 32
  2565. $L$sqrx8x_loop:
  2566. mov rbx,r8
  2567. mulx r8,rax,QWORD[rbp]
  2568. adcx rbx,rax
  2569. adox r8,r9
  2570. mulx r9,rax,QWORD[8+rbp]
  2571. adcx r8,rax
  2572. adox r9,r10
  2573. mulx r10,rax,QWORD[16+rbp]
  2574. adcx r9,rax
  2575. adox r10,r11
  2576. mulx r11,rax,QWORD[24+rbp]
  2577. adcx r10,rax
  2578. adox r11,r12
  2579. DB 0xc4,0x62,0xfb,0xf6,0xa5,0x20,0x00,0x00,0x00
  2580. adcx r11,rax
  2581. adox r12,r13
  2582. mulx r13,rax,QWORD[40+rbp]
  2583. adcx r12,rax
  2584. adox r13,r14
  2585. mulx r14,rax,QWORD[48+rbp]
  2586. mov QWORD[rcx*8+rdi],rbx
  2587. mov ebx,0
  2588. adcx r13,rax
  2589. adox r14,r15
  2590. DB 0xc4,0x62,0xfb,0xf6,0xbd,0x38,0x00,0x00,0x00
  2591. mov rdx,QWORD[8+rcx*8+rsi]
  2592. adcx r14,rax
  2593. adox r15,rbx
  2594. adcx r15,rbx
  2595. DB 0x67
  2596. inc rcx
  2597. jnz NEAR $L$sqrx8x_loop
  2598. lea rbp,[64+rbp]
  2599. mov rcx,-8
  2600. cmp rbp,QWORD[((8+8))+rsp]
  2601. je NEAR $L$sqrx8x_break
  2602. sub rbx,QWORD[((16+8))+rsp]
  2603. DB 0x66
  2604. mov rdx,QWORD[((-64))+rsi]
  2605. adcx r8,QWORD[rdi]
  2606. adcx r9,QWORD[8+rdi]
  2607. adc r10,QWORD[16+rdi]
  2608. adc r11,QWORD[24+rdi]
  2609. adc r12,QWORD[32+rdi]
  2610. adc r13,QWORD[40+rdi]
  2611. adc r14,QWORD[48+rdi]
  2612. adc r15,QWORD[56+rdi]
  2613. lea rdi,[64+rdi]
  2614. DB 0x67
  2615. sbb rax,rax
  2616. xor ebx,ebx
  2617. mov QWORD[((16+8))+rsp],rax
  2618. jmp NEAR $L$sqrx8x_loop
  2619. ALIGN 32
  2620. $L$sqrx8x_break:
  2621. xor rbp,rbp
  2622. sub rbx,QWORD[((16+8))+rsp]
  2623. adcx r8,rbp
  2624. mov rcx,QWORD[((24+8))+rsp]
  2625. adcx r9,rbp
  2626. mov rdx,QWORD[rsi]
  2627. adc r10,0
  2628. mov QWORD[rdi],r8
  2629. adc r11,0
  2630. adc r12,0
  2631. adc r13,0
  2632. adc r14,0
  2633. adc r15,0
  2634. cmp rdi,rcx
  2635. je NEAR $L$sqrx8x_outer_loop
  2636. mov QWORD[8+rdi],r9
  2637. mov r9,QWORD[8+rcx]
  2638. mov QWORD[16+rdi],r10
  2639. mov r10,QWORD[16+rcx]
  2640. mov QWORD[24+rdi],r11
  2641. mov r11,QWORD[24+rcx]
  2642. mov QWORD[32+rdi],r12
  2643. mov r12,QWORD[32+rcx]
  2644. mov QWORD[40+rdi],r13
  2645. mov r13,QWORD[40+rcx]
  2646. mov QWORD[48+rdi],r14
  2647. mov r14,QWORD[48+rcx]
  2648. mov QWORD[56+rdi],r15
  2649. mov r15,QWORD[56+rcx]
  2650. mov rdi,rcx
  2651. jmp NEAR $L$sqrx8x_outer_loop
  2652. ALIGN 32
  2653. $L$sqrx8x_outer_break:
  2654. mov QWORD[72+rdi],r9
  2655. DB 102,72,15,126,217
  2656. mov QWORD[80+rdi],r10
  2657. mov QWORD[88+rdi],r11
  2658. mov QWORD[96+rdi],r12
  2659. mov QWORD[104+rdi],r13
  2660. mov QWORD[112+rdi],r14
  2661. lea rdi,[((48+8))+rsp]
  2662. mov rdx,QWORD[rcx*1+rsi]
  2663. mov r11,QWORD[8+rdi]
  2664. xor r10,r10
  2665. mov r9,QWORD[((0+8))+rsp]
  2666. adox r11,r11
  2667. mov r12,QWORD[16+rdi]
  2668. mov r13,QWORD[24+rdi]
  2669. ALIGN 32
  2670. $L$sqrx4x_shift_n_add:
  2671. mulx rbx,rax,rdx
  2672. adox r12,r12
  2673. adcx rax,r10
  2674. DB 0x48,0x8b,0x94,0x0e,0x08,0x00,0x00,0x00
  2675. DB 0x4c,0x8b,0x97,0x20,0x00,0x00,0x00
  2676. adox r13,r13
  2677. adcx rbx,r11
  2678. mov r11,QWORD[40+rdi]
  2679. mov QWORD[rdi],rax
  2680. mov QWORD[8+rdi],rbx
  2681. mulx rbx,rax,rdx
  2682. adox r10,r10
  2683. adcx rax,r12
  2684. mov rdx,QWORD[16+rcx*1+rsi]
  2685. mov r12,QWORD[48+rdi]
  2686. adox r11,r11
  2687. adcx rbx,r13
  2688. mov r13,QWORD[56+rdi]
  2689. mov QWORD[16+rdi],rax
  2690. mov QWORD[24+rdi],rbx
  2691. mulx rbx,rax,rdx
  2692. adox r12,r12
  2693. adcx rax,r10
  2694. mov rdx,QWORD[24+rcx*1+rsi]
  2695. lea rcx,[32+rcx]
  2696. mov r10,QWORD[64+rdi]
  2697. adox r13,r13
  2698. adcx rbx,r11
  2699. mov r11,QWORD[72+rdi]
  2700. mov QWORD[32+rdi],rax
  2701. mov QWORD[40+rdi],rbx
  2702. mulx rbx,rax,rdx
  2703. adox r10,r10
  2704. adcx rax,r12
  2705. jrcxz $L$sqrx4x_shift_n_add_break
  2706. DB 0x48,0x8b,0x94,0x0e,0x00,0x00,0x00,0x00
  2707. adox r11,r11
  2708. adcx rbx,r13
  2709. mov r12,QWORD[80+rdi]
  2710. mov r13,QWORD[88+rdi]
  2711. mov QWORD[48+rdi],rax
  2712. mov QWORD[56+rdi],rbx
  2713. lea rdi,[64+rdi]
  2714. nop
  2715. jmp NEAR $L$sqrx4x_shift_n_add
  2716. ALIGN 32
  2717. $L$sqrx4x_shift_n_add_break:
  2718. adcx rbx,r13
  2719. mov QWORD[48+rdi],rax
  2720. mov QWORD[56+rdi],rbx
  2721. lea rdi,[64+rdi]
  2722. DB 102,72,15,126,213
  ; ----------------------------------------------------------------------
  ; __bn_sqrx8x_reduction
  ;
  ; Word-by-word reduction of the multi-word value stored on the stack
  ; starting at [48+8+rsp], done in windows of eight 64-bit words using
  ; MULX with the two independent ADCX (CF) / ADOX (OF) carry chains.
  ; The label is also reached by fall-through from the code directly above.
  ;
  ; Values read on entry (as used by the code below):
  ;   rbp          pointer to the array that every multiplier is multiplied
  ;                against (presumably the modulus n - confirm with caller)
  ;   r9           byte length of that array (rbp+r9-64 = its last block)
  ;   rdi          end-of-buffer limit, saved and compared at the bottom
  ;   xmm2, xmm3   reloaded into rbp and rcx after every window
  ;   [32+8+rsp]   per-word multiplier constant (presumably n0 - confirm)
  ;
  ; Stack slots written here:
  ;   [0+8+rsp]    address of the last 64-byte block of the rbp array
  ;   [8+8+rsp]    entry value of rdi (loop limit)
  ;   [16+8+rsp]   carry between tail passes, kept as 0 / -1
  ;   [24+8+rsp]   top-most carry of the previous window
  ;   [48+8+rsp].. first eight words are reused to park the eight
  ;                multipliers produced by $L$sqrx8x_reduce
  ;
  ; Clobbers rax, rbx, rcx, rdx, rsi, rdi, r8-r15 and flags.
  ; ----------------------------------------------------------------------
  2723. __bn_sqrx8x_reduction:
  2724. xor eax,eax ; rax = 0: initial top-most carry
  2725. mov rbx,QWORD[((32+8))+rsp] ; rbx = multiplier constant
  2726. mov rdx,QWORD[((48+8))+rsp] ; rdx = first word of the buffer
  2727. lea rcx,[((-64))+r9*1+rbp] ; rcx = rbp + r9 - 64
  2728. mov QWORD[((0+8))+rsp],rcx
  2729. mov QWORD[((8+8))+rsp],rdi
  2730. lea rdi,[((48+8))+rsp] ; rdi = start of the first window
  2731. jmp NEAR $L$sqrx8x_reduction_loop
  2732. ALIGN 32
  ; Outer loop: one pass per 8-word window. r8..r15 hold the window,
  ; rdx becomes (low word * rbx), the multiplier for the first inner step.
  2733. $L$sqrx8x_reduction_loop:
  2734. mov r9,QWORD[8+rdi]
  2735. mov r10,QWORD[16+rdi]
  2736. mov r11,QWORD[24+rdi]
  2737. mov r12,QWORD[32+rdi]
  2738. mov r8,rdx
  2739. imul rdx,rbx
  2740. mov r13,QWORD[40+rdi]
  2741. mov r14,QWORD[48+rdi]
  2742. mov r15,QWORD[56+rdi]
  2743. mov QWORD[((24+8))+rsp],rax ; park the top-most carry
  2744. lea rdi,[64+rdi]
  2745. xor rsi,rsi ; rsi = 0, CF = OF = 0 for the carry chains
  2746. mov rcx,-8 ; eight inner iterations, counted up to zero
  2747. jmp NEAR $L$sqrx8x_reduce
  2748. ALIGN 32
  ; Inner loop: accumulator += rdx * [rbp+0..56]. The low word of the sum
  ; lands in rax and is overwritten (discarded), so the accumulator shifts
  ; down by one word per iteration.
  2749. $L$sqrx8x_reduce:
  2750. mov rbx,r8
  2751. mulx r8,rax,QWORD[rbp]
  2752. adcx rax,rbx ; result discarded; only the carry matters
  2753. adox r8,r9
  2754. mulx r9,rbx,QWORD[8+rbp]
  2755. adcx r8,rbx
  2756. adox r9,r10
  2757. mulx r10,rbx,QWORD[16+rbp]
  2758. adcx r9,rbx
  2759. adox r10,r11
  2760. mulx r11,rbx,QWORD[24+rbp]
  2761. adcx r10,rbx
  2762. adox r11,r12
  ; Hand-encoded: mulx r12,rbx,QWORD[32+rbp] with a 32-bit displacement.
  2763. DB 0xc4,0x62,0xe3,0xf6,0xa5,0x20,0x00,0x00,0x00
  2764. mov rax,rdx ; keep the current multiplier in rax
  2765. mov rdx,r8
  2766. adcx r11,rbx
  2767. adox r12,r13
  ; rbx = low half of (new low word * constant) = next multiplier;
  ; the high half written to rdx is thrown away by the next mov.
  2768. mulx rdx,rbx,QWORD[((32+8))+rsp]
  2769. mov rdx,rax
  2770. mov QWORD[((64+48+8))+rcx*8+rsp],rax ; park the current multiplier
  2771. mulx r13,rax,QWORD[40+rbp]
  2772. adcx r12,rax
  2773. adox r13,r14
  2774. mulx r14,rax,QWORD[48+rbp]
  2775. adcx r13,rax
  2776. adox r14,r15
  2777. mulx r15,rax,QWORD[56+rbp]
  2778. mov rdx,rbx ; switch to the next multiplier
  2779. adcx r14,rax
  2780. adox r15,rsi ; rsi is 0: fold OF into r15, OF becomes 0
  2781. adcx r15,rsi ; fold CF into r15, CF becomes 0
  ; Three 0x67 prefix bytes on the following inc (no memory operand, so
  ; they do not change its effect; presumably padding).
  2782. DB 0x67,0x67,0x67
  2783. inc rcx
  2784. jnz NEAR $L$sqrx8x_reduce
  2785. mov rax,rsi ; rax = 0
  2786. cmp rbp,QWORD[((0+8))+rsp]
  2787. jae NEAR $L$sqrx8x_no_tail ; last block already done (CF = 0 here)
  ; More blocks remain: add the next eight buffer words into the
  ; accumulator and move to the next block at rbp+64.
  2788. mov rdx,QWORD[((48+8))+rsp] ; reload the first parked multiplier
  2789. add r8,QWORD[rdi]
  2790. lea rbp,[64+rbp]
  2791. mov rcx,-8
  2792. adcx r9,QWORD[8+rdi]
  2793. adcx r10,QWORD[16+rdi]
  2794. adc r11,QWORD[24+rdi]
  2795. adc r12,QWORD[32+rdi]
  2796. adc r13,QWORD[40+rdi]
  2797. adc r14,QWORD[48+rdi]
  2798. adc r15,QWORD[56+rdi]
  2799. lea rdi,[64+rdi]
  2800. sbb rax,rax ; rax = -CF
  2801. xor rsi,rsi ; CF = OF = 0
  2802. mov QWORD[((16+8))+rsp],rax ; park the carry as 0 / -1
  2803. jmp NEAR $L$sqrx8x_tail
  2804. ALIGN 32
  ; Tail loop: multiply the eight parked multipliers, one per iteration,
  ; by the current block at rbp and store the low result word of each
  ; iteration at [rdi+rcx*8].
  2805. $L$sqrx8x_tail:
  2806. mov rbx,r8
  2807. mulx r8,rax,QWORD[rbp]
  2808. adcx rbx,rax
  2809. adox r8,r9
  2810. mulx r9,rax,QWORD[8+rbp]
  2811. adcx r8,rax
  2812. adox r9,r10
  2813. mulx r10,rax,QWORD[16+rbp]
  2814. adcx r9,rax
  2815. adox r10,r11
  2816. mulx r11,rax,QWORD[24+rbp]
  2817. adcx r10,rax
  2818. adox r11,r12
  ; Hand-encoded: mulx r12,rax,QWORD[32+rbp] with a 32-bit displacement.
  2819. DB 0xc4,0x62,0xfb,0xf6,0xa5,0x20,0x00,0x00,0x00
  2820. adcx r11,rax
  2821. adox r12,r13
  2822. mulx r13,rax,QWORD[40+rbp]
  2823. adcx r12,rax
  2824. adox r13,r14
  2825. mulx r14,rax,QWORD[48+rbp]
  2826. adcx r13,rax
  2827. adox r14,r15
  2828. mulx r15,rax,QWORD[56+rbp]
  2829. mov rdx,QWORD[((72+48+8))+rcx*8+rsp] ; next parked multiplier
  2830. adcx r14,rax
  2831. adox r15,rsi ; rsi is 0: fold OF into r15
  2832. mov QWORD[rcx*8+rdi],rbx ; store the finished low word
  2833. mov rbx,r8
  2834. adcx r15,rsi ; fold CF into r15
  2835. inc rcx
  2836. jnz NEAR $L$sqrx8x_tail
  2837. cmp rbp,QWORD[((0+8))+rsp]
  2838. jae NEAR $L$sqrx8x_tail_done
  ; Another block follows: restore the parked carry into CF (0 - (-1)
  ; borrows), add the next eight buffer words and run the tail again.
  2839. sub rsi,QWORD[((16+8))+rsp]
  2840. mov rdx,QWORD[((48+8))+rsp] ; back to the first parked multiplier
  2841. lea rbp,[64+rbp]
  2842. adc r8,QWORD[rdi]
  2843. adc r9,QWORD[8+rdi]
  2844. adc r10,QWORD[16+rdi]
  2845. adc r11,QWORD[24+rdi]
  2846. adc r12,QWORD[32+rdi]
  2847. adc r13,QWORD[40+rdi]
  2848. adc r14,QWORD[48+rdi]
  2849. adc r15,QWORD[56+rdi]
  2850. lea rdi,[64+rdi]
  2851. sbb rax,rax ; rax = -CF
  2852. sub rcx,8 ; rcx was 0 after the loop, so this makes it -8
  2853. xor rsi,rsi ; CF = OF = 0
  2854. mov QWORD[((16+8))+rsp],rax
  2855. jmp NEAR $L$sqrx8x_tail
  2856. ALIGN 32
  ; All blocks processed: add the top-most carry parked by the outer loop,
  ; collect the carry out in rax, then restore the tail carry into CF.
  2857. $L$sqrx8x_tail_done:
  2858. xor rax,rax
  2859. add r8,QWORD[((24+8))+rsp]
  2860. adc r9,0
  2861. adc r10,0
  2862. adc r11,0
  2863. adc r12,0
  2864. adc r13,0
  2865. adc r14,0
  2866. adc r15,0
  2867. adc rax,0
  2868. sub rsi,QWORD[((16+8))+rsp]
  ; Add the accumulator into the eight buffer words at rdi and store them.
  ; When entered through the jae above, CF is 0.
  2869. $L$sqrx8x_no_tail:
  2870. adc r8,QWORD[rdi]
  2871. DB 102,72,15,126,217 ; encodes: movq rcx,xmm3
  2872. adc r9,QWORD[8+rdi]
  2873. mov rsi,QWORD[56+rbp]
  2874. DB 102,72,15,126,213 ; encodes: movq rbp,xmm2
  2875. adc r10,QWORD[16+rdi]
  2876. adc r11,QWORD[24+rdi]
  2877. adc r12,QWORD[32+rdi]
  2878. adc r13,QWORD[40+rdi]
  2879. adc r14,QWORD[48+rdi]
  2880. adc r15,QWORD[56+rdi]
  2881. adc rax,0 ; rax = top-most carry for the next window
  2882. mov rbx,QWORD[((32+8))+rsp] ; reload the multiplier constant
  2883. mov rdx,QWORD[64+rcx*1+rdi] ; first word of the next window
  2884. mov QWORD[rdi],r8
  2885. lea r8,[64+rdi] ; r8 is free now; used as the end-of-data cursor
  2886. mov QWORD[8+rdi],r9
  2887. mov QWORD[16+rdi],r10
  2888. mov QWORD[24+rdi],r11
  2889. mov QWORD[32+rdi],r12
  2890. mov QWORD[40+rdi],r13
  2891. mov QWORD[48+rdi],r14
  2892. mov QWORD[56+rdi],r15
  2893. lea rdi,[64+rcx*1+rdi] ; rdi = start of the next window
  2894. cmp r8,QWORD[((8+8))+rsp]
  2895. jb NEAR $L$sqrx8x_reduction_loop ; unsigned: loop while below limit
  2896. DB 0F3h,0C3h ;repret
  ; ----------------------------------------------------------------------
  ; __bn_postx4x_internal
  ;
  ; Branch-free conditional subtraction, four 64-bit words per iteration:
  ;     out[i] = src[i] + (~n[i] & mask) + carry
  ; where mask = -rax. The first word of n is decremented before being
  ; complemented, which yields -n[0] and so supplies the "+1" of a two's
  ; complement negation. With rax = 1 on entry the result is src - n;
  ; with rax = 0 it is a plain copy of src.
  ;
  ; Values read on entry (as used by the code below):
  ;   rbp    pointer to n
  ;   rdi    pointer to src
  ;   rcx    negative byte count; the loop runs once per 32 bytes
  ;   rax    selector (presumably always 0 or 1 - confirm with caller)
  ;   xmm1   destination pointer, copied into both rdx and rsi
  ;
  ; On return r9 holds the negated entry value of rcx and r10 the entry
  ; value itself; rbp, rdi and rdx have advanced past the processed words.
  ; Clobbers rax, rcx, rdx, rsi, r8, r12-r15 and flags.
  ; ----------------------------------------------------------------------
  2897. ALIGN 32
  2898. __bn_postx4x_internal:
  2899. mov r12,QWORD[rbp]
  2900. mov r10,rcx
  2901. mov r9,rcx
  2902. neg rax ; rax = mask (0 -> 0, 1 -> all ones)
  2903. sar rcx,3+2 ; bytes -> iterations of 4 qwords, sign preserved
  2904. DB 102,72,15,126,202 ; encodes: movq rdx,xmm1
  2905. DB 102,72,15,126,206 ; encodes: movq rsi,xmm1
  2906. dec r12 ; ~(n[0]-1) == -n[0]
  2907. mov r13,QWORD[8+rbp]
  2908. xor r8,r8 ; r8 = 0: no carry into the first iteration
  2909. mov r14,QWORD[16+rbp]
  2910. mov r15,QWORD[24+rbp]
  2911. jmp NEAR $L$sqrx4x_sub_entry
  2912. ALIGN 16
  2913. $L$sqrx4x_sub:
  2914. mov r12,QWORD[rbp]
  2915. mov r13,QWORD[8+rbp]
  2916. mov r14,QWORD[16+rbp]
  2917. mov r15,QWORD[24+rbp]
  2918. $L$sqrx4x_sub_entry:
  2919. andn r12,r12,rax ; r12 = ~r12 & mask
  2920. lea rbp,[32+rbp]
  2921. andn r13,r13,rax
  2922. andn r14,r14,rax
  2923. andn r15,r15,rax
  2924. neg r8 ; CF = (r8 != 0): restore the carry saved by sbb below
  2925. adc r12,QWORD[rdi]
  2926. adc r13,QWORD[8+rdi]
  2927. adc r14,QWORD[16+rdi]
  2928. adc r15,QWORD[24+rdi]
  2929. mov QWORD[rdx],r12
  2930. lea rdi,[32+rdi]
  2931. mov QWORD[8+rdx],r13
  2932. sbb r8,r8 ; r8 = -CF: save the carry across inc/jnz
  2933. mov QWORD[16+rdx],r14
  2934. mov QWORD[24+rdx],r15
  2935. lea rdx,[32+rdx]
  2936. inc rcx
  2937. jnz NEAR $L$sqrx4x_sub
  2938. neg r9 ; r9 = -(entry rcx)
  2939. DB 0F3h,0C3h ;repret
  ; ----------------------------------------------------------------------
  ; GFp_bn_scatter5
  ;
  ; Copies edx consecutive 64-bit words from [rcx] into a table at r8,
  ; writing word i to r8 + r9*8 + i*256. Returns at once when edx is 0.
  ;
  ; Registers as used (they match the first four Microsoft x64 argument
  ; registers):
  ;   rcx  source pointer        edx  number of words
  ;   r8   table base            r9   slot index (scaled by 8)
  ; Leaf routine with no stack use. Clobbers rax, rcx, rdx, r8 and flags.
  ; ----------------------------------------------------------------------
  2940. global GFp_bn_scatter5
  2941. ALIGN 16
  2942. GFp_bn_scatter5:
  2943. cmp edx,0
  2944. jz NEAR $L$scatter_epilogue
  2945. lea r8,[r9*8+r8] ; r8 = address of the selected slot in row 0
  2946. $L$scatter:
  2947. mov rax,QWORD[rcx]
  2948. lea rcx,[8+rcx]
  2949. mov QWORD[r8],rax
  2950. lea r8,[256+r8] ; next row: rows are 256 bytes (32 qwords) apart
  2951. sub edx,1
  2952. jnz NEAR $L$scatter
  2953. $L$scatter_epilogue:
  2954. DB 0F3h,0C3h ;repret
  ; ----------------------------------------------------------------------
  ; GFp_bn_gather5
  ;
  ; Reads edx 64-bit words out of a table of 256-byte rows, taking from
  ; each row the qword whose slot number equals r9d, and stores them
  ; consecutively at [rcx].
  ;
  ; The selection is done with masks instead of an indexed load: sixteen
  ; 16-byte masks (one per pair of slots, 32 slots in total) are built on
  ; the stack, every row is loaded in full, ANDed with the masks and ORed
  ; together. The addresses touched therefore do not depend on r9d.
  ;
  ; Registers as used (they match the first four Microsoft x64 argument
  ; registers):
  ;   rcx  output pointer        edx  number of words to produce
  ;   r8   table base            r9d  slot index
  ; movdqa is used on the table, so r8 must be 16-byte aligned.
  ; NOTE(review): unlike GFp_bn_scatter5 there is no test for edx == 0;
  ; the loop body always runs at least once - confirm callers never pass 0.
  ;
  ; The first two instructions are emitted as raw bytes so their lengths
  ; are fixed; r10 keeps the entry rsp and is used to restore it on exit.
  ; Clobbers rax, rcx, rdx, r10, r11, xmm0-xmm5 and flags.
  ; ----------------------------------------------------------------------
  2955. global GFp_bn_gather5
  2956. ALIGN 32
  2957. GFp_bn_gather5:
  2958. $L$SEH_begin_GFp_bn_gather5:
  2959. DB 0x4c,0x8d,0x14,0x24 ; encodes: lea r10,[rsp]
  2960. DB 0x48,0x81,0xec,0x08,0x01,0x00,0x00 ; encodes: sub rsp,0x108
  2961. lea rax,[$L$inc]
  2962. and rsp,-16 ; 16-byte align the mask area for movdqa
  2963. movd xmm5,r9d
  2964. movdqa xmm0,XMMWORD[rax] ; dwords {0,0,1,1}
  2965. movdqa xmm1,XMMWORD[16+rax] ; dwords {2,2,2,2}
  2966. lea r11,[128+r8] ; bias by 128 so all row offsets fit in -128..112
  2967. lea rax,[128+rsp] ; same bias for the mask area
  2968. pshufd xmm5,xmm5,0 ; broadcast the index to all four dwords
  ; Mask construction. The counters advance by {2,2,2,2} per step, giving
  ; {0,0,1,1}, {2,2,3,3}, ... ; pcmpeqd turns each into a mask whose low
  ; or high qword is all ones when it matches the index. xmm4 keeps the
  ; {2,2,2,2} increment.
  2969. movdqa xmm4,xmm1
  2970. movdqa xmm2,xmm1
  2971. paddd xmm1,xmm0
  2972. pcmpeqd xmm0,xmm5
  2973. movdqa xmm3,xmm4
  2974. paddd xmm2,xmm1
  2975. pcmpeqd xmm1,xmm5
  2976. movdqa XMMWORD[(-128)+rax],xmm0
  2977. movdqa xmm0,xmm4
  2978. paddd xmm3,xmm2
  2979. pcmpeqd xmm2,xmm5
  2980. movdqa XMMWORD[(-112)+rax],xmm1
  2981. movdqa xmm1,xmm4
  2982. paddd xmm0,xmm3
  2983. pcmpeqd xmm3,xmm5
  2984. movdqa XMMWORD[(-96)+rax],xmm2
  2985. movdqa xmm2,xmm4
  2986. paddd xmm1,xmm0
  2987. pcmpeqd xmm0,xmm5
  2988. movdqa XMMWORD[(-80)+rax],xmm3
  2989. movdqa xmm3,xmm4
  2990. paddd xmm2,xmm1
  2991. pcmpeqd xmm1,xmm5
  2992. movdqa XMMWORD[(-64)+rax],xmm0
  2993. movdqa xmm0,xmm4
  2994. paddd xmm3,xmm2
  2995. pcmpeqd xmm2,xmm5
  2996. movdqa XMMWORD[(-48)+rax],xmm1
  2997. movdqa xmm1,xmm4
  2998. paddd xmm0,xmm3
  2999. pcmpeqd xmm3,xmm5
  3000. movdqa XMMWORD[(-32)+rax],xmm2
  3001. movdqa xmm2,xmm4
  3002. paddd xmm1,xmm0
  3003. pcmpeqd xmm0,xmm5
  3004. movdqa XMMWORD[(-16)+rax],xmm3
  3005. movdqa xmm3,xmm4
  3006. paddd xmm2,xmm1
  3007. pcmpeqd xmm1,xmm5
  3008. movdqa XMMWORD[rax],xmm0
  3009. movdqa xmm0,xmm4
  3010. paddd xmm3,xmm2
  3011. pcmpeqd xmm2,xmm5
  3012. movdqa XMMWORD[16+rax],xmm1
  3013. movdqa xmm1,xmm4
  3014. paddd xmm0,xmm3
  3015. pcmpeqd xmm3,xmm5
  3016. movdqa XMMWORD[32+rax],xmm2
  3017. movdqa xmm2,xmm4
  3018. paddd xmm1,xmm0
  3019. pcmpeqd xmm0,xmm5
  3020. movdqa XMMWORD[48+rax],xmm3
  3021. movdqa xmm3,xmm4
  3022. paddd xmm2,xmm1
  3023. pcmpeqd xmm1,xmm5
  3024. movdqa XMMWORD[64+rax],xmm0
  3025. movdqa xmm0,xmm4
  3026. paddd xmm3,xmm2
  3027. pcmpeqd xmm2,xmm5
  3028. movdqa XMMWORD[80+rax],xmm1
  3029. movdqa xmm1,xmm4
  3030. paddd xmm0,xmm3
  3031. pcmpeqd xmm3,xmm5
  3032. movdqa XMMWORD[96+rax],xmm2
  3033. movdqa xmm2,xmm4
  3034. movdqa XMMWORD[112+rax],xmm3
  3035. jmp NEAR $L$gather
  3036. ALIGN 32
  ; One iteration per output word: AND all sixteen 16-byte pieces of the
  ; current row with their masks and OR them into xmm4 / xmm5.
  3037. $L$gather:
  3038. pxor xmm4,xmm4
  3039. pxor xmm5,xmm5
  3040. movdqa xmm0,XMMWORD[((-128))+r11]
  3041. movdqa xmm1,XMMWORD[((-112))+r11]
  3042. movdqa xmm2,XMMWORD[((-96))+r11]
  3043. pand xmm0,XMMWORD[((-128))+rax]
  3044. movdqa xmm3,XMMWORD[((-80))+r11]
  3045. pand xmm1,XMMWORD[((-112))+rax]
  3046. por xmm4,xmm0
  3047. pand xmm2,XMMWORD[((-96))+rax]
  3048. por xmm5,xmm1
  3049. pand xmm3,XMMWORD[((-80))+rax]
  3050. por xmm4,xmm2
  3051. por xmm5,xmm3
  3052. movdqa xmm0,XMMWORD[((-64))+r11]
  3053. movdqa xmm1,XMMWORD[((-48))+r11]
  3054. movdqa xmm2,XMMWORD[((-32))+r11]
  3055. pand xmm0,XMMWORD[((-64))+rax]
  3056. movdqa xmm3,XMMWORD[((-16))+r11]
  3057. pand xmm1,XMMWORD[((-48))+rax]
  3058. por xmm4,xmm0
  3059. pand xmm2,XMMWORD[((-32))+rax]
  3060. por xmm5,xmm1
  3061. pand xmm3,XMMWORD[((-16))+rax]
  3062. por xmm4,xmm2
  3063. por xmm5,xmm3
  3064. movdqa xmm0,XMMWORD[r11]
  3065. movdqa xmm1,XMMWORD[16+r11]
  3066. movdqa xmm2,XMMWORD[32+r11]
  3067. pand xmm0,XMMWORD[rax]
  3068. movdqa xmm3,XMMWORD[48+r11]
  3069. pand xmm1,XMMWORD[16+rax]
  3070. por xmm4,xmm0
  3071. pand xmm2,XMMWORD[32+rax]
  3072. por xmm5,xmm1
  3073. pand xmm3,XMMWORD[48+rax]
  3074. por xmm4,xmm2
  3075. por xmm5,xmm3
  3076. movdqa xmm0,XMMWORD[64+r11]
  3077. movdqa xmm1,XMMWORD[80+r11]
  3078. movdqa xmm2,XMMWORD[96+r11]
  3079. pand xmm0,XMMWORD[64+rax]
  3080. movdqa xmm3,XMMWORD[112+r11]
  3081. pand xmm1,XMMWORD[80+rax]
  3082. por xmm4,xmm0
  3083. pand xmm2,XMMWORD[96+rax]
  3084. por xmm5,xmm1
  3085. pand xmm3,XMMWORD[112+rax]
  3086. por xmm4,xmm2
  3087. por xmm5,xmm3
  3088. por xmm4,xmm5
  3089. lea r11,[256+r11] ; advance to the next 256-byte row
  3090. pshufd xmm0,xmm4,0x4e ; swap the two qwords ...
  3091. por xmm0,xmm4 ; ... and merge, so the low qword holds the selected word
  3092. movq QWORD[rcx],xmm0
  3093. lea rcx,[8+rcx]
  3094. sub edx,1
  3095. jnz NEAR $L$gather
  3096. lea rsp,[r10] ; restore the entry stack pointer
  3097. DB 0F3h,0C3h ;repret
  3098. $L$SEH_end_GFp_bn_gather5:
  ; $L$inc: two 16-byte constants used to build the index-comparison masks.
  ; The first row is the starting pair of counters {0,0,1,1}; the second
  ; row {2,2,2,2} is the per-step increment.
  3099. ALIGN 64
  3100. $L$inc:
  3101. DD 0,0,1,1
  3102. DD 2,2,2,2
  ; NUL-terminated ASCII banner: "Montgomery Multiplication with
  ; scatter/gather for x86_64, CRYPTOGAMS by <appro@openssl.org>"
  3103. DB 77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105
  3104. DB 112,108,105,99,97,116,105,111,110,32,119,105,116,104,32,115
  3105. DB 99,97,116,116,101,114,47,103,97,116,104,101,114,32,102,111
  3106. DB 114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79
  3107. DB 71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111
  3108. DB 112,101,110,115,115,108,46,111,114,103,62,0
  ; ----------------------------------------------------------------------
  ; mul_handler
  ;
  ; Win64 language-specific exception handler referenced by the .xdata
  ; records in this file. It reconstructs the caller's register state in
  ; the CONTEXT record, copies that record into the dispatcher's context,
  ; calls RtlVirtualUnwind and returns 1.
  ;
  ; Registers as used:
  ;   r8   CONTEXT record being repaired
  ;   r9   dispatcher context; [8+r9] image base, [40+r9] context record
  ;        to fill, [56+r9] handler data
  ; The handler data is three image-relative addresses (see the DD lines
  ; that follow each "DD mul_handler" in .xdata), compared against the
  ; faulting Rip to decide how much of the frame has been set up.
  ;
  ; CONTEXT offsets follow the Win64 layout: 120 Rax, 144 Rbx, 152 Rsp,
  ; 160 Rbp, 168 Rsi, 176 Rdi, 192 R9, 216-240 R12-R15, 248 Rip.
  ; ----------------------------------------------------------------------
  3109. EXTERN __imp_RtlVirtualUnwind
  3110. ALIGN 16
  3111. mul_handler:
  3112. push rsi
  3113. push rdi
  3114. push rbx
  3115. push rbp
  3116. push r12
  3117. push r13
  3118. push r14
  3119. push r15
  3120. pushfq
  3121. sub rsp,64 ; outgoing-argument area for the call below
  3122. mov rax,QWORD[120+r8] ; context Rax (holds the entry rsp early on)
  3123. mov rbx,QWORD[248+r8] ; context Rip
  3124. mov rsi,QWORD[8+r9] ; image base
  3125. mov r11,QWORD[56+r9] ; handler data
  3126. mov r10d,DWORD[r11]
  3127. lea r10,[r10*1+rsi] ; first label: nothing pushed before it
  3128. cmp rbx,r10
  3129. jb NEAR $L$common_seh_tail
  3130. mov r10d,DWORD[4+r11]
  3131. lea r10,[r10*1+rsi] ; second label: before it, rax is still valid
  3132. cmp rbx,r10
  3133. jb NEAR $L$common_pop_regs
  3134. mov rax,QWORD[152+r8] ; context Rsp
  3135. mov r10d,DWORD[8+r11]
  3136. lea r10,[r10*1+rsi] ; third label: at or after it, frame is gone
  3137. cmp rbx,r10
  3138. jae NEAR $L$common_seh_tail
  3139. lea r10,[$L$mul_epilogue]
  3140. cmp rbx,r10
  3141. ja NEAR $L$body_40
  ; At or below $L$mul_epilogue: the saved stack pointer sits at
  ; [8 + R9*8 + Rsp], matching the store made just before $L$mul_body.
  3142. mov r10,QWORD[192+r8] ; context R9
  3143. mov rax,QWORD[8+r10*8+rax]
  3144. jmp NEAR $L$common_pop_regs
  ; Above $L$mul_epilogue: the saved stack pointer is read from [40+Rsp]
  ; (presumably where the other routines in this file keep it - confirm).
  3145. $L$body_40:
  3146. mov rax,QWORD[40+rax]
  ; rax = stack pointer value at function entry; the six callee-saved
  ; registers were pushed directly below it.
  3147. $L$common_pop_regs:
  3148. mov rbx,QWORD[((-8))+rax]
  3149. mov rbp,QWORD[((-16))+rax]
  3150. mov r12,QWORD[((-24))+rax]
  3151. mov r13,QWORD[((-32))+rax]
  3152. mov r14,QWORD[((-40))+rax]
  3153. mov r15,QWORD[((-48))+rax]
  3154. mov QWORD[144+r8],rbx
  3155. mov QWORD[160+r8],rbp
  3156. mov QWORD[216+r8],r12
  3157. mov QWORD[224+r8],r13
  3158. mov QWORD[232+r8],r14
  3159. mov QWORD[240+r8],r15
  ; rdi / rsi were stored at [8+rsp] / [16+rsp] by the WIN64 prologue.
  3160. $L$common_seh_tail:
  3161. mov rdi,QWORD[8+rax]
  3162. mov rsi,QWORD[16+rax]
  3163. mov QWORD[152+r8],rax
  3164. mov QWORD[168+r8],rsi
  3165. mov QWORD[176+r8],rdi
  3166. mov rdi,QWORD[40+r9] ; destination: dispatcher's context record
  3167. mov rsi,r8 ; source: repaired context
  3168. mov ecx,154 ; 154 qwords = 1232 bytes
  3169. DD 0xa548f3fc ; bytes fc f3 48 a5 = cld ; rep movsq
  ; Set up the eight arguments of RtlVirtualUnwind: rcx, rdx, r8, r9 plus
  ; four stack slots starting at [32+rsp].
  3170. mov rsi,r9
  3171. xor rcx,rcx
  3172. mov rdx,QWORD[8+rsi]
  3173. mov r8,QWORD[rsi]
  3174. mov r9,QWORD[16+rsi]
  3175. mov r10,QWORD[40+rsi]
  3176. lea r11,[56+rsi]
  3177. lea r12,[24+rsi]
  3178. mov QWORD[32+rsp],r10
  3179. mov QWORD[40+rsp],r11
  3180. mov QWORD[48+rsp],r12
  3181. mov QWORD[56+rsp],rcx
  3182. call QWORD[__imp_RtlVirtualUnwind]
  3183. mov eax,1 ; return value 1 (ExceptionContinueSearch)
  3184. add rsp,64
  3185. popfq
  3186. pop r15
  3187. pop r14
  3188. pop r13
  3189. pop r12
  3190. pop rbp
  3191. pop rbx
  3192. pop rdi
  3193. pop rsi
  3194. DB 0F3h,0C3h ;repret
  ; Win64 function table: one entry of three image-relative addresses per
  ; function - start of the function, end of the function, and its unwind
  ; information record in .xdata.
  ; NOTE(review): the sixth entry pairs begin/end labels named bn_powerx5
  ; with an info label named GFp_bn_powerx5; the begin/end labels are
  ; defined outside this part of the file - confirm the names resolve.
  3195. section .pdata rdata align=4
  3196. ALIGN 4
  3197. DD $L$SEH_begin_GFp_bn_mul_mont_gather5 wrt ..imagebase
  3198. DD $L$SEH_end_GFp_bn_mul_mont_gather5 wrt ..imagebase
  3199. DD $L$SEH_info_GFp_bn_mul_mont_gather5 wrt ..imagebase
  3200. DD $L$SEH_begin_bn_mul4x_mont_gather5 wrt ..imagebase
  3201. DD $L$SEH_end_bn_mul4x_mont_gather5 wrt ..imagebase
  3202. DD $L$SEH_info_bn_mul4x_mont_gather5 wrt ..imagebase
  3203. DD $L$SEH_begin_GFp_bn_power5 wrt ..imagebase
  3204. DD $L$SEH_end_GFp_bn_power5 wrt ..imagebase
  3205. DD $L$SEH_info_GFp_bn_power5 wrt ..imagebase
  3206. DD $L$SEH_begin_bn_from_mont8x wrt ..imagebase
  3207. DD $L$SEH_end_bn_from_mont8x wrt ..imagebase
  3208. DD $L$SEH_info_bn_from_mont8x wrt ..imagebase
  3209. DD $L$SEH_begin_bn_mulx4x_mont_gather5 wrt ..imagebase
  3210. DD $L$SEH_end_bn_mulx4x_mont_gather5 wrt ..imagebase
  3211. DD $L$SEH_info_bn_mulx4x_mont_gather5 wrt ..imagebase
  3212. DD $L$SEH_begin_bn_powerx5 wrt ..imagebase
  3213. DD $L$SEH_end_bn_powerx5 wrt ..imagebase
  3214. DD $L$SEH_info_GFp_bn_powerx5 wrt ..imagebase
  3215. DD $L$SEH_begin_GFp_bn_gather5 wrt ..imagebase
  3216. DD $L$SEH_end_GFp_bn_gather5 wrt ..imagebase
  3217. DD $L$SEH_info_GFp_bn_gather5 wrt ..imagebase
  ; Win64 unwind information records referenced from .pdata.
  ;
  ; The first six records share one shape:
  ;   DB 9,0,0,0   header byte 9 = version 1 with the exception-handler
  ;                flag set; prologue size 0, no unwind codes, no frame reg
  ;   DD handler   image-relative address of mul_handler
  ;   DD a,b,c     handler data: the three labels mul_handler compares the
  ;                faulting Rip against
  3218. section .xdata rdata align=8
  3219. ALIGN 8
  3220. $L$SEH_info_GFp_bn_mul_mont_gather5:
  3221. DB 9,0,0,0
  3222. DD mul_handler wrt ..imagebase
  ; $L$mul_body is listed twice here, so the first two comparisons in
  ; mul_handler use the same address for this function.
  3223. DD $L$mul_body wrt ..imagebase,$L$mul_body wrt ..imagebase,$L$mul_epilogue wrt ..imagebase
  3224. ALIGN 8
  3225. $L$SEH_info_bn_mul4x_mont_gather5:
  3226. DB 9,0,0,0
  3227. DD mul_handler wrt ..imagebase
  3228. DD $L$mul4x_prologue wrt ..imagebase,$L$mul4x_body wrt ..imagebase,$L$mul4x_epilogue wrt ..imagebase
  3229. ALIGN 8
  3230. $L$SEH_info_GFp_bn_power5:
  3231. DB 9,0,0,0
  3232. DD mul_handler wrt ..imagebase
  3233. DD $L$power5_prologue wrt ..imagebase,$L$power5_body wrt ..imagebase,$L$power5_epilogue wrt ..imagebase
  3234. ALIGN 8
  3235. $L$SEH_info_bn_from_mont8x:
  3236. DB 9,0,0,0
  3237. DD mul_handler wrt ..imagebase
  3238. DD $L$from_prologue wrt ..imagebase,$L$from_body wrt ..imagebase,$L$from_epilogue wrt ..imagebase
  3239. ALIGN 8
  3240. $L$SEH_info_bn_mulx4x_mont_gather5:
  3241. DB 9,0,0,0
  3242. DD mul_handler wrt ..imagebase
  3243. DD $L$mulx4x_prologue wrt ..imagebase,$L$mulx4x_body wrt ..imagebase,$L$mulx4x_epilogue wrt ..imagebase
  3244. ALIGN 8
  3245. $L$SEH_info_GFp_bn_powerx5:
  3246. DB 9,0,0,0
  3247. DD mul_handler wrt ..imagebase
  3248. DD $L$powerx5_prologue wrt ..imagebase,$L$powerx5_body wrt ..imagebase,$L$powerx5_epilogue wrt ..imagebase
  3249. ALIGN 8
  ; GFp_bn_gather5 uses plain unwind codes instead of a handler. They
  ; describe its two hand-encoded prologue instructions (4 + 7 bytes):
  ;   0x01       version 1, no flags
  ;   0x0b       prologue size: 11 bytes
  ;   0x03       three unwind-code slots
  ;   0x0a       frame register r10, offset 0
  ;   0x0b,0x01  at prologue offset 11: large stack allocation, size in
  ;   0x21,0x00    the next slot: 0x21 * 8 = 0x108 bytes (sub rsp,0x108)
  ;   0x04,0xa3  at prologue offset 4: frame register established
  ;              (lea r10,[rsp])
  ;   0x00,0x00  padding to an even number of slots
  3250. $L$SEH_info_GFp_bn_gather5:
  3251. DB 0x01,0x0b,0x03,0x0a
  3252. DB 0x0b,0x01,0x21,0x00
  3253. DB 0x04,0xa3,0x00,0x00
  3254. ALIGN 8