x86_64-mont-nasm.asm 23 KB


  1. ; This file is generated from a similarly-named Perl script in the BoringSSL
  2. ; source tree. Do not edit by hand.
  3. default rel
  4. %define XMMWORD
  5. %define YMMWORD
  6. %define ZMMWORD
  7. section .text code align=64
  8. EXTERN GFp_ia32cap_P
  9. global GFp_bn_mul_mont
  ; ---------------------------------------------------------------------
  ; GFp_bn_mul_mont -- exported entry point (Win64 ABI) plus the generic
  ; one-word-at-a-time multiply/reduce loop.
  ;
  ; Incoming Win64 arguments (rcx, rdx, r8, r9, [40+rsp], [48+rsp]) are
  ; remapped to rdi, rsi, rdx, rcx, r8, r9. As used by the code below:
  ;   rdi = destination word array (written in $L$sub / $L$copy)
  ;   rsi = first operand word array
  ;   rdx = second operand word array (kept in r12)
  ;   rcx = word array that is multiplied by the per-row factor and
  ;         subtracted at the end (presumably the modulus -- confirm)
  ;   r8  = pointer to one qword constant (loaded once, kept in r8)
  ;   r9  = word count (only the low 32 bits are used)
  ; Returns rax = 1. rdi/rsi are saved in the caller's home space and
  ; restored in the epilogue; rbx, rbp, r12-r15 are pushed and restored.
  ;
  ; Dispatch: counts that are not a multiple of 4, or are below 8, use the
  ; loop in this block; otherwise control jumps to $L$mul4x_enter or (when
  ; rdx == rsi and the count is a multiple of 8) to $L$sqr8x_enter.
  ; ---------------------------------------------------------------------
  10. ALIGN 16
  11. GFp_bn_mul_mont:
  12. mov QWORD[8+rsp],rdi ;WIN64 prologue
  13. mov QWORD[16+rsp],rsi
  14. mov rax,rsp
  15. $L$SEH_begin_GFp_bn_mul_mont:
  16. mov rdi,rcx ; Win64 arg 1 -> rdi
  17. mov rsi,rdx ; Win64 arg 2 -> rsi
  18. mov rdx,r8 ; Win64 arg 3 -> rdx
  19. mov rcx,r9 ; Win64 arg 4 -> rcx
  20. mov r8,QWORD[40+rsp] ; arg 5 from the stack
  21. mov r9,QWORD[48+rsp] ; arg 6 from the stack
  22. mov r9d,r9d ; zero-extend the 32-bit word count
  23. mov rax,rsp ; rax = entry rsp, stored in the frame later
  24. test r9d,3
  25. jnz NEAR $L$mul_enter ; count not a multiple of 4 -> generic loop
  26. cmp r9d,8
  27. jb NEAR $L$mul_enter ; count below 8 -> generic loop
  28. mov r11d,DWORD[((GFp_ia32cap_P+8))] ; capability word, tested at $L$mul4x_enter
  29. cmp rdx,rsi
  30. jne NEAR $L$mul4x_enter ; distinct operand pointers -> 4x multiply
  31. test r9d,7
  32. jz NEAR $L$sqr8x_enter ; same pointer and count multiple of 8 -> squaring path
  33. jmp NEAR $L$mul4x_enter
  34. ALIGN 16
  35. $L$mul_enter:
  36. push rbx
  37. push rbp
  38. push r12
  39. push r13
  40. push r14
  41. push r15
  ; Reserve scratch below the pushed registers: r10 = rsp - 8*r9 - 16,
  ; rounded down to a 1024-byte boundary.
  42. neg r9
  43. mov r11,rsp
  44. lea r10,[((-16))+r9*8+rsp]
  45. neg r9
  46. and r10,-1024
  ; Lower rsp towards r10 in steps of at most 4096 bytes, reading one
  ; qword at each step so every page in between is touched in order.
  47. sub r11,r10
  48. and r11,-4096
  49. lea rsp,[r11*1+r10]
  50. mov r11,QWORD[rsp]
  51. cmp rsp,r10
  52. ja NEAR $L$mul_page_walk
  53. jmp NEAR $L$mul_page_walk_done
  54. ALIGN 16
  55. $L$mul_page_walk:
  56. lea rsp,[((-4096))+rsp]
  57. mov r11,QWORD[rsp]
  58. cmp rsp,r10
  59. ja NEAR $L$mul_page_walk
  60. $L$mul_page_walk_done:
  61. mov QWORD[8+r9*8+rsp],rax ; save entry rsp just above the scratch words
  62. $L$mul_body:
  ; First row: scratch[] = (rsi[] * rdx[0] + rcx[] * rbp) shifted down one
  ; word, where rbp = low64(r8 * low word of rsi[0]*rdx[0]).
  63. mov r12,rdx ; r12 = second operand pointer (rdx is clobbered by mul)
  64. mov r8,QWORD[r8] ; r8 = the qword constant itself
  65. mov rbx,QWORD[r12] ; rbx = current word of the second operand
  66. mov rax,QWORD[rsi]
  67. xor r14,r14 ; r14 = outer (row) index
  68. xor r15,r15 ; r15 = inner (column) index
  69. mov rbp,r8
  70. mul rbx
  71. mov r10,rax
  72. mov rax,QWORD[rcx]
  73. imul rbp,r10 ; rbp = per-row factor applied to the rcx[] words
  74. mov r11,rdx
  75. mul rbp
  76. add r10,rax ; low word is dropped; only the carry is kept
  77. mov rax,QWORD[8+rsi]
  78. adc rdx,0
  79. mov r13,rdx
  80. lea r15,[1+r15]
  81. jmp NEAR $L$1st_enter
  82. ALIGN 16
  83. $L$1st:
  84. add r13,rax
  85. mov rax,QWORD[r15*8+rsi]
  86. adc rdx,0
  87. add r13,r11
  88. mov r11,r10
  89. adc rdx,0
  90. mov QWORD[((-16))+r15*8+rsp],r13 ; store result word two slots behind the index
  91. mov r13,rdx
  92. $L$1st_enter:
  93. mul rbx
  94. add r11,rax
  95. mov rax,QWORD[r15*8+rcx]
  96. adc rdx,0
  97. lea r15,[1+r15]
  98. mov r10,rdx
  99. mul rbp
  100. cmp r15,r9
  101. jne NEAR $L$1st
  ; Row tail: fold the last partial products and store the two top words.
  102. add r13,rax
  103. mov rax,QWORD[rsi]
  104. adc rdx,0
  105. add r13,r11
  106. adc rdx,0
  107. mov QWORD[((-16))+r15*8+rsp],r13
  108. mov r13,rdx
  109. mov r11,r10
  110. xor rdx,rdx
  111. add r13,r11
  112. adc rdx,0
  113. mov QWORD[((-8))+r9*8+rsp],r13
  114. mov QWORD[r9*8+rsp],rdx ; extra top word (carry out of the row)
  115. lea r14,[1+r14]
  116. jmp NEAR $L$outer
  117. ALIGN 16
  ; Remaining rows: same as the first row, but each column also adds the
  ; word already held in the scratch area.
  118. $L$outer:
  119. mov rbx,QWORD[r14*8+r12] ; next word of the second operand
  120. xor r15,r15
  121. mov rbp,r8
  122. mov r10,QWORD[rsp]
  123. mul rbx
  124. add r10,rax
  125. mov rax,QWORD[rcx]
  126. adc rdx,0
  127. imul rbp,r10 ; new per-row factor
  128. mov r11,rdx
  129. mul rbp
  130. add r10,rax ; low word is dropped; only the carry is kept
  131. mov rax,QWORD[8+rsi]
  132. adc rdx,0
  133. mov r10,QWORD[8+rsp]
  134. mov r13,rdx
  135. lea r15,[1+r15]
  136. jmp NEAR $L$inner_enter
  137. ALIGN 16
  138. $L$inner:
  139. add r13,rax
  140. mov rax,QWORD[r15*8+rsi]
  141. adc rdx,0
  142. add r13,r10
  143. mov r10,QWORD[r15*8+rsp]
  144. adc rdx,0
  145. mov QWORD[((-16))+r15*8+rsp],r13
  146. mov r13,rdx
  147. $L$inner_enter:
  148. mul rbx
  149. add r11,rax
  150. mov rax,QWORD[r15*8+rcx]
  151. adc rdx,0
  152. add r10,r11
  153. mov r11,rdx
  154. adc r11,0
  155. lea r15,[1+r15]
  156. mul rbp
  157. cmp r15,r9
  158. jne NEAR $L$inner
  159. add r13,rax
  160. mov rax,QWORD[rsi]
  161. adc rdx,0
  162. add r13,r10
  163. mov r10,QWORD[r15*8+rsp] ; previous row's extra top word
  164. adc rdx,0
  165. mov QWORD[((-16))+r15*8+rsp],r13
  166. mov r13,rdx
  167. xor rdx,rdx
  168. add r13,r11
  169. adc rdx,0
  170. add r13,r10
  171. adc rdx,0
  172. mov QWORD[((-8))+r9*8+rsp],r13
  173. mov QWORD[r9*8+rsp],rdx
  174. lea r14,[1+r14]
  175. cmp r14,r9
  176. jb NEAR $L$outer
  ; Final step: rdi[] = scratch[] - rcx[] with borrow propagation.
  ; The xor clears CF; mov, lea and dec below leave CF untouched, so the
  ; borrow flows from one sbb to the next.
  177. xor r14,r14
  178. mov rax,QWORD[rsp]
  179. mov r15,r9
  180. ALIGN 16
  181. $L$sub: sbb rax,QWORD[r14*8+rcx]
  182. mov QWORD[r14*8+rdi],rax
  183. mov rax,QWORD[8+r14*8+rsp]
  184. lea r14,[1+r14]
  185. dec r15
  186. jnz NEAR $L$sub
  ; rax = extra top word minus the final borrow; it is used as an AND mask
  ; (expected to be 0 or all-ones -- confirm), rbx is its complement.
  187. sbb rax,0
  188. mov rbx,-1
  189. xor rbx,rax
  190. xor r14,r14
  191. mov r15,r9
  ; Branch-free select: rdi[i] = (rdi[i] & rbx) | (scratch[i] & rax).
  ; Each scratch word is overwritten with r9 after it has been read.
  192. $L$copy:
  193. mov rcx,QWORD[r14*8+rdi]
  194. mov rdx,QWORD[r14*8+rsp]
  195. and rcx,rbx
  196. and rdx,rax
  197. mov QWORD[r14*8+rsp],r9
  198. or rdx,rcx
  199. mov QWORD[r14*8+rdi],rdx
  200. lea r14,[1+r14]
  201. sub r15,1
  202. jnz NEAR $L$copy
  203. mov rsi,QWORD[8+r9*8+rsp] ; rsi = entry rsp saved before $L$mul_body
  204. mov rax,1 ; return value
  205. mov r15,QWORD[((-48))+rsi] ; reload the six registers pushed at $L$mul_enter
  206. mov r14,QWORD[((-40))+rsi]
  207. mov r13,QWORD[((-32))+rsi]
  208. mov r12,QWORD[((-24))+rsi]
  209. mov rbp,QWORD[((-16))+rsi]
  210. mov rbx,QWORD[((-8))+rsi]
  211. lea rsp,[rsi]
  212. $L$mul_epilogue:
  213. mov rdi,QWORD[8+rsp] ;WIN64 epilogue
  214. mov rsi,QWORD[16+rsp]
  215. DB 0F3h,0C3h ;repret
  216. $L$SEH_end_GFp_bn_mul_mont:
  ; ---------------------------------------------------------------------
  ; bn_mul4x_mont -- same computation as the generic loop in
  ; GFp_bn_mul_mont, with the column loop unrolled four words at a time.
  ;
  ; Register roles after the Win64 remap match GFp_bn_mul_mont
  ; (rdi dest, rsi/rdx operands, rcx subtracted array, r8 -> constant,
  ; r9 word count). rdi is reused as an accumulator, so the destination
  ; pointer is parked in the frame at [16+r9*8+rsp].
  ;
  ; $L$mul4x_enter is also reached by jumps from GFp_bn_mul_mont, which
  ; loads r11d from GFp_ia32cap_P+8 first.
  ; NOTE(review): falling in from the bn_mul4x_mont label itself does not
  ; load r11d anywhere in this file -- confirm that label is never called
  ; directly.
  ; Returns rax = 1.
  ; ---------------------------------------------------------------------
  217. ALIGN 16
  218. bn_mul4x_mont:
  219. mov QWORD[8+rsp],rdi ;WIN64 prologue
  220. mov QWORD[16+rsp],rsi
  221. mov rax,rsp
  222. $L$SEH_begin_bn_mul4x_mont:
  223. mov rdi,rcx
  224. mov rsi,rdx
  225. mov rdx,r8
  226. mov rcx,r9
  227. mov r8,QWORD[40+rsp]
  228. mov r9,QWORD[48+rsp]
  229. mov r9d,r9d ; zero-extend the 32-bit word count
  230. mov rax,rsp
  231. $L$mul4x_enter:
  ; Both bits 8 and 19 of the capability word set -> use the MULX path
  ; (presumably the BMI2 and ADX feature bits -- confirm).
  232. and r11d,0x80100
  233. cmp r11d,0x80100
  234. je NEAR $L$mulx4x_enter
  235. push rbx
  236. push rbp
  237. push r12
  238. push r13
  239. push r14
  240. push r15
  ; Reserve scratch: r10 = rsp - 8*r9 - 32, rounded down to 1024 bytes,
  ; then walk rsp down page by page, touching each page.
  241. neg r9
  242. mov r11,rsp
  243. lea r10,[((-32))+r9*8+rsp]
  244. neg r9
  245. and r10,-1024
  246. sub r11,r10
  247. and r11,-4096
  248. lea rsp,[r11*1+r10]
  249. mov r11,QWORD[rsp]
  250. cmp rsp,r10
  251. ja NEAR $L$mul4x_page_walk
  252. jmp NEAR $L$mul4x_page_walk_done
  253. $L$mul4x_page_walk:
  254. lea rsp,[((-4096))+rsp]
  255. mov r11,QWORD[rsp]
  256. cmp rsp,r10
  257. ja NEAR $L$mul4x_page_walk
  258. $L$mul4x_page_walk_done:
  259. mov QWORD[8+r9*8+rsp],rax ; save entry rsp above the scratch words
  260. $L$mul4x_body:
  261. mov QWORD[16+r9*8+rsp],rdi ; park destination pointer; rdi becomes an accumulator
  262. mov r12,rdx ; r12 = second operand pointer
  263. mov r8,QWORD[r8] ; r8 = the qword constant itself
  264. mov rbx,QWORD[r12] ; rbx = first word of the second operand
  265. mov rax,QWORD[rsi]
  266. xor r14,r14 ; r14 = row index
  267. xor r15,r15 ; r15 = column index
  268. mov rbp,r8
  269. mul rbx
  270. mov r10,rax
  271. mov rax,QWORD[rcx]
  272. imul rbp,r10 ; rbp = per-row factor applied to the rcx[] words
  273. mov r11,rdx
  274. mul rbp
  275. add r10,rax ; low word is dropped; only the carry is kept
  276. mov rax,QWORD[8+rsi]
  277. adc rdx,0
  278. mov rdi,rdx
  279. mul rbx
  280. add r11,rax
  281. mov rax,QWORD[8+rcx]
  282. adc rdx,0
  283. mov r10,rdx
  284. mul rbp
  285. add rdi,rax
  286. mov rax,QWORD[16+rsi]
  287. adc rdx,0
  288. add rdi,r11
  289. lea r15,[4+r15]
  290. adc rdx,0
  291. mov QWORD[rsp],rdi ; scratch[0]
  292. mov r13,rdx
  293. jmp NEAR $L$1st4x
  294. ALIGN 16
  ; First row, four columns per iteration. r13 and rdi alternate as the
  ; running accumulator, r10 and r11 as the carried high halves.
  295. $L$1st4x:
  296. mul rbx
  297. add r10,rax
  298. mov rax,QWORD[((-16))+r15*8+rcx]
  299. adc rdx,0
  300. mov r11,rdx
  301. mul rbp
  302. add r13,rax
  303. mov rax,QWORD[((-8))+r15*8+rsi]
  304. adc rdx,0
  305. add r13,r10
  306. adc rdx,0
  307. mov QWORD[((-24))+r15*8+rsp],r13
  308. mov rdi,rdx
  309. mul rbx
  310. add r11,rax
  311. mov rax,QWORD[((-8))+r15*8+rcx]
  312. adc rdx,0
  313. mov r10,rdx
  314. mul rbp
  315. add rdi,rax
  316. mov rax,QWORD[r15*8+rsi]
  317. adc rdx,0
  318. add rdi,r11
  319. adc rdx,0
  320. mov QWORD[((-16))+r15*8+rsp],rdi
  321. mov r13,rdx
  322. mul rbx
  323. add r10,rax
  324. mov rax,QWORD[r15*8+rcx]
  325. adc rdx,0
  326. mov r11,rdx
  327. mul rbp
  328. add r13,rax
  329. mov rax,QWORD[8+r15*8+rsi]
  330. adc rdx,0
  331. add r13,r10
  332. adc rdx,0
  333. mov QWORD[((-8))+r15*8+rsp],r13
  334. mov rdi,rdx
  335. mul rbx
  336. add r11,rax
  337. mov rax,QWORD[8+r15*8+rcx]
  338. adc rdx,0
  339. lea r15,[4+r15]
  340. mov r10,rdx
  341. mul rbp
  342. add rdi,rax
  343. mov rax,QWORD[((-16))+r15*8+rsi]
  344. adc rdx,0
  345. add rdi,r11
  346. adc rdx,0
  347. mov QWORD[((-32))+r15*8+rsp],rdi
  348. mov r13,rdx
  349. cmp r15,r9
  350. jb NEAR $L$1st4x
  ; First-row tail: last two columns, then the two top words.
  351. mul rbx
  352. add r10,rax
  353. mov rax,QWORD[((-16))+r15*8+rcx]
  354. adc rdx,0
  355. mov r11,rdx
  356. mul rbp
  357. add r13,rax
  358. mov rax,QWORD[((-8))+r15*8+rsi]
  359. adc rdx,0
  360. add r13,r10
  361. adc rdx,0
  362. mov QWORD[((-24))+r15*8+rsp],r13
  363. mov rdi,rdx
  364. mul rbx
  365. add r11,rax
  366. mov rax,QWORD[((-8))+r15*8+rcx]
  367. adc rdx,0
  368. mov r10,rdx
  369. mul rbp
  370. add rdi,rax
  371. mov rax,QWORD[rsi] ; preload rsi[0] for the next row
  372. adc rdx,0
  373. add rdi,r11
  374. adc rdx,0
  375. mov QWORD[((-16))+r15*8+rsp],rdi
  376. mov r13,rdx
  377. xor rdi,rdi
  378. add r13,r10
  379. adc rdi,0
  380. mov QWORD[((-8))+r15*8+rsp],r13
  381. mov QWORD[r15*8+rsp],rdi ; extra top word (carry out of the row)
  382. lea r14,[1+r14]
  383. ALIGN 4
  ; Remaining rows: as above, but every column also adds the word already
  ; held in the scratch area.
  384. $L$outer4x:
  385. mov rbx,QWORD[r14*8+r12] ; next word of the second operand
  386. xor r15,r15
  387. mov r10,QWORD[rsp]
  388. mov rbp,r8
  389. mul rbx
  390. add r10,rax
  391. mov rax,QWORD[rcx]
  392. adc rdx,0
  393. imul rbp,r10 ; new per-row factor
  394. mov r11,rdx
  395. mul rbp
  396. add r10,rax ; low word is dropped; only the carry is kept
  397. mov rax,QWORD[8+rsi]
  398. adc rdx,0
  399. mov rdi,rdx
  400. mul rbx
  401. add r11,rax
  402. mov rax,QWORD[8+rcx]
  403. adc rdx,0
  404. add r11,QWORD[8+rsp]
  405. adc rdx,0
  406. mov r10,rdx
  407. mul rbp
  408. add rdi,rax
  409. mov rax,QWORD[16+rsi]
  410. adc rdx,0
  411. add rdi,r11
  412. lea r15,[4+r15]
  413. adc rdx,0
  414. mov QWORD[rsp],rdi
  415. mov r13,rdx
  416. jmp NEAR $L$inner4x
  417. ALIGN 16
  418. $L$inner4x:
  419. mul rbx
  420. add r10,rax
  421. mov rax,QWORD[((-16))+r15*8+rcx]
  422. adc rdx,0
  423. add r10,QWORD[((-16))+r15*8+rsp]
  424. adc rdx,0
  425. mov r11,rdx
  426. mul rbp
  427. add r13,rax
  428. mov rax,QWORD[((-8))+r15*8+rsi]
  429. adc rdx,0
  430. add r13,r10
  431. adc rdx,0
  432. mov QWORD[((-24))+r15*8+rsp],r13
  433. mov rdi,rdx
  434. mul rbx
  435. add r11,rax
  436. mov rax,QWORD[((-8))+r15*8+rcx]
  437. adc rdx,0
  438. add r11,QWORD[((-8))+r15*8+rsp]
  439. adc rdx,0
  440. mov r10,rdx
  441. mul rbp
  442. add rdi,rax
  443. mov rax,QWORD[r15*8+rsi]
  444. adc rdx,0
  445. add rdi,r11
  446. adc rdx,0
  447. mov QWORD[((-16))+r15*8+rsp],rdi
  448. mov r13,rdx
  449. mul rbx
  450. add r10,rax
  451. mov rax,QWORD[r15*8+rcx]
  452. adc rdx,0
  453. add r10,QWORD[r15*8+rsp]
  454. adc rdx,0
  455. mov r11,rdx
  456. mul rbp
  457. add r13,rax
  458. mov rax,QWORD[8+r15*8+rsi]
  459. adc rdx,0
  460. add r13,r10
  461. adc rdx,0
  462. mov QWORD[((-8))+r15*8+rsp],r13
  463. mov rdi,rdx
  464. mul rbx
  465. add r11,rax
  466. mov rax,QWORD[8+r15*8+rcx]
  467. adc rdx,0
  468. add r11,QWORD[8+r15*8+rsp]
  469. adc rdx,0
  470. lea r15,[4+r15]
  471. mov r10,rdx
  472. mul rbp
  473. add rdi,rax
  474. mov rax,QWORD[((-16))+r15*8+rsi]
  475. adc rdx,0
  476. add rdi,r11
  477. adc rdx,0
  478. mov QWORD[((-32))+r15*8+rsp],rdi
  479. mov r13,rdx
  480. cmp r15,r9
  481. jb NEAR $L$inner4x
  ; Row tail for the remaining rows.
  482. mul rbx
  483. add r10,rax
  484. mov rax,QWORD[((-16))+r15*8+rcx]
  485. adc rdx,0
  486. add r10,QWORD[((-16))+r15*8+rsp]
  487. adc rdx,0
  488. mov r11,rdx
  489. mul rbp
  490. add r13,rax
  491. mov rax,QWORD[((-8))+r15*8+rsi]
  492. adc rdx,0
  493. add r13,r10
  494. adc rdx,0
  495. mov QWORD[((-24))+r15*8+rsp],r13
  496. mov rdi,rdx
  497. mul rbx
  498. add r11,rax
  499. mov rax,QWORD[((-8))+r15*8+rcx]
  500. adc rdx,0
  501. add r11,QWORD[((-8))+r15*8+rsp]
  502. adc rdx,0
  503. lea r14,[1+r14] ; advance the row index (lea leaves flags alone)
  504. mov r10,rdx
  505. mul rbp
  506. add rdi,rax
  507. mov rax,QWORD[rsi] ; preload rsi[0] for the next row
  508. adc rdx,0
  509. add rdi,r11
  510. adc rdx,0
  511. mov QWORD[((-16))+r15*8+rsp],rdi
  512. mov r13,rdx
  513. xor rdi,rdi
  514. add r13,r10
  515. adc rdi,0
  516. add r13,QWORD[r9*8+rsp] ; add previous row's extra top word
  517. adc rdi,0
  518. mov QWORD[((-8))+r15*8+rsp],r13
  519. mov QWORD[r15*8+rsp],rdi
  520. cmp r14,r9
  521. jb NEAR $L$outer4x
  ; Final step: dest[] = scratch[] - rcx[], four words per iteration.
  ; The borrow is carried in CF; mov, lea and dec do not modify CF.
  522. mov rdi,QWORD[16+r9*8+rsp] ; reload the parked destination pointer
  523. lea r15,[((-4))+r9]
  524. mov rax,QWORD[rsp]
  525. mov rdx,QWORD[8+rsp]
  526. shr r15,2 ; r15 = (count-4)/4 loop iterations
  527. lea rsi,[rsp] ; rsi now addresses the scratch area
  528. xor r14,r14
  529. sub rax,QWORD[rcx]
  530. mov rbx,QWORD[16+rsi]
  531. mov rbp,QWORD[24+rsi]
  532. sbb rdx,QWORD[8+rcx]
  533. $L$sub4x:
  534. mov QWORD[r14*8+rdi],rax
  535. mov QWORD[8+r14*8+rdi],rdx
  536. sbb rbx,QWORD[16+r14*8+rcx]
  537. mov rax,QWORD[32+r14*8+rsi]
  538. mov rdx,QWORD[40+r14*8+rsi]
  539. sbb rbp,QWORD[24+r14*8+rcx]
  540. mov QWORD[16+r14*8+rdi],rbx
  541. mov QWORD[24+r14*8+rdi],rbp
  542. sbb rax,QWORD[32+r14*8+rcx]
  543. mov rbx,QWORD[48+r14*8+rsi]
  544. mov rbp,QWORD[56+r14*8+rsi]
  545. sbb rdx,QWORD[40+r14*8+rcx]
  546. lea r14,[4+r14]
  547. dec r15
  548. jnz NEAR $L$sub4x
  549. mov QWORD[r14*8+rdi],rax
  550. mov rax,QWORD[32+r14*8+rsi] ; extra top word of the scratch area
  551. sbb rbx,QWORD[16+r14*8+rcx]
  552. mov QWORD[8+r14*8+rdi],rdx
  553. sbb rbp,QWORD[24+r14*8+rcx]
  554. mov QWORD[16+r14*8+rdi],rbx
  555. sbb rax,0 ; rax = top word minus final borrow, used as the select mask
  556. mov QWORD[24+r14*8+rdi],rbp
  ; Build SSE masks: xmm4 = low dword of rax broadcast to all lanes,
  ; xmm5 = NOT xmm4, xmm0 = 0 (used to wipe the scratch area).
  557. pxor xmm0,xmm0
  558. DB 102,72,15,110,224 ; hand-encoded movq xmm4,rax
  559. pcmpeqd xmm5,xmm5
  560. pshufd xmm4,xmm4,0
  561. mov r15,r9
  562. pxor xmm5,xmm4
  563. shr r15,2 ; count/4 iterations of 32 bytes each
  564. xor eax,eax ; rax becomes the byte offset
  565. jmp NEAR $L$copy4x
  566. ALIGN 16
  ; Branch-free select, 32 bytes per iteration:
  ; dest = (scratch & xmm4) | (dest & xmm5); scratch is zeroed as it is read.
  567. $L$copy4x:
  568. movdqa xmm1,XMMWORD[rax*1+rsp]
  569. movdqu xmm2,XMMWORD[rax*1+rdi]
  570. pand xmm1,xmm4
  571. pand xmm2,xmm5
  572. movdqa xmm3,XMMWORD[16+rax*1+rsp]
  573. movdqa XMMWORD[rax*1+rsp],xmm0
  574. por xmm1,xmm2
  575. movdqu xmm2,XMMWORD[16+rax*1+rdi]
  576. movdqu XMMWORD[rax*1+rdi],xmm1
  577. pand xmm3,xmm4
  578. pand xmm2,xmm5
  579. movdqa XMMWORD[16+rax*1+rsp],xmm0
  580. por xmm3,xmm2
  581. movdqu XMMWORD[16+rax*1+rdi],xmm3
  582. lea rax,[32+rax]
  583. dec r15
  584. jnz NEAR $L$copy4x
  585. mov rsi,QWORD[8+r9*8+rsp] ; rsi = entry rsp saved before $L$mul4x_body
  586. mov rax,1 ; return value
  587. mov r15,QWORD[((-48))+rsi] ; reload the six pushed registers
  588. mov r14,QWORD[((-40))+rsi]
  589. mov r13,QWORD[((-32))+rsi]
  590. mov r12,QWORD[((-24))+rsi]
  591. mov rbp,QWORD[((-16))+rsi]
  592. mov rbx,QWORD[((-8))+rsi]
  593. lea rsp,[rsi]
  594. $L$mul4x_epilogue:
  595. mov rdi,QWORD[8+rsp] ;WIN64 epilogue
  596. mov rsi,QWORD[16+rsp]
  597. DB 0F3h,0C3h ;repret
  598. $L$SEH_end_bn_mul4x_mont:
  ; ---------------------------------------------------------------------
  ; bn_sqr8x_mont -- squaring path. Sets up an aligned scratch frame,
  ; delegates the squaring/reduction to one of two routines defined
  ; outside this file, then performs the final subtraction and a
  ; branch-free conditional copy into the destination.
  ;
  ; $L$sqr8x_enter is reached by a jump from GFp_bn_mul_mont when both
  ; operand pointers are equal and the word count is a multiple of 8.
  ; Register roles on entry match GFp_bn_mul_mont (rdi dest, rsi operand,
  ; rcx subtracted array, r8 -> constant, r9 word count, rax entry rsp).
  ; Frame slots used here: [32+rsp] = constant, [40+rsp] = entry rsp.
  ; Returns rax = 1.
  ; ---------------------------------------------------------------------
  599. EXTERN GFp_bn_sqrx8x_internal
  600. EXTERN GFp_bn_sqr8x_internal
  601. ALIGN 32
  602. bn_sqr8x_mont:
  603. mov QWORD[8+rsp],rdi ;WIN64 prologue
  604. mov QWORD[16+rsp],rsi
  605. mov rax,rsp
  606. $L$SEH_begin_bn_sqr8x_mont:
  607. mov rdi,rcx
  608. mov rsi,rdx
  609. mov rdx,r8
  610. mov rcx,r9
  611. mov r8,QWORD[40+rsp]
  612. mov r9,QWORD[48+rsp]
  613. mov rax,rsp
  614. $L$sqr8x_enter:
  615. push rbx
  616. push rbp
  617. push r12
  618. push r13
  619. push r14
  620. push r15
  621. $L$sqr8x_prologue:
  ; r9 = -(8*count) bytes, r10 = 32*count bytes. The frame needs
  ; 64 + 2*8*count bytes; its position is chosen relative to rsi modulo
  ; 4096 (presumably to avoid cache/page aliasing with the operand --
  ; confirm against the upstream Perl script).
  622. mov r10d,r9d
  623. shl r9d,3
  624. shl r10,3+2
  625. neg r9
  626. lea r11,[((-64))+r9*2+rsp]
  627. mov rbp,rsp
  628. mov r8,QWORD[r8] ; r8 = the qword constant itself
  629. sub r11,rsi
  630. and r11,4095 ; r11 = (candidate frame - rsi) mod 4096
  631. cmp r10,r11
  632. jb NEAR $L$sqr8x_sp_alt
  633. sub rbp,r11
  634. lea rbp,[((-64))+r9*2+rbp]
  635. jmp NEAR $L$sqr8x_sp_done
  636. ALIGN 32
  637. $L$sqr8x_sp_alt:
  638. lea r10,[((4096-64))+r9*2]
  639. lea rbp,[((-64))+r9*2+rbp]
  640. sub r11,r10
  641. mov r10,0 ; mov (not xor) so the CF from the sub above survives for cmovc
  642. cmovc r11,r10 ; clamp the adjustment to 0 when the sub borrowed
  643. sub rbp,r11
  644. $L$sqr8x_sp_done:
  ; Align the frame to 64 bytes and walk rsp down to it one page at a
  ; time, touching each page.
  645. and rbp,-64
  646. mov r11,rsp
  647. sub r11,rbp
  648. and r11,-4096
  649. lea rsp,[rbp*1+r11]
  650. mov r10,QWORD[rsp]
  651. cmp rsp,rbp
  652. ja NEAR $L$sqr8x_page_walk
  653. jmp NEAR $L$sqr8x_page_walk_done
  654. ALIGN 16
  655. $L$sqr8x_page_walk:
  656. lea rsp,[((-4096))+rsp]
  657. mov r10,QWORD[rsp]
  658. cmp rsp,rbp
  659. ja NEAR $L$sqr8x_page_walk
  660. $L$sqr8x_page_walk_done:
  661. mov r10,r9 ; r10 = -(8*count)
  662. neg r9 ; r9 = +(8*count)
  663. mov QWORD[32+rsp],r8 ; frame: constant
  664. mov QWORD[40+rsp],rax ; frame: entry rsp (read back by sqr_handler and the epilogue)
  665. $L$sqr8x_body:
  ; Park values in XMM registers across the call below.
  666. DB 102,72,15,110,209 ; hand-encoded movq xmm2,rcx
  667. pxor xmm0,xmm0
  668. DB 102,72,15,110,207 ; hand-encoded movq xmm1,rdi (destination pointer)
  669. DB 102,73,15,110,218 ; hand-encoded movq xmm3,r10
  ; Both bits 8 and 19 of the capability word set -> call the "sqrx"
  ; variant, otherwise the plain one.
  670. mov eax,DWORD[((GFp_ia32cap_P+8))]
  671. and eax,0x80100
  672. cmp eax,0x80100
  673. jne NEAR $L$sqr8x_nox
  674. call GFp_bn_sqrx8x_internal
  ; The code below consumes rax, rbp, rcx and r8 as left by the callee.
  ; NOTE(review): their exact meaning is defined by the external routine;
  ; from the usage here rcx looks like a negative byte count and r8 like
  ; the end of the result area -- confirm.
  675. lea rbx,[rcx*1+r8]
  676. mov r9,rcx
  677. mov rdx,rcx
  678. DB 102,72,15,126,207 ; hand-encoded movq rdi,xmm1 (restore destination pointer)
  679. sar rcx,3+2 ; rcx = negative count of 32-byte groups; sub loop starts with CF = 0 (NOTE(review): holds only if the discarded low 5 bits are zero -- confirm)
  680. jmp NEAR $L$sqr8x_sub
  681. ALIGN 32
  682. $L$sqr8x_nox:
  683. call GFp_bn_sqr8x_internal
  ; Same post-processing, but this callee leaves its outputs in r9 and rdi
  ; (NOTE(review): as inferred from the usage below -- confirm).
  684. lea rbx,[r9*1+rdi]
  685. mov rcx,r9
  686. mov rdx,r9
  687. DB 102,72,15,126,207 ; hand-encoded movq rdi,xmm1 (restore destination pointer)
  688. sar rcx,3+2 ; see the note on the sar in the sqrx branch above
  689. jmp NEAR $L$sqr8x_sub
  690. ALIGN 32
  ; dest[] = rbx[] - rbp[], four words per iteration. rcx counts up from
  ; a negative value; mov, lea and inc do not modify CF, so the borrow
  ; chains through the sbb instructions across iterations.
  691. $L$sqr8x_sub:
  692. mov r12,QWORD[rbx]
  693. mov r13,QWORD[8+rbx]
  694. mov r14,QWORD[16+rbx]
  695. mov r15,QWORD[24+rbx]
  696. lea rbx,[32+rbx]
  697. sbb r12,QWORD[rbp]
  698. sbb r13,QWORD[8+rbp]
  699. sbb r14,QWORD[16+rbp]
  700. sbb r15,QWORD[24+rbp]
  701. lea rbp,[32+rbp]
  702. mov QWORD[rdi],r12
  703. mov QWORD[8+rdi],r13
  704. mov QWORD[16+rdi],r14
  705. mov QWORD[24+rdi],r15
  706. lea rdi,[32+rdi]
  707. inc rcx
  708. jnz NEAR $L$sqr8x_sub
  709. sbb rax,0 ; fold the final borrow into rax (returned by the callee); becomes the select mask
  710. lea rbx,[r9*1+rbx] ; r9 is negative here: rewind the source pointer
  711. lea rdi,[r9*1+rdi] ; rewind the destination pointer
  712. DB 102,72,15,110,200 ; hand-encoded movq xmm1,rax
  713. pxor xmm0,xmm0
  714. pshufd xmm1,xmm1,0 ; broadcast the low dword of the mask to all lanes
  715. mov rsi,QWORD[40+rsp] ; rsi = entry rsp saved in the frame
  716. jmp NEAR $L$sqr8x_cond_copy
  717. ALIGN 32
  ; Branch-free select, 32 bytes per iteration:
  ; dest = (src & xmm1) | (dest & (xmm1 == 0 ? all-ones : 0)).
  ; The source area is zeroed as it is read, together with the area rdx
  ; bytes away from it (rdx is negative here).
  718. $L$sqr8x_cond_copy:
  719. movdqa xmm2,XMMWORD[rbx]
  720. movdqa xmm3,XMMWORD[16+rbx]
  721. lea rbx,[32+rbx]
  722. movdqu xmm4,XMMWORD[rdi]
  723. movdqu xmm5,XMMWORD[16+rdi]
  724. lea rdi,[32+rdi]
  725. movdqa XMMWORD[(-32)+rbx],xmm0
  726. movdqa XMMWORD[(-16)+rbx],xmm0
  727. movdqa XMMWORD[(-32)+rdx*1+rbx],xmm0
  728. movdqa XMMWORD[(-16)+rdx*1+rbx],xmm0
  729. pcmpeqd xmm0,xmm1 ; xmm0 = complement mask (all-ones where xmm1 is zero)
  730. pand xmm2,xmm1
  731. pand xmm3,xmm1
  732. pand xmm4,xmm0
  733. pand xmm5,xmm0
  734. pxor xmm0,xmm0 ; back to zero for the next iteration's wipes
  735. por xmm4,xmm2
  736. por xmm5,xmm3
  737. movdqu XMMWORD[(-32)+rdi],xmm4
  738. movdqu XMMWORD[(-16)+rdi],xmm5
  739. add r9,32 ; r9 counts up from -(bytes) to 0
  740. jnz NEAR $L$sqr8x_cond_copy
  741. mov rax,1 ; return value
  742. mov r15,QWORD[((-48))+rsi] ; reload the six pushed registers
  743. mov r14,QWORD[((-40))+rsi]
  744. mov r13,QWORD[((-32))+rsi]
  745. mov r12,QWORD[((-24))+rsi]
  746. mov rbp,QWORD[((-16))+rsi]
  747. mov rbx,QWORD[((-8))+rsi]
  748. lea rsp,[rsi]
  749. $L$sqr8x_epilogue:
  750. mov rdi,QWORD[8+rsp] ;WIN64 epilogue
  751. mov rsi,QWORD[16+rsp]
  752. DB 0F3h,0C3h ;repret
  753. $L$SEH_end_bn_sqr8x_mont:
  ; ---------------------------------------------------------------------
  ; bn_mulx4x_mont -- 4x-unrolled multiply/reduce using MULX with two
  ; independent carry chains (ADCX uses CF, ADOX uses OF).
  ;
  ; $L$mulx4x_enter is reached by a jump from $L$mul4x_enter when bits 8
  ; and 19 of the capability word are both set. Register roles on entry
  ; match GFp_bn_mul_mont (rdi dest, rsi/rdx operands, rcx subtracted
  ; array, r8 -> constant, r9 word count, rax entry rsp).
  ;
  ; Frame layout established below:
  ;   [rsp]      byte length (8 * count)
  ;   [8+rsp]    pointer to the next word of the rdx operand
  ;   [16+rsp]   end pointer of the rdx operand (rdx + byte length)
  ;   [24+rsp]   the qword constant loaded from [r8]
  ;   [32+rsp]   destination pointer
  ;   [40+rsp]   entry rsp
  ;   [48+rsp]   inner-loop trip count (count/4 - 1)
  ;   [64+rsp].. scratch result words
  ; rbp is kept at zero inside the loops and used as the "zero" operand.
  ; Returns rax = 1.
  ; ---------------------------------------------------------------------
  754. ALIGN 32
  755. bn_mulx4x_mont:
  756. mov QWORD[8+rsp],rdi ;WIN64 prologue
  757. mov QWORD[16+rsp],rsi
  758. mov rax,rsp
  759. $L$SEH_begin_bn_mulx4x_mont:
  760. mov rdi,rcx
  761. mov rsi,rdx
  762. mov rdx,r8
  763. mov rcx,r9
  764. mov r8,QWORD[40+rsp]
  765. mov r9,QWORD[48+rsp]
  766. mov rax,rsp
  767. $L$mulx4x_enter:
  768. push rbx
  769. push rbp
  770. push r12
  771. push r13
  772. push r14
  773. push r15
  774. $L$mulx4x_prologue:
  775. shl r9d,3 ; r9 = byte length (also zero-extends)
  776. xor r10,r10
  777. sub r10,r9 ; r10 = -byte length
  778. mov r8,QWORD[r8] ; r8 = the qword constant itself
  779. lea rbp,[((-72))+r10*1+rsp]
  780. and rbp,-128 ; frame base aligned to 128 bytes
  ; Walk rsp down to the frame base one page at a time, touching each page.
  781. mov r11,rsp
  782. sub r11,rbp
  783. and r11,-4096
  784. lea rsp,[rbp*1+r11]
  785. mov r10,QWORD[rsp]
  786. cmp rsp,rbp
  787. ja NEAR $L$mulx4x_page_walk
  788. jmp NEAR $L$mulx4x_page_walk_done
  789. ALIGN 16
  790. $L$mulx4x_page_walk:
  791. lea rsp,[((-4096))+rsp]
  792. mov r10,QWORD[rsp]
  793. cmp rsp,rbp
  794. ja NEAR $L$mulx4x_page_walk
  795. $L$mulx4x_page_walk_done:
  796. lea r10,[r9*1+rdx] ; end pointer of the rdx operand
  797. mov QWORD[rsp],r9
  798. shr r9,5 ; bytes/32 = count/4
  799. mov QWORD[16+rsp],r10
  800. sub r9,1
  801. mov QWORD[24+rsp],r8
  802. mov QWORD[32+rsp],rdi
  803. mov QWORD[40+rsp],rax ; entry rsp (read back by sqr_handler and the epilogue)
  804. mov QWORD[48+rsp],r9
  805. jmp NEAR $L$mulx4x_body
  806. ALIGN 32
  807. $L$mulx4x_body:
  ; First row, first four columns. rdx is MULX's implicit multiplicand:
  ; it alternates between the operand word (copy in r9) and the per-row
  ; factor (r8).
  808. lea rdi,[8+rdx] ; rdi = pointer to the next operand word
  809. mov rdx,QWORD[rdx]
  810. lea rbx,[((64+32))+rsp] ; rbx = scratch write pointer
  811. mov r9,rdx
  812. mulx rax,r8,QWORD[rsi]
  813. mulx r14,r11,QWORD[8+rsi]
  814. add r11,rax
  815. mov QWORD[8+rsp],rdi
  816. mulx r13,r12,QWORD[16+rsi]
  817. adc r12,r14
  818. adc r13,0
  819. mov rdi,r8 ; keep the low word; rdi is reused as a temporary
  820. imul r8,QWORD[24+rsp] ; r8 = per-row factor = low word * constant
  821. xor rbp,rbp ; rbp = 0 and CF = OF = 0
  822. mulx r14,rax,QWORD[24+rsi]
  823. mov rdx,r8
  824. lea rsi,[32+rsi]
  825. adcx r13,rax
  826. adcx r14,rbp
  827. mulx r10,rax,QWORD[rcx]
  828. adcx rdi,rax ; low word is dropped; only the carry is kept
  829. adox r10,r11
  830. mulx r11,rax,QWORD[8+rcx]
  831. adcx r10,rax
  832. adox r11,r12
  833. DB 0xc4,0x62,0xfb,0xf6,0xa1,0x10,0x00,0x00,0x00 ; hand-encoded mulx r12,rax,QWORD[16+rcx] with a 32-bit displacement
  834. mov rdi,QWORD[48+rsp] ; rdi = inner-loop trip count
  835. mov QWORD[((-32))+rbx],r10
  836. adcx r11,rax
  837. adox r12,r13
  838. mulx r15,rax,QWORD[24+rcx]
  839. mov rdx,r9
  840. mov QWORD[((-24))+rbx],r11
  841. adcx r12,rax
  842. adox r15,rbp
  843. lea rcx,[32+rcx]
  844. mov QWORD[((-16))+rbx],r12
  845. jmp NEAR $L$mulx4x_1st
  846. ALIGN 32
  ; First row, remaining columns in groups of four.
  847. $L$mulx4x_1st:
  848. adcx r15,rbp
  849. mulx rax,r10,QWORD[rsi]
  850. adcx r10,r14
  851. mulx r14,r11,QWORD[8+rsi]
  852. adcx r11,rax
  853. mulx rax,r12,QWORD[16+rsi]
  854. adcx r12,r14
  855. mulx r14,r13,QWORD[24+rsi]
  856. DB 0x67,0x67 ; two 0x67 prefix bytes ahead of the next instruction (presumably padding -- confirm)
  857. mov rdx,r8
  858. adcx r13,rax
  859. adcx r14,rbp
  860. lea rsi,[32+rsi]
  861. lea rbx,[32+rbx]
  862. adox r10,r15
  863. mulx r15,rax,QWORD[rcx]
  864. adcx r10,rax
  865. adox r11,r15
  866. mulx r15,rax,QWORD[8+rcx]
  867. adcx r11,rax
  868. adox r12,r15
  869. mulx r15,rax,QWORD[16+rcx]
  870. mov QWORD[((-40))+rbx],r10
  871. adcx r12,rax
  872. mov QWORD[((-32))+rbx],r11
  873. adox r13,r15
  874. mulx r15,rax,QWORD[24+rcx]
  875. mov rdx,r9
  876. mov QWORD[((-24))+rbx],r12
  877. adcx r13,rax
  878. adox r15,rbp
  879. lea rcx,[32+rcx]
  880. mov QWORD[((-16))+rbx],r13
  881. dec rdi ; dec leaves CF untouched, so the ADCX chain continues
  882. jnz NEAR $L$mulx4x_1st
  883. mov rax,QWORD[rsp] ; rax = byte length
  884. mov rdi,QWORD[8+rsp] ; rdi = pointer to the next operand word
  885. adc r15,rbp
  886. add r14,r15
  887. sbb r15,r15 ; r15 = 0 or -1: carry out of the row
  888. mov QWORD[((-8))+rbx],r14
  889. jmp NEAR $L$mulx4x_outer
  890. ALIGN 32
  ; Remaining rows: each column additionally adds the scratch word from
  ; the previous row.
  891. $L$mulx4x_outer:
  892. mov rdx,QWORD[rdi]
  893. lea rdi,[8+rdi]
  894. sub rsi,rax ; rewind rsi to the start of its array
  895. mov QWORD[rbx],r15 ; store the previous row's carry word
  896. lea rbx,[((64+32))+rsp]
  897. sub rcx,rax ; rewind rcx to the start of its array
  898. mulx r11,r8,QWORD[rsi]
  899. xor ebp,ebp ; rbp = 0 and CF = OF = 0
  900. mov r9,rdx
  901. mulx r12,r14,QWORD[8+rsi]
  902. adox r8,QWORD[((-32))+rbx]
  903. adcx r11,r14
  904. mulx r13,r15,QWORD[16+rsi]
  905. adox r11,QWORD[((-24))+rbx]
  906. adcx r12,r15
  907. adox r12,QWORD[((-16))+rbx]
  908. adcx r13,rbp
  909. adox r13,rbp
  910. mov QWORD[8+rsp],rdi
  911. mov r15,r8
  912. imul r8,QWORD[24+rsp] ; r8 = per-row factor
  913. xor ebp,ebp ; reset both carry chains
  914. mulx r14,rax,QWORD[24+rsi]
  915. mov rdx,r8
  916. adcx r13,rax
  917. adox r13,QWORD[((-8))+rbx]
  918. adcx r14,rbp
  919. lea rsi,[32+rsi]
  920. adox r14,rbp
  921. mulx r10,rax,QWORD[rcx]
  922. adcx r15,rax ; low word is dropped; only the carry is kept
  923. adox r10,r11
  924. mulx r11,rax,QWORD[8+rcx]
  925. adcx r10,rax
  926. adox r11,r12
  927. mulx r12,rax,QWORD[16+rcx]
  928. mov QWORD[((-32))+rbx],r10
  929. adcx r11,rax
  930. adox r12,r13
  931. mulx r15,rax,QWORD[24+rcx]
  932. mov rdx,r9
  933. mov QWORD[((-24))+rbx],r11
  934. lea rcx,[32+rcx]
  935. adcx r12,rax
  936. adox r15,rbp
  937. mov rdi,QWORD[48+rsp] ; rdi = inner-loop trip count
  938. mov QWORD[((-16))+rbx],r12
  939. jmp NEAR $L$mulx4x_inner
  940. ALIGN 32
  941. $L$mulx4x_inner:
  942. mulx rax,r10,QWORD[rsi]
  943. adcx r15,rbp
  944. adox r10,r14
  945. mulx r14,r11,QWORD[8+rsi]
  946. adcx r10,QWORD[rbx]
  947. adox r11,rax
  948. mulx rax,r12,QWORD[16+rsi]
  949. adcx r11,QWORD[8+rbx]
  950. adox r12,r14
  951. mulx r14,r13,QWORD[24+rsi]
  952. mov rdx,r8
  953. adcx r12,QWORD[16+rbx]
  954. adox r13,rax
  955. adcx r13,QWORD[24+rbx]
  956. adox r14,rbp
  957. lea rsi,[32+rsi]
  958. lea rbx,[32+rbx]
  959. adcx r14,rbp
  960. adox r10,r15
  961. mulx r15,rax,QWORD[rcx]
  962. adcx r10,rax
  963. adox r11,r15
  964. mulx r15,rax,QWORD[8+rcx]
  965. adcx r11,rax
  966. adox r12,r15
  967. mulx r15,rax,QWORD[16+rcx]
  968. mov QWORD[((-40))+rbx],r10
  969. adcx r12,rax
  970. adox r13,r15
  971. mulx r15,rax,QWORD[24+rcx]
  972. mov rdx,r9
  973. mov QWORD[((-32))+rbx],r11
  974. mov QWORD[((-24))+rbx],r12
  975. adcx r13,rax
  976. adox r15,rbp
  977. lea rcx,[32+rcx]
  978. mov QWORD[((-16))+rbx],r13
  979. dec rdi ; dec leaves CF untouched, so the ADCX chain continues
  980. jnz NEAR $L$mulx4x_inner
  981. mov rax,QWORD[rsp] ; rax = byte length
  982. mov rdi,QWORD[8+rsp] ; rdi = pointer to the next operand word
  983. adc r15,rbp
  984. sub rbp,QWORD[rbx] ; 0 - stored carry word: sets CF when that word is non-zero
  985. adc r14,r15
  986. sbb r15,r15 ; r15 = 0 or -1: carry out of this row
  987. mov QWORD[((-8))+rbx],r14
  988. cmp rdi,QWORD[16+rsp]
  989. jne NEAR $L$mulx4x_outer ; loop until the operand's end pointer is reached
  ; Final step: dest[] = scratch[] - rcx[], four words per iteration.
  990. lea rbx,[64+rsp]
  991. sub rcx,rax ; rewind rcx to the start of its array
  992. neg r15 ; r15 = 0 or 1
  993. mov rdx,rax ; rdx = byte length, used by the copy loop
  994. shr rax,3+2 ; rax = number of 32-byte groups; loop starts with CF = 0 (NOTE(review): relies on the byte length being a multiple of 32 -- confirm)
  995. mov rdi,QWORD[32+rsp] ; rdi = destination pointer
  996. jmp NEAR $L$mulx4x_sub
  997. ALIGN 32
  ; mov, lea and dec do not modify CF, so the borrow chains through the
  ; sbb instructions across iterations.
  998. $L$mulx4x_sub:
  999. mov r11,QWORD[rbx]
  1000. mov r12,QWORD[8+rbx]
  1001. mov r13,QWORD[16+rbx]
  1002. mov r14,QWORD[24+rbx]
  1003. lea rbx,[32+rbx]
  1004. sbb r11,QWORD[rcx]
  1005. sbb r12,QWORD[8+rcx]
  1006. sbb r13,QWORD[16+rcx]
  1007. sbb r14,QWORD[24+rcx]
  1008. lea rcx,[32+rcx]
  1009. mov QWORD[rdi],r11
  1010. mov QWORD[8+rdi],r12
  1011. mov QWORD[16+rdi],r13
  1012. mov QWORD[24+rdi],r14
  1013. lea rdi,[32+rdi]
  1014. dec rax
  1015. jnz NEAR $L$mulx4x_sub
  1016. sbb r15,0 ; fold the final borrow into the carry word; becomes the select mask
  1017. lea rbx,[64+rsp]
  1018. sub rdi,rdx ; rewind the destination pointer
  1019. DB 102,73,15,110,207 ; hand-encoded movq xmm1,r15
  1020. pxor xmm0,xmm0
  1021. pshufd xmm1,xmm1,0 ; broadcast the low dword of the mask to all lanes
  1022. mov rsi,QWORD[40+rsp] ; rsi = entry rsp saved in the frame
  1023. jmp NEAR $L$mulx4x_cond_copy
  1024. ALIGN 32
  ; Branch-free select, 32 bytes per iteration:
  ; dest = (scratch & xmm1) | (dest & (xmm1 == 0 ? all-ones : 0)).
  ; The scratch words are zeroed as they are read.
  1025. $L$mulx4x_cond_copy:
  1026. movdqa xmm2,XMMWORD[rbx]
  1027. movdqa xmm3,XMMWORD[16+rbx]
  1028. lea rbx,[32+rbx]
  1029. movdqu xmm4,XMMWORD[rdi]
  1030. movdqu xmm5,XMMWORD[16+rdi]
  1031. lea rdi,[32+rdi]
  1032. movdqa XMMWORD[(-32)+rbx],xmm0
  1033. movdqa XMMWORD[(-16)+rbx],xmm0
  1034. pcmpeqd xmm0,xmm1 ; xmm0 = complement mask (all-ones where xmm1 is zero)
  1035. pand xmm2,xmm1
  1036. pand xmm3,xmm1
  1037. pand xmm4,xmm0
  1038. pand xmm5,xmm0
  1039. pxor xmm0,xmm0 ; back to zero for the next iteration's wipes
  1040. por xmm4,xmm2
  1041. por xmm5,xmm3
  1042. movdqu XMMWORD[(-32)+rdi],xmm4
  1043. movdqu XMMWORD[(-16)+rdi],xmm5
  1044. sub rdx,32
  1045. jnz NEAR $L$mulx4x_cond_copy
  1046. mov QWORD[rbx],rdx ; rdx is 0 here: clear the extra top scratch word
  1047. mov rax,1 ; return value
  1048. mov r15,QWORD[((-48))+rsi] ; reload the six pushed registers
  1049. mov r14,QWORD[((-40))+rsi]
  1050. mov r13,QWORD[((-32))+rsi]
  1051. mov r12,QWORD[((-24))+rsi]
  1052. mov rbp,QWORD[((-16))+rsi]
  1053. mov rbx,QWORD[((-8))+rsi]
  1054. lea rsp,[rsi]
  1055. $L$mulx4x_epilogue:
  1056. mov rdi,QWORD[8+rsp] ;WIN64 epilogue
  1057. mov rsi,QWORD[16+rsp]
  1058. DB 0F3h,0C3h ;repret
  1059. $L$SEH_end_bn_mulx4x_mont:
  ; NUL-terminated ASCII identification string embedded in .text:
  ; "Montgomery Multiplication for x86_64, CRYPTOGAMS by <appro@openssl.org>"
  1060. DB 77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105
  1061. DB 112,108,105,99,97,116,105,111,110,32,102,111,114,32,120,56
  1062. DB 54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83
  1063. DB 32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115
  1064. DB 115,108,46,111,114,103,62,0
  1065. ALIGN 16
  ; ---------------------------------------------------------------------
  ; Win64 structured-exception handlers for the routines above.
  ;
  ; mul_handler serves GFp_bn_mul_mont and bn_mul4x_mont; sqr_handler
  ; serves bn_sqr8x_mont and bn_mulx4x_mont (see the .xdata records).
  ; Both receive the context record in r8 and the dispatcher context in
  ; r9, recover the interrupted function's entry rsp and saved registers,
  ; patch them into the context, and then share $L$common_seh_tail, which
  ; copies the context and calls RtlVirtualUnwind. Both return eax = 1.
  ;
  ; The numeric offsets are presumably those of the Windows x64 CONTEXT
  ; (120 Rax, 144 Rbx, 152 Rsp, 160 Rbp, 168 Rsi, 176 Rdi, 192 R9,
  ; 216-240 R12-R15, 248 Rip) and DISPATCHER_CONTEXT (8 ImageBase,
  ; 40 ContextRecord, 56 HandlerData) structures -- confirm against the
  ; Windows SDK headers.
  ; ---------------------------------------------------------------------
  1066. EXTERN __imp_RtlVirtualUnwind
  1067. ALIGN 16
  1068. mul_handler:
  1069. push rsi
  1070. push rdi
  1071. push rbx
  1072. push rbp
  1073. push r12
  1074. push r13
  1075. push r14
  1076. push r15
  1077. pushfq
  1078. sub rsp,64 ; room for the outgoing RtlVirtualUnwind arguments
  1079. mov rax,QWORD[120+r8] ; rax = context Rax (entry rsp while still in the prologue)
  1080. mov rbx,QWORD[248+r8] ; rbx = faulting Rip
  1081. mov rsi,QWORD[8+r9] ; rsi = image base
  1082. mov r11,QWORD[56+r9] ; r11 = handler data (label RVAs from .xdata)
  1083. mov r10d,DWORD[r11] ; first RVA: body label
  1084. lea r10,[r10*1+rsi]
  1085. cmp rbx,r10
  1086. jb NEAR $L$common_seh_tail ; fault before the body label: nothing to restore
  1087. mov rax,QWORD[152+r8] ; rax = context Rsp
  1088. mov r10d,DWORD[4+r11] ; second RVA: epilogue label
  1089. lea r10,[r10*1+rsi]
  1090. cmp rbx,r10
  1091. jae NEAR $L$common_seh_tail ; fault at/after the epilogue label: rsp already restored
  1092. mov r10,QWORD[192+r8] ; r10 = context R9 (the word count in the mul routines)
  1093. mov rax,QWORD[8+r10*8+rax] ; entry rsp saved by the routine at [8+r9*8+rsp]
  1094. jmp NEAR $L$common_pop_regs
  1095. ALIGN 16
  1096. sqr_handler:
  1097. push rsi
  1098. push rdi
  1099. push rbx
  1100. push rbp
  1101. push r12
  1102. push r13
  1103. push r14
  1104. push r15
  1105. pushfq
  1106. sub rsp,64 ; room for the outgoing RtlVirtualUnwind arguments
  1107. mov rax,QWORD[120+r8] ; rax = context Rax (entry rsp while still in the prologue)
  1108. mov rbx,QWORD[248+r8] ; rbx = faulting Rip
  1109. mov rsi,QWORD[8+r9] ; rsi = image base
  1110. mov r11,QWORD[56+r9] ; r11 = handler data (label RVAs from .xdata)
  1111. mov r10d,DWORD[r11] ; first RVA: prologue label (after the pushes)
  1112. lea r10,[r10*1+rsi]
  1113. cmp rbx,r10
  1114. jb NEAR $L$common_seh_tail ; fault before the prologue label: nothing to restore
  1115. mov r10d,DWORD[4+r11] ; second RVA: body label
  1116. lea r10,[r10*1+rsi]
  1117. cmp rbx,r10
  1118. jb NEAR $L$common_pop_regs ; registers pushed, frame not final: rax still holds entry rsp
  1119. mov rax,QWORD[152+r8] ; rax = context Rsp
  1120. mov r10d,DWORD[8+r11] ; third RVA: epilogue label
  1121. lea r10,[r10*1+rsi]
  1122. cmp rbx,r10
  1123. jae NEAR $L$common_seh_tail ; fault at/after the epilogue label: rsp already restored
  1124. mov rax,QWORD[40+rax] ; entry rsp saved by the routine at [40+rsp]
  1125. $L$common_pop_regs:
  ; rax = the function's entry rsp; the six pushed registers sit just
  ; below it. Copy them into the context record.
  1126. mov rbx,QWORD[((-8))+rax]
  1127. mov rbp,QWORD[((-16))+rax]
  1128. mov r12,QWORD[((-24))+rax]
  1129. mov r13,QWORD[((-32))+rax]
  1130. mov r14,QWORD[((-40))+rax]
  1131. mov r15,QWORD[((-48))+rax]
  1132. mov QWORD[144+r8],rbx
  1133. mov QWORD[160+r8],rbp
  1134. mov QWORD[216+r8],r12
  1135. mov QWORD[224+r8],r13
  1136. mov QWORD[232+r8],r14
  1137. mov QWORD[240+r8],r15
  1138. $L$common_seh_tail:
  ; rdi/rsi were saved by the WIN64 prologue at [8+entry rsp] and
  ; [16+entry rsp]; restore them and rsp into the context record.
  1139. mov rdi,QWORD[8+rax]
  1140. mov rsi,QWORD[16+rax]
  1141. mov QWORD[152+r8],rax
  1142. mov QWORD[168+r8],rsi
  1143. mov QWORD[176+r8],rdi
  ; Copy 154 qwords from the context record (r8) to the record pointed to
  ; by [40+r9].
  1144. mov rdi,QWORD[40+r9]
  1145. mov rsi,r8
  1146. mov ecx,154
  1147. DD 0xa548f3fc ; bytes FC F3 48 A5 = cld; rep movsq
  ; Call RtlVirtualUnwind: rcx = 0, rdx = [8+disp], r8 = [disp],
  ; r9 = [16+disp]; stack arguments at [32..56+rsp] are [40+disp],
  ; &[56+disp], &[24+disp] and 0.
  1148. mov rsi,r9
  1149. xor rcx,rcx
  1150. mov rdx,QWORD[8+rsi]
  1151. mov r8,QWORD[rsi]
  1152. mov r9,QWORD[16+rsi]
  1153. mov r10,QWORD[40+rsi]
  1154. lea r11,[56+rsi]
  1155. lea r12,[24+rsi]
  1156. mov QWORD[32+rsp],r10
  1157. mov QWORD[40+rsp],r11
  1158. mov QWORD[48+rsp],r12
  1159. mov QWORD[56+rsp],rcx
  1160. call QWORD[__imp_RtlVirtualUnwind]
  1161. mov eax,1 ; handler return value (presumably ExceptionContinueSearch -- confirm)
  1162. add rsp,64
  1163. popfq
  1164. pop r15
  1165. pop r14
  1166. pop r13
  1167. pop r12
  1168. pop rbp
  1169. pop rbx
  1170. pop rdi
  1171. pop rsi
  1172. DB 0F3h,0C3h ;repret
  ; ---------------------------------------------------------------------
  ; Win64 unwind tables.
  ; .pdata: one (begin RVA, end RVA, unwind-info RVA) triple per routine.
  ; .xdata: per routine, a 4-byte header, the handler RVA, then the label
  ; RVAs that the handler reads as its handler data (two for mul_handler,
  ; three for sqr_handler).
  ; ---------------------------------------------------------------------
  1173. section .pdata rdata align=4
  1174. ALIGN 4
  1175. DD $L$SEH_begin_GFp_bn_mul_mont wrt ..imagebase
  1176. DD $L$SEH_end_GFp_bn_mul_mont wrt ..imagebase
  1177. DD $L$SEH_info_GFp_bn_mul_mont wrt ..imagebase
  1178. DD $L$SEH_begin_bn_mul4x_mont wrt ..imagebase
  1179. DD $L$SEH_end_bn_mul4x_mont wrt ..imagebase
  1180. DD $L$SEH_info_bn_mul4x_mont wrt ..imagebase
  1181. DD $L$SEH_begin_bn_sqr8x_mont wrt ..imagebase
  1182. DD $L$SEH_end_bn_sqr8x_mont wrt ..imagebase
  1183. DD $L$SEH_info_bn_sqr8x_mont wrt ..imagebase
  1184. DD $L$SEH_begin_bn_mulx4x_mont wrt ..imagebase
  1185. DD $L$SEH_end_bn_mulx4x_mont wrt ..imagebase
  1186. DD $L$SEH_info_bn_mulx4x_mont wrt ..imagebase
  1187. section .xdata rdata align=8
  1188. ALIGN 8
  1189. $L$SEH_info_GFp_bn_mul_mont:
  1190. DB 9,0,0,0 ; header byte 9: presumably version 1 + "has exception handler" flag; no unwind codes -- confirm
  1191. DD mul_handler wrt ..imagebase
  1192. DD $L$mul_body wrt ..imagebase,$L$mul_epilogue wrt ..imagebase ; handler data: body, epilogue
  1193. $L$SEH_info_bn_mul4x_mont:
  1194. DB 9,0,0,0
  1195. DD mul_handler wrt ..imagebase
  1196. DD $L$mul4x_body wrt ..imagebase,$L$mul4x_epilogue wrt ..imagebase ; handler data: body, epilogue
  1197. $L$SEH_info_bn_sqr8x_mont:
  1198. DB 9,0,0,0
  1199. DD sqr_handler wrt ..imagebase
  1200. DD $L$sqr8x_prologue wrt ..imagebase,$L$sqr8x_body wrt ..imagebase,$L$sqr8x_epilogue wrt ..imagebase ; handler data: prologue, body, epilogue
  1201. ALIGN 8
  1202. $L$SEH_info_bn_mulx4x_mont:
  1203. DB 9,0,0,0
  1204. DD sqr_handler wrt ..imagebase
  1205. DD $L$mulx4x_prologue wrt ..imagebase,$L$mulx4x_body wrt ..imagebase,$L$mulx4x_epilogue wrt ..imagebase ; handler data: prologue, body, epilogue
  1206. ALIGN 8