chacha20_poly1305_x86_64-nasm.asm 183 KB


  1. ; This file is generated from a similarly-named Perl script in the BoringSSL
  2. ; source tree. Do not edit by hand.
  3. default rel
  4. %define XMMWORD
  5. %define YMMWORD
  6. %define ZMMWORD
  7. section .text code align=64
  8. EXTERN GFp_ia32cap_P
  ; ---------------------------------------------------------------------
  ; Read-only constant tables shared by the ChaCha20-Poly1305 routines in
  ; this file. They live in the code section and are addressed RIP-relative
  ; (the file sets `default rel`). Several tables are laid out back to back
  ; so that a 32-byte read starting at one label also covers the next 16
  ; bytes; do not reorder or insert padding between them.
  ; ---------------------------------------------------------------------
  9. chacha20_poly1305_constants:
  10. ALIGN 64
  ; ASCII "expand 32-byte k", stored twice (32 bytes). The open routine
  ; loads the first 16 bytes as the first row of the ChaCha20 state and adds
  ; them back after the rounds. The second copy is presumably for 256-bit
  ; loads in code outside this view.
  11. $L$chacha20_consts:
  12. DB 'e','x','p','a','n','d',' ','3','2','-','b','y','t','e',' ','k'
  13. DB 'e','x','p','a','n','d',' ','3','2','-','b','y','t','e',' ','k'
  ; pshufb control mask: within each 32-bit lane, destination bytes 0..3
  ; take source bytes 3,0,1,2, i.e. every dword is rotated left by 8 bits.
  ; Stored twice (32 bytes).
  14. $L$rol8:
  15. DB 3,0,1,2,7,4,5,6,11,8,9,10,15,12,13,14
  16. DB 3,0,1,2,7,4,5,6,11,8,9,10,15,12,13,14
  ; pshufb control mask: within each 32-bit lane, destination bytes 0..3
  ; take source bytes 2,3,0,1, i.e. every dword is rotated by 16 bits.
  ; Stored twice (32 bytes).
  17. $L$rol16:
  18. DB 2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13
  19. DB 2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13
  ; Four zero dwords. Not referenced in the visible part of the file; because
  ; $L$sse_inc follows immediately, a 32-byte read from here yields
  ; {0,0,0,0, 1,0,0,0} -- presumably the initial per-lane counter offsets for
  ; the AVX2 path (TODO confirm against the AVX2 code).
  20. $L$avx2_init:
  21. DD 0,0,0,0
  ; Vector {1,0,0,0}: paddd with this adds 1 to the lowest dword only. The
  ; open routine applies it to the fourth state row (kept at 160+96+rbp) to
  ; step it once per 64-byte block.
  22. $L$sse_inc:
  23. DD 1,0,0,0
  ; Vector {2,0,0,0, 2,0,0,0}: adds 2 to the lowest dword of each 128-bit
  ; half. Not referenced in the visible part of the file; presumably the
  ; AVX2 counterpart of $L$sse_inc (two blocks per register).
  24. $L$avx2_inc:
  25. DD 2,0,0,0,2,0,0,0
  ; First 16 bytes: the mask 0x0ffffffc0ffffffc0ffffffc0fffffff (little
  ; endian), ANDed by the open routine onto the first 16 key-stream bytes
  ; before they are stored at 160+rbp as the Poly1305 multiplier "r".
  ; Second 16 bytes: all ones, which leave the following 16 bytes unchanged
  ; when the table is applied as a 32-byte mask (that use is not visible
  ; here).
  26. $L$clamp:
  27. DQ 0x0FFFFFFC0FFFFFFF,0x0FFFFFFC0FFFFFFC
  28. DQ 0xFFFFFFFFFFFFFFFF,0xFFFFFFFFFFFFFFFF
  29. ALIGN 16
  ; Sixteen 16-byte masks; entry k (k = 1..16, at offset 16*(k-1)) has its
  ; first k bytes set to 0xff and the rest zero. Not referenced in the
  ; visible part of the file; presumably used to keep only the first k bytes
  ; of a partially filled 16-byte block (TODO confirm against the users).
  30. $L$and_masks:
  31. DB 0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
  32. DB 0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
  33. DB 0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
  34. DB 0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
  35. DB 0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
  36. DB 0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
  37. DB 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
  38. DB 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
  39. DB 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00
  40. DB 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00
  41. DB 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00
  42. DB 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00
  43. DB 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00
  44. DB 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00
  45. DB 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00
  46. DB 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
  ; ---------------------------------------------------------------------
  ; poly_hash_ad_internal
  ;
  ; Internal helper (not exported): absorbs r8 bytes starting at rcx into a
  ; fresh Poly1305 accumulator h, held in three limbs r12:r11:r10.
  ;
  ; For every 16-byte block m it computes
  ;     h = (h + m + 2^128) * r   (partially reduced modulo 2^130 - 5)
  ; where r is the 128-bit multiplier stored as two qwords at 160+rbp and
  ; 168+rbp. A trailing block of 1..15 bytes is zero-extended to 16 bytes
  ; and still gets the 2^128 bit added.
  ;
  ; In:    rcx = pointer to the data to absorb
  ;        r8  = number of bytes at rcx
  ;        rbp = caller's frame base; [160+rbp] and [168+rbp] hold r
  ;              (the open routine in this file stores the clamped value
  ;              there immediately before calling)
  ; Out:   r10 = h bits 0..63, r11 = h bits 64..127, r12 = remaining high
  ;        bits of h (small after the reduction)
  ; Clobb: rax, rdx, r9, r13, r14, r15, flags; rcx and r8 are advanced /
  ;        consumed on the generic path.
  ; Uses no stack of its own besides the return address.
  ; ---------------------------------------------------------------------
  47. ALIGN 64
  48. poly_hash_ad_internal:
  49. xor r10,r10 ; h = 0 (limb 0)
  50. xor r11,r11 ; h = 0 (limb 1)
  51. xor r12,r12 ; h = 0 (limb 2)
  52. cmp r8,13 ; exactly 13 bytes gets a dedicated straight-line path
  53. jne NEAR $L$hash_ad_loop
  ; ---- Fast path for a 13-byte input (label name suggests the TLS record
  ; header case). Builds the single zero-padded block with two overlapping
  ; 8-byte loads that touch bytes 0..12 only, so nothing past the input is
  ; read.
  54. $L$poly_fast_tls_ad:
  55. mov r10,QWORD[rcx] ; h0 = bytes 0..7
  56. mov r11,QWORD[5+rcx] ; bytes 5..12
  57. shr r11,24 ; drop bytes 5..7 -> h1 = bytes 8..12, upper 24 bits zero
  58. mov r12,1 ; h2 = 1: the 2^128 padding bit (h was zero, so h = block + 2^128)
  ; ---- h = h * r, schoolbook 3-limb x 2-limb multiply.
  ; Product limbs are collected in r13 (bits 0..63), r14 (64..127),
  ; r15 (128..191) and r9 (192..).
  59. mov rax,QWORD[((0+160+0))+rbp] ; rax = r0 (low qword of r)
  60. mov r15,rax ; keep r0 for the r0*h2 term
  61. mul r10 ; rdx:rax = r0 * h0
  62. mov r13,rax ; product limb 0
  63. mov r14,rdx ; start of product limb 1
  64. mov rax,QWORD[((0+160+0))+rbp] ; reload r0 (mul overwrote rax)
  65. mul r11 ; rdx:rax = r0 * h1
  66. imul r15,r12 ; r15 = r0 * h2 (h2 is tiny, 64-bit result suffices)
  67. add r14,rax ; limb 1 += lo(r0*h1)
  68. adc r15,rdx ; limb 2 += hi(r0*h1) + carry
  69. mov rax,QWORD[((8+160+0))+rbp] ; rax = r1 (high qword of r)
  70. mov r9,rax ; keep r1 for the r1*h2 term
  71. mul r10 ; rdx:rax = r1 * h0
  72. add r14,rax ; limb 1 += lo(r1*h0)
  73. adc rdx,0 ; fold the carry into hi(r1*h0)
  74. mov r10,rdx ; h0 is dead now; park hi(r1*h0) in r10
  75. mov rax,QWORD[((8+160+0))+rbp] ; reload r1
  76. mul r11 ; rdx:rax = r1 * h1
  77. add r15,rax ; limb 2 += lo(r1*h1)
  78. adc rdx,0 ; fold the carry into hi(r1*h1)
  79. imul r9,r12 ; r9 = r1 * h2
  80. add r15,r10 ; limb 2 += hi(r1*h0)
  81. adc r9,rdx ; limb 3 += hi(r1*h1) + carry
  ; ---- Partial reduction modulo 2^130 - 5. With c = product >> 130, the
  ; bits at 2^130 and above are folded back as 5*c = 4*c + c, using
  ; 2^130 == 5 (mod 2^130 - 5).
  82. mov r10,r13 ; h0 = product limb 0
  83. mov r11,r14 ; h1 = product limb 1
  84. mov r12,r15 ; h2 = product limb 2 ...
  85. and r12,3 ; ... keeping only bits 128..129
  86. mov r13,r15
  87. and r13,-4 ; r14:r13 = 4*c (limb 2 with its low two bits cleared, limb 3)
  88. mov r14,r9
  89. shrd r15,r9,2 ; r9:r15 = c (limbs 3:2 shifted right by two)
  90. shr r9,2
  91. add r15,r13 ; r9:r15 = c + 4*c = 5*c
  92. adc r9,r14
  93. add r10,r15 ; h += 5*c
  94. adc r11,r9
  95. adc r12,0
  96. DB 0F3h,0C3h ;repret -- hand-encoded two-byte return (F3 = rep prefix, C3 = ret)
  ; ---- Generic path: absorb whole 16-byte blocks while at least 16 bytes
  ; remain. rcx walks forward through the input, r8 counts the bytes left.
  97. $L$hash_ad_loop:
  98. cmp r8,16
  99. jb NEAR $L$hash_ad_tail ; fewer than 16 bytes left (unsigned compare)
  100. add r10,QWORD[((0+0))+rcx] ; h += block (low qword)
  101. adc r11,QWORD[((8+0))+rcx] ; h += block (high qword) + carry
  102. adc r12,1 ; + 2^128 padding bit + carry
  ; h = h * r: same multiply sequence as in the 13-byte path above.
  103. mov rax,QWORD[((0+160+0))+rbp] ; r0
  104. mov r15,rax
  105. mul r10 ; r0 * h0
  106. mov r13,rax
  107. mov r14,rdx
  108. mov rax,QWORD[((0+160+0))+rbp]
  109. mul r11 ; r0 * h1
  110. imul r15,r12 ; r0 * h2
  111. add r14,rax
  112. adc r15,rdx
  113. mov rax,QWORD[((8+160+0))+rbp] ; r1
  114. mov r9,rax
  115. mul r10 ; r1 * h0
  116. add r14,rax
  117. adc rdx,0
  118. mov r10,rdx
  119. mov rax,QWORD[((8+160+0))+rbp]
  120. mul r11 ; r1 * h1
  121. add r15,rax
  122. adc rdx,0
  123. imul r9,r12 ; r1 * h2
  124. add r15,r10
  125. adc r9,rdx
  ; Partial reduction modulo 2^130 - 5 (fold 5 * (product >> 130) back in).
  126. mov r10,r13
  127. mov r11,r14
  128. mov r12,r15
  129. and r12,3
  130. mov r13,r15
  131. and r13,-4
  132. mov r14,r9
  133. shrd r15,r9,2
  134. shr r9,2
  135. add r15,r13
  136. adc r9,r14
  137. add r10,r15
  138. adc r11,r9
  139. adc r12,0
  140. lea rcx,[16+rcx] ; advance the input pointer (lea leaves flags alone)
  141. sub r8,16 ; 16 fewer bytes remaining
  142. jmp NEAR $L$hash_ad_loop
  ; ---- Tail: 0..15 bytes remain.
  143. $L$hash_ad_tail:
  144. cmp r8,0
  145. je NEAR $L$hash_ad_done ; nothing left: return h as is
  ; Assemble the remaining bytes into r14:r13 as a little-endian value,
  ; reading one byte at a time from the last byte backwards so that no byte
  ; outside the input is touched. Unused high bytes stay zero.
  146. xor r13,r13 ; block low qword = 0
  147. xor r14,r14 ; block high qword = 0
  148. xor r15,r15 ; scratch for the loaded byte
  149. add rcx,r8 ; rcx = one past the last input byte
  150. $L$hash_ad_tail_loop:
  151. shld r14,r13,8 ; shift the 128-bit value r14:r13 left by one byte ...
  152. shl r13,8
  153. movzx r15,BYTE[((-1))+rcx] ; ... and bring in the next byte, walking backwards
  154. xor r13,r15 ; low byte of r13 is zero after the shift, so xor inserts it
  155. dec rcx
  156. dec r8
  157. jne NEAR $L$hash_ad_tail_loop ; loop until all remaining bytes are consumed
  158. add r10,r13 ; h += zero-padded tail block
  159. adc r11,r14
  160. adc r12,1 ; + 2^128 padding bit + carry
  ; h = h * r: same multiply sequence as in the 13-byte path above.
  161. mov rax,QWORD[((0+160+0))+rbp] ; r0
  162. mov r15,rax
  163. mul r10 ; r0 * h0
  164. mov r13,rax
  165. mov r14,rdx
  166. mov rax,QWORD[((0+160+0))+rbp]
  167. mul r11 ; r0 * h1
  168. imul r15,r12 ; r0 * h2
  169. add r14,rax
  170. adc r15,rdx
  171. mov rax,QWORD[((8+160+0))+rbp] ; r1
  172. mov r9,rax
  173. mul r10 ; r1 * h0
  174. add r14,rax
  175. adc rdx,0
  176. mov r10,rdx
  177. mov rax,QWORD[((8+160+0))+rbp]
  178. mul r11 ; r1 * h1
  179. add r15,rax
  180. adc rdx,0
  181. imul r9,r12 ; r1 * h2
  182. add r15,r10
  183. adc r9,rdx
  ; Partial reduction modulo 2^130 - 5 (fold 5 * (product >> 130) back in).
  184. mov r10,r13
  185. mov r11,r14
  186. mov r12,r15
  187. and r12,3
  188. mov r13,r15
  189. and r13,-4
  190. mov r14,r9
  191. shrd r15,r9,2
  192. shr r9,2
  193. add r15,r13
  194. adc r9,r14
  195. add r10,r15
  196. adc r11,r9
  197. adc r12,0
  ; Common exit for the generic path; result is in r12:r11:r10.
  198. $L$hash_ad_done:
  199. DB 0F3h,0C3h ;repret -- hand-encoded two-byte return (F3 = rep prefix, C3 = ret)
  200. global GFp_chacha20_poly1305_open
  201. ALIGN 64
  202. GFp_chacha20_poly1305_open:
  203. mov QWORD[8+rsp],rdi ;WIN64 prologue
  204. mov QWORD[16+rsp],rsi
  205. mov rax,rsp
  206. $L$SEH_begin_GFp_chacha20_poly1305_open:
  207. mov rdi,rcx
  208. mov rsi,rdx
  209. mov rdx,r8
  210. mov rcx,r9
  211. mov r8,QWORD[40+rsp]
  212. mov r9,QWORD[48+rsp]
  213. push rbp
  214. push rbx
  215. push r12
  216. push r13
  217. push r14
  218. push r15
  219. push r9
  220. sub rsp,288 + 160 + 32
  221. lea rbp,[32+rsp]
  222. and rbp,-32
  223. movaps XMMWORD[(0+0)+rbp],xmm6
  224. movaps XMMWORD[(16+0)+rbp],xmm7
  225. movaps XMMWORD[(32+0)+rbp],xmm8
  226. movaps XMMWORD[(48+0)+rbp],xmm9
  227. movaps XMMWORD[(64+0)+rbp],xmm10
  228. movaps XMMWORD[(80+0)+rbp],xmm11
  229. movaps XMMWORD[(96+0)+rbp],xmm12
  230. movaps XMMWORD[(112+0)+rbp],xmm13
  231. movaps XMMWORD[(128+0)+rbp],xmm14
  232. movaps XMMWORD[(144+0)+rbp],xmm15
  233. mov rbx,rdx
  234. mov QWORD[((0+160+32))+rbp],r8
  235. mov QWORD[((8+160+32))+rbp],rbx
  236. mov eax,DWORD[((GFp_ia32cap_P+8))]
  237. and eax,288
  238. xor eax,288
  239. jz NEAR chacha20_poly1305_open_avx2
  240. cmp rbx,128
  241. jbe NEAR $L$open_sse_128
  242. movdqa xmm0,XMMWORD[$L$chacha20_consts]
  243. movdqu xmm4,XMMWORD[r9]
  244. movdqu xmm8,XMMWORD[16+r9]
  245. movdqu xmm12,XMMWORD[32+r9]
  246. movdqa xmm7,xmm12
  247. movdqa XMMWORD[(160+48)+rbp],xmm4
  248. movdqa XMMWORD[(160+64)+rbp],xmm8
  249. movdqa XMMWORD[(160+96)+rbp],xmm12
  250. mov r10,10
  251. $L$open_sse_init_rounds:
  252. paddd xmm0,xmm4
  253. pxor xmm12,xmm0
  254. pshufb xmm12,XMMWORD[$L$rol16]
  255. paddd xmm8,xmm12
  256. pxor xmm4,xmm8
  257. movdqa xmm3,xmm4
  258. pslld xmm3,12
  259. psrld xmm4,20
  260. pxor xmm4,xmm3
  261. paddd xmm0,xmm4
  262. pxor xmm12,xmm0
  263. pshufb xmm12,XMMWORD[$L$rol8]
  264. paddd xmm8,xmm12
  265. pxor xmm4,xmm8
  266. movdqa xmm3,xmm4
  267. pslld xmm3,7
  268. psrld xmm4,25
  269. pxor xmm4,xmm3
  270. DB 102,15,58,15,228,4
  271. DB 102,69,15,58,15,192,8
  272. DB 102,69,15,58,15,228,12
  273. paddd xmm0,xmm4
  274. pxor xmm12,xmm0
  275. pshufb xmm12,XMMWORD[$L$rol16]
  276. paddd xmm8,xmm12
  277. pxor xmm4,xmm8
  278. movdqa xmm3,xmm4
  279. pslld xmm3,12
  280. psrld xmm4,20
  281. pxor xmm4,xmm3
  282. paddd xmm0,xmm4
  283. pxor xmm12,xmm0
  284. pshufb xmm12,XMMWORD[$L$rol8]
  285. paddd xmm8,xmm12
  286. pxor xmm4,xmm8
  287. movdqa xmm3,xmm4
  288. pslld xmm3,7
  289. psrld xmm4,25
  290. pxor xmm4,xmm3
  291. DB 102,15,58,15,228,12
  292. DB 102,69,15,58,15,192,8
  293. DB 102,69,15,58,15,228,4
  294. dec r10
  295. jne NEAR $L$open_sse_init_rounds
  296. paddd xmm0,XMMWORD[$L$chacha20_consts]
  297. paddd xmm4,XMMWORD[((160+48))+rbp]
  298. pand xmm0,XMMWORD[$L$clamp]
  299. movdqa XMMWORD[(160+0)+rbp],xmm0
  300. movdqa XMMWORD[(160+16)+rbp],xmm4
  301. mov r8,r8
  302. call poly_hash_ad_internal
  303. $L$open_sse_main_loop:
  304. cmp rbx,16*16
  305. jb NEAR $L$open_sse_tail
  306. movdqa xmm0,XMMWORD[$L$chacha20_consts]
  307. movdqa xmm4,XMMWORD[((160+48))+rbp]
  308. movdqa xmm8,XMMWORD[((160+64))+rbp]
  309. movdqa xmm1,xmm0
  310. movdqa xmm5,xmm4
  311. movdqa xmm9,xmm8
  312. movdqa xmm2,xmm0
  313. movdqa xmm6,xmm4
  314. movdqa xmm10,xmm8
  315. movdqa xmm3,xmm0
  316. movdqa xmm7,xmm4
  317. movdqa xmm11,xmm8
  318. movdqa xmm15,XMMWORD[((160+96))+rbp]
  319. paddd xmm15,XMMWORD[$L$sse_inc]
  320. movdqa xmm14,xmm15
  321. paddd xmm14,XMMWORD[$L$sse_inc]
  322. movdqa xmm13,xmm14
  323. paddd xmm13,XMMWORD[$L$sse_inc]
  324. movdqa xmm12,xmm13
  325. paddd xmm12,XMMWORD[$L$sse_inc]
  326. movdqa XMMWORD[(160+96)+rbp],xmm12
  327. movdqa XMMWORD[(160+112)+rbp],xmm13
  328. movdqa XMMWORD[(160+128)+rbp],xmm14
  329. movdqa XMMWORD[(160+144)+rbp],xmm15
  330. mov rcx,4
  331. mov r8,rsi
  332. $L$open_sse_main_loop_rounds:
  333. movdqa XMMWORD[(160+80)+rbp],xmm8
  334. movdqa xmm8,XMMWORD[$L$rol16]
  335. paddd xmm3,xmm7
  336. paddd xmm2,xmm6
  337. paddd xmm1,xmm5
  338. paddd xmm0,xmm4
  339. pxor xmm15,xmm3
  340. pxor xmm14,xmm2
  341. pxor xmm13,xmm1
  342. pxor xmm12,xmm0
  343. DB 102,69,15,56,0,248
  344. DB 102,69,15,56,0,240
  345. DB 102,69,15,56,0,232
  346. DB 102,69,15,56,0,224
  347. movdqa xmm8,XMMWORD[((160+80))+rbp]
  348. paddd xmm11,xmm15
  349. paddd xmm10,xmm14
  350. paddd xmm9,xmm13
  351. paddd xmm8,xmm12
  352. pxor xmm7,xmm11
  353. add r10,QWORD[((0+0))+r8]
  354. adc r11,QWORD[((8+0))+r8]
  355. adc r12,1
  356. lea r8,[16+r8]
  357. pxor xmm6,xmm10
  358. pxor xmm5,xmm9
  359. pxor xmm4,xmm8
  360. movdqa XMMWORD[(160+80)+rbp],xmm8
  361. movdqa xmm8,xmm7
  362. psrld xmm8,20
  363. pslld xmm7,32-20
  364. pxor xmm7,xmm8
  365. movdqa xmm8,xmm6
  366. psrld xmm8,20
  367. pslld xmm6,32-20
  368. pxor xmm6,xmm8
  369. movdqa xmm8,xmm5
  370. psrld xmm8,20
  371. pslld xmm5,32-20
  372. pxor xmm5,xmm8
  373. movdqa xmm8,xmm4
  374. psrld xmm8,20
  375. pslld xmm4,32-20
  376. pxor xmm4,xmm8
  377. mov rax,QWORD[((0+160+0))+rbp]
  378. mov r15,rax
  379. mul r10
  380. mov r13,rax
  381. mov r14,rdx
  382. mov rax,QWORD[((0+160+0))+rbp]
  383. mul r11
  384. imul r15,r12
  385. add r14,rax
  386. adc r15,rdx
  387. movdqa xmm8,XMMWORD[$L$rol8]
  388. paddd xmm3,xmm7
  389. paddd xmm2,xmm6
  390. paddd xmm1,xmm5
  391. paddd xmm0,xmm4
  392. pxor xmm15,xmm3
  393. pxor xmm14,xmm2
  394. pxor xmm13,xmm1
  395. pxor xmm12,xmm0
  396. DB 102,69,15,56,0,248
  397. DB 102,69,15,56,0,240
  398. DB 102,69,15,56,0,232
  399. DB 102,69,15,56,0,224
  400. movdqa xmm8,XMMWORD[((160+80))+rbp]
  401. paddd xmm11,xmm15
  402. paddd xmm10,xmm14
  403. paddd xmm9,xmm13
  404. paddd xmm8,xmm12
  405. pxor xmm7,xmm11
  406. pxor xmm6,xmm10
  407. mov rax,QWORD[((8+160+0))+rbp]
  408. mov r9,rax
  409. mul r10
  410. add r14,rax
  411. adc rdx,0
  412. mov r10,rdx
  413. mov rax,QWORD[((8+160+0))+rbp]
  414. mul r11
  415. add r15,rax
  416. adc rdx,0
  417. pxor xmm5,xmm9
  418. pxor xmm4,xmm8
  419. movdqa XMMWORD[(160+80)+rbp],xmm8
  420. movdqa xmm8,xmm7
  421. psrld xmm8,25
  422. pslld xmm7,32-25
  423. pxor xmm7,xmm8
  424. movdqa xmm8,xmm6
  425. psrld xmm8,25
  426. pslld xmm6,32-25
  427. pxor xmm6,xmm8
  428. movdqa xmm8,xmm5
  429. psrld xmm8,25
  430. pslld xmm5,32-25
  431. pxor xmm5,xmm8
  432. movdqa xmm8,xmm4
  433. psrld xmm8,25
  434. pslld xmm4,32-25
  435. pxor xmm4,xmm8
  436. movdqa xmm8,XMMWORD[((160+80))+rbp]
  437. imul r9,r12
  438. add r15,r10
  439. adc r9,rdx
  440. DB 102,15,58,15,255,4
  441. DB 102,69,15,58,15,219,8
  442. DB 102,69,15,58,15,255,12
  443. DB 102,15,58,15,246,4
  444. DB 102,69,15,58,15,210,8
  445. DB 102,69,15,58,15,246,12
  446. DB 102,15,58,15,237,4
  447. DB 102,69,15,58,15,201,8
  448. DB 102,69,15,58,15,237,12
  449. DB 102,15,58,15,228,4
  450. DB 102,69,15,58,15,192,8
  451. DB 102,69,15,58,15,228,12
  452. movdqa XMMWORD[(160+80)+rbp],xmm8
  453. movdqa xmm8,XMMWORD[$L$rol16]
  454. paddd xmm3,xmm7
  455. paddd xmm2,xmm6
  456. paddd xmm1,xmm5
  457. paddd xmm0,xmm4
  458. pxor xmm15,xmm3
  459. pxor xmm14,xmm2
  460. mov r10,r13
  461. mov r11,r14
  462. mov r12,r15
  463. and r12,3
  464. mov r13,r15
  465. and r13,-4
  466. mov r14,r9
  467. shrd r15,r9,2
  468. shr r9,2
  469. add r15,r13
  470. adc r9,r14
  471. add r10,r15
  472. adc r11,r9
  473. adc r12,0
  474. pxor xmm13,xmm1
  475. pxor xmm12,xmm0
  476. DB 102,69,15,56,0,248
  477. DB 102,69,15,56,0,240
  478. DB 102,69,15,56,0,232
  479. DB 102,69,15,56,0,224
  480. movdqa xmm8,XMMWORD[((160+80))+rbp]
  481. paddd xmm11,xmm15
  482. paddd xmm10,xmm14
  483. paddd xmm9,xmm13
  484. paddd xmm8,xmm12
  485. pxor xmm7,xmm11
  486. pxor xmm6,xmm10
  487. pxor xmm5,xmm9
  488. pxor xmm4,xmm8
  489. movdqa XMMWORD[(160+80)+rbp],xmm8
  490. movdqa xmm8,xmm7
  491. psrld xmm8,20
  492. pslld xmm7,32-20
  493. pxor xmm7,xmm8
  494. movdqa xmm8,xmm6
  495. psrld xmm8,20
  496. pslld xmm6,32-20
  497. pxor xmm6,xmm8
  498. movdqa xmm8,xmm5
  499. psrld xmm8,20
  500. pslld xmm5,32-20
  501. pxor xmm5,xmm8
  502. movdqa xmm8,xmm4
  503. psrld xmm8,20
  504. pslld xmm4,32-20
  505. pxor xmm4,xmm8
  506. movdqa xmm8,XMMWORD[$L$rol8]
  507. paddd xmm3,xmm7
  508. paddd xmm2,xmm6
  509. paddd xmm1,xmm5
  510. paddd xmm0,xmm4
  511. pxor xmm15,xmm3
  512. pxor xmm14,xmm2
  513. pxor xmm13,xmm1
  514. pxor xmm12,xmm0
  515. DB 102,69,15,56,0,248
  516. DB 102,69,15,56,0,240
  517. DB 102,69,15,56,0,232
  518. DB 102,69,15,56,0,224
  519. movdqa xmm8,XMMWORD[((160+80))+rbp]
  520. paddd xmm11,xmm15
  521. paddd xmm10,xmm14
  522. paddd xmm9,xmm13
  523. paddd xmm8,xmm12
  524. pxor xmm7,xmm11
  525. pxor xmm6,xmm10
  526. pxor xmm5,xmm9
  527. pxor xmm4,xmm8
  528. movdqa XMMWORD[(160+80)+rbp],xmm8
  529. movdqa xmm8,xmm7
  530. psrld xmm8,25
  531. pslld xmm7,32-25
  532. pxor xmm7,xmm8
  533. movdqa xmm8,xmm6
  534. psrld xmm8,25
  535. pslld xmm6,32-25
  536. pxor xmm6,xmm8
  537. movdqa xmm8,xmm5
  538. psrld xmm8,25
  539. pslld xmm5,32-25
  540. pxor xmm5,xmm8
  541. movdqa xmm8,xmm4
  542. psrld xmm8,25
  543. pslld xmm4,32-25
  544. pxor xmm4,xmm8
  545. movdqa xmm8,XMMWORD[((160+80))+rbp]
  546. DB 102,15,58,15,255,12
  547. DB 102,69,15,58,15,219,8
  548. DB 102,69,15,58,15,255,4
  549. DB 102,15,58,15,246,12
  550. DB 102,69,15,58,15,210,8
  551. DB 102,69,15,58,15,246,4
  552. DB 102,15,58,15,237,12
  553. DB 102,69,15,58,15,201,8
  554. DB 102,69,15,58,15,237,4
  555. DB 102,15,58,15,228,12
  556. DB 102,69,15,58,15,192,8
  557. DB 102,69,15,58,15,228,4
  558. dec rcx
  559. jge NEAR $L$open_sse_main_loop_rounds
  560. add r10,QWORD[((0+0))+r8]
  561. adc r11,QWORD[((8+0))+r8]
  562. adc r12,1
  563. mov rax,QWORD[((0+160+0))+rbp]
  564. mov r15,rax
  565. mul r10
  566. mov r13,rax
  567. mov r14,rdx
  568. mov rax,QWORD[((0+160+0))+rbp]
  569. mul r11
  570. imul r15,r12
  571. add r14,rax
  572. adc r15,rdx
  573. mov rax,QWORD[((8+160+0))+rbp]
  574. mov r9,rax
  575. mul r10
  576. add r14,rax
  577. adc rdx,0
  578. mov r10,rdx
  579. mov rax,QWORD[((8+160+0))+rbp]
  580. mul r11
  581. add r15,rax
  582. adc rdx,0
  583. imul r9,r12
  584. add r15,r10
  585. adc r9,rdx
  586. mov r10,r13
  587. mov r11,r14
  588. mov r12,r15
  589. and r12,3
  590. mov r13,r15
  591. and r13,-4
  592. mov r14,r9
  593. shrd r15,r9,2
  594. shr r9,2
  595. add r15,r13
  596. adc r9,r14
  597. add r10,r15
  598. adc r11,r9
  599. adc r12,0
  600. lea r8,[16+r8]
  601. cmp rcx,-6
  602. jg NEAR $L$open_sse_main_loop_rounds
  603. paddd xmm3,XMMWORD[$L$chacha20_consts]
  604. paddd xmm7,XMMWORD[((160+48))+rbp]
  605. paddd xmm11,XMMWORD[((160+64))+rbp]
  606. paddd xmm15,XMMWORD[((160+144))+rbp]
  607. paddd xmm2,XMMWORD[$L$chacha20_consts]
  608. paddd xmm6,XMMWORD[((160+48))+rbp]
  609. paddd xmm10,XMMWORD[((160+64))+rbp]
  610. paddd xmm14,XMMWORD[((160+128))+rbp]
  611. paddd xmm1,XMMWORD[$L$chacha20_consts]
  612. paddd xmm5,XMMWORD[((160+48))+rbp]
  613. paddd xmm9,XMMWORD[((160+64))+rbp]
  614. paddd xmm13,XMMWORD[((160+112))+rbp]
  615. paddd xmm0,XMMWORD[$L$chacha20_consts]
  616. paddd xmm4,XMMWORD[((160+48))+rbp]
  617. paddd xmm8,XMMWORD[((160+64))+rbp]
  618. paddd xmm12,XMMWORD[((160+96))+rbp]
  619. movdqa XMMWORD[(160+80)+rbp],xmm12
  620. movdqu xmm12,XMMWORD[((0 + 0))+rsi]
  621. pxor xmm12,xmm3
  622. movdqu XMMWORD[(0 + 0)+rdi],xmm12
  623. movdqu xmm12,XMMWORD[((16 + 0))+rsi]
  624. pxor xmm12,xmm7
  625. movdqu XMMWORD[(16 + 0)+rdi],xmm12
  626. movdqu xmm12,XMMWORD[((32 + 0))+rsi]
  627. pxor xmm12,xmm11
  628. movdqu XMMWORD[(32 + 0)+rdi],xmm12
  629. movdqu xmm12,XMMWORD[((48 + 0))+rsi]
  630. pxor xmm12,xmm15
  631. movdqu XMMWORD[(48 + 0)+rdi],xmm12
  632. movdqu xmm3,XMMWORD[((0 + 64))+rsi]
  633. movdqu xmm7,XMMWORD[((16 + 64))+rsi]
  634. movdqu xmm11,XMMWORD[((32 + 64))+rsi]
  635. movdqu xmm15,XMMWORD[((48 + 64))+rsi]
  636. pxor xmm2,xmm3
  637. pxor xmm6,xmm7
  638. pxor xmm10,xmm11
  639. pxor xmm15,xmm14
  640. movdqu XMMWORD[(0 + 64)+rdi],xmm2
  641. movdqu XMMWORD[(16 + 64)+rdi],xmm6
  642. movdqu XMMWORD[(32 + 64)+rdi],xmm10
  643. movdqu XMMWORD[(48 + 64)+rdi],xmm15
  644. movdqu xmm3,XMMWORD[((0 + 128))+rsi]
  645. movdqu xmm7,XMMWORD[((16 + 128))+rsi]
  646. movdqu xmm11,XMMWORD[((32 + 128))+rsi]
  647. movdqu xmm15,XMMWORD[((48 + 128))+rsi]
  648. pxor xmm1,xmm3
  649. pxor xmm5,xmm7
  650. pxor xmm9,xmm11
  651. pxor xmm15,xmm13
  652. movdqu XMMWORD[(0 + 128)+rdi],xmm1
  653. movdqu XMMWORD[(16 + 128)+rdi],xmm5
  654. movdqu XMMWORD[(32 + 128)+rdi],xmm9
  655. movdqu XMMWORD[(48 + 128)+rdi],xmm15
  656. movdqu xmm3,XMMWORD[((0 + 192))+rsi]
  657. movdqu xmm7,XMMWORD[((16 + 192))+rsi]
  658. movdqu xmm11,XMMWORD[((32 + 192))+rsi]
  659. movdqu xmm15,XMMWORD[((48 + 192))+rsi]
  660. pxor xmm0,xmm3
  661. pxor xmm4,xmm7
  662. pxor xmm8,xmm11
  663. pxor xmm15,XMMWORD[((160+80))+rbp]
  664. movdqu XMMWORD[(0 + 192)+rdi],xmm0
  665. movdqu XMMWORD[(16 + 192)+rdi],xmm4
  666. movdqu XMMWORD[(32 + 192)+rdi],xmm8
  667. movdqu XMMWORD[(48 + 192)+rdi],xmm15
  668. lea rsi,[256+rsi]
  669. lea rdi,[256+rdi]
  670. sub rbx,16*16
  671. jmp NEAR $L$open_sse_main_loop
  672. $L$open_sse_tail:
  673. test rbx,rbx
  674. jz NEAR $L$open_sse_finalize
  675. cmp rbx,12*16
  676. ja NEAR $L$open_sse_tail_256
  677. cmp rbx,8*16
  678. ja NEAR $L$open_sse_tail_192
  679. cmp rbx,4*16
  680. ja NEAR $L$open_sse_tail_128
  681. movdqa xmm0,XMMWORD[$L$chacha20_consts]
  682. movdqa xmm4,XMMWORD[((160+48))+rbp]
  683. movdqa xmm8,XMMWORD[((160+64))+rbp]
  684. movdqa xmm12,XMMWORD[((160+96))+rbp]
  685. paddd xmm12,XMMWORD[$L$sse_inc]
  686. movdqa XMMWORD[(160+96)+rbp],xmm12
  687. xor r8,r8
  688. mov rcx,rbx
  689. cmp rcx,16
  690. jb NEAR $L$open_sse_tail_64_rounds
  691. $L$open_sse_tail_64_rounds_and_x1hash:
  692. add r10,QWORD[((0+0))+r8*1+rsi]
  693. adc r11,QWORD[((8+0))+r8*1+rsi]
  694. adc r12,1
  695. mov rax,QWORD[((0+160+0))+rbp]
  696. mov r15,rax
  697. mul r10
  698. mov r13,rax
  699. mov r14,rdx
  700. mov rax,QWORD[((0+160+0))+rbp]
  701. mul r11
  702. imul r15,r12
  703. add r14,rax
  704. adc r15,rdx
  705. mov rax,QWORD[((8+160+0))+rbp]
  706. mov r9,rax
  707. mul r10
  708. add r14,rax
  709. adc rdx,0
  710. mov r10,rdx
  711. mov rax,QWORD[((8+160+0))+rbp]
  712. mul r11
  713. add r15,rax
  714. adc rdx,0
  715. imul r9,r12
  716. add r15,r10
  717. adc r9,rdx
  718. mov r10,r13
  719. mov r11,r14
  720. mov r12,r15
  721. and r12,3
  722. mov r13,r15
  723. and r13,-4
  724. mov r14,r9
  725. shrd r15,r9,2
  726. shr r9,2
  727. add r15,r13
  728. adc r9,r14
  729. add r10,r15
  730. adc r11,r9
  731. adc r12,0
  732. sub rcx,16
  733. $L$open_sse_tail_64_rounds:
  734. add r8,16
  735. paddd xmm0,xmm4
  736. pxor xmm12,xmm0
  737. pshufb xmm12,XMMWORD[$L$rol16]
  738. paddd xmm8,xmm12
  739. pxor xmm4,xmm8
  740. movdqa xmm3,xmm4
  741. pslld xmm3,12
  742. psrld xmm4,20
  743. pxor xmm4,xmm3
  744. paddd xmm0,xmm4
  745. pxor xmm12,xmm0
  746. pshufb xmm12,XMMWORD[$L$rol8]
  747. paddd xmm8,xmm12
  748. pxor xmm4,xmm8
  749. movdqa xmm3,xmm4
  750. pslld xmm3,7
  751. psrld xmm4,25
  752. pxor xmm4,xmm3
  753. DB 102,15,58,15,228,4
  754. DB 102,69,15,58,15,192,8
  755. DB 102,69,15,58,15,228,12
  756. paddd xmm0,xmm4
  757. pxor xmm12,xmm0
  758. pshufb xmm12,XMMWORD[$L$rol16]
  759. paddd xmm8,xmm12
  760. pxor xmm4,xmm8
  761. movdqa xmm3,xmm4
  762. pslld xmm3,12
  763. psrld xmm4,20
  764. pxor xmm4,xmm3
  765. paddd xmm0,xmm4
  766. pxor xmm12,xmm0
  767. pshufb xmm12,XMMWORD[$L$rol8]
  768. paddd xmm8,xmm12
  769. pxor xmm4,xmm8
  770. movdqa xmm3,xmm4
  771. pslld xmm3,7
  772. psrld xmm4,25
  773. pxor xmm4,xmm3
  774. DB 102,15,58,15,228,12
  775. DB 102,69,15,58,15,192,8
  776. DB 102,69,15,58,15,228,4
  777. cmp rcx,16
  778. jae NEAR $L$open_sse_tail_64_rounds_and_x1hash
  779. cmp r8,10*16
  780. jne NEAR $L$open_sse_tail_64_rounds
  781. paddd xmm0,XMMWORD[$L$chacha20_consts]
  782. paddd xmm4,XMMWORD[((160+48))+rbp]
  783. paddd xmm8,XMMWORD[((160+64))+rbp]
  784. paddd xmm12,XMMWORD[((160+96))+rbp]
  785. jmp NEAR $L$open_sse_tail_64_dec_loop
  786. $L$open_sse_tail_128:
  787. movdqa xmm0,XMMWORD[$L$chacha20_consts]
  788. movdqa xmm4,XMMWORD[((160+48))+rbp]
  789. movdqa xmm8,XMMWORD[((160+64))+rbp]
  790. movdqa xmm1,xmm0
  791. movdqa xmm5,xmm4
  792. movdqa xmm9,xmm8
  793. movdqa xmm13,XMMWORD[((160+96))+rbp]
  794. paddd xmm13,XMMWORD[$L$sse_inc]
  795. movdqa xmm12,xmm13
  796. paddd xmm12,XMMWORD[$L$sse_inc]
  797. movdqa XMMWORD[(160+96)+rbp],xmm12
  798. movdqa XMMWORD[(160+112)+rbp],xmm13
  799. mov rcx,rbx
  800. and rcx,-16
  801. xor r8,r8
  802. $L$open_sse_tail_128_rounds_and_x1hash:
  803. add r10,QWORD[((0+0))+r8*1+rsi]
  804. adc r11,QWORD[((8+0))+r8*1+rsi]
  805. adc r12,1
  806. mov rax,QWORD[((0+160+0))+rbp]
  807. mov r15,rax
  808. mul r10
  809. mov r13,rax
  810. mov r14,rdx
  811. mov rax,QWORD[((0+160+0))+rbp]
  812. mul r11
  813. imul r15,r12
  814. add r14,rax
  815. adc r15,rdx
  816. mov rax,QWORD[((8+160+0))+rbp]
  817. mov r9,rax
  818. mul r10
  819. add r14,rax
  820. adc rdx,0
  821. mov r10,rdx
  822. mov rax,QWORD[((8+160+0))+rbp]
  823. mul r11
  824. add r15,rax
  825. adc rdx,0
  826. imul r9,r12
  827. add r15,r10
  828. adc r9,rdx
  829. mov r10,r13
  830. mov r11,r14
  831. mov r12,r15
  832. and r12,3
  833. mov r13,r15
  834. and r13,-4
  835. mov r14,r9
  836. shrd r15,r9,2
  837. shr r9,2
  838. add r15,r13
  839. adc r9,r14
  840. add r10,r15
  841. adc r11,r9
  842. adc r12,0
  843. $L$open_sse_tail_128_rounds:
  844. add r8,16
  845. paddd xmm0,xmm4
  846. pxor xmm12,xmm0
  847. pshufb xmm12,XMMWORD[$L$rol16]
  848. paddd xmm8,xmm12
  849. pxor xmm4,xmm8
  850. movdqa xmm3,xmm4
  851. pslld xmm3,12
  852. psrld xmm4,20
  853. pxor xmm4,xmm3
  854. paddd xmm0,xmm4
  855. pxor xmm12,xmm0
  856. pshufb xmm12,XMMWORD[$L$rol8]
  857. paddd xmm8,xmm12
  858. pxor xmm4,xmm8
  859. movdqa xmm3,xmm4
  860. pslld xmm3,7
  861. psrld xmm4,25
  862. pxor xmm4,xmm3
  863. DB 102,15,58,15,228,4
  864. DB 102,69,15,58,15,192,8
  865. DB 102,69,15,58,15,228,12
  866. paddd xmm1,xmm5
  867. pxor xmm13,xmm1
  868. pshufb xmm13,XMMWORD[$L$rol16]
  869. paddd xmm9,xmm13
  870. pxor xmm5,xmm9
  871. movdqa xmm3,xmm5
  872. pslld xmm3,12
  873. psrld xmm5,20
  874. pxor xmm5,xmm3
  875. paddd xmm1,xmm5
  876. pxor xmm13,xmm1
  877. pshufb xmm13,XMMWORD[$L$rol8]
  878. paddd xmm9,xmm13
  879. pxor xmm5,xmm9
  880. movdqa xmm3,xmm5
  881. pslld xmm3,7
  882. psrld xmm5,25
  883. pxor xmm5,xmm3
  884. DB 102,15,58,15,237,4
  885. DB 102,69,15,58,15,201,8
  886. DB 102,69,15,58,15,237,12
  887. paddd xmm0,xmm4
  888. pxor xmm12,xmm0
  889. pshufb xmm12,XMMWORD[$L$rol16]
  890. paddd xmm8,xmm12
  891. pxor xmm4,xmm8
  892. movdqa xmm3,xmm4
  893. pslld xmm3,12
  894. psrld xmm4,20
  895. pxor xmm4,xmm3
  896. paddd xmm0,xmm4
  897. pxor xmm12,xmm0
  898. pshufb xmm12,XMMWORD[$L$rol8]
  899. paddd xmm8,xmm12
  900. pxor xmm4,xmm8
  901. movdqa xmm3,xmm4
  902. pslld xmm3,7
  903. psrld xmm4,25
  904. pxor xmm4,xmm3
  905. DB 102,15,58,15,228,12
  906. DB 102,69,15,58,15,192,8
  907. DB 102,69,15,58,15,228,4
  908. paddd xmm1,xmm5
  909. pxor xmm13,xmm1
  910. pshufb xmm13,XMMWORD[$L$rol16]
  911. paddd xmm9,xmm13
  912. pxor xmm5,xmm9
  913. movdqa xmm3,xmm5
  914. pslld xmm3,12
  915. psrld xmm5,20
  916. pxor xmm5,xmm3
  917. paddd xmm1,xmm5
  918. pxor xmm13,xmm1
  919. pshufb xmm13,XMMWORD[$L$rol8]
  920. paddd xmm9,xmm13
  921. pxor xmm5,xmm9
  922. movdqa xmm3,xmm5
  923. pslld xmm3,7
  924. psrld xmm5,25
  925. pxor xmm5,xmm3
  926. DB 102,15,58,15,237,12
  927. DB 102,69,15,58,15,201,8
  928. DB 102,69,15,58,15,237,4
  929. cmp r8,rcx
  930. jb NEAR $L$open_sse_tail_128_rounds_and_x1hash
  931. cmp r8,10*16
  932. jne NEAR $L$open_sse_tail_128_rounds
  933. paddd xmm1,XMMWORD[$L$chacha20_consts]
  934. paddd xmm5,XMMWORD[((160+48))+rbp]
  935. paddd xmm9,XMMWORD[((160+64))+rbp]
  936. paddd xmm13,XMMWORD[((160+112))+rbp]
  937. paddd xmm0,XMMWORD[$L$chacha20_consts]
  938. paddd xmm4,XMMWORD[((160+48))+rbp]
  939. paddd xmm8,XMMWORD[((160+64))+rbp]
  940. paddd xmm12,XMMWORD[((160+96))+rbp]
  941. movdqu xmm3,XMMWORD[((0 + 0))+rsi]
  942. movdqu xmm7,XMMWORD[((16 + 0))+rsi]
  943. movdqu xmm11,XMMWORD[((32 + 0))+rsi]
  944. movdqu xmm15,XMMWORD[((48 + 0))+rsi]
  945. pxor xmm1,xmm3
  946. pxor xmm5,xmm7
  947. pxor xmm9,xmm11
  948. pxor xmm15,xmm13
  949. movdqu XMMWORD[(0 + 0)+rdi],xmm1
  950. movdqu XMMWORD[(16 + 0)+rdi],xmm5
  951. movdqu XMMWORD[(32 + 0)+rdi],xmm9
  952. movdqu XMMWORD[(48 + 0)+rdi],xmm15
  953. sub rbx,4*16
  954. lea rsi,[64+rsi]
  955. lea rdi,[64+rdi]
  956. jmp NEAR $L$open_sse_tail_64_dec_loop
  957. $L$open_sse_tail_192:
  958. movdqa xmm0,XMMWORD[$L$chacha20_consts]
  959. movdqa xmm4,XMMWORD[((160+48))+rbp]
  960. movdqa xmm8,XMMWORD[((160+64))+rbp]
  961. movdqa xmm1,xmm0
  962. movdqa xmm5,xmm4
  963. movdqa xmm9,xmm8
  964. movdqa xmm2,xmm0
  965. movdqa xmm6,xmm4
  966. movdqa xmm10,xmm8
  967. movdqa xmm14,XMMWORD[((160+96))+rbp]
  968. paddd xmm14,XMMWORD[$L$sse_inc]
  969. movdqa xmm13,xmm14
  970. paddd xmm13,XMMWORD[$L$sse_inc]
  971. movdqa xmm12,xmm13
  972. paddd xmm12,XMMWORD[$L$sse_inc]
  973. movdqa XMMWORD[(160+96)+rbp],xmm12
  974. movdqa XMMWORD[(160+112)+rbp],xmm13
  975. movdqa XMMWORD[(160+128)+rbp],xmm14
  976. mov rcx,rbx
  977. mov r8,10*16
  978. cmp rcx,10*16
  979. cmovg rcx,r8
  980. and rcx,-16
  981. xor r8,r8
  982. $L$open_sse_tail_192_rounds_and_x1hash:
  983. add r10,QWORD[((0+0))+r8*1+rsi]
  984. adc r11,QWORD[((8+0))+r8*1+rsi]
  985. adc r12,1
  986. mov rax,QWORD[((0+160+0))+rbp]
  987. mov r15,rax
  988. mul r10
  989. mov r13,rax
  990. mov r14,rdx
  991. mov rax,QWORD[((0+160+0))+rbp]
  992. mul r11
  993. imul r15,r12
  994. add r14,rax
  995. adc r15,rdx
  996. mov rax,QWORD[((8+160+0))+rbp]
  997. mov r9,rax
  998. mul r10
  999. add r14,rax
  1000. adc rdx,0
  1001. mov r10,rdx
  1002. mov rax,QWORD[((8+160+0))+rbp]
  1003. mul r11
  1004. add r15,rax
  1005. adc rdx,0
  1006. imul r9,r12
  1007. add r15,r10
  1008. adc r9,rdx
  1009. mov r10,r13
  1010. mov r11,r14
  1011. mov r12,r15
  1012. and r12,3
  1013. mov r13,r15
  1014. and r13,-4
  1015. mov r14,r9
  1016. shrd r15,r9,2
  1017. shr r9,2
  1018. add r15,r13
  1019. adc r9,r14
  1020. add r10,r15
  1021. adc r11,r9
  1022. adc r12,0
  1023. $L$open_sse_tail_192_rounds:
  1024. add r8,16
  1025. paddd xmm0,xmm4
  1026. pxor xmm12,xmm0
  1027. pshufb xmm12,XMMWORD[$L$rol16]
  1028. paddd xmm8,xmm12
  1029. pxor xmm4,xmm8
  1030. movdqa xmm3,xmm4
  1031. pslld xmm3,12
  1032. psrld xmm4,20
  1033. pxor xmm4,xmm3
  1034. paddd xmm0,xmm4
  1035. pxor xmm12,xmm0
  1036. pshufb xmm12,XMMWORD[$L$rol8]
  1037. paddd xmm8,xmm12
  1038. pxor xmm4,xmm8
  1039. movdqa xmm3,xmm4
  1040. pslld xmm3,7
  1041. psrld xmm4,25
  1042. pxor xmm4,xmm3
  1043. DB 102,15,58,15,228,4
  1044. DB 102,69,15,58,15,192,8
  1045. DB 102,69,15,58,15,228,12
  1046. paddd xmm1,xmm5
  1047. pxor xmm13,xmm1
  1048. pshufb xmm13,XMMWORD[$L$rol16]
  1049. paddd xmm9,xmm13
  1050. pxor xmm5,xmm9
  1051. movdqa xmm3,xmm5
  1052. pslld xmm3,12
  1053. psrld xmm5,20
  1054. pxor xmm5,xmm3
  1055. paddd xmm1,xmm5
  1056. pxor xmm13,xmm1
  1057. pshufb xmm13,XMMWORD[$L$rol8]
  1058. paddd xmm9,xmm13
  1059. pxor xmm5,xmm9
  1060. movdqa xmm3,xmm5
  1061. pslld xmm3,7
  1062. psrld xmm5,25
  1063. pxor xmm5,xmm3
  1064. DB 102,15,58,15,237,4
  1065. DB 102,69,15,58,15,201,8
  1066. DB 102,69,15,58,15,237,12
  1067. paddd xmm2,xmm6
  1068. pxor xmm14,xmm2
  1069. pshufb xmm14,XMMWORD[$L$rol16]
  1070. paddd xmm10,xmm14
  1071. pxor xmm6,xmm10
  1072. movdqa xmm3,xmm6
  1073. pslld xmm3,12
  1074. psrld xmm6,20
  1075. pxor xmm6,xmm3
  1076. paddd xmm2,xmm6
  1077. pxor xmm14,xmm2
  1078. pshufb xmm14,XMMWORD[$L$rol8]
  1079. paddd xmm10,xmm14
  1080. pxor xmm6,xmm10
  1081. movdqa xmm3,xmm6
  1082. pslld xmm3,7
  1083. psrld xmm6,25
  1084. pxor xmm6,xmm3
  1085. DB 102,15,58,15,246,4
  1086. DB 102,69,15,58,15,210,8
  1087. DB 102,69,15,58,15,246,12
  1088. paddd xmm0,xmm4
  1089. pxor xmm12,xmm0
  1090. pshufb xmm12,XMMWORD[$L$rol16]
  1091. paddd xmm8,xmm12
  1092. pxor xmm4,xmm8
  1093. movdqa xmm3,xmm4
  1094. pslld xmm3,12
  1095. psrld xmm4,20
  1096. pxor xmm4,xmm3
  1097. paddd xmm0,xmm4
  1098. pxor xmm12,xmm0
  1099. pshufb xmm12,XMMWORD[$L$rol8]
  1100. paddd xmm8,xmm12
  1101. pxor xmm4,xmm8
  1102. movdqa xmm3,xmm4
  1103. pslld xmm3,7
  1104. psrld xmm4,25
  1105. pxor xmm4,xmm3
  1106. DB 102,15,58,15,228,12
  1107. DB 102,69,15,58,15,192,8
  1108. DB 102,69,15,58,15,228,4
  1109. paddd xmm1,xmm5
  1110. pxor xmm13,xmm1
  1111. pshufb xmm13,XMMWORD[$L$rol16]
  1112. paddd xmm9,xmm13
  1113. pxor xmm5,xmm9
  1114. movdqa xmm3,xmm5
  1115. pslld xmm3,12
  1116. psrld xmm5,20
  1117. pxor xmm5,xmm3
  1118. paddd xmm1,xmm5
  1119. pxor xmm13,xmm1
  1120. pshufb xmm13,XMMWORD[$L$rol8]
  1121. paddd xmm9,xmm13
  1122. pxor xmm5,xmm9
  1123. movdqa xmm3,xmm5
  1124. pslld xmm3,7
  1125. psrld xmm5,25
  1126. pxor xmm5,xmm3
  1127. DB 102,15,58,15,237,12
  1128. DB 102,69,15,58,15,201,8
  1129. DB 102,69,15,58,15,237,4
  1130. paddd xmm2,xmm6
  1131. pxor xmm14,xmm2
  1132. pshufb xmm14,XMMWORD[$L$rol16]
  1133. paddd xmm10,xmm14
  1134. pxor xmm6,xmm10
  1135. movdqa xmm3,xmm6
  1136. pslld xmm3,12
  1137. psrld xmm6,20
  1138. pxor xmm6,xmm3
  1139. paddd xmm2,xmm6
  1140. pxor xmm14,xmm2
  1141. pshufb xmm14,XMMWORD[$L$rol8]
  1142. paddd xmm10,xmm14
  1143. pxor xmm6,xmm10
  1144. movdqa xmm3,xmm6
  1145. pslld xmm3,7
  1146. psrld xmm6,25
  1147. pxor xmm6,xmm3
  1148. DB 102,15,58,15,246,12
  1149. DB 102,69,15,58,15,210,8
  1150. DB 102,69,15,58,15,246,4
  1151. cmp r8,rcx
  1152. jb NEAR $L$open_sse_tail_192_rounds_and_x1hash
  1153. cmp r8,10*16
  1154. jne NEAR $L$open_sse_tail_192_rounds
  1155. cmp rbx,11*16
  1156. jb NEAR $L$open_sse_tail_192_finish
  1157. add r10,QWORD[((0+160))+rsi]
  1158. adc r11,QWORD[((8+160))+rsi]
  1159. adc r12,1
  1160. mov rax,QWORD[((0+160+0))+rbp]
  1161. mov r15,rax
  1162. mul r10
  1163. mov r13,rax
  1164. mov r14,rdx
  1165. mov rax,QWORD[((0+160+0))+rbp]
  1166. mul r11
  1167. imul r15,r12
  1168. add r14,rax
  1169. adc r15,rdx
  1170. mov rax,QWORD[((8+160+0))+rbp]
  1171. mov r9,rax
  1172. mul r10
  1173. add r14,rax
  1174. adc rdx,0
  1175. mov r10,rdx
  1176. mov rax,QWORD[((8+160+0))+rbp]
  1177. mul r11
  1178. add r15,rax
  1179. adc rdx,0
  1180. imul r9,r12
  1181. add r15,r10
  1182. adc r9,rdx
  1183. mov r10,r13
  1184. mov r11,r14
  1185. mov r12,r15
  1186. and r12,3
  1187. mov r13,r15
  1188. and r13,-4
  1189. mov r14,r9
  1190. shrd r15,r9,2
  1191. shr r9,2
  1192. add r15,r13
  1193. adc r9,r14
  1194. add r10,r15
  1195. adc r11,r9
  1196. adc r12,0
  1197. cmp rbx,12*16
  1198. jb NEAR $L$open_sse_tail_192_finish
  1199. add r10,QWORD[((0+176))+rsi]
  1200. adc r11,QWORD[((8+176))+rsi]
  1201. adc r12,1
  1202. mov rax,QWORD[((0+160+0))+rbp]
  1203. mov r15,rax
  1204. mul r10
  1205. mov r13,rax
  1206. mov r14,rdx
  1207. mov rax,QWORD[((0+160+0))+rbp]
  1208. mul r11
  1209. imul r15,r12
  1210. add r14,rax
  1211. adc r15,rdx
  1212. mov rax,QWORD[((8+160+0))+rbp]
  1213. mov r9,rax
  1214. mul r10
  1215. add r14,rax
  1216. adc rdx,0
  1217. mov r10,rdx
  1218. mov rax,QWORD[((8+160+0))+rbp]
  1219. mul r11
  1220. add r15,rax
  1221. adc rdx,0
  1222. imul r9,r12
  1223. add r15,r10
  1224. adc r9,rdx
  1225. mov r10,r13
  1226. mov r11,r14
  1227. mov r12,r15
  1228. and r12,3
  1229. mov r13,r15
  1230. and r13,-4
  1231. mov r14,r9
  1232. shrd r15,r9,2
  1233. shr r9,2
  1234. add r15,r13
  1235. adc r9,r14
  1236. add r10,r15
  1237. adc r11,r9
  1238. adc r12,0
  ; Finish the 192-byte tail of the open (decrypt) path.
  ; Three working blocks are live here, one per register group:
  ;   xmm2/xmm6/xmm10/xmm14, xmm1/xmm5/xmm9/xmm13 and xmm0/xmm4/xmm8/xmm12.
  ; This code adds the saved input state back into each group, decrypts 128
  ; bytes with the first two groups, and leaves the third group in
  ; xmm0/xmm4/xmm8/xmm12 for $L$open_sse_tail_64_dec_loop.
  1239. $L$open_sse_tail_192_finish:
  ; Feed-forward: row 0 is the constant row, rows 1 and 2 come from the stack
  ; slots at rbp+160+48 and rbp+160+64, and row 3 uses a different slot per
  ; group (rbp+160+128, +112, +96).
  1240. paddd xmm2,XMMWORD[$L$chacha20_consts]
  1241. paddd xmm6,XMMWORD[((160+48))+rbp]
  1242. paddd xmm10,XMMWORD[((160+64))+rbp]
  1243. paddd xmm14,XMMWORD[((160+128))+rbp]
  1244. paddd xmm1,XMMWORD[$L$chacha20_consts]
  1245. paddd xmm5,XMMWORD[((160+48))+rbp]
  1246. paddd xmm9,XMMWORD[((160+64))+rbp]
  1247. paddd xmm13,XMMWORD[((160+112))+rbp]
  1248. paddd xmm0,XMMWORD[$L$chacha20_consts]
  1249. paddd xmm4,XMMWORD[((160+48))+rbp]
  1250. paddd xmm8,XMMWORD[((160+64))+rbp]
  1251. paddd xmm12,XMMWORD[((160+96))+rbp]
  ; Bytes 0..63: load input from rsi into scratch registers, XOR with the
  ; xmm2/6/10/14 group, store to rdi. The last row is XORed into the loaded
  ; input (xmm15) rather than into xmm14, and xmm15 is what gets stored.
  1252. movdqu xmm3,XMMWORD[((0 + 0))+rsi]
  1253. movdqu xmm7,XMMWORD[((16 + 0))+rsi]
  1254. movdqu xmm11,XMMWORD[((32 + 0))+rsi]
  1255. movdqu xmm15,XMMWORD[((48 + 0))+rsi]
  1256. pxor xmm2,xmm3
  1257. pxor xmm6,xmm7
  1258. pxor xmm10,xmm11
  1259. pxor xmm15,xmm14
  1260. movdqu XMMWORD[(0 + 0)+rdi],xmm2
  1261. movdqu XMMWORD[(16 + 0)+rdi],xmm6
  1262. movdqu XMMWORD[(32 + 0)+rdi],xmm10
  1263. movdqu XMMWORD[(48 + 0)+rdi],xmm15
  ; Bytes 64..127: same pattern with the xmm1/5/9/13 group.
  1264. movdqu xmm3,XMMWORD[((0 + 64))+rsi]
  1265. movdqu xmm7,XMMWORD[((16 + 64))+rsi]
  1266. movdqu xmm11,XMMWORD[((32 + 64))+rsi]
  1267. movdqu xmm15,XMMWORD[((48 + 64))+rsi]
  1268. pxor xmm1,xmm3
  1269. pxor xmm5,xmm7
  1270. pxor xmm9,xmm11
  1271. pxor xmm15,xmm13
  1272. movdqu XMMWORD[(0 + 64)+rdi],xmm1
  1273. movdqu XMMWORD[(16 + 64)+rdi],xmm5
  1274. movdqu XMMWORD[(32 + 64)+rdi],xmm9
  1275. movdqu XMMWORD[(48 + 64)+rdi],xmm15
  ; 128 bytes consumed: shrink the remaining length and advance both pointers.
  1276. sub rbx,8*16
  1277. lea rsi,[128+rsi]
  1278. lea rdi,[128+rdi]
  1279. jmp NEAR $L$open_sse_tail_64_dec_loop
  1280. $L$open_sse_tail_256:
  1281. movdqa xmm0,XMMWORD[$L$chacha20_consts]
  1282. movdqa xmm4,XMMWORD[((160+48))+rbp]
  1283. movdqa xmm8,XMMWORD[((160+64))+rbp]
  1284. movdqa xmm1,xmm0
  1285. movdqa xmm5,xmm4
  1286. movdqa xmm9,xmm8
  1287. movdqa xmm2,xmm0
  1288. movdqa xmm6,xmm4
  1289. movdqa xmm10,xmm8
  1290. movdqa xmm3,xmm0
  1291. movdqa xmm7,xmm4
  1292. movdqa xmm11,xmm8
  1293. movdqa xmm15,XMMWORD[((160+96))+rbp]
  1294. paddd xmm15,XMMWORD[$L$sse_inc]
  1295. movdqa xmm14,xmm15
  1296. paddd xmm14,XMMWORD[$L$sse_inc]
  1297. movdqa xmm13,xmm14
  1298. paddd xmm13,XMMWORD[$L$sse_inc]
  1299. movdqa xmm12,xmm13
  1300. paddd xmm12,XMMWORD[$L$sse_inc]
  1301. movdqa XMMWORD[(160+96)+rbp],xmm12
  1302. movdqa XMMWORD[(160+112)+rbp],xmm13
  1303. movdqa XMMWORD[(160+128)+rbp],xmm14
  1304. movdqa XMMWORD[(160+144)+rbp],xmm15
  1305. xor r8,r8
  1306. $L$open_sse_tail_256_rounds_and_x1hash:
  1307. add r10,QWORD[((0+0))+r8*1+rsi]
  1308. adc r11,QWORD[((8+0))+r8*1+rsi]
  1309. adc r12,1
  1310. movdqa XMMWORD[(160+80)+rbp],xmm11
  1311. paddd xmm0,xmm4
  1312. pxor xmm12,xmm0
  1313. pshufb xmm12,XMMWORD[$L$rol16]
  1314. paddd xmm8,xmm12
  1315. pxor xmm4,xmm8
  1316. movdqa xmm11,xmm4
  1317. pslld xmm11,12
  1318. psrld xmm4,20
  1319. pxor xmm4,xmm11
  1320. paddd xmm0,xmm4
  1321. pxor xmm12,xmm0
  1322. pshufb xmm12,XMMWORD[$L$rol8]
  1323. paddd xmm8,xmm12
  1324. pxor xmm4,xmm8
  1325. movdqa xmm11,xmm4
  1326. pslld xmm11,7
  1327. psrld xmm4,25
  1328. pxor xmm4,xmm11
  1329. DB 102,15,58,15,228,4
  1330. DB 102,69,15,58,15,192,8
  1331. DB 102,69,15,58,15,228,12
  1332. paddd xmm1,xmm5
  1333. pxor xmm13,xmm1
  1334. pshufb xmm13,XMMWORD[$L$rol16]
  1335. paddd xmm9,xmm13
  1336. pxor xmm5,xmm9
  1337. movdqa xmm11,xmm5
  1338. pslld xmm11,12
  1339. psrld xmm5,20
  1340. pxor xmm5,xmm11
  1341. paddd xmm1,xmm5
  1342. pxor xmm13,xmm1
  1343. pshufb xmm13,XMMWORD[$L$rol8]
  1344. paddd xmm9,xmm13
  1345. pxor xmm5,xmm9
  1346. movdqa xmm11,xmm5
  1347. pslld xmm11,7
  1348. psrld xmm5,25
  1349. pxor xmm5,xmm11
  1350. DB 102,15,58,15,237,4
  1351. DB 102,69,15,58,15,201,8
  1352. DB 102,69,15,58,15,237,12
  1353. paddd xmm2,xmm6
  1354. pxor xmm14,xmm2
  1355. pshufb xmm14,XMMWORD[$L$rol16]
  1356. paddd xmm10,xmm14
  1357. pxor xmm6,xmm10
  1358. movdqa xmm11,xmm6
  1359. pslld xmm11,12
  1360. psrld xmm6,20
  1361. pxor xmm6,xmm11
  1362. paddd xmm2,xmm6
  1363. pxor xmm14,xmm2
  1364. pshufb xmm14,XMMWORD[$L$rol8]
  1365. paddd xmm10,xmm14
  1366. pxor xmm6,xmm10
  1367. movdqa xmm11,xmm6
  1368. pslld xmm11,7
  1369. psrld xmm6,25
  1370. pxor xmm6,xmm11
  1371. DB 102,15,58,15,246,4
  1372. DB 102,69,15,58,15,210,8
  1373. DB 102,69,15,58,15,246,12
  1374. movdqa xmm11,XMMWORD[((160+80))+rbp]
  1375. mov rax,QWORD[((0+160+0))+rbp]
  1376. mov r15,rax
  1377. mul r10
  1378. mov r13,rax
  1379. mov r14,rdx
  1380. mov rax,QWORD[((0+160+0))+rbp]
  1381. mul r11
  1382. imul r15,r12
  1383. add r14,rax
  1384. adc r15,rdx
  1385. movdqa XMMWORD[(160+80)+rbp],xmm9
  1386. paddd xmm3,xmm7
  1387. pxor xmm15,xmm3
  1388. pshufb xmm15,XMMWORD[$L$rol16]
  1389. paddd xmm11,xmm15
  1390. pxor xmm7,xmm11
  1391. movdqa xmm9,xmm7
  1392. pslld xmm9,12
  1393. psrld xmm7,20
  1394. pxor xmm7,xmm9
  1395. paddd xmm3,xmm7
  1396. pxor xmm15,xmm3
  1397. pshufb xmm15,XMMWORD[$L$rol8]
  1398. paddd xmm11,xmm15
  1399. pxor xmm7,xmm11
  1400. movdqa xmm9,xmm7
  1401. pslld xmm9,7
  1402. psrld xmm7,25
  1403. pxor xmm7,xmm9
  1404. DB 102,15,58,15,255,4
  1405. DB 102,69,15,58,15,219,8
  1406. DB 102,69,15,58,15,255,12
  1407. movdqa xmm9,XMMWORD[((160+80))+rbp]
  1408. mov rax,QWORD[((8+160+0))+rbp]
  1409. mov r9,rax
  1410. mul r10
  1411. add r14,rax
  1412. adc rdx,0
  1413. mov r10,rdx
  1414. mov rax,QWORD[((8+160+0))+rbp]
  1415. mul r11
  1416. add r15,rax
  1417. adc rdx,0
  1418. movdqa XMMWORD[(160+80)+rbp],xmm11
  1419. paddd xmm0,xmm4
  1420. pxor xmm12,xmm0
  1421. pshufb xmm12,XMMWORD[$L$rol16]
  1422. paddd xmm8,xmm12
  1423. pxor xmm4,xmm8
  1424. movdqa xmm11,xmm4
  1425. pslld xmm11,12
  1426. psrld xmm4,20
  1427. pxor xmm4,xmm11
  1428. paddd xmm0,xmm4
  1429. pxor xmm12,xmm0
  1430. pshufb xmm12,XMMWORD[$L$rol8]
  1431. paddd xmm8,xmm12
  1432. pxor xmm4,xmm8
  1433. movdqa xmm11,xmm4
  1434. pslld xmm11,7
  1435. psrld xmm4,25
  1436. pxor xmm4,xmm11
  1437. DB 102,15,58,15,228,12
  1438. DB 102,69,15,58,15,192,8
  1439. DB 102,69,15,58,15,228,4
  1440. paddd xmm1,xmm5
  1441. pxor xmm13,xmm1
  1442. pshufb xmm13,XMMWORD[$L$rol16]
  1443. paddd xmm9,xmm13
  1444. pxor xmm5,xmm9
  1445. movdqa xmm11,xmm5
  1446. pslld xmm11,12
  1447. psrld xmm5,20
  1448. pxor xmm5,xmm11
  1449. paddd xmm1,xmm5
  1450. pxor xmm13,xmm1
  1451. pshufb xmm13,XMMWORD[$L$rol8]
  1452. paddd xmm9,xmm13
  1453. pxor xmm5,xmm9
  1454. movdqa xmm11,xmm5
  1455. pslld xmm11,7
  1456. psrld xmm5,25
  1457. pxor xmm5,xmm11
  1458. DB 102,15,58,15,237,12
  1459. DB 102,69,15,58,15,201,8
  1460. DB 102,69,15,58,15,237,4
  1461. imul r9,r12
  1462. add r15,r10
  1463. adc r9,rdx
  1464. paddd xmm2,xmm6
  1465. pxor xmm14,xmm2
  1466. pshufb xmm14,XMMWORD[$L$rol16]
  1467. paddd xmm10,xmm14
  1468. pxor xmm6,xmm10
  1469. movdqa xmm11,xmm6
  1470. pslld xmm11,12
  1471. psrld xmm6,20
  1472. pxor xmm6,xmm11
  1473. paddd xmm2,xmm6
  1474. pxor xmm14,xmm2
  1475. pshufb xmm14,XMMWORD[$L$rol8]
  1476. paddd xmm10,xmm14
  1477. pxor xmm6,xmm10
  1478. movdqa xmm11,xmm6
  1479. pslld xmm11,7
  1480. psrld xmm6,25
  1481. pxor xmm6,xmm11
  1482. DB 102,15,58,15,246,12
  1483. DB 102,69,15,58,15,210,8
  1484. DB 102,69,15,58,15,246,4
  1485. movdqa xmm11,XMMWORD[((160+80))+rbp]
  1486. mov r10,r13
  1487. mov r11,r14
  1488. mov r12,r15
  1489. and r12,3
  1490. mov r13,r15
  1491. and r13,-4
  1492. mov r14,r9
  1493. shrd r15,r9,2
  1494. shr r9,2
  1495. add r15,r13
  1496. adc r9,r14
  1497. add r10,r15
  1498. adc r11,r9
  1499. adc r12,0
  1500. movdqa XMMWORD[(160+80)+rbp],xmm9
  1501. paddd xmm3,xmm7
  1502. pxor xmm15,xmm3
  1503. pshufb xmm15,XMMWORD[$L$rol16]
  1504. paddd xmm11,xmm15
  1505. pxor xmm7,xmm11
  1506. movdqa xmm9,xmm7
  1507. pslld xmm9,12
  1508. psrld xmm7,20
  1509. pxor xmm7,xmm9
  1510. paddd xmm3,xmm7
  1511. pxor xmm15,xmm3
  1512. pshufb xmm15,XMMWORD[$L$rol8]
  1513. paddd xmm11,xmm15
  1514. pxor xmm7,xmm11
  1515. movdqa xmm9,xmm7
  1516. pslld xmm9,7
  1517. psrld xmm7,25
  1518. pxor xmm7,xmm9
  1519. DB 102,15,58,15,255,12
  1520. DB 102,69,15,58,15,219,8
  1521. DB 102,69,15,58,15,255,4
  1522. movdqa xmm9,XMMWORD[((160+80))+rbp]
  1523. add r8,16
  1524. cmp r8,10*16
  1525. jb NEAR $L$open_sse_tail_256_rounds_and_x1hash
  1526. mov rcx,rbx
  1527. and rcx,-16
  1528. $L$open_sse_tail_256_hash:
  1529. add r10,QWORD[((0+0))+r8*1+rsi]
  1530. adc r11,QWORD[((8+0))+r8*1+rsi]
  1531. adc r12,1
  1532. mov rax,QWORD[((0+160+0))+rbp]
  1533. mov r15,rax
  1534. mul r10
  1535. mov r13,rax
  1536. mov r14,rdx
  1537. mov rax,QWORD[((0+160+0))+rbp]
  1538. mul r11
  1539. imul r15,r12
  1540. add r14,rax
  1541. adc r15,rdx
  1542. mov rax,QWORD[((8+160+0))+rbp]
  1543. mov r9,rax
  1544. mul r10
  1545. add r14,rax
  1546. adc rdx,0
  1547. mov r10,rdx
  1548. mov rax,QWORD[((8+160+0))+rbp]
  1549. mul r11
  1550. add r15,rax
  1551. adc rdx,0
  1552. imul r9,r12
  1553. add r15,r10
  1554. adc r9,rdx
  1555. mov r10,r13
  1556. mov r11,r14
  1557. mov r12,r15
  1558. and r12,3
  1559. mov r13,r15
  1560. and r13,-4
  1561. mov r14,r9
  1562. shrd r15,r9,2
  1563. shr r9,2
  1564. add r15,r13
  1565. adc r9,r14
  1566. add r10,r15
  1567. adc r11,r9
  1568. adc r12,0
  1569. add r8,16
  1570. cmp r8,rcx
  1571. jb NEAR $L$open_sse_tail_256_hash
  1572. paddd xmm3,XMMWORD[$L$chacha20_consts]
  1573. paddd xmm7,XMMWORD[((160+48))+rbp]
  1574. paddd xmm11,XMMWORD[((160+64))+rbp]
  1575. paddd xmm15,XMMWORD[((160+144))+rbp]
  1576. paddd xmm2,XMMWORD[$L$chacha20_consts]
  1577. paddd xmm6,XMMWORD[((160+48))+rbp]
  1578. paddd xmm10,XMMWORD[((160+64))+rbp]
  1579. paddd xmm14,XMMWORD[((160+128))+rbp]
  1580. paddd xmm1,XMMWORD[$L$chacha20_consts]
  1581. paddd xmm5,XMMWORD[((160+48))+rbp]
  1582. paddd xmm9,XMMWORD[((160+64))+rbp]
  1583. paddd xmm13,XMMWORD[((160+112))+rbp]
  1584. paddd xmm0,XMMWORD[$L$chacha20_consts]
  1585. paddd xmm4,XMMWORD[((160+48))+rbp]
  1586. paddd xmm8,XMMWORD[((160+64))+rbp]
  1587. paddd xmm12,XMMWORD[((160+96))+rbp]
  1588. movdqa XMMWORD[(160+80)+rbp],xmm12
  1589. movdqu xmm12,XMMWORD[((0 + 0))+rsi]
  1590. pxor xmm12,xmm3
  1591. movdqu XMMWORD[(0 + 0)+rdi],xmm12
  1592. movdqu xmm12,XMMWORD[((16 + 0))+rsi]
  1593. pxor xmm12,xmm7
  1594. movdqu XMMWORD[(16 + 0)+rdi],xmm12
  1595. movdqu xmm12,XMMWORD[((32 + 0))+rsi]
  1596. pxor xmm12,xmm11
  1597. movdqu XMMWORD[(32 + 0)+rdi],xmm12
  1598. movdqu xmm12,XMMWORD[((48 + 0))+rsi]
  1599. pxor xmm12,xmm15
  1600. movdqu XMMWORD[(48 + 0)+rdi],xmm12
  1601. movdqu xmm3,XMMWORD[((0 + 64))+rsi]
  1602. movdqu xmm7,XMMWORD[((16 + 64))+rsi]
  1603. movdqu xmm11,XMMWORD[((32 + 64))+rsi]
  1604. movdqu xmm15,XMMWORD[((48 + 64))+rsi]
  1605. pxor xmm2,xmm3
  1606. pxor xmm6,xmm7
  1607. pxor xmm10,xmm11
  1608. pxor xmm15,xmm14
  1609. movdqu XMMWORD[(0 + 64)+rdi],xmm2
  1610. movdqu XMMWORD[(16 + 64)+rdi],xmm6
  1611. movdqu XMMWORD[(32 + 64)+rdi],xmm10
  1612. movdqu XMMWORD[(48 + 64)+rdi],xmm15
  1613. movdqu xmm3,XMMWORD[((0 + 128))+rsi]
  1614. movdqu xmm7,XMMWORD[((16 + 128))+rsi]
  1615. movdqu xmm11,XMMWORD[((32 + 128))+rsi]
  1616. movdqu xmm15,XMMWORD[((48 + 128))+rsi]
  1617. pxor xmm1,xmm3
  1618. pxor xmm5,xmm7
  1619. pxor xmm9,xmm11
  1620. pxor xmm15,xmm13
  1621. movdqu XMMWORD[(0 + 128)+rdi],xmm1
  1622. movdqu XMMWORD[(16 + 128)+rdi],xmm5
  1623. movdqu XMMWORD[(32 + 128)+rdi],xmm9
  1624. movdqu XMMWORD[(48 + 128)+rdi],xmm15
  1625. movdqa xmm12,XMMWORD[((160+80))+rbp]
  1626. sub rbx,12*16
  1627. lea rsi,[192+rsi]
  1628. lea rdi,[192+rdi]
  ; Decrypt up to 64 remaining bytes, 16 at a time.
  ; xmm0, xmm4, xmm8, xmm12 act as a queue of 16-byte keystream chunks: xmm0 is
  ; used for the current chunk and the other three shift down after each use.
  ; rbx = bytes remaining, rsi = input pointer, rdi = output pointer.
  ; Exits to $L$open_sse_tail_16_init once fewer than 16 bytes remain.
  1629. $L$open_sse_tail_64_dec_loop:
  1630. cmp rbx,16
  1631. jb NEAR $L$open_sse_tail_16_init
  1632. sub rbx,16
  1633. movdqu xmm3,XMMWORD[rsi]
  1634. pxor xmm0,xmm3
  1635. movdqu XMMWORD[rdi],xmm0
  1636. lea rsi,[16+rsi]
  1637. lea rdi,[16+rdi]
  ; Shift the keystream queue: xmm4 -> xmm0, xmm8 -> xmm4, xmm12 -> xmm8.
  1638. movdqa xmm0,xmm4
  1639. movdqa xmm4,xmm8
  1640. movdqa xmm8,xmm12
  1641. jmp NEAR $L$open_sse_tail_64_dec_loop
  ; Handle the final partial block (rbx < 16 bytes) of the open path.
  ; $L$open_sse_tail_16_init is entered from the 64-byte loop and moves the
  ; next keystream chunk from xmm0 into xmm1; $L$open_sse_tail_16 itself is also
  ; a jump target for code that already has the keystream in xmm1.
  1642. $L$open_sse_tail_16_init:
  1643. movdqa xmm1,xmm0
  1644. $L$open_sse_tail_16:
  ; Nothing left to decrypt: go straight to the tag computation.
  1645. test rbx,rbx
  1646. jz NEAR $L$open_sse_finalize
  ; Gather the rbx input bytes into xmm3 without reading past the end of the
  ; buffer: start at the last byte (rsi+rbx-1) and walk backwards, shifting
  ; xmm3 left one byte and inserting at byte 0 each step. The result has the
  ; bytes in memory order in the low lanes and zeros above them.
  1647. pxor xmm3,xmm3
  1648. lea rsi,[((-1))+rbx*1+rsi]
  1649. mov r8,rbx
  1650. $L$open_sse_tail_16_compose:
  1651. pslldq xmm3,1
  1652. pinsrb xmm3,BYTE[rsi],0
  1653. sub rsi,1
  1654. sub r8,1
  1655. jnz NEAR $L$open_sse_tail_16_compose
  ; Keep the zero-padded input for hashing before it is XORed:
  ; the DB bytes encode movq r13,xmm3 (low qword); pextrq takes the high qword.
  1656. DB 102,73,15,126,221
  1657. pextrq r14,xmm3,1
  ; Decrypt in the register, then write out exactly rbx bytes one at a time.
  1658. pxor xmm3,xmm1
  1659. $L$open_sse_tail_16_extract:
  1660. pextrb XMMWORD[rdi],xmm3,0
  1661. psrldq xmm3,1
  1662. add rdi,1
  1663. sub rbx,1
  1664. jne NEAR $L$open_sse_tail_16_extract
  ; Absorb the padded input block into the accumulator r10:r11:r12;
  ; adc r12,1 adds the 2^128 bit on top of the 128-bit block.
  1665. add r10,r13
  1666. adc r11,r14
  1667. adc r12,1
  ; Multiply the accumulator by the 128-bit value stored at rbp+160
  ; (qwords at +0 and +8). Partial products are collected in r13, r14, r15, r9.
  1668. mov rax,QWORD[((0+160+0))+rbp]
  1669. mov r15,rax
  1670. mul r10
  1671. mov r13,rax
  1672. mov r14,rdx
  1673. mov rax,QWORD[((0+160+0))+rbp]
  1674. mul r11
  1675. imul r15,r12
  1676. add r14,rax
  1677. adc r15,rdx
  1678. mov rax,QWORD[((8+160+0))+rbp]
  1679. mov r9,rax
  1680. mul r10
  1681. add r14,rax
  1682. adc rdx,0
  1683. mov r10,rdx
  1684. mov rax,QWORD[((8+160+0))+rbp]
  1685. mul r11
  1686. add r15,rax
  1687. adc rdx,0
  1688. imul r9,r12
  1689. add r15,r10
  1690. adc r9,rdx
  ; Partial reduction: keep the low 130 bits in r10:r11:r12 (r12 = r15 & 3) and
  ; fold the rest back in as (hi & ~3) + (hi >> 2), i.e. 5 * (hi >> 2).
  1691. mov r10,r13
  1692. mov r11,r14
  1693. mov r12,r15
  1694. and r12,3
  1695. mov r13,r15
  1696. and r13,-4
  1697. mov r14,r9
  1698. shrd r15,r9,2
  1699. shr r9,2
  1700. add r15,r13
  1701. adc r9,r14
  1702. add r10,r15
  1703. adc r11,r9
  1704. adc r12,0
  ; Finish the authenticator and return from the open path.
  ; Steps: absorb the 16-byte block held at rbp+160+32, multiply and reduce,
  ; do the final conditional subtraction, add the 128-bit value at rbp+160+16,
  ; write the 16-byte result through the pointer popped into r9, then restore
  ; callee-saved state.
  ; NOTE(review): the store that fills rbp+160+32 on the open path is outside
  ; this section; presumably it holds the AD and ciphertext lengths. Confirm.
  1705. $L$open_sse_finalize:
  1706. add r10,QWORD[((0+160+32))+rbp]
  1707. adc r11,QWORD[((8+160+32))+rbp]
  1708. adc r12,1
  ; Accumulator r10:r11:r12 times the 128-bit value at rbp+160.
  1709. mov rax,QWORD[((0+160+0))+rbp]
  1710. mov r15,rax
  1711. mul r10
  1712. mov r13,rax
  1713. mov r14,rdx
  1714. mov rax,QWORD[((0+160+0))+rbp]
  1715. mul r11
  1716. imul r15,r12
  1717. add r14,rax
  1718. adc r15,rdx
  1719. mov rax,QWORD[((8+160+0))+rbp]
  1720. mov r9,rax
  1721. mul r10
  1722. add r14,rax
  1723. adc rdx,0
  1724. mov r10,rdx
  1725. mov rax,QWORD[((8+160+0))+rbp]
  1726. mul r11
  1727. add r15,rax
  1728. adc rdx,0
  1729. imul r9,r12
  1730. add r15,r10
  1731. adc r9,rdx
  ; Partial reduction: low 130 bits stay, the upper part is folded in times 5.
  1732. mov r10,r13
  1733. mov r11,r14
  1734. mov r12,r15
  1735. and r12,3
  1736. mov r13,r15
  1737. and r13,-4
  1738. mov r14,r9
  1739. shrd r15,r9,2
  1740. shr r9,2
  1741. add r15,r13
  1742. adc r9,r14
  1743. add r10,r15
  1744. adc r11,r9
  1745. adc r12,0
  ; Final reduction without a branch: subtract the three-limb constant
  ; (-5, -1, 3), which is 2^130 - 5, and keep the original value via cmovc
  ; if the subtraction borrowed.
  1746. mov r13,r10
  1747. mov r14,r11
  1748. mov r15,r12
  1749. sub r10,-5
  1750. sbb r11,-1
  1751. sbb r12,3
  1752. cmovc r10,r13
  1753. cmovc r11,r14
  1754. cmovc r12,r15
  ; Add the 128-bit value at rbp+160+16; only the low 128 bits (r10:r11) are
  ; kept, so the carry out of r11 is dropped.
  1755. add r10,QWORD[((0+160+16))+rbp]
  1756. adc r11,QWORD[((8+160+16))+rbp]
  ; Restore xmm6-xmm15 (callee-saved under the Microsoft x64 ABI) from the
  ; save area at rbp+0..rbp+144.
  1757. movaps xmm6,XMMWORD[((0+0))+rbp]
  1758. movaps xmm7,XMMWORD[((16+0))+rbp]
  1759. movaps xmm8,XMMWORD[((32+0))+rbp]
  1760. movaps xmm9,XMMWORD[((48+0))+rbp]
  1761. movaps xmm10,XMMWORD[((64+0))+rbp]
  1762. movaps xmm11,XMMWORD[((80+0))+rbp]
  1763. movaps xmm12,XMMWORD[((96+0))+rbp]
  1764. movaps xmm13,XMMWORD[((112+0))+rbp]
  1765. movaps xmm14,XMMWORD[((128+0))+rbp]
  1766. movaps xmm15,XMMWORD[((144+0))+rbp]
  ; Release the local frame, then pop the pointer that was pushed last and
  ; store the 16-byte result through it.
  1767. add rsp,288 + 160 + 32
  1768. pop r9
  1769. mov QWORD[r9],r10
  1770. mov QWORD[8+r9],r11
  1771. pop r15
  1772. pop r14
  1773. pop r13
  1774. pop r12
  1775. pop rbx
  1776. pop rbp
  ; rdi and rsi are reloaded from the two slots above the return address.
  1777. mov rdi,QWORD[8+rsp] ;WIN64 epilogue
  1778. mov rsi,QWORD[16+rsp]
  1779. DB 0F3h,0C3h ;repret
  ; Short-input open path: set up three ChaCha20 states side by side.
  ; Row 0 (xmm0/1/2) is the constant row. Rows 1 and 2 (xmm4/5/6 and
  ; xmm8/9/10) are loaded from [r9] and [r9+16]. Row 3 is loaded from [r9+32]
  ; into xmm12, with xmm13 and xmm14 each one $L$sse_inc step further on.
  ; NOTE(review): r9 is presumably the key/counter/nonce block; its layout is
  ; defined by the caller and not visible here.
  1780. $L$open_sse_128:
  1781. movdqu xmm0,XMMWORD[$L$chacha20_consts]
  1782. movdqa xmm1,xmm0
  1783. movdqa xmm2,xmm0
  1784. movdqu xmm4,XMMWORD[r9]
  1785. movdqa xmm5,xmm4
  1786. movdqa xmm6,xmm4
  1787. movdqu xmm8,XMMWORD[16+r9]
  1788. movdqa xmm9,xmm8
  1789. movdqa xmm10,xmm8
  1790. movdqu xmm12,XMMWORD[32+r9]
  1791. movdqa xmm13,xmm12
  1792. paddd xmm13,XMMWORD[$L$sse_inc]
  1793. movdqa xmm14,xmm13
  1794. paddd xmm14,XMMWORD[$L$sse_inc]
  ; Keep pristine copies of rows 1, 2 and 3 (the xmm13 variant) in xmm7, xmm11
  ; and xmm15; the code after the rounds loop adds them back into the states.
  1795. movdqa xmm7,xmm4
  1796. movdqa xmm11,xmm8
  1797. movdqa xmm15,xmm13
  ; r10 is the iteration counter for $L$open_sse_128_rounds (dec/jnz at its end).
  1798. mov r10,10
  1799. $L$open_sse_128_rounds:
  1800. paddd xmm0,xmm4
  1801. pxor xmm12,xmm0
  1802. pshufb xmm12,XMMWORD[$L$rol16]
  1803. paddd xmm8,xmm12
  1804. pxor xmm4,xmm8
  1805. movdqa xmm3,xmm4
  1806. pslld xmm3,12
  1807. psrld xmm4,20
  1808. pxor xmm4,xmm3
  1809. paddd xmm0,xmm4
  1810. pxor xmm12,xmm0
  1811. pshufb xmm12,XMMWORD[$L$rol8]
  1812. paddd xmm8,xmm12
  1813. pxor xmm4,xmm8
  1814. movdqa xmm3,xmm4
  1815. pslld xmm3,7
  1816. psrld xmm4,25
  1817. pxor xmm4,xmm3
  1818. DB 102,15,58,15,228,4
  1819. DB 102,69,15,58,15,192,8
  1820. DB 102,69,15,58,15,228,12
  1821. paddd xmm1,xmm5
  1822. pxor xmm13,xmm1
  1823. pshufb xmm13,XMMWORD[$L$rol16]
  1824. paddd xmm9,xmm13
  1825. pxor xmm5,xmm9
  1826. movdqa xmm3,xmm5
  1827. pslld xmm3,12
  1828. psrld xmm5,20
  1829. pxor xmm5,xmm3
  1830. paddd xmm1,xmm5
  1831. pxor xmm13,xmm1
  1832. pshufb xmm13,XMMWORD[$L$rol8]
  1833. paddd xmm9,xmm13
  1834. pxor xmm5,xmm9
  1835. movdqa xmm3,xmm5
  1836. pslld xmm3,7
  1837. psrld xmm5,25
  1838. pxor xmm5,xmm3
  1839. DB 102,15,58,15,237,4
  1840. DB 102,69,15,58,15,201,8
  1841. DB 102,69,15,58,15,237,12
  1842. paddd xmm2,xmm6
  1843. pxor xmm14,xmm2
  1844. pshufb xmm14,XMMWORD[$L$rol16]
  1845. paddd xmm10,xmm14
  1846. pxor xmm6,xmm10
  1847. movdqa xmm3,xmm6
  1848. pslld xmm3,12
  1849. psrld xmm6,20
  1850. pxor xmm6,xmm3
  1851. paddd xmm2,xmm6
  1852. pxor xmm14,xmm2
  1853. pshufb xmm14,XMMWORD[$L$rol8]
  1854. paddd xmm10,xmm14
  1855. pxor xmm6,xmm10
  1856. movdqa xmm3,xmm6
  1857. pslld xmm3,7
  1858. psrld xmm6,25
  1859. pxor xmm6,xmm3
  1860. DB 102,15,58,15,246,4
  1861. DB 102,69,15,58,15,210,8
  1862. DB 102,69,15,58,15,246,12
  1863. paddd xmm0,xmm4
  1864. pxor xmm12,xmm0
  1865. pshufb xmm12,XMMWORD[$L$rol16]
  1866. paddd xmm8,xmm12
  1867. pxor xmm4,xmm8
  1868. movdqa xmm3,xmm4
  1869. pslld xmm3,12
  1870. psrld xmm4,20
  1871. pxor xmm4,xmm3
  1872. paddd xmm0,xmm4
  1873. pxor xmm12,xmm0
  1874. pshufb xmm12,XMMWORD[$L$rol8]
  1875. paddd xmm8,xmm12
  1876. pxor xmm4,xmm8
  1877. movdqa xmm3,xmm4
  1878. pslld xmm3,7
  1879. psrld xmm4,25
  1880. pxor xmm4,xmm3
  1881. DB 102,15,58,15,228,12
  1882. DB 102,69,15,58,15,192,8
  1883. DB 102,69,15,58,15,228,4
  1884. paddd xmm1,xmm5
  1885. pxor xmm13,xmm1
  1886. pshufb xmm13,XMMWORD[$L$rol16]
  1887. paddd xmm9,xmm13
  1888. pxor xmm5,xmm9
  1889. movdqa xmm3,xmm5
  1890. pslld xmm3,12
  1891. psrld xmm5,20
  1892. pxor xmm5,xmm3
  1893. paddd xmm1,xmm5
  1894. pxor xmm13,xmm1
  1895. pshufb xmm13,XMMWORD[$L$rol8]
  1896. paddd xmm9,xmm13
  1897. pxor xmm5,xmm9
  1898. movdqa xmm3,xmm5
  1899. pslld xmm3,7
  1900. psrld xmm5,25
  1901. pxor xmm5,xmm3
  1902. DB 102,15,58,15,237,12
  1903. DB 102,69,15,58,15,201,8
  1904. DB 102,69,15,58,15,237,4
  1905. paddd xmm2,xmm6
  1906. pxor xmm14,xmm2
  1907. pshufb xmm14,XMMWORD[$L$rol16]
  1908. paddd xmm10,xmm14
  1909. pxor xmm6,xmm10
  1910. movdqa xmm3,xmm6
  1911. pslld xmm3,12
  1912. psrld xmm6,20
  1913. pxor xmm6,xmm3
  1914. paddd xmm2,xmm6
  1915. pxor xmm14,xmm2
  1916. pshufb xmm14,XMMWORD[$L$rol8]
  1917. paddd xmm10,xmm14
  1918. pxor xmm6,xmm10
  1919. movdqa xmm3,xmm6
  1920. pslld xmm3,7
  1921. psrld xmm6,25
  1922. pxor xmm6,xmm3
  1923. DB 102,15,58,15,246,12
  1924. DB 102,69,15,58,15,210,8
  1925. DB 102,69,15,58,15,246,4
  1926. dec r10
  1927. jnz NEAR $L$open_sse_128_rounds
  1928. paddd xmm0,XMMWORD[$L$chacha20_consts]
  1929. paddd xmm1,XMMWORD[$L$chacha20_consts]
  1930. paddd xmm2,XMMWORD[$L$chacha20_consts]
  1931. paddd xmm4,xmm7
  1932. paddd xmm5,xmm7
  1933. paddd xmm6,xmm7
  1934. paddd xmm9,xmm11
  1935. paddd xmm10,xmm11
  1936. paddd xmm13,xmm15
  1937. paddd xmm15,XMMWORD[$L$sse_inc]
  1938. paddd xmm14,xmm15
  1939. pand xmm0,XMMWORD[$L$clamp]
  1940. movdqa XMMWORD[(160+0)+rbp],xmm0
  1941. movdqa XMMWORD[(160+16)+rbp],xmm4
  1942. mov r8,r8
  1943. call poly_hash_ad_internal
  ; Short-input open path: hash and decrypt 16 bytes per iteration.
  ; rbx = bytes remaining, rsi = input, rdi = output.
  ; Each iteration adds the input block to the accumulator r10:r11:r12, XORs
  ; it with xmm1 and stores the result, advances the accumulator (multiply and
  ; reduce), then shifts the next keystream chunk into xmm1.
  ; Once fewer than 16 bytes remain, $L$open_sse_tail_16 takes over and uses
  ; xmm1 as its keystream.
  1944. $L$open_sse_128_xor_hash:
  1945. cmp rbx,16
  1946. jb NEAR $L$open_sse_tail_16
  1947. sub rbx,16
  ; Absorb the input block as read from rsi (before the XOR); adc r12,1 adds
  ; the 2^128 bit above the 128-bit block.
  1948. add r10,QWORD[((0+0))+rsi]
  1949. adc r11,QWORD[((8+0))+rsi]
  1950. adc r12,1
  1951. movdqu xmm3,XMMWORD[rsi]
  1952. pxor xmm1,xmm3
  1953. movdqu XMMWORD[rdi],xmm1
  1954. lea rsi,[16+rsi]
  1955. lea rdi,[16+rdi]
  ; Accumulator times the 128-bit value at rbp+160 (qwords at +0 and +8).
  1956. mov rax,QWORD[((0+160+0))+rbp]
  1957. mov r15,rax
  1958. mul r10
  1959. mov r13,rax
  1960. mov r14,rdx
  1961. mov rax,QWORD[((0+160+0))+rbp]
  1962. mul r11
  1963. imul r15,r12
  1964. add r14,rax
  1965. adc r15,rdx
  1966. mov rax,QWORD[((8+160+0))+rbp]
  1967. mov r9,rax
  1968. mul r10
  1969. add r14,rax
  1970. adc rdx,0
  1971. mov r10,rdx
  1972. mov rax,QWORD[((8+160+0))+rbp]
  1973. mul r11
  1974. add r15,rax
  1975. adc rdx,0
  1976. imul r9,r12
  1977. add r15,r10
  1978. adc r9,rdx
  ; Partial reduction: r12 keeps two bits, the rest is folded back in times 5.
  1979. mov r10,r13
  1980. mov r11,r14
  1981. mov r12,r15
  1982. and r12,3
  1983. mov r13,r15
  1984. and r13,-4
  1985. mov r14,r9
  1986. shrd r15,r9,2
  1987. shr r9,2
  1988. add r15,r13
  1989. adc r9,r14
  1990. add r10,r15
  1991. adc r11,r9
  1992. adc r12,0
  ; Shift the keystream queue by one 16-byte chunk:
  ; xmm1 <- xmm5 <- xmm9 <- xmm13 <- xmm2 <- xmm6 <- xmm10 <- xmm14.
  1993. movdqa xmm1,xmm5
  1994. movdqa xmm5,xmm9
  1995. movdqa xmm9,xmm13
  1996. movdqa xmm13,xmm2
  1997. movdqa xmm2,xmm6
  1998. movdqa xmm6,xmm10
  1999. movdqa xmm10,xmm14
  2000. jmp NEAR $L$open_sse_128_xor_hash
  2001. $L$SEH_end_GFp_chacha20_poly1305_open:
  2002. global GFp_chacha20_poly1305_seal
  2003. ALIGN 64
  2004. GFp_chacha20_poly1305_seal:
  2005. mov QWORD[8+rsp],rdi ;WIN64 prologue
  2006. mov QWORD[16+rsp],rsi
  2007. mov rax,rsp
  2008. $L$SEH_begin_GFp_chacha20_poly1305_seal:
  2009. mov rdi,rcx
  2010. mov rsi,rdx
  2011. mov rdx,r8
  2012. mov rcx,r9
  2013. mov r8,QWORD[40+rsp]
  2014. mov r9,QWORD[48+rsp]
  2015. push rbp
  2016. push rbx
  2017. push r12
  2018. push r13
  2019. push r14
  2020. push r15
  2021. push r9
  2022. sub rsp,288 + 160 + 32
  2023. lea rbp,[32+rsp]
  2024. and rbp,-32
  2025. movaps XMMWORD[(0+0)+rbp],xmm6
  2026. movaps XMMWORD[(16+0)+rbp],xmm7
  2027. movaps XMMWORD[(32+0)+rbp],xmm8
  2028. movaps XMMWORD[(48+0)+rbp],xmm9
  2029. movaps XMMWORD[(64+0)+rbp],xmm10
  2030. movaps XMMWORD[(80+0)+rbp],xmm11
  2031. movaps XMMWORD[(96+0)+rbp],xmm12
  2032. movaps XMMWORD[(112+0)+rbp],xmm13
  2033. movaps XMMWORD[(128+0)+rbp],xmm14
  2034. movaps XMMWORD[(144+0)+rbp],xmm15
  2035. mov rbx,QWORD[56+r9]
  2036. add rbx,rdx
  2037. mov QWORD[((0+160+32))+rbp],r8
  2038. mov QWORD[((8+160+32))+rbp],rbx
  2039. mov rbx,rdx
  2040. mov eax,DWORD[((GFp_ia32cap_P+8))]
  2041. and eax,288
  2042. xor eax,288
  2043. jz NEAR chacha20_poly1305_seal_avx2
  2044. cmp rbx,128
  2045. jbe NEAR $L$seal_sse_128
  2046. movdqa xmm0,XMMWORD[$L$chacha20_consts]
  2047. movdqu xmm4,XMMWORD[r9]
  2048. movdqu xmm8,XMMWORD[16+r9]
  2049. movdqu xmm12,XMMWORD[32+r9]
  2050. movdqa xmm1,xmm0
  2051. movdqa xmm2,xmm0
  2052. movdqa xmm3,xmm0
  2053. movdqa xmm5,xmm4
  2054. movdqa xmm6,xmm4
  2055. movdqa xmm7,xmm4
  2056. movdqa xmm9,xmm8
  2057. movdqa xmm10,xmm8
  2058. movdqa xmm11,xmm8
  2059. movdqa xmm15,xmm12
  2060. paddd xmm12,XMMWORD[$L$sse_inc]
  2061. movdqa xmm14,xmm12
  2062. paddd xmm12,XMMWORD[$L$sse_inc]
  2063. movdqa xmm13,xmm12
  2064. paddd xmm12,XMMWORD[$L$sse_inc]
  2065. movdqa XMMWORD[(160+48)+rbp],xmm4
  2066. movdqa XMMWORD[(160+64)+rbp],xmm8
  2067. movdqa XMMWORD[(160+96)+rbp],xmm12
  2068. movdqa XMMWORD[(160+112)+rbp],xmm13
  2069. movdqa XMMWORD[(160+128)+rbp],xmm14
  2070. movdqa XMMWORD[(160+144)+rbp],xmm15
  2071. mov r10,10
  2072. $L$seal_sse_init_rounds:
  2073. movdqa XMMWORD[(160+80)+rbp],xmm8
  2074. movdqa xmm8,XMMWORD[$L$rol16]
  2075. paddd xmm3,xmm7
  2076. paddd xmm2,xmm6
  2077. paddd xmm1,xmm5
  2078. paddd xmm0,xmm4
  2079. pxor xmm15,xmm3
  2080. pxor xmm14,xmm2
  2081. pxor xmm13,xmm1
  2082. pxor xmm12,xmm0
  2083. DB 102,69,15,56,0,248
  2084. DB 102,69,15,56,0,240
  2085. DB 102,69,15,56,0,232
  2086. DB 102,69,15,56,0,224
  2087. movdqa xmm8,XMMWORD[((160+80))+rbp]
  2088. paddd xmm11,xmm15
  2089. paddd xmm10,xmm14
  2090. paddd xmm9,xmm13
  2091. paddd xmm8,xmm12
  2092. pxor xmm7,xmm11
  2093. pxor xmm6,xmm10
  2094. pxor xmm5,xmm9
  2095. pxor xmm4,xmm8
  2096. movdqa XMMWORD[(160+80)+rbp],xmm8
  2097. movdqa xmm8,xmm7
  2098. psrld xmm8,20
  2099. pslld xmm7,32-20
  2100. pxor xmm7,xmm8
  2101. movdqa xmm8,xmm6
  2102. psrld xmm8,20
  2103. pslld xmm6,32-20
  2104. pxor xmm6,xmm8
  2105. movdqa xmm8,xmm5
  2106. psrld xmm8,20
  2107. pslld xmm5,32-20
  2108. pxor xmm5,xmm8
  2109. movdqa xmm8,xmm4
  2110. psrld xmm8,20
  2111. pslld xmm4,32-20
  2112. pxor xmm4,xmm8
  2113. movdqa xmm8,XMMWORD[$L$rol8]
  2114. paddd xmm3,xmm7
  2115. paddd xmm2,xmm6
  2116. paddd xmm1,xmm5
  2117. paddd xmm0,xmm4
  2118. pxor xmm15,xmm3
  2119. pxor xmm14,xmm2
  2120. pxor xmm13,xmm1
  2121. pxor xmm12,xmm0
  2122. DB 102,69,15,56,0,248
  2123. DB 102,69,15,56,0,240
  2124. DB 102,69,15,56,0,232
  2125. DB 102,69,15,56,0,224
  2126. movdqa xmm8,XMMWORD[((160+80))+rbp]
  2127. paddd xmm11,xmm15
  2128. paddd xmm10,xmm14
  2129. paddd xmm9,xmm13
  2130. paddd xmm8,xmm12
  2131. pxor xmm7,xmm11
  2132. pxor xmm6,xmm10
  2133. pxor xmm5,xmm9
  2134. pxor xmm4,xmm8
  2135. movdqa XMMWORD[(160+80)+rbp],xmm8
  2136. movdqa xmm8,xmm7
  2137. psrld xmm8,25
  2138. pslld xmm7,32-25
  2139. pxor xmm7,xmm8
  2140. movdqa xmm8,xmm6
  2141. psrld xmm8,25
  2142. pslld xmm6,32-25
  2143. pxor xmm6,xmm8
  2144. movdqa xmm8,xmm5
  2145. psrld xmm8,25
  2146. pslld xmm5,32-25
  2147. pxor xmm5,xmm8
  2148. movdqa xmm8,xmm4
  2149. psrld xmm8,25
  2150. pslld xmm4,32-25
  2151. pxor xmm4,xmm8
  2152. movdqa xmm8,XMMWORD[((160+80))+rbp]
  2153. DB 102,15,58,15,255,4
  2154. DB 102,69,15,58,15,219,8
  2155. DB 102,69,15,58,15,255,12
  2156. DB 102,15,58,15,246,4
  2157. DB 102,69,15,58,15,210,8
  2158. DB 102,69,15,58,15,246,12
  2159. DB 102,15,58,15,237,4
  2160. DB 102,69,15,58,15,201,8
  2161. DB 102,69,15,58,15,237,12
  2162. DB 102,15,58,15,228,4
  2163. DB 102,69,15,58,15,192,8
  2164. DB 102,69,15,58,15,228,12
  2165. movdqa XMMWORD[(160+80)+rbp],xmm8
  2166. movdqa xmm8,XMMWORD[$L$rol16]
  2167. paddd xmm3,xmm7
  2168. paddd xmm2,xmm6
  2169. paddd xmm1,xmm5
  2170. paddd xmm0,xmm4
  2171. pxor xmm15,xmm3
  2172. pxor xmm14,xmm2
  2173. pxor xmm13,xmm1
  2174. pxor xmm12,xmm0
  2175. DB 102,69,15,56,0,248
  2176. DB 102,69,15,56,0,240
  2177. DB 102,69,15,56,0,232
  2178. DB 102,69,15,56,0,224
  2179. movdqa xmm8,XMMWORD[((160+80))+rbp]
  2180. paddd xmm11,xmm15
  2181. paddd xmm10,xmm14
  2182. paddd xmm9,xmm13
  2183. paddd xmm8,xmm12
  2184. pxor xmm7,xmm11
  2185. pxor xmm6,xmm10
  2186. pxor xmm5,xmm9
  2187. pxor xmm4,xmm8
  2188. movdqa XMMWORD[(160+80)+rbp],xmm8
  2189. movdqa xmm8,xmm7
  2190. psrld xmm8,20
  2191. pslld xmm7,32-20
  2192. pxor xmm7,xmm8
  2193. movdqa xmm8,xmm6
  2194. psrld xmm8,20
  2195. pslld xmm6,32-20
  2196. pxor xmm6,xmm8
  2197. movdqa xmm8,xmm5
  2198. psrld xmm8,20
  2199. pslld xmm5,32-20
  2200. pxor xmm5,xmm8
  2201. movdqa xmm8,xmm4
  2202. psrld xmm8,20
  2203. pslld xmm4,32-20
  2204. pxor xmm4,xmm8
  2205. movdqa xmm8,XMMWORD[$L$rol8]
  2206. paddd xmm3,xmm7
  2207. paddd xmm2,xmm6
  2208. paddd xmm1,xmm5
  2209. paddd xmm0,xmm4
  2210. pxor xmm15,xmm3
  2211. pxor xmm14,xmm2
  2212. pxor xmm13,xmm1
  2213. pxor xmm12,xmm0
  2214. DB 102,69,15,56,0,248
  2215. DB 102,69,15,56,0,240
  2216. DB 102,69,15,56,0,232
  2217. DB 102,69,15,56,0,224
  2218. movdqa xmm8,XMMWORD[((160+80))+rbp]
  2219. paddd xmm11,xmm15
  2220. paddd xmm10,xmm14
  2221. paddd xmm9,xmm13
  2222. paddd xmm8,xmm12
  2223. pxor xmm7,xmm11
  2224. pxor xmm6,xmm10
  2225. pxor xmm5,xmm9
  2226. pxor xmm4,xmm8
  2227. movdqa XMMWORD[(160+80)+rbp],xmm8
  2228. movdqa xmm8,xmm7
  2229. psrld xmm8,25
  2230. pslld xmm7,32-25
  2231. pxor xmm7,xmm8
  2232. movdqa xmm8,xmm6
  2233. psrld xmm8,25
  2234. pslld xmm6,32-25
  2235. pxor xmm6,xmm8
  2236. movdqa xmm8,xmm5
  2237. psrld xmm8,25
  2238. pslld xmm5,32-25
  2239. pxor xmm5,xmm8
  2240. movdqa xmm8,xmm4
  2241. psrld xmm8,25
  2242. pslld xmm4,32-25
  2243. pxor xmm4,xmm8
  2244. movdqa xmm8,XMMWORD[((160+80))+rbp]
  2245. DB 102,15,58,15,255,12
  2246. DB 102,69,15,58,15,219,8
  2247. DB 102,69,15,58,15,255,4
  2248. DB 102,15,58,15,246,12
  2249. DB 102,69,15,58,15,210,8
  2250. DB 102,69,15,58,15,246,4
  2251. DB 102,15,58,15,237,12
  2252. DB 102,69,15,58,15,201,8
  2253. DB 102,69,15,58,15,237,4
  2254. DB 102,15,58,15,228,12
  2255. DB 102,69,15,58,15,192,8
  2256. DB 102,69,15,58,15,228,4
  2257. dec r10
  2258. jnz NEAR $L$seal_sse_init_rounds
  2259. paddd xmm3,XMMWORD[$L$chacha20_consts]
  2260. paddd xmm7,XMMWORD[((160+48))+rbp]
  2261. paddd xmm11,XMMWORD[((160+64))+rbp]
  2262. paddd xmm15,XMMWORD[((160+144))+rbp]
  2263. paddd xmm2,XMMWORD[$L$chacha20_consts]
  2264. paddd xmm6,XMMWORD[((160+48))+rbp]
  2265. paddd xmm10,XMMWORD[((160+64))+rbp]
  2266. paddd xmm14,XMMWORD[((160+128))+rbp]
  2267. paddd xmm1,XMMWORD[$L$chacha20_consts]
  2268. paddd xmm5,XMMWORD[((160+48))+rbp]
  2269. paddd xmm9,XMMWORD[((160+64))+rbp]
  2270. paddd xmm13,XMMWORD[((160+112))+rbp]
  2271. paddd xmm0,XMMWORD[$L$chacha20_consts]
  2272. paddd xmm4,XMMWORD[((160+48))+rbp]
  2273. paddd xmm8,XMMWORD[((160+64))+rbp]
  2274. paddd xmm12,XMMWORD[((160+96))+rbp]
  2275. pand xmm3,XMMWORD[$L$clamp]
  2276. movdqa XMMWORD[(160+0)+rbp],xmm3
  2277. movdqa XMMWORD[(160+16)+rbp],xmm7
  2278. mov r8,r8
  2279. call poly_hash_ad_internal
  2280. movdqu xmm3,XMMWORD[((0 + 0))+rsi]
  2281. movdqu xmm7,XMMWORD[((16 + 0))+rsi]
  2282. movdqu xmm11,XMMWORD[((32 + 0))+rsi]
  2283. movdqu xmm15,XMMWORD[((48 + 0))+rsi]
  2284. pxor xmm2,xmm3
  2285. pxor xmm6,xmm7
  2286. pxor xmm10,xmm11
  2287. pxor xmm15,xmm14
  2288. movdqu XMMWORD[(0 + 0)+rdi],xmm2
  2289. movdqu XMMWORD[(16 + 0)+rdi],xmm6
  2290. movdqu XMMWORD[(32 + 0)+rdi],xmm10
  2291. movdqu XMMWORD[(48 + 0)+rdi],xmm15
  2292. movdqu xmm3,XMMWORD[((0 + 64))+rsi]
  2293. movdqu xmm7,XMMWORD[((16 + 64))+rsi]
  2294. movdqu xmm11,XMMWORD[((32 + 64))+rsi]
  2295. movdqu xmm15,XMMWORD[((48 + 64))+rsi]
  2296. pxor xmm1,xmm3
  2297. pxor xmm5,xmm7
  2298. pxor xmm9,xmm11
  2299. pxor xmm15,xmm13
  2300. movdqu XMMWORD[(0 + 64)+rdi],xmm1
  2301. movdqu XMMWORD[(16 + 64)+rdi],xmm5
  2302. movdqu XMMWORD[(32 + 64)+rdi],xmm9
  2303. movdqu XMMWORD[(48 + 64)+rdi],xmm15
  2304. cmp rbx,12*16
  2305. ja NEAR $L$seal_sse_main_init
  2306. mov rcx,8*16
  2307. sub rbx,8*16
  2308. lea rsi,[128+rsi]
  2309. jmp NEAR $L$seal_sse_128_tail_hash
  ; Seal path, more than 192 bytes of input: encrypt the third 64-byte block
  ; (offset 128) with xmm0/xmm4/xmm8/xmm12, then choose how to continue based
  ; on how much input is left.
  2310. $L$seal_sse_main_init:
  2311. movdqu xmm3,XMMWORD[((0 + 128))+rsi]
  2312. movdqu xmm7,XMMWORD[((16 + 128))+rsi]
  2313. movdqu xmm11,XMMWORD[((32 + 128))+rsi]
  2314. movdqu xmm15,XMMWORD[((48 + 128))+rsi]
  2315. pxor xmm0,xmm3
  2316. pxor xmm4,xmm7
  2317. pxor xmm8,xmm11
  2318. pxor xmm15,xmm12
  2319. movdqu XMMWORD[(0 + 128)+rdi],xmm0
  2320. movdqu XMMWORD[(16 + 128)+rdi],xmm4
  2321. movdqu XMMWORD[(32 + 128)+rdi],xmm8
  2322. movdqu XMMWORD[(48 + 128)+rdi],xmm15
  ; NOTE(review): this value of rcx is overwritten by "mov rcx,2" three
  ; instructions later without being read in between.
  2323. mov rcx,12*16
  ; 192 input bytes consumed. Only rsi is advanced here; rdi still points at
  ; the start of the output just written, which the main rounds code reads
  ; back (add r10,QWORD[rdi]).
  2324. sub rbx,12*16
  2325. lea rsi,[192+rsi]
  ; NOTE(review): rcx and r8 look like loop counters for the code that follows;
  ; their consumers are outside this section.
  2326. mov rcx,2
  2327. mov r8,8
  ; Dispatch on the remaining length: up to 64, 128 or 192 bytes go to the
  ; matching tail routine; anything larger falls through to the main loop.
  2328. cmp rbx,4*16
  2329. jbe NEAR $L$seal_sse_tail_64
  2330. cmp rbx,8*16
  2331. jbe NEAR $L$seal_sse_tail_128
  2332. cmp rbx,12*16
  2333. jbe NEAR $L$seal_sse_tail_192
  ; Prepare four ChaCha20 block states for one main-loop iteration.
  ; Row A (constants) goes to xmm0..xmm3, row B (rbp+160+48) to xmm4..xmm7 and
  ; row C (rbp+160+64) to xmm8..xmm11.
  2334. $L$seal_sse_main_loop:
  2335. movdqa xmm0,XMMWORD[$L$chacha20_consts]
  2336. movdqa xmm4,XMMWORD[((160+48))+rbp]
  2337. movdqa xmm8,XMMWORD[((160+64))+rbp]
  2338. movdqa xmm1,xmm0
  2339. movdqa xmm5,xmm4
  2340. movdqa xmm9,xmm8
  2341. movdqa xmm2,xmm0
  2342. movdqa xmm6,xmm4
  2343. movdqa xmm10,xmm8
  2344. movdqa xmm3,xmm0
  2345. movdqa xmm7,xmm4
  2346. movdqa xmm11,xmm8
  ; Row D: start from the last counter row saved at rbp+160+96 and add
  ; $L$sse_inc (1 in the lowest dword) once per block, so xmm15, xmm14, xmm13
  ; and xmm12 hold four consecutive counter values (xmm15 lowest).
  2347. movdqa xmm15,XMMWORD[((160+96))+rbp]
  2348. paddd xmm15,XMMWORD[$L$sse_inc]
  2349. movdqa xmm14,xmm15
  2350. paddd xmm14,XMMWORD[$L$sse_inc]
  2351. movdqa xmm13,xmm14
  2352. paddd xmm13,XMMWORD[$L$sse_inc]
  2353. movdqa xmm12,xmm13
  2354. paddd xmm12,XMMWORD[$L$sse_inc]
  ; Save the initial D rows; they are added back to the state after the
  ; rounds. The slot at +96 now holds the highest counter used so far.
  2355. movdqa XMMWORD[(160+96)+rbp],xmm12
  2356. movdqa XMMWORD[(160+112)+rbp],xmm13
  2357. movdqa XMMWORD[(160+128)+rbp],xmm14
  2358. movdqa XMMWORD[(160+144)+rbp],xmm15
  2359. ALIGN 32
  ; One pass = one ChaCha20 double round (column round, then diagonal round)
  ; applied to four block states in parallel, interleaved with one Poly1305
  ; block update that reads 16 bytes at rdi.
  ;
  ; Registers: xmm0..3 = row A, xmm4..7 = row B, xmm8..11 = row C,
  ; xmm12..15 = row D. All 16 XMM registers are in use, so xmm8 doubles as the
  ; temporary and is spilled to the scratch slot at rbp+160+80 when needed.
  ; r10:r11:r12 is the Poly1305 accumulator; the two 64-bit multiplier words
  ; are read from rbp+160+0 and rbp+160+8; r13, r14, r15 and r9 collect the
  ; product; rax and rdx are clobbered by mul.
  ;
  ; Loop control: r8 counts passes that hash one block each; after r8 goes
  ; negative every further pass is followed by one extra hashed block, counted
  ; by rcx. With rcx=2, r8=8 this is 10 passes and 12 hashed blocks; with
  ; rcx=6, r8=4 it is 10 passes and 16 hashed blocks.
  2360. $L$seal_sse_main_rounds:
  ; Column round, first half-step: a += b; d ^= a; d = rotl(d,16).
  2361. movdqa XMMWORD[(160+80)+rbp],xmm8
  2362. movdqa xmm8,XMMWORD[$L$rol16]
  2363. paddd xmm3,xmm7
  2364. paddd xmm2,xmm6
  2365. paddd xmm1,xmm5
  2366. paddd xmm0,xmm4
  2367. pxor xmm15,xmm3
  2368. pxor xmm14,xmm2
  2369. pxor xmm13,xmm1
  2370. pxor xmm12,xmm0
  ; Hand-encoded pshufb xmm15,xmm8 / xmm14,xmm8 / xmm13,xmm8 / xmm12,xmm8.
  2371. DB 102,69,15,56,0,248
  2372. DB 102,69,15,56,0,240
  2373. DB 102,69,15,56,0,232
  2374. DB 102,69,15,56,0,224
  ; c += d; b ^= c.
  2375. movdqa xmm8,XMMWORD[((160+80))+rbp]
  2376. paddd xmm11,xmm15
  2377. paddd xmm10,xmm14
  2378. paddd xmm9,xmm13
  2379. paddd xmm8,xmm12
  2380. pxor xmm7,xmm11
  ; Poly1305: accumulator += 16 bytes at rdi, plus a 1 in the bit above them.
  2381. add r10,QWORD[((0+0))+rdi]
  2382. adc r11,QWORD[((8+0))+rdi]
  2383. adc r12,1
  2384. pxor xmm6,xmm10
  2385. pxor xmm5,xmm9
  2386. pxor xmm4,xmm8
  ; b = rotl(b,12), built from a right shift by 20 and a left shift by 12.
  2387. movdqa XMMWORD[(160+80)+rbp],xmm8
  2388. movdqa xmm8,xmm7
  2389. psrld xmm8,20
  2390. pslld xmm7,32-20
  2391. pxor xmm7,xmm8
  2392. movdqa xmm8,xmm6
  2393. psrld xmm8,20
  2394. pslld xmm6,32-20
  2395. pxor xmm6,xmm8
  2396. movdqa xmm8,xmm5
  2397. psrld xmm8,20
  2398. pslld xmm5,32-20
  2399. pxor xmm5,xmm8
  2400. movdqa xmm8,xmm4
  2401. psrld xmm8,20
  2402. pslld xmm4,32-20
  2403. pxor xmm4,xmm8
  ; Poly1305 multiply, part 1: products with the low multiplier word.
  2404. mov rax,QWORD[((0+160+0))+rbp]
  2405. mov r15,rax
  2406. mul r10
  2407. mov r13,rax
  2408. mov r14,rdx
  2409. mov rax,QWORD[((0+160+0))+rbp]
  2410. mul r11
  2411. imul r15,r12
  2412. add r14,rax
  2413. adc r15,rdx
  ; Column round, second half-step: a += b; d ^= a; d = rotl(d,8).
  2414. movdqa xmm8,XMMWORD[$L$rol8]
  2415. paddd xmm3,xmm7
  2416. paddd xmm2,xmm6
  2417. paddd xmm1,xmm5
  2418. paddd xmm0,xmm4
  2419. pxor xmm15,xmm3
  2420. pxor xmm14,xmm2
  2421. pxor xmm13,xmm1
  2422. pxor xmm12,xmm0
  ; pshufb xmm15..xmm12,xmm8 (hand-encoded).
  2423. DB 102,69,15,56,0,248
  2424. DB 102,69,15,56,0,240
  2425. DB 102,69,15,56,0,232
  2426. DB 102,69,15,56,0,224
  2427. movdqa xmm8,XMMWORD[((160+80))+rbp]
  2428. paddd xmm11,xmm15
  2429. paddd xmm10,xmm14
  2430. paddd xmm9,xmm13
  2431. paddd xmm8,xmm12
  2432. pxor xmm7,xmm11
  2433. pxor xmm6,xmm10
  ; Poly1305 multiply, part 2: products with the high multiplier word.
  2434. mov rax,QWORD[((8+160+0))+rbp]
  2435. mov r9,rax
  2436. mul r10
  2437. add r14,rax
  2438. adc rdx,0
  2439. mov r10,rdx
  2440. mov rax,QWORD[((8+160+0))+rbp]
  2441. mul r11
  2442. add r15,rax
  2443. adc rdx,0
  2444. pxor xmm5,xmm9
  2445. pxor xmm4,xmm8
  ; b = rotl(b,7), built from a right shift by 25 and a left shift by 7.
  2446. movdqa XMMWORD[(160+80)+rbp],xmm8
  2447. movdqa xmm8,xmm7
  2448. psrld xmm8,25
  2449. pslld xmm7,32-25
  2450. pxor xmm7,xmm8
  2451. movdqa xmm8,xmm6
  2452. psrld xmm8,25
  2453. pslld xmm6,32-25
  2454. pxor xmm6,xmm8
  2455. movdqa xmm8,xmm5
  2456. psrld xmm8,25
  2457. pslld xmm5,32-25
  2458. pxor xmm5,xmm8
  2459. movdqa xmm8,xmm4
  2460. psrld xmm8,25
  2461. pslld xmm4,32-25
  2462. pxor xmm4,xmm8
  2463. movdqa xmm8,XMMWORD[((160+80))+rbp]
  ; Poly1305 multiply, part 3: fold in the top accumulator limb.
  2464. imul r9,r12
  2465. add r15,r10
  2466. adc r9,rdx
  ; Hand-encoded palignr reg,reg,imm8: rotate rows B, C and D of each block by
  ; 4, 8 and 12 bytes so the next quarter rounds act on the diagonals.
  ; Order: (xmm7,xmm11,xmm15), (xmm6,xmm10,xmm14), (xmm5,xmm9,xmm13),
  ; (xmm4,xmm8,xmm12).
  2467. DB 102,15,58,15,255,4
  2468. DB 102,69,15,58,15,219,8
  2469. DB 102,69,15,58,15,255,12
  2470. DB 102,15,58,15,246,4
  2471. DB 102,69,15,58,15,210,8
  2472. DB 102,69,15,58,15,246,12
  2473. DB 102,15,58,15,237,4
  2474. DB 102,69,15,58,15,201,8
  2475. DB 102,69,15,58,15,237,12
  2476. DB 102,15,58,15,228,4
  2477. DB 102,69,15,58,15,192,8
  2478. DB 102,69,15,58,15,228,12
  ; Diagonal round, first half-step: a += b; d ^= a; d = rotl(d,16).
  2479. movdqa XMMWORD[(160+80)+rbp],xmm8
  2480. movdqa xmm8,XMMWORD[$L$rol16]
  2481. paddd xmm3,xmm7
  2482. paddd xmm2,xmm6
  2483. paddd xmm1,xmm5
  2484. paddd xmm0,xmm4
  2485. pxor xmm15,xmm3
  2486. pxor xmm14,xmm2
  ; Poly1305 reduction: the low 128 bits of the product (r13:r14) and the low
  ; two bits of r15 become the new accumulator; the remaining high part H is
  ; added back as (H with its low two bits cleared) + (H shifted right by 2),
  ; i.e. 5*(H/4), which is the reduction modulo 2^130-5.
  2487. mov r10,r13
  2488. mov r11,r14
  2489. mov r12,r15
  2490. and r12,3
  2491. mov r13,r15
  2492. and r13,-4
  2493. mov r14,r9
  2494. shrd r15,r9,2
  2495. shr r9,2
  2496. add r15,r13
  2497. adc r9,r14
  2498. add r10,r15
  2499. adc r11,r9
  2500. adc r12,0
  2501. pxor xmm13,xmm1
  2502. pxor xmm12,xmm0
  ; pshufb xmm15..xmm12,xmm8 (hand-encoded).
  2503. DB 102,69,15,56,0,248
  2504. DB 102,69,15,56,0,240
  2505. DB 102,69,15,56,0,232
  2506. DB 102,69,15,56,0,224
  2507. movdqa xmm8,XMMWORD[((160+80))+rbp]
  2508. paddd xmm11,xmm15
  2509. paddd xmm10,xmm14
  2510. paddd xmm9,xmm13
  2511. paddd xmm8,xmm12
  2512. pxor xmm7,xmm11
  2513. pxor xmm6,xmm10
  2514. pxor xmm5,xmm9
  2515. pxor xmm4,xmm8
  ; b = rotl(b,12).
  2516. movdqa XMMWORD[(160+80)+rbp],xmm8
  2517. movdqa xmm8,xmm7
  2518. psrld xmm8,20
  2519. pslld xmm7,32-20
  2520. pxor xmm7,xmm8
  2521. movdqa xmm8,xmm6
  2522. psrld xmm8,20
  2523. pslld xmm6,32-20
  2524. pxor xmm6,xmm8
  2525. movdqa xmm8,xmm5
  2526. psrld xmm8,20
  2527. pslld xmm5,32-20
  2528. pxor xmm5,xmm8
  2529. movdqa xmm8,xmm4
  2530. psrld xmm8,20
  2531. pslld xmm4,32-20
  2532. pxor xmm4,xmm8
  ; Diagonal round, second half-step: a += b; d ^= a; d = rotl(d,8).
  2533. movdqa xmm8,XMMWORD[$L$rol8]
  2534. paddd xmm3,xmm7
  2535. paddd xmm2,xmm6
  2536. paddd xmm1,xmm5
  2537. paddd xmm0,xmm4
  2538. pxor xmm15,xmm3
  2539. pxor xmm14,xmm2
  2540. pxor xmm13,xmm1
  2541. pxor xmm12,xmm0
  ; pshufb xmm15..xmm12,xmm8 (hand-encoded).
  2542. DB 102,69,15,56,0,248
  2543. DB 102,69,15,56,0,240
  2544. DB 102,69,15,56,0,232
  2545. DB 102,69,15,56,0,224
  2546. movdqa xmm8,XMMWORD[((160+80))+rbp]
  2547. paddd xmm11,xmm15
  2548. paddd xmm10,xmm14
  2549. paddd xmm9,xmm13
  2550. paddd xmm8,xmm12
  2551. pxor xmm7,xmm11
  2552. pxor xmm6,xmm10
  2553. pxor xmm5,xmm9
  2554. pxor xmm4,xmm8
  ; b = rotl(b,7).
  2555. movdqa XMMWORD[(160+80)+rbp],xmm8
  2556. movdqa xmm8,xmm7
  2557. psrld xmm8,25
  2558. pslld xmm7,32-25
  2559. pxor xmm7,xmm8
  2560. movdqa xmm8,xmm6
  2561. psrld xmm8,25
  2562. pslld xmm6,32-25
  2563. pxor xmm6,xmm8
  2564. movdqa xmm8,xmm5
  2565. psrld xmm8,25
  2566. pslld xmm5,32-25
  2567. pxor xmm5,xmm8
  2568. movdqa xmm8,xmm4
  2569. psrld xmm8,25
  2570. pslld xmm4,32-25
  2571. pxor xmm4,xmm8
  2572. movdqa xmm8,XMMWORD[((160+80))+rbp]
  ; Hand-encoded palignr with the opposite amounts (12, 8, 4): undo the
  ; diagonal rotation of rows B, C and D.
  2573. DB 102,15,58,15,255,12
  2574. DB 102,69,15,58,15,219,8
  2575. DB 102,69,15,58,15,255,4
  2576. DB 102,15,58,15,246,12
  2577. DB 102,69,15,58,15,210,8
  2578. DB 102,69,15,58,15,246,4
  2579. DB 102,15,58,15,237,12
  2580. DB 102,69,15,58,15,201,8
  2581. DB 102,69,15,58,15,237,4
  2582. DB 102,15,58,15,228,12
  2583. DB 102,69,15,58,15,192,8
  2584. DB 102,69,15,58,15,228,4
  ; The block hashed during this pass is done: advance rdi and repeat while
  ; r8 has not gone negative.
  2585. lea rdi,[16+rdi]
  2586. dec r8
  2587. jge NEAR $L$seal_sse_main_rounds
  ; Extra Poly1305 block (add, multiply, reduce) with no ChaCha work
  ; interleaved; executed once per remaining pass counted by rcx.
  2588. add r10,QWORD[((0+0))+rdi]
  2589. adc r11,QWORD[((8+0))+rdi]
  2590. adc r12,1
  2591. mov rax,QWORD[((0+160+0))+rbp]
  2592. mov r15,rax
  2593. mul r10
  2594. mov r13,rax
  2595. mov r14,rdx
  2596. mov rax,QWORD[((0+160+0))+rbp]
  2597. mul r11
  2598. imul r15,r12
  2599. add r14,rax
  2600. adc r15,rdx
  2601. mov rax,QWORD[((8+160+0))+rbp]
  2602. mov r9,rax
  2603. mul r10
  2604. add r14,rax
  2605. adc rdx,0
  2606. mov r10,rdx
  2607. mov rax,QWORD[((8+160+0))+rbp]
  2608. mul r11
  2609. add r15,rax
  2610. adc rdx,0
  2611. imul r9,r12
  2612. add r15,r10
  2613. adc r9,rdx
  2614. mov r10,r13
  2615. mov r11,r14
  2616. mov r12,r15
  2617. and r12,3
  2618. mov r13,r15
  2619. and r13,-4
  2620. mov r14,r9
  2621. shrd r15,r9,2
  2622. shr r9,2
  2623. add r15,r13
  2624. adc r9,r14
  2625. add r10,r15
  2626. adc r11,r9
  2627. adc r12,0
  2628. lea rdi,[16+rdi]
  2629. dec rcx
  2630. jg NEAR $L$seal_sse_main_rounds
  ; Rounds finished: add the initial state back into each block (constants,
  ; rows B and C from rbp+160+48/+64, and the per-block D rows saved at
  ; +144, +128, +112 and +96).
  2631. paddd xmm3,XMMWORD[$L$chacha20_consts]
  2632. paddd xmm7,XMMWORD[((160+48))+rbp]
  2633. paddd xmm11,XMMWORD[((160+64))+rbp]
  2634. paddd xmm15,XMMWORD[((160+144))+rbp]
  2635. paddd xmm2,XMMWORD[$L$chacha20_consts]
  2636. paddd xmm6,XMMWORD[((160+48))+rbp]
  2637. paddd xmm10,XMMWORD[((160+64))+rbp]
  2638. paddd xmm14,XMMWORD[((160+128))+rbp]
  2639. paddd xmm1,XMMWORD[$L$chacha20_consts]
  2640. paddd xmm5,XMMWORD[((160+48))+rbp]
  2641. paddd xmm9,XMMWORD[((160+64))+rbp]
  2642. paddd xmm13,XMMWORD[((160+112))+rbp]
  2643. paddd xmm0,XMMWORD[$L$chacha20_consts]
  2644. paddd xmm4,XMMWORD[((160+48))+rbp]
  2645. paddd xmm8,XMMWORD[((160+64))+rbp]
  2646. paddd xmm12,XMMWORD[((160+96))+rbp]
  ; No XMM register is free, so xmm14 is parked in the scratch slot and reused
  ; as the load temporary for the first 64 bytes. The store is emitted twice
  ; with identical operands; the second one has no additional effect.
  2647. movdqa XMMWORD[(160+80)+rbp],xmm14
  2648. movdqa XMMWORD[(160+80)+rbp],xmm14
  ; Output bytes 0..63 = input bytes 0..63 XOR keystream block
  ; (xmm3,xmm7,xmm11,xmm15).
  2649. movdqu xmm14,XMMWORD[((0 + 0))+rsi]
  2650. pxor xmm14,xmm3
  2651. movdqu XMMWORD[(0 + 0)+rdi],xmm14
  2652. movdqu xmm14,XMMWORD[((16 + 0))+rsi]
  2653. pxor xmm14,xmm7
  2654. movdqu XMMWORD[(16 + 0)+rdi],xmm14
  2655. movdqu xmm14,XMMWORD[((32 + 0))+rsi]
  2656. pxor xmm14,xmm11
  2657. movdqu XMMWORD[(32 + 0)+rdi],xmm14
  2658. movdqu xmm14,XMMWORD[((48 + 0))+rsi]
  2659. pxor xmm14,xmm15
  2660. movdqu XMMWORD[(48 + 0)+rdi],xmm14
  2661. movdqa xmm14,XMMWORD[((160+80))+rbp]
  ; Bytes 64..127 with block (xmm2,xmm6,xmm10,xmm14); xmm3/xmm7/xmm11/xmm15
  ; are free now and serve as load temporaries.
  2662. movdqu xmm3,XMMWORD[((0 + 64))+rsi]
  2663. movdqu xmm7,XMMWORD[((16 + 64))+rsi]
  2664. movdqu xmm11,XMMWORD[((32 + 64))+rsi]
  2665. movdqu xmm15,XMMWORD[((48 + 64))+rsi]
  2666. pxor xmm2,xmm3
  2667. pxor xmm6,xmm7
  2668. pxor xmm10,xmm11
  2669. pxor xmm15,xmm14
  2670. movdqu XMMWORD[(0 + 64)+rdi],xmm2
  2671. movdqu XMMWORD[(16 + 64)+rdi],xmm6
  2672. movdqu XMMWORD[(32 + 64)+rdi],xmm10
  2673. movdqu XMMWORD[(48 + 64)+rdi],xmm15
  ; Bytes 128..191 with block (xmm1,xmm5,xmm9,xmm13).
  2674. movdqu xmm3,XMMWORD[((0 + 128))+rsi]
  2675. movdqu xmm7,XMMWORD[((16 + 128))+rsi]
  2676. movdqu xmm11,XMMWORD[((32 + 128))+rsi]
  2677. movdqu xmm15,XMMWORD[((48 + 128))+rsi]
  2678. pxor xmm1,xmm3
  2679. pxor xmm5,xmm7
  2680. pxor xmm9,xmm11
  2681. pxor xmm15,xmm13
  2682. movdqu XMMWORD[(0 + 128)+rdi],xmm1
  2683. movdqu XMMWORD[(16 + 128)+rdi],xmm5
  2684. movdqu XMMWORD[(32 + 128)+rdi],xmm9
  2685. movdqu XMMWORD[(48 + 128)+rdi],xmm15
  ; More than 16*16 bytes left: the fourth block is used whole as well.
  ; Otherwise account for the 192 bytes just written (rcx = bytes still to be
  ; hashed) and finish in the shared tail; the fourth block stays in
  ; xmm0/xmm4/xmm8/xmm12 for it.
  2686. cmp rbx,16*16
  2687. ja NEAR $L$seal_sse_main_loop_xor
  2688. mov rcx,12*16
  2689. sub rbx,12*16
  2690. lea rsi,[192+rsi]
  2691. jmp NEAR $L$seal_sse_128_tail_hash
  ; XOR bytes 192..255 of this iteration with the fourth keystream block
  ; (xmm0,xmm4,xmm8,xmm12), then choose what runs next.
  2692. $L$seal_sse_main_loop_xor:
  2693. movdqu xmm3,XMMWORD[((0 + 192))+rsi]
  2694. movdqu xmm7,XMMWORD[((16 + 192))+rsi]
  2695. movdqu xmm11,XMMWORD[((32 + 192))+rsi]
  2696. movdqu xmm15,XMMWORD[((48 + 192))+rsi]
  2697. pxor xmm0,xmm3
  2698. pxor xmm4,xmm7
  2699. pxor xmm8,xmm11
  2700. pxor xmm15,xmm12
  2701. movdqu XMMWORD[(0 + 192)+rdi],xmm0
  2702. movdqu XMMWORD[(16 + 192)+rdi],xmm4
  2703. movdqu XMMWORD[(32 + 192)+rdi],xmm8
  2704. movdqu XMMWORD[(48 + 192)+rdi],xmm15
  ; 256 input bytes consumed. rdi is left alone: it trails behind and is moved
  ; forward as the ciphertext is hashed.
  2705. lea rsi,[256+rsi]
  2706. sub rbx,16*16
  ; With rcx=6 and r8=4 the next round loop does 10 double rounds and hashes
  ; 16 blocks, i.e. the 256 bytes just written.
  2707. mov rcx,6
  2708. mov r8,4
  ; Note: jg is a signed comparison of the remaining length against 192.
  2709. cmp rbx,12*16
  2710. jg NEAR $L$seal_sse_main_loop
  ; At most 192 bytes left. If nothing is left, rcx = rbx = 0 is passed to the
  ; hash loop of the shared tail.
  2711. mov rcx,rbx
  2712. test rbx,rbx
  2713. je NEAR $L$seal_sse_128_tail_hash
  ; Otherwise restore rcx=6 for the tail round loops and pick the tail by
  ; size; lengths of 1..64 fall through into seal_sse_tail_64.
  2714. mov rcx,6
  2715. cmp rbx,8*16
  2716. ja NEAR $L$seal_sse_tail_192
  2717. cmp rbx,4*16
  2718. ja NEAR $L$seal_sse_tail_128
  ; Tail for at most 64 remaining bytes: generate one more keystream block in
  ; xmm0/xmm4/xmm8/xmm12 while hashing ciphertext that was already written at
  ; rdi. rcx and r8 are set by the code that jumps here. xmm3 is the scratch
  ; register for the rotations.
  2719. $L$seal_sse_tail_64:
  2720. movdqa xmm0,XMMWORD[$L$chacha20_consts]
  2721. movdqa xmm4,XMMWORD[((160+48))+rbp]
  2722. movdqa xmm8,XMMWORD[((160+64))+rbp]
  ; Next counter value; saved so it can be added back after the rounds.
  2723. movdqa xmm12,XMMWORD[((160+96))+rbp]
  2724. paddd xmm12,XMMWORD[$L$sse_inc]
  2725. movdqa XMMWORD[(160+96)+rbp],xmm12
  ; Passes entered here hash two 16-byte blocks around one double round
  ; (counted by rcx); passes entered at the x1hash label hash one block
  ; (counted by r8).
  2726. $L$seal_sse_tail_64_rounds_and_x2hash:
  ; Poly1305 block: accumulator (r10:r11:r12) += 16 bytes at rdi plus the
  ; padding bit, multiply by the words at rbp+160+0/+8, reduce mod 2^130-5.
  2727. add r10,QWORD[((0+0))+rdi]
  2728. adc r11,QWORD[((8+0))+rdi]
  2729. adc r12,1
  2730. mov rax,QWORD[((0+160+0))+rbp]
  2731. mov r15,rax
  2732. mul r10
  2733. mov r13,rax
  2734. mov r14,rdx
  2735. mov rax,QWORD[((0+160+0))+rbp]
  2736. mul r11
  2737. imul r15,r12
  2738. add r14,rax
  2739. adc r15,rdx
  2740. mov rax,QWORD[((8+160+0))+rbp]
  2741. mov r9,rax
  2742. mul r10
  2743. add r14,rax
  2744. adc rdx,0
  2745. mov r10,rdx
  2746. mov rax,QWORD[((8+160+0))+rbp]
  2747. mul r11
  2748. add r15,rax
  2749. adc rdx,0
  2750. imul r9,r12
  2751. add r15,r10
  2752. adc r9,rdx
  2753. mov r10,r13
  2754. mov r11,r14
  2755. mov r12,r15
  2756. and r12,3
  2757. mov r13,r15
  2758. and r13,-4
  2759. mov r14,r9
  2760. shrd r15,r9,2
  2761. shr r9,2
  2762. add r15,r13
  2763. adc r9,r14
  2764. add r10,r15
  2765. adc r11,r9
  2766. adc r12,0
  2767. lea rdi,[16+rdi]
  2768. $L$seal_sse_tail_64_rounds_and_x1hash:
  ; Column quarter rounds: rotations by 16 and 8 via pshufb masks, by 12 and 7
  ; via shift pairs combined with pxor.
  2769. paddd xmm0,xmm4
  2770. pxor xmm12,xmm0
  2771. pshufb xmm12,XMMWORD[$L$rol16]
  2772. paddd xmm8,xmm12
  2773. pxor xmm4,xmm8
  2774. movdqa xmm3,xmm4
  2775. pslld xmm3,12
  2776. psrld xmm4,20
  2777. pxor xmm4,xmm3
  2778. paddd xmm0,xmm4
  2779. pxor xmm12,xmm0
  2780. pshufb xmm12,XMMWORD[$L$rol8]
  2781. paddd xmm8,xmm12
  2782. pxor xmm4,xmm8
  2783. movdqa xmm3,xmm4
  2784. pslld xmm3,7
  2785. psrld xmm4,25
  2786. pxor xmm4,xmm3
  ; Hand-encoded palignr xmm4,xmm4,4 / xmm8,xmm8,8 / xmm12,xmm12,12:
  ; rotate rows B, C, D for the diagonal round.
  2787. DB 102,15,58,15,228,4
  2788. DB 102,69,15,58,15,192,8
  2789. DB 102,69,15,58,15,228,12
  ; Diagonal quarter rounds.
  2790. paddd xmm0,xmm4
  2791. pxor xmm12,xmm0
  2792. pshufb xmm12,XMMWORD[$L$rol16]
  2793. paddd xmm8,xmm12
  2794. pxor xmm4,xmm8
  2795. movdqa xmm3,xmm4
  2796. pslld xmm3,12
  2797. psrld xmm4,20
  2798. pxor xmm4,xmm3
  2799. paddd xmm0,xmm4
  2800. pxor xmm12,xmm0
  2801. pshufb xmm12,XMMWORD[$L$rol8]
  2802. paddd xmm8,xmm12
  2803. pxor xmm4,xmm8
  2804. movdqa xmm3,xmm4
  2805. pslld xmm3,7
  2806. psrld xmm4,25
  2807. pxor xmm4,xmm3
  ; palignr with amounts 12, 8, 4: rotate the rows back.
  2808. DB 102,15,58,15,228,12
  2809. DB 102,69,15,58,15,192,8
  2810. DB 102,69,15,58,15,228,4
  ; Poly1305 block for the next 16 bytes at rdi.
  2811. add r10,QWORD[((0+0))+rdi]
  2812. adc r11,QWORD[((8+0))+rdi]
  2813. adc r12,1
  2814. mov rax,QWORD[((0+160+0))+rbp]
  2815. mov r15,rax
  2816. mul r10
  2817. mov r13,rax
  2818. mov r14,rdx
  2819. mov rax,QWORD[((0+160+0))+rbp]
  2820. mul r11
  2821. imul r15,r12
  2822. add r14,rax
  2823. adc r15,rdx
  2824. mov rax,QWORD[((8+160+0))+rbp]
  2825. mov r9,rax
  2826. mul r10
  2827. add r14,rax
  2828. adc rdx,0
  2829. mov r10,rdx
  2830. mov rax,QWORD[((8+160+0))+rbp]
  2831. mul r11
  2832. add r15,rax
  2833. adc rdx,0
  2834. imul r9,r12
  2835. add r15,r10
  2836. adc r9,rdx
  2837. mov r10,r13
  2838. mov r11,r14
  2839. mov r12,r15
  2840. and r12,3
  2841. mov r13,r15
  2842. and r13,-4
  2843. mov r14,r9
  2844. shrd r15,r9,2
  2845. shr r9,2
  2846. add r15,r13
  2847. adc r9,r14
  2848. add r10,r15
  2849. adc r11,r9
  2850. adc r12,0
  2851. lea rdi,[16+rdi]
  ; First run the two-hash passes until rcx reaches zero, then the one-hash
  ; passes until r8 goes negative.
  2852. dec rcx
  2853. jg NEAR $L$seal_sse_tail_64_rounds_and_x2hash
  2854. dec r8
  2855. jge NEAR $L$seal_sse_tail_64_rounds_and_x1hash
  ; Add the initial state back; the finished keystream block stays in
  ; xmm0/xmm4/xmm8/xmm12 for the 16-byte XOR loop.
  2856. paddd xmm0,XMMWORD[$L$chacha20_consts]
  2857. paddd xmm4,XMMWORD[((160+48))+rbp]
  2858. paddd xmm8,XMMWORD[((160+64))+rbp]
  2859. paddd xmm12,XMMWORD[((160+96))+rbp]
  2860. jmp NEAR $L$seal_sse_128_tail_xor
  ; Tail for 65..128 remaining bytes: generate two keystream blocks,
  ; block 0 in xmm0/xmm4/xmm8/xmm12 and block 1 in xmm1/xmm5/xmm9/xmm13, while
  ; hashing ciphertext already written at rdi. rcx and r8 are set by the code
  ; that jumps here. xmm3 is the scratch register for the rotations.
  2861. $L$seal_sse_tail_128:
  2862. movdqa xmm0,XMMWORD[$L$chacha20_consts]
  2863. movdqa xmm4,XMMWORD[((160+48))+rbp]
  2864. movdqa xmm8,XMMWORD[((160+64))+rbp]
  2865. movdqa xmm1,xmm0
  2866. movdqa xmm5,xmm4
  2867. movdqa xmm9,xmm8
  ; Two consecutive counter rows: xmm13 gets the lower one, xmm12 the higher.
  ; Both are saved for the final add-back.
  2868. movdqa xmm13,XMMWORD[((160+96))+rbp]
  2869. paddd xmm13,XMMWORD[$L$sse_inc]
  2870. movdqa xmm12,xmm13
  2871. paddd xmm12,XMMWORD[$L$sse_inc]
  2872. movdqa XMMWORD[(160+96)+rbp],xmm12
  2873. movdqa XMMWORD[(160+112)+rbp],xmm13
  2874. $L$seal_sse_tail_128_rounds_and_x2hash:
  ; Poly1305 block over the 16 bytes at rdi (add, multiply, reduce).
  2875. add r10,QWORD[((0+0))+rdi]
  2876. adc r11,QWORD[((8+0))+rdi]
  2877. adc r12,1
  2878. mov rax,QWORD[((0+160+0))+rbp]
  2879. mov r15,rax
  2880. mul r10
  2881. mov r13,rax
  2882. mov r14,rdx
  2883. mov rax,QWORD[((0+160+0))+rbp]
  2884. mul r11
  2885. imul r15,r12
  2886. add r14,rax
  2887. adc r15,rdx
  2888. mov rax,QWORD[((8+160+0))+rbp]
  2889. mov r9,rax
  2890. mul r10
  2891. add r14,rax
  2892. adc rdx,0
  2893. mov r10,rdx
  2894. mov rax,QWORD[((8+160+0))+rbp]
  2895. mul r11
  2896. add r15,rax
  2897. adc rdx,0
  2898. imul r9,r12
  2899. add r15,r10
  2900. adc r9,rdx
  2901. mov r10,r13
  2902. mov r11,r14
  2903. mov r12,r15
  2904. and r12,3
  2905. mov r13,r15
  2906. and r13,-4
  2907. mov r14,r9
  2908. shrd r15,r9,2
  2909. shr r9,2
  2910. add r15,r13
  2911. adc r9,r14
  2912. add r10,r15
  2913. adc r11,r9
  2914. adc r12,0
  2915. lea rdi,[16+rdi]
  2916. $L$seal_sse_tail_128_rounds_and_x1hash:
  ; Column round, block 0.
  2917. paddd xmm0,xmm4
  2918. pxor xmm12,xmm0
  2919. pshufb xmm12,XMMWORD[$L$rol16]
  2920. paddd xmm8,xmm12
  2921. pxor xmm4,xmm8
  2922. movdqa xmm3,xmm4
  2923. pslld xmm3,12
  2924. psrld xmm4,20
  2925. pxor xmm4,xmm3
  2926. paddd xmm0,xmm4
  2927. pxor xmm12,xmm0
  2928. pshufb xmm12,XMMWORD[$L$rol8]
  2929. paddd xmm8,xmm12
  2930. pxor xmm4,xmm8
  2931. movdqa xmm3,xmm4
  2932. pslld xmm3,7
  2933. psrld xmm4,25
  2934. pxor xmm4,xmm3
  ; Hand-encoded palignr xmm4,xmm4,4 / xmm8,xmm8,8 / xmm12,xmm12,12.
  2935. DB 102,15,58,15,228,4
  2936. DB 102,69,15,58,15,192,8
  2937. DB 102,69,15,58,15,228,12
  ; Column round, block 1.
  2938. paddd xmm1,xmm5
  2939. pxor xmm13,xmm1
  2940. pshufb xmm13,XMMWORD[$L$rol16]
  2941. paddd xmm9,xmm13
  2942. pxor xmm5,xmm9
  2943. movdqa xmm3,xmm5
  2944. pslld xmm3,12
  2945. psrld xmm5,20
  2946. pxor xmm5,xmm3
  2947. paddd xmm1,xmm5
  2948. pxor xmm13,xmm1
  2949. pshufb xmm13,XMMWORD[$L$rol8]
  2950. paddd xmm9,xmm13
  2951. pxor xmm5,xmm9
  2952. movdqa xmm3,xmm5
  2953. pslld xmm3,7
  2954. psrld xmm5,25
  2955. pxor xmm5,xmm3
  ; Hand-encoded palignr xmm5,xmm5,4 / xmm9,xmm9,8 / xmm13,xmm13,12.
  2956. DB 102,15,58,15,237,4
  2957. DB 102,69,15,58,15,201,8
  2958. DB 102,69,15,58,15,237,12
  ; Poly1305 block. rdi is advanced only at the end of the pass, so this hashes
  ; the 16 bytes at the current rdi.
  2959. add r10,QWORD[((0+0))+rdi]
  2960. adc r11,QWORD[((8+0))+rdi]
  2961. adc r12,1
  2962. mov rax,QWORD[((0+160+0))+rbp]
  2963. mov r15,rax
  2964. mul r10
  2965. mov r13,rax
  2966. mov r14,rdx
  2967. mov rax,QWORD[((0+160+0))+rbp]
  2968. mul r11
  2969. imul r15,r12
  2970. add r14,rax
  2971. adc r15,rdx
  2972. mov rax,QWORD[((8+160+0))+rbp]
  2973. mov r9,rax
  2974. mul r10
  2975. add r14,rax
  2976. adc rdx,0
  2977. mov r10,rdx
  2978. mov rax,QWORD[((8+160+0))+rbp]
  2979. mul r11
  2980. add r15,rax
  2981. adc rdx,0
  2982. imul r9,r12
  2983. add r15,r10
  2984. adc r9,rdx
  2985. mov r10,r13
  2986. mov r11,r14
  2987. mov r12,r15
  2988. and r12,3
  2989. mov r13,r15
  2990. and r13,-4
  2991. mov r14,r9
  2992. shrd r15,r9,2
  2993. shr r9,2
  2994. add r15,r13
  2995. adc r9,r14
  2996. add r10,r15
  2997. adc r11,r9
  2998. adc r12,0
  ; Diagonal round, block 0.
  2999. paddd xmm0,xmm4
  3000. pxor xmm12,xmm0
  3001. pshufb xmm12,XMMWORD[$L$rol16]
  3002. paddd xmm8,xmm12
  3003. pxor xmm4,xmm8
  3004. movdqa xmm3,xmm4
  3005. pslld xmm3,12
  3006. psrld xmm4,20
  3007. pxor xmm4,xmm3
  3008. paddd xmm0,xmm4
  3009. pxor xmm12,xmm0
  3010. pshufb xmm12,XMMWORD[$L$rol8]
  3011. paddd xmm8,xmm12
  3012. pxor xmm4,xmm8
  3013. movdqa xmm3,xmm4
  3014. pslld xmm3,7
  3015. psrld xmm4,25
  3016. pxor xmm4,xmm3
  ; palignr xmm4,xmm4,12 / xmm8,xmm8,8 / xmm12,xmm12,4: rotate rows back.
  3017. DB 102,15,58,15,228,12
  3018. DB 102,69,15,58,15,192,8
  3019. DB 102,69,15,58,15,228,4
  ; Diagonal round, block 1.
  3020. paddd xmm1,xmm5
  3021. pxor xmm13,xmm1
  3022. pshufb xmm13,XMMWORD[$L$rol16]
  3023. paddd xmm9,xmm13
  3024. pxor xmm5,xmm9
  3025. movdqa xmm3,xmm5
  3026. pslld xmm3,12
  3027. psrld xmm5,20
  3028. pxor xmm5,xmm3
  3029. paddd xmm1,xmm5
  3030. pxor xmm13,xmm1
  3031. pshufb xmm13,XMMWORD[$L$rol8]
  3032. paddd xmm9,xmm13
  3033. pxor xmm5,xmm9
  3034. movdqa xmm3,xmm5
  3035. pslld xmm3,7
  3036. psrld xmm5,25
  3037. pxor xmm5,xmm3
  ; palignr xmm5,xmm5,12 / xmm9,xmm9,8 / xmm13,xmm13,4.
  3038. DB 102,15,58,15,237,12
  3039. DB 102,69,15,58,15,201,8
  3040. DB 102,69,15,58,15,237,4
  3041. lea rdi,[16+rdi]
  ; Two-hash passes while rcx stays positive, then one-hash passes until r8
  ; goes negative.
  3042. dec rcx
  3043. jg NEAR $L$seal_sse_tail_128_rounds_and_x2hash
  3044. dec r8
  3045. jge NEAR $L$seal_sse_tail_128_rounds_and_x1hash
  ; Add the initial state back into both blocks.
  3046. paddd xmm1,XMMWORD[$L$chacha20_consts]
  3047. paddd xmm5,XMMWORD[((160+48))+rbp]
  3048. paddd xmm9,XMMWORD[((160+64))+rbp]
  3049. paddd xmm13,XMMWORD[((160+112))+rbp]
  3050. paddd xmm0,XMMWORD[$L$chacha20_consts]
  3051. paddd xmm4,XMMWORD[((160+48))+rbp]
  3052. paddd xmm8,XMMWORD[((160+64))+rbp]
  3053. paddd xmm12,XMMWORD[((160+96))+rbp]
  ; Block 1 (lower counter) encrypts the first 64 remaining bytes in full.
  3054. movdqu xmm3,XMMWORD[((0 + 0))+rsi]
  3055. movdqu xmm7,XMMWORD[((16 + 0))+rsi]
  3056. movdqu xmm11,XMMWORD[((32 + 0))+rsi]
  3057. movdqu xmm15,XMMWORD[((48 + 0))+rsi]
  3058. pxor xmm1,xmm3
  3059. pxor xmm5,xmm7
  3060. pxor xmm9,xmm11
  3061. pxor xmm15,xmm13
  3062. movdqu XMMWORD[(0 + 0)+rdi],xmm1
  3063. movdqu XMMWORD[(16 + 0)+rdi],xmm5
  3064. movdqu XMMWORD[(32 + 0)+rdi],xmm9
  3065. movdqu XMMWORD[(48 + 0)+rdi],xmm15
  ; rcx = 64 bytes just written that still need hashing; block 0 remains in
  ; xmm0/xmm4/xmm8/xmm12 for the bytes after them.
  3066. mov rcx,4*16
  3067. sub rbx,4*16
  3068. lea rsi,[64+rsi]
  3069. jmp NEAR $L$seal_sse_128_tail_hash
  ; Tail for 129..192 remaining bytes: generate three keystream blocks,
  ; block 0 in xmm0/xmm4/xmm8/xmm12, block 1 in xmm1/xmm5/xmm9/xmm13 and
  ; block 2 in xmm2/xmm6/xmm10/xmm14, while hashing ciphertext already written
  ; at rdi. rcx and r8 are set by the code that jumps here. xmm3 is the scratch
  ; register for the rotations.
  3070. $L$seal_sse_tail_192:
  3071. movdqa xmm0,XMMWORD[$L$chacha20_consts]
  3072. movdqa xmm4,XMMWORD[((160+48))+rbp]
  3073. movdqa xmm8,XMMWORD[((160+64))+rbp]
  3074. movdqa xmm1,xmm0
  3075. movdqa xmm5,xmm4
  3076. movdqa xmm9,xmm8
  3077. movdqa xmm2,xmm0
  3078. movdqa xmm6,xmm4
  3079. movdqa xmm10,xmm8
  ; Three consecutive counter rows (xmm14 lowest, xmm12 highest), saved for
  ; the final add-back.
  3080. movdqa xmm14,XMMWORD[((160+96))+rbp]
  3081. paddd xmm14,XMMWORD[$L$sse_inc]
  3082. movdqa xmm13,xmm14
  3083. paddd xmm13,XMMWORD[$L$sse_inc]
  3084. movdqa xmm12,xmm13
  3085. paddd xmm12,XMMWORD[$L$sse_inc]
  3086. movdqa XMMWORD[(160+96)+rbp],xmm12
  3087. movdqa XMMWORD[(160+112)+rbp],xmm13
  3088. movdqa XMMWORD[(160+128)+rbp],xmm14
  3089. $L$seal_sse_tail_192_rounds_and_x2hash:
  ; Poly1305 block over the 16 bytes at rdi (add, multiply, reduce).
  3090. add r10,QWORD[((0+0))+rdi]
  3091. adc r11,QWORD[((8+0))+rdi]
  3092. adc r12,1
  3093. mov rax,QWORD[((0+160+0))+rbp]
  3094. mov r15,rax
  3095. mul r10
  3096. mov r13,rax
  3097. mov r14,rdx
  3098. mov rax,QWORD[((0+160+0))+rbp]
  3099. mul r11
  3100. imul r15,r12
  3101. add r14,rax
  3102. adc r15,rdx
  3103. mov rax,QWORD[((8+160+0))+rbp]
  3104. mov r9,rax
  3105. mul r10
  3106. add r14,rax
  3107. adc rdx,0
  3108. mov r10,rdx
  3109. mov rax,QWORD[((8+160+0))+rbp]
  3110. mul r11
  3111. add r15,rax
  3112. adc rdx,0
  3113. imul r9,r12
  3114. add r15,r10
  3115. adc r9,rdx
  3116. mov r10,r13
  3117. mov r11,r14
  3118. mov r12,r15
  3119. and r12,3
  3120. mov r13,r15
  3121. and r13,-4
  3122. mov r14,r9
  3123. shrd r15,r9,2
  3124. shr r9,2
  3125. add r15,r13
  3126. adc r9,r14
  3127. add r10,r15
  3128. adc r11,r9
  3129. adc r12,0
  3130. lea rdi,[16+rdi]
  3131. $L$seal_sse_tail_192_rounds_and_x1hash:
  ; Column round, block 0.
  3132. paddd xmm0,xmm4
  3133. pxor xmm12,xmm0
  3134. pshufb xmm12,XMMWORD[$L$rol16]
  3135. paddd xmm8,xmm12
  3136. pxor xmm4,xmm8
  3137. movdqa xmm3,xmm4
  3138. pslld xmm3,12
  3139. psrld xmm4,20
  3140. pxor xmm4,xmm3
  3141. paddd xmm0,xmm4
  3142. pxor xmm12,xmm0
  3143. pshufb xmm12,XMMWORD[$L$rol8]
  3144. paddd xmm8,xmm12
  3145. pxor xmm4,xmm8
  3146. movdqa xmm3,xmm4
  3147. pslld xmm3,7
  3148. psrld xmm4,25
  3149. pxor xmm4,xmm3
  ; Hand-encoded palignr xmm4,xmm4,4 / xmm8,xmm8,8 / xmm12,xmm12,12.
  3150. DB 102,15,58,15,228,4
  3151. DB 102,69,15,58,15,192,8
  3152. DB 102,69,15,58,15,228,12
  ; Column round, block 1.
  3153. paddd xmm1,xmm5
  3154. pxor xmm13,xmm1
  3155. pshufb xmm13,XMMWORD[$L$rol16]
  3156. paddd xmm9,xmm13
  3157. pxor xmm5,xmm9
  3158. movdqa xmm3,xmm5
  3159. pslld xmm3,12
  3160. psrld xmm5,20
  3161. pxor xmm5,xmm3
  3162. paddd xmm1,xmm5
  3163. pxor xmm13,xmm1
  3164. pshufb xmm13,XMMWORD[$L$rol8]
  3165. paddd xmm9,xmm13
  3166. pxor xmm5,xmm9
  3167. movdqa xmm3,xmm5
  3168. pslld xmm3,7
  3169. psrld xmm5,25
  3170. pxor xmm5,xmm3
  ; Hand-encoded palignr xmm5,xmm5,4 / xmm9,xmm9,8 / xmm13,xmm13,12.
  3171. DB 102,15,58,15,237,4
  3172. DB 102,69,15,58,15,201,8
  3173. DB 102,69,15,58,15,237,12
  ; Column round, block 2.
  3174. paddd xmm2,xmm6
  3175. pxor xmm14,xmm2
  3176. pshufb xmm14,XMMWORD[$L$rol16]
  3177. paddd xmm10,xmm14
  3178. pxor xmm6,xmm10
  3179. movdqa xmm3,xmm6
  3180. pslld xmm3,12
  3181. psrld xmm6,20
  3182. pxor xmm6,xmm3
  3183. paddd xmm2,xmm6
  3184. pxor xmm14,xmm2
  3185. pshufb xmm14,XMMWORD[$L$rol8]
  3186. paddd xmm10,xmm14
  3187. pxor xmm6,xmm10
  3188. movdqa xmm3,xmm6
  3189. pslld xmm3,7
  3190. psrld xmm6,25
  3191. pxor xmm6,xmm3
  ; Hand-encoded palignr xmm6,xmm6,4 / xmm10,xmm10,8 / xmm14,xmm14,12.
  3192. DB 102,15,58,15,246,4
  3193. DB 102,69,15,58,15,210,8
  3194. DB 102,69,15,58,15,246,12
  ; Poly1305 block. rdi is advanced only at the end of the pass, so this hashes
  ; the 16 bytes at the current rdi.
  3195. add r10,QWORD[((0+0))+rdi]
  3196. adc r11,QWORD[((8+0))+rdi]
  3197. adc r12,1
  3198. mov rax,QWORD[((0+160+0))+rbp]
  3199. mov r15,rax
  3200. mul r10
  3201. mov r13,rax
  3202. mov r14,rdx
  3203. mov rax,QWORD[((0+160+0))+rbp]
  3204. mul r11
  3205. imul r15,r12
  3206. add r14,rax
  3207. adc r15,rdx
  3208. mov rax,QWORD[((8+160+0))+rbp]
  3209. mov r9,rax
  3210. mul r10
  3211. add r14,rax
  3212. adc rdx,0
  3213. mov r10,rdx
  3214. mov rax,QWORD[((8+160+0))+rbp]
  3215. mul r11
  3216. add r15,rax
  3217. adc rdx,0
  3218. imul r9,r12
  3219. add r15,r10
  3220. adc r9,rdx
  3221. mov r10,r13
  3222. mov r11,r14
  3223. mov r12,r15
  3224. and r12,3
  3225. mov r13,r15
  3226. and r13,-4
  3227. mov r14,r9
  3228. shrd r15,r9,2
  3229. shr r9,2
  3230. add r15,r13
  3231. adc r9,r14
  3232. add r10,r15
  3233. adc r11,r9
  3234. adc r12,0
  ; Diagonal round, block 0.
  3235. paddd xmm0,xmm4
  3236. pxor xmm12,xmm0
  3237. pshufb xmm12,XMMWORD[$L$rol16]
  3238. paddd xmm8,xmm12
  3239. pxor xmm4,xmm8
  3240. movdqa xmm3,xmm4
  3241. pslld xmm3,12
  3242. psrld xmm4,20
  3243. pxor xmm4,xmm3
  3244. paddd xmm0,xmm4
  3245. pxor xmm12,xmm0
  3246. pshufb xmm12,XMMWORD[$L$rol8]
  3247. paddd xmm8,xmm12
  3248. pxor xmm4,xmm8
  3249. movdqa xmm3,xmm4
  3250. pslld xmm3,7
  3251. psrld xmm4,25
  3252. pxor xmm4,xmm3
  ; palignr xmm4,xmm4,12 / xmm8,xmm8,8 / xmm12,xmm12,4: rotate rows back.
  3253. DB 102,15,58,15,228,12
  3254. DB 102,69,15,58,15,192,8
  3255. DB 102,69,15,58,15,228,4
  ; Diagonal round, block 1.
  3256. paddd xmm1,xmm5
  3257. pxor xmm13,xmm1
  3258. pshufb xmm13,XMMWORD[$L$rol16]
  3259. paddd xmm9,xmm13
  3260. pxor xmm5,xmm9
  3261. movdqa xmm3,xmm5
  3262. pslld xmm3,12
  3263. psrld xmm5,20
  3264. pxor xmm5,xmm3
  3265. paddd xmm1,xmm5
  3266. pxor xmm13,xmm1
  3267. pshufb xmm13,XMMWORD[$L$rol8]
  3268. paddd xmm9,xmm13
  3269. pxor xmm5,xmm9
  3270. movdqa xmm3,xmm5
  3271. pslld xmm3,7
  3272. psrld xmm5,25
  3273. pxor xmm5,xmm3
  ; palignr xmm5,xmm5,12 / xmm9,xmm9,8 / xmm13,xmm13,4.
  3274. DB 102,15,58,15,237,12
  3275. DB 102,69,15,58,15,201,8
  3276. DB 102,69,15,58,15,237,4
  ; Diagonal round, block 2.
  3277. paddd xmm2,xmm6
  3278. pxor xmm14,xmm2
  3279. pshufb xmm14,XMMWORD[$L$rol16]
  3280. paddd xmm10,xmm14
  3281. pxor xmm6,xmm10
  3282. movdqa xmm3,xmm6
  3283. pslld xmm3,12
  3284. psrld xmm6,20
  3285. pxor xmm6,xmm3
  3286. paddd xmm2,xmm6
  3287. pxor xmm14,xmm2
  3288. pshufb xmm14,XMMWORD[$L$rol8]
  3289. paddd xmm10,xmm14
  3290. pxor xmm6,xmm10
  3291. movdqa xmm3,xmm6
  3292. pslld xmm3,7
  3293. psrld xmm6,25
  3294. pxor xmm6,xmm3
  ; palignr xmm6,xmm6,12 / xmm10,xmm10,8 / xmm14,xmm14,4.
  3295. DB 102,15,58,15,246,12
  3296. DB 102,69,15,58,15,210,8
  3297. DB 102,69,15,58,15,246,4
  3298. lea rdi,[16+rdi]
  ; Two-hash passes while rcx stays positive, then one-hash passes until r8
  ; goes negative.
  3299. dec rcx
  3300. jg NEAR $L$seal_sse_tail_192_rounds_and_x2hash
  3301. dec r8
  3302. jge NEAR $L$seal_sse_tail_192_rounds_and_x1hash
  ; Add the initial state back into all three blocks.
  3303. paddd xmm2,XMMWORD[$L$chacha20_consts]
  3304. paddd xmm6,XMMWORD[((160+48))+rbp]
  3305. paddd xmm10,XMMWORD[((160+64))+rbp]
  3306. paddd xmm14,XMMWORD[((160+128))+rbp]
  3307. paddd xmm1,XMMWORD[$L$chacha20_consts]
  3308. paddd xmm5,XMMWORD[((160+48))+rbp]
  3309. paddd xmm9,XMMWORD[((160+64))+rbp]
  3310. paddd xmm13,XMMWORD[((160+112))+rbp]
  3311. paddd xmm0,XMMWORD[$L$chacha20_consts]
  3312. paddd xmm4,XMMWORD[((160+48))+rbp]
  3313. paddd xmm8,XMMWORD[((160+64))+rbp]
  3314. paddd xmm12,XMMWORD[((160+96))+rbp]
  ; Block 2 (lowest counter) encrypts bytes 0..63.
  3315. movdqu xmm3,XMMWORD[((0 + 0))+rsi]
  3316. movdqu xmm7,XMMWORD[((16 + 0))+rsi]
  3317. movdqu xmm11,XMMWORD[((32 + 0))+rsi]
  3318. movdqu xmm15,XMMWORD[((48 + 0))+rsi]
  3319. pxor xmm2,xmm3
  3320. pxor xmm6,xmm7
  3321. pxor xmm10,xmm11
  3322. pxor xmm15,xmm14
  3323. movdqu XMMWORD[(0 + 0)+rdi],xmm2
  3324. movdqu XMMWORD[(16 + 0)+rdi],xmm6
  3325. movdqu XMMWORD[(32 + 0)+rdi],xmm10
  3326. movdqu XMMWORD[(48 + 0)+rdi],xmm15
  ; Block 1 encrypts bytes 64..127.
  3327. movdqu xmm3,XMMWORD[((0 + 64))+rsi]
  3328. movdqu xmm7,XMMWORD[((16 + 64))+rsi]
  3329. movdqu xmm11,XMMWORD[((32 + 64))+rsi]
  3330. movdqu xmm15,XMMWORD[((48 + 64))+rsi]
  3331. pxor xmm1,xmm3
  3332. pxor xmm5,xmm7
  3333. pxor xmm9,xmm11
  3334. pxor xmm15,xmm13
  3335. movdqu XMMWORD[(0 + 64)+rdi],xmm1
  3336. movdqu XMMWORD[(16 + 64)+rdi],xmm5
  3337. movdqu XMMWORD[(32 + 64)+rdi],xmm9
  3338. movdqu XMMWORD[(48 + 64)+rdi],xmm15
  ; rcx = 128 bytes just written that still need hashing; block 0 remains in
  ; xmm0/xmm4/xmm8/xmm12. Execution falls through into the shared tail.
  3339. mov rcx,8*16
  3340. sub rbx,8*16
  3341. lea rsi,[128+rsi]
  ; Hash ciphertext that has been written but not yet fed to Poly1305.
  ; rcx = number of such bytes; one 16-byte block at rdi is absorbed per
  ; iteration until fewer than 16 remain in rcx.
  3342. $L$seal_sse_128_tail_hash:
  3343. cmp rcx,16
  3344. jb NEAR $L$seal_sse_128_tail_xor
  ; accumulator (r10:r11:r12) += block, with a 1 added above the 128 bits.
  3345. add r10,QWORD[((0+0))+rdi]
  3346. adc r11,QWORD[((8+0))+rdi]
  3347. adc r12,1
  ; Multiply by the two 64-bit words at rbp+160+0 and rbp+160+8; the product is
  ; collected in r13 (lowest), r14, r15, r9.
  3348. mov rax,QWORD[((0+160+0))+rbp]
  3349. mov r15,rax
  3350. mul r10
  3351. mov r13,rax
  3352. mov r14,rdx
  3353. mov rax,QWORD[((0+160+0))+rbp]
  3354. mul r11
  3355. imul r15,r12
  3356. add r14,rax
  3357. adc r15,rdx
  3358. mov rax,QWORD[((8+160+0))+rbp]
  3359. mov r9,rax
  3360. mul r10
  3361. add r14,rax
  3362. adc rdx,0
  3363. mov r10,rdx
  3364. mov rax,QWORD[((8+160+0))+rbp]
  3365. mul r11
  3366. add r15,rax
  3367. adc rdx,0
  3368. imul r9,r12
  3369. add r15,r10
  3370. adc r9,rdx
  ; Reduce: keep the low 130 bits and add back the high part H as
  ; (H with its low two bits cleared) + (H shifted right by 2) = 5*(H/4).
  3371. mov r10,r13
  3372. mov r11,r14
  3373. mov r12,r15
  3374. and r12,3
  3375. mov r13,r15
  3376. and r13,-4
  3377. mov r14,r9
  3378. shrd r15,r9,2
  3379. shr r9,2
  3380. add r15,r13
  3381. adc r9,r14
  3382. add r10,r15
  3383. adc r11,r9
  3384. adc r12,0
  3385. sub rcx,16
  3386. lea rdi,[16+rdi]
  3387. jmp NEAR $L$seal_sse_128_tail_hash
  ; Encrypt and hash the remaining input 16 bytes at a time. xmm0 holds the
  ; next 16 bytes of keystream; once used, the following keystream rows are
  ; shifted down through the chain
  ; xmm0, xmm4, xmm8, xmm12, xmm1, xmm5, xmm9, xmm13.
  ; rbx = bytes remaining; exits to seal_sse_tail_16 when under 16 are left.
  3388. $L$seal_sse_128_tail_xor:
  3389. cmp rbx,16
  3390. jb NEAR $L$seal_sse_tail_16
  3391. sub rbx,16
  3392. movdqu xmm3,XMMWORD[rsi]
  3393. pxor xmm0,xmm3
  3394. movdqu XMMWORD[rdi],xmm0
  ; Hash the ciphertext block just stored.
  3395. add r10,QWORD[rdi]
  3396. adc r11,QWORD[8+rdi]
  3397. adc r12,1
  3398. lea rsi,[16+rsi]
  3399. lea rdi,[16+rdi]
  ; Poly1305 multiply and reduce.
  3400. mov rax,QWORD[((0+160+0))+rbp]
  3401. mov r15,rax
  3402. mul r10
  3403. mov r13,rax
  3404. mov r14,rdx
  3405. mov rax,QWORD[((0+160+0))+rbp]
  3406. mul r11
  3407. imul r15,r12
  3408. add r14,rax
  3409. adc r15,rdx
  3410. mov rax,QWORD[((8+160+0))+rbp]
  3411. mov r9,rax
  3412. mul r10
  3413. add r14,rax
  3414. adc rdx,0
  3415. mov r10,rdx
  3416. mov rax,QWORD[((8+160+0))+rbp]
  3417. mul r11
  3418. add r15,rax
  3419. adc rdx,0
  3420. imul r9,r12
  3421. add r15,r10
  3422. adc r9,rdx
  3423. mov r10,r13
  3424. mov r11,r14
  3425. mov r12,r15
  3426. and r12,3
  3427. mov r13,r15
  3428. and r13,-4
  3429. mov r14,r9
  3430. shrd r15,r9,2
  3431. shr r9,2
  3432. add r15,r13
  3433. adc r9,r14
  3434. add r10,r15
  3435. adc r11,r9
  3436. adc r12,0
  ; Shift the keystream queue by one 16-byte row.
  3437. movdqa xmm0,xmm4
  3438. movdqa xmm4,xmm8
  3439. movdqa xmm8,xmm12
  3440. movdqa xmm12,xmm1
  3441. movdqa xmm1,xmm5
  3442. movdqa xmm5,xmm9
  3443. movdqa xmm9,xmm13
  3444. jmp NEAR $L$seal_sse_128_tail_xor
  ; Final partial block: rbx (under 16 here) bytes of input remain and xmm0
  ; holds the keystream for them. With nothing left, go straight to the
  ; handling of extra input.
  3445. $L$seal_sse_tail_16:
  3446. test rbx,rbx
  3447. jz NEAR $L$process_blocks_of_extra_in
  3448. mov r8,rbx
  3449. mov rcx,rbx
  ; Gather the rbx input bytes into xmm15 one byte at a time, walking
  ; backwards from the last byte, so no memory past the input is read.
  3450. lea rsi,[((-1))+rbx*1+rsi]
  3451. pxor xmm15,xmm15
  3452. $L$seal_sse_tail_16_compose:
  3453. pslldq xmm15,1
  3454. pinsrb xmm15,BYTE[rsi],0
  3455. lea rsi,[((-1))+rsi]
  3456. dec rcx
  3457. jne NEAR $L$seal_sse_tail_16_compose
  ; Encrypt. xmm15 keeps the result for hashing; xmm0 gets a copy that is
  ; consumed while writing the output.
  3458. pxor xmm15,xmm0
  3459. mov rcx,rbx
  3460. movdqu xmm0,xmm15
  ; Store the rbx ciphertext bytes one at a time (lowest byte first).
  3461. $L$seal_sse_tail_16_extract:
  3462. pextrb XMMWORD[rdi],xmm0,0
  3463. psrldq xmm0,1
  3464. add rdi,1
  3465. sub rcx,1
  3466. jnz NEAR $L$seal_sse_tail_16_extract
  ; r9 = pointer loaded from the stack frame; the qwords at +48 and +56 of the
  ; object it points to are used below as a source pointer (r13) and a byte
  ; count (r14). NOTE(review): presumably the caller's extra_in pointer and
  ; length - confirm against the function prologue and the C caller.
  3467. mov r9,QWORD[((288 + 160 + 32))+rsp]
  3468. mov r14,QWORD[56+r9]
  3469. mov r13,QWORD[48+r9]
  3470. test r14,r14
  3471. jz NEAR $L$process_partial_block
  ; r15 = min(16 - rbx, r14): how many extra bytes fit into the rest of this
  ; block. Note: jge is a signed comparison.
  3472. mov r15,16
  3473. sub r15,rbx
  3474. cmp r14,r15
  3475. jge NEAR $L$load_extra_in
  3476. mov r15,r14
  3477. $L$load_extra_in:
  ; rsi = last of the r15 bytes to take. The stored pointer and count are
  ; updated to reflect the bytes consumed here.
  3478. lea rsi,[((-1))+r15*1+r13]
  3479. add r13,r15
  3480. sub r14,r15
  3481. mov QWORD[48+r9],r13
  3482. mov QWORD[56+r9],r14
  3483. add r8,r15
  ; Gather the r15 extra bytes into xmm11, again walking backwards.
  3484. pxor xmm11,xmm11
  3485. $L$load_extra_load_loop:
  3486. pslldq xmm11,1
  3487. pinsrb xmm11,BYTE[rsi],0
  3488. lea rsi,[((-1))+rsi]
  3489. sub r15,1
  3490. jnz NEAR $L$load_extra_load_loop
  ; Shift xmm11 up by rbx bytes so the extra bytes sit directly above the
  ; ciphertext bytes. pslldq only takes an immediate count, hence the loop.
  3491. mov r15,rbx
  3492. $L$load_extra_shift_loop:
  3493. pslldq xmm11,1
  3494. sub r15,1
  3495. jnz NEAR $L$load_extra_shift_loop
  ; Keep only the low rbx bytes of xmm15 using entry rbx-1 of $L$and_masks
  ; (16 bytes per entry), then merge in the extra bytes.
  3496. lea r15,[$L$and_masks]
  3497. shl rbx,4
  3498. pand xmm15,XMMWORD[((-16))+rbx*1+r15]
  3499. por xmm15,xmm11
  ; Hand-encoded movq r13,xmm15; together with pextrq this moves the merged
  ; block into r13 (low half) and r14 (high half) for hashing.
  3500. DB 102,77,15,126,253
  3501. pextrq r14,xmm15,1
  3502. add r10,r13
  3503. adc r11,r14
  3504. adc r12,1
  ; Poly1305 multiply and reduce.
  3505. mov rax,QWORD[((0+160+0))+rbp]
  3506. mov r15,rax
  3507. mul r10
  3508. mov r13,rax
  3509. mov r14,rdx
  3510. mov rax,QWORD[((0+160+0))+rbp]
  3511. mul r11
  3512. imul r15,r12
  3513. add r14,rax
  3514. adc r15,rdx
  3515. mov rax,QWORD[((8+160+0))+rbp]
  3516. mov r9,rax
  3517. mul r10
  3518. add r14,rax
  3519. adc rdx,0
  3520. mov r10,rdx
  3521. mov rax,QWORD[((8+160+0))+rbp]
  3522. mul r11
  3523. add r15,rax
  3524. adc rdx,0
  3525. imul r9,r12
  3526. add r15,r10
  3527. adc r9,rdx
  3528. mov r10,r13
  3529. mov r11,r14
  3530. mov r12,r15
  3531. and r12,3
  3532. mov r13,r15
  3533. and r13,-4
  3534. mov r14,r9
  3535. shrd r15,r9,2
  3536. shr r9,2
  3537. add r15,r13
  3538. adc r9,r14
  3539. add r10,r15
  3540. adc r11,r9
  3541. adc r12,0
  ; $L$process_blocks_of_extra_in: absorb every complete 16-byte block found at
  ; the pointer stored at +48 of the saved structure (byte count at +56) into
  ; the accumulator r12:r11:r10. On exit to process_extra_in_trailer, rsi points
  ; past the hashed blocks and rcx still holds the full byte count.
  3542. $L$process_blocks_of_extra_in:
  3543. mov r9,QWORD[((288+32+160 ))+rsp] ; reload the saved pointer (r9 is used as scratch by the multiply)
  3544. mov rsi,QWORD[48+r9]
  3545. mov r8,QWORD[56+r9]
  3546. mov rcx,r8 ; keep the byte count for the trailer
  3547. shr r8,4 ; r8 = number of whole 16-byte blocks; sets ZF for the jz below
  3548. $L$process_extra_hash_loop:
  3549. jz NEAR process_extra_in_trailer ; ZF comes from the shr above or the sub at the loop end
  3550. add r10,QWORD[((0+0))+rsi] ; accumulator += 16-byte block ...
  3551. adc r11,QWORD[((8+0))+rsi]
  3552. adc r12,1 ; ... plus the 2^128 bit
  ; Multiply by the words at [160+rbp] / [168+rbp]; product in r13,r14,r15,r9.
  3553. mov rax,QWORD[((0+160+0))+rbp]
  3554. mov r15,rax
  3555. mul r10
  3556. mov r13,rax
  3557. mov r14,rdx
  3558. mov rax,QWORD[((0+160+0))+rbp]
  3559. mul r11
  3560. imul r15,r12
  3561. add r14,rax
  3562. adc r15,rdx
  3563. mov rax,QWORD[((8+160+0))+rbp]
  3564. mov r9,rax
  3565. mul r10
  3566. add r14,rax
  3567. adc rdx,0
  3568. mov r10,rdx
  3569. mov rax,QWORD[((8+160+0))+rbp]
  3570. mul r11
  3571. add r15,rax
  3572. adc rdx,0
  3573. imul r9,r12
  3574. add r15,r10
  3575. adc r9,rdx
  ; Reduce: low 130 bits + 5 * (bits above 130).
  3576. mov r10,r13
  3577. mov r11,r14
  3578. mov r12,r15
  3579. and r12,3
  3580. mov r13,r15
  3581. and r13,-4
  3582. mov r14,r9
  3583. shrd r15,r9,2
  3584. shr r9,2
  3585. add r15,r13
  3586. adc r9,r14
  3587. add r10,r15
  3588. adc r11,r9
  3589. adc r12,0
  3590. lea rsi,[16+rsi] ; advance input; lea leaves the flags alone
  3591. sub r8,1 ; sets ZF when the last whole block has been hashed
  3592. jmp NEAR $L$process_extra_hash_loop
  ; process_extra_in_trailer: absorb the final 1..15 bytes at [rsi] (count taken
  ; from the low four bits of rcx) into the accumulator r12:r11:r10.
  ; $L$process_partial_block is also a jump target of its own: it expects the
  ; block bytes in xmm15 and the byte count in rbx.
  ; NOTE(review): this label has no $L$ prefix, unlike the neighbouring labels;
  ; the branch that targets it uses the same spelling.
  3593. process_extra_in_trailer:
  3594. and rcx,15 ; rcx = bytes left after the whole blocks; sets ZF
  3595. mov rbx,rcx ; mov does not change flags, so the jz still tests the and
  3596. jz NEAR $L$do_length_block
  3597. lea rsi,[((-1))+rcx*1+rsi] ; rsi -> last trailing byte; read back to front
  3598. $L$process_extra_in_trailer_load:
  3599. pslldq xmm15,1 ; earlier contents of xmm15 move upward and are masked off below
  3600. pinsrb xmm15,BYTE[rsi],0
  3601. lea rsi,[((-1))+rsi]
  3602. sub rcx,1
  3603. jnz NEAR $L$process_extra_in_trailer_load
  3604. $L$process_partial_block:
  3605. lea r15,[$L$and_masks]
  3606. shl rbx,4 ; mask table entries are 16 bytes each
  3607. pand xmm15,XMMWORD[((-16))+rbx*1+r15] ; zero everything above the low (original rbx) bytes
  3608. DB 102,77,15,126,253 ; encodes: movq r13,xmm15 (low 8 bytes of the block)
  3609. pextrq r14,xmm15,1 ; high 8 bytes of the block
  3610. add r10,r13 ; accumulator += block ...
  3611. adc r11,r14
  3612. adc r12,1 ; ... plus the 2^128 bit
  ; Multiply by the words at [160+rbp] / [168+rbp]; product in r13,r14,r15,r9.
  3613. mov rax,QWORD[((0+160+0))+rbp]
  3614. mov r15,rax
  3615. mul r10
  3616. mov r13,rax
  3617. mov r14,rdx
  3618. mov rax,QWORD[((0+160+0))+rbp]
  3619. mul r11
  3620. imul r15,r12
  3621. add r14,rax
  3622. adc r15,rdx
  3623. mov rax,QWORD[((8+160+0))+rbp]
  3624. mov r9,rax
  3625. mul r10
  3626. add r14,rax
  3627. adc rdx,0
  3628. mov r10,rdx
  3629. mov rax,QWORD[((8+160+0))+rbp]
  3630. mul r11
  3631. add r15,rax
  3632. adc rdx,0
  3633. imul r9,r12
  3634. add r15,r10
  3635. adc r9,rdx
  ; Reduce: low 130 bits + 5 * (bits above 130).
  3636. mov r10,r13
  3637. mov r11,r14
  3638. mov r12,r15
  3639. and r12,3
  3640. mov r13,r15
  3641. and r13,-4
  3642. mov r14,r9
  3643. shrd r15,r9,2
  3644. shr r9,2
  3645. add r15,r13
  3646. adc r9,r14
  3647. add r10,r15
  3648. adc r11,r9
  3649. adc r12,0
  ; $L$do_length_block: absorb the 16 bytes stored at [160+32+rbp] as the last
  ; block, bring the accumulator below 2^130-5, add the 16 bytes at
  ; [160+16+rbp], store the 16-byte result through the pointer saved on the
  ; stack, restore the saved registers and return.
  ; NOTE(review): going by the label, [160+32+rbp] presumably holds the encoded
  ; lengths; it is written outside this view - confirm.
  3650. $L$do_length_block:
  3651. add r10,QWORD[((0+160+32))+rbp] ; accumulator += block ...
  3652. adc r11,QWORD[((8+160+32))+rbp]
  3653. adc r12,1 ; ... plus the 2^128 bit
  ; Multiply by the words at [160+rbp] / [168+rbp]; product in r13,r14,r15,r9.
  3654. mov rax,QWORD[((0+160+0))+rbp]
  3655. mov r15,rax
  3656. mul r10
  3657. mov r13,rax
  3658. mov r14,rdx
  3659. mov rax,QWORD[((0+160+0))+rbp]
  3660. mul r11
  3661. imul r15,r12
  3662. add r14,rax
  3663. adc r15,rdx
  3664. mov rax,QWORD[((8+160+0))+rbp]
  3665. mov r9,rax
  3666. mul r10
  3667. add r14,rax
  3668. adc rdx,0
  3669. mov r10,rdx
  3670. mov rax,QWORD[((8+160+0))+rbp]
  3671. mul r11
  3672. add r15,rax
  3673. adc rdx,0
  3674. imul r9,r12
  3675. add r15,r10
  3676. adc r9,rdx
  ; Reduce: low 130 bits + 5 * (bits above 130).
  3677. mov r10,r13
  3678. mov r11,r14
  3679. mov r12,r15
  3680. and r12,3
  3681. mov r13,r15
  3682. and r13,-4
  3683. mov r14,r9
  3684. shrd r15,r9,2
  3685. shr r9,2
  3686. add r15,r13
  3687. adc r9,r14
  3688. add r10,r15
  3689. adc r11,r9
  3690. adc r12,0
  ; Final reduction: try accumulator - (2^130 - 5); keep the difference unless
  ; the subtraction borrowed, in which case the saved copy is restored.
  3691. mov r13,r10
  3692. mov r14,r11
  3693. mov r15,r12
  3694. sub r10,-5 ; low limb of 2^130-5 is 2^64-5
  3695. sbb r11,-1 ; middle limb is 2^64-1
  3696. sbb r12,3 ; top limb is 3
  3697. cmovc r10,r13 ; borrow => value was already below 2^130-5
  3698. cmovc r11,r14
  3699. cmovc r12,r15
  3700. add r10,QWORD[((0+160+16))+rbp] ; add the second 16-byte value; only 128 bits are kept
  3701. adc r11,QWORD[((8+160+16))+rbp]
  3702. movaps xmm6,XMMWORD[((0+0))+rbp] ; restore xmm6-xmm15 (callee-saved under the Microsoft x64 ABI)
  3703. movaps xmm7,XMMWORD[((16+0))+rbp]
  3704. movaps xmm8,XMMWORD[((32+0))+rbp]
  3705. movaps xmm9,XMMWORD[((48+0))+rbp]
  3706. movaps xmm10,XMMWORD[((64+0))+rbp]
  3707. movaps xmm11,XMMWORD[((80+0))+rbp]
  3708. movaps xmm12,XMMWORD[((96+0))+rbp]
  3709. movaps xmm13,XMMWORD[((112+0))+rbp]
  3710. movaps xmm14,XMMWORD[((128+0))+rbp]
  3711. movaps xmm15,XMMWORD[((144+0))+rbp]
  3712. add rsp,288 + 160 + 32 ; release the local frame
  3713. pop r9 ; saved output pointer (same slot read earlier via [288+160+32+rsp])
  3714. mov QWORD[r9],r10 ; store the 16-byte result
  3715. mov QWORD[8+r9],r11
  3716. pop r15
  3717. pop r14
  3718. pop r13
  3719. pop r12
  3720. pop rbx
  3721. pop rbp
  3722. mov rdi,QWORD[8+rsp] ;WIN64 epilogue
  3723. mov rsi,QWORD[16+rsp]
  3724. DB 0F3h,0C3h ;repret
  ; $L$seal_sse_128: run three ChaCha states side by side in SSE registers.
  ;   state A = xmm0,xmm4,xmm8,xmm12   state B = xmm1,xmm5,xmm9,xmm13
  ;   state C = xmm2,xmm6,xmm10,xmm14
  ; Row 0 is $L$chacha20_consts; rows 1-3 are loaded from [r9], [16+r9], [32+r9].
  ; State C keeps the loaded row 3, A uses it +1 and B +2 (lane 0, via $L$sse_inc).
  ; After 10 double rounds, rows 0/1 of state C are finished, row 0 is ANDed
  ; with $L$clamp, and both are stored at [160+rbp] / [160+16+rbp]. States A and
  ; B are finished in full. Then poly_hash_ad_internal is called and control
  ; passes to $L$seal_sse_128_tail_xor (both defined outside this view).
  ; NOTE(review): r9 presumably points at the 32-byte key followed by the
  ; 16-byte counter/nonce row - confirm against the caller.
  3725. $L$seal_sse_128:
  3726. movdqu xmm0,XMMWORD[$L$chacha20_consts]
  3727. movdqa xmm1,xmm0
  3728. movdqa xmm2,xmm0
  3729. movdqu xmm4,XMMWORD[r9]
  3730. movdqa xmm5,xmm4
  3731. movdqa xmm6,xmm4
  3732. movdqu xmm8,XMMWORD[16+r9]
  3733. movdqa xmm9,xmm8
  3734. movdqa xmm10,xmm8
  3735. movdqu xmm14,XMMWORD[32+r9] ; state C row 3 = value as loaded
  3736. movdqa xmm12,xmm14
  3737. paddd xmm12,XMMWORD[$L$sse_inc] ; state A row 3 = loaded + 1
  3738. movdqa xmm13,xmm12
  3739. paddd xmm13,XMMWORD[$L$sse_inc] ; state B row 3 = loaded + 2
  3740. movdqa xmm7,xmm4 ; keep initial rows 1, 2 and A's row 3 for the final add-back
  3741. movdqa xmm11,xmm8
  3742. movdqa xmm15,xmm12
  3743. mov r10,10 ; 10 iterations, each a column round followed by a diagonal round
  3744. $L$seal_sse_128_rounds:
  ; --- column round, state A (xmm3 is scratch for the rotates) ---
  3745. paddd xmm0,xmm4
  3746. pxor xmm12,xmm0
  3747. pshufb xmm12,XMMWORD[$L$rol16] ; rotate each dword left by 16
  3748. paddd xmm8,xmm12
  3749. pxor xmm4,xmm8
  3750. movdqa xmm3,xmm4
  3751. pslld xmm3,12
  3752. psrld xmm4,20
  3753. pxor xmm4,xmm3 ; rotate left by 12
  3754. paddd xmm0,xmm4
  3755. pxor xmm12,xmm0
  3756. pshufb xmm12,XMMWORD[$L$rol8] ; rotate each dword left by 8
  3757. paddd xmm8,xmm12
  3758. pxor xmm4,xmm8
  3759. movdqa xmm3,xmm4
  3760. pslld xmm3,7
  3761. psrld xmm4,25
  3762. pxor xmm4,xmm3 ; rotate left by 7
  3763. DB 102,15,58,15,228,4 ; encodes: palignr xmm4,xmm4,4
  3764. DB 102,69,15,58,15,192,8 ; encodes: palignr xmm8,xmm8,8
  3765. DB 102,69,15,58,15,228,12 ; encodes: palignr xmm12,xmm12,12
  ; --- column round, state B ---
  3766. paddd xmm1,xmm5
  3767. pxor xmm13,xmm1
  3768. pshufb xmm13,XMMWORD[$L$rol16]
  3769. paddd xmm9,xmm13
  3770. pxor xmm5,xmm9
  3771. movdqa xmm3,xmm5
  3772. pslld xmm3,12
  3773. psrld xmm5,20
  3774. pxor xmm5,xmm3
  3775. paddd xmm1,xmm5
  3776. pxor xmm13,xmm1
  3777. pshufb xmm13,XMMWORD[$L$rol8]
  3778. paddd xmm9,xmm13
  3779. pxor xmm5,xmm9
  3780. movdqa xmm3,xmm5
  3781. pslld xmm3,7
  3782. psrld xmm5,25
  3783. pxor xmm5,xmm3
  3784. DB 102,15,58,15,237,4 ; encodes: palignr xmm5,xmm5,4
  3785. DB 102,69,15,58,15,201,8 ; encodes: palignr xmm9,xmm9,8
  3786. DB 102,69,15,58,15,237,12 ; encodes: palignr xmm13,xmm13,12
  ; --- column round, state C ---
  3787. paddd xmm2,xmm6
  3788. pxor xmm14,xmm2
  3789. pshufb xmm14,XMMWORD[$L$rol16]
  3790. paddd xmm10,xmm14
  3791. pxor xmm6,xmm10
  3792. movdqa xmm3,xmm6
  3793. pslld xmm3,12
  3794. psrld xmm6,20
  3795. pxor xmm6,xmm3
  3796. paddd xmm2,xmm6
  3797. pxor xmm14,xmm2
  3798. pshufb xmm14,XMMWORD[$L$rol8]
  3799. paddd xmm10,xmm14
  3800. pxor xmm6,xmm10
  3801. movdqa xmm3,xmm6
  3802. pslld xmm3,7
  3803. psrld xmm6,25
  3804. pxor xmm6,xmm3
  3805. DB 102,15,58,15,246,4 ; encodes: palignr xmm6,xmm6,4
  3806. DB 102,69,15,58,15,210,8 ; encodes: palignr xmm10,xmm10,8
  3807. DB 102,69,15,58,15,246,12 ; encodes: palignr xmm14,xmm14,12
  ; --- diagonal round, state A (lane rotations below undo the ones above) ---
  3808. paddd xmm0,xmm4
  3809. pxor xmm12,xmm0
  3810. pshufb xmm12,XMMWORD[$L$rol16]
  3811. paddd xmm8,xmm12
  3812. pxor xmm4,xmm8
  3813. movdqa xmm3,xmm4
  3814. pslld xmm3,12
  3815. psrld xmm4,20
  3816. pxor xmm4,xmm3
  3817. paddd xmm0,xmm4
  3818. pxor xmm12,xmm0
  3819. pshufb xmm12,XMMWORD[$L$rol8]
  3820. paddd xmm8,xmm12
  3821. pxor xmm4,xmm8
  3822. movdqa xmm3,xmm4
  3823. pslld xmm3,7
  3824. psrld xmm4,25
  3825. pxor xmm4,xmm3
  3826. DB 102,15,58,15,228,12 ; encodes: palignr xmm4,xmm4,12
  3827. DB 102,69,15,58,15,192,8 ; encodes: palignr xmm8,xmm8,8
  3828. DB 102,69,15,58,15,228,4 ; encodes: palignr xmm12,xmm12,4
  ; --- diagonal round, state B ---
  3829. paddd xmm1,xmm5
  3830. pxor xmm13,xmm1
  3831. pshufb xmm13,XMMWORD[$L$rol16]
  3832. paddd xmm9,xmm13
  3833. pxor xmm5,xmm9
  3834. movdqa xmm3,xmm5
  3835. pslld xmm3,12
  3836. psrld xmm5,20
  3837. pxor xmm5,xmm3
  3838. paddd xmm1,xmm5
  3839. pxor xmm13,xmm1
  3840. pshufb xmm13,XMMWORD[$L$rol8]
  3841. paddd xmm9,xmm13
  3842. pxor xmm5,xmm9
  3843. movdqa xmm3,xmm5
  3844. pslld xmm3,7
  3845. psrld xmm5,25
  3846. pxor xmm5,xmm3
  3847. DB 102,15,58,15,237,12 ; encodes: palignr xmm5,xmm5,12
  3848. DB 102,69,15,58,15,201,8 ; encodes: palignr xmm9,xmm9,8
  3849. DB 102,69,15,58,15,237,4 ; encodes: palignr xmm13,xmm13,4
  ; --- diagonal round, state C ---
  3850. paddd xmm2,xmm6
  3851. pxor xmm14,xmm2
  3852. pshufb xmm14,XMMWORD[$L$rol16]
  3853. paddd xmm10,xmm14
  3854. pxor xmm6,xmm10
  3855. movdqa xmm3,xmm6
  3856. pslld xmm3,12
  3857. psrld xmm6,20
  3858. pxor xmm6,xmm3
  3859. paddd xmm2,xmm6
  3860. pxor xmm14,xmm2
  3861. pshufb xmm14,XMMWORD[$L$rol8]
  3862. paddd xmm10,xmm14
  3863. pxor xmm6,xmm10
  3864. movdqa xmm3,xmm6
  3865. pslld xmm3,7
  3866. psrld xmm6,25
  3867. pxor xmm6,xmm3
  3868. DB 102,15,58,15,246,12 ; encodes: palignr xmm6,xmm6,12
  3869. DB 102,69,15,58,15,210,8 ; encodes: palignr xmm10,xmm10,8
  3870. DB 102,69,15,58,15,246,4 ; encodes: palignr xmm14,xmm14,4
  3871. dec r10
  3872. jnz NEAR $L$seal_sse_128_rounds
  ; Add the initial state back in. Rows 2 and 3 of state C are not finished;
  ; only its rows 0 and 1 are used below.
  3873. paddd xmm0,XMMWORD[$L$chacha20_consts]
  3874. paddd xmm1,XMMWORD[$L$chacha20_consts]
  3875. paddd xmm2,XMMWORD[$L$chacha20_consts]
  3876. paddd xmm4,xmm7
  3877. paddd xmm5,xmm7
  3878. paddd xmm6,xmm7
  3879. paddd xmm8,xmm11
  3880. paddd xmm9,xmm11
  3881. paddd xmm12,xmm15 ; state A started from loaded row 3 + 1
  3882. paddd xmm15,XMMWORD[$L$sse_inc]
  3883. paddd xmm13,xmm15 ; state B started from loaded row 3 + 2
  3884. pand xmm2,XMMWORD[$L$clamp] ; clamp state C row 0 before it is used as the multiplier
  3885. movdqa XMMWORD[(160+0)+rbp],xmm2 ; multiplier words read by the hashing code at [160+rbp]
  3886. movdqa XMMWORD[(160+16)+rbp],xmm6 ; value added to the accumulator at the very end
  3887. mov r8,r8 ; register moved onto itself; r8 is unchanged
  3888. call poly_hash_ad_internal
  3889. jmp NEAR $L$seal_sse_128_tail_xor
  3890. $L$SEH_end_GFp_chacha20_poly1305_seal:
  3891. ALIGN 64
  3892. chacha20_poly1305_open_avx2:
  3893. vzeroupper
  3894. vmovdqa ymm0,YMMWORD[$L$chacha20_consts]
  3895. vbroadcasti128 ymm4,XMMWORD[r9]
  3896. vbroadcasti128 ymm8,XMMWORD[16+r9]
  3897. vbroadcasti128 ymm12,XMMWORD[32+r9]
  3898. vpaddd ymm12,ymm12,YMMWORD[$L$avx2_init]
  3899. cmp rbx,6*32
  3900. jbe NEAR $L$open_avx2_192
  3901. cmp rbx,10*32
  3902. jbe NEAR $L$open_avx2_320
  3903. vmovdqa YMMWORD[(160+64)+rbp],ymm4
  3904. vmovdqa YMMWORD[(160+96)+rbp],ymm8
  3905. vmovdqa YMMWORD[(160+160)+rbp],ymm12
  3906. mov r10,10
  3907. $L$open_avx2_init_rounds:
  3908. vpaddd ymm0,ymm0,ymm4
  3909. vpxor ymm12,ymm12,ymm0
  3910. vpshufb ymm12,ymm12,YMMWORD[$L$rol16]
  3911. vpaddd ymm8,ymm8,ymm12
  3912. vpxor ymm4,ymm4,ymm8
  3913. vpsrld ymm3,ymm4,20
  3914. vpslld ymm4,ymm4,12
  3915. vpxor ymm4,ymm4,ymm3
  3916. vpaddd ymm0,ymm0,ymm4
  3917. vpxor ymm12,ymm12,ymm0
  3918. vpshufb ymm12,ymm12,YMMWORD[$L$rol8]
  3919. vpaddd ymm8,ymm8,ymm12
  3920. vpxor ymm4,ymm4,ymm8
  3921. vpslld ymm3,ymm4,7
  3922. vpsrld ymm4,ymm4,25
  3923. vpxor ymm4,ymm4,ymm3
  3924. vpalignr ymm12,ymm12,ymm12,12
  3925. vpalignr ymm8,ymm8,ymm8,8
  3926. vpalignr ymm4,ymm4,ymm4,4
  3927. vpaddd ymm0,ymm0,ymm4
  3928. vpxor ymm12,ymm12,ymm0
  3929. vpshufb ymm12,ymm12,YMMWORD[$L$rol16]
  3930. vpaddd ymm8,ymm8,ymm12
  3931. vpxor ymm4,ymm4,ymm8
  3932. vpsrld ymm3,ymm4,20
  3933. vpslld ymm4,ymm4,12
  3934. vpxor ymm4,ymm4,ymm3
  3935. vpaddd ymm0,ymm0,ymm4
  3936. vpxor ymm12,ymm12,ymm0
  3937. vpshufb ymm12,ymm12,YMMWORD[$L$rol8]
  3938. vpaddd ymm8,ymm8,ymm12
  3939. vpxor ymm4,ymm4,ymm8
  3940. vpslld ymm3,ymm4,7
  3941. vpsrld ymm4,ymm4,25
  3942. vpxor ymm4,ymm4,ymm3
  3943. vpalignr ymm12,ymm12,ymm12,4
  3944. vpalignr ymm8,ymm8,ymm8,8
  3945. vpalignr ymm4,ymm4,ymm4,12
  3946. dec r10
  3947. jne NEAR $L$open_avx2_init_rounds
  3948. vpaddd ymm0,ymm0,YMMWORD[$L$chacha20_consts]
  3949. vpaddd ymm4,ymm4,YMMWORD[((160+64))+rbp]
  3950. vpaddd ymm8,ymm8,YMMWORD[((160+96))+rbp]
  3951. vpaddd ymm12,ymm12,YMMWORD[((160+160))+rbp]
  3952. vperm2i128 ymm3,ymm4,ymm0,0x02
  3953. vpand ymm3,ymm3,YMMWORD[$L$clamp]
  3954. vmovdqa YMMWORD[(160+0)+rbp],ymm3
  3955. vperm2i128 ymm0,ymm4,ymm0,0x13
  3956. vperm2i128 ymm4,ymm12,ymm8,0x13
  3957. mov r8,r8
  3958. call poly_hash_ad_internal
  3959. xor rcx,rcx
  3960. $L$open_avx2_init_hash:
  3961. add r10,QWORD[((0+0))+rcx*1+rsi]
  3962. adc r11,QWORD[((8+0))+rcx*1+rsi]
  3963. adc r12,1
  3964. mov rax,QWORD[((0+160+0))+rbp]
  3965. mov r15,rax
  3966. mul r10
  3967. mov r13,rax
  3968. mov r14,rdx
  3969. mov rax,QWORD[((0+160+0))+rbp]
  3970. mul r11
  3971. imul r15,r12
  3972. add r14,rax
  3973. adc r15,rdx
  3974. mov rax,QWORD[((8+160+0))+rbp]
  3975. mov r9,rax
  3976. mul r10
  3977. add r14,rax
  3978. adc rdx,0
  3979. mov r10,rdx
  3980. mov rax,QWORD[((8+160+0))+rbp]
  3981. mul r11
  3982. add r15,rax
  3983. adc rdx,0
  3984. imul r9,r12
  3985. add r15,r10
  3986. adc r9,rdx
  3987. mov r10,r13
  3988. mov r11,r14
  3989. mov r12,r15
  3990. and r12,3
  3991. mov r13,r15
  3992. and r13,-4
  3993. mov r14,r9
  3994. shrd r15,r9,2
  3995. shr r9,2
  3996. add r15,r13
  3997. adc r9,r14
  3998. add r10,r15
  3999. adc r11,r9
  4000. adc r12,0
  4001. add rcx,16
  4002. cmp rcx,2*32
  4003. jne NEAR $L$open_avx2_init_hash
  4004. vpxor ymm0,ymm0,YMMWORD[rsi]
  4005. vpxor ymm4,ymm4,YMMWORD[32+rsi]
  4006. vmovdqu YMMWORD[rdi],ymm0
  4007. vmovdqu YMMWORD[32+rdi],ymm4
  4008. lea rsi,[64+rsi]
  4009. lea rdi,[64+rdi]
  4010. sub rbx,2*32
  4011. $L$open_avx2_main_loop:
  4012. cmp rbx,16*32
  4013. jb NEAR $L$open_avx2_main_loop_done
  4014. vmovdqa ymm0,YMMWORD[$L$chacha20_consts]
  4015. vmovdqa ymm4,YMMWORD[((160+64))+rbp]
  4016. vmovdqa ymm8,YMMWORD[((160+96))+rbp]
  4017. vmovdqa ymm1,ymm0
  4018. vmovdqa ymm5,ymm4
  4019. vmovdqa ymm9,ymm8
  4020. vmovdqa ymm2,ymm0
  4021. vmovdqa ymm6,ymm4
  4022. vmovdqa ymm10,ymm8
  4023. vmovdqa ymm3,ymm0
  4024. vmovdqa ymm7,ymm4
  4025. vmovdqa ymm11,ymm8
  4026. vmovdqa ymm12,YMMWORD[$L$avx2_inc]
  4027. vpaddd ymm15,ymm12,YMMWORD[((160+160))+rbp]
  4028. vpaddd ymm14,ymm12,ymm15
  4029. vpaddd ymm13,ymm12,ymm14
  4030. vpaddd ymm12,ymm12,ymm13
  4031. vmovdqa YMMWORD[(160+256)+rbp],ymm15
  4032. vmovdqa YMMWORD[(160+224)+rbp],ymm14
  4033. vmovdqa YMMWORD[(160+192)+rbp],ymm13
  4034. vmovdqa YMMWORD[(160+160)+rbp],ymm12
  4035. xor rcx,rcx
  4036. $L$open_avx2_main_loop_rounds:
  4037. add r10,QWORD[((0+0))+rcx*1+rsi]
  4038. adc r11,QWORD[((8+0))+rcx*1+rsi]
  4039. adc r12,1
  4040. vmovdqa YMMWORD[(160+128)+rbp],ymm8
  4041. vmovdqa ymm8,YMMWORD[$L$rol16]
  4042. vpaddd ymm3,ymm3,ymm7
  4043. vpaddd ymm2,ymm2,ymm6
  4044. vpaddd ymm1,ymm1,ymm5
  4045. vpaddd ymm0,ymm0,ymm4
  4046. vpxor ymm15,ymm15,ymm3
  4047. vpxor ymm14,ymm14,ymm2
  4048. vpxor ymm13,ymm13,ymm1
  4049. vpxor ymm12,ymm12,ymm0
  4050. mov rdx,QWORD[((0+160+0))+rbp]
  4051. mov r15,rdx
  4052. mulx r14,r13,r10
  4053. mulx rdx,rax,r11
  4054. imul r15,r12
  4055. add r14,rax
  4056. adc r15,rdx
  4057. vpshufb ymm15,ymm15,ymm8
  4058. vpshufb ymm14,ymm14,ymm8
  4059. vpshufb ymm13,ymm13,ymm8
  4060. vpshufb ymm12,ymm12,ymm8
  4061. vpaddd ymm11,ymm11,ymm15
  4062. vpaddd ymm10,ymm10,ymm14
  4063. vpaddd ymm9,ymm9,ymm13
  4064. vpaddd ymm8,ymm12,YMMWORD[((160+128))+rbp]
  4065. vpxor ymm7,ymm7,ymm11
  4066. mov rdx,QWORD[((8+160+0))+rbp]
  4067. mulx rax,r10,r10
  4068. add r14,r10
  4069. mulx r9,r11,r11
  4070. adc r15,r11
  4071. adc r9,0
  4072. imul rdx,r12
  4073. vpxor ymm6,ymm6,ymm10
  4074. vpxor ymm5,ymm5,ymm9
  4075. vpxor ymm4,ymm4,ymm8
  4076. vmovdqa YMMWORD[(160+128)+rbp],ymm8
  4077. vpsrld ymm8,ymm7,20
  4078. vpslld ymm7,ymm7,32-20
  4079. vpxor ymm7,ymm7,ymm8
  4080. vpsrld ymm8,ymm6,20
  4081. vpslld ymm6,ymm6,32-20
  4082. vpxor ymm6,ymm6,ymm8
  4083. vpsrld ymm8,ymm5,20
  4084. vpslld ymm5,ymm5,32-20
  4085. add r15,rax
  4086. adc r9,rdx
  4087. vpxor ymm5,ymm5,ymm8
  4088. vpsrld ymm8,ymm4,20
  4089. vpslld ymm4,ymm4,32-20
  4090. vpxor ymm4,ymm4,ymm8
  4091. vmovdqa ymm8,YMMWORD[$L$rol8]
  4092. vpaddd ymm3,ymm3,ymm7
  4093. vpaddd ymm2,ymm2,ymm6
  4094. vpaddd ymm1,ymm1,ymm5
  4095. vpaddd ymm0,ymm0,ymm4
  4096. vpxor ymm15,ymm15,ymm3
  4097. mov r10,r13
  4098. mov r11,r14
  4099. mov r12,r15
  4100. and r12,3
  4101. mov r13,r15
  4102. and r13,-4
  4103. mov r14,r9
  4104. shrd r15,r9,2
  4105. shr r9,2
  4106. add r15,r13
  4107. adc r9,r14
  4108. add r10,r15
  4109. adc r11,r9
  4110. adc r12,0
  4111. vpxor ymm14,ymm14,ymm2
  4112. vpxor ymm13,ymm13,ymm1
  4113. vpxor ymm12,ymm12,ymm0
  4114. vpshufb ymm15,ymm15,ymm8
  4115. vpshufb ymm14,ymm14,ymm8
  4116. vpshufb ymm13,ymm13,ymm8
  4117. vpshufb ymm12,ymm12,ymm8
  4118. vpaddd ymm11,ymm11,ymm15
  4119. vpaddd ymm10,ymm10,ymm14
  4120. add r10,QWORD[((0+16))+rcx*1+rsi]
  4121. adc r11,QWORD[((8+16))+rcx*1+rsi]
  4122. adc r12,1
  4123. vpaddd ymm9,ymm9,ymm13
  4124. vpaddd ymm8,ymm12,YMMWORD[((160+128))+rbp]
  4125. vpxor ymm7,ymm7,ymm11
  4126. vpxor ymm6,ymm6,ymm10
  4127. vpxor ymm5,ymm5,ymm9
  4128. vpxor ymm4,ymm4,ymm8
  4129. vmovdqa YMMWORD[(160+128)+rbp],ymm8
  4130. vpsrld ymm8,ymm7,25
  4131. mov rdx,QWORD[((0+160+0))+rbp]
  4132. mov r15,rdx
  4133. mulx r14,r13,r10
  4134. mulx rdx,rax,r11
  4135. imul r15,r12
  4136. add r14,rax
  4137. adc r15,rdx
  4138. vpslld ymm7,ymm7,32-25
  4139. vpxor ymm7,ymm7,ymm8
  4140. vpsrld ymm8,ymm6,25
  4141. vpslld ymm6,ymm6,32-25
  4142. vpxor ymm6,ymm6,ymm8
  4143. vpsrld ymm8,ymm5,25
  4144. vpslld ymm5,ymm5,32-25
  4145. vpxor ymm5,ymm5,ymm8
  4146. vpsrld ymm8,ymm4,25
  4147. vpslld ymm4,ymm4,32-25
  4148. vpxor ymm4,ymm4,ymm8
  4149. vmovdqa ymm8,YMMWORD[((160+128))+rbp]
  4150. vpalignr ymm7,ymm7,ymm7,4
  4151. vpalignr ymm11,ymm11,ymm11,8
  4152. vpalignr ymm15,ymm15,ymm15,12
  4153. vpalignr ymm6,ymm6,ymm6,4
  4154. vpalignr ymm10,ymm10,ymm10,8
  4155. vpalignr ymm14,ymm14,ymm14,12
  4156. mov rdx,QWORD[((8+160+0))+rbp]
  4157. mulx rax,r10,r10
  4158. add r14,r10
  4159. mulx r9,r11,r11
  4160. adc r15,r11
  4161. adc r9,0
  4162. imul rdx,r12
  4163. vpalignr ymm5,ymm5,ymm5,4
  4164. vpalignr ymm9,ymm9,ymm9,8
  4165. vpalignr ymm13,ymm13,ymm13,12
  4166. vpalignr ymm4,ymm4,ymm4,4
  4167. vpalignr ymm8,ymm8,ymm8,8
  4168. vpalignr ymm12,ymm12,ymm12,12
  4169. vmovdqa YMMWORD[(160+128)+rbp],ymm8
  4170. vmovdqa ymm8,YMMWORD[$L$rol16]
  4171. vpaddd ymm3,ymm3,ymm7
  4172. vpaddd ymm2,ymm2,ymm6
  4173. vpaddd ymm1,ymm1,ymm5
  4174. vpaddd ymm0,ymm0,ymm4
  4175. vpxor ymm15,ymm15,ymm3
  4176. vpxor ymm14,ymm14,ymm2
  4177. vpxor ymm13,ymm13,ymm1
  4178. vpxor ymm12,ymm12,ymm0
  4179. vpshufb ymm15,ymm15,ymm8
  4180. vpshufb ymm14,ymm14,ymm8
  4181. add r15,rax
  4182. adc r9,rdx
  4183. vpshufb ymm13,ymm13,ymm8
  4184. vpshufb ymm12,ymm12,ymm8
  4185. vpaddd ymm11,ymm11,ymm15
  4186. vpaddd ymm10,ymm10,ymm14
  4187. vpaddd ymm9,ymm9,ymm13
  4188. vpaddd ymm8,ymm12,YMMWORD[((160+128))+rbp]
  4189. vpxor ymm7,ymm7,ymm11
  4190. vpxor ymm6,ymm6,ymm10
  4191. vpxor ymm5,ymm5,ymm9
  4192. mov r10,r13
  4193. mov r11,r14
  4194. mov r12,r15
  4195. and r12,3
  4196. mov r13,r15
  4197. and r13,-4
  4198. mov r14,r9
  4199. shrd r15,r9,2
  4200. shr r9,2
  4201. add r15,r13
  4202. adc r9,r14
  4203. add r10,r15
  4204. adc r11,r9
  4205. adc r12,0
  4206. vpxor ymm4,ymm4,ymm8
  4207. vmovdqa YMMWORD[(160+128)+rbp],ymm8
  4208. vpsrld ymm8,ymm7,20
  4209. vpslld ymm7,ymm7,32-20
  4210. vpxor ymm7,ymm7,ymm8
  4211. vpsrld ymm8,ymm6,20
  4212. vpslld ymm6,ymm6,32-20
  4213. vpxor ymm6,ymm6,ymm8
  4214. add r10,QWORD[((0+32))+rcx*1+rsi]
  4215. adc r11,QWORD[((8+32))+rcx*1+rsi]
  4216. adc r12,1
  4217. lea rcx,[48+rcx]
  4218. vpsrld ymm8,ymm5,20
  4219. vpslld ymm5,ymm5,32-20
  4220. vpxor ymm5,ymm5,ymm8
  4221. vpsrld ymm8,ymm4,20
  4222. vpslld ymm4,ymm4,32-20
  4223. vpxor ymm4,ymm4,ymm8
  4224. vmovdqa ymm8,YMMWORD[$L$rol8]
  4225. vpaddd ymm3,ymm3,ymm7
  4226. vpaddd ymm2,ymm2,ymm6
  4227. vpaddd ymm1,ymm1,ymm5
  4228. vpaddd ymm0,ymm0,ymm4
  4229. vpxor ymm15,ymm15,ymm3
  4230. vpxor ymm14,ymm14,ymm2
  4231. vpxor ymm13,ymm13,ymm1
  4232. vpxor ymm12,ymm12,ymm0
  4233. vpshufb ymm15,ymm15,ymm8
  4234. vpshufb ymm14,ymm14,ymm8
  4235. vpshufb ymm13,ymm13,ymm8
  4236. mov rdx,QWORD[((0+160+0))+rbp]
  4237. mov r15,rdx
  4238. mulx r14,r13,r10
  4239. mulx rdx,rax,r11
  4240. imul r15,r12
  4241. add r14,rax
  4242. adc r15,rdx
  4243. vpshufb ymm12,ymm12,ymm8
  4244. vpaddd ymm11,ymm11,ymm15
  4245. vpaddd ymm10,ymm10,ymm14
  4246. vpaddd ymm9,ymm9,ymm13
  4247. vpaddd ymm8,ymm12,YMMWORD[((160+128))+rbp]
  4248. vpxor ymm7,ymm7,ymm11
  4249. vpxor ymm6,ymm6,ymm10
  4250. vpxor ymm5,ymm5,ymm9
  4251. mov rdx,QWORD[((8+160+0))+rbp]
  4252. mulx rax,r10,r10
  4253. add r14,r10
  4254. mulx r9,r11,r11
  4255. adc r15,r11
  4256. adc r9,0
  4257. imul rdx,r12
  4258. vpxor ymm4,ymm4,ymm8
  4259. vmovdqa YMMWORD[(160+128)+rbp],ymm8
  4260. vpsrld ymm8,ymm7,25
  4261. vpslld ymm7,ymm7,32-25
  4262. vpxor ymm7,ymm7,ymm8
  4263. vpsrld ymm8,ymm6,25
  4264. vpslld ymm6,ymm6,32-25
  4265. vpxor ymm6,ymm6,ymm8
  4266. add r15,rax
  4267. adc r9,rdx
  4268. vpsrld ymm8,ymm5,25
  4269. vpslld ymm5,ymm5,32-25
  4270. vpxor ymm5,ymm5,ymm8
  4271. vpsrld ymm8,ymm4,25
  4272. vpslld ymm4,ymm4,32-25
  4273. vpxor ymm4,ymm4,ymm8
  4274. vmovdqa ymm8,YMMWORD[((160+128))+rbp]
  4275. vpalignr ymm7,ymm7,ymm7,12
  4276. vpalignr ymm11,ymm11,ymm11,8
  4277. vpalignr ymm15,ymm15,ymm15,4
  4278. vpalignr ymm6,ymm6,ymm6,12
  4279. vpalignr ymm10,ymm10,ymm10,8
  4280. vpalignr ymm14,ymm14,ymm14,4
  4281. vpalignr ymm5,ymm5,ymm5,12
  4282. vpalignr ymm9,ymm9,ymm9,8
  4283. vpalignr ymm13,ymm13,ymm13,4
  4284. vpalignr ymm4,ymm4,ymm4,12
  4285. vpalignr ymm8,ymm8,ymm8,8
  4286. mov r10,r13
  4287. mov r11,r14
  4288. mov r12,r15
  4289. and r12,3
  4290. mov r13,r15
  4291. and r13,-4
  4292. mov r14,r9
  4293. shrd r15,r9,2
  4294. shr r9,2
  4295. add r15,r13
  4296. adc r9,r14
  4297. add r10,r15
  4298. adc r11,r9
  4299. adc r12,0
  4300. vpalignr ymm12,ymm12,ymm12,4
  4301. cmp rcx,10*6*8
  4302. jne NEAR $L$open_avx2_main_loop_rounds
  4303. vpaddd ymm3,ymm3,YMMWORD[$L$chacha20_consts]
  4304. vpaddd ymm7,ymm7,YMMWORD[((160+64))+rbp]
  4305. vpaddd ymm11,ymm11,YMMWORD[((160+96))+rbp]
  4306. vpaddd ymm15,ymm15,YMMWORD[((160+256))+rbp]
  4307. vpaddd ymm2,ymm2,YMMWORD[$L$chacha20_consts]
  4308. vpaddd ymm6,ymm6,YMMWORD[((160+64))+rbp]
  4309. vpaddd ymm10,ymm10,YMMWORD[((160+96))+rbp]
  4310. vpaddd ymm14,ymm14,YMMWORD[((160+224))+rbp]
  4311. vpaddd ymm1,ymm1,YMMWORD[$L$chacha20_consts]
  4312. vpaddd ymm5,ymm5,YMMWORD[((160+64))+rbp]
  4313. vpaddd ymm9,ymm9,YMMWORD[((160+96))+rbp]
  4314. vpaddd ymm13,ymm13,YMMWORD[((160+192))+rbp]
  4315. vpaddd ymm0,ymm0,YMMWORD[$L$chacha20_consts]
  4316. vpaddd ymm4,ymm4,YMMWORD[((160+64))+rbp]
  4317. vpaddd ymm8,ymm8,YMMWORD[((160+96))+rbp]
  4318. vpaddd ymm12,ymm12,YMMWORD[((160+160))+rbp]
  4319. vmovdqa YMMWORD[(160+128)+rbp],ymm0
  4320. add r10,QWORD[((0+480))+rsi]
  4321. adc r11,QWORD[((8+480))+rsi]
  4322. adc r12,1
  4323. vperm2i128 ymm0,ymm7,ymm3,0x02
  4324. vperm2i128 ymm7,ymm7,ymm3,0x13
  4325. vperm2i128 ymm3,ymm15,ymm11,0x02
  4326. vperm2i128 ymm11,ymm15,ymm11,0x13
  4327. vpxor ymm0,ymm0,YMMWORD[((0+0))+rsi]
  4328. vpxor ymm3,ymm3,YMMWORD[((32+0))+rsi]
  4329. vpxor ymm7,ymm7,YMMWORD[((64+0))+rsi]
  4330. vpxor ymm11,ymm11,YMMWORD[((96+0))+rsi]
  4331. vmovdqu YMMWORD[(0+0)+rdi],ymm0
  4332. vmovdqu YMMWORD[(32+0)+rdi],ymm3
  4333. vmovdqu YMMWORD[(64+0)+rdi],ymm7
  4334. vmovdqu YMMWORD[(96+0)+rdi],ymm11
  4335. vmovdqa ymm0,YMMWORD[((160+128))+rbp]
  4336. mov rax,QWORD[((0+160+0))+rbp]
  4337. mov r15,rax
  4338. mul r10
  4339. mov r13,rax
  4340. mov r14,rdx
  4341. mov rax,QWORD[((0+160+0))+rbp]
  4342. mul r11
  4343. imul r15,r12
  4344. add r14,rax
  4345. adc r15,rdx
  4346. mov rax,QWORD[((8+160+0))+rbp]
  4347. mov r9,rax
  4348. mul r10
  4349. add r14,rax
  4350. adc rdx,0
  4351. mov r10,rdx
  4352. mov rax,QWORD[((8+160+0))+rbp]
  4353. mul r11
  4354. add r15,rax
  4355. adc rdx,0
  4356. imul r9,r12
  4357. add r15,r10
  4358. adc r9,rdx
  4359. mov r10,r13
  4360. mov r11,r14
  4361. mov r12,r15
  4362. and r12,3
  4363. mov r13,r15
  4364. and r13,-4
  4365. mov r14,r9
  4366. shrd r15,r9,2
  4367. shr r9,2
  4368. add r15,r13
  4369. adc r9,r14
  4370. add r10,r15
  4371. adc r11,r9
  4372. adc r12,0
  4373. vperm2i128 ymm3,ymm6,ymm2,0x02
  4374. vperm2i128 ymm6,ymm6,ymm2,0x13
  4375. vperm2i128 ymm2,ymm14,ymm10,0x02
  4376. vperm2i128 ymm10,ymm14,ymm10,0x13
  4377. vpxor ymm3,ymm3,YMMWORD[((0+128))+rsi]
  4378. vpxor ymm2,ymm2,YMMWORD[((32+128))+rsi]
  4379. vpxor ymm6,ymm6,YMMWORD[((64+128))+rsi]
  4380. vpxor ymm10,ymm10,YMMWORD[((96+128))+rsi]
  4381. vmovdqu YMMWORD[(0+128)+rdi],ymm3
  4382. vmovdqu YMMWORD[(32+128)+rdi],ymm2
  4383. vmovdqu YMMWORD[(64+128)+rdi],ymm6
  4384. vmovdqu YMMWORD[(96+128)+rdi],ymm10
  4385. add r10,QWORD[((0+480+16))+rsi]
  4386. adc r11,QWORD[((8+480+16))+rsi]
  4387. adc r12,1
  4388. vperm2i128 ymm3,ymm5,ymm1,0x02
  4389. vperm2i128 ymm5,ymm5,ymm1,0x13
  4390. vperm2i128 ymm1,ymm13,ymm9,0x02
  4391. vperm2i128 ymm9,ymm13,ymm9,0x13
  4392. vpxor ymm3,ymm3,YMMWORD[((0+256))+rsi]
  4393. vpxor ymm1,ymm1,YMMWORD[((32+256))+rsi]
  4394. vpxor ymm5,ymm5,YMMWORD[((64+256))+rsi]
  4395. vpxor ymm9,ymm9,YMMWORD[((96+256))+rsi]
  4396. vmovdqu YMMWORD[(0+256)+rdi],ymm3
  4397. vmovdqu YMMWORD[(32+256)+rdi],ymm1
  4398. vmovdqu YMMWORD[(64+256)+rdi],ymm5
  4399. vmovdqu YMMWORD[(96+256)+rdi],ymm9
  4400. mov rax,QWORD[((0+160+0))+rbp]
  4401. mov r15,rax
  4402. mul r10
  4403. mov r13,rax
  4404. mov r14,rdx
  4405. mov rax,QWORD[((0+160+0))+rbp]
  4406. mul r11
  4407. imul r15,r12
  4408. add r14,rax
  4409. adc r15,rdx
  4410. mov rax,QWORD[((8+160+0))+rbp]
  4411. mov r9,rax
  4412. mul r10
  4413. add r14,rax
  4414. adc rdx,0
  4415. mov r10,rdx
  4416. mov rax,QWORD[((8+160+0))+rbp]
  4417. mul r11
  4418. add r15,rax
  4419. adc rdx,0
  4420. imul r9,r12
  4421. add r15,r10
  4422. adc r9,rdx
  4423. mov r10,r13
  4424. mov r11,r14
  4425. mov r12,r15
  4426. and r12,3
  4427. mov r13,r15
  4428. and r13,-4
  4429. mov r14,r9
  4430. shrd r15,r9,2
  4431. shr r9,2
  4432. add r15,r13
  4433. adc r9,r14
  4434. add r10,r15
  4435. adc r11,r9
  4436. adc r12,0
  4437. vperm2i128 ymm3,ymm4,ymm0,0x02
  4438. vperm2i128 ymm4,ymm4,ymm0,0x13
  4439. vperm2i128 ymm0,ymm12,ymm8,0x02
  4440. vperm2i128 ymm8,ymm12,ymm8,0x13
  4441. vpxor ymm3,ymm3,YMMWORD[((0+384))+rsi]
  4442. vpxor ymm0,ymm0,YMMWORD[((32+384))+rsi]
  4443. vpxor ymm4,ymm4,YMMWORD[((64+384))+rsi]
  4444. vpxor ymm8,ymm8,YMMWORD[((96+384))+rsi]
  4445. vmovdqu YMMWORD[(0+384)+rdi],ymm3
  4446. vmovdqu YMMWORD[(32+384)+rdi],ymm0
  4447. vmovdqu YMMWORD[(64+384)+rdi],ymm4
  4448. vmovdqu YMMWORD[(96+384)+rdi],ymm8
  4449. lea rsi,[512+rsi]
  4450. lea rdi,[512+rdi]
  4451. sub rbx,16*32
  4452. jmp NEAR $L$open_avx2_main_loop
  4453. $L$open_avx2_main_loop_done:
  4454. test rbx,rbx
  4455. vzeroupper
  4456. je NEAR $L$open_sse_finalize
  4457. cmp rbx,12*32
  4458. ja NEAR $L$open_avx2_tail_512
  4459. cmp rbx,8*32
  4460. ja NEAR $L$open_avx2_tail_384
  4461. cmp rbx,4*32
  4462. ja NEAR $L$open_avx2_tail_256
  4463. vmovdqa ymm0,YMMWORD[$L$chacha20_consts]
  4464. vmovdqa ymm4,YMMWORD[((160+64))+rbp]
  4465. vmovdqa ymm8,YMMWORD[((160+96))+rbp]
  4466. vmovdqa ymm12,YMMWORD[$L$avx2_inc]
  4467. vpaddd ymm12,ymm12,YMMWORD[((160+160))+rbp]
  4468. vmovdqa YMMWORD[(160+160)+rbp],ymm12
  4469. xor r8,r8
  4470. mov rcx,rbx
  4471. and rcx,-16
  4472. test rcx,rcx
  4473. je NEAR $L$open_avx2_tail_128_rounds
  4474. $L$open_avx2_tail_128_rounds_and_x1hash:
  4475. add r10,QWORD[((0+0))+r8*1+rsi]
  4476. adc r11,QWORD[((8+0))+r8*1+rsi]
  4477. adc r12,1
  4478. mov rax,QWORD[((0+160+0))+rbp]
  4479. mov r15,rax
  4480. mul r10
  4481. mov r13,rax
  4482. mov r14,rdx
  4483. mov rax,QWORD[((0+160+0))+rbp]
  4484. mul r11
  4485. imul r15,r12
  4486. add r14,rax
  4487. adc r15,rdx
  4488. mov rax,QWORD[((8+160+0))+rbp]
  4489. mov r9,rax
  4490. mul r10
  4491. add r14,rax
  4492. adc rdx,0
  4493. mov r10,rdx
  4494. mov rax,QWORD[((8+160+0))+rbp]
  4495. mul r11
  4496. add r15,rax
  4497. adc rdx,0
  4498. imul r9,r12
  4499. add r15,r10
  4500. adc r9,rdx
  4501. mov r10,r13
  4502. mov r11,r14
  4503. mov r12,r15
  4504. and r12,3
  4505. mov r13,r15
  4506. and r13,-4
  4507. mov r14,r9
  4508. shrd r15,r9,2
  4509. shr r9,2
  4510. add r15,r13
  4511. adc r9,r14
  4512. add r10,r15
  4513. adc r11,r9
  4514. adc r12,0
  4515. $L$open_avx2_tail_128_rounds:
  4516. add r8,16
  4517. vpaddd ymm0,ymm0,ymm4
  4518. vpxor ymm12,ymm12,ymm0
  4519. vpshufb ymm12,ymm12,YMMWORD[$L$rol16]
  4520. vpaddd ymm8,ymm8,ymm12
  4521. vpxor ymm4,ymm4,ymm8
  4522. vpsrld ymm3,ymm4,20
  4523. vpslld ymm4,ymm4,12
  4524. vpxor ymm4,ymm4,ymm3
  4525. vpaddd ymm0,ymm0,ymm4
  4526. vpxor ymm12,ymm12,ymm0
  4527. vpshufb ymm12,ymm12,YMMWORD[$L$rol8]
  4528. vpaddd ymm8,ymm8,ymm12
  4529. vpxor ymm4,ymm4,ymm8
  4530. vpslld ymm3,ymm4,7
  4531. vpsrld ymm4,ymm4,25
  4532. vpxor ymm4,ymm4,ymm3
  4533. vpalignr ymm12,ymm12,ymm12,12
  4534. vpalignr ymm8,ymm8,ymm8,8
  4535. vpalignr ymm4,ymm4,ymm4,4
  4536. vpaddd ymm0,ymm0,ymm4
  4537. vpxor ymm12,ymm12,ymm0
  4538. vpshufb ymm12,ymm12,YMMWORD[$L$rol16]
  4539. vpaddd ymm8,ymm8,ymm12
  4540. vpxor ymm4,ymm4,ymm8
  4541. vpsrld ymm3,ymm4,20
  4542. vpslld ymm4,ymm4,12
  4543. vpxor ymm4,ymm4,ymm3
  4544. vpaddd ymm0,ymm0,ymm4
  4545. vpxor ymm12,ymm12,ymm0
  4546. vpshufb ymm12,ymm12,YMMWORD[$L$rol8]
  4547. vpaddd ymm8,ymm8,ymm12
  4548. vpxor ymm4,ymm4,ymm8
  4549. vpslld ymm3,ymm4,7
  4550. vpsrld ymm4,ymm4,25
  4551. vpxor ymm4,ymm4,ymm3
  4552. vpalignr ymm12,ymm12,ymm12,4
  4553. vpalignr ymm8,ymm8,ymm8,8
  4554. vpalignr ymm4,ymm4,ymm4,12
  4555. cmp r8,rcx
  4556. jb NEAR $L$open_avx2_tail_128_rounds_and_x1hash
  4557. cmp r8,160
  4558. jne NEAR $L$open_avx2_tail_128_rounds
  4559. vpaddd ymm0,ymm0,YMMWORD[$L$chacha20_consts]
  4560. vpaddd ymm4,ymm4,YMMWORD[((160+64))+rbp]
  4561. vpaddd ymm8,ymm8,YMMWORD[((160+96))+rbp]
  4562. vpaddd ymm12,ymm12,YMMWORD[((160+160))+rbp]
  4563. vperm2i128 ymm3,ymm4,ymm0,0x13
  4564. vperm2i128 ymm0,ymm4,ymm0,0x02
  4565. vperm2i128 ymm4,ymm12,ymm8,0x02
  4566. vperm2i128 ymm12,ymm12,ymm8,0x13
  4567. vmovdqa ymm8,ymm3
  4568. jmp NEAR $L$open_avx2_tail_128_xor
  4569. $L$open_avx2_tail_256:
  4570. vmovdqa ymm0,YMMWORD[$L$chacha20_consts]
  4571. vmovdqa ymm4,YMMWORD[((160+64))+rbp]
  4572. vmovdqa ymm8,YMMWORD[((160+96))+rbp]
  4573. vmovdqa ymm1,ymm0
  4574. vmovdqa ymm5,ymm4
  4575. vmovdqa ymm9,ymm8
  4576. vmovdqa ymm12,YMMWORD[$L$avx2_inc]
  4577. vpaddd ymm13,ymm12,YMMWORD[((160+160))+rbp]
  4578. vpaddd ymm12,ymm12,ymm13
  4579. vmovdqa YMMWORD[(160+160)+rbp],ymm12
  4580. vmovdqa YMMWORD[(160+192)+rbp],ymm13
  4581. mov QWORD[((160+128))+rbp],rbx
  4582. mov rcx,rbx
  4583. sub rcx,4*32
  4584. shr rcx,4
  4585. mov r8,10
  4586. cmp rcx,10
  4587. cmovg rcx,r8
  4588. mov rbx,rsi
  4589. xor r8,r8
  4590. $L$open_avx2_tail_256_rounds_and_x1hash: ; absorb one 16-byte block at rbx into r10:r11:r12, then fall into the rounds
  4591. add r10,QWORD[((0+0))+rbx] ; accumulator += 16 bytes read from rbx ...
  4592. adc r11,QWORD[((8+0))+rbx]
  4593. adc r12,1 ; ... plus 1 in the third limb
  4594. mov rdx,QWORD[((0+160+0))+rbp] ; low qword of the multiplier kept at rbp+160 (implicit mulx source)
  4595. mov r15,rdx
  4596. mulx r14,r13,r10
  4597. mulx rdx,rax,r11
  4598. imul r15,r12
  4599. add r14,rax
  4600. adc r15,rdx
  4601. mov rdx,QWORD[((8+160+0))+rbp] ; high qword of the multiplier
  4602. mulx rax,r10,r10
  4603. add r14,r10
  4604. mulx r9,r11,r11
  4605. adc r15,r11
  4606. adc r9,0
  4607. imul rdx,r12
  4608. add r15,rax
  4609. adc r9,rdx ; product now in r13 (low), r14, r15, r9 (high)
  4610. mov r10,r13
  4611. mov r11,r14
  4612. mov r12,r15
  4613. and r12,3 ; third limb keeps only its low 2 bits
  4614. mov r13,r15
  4615. and r13,-4 ; r14:r13 = upper part with low 2 bits cleared (4*c)
  4616. mov r14,r9
  4617. shrd r15,r9,2
  4618. shr r9,2 ; r9:r15 = upper part shifted right by 2 (c)
  4619. add r15,r13
  4620. adc r9,r14 ; r9:r15 = 4*c + c
  4621. add r10,r15 ; fold it back into the accumulator
  4622. adc r11,r9
  4623. adc r12,0
  4624. lea rbx,[16+rbx] ; advance hash pointer by one block
  4625. $L$open_avx2_tail_256_rounds: ; one loop iteration: two passes of add/xor/rotate over both states
  4626. vpaddd ymm0,ymm0,ymm4 ; first pass, state (ymm0,4,8,12)
  4627. vpxor ymm12,ymm12,ymm0
  4628. vpshufb ymm12,ymm12,YMMWORD[$L$rol16] ; byte shuffle via the rol16 table
  4629. vpaddd ymm8,ymm8,ymm12
  4630. vpxor ymm4,ymm4,ymm8
  4631. vpsrld ymm3,ymm4,20 ; ymm3 is scratch; the shift pair rotates each dword left by 12
  4632. vpslld ymm4,ymm4,12
  4633. vpxor ymm4,ymm4,ymm3
  4634. vpaddd ymm0,ymm0,ymm4
  4635. vpxor ymm12,ymm12,ymm0
  4636. vpshufb ymm12,ymm12,YMMWORD[$L$rol8] ; byte shuffle via the rol8 table
  4637. vpaddd ymm8,ymm8,ymm12
  4638. vpxor ymm4,ymm4,ymm8
  4639. vpslld ymm3,ymm4,7 ; the shift pair rotates each dword left by 7
  4640. vpsrld ymm4,ymm4,25
  4641. vpxor ymm4,ymm4,ymm3
  4642. vpalignr ymm12,ymm12,ymm12,12 ; rotate rows within each 128-bit lane by 12/8/4 bytes
  4643. vpalignr ymm8,ymm8,ymm8,8
  4644. vpalignr ymm4,ymm4,ymm4,4
  4645. vpaddd ymm1,ymm1,ymm5 ; first pass, state (ymm1,5,9,13)
  4646. vpxor ymm13,ymm13,ymm1
  4647. vpshufb ymm13,ymm13,YMMWORD[$L$rol16]
  4648. vpaddd ymm9,ymm9,ymm13
  4649. vpxor ymm5,ymm5,ymm9
  4650. vpsrld ymm3,ymm5,20
  4651. vpslld ymm5,ymm5,12
  4652. vpxor ymm5,ymm5,ymm3
  4653. vpaddd ymm1,ymm1,ymm5
  4654. vpxor ymm13,ymm13,ymm1
  4655. vpshufb ymm13,ymm13,YMMWORD[$L$rol8]
  4656. vpaddd ymm9,ymm9,ymm13
  4657. vpxor ymm5,ymm5,ymm9
  4658. vpslld ymm3,ymm5,7
  4659. vpsrld ymm5,ymm5,25
  4660. vpxor ymm5,ymm5,ymm3
  4661. vpalignr ymm13,ymm13,ymm13,12
  4662. vpalignr ymm9,ymm9,ymm9,8
  4663. vpalignr ymm5,ymm5,ymm5,4
  4664. inc r8 ; count this iteration
  4665. vpaddd ymm0,ymm0,ymm4 ; second pass, state (ymm0,4,8,12)
  4666. vpxor ymm12,ymm12,ymm0
  4667. vpshufb ymm12,ymm12,YMMWORD[$L$rol16]
  4668. vpaddd ymm8,ymm8,ymm12
  4669. vpxor ymm4,ymm4,ymm8
  4670. vpsrld ymm3,ymm4,20
  4671. vpslld ymm4,ymm4,12
  4672. vpxor ymm4,ymm4,ymm3
  4673. vpaddd ymm0,ymm0,ymm4
  4674. vpxor ymm12,ymm12,ymm0
  4675. vpshufb ymm12,ymm12,YMMWORD[$L$rol8]
  4676. vpaddd ymm8,ymm8,ymm12
  4677. vpxor ymm4,ymm4,ymm8
  4678. vpslld ymm3,ymm4,7
  4679. vpsrld ymm4,ymm4,25
  4680. vpxor ymm4,ymm4,ymm3
  4681. vpalignr ymm12,ymm12,ymm12,4 ; inverse lane rotation (4/8/12) restores the row alignment
  4682. vpalignr ymm8,ymm8,ymm8,8
  4683. vpalignr ymm4,ymm4,ymm4,12
  4684. vpaddd ymm1,ymm1,ymm5 ; second pass, state (ymm1,5,9,13)
  4685. vpxor ymm13,ymm13,ymm1
  4686. vpshufb ymm13,ymm13,YMMWORD[$L$rol16]
  4687. vpaddd ymm9,ymm9,ymm13
  4688. vpxor ymm5,ymm5,ymm9
  4689. vpsrld ymm3,ymm5,20
  4690. vpslld ymm5,ymm5,12
  4691. vpxor ymm5,ymm5,ymm3
  4692. vpaddd ymm1,ymm1,ymm5
  4693. vpxor ymm13,ymm13,ymm1
  4694. vpshufb ymm13,ymm13,YMMWORD[$L$rol8]
  4695. vpaddd ymm9,ymm9,ymm13
  4696. vpxor ymm5,ymm5,ymm9
  4697. vpslld ymm3,ymm5,7
  4698. vpsrld ymm5,ymm5,25
  4699. vpxor ymm5,ymm5,ymm3
  4700. vpalignr ymm13,ymm13,ymm13,4
  4701. vpalignr ymm9,ymm9,ymm9,8
  4702. vpalignr ymm5,ymm5,ymm5,12
  4703. vpaddd ymm2,ymm2,ymm6 ; NOTE(review): ymm2/6/10/14 are not loaded or read in the tail_256 path; this pass looks like it has no effect on the output - confirm against the generator
  4704. vpxor ymm14,ymm14,ymm2
  4705. vpshufb ymm14,ymm14,YMMWORD[$L$rol16]
  4706. vpaddd ymm10,ymm10,ymm14
  4707. vpxor ymm6,ymm6,ymm10
  4708. vpsrld ymm3,ymm6,20
  4709. vpslld ymm6,ymm6,12
  4710. vpxor ymm6,ymm6,ymm3
  4711. vpaddd ymm2,ymm2,ymm6
  4712. vpxor ymm14,ymm14,ymm2
  4713. vpshufb ymm14,ymm14,YMMWORD[$L$rol8]
  4714. vpaddd ymm10,ymm10,ymm14
  4715. vpxor ymm6,ymm6,ymm10
  4716. vpslld ymm3,ymm6,7
  4717. vpsrld ymm6,ymm6,25
  4718. vpxor ymm6,ymm6,ymm3
  4719. vpalignr ymm14,ymm14,ymm14,4
  4720. vpalignr ymm10,ymm10,ymm10,8
  4721. vpalignr ymm6,ymm6,ymm6,12
  4722. cmp r8,rcx
  4723. jb NEAR $L$open_avx2_tail_256_rounds_and_x1hash ; while r8 is below rcx (unsigned): hash one more block per iteration
  4724. cmp r8,10
  4725. jne NEAR $L$open_avx2_tail_256_rounds ; then rounds only, until 10 iterations are done
  4726. mov r8,rbx ; r8 = current hash read pointer
  4727. sub rbx,rsi
  4728. mov rcx,rbx ; rcx = bytes hashed so far (pointer - rsi)
  4729. mov rbx,QWORD[((160+128))+rbp] ; restore the rbx value spilled during setup
  4730. $L$open_avx2_tail_256_hash: ; hash further 16-byte blocks at r8 while rcx + 16 does not exceed rbx
  4731. add rcx,16
  4732. cmp rcx,rbx
  4733. jg NEAR $L$open_avx2_tail_256_done ; signed compare: stop once the next block would pass rbx
  4734. add r10,QWORD[((0+0))+r8] ; accumulator r10:r11:r12 += 16 bytes read from r8 ...
  4735. adc r11,QWORD[((8+0))+r8]
  4736. adc r12,1 ; ... plus 1 in the third limb
  4737. mov rdx,QWORD[((0+160+0))+rbp] ; low qword of the multiplier kept at rbp+160
  4738. mov r15,rdx
  4739. mulx r14,r13,r10
  4740. mulx rdx,rax,r11
  4741. imul r15,r12
  4742. add r14,rax
  4743. adc r15,rdx
  4744. mov rdx,QWORD[((8+160+0))+rbp] ; high qword of the multiplier
  4745. mulx rax,r10,r10
  4746. add r14,r10
  4747. mulx r9,r11,r11
  4748. adc r15,r11
  4749. adc r9,0
  4750. imul rdx,r12
  4751. add r15,rax
  4752. adc r9,rdx
  4753. mov r10,r13
  4754. mov r11,r14
  4755. mov r12,r15
  4756. and r12,3 ; third limb keeps only its low 2 bits
  4757. mov r13,r15
  4758. and r13,-4
  4759. mov r14,r9
  4760. shrd r15,r9,2
  4761. shr r9,2
  4762. add r15,r13 ; upper part folded back as 4*c + c
  4763. adc r9,r14
  4764. add r10,r15
  4765. adc r11,r9
  4766. adc r12,0
  4767. lea r8,[16+r8] ; next block
  4768. jmp NEAR $L$open_avx2_tail_256_hash
  4769. $L$open_avx2_tail_256_done: ; add the starting state back, XOR 128 bytes from rsi to rdi, hand the rest to tail_128_xor
  4770. vpaddd ymm1,ymm1,YMMWORD[$L$chacha20_consts] ; state (ymm1,5,9,13) += its starting rows
  4771. vpaddd ymm5,ymm5,YMMWORD[((160+64))+rbp]
  4772. vpaddd ymm9,ymm9,YMMWORD[((160+96))+rbp]
  4773. vpaddd ymm13,ymm13,YMMWORD[((160+192))+rbp] ; counter row stored for this state during setup
  4774. vpaddd ymm0,ymm0,YMMWORD[$L$chacha20_consts] ; state (ymm0,4,8,12) += its starting rows
  4775. vpaddd ymm4,ymm4,YMMWORD[((160+64))+rbp]
  4776. vpaddd ymm8,ymm8,YMMWORD[((160+96))+rbp]
  4777. vpaddd ymm12,ymm12,YMMWORD[((160+160))+rbp]
  4778. vperm2i128 ymm3,ymm5,ymm1,0x02 ; ymm3 = low lane of ymm1 | low lane of ymm5
  4779. vperm2i128 ymm5,ymm5,ymm1,0x13 ; ymm5 = high lane of ymm1 | high lane of ymm5
  4780. vperm2i128 ymm1,ymm13,ymm9,0x02
  4781. vperm2i128 ymm9,ymm13,ymm9,0x13
  4782. vpxor ymm3,ymm3,YMMWORD[((0+0))+rsi] ; XOR with 128 input bytes (unaligned memory operands)
  4783. vpxor ymm1,ymm1,YMMWORD[((32+0))+rsi]
  4784. vpxor ymm5,ymm5,YMMWORD[((64+0))+rsi]
  4785. vpxor ymm9,ymm9,YMMWORD[((96+0))+rsi]
  4786. vmovdqu YMMWORD[(0+0)+rdi],ymm3 ; store 128 output bytes
  4787. vmovdqu YMMWORD[(32+0)+rdi],ymm1
  4788. vmovdqu YMMWORD[(64+0)+rdi],ymm5
  4789. vmovdqu YMMWORD[(96+0)+rdi],ymm9
  4790. vperm2i128 ymm3,ymm4,ymm0,0x13 ; regroup the other state into ymm0, ymm4, ymm8, ymm12 ...
  4791. vperm2i128 ymm0,ymm4,ymm0,0x02
  4792. vperm2i128 ymm4,ymm12,ymm8,0x02
  4793. vperm2i128 ymm12,ymm12,ymm8,0x13
  4794. vmovdqa ymm8,ymm3 ; ... which is the order tail_128_xor consumes them in
  4795. lea rsi,[128+rsi]
  4796. lea rdi,[128+rdi]
  4797. sub rbx,4*32 ; 128 bytes consumed
  4798. jmp NEAR $L$open_avx2_tail_128_xor
  4799. $L$open_avx2_tail_384: ; set up three parallel ChaCha states: (ymm0,4,8,12), (ymm1,5,9,13), (ymm2,6,10,14)
  4800. vmovdqa ymm0,YMMWORD[$L$chacha20_consts] ; constants row
  4801. vmovdqa ymm4,YMMWORD[((160+64))+rbp] ; state row saved at rbp+160+64
  4802. vmovdqa ymm8,YMMWORD[((160+96))+rbp] ; state row saved at rbp+160+96
  4803. vmovdqa ymm1,ymm0
  4804. vmovdqa ymm5,ymm4
  4805. vmovdqa ymm9,ymm8
  4806. vmovdqa ymm2,ymm0
  4807. vmovdqa ymm6,ymm4
  4808. vmovdqa ymm10,ymm8
  4809. vmovdqa ymm12,YMMWORD[$L$avx2_inc]
  4810. vpaddd ymm14,ymm12,YMMWORD[((160+160))+rbp] ; ymm14 = saved counter row + avx2_inc
  4811. vpaddd ymm13,ymm12,ymm14 ; each following state gets the previous counter row + avx2_inc
  4812. vpaddd ymm12,ymm12,ymm13
  4813. vmovdqa YMMWORD[(160+160)+rbp],ymm12 ; keep the three starting counter rows for the add-back in ..._done
  4814. vmovdqa YMMWORD[(160+192)+rbp],ymm13
  4815. vmovdqa YMMWORD[(160+224)+rbp],ymm14
  4816. mov QWORD[((160+128))+rbp],rbx ; spill rbx; it is reused below as the hash read pointer
  4817. mov rcx,rbx
  4818. sub rcx,8*32
  4819. shr rcx,4
  4820. add rcx,6
  4821. mov r8,10
  4822. cmp rcx,10
  4823. cmovg rcx,r8 ; rcx = min((rbx - 256) / 16 + 6, 10) using a signed compare
  4824. mov rbx,rsi ; hash read pointer starts at rsi
  4825. xor r8,r8 ; r8 = loop iteration counter
  4826. $L$open_avx2_tail_384_rounds_and_x2hash: ; absorb one extra 16-byte block at rbx, then fall into the section that hashes one more
  4827. add r10,QWORD[((0+0))+rbx] ; accumulator r10:r11:r12 += 16 bytes read from rbx ...
  4828. adc r11,QWORD[((8+0))+rbx]
  4829. adc r12,1 ; ... plus 1 in the third limb
  4830. mov rdx,QWORD[((0+160+0))+rbp] ; low qword of the multiplier kept at rbp+160 (implicit mulx source)
  4831. mov r15,rdx
  4832. mulx r14,r13,r10
  4833. mulx rdx,rax,r11
  4834. imul r15,r12
  4835. add r14,rax
  4836. adc r15,rdx
  4837. mov rdx,QWORD[((8+160+0))+rbp] ; high qword of the multiplier
  4838. mulx rax,r10,r10
  4839. add r14,r10
  4840. mulx r9,r11,r11
  4841. adc r15,r11
  4842. adc r9,0
  4843. imul rdx,r12
  4844. add r15,rax
  4845. adc r9,rdx
  4846. mov r10,r13
  4847. mov r11,r14
  4848. mov r12,r15
  4849. and r12,3 ; third limb keeps only its low 2 bits
  4850. mov r13,r15
  4851. and r13,-4
  4852. mov r14,r9
  4853. shrd r15,r9,2
  4854. shr r9,2
  4855. add r15,r13 ; upper part folded back as 4*c + c
  4856. adc r9,r14
  4857. add r10,r15
  4858. adc r11,r9
  4859. adc r12,0
  4860. lea rbx,[16+rbx] ; advance hash pointer by one block
  4861. $L$open_avx2_tail_384_rounds_and_x1hash: ; one loop iteration: first pass over three states, hash one block, second pass
  4862. vpaddd ymm2,ymm2,ymm6 ; first pass, state (ymm2,6,10,14)
  4863. vpxor ymm14,ymm14,ymm2
  4864. vpshufb ymm14,ymm14,YMMWORD[$L$rol16]
  4865. vpaddd ymm10,ymm10,ymm14
  4866. vpxor ymm6,ymm6,ymm10
  4867. vpsrld ymm3,ymm6,20 ; ymm3 is scratch; the shift pair rotates each dword left by 12
  4868. vpslld ymm6,ymm6,12
  4869. vpxor ymm6,ymm6,ymm3
  4870. vpaddd ymm2,ymm2,ymm6
  4871. vpxor ymm14,ymm14,ymm2
  4872. vpshufb ymm14,ymm14,YMMWORD[$L$rol8]
  4873. vpaddd ymm10,ymm10,ymm14
  4874. vpxor ymm6,ymm6,ymm10
  4875. vpslld ymm3,ymm6,7 ; the shift pair rotates each dword left by 7
  4876. vpsrld ymm6,ymm6,25
  4877. vpxor ymm6,ymm6,ymm3
  4878. vpalignr ymm14,ymm14,ymm14,12 ; rotate rows within each 128-bit lane by 12/8/4 bytes
  4879. vpalignr ymm10,ymm10,ymm10,8
  4880. vpalignr ymm6,ymm6,ymm6,4
  4881. vpaddd ymm1,ymm1,ymm5 ; first pass, state (ymm1,5,9,13)
  4882. vpxor ymm13,ymm13,ymm1
  4883. vpshufb ymm13,ymm13,YMMWORD[$L$rol16]
  4884. vpaddd ymm9,ymm9,ymm13
  4885. vpxor ymm5,ymm5,ymm9
  4886. vpsrld ymm3,ymm5,20
  4887. vpslld ymm5,ymm5,12
  4888. vpxor ymm5,ymm5,ymm3
  4889. vpaddd ymm1,ymm1,ymm5
  4890. vpxor ymm13,ymm13,ymm1
  4891. vpshufb ymm13,ymm13,YMMWORD[$L$rol8]
  4892. vpaddd ymm9,ymm9,ymm13
  4893. vpxor ymm5,ymm5,ymm9
  4894. vpslld ymm3,ymm5,7
  4895. vpsrld ymm5,ymm5,25
  4896. vpxor ymm5,ymm5,ymm3
  4897. vpalignr ymm13,ymm13,ymm13,12
  4898. vpalignr ymm9,ymm9,ymm9,8
  4899. vpalignr ymm5,ymm5,ymm5,4
  4900. vpaddd ymm0,ymm0,ymm4 ; first pass, state (ymm0,4,8,12)
  4901. vpxor ymm12,ymm12,ymm0
  4902. vpshufb ymm12,ymm12,YMMWORD[$L$rol16]
  4903. vpaddd ymm8,ymm8,ymm12
  4904. vpxor ymm4,ymm4,ymm8
  4905. vpsrld ymm3,ymm4,20
  4906. vpslld ymm4,ymm4,12
  4907. vpxor ymm4,ymm4,ymm3
  4908. vpaddd ymm0,ymm0,ymm4
  4909. vpxor ymm12,ymm12,ymm0
  4910. vpshufb ymm12,ymm12,YMMWORD[$L$rol8]
  4911. vpaddd ymm8,ymm8,ymm12
  4912. vpxor ymm4,ymm4,ymm8
  4913. vpslld ymm3,ymm4,7
  4914. vpsrld ymm4,ymm4,25
  4915. vpxor ymm4,ymm4,ymm3
  4916. vpalignr ymm12,ymm12,ymm12,12
  4917. vpalignr ymm8,ymm8,ymm8,8
  4918. vpalignr ymm4,ymm4,ymm4,4
  4919. add r10,QWORD[((0+0))+rbx] ; hash one 16-byte block at rbx (this copy uses mul rather than mulx)
  4920. adc r11,QWORD[((8+0))+rbx]
  4921. adc r12,1 ; plus 1 in the third limb
  4922. mov rax,QWORD[((0+160+0))+rbp] ; low qword of the multiplier kept at rbp+160
  4923. mov r15,rax
  4924. mul r10
  4925. mov r13,rax
  4926. mov r14,rdx
  4927. mov rax,QWORD[((0+160+0))+rbp]
  4928. mul r11
  4929. imul r15,r12
  4930. add r14,rax
  4931. adc r15,rdx
  4932. mov rax,QWORD[((8+160+0))+rbp] ; high qword of the multiplier
  4933. mov r9,rax
  4934. mul r10
  4935. add r14,rax
  4936. adc rdx,0
  4937. mov r10,rdx
  4938. mov rax,QWORD[((8+160+0))+rbp]
  4939. mul r11
  4940. add r15,rax
  4941. adc rdx,0
  4942. imul r9,r12
  4943. add r15,r10
  4944. adc r9,rdx ; product now in r13 (low), r14, r15, r9 (high)
  4945. mov r10,r13
  4946. mov r11,r14
  4947. mov r12,r15
  4948. and r12,3 ; third limb keeps only its low 2 bits
  4949. mov r13,r15
  4950. and r13,-4
  4951. mov r14,r9
  4952. shrd r15,r9,2
  4953. shr r9,2
  4954. add r15,r13 ; upper part folded back as 4*c + c
  4955. adc r9,r14
  4956. add r10,r15
  4957. adc r11,r9
  4958. adc r12,0
  4959. lea rbx,[16+rbx] ; advance hash pointer by one block
  4960. inc r8 ; count this iteration
  4961. vpaddd ymm2,ymm2,ymm6 ; second pass, state (ymm2,6,10,14)
  4962. vpxor ymm14,ymm14,ymm2
  4963. vpshufb ymm14,ymm14,YMMWORD[$L$rol16]
  4964. vpaddd ymm10,ymm10,ymm14
  4965. vpxor ymm6,ymm6,ymm10
  4966. vpsrld ymm3,ymm6,20
  4967. vpslld ymm6,ymm6,12
  4968. vpxor ymm6,ymm6,ymm3
  4969. vpaddd ymm2,ymm2,ymm6
  4970. vpxor ymm14,ymm14,ymm2
  4971. vpshufb ymm14,ymm14,YMMWORD[$L$rol8]
  4972. vpaddd ymm10,ymm10,ymm14
  4973. vpxor ymm6,ymm6,ymm10
  4974. vpslld ymm3,ymm6,7
  4975. vpsrld ymm6,ymm6,25
  4976. vpxor ymm6,ymm6,ymm3
  4977. vpalignr ymm14,ymm14,ymm14,4 ; inverse lane rotation (4/8/12) restores the row alignment
  4978. vpalignr ymm10,ymm10,ymm10,8
  4979. vpalignr ymm6,ymm6,ymm6,12
  4980. vpaddd ymm1,ymm1,ymm5 ; second pass, state (ymm1,5,9,13)
  4981. vpxor ymm13,ymm13,ymm1
  4982. vpshufb ymm13,ymm13,YMMWORD[$L$rol16]
  4983. vpaddd ymm9,ymm9,ymm13
  4984. vpxor ymm5,ymm5,ymm9
  4985. vpsrld ymm3,ymm5,20
  4986. vpslld ymm5,ymm5,12
  4987. vpxor ymm5,ymm5,ymm3
  4988. vpaddd ymm1,ymm1,ymm5
  4989. vpxor ymm13,ymm13,ymm1
  4990. vpshufb ymm13,ymm13,YMMWORD[$L$rol8]
  4991. vpaddd ymm9,ymm9,ymm13
  4992. vpxor ymm5,ymm5,ymm9
  4993. vpslld ymm3,ymm5,7
  4994. vpsrld ymm5,ymm5,25
  4995. vpxor ymm5,ymm5,ymm3
  4996. vpalignr ymm13,ymm13,ymm13,4
  4997. vpalignr ymm9,ymm9,ymm9,8
  4998. vpalignr ymm5,ymm5,ymm5,12
  4999. vpaddd ymm0,ymm0,ymm4 ; second pass, state (ymm0,4,8,12)
  5000. vpxor ymm12,ymm12,ymm0
  5001. vpshufb ymm12,ymm12,YMMWORD[$L$rol16]
  5002. vpaddd ymm8,ymm8,ymm12
  5003. vpxor ymm4,ymm4,ymm8
  5004. vpsrld ymm3,ymm4,20
  5005. vpslld ymm4,ymm4,12
  5006. vpxor ymm4,ymm4,ymm3
  5007. vpaddd ymm0,ymm0,ymm4
  5008. vpxor ymm12,ymm12,ymm0
  5009. vpshufb ymm12,ymm12,YMMWORD[$L$rol8]
  5010. vpaddd ymm8,ymm8,ymm12
  5011. vpxor ymm4,ymm4,ymm8
  5012. vpslld ymm3,ymm4,7
  5013. vpsrld ymm4,ymm4,25
  5014. vpxor ymm4,ymm4,ymm3
  5015. vpalignr ymm12,ymm12,ymm12,4
  5016. vpalignr ymm8,ymm8,ymm8,8
  5017. vpalignr ymm4,ymm4,ymm4,12
  5018. cmp r8,rcx
  5019. jb NEAR $L$open_avx2_tail_384_rounds_and_x2hash ; while r8 is below rcx (unsigned): two blocks hashed per iteration
  5020. cmp r8,10
  5021. jne NEAR $L$open_avx2_tail_384_rounds_and_x1hash ; then one block per iteration until 10 iterations are done
  5022. mov r8,rbx ; r8 = current hash read pointer
  5023. sub rbx,rsi
  5024. mov rcx,rbx ; rcx = bytes hashed so far (pointer - rsi)
  5025. mov rbx,QWORD[((160+128))+rbp] ; restore the rbx value spilled during setup
  5026. $L$open_avx2_384_tail_hash: ; hash further 16-byte blocks at r8 while rcx + 16 does not exceed rbx
  5027. add rcx,16
  5028. cmp rcx,rbx
  5029. jg NEAR $L$open_avx2_384_tail_done ; signed compare: stop once the next block would pass rbx
  5030. add r10,QWORD[((0+0))+r8] ; accumulator r10:r11:r12 += 16 bytes read from r8 ...
  5031. adc r11,QWORD[((8+0))+r8]
  5032. adc r12,1 ; ... plus 1 in the third limb
  5033. mov rdx,QWORD[((0+160+0))+rbp] ; low qword of the multiplier kept at rbp+160
  5034. mov r15,rdx
  5035. mulx r14,r13,r10
  5036. mulx rdx,rax,r11
  5037. imul r15,r12
  5038. add r14,rax
  5039. adc r15,rdx
  5040. mov rdx,QWORD[((8+160+0))+rbp] ; high qword of the multiplier
  5041. mulx rax,r10,r10
  5042. add r14,r10
  5043. mulx r9,r11,r11
  5044. adc r15,r11
  5045. adc r9,0
  5046. imul rdx,r12
  5047. add r15,rax
  5048. adc r9,rdx
  5049. mov r10,r13
  5050. mov r11,r14
  5051. mov r12,r15
  5052. and r12,3 ; third limb keeps only its low 2 bits
  5053. mov r13,r15
  5054. and r13,-4
  5055. mov r14,r9
  5056. shrd r15,r9,2
  5057. shr r9,2
  5058. add r15,r13 ; upper part folded back as 4*c + c
  5059. adc r9,r14
  5060. add r10,r15
  5061. adc r11,r9
  5062. adc r12,0
  5063. lea r8,[16+r8] ; next block
  5064. jmp NEAR $L$open_avx2_384_tail_hash
  5065. $L$open_avx2_384_tail_done: ; add the starting state back, XOR 256 bytes from rsi to rdi, hand the rest to tail_128_xor
  5066. vpaddd ymm2,ymm2,YMMWORD[$L$chacha20_consts] ; state (ymm2,6,10,14) += its starting rows
  5067. vpaddd ymm6,ymm6,YMMWORD[((160+64))+rbp]
  5068. vpaddd ymm10,ymm10,YMMWORD[((160+96))+rbp]
  5069. vpaddd ymm14,ymm14,YMMWORD[((160+224))+rbp] ; counter row stored for this state during setup
  5070. vpaddd ymm1,ymm1,YMMWORD[$L$chacha20_consts] ; state (ymm1,5,9,13)
  5071. vpaddd ymm5,ymm5,YMMWORD[((160+64))+rbp]
  5072. vpaddd ymm9,ymm9,YMMWORD[((160+96))+rbp]
  5073. vpaddd ymm13,ymm13,YMMWORD[((160+192))+rbp]
  5074. vpaddd ymm0,ymm0,YMMWORD[$L$chacha20_consts] ; state (ymm0,4,8,12)
  5075. vpaddd ymm4,ymm4,YMMWORD[((160+64))+rbp]
  5076. vpaddd ymm8,ymm8,YMMWORD[((160+96))+rbp]
  5077. vpaddd ymm12,ymm12,YMMWORD[((160+160))+rbp]
  5078. vperm2i128 ymm3,ymm6,ymm2,0x02 ; ymm3 = low lane of ymm2 | low lane of ymm6
  5079. vperm2i128 ymm6,ymm6,ymm2,0x13 ; ymm6 = high lane of ymm2 | high lane of ymm6
  5080. vperm2i128 ymm2,ymm14,ymm10,0x02
  5081. vperm2i128 ymm10,ymm14,ymm10,0x13
  5082. vpxor ymm3,ymm3,YMMWORD[((0+0))+rsi] ; first 128 bytes: XOR with input ...
  5083. vpxor ymm2,ymm2,YMMWORD[((32+0))+rsi]
  5084. vpxor ymm6,ymm6,YMMWORD[((64+0))+rsi]
  5085. vpxor ymm10,ymm10,YMMWORD[((96+0))+rsi]
  5086. vmovdqu YMMWORD[(0+0)+rdi],ymm3 ; ... and store to output
  5087. vmovdqu YMMWORD[(32+0)+rdi],ymm2
  5088. vmovdqu YMMWORD[(64+0)+rdi],ymm6
  5089. vmovdqu YMMWORD[(96+0)+rdi],ymm10
  5090. vperm2i128 ymm3,ymm5,ymm1,0x02 ; same lane regrouping for state (ymm1,5,9,13)
  5091. vperm2i128 ymm5,ymm5,ymm1,0x13
  5092. vperm2i128 ymm1,ymm13,ymm9,0x02
  5093. vperm2i128 ymm9,ymm13,ymm9,0x13
  5094. vpxor ymm3,ymm3,YMMWORD[((0+128))+rsi] ; second 128 bytes, at offset 128
  5095. vpxor ymm1,ymm1,YMMWORD[((32+128))+rsi]
  5096. vpxor ymm5,ymm5,YMMWORD[((64+128))+rsi]
  5097. vpxor ymm9,ymm9,YMMWORD[((96+128))+rsi]
  5098. vmovdqu YMMWORD[(0+128)+rdi],ymm3
  5099. vmovdqu YMMWORD[(32+128)+rdi],ymm1
  5100. vmovdqu YMMWORD[(64+128)+rdi],ymm5
  5101. vmovdqu YMMWORD[(96+128)+rdi],ymm9
  5102. vperm2i128 ymm3,ymm4,ymm0,0x13 ; regroup the last state into ymm0, ymm4, ymm8, ymm12 ...
  5103. vperm2i128 ymm0,ymm4,ymm0,0x02
  5104. vperm2i128 ymm4,ymm12,ymm8,0x02
  5105. vperm2i128 ymm12,ymm12,ymm8,0x13
  5106. vmovdqa ymm8,ymm3 ; ... which is the order tail_128_xor consumes them in
  5107. lea rsi,[256+rsi]
  5108. lea rdi,[256+rdi]
  5109. sub rbx,8*32 ; 256 bytes consumed
  5110. jmp NEAR $L$open_avx2_tail_128_xor
  5111. $L$open_avx2_tail_512: ; set up four parallel ChaCha states, using all sixteen ymm registers
  5112. vmovdqa ymm0,YMMWORD[$L$chacha20_consts] ; constants row
  5113. vmovdqa ymm4,YMMWORD[((160+64))+rbp] ; state row saved at rbp+160+64
  5114. vmovdqa ymm8,YMMWORD[((160+96))+rbp] ; state row saved at rbp+160+96
  5115. vmovdqa ymm1,ymm0
  5116. vmovdqa ymm5,ymm4
  5117. vmovdqa ymm9,ymm8
  5118. vmovdqa ymm2,ymm0
  5119. vmovdqa ymm6,ymm4
  5120. vmovdqa ymm10,ymm8
  5121. vmovdqa ymm3,ymm0
  5122. vmovdqa ymm7,ymm4
  5123. vmovdqa ymm11,ymm8
  5124. vmovdqa ymm12,YMMWORD[$L$avx2_inc]
  5125. vpaddd ymm15,ymm12,YMMWORD[((160+160))+rbp] ; ymm15 = saved counter row + avx2_inc
  5126. vpaddd ymm14,ymm12,ymm15 ; each following state gets the previous counter row + avx2_inc
  5127. vpaddd ymm13,ymm12,ymm14
  5128. vpaddd ymm12,ymm12,ymm13
  5129. vmovdqa YMMWORD[(160+256)+rbp],ymm15 ; keep the four starting counter rows for the add-back in ..._done
  5130. vmovdqa YMMWORD[(160+224)+rbp],ymm14
  5131. vmovdqa YMMWORD[(160+192)+rbp],ymm13
  5132. vmovdqa YMMWORD[(160+160)+rbp],ymm12
  5133. xor rcx,rcx ; rcx = loop iteration counter
  5134. mov r8,rsi ; r8 = hash read pointer, starts at rsi
  5135. $L$open_avx2_tail_512_rounds_and_x2hash: ; absorb one extra 16-byte block at r8 (mul form), then fall into the main loop body
  5136. add r10,QWORD[((0+0))+r8] ; accumulator r10:r11:r12 += 16 bytes read from r8 ...
  5137. adc r11,QWORD[((8+0))+r8]
  5138. adc r12,1 ; ... plus 1 in the third limb
  5139. mov rax,QWORD[((0+160+0))+rbp] ; low qword of the multiplier kept at rbp+160
  5140. mov r15,rax
  5141. mul r10
  5142. mov r13,rax
  5143. mov r14,rdx
  5144. mov rax,QWORD[((0+160+0))+rbp]
  5145. mul r11
  5146. imul r15,r12
  5147. add r14,rax
  5148. adc r15,rdx
  5149. mov rax,QWORD[((8+160+0))+rbp] ; high qword of the multiplier
  5150. mov r9,rax
  5151. mul r10
  5152. add r14,rax
  5153. adc rdx,0
  5154. mov r10,rdx
  5155. mov rax,QWORD[((8+160+0))+rbp]
  5156. mul r11
  5157. add r15,rax
  5158. adc rdx,0
  5159. imul r9,r12
  5160. add r15,r10
  5161. adc r9,rdx ; product now in r13 (low), r14, r15, r9 (high)
  5162. mov r10,r13
  5163. mov r11,r14
  5164. mov r12,r15
  5165. and r12,3 ; third limb keeps only its low 2 bits
  5166. mov r13,r15
  5167. and r13,-4
  5168. mov r14,r9
  5169. shrd r15,r9,2
  5170. shr r9,2
  5171. add r15,r13 ; upper part folded back as 4*c + c
  5172. adc r9,r14
  5173. add r10,r15
  5174. adc r11,r9
  5175. adc r12,0
  5176. lea r8,[16+r8] ; advance hash pointer by one block
  5177. $L$open_avx2_tail_512_rounds_and_x1hash: ; one loop iteration: two passes over four states, with two block hashes interleaved
  5178. vmovdqa YMMWORD[(160+128)+rbp],ymm8 ; spill ymm8: every ymm register holds state, so ymm8 doubles as scratch
  5179. vmovdqa ymm8,YMMWORD[$L$rol16] ; ymm8 = rol16 shuffle table
  5180. vpaddd ymm3,ymm3,ymm7
  5181. vpaddd ymm2,ymm2,ymm6
  5182. vpaddd ymm1,ymm1,ymm5
  5183. vpaddd ymm0,ymm0,ymm4
  5184. vpxor ymm15,ymm15,ymm3
  5185. vpxor ymm14,ymm14,ymm2
  5186. vpxor ymm13,ymm13,ymm1
  5187. vpxor ymm12,ymm12,ymm0
  5188. vpshufb ymm15,ymm15,ymm8
  5189. vpshufb ymm14,ymm14,ymm8
  5190. vpshufb ymm13,ymm13,ymm8
  5191. vpshufb ymm12,ymm12,ymm8
  5192. vpaddd ymm11,ymm11,ymm15
  5193. vpaddd ymm10,ymm10,ymm14
  5194. vpaddd ymm9,ymm9,ymm13
  5195. vpaddd ymm8,ymm12,YMMWORD[((160+128))+rbp] ; bring the spilled row back while adding ymm12
  5196. vpxor ymm7,ymm7,ymm11
  5197. vpxor ymm6,ymm6,ymm10
  5198. vpxor ymm5,ymm5,ymm9
  5199. vpxor ymm4,ymm4,ymm8
  5200. vmovdqa YMMWORD[(160+128)+rbp],ymm8 ; spill again; ymm8 becomes the shift temporary
  5201. vpsrld ymm8,ymm7,20 ; each shift pair rotates the dwords left by 12
  5202. vpslld ymm7,ymm7,32-20
  5203. vpxor ymm7,ymm7,ymm8
  5204. vpsrld ymm8,ymm6,20
  5205. vpslld ymm6,ymm6,32-20
  5206. vpxor ymm6,ymm6,ymm8
  5207. vpsrld ymm8,ymm5,20
  5208. vpslld ymm5,ymm5,32-20
  5209. vpxor ymm5,ymm5,ymm8
  5210. vpsrld ymm8,ymm4,20
  5211. vpslld ymm4,ymm4,32-20
  5212. vpxor ymm4,ymm4,ymm8
  5213. vmovdqa ymm8,YMMWORD[$L$rol8] ; ymm8 = rol8 shuffle table
  5214. vpaddd ymm3,ymm3,ymm7
  5215. add r10,QWORD[((0+0))+r8] ; hash the 16-byte block at r8 (mulx form), placed in the middle of the vector work
  5216. adc r11,QWORD[((8+0))+r8]
  5217. adc r12,1 ; plus 1 in the third limb
  5218. mov rdx,QWORD[((0+160+0))+rbp] ; low qword of the multiplier kept at rbp+160
  5219. mov r15,rdx
  5220. mulx r14,r13,r10
  5221. mulx rdx,rax,r11
  5222. imul r15,r12
  5223. add r14,rax
  5224. adc r15,rdx
  5225. mov rdx,QWORD[((8+160+0))+rbp] ; high qword of the multiplier
  5226. mulx rax,r10,r10
  5227. add r14,r10
  5228. mulx r9,r11,r11
  5229. adc r15,r11
  5230. adc r9,0
  5231. imul rdx,r12
  5232. add r15,rax
  5233. adc r9,rdx
  5234. mov r10,r13
  5235. mov r11,r14
  5236. mov r12,r15
  5237. and r12,3 ; third limb keeps only its low 2 bits
  5238. mov r13,r15
  5239. and r13,-4
  5240. mov r14,r9
  5241. shrd r15,r9,2
  5242. shr r9,2
  5243. add r15,r13 ; upper part folded back as 4*c + c
  5244. adc r9,r14
  5245. add r10,r15
  5246. adc r11,r9
  5247. adc r12,0
  5248. vpaddd ymm2,ymm2,ymm6
  5249. vpaddd ymm1,ymm1,ymm5
  5250. vpaddd ymm0,ymm0,ymm4
  5251. vpxor ymm15,ymm15,ymm3
  5252. vpxor ymm14,ymm14,ymm2
  5253. vpxor ymm13,ymm13,ymm1
  5254. vpxor ymm12,ymm12,ymm0
  5255. vpshufb ymm15,ymm15,ymm8
  5256. vpshufb ymm14,ymm14,ymm8
  5257. vpshufb ymm13,ymm13,ymm8
  5258. vpshufb ymm12,ymm12,ymm8
  5259. vpaddd ymm11,ymm11,ymm15
  5260. vpaddd ymm10,ymm10,ymm14
  5261. vpaddd ymm9,ymm9,ymm13
  5262. vpaddd ymm8,ymm12,YMMWORD[((160+128))+rbp]
  5263. vpxor ymm7,ymm7,ymm11
  5264. vpxor ymm6,ymm6,ymm10
  5265. vpxor ymm5,ymm5,ymm9
  5266. vpxor ymm4,ymm4,ymm8
  5267. vmovdqa YMMWORD[(160+128)+rbp],ymm8
  5268. vpsrld ymm8,ymm7,25 ; each shift pair rotates the dwords left by 7
  5269. vpslld ymm7,ymm7,32-25
  5270. vpxor ymm7,ymm7,ymm8
  5271. vpsrld ymm8,ymm6,25
  5272. vpslld ymm6,ymm6,32-25
  5273. vpxor ymm6,ymm6,ymm8
  5274. vpsrld ymm8,ymm5,25
  5275. vpslld ymm5,ymm5,32-25
  5276. vpxor ymm5,ymm5,ymm8
  5277. vpsrld ymm8,ymm4,25
  5278. vpslld ymm4,ymm4,32-25
  5279. vpxor ymm4,ymm4,ymm8
  5280. vmovdqa ymm8,YMMWORD[((160+128))+rbp] ; reload the real ymm8 row
  5281. vpalignr ymm7,ymm7,ymm7,4 ; rotate rows within each 128-bit lane by 4/8/12 bytes
  5282. vpalignr ymm11,ymm11,ymm11,8
  5283. vpalignr ymm15,ymm15,ymm15,12
  5284. vpalignr ymm6,ymm6,ymm6,4
  5285. vpalignr ymm10,ymm10,ymm10,8
  5286. vpalignr ymm14,ymm14,ymm14,12
  5287. vpalignr ymm5,ymm5,ymm5,4
  5288. vpalignr ymm9,ymm9,ymm9,8
  5289. vpalignr ymm13,ymm13,ymm13,12
  5290. vpalignr ymm4,ymm4,ymm4,4
  5291. vpalignr ymm8,ymm8,ymm8,8
  5292. vpalignr ymm12,ymm12,ymm12,12
  5293. vmovdqa YMMWORD[(160+128)+rbp],ymm8 ; second pass starts: spill ymm8 again
  5294. vmovdqa ymm8,YMMWORD[$L$rol16]
  5295. vpaddd ymm3,ymm3,ymm7
  5296. add r10,QWORD[((0+16))+r8] ; hash the next 16-byte block, at r8+16
  5297. adc r11,QWORD[((8+16))+r8]
  5298. adc r12,1 ; plus 1 in the third limb
  5299. mov rdx,QWORD[((0+160+0))+rbp] ; low qword of the multiplier
  5300. mov r15,rdx
  5301. mulx r14,r13,r10
  5302. mulx rdx,rax,r11
  5303. imul r15,r12
  5304. add r14,rax
  5305. adc r15,rdx
  5306. mov rdx,QWORD[((8+160+0))+rbp] ; high qword of the multiplier
  5307. mulx rax,r10,r10
  5308. add r14,r10
  5309. mulx r9,r11,r11
  5310. adc r15,r11
  5311. adc r9,0
  5312. imul rdx,r12
  5313. add r15,rax
  5314. adc r9,rdx
  5315. mov r10,r13
  5316. mov r11,r14
  5317. mov r12,r15
  5318. and r12,3 ; third limb keeps only its low 2 bits
  5319. mov r13,r15
  5320. and r13,-4
  5321. mov r14,r9
  5322. shrd r15,r9,2
  5323. shr r9,2
  5324. add r15,r13 ; upper part folded back as 4*c + c
  5325. adc r9,r14
  5326. add r10,r15
  5327. adc r11,r9
  5328. adc r12,0
  5329. lea r8,[32+r8] ; two blocks consumed by this loop body
  5330. vpaddd ymm2,ymm2,ymm6
  5331. vpaddd ymm1,ymm1,ymm5
  5332. vpaddd ymm0,ymm0,ymm4
  5333. vpxor ymm15,ymm15,ymm3
  5334. vpxor ymm14,ymm14,ymm2
  5335. vpxor ymm13,ymm13,ymm1
  5336. vpxor ymm12,ymm12,ymm0
  5337. vpshufb ymm15,ymm15,ymm8
  5338. vpshufb ymm14,ymm14,ymm8
  5339. vpshufb ymm13,ymm13,ymm8
  5340. vpshufb ymm12,ymm12,ymm8
  5341. vpaddd ymm11,ymm11,ymm15
  5342. vpaddd ymm10,ymm10,ymm14
  5343. vpaddd ymm9,ymm9,ymm13
  5344. vpaddd ymm8,ymm12,YMMWORD[((160+128))+rbp]
  5345. vpxor ymm7,ymm7,ymm11
  5346. vpxor ymm6,ymm6,ymm10
  5347. vpxor ymm5,ymm5,ymm9
  5348. vpxor ymm4,ymm4,ymm8
  5349. vmovdqa YMMWORD[(160+128)+rbp],ymm8
  5350. vpsrld ymm8,ymm7,20
  5351. vpslld ymm7,ymm7,32-20
  5352. vpxor ymm7,ymm7,ymm8
  5353. vpsrld ymm8,ymm6,20
  5354. vpslld ymm6,ymm6,32-20
  5355. vpxor ymm6,ymm6,ymm8
  5356. vpsrld ymm8,ymm5,20
  5357. vpslld ymm5,ymm5,32-20
  5358. vpxor ymm5,ymm5,ymm8
  5359. vpsrld ymm8,ymm4,20
  5360. vpslld ymm4,ymm4,32-20
  5361. vpxor ymm4,ymm4,ymm8
  5362. vmovdqa ymm8,YMMWORD[$L$rol8]
  5363. vpaddd ymm3,ymm3,ymm7
  5364. vpaddd ymm2,ymm2,ymm6
  5365. vpaddd ymm1,ymm1,ymm5
  5366. vpaddd ymm0,ymm0,ymm4
  5367. vpxor ymm15,ymm15,ymm3
  5368. vpxor ymm14,ymm14,ymm2
  5369. vpxor ymm13,ymm13,ymm1
  5370. vpxor ymm12,ymm12,ymm0
  5371. vpshufb ymm15,ymm15,ymm8
  5372. vpshufb ymm14,ymm14,ymm8
  5373. vpshufb ymm13,ymm13,ymm8
  5374. vpshufb ymm12,ymm12,ymm8
  5375. vpaddd ymm11,ymm11,ymm15
  5376. vpaddd ymm10,ymm10,ymm14
  5377. vpaddd ymm9,ymm9,ymm13
  5378. vpaddd ymm8,ymm12,YMMWORD[((160+128))+rbp]
  5379. vpxor ymm7,ymm7,ymm11
  5380. vpxor ymm6,ymm6,ymm10
  5381. vpxor ymm5,ymm5,ymm9
  5382. vpxor ymm4,ymm4,ymm8
  5383. vmovdqa YMMWORD[(160+128)+rbp],ymm8
  5384. vpsrld ymm8,ymm7,25
  5385. vpslld ymm7,ymm7,32-25
  5386. vpxor ymm7,ymm7,ymm8
  5387. vpsrld ymm8,ymm6,25
  5388. vpslld ymm6,ymm6,32-25
  5389. vpxor ymm6,ymm6,ymm8
  5390. vpsrld ymm8,ymm5,25
  5391. vpslld ymm5,ymm5,32-25
  5392. vpxor ymm5,ymm5,ymm8
  5393. vpsrld ymm8,ymm4,25
  5394. vpslld ymm4,ymm4,32-25
  5395. vpxor ymm4,ymm4,ymm8
  5396. vmovdqa ymm8,YMMWORD[((160+128))+rbp] ; reload the real ymm8 row
  5397. vpalignr ymm7,ymm7,ymm7,12 ; inverse lane rotation (12/8/4) restores the row alignment
  5398. vpalignr ymm11,ymm11,ymm11,8
  5399. vpalignr ymm15,ymm15,ymm15,4
  5400. vpalignr ymm6,ymm6,ymm6,12
  5401. vpalignr ymm10,ymm10,ymm10,8
  5402. vpalignr ymm14,ymm14,ymm14,4
  5403. vpalignr ymm5,ymm5,ymm5,12
  5404. vpalignr ymm9,ymm9,ymm9,8
  5405. vpalignr ymm13,ymm13,ymm13,4
  5406. vpalignr ymm4,ymm4,ymm4,12
  5407. vpalignr ymm8,ymm8,ymm8,8
  5408. vpalignr ymm12,ymm12,ymm12,4
  5409. inc rcx ; count this iteration
  5410. cmp rcx,4
  5411. jl NEAR $L$open_avx2_tail_512_rounds_and_x2hash ; iterations 1-4 re-enter through the extra block hash (3 blocks each)
  5412. cmp rcx,10
  5413. jne NEAR $L$open_avx2_tail_512_rounds_and_x1hash ; iterations 5-10 hash 2 blocks each
  5414. mov rcx,rbx
  5415. sub rcx,12*32
  5416. and rcx,-16 ; rcx = (rbx - 384) rounded down to a multiple of 16
  5417. $L$open_avx2_tail_512_hash: ; hash 16-byte blocks at r8 until the rcx byte count reaches zero
  5418. test rcx,rcx
  5419. je NEAR $L$open_avx2_tail_512_done
  5420. add r10,QWORD[((0+0))+r8] ; accumulator r10:r11:r12 += 16 bytes read from r8 ...
  5421. adc r11,QWORD[((8+0))+r8]
  5422. adc r12,1 ; ... plus 1 in the third limb
  5423. mov rdx,QWORD[((0+160+0))+rbp] ; low qword of the multiplier kept at rbp+160
  5424. mov r15,rdx
  5425. mulx r14,r13,r10
  5426. mulx rdx,rax,r11
  5427. imul r15,r12
  5428. add r14,rax
  5429. adc r15,rdx
  5430. mov rdx,QWORD[((8+160+0))+rbp] ; high qword of the multiplier
  5431. mulx rax,r10,r10
  5432. add r14,r10
  5433. mulx r9,r11,r11
  5434. adc r15,r11
  5435. adc r9,0
  5436. imul rdx,r12
  5437. add r15,rax
  5438. adc r9,rdx
  5439. mov r10,r13
  5440. mov r11,r14
  5441. mov r12,r15
  5442. and r12,3 ; third limb keeps only its low 2 bits
  5443. mov r13,r15
  5444. and r13,-4
  5445. mov r14,r9
  5446. shrd r15,r9,2
  5447. shr r9,2
  5448. add r15,r13 ; upper part folded back as 4*c + c
  5449. adc r9,r14
  5450. add r10,r15
  5451. adc r11,r9
  5452. adc r12,0
  5453. lea r8,[16+r8] ; next block
  5454. sub rcx,2*8 ; 16 fewer bytes left to hash
  5455. jmp NEAR $L$open_avx2_tail_512_hash
  5456. $L$open_avx2_tail_512_done: ; add the starting state back, XOR 384 bytes from rsi to rdi, then fall through to tail_128_xor
  5457. vpaddd ymm3,ymm3,YMMWORD[$L$chacha20_consts] ; state (ymm3,7,11,15) += its starting rows
  5458. vpaddd ymm7,ymm7,YMMWORD[((160+64))+rbp]
  5459. vpaddd ymm11,ymm11,YMMWORD[((160+96))+rbp]
  5460. vpaddd ymm15,ymm15,YMMWORD[((160+256))+rbp] ; counter row stored for this state during setup
  5461. vpaddd ymm2,ymm2,YMMWORD[$L$chacha20_consts] ; state (ymm2,6,10,14)
  5462. vpaddd ymm6,ymm6,YMMWORD[((160+64))+rbp]
  5463. vpaddd ymm10,ymm10,YMMWORD[((160+96))+rbp]
  5464. vpaddd ymm14,ymm14,YMMWORD[((160+224))+rbp]
  5465. vpaddd ymm1,ymm1,YMMWORD[$L$chacha20_consts] ; state (ymm1,5,9,13)
  5466. vpaddd ymm5,ymm5,YMMWORD[((160+64))+rbp]
  5467. vpaddd ymm9,ymm9,YMMWORD[((160+96))+rbp]
  5468. vpaddd ymm13,ymm13,YMMWORD[((160+192))+rbp]
  5469. vpaddd ymm0,ymm0,YMMWORD[$L$chacha20_consts] ; state (ymm0,4,8,12)
  5470. vpaddd ymm4,ymm4,YMMWORD[((160+64))+rbp]
  5471. vpaddd ymm8,ymm8,YMMWORD[((160+96))+rbp]
  5472. vpaddd ymm12,ymm12,YMMWORD[((160+160))+rbp]
  5473. vmovdqa YMMWORD[(160+128)+rbp],ymm0 ; spill ymm0 so it can serve as a temporary below
  5474. vperm2i128 ymm0,ymm7,ymm3,0x02 ; ymm0 = low lane of ymm3 | low lane of ymm7
  5475. vperm2i128 ymm7,ymm7,ymm3,0x13 ; ymm7 = high lane of ymm3 | high lane of ymm7
  5476. vperm2i128 ymm3,ymm15,ymm11,0x02
  5477. vperm2i128 ymm11,ymm15,ymm11,0x13
  5478. vpxor ymm0,ymm0,YMMWORD[((0+0))+rsi] ; first 128 bytes: XOR with input ...
  5479. vpxor ymm3,ymm3,YMMWORD[((32+0))+rsi]
  5480. vpxor ymm7,ymm7,YMMWORD[((64+0))+rsi]
  5481. vpxor ymm11,ymm11,YMMWORD[((96+0))+rsi]
  5482. vmovdqu YMMWORD[(0+0)+rdi],ymm0 ; ... and store to output
  5483. vmovdqu YMMWORD[(32+0)+rdi],ymm3
  5484. vmovdqu YMMWORD[(64+0)+rdi],ymm7
  5485. vmovdqu YMMWORD[(96+0)+rdi],ymm11
  5486. vmovdqa ymm0,YMMWORD[((160+128))+rbp] ; restore the spilled ymm0
  5487. vperm2i128 ymm3,ymm6,ymm2,0x02 ; same lane regrouping for state (ymm2,6,10,14)
  5488. vperm2i128 ymm6,ymm6,ymm2,0x13
  5489. vperm2i128 ymm2,ymm14,ymm10,0x02
  5490. vperm2i128 ymm10,ymm14,ymm10,0x13
  5491. vpxor ymm3,ymm3,YMMWORD[((0+128))+rsi] ; second 128 bytes, at offset 128
  5492. vpxor ymm2,ymm2,YMMWORD[((32+128))+rsi]
  5493. vpxor ymm6,ymm6,YMMWORD[((64+128))+rsi]
  5494. vpxor ymm10,ymm10,YMMWORD[((96+128))+rsi]
  5495. vmovdqu YMMWORD[(0+128)+rdi],ymm3
  5496. vmovdqu YMMWORD[(32+128)+rdi],ymm2
  5497. vmovdqu YMMWORD[(64+128)+rdi],ymm6
  5498. vmovdqu YMMWORD[(96+128)+rdi],ymm10
  5499. vperm2i128 ymm3,ymm5,ymm1,0x02 ; same lane regrouping for state (ymm1,5,9,13)
  5500. vperm2i128 ymm5,ymm5,ymm1,0x13
  5501. vperm2i128 ymm1,ymm13,ymm9,0x02
  5502. vperm2i128 ymm9,ymm13,ymm9,0x13
  5503. vpxor ymm3,ymm3,YMMWORD[((0+256))+rsi] ; third 128 bytes, at offset 256
  5504. vpxor ymm1,ymm1,YMMWORD[((32+256))+rsi]
  5505. vpxor ymm5,ymm5,YMMWORD[((64+256))+rsi]
  5506. vpxor ymm9,ymm9,YMMWORD[((96+256))+rsi]
  5507. vmovdqu YMMWORD[(0+256)+rdi],ymm3
  5508. vmovdqu YMMWORD[(32+256)+rdi],ymm1
  5509. vmovdqu YMMWORD[(64+256)+rdi],ymm5
  5510. vmovdqu YMMWORD[(96+256)+rdi],ymm9
  5511. vperm2i128 ymm3,ymm4,ymm0,0x13 ; regroup the last state into ymm0, ymm4, ymm8, ymm12 ...
  5512. vperm2i128 ymm0,ymm4,ymm0,0x02
  5513. vperm2i128 ymm4,ymm12,ymm8,0x02
  5514. vperm2i128 ymm12,ymm12,ymm8,0x13
  5515. vmovdqa ymm8,ymm3 ; ... which is the order tail_128_xor consumes them in
  5516. lea rsi,[384+rsi]
  5517. lea rdi,[384+rdi]
  5518. sub rbx,12*32 ; 384 bytes consumed; execution continues into the next label
  5519. $L$open_avx2_tail_128_xor: ; consume prepared keystream in ymm0, ymm4, ymm8, ymm12, 32 bytes at a time while rbx is at least 32
  5520. cmp rbx,32
  5521. jb NEAR $L$open_avx2_tail_32_xor
  5522. sub rbx,32
  5523. vpxor ymm0,ymm0,YMMWORD[rsi] ; XOR 32 input bytes with ymm0
  5524. vmovdqu YMMWORD[rdi],ymm0
  5525. lea rsi,[32+rsi]
  5526. lea rdi,[32+rdi]
  5527. vmovdqa ymm0,ymm4 ; shift the queue: ymm4 to ymm0, ymm8 to ymm4, ymm12 to ymm8
  5528. vmovdqa ymm4,ymm8
  5529. vmovdqa ymm8,ymm12
  5530. jmp NEAR $L$open_avx2_tail_128_xor
  5531. $L$open_avx2_tail_32_xor: ; fewer than 32 bytes left: handle one more 16-byte piece if present
  5532. cmp rbx,16
  5533. vmovdqa xmm1,xmm0 ; xmm1 = low lane of ymm0 (vmovdqa does not change the flags set by cmp)
  5534. jb NEAR $L$open_avx2_exit
  5535. sub rbx,16
  5536. vpxor xmm1,xmm0,XMMWORD[rsi] ; XOR 16 input bytes with the low lane
  5537. vmovdqu XMMWORD[rdi],xmm1
  5538. lea rsi,[16+rsi]
  5539. lea rdi,[16+rdi]
  5540. vperm2i128 ymm0,ymm0,ymm0,0x11 ; copy the high lane of ymm0 into both lanes
  5541. vmovdqa xmm1,xmm0 ; xmm1 = the next 16 keystream bytes
  5542. $L$open_avx2_exit: ; leave the AVX2 path; xmm1 carries over to the code at $L$open_sse_tail_16 (outside this view)
  5543. vzeroupper
  5544. jmp NEAR $L$open_sse_tail_16
  5545. $L$open_avx2_192: ; short-input path: run two states from the rows already in ymm0/4/8/12, derive the clamped key, keep the rest as keystream
  5546. vmovdqa ymm1,ymm0 ; ymm1 = second working copy; ymm2 = saved copy for the final add-back
  5547. vmovdqa ymm2,ymm0
  5548. vmovdqa ymm5,ymm4
  5549. vmovdqa ymm6,ymm4
  5550. vmovdqa ymm9,ymm8
  5551. vmovdqa ymm10,ymm8
  5552. vpaddd ymm13,ymm12,YMMWORD[$L$avx2_inc] ; second state's counter row = first + avx2_inc
  5553. vmovdqa ymm11,ymm12 ; save both starting counter rows in ymm11 / ymm15
  5554. vmovdqa ymm15,ymm13
  5555. mov r10,10 ; 10 loop iterations
  5556. $L$open_avx2_192_rounds: ; one iteration: two passes of add/xor/rotate over both states
  5557. vpaddd ymm0,ymm0,ymm4 ; first pass, state (ymm0,4,8,12)
  5558. vpxor ymm12,ymm12,ymm0
  5559. vpshufb ymm12,ymm12,YMMWORD[$L$rol16]
  5560. vpaddd ymm8,ymm8,ymm12
  5561. vpxor ymm4,ymm4,ymm8
  5562. vpsrld ymm3,ymm4,20 ; ymm3 is scratch; the shift pair rotates each dword left by 12
  5563. vpslld ymm4,ymm4,12
  5564. vpxor ymm4,ymm4,ymm3
  5565. vpaddd ymm0,ymm0,ymm4
  5566. vpxor ymm12,ymm12,ymm0
  5567. vpshufb ymm12,ymm12,YMMWORD[$L$rol8]
  5568. vpaddd ymm8,ymm8,ymm12
  5569. vpxor ymm4,ymm4,ymm8
  5570. vpslld ymm3,ymm4,7 ; the shift pair rotates each dword left by 7
  5571. vpsrld ymm4,ymm4,25
  5572. vpxor ymm4,ymm4,ymm3
  5573. vpalignr ymm12,ymm12,ymm12,12 ; rotate rows within each 128-bit lane by 12/8/4 bytes
  5574. vpalignr ymm8,ymm8,ymm8,8
  5575. vpalignr ymm4,ymm4,ymm4,4
  5576. vpaddd ymm1,ymm1,ymm5 ; first pass, state (ymm1,5,9,13)
  5577. vpxor ymm13,ymm13,ymm1
  5578. vpshufb ymm13,ymm13,YMMWORD[$L$rol16]
  5579. vpaddd ymm9,ymm9,ymm13
  5580. vpxor ymm5,ymm5,ymm9
  5581. vpsrld ymm3,ymm5,20
  5582. vpslld ymm5,ymm5,12
  5583. vpxor ymm5,ymm5,ymm3
  5584. vpaddd ymm1,ymm1,ymm5
  5585. vpxor ymm13,ymm13,ymm1
  5586. vpshufb ymm13,ymm13,YMMWORD[$L$rol8]
  5587. vpaddd ymm9,ymm9,ymm13
  5588. vpxor ymm5,ymm5,ymm9
  5589. vpslld ymm3,ymm5,7
  5590. vpsrld ymm5,ymm5,25
  5591. vpxor ymm5,ymm5,ymm3
  5592. vpalignr ymm13,ymm13,ymm13,12
  5593. vpalignr ymm9,ymm9,ymm9,8
  5594. vpalignr ymm5,ymm5,ymm5,4
  5595. vpaddd ymm0,ymm0,ymm4 ; second pass, state (ymm0,4,8,12)
  5596. vpxor ymm12,ymm12,ymm0
  5597. vpshufb ymm12,ymm12,YMMWORD[$L$rol16]
  5598. vpaddd ymm8,ymm8,ymm12
  5599. vpxor ymm4,ymm4,ymm8
  5600. vpsrld ymm3,ymm4,20
  5601. vpslld ymm4,ymm4,12
  5602. vpxor ymm4,ymm4,ymm3
  5603. vpaddd ymm0,ymm0,ymm4
  5604. vpxor ymm12,ymm12,ymm0
  5605. vpshufb ymm12,ymm12,YMMWORD[$L$rol8]
  5606. vpaddd ymm8,ymm8,ymm12
  5607. vpxor ymm4,ymm4,ymm8
  5608. vpslld ymm3,ymm4,7
  5609. vpsrld ymm4,ymm4,25
  5610. vpxor ymm4,ymm4,ymm3
  5611. vpalignr ymm12,ymm12,ymm12,4 ; inverse lane rotation (4/8/12) restores the row alignment
  5612. vpalignr ymm8,ymm8,ymm8,8
  5613. vpalignr ymm4,ymm4,ymm4,12
  5614. vpaddd ymm1,ymm1,ymm5 ; second pass, state (ymm1,5,9,13)
  5615. vpxor ymm13,ymm13,ymm1
  5616. vpshufb ymm13,ymm13,YMMWORD[$L$rol16]
  5617. vpaddd ymm9,ymm9,ymm13
  5618. vpxor ymm5,ymm5,ymm9
  5619. vpsrld ymm3,ymm5,20
  5620. vpslld ymm5,ymm5,12
  5621. vpxor ymm5,ymm5,ymm3
  5622. vpaddd ymm1,ymm1,ymm5
  5623. vpxor ymm13,ymm13,ymm1
  5624. vpshufb ymm13,ymm13,YMMWORD[$L$rol8]
  5625. vpaddd ymm9,ymm9,ymm13
  5626. vpxor ymm5,ymm5,ymm9
  5627. vpslld ymm3,ymm5,7
  5628. vpsrld ymm5,ymm5,25
  5629. vpxor ymm5,ymm5,ymm3
  5630. vpalignr ymm13,ymm13,ymm13,4
  5631. vpalignr ymm9,ymm9,ymm9,8
  5632. vpalignr ymm5,ymm5,ymm5,12
  5633. dec r10
  5634. jne NEAR $L$open_avx2_192_rounds ; repeat until the counter reaches zero
  5635. vpaddd ymm0,ymm0,ymm2 ; add the saved starting rows back into both states
  5636. vpaddd ymm1,ymm1,ymm2
  5637. vpaddd ymm4,ymm4,ymm6
  5638. vpaddd ymm5,ymm5,ymm6
  5639. vpaddd ymm8,ymm8,ymm10
  5640. vpaddd ymm9,ymm9,ymm10
  5641. vpaddd ymm12,ymm12,ymm11
  5642. vpaddd ymm13,ymm13,ymm15
  5643. vperm2i128 ymm3,ymm4,ymm0,0x02 ; ymm3 = low lane of ymm0 | low lane of ymm4
  5644. vpand ymm3,ymm3,YMMWORD[$L$clamp] ; mask with the clamp constant ...
  5645. vmovdqa YMMWORD[(160+0)+rbp],ymm3 ; ... and store at rbp+160, where the hash code reads its multiplier
  5646. vperm2i128 ymm0,ymm4,ymm0,0x13 ; remaining lanes become keystream, queued as ymm0, ymm4, ymm8, ymm12, ymm1, ymm5
  5647. vperm2i128 ymm4,ymm12,ymm8,0x13
  5648. vperm2i128 ymm8,ymm5,ymm1,0x02
  5649. vperm2i128 ymm12,ymm13,ymm9,0x02
  5650. vperm2i128 ymm1,ymm5,ymm1,0x13
  5651. vperm2i128 ymm5,ymm13,ymm9,0x13
  5652. $L$open_avx2_short: ; reached by fall-through from the 192 path above; calls the AD hashing helper before the xor loop
  5653. mov r8,r8 ; register moved onto itself: no effect (presumably a generator artifact where source and destination map to the same register - confirm)
  5654. call poly_hash_ad_internal ; helper defined outside this view
  ; $L$open_avx2_short_hash_and_xor_loop: process 32 input bytes per iteration.
  ;   rbx        = bytes still to process
  ;   rsi / rdi  = input / output pointers
  ;   r10:r11:r12 = 3-limb Poly1305 accumulator (low, mid, high)
  ;   [rbp+160], [rbp+168] = the two 64-bit multiplier words used by the hash
  ;   ymm0       = next 32 bytes of keystream; further keystream is queued in
  ;                ymm4, ymm8, ymm12, ymm1, ymm5, ymm9, ymm13, ymm2, ymm6
  ; Each iteration hashes the two 16-byte input blocks first, then XORs the
  ; same 32 input bytes with ymm0 and writes the result to the output.
  5655. $L$open_avx2_short_hash_and_xor_loop:
  5656. cmp rbx,32
  5657. jb NEAR $L$open_avx2_short_tail_32 ; fewer than 32 bytes left: leave the loop
  5658. sub rbx,32
  ; --- hash block 1: accumulator += 16 input bytes at [rsi] plus 2^128 ---
  5659. add r10,QWORD[((0+0))+rsi]
  5660. adc r11,QWORD[((8+0))+rsi]
  5661. adc r12,1 ; carry plus the 2^128 bit
  ; Multiply the accumulator by the two words at [rbp+160]/[rbp+168].
  ; The 4-limb product is built in r13 (low), r14, r15, r9 (high).
  5662. mov rax,QWORD[((0+160+0))+rbp]
  5663. mov r15,rax
  5664. mul r10
  5665. mov r13,rax
  5666. mov r14,rdx
  5667. mov rax,QWORD[((0+160+0))+rbp]
  5668. mul r11
  5669. imul r15,r12
  5670. add r14,rax
  5671. adc r15,rdx
  5672. mov rax,QWORD[((8+160+0))+rbp]
  5673. mov r9,rax
  5674. mul r10
  5675. add r14,rax
  5676. adc rdx,0
  5677. mov r10,rdx
  5678. mov rax,QWORD[((8+160+0))+rbp]
  5679. mul r11
  5680. add r15,rax
  5681. adc rdx,0
  5682. imul r9,r12
  5683. add r15,r10
  5684. adc r9,rdx
  ; Reduce: keep the low 130 bits in r10:r11:r12 and add back the bits above
  ; bit 130 five times over, as (q*4) + q where q = product >> 130.
  5685. mov r10,r13
  5686. mov r11,r14
  5687. mov r12,r15
  5688. and r12,3 ; bits 128..129 stay in the high limb
  5689. mov r13,r15
  5690. and r13,-4 ; r14:r13 = q*4
  5691. mov r14,r9
  5692. shrd r15,r9,2 ; r9:r15 = q
  5693. shr r9,2
  5694. add r15,r13
  5695. adc r9,r14 ; r9:r15 = q*5
  5696. add r10,r15
  5697. adc r11,r9
  5698. adc r12,0
  ; --- hash block 2: same sequence for the 16 input bytes at [rsi+16] ---
  5699. add r10,QWORD[((0+16))+rsi]
  5700. adc r11,QWORD[((8+16))+rsi]
  5701. adc r12,1
  5702. mov rax,QWORD[((0+160+0))+rbp]
  5703. mov r15,rax
  5704. mul r10
  5705. mov r13,rax
  5706. mov r14,rdx
  5707. mov rax,QWORD[((0+160+0))+rbp]
  5708. mul r11
  5709. imul r15,r12
  5710. add r14,rax
  5711. adc r15,rdx
  5712. mov rax,QWORD[((8+160+0))+rbp]
  5713. mov r9,rax
  5714. mul r10
  5715. add r14,rax
  5716. adc rdx,0
  5717. mov r10,rdx
  5718. mov rax,QWORD[((8+160+0))+rbp]
  5719. mul r11
  5720. add r15,rax
  5721. adc rdx,0
  5722. imul r9,r12
  5723. add r15,r10
  5724. adc r9,rdx
  5725. mov r10,r13
  5726. mov r11,r14
  5727. mov r12,r15
  5728. and r12,3
  5729. mov r13,r15
  5730. and r13,-4
  5731. mov r14,r9
  5732. shrd r15,r9,2
  5733. shr r9,2
  5734. add r15,r13
  5735. adc r9,r14
  5736. add r10,r15
  5737. adc r11,r9
  5738. adc r12,0
  ; --- XOR the 32 input bytes with the keystream and store them ---
  5739. vpxor ymm0,ymm0,YMMWORD[rsi]
  5740. vmovdqu YMMWORD[rdi],ymm0
  5741. lea rsi,[32+rsi]
  5742. lea rdi,[32+rdi]
  ; --- advance the keystream queue by one register so ymm0 is the next 32 bytes ---
  5743. vmovdqa ymm0,ymm4
  5744. vmovdqa ymm4,ymm8
  5745. vmovdqa ymm8,ymm12
  5746. vmovdqa ymm12,ymm1
  5747. vmovdqa ymm1,ymm5
  5748. vmovdqa ymm5,ymm9
  5749. vmovdqa ymm9,ymm13
  5750. vmovdqa ymm13,ymm2
  5751. vmovdqa ymm2,ymm6
  5752. jmp NEAR $L$open_avx2_short_hash_and_xor_loop
  ; $L$open_avx2_short_tail_32: fewer than 32 bytes remain (rbx < 32).
  ; If at least 16 remain, hash and XOR one more 16-byte block using the low
  ; half of ymm0. In either case xmm1 is left holding the first 16 keystream
  ; bytes not yet consumed, and control passes to $L$open_sse_tail_16.
  5753. $L$open_avx2_short_tail_32:
  5754. cmp rbx,16
  5755. vmovdqa xmm1,xmm0 ; xmm1 = low 16 keystream bytes; a vector move, so the flags from cmp are still intact for jb
  5756. jb NEAR $L$open_avx2_short_tail_32_exit ; under 16 bytes: nothing more to do here
  5757. sub rbx,16
  ; --- hash the 16 input bytes at [rsi]: accumulator r10:r11:r12 += block + 2^128 ---
  5758. add r10,QWORD[((0+0))+rsi]
  5759. adc r11,QWORD[((8+0))+rsi]
  5760. adc r12,1
  ; Multiply by the words at [rbp+160]/[rbp+168]; product in r13, r14, r15, r9.
  5761. mov rax,QWORD[((0+160+0))+rbp]
  5762. mov r15,rax
  5763. mul r10
  5764. mov r13,rax
  5765. mov r14,rdx
  5766. mov rax,QWORD[((0+160+0))+rbp]
  5767. mul r11
  5768. imul r15,r12
  5769. add r14,rax
  5770. adc r15,rdx
  5771. mov rax,QWORD[((8+160+0))+rbp]
  5772. mov r9,rax
  5773. mul r10
  5774. add r14,rax
  5775. adc rdx,0
  5776. mov r10,rdx
  5777. mov rax,QWORD[((8+160+0))+rbp]
  5778. mul r11
  5779. add r15,rax
  5780. adc rdx,0
  5781. imul r9,r12
  5782. add r15,r10
  5783. adc r9,rdx
  ; Reduce: low 130 bits stay in r10:r11:r12; bits above 130 are added back
  ; as q*4 + q (q = product >> 130).
  5784. mov r10,r13
  5785. mov r11,r14
  5786. mov r12,r15
  5787. and r12,3
  5788. mov r13,r15
  5789. and r13,-4
  5790. mov r14,r9
  5791. shrd r15,r9,2
  5792. shr r9,2
  5793. add r15,r13
  5794. adc r9,r14
  5795. add r10,r15
  5796. adc r11,r9
  5797. adc r12,0
  ; --- XOR those 16 bytes with the low half of the keystream and store them ---
  5798. vpxor xmm3,xmm0,XMMWORD[rsi]
  5799. vmovdqu XMMWORD[rdi],xmm3
  5800. lea rsi,[16+rsi]
  5801. lea rdi,[16+rdi]
  5802. vextracti128 xmm1,ymm0,1 ; low half consumed: xmm1 = upper 16 bytes of ymm0
  5803. $L$open_avx2_short_tail_32_exit:
  5804. vzeroupper ; clear the upper ymm halves before continuing in the SSE tail code
  5805. jmp NEAR $L$open_sse_tail_16 ; target is outside this chunk; presumably handles the final <16 bytes using xmm1 -- TODO confirm
  ; $L$open_avx2_320: produce 320 bytes of keystream plus the hash key block,
  ; then continue at $L$open_avx2_short.
  ; Expects ymm0, ymm4, ymm8, ymm12 to hold the four rows of the initial state
  ; (prepared by code outside this chunk); each ymm carries one row for two
  ; blocks, one per 128-bit lane. Three such states are run side by side:
  ;   state 0: ymm0, ymm4, ymm8,  ymm12
  ;   state 1: ymm1, ymm5, ymm9,  ymm13
  ;   state 2: ymm2, ymm6, ymm10, ymm14
  ; ymm7 / ymm11 keep copies of the initial second and third rows; the three
  ; initial fourth rows are saved at [rbp+160+160], [rbp+160+192], [rbp+160+224].
  ; ymm3 is scratch. The dispatch that selects this path is outside this chunk.
  5806. $L$open_avx2_320:
  5807. vmovdqa ymm1,ymm0
  5808. vmovdqa ymm2,ymm0
  5809. vmovdqa ymm5,ymm4
  5810. vmovdqa ymm6,ymm4
  5811. vmovdqa ymm9,ymm8
  5812. vmovdqa ymm10,ymm8
  5813. vpaddd ymm13,ymm12,YMMWORD[$L$avx2_inc] ; $L$avx2_inc adds 2 to the first dword of each lane
  5814. vpaddd ymm14,ymm13,YMMWORD[$L$avx2_inc]
  5815. vmovdqa ymm7,ymm4
  5816. vmovdqa ymm11,ymm8
  5817. vmovdqa YMMWORD[(160+160)+rbp],ymm12
  5818. vmovdqa YMMWORD[(160+192)+rbp],ymm13
  5819. vmovdqa YMMWORD[(160+224)+rbp],ymm14
  5820. mov r10,10 ; 10 iterations, each doing two rounds on all three states
  ; Loop body layout: for each state in turn, one round on the rows as they
  ; stand followed by vpalignr rotations of the last three rows by 4/8/12
  ; bytes; then a second round on each state followed by the inverse rotations.
  ; Within a round, vpshufb with $L$rol16 / $L$rol8 rotates every dword left by
  ; 16 / 8, and each vpsrld+vpslld+vpxor triple rotates left by 12 or by 7.
  5821. $L$open_avx2_320_rounds:
  ; --- first round, state 0 ---
  5822. vpaddd ymm0,ymm0,ymm4
  5823. vpxor ymm12,ymm12,ymm0
  5824. vpshufb ymm12,ymm12,YMMWORD[$L$rol16]
  5825. vpaddd ymm8,ymm8,ymm12
  5826. vpxor ymm4,ymm4,ymm8
  5827. vpsrld ymm3,ymm4,20
  5828. vpslld ymm4,ymm4,12
  5829. vpxor ymm4,ymm4,ymm3
  5830. vpaddd ymm0,ymm0,ymm4
  5831. vpxor ymm12,ymm12,ymm0
  5832. vpshufb ymm12,ymm12,YMMWORD[$L$rol8]
  5833. vpaddd ymm8,ymm8,ymm12
  5834. vpxor ymm4,ymm4,ymm8
  5835. vpslld ymm3,ymm4,7
  5836. vpsrld ymm4,ymm4,25
  5837. vpxor ymm4,ymm4,ymm3
  5838. vpalignr ymm12,ymm12,ymm12,12
  5839. vpalignr ymm8,ymm8,ymm8,8
  5840. vpalignr ymm4,ymm4,ymm4,4
  ; --- first round, state 1 ---
  5841. vpaddd ymm1,ymm1,ymm5
  5842. vpxor ymm13,ymm13,ymm1
  5843. vpshufb ymm13,ymm13,YMMWORD[$L$rol16]
  5844. vpaddd ymm9,ymm9,ymm13
  5845. vpxor ymm5,ymm5,ymm9
  5846. vpsrld ymm3,ymm5,20
  5847. vpslld ymm5,ymm5,12
  5848. vpxor ymm5,ymm5,ymm3
  5849. vpaddd ymm1,ymm1,ymm5
  5850. vpxor ymm13,ymm13,ymm1
  5851. vpshufb ymm13,ymm13,YMMWORD[$L$rol8]
  5852. vpaddd ymm9,ymm9,ymm13
  5853. vpxor ymm5,ymm5,ymm9
  5854. vpslld ymm3,ymm5,7
  5855. vpsrld ymm5,ymm5,25
  5856. vpxor ymm5,ymm5,ymm3
  5857. vpalignr ymm13,ymm13,ymm13,12
  5858. vpalignr ymm9,ymm9,ymm9,8
  5859. vpalignr ymm5,ymm5,ymm5,4
  ; --- first round, state 2 ---
  5860. vpaddd ymm2,ymm2,ymm6
  5861. vpxor ymm14,ymm14,ymm2
  5862. vpshufb ymm14,ymm14,YMMWORD[$L$rol16]
  5863. vpaddd ymm10,ymm10,ymm14
  5864. vpxor ymm6,ymm6,ymm10
  5865. vpsrld ymm3,ymm6,20
  5866. vpslld ymm6,ymm6,12
  5867. vpxor ymm6,ymm6,ymm3
  5868. vpaddd ymm2,ymm2,ymm6
  5869. vpxor ymm14,ymm14,ymm2
  5870. vpshufb ymm14,ymm14,YMMWORD[$L$rol8]
  5871. vpaddd ymm10,ymm10,ymm14
  5872. vpxor ymm6,ymm6,ymm10
  5873. vpslld ymm3,ymm6,7
  5874. vpsrld ymm6,ymm6,25
  5875. vpxor ymm6,ymm6,ymm3
  5876. vpalignr ymm14,ymm14,ymm14,12
  5877. vpalignr ymm10,ymm10,ymm10,8
  5878. vpalignr ymm6,ymm6,ymm6,4
  ; --- second round, state 0 (rotations at the end undo the earlier ones) ---
  5879. vpaddd ymm0,ymm0,ymm4
  5880. vpxor ymm12,ymm12,ymm0
  5881. vpshufb ymm12,ymm12,YMMWORD[$L$rol16]
  5882. vpaddd ymm8,ymm8,ymm12
  5883. vpxor ymm4,ymm4,ymm8
  5884. vpsrld ymm3,ymm4,20
  5885. vpslld ymm4,ymm4,12
  5886. vpxor ymm4,ymm4,ymm3
  5887. vpaddd ymm0,ymm0,ymm4
  5888. vpxor ymm12,ymm12,ymm0
  5889. vpshufb ymm12,ymm12,YMMWORD[$L$rol8]
  5890. vpaddd ymm8,ymm8,ymm12
  5891. vpxor ymm4,ymm4,ymm8
  5892. vpslld ymm3,ymm4,7
  5893. vpsrld ymm4,ymm4,25
  5894. vpxor ymm4,ymm4,ymm3
  5895. vpalignr ymm12,ymm12,ymm12,4
  5896. vpalignr ymm8,ymm8,ymm8,8
  5897. vpalignr ymm4,ymm4,ymm4,12
  ; --- second round, state 1 ---
  5898. vpaddd ymm1,ymm1,ymm5
  5899. vpxor ymm13,ymm13,ymm1
  5900. vpshufb ymm13,ymm13,YMMWORD[$L$rol16]
  5901. vpaddd ymm9,ymm9,ymm13
  5902. vpxor ymm5,ymm5,ymm9
  5903. vpsrld ymm3,ymm5,20
  5904. vpslld ymm5,ymm5,12
  5905. vpxor ymm5,ymm5,ymm3
  5906. vpaddd ymm1,ymm1,ymm5
  5907. vpxor ymm13,ymm13,ymm1
  5908. vpshufb ymm13,ymm13,YMMWORD[$L$rol8]
  5909. vpaddd ymm9,ymm9,ymm13
  5910. vpxor ymm5,ymm5,ymm9
  5911. vpslld ymm3,ymm5,7
  5912. vpsrld ymm5,ymm5,25
  5913. vpxor ymm5,ymm5,ymm3
  5914. vpalignr ymm13,ymm13,ymm13,4
  5915. vpalignr ymm9,ymm9,ymm9,8
  5916. vpalignr ymm5,ymm5,ymm5,12
  ; --- second round, state 2 ---
  5917. vpaddd ymm2,ymm2,ymm6
  5918. vpxor ymm14,ymm14,ymm2
  5919. vpshufb ymm14,ymm14,YMMWORD[$L$rol16]
  5920. vpaddd ymm10,ymm10,ymm14
  5921. vpxor ymm6,ymm6,ymm10
  5922. vpsrld ymm3,ymm6,20
  5923. vpslld ymm6,ymm6,12
  5924. vpxor ymm6,ymm6,ymm3
  5925. vpaddd ymm2,ymm2,ymm6
  5926. vpxor ymm14,ymm14,ymm2
  5927. vpshufb ymm14,ymm14,YMMWORD[$L$rol8]
  5928. vpaddd ymm10,ymm10,ymm14
  5929. vpxor ymm6,ymm6,ymm10
  5930. vpslld ymm3,ymm6,7
  5931. vpsrld ymm6,ymm6,25
  5932. vpxor ymm6,ymm6,ymm3
  5933. vpalignr ymm14,ymm14,ymm14,4
  5934. vpalignr ymm10,ymm10,ymm10,8
  5935. vpalignr ymm6,ymm6,ymm6,12
  5936. dec r10
  5937. jne NEAR $L$open_avx2_320_rounds
  ; --- add the initial state back into each of the three worked states ---
  5938. vpaddd ymm0,ymm0,YMMWORD[$L$chacha20_consts]
  5939. vpaddd ymm1,ymm1,YMMWORD[$L$chacha20_consts]
  5940. vpaddd ymm2,ymm2,YMMWORD[$L$chacha20_consts]
  5941. vpaddd ymm4,ymm4,ymm7
  5942. vpaddd ymm5,ymm5,ymm7
  5943. vpaddd ymm6,ymm6,ymm7
  5944. vpaddd ymm8,ymm8,ymm11
  5945. vpaddd ymm9,ymm9,ymm11
  5946. vpaddd ymm10,ymm10,ymm11
  5947. vpaddd ymm12,ymm12,YMMWORD[((160+160))+rbp]
  5948. vpaddd ymm13,ymm13,YMMWORD[((160+192))+rbp]
  5949. vpaddd ymm14,ymm14,YMMWORD[((160+224))+rbp]
  ; --- hash key: low lanes of state-0 rows 0 and 1, masked with $L$clamp, to [rbp+160] ---
  5950. vperm2i128 ymm3,ymm4,ymm0,0x02 ; imm 0x02: ymm3 = { low lane of ymm0, low lane of ymm4 }
  5951. vpand ymm3,ymm3,YMMWORD[$L$clamp]
  5952. vmovdqa YMMWORD[(160+0)+rbp],ymm3
  ; --- regroup lanes into contiguous 32-byte keystream chunks ---
  ; imm 0x02 pairs the two low lanes, imm 0x13 pairs the two high lanes.
  ; Resulting keystream order: ymm0, ymm4, ymm8, ymm12, ymm1, ymm5, ymm9,
  ; ymm13, ymm2, ymm6 (10 x 32 bytes), the order consumed after the jmp below.
  5953. vperm2i128 ymm0,ymm4,ymm0,0x13
  5954. vperm2i128 ymm4,ymm12,ymm8,0x13
  5955. vperm2i128 ymm8,ymm5,ymm1,0x02
  5956. vperm2i128 ymm12,ymm13,ymm9,0x02
  5957. vperm2i128 ymm1,ymm5,ymm1,0x13
  5958. vperm2i128 ymm5,ymm13,ymm9,0x13
  5959. vperm2i128 ymm9,ymm6,ymm2,0x02
  5960. vperm2i128 ymm13,ymm14,ymm10,0x02
  5961. vperm2i128 ymm2,ymm6,ymm2,0x13
  5962. vperm2i128 ymm6,ymm14,ymm10,0x13
  5963. jmp NEAR $L$open_avx2_short
  5964. ALIGN 64
  5965. chacha20_poly1305_seal_avx2:
  5966. vzeroupper
  5967. vmovdqa ymm0,YMMWORD[$L$chacha20_consts]
  5968. vbroadcasti128 ymm4,XMMWORD[r9]
  5969. vbroadcasti128 ymm8,XMMWORD[16+r9]
  5970. vbroadcasti128 ymm12,XMMWORD[32+r9]
  5971. vpaddd ymm12,ymm12,YMMWORD[$L$avx2_init]
  5972. cmp rbx,6*32
  5973. jbe NEAR $L$seal_avx2_192
  5974. cmp rbx,10*32
  5975. jbe NEAR $L$seal_avx2_320
  5976. vmovdqa ymm1,ymm0
  5977. vmovdqa ymm2,ymm0
  5978. vmovdqa ymm3,ymm0
  5979. vmovdqa ymm5,ymm4
  5980. vmovdqa ymm6,ymm4
  5981. vmovdqa ymm7,ymm4
  5982. vmovdqa YMMWORD[(160+64)+rbp],ymm4
  5983. vmovdqa ymm9,ymm8
  5984. vmovdqa ymm10,ymm8
  5985. vmovdqa ymm11,ymm8
  5986. vmovdqa YMMWORD[(160+96)+rbp],ymm8
  5987. vmovdqa ymm15,ymm12
  5988. vpaddd ymm14,ymm15,YMMWORD[$L$avx2_inc]
  5989. vpaddd ymm13,ymm14,YMMWORD[$L$avx2_inc]
  5990. vpaddd ymm12,ymm13,YMMWORD[$L$avx2_inc]
  5991. vmovdqa YMMWORD[(160+160)+rbp],ymm12
  5992. vmovdqa YMMWORD[(160+192)+rbp],ymm13
  5993. vmovdqa YMMWORD[(160+224)+rbp],ymm14
  5994. vmovdqa YMMWORD[(160+256)+rbp],ymm15
  5995. mov r10,10
  5996. $L$seal_avx2_init_rounds:
  5997. vmovdqa YMMWORD[(160+128)+rbp],ymm8
  5998. vmovdqa ymm8,YMMWORD[$L$rol16]
  5999. vpaddd ymm3,ymm3,ymm7
  6000. vpaddd ymm2,ymm2,ymm6
  6001. vpaddd ymm1,ymm1,ymm5
  6002. vpaddd ymm0,ymm0,ymm4
  6003. vpxor ymm15,ymm15,ymm3
  6004. vpxor ymm14,ymm14,ymm2
  6005. vpxor ymm13,ymm13,ymm1
  6006. vpxor ymm12,ymm12,ymm0
  6007. vpshufb ymm15,ymm15,ymm8
  6008. vpshufb ymm14,ymm14,ymm8
  6009. vpshufb ymm13,ymm13,ymm8
  6010. vpshufb ymm12,ymm12,ymm8
  6011. vpaddd ymm11,ymm11,ymm15
  6012. vpaddd ymm10,ymm10,ymm14
  6013. vpaddd ymm9,ymm9,ymm13
  6014. vpaddd ymm8,ymm12,YMMWORD[((160+128))+rbp]
  6015. vpxor ymm7,ymm7,ymm11
  6016. vpxor ymm6,ymm6,ymm10
  6017. vpxor ymm5,ymm5,ymm9
  6018. vpxor ymm4,ymm4,ymm8
  6019. vmovdqa YMMWORD[(160+128)+rbp],ymm8
  6020. vpsrld ymm8,ymm7,20
  6021. vpslld ymm7,ymm7,32-20
  6022. vpxor ymm7,ymm7,ymm8
  6023. vpsrld ymm8,ymm6,20
  6024. vpslld ymm6,ymm6,32-20
  6025. vpxor ymm6,ymm6,ymm8
  6026. vpsrld ymm8,ymm5,20
  6027. vpslld ymm5,ymm5,32-20
  6028. vpxor ymm5,ymm5,ymm8
  6029. vpsrld ymm8,ymm4,20
  6030. vpslld ymm4,ymm4,32-20
  6031. vpxor ymm4,ymm4,ymm8
  6032. vmovdqa ymm8,YMMWORD[$L$rol8]
  6033. vpaddd ymm3,ymm3,ymm7
  6034. vpaddd ymm2,ymm2,ymm6
  6035. vpaddd ymm1,ymm1,ymm5
  6036. vpaddd ymm0,ymm0,ymm4
  6037. vpxor ymm15,ymm15,ymm3
  6038. vpxor ymm14,ymm14,ymm2
  6039. vpxor ymm13,ymm13,ymm1
  6040. vpxor ymm12,ymm12,ymm0
  6041. vpshufb ymm15,ymm15,ymm8
  6042. vpshufb ymm14,ymm14,ymm8
  6043. vpshufb ymm13,ymm13,ymm8
  6044. vpshufb ymm12,ymm12,ymm8
  6045. vpaddd ymm11,ymm11,ymm15
  6046. vpaddd ymm10,ymm10,ymm14
  6047. vpaddd ymm9,ymm9,ymm13
  6048. vpaddd ymm8,ymm12,YMMWORD[((160+128))+rbp]
  6049. vpxor ymm7,ymm7,ymm11
  6050. vpxor ymm6,ymm6,ymm10
  6051. vpxor ymm5,ymm5,ymm9
  6052. vpxor ymm4,ymm4,ymm8
  6053. vmovdqa YMMWORD[(160+128)+rbp],ymm8
  6054. vpsrld ymm8,ymm7,25
  6055. vpslld ymm7,ymm7,32-25
  6056. vpxor ymm7,ymm7,ymm8
  6057. vpsrld ymm8,ymm6,25
  6058. vpslld ymm6,ymm6,32-25
  6059. vpxor ymm6,ymm6,ymm8
  6060. vpsrld ymm8,ymm5,25
  6061. vpslld ymm5,ymm5,32-25
  6062. vpxor ymm5,ymm5,ymm8
  6063. vpsrld ymm8,ymm4,25
  6064. vpslld ymm4,ymm4,32-25
  6065. vpxor ymm4,ymm4,ymm8
  6066. vmovdqa ymm8,YMMWORD[((160+128))+rbp]
  6067. vpalignr ymm7,ymm7,ymm7,4
  6068. vpalignr ymm11,ymm11,ymm11,8
  6069. vpalignr ymm15,ymm15,ymm15,12
  6070. vpalignr ymm6,ymm6,ymm6,4
  6071. vpalignr ymm10,ymm10,ymm10,8
  6072. vpalignr ymm14,ymm14,ymm14,12
  6073. vpalignr ymm5,ymm5,ymm5,4
  6074. vpalignr ymm9,ymm9,ymm9,8
  6075. vpalignr ymm13,ymm13,ymm13,12
  6076. vpalignr ymm4,ymm4,ymm4,4
  6077. vpalignr ymm8,ymm8,ymm8,8
  6078. vpalignr ymm12,ymm12,ymm12,12
  6079. vmovdqa YMMWORD[(160+128)+rbp],ymm8
  6080. vmovdqa ymm8,YMMWORD[$L$rol16]
  6081. vpaddd ymm3,ymm3,ymm7
  6082. vpaddd ymm2,ymm2,ymm6
  6083. vpaddd ymm1,ymm1,ymm5
  6084. vpaddd ymm0,ymm0,ymm4
  6085. vpxor ymm15,ymm15,ymm3
  6086. vpxor ymm14,ymm14,ymm2
  6087. vpxor ymm13,ymm13,ymm1
  6088. vpxor ymm12,ymm12,ymm0
  6089. vpshufb ymm15,ymm15,ymm8
  6090. vpshufb ymm14,ymm14,ymm8
  6091. vpshufb ymm13,ymm13,ymm8
  6092. vpshufb ymm12,ymm12,ymm8
  6093. vpaddd ymm11,ymm11,ymm15
  6094. vpaddd ymm10,ymm10,ymm14
  6095. vpaddd ymm9,ymm9,ymm13
  6096. vpaddd ymm8,ymm12,YMMWORD[((160+128))+rbp]
  6097. vpxor ymm7,ymm7,ymm11
  6098. vpxor ymm6,ymm6,ymm10
  6099. vpxor ymm5,ymm5,ymm9
  6100. vpxor ymm4,ymm4,ymm8
  6101. vmovdqa YMMWORD[(160+128)+rbp],ymm8
  6102. vpsrld ymm8,ymm7,20
  6103. vpslld ymm7,ymm7,32-20
  6104. vpxor ymm7,ymm7,ymm8
  6105. vpsrld ymm8,ymm6,20
  6106. vpslld ymm6,ymm6,32-20
  6107. vpxor ymm6,ymm6,ymm8
  6108. vpsrld ymm8,ymm5,20
  6109. vpslld ymm5,ymm5,32-20
  6110. vpxor ymm5,ymm5,ymm8
  6111. vpsrld ymm8,ymm4,20
  6112. vpslld ymm4,ymm4,32-20
  6113. vpxor ymm4,ymm4,ymm8
  6114. vmovdqa ymm8,YMMWORD[$L$rol8]
  6115. vpaddd ymm3,ymm3,ymm7
  6116. vpaddd ymm2,ymm2,ymm6
  6117. vpaddd ymm1,ymm1,ymm5
  6118. vpaddd ymm0,ymm0,ymm4
  6119. vpxor ymm15,ymm15,ymm3
  6120. vpxor ymm14,ymm14,ymm2
  6121. vpxor ymm13,ymm13,ymm1
  6122. vpxor ymm12,ymm12,ymm0
  6123. vpshufb ymm15,ymm15,ymm8
  6124. vpshufb ymm14,ymm14,ymm8
  6125. vpshufb ymm13,ymm13,ymm8
  6126. vpshufb ymm12,ymm12,ymm8
  6127. vpaddd ymm11,ymm11,ymm15
  6128. vpaddd ymm10,ymm10,ymm14
  6129. vpaddd ymm9,ymm9,ymm13
  6130. vpaddd ymm8,ymm12,YMMWORD[((160+128))+rbp]
  6131. vpxor ymm7,ymm7,ymm11
  6132. vpxor ymm6,ymm6,ymm10
  6133. vpxor ymm5,ymm5,ymm9
  6134. vpxor ymm4,ymm4,ymm8
  6135. vmovdqa YMMWORD[(160+128)+rbp],ymm8
  6136. vpsrld ymm8,ymm7,25
  6137. vpslld ymm7,ymm7,32-25
  6138. vpxor ymm7,ymm7,ymm8
  6139. vpsrld ymm8,ymm6,25
  6140. vpslld ymm6,ymm6,32-25
  6141. vpxor ymm6,ymm6,ymm8
  6142. vpsrld ymm8,ymm5,25
  6143. vpslld ymm5,ymm5,32-25
  6144. vpxor ymm5,ymm5,ymm8
  6145. vpsrld ymm8,ymm4,25
  6146. vpslld ymm4,ymm4,32-25
  6147. vpxor ymm4,ymm4,ymm8
  6148. vmovdqa ymm8,YMMWORD[((160+128))+rbp]
  6149. vpalignr ymm7,ymm7,ymm7,12
  6150. vpalignr ymm11,ymm11,ymm11,8
  6151. vpalignr ymm15,ymm15,ymm15,4
  6152. vpalignr ymm6,ymm6,ymm6,12
  6153. vpalignr ymm10,ymm10,ymm10,8
  6154. vpalignr ymm14,ymm14,ymm14,4
  6155. vpalignr ymm5,ymm5,ymm5,12
  6156. vpalignr ymm9,ymm9,ymm9,8
  6157. vpalignr ymm13,ymm13,ymm13,4
  6158. vpalignr ymm4,ymm4,ymm4,12
  6159. vpalignr ymm8,ymm8,ymm8,8
  6160. vpalignr ymm12,ymm12,ymm12,4
  6161. dec r10
  6162. jnz NEAR $L$seal_avx2_init_rounds
  6163. vpaddd ymm3,ymm3,YMMWORD[$L$chacha20_consts]
  6164. vpaddd ymm7,ymm7,YMMWORD[((160+64))+rbp]
  6165. vpaddd ymm11,ymm11,YMMWORD[((160+96))+rbp]
  6166. vpaddd ymm15,ymm15,YMMWORD[((160+256))+rbp]
  6167. vpaddd ymm2,ymm2,YMMWORD[$L$chacha20_consts]
  6168. vpaddd ymm6,ymm6,YMMWORD[((160+64))+rbp]
  6169. vpaddd ymm10,ymm10,YMMWORD[((160+96))+rbp]
  6170. vpaddd ymm14,ymm14,YMMWORD[((160+224))+rbp]
  6171. vpaddd ymm1,ymm1,YMMWORD[$L$chacha20_consts]
  6172. vpaddd ymm5,ymm5,YMMWORD[((160+64))+rbp]
  6173. vpaddd ymm9,ymm9,YMMWORD[((160+96))+rbp]
  6174. vpaddd ymm13,ymm13,YMMWORD[((160+192))+rbp]
  6175. vpaddd ymm0,ymm0,YMMWORD[$L$chacha20_consts]
  6176. vpaddd ymm4,ymm4,YMMWORD[((160+64))+rbp]
  6177. vpaddd ymm8,ymm8,YMMWORD[((160+96))+rbp]
  6178. vpaddd ymm12,ymm12,YMMWORD[((160+160))+rbp]
  6179. vperm2i128 ymm11,ymm15,ymm11,0x13
  6180. vperm2i128 ymm15,ymm7,ymm3,0x02
  6181. vperm2i128 ymm3,ymm7,ymm3,0x13
  6182. vpand ymm15,ymm15,YMMWORD[$L$clamp]
  6183. vmovdqa YMMWORD[(160+0)+rbp],ymm15
  6184. mov r8,r8
  6185. call poly_hash_ad_internal
  6186. vpxor ymm3,ymm3,YMMWORD[rsi]
  6187. vpxor ymm11,ymm11,YMMWORD[32+rsi]
  6188. vmovdqu YMMWORD[rdi],ymm3
  6189. vmovdqu YMMWORD[32+rdi],ymm11
  6190. vperm2i128 ymm15,ymm6,ymm2,0x02
  6191. vperm2i128 ymm6,ymm6,ymm2,0x13
  6192. vperm2i128 ymm2,ymm14,ymm10,0x02
  6193. vperm2i128 ymm10,ymm14,ymm10,0x13
  6194. vpxor ymm15,ymm15,YMMWORD[((0+64))+rsi]
  6195. vpxor ymm2,ymm2,YMMWORD[((32+64))+rsi]
  6196. vpxor ymm6,ymm6,YMMWORD[((64+64))+rsi]
  6197. vpxor ymm10,ymm10,YMMWORD[((96+64))+rsi]
  6198. vmovdqu YMMWORD[(0+64)+rdi],ymm15
  6199. vmovdqu YMMWORD[(32+64)+rdi],ymm2
  6200. vmovdqu YMMWORD[(64+64)+rdi],ymm6
  6201. vmovdqu YMMWORD[(96+64)+rdi],ymm10
  6202. vperm2i128 ymm15,ymm5,ymm1,0x02
  6203. vperm2i128 ymm5,ymm5,ymm1,0x13
  6204. vperm2i128 ymm1,ymm13,ymm9,0x02
  6205. vperm2i128 ymm9,ymm13,ymm9,0x13
  6206. vpxor ymm15,ymm15,YMMWORD[((0+192))+rsi]
  6207. vpxor ymm1,ymm1,YMMWORD[((32+192))+rsi]
  6208. vpxor ymm5,ymm5,YMMWORD[((64+192))+rsi]
  6209. vpxor ymm9,ymm9,YMMWORD[((96+192))+rsi]
  6210. vmovdqu YMMWORD[(0+192)+rdi],ymm15
  6211. vmovdqu YMMWORD[(32+192)+rdi],ymm1
  6212. vmovdqu YMMWORD[(64+192)+rdi],ymm5
  6213. vmovdqu YMMWORD[(96+192)+rdi],ymm9
  6214. vperm2i128 ymm15,ymm4,ymm0,0x13
  6215. vperm2i128 ymm0,ymm4,ymm0,0x02
  6216. vperm2i128 ymm4,ymm12,ymm8,0x02
  6217. vperm2i128 ymm12,ymm12,ymm8,0x13
  6218. vmovdqa ymm8,ymm15
  6219. lea rsi,[320+rsi]
  6220. sub rbx,10*32
  6221. mov rcx,10*32
  6222. cmp rbx,4*32
  6223. jbe NEAR $L$seal_avx2_short_hash_remainder
  6224. vpxor ymm0,ymm0,YMMWORD[rsi]
  6225. vpxor ymm4,ymm4,YMMWORD[32+rsi]
  6226. vpxor ymm8,ymm8,YMMWORD[64+rsi]
  6227. vpxor ymm12,ymm12,YMMWORD[96+rsi]
  6228. vmovdqu YMMWORD[320+rdi],ymm0
  6229. vmovdqu YMMWORD[352+rdi],ymm4
  6230. vmovdqu YMMWORD[384+rdi],ymm8
  6231. vmovdqu YMMWORD[416+rdi],ymm12
  6232. lea rsi,[128+rsi]
  6233. sub rbx,4*32
  6234. mov rcx,8
  6235. mov r8,2
  6236. cmp rbx,4*32
  6237. jbe NEAR $L$seal_avx2_tail_128
  6238. cmp rbx,8*32
  6239. jbe NEAR $L$seal_avx2_tail_256
  6240. cmp rbx,12*32
  6241. jbe NEAR $L$seal_avx2_tail_384
  6242. cmp rbx,16*32
  6243. jbe NEAR $L$seal_avx2_tail_512
  6244. vmovdqa ymm0,YMMWORD[$L$chacha20_consts]
  6245. vmovdqa ymm4,YMMWORD[((160+64))+rbp]
  6246. vmovdqa ymm8,YMMWORD[((160+96))+rbp]
  6247. vmovdqa ymm1,ymm0
  6248. vmovdqa ymm5,ymm4
  6249. vmovdqa ymm9,ymm8
  6250. vmovdqa ymm2,ymm0
  6251. vmovdqa ymm6,ymm4
  6252. vmovdqa ymm10,ymm8
  6253. vmovdqa ymm3,ymm0
  6254. vmovdqa ymm7,ymm4
  6255. vmovdqa ymm11,ymm8
  6256. vmovdqa ymm12,YMMWORD[$L$avx2_inc]
  6257. vpaddd ymm15,ymm12,YMMWORD[((160+160))+rbp]
  6258. vpaddd ymm14,ymm12,ymm15
  6259. vpaddd ymm13,ymm12,ymm14
  6260. vpaddd ymm12,ymm12,ymm13
  6261. vmovdqa YMMWORD[(160+256)+rbp],ymm15
  6262. vmovdqa YMMWORD[(160+224)+rbp],ymm14
  6263. vmovdqa YMMWORD[(160+192)+rbp],ymm13
  6264. vmovdqa YMMWORD[(160+160)+rbp],ymm12
  6265. vmovdqa YMMWORD[(160+128)+rbp],ymm8
  6266. vmovdqa ymm8,YMMWORD[$L$rol16]
  6267. vpaddd ymm3,ymm3,ymm7
  6268. vpaddd ymm2,ymm2,ymm6
  6269. vpaddd ymm1,ymm1,ymm5
  6270. vpaddd ymm0,ymm0,ymm4
  6271. vpxor ymm15,ymm15,ymm3
  6272. vpxor ymm14,ymm14,ymm2
  6273. vpxor ymm13,ymm13,ymm1
  6274. vpxor ymm12,ymm12,ymm0
  6275. vpshufb ymm15,ymm15,ymm8
  6276. vpshufb ymm14,ymm14,ymm8
  6277. vpshufb ymm13,ymm13,ymm8
  6278. vpshufb ymm12,ymm12,ymm8
  6279. vpaddd ymm11,ymm11,ymm15
  6280. vpaddd ymm10,ymm10,ymm14
  6281. vpaddd ymm9,ymm9,ymm13
  6282. vpaddd ymm8,ymm12,YMMWORD[((160+128))+rbp]
  6283. vpxor ymm7,ymm7,ymm11
  6284. vpxor ymm6,ymm6,ymm10
  6285. vpxor ymm5,ymm5,ymm9
  6286. vpxor ymm4,ymm4,ymm8
  6287. vmovdqa YMMWORD[(160+128)+rbp],ymm8
  6288. vpsrld ymm8,ymm7,20
  6289. vpslld ymm7,ymm7,32-20
  6290. vpxor ymm7,ymm7,ymm8
  6291. vpsrld ymm8,ymm6,20
  6292. vpslld ymm6,ymm6,32-20
  6293. vpxor ymm6,ymm6,ymm8
  6294. vpsrld ymm8,ymm5,20
  6295. vpslld ymm5,ymm5,32-20
  6296. vpxor ymm5,ymm5,ymm8
  6297. vpsrld ymm8,ymm4,20
  6298. vpslld ymm4,ymm4,32-20
  6299. vpxor ymm4,ymm4,ymm8
  6300. vmovdqa ymm8,YMMWORD[$L$rol8]
  6301. vpaddd ymm3,ymm3,ymm7
  6302. vpaddd ymm2,ymm2,ymm6
  6303. vpaddd ymm1,ymm1,ymm5
  6304. vpaddd ymm0,ymm0,ymm4
  6305. vpxor ymm15,ymm15,ymm3
  6306. vpxor ymm14,ymm14,ymm2
  6307. vpxor ymm13,ymm13,ymm1
  6308. vpxor ymm12,ymm12,ymm0
  6309. vpshufb ymm15,ymm15,ymm8
  6310. vpshufb ymm14,ymm14,ymm8
  6311. vpshufb ymm13,ymm13,ymm8
  6312. vpshufb ymm12,ymm12,ymm8
  6313. vpaddd ymm11,ymm11,ymm15
  6314. vpaddd ymm10,ymm10,ymm14
  6315. vpaddd ymm9,ymm9,ymm13
  6316. vpaddd ymm8,ymm12,YMMWORD[((160+128))+rbp]
  6317. vpxor ymm7,ymm7,ymm11
  6318. vpxor ymm6,ymm6,ymm10
  6319. vpxor ymm5,ymm5,ymm9
  6320. vpxor ymm4,ymm4,ymm8
  6321. vmovdqa YMMWORD[(160+128)+rbp],ymm8
  6322. vpsrld ymm8,ymm7,25
  6323. vpslld ymm7,ymm7,32-25
  6324. vpxor ymm7,ymm7,ymm8
  6325. vpsrld ymm8,ymm6,25
  6326. vpslld ymm6,ymm6,32-25
  6327. vpxor ymm6,ymm6,ymm8
  6328. vpsrld ymm8,ymm5,25
  6329. vpslld ymm5,ymm5,32-25
  6330. vpxor ymm5,ymm5,ymm8
  6331. vpsrld ymm8,ymm4,25
  6332. vpslld ymm4,ymm4,32-25
  6333. vpxor ymm4,ymm4,ymm8
  6334. vmovdqa ymm8,YMMWORD[((160+128))+rbp]
  6335. vpalignr ymm7,ymm7,ymm7,4
  6336. vpalignr ymm11,ymm11,ymm11,8
  6337. vpalignr ymm15,ymm15,ymm15,12
  6338. vpalignr ymm6,ymm6,ymm6,4
  6339. vpalignr ymm10,ymm10,ymm10,8
  6340. vpalignr ymm14,ymm14,ymm14,12
  6341. vpalignr ymm5,ymm5,ymm5,4
  6342. vpalignr ymm9,ymm9,ymm9,8
  6343. vpalignr ymm13,ymm13,ymm13,12
  6344. vpalignr ymm4,ymm4,ymm4,4
  6345. vpalignr ymm8,ymm8,ymm8,8
  6346. vpalignr ymm12,ymm12,ymm12,12
  6347. vmovdqa YMMWORD[(160+128)+rbp],ymm8
  6348. vmovdqa ymm8,YMMWORD[$L$rol16]
  6349. vpaddd ymm3,ymm3,ymm7
  6350. vpaddd ymm2,ymm2,ymm6
  6351. vpaddd ymm1,ymm1,ymm5
  6352. vpaddd ymm0,ymm0,ymm4
  6353. vpxor ymm15,ymm15,ymm3
  6354. vpxor ymm14,ymm14,ymm2
  6355. vpxor ymm13,ymm13,ymm1
  6356. vpxor ymm12,ymm12,ymm0
  6357. vpshufb ymm15,ymm15,ymm8
  6358. vpshufb ymm14,ymm14,ymm8
  6359. vpshufb ymm13,ymm13,ymm8
  6360. vpshufb ymm12,ymm12,ymm8
  6361. vpaddd ymm11,ymm11,ymm15
  6362. vpaddd ymm10,ymm10,ymm14
  6363. vpaddd ymm9,ymm9,ymm13
  6364. vpaddd ymm8,ymm12,YMMWORD[((160+128))+rbp]
  6365. vpxor ymm7,ymm7,ymm11
  6366. vpxor ymm6,ymm6,ymm10
  6367. vpxor ymm5,ymm5,ymm9
  6368. vpxor ymm4,ymm4,ymm8
  6369. vmovdqa YMMWORD[(160+128)+rbp],ymm8
  6370. vpsrld ymm8,ymm7,20
  6371. vpslld ymm7,ymm7,32-20
  6372. vpxor ymm7,ymm7,ymm8
  6373. vpsrld ymm8,ymm6,20
  6374. vpslld ymm6,ymm6,32-20
  6375. vpxor ymm6,ymm6,ymm8
  6376. vpsrld ymm8,ymm5,20
  6377. vpslld ymm5,ymm5,32-20
  6378. vpxor ymm5,ymm5,ymm8
  6379. vpsrld ymm8,ymm4,20
  6380. vpslld ymm4,ymm4,32-20
  6381. vpxor ymm4,ymm4,ymm8
  6382. vmovdqa ymm8,YMMWORD[$L$rol8]
  6383. vpaddd ymm3,ymm3,ymm7
  6384. vpaddd ymm2,ymm2,ymm6
  6385. vpaddd ymm1,ymm1,ymm5
  6386. vpaddd ymm0,ymm0,ymm4
  6387. vpxor ymm15,ymm15,ymm3
  6388. vpxor ymm14,ymm14,ymm2
  6389. vpxor ymm13,ymm13,ymm1
  6390. vpxor ymm12,ymm12,ymm0
  6391. vpshufb ymm15,ymm15,ymm8
  6392. vpshufb ymm14,ymm14,ymm8
  6393. vpshufb ymm13,ymm13,ymm8
  6394. vpshufb ymm12,ymm12,ymm8
  6395. vpaddd ymm11,ymm11,ymm15
  6396. vpaddd ymm10,ymm10,ymm14
  6397. vpaddd ymm9,ymm9,ymm13
  6398. vpaddd ymm8,ymm12,YMMWORD[((160+128))+rbp]
  6399. vpxor ymm7,ymm7,ymm11
  6400. vpxor ymm6,ymm6,ymm10
  6401. vpxor ymm5,ymm5,ymm9
  6402. vpxor ymm4,ymm4,ymm8
  6403. vmovdqa YMMWORD[(160+128)+rbp],ymm8
  6404. vpsrld ymm8,ymm7,25
  6405. vpslld ymm7,ymm7,32-25
  6406. vpxor ymm7,ymm7,ymm8
  6407. vpsrld ymm8,ymm6,25
  6408. vpslld ymm6,ymm6,32-25
  6409. vpxor ymm6,ymm6,ymm8
  6410. vpsrld ymm8,ymm5,25
  6411. vpslld ymm5,ymm5,32-25
  6412. vpxor ymm5,ymm5,ymm8
  6413. vpsrld ymm8,ymm4,25
  6414. vpslld ymm4,ymm4,32-25
  6415. vpxor ymm4,ymm4,ymm8
  6416. vmovdqa ymm8,YMMWORD[((160+128))+rbp]
  6417. vpalignr ymm7,ymm7,ymm7,12
  6418. vpalignr ymm11,ymm11,ymm11,8
  6419. vpalignr ymm15,ymm15,ymm15,4
  6420. vpalignr ymm6,ymm6,ymm6,12
  6421. vpalignr ymm10,ymm10,ymm10,8
  6422. vpalignr ymm14,ymm14,ymm14,4
  6423. vpalignr ymm5,ymm5,ymm5,12
  6424. vpalignr ymm9,ymm9,ymm9,8
  6425. vpalignr ymm13,ymm13,ymm13,4
  6426. vpalignr ymm4,ymm4,ymm4,12
  6427. vpalignr ymm8,ymm8,ymm8,8
  6428. vpalignr ymm12,ymm12,ymm12,4
  6429. vmovdqa YMMWORD[(160+128)+rbp],ymm8
  6430. vmovdqa ymm8,YMMWORD[$L$rol16]
  6431. vpaddd ymm3,ymm3,ymm7
  6432. vpaddd ymm2,ymm2,ymm6
  6433. vpaddd ymm1,ymm1,ymm5
  6434. vpaddd ymm0,ymm0,ymm4
  6435. vpxor ymm15,ymm15,ymm3
  6436. vpxor ymm14,ymm14,ymm2
  6437. vpxor ymm13,ymm13,ymm1
  6438. vpxor ymm12,ymm12,ymm0
  6439. vpshufb ymm15,ymm15,ymm8
  6440. vpshufb ymm14,ymm14,ymm8
  6441. vpshufb ymm13,ymm13,ymm8
  6442. vpshufb ymm12,ymm12,ymm8
  6443. vpaddd ymm11,ymm11,ymm15
  6444. vpaddd ymm10,ymm10,ymm14
  6445. vpaddd ymm9,ymm9,ymm13
  6446. vpaddd ymm8,ymm12,YMMWORD[((160+128))+rbp]
  6447. vpxor ymm7,ymm7,ymm11
  6448. vpxor ymm6,ymm6,ymm10
  6449. vpxor ymm5,ymm5,ymm9
  6450. vpxor ymm4,ymm4,ymm8
  6451. vmovdqa YMMWORD[(160+128)+rbp],ymm8
  6452. vpsrld ymm8,ymm7,20
  6453. vpslld ymm7,ymm7,32-20
  6454. vpxor ymm7,ymm7,ymm8
  6455. vpsrld ymm8,ymm6,20
  6456. vpslld ymm6,ymm6,32-20
  6457. vpxor ymm6,ymm6,ymm8
  6458. vpsrld ymm8,ymm5,20
  6459. vpslld ymm5,ymm5,32-20
  6460. vpxor ymm5,ymm5,ymm8
  6461. vpsrld ymm8,ymm4,20
  6462. vpslld ymm4,ymm4,32-20
  6463. vpxor ymm4,ymm4,ymm8
  6464. vmovdqa ymm8,YMMWORD[$L$rol8]
  6465. vpaddd ymm3,ymm3,ymm7
  6466. vpaddd ymm2,ymm2,ymm6
  6467. vpaddd ymm1,ymm1,ymm5
  6468. vpaddd ymm0,ymm0,ymm4
  6469. vpxor ymm15,ymm15,ymm3
  6470. sub rdi,16
  6471. mov rcx,9
  6472. jmp NEAR $L$seal_avx2_main_loop_rounds_entry
  6473. ALIGN 32
  6474. $L$seal_avx2_main_loop:
  6475. vmovdqa ymm0,YMMWORD[$L$chacha20_consts]
  6476. vmovdqa ymm4,YMMWORD[((160+64))+rbp]
  6477. vmovdqa ymm8,YMMWORD[((160+96))+rbp]
  6478. vmovdqa ymm1,ymm0
  6479. vmovdqa ymm5,ymm4
  6480. vmovdqa ymm9,ymm8
  6481. vmovdqa ymm2,ymm0
  6482. vmovdqa ymm6,ymm4
  6483. vmovdqa ymm10,ymm8
  6484. vmovdqa ymm3,ymm0
  6485. vmovdqa ymm7,ymm4
  6486. vmovdqa ymm11,ymm8
  6487. vmovdqa ymm12,YMMWORD[$L$avx2_inc]
  6488. vpaddd ymm15,ymm12,YMMWORD[((160+160))+rbp]
  6489. vpaddd ymm14,ymm12,ymm15
  6490. vpaddd ymm13,ymm12,ymm14
  6491. vpaddd ymm12,ymm12,ymm13
  6492. vmovdqa YMMWORD[(160+256)+rbp],ymm15
  6493. vmovdqa YMMWORD[(160+224)+rbp],ymm14
  6494. vmovdqa YMMWORD[(160+192)+rbp],ymm13
  6495. vmovdqa YMMWORD[(160+160)+rbp],ymm12
  6496. mov rcx,10
  6497. ALIGN 32
  6498. $L$seal_avx2_main_loop_rounds:
  6499. add r10,QWORD[((0+0))+rdi]
  6500. adc r11,QWORD[((8+0))+rdi]
  6501. adc r12,1
  6502. vmovdqa YMMWORD[(160+128)+rbp],ymm8
  6503. vmovdqa ymm8,YMMWORD[$L$rol16]
  6504. vpaddd ymm3,ymm3,ymm7
  6505. vpaddd ymm2,ymm2,ymm6
  6506. vpaddd ymm1,ymm1,ymm5
  6507. vpaddd ymm0,ymm0,ymm4
  6508. vpxor ymm15,ymm15,ymm3
  6509. vpxor ymm14,ymm14,ymm2
  6510. vpxor ymm13,ymm13,ymm1
  6511. vpxor ymm12,ymm12,ymm0
  6512. mov rdx,QWORD[((0+160+0))+rbp]
  6513. mov r15,rdx
  6514. mulx r14,r13,r10
  6515. mulx rdx,rax,r11
  6516. imul r15,r12
  6517. add r14,rax
  6518. adc r15,rdx
  6519. vpshufb ymm15,ymm15,ymm8
  6520. vpshufb ymm14,ymm14,ymm8
  6521. vpshufb ymm13,ymm13,ymm8
  6522. vpshufb ymm12,ymm12,ymm8
  6523. vpaddd ymm11,ymm11,ymm15
  6524. vpaddd ymm10,ymm10,ymm14
  6525. vpaddd ymm9,ymm9,ymm13
  6526. vpaddd ymm8,ymm12,YMMWORD[((160+128))+rbp]
  6527. vpxor ymm7,ymm7,ymm11
  6528. mov rdx,QWORD[((8+160+0))+rbp]
  6529. mulx rax,r10,r10
  6530. add r14,r10
  6531. mulx r9,r11,r11
  6532. adc r15,r11
  6533. adc r9,0
  6534. imul rdx,r12
  6535. vpxor ymm6,ymm6,ymm10
  6536. vpxor ymm5,ymm5,ymm9
  6537. vpxor ymm4,ymm4,ymm8
  6538. vmovdqa YMMWORD[(160+128)+rbp],ymm8
  6539. vpsrld ymm8,ymm7,20
  6540. vpslld ymm7,ymm7,32-20
  6541. vpxor ymm7,ymm7,ymm8
  6542. vpsrld ymm8,ymm6,20
  6543. vpslld ymm6,ymm6,32-20
  6544. vpxor ymm6,ymm6,ymm8
  6545. vpsrld ymm8,ymm5,20
  6546. vpslld ymm5,ymm5,32-20
  6547. add r15,rax
  6548. adc r9,rdx
  6549. vpxor ymm5,ymm5,ymm8
  6550. vpsrld ymm8,ymm4,20
  6551. vpslld ymm4,ymm4,32-20
  6552. vpxor ymm4,ymm4,ymm8
  6553. vmovdqa ymm8,YMMWORD[$L$rol8]
  6554. vpaddd ymm3,ymm3,ymm7
  6555. vpaddd ymm2,ymm2,ymm6
  6556. vpaddd ymm1,ymm1,ymm5
  6557. vpaddd ymm0,ymm0,ymm4
  6558. vpxor ymm15,ymm15,ymm3
  6559. mov r10,r13
  6560. mov r11,r14
  6561. mov r12,r15
  6562. and r12,3
  6563. mov r13,r15
  6564. and r13,-4
  6565. mov r14,r9
  6566. shrd r15,r9,2
  6567. shr r9,2
  6568. add r15,r13
  6569. adc r9,r14
  6570. add r10,r15
  6571. adc r11,r9
  6572. adc r12,0
  6573. $L$seal_avx2_main_loop_rounds_entry:
  6574. vpxor ymm14,ymm14,ymm2
  6575. vpxor ymm13,ymm13,ymm1
  6576. vpxor ymm12,ymm12,ymm0
  6577. vpshufb ymm15,ymm15,ymm8
  6578. vpshufb ymm14,ymm14,ymm8
  6579. vpshufb ymm13,ymm13,ymm8
  6580. vpshufb ymm12,ymm12,ymm8
  6581. vpaddd ymm11,ymm11,ymm15
  6582. vpaddd ymm10,ymm10,ymm14
  6583. add r10,QWORD[((0+16))+rdi]
  6584. adc r11,QWORD[((8+16))+rdi]
  6585. adc r12,1
  6586. vpaddd ymm9,ymm9,ymm13
  6587. vpaddd ymm8,ymm12,YMMWORD[((160+128))+rbp]
  6588. vpxor ymm7,ymm7,ymm11
  6589. vpxor ymm6,ymm6,ymm10
  6590. vpxor ymm5,ymm5,ymm9
  6591. vpxor ymm4,ymm4,ymm8
  6592. vmovdqa YMMWORD[(160+128)+rbp],ymm8
  6593. vpsrld ymm8,ymm7,25
  6594. mov rdx,QWORD[((0+160+0))+rbp]
  6595. mov r15,rdx
  6596. mulx r14,r13,r10
  6597. mulx rdx,rax,r11
  6598. imul r15,r12
  6599. add r14,rax
  6600. adc r15,rdx
  6601. vpslld ymm7,ymm7,32-25
  6602. vpxor ymm7,ymm7,ymm8
  6603. vpsrld ymm8,ymm6,25
  6604. vpslld ymm6,ymm6,32-25
  6605. vpxor ymm6,ymm6,ymm8
  6606. vpsrld ymm8,ymm5,25
  6607. vpslld ymm5,ymm5,32-25
  6608. vpxor ymm5,ymm5,ymm8
  6609. vpsrld ymm8,ymm4,25
  6610. vpslld ymm4,ymm4,32-25
  6611. vpxor ymm4,ymm4,ymm8
  6612. vmovdqa ymm8,YMMWORD[((160+128))+rbp]
  6613. vpalignr ymm7,ymm7,ymm7,4
  6614. vpalignr ymm11,ymm11,ymm11,8
  6615. vpalignr ymm15,ymm15,ymm15,12
  6616. vpalignr ymm6,ymm6,ymm6,4
  6617. vpalignr ymm10,ymm10,ymm10,8
  6618. vpalignr ymm14,ymm14,ymm14,12
  6619. mov rdx,QWORD[((8+160+0))+rbp]
  6620. mulx rax,r10,r10
  6621. add r14,r10
  6622. mulx r9,r11,r11
  6623. adc r15,r11
  6624. adc r9,0
  6625. imul rdx,r12
  6626. vpalignr ymm5,ymm5,ymm5,4
  6627. vpalignr ymm9,ymm9,ymm9,8
  6628. vpalignr ymm13,ymm13,ymm13,12
  6629. vpalignr ymm4,ymm4,ymm4,4
  6630. vpalignr ymm8,ymm8,ymm8,8
  6631. vpalignr ymm12,ymm12,ymm12,12
  6632. vmovdqa YMMWORD[(160+128)+rbp],ymm8
  6633. vmovdqa ymm8,YMMWORD[$L$rol16]
  6634. vpaddd ymm3,ymm3,ymm7
  6635. vpaddd ymm2,ymm2,ymm6
  6636. vpaddd ymm1,ymm1,ymm5
  6637. vpaddd ymm0,ymm0,ymm4
  6638. vpxor ymm15,ymm15,ymm3
  6639. vpxor ymm14,ymm14,ymm2
  6640. vpxor ymm13,ymm13,ymm1
  6641. vpxor ymm12,ymm12,ymm0
  6642. vpshufb ymm15,ymm15,ymm8
  6643. vpshufb ymm14,ymm14,ymm8
  6644. add r15,rax
  6645. adc r9,rdx
  6646. vpshufb ymm13,ymm13,ymm8
  6647. vpshufb ymm12,ymm12,ymm8
  6648. vpaddd ymm11,ymm11,ymm15
  6649. vpaddd ymm10,ymm10,ymm14
  6650. vpaddd ymm9,ymm9,ymm13
  6651. vpaddd ymm8,ymm12,YMMWORD[((160+128))+rbp]
  6652. vpxor ymm7,ymm7,ymm11
  6653. vpxor ymm6,ymm6,ymm10
  6654. vpxor ymm5,ymm5,ymm9
  6655. mov r10,r13
  6656. mov r11,r14
  6657. mov r12,r15
  6658. and r12,3
  6659. mov r13,r15
  6660. and r13,-4
  6661. mov r14,r9
  6662. shrd r15,r9,2
  6663. shr r9,2
  6664. add r15,r13
  6665. adc r9,r14
  6666. add r10,r15
  6667. adc r11,r9
  6668. adc r12,0
  6669. vpxor ymm4,ymm4,ymm8
  6670. vmovdqa YMMWORD[(160+128)+rbp],ymm8
  6671. vpsrld ymm8,ymm7,20
  6672. vpslld ymm7,ymm7,32-20
  6673. vpxor ymm7,ymm7,ymm8
  6674. vpsrld ymm8,ymm6,20
  6675. vpslld ymm6,ymm6,32-20
  6676. vpxor ymm6,ymm6,ymm8
  6677. add r10,QWORD[((0+32))+rdi]
  6678. adc r11,QWORD[((8+32))+rdi]
  6679. adc r12,1
  6680. lea rdi,[48+rdi]
  6681. vpsrld ymm8,ymm5,20
  6682. vpslld ymm5,ymm5,32-20
  6683. vpxor ymm5,ymm5,ymm8
  6684. vpsrld ymm8,ymm4,20
  6685. vpslld ymm4,ymm4,32-20
  6686. vpxor ymm4,ymm4,ymm8
  6687. vmovdqa ymm8,YMMWORD[$L$rol8]
  6688. vpaddd ymm3,ymm3,ymm7
  6689. vpaddd ymm2,ymm2,ymm6
  6690. vpaddd ymm1,ymm1,ymm5
  6691. vpaddd ymm0,ymm0,ymm4
  6692. vpxor ymm15,ymm15,ymm3
  6693. vpxor ymm14,ymm14,ymm2
  6694. vpxor ymm13,ymm13,ymm1
  6695. vpxor ymm12,ymm12,ymm0
  6696. vpshufb ymm15,ymm15,ymm8
  6697. vpshufb ymm14,ymm14,ymm8
  6698. vpshufb ymm13,ymm13,ymm8
  6699. mov rdx,QWORD[((0+160+0))+rbp]
  6700. mov r15,rdx
  6701. mulx r14,r13,r10
  6702. mulx rdx,rax,r11
  6703. imul r15,r12
  6704. add r14,rax
  6705. adc r15,rdx
  6706. vpshufb ymm12,ymm12,ymm8
  6707. vpaddd ymm11,ymm11,ymm15
  6708. vpaddd ymm10,ymm10,ymm14
  6709. vpaddd ymm9,ymm9,ymm13
  6710. vpaddd ymm8,ymm12,YMMWORD[((160+128))+rbp]
  6711. vpxor ymm7,ymm7,ymm11
  6712. vpxor ymm6,ymm6,ymm10
  6713. vpxor ymm5,ymm5,ymm9
  6714. mov rdx,QWORD[((8+160+0))+rbp]
  6715. mulx rax,r10,r10
  6716. add r14,r10
  6717. mulx r9,r11,r11
  6718. adc r15,r11
  6719. adc r9,0
  6720. imul rdx,r12
  6721. vpxor ymm4,ymm4,ymm8
  6722. vmovdqa YMMWORD[(160+128)+rbp],ymm8
  6723. vpsrld ymm8,ymm7,25
  6724. vpslld ymm7,ymm7,32-25
  6725. vpxor ymm7,ymm7,ymm8
  6726. vpsrld ymm8,ymm6,25
  6727. vpslld ymm6,ymm6,32-25
  6728. vpxor ymm6,ymm6,ymm8
  6729. add r15,rax
  6730. adc r9,rdx
  6731. vpsrld ymm8,ymm5,25
  6732. vpslld ymm5,ymm5,32-25
  6733. vpxor ymm5,ymm5,ymm8
  6734. vpsrld ymm8,ymm4,25
  6735. vpslld ymm4,ymm4,32-25
  6736. vpxor ymm4,ymm4,ymm8
  6737. vmovdqa ymm8,YMMWORD[((160+128))+rbp]
  6738. vpalignr ymm7,ymm7,ymm7,12
  6739. vpalignr ymm11,ymm11,ymm11,8
  6740. vpalignr ymm15,ymm15,ymm15,4
  6741. vpalignr ymm6,ymm6,ymm6,12
  6742. vpalignr ymm10,ymm10,ymm10,8
  6743. vpalignr ymm14,ymm14,ymm14,4
  6744. vpalignr ymm5,ymm5,ymm5,12
  6745. vpalignr ymm9,ymm9,ymm9,8
  6746. vpalignr ymm13,ymm13,ymm13,4
  6747. vpalignr ymm4,ymm4,ymm4,12
  6748. vpalignr ymm8,ymm8,ymm8,8
  6749. mov r10,r13
  6750. mov r11,r14
  6751. mov r12,r15
  6752. and r12,3
  6753. mov r13,r15
  6754. and r13,-4
  6755. mov r14,r9
  6756. shrd r15,r9,2
  6757. shr r9,2
  6758. add r15,r13
  6759. adc r9,r14
  6760. add r10,r15
  6761. adc r11,r9
  6762. adc r12,0
  6763. vpalignr ymm12,ymm12,ymm12,4
  6764. dec rcx
  6765. jne NEAR $L$seal_avx2_main_loop_rounds
  6766. vpaddd ymm3,ymm3,YMMWORD[$L$chacha20_consts]
  6767. vpaddd ymm7,ymm7,YMMWORD[((160+64))+rbp]
  6768. vpaddd ymm11,ymm11,YMMWORD[((160+96))+rbp]
  6769. vpaddd ymm15,ymm15,YMMWORD[((160+256))+rbp]
  6770. vpaddd ymm2,ymm2,YMMWORD[$L$chacha20_consts]
  6771. vpaddd ymm6,ymm6,YMMWORD[((160+64))+rbp]
  6772. vpaddd ymm10,ymm10,YMMWORD[((160+96))+rbp]
  6773. vpaddd ymm14,ymm14,YMMWORD[((160+224))+rbp]
  6774. vpaddd ymm1,ymm1,YMMWORD[$L$chacha20_consts]
  6775. vpaddd ymm5,ymm5,YMMWORD[((160+64))+rbp]
  6776. vpaddd ymm9,ymm9,YMMWORD[((160+96))+rbp]
  6777. vpaddd ymm13,ymm13,YMMWORD[((160+192))+rbp]
  6778. vpaddd ymm0,ymm0,YMMWORD[$L$chacha20_consts]
  6779. vpaddd ymm4,ymm4,YMMWORD[((160+64))+rbp]
  6780. vpaddd ymm8,ymm8,YMMWORD[((160+96))+rbp]
  6781. vpaddd ymm12,ymm12,YMMWORD[((160+160))+rbp]
  6782. vmovdqa YMMWORD[(160+128)+rbp],ymm0
  6783. add r10,QWORD[((0+0))+rdi]
  6784. adc r11,QWORD[((8+0))+rdi]
  6785. adc r12,1
  6786. mov rdx,QWORD[((0+160+0))+rbp]
  6787. mov r15,rdx
  6788. mulx r14,r13,r10
  6789. mulx rdx,rax,r11
  6790. imul r15,r12
  6791. add r14,rax
  6792. adc r15,rdx
  6793. mov rdx,QWORD[((8+160+0))+rbp]
  6794. mulx rax,r10,r10
  6795. add r14,r10
  6796. mulx r9,r11,r11
  6797. adc r15,r11
  6798. adc r9,0
  6799. imul rdx,r12
  6800. add r15,rax
  6801. adc r9,rdx
  6802. mov r10,r13
  6803. mov r11,r14
  6804. mov r12,r15
  6805. and r12,3
  6806. mov r13,r15
  6807. and r13,-4
  6808. mov r14,r9
  6809. shrd r15,r9,2
  6810. shr r9,2
  6811. add r15,r13
  6812. adc r9,r14
  6813. add r10,r15
  6814. adc r11,r9
  6815. adc r12,0
  6816. add r10,QWORD[((0+16))+rdi]
  6817. adc r11,QWORD[((8+16))+rdi]
  6818. adc r12,1
  6819. mov rdx,QWORD[((0+160+0))+rbp]
  6820. mov r15,rdx
  6821. mulx r14,r13,r10
  6822. mulx rdx,rax,r11
  6823. imul r15,r12
  6824. add r14,rax
  6825. adc r15,rdx
  6826. mov rdx,QWORD[((8+160+0))+rbp]
  6827. mulx rax,r10,r10
  6828. add r14,r10
  6829. mulx r9,r11,r11
  6830. adc r15,r11
  6831. adc r9,0
  6832. imul rdx,r12
  6833. add r15,rax
  6834. adc r9,rdx
  6835. mov r10,r13
  6836. mov r11,r14
  6837. mov r12,r15
  6838. and r12,3
  6839. mov r13,r15
  6840. and r13,-4
  6841. mov r14,r9
  6842. shrd r15,r9,2
  6843. shr r9,2
  6844. add r15,r13
  6845. adc r9,r14
  6846. add r10,r15
  6847. adc r11,r9
  6848. adc r12,0
  6849. lea rdi,[32+rdi]
  6850. vperm2i128 ymm0,ymm7,ymm3,0x02
  6851. vperm2i128 ymm7,ymm7,ymm3,0x13
  6852. vperm2i128 ymm3,ymm15,ymm11,0x02
  6853. vperm2i128 ymm11,ymm15,ymm11,0x13
  6854. vpxor ymm0,ymm0,YMMWORD[((0+0))+rsi]
  6855. vpxor ymm3,ymm3,YMMWORD[((32+0))+rsi]
  6856. vpxor ymm7,ymm7,YMMWORD[((64+0))+rsi]
  6857. vpxor ymm11,ymm11,YMMWORD[((96+0))+rsi]
  6858. vmovdqu YMMWORD[(0+0)+rdi],ymm0
  6859. vmovdqu YMMWORD[(32+0)+rdi],ymm3
  6860. vmovdqu YMMWORD[(64+0)+rdi],ymm7
  6861. vmovdqu YMMWORD[(96+0)+rdi],ymm11
  6862. vmovdqa ymm0,YMMWORD[((160+128))+rbp]
  6863. vperm2i128 ymm3,ymm6,ymm2,0x02
  6864. vperm2i128 ymm6,ymm6,ymm2,0x13
  6865. vperm2i128 ymm2,ymm14,ymm10,0x02
  6866. vperm2i128 ymm10,ymm14,ymm10,0x13
  6867. vpxor ymm3,ymm3,YMMWORD[((0+128))+rsi]
  6868. vpxor ymm2,ymm2,YMMWORD[((32+128))+rsi]
  6869. vpxor ymm6,ymm6,YMMWORD[((64+128))+rsi]
  6870. vpxor ymm10,ymm10,YMMWORD[((96+128))+rsi]
  6871. vmovdqu YMMWORD[(0+128)+rdi],ymm3
  6872. vmovdqu YMMWORD[(32+128)+rdi],ymm2
  6873. vmovdqu YMMWORD[(64+128)+rdi],ymm6
  6874. vmovdqu YMMWORD[(96+128)+rdi],ymm10
  6875. vperm2i128 ymm3,ymm5,ymm1,0x02
  6876. vperm2i128 ymm5,ymm5,ymm1,0x13
  6877. vperm2i128 ymm1,ymm13,ymm9,0x02
  6878. vperm2i128 ymm9,ymm13,ymm9,0x13
  6879. vpxor ymm3,ymm3,YMMWORD[((0+256))+rsi]
  6880. vpxor ymm1,ymm1,YMMWORD[((32+256))+rsi]
  6881. vpxor ymm5,ymm5,YMMWORD[((64+256))+rsi]
  6882. vpxor ymm9,ymm9,YMMWORD[((96+256))+rsi]
  6883. vmovdqu YMMWORD[(0+256)+rdi],ymm3
  6884. vmovdqu YMMWORD[(32+256)+rdi],ymm1
  6885. vmovdqu YMMWORD[(64+256)+rdi],ymm5
  6886. vmovdqu YMMWORD[(96+256)+rdi],ymm9
  6887. vperm2i128 ymm3,ymm4,ymm0,0x02
  6888. vperm2i128 ymm4,ymm4,ymm0,0x13
  6889. vperm2i128 ymm0,ymm12,ymm8,0x02
  6890. vperm2i128 ymm8,ymm12,ymm8,0x13
  6891. vpxor ymm3,ymm3,YMMWORD[((0+384))+rsi]
  6892. vpxor ymm0,ymm0,YMMWORD[((32+384))+rsi]
  6893. vpxor ymm4,ymm4,YMMWORD[((64+384))+rsi]
  6894. vpxor ymm8,ymm8,YMMWORD[((96+384))+rsi]
  6895. vmovdqu YMMWORD[(0+384)+rdi],ymm3
  6896. vmovdqu YMMWORD[(32+384)+rdi],ymm0
  6897. vmovdqu YMMWORD[(64+384)+rdi],ymm4
  6898. vmovdqu YMMWORD[(96+384)+rdi],ymm8
  6899. lea rsi,[512+rsi]
  6900. sub rbx,16*32
  6901. cmp rbx,16*32
  6902. jg NEAR $L$seal_avx2_main_loop
  ; Fall-through after the 512-byte main loop: absorb two more 16-byte blocks
  ; at rdi into the Poly1305 accumulator r10:r11:r12, advance rdi by 32, then
  ; choose a tail routine from the byte count left in rbx.
  ; NOTE(review): the two multiplier qwords at [160+rbp] and [168+rbp] are
  ; presumably the clamped Poly1305 key "r" stored earlier - confirm.
  ; acc += 16 bytes at [rdi]; `adc r12,1` also adds 1 to the top limb (2^128).
  6903. add r10,QWORD[((0+0))+rdi]
  6904. adc r11,QWORD[((8+0))+rdi]
  6905. adc r12,1
  ; acc * multiplier: low qword first, product collected in r13,r14,r15,r9.
  6906. mov rdx,QWORD[((0+160+0))+rbp]
  6907. mov r15,rdx
  6908. mulx r14,r13,r10
  6909. mulx rdx,rax,r11
  6910. imul r15,r12
  6911. add r14,rax
  6912. adc r15,rdx
  ; ... then the partial products with the high multiplier qword.
  6913. mov rdx,QWORD[((8+160+0))+rbp]
  6914. mulx rax,r10,r10
  6915. add r14,r10
  6916. mulx r9,r11,r11
  6917. adc r15,r11
  6918. adc r9,0
  6919. imul rdx,r12
  6920. add r15,rax
  6921. adc r9,rdx
  ; Partial reduction mod 2^130-5: keep the low 130 bits and add back
  ; (t & ~3) + (t >> 2) = 5*(t >> 2), where t is the part above bit 128.
  6922. mov r10,r13
  6923. mov r11,r14
  6924. mov r12,r15
  6925. and r12,3
  6926. mov r13,r15
  6927. and r13,-4
  6928. mov r14,r9
  6929. shrd r15,r9,2
  6930. shr r9,2
  6931. add r15,r13
  6932. adc r9,r14
  6933. add r10,r15
  6934. adc r11,r9
  6935. adc r12,0
  ; Same add / multiply / reduce for the next 16 bytes at [rdi+16].
  6936. add r10,QWORD[((0+16))+rdi]
  6937. adc r11,QWORD[((8+16))+rdi]
  6938. adc r12,1
  6939. mov rdx,QWORD[((0+160+0))+rbp]
  6940. mov r15,rdx
  6941. mulx r14,r13,r10
  6942. mulx rdx,rax,r11
  6943. imul r15,r12
  6944. add r14,rax
  6945. adc r15,rdx
  6946. mov rdx,QWORD[((8+160+0))+rbp]
  6947. mulx rax,r10,r10
  6948. add r14,r10
  6949. mulx r9,r11,r11
  6950. adc r15,r11
  6951. adc r9,0
  6952. imul rdx,r12
  6953. add r15,rax
  6954. adc r9,rdx
  6955. mov r10,r13
  6956. mov r11,r14
  6957. mov r12,r15
  6958. and r12,3
  6959. mov r13,r15
  6960. and r13,-4
  6961. mov r14,r9
  6962. shrd r15,r9,2
  6963. shr r9,2
  6964. add r15,r13
  6965. adc r9,r14
  6966. add r10,r15
  6967. adc r11,r9
  6968. adc r12,0
  ; Step past the two blocks just absorbed.
  6969. lea rdi,[32+rdi]
  ; Loop counters consumed by the tail routines: rcx = 10, r8 = 0.
  6970. mov rcx,10
  6971. xor r8,r8
  ; Unsigned dispatch on the remaining length in rbx:
  ;   > 384 -> tail_512, > 256 -> tail_384, > 128 -> tail_256,
  ;   otherwise fall through into tail_128.
  6972. cmp rbx,12*32
  6973. ja NEAR $L$seal_avx2_tail_512
  6974. cmp rbx,8*32
  6975. ja NEAR $L$seal_avx2_tail_384
  6976. cmp rbx,4*32
  6977. ja NEAR $L$seal_avx2_tail_256
  ; seal_avx2_tail_128: reached when at most 128 bytes remain (rbx <= 4*32 at
  ; the dispatch above). Generates one ymm-wide ChaCha20 state (two blocks,
  ; one per 128-bit lane) while continuing to absorb 16-byte blocks at rdi
  ; into the Poly1305 accumulator r10:r11:r12.
  ; Registers: ymm0/ymm4/ymm8/ymm12 = state rows, ymm3 = scratch,
  ;            rcx/r8 = iteration counters, r13-r15,r9,rax,rdx = hash scratch.
  6978. $L$seal_avx2_tail_128:
  ; Row 0 = constants, rows 1-2 = saved rows at [160+64+rbp] / [160+96+rbp].
  6979. vmovdqa ymm0,YMMWORD[$L$chacha20_consts]
  6980. vmovdqa ymm4,YMMWORD[((160+64))+rbp]
  6981. vmovdqa ymm8,YMMWORD[((160+96))+rbp]
  ; Row 3 = saved counter row + $L$avx2_inc; the advanced row is written back
  ; so the feed-forward after the rounds adds the same value.
  6982. vmovdqa ymm12,YMMWORD[$L$avx2_inc]
  6983. vpaddd ymm12,ymm12,YMMWORD[((160+160))+rbp]
  6984. vmovdqa YMMWORD[(160+160)+rbp],ymm12
  ; Entry used while rcx > 0: absorbs one extra 16-byte block, so this
  ; iteration hashes three blocks in total.
  6985. $L$seal_avx2_tail_128_rounds_and_3xhash:
  ; acc += 16 bytes at [rdi] plus 2^128, then acc *= multiplier at
  ; [160+rbp]/[168+rbp] (mulx form) with partial reduction mod 2^130-5.
  6986. add r10,QWORD[((0+0))+rdi]
  6987. adc r11,QWORD[((8+0))+rdi]
  6988. adc r12,1
  6989. mov rdx,QWORD[((0+160+0))+rbp]
  6990. mov r15,rdx
  6991. mulx r14,r13,r10
  6992. mulx rdx,rax,r11
  6993. imul r15,r12
  6994. add r14,rax
  6995. adc r15,rdx
  6996. mov rdx,QWORD[((8+160+0))+rbp]
  6997. mulx rax,r10,r10
  6998. add r14,r10
  6999. mulx r9,r11,r11
  7000. adc r15,r11
  7001. adc r9,0
  7002. imul rdx,r12
  7003. add r15,rax
  7004. adc r9,rdx
  7005. mov r10,r13
  7006. mov r11,r14
  7007. mov r12,r15
  7008. and r12,3
  7009. mov r13,r15
  7010. and r13,-4
  7011. mov r14,r9
  7012. shrd r15,r9,2
  7013. shr r9,2
  7014. add r15,r13
  7015. adc r9,r14
  7016. add r10,r15
  7017. adc r11,r9
  7018. adc r12,0
  7019. lea rdi,[16+rdi]
  ; Entry used once rcx is exhausted and r8 >= 0: one double round plus two
  ; hashed blocks.
  7020. $L$seal_avx2_tail_128_rounds_and_2xhash:
  ; First quarter-round set: add/xor with left-rotates by 16, 12, 8, 7
  ; (16 and 8 via vpshufb tables, 12 and 7 via shift pairs).
  7021. vpaddd ymm0,ymm0,ymm4
  7022. vpxor ymm12,ymm12,ymm0
  7023. vpshufb ymm12,ymm12,YMMWORD[$L$rol16]
  7024. vpaddd ymm8,ymm8,ymm12
  7025. vpxor ymm4,ymm4,ymm8
  7026. vpsrld ymm3,ymm4,20
  7027. vpslld ymm4,ymm4,12
  7028. vpxor ymm4,ymm4,ymm3
  7029. vpaddd ymm0,ymm0,ymm4
  7030. vpxor ymm12,ymm12,ymm0
  7031. vpshufb ymm12,ymm12,YMMWORD[$L$rol8]
  7032. vpaddd ymm8,ymm8,ymm12
  7033. vpxor ymm4,ymm4,ymm8
  7034. vpslld ymm3,ymm4,7
  7035. vpsrld ymm4,ymm4,25
  7036. vpxor ymm4,ymm4,ymm3
  ; Rotate rows 1-3 within each lane so the next set works on diagonals.
  7037. vpalignr ymm12,ymm12,ymm12,12
  7038. vpalignr ymm8,ymm8,ymm8,8
  7039. vpalignr ymm4,ymm4,ymm4,4
  ; Hash the 16 bytes at [rdi].
  7040. add r10,QWORD[((0+0))+rdi]
  7041. adc r11,QWORD[((8+0))+rdi]
  7042. adc r12,1
  7043. mov rdx,QWORD[((0+160+0))+rbp]
  7044. mov r15,rdx
  7045. mulx r14,r13,r10
  7046. mulx rdx,rax,r11
  7047. imul r15,r12
  7048. add r14,rax
  7049. adc r15,rdx
  7050. mov rdx,QWORD[((8+160+0))+rbp]
  7051. mulx rax,r10,r10
  7052. add r14,r10
  7053. mulx r9,r11,r11
  7054. adc r15,r11
  7055. adc r9,0
  7056. imul rdx,r12
  7057. add r15,rax
  7058. adc r9,rdx
  7059. mov r10,r13
  7060. mov r11,r14
  7061. mov r12,r15
  7062. and r12,3
  7063. mov r13,r15
  7064. and r13,-4
  7065. mov r14,r9
  7066. shrd r15,r9,2
  7067. shr r9,2
  7068. add r15,r13
  7069. adc r9,r14
  7070. add r10,r15
  7071. adc r11,r9
  7072. adc r12,0
  ; Second quarter-round set, on the rotated rows.
  7073. vpaddd ymm0,ymm0,ymm4
  7074. vpxor ymm12,ymm12,ymm0
  7075. vpshufb ymm12,ymm12,YMMWORD[$L$rol16]
  7076. vpaddd ymm8,ymm8,ymm12
  7077. vpxor ymm4,ymm4,ymm8
  7078. vpsrld ymm3,ymm4,20
  7079. vpslld ymm4,ymm4,12
  7080. vpxor ymm4,ymm4,ymm3
  7081. vpaddd ymm0,ymm0,ymm4
  7082. vpxor ymm12,ymm12,ymm0
  7083. vpshufb ymm12,ymm12,YMMWORD[$L$rol8]
  7084. vpaddd ymm8,ymm8,ymm12
  7085. vpxor ymm4,ymm4,ymm8
  7086. vpslld ymm3,ymm4,7
  7087. vpsrld ymm4,ymm4,25
  7088. vpxor ymm4,ymm4,ymm3
  ; Undo the row rotation.
  7089. vpalignr ymm12,ymm12,ymm12,4
  7090. vpalignr ymm8,ymm8,ymm8,8
  7091. vpalignr ymm4,ymm4,ymm4,12
  ; Hash the 16 bytes at [rdi+16].
  7092. add r10,QWORD[((0+16))+rdi]
  7093. adc r11,QWORD[((8+16))+rdi]
  7094. adc r12,1
  7095. mov rdx,QWORD[((0+160+0))+rbp]
  7096. mov r15,rdx
  7097. mulx r14,r13,r10
  7098. mulx rdx,rax,r11
  7099. imul r15,r12
  7100. add r14,rax
  7101. adc r15,rdx
  7102. mov rdx,QWORD[((8+160+0))+rbp]
  7103. mulx rax,r10,r10
  7104. add r14,r10
  7105. mulx r9,r11,r11
  7106. adc r15,r11
  7107. adc r9,0
  7108. imul rdx,r12
  7109. add r15,rax
  7110. adc r9,rdx
  7111. mov r10,r13
  7112. mov r11,r14
  7113. mov r12,r15
  7114. and r12,3
  7115. mov r13,r15
  7116. and r13,-4
  7117. mov r14,r9
  7118. shrd r15,r9,2
  7119. shr r9,2
  7120. add r15,r13
  7121. adc r9,r14
  7122. add r10,r15
  7123. adc r11,r9
  7124. adc r12,0
  7125. lea rdi,[32+rdi]
  ; rcx counts the three-block iterations; once it is used up, r8 counts any
  ; further two-block iterations. The dispatch code falling into this label
  ; sets rcx=10, r8=0, giving exactly 10 double rounds.
  7126. dec rcx
  7127. jg NEAR $L$seal_avx2_tail_128_rounds_and_3xhash
  7128. dec r8
  7129. jge NEAR $L$seal_avx2_tail_128_rounds_and_2xhash
  ; Feed-forward: add the initial rows back into the permuted state.
  7130. vpaddd ymm0,ymm0,YMMWORD[$L$chacha20_consts]
  7131. vpaddd ymm4,ymm4,YMMWORD[((160+64))+rbp]
  7132. vpaddd ymm8,ymm8,YMMWORD[((160+96))+rbp]
  7133. vpaddd ymm12,ymm12,YMMWORD[((160+160))+rbp]
  ; De-interleave lanes: 0x02 gathers the low lanes of two rows, 0x13 the
  ; high lanes, leaving 128 bytes in output order in ymm0, ymm4, ymm8, ymm12.
  7134. vperm2i128 ymm3,ymm4,ymm0,0x13
  7135. vperm2i128 ymm0,ymm4,ymm0,0x02
  7136. vperm2i128 ymm4,ymm12,ymm8,0x02
  7137. vperm2i128 ymm12,ymm12,ymm8,0x13
  7138. vmovdqa ymm8,ymm3
  ; Nothing is XORed or stored here; that is left to the jump target.
  7139. jmp NEAR $L$seal_avx2_short_loop
  ; seal_avx2_tail_256: reached when 128 < rbx <= 256 at the dispatch above.
  ; Runs two ymm-wide ChaCha20 states (four blocks) while absorbing 16-byte
  ; blocks at rdi into the Poly1305 accumulator r10:r11:r12. The first 128
  ; bytes are encrypted here; the second 128 bytes of keystream are handed on
  ; in ymm0, ymm4, ymm8, ymm12.
  ; Registers: state 0 = ymm0/4/8/12, state 1 = ymm1/5/9/13, ymm3 = scratch.
  7140. $L$seal_avx2_tail_256:
  ; Both states share rows 0-2 (constants and the two saved rows).
  7141. vmovdqa ymm0,YMMWORD[$L$chacha20_consts]
  7142. vmovdqa ymm4,YMMWORD[((160+64))+rbp]
  7143. vmovdqa ymm8,YMMWORD[((160+96))+rbp]
  7144. vmovdqa ymm1,ymm0
  7145. vmovdqa ymm5,ymm4
  7146. vmovdqa ymm9,ymm8
  ; Counter rows: ymm13 = saved + inc, ymm12 = saved + 2*inc. Both are stored
  ; so the feed-forward below can add them back.
  7147. vmovdqa ymm12,YMMWORD[$L$avx2_inc]
  7148. vpaddd ymm13,ymm12,YMMWORD[((160+160))+rbp]
  7149. vpaddd ymm12,ymm12,ymm13
  7150. vmovdqa YMMWORD[(160+160)+rbp],ymm12
  7151. vmovdqa YMMWORD[(160+192)+rbp],ymm13
  ; Entry used while rcx > 0: absorbs one extra 16-byte block first.
  7152. $L$seal_avx2_tail_256_rounds_and_3xhash:
  ; acc += 16 bytes at [rdi] plus 2^128; acc *= multiplier at
  ; [160+rbp]/[168+rbp]. This path uses plain `mul` (rdx:rax) instead of mulx.
  7153. add r10,QWORD[((0+0))+rdi]
  7154. adc r11,QWORD[((8+0))+rdi]
  7155. adc r12,1
  7156. mov rax,QWORD[((0+160+0))+rbp]
  7157. mov r15,rax
  7158. mul r10
  7159. mov r13,rax
  7160. mov r14,rdx
  7161. mov rax,QWORD[((0+160+0))+rbp]
  7162. mul r11
  7163. imul r15,r12
  7164. add r14,rax
  7165. adc r15,rdx
  7166. mov rax,QWORD[((8+160+0))+rbp]
  7167. mov r9,rax
  7168. mul r10
  7169. add r14,rax
  7170. adc rdx,0
  7171. mov r10,rdx
  7172. mov rax,QWORD[((8+160+0))+rbp]
  7173. mul r11
  7174. add r15,rax
  7175. adc rdx,0
  7176. imul r9,r12
  7177. add r15,r10
  7178. adc r9,rdx
  ; Partial reduction mod 2^130-5 of the product in r13,r14,r15,r9.
  7179. mov r10,r13
  7180. mov r11,r14
  7181. mov r12,r15
  7182. and r12,3
  7183. mov r13,r15
  7184. and r13,-4
  7185. mov r14,r9
  7186. shrd r15,r9,2
  7187. shr r9,2
  7188. add r15,r13
  7189. adc r9,r14
  7190. add r10,r15
  7191. adc r11,r9
  7192. adc r12,0
  7193. lea rdi,[16+rdi]
  ; Entry used once rcx is exhausted and r8 >= 0.
  7194. $L$seal_avx2_tail_256_rounds_and_2xhash:
  ; State 0, first quarter-round set (rotates 16, 12, 8, 7), then rotate
  ; rows 1-3 into diagonal position.
  7195. vpaddd ymm0,ymm0,ymm4
  7196. vpxor ymm12,ymm12,ymm0
  7197. vpshufb ymm12,ymm12,YMMWORD[$L$rol16]
  7198. vpaddd ymm8,ymm8,ymm12
  7199. vpxor ymm4,ymm4,ymm8
  7200. vpsrld ymm3,ymm4,20
  7201. vpslld ymm4,ymm4,12
  7202. vpxor ymm4,ymm4,ymm3
  7203. vpaddd ymm0,ymm0,ymm4
  7204. vpxor ymm12,ymm12,ymm0
  7205. vpshufb ymm12,ymm12,YMMWORD[$L$rol8]
  7206. vpaddd ymm8,ymm8,ymm12
  7207. vpxor ymm4,ymm4,ymm8
  7208. vpslld ymm3,ymm4,7
  7209. vpsrld ymm4,ymm4,25
  7210. vpxor ymm4,ymm4,ymm3
  7211. vpalignr ymm12,ymm12,ymm12,12
  7212. vpalignr ymm8,ymm8,ymm8,8
  7213. vpalignr ymm4,ymm4,ymm4,4
  ; State 1, same first set and row rotation.
  7214. vpaddd ymm1,ymm1,ymm5
  7215. vpxor ymm13,ymm13,ymm1
  7216. vpshufb ymm13,ymm13,YMMWORD[$L$rol16]
  7217. vpaddd ymm9,ymm9,ymm13
  7218. vpxor ymm5,ymm5,ymm9
  7219. vpsrld ymm3,ymm5,20
  7220. vpslld ymm5,ymm5,12
  7221. vpxor ymm5,ymm5,ymm3
  7222. vpaddd ymm1,ymm1,ymm5
  7223. vpxor ymm13,ymm13,ymm1
  7224. vpshufb ymm13,ymm13,YMMWORD[$L$rol8]
  7225. vpaddd ymm9,ymm9,ymm13
  7226. vpxor ymm5,ymm5,ymm9
  7227. vpslld ymm3,ymm5,7
  7228. vpsrld ymm5,ymm5,25
  7229. vpxor ymm5,ymm5,ymm3
  7230. vpalignr ymm13,ymm13,ymm13,12
  7231. vpalignr ymm9,ymm9,ymm9,8
  7232. vpalignr ymm5,ymm5,ymm5,4
  ; Hash the 16 bytes at [rdi].
  7233. add r10,QWORD[((0+0))+rdi]
  7234. adc r11,QWORD[((8+0))+rdi]
  7235. adc r12,1
  7236. mov rax,QWORD[((0+160+0))+rbp]
  7237. mov r15,rax
  7238. mul r10
  7239. mov r13,rax
  7240. mov r14,rdx
  7241. mov rax,QWORD[((0+160+0))+rbp]
  7242. mul r11
  7243. imul r15,r12
  7244. add r14,rax
  7245. adc r15,rdx
  7246. mov rax,QWORD[((8+160+0))+rbp]
  7247. mov r9,rax
  7248. mul r10
  7249. add r14,rax
  7250. adc rdx,0
  7251. mov r10,rdx
  7252. mov rax,QWORD[((8+160+0))+rbp]
  7253. mul r11
  7254. add r15,rax
  7255. adc rdx,0
  7256. imul r9,r12
  7257. add r15,r10
  7258. adc r9,rdx
  7259. mov r10,r13
  7260. mov r11,r14
  7261. mov r12,r15
  7262. and r12,3
  7263. mov r13,r15
  7264. and r13,-4
  7265. mov r14,r9
  7266. shrd r15,r9,2
  7267. shr r9,2
  7268. add r15,r13
  7269. adc r9,r14
  7270. add r10,r15
  7271. adc r11,r9
  7272. adc r12,0
  ; State 0, second quarter-round set on the rotated rows, then rotate back.
  7273. vpaddd ymm0,ymm0,ymm4
  7274. vpxor ymm12,ymm12,ymm0
  7275. vpshufb ymm12,ymm12,YMMWORD[$L$rol16]
  7276. vpaddd ymm8,ymm8,ymm12
  7277. vpxor ymm4,ymm4,ymm8
  7278. vpsrld ymm3,ymm4,20
  7279. vpslld ymm4,ymm4,12
  7280. vpxor ymm4,ymm4,ymm3
  7281. vpaddd ymm0,ymm0,ymm4
  7282. vpxor ymm12,ymm12,ymm0
  7283. vpshufb ymm12,ymm12,YMMWORD[$L$rol8]
  7284. vpaddd ymm8,ymm8,ymm12
  7285. vpxor ymm4,ymm4,ymm8
  7286. vpslld ymm3,ymm4,7
  7287. vpsrld ymm4,ymm4,25
  7288. vpxor ymm4,ymm4,ymm3
  7289. vpalignr ymm12,ymm12,ymm12,4
  7290. vpalignr ymm8,ymm8,ymm8,8
  7291. vpalignr ymm4,ymm4,ymm4,12
  ; State 1, second set and rotate back.
  7292. vpaddd ymm1,ymm1,ymm5
  7293. vpxor ymm13,ymm13,ymm1
  7294. vpshufb ymm13,ymm13,YMMWORD[$L$rol16]
  7295. vpaddd ymm9,ymm9,ymm13
  7296. vpxor ymm5,ymm5,ymm9
  7297. vpsrld ymm3,ymm5,20
  7298. vpslld ymm5,ymm5,12
  7299. vpxor ymm5,ymm5,ymm3
  7300. vpaddd ymm1,ymm1,ymm5
  7301. vpxor ymm13,ymm13,ymm1
  7302. vpshufb ymm13,ymm13,YMMWORD[$L$rol8]
  7303. vpaddd ymm9,ymm9,ymm13
  7304. vpxor ymm5,ymm5,ymm9
  7305. vpslld ymm3,ymm5,7
  7306. vpsrld ymm5,ymm5,25
  7307. vpxor ymm5,ymm5,ymm3
  7308. vpalignr ymm13,ymm13,ymm13,4
  7309. vpalignr ymm9,ymm9,ymm9,8
  7310. vpalignr ymm5,ymm5,ymm5,12
  ; Hash the 16 bytes at [rdi+16].
  7311. add r10,QWORD[((0+16))+rdi]
  7312. adc r11,QWORD[((8+16))+rdi]
  7313. adc r12,1
  7314. mov rax,QWORD[((0+160+0))+rbp]
  7315. mov r15,rax
  7316. mul r10
  7317. mov r13,rax
  7318. mov r14,rdx
  7319. mov rax,QWORD[((0+160+0))+rbp]
  7320. mul r11
  7321. imul r15,r12
  7322. add r14,rax
  7323. adc r15,rdx
  7324. mov rax,QWORD[((8+160+0))+rbp]
  7325. mov r9,rax
  7326. mul r10
  7327. add r14,rax
  7328. adc rdx,0
  7329. mov r10,rdx
  7330. mov rax,QWORD[((8+160+0))+rbp]
  7331. mul r11
  7332. add r15,rax
  7333. adc rdx,0
  7334. imul r9,r12
  7335. add r15,r10
  7336. adc r9,rdx
  7337. mov r10,r13
  7338. mov r11,r14
  7339. mov r12,r15
  7340. and r12,3
  7341. mov r13,r15
  7342. and r13,-4
  7343. mov r14,r9
  7344. shrd r15,r9,2
  7345. shr r9,2
  7346. add r15,r13
  7347. adc r9,r14
  7348. add r10,r15
  7349. adc r11,r9
  7350. adc r12,0
  7351. lea rdi,[32+rdi]
  ; rcx counts three-block iterations, then r8 counts two-block iterations.
  7352. dec rcx
  7353. jg NEAR $L$seal_avx2_tail_256_rounds_and_3xhash
  7354. dec r8
  7355. jge NEAR $L$seal_avx2_tail_256_rounds_and_2xhash
  ; Feed-forward: add the initial rows (and each state's own counter row).
  7356. vpaddd ymm1,ymm1,YMMWORD[$L$chacha20_consts]
  7357. vpaddd ymm5,ymm5,YMMWORD[((160+64))+rbp]
  7358. vpaddd ymm9,ymm9,YMMWORD[((160+96))+rbp]
  7359. vpaddd ymm13,ymm13,YMMWORD[((160+192))+rbp]
  7360. vpaddd ymm0,ymm0,YMMWORD[$L$chacha20_consts]
  7361. vpaddd ymm4,ymm4,YMMWORD[((160+64))+rbp]
  7362. vpaddd ymm8,ymm8,YMMWORD[((160+96))+rbp]
  7363. vpaddd ymm12,ymm12,YMMWORD[((160+160))+rbp]
  ; State 1: de-interleave lanes (0x02 = low lanes, 0x13 = high lanes), XOR
  ; with 128 input bytes at rsi and store the result at rdi.
  7364. vperm2i128 ymm3,ymm5,ymm1,0x02
  7365. vperm2i128 ymm5,ymm5,ymm1,0x13
  7366. vperm2i128 ymm1,ymm13,ymm9,0x02
  7367. vperm2i128 ymm9,ymm13,ymm9,0x13
  7368. vpxor ymm3,ymm3,YMMWORD[((0+0))+rsi]
  7369. vpxor ymm1,ymm1,YMMWORD[((32+0))+rsi]
  7370. vpxor ymm5,ymm5,YMMWORD[((64+0))+rsi]
  7371. vpxor ymm9,ymm9,YMMWORD[((96+0))+rsi]
  7372. vmovdqu YMMWORD[(0+0)+rdi],ymm3
  7373. vmovdqu YMMWORD[(32+0)+rdi],ymm1
  7374. vmovdqu YMMWORD[(64+0)+rdi],ymm5
  7375. vmovdqu YMMWORD[(96+0)+rdi],ymm9
  ; State 0: de-interleave only; keystream left in ymm0, ymm4, ymm8, ymm12.
  7376. vperm2i128 ymm3,ymm4,ymm0,0x13
  7377. vperm2i128 ymm0,ymm4,ymm0,0x02
  7378. vperm2i128 ymm4,ymm12,ymm8,0x02
  7379. vperm2i128 ymm12,ymm12,ymm8,0x13
  7380. vmovdqa ymm8,ymm3
  ; rcx = 128, the number of bytes just stored at rdi.
  ; NOTE(review): presumably the count the jump target still has to hash -
  ; confirm against $L$seal_avx2_short_hash_remainder.
  7381. mov rcx,4*32
  ; Consume 128 input bytes.
  7382. lea rsi,[128+rsi]
  7383. sub rbx,4*32
  7384. jmp NEAR $L$seal_avx2_short_hash_remainder
  ; seal_avx2_tail_384: reached when 256 < rbx <= 384 at the dispatch above.
  ; Runs three ymm-wide ChaCha20 states (six blocks) while absorbing 16-byte
  ; blocks at rdi into the Poly1305 accumulator r10:r11:r12. The first 256
  ; bytes are encrypted here; the last 128 bytes of keystream are handed on
  ; in ymm0, ymm4, ymm8, ymm12.
  ; Registers: state 0 = ymm0/4/8/12, state 1 = ymm1/5/9/13,
  ;            state 2 = ymm2/6/10/14, ymm3 = scratch.
  7385. $L$seal_avx2_tail_384:
  ; All three states share rows 0-2 (constants and the two saved rows).
  7386. vmovdqa ymm0,YMMWORD[$L$chacha20_consts]
  7387. vmovdqa ymm4,YMMWORD[((160+64))+rbp]
  7388. vmovdqa ymm8,YMMWORD[((160+96))+rbp]
  7389. vmovdqa ymm1,ymm0
  7390. vmovdqa ymm5,ymm4
  7391. vmovdqa ymm9,ymm8
  7392. vmovdqa ymm2,ymm0
  7393. vmovdqa ymm6,ymm4
  7394. vmovdqa ymm10,ymm8
  ; Counter rows: ymm14 = saved + inc, ymm13 = saved + 2*inc,
  ; ymm12 = saved + 3*inc. Each is stored for the feed-forward below.
  7395. vmovdqa ymm12,YMMWORD[$L$avx2_inc]
  7396. vpaddd ymm14,ymm12,YMMWORD[((160+160))+rbp]
  7397. vpaddd ymm13,ymm12,ymm14
  7398. vpaddd ymm12,ymm12,ymm13
  7399. vmovdqa YMMWORD[(160+160)+rbp],ymm12
  7400. vmovdqa YMMWORD[(160+192)+rbp],ymm13
  7401. vmovdqa YMMWORD[(160+224)+rbp],ymm14
  ; Entry used while rcx > 0: absorbs one extra 16-byte block first.
  7402. $L$seal_avx2_tail_384_rounds_and_3xhash:
  ; acc += 16 bytes at [rdi] plus 2^128; acc *= multiplier at
  ; [160+rbp]/[168+rbp] using plain `mul`, then partial reduction mod 2^130-5.
  7403. add r10,QWORD[((0+0))+rdi]
  7404. adc r11,QWORD[((8+0))+rdi]
  7405. adc r12,1
  7406. mov rax,QWORD[((0+160+0))+rbp]
  7407. mov r15,rax
  7408. mul r10
  7409. mov r13,rax
  7410. mov r14,rdx
  7411. mov rax,QWORD[((0+160+0))+rbp]
  7412. mul r11
  7413. imul r15,r12
  7414. add r14,rax
  7415. adc r15,rdx
  7416. mov rax,QWORD[((8+160+0))+rbp]
  7417. mov r9,rax
  7418. mul r10
  7419. add r14,rax
  7420. adc rdx,0
  7421. mov r10,rdx
  7422. mov rax,QWORD[((8+160+0))+rbp]
  7423. mul r11
  7424. add r15,rax
  7425. adc rdx,0
  7426. imul r9,r12
  7427. add r15,r10
  7428. adc r9,rdx
  7429. mov r10,r13
  7430. mov r11,r14
  7431. mov r12,r15
  7432. and r12,3
  7433. mov r13,r15
  7434. and r13,-4
  7435. mov r14,r9
  7436. shrd r15,r9,2
  7437. shr r9,2
  7438. add r15,r13
  7439. adc r9,r14
  7440. add r10,r15
  7441. adc r11,r9
  7442. adc r12,0
  7443. lea rdi,[16+rdi]
  ; Entry used once rcx is exhausted and r8 >= 0.
  7444. $L$seal_avx2_tail_384_rounds_and_2xhash:
  ; State 0, first quarter-round set (rotates 16, 12, 8, 7), then rotate
  ; rows 1-3 into diagonal position.
  7445. vpaddd ymm0,ymm0,ymm4
  7446. vpxor ymm12,ymm12,ymm0
  7447. vpshufb ymm12,ymm12,YMMWORD[$L$rol16]
  7448. vpaddd ymm8,ymm8,ymm12
  7449. vpxor ymm4,ymm4,ymm8
  7450. vpsrld ymm3,ymm4,20
  7451. vpslld ymm4,ymm4,12
  7452. vpxor ymm4,ymm4,ymm3
  7453. vpaddd ymm0,ymm0,ymm4
  7454. vpxor ymm12,ymm12,ymm0
  7455. vpshufb ymm12,ymm12,YMMWORD[$L$rol8]
  7456. vpaddd ymm8,ymm8,ymm12
  7457. vpxor ymm4,ymm4,ymm8
  7458. vpslld ymm3,ymm4,7
  7459. vpsrld ymm4,ymm4,25
  7460. vpxor ymm4,ymm4,ymm3
  7461. vpalignr ymm12,ymm12,ymm12,12
  7462. vpalignr ymm8,ymm8,ymm8,8
  7463. vpalignr ymm4,ymm4,ymm4,4
  ; State 1, first set and row rotation.
  7464. vpaddd ymm1,ymm1,ymm5
  7465. vpxor ymm13,ymm13,ymm1
  7466. vpshufb ymm13,ymm13,YMMWORD[$L$rol16]
  7467. vpaddd ymm9,ymm9,ymm13
  7468. vpxor ymm5,ymm5,ymm9
  7469. vpsrld ymm3,ymm5,20
  7470. vpslld ymm5,ymm5,12
  7471. vpxor ymm5,ymm5,ymm3
  7472. vpaddd ymm1,ymm1,ymm5
  7473. vpxor ymm13,ymm13,ymm1
  7474. vpshufb ymm13,ymm13,YMMWORD[$L$rol8]
  7475. vpaddd ymm9,ymm9,ymm13
  7476. vpxor ymm5,ymm5,ymm9
  7477. vpslld ymm3,ymm5,7
  7478. vpsrld ymm5,ymm5,25
  7479. vpxor ymm5,ymm5,ymm3
  7480. vpalignr ymm13,ymm13,ymm13,12
  7481. vpalignr ymm9,ymm9,ymm9,8
  7482. vpalignr ymm5,ymm5,ymm5,4
  ; Hash the 16 bytes at [rdi].
  7483. add r10,QWORD[((0+0))+rdi]
  7484. adc r11,QWORD[((8+0))+rdi]
  7485. adc r12,1
  7486. mov rax,QWORD[((0+160+0))+rbp]
  7487. mov r15,rax
  7488. mul r10
  7489. mov r13,rax
  7490. mov r14,rdx
  7491. mov rax,QWORD[((0+160+0))+rbp]
  7492. mul r11
  7493. imul r15,r12
  7494. add r14,rax
  7495. adc r15,rdx
  7496. mov rax,QWORD[((8+160+0))+rbp]
  7497. mov r9,rax
  7498. mul r10
  7499. add r14,rax
  7500. adc rdx,0
  7501. mov r10,rdx
  7502. mov rax,QWORD[((8+160+0))+rbp]
  7503. mul r11
  7504. add r15,rax
  7505. adc rdx,0
  7506. imul r9,r12
  7507. add r15,r10
  7508. adc r9,rdx
  7509. mov r10,r13
  7510. mov r11,r14
  7511. mov r12,r15
  7512. and r12,3
  7513. mov r13,r15
  7514. and r13,-4
  7515. mov r14,r9
  7516. shrd r15,r9,2
  7517. shr r9,2
  7518. add r15,r13
  7519. adc r9,r14
  7520. add r10,r15
  7521. adc r11,r9
  7522. adc r12,0
  ; State 2, first set and row rotation.
  7523. vpaddd ymm2,ymm2,ymm6
  7524. vpxor ymm14,ymm14,ymm2
  7525. vpshufb ymm14,ymm14,YMMWORD[$L$rol16]
  7526. vpaddd ymm10,ymm10,ymm14
  7527. vpxor ymm6,ymm6,ymm10
  7528. vpsrld ymm3,ymm6,20
  7529. vpslld ymm6,ymm6,12
  7530. vpxor ymm6,ymm6,ymm3
  7531. vpaddd ymm2,ymm2,ymm6
  7532. vpxor ymm14,ymm14,ymm2
  7533. vpshufb ymm14,ymm14,YMMWORD[$L$rol8]
  7534. vpaddd ymm10,ymm10,ymm14
  7535. vpxor ymm6,ymm6,ymm10
  7536. vpslld ymm3,ymm6,7
  7537. vpsrld ymm6,ymm6,25
  7538. vpxor ymm6,ymm6,ymm3
  7539. vpalignr ymm14,ymm14,ymm14,12
  7540. vpalignr ymm10,ymm10,ymm10,8
  7541. vpalignr ymm6,ymm6,ymm6,4
  ; State 0, second quarter-round set on the rotated rows, then rotate back.
  7542. vpaddd ymm0,ymm0,ymm4
  7543. vpxor ymm12,ymm12,ymm0
  7544. vpshufb ymm12,ymm12,YMMWORD[$L$rol16]
  7545. vpaddd ymm8,ymm8,ymm12
  7546. vpxor ymm4,ymm4,ymm8
  7547. vpsrld ymm3,ymm4,20
  7548. vpslld ymm4,ymm4,12
  7549. vpxor ymm4,ymm4,ymm3
  7550. vpaddd ymm0,ymm0,ymm4
  7551. vpxor ymm12,ymm12,ymm0
  7552. vpshufb ymm12,ymm12,YMMWORD[$L$rol8]
  7553. vpaddd ymm8,ymm8,ymm12
  7554. vpxor ymm4,ymm4,ymm8
  7555. vpslld ymm3,ymm4,7
  7556. vpsrld ymm4,ymm4,25
  7557. vpxor ymm4,ymm4,ymm3
  7558. vpalignr ymm12,ymm12,ymm12,4
  7559. vpalignr ymm8,ymm8,ymm8,8
  7560. vpalignr ymm4,ymm4,ymm4,12
  ; Hash the 16 bytes at [rdi+16].
  7561. add r10,QWORD[((0+16))+rdi]
  7562. adc r11,QWORD[((8+16))+rdi]
  7563. adc r12,1
  7564. mov rax,QWORD[((0+160+0))+rbp]
  7565. mov r15,rax
  7566. mul r10
  7567. mov r13,rax
  7568. mov r14,rdx
  7569. mov rax,QWORD[((0+160+0))+rbp]
  7570. mul r11
  7571. imul r15,r12
  7572. add r14,rax
  7573. adc r15,rdx
  7574. mov rax,QWORD[((8+160+0))+rbp]
  7575. mov r9,rax
  7576. mul r10
  7577. add r14,rax
  7578. adc rdx,0
  7579. mov r10,rdx
  7580. mov rax,QWORD[((8+160+0))+rbp]
  7581. mul r11
  7582. add r15,rax
  7583. adc rdx,0
  7584. imul r9,r12
  7585. add r15,r10
  7586. adc r9,rdx
  7587. mov r10,r13
  7588. mov r11,r14
  7589. mov r12,r15
  7590. and r12,3
  7591. mov r13,r15
  7592. and r13,-4
  7593. mov r14,r9
  7594. shrd r15,r9,2
  7595. shr r9,2
  7596. add r15,r13
  7597. adc r9,r14
  7598. add r10,r15
  7599. adc r11,r9
  7600. adc r12,0
  ; State 1, second set and rotate back.
  7601. vpaddd ymm1,ymm1,ymm5
  7602. vpxor ymm13,ymm13,ymm1
  7603. vpshufb ymm13,ymm13,YMMWORD[$L$rol16]
  7604. vpaddd ymm9,ymm9,ymm13
  7605. vpxor ymm5,ymm5,ymm9
  7606. vpsrld ymm3,ymm5,20
  7607. vpslld ymm5,ymm5,12
  7608. vpxor ymm5,ymm5,ymm3
  7609. vpaddd ymm1,ymm1,ymm5
  7610. vpxor ymm13,ymm13,ymm1
  7611. vpshufb ymm13,ymm13,YMMWORD[$L$rol8]
  7612. vpaddd ymm9,ymm9,ymm13
  7613. vpxor ymm5,ymm5,ymm9
  7614. vpslld ymm3,ymm5,7
  7615. vpsrld ymm5,ymm5,25
  7616. vpxor ymm5,ymm5,ymm3
  7617. vpalignr ymm13,ymm13,ymm13,4
  7618. vpalignr ymm9,ymm9,ymm9,8
  7619. vpalignr ymm5,ymm5,ymm5,12
  ; State 2, second set and rotate back.
  7620. vpaddd ymm2,ymm2,ymm6
  7621. vpxor ymm14,ymm14,ymm2
  7622. vpshufb ymm14,ymm14,YMMWORD[$L$rol16]
  7623. vpaddd ymm10,ymm10,ymm14
  7624. vpxor ymm6,ymm6,ymm10
  7625. vpsrld ymm3,ymm6,20
  7626. vpslld ymm6,ymm6,12
  7627. vpxor ymm6,ymm6,ymm3
  7628. vpaddd ymm2,ymm2,ymm6
  7629. vpxor ymm14,ymm14,ymm2
  7630. vpshufb ymm14,ymm14,YMMWORD[$L$rol8]
  7631. vpaddd ymm10,ymm10,ymm14
  7632. vpxor ymm6,ymm6,ymm10
  7633. vpslld ymm3,ymm6,7
  7634. vpsrld ymm6,ymm6,25
  7635. vpxor ymm6,ymm6,ymm3
  7636. vpalignr ymm14,ymm14,ymm14,4
  7637. vpalignr ymm10,ymm10,ymm10,8
  7638. vpalignr ymm6,ymm6,ymm6,12
  7639. lea rdi,[32+rdi]
  ; rcx counts three-block iterations, then r8 counts two-block iterations.
  7640. dec rcx
  7641. jg NEAR $L$seal_avx2_tail_384_rounds_and_3xhash
  7642. dec r8
  7643. jge NEAR $L$seal_avx2_tail_384_rounds_and_2xhash
  ; Feed-forward: add the initial rows (and each state's own counter row).
  7644. vpaddd ymm2,ymm2,YMMWORD[$L$chacha20_consts]
  7645. vpaddd ymm6,ymm6,YMMWORD[((160+64))+rbp]
  7646. vpaddd ymm10,ymm10,YMMWORD[((160+96))+rbp]
  7647. vpaddd ymm14,ymm14,YMMWORD[((160+224))+rbp]
  7648. vpaddd ymm1,ymm1,YMMWORD[$L$chacha20_consts]
  7649. vpaddd ymm5,ymm5,YMMWORD[((160+64))+rbp]
  7650. vpaddd ymm9,ymm9,YMMWORD[((160+96))+rbp]
  7651. vpaddd ymm13,ymm13,YMMWORD[((160+192))+rbp]
  7652. vpaddd ymm0,ymm0,YMMWORD[$L$chacha20_consts]
  7653. vpaddd ymm4,ymm4,YMMWORD[((160+64))+rbp]
  7654. vpaddd ymm8,ymm8,YMMWORD[((160+96))+rbp]
  7655. vpaddd ymm12,ymm12,YMMWORD[((160+160))+rbp]
  ; State 2: de-interleave lanes (0x02 = low lanes, 0x13 = high lanes), XOR
  ; with input bytes 0..127 at rsi and store at rdi.
  7656. vperm2i128 ymm3,ymm6,ymm2,0x02
  7657. vperm2i128 ymm6,ymm6,ymm2,0x13
  7658. vperm2i128 ymm2,ymm14,ymm10,0x02
  7659. vperm2i128 ymm10,ymm14,ymm10,0x13
  7660. vpxor ymm3,ymm3,YMMWORD[((0+0))+rsi]
  7661. vpxor ymm2,ymm2,YMMWORD[((32+0))+rsi]
  7662. vpxor ymm6,ymm6,YMMWORD[((64+0))+rsi]
  7663. vpxor ymm10,ymm10,YMMWORD[((96+0))+rsi]
  7664. vmovdqu YMMWORD[(0+0)+rdi],ymm3
  7665. vmovdqu YMMWORD[(32+0)+rdi],ymm2
  7666. vmovdqu YMMWORD[(64+0)+rdi],ymm6
  7667. vmovdqu YMMWORD[(96+0)+rdi],ymm10
  ; State 1: same for input bytes 128..255.
  7668. vperm2i128 ymm3,ymm5,ymm1,0x02
  7669. vperm2i128 ymm5,ymm5,ymm1,0x13
  7670. vperm2i128 ymm1,ymm13,ymm9,0x02
  7671. vperm2i128 ymm9,ymm13,ymm9,0x13
  7672. vpxor ymm3,ymm3,YMMWORD[((0+128))+rsi]
  7673. vpxor ymm1,ymm1,YMMWORD[((32+128))+rsi]
  7674. vpxor ymm5,ymm5,YMMWORD[((64+128))+rsi]
  7675. vpxor ymm9,ymm9,YMMWORD[((96+128))+rsi]
  7676. vmovdqu YMMWORD[(0+128)+rdi],ymm3
  7677. vmovdqu YMMWORD[(32+128)+rdi],ymm1
  7678. vmovdqu YMMWORD[(64+128)+rdi],ymm5
  7679. vmovdqu YMMWORD[(96+128)+rdi],ymm9
  ; State 0: de-interleave only; keystream left in ymm0, ymm4, ymm8, ymm12.
  7680. vperm2i128 ymm3,ymm4,ymm0,0x13
  7681. vperm2i128 ymm0,ymm4,ymm0,0x02
  7682. vperm2i128 ymm4,ymm12,ymm8,0x02
  7683. vperm2i128 ymm12,ymm12,ymm8,0x13
  7684. vmovdqa ymm8,ymm3
  ; rcx = 256, the number of bytes just stored at rdi.
  ; NOTE(review): presumably the count the jump target still has to hash -
  ; confirm against $L$seal_avx2_short_hash_remainder.
  7685. mov rcx,8*32
  ; Consume 256 input bytes.
  7686. lea rsi,[256+rsi]
  7687. sub rbx,8*32
  7688. jmp NEAR $L$seal_avx2_short_hash_remainder
  7689. $L$seal_avx2_tail_512:
  7690. vmovdqa ymm0,YMMWORD[$L$chacha20_consts]
  7691. vmovdqa ymm4,YMMWORD[((160+64))+rbp]
  7692. vmovdqa ymm8,YMMWORD[((160+96))+rbp]
  7693. vmovdqa ymm1,ymm0
  7694. vmovdqa ymm5,ymm4
  7695. vmovdqa ymm9,ymm8
  7696. vmovdqa ymm2,ymm0
  7697. vmovdqa ymm6,ymm4
  7698. vmovdqa ymm10,ymm8
  7699. vmovdqa ymm3,ymm0
  7700. vmovdqa ymm7,ymm4
  7701. vmovdqa ymm11,ymm8
  7702. vmovdqa ymm12,YMMWORD[$L$avx2_inc]
  7703. vpaddd ymm15,ymm12,YMMWORD[((160+160))+rbp]
  7704. vpaddd ymm14,ymm12,ymm15
  7705. vpaddd ymm13,ymm12,ymm14
  7706. vpaddd ymm12,ymm12,ymm13
  7707. vmovdqa YMMWORD[(160+256)+rbp],ymm15
  7708. vmovdqa YMMWORD[(160+224)+rbp],ymm14
  7709. vmovdqa YMMWORD[(160+192)+rbp],ymm13
  7710. vmovdqa YMMWORD[(160+160)+rbp],ymm12
  7711. $L$seal_avx2_tail_512_rounds_and_3xhash:
  7712. add r10,QWORD[((0+0))+rdi]
  7713. adc r11,QWORD[((8+0))+rdi]
  7714. adc r12,1
  7715. mov rdx,QWORD[((0+160+0))+rbp]
  7716. mov r15,rdx
  7717. mulx r14,r13,r10
  7718. mulx rdx,rax,r11
  7719. imul r15,r12
  7720. add r14,rax
  7721. adc r15,rdx
  7722. mov rdx,QWORD[((8+160+0))+rbp]
  7723. mulx rax,r10,r10
  7724. add r14,r10
  7725. mulx r9,r11,r11
  7726. adc r15,r11
  7727. adc r9,0
  7728. imul rdx,r12
  7729. add r15,rax
  7730. adc r9,rdx
  7731. mov r10,r13
  7732. mov r11,r14
  7733. mov r12,r15
  7734. and r12,3
  7735. mov r13,r15
  7736. and r13,-4
  7737. mov r14,r9
  7738. shrd r15,r9,2
  7739. shr r9,2
  7740. add r15,r13
  7741. adc r9,r14
  7742. add r10,r15
  7743. adc r11,r9
  7744. adc r12,0
  7745. lea rdi,[16+rdi]
  ; Loop body: one double round (column round + diagonal round) applied to four
  ; register groups (a=ymm0..3, b=ymm4..7, c=ymm8..11, d=ymm12..15), with the
  ; absorption of two 16-byte hash blocks at [rdi] and [rdi+16] interleaved
  ; between the vector instructions. All 16 ymm registers are live, so ymm8 is
  ; repeatedly spilled to the scratch slot at [rbp+160+128] and reused as a
  ; temporary (shuffle mask or shift scratch).
  7746. $L$seal_avx2_tail_512_rounds_and_2xhash:
  7747. vmovdqa YMMWORD[(160+128)+rbp],ymm8
  7748. vmovdqa ymm8,YMMWORD[$L$rol16]
  ; a += b; d ^= a; d = d rotated left 16 (byte shuffle with the rol16 mask).
  7749. vpaddd ymm3,ymm3,ymm7
  7750. vpaddd ymm2,ymm2,ymm6
  7751. vpaddd ymm1,ymm1,ymm5
  7752. vpaddd ymm0,ymm0,ymm4
  7753. vpxor ymm15,ymm15,ymm3
  7754. vpxor ymm14,ymm14,ymm2
  7755. vpxor ymm13,ymm13,ymm1
  7756. vpxor ymm12,ymm12,ymm0
  7757. vpshufb ymm15,ymm15,ymm8
  7758. vpshufb ymm14,ymm14,ymm8
  7759. vpshufb ymm13,ymm13,ymm8
  7760. vpshufb ymm12,ymm12,ymm8
  ; c += d; the spilled ymm8 row is added directly from its stack slot.
  7761. vpaddd ymm11,ymm11,ymm15
  7762. vpaddd ymm10,ymm10,ymm14
  7763. vpaddd ymm9,ymm9,ymm13
  7764. vpaddd ymm8,ymm12,YMMWORD[((160+128))+rbp]
  ; b ^= c (interleaved with the first hash absorb below).
  7765. vpxor ymm7,ymm7,ymm11
  7766. vpxor ymm6,ymm6,ymm10
  ; Hash block 1: acc (r12:r11:r10) += 16 bytes at [rdi], plus 1 in the top limb.
  7767. add r10,QWORD[((0+0))+rdi]
  7768. adc r11,QWORD[((8+0))+rdi]
  7769. adc r12,1
  7770. vpxor ymm5,ymm5,ymm9
  7771. vpxor ymm4,ymm4,ymm8
  ; Spill the c row again; ymm8 becomes the shift temporary.
  ; b = b rotated left 12, built as (b shifted right 20) xor (b shifted left 12).
  7772. vmovdqa YMMWORD[(160+128)+rbp],ymm8
  7773. vpsrld ymm8,ymm7,20
  7774. vpslld ymm7,ymm7,32-20
  7775. vpxor ymm7,ymm7,ymm8
  7776. vpsrld ymm8,ymm6,20
  7777. vpslld ymm6,ymm6,32-20
  7778. vpxor ymm6,ymm6,ymm8
  7779. vpsrld ymm8,ymm5,20
  7780. vpslld ymm5,ymm5,32-20
  7781. vpxor ymm5,ymm5,ymm8
  7782. vpsrld ymm8,ymm4,20
  7783. vpslld ymm4,ymm4,32-20
  7784. vpxor ymm4,ymm4,ymm8
  ; Second half of the quarter round: a += b; d ^= a; d = d rotated left 8.
  7785. vmovdqa ymm8,YMMWORD[$L$rol8]
  7786. vpaddd ymm3,ymm3,ymm7
  7787. vpaddd ymm2,ymm2,ymm6
  7788. vpaddd ymm1,ymm1,ymm5
  7789. vpaddd ymm0,ymm0,ymm4
  ; Hash block 1: multiply by the low qword of the multiplier at [rbp+160].
  ; mulx takes rdx as the implicit factor and does not modify the flags.
  7790. mov rdx,QWORD[((0+160+0))+rbp]
  7791. mov r15,rdx
  7792. mulx r14,r13,r10
  7793. mulx rdx,rax,r11
  7794. imul r15,r12
  7795. add r14,rax
  7796. adc r15,rdx
  7797. vpxor ymm15,ymm15,ymm3
  7798. vpxor ymm14,ymm14,ymm2
  7799. vpxor ymm13,ymm13,ymm1
  7800. vpxor ymm12,ymm12,ymm0
  7801. vpshufb ymm15,ymm15,ymm8
  7802. vpshufb ymm14,ymm14,ymm8
  7803. vpshufb ymm13,ymm13,ymm8
  7804. vpshufb ymm12,ymm12,ymm8
  ; c += d; b ^= c.
  7805. vpaddd ymm11,ymm11,ymm15
  7806. vpaddd ymm10,ymm10,ymm14
  7807. vpaddd ymm9,ymm9,ymm13
  7808. vpaddd ymm8,ymm12,YMMWORD[((160+128))+rbp]
  7809. vpxor ymm7,ymm7,ymm11
  7810. vpxor ymm6,ymm6,ymm10
  7811. vpxor ymm5,ymm5,ymm9
  7812. vpxor ymm4,ymm4,ymm8
  ; b = b rotated left 7, built as (b shifted right 25) xor (b shifted left 7).
  7813. vmovdqa YMMWORD[(160+128)+rbp],ymm8
  7814. vpsrld ymm8,ymm7,25
  7815. vpslld ymm7,ymm7,32-25
  7816. vpxor ymm7,ymm7,ymm8
  ; Hash block 1: multiply by the high qword of the multiplier at [rbp+168].
  7817. mov rdx,QWORD[((8+160+0))+rbp]
  7818. mulx rax,r10,r10
  7819. add r14,r10
  7820. mulx r9,r11,r11
  7821. adc r15,r11
  7822. adc r9,0
  7823. imul rdx,r12
  7824. vpsrld ymm8,ymm6,25
  7825. vpslld ymm6,ymm6,32-25
  7826. vpxor ymm6,ymm6,ymm8
  7827. vpsrld ymm8,ymm5,25
  7828. vpslld ymm5,ymm5,32-25
  7829. vpxor ymm5,ymm5,ymm8
  7830. vpsrld ymm8,ymm4,25
  7831. vpslld ymm4,ymm4,32-25
  7832. vpxor ymm4,ymm4,ymm8
  ; Reload the c row, then rotate the dwords inside each 128-bit lane
  ; (b by 1, c by 2, d by 3 positions) to line up the diagonals.
  7833. vmovdqa ymm8,YMMWORD[((160+128))+rbp]
  7834. vpalignr ymm7,ymm7,ymm7,4
  7835. vpalignr ymm11,ymm11,ymm11,8
  7836. vpalignr ymm15,ymm15,ymm15,12
  7837. vpalignr ymm6,ymm6,ymm6,4
  7838. vpalignr ymm10,ymm10,ymm10,8
  7839. vpalignr ymm14,ymm14,ymm14,12
  7840. vpalignr ymm5,ymm5,ymm5,4
  7841. vpalignr ymm9,ymm9,ymm9,8
  7842. vpalignr ymm13,ymm13,ymm13,12
  7843. vpalignr ymm4,ymm4,ymm4,4
  ; Hash block 1: finish the product (now in r9:r15:r14:r13).
  7844. add r15,rax
  7845. adc r9,rdx
  7846. vpalignr ymm8,ymm8,ymm8,8
  7847. vpalignr ymm12,ymm12,ymm12,12
  ; Diagonal round, first half: a += b; d ^= a; d = d rotated left 16.
  7848. vmovdqa YMMWORD[(160+128)+rbp],ymm8
  7849. vmovdqa ymm8,YMMWORD[$L$rol16]
  7850. vpaddd ymm3,ymm3,ymm7
  7851. vpaddd ymm2,ymm2,ymm6
  7852. vpaddd ymm1,ymm1,ymm5
  7853. vpaddd ymm0,ymm0,ymm4
  7854. vpxor ymm15,ymm15,ymm3
  7855. vpxor ymm14,ymm14,ymm2
  7856. vpxor ymm13,ymm13,ymm1
  7857. vpxor ymm12,ymm12,ymm0
  7858. vpshufb ymm15,ymm15,ymm8
  7859. vpshufb ymm14,ymm14,ymm8
  7860. vpshufb ymm13,ymm13,ymm8
  7861. vpshufb ymm12,ymm12,ymm8
  7862. vpaddd ymm11,ymm11,ymm15
  7863. vpaddd ymm10,ymm10,ymm14
  7864. vpaddd ymm9,ymm9,ymm13
  7865. vpaddd ymm8,ymm12,YMMWORD[((160+128))+rbp]
  ; Hash block 1: reduce. Keep the low 130 bits and add the bits above
  ; bit 130 back in five times (times 4 via the -4 mask, plus once shifted).
  7866. mov r10,r13
  7867. mov r11,r14
  7868. mov r12,r15
  7869. and r12,3
  7870. mov r13,r15
  7871. and r13,-4
  7872. mov r14,r9
  7873. shrd r15,r9,2
  7874. shr r9,2
  7875. add r15,r13
  7876. adc r9,r14
  7877. add r10,r15
  7878. adc r11,r9
  7879. adc r12,0
  ; b ^= c; b = b rotated left 12.
  7880. vpxor ymm7,ymm7,ymm11
  7881. vpxor ymm6,ymm6,ymm10
  7882. vpxor ymm5,ymm5,ymm9
  7883. vpxor ymm4,ymm4,ymm8
  7884. vmovdqa YMMWORD[(160+128)+rbp],ymm8
  7885. vpsrld ymm8,ymm7,20
  7886. vpslld ymm7,ymm7,32-20
  7887. vpxor ymm7,ymm7,ymm8
  7888. vpsrld ymm8,ymm6,20
  7889. vpslld ymm6,ymm6,32-20
  7890. vpxor ymm6,ymm6,ymm8
  7891. vpsrld ymm8,ymm5,20
  7892. vpslld ymm5,ymm5,32-20
  7893. vpxor ymm5,ymm5,ymm8
  7894. vpsrld ymm8,ymm4,20
  7895. vpslld ymm4,ymm4,32-20
  7896. vpxor ymm4,ymm4,ymm8
  ; Diagonal round, second half: a += b; d ^= a; d = d rotated left 8.
  7897. vmovdqa ymm8,YMMWORD[$L$rol8]
  7898. vpaddd ymm3,ymm3,ymm7
  7899. vpaddd ymm2,ymm2,ymm6
  ; Hash block 2: acc += 16 bytes at [rdi+16], plus 1 in the top limb.
  7900. add r10,QWORD[((0+16))+rdi]
  7901. adc r11,QWORD[((8+16))+rdi]
  7902. adc r12,1
  7903. vpaddd ymm1,ymm1,ymm5
  7904. vpaddd ymm0,ymm0,ymm4
  7905. vpxor ymm15,ymm15,ymm3
  7906. vpxor ymm14,ymm14,ymm2
  7907. vpxor ymm13,ymm13,ymm1
  7908. vpxor ymm12,ymm12,ymm0
  7909. vpshufb ymm15,ymm15,ymm8
  7910. vpshufb ymm14,ymm14,ymm8
  7911. vpshufb ymm13,ymm13,ymm8
  7912. vpshufb ymm12,ymm12,ymm8
  7913. vpaddd ymm11,ymm11,ymm15
  7914. vpaddd ymm10,ymm10,ymm14
  7915. vpaddd ymm9,ymm9,ymm13
  7916. vpaddd ymm8,ymm12,YMMWORD[((160+128))+rbp]
  7917. vpxor ymm7,ymm7,ymm11
  7918. vpxor ymm6,ymm6,ymm10
  7919. vpxor ymm5,ymm5,ymm9
  7920. vpxor ymm4,ymm4,ymm8
  ; b = b rotated left 7 (the ymm7 shift pair is split around the hash multiply).
  7921. vmovdqa YMMWORD[(160+128)+rbp],ymm8
  7922. vpsrld ymm8,ymm7,25
  ; Hash block 2: multiply by the low qword of the multiplier.
  7923. mov rdx,QWORD[((0+160+0))+rbp]
  7924. mov r15,rdx
  7925. mulx r14,r13,r10
  7926. mulx rdx,rax,r11
  7927. imul r15,r12
  7928. add r14,rax
  7929. adc r15,rdx
  7930. vpslld ymm7,ymm7,32-25
  7931. vpxor ymm7,ymm7,ymm8
  7932. vpsrld ymm8,ymm6,25
  7933. vpslld ymm6,ymm6,32-25
  7934. vpxor ymm6,ymm6,ymm8
  7935. vpsrld ymm8,ymm5,25
  7936. vpslld ymm5,ymm5,32-25
  7937. vpxor ymm5,ymm5,ymm8
  7938. vpsrld ymm8,ymm4,25
  7939. vpslld ymm4,ymm4,32-25
  7940. vpxor ymm4,ymm4,ymm8
  ; Undo the lane rotation (b by 3, c by 2, d by 1 positions) so the rows are
  ; back in column order for the next iteration.
  7941. vmovdqa ymm8,YMMWORD[((160+128))+rbp]
  7942. vpalignr ymm7,ymm7,ymm7,12
  7943. vpalignr ymm11,ymm11,ymm11,8
  7944. vpalignr ymm15,ymm15,ymm15,4
  7945. vpalignr ymm6,ymm6,ymm6,12
  7946. vpalignr ymm10,ymm10,ymm10,8
  7947. vpalignr ymm14,ymm14,ymm14,4
  7948. vpalignr ymm5,ymm5,ymm5,12
  7949. vpalignr ymm9,ymm9,ymm9,8
  ; Hash block 2: multiply by the high qword of the multiplier.
  7950. mov rdx,QWORD[((8+160+0))+rbp]
  7951. mulx rax,r10,r10
  7952. add r14,r10
  7953. mulx r9,r11,r11
  7954. adc r15,r11
  7955. adc r9,0
  7956. imul rdx,r12
  7957. vpalignr ymm13,ymm13,ymm13,4
  7958. vpalignr ymm4,ymm4,ymm4,12
  7959. vpalignr ymm8,ymm8,ymm8,8
  7960. vpalignr ymm12,ymm12,ymm12,4
  ; Hash block 2: finish the product and reduce as for block 1.
  7961. add r15,rax
  7962. adc r9,rdx
  7963. mov r10,r13
  7964. mov r11,r14
  7965. mov r12,r15
  7966. and r12,3
  7967. mov r13,r15
  7968. and r13,-4
  7969. mov r14,r9
  7970. shrd r15,r9,2
  7971. shr r9,2
  7972. add r15,r13
  7973. adc r9,r14
  7974. add r10,r15
  7975. adc r11,r9
  7976. adc r12,0
  ; Both blocks absorbed: advance the hash read pointer by 32 bytes.
  7977. lea rdi,[32+rdi]
  ; Loop control: while rcx is still positive after the decrement, iterate via
  ; the 3-block entry; after that, iterate via this 2-block entry while r8
  ; stays non-negative. rcx and r8 are initialised before this excerpt.
  7978. dec rcx
  7979. jg NEAR $L$seal_avx2_tail_512_rounds_and_3xhash
  7980. dec r8
  7981. jge NEAR $L$seal_avx2_tail_512_rounds_and_2xhash
  ; After the round loop: add the starting state back into each register group.
  ; The a rows get the constant from $L$chacha20_consts, the b and c rows get
  ; the values held at [rbp+160+64] and [rbp+160+96], and each d row gets its
  ; own saved copy from [rbp+160+160 .. 160+256].
  ; NOTE(review): the +64/+96 slots are written outside this excerpt -
  ; presumably the two key rows of the initial state; confirm against the setup.
  7982. vpaddd ymm3,ymm3,YMMWORD[$L$chacha20_consts]
  7983. vpaddd ymm7,ymm7,YMMWORD[((160+64))+rbp]
  7984. vpaddd ymm11,ymm11,YMMWORD[((160+96))+rbp]
  7985. vpaddd ymm15,ymm15,YMMWORD[((160+256))+rbp]
  7986. vpaddd ymm2,ymm2,YMMWORD[$L$chacha20_consts]
  7987. vpaddd ymm6,ymm6,YMMWORD[((160+64))+rbp]
  7988. vpaddd ymm10,ymm10,YMMWORD[((160+96))+rbp]
  7989. vpaddd ymm14,ymm14,YMMWORD[((160+224))+rbp]
  7990. vpaddd ymm1,ymm1,YMMWORD[$L$chacha20_consts]
  7991. vpaddd ymm5,ymm5,YMMWORD[((160+64))+rbp]
  7992. vpaddd ymm9,ymm9,YMMWORD[((160+96))+rbp]
  7993. vpaddd ymm13,ymm13,YMMWORD[((160+192))+rbp]
  7994. vpaddd ymm0,ymm0,YMMWORD[$L$chacha20_consts]
  7995. vpaddd ymm4,ymm4,YMMWORD[((160+64))+rbp]
  7996. vpaddd ymm8,ymm8,YMMWORD[((160+96))+rbp]
  7997. vpaddd ymm12,ymm12,YMMWORD[((160+160))+rbp]
  ; Park ymm0 in the scratch slot so it can serve as a temporary.
  ; vperm2i128 with 0x02 pairs the low 128-bit lanes of its two sources and
  ; with 0x13 pairs the high lanes, turning row-wise registers into 32-byte
  ; runs of consecutive key stream.
  7998. vmovdqa YMMWORD[(160+128)+rbp],ymm0
  7999. vperm2i128 ymm0,ymm7,ymm3,0x02
  8000. vperm2i128 ymm7,ymm7,ymm3,0x13
  8001. vperm2i128 ymm3,ymm15,ymm11,0x02
  8002. vperm2i128 ymm11,ymm15,ymm11,0x13
  ; Group 3: XOR 128 bytes of input at [rsi] and store them at [rdi].
  8003. vpxor ymm0,ymm0,YMMWORD[((0+0))+rsi]
  8004. vpxor ymm3,ymm3,YMMWORD[((32+0))+rsi]
  8005. vpxor ymm7,ymm7,YMMWORD[((64+0))+rsi]
  8006. vpxor ymm11,ymm11,YMMWORD[((96+0))+rsi]
  8007. vmovdqu YMMWORD[(0+0)+rdi],ymm0
  8008. vmovdqu YMMWORD[(32+0)+rdi],ymm3
  8009. vmovdqu YMMWORD[(64+0)+rdi],ymm7
  8010. vmovdqu YMMWORD[(96+0)+rdi],ymm11
  ; Restore ymm0, then group 2: bytes 128..255.
  8011. vmovdqa ymm0,YMMWORD[((160+128))+rbp]
  8012. vperm2i128 ymm3,ymm6,ymm2,0x02
  8013. vperm2i128 ymm6,ymm6,ymm2,0x13
  8014. vperm2i128 ymm2,ymm14,ymm10,0x02
  8015. vperm2i128 ymm10,ymm14,ymm10,0x13
  8016. vpxor ymm3,ymm3,YMMWORD[((0+128))+rsi]
  8017. vpxor ymm2,ymm2,YMMWORD[((32+128))+rsi]
  8018. vpxor ymm6,ymm6,YMMWORD[((64+128))+rsi]
  8019. vpxor ymm10,ymm10,YMMWORD[((96+128))+rsi]
  8020. vmovdqu YMMWORD[(0+128)+rdi],ymm3
  8021. vmovdqu YMMWORD[(32+128)+rdi],ymm2
  8022. vmovdqu YMMWORD[(64+128)+rdi],ymm6
  8023. vmovdqu YMMWORD[(96+128)+rdi],ymm10
  ; Group 1: bytes 256..383.
  8024. vperm2i128 ymm3,ymm5,ymm1,0x02
  8025. vperm2i128 ymm5,ymm5,ymm1,0x13
  8026. vperm2i128 ymm1,ymm13,ymm9,0x02
  8027. vperm2i128 ymm9,ymm13,ymm9,0x13
  8028. vpxor ymm3,ymm3,YMMWORD[((0+256))+rsi]
  8029. vpxor ymm1,ymm1,YMMWORD[((32+256))+rsi]
  8030. vpxor ymm5,ymm5,YMMWORD[((64+256))+rsi]
  8031. vpxor ymm9,ymm9,YMMWORD[((96+256))+rsi]
  8032. vmovdqu YMMWORD[(0+256)+rdi],ymm3
  8033. vmovdqu YMMWORD[(32+256)+rdi],ymm1
  8034. vmovdqu YMMWORD[(64+256)+rdi],ymm5
  8035. vmovdqu YMMWORD[(96+256)+rdi],ymm9
  ; Group 0 is not written here: its key stream is left in ymm0, ymm4, ymm8,
  ; ymm12 - the order in which $L$seal_avx2_short_loop consumes registers.
  8036. vperm2i128 ymm3,ymm4,ymm0,0x13
  8037. vperm2i128 ymm0,ymm4,ymm0,0x02
  8038. vperm2i128 ymm4,ymm12,ymm8,0x02
  8039. vperm2i128 ymm12,ymm12,ymm8,0x13
  8040. vmovdqa ymm8,ymm3
  ; rcx = 384 bytes counted for the hash-remainder loop, which reads them
  ; starting at the current rdi; consume 384 bytes of input (rsi) and of the
  ; remaining length (rbx).
  8041. mov rcx,12*32
  8042. lea rsi,[384+rsi]
  8043. sub rbx,12*32
  8044. jmp NEAR $L$seal_avx2_short_hash_remainder
  ; Setup for the three-group path: replicate the state in ymm0/ymm4/ymm8/ymm12
  ; into groups 1 (ymm1,5,9,13) and 2 (ymm2,6,10,14), stepping the d row by
  ; $L$avx2_inc for each additional group.
  8045. $L$seal_avx2_320:
  8046. vmovdqa ymm1,ymm0
  8047. vmovdqa ymm2,ymm0
  8048. vmovdqa ymm5,ymm4
  8049. vmovdqa ymm6,ymm4
  8050. vmovdqa ymm9,ymm8
  8051. vmovdqa ymm10,ymm8
  8052. vpaddd ymm13,ymm12,YMMWORD[$L$avx2_inc]
  8053. vpaddd ymm14,ymm13,YMMWORD[$L$avx2_inc]
  ; Keep untouched copies of the b and c rows in ymm7 / ymm11 and of the three
  ; d rows on the stack; they are added back after the rounds.
  8054. vmovdqa ymm7,ymm4
  8055. vmovdqa ymm11,ymm8
  8056. vmovdqa YMMWORD[(160+160)+rbp],ymm12
  8057. vmovdqa YMMWORD[(160+192)+rbp],ymm13
  8058. vmovdqa YMMWORD[(160+224)+rbp],ymm14
  ; r10 = number of double-round iterations.
  8059. mov r10,10
  ; Round loop for three register groups; runs r10 times. Each iteration is a
  ; column round followed by a diagonal round on every group. ymm3 is the
  ; shift temporary. Rotations by 16 and 8 use byte shuffles; rotations by 12
  ; and 7 are built from a shift-left / shift-right pair combined with xor.
  8060. $L$seal_avx2_320_rounds:
  ; Group 0 (ymm0,4,8,12), column round.
  8061. vpaddd ymm0,ymm0,ymm4
  8062. vpxor ymm12,ymm12,ymm0
  8063. vpshufb ymm12,ymm12,YMMWORD[$L$rol16]
  8064. vpaddd ymm8,ymm8,ymm12
  8065. vpxor ymm4,ymm4,ymm8
  8066. vpsrld ymm3,ymm4,20
  8067. vpslld ymm4,ymm4,12
  8068. vpxor ymm4,ymm4,ymm3
  8069. vpaddd ymm0,ymm0,ymm4
  8070. vpxor ymm12,ymm12,ymm0
  8071. vpshufb ymm12,ymm12,YMMWORD[$L$rol8]
  8072. vpaddd ymm8,ymm8,ymm12
  8073. vpxor ymm4,ymm4,ymm8
  8074. vpslld ymm3,ymm4,7
  8075. vpsrld ymm4,ymm4,25
  8076. vpxor ymm4,ymm4,ymm3
  ; Rotate dwords within each 128-bit lane to line up the diagonals.
  8077. vpalignr ymm12,ymm12,ymm12,12
  8078. vpalignr ymm8,ymm8,ymm8,8
  8079. vpalignr ymm4,ymm4,ymm4,4
  ; Group 1 (ymm1,5,9,13), column round.
  8080. vpaddd ymm1,ymm1,ymm5
  8081. vpxor ymm13,ymm13,ymm1
  8082. vpshufb ymm13,ymm13,YMMWORD[$L$rol16]
  8083. vpaddd ymm9,ymm9,ymm13
  8084. vpxor ymm5,ymm5,ymm9
  8085. vpsrld ymm3,ymm5,20
  8086. vpslld ymm5,ymm5,12
  8087. vpxor ymm5,ymm5,ymm3
  8088. vpaddd ymm1,ymm1,ymm5
  8089. vpxor ymm13,ymm13,ymm1
  8090. vpshufb ymm13,ymm13,YMMWORD[$L$rol8]
  8091. vpaddd ymm9,ymm9,ymm13
  8092. vpxor ymm5,ymm5,ymm9
  8093. vpslld ymm3,ymm5,7
  8094. vpsrld ymm5,ymm5,25
  8095. vpxor ymm5,ymm5,ymm3
  8096. vpalignr ymm13,ymm13,ymm13,12
  8097. vpalignr ymm9,ymm9,ymm9,8
  8098. vpalignr ymm5,ymm5,ymm5,4
  ; Group 2 (ymm2,6,10,14), column round.
  8099. vpaddd ymm2,ymm2,ymm6
  8100. vpxor ymm14,ymm14,ymm2
  8101. vpshufb ymm14,ymm14,YMMWORD[$L$rol16]
  8102. vpaddd ymm10,ymm10,ymm14
  8103. vpxor ymm6,ymm6,ymm10
  8104. vpsrld ymm3,ymm6,20
  8105. vpslld ymm6,ymm6,12
  8106. vpxor ymm6,ymm6,ymm3
  8107. vpaddd ymm2,ymm2,ymm6
  8108. vpxor ymm14,ymm14,ymm2
  8109. vpshufb ymm14,ymm14,YMMWORD[$L$rol8]
  8110. vpaddd ymm10,ymm10,ymm14
  8111. vpxor ymm6,ymm6,ymm10
  8112. vpslld ymm3,ymm6,7
  8113. vpsrld ymm6,ymm6,25
  8114. vpxor ymm6,ymm6,ymm3
  8115. vpalignr ymm14,ymm14,ymm14,12
  8116. vpalignr ymm10,ymm10,ymm10,8
  8117. vpalignr ymm6,ymm6,ymm6,4
  ; Group 0, diagonal round.
  8118. vpaddd ymm0,ymm0,ymm4
  8119. vpxor ymm12,ymm12,ymm0
  8120. vpshufb ymm12,ymm12,YMMWORD[$L$rol16]
  8121. vpaddd ymm8,ymm8,ymm12
  8122. vpxor ymm4,ymm4,ymm8
  8123. vpsrld ymm3,ymm4,20
  8124. vpslld ymm4,ymm4,12
  8125. vpxor ymm4,ymm4,ymm3
  8126. vpaddd ymm0,ymm0,ymm4
  8127. vpxor ymm12,ymm12,ymm0
  8128. vpshufb ymm12,ymm12,YMMWORD[$L$rol8]
  8129. vpaddd ymm8,ymm8,ymm12
  8130. vpxor ymm4,ymm4,ymm8
  8131. vpslld ymm3,ymm4,7
  8132. vpsrld ymm4,ymm4,25
  8133. vpxor ymm4,ymm4,ymm3
  ; Rotate the lanes back into column order.
  8134. vpalignr ymm12,ymm12,ymm12,4
  8135. vpalignr ymm8,ymm8,ymm8,8
  8136. vpalignr ymm4,ymm4,ymm4,12
  ; Group 1, diagonal round.
  8137. vpaddd ymm1,ymm1,ymm5
  8138. vpxor ymm13,ymm13,ymm1
  8139. vpshufb ymm13,ymm13,YMMWORD[$L$rol16]
  8140. vpaddd ymm9,ymm9,ymm13
  8141. vpxor ymm5,ymm5,ymm9
  8142. vpsrld ymm3,ymm5,20
  8143. vpslld ymm5,ymm5,12
  8144. vpxor ymm5,ymm5,ymm3
  8145. vpaddd ymm1,ymm1,ymm5
  8146. vpxor ymm13,ymm13,ymm1
  8147. vpshufb ymm13,ymm13,YMMWORD[$L$rol8]
  8148. vpaddd ymm9,ymm9,ymm13
  8149. vpxor ymm5,ymm5,ymm9
  8150. vpslld ymm3,ymm5,7
  8151. vpsrld ymm5,ymm5,25
  8152. vpxor ymm5,ymm5,ymm3
  8153. vpalignr ymm13,ymm13,ymm13,4
  8154. vpalignr ymm9,ymm9,ymm9,8
  8155. vpalignr ymm5,ymm5,ymm5,12
  ; Group 2, diagonal round.
  8156. vpaddd ymm2,ymm2,ymm6
  8157. vpxor ymm14,ymm14,ymm2
  8158. vpshufb ymm14,ymm14,YMMWORD[$L$rol16]
  8159. vpaddd ymm10,ymm10,ymm14
  8160. vpxor ymm6,ymm6,ymm10
  8161. vpsrld ymm3,ymm6,20
  8162. vpslld ymm6,ymm6,12
  8163. vpxor ymm6,ymm6,ymm3
  8164. vpaddd ymm2,ymm2,ymm6
  8165. vpxor ymm14,ymm14,ymm2
  8166. vpshufb ymm14,ymm14,YMMWORD[$L$rol8]
  8167. vpaddd ymm10,ymm10,ymm14
  8168. vpxor ymm6,ymm6,ymm10
  8169. vpslld ymm3,ymm6,7
  8170. vpsrld ymm6,ymm6,25
  8171. vpxor ymm6,ymm6,ymm3
  8172. vpalignr ymm14,ymm14,ymm14,4
  8173. vpalignr ymm10,ymm10,ymm10,8
  8174. vpalignr ymm6,ymm6,ymm6,12
  8175. dec r10
  8176. jne NEAR $L$seal_avx2_320_rounds
  ; Add the starting state back: constants into the a rows, the ymm7 / ymm11
  ; copies into the b / c rows, and the saved d rows from the stack.
  8177. vpaddd ymm0,ymm0,YMMWORD[$L$chacha20_consts]
  8178. vpaddd ymm1,ymm1,YMMWORD[$L$chacha20_consts]
  8179. vpaddd ymm2,ymm2,YMMWORD[$L$chacha20_consts]
  8180. vpaddd ymm4,ymm4,ymm7
  8181. vpaddd ymm5,ymm5,ymm7
  8182. vpaddd ymm6,ymm6,ymm7
  8183. vpaddd ymm8,ymm8,ymm11
  8184. vpaddd ymm9,ymm9,ymm11
  8185. vpaddd ymm10,ymm10,ymm11
  8186. vpaddd ymm12,ymm12,YMMWORD[((160+160))+rbp]
  8187. vpaddd ymm13,ymm13,YMMWORD[((160+192))+rbp]
  8188. vpaddd ymm14,ymm14,YMMWORD[((160+224))+rbp]
  ; The low lanes of group 0's a and b rows (32 bytes) are masked with
  ; $L$clamp and stored at [rbp+160]; the hash code multiplies by the first
  ; 16 bytes of that slot. The clamp mask only alters those first 16 bytes.
  8189. vperm2i128 ymm3,ymm4,ymm0,0x02
  8190. vpand ymm3,ymm3,YMMWORD[$L$clamp]
  8191. vmovdqa YMMWORD[(160+0)+rbp],ymm3
  ; Reorder the remaining key stream into the register sequence consumed by
  ; $L$seal_avx2_short_loop: ymm0, ymm4, ymm8, ymm12, ymm1, ymm5, ymm9, ymm13,
  ; ymm2, ymm6 (0x02 pairs low lanes, 0x13 pairs high lanes).
  8192. vperm2i128 ymm0,ymm4,ymm0,0x13
  8193. vperm2i128 ymm4,ymm12,ymm8,0x13
  8194. vperm2i128 ymm8,ymm5,ymm1,0x02
  8195. vperm2i128 ymm12,ymm13,ymm9,0x02
  8196. vperm2i128 ymm1,ymm5,ymm1,0x13
  8197. vperm2i128 ymm5,ymm13,ymm9,0x13
  8198. vperm2i128 ymm9,ymm6,ymm2,0x02
  8199. vperm2i128 ymm13,ymm14,ymm10,0x02
  8200. vperm2i128 ymm2,ymm6,ymm2,0x13
  8201. vperm2i128 ymm6,ymm14,ymm10,0x13
  8202. jmp NEAR $L$seal_avx2_short
  ; Setup for the two-group path: group 1 (ymm1,5,9,13) is a copy of group 0
  ; (ymm0,4,8,12) with its d row stepped by $L$avx2_inc. ymm2, ymm6, ymm10,
  ; ymm11 and ymm15 hold untouched copies of the starting rows (a, b, c, d of
  ; group 0, d of group 1); they are added back after the rounds.
  8203. $L$seal_avx2_192:
  8204. vmovdqa ymm1,ymm0
  8205. vmovdqa ymm2,ymm0
  8206. vmovdqa ymm5,ymm4
  8207. vmovdqa ymm6,ymm4
  8208. vmovdqa ymm9,ymm8
  8209. vmovdqa ymm10,ymm8
  8210. vpaddd ymm13,ymm12,YMMWORD[$L$avx2_inc]
  8211. vmovdqa ymm11,ymm12
  8212. vmovdqa ymm15,ymm13
  ; r10 = number of double-round iterations.
  8213. mov r10,10
  ; Round loop for two register groups; runs r10 times. Each iteration is a
  ; column round followed by a diagonal round on both groups, with ymm3 as the
  ; shift temporary. Falls through into $L$seal_avx2_short when done.
  8214. $L$seal_avx2_192_rounds:
  ; Group 0 (ymm0,4,8,12), column round.
  8215. vpaddd ymm0,ymm0,ymm4
  8216. vpxor ymm12,ymm12,ymm0
  8217. vpshufb ymm12,ymm12,YMMWORD[$L$rol16]
  8218. vpaddd ymm8,ymm8,ymm12
  8219. vpxor ymm4,ymm4,ymm8
  8220. vpsrld ymm3,ymm4,20
  8221. vpslld ymm4,ymm4,12
  8222. vpxor ymm4,ymm4,ymm3
  8223. vpaddd ymm0,ymm0,ymm4
  8224. vpxor ymm12,ymm12,ymm0
  8225. vpshufb ymm12,ymm12,YMMWORD[$L$rol8]
  8226. vpaddd ymm8,ymm8,ymm12
  8227. vpxor ymm4,ymm4,ymm8
  8228. vpslld ymm3,ymm4,7
  8229. vpsrld ymm4,ymm4,25
  8230. vpxor ymm4,ymm4,ymm3
  ; Rotate dwords within each 128-bit lane to line up the diagonals.
  8231. vpalignr ymm12,ymm12,ymm12,12
  8232. vpalignr ymm8,ymm8,ymm8,8
  8233. vpalignr ymm4,ymm4,ymm4,4
  ; Group 1 (ymm1,5,9,13), column round.
  8234. vpaddd ymm1,ymm1,ymm5
  8235. vpxor ymm13,ymm13,ymm1
  8236. vpshufb ymm13,ymm13,YMMWORD[$L$rol16]
  8237. vpaddd ymm9,ymm9,ymm13
  8238. vpxor ymm5,ymm5,ymm9
  8239. vpsrld ymm3,ymm5,20
  8240. vpslld ymm5,ymm5,12
  8241. vpxor ymm5,ymm5,ymm3
  8242. vpaddd ymm1,ymm1,ymm5
  8243. vpxor ymm13,ymm13,ymm1
  8244. vpshufb ymm13,ymm13,YMMWORD[$L$rol8]
  8245. vpaddd ymm9,ymm9,ymm13
  8246. vpxor ymm5,ymm5,ymm9
  8247. vpslld ymm3,ymm5,7
  8248. vpsrld ymm5,ymm5,25
  8249. vpxor ymm5,ymm5,ymm3
  8250. vpalignr ymm13,ymm13,ymm13,12
  8251. vpalignr ymm9,ymm9,ymm9,8
  8252. vpalignr ymm5,ymm5,ymm5,4
  ; Group 0, diagonal round.
  8253. vpaddd ymm0,ymm0,ymm4
  8254. vpxor ymm12,ymm12,ymm0
  8255. vpshufb ymm12,ymm12,YMMWORD[$L$rol16]
  8256. vpaddd ymm8,ymm8,ymm12
  8257. vpxor ymm4,ymm4,ymm8
  8258. vpsrld ymm3,ymm4,20
  8259. vpslld ymm4,ymm4,12
  8260. vpxor ymm4,ymm4,ymm3
  8261. vpaddd ymm0,ymm0,ymm4
  8262. vpxor ymm12,ymm12,ymm0
  8263. vpshufb ymm12,ymm12,YMMWORD[$L$rol8]
  8264. vpaddd ymm8,ymm8,ymm12
  8265. vpxor ymm4,ymm4,ymm8
  8266. vpslld ymm3,ymm4,7
  8267. vpsrld ymm4,ymm4,25
  8268. vpxor ymm4,ymm4,ymm3
  ; Rotate the lanes back into column order.
  8269. vpalignr ymm12,ymm12,ymm12,4
  8270. vpalignr ymm8,ymm8,ymm8,8
  8271. vpalignr ymm4,ymm4,ymm4,12
  ; Group 1, diagonal round.
  8272. vpaddd ymm1,ymm1,ymm5
  8273. vpxor ymm13,ymm13,ymm1
  8274. vpshufb ymm13,ymm13,YMMWORD[$L$rol16]
  8275. vpaddd ymm9,ymm9,ymm13
  8276. vpxor ymm5,ymm5,ymm9
  8277. vpsrld ymm3,ymm5,20
  8278. vpslld ymm5,ymm5,12
  8279. vpxor ymm5,ymm5,ymm3
  8280. vpaddd ymm1,ymm1,ymm5
  8281. vpxor ymm13,ymm13,ymm1
  8282. vpshufb ymm13,ymm13,YMMWORD[$L$rol8]
  8283. vpaddd ymm9,ymm9,ymm13
  8284. vpxor ymm5,ymm5,ymm9
  8285. vpslld ymm3,ymm5,7
  8286. vpsrld ymm5,ymm5,25
  8287. vpxor ymm5,ymm5,ymm3
  8288. vpalignr ymm13,ymm13,ymm13,4
  8289. vpalignr ymm9,ymm9,ymm9,8
  8290. vpalignr ymm5,ymm5,ymm5,12
  8291. dec r10
  8292. jne NEAR $L$seal_avx2_192_rounds
  ; Add the starting state back, using the copies kept in ymm2, ymm6, ymm10,
  ; ymm11 and ymm15 by the setup at $L$seal_avx2_192.
  8293. vpaddd ymm0,ymm0,ymm2
  8294. vpaddd ymm1,ymm1,ymm2
  8295. vpaddd ymm4,ymm4,ymm6
  8296. vpaddd ymm5,ymm5,ymm6
  8297. vpaddd ymm8,ymm8,ymm10
  8298. vpaddd ymm9,ymm9,ymm10
  8299. vpaddd ymm12,ymm12,ymm11
  8300. vpaddd ymm13,ymm13,ymm15
  ; The low lanes of group 0's a and b rows (32 bytes) are masked with
  ; $L$clamp and stored at [rbp+160]; the hash code multiplies by the first
  ; 16 bytes of that slot.
  8301. vperm2i128 ymm3,ymm4,ymm0,0x02
  8302. vpand ymm3,ymm3,YMMWORD[$L$clamp]
  8303. vmovdqa YMMWORD[(160+0)+rbp],ymm3
  ; Reorder the remaining key stream into the register sequence consumed by
  ; $L$seal_avx2_short_loop: ymm0, ymm4, ymm8, ymm12, ymm1, ymm5.
  8304. vperm2i128 ymm0,ymm4,ymm0,0x13
  8305. vperm2i128 ymm4,ymm12,ymm8,0x13
  8306. vperm2i128 ymm8,ymm5,ymm1,0x02
  8307. vperm2i128 ymm12,ymm13,ymm9,0x02
  8308. vperm2i128 ymm1,ymm5,ymm1,0x13
  8309. vperm2i128 ymm5,ymm13,ymm9,0x13
  ; Common entry for the short paths (reached from the 320 path by jump and
  ; from the 192 path by fall-through).
  8310. $L$seal_avx2_short:
  ; Register-to-itself move: changes nothing.
  ; NOTE(review): looks like a generator artifact - confirm.
  8311. mov r8,r8
  ; poly_hash_ad_internal is defined elsewhere in this file.
  ; NOTE(review): presumably hashes the additional data into r12:r11:r10,
  ; with the length in r8 - verify against its definition.
  8312. call poly_hash_ad_internal
  ; rcx = 0: no already-written output is waiting to be hashed.
  8313. xor rcx,rcx
  ; Hashes rcx bytes starting at [rdi], 16 bytes per iteration, advancing rdi.
  ; Leaves for $L$seal_avx2_short_loop once fewer than 16 bytes remain in rcx.
  8314. $L$seal_avx2_short_hash_remainder:
  8315. cmp rcx,16
  8316. jb NEAR $L$seal_avx2_short_loop
  ; acc (r12:r11:r10) += 16 bytes at [rdi], plus 1 in the top limb.
  8317. add r10,QWORD[((0+0))+rdi]
  8318. adc r11,QWORD[((8+0))+rdi]
  8319. adc r12,1
  ; acc *= the 128-bit multiplier at [rbp+160] / [rbp+168], using plain mul
  ; (rdx:rax = rax * operand) rather than mulx.
  8320. mov rax,QWORD[((0+160+0))+rbp]
  8321. mov r15,rax
  8322. mul r10
  8323. mov r13,rax
  8324. mov r14,rdx
  8325. mov rax,QWORD[((0+160+0))+rbp]
  8326. mul r11
  8327. imul r15,r12
  8328. add r14,rax
  8329. adc r15,rdx
  8330. mov rax,QWORD[((8+160+0))+rbp]
  8331. mov r9,rax
  8332. mul r10
  8333. add r14,rax
  8334. adc rdx,0
  8335. mov r10,rdx
  8336. mov rax,QWORD[((8+160+0))+rbp]
  8337. mul r11
  8338. add r15,rax
  8339. adc rdx,0
  8340. imul r9,r12
  8341. add r15,r10
  8342. adc r9,rdx
  ; Product is in r9:r15:r14:r13. Reduce: keep the low 130 bits and add the
  ; bits above bit 130 back in five times (times 4 via the -4 mask, plus once
  ; shifted right by 2).
  8343. mov r10,r13
  8344. mov r11,r14
  8345. mov r12,r15
  8346. and r12,3
  8347. mov r13,r15
  8348. and r13,-4
  8349. mov r14,r9
  8350. shrd r15,r9,2
  8351. shr r9,2
  8352. add r15,r13
  8353. adc r9,r14
  8354. add r10,r15
  8355. adc r11,r9
  8356. adc r12,0
  ; One block done: 16 fewer bytes pending, pointer moves on.
  8357. sub rcx,16
  8358. add rdi,16
  8359. jmp NEAR $L$seal_avx2_short_hash_remainder
  ; Processes 32 bytes per iteration while rbx (remaining length) is at least
  ; 32: XOR the input at [rsi] with the key stream in ymm0, store the result
  ; at [rdi], hash the 32 bytes just written, then shift the queue of
  ; key-stream registers down by one.
  8360. $L$seal_avx2_short_loop:
  8361. cmp rbx,32
  8362. jb NEAR $L$seal_avx2_short_tail
  8363. sub rbx,32
  8364. vpxor ymm0,ymm0,YMMWORD[rsi]
  8365. vmovdqu YMMWORD[rdi],ymm0
  8366. lea rsi,[32+rsi]
  ; Hash the first 16 output bytes: acc += block (plus 1 in the top limb).
  8367. add r10,QWORD[((0+0))+rdi]
  8368. adc r11,QWORD[((8+0))+rdi]
  8369. adc r12,1
  ; acc *= the multiplier at [rbp+160] / [rbp+168].
  8370. mov rax,QWORD[((0+160+0))+rbp]
  8371. mov r15,rax
  8372. mul r10
  8373. mov r13,rax
  8374. mov r14,rdx
  8375. mov rax,QWORD[((0+160+0))+rbp]
  8376. mul r11
  8377. imul r15,r12
  8378. add r14,rax
  8379. adc r15,rdx
  8380. mov rax,QWORD[((8+160+0))+rbp]
  8381. mov r9,rax
  8382. mul r10
  8383. add r14,rax
  8384. adc rdx,0
  8385. mov r10,rdx
  8386. mov rax,QWORD[((8+160+0))+rbp]
  8387. mul r11
  8388. add r15,rax
  8389. adc rdx,0
  8390. imul r9,r12
  8391. add r15,r10
  8392. adc r9,rdx
  ; Reduce: low 130 bits plus five times the bits above bit 130.
  8393. mov r10,r13
  8394. mov r11,r14
  8395. mov r12,r15
  8396. and r12,3
  8397. mov r13,r15
  8398. and r13,-4
  8399. mov r14,r9
  8400. shrd r15,r9,2
  8401. shr r9,2
  8402. add r15,r13
  8403. adc r9,r14
  8404. add r10,r15
  8405. adc r11,r9
  8406. adc r12,0
  ; Hash the second 16 output bytes (at [rdi+16]) the same way.
  8407. add r10,QWORD[((0+16))+rdi]
  8408. adc r11,QWORD[((8+16))+rdi]
  8409. adc r12,1
  8410. mov rax,QWORD[((0+160+0))+rbp]
  8411. mov r15,rax
  8412. mul r10
  8413. mov r13,rax
  8414. mov r14,rdx
  8415. mov rax,QWORD[((0+160+0))+rbp]
  8416. mul r11
  8417. imul r15,r12
  8418. add r14,rax
  8419. adc r15,rdx
  8420. mov rax,QWORD[((8+160+0))+rbp]
  8421. mov r9,rax
  8422. mul r10
  8423. add r14,rax
  8424. adc rdx,0
  8425. mov r10,rdx
  8426. mov rax,QWORD[((8+160+0))+rbp]
  8427. mul r11
  8428. add r15,rax
  8429. adc rdx,0
  8430. imul r9,r12
  8431. add r15,r10
  8432. adc r9,rdx
  8433. mov r10,r13
  8434. mov r11,r14
  8435. mov r12,r15
  8436. and r12,3
  8437. mov r13,r15
  8438. and r13,-4
  8439. mov r14,r9
  8440. shrd r15,r9,2
  8441. shr r9,2
  8442. add r15,r13
  8443. adc r9,r14
  8444. add r10,r15
  8445. adc r11,r9
  8446. adc r12,0
  8447. lea rdi,[32+rdi]
  ; Shift the key-stream queue so the next 32 bytes land in ymm0:
  ; ymm0 takes ymm4, ymm4 takes ymm8, and so on down to ymm2 taking ymm6.
  8448. vmovdqa ymm0,ymm4
  8449. vmovdqa ymm4,ymm8
  8450. vmovdqa ymm8,ymm12
  8451. vmovdqa ymm12,ymm1
  8452. vmovdqa ymm1,ymm5
  8453. vmovdqa ymm5,ymm9
  8454. vmovdqa ymm9,ymm13
  8455. vmovdqa ymm13,ymm2
  8456. vmovdqa ymm2,ymm6
  8457. jmp NEAR $L$seal_avx2_short_loop
  ; Fewer than 32 bytes remain. If at least 16 remain, process one 16-byte
  ; block with the low half of ymm0, hash it, and move the high half of ymm0
  ; down into xmm0; otherwise go straight to the exit.
  8458. $L$seal_avx2_short_tail:
  8459. cmp rbx,16
  8460. jb NEAR $L$seal_avx2_exit
  8461. sub rbx,16
  8462. vpxor xmm3,xmm0,XMMWORD[rsi]
  8463. vmovdqu XMMWORD[rdi],xmm3
  8464. lea rsi,[16+rsi]
  ; Hash the 16 bytes just written: acc += block (plus 1 in the top limb).
  8465. add r10,QWORD[((0+0))+rdi]
  8466. adc r11,QWORD[((8+0))+rdi]
  8467. adc r12,1
  ; acc *= the multiplier at [rbp+160] / [rbp+168].
  8468. mov rax,QWORD[((0+160+0))+rbp]
  8469. mov r15,rax
  8470. mul r10
  8471. mov r13,rax
  8472. mov r14,rdx
  8473. mov rax,QWORD[((0+160+0))+rbp]
  8474. mul r11
  8475. imul r15,r12
  8476. add r14,rax
  8477. adc r15,rdx
  8478. mov rax,QWORD[((8+160+0))+rbp]
  8479. mov r9,rax
  8480. mul r10
  8481. add r14,rax
  8482. adc rdx,0
  8483. mov r10,rdx
  8484. mov rax,QWORD[((8+160+0))+rbp]
  8485. mul r11
  8486. add r15,rax
  8487. adc rdx,0
  8488. imul r9,r12
  8489. add r15,r10
  8490. adc r9,rdx
  ; Reduce: low 130 bits plus five times the bits above bit 130.
  8491. mov r10,r13
  8492. mov r11,r14
  8493. mov r12,r15
  8494. and r12,3
  8495. mov r13,r15
  8496. and r13,-4
  8497. mov r14,r9
  8498. shrd r15,r9,2
  8499. shr r9,2
  8500. add r15,r13
  8501. adc r9,r14
  8502. add r10,r15
  8503. adc r11,r9
  8504. adc r12,0
  8505. lea rdi,[16+rdi]
  ; xmm0 = upper 128 bits of ymm0, i.e. the next 16 key-stream bytes.
  8506. vextracti128 xmm0,ymm0,1
  ; Leave the AVX2 path: zero the upper halves of all ymm registers, then
  ; continue at $L$seal_sse_tail_16 (defined elsewhere in this file).
  ; NOTE(review): presumably the SSE code there handles the last sub-16-byte
  ; remainder using xmm0 - confirm against its definition.
  8507. $L$seal_avx2_exit:
  8508. vzeroupper
  8509. jmp NEAR $L$seal_sse_tail_16