chacha-x86_64-nasm.asm 39 KB


  1. ; This file is generated from a similarly-named Perl script in the BoringSSL
  2. ; source tree. Do not edit by hand.
  3. default rel
  4. %define XMMWORD
  5. %define YMMWORD
  6. %define ZMMWORD
  7. section .text code align=64
  8. EXTERN GFp_ia32cap_P
  ; ---------------------------------------------------------------------
  ; Constant pool used by the ChaCha20 code paths in this file. All
  ; references are RIP-relative because of the `default rel` directive.
  ; ---------------------------------------------------------------------
  9. ALIGN 64
  ; Four zero dwords. Not referenced in the visible part of this file.
  10. $L$zero:
  11. DD 0,0,0,0
  ; Added (paddd) to the counter row once per 64-byte block in the scalar
  ; and SSSE3 paths: only the lowest dword is incremented.
  12. $L$one:
  13. DD 1,0,0,0
  ; Per-lane offsets 0..3 added to the broadcast counter word in the 4x path,
  ; so that each of the four lanes processes a different block.
  14. $L$inc:
  15. DD 0,1,2,3
  ; Added to every counter lane per outer iteration of the 4x path.
  16. $L$four:
  17. DD 4,4,4,4
  ; Per-lane counter offsets added during the 8x path set-up.
  18. $L$incy:
  19. DD 0,2,4,6,1,3,5,7
  ; Added to every counter lane per outer iteration of the 8x path.
  20. $L$eight:
  21. DD 8,8,8,8,8,8,8,8
  ; pshufb mask: result byte i = source byte mask[i]. Within each dword the
  ; byte order becomes 2,3,0,1, i.e. every dword is rotated left by 16 bits.
  22. $L$rot16:
  23. DB 0x2,0x3,0x0,0x1,0x6,0x7,0x4,0x5,0xa,0xb,0x8,0x9,0xe,0xf,0xc,0xd
  ; pshufb mask: within each dword the byte order becomes 3,0,1,2, i.e. every
  ; dword is rotated left by 8 bits (equivalently right by 24).
  24. $L$rot24:
  25. DB 0x3,0x0,0x1,0x2,0x7,0x4,0x5,0x6,0xb,0x8,0x9,0xa,0xf,0xc,0xd,0xe
  ; ASCII "expand 32-byte k" followed by a NUL byte: the first state row.
  ; The scalar path uses the same 16 bytes as four immediate dwords.
  26. $L$sigma:
  27. DB 101,120,112,97,110,100,32,51,50,45,98,121,116,101,32,107
  28. DB 0
  29. ALIGN 64
  ; The four 16-dword tables below are not referenced in the visible part of
  ; this file. NOTE(review): presumably left over for wider-vector code paths
  ; that are not present here - confirm against the generating Perl script.
  30. $L$zeroz:
  31. DD 0,0,0,0,1,0,0,0,2,0,0,0,3,0,0,0
  32. $L$fourz:
  33. DD 4,0,0,0,4,0,0,0,4,0,0,0,4,0,0,0
  34. $L$incz:
  35. DD 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
  36. $L$sixteen:
  37. DD 16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16
  ; ASCII identification string, NUL-terminated:
  ; "ChaCha20 for x86_64, CRYPTOGAMS by <appro@openssl.org>"
  38. DB 67,104,97,67,104,97,50,48,32,102,111,114,32,120,56,54
  39. DB 95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32
  40. DB 98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115
  41. DB 108,46,111,114,103,62,0
  ; ---------------------------------------------------------------------
  ; GFp_ChaCha20_ctr32 - exported entry point; dispatcher plus scalar path.
  ;
  ; ABI: Microsoft x64. The five arguments arrive in rcx, rdx, r8, r9 and
  ; [rsp+40] and are moved into rdi, rsi, rdx, rcx, r8. As used below:
  ;   rdi = output pointer           (stores go to [rdi])
  ;   rsi = input pointer            (input is read from [rsi])
  ;   rdx = length in bytes          (0 returns immediately)
  ;   rcx = pointer to 32 bytes loaded as state rows 1 and 2
  ;         (presumably the key - confirm against the C prototype)
  ;   r8  = pointer to 16 bytes loaded as state row 3; its lowest dword is
  ;         the block counter, incremented by one per 64-byte block
  ;
  ; rdi and rsi (callee-saved on Win64) are parked in the caller-provided
  ; home slots [rsp+8] and [rsp+16] and restored before every return.
  ;
  ; Scalar-path stack frame after the six pushes and `sub rsp,64+24`:
  ;   [rsp+ 0..15]  scratch (keystream words 0-3 in the tail path)
  ;   [rsp+16..31]  state row 1 (input words 4-7)
  ;   [rsp+32..47]  state row 2; inside the round loop it holds whichever
  ;                 two of the working words 8-11 are not in esi/edi
  ;   [rsp+48..63]  state row 3 (counter row)
  ;   [rsp+64]      remaining length
  ;   [rsp+72]      input pointer
  ;   [rsp+80]      output pointer
  ; ---------------------------------------------------------------------
  42. global GFp_ChaCha20_ctr32
  43. ALIGN 64
  44. GFp_ChaCha20_ctr32:
  45. mov QWORD[8+rsp],rdi ;WIN64 prologue
  46. mov QWORD[16+rsp],rsi
  ; rax = rsp at entry. It is not read again in this function.
  ; NOTE(review): presumably consumed by SEH unwind data outside this view.
  47. mov rax,rsp
  48. $L$SEH_begin_GFp_ChaCha20_ctr32:
  49. mov rdi,rcx
  50. mov rsi,rdx
  51. mov rdx,r8
  52. mov rcx,r9
  53. mov r8,QWORD[40+rsp]
  ; Nothing to do for a zero length.
  54. cmp rdx,0
  55. je NEAR $L$no_data
  ; r10 = 64 bits of the capability vector starting at byte offset 4. It
  ; stays live for the SSSE3/4x dispatch code, which tests further bits.
  ; Bit 9 of the low dword selects the vector code paths.
  ; NOTE(review): presumably the SSSE3 feature flag - confirm against the
  ; definition of GFp_ia32cap_P. No register has been pushed yet, so the
  ; vector paths see the entry-time rsp.
  56. mov r10,QWORD[((GFp_ia32cap_P+4))]
  57. test r10d,512
  58. jnz NEAR $L$ChaCha20_ssse3
  ; Scalar path: save the callee-saved GPRs it clobbers and build the frame.
  59. push rbx
  60. push rbp
  61. push r12
  62. push r13
  63. push r14
  64. push r15
  65. sub rsp,64+24
  66. $L$ctr32_body:
  ; xmm1/xmm2 = state rows 1/2, xmm3 = counter row, xmm4 = {1,0,0,0}.
  67. movdqu xmm1,XMMWORD[rcx]
  68. movdqu xmm2,XMMWORD[16+rcx]
  69. movdqu xmm3,XMMWORD[r8]
  70. movdqa xmm4,XMMWORD[$L$one]
  71. movdqa XMMWORD[16+rsp],xmm1
  72. movdqa XMMWORD[32+rsp],xmm2
  73. movdqa XMMWORD[48+rsp],xmm3
  ; rbp = bytes remaining.
  74. mov rbp,rdx
  75. jmp NEAR $L$oop_outer
  76. ALIGN 32
  ; One iteration per 64-byte block. Working state in registers:
  ;   eax,ebx,ecx,edx = words 0-3 (the "expand 32-byte k" constants)
  ;   r8d..r11d       = words 4-7
  ;   esi,edi         = two of words 8-11 (the other two live at [rsp+32..47])
  ;   r12d..r15d      = words 12-15
  77. $L$oop_outer:
  78. mov eax,0x61707865
  79. mov ebx,0x3320646e
  80. mov ecx,0x79622d32
  81. mov edx,0x6b206574
  82. mov r8d,DWORD[16+rsp]
  83. mov r9d,DWORD[20+rsp]
  84. mov r10d,DWORD[24+rsp]
  85. mov r11d,DWORD[28+rsp]
  ; Word 12 comes from xmm3, which already holds the current counter.
  86. movd r12d,xmm3
  87. mov r13d,DWORD[52+rsp]
  88. mov r14d,DWORD[56+rsp]
  89. mov r15d,DWORD[60+rsp]
  ; Park length and pointers so rbp/rsi/rdi can serve as working registers.
  90. mov QWORD[((64+0))+rsp],rbp
  ; ebp = 10 loop iterations, each doing a column and a diagonal round.
  91. mov ebp,10
  92. mov QWORD[((64+8))+rsp],rsi
  ; Hand-encoded `movq rsi,xmm2`: esi = word 8, upper half of rsi = word 9.
  93. DB 102,72,15,126,214
  94. mov QWORD[((64+16))+rsp],rdi
  ; edi = word 9.
  95. mov rdi,rsi
  96. shr rdi,32
  97. jmp NEAR $L$oop
  98. ALIGN 32
  99. $L$oop:
  ; Column quarter-rounds on (0,4,8,12) and (1,5,9,13), interleaved.
  ; Rotation amounts are 16, 12, 8, 7.
  100. add eax,r8d
  101. xor r12d,eax
  102. rol r12d,16
  103. add ebx,r9d
  104. xor r13d,ebx
  105. rol r13d,16
  106. add esi,r12d
  107. xor r8d,esi
  108. rol r8d,12
  109. add edi,r13d
  110. xor r9d,edi
  111. rol r9d,12
  112. add eax,r8d
  113. xor r12d,eax
  114. rol r12d,8
  115. add ebx,r9d
  116. xor r13d,ebx
  117. rol r13d,8
  118. add esi,r12d
  119. xor r8d,esi
  120. rol r8d,7
  121. add edi,r13d
  122. xor r9d,edi
  123. rol r9d,7
  ; Spill words 8,9 and bring words 10,11 into esi,edi.
  124. mov DWORD[32+rsp],esi
  125. mov DWORD[36+rsp],edi
  126. mov esi,DWORD[40+rsp]
  127. mov edi,DWORD[44+rsp]
  ; Column quarter-rounds on (2,6,10,14) and (3,7,11,15).
  128. add ecx,r10d
  129. xor r14d,ecx
  130. rol r14d,16
  131. add edx,r11d
  132. xor r15d,edx
  133. rol r15d,16
  134. add esi,r14d
  135. xor r10d,esi
  136. rol r10d,12
  137. add edi,r15d
  138. xor r11d,edi
  139. rol r11d,12
  140. add ecx,r10d
  141. xor r14d,ecx
  142. rol r14d,8
  143. add edx,r11d
  144. xor r15d,edx
  145. rol r15d,8
  146. add esi,r14d
  147. xor r10d,esi
  148. rol r10d,7
  149. add edi,r15d
  150. xor r11d,edi
  151. rol r11d,7
  ; Diagonal quarter-rounds on (0,5,10,15) and (1,6,11,12); esi,edi still
  ; hold words 10,11.
  152. add eax,r9d
  153. xor r15d,eax
  154. rol r15d,16
  155. add ebx,r10d
  156. xor r12d,ebx
  157. rol r12d,16
  158. add esi,r15d
  159. xor r9d,esi
  160. rol r9d,12
  161. add edi,r12d
  162. xor r10d,edi
  163. rol r10d,12
  164. add eax,r9d
  165. xor r15d,eax
  166. rol r15d,8
  167. add ebx,r10d
  168. xor r12d,ebx
  169. rol r12d,8
  170. add esi,r15d
  171. xor r9d,esi
  172. rol r9d,7
  173. add edi,r12d
  174. xor r10d,edi
  175. rol r10d,7
  ; Spill words 10,11 and bring words 8,9 back into esi,edi.
  176. mov DWORD[40+rsp],esi
  177. mov DWORD[44+rsp],edi
  178. mov esi,DWORD[32+rsp]
  179. mov edi,DWORD[36+rsp]
  ; Diagonal quarter-rounds on (2,7,8,13) and (3,4,9,14).
  180. add ecx,r11d
  181. xor r13d,ecx
  182. rol r13d,16
  183. add edx,r8d
  184. xor r14d,edx
  185. rol r14d,16
  186. add esi,r13d
  187. xor r11d,esi
  188. rol r11d,12
  189. add edi,r14d
  190. xor r8d,edi
  191. rol r8d,12
  192. add ecx,r11d
  193. xor r13d,ecx
  194. rol r13d,8
  195. add edx,r8d
  196. xor r14d,edx
  197. rol r14d,8
  198. add esi,r13d
  199. xor r11d,esi
  200. rol r11d,7
  201. add edi,r14d
  202. xor r8d,edi
  203. rol r8d,7
  204. dec ebp
  205. jnz NEAR $L$oop
  ; Rounds done. Flush words 8,9 so [rsp+32..47] holds all of the final
  ; words 8-11, then recover length and pointers.
  206. mov DWORD[36+rsp],edi
  207. mov DWORD[32+rsp],esi
  208. mov rbp,QWORD[64+rsp]
  ; xmm1 = input words 8-11 (xmm2 still holds the original row 2).
  209. movdqa xmm1,xmm2
  210. mov rsi,QWORD[((64+8))+rsp]
  ; Advance the counter in xmm3 for the next block. [rsp+48] still holds
  ; the counter value used for this block.
  211. paddd xmm3,xmm4
  212. mov rdi,QWORD[((64+16))+rsp]
  ; Feed-forward: add the input state to the permuted state.
  213. add eax,0x61707865
  214. add ebx,0x3320646e
  215. add ecx,0x79622d32
  216. add edx,0x6b206574
  217. add r8d,DWORD[16+rsp]
  218. add r9d,DWORD[20+rsp]
  219. add r10d,DWORD[24+rsp]
  220. add r11d,DWORD[28+rsp]
  221. add r12d,DWORD[48+rsp]
  222. add r13d,DWORD[52+rsp]
  223. add r14d,DWORD[56+rsp]
  224. add r15d,DWORD[60+rsp]
  ; xmm1 = keystream words 8-11.
  225. paddd xmm1,XMMWORD[32+rsp]
  ; Fewer than 64 bytes left: take the byte-wise tail path.
  226. cmp rbp,64
  227. jb NEAR $L$tail
  ; Full block: XOR the keystream with 64 input bytes.
  228. xor eax,DWORD[rsi]
  229. xor ebx,DWORD[4+rsi]
  230. xor ecx,DWORD[8+rsi]
  231. xor edx,DWORD[12+rsi]
  232. xor r8d,DWORD[16+rsi]
  233. xor r9d,DWORD[20+rsi]
  234. xor r10d,DWORD[24+rsi]
  235. xor r11d,DWORD[28+rsi]
  236. movdqu xmm0,XMMWORD[32+rsi]
  237. xor r12d,DWORD[48+rsi]
  238. xor r13d,DWORD[52+rsi]
  239. xor r14d,DWORD[56+rsi]
  240. xor r15d,DWORD[60+rsi]
  241. lea rsi,[64+rsi]
  242. pxor xmm0,xmm1
  ; Restore the input row 2 on the stack and store the advanced counter
  ; dword for the next block.
  243. movdqa XMMWORD[32+rsp],xmm2
  244. movd DWORD[48+rsp],xmm3
  ; Write 64 output bytes.
  245. mov DWORD[rdi],eax
  246. mov DWORD[4+rdi],ebx
  247. mov DWORD[8+rdi],ecx
  248. mov DWORD[12+rdi],edx
  249. mov DWORD[16+rdi],r8d
  250. mov DWORD[20+rdi],r9d
  251. mov DWORD[24+rdi],r10d
  252. mov DWORD[28+rdi],r11d
  253. movdqu XMMWORD[32+rdi],xmm0
  254. mov DWORD[48+rdi],r12d
  255. mov DWORD[52+rdi],r13d
  256. mov DWORD[56+rdi],r14d
  257. mov DWORD[60+rdi],r15d
  258. lea rdi,[64+rdi]
  259. sub rbp,64
  260. jnz NEAR $L$oop_outer
  261. jmp NEAR $L$done
  262. ALIGN 16
  ; Tail (1..63 bytes left): dump the 64-byte keystream block to
  ; [rsp+0..63] and XOR byte by byte. This overwrites the saved state rows,
  ; which are no longer needed because this is the last block.
  263. $L$tail:
  264. mov DWORD[rsp],eax
  265. mov DWORD[4+rsp],ebx
  ; rbx = byte index, starting at 0.
  266. xor rbx,rbx
  267. mov DWORD[8+rsp],ecx
  268. mov DWORD[12+rsp],edx
  269. mov DWORD[16+rsp],r8d
  270. mov DWORD[20+rsp],r9d
  271. mov DWORD[24+rsp],r10d
  272. mov DWORD[28+rsp],r11d
  273. movdqa XMMWORD[32+rsp],xmm1
  274. mov DWORD[48+rsp],r12d
  275. mov DWORD[52+rsp],r13d
  276. mov DWORD[56+rsp],r14d
  277. mov DWORD[60+rsp],r15d
  ; out[i] = in[i] ^ keystream[i], for rbp remaining bytes.
  278. $L$oop_tail:
  279. movzx eax,BYTE[rbx*1+rsi]
  280. movzx edx,BYTE[rbx*1+rsp]
  281. lea rbx,[1+rbx]
  282. xor eax,edx
  283. mov BYTE[((-1))+rbx*1+rdi],al
  284. dec rbp
  285. jnz NEAR $L$oop_tail
  ; Tear down the frame: rsi = rsp value before the six pushes (88 bytes of
  ; locals + 48 bytes of pushed registers), then reload the saved GPRs.
  286. $L$done:
  287. lea rsi,[((64+24+48))+rsp]
  288. mov r15,QWORD[((-48))+rsi]
  289. mov r14,QWORD[((-40))+rsi]
  290. mov r13,QWORD[((-32))+rsi]
  291. mov r12,QWORD[((-24))+rsi]
  292. mov rbp,QWORD[((-16))+rsi]
  293. mov rbx,QWORD[((-8))+rsi]
  294. lea rsp,[rsi]
  ; Common exit, also taken directly for a zero length: restore rdi/rsi
  ; from the home slots and return.
  295. $L$no_data:
  296. mov rdi,QWORD[8+rsp] ;WIN64 epilogue
  297. mov rsi,QWORD[16+rsp]
  298. DB 0F3h,0C3h ;repret
  299. $L$SEH_end_GFp_ChaCha20_ctr32:
  ; ---------------------------------------------------------------------
  ; ChaCha20_ssse3 - one 64-byte block at a time using SSE2/SSSE3.
  ;
  ; The ChaCha20_ssse3 label carries its own Win64 prologue (same argument
  ; shuffle as GFp_ChaCha20_ctr32) but is not declared global in the visible
  ; part of this file. The dispatcher enters at $L$ChaCha20_ssse3 with
  ;   rdi = output, rsi = input, rdx = length, rcx = rows 1-2, r8 = row 3
  ; already set up and rdi/rsi already saved in the home slots.
  ; $L$do_sse3_after_all is also a jump target of the 4x dispatch code.
  ;
  ; Register use: xmm0..xmm3 = state rows 0..3, xmm4/xmm5 = temporaries,
  ; xmm6 = rot16 mask, xmm7 = rot24 mask, r8 = round counter,
  ; r9 = rsp at entry (frame anchor).
  ; Frame: [rsp+0..63] = the four input state rows; xmm6/xmm7 (callee-saved
  ; on Win64) are saved at [r9-40] and [r9-24].
  ; ---------------------------------------------------------------------
  300. ALIGN 32
  301. ChaCha20_ssse3:
  302. mov QWORD[8+rsp],rdi ;WIN64 prologue
  303. mov QWORD[16+rsp],rsi
  304. mov rax,rsp
  305. $L$SEH_begin_ChaCha20_ssse3:
  306. mov rdi,rcx
  307. mov rsi,rdx
  308. mov rdx,r8
  309. mov rcx,r9
  310. mov r8,QWORD[40+rsp]
  311. $L$ChaCha20_ssse3:
  312. mov r9,rsp
  ; More than 128 bytes: hand over to the 4x dispatch code.
  313. cmp rdx,128
  314. ja NEAR $L$ChaCha20_4x
  315. $L$do_sse3_after_all:
  316. sub rsp,64+40
  317. movaps XMMWORD[(-40)+r9],xmm6
  318. movaps XMMWORD[(-24)+r9],xmm7
  319. $L$ssse3_body:
  ; Load the four state rows and the two byte-shuffle masks.
  320. movdqa xmm0,XMMWORD[$L$sigma]
  321. movdqu xmm1,XMMWORD[rcx]
  322. movdqu xmm2,XMMWORD[16+rcx]
  323. movdqu xmm3,XMMWORD[r8]
  324. movdqa xmm6,XMMWORD[$L$rot16]
  325. movdqa xmm7,XMMWORD[$L$rot24]
  ; Keep a copy of the input state for the feed-forward and for reloading.
  326. movdqa XMMWORD[rsp],xmm0
  327. movdqa XMMWORD[16+rsp],xmm1
  328. movdqa XMMWORD[32+rsp],xmm2
  329. movdqa XMMWORD[48+rsp],xmm3
  ; r8 = 10 loop iterations (the pointer it held has been consumed).
  330. mov r8,10
  331. jmp NEAR $L$oop_ssse3
  332. ALIGN 32
  ; Second and later blocks: reload the state, bump the counter dword by
  ; one and store the new counter row back.
  333. $L$oop_outer_ssse3:
  334. movdqa xmm3,XMMWORD[$L$one]
  335. movdqa xmm0,XMMWORD[rsp]
  336. movdqa xmm1,XMMWORD[16+rsp]
  337. movdqa xmm2,XMMWORD[32+rsp]
  338. paddd xmm3,XMMWORD[48+rsp]
  339. mov r8,10
  340. movdqa XMMWORD[48+rsp],xmm3
  341. jmp NEAR $L$oop_ssse3
  342. ALIGN 32
  343. $L$oop_ssse3:
  ; Column round: all four quarter-rounds at once, one per dword lane.
  344. paddd xmm0,xmm1
  345. pxor xmm3,xmm0
  ; Hand-encoded `pshufb xmm3,xmm6`: rotate each dword left by 16.
  346. DB 102,15,56,0,222
  347. paddd xmm2,xmm3
  348. pxor xmm1,xmm2
  ; Rotate left by 12 via shift right 20 | shift left 12.
  349. movdqa xmm4,xmm1
  350. psrld xmm1,20
  351. pslld xmm4,12
  352. por xmm1,xmm4
  353. paddd xmm0,xmm1
  354. pxor xmm3,xmm0
  ; Hand-encoded `pshufb xmm3,xmm7`: rotate each dword left by 8.
  355. DB 102,15,56,0,223
  356. paddd xmm2,xmm3
  357. pxor xmm1,xmm2
  ; Rotate left by 7 via shift right 25 | shift left 7.
  358. movdqa xmm4,xmm1
  359. psrld xmm1,25
  360. pslld xmm4,7
  361. por xmm1,xmm4
  ; Rotate the dwords of rows 2, 1 and 3 by two, one and three positions so
  ; that the diagonals of the state line up as columns.
  362. pshufd xmm2,xmm2,78
  363. pshufd xmm1,xmm1,57
  364. pshufd xmm3,xmm3,147
  ; NOTE(review): presumably padding for instruction alignment or
  ; scheduling - it has no architectural effect.
  365. nop
  ; Diagonal round: same quarter-round sequence on the rotated rows.
  366. paddd xmm0,xmm1
  367. pxor xmm3,xmm0
  368. DB 102,15,56,0,222
  369. paddd xmm2,xmm3
  370. pxor xmm1,xmm2
  371. movdqa xmm4,xmm1
  372. psrld xmm1,20
  373. pslld xmm4,12
  374. por xmm1,xmm4
  375. paddd xmm0,xmm1
  376. pxor xmm3,xmm0
  377. DB 102,15,56,0,223
  378. paddd xmm2,xmm3
  379. pxor xmm1,xmm2
  380. movdqa xmm4,xmm1
  381. psrld xmm1,25
  382. pslld xmm4,7
  383. por xmm1,xmm4
  ; Undo the row rotation (rows 1 and 3 use the opposite immediates).
  384. pshufd xmm2,xmm2,78
  385. pshufd xmm1,xmm1,147
  386. pshufd xmm3,xmm3,57
  387. dec r8
  388. jnz NEAR $L$oop_ssse3
  ; Feed-forward: add the saved input state to obtain the keystream block.
  389. paddd xmm0,XMMWORD[rsp]
  390. paddd xmm1,XMMWORD[16+rsp]
  391. paddd xmm2,XMMWORD[32+rsp]
  392. paddd xmm3,XMMWORD[48+rsp]
  393. cmp rdx,64
  394. jb NEAR $L$tail_ssse3
  ; Full block: out = in ^ keystream, 64 bytes.
  395. movdqu xmm4,XMMWORD[rsi]
  396. movdqu xmm5,XMMWORD[16+rsi]
  397. pxor xmm0,xmm4
  398. movdqu xmm4,XMMWORD[32+rsi]
  399. pxor xmm1,xmm5
  400. movdqu xmm5,XMMWORD[48+rsi]
  401. lea rsi,[64+rsi]
  402. pxor xmm2,xmm4
  403. pxor xmm3,xmm5
  404. movdqu XMMWORD[rdi],xmm0
  405. movdqu XMMWORD[16+rdi],xmm1
  406. movdqu XMMWORD[32+rdi],xmm2
  407. movdqu XMMWORD[48+rdi],xmm3
  408. lea rdi,[64+rdi]
  409. sub rdx,64
  410. jnz NEAR $L$oop_outer_ssse3
  411. jmp NEAR $L$done_ssse3
  412. ALIGN 16
  ; Tail (1..63 bytes): write the keystream block over the saved state on
  ; the stack and XOR it into the output byte by byte. r8 = byte index.
  413. $L$tail_ssse3:
  414. movdqa XMMWORD[rsp],xmm0
  415. movdqa XMMWORD[16+rsp],xmm1
  416. movdqa XMMWORD[32+rsp],xmm2
  417. movdqa XMMWORD[48+rsp],xmm3
  418. xor r8,r8
  419. $L$oop_tail_ssse3:
  420. movzx eax,BYTE[r8*1+rsi]
  421. movzx ecx,BYTE[r8*1+rsp]
  422. lea r8,[1+r8]
  423. xor eax,ecx
  424. mov BYTE[((-1))+r8*1+rdi],al
  425. dec rdx
  426. jnz NEAR $L$oop_tail_ssse3
  ; Restore xmm6/xmm7 and the entry-time stack pointer.
  427. $L$done_ssse3:
  428. movaps xmm6,XMMWORD[((-40))+r9]
  429. movaps xmm7,XMMWORD[((-24))+r9]
  430. lea rsp,[r9]
  431. $L$ssse3_epilogue:
  432. mov rdi,QWORD[8+rsp] ;WIN64 epilogue
  433. mov rsi,QWORD[16+rsp]
  434. DB 0F3h,0C3h ;repret
  435. $L$SEH_end_ChaCha20_ssse3:
  ; ---------------------------------------------------------------------
  ; ChaCha20_4x - four 64-byte blocks in parallel using 128-bit vectors.
  ;
  ; Reached from the SSSE3 dispatch code at $L$ChaCha20_4x with
  ;   rdi = output, rsi = input, rdx = length, rcx = rows 1-2, r8 = row 3.
  ; NOTE(review): the dispatch below reads r10, which the visible callers
  ; set only in GFp_ChaCha20_ctr32 (capability qword); the stand-alone
  ; prologue at the ChaCha20_4x label does not load it - confirm that this
  ; label is never called directly.
  ;
  ; Each vector register holds ONE state word for all four blocks:
  ;   xmm8..xmm11  = words 0-3      xmm12..xmm15 = words 4-7
  ;   xmm4,xmm5    = two of words 8-11 (the other two are spilled)
  ;   xmm0..xmm3   = words 12-15    xmm6,xmm7    = masks / temporaries
  ; Frame (rsp is 0x140+168 bytes below r9; rcx = rsp+256):
  ;   [rsp+  0.. 63]  spill slots for words 8-11, later transposed rows
  ;   [rsp+ 64..127]  broadcast words 0-3
  ;   [rsp+128..255]  broadcast words 4-11
  ;   [rsp+256..319]  counter lanes (word 12) and broadcast words 13-15
  ;   [r9-168..r9-9]  saved xmm6..xmm15 (callee-saved on Win64)
  ; ---------------------------------------------------------------------
  436. ALIGN 32
  437. ChaCha20_4x:
  438. mov QWORD[8+rsp],rdi ;WIN64 prologue
  439. mov QWORD[16+rsp],rsi
  440. mov rax,rsp
  441. $L$SEH_begin_ChaCha20_4x:
  442. mov rdi,rcx
  443. mov rsi,rdx
  444. mov rdx,r8
  445. mov rcx,r9
  446. mov r8,QWORD[40+rsp]
  447. $L$ChaCha20_4x:
  448. mov r9,rsp
  ; r11 keeps the full capability qword; r10 = its upper dword.
  449. mov r11,r10
  450. shr r10,32
  ; Bit 5 of the upper dword selects the 8x path.
  ; NOTE(review): presumably the AVX2 feature flag - confirm.
  451. test r10,32
  452. jnz NEAR $L$ChaCha20_8x
  ; More than 192 bytes always uses the 4x code.
  453. cmp rdx,192
  454. ja NEAR $L$proceed4x
  ; For 129..192 bytes: if, of capability bits 22 and 26, only bit 22 is
  ; set, fall back to the one-block SSSE3 code.
  ; NOTE(review): presumably identifies a CPU family on which the 4x code
  ; is slower for short inputs - confirm against the Perl source.
  455. and r11,71303168
  456. cmp r11,4194304
  457. je NEAR $L$do_sse3_after_all
  458. $L$proceed4x:
  459. sub rsp,0x140+168
  460. movaps XMMWORD[(-168)+r9],xmm6
  461. movaps XMMWORD[(-152)+r9],xmm7
  462. movaps XMMWORD[(-136)+r9],xmm8
  463. movaps XMMWORD[(-120)+r9],xmm9
  464. movaps XMMWORD[(-104)+r9],xmm10
  465. movaps XMMWORD[(-88)+r9],xmm11
  466. movaps XMMWORD[(-72)+r9],xmm12
  467. movaps XMMWORD[(-56)+r9],xmm13
  468. movaps XMMWORD[(-40)+r9],xmm14
  469. movaps XMMWORD[(-24)+r9],xmm15
  470. $L$4x_body:
  ; Load the four state rows, then broadcast every dword across a register
  ; (pshufd 0x00/0x55/0xaa/0xff pick dword 0/1/2/3) and save the copies.
  471. movdqa xmm11,XMMWORD[$L$sigma]
  472. movdqu xmm15,XMMWORD[rcx]
  473. movdqu xmm7,XMMWORD[16+rcx]
  474. movdqu xmm3,XMMWORD[r8]
  ; rcx now addresses the frame (rsp+256); r10/r11 point at the masks.
  475. lea rcx,[256+rsp]
  476. lea r10,[$L$rot16]
  477. lea r11,[$L$rot24]
  478. pshufd xmm8,xmm11,0x00
  479. pshufd xmm9,xmm11,0x55
  480. movdqa XMMWORD[64+rsp],xmm8
  481. pshufd xmm10,xmm11,0xaa
  482. movdqa XMMWORD[80+rsp],xmm9
  483. pshufd xmm11,xmm11,0xff
  484. movdqa XMMWORD[96+rsp],xmm10
  485. movdqa XMMWORD[112+rsp],xmm11
  486. pshufd xmm12,xmm15,0x00
  487. pshufd xmm13,xmm15,0x55
  488. movdqa XMMWORD[(128-256)+rcx],xmm12
  489. pshufd xmm14,xmm15,0xaa
  490. movdqa XMMWORD[(144-256)+rcx],xmm13
  491. pshufd xmm15,xmm15,0xff
  492. movdqa XMMWORD[(160-256)+rcx],xmm14
  493. movdqa XMMWORD[(176-256)+rcx],xmm15
  494. pshufd xmm4,xmm7,0x00
  495. pshufd xmm5,xmm7,0x55
  496. movdqa XMMWORD[(192-256)+rcx],xmm4
  497. pshufd xmm6,xmm7,0xaa
  498. movdqa XMMWORD[(208-256)+rcx],xmm5
  499. pshufd xmm7,xmm7,0xff
  500. movdqa XMMWORD[(224-256)+rcx],xmm6
  501. movdqa XMMWORD[(240-256)+rcx],xmm7
  ; Counter word: broadcast, then add {0,1,2,3} so each lane gets its own
  ; block counter. It is stored to the frame at $L$oop_enter4x.
  502. pshufd xmm0,xmm3,0x00
  503. pshufd xmm1,xmm3,0x55
  504. paddd xmm0,XMMWORD[$L$inc]
  505. pshufd xmm2,xmm3,0xaa
  506. movdqa XMMWORD[(272-256)+rcx],xmm1
  507. pshufd xmm3,xmm3,0xff
  508. movdqa XMMWORD[(288-256)+rcx],xmm2
  509. movdqa XMMWORD[(304-256)+rcx],xmm3
  510. jmp NEAR $L$oop_enter4x
  511. ALIGN 32
  ; Second and later 256-byte groups: reload all sixteen broadcast words
  ; and advance every counter lane by four.
  512. $L$oop_outer4x:
  513. movdqa xmm8,XMMWORD[64+rsp]
  514. movdqa xmm9,XMMWORD[80+rsp]
  515. movdqa xmm10,XMMWORD[96+rsp]
  516. movdqa xmm11,XMMWORD[112+rsp]
  517. movdqa xmm12,XMMWORD[((128-256))+rcx]
  518. movdqa xmm13,XMMWORD[((144-256))+rcx]
  519. movdqa xmm14,XMMWORD[((160-256))+rcx]
  520. movdqa xmm15,XMMWORD[((176-256))+rcx]
  521. movdqa xmm4,XMMWORD[((192-256))+rcx]
  522. movdqa xmm5,XMMWORD[((208-256))+rcx]
  523. movdqa xmm6,XMMWORD[((224-256))+rcx]
  524. movdqa xmm7,XMMWORD[((240-256))+rcx]
  525. movdqa xmm0,XMMWORD[((256-256))+rcx]
  526. movdqa xmm1,XMMWORD[((272-256))+rcx]
  527. movdqa xmm2,XMMWORD[((288-256))+rcx]
  528. movdqa xmm3,XMMWORD[((304-256))+rcx]
  529. paddd xmm0,XMMWORD[$L$four]
  530. $L$oop_enter4x:
  ; Spill words 10,11 so xmm6/xmm7 are free; xmm7 = rot16 mask;
  ; eax = 10 loop iterations; remember the counter lanes for this group.
  531. movdqa XMMWORD[32+rsp],xmm6
  532. movdqa XMMWORD[48+rsp],xmm7
  533. movdqa xmm7,XMMWORD[r10]
  534. mov eax,10
  535. movdqa XMMWORD[(256-256)+rcx],xmm0
  536. jmp NEAR $L$oop4x
  537. ALIGN 32
  538. $L$oop4x:
  ; Column quarter-rounds on (0,4,8,12) and (1,5,9,13), interleaved.
  539. paddd xmm8,xmm12
  540. paddd xmm9,xmm13
  541. pxor xmm0,xmm8
  542. pxor xmm1,xmm9
  ; Hand-encoded `pshufb xmm0,xmm7` / `pshufb xmm1,xmm7` (rotate left 16).
  543. DB 102,15,56,0,199
  544. DB 102,15,56,0,207
  545. paddd xmm4,xmm0
  546. paddd xmm5,xmm1
  547. pxor xmm12,xmm4
  548. pxor xmm13,xmm5
  ; Rotate left by 12 (shift left 12 | shift right 20); xmm6 is then
  ; reloaded with the rot24 mask from [r11].
  549. movdqa xmm6,xmm12
  550. pslld xmm12,12
  551. psrld xmm6,20
  552. movdqa xmm7,xmm13
  553. pslld xmm13,12
  554. por xmm12,xmm6
  555. psrld xmm7,20
  556. movdqa xmm6,XMMWORD[r11]
  557. por xmm13,xmm7
  558. paddd xmm8,xmm12
  559. paddd xmm9,xmm13
  560. pxor xmm0,xmm8
  561. pxor xmm1,xmm9
  ; Hand-encoded `pshufb xmm0,xmm6` / `pshufb xmm1,xmm6` (rotate left 8).
  562. DB 102,15,56,0,198
  563. DB 102,15,56,0,206
  564. paddd xmm4,xmm0
  565. paddd xmm5,xmm1
  566. pxor xmm12,xmm4
  567. pxor xmm13,xmm5
  ; Rotate left by 7; xmm7 is then reloaded with the rot16 mask from [r10].
  568. movdqa xmm7,xmm12
  569. pslld xmm12,7
  570. psrld xmm7,25
  571. movdqa xmm6,xmm13
  572. pslld xmm13,7
  573. por xmm12,xmm7
  574. psrld xmm6,25
  575. movdqa xmm7,XMMWORD[r10]
  576. por xmm13,xmm6
  ; Spill words 8,9 and bring words 10,11 into xmm4,xmm5.
  577. movdqa XMMWORD[rsp],xmm4
  578. movdqa XMMWORD[16+rsp],xmm5
  579. movdqa xmm4,XMMWORD[32+rsp]
  580. movdqa xmm5,XMMWORD[48+rsp]
  ; Column quarter-rounds on (2,6,10,14) and (3,7,11,15).
  581. paddd xmm10,xmm14
  582. paddd xmm11,xmm15
  583. pxor xmm2,xmm10
  584. pxor xmm3,xmm11
  ; Hand-encoded `pshufb xmm2,xmm7` / `pshufb xmm3,xmm7`.
  585. DB 102,15,56,0,215
  586. DB 102,15,56,0,223
  587. paddd xmm4,xmm2
  588. paddd xmm5,xmm3
  589. pxor xmm14,xmm4
  590. pxor xmm15,xmm5
  591. movdqa xmm6,xmm14
  592. pslld xmm14,12
  593. psrld xmm6,20
  594. movdqa xmm7,xmm15
  595. pslld xmm15,12
  596. por xmm14,xmm6
  597. psrld xmm7,20
  598. movdqa xmm6,XMMWORD[r11]
  599. por xmm15,xmm7
  600. paddd xmm10,xmm14
  601. paddd xmm11,xmm15
  602. pxor xmm2,xmm10
  603. pxor xmm3,xmm11
  ; Hand-encoded `pshufb xmm2,xmm6` / `pshufb xmm3,xmm6`.
  604. DB 102,15,56,0,214
  605. DB 102,15,56,0,222
  606. paddd xmm4,xmm2
  607. paddd xmm5,xmm3
  608. pxor xmm14,xmm4
  609. pxor xmm15,xmm5
  610. movdqa xmm7,xmm14
  611. pslld xmm14,7
  612. psrld xmm7,25
  613. movdqa xmm6,xmm15
  614. pslld xmm15,7
  615. por xmm14,xmm7
  616. psrld xmm6,25
  617. movdqa xmm7,XMMWORD[r10]
  618. por xmm15,xmm6
  ; Diagonal quarter-rounds on (0,5,10,15) and (1,6,11,12); xmm4,xmm5
  ; still hold words 10,11.
  619. paddd xmm8,xmm13
  620. paddd xmm9,xmm14
  621. pxor xmm3,xmm8
  622. pxor xmm0,xmm9
  ; Hand-encoded `pshufb xmm3,xmm7` / `pshufb xmm0,xmm7`.
  623. DB 102,15,56,0,223
  624. DB 102,15,56,0,199
  625. paddd xmm4,xmm3
  626. paddd xmm5,xmm0
  627. pxor xmm13,xmm4
  628. pxor xmm14,xmm5
  629. movdqa xmm6,xmm13
  630. pslld xmm13,12
  631. psrld xmm6,20
  632. movdqa xmm7,xmm14
  633. pslld xmm14,12
  634. por xmm13,xmm6
  635. psrld xmm7,20
  636. movdqa xmm6,XMMWORD[r11]
  637. por xmm14,xmm7
  638. paddd xmm8,xmm13
  639. paddd xmm9,xmm14
  640. pxor xmm3,xmm8
  641. pxor xmm0,xmm9
  ; Hand-encoded `pshufb xmm3,xmm6` / `pshufb xmm0,xmm6`.
  642. DB 102,15,56,0,222
  643. DB 102,15,56,0,198
  644. paddd xmm4,xmm3
  645. paddd xmm5,xmm0
  646. pxor xmm13,xmm4
  647. pxor xmm14,xmm5
  648. movdqa xmm7,xmm13
  649. pslld xmm13,7
  650. psrld xmm7,25
  651. movdqa xmm6,xmm14
  652. pslld xmm14,7
  653. por xmm13,xmm7
  654. psrld xmm6,25
  655. movdqa xmm7,XMMWORD[r10]
  656. por xmm14,xmm6
  ; Spill words 10,11 and bring words 8,9 back into xmm4,xmm5.
  657. movdqa XMMWORD[32+rsp],xmm4
  658. movdqa XMMWORD[48+rsp],xmm5
  659. movdqa xmm4,XMMWORD[rsp]
  660. movdqa xmm5,XMMWORD[16+rsp]
  ; Diagonal quarter-rounds on (2,7,8,13) and (3,4,9,14).
  661. paddd xmm10,xmm15
  662. paddd xmm11,xmm12
  663. pxor xmm1,xmm10
  664. pxor xmm2,xmm11
  ; Hand-encoded `pshufb xmm1,xmm7` / `pshufb xmm2,xmm7`.
  665. DB 102,15,56,0,207
  666. DB 102,15,56,0,215
  667. paddd xmm4,xmm1
  668. paddd xmm5,xmm2
  669. pxor xmm15,xmm4
  670. pxor xmm12,xmm5
  671. movdqa xmm6,xmm15
  672. pslld xmm15,12
  673. psrld xmm6,20
  674. movdqa xmm7,xmm12
  675. pslld xmm12,12
  676. por xmm15,xmm6
  677. psrld xmm7,20
  678. movdqa xmm6,XMMWORD[r11]
  679. por xmm12,xmm7
  680. paddd xmm10,xmm15
  681. paddd xmm11,xmm12
  682. pxor xmm1,xmm10
  683. pxor xmm2,xmm11
  ; Hand-encoded `pshufb xmm1,xmm6` / `pshufb xmm2,xmm6`.
  684. DB 102,15,56,0,206
  685. DB 102,15,56,0,214
  686. paddd xmm4,xmm1
  687. paddd xmm5,xmm2
  688. pxor xmm15,xmm4
  689. pxor xmm12,xmm5
  690. movdqa xmm7,xmm15
  691. pslld xmm15,7
  692. psrld xmm7,25
  693. movdqa xmm6,xmm12
  694. pslld xmm12,7
  695. por xmm15,xmm7
  696. psrld xmm6,25
  697. movdqa xmm7,XMMWORD[r10]
  698. por xmm12,xmm6
  699. dec eax
  700. jnz NEAR $L$oop4x
  ; Rounds done. For each group of four words: add the saved input words
  ; (feed-forward), then transpose the 4x4 dword matrix with
  ; punpck{l,h}dq + punpck{l,h}qdq so that every register ends up holding
  ; 16 consecutive keystream bytes of a single block.
  ; Words 0-3 -> xmm8, xmm9, xmm6, xmm11 (blocks 0, 1, 2, 3).
  701. paddd xmm8,XMMWORD[64+rsp]
  702. paddd xmm9,XMMWORD[80+rsp]
  703. paddd xmm10,XMMWORD[96+rsp]
  704. paddd xmm11,XMMWORD[112+rsp]
  705. movdqa xmm6,xmm8
  706. punpckldq xmm8,xmm9
  707. movdqa xmm7,xmm10
  708. punpckldq xmm10,xmm11
  709. punpckhdq xmm6,xmm9
  710. punpckhdq xmm7,xmm11
  711. movdqa xmm9,xmm8
  712. punpcklqdq xmm8,xmm10
  713. movdqa xmm11,xmm6
  714. punpcklqdq xmm6,xmm7
  715. punpckhqdq xmm9,xmm10
  716. punpckhqdq xmm11,xmm7
  ; Words 4-7 -> xmm12, xmm13, xmm10, xmm15 (blocks 0, 1, 2, 3).
  717. paddd xmm12,XMMWORD[((128-256))+rcx]
  718. paddd xmm13,XMMWORD[((144-256))+rcx]
  719. paddd xmm14,XMMWORD[((160-256))+rcx]
  720. paddd xmm15,XMMWORD[((176-256))+rcx]
  ; Park the first rows of blocks 0,1 and fetch the spilled words 10,11.
  721. movdqa XMMWORD[rsp],xmm8
  722. movdqa XMMWORD[16+rsp],xmm9
  723. movdqa xmm8,XMMWORD[32+rsp]
  724. movdqa xmm9,XMMWORD[48+rsp]
  725. movdqa xmm10,xmm12
  726. punpckldq xmm12,xmm13
  727. movdqa xmm7,xmm14
  728. punpckldq xmm14,xmm15
  729. punpckhdq xmm10,xmm13
  730. punpckhdq xmm7,xmm15
  731. movdqa xmm13,xmm12
  732. punpcklqdq xmm12,xmm14
  733. movdqa xmm15,xmm10
  734. punpcklqdq xmm10,xmm7
  735. punpckhqdq xmm13,xmm14
  736. punpckhqdq xmm15,xmm7
  ; Words 8-11 -> xmm4, xmm5, xmm14, xmm9 (blocks 0, 1, 2, 3).
  737. paddd xmm4,XMMWORD[((192-256))+rcx]
  738. paddd xmm5,XMMWORD[((208-256))+rcx]
  739. paddd xmm8,XMMWORD[((224-256))+rcx]
  740. paddd xmm9,XMMWORD[((240-256))+rcx]
  ; Park the first rows of blocks 2,3.
  741. movdqa XMMWORD[32+rsp],xmm6
  742. movdqa XMMWORD[48+rsp],xmm11
  743. movdqa xmm14,xmm4
  744. punpckldq xmm4,xmm5
  745. movdqa xmm7,xmm8
  746. punpckldq xmm8,xmm9
  747. punpckhdq xmm14,xmm5
  748. punpckhdq xmm7,xmm9
  749. movdqa xmm5,xmm4
  750. punpcklqdq xmm4,xmm8
  751. movdqa xmm9,xmm14
  752. punpcklqdq xmm14,xmm7
  753. punpckhqdq xmm5,xmm8
  754. punpckhqdq xmm9,xmm7
  ; Words 12-15 -> xmm0, xmm1, xmm8, xmm3 (blocks 0, 1, 2, 3).
  755. paddd xmm0,XMMWORD[((256-256))+rcx]
  756. paddd xmm1,XMMWORD[((272-256))+rcx]
  757. paddd xmm2,XMMWORD[((288-256))+rcx]
  758. paddd xmm3,XMMWORD[((304-256))+rcx]
  759. movdqa xmm8,xmm0
  760. punpckldq xmm0,xmm1
  761. movdqa xmm7,xmm2
  762. punpckldq xmm2,xmm3
  763. punpckhdq xmm8,xmm1
  764. punpckhdq xmm7,xmm3
  765. movdqa xmm1,xmm0
  766. punpcklqdq xmm0,xmm2
  767. movdqa xmm3,xmm8
  768. punpcklqdq xmm8,xmm7
  769. punpckhqdq xmm1,xmm2
  770. punpckhqdq xmm3,xmm7
  ; Fewer than 256 bytes left: partial-group handling.
  771. cmp rdx,64*4
  772. jb NEAR $L$tail4x
  ; Full group: XOR 256 input bytes with the four keystream blocks. Loads,
  ; XORs and stores of neighbouring blocks are interleaved.
  ; Block 0 = [rsp], xmm12, xmm4, xmm0.
  773. movdqu xmm6,XMMWORD[rsi]
  774. movdqu xmm11,XMMWORD[16+rsi]
  775. movdqu xmm2,XMMWORD[32+rsi]
  776. movdqu xmm7,XMMWORD[48+rsi]
  777. pxor xmm6,XMMWORD[rsp]
  778. pxor xmm11,xmm12
  779. pxor xmm2,xmm4
  780. pxor xmm7,xmm0
  781. movdqu XMMWORD[rdi],xmm6
  782. movdqu xmm6,XMMWORD[64+rsi]
  783. movdqu XMMWORD[16+rdi],xmm11
  784. movdqu xmm11,XMMWORD[80+rsi]
  785. movdqu XMMWORD[32+rdi],xmm2
  786. movdqu xmm2,XMMWORD[96+rsi]
  787. movdqu XMMWORD[48+rdi],xmm7
  788. movdqu xmm7,XMMWORD[112+rsi]
  789. lea rsi,[128+rsi]
  ; Block 1 = [rsp+16], xmm13, xmm5, xmm1.
  790. pxor xmm6,XMMWORD[16+rsp]
  791. pxor xmm11,xmm13
  792. pxor xmm2,xmm5
  793. pxor xmm7,xmm1
  794. movdqu XMMWORD[64+rdi],xmm6
  795. movdqu xmm6,XMMWORD[rsi]
  796. movdqu XMMWORD[80+rdi],xmm11
  797. movdqu xmm11,XMMWORD[16+rsi]
  798. movdqu XMMWORD[96+rdi],xmm2
  799. movdqu xmm2,XMMWORD[32+rsi]
  800. movdqu XMMWORD[112+rdi],xmm7
  801. lea rdi,[128+rdi]
  802. movdqu xmm7,XMMWORD[48+rsi]
  ; Block 2 = [rsp+32], xmm10, xmm14, xmm8.
  803. pxor xmm6,XMMWORD[32+rsp]
  804. pxor xmm11,xmm10
  805. pxor xmm2,xmm14
  806. pxor xmm7,xmm8
  807. movdqu XMMWORD[rdi],xmm6
  808. movdqu xmm6,XMMWORD[64+rsi]
  809. movdqu XMMWORD[16+rdi],xmm11
  810. movdqu xmm11,XMMWORD[80+rsi]
  811. movdqu XMMWORD[32+rdi],xmm2
  812. movdqu xmm2,XMMWORD[96+rsi]
  813. movdqu XMMWORD[48+rdi],xmm7
  814. movdqu xmm7,XMMWORD[112+rsi]
  815. lea rsi,[128+rsi]
  ; Block 3 = [rsp+48], xmm15, xmm9, xmm3.
  816. pxor xmm6,XMMWORD[48+rsp]
  817. pxor xmm11,xmm15
  818. pxor xmm2,xmm9
  819. pxor xmm7,xmm3
  820. movdqu XMMWORD[64+rdi],xmm6
  821. movdqu XMMWORD[80+rdi],xmm11
  822. movdqu XMMWORD[96+rdi],xmm2
  823. movdqu XMMWORD[112+rdi],xmm7
  824. lea rdi,[128+rdi]
  825. sub rdx,64*4
  826. jnz NEAR $L$oop_outer4x
  827. jmp NEAR $L$done4x
  ; Partial group (1..255 bytes). Whole blocks are XORed directly; the
  ; keystream of the first unused block is then laid out at [rsp+0..63]
  ; for the byte loop. The flags of the deciding `cmp` below are still
  ; valid at the later `je`, because the SSE moves, pxor and lea in
  ; between do not modify flags.
  828. $L$tail4x:
  829. cmp rdx,192
  830. jae NEAR $L$192_or_more4x
  831. cmp rdx,128
  832. jae NEAR $L$128_or_more4x
  833. cmp rdx,64
  834. jae NEAR $L$64_or_more4x
  ; Fewer than 64 bytes: block 0 is the partial block. Its first row is
  ; already at [rsp]; add the other three rows. r10 = byte index.
  835. xor r10,r10
  836. movdqa XMMWORD[16+rsp],xmm12
  837. movdqa XMMWORD[32+rsp],xmm4
  838. movdqa XMMWORD[48+rsp],xmm0
  839. jmp NEAR $L$oop_tail4x
  840. ALIGN 32
  ; 64..127 bytes: emit block 0, then stage block 1 for the byte loop.
  841. $L$64_or_more4x:
  842. movdqu xmm6,XMMWORD[rsi]
  843. movdqu xmm11,XMMWORD[16+rsi]
  844. movdqu xmm2,XMMWORD[32+rsi]
  845. movdqu xmm7,XMMWORD[48+rsi]
  846. pxor xmm6,XMMWORD[rsp]
  847. pxor xmm11,xmm12
  848. pxor xmm2,xmm4
  849. pxor xmm7,xmm0
  850. movdqu XMMWORD[rdi],xmm6
  851. movdqu XMMWORD[16+rdi],xmm11
  852. movdqu XMMWORD[32+rdi],xmm2
  853. movdqu XMMWORD[48+rdi],xmm7
  ; Exactly 64 bytes (flags from `cmp rdx,64`): nothing is left.
  854. je NEAR $L$done4x
  855. movdqa xmm6,XMMWORD[16+rsp]
  856. lea rsi,[64+rsi]
  857. xor r10,r10
  858. movdqa XMMWORD[rsp],xmm6
  859. movdqa XMMWORD[16+rsp],xmm13
  860. lea rdi,[64+rdi]
  861. movdqa XMMWORD[32+rsp],xmm5
  862. sub rdx,64
  863. movdqa XMMWORD[48+rsp],xmm1
  864. jmp NEAR $L$oop_tail4x
  865. ALIGN 32
  ; 128..191 bytes: emit blocks 0 and 1, then stage block 2.
  866. $L$128_or_more4x:
  867. movdqu xmm6,XMMWORD[rsi]
  868. movdqu xmm11,XMMWORD[16+rsi]
  869. movdqu xmm2,XMMWORD[32+rsi]
  870. movdqu xmm7,XMMWORD[48+rsi]
  871. pxor xmm6,XMMWORD[rsp]
  872. pxor xmm11,xmm12
  873. pxor xmm2,xmm4
  874. pxor xmm7,xmm0
  875. movdqu XMMWORD[rdi],xmm6
  876. movdqu xmm6,XMMWORD[64+rsi]
  877. movdqu XMMWORD[16+rdi],xmm11
  878. movdqu xmm11,XMMWORD[80+rsi]
  879. movdqu XMMWORD[32+rdi],xmm2
  880. movdqu xmm2,XMMWORD[96+rsi]
  881. movdqu XMMWORD[48+rdi],xmm7
  882. movdqu xmm7,XMMWORD[112+rsi]
  883. pxor xmm6,XMMWORD[16+rsp]
  884. pxor xmm11,xmm13
  885. pxor xmm2,xmm5
  886. pxor xmm7,xmm1
  887. movdqu XMMWORD[64+rdi],xmm6
  888. movdqu XMMWORD[80+rdi],xmm11
  889. movdqu XMMWORD[96+rdi],xmm2
  890. movdqu XMMWORD[112+rdi],xmm7
  ; Exactly 128 bytes (flags from `cmp rdx,128`): nothing is left.
  891. je NEAR $L$done4x
  892. movdqa xmm6,XMMWORD[32+rsp]
  893. lea rsi,[128+rsi]
  894. xor r10,r10
  895. movdqa XMMWORD[rsp],xmm6
  896. movdqa XMMWORD[16+rsp],xmm10
  897. lea rdi,[128+rdi]
  898. movdqa XMMWORD[32+rsp],xmm14
  899. sub rdx,128
  900. movdqa XMMWORD[48+rsp],xmm8
  901. jmp NEAR $L$oop_tail4x
  902. ALIGN 32
  ; 192..255 bytes: emit blocks 0, 1 and 2, then stage block 3.
  903. $L$192_or_more4x:
  904. movdqu xmm6,XMMWORD[rsi]
  905. movdqu xmm11,XMMWORD[16+rsi]
  906. movdqu xmm2,XMMWORD[32+rsi]
  907. movdqu xmm7,XMMWORD[48+rsi]
  908. pxor xmm6,XMMWORD[rsp]
  909. pxor xmm11,xmm12
  910. pxor xmm2,xmm4
  911. pxor xmm7,xmm0
  912. movdqu XMMWORD[rdi],xmm6
  913. movdqu xmm6,XMMWORD[64+rsi]
  914. movdqu XMMWORD[16+rdi],xmm11
  915. movdqu xmm11,XMMWORD[80+rsi]
  916. movdqu XMMWORD[32+rdi],xmm2
  917. movdqu xmm2,XMMWORD[96+rsi]
  918. movdqu XMMWORD[48+rdi],xmm7
  919. movdqu xmm7,XMMWORD[112+rsi]
  920. lea rsi,[128+rsi]
  921. pxor xmm6,XMMWORD[16+rsp]
  922. pxor xmm11,xmm13
  923. pxor xmm2,xmm5
  924. pxor xmm7,xmm1
  925. movdqu XMMWORD[64+rdi],xmm6
  926. movdqu xmm6,XMMWORD[rsi]
  927. movdqu XMMWORD[80+rdi],xmm11
  928. movdqu xmm11,XMMWORD[16+rsi]
  929. movdqu XMMWORD[96+rdi],xmm2
  930. movdqu xmm2,XMMWORD[32+rsi]
  931. movdqu XMMWORD[112+rdi],xmm7
  932. lea rdi,[128+rdi]
  933. movdqu xmm7,XMMWORD[48+rsi]
  934. pxor xmm6,XMMWORD[32+rsp]
  935. pxor xmm11,xmm10
  936. pxor xmm2,xmm14
  937. pxor xmm7,xmm8
  938. movdqu XMMWORD[rdi],xmm6
  939. movdqu XMMWORD[16+rdi],xmm11
  940. movdqu XMMWORD[32+rdi],xmm2
  941. movdqu XMMWORD[48+rdi],xmm7
  ; Exactly 192 bytes (flags from `cmp rdx,192`): nothing is left.
  942. je NEAR $L$done4x
  ; rsi/rdi were already advanced by 128 above; 64 more reaches block 3.
  943. movdqa xmm6,XMMWORD[48+rsp]
  944. lea rsi,[64+rsi]
  945. xor r10,r10
  946. movdqa XMMWORD[rsp],xmm6
  947. movdqa XMMWORD[16+rsp],xmm15
  948. lea rdi,[64+rdi]
  949. movdqa XMMWORD[32+rsp],xmm9
  950. sub rdx,192
  951. movdqa XMMWORD[48+rsp],xmm3
  ; Byte loop over the final partial block: out[i] = in[i] ^ [rsp+i],
  ; for rdx remaining bytes; r10 = byte index.
  952. $L$oop_tail4x:
  953. movzx eax,BYTE[r10*1+rsi]
  954. movzx ecx,BYTE[r10*1+rsp]
  955. lea r10,[1+r10]
  956. xor eax,ecx
  957. mov BYTE[((-1))+r10*1+rdi],al
  958. dec rdx
  959. jnz NEAR $L$oop_tail4x
  ; Restore xmm6..xmm15 and the entry-time stack pointer.
  960. $L$done4x:
  961. movaps xmm6,XMMWORD[((-168))+r9]
  962. movaps xmm7,XMMWORD[((-152))+r9]
  963. movaps xmm8,XMMWORD[((-136))+r9]
  964. movaps xmm9,XMMWORD[((-120))+r9]
  965. movaps xmm10,XMMWORD[((-104))+r9]
  966. movaps xmm11,XMMWORD[((-88))+r9]
  967. movaps xmm12,XMMWORD[((-72))+r9]
  968. movaps xmm13,XMMWORD[((-56))+r9]
  969. movaps xmm14,XMMWORD[((-40))+r9]
  970. movaps xmm15,XMMWORD[((-24))+r9]
  971. lea rsp,[r9]
  972. $L$4x_epilogue:
  973. mov rdi,QWORD[8+rsp] ;WIN64 epilogue
  974. mov rsi,QWORD[16+rsp]
  975. DB 0F3h,0C3h ;repret
  976. $L$SEH_end_ChaCha20_4x:
  977. ALIGN 32
  978. ChaCha20_8x:
  979. mov QWORD[8+rsp],rdi ;WIN64 prologue
  980. mov QWORD[16+rsp],rsi
  981. mov rax,rsp
  982. $L$SEH_begin_ChaCha20_8x:
  983. mov rdi,rcx
  984. mov rsi,rdx
  985. mov rdx,r8
  986. mov rcx,r9
  987. mov r8,QWORD[40+rsp]
  988. $L$ChaCha20_8x:
  989. mov r9,rsp
  990. sub rsp,0x280+168
  991. and rsp,-32
  992. movaps XMMWORD[(-168)+r9],xmm6
  993. movaps XMMWORD[(-152)+r9],xmm7
  994. movaps XMMWORD[(-136)+r9],xmm8
  995. movaps XMMWORD[(-120)+r9],xmm9
  996. movaps XMMWORD[(-104)+r9],xmm10
  997. movaps XMMWORD[(-88)+r9],xmm11
  998. movaps XMMWORD[(-72)+r9],xmm12
  999. movaps XMMWORD[(-56)+r9],xmm13
  1000. movaps XMMWORD[(-40)+r9],xmm14
  1001. movaps XMMWORD[(-24)+r9],xmm15
  1002. $L$8x_body:
  1003. vzeroupper
  1004. vbroadcasti128 ymm11,XMMWORD[$L$sigma]
  1005. vbroadcasti128 ymm3,XMMWORD[rcx]
  1006. vbroadcasti128 ymm15,XMMWORD[16+rcx]
  1007. vbroadcasti128 ymm7,XMMWORD[r8]
  1008. lea rcx,[256+rsp]
  1009. lea rax,[512+rsp]
  1010. lea r10,[$L$rot16]
  1011. lea r11,[$L$rot24]
  1012. vpshufd ymm8,ymm11,0x00
  1013. vpshufd ymm9,ymm11,0x55
  1014. vmovdqa YMMWORD[(128-256)+rcx],ymm8
  1015. vpshufd ymm10,ymm11,0xaa
  1016. vmovdqa YMMWORD[(160-256)+rcx],ymm9
  1017. vpshufd ymm11,ymm11,0xff
  1018. vmovdqa YMMWORD[(192-256)+rcx],ymm10
  1019. vmovdqa YMMWORD[(224-256)+rcx],ymm11
  1020. vpshufd ymm0,ymm3,0x00
  1021. vpshufd ymm1,ymm3,0x55
  1022. vmovdqa YMMWORD[(256-256)+rcx],ymm0
  1023. vpshufd ymm2,ymm3,0xaa
  1024. vmovdqa YMMWORD[(288-256)+rcx],ymm1
  1025. vpshufd ymm3,ymm3,0xff
  1026. vmovdqa YMMWORD[(320-256)+rcx],ymm2
  1027. vmovdqa YMMWORD[(352-256)+rcx],ymm3
  1028. vpshufd ymm12,ymm15,0x00
  1029. vpshufd ymm13,ymm15,0x55
  1030. vmovdqa YMMWORD[(384-512)+rax],ymm12
  1031. vpshufd ymm14,ymm15,0xaa
  1032. vmovdqa YMMWORD[(416-512)+rax],ymm13
  1033. vpshufd ymm15,ymm15,0xff
  1034. vmovdqa YMMWORD[(448-512)+rax],ymm14
  1035. vmovdqa YMMWORD[(480-512)+rax],ymm15
  1036. vpshufd ymm4,ymm7,0x00
  1037. vpshufd ymm5,ymm7,0x55
  1038. vpaddd ymm4,ymm4,YMMWORD[$L$incy]
  1039. vpshufd ymm6,ymm7,0xaa
  1040. vmovdqa YMMWORD[(544-512)+rax],ymm5
  1041. vpshufd ymm7,ymm7,0xff
  1042. vmovdqa YMMWORD[(576-512)+rax],ymm6
  1043. vmovdqa YMMWORD[(608-512)+rax],ymm7
  1044. jmp NEAR $L$oop_enter8x
  1045. ALIGN 32
  1046. $L$oop_outer8x:
  1047. vmovdqa ymm8,YMMWORD[((128-256))+rcx]
  1048. vmovdqa ymm9,YMMWORD[((160-256))+rcx]
  1049. vmovdqa ymm10,YMMWORD[((192-256))+rcx]
  1050. vmovdqa ymm11,YMMWORD[((224-256))+rcx]
  1051. vmovdqa ymm0,YMMWORD[((256-256))+rcx]
  1052. vmovdqa ymm1,YMMWORD[((288-256))+rcx]
  1053. vmovdqa ymm2,YMMWORD[((320-256))+rcx]
  1054. vmovdqa ymm3,YMMWORD[((352-256))+rcx]
  1055. vmovdqa ymm12,YMMWORD[((384-512))+rax]
  1056. vmovdqa ymm13,YMMWORD[((416-512))+rax]
  1057. vmovdqa ymm14,YMMWORD[((448-512))+rax]
  1058. vmovdqa ymm15,YMMWORD[((480-512))+rax]
  1059. vmovdqa ymm4,YMMWORD[((512-512))+rax]
  1060. vmovdqa ymm5,YMMWORD[((544-512))+rax]
  1061. vmovdqa ymm6,YMMWORD[((576-512))+rax]
  1062. vmovdqa ymm7,YMMWORD[((608-512))+rax]
  1063. vpaddd ymm4,ymm4,YMMWORD[$L$eight]
  1064. $L$oop_enter8x:
  1065. vmovdqa YMMWORD[64+rsp],ymm14
  1066. vmovdqa YMMWORD[96+rsp],ymm15
  1067. vbroadcasti128 ymm15,XMMWORD[r10]
  1068. vmovdqa YMMWORD[(512-512)+rax],ymm4
  1069. mov eax,10
  1070. jmp NEAR $L$oop8x
  1071. ALIGN 32
  1072. $L$oop8x:
  1073. vpaddd ymm8,ymm8,ymm0
  1074. vpxor ymm4,ymm8,ymm4
  1075. vpshufb ymm4,ymm4,ymm15
  1076. vpaddd ymm9,ymm9,ymm1
  1077. vpxor ymm5,ymm9,ymm5
  1078. vpshufb ymm5,ymm5,ymm15
  1079. vpaddd ymm12,ymm12,ymm4
  1080. vpxor ymm0,ymm12,ymm0
  1081. vpslld ymm14,ymm0,12
  1082. vpsrld ymm0,ymm0,20
  1083. vpor ymm0,ymm14,ymm0
  1084. vbroadcasti128 ymm14,XMMWORD[r11]
  1085. vpaddd ymm13,ymm13,ymm5
  1086. vpxor ymm1,ymm13,ymm1
  1087. vpslld ymm15,ymm1,12
  1088. vpsrld ymm1,ymm1,20
  1089. vpor ymm1,ymm15,ymm1
  1090. vpaddd ymm8,ymm8,ymm0
  1091. vpxor ymm4,ymm8,ymm4
  1092. vpshufb ymm4,ymm4,ymm14
  1093. vpaddd ymm9,ymm9,ymm1
  1094. vpxor ymm5,ymm9,ymm5
  1095. vpshufb ymm5,ymm5,ymm14
  1096. vpaddd ymm12,ymm12,ymm4
  1097. vpxor ymm0,ymm12,ymm0
  1098. vpslld ymm15,ymm0,7
  1099. vpsrld ymm0,ymm0,25
  1100. vpor ymm0,ymm15,ymm0
  1101. vbroadcasti128 ymm15,XMMWORD[r10]
  1102. vpaddd ymm13,ymm13,ymm5
  1103. vpxor ymm1,ymm13,ymm1
  1104. vpslld ymm14,ymm1,7
  1105. vpsrld ymm1,ymm1,25
  1106. vpor ymm1,ymm14,ymm1
  1107. vmovdqa YMMWORD[rsp],ymm12
  1108. vmovdqa YMMWORD[32+rsp],ymm13
  1109. vmovdqa ymm12,YMMWORD[64+rsp]
  1110. vmovdqa ymm13,YMMWORD[96+rsp]
  1111. vpaddd ymm10,ymm10,ymm2
  1112. vpxor ymm6,ymm10,ymm6
  1113. vpshufb ymm6,ymm6,ymm15
  1114. vpaddd ymm11,ymm11,ymm3
  1115. vpxor ymm7,ymm11,ymm7
  1116. vpshufb ymm7,ymm7,ymm15
  1117. vpaddd ymm12,ymm12,ymm6
  1118. vpxor ymm2,ymm12,ymm2
  1119. vpslld ymm14,ymm2,12
  1120. vpsrld ymm2,ymm2,20
  1121. vpor ymm2,ymm14,ymm2
  1122. vbroadcasti128 ymm14,XMMWORD[r11]
  1123. vpaddd ymm13,ymm13,ymm7
  1124. vpxor ymm3,ymm13,ymm3
  1125. vpslld ymm15,ymm3,12
  1126. vpsrld ymm3,ymm3,20
  1127. vpor ymm3,ymm15,ymm3
  1128. vpaddd ymm10,ymm10,ymm2
  1129. vpxor ymm6,ymm10,ymm6
  1130. vpshufb ymm6,ymm6,ymm14
  1131. vpaddd ymm11,ymm11,ymm3
  1132. vpxor ymm7,ymm11,ymm7
  1133. vpshufb ymm7,ymm7,ymm14
  1134. vpaddd ymm12,ymm12,ymm6
  1135. vpxor ymm2,ymm12,ymm2
  1136. vpslld ymm15,ymm2,7
  1137. vpsrld ymm2,ymm2,25
  1138. vpor ymm2,ymm15,ymm2
  1139. vbroadcasti128 ymm15,XMMWORD[r10]
  1140. vpaddd ymm13,ymm13,ymm7
  1141. vpxor ymm3,ymm13,ymm3
  1142. vpslld ymm14,ymm3,7
  1143. vpsrld ymm3,ymm3,25
  1144. vpor ymm3,ymm14,ymm3
  1145. vpaddd ymm8,ymm8,ymm1
  1146. vpxor ymm7,ymm8,ymm7
  1147. vpshufb ymm7,ymm7,ymm15
  1148. vpaddd ymm9,ymm9,ymm2
  1149. vpxor ymm4,ymm9,ymm4
  1150. vpshufb ymm4,ymm4,ymm15
  1151. vpaddd ymm12,ymm12,ymm7
  1152. vpxor ymm1,ymm12,ymm1
  1153. vpslld ymm14,ymm1,12
  1154. vpsrld ymm1,ymm1,20
  1155. vpor ymm1,ymm14,ymm1
  1156. vbroadcasti128 ymm14,XMMWORD[r11]
  1157. vpaddd ymm13,ymm13,ymm4
  1158. vpxor ymm2,ymm13,ymm2
  1159. vpslld ymm15,ymm2,12
  1160. vpsrld ymm2,ymm2,20
  1161. vpor ymm2,ymm15,ymm2
  1162. vpaddd ymm8,ymm8,ymm1
  1163. vpxor ymm7,ymm8,ymm7
  1164. vpshufb ymm7,ymm7,ymm14
  1165. vpaddd ymm9,ymm9,ymm2
  1166. vpxor ymm4,ymm9,ymm4
  1167. vpshufb ymm4,ymm4,ymm14
  1168. vpaddd ymm12,ymm12,ymm7
  1169. vpxor ymm1,ymm12,ymm1
  1170. vpslld ymm15,ymm1,7
  1171. vpsrld ymm1,ymm1,25
  1172. vpor ymm1,ymm15,ymm1
  1173. vbroadcasti128 ymm15,XMMWORD[r10]
  1174. vpaddd ymm13,ymm13,ymm4
  1175. vpxor ymm2,ymm13,ymm2
  1176. vpslld ymm14,ymm2,7
  1177. vpsrld ymm2,ymm2,25
  1178. vpor ymm2,ymm14,ymm2
  1179. vmovdqa YMMWORD[64+rsp],ymm12
  1180. vmovdqa YMMWORD[96+rsp],ymm13
  1181. vmovdqa ymm12,YMMWORD[rsp]
  1182. vmovdqa ymm13,YMMWORD[32+rsp]
  1183. vpaddd ymm10,ymm10,ymm3
  1184. vpxor ymm5,ymm10,ymm5
  1185. vpshufb ymm5,ymm5,ymm15
  1186. vpaddd ymm11,ymm11,ymm0
  1187. vpxor ymm6,ymm11,ymm6
  1188. vpshufb ymm6,ymm6,ymm15
  1189. vpaddd ymm12,ymm12,ymm5
  1190. vpxor ymm3,ymm12,ymm3
  1191. vpslld ymm14,ymm3,12
  1192. vpsrld ymm3,ymm3,20
  1193. vpor ymm3,ymm14,ymm3
  1194. vbroadcasti128 ymm14,XMMWORD[r11]
  1195. vpaddd ymm13,ymm13,ymm6
  1196. vpxor ymm0,ymm13,ymm0
  1197. vpslld ymm15,ymm0,12
  1198. vpsrld ymm0,ymm0,20
  1199. vpor ymm0,ymm15,ymm0
  1200. vpaddd ymm10,ymm10,ymm3
  1201. vpxor ymm5,ymm10,ymm5
  1202. vpshufb ymm5,ymm5,ymm14
  1203. vpaddd ymm11,ymm11,ymm0
  1204. vpxor ymm6,ymm11,ymm6
  1205. vpshufb ymm6,ymm6,ymm14
  1206. vpaddd ymm12,ymm12,ymm5
  1207. vpxor ymm3,ymm12,ymm3
  1208. vpslld ymm15,ymm3,7
  1209. vpsrld ymm3,ymm3,25
  1210. vpor ymm3,ymm15,ymm3
  1211. vbroadcasti128 ymm15,XMMWORD[r10]
  1212. vpaddd ymm13,ymm13,ymm6
  1213. vpxor ymm0,ymm13,ymm0
  1214. vpslld ymm14,ymm0,7
  1215. vpsrld ymm0,ymm0,25
  1216. vpor ymm0,ymm14,ymm0
  1217. dec eax
  1218. jnz NEAR $L$oop8x
  1219. lea rax,[512+rsp]
  1220. vpaddd ymm8,ymm8,YMMWORD[((128-256))+rcx]
  1221. vpaddd ymm9,ymm9,YMMWORD[((160-256))+rcx]
  1222. vpaddd ymm10,ymm10,YMMWORD[((192-256))+rcx]
  1223. vpaddd ymm11,ymm11,YMMWORD[((224-256))+rcx]
  1224. vpunpckldq ymm14,ymm8,ymm9
  1225. vpunpckldq ymm15,ymm10,ymm11
  1226. vpunpckhdq ymm8,ymm8,ymm9
  1227. vpunpckhdq ymm10,ymm10,ymm11
  1228. vpunpcklqdq ymm9,ymm14,ymm15
  1229. vpunpckhqdq ymm14,ymm14,ymm15
  1230. vpunpcklqdq ymm11,ymm8,ymm10
  1231. vpunpckhqdq ymm8,ymm8,ymm10
  1232. vpaddd ymm0,ymm0,YMMWORD[((256-256))+rcx]
  1233. vpaddd ymm1,ymm1,YMMWORD[((288-256))+rcx]
  1234. vpaddd ymm2,ymm2,YMMWORD[((320-256))+rcx]
  1235. vpaddd ymm3,ymm3,YMMWORD[((352-256))+rcx]
  1236. vpunpckldq ymm10,ymm0,ymm1
  1237. vpunpckldq ymm15,ymm2,ymm3
  1238. vpunpckhdq ymm0,ymm0,ymm1
  1239. vpunpckhdq ymm2,ymm2,ymm3
  1240. vpunpcklqdq ymm1,ymm10,ymm15
  1241. vpunpckhqdq ymm10,ymm10,ymm15
  1242. vpunpcklqdq ymm3,ymm0,ymm2
  1243. vpunpckhqdq ymm0,ymm0,ymm2
  1244. vperm2i128 ymm15,ymm9,ymm1,0x20
  1245. vperm2i128 ymm1,ymm9,ymm1,0x31
  1246. vperm2i128 ymm9,ymm14,ymm10,0x20
  1247. vperm2i128 ymm10,ymm14,ymm10,0x31
  1248. vperm2i128 ymm14,ymm11,ymm3,0x20
  1249. vperm2i128 ymm3,ymm11,ymm3,0x31
  1250. vperm2i128 ymm11,ymm8,ymm0,0x20
  1251. vperm2i128 ymm0,ymm8,ymm0,0x31
  1252. vmovdqa YMMWORD[rsp],ymm15
  1253. vmovdqa YMMWORD[32+rsp],ymm9
  1254. vmovdqa ymm15,YMMWORD[64+rsp]
  1255. vmovdqa ymm9,YMMWORD[96+rsp]
  1256. vpaddd ymm12,ymm12,YMMWORD[((384-512))+rax]
  1257. vpaddd ymm13,ymm13,YMMWORD[((416-512))+rax]
  1258. vpaddd ymm15,ymm15,YMMWORD[((448-512))+rax]
  1259. vpaddd ymm9,ymm9,YMMWORD[((480-512))+rax]
  1260. vpunpckldq ymm2,ymm12,ymm13
  1261. vpunpckldq ymm8,ymm15,ymm9
  1262. vpunpckhdq ymm12,ymm12,ymm13
  1263. vpunpckhdq ymm15,ymm15,ymm9
  1264. vpunpcklqdq ymm13,ymm2,ymm8
  1265. vpunpckhqdq ymm2,ymm2,ymm8
  1266. vpunpcklqdq ymm9,ymm12,ymm15
  1267. vpunpckhqdq ymm12,ymm12,ymm15
  1268. vpaddd ymm4,ymm4,YMMWORD[((512-512))+rax]
  1269. vpaddd ymm5,ymm5,YMMWORD[((544-512))+rax]
  1270. vpaddd ymm6,ymm6,YMMWORD[((576-512))+rax]
  1271. vpaddd ymm7,ymm7,YMMWORD[((608-512))+rax]
  1272. vpunpckldq ymm15,ymm4,ymm5
  1273. vpunpckldq ymm8,ymm6,ymm7
  1274. vpunpckhdq ymm4,ymm4,ymm5
  1275. vpunpckhdq ymm6,ymm6,ymm7
  1276. vpunpcklqdq ymm5,ymm15,ymm8
  1277. vpunpckhqdq ymm15,ymm15,ymm8
  1278. vpunpcklqdq ymm7,ymm4,ymm6
  1279. vpunpckhqdq ymm4,ymm4,ymm6
  1280. vperm2i128 ymm8,ymm13,ymm5,0x20
  1281. vperm2i128 ymm5,ymm13,ymm5,0x31
  1282. vperm2i128 ymm13,ymm2,ymm15,0x20
  1283. vperm2i128 ymm15,ymm2,ymm15,0x31
  1284. vperm2i128 ymm2,ymm9,ymm7,0x20
  1285. vperm2i128 ymm7,ymm9,ymm7,0x31
  1286. vperm2i128 ymm9,ymm12,ymm4,0x20
  1287. vperm2i128 ymm4,ymm12,ymm4,0x31
  1288. vmovdqa ymm6,YMMWORD[rsp]
  1289. vmovdqa ymm12,YMMWORD[32+rsp]
  1290. cmp rdx,64*8
  1291. jb NEAR $L$tail8x
  1292. vpxor ymm6,ymm6,YMMWORD[rsi]
  1293. vpxor ymm8,ymm8,YMMWORD[32+rsi]
  1294. vpxor ymm1,ymm1,YMMWORD[64+rsi]
  1295. vpxor ymm5,ymm5,YMMWORD[96+rsi]
  1296. lea rsi,[128+rsi]
  1297. vmovdqu YMMWORD[rdi],ymm6
  1298. vmovdqu YMMWORD[32+rdi],ymm8
  1299. vmovdqu YMMWORD[64+rdi],ymm1
  1300. vmovdqu YMMWORD[96+rdi],ymm5
  1301. lea rdi,[128+rdi]
  1302. vpxor ymm12,ymm12,YMMWORD[rsi]
  1303. vpxor ymm13,ymm13,YMMWORD[32+rsi]
  1304. vpxor ymm10,ymm10,YMMWORD[64+rsi]
  1305. vpxor ymm15,ymm15,YMMWORD[96+rsi]
  1306. lea rsi,[128+rsi]
  1307. vmovdqu YMMWORD[rdi],ymm12
  1308. vmovdqu YMMWORD[32+rdi],ymm13
  1309. vmovdqu YMMWORD[64+rdi],ymm10
  1310. vmovdqu YMMWORD[96+rdi],ymm15
  1311. lea rdi,[128+rdi]
  1312. vpxor ymm14,ymm14,YMMWORD[rsi]
  1313. vpxor ymm2,ymm2,YMMWORD[32+rsi]
  1314. vpxor ymm3,ymm3,YMMWORD[64+rsi]
  1315. vpxor ymm7,ymm7,YMMWORD[96+rsi]
  1316. lea rsi,[128+rsi]
  1317. vmovdqu YMMWORD[rdi],ymm14
  1318. vmovdqu YMMWORD[32+rdi],ymm2
  1319. vmovdqu YMMWORD[64+rdi],ymm3
  1320. vmovdqu YMMWORD[96+rdi],ymm7
  1321. lea rdi,[128+rdi]
  1322. vpxor ymm11,ymm11,YMMWORD[rsi]
  1323. vpxor ymm9,ymm9,YMMWORD[32+rsi]
  1324. vpxor ymm0,ymm0,YMMWORD[64+rsi]
  1325. vpxor ymm4,ymm4,YMMWORD[96+rsi]
  1326. lea rsi,[128+rsi]
  1327. vmovdqu YMMWORD[rdi],ymm11
  1328. vmovdqu YMMWORD[32+rdi],ymm9
  1329. vmovdqu YMMWORD[64+rdi],ymm0
  1330. vmovdqu YMMWORD[96+rdi],ymm4
  1331. lea rdi,[128+rdi]
  1332. sub rdx,64*8
  1333. jnz NEAR $L$oop_outer8x
  1334. jmp NEAR $L$done8x
  1335. $L$tail8x:
  1336. cmp rdx,448
  1337. jae NEAR $L$448_or_more8x
  1338. cmp rdx,384
  1339. jae NEAR $L$384_or_more8x
  1340. cmp rdx,320
  1341. jae NEAR $L$320_or_more8x
  1342. cmp rdx,256
  1343. jae NEAR $L$256_or_more8x
  1344. cmp rdx,192
  1345. jae NEAR $L$192_or_more8x
  1346. cmp rdx,128
  1347. jae NEAR $L$128_or_more8x
  1348. cmp rdx,64
  1349. jae NEAR $L$64_or_more8x
  1350. xor r10,r10
  1351. vmovdqa YMMWORD[rsp],ymm6
  1352. vmovdqa YMMWORD[32+rsp],ymm8
  1353. jmp NEAR $L$oop_tail8x
  1354. ALIGN 32
  1355. $L$64_or_more8x:
  1356. vpxor ymm6,ymm6,YMMWORD[rsi]
  1357. vpxor ymm8,ymm8,YMMWORD[32+rsi]
  1358. vmovdqu YMMWORD[rdi],ymm6
  1359. vmovdqu YMMWORD[32+rdi],ymm8
  1360. je NEAR $L$done8x
  1361. lea rsi,[64+rsi]
  1362. xor r10,r10
  1363. vmovdqa YMMWORD[rsp],ymm1
  1364. lea rdi,[64+rdi]
  1365. sub rdx,64
  1366. vmovdqa YMMWORD[32+rsp],ymm5
  1367. jmp NEAR $L$oop_tail8x
  1368. ALIGN 32
  1369. $L$128_or_more8x:
  1370. vpxor ymm6,ymm6,YMMWORD[rsi]
  1371. vpxor ymm8,ymm8,YMMWORD[32+rsi]
  1372. vpxor ymm1,ymm1,YMMWORD[64+rsi]
  1373. vpxor ymm5,ymm5,YMMWORD[96+rsi]
  1374. vmovdqu YMMWORD[rdi],ymm6
  1375. vmovdqu YMMWORD[32+rdi],ymm8
  1376. vmovdqu YMMWORD[64+rdi],ymm1
  1377. vmovdqu YMMWORD[96+rdi],ymm5
  1378. je NEAR $L$done8x
  1379. lea rsi,[128+rsi]
  1380. xor r10,r10
  1381. vmovdqa YMMWORD[rsp],ymm12
  1382. lea rdi,[128+rdi]
  1383. sub rdx,128
  1384. vmovdqa YMMWORD[32+rsp],ymm13
  1385. jmp NEAR $L$oop_tail8x
  1386. ALIGN 32
  1387. $L$192_or_more8x:
  1388. vpxor ymm6,ymm6,YMMWORD[rsi]
  1389. vpxor ymm8,ymm8,YMMWORD[32+rsi]
  1390. vpxor ymm1,ymm1,YMMWORD[64+rsi]
  1391. vpxor ymm5,ymm5,YMMWORD[96+rsi]
  1392. vpxor ymm12,ymm12,YMMWORD[128+rsi]
  1393. vpxor ymm13,ymm13,YMMWORD[160+rsi]
  1394. vmovdqu YMMWORD[rdi],ymm6
  1395. vmovdqu YMMWORD[32+rdi],ymm8
  1396. vmovdqu YMMWORD[64+rdi],ymm1
  1397. vmovdqu YMMWORD[96+rdi],ymm5
  1398. vmovdqu YMMWORD[128+rdi],ymm12
  1399. vmovdqu YMMWORD[160+rdi],ymm13
  1400. je NEAR $L$done8x
  1401. lea rsi,[192+rsi]
  1402. xor r10,r10
  1403. vmovdqa YMMWORD[rsp],ymm10
  1404. lea rdi,[192+rdi]
  1405. sub rdx,192
  1406. vmovdqa YMMWORD[32+rsp],ymm15
  1407. jmp NEAR $L$oop_tail8x
  1408. ALIGN 32
  1409. $L$256_or_more8x:
  1410. vpxor ymm6,ymm6,YMMWORD[rsi]
  1411. vpxor ymm8,ymm8,YMMWORD[32+rsi]
  1412. vpxor ymm1,ymm1,YMMWORD[64+rsi]
  1413. vpxor ymm5,ymm5,YMMWORD[96+rsi]
  1414. vpxor ymm12,ymm12,YMMWORD[128+rsi]
  1415. vpxor ymm13,ymm13,YMMWORD[160+rsi]
  1416. vpxor ymm10,ymm10,YMMWORD[192+rsi]
  1417. vpxor ymm15,ymm15,YMMWORD[224+rsi]
  1418. vmovdqu YMMWORD[rdi],ymm6
  1419. vmovdqu YMMWORD[32+rdi],ymm8
  1420. vmovdqu YMMWORD[64+rdi],ymm1
  1421. vmovdqu YMMWORD[96+rdi],ymm5
  1422. vmovdqu YMMWORD[128+rdi],ymm12
  1423. vmovdqu YMMWORD[160+rdi],ymm13
  1424. vmovdqu YMMWORD[192+rdi],ymm10
  1425. vmovdqu YMMWORD[224+rdi],ymm15
  1426. je NEAR $L$done8x
  1427. lea rsi,[256+rsi]
  1428. xor r10,r10
  1429. vmovdqa YMMWORD[rsp],ymm14
  1430. lea rdi,[256+rdi]
  1431. sub rdx,256
  1432. vmovdqa YMMWORD[32+rsp],ymm2
  1433. jmp NEAR $L$oop_tail8x
  1434. ALIGN 32
  1435. $L$320_or_more8x:
  1436. vpxor ymm6,ymm6,YMMWORD[rsi]
  1437. vpxor ymm8,ymm8,YMMWORD[32+rsi]
  1438. vpxor ymm1,ymm1,YMMWORD[64+rsi]
  1439. vpxor ymm5,ymm5,YMMWORD[96+rsi]
  1440. vpxor ymm12,ymm12,YMMWORD[128+rsi]
  1441. vpxor ymm13,ymm13,YMMWORD[160+rsi]
  1442. vpxor ymm10,ymm10,YMMWORD[192+rsi]
  1443. vpxor ymm15,ymm15,YMMWORD[224+rsi]
  1444. vpxor ymm14,ymm14,YMMWORD[256+rsi]
  1445. vpxor ymm2,ymm2,YMMWORD[288+rsi]
  1446. vmovdqu YMMWORD[rdi],ymm6
  1447. vmovdqu YMMWORD[32+rdi],ymm8
  1448. vmovdqu YMMWORD[64+rdi],ymm1
  1449. vmovdqu YMMWORD[96+rdi],ymm5
  1450. vmovdqu YMMWORD[128+rdi],ymm12
  1451. vmovdqu YMMWORD[160+rdi],ymm13
  1452. vmovdqu YMMWORD[192+rdi],ymm10
  1453. vmovdqu YMMWORD[224+rdi],ymm15
  1454. vmovdqu YMMWORD[256+rdi],ymm14
  1455. vmovdqu YMMWORD[288+rdi],ymm2
  1456. je NEAR $L$done8x
  1457. lea rsi,[320+rsi]
  1458. xor r10,r10
  1459. vmovdqa YMMWORD[rsp],ymm3
  1460. lea rdi,[320+rdi]
  1461. sub rdx,320
  1462. vmovdqa YMMWORD[32+rsp],ymm7
  1463. jmp NEAR $L$oop_tail8x
  1464. ALIGN 32
  1465. $L$384_or_more8x:
  1466. vpxor ymm6,ymm6,YMMWORD[rsi]
  1467. vpxor ymm8,ymm8,YMMWORD[32+rsi]
  1468. vpxor ymm1,ymm1,YMMWORD[64+rsi]
  1469. vpxor ymm5,ymm5,YMMWORD[96+rsi]
  1470. vpxor ymm12,ymm12,YMMWORD[128+rsi]
  1471. vpxor ymm13,ymm13,YMMWORD[160+rsi]
  1472. vpxor ymm10,ymm10,YMMWORD[192+rsi]
  1473. vpxor ymm15,ymm15,YMMWORD[224+rsi]
  1474. vpxor ymm14,ymm14,YMMWORD[256+rsi]
  1475. vpxor ymm2,ymm2,YMMWORD[288+rsi]
  1476. vpxor ymm3,ymm3,YMMWORD[320+rsi]
  1477. vpxor ymm7,ymm7,YMMWORD[352+rsi]
  1478. vmovdqu YMMWORD[rdi],ymm6
  1479. vmovdqu YMMWORD[32+rdi],ymm8
  1480. vmovdqu YMMWORD[64+rdi],ymm1
  1481. vmovdqu YMMWORD[96+rdi],ymm5
  1482. vmovdqu YMMWORD[128+rdi],ymm12
  1483. vmovdqu YMMWORD[160+rdi],ymm13
  1484. vmovdqu YMMWORD[192+rdi],ymm10
  1485. vmovdqu YMMWORD[224+rdi],ymm15
  1486. vmovdqu YMMWORD[256+rdi],ymm14
  1487. vmovdqu YMMWORD[288+rdi],ymm2
  1488. vmovdqu YMMWORD[320+rdi],ymm3
  1489. vmovdqu YMMWORD[352+rdi],ymm7
  1490. je NEAR $L$done8x
  1491. lea rsi,[384+rsi]
  1492. xor r10,r10
  1493. vmovdqa YMMWORD[rsp],ymm11
  1494. lea rdi,[384+rdi]
  1495. sub rdx,384
  1496. vmovdqa YMMWORD[32+rsp],ymm9
  1497. jmp NEAR $L$oop_tail8x
  1498. ALIGN 32
  1499. $L$448_or_more8x:
  1500. vpxor ymm6,ymm6,YMMWORD[rsi]
  1501. vpxor ymm8,ymm8,YMMWORD[32+rsi]
  1502. vpxor ymm1,ymm1,YMMWORD[64+rsi]
  1503. vpxor ymm5,ymm5,YMMWORD[96+rsi]
  1504. vpxor ymm12,ymm12,YMMWORD[128+rsi]
  1505. vpxor ymm13,ymm13,YMMWORD[160+rsi]
  1506. vpxor ymm10,ymm10,YMMWORD[192+rsi]
  1507. vpxor ymm15,ymm15,YMMWORD[224+rsi]
  1508. vpxor ymm14,ymm14,YMMWORD[256+rsi]
  1509. vpxor ymm2,ymm2,YMMWORD[288+rsi]
  1510. vpxor ymm3,ymm3,YMMWORD[320+rsi]
  1511. vpxor ymm7,ymm7,YMMWORD[352+rsi]
  1512. vpxor ymm11,ymm11,YMMWORD[384+rsi]
  1513. vpxor ymm9,ymm9,YMMWORD[416+rsi]
  1514. vmovdqu YMMWORD[rdi],ymm6
  1515. vmovdqu YMMWORD[32+rdi],ymm8
  1516. vmovdqu YMMWORD[64+rdi],ymm1
  1517. vmovdqu YMMWORD[96+rdi],ymm5
  1518. vmovdqu YMMWORD[128+rdi],ymm12
  1519. vmovdqu YMMWORD[160+rdi],ymm13
  1520. vmovdqu YMMWORD[192+rdi],ymm10
  1521. vmovdqu YMMWORD[224+rdi],ymm15
  1522. vmovdqu YMMWORD[256+rdi],ymm14
  1523. vmovdqu YMMWORD[288+rdi],ymm2
  1524. vmovdqu YMMWORD[320+rdi],ymm3
  1525. vmovdqu YMMWORD[352+rdi],ymm7
  1526. vmovdqu YMMWORD[384+rdi],ymm11
  1527. vmovdqu YMMWORD[416+rdi],ymm9
  1528. je NEAR $L$done8x
  1529. lea rsi,[448+rsi]
  1530. xor r10,r10
  1531. vmovdqa YMMWORD[rsp],ymm0
  1532. lea rdi,[448+rdi]
  1533. sub rdx,448
  1534. vmovdqa YMMWORD[32+rsp],ymm4
  1535. $L$oop_tail8x:
  1536. movzx eax,BYTE[r10*1+rsi]
  1537. movzx ecx,BYTE[r10*1+rsp]
  1538. lea r10,[1+r10]
  1539. xor eax,ecx
  1540. mov BYTE[((-1))+r10*1+rdi],al
  1541. dec rdx
  1542. jnz NEAR $L$oop_tail8x
  1543. $L$done8x:
  1544. vzeroall
  1545. movaps xmm6,XMMWORD[((-168))+r9]
  1546. movaps xmm7,XMMWORD[((-152))+r9]
  1547. movaps xmm8,XMMWORD[((-136))+r9]
  1548. movaps xmm9,XMMWORD[((-120))+r9]
  1549. movaps xmm10,XMMWORD[((-104))+r9]
  1550. movaps xmm11,XMMWORD[((-88))+r9]
  1551. movaps xmm12,XMMWORD[((-72))+r9]
  1552. movaps xmm13,XMMWORD[((-56))+r9]
  1553. movaps xmm14,XMMWORD[((-40))+r9]
  1554. movaps xmm15,XMMWORD[((-24))+r9]
  1555. lea rsp,[r9]
  1556. $L$8x_epilogue:
  1557. mov rdi,QWORD[8+rsp] ;WIN64 epilogue
  1558. mov rsi,QWORD[16+rsp]
  1559. DB 0F3h,0C3h ;repret
  1560. $L$SEH_end_ChaCha20_8x:
  1561. EXTERN __imp_RtlVirtualUnwind
  ; ---------------------------------------------------------------------
  ; se_handler -- Win64 exception handler named in
  ; $L$SEH_info_GFp_ChaCha20_ctr32 (scalar GFp_ChaCha20_ctr32 code path).
  ;
  ; In:  r8 = context record, r9 = dispatcher context.
  ; Out: eax = 1.
  ;
  ; Rewrites rsp/rbx/rbp/rsi/rdi/r12-r15 in the context record to the
  ; values the interrupted function had on entry, copies the record into
  ; the dispatcher's own context record and calls RtlVirtualUnwind.
  ; Everything from $L$common_seh_tail on is shared: ssse3_handler and
  ; full_handler jump there with rax = entry rsp of the faulting function.
  ;
  ; NOTE(review): field names used in the comments (CONTEXT.Rsp,
  ; DISPATCHER_CONTEXT.ImageBase, ...) were matched to the numeric offsets
  ; from the Windows SDK x64 structure layouts -- confirm against winnt.h.
  ; ---------------------------------------------------------------------
  1562. ALIGN 16
  1563. se_handler:
  1564. push rsi ; save the non-volatile GPRs and flags that the handler overwrites
  1565. push rdi
  1566. push rbx
  1567. push rbp
  1568. push r12
  1569. push r13
  1570. push r14
  1571. push r15
  1572. pushfq
  1573. sub rsp,64 ; outgoing-argument area for the RtlVirtualUnwind call below
  1574. mov rax,QWORD[120+r8] ; rax = CONTEXT.Rax; the prologue does `mov rax,rsp`, so this is the entry rsp
  1575. mov rbx,QWORD[248+r8] ; rbx = CONTEXT.Rip (address being unwound)
  1576. mov rsi,QWORD[8+r9] ; DISPATCHER_CONTEXT.ImageBase (overwritten before use on this path)
  1577. mov r11,QWORD[56+r9] ; DISPATCHER_CONTEXT.HandlerData (overwritten before use on this path)
  1578. lea r10,[$L$ctr32_body]
  1579. cmp rbx,r10
  1580. jb NEAR $L$common_seh_tail ; Rip is before $L$ctr32_body: skip GPR restore, rax = CONTEXT.Rax
  1581. mov rax,QWORD[152+r8] ; rax = CONTEXT.Rsp
  1582. lea r10,[$L$no_data]
  1583. cmp rbx,r10
  1584. jae NEAR $L$common_seh_tail ; Rip is at/after $L$no_data: skip GPR restore, rax = CONTEXT.Rsp
  1585. lea rax,[((64+24+48))+rax] ; undo `sub rsp,64+24` plus the six 8-byte pushes -> entry rsp
  1586. mov rbx,QWORD[((-8))+rax] ; reload the pushed registers in push order (rbx first, r15 last)
  1587. mov rbp,QWORD[((-16))+rax]
  1588. mov r12,QWORD[((-24))+rax]
  1589. mov r13,QWORD[((-32))+rax]
  1590. mov r14,QWORD[((-40))+rax]
  1591. mov r15,QWORD[((-48))+rax]
  1592. mov QWORD[144+r8],rbx ; CONTEXT.Rbx
  1593. mov QWORD[160+r8],rbp ; CONTEXT.Rbp
  1594. mov QWORD[216+r8],r12 ; CONTEXT.R12
  1595. mov QWORD[224+r8],r13 ; CONTEXT.R13
  1596. mov QWORD[232+r8],r14 ; CONTEXT.R14
  1597. mov QWORD[240+r8],r15 ; CONTEXT.R15
  1598. $L$common_seh_tail: ; shared tail; expects rax = entry rsp, r8 = context, r9 = dispatcher context
  1599. mov rdi,QWORD[8+rax] ; rdi as stored at 8(entry rsp) by the ";WIN64 prologue"
  1600. mov rsi,QWORD[16+rax] ; rsi as stored at 16(entry rsp) by the ";WIN64 prologue"
  1601. mov QWORD[152+r8],rax ; CONTEXT.Rsp = entry rsp
  1602. mov QWORD[168+r8],rsi ; CONTEXT.Rsi
  1603. mov QWORD[176+r8],rdi ; CONTEXT.Rdi
  1604. mov rdi,QWORD[40+r9] ; copy destination = DISPATCHER_CONTEXT.ContextRecord
  1605. mov rsi,r8 ; copy source = the context record patched above
  1606. mov ecx,154 ; 154 qwords = 1232 bytes
  1607. DD 0xa548f3fc ; bytes FC F3 48 A5 = cld ; rep movsq
  1608. mov rsi,r9 ; rsi = dispatcher context (r9 is reused as an argument register below)
  1609. xor rcx,rcx ; arg1 = 0
  1610. mov rdx,QWORD[8+rsi] ; arg2 = DISPATCHER_CONTEXT.ImageBase
  1611. mov r8,QWORD[rsi] ; arg3 = DISPATCHER_CONTEXT.ControlPc
  1612. mov r9,QWORD[16+rsi] ; arg4 = DISPATCHER_CONTEXT.FunctionEntry
  1613. mov r10,QWORD[40+rsi] ; DISPATCHER_CONTEXT.ContextRecord
  1614. lea r11,[56+rsi] ; &DISPATCHER_CONTEXT.HandlerData
  1615. lea r12,[24+rsi] ; &DISPATCHER_CONTEXT.EstablisherFrame
  1616. mov QWORD[32+rsp],r10 ; arg5 (first stack slot above the 32-byte shadow space)
  1617. mov QWORD[40+rsp],r11 ; arg6
  1618. mov QWORD[48+rsp],r12 ; arg7
  1619. mov QWORD[56+rsp],rcx ; arg8 = 0
  1620. call QWORD[__imp_RtlVirtualUnwind]
  1621. mov eax,1 ; handler return value (1 = ExceptionContinueSearch in the Win32 EXCEPTION_DISPOSITION enum)
  1622. add rsp,64 ; release the argument area, then restore in reverse push order
  1623. popfq
  1624. pop r15
  1625. pop r14
  1626. pop r13
  1627. pop r12
  1628. pop rbp
  1629. pop rbx
  1630. pop rdi
  1631. pop rsi
  1632. DB 0F3h,0C3h ;repret
  ; ---------------------------------------------------------------------
  ; ssse3_handler -- Win64 exception handler named in
  ; $L$SEH_info_ChaCha20_ssse3.
  ;
  ; In:  r8 = context record, r9 = dispatcher context.
  ;
  ; The handler data is two image-relative addresses (body label, epilogue
  ; label). When the faulting Rip lies in [body, epilogue) it copies 4
  ; qwords (32 bytes) from -40(saved r9) into the context record at offset
  ; 512, then always finishes in $L$common_seh_tail inside se_handler.
  ; The push sequence and `sub rsp,64` must stay identical to se_handler's
  ; because that shared tail pops this frame.
  ;
  ; NOTE(review): offset 512 is taken to be CONTEXT.Xmm6 and the 32 bytes
  ; to be xmm6/xmm7 saved by the ChaCha20_ssse3 prologue (not visible
  ; here) -- confirm against winnt.h and that prologue.
  ; ---------------------------------------------------------------------
  1633. ALIGN 16
  1634. ssse3_handler:
  1635. push rsi ; same frame as se_handler: the shared tail unwinds it
  1636. push rdi
  1637. push rbx
  1638. push rbp
  1639. push r12
  1640. push r13
  1641. push r14
  1642. push r15
  1643. pushfq
  1644. sub rsp,64
  1645. mov rax,QWORD[120+r8] ; rax = CONTEXT.Rax (used as entry rsp if still before the body label)
  1646. mov rbx,QWORD[248+r8] ; rbx = CONTEXT.Rip
  1647. mov rsi,QWORD[8+r9] ; rsi = DISPATCHER_CONTEXT.ImageBase
  1648. mov r11,QWORD[56+r9] ; r11 = DISPATCHER_CONTEXT.HandlerData
  1649. mov r10d,DWORD[r11] ; HandlerData[0]: image-relative body label
  1650. lea r10,[r10*1+rsi] ; -> absolute address
  1651. cmp rbx,r10
  1652. jb NEAR $L$common_seh_tail ; Rip before the body label: nothing to copy
  1653. mov rax,QWORD[192+r8] ; rax = CONTEXT.R9, used from here on as the function's entry rsp
  1654. mov r10d,DWORD[4+r11] ; HandlerData[1]: image-relative epilogue label
  1655. lea r10,[r10*1+rsi] ; -> absolute address
  1656. cmp rbx,r10
  1657. jae NEAR $L$common_seh_tail ; Rip at/after the epilogue label: nothing to copy
  1658. lea rsi,[((-40))+rax] ; source: save area 40 bytes below the entry rsp
  1659. lea rdi,[512+r8] ; destination: offset 512 of the context record
  1660. mov ecx,4 ; 4 qwords = 32 bytes
  1661. DD 0xa548f3fc ; bytes FC F3 48 A5 = cld ; rep movsq
  1662. jmp NEAR $L$common_seh_tail
  ; ---------------------------------------------------------------------
  ; full_handler -- Win64 exception handler named in both
  ; $L$SEH_info_ChaCha20_4x and $L$SEH_info_ChaCha20_8x.
  ;
  ; In:  r8 = context record, r9 = dispatcher context.
  ;
  ; Same structure as ssse3_handler, but the save area starts at
  ; -168(saved r9) and is 20 qwords (160 bytes) long. That matches the ten
  ; `movaps xmmN,[-168+r9] .. [-24+r9]` reloads of xmm6-xmm15 in the 8x
  ; epilogue. The frame built here must stay identical to se_handler's
  ; because the shared $L$common_seh_tail pops it.
  ;
  ; NOTE(review): offset 512 is taken to be CONTEXT.Xmm6 -- confirm
  ; against winnt.h.
  ; ---------------------------------------------------------------------
  1663. ALIGN 16
  1664. full_handler:
  1665. push rsi ; same frame as se_handler: the shared tail unwinds it
  1666. push rdi
  1667. push rbx
  1668. push rbp
  1669. push r12
  1670. push r13
  1671. push r14
  1672. push r15
  1673. pushfq
  1674. sub rsp,64
  1675. mov rax,QWORD[120+r8] ; rax = CONTEXT.Rax (used as entry rsp if still before the body label)
  1676. mov rbx,QWORD[248+r8] ; rbx = CONTEXT.Rip
  1677. mov rsi,QWORD[8+r9] ; rsi = DISPATCHER_CONTEXT.ImageBase
  1678. mov r11,QWORD[56+r9] ; r11 = DISPATCHER_CONTEXT.HandlerData
  1679. mov r10d,DWORD[r11] ; HandlerData[0]: image-relative body label
  1680. lea r10,[r10*1+rsi] ; -> absolute address
  1681. cmp rbx,r10
  1682. jb NEAR $L$common_seh_tail ; Rip before the body label: nothing to copy
  1683. mov rax,QWORD[192+r8] ; rax = CONTEXT.R9; the 8x epilogue restores rsp from r9
  1684. mov r10d,DWORD[4+r11] ; HandlerData[1]: image-relative epilogue label
  1685. lea r10,[r10*1+rsi] ; -> absolute address
  1686. cmp rbx,r10
  1687. jae NEAR $L$common_seh_tail ; Rip at/after the epilogue label: nothing to copy
  1688. lea rsi,[((-168))+rax] ; source: xmm save area, 168 bytes below the entry rsp
  1689. lea rdi,[512+r8] ; destination: offset 512 of the context record
  1690. mov ecx,20 ; 20 qwords = 160 bytes = ten 16-byte registers
  1691. DD 0xa548f3fc ; bytes FC F3 48 A5 = cld ; rep movsq
  1692. jmp NEAR $L$common_seh_tail
  ; ---------------------------------------------------------------------
  ; .pdata -- one 3-dword record per function: begin label, end label and
  ; the matching $L$SEH_info_* record in .xdata, each emitted
  ; image-relative (`wrt ..imagebase`).
  ; NOTE(review): this is presumably the Win64 RUNTIME_FUNCTION table,
  ; which must stay sorted by begin address -- do not reorder records.
  ; ---------------------------------------------------------------------
  1693. section .pdata rdata align=4
  1694. ALIGN 4
  1695. DD $L$SEH_begin_GFp_ChaCha20_ctr32 wrt ..imagebase ; scalar path
  1696. DD $L$SEH_end_GFp_ChaCha20_ctr32 wrt ..imagebase
  1697. DD $L$SEH_info_GFp_ChaCha20_ctr32 wrt ..imagebase
  1698. DD $L$SEH_begin_ChaCha20_ssse3 wrt ..imagebase ; SSSE3 path
  1699. DD $L$SEH_end_ChaCha20_ssse3 wrt ..imagebase
  1700. DD $L$SEH_info_ChaCha20_ssse3 wrt ..imagebase
  1701. DD $L$SEH_begin_ChaCha20_4x wrt ..imagebase ; 4x path
  1702. DD $L$SEH_end_ChaCha20_4x wrt ..imagebase
  1703. DD $L$SEH_info_ChaCha20_4x wrt ..imagebase
  1704. DD $L$SEH_begin_ChaCha20_8x wrt ..imagebase ; 8x path
  1705. DD $L$SEH_end_ChaCha20_8x wrt ..imagebase
  1706. DD $L$SEH_info_ChaCha20_8x wrt ..imagebase
  ; ---------------------------------------------------------------------
  ; .xdata -- one record per function, referenced from .pdata.
  ; Each record is: 4 header bytes `9,0,0,0`, the image-relative address
  ; of the handler, then (for the table-driven handlers) two
  ; image-relative labels that the handler reads as HandlerData[0] (body)
  ; and HandlerData[1] (epilogue).
  ; NOTE(review): per the Windows x64 UNWIND_INFO layout the first byte 9
  ; would decode as version 1 with flags 1 (UNW_FLAG_EHANDLER), followed
  ; by prolog size 0, zero unwind codes and no frame register -- confirm
  ; against the x64 exception-handling documentation.
  ; ---------------------------------------------------------------------
  1707. section .xdata rdata align=8
  1708. ALIGN 8
  1709. $L$SEH_info_GFp_ChaCha20_ctr32:
  1710. DB 9,0,0,0
  1711. DD se_handler wrt ..imagebase ; no handler data: se_handler compares against fixed labels
  1712. $L$SEH_info_ChaCha20_ssse3:
  1713. DB 9,0,0,0
  1714. DD ssse3_handler wrt ..imagebase
  1715. DD $L$ssse3_body wrt ..imagebase,$L$ssse3_epilogue wrt ..imagebase ; [body, epilogue) range checked by ssse3_handler
  1716. $L$SEH_info_ChaCha20_4x:
  1717. DB 9,0,0,0
  1718. DD full_handler wrt ..imagebase
  1719. DD $L$4x_body wrt ..imagebase,$L$4x_epilogue wrt ..imagebase ; [body, epilogue) range checked by full_handler
  1720. $L$SEH_info_ChaCha20_8x:
  1721. DB 9,0,0,0
  1722. DD full_handler wrt ..imagebase
  1723. DD $L$8x_body wrt ..imagebase,$L$8x_epilogue wrt ..imagebase ; [body, epilogue) range checked by full_handler